In [1]:
# start by importing some libraries
import pandas as pd
import numpy as np

In [2]:
# now import the dataset 
abalone_data = pd.read_csv('abalone.csv')
abalone_data = abalone_data[abalone_data['Type'].isin(['M', 'F'])].reset_index().drop(columns = ['index'])

In [3]:
# lets visualize to make sure the dataset was imported correctly
abalone_data.head()

Unnamed: 0,Type,LongestShell,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20


In [4]:
# the data is good now lets make a train-test split
from sklearn.model_selection import train_test_split
X = abalone_data.iloc[:, 1:-1]
y = abalone_data.iloc[:, 0]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [5]:
# now we have our training data so lets import KNN
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier(n_neighbors = 3)
model = classifier.fit(X_train, y_train)

model  # now we are looking at the model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [6]:
# lets try out the model on the test data
model.score(X_test, y_test)

0.5500705218617772

In [7]:
# so these are good results but lets see if we can make them better, two main parameters we will look at: 
    # number of neighbors used
    # weights
# starting with the number of neighbors
from sklearn.metrics import accuracy_score

knn_neighbors_results = pd.DataFrame(columns = ['neighbors', 'accuracy'])

number_neighbors = list(range(1,11))
for neigh in number_neighbors: 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
    classifier = KNeighborsClassifier(n_neighbors = int(neigh))
    model = classifier.fit(X_train, y_train)
    accuracy_score = model.score(X_test, y_test)
    
    to_append = {'neighbors': neigh, 
                'accuracy': accuracy_score}
    knn_neighbors_results = knn_neighbors_results.append(to_append, ignore_index = True)

In [8]:
# we can see here that the accuracy of KNN is not great on any iteration
knn_neighbors_results

Unnamed: 0,neighbors,accuracy
0,1.0,0.506347
1,2.0,0.526093
2,3.0,0.530324
3,4.0,0.538787
4,5.0,0.519041
5,6.0,0.496474
6,7.0,0.533145
7,8.0,0.497884
8,9.0,0.506347
9,10.0,0.500705


In [9]:
knn_neighbors_results = pd.DataFrame(columns = ['weight', 'accuracy'])

weighting = 'uniform', 'distance'
for w in weighting: 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
    classifier = KNeighborsClassifier(n_neighbors = 4, weights = w)
    model = classifier.fit(X_train, y_train)
    accuracy_score = model.score(X_test, y_test)
    
    to_append = {'weight': w, 
                'accuracy': accuracy_score}
    knn_neighbors_results = knn_neighbors_results.append(to_append, ignore_index = True)

In [10]:
# so lets use uniform weights at k = 4
knn_neighbors_results
# lets investigate the dataset a bit more and see if there is an imbalance issue

Unnamed: 0,weight,accuracy
0,uniform,0.540197
1,distance,0.551481


In [11]:
# lets check for imbalance
from collections import Counter

Counter(y)

Counter({'M': 1528, 'F': 1307})

In [12]:
# there is no huge imbalance but for fun lets balance
from imblearn.over_sampling import SMOTE 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

Counter(y_train)  # verify the balance

Counter({'F': 1159, 'M': 1159})

In [13]:
# lets try KNN again 
classifier = KNeighborsClassifier(n_neighbors = 4, weights = w)
model = classifier.fit(X_train, y_train)
model.score(X_test, y_test)  # no huge change which we would expect because the data was no imbalance heavily

0.5669957686882934

In [14]:
from sklearn.svm import LinearSVC

X = abalone_data.iloc[:, 1:-1]
y = abalone_data.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

classifier = LinearSVC()
model = classifier.fit(X_train, y_train)
model.score(X_test, y_test) 

0.5655853314527504

In [15]:
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()
model = classifier.fit(X_train, y_train)
model.score(X_test, y_test)

0.5133991537376587

In [16]:
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier()
model = classifier.fit(X_train, y_train)
model.score(X_test, y_test)

0.5289139633286318

In [17]:
'''
We now can see how all of the models run and it is the same general form lets now try and compare these using 
cross-validation.
'''
from sklearn.model_selection import cross_val_score
import statistics

models = [
    KNeighborsClassifier(n_neighbors = 4, weights = 'uniform'), 
    DecisionTreeClassifier(), 
    GaussianNB(), 
    LinearSVC()]
cv_data = pd.DataFrame(columns = ['Model Name', 'CV_1', 'CV_2', 'CV_3', 'CV_4', 'CV_5', 'AVG_CV'])


for classifier in models:
    cv_scores = cross_val_score(classifier, X, y, cv = 5)
    to_append = {
        'Model Name': classifier, 
        'CV_1': cv_scores[0],
        'CV_2': cv_scores[1],
        'CV_3': cv_scores[2],
        'CV_4': cv_scores[3],
        'CV_5': cv_scores[4],
        'AVG_CV': statistics.mean(cv_scores)}
    cv_data = cv_data.append(to_append, ignore_index = True)

In [18]:
cv_data

Unnamed: 0,Model Name,CV_1,CV_2,CV_3,CV_4,CV_5,AVG_CV
0,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.569665,0.493827,0.529101,0.477954,0.544974,0.523104
1,"DecisionTreeClassifier(ccp_alpha=0.0, class_we...",0.553792,0.4903,0.539683,0.4903,0.527337,0.520282
2,"GaussianNB(priors=None, var_smoothing=1e-09)",0.546737,0.500882,0.516755,0.514991,0.536155,0.523104
3,"LinearSVC(C=1.0, class_weight=None, dual=True,...",0.534392,0.544974,0.548501,0.560847,0.578483,0.553439
