In [401]:
import numpy as np
import pandas as pd
from sklearn import tree
from scipy.stats import mode as mode
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import mean_squared_error,classification_report
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
plt.style.use('dark_background')

Predicting the sex of abolone using various and weight measurements?

In [423]:
X =  pd.read_csv('abalone.data',names=['sex','len','diam','ht','whole_wt','shucked_wt',
                                       'viscera_wt','shell_wt','rings'])
y = X.pop('sex')
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   len         4177 non-null   float64
 1   diam        4177 non-null   float64
 2   ht          4177 non-null   float64
 3   whole_wt    4177 non-null   float64
 4   shucked_wt  4177 non-null   float64
 5   viscera_wt  4177 non-null   float64
 6   shell_wt    4177 non-null   float64
 7   rings       4177 non-null   int64  
dtypes: float64(7), int64(1)
memory usage: 261.2 KB


In [424]:
pd.unique(y)

array(['M', 'F', 'I'], dtype=object)

> Male, female, imature

In [425]:
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X))

## Data splitting 

In [426]:
X_train,X_test,y_train,y_test=train_test_split(X_scaled,y)

## Model construction

In [427]:
knn = KNeighborsClassifier(n_neighbors=5)

knn.fit(X_train,y_train)

y_predict = knn.predict(X_test)
y_train_predict = knn.predict(X_train)


print(f'Training Set Accuracy:\n\t{overall_accuracy(y_train,y_train_predict)}')
print(f'Test Set Accuracy:\n\t{overall_accuracy(y_test,y_predict)}')

Training Set Accuracy:
	0.6794380587484036
Test Set Accuracy:
	0.5406698564593302


In [428]:
print(classification_report(y_train,y_train_predict))

              precision    recall  f1-score   support

           F       0.60      0.65      0.62       970
           I       0.76      0.81      0.79      1008
           M       0.68      0.59      0.63      1154

    accuracy                           0.68      3132
   macro avg       0.68      0.68      0.68      3132
weighted avg       0.68      0.68      0.68      3132



## Hyperparameter turning (choose whatever approach your like)

In [429]:
knn = KNeighborsClassifier()

grid = {'n_neighbors':[n for n in range(1,51)],
        'weights':['uniform','distance'],
        'p':[1,2]}

GRID = GridSearchCV(knn,grid,cv=8,verbose=1).fit(X_train,y_train)

Fitting 8 folds for each of 200 candidates, totalling 1600 fits


In [430]:
GRID.best_params_

{'n_neighbors': 43, 'p': 1, 'weights': 'distance'}

## Best Model

In [431]:
knn = KNeighborsClassifier(n_neighbors=43,
                           weights='distance',
                           p=1)

knn.fit(X_train,y_train)

y_predict = knn.predict(X_test)
y_train_predict = knn.predict(X_train)


print(f'Training Set Accuracy:\n\t{overall_accuracy(y_train,y_train_predict)}')
print(f'Test Set Accuracy:\n\t{overall_accuracy(y_test,y_predict)}\n')
print(classification_report(y_test,y_predict))

Training Set Accuracy:
	1.0
Test Set Accuracy:
	0.5492822966507177

              precision    recall  f1-score   support

           F       0.47      0.36      0.41       337
           I       0.69      0.81      0.75       334
           M       0.46      0.48      0.47       374

    accuracy                           0.55      1045
   macro avg       0.54      0.55      0.54      1045
weighted avg       0.54      0.55      0.54      1045



The accuracy dropped a significantly from the test set to the training set, indicating that there was overfitting of the model is an issue. This is especially notable with the grid search model, which had 100% accuracy on the training set, suggesting the model essentially memorized the train set.

## Compare Results with random forests

In [432]:
X =  pd.read_csv('abalone.data',names=['sex','len','diam','ht','whole_wt','shucked_wt',
                                       'viscera_wt','shell_wt','rings'])

X = X[X.sex!='I']
y = X.pop('sex')

X_train,X_test,y_train,y_test=train_test_split(X,y)

forest = RandomForestClassifier()

forest.fit(X_train,y_train)

y_predict = forest.predict(X_test)
y_train_predict = forest.predict(X_train)

print('Base model\n')
print(f'Training Set Accuracy:\n\t{overall_accuracy(y_train,y_train_predict)}')
print(f'Test Set Accuracy:\n\t{overall_accuracy(y_test,y_predict)}\n')

forest = RandomForestClassifier()

grid = {'n_estimators':[50,100,150],
        'max_depth':[d for d in range(1,15)],
        'max_features':[f for f in range(1,len(X_train.columns))]}

GRID = GridSearchCV(forest,grid,cv=8,verbose=1).fit(X_train,y_train)

print(GRID.best_params_)

Base model

Training Set Accuracy:
	1.0
Test Set Accuracy:
	0.5260930888575458

Fitting 8 folds for each of 294 candidates, totalling 2352 fits
{'max_depth': 4, 'max_features': 7, 'n_estimators': 150}


In [434]:
forest = RandomForestClassifier(n_estimators=150,
                                max_depth=4,
                                max_features=7)

forest.fit(X_train,y_train)

y_predict = forest.predict(X_test)
y_train_predict = forest.predict(X_train)

print(f'Training Set Accuracy:\n\t{overall_accuracy(y_train,y_train_predict)}')
print(f'Test Set Accuracy:\n\t{overall_accuracy(y_test,y_predict)}\n')
print(classification_report(y_test,y_predict))

Training Set Accuracy:
	0.6039510818438382
Test Set Accuracy:
	0.5669957686882934

              precision    recall  f1-score   support

           F       0.55      0.22      0.32       321
           M       0.57      0.85      0.68       388

    accuracy                           0.57       709
   macro avg       0.56      0.54      0.50       709
weighted avg       0.56      0.57      0.52       709



The random forest model performed better and had less issues with overfitting.