<a href="https://colab.research.google.com/github/arijitd60/arijitd60/blob/main/KNN_with_Abalone_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

KNN with Abalone dataset

Importing Modules

In [104]:
import pandas as pd
import sklearn.model_selection
import sklearn.linear_model
import sklearn.neighbors
import sklearn.metrics
import plotly.express as px

Loading Dataset

In [94]:
df = pd.read_csv("abalone.data.csv")
df = df.set_index("ID")
df.head()

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
2,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
3,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
4,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
5,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


Converting to Numerical Value

In [95]:
df['Sex'].replace(0, 'M',inplace=True)
df['Sex'].replace(1, 'F',inplace=True)
df['Sex'].replace(2, 'I',inplace=True)

Splitting train and test sets

In [96]:
x = df.drop(["Sex"], axis=1)
y = df["Sex"]
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y)
print("df:", df.shape)
print("x:", x.shape)
print("y:", y.shape)
print("x_train:", x_train.shape)
print("x_test:", x_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

df: (4177, 9)
x: (4177, 8)
y: (4177,)
x_train: (3132, 8)
x_test: (1045, 8)
y_train: (3132,)
y_test: (1045,)


Training a Logistic Regression Model

In [97]:
model = sklearn.neighbors.KNeighborsClassifier()
model.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')


Testing the Trained Model

In [99]:
y_predicted = model.predict(x_test)
accuracy = sklearn.metrics.accuracy_score(y_test, y_predicted)
print("Accuracy =", accuracy)

cm = sklearn.metrics.confusion_matrix(y_test, y_predicted)
cm

Accuracy = 0.5368421052631579


array([[136,  31, 161],
       [ 55, 251,  34],
       [139,  64, 174]])

Hyperparameter Tuning

In [100]:
k_list = range(1,10)
metric_list = ["euclidean", "manhattan", "minkowski"]
result_df = pd.DataFrame(columns=["K", "Metric", "Accuracy"])

for k in k_list:
  for metric in metric_list:
    model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=k, metric=metric)
    model.fit(x_train, y_train)
    y_predicted = model.predict(x_test)
    accuracy = sklearn.metrics.accuracy_score(y_test, y_predicted)
    result_df = result_df.append({"K": k, "Metric": metric, "Accuracy": accuracy}, ignore_index=True)

result_df

Unnamed: 0,K,Metric,Accuracy
0,1,euclidean,0.49378
1,1,manhattan,0.480383
2,1,minkowski,0.49378
3,2,euclidean,0.494737
4,2,manhattan,0.488038
5,2,minkowski,0.494737
6,3,euclidean,0.534928
7,3,manhattan,0.541627
8,3,minkowski,0.534928
9,4,euclidean,0.519617


In [101]:
k_df = result_df[result_df["Metric"]=="euclidean"]

fig = px.line(x=k_df["K"], y=k_df["Accuracy"], labels={'x':'k', 'y':'Accuracy'})
fig.show()

In [102]:
metric_df = result_df[result_df["K"]==5]

fig = px.bar(metric_df, x='Metric', y='Accuracy')
fig.show()