<a href="https://colab.research.google.com/github/arijitd60/arijitd60/blob/main/KNN_with_WaterQuality_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

KNN Classification with WaterQuality dataset

In [6]:
import pandas as pd
import sklearn.model_selection
import sklearn.linear_model
import sklearn.neighbors
import sklearn.metrics
import plotly.express as px

Loading the WaterQuality Dataset

In [7]:
df = pd.read_csv("waterQuality.csv")
df = df.set_index("Number")
df.head()

Unnamed: 0_level_0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,viruses,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.65,9.08,0.04,2.85,0.007,0.35,0.83,0.17,0.05,0.2,0.0,0.054,16.08,1.13,0.007,37.75,6.78,0.08,0.34,0.02,1
2,2.32,21.16,0.01,3.31,0.002,5.28,0.68,0.66,0.9,0.65,0.65,0.1,2.01,1.93,0.003,32.26,3.21,0.08,0.27,0.05,1
3,1.01,14.02,0.04,0.58,0.008,4.24,0.53,0.02,0.99,0.05,0.003,0.078,14.16,1.11,0.006,50.28,7.07,0.07,0.44,0.01,0
4,1.36,11.33,0.04,2.96,0.001,7.23,0.03,1.66,1.08,0.71,0.71,0.016,1.41,1.29,0.004,9.12,1.72,0.02,0.45,0.05,1
5,0.92,24.33,0.03,0.2,0.006,2.67,0.69,0.57,0.61,0.13,0.001,0.117,6.74,1.11,0.003,16.9,2.41,0.02,0.06,0.02,1


Splitting train and test sets

In [8]:
x = df.drop(["is_safe"], axis=1)
y = df["is_safe"]
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y)
print("df:", df.shape)
print("x:", x.shape)
print("y:", y.shape)
print("x_train:", x_train.shape)
print("x_test:", x_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

df: (7999, 21)
x: (7999, 20)
y: (7999,)
x_train: (5999, 20)
x_test: (2000, 20)
y_train: (5999,)
y_test: (2000,)


Training a K-Nearest Neighbor Model

In [9]:
model = sklearn.neighbors.KNeighborsClassifier()
model.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

Testing the Trained Model

In [10]:
y_predicted = model.predict(x_test)
accuracy = sklearn.metrics.accuracy_score(y_test, y_predicted)
print("Accuracy =", accuracy)

cm = sklearn.metrics.confusion_matrix(y_test, y_predicted)
cm

Accuracy = 0.882


array([[1736,   50],
       [ 186,   28]])

Hyperparameter Tuning

In [11]:
k_list = range(1,10)
metric_list = ["euclidean", "manhattan", "minkowski"]
result_df = pd.DataFrame(columns=["K", "Metric", "Accuracy"])

for k in k_list:
  for metric in metric_list:
    model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=k, metric=metric)
    model.fit(x_train, y_train)
    y_predicted = model.predict(x_test)
    accuracy = sklearn.metrics.accuracy_score(y_test, y_predicted)
    result_df = result_df.append({"K": k, "Metric": metric, "Accuracy": accuracy}, ignore_index=True)

result_df

Unnamed: 0,K,Metric,Accuracy
0,1,euclidean,0.8345
1,1,manhattan,0.8305
2,1,minkowski,0.8345
3,2,euclidean,0.89
4,2,manhattan,0.89
5,2,minkowski,0.89
6,3,euclidean,0.873
7,3,manhattan,0.877
8,3,minkowski,0.873
9,4,euclidean,0.8865


Visualize

In [12]:
k_df = result_df[result_df["Metric"]=="euclidean"]

fig = px.line(x=k_df["K"], y=k_df["Accuracy"], labels={'x':'k', 'y':'Accuracy'})
fig.show()

In [13]:
metric_df = result_df[result_df["K"]==5]

fig = px.bar(metric_df, x='Metric', y='Accuracy')
fig.show()