In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import statistics as st

# 2. Analyze the performance of kNN on the heart dataset in terms of accuracy, recall, precision, FPR, and ROC metrics.

In [None]:
# Read the heart-disease CSV (path is relative to the notebook's working directory)
# and preview the first rows.
heart_csv_path = 'D6_Heart_Dataset_2.csv'
heartdata = pd.read_csv(heart_csv_path)
heartdata.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [None]:
# Separate the label column from the features, then hold out 20% of the rows
# for testing. random_state is fixed so the split is reproducible.
Y = heartdata['target']
X = heartdata.drop(columns='target')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0, test_size=0.2)
# Print the shape of each piece in the order: X_train, X_test, Y_train, Y_test.
for split_part in (X_train, X_test, Y_train, Y_test):
  print(split_part.shape)

(242, 13)
(61, 13)
(242,)
(61,)


In [None]:
# List the distinct class labels present in the training split.
Y_train.unique()

array([1, 0])

In [None]:
# Count the training samples per class to check how balanced the classes are.
Y_train.value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,131
0,111


In [None]:
# Fit a 7-nearest-neighbour classifier (default distance metric) on the training
# split and predict the labels of the test split.
# NOTE(review): the features are fed in unscaled; kNN distances may be dominated
# by the columns with the largest numeric range -- consider standardising.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, Y_train)
Y_pred = knn.predict(X_test)

In [None]:
# Evaluate the fitted `knn` classifier on the held-out test split.
# The confusion matrix is computed once and reused for the false-positive rate.
cm = confusion_matrix(Y_test, Y_pred)
# Binary problem: rows are true labels, columns are predicted labels,
# so the flattened matrix is (tn, fp, fn, tp).
tn, fp, fn, tp = cm.ravel()
# FPR = FP / (FP + TN); guard against a test split with no negative samples.
fpr = fp / (fp + tn) if (fp + tn) else float('nan')
# ROC AUC needs a continuous score for the positive class. Passing the hard 0/1
# predictions collapses the ROC curve to a single operating point.
# predict_proba columns follow knn.classes_ (sorted labels), so column 1 is class 1.
Y_score = knn.predict_proba(X_test)[:, 1]
print('The accuracy is '+str(accuracy_score(Y_test,Y_pred)*100))
print('The recall is '+str(recall_score(Y_test,Y_pred)*100))
print('The precision is '+str(precision_score(Y_test,Y_pred)*100))
print('The FPR is '+str(fpr*100))
print("The roc score is", roc_auc_score(Y_test, Y_score)*100)
print(cm)
target_names = ['class 0', 'class 1']
print(classification_report(Y_test, Y_pred, target_names=target_names))

The accuracy is 67.21311475409836
The recall is 67.64705882352942
The precision is 71.875
The roc score is 67.15686274509805
[[18  9]
 [11 23]]
              precision    recall  f1-score   support

     class 0       0.62      0.67      0.64        27
     class 1       0.72      0.68      0.70        34

    accuracy                           0.67        61
   macro avg       0.67      0.67      0.67        61
weighted avg       0.68      0.67      0.67        61



# 4. Test the kNN algorithm with Euclidean and Manhattan distance metrics on the heart dataset. Study the impact on the performance metrics.


In [None]:
# k-nearest-neighbours classifier written from scratch with Euclidean distance
def euclidean(inpt,X,Y,k):
  """Predict the label of one sample by majority vote of its k nearest rows of X.

  Parameters
  ----------
  inpt : 1-D array-like or pandas Series
      Feature values of the sample to classify, one value per column of X.
  X : 2-D array-like or pandas DataFrame
      Reference samples, one row per sample.
  Y : 1-D array-like or pandas Series
      Labels of the reference samples, in the same row order as X.
  k : int
      Number of nearest neighbours that vote; must be at least 1.

  Returns
  -------
  The most common label among the k nearest reference samples, as chosen by
  statistics.mode.

  Raises
  ------
  ValueError
      If k is smaller than 1.
  """
  if k < 1:
    raise ValueError('k must be at least 1')
  # Keep the column alignment that pandas arithmetic would apply before
  # dropping down to plain arrays.
  if isinstance(X, pd.DataFrame) and isinstance(inpt, pd.Series):
    inpt = inpt.reindex(X.columns)
  # Work on plain arrays so neighbours are picked by row POSITION. Indexing a
  # Series with the argsort result is a label lookup, which returns the wrong
  # labels (or raises KeyError) when the index is not 0..n-1, e.g. after
  # train_test_split has shuffled the rows.
  features = np.asarray(X, dtype=float)
  query = np.asarray(inpt, dtype=float)
  labels = np.asarray(Y)
  distances = np.sqrt(((features - query) ** 2).sum(axis=1))
  # A stable sort keeps the original row order among equally distant samples.
  nearest = np.argsort(distances, kind='stable')[:k]
  return st.mode(labels[nearest])

In [None]:
# Try the hand-written classifier on the row labelled 0 of X with k=3.
# Note: that row is itself part of the reference set X passed in, so it is
# always one of its own nearest neighbours (distance 0).
print('The predicted class is ',euclidean(X.loc[0],X,Y,3))


## Using Euclidean Distance

In [None]:
# Sweep the odd neighbour counts 1..9 with the Euclidean metric and record the
# test accuracy (in percent, rounded to 2 decimals) for each k.
# NOTE(review): k is chosen on the test split here, so the best score is an
# optimistic estimate.
scores = dict()
for i in (1, 3, 5, 7, 9):
  euclideanKNN = KNeighborsClassifier(metric='euclidean', n_neighbors=i).fit(X_train, Y_train)
  Y_pred = euclideanKNN.predict(X_test)
  accurate = round(100 * accuracy_score(Y_test, Y_pred), 2)
  scores[i] = accurate
print(scores)
# k with the highest recorded accuracy (the first such k on ties).
print(max(scores, key=lambda k_value: scores[k_value]))

{1: 52.46, 3: 63.93, 5: 63.93, 7: 67.21, 9: 67.21}
7


In [None]:
# Report the metrics of the best-scoring k from `scores`.
# The sweep loop leaves Y_pred holding the predictions of the LAST k it tried,
# which is not necessarily the best one, so the model is refitted with the
# best k before reporting.
best_k = max(scores, key=scores.get)
best_euclidean_knn = KNeighborsClassifier(n_neighbors=best_k, metric='euclidean').fit(X_train, Y_train)
Y_pred_best = best_euclidean_knn.predict(X_test)
print('The accuracy is '+str(accuracy_score(Y_test,Y_pred_best)*100))
print(classification_report(Y_test,Y_pred_best))

The accuracy is 67.21311475409836
              precision    recall  f1-score   support

           0       0.63      0.63      0.63        27
           1       0.71      0.71      0.71        34

    accuracy                           0.67        61
   macro avg       0.67      0.67      0.67        61
weighted avg       0.67      0.67      0.67        61



## Using Manhattan Distance

In [None]:
# Sweep the odd neighbour counts 1..9 with the Manhattan metric and record the
# test accuracy (in percent, rounded to 2 decimals) for each k.
# Despite its name, `euclideanManhattan` is a Manhattan-distance classifier.
scores = dict()
for i in (1, 3, 5, 7, 9):
  euclideanManhattan = KNeighborsClassifier(metric='manhattan', n_neighbors=i)
  euclideanManhattan.fit(X_train, Y_train)
  Y_pred = euclideanManhattan.predict(X_test)
  test_accuracy = accuracy_score(Y_test, Y_pred)
  scores[i] = round(test_accuracy * 100, 2)
print(scores)
# k with the highest recorded accuracy (the first such k on ties).
print(max(scores, key=lambda k_value: scores[k_value]))

{1: 60.66, 3: 65.57, 5: 68.85, 7: 70.49, 9: 72.13}
9


In [None]:
# Report the metrics of the best-scoring k from `scores`.
# The sweep loop leaves Y_pred holding the predictions of the LAST k it tried;
# refitting with the best k keeps this report correct even when the best k is
# not the final one in the sweep.
best_k = max(scores, key=scores.get)
best_manhattan_knn = KNeighborsClassifier(n_neighbors=best_k, metric='manhattan').fit(X_train, Y_train)
Y_pred_best = best_manhattan_knn.predict(X_test)
print('The accuracy is '+str(accuracy_score(Y_test,Y_pred_best)*100))
print(classification_report(Y_test,Y_pred_best))

The accuracy is 72.1311475409836
              precision    recall  f1-score   support

           0       0.68      0.70      0.69        27
           1       0.76      0.74      0.75        34

    accuracy                           0.72        61
   macro avg       0.72      0.72      0.72        61
weighted avg       0.72      0.72      0.72        61



# 6. Test kNN on the spambase dataset and compare its performance with the results obtained on the heart disease dataset.

In [None]:
# Read the spambase CSV (path is relative to the notebook's working directory)
# and preview the first rows.
spambase_csv_path = 'spambase.csv'
spambasedata = pd.read_csv(spambase_csv_path)
spambasedata.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [None]:
# Separate the `spam` label from the features, then hold out 20% of the rows
# for testing. This rebinds X, Y and the four split variables that previously
# held the heart data.
Y = spambasedata['spam']
X = spambasedata.drop(columns='spam')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0, test_size=0.2)
# Print the shape of each piece in the order: X_train, X_test, Y_train, Y_test.
for split_part in (X_train, X_test, Y_train, Y_test):
  print(split_part.shape)

(3680, 57)
(921, 57)
(3680,)
(921,)


In [None]:
# Sweep the odd neighbour counts 1..9 with the Euclidean metric on spambase and
# record the test accuracy (in percent, rounded to 2 decimals) for each k.
scores = dict()
for i in (1, 3, 5, 7, 9):
  kn = KNeighborsClassifier(metric='euclidean', n_neighbors=i).fit(X_train, Y_train)
  Y_pred = kn.predict(X_test)
  accurate = round(100 * accuracy_score(Y_test, Y_pred), 2)
  scores[i] = accurate
print(scores)

{1: 80.46, 3: 79.8, 5: 80.56, 7: 80.35, 9: 79.8}


In [None]:
# Print the k with the highest accuracy recorded in `scores`.
print(max(scores,key=scores.get))

# Left disabled: the sweep loop leaves Y_pred holding the predictions of the
# last k it tried, so this line would not report the accuracy of the best k.
# print(f"The accuracy is {round((accuracy_score(Y_test,Y_pred)*100),2)}%")

5


In [None]:
# Sweep the odd neighbour counts 1..9 with the Manhattan metric on spambase and
# record the test accuracy (in percent, rounded to 2 decimals) for each k.
scores = dict()
for i in (1, 3, 5, 7, 9):
  kn = KNeighborsClassifier(metric='manhattan', n_neighbors=i).fit(X_train, Y_train)
  Y_pred = kn.predict(X_test)
  accurate = round(100 * accuracy_score(Y_test, Y_pred), 2)
  scores[i] = accurate
print(scores)
# k with the highest recorded accuracy (the first such k on ties).
print(max(scores, key=lambda k_value: scores[k_value]))

{1: 84.15, 3: 82.95, 5: 84.04, 7: 83.17, 9: 82.52}
1
