### K-NN implemented with different distance metrics

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn import neighbors
from sklearn.metrics import confusion_matrix, precision_score, recall_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Show which pandas version this notebook is running against.
print('pandas version {}'.format(pd.__version__))

pandas version 0.24.1


In [2]:
# The CSV location can be overridden with the LIVER_DATA_CSV environment
# variable; when it is not set, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    'LIVER_DATA_CSV',
    '/Users/aakashvarshney/Downloads/indian_liver_patient.csv',
)
data = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows of the loaded data.
data.head(5)

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


### Exploratory Data Analysis

In [4]:
# Report the frame's shape as (rows, columns).
print('Data set shape {} '.format(data.shape))

Data set shape (583, 11) 


It is a small dataset with 10 independent features and 1 label. Let's look at the label information.

In [5]:
# Count how many rows carry each value of the 'Dataset' label column.
data['Dataset'].value_counts()

1    416
2    167
Name: Dataset, dtype: int64

There are 167 observations from non-liver patients and 416 observations from liver patients.

In [6]:
# Recode the label: 2 becomes 0, every other value is kept as is.
data['Dataset'] = data['Dataset'].replace(2, 0)

# Encode Gender as 1 for 'Male' and 0 for anything else. Values that are
# already 1 are also treated as male, so re-running this cell leaves an
# already-encoded column unchanged instead of resetting every row to 0.
gender = data['Gender']
data['Gender'] = ((gender == 'Male') | (gender == 1)).astype('int64')

In [7]:
# Count the Null / missing values in each column (nothing is removed here).
data.isnull().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Dataset                       0
dtype: int64

There are 4 observations with missing values for Albumin_and_Globulin_Ratio. Let's drop them.

In [8]:
# Drop every row that contains a missing value. `data` is rebound to the
# filtered frame rather than being mutated in place.
data = data.dropna()

In [9]:
# Separate the feature columns from the 'Dataset' label column.
X = data.drop(columns=['Dataset'])
y = data['Dataset']

### Split data into training and test datasets

In [10]:
# Hold out 30% of the rows as the test set; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [11]:
# Standardise the features before K-NN, because the classifier depends
# heavily on distances between samples.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both the training and the test split.
scale = StandardScaler()
scale.fit(X_train)
X_train_scaled, X_test_scaled = scale.transform(X_train), scale.transform(X_test)

  return self.partial_fit(X, y)
  after removing the cwd from sys.path.
  """


In [12]:
# Preview the scaled training features, relabelled with the original column names.
pd.DataFrame(X_train_scaled,columns = X_train.columns).head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio
0,0.232937,-1.844498,-0.406846,-0.468453,-0.626836,-0.314676,-0.300257,-0.477483,-0.708987,-0.778005
1,0.983651,0.542153,-0.406846,-0.468453,-0.315105,-0.32589,-0.327225,-0.204218,-0.207433,-0.171826
2,-1.018252,0.542153,-0.406846,-0.435577,-0.421862,-0.348318,-0.337338,-2.208162,-0.959764,1.343624
3,-0.705454,0.542153,-0.262207,-0.271201,-0.39624,-0.320283,-0.31037,1.344284,1.046453,0.131264
4,1.922043,0.542153,-0.392382,-0.468453,-0.353538,-0.213753,-0.273288,-0.295306,-0.332821,-0.474916


In [13]:
def KNN_test(neighbors, p_value):
    """Fit a K-NN classifier, print its train/test scores and return them.

    The classifier is trained on the notebook-level ``X_train_scaled`` /
    ``y_train`` and scored on both that training data and on
    ``X_test_scaled`` / ``y_test``; those four names must already exist.

    Parameters
    ----------
    neighbors : int
        Number of neighbours, forwarded as ``n_neighbors``. Inside this
        function the name shadows the ``sklearn.neighbors`` module that the
        notebook imports under the same name.
    p_value : str
        Despite its name, this is the distance metric, forwarded as
        ``metric`` (for example ``'euclidean'``).

    Returns
    -------
    tuple
        ``(train_score, test_score)`` as returned by the classifier's
        ``score`` method, so callers can collect and compare results
        instead of reading them from the printed report.
    """
    knn_clf = KNeighborsClassifier(n_neighbors=neighbors, metric=p_value)
    knn_clf.fit(X_train_scaled, y_train)

    # Score each split exactly once and reuse the values for both the
    # printed report and the return value.
    train_score = knn_clf.score(X_train_scaled, y_train)
    test_score = knn_clf.score(X_test_scaled, y_test)

    print('Neighbor', neighbors)
    print('Distance type : {}'.format(p_value))
    print('-' * 10)
    print('Training Data Score : {}'.format(train_score))
    print('Test Data Score : {}'.format(test_score))
    print('-' * 40)

    return train_score, test_score

    

### Types of Distance in K-NN {euclidean, manhattan, chebyshev, minkowski, wminkowski, seuclidean, mahalanobis}

In [58]:
# Evaluate euclidean-distance K-NN for every neighbour count from 1 to 10.
for n_neighbors in range(1, 11):
    KNN_test(n_neighbors, 'euclidean')

Neighbor 1
Distance type : euclidean
----------
Training Data Score : 1.0
Test Data Score : 0.6436781609195402
----------------------------------------
Neighbor 2
Distance type : euclidean
----------
Training Data Score : 0.8296296296296296
Test Data Score : 0.632183908045977
----------------------------------------
Neighbor 3
Distance type : euclidean
----------
Training Data Score : 0.8469135802469135
Test Data Score : 0.7011494252873564
----------------------------------------
Neighbor 4
Distance type : euclidean
----------
Training Data Score : 0.7802469135802469
Test Data Score : 0.6379310344827587
----------------------------------------
Neighbor 5
Distance type : euclidean
----------
Training Data Score : 0.7901234567901234
Test Data Score : 0.6609195402298851
----------------------------------------
Neighbor 6
Distance type : euclidean
----------
Training Data Score : 0.7555555555555555
Test Data Score : 0.6206896551724138
----------------------------------------
Neighbor 7
Dis

In [59]:
# Distance metrics to compare. A list (not a set) is used so the metrics are
# always evaluated and printed in this exact order on every run.
# NOTE(review): 'jaccard' and 'dice' are documented by scikit-learn as metrics
# for boolean-valued data - confirm they are meaningful for the standardised
# continuous features used in this notebook.
distance_given = [
    'euclidean',
    'manhattan',
    'minkowski',
    'chebyshev',
    'canberra',
    'jaccard',
    'dice',
]


In [60]:
# Compare every listed distance metric with the neighbour count fixed at 5.
for metric_name in distance_given:
    KNN_test(5, metric_name)

Neighbor 5
Distance type : manhattan
----------
Training Data Score : 0.7901234567901234
Test Data Score : 0.6436781609195402
----------------------------------------
Neighbor 5
Distance type : canberra
----------
Training Data Score : 0.7679012345679013
Test Data Score : 0.6896551724137931
----------------------------------------
Neighbor 5
Distance type : jaccard
----------
Training Data Score : 0.7432098765432099
Test Data Score : 0.6494252873563219
----------------------------------------
Neighbor 5
Distance type : euclidean
----------
Training Data Score : 0.7901234567901234
Test Data Score : 0.6609195402298851
----------------------------------------
Neighbor 5
Distance type : minkowski
----------
Training Data Score : 0.7901234567901234
Test Data Score : 0.6609195402298851
----------------------------------------
Neighbor 5
Distance type : dice
----------
Training Data Score : 0.7432098765432099
Test Data Score : 0.6494252873563219
----------------------------------------
Neighb