# Classification with KNN

In [141]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np
# NOTE(review): `os` is not referenced in the visible cells.
import os
from sklearn.model_selection import GridSearchCV
# NOTE(review): confusion_matrix and classification_report are imported again
# by the combined import two lines below; the repetition is harmless.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report, f1_score
from sklearn.neighbors import KNeighborsClassifier

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator.
np.random.seed(18091998)

In [142]:
def numeric(df):
    """Return a DataFrame holding only the numeric columns of ``df``."""
    numeric_only = df.select_dtypes(include=[np.number])
    return numeric_only

def categoric(df):
    """Return a DataFrame holding only the ``object``-dtype columns of ``df``."""
    object_only = df.select_dtypes(include=['object'])
    return object_only

def compute_metrics(y_true,y_pred):
    """Score predictions against the true labels.

    Returns a two-element list: ``[accuracy, macro-averaged F1]``, in that
    order.
    """
    return [
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='macro'),
    ]
    
def confusion(true, pred):
    """Build a square confusion matrix from true and predicted labels.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, paired with ``true`` by position.

    Returns
    -------
    pandas.DataFrame
        Counts with one row per true label (axis named ``'target'``) and one
        column per predicted label (axis named ``'predicted'``). Both axes
        carry the same labels in the same order: the union of the labels
        seen in ``true`` and ``pred``. Combinations that never occur are 0.
    """
    # Discard any incoming index so the two series are paired by position.
    target = pd.Series(true, name='target').reset_index(drop=True)
    predicted = pd.Series(pred, name='predicted').reset_index(drop=True)

    cm = pd.crosstab(target, predicted)

    # Selecting columns with ``cm[cm.index]`` fails with a KeyError when a
    # true class is never predicted, and drops predicted labels that do not
    # occur in ``true``. Reindexing both axes on the union of labels keeps
    # the matrix square without losing or inventing counts.
    labels = cm.index.union(cm.columns)
    cm = cm.reindex(index=labels, columns=labels, fill_value=0)
    return cm.rename_axis(index='target', columns='predicted')

# Empty score table; later cells add one row per evaluated configuration.
metric_columns = ['Accuracy', 'F1-score (macro avg)']
results = pd.DataFrame(columns=metric_columns)

## KNN - MinMax

In [143]:
# Outcome indicator columns, excluded from the feature matrix
# (presumably one-hot encodings of `outcome` -- confirm against preprocessing).
outcome_columns = ['outcome_lived', 'outcome_died', 'outcome_euthanized']

# Training split: numeric features plus the matching `outcome` labels
train = pd.read_csv('./data/preprocessed/trainMinMax.csv')
train_data = numeric(train).drop(columns=outcome_columns)
train_label = train['outcome']

# Test split, prepared the same way
test = pd.read_csv('./data/preprocessed/testMinMax.csv')
test_data = numeric(test).drop(columns=outcome_columns)
test_label = test['outcome']

In [144]:
# Hyper-parameter combinations scored by the cross-validated search.
# NOTE(review): 'minkowski' scores identically to 'euclidean' in the recorded
# results (presumably because its power parameter is left at the default) --
# confirm whether it needs to stay in the grid.
search_space = {
    'n_neighbors': [1, 3, 5, 6, 7, 10, 15],
    'metric': ['euclidean', 'minkowski', 'manhattan'],
    'algorithm': ['ball_tree'],
    'weights': ['uniform', 'distance'],
}

knn = KNeighborsClassifier()

# 10-fold CV recording accuracy and macro F1 per combination. refit=False:
# the search only collects scores; the chosen model is rebuilt by hand later.
knn_cv = GridSearchCV(
    estimator=knn,
    param_grid=search_space,
    scoring=['accuracy', 'f1_macro'],
    refit=False,
    cv=10,
)

knn_cv.fit(train_data, train_label)
results_cv = pd.DataFrame(knn_cv.cv_results_)

In [145]:
# Columns of the CV result table worth showing: the searched
# hyper-parameters followed by the mean score of each metric.
searched_params = ['n_neighbors', 'metric', 'algorithm', 'weights']
mean_scores = ['accuracy', 'f1_macro']
cols = ['param_' + name for name in searched_params] + ['mean_test_' + name for name in mean_scores]

# Five best combinations by mean cross-validated accuracy
ranked_cv = results_cv[cols].sort_values(by='mean_test_accuracy', ascending=False)
ranked_cv.head(5)

Unnamed: 0,param_n_neighbors,param_metric,param_algorithm,param_weights,mean_test_accuracy,mean_test_f1_macro
39,10,manhattan,ball_tree,distance,0.719195,0.559399
11,10,euclidean,ball_tree,distance,0.712529,0.55445
25,10,minkowski,ball_tree,distance,0.712529,0.55445
21,6,minkowski,ball_tree,distance,0.699425,0.555372
7,6,euclidean,ball_tree,distance,0.699425,0.555372


In [146]:
# Hyper-parameters copied by hand from the top row of the CV ranking above
best_params = {
    'n_neighbors': 10,
    'metric': 'manhattan',
    'weights': 'distance',
    'algorithm': 'ball_tree',
}
knn = KNeighborsClassifier(**best_params)
knn.fit(train_data, train_label)

# Score the test split and store the row in the shared results table
test_predicted = knn.predict(test_data)
results.loc['KNN-MinMax-all', :] = compute_metrics(test_label, test_predicted)
results

Unnamed: 0,Accuracy,F1-score (macro avg)
KNN-MinMax-all,0.731343,0.596497


## KNN - Std

In [147]:
# Outcome indicator columns, excluded from the feature matrix
# (presumably one-hot encodings of `outcome` -- confirm against preprocessing).
outcome_columns = ['outcome_lived', 'outcome_died', 'outcome_euthanized']

# Training split: numeric features plus the matching `outcome` labels
train = pd.read_csv('./data/preprocessed/trainStd.csv')
train_data = numeric(train).drop(columns=outcome_columns)
train_label = train['outcome']

# Test split, prepared the same way
test = pd.read_csv('./data/preprocessed/testStd.csv')
test_data = numeric(test).drop(columns=outcome_columns)
test_label = test['outcome']

In [148]:
# Hyper-parameter combinations scored by the cross-validated search.
# NOTE(review): 'minkowski' scores identically to 'euclidean' in the recorded
# results (presumably because its power parameter is left at the default) --
# confirm whether it needs to stay in the grid.
search_space = {
    'n_neighbors': [1, 3, 5, 6, 7, 10, 15],
    'metric': ['euclidean', 'minkowski', 'manhattan'],
    'algorithm': ['ball_tree'],
    'weights': ['uniform', 'distance'],
}

knn = KNeighborsClassifier()

# 10-fold CV recording accuracy and macro F1 per combination. refit=False:
# the search only collects scores; the chosen model is rebuilt by hand later.
knn_cv = GridSearchCV(
    estimator=knn,
    param_grid=search_space,
    scoring=['accuracy', 'f1_macro'],
    refit=False,
    cv=10,
)

knn_cv.fit(train_data, train_label)
results_cv = pd.DataFrame(knn_cv.cv_results_)

In [149]:
# Five best combinations by mean cross-validated accuracy
ranked_cv = results_cv[cols].sort_values(by='mean_test_accuracy', ascending=False)
ranked_cv.head(5)

Unnamed: 0,param_n_neighbors,param_metric,param_algorithm,param_weights,mean_test_accuracy,mean_test_f1_macro
35,6,manhattan,ball_tree,distance,0.732759,0.60933
37,7,manhattan,ball_tree,distance,0.729195,0.600945
21,6,minkowski,ball_tree,distance,0.725632,0.606748
7,6,euclidean,ball_tree,distance,0.725632,0.606748
39,10,manhattan,ball_tree,distance,0.722529,0.571964


In [150]:
# Hyper-parameters copied by hand from the top row of the CV ranking above
best_params = {
    'n_neighbors': 6,
    'metric': 'manhattan',
    'algorithm': 'ball_tree',
    'weights': 'distance',
}
knn = KNeighborsClassifier(**best_params)
knn.fit(train_data, train_label)

# Score the test split and store the row in the shared results table
test_predicted = knn.predict(test_data)
results.loc['KNN-Std-all', :] = compute_metrics(test_label, test_predicted)
results

Unnamed: 0,Accuracy,F1-score (macro avg)
KNN-MinMax-all,0.731343,0.596497
KNN-Std-all,0.701493,0.568396


## KNN - Std - reduced

In [151]:
# Outcome indicator columns, excluded from the feature matrix
# (presumably one-hot encodings of `outcome` -- confirm against preprocessing).
outcome_columns = ['outcome_lived', 'outcome_died', 'outcome_euthanized']

# Indicator columns left out of the "reduced" feature set. Declared once so
# the train and test splits are guaranteed to drop exactly the same columns.
reduced_out_columns = [
    'nasogastricTube_none', 'nasogastricTube_significant', 'nasogastricTube_slight',
    'nasogastricReflux_<1liter', 'nasogastricReflux_>1liter', 'nasogastricReflux_none',
    'rectalExamination_absent', 'rectalExamination_decreased', 'rectalExamination_increased',
    'rectalExamination_normal', 'abdomen_distendedLarge', 'abdomen_distendedSmall',
    'abdomen_firmFecesLarge', 'abdomen_normal', 'abdomen_other',
]

# Loading train data
train = pd.read_csv('./data/preprocessed/trainStd.csv')
train_data = numeric(train).drop(columns=outcome_columns + reduced_out_columns)
# Take the labels from the file loaded in this cell, so the section does not
# depend on `train_label` left behind by an earlier section.
train_label = train['outcome']

# Loading test data
test = pd.read_csv('./data/preprocessed/testStd.csv')
test_data = numeric(test).drop(columns=outcome_columns + reduced_out_columns)
test_label = test['outcome']

In [152]:
# Hyper-parameter combinations scored by the cross-validated search.
# NOTE(review): 'minkowski' scores identically to 'euclidean' in the recorded
# results (presumably because its power parameter is left at the default) --
# confirm whether it needs to stay in the grid.
search_space = {
    'n_neighbors': [1, 3, 5, 6, 7, 10, 15],
    'metric': ['euclidean', 'minkowski', 'manhattan'],
    'algorithm': ['ball_tree'],
    'weights': ['uniform', 'distance'],
}

knn = KNeighborsClassifier()

# 10-fold CV recording accuracy and macro F1 per combination. refit=False:
# the search only collects scores; the chosen model is rebuilt by hand later.
knn_cv = GridSearchCV(
    estimator=knn,
    param_grid=search_space,
    scoring=['accuracy', 'f1_macro'],
    refit=False,
    cv=10,
)

knn_cv.fit(train_data, train_label)
results_cv = pd.DataFrame(knn_cv.cv_results_)

In [153]:
# Five best combinations by mean cross-validated accuracy
ranked_cv = results_cv[cols].sort_values(by='mean_test_accuracy', ascending=False)
ranked_cv.head(5)

Unnamed: 0,param_n_neighbors,param_metric,param_algorithm,param_weights,mean_test_accuracy,mean_test_f1_macro
27,15,minkowski,ball_tree,distance,0.728966,0.591936
13,15,euclidean,ball_tree,distance,0.728966,0.591936
39,10,manhattan,ball_tree,distance,0.722529,0.588678
26,15,minkowski,ball_tree,uniform,0.722184,0.583355
12,15,euclidean,ball_tree,uniform,0.722184,0.583355


In [154]:
# Hyper-parameters copied by hand from the top row of the CV ranking above
best_params = {
    'n_neighbors': 15,
    'metric': 'minkowski',
    'algorithm': 'ball_tree',
    'weights': 'distance',
}
knn = KNeighborsClassifier(**best_params)
knn.fit(train_data, train_label)

# Score the test split and store the row in the shared results table
test_predicted = knn.predict(test_data)
results.loc['KNN-Std-reduced', :] = compute_metrics(test_label, test_predicted)
results

Unnamed: 0,Accuracy,F1-score (macro avg)
KNN-MinMax-all,0.731343,0.596497
KNN-Std-all,0.701493,0.568396
KNN-Std-reduced,0.701493,0.533397


## KNN - MinMax - reduced

In [155]:
# Outcome indicator columns, excluded from the feature matrix
# (presumably one-hot encodings of `outcome` -- confirm against preprocessing).
outcome_columns = ['outcome_lived', 'outcome_died', 'outcome_euthanized']

# Indicator columns left out of the "reduced" feature set. Declared once so
# the train and test splits are guaranteed to drop exactly the same columns.
reduced_out_columns = [
    'nasogastricTube_none', 'nasogastricTube_significant', 'nasogastricTube_slight',
    'nasogastricReflux_<1liter', 'nasogastricReflux_>1liter', 'nasogastricReflux_none',
    'rectalExamination_absent', 'rectalExamination_decreased', 'rectalExamination_increased',
    'rectalExamination_normal', 'abdomen_distendedLarge', 'abdomen_distendedSmall',
    'abdomen_firmFecesLarge', 'abdomen_normal', 'abdomen_other',
]

# Loading train data
train = pd.read_csv('./data/preprocessed/trainMinMax.csv')
train_data = numeric(train).drop(columns=outcome_columns + reduced_out_columns)
# Take the labels from the file loaded in this cell. Without this, the
# labels in use are whatever an earlier section last assigned, which may
# come from a different file.
train_label = train['outcome']

# Loading test data
test = pd.read_csv('./data/preprocessed/testMinMax.csv')
test_data = numeric(test).drop(columns=outcome_columns + reduced_out_columns)
test_label = test['outcome']

In [156]:
# Hyper-parameter combinations scored by the cross-validated search.
# NOTE(review): 'minkowski' scores identically to 'euclidean' in the recorded
# results (presumably because its power parameter is left at the default) --
# confirm whether it needs to stay in the grid.
search_space = {
    'n_neighbors': [1, 3, 5, 6, 7, 10, 15],
    'metric': ['euclidean', 'minkowski', 'manhattan'],
    'algorithm': ['ball_tree'],
    'weights': ['uniform', 'distance'],
}

knn = KNeighborsClassifier()

# 10-fold CV recording accuracy and macro F1 per combination. refit=False:
# the search only collects scores; the chosen model is rebuilt by hand later.
knn_cv = GridSearchCV(
    estimator=knn,
    param_grid=search_space,
    scoring=['accuracy', 'f1_macro'],
    refit=False,
    cv=10,
)

knn_cv.fit(train_data, train_label)
results_cv = pd.DataFrame(knn_cv.cv_results_)

In [157]:
# Five best combinations by mean cross-validated accuracy
ranked_cv = results_cv[cols].sort_values(by='mean_test_accuracy', ascending=False)
ranked_cv.head(5)

Unnamed: 0,param_n_neighbors,param_metric,param_algorithm,param_weights,mean_test_accuracy,mean_test_f1_macro
27,15,minkowski,ball_tree,distance,0.715747,0.582649
13,15,euclidean,ball_tree,distance,0.715747,0.582649
41,15,manhattan,ball_tree,distance,0.712529,0.583954
25,10,minkowski,ball_tree,distance,0.705862,0.566033
11,10,euclidean,ball_tree,distance,0.705862,0.566033


In [158]:
# Hyper-parameters copied by hand from the top row of the CV ranking above
best_params = {
    'n_neighbors': 15,
    'metric': 'minkowski',
    'algorithm': 'ball_tree',
    'weights': 'distance',
}
knn = KNeighborsClassifier(**best_params)
knn.fit(train_data, train_label)

# Score the test split and store the row in the shared results table
test_predicted = knn.predict(test_data)
results.loc['KNN-MinMax-reduced', :] = compute_metrics(test_label, test_predicted)
results

Unnamed: 0,Accuracy,F1-score (macro avg)
KNN-MinMax-all,0.731343,0.596497
KNN-Std-all,0.701493,0.568396
KNN-Std-reduced,0.701493,0.533397
KNN-MinMax-reduced,0.686567,0.479588


## KNN - Std - minimal

In [159]:
# Outcome indicator columns (presumably one-hot encodings of `outcome` --
# confirm against preprocessing). They are removed *before* slicing, so the
# "first five" features can never include one, whatever the CSV column order.
outcome_columns = ['outcome_lived', 'outcome_died', 'outcome_euthanized']

# Loading train data: keep only the first five numeric feature columns
train = pd.read_csv('./data/preprocessed/trainStd.csv')
train_data = numeric(train).drop(columns=outcome_columns).iloc[:, :5]
# Take the labels from the file loaded in this cell, so the section does not
# depend on `train_label` left behind by an earlier section.
train_label = train['outcome']

# Loading test data, prepared the same way
test = pd.read_csv('./data/preprocessed/testStd.csv')
test_data = numeric(test).drop(columns=outcome_columns).iloc[:, :5]
test_label = test['outcome']

In [160]:
# Hyper-parameter combinations scored by the cross-validated search.
# NOTE(review): 'minkowski' scores identically to 'euclidean' in the recorded
# results (presumably because its power parameter is left at the default) --
# confirm whether it needs to stay in the grid.
search_space = {
    'n_neighbors': [1, 3, 5, 6, 7, 10, 15],
    'metric': ['euclidean', 'minkowski', 'manhattan'],
    'algorithm': ['ball_tree'],
    'weights': ['uniform', 'distance'],
}

knn = KNeighborsClassifier()

# 10-fold CV recording accuracy and macro F1 per combination. refit=False:
# the search only collects scores; the chosen model is rebuilt by hand later.
knn_cv = GridSearchCV(
    estimator=knn,
    param_grid=search_space,
    scoring=['accuracy', 'f1_macro'],
    refit=False,
    cv=10,
)

knn_cv.fit(train_data, train_label)
results_cv = pd.DataFrame(knn_cv.cv_results_)

In [161]:
# Five best combinations by mean cross-validated accuracy
ranked_cv = results_cv[cols].sort_values(by='mean_test_accuracy', ascending=False)
ranked_cv.head(5)

Unnamed: 0,param_n_neighbors,param_metric,param_algorithm,param_weights,mean_test_accuracy,mean_test_f1_macro
21,6,minkowski,ball_tree,distance,0.725862,0.588778
7,6,euclidean,ball_tree,distance,0.725862,0.588778
37,7,manhattan,ball_tree,distance,0.715747,0.557229
35,6,manhattan,ball_tree,distance,0.715747,0.553161
9,7,euclidean,ball_tree,distance,0.715747,0.55714


In [162]:
# Hyper-parameters copied by hand from the top row of the CV ranking above
best_params = {
    'n_neighbors': 6,
    'metric': 'minkowski',
    'algorithm': 'ball_tree',
    'weights': 'distance',
}
knn = KNeighborsClassifier(**best_params)
knn.fit(train_data, train_label)

# Score the test split and store the row in the shared results table
test_predicted = knn.predict(test_data)
results.loc['KNN-Std-minimal', :] = compute_metrics(test_label, test_predicted)
results

Unnamed: 0,Accuracy,F1-score (macro avg)
KNN-MinMax-all,0.731343,0.596497
KNN-Std-all,0.701493,0.568396
KNN-Std-reduced,0.701493,0.533397
KNN-MinMax-reduced,0.686567,0.479588
KNN-Std-minimal,0.686567,0.494253


## KNN - MinMax - minimal

In [163]:
# Outcome indicator columns (presumably one-hot encodings of `outcome` --
# confirm against preprocessing). They are removed *before* slicing, so the
# "first five" features can never include one, whatever the CSV column order.
outcome_columns = ['outcome_lived', 'outcome_died', 'outcome_euthanized']

# Loading train data: keep only the first five numeric feature columns
train = pd.read_csv('./data/preprocessed/trainMinMax.csv')
train_data = numeric(train).drop(columns=outcome_columns).iloc[:, :5]
# Take the labels from the file loaded in this cell. Without this, the
# labels in use are whatever an earlier section last assigned, which may
# come from a different file.
train_label = train['outcome']

# Loading test data, prepared the same way
test = pd.read_csv('./data/preprocessed/testMinMax.csv')
test_data = numeric(test).drop(columns=outcome_columns).iloc[:, :5]
test_label = test['outcome']

In [164]:
# Hyper-parameter combinations scored by the cross-validated search.
# NOTE(review): 'minkowski' scores identically to 'euclidean' in the recorded
# results (presumably because its power parameter is left at the default) --
# confirm whether it needs to stay in the grid.
search_space = {
    'n_neighbors': [1, 3, 5, 6, 7, 10, 15],
    'metric': ['euclidean', 'minkowski', 'manhattan'],
    'algorithm': ['ball_tree'],
    'weights': ['uniform', 'distance'],
}

knn = KNeighborsClassifier()

# 10-fold CV recording accuracy and macro F1 per combination. refit=False:
# the search only collects scores; the chosen model is rebuilt by hand later.
knn_cv = GridSearchCV(
    estimator=knn,
    param_grid=search_space,
    scoring=['accuracy', 'f1_macro'],
    refit=False,
    cv=10,
)

knn_cv.fit(train_data, train_label)
results_cv = pd.DataFrame(knn_cv.cv_results_)

In [165]:
# Five best combinations by mean cross-validated accuracy
ranked_cv = results_cv[cols].sort_values(by='mean_test_accuracy', ascending=False)
ranked_cv.head(5)

Unnamed: 0,param_n_neighbors,param_metric,param_algorithm,param_weights,mean_test_accuracy,mean_test_f1_macro
25,10,minkowski,ball_tree,distance,0.705862,0.531966
11,10,euclidean,ball_tree,distance,0.705862,0.531966
41,15,manhattan,ball_tree,distance,0.705747,0.492757
37,7,manhattan,ball_tree,distance,0.69908,0.529498
5,5,euclidean,ball_tree,distance,0.695632,0.537489


In [166]:
# Hyper-parameters copied by hand from the top row of the CV ranking above
best_params = {
    'n_neighbors': 10,
    'metric': 'minkowski',
    'algorithm': 'ball_tree',
    'weights': 'distance',
}
knn = KNeighborsClassifier(**best_params)
knn.fit(train_data, train_label)

# Score the test split and store the row in the shared results table
test_predicted = knn.predict(test_data)
results.loc['KNN-MinMax-minimal', :] = compute_metrics(test_label, test_predicted)
results

Unnamed: 0,Accuracy,F1-score (macro avg)
KNN-MinMax-all,0.731343,0.596497
KNN-Std-all,0.701493,0.568396
KNN-Std-reduced,0.701493,0.533397
KNN-MinMax-reduced,0.686567,0.479588
KNN-Std-minimal,0.686567,0.494253
KNN-MinMax-minimal,0.716418,0.561563
