In [None]:
#import libraries and data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced
from imblearn.under_sampling import ClusterCentroids


mitbih_test = pd.read_csv('../../input/mitbih_test.csv')
mitbih_train = pd.read_csv('../../input/mitbih_train.csv')
ptbdb_abnormal = pd.read_csv('../../input/ptbdb_abnormal.csv')
ptbdb_normal = pd.read_csv('../../input/ptbdb_normal.csv')

#rename columns

for df in [ptbdb_abnormal, ptbdb_normal, mitbih_test, mitbih_train]:
    df.columns = [i for i in range(len(df.columns))]

#combine datasets, remove class 4, combine classes 1,2,3

ptbdb = pd.concat([ptbdb_abnormal, ptbdb_normal])
mitbih = pd.concat([mitbih_train, mitbih_test])

mitbih_recoded = mitbih.loc[mitbih[187] != 4]
mitbih_recoded.loc[:, 187] = mitbih_recoded[187].replace([1,2,3], 1)

df_total = pd.concat([mitbih_recoded, ptbdb])

#split into train and test

X = df_total.drop(187, axis=1)
y = df_total[187]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)

In [7]:
#create knn model and paramaters to test

knn = KNeighborsClassifier()
param_grid = {'n_neighbors': range(1,31)}

#create grid search, fit to data, return best score and params

grid = GridSearchCV(knn, param_grid, scoring='accuracy', n_jobs=-1, cv=5)
grid.fit(X_train, y_train)

print('best params: ', grid.best_params_)
print('best score: ', grid.best_score_)

best params:  {'n_neighbors': 1}
best score:  0.9670346368748325


In [8]:
#evaluate best estimator on training and test set

knn_best = grid.best_estimator_

print('best estimator score on training set: ', knn_best.score(X_train, y_train))
print('best estimator score on test set: ', knn_best.score(X_test, y_test))
print('best estimator classification report: \n', classification_report(y_test, knn_best.predict(X_test)))

best estimator score on training set:  1.0
best estimator score on test set:  0.9686947522745892
best estimator classification report: 
               precision    recall  f1-score   support

         0.0       0.98      0.99      0.98     18923
         1.0       0.93      0.90      0.91      4268

    accuracy                           0.97     23191
   macro avg       0.95      0.94      0.95     23191
weighted avg       0.97      0.97      0.97     23191

