In [1]:
#import libraries and data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import data_load_data

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced
from imblearn.under_sampling import ClusterCentroids


#load the four source tables through the project loader (call order: test, train, abnormal, normal)
mitbih_test, mitbih_train, ptbdb_abnormal, ptbdb_normal = (
    data_load_data.load_dataframe(split_name)
    for split_name in ("test", "train", "abnormal", "normal")
)

#rename columns to positional integers 0..n-1 so all four frames share the same labels

for df in (ptbdb_abnormal, ptbdb_normal, mitbih_test, mitbih_train):
    df.columns = list(range(df.shape[1]))

#combine datasets, remove class 4, combine classes 1,2,3

ptbdb = pd.concat([ptbdb_abnormal, ptbdb_normal])
mitbih = pd.concat([mitbih_train, mitbih_test])

#column 187 is the label column (it becomes y below).
#.copy() makes the filtered frame independent of mitbih, so the in-place
#recoding on the next line cannot raise SettingWithCopyWarning and never
#depends on whether pandas returned a view or a copy.
mitbih_recoded = mitbih.loc[mitbih[187] != 4].copy()
mitbih_recoded.loc[:, 187] = mitbih_recoded[187].replace([1,2,3], 1)

#stack the recoded mitbih rows on top of the ptbdb rows (original row indices are kept)
df_total = pd.concat([mitbih_recoded, ptbdb])

#split into train and test: column 187 is the target, every other column is a feature

y = df_total[187]
X = df_total.drop(columns=187)

#hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=8,
)

In [2]:
#create and fit SMOTE resampler (applied to the training split only; the test split is left as-is)

#fixed seed: SMOTE generates synthetic samples randomly, so without it the
#resampled data and every score derived from it change on each re-run
sm = SMOTE(random_state=8)
X_sm, y_sm = sm.fit_resample(X_train, y_train)

#evaluate knn model on resampled data

knn = KNeighborsClassifier()
knn.fit(X_sm, y_sm)
print(knn.score(X_test, y_test))

0.9575697468845673


In [3]:
#overfitting check: accuracy on the data the knn model was fitted on
#(the SMOTE-resampled training set, not the raw X_train)

knn.score(X=X_sm, y=y_sm)

0.9864811317016471

In [4]:
#per-class precision / recall / f1 of the knn model on the held-out test split
knn_test_pred = knn.predict(X_test)
print(classification_report(y_test, knn_test_pred))

              precision    recall  f1-score   support

         0.0       0.98      0.96      0.97     18923
         1.0       0.85      0.93      0.89      4268

    accuracy                           0.96     23191
   macro avg       0.92      0.95      0.93     23191
weighted avg       0.96      0.96      0.96     23191



In [5]:
#higher score on the training set suggests overfitting, though not as strongly as in some other cases of suspected overfitting (accuracy score of 1 on the training set)

#SMOTE does not appear to improve the performance of the knn model; cluster centroids is a better resampling method, as it seems to reduce overfitting

In [6]:
#create, fit, and evaluate lr model on the non-resampled training split

#fit() returns the estimator itself, so construction and fitting can be chained
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

#accuracy on the test split first, then on the training split
for X_part, y_part in ((X_test, y_test), (X_train, y_train)):
    print(lr.score(X_part, y_part))

0.8404984692337545
0.8392695442197404


In [7]:
#create SMOTE resampler, fit to data and evaluate

#fixed seed: SMOTE generates synthetic samples randomly, so without it the
#resampled data and the scores below change on each re-run
sm = SMOTE(random_state=8)
X_sm, y_sm = sm.fit_resample(X_train, y_train)

#NOTE: this refits the existing lr object, so from here on lr is the
#SMOTE-trained model and the earlier non-resampled fit is discarded
lr.fit(X_sm, y_sm)
print(lr.score(X_test, y_test))

0.725108878444224


In [8]:
#per-class metrics of the SMOTE-trained lr model on the held-out test split
lr_test_pred = lr.predict(X_test)
print(classification_report(y_test, lr_test_pred))

#accuracy on the resampled training data, for comparison with the test accuracy
lr_resampled_train_acc = lr.score(X_sm, y_sm)
print(lr_resampled_train_acc)

              precision    recall  f1-score   support

         0.0       0.91      0.73      0.81     18923
         1.0       0.37      0.68      0.48      4268

    accuracy                           0.73     23191
   macro avg       0.64      0.71      0.65     23191
weighted avg       0.81      0.73      0.75     23191

0.7145187494221295


In [9]:
#the lr model trained on SMOTE-resampled data performs worse than the lr model trained on the non-resampled dataset