In [None]:
#import libraries and data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import data_load_data

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced
from imblearn.under_sampling import ClusterCentroids


#load the four raw frames through the project's data_load_data helper

mitbih_test = data_load_data.load_dataframe("test")
mitbih_train = data_load_data.load_dataframe("train")
ptbdb_abnormal = data_load_data.load_dataframe("abnormal")
ptbdb_normal = data_load_data.load_dataframe("normal")

#column that is used as the class label throughout this notebook
#NOTE(review): assumes every frame has 188 columns so that 187 is the last one - confirm
LABEL_COL = 187

#rename columns to consecutive integers so all frames share the same labels

for df in [ptbdb_abnormal, ptbdb_normal, mitbih_test, mitbih_train]:
    df.columns = range(len(df.columns))

#combine datasets, remove class 4, combine classes 1,2,3

ptbdb = pd.concat([ptbdb_abnormal, ptbdb_normal])
mitbih = pd.concat([mitbih_train, mitbih_test])

#.copy() makes the filtered frame independent of mitbih, so the recode below
#writes to it directly instead of raising SettingWithCopyWarning
mitbih_recoded = mitbih.loc[mitbih[LABEL_COL] != 4].copy()
mitbih_recoded.loc[:, LABEL_COL] = mitbih_recoded[LABEL_COL].replace([1, 2, 3], 1)

df_total = pd.concat([mitbih_recoded, ptbdb])

#split into train and test (80/20, fixed seed)
#NOTE(review): the split is not stratified although the classes are imbalanced

X = df_total.drop(LABEL_COL, axis=1)
y = df_total[LABEL_COL]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)

In [None]:
#create cluster centroids resample, fit to data
#random_state is fixed so the resampled training set (and every score that
#depends on it) is the same on each run

cc = ClusterCentroids(random_state=8)
X_cc, y_cc = cc.fit_resample(X_train, y_train)

#test on LogisticRegression model: train on the resampled data, evaluate on
#the untouched test split

lr = LogisticRegression(max_iter=1000)

lr.fit(X_cc, y_cc)
y_cc_pred = lr.predict(X_test)
print(classification_report_imbalanced(y_test, y_cc_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.90      0.69      0.65      0.78      0.67      0.45     18923
        1.0       0.32      0.65      0.69      0.43      0.67      0.45      4268

avg / total       0.79      0.68      0.66      0.72      0.67      0.45     23191



In [4]:
#mean accuracy of the logistic regression on the held-out test split
lr.score(X_test, y_test)

0.6834116683196068

In [5]:
#mean accuracy of the logistic regression on the resampled data it was trained on
lr.score(X_cc, y_cc)

0.6685136323658751

In [None]:
#logistic regression trained on the ClusterCentroids-resampled data performs worse than logistic regression on the non-resampled dataset

In [None]:
#evaluate a k-nearest-neighbours classifier trained on the resampled data

knn = KNeighborsClassifier().fit(X_cc, y_cc)
y_cc_knn_pred = knn.predict(X_test)
print(classification_report(y_test, y_cc_knn_pred))
#accuracy on the test split first, then on the resampled training data
print(knn.score(X_test, y_test), knn.score(X_cc, y_cc), sep="\n")

              precision    recall  f1-score   support

         0.0       0.98      0.97      0.97     18923
         1.0       0.86      0.90      0.88      4268

    accuracy                           0.95     23191
   macro avg       0.92      0.93      0.92     23191
weighted avg       0.95      0.95      0.95     23191

0.9536889310508387
0.9381999413661682


In [None]:
#appears to avoid the overfitting problem seen with knn on the non-resampled dataset: accuracy on the test split and on the resampled training data are close

In [None]:
#test resampling on random forest classifier
#random_state is fixed so the forest, and therefore the reported scores,
#are reproducible between runs

rf = RandomForestClassifier(random_state=8)
rf.fit(X_cc, y_cc)
y_cc_rf_pred = rf.predict(X_test)
print(classification_report(y_test, y_cc_rf_pred))
#accuracy on the test split, then on the resampled training data
print(rf.score(X_test, y_test))
print(rf.score(X_cc, y_cc))

              precision    recall  f1-score   support

         0.0       0.98      0.65      0.78     18923
         1.0       0.38      0.95      0.54      4268

    accuracy                           0.70     23191
   macro avg       0.68      0.80      0.66     23191
weighted avg       0.87      0.70      0.74     23191

0.7029451080160407
1.0


In [None]:
#probable overfitting: accuracy is 1.0 on the resampled training data but only about 0.70 on the test split