In [1]:
#import libraries and data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import data_load_data

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced
from imblearn.under_sampling import ClusterCentroids


# Load the four raw frames: the MIT-BIH train/test splits and the PTBDB
# abnormal/normal recordings.
mitbih_test = data_load_data.load_dataframe("test")
mitbih_train = data_load_data.load_dataframe("train")
ptbdb_abnormal = data_load_data.load_dataframe("abnormal")
ptbdb_normal = data_load_data.load_dataframe("normal")

# Rename columns to positional integers so all four frames share the same
# labels; column 187 is used as the target further down.
for df in [ptbdb_abnormal, ptbdb_normal, mitbih_test, mitbih_train]:
    df.columns = list(range(len(df.columns)))

# Combine datasets, remove class 4, collapse classes 1, 2, 3 into class 1.
ptbdb = pd.concat([ptbdb_abnormal, ptbdb_normal])
mitbih = pd.concat([mitbih_train, mitbih_test])

# .copy() makes the filtered frame independent of `mitbih`, so the recode
# below is a plain assignment on its own data instead of a write into a
# selection of another frame (which triggers SettingWithCopyWarning).
mitbih_recoded = mitbih.loc[mitbih[187] != 4].copy()
mitbih_recoded.loc[:, 187] = mitbih_recoded[187].replace([1, 2, 3], 1)

df_total = pd.concat([mitbih_recoded, ptbdb])

# Split into train and test: every column except 187 is a feature,
# column 187 is the target. random_state is fixed for a reproducible split.
X = df_total.drop(187, axis=1)
y = df_total[187]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)

In [2]:
# Baseline: logistic regression fitted on the unresampled training split.
# X_train / X_test / y_train / y_test are the split produced in the first
# cell (test_size=0.2, random_state=8); it is reused here rather than
# recomputed so the split is defined in exactly one place.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Confusion table: rows are the true labels (column 187), columns are the
# predicted labels.
print(pd.crosstab(y_test, y_pred))

col_0    0.0   1.0
187               
0.0    18459   464
1.0     3235  1033


In [3]:

# Per-class precision / recall / f1 for the predictions stored in y_pred.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.85      0.98      0.91     18923
         1.0       0.69      0.24      0.36      4268

    accuracy                           0.84     23191
   macro avg       0.77      0.61      0.63     23191
weighted avg       0.82      0.84      0.81     23191



In [4]:
# Mean accuracy of the current `lr` fit on the held-out test split.
lr.score(X=X_test, y=y_test)

0.8404984692337545

In [5]:
# Overfitting check: accuracy on the training split, to be compared with the
# test-split accuracy from the previous cell.
lr.score(X=X_train, y=y_train)

0.8392695442197404

In [6]:
# Rebalance the training split by random oversampling, then refit.
# random_state is pinned (same seed as the train/test split) so the resampled
# set, and therefore every metric derived from it, is reproducible across runs.
ro = RandomOverSampler(random_state=8)
X_ro, y_ro = ro.fit_resample(X_train, y_train)

# NOTE: this refits the same `lr` object, replacing the previous fit; the
# cells that follow evaluate this refitted model.
lr.fit(X_ro, y_ro)
lr.score(X_test, y_test)

0.7259712819628305

In [7]:
# Evaluate the current `lr` fit: per-class report on the test split, then
# accuracy on the original (unresampled) training split.
y_pred_test = lr.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_test))
print(lr.score(X=X_train, y=y_train))

              precision    recall  f1-score   support

         0.0       0.91      0.74      0.81     18923
         1.0       0.37      0.68      0.48      4268

    accuracy                           0.73     23191
   macro avg       0.64      0.71      0.65     23191
weighted avg       0.81      0.73      0.75     23191

0.7279116898796947


In [8]:


# Rebalance the training split by random undersampling, then refit.
# random_state is pinned (same seed as the train/test split) so the resampled
# set, and therefore every metric derived from it, is reproducible across runs.
ru = RandomUnderSampler(random_state=8)
X_ru, y_ru = ru.fit_resample(X_train, y_train)

# NOTE: this refits the same `lr` object, replacing the previous fit; the
# cells that follow evaluate this refitted model.
lr.fit(X_ru, y_ru)
lr.score(X_test, y_test)

0.7245914363330602

In [9]:
# Evaluate the current `lr` fit: per-class report on the test split, then
# accuracy on the original (unresampled) training split.
y_pred_test = lr.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_test))
print(lr.score(X=X_train, y=y_train))

              precision    recall  f1-score   support

         0.0       0.91      0.74      0.81     18923
         1.0       0.37      0.67      0.47      4268

    accuracy                           0.72     23191
   macro avg       0.64      0.70      0.64     23191
weighted avg       0.81      0.72      0.75     23191

0.724957957828468
