In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from imblearn.combine import SMOTETomek
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import seaborn as sns


import warnings
warnings.filterwarnings('ignore')

### Datasets: default, SMOTE, SMOTETomek

In [None]:
# Load the prepared dataset and split it into target and features.
df = pd.read_csv('./../0.data/heart_2020_final.csv')
y = df['HeartDisease']
# Select the features by dropping the target by name instead of by column
# position, so the split stays correct if the CSV column order ever changes.
X = df.drop(columns=['HeartDisease']).values

# Dataset 2: SMOTE-resampled copy of the full data (seeded).
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Dataset 3: SMOTETomek-resampled copy of the full data. Seeded like the SMOTE
# run above so that Restart & Run All reproduces the same resampled set.
# sampling_strategy=0.5 is forwarded as-is; per the imbalanced-learn docs a
# float is the requested minority/majority ratio after resampling.
X_train_ns, y_train_ns = SMOTETomek(sampling_strategy=0.5, random_state=42).fit_resample(X, y)

# NOTE(review): both resampled sets are built from the *whole* dataset. When
# they are cross-validated afterwards, synthetic rows derived from held-out
# samples end up in the training folds, so scores on these sets are optimistic.

### Scaling for logistic regression

In [None]:
from sklearn.preprocessing import StandardScaler

# One StandardScaler instance is re-fitted for each dataset in turn, so after
# this cell `scaler` holds the statistics of the SMOTETomek data only.
scaler = StandardScaler()

# Standardized copies of the original, SMOTE and SMOTETomek feature matrices.
X_scaled = scaler.fit_transform(X)
X_SMOTE_scaled = scaler.fit_transform(X_smote)
X_SMOTETomek_scaled = scaler.fit_transform(X_train_ns)

In [None]:
def fold_K(X, y, model, sampler=None):
    """Cross-validate ``model`` over 5 shuffled folds and print the mean scores.

    For every fold the model is re-fitted on the training part and scored on
    both parts; the mean accuracy and recall over the folds are then printed.
    The folds come from a plain (non-stratified) ``KFold`` seeded with 42.

    Parameters
    ----------
    X : array-like, one row per sample
        Feature matrix. NumPy arrays and pandas objects are both accepted;
        rows are always selected by position.
    y : array-like
        Target labels aligned with ``X``. ``recall_score`` is called with its
        defaults, so presumably binary labels are expected -- confirm.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place once per fold.
    sampler : object with ``fit_resample(X, y)``, optional
        When given, it is applied to the training part of each fold only, so
        held-out rows never influence the resampled training set. The train
        scores are then computed on the resampled training data. Defaults to
        ``None`` (no resampling), which is the original behaviour.

    Returns
    -------
    model
        The same ``model`` object, left as fitted on the last fold.
    """
    def _rows(data, index):
        # pandas objects indexed with an integer array are looked up by label
        # (Series) or by column (DataFrame); .iloc forces positional access so
        # a non-default index cannot fail or silently select the wrong rows.
        return data.iloc[index] if hasattr(data, 'iloc') else data[index]

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    acc_test_score = []
    acc_train_score = []
    rec_test_score = []
    rec_train_score = []

    for train_index, test_index in kf.split(X):  # runs 5 times, once per fold
        X_train, X_test = _rows(X, train_index), _rows(X, test_index)
        y_train, y_test = _rows(y, train_index), _rows(y, test_index)

        if sampler is not None:
            # Resample after the split: the test fold stays untouched.
            X_train, y_train = sampler.fit_resample(X_train, y_train)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_train = model.predict(X_train)

        acc_train_score.append(accuracy_score(y_train, y_pred_train))
        acc_test_score.append(accuracy_score(y_test, y_pred))

        rec_test_score.append(recall_score(y_test, y_pred))
        rec_train_score.append(recall_score(y_train, y_pred_train))

    print('acc test score : {}'.format(np.array(acc_test_score).mean()))
    print('acc train score : {}'.format(np.array(acc_train_score).mean()))
    print('rec test score : {}'.format(np.array(rec_test_score).mean()))
    print('rec train score : {}'.format(np.array(rec_train_score).mean()))
    return model

### Logistic scaled

In [None]:
# Baseline: logistic regression on the standardized, un-resampled data.
lr = LogisticRegression(random_state=42)
fold_K(X=X_scaled, y=y, model=lr)

acc test score : 0.9112711513982413
acc train score : 0.9112951867137333
rec test score : 0.10474028109255204
rec train score : 0.10516703382524055


### Logistic scaled SMOTE

In [None]:
# Logistic regression on the standardized SMOTE-resampled data.
lr = LogisticRegression(random_state=42)
fold_K(X=X_SMOTE_scaled, y=y_smote, model=lr)

acc test score : 0.7705533859769023
acc train score : 0.7706335442077693
rec test score : 0.8022918329446729
rec train score : 0.802416248731795


### Logistic scaled SMOTETomek

In [None]:
# Logistic regression on the standardized SMOTETomek-resampled data.
lr = LogisticRegression(random_state=42)
fold_K(X=X_SMOTETomek_scaled, y=y_train_ns, model=lr)

acc test score : 0.7774134444563707
acc train score : 0.7775074604720674
rec test score : 0.6009803481808536
rec train score : 0.6010057783362077


### Decision Tree

In [None]:
# Decision tree on the original data. The tree is seeded so the printed scores
# are reproducible on re-run, like the seeded logistic-regression and
# random-forest models in this notebook.
dt = DecisionTreeClassifier(random_state=42)
fold_K(X, y, dt)

acc test score : 0.9969897288823768
acc train score : 0.8558748683717357
rec test score : 0.2514356272419051
rec train score : 0.9666817385415254


### Decision Tree SMOTE

In [None]:
# Decision tree on the SMOTE-resampled data. The tree is seeded so the printed
# scores are reproducible on re-run, like the seeded logistic-regression and
# random-forest models in this notebook.
dt = DecisionTreeClassifier(random_state=42)
fold_K(X_smote, y_smote, dt)

acc test score : 0.9093151534570472
acc train score : 0.9983453632079854
rec test score : 0.9133788400146011
rec train score : 0.9966907515699838


### Decision Tree SMOTETomek

In [None]:
# Decision tree on the SMOTETomek-resampled data. The tree is seeded so the
# printed scores are reproducible on re-run, like the seeded
# logistic-regression and random-forest models in this notebook.
dt = DecisionTreeClassifier(random_state=42)
fold_K(X_train_ns, y_train_ns, dt)

acc test score : 0.891845650288122
acc train score : 0.9977822213821248
rec test score : 0.8443629215055711
rec train score : 0.9933067138214957


### Random Forest

In [None]:
# Build, train and evaluate a RandomForestClassifier on the original data.
# fold_K hands back the estimator it was given, so rf_clf is the fitted forest.
rf_clf = fold_K(X, y, RandomForestClassifier(random_state=42))

acc test score : 0.8972348194476532
acc train score : 0.9969275845516437
rec test score : 0.12037197208292709
rec train score : 0.9706725677719585


### Random Forest SMOTE

In [None]:
# Random forest on the SMOTE-resampled data.
# fold_K hands back the estimator it was given, so rf_clf is the fitted forest.
rf_clf = fold_K(X_smote, y_smote, RandomForestClassifier(random_state=42))

acc test score : 0.9364306104565194
acc train score : 0.9983230463345002
rec test score : 0.9143439939170053
rec train score : 0.9982930110442126


### Random Forest SMOTETomek

In [None]:
# Random forest on the SMOTETomek-resampled data.
rf_clf = RandomForestClassifier(random_state=42)
fold_K(X=X_train_ns, y=y_train_ns, model=rf_clf)

acc test score : 0.927369439118576
acc train score : 0.9977551827990023
rec test score : 0.8399052625502913
rec train score : 0.9957585267485621
