# 6. Capstone Project - Hyperparameter Optimization; Decision Trees

#### Loading data and libralies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as stats

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [3]:
# Original data set
original_df = pd.read_csv("./capstone_clean_heart_disease_fe.csv")

# Train set : Under sampled data set and Over sampled data set
under_sampled_df = pd.read_csv("./under_sampled_df_fe.csv")
over_sampled_df = pd.read_csv("./over_sampled_df_fe.csv")
# Test set : Under sampled data set and Over sampled data set
test_sampled_df = pd.read_csv("./test_sampled_df_fe.csv")
               
# Train set : SMOTE data set         
smote_df = pd.read_csv("./smote_df.csv")
# Test set : SMOTE data set  
test_smote_df = pd.read_csv("./test_smote_df.csv")

Split each dataset into train set and test set

In [4]:
# Split the original data set into train and test set
X = original_df.drop(columns="HeartDisease")
y = original_df["HeartDisease"]

from sklearn.model_selection import train_test_split

X_train_original, X_test_original, y_train_original, y_test_original = train_test_split(X, y, test_size = 0.3, random_state=42)

In [5]:
# Split other data set into X and y
X_train_under = under_sampled_df.drop(columns="HeartDisease")
y_train_under = under_sampled_df["HeartDisease"]

X_train_over = over_sampled_df.drop(columns="HeartDisease")
y_train_over = over_sampled_df["HeartDisease"]

X_test_sampled = test_sampled_df.drop(columns="HeartDisease")
y_test_sampled = test_sampled_df["HeartDisease"]

X_train_smote = smote_df.drop(columns="HeartDisease")
y_train_smote = smote_df["HeartDisease"]

X_test_smote = test_smote_df.drop(columns="HeartDisease")
y_test_smote = test_smote_df["HeartDisease"]

In [6]:
X_train = [X_train_original, X_train_under, X_train_over, X_train_smote]
X_test = [X_test_original, X_test_sampled, X_test_sampled, X_test_smote]
y_train = [y_train_original, y_train_under, y_train_over,y_train_smote]
y_test = [y_test_original, y_test_sampled, y_test_sampled, y_test_smote]

train_score = []
test_score = []

for index in range(4):
    
    scaler = StandardScaler()
    scaler.fit(X_train[index])
    X_scaled_train = scaler.transform(X_train[index])
    X_scaled_test = scaler.transform(X_test[index])
    
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_scaled_train,y_train[index])
    
    train_score.append(model.score(X_scaled_train,y_train[index])*100)
    test_score.append(model.score(X_scaled_test,y_test[index])*100)

In [7]:
before_selection = pd.DataFrame((train_score, test_score), columns=["Original", "Under Sampled", "Over Sampled", "SMOTE"], 
                                index=["Train score", "Test score"])
before_selection

Unnamed: 0,Original,Under Sampled,Over Sampled,SMOTE
Train score,91.452915,75.78702,75.675365,80.968618
Test score,91.389649,73.322747,73.284094,77.891185


**Festure selection: RFE**  
The columns that I'll use for modeling

In [8]:
# RFE : Original data final

from sklearn.feature_selection import RFE

scaler = StandardScaler()
scaler.fit(X_train_original)
X_scaled_train = scaler.transform(X_train_original)
X_scaled_test = scaler.transform(X_test_original)

rfe_original = RFE(estimator=LogisticRegression(max_iter=1000, random_state=42), n_features_to_select=7)
rfe_original.fit(X_scaled_train,y_train_original)
    
train_score_or_rfe = rfe_original.score(X_scaled_train,y_train_original)
test_score_or_rfe = rfe_original.score(X_scaled_test,y_test_original)

In [9]:
X_train_original.loc[:, rfe_original.support_]

Unnamed: 0,BMI,Smoking,DiffWalking,Sex,AgeCategory,GenHealth,Asthma
176993,27.12,1,0,1,35,3,0
267593,30.23,0,0,0,18,2,0
175707,32.55,0,0,1,60,3,0
317039,33.07,0,0,0,55,3,0
262926,23.06,0,0,1,30,4,0
...,...,...,...,...,...,...,...
119879,26.62,0,0,0,40,4,0
259178,25.04,0,0,1,25,4,0
131932,36.05,0,0,0,18,1,0
146867,27.44,0,0,0,40,3,0


In [10]:
# RFE : Under sampled data final

scaler = StandardScaler()
scaler.fit(X_train_under)
X_scaled_train = scaler.transform(X_train_under)
X_scaled_test = scaler.transform(X_test_sampled)
   
rfe_under = RFE(estimator=LogisticRegression(max_iter=1000, random_state=42), n_features_to_select=9)
rfe_under.fit(X_scaled_train,y_train_under)
    
train_score_u_rfe = rfe_under.score(X_scaled_train,y_train_under)
test_score_u_rfe = rfe_under.score(X_scaled_test,y_test_sampled)

In [11]:
X_train_under.loc[:, rfe_under.support_]

Unnamed: 0,Smoking,AlcoholDrinking,PhysicalHealth,DiffWalking,Sex,AgeCategory,GenHealth,Asthma,Race_Asian
0,1,0,0.0,0,1,65,1,0,0
1,0,0,5.0,0,0,80,0,0,0
2,0,0,0.0,0,0,40,4,0,0
3,0,0,7.0,1,0,60,2,0,0
4,1,0,0.0,0,1,70,3,0,0
...,...,...,...,...,...,...,...,...,...
38177,0,0,0.0,0,1,75,3,0,0
38178,0,0,30.0,1,0,50,0,0,0
38179,0,0,0.0,0,0,75,2,0,0
38180,1,0,15.0,0,1,70,3,0,0


In [12]:
# RFE : Over sampled data final

scaler = StandardScaler()
scaler.fit(X_train_over)
X_scaled_train = scaler.transform(X_train_over)
X_scaled_test = scaler.transform(X_test_sampled)
   
rfe_over = RFE(estimator=LogisticRegression(max_iter=1000, random_state=42), n_features_to_select=11)
rfe_over.fit(X_scaled_train,y_train_over)
    
train_score_ov_rfe = rfe_over.score(X_scaled_train,y_train_over)
test_score_ov_rfe = rfe_over.score(X_scaled_test,y_test_sampled)

In [13]:
X_train_over.loc[:, rfe_over.support_]

Unnamed: 0,BMI,Smoking,AlcoholDrinking,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,GenHealth,Asthma,Race_Asian
0,27.12,1,0,0.0,2.0,0,1,35,3,0,0
1,30.23,0,0,0.0,0.0,0,0,18,2,0,0
2,32.55,0,0,0.0,0.0,0,1,60,3,0,0
3,33.07,0,0,0.0,0.0,0,0,55,3,0,0
4,23.06,0,0,0.0,0.0,0,1,30,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...
408515,42.87,0,0,30.0,0.0,1,0,60,2,0,0
408516,24.41,0,0,0.0,0.0,0,1,65,2,0,0
408517,33.00,0,0,0.0,0.0,0,1,60,1,0,0
408518,31.32,1,0,3.0,2.0,0,0,65,2,1,0


In [14]:
# RFE : SMOTE data final

scaler = StandardScaler()
scaler.fit(X_train_smote)
X_scaled_train = scaler.transform(X_train_smote)
X_scaled_test = scaler.transform(X_test_smote)
    
rfe_smote = RFE(estimator=LogisticRegression(max_iter=1500, random_state=42), n_features_to_select=10)
rfe_smote.fit(X_scaled_train,y_train_smote)
    
train_score_s_rfe = rfe_smote.score(X_scaled_train,y_train_smote)
test_score_s_rfe = rfe_smote.score(X_scaled_test,y_test_smote)

In [15]:
X_train_smote.loc[:, rfe_smote.support_]

Unnamed: 0,AlcoholDrinking,AgeCategory,GenHealth,Asthma,Race_American Indian/Alaskan Native,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White
0,0,35,3,0,0,0,0,0,0,1
1,0,18,2,0,0,0,0,1,0,0
2,0,60,3,0,0,0,0,0,0,1
3,0,55,3,0,0,0,0,1,0,0
4,0,30,4,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
408515,0,80,1,0,0,0,0,0,0,1
408516,0,65,3,0,0,0,0,0,0,1
408517,0,70,2,0,0,0,0,0,0,0
408518,0,75,2,0,0,0,0,0,0,1


### Optimizing Hyperparameters: Decision Trees

### Baseline Model

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA

In [57]:
def D_Tree(X_train, y_train, X_test, y_test):
    
    DT_model = DecisionTreeClassifier()
    DT_model.fit(X_train, y_train)
    
    train_score = DT_model.score(X_train, y_train)
    test_score = DT_model.score(X_test, y_test)
    
    # Evaluation(precision & Recall)
    y_pred = DT_model.predict(X_test)

    report_initial = classification_report(y_test, y_pred)
    f1_micro = f1_score(y_test, y_pred, average='micro')
    
    return print(f"Train score: {train_score}\nTest score: {test_score}\n{report_initial}\nf1 micro: {f1_micro}")

In [58]:
def D_Tree_w_normalize(X_train, y_train, X_test, y_test):
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_scaled_train = scaler.transform(X_train)
    X_scaled_test = scaler.transform(X_test)
    
    DT_model = DecisionTreeClassifier()
    DT_model.fit(X_scaled_train, y_train)
    
    train_score = DT_model.score(X_scaled_train, y_train)
    test_score = DT_model.score(X_scaled_test, y_test)
    
    # Evaluation(precision & Recall)
    y_pred = DT_model.predict(X_scaled_test)

    report_initial = classification_report(y_test, y_pred)
    f1_micro = f1_score(y_test, y_pred, average='micro')
    
    return print(f"Train score: {train_score}\nTest score: {test_score}\n{report_initial}\nf1 micro: {f1_micro}")

In [59]:
def D_Tree_w_pca_normalize(X_train, y_train, X_test, y_test):
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_scaled_train = scaler.transform(X_train)
    X_scaled_test = scaler.transform(X_test)
    
    my_PCA = PCA()
    my_PCA.fit(X_scaled_train)

    X_train_PCA = my_PCA.transform(X_scaled_train)
    X_test_PCA = my_PCA.transform(X_scaled_test)   
    
    DT_model = DecisionTreeClassifier()
    DT_model.fit(X_train_PCA, y_train)
    
    train_score = DT_model.score(X_train_PCA, y_train)
    test_score = DT_model.score(X_test_PCA, y_test)
    
    # Evaluation(precision & Recall)
    y_pred = DT_model.predict(X_scaled_test)

    report_initial = classification_report(y_test, y_pred)
    f1_micro = f1_score(y_test, y_pred, average='micro')
    
    return print(f"Train score: {train_score}\nTest score: {test_score}\n{report_initial}\nf1 micro: {f1_micro}")

**Original data**

In [60]:
print("Before normalization and reducing dimention\n")

print("Original dataset")
D_Tree(X_train_original.loc[:, rfe_original.support_], y_train_original, \
            X_test_original.loc[:, rfe_original.support_], y_test_original)

Before normalization and reducing dimention

Original dataset
Train score: 0.962641761174116
Test score: 0.8861703683583712
              precision    recall  f1-score   support

           0       0.92      0.95      0.94     87544
           1       0.25      0.17      0.20      8178

    accuracy                           0.89     95722
   macro avg       0.59      0.56      0.57     95722
weighted avg       0.87      0.89      0.88     95722

f1 micro: 0.8861703683583712


In [61]:
print("Only normalization\n")

print("Original dataset")
D_Tree_w_normalize(X_train_original.loc[:, rfe_original.support_], y_train_original, \
            X_test_original.loc[:, rfe_original.support_], y_test_original)

Only normalization

Original dataset
Train score: 0.962641761174116
Test score: 0.8862226029543887
              precision    recall  f1-score   support

           0       0.92      0.95      0.94     87544
           1       0.25      0.17      0.20      8178

    accuracy                           0.89     95722
   macro avg       0.59      0.56      0.57     95722
weighted avg       0.87      0.89      0.88     95722

f1 micro: 0.8862226029543887


In [62]:
print("Normalization and reducing dimention\n")

print("Original dataset")
D_Tree_w_pca_normalize(X_train_original.loc[:, rfe_original.support_], y_train_original, \
            X_test_original.loc[:, rfe_original.support_], y_test_original)

Normalization and reducing dimention

Original dataset
Train score: 0.962641761174116
Test score: 0.8869329934602286
              precision    recall  f1-score   support

           0       0.93      0.83      0.88     87544
           1       0.14      0.30      0.19      8178

    accuracy                           0.79     95722
   macro avg       0.54      0.57      0.54     95722
weighted avg       0.86      0.79      0.82     95722

f1 micro: 0.7874678757234491


The accuracy scores doesn't really change with normalization/dimention reduce. The scores of precision, recall, and f1 score became higher/lower on each when PCA was added, but the f1 micro score became very lower. Thefore, I'll optimize this dataset with no PCA.

**Under Sampled data**

In [63]:
print("Before normalization and reducing dimention\n")

print("Under sampled dataset")
D_Tree(X_train_under.loc[:, rfe_under.support_], y_train_under, \
        X_test_sampled.loc[:, rfe_under.support_], y_test_sampled)

Before normalization and reducing dimention

Under sampled dataset
Train score: 0.8012152323084176
Test score: 0.7147468711476985
              precision    recall  f1-score   support

           0       0.97      0.71      0.82     87544
           1       0.19      0.74      0.31      8178

    accuracy                           0.71     95722
   macro avg       0.58      0.73      0.56     95722
weighted avg       0.90      0.71      0.78     95722

f1 micro: 0.7147468711476985


In [64]:
print("Only normalization\n")

print("Under sampled dataset")
D_Tree_w_normalize(X_train_under.loc[:, rfe_under.support_], y_train_under, \
        X_test_sampled.loc[:, rfe_under.support_], y_test_sampled)

Only normalization

Under sampled dataset
Train score: 0.8012152323084176
Test score: 0.7147259773092914
              precision    recall  f1-score   support

           0       0.97      0.71      0.82     87544
           1       0.19      0.74      0.31      8178

    accuracy                           0.71     95722
   macro avg       0.58      0.73      0.56     95722
weighted avg       0.90      0.71      0.78     95722

f1 micro: 0.7147259773092913


In [65]:
print("Normalization and reducing dimention\n")

print("Under sampled dataset")
D_Tree_w_pca_normalize(X_train_under.loc[:, rfe_under.support_], y_train_under, \
        X_test_sampled.loc[:, rfe_under.support_], y_test_sampled)

Normalization and reducing dimention

Under sampled dataset
Train score: 0.8012152323084176
Test score: 0.7163870374626523
              precision    recall  f1-score   support

           0       0.90      0.44      0.60     87544
           1       0.08      0.49      0.13      8178

    accuracy                           0.45     95722
   macro avg       0.49      0.47      0.36     95722
weighted avg       0.83      0.45      0.56     95722

f1 micro: 0.4481832807505067


The test score increased when it's with PCA. However, all scores of precision, recall, and f1 became lower. Therefore, I'll optimize this dataset with no PCA.

**Over Sampled data**

In [66]:
print("Before normalization and reducing dimention\n")

print("Over sampled dataset")
D_Tree(X_train_over.loc[:, rfe_over.support_], y_train_over, \
        X_test_sampled.loc[:, rfe_over.support_], y_test_sampled)

Before normalization and reducing dimention

Over sampled dataset
Train score: 0.9753108782923725
Test score: 0.837926495476484
              precision    recall  f1-score   support

           0       0.93      0.89      0.91     87544
           1       0.19      0.27      0.22      8178

    accuracy                           0.84     95722
   macro avg       0.56      0.58      0.56     95722
weighted avg       0.87      0.84      0.85     95722

f1 micro: 0.837926495476484


In [67]:
print("Only normalization\n")

print("Over sampled dataset")
D_Tree_w_normalize(X_train_over.loc[:, rfe_over.support_], y_train_over, \
        X_test_sampled.loc[:, rfe_over.support_], y_test_sampled)

Only normalization

Over sampled dataset
Train score: 0.9753108782923725
Test score: 0.8374668310315287
              precision    recall  f1-score   support

           0       0.93      0.89      0.91     87544
           1       0.19      0.27      0.22      8178

    accuracy                           0.84     95722
   macro avg       0.56      0.58      0.56     95722
weighted avg       0.87      0.84      0.85     95722

f1 micro: 0.8374668310315287


In [68]:
print("Normalization and reducing dimention\n")

print("Over sampled dataset")
D_Tree_w_pca_normalize(X_train_over.loc[:, rfe_over.support_], y_train_over, \
        X_test_sampled.loc[:, rfe_over.support_], y_test_sampled)

Normalization and reducing dimention

Over sampled dataset
Train score: 0.9753108782923725
Test score: 0.8399740916403753
              precision    recall  f1-score   support

           0       0.91      0.79      0.84     87544
           1       0.06      0.16      0.09      8178

    accuracy                           0.73     95722
   macro avg       0.49      0.47      0.47     95722
weighted avg       0.84      0.73      0.78     95722

f1 micro: 0.7329245105618353


The test score increased when it's with PCA. However, all scores of precision, recall, and f1 became lower. Therefore, I'll optimize this dataset with no PCA.

**SMOTE data**

In [69]:
print("Before normalization and reducing dimention\n")

print("SMOTE dataset")
D_Tree(X_train_smote.loc[:, rfe_smote.support_], y_train_smote, \
        X_test_smote.loc[:, rfe_smote.support_], y_test_smote)

Before normalization and reducing dimention

SMOTE dataset
Train score: 0.8059678840693234
Test score: 0.753400472200748
              precision    recall  f1-score   support

           0       0.96      0.76      0.85     87544
           1       0.20      0.64      0.31      8178

    accuracy                           0.75     95722
   macro avg       0.58      0.70      0.58     95722
weighted avg       0.89      0.75      0.80     95722

f1 micro: 0.753400472200748


In [70]:
print("Only normalization\n")

print("SMOTE dataset")
D_Tree_w_normalize(X_train_smote.loc[:, rfe_smote.support_], y_train_smote, \
        X_test_smote.loc[:, rfe_smote.support_], y_test_smote)

Only normalization

SMOTE dataset
Train score: 0.8059678840693234
Test score: 0.753400472200748
              precision    recall  f1-score   support

           0       0.96      0.76      0.85     87544
           1       0.20      0.64      0.31      8178

    accuracy                           0.75     95722
   macro avg       0.58      0.70      0.58     95722
weighted avg       0.89      0.75      0.80     95722

f1 micro: 0.753400472200748


In [71]:
print("Normalization and reducing dimention\n")

print("SMOTE dataset")
D_Tree_w_pca_normalize(X_train_smote.loc[:, rfe_smote.support_], y_train_smote, \
        X_test_smote.loc[:, rfe_smote.support_], y_test_smote)

Normalization and reducing dimention

SMOTE dataset
Train score: 0.8059678840693234
Test score: 0.7534422598775621
              precision    recall  f1-score   support

           0       0.91      0.35      0.50     87544
           1       0.08      0.63      0.15      8178

    accuracy                           0.37     95722
   macro avg       0.50      0.49      0.32     95722
weighted avg       0.84      0.37      0.47     95722

f1 micro: 0.37025971041139966


The test scores doesn't really change with normalization/dimention reduce. However, all scores of precision, recall, and f1 became lower. Therefore, I'll optimize this dataset with no PCA.

### GridSearchCV

In [85]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

def DT_gridsearch(depth, split, leaf, X_train, y_train, X_test, y_test):
    DT_param = {
            'max_depth': depth,
            'min_samples_split': split,
            'min_samples_leaf': leaf,
            'criterion': ["gini", "entropy", "log_loss"],
            'splitter': ["best", "random"],
            'random_state': [42]
        }

    scaler = StandardScaler()
    scaler.fit(X_train)
    X_scaled_train = scaler.transform(X_train)
    X_scaled_test = scaler.transform(X_test)
    

    # Create Randomized Search
    clf = GridSearchCV(DecisionTreeClassifier(), DT_param, cv=5, scoring="f1")
    # Fit the model
    clf.fit(X_scaled_train, y_train)
    
    # prediction and evaluation
    y_pred = clf.predict(X_scaled_test)
    score = f1_score(y_test, y_pred)

    
    # Result
    best_clf = clf.best_estimator_
    print('Hyperparameter :\n', best_clf)
    print('Train score:\n', best_clf.score(X_scaled_train, y_train))
    print('Test score:\n', best_clf.score(X_scaled_test, y_test))
    print(f"Best F1 Score: {score}")

Each datasets have too many rows, so use sample rows from each datasets for GridSearch. Thefore, I'll take samples and do GridSearch.

In [107]:
# Take samples for GridSearch

X_GS_original, X_rest_original, y_GS_original, y_rest_original = \
                train_test_split(X_train_original.loc[:, rfe_original.support_], y_train_original, train_size = 0.05, random_state=42)

X_GS_under, X_rest_under, y_GS_under, y_rest_under = \
                train_test_split(X_train_under.loc[:, rfe_under.support_], y_train_under, train_size = 0.3, random_state=42)

X_GS_over, X_rest_over, y_GS_over, y_rest_over = \
                train_test_split(X_train_over.loc[:, rfe_over.support_], y_train_over, train_size = 0.02, random_state=42)

X_GS_smote, X_rest_smote, y_GS_smote, y_rest_smote = \
                train_test_split(X_train_smote.loc[:, rfe_smote.support_], y_train_smote, train_size = 0.06, random_state=42)

**Original data**

train size = 0.05

In [108]:
%%time

depth = range(1, 41, 2)
split = range(2, 41, 2)
leaf =  range(1, 41, 2)

DT_gridsearch(depth, split, leaf, X_GS_original, y_GS_original, \
        X_test_original.loc[:, rfe_original.support_], y_test_original)

Hyperparameter :
 DecisionTreeClassifier(criterion='entropy', max_depth=27, min_samples_split=6,
                       random_state=42)
Train score:
 0.9648070206859497
Test score:
 0.8777397045611249
Best F1 Score: 0.22078700312936944
CPU times: total: 32min 29s
Wall time: 33min 12s


train size = full  
The best hyperparameters are **max_depth=27, min_samples_split=6**. Therefore, I'll use the numbers close to the results.

In [114]:
%%time

depth = range(21, 31, 2)
split = range(2, 11, 2)
leaf =  range(1, 11, 2)

DT_gridsearch(depth, split, leaf, X_train_original.loc[:, rfe_original.support_], y_train_original, \
        X_test_original.loc[:, rfe_original.support_], y_test_original)

Hyperparameter :
 DecisionTreeClassifier(criterion='entropy', max_depth=29, random_state=42,
                       splitter='random')
Train score:
 0.9622208989438149
Test score:
 0.8876015962892543
Best F1 Score: 0.19894274439728984
CPU times: total: 13min 18s
Wall time: 13min 55s


The hyperparameter of max_depth was 29, but that is the highest range which I set, so the best max_depth might be higher. Therefore, I'll set the range of max_depth higher than above and do GridSearch.

In [122]:
%%time

depth = range(29, 41, 2)
split = range(2, 11, 2)
leaf =  range(1, 11, 2)

DT_gridsearch(depth, split, leaf, X_train_original.loc[:, rfe_original.support_], y_train_original, \
        X_test_original.loc[:, rfe_original.support_], y_test_original)

Hyperparameter :
 DecisionTreeClassifier(criterion='entropy', max_depth=31, random_state=42,
                       splitter='random')
Train score:
 0.9626014658541936
Test score:
 0.8876329370468649
Best F1 Score: 0.19982145514060407
CPU times: total: 16min 15s
Wall time: 16min 58s


The hyperparameter of max_depth is within the range, and the f1 score is improved. Therefore, I'll use those hyperparameters for evaluating the model.

In [133]:
scaler = StandardScaler()
scaler.fit(X_train_original.loc[:, rfe_original.support_])
X_scaled_train = scaler.transform(X_train_original.loc[:, rfe_original.support_])
X_scaled_test = scaler.transform(X_test_original.loc[:, rfe_original.support_])
    
DT_model = DecisionTreeClassifier(criterion='entropy', max_depth=31, random_state=42, splitter='random')

DT_model.fit(X_scaled_train, y_train_original)
    
train_score_original = DT_model.score(X_scaled_train, y_train_original)
test_score_original = DT_model.score(X_scaled_test, y_test_original)
    
    # Evaluation(precision & Recall)
y_pred = DT_model.predict(X_scaled_test)

report_initial_original = classification_report(y_test_original, y_pred)

print("After optimization\n")
    
print(f"Train score: {train_score_original}\nTest score: {test_score_original}\n{report_initial_original}")

After optimization

Train score: 0.9626014658541936
Test score: 0.8876329370468649
              precision    recall  f1-score   support

           0       0.92      0.96      0.94     87544
           1       0.26      0.16      0.20      8178

    accuracy                           0.89     95722
   macro avg       0.59      0.56      0.57     95722
weighted avg       0.87      0.89      0.88     95722



In [134]:
print("Before optimization\n")

D_Tree_w_normalize(X_train_original.loc[:, rfe_original.support_], y_train_original, \
            X_test_original.loc[:, rfe_original.support_], y_test_original)

Before optimization

Train score: 0.962641761174116
Test score: 0.886327072146424
              precision    recall  f1-score   support

           0       0.92      0.95      0.94     87544
           1       0.25      0.17      0.20      8178

    accuracy                           0.89     95722
   macro avg       0.59      0.56      0.57     95722
weighted avg       0.87      0.89      0.88     95722

f1 micro: 0.886327072146424


The test score, recall for 0, and precision for 1 improved little bit. However, the recall for 1 became slightly lower than before optimization. Also, both recall scores for 1 of before/after optimization are lower than the other modeling methods/datasets. Therefore, this models cannot be a final model.

**Under sampled data**

train size = 0.3

In [109]:
%%time

depth = range(1, 41, 2)
split = range(2, 41, 2)
leaf =  range(1, 41, 2)

DT_gridsearch(depth, split, leaf, X_GS_under, y_GS_under, \
        X_test_sampled.loc[:, rfe_under.support_], y_test_sampled)

Hyperparameter :
 DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_split=28,
                       random_state=42, splitter='random')
Train score:
 0.751527850532565
Test score:
 0.7662292889826791
Best F1 Score: 0.3414849474706454
CPU times: total: 27min 42s
Wall time: 28min 58s


train size =full  
The best hyperparameters are **max_depth=7, min_samples_split=28**. Therefore, I'll use the numbers close to the results.

In [123]:
%%time

depth = range(1, 11, 2)
split = range(20, 31, 2)
leaf =  range(1, 11, 2)

DT_gridsearch(depth, split, leaf, X_train_under.loc[:, rfe_under.support_], y_train_under, \
        X_test_sampled.loc[:, rfe_under.support_], y_test_sampled)

Hyperparameter :
 DecisionTreeClassifier(criterion='entropy', max_depth=9, min_samples_leaf=9,
                       min_samples_split=30, random_state=42,
                       splitter='random')
Train score:
 0.7625320831805563
Test score:
 0.7185286558993753
Best F1 Score: 0.32536244585221724
CPU times: total: 57.6 s
Wall time: 1min 2s


All hyperparameters of max_depth, sample_split, and sample_leaf are on edge of the range. So all the best parameters might be higher. Therefore, I'll set the range of all the parameters higher than above and do GridSearch.

In [126]:
%%time

depth = range(1, 16, 2)
split = range(20, 35, 2)
leaf =  range(1, 16, 2)

DT_gridsearch(depth, split, leaf, X_train_under.loc[:, rfe_under.support_], y_train_under, \
        X_test_sampled.loc[:, rfe_under.support_], y_test_sampled)

Hyperparameter :
 DecisionTreeClassifier(criterion='entropy', max_depth=9, min_samples_leaf=9,
                       min_samples_split=30, random_state=42,
                       splitter='random')
Train score:
 0.7625320831805563
Test score:
 0.7185286558993753
Best F1 Score: 0.32536244585221724
CPU times: total: 4min 3s
Wall time: 4min 22s


Now, the all hyperparameters are within the range. Therefore, I'll use those hyperparameters for evaluating the model.

In [136]:
scaler = StandardScaler()
scaler.fit(X_train_under.loc[:, rfe_under.support_])
X_scaled_train = scaler.transform(X_train_under.loc[:, rfe_under.support_])
X_scaled_test = scaler.transform(X_test_sampled.loc[:, rfe_under.support_])
    
DT_model = DecisionTreeClassifier(criterion='entropy', max_depth=9, min_samples_leaf=9,
                       min_samples_split=30, random_state=42, splitter='random')

DT_model.fit(X_scaled_train, y_train_under)
    
train_score_under = DT_model.score(X_scaled_train, y_train_under)
test_score_under = DT_model.score(X_scaled_test, y_test_sampled)
    
    # Evaluation(precision & Recall)
y_pred = DT_model.predict(X_scaled_test)

report_initial_under = classification_report(y_test_sampled, y_pred)

print("After optimization\n")
    
print(f"Train score: {train_score_under}\nTest score: {test_score_under}\n{report_initial_under}")

After optimization

Train score: 0.7625320831805563
Test score: 0.7185286558993753
              precision    recall  f1-score   support

           0       0.97      0.71      0.82     87544
           1       0.20      0.79      0.33      8178

    accuracy                           0.72     95722
   macro avg       0.59      0.75      0.57     95722
weighted avg       0.91      0.72      0.78     95722



In [138]:
print("Before optimization\n")

D_Tree_w_normalize(X_train_under.loc[:, rfe_under.support_], y_train_under, \
        X_test_sampled.loc[:, rfe_under.support_], y_test_sampled)

Before optimization

Train score: 0.8012152323084176
Test score: 0.7148304465013268
              precision    recall  f1-score   support

           0       0.97      0.71      0.82     87544
           1       0.19      0.74      0.31      8178

    accuracy                           0.71     95722
   macro avg       0.58      0.73      0.56     95722
weighted avg       0.90      0.71      0.78     95722

f1 micro: 0.7148304465013269


The test score, precision for 1, and recall for 1 improved by optimizing hyperparameters. Also, the recall score for 1 is as high as the other chosen machine learning models. Therefore, I'll compare this model to other datasets models with dicision tree.

**Over sampled data**

train size = 0.4

In [110]:
%%time

depth = range(1, 41, 2)
split = range(2, 41, 2)
leaf =  range(1, 41, 2)

DT_gridsearch(depth, split, leaf, X_GS_over, y_GS_over, \
        X_test_sampled.loc[:, rfe_over.support_], y_test_sampled)

Hyperparameter :
 DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=27,
                       random_state=42, splitter='random')
Train score:
 0.7510403916768665
Test score:
 0.6977392866843568
Best F1 Score: 0.3131632047477745
CPU times: total: 35min 26s
Wall time: 37min 30s


train size = full  
The best hyperparameters are **max_depth=7, min_samples_leaf=27**. Therefore, I'll use the numbers close to the results.

In [116]:
%%time

depth = range(1, 11, 2)
split = range(2, 11, 2)
leaf =  range(21, 31, 2)

DT_gridsearch(depth, split, leaf, X_train_over.loc[:, rfe_over.support_], y_train_over, \
        X_test_sampled.loc[:, rfe_over.support_], y_test_sampled)

Hyperparameter :
 DecisionTreeClassifier(max_depth=9, min_samples_leaf=23, random_state=42)
Train score:
 0.7661509840399491
Test score:
 0.7124903365997367
Best F1 Score: 0.32142416845427424
CPU times: total: 14min 13s
Wall time: 14min 31s


The hyperparameter of max_depth was 9, but that is the highest range which I set, so the best max_depth might be higher. Therefore, I'll set the range of max_depth higher than above and do GridSearch.

In [124]:
%%time

depth = range(7, 20, 2)
split = range(2, 11, 2)
leaf =  range(21, 31, 2)

DT_gridsearch(depth, split, leaf, X_train_over.loc[:, rfe_over.support_], y_train_over, \
        X_test_sampled.loc[:, rfe_over.support_], y_test_sampled)

Hyperparameter :
 DecisionTreeClassifier(max_depth=19, min_samples_leaf=21, random_state=42)
Train score:
 0.8293522960932145
Test score:
 0.7494933244186289
Best F1 Score: 0.3006387260477732
CPU times: total: 36min 1s
Wall time: 37min 13s


The hyperparameters of max_depth and min_samples_leaf are on edge of the range. So I'll set the range of those parameters higher/lower than above and try GridSearch again.

In [141]:
%%time

depth = range(7, 26, 2)
split = range(2, 11, 2)
leaf =  range(15, 31, 2)

DT_gridsearch(depth, split, leaf, X_train_over.loc[:, rfe_over.support_], y_train_over, \
        X_test_sampled.loc[:, rfe_over.support_], y_test_sampled)

Hyperparameter :
 DecisionTreeClassifier(max_depth=25, min_samples_leaf=15, random_state=42)
Train score:
 0.8592578086752178
Test score:
 0.7646100165061324
Best F1 Score: 0.2905094779268216
CPU times: total: 1h 26min 55s
Wall time: 1h 31min 20s


The hyperparameters of max_depth and min_samples_leaf are on edge of the range again. So I'll set the range of those parameters higher/lower than above and try GridSearch again.

In [142]:
%%time

depth = range(21, 32, 2)
split = range(2, 11, 2)
leaf =  range(9, 19, 2)

DT_gridsearch(depth, split, leaf, X_train_over.loc[:, rfe_over.support_], y_train_over, \
        X_test_sampled.loc[:, rfe_over.support_], y_test_sampled)

Hyperparameter :
 DecisionTreeClassifier(criterion='entropy', max_depth=31, min_samples_leaf=9,
                       random_state=42)
Train score:
 0.8983574855576226
Test score:
 0.7821712876872611
Best F1 Score: 0.28280535204485263
CPU times: total: 41min 56s
Wall time: 43min 16s


The test score is getting better on each gridsearch. However, the f1 score is getting lower. Therefore, I'll check the models with (max_depth=9, min_samples_leaf=23, random_state=42) which has the highest f1 score and (criterion='entropy', max_depth=31, min_samples_leaf=9, random_state=42) which has the highest test score so far.

In [154]:
## Model 1 ##
# highest f1 score

scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_train = scaler.transform(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test_sampled.loc[:, rfe_over.support_])
    
DT_model = DecisionTreeClassifier(max_depth=9, min_samples_leaf=23, random_state=42)

DT_model.fit(X_scaled_train, y_train_over)
    
train_score_over = DT_model.score(X_scaled_train, y_train_over)
test_score_over = DT_model.score(X_scaled_test, y_test_sampled)
    
    # Evaluation(precision & Recall)
y_pred = DT_model.predict(X_scaled_test)

report_initial_over = classification_report(y_test_sampled, y_pred)
    
print(f"Train score: {train_score_over}\nTest score: {test_score_over}\n{report_initial_over}")

Train score: 0.7661509840399491
Test score: 0.7124903365997367
              precision    recall  f1-score   support

           0       0.97      0.70      0.82     87544
           1       0.20      0.80      0.32      8178

    accuracy                           0.71     95722
   macro avg       0.59      0.75      0.57     95722
weighted avg       0.91      0.71      0.78     95722



In [157]:
## Model 2 ##
# highest test score

scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_train = scaler.transform(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test_sampled.loc[:, rfe_over.support_])
    
DT_model = DecisionTreeClassifier(criterion='entropy', max_depth=31, min_samples_leaf=9, random_state=42)

DT_model.fit(X_scaled_train, y_train_over)
    
train_score_over = DT_model.score(X_scaled_train, y_train_over)
test_score_over = DT_model.score(X_scaled_test, y_test_sampled)
    
    # Evaluation(precision & Recall)
y_pred = DT_model.predict(X_scaled_test)

report_initial_over = classification_report(y_test_sampled, y_pred)
    
print(f"Train score: {train_score_over}\nTest score: {test_score_over}\n{report_initial_over}")

Train score: 0.8983574855576226
Test score: 0.7821712876872611
              precision    recall  f1-score   support

           0       0.95      0.81      0.87     87544
           1       0.20      0.50      0.28      8178

    accuracy                           0.78     95722
   macro avg       0.57      0.66      0.58     95722
weighted avg       0.88      0.78      0.82     95722



Model 2 has much higher test score. However, the recall score for 1 is only 0.5, which means only 50% of person who have a heart disease are predicted correctly. On the other hand, Model 1 has 0.8 of recall score for 1. Therefore, I'll evaluate Model 1.

In [159]:
scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_train = scaler.transform(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test_sampled.loc[:, rfe_over.support_])
    
DT_model = DecisionTreeClassifier(max_depth=9, min_samples_leaf=23, random_state=42)

DT_model.fit(X_scaled_train, y_train_over)
    
train_score_over = DT_model.score(X_scaled_train, y_train_over)
test_score_over = DT_model.score(X_scaled_test, y_test_sampled)
    
    # Evaluation(precision & Recall)
y_pred = DT_model.predict(X_scaled_test)

report_initial_over = classification_report(y_test_sampled, y_pred)

print("After optimization\n")
    
print(f"Train score: {train_score_over}\nTest score: {test_score_over}\n{report_initial_over}")

After optimization

Train score: 0.7661509840399491
Test score: 0.7124903365997367
              precision    recall  f1-score   support

           0       0.97      0.70      0.82     87544
           1       0.20      0.80      0.32      8178

    accuracy                           0.71     95722
   macro avg       0.59      0.75      0.57     95722
weighted avg       0.91      0.71      0.78     95722



In [160]:
print("Before optimization\n")

D_Tree_w_normalize(X_train_over.loc[:, rfe_over.support_], y_train_over, \
        X_test_sampled.loc[:, rfe_over.support_], y_test_sampled)

Before optimization

Train score: 0.9753108782923725
Test score: 0.8375086187083429
              precision    recall  f1-score   support

           0       0.93      0.89      0.91     87544
           1       0.19      0.27      0.22      8178

    accuracy                           0.84     95722
   macro avg       0.56      0.58      0.56     95722
weighted avg       0.87      0.84      0.85     95722

f1 micro: 0.8375086187083429


The accuracy scores became lower than the model before optimization. However, the recall score for 1 improved a lot. Therefore, I'll compare this model to other dataset models.

**SMOTE data**

train size = 0.06

In [111]:
%%time

depth = range(1, 41, 2)
split = range(2, 41, 2)
leaf =  range(1, 41, 2)

DT_gridsearch(depth, split, leaf, X_GS_smote, y_GS_smote, \
        X_test_smote.loc[:, rfe_smote.support_], y_test_smote)

Hyperparameter :
 DecisionTreeClassifier(criterion='entropy', max_depth=15, min_samples_leaf=3,
                       min_samples_split=22, random_state=42,
                       splitter='random')
Train score:
 0.8012728978825834
Test score:
 0.7476337728004011
Best F1 Score: 0.3040133682906451
CPU times: total: 37min 41s
Wall time: 39min 14s


train size = full  
The best hyperparameters are **max_depth=15, min_samples_leaf=3,
                       min_samples_split=22**. Therefore, I'll use the numbers close to the results.

In [125]:
%%time

depth = range(11, 21, 2)
split = range(18, 29, 2)
leaf =  range(1, 11, 2)

DT_gridsearch(depth, split, leaf, X_train_smote.loc[:, rfe_smote.support_], y_train_smote, \
        X_test_smote.loc[:, rfe_smote.support_], y_test_smote)

Hyperparameter :
 DecisionTreeClassifier(criterion='entropy', max_depth=13, min_samples_split=18,
                       random_state=42)
Train score:
 0.8058528346225399
Test score:
 0.753578069827208
Best F1 Score: 0.3065616180620884
CPU times: total: 15min 52s
Wall time: 16min 33s


The hyperparameter of min_samples_split is on edge of the range. So the best parameter of min_samples_split might be lower. Therefore, I'll set the range of the parameter lower than above and do GridSearch.

In [130]:
%%time

depth = range(11, 21, 2)
split = range(14, 29, 2)
leaf =  range(1, 11, 2)

DT_gridsearch(depth, split, leaf, X_train_smote.loc[:, rfe_smote.support_], y_train_smote, \
        X_test_smote.loc[:, rfe_smote.support_], y_test_smote)

Hyperparameter :
 DecisionTreeClassifier(max_depth=15, min_samples_leaf=3, min_samples_split=16,
                       random_state=42, splitter='random')
Train score:
 0.805302065994321
Test score:
 0.7533586845239338
Best F1 Score: 0.30596466472646033
CPU times: total: 20min 27s
Wall time: 20min 57s


Now, all the hyperparameters are within the range. However, the test score and f1 score became lower after setting the range of min_samples_split wider. Therefore, I'll use the hyperparameters one before.

In [139]:
scaler = StandardScaler()
scaler.fit(X_train_smote.loc[:, rfe_smote.support_])
X_scaled_train = scaler.transform(X_train_smote.loc[:, rfe_smote.support_])
X_scaled_test = scaler.transform(X_test_smote.loc[:, rfe_smote.support_])
    
DT_model = DecisionTreeClassifier(criterion='entropy', max_depth=13, min_samples_split=18, random_state=42)

DT_model.fit(X_scaled_train, y_train_smote)
    
train_score_smote = DT_model.score(X_scaled_train, y_train_smote)
test_score_smote = DT_model.score(X_scaled_test, y_test_smote)
    
    # Evaluation(precision & Recall)
y_pred = DT_model.predict(X_scaled_test)

report_initial_smote = classification_report(y_test_smote, y_pred)

print("After optimization\n")
    
print(f"Train score: {train_score_smote}\nTest score: {test_score_smote}\n{report_initial_smote}")

After optimization

Train score: 0.8058528346225399
Test score: 0.753578069827208
              precision    recall  f1-score   support

           0       0.96      0.76      0.85     87544
           1       0.20      0.64      0.31      8178

    accuracy                           0.75     95722
   macro avg       0.58      0.70      0.58     95722
weighted avg       0.89      0.75      0.80     95722



In [140]:
print("Before optimization\n")

D_Tree_w_normalize(X_train_smote.loc[:, rfe_smote.support_], y_train_smote, \
        X_test_smote.loc[:, rfe_smote.support_], y_test_smote)

Before optimization

Train score: 0.8059678840693234
Test score: 0.753400472200748
              precision    recall  f1-score   support

           0       0.96      0.76      0.85     87544
           1       0.20      0.64      0.31      8178

    accuracy                           0.75     95722
   macro avg       0.58      0.70      0.58     95722
weighted avg       0.89      0.75      0.80     95722

f1 micro: 0.753400472200748


The test score improved slightly, but other scores didn't change. However, this model got better test score than the other models. Therefore, I'll compare this model to others at the end.

### Model Comparison

In [161]:
# Under sampled data

scaler = StandardScaler()
scaler.fit(X_train_under.loc[:, rfe_under.support_])
X_scaled_train = scaler.transform(X_train_under.loc[:, rfe_under.support_])
X_scaled_test = scaler.transform(X_test_sampled.loc[:, rfe_under.support_])
    
DT_model = DecisionTreeClassifier(criterion='entropy', max_depth=9, min_samples_leaf=9,
                       min_samples_split=30, random_state=42, splitter='random')

DT_model.fit(X_scaled_train, y_train_under)
    
train_score_under = DT_model.score(X_scaled_train, y_train_under)
test_score_under = DT_model.score(X_scaled_test, y_test_sampled)
    
    # Evaluation(precision & Recall)
y_pred = DT_model.predict(X_scaled_test)

report_initial_under = classification_report(y_test_sampled, y_pred)

print("Under Sampled data\n")
    
print(f"Train score: {train_score_under}\nTest score: {test_score_under}\n{report_initial_under}")

Under Sampled data

Train score: 0.7625320831805563
Test score: 0.7185286558993753
              precision    recall  f1-score   support

           0       0.97      0.71      0.82     87544
           1       0.20      0.79      0.33      8178

    accuracy                           0.72     95722
   macro avg       0.59      0.75      0.57     95722
weighted avg       0.91      0.72      0.78     95722



In [162]:
# Over sampled data
scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_train = scaler.transform(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test_sampled.loc[:, rfe_over.support_])
    
DT_model = DecisionTreeClassifier(max_depth=9, min_samples_leaf=23, random_state=42)

DT_model.fit(X_scaled_train, y_train_over)
    
train_score_over = DT_model.score(X_scaled_train, y_train_over)
test_score_over = DT_model.score(X_scaled_test, y_test_sampled)
    
    # Evaluation(precision & Recall)
y_pred = DT_model.predict(X_scaled_test)

report_initial_over = classification_report(y_test_sampled, y_pred)

print("Over Sampled data\n")
    
print(f"Train score: {train_score_over}\nTest score: {test_score_over}\n{report_initial_over}")

Over Sampled data

Train score: 0.7661509840399491
Test score: 0.7124903365997367
              precision    recall  f1-score   support

           0       0.97      0.70      0.82     87544
           1       0.20      0.80      0.32      8178

    accuracy                           0.71     95722
   macro avg       0.59      0.75      0.57     95722
weighted avg       0.91      0.71      0.78     95722



In [163]:
# SMOTE data
scaler = StandardScaler()
scaler.fit(X_train_smote.loc[:, rfe_smote.support_])
X_scaled_train = scaler.transform(X_train_smote.loc[:, rfe_smote.support_])
X_scaled_test = scaler.transform(X_test_smote.loc[:, rfe_smote.support_])
    
DT_model = DecisionTreeClassifier(criterion='entropy', max_depth=13, min_samples_split=18, random_state=42)

DT_model.fit(X_scaled_train, y_train_smote)
    
train_score_smote = DT_model.score(X_scaled_train, y_train_smote)
test_score_smote = DT_model.score(X_scaled_test, y_test_smote)
    
    # Evaluation(precision & Recall)
y_pred = DT_model.predict(X_scaled_test)

report_initial_smote = classification_report(y_test_smote, y_pred)

print("SMOTE data\n")
    
print(f"Train score: {train_score_smote}\nTest score: {test_score_smote}\n{report_initial_smote}")

SMOTE data

Train score: 0.8058528346225399
Test score: 0.753578069827208
              precision    recall  f1-score   support

           0       0.96      0.76      0.85     87544
           1       0.20      0.64      0.31      8178

    accuracy                           0.75     95722
   macro avg       0.58      0.70      0.58     95722
weighted avg       0.89      0.75      0.80     95722



SMOTE data model has the best test score. However, Over sampled data has the highest recall score for 1. I would like to pridict person who has heart disease more than pridicting person who doesn't have heart disease, so I'll choose Over sampled data model from Dicision Tree models.

**From Logistic Regression: Over Sampled dataset**