# 9. Capstone Project - Final model

#### Loading data and libralies

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as stats

from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [21]:
# Train set : Under sampled data set and Over sampled data set
under_sampled_df = pd.read_csv("./under_sampled_df_fe.csv")
over_sampled_df = pd.read_csv("./over_sampled_df_fe.csv")
# Test set : Under sampled data set and Over sampled data set
test_sampled_df = pd.read_csv("./test_sampled_df_fe.csv")
               
# Train set : SMOTE data set         
smote_df = pd.read_csv("./smote_df.csv")
# Test set : SMOTE data set  
test_smote_df = pd.read_csv("./test_smote_df.csv")

Split each dataset into train set and test set

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
# Split other data set into X and y
X_train_under = under_sampled_df.drop(columns="HeartDisease")
y_train_under = under_sampled_df["HeartDisease"]

X_train_over = over_sampled_df.drop(columns="HeartDisease")
y_train_over = over_sampled_df["HeartDisease"]

X_test_sampled = test_sampled_df.drop(columns="HeartDisease")
y_test_sampled = test_sampled_df["HeartDisease"]

X_train_smote = smote_df.drop(columns="HeartDisease")
y_train_smote = smote_df["HeartDisease"]

X_test_smote = test_smote_df.drop(columns="HeartDisease")
y_test_smote = test_smote_df["HeartDisease"]

**Festure selection: RFE**  
The columns that I'll use for modeling

In [24]:
# RFE : Under sampled data final

scaler = StandardScaler()
scaler.fit(X_train_under)
X_scaled_train = scaler.transform(X_train_under)
X_scaled_test = scaler.transform(X_test_sampled)
   
rfe_under = RFE(estimator=LogisticRegression(max_iter=1000, random_state=42), n_features_to_select=9)
rfe_under.fit(X_scaled_train,y_train_under)
    
train_score_u_rfe = rfe_under.score(X_scaled_train,y_train_under)
test_score_u_rfe = rfe_under.score(X_scaled_test,y_test_sampled)

In [25]:
X_train_under.loc[:, rfe_under.support_]

Unnamed: 0,Smoking,AlcoholDrinking,PhysicalHealth,DiffWalking,Sex,AgeCategory,GenHealth,Asthma,Race_Asian
0,1,0,0.0,0,1,65,1,0,0
1,0,0,5.0,0,0,80,0,0,0
2,0,0,0.0,0,0,40,4,0,0
3,0,0,7.0,1,0,60,2,0,0
4,1,0,0.0,0,1,70,3,0,0
...,...,...,...,...,...,...,...,...,...
38177,0,0,0.0,0,1,75,3,0,0
38178,0,0,30.0,1,0,50,0,0,0
38179,0,0,0.0,0,0,75,2,0,0
38180,1,0,15.0,0,1,70,3,0,0


In [26]:
# RFE : Over sampled data final

scaler = StandardScaler()
scaler.fit(X_train_over)
X_scaled_train = scaler.transform(X_train_over)
X_scaled_test = scaler.transform(X_test_sampled)
   
rfe_over = RFE(estimator=LogisticRegression(max_iter=1000, random_state=42), n_features_to_select=11)
rfe_over.fit(X_scaled_train,y_train_over)
    
train_score_ov_rfe = rfe_over.score(X_scaled_train,y_train_over)
test_score_ov_rfe = rfe_over.score(X_scaled_test,y_test_sampled)

In [27]:
X_train_over.loc[:, rfe_over.support_]

Unnamed: 0,BMI,Smoking,AlcoholDrinking,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,GenHealth,Asthma,Race_Asian
0,27.12,1,0,0.0,2.0,0,1,35,3,0,0
1,30.23,0,0,0.0,0.0,0,0,18,2,0,0
2,32.55,0,0,0.0,0.0,0,1,60,3,0,0
3,33.07,0,0,0.0,0.0,0,0,55,3,0,0
4,23.06,0,0,0.0,0.0,0,1,30,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...
408515,42.87,0,0,30.0,0.0,1,0,60,2,0,0
408516,24.41,0,0,0.0,0.0,0,1,65,2,0,0
408517,33.00,0,0,0.0,0.0,0,1,60,1,0,0
408518,31.32,1,0,3.0,2.0,0,0,65,2,1,0


In [28]:
# RFE : SMOTE data final

scaler = StandardScaler()
scaler.fit(X_train_smote)
X_scaled_train = scaler.transform(X_train_smote)
X_scaled_test = scaler.transform(X_test_smote)
    
rfe_smote = RFE(estimator=LogisticRegression(max_iter=1500, random_state=42), n_features_to_select=10)
rfe_smote.fit(X_scaled_train,y_train_smote)
    
train_score_s_rfe = rfe_smote.score(X_scaled_train,y_train_smote)
test_score_s_rfe = rfe_smote.score(X_scaled_test,y_test_smote)

In [29]:
X_train_smote.loc[:, rfe_smote.support_]

Unnamed: 0,AlcoholDrinking,AgeCategory,GenHealth,Asthma,Race_American Indian/Alaskan Native,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White
0,0,35,3,0,0,0,0,0,0,1
1,0,18,2,0,0,0,0,1,0,0
2,0,60,3,0,0,0,0,0,0,1
3,0,55,3,0,0,0,0,1,0,0
4,0,30,4,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
408515,0,80,1,0,0,0,0,0,0,1
408516,0,65,3,0,0,0,0,0,0,1
408517,0,70,2,0,0,0,0,0,0,0
408518,0,75,2,0,0,0,0,0,0,1


### Model Selecting

I've been using the same test set for all process. Now I'll use 2 different test sets.

In [89]:
clean_cvt_df = pd.read_csv("./capstone_clean_heart_disease.csv")
X = clean_cvt_df.drop(columns="HeartDisease")
y = clean_cvt_df["HeartDisease"].map({"No":0, "Yes":1})

# test set 1
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size = 0.3, random_state=76)

yes_no_columns = ["Smoking", "AlcoholDrinking", "DiffWalking", "PhysicalActivity", "Asthma"]

for column in yes_no_columns:
    X_test1[column] = X_test1[column].map({"No":0, "Yes":1})

X_test1["Sex"] = X_test1["Sex"].replace({"Male":1, "Female":0})

X_test1["AgeCategory"] = X_test1["AgeCategory"].replace({'55-59':55, '80 or older':80, '65-69':65, '75-79':75,
                                                    '40-44':40, '70-74':70,'60-64':60, '50-54':50, '45-49':45,
                                                    '18-24':18, '35-39':35, '30-34':30, '25-29':25})
    
X_test1["GenHealth"] = X_test1["GenHealth"].replace({'Very good':3, 'Fair':1, 'Good':2, 'Poor':0, 'Excellent':4})

X_test1 = pd.get_dummies(X_test1)


# test set 2
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size = 0.3, random_state=121)

yes_no_columns = ["Smoking", "AlcoholDrinking", "DiffWalking", "PhysicalActivity", "Asthma"]

for column in yes_no_columns:
    X_test2[column] = X_test2[column].map({"No":0, "Yes":1})

X_test2["Sex"] = X_test2["Sex"].replace({"Male":1, "Female":0})

X_test2["AgeCategory"] = X_test2["AgeCategory"].replace({'55-59':55, '80 or older':80, '65-69':65, '75-79':75,
                                                    '40-44':40, '70-74':70,'60-64':60, '50-54':50, '45-49':45,
                                                    '18-24':18, '35-39':35, '30-34':30, '25-29':25})
    
X_test2["GenHealth"] = X_test2["GenHealth"].replace({'Very good':3, 'Fair':1, 'Good':2, 'Poor':0, 'Excellent':4})

X_test2 = pd.get_dummies(X_test2)

**Logistic Regression**

In [48]:
scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_train = scaler.transform(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test_sampled.loc[:, rfe_over.support_])
    
my_PCA = PCA()
my_PCA.fit(X_scaled_train)

X_train_PCA = my_PCA.transform(X_scaled_train)
X_test_PCA = my_PCA.transform(X_scaled_test)
    
log_model_over = LogisticRegression(max_iter=1000, C=0.0001, penalty="l2", solver="liblinear", random_state=42)
log_model_over.fit(X_train_PCA,y_train_over)

train_score_log_o = log_model_over.score(X_train_PCA, y_train_over)
test_score_log_o = log_model_over.score(X_test_PCA, y_test_sampled)

# Evaluation(precision & Recall)
y_pred = log_model_over.predict(X_test_PCA)
f1_sco_log_o = f1_score(y_test_sampled, y_pred, pos_label=1)

report_initial = classification_report(y_test_sampled, y_pred)

print("Logistic Regression: Over sampled data\n")

print(f"Train score: {train_score_log_o}\nTest score: {test_score_log_o}\nF1 score: {f1_sco_log_o}\n\n{report_initial}")

Logistic Regression: Over sampled data

Train score: 0.756489278370704
Test score: 0.7338647332901528
F1 score: 0.335879454626033

              precision    recall  f1-score   support

           0       0.97      0.73      0.83     87544
           1       0.21      0.79      0.34      8178

    accuracy                           0.73     95722
   macro avg       0.59      0.76      0.58     95722
weighted avg       0.91      0.73      0.79     95722



In [90]:
print("Logistic Regression: Over sampled data\n")

print(" Test set 1")
scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test1.loc[:, rfe_over.support_])
y_pred1 = log_model_over.predict(X_scaled_test)
report_initial1 = classification_report(y_test1, y_pred1)
print(report_initial1)


print("\n\nTest set 2")
scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test2.loc[:, rfe_over.support_])
y_pred2 = log_model_over.predict(X_scaled_test)
report_initial2 = classification_report(y_test2, y_pred2)
print(report_initial2)

Logistic Regression: Over sampled data

 Test set 1
              precision    recall  f1-score   support

           0       0.89      0.44      0.59     87548
           1       0.07      0.43      0.11      8174

    accuracy                           0.44     95722
   macro avg       0.48      0.43      0.35     95722
weighted avg       0.82      0.44      0.55     95722



Test set 2
              precision    recall  f1-score   support

           0       0.89      0.44      0.59     87484
           1       0.07      0.43      0.12      8238

    accuracy                           0.44     95722
   macro avg       0.48      0.43      0.35     95722
weighted avg       0.82      0.44      0.54     95722



The accuracy score of the original test set was 73. However, the accuracy scores of new 2 test sets was both 44. Therefore, this model is rejected.

In [58]:
scaler = StandardScaler()
scaler.fit(X_train_smote.loc[:, rfe_smote.support_])
X_scaled_train = scaler.transform(X_train_smote.loc[:, rfe_smote.support_])
X_scaled_test = scaler.transform(X_test_smote.loc[:, rfe_smote.support_])
   
my_PCA = PCA()
my_PCA.fit(X_scaled_train)

X_train_PCA = my_PCA.transform(X_scaled_train)
X_test_PCA = my_PCA.transform(X_scaled_test)

log_model_smote = LogisticRegression(max_iter=1000, C=0.001, penalty=None, solver="lbfgs", random_state=42)
log_model_smote.fit(X_train_PCA,y_train_smote)

train_score_log_s = log_model_smote.score(X_train_PCA, y_train_smote)
test_score_log_s = log_model_smote.score(X_test_PCA, y_test_smote)

# Evaluation(precision & Recall)
y_pred = log_model_smote.predict(X_test_PCA)
f1_sco_log_s = f1_score(y_test_smote, y_pred, pos_label=1)

report_initial = classification_report(y_test_smote, y_pred)

print("Logistic Regression: SMOTE data\n")

print(f"Train score: {train_score_log_s}\nTest score: {test_score_log_s}\nF1 score: {f1_sco_log_s}\n\n{report_initial}")



Logistic Regression: SMOTE data

Train score: 0.8012165867032214
Test score: 0.7836756440525688
F1 score: 0.3248231112850109

              precision    recall  f1-score   support

           0       0.96      0.80      0.87     87544
           1       0.22      0.61      0.32      8178

    accuracy                           0.78     95722
   macro avg       0.59      0.70      0.60     95722
weighted avg       0.89      0.78      0.82     95722



In [91]:
print("Logistic Regression: SMOTE data\n")

print("Test set 1")
scaler = StandardScaler()
scaler.fit(X_train_smote.loc[:, rfe_smote.support_])
X_scaled_test = scaler.transform(X_test1.loc[:, rfe_smote.support_])
y_pred1 = log_model_smote.predict(X_scaled_test)
report_initial1 = classification_report(y_test1, y_pred1)
print(report_initial1)


print("\n\nTest set 2")
scaler = StandardScaler()
scaler.fit(X_train_smote.loc[:, rfe_smote.support_])
X_scaled_test = scaler.transform(X_test2.loc[:, rfe_smote.support_])
y_pred2 = log_model_smote.predict(X_scaled_test)
report_initial2 = classification_report(y_test2, y_pred2)
print(report_initial2)

Logistic Regression: SMOTE data

Test set 1
              precision    recall  f1-score   support

           0       0.94      0.24      0.38     87548
           1       0.09      0.82      0.16      8174

    accuracy                           0.29     95722
   macro avg       0.51      0.53      0.27     95722
weighted avg       0.86      0.29      0.36     95722



Test set 2
              precision    recall  f1-score   support

           0       0.94      0.24      0.38     87484
           1       0.09      0.83      0.17      8238

    accuracy                           0.29     95722
   macro avg       0.51      0.53      0.27     95722
weighted avg       0.86      0.29      0.36     95722



The accuracy score of the original test set was 78. However, the accuracy scores of new 2 test sets was both 29. Therefore, this model is rejected.

**Naive Bayes**

In [16]:
scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_train = scaler.transform(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test_sampled.loc[:, rfe_over.support_])
    
my_PCA = PCA()
my_PCA.fit(X_scaled_train)

X_train_PCA = my_PCA.transform(X_scaled_train)
X_test_PCA = my_PCA.transform(X_scaled_test)

my_gaussian_nb = GaussianNB(var_smoothing=0.1).fit(X_train_PCA, y_train_over)
    
train_pred = my_gaussian_nb.predict(X_train_PCA)
test_pred = my_gaussian_nb.predict(X_test_PCA)
    
train_score_nb = accuracy_score(y_train_over, train_pred)
test_score_nb = accuracy_score(y_test_sampled, test_pred)

f1_sco_nb = f1_score(y_test_sampled, test_pred, pos_label=1)
    
report_initial = classification_report(y_test_sampled, test_pred)

print("Naive Bayes\n")
    
print(f"Train score: {train_score_nb}\nTest score: {test_score_nb}\nF1 score: {f1_sco_nb}\n\n{report_initial}")

Naive Bayes

Train score: 0.7496719866836385
Test score: 0.7213075364075134
F1 score: 0.3254696705352853

              precision    recall  f1-score   support

           0       0.97      0.72      0.82     87544
           1       0.21      0.79      0.33      8178

    accuracy                           0.72     95722
   macro avg       0.59      0.75      0.57     95722
weighted avg       0.91      0.72      0.78     95722



In [92]:
print("Naive Bayes\n")

print("Test set 1")
scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test1.loc[:, rfe_over.support_])
y_pred1 = my_gaussian_nb.predict(X_scaled_test)
report_initial1 = classification_report(y_test1, y_pred1)
print(report_initial1)


print("\n\nTest set 2")
scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test2.loc[:, rfe_over.support_])
y_pred2 = my_gaussian_nb.predict(X_scaled_test)
report_initial2 = classification_report(y_test2, y_pred2)
print(report_initial2)

Naive Bayes

Test set 1
              precision    recall  f1-score   support

           0       0.90      0.41      0.57     87548
           1       0.07      0.49      0.13      8174

    accuracy                           0.42     95722
   macro avg       0.48      0.45      0.35     95722
weighted avg       0.83      0.42      0.53     95722



Test set 2
              precision    recall  f1-score   support

           0       0.90      0.41      0.57     87484
           1       0.07      0.49      0.13      8238

    accuracy                           0.42     95722
   macro avg       0.48      0.45      0.35     95722
weighted avg       0.83      0.42      0.53     95722



The accuracy score of the original test set was 72. However, the accuracy scores of new 2 test sets was both 42. Therefore, this model is rejected.

**Decision Tree**

In [30]:
scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_train = scaler.transform(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test_sampled.loc[:, rfe_over.support_])
    
DT_model = DecisionTreeClassifier(max_depth=9, min_samples_leaf=23, random_state=42)

DT_model.fit(X_scaled_train, y_train_over)
    
train_score_dt = DT_model.score(X_scaled_train, y_train_over)
test_score_dt = DT_model.score(X_scaled_test, y_test_sampled)
    
    # Evaluation(precision & Recall)
y_pred = DT_model.predict(X_scaled_test)
f1_sco_dt = f1_score(y_test_sampled, y_pred, pos_label=1)

report_initial = classification_report(y_test_sampled, y_pred)

print("Decision Tree\n")
    
print(f"Train score: {train_score_dt}\nTest score: {test_score_dt}\nF1 score: {f1_sco_dt}\n\n{report_initial}")

Decision Tree

Train score: 0.7661509840399491
Test score: 0.7124903365997367
F1 score: 0.32142416845427424

              precision    recall  f1-score   support

           0       0.97      0.70      0.82     87544
           1       0.20      0.80      0.32      8178

    accuracy                           0.71     95722
   macro avg       0.59      0.75      0.57     95722
weighted avg       0.91      0.71      0.78     95722



In [93]:
print("Decision Tree\n")

print("Test set 1")
scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test1.loc[:, rfe_over.support_])
y_pred1 = DT_model.predict(X_scaled_test)
report_initial1 = classification_report(y_test1, y_pred1)
print(report_initial1)


print("\n\nTest set 2")
scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test2.loc[:, rfe_over.support_])
y_pred2 = DT_model.predict(X_scaled_test)
report_initial2 = classification_report(y_test2, y_pred2)
print(report_initial2)

Decision Tree

Test set 1
              precision    recall  f1-score   support

           0       0.98      0.71      0.82     87548
           1       0.21      0.81      0.33      8174

    accuracy                           0.72     95722
   macro avg       0.59      0.76      0.57     95722
weighted avg       0.91      0.72      0.78     95722



Test set 2
              precision    recall  f1-score   support

           0       0.98      0.71      0.82     87484
           1       0.21      0.81      0.33      8238

    accuracy                           0.72     95722
   macro avg       0.59      0.76      0.58     95722
weighted avg       0.91      0.72      0.78     95722



The accuracy score of the original test set was 71. Also The accuracy score of new 2 test sets was both 72, and they are the almost same accuracy score with the accuracy score of the original test set. Therefore, I'll keep this model.

**Random Forest**

In [64]:
scaler = StandardScaler()
scaler.fit(X_train_under.loc[:, rfe_under.support_])
X_scaled_train = scaler.transform(X_train_under.loc[:, rfe_under.support_])
X_scaled_test = scaler.transform(X_test_sampled.loc[:, rfe_under.support_])
   
my_PCA = PCA()
my_PCA.fit(X_scaled_train)

X_train_PCA = my_PCA.transform(X_scaled_train)
X_test_PCA = my_PCA.transform(X_scaled_test)   
    
RF_model_under = RandomForestClassifier(max_depth=20, min_samples_leaf=6, 
                                  min_samples_split=20,n_estimators=10, random_state=42)
RF_model_under.fit(X_train_PCA, y_train_under)
    
train_score_rf = RF_model_under.score(X_train_PCA, y_train_under)
test_score_rf = RF_model_under.score(X_test_PCA, y_test_sampled)
    
    # Evaluation(precision & Recall)
y_pred = RF_model_under.predict(X_test_PCA)
f1_sco_rf = f1_score(y_test_sampled, y_pred, pos_label=1)

report_initial = classification_report(y_test_sampled, y_pred)

print("Rando Forest\n")
    
print(f"Train score: {train_score_rf}\nTest score: {test_score_rf}\nF1 score: {f1_sco_rf}\n\n{report_initial}")

Rando Forest

Train score: 0.7811010423759887
Test score: 0.7131171517519483
F1 score: 0.31981769994798503

              precision    recall  f1-score   support

           0       0.97      0.71      0.82     87544
           1       0.20      0.79      0.32      8178

    accuracy                           0.71     95722
   macro avg       0.59      0.75      0.57     95722
weighted avg       0.91      0.71      0.78     95722



In [94]:
print("Rando Forest\n")

print("Test set 1")
scaler = StandardScaler()
scaler.fit(X_train_under.loc[:, rfe_under.support_])
X_scaled_test = scaler.transform(X_test1.loc[:, rfe_under.support_])
y_pred1 = RF_model_under.predict(X_scaled_test)
report_initial1 = classification_report(y_test1, y_pred1)
print(report_initial1)


print("\n\nTest set 2")
scaler = StandardScaler()
scaler.fit(X_train_under.loc[:, rfe_under.support_])
X_scaled_test = scaler.transform(X_test2.loc[:, rfe_under.support_])
y_pred2 = RF_model_under.predict(X_scaled_test)
report_initial2 = classification_report(y_test2, y_pred2)
print(report_initial2)

Rando Forest

Test set 1
              precision    recall  f1-score   support

           0       0.91      0.37      0.53     87548
           1       0.08      0.60      0.14      8174

    accuracy                           0.39     95722
   macro avg       0.50      0.49      0.34     95722
weighted avg       0.84      0.39      0.50     95722



Test set 2
              precision    recall  f1-score   support

           0       0.91      0.37      0.53     87484
           1       0.08      0.59      0.14      8238

    accuracy                           0.39     95722
   macro avg       0.49      0.48      0.34     95722
weighted avg       0.84      0.39      0.50     95722



The accuracy score of the original test set was 71. However, the accuracy scores of new 2 test sets was both 39. Therefore, this model is rejected.

**LinearSVC**

In [67]:
# over sampled data

scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_train = scaler.transform(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test_sampled.loc[:, rfe_over.support_])
       
l_svc_model = LinearSVC()
l_svc_model.fit(X_scaled_train, y_train_over)
    
train_score_svc = l_svc_model.score(X_scaled_train, y_train_over)
test_score_svc = l_svc_model.score(X_scaled_test, y_test_sampled)
    
# Evaluation(precision & Recall)
y_pred = l_svc_model.predict(X_scaled_test)
f1_sco_svc = f1_score(y_test_sampled, y_pred, pos_label=1)

report_initial = classification_report(y_test_sampled, y_pred)

print("LinearSVC\n")
    
print(f"Train score: {train_score_svc}\nTest score: {test_score_svc}\nF1 score: {f1_sco_svc}\n\n{report_initial}")



LinearSVC

Train score: 0.756313032409674
Test score: 0.7281711623242306
F1 score: 0.3324781939456131

              precision    recall  f1-score   support

           0       0.97      0.72      0.83     87544
           1       0.21      0.79      0.33      8178

    accuracy                           0.73     95722
   macro avg       0.59      0.76      0.58     95722
weighted avg       0.91      0.73      0.79     95722



In [95]:
print("LinearSVC\n")

print("Test set 1")
scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test1.loc[:, rfe_over.support_])
y_pred1 = l_svc_model.predict(X_scaled_test)
report_initial1 = classification_report(y_test1, y_pred1)
print(report_initial1)


print("\n\nTest set 2")
scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test2.loc[:, rfe_over.support_])
y_pred2 = l_svc_model.predict(X_scaled_test)
report_initial2 = classification_report(y_test2, y_pred2)
print(report_initial2)

LinearSVC

Test set 1
              precision    recall  f1-score   support

           0       0.97      0.72      0.83     87548
           1       0.21      0.78      0.33      8174

    accuracy                           0.73     95722
   macro avg       0.59      0.75      0.58     95722
weighted avg       0.91      0.73      0.79     95722



Test set 2
              precision    recall  f1-score   support

           0       0.97      0.72      0.83     87484
           1       0.21      0.79      0.34      8238

    accuracy                           0.73     95722
   macro avg       0.59      0.76      0.58     95722
weighted avg       0.91      0.73      0.79     95722



The accuracy score of the original test set was 73. Also The accuracy score of new 2 test sets was both 73, and they are the same as the accuracy score of the original test set. Therefore, I'll keep this model.

### Comparing the Better Models

In [103]:
scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_train = scaler.transform(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test_sampled.loc[:, rfe_over.support_])
    
DT_model = DecisionTreeClassifier(max_depth=9, min_samples_leaf=23, random_state=42)

DT_model.fit(X_scaled_train, y_train_over)
    
y_pred = DT_model.predict(X_scaled_test)

report_initial = classification_report(y_test_sampled, y_pred)

print("Decision Tree\n\n")

print("Original Test set")
print(report_initial)


print("\n\nTest set 1")
scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test1.loc[:, rfe_over.support_])
y_pred1 = DT_model.predict(X_scaled_test)
report_initial1 = classification_report(y_test1, y_pred1)
print(report_initial1)


print("\n\nTest set 2")
scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test2.loc[:, rfe_over.support_])
y_pred2 = DT_model.predict(X_scaled_test)
report_initial2 = classification_report(y_test2, y_pred2)
print(report_initial2)

Decision Tree


Original Test set
              precision    recall  f1-score   support

           0       0.97      0.70      0.82     87544
           1       0.20      0.80      0.32      8178

    accuracy                           0.71     95722
   macro avg       0.59      0.75      0.57     95722
weighted avg       0.91      0.71      0.78     95722



Test set 1
              precision    recall  f1-score   support

           0       0.98      0.71      0.82     87548
           1       0.21      0.81      0.33      8174

    accuracy                           0.72     95722
   macro avg       0.59      0.76      0.57     95722
weighted avg       0.91      0.72      0.78     95722



Test set 2
              precision    recall  f1-score   support

           0       0.98      0.71      0.82     87484
           1       0.21      0.81      0.33      8238

    accuracy                           0.72     95722
   macro avg       0.59      0.76      0.58     95722
weighted avg  

In [104]:
scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_train = scaler.transform(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test_sampled.loc[:, rfe_over.support_])
       
l_svc_model = LinearSVC()
l_svc_model.fit(X_scaled_train, y_train_over)
    
y_pred = l_svc_model.predict(X_scaled_test)

report_initial = classification_report(y_test_sampled, y_pred)

print("LinearSVC\n\n")

print("Original Test set")
print(report_initial)


print("\n\nTest set 1")
scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test1.loc[:, rfe_over.support_])
y_pred1 = l_svc_model.predict(X_scaled_test)
report_initial1 = classification_report(y_test1, y_pred1)
print(report_initial1)


print("\n\nTest set 2")
scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test2.loc[:, rfe_over.support_])
y_pred2 = l_svc_model.predict(X_scaled_test)
report_initial2 = classification_report(y_test2, y_pred2)
print(report_initial2)



LinearSVC


Original Test set
              precision    recall  f1-score   support

           0       0.97      0.72      0.83     87544
           1       0.21      0.79      0.33      8178

    accuracy                           0.73     95722
   macro avg       0.59      0.76      0.58     95722
weighted avg       0.91      0.73      0.79     95722



Test set 1
              precision    recall  f1-score   support

           0       0.97      0.72      0.83     87548
           1       0.21      0.78      0.33      8174

    accuracy                           0.73     95722
   macro avg       0.59      0.75      0.58     95722
weighted avg       0.91      0.73      0.79     95722



Test set 2
              precision    recall  f1-score   support

           0       0.97      0.72      0.83     87484
           1       0.21      0.79      0.34      8238

    accuracy                           0.73     95722
   macro avg       0.59      0.76      0.58     95722
weighted avg      

The accuracy score of LinearSVC model is higher than the Decision Tree model. howevoer, the precision score for 0 and the recall score for 1 are better witn Decision Tree model. Therefore, I'll create a web app with Decison Tree model.

### Final Model

In [106]:
scaler = StandardScaler()
scaler.fit(X_train_over.loc[:, rfe_over.support_])
X_scaled_train = scaler.transform(X_train_over.loc[:, rfe_over.support_])
X_scaled_test = scaler.transform(X_test_sampled.loc[:, rfe_over.support_])
    
DT_model = DecisionTreeClassifier(max_depth=9, min_samples_leaf=23, random_state=42)

DT_model.fit(X_scaled_train, y_train_over)
    
train_score_dt = DT_model.score(X_scaled_train, y_train_over)
test_score_dt = DT_model.score(X_scaled_test, y_test_sampled)
    
    # Evaluation(precision & Recall)
y_pred = DT_model.predict(X_scaled_test)
f1_sco_dt = f1_score(y_test_sampled, y_pred, pos_label=1)

report_initial = classification_report(y_test_sampled, y_pred)

print("Decision Tree\n")
    
print(f"Train score: {train_score_dt}\nTest score: {test_score_dt}\nF1 score: {f1_sco_dt}\n\n{report_initial}")

Decision Tree

Train score: 0.7661509840399491
Test score: 0.7124903365997367
F1 score: 0.32142416845427424

              precision    recall  f1-score   support

           0       0.97      0.70      0.82     87544
           1       0.20      0.80      0.32      8178

    accuracy                           0.71     95722
   macro avg       0.59      0.75      0.57     95722
weighted avg       0.91      0.71      0.78     95722



This is my final model. The recall score for 0 is 0.7, so 30% of people who doesn't have heart disease will be predicted as their risk of developing heart disease is high. The recall score for 1 is 0.8, so 20% of people who have heart disease will be predicted as their risk of developing heart disease is low. 