In [8]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split ,KFold, cross_val_score, GridSearchCV
from collections import Counter
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, roc_auc_score
# Classification models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [4]:
X_train = pd.read_csv('../data/X_train.csv')
X_test = pd.read_csv('../data/X_test.csv')
y_train = pd.read_csv('../data/y_train.csv')
y_test = pd.read_csv('../data/y_test.csv')

## 1. Model Building

Before model building and selection we have to decide that what is our goal? Which metric is more important for us? For example:

- If false positives (incorrectly diagnosing a healthy person with heart disease) have serious consequences, we might prioritize precision.
- If missing actual cases of heart disease is more concerning, we might prioritize recall.

In this case, our priority will be recall but we also check other metrics as well.

### 1.1 Creating KFold Object

In [5]:
# We create KFold object to be sure that we have the same splits of the data every time. We pass kf object to cv parameter
kf = KFold(n_splits=10, shuffle=False)

### 1.2 Average Recall Scores of Different Classification Models

##### 1.2.1 Building Models for Cross-Validation

In [6]:
lg = LogisticRegression()
rf = RandomForestClassifier()
xgb = XGBClassifier()
dt = DecisionTreeClassifier()
knc = KNeighborsClassifier()
svc = SVC()

models= [lg, rf, xgb, dt, knc, svc]

##### 1.2.2 KFold Cross-Validation for each Model

In [13]:
avg_recall_scores = []

for model in models:
    score = cross_val_score(model, X_train, y_train, cv=kf, scoring='recall')
    avg_score = round(score.mean(),3)
    avg_recall_scores.append(avg_score)

avg_recall_scores

[0.837, 0.861, 0.825, 0.775, 0.835, 0.853]

Random Forest Classifier have best average recall score. So we proceed with RF.

In [17]:
rf_before = RandomForestClassifier()

before_score = cross_val_score(rf_before, X_train, y_train, cv=kf, scoring='recall')
before_avg_score = round(before_score.mean(),2)

print('Cross Validation Recall scores are: {}'.format(before_score))
print('Average Cross Validation Recall score: ', before_avg_score)

Cross Validation Recall scores are: [0.91666667 0.77142857 0.92592593 0.86206897 0.86363636 0.85714286
 0.74193548 0.75       0.79310345 0.92857143]
Average Cross Validation Recall score:  0.84


### 1.3  Hyperparameter Tuning with GridSearchCV

In [18]:
classifier_rf = RandomForestClassifier()

In [19]:
rf_params = {
    'n_estimators': [130,150,170,190],
    'max_depth': [8,10,12],
    'min_samples_split': [3,4,5],
    'min_samples_leaf': [1,2,3],
    'random_state': [13]
}

grid_rf = GridSearchCV(rf, param_grid=rf_params, cv=kf, 
                          scoring='recall').fit(X_train, y_train)

In [20]:
print('Best parameters:', grid_rf.best_params_)
print('Best score:', grid_rf.best_score_)

Best parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 150, 'random_state': 13}
Best score: 0.8722376527604558


### 1.4 K-Fold Cross-Validation After Tuning Hyperparameters

In [21]:
rf_after = RandomForestClassifier(max_depth=10, min_samples_leaf=2,
                                  min_samples_split=5, n_estimators=150, random_state=13)

after_score = cross_val_score(rf_after, X_train, y_train, cv=kf, scoring='recall')
after_avg_score = round(after_score.mean(), 3)

print('Cross Validation Recall scores are: {}'.format(after_score))
print('Average Cross Validation Recall score: ', after_avg_score)

Cross Validation Recall scores are: [0.91666667 0.82857143 0.92592593 0.93103448 0.86363636 0.85714286
 0.77419355 0.83333333 0.82758621 0.96428571]
Average Cross Validation Recall score:  0.872


### 1.5 Comparing Before-After Avg Recall Scores

In [22]:
print(f'Average Recall Score Before Hyperparameter Tuning: {before_avg_score}')
print(f'Average Recall Score After Hyperparameter Tuning: {after_avg_score}')

Average Recall Score Before Hyperparameter Tuning: 0.84
Average Recall Score After Hyperparameter Tuning: 0.872


## 2. Model Training

In [23]:
rf_model = rf_after.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

rf_recall = round(recall_score(y_test,y_pred),3)
rf_accuracy = round(accuracy_score(y_test, y_pred),3)
rf_precision = round(precision_score(y_test, y_pred),3)
rf_f1 = round(f1_score(y_test, y_pred),3)
rf_roc_auc =round(roc_auc_score(y_test, y_pred),3)

print('All Metrics of Tuned Random Forest Classifier Model')
print(f'Accuracy: {rf_accuracy}')
print(f'Recall: {rf_recall}')
print(f'Precision: {rf_precision}')
print(f'F1 Score: {rf_f1}')
print(f'ROC-AUC: {rf_roc_auc}')

All Metrics of Tuned Random Forest Classifier Model
Accuracy: 0.907
Recall: 0.886
Precision: 0.933
F1 Score: 0.909
ROC-AUC: 0.908
