In [1]:
# Initial imports.
import pandas as pd
import numpy as np
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### Load in Data

In [2]:
# Importing the input dataset
heart_df_undersampled = pd.read_csv('Resources/heart_undersampled.csv')

### Prepare Data

In [3]:
# Removing HeartDisease target from features data
y = heart_df_undersampled.loc[:,"HeartDisease"]
X = heart_df_undersampled.drop(columns="HeartDisease")

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [4]:
print('Training Features Shape:', X_train.shape)
print('Training Target Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Target Shape:', y_test.shape)

Training Features Shape: (41059, 53)
Training Target Shape: (41059,)
Testing Features Shape: (13687, 53)
Testing Target Shape: (13687,)


In [5]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

print(X_train_scaled.shape)
print(X_test_scaled.shape)

(41059, 53)
(13687, 53)


### Determine Random Forest Parameters

In [6]:
# Create a random forest classifier.
rfc = RandomForestClassifier(random_state=78)

from pprint import pprint

# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rfc.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 78,
 'verbose': 0,
 'warm_start': False}


### Random Search with Cross Validation

In [7]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [8]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rfc = RandomForestClassifier(random_state=78)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rfc, param_distributions=random_grid,
                              n_iter = 100, scoring='neg_mean_absolute_error', 
                              cv = 3, verbose=2, random_state=42, n_jobs=-1,
                              return_train_score=True)

# Fit the random search model
rf_random.fit(X_train_scaled, y_train);

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [9]:
rf_random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 20,
 'bootstrap': True}

In [10]:
rf_random.cv_results_

{'mean_fit_time': array([ 22.58109164,  77.35815159,  69.27899591, 144.40588601,
         53.76798916,  57.25598923,  74.24172616,  46.58845607,
         84.72075486,  84.73158089,  17.14968149,  64.42888784,
        100.94630837,  83.42324138,  56.11978825, 113.40463058,
         41.45651921,  45.57292819, 122.42410715,  28.43678284,
        130.92222532, 128.26649356,  62.44446365,  85.80734968,
        107.96079683,  33.75282431, 113.22554048,  41.45960132,
        138.11369888,  58.75153526,  78.04737655,  61.1121703 ,
         67.16127229,  35.18123229,  89.20231477,  57.68911974,
         56.99801985,  46.47188195,  40.09780701,  53.26440644,
          6.7014904 ,  30.52370747,  41.3772045 ,  19.38315582,
         16.52334102,  59.41805927, 104.69775136,  27.78878792,
         80.58847237,  45.03718233, 128.1221656 , 143.71103279,
         66.79656355,  63.35987719,  98.10064435,  38.27878523,
         50.45279654,  85.53371755,  76.95645428,  62.01957536,
         10.69878626,  

### Evaluate the model

In [11]:
# Evaluate best random search model
best_random = rf_random.best_estimator_
y_pred = best_random.predict(X_test_scaled)

In [12]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[5054, 1850],
       [1375, 5408]], dtype=int64)

In [13]:
# Calculate the Balanced Accuracy Score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.7646633667188898

In [14]:
# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.73      0.76      6904
           1       0.75      0.80      0.77      6783

    accuracy                           0.76     13687
   macro avg       0.77      0.76      0.76     13687
weighted avg       0.77      0.76      0.76     13687



### Grid Search

In [15]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [30, 40, 50, 60],
    'max_features': [2, 3],
    'min_samples_leaf': [2, 3, 4],
    'min_samples_split': [3, 5, 7],
    'n_estimators': [800, 900, 1400, 1800]
}

# Create a base model
rfc = RandomForestClassifier(random_state=78)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rfc, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, return_train_score=True)

In [16]:
# Fit the grid search to the data
grid_search.fit(X_train_scaled, y_train);

Fitting 3 folds for each of 288 candidates, totalling 864 fits


In [17]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 30,
 'max_features': 3,
 'min_samples_leaf': 3,
 'min_samples_split': 7,
 'n_estimators': 900}

### Evaluate the Best Model from Grid Search

In [18]:
best_grid = grid_search.best_estimator_
y_pred = best_grid.predict(X_test_scaled)

In [19]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[5019, 1885],
       [1348, 5435]], dtype=int64)

In [20]:
# Calculate the Balanced Accuracy Score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.7641188740544702

In [21]:
# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.73      0.76      6904
           1       0.74      0.80      0.77      6783

    accuracy                           0.76     13687
   macro avg       0.77      0.76      0.76     13687
weighted avg       0.77      0.76      0.76     13687



### Final round of Grid Search

In [22]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [50, 60, None],
    'max_features': [3, 4],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [5],
    'n_estimators': [600, 800, 1000]
}

# Create a base model
rfc = RandomForestClassifier(random_state=78)

# Instantiate the grid search model
grid_search_final = GridSearchCV(estimator = rfc, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, return_train_score=True)

In [23]:
# Fit the grid search to the data
grid_search_final.fit(X_train_scaled, y_train);

Fitting 3 folds for each of 54 candidates, totalling 162 fits


In [24]:
grid_search_final.best_params_

{'bootstrap': True,
 'max_depth': 50,
 'max_features': 3,
 'min_samples_leaf': 3,
 'min_samples_split': 5,
 'n_estimators': 600}

In [25]:
best_grid_final = grid_search_final.best_estimator_
y_pred = best_grid_final.predict(X_test_scaled)

In [26]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[5007, 1897],
       [1340, 5443]], dtype=int64)

In [27]:
# Calculate the Balanced Accuracy Score
balanced_accuracy_score(y_test, y_pred)

0.7638395222088348

In [28]:
# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.73      0.76      6904
           1       0.74      0.80      0.77      6783

    accuracy                           0.76     13687
   macro avg       0.77      0.76      0.76     13687
weighted avg       0.77      0.76      0.76     13687

