In [1]:
# Initial imports.
import pandas as pd
import numpy as np
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### Load in Data

In [2]:
# Importing the input dataset
heart_df_undersampled = pd.read_csv('Resources/heart_undersampled.csv')

### Prepare Data

In [4]:
# Removing HeartDisease target from features data
y = heart_df_undersampled.loc[:,"HeartDisease"]
X = heart_df_undersampled.drop(columns="HeartDisease")

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [5]:
print('Training Features Shape:', X_train.shape)
print('Training Target Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Target Shape:', y_test.shape)

Training Features Shape: (41059, 53)
Training Target Shape: (41059,)
Testing Features Shape: (13687, 53)
Testing Target Shape: (13687,)


### Determine Random Forest Parameters

In [7]:
# Create a random forest classifier.
rfc = RandomForestClassifier(random_state=78)

from pprint import pprint

# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rfc.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 78,
 'verbose': 0,
 'warm_start': False}


### Random Search with Cross Validation

In [9]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [12]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rfc = RandomForestClassifier(random_state=78)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rfc, param_distributions=random_grid,
                              n_iter = 100, scoring='neg_mean_absolute_error', 
                              cv = 3, verbose=2, random_state=42, n_jobs=-1,
                              return_train_score=True)

# Fit the random search model
rf_random.fit(X_train, y_train);

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [13]:
rf_random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 20,
 'bootstrap': True}

In [14]:
rf_random.cv_results_

{'mean_fit_time': array([ 21.18251276,  59.41997059,  47.30510426, 133.13164743,
         45.74760032,  52.25813874,  70.87713281,  46.69548909,
         85.5533696 ,  63.4858768 ,  17.43487978,  63.34821002,
         71.27513917,  56.75468961,  52.940509  , 118.3170952 ,
         45.20006378,  50.16053724, 127.73993889,  32.22190213,
        133.07848382, 131.55899986,  66.51805639,  91.38278627,
        109.51181149,  34.4300077 , 121.86353135,  44.927224  ,
        131.52040124,  56.6737268 ,  70.03906759,  57.00244085,
         62.18024437,  34.97159576,  83.64268279,  50.28215003,
         55.40200766,  46.71194895,  40.62841042,  50.57009959,
          5.47618628,  28.21296271,  35.84320656,  16.90555747,
         13.78727365,  59.85209139, 104.20705279,  25.70931371,
         76.12739428,  43.73157303, 126.12110519, 140.45612741,
         64.64759668,  63.02729185, 101.87875923,  38.97791608,
         48.72105098, 103.10820556,  94.56388625,  69.33224034,
         12.18965872,  

### Evaluate the model

In [18]:
# Evaluate best random search model
best_random = rf_random.best_estimator_
y_pred = best_random.predict(X_test)

In [19]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[5051, 1853],
       [1375, 5408]], dtype=int64)

In [21]:
# Calculate the Balanced Accuracy Score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.7644461013654715

In [22]:
# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.73      0.76      6904
           1       0.74      0.80      0.77      6783

    accuracy                           0.76     13687
   macro avg       0.77      0.76      0.76     13687
weighted avg       0.77      0.76      0.76     13687



### Grid Search