# Heart Attack Data Exploration - Using Random Forest and Hyperparameter Tuning using GridSearch

In [1]:
# Initial imports
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


## Loading and Preprocessing Heart Attack Data

In [2]:
# Loading data
file_path = Path("../Resources/heart_attack_prediction_dataset.csv")
df_heart = pd.read_csv(file_path)
df_heart.head()


Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0


In [3]:
#check for missing variables
df_heart.isnull().sum()

Patient ID                         0
Age                                0
Sex                                0
Cholesterol                        0
Blood Pressure                     0
Heart Rate                         0
Diabetes                           0
Family History                     0
Smoking                            0
Obesity                            0
Alcohol Consumption                0
Exercise Hours Per Week            0
Diet                               0
Previous Heart Problems            0
Medication Use                     0
Stress Level                       0
Sedentary Hours Per Day            0
Income                             0
BMI                                0
Triglycerides                      0
Physical Activity Days Per Week    0
Sleep Hours Per Day                0
Country                            0
Continent                          0
Hemisphere                         0
Heart Attack Risk                  0
dtype: int64

In [4]:
# Define target vector
y = df_heart["Heart Attack Risk"]
y[:5]

0    0
1    0
2    0
3    0
4    0
Name: Heart Attack Risk, dtype: int64

In [5]:
# Define features set
X = df_heart.copy()
X.drop(["Patient ID","Blood Pressure","Blood Pressure","Country","Continent","Hemisphere",
        "Smoking", "Diabetes", "Obesity", "Alcohol Consumption", "Previous Heart Problems", "Medication Use",
        "Family History", "Diet", "Sleep Hours Per Day", "Physical Activity Days Per Week", "Heart Attack Risk"], axis=1, inplace=True)
X.head()

Unnamed: 0,Age,Sex,Cholesterol,Heart Rate,Exercise Hours Per Week,Stress Level,Sedentary Hours Per Day,Income,BMI,Triglycerides
0,67,Male,208,72,4.168189,9,6.615001,261404,31.251233,286
1,21,Male,389,98,1.813242,1,4.963459,285768,27.194973,235
2,21,Female,324,72,2.078353,9,9.463426,235282,28.176571,587
3,84,Male,383,73,9.82813,9,7.648981,125640,36.464704,378
4,66,Male,318,93,5.804299,6,1.514821,160555,21.809144,231


In [6]:
# Sex Convert categorical column to binary (dummy) variables
df_dummies = pd.get_dummies(X['Sex'], prefix='Sex')
X.drop(["Sex"], axis=1, inplace=True)
# Concatenate the dummy variables with the original DataFrame
X = pd.concat([X, df_dummies], axis=1)
X.head()

Unnamed: 0,Age,Cholesterol,Heart Rate,Exercise Hours Per Week,Stress Level,Sedentary Hours Per Day,Income,BMI,Triglycerides,Sex_Female,Sex_Male
0,67,208,72,4.168189,9,6.615001,261404,31.251233,286,False,True
1,21,389,98,1.813242,1,4.963459,285768,27.194973,235,False,True
2,21,324,72,2.078353,9,9.463426,235282,28.176571,587,True,False
3,84,383,73,9.82813,9,7.648981,125640,36.464704,378,False,True
4,66,318,93,5.804299,6,1.514821,160555,21.809144,231,False,True


In [7]:
print(f'X : {X.shape}')

X : (8763, 11)


In [8]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=101)

In [9]:
print(f'X_train : {X_train.shape}')
print(f'y_train : {y_train.shape}')
print(f'X_test : {X_test.shape}')
print(f'y_test : {y_test.shape}')

X_train : (7010, 11)
y_train : (7010,)
X_test : (1753, 11)
y_test : (1753,)


## Build Random Forest Model with Hyperparameters

In [39]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop =80, num =10)]
# Number of features to consider in every split
max_features = ['log2', 'log2']
# Max number of levels in tree
max_depth = [2,5]
# Min number of samples required to split a node
min_samples_split = [2,5]
# Min number of samples required in each leaf node
min_samples_leaf = [1,2]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [40]:
# Create the random grid
param_grid = {'n_estimators' : n_estimators,
              'max_features' : max_features,
              'max_depth' : max_depth,
              'min_samples_split' : min_samples_split,
              'min_samples_leaf' : min_samples_leaf,
              'bootstrap' : bootstrap}
print(param_grid)

{'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72, 80], 'max_features': ['log2', 'log2'], 'max_depth': [2, 5], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'bootstrap': [True, False]}


In [41]:
rf_Model = RandomForestClassifier()

In [42]:
from sklearn.model_selection import GridSearchCV
rf_Grid = GridSearchCV(estimator = rf_Model, param_grid = param_grid, cv = 3, verbose=2, n_jobs = 4)

In [43]:
rf_Grid.fit(X_train, y_train)

Fitting 3 folds for each of 320 candidates, totalling 960 fits


In [45]:
rf_Grid.best_params_

{'bootstrap': True,
 'max_depth': 5,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 17}

## Model Evaluation

In [46]:
print (f'Train Accuracy - : {rf_Grid.score(X_train, y_train):.3f}')
print (f'Test Accuracy - : {rf_Grid.score(X_test, y_test):.3f}')

Train Accuracy - : 0.647
Test Accuracy - : 0.624
[CV] END bootstrap=True, max_depth=2, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=17; total time=   0.0s
[CV] END bootstrap=True, max_depth=2, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=25; total time=   0.1s
[CV] END bootstrap=True, max_depth=2, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=33; total time=   0.1s
[CV] END bootstrap=True, max_depth=2, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=48; total time=   0.1s
[CV] END bootstrap=True, max_depth=2, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=56; total time=   0.1s
[CV] END bootstrap=True, max_depth=2, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=64; total time=   0.1s
[CV] END bootstrap=True, max_depth=2, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=80; total time=   0.2s
[CV] END boot