In [1]:
# Standard Libs
import pandas as pd
import numpy as np

# Plot
import matplotlib.pyplot as plt
import seaborn as sns

# Statistics Tools
from statsmodels.graphics.mosaicplot import mosaic

# Random Forest
from sklearn.ensemble import RandomForestClassifier as RFC

In [2]:
# Importing Data from CSV file
df = pd.read_csv('healthcare-dataset-stroke-data.csv')
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
# Independent Variables
X = df.drop(['id','stroke'],axis=1)
X.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked


In [4]:
# Target Variable
y = df['stroke']
y.shape

(5110,)

In [8]:
# Generating gender dummies 
gender_dummies = pd.get_dummies(X['gender'])
gender_dummies = gender_dummies.drop(['Other'],axis=1)
gender_dummies.head()

Unnamed: 0,Female,Male
0,0,1
1,1,0
2,0,1
3,1,0
4,1,0


In [10]:
# Generating ever_married dummies
ever_married_dummies = pd.get_dummies(X['ever_married'])
ever_married_dummies.head()

Unnamed: 0,No,Yes
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1


In [12]:
# Generating work_type dummies
work_type_dummies = pd.get_dummies(X['work_type'])
work_type_dummies = work_type_dummies.drop(['Never_worked'],axis=1)
work_type_dummies.head()

Unnamed: 0,Govt_job,Private,Self-employed,children
0,0,1,0,0
1,0,0,1,0
2,0,1,0,0
3,0,1,0,0
4,0,0,1,0


In [14]:
# Generating Residence_type dummies
residence_type_dummies = pd.get_dummies(X['Residence_type'])
residence_type_dummies.head()

Unnamed: 0,Rural,Urban
0,0,1
1,1,0
2,1,0
3,0,1
4,1,0


In [15]:
# Generating smoking_status dummies
smoking_status_dummies = pd.get_dummies(X['smoking_status'])
smoking_status_dummies.head()

Unnamed: 0,Unknown,formerly smoked,never smoked,smokes
0,0,1,0,0
1,0,0,1,0
2,0,0,1,0
3,0,0,0,1
4,0,0,1,0


In [18]:
# Checking bmi NA values 
X['bmi'].isna().value_counts()

False    4909
True      201
Name: bmi, dtype: int64

In [21]:
# Replacing bmi NA values with bmi mean
bmi_mean = X['bmi'].mean()
X['bmi'].fillna(value=bmi_mean, inplace=True)

In [22]:
# Checking bmi NA values 
X['bmi'].isna().value_counts()

False    5110
Name: bmi, dtype: int64

In [23]:
# Combining Dummies
X_new = pd.concat([gender_dummies,ever_married_dummies,work_type_dummies,residence_type_dummies,smoking_status_dummies],axis=1)
X_new.head()

Unnamed: 0,Female,Male,No,Yes,Govt_job,Private,Self-employed,children,Rural,Urban,Unknown,formerly smoked,never smoked,smokes
0,0,1,0,1,0,1,0,0,0,1,0,1,0,0
1,1,0,0,1,0,0,1,0,1,0,0,0,1,0
2,0,1,0,1,0,1,0,0,1,0,0,0,1,0
3,1,0,0,1,0,1,0,0,0,1,0,0,0,1
4,1,0,0,1,0,0,1,0,1,0,0,0,1,0


In [24]:
# Merging DataFrames
X_new = pd.concat([X.drop(['gender','ever_married','work_type','Residence_type','smoking_status'],axis=1),X_new],axis=1)
X_new.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,Female,Male,No,Yes,Govt_job,Private,Self-employed,children,Rural,Urban,Unknown,formerly smoked,never smoked,smokes
0,67.0,0,1,228.69,36.6,0,1,0,1,0,1,0,0,0,1,0,1,0,0
1,61.0,0,0,202.21,28.893237,1,0,0,1,0,0,1,0,1,0,0,0,1,0
2,80.0,0,1,105.92,32.5,0,1,0,1,0,1,0,0,1,0,0,0,1,0
3,49.0,0,0,171.23,34.4,1,0,0,1,0,1,0,0,0,1,0,0,0,1
4,79.0,1,0,174.12,24.0,1,0,0,1,0,0,1,0,1,0,0,0,1,0


In [25]:
# Splitting Data Into Training and Testing data
from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(X_new,y,test_size=0.1)

In [26]:
X_train.shape

(4599, 19)

In [27]:
X_test.shape

(511, 19)

In [28]:
y_train.shape

(4599,)

In [30]:
y_test.shape

(511,)

In [31]:
# Creating Random Forest Model
stroke_rf_model = RFC()

In [32]:
# Training Random Forest Model
stroke_rf_model.fit(X_train,y_train)

RandomForestClassifier()

In [33]:
# Scoring Random Forest Model
print("Stroke Random Forest Model Score : {:.4f}".format(stroke_rf_model.score(X_test,y_test)))

Stroke Random Forest Model Score : 0.9609


In [47]:
# Tuning Random Forest Model
scores = []
n_est = 10
depth = 5
for i in range(10):
    X_train, X_test, y_train, y_test = tts(X_new,y,test_size=0.1)
    stroke_rf_model = RFC(n_estimators=n_est,max_depth=depth)
    stroke_rf_model.fit(X_train,y_train)
    scores.append(stroke_rf_model.score(X_test,y_test))
    
print("Stroke Random Forest Mean Score : {:.4f}".format(np.average(scores)))

Stroke Random Forest Mean Score : 0.9532
