In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE
from collections import Counter

# Read data
medical_df = pd.read_csv('Resources/Cleaned_Dataset/clean_stroke.csv')
print(medical_df.shape)
medical_df.head(10)

(5109, 12)


Unnamed: 0,ID#,Gender,Age,Hypertension,Heart_Disease,Ever_Married,Work_Type,Residence_Type,Avg_Glucose_Lvl,BMI,Smoker,Stroke
0,1,Male,67,0,1,Yes,Private,Urban,228.69,36.6,Former,1
1,2,Female,61,0,0,Yes,Self-employed,Rural,202.21,,Never,1
2,3,Male,80,0,1,Yes,Private,Rural,105.92,32.5,Never,1
3,4,Female,49,0,0,Yes,Private,Urban,171.23,34.4,Current,1
4,5,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,Never,1
5,6,Male,81,0,0,Yes,Private,Urban,186.21,29.0,Former,1
6,7,Male,74,1,1,Yes,Private,Rural,70.09,27.4,Never,1
7,8,Female,69,0,0,No,Private,Urban,94.39,22.8,Never,1
8,9,Female,59,0,0,Yes,Private,Rural,76.15,,Unknown,1
9,10,Female,78,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1


In [2]:
# Get data info
medical_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5109 entries, 0 to 5108
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID#              5109 non-null   int64  
 1   Gender           5109 non-null   object 
 2   Age              5109 non-null   int64  
 3   Hypertension     5109 non-null   int64  
 4   Heart_Disease    5109 non-null   int64  
 5   Ever_Married     5109 non-null   object 
 6   Work_Type        5109 non-null   object 
 7   Residence_Type   5109 non-null   object 
 8   Avg_Glucose_Lvl  5109 non-null   float64
 9   BMI              4908 non-null   float64
 10  Smoker           5109 non-null   object 
 11  Stroke           5109 non-null   int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 479.1+ KB


In [3]:
# Check missing data
medical_df.isnull().sum()

ID#                  0
Gender               0
Age                  0
Hypertension         0
Heart_Disease        0
Ever_Married         0
Work_Type            0
Residence_Type       0
Avg_Glucose_Lvl      0
BMI                201
Smoker               0
Stroke               0
dtype: int64

In [4]:
# Check unique value counts
medical_df.nunique()

ID#                5109
Gender                2
Age                  83
Hypertension          2
Heart_Disease         2
Ever_Married          2
Work_Type             5
Residence_Type        2
Avg_Glucose_Lvl    3978
BMI                 418
Smoker                4
Stroke                2
dtype: int64

In [5]:
# Drop ID# column
medical_df.drop(columns=['ID#'], inplace=True)
medical_df.head()

Unnamed: 0,Gender,Age,Hypertension,Heart_Disease,Ever_Married,Work_Type,Residence_Type,Avg_Glucose_Lvl,BMI,Smoker,Stroke
0,Male,67,0,1,Yes,Private,Urban,228.69,36.6,Former,1
1,Female,61,0,0,Yes,Self-employed,Rural,202.21,,Never,1
2,Male,80,0,1,Yes,Private,Rural,105.92,32.5,Never,1
3,Female,49,0,0,Yes,Private,Urban,171.23,34.4,Current,1
4,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,Never,1


In [6]:
# Stroke value counts
medical_df['Stroke'].value_counts()

0    4860
1     249
Name: Stroke, dtype: int64

In [7]:
# Get a list of categorical columns
categorical_columns = medical_df.dtypes[medical_df.dtypes=='object'].index.tolist()
categorical_columns

['Gender', 'Ever_Married', 'Work_Type', 'Residence_Type', 'Smoker']

In [8]:
# Get value counts for categorical columns
for i in range(len(categorical_columns)):
    print(medical_df[categorical_columns[i]].value_counts())

Female    2994
Male      2115
Name: Gender, dtype: int64
Yes    3353
No     1756
Name: Ever_Married, dtype: int64
Private          2924
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: Work_Type, dtype: int64
Urban    2596
Rural    2513
Name: Residence_Type, dtype: int64
Never      1892
Unknown    1544
Former      884
Current     789
Name: Smoker, dtype: int64


In [9]:
# Create OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit & transform OneHotEncoder using categorical columns
encode_df = pd.DataFrame(enc.fit_transform(medical_df[categorical_columns]))

# Add column names
encode_df.columns = enc.get_feature_names(categorical_columns)
print(encode_df.shape)
encode_df.head(10)

(5109, 15)




Unnamed: 0,Gender_Female,Gender_Male,Ever_Married_No,Ever_Married_Yes,Work_Type_Govt_job,Work_Type_Never_worked,Work_Type_Private,Work_Type_Self-employed,Work_Type_children,Residence_Type_Rural,Residence_Type_Urban,Smoker_Current,Smoker_Former,Smoker_Never,Smoker_Unknown
0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
6,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
7,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
8,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
9,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [10]:
# Get encode_df info
encode_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5109 entries, 0 to 5108
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Gender_Female            5109 non-null   float64
 1   Gender_Male              5109 non-null   float64
 2   Ever_Married_No          5109 non-null   float64
 3   Ever_Married_Yes         5109 non-null   float64
 4   Work_Type_Govt_job       5109 non-null   float64
 5   Work_Type_Never_worked   5109 non-null   float64
 6   Work_Type_Private        5109 non-null   float64
 7   Work_Type_Self-employed  5109 non-null   float64
 8   Work_Type_children       5109 non-null   float64
 9   Residence_Type_Rural     5109 non-null   float64
 10  Residence_Type_Urban     5109 non-null   float64
 11  Smoker_Current           5109 non-null   float64
 12  Smoker_Former            5109 non-null   float64
 13  Smoker_Never             5109 non-null   float64
 14  Smoker_Unknown          

In [11]:
# Drop redundant columns
encode_df.drop(columns=['Gender_Female', 'Ever_Married_No', 'Residence_Type_Rural'], inplace=True)
encode_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5109 entries, 0 to 5108
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Gender_Male              5109 non-null   float64
 1   Ever_Married_Yes         5109 non-null   float64
 2   Work_Type_Govt_job       5109 non-null   float64
 3   Work_Type_Never_worked   5109 non-null   float64
 4   Work_Type_Private        5109 non-null   float64
 5   Work_Type_Self-employed  5109 non-null   float64
 6   Work_Type_children       5109 non-null   float64
 7   Residence_Type_Urban     5109 non-null   float64
 8   Smoker_Current           5109 non-null   float64
 9   Smoker_Former            5109 non-null   float64
 10  Smoker_Never             5109 non-null   float64
 11  Smoker_Unknown           5109 non-null   float64
dtypes: float64(12)
memory usage: 479.1 KB


In [12]:
# Merge encoded df with medical df
medical_df = medical_df.merge(encode_df, left_index=True, right_index=True).drop(categorical_columns, axis=1)
print(medical_df.shape)
medical_df.head(10)

(5109, 18)


Unnamed: 0,Age,Hypertension,Heart_Disease,Avg_Glucose_Lvl,BMI,Stroke,Gender_Male,Ever_Married_Yes,Work_Type_Govt_job,Work_Type_Never_worked,Work_Type_Private,Work_Type_Self-employed,Work_Type_children,Residence_Type_Urban,Smoker_Current,Smoker_Former,Smoker_Never,Smoker_Unknown
0,67,0,1,228.69,36.6,1,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,61,0,0,202.21,,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,80,0,1,105.92,32.5,1,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,49,0,0,171.23,34.4,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,79,1,0,174.12,24.0,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5,81,0,0,186.21,29.0,1,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
6,74,1,1,70.09,27.4,1,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,69,0,0,94.39,22.8,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
8,59,0,0,76.15,,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,78,0,0,58.57,24.2,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [13]:
# Create SimpleImputer instance to replace missing BMI feature values with median BMI
imputer = SimpleImputer(strategy='median')

# Fit SimpleImputer & transform data
med_transformed = imputer.fit_transform(medical_df)

In [14]:
# Add SimpleImputer outcome to dataframe
med_df_transformed = pd.DataFrame(med_transformed, columns=medical_df.columns)
print(med_df_transformed.shape)
med_df_transformed.head(10)

(5109, 18)


Unnamed: 0,Age,Hypertension,Heart_Disease,Avg_Glucose_Lvl,BMI,Stroke,Gender_Male,Ever_Married_Yes,Work_Type_Govt_job,Work_Type_Never_worked,Work_Type_Private,Work_Type_Self-employed,Work_Type_children,Residence_Type_Urban,Smoker_Current,Smoker_Former,Smoker_Never,Smoker_Unknown
0,67.0,0.0,1.0,228.69,36.6,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,61.0,0.0,0.0,202.21,28.1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,80.0,0.0,1.0,105.92,32.5,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,49.0,0.0,0.0,171.23,34.4,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,79.0,1.0,0.0,174.12,24.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5,81.0,0.0,0.0,186.21,29.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
6,74.0,1.0,1.0,70.09,27.4,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,69.0,0.0,0.0,94.39,22.8,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
8,59.0,0.0,0.0,76.15,28.1,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,78.0,0.0,0.0,58.57,24.2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [15]:
med_df_transformed.isnull().sum()

Age                        0
Hypertension               0
Heart_Disease              0
Avg_Glucose_Lvl            0
BMI                        0
Stroke                     0
Gender_Male                0
Ever_Married_Yes           0
Work_Type_Govt_job         0
Work_Type_Never_worked     0
Work_Type_Private          0
Work_Type_Self-employed    0
Work_Type_children         0
Residence_Type_Urban       0
Smoker_Current             0
Smoker_Former              0
Smoker_Never               0
Smoker_Unknown             0
dtype: int64

In [16]:
# Create feature & target datasets
X = med_df_transformed.drop(columns=['Stroke'])
y = med_df_transformed['Stroke']
print(X.shape)
print(y.shape)

(5109, 17)
(5109,)


In [17]:
# Split into training & testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2, stratify=y)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4087, 17)
(1022, 17)
(4087,)
(1022,)


In [18]:
# Create StandardScaler instance
scaler = StandardScaler()

# Fit & transform
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

SMOTE 

In [19]:
# Run SMOTE oversampling instance
X_train_resampled, y_train_resampled = SMOTE().fit_resample(X_train_scaled, y_train)

# Check new stroke training distribution
print(Counter(y_train_resampled))

Counter({0.0: 3888, 1.0: 3888})


In [20]:
# Create RandomForestClassifier instance
rf_model = RandomForestClassifier(n_estimators=100, bootstrap=False, max_depth=13, min_samples_split=2, random_state=2)

# Fit the model
rf_model = rf_model.fit(X_train_resampled, y_train_resampled)

# Evaluate the model
y_pred1 = rf_model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred1)
cm_df = pd.DataFrame(cm, index=['Stroke-', 'Stroke+'], columns=['Predicted-', 'Predicted+'])
print(classification_report(y_test, y_pred1))
print(f' Accuracy: {accuracy_score(y_test, y_pred1):.3f}; Precision: {precision_score(y_test, y_pred1):.3f}; Recall: {recall_score(y_test, y_pred1):.3f}')
cm_df

              precision    recall  f1-score   support

         0.0       0.95      0.91      0.93       972
         1.0       0.07      0.14      0.10        50

    accuracy                           0.87      1022
   macro avg       0.51      0.52      0.51      1022
weighted avg       0.91      0.87      0.89      1022

 Accuracy: 0.872; Precision: 0.074; Recall: 0.140


Unnamed: 0,Predicted-,Predicted+
Stroke-,884,88
Stroke+,43,7


In [21]:
# Create AdaBoostClassifier instance
ada_model = AdaBoostClassifier(n_estimators=128, random_state=2)
# Fit the model
ada_model = ada_model.fit(X_train_resampled, y_train_resampled)

# Evaluate the model
y_pred2 = ada_model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred2)
cm_df = pd.DataFrame(cm, index=['Stroke-', 'Stroke+'], columns=['Predicted-', 'Predicted+'])
print(classification_report(y_test, y_pred2))
print(f' Accuracy: {accuracy_score(y_test, y_pred2):.3f}; Precision: {precision_score(y_test, y_pred2):.3f}; Recall: {recall_score(y_test, y_pred2):.3f}')
cm_df

              precision    recall  f1-score   support

         0.0       0.97      0.84      0.90       972
         1.0       0.12      0.44      0.19        50

    accuracy                           0.82      1022
   macro avg       0.55      0.64      0.55      1022
weighted avg       0.93      0.82      0.86      1022

 Accuracy: 0.820; Precision: 0.124; Recall: 0.440


Unnamed: 0,Predicted-,Predicted+
Stroke-,816,156
Stroke+,28,22
