In [36]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Fill in NaN and getting overview of data

In [2]:
df = pd.read_csv('Downloads/stroke.csv')
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
df = df[df['gender']!='Other']

In [134]:
df.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [135]:
df.value_counts('gender')

gender
Female    2994
Male      2115
Name: count, dtype: int64

#### Here we see that smoking_status has a lot of rows with Unknown, for the purpose of simplicity I will treat Unknown as a category on its own when it's being one hot encoded

In [108]:
df.value_counts('smoking_status')

smoking_status
never smoked       1892
Unknown            1544
formerly smoked     884
smokes              789
Name: count, dtype: int64

In [4]:
df.drop('id',axis=1,inplace=True)

In [5]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


# Splitting the data

In [6]:
X = df.drop('stroke', axis = 1)
y = df.stroke

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train['bmi'].fillna(X_train['bmi'].mean(),inplace=True)
X_test['bmi'].fillna(X_test['bmi'].mean(),inplace=True)

In [13]:
X_test.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
dtype: int64

# Pipeline creation - Preprocessing

In [75]:
column_trans = make_column_transformer(
    (OneHotEncoder(),['work_type','smoking_status']),
    (OrdinalEncoder(),['gender','ever_married','Residence_type']),
    (StandardScaler(), ['age','avg_glucose_level','bmi']),
    remainder='passthrough')

In [76]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
smote = SMOTE()

In [100]:
logreg = LogisticRegression(class_weight='balanced')
# pipe = make_pipeline(column_trans, smote, logreg)
pipe = ImbPipeline(steps=[('preprocessor', column_trans), 
                               ('smote', smote), 
                               ('classifier', logreg)])

In [78]:
svm = SVC(class_weight='balanced')
pipe_svm = ImbPipeline(steps=[('preprocessor', column_trans), 
                               ('smote', smote), 
                               ('classifier', svm)])

In [79]:
rfc = RandomForestClassifier(class_weight='balanced')
pipe_rfc = ImbPipeline(steps=[('preprocessor', column_trans), 
                               ('smote', smote), 
                               ('classifier', rfc)])

In [101]:
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print("Ecaluation Metrics for Logistic Regression")
print("Accuracy:", accuracy_score(y_test,y_pred))
print("F1:", f1_score(y_test,y_pred))
print("Recall:", recall_score(y_test,y_pred))
print("Precision:", precision_score(y_test,y_pred))

Ecaluation Metrics for Logistic Regression
Accuracy: 0.738747553816047
F1: 0.2764227642276423
Recall: 0.8225806451612904
Precision: 0.16612377850162866


In [106]:
param_grid_log_reg = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2'],  # 'l1' requires solver='liblinear'
    'classifier__solver': ['liblinear', 'saga'],
    'classifier__max_iter': [3000]
}

# Initialize GridSearchCV
grid_search_log_reg = GridSearchCV(pipe, param_grid_log_reg, scoring='f1', cv=5)

# Fit GridSearchCV
grid_search_log_reg.fit(X_train, y_train)
best_pipe_log = grid_search.best_estimator_

best_pipe_log.fit(X_train, y_train)
y_pred = best_pipe_log.predict(X_test)

print("Ecaluation Metrics for Logistic Regression")
print("Accuracy:", accuracy_score(y_test,y_pred))
print("F1:", f1_score(y_test,y_pred))
print("Recall:", recall_score(y_test,y_pred))
print("Precision:", precision_score(y_test,y_pred))

Ecaluation Metrics for Logistic Regression
Accuracy: 0.8639921722113503
F1: 0.23204419889502761
Recall: 0.3387096774193548
Precision: 0.17647058823529413


In [86]:
pipe_svm.fit(X_train, y_train)
y_pred = pipe_svm.predict(X_test)

print("Ecaluation Metrics for SVC")
print("Accuracy:", accuracy_score(y_test,y_pred))
print("F1:", f1_score(y_test,y_pred))
print("Recall:", recall_score(y_test,y_pred))
print("Precision:", precision_score(y_test,y_pred))

Ecaluation Metrics for SVC
Accuracy: 0.8140900195694716
F1: 0.24603174603174602
Recall: 0.5
Precision: 0.1631578947368421


In [92]:
param_grid_svc = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'classifier__gamma': ['scale', 'auto']  # Only relevant for 'rbf', 'poly', and 'sigmoid' kernels
}

# Initialize GridSearchCV
grid_search_svc = GridSearchCV(pipe_svm, param_grid_svc, scoring='f1', cv=5)

# Fit GridSearchCV
grid_search_svc.fit(X_train, y_train)
best_pipe_svc = grid_search.best_estimator_

print("Ecaluation Metrics for Logistic Regression")
print("Accuracy:", accuracy_score(y_test,y_pred))
print("F1:", f1_score(y_test,y_pred))
print("Recall:", recall_score(y_test,y_pred))
print("Precision:", precision_score(y_test,y_pred))

Ecaluation Metrics for Logistic Regression
Accuracy: 0.8620352250489237
F1: 0.23783783783783785
Recall: 0.3548387096774194
Precision: 0.17886178861788618


In [84]:
pipe_rfc.fit(X_train, y_train)
y_pred = pipe_rfc.predict(X_test)

print("Ecaluation Metrics for RFC")
print("Accuracy:", accuracy_score(y_test,y_pred))
print("F1:", f1_score(y_test,y_pred,average='weighted'))
print("Recall:", recall_score(y_test,y_pred,average='weighted'))
print("Precision:", precision_score(y_test,y_pred,average='weighted'))

Ecaluation Metrics for RFC
Accuracy: 0.9217221135029354
F1: 0.9098330989907789
Recall: 0.9217221135029354
Precision: 0.9004199282452707


### With hyperparameter tuning for RFC

In [107]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for RandomForest
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(pipe_rfc, param_grid, scoring='f1', cv=5)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)
best_pipe_rfc = grid_search.best_estimator_

best_pipe_rfc.fit(X_train, y_train)
y_pred = best_pipe_rfc.predict(X_test)

print("Ecaluation Metrics for RFC")
print("Accuracy:", accuracy_score(y_test,y_pred))
print("F1:", f1_score(y_test,y_pred,average='weighted'))
print("Recall:", recall_score(y_test,y_pred,average='weighted'))
print("Precision:", precision_score(y_test,y_pred,average='weighted'))

Ecaluation Metrics for RFC
Accuracy: 0.8620352250489237
F1: 0.882088473125941
Recall: 0.8620352250489237
Precision: 0.9071189196731178


In [83]:
import numpy as np
print("Test class distribution:", dict(zip(*np.unique(y_test, return_counts=True))))
print("Predicted class distribution:", dict(zip(*np.unique(y_pred, return_counts=True))))

Test class distribution: {0: 960, 1: 62}
Predicted class distribution: {0: 992, 1: 30}


In [148]:
pipe

In [110]:
y_prob = best_pipe_rfc.predict_proba(X_test)
positive_prob = y_prob[:,1]
print(X_test[:5],positive_prob[:5]*100)

      gender   age  hypertension  heart_disease ever_married      work_type  \
4688    Male  31.0             0              0           No  Self-employed   
4478    Male  40.0             0              0          Yes  Self-employed   
3521    Male  52.0             0              0          Yes        Private   
4355  Female  79.0             1              0          Yes  Self-employed   
3826  Female  75.0             0              0          Yes       Govt_job   

     Residence_type  avg_glucose_level   bmi smoking_status  
4688          Rural              64.85  23.0        Unknown  
4478          Rural              65.29  28.3   never smoked  
3521          Rural             111.04  30.0   never smoked  
4355          Rural              76.64  19.5   never smoked  
3826          Rural              94.77  27.2   never smoked   [ 1.81989964  5.60666766 24.2868313  56.50919841 39.81096457]


In [111]:
import joblib

joblib.dump(best_pipe_rfc,'best_pipe_rfc.pkl')

['best_pipe_rfc.pkl']