In [96]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
!pip install xgboost
from xgboost import XGBClassifier
import random



In [97]:
data = pd.read_csv("healthcare-dataset-stroke-data (1).csv")
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [98]:
data.tail()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.2,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0
5109,44679,Female,44.0,0,0,Yes,Govt_job,Urban,85.28,26.2,Unknown,0


In [99]:
data.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [100]:
data.shape

(5110, 12)

In [101]:
median_bmi = data['bmi'].median()
data['bmi'] = data['bmi'].fillna(median_bmi)

## Now I have built two functions for encoding and scaling the data

In [103]:
from sklearn.preprocessing import LabelEncoder
cat_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
def label_encoder(data):
    for cat in cat_cols:
        le = LabelEncoder()
        data[cat] = le.fit_transform(data[cat].astype(str))
    return data


There is huge class-imabalnce in out data, as most of target variable or the stroke label had 0's or negative labels. Thus, I used an undersampling technique to create a balance between the majority class and the minority class for a better understanding of the data. 

In [104]:
# Separate the majority and minority class examples
majority_class = data[data.stroke == 0]
minority_class = data[data.stroke == 1]

# Determine the number of examples in the minority class
num_minority_examples = len(minority_class)

# Randomly select examples from the majority class to keep
undersampled_majority = majority_class.sample(n=num_minority_examples, random_state=42)

# Combine the minority and undersampled majority class examples
undersampled_data = pd.concat([undersampled_majority, minority_class])

# Shuffle the combined dataset
undersampled_data = undersampled_data.sample(frac=1).reset_index(drop=True)

In [105]:
from sklearn.preprocessing import StandardScaler
num_cols = ['age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi']
def scaler(data):
    scaler = StandardScaler()
    data[num_cols] = scaler.fit_transform(data[num_cols])
    return data

## We are splitting the dataset into train, validate and test sets, for better functioning and evaluation of the model

In [106]:
train_data, test_data = train_test_split(undersampled_data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(undersampled_data, test_size=0.2, random_state=42)
# Separate target variable from features for each dataset
train_target = train_data['stroke']
train_features = train_data.drop(['id', 'stroke'], axis=1)

val_target = val_data['stroke']
val_features = val_data.drop(['id', 'stroke'], axis=1)

test_target = test_data['stroke']
test_features = test_data.drop(['id', 'stroke'], axis=1)

In [107]:
train_features = label_encoder(train_features)
train_features = scaler(train_features)
train_features.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
56,0,-0.111995,2.202939,-0.383372,0.469956,-1.613813,0.975182,1.115826,-0.253952,2
227,1,1.057771,-0.453939,2.608433,0.469956,1.172727,0.975182,2.239834,-0.196985,3
280,1,-1.379242,-0.453939,-0.383372,0.469956,-0.220543,0.975182,-0.156378,-0.225468,0
485,0,1.106512,2.202939,-0.383372,-2.127858,-1.613813,0.975182,-0.368114,-1.022999,1
220,1,-0.599397,-0.453939,-0.383372,0.469956,-1.613813,0.975182,-0.426462,-0.496059,1


In [108]:
val_features = label_encoder(val_features)
vaL_features = scaler(val_features)
val_features.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
487,1,-1.604266,-0.531085,-0.314485,-1.527525,1.661265,-1.199593,-0.11151,-1.103112,0
73,0,-0.06436,-0.531085,-0.314485,0.654654,-0.226536,0.833616,0.784941,-0.105698,2
231,0,0.185355,-0.531085,-0.314485,-1.527525,0.717364,0.833616,-0.990985,1.196482,2
175,1,1.225832,-0.531085,-0.314485,0.654654,-0.226536,-1.199593,-0.651461,0.088243,1
237,1,1.059355,-0.531085,-0.314485,0.654654,0.717364,0.833616,1.755138,-0.285787,0


In [109]:
test_features = label_encoder(test_features)
test_features = scaler(test_features)
test_features.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
487,1,-1.604266,-0.531085,-0.314485,-1.527525,1.661265,-1.199593,-0.11151,-1.103112,0
73,0,-0.06436,-0.531085,-0.314485,0.654654,-0.226536,0.833616,0.784941,-0.105698,2
231,0,0.185355,-0.531085,-0.314485,-1.527525,0.717364,0.833616,-0.990985,1.196482,2
175,1,1.225832,-0.531085,-0.314485,0.654654,-0.226536,-1.199593,-0.651461,0.088243,1
237,1,1.059355,-0.531085,-0.314485,0.654654,0.717364,0.833616,1.755138,-0.285787,0


## Creating the evaluation metrics function for calculating the accuracy, precision, recall, F1 and roc_auc_curve

In [110]:
def evaluate(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)
    return acc, prec, rec, f1, roc_auc

## Training the ANN model to check for it's predictions

In [111]:
ann = MLPClassifier(random_state=42)
ann.fit(train_features, train_target)
val_preds_ann = ann.predict(val_features)
val_acc_ann, val_prec_ann, val_rec_ann, val_f1_ann, val_roc_auc_ann = evaluate(val_target, val_preds_ann)
print(f"ANN: Accuracy={val_acc_ann:.4f}, Precision={val_prec_ann:.4f}, Recall={val_rec_ann:.4f}, F1-score={val_f1_ann:.4f}, ROC AUC={val_roc_auc_ann:.4f}")
test_preds_ann = ann.predict(test_features)
test_acc_ann, test_prec_ann, test_rec_ann, test_f1_ann, test_roc_auc_ann = evaluate(test_target, test_preds_ann)
print(f"ANN: Accuracy={test_acc_ann:.4f}, Precision={test_prec_ann:.4f}, Recall={test_rec_ann:.4f}, F1-score={test_f1_ann:.4f}, ROC AUC={test_roc_auc_ann:.4f}")

ANN: Accuracy=0.7300, Precision=0.7451, Recall=0.7308, F1-score=0.7379, ROC AUC=0.7300
ANN: Accuracy=0.7300, Precision=0.7451, Recall=0.7308, F1-score=0.7379, ROC AUC=0.7300




The Model gets an accuracy of 73%, which is not bad. The Precision and Recall score are also good, signifies that in the predictions the FP cases are 74.5%, but where as the recall which stands for FN is 73%. Thus in the healthcare industry, the weightage and importance of less FN is much more as the model should not predict False Negative cases. Thus it means that 73% times the model predicts accurate stroke cases. 

## Hyper-parameter Tuning of the XGB Classifier using GridSearch for better and more accurate predictions

In [112]:
param_grid = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 1],
    'subsample': [0.5, 0.75, 1],
    'colsample_bytree': [0.5, 0.75, 1],
    'scale_pos_weight': [6]
}

xgb_model = XGBClassifier()

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='f1', cv=5, n_jobs=-1)

grid_search.fit(train_features, train_target)

print(grid_search.best_params_)

{'colsample_bytree': 0.75, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'scale_pos_weight': 6, 'subsample': 0.5}


In [115]:
xgb_model = XGBClassifier(colsample_bytree= 0.75, learning_rate = 0.01, max_depth = 5, n_estimators = 100, scale_pos_weight = 6, subsample = 1)
xgb_model.fit(train_features, train_target)

val_preds_xgb = xgb_model.predict(val_features)
val_acc_xgb, val_prec_xgb, val_rec_xgb, val_f1_xgb, val_roc_auc_xgb = evaluate(val_target, val_preds_xgb)
print(f"XGBC: Accuracy={val_acc_xgb:.4f}, Precision={val_prec_xgb:.4f}, Recall={val_rec_xgb:.4f}, F1-score={val_f1_xgb:.4f}, ROC AUC={val_roc_auc_xgb:.4f}")

test_preds_xgb = xgb_model.predict(test_features)
test_acc_xgb, test_prec_xgb, test_rec_xgb, test_f1_xgb, test_roc_auc_xgb = evaluate(test_target, test_preds_xgb)
print(f"XGBC: Accuracy={test_acc_xgb:.4f}, Precision={test_prec_xgb:.4f}, Recall={test_rec_xgb:.4f}, F1-score={test_f1_xgb:.4f}, ROC AUC={test_roc_auc_xgb:.4f}")

XGBC: Accuracy=0.7300, Precision=0.6667, Recall=0.9615, F1-score=0.7874, ROC AUC=0.7204
XGBC: Accuracy=0.7300, Precision=0.6667, Recall=0.9615, F1-score=0.7874, ROC AUC=0.7204


In [116]:
from sklearn.model_selection import cross_val_score

xgb_model = XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.01, subsample=0.5, colsample_bytree=1, scale_pos_weight=6)

# Perform 5-fold cross-validation on the training set
scores = cross_val_score(xgb_model, train_features, train_target, cv=5, scoring='f1_macro')

print(f"Cross-validation F1-score: {scores.mean():.4f} +/- {scores.std():.4f}")


Cross-validation F1-score: 0.7042 +/- 0.0344


## Final Model Selection
Thus this XGB Classifier provides pretty much accurate results in terms of FP and FN. Two most important criterias here are the following:
1. Number of patients that the model predicts might experience a stroke but in reality does not(False Positive) - In this case 	    if the patient does not suffer a stroke, but when the model predicts that he/she might, then there is a chance a that the     	 patient becomes much more skeptical about his/her health, And thus is reality decreases the chances of experiencing a stroke.
   the FP percentage is 66.6%, it means that the model predict that patient might experience a stroke 66.67 times but in reality    he / she might not.
2. Second case, is the one of False Negatives. That is in this case, the patients that the model predicts negative but in   	    reality might experience a stroke. Thus the model, works exceptionally well, in this case giving an accurate results of 96%.    Which means that 96% of times the model does not predicts a False Negative. 96% times if the case in reality is negative, the    model also predicts negative. In an model prediction or detection, the cases of FN should be extremely minimal. Only then the    model is viable for use, Because FP cases or situations have less harmful adversities than FN, as FN can lead stoppage of        medical treatment or early detection and reduction of the medical issue. 