<div style="width:100%;text-align: center;"> <img align=middle src="https://my.clevelandclinic.org/-/scassets/images/org/patient-experience/patient-stories/173-advanced-stroke-procedure-saves-patient-after-deep-brain-bleed/deep-brain-bleeds-new-2.gif?la=en" alt="Deep brain bleed animation" style="height:366px;margin-top:3rem;"> </div>

# <h1 style='background:#E64848; border:0; color:white'><center>🧠Brain Stroke Prediction</center></h1> 

# **<span style="color:#FEB139;">📰Get the Data</span>**

In [None]:
# Environment setup: silence every warning category for the rest of the session.
import os
import warnings

warnings.simplefilter("ignore")

In [None]:
#Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import mean_absolute_error, accuracy_score, roc_curve, auc
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

import imblearn
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler

#models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline

#metrics
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from numpy import mean
from scipy.stats import uniform
from scipy import interp
from sklearn.utils import class_weight

# **<span style="color:#FEB139;">📄About the Dataset</span>**

This dataset contains 11 columns:

> 1) gender: "Male", "Female" or "Other"

> 2) age: age of the patient

> 3) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension

> 4) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease

> 5) ever_married: "No" or "Yes"

> 6) work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"

> 7) Residence_type: "Rural" or "Urban"

> 8) avg_glucose_level: average glucose level in blood

> 9) bmi: body mass index

> 10) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*

> 11) stroke: 1 if the patient had a stroke or 0 if not

*Note: "Unknown" in smoking_status means that the information is unavailable for this patient

In [None]:
# Kaggle notebook boilerplate (environment defined by the kaggle/python Docker image:
# https://github.com/kaggle/docker-python).
# Input data files live in the read-only "../input/" directory; this cell prints the
# full path of every file found under /kaggle/input.

import os

input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# Up to 20GB can be written to the current directory (/kaggle/working/); it is preserved as output when a version is created with "Save & Run All".
# Temporary files can be written to /kaggle/temp/, but they are not saved outside of the current session.

In [None]:
# Load the brain-stroke CSV from the Kaggle input directory into a DataFrame.
csv_path = '/kaggle/input/brain-stroke-dataset/brain_stroke.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

In [None]:
# Flag, per column, whether at least one value is missing.
df.isna().any()

In [None]:
# Correlation heatmap of the numeric columns.
# At this point the categorical columns (gender, ever_married, work_type, ...) still
# hold strings. DataFrame.corr() raises on non-numeric columns in pandas >= 2.0
# (older versions silently dropped them), so select the numeric columns explicitly.
numeric_df = df.select_dtypes(include = 'number')

plt.figure(figsize = (15, 10))
sns.heatmap(numeric_df.corr(), annot = True, cmap = 'coolwarm')

In [None]:
# Pairwise scatter/distribution grid of the columns, coloured by gender.
sns.pairplot(df, hue = 'gender')


# **<span style="color:#FEB139;">📐Feature engineering and mapping</span>**

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column labels, for reference when encoding the categorical features below.
df.columns

## **<span style="color:#FEB139;">Encoded Features:</span>**

'Female':1,'Male':0

'Yes': 1, 'No': 0

'Private': 0, 'Self-employed': 1, 'Govt_job':2, 'children':3

'Urban': 1, 'Rural':0

'formerly smoked':0, 'never smoked':1, 'smokes':2, 'Unknown':3

In [None]:
# Encode the categorical (string) features as integer codes.
#
# Two safeguards over a plain chain of .map() calls:
#   * a column that is already numeric is skipped, so re-running this cell does not
#     turn every value into NaN (the integer codes are not keys of the mapping);
#   * a category without a code raises immediately instead of silently becoming NaN
#     and only failing later when a model is fitted.
category_codes = {
    'gender': {'Female': 1, 'Male': 0},
    'ever_married': {'Yes': 1, 'No': 0},
    'work_type': {'Private': 0, 'Self-employed': 1, 'Govt_job': 2, 'children': 3},
    'Residence_type': {'Urban': 1, 'Rural': 0},
    'smoking_status': {'formerly smoked': 0, 'never smoked': 1, 'smokes': 2, 'Unknown': 3},
}

for column, codes in category_codes.items():
    if pd.api.types.is_numeric_dtype(df[column]):
        continue  # already encoded by a previous run of this cell

    # Missing values stay missing under .map(); only real, unknown labels are an error.
    unmapped = set(df[column].dropna().unique()) - set(codes)
    if unmapped:
        raise ValueError(
            f"Column {column!r} has categories with no numeric code: "
            f"{sorted(map(str, unmapped))}"
        )

    df[column] = df[column].map(codes)

In [None]:
# Preview the first five rows after encoding.
df.head(n=5)

## **<span style="color:#FEB139;">Scaling Features:</span>**

In [None]:
# Features: every column except the last one. Target: the 'stroke' column.
feature_columns = df.columns[:-1]
X = df.loc[:, feature_columns]
y = df.loc[:, 'stroke']

In [None]:
# Hold out 20% of the rows for testing.
# The target is heavily imbalanced, so stratify on y: both splits then keep the same
# stroke / no-stroke ratio instead of leaving the number of positive test cases to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42, stratify = y
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, so no test-set statistics are used.
scale = MinMaxScaler()

# fit_transform returns a bare array; rebuild the frame with X_train's columns AND
# index so X_scaled stays label-aligned with y_train (a fresh RangeIndex would not be).
X_scaled = pd.DataFrame(
    scale.fit_transform(X_train),
    columns = X_train.columns,
    index = X_train.index,
)
X_scaled

In [None]:
# Number of rows per target class (y is still the full-dataset 'stroke' column here).
y.value_counts()

Imbalanced Dataset 

In [None]:
# Randomly re-sample the minority class of the scaled training data.
# random_state pins which rows get duplicated so a re-run reproduces the same data.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)

# NOTE: this rebinds X and y. From here on they refer to the oversampled, scaled
# *training* data, not to the full feature matrix / target selected earlier.
X, y = oversample.fit_resample(X_scaled, y_train)

In [None]:
# Size of the target after oversampling (y now holds the resampled training labels).
y.shape

## Model 1 - Logistic Regression with weight balancing 

In [None]:
# Logistic regression with class weights inversely proportional to class frequency.
# It is fitted on the unscaled training features, where the solver may need more than
# the default 100 iterations to converge. Warnings are silenced at the top of the
# notebook, so a ConvergenceWarning would go unnoticed; raise the iteration cap instead.
model_1 = LogisticRegression(class_weight = 'balanced', max_iter = 1000)

model_1.fit(X_train, y_train)

In [None]:
# Class labels predicted by Model 1 for the test split.
predictions_1 = model_1.predict(X = X_test)

In [None]:
# Confusion matrix for Model 1 on the test split, labelled with the model's classes.
cm = confusion_matrix(
    y_true = y_test,
    y_pred = predictions_1,
    labels = model_1.classes_,
)
disp = ConfusionMatrixDisplay(cm, display_labels = model_1.classes_)
disp.plot()

In [None]:
# Text report of the per-class metrics for Model 1.
clf_report = classification_report(y_true = y_test, y_pred = predictions_1)
print(clf_report)

In [None]:
# ROC curve for Model 1.

# Column 1 of predict_proba: predicted probability of the positive class (stroke = 1).
y_pred_prob = model_1.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_true = y_test, y_score = y_pred_prob)

# Dashed black diagonal as the reference line, then the model's curve.
plt.plot((0, 1), (0, 1), 'k--')
plt.plot(fpr, tpr)
plt.show()

In [None]:
# Area under the ROC curve for Model 1's predicted probabilities.
# Stored as roc_auc_1 rather than `auc`, so the sklearn.metrics.auc function imported
# at the top of the notebook is not overwritten by a float.
roc_auc_1 = metrics.roc_auc_score(y_test, y_pred_prob)
roc_auc_1

# Model 2 - Random Forest Classifier

In [None]:
# Random forest with default hyper-parameters.
# random_state seeds the bootstrap sampling and feature selection, so the metrics in
# the following cells are reproducible across runs.
model_2 = RandomForestClassifier(random_state = 42)

# Fit
model_2.fit(X_train, y_train)

In [None]:
# Class labels predicted by Model 2 for the test split.
predictions_2 = model_2.predict(X = X_test)

In [None]:
# Confusion matrix for Model 2 on the test split, labelled with the model's classes.
cm = confusion_matrix(
    y_true = y_test,
    y_pred = predictions_2,
    labels = model_2.classes_,
)
disp = ConfusionMatrixDisplay(cm, display_labels = model_2.classes_)
disp.plot()

In [None]:
# Text report of the per-class metrics for Model 2.
clf_report = classification_report(y_true = y_test, y_pred = predictions_2)
print(clf_report)

In [None]:
# ROC curve for Model 2.

# Column 1 of predict_proba: predicted probability of the positive class (stroke = 1).
y_pred_prob = model_2.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_true = y_test, y_score = y_pred_prob)

# Dashed black diagonal as the reference line, then the model's curve.
plt.plot((0, 1), (0, 1), 'k--')
plt.plot(fpr, tpr)
plt.show()

In [None]:
# Cross-validated ROC AUC for scaling + SMOTE + Random Forest.
#
# The evaluation uses the original training split (X_train, y_train). By this point
# X and y have been rebound to the randomly oversampled training data: cross-validating
# on that puts copies of the same minority rows in both the fitting and the validation
# folds, which inflates the score. Scaling and SMOTE live inside the pipeline so they
# are fitted on the training part of each fold only.
steps = [
    ('scale', MinMaxScaler()),
    ('over', SMOTE(random_state=42)),
    ('model', RandomForestClassifier(n_estimators = 140, random_state=42)),
]
pipeline = Pipeline(steps=steps)

# evaluate pipeline: 10 stratified folds, repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % mean(scores))

**<span style="color:#FEB139;">Most Likely to overfit </span>**

# Model 3 XGBoost

In [None]:
# Gradient-boosted trees (XGBoost) with default hyper-parameters,
# trained on the training split.
model_3 = XGBClassifier()

model_3.fit(X = X_train, y = y_train)

In [None]:
# Class labels predicted by Model 3 for the test split.
predictions_3 = model_3.predict(X = X_test)

In [None]:
# Confusion matrix for Model 3 on the test split, labelled with the model's classes.
cm = confusion_matrix(
    y_true = y_test,
    y_pred = predictions_3,
    labels = model_3.classes_,
)
disp = ConfusionMatrixDisplay(cm, display_labels = model_3.classes_)
disp.plot()

In [None]:
# Text report of the per-class metrics for Model 3.
clf_report = classification_report(y_true = y_test, y_pred = predictions_3)
print(clf_report)

In [None]:
# ROC curve for Model 3.

# Column 1 of predict_proba: predicted probability of the positive class (stroke = 1).
y_pred_prob = model_3.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_true = y_test, y_score = y_pred_prob)

# Dashed black diagonal as the reference line, then the model's curve.
plt.plot((0, 1), (0, 1), 'k--')
plt.plot(fpr, tpr)
plt.show()

In [None]:
# Cross-validated ROC AUC for XGBoost.
#
# The evaluation uses the original training split (X_train, y_train). By this point
# X and y have been rebound to the randomly oversampled training data, where copies of
# the same minority rows would land in both the fitting and the validation folds and
# inflate the score.
steps = [('model', XGBClassifier())]
pipeline = Pipeline(steps=steps)

# evaluate pipeline: 10 stratified folds, repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % mean(scores))


## 🤘Conclusion

The Logistic Regression classifier did better than the XGBoost and Random Forest classifiers, since the latter two most likely overfit.
 
**This is not the end of 🧠Brain Stroke Prediction**

**Stay Tuned for more analysis on this dataset**

**Please share your feedback and suggestions and help me improve 😇**

**Gifs -** Tenor.com and clevelandclinic

<div style="width:100%;text-align: center;"> <img align=middle src="https://c.tenor.com/NtzQJkqSqD8AAAAM/beyin-mazgi.gif" alt="Brain animation" style="height:366px;margin-top:3rem;"> </div>