In [31]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

In [32]:
df = pd.read_csv('imbalanced_cerebral.csv').set_index('id')

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43400 entries, 30669 to 36271
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             43400 non-null  object 
 1   age                43400 non-null  float64
 2   hypertension       43400 non-null  int64  
 3   heart_disease      43400 non-null  int64  
 4   ever_married       43400 non-null  object 
 5   work_type          43400 non-null  object 
 6   Residence_type     43400 non-null  object 
 7   avg_glucose_level  43400 non-null  float64
 8   bmi                41938 non-null  float64
 9   smoking_status     30108 non-null  object 
 10  stroke             43400 non-null  int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 4.0+ MB


In [34]:
df.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,43400.0,43400.0,43400.0,43400.0,41938.0,43400.0
mean,42.217894,0.093571,0.047512,104.48275,28.605038,0.018041
std,22.519649,0.291235,0.212733,43.111751,7.77002,0.133103
min,0.08,0.0,0.0,55.0,10.1,0.0
25%,24.0,0.0,0.0,77.54,23.2,0.0
50%,44.0,0.0,0.0,91.58,27.7,0.0
75%,60.0,0.0,0.0,112.07,32.9,0.0
max,82.0,1.0,1.0,291.05,97.6,1.0


In [35]:
df.head(10)

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0
32257,Female,47.0,0,0,Yes,Private,Urban,210.95,50.1,,0
52800,Female,52.0,0,0,Yes,Private,Urban,77.59,17.7,formerly smoked,0
41413,Female,75.0,0,1,Yes,Self-employed,Rural,243.53,27.0,never smoked,0
15266,Female,32.0,0,0,Yes,Private,Rural,77.67,32.3,smokes,0
28674,Female,74.0,1,0,Yes,Self-employed,Urban,205.84,54.6,never smoked,0


In [36]:
df = df.astype({
    'gender': 'category',
    'hypertension': 'category',
    'heart_disease': 'category',
    'ever_married': 'category',
    'work_type': 'category',
    'Residence_type': 'category',
    'smoking_status': 'category',
    'stroke': 'category'
})

In [37]:
df.isna().sum()

gender                   0
age                      0
hypertension             0
heart_disease            0
ever_married             0
work_type                0
Residence_type           0
avg_glucose_level        0
bmi                   1462
smoking_status       13292
stroke                   0
dtype: int64

In [38]:
df = df.dropna()

In [39]:
df.shape

(29072, 11)

In [40]:
df = df.drop(columns=['ever_married', 'work_type', 'Residence_type']).reset_index(drop=True)

In [41]:
cats = ['gender', 'hypertension', 'heart_disease', 'smoking_status']

for category in cats:
    df[category] = df[category].cat.codes

In [42]:
X = df.drop(columns=['stroke'])
y = df['stroke'].to_numpy()

In [43]:
X_std = StandardScaler().fit_transform(X)

In [44]:
X_res, y_res = SMOTE().fit_resample(X_std, y)

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.33, random_state=42)

In [46]:
models = {
    'MultiLayerPerceptron': [MLPClassifier(max_iter=350)],
    'GradientBoostingClassifier': [GradientBoostingClassifier()],
    'KNeighborsClassifier': [KNeighborsClassifier()],
    'SVC': [SVC()],
    'DecisionTreeClassifier': [DecisionTreeClassifier()],
    'RandomForestClassifier': [RandomForestClassifier()],
    'AdaBoostClassifier': [AdaBoostClassifier()],
    'GaussianNB': [GaussianNB()]
    }

In [47]:
for model in tuple(models.keys()):
    models[model][0].fit(X_train, y_train)




In [48]:
for model in tuple(models.keys()):
    models[model].append(models[model][0].predict(X_test))

In [49]:
for model in tuple(models.keys()):
    models[model].append(recall_score(y_test, models[model][1]))

In [50]:
for key, value in models.items():
    display(
        key,
        pd.DataFrame(confusion_matrix(y_test, value[1])),
    )

'MultiLayerPerceptron'

Unnamed: 0,0,1
0,7544,1829
1,861,8592


'GradientBoostingClassifier'

Unnamed: 0,0,1
0,7576,1797
1,1406,8047


'KNeighborsClassifier'

Unnamed: 0,0,1
0,7873,1500
1,206,9247


'SVC'

Unnamed: 0,0,1
0,7180,2193
1,1454,7999


'DecisionTreeClassifier'

Unnamed: 0,0,1
0,9018,355
1,289,9164


'RandomForestClassifier'

Unnamed: 0,0,1
0,8791,582
1,277,9176


'AdaBoostClassifier'

Unnamed: 0,0,1
0,6884,2489
1,1662,7791


'GaussianNB'

Unnamed: 0,0,1
0,7477,1896
1,3126,6327


In [51]:
recalls = list()
for key, value in models.items():
    recalls.append((key, value[2]))

print(
    *sorted(recalls,
    key=lambda x: x[1],
    reverse=True),
    sep='\n'
)

('KNeighborsClassifier', 0.9782079763038188)
('RandomForestClassifier', 0.9706971331852322)
('DecisionTreeClassifier', 0.9694276949116682)
('MultiLayerPerceptron', 0.9089178038717868)
('GradientBoostingClassifier', 0.8512641489474241)
('SVC', 0.8461863958531683)
('AdaBoostClassifier', 0.8241827991113932)
('GaussianNB', 0.6693113297365916)


In [56]:
grid_search = GridSearchCV(models['KNeighborsClassifier'][0],
{
    'n_neighbors': [5, 7, 9, 12, 15, 19, 35],
    'weights': ['uniform', 'distance'],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [5, 15, 30, 45, 75, 90],
    'metric': ['cityblock', 'euclidean', 'l1', 'l2', 'manhattan']
},
scoring='recall',
n_jobs=4,
verbose=1,
cv=2
)

In [57]:
grid_search.fit(X_train, y_train)

Fitting 2 folds for each of 1260 candidates, totalling 2520 fits


In [58]:
grid_search.best_params_

{'algorithm': 'ball_tree',
 'leaf_size': 5,
 'metric': 'euclidean',
 'n_neighbors': 19,
 'weights': 'distance'}

In [59]:
grid_search.best_score_

0.9809134837036282

In [60]:
y_pred = grid_search.predict(X_test)

In [61]:
pd.DataFrame(confusion_matrix(y_test, y_pred))

Unnamed: 0,0,1
0,7452,1921
1,107,9346


In [62]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.80      0.88      9373
           1       0.83      0.99      0.90      9453

    accuracy                           0.89     18826
   macro avg       0.91      0.89      0.89     18826
weighted avg       0.91      0.89      0.89     18826

