In [287]:
import pandas as pd
import numpy as np

df = pd.read_csv('./healthcare-dataset-stroke-data.csv') 

### Data analysis

In [288]:
print(df.head(10))
df.describe()


df.stroke.value_counts() # Dataset is imbalanced


      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female  61.0             0              0          Yes   
2  31112    Male  80.0             0              1          Yes   
3  60182  Female  49.0             0              0          Yes   
4   1665  Female  79.0             1              0          Yes   
5  56669    Male  81.0             0              0          Yes   
6  53882    Male  74.0             1              1          Yes   
7  10434  Female  69.0             0              0           No   
8  27419  Female  59.0             0              0          Yes   
9  60491  Female  78.0             0              0          Yes   

       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0        Private          Urban             228.69  36.6  formerly smoked   
1  Self-employed          Rural             202.21   NaN     never smoked   
2        Private    

0    4861
1     249
Name: stroke, dtype: int64

### Data cleansing

In [289]:
df.drop("id", axis=1, inplace=True)
df = df.sample(frac = 1)

#### Treatment of null values


In [290]:

df['smoking_status'].replace('Unknown', np.nan, inplace=True)
df['bmi'].fillna(df['bmi'].mean(), inplace=True)
df['smoking_status'].fillna(df['smoking_status'].mode()[0], inplace = True)

#df.dropna(inplace=True)


## Encoding

### Label encoding for categorical features with 2 values

In [291]:

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

label_features = ['ever_married','Residence_type']
df[label_features] = df[label_features].apply(le.fit_transform) # For label_features


### One hot encoding for categorical features with >2 values

In [292]:
ohe_features = ['gender','work_type','smoking_status']
for feat in ohe_features:
    df[feat] = pd.Categorical(df[feat])
    df_dummies = pd.get_dummies(df[feat], prefix = feat + '_encoded',drop_first=True)
    df.drop(feat, axis=1, inplace=True)
    df = pd.concat([df, df_dummies], axis=1)


## Train-test split

In [293]:
from sklearn.model_selection import train_test_split

labels = df.columns.values

if 'stroke' in labels:
    labels=df.pop('stroke')
    
train_x, test_x, train_y, test_y = train_test_split(df, labels, test_size=0.33, random_state=42)
labels
df

Unnamed: 0,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,gender_encoded_Male,gender_encoded_Other,work_type_encoded_Never_worked,work_type_encoded_Private,work_type_encoded_Self-employed,work_type_encoded_children,smoking_status_encoded_never smoked,smoking_status_encoded_smokes
3818,35.0,0,0,0,1,93.60,28.500000,1,0,0,1,0,0,0,1
4324,80.0,0,0,0,0,230.74,30.200000,0,0,0,0,1,0,0,0
1795,57.0,0,0,1,0,230.59,23.200000,1,0,0,1,0,0,0,0
3475,47.0,0,0,1,0,131.19,28.300000,1,0,0,1,0,0,0,1
682,20.0,0,0,0,1,55.25,20.400000,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1585,67.0,0,0,1,1,110.68,25.100000,1,0,0,1,0,0,0,0
1300,71.0,0,0,1,1,214.77,28.893237,0,0,0,1,0,0,1,0
1055,57.0,0,0,1,0,111.64,31.500000,1,0,0,1,0,0,1,0
2276,17.0,0,0,0,1,78.46,23.500000,0,0,0,1,0,0,1,0


In [269]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

### Oversampling

(only used for those methods that don't use class weights)

In [270]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

sm = SMOTE(random_state=5)
train_x, train_y = sm.fit_resample(train_x, train_y)
#os = RandomOverSampler(sampling_strategy = 1)
#train_x, train_y = os.fit_resample(train_x, train_y)



## Test models

In [271]:
from sklearn import svm
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix, accuracy_score

clf = svm.SVC()
clf.fit(train_x, train_y)

y_pred = clf.predict(test_x)
f1 = f1_score(test_y, y_pred,average=None)
roc = roc_auc_score(test_y, y_pred)
cm = confusion_matrix(test_y, y_pred) 

print('Confusion matrix: \n',cm,'\n')
print('Accuracy Score: {:.5f} \n'.format(accuracy_score(test_y, y_pred)))
print('ROC AUC Score: {:.5f} \n'.format(roc))
print('F1: {:.5f} {:.5f} \n'.format(f1[0], f1[1]))


Confusion matrix: 
 [[1289  317]
 [  39   42]] 

Accuracy Score: 0.78897 

ROC AUC Score: 0.66057 

F1: 0.87866 0.19091 



In [272]:
from sklearn.linear_model import LogisticRegression

log = LogisticRegression(class_weight='balanced')
log.fit(train_x, train_y)

y_pred = log.predict(test_x)
f1 = f1_score(test_y, y_pred,average=None)
roc = roc_auc_score(test_y, y_pred)
cm = confusion_matrix(test_y, y_pred) 

print('Logistic Regression\n')
print('Confusion matrix: \n',cm,'\n')
print('Accuracy Score: {:.5f} \n'.format(accuracy_score(test_y, y_pred)))
print('ROC AUC Score: {:.5f} \n'.format(roc))
print('F1: {:.5f} {:.5f} \n'.format(f1[0], f1[1]))

Logistic Regression

Confusion matrix: 
 [[1200  406]
 [  17   64]] 

Accuracy Score: 0.74926 

ROC AUC Score: 0.76866 

F1: 0.85016 0.23230 



In [273]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(train_x_os,train_y_os)

y_pred = gnb.predict(test_x)
f1 = f1_score(test_y, y_pred,average=None)
roc = roc_auc_score(test_y, y_pred)
cm = confusion_matrix(test_y, y_pred) 

print('Gaussian Naive Bayes\n')
print('Confusion matrix: \n',cm,'\n')
print('Accuracy Score: {:.5f} \n'.format(accuracy_score(test_y, y_pred)))
print('ROC AUC Score: {:.5f} \n'.format(roc))
print('F1: {:.5f} {:.5f} \n'.format(f1[0], f1[1]))

Gaussian Naive Bayes

Confusion matrix: 
 [[1606    0]
 [  81    0]] 

Accuracy Score: 0.95199 

ROC AUC Score: 0.50000 

F1: 0.97540 0.00000 



In [274]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

tree_para = {'criterion':['gini','entropy'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]}
tree = GridSearchCV(DecisionTreeClassifier(random_state=0,class_weight='balanced'), tree_para, cv=5)
tree.fit(train_x_os, train_y_os)

y_pred = tree.predict(test_x)
f1 = f1_score(test_y, y_pred,average=None)
roc = roc_auc_score(test_y, y_pred)
cm = confusion_matrix(test_y, y_pred) 

print('Decision Tree with Grid Search\n')
print('Confusion matrix: \n',cm,'\n')
print('Accuracy Score: {:.5f} \n'.format(accuracy_score(test_y, y_pred)))
print('ROC AUC Score: {:.5f} \n'.format(roc))
print('F1: {:.5f} {:.5f} \n'.format(f1[0], f1[1]))



Decision Tree with Grid Search

Confusion matrix: 
 [[1294  312]
 [  10   71]] 

Accuracy Score: 0.80913 

ROC AUC Score: 0.84114 

F1: 0.88935 0.30603 



In [275]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(random_state=0,class_weight='balanced_subsample')
rfc.fit(train_x,train_y)

y_pred = rfc.predict(test_x)
f1 = f1_score(test_y, y_pred,average=None)
roc = roc_auc_score(test_y, y_pred)
cm = confusion_matrix(test_y, y_pred) 

print('Random Forest\n')
print('Confusion matrix: \n',cm,'\n')
print('Accuracy Score: {:.5f} \n'.format(accuracy_score(test_y, y_pred)))
print('ROC AUC Score: {:.5f} \n'.format(roc))
print('F1: {:.5f} {:.5f} \n'.format(f1[0], f1[1]))

Random Forest

Confusion matrix: 
 [[1537   69]
 [  72    9]] 

Accuracy Score: 0.91642 

ROC AUC Score: 0.53407 

F1: 0.95614 0.11321 



In [276]:
from xgboost import XGBClassifier

xgb = XGBClassifier(eval_metric='error',use_label_encoder=False)
xgb.fit(train_x_os,train_y_os)

y_pred = xgb.predict(test_x)
f1 = f1_score(test_y, y_pred,average=None)
roc = roc_auc_score(test_y, y_pred)
cm = confusion_matrix(test_y, y_pred) 

print('XGB\n')
print('Confusion matrix: \n',cm,'\n')
print('Accuracy Score: {:.5f} \n'.format(accuracy_score(test_y, y_pred)))
print('ROC AUC Score: {:.5f} \n'.format(roc))
print('F1: {:.5f} {:.5f} \n'.format(f1[0], f1[1]))

XGB

Confusion matrix: 
 [[893 713]
 [  4  77]] 

Accuracy Score: 0.57499 

ROC AUC Score: 0.75333 

F1: 0.71354 0.17681 



In [277]:
from imblearn.ensemble import BalancedRandomForestClassifier

brf = BalancedRandomForestClassifier(random_state=42)
brf.fit(train_x,train_y)

y_pred = brf.predict(test_x)
f1 = f1_score(test_y, y_pred,average=None)
roc = roc_auc_score(test_y, y_pred)
cm = confusion_matrix(test_y, y_pred) 

print('Balanced Random Forest\n')
print('Confusion matrix: \n',cm,'\n')
print('Accuracy Score: {:.5f} \n'.format(accuracy_score(test_y, y_pred)))
print('ROC AUC Score: {:.5f} \n'.format(roc))
print('F1: {:.5f} {:.5f} \n'.format(f1[0], f1[1]))


Balanced Random Forest

Confusion matrix: 
 [[1538   68]
 [  74    7]] 

Accuracy Score: 0.91583 

ROC AUC Score: 0.52204 

F1: 0.95587 0.08974 

