In [1]:
import numpy as np
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_csv(r'C:\Users\amina\OneDrive\Desktop\healthcare-dataset-stroke-data.csv')

In [3]:
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
data.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [5]:
data.isna().sum()/len(data)

id                   0.000000
gender               0.000000
age                  0.000000
hypertension         0.000000
heart_disease        0.000000
ever_married         0.000000
work_type            0.000000
Residence_type       0.000000
avg_glucose_level    0.000000
bmi                  0.039335
smoking_status       0.000000
stroke               0.000000
dtype: float64

In [6]:
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

In [7]:
data = data.drop('id', axis=1)

In [8]:
data['ever_married'] = data['ever_married'].replace(['Yes', 'No'], [1, 0])
data['Residence_type'] = data['Residence_type'].replace(['Urban', 'Rural'], [1, 0])

In [9]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,1,Private,1,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,1,Self-employed,0,202.21,28.893237,never smoked,1
2,Male,80.0,0,1,1,Private,0,105.92,32.5,never smoked,1
3,Female,49.0,0,0,1,Private,1,171.23,34.4,smokes,1
4,Female,79.0,1,0,1,Self-employed,0,174.12,24.0,never smoked,1


In [10]:
data.dtypes

gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married           int64
work_type             object
Residence_type         int64
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [11]:
data = pd.get_dummies(data)

In [12]:
data.head()

Unnamed: 0,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,1,1,228.69,36.6,1,0,1,0,0,0,1,0,0,0,1,0,0
1,61.0,0,0,1,0,202.21,28.893237,1,1,0,0,0,0,0,1,0,0,0,1,0
2,80.0,0,1,1,0,105.92,32.5,1,0,1,0,0,0,1,0,0,0,0,1,0
3,49.0,0,0,1,1,171.23,34.4,1,1,0,0,0,0,1,0,0,0,0,0,1
4,79.0,1,0,1,0,174.12,24.0,1,1,0,0,0,0,0,1,0,0,0,1,0


In [13]:
data['stroke'].value_counts()

0    4861
1     249
Name: stroke, dtype: int64

In [14]:
X = data.drop('stroke', axis=1)
y = data['stroke']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [16]:
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
y_train

3482    0
2499    0
614     0
4098    0
1159    0
       ..
1751    0
3329    0
3391    0
4566    0
4430    0
Name: stroke, Length: 4088, dtype: int64

In [18]:
log_model = LogisticRegressionCV(class_weight='balanced')

In [19]:
log_model.fit(X_train, y_train)

LogisticRegressionCV(class_weight='balanced')

In [20]:
from sklearn.metrics import confusion_matrix, recall_score

In [21]:
confusion_matrix(y_test, log_model.predict(X_test))

array([[719, 253],
       [  6,  44]], dtype=int64)

In [22]:
recall_score(y_test, log_model.predict(X_test))

0.88

In [23]:
rf_model = RandomForestClassifier(n_estimators=200, class_weight='balanced')

In [24]:
params = {
    'max_depth':[2, 4, 6, 8],
    'min_samples_split':[2, 4, 6, 8],
    'min_samples_leaf':[1, 3, 5, 7]
}

In [25]:
rf_tuner = RandomizedSearchCV(rf_model, params, n_iter=20, n_jobs=-1, cv=5, scoring='balanced_accuracy')

In [26]:
rf_tuner.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=RandomForestClassifier(class_weight='balanced',
                                                    n_estimators=200),
                   n_iter=20, n_jobs=-1,
                   param_distributions={'max_depth': [2, 4, 6, 8],
                                        'min_samples_leaf': [1, 3, 5, 7],
                                        'min_samples_split': [2, 4, 6, 8]},
                   scoring='balanced_accuracy')

In [27]:
confusion_matrix(y_test, rf_tuner.predict(X_test))

array([[680, 292],
       [  5,  45]], dtype=int64)

In [28]:
gb_model = GradientBoostingClassifier(n_estimators=200)

In [29]:
params = {
    'learning_rate':[0.01, 0.1, 0.2, 1],
    'min_samples_split':[2, 4, 6, 8],
    'min_samples_leaf':[1, 3, 5, 7]
}

In [30]:
gb_tuner = RandomizedSearchCV(gb_model, params, n_iter=20, n_jobs=-1, cv=5, scoring='recall')

In [31]:
gb_tuner.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=GradientBoostingClassifier(n_estimators=200),
                   n_iter=20, n_jobs=-1,
                   param_distributions={'learning_rate': [0.01, 0.1, 0.2, 1],
                                        'min_samples_leaf': [1, 3, 5, 7],
                                        'min_samples_split': [2, 4, 6, 8]},
                   scoring='recall')

In [32]:
confusion_matrix(y_test, gb_tuner.predict(X_test))#not good

array([[914,  58],
       [ 37,  13]], dtype=int64)

In [33]:
X_train.shape

(4088, 19)

In [34]:
input = keras.layers.Input(shape=(X_train.shape[1],))
x = keras.layers.Dense(64, activation='relu')(input)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.Dense(32, activation='relu')(x)
x = keras.layers.BatchNormalization()(x)
output = keras.layers.Dense(1, activation='sigmoid')(x)

model = keras.models.Model(input, output)

model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(0.001), metrics=[keras.metrics.Recall(name='recall')])

In [35]:
from sklearn.utils.class_weight import compute_class_weight
classes = np.unique(np.unique(y))
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_dict = {}
for i in classes:
    class_dict[i] = class_weights[i]

In [36]:
class myCallback(keras.callbacks.Callback):
    best = 0
    def on_epoch_end(self, epoch, logs={}):
        if logs.get('val_recall') > self.best:
            self.best = logs.get('val_recall')
            model.save('bestmodel.hdf5')
        print(f'\nbest recall is : {self.best}')
mc = myCallback()

In [37]:
from keras.callbacks import ReduceLROnPlateau
callbacks = [ReduceLROnPlateau(patience=2), mc]

In [38]:
model.fit(X_train, y_train, validation_split=0.2, batch_size=32, epochs=1000, callbacks=callbacks, class_weight=class_dict)

Epoch 1/1000
best recall is : 0.6585366129875183
Epoch 2/1000
best recall is : 0.8536585569381714
Epoch 3/1000
best recall is : 0.8536585569381714
Epoch 4/1000
best recall is : 0.8780487775802612
Epoch 5/1000
best recall is : 0.8780487775802612
Epoch 6/1000
best recall is : 0.8780487775802612
Epoch 7/1000
best recall is : 0.8780487775802612
Epoch 8/1000
best recall is : 0.8780487775802612
Epoch 9/1000
best recall is : 0.8780487775802612
Epoch 10/1000
best recall is : 0.8780487775802612
Epoch 11/1000
best recall is : 0.8780487775802612
Epoch 12/1000
best recall is : 0.8780487775802612
Epoch 13/1000
best recall is : 0.8780487775802612
Epoch 14/1000
best recall is : 0.8780487775802612
Epoch 15/1000
best recall is : 0.8780487775802612
Epoch 16/1000
best recall is : 0.8780487775802612
Epoch 17/1000
best recall is : 0.8780487775802612
Epoch 18/1000
best recall is : 0.8780487775802612
Epoch 19/1000
best recall is : 0.8780487775802612
Epoch 20/1000
best recall is : 0.8780487775802612
Epoch 21/

<tensorflow.python.keras.callbacks.History at 0x291bfa02640>

In [39]:
confusion_matrix(y_test, np.round(model.predict(X_test)))

array([[686, 286],
       [  8,  42]], dtype=int64)

In [57]:
final_model = LogisticRegressionCV(class_weight='balanced')

In [51]:
X_train1 = log_model.predict(X_train).reshape(-1,1)
X_train2 = rf_tuner.predict(X_train).reshape(-1,1)
X_train3 = np.round(model.predict(X_train))

In [52]:
X_train3

array([[0.],
       [1.],
       [0.],
       ...,
       [1.],
       [0.],
       [1.]], dtype=float32)

In [55]:
X_train = np.concatenate([X_train1, X_train2, X_train3], axis=1)

In [54]:
X_test1 = log_model.predict(X_test).reshape(-1,1)
X_test2 = rf_tuner.predict(X_test).reshape(-1,1)
X_test3 = np.round(model.predict(X_test))

In [56]:
X_test = np.concatenate([X_test1, X_test2, X_test3], axis=1)

In [58]:
final_model.fit(X_train, y_train)

LogisticRegressionCV(class_weight='balanced')

In [59]:
confusion_matrix(y_test, np.round(final_model.predict(X_test)))

array([[738, 234],
       [  8,  42]], dtype=int64)