## Stroke modeling with Keras

In [1]:
import numpy as np
import pandas as pd

In [5]:
# import data sets

stroke_raw_df = pd.read_csv("stroke_raw_df.csv", index_col=0)
stroke_norm_df = pd.read_csv("stroke_norm_df.csv", index_col=0)
stroke_scaled_df = pd.read_csv("stroke_scaled_df.csv", index_col=0)

stroke_raw_df.head(3)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Private,work_type_Self-employed,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,1,67.0,0,1,1,1,228.69,36.6,1,0,1,0,0,1,0,0
1,51676,0,61.0,0,0,1,0,202.21,33.2,1,0,0,1,0,0,1,0
2,31112,1,80.0,0,1,1,0,105.92,32.5,1,0,1,0,0,0,1,0


In [6]:
stroke_norm_df.head(3)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Private,work_type_Self-employed,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,1,0.736842,0,1,1,1,0.801173,0.313507,1,0,1,0,0,1,0,0
1,51676,0,0.631579,0,0,1,0,0.678875,0.271375,1,0,0,1,0,0,1,0
2,31112,1,0.964912,0,1,1,0,0.234159,0.262701,1,0,1,0,0,0,1,0


In [7]:
stroke_scaled_df.head(3)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Private,work_type_Self-employed,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,1,0.874738,0,1,1,1,2.464581,0.819354,1,0,1,0,0,1,0,0
1,51676,0,0.499979,0,0,1,0,1.918619,0.342896,1,0,0,1,0,0,1,0
2,31112,1,1.686716,0,1,1,0,-0.066679,0.244801,1,0,1,0,0,0,1,0


### Create Train and Test data sets for each data set

In [16]:
from sklearn.model_selection import train_test_split

# Raw data

X_raw = stroke_raw_df.drop(['id', 'stroke'], axis=1)
y_raw = stroke_raw_df['stroke']

X_raw_train, X_raw_test, y_raw_train, y_raw_test = train_test_split(X_raw, y_raw, test_size = 0.25, 
                                                                    random_state = 42, stratify = y_raw)

In [17]:
# Normed data

X_norm = stroke_norm_df.drop(['id', 'stroke'], axis=1)
y_norm = stroke_norm_df['stroke']

X_norm_train, X_norm_test, y_norm_train, y_norm_test = train_test_split(X_norm, y_norm, test_size = 0.25, 
                                                                    random_state = 42, stratify = y_norm)

In [18]:
# Scaled data

X_scaled = stroke_scaled_df.drop(['id', 'stroke'], axis=1)
y_scaled = stroke_scaled_df['stroke']

X_scaled_train, X_scaled_test, y_scaled_train, y_scaled_test = train_test_split(X_scaled, y_scaled, test_size = 0.25, 
                                                                    random_state = 42, stratify = y_scaled)

### Create new data set with random over and under sampling

In [11]:
minority_class_percent = (len(stroke_raw_df[stroke_raw_df['stroke'] == 1])/len(stroke_raw_df))*100
minority_class_percent

#minority class 6.6% of the all the data, heavily imbalanced data set

6.6078116639914395

In [26]:
#Resample the training data with RandomOversampler and Clustered Centroids
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import ClusterCentroids

# define over/under sampling strategies
ros = RandomOverSampler(sampling_strategy=.15, random_state = 42)
cc = ClusterCentroids(sampling_strategy=.75, random_state=42)

#orignal number of each target class
print(Counter(y_raw_train))


Counter({0: 2618, 1: 185})


**Raw data, over and under sampling**

In [25]:
# apply over sampling strategy on original data
X_r_train_resampled, y_r_train_resampled = ros.fit_resample(X_raw_train, y_raw_train)
print(Counter(y_r_train_resampled))

#apply under sampling strategy on oversampled data
X_r_train_resampled, y_r_train_resampled = cc.fit_resample(X_r_train_resampled, y_r_train_resampled)
print(Counter(y_r_train_resampled))

Counter({0: 2618, 1: 392})
Counter({0: 522, 1: 392})


**Normed data, over and under sampling**

In [27]:
# apply over sampling strategy on original data
X_n_train_resampled, y_n_train_resampled = ros.fit_resample(X_norm_train, y_norm_train)
print(Counter(y_n_train_resampled))

#apply under sampling strategy on oversampled data
X_n_train_resampled, y_n_train_resampled = cc.fit_resample(X_n_train_resampled, y_n_train_resampled)
print(Counter(y_n_train_resampled))

Counter({0: 2618, 1: 392})
Counter({0: 522, 1: 392})


**Scaled data, over and under sampling**

In [28]:
# apply over sampling strategy on original data
X_s_train_resampled, y_s_train_resampled = ros.fit_resample(X_scaled_train, y_scaled_train)
print(Counter(y_s_train_resampled))

#apply under sampling strategy on oversampled data
X_s_train_resampled, y_s_train_resampled = cc.fit_resample(X_s_train_resampled, y_s_train_resampled)
print(Counter(y_s_train_resampled))

Counter({0: 2618, 1: 392})
Counter({0: 522, 1: 392})


### Create new data set with SMOTEEN

In [45]:
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours

#create new minority class data points
#clean up majority class data points
# may lead to more false positive, but fewer false negatives

smote = SMOTEENN(random_state=42) #, enn=EditedNearestNeighbours(sampling_strategy='majority'))

**Raw data, SMOTEEN**

In [46]:
# apply SMOTEEN on original data
X_r_train_smoteen, y_r_train_smoteen = smote.fit_resample(X_raw_train, y_raw_train)
print(Counter(y_r_train_smoteen))

Counter({1: 2410, 0: 1800})


**Normalized data, SMOTEEN**

In [48]:
# apply SMOTEEN on normalized data
X_n_train_smoteen, y_n_train_smoteen = smote.fit_resample(X_norm_train, y_norm_train)
print(Counter(y_n_train_smoteen))

Counter({1: 2256, 0: 1985})


**Scaled data, SMOTEEN**

In [50]:
# apply SMOTEEN on scaled data
X_s_train_smoteen, y_s_train_smoteen = smote.fit_resample(X_scaled_train, y_scaled_train)
print(Counter(y_s_train_smoteen))

Counter({1: 2489, 0: 2023})


## Simple Logistic Regression to compare

In [52]:
# raw = [X_raw_train, y_raw_train]
# raw_sampled = [X_r_train_resampled, y_r_train_resampled]
# raw_smoteen = [X_r_train_smoteen, y_r_train_smoteen]
# raw_test = [X_raw_test, y_raw_test]

# normed = [X_norm_train, y_norm_train]
# normed_sampled = [X_n_train_resampled, y_n_train_resampled]
# normed_smoteen = [X_n_train_smoteen, y_n_train_smoteen]
# norm_test = [X_norm_test, y_norm_test]

# scaled = [X_scaled_train, y_scaled_train]
# scaled_sampled = [X_s_train_resampled, y_s_train_resampled]
# scaled_smoteen = [X_s_train_smoteen, y_s_train_smoteen]
# scaled_test = [X_scaled_test, y_scaled_test]

# train_list = [raw, raw_sampled, raw_smoteen, 
#               normed, normed_sampled, normed_smoteen, scaled, scaled_sampled, scaled_smoteen]

# test_list = [raw_test, norm_test, scaled_test]

In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

model = LogisticRegression(max_iter=5000, random_state=42)

print(Counter(y_norm_test))

Counter({0: 873, 1: 62})


In [54]:
model.fit(X_norm_train, y_norm_train)
y_pred = model.predict(X_norm_test)
confusion_matrix(y_norm_test, y_pred)

array([[873,   0],
       [ 62,   0]])

In [55]:
model.fit(X_n_train_resampled, y_n_train_resampled)
y_pred = model.predict(X_norm_test)
confusion_matrix(y_norm_test, y_pred)

array([[640, 233],
       [ 24,  38]])

In [57]:
print(classification_report(y_norm_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.73      0.83       873
           1       0.14      0.61      0.23        62

    accuracy                           0.73       935
   macro avg       0.55      0.67      0.53       935
weighted avg       0.91      0.73      0.79       935



In [58]:
model.fit(X_n_train_smoteen, y_n_train_smoteen)
y_pred = model.predict(X_norm_test)
confusion_matrix(y_norm_test, y_pred)

array([[599, 274],
       [ 13,  49]])

In [59]:
print(classification_report(y_norm_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.69      0.81       873
           1       0.15      0.79      0.25        62

    accuracy                           0.69       935
   macro avg       0.57      0.74      0.53       935
weighted avg       0.92      0.69      0.77       935



In [60]:
model.fit(X_scaled_train, y_scaled_train)
y_pred = model.predict(X_scaled_test)
confusion_matrix(y_scaled_test, y_pred)

array([[873,   0],
       [ 62,   0]])

In [63]:
model.fit(X_s_train_resampled, y_s_train_resampled)
y_pred = model.predict(X_scaled_test)
print(confusion_matrix(y_scaled_test, y_pred))
print(classification_report(y_scaled_test, y_pred))

[[628 245]
 [ 22  40]]
              precision    recall  f1-score   support

           0       0.97      0.72      0.82       873
           1       0.14      0.65      0.23        62

    accuracy                           0.71       935
   macro avg       0.55      0.68      0.53       935
weighted avg       0.91      0.71      0.79       935



In [64]:
model.fit(X_s_train_smoteen, y_s_train_smoteen)
y_pred = model.predict(X_scaled_test)
print(confusion_matrix(y_scaled_test, y_pred))
print(classification_report(y_scaled_test, y_pred))

[[631 242]
 [ 14  48]]
              precision    recall  f1-score   support

           0       0.98      0.72      0.83       873
           1       0.17      0.77      0.27        62

    accuracy                           0.73       935
   macro avg       0.57      0.75      0.55       935
weighted avg       0.92      0.73      0.79       935



In [152]:
#test C
# test L1, L2
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score, roc_auc_score

#logistic regression model
best_lr_model = LogisticRegression(max_iter=5000, random_state=42)

#tune parameters
params_lr = {'C': [.01, .1, 1, 10, 100],
             'penalty': ['l1', 'l2']
            }

grid_lr = GridSearchCV(estimator=best_lr_model, param_grid=params_lr, scoring='recall',
                      cv=10, n_jobs=-1)

grid_lr.fit(X_n_train_smoteen, y_n_train_smoteen)

# model.fit(X_n_train_smoteen, y_n_train_smoteen)
# y_pred = model.predict(X_norm_test)
# confusion_matrix(y_norm_test, y_pred)

        nan 0.85018486        nan 0.84974238]


GridSearchCV(cv=10,
             estimator=LogisticRegression(max_iter=5000, random_state=42),
             n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']},
             scoring='recall')

In [156]:
best_params = grid_lr.best_params_
best_CV_score = grid_lr.best_score_
print("Best Parameters: \n", best_params)
print("Best Recall Score: \n", best_CV_score)

Best Parameters: 
 {'C': 0.01, 'penalty': 'l2'}
Best Recall Score: 
 0.8581632251720747


1

In [154]:
# name best model
lr_best_model = grid_lr.best_estimator_
# use model to predict on test data (held back)
y_pred = lr_best_model.predict(X_norm_test)

print(confusion_matrix(y_norm_test, y_pred))
print(classification_report(y_norm_test, y_pred))

[[561 312]
 [ 10  52]]
              precision    recall  f1-score   support

           0       0.98      0.64      0.78       873
           1       0.14      0.84      0.24        62

    accuracy                           0.66       935
   macro avg       0.56      0.74      0.51       935
weighted avg       0.93      0.66      0.74       935



In [155]:
from sklearn.metrics import balanced_accuracy_score
print(balanced_accuracy_score(y_norm_test, y_pred))
print(recall_score(y_norm_test, y_pred))
print(roc_auc_score(y_norm_test, y_pred))

0.7406606806340761
0.8387096774193549
0.7406606806340761


In [161]:
# Going back to original normed data: how does the model perform?
model_y = lr_best_model.predict(X_norm)
print(confusion_matrix(y_norm, model_y))
print(classification_report(y_norm, model_y))

[[2220 1271]
 [  44  203]]
              precision    recall  f1-score   support

           0       0.98      0.64      0.77      3491
           1       0.14      0.82      0.24       247

    accuracy                           0.65      3738
   macro avg       0.56      0.73      0.50      3738
weighted avg       0.92      0.65      0.74      3738



In [162]:
print(balanced_accuracy_score(y_norm, model_y))
print(recall_score(y_norm, model_y))
print(roc_auc_score(y_norm, model_y))

0.7288916438685016
0.8218623481781376
0.7288916438685017


## Keras

In [67]:
from keras.models import Sequential
from keras.layers import Dense
from keras import metrics
from keras.callbacks import EarlyStopping


In [189]:
# normalized smoteen
# BINARY

n_cols = X_n_train_smoteen.shape[1]

model = Sequential()
model.add(Dense(100, activation='relu', input_shape=(n_cols,)))
model.add(Dense(100, activation='relu'))
#model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

early_stopping_monitor = EarlyStopping(patience=2, monitor='loss')

model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['binary_accuracy'])
model.fit(X_n_train_smoteen, y_n_train_smoteen, callbacks=[early_stopping_monitor], 
          epochs=50, verbose=2, validation_split=.3)

Epoch 1/50
93/93 - 1s - loss: 0.5392 - binary_accuracy: 0.7217 - val_loss: 0.6977 - val_binary_accuracy: 0.6033
Epoch 2/50
93/93 - 0s - loss: 0.3900 - binary_accuracy: 0.8268 - val_loss: 0.4773 - val_binary_accuracy: 0.7965
Epoch 3/50
93/93 - 0s - loss: 0.3484 - binary_accuracy: 0.8491 - val_loss: 0.4642 - val_binary_accuracy: 0.7950
Epoch 4/50
93/93 - 0s - loss: 0.3304 - binary_accuracy: 0.8592 - val_loss: 0.6489 - val_binary_accuracy: 0.7078
Epoch 5/50
93/93 - 0s - loss: 0.3137 - binary_accuracy: 0.8629 - val_loss: 0.5798 - val_binary_accuracy: 0.7376
Epoch 6/50
93/93 - 0s - loss: 0.3010 - binary_accuracy: 0.8703 - val_loss: 0.4387 - val_binary_accuracy: 0.8075
Epoch 7/50
93/93 - 0s - loss: 0.2934 - binary_accuracy: 0.8716 - val_loss: 0.3128 - val_binary_accuracy: 0.8696
Epoch 8/50
93/93 - 0s - loss: 0.2842 - binary_accuracy: 0.8831 - val_loss: 0.5246 - val_binary_accuracy: 0.7698
Epoch 9/50
93/93 - 0s - loss: 0.2703 - binary_accuracy: 0.8844 - val_loss: 0.3022 - val_binary_accuracy:

<tensorflow.python.keras.callbacks.History at 0x14c5fb220>

In [190]:
model.evaluate(X_norm_test, y_norm_test)



[0.7934988141059875, 0.7700534462928772]

In [191]:
y_pred = model.predict(X_norm_test)
y_pred = np.round(y_pred)
print(confusion_matrix(y_norm_test, y_pred))
print(classification_report(y_norm_test, y_pred))

[[693 180]
 [ 35  27]]
              precision    recall  f1-score   support

           0       0.95      0.79      0.87       873
           1       0.13      0.44      0.20        62

    accuracy                           0.77       935
   macro avg       0.54      0.61      0.53       935
weighted avg       0.90      0.77      0.82       935



In [192]:
# normalized

weights = {0:.54,1:7.6}

n_cols = X_norm_train.shape[1]

model = Sequential()
model.add(Dense(75, activation='relu', input_shape=(n_cols,)))
model.add(Dense(75, activation='relu'))
#model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

early_stopping_monitor = EarlyStopping(patience=3, monitor='loss')

model.compile(optimizer="adam", loss='binary_crossentropy', metrics=["accuracy"])
model.fit(X_norm_train, y_norm_train, callbacks=[early_stopping_monitor], 
          epochs=50, verbose=2, class_weight=weights, validation_split=.3)



Epoch 1/50
62/62 - 0s - loss: 0.6799 - accuracy: 0.3767 - val_loss: 0.6963 - val_accuracy: 0.5648
Epoch 2/50
62/62 - 0s - loss: 0.6225 - accuracy: 0.6055 - val_loss: 0.6077 - val_accuracy: 0.7111
Epoch 3/50
62/62 - 0s - loss: 0.5714 - accuracy: 0.6967 - val_loss: 0.5350 - val_accuracy: 0.7455
Epoch 4/50
62/62 - 0s - loss: 0.5455 - accuracy: 0.7212 - val_loss: 0.4825 - val_accuracy: 0.7741
Epoch 5/50
62/62 - 0s - loss: 0.5288 - accuracy: 0.7339 - val_loss: 0.6604 - val_accuracy: 0.6492
Epoch 6/50
62/62 - 0s - loss: 0.5146 - accuracy: 0.7365 - val_loss: 0.5869 - val_accuracy: 0.6813
Epoch 7/50
62/62 - 0s - loss: 0.5019 - accuracy: 0.7263 - val_loss: 0.5765 - val_accuracy: 0.6873
Epoch 8/50
62/62 - 0s - loss: 0.4918 - accuracy: 0.7339 - val_loss: 0.5487 - val_accuracy: 0.7027
Epoch 9/50
62/62 - 0s - loss: 0.4945 - accuracy: 0.7222 - val_loss: 0.6558 - val_accuracy: 0.6361
Epoch 10/50
62/62 - 0s - loss: 0.4810 - accuracy: 0.7375 - val_loss: 0.4835 - val_accuracy: 0.7444
Epoch 11/50
62/62 -

<tensorflow.python.keras.callbacks.History at 0x14c8b2430>

In [193]:
model.evaluate(X_norm_test, y_norm_test)



[0.5427139401435852, 0.7133690118789673]

In [194]:
y_pred = model.predict(X_norm_test)
y_pred = np.round(y_pred)
print(confusion_matrix(y_norm_test, y_pred))
print(classification_report(y_norm_test, y_pred))

[[635 238]
 [ 30  32]]
              precision    recall  f1-score   support

           0       0.95      0.73      0.83       873
           1       0.12      0.52      0.19        62

    accuracy                           0.71       935
   macro avg       0.54      0.62      0.51       935
weighted avg       0.90      0.71      0.78       935



In [170]:
from sklearn.utils import class_weight
# Calculate the weights for each class so that we can balance the data
weights = class_weight.compute_class_weight('balanced',
                                            np.unique(y_norm_train),
                                            y_norm_train)

508     0
4221    0
502     0
2512    0
       ..
4140    0
3071    0
433     0
1201    0
3771    0
Name: stroke, Length: 2803, dtype: int64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


In [171]:
weights

array([0.53533231, 7.57567568])