# Stroke modeling with Logistic Regression and Keras

In [1]:
import numpy as np
import pandas as pd

In [249]:
# import data sets

stroke_raw_df = pd.read_csv("stroke_raw_df.csv", index_col=0)
stroke_norm_df = pd.read_csv("stroke_norm_df.csv", index_col=0)
stroke_scaled_df = pd.read_csv("stroke_scaled_df.csv", index_col=0)

stroke_raw_df.head(3)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Private,work_type_Self-employed,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,1,67.0,0,1,1,1,228.69,36.6,1,0,1,0,0,1,0,0
1,51676,0,61.0,0,0,1,0,202.21,33.2,1,0,0,1,0,0,1,0
2,31112,1,80.0,0,1,1,0,105.92,32.5,1,0,1,0,0,0,1,0


In [250]:
stroke_norm_df.head(3)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Private,work_type_Self-employed,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,1,0.736842,0,1,1,1,0.801173,0.313507,1,0,1,0,0,1,0,0
1,51676,0,0.631579,0,0,1,0,0.678875,0.271375,1,0,0,1,0,0,1,0
2,31112,1,0.964912,0,1,1,0,0.234159,0.262701,1,0,1,0,0,0,1,0


In [251]:
stroke_scaled_df.head(3)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Private,work_type_Self-employed,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,1,0.874738,0,1,1,1,2.464581,0.819354,1,0,1,0,0,1,0,0
1,51676,0,0.499979,0,0,1,0,1.918619,0.342896,1,0,0,1,0,0,1,0
2,31112,1,1.686716,0,1,1,0,-0.066679,0.244801,1,0,1,0,0,0,1,0


### Create Train and Test data sets for each data set

In [252]:
from sklearn.model_selection import train_test_split

# Raw data

X_raw = stroke_raw_df.drop(['id', 'stroke'], axis=1)
y_raw = stroke_raw_df['stroke']

X_raw_train, X_raw_test, y_raw_train, y_raw_test = train_test_split(X_raw, y_raw, test_size = 0.25, 
                                                                    random_state = 42, stratify = y_raw)

In [253]:
# Normed data

X_norm = stroke_norm_df.drop(['id', 'stroke'], axis=1)
y_norm = stroke_norm_df['stroke']

X_norm_train, X_norm_test, y_norm_train, y_norm_test = train_test_split(X_norm, y_norm, test_size = 0.25, 
                                                                    random_state = 42, stratify = y_norm)

In [254]:
# Scaled data

X_scaled = stroke_scaled_df.drop(['id', 'stroke'], axis=1)
y_scaled = stroke_scaled_df['stroke']

X_scaled_train, X_scaled_test, y_scaled_train, y_scaled_test = train_test_split(X_scaled, y_scaled, test_size = 0.25, 
                                                                    random_state = 42, stratify = y_scaled)

### Create new data set with random over sampling and cluster centroid under sampling

In [255]:
minority_class_percent = (len(stroke_raw_df[stroke_raw_df['stroke'] == 1])/len(stroke_raw_df))*100
print("Percentage of stroke patients (minority class): ", round(minority_class_percent, 2))

#minority class 6.6% of the all the data, heavily imbalanced data set

Percentage of stroke patients (minority class):  6.61


#### Set up RandomOverSampler and ClusterCentroids

In [256]:
#Resample the training data with RandomOversampler and Clustered Centroids
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import ClusterCentroids

# define over/under sampling strategies
ros = RandomOverSampler(sampling_strategy=.15, random_state = 42)
cc = ClusterCentroids(sampling_strategy=.75, random_state=42)

#orignal number of each target class
print(Counter(y_raw_train))


Counter({0: 2618, 1: 185})


**Raw data, over and under sampling**

In [257]:
# apply over sampling strategy on original data
X_r_train_resampled, y_r_train_resampled = ros.fit_resample(X_raw_train, y_raw_train)
print(Counter(y_r_train_resampled))

#apply under sampling strategy on oversampled data
X_r_train_resampled, y_r_train_resampled = cc.fit_resample(X_r_train_resampled, y_r_train_resampled)
print(Counter(y_r_train_resampled))

Counter({0: 2618, 1: 392})
Counter({0: 522, 1: 392})


**Normed data, over and under sampling**

In [258]:
# apply over sampling strategy on original data
X_n_train_resampled, y_n_train_resampled = ros.fit_resample(X_norm_train, y_norm_train)
print(Counter(y_n_train_resampled))

#apply under sampling strategy on oversampled data
X_n_train_resampled, y_n_train_resampled = cc.fit_resample(X_n_train_resampled, y_n_train_resampled)
print(Counter(y_n_train_resampled))

Counter({0: 2618, 1: 392})
Counter({0: 522, 1: 392})


**Scaled data, over and under sampling**

In [259]:
# apply over sampling strategy on original data
X_s_train_resampled, y_s_train_resampled = ros.fit_resample(X_scaled_train, y_scaled_train)
print(Counter(y_s_train_resampled))

#apply under sampling strategy on oversampled data
X_s_train_resampled, y_s_train_resampled = cc.fit_resample(X_s_train_resampled, y_s_train_resampled)
print(Counter(y_s_train_resampled))

Counter({0: 2618, 1: 392})
Counter({0: 522, 1: 392})


### Create new data set with SMOTEEN

In [260]:
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours

#create new minority class data points
#clean up majority class data points
# may lead to more false positive, but fewer false negatives

smote = SMOTEENN(random_state=42) #, enn=EditedNearestNeighbours(sampling_strategy='majority'))

**Raw data, SMOTEEN**

In [261]:
# apply SMOTEEN on original data
X_r_train_smoteen, y_r_train_smoteen = smote.fit_resample(X_raw_train, y_raw_train)
print(Counter(y_r_train_smoteen))

Counter({1: 2410, 0: 1800})


**Normalized data, SMOTEEN**

In [262]:
# apply SMOTEEN on normalized data
X_n_train_smoteen, y_n_train_smoteen = smote.fit_resample(X_norm_train, y_norm_train)
print(Counter(y_n_train_smoteen))

Counter({1: 2256, 0: 1985})


**Scaled data, SMOTEEN**

In [263]:
# apply SMOTEEN on scaled data
X_s_train_smoteen, y_s_train_smoteen = smote.fit_resample(X_scaled_train, y_scaled_train)
print(Counter(y_s_train_smoteen))

Counter({1: 2489, 0: 2023})


## Simple Logistic Regression to compare

In [209]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score

def run_LR(X_train, y_train, X_test, y_test):
    """
    Automate run and performance of Logistic Regression Models
    """
    model = LogisticRegression(max_iter=5000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    y_train_pred = model.predict(X_train)
    print("Train data balanced accuracy: ", balanced_accuracy_score(y_train, y_train_pred))
    print("Test data balanced accuracy: ", balanced_accuracy_score(y_test, y_pred))
    

print(Counter(y_norm_test))

Counter({0: 873, 1: 62})


#### Normed data, no resampling strategy

Poor performance - no stroke victims identified

In [210]:
run_LR(X_norm_train, y_norm_train, X_norm_test, y_norm_test)


[[873   0]
 [ 62   0]]
              precision    recall  f1-score   support

           0       0.93      1.00      0.97       873
           1       0.00      0.00      0.00        62

    accuracy                           0.93       935
   macro avg       0.47      0.50      0.48       935
weighted avg       0.87      0.93      0.90       935

Train data balanced accuracy:  0.5027027027027027
Test data balanced accuracy:  0.5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Normed data, resampled

In [213]:
run_LR(X_n_train_resampled, y_n_train_resampled, X_norm_test, y_norm_test)


[[640 233]
 [ 24  38]]
              precision    recall  f1-score   support

           0       0.96      0.73      0.83       873
           1       0.14      0.61      0.23        62

    accuracy                           0.73       935
   macro avg       0.55      0.67      0.53       935
weighted avg       0.91      0.73      0.79       935

Train data balanced accuracy:  0.6969074986316366
Test data balanced accuracy:  0.6730037320326645


#### Normed data, SMOTEEN

In [214]:
run_LR(X_n_train_smoteen, y_n_train_smoteen, X_norm_test, y_norm_test)

[[599 274]
 [ 13  49]]
              precision    recall  f1-score   support

           0       0.98      0.69      0.81       873
           1       0.15      0.79      0.25        62

    accuracy                           0.69       935
   macro avg       0.57      0.74      0.53       935
weighted avg       0.92      0.69      0.77       935

Train data balanced accuracy:  0.8154661736070171
Test data balanced accuracy:  0.7382311643202897


#### Scaled data, no resampling strategy

Poor performance - no stroke victims identified

In [215]:
run_LR(X_scaled_train, y_scaled_train, X_scaled_test, y_scaled_test)

[[873   0]
 [ 62   0]]
              precision    recall  f1-score   support

           0       0.93      1.00      0.97       873
           1       0.00      0.00      0.00        62

    accuracy                           0.93       935
   macro avg       0.47      0.50      0.48       935
weighted avg       0.87      0.93      0.90       935

Train data balanced accuracy:  0.5027027027027027
Test data balanced accuracy:  0.5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Scaled data, resampled

In [217]:
run_LR(X_s_train_resampled, y_s_train_resampled, X_scaled_test, y_scaled_test)

[[628 245]
 [ 22  40]]
              precision    recall  f1-score   support

           0       0.97      0.72      0.82       873
           1       0.14      0.65      0.23        62

    accuracy                           0.71       935
   macro avg       0.55      0.68      0.53       935
weighted avg       0.91      0.71      0.79       935

Train data balanced accuracy:  0.7393365392133866
Test data balanced accuracy:  0.682259912057052


#### Scaled data, SMOTEEN

In [245]:
run_LR(X_s_train_smoteen, y_s_train_smoteen, X_scaled_test, y_scaled_test)

[[631 242]
 [ 14  48]]
              precision    recall  f1-score   support

           0       0.98      0.72      0.83       873
           1       0.17      0.77      0.27        62

    accuracy                           0.73       935
   macro avg       0.57      0.75      0.55       935
weighted avg       0.92      0.73      0.79       935

Train data balanced accuracy:  0.8467888467040445
Test data balanced accuracy:  0.7484942541477293


#### Run Logistic Regression using Lasso results on normed, SMOTEEN data

In [267]:
features_to_drop = ['gender', 'bmi', 'work_type_Govt_job', 'work_type_Self-employed', 'smoking_status_Unknown',
                    'smoking_status_formerly smoked', 'smoking_status_smokes']
X_lasso_train = X_n_train_smoteen.drop(features_to_drop, axis=1)
X_lasso_test = X_norm_test.drop(features_to_drop, axis=1)

run_LR(X_lasso_train, y_n_train_smoteen, X_lasso_test, y_norm_test)


[[593 280]
 [ 12  50]]
              precision    recall  f1-score   support

           0       0.98      0.68      0.80       873
           1       0.15      0.81      0.26        62

    accuracy                           0.69       935
   macro avg       0.57      0.74      0.53       935
weighted avg       0.93      0.69      0.77       935

Train data balanced accuracy:  0.8006652285760223
Test data balanced accuracy:  0.7428592543324835


##### SMOTEEN resampling strategy outperformed the over sampling, clustered centroid strategy.

Normed and scaled data both performed somewhat similarly.

### GridSearch to tune Logistic Regression model with normed SMOTEEN data

Goal to drive the false negative rate down (at the expense of the false postive rate).   Using recall as the scoring metric.

In [240]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score, roc_auc_score

#logistic regression model
best_lr_model = LogisticRegression(max_iter=5000, random_state=42)

#tune parameters
params_lr = {'C': [.01, .1, 1, 10, 100],
             'penalty': ['l1', 'l2']
            }

grid_lr = GridSearchCV(estimator=best_lr_model, param_grid=params_lr, scoring='recall',
                      cv=10, n_jobs=-1)
grid_lr.fit(X_n_train_smoteen, y_n_train_smoteen)

        nan 0.85018486        nan 0.84974238]


GridSearchCV(cv=10,
             estimator=LogisticRegression(max_iter=5000, random_state=42),
             n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']},
             scoring='recall')

In [268]:
best_params = grid_lr.best_params_
best_CV_score = grid_lr.best_score_
print("Best Parameters: \n", best_params)
print("Best Recall Score: \n", best_CV_score)

Best Parameters: 
 {'C': 0.01, 'penalty': 'l2'}
Best Recall Score: 
 0.8581632251720747


In [271]:
# name best model
lr_best_model = grid_lr.best_estimator_
# use model to predict on test data (held back)
y_pred = lr_best_model.predict(X_norm_test)

print(confusion_matrix(y_norm_test, y_pred))
print(classification_report(y_norm_test, y_pred))
y_train_pred = lr_best_model.predict(X_n_train_smoteen)
print("Train data balanced accuracy: ", balanced_accuracy_score(y_n_train_smoteen, y_train_pred))
print("Test data balanced accuracy: ", balanced_accuracy_score(y_norm_test, y_pred))
print("Test recall score: ", recall_score(y_norm_test, y_pred))

[[561 312]
 [ 10  52]]
              precision    recall  f1-score   support

           0       0.98      0.64      0.78       873
           1       0.14      0.84      0.24        62

    accuracy                           0.66       935
   macro avg       0.56      0.74      0.51       935
weighted avg       0.93      0.66      0.74       935

Train data balanced accuracy:  0.7911032656269539
Test data balanced accuracy:  0.7406606806340761
Test recall score:  0.8387096774193549


##### Parameter tuned logistic regression model is slightly overfitted, but performs well for recall in the test sample.

Looking at the original normed data and model performance

In [273]:
# Going back to original normed data: how does the model perform?
model_y = lr_best_model.predict(X_norm)
print(confusion_matrix(y_norm, model_y))
print(classification_report(y_norm, model_y))
print("Data balanced accuracy: ", balanced_accuracy_score(y_norm, model_y))
print("Test recall score: ", recall_score(y_norm, model_y))

[[2220 1271]
 [  44  203]]
              precision    recall  f1-score   support

           0       0.98      0.64      0.77      3491
           1       0.14      0.82      0.24       247

    accuracy                           0.65      3738
   macro avg       0.56      0.73      0.50      3738
weighted avg       0.92      0.65      0.74      3738

Data balanced accuracy:  0.7288916438685016
Test recall score:  0.8218623481781376


In [195]:
lr_best_model.coef_

array([[-0.18092877,  1.92034162,  0.23838581,  0.2714037 , -0.00416373,
        -0.06605773,  0.66203399, -0.05068571, -0.39502637, -0.07505169,
         0.06365918, -0.05889108,  0.09041746, -0.26327729, -0.16072952]])

## Keras

In [67]:
from keras.models import Sequential
from keras.layers import Dense
from keras import metrics
from keras.callbacks import EarlyStopping

#### Binary Keras model with normalized, SMOTEEN data

In [274]:
# number of features
n_cols = X_n_train_smoteen.shape[1]

model = Sequential()
model.add(Dense(100, activation='relu', input_shape=(n_cols,)))
model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

early_stopping_monitor = EarlyStopping(patience=2, monitor='loss')

model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['binary_accuracy'])
model.fit(X_n_train_smoteen, y_n_train_smoteen, callbacks=[early_stopping_monitor], 
          epochs=50, verbose=2, validation_split=.3)

Epoch 1/50
93/93 - 1s - loss: 0.5277 - binary_accuracy: 0.7268 - val_loss: 0.5147 - val_binary_accuracy: 0.7714
Epoch 2/50
93/93 - 0s - loss: 0.3816 - binary_accuracy: 0.8241 - val_loss: 0.5635 - val_binary_accuracy: 0.7596
Epoch 3/50
93/93 - 0s - loss: 0.3500 - binary_accuracy: 0.8443 - val_loss: 0.6560 - val_binary_accuracy: 0.6999
Epoch 4/50
93/93 - 0s - loss: 0.3235 - binary_accuracy: 0.8629 - val_loss: 0.6051 - val_binary_accuracy: 0.7321
Epoch 5/50
93/93 - 0s - loss: 0.3111 - binary_accuracy: 0.8649 - val_loss: 0.3818 - val_binary_accuracy: 0.8515
Epoch 6/50
93/93 - 0s - loss: 0.2971 - binary_accuracy: 0.8780 - val_loss: 0.3969 - val_binary_accuracy: 0.8397
Epoch 7/50
93/93 - 0s - loss: 0.2895 - binary_accuracy: 0.8787 - val_loss: 0.5747 - val_binary_accuracy: 0.7549
Epoch 8/50
93/93 - 0s - loss: 0.2764 - binary_accuracy: 0.8841 - val_loss: 0.5161 - val_binary_accuracy: 0.7824
Epoch 9/50
93/93 - 0s - loss: 0.2724 - binary_accuracy: 0.8827 - val_loss: 0.3385 - val_binary_accuracy:

<tensorflow.python.keras.callbacks.History at 0x14d9306d0>

In [275]:
model.evaluate(X_norm_test, y_norm_test)



[0.713529646396637, 0.7818182110786438]

##### Evaluate on held back test data

In [277]:
y_pred = model.predict(X_norm_test)
y_pred = np.round(y_pred)
print(confusion_matrix(y_norm_test, y_pred))
print(classification_report(y_norm_test, y_pred))

print("Test data balanced accuracy: ", balanced_accuracy_score(y_norm_test, y_pred))
print("Test recall score: ", recall_score(y_norm_test, y_pred))

[[709 164]
 [ 40  22]]
              precision    recall  f1-score   support

           0       0.95      0.81      0.87       873
           1       0.12      0.35      0.18        62

    accuracy                           0.78       935
   macro avg       0.53      0.58      0.53       935
weighted avg       0.89      0.78      0.83       935

Test data balanced accuracy:  0.583490374311791
Test recall score:  0.3548387096774194


#####  The binary keras model on normed, SMOTEEN data performs poorly

### Keras model using normalized data, balanced class weighting, no resampling strategy

In [284]:
# Calculated weights to use
from sklearn.utils import class_weight

# Calculate the weights for each class so that we can balance the data
weights = class_weight.compute_class_weight('balanced',
                                            [0, 1],
                                            np.array(y_norm_train))
weights

array([0.53533231, 7.57567568])

In [300]:
# using normalized training data

weights = {0:.54, 1:7.58}

n_cols = X_norm_train.shape[1]

model = Sequential()
model.add(Dense(50, activation='relu', input_shape=(n_cols,)))
model.add(Dense(25, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

early_stopping_monitor = EarlyStopping(patience=3, monitor='loss')

model.compile(optimizer="adam", loss='binary_crossentropy', metrics=["accuracy"])
model.fit(X_norm_train, y_norm_train, callbacks=[early_stopping_monitor], 
          epochs=50, verbose=2, class_weight=weights, validation_split=.3)



Epoch 1/50
62/62 - 1s - loss: 0.7049 - accuracy: 0.8007 - val_loss: 0.7047 - val_accuracy: 0.4899
Epoch 2/50
62/62 - 0s - loss: 0.6692 - accuracy: 0.6040 - val_loss: 0.6810 - val_accuracy: 0.5981
Epoch 3/50
62/62 - 0s - loss: 0.6293 - accuracy: 0.6988 - val_loss: 0.6303 - val_accuracy: 0.6730
Epoch 4/50
62/62 - 0s - loss: 0.5878 - accuracy: 0.7039 - val_loss: 0.5278 - val_accuracy: 0.7479
Epoch 5/50
62/62 - 0s - loss: 0.5575 - accuracy: 0.7258 - val_loss: 0.6800 - val_accuracy: 0.6171
Epoch 6/50
62/62 - 0s - loss: 0.5360 - accuracy: 0.6927 - val_loss: 0.5158 - val_accuracy: 0.7337
Epoch 7/50
62/62 - 0s - loss: 0.5202 - accuracy: 0.7396 - val_loss: 0.5781 - val_accuracy: 0.6813
Epoch 8/50
62/62 - 0s - loss: 0.5097 - accuracy: 0.7105 - val_loss: 0.5554 - val_accuracy: 0.7004
Epoch 9/50
62/62 - 0s - loss: 0.5031 - accuracy: 0.7319 - val_loss: 0.5685 - val_accuracy: 0.6849
Epoch 10/50
62/62 - 0s - loss: 0.4963 - accuracy: 0.7207 - val_loss: 0.5210 - val_accuracy: 0.7158
Epoch 11/50
62/62 -

<tensorflow.python.keras.callbacks.History at 0x14d4853a0>

In [301]:
model.evaluate(X_norm_test, y_norm_test)



[0.47032326459884644, 0.7465240359306335]

In [302]:
y_pred = model.predict(X_norm_test)
y_pred = np.round(y_pred)
print(confusion_matrix(y_norm_test, y_pred))
print(classification_report(y_norm_test, y_pred))

print("Test data balanced accuracy: ", balanced_accuracy_score(y_norm_test, y_pred))
print("Test recall score: ", recall_score(y_norm_test, y_pred))

[[665 208]
 [ 29  33]]
              precision    recall  f1-score   support

           0       0.96      0.76      0.85       873
           1       0.14      0.53      0.22        62

    accuracy                           0.75       935
   macro avg       0.55      0.65      0.53       935
weighted avg       0.90      0.75      0.81       935

Test data balanced accuracy:  0.6469995935409969
Test recall score:  0.532258064516129


##### This model performs slightly better, but both significantly under perform logistic regression.

Maybe with enough time and effort, a neural network might perform as well as or better than a different machine learning model.  With other ML models available, it seems better to spend time to tune those.