## Stroke modeling with Keras

In [1]:
import numpy as np
import pandas as pd

In [5]:
# Load the three prepared versions of the stroke data (raw, norm, scaled).
# In every file the first CSV column is used as the row index.
stroke_raw_df, stroke_norm_df, stroke_scaled_df = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ("stroke_raw_df.csv", "stroke_norm_df.csv", "stroke_scaled_df.csv")
)

# Quick look at the raw version
stroke_raw_df.head(3)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Private,work_type_Self-employed,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,1,67.0,0,1,1,1,228.69,36.6,1,0,1,0,0,1,0,0
1,51676,0,61.0,0,0,1,0,202.21,33.2,1,0,0,1,0,0,1,0
2,31112,1,80.0,0,1,1,0,105.92,32.5,1,0,1,0,0,0,1,0


In [6]:
# Preview the first three rows of the "norm" version of the data
stroke_norm_df.head(3)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Private,work_type_Self-employed,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,1,0.736842,0,1,1,1,0.801173,0.313507,1,0,1,0,0,1,0,0
1,51676,0,0.631579,0,0,1,0,0.678875,0.271375,1,0,0,1,0,0,1,0
2,31112,1,0.964912,0,1,1,0,0.234159,0.262701,1,0,1,0,0,0,1,0


In [7]:
# Preview the first three rows of the "scaled" version of the data
stroke_scaled_df.head(3)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Private,work_type_Self-employed,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,1,0.874738,0,1,1,1,2.464581,0.819354,1,0,1,0,0,1,0,0
1,51676,0,0.499979,0,0,1,0,1.918619,0.342896,1,0,0,1,0,0,1,0
2,31112,1,1.686716,0,1,1,0,-0.066679,0.244801,1,0,1,0,0,0,1,0


### Create Train and Test data sets for each data set

In [16]:
from sklearn.model_selection import train_test_split

# Raw data: the target is the 'stroke' column, the features are every other
# column except the 'id' identifier.
y_raw = stroke_raw_df['stroke']
X_raw = stroke_raw_df.drop(columns=['id', 'stroke'])

# Hold out 25% of the rows for testing; stratify on the target so the class
# proportions are the same in both parts.
X_raw_train, X_raw_test, y_raw_train, y_raw_test = train_test_split(
    X_raw,
    y_raw,
    test_size=0.25,
    random_state=42,
    stratify=y_raw,
)

In [17]:
# Normed data: the target is the 'stroke' column, the features are every other
# column except the 'id' identifier.
y_norm = stroke_norm_df['stroke']
X_norm = stroke_norm_df.drop(columns=['id', 'stroke'])

# Hold out 25% of the rows for testing; stratify on the target so the class
# proportions are the same in both parts.
X_norm_train, X_norm_test, y_norm_train, y_norm_test = train_test_split(
    X_norm,
    y_norm,
    test_size=0.25,
    random_state=42,
    stratify=y_norm,
)

In [18]:
# Scaled data: the target is the 'stroke' column, the features are every other
# column except the 'id' identifier.
y_scaled = stroke_scaled_df['stroke']
X_scaled = stroke_scaled_df.drop(columns=['id', 'stroke'])

# Hold out 25% of the rows for testing; stratify on the target so the class
# proportions are the same in both parts.
X_scaled_train, X_scaled_test, y_scaled_train, y_scaled_test = train_test_split(
    X_scaled,
    y_scaled,
    test_size=0.25,
    random_state=42,
    stratify=y_scaled,
)

### Create new training sets with random over-sampling and cluster-centroid under-sampling

In [11]:
# Share of rows with stroke == 1, expressed as a percentage of all rows.
# The minority class is only about 6.6% of the data: a heavily imbalanced data set.
stroke_rows = stroke_raw_df.loc[stroke_raw_df['stroke'] == 1]
total_rows = len(stroke_raw_df)

minority_class_percent = (len(stroke_rows) / total_rows) * 100
minority_class_percent

6.6078116639914395

In [26]:
#Resample the training data with RandomOversampler and Clustered Centroids
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import ClusterCentroids

# define over/under sampling strategies
# The class counts printed by the resampling cells (392 vs 2618 after
# over-sampling, then 392 vs 522 after under-sampling) show sampling_strategy
# acting as the target minority/majority ratio: ~0.15, then ~0.75.
ros = RandomOverSampler(sampling_strategy=.15, random_state = 42)
cc = ClusterCentroids(sampling_strategy=.75, random_state=42)

# original number of each target class in the training split
print(Counter(y_raw_train))


Counter({0: 2618, 1: 185})


**Raw data, over and under sampling**

In [25]:
# Step 1: over-sample the raw training split
X_r_oversampled, y_r_oversampled = ros.fit_resample(X_raw_train, y_raw_train)
print(Counter(y_r_oversampled))

# Step 2: under-sample the over-sampled data to obtain the final resampled set
X_r_train_resampled, y_r_train_resampled = cc.fit_resample(X_r_oversampled, y_r_oversampled)
print(Counter(y_r_train_resampled))

Counter({0: 2618, 1: 392})
Counter({0: 522, 1: 392})


**Normed data, over and under sampling**

In [27]:
# Step 1: over-sample the normed training split
X_n_oversampled, y_n_oversampled = ros.fit_resample(X_norm_train, y_norm_train)
print(Counter(y_n_oversampled))

# Step 2: under-sample the over-sampled data to obtain the final resampled set
X_n_train_resampled, y_n_train_resampled = cc.fit_resample(X_n_oversampled, y_n_oversampled)
print(Counter(y_n_train_resampled))

Counter({0: 2618, 1: 392})
Counter({0: 522, 1: 392})


**Scaled data, over and under sampling**

In [28]:
# Step 1: over-sample the scaled training split
X_s_oversampled, y_s_oversampled = ros.fit_resample(X_scaled_train, y_scaled_train)
print(Counter(y_s_oversampled))

# Step 2: under-sample the over-sampled data to obtain the final resampled set
X_s_train_resampled, y_s_train_resampled = cc.fit_resample(X_s_oversampled, y_s_oversampled)
print(Counter(y_s_train_resampled))

Counter({0: 2618, 1: 392})
Counter({0: 522, 1: 392})


### Create new training sets with SMOTEENN

In [45]:
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours

# SMOTEENN: create new minority class data points, then clean up the data
# with Edited Nearest Neighbours.
# The class counts printed by the cells that apply it are all below the 2618
# majority rows of the training split, so with these settings the cleaning
# step appears to drop rows from both classes, not only the majority class.
# May lead to more false positives, but fewer false negatives.
# NOTE(review): restricting the cleaning to the majority class via
# enn=EditedNearestNeighbours(sampling_strategy='majority') was left disabled.

smote = SMOTEENN(random_state=42)

**Raw data, SMOTEENN**

In [46]:
# apply SMOTEENN to the raw training split and show the resulting class counts
X_r_train_smoteen, y_r_train_smoteen = smote.fit_resample(X_raw_train, y_raw_train)
print(Counter(y_r_train_smoteen))

Counter({1: 2410, 0: 1800})


**Normalized data, SMOTEENN**

In [48]:
# apply SMOTEENN to the normalized training split and show the resulting class counts
X_n_train_smoteen, y_n_train_smoteen = smote.fit_resample(X_norm_train, y_norm_train)
print(Counter(y_n_train_smoteen))

Counter({1: 2256, 0: 1985})


**Scaled data, SMOTEENN**

In [50]:
# apply SMOTEENN to the scaled training split and show the resulting class counts
X_s_train_smoteen, y_s_train_smoteen = smote.fit_resample(X_scaled_train, y_scaled_train)
print(Counter(y_s_train_smoteen))

Counter({1: 2489, 0: 2023})


## Simple Logistic Regression baseline for comparison

In [52]:
# Every entry below is an [X, y] pair.

# Training pairs built from the untouched training splits
raw = [X_raw_train, y_raw_train]
normed = [X_norm_train, y_norm_train]
scaled = [X_scaled_train, y_scaled_train]

# Training pairs after over-sampling followed by under-sampling
raw_sampled = [X_r_train_resampled, y_r_train_resampled]
normed_sampled = [X_n_train_resampled, y_n_train_resampled]
scaled_sampled = [X_s_train_resampled, y_s_train_resampled]

# Training pairs after SMOTEENN
raw_smoteen = [X_r_train_smoteen, y_r_train_smoteen]
normed_smoteen = [X_n_train_smoteen, y_n_train_smoteen]
scaled_smoteen = [X_s_train_smoteen, y_s_train_smoteen]

# Held-out test pairs, one per preprocessing
raw_test = [X_raw_test, y_raw_test]
norm_test = [X_norm_test, y_norm_test]
scaled_test = [X_scaled_test, y_scaled_test]

# All training variants, one row per preprocessing (raw, normed, scaled)
train_list = [
    raw, raw_sampled, raw_smoteen,
    normed, normed_sampled, normed_smoteen,
    scaled, scaled_sampled, scaled_smoteen,
]

test_list = [raw_test, norm_test, scaled_test]

In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, r2_score
from sklearn.metrics import confusion_matrix, classification_report

# A single LogisticRegression instance; the cells below call model.fit again
# for each training set they compare.
model = LogisticRegression(max_iter=5000, random_state=42)

# Class counts of the normed test split used for evaluation
print(Counter(y_norm_test))

Counter({0: 873, 1: 62})


In [54]:
# Baseline: train on the untouched normed training split, predict its test split
y_pred = model.fit(X_norm_train, y_norm_train).predict(X_norm_test)
confusion_matrix(y_norm_test, y_pred)

array([[873,   0],
       [ 62,   0]])

In [55]:
# Train on the over/under-sampled normed data, predict the normed test split
y_pred = model.fit(X_n_train_resampled, y_n_train_resampled).predict(X_norm_test)
confusion_matrix(y_norm_test, y_pred)

array([[640, 233],
       [ 24,  38]])

In [57]:
# Per-class report for the y_pred produced by the preceding cell (model fitted
# on the over/under-sampled normed data); re-run that cell first if y_pred has
# been overwritten since.
print(classification_report(y_norm_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.73      0.83       873
           1       0.14      0.61      0.23        62

    accuracy                           0.73       935
   macro avg       0.55      0.67      0.53       935
weighted avg       0.91      0.73      0.79       935



In [58]:
# Train on the SMOTEENN-resampled normed data, predict the normed test split
y_pred = model.fit(X_n_train_smoteen, y_n_train_smoteen).predict(X_norm_test)
confusion_matrix(y_norm_test, y_pred)

array([[599, 274],
       [ 13,  49]])

In [59]:
# Per-class report for the y_pred produced by the preceding cell (model fitted
# on the SMOTEENN-resampled normed data); re-run that cell first if y_pred has
# been overwritten since.
print(classification_report(y_norm_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.69      0.81       873
           1       0.15      0.79      0.25        62

    accuracy                           0.69       935
   macro avg       0.57      0.74      0.53       935
weighted avg       0.92      0.69      0.77       935



In [60]:
# Baseline: train on the untouched scaled training split, predict its test split
y_pred = model.fit(X_scaled_train, y_scaled_train).predict(X_scaled_test)
confusion_matrix(y_scaled_test, y_pred)

array([[873,   0],
       [ 62,   0]])

In [63]:
# Train on the over/under-sampled scaled data, predict the scaled test split
y_pred = model.fit(X_s_train_resampled, y_s_train_resampled).predict(X_scaled_test)

# Confusion matrix first, then the per-class report
print(confusion_matrix(y_scaled_test, y_pred))
print(classification_report(y_scaled_test, y_pred))

[[628 245]
 [ 22  40]]
              precision    recall  f1-score   support

           0       0.97      0.72      0.82       873
           1       0.14      0.65      0.23        62

    accuracy                           0.71       935
   macro avg       0.55      0.68      0.53       935
weighted avg       0.91      0.71      0.79       935



In [64]:
# Train on the SMOTEENN-resampled scaled data, predict the scaled test split
y_pred = model.fit(X_s_train_smoteen, y_s_train_smoteen).predict(X_scaled_test)

# Confusion matrix first, then the per-class report
print(confusion_matrix(y_scaled_test, y_pred))
print(classification_report(y_scaled_test, y_pred))

[[631 242]
 [ 14  48]]
              precision    recall  f1-score   support

           0       0.98      0.72      0.83       873
           1       0.17      0.77      0.27        62

    accuracy                           0.73       935
   macro avg       0.57      0.75      0.55       935
weighted avg       0.92      0.73      0.79       935



## Keras

In [66]:
from keras.models import Sequential
from keras.layers import Dense
from keras import metrics
from keras.callbacks import EarlyStopping
from keras.callbacks import History 