In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn import metrics, preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
import tabpy_client

In [2]:
#Read the original Kaggle CSV
stroke_df_initial = pd.read_csv(os.path.join('Resources/healthcare-dataset-stroke-data.csv'))
stroke_df_initial.tail()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.2,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0
5109,44679,Female,44.0,0,0,Yes,Govt_job,Urban,85.28,26.2,Unknown,0


In [3]:
#ID column is not necessary. Drop
stroke_df = stroke_df_initial.drop(['id'], axis = 1)
stroke_df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [4]:
#Checking for nulls
print(stroke_df.isna().sum())

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64


201 rows of NA for bmi column only. Consider Drop NA but do complete data exploration first.

In [5]:
#Checking number of entries, data types
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB


In [6]:
#Value counts for hypertension column to ensure only two choices
stroke_df["hypertension"].value_counts()

0    4612
1     498
Name: hypertension, dtype: int64

In [7]:
#Value counts for heart diseease column to ensure only two choices
stroke_df["heart_disease"].value_counts()

0    4834
1     276
Name: heart_disease, dtype: int64

In [8]:
#Value counts for stroke column to ensure only two choices
stroke_df["stroke"].value_counts()

0    4861
1     249
Name: stroke, dtype: int64

In [9]:
#Only 249 patients with stroke. How many will we lose if we Drop NA for the bmi column?
bmi_null_stroke = stroke_df.loc[(stroke_df['bmi'].isna() == True) & (stroke_df['stroke'] == 1)]
len(bmi_null_stroke)

40

40 is too many rows of valuable data to lose here. Consider other options. Insert mean into BMI? Other ideas?

In [10]:
#Check the value counts of all columns with datatype 'object' for a guide to encoding needs
#Check the value counts of all columns with datatype 'object' for a guide to encoding needs
counts = stroke_df.select_dtypes(include=object).columns.tolist()
(pd.DataFrame(
    stroke_df[counts]
    .melt(var_name='column', value_name='value')
    .value_counts())
.rename(columns={0: 'counts'})
.sort_values(by=['column', 'counts']))

TypeError: rename() got an unexpected keyword argument 'columns'

Note that gender only has one 'other'; probably easier to remove that point for binary encoding. Note that one of the work_types is "children," meaning the dataset includes children. Look at age more closely.


In [None]:
#Examining age column
stroke_df.sort_values(by=['age']).head()

This dataset even contains babies. Children are likely less useful in predicting stroke, although these particular children may have some other health issues, which is why they were included. How many children actually are labeled with stroke?

In [None]:
#Children stroke count
children_stroke = stroke_df.loc[(stroke_df['age'] < 17) & (stroke_df['stroke'] == 1)]
len(children_stroke)

Remove children from the dataset in this instance and compare model performance with the children data left in vs deleted.

In [None]:
#Removing children from the dataset

no_children_df = stroke_df[stroke_df.age > 16]
no_children_df.sort_values(by=['age'])


In [None]:
#Ensure children is no longer listed as a work type
no_children_df.sort_values(by=['age'])
no_children_df['work_type'].value_counts()

In [None]:
#Drop the 'other' row in gender column
cleaned_stroke_df = no_children_df[no_children_df.gender != 'Other']
cleaned_stroke_df

In [None]:
# Use Pandas get_dummies to convert categorical data

encoded_df = pd.get_dummies(cleaned_stroke_df, columns=['gender','ever_married', 'Residence_type'])
encoded_df


In [None]:
#Drop the extra dummies columns
encoded_stroke_df = encoded_df.drop(columns=['gender_Female','ever_married_No','Residence_type_Rural'])
encoded_stroke_df

In [None]:
#Label encode the categorical columns that have more than two choices
# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder()
# Encode labels in column 'Country'. 
encoded_stroke_df['work_type']= label_encoder.fit_transform(encoded_stroke_df['work_type']) 
encoded_stroke_df.head()

In [None]:

# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder()
# Encode labels in column 'Country'. 
encoded_stroke_df['smoking_status']= label_encoder.fit_transform(encoded_stroke_df['smoking_status']) 
encoded_stroke_df.head()

In [None]:
#Histogram of bmi column to view outliers
encoded_stroke_df.hist(column = 'bmi')

In [None]:
#Sort by bmi to see highest values
encoded_stroke_df.sort_values(by=['bmi'], ascending = False).head()

In [None]:
#Determine bmi outliers using upper and lower bounds
quartiles = encoded_stroke_df['bmi'].quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
outliers = encoded_stroke_df.loc[(encoded_stroke_df['bmi'] < lower_bound) | (encoded_stroke_df['bmi'] > upper_bound)]
        
#Display upper & lower bounds and number of outliers
       
print(lower_bound)
print(upper_bound)
print(len(outliers))

In [None]:
#quick look at the data
encoded_stroke_df.describe()

In [None]:
#Get the median values to compare mean and median in bmi
encoded_stroke_df.median()

Mean and median are nearly identical. Either is an acceptable replacement.

In [None]:
#Set mean to a variable
mean_bmi = encoded_stroke_df['bmi'].mean()
mean_bmi

In [None]:
#replace NaN with mean bmi
replace_stroke_df = encoded_stroke_df.fillna(mean_bmi)
replace_stroke_df.tail()

In [None]:
#Make a correlation heatmap of all of the features
correlation = replace_stroke_df.corr()
plt.figure (figsize = (10,8))
sns.heatmap(correlation, annot = True)


Marital status had a small correlation with age. Age makes more sense to keep in the features for a medical model, so try removing ever_married to see the effect.


In [None]:
#Remove ever_married column and then train/test the model to see if it helps performance.
final_stroke_df = replace_stroke_df.drop(['ever_married_Yes'], axis = 1)



Removing the ever_married column does not make improvement to the model. Actually the F1 and precision drop one-tenth. Keep the ever_married column.

In [None]:
#Model work starts here

In [None]:
# Assign X (data) and y (target)
X = replace_stroke_df.drop(["stroke"], axis=1)
y = replace_stroke_df["stroke"]
print(X.shape, y.shape)

In [None]:
#Split data into training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
#Check the distribution of stroke/no stroke in the test data
y_test.value_counts()

The Gaussian Naive Bayes model is not likely to be very affected by feature scaling, especially given the small dataset and the fact that it weights features accordingly by design, but try a standard scaler and see if it affects the model performance.


In [None]:
# Create a StandardScater model and fit it to the training data

from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train)


# Transform the training and testing data using the X_scaler model. Y is the target so we don't scale the y.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

The model performance is exactly the same with scaling.

In [None]:
# Define the model (Gaussian Naive Bayes)
nbclf = GaussianNB()

#Fit (train) model using the training data
nbclf.fit(X_train, y_train)



In [None]:
#Try cross validation and calculate accuracy score as well as AUC ROC score for 10-fold
scores = cross_val_score(nbclf, X_test, y_test, cv=10, scoring='roc_auc')

# Show accuracy statistics for cross-validation

print("Accuracy: %0.3f" % (scores.mean()))
print("Aucroc: %0.3f" % metrics.roc_auc_score(y_test, cross_val_predict(nbclf, X_test, y_test, cv=10)))

In [None]:
#Try KFold cross validation

kfold = KFold(n_splits= 10, shuffle = True, random_state = 42)
result = result = cross_val_score(nbclf, X_test, y_test, cv=kfold, scoring='accuracy')
print("Accuracy: %0.3f" % (result.mean()))
print("Aucroc: %0.3f" % metrics.roc_auc_score(y_test, cross_val_predict(nbclf, X_test, y_test, cv=kfold)))

Accuracy does not improve with Cross validation

In [None]:
#Make predictions for original test/train split
predictions = nbclf.predict(X_test)
sum(predictions)

In [None]:
#Confusion matrix to see correct and incorrect predictions for original test/train split

confusion_matrix = confusion_matrix(y_test, predictions)
print(confusion_matrix)

In [None]:
#Compute precision, recall F1-score and support
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

In [None]:
#Create an informative confusion matrix using seaborn

#Define variables for Information inside each box
category = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
counts = ['{0:0.0f}'.format(value) for value in confusion_matrix.flatten()]
percentages = ['{0:.1%}'.format(value) for value in confusion_matrix.flatten()/np.sum(confusion_matrix)]
labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(category, counts, percentages)]

#Convert all of the labels to an array and reshape
labels = np.asarray(labels).reshape(2,2)

#Change the 0,1 to no stroke and stroke for labels
tick_labels = ['No Stroke', 'Stroke']

#Calculate all of the values to be displayed underneath the matrix as xlabel. Calculate precision, recall, F1 for both
#stroke and no stroke
accuracy  = np.trace(confusion_matrix) / float(np.sum(confusion_matrix))
precision_stroke = confusion_matrix[1,1] / sum(confusion_matrix[:,1])
recall_stroke    = confusion_matrix[1,1] / sum(confusion_matrix[1,:])
f1_score_stroke  = 2*precision_stroke*recall_stroke / (precision_stroke + recall_stroke)
precision = confusion_matrix[0,0] / sum(confusion_matrix[:,0])
recall    = confusion_matrix[0,0] / sum(confusion_matrix[0,:])
f1_score  = 2*precision*recall / (precision + recall)

#Set up the text for the x label display underneath the matrix
stats_text = "\n\nAccuracy={:0.2f}\nPrecision (stroke)={:0.2f}, Precision (no stroke)={:0.2f}\nRecall (stroke)={:0.2f}, Recall (no stroke)={:0.2f}\nF1 Score (stroke)={:0.2f}, F1 Score (no stroke)={:0.2f} ".format(accuracy,precision_stroke, precision, recall_stroke, recall, f1_score_stroke, f1_score)

#Make the seaborn heatmap
sns.heatmap(confusion_matrix, annot = labels, fmt = '', xticklabels = tick_labels, yticklabels = tick_labels)
plt.xlabel(stats_text, fontsize = 14)
plt.title("Gaussian Naive Bayes model", fontsize = 14)



In [None]:
#Import SMOTE from imbalanced learn in order to create a balanced class dataset with synthetic samples
from imblearn.over_sampling import SMOTE
from collections import Counter
X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)
print(sorted(Counter(y_resampled).items()))

nbclf_smote = GaussianNB().fit(X_resampled, y_resampled)

In [None]:
#Make predictions for original test data but with model fitting after SMOTE synthetic data creation
predictions = nbclf_smote.predict(X_test)
sum(predictions)

In [None]:
#Confusion matrix to see correct and incorrect predictions for original test data with SMOTE

confusion_matrix = confusion_matrix(y_test, predictions)
print(confusion_matrix)

In [None]:
#Compute precision, recall F1-score and support for original test data with SMOTE
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

Using SMOTE does not improve performance of our model. It only slightly improves recall; all other scores decrease.