In [1]:
# Imports
import getpass
import psycopg2
import math
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE
from collections import Counter

In [2]:
# Reading Data
username = input("What is your Postgres Username? (postgres by default)")

What is your Postgres Username? (postgres by default)postgres


In [3]:
# Reading Data
password = getpass.getpass(prompt= "What is your Postgres Password?")

What is your Postgres Password?········


In [4]:
# Reading Data
port = input("What is your Postgres Port number?")

What is your Postgres Port number?5432


In [5]:
# Reading Data
conn = f'postgresql://{username}:{password}@localhost:{port}/SanAntonio_Stroke_Pred'.format(username, password, port)

In [6]:
engine = create_engine(conn)

In [7]:
# Get Data
medical_df = pd.read_sql_query('SELECT personal."Identifier", medical."Age", \
                           medical."Gender", personal."Work_Type", personal."Residence_Type", \
                           personal."Ever_Married", medical."Hypertension", medical."Heart_Disease",\
                           medical."Avg_Glucose_Lvl", medical."BMI", medical."Smoker", personal."Stroke"\
                                FROM personal\
                                INNER JOIN medical\
                                ON personal."Identifier" = medical."Identifier";', conn)

In [8]:
# Get data info
medical_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5109 entries, 0 to 5108
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Identifier       5109 non-null   int64  
 1   Age              5109 non-null   int64  
 2   Gender           5109 non-null   object 
 3   Work_Type        5109 non-null   object 
 4   Residence_Type   5109 non-null   object 
 5   Ever_Married     5109 non-null   object 
 6   Hypertension     5109 non-null   int64  
 7   Heart_Disease    5109 non-null   int64  
 8   Avg_Glucose_Lvl  5109 non-null   float64
 9   BMI              4908 non-null   float64
 10  Smoker           5109 non-null   object 
 11  Stroke           5109 non-null   int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 479.1+ KB


In [9]:
# Check missing data
medical_df.isnull().sum()

Identifier           0
Age                  0
Gender               0
Work_Type            0
Residence_Type       0
Ever_Married         0
Hypertension         0
Heart_Disease        0
Avg_Glucose_Lvl      0
BMI                201
Smoker               0
Stroke               0
dtype: int64

In [10]:
# Check unique value counts
medical_df.nunique()

Identifier         5109
Age                  83
Gender                2
Work_Type             5
Residence_Type        2
Ever_Married          2
Hypertension          2
Heart_Disease         2
Avg_Glucose_Lvl    3978
BMI                 418
Smoker                4
Stroke                2
dtype: int64

In [11]:
# Drop ID# column
medical_df.drop(columns=['Identifier'], inplace=True)
medical_df.head()

Unnamed: 0,Age,Gender,Work_Type,Residence_Type,Ever_Married,Hypertension,Heart_Disease,Avg_Glucose_Lvl,BMI,Smoker,Stroke
0,67,Male,Private,Urban,Yes,0,1,228.69,36.6,Former,1
1,61,Female,Self-employed,Rural,Yes,0,0,202.21,,Never,1
2,80,Male,Private,Rural,Yes,0,1,105.92,32.5,Never,1
3,49,Female,Private,Urban,Yes,0,0,171.23,34.4,Current,1
4,79,Female,Self-employed,Rural,Yes,1,0,174.12,24.0,Never,1


In [12]:
# Stroke value counts
medical_df['Stroke'].value_counts()

0    4860
1     249
Name: Stroke, dtype: int64

In [13]:
# Get a list of categorical columns
categorical_columns = medical_df.dtypes[medical_df.dtypes=='object'].index.tolist()
categorical_columns

['Gender', 'Work_Type', 'Residence_Type', 'Ever_Married', 'Smoker']

In [14]:
# Get value counts for categorical columns
for i in range(len(categorical_columns)):
    print(medical_df[categorical_columns[i]].value_counts())

Female    2994
Male      2115
Name: Gender, dtype: int64
Private          2924
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: Work_Type, dtype: int64
Urban    2596
Rural    2513
Name: Residence_Type, dtype: int64
Yes    3353
No     1756
Name: Ever_Married, dtype: int64
Never      1892
Unknown    1544
Former      884
Current     789
Name: Smoker, dtype: int64


In [15]:
# Create OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit & transform OneHotEncoder using categorical columns
encode_df = pd.DataFrame(enc.fit_transform(medical_df[categorical_columns]))

# Add column names
encode_df.columns = enc.get_feature_names(categorical_columns)
print(encode_df.shape)
encode_df.head(10)

(5109, 15)




Unnamed: 0,Gender_Female,Gender_Male,Work_Type_Govt_job,Work_Type_Never_worked,Work_Type_Private,Work_Type_Self-employed,Work_Type_children,Residence_Type_Rural,Residence_Type_Urban,Ever_Married_No,Ever_Married_Yes,Smoker_Current,Smoker_Former,Smoker_Never,Smoker_Unknown
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
6,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
8,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
9,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [16]:
# Get encode_df info
encode_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5109 entries, 0 to 5108
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Gender_Female            5109 non-null   float64
 1   Gender_Male              5109 non-null   float64
 2   Work_Type_Govt_job       5109 non-null   float64
 3   Work_Type_Never_worked   5109 non-null   float64
 4   Work_Type_Private        5109 non-null   float64
 5   Work_Type_Self-employed  5109 non-null   float64
 6   Work_Type_children       5109 non-null   float64
 7   Residence_Type_Rural     5109 non-null   float64
 8   Residence_Type_Urban     5109 non-null   float64
 9   Ever_Married_No          5109 non-null   float64
 10  Ever_Married_Yes         5109 non-null   float64
 11  Smoker_Current           5109 non-null   float64
 12  Smoker_Former            5109 non-null   float64
 13  Smoker_Never             5109 non-null   float64
 14  Smoker_Unknown          

In [17]:
# Drop redundant columns
encode_df.drop(columns=['Gender_Female', 'Ever_Married_No', 'Residence_Type_Rural'], inplace=True)
encode_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5109 entries, 0 to 5108
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Gender_Male              5109 non-null   float64
 1   Work_Type_Govt_job       5109 non-null   float64
 2   Work_Type_Never_worked   5109 non-null   float64
 3   Work_Type_Private        5109 non-null   float64
 4   Work_Type_Self-employed  5109 non-null   float64
 5   Work_Type_children       5109 non-null   float64
 6   Residence_Type_Urban     5109 non-null   float64
 7   Ever_Married_Yes         5109 non-null   float64
 8   Smoker_Current           5109 non-null   float64
 9   Smoker_Former            5109 non-null   float64
 10  Smoker_Never             5109 non-null   float64
 11  Smoker_Unknown           5109 non-null   float64
dtypes: float64(12)
memory usage: 479.1 KB


In [18]:
# Merge encoded df with medical df
medical_df = medical_df.merge(encode_df, left_index=True, right_index=True).drop(categorical_columns, axis=1)
print(medical_df.shape)
medical_df.head(10)

(5109, 18)


Unnamed: 0,Age,Hypertension,Heart_Disease,Avg_Glucose_Lvl,BMI,Stroke,Gender_Male,Work_Type_Govt_job,Work_Type_Never_worked,Work_Type_Private,Work_Type_Self-employed,Work_Type_children,Residence_Type_Urban,Ever_Married_Yes,Smoker_Current,Smoker_Former,Smoker_Never,Smoker_Unknown
0,67,0,1,228.69,36.6,1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
1,61,0,0,202.21,,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,80,0,1,105.92,32.5,1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,49,0,0,171.23,34.4,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
4,79,1,0,174.12,24.0,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5,81,0,0,186.21,29.0,1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
6,74,1,1,70.09,27.4,1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7,69,0,0,94.39,22.8,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
8,59,0,0,76.15,,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
9,78,0,0,58.57,24.2,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0


SimpleImputer

In [19]:
# Create SimpleImputer instance to replace missing BMI feature values with median BMI
imputer = SimpleImputer(strategy='median')

# Fit SimpleImputer & transform data
med_transformed = imputer.fit_transform(medical_df)

In [20]:
# Add SimpleImputer outcome to dataframe
med_df_transformed = pd.DataFrame(med_transformed, columns=medical_df.columns)
print(med_df_transformed.shape)
med_df_transformed.head(10)

(5109, 18)


Unnamed: 0,Age,Hypertension,Heart_Disease,Avg_Glucose_Lvl,BMI,Stroke,Gender_Male,Work_Type_Govt_job,Work_Type_Never_worked,Work_Type_Private,Work_Type_Self-employed,Work_Type_children,Residence_Type_Urban,Ever_Married_Yes,Smoker_Current,Smoker_Former,Smoker_Never,Smoker_Unknown
0,67.0,0.0,1.0,228.69,36.6,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
1,61.0,0.0,0.0,202.21,28.1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,80.0,0.0,1.0,105.92,32.5,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,49.0,0.0,0.0,171.23,34.4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
4,79.0,1.0,0.0,174.12,24.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5,81.0,0.0,0.0,186.21,29.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
6,74.0,1.0,1.0,70.09,27.4,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7,69.0,0.0,0.0,94.39,22.8,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
8,59.0,0.0,0.0,76.15,28.1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
9,78.0,0.0,0.0,58.57,24.2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0


In [21]:
med_df_transformed.isnull().sum()

Age                        0
Hypertension               0
Heart_Disease              0
Avg_Glucose_Lvl            0
BMI                        0
Stroke                     0
Gender_Male                0
Work_Type_Govt_job         0
Work_Type_Never_worked     0
Work_Type_Private          0
Work_Type_Self-employed    0
Work_Type_children         0
Residence_Type_Urban       0
Ever_Married_Yes           0
Smoker_Current             0
Smoker_Former              0
Smoker_Never               0
Smoker_Unknown             0
dtype: int64

In [22]:
# Create feature & target datasets
X1 = med_df_transformed.drop(columns=['Stroke'])
y1 = med_df_transformed['Stroke']
print(X1.shape)
print(y1.shape)

(5109, 17)
(5109,)


In [23]:
# Split into training & testing sets
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=2, stratify=y1)
print(X1_train.shape)
print(X1_test.shape)
print(y1_train.shape)
print(y1_test.shape)

(4087, 17)
(1022, 17)
(4087,)
(1022,)


In [24]:
# Create StandardScaler instance
scaler = StandardScaler()

# Fit & transform
scaler.fit(X1_train)
X1_train_scaled = scaler.transform(X1_train)
X1_test_scaled = scaler.transform(X1_test)

KNNImputer

In [25]:
# Create KNNImputer instance
# n_neighbors = sqrt(N) where N = number of samples: https://towardsdatascience.com/how-to-find-the-optimal-value-of-k-in-knn-35d936e554eb
kimputer = KNNImputer(n_neighbors=int(math.sqrt(len(medical_df))))

# Fit KNNImputer & transform data
med2_transformed = kimputer.fit_transform(medical_df)

# Add imputed values to dataframe
med2_transformed_df = pd.DataFrame(med2_transformed, columns=medical_df.columns)
print(med2_transformed_df.shape)
med2_transformed_df.head()

(5109, 18)


Unnamed: 0,Age,Hypertension,Heart_Disease,Avg_Glucose_Lvl,BMI,Stroke,Gender_Male,Work_Type_Govt_job,Work_Type_Never_worked,Work_Type_Private,Work_Type_Self-employed,Work_Type_children,Residence_Type_Urban,Ever_Married_Yes,Smoker_Current,Smoker_Former,Smoker_Never,Smoker_Unknown
0,67.0,0.0,1.0,228.69,36.6,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
1,61.0,0.0,0.0,202.21,35.246479,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,80.0,0.0,1.0,105.92,32.5,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,49.0,0.0,0.0,171.23,34.4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
4,79.0,1.0,0.0,174.12,24.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [26]:
# Create feature & target dataframes
X2 = med2_transformed_df.drop(columns=['Stroke'])
y2 = med2_transformed_df['Stroke']
print(X2.shape)
print(y2.shape)

(5109, 17)
(5109,)


In [27]:
# Split data into training & testing sets
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, stratify=y2)
print(X2_train.shape)
print(X2_test.shape)
print(y2_train.shape)
print(y2_test.shape)

(4087, 17)
(1022, 17)
(4087,)
(1022,)


In [28]:
# Scale feature data with scaler instance from before
scaler.fit(X2_train)

# Fit & transform
X2_train_scaled = scaler.transform(X2_train)
X2_test_scaled = scaler.transform(X2_test)

SMOTE 

In [29]:
# Run SMOTE oversampling instance
X1_train_resampled, y1_train_resampled = SMOTE().fit_resample(X1_train_scaled, y1_train)
X2_train_resampled, y2_train_resampled = SMOTE().fit_resample(X2_train_scaled, y2_train)
# Check new stroke training distribution
print(Counter(y1_train_resampled))
print(Counter(y2_train_resampled))

Counter({0.0: 3888, 1.0: 3888})
Counter({0.0: 3888, 1.0: 3888})


In [30]:
# Create RandomForestClassifier instances
rf_model1 = RandomForestClassifier(n_estimators=100, bootstrap=False, max_depth=13, min_samples_split=2, random_state=2)
rf_model2 = RandomForestClassifier(n_estimators=100, bootstrap=False, max_depth=13, min_samples_split=2, random_state=2)

# Fit the models
rf_model1 = rf_model1.fit(X1_train_resampled, y1_train_resampled)
rf_model2 = rf_model2.fit(X2_train_resampled, y2_train_resampled)

In [31]:
# Evaluate rf_model1
y1_pred1 = rf_model1.predict(X1_test_scaled)
cm1 = confusion_matrix(y1_test, y1_pred1)
cm1_df = pd.DataFrame(cm1, index=['Stroke-', 'Stroke+'], columns=['Predicted-', 'Predicted+'])
print(classification_report(y1_test, y1_pred1))
print(f' Accuracy: {accuracy_score(y1_test, y1_pred1):.3f}; Precision: {precision_score(y1_test, y1_pred1):.3f}; Recall: {recall_score(y1_test, y1_pred1):.3f}')
cm1_df

              precision    recall  f1-score   support

         0.0       0.96      0.91      0.94       972
         1.0       0.12      0.24      0.16        50

    accuracy                           0.88      1022
   macro avg       0.54      0.58      0.55      1022
weighted avg       0.92      0.88      0.90      1022

 Accuracy: 0.880; Precision: 0.124; Recall: 0.240


Unnamed: 0,Predicted-,Predicted+
Stroke-,887,85
Stroke+,38,12


In [32]:
# Evaluate rf_model2
y2_pred1 = rf_model2.predict(X2_test_scaled)
cm2 = confusion_matrix(y2_test, y2_pred1)
cm2_df = pd.DataFrame(cm2, index=['Stroke-', 'Stroke+'], columns=['Predicted-', 'Predicted+'])
print(classification_report(y2_test, y2_pred1))
print(f' Accuracy: {accuracy_score(y2_test, y2_pred1):.3f}; Precision: {precision_score(y2_test, y2_pred1):.3f}; Recall: {recall_score(y2_test, y2_pred1):.3f}')
cm2_df

              precision    recall  f1-score   support

         0.0       0.96      0.91      0.93       972
         1.0       0.13      0.26      0.17        50

    accuracy                           0.88      1022
   macro avg       0.54      0.58      0.55      1022
weighted avg       0.92      0.88      0.90      1022

 Accuracy: 0.877; Precision: 0.127; Recall: 0.260


Unnamed: 0,Predicted-,Predicted+
Stroke-,883,89
Stroke+,37,13


In [33]:
# Create AdaBoostClassifier instances
ada_model1 = AdaBoostClassifier(n_estimators=128, random_state=2)
ada_model2 = AdaBoostClassifier(n_estimators=128, random_state=2)
# Fit ada_model1 to SimpleImputer dataset
ada_model1 = ada_model1.fit(X1_train_resampled, y1_train_resampled)

# Fit ada_model2 to the KNNImputer data set
ada_model2 = ada_model2.fit(X2_train_resampled, y2_train_resampled)


In [34]:
# Evaluate ada_model1
y1_pred2 = ada_model1.predict(X1_test_scaled)
cm1 = confusion_matrix(y1_test, y1_pred2)
cm1_df = pd.DataFrame(cm1, index=['Stroke-', 'Stroke+'], columns=['Predicted-', 'Predicted+'])
print(classification_report(y1_test, y1_pred2))
print(f' Accuracy: {accuracy_score(y1_test, y1_pred2):.3f}; Precision: {precision_score(y1_test, y1_pred2):.3f}; Recall: {recall_score(y1_test, y1_pred2):.3f}')
cm1_df

              precision    recall  f1-score   support

         0.0       0.97      0.86      0.91       972
         1.0       0.15      0.48      0.23        50

    accuracy                           0.84      1022
   macro avg       0.56      0.67      0.57      1022
weighted avg       0.93      0.84      0.88      1022

 Accuracy: 0.839; Precision: 0.147; Recall: 0.480


Unnamed: 0,Predicted-,Predicted+
Stroke-,833,139
Stroke+,26,24


In [35]:
y2_pred2 = ada_model2.predict(X2_test_scaled)
cm2 = confusion_matrix(y2_test, y2_pred2)
cm2_df = pd.DataFrame(cm2, index=['Stroke-', 'Stroke+'], columns=['Predicted-', 'Predicted+'])
print(classification_report(y2_test, y2_pred2))
print(f' Accuracy: {accuracy_score(y2_test, y2_pred2):.3f}; Precision: {precision_score(y2_test, y2_pred2):.3f}; Recall: {recall_score(y2_test, y2_pred2):.3f}')
cm2_df

              precision    recall  f1-score   support

         0.0       0.97      0.84      0.90       972
         1.0       0.13      0.48      0.21        50

    accuracy                           0.82      1022
   macro avg       0.55      0.66      0.55      1022
weighted avg       0.93      0.82      0.86      1022

 Accuracy: 0.820; Precision: 0.132; Recall: 0.480


Unnamed: 0,Predicted-,Predicted+
Stroke-,814,158
Stroke+,26,24
