In [None]:
# Imports
import getpass
import math
from collections import Counter
from urllib.parse import quote

import numpy as np
import pandas as pd
import psycopg2
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sqlalchemy import create_engine

In [None]:
# Reading Data
# The prompt advertises "postgres" as the default, so honour it: a blank (or
# whitespace-only) answer falls back to "postgres" instead of producing an
# empty username in the connection URL.
username = input("What is your Postgres Username? (postgres by default)").strip() or "postgres"

In [None]:
# Reading Data: ask for the Postgres password via getpass so the typed value
# is not echoed into the notebook output.
password = getpass.getpass("What is your Postgres Password?")

In [None]:
# Reading Data: ask for the port of the local Postgres server.
# The answer is kept as the raw string typed by the user and is later
# interpolated into the connection URL.
port = input("What is your Postgres Port number?")

In [None]:
# Reading Data: assemble the SQLAlchemy connection URL for the local database.
# The credentials are percent-encoded so characters that are meaningful inside
# a URL (e.g. '@', ':', '/', '%') cannot corrupt it. safe='' also encodes '/',
# and spaces become %20 rather than '+'.
# The f-string already performs the interpolation; the previous trailing
# .format(...) call re-parsed the finished string and would mangle or reject
# any credential containing '{' or '}'.
conn = (
    f'postgresql://{quote(username, safe="")}:{quote(password, safe="")}'
    f'@localhost:{port}/SanAntonio_Stroke_Pred'
)

In [None]:
# Build the SQLAlchemy engine from the connection URL held in `conn`
engine = create_engine(conn)

In [None]:
# Get Data: join the `personal` and `medical` tables on "Identifier" and pull
# the predictor columns plus the "Stroke" label into one DataFrame.
# The query runs through the `engine` created earlier rather than the raw URL
# string, so the notebook uses a single, explicit database connection object.
medical_df = pd.read_sql_query('SELECT personal."Identifier", medical."Age", \
                           medical."Gender", personal."Work_Type", personal."Residence_Type", \
                           personal."Ever_Married", medical."Hypertension", medical."Heart_Disease",\
                           medical."Avg_Glucose_Lvl", medical."BMI", medical."Smoker", personal."Stroke"\
                                FROM personal\
                                INNER JOIN medical\
                                ON personal."Identifier" = medical."Identifier";', engine)

In [None]:
# Get data info: column names, dtypes and non-null counts of the joined table
medical_df.info()

In [None]:
# Count the missing entries in every column
medical_df.isna().sum()

In [None]:
# Check unique value counts: number of distinct values in each column
medical_df.nunique()

In [None]:
# Remove the "Identifier" column from the working frame
medical_df = medical_df.drop(columns=['Identifier'])
medical_df.head()

In [None]:
# Stroke value counts: how many rows fall into each class of the label column
medical_df['Stroke'].value_counts()

In [None]:
# Collect the names of all object-dtype (i.e. categorical/text) columns
categorical_columns = [
    column for column, dtype in medical_df.dtypes.items() if dtype == 'object'
]
categorical_columns

In [None]:
# Print the category frequencies of each categorical column in turn
for column in categorical_columns:
    print(medical_df[column].value_counts())

In [None]:
# Create OneHotEncoder instance.
# The `sparse=` keyword was renamed to `sparse_output=` and later removed, so
# no single spelling works on every scikit-learn release. Use the default
# (sparse) output and densify it explicitly instead.
enc = OneHotEncoder()

# Fit & transform OneHotEncoder using categorical columns
encoded = enc.fit_transform(medical_df[categorical_columns])
if hasattr(encoded, 'toarray'):
    encoded = encoded.toarray()

# Add column names. `get_feature_names` was removed in newer scikit-learn in
# favour of `get_feature_names_out`; fall back to the old name on old installs.
if hasattr(enc, 'get_feature_names_out'):
    encoded_names = enc.get_feature_names_out(categorical_columns)
else:
    encoded_names = enc.get_feature_names(categorical_columns)

# Carry over medical_df's index so the later index-based merge lines up row
# for row even if that index is not a plain 0..n-1 range.
encode_df = pd.DataFrame(encoded, columns=encoded_names, index=medical_df.index)
print(encode_df.shape)
encode_df.head(10)

In [None]:
# Get encode_df info: column names, dtypes and non-null counts of the encoded frame
encode_df.info()

In [None]:
# One indicator column per listed feature is dropped as redundant
# (its value is implied by the indicator columns that remain).
redundant_columns = ['Gender_Female', 'Ever_Married_No', 'Residence_Type_Rural']
encode_df = encode_df.drop(columns=redundant_columns)
encode_df.info()

In [None]:
# Attach the one-hot columns by row index, then discard the original
# text-valued categorical columns they replace.
medical_df = (
    medical_df
    .merge(encode_df, left_index=True, right_index=True)
    .drop(columns=categorical_columns)
)
print(medical_df.shape)
medical_df.head(10)

SimpleImputer

In [None]:
# Median imputation: every missing value is replaced by the median of its column.
# NOTE(review): the imputer is fit on the whole frame, before the train/test
# split performed further down, so test rows contribute to the medians.
imputer = SimpleImputer(strategy='median')

# Learn the per-column medians, then fill the gaps
med_transformed = imputer.fit(medical_df).transform(medical_df)

In [None]:
# Wrap the imputed array in a DataFrame that reuses medical_df's column labels
med_df_transformed = pd.DataFrame(data=med_transformed, columns=medical_df.columns)
print(med_df_transformed.shape)
med_df_transformed.head(10)

In [None]:
# Count the missing values left in each column after imputation
med_df_transformed.isna().sum()

In [None]:
# Separate the label ("Stroke") from the predictors for the SimpleImputer dataset
y1 = med_df_transformed['Stroke']
X1 = med_df_transformed.drop('Stroke', axis=1)
for dataset in (X1, y1):
    print(dataset.shape)

In [None]:
# 80/20 train/test split, stratified on the label and seeded for repeatability
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=2,
    stratify=y1,
)
for subset in (X1_train, X1_test, y1_train, y1_test):
    print(subset.shape)

In [None]:
# Standardise the SimpleImputer features.
# The scaler learns its statistics from the training split only and the same
# transformation is then applied to the test split.
scaler = StandardScaler()
X1_train_scaled = scaler.fit_transform(X1_train)
X1_test_scaled = scaler.transform(X1_test)

KNNImputer

In [None]:
# Create KNNImputer instance
# n_neighbors = sqrt(N) where N = number of samples: https://towardsdatascience.com/how-to-find-the-optimal-value-of-k-in-knn-35d936e554eb
kimputer = KNNImputer(n_neighbors=int(math.sqrt(len(medical_df))))

# Fit KNNImputer & transform data, using the predictor columns only.
# Handing the 'Stroke' label to the imputer would let the target decide which
# neighbours are used to fill in missing feature values (target leakage), which
# inflates the scores of every model trained on this dataset.
feature_columns = [column for column in medical_df.columns if column != 'Stroke']
imputed_features = kimputer.fit_transform(medical_df[feature_columns])

# Add imputed values to dataframe. Column order is unchanged; the frame is cast
# to float and re-indexed 0..n-1 so it has the same layout a whole-frame
# imputation would have produced.
med2_transformed_df = medical_df.astype(float).reset_index(drop=True)
med2_transformed_df[feature_columns] = imputed_features
med2_transformed = med2_transformed_df.to_numpy()
print(med2_transformed_df.shape)
med2_transformed_df.head()

In [None]:
# Separate the label ("Stroke") from the predictors for the KNNImputer dataset
y2 = med2_transformed_df['Stroke']
X2 = med2_transformed_df.drop('Stroke', axis=1)
for dataset in (X2, y2):
    print(dataset.shape)

In [None]:
# Split data into training & testing sets.
# random_state matches the split of the SimpleImputer dataset: without a seed
# this split changes on every run, and the two imputation strategies would be
# evaluated on different test rows, making their scores incomparable.
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=2, stratify=y2)
print(X2_train.shape)
print(X2_test.shape)
print(y2_train.shape)
print(y2_test.shape)

In [None]:
# Standardise the KNNImputer features with the existing `scaler` object.
# Note that this re-fits the scaler on X2_train, replacing the statistics it
# previously learned from X1_train.
X2_train_scaled = scaler.fit_transform(X2_train)
X2_test_scaled = scaler.transform(X2_test)

SMOTE 

In [None]:
# Run SMOTE oversampling instance on the (scaled) training splits only.
# SMOTE synthesises minority-class rows at random; seeding it keeps the
# resampled training sets, and every metric computed from them, reproducible
# across runs. The seed matches the one used elsewhere in the notebook.
X1_train_resampled, y1_train_resampled = SMOTE(random_state=2).fit_resample(X1_train_scaled, y1_train)
X2_train_resampled, y2_train_resampled = SMOTE(random_state=2).fit_resample(X2_train_scaled, y2_train)
# Check new stroke training distribution
print(Counter(y1_train_resampled))
print(Counter(y2_train_resampled))

In [None]:
# Both random forests share one hyper-parameter set, so the only difference
# between them is the imputation strategy behind their training data.
rf_params = dict(n_estimators=100, bootstrap=False, max_depth=13, min_samples_split=2, random_state=2)

# rf_model1 -> SimpleImputer dataset, rf_model2 -> KNNImputer dataset
rf_model1 = RandomForestClassifier(**rf_params).fit(X1_train_resampled, y1_train_resampled)
rf_model2 = RandomForestClassifier(**rf_params).fit(X2_train_resampled, y2_train_resampled)

In [None]:
# Score the SimpleImputer random forest on its held-out test split
y1_pred1 = rf_model1.predict(X1_test_scaled)

# Confusion matrix: rows are the true classes, columns the predicted classes
cm1 = confusion_matrix(y_true=y1_test, y_pred=y1_pred1)
cm1_df = pd.DataFrame(
    data=cm1,
    index=['Stroke-', 'Stroke+'],
    columns=['Predicted-', 'Predicted+'],
)

print(classification_report(y1_test, y1_pred1))
print(
    f' Accuracy: {accuracy_score(y1_test, y1_pred1):.3f};'
    f' Precision: {precision_score(y1_test, y1_pred1):.3f};'
    f' Recall: {recall_score(y1_test, y1_pred1):.3f}'
)
cm1_df

In [None]:
# Score the KNNImputer random forest on its held-out test split
y2_pred1 = rf_model2.predict(X2_test_scaled)

# Confusion matrix: rows are the true classes, columns the predicted classes
cm2 = confusion_matrix(y_true=y2_test, y_pred=y2_pred1)
cm2_df = pd.DataFrame(
    data=cm2,
    index=['Stroke-', 'Stroke+'],
    columns=['Predicted-', 'Predicted+'],
)

print(classification_report(y2_test, y2_pred1))
print(
    f' Accuracy: {accuracy_score(y2_test, y2_pred1):.3f};'
    f' Precision: {precision_score(y2_test, y2_pred1):.3f};'
    f' Recall: {recall_score(y2_test, y2_pred1):.3f}'
)
cm2_df

In [None]:
# Build and train one AdaBoost ensemble per imputation strategy, both with
# 128 estimators and the same seed.
# ada_model1 is trained on the SimpleImputer dataset
ada_model1 = AdaBoostClassifier(n_estimators=128, random_state=2).fit(
    X1_train_resampled, y1_train_resampled
)

# ada_model2 is trained on the KNNImputer dataset
ada_model2 = AdaBoostClassifier(n_estimators=128, random_state=2).fit(
    X2_train_resampled, y2_train_resampled
)


In [None]:
# Score the SimpleImputer AdaBoost model on its held-out test split
y1_pred2 = ada_model1.predict(X1_test_scaled)

# Confusion matrix: rows are the true classes, columns the predicted classes
cm1 = confusion_matrix(y_true=y1_test, y_pred=y1_pred2)
cm1_df = pd.DataFrame(
    data=cm1,
    index=['Stroke-', 'Stroke+'],
    columns=['Predicted-', 'Predicted+'],
)

print(classification_report(y1_test, y1_pred2))
print(
    f' Accuracy: {accuracy_score(y1_test, y1_pred2):.3f};'
    f' Precision: {precision_score(y1_test, y1_pred2):.3f};'
    f' Recall: {recall_score(y1_test, y1_pred2):.3f}'
)
cm1_df

In [None]:
# Score the KNNImputer AdaBoost model on its held-out test split
y2_pred2 = ada_model2.predict(X2_test_scaled)

# Confusion matrix: rows are the true classes, columns the predicted classes
cm2 = confusion_matrix(y_true=y2_test, y_pred=y2_pred2)
cm2_df = pd.DataFrame(
    data=cm2,
    index=['Stroke-', 'Stroke+'],
    columns=['Predicted-', 'Predicted+'],
)

print(classification_report(y2_test, y2_pred2))
print(
    f' Accuracy: {accuracy_score(y2_test, y2_pred2):.3f};'
    f' Precision: {precision_score(y2_test, y2_pred2):.3f};'
    f' Recall: {recall_score(y2_test, y2_pred2):.3f}'
)
cm2_df

In [None]:
# Build and train one linear-kernel SVC per imputation strategy, same seed for both.
# svc_model1 is trained on the SimpleImputer dataset
svc_model1 = SVC(kernel='linear', random_state=2).fit(
    X1_train_resampled, y1_train_resampled
)

# svc_model2 is trained on the KNNImputer dataset
svc_model2 = SVC(kernel='linear', random_state=2).fit(
    X2_train_resampled, y2_train_resampled
)

In [None]:
# Score the SimpleImputer SVC on its held-out test split
y1_pred3 = svc_model1.predict(X1_test_scaled)

# Confusion matrix: rows are the true classes, columns the predicted classes
cm1 = confusion_matrix(y_true=y1_test, y_pred=y1_pred3)
cm1_df = pd.DataFrame(
    data=cm1,
    index=['Stroke-', 'Stroke+'],
    columns=['Predicted-', 'Predicted+'],
)

print(classification_report(y1_test, y1_pred3))
print(
    f' Accuracy: {accuracy_score(y1_test, y1_pred3):.3f};'
    f' Precision: {precision_score(y1_test, y1_pred3):.3f};'
    f' Recall: {recall_score(y1_test, y1_pred3):.3f}'
)
cm1_df

In [None]:
# Score the KNNImputer SVC on its held-out test split
y2_pred3 = svc_model2.predict(X2_test_scaled)

# Confusion matrix: rows are the true classes, columns the predicted classes
cm2 = confusion_matrix(y_true=y2_test, y_pred=y2_pred3)
cm2_df = pd.DataFrame(
    data=cm2,
    index=['Stroke-', 'Stroke+'],
    columns=['Predicted-', 'Predicted+'],
)

print(classification_report(y2_test, y2_pred3))
print(
    f' Accuracy: {accuracy_score(y2_test, y2_pred3):.3f};'
    f' Precision: {precision_score(y2_test, y2_pred3):.3f};'
    f' Recall: {recall_score(y2_test, y2_pred3):.3f}'
)
cm2_df