In [31]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier
from sklearn.impute import KNNImputer


from sklearn.metrics import accuracy_score, f1_score

import numpy as np 
import pandas as pd 
import os 



In [32]:
# Read the raw stroke dataset from the project's Resources folder and preview
# the first five rows.
data = pd.read_csv(filepath_or_buffer="Resources/healthcare-dataset-stroke-data.csv")
data.head(5)

Unnamed: 0,app_id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [43]:
def onehot_encode(df, column):
    """Return a new frame in which ``column`` is replaced by indicator columns.

    One indicator column is produced per distinct value of ``df[column]``,
    named ``<column>_<value>``. The indicator columns are appended after the
    remaining original columns, and the caller's frame is left untouched.
    """
    indicator_columns = pd.get_dummies(df[column], prefix=column)
    other_columns = df.drop(columns=column)
    return pd.concat([other_columns, indicator_columns], axis=1)

In [46]:
def _encode_binary(series, mapping):
    """Map a two-valued label column to integers, rejecting unknown labels."""
    encoded = series.map(mapping)
    # ``map`` turns unmapped labels into NaN; fail loudly instead of letting
    # them be silently imputed further down. Genuine missing values pass through.
    unknown = series.notna() & encoded.isna()
    if unknown.any():
        raise ValueError(
            "Unexpected value(s) in column {!r}: {}".format(
                series.name, sorted(series[unknown].astype(str).unique())
            )
        )
    return encoded


def preprocess_inputs(df, train_size=0.7, random_state=1, stratify=False):
    """Turn the raw stroke frame into imputed, scaled train/test splits.

    Steps, in order: drop the ``app_id`` identifier, binary-encode
    ``ever_married`` and ``Residence_type``, one-hot encode ``gender``,
    ``work_type`` and ``smoking_status``, split into train and test rows,
    then fit a ``KNNImputer`` and a ``StandardScaler`` on the training rows
    only and apply both to each split.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data holding the columns named above plus the ``stroke`` target.
        The caller's frame is not modified.
    train_size : float or int, default 0.7
        Forwarded to ``train_test_split``.
    random_state : int or None, default 1
        Seed for the shuffled split, forwarded to ``train_test_split``.
    stratify : bool, default False
        When True the split is stratified on ``stroke`` so both splits keep
        the same class ratio. The default keeps the original unstratified
        split.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Imputed and standardised features, keeping the original row index
        and column names.
    y_train, y_test : pandas.Series
        The matching ``stroke`` targets.

    Raises
    ------
    ValueError
        If ``ever_married`` or ``Residence_type`` contains a label other than
        the two expected ones.
    """
    df = df.copy()

    # Drop id column
    df = df.drop('app_id', axis=1)

    # Binary encoding. An explicit mapping is used instead of ``replace`` so
    # the result does not depend on pandas' implicit object-to-int downcast.
    df['ever_married'] = _encode_binary(df['ever_married'], {'No': 0, 'Yes': 1})
    df['Residence_type'] = _encode_binary(df['Residence_type'], {'Rural': 0, 'Urban': 1})

    # One-hot encoding
    for column in ['gender', 'work_type', 'smoking_status']:
        df = onehot_encode(df, column=column)

    # Split df into X and y
    y = df['stroke']
    X = df.drop('stroke', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        train_size=train_size,
        shuffle=True,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    # KNN imputation of missing values; fitted on the training rows only so
    # that no statistics from the test rows are used.
    imputer = KNNImputer()
    imputer.fit(X_train)
    X_train = pd.DataFrame(imputer.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(imputer.transform(X_test), index=X_test.index, columns=X_test.columns)

    # Scale X, again fitting on the training rows only.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test
    

In [47]:
# Run the preprocessing pipeline on the loaded frame and unpack the four
# objects it returns: train/test features and train/test targets.
X_train, X_test, y_train, y_test = preprocess_inputs(data)


In [50]:
# Baseline classifiers, fitted on the training split as returned by
# preprocess_inputs. Estimators that draw random numbers are seeded so that a
# fresh kernel reproduces the same fitted models and scores.
models = {
    " Logistic Regression": LogisticRegression(),
    " K-Nearest Nieghbors": KNeighborsClassifier(),
    " Decision Tree": DecisionTreeClassifier(random_state=1),
    " Support Vector Machine (RBF Kernel)": SVC(),
    " Support Vector Machine (Linear Kernel)": LinearSVC(random_state=1),
    " Neural Network": MLPClassifier(random_state=1),
    " Random Forest": RandomForestClassifier(random_state=1),
    " Gradient Boosting": GradientBoostingClassifier(random_state=1),
}

for name, model in models.items():
    model.fit(X_train, y_train)
    # Keep a separator between the label and the status text; without it the
    # two run together in the printed line.
    print(name + " trained.")

 Logistic Regressiontrained
 K-Nearest Nieghborstrained
 Decision Treetrained
 Support Vector Machine (RBF Kernel)trained




 Support Vector Machine (Linear Kernel)trained




 Neural Networktrained
 Random Foresttrained
 Gradient Boostingtrained


In [54]:
print("Model Scores\n ----------------------------")

# Score every fitted model on the held-out test split: accuracy as a
# percentage, F1 as a raw fraction.
for name, model in models.items():
    y_pred = model.predict(X_test)
    accuracy_pct = accuracy_score(y_test, y_pred) * 100
    f1 = f1_score(y_test, y_pred)
    metrics_text = " Accuracy: {:.3f}%\n\t\t\t\t       F1-Score: {:.5f}".format(accuracy_pct, f1)
    print("\n" + name + metrics_text)

Model Scores
 ----------------------------

 Logistic Regression Accuracy: 94.586%
				       F1-Score: 0.00000

 K-Nearest Nieghbors Accuracy: 94.260%
				       F1-Score: 0.00000

 Decision Tree Accuracy: 90.802%
				       F1-Score: 0.16568

 Support Vector Machine (RBF Kernel) Accuracy: 94.586%
				       F1-Score: 0.00000

 Support Vector Machine (Linear Kernel) Accuracy: 94.586%
				       F1-Score: 0.00000

 Neural Network Accuracy: 94.064%
				       F1-Score: 0.02151

 Random Forest Accuracy: 94.455%
				       F1-Score: 0.00000

 Gradient Boosting Accuracy: 94.586%
				       F1-Score: 0.00000


In [55]:
# Fix Class Imbalance
#
# Random oversampling: rows with stroke == 1 are drawn with replacement until
# both classes have the same number of training rows. Only the training split
# is resampled; the test split is left as it is.

class_counts = y_train.value_counts()
num_samples = class_counts[0] - class_counts[1]

train_with_target = pd.concat([X_train, y_train], axis=1)
stroke_rows = train_with_target[train_with_target['stroke'] == 1]
new_samples = stroke_rows.sample(num_samples, replace=True, random_state=1)

# Append the duplicated rows, shuffle, and renumber the index.
oversampled_data = (
    pd.concat([train_with_target, new_samples], axis=0)
    .sample(frac=1.0, random_state=1)
    .reset_index(drop=True)
)

X_train_oversampled = oversampled_data.drop('stroke', axis=1)
y_train_oversampled = oversampled_data['stroke']


In [59]:
# Fresh, unfitted copies of the same classifiers, this time fitted on the
# oversampled training data. Estimators that draw random numbers are seeded so
# that a fresh kernel reproduces the same fitted models and scores.
models = {
    " Logistic Regression": LogisticRegression(),
    " K-Nearest Nieghbors": KNeighborsClassifier(),
    " Decision Tree": DecisionTreeClassifier(random_state=1),
    " Support Vector Machine (RBF Kernel)": SVC(),
    " Support Vector Machine (Linear Kernel)": LinearSVC(random_state=1),
    " Neural Network": MLPClassifier(random_state=1),
    " Random Forest": RandomForestClassifier(random_state=1),
    " Gradient Boosting": GradientBoostingClassifier(random_state=1),
}

for name, model in models.items():
    model.fit(X_train_oversampled, y_train_oversampled)
    print(name + " trained.")

 Logistic Regression trained.
 K-Nearest Nieghbors trained.
 Decision Tree trained.
 Support Vector Machine (RBF Kernel) trained.




 Support Vector Machine (Linear Kernel) trained.




 Neural Network trained.
 Random Forest trained.
 Gradient Boosting trained.


In [60]:
print("Model Performance\n-----------------")

# Same report as before, now for the models fitted on the oversampled data;
# scoring still uses the untouched test split.
for name, model in models.items():
    y_pred = model.predict(X_test)
    scores = (accuracy_score(y_test, y_pred) * 100, f1_score(y_test, y_pred))
    print("\n" + name + " Accuracy: {:.3f}%\n\t\t\t\t       F1-Score: {:.5f}".format(*scores))

Model Performance
-----------------

 Logistic Regression Accuracy: 73.190%
				       F1-Score: 0.25678

 K-Nearest Nieghbors Accuracy: 85.845%
				       F1-Score: 0.11429

 Decision Tree Accuracy: 92.237%
				       F1-Score: 0.16783

 Support Vector Machine (RBF Kernel) Accuracy: 78.082%
				       F1-Score: 0.19617

 Support Vector Machine (Linear Kernel) Accuracy: 72.929%
				       F1-Score: 0.25760

 Neural Network Accuracy: 85.975%
				       F1-Score: 0.16342

 Random Forest Accuracy: 93.803%
				       F1-Score: 0.04040

 Gradient Boosting Accuracy: 79.256%
				       F1-Score: 0.22439
