In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.metrics import make_scorer, accuracy_score, recall_score, f1_score, roc_auc_score, precision_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import ADASYN

### Read Dataset

In [3]:
patients = pd.read_csv('healthcare-dataset-stroke-data-preprocessed.csv')
patients.sample(10)

Unnamed: 0,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,smoking_status,stroke,is_male,is_urban,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
1355,0.902344,0.0,0.0,1.0,0.182578,0.107675,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
424,0.829102,0.0,0.0,1.0,0.112917,0.136312,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1481,0.487305,0.0,0.0,1.0,0.256117,0.263459,0.5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4336,0.975586,0.0,1.0,1.0,0.32407,0.258877,0.166667,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3401,0.206543,0.0,0.0,0.0,0.06366,0.145475,0.166667,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
530,0.87793,1.0,0.0,1.0,0.271859,0.246277,0.5,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
232,0.890137,0.0,0.0,1.0,0.073031,0.27606,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3558,0.816895,0.0,0.0,1.0,0.122611,0.230241,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3462,1.0,0.0,0.0,1.0,0.735805,0.15693,0.5,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3294,0.743652,0.0,0.0,1.0,0.39627,0.271478,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


### Functions

In [9]:
def get_predict_after_augmentation(clf, X_train, y_train, X_test):
    adasyn = ADASYN(n_neighbors=3)
    X_balance, y_balance = adasyn.fit_resample(X_train, y_train.astype('int'))
    predict = clf.fit(X_balance, y_balance).predict(X_test)
    return predict

def get_metric_scores(predict, actual):
    scores = {}
    scores['accuracy'] = accuracy_score(actual, predict)
    scores['precision'] = precision_score(actual, predict)
    scores['recall'] = recall_score(actual, predict)
    scores['f1'] = f1_score(actual, predict)
    scores['ROC'] = roc_auc_score(actual, predict)
    return scores

In [37]:
def get_k_cross_validation_metrics(clf, X, y, k):
    skf = StratifiedKFold(n_splits=k, shuffle=True)
    skf_splited = skf.split(X, y)
    
    metrics = ['accuracy', 'precision', 'recall','f1', 'ROC']
    cv_results = {k:np.array([]) for k in metrics}
    for i, (train_index, test_index) in enumerate(skf_splited):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
#         y_predict = get_predict_after_augmentation(clf, X_train, y_train, X_test)
        y_predict = clf.fit(X_train, y_train).predict(X_test)
        scores = get_metric_scores(y_predict, y_test)
        
        for metric in scores.keys():
            cv_results[metric] = np.append(cv_results[metric], scores[metric])
        
    return cv_results

In [11]:
def name(cls):
    return cls.__class__.__name__

def get_accuracys(classifiers, X, y, k):
    cls_accs = {name(cls):[] for cls in classifiers}
    for cls in classifiers:
        cv_results = get_k_cross_validation_metrics(cls, X, y, k)
        cls_accs[name(cls)] = cv_results
    return cls_accs

In [19]:
def build_result(cv_results):
    metrics = ['accuracy', 'precision', 'recall','f1', 'ROC']
    result = pd.DataFrame([], columns=["classifier", *metrics])
    for cls, cls_metrics in cv_results.items():
        scores = {metric:cls_metrics[metric].mean() for metric in metrics}
        scores["classifier"] = cls
        result = result.append(scores, ignore_index=True)
    return result

### Model Training

In [8]:
X = patients.drop(["stroke"], axis=1)
y = patients.stroke.astype('int')

In [28]:
k_folds = 10
knn = KNeighborsClassifier(n_neighbors=5)
svc = svm.SVC(probability=True)
adaBoost = AdaBoostClassifier(n_estimators=100) # default n_estimators=50
classifiers = [knn, svc, adaBoost]
cv_results = get_accuracys(classifiers, X, y, k_folds)

In [29]:
result = build_result(cv_results)
result.head()

  result = result.append(scores, ignore_index=True)
  result = result.append(scores, ignore_index=True)
  result = result.append(scores, ignore_index=True)


Unnamed: 0,classifier,accuracy,precision,recall,f1,ROC
0,KNeighborsClassifier,0.820548,0.122174,0.4345,0.190306,0.637432
1,SVC,0.755969,0.121383,0.646833,0.20425,0.704204
2,AdaBoostClassifier,0.757926,0.124758,0.661833,0.209801,0.712319


In [30]:
result.to_excel("PredictResult(balanced-adasyn-revised).xlsx", sheet_name="balanced", float_format="%.4f", index=False)

### Train Model with Feature Selected

In [31]:
patients = pd.read_csv('healthcare-dataset-stroke-data-preprocessed.csv')
feature_selected = patients[["age", "hypertension", "heart_disease", "ever_married", "avg_glucose_level", "bmi", "smoking_status", "work_type_Self-employed", "stroke"]]
X = feature_selected.drop(["stroke"], axis=1)
y = feature_selected.stroke
feature_selected.sample(5)

Unnamed: 0,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,smoking_status,work_type_Self-employed,stroke
20,0.865723,0.0,0.0,1.0,0.640846,0.138603,1.0,0.0,1.0
4860,0.012207,0.0,0.0,0.0,0.250254,0.090493,0.0,0.0,0.0
1385,0.54834,0.0,0.0,1.0,0.156634,0.200458,0.5,0.0,0.0
137,0.926758,0.0,0.0,1.0,0.702428,0.281787,0.5,0.0,1.0
4368,0.487305,0.0,0.0,1.0,0.212076,0.252005,1.0,0.0,0.0


In [32]:
selected_feature_cv_results = get_accuracys(classifiers, X, y, k_folds)
result = build_result(selected_feature_cv_results)

  result = result.append(scores, ignore_index=True)
  result = result.append(scores, ignore_index=True)
  result = result.append(scores, ignore_index=True)


In [33]:
result.head()

Unnamed: 0,classifier,accuracy,precision,recall,f1,ROC
0,KNeighborsClassifier,0.806458,0.122136,0.4785,0.19444,0.650892
1,SVC,0.712524,0.121944,0.787667,0.211035,0.748181
2,AdaBoostClassifier,0.740313,0.124171,0.711167,0.211263,0.726495


In [34]:
result.to_excel("PredictResult(feature_selected-revised).xlsx", float_format="%.4f", index=False)

In [29]:
patients = pd.read_csv('healthcare-dataset-stroke-data-preprocessed.csv')
feature_selected = patients[["age", "hypertension", "heart_disease", "ever_married", "avg_glucose_level", "bmi", "smoking_status", "work_type_Self-employed", "stroke"]]
X = feature_selected.drop(["stroke"], axis=1)
y = feature_selected.stroke
feature_selected.sample(5)

Unnamed: 0,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,smoking_status,work_type_Self-employed,stroke
41,0.658203,0.0,0.0,1.0,0.074324,0.208477,0.0,0.0,1.0
354,0.389648,0.0,0.0,1.0,0.198366,0.170676,1.0,0.0,0.0
3462,1.0,0.0,0.0,1.0,0.735805,0.15693,0.5,1.0,0.0
3221,0.79248,0.0,0.0,1.0,0.779152,0.20504,0.5,0.0,0.0
771,0.267578,0.0,0.0,1.0,0.288847,0.185567,0.0,0.0,0.0


In [31]:
feature_selected.stroke.value_counts()

0.0    4861
1.0     249
Name: stroke, dtype: int64

In [41]:
k_folds = 10
# knn = KNeighborsClassifier(n_neighbors=5)
# adaBoost = AdaBoostClassifier(n_estimators=100, class_weight="balanced")
forest = RandomForestClassifier(n_estimators=100, class_weight="balanced")
svc = svm.SVC(kernel="linear", class_weight='balanced', probability=True)
classifiers = [svc, forest]
cv_results = get_accuracys(classifiers, X, y, k_folds)
result = build_result(cv_results)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  result = result.append(scores, ignore_index=True)
  result = result.append(scores, ignore_index=True)


In [42]:
result.head()

Unnamed: 0,classifier,accuracy,precision,recall,f1,ROC
0,SVC,0.719569,0.130178,0.831167,0.22496,0.772508
1,RandomForestClassifier,0.950685,0.216667,0.016,0.029385,0.50728


In [None]:
result.to_excel("PredictResult().xlsx", float_format="%.4f", index=False)