In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.metrics import make_scorer, accuracy_score, recall_score, f1_score, roc_auc_score, precision_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier

### Read Dataset

In [27]:
patients = pd.read_csv('data-preprocessed-augmentation(ADASYN).csv')
patients.sample(10)

Unnamed: 0,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,smoking_status,stroke,is_male,is_urban,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
4066,0.963379,0.0,0.0,0.0,0.339765,0.237113,0.666667,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2814,0.755859,0.0,0.0,1.0,0.118179,0.17984,0.5,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1759,0.54834,0.0,0.0,1.0,0.560244,0.15693,0.5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4614,0.951172,0.0,0.0,1.0,0.028899,0.160367,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1045,0.230957,0.0,0.0,0.0,0.092143,0.130584,0.333333,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4242,0.487305,0.0,0.0,1.0,0.276752,0.304696,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
3169,0.682617,0.0,0.0,1.0,0.109916,0.238259,0.333333,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
7490,0.962324,0.0,0.0,1.0,0.047494,0.158175,0.5,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3015,0.658203,0.0,0.0,1.0,0.047364,0.178694,0.833333,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1737,0.951172,1.0,1.0,1.0,0.794202,0.359679,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


### Functions

In [5]:
def get_k_cross_validation_metrics(clf, X, y, k):
    skf = StratifiedKFold(n_splits=k, shuffle=True)
    cv_results = cross_validate(clf, X, y, cv=skf, scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'specificity': make_scorer(recall_score,pos_label=0),
    'ROC': make_scorer(roc_auc_score),
    })
    return cv_results

In [6]:
def name(cls):
    return cls.__class__.__name__

def get_accuracys(classifiers, X, y, k):
    cls_accs = {name(cls):[] for cls in classifiers}
    for cls in classifiers:
        cv_results = get_k_cross_validation_metrics(cls, X, y, k)
        cls_accs[name(cls)] = cv_results
    return cls_accs

In [19]:
def build_result(cv_results):
    metrics = ['accuracy', 'precision', 'recall','f1', 'specificity', 'ROC']
    result = pd.DataFrame([], columns=["classifier", *metrics])
    for cls, cls_metrics in cv_results.items():
        scores = {metric:cls_metrics['test_' + metric].mean() for metric in metrics}
        scores["classifier"] = cls
        result = result.append(scores, ignore_index=True)
    return result

### Model Training

In [25]:
X = patients.drop(["stroke"], axis=1)
y = patients.stroke

In [26]:
k_folds = 10
knn = KNeighborsClassifier(n_neighbors=5)
svc = svm.SVC(probability=True)
adaBoost = AdaBoostClassifier(n_estimators=100) # default n_estimators=50
classifiers = [knn, svc, adaBoost]
cv_results = get_accuracys(classifiers, X, y, k_folds)

In [29]:
result = build_result(cv_results)
result.head()

  result = result.append(scores, ignore_index=True)
  result = result.append(scores, ignore_index=True)
  result = result.append(scores, ignore_index=True)


Unnamed: 0,classifier,accuracy,precision,recall,f1,specificity,ROC
0,KNeighborsClassifier,0.90307,0.849352,0.979235,0.909621,0.827607,0.903421
1,SVC,0.83466,0.790262,0.909249,0.845473,0.760747,0.834998
2,AdaBoostClassifier,0.824738,0.789887,0.882879,0.833704,0.767129,0.825004


In [30]:
result.to_excel("PredictResult(balanced-1k-adasyn).xlsx", sheet_name="balanced", float_format="%.4f", index=False)

### Train Model with Feature Selected

In [22]:
patients = pd.read_csv('data-preprocessed-augmentation(ADASYN).csv')
feature_selected = patients[["age", "hypertension", "heart_disease", "ever_married", "avg_glucose_level", "bmi", "smoking_status", "work_type_Self-employed", "stroke"]]
X = feature_selected.drop(["stroke"], axis=1)
y = feature_selected.stroke
feature_selected.sample(5)

Unnamed: 0,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,smoking_status,work_type_Self-employed,stroke
3788,0.560547,0.0,0.0,1.0,0.578571,0.139748,0.0,0.0,0.0
6228,0.969768,1.0,0.0,1.0,0.10895,0.138789,0.01833,0.0,1.0
6185,0.686634,0.223638,0.0,1.0,0.328439,0.206427,0.0,0.0,1.0
5983,0.95148,0.0,0.0,1.0,0.220027,0.278981,0.039141,1.0,1.0
2853,0.365234,0.0,0.0,0.0,0.400148,0.247423,0.0,0.0,0.0


In [20]:
selected_feature_cv_results = get_accuracys(classifiers, X, y, k_folds)
result = build_result(selected_feature_cv_results)

  result = result.append(scores, ignore_index=True)
  result = result.append(scores, ignore_index=True)
  result = result.append(scores, ignore_index=True)


In [21]:
result.head()

Unnamed: 0,classifier,accuracy,precision,recall,f1,specificity,ROC
0,KNeighborsClassifier,0.966461,0.976247,0.956169,0.966086,0.976752,0.96646
1,SVC,0.920885,0.885153,0.967277,0.92438,0.874512,0.920895
2,AdaBoostClassifier,0.952366,0.946769,0.958634,0.95265,0.9461,0.952367


In [23]:
result.to_excel("PredictResult(feature_selected).xlsx", float_format="%.4f", index=False)