In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.metrics import make_scorer, accuracy_score, recall_score, f1_score, roc_auc_score, precision_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier

### Read Dataset

In [14]:
patients = pd.read_csv('data-preprocessed-augmentation(ADASYN).csv')
patients.sample(5)

Unnamed: 0,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,smoking_status,stroke,is_male,is_urban,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
7242,0.776648,0.0,0.0,0.390639,0.175801,0.198704,0.30468,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
5412,0.792129,0.0,0.0,0.0,0.170601,0.220099,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3813,0.511719,0.0,0.0,1.0,0.060013,0.248568,0.5,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
7399,0.833753,0.0,0.0,1.0,0.798691,0.261334,0.747413,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
7458,0.967802,0.0,0.0,1.0,0.264029,0.206277,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


### Functions

In [6]:
def get_k_cross_validation_metrics(clf, X, y, k):
    skf = StratifiedKFold(n_splits=k, shuffle=True)
    cv_results = cross_validate(clf, X, y, cv=skf, scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'specificity': make_scorer(recall_score,pos_label=0),
    'ROC': make_scorer(roc_auc_score),
    })
    return cv_results

In [7]:
def name(cls):
    return cls.__class__.__name__

def get_accuracys(classifiers, X, y, k):
    cls_accs = {name(cls):[] for cls in classifiers}
    for cls in classifiers:
        cv_results = get_k_cross_validation_metrics(cls, X, y, k)
        cls_accs[name(cls)] = cv_results
    return cls_accs

### Model Training

In [15]:
X = patients.drop(["stroke"], axis=1)
y = patients.stroke

In [18]:
k_folds = 10
knn = KNeighborsClassifier(n_neighbors=5)
svc = svm.SVC(probability=True)
adaBoost = AdaBoostClassifier(n_estimators=100) # default n_estimators=50
classifiers = [knn, svc, adaBoost]
cv_results = get_accuracys(classifiers, X, y, k_folds)

In [19]:
metrics = ['accuracy', 'precision', 'recall','f1', 'specificity', 'ROC']
result = pd.DataFrame([], columns=["classifier", *metrics])
for cls, cls_metrics in cv_results.items():
    scores = {metric:cls_metrics['test_' + metric].mean() for metric in metrics}
    scores["classifier"] = cls
    result = result.append(scores, ignore_index=True)
result

Unnamed: 0,classifier,accuracy,precision,recall,f1,specificity,ROC
0,KNeighborsClassifier,0.904722,0.850357,0.981518,0.911191,0.828636,0.905077
1,SVC,0.836106,0.791355,0.911333,0.847018,0.761576,0.836455
2,AdaBoostClassifier,0.826702,0.791326,0.885805,0.835769,0.768152,0.826979


In [20]:
result.to_excel("PredictResult(balanced).xlsx", sheet_name="balanced", float_format="%.4f", index=False)