In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.metrics import make_scorer, accuracy_score, recall_score, f1_score, roc_auc_score, precision_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier

### Read Dataset

In [3]:
patients = pd.read_csv('healthcare-dataset-stroke-data.csv')
patients.sample(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
5057,38349,Female,49.0,0,0,Yes,Govt_job,Urban,69.92,47.6,never smoked,0
4062,26330,Female,69.0,0,0,Yes,Private,Urban,91.65,25.7,formerly smoked,0
4778,48127,Male,53.0,0,0,Yes,Self-employed,Urban,109.09,26.3,smokes,0
3758,38284,Male,8.0,0,0,No,children,Rural,77.08,16.9,Unknown,0
762,40055,Female,17.0,0,0,No,Private,Rural,173.43,25.6,smokes,0


In [6]:
mask = (patients.stroke == 1) & ((patients.smoking_status == "formerly smoked") | (patients.smoking_status == "smokes"))
patients[mask]

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
11,12095,Female,61.0,0,1,Yes,Govt_job,Rural,120.46,36.8,smokes,1
12,12175,Female,54.0,0,0,Yes,Private,Urban,104.51,27.3,smokes,1
...,...,...,...,...,...,...,...,...,...,...,...,...
239,32221,Male,60.0,0,1,Yes,Private,Urban,91.92,35.9,smokes,1
240,10548,Male,66.0,0,0,Yes,Private,Rural,76.46,21.2,formerly smoked,1
241,52282,Male,57.0,0,0,Yes,Private,Rural,197.28,34.5,formerly smoked,1
243,40460,Female,68.0,1,1,Yes,Private,Urban,247.51,40.5,formerly smoked,1


In [7]:
patients.gender.value_counts()

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64

### Functions

In [6]:
def get_k_cross_validation_metrics(clf, X, y, k):
    skf = StratifiedKFold(n_splits=k, shuffle=True)
    cv_results = cross_validate(clf, X, y, cv=skf, scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'specificity': make_scorer(recall_score,pos_label=0),
    'ROC': make_scorer(roc_auc_score),
    })
    return cv_results

In [7]:
def name(cls):
    return cls.__class__.__name__

def get_accuracys(classifiers, X, y, k):
    cls_accs = {name(cls):[] for cls in classifiers}
    for cls in classifiers:
        cv_results = get_k_cross_validation_metrics(cls, X, y, k)
        cls_accs[name(cls)] = cv_results
    return cls_accs

### Model Training

In [15]:
X = patients.drop(["stroke"], axis=1)
y = patients.stroke

In [18]:
k_folds = 10
knn = KNeighborsClassifier(n_neighbors=5)
svc = svm.SVC(probability=True)
adaBoost = AdaBoostClassifier(n_estimators=100) # default n_estimators=50
classifiers = [knn, svc, adaBoost]
cv_results = get_accuracys(classifiers, X, y, k_folds)

In [19]:
metrics = ['accuracy', 'precision', 'recall','f1', 'specificity', 'ROC']
result = pd.DataFrame([], columns=["classifier", *metrics])
for cls, cls_metrics in cv_results.items():
    scores = {metric:cls_metrics['test_' + metric].mean() for metric in metrics}
    scores["classifier"] = cls
    result = result.append(scores, ignore_index=True)
result

Unnamed: 0,classifier,accuracy,precision,recall,f1,specificity,ROC
0,KNeighborsClassifier,0.904722,0.850357,0.981518,0.911191,0.828636,0.905077
1,SVC,0.836106,0.791355,0.911333,0.847018,0.761576,0.836455
2,AdaBoostClassifier,0.826702,0.791326,0.885805,0.835769,0.768152,0.826979


In [20]:
result.to_excel("PredictResult(balanced).xlsx", sheet_name="balanced", float_format="%.4f", index=False)