In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.metrics import make_scorer, accuracy_score, recall_score, f1_score, roc_auc_score, precision_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import GaussianMixture

from imblearn.over_sampling import ADASYN
import warnings
# Install a blanket "ignore" filter: no warning category or message is given,
# so warnings of every kind raised by later cells are not displayed.
# NOTE(review): this also hides deprecation notices from the libraries used
# below - consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

### Read Dataset

In [43]:
# Load the preprocessed stroke dataset into a DataFrame.
patients = pd.read_csv('healthcare-dataset-stroke-data-preprocessed.csv')
# Preview ten random rows; no seed is set, so the rows differ between runs.
# (A bare column selection used to sit between these two statements. Its
# result was neither assigned nor displayed, so it has been dropped.)
patients.sample(10)

Unnamed: 0,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,smoking_status,stroke,is_male,is_urban,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
4916,0.658203,0.0,0.0,1.0,0.082356,0.31386,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
2235,0.401855,0.0,0.0,1.0,0.118641,0.343643,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3857,0.316406,0.0,0.0,0.0,0.142184,0.289805,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
2283,0.780273,0.0,1.0,1.0,0.128197,0.219931,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1894,0.243164,0.0,0.0,0.0,0.073123,0.18824,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1002,0.975586,0.0,1.0,1.0,0.210876,0.218786,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
102,0.902344,0.0,0.0,1.0,0.814745,0.278351,0.5,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1456,0.755859,0.0,0.0,1.0,0.110193,0.403207,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3290,0.682617,0.0,0.0,1.0,0.498107,0.221077,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2174,0.707031,0.0,0.0,1.0,0.063337,0.382589,0.5,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


### Functions

In [4]:
def get_predict_after_augmentation(clf, X_train, y_train, X_test):
    """Oversample the training data with ADASYN, fit ``clf`` on it and predict ``X_test``.

    Parameters
    ----------
    clf : estimator whose ``fit(X, y)`` returns an object exposing ``predict``.
    X_train, y_train : training features and labels. The labels are cast to
        ``int`` before resampling.
    X_test : features to predict. They are passed to ``predict`` untouched.

    Returns
    -------
    Whatever ``predict`` returns for ``X_test``.
    """
    oversampler = ADASYN(n_neighbors=3)
    integer_labels = y_train.astype('int')
    X_resampled, y_resampled = oversampler.fit_resample(X_train, integer_labels)
    fitted = clf.fit(X_resampled, y_resampled)
    return fitted.predict(X_test)

def get_metric_scores(predict, actual):
    """Score predictions against the ground truth with five metrics.

    Parameters
    ----------
    predict : predicted labels.
    actual : true labels.

    Returns
    -------
    dict with the keys 'accuracy', 'precision', 'recall', 'f1' and 'ROC', in
    that order. Every metric function is called as ``metric(actual, predict)``;
    note that ``roc_auc_score`` receives the same ``predict`` values as the
    other metrics.
    """
    return {
        'accuracy': accuracy_score(actual, predict),
        'precision': precision_score(actual, predict),
        'recall': recall_score(actual, predict),
        'f1': f1_score(actual, predict),
        'ROC': roc_auc_score(actual, predict),
    }

In [30]:
def build_gmms(X, y, mixiures=2):
    """Fit one Gaussian mixture per class label, for label 0 and then label 1.

    Parameters
    ----------
    X : features; indexed with the boolean mask ``y == label``.
    y : labels aligned with ``X``.
    mixiures : number of mixture components per model (the spelling is part
        of the public signature and is kept for existing callers).

    Returns
    -------
    list of two fitted ``GaussianMixture`` objects: index 0 was trained on the
    rows labelled 0, index 1 on the rows labelled 1.
    """
    def fit_for(label):
        model = GaussianMixture(n_components=mixiures)
        model.fit(X[y == label])
        return model

    return [fit_for(label) for label in (0, 1)]

In [18]:
def get_k_cross_validation_metrics(clf, X, y, k, random_state=None):
    """Evaluate the per-class mixture classifier with stratified k-fold cross-validation.

    For every fold, one mixture per class is fitted on the training part via
    ``build_gmms`` and each test row is assigned to the class whose mixture
    gives it the highest log-likelihood.

    Parameters
    ----------
    clf : not used by the current implementation. It is accepted so existing
        callers that pass an estimator keep working.
    X : pandas.DataFrame of features (rows are selected with ``.iloc``).
    y : pandas.Series of labels aligned with ``X``.
    k : int, number of stratified folds.
    random_state : optional seed forwarded to ``StratifiedKFold`` so the
        shuffled fold assignment can be reproduced. The default ``None`` keeps
        the previous unseeded behaviour. It does not seed the mixtures.

    Returns
    -------
    dict mapping 'accuracy', 'precision', 'recall', 'f1' and 'ROC' to a float
    ``numpy`` array holding one score per fold.
    """
    metric_names = ['accuracy', 'precision', 'recall', 'f1', 'ROC']
    fold_scores = {metric: [] for metric in metric_names}

    splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_state)
    for train_index, test_index in splitter.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        gmms = build_gmms(X_train, y_train)
        # Compare the log-likelihoods directly. Exponentiating them first can
        # underflow to 0.0 (or overflow to inf) for every class at once, and
        # argmax would then silently fall back to class 0.
        log_likelihoods = [gmm.score_samples(X_test) for gmm in gmms]
        y_predict = np.argmax(log_likelihoods, axis=0)

        scores = get_metric_scores(y_predict, y_test)
        for metric, value in scores.items():
            fold_scores[metric].append(value)

    return {metric: np.array(values, dtype=float) for metric, values in fold_scores.items()}

In [9]:
def name(cls):
    """Return the label under which ``cls`` is reported in the results.

    The argument is not inspected: the fixed label 'GMM' is returned for
    every input.
    """
    label = 'GMM'
    return label

def get_accuracys(classifiers, X, y, k):
    """Cross-validate every classifier and collect its per-fold metrics.

    Parameters
    ----------
    classifiers : iterable of classifiers. It is traversed once, so one-shot
        iterables such as generators are supported.
    X, y : features and labels handed to ``get_k_cross_validation_metrics``.
    k : number of cross-validation folds.

    Returns
    -------
    dict mapping ``name(classifier)`` to the result of
    ``get_k_cross_validation_metrics`` for that classifier, in input order.
    Classifiers that share a label overwrite each other; the last one wins.
    """
    # A single pass is enough: pre-filling the keys with empty lists was
    # redundant and exhausted one-shot iterables before any evaluation ran.
    return {
        name(classifier): get_k_cross_validation_metrics(classifier, X, y, k)
        for classifier in classifiers
    }

In [10]:
def build_result(cv_results):
    """Average the per-fold metrics of every classifier into a summary table.

    Parameters
    ----------
    cv_results : dict mapping a classifier label to a dict that holds, for
        each of 'accuracy', 'precision', 'recall', 'f1' and 'ROC', an object
        with a ``mean()`` method (e.g. a numpy array of fold scores).

    Returns
    -------
    pandas.DataFrame with the columns 'classifier', 'accuracy', 'precision',
    'recall', 'f1', 'ROC' and one row per classifier, indexed 0..n-1. An empty
    ``cv_results`` gives an empty frame with the same columns.
    """
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'ROC']
    rows = [
        {"classifier": label, **{metric: fold_scores[metric].mean() for metric in metrics}}
        for label, fold_scores in cv_results.items()
    ]
    # Build the frame in one go: DataFrame.append was removed in pandas 2.0,
    # and growing a frame row by row is quadratic anyway.
    return pd.DataFrame(rows, columns=["classifier", *metrics])

### Model Training

In [44]:
# Feature matrix: every column except the target.
X = patients.drop(columns=["stroke"])
# Target vector, cast to integer labels.
y = patients["stroke"].astype('int')

In [45]:
# Number of stratified folds; later cells reuse this value.
k_folds = 10
# A single placeholder entry (None) is passed as the classifier list.
cv_results = get_accuracys(classifiers=[None], X=X, y=y, k=k_folds)

In [46]:
# Collapse the per-fold scores into one row of mean metrics per classifier.
result = build_result(cv_results=cv_results)
result.head(n=5)

Unnamed: 0,classifier,accuracy,precision,recall,f1,ROC
0,GMM,0.495695,0.070961,0.771833,0.129777,0.626726


In [None]:
# Write the summary table to an Excel sheet named "balanced": floats are
# formatted with four decimals and the index column is left out.
result.to_excel(
    "PredictResult(balanced-adasyn-revised).xlsx",
    sheet_name="balanced",
    float_format="%.4f",
    index=False,
)

### Train Model with Selected Features

In [75]:
# Reload the preprocessed data and keep only age, avg_glucose_level and bmi
# alongside the target column.
patients = pd.read_csv('healthcare-dataset-stroke-data-preprocessed.csv')
# The selection is taken straight from `patients`; an earlier full-frame
# copy was overwritten immediately and has been removed as dead work.
feature_selected = patients[["age", "avg_glucose_level", "bmi", "stroke"]]
X = feature_selected.drop(["stroke"], axis=1)
y = feature_selected.stroke
# Preview five random rows of the reduced frame.
feature_selected.sample(5)

Unnamed: 0,age,avg_glucose_level,bmi,stroke
411,0.780273,0.568599,0.295533,0.0
2888,0.682617,0.15885,0.146621,0.0
475,0.987793,0.064122,0.255441,0.0
4833,0.499512,0.048703,0.187858,0.0
4982,0.243164,0.095928,0.250859,0.0


In [76]:
# Cross-validate on the reduced feature set, then average the fold scores.
selected_feature_cv_results = get_accuracys(classifiers=['GMM'], X=X, y=y, k=k_folds)
result = build_result(cv_results=selected_feature_cv_results)

In [77]:
# Display the first rows of the summary table.
result.head(n=5)

Unnamed: 0,classifier,accuracy,precision,recall,f1,ROC
0,GMM,0.682192,0.117624,0.846667,0.206422,0.760197


In [None]:
# Write the summary table to Excel with four-decimal floats and no index column.
result.to_excel(
    "PredictResult(feature_selected-revised).xlsx",
    float_format="%.4f",
    index=False,
)

In [98]:
# Load a second export of the data (the file name suggests standardized
# features - TODO confirm) and keep eight predictors plus the target.
# A smaller alternative was left commented out here:
# age, hypertension, avg_glucose_level, bmi (plus stroke).
patients = pd.read_csv('data-preprocessed(standardization).csv')
feature_selected = patients[[
    "age", "hypertension", "heart_disease", "ever_married",
    "avg_glucose_level", "bmi", "smoking_status", "work_type_Self-employed",
    "stroke",
]]
X = feature_selected.drop(columns=["stroke"])
y = feature_selected["stroke"]
# Preview five random rows of the selection.
feature_selected.sample(n=5)

Unnamed: 0,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,smoking_status,work_type_Self-employed,stroke
2863,0.602386,-0.36768,-0.252933,0.564092,2.086483,1.180271,-0.83682,-0.474219,False
2329,-1.678965,-0.36768,-0.252933,-1.772762,0.336672,0.124738,-0.83682,-0.474219,False
2897,-0.299544,-0.36768,-0.252933,-1.772762,-0.615178,0.138447,0.403745,-0.474219,False
2656,0.177948,2.719753,-0.252933,0.564092,-0.15959,0.919815,-0.83682,-0.474219,False
3199,-1.732019,-0.36768,-0.252933,-1.772762,-0.625661,-1.054168,-0.83682,-0.474219,False


In [99]:
# Count how many rows fall into each target class.
feature_selected["stroke"].value_counts()

False    3246
True      180
Name: stroke, dtype: int64

In [100]:
# Cross-validate on the currently selected columns, then average the fold scores.
selected_feature_cv_results = get_accuracys(classifiers=['GMM'], X=X, y=y, k=k_folds)
result = build_result(cv_results=selected_feature_cv_results)

In [101]:
# Display the first rows of the summary table.
result.head(n=5)

Unnamed: 0,classifier,accuracy,precision,recall,f1,ROC
0,GMM,0.659999,0.105284,0.7,0.181427,0.678892


In [None]:
# Write the summary table to Excel with four-decimal floats and no index column.
# NOTE(review): the empty parentheses in the file name look like an unfinished
# placeholder - confirm the intended name before running this cell.
result.to_excel(
    "PredictResult().xlsx",
    float_format="%.4f",
    index=False,
)