# Models

## Setup

In [12]:
import pandas as pd
import numpy as np
import os

from pycaret.classification import *

In [2]:
# Location of the prepared train/test splits, relative to this notebook.
data_dir = '../data/final/'

# The first CSV column is read as the index and then discarded, so both
# frames end up with a fresh 0..n-1 positional index.
train = (
    pd.read_csv(os.path.join(data_dir, 'train.csv'), index_col=0)
    .reset_index(drop=True)
)
test = (
    pd.read_csv(os.path.join(data_dir, 'test.csv'), index_col=0)
    .reset_index(drop=True)
)

# Preview the first five training rows.
train.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked,0
1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked,0
2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown,0
3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked,0
4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked,0


In [3]:
# Name of the label column; every other column of `train` is a feature.
TARGET = 'stroke'
FEATURES = [col for col in train.columns if col != TARGET]

# Partition the feature columns by dtype: numeric vs. everything else.
numerical = (
    train[FEATURES]
    .select_dtypes(include=np.number)
    .columns
    .tolist()
)
categorical = (
    train[FEATURES]
    .select_dtypes(exclude=np.number)
    .columns
    .tolist()
)

# Quick sanity summary of what was loaded.
print('Target:', TARGET)
print(f'Features:  {FEATURES}')
print('Shapes:', f'{train.shape}, {test.shape}')

Target: stroke
Features:  ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status']
Shapes: (15304, 11), (10204, 10)


In [4]:
# Initialise the PyCaret classification experiment on the training frame.
# The label column is taken from the TARGET constant (defined with FEATURES)
# so the target name is declared in exactly one place.
setup(
    data=train,
    target=TARGET,
    train_size=0.75,        # hold out 25% of `train` as PyCaret's internal test split
    normalize=True,         # enable PyCaret's feature scaling step
    transformation=True,    # enable PyCaret's feature transformation step
    fix_imbalance=True,     # enable PyCaret's class-imbalance handling
    fold=4,                 # number of cross-validation folds used by later calls
    verbose=False,          # suppress setup's own output; the grid is shown via pull()
    session_id=42,          # fixed seed for reproducible runs
    experiment_name='stroke',
)
# Display the setup summary grid as the cell output.
pull()

Unnamed: 0,Description,Value
0,Session id,42
1,Target,stroke
2,Target type,Binary
3,Original data shape,"(15304, 11)"
4,Transformed data shape,"(25834, 20)"
5,Transformed train set shape,"(22008, 20)"
6,Transformed test set shape,"(3826, 20)"
7,Ordinal features,2
8,Numeric features,5
9,Categorical features,5


## Compare models

In [5]:
# Cross-validate the available classifiers, rank them by AUC and keep the
# three best as a list of fitted models.
top3 = compare_models(
    sort='AUC',
    n_select=3,
    verbose=False,
)

# Show the comparison leaderboard as the cell output.
pull()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9295,0.8741,0.3503,0.2508,0.2913,0.2555,0.2599,1.57
lda,Linear Discriminant Analysis,0.9103,0.8724,0.4557,0.2196,0.2959,0.2544,0.274,0.285
lightgbm,Light Gradient Boosting Machine,0.9555,0.8647,0.095,0.3538,0.1479,0.1331,0.1647,0.3925
ada,Ada Boost Classifier,0.9475,0.8634,0.154,0.269,0.1948,0.1698,0.1774,0.6225
gbc,Gradient Boosting Classifier,0.9537,0.8629,0.1181,0.3377,0.1732,0.155,0.1787,1.4075
rf,Random Forest Classifier,0.957,0.8321,0.0865,0.4046,0.1413,0.1289,0.171,0.7675
nb,Naive Bayes,0.2054,0.8225,0.9979,0.0494,0.0942,0.0168,0.0905,0.28
et,Extra Trees Classifier,0.9497,0.8027,0.0971,0.2527,0.1383,0.1173,0.1329,0.6575
knn,K Neighbors Classifier,0.8972,0.6837,0.308,0.1463,0.1983,0.1507,0.1628,1.38
qda,Quadratic Discriminant Analysis,0.9232,0.6659,0.0799,0.0962,0.0494,0.0307,0.0415,0.2725


In [6]:
# Hyper-parameter search (30 candidates each, scored on AUC) for every model
# selected by compare_models; the result list keeps the order of `top3`.
tuned_top3 = [
    tune_model(candidate, optimize='AUC', n_iter=30)
    for candidate in top3
]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9188,0.8774,0.4538,0.2432,0.3167,0.2777,0.2931
1,0.9035,0.8659,0.521,0.2199,0.3092,0.2664,0.2954
2,0.9041,0.89,0.5508,0.2265,0.321,0.279,0.3112
3,0.9014,0.8649,0.4153,0.1863,0.2572,0.2125,0.2322
Mean,0.907,0.8746,0.4852,0.219,0.301,0.2589,0.283
Std,0.0069,0.0102,0.0536,0.0207,0.0256,0.0272,0.0301


Fitting 4 folds for each of 30 candidates, totalling 120 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8895,0.8738,0.5966,0.2088,0.3094,0.2642,0.3077
1,0.8739,0.8676,0.6555,0.1955,0.3012,0.2535,0.3105
2,0.8763,0.8894,0.7203,0.2088,0.3238,0.2778,0.3434
3,0.8707,0.8659,0.5932,0.1781,0.274,0.2249,0.2748
Mean,0.8776,0.8742,0.6414,0.1978,0.3021,0.2551,0.3091
Std,0.0072,0.0093,0.0519,0.0126,0.0181,0.0194,0.0243


Fitting 4 folds for each of 30 candidates, totalling 120 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8979,0.8739,0.6134,0.2281,0.3326,0.2896,0.3317
1,0.8895,0.8588,0.605,0.2105,0.3124,0.2673,0.3119
2,0.8926,0.8863,0.6186,0.2173,0.3216,0.2776,0.323
3,0.8836,0.8698,0.6102,0.2,0.3013,0.2551,0.303
Mean,0.8909,0.8722,0.6118,0.214,0.3169,0.2724,0.3174
Std,0.0052,0.0098,0.0049,0.0102,0.0115,0.0127,0.0109


Fitting 4 folds for each of 30 candidates, totalling 120 fits


In [7]:
# Logistic regression serves as the meta-learner on top of the tuned models.
meta = create_model('lr', verbose=False)

# Stack the three tuned base models under the meta-learner.
stacker = stack_models(
    tuned_top3,
    meta_model=meta,
    verbose=False,
)

# Show the stacker's per-fold cross-validation scores.
pull()

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9453,0.8805,0.3361,0.339,0.3376,0.309,0.309
1,0.9324,0.8635,0.2857,0.2378,0.2595,0.2244,0.2255
2,0.9345,0.8903,0.4068,0.2892,0.338,0.3046,0.3095
3,0.9313,0.8665,0.2627,0.2199,0.2394,0.2037,0.2046
Mean,0.9359,0.8752,0.3228,0.2714,0.2936,0.2604,0.2622
Std,0.0056,0.0108,0.0553,0.0466,0.0447,0.047,0.0477


In [8]:
# Calibrate the stacked model's predicted probabilities using the sigmoid method.
calibrated_stacker = calibrate_model(
    stacker,
    method='sigmoid',
    verbose=False,
)

# Show the calibrated model's per-fold cross-validation scores.
pull()

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9369,0.8796,0.3445,0.2847,0.3118,0.2791,0.2804
1,0.9226,0.8631,0.3361,0.2186,0.2649,0.226,0.2319
2,0.9251,0.8895,0.4407,0.2587,0.326,0.2892,0.3007
3,0.9258,0.8651,0.3136,0.2189,0.2578,0.2201,0.224
Mean,0.9276,0.8743,0.3587,0.2452,0.2901,0.2536,0.2593
Std,0.0055,0.0108,0.0487,0.028,0.0293,0.0308,0.0323


## Final model evaluation

In [9]:
# Open PyCaret's interactive plot browser for the calibrated stacker.
evaluate_model(estimator=calibrated_stacker)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [10]:
# NOTE(review): model interpretation is intentionally left disabled here;
# presumably because interpret_model does not support a calibrated stacking
# ensemble — confirm before re-enabling.
#interpret_model(calibrated_stacker)

## Finalize and predict test set

In [14]:
# Finalize the calibrated stacker so it can be used for submission predictions.
final_model = finalize_model(calibrated_stacker)

# Score the competition test set; raw_score=True adds one probability column
# per class (prediction_score_0 / prediction_score_1) next to the label.
preds = predict_model(
    final_model,
    data=test,
    raw_score=True,
)

# Preview the first predictions.
preds.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,prediction_label,prediction_score_0,prediction_score_1
0,Female,57.0,0,0,Yes,Private,Rural,82.540001,33.400002,Unknown,0,0.8778,0.1222
1,Male,70.0,1,0,Yes,Private,Urban,72.059998,28.5,Unknown,1,0.3981,0.6019
2,Female,5.0,0,0,No,children,Urban,103.720001,19.5,Unknown,0,0.9981,0.0019
3,Female,56.0,0,0,Yes,Govt_job,Urban,69.239998,41.400002,smokes,0,0.889,0.111
4,Male,32.0,0,0,Yes,Private,Rural,111.150002,30.1,smokes,0,0.9798,0.0202


In [26]:
# Write the positive-class probabilities into the competition submission file.
out_path = '../submissions/'
os.makedirs(out_path, exist_ok=True)

sub = pd.read_csv('../data/raw/sample_submission.csv')

# The test ids were dropped when the test set was loaded, so predictions can
# only be matched to the sample submission by row position. Fail loudly if the
# row counts differ instead of letting pandas index alignment fill in NaNs.
if len(sub) != len(preds):
    raise ValueError(
        f'Row count mismatch: sample submission has {len(sub)} rows, '
        f'predictions have {len(preds)} rows'
    )

# Assign positionally (plain array) so the result does not depend on the two
# frames happening to share the same index labels.
sub[TARGET] = preds['prediction_score_1'].to_numpy()

sub.to_csv(os.path.join(out_path, 'sub1.csv'), index=False)
sub.head()