## Plan A

1. A quick baseline(ordinal features/ no resample/ accuracy)
2. simply use ordinal features + resample
3. one hot features + resample
4. target encoding + resample
5. do not resample + best model above

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('heart_disease_health_indicators_BRFSS2015.csv')

In [3]:
target = 'HeartDiseaseorAttack'
ordinal_features = ['Education', 'Age', 'PhysHlth', 'MentHlth','GenHlth']
categorical_features = ['HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'PhysActivity', 'Fruits', 'Veggies', 'Diabetes', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost']
continuous_features = ['BMI']

### Split data into train_val and test

In [6]:
from sklearn.model_selection import train_test_split
train_val, test = train_test_split(data, test_size = 0.2, stratify=data[target].values, random_state=0)

### A quick baseline

In [17]:
X_train_val = train_val.drop(columns=[target])
y_train_val = train_val[target]

X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, stratify=y_train_val.values, test_size=0.2, random_state=0)

X_test = test.drop(columns=[target])
y_test = test[target]

In [21]:
import lightgbm as lgb
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

params = {'boosting_type': 'gbdt', 'objective': 'binary', 
              'num_leaves': 64, 'learning_rate': 0.05, 'max_bin': 512, 
              'subsample_for_bin': 200, 'subsample': 1, 'subsample_freq': 1,
              'colsample_bytree': 0.8, 'reg_alpha': 5, 'reg_lambda': 10, 
              'metric': 'auc', 'feature_name' : list(X_train.columns)}

lgbmclf = lgb.LGBMClassifier(**params)
lgbmclf.fit(X_train, y_train, eval_set=[(X_val, y_val)], categorical_feature = categorical_features, early_stopping_rounds=20)

[1]	valid_0's auc: 0.827503
[2]	valid_0's auc: 0.836467
[3]	valid_0's auc: 0.838579
[4]	valid_0's auc: 0.841272
[5]	valid_0's auc: 0.841711
[6]	valid_0's auc: 0.843057
[7]	valid_0's auc: 0.843555
[8]	valid_0's auc: 0.844125
[9]	valid_0's auc: 0.844463
[10]	valid_0's auc: 0.844355
[11]	valid_0's auc: 0.844598
[12]	valid_0's auc: 0.84503
[13]	valid_0's auc: 0.84543
[14]	valid_0's auc: 0.84562
[15]	valid_0's auc: 0.845828
[16]	valid_0's auc: 0.845898
[17]	valid_0's auc: 0.846305
[18]	valid_0's auc: 0.846624
[19]	valid_0's auc: 0.846841
[20]	valid_0's auc: 0.84684
[21]	valid_0's auc: 0.846908
[22]	valid_0's auc: 0.847066
[23]	valid_0's auc: 0.847463
[24]	valid_0's auc: 0.847438
[25]	valid_0's auc: 0.847516
[26]	valid_0's auc: 0.847622
[27]	valid_0's auc: 0.847643
[28]	valid_0's auc: 0.847785
[29]	valid_0's auc: 0.847942
[30]	valid_0's auc: 0.848156
[31]	valid_0's auc: 0.848286
[32]	valid_0's auc: 0.848222
[33]	valid_0's auc: 0.848293
[34]	valid_0's auc: 0.848384
[35]	valid_0's auc: 0.84845

LGBMClassifier(colsample_bytree=0.8,
               feature_name=['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
                             'Stroke', 'Diabetes', 'PhysActivity', 'Fruits',
                             'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare',
                             'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth',
                             'DiffWalk', 'Sex', 'Age', 'Education', 'Income'],
               learning_rate=0.05, max_bin=512, metric='auc', num_leaves=64,
               objective='binary', reg_alpha=5, reg_lambda=10, subsample=1,
               subsample_for_bin=200, subsample_freq=1)

In [22]:
lgbmclf.score(X_test, y_test)

0.9075804162724692

+ This is not a good accuracy, let us calculate the proportion of labels in data set

In [24]:
percent_1 = data['HeartDiseaseorAttack'].values.sum()/ len(data['HeartDiseaseorAttack'].values)
percent_0 = 1-percent_1
print('Label 0 take {:.2f} percent' .format(percent_0*100))
print('Label 1 take {:.2f} percent' .format(percent_1*100))

Label 0 take 90.58 percent
Label 1 take 9.42 percent


+ So if we predict all labels to be 0, we get almost the same result

### Ordinal features and resample

+ Resample data first and use the same algorithm to build the model

In [99]:
from numpy.random import MT19937
from numpy.random import RandomState, SeedSequence
def under_sample(label, target):
    rand_state = RandomState(MT19937(SeedSequence(123456789)))
    target_usamp = target[target == label]
    target_osamp = target[target != label]
    resampled_idx = rand_state.choice(target_osamp.index, size=len(target_usamp))
    
    return resampled_idx

In [103]:
rsp_idx = under_sample(1, y_train_val)
data_balanced_0 = train_val.loc[rsp_idx, :]
data_label_1 = train_val[y_train_val == 1]
print(data_balanced_0.shape, data_label_1.shape)
data_all = pd.concat([data_balanced_0, data_label_1], axis=0)

(19114, 22) (19114, 22)


In [104]:
data_all.shape

(38228, 22)

In [105]:
X_train, X_val, y_train, y_val = train_test_split(data_all.drop(columns=[target]), data_all[target], random_state=0, test_size=0.2, stratify=data_all[target])

In [109]:
lgbmclf = lgb.LGBMClassifier(**params)
lgbmclf.fit(X_train, y_train, eval_set=[(X_val, y_val)], categorical_feature = categorical_features, early_stopping_rounds=50)
score = lgbmclf.score(X_test, y_test)

[1]	valid_0's auc: 0.822593
[2]	valid_0's auc: 0.82888
[3]	valid_0's auc: 0.836089
[4]	valid_0's auc: 0.840762
[5]	valid_0's auc: 0.840335
[6]	valid_0's auc: 0.841749
[7]	valid_0's auc: 0.84237
[8]	valid_0's auc: 0.842356
[9]	valid_0's auc: 0.843462
[10]	valid_0's auc: 0.844147
[11]	valid_0's auc: 0.844515
[12]	valid_0's auc: 0.844967
[13]	valid_0's auc: 0.844874
[14]	valid_0's auc: 0.844857
[15]	valid_0's auc: 0.845232
[16]	valid_0's auc: 0.845694
[17]	valid_0's auc: 0.845801
[18]	valid_0's auc: 0.846021
[19]	valid_0's auc: 0.846544
[20]	valid_0's auc: 0.846759
[21]	valid_0's auc: 0.846526
[22]	valid_0's auc: 0.846979
[23]	valid_0's auc: 0.847537
[24]	valid_0's auc: 0.847704
[25]	valid_0's auc: 0.84796
[26]	valid_0's auc: 0.848113
[27]	valid_0's auc: 0.848355
[28]	valid_0's auc: 0.848405
[29]	valid_0's auc: 0.848494
[30]	valid_0's auc: 0.848658
[31]	valid_0's auc: 0.848821
[32]	valid_0's auc: 0.848947
[33]	valid_0's auc: 0.849085
[34]	valid_0's auc: 0.849179
[35]	valid_0's auc: 0.8493

In [110]:
print(score)

0.727767265846736


### Target Encoding vs one_hot_encoding

+ A random forest
+ A logistic regression
+ A SVM
+ A lbgm

+ Here we still use resampled data 

+ transform ordinal features into one_hot_encoding

In [157]:
from sklearn.preprocessing import OneHotEncoder
def one_hot_enc(df_train, df_val, cols, is_train=True, encoder=None):
    ohe = OneHotEncoder(handle_unknown='ignore')
    if is_train == True:
        ohe.fit(df_train[cols])
        mat = ohe.transform(df_train[cols]).toarray()
    else:
        mat=encoder.transform(df_val[cols]).toarray()
    one_hot_name = []
    for col in cols:
        for i in range(df_train[col].nunique()):
            one_hot_name.append(col+ '_' +str(i))
    one_hot_mat = pd.DataFrame(mat, columns=one_hot_name)
    return one_hot_mat, ohe

In [118]:
to_enc_features = ordinal_features + ['Diabetes']
one_hot_mat, oh_encoder = one_hot_enc(data_all, ordinal_features+['Diabetes'], is_train=True)

In [125]:
data_all_one_hot = pd.concat([data_all.drop(columns=to_enc_features).reset_index(), one_hot_mat], axis=1)

In [164]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

def run_clfs(clfs, params, X_train_val, y_train_val, One_hot =False):
    result_cv = {}
    for clf in clfs:
        result_cv[clf.__class__.__name__] = {}
    
    skf = StratifiedKFold(shuffle=True, random_state=0)
    for i, (tr_idx, val_idx) in zip(range(1,6), skf.split(X_train_val, y_train_val)):
        X_train, y_train = X_train_val.iloc[tr_idx], y_train_val.iloc[tr_idx]
        X_val, y_val = X_train_val.iloc[val_idx], y_train_val.iloc[val_idx]
        ### One hot encoder and standard scaler
        if One_hot == True:
            one_hot_mat_train, oh_encoder = one_hot_enc(X_train, X_val, ordinal_features+['Diabetes'], is_train=True)
            one_hot_mat_val, _ = one_hot_enc(X_train, X_val, ordinal_features+['Diabetes'], is_train=False, encoder=oh_encoder)
        
            X_train = pd.concat([X_train.drop(columns=to_enc_features).reset_index(), one_hot_mat_train], axis=1)
            X_val = pd.concat([X_val.drop(columns=to_enc_features).reset_index(), one_hot_mat_val], axis=1)
        sc = StandardScaler().fit(X_train[continuous_features])
        X_train[continuous_features] = sc.transform(X_train[continuous_features])
        X_val[continuous_features] = sc.transform(X_val[continuous_features])
        
        ## run clfs
        for clf in clfs:
            if params[clf.__class__.__name__] != None:
                clf.fit(X_train, y_train, **params[clf.__class__.__name__])
            else:
                clf.fit(X_train, y_train)
            
            ## performance on validation set
            score_val = clf.score(X_val, y_val)
            result_cv[clf.__class__.__name__ ][str(i) + '_val_score'] = score_val
            
    return result_cv, clfs

In [165]:
clfs = [SVC(), RandomForestClassifier(), LogisticRegression()]
params = {}
for clf in clfs:
    params[clf.__class__.__name__] = None
    
result, clfs_fitted = run_clfs(clfs, params, X_train_val=data_all.drop(columns=[target]), y_train_val=data_all[target], One_hot=True)

In [166]:
result_df_ = pd.DataFrame(result)
result_df_ ## result that I use one hot

Unnamed: 0,SVC,RandomForestClassifier,LogisticRegression
1_val_score,0.515825,0.775307,0.500262
2_val_score,0.507716,0.774392,0.5
3_val_score,0.510463,0.771515,0.5
4_val_score,0.524657,0.768215,0.499935
5_val_score,0.495487,0.761936,0.500196


In [160]:
result_df = pd.DataFrame(result)
result_df ## result that I forget use one hot

Unnamed: 0,SVC,RandomForestClassifier,LogisticRegression
1_val_score,0.764583,0.770076,0.770337
2_val_score,0.772299,0.774523,0.779623
3_val_score,0.76746,0.771515,0.772168
4_val_score,0.767691,0.765337,0.764421
5_val_score,0.758012,0.766383,0.764683


In [163]:
for clf in clfs_fitted:
    print(clf.__class__.__name__)
    print(clf.score(X_test, y_test))

SVC
0.7558932513402712
RandomForestClassifier
0.7303492589088616
LogisticRegression
0.6376340271207821


+ Compare with the accuracy above

### label encoding

In [None]:
from sklearn.model_selection import StratifiedKFold
def mean_enc(df, target, cols):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for tr_idx, val_idx in skf.split(X, y):
        X_train, X_val = df.iloc[tr_idx], df.iloc[val_idx]
        for col in cols:
            mean_ = X_train.groupby(by=col)['target'].mean()
            X_val[col+'_mean_taregt'] = X_val[col].apply(mean)
        df_new_mean.iloc[val_idx] = X_val
    global_mean = df.target.mean()
    df_new.fillna(global_mean, inplace=True)
    return df_new