In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('drive/MyDrive/heart_disease_health_indicators_BRFSS2015.csv')

In [3]:
target = 'HeartDiseaseorAttack'
ordinal_features = ['Education', 'Age', 'PhysHlth', 'MentHlth','GenHlth','Income']
categorical_features = ['Sex', 'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'PhysActivity', 'Fruits', 'Veggies', 'Diabetes', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk']
continuous_features = ['BMI']
health = ['PhysActivity', 'Fruits', 'Veggies', 'NoDocbcCost']
unhealth = ['HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'Diabetes', 'HvyAlcoholConsump', 'AnyHealthcare']

In [4]:
inter = health + unhealth

1. Prapare data
  + Unbalanced
  + balanced
  + split data

In [5]:
## Split data into train_val and test
from sklearn.model_selection import train_test_split
train_val, test = train_test_split(data, test_size = 0.2, stratify=data[target].values, random_state=0)

X_train_val = train_val.drop(columns=[target])
y_train_val = train_val[target]
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, stratify=y_train_val.values, test_size=0.2, random_state=0)
X_test = test.drop(columns=[target])
y_test = test[target]

## Get balanced data
from numpy.random import MT19937
from numpy.random import RandomState, SeedSequence
def under_sample(label, target):
    rand_state = RandomState(MT19937(SeedSequence(123456789)))
    target_usamp = target[target == label]
    target_osamp = target[target != label]
    resampled_idx = rand_state.choice(target_osamp.index, size=len(target_usamp))
    
    return resampled_idx

rsp_idx = under_sample(1, y_train_val)
data_balanced_0 = train_val.loc[rsp_idx, :]
data_label_1 = train_val[y_train_val == 1]
print(data_balanced_0.shape, data_label_1.shape)
balanced_data = pd.concat([data_balanced_0, data_label_1], axis=0)

(19114, 22) (19114, 22)


In [6]:
import lightgbm as lgb

In [7]:
from sklearn.model_selection import StratifiedKFold
def target_enc(df, cols, target):
    X, y = df[cols], df[target]
    skf = StratifiedKFold(n_splits=5, shuffle=True)
    target_mean_features = []
    for col in cols:
        colname = col + '_target_mean'
        target_mean_features.append(colname)
        df[colname] = 0
    for tr_idx, val_idx in skf.split(X,y):
        X_train, X_val = df.iloc[tr_idx], df.iloc[val_idx]
        for col in cols:
            mean = X_train.groupby(by=col)[target].mean()
            df[col+'_target_mean'].iloc[val_idx] = X_val[col].map(mean)
    prior = df[target].mean()
    df[target_mean_features].fillna(prior, inplace=True)
    
    
    #regularization
    
    for feature in target_mean_features:
        df[feature] = df[feature] * len(df) + df[feature].nunique() * prior
        df[feature] = df[feature] / (len(df) + df[feature].nunique())
    return df[target_mean_features]

def prepare_train(train, target):
    train['sumHealth'] = train[health].sum(axis=1)
    train['sumUnhealth'] = train[unhealth].sum(axis=1)
    train['diff_health'] = train['sumHealth'] - train['sumUnhealth']

    ## Label encoding and interactive labels
    features_mean_enc = categorical_features
    mean_enc_cat = target_enc(train[categorical_features+[target]], categorical_features, target)
    #mean_enc_ord = target_enc(train[ordinal_features + [target]], ordinal_features, target)
    
    train = pd.concat([train.drop(columns=features_mean_enc), mean_enc_cat], axis=1)
    
    for i in inter:
      for j in inter:
        if i!=j:
          train[i+'|'+j] = train[i+'_target_mean'] * train[j+'_target_mean']

            
    return train


def prepare_test(test, train, target):
    test['sumHealth'] = test[health].sum(axis=1)
    test['sumUnhealth'] = test[unhealth].sum(axis=1)
    test['diff_health'] = test['sumHealth'] - test['sumUnhealth']
    
    features_to_enc = categorical_features
    for feature in features_to_enc:
        test[feature + '_target_mean'] = test[feature].map(train.groupby(by=feature)[target].mean())
        
    prior = train[target].mean()
    test[features_to_enc].fillna(prior, inplace=True)
    
    #Smooth
    for feature in features_to_enc:
        test[feature] = test[feature] * len(test) + test[feature].nunique() * prior
        test[feature] = test[feature] / (len(test) + test[feature].nunique())
    
    for i in inter:
      for j in inter:
        if i!=j:
          test[i+'|'+j] = test[i+'_target_mean'] * test[j+'_target_mean']
    
    test = test.drop(columns=features_to_enc)
    return test

def prepare(train_val, X_test, y_test, target):
    train = prepare_train(train_val, target)
    X_train, X_val, y_train, y_val = train_test_split(train.drop(columns=[target]), train[target], test_size=0.1)
    test = pd.concat([X_test, y_test], axis=1)
    test_set = prepare_test(test, train_val, target)
    X_test_ = test_set.drop(columns=[target])
    y_test_ = test_set[target]


    return X_train, X_val, y_train, y_val, X_test_, y_test_

In [8]:
X_train_, X_val_, y_train_, y_val_, X_test_, y_test_ = prepare(balanced_data, X_test, y_test, target)

### We use label encoding and interactive features and see improvement on SVC LogisticRegression and RandomForest

In [9]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

clfs = [SVC(), RandomForestClassifier(), LogisticRegression()]

X_ = pd.concat([X_train_, X_val_], axis=0)
y_ = pd.concat([y_train_, y_val_], axis=0)
for clf in clfs:
  clf.fit(X_train_, y_train_)
  print('{} score on val is {}' .format(clf.__class__.__name__, clf.score(X_val_, y_val_)))
  clf.fit(X_, y_)
  print('{} score on test is {}' .format(clf.__class__.__name__, clf.score(X_test_, y_test_)))
  

SVC score on val is 0.7590897201150929
SVC score on test is 0.6995230211289813
RandomForestClassifier score on val is 0.7501961810096782
RandomForestClassifier score on test is 0.7258751182592242
LogisticRegression score on val is 0.7637980643473712
LogisticRegression score on test is 0.7411896877956481


In [9]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

X_train_u, X_val_u, y_train_u, y_val_u, X_test_, y_test_ = prepare(train_val, X_test, y_test, target)
clfs = [RandomForestClassifier(), LogisticRegression()]

X_u = pd.concat([X_train_u, X_val_u], axis=0)
y_u = pd.concat([y_train_u, y_val_u], axis=0)
for clf in clfs:
  clf.fit(X_u, y_u)
  print('{} score on test is {}' .format(clf.__class__.__name__, clf.score(X_test_, y_test_)))
  

RandomForestClassifier score on test is 0.9067131819615263
LogisticRegression score on test is 0.9068314411857458


In [22]:
params = {'boosting_type': 'gbdt', 'objective': 'binary', 'n_estimators':200,
              'num_leaves': 200, 'learning_rate': 0.05, 'max_bin': 512, 'max_depth' : 10, 
              'subsample_for_bin': 200, 'subsample': 1, 'subsample_freq': 1,
              'feature_fraction': 0.8, 'bagging_fraction': 0.8, 'reg_alpha': 5, 'reg_lambda': 2, 
              'metric': 'auc'}

In [23]:
lgbclf = lgb.LGBMClassifier(**params)
lgbclf.fit(X_u, y_u, eval_set=[(X_val_u, y_val_u)], early_stopping_rounds=100)


[1]	valid_0's auc: 0.810607
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's auc: 0.824293
[3]	valid_0's auc: 0.84236
[4]	valid_0's auc: 0.84719
[5]	valid_0's auc: 0.849766
[6]	valid_0's auc: 0.850983
[7]	valid_0's auc: 0.851136
[8]	valid_0's auc: 0.852592
[9]	valid_0's auc: 0.854019
[10]	valid_0's auc: 0.85442
[11]	valid_0's auc: 0.854965
[12]	valid_0's auc: 0.855236
[13]	valid_0's auc: 0.855715
[14]	valid_0's auc: 0.856076
[15]	valid_0's auc: 0.856619
[16]	valid_0's auc: 0.856698
[17]	valid_0's auc: 0.85723
[18]	valid_0's auc: 0.857864
[19]	valid_0's auc: 0.858168
[20]	valid_0's auc: 0.85839
[21]	valid_0's auc: 0.858597
[22]	valid_0's auc: 0.858959
[23]	valid_0's auc: 0.85917
[24]	valid_0's auc: 0.859546
[25]	valid_0's auc: 0.859812
[26]	valid_0's auc: 0.860126
[27]	valid_0's auc: 0.860288
[28]	valid_0's auc: 0.860429
[29]	valid_0's auc: 0.860699
[30]	valid_0's auc: 0.861046
[31]	valid_0's auc: 0.861299
[32]	valid_0's auc: 0.861719
[33]	valid_0's auc: 0.86

LGBMClassifier(bagging_fraction=0.8, feature_fraction=0.8, learning_rate=0.05,
               max_bin=512, max_depth=10, metric='auc', n_estimators=200,
               num_leaves=200, objective='binary', reg_alpha=5, reg_lambda=2,
               subsample=1, subsample_for_bin=200, subsample_freq=1)

+ Other metrics: Recall, AUC_Roc score

In [20]:
print(lgbclf.score(X_test_, y_test_))

0.9066737622201199


In [24]:
from sklearn.metrics import recall_score

pred = lgbclf.predict(X_test_)
recall_score_ = recall_score(y_test_, pred)
print(recall_score_)


0.11236660389202763


In [25]:

for clf in clfs:
  y_pred = clf.predict(X_test_)
  recall_score_ = recall_score(y_test_, y_pred)
  print(recall_score_)

0.07407407407407407
0.11445909186022181


### Performance of vanilla model

In [27]:
lgbclf_vani = lgb.LGBMClassifier(**params)
lgbclf_vani.fit(X_train, y_train, eval_set=[(X_val, y_val)], categorical_feature = categorical_features, early_stopping_rounds=20)

[1]	valid_0's auc: 0.832779
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's auc: 0.841611
[3]	valid_0's auc: 0.842772
[4]	valid_0's auc: 0.844267
[5]	valid_0's auc: 0.844823
[6]	valid_0's auc: 0.844978
[7]	valid_0's auc: 0.846427
[8]	valid_0's auc: 0.846037
[9]	valid_0's auc: 0.846525
[10]	valid_0's auc: 0.84663
[11]	valid_0's auc: 0.846847
[12]	valid_0's auc: 0.847175
[13]	valid_0's auc: 0.846896
[14]	valid_0's auc: 0.847012
[15]	valid_0's auc: 0.847302
[16]	valid_0's auc: 0.847221
[17]	valid_0's auc: 0.847083
[18]	valid_0's auc: 0.847285
[19]	valid_0's auc: 0.847598
[20]	valid_0's auc: 0.847875
[21]	valid_0's auc: 0.847819
[22]	valid_0's auc: 0.847767
[23]	valid_0's auc: 0.847699
[24]	valid_0's auc: 0.848078
[25]	valid_0's auc: 0.848101
[26]	valid_0's auc: 0.848206
[27]	valid_0's auc: 0.848307
[28]	valid_0's auc: 0.848384
[29]	valid_0's auc: 0.848517
[30]	valid_0's auc: 0.848691
[31]	valid_0's auc: 0.848872
[32]	valid_0's auc: 0.848965
[33]	valid_0's auc: 

LGBMClassifier(bagging_fraction=0.8, feature_fraction=0.8, learning_rate=0.05,
               max_bin=512, max_depth=10, metric='auc', n_estimators=200,
               num_leaves=200, objective='binary', reg_alpha=5, reg_lambda=2,
               subsample=1, subsample_for_bin=200, subsample_freq=1)

In [29]:
lgbclf_vani.score(X_test, y_test)

0.9075804162724692

In [32]:
pred = lgbclf_vani.predict(X_test)
recall_score_ = recall_score(y_test, pred)
print(recall_score_)

0.08600125549278091


### Recall score on balanced data

In [33]:
X_train_, X_val_, y_train_, y_val_, X_test_, y_test_ = prepare(balanced_data, X_test, y_test, target)

In [34]:
X_ = pd.concat([X_train_, X_val_], axis=0)
y_ = pd.concat([y_train_, y_val_], axis=0)

lgb_balanced = lgb.LGBMClassifier(**params)
lgb_balanced.fit(X_, y_)
print(recall_score(y_test_, lgb_balanced.predict(X_test_)))

0.8160703075957313


### Recall score on balanced data without feature engineering

In [39]:
X_raw = balanced_data.drop(columns=[target, 'sumHealth', 'sumUnhealth', 'diff_health'])
y_raw = balanced_data[target]

lgb_balanced_vani = lgb.LGBMClassifier(**params)
lgb_balanced_vani.fit(X_raw, y_raw)
print(recall_score(y_test, lgb_balanced_vani.predict(X_test)))

0.8162795563925508
