In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv('heart_disease_health_indicators_BRFSS2015.csv')

In [4]:
target = 'HeartDiseaseorAttack'
ordinal_features = ['Education', 'Age', 'PhysHlth', 'MentHlth','GenHlth','Income']
categorical_features = ['Sex', 'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'PhysActivity', 'Fruits', 'Veggies', 'Diabetes', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk']
continuous_features = ['BMI']
health = ['PhysActivity', 'Fruits', 'Veggies', 'NoDocbcCost']
unhealth = ['HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'Diabetes', 'HvyAlcoholConsump', 'AnyHealthcare']
inter = health + unhealth

In [5]:
## Split data into train_val and test
from sklearn.model_selection import train_test_split
train_val, test = train_test_split(data, test_size = 0.2, stratify=data[target].values, random_state=0)

X_train_val = train_val.drop(columns=[target])
y_train_val = train_val[target]
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, stratify=y_train_val.values, test_size=0.2, random_state=0)
X_test = test.drop(columns=[target])
y_test = test[target]

## Get balanced data
from numpy.random import MT19937
from numpy.random import RandomState, SeedSequence
def under_sample(label, target):
    rand_state = RandomState(MT19937(SeedSequence(123456789)))
    target_usamp = target[target == label]
    target_osamp = target[target != label]
    resampled_idx = rand_state.choice(target_osamp.index, size=len(target_usamp))
    
    return resampled_idx

rsp_idx = under_sample(1, y_train_val)
data_balanced_0 = train_val.loc[rsp_idx, :]
data_label_1 = train_val[y_train_val == 1]
print(data_balanced_0.shape, data_label_1.shape)
balanced_data = pd.concat([data_balanced_0, data_label_1], axis=0)

(19114, 22) (19114, 22)


In [6]:
from sklearn.model_selection import StratifiedKFold
def target_enc(df, cols, target):
    X, y = df[cols], df[target]
    skf = StratifiedKFold(n_splits=5, shuffle=True)
    target_mean_features = []
    for col in cols:
        colname = col + '_target_mean'
        target_mean_features.append(colname)
        df[colname] = 0
    for tr_idx, val_idx in skf.split(X,y):
        X_train, X_val = df.iloc[tr_idx], df.iloc[val_idx]
        for col in cols:
            mean = X_train.groupby(by=col)[target].mean()
            df[col+'_target_mean'].iloc[val_idx] = X_val[col].map(mean)
    prior = df[target].mean()
    df[target_mean_features].fillna(prior, inplace=True)
    
    
    #regularization
    
    for feature in target_mean_features:
        df[feature] = df[feature] * len(df) + df[feature].nunique() * prior
        df[feature] = df[feature] / (len(df) + df[feature].nunique())
    return df[target_mean_features]

def prepare_train(train, target):
    train['sumHealth'] = train[health].sum(axis=1)
    train['sumUnhealth'] = train[unhealth].sum(axis=1)
    train['diff_health'] = train['sumHealth'] - train['sumUnhealth']

    ## Label encoding and interactive labels
    features_mean_enc = categorical_features
    mean_enc_cat = target_enc(train[categorical_features+[target]], categorical_features, target)
    #mean_enc_ord = target_enc(train[ordinal_features + [target]], ordinal_features, target)
    
    train = pd.concat([train.drop(columns=features_mean_enc), mean_enc_cat], axis=1)
    
    for i in inter:
      for j in inter:
        if i!=j:
          train[i+'|'+j] = train[i+'_target_mean'] * train[j+'_target_mean']

            
    return train


def prepare_test(test, train, target):
    test['sumHealth'] = test[health].sum(axis=1)
    test['sumUnhealth'] = test[unhealth].sum(axis=1)
    test['diff_health'] = test['sumHealth'] - test['sumUnhealth']
    
    features_to_enc = categorical_features
    for feature in features_to_enc:
        test[feature + '_target_mean'] = test[feature].map(train.groupby(by=feature)[target].mean())
        
    prior = train[target].mean()
    test[features_to_enc].fillna(prior, inplace=True)
    
    #Smooth
    for feature in features_to_enc:
        test[feature] = test[feature] * len(test) + test[feature].nunique() * prior
        test[feature] = test[feature] / (len(test) + test[feature].nunique())
    
    for i in inter:
      for j in inter:
        if i!=j:
          test[i+'|'+j] = test[i+'_target_mean'] * test[j+'_target_mean']
    
    test = test.drop(columns=features_to_enc)
    return test

def prepare(train_val, X_test, y_test, target):
    train = prepare_train(train_val, target)
    X_train, X_val, y_train, y_val = train_test_split(train.drop(columns=[target]), train[target], test_size=0.1)
    test = pd.concat([X_test, y_test], axis=1)
    test_set = prepare_test(test, train_val, target)
    X_test_ = test_set.drop(columns=[target])
    y_test_ = test_set[target]


    return X_train, X_val, y_train, y_val, X_test_, y_test_

In [11]:
## Balanced data
X_train_, X_val_, y_train_, y_val_, X_test_, y_test_ = prepare(balanced_data, X_test, y_test, target)

In [12]:
## Unbalanced data
X_train_u, X_val_u, y_train_u, y_val_u, X_test_u, y_test_u = prepare(train_val, X_test, y_test, target)

In [15]:
##lgbmodel on unbalanced data

import lightgbm as lgb
params = {'boosting_type': 'gbdt', 'objective': 'binary', 'n_estimators':1000,
              'num_leaves': 100, 'learning_rate': 0.01, 'max_bin': 512, 
              'subsample_for_bin': 200, 'subsample': 1, 'subsample_freq': 1,
              'colsample_bytree': 0.8, 'reg_alpha': 5, 'reg_lambda': 10, 
              'metric': 'auc'}
lgbm = lgb.LGBMClassifier(**params)
lgbm.fit(X_train_u, y_train_u, eval_set=[(X_val_u, y_val_u)], early_stopping_rounds=20)
lgbm.score(X_test_u, y_test_u)

[1]	valid_0's auc: 0.831092
[2]	valid_0's auc: 0.83196
[3]	valid_0's auc: 0.83316
[4]	valid_0's auc: 0.833288
[5]	valid_0's auc: 0.834179
[6]	valid_0's auc: 0.837458
[7]	valid_0's auc: 0.837582
[8]	valid_0's auc: 0.839421
[9]	valid_0's auc: 0.839822
[10]	valid_0's auc: 0.840225
[11]	valid_0's auc: 0.840085
[12]	valid_0's auc: 0.840217
[13]	valid_0's auc: 0.840389
[14]	valid_0's auc: 0.840472
[15]	valid_0's auc: 0.840235
[16]	valid_0's auc: 0.840985
[17]	valid_0's auc: 0.840993
[18]	valid_0's auc: 0.841082
[19]	valid_0's auc: 0.841091
[20]	valid_0's auc: 0.84131
[21]	valid_0's auc: 0.841792
[22]	valid_0's auc: 0.841929
[23]	valid_0's auc: 0.841919
[24]	valid_0's auc: 0.841948
[25]	valid_0's auc: 0.842037
[26]	valid_0's auc: 0.842262
[27]	valid_0's auc: 0.842374
[28]	valid_0's auc: 0.842614
[29]	valid_0's auc: 0.842829
[30]	valid_0's auc: 0.842948
[31]	valid_0's auc: 0.843155
[32]	valid_0's auc: 0.843251
[33]	valid_0's auc: 0.843193
[34]	valid_0's auc: 0.843295
[35]	valid_0's auc: 0.8433

0.9076789656259855

In [16]:
##lgbmodel on balanced data
lgbm_balanced = lgb.LGBMClassifier(**params)
lgbm_balanced.fit(X_train_, y_train_, eval_set=[(X_val_, y_val_)], early_stopping_rounds=20)

[1]	valid_0's auc: 0.834164
[2]	valid_0's auc: 0.834621
[3]	valid_0's auc: 0.835682
[4]	valid_0's auc: 0.835961
[5]	valid_0's auc: 0.839162
[6]	valid_0's auc: 0.841637
[7]	valid_0's auc: 0.841117
[8]	valid_0's auc: 0.842902
[9]	valid_0's auc: 0.842954
[10]	valid_0's auc: 0.843099
[11]	valid_0's auc: 0.843119
[12]	valid_0's auc: 0.843213
[13]	valid_0's auc: 0.843214
[14]	valid_0's auc: 0.84328
[15]	valid_0's auc: 0.84321
[16]	valid_0's auc: 0.843811
[17]	valid_0's auc: 0.843755
[18]	valid_0's auc: 0.843556
[19]	valid_0's auc: 0.843901
[20]	valid_0's auc: 0.844134
[21]	valid_0's auc: 0.844292
[22]	valid_0's auc: 0.844322
[23]	valid_0's auc: 0.844305
[24]	valid_0's auc: 0.844227
[25]	valid_0's auc: 0.844469
[26]	valid_0's auc: 0.844535
[27]	valid_0's auc: 0.844371
[28]	valid_0's auc: 0.844326
[29]	valid_0's auc: 0.844351
[30]	valid_0's auc: 0.844616
[31]	valid_0's auc: 0.845011
[32]	valid_0's auc: 0.845024
[33]	valid_0's auc: 0.845041
[34]	valid_0's auc: 0.845028
[35]	valid_0's auc: 0.845

LGBMClassifier(colsample_bytree=0.8, learning_rate=0.01, max_bin=512,
               metric='auc', n_estimators=1000, num_leaves=100,
               objective='binary', reg_alpha=5, reg_lambda=10, subsample=1,
               subsample_for_bin=200, subsample_freq=1)

In [29]:
lgbm_balanced.score(X_test_, y_test)

0.7350402081362346

#### End

+ imbalanced vani

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBRFClassifier

In [24]:
clfs = [RandomForestClassifier(), LogisticRegression(), GradientBoostingClassifier(), XGBRFClassifier()]

In [25]:
for clf in clfs:
    print('---------------------')
    print(clf.__class__.__name__)
    clf.fit(X_train_u, y_train_u)
    print(classification_report(y_test, clf.predict(X_test_u)))

---------------------
RandomForestClassifier
              precision    recall  f1-score   support

         0.0       0.91      0.99      0.95     45957
         1.0       0.54      0.08      0.13      4779

    accuracy                           0.91     50736
   macro avg       0.73      0.53      0.54     50736
weighted avg       0.88      0.91      0.87     50736

---------------------
LogisticRegression
              precision    recall  f1-score   support

         0.0       0.92      0.99      0.95     45957
         1.0       0.52      0.12      0.19      4779

    accuracy                           0.91     50736
   macro avg       0.72      0.55      0.57     50736
weighted avg       0.88      0.91      0.88     50736

---------------------
GradientBoostingClassifier
              precision    recall  f1-score   support

         0.0       0.91      0.99      0.95     45957
         1.0       0.54      0.11      0.18      4779

    accuracy                           0.91    

In [None]:
from sklearn.ensemble import VotingClassifier

clf1 = RandomForestClassifier()
clf2 = LogisticRegression()
clf3 = GradientBoostingClassifier()
clf4 = XGBRFClassifier()
clf5 = lgb.LGBMClassifier(**params)

eclf = VotingClassifier(estimators=[
    ('rf', clf1),('lr', clf2), ('gbdt', clf3), ('xgb', clf4), ('lgbm', clf5)
], voting='hard')

eclf.fit(X_train_u, y_train_u)

In [27]:
eclf.score(X_test_u, y_test_u)

0.9075212866603595

In [28]:
print(classification_report(y_test, eclf.predict(X_test_u)))

              precision    recall  f1-score   support

         0.0       0.91      0.99      0.95     45957
         1.0       0.56      0.08      0.14      4779

    accuracy                           0.91     50736
   macro avg       0.74      0.54      0.55     50736
weighted avg       0.88      0.91      0.87     50736

