This notebook contains the pipeline for running cross validated LightGBM in WiDS2020 Datathon.

In [1]:
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [2]:
def label_var(data,variables_cat):
    lb = []
    for cat in variables_cat:
        le = LabelEncoder()
        lb.append(le.fit(list(data[cat].dropna())))
    
    return lb

def label_enc(data, le, categorical_features):
    i = 0
    for feature in categorical_features:
        data.loc[data[feature].notnull(), feature] = le[i].transform(data.loc[data[feature].notnull(), feature])
        i += 1

In [6]:
data_dir = '../data/'        
        
df_train = pd.read_csv(data_dir + "training_v2.csv")
df_test = pd.read_csv(data_dir + "unlabeled.csv")

useless_columns = ['encounter_id','patient_id','hospital_death','readmission_status']
train_columns = [x for x in df_train.columns if x not in useless_columns]

categorical_features = []
for col in train_columns:
    if df_train[col].dtypes == 'object':
        categorical_features.append(col)

        
df_train[categorical_features] = df_train[categorical_features].fillna("")
df_test[categorical_features] = df_test[categorical_features].fillna("")

df_concat = pd.concat([df_train[categorical_features],df_test[categorical_features]])

lb = label_var(df_concat, categorical_features)
label_enc(df_train, lb, categorical_features)
label_enc(df_test, lb, categorical_features)

for df in [df_train, df_test]:
    for feature in categorical_features:
        df[feature] = df[feature].astype(int)


categorical_index = [train_columns.index(x) for x in categorical_features]

target = df_train['hospital_death']

folds = StratifiedKFold(n_splits=6, shuffle=True, random_state=256)
oof = np.zeros(len(df_train))
scores = []
predictions = np.zeros(len(df_test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train, target.values)):
    strLog = "fold {}".format(fold_)
    print(strLog)
    
    X = df_train.iloc[trn_idx][train_columns]
    Y = df_train.iloc[trn_idx]['hospital_death']
    
    model_best = lgb.LGBMClassifier(verbose_eval=200, categorical_feature=categorical_index)
    model_best.fit(X, Y)

    oof[val_idx] = model_best.predict(df_train.iloc[val_idx][train_columns])
    
    score = roc_auc_score(target.loc[val_idx],model_best.predict(df_train.loc[val_idx,train_columns].values, prediction_type='Class'))
    
    scores.append(score)
    
    predictions += model_best.predict_proba(df_test[train_columns])[:, 1] / folds.n_splits

fold 0


Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


fold 1


Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


fold 2


Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


fold 3


Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


fold 4


Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


fold 5


Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


In [8]:
strAUC = roc_auc_score(target, oof)
print(strAUC)
print ("mean: "+str(np.mean(np.array(scores))))
print ("std: "+str(np.std(np.array(scores))))

0.674665942678357
mean: 0.6746662487534879
std: 0.004620264871866327
