In [30]:
import gc
import os
import re
import warnings

import numpy as np
import pandas as pd
# import xgboost
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
from bayes_opt import BayesianOptimization

from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc,
                             roc_curve, recall_score, classification_report, f1_score,
                             precision_recall_fscore_support, mean_absolute_error)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from scipy.stats import randint as sp_randInt
from scipy.stats import uniform as sp_randFloat
from sklearn.model_selection import RandomizedSearchCV

from catboost import Pool, cv, CatBoostClassifier

In [31]:
# Name of the label column that every model in this notebook predicts.
TARGET_COL = "hospital_death"

# Build the paths with os.path.join instead of hard-coding ".\input\..." so the
# notebook also runs on systems where "\" is not a path separator.
DATA_DIR = "input"
df = pd.read_csv(os.path.join(DATA_DIR, "training_v2.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "unlabeled.csv"))

In [32]:
# Columns handed to CatBoost as categorical features and label-encoded for LightGBM.
categorical_cols = [
    'hospital_id',
    'ethnicity',
    'gender',
    'hospital_admit_source',
    'icu_admit_source',
    'icu_stay_type',
    'icu_type',
    'apache_3j_bodysystem',
    'apache_2_bodysystem',
]

# Alternative column set kept for reference (not used):
# ['apache_3j_bodysystem', 'apache_2_bodysystem',
#  "hospital_admit_source", "icu_id", "ethnicity"]

In [33]:
# Replace missing values in the categorical columns with an empty string so that
# a missing value becomes its own category value.
df[categorical_cols] = df[categorical_cols].fillna(value="")

# The unlabeled frame gets exactly the same treatment as the training frame.
test[categorical_cols] = test[categorical_cols].fillna(value="")

## Train model(s)

In [34]:
## Tip: the "hidden" helper df._get_numeric_data() returns only the numeric columns
## of a pandas DataFrame, which is handy for scikit-learn models.

# Features are every column except the label; the label is kept as its own Series.
X_train = df.drop(columns=[TARGET_COL])
y_train = df[TARGET_COL]


In [35]:
## CatBoost Pool object
# Show the feature frame and the label series before wrapping them in a Pool.
display(X_train, y_train)

# The Pool bundles features, labels and the categorical column names for CatBoost.
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=categorical_cols,
)

### OPT/TODO: do a train/test split for early stopping, then add the held-out part as an eval Pool.

Unnamed: 0,encounter_id,patient_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_id,icu_stay_type,icu_type,pre_icu_los_days,readmission_status,weight,albumin_apache,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,arf_apache,bilirubin_apache,bun_apache,creatinine_apache,fio2_apache,gcs_eyes_apache,gcs_motor_apache,gcs_unable_apache,gcs_verbal_apache,glucose_apache,heart_rate_apache,hematocrit_apache,intubated_apache,map_apache,paco2_apache,paco2_for_ph_apache,pao2_apache,ph_apache,resprate_apache,sodium_apache,temp_apache,urineoutput_apache,ventilated_apache,wbc_apache,d1_diasbp_invasive_max,d1_diasbp_invasive_min,d1_diasbp_max,d1_diasbp_min,d1_diasbp_noninvasive_max,d1_diasbp_noninvasive_min,d1_heartrate_max,d1_heartrate_min,d1_mbp_invasive_max,d1_mbp_invasive_min,d1_mbp_max,d1_mbp_min,d1_mbp_noninvasive_max,d1_mbp_noninvasive_min,d1_resprate_max,d1_resprate_min,d1_spo2_max,d1_spo2_min,d1_sysbp_invasive_max,d1_sysbp_invasive_min,d1_sysbp_max,d1_sysbp_min,d1_sysbp_noninvasive_max,d1_sysbp_noninvasive_min,d1_temp_max,d1_temp_min,h1_diasbp_invasive_max,h1_diasbp_invasive_min,h1_diasbp_max,h1_diasbp_min,h1_diasbp_noninvasive_max,h1_diasbp_noninvasive_min,h1_heartrate_max,h1_heartrate_min,h1_mbp_invasive_max,h1_mbp_invasive_min,h1_mbp_max,h1_mbp_min,h1_mbp_noninvasive_max,h1_mbp_noninvasive_min,h1_resprate_max,h1_resprate_min,h1_spo2_max,h1_spo2_min,h1_sysbp_invasive_max,h1_sysbp_invasive_min,h1_sysbp_max,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,h1_temp_max,h1_temp_min,d1_albumin_max,d1_albumin_min,d1_bilirubin_max,d1_bilirubin_min,d1_bun_max,d1_bun_min,d1_calcium_max,d1_calcium_min,d1_creatinine_max,d1_creatinine_min,d1_glucose_max,d1_glucose_min,d1_hco3_max,d1_hco3_min,d1_hemaglobin_max,d1_hemaglobin_min,d1_hematocrit_max,d1_hematocrit_min,d1_inr_max,d1_inr_min,d1_lactate_max,d1_lactate_min,d1_platelets_max,d1_platelets_min,d1_potassium_max,d1_potassium_min,d1_sodium_max,d1_sod
ium_min,d1_wbc_max,d1_wbc_min,h1_albumin_max,h1_albumin_min,h1_bilirubin_max,h1_bilirubin_min,h1_bun_max,h1_bun_min,h1_calcium_max,h1_calcium_min,h1_creatinine_max,h1_creatinine_min,h1_glucose_max,h1_glucose_min,h1_hco3_max,h1_hco3_min,h1_hemaglobin_max,h1_hemaglobin_min,h1_hematocrit_max,h1_hematocrit_min,h1_inr_max,h1_inr_min,h1_lactate_max,h1_lactate_min,h1_platelets_max,h1_platelets_min,h1_potassium_max,h1_potassium_min,h1_sodium_max,h1_sodium_min,h1_wbc_max,h1_wbc_min,d1_arterial_pco2_max,d1_arterial_pco2_min,d1_arterial_ph_max,d1_arterial_ph_min,d1_arterial_po2_max,d1_arterial_po2_min,d1_pao2fio2ratio_max,d1_pao2fio2ratio_min,h1_arterial_pco2_max,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_ph_min,h1_arterial_po2_max,h1_arterial_po2_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,apache_4a_hospital_death_prob,apache_4a_icu_death_prob,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,66154,25312,118,68.00,22.73,0,Caucasian,M,180.30,Floor,Floor,92,admit,CTICU,0.54,0,73.90,2.30,113.00,502.01,0,0.00,0.40,31.00,2.51,,3.00,6.00,0.00,4.00,168.00,118.00,27.40,0.00,40.00,,,,,36.00,134.00,39.30,,0.00,14.10,46.00,32.00,68.00,37.00,68.00,37.00,119.00,72.00,66.00,40.00,89.00,46.00,89.00,46.00,34.00,10.00,100.00,74.00,122.00,64.00,131.00,73.00,131.00,73.00,39.90,37.20,,,68.00,63.00,68.00,63.00,119.00,108.00,,,86.00,85.00,86.00,85.00,26.00,18.00,100.00,74.00,,,131.00,115.00,131.00,115.00,39.50,37.50,2.30,2.30,0.40,0.40,31.00,30.00,8.50,7.40,2.51,2.23,168.00,109.00,19.00,15.00,8.90,8.90,27.40,27.40,,,1.30,1.00,233.00,233.00,4.00,3.40,136.00,134.00,14.10,14.10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.10,0.05,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,Sepsis,Cardiovascular
1,114252,59342,81,77.00,27.42,0,Caucasian,F,160.00,Floor,Floor,90,admit,Med-Surg ICU,0.93,0,70.20,,108.00,203.01,0,0.00,,9.00,0.56,1.00,1.00,3.00,0.00,1.00,145.00,120.00,36.90,0.00,46.00,37.00,37.00,51.00,7.45,33.00,145.00,35.10,,1.00,12.70,,,95.00,31.00,95.00,31.00,118.00,72.00,,,120.00,38.00,120.00,38.00,32.00,12.00,100.00,70.00,,,159.00,67.00,159.00,67.00,36.30,35.10,,,61.00,48.00,61.00,48.00,114.00,100.00,,,85.00,57.00,85.00,57.00,31.00,28.00,95.00,70.00,,,95.00,71.00,95.00,71.00,36.30,36.30,1.60,1.60,0.50,0.50,11.00,9.00,8.60,8.00,0.71,0.56,145.00,128.00,27.00,26.00,11.30,11.10,36.90,36.10,1.30,1.30,3.50,3.50,557.00,487.00,4.20,3.80,145.00,145.00,23.30,12.70,,,,,9.00,9.00,8.60,8.60,0.56,0.56,145.00,143.00,27.00,27.00,11.30,11.30,36.90,36.90,1.30,1.30,3.50,3.50,557.00,557.00,4.20,4.20,145.00,145.00,12.70,12.70,37.00,37.00,7.45,7.45,51.00,51.00,54.80,51.00,37.00,37.00,7.45,7.45,51.00,51.00,51.00,51.00,0.47,0.29,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,Respiratory,Respiratory
2,119783,50777,118,25.00,31.95,0,Caucasian,F,172.70,Emergency Department,Accident & Emergency,93,admit,Med-Surg ICU,0.00,0,95.30,,122.00,703.03,0,0.00,,,,,3.00,6.00,0.00,5.00,,102.00,,0.00,68.00,,,,,37.00,,36.70,,0.00,,,,88.00,48.00,88.00,48.00,96.00,68.00,,,102.00,68.00,102.00,68.00,21.00,8.00,98.00,91.00,,,148.00,105.00,148.00,105.00,37.00,36.70,,,88.00,58.00,88.00,58.00,96.00,78.00,,,91.00,83.00,91.00,83.00,20.00,16.00,98.00,91.00,,,148.00,124.00,148.00,124.00,36.70,36.70,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Metabolic,Metabolic
3,79267,46918,118,81.00,22.64,1,Caucasian,F,165.10,Operating Room,Operating Room / Recovery,92,admit,CTICU,0.00,0,61.70,,203.00,1206.03,1,0.00,,,,0.60,4.00,6.00,0.00,5.00,185.00,114.00,25.90,1.00,60.00,30.00,30.00,142.00,7.39,4.00,,34.80,,1.00,8.00,62.00,30.00,48.00,42.00,48.00,42.00,116.00,92.00,92.00,52.00,84.00,84.00,84.00,84.00,23.00,7.00,100.00,95.00,164.00,78.00,158.00,84.00,158.00,84.00,38.00,34.80,62.00,44.00,62.00,44.00,,,100.00,96.00,92.00,71.00,92.00,71.00,,,12.00,11.00,100.00,99.00,136.00,106.00,136.00,106.00,,,35.60,34.80,,,,,,,,,,,185.00,88.00,,,11.60,8.90,34.00,25.90,1.60,1.10,,,198.00,43.00,5.00,3.50,,,9.00,8.00,,,,,,,,,,,,,,,11.60,11.60,34.00,34.00,1.60,1.10,,,43.00,43.00,,,,,8.80,8.80,37.00,27.00,7.44,7.34,337.00,102.00,342.50,236.67,36.00,33.00,7.37,7.34,337.00,265.00,337.00,337.00,0.04,0.03,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Cardiovascular,Cardiovascular
4,92056,34377,33,19.00,,0,Caucasian,M,188.00,,Accident & Emergency,91,admit,Med-Surg ICU,0.07,0,,,119.00,601.01,0,0.00,,,,,,,,,,60.00,,0.00,103.00,,,,,16.00,,36.70,,0.00,,,,99.00,57.00,99.00,57.00,89.00,60.00,,,104.00,90.00,104.00,90.00,18.00,16.00,100.00,96.00,,,147.00,120.00,147.00,120.00,37.20,36.70,,,99.00,68.00,99.00,68.00,89.00,76.00,,,104.00,92.00,104.00,92.00,,,100.00,100.00,,,130.00,120.00,130.00,120.00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Trauma,Trauma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91708,91592,78108,30,75.00,23.06,0,Caucasian,M,177.80,Acute Care/Floor,Floor,927,admit,Cardiac ICU,0.30,0,72.90,,113.00,501.06,0,0.00,,,,,4.00,6.00,0.00,5.00,381.00,115.00,,0.00,48.00,,,,,9.00,,36.60,,1.00,,,,104.00,44.00,104.00,44.00,115.00,70.00,,,109.00,48.00,109.00,48.00,27.00,9.00,100.00,85.00,,,128.00,67.00,128.00,67.00,38.20,36.60,,,55.00,44.00,55.00,44.00,111.00,96.00,,,64.00,55.00,64.00,55.00,24.00,16.00,98.00,97.00,,,94.00,87.00,94.00,87.00,38.20,38.20,,,,,27.00,27.00,8.20,8.20,1.10,1.10,400.00,187.00,28.00,28.00,13.20,13.20,39.00,39.00,1.10,1.10,,,92.00,92.00,4.20,4.20,140.00,140.00,5.68,5.68,,,,,27.00,27.00,8.20,8.20,1.10,1.10,208.00,208.00,28.00,28.00,13.20,13.20,39.00,39.00,1.10,1.10,,,92.00,92.00,4.20,4.20,140.00,140.00,5.68,5.68,48.00,48.00,7.34,7.34,144.00,144.00,,,48.00,48.00,7.34,7.34,144.00,144.00,,,0.12,0.05,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,Sepsis,Cardiovascular
91709,66119,13486,121,56.00,47.18,0,Caucasian,F,183.00,Emergency Department,Floor,925,admit,Med-Surg ICU,0.12,0,158.00,,113.00,501.05,0,0.00,,34.00,2.30,,4.00,6.00,0.00,5.00,177.00,100.00,33.00,0.00,62.00,,,,,33.00,136.00,37.40,,0.00,4.22,94.00,34.00,73.00,40.00,73.00,40.00,100.00,76.00,166.00,30.00,82.00,47.00,82.00,47.00,33.00,12.00,100.00,45.00,131.00,81.00,127.00,65.00,127.00,65.00,37.40,36.10,,,64.00,56.00,64.00,56.00,89.00,83.00,,,78.00,68.00,78.00,68.00,28.00,22.00,99.00,96.00,,,127.00,107.00,127.00,107.00,36.10,36.10,,,,,34.00,34.00,7.80,7.80,2.30,2.30,177.00,177.00,30.00,30.00,10.50,10.50,33.00,33.00,,,1.40,1.40,133.00,133.00,3.80,3.80,136.00,136.00,4.22,4.22,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,85.00,58.00,7.26,7.15,187.00,63.00,187.00,63.00,,,,,,,,,0.03,0.02,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Sepsis,Cardiovascular
91710,8981,58179,195,48.00,27.24,0,Caucasian,M,170.20,Emergency Department,Accident & Emergency,908,admit,Med-Surg ICU,0.05,0,78.90,2.90,123.00,702.01,0,0.00,,33.00,2.30,,3.00,6.00,0.00,4.00,538.00,158.00,36.00,0.00,57.00,,,,,4.00,135.00,35.80,,0.00,17.55,,,77.00,49.00,77.00,49.00,158.00,79.00,,,82.00,57.00,82.00,57.00,34.00,0.00,99.00,81.00,,,110.00,75.00,110.00,75.00,36.90,35.80,,,,,,,,,,,,,,,,,,,,,,,,,36.60,36.60,3.10,2.80,,,33.00,18.00,7.50,6.60,2.30,1.00,538.00,86.00,20.00,7.00,12.30,12.30,36.00,36.00,,,,,227.00,227.00,4.70,3.20,139.00,135.00,17.55,17.55,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,20.00,14.90,7.36,6.93,136.00,60.00,,,15.00,15.00,6.93,6.93,136.00,136.00,,,0.05,0.02,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,Metabolic,Metabolic
91711,33776,120598,66,,23.30,0,Caucasian,F,154.90,Emergency Department,Accident & Emergency,922,admit,Med-Surg ICU,0.08,0,55.90,,108.00,203.01,0,0.00,,,,,4.00,5.00,0.00,4.00,,60.00,,0.00,54.00,,,,,14.00,,36.30,,0.00,,,,99.00,32.00,99.00,32.00,82.00,60.00,,,108.00,54.00,108.00,54.00,23.00,14.00,99.00,83.00,,,153.00,111.00,153.00,111.00,36.50,36.30,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,39.00,39.00,7.37,7.37,55.00,55.00,137.50,137.50,,,,,,,,,0.07,0.02,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Respiratory,Respiratory


0        0
1        0
2        0
3        0
4        0
        ..
91708    0
91709    0
91710    0
91711    0
91712    0
Name: hospital_death, Length: 91713, dtype: int64

## Train a basic model

In [37]:
# Small baseline CatBoost model trained on the full training Pool.
# NOTE(review): no eval_set is passed to fit(), so early_stopping_rounds presumably
# has nothing to monitor here - confirm against the CatBoost docs.
model_basic = CatBoostClassifier(
    verbose=False,
    iterations=40,
    learning_rate=0.1,
    task_type="GPU",
)
model_basic.fit(
    train_pool,
    plot=True,
    silent=True,
    early_stopping_rounds=40,
)
print(model_basic.get_best_score())

TypeError: __init__() got an unexpected keyword argument 'iterations'

In [None]:
def cat_bayesian(
    num_leaves,  # int
    learning_rate,
    lambda_l2,
    max_depth,  # int
    min_data_in_leaf=20,  # int
    min_sum_hessian_in_leaf=1e-3,
    feature_fraction=1.0,
    lambda_l1=0.0,
    min_gain_to_split=0.0,
):
    """Objective function for Bayesian optimisation: train LightGBM, return validation AUC.

    Despite its name, this function trains a LightGBM booster (``lgb.train``),
    not a CatBoost model.

    The first four parameters are the ones being searched. The remaining
    parameters were read by the original body without ever being defined;
    they are now optional arguments whose defaults are intended to match
    LightGBM's documented defaults, so existing callers keep working.

    Parameters
    ----------
    num_leaves, max_depth, min_data_in_leaf : float or int
        Cast to ``int`` before use, because an optimiser proposes floats while
        LightGBM needs integers for these settings.
    learning_rate, lambda_l1, lambda_l2, feature_fraction,
    min_sum_hessian_in_leaf, min_gain_to_split : float
        Passed to LightGBM unchanged.

    Returns
    -------
    float
        ROC AUC of the trained booster on the validation rows.

    Notes
    -----
    Reads the module-level names ``train_df``, ``predictors``, ``target``,
    ``bayesian_tr_index`` and ``bayesian_val_index``.
    NOTE(review): none of these are defined in the visible part of this
    notebook - they must exist before this function is called.
    """
    # The optimiser hands over floats; LightGBM requires integers here.
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)

    param = {
        'num_leaves': num_leaves,
        'max_bin': 63,
        'min_data_in_leaf': min_data_in_leaf,
        'learning_rate': learning_rate,
        'min_sum_hessian_in_leaf': min_sum_hessian_in_leaf,
        'bagging_fraction': 1.0,
        'bagging_freq': 5,
        'feature_fraction': feature_fraction,
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'min_gain_to_split': min_gain_to_split,
        'max_depth': max_depth,
        'save_binary': True,
        'seed': 1337,
        'feature_fraction_seed': 1337,
        'bagging_seed': 1337,
        'drop_seed': 1337,
        'data_random_seed': 1337,
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'verbose': 1,
        'metric': 'auc',
        'is_unbalance': True,
        'boost_from_average': False,
    }

    # Slice the training / validation rows once and reuse them below.
    tr_rows = train_df.iloc[bayesian_tr_index]
    val_rows = train_df.iloc[bayesian_val_index]
    val_features = val_rows[predictors].values
    val_labels = val_rows[target].values

    xg_train = lgb.Dataset(tr_rows[predictors].values,
                           label=tr_rows[target].values,
                           feature_name=predictors,
                           free_raw_data=False
                           )
    xg_valid = lgb.Dataset(val_features,
                           label=val_labels,
                           feature_name=predictors,
                           free_raw_data=False
                           )

    num_round = 5000
    clf = lgb.train(param, xg_train, num_round, valid_sets=[xg_valid], verbose_eval=250, early_stopping_rounds=50)

    # Score with the iteration chosen by early stopping, not the last one trained.
    predictions = clf.predict(val_features, num_iteration=clf.best_iteration)

    score = metrics.roc_auc_score(val_labels, predictions)

    return score

In [None]:
### Hyperparameter tuning example for CatBoost (randomized search).
# Only the hyperparameters that are actually searched belong in the grid; the
# scipy distributions are sampled by CatBoost's randomized_search.
grid = {'learning_rate': sp_randFloat(0.01, 0.2),
        'depth': sp_randInt(2, 12),
        'l2_leaf_reg': sp_randInt(1, 11)}

# Settings that stay fixed for every candidate go on the estimator itself.
# eval_metric='AUC' replaces the former (unsupported) scoring='auc' argument.
# NOTE(review): confirm that randomized_search ranks candidates by eval_metric.
model = CatBoostClassifier(iterations=500000,
                           eval_metric='AUC',
                           custom_metric=['AUC'],  # 'Logloss' is the alternative
                           task_type="GPU",
                           early_stopping_rounds=400)

# The keyword set used here (param_distributions, X, cv, n_iter, refit,
# stratified, partition_random_seed, plot) is the interface of CatBoost's own
# randomized_search. bayes_opt.BayesianOptimization expects an objective
# function plus bounds and cannot be called this way.
grid_search_result = model.randomized_search(param_distributions=grid,
                                             X=train_pool,
                                             plot=True,
                                             partition_random_seed=42,
                                             cv=2,
                                             refit=True,
                                             n_iter=100,
                                             stratified=True,
                                             )

# refit=True leaves `model` fitted, so get_best_score() has something to report.
print(model.get_best_score())

print("best model params: \n", grid_search_result["params"])

In [None]:
# Score the unlabeled rows with the tuned CatBoost model and write the submission.
# errors="ignore" keeps this cell working whether or not the unlabeled frame
# already carries the target column.
test_features = test.drop(columns=[TARGET_COL], errors="ignore")
# Column 1 of the probability output is the probability of the positive class.
test[TARGET_COL] = model.predict(test_features, prediction_type='Probability')[:, 1]
test[["encounter_id", "hospital_death"]].to_csv("submissionCB.csv", index=False)

print('CT submission file created')

In [None]:
# Store the CatBoost probability for every training row as a new feature column.
df['CTboost'] = model.predict(X_train, prediction_type='Probability')[:, 1]
# Written with the ".csv" extension because the reload cell further down reads
# "New_X_includingCatboost.csv".
df.to_csv("New_X_includingCatboost.csv", index=False)

# The new column lives on `df`; X_train was derived from `df` before the column
# was added, so it does not contain it.
false_positive_rate, recall, thresholds = roc_curve(y_train, df['CTboost'])

print('False Positive rate: ')
print(false_positive_rate)

print('Recall rate: ')
print(recall)

In [None]:
# Reload the training frame that now includes the CatBoost prediction column,
# together with a fresh copy of the unlabeled data.
train = pd.read_csv("New_X_includingCatboost.csv")
# A plain (non-raw) ".\input\unlabeled.csv" literal contains "\u", which Python
# rejects as a malformed unicode escape; build the path portably instead.
test = pd.read_csv(os.path.join("input", "unlabeled.csv"))

In [None]:
def eval_auc(pred, real):
    """Return the area under the ROC curve of scores ``pred`` against labels ``real``.

    Note the argument order: predictions first, ground truth second.
    """
    fpr, tpr, _ = roc_curve(real, pred)
    return auc(fpr, tpr)

In [None]:
class Base_Model(object):
    """Cross-validated training harness shared by the concrete model wrappers.

    Constructing an instance immediately runs the whole stratified K-fold
    training loop (see ``fit``) and stores its results on the instance:

    * ``y_pred``   - test-set predictions averaged over the folds
    * ``score``    - AUC of the out-of-fold predictions
    * ``model``    - the model trained on the last fold
    * ``oof_pred`` - out-of-fold prediction for every training row

    Subclasses must implement ``train_model``, ``convert_dataset`` and
    ``get_params``; they may override ``set_params`` and ``convert_x``.
    """

    def __init__(self, train_df, test_df, features, categoricals=None, n_splits=5, verbose=True, ps=None):
        """Store the inputs, resolve the parameters and run the CV training.

        Parameters
        ----------
        train_df, test_df : pandas.DataFrame
            Training frame (must contain the target column) and test frame.
        features : list of str
            Columns fed to the model.
        categoricals : list of str, optional
            Categorical feature names; defaults to an empty list.
        n_splits : int
            Number of stratified folds.
        verbose : bool
            Whether to print the overall out-of-fold score.
        ps : dict, optional
            Parameter overrides forwarded to ``set_params``.
        """
        self.train_df = train_df
        self.test_df = test_df
        self.features = features
        self.n_splits = n_splits
        # None defaults avoid sharing one mutable list/dict between instances.
        self.categoricals = [] if categoricals is None else categoricals
        self.target = 'hospital_death'
        self.cv = self.get_cv()
        self.verbose = verbose
        # set_params always receives a dict, never None.
        self.params = self.set_params({} if ps is None else ps)
        self.y_pred, self.score, self.model, self.oof_pred = self.fit()

    def train_model(self, train_set, val_set):
        """Train and return a model for one fold (subclass hook)."""
        raise NotImplementedError

    def get_cv(self):
        """Return the (train_idx, val_idx) generator of a seeded stratified K-fold.

        The result is a one-shot generator; ``fit`` consumes it.
        """
        cv = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=42)
        return cv.split(self.train_df, self.train_df[self.target])

    def get_params(self):
        """Return the default model parameters as a dict (subclass hook)."""
        raise NotImplementedError

    def set_params(self, ps=None):
        """Return the default parameters with the overrides in ``ps`` applied.

        ``__init__`` calls this method, so the base class provides a generic
        implementation; subclasses may override it to validate or cast values.
        """
        params = self.get_params()
        if ps:
            params.update(ps)
        return params

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Wrap one fold's data as ``(train_set, val_set)`` (subclass hook)."""
        raise NotImplementedError

    def convert_x(self, x):
        """Convert a feature frame into the model's predict input; identity by default."""
        return x

    def fit(self):
        """Run the K-fold loop.

        Returns
        -------
        tuple
            ``(y_pred, loss_score, model, oof_pred)`` as described on the class.
        """
        oof_pred = np.zeros((len(self.train_df), ))
        y_pred = np.zeros((len(self.test_df), ))
        features_df = self.train_df[self.features]
        target = self.train_df[self.target]
        for fold, (train_idx, val_idx) in enumerate(self.cv):
            x_train, x_val = features_df.iloc[train_idx], features_df.iloc[val_idx]
            # The fold indices are positional, so use .iloc for the target too;
            # plain [] indexing looks up labels and only matches for a default index.
            y_train, y_val = target.iloc[train_idx], target.iloc[val_idx]
            train_set, val_set = self.convert_dataset(x_train, y_train, x_val, y_val)
            model = self.train_model(train_set, val_set)
            conv_x_val = self.convert_x(x_val)
            oof_pred[val_idx] = model.predict(conv_x_val).reshape(oof_pred[val_idx].shape)
            x_test = self.convert_x(self.test_df[self.features])
            # Each fold contributes 1/n_splits of the final test prediction.
            y_pred += model.predict(x_test).reshape(y_pred.shape) / self.n_splits

            print('Partial score of fold {} is: {}'.format(fold, eval_auc(oof_pred[val_idx], y_val)))
        loss_score = eval_auc(oof_pred, target.values)
        if self.verbose:
            print('Our oof AUC score is: ', loss_score)
        return y_pred, loss_score, model, oof_pred

In [None]:
class Lgb_Model(Base_Model):
    """LightGBM implementation of the ``Base_Model`` hooks."""

    # Overridable parameters that LightGBM needs as integers.
    _INT_PARAMS = ('subsample_freq', 'max_depth')
    # Overridable parameters that are passed through unchanged.
    _FLOAT_PARAMS = ('learning_rate', 'feature_fraction', 'lambda_l1',
                     'lambda_l2', 'scale_pos_weight')

    def train_model(self, train_set, val_set):
        """Train a booster for one fold, evaluating on both train and validation sets."""
        verbosity = 100 if self.verbose else 0
        return lgb.train(self.params, train_set, valid_sets=[train_set, val_set], verbose_eval=verbosity)

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Wrap one fold's frames as ``lgb.Dataset`` objects with the categorical columns declared."""
        train_set = lgb.Dataset(x_train, y_train, categorical_feature=self.categoricals)
        val_set = lgb.Dataset(x_val, y_val, categorical_feature=self.categoricals)
        return train_set, val_set

    def get_params(self):
        """Return a fresh dict of the default LightGBM parameters."""
        params = {'n_estimators': 500000,
                  'boosting_type': 'gbdt',
                  'objective': 'binary',
                  'metric': 'auc',
                  'subsample': 0.75,
                  'subsample_freq': 1,
                  'learning_rate': 0.1,
                  'feature_fraction': 0.9,
                  'max_depth': 15,
                  'lambda_l1': 1,
                  'lambda_l2': 1,
                  #'is_unbalance' : True ,
                  'scale_pos_weight': 3,
                  # The key used to appear twice (100, then 400); the later
                  # value wins in a dict literal, so only 400 is kept.
                  'early_stopping_rounds': 400,
                  }
        return params

    def set_params(self, ps=None):
        """Return the defaults with any recognised overrides from ``ps`` applied.

        Each recognised key is applied independently, so a partial override
        dict works. Integer-valued settings are cast with ``int`` because the
        optimiser proposes floats. Keys outside the recognised set are ignored.
        """
        params = self.get_params()
        if not ps:
            return params
        for key in self._INT_PARAMS:
            if key in ps:
                params[key] = int(ps[key])
        for key in self._FLOAT_PARAMS:
            if key in ps:
                params[key] = ps[key]
        return params

In [None]:
print('Transform all String features to category.\n')

for usecol in categorical_cols:
    # Work on plain strings in both frames so they share one label vocabulary.
    train[usecol] = train[usecol].astype('str')
    test[usecol] = test[usecol].astype('str')

    # Fit the encoder on the union of the labels seen in either frame.
    seen_labels = train[usecol].unique().tolist() + test[usecol].unique().tolist()
    le = LabelEncoder().fit(np.unique(seen_labels))

    for frame in (train, test):
        # Shift the codes by one: at the end 0 will be used for dropped values.
        frame[usecol] = le.transform(frame[usecol]) + 1
        frame[usecol] = frame[usecol].replace(np.nan, 0).astype('int').astype('category')

In [None]:
def LGB_Beyes(subsample_freq,
              learning_rate,
              feature_fraction,
              max_depth,
              lambda_l1,
              lambda_l2,
              scale_pos_weight):
    """Objective for the Bayesian search: cross-validate LightGBM, return its OOF AUC.

    Every call builds an ``Lgb_Model``, which trains on all folds during
    construction. Reads the module-level ``train``, ``test``, ``features``
    and ``categoricals_features``.
    NOTE(review): ``features`` and ``categoricals_features`` are not defined in
    the visible part of this notebook - confirm they exist before running.
    """
    params = {
        'subsample_freq': subsample_freq,
        'learning_rate': learning_rate,
        'feature_fraction': feature_fraction,
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'max_depth': max_depth,
        'scale_pos_weight': scale_pos_weight,
    }

    lgb_model = Lgb_Model(train, test, features, categoricals=categoricals_features, ps=params)
    auc_score = lgb_model.score
    print('auc: ', auc_score)
    return auc_score

# Search space: (lower, upper) bound for each argument of LGB_Beyes.
bounds_LGB = {
    'subsample_freq': (1, 10),
    'learning_rate': (0.005, 0.2),
    'feature_fraction': (0.5, 1),
    'lambda_l1': (0, 5),
    'lambda_l2': (0, 5),
    'max_depth': (3, 17),
    'scale_pos_weight': (1, 10),
}

# Expensive cell: the search calls LGB_Beyes init_points + n_iter times, and
# every call runs a full cross-validated LightGBM training.
LGB_BO = BayesianOptimization(LGB_Beyes, bounds_LGB, random_state=1029)
init_points = 16
n_iter = 100
# `warnings` is imported in the notebook's import cell.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(init_points=init_points, n_iter=n_iter, acq='ucb', xi=0.0, alpha=1e-6)

In [None]:
# Retrain with the best parameter set found by the Bayesian search.
lgb_model = Lgb_Model(
    train_df=train,
    test_df=test,
    features=features,
    categoricals=categoricals_features,
    ps=LGB_BO.max['params'],
)

In [None]:
# Fold-averaged LightGBM predictions for the unlabeled rows -> submission file.
test["hospital_death"] = lgb_model.y_pred
test[["encounter_id","hospital_death"]].to_csv("submissionLightGB.csv",index=False)

# Add the LightGBM score as a feature on the encoded training frame.
# `lgb_model.model` is the booster from the last CV fold, and it was trained on
# train[features], so predict on those same columns rather than on the raw
# X_train frame.
train['Light_gb'] = lgb_model.model.predict(train[features])
# Save the frame that actually holds the new column.
train.to_csv("New_X_includingCatboost_and_LightGB",index=False)