In [193]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from catboost import Pool, cv, CatBoostClassifier, CatBoostRegressor
import pickle
import os
import gc
from sklearn.utils import resample
gc.enable()

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:20,.2f}'.format)
pd.set_option('display.max_colwidth', -1)

In [194]:
def fit_lgb(X_fit, y_fit, le_test, lgb_path, cat_col):
    
    lgb_params = {'n_estimators':5000,
                    'boosting_type': 'gbdt',
                    'objective': 'binary',
                    'metric': 'auc',
                    'subsample': 0.75,
                    'subsample_freq': 1,
                    'learning_rate': 0.1,
                    'feature_fraction': 0.9,
                    'max_depth': 10,
                    'lambda_l1': 1,  
                    'lambda_l2': 1,
                    'early_stopping_rounds': 100,
                    #'is_unbalance' : True ,
                    'scale_pos_weight' : 3,
                  'categorical_feature':"cat_col"
                  
                    }
    
    folds = StratifiedKFold(n_splits=5, shuffle=False, random_state=44000)
    oof = np.zeros(len(X_fit))
    
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_fit.values, y_fit.values)):
        print("Fold {}".format(fold_))
        trn_data = lgb.Dataset(X_fit.iloc[trn_idx], label=y_fit.iloc[trn_idx])
        val_data = lgb.Dataset(X_fit.iloc[val_idx], label=y_fit.iloc[val_idx])

        num_round = 1000000
        clf = lgb.train(lgb_params, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
        oof[val_idx] = clf.predict(X_fit.iloc[val_idx], num_iteration=clf.best_iteration)
    
    #Save LightGBM Model
    save_to = '{}_fold{}.txt'.format(lgb_path, r'\lgb')
    lgb.booster_.save_model(save_to)
    
    
    return oof
    
    
def fit_xgb(X_fit, y_fit, counter, xgb_path, cat_col):
    
    model = xgb.XGBClassifier(max_depth=2,
                              n_estimators=99,
                              colsample_bytree=0.3,
                              learning_rate=0.02,
                              objective='binary:logistic', 
                              n_jobs=-1)
     
    model.fit(X_fit, y_fit, 
              eval_set=[(X_val, y_val)], 
              verbose=0, 
              early_stopping_rounds=4000)
              
    cv_val = model.predict_proba(X_fit)[:,1]
    
    #Save XGBoost Model
    save_to = '{}_fold{}.dat'.format(xgb_path, r'\xgb', counter+1)
    pickle.dump(model, open(save_to, "wb"))
    
    return cv_val
    
    
def fit_cb(X_fit, y_fit, cb_path, cat_col):
    
    params = {'iterations':99, 'verbose':50, 'learning_rate':0.1, 'task_type':"GPU",
             "custom_metric":'AUC'}

    cb_train_pool = Pool(data=X_fit, label=y_fit, cat_features=cat_col)
    
    model = CatBoostClassifier()
                                  
    cb_cv_score = cv(cb_train_pool, plot=True, early_stopping_rounds=40, fold_count=5, params=params)
              
    print('CatBoost Scores: ')
    print(cb_cv_score)
    #Save Catboost Model          
    save_to = "{}_fold{}.mlmodel".format(cb_path, r'\cb', counter+1)
    model.save_model(save_to, format="coreml", 
                     export_parameters={'prediction_type': 'probability'})
    
    cv_val = model.predict_proba(X_fit)[:,1]
                     
    return cv_val


In [195]:
def train_stage(prepaired_data,  models_folders):  # Prepared data contains [X_train, y_train, X_test, le_X_train, le_test, categorical_columns]
    
    X_train = prepaired_data[0]
    y_df = prepaired_data[1]
    le_X_train = prepaired_data[3]
    le_test = prepaired_data[5]
    categorical_columns= prepaired_data[5]                    
    
    lgb_cv_result = np.zeros(X_train.shape[0])
    xgb_cv_result = np.zeros(X_train.shape[0])
    cb_cv_result  = np.zeros(X_train.shape[0])
    
    print('\nModel Fitting...')        
    
    print('LigthGBM')
    lgb_cv_result= fit_lgb(le_X_train, y_df, le_test, models_folders['lgb_path'], cat_col =categorical_columns)

#     print('XGBoost')
#     xgb_cv_result[ids[1]] += fit_xgb(le_X_train, y_df, models_folders['xgb_path'],
#                                      cat_col =categorical_columns)

    print('CatBoost')
    cb_cv_result= fit_cb(X_train,  y_df, models_folders['cb_path'], cat_col =categorical_columns)

    del X_fit, X_val, y_fit, y_val, le_X_fit, le_X_val
    gc.collect()
    
    auc_lgb  = round(roc_auc_score(y_df, lgb_cv_result),4)
    auc_xgb  = round(roc_auc_score(y_df, xgb_cv_result),4)
    auc_cb   = round(roc_auc_score(y_df, cb_cv_result), 4)
    auc_mean = round(roc_auc_score(y_df, (lgb_cv_result*0.4+xgb_cv_result*0.2+cb_cv_result*0.4)), 4)
    auc_mean_lgb_cb = round(roc_auc_score(y_df, (lgb_cv_result+cb_cv_result)/2), 4)
    print('\nLightGBM VAL AUC: {}'.format(auc_lgb))
    print('XGBoost  VAL AUC: {}'.format(auc_xgb))
    print('Catboost VAL AUC: {}'.format(auc_cb))
    print('Mean Catboost+LightGBM VAL AUC: {}'.format(auc_mean_lgb_cb))
    print('Mean XGBoost+Catboost+LightGBM, VAL AUC: {}\n'.format(auc_mean))
    
    return 0

In [196]:
def prediction_stage(X_test, models_folders):
    
    
    
    lgb_models = sorted(os.listdir(lgb_path))
    xgb_models = sorted(os.listdir(xgb_path))
    cb_models  = sorted(os.listdir(cb_path))
    
    lgb_result = np.zeros(df.shape[0])
    xgb_result = np.zeros(df.shape[0])
    cb_result  = np.zeros(df.shape[0])
    
    print('\nMake predictions...\n')
    
    print('With LightGBM...')
    for m_name in lgb_models:
        #Load LightGBM Model
        model = lgb.Booster(model_file='{}{}'.format(lgb_path, m_name))
        lgb_result += model.predict(df.values)
     
    print('With XGBoost...')    
    for m_name in xgb_models:
        #Load Catboost Model
        model = pickle.load(open('{}{}'.format(xgb_path, m_name), "rb"))
        xgb_result += model.predict(df.values)
    
    print('With CatBoost...')        
    for m_name in cb_models:
        #Load Catboost Model
        model = cb.CatBoostClassifier()
        model = model.load_model('{}{}'.format(cb_path, m_name), format = 'coreml')
        cb_result += model.predict(df.values, prediction_type='Probability')[:,1]
    
    lgb_result /= len(lgb_models)
    xgb_result /= len(xgb_models)
    cb_result  /= len(cb_models)
    
    submission = pd.read_csv('sample_submission.csv')
    submission['target'] = (lgb_result*0.4+xgb_result*0.2+cb_result*0.4)
    submission.to_csv('xgb_lgb_cb_starter_submission.csv', index=False)
    submission['target'] = (lgb_result+cb_result)/2
    submission.to_csv('lgb_cb_starter_submission.csv', index=False)
    submission['target'] = xgb_result
    submission.to_csv('xgb_starter_submission.csv', index=False)
    submission['target'] = lgb_result
    submission.to_csv('lgb_starter_submission.csv', index=False)
    submission['target'] = cb_result
    submission.to_csv('cb_starter_submission.csv', index=False)
    
    return 0

In [197]:
def transforme_data(train, test, categorical_cols):
    
    print('Transforming all String features to category.\n')
    for usecol in categorical_cols:
        train[usecol] = train[usecol].astype('str')
        test[usecol] = test[usecol].astype('str')

        #Fit LabelEncoder
        le_train = LabelEncoder().fit(np.unique(train[usecol].unique().tolist()))
        le_test =  LabelEncoder().fit(np.unique(test[usecol].unique().tolist()))

        #At the end 0 will be used for dropped values
        train[usecol] = le_train.transform(train[usecol])+1
        test[usecol]  = le_test.transform(test[usecol])+1

        train[usecol] = train[usecol].replace(np.nan, 0).astype('int').astype('category')
        test[usecol]  = test[usecol].replace(np.nan, 0).astype('int').astype('category')
    
#     Print('Performaing OneHot on all categorical columns')
#     onehot_encoder = OneHotEncoder(sparse=False)
#     integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
#     onehot_encoded = onehot_encoder.fit_transform(integer_encoded)        
                                      
        
    return train, test

In [198]:
def prepare_data (train_path, test_path ):
    
    df = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    
    TARGET_COL = "hospital_death"
    
    categorical_cols = ['hospital_id','ethnicity','gender','hospital_admit_source',
                         'icu_admit_source','icu_stay_type','icu_type','apache_3j_bodysystem','apache_2_bodysystem']
        
    df[categorical_cols] = df[categorical_cols].fillna("")
    test[categorical_cols] = test[categorical_cols].fillna("")
    
#     df.drop(['encounter_id', 'patient_id'], axis=1, inplace=True)
#     test.drop(['encounter_id', 'patient_id'], axis=1, inplace=True)
    

    
    X_train = df.drop([TARGET_COL],axis=1)
    y_train = df[TARGET_COL]
    
    le_X_train, le_test = transforme_data(X_train, test, categorical_cols)    
    
    

    
    return [X_train, y_train, test, le_X_train, le_test, categorical_cols]
    

In [199]:
if __name__ == '__main__':
    
    train_path = r'E:\Python\Kaggle\widsdatathon2020\input\training_v2.csv'
    test_path  = r'E:\Python\Kaggle\widsdatathon2020\input\unlabeled.csv'
    
    models_folders ={'lgb_path' : r'.\lgb_models_stack', 'xgb_path': r'.\xgb_models_stack', 
                     'cb_path' : r'.\cb_models_stack'}

    #Create dir for models
    for i in sorted(models_folders):
        if os.path.exists(models_folders[i]) == False:
            os.mkdir(models_folders[i]) 
    
    print('Prepareing Data for training.\n')
    prepaired_data = prepare_data (train_path, test_path)  # prepared data contains

    print('Train Stage.\n')
    train_stage(prepaired_data,  models_folders)
    
    print('Prediction Stage.\n')
    prediction_stage(test_path, lgb_path, xgb_path, cb_path)
    
    print('\nDone.')


Prepareing Data for training.

Transforming all String features to category.

Train Stage.


Model Fitting...
LigthGBM
Fold 0


Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[33]	training's auc: 0.930305	valid_1's auc: 0.882925
Fold 1
Training until validation scores don't improve for 100 rounds


KeyboardInterrupt: 