<a href="https://colab.research.google.com/github/anuj0456/deep_learning_for_coders/blob/master/competitions/kg_ensemble_tabular_playground_feb_2021.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --no-warn-conflicts -q --upgrade xgboost

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
from functools import partial
from xgboost import DMatrix
import lightgbm as lgbm
import xgboost as xgb 
import seaborn as sns
import pandas as pd
import numpy as np
import warnings

# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings emitted by the libraries
# imported above; temporarily remove it when debugging version issues.
warnings.filterwarnings('ignore')

In [None]:
# Read the training and test tables from the Colab working directory.
train, test = map(pd.read_csv, ('/content/train.csv', '/content/test.csv'))

In [None]:
# Separate the regression target from the predictors and drop the row
# identifier, which is not used as a feature.
y_train = train['target']
X_train = train.drop(columns=['id', 'target'])
X_test = test.drop(columns=['id'])

In [None]:
# Column groups are identified by name: anything containing 'cat' is treated as
# categorical, anything containing 'con' as continuous.
cat_cols = [feature for feature in train.columns if 'cat' in feature]
cont_cols = [feature for feature in train.columns if 'con' in feature]

# Integer-encode every categorical column.
for feature in cat_cols:
    le = LabelEncoder()
    # Fit on the raw values of train AND test so that a category appearing only
    # in the test set does not raise ValueError at transform time. When test has
    # no unseen categories the resulting codes are the same as fitting on train.
    le.fit(pd.concat([train[feature], test[feature]], ignore_index=True))
    # Always transform from the untouched raw frames: this keeps the cell
    # idempotent, so re-running it does not try to encode already-encoded ints.
    X_train[feature] = le.transform(train[feature])
    X_test[feature] = le.transform(test[feature])


In [None]:
# --- Cross-validation ---
seed = 0          # random seed for the K-fold split and for CatBoost
n_splits = 5      # number of CV folds
shuffle = True    # shuffle rows before splitting into folds

# --- Boosting budget (shared by all three libraries) ---
iterations = 50000            # upper bound on boosting rounds
early_stopping_rounds = 400   # rounds without validation improvement before stopping
verbose_eval = 0              # 0 silences per-iteration training logs

# --- Baseline stacking ---
baseline_rounds = 5           # CatBoost -> XGBoost refinement rounds per fold

# --- Base learning rates (divided by the round number during stacking) ---
cb_learning_rate = 0.006
xgb_learning_rate = 0.01

In [None]:
# K-fold splitter shared by every model so that all out-of-fold predictions
# line up. `shuffle` comes from the configuration cell instead of being
# hardcoded, so changing it there actually takes effect.
split = KFold(n_splits=n_splits, shuffle=shuffle, random_state=seed)

In [None]:
# CatBoost hyper-parameters (GPU training, RMSE objective and metric).
cb_params = dict(
    iterations=iterations,
    learning_rate=cb_learning_rate,
    depth=7,
    bootstrap_type='Bernoulli',
    random_strength=1,
    min_data_in_leaf=10,
    l2_leaf_reg=3,
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed,
    grow_policy='Depthwise',
    max_bin=1024,
    model_size_reg=0,
    task_type='GPU',
    od_type='IncToDec',
    od_wait=100,
    metric_period=500,
    verbose=verbose_eval,
    subsample=0.8,
    od_pval=1e-10,
    max_ctr_complexity=8,
    has_time=False,
    simple_ctr='FeatureFreq',
    combinations_ctr='FeatureFreq',
)

# XGBoost hyper-parameters for the native `xgb.train` API (GPU histogram trees).
xgb_params = dict(
    objective='reg:squarederror',
    max_depth=6,
    eta=xgb_learning_rate,
    colsample_bytree=0.4,
    subsample=0.6,
    reg_alpha=6,
    min_child_weight=100,
    n_jobs=2,
    seed=2001,
    tree_method='gpu_hist',
    gpu_id=0,
    predictor='gpu_predictor',
)

# LightGBM hyper-parameters (RMSE metric, all CPU cores, silent logging).
lgbm_params = dict(
    max_depth=16,
    subsample=0.8032697250789377,
    colsample_bytree=0.21067140508531404,
    learning_rate=0.009867383057779643,
    reg_lambda=10.987474846877767,
    reg_alpha=17.335285595031994,
    min_child_samples=31,
    num_leaves=66,
    max_bin=522,
    cat_smooth=81,
    cat_l2=0.029690334194270022,
    metric='rmse',
    n_jobs=-1,
    verbose=-1,
    n_estimators=iterations,
)


In [None]:
# Per-fold test predictions are collected here and averaged for the submission.
preds_list = []

# One zero-initialised out-of-fold buffer per model variant, each with one slot
# per training row; every fold fills in the rows of its validation split.
oof_cb, oof_xgb, oof_cbx, oof_xgbx, oof_lgb, oof_lgb_incremental = (
    np.zeros(len(train)) for _ in range(6)
)

# Per-fold stacking pipeline:
#   1. fit CatBoost, XGBoost and LightGBM independently on the fold;
#   2. continue training the LightGBM booster for several passes with a lower
#      learning rate and progressively relaxed regularisation;
#   3. average the three models and use that average as the starting baseline
#      for `baseline_rounds` alternating CatBoost -> XGBoost refinement rounds.
# Out-of-fold predictions are written into the oof_* arrays and the final
# XGBoost test prediction of each fold is appended to preds_list.
#
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which newer
# scikit-learn releases no longer accept.

# Positional indices of the categorical columns handed to CatBoost / LightGBM.
# NOTE(review): assumes the first ten columns of X_train / X_test are the
# categorical ones — confirm against the dataset schema.
cat_feature_idx = list(range(10))

for fold, (train_idx, val_idx) in enumerate(split.split(X_train)):
    print(f'Fold {fold+1}')
    X_tr = X_train.iloc[train_idx]
    X_val = X_train.iloc[val_idx]
    y_tr = y_train.iloc[train_idx]
    y_val = y_train.iloc[val_idx]

    # ------------------------------------------------------------------
    # Stage 1a: stand-alone CatBoost
    # ------------------------------------------------------------------
    ptrain = Pool(data=X_tr, label=y_tr, cat_features=cat_feature_idx)
    pvalid = Pool(data=X_val, label=y_val, cat_features=cat_feature_idx)
    ptest = Pool(data=X_test, cat_features=cat_feature_idx)
    CModel = CatBoostRegressor(**cb_params)
    CModel.fit(ptrain,
               eval_set=pvalid,
               use_best_model=True,
               early_stopping_rounds=early_stopping_rounds)
    temp_fold_preds = CModel.predict(pvalid)
    oof_cb[val_idx] = temp_fold_preds
    first_cb_rmse = np.sqrt(mean_squared_error(y_val, temp_fold_preds))
    print(f'RMSE of CB model is {first_cb_rmse}')
    baseline_preds_tr_cb = CModel.predict(ptrain)
    baseline_preds_vl_cb = temp_fold_preds  # same model, same pool: no need to predict twice
    test_preds_cb = CModel.predict(ptest)

    # ------------------------------------------------------------------
    # Stage 1b: stand-alone XGBoost
    # ------------------------------------------------------------------
    xtrain = DMatrix(data=X_tr, label=y_tr, nthread=2)
    xvalid = DMatrix(data=X_val, label=y_val, nthread=2)
    xtest = DMatrix(data=X_test, nthread=2)
    XModel = xgb.train(xgb_params, xtrain,
                       evals=[(xvalid, 'validation')],
                       verbose_eval=verbose_eval,
                       early_stopping_rounds=early_stopping_rounds,
                       xgb_model=None,
                       num_boost_round=iterations)
    temp_fold_preds = XModel.predict(xvalid)
    oof_xgb[val_idx] = temp_fold_preds
    first_xgb_rmse = np.sqrt(mean_squared_error(y_val, temp_fold_preds))
    print(f'RMSE of XGB model is {first_xgb_rmse}')
    baseline_preds_tr_xgb = XModel.predict(xtrain)
    baseline_preds_vl_xgb = temp_fold_preds
    test_preds_xgb = XModel.predict(xtest)

    # ------------------------------------------------------------------
    # Stage 1c: stand-alone LightGBM
    # ------------------------------------------------------------------
    # NOTE(review): the `early_stopping_rounds` / `verbose_eval` keyword
    # arguments of `lightgbm.train` are believed to have been replaced by
    # callbacks in LightGBM 4.x — confirm against the installed version.
    ltrain = lgbm.Dataset(X_tr, label=y_tr, init_score=None,
                          categorical_feature=cat_feature_idx, free_raw_data=False)
    lvalid = lgbm.Dataset(X_val, label=y_val, init_score=None,
                          categorical_feature=cat_feature_idx, free_raw_data=False)
    # No Dataset is built for the test set: prediction below works directly on
    # the DataFrame, and the test rows have no labels to attach.
    LModel = lgbm.train(lgbm_params,
                        train_set=ltrain,
                        num_boost_round=iterations,
                        valid_sets=lvalid,
                        init_model=None,
                        early_stopping_rounds=early_stopping_rounds,
                        verbose_eval=verbose_eval)
    temp_fold_preds = LModel.predict(X_val)
    oof_lgb[val_idx] = temp_fold_preds
    first_lgb_rmse = np.sqrt(mean_squared_error(y_val, temp_fold_preds))
    print(f'RMSE of LGBM model is {first_lgb_rmse}')

    # ------------------------------------------------------------------
    # Stage 2: incremental LightGBM refinement
    # ------------------------------------------------------------------
    # Continue training the previous booster with a smaller learning rate;
    # from the third pass on, regularisation is decayed and trees may grow
    # more leaves.
    params = lgbm_params.copy()
    params.update({'learning_rate': 0.003})
    for i in range(1, 8):
        if i > 2:
            params['reg_lambda'] *= 0.9
            params['reg_alpha'] *= 0.9
            params['num_leaves'] += 40

        # Train with the adjusted `params`; passing the untouched
        # `lgbm_params` here would silently discard every adjustment above.
        LModel = lgbm.train(params,
                            train_set=ltrain,
                            num_boost_round=iterations,
                            valid_sets=lvalid,
                            init_model=LModel,
                            early_stopping_rounds=early_stopping_rounds,
                            verbose_eval=verbose_eval)
    temp_fold_preds = LModel.predict(X_val)
    oof_lgb_incremental[val_idx] = temp_fold_preds
    second_lgb_rmse = np.sqrt(mean_squared_error(y_val, temp_fold_preds))
    print(f'RMSE of LGBM model is {second_lgb_rmse}')
    print(f'LGBM improvement using Incremental Improvements {first_lgb_rmse - second_lgb_rmse}')
    baseline_preds_tr_lgb = LModel.predict(X_tr)
    baseline_preds_vl_lgb = temp_fold_preds
    test_preds_lgb = LModel.predict(X_test)

    # ------------------------------------------------------------------
    # Stage 3: alternating CatBoost / XGBoost rounds on top of a baseline
    # ------------------------------------------------------------------
    # The first baseline is the plain average of the three stage-1/2 models.
    baseline_train = (baseline_preds_tr_xgb + baseline_preds_tr_lgb + baseline_preds_tr_cb) / 3
    baseline_valid = (baseline_preds_vl_xgb + baseline_preds_vl_lgb + baseline_preds_vl_cb) / 3
    baseline_test = (test_preds_xgb + test_preds_lgb + test_preds_cb) / 3

    for baseline in range(baseline_rounds):
        print(f'Using prediction of each model as a baseline for the next model round {baseline+1}')

        # CatBoost starts from the current baseline.
        # NOTE(review): the code below treats CModel.predict(pool) as a full
        # prediction (baseline included) — confirm CatBoost applies the Pool
        # baseline at predict time.
        ptrain = Pool(data=X_tr, label=y_tr, cat_features=cat_feature_idx, baseline=baseline_train)
        pvalid = Pool(data=X_val, label=y_val, cat_features=cat_feature_idx, baseline=baseline_valid)
        ptest = Pool(data=X_test, cat_features=cat_feature_idx, baseline=baseline_test)
        cb_params_ = cb_params.copy()
        # The learning rate shrinks as 1/round so later rounds make smaller corrections.
        cb_params_.update({'learning_rate': cb_learning_rate * (1 / (baseline + 1))})
        CModel = CatBoostRegressor(**cb_params_)
        CModel.fit(ptrain,
                   eval_set=pvalid,
                   use_best_model=True,
                   early_stopping_rounds=early_stopping_rounds)
        temp_fold_preds = CModel.predict(pvalid)
        oof_cbx[val_idx] = temp_fold_preds
        second_cb_rmse = np.sqrt(mean_squared_error(y_val, temp_fold_preds))
        print(f'RMSE of CB model with baseline round {baseline+1} {second_cb_rmse}')
        baseline_train = CModel.predict(ptrain)
        baseline_valid = temp_fold_preds
        baseline_test = CModel.predict(ptest)

        # XGBoost continues from CatBoost's output, passed as the base margin.
        xtrain = DMatrix(data=X_tr, label=y_tr, base_margin=baseline_train)
        xvalid = DMatrix(data=X_val, label=y_val, base_margin=baseline_valid)
        xtest = DMatrix(data=X_test, base_margin=baseline_test)
        xgb_params_ = xgb_params.copy()
        # Override the existing 'eta' key. Adding 'learning_rate' (an alias of
        # 'eta') next to it would leave two conflicting values in the params.
        xgb_params_.update({'eta': xgb_learning_rate * (1 / (baseline + 1))})
        XModel = xgb.train(xgb_params_, xtrain,
                           evals=[(xvalid, 'validation')],
                           verbose_eval=verbose_eval,
                           early_stopping_rounds=early_stopping_rounds,
                           xgb_model=None,
                           num_boost_round=iterations)
        temp_fold_preds = XModel.predict(xvalid)
        oof_xgbx[val_idx] = temp_fold_preds
        second_xgb_rmse = np.sqrt(mean_squared_error(y_val, temp_fold_preds))
        print(f'RMSE of XGB model with baseline round {baseline+1} {second_xgb_rmse}')
        # XGBoost's output becomes the baseline for the next round's CatBoost.
        baseline_train = XModel.predict(xtrain)
        baseline_valid = temp_fold_preds
        baseline_test = XModel.predict(xtest)
        print(f'CB Improvement  using Baseline round {baseline+1}: {first_cb_rmse - second_cb_rmse}')
        print(f'XGB Improvement using Baseline round {baseline+1}: {first_xgb_rmse - second_xgb_rmse}')
        # The next round reports its improvement relative to this round.
        first_cb_rmse = second_cb_rmse
        first_xgb_rmse = second_xgb_rmse
    print('-' * 100)
    print('', end='\n')
    # The last XGBoost model of the fold provides this fold's test prediction.
    preds_list.append(XModel.predict(xtest))

In [None]:
# Overall out-of-fold RMSE of every model variant and of two simple blends.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which newer
# scikit-learn releases no longer accept; the value is the same.
# Note: these assignments reuse (and overwrite) the per-fold RMSE names set
# inside the training loop.
first_cb_rmse = np.sqrt(mean_squared_error(y_train, oof_cb))
first_xgb_rmse = np.sqrt(mean_squared_error(y_train, oof_xgb))
first_lgb_rmse = np.sqrt(mean_squared_error(y_train, oof_lgb))
second_cb_rmse = np.sqrt(mean_squared_error(y_train, oof_cbx))
second_xgb_rmse = np.sqrt(mean_squared_error(y_train, oof_xgbx))
second_lgb_rmse = np.sqrt(mean_squared_error(y_train, oof_lgb_incremental))
print(f'RMSE for CB model is {first_cb_rmse}')
print(f'RMSE for XGB model is {first_xgb_rmse}')
print(f'RMSE for LGBM model is {first_lgb_rmse}')
print(f'RMSE for CB model with XGB baseline is {second_cb_rmse}')
print(f'RMSE for XGB model with CB baseline is {second_xgb_rmse}')
print(f'RMSE for LGBM model with Incremental Improvement is {second_lgb_rmse}')
# Equal-weight blends of the refined models.
print(f'RMSE for CB and XGB blend is {np.sqrt(mean_squared_error(y_train, (oof_cbx+oof_xgbx)/2))}')
print(f'RMSE for CB, XGB and LGBM blend is {np.sqrt(mean_squared_error(y_train, (oof_cbx+oof_xgbx+oof_lgb_incremental)/3))}')

In [None]:
# Average the per-fold test predictions element-wise and write the submission
# file with one row per test id.
preds = np.asarray(preds_list).mean(axis=0)
submission_mean = test[['id']].assign(target=preds)
submission_mean.to_csv('submission_mean.csv', index=False)
submission_mean.head()