# Submission 2: LGBM (plain optuna tuned model)

In [1]:
# fundamentals
import matplotlib.pyplot as plt
import seaborn as sns 
import pandas as pd 
import numpy as np
import scipy
import math

# data exploration 
from pandas_profiling import ProfileReport
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=True, world_readable=True)
from plotly.offline import iplot

# data preprocessing 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, PowerTransformer, MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm


# regressors
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR, SVR

import xgboost as xgb 
import catboost as cb
import lightgbm as lgb

from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

# metrics for evaluation
from sklearn.metrics import mean_squared_error
from scipy import stats

# saving parameters
from joblib import dump, load

# hyperparameter searching and tuning 
import optuna
import tqdm

In [2]:
#fixing random seed for repoducability
import random

random.seed(0)
np.random.seed(0)

In [3]:
original_df = pd.read_csv('train.csv', index_col = 'id')

In [4]:
categorical_col = ['cat0','cat1','cat2','cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']

In [5]:
X_train = original_df.drop(columns = 'target', axis =1)
Y_train = original_df['target']

In [6]:
X_train.head()

Unnamed: 0_level_0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,A,B,A,A,B,D,A,E,C,I,...,0.281421,0.881122,0.42165,0.741413,0.895799,0.802461,0.724417,0.701915,0.877618,0.719903
2,B,A,A,A,B,B,A,E,A,F,...,0.282354,0.440011,0.34623,0.278495,0.593413,0.546056,0.613252,0.741289,0.326679,0.808464
3,A,A,A,C,B,D,A,B,C,N,...,0.293756,0.914155,0.369602,0.832564,0.86562,0.825251,0.264104,0.695561,0.869133,0.828352
4,A,A,A,C,B,D,A,E,G,K,...,0.769785,0.934138,0.57893,0.407313,0.868099,0.794402,0.494269,0.698125,0.809799,0.614766
6,A,B,A,A,B,B,A,E,C,F,...,0.279105,0.3826,0.70594,0.325193,0.440967,0.462146,0.724447,0.683073,0.343457,0.297743


In [7]:
X_train.columns

Index(['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
       'cat9', 'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6',
       'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13'],
      dtype='object')

In [8]:
X_train.columns[0:10]

Index(['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
       'cat9'],
      dtype='object')

In [9]:
cat_features = ['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
       'cat9']

In [10]:
for table in [X_train, X_val]:
    for feature in table.columns[0:10]:
        table[feature] = table[feature].astype('category')

NameError: name 'X_val' is not defined

In [None]:
for table in [X_train, X_val]:
    le = OrdinalEncoder()
    table[['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
   'cat9']] = le.fit_transform(table[['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
   'cat9']])

In [None]:
X_train

In [None]:
X_train.info()

In [None]:
for table in [X_train, X_val]:
    for feature in table.columns[0:10]:
        table[feature] = table[feature].astype('category')

In [None]:
X_train.info()

In [None]:
cat_features

In [11]:
import joblib

In [23]:
encoded_X_train = joblib.load("encoded_X_train.joblib")
encoded_X_test = joblib.load("encoded_X_test.joblib")

In [24]:
class KFoldsAverageLGBMTuning():
    def __init__(self):
        self.models = []
        self.kfolds = KFold(n_splits = 6, shuffle = False)
        
        
    def fit(self, trial, train_x, train_y, params):
        oof_preds = np.zeros_like(train_y)
        self.train_x = train_x
        self.train_y = train_y.values
        
        # Add a callback for pruning.
        pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "rmse")
        
        for train_idx, val_idx in self.kfolds.split(train_x):
            X_train_CV, X_val_CV = self.train_x[train_idx], self.train_x[val_idx]
            Y_train_CV, Y_val_CV = self.train_y[train_idx], self.train_y[val_idx]
            
            d_train = lgb.Dataset(data = X_train_CV, label=Y_train_CV)
            d_val = lgb.Dataset(X_val_CV, label=Y_val_CV)
            watchlist = [d_train, d_val]

            
            model = lgb.train(params, 
                             train_set = d_train,
                             valid_sets = d_val,
                             verbose_eval = -1,
                             valid_names = 'valid_0',
                             early_stopping_rounds = 1000,
                             callbacks = [pruning_callback])
            
        
            
            self.models.append(model)
            oof_pred = model.predict(X_val_CV)
            oof_preds[val_idx] = oof_pred
            
        self.oof_preds = oof_preds
        
        self.rmse = mean_squared_error(Y_train, oof_preds, squared = False)

    def predict(self, test_x):
        preds = []
        for model in tqdm.tqdm(self.models):
            pred = model.predict(test_x)
            preds.append(pred)
        preds = np.mean(preds, axis=0)
        return preds

In [25]:
def objective_LGBM(trial):
    num_leaves =  trial.suggest_int('num_leaves', 8, 4056, log=True) 
    lambda_l1 = trial.suggest_loguniform('lambda_l1', 1e-10, 1)
    lambda_l2 = trial.suggest_loguniform('lambda_l2',1e-10, 1)
    subsample_for_bin = trial.suggest_int('subsample_for_bin',1000, 1996000, step = 5000)
    min_child_samples = trial.suggest_int('min_child_samples', 1, 40, 1)
    max_depth = trial.suggest_int('max_depth', 2, 124, log=True)
    bagging_fraction = trial.suggest_float('bagging_fraction', 0, 1, step = 0.00001)
    feature_fraction = trial.suggest_float('feature_fraction', 0, 1, step = 0.00001)
    bagging_freq = trial.suggest_int('bagging_freq', 1,10, step = 1)
    max_bin = trial.suggest_int('max_bin', 2,256, log=True)
    
    
    objective_params =  {
        'random_state' : 50,
        'objective': 'rmse',
        'learning_rate' : 0.00115,
        'n_jobs' : -1,
        'n_estimators' : 1000000, 
        'boosting_type' : 'gbdt',
        
        'num_leaves' :  num_leaves,
        'lambda_l1' : lambda_l1,
        'lambda_l2' : lambda_l2,
        'subsample_for_bin' : subsample_for_bin,
        'min_child_samples' : min_child_samples,
        'max_depth' : max_depth,
        'bagging_fraction': bagging_fraction,
        'feature_fraction' : feature_fraction,
        'bagging_freq' : bagging_freq,
        'max_bin' : max_bin
        }

    
    optuna_LGBM = KFoldsAverageLGBMTuning()
    optuna_LGBM.fit(trial = trial, train_x = encoded_X_train, train_y = Y_train, params = objective_params)

    return optuna_LGBM.rmse

In [35]:
encoded_X_train[[7]]

array([[  498.3482  ,  1691.0677  ,  -951.14124 ,   213.12398 ,
          440.00653 ,  1061.3     ,   757.4526  ,  -148.58519 ,
        -1550.1182  , -1501.2585  , -1527.3839  ,   157.17332 ,
         -286.54065 ,   348.116   , -1904.4319  ,  3009.932   ,
         2018.8998  ,  -223.16122 , -2072.0483  ,   913.00214 ,
         -292.94736 , -1307.727   ,  1731.8445  ,  3091.5771  ,
          775.7211  ,  1881.6687  ,  1131.4706  ,  2358.3796  ,
        -1092.1138  ,  -294.51773 ,   546.302   ,  1417.289   ,
         -189.79718 , -2360.9958  ,  3183.4937  , -1308.56    ,
          362.23175 , -1350.6665  ,   323.85452 ,  -698.06714 ,
         2014.0687  ,  1585.8009  , -1594.1528  ,   285.83533 ,
        -2356.8955  ,  2002.6807  ,   -44.350624, -1585.8812  ,
          564.39185 , -1355.8986  ,  3569.4187  ,   240.37863 ,
         1872.4795  ,  -351.29    , -3369.9116  ,  -133.91899 ,
          795.4669  , -3017.6328  ,  2559.9302  ,  1853.8909  ,
          901.5504  , -2035.3801  , -202

lgbm_study = optuna.create_study(direction="minimize", pruner = optuna.pruners.HyperbandPruner())
lgbm_study.optimize(objective_LGBM, n_trials=1500, gc_after_trial = True)
print(lgbm_study.best_trial)

In [37]:
optuna.visualization.plot_optimization_history(lgbm_study)

In [38]:
optuna.visualization.plot_parallel_coordinate(lgbm_study)

In [39]:
optuna.visualization.plot_param_importances(lgbm_study)

In [40]:
optuna_CB_df = lgbm_study.trials_dataframe()
optuna_CB_df

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_bagging_fraction,params_bagging_freq,params_feature_fraction,params_lambda_l1,params_lambda_l2,...,system_attrs_completed_rung_0,system_attrs_completed_rung_1,system_attrs_completed_rung_2,system_attrs_completed_rung_3,system_attrs_completed_rung_4,system_attrs_completed_rung_5,system_attrs_completed_rung_6,system_attrs_completed_rung_7,system_attrs_completed_rung_8,state
0,0,0.878259,2021-02-24 12:41:07.152451,2021-02-24 13:04:14.153598,0 days 00:23:07.001147,0.66239,9,0.77535,4.190770e-08,2.924400e-07,...,,,,,,,,,,COMPLETE
1,1,0.878335,2021-02-24 13:04:14.498627,2021-02-24 13:11:10.882608,0 days 00:06:56.383981,0.22753,10,0.15740,6.041460e-08,9.773214e-07,...,0.884508,0.884390,0.884069,0.883137,0.880917,0.877346,0.875623,,,COMPLETE
2,2,0.887270,2021-02-24 13:11:11.169212,2021-02-24 13:12:28.167771,0 days 00:01:16.998559,0.94330,3,0.82758,2.430670e-05,4.229899e-09,...,0.884585,0.884585,0.884585,0.884585,0.884586,0.884590,0.884603,0.888391,0.888389,COMPLETE
3,3,0.879599,2021-02-24 13:12:28.440437,2021-02-24 13:25:41.513329,0 days 00:13:13.072892,0.35809,5,0.53820,5.707204e-03,1.445127e-08,...,0.884546,0.884507,0.884395,0.884076,0.883173,0.881038,0.877881,0.877445,,COMPLETE
4,4,0.884573,2021-02-24 13:25:41.778388,2021-02-24 13:25:42.705596,0 days 00:00:00.927208,0.12046,5,0.76791,2.812893e-08,1.039276e-03,...,0.884573,,,,,,,,,PRUNED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,1495,0.884476,2021-02-25 13:45:48.775308,2021-02-25 13:45:50.993805,0 days 00:00:02.218497,0.87836,4,0.72893,1.687007e-08,1.145876e-05,...,0.884529,0.884476,,,,,,,,PRUNED
1496,1496,0.884478,2021-02-25 13:45:51.265867,2021-02-25 13:45:53.140288,0 days 00:00:01.874421,0.77795,5,0.83525,7.765776e-07,4.427607e-07,...,0.884530,0.884478,,,,,,,,PRUNED
1497,1497,0.884488,2021-02-25 13:45:53.373341,2021-02-25 13:45:54.728643,0 days 00:00:01.355302,0.76445,7,0.14226,7.353845e-09,1.221187e-07,...,0.884488,,,,,,,,,PRUNED
1498,1498,0.884537,2021-02-25 13:45:54.916687,2021-02-25 13:45:56.029935,0 days 00:00:01.113248,0.86670,6,0.80584,2.949167e-01,5.759965e-06,...,0.884537,,,,,,,,,PRUNED


In [44]:
completed_optuna_trial_df = optuna_CB_df[optuna_CB_df['state'] == 'COMPLETE']

In [46]:
completed_optuna_trial_df.sort_values(by = 'value')

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_bagging_fraction,params_bagging_freq,params_feature_fraction,params_lambda_l1,params_lambda_l2,...,system_attrs_completed_rung_0,system_attrs_completed_rung_1,system_attrs_completed_rung_2,system_attrs_completed_rung_3,system_attrs_completed_rung_4,system_attrs_completed_rung_5,system_attrs_completed_rung_6,system_attrs_completed_rung_7,system_attrs_completed_rung_8,state
11,11,0.876892,2021-02-24 13:30:28.508600,2021-02-24 13:36:49.022887,0 days 00:06:20.514287,0.79834,8,0.70194,0.001696234,2.786169e-10,...,0.884536,0.884489,0.884349,0.883941,0.882802,0.880109,0.876022,0.874163,,COMPLETE
756,756,0.876942,2021-02-25 08:07:46.160549,2021-02-25 08:13:06.991641,0 days 00:05:20.831092,0.99071,6,0.63631,0.04483412,2.707327e-07,...,0.882823,0.880172,0.8761,0.874119,,,,,,COMPLETE
78,78,0.87698,2021-02-24 14:27:13.149461,2021-02-24 14:33:51.430562,0 days 00:06:38.281101,0.9968,7,0.5522,6.053287e-05,3.488173e-05,...,0.884489,0.884348,0.883938,0.8828,0.880125,0.876068,0.874198,,,COMPLETE
424,424,0.876995,2021-02-25 03:35:23.007361,2021-02-25 03:41:40.540687,0 days 00:06:17.533326,0.84607,2,0.78408,0.0154279,3.343249e-08,...,0.882897,0.880341,0.876429,0.874382,,,,,,COMPLETE
1317,1317,0.877135,2021-02-25 12:51:31.127575,2021-02-25 12:56:06.234045,0 days 00:04:35.106470,0.20273,9,0.9531,2.893313e-06,0.2357711,...,0.876369,0.874594,,,,,,,,COMPLETE
1406,1406,0.877185,2021-02-25 13:13:15.023372,2021-02-25 13:29:07.847377,0 days 00:15:52.824005,0.94735,4,0.96279,1.729723e-05,1.2736e-08,...,0.883881,0.88266,0.879819,0.875741,0.874339,,,,,COMPLETE
69,69,0.87727,2021-02-24 14:20:21.663064,2021-02-24 14:26:22.644551,0 days 00:06:00.981487,0.68224,8,0.50846,2.51019e-09,1.268472e-07,...,0.88449,0.88435,0.883946,0.882821,0.880149,0.876155,0.874695,,,COMPLETE
24,24,0.877287,2021-02-24 14:00:37.980236,2021-02-24 14:04:40.882026,0 days 00:04:02.901790,0.12841,6,0.21328,1.767658e-10,1.907395e-10,...,0.884039,0.883078,0.880721,0.876925,0.874667,,,,,COMPLETE
770,770,0.877331,2021-02-25 08:13:46.015384,2021-02-25 08:23:40.328036,0 days 00:09:54.312652,0.79614,6,0.89786,0.0009343606,1.713792e-09,...,0.884479,0.88433,0.883882,0.882659,0.879835,0.8758,0.874793,,,COMPLETE
7,7,0.877413,2021-02-24 13:25:45.022580,2021-02-24 13:30:25.005813,0 days 00:04:39.983233,0.60964,1,0.10316,3.614921e-10,1.826205e-10,...,0.88292,0.880401,0.876482,0.874657,,,,,,COMPLETE
