# Submission 2: LGBM (tuned model)

In [1]:
# fundamentals
import matplotlib.pyplot as plt
import seaborn as sns 
import pandas as pd 
import numpy as np
import scipy
import math

# data exploration 
from pandas_profiling import ProfileReport
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=True, world_readable=True)
from plotly.offline import iplot

# data preprocessing 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, PowerTransformer, MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm


# regressors
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR, SVR

import xgboost as xgb 
import catboost as cb
import lightgbm as lgb

from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

# metrics for evaluation
from sklearn.metrics import mean_squared_error
from scipy import stats

# saving parameters
from joblib import dump, load

# hyperparameter searching and tuning 
import optuna
import tqdm

In [2]:
# Seed Python's and NumPy's global random generators for reproducibility.
import random

np.random.seed(0)
random.seed(0)

In [3]:
original_df = pd.read_csv('train.csv', index_col = 'id')

In [4]:
categorical_col = ['cat0','cat1','cat2','cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']

In [5]:
# Split the loaded frame into the regression target and the predictor columns.
Y_train = original_df['target']
X_train = original_df.drop(columns = 'target')

In [6]:
X_train.head()

Unnamed: 0_level_0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,A,B,A,A,B,D,A,E,C,I,...,0.281421,0.881122,0.42165,0.741413,0.895799,0.802461,0.724417,0.701915,0.877618,0.719903
2,B,A,A,A,B,B,A,E,A,F,...,0.282354,0.440011,0.34623,0.278495,0.593413,0.546056,0.613252,0.741289,0.326679,0.808464
3,A,A,A,C,B,D,A,B,C,N,...,0.293756,0.914155,0.369602,0.832564,0.86562,0.825251,0.264104,0.695561,0.869133,0.828352
4,A,A,A,C,B,D,A,E,G,K,...,0.769785,0.934138,0.57893,0.407313,0.868099,0.794402,0.494269,0.698125,0.809799,0.614766
6,A,B,A,A,B,B,A,E,C,F,...,0.279105,0.3826,0.70594,0.325193,0.440967,0.462146,0.724447,0.683073,0.343457,0.297743


In [7]:
X_train.columns

Index(['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
       'cat9', 'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6',
       'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13'],
      dtype='object')

In [8]:
X_train.columns[0:10]

Index(['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
       'cat9'],
      dtype='object')

In [9]:
cat_features = ['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
       'cat9']

In [11]:
# Cast the categorical columns to pandas' category dtype, addressing them by
# name (cat_features) instead of by position so the cell does not depend on
# column order.
# NOTE(review): the OrdinalEncoder cell that follows writes floats back into
# these columns and a later cell repeats this cast, so this cell looks
# redundant — confirm before removing.
for column_name in cat_features:
    X_train[column_name] = X_train[column_name].astype('category')

In [12]:
# Replace every categorical label with a numeric code. `le` keeps the fitted
# label-to-code mapping of the training data.
le = OrdinalEncoder()
X_train[cat_features] = le.fit_transform(X_train[cat_features])

In [13]:
X_train

Unnamed: 0_level_0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,1.0,0.0,0.0,1.0,3.0,0.0,4.0,2.0,8.0,...,0.281421,0.881122,0.421650,0.741413,0.895799,0.802461,0.724417,0.701915,0.877618,0.719903
2,1.0,0.0,0.0,0.0,1.0,1.0,0.0,4.0,0.0,5.0,...,0.282354,0.440011,0.346230,0.278495,0.593413,0.546056,0.613252,0.741289,0.326679,0.808464
3,0.0,0.0,0.0,2.0,1.0,3.0,0.0,1.0,2.0,13.0,...,0.293756,0.914155,0.369602,0.832564,0.865620,0.825251,0.264104,0.695561,0.869133,0.828352
4,0.0,0.0,0.0,2.0,1.0,3.0,0.0,4.0,6.0,10.0,...,0.769785,0.934138,0.578930,0.407313,0.868099,0.794402,0.494269,0.698125,0.809799,0.614766
6,0.0,1.0,0.0,0.0,1.0,1.0,0.0,4.0,2.0,5.0,...,0.279105,0.382600,0.705940,0.325193,0.440967,0.462146,0.724447,0.683073,0.343457,0.297743
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499993,0.0,1.0,0.0,2.0,1.0,1.0,0.0,4.0,4.0,11.0,...,0.768447,0.269578,0.258655,0.363598,0.300619,0.340516,0.235711,0.383477,0.215227,0.793630
499996,0.0,1.0,0.0,2.0,1.0,1.0,0.0,4.0,4.0,11.0,...,0.775951,0.197211,0.257024,0.574304,0.227035,0.322583,0.286094,0.324874,0.306933,0.230902
499997,0.0,1.0,0.0,2.0,1.0,1.0,0.0,4.0,2.0,12.0,...,0.297406,0.449482,0.386172,0.476217,0.135947,0.502730,0.235788,0.316671,0.250286,0.349041
499998,0.0,1.0,1.0,2.0,1.0,1.0,0.0,3.0,4.0,5.0,...,0.758642,0.363130,0.324132,0.229017,0.220888,0.515304,0.389391,0.245234,0.303895,0.481138


In [14]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300000 entries, 1 to 499999
Data columns (total 24 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   cat0    300000 non-null  float64
 1   cat1    300000 non-null  float64
 2   cat2    300000 non-null  float64
 3   cat3    300000 non-null  float64
 4   cat4    300000 non-null  float64
 5   cat5    300000 non-null  float64
 6   cat6    300000 non-null  float64
 7   cat7    300000 non-null  float64
 8   cat8    300000 non-null  float64
 9   cat9    300000 non-null  float64
 10  cont0   300000 non-null  float64
 11  cont1   300000 non-null  float64
 12  cont2   300000 non-null  float64
 13  cont3   300000 non-null  float64
 14  cont4   300000 non-null  float64
 15  cont5   300000 non-null  float64
 16  cont6   300000 non-null  float64
 17  cont7   300000 non-null  float64
 18  cont8   300000 non-null  float64
 19  cont9   300000 non-null  float64
 20  cont10  300000 non-null  float64
 21  cont11  30

In [15]:
# The encoder returned plain floats; mark the encoded columns as category
# dtype again. Columns are addressed by name (cat_features) rather than by
# position so the cell does not depend on column order.
for column_name in cat_features:
    X_train[column_name] = X_train[column_name].astype('category')

In [16]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300000 entries, 1 to 499999
Data columns (total 24 columns):
 #   Column  Non-Null Count   Dtype   
---  ------  --------------   -----   
 0   cat0    300000 non-null  category
 1   cat1    300000 non-null  category
 2   cat2    300000 non-null  category
 3   cat3    300000 non-null  category
 4   cat4    300000 non-null  category
 5   cat5    300000 non-null  category
 6   cat6    300000 non-null  category
 7   cat7    300000 non-null  category
 8   cat8    300000 non-null  category
 9   cat9    300000 non-null  category
 10  cont0   300000 non-null  float64 
 11  cont1   300000 non-null  float64 
 12  cont2   300000 non-null  float64 
 13  cont3   300000 non-null  float64 
 14  cont4   300000 non-null  float64 
 15  cont5   300000 non-null  float64 
 16  cont6   300000 non-null  float64 
 17  cont7   300000 non-null  float64 
 18  cont8   300000 non-null  float64 
 19  cont9   300000 non-null  float64 
 20  cont10  300000 non-null  f

In [17]:
cat_features

['cat0',
 'cat1',
 'cat2',
 'cat3',
 'cat4',
 'cat5',
 'cat6',
 'cat7',
 'cat8',
 'cat9']

In [18]:
class KFoldsAverageLGBM():
    """K-fold ensemble of LightGBM boosters.

    ``fit`` trains one booster per fold, each validated (with early stopping)
    on its held-out fold; ``predict`` averages the boosters' predictions.

    Attributes
    ----------
    models : list
        Boosters trained by the most recent ``fit`` call, one per fold.
    kfolds : KFold
        Unshuffled splitter, so fold membership follows row order.
    train_x, train_y : numpy.ndarray
        ``.values`` of the data passed to ``fit``.
    oof_preds : numpy.ndarray
        Out-of-fold prediction for every training row (set by ``fit``).
    rmse : float
        RMSE of ``oof_preds`` against the training target (set by ``fit``).
    """

    def __init__(self, FOLDS):
        """Create the ensemble with ``FOLDS`` unshuffled folds."""
        self.models = []
        self.kfolds = KFold(n_splits = FOLDS, shuffle = False)

    def fit(self, train_x, train_y, params, prune = True, trial = None):
        """Train one booster per fold and record out-of-fold predictions.

        Parameters
        ----------
        train_x : object exposing ``.values`` (e.g. a pandas DataFrame)
            Training features; converted to an array before splitting.
        train_y : object exposing ``.values`` (e.g. a pandas Series)
            Training target.
        params : dict
            Parameters forwarded to ``lgb.train``.
        prune : bool, default True
            Attach an Optuna pruning callback. Only takes effect when
            ``trial`` is supplied.
        trial : optuna trial or None, default None
            Trial the pruning callback reports to. Without a trial the fit
            runs unpruned, even if ``prune`` is True.
        """
        # Drop boosters from any previous fit so predict() never averages
        # stale models together with fresh ones.
        self.models = []
        self.train_x = train_x.values
        self.train_y = train_y.values

        # Float buffer: np.zeros_like(train_y) would inherit an integer target
        # dtype and truncate the predictions written into it.
        oof_preds = np.zeros(self.train_y.shape, dtype = float)

        for train_idx, val_idx in self.kfolds.split(self.train_x):
            X_train_CV, X_val_CV = self.train_x[train_idx], self.train_x[val_idx]
            Y_train_CV, Y_val_CV = self.train_y[train_idx], self.train_y[val_idx]

            d_train = lgb.Dataset(data = X_train_CV, label = Y_train_CV)
            d_val = lgb.Dataset(X_val_CV, label = Y_val_CV)

            # Pruning is only meaningful inside a hyperparameter search, so the
            # callback is built only when a trial was actually handed in.
            callbacks = None
            if prune and trial is not None:
                callbacks = [optuna.integration.LightGBMPruningCallback(trial, "rmse")]

            # NOTE(review): verbose_eval / early_stopping_rounds are kept as
            # in the original call; newer LightGBM releases expect callbacks
            # instead — confirm against the installed version.
            model = lgb.train(params,
                              train_set = d_train,
                              valid_sets = d_val,
                              verbose_eval = -1,
                              valid_names = 'valid_0',
                              early_stopping_rounds = 1000,
                              callbacks = callbacks)

            self.models.append(model)
            oof_preds[val_idx] = model.predict(X_val_CV)

        self.oof_preds = oof_preds

        # Score against the target passed to fit(), not a notebook global.
        # sqrt(MSE) is the same value as mean_squared_error(..., squared=False).
        self.rmse = float(np.sqrt(mean_squared_error(self.train_y, oof_preds)))

    def predict(self, test_x):
        """Return the mean prediction of all fold boosters for ``test_x``.

        ``test_x`` is converted with ``.values`` when it has that attribute,
        so the boosters receive the same array representation they were
        trained on in ``fit``.

        Raises
        ------
        RuntimeError
            If called before ``fit`` has trained any booster.
        """
        if not self.models:
            raise RuntimeError("predict() called before fit(): no trained models")

        features = getattr(test_x, 'values', test_x)
        fold_preds = [model.predict(features) for model in tqdm.tqdm(self.models)]
        return np.mean(fold_preds, axis=0)

In [19]:
def objective_LGBM(trial, n_folds = 5):
    """Optuna objective: cross-validated RMSE of LightGBM for one trial.

    Samples a parameter set from ``trial``, fits a ``KFoldsAverageLGBM`` on the
    notebook-level ``X_train`` / ``Y_train`` and returns its ``rmse`` attribute.

    Parameters
    ----------
    trial : optuna trial
        Trial used to sample parameters; also forwarded to ``fit`` as the
        ``trial`` keyword.
    n_folds : int, default 5
        Number of CV folds. NOTE(review): the original code instantiated a
        no-argument ``KFoldsAverageLGBMTuning`` class that is not defined in
        this notebook, so its fold count is unknown; 5 is a default chosen
        here — adjust as needed.

    Returns
    -------
    The ``rmse`` attribute of the fitted ``KFoldsAverageLGBM``.
    """
    objective_params =  {
        # fixed settings
        'random_state' : 50,
        'objective': 'rmse',
        'learning_rate' : 0.00115,
        'n_jobs' : -1,
        'n_estimators' : 1000000,
        'boosting_type' : 'gbdt',

        # searched settings (sampled in the same order as before)
        'num_leaves' : trial.suggest_int('num_leaves', 8, 4056, log=True),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        'lambda_l1' : trial.suggest_float('lambda_l1', 1e-10, 1, log=True),
        'lambda_l2' : trial.suggest_float('lambda_l2', 1e-10, 1, log=True),
        'subsample_for_bin' : trial.suggest_int('subsample_for_bin', 1000, 1996000, step = 5000),
        'min_child_samples' : trial.suggest_int('min_child_samples', 1, 40, step = 1),
        'max_depth' : trial.suggest_int('max_depth', 2, 124, log=True),
        # LightGBM requires both fractions to be strictly greater than 0, so
        # the range starts at one step instead of 0.
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.00001, 1, step = 0.00001),
        'feature_fraction' : trial.suggest_float('feature_fraction', 0.00001, 1, step = 0.00001),
        'bagging_freq' : trial.suggest_int('bagging_freq', 1, 10, step = 1),
        'max_bin' : trial.suggest_int('max_bin', 2, 256, log=True),
        'cat_l2' : trial.suggest_float('cat_l2', 0, 100, step = 0.1),
        'cat_smooth' : trial.suggest_float('cat_smooth', 0, 100, step = 0.1)
        }

    # KFoldsAverageLGBM is the only K-fold trainer defined in this notebook.
    # NOTE(review): as before, fit() is called with a `trial` keyword — confirm
    # that KFoldsAverageLGBM.fit accepts it.
    optuna_LGBM = KFoldsAverageLGBM(FOLDS = n_folds)
    optuna_LGBM.fit(trial = trial, train_x = X_train, train_y = Y_train, params = objective_params)

    return optuna_LGBM.rmse

In [20]:
tuned_LGBM = KFoldsAverageLGBM(FOLDS = 10)

# from shogosuzuki & hamza
kaggle_params = {'bagging_freq': 1, 
                 'reg_alpha': 2.4766410381355457, 
                 'reg_lambda': 2.644144282261626, 
                 'colsample_bytree': 0.3, 
                 'subsample': 0.6, 
                 'learning_rate': 0.001, 
                 'max_depth': 20, 
                 'num_leaves': 139, 
                 'min_child_samples': 176, 
                 'min_data_per_group': 9,
                 'n_jobs' : -1,
                 'objective': 'rmse',
                 'n_estimators' : 1000000}

# Final fit with fixed parameters: no Optuna trial exists in this cell, so
# pruning is switched off explicitly instead of relying on the prune=True default.
tuned_LGBM.fit(train_x = X_train, train_y = Y_train, params = kaggle_params, prune = False)
print(tuned_LGBM.rmse)


Found `n_estimators` in params. Will use it instead of argument



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3625
[LightGBM] [Info] Number of data points in the train set: 270000, number of used features: 24
[LightGBM] [Info] Start training from score 7.456728
Training until validation scores don't improve for 1000 rounds
Early stopping, best iteration is:
[18392]	valid_0's rmse: 0.837937
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3625
[LightGBM] [Info] Number of data points in the train set: 270000, number of used features: 24
[LightGBM] [Info] Start training from score 7.456104
Training until validation scores don't improve for 1000 rounds
Early stopping, best iteration is:
[15948]	valid_0's rmse: 0.845988
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3625
[LightGBM] [Info] Number of data points 

Early stopping, best iteration is:
[18076]	valid_0's rmse: 0.846605
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3626
[LightGBM] [Info] Number of data points in the train set: 270000, number of used features: 24
[LightGBM] [Info] Start training from score 7.455675
Training until validation scores don't improve for 1000 rounds
Early stopping, best iteration is:
[17831]	valid_0's rmse: 0.838639
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3626
[LightGBM] [Info] Number of data points in the train set: 270000, number of used features: 24
[LightGBM] [Info] Start training from score 7.455984
Training until validation scores don't improve for 1000 rounds
Early stopping, best iteration is:
[20656]	valid_0's rmse: 0.84059
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3626
[LightGBM] [Info] Number of data points in the train set: 270000, number of used features: 24
[LightG

In [31]:
import joblib

In [None]:
joblib.dump(tuned_LGBM, 'submission2_tuned_LGBM.joblib', compress=3)

In [21]:
test = pd.read_csv('test.csv', index_col = 'id')

In [22]:
# Encode the test categoricals with `le`, the encoder already fitted on the
# training data, so every label gets the same code the model was trained on.
# Fitting a fresh encoder on the test set would renumber the codes whenever
# the two sets do not contain exactly the same categories.
# With the encoder's default settings, transform() raises ValueError if the
# test set holds a category that was not seen during fitting.
test[cat_features] = le.transform(test[cat_features])

In [23]:
# Cast the encoded categorical columns of the test frame to category dtype.
# Columns are addressed by name (cat_features), consistent with the encoding
# cell above, rather than by their position in the frame.
for column_name in cat_features:
    test[column_name] = test[column_name].astype('category')

In [24]:
submission2 = pd.read_csv('sample_submission.csv', index_col='id')

In [25]:
workings = pd.DataFrame(index = test.index)
workings

0
5
15
16
17
...
499987
499990
499991
499994
499995


In [26]:
workings['lgbm'] = tuned_LGBM.predict(test)
workings.tail()

100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [31:09<00:00, 186.97s/it]


Unnamed: 0_level_0,lgbm
id,Unnamed: 1_level_1
499987,7.498402
499990,7.231649
499991,7.528269
499994,7.494877
499995,7.311966


In [27]:
submission2['target'] = workings['lgbm']

In [29]:
submission2.to_csv('submission2.csv')