# Optuna for LightGBM

Import libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import pickle
import optuna
import lightgbm as lgb

from sklearn.metrics import mean_squared_error

Prepare data

In [None]:
# Load the pre-built feature matrix (one row per shop/item/month).
matrix = pd.read_pickle("checkpoint_final.pkl")

# Downcast the float32 columns to float16 to reduce RAM usage.
# NOTE(review): float16 overflows to inf above 65504 and keeps ~3 significant
# digits -- confirm no feature (e.g. a price-like column) needs more range.
floatcols = [name for name, dtype in matrix.dtypes.items() if dtype == "float32"]
matrix[floatcols] = matrix[floatcols].astype("float16")

# Clip the target into [0, 20].
matrix["item_cnt_month"] = matrix["item_cnt_month"].clip(lower=0, upper=20)

# The first couple of months are dropped because of distortions to their
# features (e.g. wrong item age).
keep_from_month = 2
val_month = 33
test_month = 34

# These features are dropped to reduce overfitting.
dropcols = ["shop_id", "item_id", "new_item"]

categoricals = ["item_category_id", "month"]
matrix[categoricals] = matrix[categoricals].astype("category")

# Time-based split on date_block_num: everything before val_month (minus the
# distorted early months) trains, val_month validates, test_month is scored.
features_only = matrix.drop(columns=dropcols)
month_idx = matrix.date_block_num
train = features_only.loc[(month_idx >= keep_from_month) & (month_idx < val_month), :]
val = features_only.loc[month_idx == val_month, :]
test = features_only.loc[month_idx == test_month, :]

# Separate the target from the predictors.
y_train = train.item_cnt_month
X_train = train.drop(columns="item_cnt_month")
y_val = val.item_cnt_month
X_val = val.drop(columns="item_cnt_month")
X_test = test.drop(columns="item_cnt_month")

# Release the large intermediates; `val` is intentionally kept.
del matrix, train, test, features_only, month_idx

For Better Accuracy:

- Use large max_bin (may be slower)

- Use small learning_rate with large num_iterations

- Use large num_leaves (may cause over-fitting)

- Use bigger training data

- Try dart

Deal with Over-fitting:

- Use small max_bin

- Use small num_leaves

- Use min_data_in_leaf and min_sum_hessian_in_leaf

- Use bagging by setting bagging_fraction and bagging_freq

- Use feature sub-sampling by setting feature_fraction

- Use bigger training data

- Try lambda_l1, lambda_l2 and min_gain_to_split for regularization

- Try max_depth to avoid growing deep trees

- Try extra_trees

- Try increasing path_smooth

# Optuna + CV + custom loss

In [None]:
import optuna

class CFG:
    # Cross-validation layout: `lgbm_opt` scores every trial on
    # n_repeats x n_folds train/validation splits.
    n_folds = 2
    n_repeats = 2

    # NOTE(review): not referenced by the code visible in this file --
    # presumably the boosting-round cap for a final fit; confirm before relying on it.
    num_boost_round = 10_000

    # NOTE(review): not referenced by the code visible in this file --
    # presumably random seeds for seed-averaging; confirm.
    seeds = [1, 42, 228, 265, 21, 8_081_988, 5_062_023, 666, 1488]

def balanced_log_loss(y_true, y_pred):
    """Return the class-balanced binary log loss.

    The log loss is averaged separately over the negative (label 0) and the
    positive (label 1) observations, and the two per-class means are then
    averaged, so both classes contribute equally regardless of their
    frequency.

    Args:
        y_true: 1-D array-like of binary labels (0 or 1).
        y_pred: 1-D array-like of predicted probabilities for class 1.

    Returns:
        The balanced log loss as a float. If only one class is present in
        ``y_true`` the loss of that class alone is returned (instead of NaN
        from a 0/0 division); for empty input NaN is returned.
    """
    y_true = np.asarray(y_true)

    # To avoid the extremes of the log function, each predicted probability p
    # is replaced with max(min(p, 1 - 1e-15), 1e-15). The cast to float64
    # matters: in float32 the upper bound 1 - 1e-15 rounds to exactly 1.0,
    # which would let log(1 - p) reach -inf.
    y_pred = np.clip(np.asarray(y_pred, dtype=np.float64), 1e-15, 1 - 1e-15)

    # Number of observations in each class.
    n_neg = np.sum(y_true == 0)
    n_pos = np.sum(y_true == 1)

    # Mean log loss per class; a class without observations is skipped so it
    # cannot poison the result with a division by zero.
    class_losses = []
    if n_neg > 0:
        class_losses.append(-np.sum((1 - y_true) * np.log(1 - y_pred)) / n_neg)
    if n_pos > 0:
        class_losses.append(-np.sum(y_true * np.log(y_pred)) / n_pos)

    if not class_losses:
        return float("nan")
    return sum(class_losses) / len(class_losses)

def bll_metric(y_pred, y_true):
    """Adapt `balanced_log_loss` for use as the `feval` argument of `lgb.train`.

    Args:
        y_pred: predictions supplied by LightGBM for the evaluated dataset.
        y_true: the evaluated dataset object; its labels are read through
            ``get_label()`` (despite the name it is not a raw label array).

    Returns:
        A ``(metric_name, value, is_higher_better)`` tuple; the last element
        is ``False`` because a lower loss is better.
    """
    labels = y_true.get_label()
    score = balanced_log_loss(labels, y_pred)
    return 'balanced_log_loss', score, False

def calc_log_loss_weight(y_true):
    """Return inverse-frequency weights ``(w0, w1)`` for the two classes.

    Each weight is ``1 / (class_count / n_samples)``, so the rarer class gets
    the larger weight. The weights can be assigned to individual data points
    during training to balance the classes.

    Args:
        y_true: 1-D array-like of non-negative integer labels (0 or 1).

    Returns:
        Tuple ``(w0, w1)``. A class that does not occur in ``y_true`` gets an
        infinite weight instead of raising ``IndexError``.
    """
    labels = np.asarray(y_true)

    # minlength=2 guarantees an entry for class 1 even when it never occurs.
    counts = np.bincount(labels, minlength=2)

    # A zero count legitimately yields an infinite weight; silence the
    # divide-by-zero warning for that case.
    with np.errstate(divide="ignore", invalid="ignore"):
        weights = 1 / (counts / labels.shape[0])
    return weights[0], weights[1]

def lgbm_opt(features, boosting_type, n_trials):
    """Tune LightGBM hyper-parameters with Optuna, scored by repeated stratified CV.

    Every trial trains one binary LightGBM model per fold
    (``CFG.n_repeats`` x ``CFG.n_folds`` models in total) and is scored by the
    mean balanced log loss on the held-out folds. After the search the best
    trial is printed and all trials, sorted by value, are written to
    ``optuna_lgbm_<boosting_type>_fold_<CFG.n_folds>.csv``.

    Args:
        features: column names of ``train_df`` used as predictors.
        boosting_type: LightGBM boosting type, e.g. ``'gbdt'``, ``'goss'`` or ``'dart'``.
        n_trials: number of Optuna trials to run.

    Returns:
        The finished ``optuna`` study, so callers can inspect it afterwards.
    """
    # NOTE(review): `train_df`, `greeks`, `MultilabelStratifiedKFold` and `blu`
    # are neither defined nor imported in the visible part of this file; they
    # must be provided by the surrounding notebook before this function is called.
    X, y = train_df[features], train_df.Class

    def objective(trial):
        """Return the mean balanced log loss over all repeats and folds for one trial."""
        # Optuna returns the same value when a parameter is suggested twice in
        # one trial, so the search space is sampled once, outside the fold loop.
        param = {
            'objective': 'binary',
            'metric': 'none',  # only the custom feval metric is evaluated
            'is_unbalance': True,
            'early_stopping_round': 50,
            'boosting_type': boosting_type,
            'force_col_wise': False,  # Use only with CPU devices

            # Number of data points sampled to construct the feature bins; a larger
            # value may give a better training result but increases training time.
            'subsample_for_bin': 300000,
            'n_estimators': trial.suggest_int('n_estimators', 300, 1000),
            'learning_rate': trial.suggest_float('learning_rate', 1e-4, 3e-1, log=True),
            'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
            'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
            # Max number of leaves in one tree.
            'num_leaves': trial.suggest_int('num_leaves', 2, 256),
            # Max number of bins that feature values are bucketed in; a small number
            # may reduce training accuracy but can help against overfitting.
            'max_bin': trial.suggest_int('max_bin', 32, 255),
            # Randomly select a subset of features if feature_fraction < 1.0.
            'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
            # Perform bagging at every k-th iteration.
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
            # Minimal number of data points in one leaf. The Optuna parameter is
            # recorded under the LightGBM alias `min_child_samples`.
            'min_data_in_leaf': trial.suggest_int('min_child_samples', 5, 100),
            # Stop splitting a leaf if the sum of its hessian is less than this.
            'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 1e-4, 1e-1),
            'verbose': -1,
        }

        # Randomly select part of the data without resampling if
        # bagging_fraction < 1.0; not tuned for GOSS.
        if boosting_type != 'goss':
            param['bagging_fraction'] = trial.suggest_float('bagging_fraction', 0.4, 1.0)

        bll_list = []

        for i in range(CFG.n_repeats):
            print(f'Repeat {blu}#{i+1}')

            kf = MultilabelStratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=8062023+i)

            # Stratify on columns 1-2 of `greeks`
            # (Class and Alpha per the original author's note).
            for train_idx, val_idx in kf.split(X=X, y=greeks.iloc[:, 1:3]):
                # Split the dataset according to the fold indexes.
                X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
                y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

                dtrain = lgb.Dataset(X_train, label=y_train)
                dvalid = lgb.Dataset(X_val, label=y_val)

                # Callback that lets Optuna prune unpromising trials.
                # NOTE(review): the same iteration numbers are reported again for
                # every fold of a trial -- confirm Optuna treats repeated steps
                # the way you expect.
                pruning_callback = optuna.integration.LightGBMPruningCallback(trial, 'balanced_log_loss')

                # `verbose_eval` was removed from lgb.train in LightGBM 4.0; the
                # log_evaluation callback is its replacement (available since 3.3).
                gbm = lgb.train(
                    dict(param),
                    dtrain,
                    valid_sets=[dvalid],
                    feval=bll_metric,
                    callbacks=[pruning_callback, lgb.log_evaluation(period=100)],
                )

                preds = gbm.predict(X_val)
                bll_list.append(balanced_log_loss(y_val, preds))

        return np.mean(bll_list)

    study = optuna.create_study(pruner=optuna.pruners.MedianPruner(n_warmup_steps=10), direction="minimize")
    study.optimize(objective, n_trials=n_trials)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    best_trial = study.best_trial

    print("  Value: {}".format(best_trial.value))

    print("  Params: ")
    for key, value in best_trial.params.items():
        print("    {}: {}".format(key, value))

    # The fold loop variable is local to `objective` and not visible here, so
    # the file name carries the configured number of folds instead.
    df = study.trials_dataframe().sort_values('value')
    df.to_csv(f'optuna_lgbm_{boosting_type}_fold_{CFG.n_folds}.csv')

    return study


# Run one search per boosting type; dart gets a smaller trial budget.
# The most recent study stays bound to `study` at module scope because the
# following cell reads it.
for bt in ['goss', 'gbdt', 'dart']:
    study = lgbm_opt(features, bt, n_trials=1000 if bt == 'dart' else 5000)

Create a dataframe from the study and select the columns with the necessary parameters

In [None]:
# One row per trial. `study` has to be available at module scope for this
# cell to run (it is created inside `lgbm_opt`).
df = study.trials_dataframe()
# Best (lowest) objective value first; the commented slice narrows the
# output to the parameter columns.
df.sort_values(by='value')  # .iloc[:, [1] + list(range(5, 14))]