# Optuna for XGBoost

Import libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import pickle
import optuna
import lightgbm as lgb

from sklearn.metrics import mean_squared_error

For better accuracy (the tips below use LightGBM parameter names):

- Use large max_bin (may be slower)

- Use small learning_rate with large num_iterations

- Use large num_leaves (may cause over-fitting)

- Use bigger training data

- Try dart

Deal with Over-fitting:

- Use small max_bin

- Use small num_leaves

- Use min_data_in_leaf and min_sum_hessian_in_leaf

- Use bagging by setting bagging_fraction and bagging_freq

- Use feature sub-sampling by setting feature_fraction

- Use bigger training data

- Try lambda_l1, lambda_l2 and min_gain_to_split for regularization

- Try limiting max_depth to avoid growing deep trees

- Try extra_trees

- Try increasing path_smooth

# Optuna + Multilabel CV + RandomUnderSampling + custom loss

In [None]:
# Negative-to-positive ratio of the target: number of rows with Class == 0
# divided by the number of rows with Class == 1.
class_imbalance = len(train_df.loc[train_df['Class'] == 0]) / len(train_df.loc[train_df['Class'] == 1])

# Feature matrix and target taken from the full training frame.
X = train_df[features]
y = train_df['Class']

# Filled by optimize_model(): it appends one averaged best-iteration count per call.
best_iterations = []

def balanced_log_loss(y_true, y_pred):
    """Return the balanced binary log loss of predicted probabilities.

    The log loss is summed separately over the class-0 and class-1
    observations, each sum is divided by the size of its class, and the two
    per-class averages are averaged, so both classes weigh equally however
    many rows each one has.

    Args:
        y_true: Array-like of 0/1 labels (list, NumPy array or pandas Series).
        y_pred: Array-like of predicted class-1 probabilities, matched to
            ``y_true`` by position.

    Returns:
        The balanced log loss. If either class is absent from ``y_true`` its
        count is zero and the result is not finite.
    """
    # Coerce to arrays so plain Python sequences work too: on a list,
    # `y_true == 1` is a single False and `1 - y_true` raises TypeError.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred, dtype=float)

    # N_c is the number of observations of class c.
    N_1 = np.sum(y_true == 1, axis=0)
    N_0 = np.sum(y_true == 0, axis=0)

    # Avoid the extremes of the log function: each predicted probability p is
    # replaced with max(min(p, 1 - 1e-15), 1e-15).
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)

    # Per-class average log loss.
    loss_0 = -np.sum((1 - y_true) * np.log(1 - y_pred)) / N_0
    loss_1 = -np.sum(y_true * np.log(y_pred)) / N_1

    return (loss_0 + loss_1) / 2

def optimize_model(params, how):
    """Cross-validate one hyper-parameter set and return its mean balanced log loss.

    For each of ``CFG.n_optimize_repeats`` repeats the training data is
    optionally resampled, split with ``MultilabelStratifiedKFold``, and a
    model of the requested family is fitted on every fold. The out-of-fold
    class-1 probabilities of each repeat are scored with
    ``balanced_log_loss``.

    Side effect: appends one integer (the mean of the recorded best
    iterations) to the module-level ``best_iterations`` list.

    Args:
        params: Keyword arguments forwarded unchanged to the model constructor.
        how: Model family to fit: ``'lgbm'``, ``'xgboost'`` or ``'catboost'``.

    Returns:
        The mean balanced log loss over all repeats.

    Raises:
        ValueError: If ``how`` is not one of the supported model families.
    """
    supported_models = ('lgbm', 'xgboost', 'catboost')
    if how not in supported_models:
        # Fail loudly: handing back a number such as 0 would look like a
        # perfect loss to a study that minimizes this function's result.
        raise ValueError(f"Unsupported model type {how!r}; expected one of {supported_models}")

    bll_list = []
    best_trial_iterations = []

    for i in range(CFG.n_optimize_repeats):
        print(f'Repeat {blu}#{i+1}')

        # Make random under- or oversampling to balance classes
        if CFG.undersample:
            positive_count_train = train_df['Class'].value_counts()[1]
            sampler = RandomUnderSampler(
                sampling_strategy={0: positive_count_train * class_imbalance,
                                   1: positive_count_train},
                random_state=15062023 + i,
                replacement=True)
        elif CFG.oversample:
            negative_count_train = train_df['Class'].value_counts()[0]
            sampler = RandomOverSampler(
                sampling_strategy={0: negative_count_train,
                                   1: negative_count_train // class_imbalance},
                random_state=2306020231)

        # Features plus columns 1..3 of `greeks`; the greeks columns travel
        # with the rows through resampling and serve as stratification labels.
        X_re, y_re = pd.concat([train_df[features], greeks.iloc[:, 1:4]], axis=1), train_df['Class']

        # Undersampling is applied once to the whole set, before splitting.
        if CFG.undersample:
            X_re, y_re = sampler.fit_resample(X_re, y_re)

        # Create Stratified Multilabel k-Fold scheme
        kf = MultilabelStratifiedKFold(n_splits=CFG.n_optimize_folds, shuffle=True, random_state=10062023 + i)

        # Out-of-fold predictions for this repeat
        oof = np.zeros(X_re.shape[0])

        # Loop-invariant views: model inputs, target, and the last three
        # columns of X_re (the greeks columns concatenated above).
        X_all, y_all = X_re[features], y_re
        stratify_labels = X_re.iloc[:, -3:]

        for train_idx, val_idx in kf.split(X=X_all, y=stratify_labels):
            # Split the dataset according to the fold indexes.
            X_train = X_all.iloc[train_idx]
            X_val = X_all.iloc[val_idx]
            y_train = y_all.iloc[train_idx]
            y_val = y_all.iloc[val_idx]

            # Oversampling is applied to the training part of each fold only.
            if CFG.oversample:
                X_train, y_train = sampler.fit_resample(X_train, y_train)

            if how == 'lgbm':
                model = lgb.LGBMClassifier(**params)
                model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=bll_metric, verbose=0)
                best_iter = model.best_iteration_
            elif how == 'xgboost':
                model = xgb.XGBClassifier(**params)
                model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)
                best_iter = model.get_booster().best_iteration
            else:  # 'catboost' (validated at the top of the function)
                train_pool = Pool(X_train, y_train, cat_features=['EJ'])
                val_pool = Pool(X_val, y_val, cat_features=['EJ'])
                model = cat.CatBoostClassifier(**params)
                model.fit(train_pool, eval_set=val_pool, verbose=0)
                best_iter = model.best_iteration_

            oof[val_idx] = model.predict_proba(X_val)[:, 1]

        bll_list.append(balanced_log_loss(y_re, oof))
        # NOTE(review): only the last fold's best iteration is recorded for
        # each repeat; confirm whether every fold should contribute.
        best_trial_iterations.append(best_iter)

    best_iterations.append(int(np.mean(best_trial_iterations)))

    return np.mean(bll_list)

def objective(trial):
    """Optuna objective: sample an XGBoost configuration and score it.

    Args:
        trial: Optuna trial used to draw the hyper-parameter values.

    Returns:
        The value returned by ``optimize_model(params, how='xgboost')`` for
        the sampled parameters.
    """
    # Settings that are identical for every trial.
    fixed = {
        "n_estimators": CFG.n_estimators,
        "early_stopping_rounds": CFG.early_stopping_rounds,
        "verbosity": 0,
        "random_state": 14062023,
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        # Exact split search, chosen because the dataset is small.
        "tree_method": "exact",
    }

    # Only "gbtree" is offered at the moment; the "dart" handling below only
    # becomes active if "dart" is added to the choices.
    booster = trial.suggest_categorical("booster", ["gbtree"])

    # Parameters searched for every booster type.
    searched = {
        "booster": booster,
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # Row sampling ratio of the training data.
        "subsample": trial.suggest_float("subsample", 0.4, 1.0),
        # Column sampling ratio per tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
    }
    params = {**fixed, **searched}

    # Without undersampling, weight the positive class by the class ratio.
    if not CFG.undersample:
        params["scale_pos_weight"] = class_imbalance

    uses_dart = booster == "dart"

    # Tree-booster parameters.
    if uses_dart or booster == "gbtree":
        params.update({
            # Alias: eta.
            "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),
            # Maximum tree depth; larger values allow more complex trees.
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            # Minimum child weight; larger values make the tree more conservative.
            "min_child_weight": trial.suggest_int("min_child_weight", 2, 10),
            # Controls how selective the algorithm is when splitting.
            "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
            # NOTE(review): XGBoost documents grow_policy as supported only for
            # the hist/approx tree methods - confirm it has an effect with
            # tree_method="exact".
            "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        })

    # Dropout parameters specific to the dart booster.
    if uses_dart:
        params.update({
            "sample_type": trial.suggest_categorical("sample_type", ["uniform", "weighted"]),
            "normalize_type": trial.suggest_categorical("normalize_type", ["tree", "forest"]),
            "rate_drop": trial.suggest_float("rate_drop", 1e-8, 1.0, log=True),
            "skip_drop": trial.suggest_float("skip_drop", 1e-8, 1.0, log=True),
        })

    return optimize_model(params, how='xgboost')

# Run the XGBoost hyper-parameter search only when it is enabled in the config.
if CFG.xgb_optimize:
    # The objective returns a loss, so the study minimizes it.
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=CFG.n_trials * 2)

    print(f"Number of finished trials: {len(study.trials)}")

    # Report the best trial: its objective value and sampled parameters.
    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    # Save all trials sorted by objective value (best first) and show the top ten.
    df = study.trials_dataframe().sort_values('value')
    df.to_csv('optuna_xgb.csv')

    display(df.head(10))