# Boosting studies

## Setup

In [1]:
# Record the OS and interpreter this notebook was executed with.
import platform
import sys

print(platform.platform())
print("Python", sys.version)

Windows-10-10.0.19045-SP0
Python 3.10.5 (tags/v3.10.5:f377153, Jun  6 2022, 16:14:13) [MSC v.1929 64 bit (AMD64)]


In [2]:
# config
class cfg:
    """Notebook-wide switches, read elsewhere as plain class attributes (``cfg.<name>``)."""

    # run control
    debug = False    # True limits each Optuna study to a single trial
    seed = 42        # seed passed to the Optuna TPE sampler
    nfolds = 10      # number of StratifiedKFold splits
    device = 'cpu'   # 'gpu' switches the boosters to their GPU settings

    # which studies to run
    lgbm = True
    xgb = False
    cat = False

    # environment
    colab = False    # True mounts Google Drive and uses paths below it

In [3]:
# libraries
import os
import joblib

import numpy as np
import pandas as pd

from sklearn.preprocessing import  OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score

import lightgbm as lgb
import xgboost as xgb
import catboost

import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# set paths
if cfg.colab:
    # Colab run: mount Google Drive and keep data and studies below it
    from google.colab import drive
    drive.mount('/content/gdrive')
    drive_path = '/content/gdrive/My Drive/'

    data_path = drive_path + 'temp/final/'
    studies_path = drive_path + 'temp/studies/'
else:
    # local run: relative paths
    data_path = '../data/final/'
    studies_path = '../src/training_files/studies/'

# the study cells write their best-parameter files into this directory
os.makedirs(studies_path, exist_ok=True)

## Data preparation

In [5]:
# load data
# data_path already ends with a separator, so build both file paths with
# os.path.join instead of concatenating a second '/' in front of the file name
train = pd.read_csv(os.path.join(data_path, 'train.csv'))
test = pd.read_csv(os.path.join(data_path, 'test.csv'))

In [6]:
# target column; every other column of train is treated as a feature
TARGET = 'Transported'
FEATURES = [col for col in train.columns if col != TARGET]

# split the features by dtype: numeric columns vs. everything else
train_features = train[FEATURES]
numerical = train_features.select_dtypes(include=np.number).columns.to_list()
categorical = train_features.select_dtypes(exclude=np.number).columns.to_list()

# give train and test identical column dtypes (float / object); target becomes float
for frame in (train, test):
    frame[numerical] = frame[numerical].astype(float)
    frame[categorical] = frame[categorical].astype(object)
train[TARGET] = train[TARGET].astype(float)

# summary of what was loaded
print(f'Target: {TARGET}')
print(f'Features:\n\tnumerical: {numerical}\n\tcategorical:{categorical}')
print(f'Shapes:\n\ttrain: {train.shape}\n\ttest: {test.shape}')
print(f'Missing values:\n\ttrain: {train.isna().sum().sum()}\n\ttest: {test.isna().sum().sum()}')

train.head(5)

Target: Transported
Features:
	numerical: ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Expenditure', 'CabinNum', 'GroupSize', 'FamilySize']
	categorical:['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'NoSpending', 'CabinDeck', 'CabinSide', 'Solo']
Shapes:
	train: (8693, 19)
	test: (4277, 18)
Missing values:
	train: 0
	test: 0


Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Expenditure,CabinNum,GroupSize,FamilySize,HomePlanet,CryoSleep,Destination,VIP,NoSpending,CabinDeck,CabinSide,Solo,Transported
0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,Europa,False,TRAPPIST-1e,False,True,B,P,True,0.0
1,24.0,4.70048,2.302585,3.258097,6.309918,3.806662,6.602588,0.0,1.0,4.0,Earth,False,TRAPPIST-1e,False,False,F,S,True,1.0
2,58.0,3.78419,8.18228,0.0,8.812248,3.912023,9.248021,0.0,2.0,7.0,Europa,False,TRAPPIST-1e,True,False,A,S,False,0.0
3,33.0,0.0,7.157735,5.918894,8.110728,5.267858,8.551981,0.0,2.0,7.0,Europa,False,TRAPPIST-1e,False,False,A,S,False,0.0
4,16.0,5.717028,4.26268,5.023881,6.338594,1.098612,6.995766,1.0,1.0,9.0,Earth,False,TRAPPIST-1e,False,False,F,S,True,1.0


In [7]:
# one-hot encoder for the categorical columns (dense output, unknown categories ignored)
cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# combine preprocessing: encode `categorical`, pass all remaining columns through,
# and return pandas DataFrames from transform
ct = ColumnTransformer(
    [('cat', cat_encoder, categorical)],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
ct = ct.set_output(transform='pandas')

# split features / target (done before FEATURES is rebuilt below)
x = train[FEATURES]
y = train[TARGET]

# FEATURES re-ordered as numerical-then-categorical, plus a column -> dtype-name map
FEATURES = numerical + categorical
typedict = {
    name: ('float64' if name in numerical else 'object')
    for name in FEATURES
}

# fit on the training features only, then apply the same transform to train and test
ct = ct.fit(x)
x, test = ct.transform(x), ct.transform(test)

x.head(5)

Unnamed: 0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_missing,CryoSleep_False,CryoSleep_True,CryoSleep_missing,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,...,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Expenditure,CabinNum,GroupSize,FamilySize
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,24.0,4.70048,2.302585,3.258097,6.309918,3.806662,6.602588,0.0,1.0,4.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,58.0,3.78419,8.18228,0.0,8.812248,3.912023,9.248021,0.0,2.0,7.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,33.0,0.0,7.157735,5.918894,8.110728,5.267858,8.551981,0.0,2.0,7.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,16.0,5.717028,4.26268,5.023881,6.338594,1.098612,6.995766,1.0,1.0,9.0


In [8]:
# save encoded data (local runs only)
if not cfg.colab:
    os.makedirs('../data/final_ohe', exist_ok=True)
    # training file = encoded features with the target appended as the last column
    x.assign(**{TARGET: y}).to_csv('../data/final_ohe/train.csv', index=False)
    test.to_csv('../data/final_ohe/test.csv', index=False)

In [9]:
# study params
# one splitter instance is shared by all objective functions below
skf = StratifiedKFold(n_splits=cfg.nfolds)
# model name -> best parameter dict, filled in by each study cell that runs
models_paras_dict = dict()

## Models

### lightgbm

In [10]:
# setup
modelname = 'lgb'

# fixed params: identical for every trial; `objective` merges the tuned values on top
fixed_params = dict(
    objective='binary',
    metric='binary_logloss',
    device=cfg.device,
    verbosity=-1,
    early_stopping_round=15,
)

def objective(trial):
    """Optuna objective for LightGBM: mean cross-validated accuracy of one trial.

    Parameters
    ----------
    trial
        Optuna trial object used to draw the hyper-parameter suggestions.

    Returns
    -------
    Mean of the per-fold accuracies over the splits produced by ``skf``.

    Notes
    -----
    Reads the module-level ``fixed_params``, ``skf``, ``x`` and ``y``.
    Each fold's held-out part is passed as ``eval_set`` and is also the data
    the accuracy is computed on.
    """
    # trial parameters
    tuning_params = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 3000),
        learning_rate=trial.suggest_float('learning_rate', 0.001, 0.1),
        reg_alpha=trial.suggest_float('reg_alpha', 1e-8, 10.0),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-8, 10.0),
        num_leaves=trial.suggest_int('num_leaves', 2, 512),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.4, 1.0),
        subsample=trial.suggest_float('subsample', 0.4, 1.0),
        subsample_freq=trial.suggest_int('subsample_freq', 1, 7),
        min_child_samples=trial.suggest_int('min_child_samples', 5, 100),
    )

    # tuned values are merged on top of the fixed ones
    params = dict(fixed_params)
    params.update(tuning_params)

    # train and score with cv
    fold_scores = []
    for fit_idx, val_idx in skf.split(x, y):
        x_fit, y_fit = x.iloc[fit_idx], y.iloc[fit_idx]
        x_val, y_val = x.iloc[val_idx], y.iloc[val_idx]

        booster = lgb.LGBMClassifier(**params)
        booster.fit(
            x_fit,
            y_fit,
            eval_set=[(x_val, y_val)],
            callbacks=[lgb.log_evaluation(period=0, show_stdv=False)],
        )

        fold_scores.append(accuracy_score(y_val, booster.predict(x_val)))

    return np.mean(fold_scores)

In [11]:
if cfg.lgbm:

    # budget: trial cap (1 in debug mode) and a time limit in seconds
    max_trials = 250
    if cfg.debug:
        max_trials = 1
    time_limit = 0.5 * 3600

    # create study
    sampler = optuna.samplers.TPESampler(seed=cfg.seed)
    study = optuna.create_study(
        study_name=f'{modelname}_optimization',
        direction='maximize',
        sampler=sampler,
    )

    # perform optimization
    print(f'Starting {modelname} optimization...')
    study.optimize(objective, n_trials=max_trials, timeout=time_limit)

    # optimization results
    print(f"\nNumber of finished trials: {len(study.trials)}")
    print(f"Best score: {study.best_value}")
    # fixed settings plus the tuned values of the best trial
    best_params = dict(fixed_params)
    best_params.update(study.best_trial.params)
    print("Best trial parameters:")
    for k in best_params:
        v = best_params[k]
        print(f"\t{k}: {v}")

    # save best params to disk and keep them for the summary cell
    params_path = f'{studies_path}{modelname}_best_params.joblib'
    with open(params_path, "wb") as file:
        joblib.dump(best_params, file)

    models_paras_dict[modelname] = best_params

[32m[I 2022-12-22 17:30:40,272][0m A new study created in memory with name: lgb_optimization[0m


Starting lgb optimization...


[32m[I 2022-12-22 17:30:43,017][0m Trial 0 finished with value: 0.7965104559342884 and parameters: {'n_estimators': 1186, 'learning_rate': 0.0951207163345817, 'reg_alpha': 7.31993942079411, 'reg_lambda': 5.9865848459837805, 'num_leaves': 81, 'colsample_bytree': 0.49359671220172163, 'subsample': 0.4348501673009197, 'subsample_freq': 7, 'min_child_samples': 62}. Best is trial 0 with value: 0.7965104559342884.[0m
[32m[I 2022-12-22 17:31:18,574][0m Trial 1 finished with value: 0.7945537875481132 and parameters: {'n_estimators': 2154, 'learning_rate': 0.0030378649352844423, 'reg_alpha': 9.699098521920845, 'reg_lambda': 8.324426409679791, 'num_leaves': 110, 'colsample_bytree': 0.5090949803242604, 'subsample': 0.5100427059120604, 'subsample_freq': 3, 'min_child_samples': 55}. Best is trial 0 with value: 0.7965104559342884.[0m
[32m[I 2022-12-22 17:31:27,591][0m Trial 2 finished with value: 0.7966238112244224 and parameters: {'n_estimators': 1353, 'learning_rate': 0.029831684879606152, 


Number of finished trials: 250
Best score: 0.801801912622515
Best trial parameters:
	objective: binary
	metric: binary_logloss
	device: cpu
	verbosity: -1
	early_stopping_round: 15
	n_estimators: 2212
	learning_rate: 0.014348736068011035
	reg_alpha: 9.318926167996713
	reg_lambda: 9.231047800458917
	num_leaves: 89
	colsample_bytree: 0.9575251915959488
	subsample: 0.8788031086445692
	subsample_freq: 4
	min_child_samples: 88


### xgboost

In [12]:
# setup
modelname = 'xgb'

# tree method follows the configured device
if cfg.device == 'gpu':
    method = 'gpu_hist'
else:
    method = 'hist'

# fixed params: identical for every trial; `objective` merges the tuned values on top
fixed_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method=method,
    early_stopping_rounds=20,
)

def objective(trial):
    """Optuna objective for XGBoost: mean cross-validated accuracy of one trial.

    Parameters
    ----------
    trial
        Optuna trial object used to draw the hyper-parameter suggestions.

    Returns
    -------
    Mean of the per-fold accuracies over the splits produced by ``skf``.

    Notes
    -----
    Reads the module-level ``fixed_params``, ``skf``, ``x`` and ``y``.
    Each fold's held-out part is passed as ``eval_set`` and is also the data
    the accuracy is computed on.
    """

    # trial parameters
    tuning_params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'lambda': trial.suggest_float('lambda', 0, 10.0),
        'alpha': trial.suggest_float('alpha', 0, 10.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        # integer range (same values 2..10 as before) instead of a categorical
        # list, so the sampler treats tree depth as an ordered quantity
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        # single-value range: always 1, but still recorded among the trial params
        'num_parallel_tree': trial.suggest_int('num_parallel_tree', 1, 1),
    }

    # tuned values are merged on top of the fixed ones
    params = {**fixed_params, **tuning_params}

    # train and score with cv
    scores = []
    for train_index, test_index in skf.split(x, y):

        train_x, valid_x = x.iloc[train_index], x.iloc[test_index]
        train_y, valid_y = y.iloc[train_index], y.iloc[test_index]

        model = xgb.XGBClassifier(**params)
        model.fit(
            train_x,
            train_y,
            eval_set=[(valid_x, valid_y)],
            verbose=0
        )

        acc = accuracy_score(valid_y, model.predict(valid_x))
        scores.append(acc)

    return np.mean(scores)

In [13]:
if cfg.xgb:

    # budget: trial cap (1 in debug mode) and a time limit in seconds
    max_trials = 100
    if cfg.debug:
        max_trials = 1
    time_limit = 0.5 * 3600

    # create study
    sampler = optuna.samplers.TPESampler(seed=cfg.seed)
    study = optuna.create_study(
        study_name=f'{modelname}_optimization',
        direction='maximize',
        sampler=sampler,
    )

    # perform optimization
    print(f'Starting {modelname} optimization...')
    study.optimize(objective, n_trials=max_trials, timeout=time_limit)

    # optimization results
    print(f"\nNumber of finished trials: {len(study.trials)}")
    print(f"Best score: {study.best_value}")
    # fixed settings plus the tuned values of the best trial
    best_params = dict(fixed_params)
    best_params.update(study.best_trial.params)
    print("Best trial parameters:")
    for k in best_params:
        v = best_params[k]
        print(f"\t{k}: {v}")

    # save best params to disk and keep them for the summary cell
    params_path = f'{studies_path}{modelname}_best_params.joblib'
    with open(params_path, "wb") as file:
        joblib.dump(best_params, file)

    models_paras_dict[modelname] = best_params

### catboost

In [14]:
# setup
modelname = 'catboost'

# task type follows the configured device
if cfg.device == 'gpu':
    method = 'GPU'
else:
    method = 'CPU'

# fixed params: identical for every trial; `objective` merges the tuned values on top
fixed_params = dict(
    loss_function='Logloss',
    custom_metric='Logloss',
    task_type=method,
    bootstrap_type='Bayesian',
    allow_writing_files=False,
)

def objective(trial):
    """Optuna objective for CatBoost: mean cross-validated accuracy of one trial.

    Parameters
    ----------
    trial
        Optuna trial object used to draw the hyper-parameter suggestions.

    Returns
    -------
    Mean of the per-fold accuracies over the splits produced by ``skf``.

    Notes
    -----
    Reads the module-level ``fixed_params``, ``method``, ``skf``, ``x`` and ``y``.
    Each fold's held-out part is passed as ``eval_set`` and is also the data
    the accuracy is computed on.
    """
    # trial parameters
    tuning_params = dict(
        num_trees=trial.suggest_int('num_trees', 100, 3000),
        learning_rate=trial.suggest_float('learning_rate', 0.001, 0.1),
        bagging_temperature=trial.suggest_float('bagging_temperature', 1, 25),
        random_strength=trial.suggest_float('random_strength', 1, 10),
        depth=trial.suggest_int('depth', 1, 12, step=1),
        colsample_bylevel=trial.suggest_float('colsample_bylevel', 0.1, 0.8),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 0, 10),
    )

    # tuned values are merged on top of the fixed ones
    params = dict(fixed_params)
    params.update(tuning_params)
    if method == 'GPU':
        # on GPU the sampled colsample_bylevel is not passed on (replaced by None);
        # the value is still suggested, so it stays part of the trial's params
        params.update(colsample_bylevel=None)

    # train and score with cv
    fold_scores = []
    for fit_idx, val_idx in skf.split(x, y):
        x_fit, y_fit = x.iloc[fit_idx], y.iloc[fit_idx]
        x_val, y_val = x.iloc[val_idx], y.iloc[val_idx]

        booster = catboost.CatBoostClassifier(**params)
        booster.fit(
            catboost.Pool(x_fit, y_fit),
            eval_set=catboost.Pool(x_val, y_val),
            early_stopping_rounds=20,
            verbose=0,
        )

        fold_scores.append(accuracy_score(y_val, booster.predict(x_val)))

    return np.mean(fold_scores)

In [15]:
if cfg.cat:

    # budget: trial cap (1 in debug mode) and a time limit in seconds
    max_trials = 50
    if cfg.debug:
        max_trials = 1
    time_limit = 0.5 * 3600

    # create study
    sampler = optuna.samplers.TPESampler(seed=cfg.seed)
    study = optuna.create_study(
        study_name=f'{modelname}_optimization',
        direction='maximize',
        sampler=sampler,
    )

    # perform optimization
    print(f'Starting {modelname} optimization...')
    study.optimize(objective, n_trials=max_trials, timeout=time_limit)

    # optimization results
    print(f"\nNumber of finished trials: {len(study.trials)}")
    print(f"Best score: {study.best_value}")
    # fixed settings plus the tuned values of the best trial
    best_params = dict(fixed_params)
    best_params.update(study.best_trial.params)
    print("Best trial parameters:")
    for k in best_params:
        v = best_params[k]
        print(f"\t{k}: {v}")

    # save best params to disk and keep them for the summary cell
    params_path = f'{studies_path}{modelname}_best_params.joblib'
    with open(params_path, "wb") as file:
        joblib.dump(best_params, file)

    models_paras_dict[modelname] = best_params

## Save results

In [16]:
# show results: one line per model whose study ran in this session
for k in models_paras_dict:
    v = models_paras_dict[k]
    print(f'Model: {k}, best parameters: {v}\n')

Model: lgb, best parameters: {'objective': 'binary', 'metric': 'binary_logloss', 'device': 'cpu', 'verbosity': -1, 'early_stopping_round': 15, 'n_estimators': 2212, 'learning_rate': 0.014348736068011035, 'reg_alpha': 9.318926167996713, 'reg_lambda': 9.231047800458917, 'num_leaves': 89, 'colsample_bytree': 0.9575251915959488, 'subsample': 0.8788031086445692, 'subsample_freq': 4, 'min_child_samples': 88}

