# Boosting studies

## Setup

In [1]:
# report the operating system and interpreter this notebook runs on
import platform
import sys

print(platform.platform())
print("Python", sys.version)

Windows-10-10.0.19045-SP0
Python 3.10.5 (tags/v3.10.5:f377153, Jun  6 2022, 16:14:13) [MSC v.1929 64 bit (AMD64)]


In [2]:
# config
class cfg:
    """Notebook-wide switches read by the cells below."""

    # run control
    debug = False   # when True each study runs a single trial
    seed = 42       # seed handed to the Optuna TPE samplers
    nfolds = 5      # number of StratifiedKFold splits

    # which studies to run
    lgbm = True
    xgb = True
    cat = False

    # environment
    device = 'cpu'  # passed to lightgbm as-is; 'gpu' selects the GPU modes of xgboost/catboost
    colab = False   # when True, Google Drive is mounted and its paths are used

In [3]:
# libraries
import os
import joblib

import numpy as np
import pandas as pd

from sklearn.preprocessing import  OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score

import lightgbm as lgb
import xgboost as xgb
import catboost

import optuna

In [4]:
# set paths
if cfg.colab:
    # Colab run: data and studies live on the mounted Google Drive
    from google.colab import drive
    drive.mount('/content/gdrive')
    drive_path = '/content/gdrive/My Drive/'

    data_path = drive_path + 'temp/final/'
    studies_path = drive_path + 'temp/studies/'
else:
    # local run: locations relative to the notebook
    data_path = '../data/final/'
    studies_path = '../src/training_files/studies/'

# the best-parameter files are written here, so the folder must exist
os.makedirs(studies_path, exist_ok=True)

## Data preparation

In [5]:
# load data
# data_path already ends with a path separator, so file names are appended
# without a leading slash (a leading slash would yield a doubled '//')
train = pd.read_csv(data_path + 'train.csv')
test = pd.read_csv(data_path + 'test.csv')

In [6]:
TARGET = 'Transported'
FEATURES = [col for col in train.columns if col != TARGET]

# split the feature names by dtype: numeric columns vs. everything else
numerical = train[FEATURES].select_dtypes(include=np.number).columns.tolist()
categorical = train[FEATURES].select_dtypes(exclude=np.number).columns.tolist()

# give both data sets the same dtypes: float for numeric, object for the rest
for frame in (train, test):
    frame[numerical] = frame[numerical].astype(float)
    frame[categorical] = frame[categorical].astype(object)

# the target only exists in the training set
train[TARGET] = train[TARGET].astype(float)

# short summary of what will be fed to the models
print(f'Target: {TARGET}')
print(f'Features:\n\tnumerical: {numerical}\n\tcategorical:{categorical}')
print(f'Shapes:\n\ttrain: {train.shape}\n\ttest: {test.shape}')
print(f'Missing values:\n\ttrain: {train.isna().sum().sum()}\n\ttest: {test.isna().sum().sum()}')

train.head(5)

Target: Transported
Features:
	numerical: ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Expenditure', 'NoSpending', 'CabinNum', 'GroupSize', 'FamilySize']
	categorical:['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'CabinDeck', 'CabinSide', 'Solo']
Shapes:
	train: (8693, 19)
	test: (4277, 18)
Missing values:
	train: 0
	test: 0


Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Expenditure,NoSpending,CabinDeck,CabinNum,CabinSide,GroupSize,Solo,FamilySize,Transported
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0,B,0.0,P,1.0,True,4.0,0.0
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,1472.0,0.0,F,0.0,S,1.0,True,4.0,1.0
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,20766.0,0.0,A,0.0,S,2.0,False,7.0,0.0
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,10352.0,0.0,A,0.0,S,2.0,False,7.0,0.0
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,2182.0,0.0,F,1.0,S,1.0,True,9.0,1.0


In [7]:
# one-hot encoder for the categorical columns; unknown categories are ignored
cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# encode the categorical columns, pass every other column through unchanged,
# and return pandas DataFrames instead of arrays
ct = ColumnTransformer(
    [('cat', cat_encoder, categorical)],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
ct.set_output(transform='pandas')

# fit on the training features only, then encode both data sets
# NOTE: `x` and `test` are rebound to their encoded versions here, so the
# data-loading cells have to be re-run before this cell can be run again
y = train[TARGET]
x = train[FEATURES]

ct.fit(x)
x = ct.transform(x)
test = ct.transform(test)

x.head(5)

Unnamed: 0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,CryoSleep_True,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True,...,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Expenditure,NoSpending,CabinNum,GroupSize,FamilySize
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,4.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,109.0,9.0,25.0,549.0,44.0,1472.0,0.0,0.0,1.0,4.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,43.0,3576.0,0.0,6715.0,49.0,20766.0,0.0,0.0,2.0,7.0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1283.0,371.0,3329.0,193.0,10352.0,0.0,0.0,2.0,7.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,303.0,70.0,151.0,565.0,2.0,2182.0,0.0,1.0,1.0,9.0


In [8]:
# save encoded data (local runs only)
if not cfg.colab:
    ohe_data_path = '../data/final/ohe/'
    os.makedirs(ohe_data_path, exist_ok=True)

    # training file = encoded features plus the target column
    temp = x.assign(**{TARGET: y})
    temp.to_csv(ohe_data_path + 'train.csv', index=False)

    # test file = encoded features only
    test.to_csv(ohe_data_path + 'test.csv', index=False)

In [9]:
# study params
models_paras_dict = dict()                  # model name -> best parameters of its study
skf = StratifiedKFold(n_splits=cfg.nfolds)  # CV splitter used by every objective below

## Models

### lightgbm

In [10]:
# setup
modelname = 'lgb'

# fixed params: identical in every trial, merged with the tuned ones
fixed_params = dict(
    objective='binary',
    metric='binary_logloss',
    device=cfg.device,
    verbosity=-1,
    early_stopping_round=15,
)

def objective(trial):
    """Optuna objective for LightGBM: mean cross-validated accuracy.

    Samples one hyper-parameter set from ``trial``, merges it over the
    module-level ``fixed_params`` and fits an ``LGBMClassifier`` on every
    ``skf`` fold of ``x``/``y``. The held-out fold is passed as ``eval_set``
    and is also the fold the accuracy is computed on.

    Returns the mean of the per-fold accuracies.
    """
    # trial parameters (the order of the suggest calls is kept on purpose)
    tuning_params = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 3000),
        learning_rate=trial.suggest_float('learning_rate', 0.001, 0.1),
        reg_alpha=trial.suggest_float('reg_alpha', 1e-8, 10.0),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-8, 10.0),
        num_leaves=trial.suggest_int('num_leaves', 2, 512),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.4, 1.0),
        subsample=trial.suggest_float('subsample', 0.4, 1.0),
        subsample_freq=trial.suggest_int('subsample_freq', 1, 7),
        min_child_samples=trial.suggest_int('min_child_samples', 5, 100),
    )
    params = {**fixed_params, **tuning_params}

    def fold_accuracy(fit_idx, val_idx):
        # train on one split and score it on its held-out part
        fit_x, fit_y = x.iloc[fit_idx], y.iloc[fit_idx]
        val_x, val_y = x.iloc[val_idx], y.iloc[val_idx]

        clf = lgb.LGBMClassifier(**params)
        clf.fit(
            fit_x,
            fit_y,
            eval_set=[(val_x, val_y)],
            callbacks=[lgb.log_evaluation(period=0, show_stdv=False)],
        )
        return accuracy_score(val_y, clf.predict(val_x))

    # train and score with cv
    return np.mean([fold_accuracy(fit_idx, val_idx) for fit_idx, val_idx in skf.split(x, y)])

In [11]:
if cfg.lgbm:

    # study settings
    max_trials = 200 if not cfg.debug else 1   # a single trial in debug mode
    time_limit = 3600 * 0.5                    # passed as the study timeout
    sampler = optuna.samplers.TPESampler(seed=cfg.seed)

    # create study
    study = optuna.create_study(
        study_name=f'{modelname}_optimization',
        direction='maximize',
        sampler=sampler,
    )

    # perform optimization
    print(f'Starting {modelname} optimization...')
    study.optimize(objective, n_trials=max_trials, timeout=time_limit)

    # optimization results: best tuned values merged over the fixed ones
    best_params = {**fixed_params, **study.best_trial.params}
    print(f"\nNumber of finished trials: {len(study.trials)}")
    print(f"Best score: {study.best_value}")
    print("Best trial parameters:")
    for k, v in best_params.items():
        print(f"\t{k}: {v}")

    # save best params to disk and keep them for the summary cell
    params_path = f'{studies_path}{modelname}_best_params.joblib'
    with open(params_path, "wb") as fout:
        joblib.dump(best_params, fout)

    models_paras_dict[modelname] = best_params

[32m[I 2022-12-28 16:20:50,998][0m A new study created in memory with name: lgb_optimization[0m


Starting lgb optimization...


[32m[I 2022-12-28 16:20:52,478][0m Trial 0 finished with value: 0.7940922093898124 and parameters: {'n_estimators': 1186, 'learning_rate': 0.0951207163345817, 'reg_alpha': 7.31993942079411, 'reg_lambda': 5.9865848459837805, 'num_leaves': 81, 'colsample_bytree': 0.49359671220172163, 'subsample': 0.4348501673009197, 'subsample_freq': 7, 'min_child_samples': 62}. Best is trial 0 with value: 0.7940922093898124.[0m
[32m[I 2022-12-28 16:21:10,817][0m Trial 1 finished with value: 0.7947818641058608 and parameters: {'n_estimators': 2154, 'learning_rate': 0.0030378649352844423, 'reg_alpha': 9.699098521920845, 'reg_lambda': 8.324426409679791, 'num_leaves': 110, 'colsample_bytree': 0.5090949803242604, 'subsample': 0.5100427059120604, 'subsample_freq': 3, 'min_child_samples': 55}. Best is trial 1 with value: 0.7947818641058608.[0m
[32m[I 2022-12-28 16:21:13,590][0m Trial 2 finished with value: 0.7907564960352463 and parameters: {'n_estimators': 1353, 'learning_rate': 0.029831684879606152, 


Number of finished trials: 200
Best score: 0.7978874940361609
Best trial parameters:
	objective: binary
	metric: binary_logloss
	device: cpu
	verbosity: -1
	early_stopping_round: 15
	n_estimators: 2891
	learning_rate: 0.038933232152364
	reg_alpha: 9.946806064915265
	reg_lambda: 4.483302202826843
	num_leaves: 106
	colsample_bytree: 0.48434846063745407
	subsample: 0.5601149859767293
	subsample_freq: 5
	min_child_samples: 70


### xgboost

In [12]:
# setup
modelname = 'xgb'

# tree construction algorithm follows the configured device
if cfg.device == 'gpu':
    method = 'gpu_hist'
else:
    method = 'hist'

# fixed params: identical in every trial, merged with the tuned ones
fixed_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method=method,
    early_stopping_rounds=20,
)

def objective(trial):
    """Optuna objective for XGBoost: mean cross-validated accuracy.

    Samples one hyper-parameter set from ``trial``, merges it over the
    module-level ``fixed_params`` and fits an ``XGBClassifier`` on every
    ``skf`` fold of ``x``/``y``. The held-out fold is passed as ``eval_set``
    and is also the fold the accuracy is computed on.

    Returns the mean of the per-fold accuracies.
    """
    # trial parameters (the order of the suggest calls is kept on purpose)
    tuning_params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'lambda': trial.suggest_float('lambda', 0, 10.0),
        'alpha': trial.suggest_float('alpha', 0, 10.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1,1.0),
        'subsample': trial.suggest_float('subsample', 0.2,1.0),
        'max_depth': trial.suggest_categorical('max_depth', [2,3,4,5,6,7,8,9,10]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        # range 1..1: effectively a constant that is still recorded in the trial
        'num_parallel_tree': trial.suggest_int('num_parallel_tree',1,1),
    }

    # tuned values go on top of the fixed ones
    params = dict(fixed_params)
    params.update(tuning_params)

    # train and score with cv
    fold_scores = []
    for fit_idx, val_idx in skf.split(x, y):
        fit_x, fit_y = x.iloc[fit_idx], y.iloc[fit_idx]
        val_x, val_y = x.iloc[val_idx], y.iloc[val_idx]

        clf = xgb.XGBClassifier(**params)
        clf.fit(fit_x, fit_y, eval_set=[(val_x, val_y)], verbose=0)

        fold_scores.append(accuracy_score(val_y, clf.predict(val_x)))

    return np.mean(fold_scores)

In [13]:
if cfg.xgb:

    # study settings
    max_trials = 100 if not cfg.debug else 1   # a single trial in debug mode
    time_limit = 3600 * 0.5                    # passed as the study timeout
    sampler = optuna.samplers.TPESampler(seed=cfg.seed)

    # create study
    study = optuna.create_study(
        study_name=f'{modelname}_optimization',
        direction='maximize',
        sampler=sampler,
    )

    # perform optimization
    print(f'Starting {modelname} optimization...')
    study.optimize(objective, n_trials=max_trials, timeout=time_limit)

    # optimization results: best tuned values merged over the fixed ones
    best_params = {**fixed_params, **study.best_trial.params}
    print(f"\nNumber of finished trials: {len(study.trials)}")
    print(f"Best score: {study.best_value}")
    print("Best trial parameters:")
    for k, v in best_params.items():
        print(f"\t{k}: {v}")

    # save best params to disk and keep them for the summary cell
    params_path = f'{studies_path}{modelname}_best_params.joblib'
    with open(params_path, "wb") as fout:
        joblib.dump(best_params, fout)

    models_paras_dict[modelname] = best_params

[32m[I 2022-12-28 16:29:24,285][0m A new study created in memory with name: xgb_optimization[0m


Starting xgb optimization...


[32m[I 2022-12-28 16:29:29,234][0m Trial 0 finished with value: 0.7936311822926421 and parameters: {'n_estimators': 1186, 'learning_rate': 0.0951207163345817, 'lambda': 7.319939418114051, 'alpha': 5.986584841970366, 'colsample_bytree': 0.24041677639819287, 'subsample': 0.32479561626896214, 'max_depth': 7, 'min_child_weight': 2, 'num_parallel_tree': 1}. Best is trial 0 with value: 0.7936311822926421.[0m
[32m[I 2022-12-28 16:29:33,201][0m Trial 1 finished with value: 0.7925960384888475 and parameters: {'n_estimators': 982, 'learning_rate': 0.05295088673159155, 'lambda': 4.319450186421157, 'alpha': 2.9122914019804194, 'colsample_bytree': 0.6506676052501416, 'subsample': 0.3115950885216335, 'max_depth': 5, 'min_child_weight': 2, 'num_parallel_tree': 1}. Best is trial 0 with value: 0.7936311822926421.[0m
[32m[I 2022-12-28 16:29:37,589][0m Trial 2 finished with value: 0.7952418986084485 and parameters: {'n_estimators': 288, 'learning_rate': 0.09493966818808, 'lambda': 9.6563203307455


Number of finished trials: 100
Best score: 0.7984628018562843
Best trial parameters:
	objective: binary:logistic
	eval_metric: logloss
	tree_method: hist
	early_stopping_rounds: 20
	n_estimators: 1856
	learning_rate: 0.025220464694322764
	lambda: 5.42262562491136
	alpha: 4.892831496385892
	colsample_bytree: 0.40628264111914086
	subsample: 0.44609618803514217
	max_depth: 4
	min_child_weight: 3
	num_parallel_tree: 1


### catboost

In [14]:
# setup
modelname = 'catboost'

# processing unit follows the configured device; anything but 'gpu' means CPU
method = {'gpu': 'GPU'}.get(cfg.device, 'CPU')

# fixed params: identical in every trial, merged with the tuned ones
fixed_params = dict(
    loss_function='Logloss',
    custom_metric='Logloss',
    task_type=method,
    bootstrap_type='Bayesian',
    allow_writing_files=False,
)

def objective(trial):
    """Optuna objective for CatBoost: mean cross-validated accuracy.

    Samples one hyper-parameter set from ``trial``, merges it over the
    module-level ``fixed_params`` and fits a ``CatBoostClassifier`` on every
    ``skf`` fold of ``x``/``y`` with 20 early-stopping rounds. The held-out
    fold is passed as ``eval_set`` and is also the fold the accuracy is
    computed on.

    Returns the mean of the per-fold accuracies.
    """
    # trial parameters (the order of the suggest calls is kept on purpose)
    tuning_params = dict(
        num_trees=trial.suggest_int('num_trees', 100, 3000),
        learning_rate=trial.suggest_float('learning_rate', 0.001, 0.1),
        bagging_temperature=trial.suggest_float('bagging_temperature', 1, 25),
        random_strength=trial.suggest_float('random_strength', 1, 10),
        depth=trial.suggest_int('depth', 1, 12, step=1),
        colsample_bylevel=trial.suggest_float('colsample_bylevel', 0.1, 0.8),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 0, 10),
    )

    params = {**fixed_params, **tuning_params}
    if method == 'GPU':
        # the sampled value is not used for GPU runs
        params.update(colsample_bylevel=None)

    def score_fold(fit_idx, val_idx):
        # train on one split and score it on its held-out part
        val_x, val_y = x.iloc[val_idx], y.iloc[val_idx]

        clf = catboost.CatBoostClassifier(**params)
        clf.fit(
            catboost.Pool(x.iloc[fit_idx], y.iloc[fit_idx]),
            eval_set=catboost.Pool(val_x, val_y),
            early_stopping_rounds=20,
            verbose=0,
        )
        return accuracy_score(val_y, clf.predict(val_x))

    # train and score with cv
    scores = [score_fold(*fold) for fold in skf.split(x, y)]
    return np.mean(scores)

In [15]:
if cfg.cat:

    # create study
    sampler = optuna.samplers.TPESampler(seed=cfg.seed)
    max_trials = 1 if cfg.debug else 50   # a single trial in debug mode
    time_limit = 3600 * 0.5               # passed as the study timeout

    study = optuna.create_study(
        sampler=sampler,
        study_name= f'{modelname}_optimization',
        direction='maximize')

    # perform optimization
    print(f'Starting {modelname} optimization...')
    study.optimize(
        objective,
        n_trials = max_trials,
        timeout = time_limit,
    )

    # optimization results: best tuned values merged over the fixed ones
    print(f"\nNumber of finished trials: {len(study.trials)}")
    print(f"Best score: {study.best_value}")
    best_params = {**fixed_params, **study.best_trial.params}

    # The objective replaces the sampled colsample_bylevel with None for GPU
    # runs, so the trial's stored value was never used for training. Apply the
    # same override here so the saved parameters match what was evaluated.
    if fixed_params.get('task_type') == 'GPU':
        best_params['colsample_bylevel'] = None

    print("Best trial parameters:")
    for k, v in best_params.items():
        print(f"\t{k}: {v}")

    # save best params to disk and keep them for the summary cell
    params_path = f'{studies_path}{modelname}_best_params.joblib'
    with open(params_path, "wb") as file:
        joblib.dump(best_params, file)

    models_paras_dict[modelname] = best_params

## Save results

In [16]:
# show results: one line per model whose study was run
for k in models_paras_dict:
    v = models_paras_dict[k]
    print(f'Model: {k}, best parameters: {v}\n')

Model: lgb, best parameters: {'objective': 'binary', 'metric': 'binary_logloss', 'device': 'cpu', 'verbosity': -1, 'early_stopping_round': 15, 'n_estimators': 2891, 'learning_rate': 0.038933232152364, 'reg_alpha': 9.946806064915265, 'reg_lambda': 4.483302202826843, 'num_leaves': 106, 'colsample_bytree': 0.48434846063745407, 'subsample': 0.5601149859767293, 'subsample_freq': 5, 'min_child_samples': 70}

Model: xgb, best parameters: {'objective': 'binary:logistic', 'eval_metric': 'logloss', 'tree_method': 'hist', 'early_stopping_rounds': 20, 'n_estimators': 1856, 'learning_rate': 0.025220464694322764, 'lambda': 5.42262562491136, 'alpha': 4.892831496385892, 'colsample_bytree': 0.40628264111914086, 'subsample': 0.44609618803514217, 'max_depth': 4, 'min_child_weight': 3, 'num_parallel_tree': 1}

