# Boosting

In [1]:
# record the OS and interpreter this notebook was executed with
import platform
import sys

print(platform.platform())
print("Python", sys.version)

Windows-10-10.0.19045-SP0
Python 3.10.5 (tags/v3.10.5:f377153, Jun  6 2022, 16:14:13) [MSC v.1929 64 bit (AMD64)]


## Setup

In [2]:
import numpy as np
import pandas as pd

import os 
import joblib
import time

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error

import optuna
import catboost

In [3]:
class cfg:
    """Notebook-wide settings read by the cells below."""

    # label used in log messages, the Optuna study name and output file names
    modelname = "catboost"

    # run switches: `debug` shrinks the tuning budget to 2 trials,
    # `optim` gates the hyperparameter-search cells
    debug = False
    optim = True

    # seed for the KFold splitter and the Optuna sampler; number of CV folds
    seed = 42
    nfolds = 5

    # NOTE(review): not referenced by any visible cell - confirm whether it
    # was meant to be forwarded to the model (e.g. as a thread count)
    njobs = 2

In [4]:
# read the prepared train and test tables (train first, then test)
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/final/train.csv', '../data/final/test.csv')
)

# preview the first training rows
train.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal,IsSynthetic,...,Monterey County,Napa County,Orange County,Other,Riverside County,San Francisco County,Santa Barbara County,Santa Clara County,Ventura County,Yolo County
0,2.3859,15.0,3.82716,1.1121,1280.0,2.486989,34.6,-120.12,0.98,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,3.7188,17.0,6.013373,1.054217,1504.0,3.813084,38.69,-121.22,0.946,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.775,27.0,6.535604,1.103175,1061.0,2.464602,34.71,-120.45,1.576,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2.4138,16.0,3.350203,0.965432,1255.0,2.089286,32.66,-117.09,1.336,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.75,52.0,4.284404,1.069246,1793.0,1.60479,37.8,-122.41,4.5,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [5]:
# target column and model inputs (every other column of the train table)
TARGET = 'MedHouseVal'
FEATURES = [col for col in train.columns if col != TARGET]

# design matrix / target vector used by the tuning objective
x, y = train[FEATURES], train[TARGET]

# quick overview of what was loaded
print(f'Target: {TARGET}\nFeatures: {FEATURES}')
print('Train set shape:', train.shape)
print('Test set shape:', test.shape)

Target: MedHouseVal
Features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'IsSynthetic', 'AnomalyScore', 'rot_15_x', 'rot_15_y', 'rot_30_x', 'rot_30_y', 'rot_45_x', 'pca_lon', 'pca_lat', 'Alameda County', 'Contra Costa County', 'Fresno County', 'Kern County', 'Los Angeles County', 'Monterey County', 'Napa County', 'Orange County', 'Other', 'Riverside County', 'San Francisco County', 'Santa Barbara County', 'Santa Clara County', 'Ventura County', 'Yolo County']
Train set shape: (57777, 33)
Test set shape: (24759, 32)


In [6]:
# shuffled, seeded K-fold splitter used by the tuning objective
cv = KFold(random_state=cfg.seed, shuffle=True, n_splits=cfg.nfolds)

# directory where tuned hyperparameters are stored; created if missing
params_path = './training_files/params/'
os.makedirs(params_path, exist_ok=True)

## Hyperparameter tuning

In [7]:
# CatBoost settings that are not tuned: they are merged into every trial's
# parameters and into the saved best parameters
fixed_params = dict(
    loss_function='RMSE',
    custom_metric='RMSE',
    task_type='CPU',
    bootstrap_type='Bayesian',
    allow_writing_files=False,
)

# objective function for optimization
def objective(trial):
    """Score one Optuna trial by cross-validated RMSE.

    Samples a set of CatBoost hyperparameters from ``trial``, merges them
    with the module-level ``fixed_params``, fits one regressor per fold of
    the module-level ``cv`` splitter on ``x`` / ``y`` and returns the mean
    of the per-fold validation RMSE values.

    Parameters
    ----------
    trial
        Object exposing ``suggest_int`` / ``suggest_float``; supplied by
        ``study.optimize``.

    Returns
    -------
    The mean RMSE over all folds, as computed by ``np.mean``.
    """
    # hyperparameters sampled for this trial
    search_space = dict(
        num_trees=trial.suggest_int('num_trees', 100, 1000),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1),
        bagging_temperature=trial.suggest_float('bagging_temperature', 1, 25),
        random_strength=trial.suggest_float('random_strength', 1, 10),
        depth=trial.suggest_int('depth', 1, 8, step=1),
        colsample_bylevel=trial.suggest_float('colsample_bylevel', 0.1, 0.8),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 0, 10),
    )

    # sampled values are layered over the fixed settings
    model_params = dict(fixed_params)
    model_params.update(search_space)

    fold_rmse = []
    for fit_idx, val_idx in cv.split(x, y):

        # rows of this fold
        x_fit, y_fit = x.iloc[fit_idx], y.iloc[fit_idx]
        x_val, y_val = x.iloc[val_idx], y.iloc[val_idx]

        # the validation fold is passed as eval_set for early stopping and
        # is also the data the fold RMSE is computed on
        fit_pool = catboost.Pool(x_fit, y_fit)
        val_pool = catboost.Pool(x_val, y_val)
        regressor = catboost.CatBoostRegressor(**model_params)
        regressor.fit(
            fit_pool,
            eval_set=val_pool,
            early_stopping_rounds=20,
            verbose=0,
        )

        # RMSE on the held-out fold
        val_pred = regressor.predict(x_val)
        fold_rmse.append(np.sqrt(mean_squared_error(y_val, val_pred)))

    # the value Optuna minimizes
    return np.mean(fold_rmse)

In [8]:
if cfg.optim:

    # search budget handed to study.optimize: a trial cap (tiny in debug
    # mode) and a timeout in seconds (half an hour)
    if cfg.debug:
        max_trials = 2
    else:
        max_trials = 50
    time_limit = 0.5 * 3600

    # seeded TPE sampler; the study minimizes the objective's return value
    sampler = optuna.samplers.TPESampler(seed=cfg.seed)
    study = optuna.create_study(
        study_name=f'{cfg.modelname}_optimization',
        direction='minimize',
        sampler=sampler,
    )

    # run the search
    print(f'Starting {cfg.modelname} optimization...')
    study.optimize(objective, n_trials=max_trials, timeout=time_limit)

[32m[I 2023-01-08 12:51:45,335][0m A new study created in memory with name: catboost_optimization[0m


Starting catboost optimization...


[32m[I 2023-01-08 12:51:57,581][0m Trial 0 finished with value: 0.606567517154166 and parameters: {'num_trees': 437, 'learning_rate': 0.09556428757689246, 'bagging_temperature': 18.567854603473723, 'random_strength': 6.387926357773329, 'depth': 2, 'colsample_bylevel': 0.20919616423534187, 'l2_leaf_reg': 0.5808361216819946}. Best is trial 0 with value: 0.606567517154166.[0m
[32m[I 2023-01-08 12:53:19,079][0m Trial 1 finished with value: 0.5476696740373589 and parameters: {'num_trees': 880, 'learning_rate': 0.0641003510568888, 'bagging_temperature': 17.993741867105093, 'random_strength': 1.185260448662222, 'depth': 8, 'colsample_bylevel': 0.6827098485602953, 'l2_leaf_reg': 2.1233911067827616}. Best is trial 1 with value: 0.5476696740373589.[0m
[32m[I 2023-01-08 12:53:26,621][0m Trial 2 finished with value: 0.6107493754755131 and parameters: {'num_trees': 263, 'learning_rate': 0.026506405886809047, 'bagging_temperature': 8.301813831028905, 'random_strength': 5.72280788469014, 'dep

In [9]:
if cfg.optim:

    # summary of the search
    print(f"Number of finished trials: {len(study.trials)}")
    print(f"Best score: {study.best_value}")

    # best trial's sampled values layered over the fixed settings
    best_params = dict(fixed_params)
    best_params.update(study.best_trial.params)

    print("Best trial parameters:")
    for name, value in best_params.items():
        print(f"\t{name}: {value}")

    # persist the merged parameters so the final-model section can load them
    best_params_path = f'{params_path}{cfg.modelname}.joblib'
    with open(best_params_path, "wb") as file:
        joblib.dump(best_params, file)

Number of finished trials: 49
Best score: 0.5303909419153083
Best trial parameters:
	loss_function: RMSE
	custom_metric: RMSE
	task_type: CPU
	bootstrap_type: Bayesian
	allow_writing_files: False
	num_trees: 955
	learning_rate: 0.04970656059425884
	bagging_temperature: 4.605862369391987
	random_strength: 2.1610845898091626
	depth: 8
	colsample_bylevel: 0.17170220706427092
	l2_leaf_reg: 6.345347577047418


## Final model

In [10]:
# read back the parameters stored by the tuning section
best_params_path = f'{params_path}{cfg.modelname}.joblib'
with open(best_params_path, 'rb') as file:
    best_params = joblib.load(file)

# manual override of the tuned learning rate for the final fit
# NOTE(review): num_trees is left at its tuned value; the fold-1 training log
# reports bestIteration at the last iteration, so it may be worth confirming
# that the tree budget is still sufficient at this lower rate
best_params.update(learning_rate=0.02)

print("Final parameters:")
for name in best_params:
    print(f"\t{name}: {best_params[name]}")

Final parameters:
	loss_function: RMSE
	custom_metric: RMSE
	task_type: CPU
	bootstrap_type: Bayesian
	allow_writing_files: False
	num_trees: 955
	learning_rate: 0.02
	bagging_temperature: 4.605862369391987
	random_strength: 2.1610845898091626
	depth: 8
	colsample_bylevel: 0.17170220706427092
	l2_leaf_reg: 6.345347577047418


In [11]:
# cv training and predict
def train_model(train, test, params):
    """Fit one CatBoost regressor per CV fold and predict the test set with each.

    Uses the module-level ``FEATURES`` / ``TARGET`` column names and the
    ``cfg`` settings (fold count, seed, model name for logging).

    Parameters
    ----------
    train
        Table containing the ``FEATURES`` columns and the ``TARGET`` column.
    test
        Table containing the ``FEATURES`` columns.
    params : dict
        Keyword arguments forwarded to ``catboost.CatBoostRegressor``.

    Returns
    -------
    tuple
        ``(cv_scores, preds)``: the per-fold validation RMSE values and the
        per-fold test-set predictions, both as lists in fold order.
    """

    start_time = time.time()
    print(30*'*', f'Model: {cfg.modelname}', 30*'*', '\n')

    # get data
    x = train[FEATURES]
    y = train[TARGET]
    xtest = test[FEATURES]

    # cv loop
    cv = KFold(n_splits=cfg.nfolds, shuffle=True, random_state=cfg.seed)
    preds, cv_scores = [], []
    for fold, (train_idx, test_idx) in enumerate(cv.split(x, y)):

        print(f'fold {fold+1}/{cfg.nfolds}...')
        fold_start_time = time.time()

        # split data
        x_train, x_val = x.iloc[train_idx], x.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        # define and fit model; the validation fold is passed as eval_set for
        # early stopping and is also the data the fold RMSE is computed on
        model = catboost.CatBoostRegressor(**params)
        model.fit(
            catboost.Pool(x_train, y_train),
            eval_set=catboost.Pool(x_val, y_val),
            early_stopping_rounds=20,
            verbose=200
        )

        # validation score
        cv_score = np.sqrt(mean_squared_error(y_val, model.predict(x_val)))
        cv_scores.append(cv_score)

        # predict test data
        preds.append(model.predict(xtest))

        fold_run_time = time.time() - fold_start_time
        print(f'rmse: {cv_score:.4f}, run time: {fold_run_time:.2f}\n')

    # print results
    run_time = time.time() - start_time
    # '.2f' (two decimals) to match the per-fold message; the previous '2f'
    # spec set a minimum width of 2 and printed six decimals
    print(f'\nTraining completed. Total run time: {run_time:.2f}')
    print(f'CV score:\n\t mean: {np.mean(cv_scores):0.6f}\n\t std: {np.std(cv_scores):0.6f}')

    return (cv_scores, preds)

In [12]:
# run the cross-validated fit; keeps per-fold scores and test predictions
scores, preds = train_model(train=train, test=test, params=best_params)

****************************** Model: catboost ****************************** 

fold 1/5...
0:	learn: 1.1446988	test: 1.1394422	best: 1.1394422 (0)	total: 11.5ms	remaining: 11s
200:	learn: 0.5793968	test: 0.5701809	best: 0.5701809 (200)	total: 2.29s	remaining: 8.58s
400:	learn: 0.5512678	test: 0.5457476	best: 0.5457476 (400)	total: 4.52s	remaining: 6.24s
600:	learn: 0.5385791	test: 0.5369017	best: 0.5369017 (600)	total: 6.72s	remaining: 3.96s
800:	learn: 0.5293133	test: 0.5315741	best: 0.5315741 (800)	total: 8.94s	remaining: 1.72s
954:	learn: 0.5233545	test: 0.5282434	best: 0.5282434 (954)	total: 10.6s	remaining: 0us

bestTest = 0.5282434092
bestIteration = 954

rmse: 0.5282, run time: 10.79

fold 2/5...
0:	learn: 1.1446188	test: 1.1356876	best: 1.1356876 (0)	total: 11.8ms	remaining: 11.2s
200:	learn: 0.5775014	test: 0.5834148	best: 0.5834148 (200)	total: 2.21s	remaining: 8.28s
400:	learn: 0.5490621	test: 0.5617387	best: 0.5617387 (400)	total: 4.41s	remaining: 6.09s
600:	learn: 0.53595

In [13]:
# final prediction = element-wise average of the per-fold test predictions
final_preds = np.array(preds).mean(axis=0)

# overall CV score = mean of the per-fold RMSE values
score = np.mean(scores)

In [14]:
# save predictions: fill the sample submission's target column with the
# fold-averaged predictions
sub = pd.read_csv('../data/raw/sample_submission.csv', index_col=0)
sub[TARGET] = final_preds

# write the submission file, creating the output directory if needed
out_path = '../submissions/'
os.makedirs(out_path, exist_ok=True)
sub.to_csv(out_path + f'{cfg.modelname}.csv')

# preview what was written; kept as the last expression of the cell so the
# notebook actually renders it (a bare expression mid-cell is discarded)
sub.head()