# Boosting

In [1]:
# Record the OS and the Python interpreter this notebook was executed with.
import platform
import sys

print(platform.platform())
print("Python", sys.version)

Windows-10-10.0.19045-SP0
Python 3.10.5 (tags/v3.10.5:f377153, Jun  6 2022, 16:14:13) [MSC v.1929 64 bit (AMD64)]


## Setup

In [2]:
import numpy as np
import pandas as pd

import os 
import joblib
import time

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error

import optuna
import lightgbm as lgb

In [3]:
class cfg:
    """Notebook-wide settings, read as plain class attributes (never instantiated here)."""

    # run identity: used to build the study name and the output file names
    modelname = 'lgb'

    # workflow switches
    optim = False   # True -> run the Optuna cells; False -> reuse the saved params file
    debug = False   # True -> cap the Optuna search at 2 trials

    # resampling / reproducibility
    nfolds = 5      # number of KFold splits
    seed = 42       # random_state for KFold and seed of the TPE sampler

    # NOTE(review): not referenced by any cell visible in this notebook
    njobs = 2

In [4]:
# Read the three pre-split datasets from disk, in the order train / valid / test.
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/final/train.csv',
        '../data/final/valid.csv',
        '../data/final/test.csv',
    )
)

# preview the first rows of the training frame
train.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal,IsSynthetic,...,Monterey County,Napa County,Orange County,Other,Riverside County,San Francisco County,Santa Barbara County,Santa Clara County,Ventura County,Yolo County
0,5.0855,17.0,6.190805,1.105747,1538.0,3.373563,38.02,-121.36,1.035,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.3636,5.0,4.239899,1.020202,3278.0,2.066007,33.68,-117.53,1.969,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0417,29.0,6.602317,1.023529,1119.0,2.705479,33.84,-117.95,2.638,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.1364,22.0,5.448584,0.986717,1750.0,3.020478,38.47,-121.77,1.188,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.6031,39.0,4.742204,1.075092,1168.0,3.25,36.75,-119.8,0.669,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# target column, and every other training column treated as a feature
TARGET = 'MedHouseVal'
FEATURES = [col for col in train.columns if col != TARGET]

# feature matrix and target vector taken from the training split
x, y = train[FEATURES], train[TARGET]

# quick overview of what was loaded
print(f'Target: {TARGET}\nFeatures: {FEATURES}')
for label, frame in (
    ('Train set shape:', train),
    ('Validation set shape:', valid),
    ('Test set shape:', test),
):
    print(label, frame.shape)

Target: MedHouseVal
Features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'IsSynthetic', 'AnomalyScore', 'rot_15_x', 'rot_15_y', 'rot_30_x', 'rot_30_y', 'rot_45_x', 'pca_lon', 'pca_lat', 'Alameda County', 'Contra Costa County', 'Fresno County', 'Kern County', 'Los Angeles County', 'Monterey County', 'Napa County', 'Orange County', 'Other', 'Riverside County', 'San Francisco County', 'Santa Barbara County', 'Santa Clara County', 'Ventura County', 'Yolo County']
Train set shape: (42922, 33)
Validation set shape: (14855, 33)
Test set shape: (24759, 32)


In [6]:
# folder that receives the tuned hyperparameters (created if it does not exist)
params_path = './training_files/params/'
os.makedirs(params_path, exist_ok=True)

# seeded, shuffled k-fold splitter used by the tuning objective
cv = KFold(
    n_splits=cfg.nfolds,
    shuffle=True,
    random_state=cfg.seed,
)

## Hyperparameter tuning

In [7]:
# fixed params
# LightGBM settings shared by every Optuna trial: `objective` merges its sampled
# values on top of this dict, and the same keys are carried into the best params
# that get saved to disk. Key order is kept as-is because it is the order in which
# the parameters are printed later on.
fixed_params = {
        'objective': 'regression',
        'metric': 'rmse',
        'device': 'cpu',
        'verbosity': -1,
        'early_stopping_round': 15,
        'boosting_type': 'gbdt' 
    }

# objective function for optimization
def objective(trial):
    """Return the mean cross-validated RMSE of a LightGBM regressor for one trial.

    The hyperparameters sampled from ``trial`` are merged over the module-level
    ``fixed_params``. One model is fitted per split of the module-level ``cv``
    on ``x`` / ``y``; the held-out rows of each split serve both as the
    ``eval_set`` passed to ``fit`` and as the data the fold is scored on.

    Parameters
    ----------
    trial
        Trial object handed in by ``study.optimize``; supplies the
        ``suggest_*`` calls below.

    Returns
    -------
    Mean of the per-fold RMSE values.
    """

    # search space -- the suggest_* calls stay in this order
    # NOTE(review): reg_alpha / reg_lambda are sampled on a linear scale starting
    # at 1e-8; a log scale may have been intended -- confirm before changing.
    sampled = {
        #'boosting_type': trial.suggest_categorical('boosting', ['gbdt', 'dart']),
        'n_estimators': trial.suggest_int('n_estimators', 100, 2500),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.3),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }
    model_params = {**fixed_params, **sampled}

    def fold_rmse(fit_idx, holdout_idx):
        """Fit on one split and return the RMSE on that split's held-out rows."""
        x_fit, x_holdout = x.iloc[fit_idx], x.iloc[holdout_idx]
        y_fit, y_holdout = y.iloc[fit_idx], y.iloc[holdout_idx]

        regressor = lgb.LGBMRegressor(**model_params)
        regressor.fit(
            x_fit,
            y_fit,
            eval_set=[(x_holdout, y_holdout)],
            callbacks=[lgb.log_evaluation(period=0, show_stdv=False)],
        )

        squared_error = mean_squared_error(y_holdout, regressor.predict(x_holdout))
        return np.sqrt(squared_error)

    # score every fold in turn and average
    fold_scores = [
        fold_rmse(fit_idx, holdout_idx)
        for fit_idx, holdout_idx in cv.split(x, y)
    ]
    return np.mean(fold_scores)

In [8]:
if cfg.optim:

    # search budget: trial cap (2 in debug mode) and the value passed as `timeout`
    time_limit = 3600 * 0.5
    max_trials = 100
    if cfg.debug:
        max_trials = 2

    # seeded TPE sampler, and a study that minimises the objective's return value
    sampler = optuna.samplers.TPESampler(seed=cfg.seed)
    study = optuna.create_study(
        study_name=f'{cfg.modelname}_optimization',
        direction='minimize',
        sampler=sampler,
    )

    # run the search
    print(f'Starting {cfg.modelname} optimization...')
    study.optimize(objective, n_trials=max_trials, timeout=time_limit)

In [9]:
if cfg.optim:

    # report how the search went
    print(f"Number of finished trials: {len(study.trials)}")
    print(f"Best score: {study.best_value}")

    # winning trial's values layered over the fixed settings
    best_params = dict(fixed_params)
    best_params.update(study.best_trial.params)
    print("Best trial parameters:")
    for k, v in best_params.items():
        print(f"\t{k}: {v}")

    # persist to disk; the final-model section reloads this file
    best_params_path = f'{params_path}{cfg.modelname}.joblib'
    with open(best_params_path, "wb") as out_file:
        joblib.dump(best_params, out_file)

## Final model

### Classic boosting

In [10]:
# read back the tuned parameters written by the optimization section
best_params_path = f'{params_path}{cfg.modelname}.joblib'
with open(best_params_path, 'rb') as params_file:
    tuned_params = joblib.load(params_file)

# the final fit uses a fixed learning rate of 0.02 in place of the tuned one
# NOTE(review): n_estimators is left at its tuned value -- confirm that is intended
best_params = {**tuned_params, 'learning_rate': 0.02}

print("Final parameters:")
for k, v in best_params.items():
    print(f"\t{k}: {v}")

Final parameters:
	objective: regression
	metric: rmse
	device: cpu
	verbosity: -1
	early_stopping_round: 15
	boosting_type: gbdt
	n_estimators: 540
	learning_rate: 0.02
	reg_alpha: 3.687775860273211
	reg_lambda: 9.993624386421892
	num_leaves: 338
	colsample_bytree: 0.4153296968863259
	subsample: 0.8131374963745043
	subsample_freq: 5
	min_child_samples: 31


In [11]:
# cv training and predict
def train_model(train, valid, test, params):
    """Fit one LightGBM regressor per CV fold and predict `valid` and `test` with each.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data; must contain the module-level ``FEATURES`` and ``TARGET`` columns.
    valid : pandas.DataFrame
        Hold-out data; only its ``FEATURES`` columns are used.
    test : pandas.DataFrame
        Data to predict; only its ``FEATURES`` columns are used.
    params : dict
        Keyword arguments forwarded to ``lgb.LGBMRegressor``.

    Returns
    -------
    tuple
        ``(cv_scores, preds, preds_val)``: the per-fold RMSE on the held-out
        part of ``train``, the per-fold predictions for ``test`` and the
        per-fold predictions for ``valid``. Each is a list with one entry per fold.
    """

    start_time = time.time()
    print(30*'*', f'Model: {cfg.modelname}', 30*'*', '\n')

    # get data
    x = train[FEATURES]
    y = train[TARGET]

    # Select the feature columns once, by name, for both prediction sets so they
    # are guaranteed to match the columns (and column order) the models are fit on.
    x_valid = valid[FEATURES]
    x_test = test[FEATURES]

    # cv loop
    cv = KFold(n_splits=cfg.nfolds, shuffle=True, random_state=cfg.seed)
    preds, preds_val, cv_scores = [], [], []
    for fold, (train_idx, test_idx) in enumerate(cv.split(x, y)):

        print(f'fold {fold+1}/{cfg.nfolds}...')
        fold_start_time = time.time()

        # split data
        x_train, x_val = x.iloc[train_idx], x.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        # define and fit model; the held-out fold is the eval set that gets logged
        model = lgb.LGBMRegressor(**params)
        model.fit(
            x_train,
            y_train,
            eval_set = [(x_val, y_val)],
            callbacks = [lgb.log_evaluation(200)],
            eval_metric = ['root_mean_squared_error', 'mean_absolute_error'],
        )

        # validation score
        cv_score = np.sqrt(mean_squared_error(y_val, model.predict(x_val)))
        cv_scores.append(cv_score)

        # predict validation and test set
        preds_val.append(model.predict(x_valid))
        preds.append(model.predict(x_test))

        fold_run_time = time.time() - fold_start_time
        print(f'rmse: {cv_score:.4f}, run time: {fold_run_time:.2f}\n')

    # print results
    run_time = time.time() - start_time
    # '.2f' (two decimals); the previous '2f' was a minimum-width spec
    print(f'\nTraining completed. Total run time: {run_time:.2f}')
    print(f'CV score:\n\t mean: {np.mean(cv_scores):0.6f}\n\t std: {np.std(cv_scores):0.6f}')

    return (cv_scores, preds, preds_val)

In [12]:
# run the cross-validated training with the final parameter set
scores, preds, preds_val = train_model(
    train=train,
    valid=valid,
    test=test,
    params=best_params,
)

****************************** Model: lgb ****************************** 

fold 1/5...
[200]	valid_0's rmse: 0.514249	valid_0's l1: 0.35123
[400]	valid_0's rmse: 0.502034	valid_0's l1: 0.337494
rmse: 0.5004, run time: 9.24

fold 2/5...
[200]	valid_0's rmse: 0.520386	valid_0's l1: 0.351226
[400]	valid_0's rmse: 0.509515	valid_0's l1: 0.338595
rmse: 0.5085, run time: 11.05

fold 3/5...
[200]	valid_0's rmse: 0.51816	valid_0's l1: 0.358684
[400]	valid_0's rmse: 0.505782	valid_0's l1: 0.345925
rmse: 0.5042, run time: 11.43

fold 4/5...
[200]	valid_0's rmse: 0.499544	valid_0's l1: 0.341593
[400]	valid_0's rmse: 0.487976	valid_0's l1: 0.327667
rmse: 0.4868, run time: 11.86

fold 5/5...
[200]	valid_0's rmse: 0.508364	valid_0's l1: 0.347599
[400]	valid_0's rmse: 0.496688	valid_0's l1: 0.333658
rmse: 0.4953, run time: 11.34


Training completed. Total run time: 54.932016
CV score:
	 mean: 0.499000
	 std: 0.007499


In [13]:
# mean cv score across folds, and fold-averaged test predictions
score = np.asarray(scores).mean()
final_preds = np.array(preds).mean(axis=0)

In [14]:
# average the per-fold predictions for the validation set
final_preds_val = np.array(preds_val).mean(axis=0)

# write them out, creating the target folder if needed
val_path = './training_files/valid_preds/'
os.makedirs(val_path, exist_ok=True)
val_preds_file = val_path + f'{cfg.modelname}.csv'
pd.Series(final_preds_val).to_csv(val_preds_file)

In [15]:
# save predictions: fill the sample submission's target column and write it out
sub = pd.read_csv('../data/raw/sample_submission.csv', index_col=0)
sub[TARGET] = final_preds

out_path = '../submissions/'
os.makedirs(out_path, exist_ok=True)
sub.to_csv(out_path + f'{cfg.modelname}.csv')

# Preview what was written. This has to be the cell's last expression to be
# rendered; in the middle of the cell it was evaluated and silently discarded.
sub.head()

### DART boosting 

In [16]:
# use dart boosting
best_params['boosting_type'] = 'dart'

# Early stopping is dropped for the DART run. Passing a default keeps this cell
# re-runnable: a bare pop() raises KeyError once the key has already been removed.
best_params.pop('early_stopping_round', None)

print("Final parameters:")
for k, v in best_params.items():
    print(f"\t{k}: {v}")

Final parameters:
	objective: regression
	metric: rmse
	device: cpu
	verbosity: -1
	boosting_type: dart
	n_estimators: 540
	learning_rate: 0.02
	reg_alpha: 3.687775860273211
	reg_lambda: 9.993624386421892
	num_leaves: 338
	colsample_bytree: 0.4153296968863259
	subsample: 0.8131374963745043
	subsample_freq: 5
	min_child_samples: 31


In [17]:
# run the cross-validated training again, now with the DART parameter set
scores, preds, preds_val = train_model(
    train=train,
    valid=valid,
    test=test,
    params=best_params,
)

****************************** Model: lgb ****************************** 

fold 1/5...
[200]	valid_0's rmse: 0.903674	valid_0's l1: 0.648589
[400]	valid_0's rmse: 0.631119	valid_0's l1: 0.421809
rmse: 0.5795, run time: 20.10

fold 2/5...
[200]	valid_0's rmse: 0.914123	valid_0's l1: 0.658512
[400]	valid_0's rmse: 0.640903	valid_0's l1: 0.427284
rmse: 0.5888, run time: 24.36

fold 3/5...
[200]	valid_0's rmse: 0.920668	valid_0's l1: 0.666716
[400]	valid_0's rmse: 0.643094	valid_0's l1: 0.434025
rmse: 0.5885, run time: 25.35

fold 4/5...
[200]	valid_0's rmse: 0.898699	valid_0's l1: 0.642739
[400]	valid_0's rmse: 0.624203	valid_0's l1: 0.416601
rmse: 0.5702, run time: 20.25

fold 5/5...
[200]	valid_0's rmse: 0.920179	valid_0's l1: 0.667164
[400]	valid_0's rmse: 0.638948	valid_0's l1: 0.43008
rmse: 0.5833, run time: 23.53


Training completed. Total run time: 113.607406
CV score:
	 mean: 0.582069
	 std: 0.006859


In [18]:
# mean cv score across folds, and fold-averaged test predictions
score = np.asarray(scores).mean()
final_preds = np.array(preds).mean(axis=0)

In [19]:
# average the per-fold predictions for the validation set
final_preds_val = np.array(preds_val).mean(axis=0)

# write them out, creating the target folder if needed
val_path = './training_files/valid_preds/'
os.makedirs(val_path, exist_ok=True)
val_preds_file = val_path + f'{cfg.modelname}_dart.csv'
pd.Series(final_preds_val).to_csv(val_preds_file)

In [20]:
# save predictions: fill the sample submission's target column and write it out
sub = pd.read_csv('../data/raw/sample_submission.csv', index_col=0)
sub[TARGET] = final_preds

out_path = '../submissions/'
os.makedirs(out_path, exist_ok=True)
sub.to_csv(out_path + f'{cfg.modelname}_dart.csv')

# Preview what was written. This has to be the cell's last expression to be
# rendered; in the middle of the cell it was evaluated and silently discarded.
sub.head()