# Boosting

In [1]:
# Record the OS and interpreter versions this notebook was executed with.
import platform
import sys

print(platform.platform())
print("Python", sys.version)

Windows-10-10.0.19045-SP0
Python 3.10.5 (tags/v3.10.5:f377153, Jun  6 2022, 16:14:13) [MSC v.1929 64 bit (AMD64)]


## Setup

In [2]:
import numpy as np
import pandas as pd

import os 
import joblib
import time

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error

import optuna
import lightgbm as lgb

In [3]:
class cfg:
    """Notebook-wide settings read by the tuning and training cells."""

    # label used in the study name, log lines and output file names
    modelname = 'lgb'
    # seed handed to the KFold splitters and the Optuna sampler
    seed = 42
    # number of KFold splits
    nfolds = 5
    # not referenced by any cell visible in this notebook
    njobs = 2
    # True -> run the Optuna search cells; False -> skip them
    optim = False
    # True -> cap the Optuna search at 2 trials
    debug = False

In [4]:
# Read the prepared train/test tables from disk.
train, test = (
    pd.read_csv('../data/final/train.csv'),
    pd.read_csv('../data/final/test.csv'),
)

# Preview the first rows of the training frame (rendered as the cell output).
train.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal,IsSynthetic,...,Monterey County,Napa County,Orange County,Other,Riverside County,San Francisco County,Santa Barbara County,Santa Clara County,Ventura County,Yolo County
0,2.3859,15.0,3.82716,1.1121,1280.0,2.486989,34.6,-120.12,0.98,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,3.7188,17.0,6.013373,1.054217,1504.0,3.813084,38.69,-121.22,0.946,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.775,27.0,6.535604,1.103175,1061.0,2.464602,34.71,-120.45,1.576,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2.4138,16.0,3.350203,0.965432,1255.0,2.089286,32.66,-117.09,1.336,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.75,52.0,4.284404,1.069246,1793.0,1.60479,37.8,-122.41,4.5,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [5]:
# Column to predict; every other column of the training frame is a model input.
TARGET = 'MedHouseVal'
FEATURES = [col for col in train.columns if col != TARGET]

# Split the training frame into the feature matrix and the target vector.
x, y = train[FEATURES], train[TARGET]

# Quick overview of what was loaded.
print(f'Target: {TARGET}\nFeatures: {FEATURES}')
print('Train set shape:', train.shape)
print('Test set shape:', test.shape)

Target: MedHouseVal
Features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'IsSynthetic', 'AnomalyScore', 'rot_15_x', 'rot_15_y', 'rot_30_x', 'rot_30_y', 'rot_45_x', 'pca_lon', 'pca_lat', 'Alameda County', 'Contra Costa County', 'Fresno County', 'Kern County', 'Los Angeles County', 'Monterey County', 'Napa County', 'Orange County', 'Other', 'Riverside County', 'San Francisco County', 'Santa Barbara County', 'Santa Clara County', 'Ventura County', 'Yolo County']
Train set shape: (57777, 33)
Test set shape: (24759, 32)


In [6]:
# Folder where the tuned hyperparameters are stored (created if missing).
params_path = './training_files/params/'
os.makedirs(params_path, exist_ok=True)

# Shuffled, seeded K-fold splitter used by the tuning objective.
cv = KFold(
    n_splits=cfg.nfolds,
    shuffle=True,
    random_state=cfg.seed,
)

## Hyperparameter tuning

In [7]:
# LightGBM settings shared by every trial; the search space never overrides them.
fixed_params = dict(
    objective='regression',
    metric='rmse',
    device='cpu',
    verbosity=-1,
    early_stopping_round=15,
    boosting_type='gbdt',
)

# objective function for optimization
def objective(trial):
    """Score one trial by its mean cross-validated RMSE.

    Draws a LightGBM hyperparameter set from `trial`, merges it over the
    module-level `fixed_params`, fits one `LGBMRegressor` per split of the
    module-level `cv` on the module-level `x` / `y`, and averages the
    per-fold validation RMSE.

    Parameters
    ----------
    trial
        Trial object handed in by `study.optimize`; only its `suggest_int`
        and `suggest_float` methods are used.

    Returns
    -------
    float
        Mean of the per-fold validation RMSE values.
    """
    # candidate hyperparameters for this trial
    sampled = {
        #'boosting_type': trial.suggest_categorical('boosting', ['gbdt', 'dart']),
        'n_estimators': trial.suggest_int('n_estimators', 100, 2500),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.3),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    # on a key clash the sampled value wins over the fixed one
    model_params = {**fixed_params, **sampled}

    def fold_rmse(fit_idx, val_idx):
        """Fit on one split and return the RMSE on its held-out part."""
        x_fit, x_held = x.iloc[fit_idx], x.iloc[val_idx]
        y_fit, y_held = y.iloc[fit_idx], y.iloc[val_idx]

        regressor = lgb.LGBMRegressor(**model_params)
        regressor.fit(
            x_fit,
            y_fit,
            # the held-out part is also supplied as the evaluation set
            eval_set=[(x_held, y_held)],
            callbacks=[lgb.log_evaluation(period=0, show_stdv=False)]
        )
        return np.sqrt(mean_squared_error(y_held, regressor.predict(x_held)))

    # one score per fold, in split order
    fold_scores = [fold_rmse(fit_idx, val_idx) for fit_idx, val_idx in cv.split(x, y)]

    return np.mean(fold_scores)

In [8]:
if cfg.optim:

    # search budget: trial cap (2 in debug mode) and wall-clock cap in seconds
    max_trials = 2 if cfg.debug else 100
    time_limit = 3600 * 0.5

    # seeded TPE sampler and a study that minimizes the objective's return value
    sampler = optuna.samplers.TPESampler(seed=cfg.seed)
    study = optuna.create_study(
        study_name=f'{cfg.modelname}_optimization',
        direction='minimize',
        sampler=sampler,
    )

    # run the search until either budget is exhausted
    print(f'Starting {cfg.modelname} optimization...')
    study.optimize(objective, n_trials=max_trials, timeout=time_limit)

In [9]:
if cfg.optim:

    # merge the best trial's values over the fixed settings
    best_params = {**fixed_params, **study.best_trial.params}

    # report the outcome of the search
    print(f"Number of finished trials: {len(study.trials)}")
    print(f"Best score: {study.best_value}")
    print("Best trial parameters:")
    for name, value in best_params.items():
        print(f"\t{name}: {value}")

    # persist the merged parameters so the final-model cells can reload them
    best_params_path = f'{params_path}{cfg.modelname}.joblib'
    with open(best_params_path, "wb") as fh:
        joblib.dump(best_params, fh)

## Final model

### Classic boosting

In [10]:
# Read back the hyperparameters stored by the tuning step.
best_params_path = f'{params_path}{cfg.modelname}.joblib'
with open(best_params_path, 'rb') as fh:
    best_params = joblib.load(fh)

# Replace the stored learning rate with a fixed 0.02 for the final fit.
best_params['learning_rate'] = 0.02

# Header line followed by one tab-indented "name: value" line per parameter.
print(
    "Final parameters:",
    *(f"\t{name}: {value}" for name, value in best_params.items()),
    sep="\n",
)

Final parameters:
	objective: regression
	metric: rmse
	device: cpu
	verbosity: -1
	early_stopping_round: 15
	boosting_type: gbdt
	n_estimators: 2321
	learning_rate: 0.02
	reg_alpha: 2.871535933178378
	reg_lambda: 2.0197593488118737
	num_leaves: 199
	colsample_bytree: 0.43913190463886087
	subsample: 0.9546624815445495
	subsample_freq: 3
	min_child_samples: 17


In [12]:
# cv training and predict
def train_model(train, test, params):
    """Fit one LightGBM regressor per CV fold and predict the test set with each.

    Relies on the module-level `cfg` (model name, fold count, seed) and on the
    module-level `FEATURES` / `TARGET` column selectors.

    Parameters
    ----------
    train
        Frame indexed with `FEATURES` and `TARGET`; split into folds here.
    test
        Frame indexed with `FEATURES`; predicted once per fold.
    params : dict
        Keyword arguments forwarded unchanged to `lgb.LGBMRegressor`.

    Returns
    -------
    tuple
        `(cv_scores, preds)`: a list with one validation RMSE per fold and a
        list with one test-set prediction array per fold.
    """

    start_time = time.time()
    print(30*'*', f'Model: {cfg.modelname}', 30*'*', '\n')

    # get data
    x = train[FEATURES]
    y = train[TARGET]
    xtest = test[FEATURES]

    # cv loop (local splitter; built with the same settings on every call)
    cv = KFold(n_splits=cfg.nfolds, shuffle=True, random_state=cfg.seed)
    preds, cv_scores = [], []
    for fold, (train_idx, test_idx) in enumerate(cv.split(x, y)):

        print(f'fold {fold+1}/{cfg.nfolds}...')
        fold_start_time = time.time()

        # split data
        x_train, x_val = x.iloc[train_idx], x.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        # define and fit model
        # NOTE(review): two eval metrics are requested here; if `params` enables
        # early stopping it presumably reacts to either metric unless
        # `first_metric_only` is set - confirm this is intended.
        model = lgb.LGBMRegressor(**params)
        model.fit(
            x_train,
            y_train,
            eval_set = [(x_val, y_val)],
            callbacks = [lgb.log_evaluation(200)],
            eval_metric = ['root_mean_squared_error', 'mean_absolute_error'],
        )

        # validation score
        cv_score = np.sqrt(mean_squared_error(y_val, model.predict(x_val)))
        cv_scores.append(cv_score)

        # predict test data
        preds.append(model.predict(xtest))

        fold_run_time = time.time() - fold_start_time
        print(f'rmse: {cv_score:.4f}, run time: {fold_run_time:.2f}\n')

    # print results
    run_time = time.time() - start_time
    # '.2f' (two decimals) to match the per-fold timing; the previous '2f' was
    # a width-2 spec that printed six decimals.
    print(f'\nTraining completed. Total run time: {run_time:.2f}')
    print(f'CV score:\n\t mean: {np.mean(cv_scores):0.6f}\n\t std: {np.std(cv_scores):0.6f}')

    return (cv_scores, preds)

In [13]:
# Run the cross-validated fit with the final hyperparameters.
scores, preds = train_model(train=train, test=test, params=best_params)

****************************** Model: lgb ****************************** 

fold 1/5...
[200]	valid_0's rmse: 0.505216	valid_0's l1: 0.350576
[400]	valid_0's rmse: 0.497981	valid_0's l1: 0.341424
rmse: 0.4976, run time: 9.09

fold 2/5...
[200]	valid_0's rmse: 0.521025	valid_0's l1: 0.362441
[400]	valid_0's rmse: 0.513595	valid_0's l1: 0.352762
rmse: 0.5130, run time: 10.73

fold 3/5...
[200]	valid_0's rmse: 0.515297	valid_0's l1: 0.355663
[400]	valid_0's rmse: 0.505836	valid_0's l1: 0.344524
rmse: 0.5042, run time: 10.59

fold 4/5...
[200]	valid_0's rmse: 0.532548	valid_0's l1: 0.365608
[400]	valid_0's rmse: 0.52449	valid_0's l1: 0.355433
rmse: 0.5237, run time: 10.22

fold 5/5...
[200]	valid_0's rmse: 0.517855	valid_0's l1: 0.358534
[400]	valid_0's rmse: 0.511163	valid_0's l1: 0.349521
rmse: 0.5105, run time: 9.78


Training completed. Total run time: 50.435129
CV score:
	 mean: 0.509803
	 std: 0.008778


In [14]:
# Mean validation RMSE across the folds.
score = np.mean(scores)

# Element-wise average of the per-fold test predictions.
final_preds = np.array(preds).mean(axis=0)

In [15]:
# save predictions
# Fill the sample submission's target column with the fold-averaged predictions.
sub = pd.read_csv('../data/raw/sample_submission.csv', index_col=0)
sub[TARGET] = final_preds

# Write the submission file, creating the output folder if needed.
out_path = '../submissions/'
os.makedirs(out_path, exist_ok=True)
sub.to_csv(out_path + f'{cfg.modelname}.csv')

# Preview goes last: a notebook cell only renders its final expression, so the
# earlier mid-cell `sub.head()` was evaluated and silently discarded.
sub.head()

### DART boosting 

In [16]:
# # use dart boosting
# best_params['boosting_type'] = 'dart'
# best_params.pop('early_stopping_round')

# print("Final parameters:")
# for k, v in best_params.items():
#     print(f"\t{k}: {v}")

Final parameters:
	objective: regression
	metric: rmse
	device: cpu
	verbosity: -1
	boosting_type: dart
	n_estimators: 2321
	learning_rate: 0.02
	reg_alpha: 2.871535933178378
	reg_lambda: 2.0197593488118737
	num_leaves: 199
	colsample_bytree: 0.43913190463886087
	subsample: 0.9546624815445495
	subsample_freq: 3
	min_child_samples: 17


In [17]:
# # train model
# scores, preds = train_model(train, test, best_params)

****************************** Model: lgb ****************************** 

fold 1/5...
[200]	valid_0's rmse: 0.884257	valid_0's l1: 0.647018
[400]	valid_0's rmse: 0.612541	valid_0's l1: 0.417247
[600]	valid_0's rmse: 0.57538	valid_0's l1: 0.389923
[800]	valid_0's rmse: 0.549117	valid_0's l1: 0.371269
[1000]	valid_0's rmse: 0.529638	valid_0's l1: 0.357756
[1200]	valid_0's rmse: 0.528806	valid_0's l1: 0.357326
[1400]	valid_0's rmse: 0.513253	valid_0's l1: 0.347277
[1600]	valid_0's rmse: 0.509258	valid_0's l1: 0.345017
[1800]	valid_0's rmse: 0.509432	valid_0's l1: 0.344533
[2000]	valid_0's rmse: 0.507007	valid_0's l1: 0.343143
[2200]	valid_0's rmse: 0.503241	valid_0's l1: 0.341318
rmse: 0.5023, run time: 207.07

fold 2/5...
[200]	valid_0's rmse: 0.885426	valid_0's l1: 0.642422
[400]	valid_0's rmse: 0.620896	valid_0's l1: 0.420761
[600]	valid_0's rmse: 0.585209	valid_0's l1: 0.395003
[800]	valid_0's rmse: 0.56048	valid_0's l1: 0.37758
[1000]	valid_0's rmse: 0.542499	valid_0's l1: 0.365627


In [18]:
# # get cv score and final predictions
# score = np.mean(scores)
# final_preds = np.mean(np.array(preds), axis=0)

In [19]:
# # save predictions
# sub = pd.read_csv('../data/raw/sample_submission.csv', index_col=0)
# sub[TARGET] = final_preds
# sub.head()

# out_path = '../submissions/'
# os.makedirs(out_path, exist_ok=True)
# sub.to_csv(out_path + f'{cfg.modelname}_dart.csv')