# Imports

In [2]:
import numpy as np
import pandas as pd
import matplotlib
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
import optuna
import cupy as cp

rng = np.random.RandomState(42)

# Data

In [3]:
data_path = 'https://raw.githubusercontent.com/antbartash/max_temp/master/data/data_features_w_base.csv'
data = pd.read_csv(data_path)
data['DATE'] = data['DATE'].astype('datetime64[ns]')

X_train = data.loc[data['DATE'].dt.year <= 2021].drop(columns=['TARGET', 'DATE']).copy()
y_train = data.loc[data['DATE'].dt.year <= 2021, 'TARGET'].copy()
X_valid = data.loc[data['DATE'].dt.year == 2022].drop(columns=['TARGET', 'DATE']).copy()
y_valid = data.loc[data['DATE'].dt.year == 2022, 'TARGET'].copy()
X_test = data.loc[data['DATE'].dt.year == 2023].drop(columns=['TARGET', 'DATE']).copy()
y_test = data.loc[data['DATE'].dt.year == 2023, 'TARGET'].copy()

X_train = cp.array(X_train)
y_train = cp.array(y_train)
X_valid = cp.array(X_valid)
y_valid = cp.array(y_valid)
X_test = cp.array(X_test)
y_test = cp.array(y_test)

print(f'Train: {X_train.shape}, {y_train.shape}')
print(f'Valid: {X_valid.shape}, {y_valid.shape}')
print(f'Test: {X_test.shape}, {y_test.shape}')

Train: (34938, 66), (34938,)
Valid: (2920, 66), (2920,)
Test: (2920, 66), (2920,)


# Baseline

In [3]:
model = XGBRegressor(n_jobs=-1, random_state=rng, device='cuda')
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train.get(), model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid.get(), model.predict(X_valid))}')

Train MAE: 2.2126423069823278
Valid MAE: 3.2406729671399885


# Hyperparameter tuning

In [4]:
def load_data():
    data_path = 'https://raw.githubusercontent.com/antbartash/max_temp/master/data/data_features_w_base.csv'
    data = pd.read_csv(data_path)
    data['DATE'] = data['DATE'].astype('datetime64[ns]')
    X_train = data.loc[data['DATE'].dt.year <= 2021].drop(columns=['TARGET', 'DATE']).copy()
    y_train = data.loc[data['DATE'].dt.year <= 2021, 'TARGET'].copy()
    X_valid = data.loc[data['DATE'].dt.year == 2022].drop(columns=['TARGET', 'DATE']).copy()
    y_valid = data.loc[data['DATE'].dt.year == 2022, 'TARGET'].copy()
    X_train, y_train = cp.array(X_train), cp.array(y_train)
    X_valid, y_valid = cp.array(X_valid), cp.array(y_valid)
    return X_train, X_valid, y_train, y_valid

def objective(trial):
    PARAMS = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-6, 0.5),
        'max_depth': trial.suggest_int('max_depth', 1, 12),
        'gamma': trial.suggest_float('gamma', 0.0, 3.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 3.0),
        'subsample': trial.suggest_float('subsample', 0.3, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0)
        
    }
    X_train, _, y_train, _ = load_data()
    model = XGBRegressor(n_jobs=-1, device='cuda', random_state=rng, verbosity=0)
    model.set_params(**PARAMS)
    score = np.mean(cross_val_score(model, X_train.get(), y_train.get(), cv=3, scoring='neg_mean_absolute_error'))
    return score

In [5]:
sampler = optuna.samplers.TPESampler(
    n_startup_trials = 50, # the random sampling is used instead of the TPE algorithm until the given number of trials finish in the same study
    n_ei_candidates = 24, # number of candidate samples used to calculate the expected improvement.
    multivariate = True, # multivariate TPE when suggesting candidates; default: False
    seed = 42
)
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=100, interval_steps=20),
    direction='maximize',
    sampler=sampler,
    storage='sqlite:///db.sqlite3'
)
study.optimize(
    objective, n_trials = 1000,
    timeout = 3600*10, # in seconds
    n_jobs = 1,
    show_progress_bar = True
)

[I 2024-12-24 05:17:33,241] A new study created in RDB with name: no-name-b27e61cd-a2bd-4093-9472-bc1bd56ae7d4


  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-12-24 05:17:39,113] Trial 0 finished with value: -4.49735025349326 and parameters: {'n_estimators': 406, 'learning_rate': 0.47535720249065166, 'max_depth': 9, 'gamma': 1.7959754525911098, 'reg_lambda': 0.46805592132730955, 'subsample': 0.40919616423534183, 'colsample_bytree': 0.3406585285177396}. Best is trial 0 with value: -4.49735025349326.
[I 2024-12-24 05:17:47,433] Trial 1 finished with value: -3.320701138012 and parameters: {'n_estimators': 873, 'learning_rate': 0.30055790475659266, 'max_depth': 9, 'gamma': 0.06175348288740734, 'reg_lambda': 2.909729556485983, 'subsample': 0.8827098485602951, 'colsample_bytree': 0.44863737747479326}. Best is trial 1 with value: -3.320701138012.
[I 2024-12-24 05:17:49,266] Trial 2 finished with value: -3.001607608014495 and parameters: {'n_estimators': 222, 'learning_rate': 0.09170307152220707, 'max_depth': 4, 'gamma': 1.5742692948967134, 'reg_lambda': 1.2958350559263474, 'subsample': 0.5038603981386294, 'colsample_bytree': 0.7282970263056

In [6]:
print(f"Best score: {study.best_trial.value}")
print(f"Best params: {study.best_trial.params}")

Best score: -2.9740956402485526
Best params: {'n_estimators': 917, 'learning_rate': 0.010134959078493629, 'max_depth': 4, 'gamma': 0.9747186791117592, 'reg_lambda': 1.5466081351741223, 'subsample': 0.5087559284353277, 'colsample_bytree': 0.6205628486587708}


In [4]:
model = XGBRegressor(
    n_estimators=917, learning_rate=0.01,
    max_depth=4, gamma=0.975, reg_lambda=1.55,
    subsample=0.509, colsample_bytree=0.621,
    device='cuda', n_jobs=-1, random_state=rng
)
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train.get(), model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid.get(), model.predict(X_valid))}')

Train MAE: 2.8267011426925333
Valid MAE: 3.143140558739231


In [6]:
model = XGBRegressor(
    n_estimators=900, learning_rate=0.01,
    max_depth=4, gamma=1, reg_lambda=1.5,
    subsample=0.5, colsample_bytree=0.6,
    device='cuda', n_jobs=-1, random_state=rng
)
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train.get(), model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid.get(), model.predict(X_valid))}')

Train MAE: 2.8277278438185753
Valid MAE: 3.1396710105791485
