# Imports

In [3]:
import numpy as np
import pandas as pd
import matplotlib
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
import optuna

# Seeded legacy NumPy generator; the fixed seed (42) makes any draws from it repeatable.
rng = np.random.RandomState(seed=42)

# Data

In [4]:
# Remote CSV holding a DATE column, the TARGET column and the feature columns.
data_path = 'https://raw.githubusercontent.com/antbartash/max_temp/master/data/data_features_w_base.csv'
data = pd.read_csv(data_path)
data['DATE'] = data['DATE'].astype('datetime64[ns]')

# Split by calendar year: <= 2021 -> train, 2022 -> validation, 2023 -> test.
year = data['DATE'].dt.year
is_train = year <= 2021
is_valid = year == 2022
is_test = year == 2023

# Everything except the label and the date is used as a model input.
features = data.drop(columns=['TARGET', 'DATE'])
target = data['TARGET']

X_train, y_train = features.loc[is_train].copy(), target.loc[is_train].copy()
X_valid, y_valid = features.loc[is_valid].copy(), target.loc[is_valid].copy()
X_test, y_test = features.loc[is_test].copy(), target.loc[is_test].copy()

# Sanity check: row counts of X and y must agree within each split.
print(f'Train: {X_train.shape}, {y_train.shape}')
print(f'Valid: {X_valid.shape}, {y_valid.shape}')
print(f'Test: {X_test.shape}, {y_test.shape}')

Train: (34938, 66), (34938,)
Valid: (2920, 66), (2920,)
Test: (2920, 66), (2920,)


# Baseline

In [5]:
# Baseline: CatBoost with library-default hyperparameters.
# verbose=100 logs the training loss every 100 iterations.
baseline_params = {'verbose': 100, 'random_state': 42}
model = CatBoostRegressor(**baseline_params)
model.fit(X_train, y_train)

# Compare in-sample error with error on the held-out 2022 split.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
valid_mae = mean_absolute_error(y_valid, model.predict(X_valid))
print(f'Train MAE: {train_mae}')
print(f'Valid MAE: {valid_mae}')

Learning rate set to 0.071783
0:	learn: 10.1134719	total: 67.9ms	remaining: 1m 7s
100:	learn: 3.8919697	total: 934ms	remaining: 8.31s
200:	learn: 3.8086674	total: 1.81s	remaining: 7.19s
300:	learn: 3.7244208	total: 2.64s	remaining: 6.12s
400:	learn: 3.6564320	total: 3.46s	remaining: 5.16s
500:	learn: 3.5954735	total: 4.27s	remaining: 4.26s
600:	learn: 3.5362912	total: 5.11s	remaining: 3.39s
700:	learn: 3.4837109	total: 5.94s	remaining: 2.53s
800:	learn: 3.4284069	total: 6.78s	remaining: 1.68s
900:	learn: 3.3784742	total: 7.61s	remaining: 836ms
999:	learn: 3.3320836	total: 8.44s	remaining: 0us
Train MAE: 2.5278278310914963
Valid MAE: 3.1472270902247033


# Hyperparameter tuning

In [6]:
# Parsed source frames keyed by path, so that repeated calls (for example one
# per Optuna trial) do not download and parse the same CSV over and over.
_LOAD_DATA_CACHE = {}


def load_data(
    data_path='https://raw.githubusercontent.com/antbartash/max_temp/master/data/data_features_w_base.csv',
    use_cache=True,
):
    """Load the dataset and return the training and validation splits.

    The CSV must contain a ``DATE`` column (parseable as datetime) and a
    ``TARGET`` column; every other column is treated as a feature.

    Parameters
    ----------
    data_path : str or path-like, optional
        Location of the CSV (URL or local path). Must be hashable because it
        is used as the cache key. Defaults to the project's hosted file.
    use_cache : bool, optional
        When True (default) the parsed CSV is kept in memory after the first
        read and reused by later calls with the same ``data_path``. Pass
        False to force a fresh read without touching the cache.

    Returns
    -------
    X_train, X_valid, y_train, y_valid
        Features/target for rows dated 2021 or earlier (train) and for rows
        dated 2022 (valid). Each object is an independent copy, so callers
        may modify it without affecting the cache or other callers.
    """
    if use_cache and data_path in _LOAD_DATA_CACHE:
        data = _LOAD_DATA_CACHE[data_path]
    else:
        data = pd.read_csv(data_path)
        data['DATE'] = data['DATE'].astype('datetime64[ns]')
        if use_cache:
            _LOAD_DATA_CACHE[data_path] = data

    # Compute the year and the feature/target views once instead of per split.
    year = data['DATE'].dt.year
    features = data.drop(columns=['TARGET', 'DATE'])
    target = data['TARGET']

    train_mask = year <= 2021
    valid_mask = year == 2022

    # .copy() hands out fresh objects so the cached frame is never mutated.
    X_train = features.loc[train_mask].copy()
    y_train = target.loc[train_mask].copy()
    X_valid = features.loc[valid_mask].copy()
    y_valid = target.loc[valid_mask].copy()
    return X_train, X_valid, y_train, y_valid

# Training split shared by all trials; populated on the first objective() call
# so the dataset is fetched once per kernel session instead of once per trial.
_OBJECTIVE_TRAIN_CACHE = {}


def _objective_train_data():
    """Return ``(X_train, y_train)``, calling ``load_data()`` only on first use."""
    if 'train' not in _OBJECTIVE_TRAIN_CACHE:
        X_train, _, y_train, _ = load_data()
        _OBJECTIVE_TRAIN_CACHE['train'] = (X_train, y_train)
    return _OBJECTIVE_TRAIN_CACHE['train']


def objective(trial):
    """Optuna objective: mean 3-fold CV score of a CatBoost regressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters below.

    Returns
    -------
    float
        Mean of the ``neg_mean_absolute_error`` scores over the folds
        (negated MAE, so larger is better and the study should maximize it).
    """
    # Search space; the suggest_* calls are made in this order for every trial.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-6, 0.5),
        'depth': trial.suggest_int('depth', 1, 12),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.0, 200.0),
        'random_strength': trial.suggest_float('random_strength', 0.0, 200.0),  # CPU only
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 200.0),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise']),
    }

    # The data does not depend on the trial, so reuse the cached training split.
    X_train, y_train = _objective_train_data()

    model = CatBoostRegressor(verbose=0, random_state=42, **params)

    # NOTE(review): an integer cv means unshuffled K-fold for a regressor, which
    # ignores the year-based ordering used for the train/valid/test split.
    # Confirm this is intended.
    fold_scores = cross_val_score(
        model, X_train, y_train, cv=3, scoring='neg_mean_absolute_error'
    )
    return np.mean(fold_scores)

In [7]:
# Tuning budget and the identity of the persisted study.
N_TRIALS = 1200
TIMEOUT_SECONDS = 3600 * 10  # wall-clock limit for one optimize() call
STUDY_NAME = 'catboost_max_temp_tuning'

sampler = optuna.samplers.TPESampler(
    n_startup_trials=50,  # random sampling is used instead of TPE until this many trials finish in the study
    n_ei_candidates=24,   # number of candidate samples used to calculate the expected improvement
    multivariate=True,    # multivariate TPE when suggesting candidates; default: False
    seed=42,
)

# A fixed study name plus load_if_exists=True lets an interrupted run be resumed
# from db.sqlite3 instead of starting a new, auto-named study on every execution.
# NOTE: the pruner only acts on trials that report intermediate values
# (trial.report / trial.should_prune); the objective here reports none, so no
# trial is actually pruned.
study = optuna.create_study(
    study_name=STUDY_NAME,
    load_if_exists=True,
    pruner=optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=100, interval_steps=20),
    direction='maximize',
    sampler=sampler,
    storage='sqlite:///db.sqlite3',
)

# Only run what is still missing from the budget when resuming a stored study.
completed_trials = study.get_trials(
    deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,)
)
remaining_trials = max(N_TRIALS - len(completed_trials), 0)

if remaining_trials > 0:
    study.optimize(
        objective,
        n_trials=remaining_trials,
        timeout=TIMEOUT_SECONDS,  # in seconds
        n_jobs=1,
        show_progress_bar=True,
    )

[I 2024-12-23 20:53:38,777] A new study created in RDB with name: no-name-2a7fd3fc-0bd5-41f6-ab2a-3833c9affba3


  0%|          | 0/1200 [00:00<?, ?it/s]

[I 2024-12-23 20:55:00,828] Trial 0 finished with value: -3.2139966298610645 and parameters: {'n_estimators': 406, 'learning_rate': 0.47535720249065166, 'depth': 9, 'l2_leaf_reg': 119.73169683940732, 'random_strength': 31.203728088487303, 'bagging_temperature': 31.19890406724053, 'grow_policy': 'Depthwise'}. Best is trial 0 with value: -3.2139966298610645.
[I 2024-12-23 20:55:08,703] Trial 1 finished with value: -3.0706640234616525 and parameters: {'n_estimators': 621, 'learning_rate': 0.3540365808254449, 'depth': 1, 'l2_leaf_reg': 193.98197043239887, 'random_strength': 166.48852816008434, 'bagging_temperature': 42.46782213565523, 'grow_policy': 'Depthwise'}. Best is trial 1 with value: -3.0706640234616525.
[I 2024-12-23 20:55:28,397] Trial 2 finished with value: -3.0938653610159528 and parameters: {'n_estimators': 339, 'learning_rate': 0.26237869105968725, 'depth': 6, 'l2_leaf_reg': 58.245828039608384, 'random_strength': 122.3705789444759, 'bagging_temperature': 27.898772130408368, 'g

In [8]:
# Report the best trial found by the study: its objective value and parameters.
best_trial = study.best_trial
print(f"Best score: {best_trial.value}")
print(f"Best params: {best_trial.params}")

Best score: -2.977548957236266
Best params: {'n_estimators': 908, 'learning_rate': 0.03644442333410188, 'depth': 7, 'l2_leaf_reg': 6.509331968547631, 'random_strength': 0.12253707936635516, 'bagging_temperature': 97.7056969573513, 'grow_policy': 'SymmetricTree'}


In [9]:
# Refit on the training split with hand-entered hyperparameters; the values are
# rounded versions of the best-trial parameters printed by the previous cell.
tuned_params = {
    'n_estimators': 908,
    'learning_rate': 0.036,
    'depth': 7,
    'l2_leaf_reg': 6.51,
    'random_strength': 0.1225,
    'bagging_temperature': 97.71,
    'grow_policy': 'SymmetricTree',
    'verbose': 100,       # log the training loss every 100 iterations
    'random_state': 42,
}
model = CatBoostRegressor(**tuned_params)
model.fit(X_train, y_train)

# In-sample error versus error on the 2022 validation split.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
valid_mae = mean_absolute_error(y_valid, model.predict(X_valid))
print(f'Train MAE: {train_mae}')
print(f'Valid MAE: {valid_mae}')

0:	learn: 10.4131919	total: 21.2ms	remaining: 19.2s
100:	learn: 3.9236751	total: 1.32s	remaining: 10.5s
200:	learn: 3.8405092	total: 2.54s	remaining: 8.93s
300:	learn: 3.7918985	total: 3.72s	remaining: 7.5s
400:	learn: 3.7510576	total: 4.92s	remaining: 6.22s
500:	learn: 3.7092640	total: 6.13s	remaining: 4.98s
600:	learn: 3.6764926	total: 7.32s	remaining: 3.74s
700:	learn: 3.6405842	total: 8.58s	remaining: 2.53s
800:	learn: 3.6040808	total: 9.81s	remaining: 1.31s
900:	learn: 3.5649589	total: 11s	remaining: 85.8ms
907:	learn: 3.5627254	total: 11.1s	remaining: 0us
Train MAE: 2.6843993606427494
Valid MAE: 3.128768128788785


In [11]:
# Variant of the tuned model with coarser, rounded settings. Unlike the
# previous cell, learning_rate is not passed here.
rounded_params = {
    'n_estimators': 900,
    'depth': 7,
    'l2_leaf_reg': 6.5,
    'random_strength': 0.1225,
    'bagging_temperature': 100,
    'grow_policy': 'SymmetricTree',
    'verbose': 100,       # log the training loss every 100 iterations
    'random_state': 42,
}
model = CatBoostRegressor(**rounded_params)
model.fit(X_train, y_train)

# In-sample error versus error on the 2022 validation split.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
valid_mae = mean_absolute_error(y_valid, model.predict(X_valid))
print(f'Train MAE: {train_mae}')
print(f'Valid MAE: {valid_mae}')

0:	learn: 10.4657631	total: 20ms	remaining: 18s
100:	learn: 3.9695458	total: 1.3s	remaining: 10.3s
200:	learn: 3.8537351	total: 2.52s	remaining: 8.77s
300:	learn: 3.8177686	total: 3.68s	remaining: 7.32s
400:	learn: 3.7792069	total: 4.93s	remaining: 6.14s
500:	learn: 3.7418762	total: 6.12s	remaining: 4.87s
600:	learn: 3.7050616	total: 7.33s	remaining: 3.65s
700:	learn: 3.6748319	total: 8.51s	remaining: 2.42s
800:	learn: 3.6461959	total: 9.74s	remaining: 1.2s
899:	learn: 3.6190808	total: 10.9s	remaining: 0us
Train MAE: 2.7224466551932154
Valid MAE: 3.1221596410889467
