# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
import optuna

rng = np.random.RandomState(42)

# Data

In [2]:
data_path = 'https://raw.githubusercontent.com/antbartash/max_temp/master/data/data_features_w_base.csv'
data = pd.read_csv(data_path)
data['DATE'] = data['DATE'].astype('datetime64[ns]')

print(data.shape)
data.head()

(40778, 68)


Unnamed: 0,DATE,TARGET,TMAX_d1,TMAX_d1_d2_diff,TMAX_d2,TMAX_d3,TMAX_d4,TMAX_d5,TMAX_d6,TMAX_d7,...,MONTH_3,MONTH_4,MONTH_5,MONTH_6,MONTH_7,MONTH_8,MONTH_9,MONTH_10,MONTH_11,MONTH_12
0,2010-01-16,15.6,15.0,-1.1,16.1,10.0,14.4,9.4,4.4,2.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2010-01-17,14.4,15.6,0.6,15.0,16.1,10.0,14.4,9.4,4.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2010-01-18,17.2,14.4,-1.2,15.6,15.0,16.1,10.0,14.4,9.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2010-01-19,21.1,17.2,2.8,14.4,15.6,15.0,16.1,10.0,14.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2010-01-20,23.9,21.1,3.9,17.2,14.4,15.6,15.0,16.1,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
X_train = data.loc[data['DATE'].dt.year <= 2021].drop(columns=['TARGET', 'DATE']).copy()
y_train = data.loc[data['DATE'].dt.year <= 2021, 'TARGET'].copy()
X_valid = data.loc[data['DATE'].dt.year == 2022].drop(columns=['TARGET', 'DATE']).copy()
y_valid = data.loc[data['DATE'].dt.year == 2022, 'TARGET'].copy()
X_test = data.loc[data['DATE'].dt.year == 2023].drop(columns=['TARGET', 'DATE']).copy()
y_test = data.loc[data['DATE'].dt.year == 2023, 'TARGET'].copy()

print(f'Train: {X_train.shape}, {y_train.shape}')
print(f'Valid: {X_valid.shape}, {y_valid.shape}')
print(f'Test: {X_test.shape}, {y_test.shape}')

Train: (34938, 66), (34938,)
Valid: (2920, 66), (2920,)
Test: (2920, 66), (2920,)


# Baseline

In [4]:
model = LGBMRegressor(n_jobs=-1, random_state=rng)
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013042 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6997
[LightGBM] [Info] Number of data points in the train set: 34938, number of used features: 66
[LightGBM] [Info] Start training from score 20.304783
Train MAE: 2.6757019818083463
Valid MAE: 3.14513157059841


In [5]:
def load_data():
    data_path = 'https://raw.githubusercontent.com/antbartash/max_temp/master/data/data_features_w_base.csv'
    data = pd.read_csv(data_path)
    data['DATE'] = data['DATE'].astype('datetime64[ns]')
    X_train = data.loc[data['DATE'].dt.year <= 2021].drop(columns=['TARGET', 'DATE']).copy()
    y_train = data.loc[data['DATE'].dt.year <= 2021, 'TARGET'].copy()
    X_valid = data.loc[data['DATE'].dt.year == 2022].drop(columns=['TARGET', 'DATE']).copy()
    y_valid = data.loc[data['DATE'].dt.year == 2022, 'TARGET'].copy()
    return X_train, X_valid, y_train, y_valid

def objective(trial):
    PARAMS = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-6, 0.5),
        'max_depth': trial.suggest_int('depth', 1, 12),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 100.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 100.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0)
    }
    X_train, _, y_train, _ = load_data()
    model = LGBMRegressor(n_jobs=-1, random_state=rng, verbose=-1)
    model.set_params(**PARAMS)
    score = np.mean(cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_absolute_error'))
    return score

In [6]:
sampler = optuna.samplers.TPESampler(
    n_startup_trials = 50, # the random sampling is used instead of the TPE algorithm until the given number of trials finish in the same study
    n_ei_candidates = 24, # number of candidate samples used to calculate the expected improvement.
    multivariate = True, # multivariate TPE when suggesting candidates; default: False
    seed = 42
)
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=100, interval_steps=20),
    direction='maximize',
    sampler=sampler,
    storage='sqlite:///db.sqlite3'
)
study.optimize(
    objective, n_trials = 1000,
    timeout = 3600*8, # in seconds
    n_jobs = 1,
    show_progress_bar = True
)

[I 2024-12-24 05:19:03,020] A new study created in RDB with name: no-name-ca15554e-182d-476b-9c14-e228e2064239


  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-12-24 05:19:10,163] Trial 0 finished with value: -3.289635376285171 and parameters: {'n_estimators': 406, 'learning_rate': 0.47535720249065166, 'depth': 9, 'reg_alpha': 59.86584841970366, 'reg_lambda': 15.601864044243651, 'colsample_bytree': 0.40919616423534183}. Best is trial 0 with value: -3.289635376285171.
[I 2024-12-24 05:19:12,903] Trial 1 finished with value: -3.1370540963829883 and parameters: {'n_estimators': 105, 'learning_rate': 0.4330882067113218, 'depth': 8, 'reg_alpha': 70.80725777960456, 'reg_lambda': 2.0584494295802447, 'colsample_bytree': 0.978936896513396}. Best is trial 1 with value: -3.1370540963829883.
[I 2024-12-24 05:19:17,699] Trial 2 finished with value: -3.0537451568111464 and parameters: {'n_estimators': 841, 'learning_rate': 0.10617034300002741, 'depth': 3, 'reg_alpha': 18.34045098534338, 'reg_lambda': 30.42422429595377, 'colsample_bytree': 0.6673295021425665}. Best is trial 2 with value: -3.0537451568111464.
[I 2024-12-24 05:19:22,922] Trial 3 finis

In [7]:
print(f"Best score: {study.best_trial.value}")
print(f"Best params: {study.best_trial.params}")

Best score: -2.986978768553044
Best params: {'n_estimators': 705, 'learning_rate': 0.01480867700703094, 'depth': 11, 'reg_alpha': 1.0220004489622183, 'reg_lambda': 20.196689916945562, 'colsample_bytree': 0.4223317007343945}


In [8]:
model = LGBMRegressor(
    n_estimators=705, learning_rate=0.0148, depth=11,
    reg_alpha=1.022, reg_lambda=20.2,
    colsample_bytree=0.4223,
    n_jobs=-1, random_state=rng
)
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

Train MAE: 2.705238103568836
Valid MAE: 3.124096302901307


In [9]:
model = LGBMRegressor(
    n_estimators=700, learning_rate=0.015, depth=11,
    reg_alpha=0.7, reg_lambda=0.6,
    colsample_bytree=0.4,
    n_jobs=-1, random_state=rng
)
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

Train MAE: 2.679292788054296
Valid MAE: 3.1260030826745604
