# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
import optuna

rng = np.random.RandomState(42)

# Data

In [2]:
data_path = 'https://raw.githubusercontent.com/antbartash/max_temp/master/data/data_features_w_base.csv'
data = pd.read_csv(data_path)
data['DATE'] = data['DATE'].astype('datetime64[ns]')

X_train = data.loc[data['DATE'].dt.year <= 2021].drop(columns=['TARGET', 'DATE']).copy()
y_train = data.loc[data['DATE'].dt.year <= 2021, 'TARGET'].copy()
X_valid = data.loc[data['DATE'].dt.year == 2022].drop(columns=['TARGET', 'DATE']).copy()
y_valid = data.loc[data['DATE'].dt.year == 2022, 'TARGET'].copy()
X_test = data.loc[data['DATE'].dt.year == 2023].drop(columns=['TARGET', 'DATE']).copy()
y_test = data.loc[data['DATE'].dt.year == 2023, 'TARGET'].copy()

print(f'Train: {X_train.shape}, {y_train.shape}')
print(f'Valid: {X_valid.shape}, {y_valid.shape}')
print(f'Test: {X_test.shape}, {y_test.shape}')

Train: (34938, 66), (34938,)
Valid: (2920, 66), (2920,)
Test: (2920, 66), (2920,)


# Baseline

In [3]:
model = AdaBoostRegressor(
    DecisionTreeRegressor(random_state=rng),
    random_state=rng
)
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

Train MAE: 0.022728833934398108
Valid MAE: 3.2116438356164383


In [4]:
def load_data():
    data_path = 'https://raw.githubusercontent.com/antbartash/max_temp/master/data/data_features_w_base.csv'
    data = pd.read_csv(data_path)
    data['DATE'] = data['DATE'].astype('datetime64[ns]')
    X_train = data.loc[data['DATE'].dt.year <= 2021].drop(columns=['TARGET', 'DATE']).copy()
    y_train = data.loc[data['DATE'].dt.year <= 2021, 'TARGET'].copy()
    X_valid = data.loc[data['DATE'].dt.year == 2022].drop(columns=['TARGET', 'DATE']).copy()
    y_valid = data.loc[data['DATE'].dt.year == 2022, 'TARGET'].copy()
    return X_train, X_valid, y_train, y_valid

def objective(trial):
    PARAMS = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 250),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1),
        'estimator__max_depth': trial.suggest_int('estimator__max_depth', 1, 8),
        'estimator__min_samples_leaf': trial.suggest_int('estimator__min_samples_leaf', 1, 250),
        'estimator__min_impurity_decrease': trial.suggest_float('estimator__min_impurity_decrease', 0.0, 0.1),
        'estimator__ccp_alpha': trial.suggest_float('estimator__ccp_alpha', 0.0, 0.1)
    }
    X_train, _, y_train, _ = load_data()
    model = AdaBoostRegressor(
        DecisionTreeRegressor(random_state=rng),
        random_state=rng
    )
    model.set_params(**PARAMS)
    score = np.mean(cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_absolute_error'))
    return score

In [5]:
sampler = optuna.samplers.TPESampler(
    n_startup_trials = 50, # the random sampling is used instead of the TPE algorithm until the given number of trials finish in the same study
    n_ei_candidates = 24, # number of candidate samples used to calculate the expected improvement.
    multivariate = True, # multivariate TPE when suggesting candidates; default: False
    seed = 42
)
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=100, interval_steps=20),
    direction='maximize',
    sampler=sampler,
    storage='sqlite:///db.sqlite3'
)
study.optimize(
    objective, n_trials = 250,
    timeout = 3600*8, # in seconds
    n_jobs = -1,
    show_progress_bar = True
)

[I 2024-12-24 05:19:48,465] A new study created in RDB with name: no-name-03ee7aa4-54c3-484a-9808-9bc1d0f9d214


  0%|          | 0/250 [00:00<?, ?it/s]

[I 2024-12-24 05:20:21,271] Trial 3 finished with value: -5.391678075003278 and parameters: {'n_estimators': 173, 'learning_rate': 0.9431102074234425, 'estimator__max_depth': 1, 'estimator__min_samples_leaf': 99, 'estimator__min_impurity_decrease': 0.08536701720465606, 'estimator__ccp_alpha': 0.027271129135400165}. Best is trial 3 with value: -5.391678075003278.
[I 2024-12-24 05:21:13,556] Trial 4 finished with value: -3.2753524961167373 and parameters: {'n_estimators': 46, 'learning_rate': 0.3850308880719273, 'estimator__max_depth': 5, 'estimator__min_samples_leaf': 143, 'estimator__min_impurity_decrease': 0.08304071349791145, 'estimator__ccp_alpha': 0.05538452174898876}. Best is trial 4 with value: -3.2753524961167373.
[I 2024-12-24 05:21:31,524] Trial 1 finished with value: -3.184314784438215 and parameters: {'n_estimators': 74, 'learning_rate': 0.21149325582481818, 'estimator__max_depth': 7, 'estimator__min_samples_leaf': 106, 'estimator__min_impurity_decrease': 0.01163682214907357

In [6]:
print(f"Best score: {study.best_trial.value}")
print(f"Best params: {study.best_trial.params}")

Best score: -2.9983176258428883
Best params: {'n_estimators': 43, 'learning_rate': 0.0022052901048161405, 'estimator__max_depth': 8, 'estimator__min_samples_leaf': 41, 'estimator__min_impurity_decrease': 0.0031375680522158084, 'estimator__ccp_alpha': 0.003662013821580379}


In [7]:
model = AdaBoostRegressor(
    DecisionTreeRegressor(
        max_depth=8, min_samples_leaf=41,
        min_impurity_decrease=0.003, ccp_alpha=0.004,
        random_state=rng
    ),
    n_estimators=43, learning_rate=0.0022,
    random_state=rng
)
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

Train MAE: 2.8406012199125894
Valid MAE: 3.165721397583957


In [10]:
model = AdaBoostRegressor(
    DecisionTreeRegressor(
        max_depth=8, min_samples_leaf=40,
        #min_impurity_decrease=0.003, ccp_alpha=0.006,
        random_state=rng
    ),
    n_estimators=40, learning_rate=0.002,
    random_state=rng
)
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

Train MAE: 2.8330639295365088
Valid MAE: 3.1648363667866177
