In [2]:
import numpy as np
import pandas as pd
import matplotlib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
import optuna

rng = np.random.RandomState(42)

In [3]:
def load_data():
    data_path = 'https://raw.githubusercontent.com/antbartash/max_temp/master/data/data_features_w_base.csv'
    data = pd.read_csv(data_path)
    data['DATE'] = data['DATE'].astype('datetime64[ns]')
    X_train = data.loc[data['DATE'].dt.year <= 2021].drop(columns=['TARGET', 'DATE']).copy()
    y_train = data.loc[data['DATE'].dt.year <= 2021, 'TARGET'].copy()
    X_valid = data.loc[data['DATE'].dt.year == 2022].drop(columns=['TARGET', 'DATE']).copy()
    y_valid = data.loc[data['DATE'].dt.year == 2022, 'TARGET'].copy()
    return X_train, X_valid, y_train, y_valid

In [3]:
def objective(trial):
    PARAMS = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 12),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 500),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0),
        'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 0.0, 1.0),
        'ccp_alpha': trial.suggest_float('ccp_alpha', 0.0, 0.1),
    }
    X_train, X_valid, y_train, y_valid = load_data()
    model = RandomForestRegressor(n_jobs=-1, random_state=rng)
    model.set_params(**PARAMS)
    model.fit(X_train, y_train)
    score = np.mean(cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_absolute_error'))
    return score

In [4]:
sampler = optuna.samplers.TPESampler(
    n_startup_trials = 50, # the random sampling is used instead of the TPE algorithm until the given number of trials finish in the same study
    n_ei_candidates = 24, # number of candidate samples used to calculate the expected improvement.
    multivariate = True, # multivariate TPE when suggesting candidates; default: False
    seed = 42
)
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=100, interval_steps=20),
    direction='maximize',
    sampler=sampler,
    storage='sqlite:///db.sqlite3'
)
study.optimize(
    objective, n_trials = 1000,
    timeout = 3600*10, # in seconds
    n_jobs = 1,
    show_progress_bar = True
)

[I 2024-12-23 20:24:31,868] A new study created in RDB with name: no-name-9ddf3964-c5b5-4e39-b810-ce78dbab46a3


  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-12-23 20:25:57,119] Trial 0 finished with value: -3.147394237446227 and parameters: {'n_estimators': 406, 'max_depth': 12, 'min_samples_leaf': 366, 'max_features': 0.6387926357773329, 'min_impurity_decrease': 0.15601864044243652, 'ccp_alpha': 0.015599452033620266}. Best is trial 0 with value: -3.147394237446227.
[I 2024-12-23 20:26:25,580] Trial 1 finished with value: -3.1209715441609696 and parameters: {'n_estimators': 105, 'max_depth': 11, 'min_samples_leaf': 301, 'max_features': 0.737265320016441, 'min_impurity_decrease': 0.020584494295802447, 'ccp_alpha': 0.09699098521619944}. Best is trial 1 with value: -3.1209715441609696.
[I 2024-12-23 20:27:14,613] Trial 2 finished with value: -3.405378463603322 and parameters: {'n_estimators': 841, 'max_depth': 3, 'min_samples_leaf': 91, 'max_features': 0.2650640588680905, 'min_impurity_decrease': 0.3042422429595377, 'ccp_alpha': 0.052475643163223784}. Best is trial 1 with value: -3.1209715441609696.
[I 2024-12-23 20:27:44,093] Trial 3

In [5]:
print(f"Best score: {study.best_trial.value}")
print(f"Best params: {study.best_trial.params}")

Best score: -2.9918265312044894
Best params: {'n_estimators': 546, 'max_depth': 12, 'min_samples_leaf': 4, 'max_features': 0.7944023948624739, 'min_impurity_decrease': 0.005059491029629109, 'ccp_alpha': 0.0007450818258318767}


In [4]:
data_path = 'https://raw.githubusercontent.com/antbartash/max_temp/master/data/data_features_w_base.csv'
data = pd.read_csv(data_path)
data['DATE'] = data['DATE'].astype('datetime64[ns]')
X_train = data.loc[data['DATE'].dt.year <= 2021].drop(columns=['TARGET', 'DATE']).copy()
y_train = data.loc[data['DATE'].dt.year <= 2021, 'TARGET'].copy()
X_valid = data.loc[data['DATE'].dt.year == 2022].drop(columns=['TARGET', 'DATE']).copy()
y_valid = data.loc[data['DATE'].dt.year == 2022, 'TARGET'].copy()
X_test = data.loc[data['DATE'].dt.year == 2023].drop(columns=['TARGET', 'DATE']).copy()
y_test = data.loc[data['DATE'].dt.year == 2023, 'TARGET'].copy()
print(f'Train: {X_train.shape}, {y_train.shape}')
print(f'Valid: {X_valid.shape}, {y_valid.shape}')
print(f'Test: {X_test.shape}, {y_test.shape}')

model = RandomForestRegressor(
    n_estimators=546, max_depth=12,
    min_samples_leaf=4, max_features=0.794, 
    min_impurity_decrease=0.005, ccp_alpha=0.0008,
    n_jobs=-1, random_state=rng
)
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

Train: (34938, 66), (34938,)
Valid: (2920, 66), (2920,)
Test: (2920, 66), (2920,)
Train MAE: 2.5433313034555876
Valid MAE: 3.156095080814888


In [5]:
model = RandomForestRegressor(
    n_estimators=550, max_depth=12,
    min_samples_leaf=4, max_features=0.8, 
    min_impurity_decrease=0.005, ccp_alpha=0.0,
    n_jobs=-1, random_state=rng
)
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

Train MAE: 2.5435789900881782
Valid MAE: 3.1560122369045276


In [6]:
model = RandomForestRegressor(
    n_estimators=800, max_depth=10,
    min_samples_leaf=20, max_features=0.6,
    n_jobs=-1, random_state=rng
)
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

Train MAE: 2.689130946525663
Valid MAE: 3.1501342529336287
