# Imports

In [1]:
!pip install --upgrade scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading scikit_learn-1.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m76.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.6.0


In [2]:
import numpy as np
import pandas as pd
import matplotlib
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
import optuna

rng = np.random.RandomState(42)

In [3]:
import sklearn
sklearn.__version__

'1.6.0'

# Data

In [4]:
data_path = 'https://raw.githubusercontent.com/antbartash/max_temp/master/data/data_features.csv'
data = pd.read_csv(data_path)
data['DATE'] = data['DATE'].astype('datetime64[ns]')

X_train = data.loc[data['DATE'].dt.year <= 2021].drop(columns=['TARGET', 'DATE']).copy()
y_train = data.loc[data['DATE'].dt.year <= 2021, 'TARGET'].copy()
X_valid = data.loc[data['DATE'].dt.year == 2022].drop(columns=['TARGET', 'DATE']).copy()
y_valid = data.loc[data['DATE'].dt.year == 2022, 'TARGET'].copy()
X_test = data.loc[data['DATE'].dt.year == 2023].drop(columns=['TARGET', 'DATE']).copy()
y_test = data.loc[data['DATE'].dt.year == 2023, 'TARGET'].copy()

print(f'Train: {X_train.shape}, {y_train.shape}')
print(f'Valid: {X_valid.shape}, {y_valid.shape}')
print(f'Test: {X_test.shape}, {y_test.shape}')

Train: (34938, 52), (34938,)
Valid: (2920, 52), (2920,)
Test: (2920, 52), (2920,)


# Baseline

In [5]:
model = GradientBoostingRegressor(
    n_estimators=10000, 
    validation_fraction=0.1, n_iter_no_change=250,
    verbose=1, random_state=rng
)
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

      Iter       Train Loss   Remaining Time 
         1          97.2128           40.72m
         2          82.4999           40.81m
         3          70.5059           40.97m
         4          60.6862           40.66m
         5          52.6820           40.43m
         6          46.0943           40.39m
         7          40.7065           40.33m
         8          36.2631           40.16m
         9          32.6573           40.14m
        10          29.6905           40.07m
        20          17.8837           39.84m
        30          15.9781           39.71m
        40          15.4961           40.13m
        50          15.2853           40.49m
        60          15.1519           40.65m
        70          15.0496           40.88m
        80          14.9626           41.00m
        90          14.8774           41.04m
       100          14.7991           40.99m
       200          14.2417           40.84m
Train MAE: 2.815033953938314
Valid MAE: 3.153980722911

In [8]:
model = GradientBoostingRegressor(
    n_estimators=10000, 
    learning_rate=0.03, max_leaf_nodes=14, max_depth=1,
    min_samples_leaf=15, max_features=1.0,
    validation_fraction=0.1, n_iter_no_change=250,
    verbose=1, random_state=rng
)
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

      Iter       Train Loss   Remaining Time 
         1         111.3884           18.57m
         2         107.5568           16.98m
         3         103.9406           16.27m
         4         100.5293           15.87m
         5          97.3006           15.62m
         6          94.2332           15.39m
         7          91.3270           15.20m
         8          88.5588           15.09m
         9          85.9222           15.00m
        10          83.4151           14.91m
        20          63.7222           14.72m
        30          50.7793           14.55m
        40          41.8959           14.52m
        50          35.5806           14.48m
        60          31.0145           14.44m
        70          27.6567           14.37m
        80          25.1399           14.34m
        90          23.2298           14.31m
       100          21.7693           14.27m
       200          16.7046           14.13m
       300          15.8949           13.95m
       40