# Imports

In [2]:
!pip install --upgrade scikit-learn



In [3]:
import numpy as np
import pandas as pd
import matplotlib
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
import optuna

rng = np.random.RandomState(42)

In [4]:
import sklearn
sklearn.__version__

'1.6.0'

# Data

In [5]:
data_path = 'https://raw.githubusercontent.com/antbartash/max_temp/master/data/data_features_w_base.csv'
data = pd.read_csv(data_path)
data['DATE'] = data['DATE'].astype('datetime64[ns]')

X_train = data.loc[data['DATE'].dt.year <= 2021].drop(columns=['TARGET', 'DATE']).copy()
y_train = data.loc[data['DATE'].dt.year <= 2021, 'TARGET'].copy()
X_valid = data.loc[data['DATE'].dt.year == 2022].drop(columns=['TARGET', 'DATE']).copy()
y_valid = data.loc[data['DATE'].dt.year == 2022, 'TARGET'].copy()
X_test = data.loc[data['DATE'].dt.year == 2023].drop(columns=['TARGET', 'DATE']).copy()
y_test = data.loc[data['DATE'].dt.year == 2023, 'TARGET'].copy()

print(f'Train: {X_train.shape}, {y_train.shape}')
print(f'Valid: {X_valid.shape}, {y_valid.shape}')
print(f'Test: {X_test.shape}, {y_test.shape}')

Train: (34938, 66), (34938,)
Valid: (2920, 66), (2920,)
Test: (2920, 66), (2920,)


# Baseline

In [6]:
model = GradientBoostingRegressor(
    n_estimators=10000, 
    validation_fraction=0.1, n_iter_no_change=250,
    verbose=1, random_state=rng
)
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

      Iter       Train Loss   Remaining Time 
         1          97.2128           52.77m
         2          82.4999           52.63m
         3          70.5059           52.85m
         4          60.6862           52.82m
         5          52.6820           52.74m
         6          46.0943           52.78m
         7          40.7065           52.67m
         8          36.2631           52.57m
         9          32.6573           52.36m
        10          29.6905           52.29m
        20          17.8649           51.98m
        30          15.8996           52.43m
        40          15.3983           52.93m
        50          15.1907           53.11m
        60          15.0564           53.40m
        70          14.9600           53.60m
        80          14.8743           53.79m
        90          14.7864           53.88m
       100          14.7237           53.94m
       200          14.1466           53.83m
       300          13.6772           53.46m
       40

In [7]:
model = GradientBoostingRegressor(
    n_estimators=10000, 
    learning_rate=0.02, max_leaf_nodes=17, max_depth=8,
    min_samples_leaf=2, max_features=0.238,
    validation_fraction=0.1, n_iter_no_change=250,
    verbose=1, random_state=rng
)
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

      Iter       Train Loss   Remaining Time 
         1         111.4960           20.60m
         2         107.9009           19.87m
         3         104.4062           20.15m
         4         101.1167           19.68m
         5          97.9400           19.37m
         6          94.8845           19.72m
         7          91.9414           19.68m
         8          89.0869           19.90m
         9          86.3287           20.07m
        10          83.6610           20.18m
        20          62.1004           20.15m
        30          47.4402           20.29m
        40          37.4862           20.19m
        50          30.7065           19.97m
        60          25.9909           19.90m
        70          22.7639           19.83m
        80          20.5320           19.85m
        90          18.9799           19.87m
       100          17.8757           19.95m
       200          14.9334           20.15m
       300          14.3545           20.59m
       40