# Imports

In [3]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

# Shared pseudo-random generator, seeded once so the notebook's stochastic steps
# start from a known state; it is passed as `random_state` to every estimator below.
rng = np.random.RandomState(seed=42)

# Data

In [4]:
# Source CSV, read directly from the project's GitHub repository.
data_path = 'https://raw.githubusercontent.com/antbartash/max_temp/master/data/data_features_w_base.csv'

# Read the file and cast DATE to a datetime dtype in one step, so the `.dt`
# accessor can be used for the year-based split.
# NOTE(review): the preview's first column is labelled 'Unnamed: 0' -- if that is
# a saved row index rather than the display index, it stays in the frame and is
# used as a model feature downstream; confirm.
data = pd.read_csv(data_path).astype({'DATE': 'datetime64[ns]'})

print(data.shape)
data.head()

(40778, 68)


Unnamed: 0,DATE,TARGET,TMAX_d1,TMAX_d1_d2_diff,TMAX_d2,TMAX_d3,TMAX_d4,TMAX_d5,TMAX_d6,TMAX_d7,...,MONTH_3,MONTH_4,MONTH_5,MONTH_6,MONTH_7,MONTH_8,MONTH_9,MONTH_10,MONTH_11,MONTH_12
0,2010-01-16,15.6,15.0,-1.1,16.1,10.0,14.4,9.4,4.4,2.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2010-01-17,14.4,15.6,0.6,15.0,16.1,10.0,14.4,9.4,4.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2010-01-18,17.2,14.4,-1.2,15.6,15.0,16.1,10.0,14.4,9.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2010-01-19,21.1,17.2,2.8,14.4,15.6,15.0,16.1,10.0,14.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2010-01-20,23.9,21.1,3.9,17.2,14.4,15.6,15.0,16.1,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Chronological split by calendar year of DATE:
#   train = up to and including 2021, valid = 2022, test = 2023.
sample_year = data['DATE'].dt.year
feature_frame = data.drop(columns=['TARGET', 'DATE'])  # everything except label and date
target_series = data['TARGET']

in_train = sample_year <= 2021
in_valid = sample_year == 2022
in_test = sample_year == 2023

# `.copy()` gives each split its own data, independent of `data`.
X_train, y_train = feature_frame.loc[in_train].copy(), target_series.loc[in_train].copy()
X_valid, y_valid = feature_frame.loc[in_valid].copy(), target_series.loc[in_valid].copy()
X_test, y_test = feature_frame.loc[in_test].copy(), target_series.loc[in_test].copy()

print(f'Train: {X_train.shape}, {y_train.shape}')
print(f'Valid: {X_valid.shape}, {y_valid.shape}')
print(f'Test: {X_test.shape}, {y_test.shape}')

Train: (34938, 66), (34938,)
Valid: (2920, 66), (2920,)
Test: (2920, 66), (2920,)


# Baseline

In [6]:
# Baseline: a decision tree with all-default hyperparameters, used as the
# reference point for the tuned models further down.
model = DecisionTreeRegressor(random_state=rng).fit(X_train, y_train)

# Mean absolute error on the training data and on the 2022 validation year.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
valid_mae = mean_absolute_error(y_valid, model.predict(X_valid))
print(f'Train MAE: {train_mae}')
print(f'Valid MAE: {valid_mae}')

Train MAE: 5.613715711554186e-17
Valid MAE: 4.6025


# Hyperparameter tuning

In [7]:
# Search space: 9 * 8 * 3 * 6 = 1296 parameter combinations, each scored with
# 5-fold cross-validation on the training split.
param_grid = {
    'max_depth': [None, 12, 10, 8, 6, 5, 4, 3, 2],
    'min_samples_leaf': [1, 10, 25, 50, 100, 200, 500, 1000],
    'min_impurity_decrease': [0, 0.01, 0.1],
    'ccp_alpha': [0, 0.0005, 0.001, 0.005, 0.01, 0.1],
}

model = DecisionTreeRegressor(random_state=rng)
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # negated MAE: higher (closer to 0) is better
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

print('Best score: ', grid.best_score_)
print('Best params: ', grid.best_params_)

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits
Best score:  -3.091151883583099
Best params:  {'ccp_alpha': 0.005, 'max_depth': None, 'min_impurity_decrease': 0, 'min_samples_leaf': 200}


In [8]:
# Refit a fresh tree with the hyperparameters selected by the grid search.
# Reading them from `grid.best_params_` (instead of re-typing the printed values)
# keeps this cell in sync with the search if the grid or the data changes, and
# passes every tuned parameter explicitly -- including min_impurity_decrease,
# which was previously left out because the selected value equals the default.
model = DecisionTreeRegressor(**grid.best_params_, random_state=rng)
model.fit(X_train, y_train)

# Compare the in-sample error with the error on the 2022 validation year.
print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

Train MAE: 2.972241081657534
Valid MAE: 3.2418038107909406


In [9]:
# Hand-picked variant: depth capped at 10 with a minimum leaf size of 150,
# leaving ccp_alpha at its default rather than the 0.005 chosen by the grid search.
model = DecisionTreeRegressor(max_depth=10, min_samples_leaf=150, random_state=rng)
model.fit(X_train, y_train)

# Mean absolute error on the training data and on the 2022 validation year.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
valid_mae = mean_absolute_error(y_valid, model.predict(X_valid))
print(f'Train MAE: {train_mae}')
print(f'Valid MAE: {valid_mae}')

Train MAE: 2.9429952501237864
Valid MAE: 3.2307862801725356
