In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

rng = np.random.RandomState(42)

In [3]:
data_path = 'https://raw.githubusercontent.com/antbartash/max_temp/master/data/data_features_w_base.csv'
data = pd.read_csv(data_path)
data['DATE'] = data['DATE'].astype('datetime64[ns]')

print(data.shape)
data.head()

(40778, 56)


Unnamed: 0,DATE,TARGET,TMAX_d1,TMAX_d1_d2_diff,TMAX_d2,TMAX_d3,TMAX_d4,TMAX_d5,TMAX_d6,TMAX_d7,...,TMAX_14day_mean_diff,MONTH,STATION_USW00003967,STATION_USW00012916,STATION_USW00013960,STATION_USW00024025,STATION_USW00024233,STATION_USW00093067,STATION_USW00093225,STATION_USW00094728
0,2010-01-16,15.6,15.0,-1.1,16.1,10.0,14.4,9.4,4.4,2.8,...,0.2,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2010-01-17,14.4,15.6,0.6,15.0,16.1,10.0,14.4,9.4,4.4,...,0.357143,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2010-01-18,17.2,14.4,-1.2,15.6,15.0,16.1,10.0,14.4,9.4,...,0.435714,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2010-01-19,21.1,17.2,2.8,14.4,15.6,15.0,16.1,10.0,14.4,...,0.792857,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2010-01-20,23.9,21.1,3.9,17.2,14.4,15.6,15.0,16.1,10.0,...,1.071429,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
X_train = data.loc[data['DATE'].dt.year <= 2021].drop(columns=['TARGET', 'DATE']).copy()
y_train = data.loc[data['DATE'].dt.year <= 2021, 'TARGET'].copy()
X_valid = data.loc[data['DATE'].dt.year == 2022].drop(columns=['TARGET', 'DATE']).copy()
y_valid = data.loc[data['DATE'].dt.year == 2022, 'TARGET'].copy()
X_test = data.loc[data['DATE'].dt.year == 2023].drop(columns=['TARGET', 'DATE']).copy()
y_test = data.loc[data['DATE'].dt.year == 2023, 'TARGET'].copy()

print(f'Train: {X_train.shape}, {y_train.shape}')
print(f'Valid: {X_valid.shape}, {y_valid.shape}')
print(f'Test: {X_test.shape}, {y_test.shape}')

Train: (34938, 54), (34938,)
Valid: (2920, 54), (2920,)
Test: (2920, 54), (2920,)


In [5]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=scaler.feature_names_in_)
X_valid = pd.DataFrame(scaler.transform(X_valid), columns=scaler.feature_names_in_)
X_test = pd.DataFrame(scaler.transform(X_test), columns=scaler.feature_names_in_)

print(f'Train: {X_train.shape}, {y_train.shape}')
print(f'Valid: {X_valid.shape}, {y_valid.shape}')
print(f'Test: {X_test.shape}, {y_test.shape}')

Train: (34938, 54), (34938,)
Valid: (2920, 54), (2920,)
Test: (2920, 54), (2920,)


# Linear Regression

In [6]:
model = LinearRegression()
model.fit(X_train, y_train)

print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

Train MAE: 2.945932661647542
Valid MAE: 3.160086291453305


# Ridge

In [7]:
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1],
    'fit_intercept': [True, False]
}

model = Ridge(max_iter=1000)
grid = GridSearchCV(
    model, param_grid, cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=1, verbose=1
)
grid.fit(X_train, y_train)
print('Best score: ', grid.best_score_)
print('Best params: ', grid.best_params_)

model = Ridge(**grid.best_params_)
model.fit(X_train, y_train)
print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

Fitting 5 folds for each of 26 candidates, totalling 130 fits
Best score:  -2.9584193164594303
Best params:  {'alpha': 1.1, 'fit_intercept': True}
Train MAE: 2.9459129290397863
Valid MAE: 3.1603306155776907


# Lasso

In [8]:
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1],
    'fit_intercept': [True, False]
}

model = Lasso(max_iter=10000)
grid = GridSearchCV(
    model, param_grid, cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=1, verbose=1
)
grid.fit(X_train, y_train)
print('Best score: ', grid.best_score_)
print('Best params: ', grid.best_params_)

model = Lasso(max_iter=10000, **grid.best_params_)
model.fit(X_train, y_train)
print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

Fitting 5 folds for each of 26 candidates, totalling 130 fits


  model = cd_fast.enet_coordinate_descent(


Best score:  -2.9587022834867667
Best params:  {'alpha': 0.001, 'fit_intercept': True}
Train MAE: 2.946196974056016
Valid MAE: 3.161882006568264


# ElasticNet

In [9]:
param_grid = {
    'l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1],
    'fit_intercept': [True, False]
}

model = ElasticNet(max_iter=1000)
grid = GridSearchCV(
    model, param_grid, cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=1, verbose=1
)
grid.fit(X_train, y_train)
print('Best score: ', grid.best_score_)
print('Best params: ', grid.best_params_)

model = ElasticNet(max_iter=1000, **grid.best_params_)
model.fit(X_train, y_train)
print(f'Train MAE: {mean_absolute_error(y_train, model.predict(X_train))}')
print(f'Valid MAE: {mean_absolute_error(y_valid, model.predict(X_valid))}')

Fitting 5 folds for each of 198 candidates, totalling 990 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best score:  -2.972251243717399
Best params:  {'alpha': 0.1, 'fit_intercept': True, 'l1_ratio': 0.1}
Train MAE: 2.9612073983644445
Valid MAE: 3.1824800538149374


  model = cd_fast.enet_coordinate_descent(
