# Imports

In [2]:
!pip install --upgrade scikit-learn



In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_absolute_error

# NumPy RandomState generator created with a fixed seed (42) for reproducibility.
rng = np.random.RandomState(42)

In [4]:
# Display the installed scikit-learn version so the environment behind the
# results in this notebook is recorded alongside them.
import sklearn
sklearn.__version__

'1.6.0'

# Data

In [5]:
# Read the dataset from the project's GitHub repository and convert DATE to timestamps.
data_path = 'https://raw.githubusercontent.com/antbartash/max_temp/master/data/data_features.csv'
data = pd.read_csv(data_path)
data['DATE'] = data['DATE'].astype('datetime64[ns]')

# Split by calendar year: up to 2021 -> train, 2022 -> validation, 2023 -> test.
year = data['DATE'].dt.year
in_train = year <= 2021
in_valid = year == 2022
in_test = year == 2023

# TARGET is the label; DATE is only used for splitting, so both leave the feature matrix.
non_feature_cols = ['TARGET', 'DATE']

X_train = data.loc[in_train].drop(columns=non_feature_cols).copy()
y_train = data.loc[in_train, 'TARGET'].copy()
X_valid = data.loc[in_valid].drop(columns=non_feature_cols).copy()
y_valid = data.loc[in_valid, 'TARGET'].copy()
X_test = data.loc[in_test].drop(columns=non_feature_cols).copy()
y_test = data.loc[in_test, 'TARGET'].copy()

# One line per split: feature-matrix shape followed by target shape.
print(
    f'Train: {X_train.shape}, {y_train.shape}',
    f'Valid: {X_valid.shape}, {y_valid.shape}',
    f'Test: {X_test.shape}, {y_test.shape}',
    sep='\n',
)

Train: (34938, 52), (34938,)
Valid: (2920, 52), (2920,)
Test: (2920, 52), (2920,)


# Base models

In [6]:
# Base regressors that the VotingRegressor cells below combine in different subsets.
# NOTE(review): the hyperparameter values are hard-coded here; presumably they come
# from an earlier tuning run -- confirm and record their provenance.

# Support vector regressor with a degree-2 polynomial kernel.
svr_model = SVR(
    kernel='poly', degree=2, coef0=1,
    C=0.75, gamma='scale',
    max_iter=100000
)

# Linear support vector regressor.
# An integer seed is passed instead of the shared RandomState instance: a fresh
# RandomState(42) and the integer 42 yield the same random stream, but the integer
# cannot be advanced by other code, so refitting always starts from the same state.
linearsvr_model = LinearSVR(
    max_iter=100000, random_state=42
)

# Gradient-boosted trees (CatBoost), silenced with verbose=0 and seeded with 0.
catboost_model = CatBoostRegressor(
    n_estimators=571, learning_rate=0.035, depth=7,
    l2_leaf_reg=18.65, random_strength=0.1243, bagging_temperature=79.24,
    grow_policy='SymmetricTree', verbose=0, random_state=0
)

# Ordinary least-squares linear regression with default settings.
regression_model = LinearRegression()

# Voting ensembles of 2 base models

In [7]:
# Ensemble members: SVR + LinearSVR (CatBoost and LinearRegression are left out).
members = [
    ('svr', svr_model),
    ('linearsvr', linearsvr_model),
]
model = VotingRegressor(estimators=members, n_jobs=-1)
model.fit(X_train, y_train)

# Mean absolute error on the training and validation splits.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print(f'Train MAE: {train_mae}')
valid_mae = mean_absolute_error(y_valid, model.predict(X_valid))
print(f'Valid MAE: {valid_mae}')

Train MAE: 2.888249242555616
Valid MAE: 3.1067285550793913


In [7]:
# Ensemble members: SVR + CatBoost (LinearSVR and LinearRegression are left out).
members = [
    ('svr', svr_model),
    ('catboost', catboost_model),
]
model = VotingRegressor(estimators=members)
model.fit(X_train, y_train)

# Mean absolute error on the training and validation splits.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print(f'Train MAE: {train_mae}')
valid_mae = mean_absolute_error(y_valid, model.predict(X_valid))
print(f'Valid MAE: {valid_mae}')

Train MAE: 2.835534088194449
Valid MAE: 3.106728788365591


In [8]:
# Ensemble members: SVR + LinearRegression (LinearSVR and CatBoost are left out).
members = [
    ('svr', svr_model),
    ('regression', regression_model),
]
model = VotingRegressor(estimators=members)
model.fit(X_train, y_train)

# Mean absolute error on the training and validation splits.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print(f'Train MAE: {train_mae}')
valid_mae = mean_absolute_error(y_valid, model.predict(X_valid))
print(f'Valid MAE: {valid_mae}')

Train MAE: 2.893105993019703
Valid MAE: 3.1121607363504733


In [8]:
# Ensemble members: LinearSVR + CatBoost (SVR and LinearRegression are left out).
members = [
    ('linearsvr', linearsvr_model),
    ('catboost', catboost_model),
]
model = VotingRegressor(estimators=members)
model.fit(X_train, y_train)

# Mean absolute error on the training and validation splits.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print(f'Train MAE: {train_mae}')
valid_mae = mean_absolute_error(y_valid, model.predict(X_valid))
print(f'Valid MAE: {valid_mae}')



Train MAE: 2.8195519189568645
Valid MAE: 3.0872598704073124


In [9]:
# Ensemble members: LinearSVR + LinearRegression (SVR and CatBoost are left out).
members = [
    ('linearsvr', linearsvr_model),
    ('regression', regression_model),
]
model = VotingRegressor(estimators=members)
model.fit(X_train, y_train)

# Mean absolute error on the training and validation splits.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print(f'Train MAE: {train_mae}')
valid_mae = mean_absolute_error(y_valid, model.predict(X_valid))
print(f'Valid MAE: {valid_mae}')

Train MAE: 2.8883463025942775
Valid MAE: 3.101917317276661




In [10]:
# Ensemble members: CatBoost + LinearRegression (SVR and LinearSVR are left out).
members = [
    ('catboost', catboost_model),
    ('regression', regression_model),
]
model = VotingRegressor(estimators=members)
model.fit(X_train, y_train)

# Mean absolute error on the training and validation splits.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print(f'Train MAE: {train_mae}')
valid_mae = mean_absolute_error(y_valid, model.predict(X_valid))
print(f'Valid MAE: {valid_mae}')

Train MAE: 2.8396011781738095
Valid MAE: 3.1098418861283275


# Voting ensembles of 3 base models

In [11]:
# Ensemble members: LinearSVR + CatBoost + LinearRegression (SVR is left out).
members = [
    ('linearsvr', linearsvr_model),
    ('catboost', catboost_model),
    ('regression', regression_model),
]
model = VotingRegressor(estimators=members)
model.fit(X_train, y_train)

# Mean absolute error on the training and validation splits.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print(f'Train MAE: {train_mae}')
valid_mae = mean_absolute_error(y_valid, model.predict(X_valid))
print(f'Valid MAE: {valid_mae}')



Train MAE: 2.8450475986575223
Valid MAE: 3.0953212240586203


In [12]:
# Ensemble members: SVR + CatBoost + LinearRegression (LinearSVR is left out).
members = [
    ('svr', svr_model),
    ('catboost', catboost_model),
    ('regression', regression_model),
]
model = VotingRegressor(estimators=members)
model.fit(X_train, y_train)

# Mean absolute error on the training and validation splits.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print(f'Train MAE: {train_mae}')
valid_mae = mean_absolute_error(y_valid, model.predict(X_valid))
print(f'Valid MAE: {valid_mae}')

Train MAE: 2.8506502517435677
Valid MAE: 3.1028598498884605


In [13]:
# Ensemble members: SVR + LinearSVR + LinearRegression (CatBoost is left out).
members = [
    ('svr', svr_model),
    ('linearsvr', linearsvr_model),
    ('regression', regression_model),
]
model = VotingRegressor(estimators=members)
model.fit(X_train, y_train)

# Mean absolute error on the training and validation splits.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print(f'Train MAE: {train_mae}')
valid_mae = mean_absolute_error(y_valid, model.predict(X_valid))
print(f'Valid MAE: {valid_mae}')



Train MAE: 2.885746107466003
Valid MAE: 3.1029581751566977


In [14]:
# Ensemble members: SVR + LinearSVR + CatBoost (LinearRegression is left out).
members = [
    ('svr', svr_model),
    ('linearsvr', linearsvr_model),
    ('catboost', catboost_model),
]
model = VotingRegressor(estimators=members)
model.fit(X_train, y_train)

# Mean absolute error on the training and validation splits.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print(f'Train MAE: {train_mae}')
valid_mae = mean_absolute_error(y_valid, model.predict(X_valid))
print(f'Valid MAE: {valid_mae}')



Train MAE: 2.842914017242161
Valid MAE: 3.0950905054092277


# Voting ensemble of all 4 base models

In [15]:
# Ensemble members: all four base models (SVR, LinearSVR, CatBoost, LinearRegression).
members = [
    ('svr', svr_model),
    ('linearsvr', linearsvr_model),
    ('catboost', catboost_model),
    ('regression', regression_model),
]
model = VotingRegressor(estimators=members)
model.fit(X_train, y_train)

# Mean absolute error on the training and validation splits.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print(f'Train MAE: {train_mae}')
valid_mae = mean_absolute_error(y_valid, model.predict(X_valid))
print(f'Valid MAE: {valid_mae}')



Train MAE: 2.85375650769095
Valid MAE: 3.096519998016302
