In [None]:
# Attach Google Drive to this Colab runtime so files under it can be read by later cells.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [None]:
import warnings

# Suppress all Python warnings for the remainder of the session.
# NOTE(review): this also hides warnings raised by the libraries imported below.
warnings.filterwarnings("ignore")

import os
import gc
import pickle

import numpy as np
import pandas as pd
import polars as pl
import plotly.express as px

from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_absolute_error

from catboost import CatBoostRegressor, EShapCalcType, EFeaturesSelectionAlgorithm
import lightgbm as lgb

from pathlib import Path
# Folder (below the Drive mount point) from which the input parquet file is read.
# NOTE(review): "Eenefit" may be a typo in the folder name — verify against the actual Drive path.
data_dir = Path("/content/drive/MyDrive/Colab Notebooks/Eenefit/data")

In [None]:
# Columns that are cast to 8-bit integers right after loading.
cols_int8 = [
    'county', 'is_business', 'product_type', 'is_consumption', 'hour',
    'day', 'weekday', 'month', 'country_holiday',
]

# Read the feature table and apply the int8 cast through a per-column dtype mapping.
train = pd.read_parquet(data_dir / 'train_features_0.parquet').astype(
    {name: np.int8 for name in cols_int8}
)

# Show the first three rows, then let the cell's last expression report (rows, columns).
display(train.head(3))
train.shape

Unnamed: 0,county,is_business,product_type,is_consumption,datetime,data_block_id,date,dayofyear,hour,day,...,target_ratio_168_336,target_ratio_48_216,target_ratio_72_240,target_ratio_48_72,target_all_type_sum_ratio_48_72,target_all_type_sum_ratio_168_336,target_all_county_type_sum_ratio_48_72,target_all_county_type_sum_ratio_168_336,target,country_holiday
0,0,0,1,0,2021-09-01 00:00:00,0,2021-09-01,244,0,1,...,,,,,,,,,0.713,0
1,0,0,1,1,2021-09-01 00:00:00,0,2021-09-01,244,0,1,...,,,,,,,,,96.59,0
2,0,0,2,0,2021-09-01 00:00:00,0,2021-09-01,244,0,1,...,,,,,,,,,0.0,0


(2017824, 171)

In [None]:
# Columns removed from the feature table before modelling.
drop_cols = ['date', 'datetime', 'segment']

# Drop those columns and append the log1p-transformed target in a single chain.
train = (
    train
    .drop(columns=drop_cols)
    .assign(target_log1p=lambda frame: np.log1p(frame['target']))
)

In [None]:
%%time
params = dict(
    iterations=6000,
    learning_rate=0.05,
    max_depth=7,
    l2_leaf_reg=3.0,
    bootstrap_type='Bernoulli',
    #bagging_temperature = 0.5,

    min_data_in_leaf=100,
    #random_strength=1,
    #subsample=0.7,

    loss_function='RMSE',
    eval_metric = 'MAE',
    metric_period=100,
    od_type='Iter',
    od_wait=25,
    task_type='GPU',
    allow_writing_files=False,
    )

train_idx = list(range(0, 600))
drop_cols_2 = ['target', 'data_block_id', 'is_consumption', 'is_business']

results = []

for i in range(2):
    for j in range(2):

        mask = (train.is_consumption==i) & (train.is_business==j)

        X_train = train[train.data_block_id.isin(train_idx)]
        X_train = X_train[mask].drop(drop_cols_2, axis=1)
        y_train = X_train.pop('target_log1p')

        X_valid = train[~train.data_block_id.isin(train_idx)]
        X_valid = X_valid[mask].drop(drop_cols_2, axis=1)
        y_valid = X_valid.pop('target_log1p')

        feature_name = list(X_train.columns)

        #model = CatBoostRegressor(**params)
        #summary = model.select_features(
        #    X_train, y_train,
        #    eval_set = [(X_valid, y_valid)],
        #    features_for_select= feature_name,
        #    num_features_to_select=len(feature_name)-10,
        #    steps=3,
        #    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
        #    shap_calc_type=EShapCalcType.Regular,
        #    train_final_model=False,
        #    plot=True,
        #)

        #model = CatBoostRegressor(**params)
        #model.fit(
        #    X_train[summary['selected_features_names']], y_train,
        #    eval_set=[(X_valid[summary['selected_features_names']], y_valid)],
        #    use_best_model=True
        #)


        model = CatBoostRegressor(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            use_best_model=True
            )

        model.save_model(f"model_lgb_{i}{j}")

        y_true = np.expm1(y_valid)
        preds = model.predict(X_valid)
        preds = np.expm1(preds)

        mae = mean_absolute_error(y_true, preds)
        print('MAE:', mae)
        print("==============================================================")
        results.append(mae)

        del X_train, y_train, X_valid, y_valid

print(results)
print(np.array(results).mean())

0:	learn: 1.7155835	test: 2.4567766	best: 2.4567766 (0)	total: 16.8ms	remaining: 1m 40s
100:	learn: 0.2389176	test: 0.2474367	best: 0.2474367 (100)	total: 1.04s	remaining: 1m
200:	learn: 0.2148927	test: 0.2311511	best: 0.2311511 (200)	total: 1.96s	remaining: 56.5s
300:	learn: 0.2034154	test: 0.2235933	best: 0.2235681 (299)	total: 2.89s	remaining: 54.8s
400:	learn: 0.1960311	test: 0.2207222	best: 0.2205756 (398)	total: 3.81s	remaining: 53.2s
500:	learn: 0.1905673	test: 0.2181785	best: 0.2181785 (500)	total: 4.74s	remaining: 52s
600:	learn: 0.1859966	test: 0.2164096	best: 0.2164096 (600)	total: 5.66s	remaining: 50.9s
700:	learn: 0.1823384	test: 0.2148543	best: 0.2148543 (700)	total: 6.58s	remaining: 49.8s
800:	learn: 0.1790052	test: 0.2140321	best: 0.2138671 (797)	total: 7.51s	remaining: 48.7s
900:	learn: 0.1761236	test: 0.2131928	best: 0.2131239 (882)	total: 8.44s	remaining: 47.7s
bestTest = 0.212548252
bestIteration = 945
Shrink model to first 946 iterations.
MAE: 71.4122069176172
0:	l