This notebook was run on Google Colab. If the output images cannot be viewed locally, please open the Colab copy: https://colab.research.google.com/drive/1oa3H13hRTc_Y1xh_H3u1fZfHW6BO04L4?usp=sharing

In [None]:
# Colab only: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Set up the CatBoost environment for Colab

In [None]:
!pip install catboost
!pip install optuna

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2
Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m12.0 MB/s[0

# Import

In [None]:
import warnings

warnings.filterwarnings("ignore")

import os
import gc
import datetime

import numpy as np
import pandas as pd
import polars as pl
import pyarrow as pa

import catboost as cb
import optuna
from sklearn.metrics import mean_absolute_error

import holidays

# Class

## Load Data

In [None]:
class DataStorage:
    """Load the competition CSV files into polars DataFrames.

    Each table is read with only the columns listed in the matching ``*_cols``
    class attribute. After loading, the training rows are restricted to the
    ``train_start`` .. ``train_end`` window (both bounds inclusive), and the
    polars schema of each table is stored in a ``schema_*`` attribute.

    Parameters
    ----------
    root : str, optional
        Directory that contains the CSV files. When omitted, the class-level
        ``root`` default is used, so ``DataStorage()`` behaves as before.
    """

    # Default data directory (Google Drive mounted in Colab). Alternatives used
    # by the author: "..\data" locally, or
    # "/kaggle/input/predict-energy-behavior-of-prosumers" on Kaggle.
    root = "/content/drive/MyDrive/predict-energy-behavior-of-prosumers/Data"

    # Inclusive datetime window applied to the training rows.
    train_start = datetime.datetime(2022, 1, 1)
    train_end = datetime.datetime(2023, 5, 17)

    data_cols = [
        "target",
        "county",
        "is_business",
        "product_type",
        "is_consumption",
        "datetime",
        "row_id",
    ]
    client_cols = [
        "product_type",
        "county",
        "eic_count",
        "installed_capacity",
        "is_business",
        "date",
    ]
    gas_prices_cols = ["forecast_date", "lowest_price_per_mwh", "highest_price_per_mwh"]
    electricity_prices_cols = ["forecast_date", "euros_per_mwh"]
    forecast_weather_cols = [
        "latitude",
        "longitude",
        "origin_datetime",
        "hours_ahead",
        "temperature",
        "dewpoint",
        "cloudcover_high",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_total",
        "10_metre_u_wind_component",
        "10_metre_v_wind_component",
        "forecast_datetime",
        "direct_solar_radiation",
        "surface_solar_radiation_downwards",
        "snowfall",
        "total_precipitation",
    ]
    historical_weather_cols = [
        "datetime",
        "temperature",
        "dewpoint",
        "rain",
        "snowfall",
        "surface_pressure",
        "cloudcover_total",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_high",
        "windspeed_10m",
        "winddirection_10m",
        "shortwave_radiation",
        "direct_solar_radiation",
        "diffuse_radiation",
        "latitude",
        "longitude",
    ]
    location_cols = ["longitude", "latitude", "county"]
    target_cols = [
        "target",
        "county",
        "is_business",
        "product_type",
        "is_consumption",
        "datetime",
    ]

    def __init__(self, root=None):
        # An explicit root overrides the class default for this instance only.
        if root is not None:
            self.root = root

        self.df_data = self._read_csv("train.csv", self.data_cols)
        self.df_client = self._read_csv("client.csv", self.client_cols)
        self.df_gas_prices = self._read_csv("gas_prices.csv", self.gas_prices_cols)
        self.df_electricity_prices = self._read_csv(
            "electricity_prices.csv", self.electricity_prices_cols
        )
        self.df_forecast_weather = self._read_csv(
            "forecast_weather.csv", self.forecast_weather_cols
        )
        self.df_historical_weather = self._read_csv(
            "historical_weather.csv", self.historical_weather_cols
        )
        self.df_weather_station_to_county_mapping = self._read_csv(
            "weather_station_to_county_mapping.csv", self.location_cols
        )

        # Keep only the training rows inside the configured datetime window.
        self.df_data = self.df_data.filter(
            (pl.col("datetime") >= self.train_start)
            & (pl.col("datetime") <= self.train_end)
        )
        # Target history used for lag features: same rows as df_data, without row_id.
        self.df_target = self.df_data.select(self.target_cols)

        self.schema_data = self.df_data.schema
        self.schema_client = self.df_client.schema
        self.schema_gas_prices = self.df_gas_prices.schema
        self.schema_electricity_prices = self.df_electricity_prices.schema
        self.schema_forecast_weather = self.df_forecast_weather.schema
        self.schema_historical_weather = self.df_historical_weather.schema
        self.schema_target = self.df_target.schema

        # The weather tables are joined to this mapping on (longitude, latitude)
        # after being cast to Float32 as well, so the key dtypes must match.
        self.df_weather_station_to_county_mapping = (
            self.df_weather_station_to_county_mapping.with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
        )

    def _read_csv(self, filename, columns):
        """Read ``filename`` from ``self.root``, keeping ``columns`` and parsing date-like columns."""
        return pl.read_csv(
            os.path.join(self.root, filename),
            columns=columns,
            try_parse_dates=True,
        )



## Feature Engineering

In [None]:
class FeaturesGenerator:
    """Build the model feature table from the frames held by a ``DataStorage``.

    ``generate_features`` is the entry point. It runs the ``_add_*`` steps in a
    fixed order on a polars frame, converts the result to pandas and finally
    adds within-segment difference features.

    The order of the steps matters: polars only applies a join ``suffix`` to
    right-hand columns whose name already exists on the left, so the column
    names produced by later joins depend on which joins ran before them.
    """

    def __init__(self, data_storage):
        """Store the data tables and pre-compute Estonian public holidays for 2021-2025."""
        self.data_storage = data_storage
        self.estonian_holidays = list(
            holidays.country_holidays("EE", years=range(2021, 2026)).keys()
        )

    def _add_general_features(self, df_features):
        """Add calendar fields, the ``segment`` id and cyclical time encodings."""
        df_features = (
            # Basic calendar fields derived from the timestamp.
            df_features.with_columns(
                pl.col("datetime").dt.ordinal_day().alias("dayofyear"),
                pl.col("datetime").dt.hour().alias("hour"),
                pl.col("datetime").dt.day().alias("day"),
                pl.col("datetime").dt.week().alias("week"),
                pl.col("datetime").dt.weekday().alias("weekday"),
                pl.col("datetime").dt.month().alias("month"),
                pl.col("datetime").dt.quarter().alias("quarter"),
                pl.col("datetime").dt.year().alias("year"),
            )
            # One id per (county, is_business, product_type, is_consumption) combination.
            .with_columns(
                pl.concat_str(
                    "county",
                    "is_business",
                    "product_type",
                    "is_consumption",
                    separator="_",
                ).alias("segment"),
            )
            # Cyclical encoding of day-of-year and hour
            # (reference: https://towardsdatascience.com/cyclical-features-encoding-its-about-time-ce23581845ca)
            .with_columns(
                (np.pi * pl.col("dayofyear") / 183).sin().alias("sin(dayofyear)"),
                (np.pi * pl.col("dayofyear") / 183).cos().alias("cos(dayofyear)"),
                (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
                (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
            )
        )
        return df_features

    def is_country_holiday(self, row):
        """Return True when the ``year``/``month``/``day`` in ``row`` is an Estonian holiday."""
        return (
            datetime.date(row["year"], row["month"], row["day"])
            in self.estonian_holidays
        )

    def _add_holidays_features(self, df_features):
        """Add the boolean ``is_country_holiday`` column (needs year/month/day columns)."""
        # NOTE(review): ``Expr.apply`` is deprecated in newer polars releases in
        # favour of ``map_elements`` - revisit when upgrading polars.
        df_features = df_features.with_columns(
            pl.struct(["year", "month", "day"])
            .apply(self.is_country_holiday)
            .alias("is_country_holiday")
        )
        return df_features

    def _add_client_features(self, df_features):
        """Join client data (eic_count, installed_capacity), shifted forward by two days."""
        df_client = self.data_storage.df_client
        # Only client information from two days ago is available for each prediction.
        df_features = df_features.join(
            df_client.with_columns(
                (pl.col("date") + pl.duration(days=2)).cast(pl.Date)
            ),
            on=["county", "is_business", "product_type", "date"],
            how="left",
        )
        return df_features

    def _add_forecast_weather_features(self, df_features):
        """Join forecast-weather means (all stations and per county) at lags of 0h and 168h."""
        df_forecast_weather = self.data_storage.df_forecast_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_forecast_weather = (
            df_forecast_weather.rename({"forecast_datetime": "datetime"})
            # Keep forecasts made 22..45 hours ahead (inclusive). Both comparisons
            # need their own parentheses: `&` binds tighter than `<=`, so without
            # them the expression parses as `((h >= 22) & h) <= 45`.
            .filter((pl.col("hours_ahead") >= 22) & (pl.col("hours_ahead") <= 45))
            .drop("hours_ahead")
            .with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude", "origin_datetime")
        )

        # Mean over every grid point for each timestamp.
        df_forecast_weather_date = (
            df_forecast_weather.group_by("datetime").mean().drop("county")
        )

        # Mean over the grid points mapped to each county for each timestamp.
        df_forecast_weather_local = (
            df_forecast_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        # Use the forecast for the predicted hour and the forecast for the same
        # hour one week earlier.
        for hours_lag in [0, 7 * 24]:
            df_features = df_features.join(
                df_forecast_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_forecast_{hours_lag}h",
            )
            df_features = df_features.join(
                df_forecast_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_forecast_local_{hours_lag}h",
            )

        # Ratio of the current local forecast to the one for the same hour a week
        # earlier; the 1e-3 offset avoids division by zero.
        df_features = df_features.with_columns(
            (
                pl.col("temperature_forecast_local_0h")
                / (pl.col("temperature_forecast_local_168h") + 1e-3)
            ).alias("temperature_forecast_local_0h/168h"),
            (
                pl.col("surface_solar_radiation_downwards_forecast_local_0h")
                / (pl.col("surface_solar_radiation_downwards_forecast_local_168h") + 1e-3)
            ).alias("surface_solar_radiation_downwards_forecast_local_0h/168h"),
        )

        return df_features

    def _add_historical_weather_features(self, df_features):
        """Join historical-weather means at lags of 48h, 168h and (partially) 24h.

        Must run after ``_add_forecast_weather_features``: the ratio features
        below rely on the ``_historical_48h`` suffix, which polars only adds
        when the un-suffixed weather column names are already present.
        """
        df_historical_weather = self.data_storage.df_historical_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_historical_weather = (
            df_historical_weather.with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude")
        )

        # Mean over every station for each timestamp.
        df_historical_weather_date = (
            df_historical_weather.group_by("datetime").mean().drop("county")
        )

        # Mean over the stations mapped to each county for each timestamp.
        df_historical_weather_local = (
            df_historical_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        # Historical weather at the same hour two days ago and one week ago.
        for hours_lag in [2 * 24, 7 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )
            df_features = df_features.join(
                df_historical_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_historical_local_{hours_lag}h",
            )

        # Historical weather from one day ago, restricted to observations taken
        # at hours 0..10 (historical weather data is updated daily at 11 a.m.).
        for hours_lag in [1 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag),
                    pl.col("datetime").dt.hour().alias("hour"),
                )
                .filter(pl.col("hour") <= 10)
                .drop("hour"),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )

        df_features = df_features.with_columns(
            # Ratio of local historical weather two days ago to one week ago.
            (
                pl.col("temperature_historical_local_48h")
                / (pl.col("temperature_historical_local_168h") + 1e-3)
            ).alias("temperature_historical_local_48h/168h"),
            (
                pl.col("direct_solar_radiation_historical_local_48h")
                / (pl.col("direct_solar_radiation_historical_local_168h") + 1e-3)
            ).alias("direct_solar_radiation_historical_local_48h/168h"),
            # Ratio of historical weather one day ago (hours 0..10) to two days ago.
            # The denominator must be the `_historical_48h` column: the un-suffixed
            # `temperature` / `direct_solar_radiation` columns come from the
            # forecast join that runs earlier in the pipeline.
            (
                pl.col("temperature_historical_24h")
                / (pl.col("temperature_historical_48h") + 1e-3)
            ).alias("temperature_historical_24h/48h"),
            (
                pl.col("direct_solar_radiation_historical_24h")
                / (pl.col("direct_solar_radiation_historical_48h") + 1e-3)
            ).alias("direct_solar_radiation_historical_24h/48h"),
        )

        return df_features

    def _add_target_features(self, df_features):
        """Add lagged targets, lagged aggregated targets, their mean/std and ratios."""
        df_target = self.data_storage.df_target

        # Target summed over product types within (datetime, county, is_business, is_consumption).
        df_target_all_type_sum = (
            df_target.group_by(["datetime", "county", "is_business", "is_consumption"])
            .sum()
            .drop("product_type")
        )
        # Target summed over product types and counties within (datetime, is_business, is_consumption).
        df_target_all_county_type_sum = (
            df_target.group_by(["datetime", "is_business", "is_consumption"])
            .sum()
            .drop("product_type", "county")
        )

        # Target value of the same segment at the same hour 2..14 days earlier.
        hours_list = [days * 24 for days in range(2, 15)]

        for hours_lag in hours_list:
            df_features = df_features.join(
                df_target.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_{hours_lag}h"}),
                on=[
                    "county",
                    "is_business",
                    "product_type",
                    "is_consumption",
                    "datetime",
                ],
                how="left",
            )

        for hours_lag in [2 * 24, 3 * 24, 7 * 24, 14 * 24]:
            # Summed-over-product-type target at the same hour 2, 3, 7 and 14 days earlier.
            df_features = df_features.join(
                df_target_all_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_type_sum_{hours_lag}h"}),
                on=["county", "is_business", "is_consumption", "datetime"],
                how="left",
            )
            # Summed-over-product-type-and-county target at the same lags.
            df_features = df_features.join(
                df_target_all_county_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_county_type_sum_{hours_lag}h"}),
                on=["is_business", "is_consumption", "datetime"],
                how="left",
                suffix=f"_all_county_type_sum_{hours_lag}h",
            )

        # Row-wise mean and standard deviation of the 2..7 day target lags.
        cols_for_stats = [
            f"target_{hours_lag}h" for hours_lag in [2 * 24, 3 * 24, 4 * 24, 5 * 24, 6 * 24, 7 * 24]
        ]
        df_features = df_features.with_columns(
            df_features.select(cols_for_stats).mean(axis=1).alias("target_mean"),
            # Transposing turns each row into a column so that the column-wise
            # std() yields one value per original row.
            df_features.select(cols_for_stats)
            .transpose()
            .std()
            .transpose()
            .to_series()
            .alias("target_std"),
        )

        # Ratios between pairs of lagged targets; 1e-3 avoids division by zero.
        for target_prefix, lag_numerator, lag_denominator in [
            ("target", 24 * 7, 24 * 14),
            ("target", 24 * 2, 24 * 9),
            ("target", 24 * 3, 24 * 10),
            ("target", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 7, 24 * 14),
            ("target_all_county_type_sum", 24 * 2, 24 * 3),
            ("target_all_county_type_sum", 24 * 7, 24 * 14),
        ]:
            df_features = df_features.with_columns(
                (
                    pl.col(f"{target_prefix}_{lag_numerator}h")
                    / (pl.col(f"{target_prefix}_{lag_denominator}h") + 1e-3)
                ).alias(f"{target_prefix}_ratio_{lag_numerator}_{lag_denominator}")
            )

        return df_features

    def _additional_features(self, df_features):
        """Add ``<col>_diff_<window>`` columns to a pandas frame, in place.

        Each new column is the difference between a weather value and the value
        ``window`` rows earlier within the same
        (county, is_consumption, product_type, is_business) group, for windows
        of 1, 3 and 24 rows. This corresponds to 1h/3h/24h changes only if each
        group's rows are consecutive hourly records in time order - presumably
        true for the training data; verify against the caller.

        Returns the same (mutated) DataFrame.
        """
        weather_cols = [
            'temperature',
            'dewpoint',
            '10_metre_u_wind_component',
            '10_metre_v_wind_component',
            "cloudcover_total",
            "cloudcover_low",
            "cloudcover_mid",
            "cloudcover_high",
            "windspeed_10m",
            "winddirection_10m",
            "shortwave_radiation",
            "direct_solar_radiation",
            "diffuse_radiation",
        ]
        # The grouping keys never change inside the loop, so build the groupby once
        # instead of once per (column, window) pair.
        grouped = df_features.groupby(
            ["county", 'is_consumption', 'product_type', 'is_business']
        )
        for col in weather_cols:
            for window in [1, 3, 24]:
                df_features[f"{col}_diff_{window}"] = grouped[col].diff(window)
        return df_features

    def _add_gas_prices_features(self, df_features):
        """Join gas prices on ``date``, with the price date shifted forward by one day."""
        df_gas_prices = self.data_storage.df_gas_prices

        shifted_gas_prices = df_gas_prices.rename(
            {"forecast_date": "date"}
        ).with_columns((pl.col("date") + pl.duration(days=1)).cast(pl.Date))
        df_features = df_features.join(shifted_gas_prices, on="date", how="left")

        return df_features

    def _add_electricity_prices_features(self, df_features):
        """Join electricity prices on ``datetime``, with the price time shifted forward by 24h."""
        df_electricity_prices = self.data_storage.df_electricity_prices

        shifted_electricity_prices = df_electricity_prices.rename(
            {"forecast_date": "datetime"}
        ).with_columns(pl.col("datetime") + pl.duration(hours=24))
        df_features = df_features.join(
            shifted_electricity_prices, on="datetime", how="left"
        )

        return df_features

    def _reduce_memory_usage(self, df_features):
        """Downcast every Float64 column to Float32."""
        df_features = df_features.with_columns(pl.col(pl.Float64).cast(pl.Float32))
        return df_features

    def _drop_columns(self, df_features):
        """Drop the join-key/helper columns that are not used as model inputs."""
        df_features = df_features.drop(
            "date", "datetime", "dayofyear"
        )
        return df_features

    def _to_pandas(self, df_features, y):
        """Convert the polars feature frame to pandas and mark categorical columns.

        The categorical features are cast to the pandas ``category`` dtype for
        model training. When ``y`` is given, its column(s) are appended to the
        right of the features. The ``row_id``, ``year`` and ``date`` columns
        are removed if present.
        """
        cat_cols = [
            "county",
            "is_business",
            "product_type",
            "is_consumption",
            "segment",
            "quarter",
        ]

        if y is not None:
            df_features = pd.concat([df_features.to_pandas(), y.to_pandas()], axis=1)
        else:
            df_features = df_features.to_pandas()

        df_features[cat_cols] = df_features[cat_cols].astype("category")

        for unused_col in ['row_id', 'year', 'date']:
            if unused_col in df_features.columns:
                df_features = df_features.drop(unused_col, axis=1)

        return df_features

    def generate_features(self, df_prediction_items):
        """Run the full feature pipeline and return a pandas DataFrame.

        ``df_prediction_items`` is a polars frame of rows to build features
        for. If it has a ``target`` column, that column is split off before
        feature generation and re-attached to the returned frame.
        """
        if "target" in df_prediction_items.columns:
            df_prediction_items, y = (
                df_prediction_items.drop("target"),
                df_prediction_items.select("target"),
            )
        else:
            y = None

        # Calendar date of each row, used as a join key for the daily tables.
        df_features = df_prediction_items.with_columns(
            pl.col("datetime").cast(pl.Date).alias("date"),
        )

        # Order matters: later steps read columns (and suffixes) created earlier.
        for add_features in [
            self._add_general_features,
            self._add_holidays_features,
            self._add_client_features,
            self._add_forecast_weather_features,
            self._add_historical_weather_features,
            self._add_target_features,
            self._add_gas_prices_features,
            self._add_electricity_prices_features,
            self._reduce_memory_usage,
            self._drop_columns,
        ]:
            df_features = add_features(df_features)

        df_features = self._to_pandas(df_features, y)
        df_features = self._additional_features(df_features)
        return df_features

# Initialisation

In [None]:
# Load all source tables once, then build the feature generator on top of them.
data_storage = DataStorage()
features_generator = FeaturesGenerator(data_storage=data_storage)

### Feature Generation

In [None]:
# Build the training feature table from the date-filtered training rows.
df_train_features = features_generator.generate_features(data_storage.df_data)
# Keep only rows that have a target value to train on.
df_train_features = df_train_features[df_train_features['target'].notnull()]

In [None]:
# (rows, columns) of the training table; the columns include the target.
df_train_features.shape

(1604324, 216)

# Catboost Models Hyperparameter Optimization

In [None]:
def catboost_model(mask,trial,is_diff):
    """Optuna objective: train one CatBoost regressor and return its hold-out MAE.

    Parameters
    ----------
    mask : boolean selector applied to the global ``df_train_features``
        (e.g. the consumption or production rows).
    trial : optuna trial used to sample the hyperparameters.
    is_diff : 0 or 1. With 1 the model is fitted on ``target - target_48h``
        (a missing ``target_48h`` counts as 0); with 0 on the raw target.

    Returns
    -------
    float
        Mean absolute error on the last 15% of the selected rows, measured on
        the same scale the model was fitted on.

    Notes
    -----
    The split is positional (first 85% / remaining rows), so it is only a
    time-based split if the rows are in chronological order - verify upstream.
    NOTE(review): the hold-out set is also passed as ``eval_set``; if CatBoost
    picks its best iteration on it, the reported MAE is optimistic - confirm.
    """
    # Apply the mask once; repeating it copies the wide frame on every use.
    subset = df_train_features[mask]
    X = subset.drop(columns=["target"])
    Y = subset["target"] - (subset["target_48h"].fillna(0) * is_diff)

    # One split index guarantees train and hold-out are disjoint and together
    # cover every row (separate head/tail sizes can lose a row to truncation).
    train_size = 0.85
    n_train = int(len(Y) * train_size)
    X_train, X_test = X.iloc[:n_train], X.iloc[n_train:]
    y_train, y_test = Y.iloc[:n_train], Y.iloc[n_train:]

    params = {
        "random_seed":49,
        "loss_function":"RMSE",
        "task_type":"GPU",
        'bootstrap_type': 'Bernoulli',

        "iterations": trial.suggest_int("iterations", 5000, 10000, step=500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "depth": trial.suggest_int("depth", 3, 9),
        # suggest_float replaces the deprecated suggest_uniform (same range).
        'random_strength': trial.suggest_float('random_strength', 10, 50),
        "subsample": trial.suggest_float("subsample", 0.8, 1.0, step = 0.1),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 50, 1000),
        'reg_lambda': trial.suggest_float('reg_lambda', 0,100)
    }
    cat_features = ["county", "is_business", "product_type", "is_consumption", "segment", "quarter"]
    model = cb.CatBoostRegressor(**params, silent=True)
    model.fit(X_train, y_train , cat_features=cat_features , eval_set=(X_test, y_test))
    pred_cat = model.predict(X_test)
    mae = mean_absolute_error(y_test, pred_cat)
    return mae

In [None]:
def Hyperparameter_study(mask,study_name,is_diff,n_trials):
    """Run an in-memory Optuna study that minimises ``catboost_model`` and return it.

    ``mask`` and ``is_diff`` are forwarded unchanged to ``catboost_model`` on
    every trial; ``n_trials`` is the number of trials to run and ``study_name``
    the name given to the study.
    """
    def objective(trial):
        # Bind the fixed arguments so Optuna only has to supply the trial.
        return catboost_model(mask, trial, is_diff)

    tuning_study = optuna.create_study(study_name=study_name, direction='minimize')
    tuning_study.optimize(objective, n_trials=n_trials)
    return tuning_study

In [None]:
def Hyperparameter_study_visualization(study,target_name,study_name):
    """Show the standard Optuna diagnostic plots for ``study`` and print its best result.

    ``target_name`` is the label used for the objective in every plot;
    ``study_name`` only appears in the printed summary.
    """
    viz = optuna.visualization
    # Shown in this order:
    #   optimization history,
    #   parallel coordinates (interactive view of hyperparameters and scores),
    #   slice plot (which parts of the search space were explored more),
    #   parameter importances,
    #   empirical distribution function of the objective.
    figure_builders = (
        viz.plot_optimization_history,
        viz.plot_parallel_coordinate,
        viz.plot_slice,
        viz.plot_param_importances,
        viz.plot_edf,
    )
    for build_figure in figure_builders:
        build_figure(study, target_name=target_name).show()

    print(f"\tBest value (MAE): {study.best_value:.5f}")
    print(f"\tBest params {study_name}:")
    for param_name, param_value in study.best_params.items():
        print(f"\t\t{param_name}: {param_value}")

In [None]:
# Boolean row masks that split the training table into consumption and production rows.
consumption = df_train_features["is_consumption"] == 1
production = df_train_features["is_consumption"] == 0
# Number of Optuna trials run for each study below.
n_trials = 50

## Consumption_model_Catboost

In [None]:
# Tune the consumption model on the raw target (is_diff=0).
study_consumption = Hyperparameter_study(mask=consumption,study_name="Consumption_model_Catboost",is_diff=0 , n_trials=n_trials)

[I 2024-02-01 00:00:39,228] A new study created in memory with name: Consumption_model_Catboost
[I 2024-02-01 00:02:00,788] Trial 0 finished with value: 63.58144719964348 and parameters: {'iterations': 5000, 'learning_rate': 0.1707890754320389, 'depth': 5, 'random_strength': 43.05296366729851, 'subsample': 1.0, 'min_data_in_leaf': 683, 'reg_lambda': 43.888518304412216}. Best is trial 0 with value: 63.58144719964348.
[I 2024-02-01 00:03:33,155] Trial 1 finished with value: 68.17282872451756 and parameters: {'iterations': 10000, 'learning_rate': 0.09013300970191683, 'depth': 3, 'random_strength': 32.670606958037546, 'subsample': 0.8, 'min_data_in_leaf': 987, 'reg_lambda': 14.172318478171197}. Best is trial 0 with value: 63.58144719964348.
[I 2024-02-01 00:04:55,172] Trial 2 finished with value: 65.1328306851614 and parameters: {'iterations': 6500, 'learning_rate': 0.12446048618088854, 'depth': 4, 'random_strength': 12.322504772686123, 'subsample': 1.0, 'min_data_in_leaf': 959, 'reg_lambd

In [None]:
# Plot the consumption study diagnostics and print its best MAE and parameters.
Hyperparameter_study_visualization(study = study_consumption,target_name="MAE",study_name="Consumption_model_Catboost")

	Best value (MAE): 57.09623
	Best params Consumption_model_Catboost:
		iterations: 9500
		learning_rate: 0.0369884033597276
		depth: 8
		random_strength: 10.157424726456316
		subsample: 0.9
		min_data_in_leaf: 768
		reg_lambda: 40.721327967009245


## Production_model_Catboost

In [None]:
# Tune the production model on the raw target (is_diff=0).
study_production = Hyperparameter_study(mask=production,study_name="Production_model_Catboost",is_diff=0 , n_trials=n_trials)

[I 2024-02-01 02:54:35,447] A new study created in memory with name: Production_model_Catboost
[I 2024-02-01 02:57:29,753] Trial 0 finished with value: 62.15940146638941 and parameters: {'iterations': 9000, 'learning_rate': 0.17317918607011984, 'depth': 6, 'random_strength': 45.459743280093136, 'subsample': 0.9, 'min_data_in_leaf': 382, 'reg_lambda': 31.77313295499663}. Best is trial 0 with value: 62.15940146638941.
[I 2024-02-01 03:00:48,284] Trial 1 finished with value: 58.13004958050111 and parameters: {'iterations': 8500, 'learning_rate': 0.046877111170257994, 'depth': 7, 'random_strength': 34.744753213560585, 'subsample': 0.9, 'min_data_in_leaf': 459, 'reg_lambda': 11.133400584219}. Best is trial 1 with value: 58.13004958050111.
[I 2024-02-01 03:06:22,587] Trial 2 finished with value: 59.51717478114657 and parameters: {'iterations': 9500, 'learning_rate': 0.16095953694776766, 'depth': 9, 'random_strength': 35.55611219614467, 'subsample': 1.0, 'min_data_in_leaf': 450, 'reg_lambda':

In [None]:
# Plot the production study diagnostics and print its best MAE and parameters.
Hyperparameter_study_visualization(study = study_production,target_name="MAE",study_name="Production_model_Catboost")

	Best value (MAE): 55.34135
	Best params Production_model_Catboost:
		iterations: 6500
		learning_rate: 0.019811896659577364
		depth: 8
		random_strength: 14.740821849341899
		subsample: 1.0
		min_data_in_leaf: 766
		reg_lambda: 49.31291264495959


## Consumption_48h_diif_model_Catboost

In [None]:
# Tune the consumption model on target minus target_48h (is_diff=1).
study_consumption_48h_diif = Hyperparameter_study(mask=consumption, study_name="Consumption_48h_diif_model_Catboost", is_diff=1 , n_trials=n_trials)

[I 2024-02-01 05:21:13,358] A new study created in memory with name: Consumption_48h_diif_model_Catboost
[I 2024-02-01 05:22:33,883] Trial 0 finished with value: 69.41876608871874 and parameters: {'iterations': 8500, 'learning_rate': 0.01433954845976601, 'depth': 3, 'random_strength': 16.81199850894088, 'subsample': 0.8, 'min_data_in_leaf': 880, 'reg_lambda': 7.578812093158172}. Best is trial 0 with value: 69.41876608871874.
[I 2024-02-01 05:24:50,471] Trial 1 finished with value: 62.082885781596026 and parameters: {'iterations': 8500, 'learning_rate': 0.0876040269232838, 'depth': 5, 'random_strength': 19.601259113038026, 'subsample': 1.0, 'min_data_in_leaf': 99, 'reg_lambda': 16.052803653749347}. Best is trial 1 with value: 62.082885781596026.
[I 2024-02-01 05:27:40,613] Trial 2 finished with value: 60.91369102986183 and parameters: {'iterations': 8500, 'learning_rate': 0.07536659217863159, 'depth': 6, 'random_strength': 37.373169361277235, 'subsample': 1.0, 'min_data_in_leaf': 541, '

In [None]:
# Plot the consumption 48h-difference study diagnostics and print its best MAE and parameters.
Hyperparameter_study_visualization(study = study_consumption_48h_diif,target_name="MAE",study_name="Consumption_48h_diif_model_Catboost")

	Best value (MAE): 57.07089
	Best params Consumption_48h_diif_model_Catboost:
		iterations: 10000
		learning_rate: 0.0435800174329228
		depth: 9
		random_strength: 32.94004287589372
		subsample: 1.0
		min_data_in_leaf: 688
		reg_lambda: 7.974249285549815


## Production_48h_diif_model_Catboost

In [None]:
# Tune the production model on target minus target_48h (is_diff=1).
study_production_48h_diif = Hyperparameter_study(mask=production, study_name="Production_48h_diif_model_Catboost", is_diff=1 , n_trials=n_trials)

[I 2024-02-01 08:43:41,939] A new study created in memory with name: Production_48h_diif_model_Catboost
[I 2024-02-01 08:47:31,236] Trial 0 finished with value: 56.97466488167443 and parameters: {'iterations': 6500, 'learning_rate': 0.18825463274761806, 'depth': 9, 'random_strength': 21.22151191884936, 'subsample': 0.9, 'min_data_in_leaf': 640, 'reg_lambda': 11.089930369819868}. Best is trial 0 with value: 56.97466488167443.
[I 2024-02-01 08:48:28,886] Trial 1 finished with value: 63.30049582215818 and parameters: {'iterations': 6000, 'learning_rate': 0.029672713494464216, 'depth': 3, 'random_strength': 19.612911091503097, 'subsample': 0.9, 'min_data_in_leaf': 798, 'reg_lambda': 81.89054696229107}. Best is trial 0 with value: 56.97466488167443.
[I 2024-02-01 08:50:17,162] Trial 2 finished with value: 56.903256645401385 and parameters: {'iterations': 5500, 'learning_rate': 0.0366187248558291, 'depth': 6, 'random_strength': 20.564479021167493, 'subsample': 0.9, 'min_data_in_leaf': 289, '

In [None]:
# Plot the production 48h-difference study diagnostics and print its best MAE and parameters.
Hyperparameter_study_visualization(study = study_production_48h_diif,target_name="MAE",study_name="Production_48h_diif_model_Catboost")

	Best value (MAE): 51.48665
	Best params Production_48h_diif_model_Catboost:
		iterations: 7000
		learning_rate: 0.02891697060314784
		depth: 9
		random_strength: 39.11590475003397
		subsample: 0.9
		min_data_in_leaf: 610
		reg_lambda: 15.1511098243293
