This notebook was run on Colab. If the output images cannot be viewed locally, please open this link: https://colab.research.google.com/drive/19frvX3pXL22FGZXrE5Ah40soqfReBEWe?usp=sharing

In [None]:
# Colab only: mount Google Drive at /content/drive (the data directory used below lives under it).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Set up the LightGBM GPU environment

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.2 alembic-1.13.1 colorlog-6.8.2 optuna-3.5.0


In [None]:
# Write the NVIDIA OpenCL vendor (ICD) file that names the driver's OpenCL library.
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
# NOTE(review): the recorded output shows this install failing on unmet dependencies, and the
# package index is only refreshed by the line after it - confirm whether this step is still needed.
!sudo apt install nvidia-driver-460 nvidia-cuda-toolkit clinfo
!apt-get update --fix-missing
# Build LightGBM 4.1.0 with USE_GPU=ON, pointing CMake at the OpenCL headers/library under /usr/local/cuda.
!pip install -q  lightgbm==4.1.0 \
  --config-settings=cmake.define.USE_GPU=ON \
  --config-settings=cmake.define.OpenCL_INCLUDE_DIR="/usr/local/cuda/include/" \
  --config-settings=cmake.define.OpenCL_LIBRARY="/usr/local/cuda/lib64/libOpenCL.so"

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
clinfo is already the newest version (3.0.21.02.21-1).
Some packages could not be installed. This may mean that you have
requested an impossible situation or if you are using the unstable
distribution that some required packages have not yet been created
or been moved out of Incoming.
The following information may help to resolve the situation:

The following packages have unmet dependencies:
 libnvidia-compute-510 : Depends: libnvidia-compute-525 but it is not installable
 nvidia-cuda-dev : Breaks: libcuda1 (< 495)
                   Recommends: libnvcuvid1 but it is not installable
[1;31mE: [0mUnable to correct problems, you have held broken packages.[0m
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get

# Import packages

In [None]:
import warnings

warnings.filterwarnings("ignore")

import os
import gc
import datetime
import holidays

import numpy as np
import pandas as pd
import polars as pl
import pyarrow as pa

import lightgbm as lgb
from lightgbm import LGBMRegressor
import optuna
from sklearn.metrics import mean_absolute_error


# Class

## Load data

In [None]:
class DataStorage:
    """Load the competition CSV files into polars DataFrames.

    Each ``*_cols`` class attribute lists the columns read from the matching
    CSV file. After construction the instance exposes one DataFrame per file
    (``df_data``, ``df_client``, ``df_gas_prices``, ``df_electricity_prices``,
    ``df_forecast_weather``, ``df_historical_weather``,
    ``df_weather_station_to_county_mapping``) plus ``df_target``, the
    ``target_cols`` projection of the filtered training data.
    """

    # Default data directory; pass ``root=...`` to the constructor to override it.
    root = "/content/drive/MyDrive/predict-energy-behavior-of-prosumers/Data"

    data_cols = [
        "target",
        "county",
        "is_business",
        "product_type",
        "is_consumption",
        "datetime",
        "row_id",
    ]
    client_cols = [
        "product_type",
        "county",
        "eic_count",
        "installed_capacity",
        "is_business",
        "date",
    ]
    gas_prices_cols = ["forecast_date", "lowest_price_per_mwh", "highest_price_per_mwh"]
    electricity_prices_cols = ["forecast_date", "euros_per_mwh"]
    forecast_weather_cols = [
        "latitude",
        "longitude",
        "origin_datetime",
        "hours_ahead",
        "temperature",
        "dewpoint",
        "cloudcover_high",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_total",
        "10_metre_u_wind_component",
        "10_metre_v_wind_component",
        "forecast_datetime",
        "direct_solar_radiation",
        "surface_solar_radiation_downwards",
        "snowfall",
        "total_precipitation",
    ]
    historical_weather_cols = [
        "datetime",
        "temperature",
        "dewpoint",
        "rain",
        "snowfall",
        "surface_pressure",
        "cloudcover_total",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_high",
        "windspeed_10m",
        "winddirection_10m",
        "shortwave_radiation",
        "direct_solar_radiation",
        "diffuse_radiation",
        "latitude",
        "longitude",
    ]
    location_cols = ["longitude", "latitude", "county"]
    target_cols = [
        "target",
        "county",
        "is_business",
        "product_type",
        "is_consumption",
        "datetime",
    ]

    def __init__(self, root=None):
        """Read every CSV file from the data directory.

        Args:
            root: Optional directory containing the CSV files. When omitted,
                the class-level ``root`` is used.
        """
        if root is not None:
            # Instance attribute shadows the class default for this object only.
            self.root = root

        self.df_data = self._read_csv("train.csv", self.data_cols)
        self.df_client = self._read_csv("client.csv", self.client_cols)
        self.df_gas_prices = self._read_csv("gas_prices.csv", self.gas_prices_cols)
        self.df_electricity_prices = self._read_csv(
            "electricity_prices.csv", self.electricity_prices_cols
        )
        self.df_forecast_weather = self._read_csv(
            "forecast_weather.csv", self.forecast_weather_cols
        )
        self.df_historical_weather = self._read_csv(
            "historical_weather.csv", self.historical_weather_cols
        )
        self.df_weather_station_to_county_mapping = self._read_csv(
            "weather_station_to_county_mapping.csv", self.location_cols
        )

        # Keep only training rows inside the modelling window (both bounds inclusive).
        self.df_data = self.df_data.filter(
            (pl.col("datetime") >= pd.to_datetime("2022-01-01"))&
            (pl.col("datetime") <= pd.to_datetime("2023-05-17"))
        )
        self.df_target = self.df_data.select(self.target_cols)

        # Cast station coordinates to Float32; the weather feature code applies the
        # same cast to its own coordinates before joining on them.
        self.df_weather_station_to_county_mapping = (
            self.df_weather_station_to_county_mapping.with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
        )

    def _read_csv(self, filename, columns):
        """Read ``filename`` from ``self.root``, keeping ``columns`` and parsing dates."""
        return pl.read_csv(
            os.path.join(self.root, filename),
            columns=columns,
            try_parse_dates=True,
        )



## Feature Engineering

In [None]:
class FeaturesGenerator:
    def __init__(self, data_storage):
        """Keep a handle on the loaded tables and pre-compute the Estonian holiday dates."""
        ee_calendar = holidays.country_holidays("EE", years=range(2021, 2026))
        self.data_storage = data_storage
        # Stored as a list of dates; is_country_holiday tests membership against it.
        self.estonian_holidays = [holiday_date for holiday_date in ee_calendar.keys()]
    
    
    def _add_general_features(self, df_features):
        """Add calendar fields, the segment id and cyclical encodings of day-of-year and hour."""
        timestamp = pl.col("datetime")

        # basic time features
        calendar_exprs = [
            timestamp.dt.ordinal_day().alias("dayofyear"),
            timestamp.dt.hour().alias("hour"),
            timestamp.dt.day().alias("day"),
            timestamp.dt.week().alias("week"),
            timestamp.dt.weekday().alias("weekday"),
            timestamp.dt.month().alias("month"),
            timestamp.dt.quarter().alias("quarter"),
            timestamp.dt.year().alias("year"),
        ]

        # One string id per (county, is_business, product_type, is_consumption) combination.
        segment_expr = pl.concat_str(
            "county",
            "is_business",
            "product_type",
            "is_consumption",
            separator="_",
        ).alias("segment")

        # cyclical features encoding
        # (reference: https://towardsdatascience.com/cyclical-features-encoding-its-about-time-ce23581845ca)
        day_angle = np.pi * pl.col("dayofyear") / 183
        hour_angle = np.pi * pl.col("hour") / 12
        cyclical_exprs = [
            day_angle.sin().alias("sin(dayofyear)"),
            day_angle.cos().alias("cos(dayofyear)"),
            hour_angle.sin().alias("sin(hour)"),
            hour_angle.cos().alias("cos(hour)"),
        ]

        # Three separate passes: the cyclical encodings read columns created by the first pass.
        with_calendar = df_features.with_columns(calendar_exprs)
        with_segment = with_calendar.with_columns(segment_expr)
        return with_segment.with_columns(cyclical_exprs)
    
    def is_country_holiday(self, row):
        return (
            datetime.date(row["year"], row["month"], row["day"])
            in self.estonian_holidays
        )

    def _add_holidays_features(self, df_features):
        """Append an ``is_country_holiday`` column computed row by row from year/month/day."""
        date_parts = pl.struct(["year", "month", "day"])
        holiday_flag = date_parts.apply(self.is_country_holiday).alias(
            "is_country_holiday"
        )
        return df_features.with_columns(holiday_flag)


    def _add_client_features(self, df_features):
        """Left-join client attributes, with the client ``date`` shifted forward by two days."""
        # Only client information from two days ago is available for each prediction.
        shifted_date = (pl.col("date") + pl.duration(days=2)).cast(pl.Date)
        client_lagged = self.data_storage.df_client.with_columns(shifted_date)
        join_keys = ["county", "is_business", "product_type", "date"]
        return df_features.join(client_lagged, on=join_keys, how="left")

    
    def _add_forecast_weather_features(self, df_features):
        """Join forecast-weather averages (all stations and per county) onto the features.

        Forecast rows with ``hours_ahead`` between 22 and 45 (inclusive) are
        mapped to counties through the station mapping and averaged twice:
        per ``datetime`` over all stations, and per ``(county, datetime)``.
        Both aggregates are joined at a lag of 0 hours and of 168 hours (one
        week). Columns whose names already exist in ``df_features`` receive
        the join suffix (``_forecast_{lag}h`` / ``_forecast_local_{lag}h``).
        Two local 0h/168h ratio columns are added at the end.

        Args:
            df_features: polars DataFrame with ``datetime`` and ``county`` columns.

        Returns:
            The DataFrame with the forecast-weather columns appended.
        """
        df_forecast_weather = self.data_storage.df_forecast_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_forecast_weather = (
            df_forecast_weather.rename({"forecast_datetime": "datetime"})
            # Each bound needs its own parentheses: `&` binds tighter than `<=`, so
            # `(a >= 22) & a <= 45` would be evaluated as `((a >= 22) & a) <= 45`.
            .filter(
                (pl.col("hours_ahead") >= 22) & (pl.col("hours_ahead") <= 45)
            )
            .drop("hours_ahead")
            .with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude", "origin_datetime")
        )

        # Average over every station for each timestamp.
        df_forecast_weather_date = (
            df_forecast_weather.group_by("datetime").mean().drop("county")
        )

        # Per-county average; stations without a county mapping are excluded.
        df_forecast_weather_local = (
            df_forecast_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        # Use the forecast for the current timestamp and the forecast for the
        # same time of day one week earlier.
        for hours_lag in [0, 7 * 24]:
            df_features = df_features.join(
                df_forecast_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_forecast_{hours_lag}h",
            )
            df_features = df_features.join(
                df_forecast_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_forecast_local_{hours_lag}h",
            )

        # Ratio of the current local forecast to the one from a week earlier;
        # the 1e-3 offset keeps the denominator away from exactly zero.
        df_features = df_features.with_columns(
            (
                pl.col("temperature_forecast_local_0h")
                / (pl.col("temperature_forecast_local_168h") + 1e-3)
            ).alias("temperature_forecast_local_0h/168h"),
            (
                pl.col("surface_solar_radiation_downwards_forecast_local_0h")
                / (pl.col("surface_solar_radiation_downwards_forecast_local_168h") + 1e-3)
            ).alias("surface_solar_radiation_downwards_forecast_local_0h/168h"),
        )

        return df_features

    def _add_historical_weather_features(self, df_features):
        """Join lagged historical-weather averages (all stations and per county).

        Historical observations are mapped to counties and averaged per
        ``datetime`` (all stations) and per ``(county, datetime)``. Both
        aggregates are joined at lags of 48 and 168 hours; the all-station
        aggregate is additionally joined at a 24-hour lag, restricted to
        observations whose hour is 10 or earlier. Ratio features are added at
        the end.

        The ratio expressions address columns by their suffixed names
        (``*_historical_48h``, ``*_historical_local_48h``, ...). A join only
        applies its suffix when the column name already exists, so this method
        relies on the forecast-weather columns having been added beforehand,
        as ``generate_features`` does.

        Args:
            df_features: polars DataFrame with ``datetime`` and ``county`` columns.

        Returns:
            The DataFrame with the historical-weather columns appended.
        """
        df_historical_weather = self.data_storage.df_historical_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_historical_weather = (
            df_historical_weather.with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude")
        )

        # Average over every station for each timestamp.
        df_historical_weather_date = (
            df_historical_weather.group_by("datetime").mean().drop("county")
        )

        # Per-county average; stations without a county mapping are excluded.
        df_historical_weather_local = (
            df_historical_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        # Historical weather from the same time of day two days and one week earlier.
        for hours_lag in [2 * 24, 7 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )
            df_features = df_features.join(
                df_historical_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_historical_local_{hours_lag}h",
            )

        # Historical weather from one day earlier, only for hours 0-10
        # (historical weather data is updated daily at 11 a.m.).
        for hours_lag in [1 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag),
                    pl.col("datetime").dt.hour().alias("hour"),
                )
                .filter(pl.col("hour") <= 10)
                .drop("hour"),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )

        df_features = df_features.with_columns(
            # Local observations two days ago relative to one week ago.
            (
                pl.col("temperature_historical_local_48h")
                / (pl.col("temperature_historical_local_168h") + 1e-3)
            ).alias("temperature_historical_local_48h/168h"),
            (
                pl.col("direct_solar_radiation_historical_local_48h")
                / (pl.col("direct_solar_radiation_historical_local_168h") + 1e-3)
            ).alias("direct_solar_radiation_historical_local_48h/168h"),
            # Observations one day ago (hours 0-10) relative to two days ago.
            # The denominator must be the *_historical_48h column: the unsuffixed
            # `temperature` / `direct_solar_radiation` columns come from the
            # forecast join, not from the 48h historical data.
            (
                pl.col("temperature_historical_24h")
                / (pl.col("temperature_historical_48h") + 1e-3)
            ).alias("temperature_historical_24h/48h"),
            (
                pl.col("direct_solar_radiation_historical_24h")
                / (pl.col("direct_solar_radiation_historical_48h") + 1e-3)
            ).alias("direct_solar_radiation_historical_24h/48h"),
        )

        return df_features

    def _add_target_features(self, df_features):
        """Add lagged target values, aggregated lagged targets, row statistics and ratios.

        Adds, in order:
        * ``target_{h}h`` for every daily lag from 48h to 336h (2 to 14 days);
        * ``target_all_type_sum_{h}h`` and ``target_all_county_type_sum_{h}h``
          for lags of 2, 3, 7 and 14 days;
        * ``target_mean`` / ``target_std`` across the 2- to 7-day lag columns;
        * ``{prefix}_ratio_{a}_{b}`` for the lag pairs listed in the final loop.

        Args:
            df_features: polars DataFrame holding the segment key columns and ``datetime``.

        Returns:
            The DataFrame with the target-derived columns appended.
        """
        df_target = self.data_storage.df_target
        # Target summed over all rows sharing (datetime, county, is_business, is_consumption);
        # the summed product_type column is discarded.
        df_target_all_type_sum = (
            df_target.group_by(["datetime", "county", "is_business", "is_consumption"])
            .sum()
            .drop("product_type")
        )
        # Target summed over all rows sharing (datetime, is_business, is_consumption);
        # the summed product_type and county columns are discarded.
        df_target_all_county_type_sum = (
            df_target.group_by(["datetime", "is_business", "is_consumption"])
            .sum()
            .drop("product_type", "county")
        )

        # Lags of 2..14 days expressed in hours: target value at the same time of day.
        hours_list=[i*24 for i in range(2,15)]

        for hours_lag in hours_list:
            # Shifting the target timestamps forward by the lag aligns each past
            # value with the row it should describe.
            df_features = df_features.join(
                df_target.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_{hours_lag}h"}),
                on=[
                    "county",
                    "is_business",
                    "product_type",
                    "is_consumption",
                    "datetime",
                ],
                how="left",
            )
        
        for hours_lag in [2 * 24, 3 * 24, 7 * 24, 14 * 24]:
            df_features = df_features.join(
            # Lagged per-county total (summed over product types) for 2, 3, 7 and 14 days.
            df_target_all_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_type_sum_{hours_lag}h"}),
                on=["county", "is_business", "is_consumption", "datetime"],
                how="left",
            )
            # Lagged total summed over counties and product types for 2, 3, 7 and 14 days.
            df_features = df_features.join(
                df_target_all_county_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_county_type_sum_{hours_lag}h"}),
                on=["is_business", "is_consumption", "datetime"],
                how="left",
                suffix=f"_all_county_type_sum_{hours_lag}h",
            )
        
        # Row-wise mean and standard deviation over the 2- to 7-day lagged targets.
        cols_for_stats = [
            f"target_{hours_lag}h" for hours_lag in [2 * 24, 3 * 24, 4 * 24, 5 * 24, 6 * 24, 7 *24]
        ]
        # NOTE(review): the std is obtained by transposing the selected columns, taking the
        # column-wise std and transposing back; this looks expensive on a large frame - TODO confirm.
        df_features = df_features.with_columns(
            df_features.select(cols_for_stats).mean(axis=1).alias(f"target_mean"),
            df_features.select(cols_for_stats)
            .transpose()
            .std()
            .transpose()
            .to_series()
            .alias(f"target_std"),
        )
        # Ratios between pairs of lagged columns; each tuple is
        # (column prefix, numerator lag in hours, denominator lag in hours).
        for target_prefix, lag_nominator, lag_denomonator in [
            ("target", 24 * 7, 24 * 14),
            ("target", 24 * 2, 24 * 9),
            ("target", 24 * 3, 24 * 10),
            ("target", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 7, 24 * 14),
            ("target_all_county_type_sum", 24 * 2, 24 * 3),
            ("target_all_county_type_sum", 24 * 7, 24 * 14),
        ]:
            df_features = df_features.with_columns(
                (
                    pl.col(f"{target_prefix}_{lag_nominator}h")
                    # 1e-3 is added to the denominator before dividing.
                    / (pl.col(f"{target_prefix}_{lag_denomonator}h") + 1e-3)
                ).alias(f"{target_prefix}_ratio_{lag_nominator}_{lag_denomonator}")
            )

        return df_features
    
    #The amount of change (difference) in meteorological data from 1h ago, 3h ago, and 24 ago.
    def _additional_features(self,df_features):
        for col in [
                    'temperature',
                    'dewpoint',
                    '10_metre_u_wind_component',
                    '10_metre_v_wind_component',
                    "cloudcover_total",
                    "cloudcover_low",
                    "cloudcover_mid",
                    "cloudcover_high",
                    "windspeed_10m",
                    "winddirection_10m",
                    "shortwave_radiation",
                    "direct_solar_radiation",
                    "diffuse_radiation",
            ]:
            for window in [1,3,24]:
                df_features[f"{col}_diff_{window}"] = df_features.groupby(["county", 'is_consumption', 'product_type', 'is_business'])[col].diff(window)
        return df_features


    def _add_gas_prices_features(self, df_features):
        """Left-join gas prices on ``date`` after shifting the price dates forward by one day."""
        next_day = (pl.col("date") + pl.duration(days=1)).cast(pl.Date)
        gas_prices_lagged = (
            self.data_storage.df_gas_prices
            .rename({"forecast_date": "date"})
            .with_columns(next_day)
        )
        return df_features.join(gas_prices_lagged, on="date", how="left")

    def _add_electricity_prices_features(self, df_features):
        """Left-join electricity prices on ``datetime`` after shifting them forward by 24 hours."""
        one_day_later = pl.col("datetime") + pl.duration(hours=24)
        electricity_prices_lagged = (
            self.data_storage.df_electricity_prices
            .rename({"forecast_date": "datetime"})
            .with_columns(one_day_later)
        )
        return df_features.join(electricity_prices_lagged, on="datetime", how="left")


    def _reduce_memory_usage(self, df_features):
        """Downcast every Float64 column to Float32."""
        float64_columns = pl.col(pl.Float64)
        return df_features.with_columns(float64_columns.cast(pl.Float32))
    
    #Drop columns that are not needed for modelling Because these features are so sparse.
    def _drop_columns(self, df_features):
        """Remove the ``date``, ``datetime`` and ``dayofyear`` columns."""
        unused = ("date", "datetime", "dayofyear")
        return df_features.drop(*unused)
    
    def _to_pandas(self, df_features, y):
        '''Because the format of the category features is required to be "categroy" when training with lightgbm later, 
           the table needs to be converted to a pandas dataframe.
        '''
        cat_cols = [
            "county",
            "is_business",
            "product_type",
            "is_consumption",
            "segment",
            "quarter",
        ]

        if y is not None:
            df_features = pd.concat([df_features.to_pandas(), y.to_pandas()], axis=1)
        else:
            df_features = df_features.to_pandas()

        df_features[cat_cols] = df_features[cat_cols].astype("category")

        for i in ['row_id','year','date']:
            if i in df_features.columns:
                df_features = df_features.drop(i, axis=1)


        return df_features

    def generate_features(self, df_prediction_items):
        """Run the whole feature pipeline and return a pandas DataFrame.

        If the input has a ``target`` column it is split off first and
        re-attached by ``_to_pandas``; otherwise no target is included.
        """
        has_target = "target" in df_prediction_items.columns
        y = df_prediction_items.select("target") if has_target else None
        if has_target:
            df_prediction_items = df_prediction_items.drop("target")

        # Calendar date of each row, used as a join key by later steps.
        df_features = df_prediction_items.with_columns(
            pl.col("datetime").cast(pl.Date).alias("date"),
        )

        # polars stages, applied strictly in this order.
        polars_steps = (
            self._add_general_features,
            self._add_holidays_features,
            self._add_client_features,
            self._add_forecast_weather_features,
            self._add_historical_weather_features,
            self._add_target_features,
            self._add_gas_prices_features,
            self._add_electricity_prices_features,
            self._reduce_memory_usage,
            self._drop_columns,
        )
        for step in polars_steps:
            df_features = step(df_features)

        # The remaining stage works on pandas.
        pandas_features = self._to_pandas(df_features, y)
        return self._additional_features(pandas_features)

# Initialisation

In [None]:
# Load all CSV tables, then hand them to the feature generator.
data_storage = DataStorage()
features_generator = FeaturesGenerator(data_storage=data_storage)

# Feature Generation

In [None]:
# Build the feature table from the filtered training rows.
df_train_features = features_generator.generate_features(data_storage.df_data)
# Keep only rows that have a target value.
df_train_features = df_train_features[df_train_features['target'].notnull()]
df_train_features.shape

(1604324, 216)

# LightGBM Model Hyperparameter Optimization

In [None]:
def lgbm_model(mask,trial,is_diff):
    """Optuna objective: fit one LightGBM regressor and return its hold-out MAE.

    Rows of the module-level ``df_train_features`` selected by ``mask`` are
    split by position: the first 85% are used for training, the remainder for
    early stopping and for the reported score.

    Args:
        mask: Boolean row selector applied to ``df_train_features``.
        trial: Optuna trial used to sample the hyperparameters.
        is_diff: 0 to model the raw target, 1 to model the target minus its
            48-hour lag (missing lags are treated as 0).

    Returns:
        Mean absolute error on the hold-out rows, measured on the same
        (raw or differenced) scale the model was trained on.
    """
    # Apply the mask once and reuse the result.
    subset = df_train_features[mask]
    X = subset.drop(columns=["target"])
    Y = subset["target"] - (subset["target_48h"].fillna(0) * is_diff)

    # A single split index guarantees the two parts are disjoint and together
    # cover every row; computing the tail size from `1 - train_size` is subject
    # to floating-point rounding and can leave a row out.
    train_size=0.85
    split_at = int(len(Y) * train_size)
    X_train, X_test = X.iloc[:split_at], X.iloc[split_at:]
    y_train, y_test = Y.iloc[:split_at], Y.iloc[split_at:]

    param = {
        "device": "gpu" ,
        "random_state": 49,
        "objective": "regression_l1",
        "verbose": -1,
        "early_stopping_rounds":100,


        "n_estimators": trial.suggest_int("n_estimators", 5000,10000,step=500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "num_leaves": trial.suggest_int("num_leaves", 2**2, 2**9, step=4),
        "max_depth": trial.suggest_int("max_depth", 3, 10,step=1),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 100, 1000),
        "lambda_l1": trial.suggest_float("lambda_l1", 0, 10),
        "lambda_l2": trial.suggest_float("lambda_l2", 0, 1000),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "feature_fraction":trial.suggest_float("feature_fraction", 0.95, 1, step = 0.05),
        "feature_fraction_bynode":trial.suggest_float("feature_fraction_bynode", 0.9, 1,step = 0.05),
    }

    # Named `model` so the imported `lgb` module is not shadowed inside this function.
    model = LGBMRegressor(**param)
    # NOTE(review): the hold-out rows drive early stopping and also produce the
    # returned score, so the MAE may be optimistic.
    model.fit(X_train, y_train,
              eval_set=[(X_test, y_test)],
              eval_metric="l1",
              )
    predictions = model.predict(X_test)
    return mean_absolute_error(y_test, predictions)

In [None]:
def Hyperparameter_study(mask,study_name,is_diff,n_trials,seed=None):
    """Create an in-memory Optuna study and minimise the LightGBM hold-out MAE.

    Args:
        mask: Boolean row selector forwarded to ``lgbm_model``.
        study_name: Name given to the Optuna study.
        is_diff: Forwarded to ``lgbm_model`` (0 = raw target, 1 = 48h-differenced).
        n_trials: Number of trials to run.
        seed: Optional seed for the TPE sampler. ``None`` (the default) keeps
            the unseeded behaviour; pass an integer for a repeatable search.

    Returns:
        The optimised ``optuna.study.Study``.
    """
    # TPESampler is what create_study uses by default for a single objective;
    # it is built explicitly here only so that it can be seeded.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(
        direction='minimize',
        study_name=study_name,
        sampler=sampler,
    )

    def objective(trial):
        return lgbm_model(mask, trial, is_diff)

    study.optimize(objective, n_trials=n_trials)
    return study

In [None]:
def Hyperparameter_study_visualization(study,target_name,study_name):
    """Show the standard Optuna diagnostic plots and print the best trial of ``study``."""
    plotters = (
        # objective value over the course of the search
        optuna.visualization.plot_optimization_history,
        # interactive view of hyperparameters against scores
        optuna.visualization.plot_parallel_coordinate,
        # per-parameter view of which regions of the space were explored
        optuna.visualization.plot_slice,
        # parameter importances
        optuna.visualization.plot_param_importances,
        # empirical distribution function of the objective
        optuna.visualization.plot_edf,
    )
    for make_figure in plotters:
        figure = make_figure(study, target_name=target_name)
        figure.show()

    print(f"\tBest value (MAE): {study.best_value:.5f}")
    print(f"\tBest params {study_name}:")
    for param_name in study.best_params:
        print(f"\t\t{param_name}: {study.best_params[param_name]}")

In [None]:
# Boolean row masks that split the feature table by the is_consumption flag.
consumption = df_train_features["is_consumption"] == 1
production = df_train_features["is_consumption"] == 0
# Number of Optuna trials run for each study below.
n_trials = 50

## Consumption_model_lgbm

In [None]:
# Tune on the consumption rows with the raw target (is_diff=0).
study_consumption = Hyperparameter_study(mask=consumption,study_name="Consumption_model_lgbm",is_diff=0 , n_trials=n_trials)

[I 2024-01-31 23:46:08,999] A new study created in memory with name: Consumption_model_lgbm
[I 2024-01-31 23:47:23,600] Trial 0 finished with value: 60.13583612814955 and parameters: {'n_estimators': 8500, 'learning_rate': 0.049759075637355675, 'num_leaves': 256, 'max_depth': 9, 'min_data_in_leaf': 395, 'lambda_l1': 7.921814734368386, 'lambda_l2': 181.25757594962556, 'min_gain_to_split': 2.9947975487181475, 'feature_fraction': 1.0, 'feature_fraction_bynode': 1.0}. Best is trial 0 with value: 60.13583612814955.
[I 2024-01-31 23:47:43,312] Trial 1 finished with value: 65.94952615958087 and parameters: {'n_estimators': 7000, 'learning_rate': 0.1343001743626785, 'num_leaves': 276, 'max_depth': 4, 'min_data_in_leaf': 217, 'lambda_l1': 5.580831715509781, 'lambda_l2': 818.061650014108, 'min_gain_to_split': 9.915784638100703, 'feature_fraction': 0.95, 'feature_fraction_bynode': 0.9}. Best is trial 0 with value: 60.13583612814955.
[I 2024-01-31 23:48:55,905] Trial 2 finished with value: 63.5047

In [None]:
# Plot the search diagnostics and print the best parameters for the consumption study.
Hyperparameter_study_visualization(study = study_consumption,target_name="MAE",study_name="Consumption_model_lgbm")

	Best value (MAE): 59.29599
	Best params Consumption_model_lgbm:
		n_estimators: 9000
		learning_rate: 0.010597453319161714
		num_leaves: 220
		max_depth: 10
		min_data_in_leaf: 422
		lambda_l1: 3.0066029792278446
		lambda_l2: 922.4186954404429
		min_gain_to_split: 1.5738469308231477
		feature_fraction: 0.95
		feature_fraction_bynode: 0.9500000000000001


## Production_model_lgbm

In [None]:
# Tune on the production rows with the raw target (is_diff=0).
study_production = Hyperparameter_study(mask=production,study_name="Production_model_lgbm",is_diff=0 , n_trials=n_trials)

[I 2024-02-01 01:27:17,896] A new study created in memory with name: Production_model_lgbm
[I 2024-02-01 01:27:35,048] Trial 0 finished with value: 68.27453470367875 and parameters: {'n_estimators': 5000, 'learning_rate': 0.1815895949281094, 'num_leaves': 332, 'max_depth': 3, 'min_data_in_leaf': 364, 'lambda_l1': 5.804022384212795, 'lambda_l2': 940.3374349376958, 'min_gain_to_split': 7.838513339305553, 'feature_fraction': 1.0, 'feature_fraction_bynode': 0.9500000000000001}. Best is trial 0 with value: 68.27453470367875.
[I 2024-02-01 01:27:49,505] Trial 1 finished with value: 60.51962623994492 and parameters: {'n_estimators': 7000, 'learning_rate': 0.13771701185439203, 'num_leaves': 300, 'max_depth': 10, 'min_data_in_leaf': 248, 'lambda_l1': 0.4008895162290871, 'lambda_l2': 814.5464679172123, 'min_gain_to_split': 9.165914577063125, 'feature_fraction': 0.95, 'feature_fraction_bynode': 1.0}. Best is trial 1 with value: 60.51962623994492.
[I 2024-02-01 01:28:48,550] Trial 2 finished with 

In [None]:
# Plot the search diagnostics and print the best parameters for the production study.
Hyperparameter_study_visualization(study = study_production,target_name="MAE",study_name="Production_model_lgbm")

	Best value (MAE): 55.12006
	Best params Production_model_lgbm:
		n_estimators: 9000
		learning_rate: 0.09406054013635745
		num_leaves: 348
		max_depth: 7
		min_data_in_leaf: 638
		lambda_l1: 2.874920973422175
		lambda_l2: 624.1350035218543
		min_gain_to_split: 2.1429116267260464
		feature_fraction: 1.0
		feature_fraction_bynode: 1.0


## Consumption_48h_diif_model_lgbm

In [None]:
# Tune on the consumption rows with the target minus its 48h lag (is_diff=1).
study_consumption_48h_diif = Hyperparameter_study(mask=consumption, study_name="Consumption_48h_diif_model_lgbm", is_diff=1 , n_trials=n_trials)

[I 2024-02-01 01:59:37,384] A new study created in memory with name: Consumption_48h_diif_model_lgbm
[I 2024-02-01 02:00:14,447] Trial 0 finished with value: 61.10165057735021 and parameters: {'n_estimators': 10000, 'learning_rate': 0.08698351195286107, 'num_leaves': 244, 'max_depth': 6, 'min_data_in_leaf': 645, 'lambda_l1': 2.343950342674772, 'lambda_l2': 942.8365220434421, 'min_gain_to_split': 4.6716091775128, 'feature_fraction': 0.95, 'feature_fraction_bynode': 0.9500000000000001}. Best is trial 0 with value: 61.10165057735021.
[I 2024-02-01 02:00:42,242] Trial 1 finished with value: 61.68292384083024 and parameters: {'n_estimators': 8500, 'learning_rate': 0.12515469303381435, 'num_leaves': 236, 'max_depth': 7, 'min_data_in_leaf': 443, 'lambda_l1': 1.9779115055312724, 'lambda_l2': 739.0493052908532, 'min_gain_to_split': 4.337605703794269, 'feature_fraction': 0.95, 'feature_fraction_bynode': 0.9500000000000001}. Best is trial 0 with value: 61.10165057735021.
[I 2024-02-01 02:00:57,40

In [None]:
# Plot the search diagnostics and print the best parameters for the differenced consumption study.
Hyperparameter_study_visualization(study = study_consumption_48h_diif,target_name="MAE",study_name="Consumption_48h_diif_model_lgbm")

	Best value (MAE): 58.35007
	Best params Consumption_48h_diif_model_lgbm:
		n_estimators: 7500
		learning_rate: 0.03842668549204932
		num_leaves: 132
		max_depth: 7
		min_data_in_leaf: 756
		lambda_l1: 2.0885247537537683
		lambda_l2: 312.32439866548253
		min_gain_to_split: 0.4361570414220892
		feature_fraction: 0.95
		feature_fraction_bynode: 0.9


## Production_48h_diif_model_lgbm

In [None]:
# Tune on the production rows with the target minus its 48h lag (is_diff=1).
study_production_48h_diif = Hyperparameter_study(mask=production, study_name="Production_48h_diif_model_lgbm", is_diff=1 , n_trials=n_trials)

[I 2024-02-01 03:55:57,517] A new study created in memory with name: Production_48h_diif_model_lgbm
[I 2024-02-01 03:56:16,741] Trial 0 finished with value: 54.940080055554205 and parameters: {'n_estimators': 6000, 'learning_rate': 0.19121257110746206, 'num_leaves': 384, 'max_depth': 6, 'min_data_in_leaf': 880, 'lambda_l1': 3.8504193104904605, 'lambda_l2': 9.528980379418627, 'min_gain_to_split': 8.360904828837993, 'feature_fraction': 0.95, 'feature_fraction_bynode': 0.9500000000000001}. Best is trial 0 with value: 54.940080055554205.
[I 2024-02-01 03:56:41,427] Trial 1 finished with value: 56.58142945148531 and parameters: {'n_estimators': 8500, 'learning_rate': 0.16068982072093052, 'num_leaves': 28, 'max_depth': 5, 'min_data_in_leaf': 644, 'lambda_l1': 7.542187576351075, 'lambda_l2': 443.09811446697546, 'min_gain_to_split': 13.635757612521639, 'feature_fraction': 1.0, 'feature_fraction_bynode': 0.9500000000000001}. Best is trial 0 with value: 54.940080055554205.
[I 2024-02-01 03:57:40

In [None]:
# Plot the search diagnostics and print the best parameters for the differenced production study.
Hyperparameter_study_visualization(study = study_production_48h_diif,target_name="MAE",study_name="Production_48h_diif_model_lgbm")

	Best value (MAE): 52.09550
	Best params Production_48h_diif_model_lgbm:
		n_estimators: 9500
		learning_rate: 0.04232775492179543
		num_leaves: 444
		max_depth: 9
		min_data_in_leaf: 197
		lambda_l1: 1.1897302067741702
		lambda_l2: 92.72276346468713
		min_gain_to_split: 1.6255235205726382
		feature_fraction: 1.0
		feature_fraction_bynode: 1.0
