In [None]:
# Mount Google Drive at /content/drive; the data and model paths used later in
# this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
!sudo apt install nvidia-driver-460 nvidia-cuda-toolkit clinfo
!apt-get update --fix-missing
!pip install -q  lightgbm==4.1.0 \
  --config-settings=cmake.define.USE_GPU=ON \
  --config-settings=cmake.define.OpenCL_INCLUDE_DIR="/usr/local/cuda/include/" \
  --config-settings=cmake.define.OpenCL_LIBRARY="/usr/local/cuda/lib64/libOpenCL.so"

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
clinfo is already the newest version (3.0.21.02.21-1).
Some packages could not be installed. This may mean that you have
requested an impossible situation or if you are using the unstable
distribution that some required packages have not yet been created
or been moved out of Incoming.
The following information may help to resolve the situation:

The following packages have unmet dependencies:
 libnvidia-compute-510 : Depends: libnvidia-compute-525 but it is not installable
 nvidia-cuda-dev : Breaks: libcuda1 (< 495)
                   Recommends: libnvcuvid1 but it is not installable
[1;31mE: [0mUnable to correct problems, you have held broken packages.[0m
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-se

In [None]:
import warnings

# Silence every warning globally for the rest of the notebook.
# NOTE(review): this also hides pandas SettingWithCopyWarning and deprecation notices.
warnings.filterwarnings("ignore")

# Standard library
import os
import gc
import pickle

# Third-party data / plotting libraries
import holidays
import numpy as np
import pandas as pd
import polars as pl
import plotly.express as px

#from datetime import datetime, timedelta
# The module itself is imported (not the class) because the code below calls datetime.date(...).
import datetime

# Modelling
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb

from pathlib import Path
# Base directory on the mounted Drive; the pretrained boosters are loaded relative to it.
data_dir = Path("/content/drive/MyDrive/Colab Notebooks/Eenefit/data")

In [None]:
class DataStorage:
    """Loads the CSV tables found under `root` into polars DataFrames.

    The column lists below select which columns are read from each file. The
    schemas captured at load time are reused to coerce pandas frames passed
    to `update_with_new_data` and `preprocess_test`.
    """

    root = "/content/drive/MyDrive/Colab Notebooks/Eenefit/data/predict-energy-behavior-of-prosumers"

    # Columns read from train.csv; the first entry is the label.
    data_cols = [
        "target", "county", "is_business", "product_type",
        "is_consumption", "datetime", "row_id", "data_block_id",
    ]
    client_cols = [
        "product_type", "county", "eic_count",
        "installed_capacity", "is_business", "date",
    ]
    gas_prices_cols = ["forecast_date", "lowest_price_per_mwh", "highest_price_per_mwh"]
    electricity_prices_cols = ["forecast_date", "euros_per_mwh"]
    forecast_weather_cols = [
        "latitude", "longitude", "hours_ahead",
        "temperature", "dewpoint",
        "cloudcover_high", "cloudcover_low", "cloudcover_mid", "cloudcover_total",
        "10_metre_u_wind_component", "10_metre_v_wind_component",
        "forecast_datetime",
        "direct_solar_radiation", "surface_solar_radiation_downwards",
        "snowfall", "total_precipitation",
    ]
    historical_weather_cols = [
        "datetime",
        "temperature", "dewpoint", "rain", "snowfall", "surface_pressure",
        "cloudcover_total", "cloudcover_low", "cloudcover_mid", "cloudcover_high",
        "windspeed_10m", "winddirection_10m",
        "shortwave_radiation", "direct_solar_radiation", "diffuse_radiation",
        "latitude", "longitude",
    ]
    location_cols = ["longitude", "latitude", "county"]
    # Label plus the segment keys and timestamp: the first six entries of data_cols.
    target_cols = data_cols[:6]

    def _read_table(self, file_name, columns):
        """Read `file_name` from `root`, keeping only `columns` and parsing dates."""
        return pl.read_csv(
            os.path.join(self.root, file_name),
            columns=columns,
            try_parse_dates=True,
        )

    @staticmethod
    def _from_pandas(df_new, columns, schema):
        """Convert the selected pandas columns to polars using the stored schema."""
        return pl.from_pandas(df_new[columns], schema_overrides=schema)

    @staticmethod
    def _append_unique(df_current, df_incoming, key_cols):
        """Stack two frames and keep one row per combination of `key_cols`."""
        return pl.concat([df_current, df_incoming]).unique(key_cols)

    def __init__(self):
        """Read every table, derive `df_target`, and record the loaded schemas."""
        self.df_data = self._read_table("train.csv", self.data_cols)
        self.df_client = self._read_table("client.csv", self.client_cols)
        self.df_gas_prices = self._read_table("gas_prices.csv", self.gas_prices_cols)
        self.df_electricity_prices = self._read_table(
            "electricity_prices.csv", self.electricity_prices_cols
        )
        self.df_forecast_weather = self._read_table(
            "forecast_weather.csv", self.forecast_weather_cols
        )
        self.df_historical_weather = self._read_table(
            "historical_weather.csv", self.historical_weather_cols
        )
        station_mapping = self._read_table(
            "weather_station_to_county_mapping.csv", self.location_cols
        )

        self.df_target = self.df_data.select(self.target_cols)

        # Schemas are captured once so later pandas inputs can be coerced to match.
        self.schema_data = self.df_data.schema
        self.schema_client = self.df_client.schema
        self.schema_gas_prices = self.df_gas_prices.schema
        self.schema_electricity_prices = self.df_electricity_prices.schema
        self.schema_forecast_weather = self.df_forecast_weather.schema
        self.schema_historical_weather = self.df_historical_weather.schema
        self.schema_target = self.df_target.schema

        # Coordinates are stored as Float32 in the station -> county mapping.
        self.df_weather_station_to_county_mapping = station_mapping.with_columns(
            pl.col("latitude").cast(pl.datatypes.Float32),
            pl.col("longitude").cast(pl.datatypes.Float32),
        )

    def update_with_new_data(
        self,
        df_new_client,
        df_new_gas_prices,
        df_new_electricity_prices,
        df_new_forecast_weather,
        df_new_historical_weather,
        df_new_target,
    ):
        """Append newly received pandas frames to the stored polars tables.

        Each input is first converted with the schema recorded at load time;
        only then are the stored tables replaced by the de-duplicated
        concatenation of old and new rows.
        """
        # Convert everything up front so a failed conversion leaves the state untouched.
        new_client = self._from_pandas(
            df_new_client, self.client_cols, self.schema_client
        )
        new_gas = self._from_pandas(
            df_new_gas_prices, self.gas_prices_cols, self.schema_gas_prices
        )
        new_electricity = self._from_pandas(
            df_new_electricity_prices,
            self.electricity_prices_cols,
            self.schema_electricity_prices,
        )
        new_forecast = self._from_pandas(
            df_new_forecast_weather,
            self.forecast_weather_cols,
            self.schema_forecast_weather,
        )
        new_historical = self._from_pandas(
            df_new_historical_weather,
            self.historical_weather_cols,
            self.schema_historical_weather,
        )
        new_target = self._from_pandas(
            df_new_target, self.target_cols, self.schema_target
        )

        self.df_client = self._append_unique(
            self.df_client,
            new_client,
            ["date", "county", "is_business", "product_type"],
        )
        self.df_gas_prices = self._append_unique(
            self.df_gas_prices, new_gas, ["forecast_date"]
        )
        self.df_electricity_prices = self._append_unique(
            self.df_electricity_prices, new_electricity, ["forecast_date"]
        )
        self.df_forecast_weather = self._append_unique(
            self.df_forecast_weather,
            new_forecast,
            ["forecast_datetime", "latitude", "longitude", "hours_ahead"],
        )
        self.df_historical_weather = self._append_unique(
            self.df_historical_weather,
            new_historical,
            ["datetime", "latitude", "longitude"],
        )
        self.df_target = self._append_unique(
            self.df_target,
            new_target,
            ["datetime", "county", "is_business", "product_type", "is_consumption"],
        )

    def preprocess_test(self, df_test):
        """Convert a pandas test frame to polars with the training schema.

        `prediction_datetime` is renamed to `datetime`, and every `data_cols`
        entry except the leading `target` is selected.
        """
        renamed = df_test.rename(columns={"prediction_datetime": "datetime"})
        feature_cols = self.data_cols[1:]
        return pl.from_pandas(renamed[feature_cols], schema_overrides=self.schema_data)

In [None]:
class FeaturesGenerator:
    """Builds the pandas feature matrix used by the model.

    Starting from the rows to predict (a polars frame with the segment keys
    and `datetime`), the generator joins calendar, client, forecast-weather,
    historical-weather, lagged-target and holiday features, then converts
    the result to pandas.
    """

    # Month number as produced by polars `dt.month()` (1-12) -> season bucket id.
    # The original mapping used key 0 instead of 12, so December was never
    # remapped and leaked through as the raw value 12.
    _SEASON_BY_MONTH = {
        2: 0, 3: 0, 4: 0,
        5: 1, 6: 1, 7: 1,
        8: 2, 9: 2, 10: 2,
        11: 3, 12: 3, 1: 3,
    }

    def __init__(self, data_storage):
        """Store the data source and pre-compute the Estonian holiday dates.

        Args:
            data_storage: Object exposing the polars frames `df_client`,
                `df_forecast_weather`, `df_historical_weather`, `df_target`
                and `df_weather_station_to_county_mapping`.
        """
        self.data_storage = data_storage
        estonian_holidays = holidays.country_holidays('EE', years=range(2021, 2026))
        self.estonian_holidays = list(estonian_holidays.keys())

    def _add_general_features(self, df_features):
        """Add calendar parts, the `segment` key, cyclic encodings and `season`."""
        df_features = (
            df_features.with_columns(
                pl.col("datetime").dt.ordinal_day().alias("dayofyear"),
                pl.col("datetime").dt.hour().alias("hour"),
                pl.col("datetime").dt.day().alias("day"),
                pl.col("datetime").dt.weekday().alias("weekday"),
                pl.col("datetime").dt.month().alias("month"),
                pl.col("datetime").dt.year().alias("year"),
            )
            .with_columns(
                pl.concat_str(
                    "county",
                    "is_business",
                    "product_type",
                    "is_consumption",
                    separator="_",
                ).alias("segment"),
            )
            .with_columns(
                # Period of 366 days for day-of-year and 24 hours for hour-of-day.
                (np.pi * pl.col("dayofyear") / 183).sin().alias("sin(dayofyear)"),
                (np.pi * pl.col("dayofyear") / 183).cos().alias("cos(dayofyear)"),
                (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
                (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
            )
        )

        df_features = df_features.with_columns(
            pl.col('month').replace(self._SEASON_BY_MONTH).alias('season')
        )

        return df_features

    def _add_client_features(self, df_features):
        """Join client data (dates shifted +2 days) and fill missing capacity.

        Missing `installed_capacity` values are filled with the mean computed
        within each `product_type`.
        """
        df_client = self.data_storage.df_client

        df_features = df_features.join(
            df_client.with_columns(
                # Client rows are matched to feature rows dated two days later.
                (pl.col("date") + pl.duration(days=2)).cast(pl.Date)
            ),
            on=["county", "is_business", "product_type", "date"],
            how="left",
        )
        df_features = df_features.with_columns(
            pl.col('installed_capacity')
            .fill_null(strategy='mean')
            .over(['product_type'])
        )
        return df_features

    def is_country_holiday(self, row):
        """Return True when the row's (year, month, day) is an Estonian holiday.

        Args:
            row: Mapping with integer `year`, `month` and `day` entries.
        """
        return (
            datetime.date(row["year"], row["month"], row["day"])
            in self.estonian_holidays
        )

    def _add_holidays_features(self, df_features):
        """Add the Int8 flag `is_country_holiday`, computed row by row."""
        df_features = df_features.with_columns(
            pl.struct(["year", "month", "day"])
            .apply(self.is_country_holiday)
            .cast(pl.Int8)
            .alias("is_country_holiday")
        )
        return df_features

    def _add_forecast_weather_features(self, df_features):
        """Join forecast-weather means at lags of 48h, 168h and 0h.

        For each lag, two joins are made: the mean over all stations per
        timestamp, and the per-county mean. Two 0h/168h ratio features are
        added at the end.

        NOTE(review): polars applies `suffix` only to right-hand columns whose
        names collide with existing ones, so the columns from the very first
        join (48h, all-station mean) appear to keep their bare names such as
        `temperature` - confirm before renaming anything.
        """
        df_forecast_weather = self.data_storage.df_forecast_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_forecast_weather = (
            df_forecast_weather.rename({"forecast_datetime": "datetime"})
            # Both comparisons must be parenthesised: `&` binds tighter than `<=`,
            # so the original `(a >= 22) & a <= 45` did not apply the intended range.
            .filter(
                (pl.col("hours_ahead") >= 22) & (pl.col("hours_ahead") <= 45)
            )
            .drop("hours_ahead")
            .with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude", "origin_datetime")
        )

        # Mean over every station for each timestamp.
        df_forecast_weather_date = (
            df_forecast_weather.group_by("datetime").mean().drop("county")
        )

        # Mean per county, using only stations that map to a county.
        df_forecast_weather_local = (
            df_forecast_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        for hours_lag in [2 * 24, 7 * 24, 0]:
            df_features = df_features.join(
                df_forecast_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_forecast_{hours_lag}h",
            )
            df_features = df_features.join(
                df_forecast_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_forecast_local_{hours_lag}h",
            )

        # The 1e-3 offset keeps the ratios finite when the denominator is zero.
        df_features = df_features.with_columns(
            (
                pl.col("temperature_forecast_local_0h")
                / (pl.col("temperature_forecast_local_168h") + 1e-3)
            ).alias("temperature_forecast_local_0h/168h"),
            (
                pl.col("surface_solar_radiation_downwards_forecast_local_0h")
                / (pl.col("surface_solar_radiation_downwards_forecast_local_168h") + 1e-3)
            ).alias("surface_solar_radiation_downwards_forecast_local_0h/168h"),
        )

        return df_features

    def _add_historical_weather_features(self, df_features):
        """Join historical-weather means at lags of 48h, 168h and 24h.

        The 48h and 168h lags are joined both as all-station and per-county
        means. The 24h lag is joined as an all-station mean only, restricted
        to rows whose hour is <= 10. Four ratio features are added at the end.
        """
        df_historical_weather = self.data_storage.df_historical_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_historical_weather = (
            df_historical_weather.with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude")
        )

        # Mean over every station for each timestamp.
        df_historical_weather_date = (
            df_historical_weather.group_by("datetime").mean().drop("county")
        )

        # Mean per county, using only stations that map to a county.
        df_historical_weather_local = (
            df_historical_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        for hours_lag in [2 * 24, 7 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )
            df_features = df_features.join(
                df_historical_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_historical_local_{hours_lag}h",
            )

        for hours_lag in [1 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag),
                    pl.col("datetime").dt.hour().alias("hour"),
                )
                # Only observations with hour <= 10 are used for the 24h lag.
                .filter(pl.col("hour") <= 10)
                .drop("hour"),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )

        # NOTE(review): the bare `temperature` / `direct_solar_radiation` columns used
        # as denominators below presumably come from the first forecast-weather join
        # rather than from historical data - confirm this is intended.
        df_features = df_features.with_columns(
            (
                pl.col("temperature_historical_local_48h")
                / (pl.col("temperature_historical_local_168h") + 1e-3)
            ).alias("temperature_historical_local_48h/168h"),
            (
                pl.col("direct_solar_radiation_historical_local_48h")
                / (pl.col("direct_solar_radiation_historical_local_168h") + 1e-3)
            ).alias("direct_solar_radiation_historical_local_48h/168h"),
            (
                pl.col("temperature_historical_24h")
                / (pl.col("temperature") + 1e-3)
            ).alias("temperature_historical_24h/48h"),
            (
                pl.col("direct_solar_radiation_historical_24h")
                / (pl.col("direct_solar_radiation") + 1e-3)
            ).alias("direct_solar_radiation_historical_24h/48h"),
        )

        return df_features

    def _add_target_features(self, df_features):
        """Add lagged targets, aggregated lagged targets, stats, ratios and log1p.

        Steps, in order:
          1. per-segment target lags for every day from 2 to 14 days back;
          2. lags (2, 3, 7, 14 days) of the target summed over product types
             and, separately, over product types and counties;
          3. row-wise mean/std of the 2-5 day lags;
          4. ratios between selected lags;
          5. rolling mean/std of `target_48h` (window 12) within each county;
          6. in-place log1p of all per-segment lags and `installed_capacity`.
        The log1p step runs last, so steps 3-5 are computed on raw values.
        """
        df_target = self.data_storage.df_target

        df_target_all_type_sum = (
            df_target.group_by(["datetime", "county", "is_business", "is_consumption"])
            .sum()
            .drop("product_type")
        )

        df_target_all_county_type_sum = (
            df_target.group_by(["datetime", "is_business", "is_consumption"])
            .sum()
            .drop("product_type", "county")
        )

        # 48h, 72h, ..., 336h.
        segment_lags = [days * 24 for days in range(2, 15)]
        for hours_lag in segment_lags:
            df_features = df_features.join(
                df_target.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_{hours_lag}h"}),
                on=[
                    "county",
                    "is_business",
                    "product_type",
                    "is_consumption",
                    "datetime",
                ],
                how="left",
            )

        for hours_lag in [2 * 24, 3 * 24, 7 * 24, 14 * 24]:
            df_features = df_features.join(
                df_target_all_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_type_sum_{hours_lag}h"}),
                on=["county", "is_business", "is_consumption", "datetime"],
                how="left",
            )

            df_features = df_features.join(
                df_target_all_county_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_county_type_sum_{hours_lag}h"}),
                on=["is_business", "is_consumption", "datetime"],
                how="left",
                suffix=f"_all_county_type_sum_{hours_lag}h",
            )

        cols_for_stats = [
            f"target_{hours_lag}h" for hours_lag in [2 * 24, 3 * 24, 4 * 24, 5 * 24]
        ]
        df_features = df_features.with_columns(
            df_features.select(cols_for_stats).mean(axis=1).alias("target_mean"),
            # Row-wise std obtained by transposing, taking the column std, and transposing back.
            df_features.select(cols_for_stats)
            .transpose()
            .std()
            .transpose()
            .to_series()
            .alias("target_std"),
        )

        for target_prefix, lag_nominator, lag_denomonator in [
            ("target", 24 * 7, 24 * 14),
            ("target", 24 * 2, 24 * 9),
            ("target", 24 * 3, 24 * 10),
            ("target", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 7, 24 * 14),
            ("target_all_county_type_sum", 24 * 2, 24 * 3),
            ("target_all_county_type_sum", 24 * 7, 24 * 14),
        ]:
            df_features = df_features.with_columns(
                (
                    pl.col(f"{target_prefix}_{lag_nominator}h")
                    / (pl.col(f"{target_prefix}_{lag_denomonator}h") + 1e-3)
                ).alias(f"{target_prefix}_ratio_{lag_nominator}_{lag_denomonator}")
            )

        # Additional target features.
        target_sel_cols = [f"target_{hours_lag}h" for hours_lag in segment_lags]
        group_0 = ['county', 'product_type', 'is_consumption', 'is_business']

        for col in ['target_48h']:
            for window in [12]:
                expressions = [
                    pl.col(col).rolling_mean(window).over(['county']).alias(f"{col}_rmean_{window}"),
                    pl.col(col).rolling_std(window).over(['county']).alias(f"{col}_rstd_{window}"),
                ]
                df_features = df_features.with_columns(expressions)

        # Overwrite each column with its log1p value (no alias, so the name is kept).
        for col in target_sel_cols + ['installed_capacity']:
            expressions = [
                pl.col(col)
                .log1p()
                .over(group_0)
            ]
            df_features = df_features.with_columns(expressions)

        return df_features

    def _reduce_memory_usage(self, df_features):
        """Downcast every Float64 column to Float32."""
        df_features = df_features.with_columns(pl.col(pl.Float64).cast(pl.Float32))
        return df_features

    def _drop_columns(self, df_features):
        """Drop the helper columns `datetime` and `segment`."""
        df_features = df_features.drop(
            "datetime", "segment"
        )
        return df_features

    def _to_pandas(self, df_features, y):
        """Convert to pandas, append the target if given, and set categoricals.

        Args:
            df_features: polars feature frame.
            y: single-column polars frame with the target, or None.

        Returns:
            pandas DataFrame with the four segment key columns cast to
            `category` and `row_id` removed when present.
        """
        cat_cols = [
            "county",
            "is_business",
            "product_type",
            "is_consumption",
        ]

        if y is not None:
            df_features = pd.concat([df_features.to_pandas(), y.to_pandas()], axis=1)
        else:
            df_features = df_features.to_pandas()

        df_features[cat_cols] = df_features[cat_cols].astype("category")

        if 'row_id' in df_features.columns:
            df_features = df_features.drop("row_id", axis=1)

        return df_features

    def generate_features(self, df_prediction_items):
        """Run the full feature pipeline on the given prediction rows.

        Args:
            df_prediction_items: polars frame with the segment keys and
                `datetime`; an optional `target` column is split off before
                feature generation and re-attached as the last column.

        Returns:
            pandas DataFrame of features (plus `target` when it was supplied).
        """
        if "target" in df_prediction_items.columns:
            df_prediction_items, y = (
                df_prediction_items.drop("target"),
                df_prediction_items.select("target"),
            )
        else:
            y = None

        df_features = df_prediction_items.with_columns(
            pl.col("datetime").cast(pl.Date).alias("date"),
        )

        for add_features in [
            self._add_general_features,
            self._add_client_features,
            self._add_forecast_weather_features,
            self._add_historical_weather_features,
            self._add_target_features,
            self._add_holidays_features,
            self._reduce_memory_usage,
            self._drop_columns,
        ]:
            df_features = add_features(df_features)

        df_features = self._to_pandas(df_features, y)

        return df_features

In [None]:
# Load all source tables once, then bind the feature generator to that storage.
data_storage = DataStorage()
features_generator = FeaturesGenerator(data_storage=data_storage)

In [None]:
# Quick look at the first two rows of the target table.
data_storage.df_target.head(2)

target,county,is_business,product_type,is_consumption,datetime
f64,i64,i64,i64,i64,datetime[μs]
0.713,0,0,1,0,2021-09-01 00:00:00
96.59,0,0,1,1,2021-09-01 00:00:00


In [None]:
# Build the feature matrix for the whole training table and keep labelled rows only.
df_train_features = features_generator.generate_features(data_storage.df_data)
df_train_features = df_train_features.loc[df_train_features['target'].notna()]

# Regression label: log1p(target) minus the 48h-lag feature (already log1p-transformed
# by the feature generator); a missing lag counts as 0. Stored as float32.
df_train_features['target'] = (
    np.log1p(df_train_features['target'])
    - df_train_features['target_48h'].fillna(0)
).astype(np.float32)

display(df_train_features.head(3))
df_train_features.shape

Unnamed: 0,county,is_business,product_type,is_consumption,data_block_id,date,dayofyear,hour,day,weekday,...,target_ratio_72_240,target_ratio_48_72,target_all_type_sum_ratio_48_72,target_all_type_sum_ratio_168_336,target_all_county_type_sum_ratio_48_72,target_all_county_type_sum_ratio_168_336,target_48h_rmean_12,target_48h_rstd_12,is_country_holiday,target
0,0,0,1,0,0,2021-09-01,244,0,1,3,...,,,,,,,,,0,0.538246
1,0,0,1,1,0,2021-09-01,244,0,1,3,...,,,,,,,,,0,4.580775
2,0,0,2,0,0,2021-09-01,244,0,1,3,...,,,,,,,,,0,0.0


(2017824, 202)

In [None]:
# Summarise dtypes and memory footprint of the feature matrix.
df_train_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2017824 entries, 0 to 2018351
Columns: 202 entries, county to target
dtypes: category(4), datetime64[ns](1), float32(187), float64(1), int16(1), int32(1), int64(2), int8(5)
memory usage: 1.5 GB


In [None]:
# Sanity check: list the column names that contain each substring of interest.
for needle in ('data', 'date'):
    print([name for name in df_train_features.columns.to_list() if needle in name])

['data_block_id']
['date']


In [None]:
#df_train_features.to_parquet('train_features_5.parquet')
#!cp -r /content/train_features_5.parquet "/content/drive/MyDrive/Colab Notebooks/Eenefit/data"

In [None]:
# Validation window: val_start inclusive, val_end exclusive.
val_start = np.datetime64('2023-02-01')
val_end = np.datetime64('2023-04-30')

# Training window: the 45 days immediately before validation, both ends inclusive.
trn_start = val_start - np.timedelta64(45, 'D')
trn_end = val_start - np.timedelta64(1, 'D')

# Filter first and copy only the selected rows. The original deep-copied the whole
# feature matrix twice before filtering, which needlessly doubles peak memory.
df_train = df_train_features[
    (df_train_features['date'] >= trn_start) & (df_train_features['date'] <= trn_end)
].copy()

df_valid = df_train_features[
    (df_train_features['date'] >= val_start) & (df_train_features['date'] < val_end)
].copy()

In [None]:
# Show the resolved training window bounds.
print(trn_start)
print(trn_end)

2022-12-18
2023-01-31


In [None]:
class Model:
    """LightGBM ensemble with one group of boosters per data segment.

    The data is split into four segments by (`is_consumption`, `is_business`).
    For every segment `num_iters` boosters are trained with different seeds,
    each continuing from a pretrained booster loaded from `data_dir`. The
    models predict the difference between the log1p target and the
    `target_48h` feature; `predict` undoes that transformation.
    """

    def __init__(self):
        """Set up hyper-parameters, column roles and empty model containers."""
        self.params_0 = {
            'device': 'gpu',
            'seed': 42,
            'objective': 'regression_l1',
            'boosting_type': 'gbdt',
            'force_col_wise': True,

            'num_iterations': 3000,
            'learning_rate': 0.04,
            "feature_fraction" : 0.9,
            "feature_fraction_bynode" : 0.8,

            "bagging_fraction": 0.9,
            "bagging_freq": 2,

            "max_depth": 7,
            "num_leaves": 70,
            "min_data_in_leaf": 20,
        }
        self.params_1 = {
            'device': 'gpu',
            "seed": 42,
            'objective': 'regression_l1',
            'boosting_type': 'gbdt',
            'force_col_wise': True,
            'num_iterations': 3000,
            'learning_rate': 0.04,
            "feature_fraction" : 0.8,
            "feature_fraction_bynode" : 0.7,

            "bagging_fraction": 0.9,
            "bagging_freq": 2,

            "max_depth": 8,
            "num_leaves": 130,
            "min_data_in_leaf": 20,
        }
        # Indexed by the is_business flag: params[0] for 0, params[1] for 1.
        self.params = [self.params_0, self.params_1]
        self.categorical_features = ['county', 'product_type', 'month', 'weekday',
                                     'is_country_holiday', 'season']
        # Columns removed before the frame is handed to LightGBM.
        self.drop_cols = ['is_consumption', 'is_business', 'date', 'data_block_id']

        # Naming: models_lgb_<is_consumption><is_business>.
        self.models_lgb_00 = []
        self.models_lgb_01 = []
        self.models_lgb_10 = []
        self.models_lgb_11 = []
        # Position of a segment in this list is 2 * is_consumption + is_business.
        self.all_models = [self.models_lgb_00, self.models_lgb_01,
                           self.models_lgb_10, self.models_lgb_11]

        # Number of differently seeded boosters trained per segment.
        self.num_iters = 3

    def _segment_frame(self, df, is_consumption, is_business):
        """Return the row mask and model-ready frame for one segment.

        The frame is a new object: `drop_cols` are removed and the categorical
        features are cast to int8, leaving the caller's DataFrame unchanged.
        """
        mask = (df['is_consumption'] == is_consumption) & (df['is_business'] == is_business)
        frame = (
            df.loc[mask]
            .drop(columns=self.drop_cols)
            .astype({col: np.int8 for col in self.categorical_features})
        )
        return mask, frame

    def fit(self, df_train_features):
        """Train `num_iters` boosters for each of the four segments.

        Args:
            df_train_features: pandas frame containing the features, the
                segment flags and a `target` column. It is not modified.

        Each booster continues training from the pretrained model stored at
        `data_dir / "LGBM_diff_models/model_lgb_<i><j>"`. Per-run seeds are
        drawn from a generator seeded with the configured seed, so repeated
        fits are reproducible. Previously fitted models are discarded first.
        """
        rng = np.random.default_rng(self.params_0['seed'])

        # Start from empty containers so a second fit does not leave stale models
        # in the positions that predict() reads.
        for segment_models in self.all_models:
            segment_models.clear()

        for i in range(2):
            for j in range(2):
                _, x = self._segment_frame(df_train_features, i, j)
                y = x.pop('target')
                ds = lgb.Dataset(x, label=y, free_raw_data=False)

                segment_models = self.all_models[2 * i + j]
                for _ in range(self.num_iters):
                    # Copy the params so the shared configuration is never mutated.
                    run_params = dict(self.params[j], seed=int(rng.integers(5000)))

                    model = lgb.train(
                        run_params,
                        categorical_feature=self.categorical_features,
                        train_set=ds,
                        init_model=lgb.Booster(
                            model_file=data_dir/f"LGBM_diff_models/model_lgb_{i}{j}"
                            )
                    )
                    segment_models.append(model)
                del x, y, ds
            gc.collect()

    def predict(self, df_features):
        """Predict the target on the original scale for every row.

        Args:
            df_features: pandas frame with the same feature columns used for
                training (without `target`). It is not modified.

        Returns:
            numpy array aligned with the rows of `df_features`. Rows that fall
            in none of the four segments keep the initial value 0.
        """
        predictions = np.zeros(len(df_features))

        for i in range(2):
            for j in range(2):
                mask, x = self._segment_frame(df_features, i, j)
                if not mask.any():
                    # Nothing to predict for this segment.
                    continue

                # The models predict log1p(target) - target_48h; add the lag back.
                baseline = df_features.loc[mask, 'target_48h'].fillna(0).to_numpy()
                segment_models = self.all_models[2 * i + j]

                results = []
                for k in range(self.num_iters):
                    preds = segment_models[k].predict(x) + baseline
                    results.append(np.clip(preds, 0.0, 11.0))

                preds_mean = np.mean(np.array(results), axis=0)
                predictions[mask.to_numpy()] = np.expm1(preds_mean)

        return predictions


In [None]:
%%time
# Train the segment-wise ensemble on the training window.
model = Model()
model.fit(df_train)

CPU times: user 1h 20min 3s, sys: 16.7 s, total: 1h 20min 19s
Wall time: 10min 41s


In [None]:
# Predict on the validation window; the label column is dropped before prediction.
preds = model.predict(df_valid.drop('target', axis=1))
# Invert the label transform (target = log1p(y) - target_48h) to recover y on the original scale.
y_true = np.expm1(df_valid['target'] + df_valid['target_48h'].fillna(0))

In [None]:
# Validation MAE on the original target scale.
mean_absolute_error(y_true, preds)

49.59222306179301