<a href="https://colab.research.google.com/github/carlos-alves-one/-Energy-Comp/blob/main/enefit_project_comp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import gc
import pickle
import numpy as np
import pandas as pd
import polars as pl
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import mean_absolute_error
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import VotingRegressor
import lightgbm as lgb
import optuna

In [None]:
class MonthlyKFold:
    """Expanding-window cross-validation splitter over calendar months.

    Each fold tests on one of the last ``n_splits`` distinct months found in
    ``X`` and trains on every row that belongs to a strictly earlier month.
    ``X`` must expose numeric ``"year"`` and ``"month"`` columns.

    The class provides ``split`` and ``get_n_splits`` so an instance can be
    passed as the ``cv`` argument of helpers such as ``cross_val_score``.
    """

    def __init__(self, n_splits=3):
        self.n_splits = n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_idx, test_idx)`` pairs of positional row indices.

        Args:
            X: DataFrame with ``"year"`` and ``"month"`` columns.
            y: Ignored; accepted for splitter-protocol compatibility.
            groups: Ignored; accepted for splitter-protocol compatibility.

        Yields:
            Tuples of integer NumPy arrays, ordered from the oldest held-out
            month to the most recent one.

        Raises:
            ValueError: If ``n_splits`` is smaller than 1, or if ``X`` does
                not contain more distinct months than ``n_splits`` (the
                earliest fold would have no training rows).
        """
        if self.n_splits < 1:
            raise ValueError(f"n_splits must be at least 1, got {self.n_splits}")

        # A single monotonically increasing integer per (year, month) pair.
        month_id = (12 * X["year"] + X["month"]).to_numpy()
        timesteps = np.unique(month_id)  # np.unique returns sorted values
        if self.n_splits >= len(timesteps):
            raise ValueError(
                f"n_splits={self.n_splits} needs more than {self.n_splits} "
                f"distinct months, but X contains {len(timesteps)}"
            )

        for t in timesteps[-self.n_splits:]:
            # Positional indices, independent of whatever index X carries.
            idx_train = np.flatnonzero(month_id < t)
            idx_test = np.flatnonzero(month_id == t)
            yield idx_train, idx_test

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the configured number of folds; all arguments are ignored."""
        return self.n_splits

In [None]:
def to_pandas(X, y=None):
    """Build the pandas frame used for modelling from converted inputs.

    Args:
        X: Object with a ``to_pandas()`` method whose result contains
            ``row_id``, the categorical columns listed below and the columns
            ``target_1`` .. ``target_7``.
        y: Optional object with a ``to_pandas()`` method; when given, its
            columns are appended to the right of ``X``'s columns.

    Returns:
        A DataFrame indexed by ``row_id`` with the categorical columns cast
        to ``category`` dtype and three derived columns appended:
        ``target_mean`` and ``target_std`` (row-wise over ``target_1`` ..
        ``target_6``) and ``target_ratio`` (``target_6 / (target_7 + 1e-3)``).
    """
    categorical = ["county", "is_business", "product_type", "is_consumption", "category_1"]
    stat_columns = [f"target_{i}" for i in range(1, 7)]

    frame = X.to_pandas()
    if y is not None:
        frame = pd.concat([frame, y.to_pandas()], axis=1)

    frame = frame.set_index("row_id")
    frame[categorical] = frame[categorical].astype("category")

    window = frame[stat_columns]
    frame["target_mean"] = window.mean(axis=1)
    frame["target_std"] = window.std(axis=1)
    # The small constant keeps the ratio finite when target_7 is zero.
    frame["target_ratio"] = frame["target_6"] / (frame["target_7"] + 1e-3)
    return frame

# Hyperparameter Search

In [None]:
def lgb_objective(trial):
    """Optuna objective: score one LightGBM configuration on ``df_train``.

    Samples a hyperparameter set from ``trial``, evaluates an
    ``LGBMRegressor`` with a single ``MonthlyKFold`` split of the
    module-level ``df_train`` and returns the mean absolute error
    (lower is better).
    """
    fixed = {
        'n_iter': 1000,
        'verbose': -1,
        'random_state': 42,
        'objective': 'l2',
    }
    # (name, low, high) search ranges; the sampling order is kept stable so
    # that seeded samplers draw the same values.
    float_ranges = (
        ('learning_rate', 0.01, 0.1),
        ('colsample_bytree', 0.5, 1.0),
        ('colsample_bynode', 0.5, 1.0),
        ('lambda_l1', 1e-2, 10.0),
        ('lambda_l2', 1e-2, 10.0),
    )
    int_ranges = (
        ('min_data_in_leaf', 4, 256),
        ('max_depth', 5, 10),
        ('max_bin', 32, 1024),
    )

    sampled = {}
    for name, low, high in float_ranges:
        sampled[name] = trial.suggest_float(name, low, high)
    for name, low, high in int_ranges:
        sampled[name] = trial.suggest_int(name, low, high)

    estimator = lgb.LGBMRegressor(**fixed, **sampled)
    features = df_train.drop(columns=["target"])
    target = df_train["target"]
    fold_scores = cross_val_score(
        estimator,
        features,
        target,
        cv=MonthlyKFold(1),
        scoring='neg_mean_absolute_error',
    )
    # The scorer returns negated MAE; flip the sign so Optuna minimises MAE.
    return -1 * np.mean(fold_scores)

# Feature Engineering


In [None]:
import polars as pl
import numpy as np
def convert_to_polars(*dfs):
    """Return the given frames as a list of Polars DataFrames.

    Arguments that already are ``pl.DataFrame`` instances are passed through
    unchanged; anything else is handed to the ``pl.DataFrame`` constructor.
    The result preserves the argument order.
    """
    converted = []
    for frame in dfs:
        if isinstance(frame, pl.DataFrame):
            converted.append(frame)
        else:
            converted.append(pl.DataFrame(frame))
    return converted
def process_datetime(df, column_name, is_date=False):
    """Ensure ``column_name`` holds ``pl.Date`` or ``pl.Datetime`` values.

    Args:
        df: Polars DataFrame to normalise.
        column_name: Column expected to hold temporal values.
        is_date: When true the target dtype is ``pl.Date``; otherwise it is
            ``pl.Datetime``.

    Returns:
        ``df`` unchanged when the column is absent or already has the target
        dtype; otherwise a new frame with the column converted. String
        columns are parsed, every other dtype is cast.
    """
    if column_name not in df.columns:
        return df

    target_dtype = pl.Date if is_date else pl.Datetime
    current_dtype = df.schema[column_name]
    # Convert only when the column is NOT yet of the requested type; casting
    # a column to the dtype it already has would leave it untouched.
    if current_dtype == target_dtype:
        return df

    if current_dtype == pl.Utf8:
        converted = pl.col(column_name).str.strptime(target_dtype)
    else:
        converted = pl.col(column_name).cast(target_dtype)
    return df.with_columns(converted)
def process_location(df):
    """Return ``df`` with ``latitude`` and ``longitude`` cast to ``pl.Float32``."""
    coordinate_casts = [
        pl.col(axis).cast(pl.Float32) for axis in ("latitude", "longitude")
    ]
    return df.with_columns(*coordinate_casts)
def join_dataframes(df_main, dfs, join_conditions, suffixes):
    """Left-join each frame in ``dfs`` onto ``df_main`` in order.

    The three sequences are walked in lockstep with ``zip``, so iteration
    stops at the shortest one. A join is performed only when every key
    column exists in both frames; otherwise a message is printed and that
    frame is skipped.

    Args:
        df_main: Frame that accumulates the joined columns.
        dfs: Frames to join onto ``df_main``.
        join_conditions: One key per frame, either a column name or a list
            of column names.
        suffixes: Suffix passed to ``join`` for clashing column names.

    Returns:
        The frame resulting from all joins that were performed.
    """
    for other, keys, suffix in zip(dfs, join_conditions, suffixes):
        required = keys if isinstance(keys, list) else [keys]
        joinable = all(
            name in df_main.columns and name in other.columns for name in required
        )
        if not joinable:
            print(f"Skipping join for {suffix} due to missing column in condition: {keys}")
            continue
        df_main = df_main.join(other, on=keys, how="left", suffix=suffix)
    return df_main
def add_time_features(df):
    """Add calendar and cyclical time features derived from ``datetime``.

    When ``df`` has a ``datetime`` column of dtype ``pl.Datetime`` or
    ``pl.Date``, the columns ``dayofyear``, ``hour``, ``day``, ``weekday``,
    ``month`` and ``year`` are added, followed by sine/cosine encodings of
    ``dayofyear`` and ``hour``. Otherwise a warning is printed and ``df`` is
    returned unchanged.

    Args:
        df: Polars DataFrame.

    Returns:
        The frame with the additional feature columns, or ``df`` itself when
        no usable ``datetime`` column is present.
    """
    has_temporal_column = (
        "datetime" in df.columns
        and df.dtypes[df.columns.index("datetime")] in [pl.Datetime, pl.Date]
    )
    if not has_temporal_column:
        print("Warning: 'datetime' column not found. Time features cannot be added.")
        return df

    timestamp = pl.col("datetime")
    df = df.with_columns(
        timestamp.dt.ordinal_day().alias("dayofyear"),
        timestamp.dt.hour().alias("hour"),
        timestamp.dt.day().alias("day"),
        timestamp.dt.weekday().alias("weekday"),
        timestamp.dt.month().alias("month"),
        timestamp.dt.year().alias("year"),
    )
    # The cyclical encodings read `dayofyear` and `hour`, so they must live in
    # a second with_columns call: expressions inside one call only see the
    # columns of the input frame, not the columns created alongside them.
    return df.with_columns(
        (np.pi * pl.col("dayofyear") / 183).sin().alias("sin(dayofyear)"),
        (np.pi * pl.col("dayofyear") / 183).cos().alias("cos(dayofyear)"),
        (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
        (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
    )
def feature_eng(df_data, df_client, df_gas, df_electricity, df_forecast, df_historical, df_location, df_target):
    """Assemble the model feature frame from the raw competition tables.

    Converts every input to a Polars DataFrame, normalises temporal and
    coordinate columns, left-joins the auxiliary tables onto ``df_data`` via
    ``join_dataframes``, adds time features and finally drops the ``date``,
    ``datetime``, ``hour`` and ``dayofyear`` columns.

    Args:
        df_data: Main table the other tables are joined onto.
        df_client, df_gas, df_electricity, df_forecast, df_historical,
        df_location, df_target: Auxiliary tables.

    Returns:
        The joined Polars DataFrame with time features added.
    """
    df_data, df_client, df_gas, df_electricity, df_forecast, df_historical, df_location, df_target = convert_to_polars(df_data, df_client, df_gas, df_electricity, df_forecast, df_historical, df_location, df_target)
    # Normalise the temporal column of each table and the station coordinates.
    df_data = process_datetime(df_data, "datetime")
    df_client = process_datetime(df_client, "date", is_date=True)
    df_gas = process_datetime(df_gas, "forecast_date", is_date=True)
    df_electricity = process_datetime(df_electricity, "forecast_date")
    df_location = process_location(df_location)
    df_forecast = process_datetime(df_forecast, "forecast_datetime")
    df_historical = process_datetime(df_historical, "datetime")
    # NOTE(review): this list has eight entries, but only seven frames and
    # seven suffixes are passed below. join_dataframes pairs them with zip,
    # so the last entry is never used and df_location ends up paired with
    # "datetime" while df_target is paired with ["county", "datetime"].
    # Confirm the intended key for each frame.
    # NOTE(review): the keys assume "date"/"datetime" columns on both sides,
    # whereas the column lists used when loading name them "forecast_date" /
    # "forecast_datetime" for gas, electricity and forecast, and the main
    # table has no "date" column; join_dataframes skips any pair whose key
    # columns are missing. Verify which joins actually run.
    join_conditions = [
        "date",
        ["county", "is_business", "product_type", "date"],
        "datetime",
        "datetime",
        "datetime",
        "datetime",
        ["county", "datetime"],
        "datetime"
    ]
    suffixes = ["_gas", "_client", "_elec", "_fcast", "_hist", "_loc", "_target"]
    # NOTE(review): df_target is joined without any time shift here - confirm
    # this cannot attach the value being predicted to its own row.
    df_data = join_dataframes(df_data, [df_gas, df_client, df_electricity, df_forecast, df_historical, df_location, df_target], join_conditions, suffixes)
    df_data = add_time_features(df_data)
    # "hour" and "dayofyear" are dropped after add_time_features has derived
    # their sine/cosine encodings.
    # NOTE(review): nothing in this function creates a "date" column; confirm
    # that dropping it cannot fail on a missing column.
    df_data = df_data.drop(["date", "datetime", "hour", "dayofyear"])
    return df_data


# Global Variables

In [None]:
# Directory that holds the competition CSV files.
root = "/kaggle/input/predict-energy-behavior-of-prosumers"

# Columns read from each source table.
data_cols = [
    'target',
    'county',
    'is_business',
    'product_type',
    'is_consumption',
    'datetime',
    'row_id',
]
client_cols = [
    'product_type',
    'county',
    'eic_count',
    'installed_capacity',
    'is_business',
    'date',
]
gas_cols = [
    'forecast_date',
    'lowest_price_per_mwh',
    'highest_price_per_mwh',
]
electricity_cols = [
    'forecast_date',
    'euros_per_mwh',
]
forecast_cols = [
    'latitude',
    'longitude',
    'hours_ahead',
    'temperature',
    'dewpoint',
    'cloudcover_high',
    'cloudcover_low',
    'cloudcover_mid',
    'cloudcover_total',
    '10_metre_u_wind_component',
    '10_metre_v_wind_component',
    'forecast_datetime',
    'direct_solar_radiation',
    'surface_solar_radiation_downwards',
    'snowfall',
    'total_precipitation',
]
historical_cols = [
    'datetime',
    'temperature',
    'dewpoint',
    'rain',
    'snowfall',
    'surface_pressure',
    'cloudcover_total',
    'cloudcover_low',
    'cloudcover_mid',
    'cloudcover_high',
    'windspeed_10m',
    'winddirection_10m',
    'shortwave_radiation',
    'direct_solar_radiation',
    'diffuse_radiation',
    'latitude',
    'longitude',
]
location_cols = [
    'longitude',
    'latitude',
    'county',
]
target_cols = [
    'target',
    'county',
    'is_business',
    'product_type',
    'is_consumption',
    'datetime',
]

# Optional pickle paths: set load_path to reuse a fitted model, save_path to
# persist the model after training. None disables the respective step.
save_path = None
load_path = None

In [None]:
def _read_table(filename, columns):
    """Read one CSV from ``root``, keeping ``columns`` and parsing dates."""
    return pl.read_csv(os.path.join(root, filename), columns=columns, try_parse_dates=True)


# Raw competition tables.
df_data = _read_table("train.csv", data_cols)
df_client = _read_table("client.csv", client_cols)
df_gas = _read_table("gas_prices.csv", gas_cols)
df_electricity = _read_table("electricity_prices.csv", electricity_cols)
df_forecast = _read_table("forecast_weather.csv", forecast_cols)
df_historical = _read_table("historical_weather.csv", historical_cols)
df_location = _read_table("weather_station_to_county_mapping.csv", location_cols)
df_target = df_data.select(target_cols)

# Schemas of the training tables, reused as schema overrides when the
# prediction loop converts incoming pandas frames.
schema_data = df_data.schema
schema_client = df_client.schema
schema_gas = df_gas.schema
schema_electricity = df_electricity.schema
schema_forecast = df_forecast.schema
schema_historical = df_historical.schema
schema_target = df_target.schema

In [None]:
# Separate the target column from the features before feature engineering.
y = df_data.select("target")
X = df_data.drop("target")
X = feature_eng(X, df_client, df_gas, df_electricity, df_forecast, df_historical, df_location, df_target)

# Keep rows that have a target value and whose year is later than 2021.
df_train = to_pandas(X, y)
has_target = df_train["target"].notnull()
after_2021 = df_train["year"].gt(2021)
df_train = df_train[has_target & after_2021]
df_train.info(verbose=True)

# Tuned Hyperparameters

In [None]:
# LightGBM settings used for the final models; random_state is left out on
# purpose so each ensemble member can receive its own seed.
best_params = dict(
    n_iter=900,
    verbose=-1,
    objective='l2',
    learning_rate=0.05689066836106983,
    colsample_bytree=0.8915976762048253,
    colsample_bynode=0.5942203285139224,
    lambda_l1=3.6277555139102864,
    lambda_l2=1.6591278779517808,
    min_data_in_leaf=186,
    max_depth=9,
    max_bin=813,
)

# Training

In [None]:
# Either restore a previously fitted ensemble or train a new one.
if load_path is not None:
    # NOTE: unpickling executes arbitrary code - only load files you trust.
    # The context manager closes the file handle once the model is loaded.
    with open(load_path, "rb") as f:
        model = pickle.load(f)
else:
    # Five LightGBM regressors with identical hyperparameters and different
    # seeds (100..104), named lgb_1..lgb_5; VotingRegressor averages them.
    model = VotingRegressor([
        (f'lgb_{i + 1}', lgb.LGBMRegressor(**best_params, random_state=100 + i))
        for i in range(5)
    ])
    model.fit(
        X=df_train.drop(columns=["target"]),
        y=df_train["target"]
    )
if save_path is not None:
    with open(save_path, "wb") as f:
        pickle.dump(model, f)

# Prediction


In [None]:
import enefit


def _as_polars(frame, columns, schema):
    """Select ``columns`` from a pandas frame and convert it to Polars."""
    return pl.from_pandas(frame[columns], schema_overrides=schema)


env = enefit.make_env()
iter_test = env.iter_test()

# Each iteration delivers one batch of test rows plus the newly revealed
# auxiliary data; predictions are submitted through env.predict.
for batch in iter_test:
    (test, revealed_targets, client, historical_weather,
     forecast_weather, electricity_prices, gas_prices, sample_prediction) = batch

    test = test.rename(columns={"prediction_datetime": "datetime"})

    # data_cols[0] is 'target', which the test frame is not asked for.
    df_test = _as_polars(test, data_cols[1:], schema_data)
    df_client = _as_polars(client, client_cols, schema_client)
    df_gas = _as_polars(gas_prices, gas_cols, schema_gas)
    df_electricity = _as_polars(electricity_prices, electricity_cols, schema_electricity)
    df_new_forecast = _as_polars(forecast_weather, forecast_cols, schema_forecast)
    df_new_historical = _as_polars(historical_weather, historical_cols, schema_historical)
    df_new_target = _as_polars(revealed_targets, target_cols, schema_target)

    # Append the new rows to the accumulated history and drop duplicates.
    df_forecast = pl.concat([df_forecast, df_new_forecast]).unique()
    df_historical = pl.concat([df_historical, df_new_historical]).unique()
    df_target = pl.concat([df_target, df_new_target]).unique()

    X_test = feature_eng(df_test, df_client, df_gas, df_electricity, df_forecast, df_historical, df_location, df_target)
    X_test = to_pandas(X_test)

    # Negative predictions are clipped to zero before submission.
    sample_prediction["target"] = model.predict(X_test).clip(0)
    env.predict(sample_prediction)