<a href="https://colab.research.google.com/github/carlos-alves-one/-Energy-Comp/blob/main/enefit_lgbm_baseline_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries and Packages

In [None]:
import os  # Import the os module for interacting with the operating system
import gc  # Import the garbage collector module for memory management

import numpy as np               # Import numpy for numerical operations and array handling
import pandas as pd              # Import pandas for data manipulation and analysis
import matplotlib.pyplot as plt  # Import matplotlib for creating static, animated, and interactive visualizations

import seaborn as sns  # Import seaborn for statistical data visualization

from sklearn.model_selection import cross_val_score  # Import cross_val_score for cross-validation of models

import xgboost as xgb   # Import XGBoost for gradient boosting framework
import lightgbm as lgb  # Import LightGBM for gradient boosting framework
import catboost as cb   # Import CatBoost for gradient boosting on decision trees

import optuna  # Import Optuna for hyperparameter optimization
import shap    # Import SHAP for explaining the output of machine learning models

from datetime import datetime  # Import datetime for handling date and time
import pytz                    # Import pytz for handling timezone information


# Input Data Files

In [None]:
# Recursively walk the Kaggle input directory and echo the full path of every
# file found, one per line (directories themselves are not printed).
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)


# Load the Data

In [None]:
data_dir = "/kaggle/input/predict-energy-behavior-of-prosumers"

# Only train.csv is used in this cell; the auxiliary tables are left disabled.
# df_gprice = pd.read_csv(os.path.join(data_dir, "gas_prices.csv"))
# df_eprice = pd.read_csv(os.path.join(data_dir, "electricity_prices.csv"))
# df_client = pd.read_csv(os.path.join(data_dir, "client.csv"))
# df_weather = pd.read_csv(os.path.join(data_dir, "forecast_weather.csv"))
# df_hweather = pd.read_csv(os.path.join(data_dir, "historical_weather.csv"))
df_train = pd.read_csv(os.path.join(data_dir, "train.csv"))

# Parse the timestamp column first so the calendar features can be derived from it.
df_train["datetime"] = pd.to_datetime(df_train["datetime"])

df_train = (
    df_train
    # Calendar features derived from the timestamp.
    .assign(
        month=df_train["datetime"].dt.month,
        day=df_train["datetime"].dt.day,
        weekday=df_train["datetime"].dt.weekday,
        hour=df_train["datetime"].dt.hour,
    )
    # Identifier-like columns are stored as pandas categoricals.
    .astype(
        {
            "county": "category",
            "is_business": "category",
            "product_type": "category",
            "is_consumption": "category",
        }
    )
    # (row_id, datetime) becomes the index; MonthlyKFold reads the timestamp
    # from the second index level.
    .set_index(["row_id", "datetime"])
    # Bookkeeping columns that are not used as features.
    .drop(columns=["prediction_unit_id", "data_block_id"])
    # Rows without a target value are discarded.
    .dropna(subset=["target"])
)


# Declare Class MonthlyKFold

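A custom cross-validator for time-series data. The rows are grouped into monthly time steps; each of the last `n_splits` months is used once as the validation fold, and all data from earlier months forms the corresponding training set.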

In [None]:
class MonthlyKFold:
    """Expanding-window cross-validator that validates on whole calendar months.

    Each of the last ``n_splits`` calendar months present in the data is used
    once as the validation fold; the matching training fold is every row from
    strictly earlier months.

    Parameters
    ----------
    n_splits : int, default 3
        Number of trailing months to validate on.
    """

    def __init__(self, n_splits=3):
        self.n_splits = n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_positions, test_positions)`` pairs, oldest month first.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose index has timestamps in its second level
            (level position 1).
        y, groups : ignored
            Accepted only so the signature matches what scikit-learn passes.

        Yields
        ------
        tuple of numpy.ndarray
            Integer row positions (suitable for ``.iloc``) of the training
            and validation rows of one fold.

        Notes
        -----
        If the data contains ``n_splits`` or fewer distinct months, the
        earliest fold has an empty training set and fewer than ``n_splits``
        folds may be produced.
        """
        timestamps = pd.DatetimeIndex(X.index.get_level_values(1))

        # Bucket rows by calendar month with a plain integer key. Subtracting
        # ``MonthBegin(1)`` instead would send timestamps that already fall on
        # the 1st of a month back to the *previous* month, so the first day of
        # every month would end up in the wrong fold.
        month_ids = np.asarray(timestamps.year) * 12 + np.asarray(timestamps.month)

        # np.unique returns the distinct months already sorted ascending.
        months = np.unique(month_ids)

        for month in months[-self.n_splits:]:
            # Positional indices are computed straight from the boolean masks;
            # no copy of X is needed.
            idx_train = np.flatnonzero(month_ids < month)
            idx_test = np.flatnonzero(month_ids == month)
            yield idx_train, idx_test

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the configured number of folds; all arguments are ignored."""
        return self.n_splits


# Feature Engineering

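`feature_eng` preprocesses a DataFrame: it converts the categorical columns to the `category` dtype, extracts month, day, weekday and hour from `prediction_datetime`, sets `row_id` as the index, and drops the helper columns that are no longer needed.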

In [None]:
def feature_eng(df):
    """Return a preprocessed copy of ``df``; the input frame is not modified.

    Steps, each applied only when the relevant column is present:

    * cast ``county``, ``is_business``, ``product_type`` and
      ``is_consumption`` to the ``category`` dtype;
    * derive ``month``, ``day``, ``weekday`` and ``hour`` from
      ``prediction_datetime`` (values that cannot be parsed become ``NaT``,
      which yields missing values in the derived columns);
    * move ``row_id`` into the index;
    * drop ``currently_scored``, ``prediction_datetime`` and
      ``prediction_unit_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame to preprocess.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied.
    """
    # Work on a copy: the column assignments below would otherwise leak into
    # the caller's frame even though a different object is returned.
    df = df.copy()

    categorical_columns = ["county", "is_business", "product_type", "is_consumption"]
    present = [col for col in categorical_columns if col in df.columns]
    if present:
        df = df.astype({col: "category" for col in present})

    # 'currently_scored' is dropped below, so it is deliberately not converted.

    if "prediction_datetime" in df.columns:
        stamps = pd.to_datetime(df["prediction_datetime"], errors="coerce").dt
        df["month"] = stamps.month
        df["day"] = stamps.day
        df["weekday"] = stamps.weekday
        df["hour"] = stamps.hour

    # 'row_id' becomes the index so that it is not treated as a data column.
    if "row_id" in df.columns:
        df = df.set_index("row_id")

    # NOTE(review): the training-data preparation also drops 'data_block_id';
    # it is kept here to preserve existing behaviour — confirm whether callers
    # expect it to remain.
    df = df.drop(
        columns=["currently_scored", "prediction_datetime", "prediction_unit_id"],
        errors="ignore",
    )

    return df
