<a href="https://colab.research.google.com/github/carlos-alves-one/-Energy-Comp/blob/main/enefit_lgbm_baseline_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Necessary Libraries and Packages

In [None]:
import os  # Import the os module for interacting with the operating system
import gc  # Import the garbage collector module for memory management

import numpy as np               # Import numpy for numerical operations and array handling
import pandas as pd              # Import pandas for data manipulation and analysis
import matplotlib.pyplot as plt  # Import matplotlib for creating static, animated, and interactive visualizations

import seaborn as sns  # Import seaborn for statistical data visualization

from sklearn.model_selection import cross_val_score  # Import cross_val_score for cross-validation of models

import xgboost as xgb   # Import XGBoost for gradient boosting framework
import lightgbm as lgb  # Import LightGBM for gradient boosting framework
import catboost as cb   # Import CatBoost for gradient boosting on decision trees

import optuna  # Import Optuna for hyperparameter optimization
import shap    # Import SHAP for explaining the output of machine learning models

from datetime import datetime  # Import datetime for handling date and time
import pytz                    # Import pytz for handling timezone information


# Input Data Files

In [None]:
# Walk the '/kaggle/input' tree and print the full path of every file found, one per line
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)


# Load the Data

In [None]:
# Directory under '/kaggle/input' that holds the dataset files; train.csv is read relative to it
data_dir = "/kaggle/input/predict-energy-behavior-of-prosumers"

## Loading the Dataset Train

In [None]:
# Build the path to train.csv inside the data directory, then read it into a DataFrame
train_csv_path = os.path.join(data_dir, "train.csv")
df_train = pd.read_csv(train_csv_path)

## Basic Information


In [None]:
# Inspect the training data: first rows (transposed), column dtypes, and summary statistics
print(df_train.head(5).T)
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print()
df_train.info()
# Summarise the training frame itself (the name `df` is not defined in this notebook)
print(df_train.describe().T)


In [None]:




# Loaders for the other tables in the data directory (currently disabled):
# df_gprice   = pd.read_csv(os.path.join(data_dir, "gas_prices.csv"))
# df_eprice   = pd.read_csv(os.path.join(data_dir, "electricity_prices.csv"))
# df_client   = pd.read_csv(os.path.join(data_dir, "client.csv"))
# df_weather  = pd.read_csv(os.path.join(data_dir, "forecast_weather.csv"))
# df_hweather = pd.read_csv(os.path.join(data_dir, "historical_weather.csv"))

# Parse the timestamp column so its calendar components can be read
df_train["datetime"] = pd.to_datetime(df_train["datetime"])

# Add one feature column per calendar component of the timestamp
for part in ("month", "day", "weekday", "hour"):
    df_train[part] = getattr(df_train["datetime"].dt, part)

# Store the discrete id / flag columns with the pandas 'category' dtype
for column in ("county", "is_business", "product_type", "is_consumption"):
    df_train[column] = df_train[column].astype("category")

# Index rows by (row_id, datetime), remove the two id columns that are not used
# as features, and discard rows that have no target value
df_train = (
    df_train
    .set_index(["row_id", "datetime"])
    .drop(columns=["prediction_unit_id", "data_block_id"])
    .dropna(subset=["target"])
)


# Declare Class MonthlyKFold

Custom cross-validator designed for time series data, where the data is split based on unique monthly time steps.

In [None]:
class MonthlyKFold:
    """Expanding-window cross-validator that holds out one calendar month per fold.

    The last ``n_splits`` calendar months found in the data are used as test
    folds, in chronological order. For each of them the training set is every
    row from strictly earlier months.

    ``X`` must have a MultiIndex whose second level holds the row timestamps
    (a datetime-like index exposing ``.year`` and ``.month``).
    """

    def __init__(self, n_splits=3):
        # Number of trailing months to hold out, one per fold
        self.n_splits = n_splits

    def split(self, X, y, groups=None):
        """Yield ``(train_positions, test_positions)`` for each held-out month.

        Args:
            X: DataFrame whose index level 1 contains the timestamps.
            y: Unused; accepted for scikit-learn compatibility.
            groups: Unused; accepted for scikit-learn compatibility.

        Yields:
            Two integer NumPy arrays of positional row indices into ``X``.

        Raises:
            ValueError: If ``n_splits`` is smaller than 1, or the data does not
                contain more distinct months than ``n_splits`` (the earliest
                fold would have no training rows).
        """
        timestamps = X.index.get_level_values(1)
        # Encode each row's calendar month as a single sortable integer.
        # Subtracting pd.offsets.MonthBegin(1) is not used here because it moves
        # timestamps that already fall on the 1st of a month back to the
        # previous month, which would put those rows into the wrong fold.
        month_keys = np.asarray(timestamps.year * 12 + timestamps.month)
        # np.unique returns the distinct months in ascending order
        months = np.unique(month_keys)

        if self.n_splits < 1 or self.n_splits >= len(months):
            raise ValueError(
                f"n_splits={self.n_splits} requires at least {self.n_splits + 1} "
                f"distinct months of data, but only {len(months)} found."
            )

        # Hold out each of the last n_splits months in turn
        for held_out in months[-self.n_splits:]:
            train_positions = np.flatnonzero(month_keys < held_out)
            test_positions = np.flatnonzero(month_keys == held_out)
            yield train_positions, test_positions

    def get_n_splits(self, X, y, groups=None):
        """Return the number of folds produced by :meth:`split`."""
        return self.n_splits


# Feature Engineering

`feature_eng` preprocesses a DataFrame before it is passed to the model: it converts the categorical columns, extracts calendar features from the prediction timestamp, and drops the columns that are not used as features.

In [None]:
def feature_eng(df):
    """Turn a raw frame into the feature layout used for prediction.

    The input frame is left untouched; all changes are made on a copy.

    Steps:
      * cast ``county``, ``is_business``, ``product_type`` and
        ``is_consumption`` to the pandas ``category`` dtype (when present);
      * derive ``month``, ``day``, ``weekday`` and ``hour`` from
        ``prediction_datetime`` (when present; unparseable values become NaT);
      * use ``row_id`` as the index (when present);
      * drop the bookkeeping columns that are not model features.

    Args:
        df: pandas DataFrame to preprocess. Every handled column is optional.

    Returns:
        A new DataFrame containing only the feature columns.
    """
    # Work on a copy so the caller's DataFrame is not modified
    df = df.copy()

    # Convert specified columns to 'category' data type
    categorical_columns = ["county", "is_business", "product_type", "is_consumption"]
    for col in categorical_columns:
        if col in df.columns:
            df[col] = df[col].astype('category')

    # Extract datetime features from 'prediction_datetime' if present
    if 'prediction_datetime' in df.columns:
        timestamps = pd.to_datetime(df['prediction_datetime'], errors='coerce')
        df['month'] = timestamps.dt.month      # Extract month
        df['day'] = timestamps.dt.day          # Extract day
        df['weekday'] = timestamps.dt.weekday  # Extract day of the week
        df['hour'] = timestamps.dt.hour        # Extract hour

    # Set 'row_id' as the index if it exists, useful for identification and lookup
    if 'row_id' in df.columns:
        df = df.set_index('row_id')

    # Drop columns that are not features. 'data_block_id' is included because the
    # training preprocessing drops it as well; keeping it here would give the
    # model a different number of columns at prediction time than at fit time.
    non_feature_columns = [
        'currently_scored',
        'prediction_datetime',
        'prediction_unit_id',
        'data_block_id',
    ]
    return df.drop(columns=non_feature_columns, errors='ignore')


# Declare Function lgb_objective

`lgb_objective` is an Optuna objective function for tuning the hyperparameters of a LightGBM regressor. Each trial is scored with the custom `MonthlyKFold` cross-validation strategy, and the function returns the mean absolute error of the model's predictions, which Optuna should minimize.

In [None]:
def lgb_objective(trial):
    """Optuna objective: cross-validated mean absolute error of a LightGBM regressor.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean absolute error averaged over the ``MonthlyKFold(3)`` folds
        (lower is better).
    """
    # Settings that stay constant across trials
    fixed_settings = {
        'n_iter': 200,    # number of boosting iterations
        'verbosity': -1,  # -1 means no output
    }
    # Hyperparameters sampled by Optuna for this trial
    sampled_settings = {
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-2, 10.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-2, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 16, 256),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 4, 256),
    }
    regressor = lgb.LGBMRegressor(**fixed_settings, **sampled_settings)

    # Features are every column of the training frame except the target
    features = df_train.drop(columns=["target"])
    target = df_train["target"]

    # scikit-learn reports MAE as a negative score, so flip the sign of the mean
    fold_scores = cross_val_score(
        regressor,
        features,
        target,
        cv=MonthlyKFold(3),
        scoring='neg_mean_absolute_error',
    )
    return -1 * np.mean(fold_scores)


# Initializing the LightGBM regressor model

With a set of optimized hyperparameters:

In [None]:
# Hyperparameters used for the final LightGBM model
best_params = dict(
    n_iter=300,            # number of boosting iterations
    verbosity=1,           # standard message output
    colsample_bytree=0.8,  # column subsampling setting (same value as colsample_bynode)
    colsample_bynode=0.8,  # column subsampling setting (same value as colsample_bytree)
    max_depth=6,           # maximum depth of a tree
    learning_rate=0.1,     # learning rate
    lambda_l1=1.0,         # L1 regularization term
    lambda_l2=1.0,         # L2 regularization term
    num_leaves=120,        # maximum number of leaves in one tree
    min_data_in_leaf=20,   # minimum data in one leaf
)

# Build the regressor from these parameters
model = lgb.LGBMRegressor(**best_params)


# Train the LightGBM model

Using the Training Data

In [None]:
# Take the target column as the label and every remaining column as a feature
y_train = df_train["target"]
X_train = df_train.drop(columns=["target"])

# Train the LightGBM model on the full training set
model.fit(X_train, y_train)


# Submit API

In [None]:
# 'enefit' provides the environment used to iterate over the test set and submit predictions
# NOTE(review): presumably the competition's submission API module — confirm it is available at runtime
import enefit

# Create the environment and the iterator that yields the test data batch by batch
env = enefit.make_env()
iter_test = env.iter_test()

# Predictions of every successfully processed batch, in iteration order
all_predictions = []

# Flag to indicate if the shape and column names have been printed
printed_info = False

# Set when a batch fails, so the final message does not report success after an error
submission_failed = False

for (test, revealed_targets, client, historical_weather,
     forecast_weather, electricity_prices, gas_prices, sample_prediction) in iter_test:

    # Apply feature engineering
    X_test = feature_eng(test)

    # Print shape and column names only once
    if not printed_info:
        print("\n-> Shape of X_test:", X_test.shape)
        print("\n-> Column names in X_test:")
        for column in X_test.columns:
            print(column)
        printed_info = True

    try:
        # Predict and clip negative values to zero
        y_pred = model.predict(X_test).clip(0)

        # Assign predictions to sample_prediction
        sample_prediction['target'] = y_pred

        # Store the predictions for later use
        all_predictions.append(y_pred)

        # Make submission
        env.predict(sample_prediction)

    except Exception as e:  # Top-level boundary: report the problem, print diagnostics, stop
        submission_failed = True
        print("\n-> Error in prediction or data processing:", e)

        # Print what the model was trained on, to compare against the X_test columns above
        if hasattr(model, 'n_features_'):
            print(f"\n-> Number of features used in the model: {model.n_features_}")

            # If X_train is a pandas DataFrame, we can get the feature names directly
            if isinstance(X_train, pd.DataFrame):
                print("\n-> Feature names:")
                for name in X_train.columns:
                    print(name)
            else:
                print("\nFeature names are not available as X_train is not a pandas DataFrame.")
        else:
            print("The model does not have the attribute 'n_features_'.")

        break  # Exit the loop if there's an error

# Get the current time in GMT
gmt_time = datetime.now(pytz.timezone('GMT'))

# Formats the GMT time into a string with the format "hour:minute:second AM/PM month/day/year GMT"
formatted_time = gmt_time.strftime('%I:%M:%S %p %m/%d/%Y GMT')

# Only report success when every batch was predicted and submitted without an error
if submission_failed:
    print("\n-> Model Submission was NOT completed (stopped after an error) at", formatted_time)
else:
    print("\n-> Model Submission was Made Successfully at", formatted_time)

# After the loop, convert the list of predictions to a DataFrame (one row per iteration)
df_predictions = pd.DataFrame(all_predictions)

# Name the columns after the position of each prediction within its batch
df_predictions.columns = [f'Prediction {j}' for j in range(df_predictions.shape[1])]

# Set the index to represent each iteration
df_predictions.index = [f'Iteration {i}' for i in range(df_predictions.shape[0])]

# Print the DataFrame
print("\n-> All Predictions of The Model:\n")
print(df_predictions)
