The goal of this notebook is to set up a global model training framework, where a single model is trained on all pc types.

In [None]:
import os
from typing import Literal

from dotenv import load_dotenv
import mlflow
import numpy as np
import pandas as pd

from constants import processed_names
from constants.paths import PROCESSED_DATA_DIR
from src.utils.data_split import adaptive_train_test_split
from src.utils.logger import logger

# Load variables from a .env file into the process environment.
load_dotenv()
# Configure the MLflow tracking URI from MLFLOW_TRACKING_URI
# (os.getenv returns None when the variable is not set).
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))

# 1. Load and Prepare Data 

We define a function to load data and separate features and target variable from the dataframe. There are different types of features:
- Target variable: `pc_price`.
- Meta features: `region`, `pc_type` and `date`. (Used for grouping and weighting but not as model features.)
- Numerical features: `pc_price_lag_*`, `pc_price_rolling_mean_*`, `regional_avg_price`, `regional_price_volatility_`, `price_deviation_from_regional_avg`, exogenous features like `bpa_capacity_loss_kt` and their lags (fewer lags than for the target), and time features like `month_sin`, `month_cos` and raw `month` or `year`.
- Categorical binary features (can be kept as is, since tree-based models handle $0$ and $1$): `is_recycled`, `is_glass_filled`, `is_flame_retardant`.

In [None]:
def load_and_prepare_data(horizon: int = 3) -> tuple[pd.DataFrame, str, list[str]]:
    """Read the processed dataset for a horizon and identify its model columns.

    The file ``multi_{horizon}m.csv`` is read from ``PROCESSED_DATA_DIR``.
    Every column that is neither the target nor one of the meta columns
    (date, region, pc type) is reported as a feature.

    Args:
        horizon (int, optional): Forecast horizon in months, used to pick
            the input file. Defaults to 3.

    Returns:
        tuple[pd.DataFrame, str, list[str]]: The loaded dataframe, the name
        of the target column, and the list of feature column names (in the
        dataframe's column order).
    """
    csv_path = PROCESSED_DATA_DIR / f"multi_{horizon}m.csv"
    df = pd.read_csv(csv_path)

    target_col = processed_names.LONG_PC_PRICE

    # Columns that must never be fed to the model: meta columns + target.
    # Built once, up front, rather than inside the filtering loop.
    non_feature_cols = [
        processed_names.LONG_DATE,
        processed_names.LONG_REGION,
        processed_names.LONG_PC_TYPE,
        target_col,
    ]
    feature_cols = []
    for name in df.columns:
        if name in non_feature_cols:
            continue
        feature_cols.append(name)

    logger.info(f"Data loaded with {df.shape[0]} rows and {df.shape[1]} columns.")
    logger.info(f"Target: {target_col}. Features: {len(feature_cols)} columns.")

    return df, target_col, feature_cols

In [None]:
# Load the 3-month-horizon dataset; binds the dataframe, the target column
# name and the feature column names as notebook-level variables.
df, target_col, feature_cols = load_and_prepare_data(horizon=3)

In [None]:
# Preview the first rows of the loaded dataframe.
df.head()

# 2. Split Data

Because the data is imbalanced across different pc types, we need to ensure that the train set and the test set contain enough samples from each pc type. This is a problem especially for rare pc types (`gf20` notably). To do this, we use a function that performs an adaptive train-test split, ensuring that each pc type is represented in both sets with a minimum number of samples.

In [None]:
# Split the data into train and test sets, grouping by pc type.
# Only `df` and `group_col` are passed, so every other split setting is
# whatever adaptive_train_test_split uses by default.
train, test = adaptive_train_test_split(
    df=df,
    group_col=processed_names.LONG_PC_TYPE,
)

# 3. Prepare Features and Target

In [None]:
def _with_shifted_target(
    frame: pd.DataFrame,
    target_col: str,
    horizon: int,
) -> pd.DataFrame:
    """Return a copy of ``frame`` with a ``target`` column, unusable rows dropped."""
    group_keys = [processed_names.LONG_REGION, processed_names.LONG_PC_TYPE]
    # `assign` returns a new dataframe, so the caller's frame is left untouched.
    with_target = frame.assign(
        target=frame.groupby(group_keys)[target_col].shift(-horizon)
    )
    # The last `horizon` rows of every group have no future value to predict.
    has_target = with_target["target"].notna()
    return with_target.loc[has_target].copy()


def prepare_training_data(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    feature_cols: list[str],
    target_col: str,
    horizon: int,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, pd.DataFrame, pd.DataFrame]:
    """Prepare features and target variable for training.

    To align the target variable with the features, the target is shifted
    by -horizon rows within each group defined by region and pc_type, and
    stored in a new ``target`` column. Rows whose shifted target is NaN
    (the trailing rows of each group) are dropped.

    The input dataframes are not modified: the ``target`` column is added
    to copies, which are returned as the aligned dataframes.

    NOTE(review): the shift is positional, so it equals "horizon months
    ahead" only if rows within each (region, pc_type) group are sorted by
    date with one row per month -- TODO confirm upstream.

    Args:
        train_df (pd.DataFrame): Training dataframe.
        test_df (pd.DataFrame): Testing dataframe.
        feature_cols (list[str]): List of feature column names.
        target_col (str): Target column name.
        horizon (int): Forecast horizon in months.

    Returns:
        tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray,
        pd.DataFrame, pd.DataFrame]:
            X_train, y_train, X_test, y_test, aligned training and testing
            dataframes (original columns plus ``target``, original index).
    """
    train_df_aligned = _with_shifted_target(train_df, target_col, horizon)
    X_train = train_df_aligned[feature_cols].values
    y_train = train_df_aligned["target"].values
    logger.info(f"Training data prepared with {X_train.shape[0]} samples.")

    test_df_aligned = _with_shifted_target(test_df, target_col, horizon)
    X_test = test_df_aligned[feature_cols].values
    y_test = test_df_aligned["target"].values
    logger.info(f"Testing data prepared with {X_test.shape[0]} samples.")

    return X_train, y_train, X_test, y_test, train_df_aligned, test_df_aligned

In [None]:
# Build the model matrices and the aligned dataframes from the split.
# NOTE(review): horizon=3 is hard-coded here and in the data-loading cell;
# presumably the two values are meant to stay in sync -- confirm.
X_train, y_train, X_test, y_test, train_df_aligned, test_df_aligned = (
    prepare_training_data(
        train_df=train,
        test_df=test,
        feature_cols=feature_cols,
        target_col=target_col,
        horizon=3,
    )
)

# 4. Compute Sample Weights

Because the data is imbalanced across different pc types, we compute sample weights to give more importance to under-represented pc types during model training. This helps the model to learn better representations for these rare pc types. Without this, the model might be biased towards the more common pc types, leading to poor performance on the rare ones. The global performance metric might be good, but the performance on rare pc types would be bad.

Additionally, we can also weight samples based on region: we are only concerned about performance in Europe, so we can give more weight to samples from this region. We keep pc types from all regions in the training set to have more data, but we want to prioritize performance on European pc types.

In [None]:
def compute_sample_weights(
    df: pd.DataFrame,
    group_col: str,
    region_col: str,
    target_region: str,
    method: Literal["inverse_frequency", "sqrt_inverse", "balanced"] = "balanced",
    target_region_weight: float = 1.0,
) -> np.ndarray:
    """Compute per-row sample weights from group frequency and region.

    Each row first receives a weight that depends only on how many rows
    share its ``group_col`` value:

    - ``"inverse_frequency"``: ``1 / count``.
    - ``"sqrt_inverse"``: ``1 / sqrt(count)`` (less aggressive).
    - ``"balanced"``: ``n_samples / (n_groups * count)``, the sklearn-style
      balanced weighting; the weights sum to approximately ``n_samples``.

    Rows whose ``region_col`` equals ``target_region`` are then multiplied
    by ``target_region_weight``. With the default of ``1.0`` the region has
    no effect and ``region_col`` is not read at all.

    Args:
        df (pd.DataFrame): Dataframe containing the data.
        group_col (str): Column name for grouping (e.g., pc_type).
        region_col (str): Column name for region.
        target_region (str): Region to prioritize.
        method (Literal["inverse_frequency", "sqrt_inverse", "balanced"], optional):
            Method to compute weights. Defaults to "balanced".
        target_region_weight (float, optional): Multiplier applied to rows
            from ``target_region``. Values other than 1.0 break the
            "sum is approximately n_samples" property of ``"balanced"``.
            Defaults to 1.0.

    Returns:
        np.ndarray: One float weight per row of ``df``, in row order.

    Raises:
        ValueError: If ``method`` is not one of the supported names, or if
            ``target_region_weight`` is not strictly positive.
    """
    # Validate method
    if method not in ("inverse_frequency", "sqrt_inverse", "balanced"):
        raise ValueError(
            f"Invalid method: {method}. Choose from 'inverse_frequency', "
            "'sqrt_inverse', 'balanced'."
        )
    if target_region_weight <= 0:
        raise ValueError(
            f"target_region_weight must be positive, got {target_region_weight}."
        )

    group_counts = df[group_col].value_counts()

    if method == "inverse_frequency":
        group_weights = 1.0 / group_counts
    elif method == "sqrt_inverse":
        group_weights = 1.0 / np.sqrt(group_counts)
    else:  # "balanced"
        n_samples = len(df)
        n_groups = len(group_counts)
        group_weights = n_samples / (n_groups * group_counts)

    # Broadcast the per-group weight back onto every row.
    weights = df[group_col].map(group_weights).to_numpy(dtype=float)

    # Optional emphasis on the target region; skipped when it is a no-op.
    if target_region_weight != 1.0:
        in_target_region = (df[region_col] == target_region).to_numpy()
        weights = np.where(
            in_target_region, weights * target_region_weight, weights
        )

    return weights

# 5. Define evaluation metrics

# 6. Train Global Model

We train a single global model on all pc types using the computed sample weights, and log it to MLflow.

In [None]:
with mlflow.start_run(run_name="test_model"):
    # Describe the run: one global XGBoost model, 3-month horizon.
    run_tags = {
        "model.type": "global",
        "model.algorithm": "xgboost",
        "data.horizon": 3,
    }
    mlflow.set_tags(run_tags)

    # Per-row weights computed on the aligned training rows, so that the
    # weight vector has exactly one entry per row of X_train.
    sample_weights = compute_sample_weights(
        df=train_df_aligned,
        group_col=processed_names.LONG_PC_TYPE,
        region_col=processed_names.LONG_REGION,
        target_region="Europe",
        method="balanced",
    )

    # Imported here, right before use, as in the original cell.
    from xgboost import XGBRegressor

    xgb_params = {
        "n_estimators": 100,
        "learning_rate": 0.1,
        "max_depth": 6,
        "random_state": 42,
    }
    global_model = XGBRegressor(**xgb_params)

    # The test split is passed as the evaluation set during fitting.
    held_out = [(X_test, y_test)]
    global_model.fit(
        X_train,
        y_train,
        sample_weight=sample_weights,
        eval_set=held_out,
    )

    # Persist the fitted model in the active MLflow run; the first five
    # training rows serve as the input example.
    example_rows = X_train[:5]
    mlflow.xgboost.log_model(
        global_model, name="test_global_model", input_example=example_rows
    )