This notebook is used to make predictions on data using the models trained in `global_model_training.ipynb`.

In [None]:
# Define parameters to retrieve the correct model from MLflow.
# `horizon` and `function` correspond to the `tags.horizon` / `tags.function` run
# tags filtered on by the (currently commented-out) MLflow run lookup further down.
horizon = 3
function = "prediction"
# Matched against `tags.grouped_by_pc_types` in that lookup and also forwarded to
# the data-preparation and prediction calls below.
group_by_pc_types = False

In [None]:
from typing import Any

from dotenv import load_dotenv
import mlflow
import pandas as pd

from constants import processed_names
from constants.paths import SE_PREDICTIONS_DATA_DIR
from src.modeling.multivariate_data_prep import (
    load_and_prepare_data,
)

load_dotenv()

# 1. Load Predictions Data

In [None]:
# Read the four SE prediction exports (univariate / multivariate, each plain and
# grouped) in one pass; the unpacking order matches the file order below.
uni_preds, uni_grouped_preds, multi_preds, multi_grouped_preds = (
    pd.read_csv(SE_PREDICTIONS_DATA_DIR / csv_name)
    for csv_name in (
        "se_predictions_uni.csv",
        "se_predictions_uni_grouped.csv",
        "se_predictions_multi.csv",
        "se_predictions_multi_grouped.csv",
    )
)

# 2. Load prediction models from MLflow

In [None]:
# mlflow_runs = mlflow.search_runs()

In [None]:
# mlflow_runs[
#     (mlflow_runs["tags.horizon"] == str(horizon))
#     & (mlflow_runs["tags.function"] == function)
#     & (mlflow_runs["tags.grouped_by_pc_types"] == str(group_by_pc_types))
# ][["run_id", "tags.mlflow.runName"]]

In [None]:
# One model URI per forecast horizon (3, 6 and 9 months). The active URI in each
# group is the one labelled XGBoost; uncomment one of the alternatives to load a
# different algorithm instead.
model_3m_uri = "models:/m-3c0554ecaddb4f1fafe1e6c3c3070e6b"  # XGBoost
# model_3m_uri = "models:/m-7e484d84f5254e2082d2d740a77c653c"  # Catboost
# model_3m_uri = "models:/m-e28aaa81fc834123a7fc7486a9db66ae"  # RandomForest
# model_3m_uri = "models:/m-964efc13017646d5830bd89454d26715"  # LightGBM

model_6m_uri = "models:/m-ef1ddec2cfc845fb94ca9d7ca4533fc9"  # XGBoost
# model_6m_uri = "models:/m-83edac81cd0649bf81cb7253dc621053"  # Catboost
# model_6m_uri = "models:/m-d2b4272168054f449fc03924417d5028"  # RandomForest
# model_6m_uri = "models:/m-6b45c3ca975743e5b2593f99e1125aff"  # LightGBM

model_9m_uri = "models:/m-8cd9199c03b743d5b57592c5c55f275c"  # XGBoost
# model_9m_uri = "models:/m-568b5a45ea9a4ef78580c4435f4c1c2f"  # Catboost
# model_9m_uri = "models:/m-b2bf0b210964424f92f08989635efd0d"  # RandomForest
# model_9m_uri = "models:/m-347e3f10db3840a0b45e43b2b00f24c8"  # LightGBM

# Load the selected models through the generic pyfunc interface, shortest
# horizon first.
model_3m, model_6m, model_9m = (
    mlflow.pyfunc.load_model(model_uri)
    for model_uri in (model_3m_uri, model_6m_uri, model_9m_uri)
)

# 3. Prepare Data for Predictions

In [None]:
# Load the prepared dataset once at notebook level; `data` is reused by the
# visualization cell below as the source of the historical prices.
# NOTE(review): the horizon is fixed to 6 here instead of using the `horizon`
# parameter defined at the top of the notebook — confirm this is intended.
data, target_col, feature_cols, mappings_dict = load_and_prepare_data(
    group_by_pc_types=group_by_pc_types, horizon=6
)

# 4. Make Predictions

In [None]:
def multi_predict_pc_prices(
    model: Any,
    horizon: int,
    group_by_pc_types: bool,
    data_date: str,
    target_date: str,
    predictions_df: pd.DataFrame,
) -> pd.DataFrame:
    """Predict PC prices for one target date and attach the comparison prices.

    The feature rows dated ``data_date`` are passed to ``model``; the prices
    found in ``predictions_df`` for ``target_date`` are attached next to the
    model output so both can be compared.

    Args:
        model: Loaded model exposing ``predict`` (the notebook passes MLflow
            pyfunc models).
        horizon: Prediction horizon in months, forwarded to
            ``load_and_prepare_data``.
        group_by_pc_types: Whether to group data by PC types, forwarded to
            ``load_and_prepare_data``.
        data_date: Date of the input features.
        target_date: Date for which predictions are made.
        predictions_df: DataFrame holding the prices to compare against. It
            must contain the long-format date and PC price columns. In this
            notebook the SE prediction exports are passed here.

    Returns:
        DataFrame with one row per feature row of ``data_date``. The date
        column is set to ``target_date``; the other columns are the PC type,
        the region, ``"target_date"``, ``"predicted_price"`` and
        ``"actual_price"``.

    Raises:
        ValueError: If the number of comparison prices for ``target_date``
            differs from the number of feature rows for ``data_date``.
    """
    data, _, feature_cols, _ = load_and_prepare_data(
        group_by_pc_types=group_by_pc_types, horizon=horizon
    )
    date_col = processed_names.LONG_DATE

    # Select the feature rows once; they feed both the model input and the
    # identifying columns of the result.
    feature_rows = data[data[date_col] == data_date]

    comparison_prices = predictions_df.loc[
        predictions_df[date_col] == target_date, processed_names.LONG_PC_PRICE
    ].to_numpy()
    # The comparison prices are attached positionally, so at least make sure
    # both sides have the same number of rows and fail with a readable message.
    # NOTE(review): this still assumes both sources list their rows in the same
    # order — confirm against how the prediction files are produced.
    if len(comparison_prices) != len(feature_rows):
        raise ValueError(
            f"Found {len(comparison_prices)} comparison prices for {target_date} "
            f"but {len(feature_rows)} feature rows for {data_date}."
        )

    # Reset the index before prediction, as the model only needs the features.
    preds = model.predict(feature_rows[feature_cols].reset_index(drop=True))

    results_df = feature_rows[
        [
            date_col,
            processed_names.LONG_PC_TYPE,
            processed_names.LONG_REGION,
        ]
    ].copy()
    results_df["target_date"] = target_date
    results_df["predicted_price"] = preds
    results_df["actual_price"] = comparison_prices
    # Report rows under the date they forecast rather than the feature date.
    # The shared column constant keeps this in line with the grouping done by
    # the callers.
    results_df[date_col] = target_date
    return results_df

In [None]:
# multi_predict_pc_prices(
#     model=model_6m,
#     horizon=6,
#     group_by_pc_types=False,
#     data_date="2025-04-01",
#     target_date="2025-10-01",
#     predictions_df=multi_preds,
# )

In [None]:
def multi_predict_all_pc_prices(
    group_by_pc_types: bool, predictions: pd.DataFrame
) -> pd.DataFrame:
    """Predict PC prices for July 2025 through March 2026.

    Uses the notebook-level models ``model_3m``, ``model_6m`` and ``model_9m``:

    * July-September 2025 with the 3-month model,
    * October-December 2025 with the 6-month model,
    * January-March 2026 with the 9-month model.

    All three horizons use the features dated April-June 2025. Comparison
    prices from ``predictions`` are attached for the 2025 targets only; the
    2026 targets carry ``NaN`` in ``"actual_price"``.

    Args:
        group_by_pc_types: Whether to group data by PC types.
        predictions: DataFrame containing the prices to compare against.

    Returns:
        DataFrame indexed by (date, PC type) with the minimum
        ``"predicted_price"`` and ``"actual_price"`` per group.
    """
    date_col = processed_names.LONG_DATE
    pc_type_col = processed_names.LONG_PC_TYPE

    # Collect the per-month frames and concatenate once at the end instead of
    # growing a DataFrame inside the loops.
    frames: list[pd.DataFrame] = []

    # (model, horizon in months, target months of 2025) for the targets that
    # have comparison prices.
    compared_horizons = (
        (model_3m, 3, range(7, 10)),
        (model_6m, 6, range(10, 13)),
    )
    for model, horizon, target_months in compared_horizons:
        for month in target_months:
            frames.append(
                multi_predict_pc_prices(
                    model=model,
                    horizon=horizon,
                    group_by_pc_types=group_by_pc_types,
                    data_date=f"2025-{month - horizon:02d}-01",
                    target_date=f"2025-{month:02d}-01",
                    predictions_df=predictions,
                )
            )

    # The 9-month dataset does not depend on the target month: load it once.
    data, _, feature_cols, _ = load_and_prepare_data(
        group_by_pc_types=group_by_pc_types, horizon=9
    )
    for month in range(1, 4):
        # January-March 2026 are predicted from April-June 2025 features.
        feature_rows = data[data[date_col] == f"2025-{month + 3:02d}-01"]

        results_df = pd.DataFrame()
        results_df["predicted_price"] = model_9m.predict(
            feature_rows[feature_cols]
        )
        results_df[date_col] = f"2026-{month:02d}-01"
        results_df[pc_type_col] = feature_rows[pc_type_col].to_numpy()
        # No comparison prices for these months: use an explicit float NaN
        # column so the concatenated column keeps a numeric dtype.
        results_df["actual_price"] = float("nan")
        frames.append(results_df)

    return (
        pd.concat(frames, axis=0)
        .groupby([date_col, pc_type_col])
        .agg({"predicted_price": "min", "actual_price": "min"})
    )

In [None]:
# Run every horizon against the multivariate SE predictions. The result is
# indexed by (date, PC type) and holds the "predicted_price" and "actual_price"
# columns used by the visualization cells below.
comparison_df = multi_predict_all_pc_prices(
    group_by_pc_types=group_by_pc_types, predictions=multi_preds
)

# 5. Visualize Predictions vs Actual Prices

In [None]:
# Create the comprehensive visualization dataframe: one row per (date, PC type)
# with the columns "actual", "se_forecast" and "our_forecast".

# 1. Get actual data up to 2025-06 (minimum price per date and PC type).
#    The price column is referenced through the shared constant in both the
#    aggregation and the rename so the two can never drift apart.
actual_data = (
    data[data[processed_names.LONG_DATE] <= "2025-06-01"]
    .groupby([processed_names.LONG_DATE, processed_names.LONG_PC_TYPE])
    .agg({processed_names.LONG_PC_PRICE: "min"})
    .rename(columns={processed_names.LONG_PC_PRICE: "actual"})
)

# Date level of the comparison index, looked up once for both filters below.
forecast_dates = comparison_df.index.get_level_values(processed_names.LONG_DATE)

# 2. Get SE forecasts for 2025-07 to 2025-12
#    (stored in the "actual_price" column of the comparison frame).
se_forecast_data = comparison_df[
    (forecast_dates >= "2025-07-01") & (forecast_dates <= "2025-12-01")
][["actual_price"]].rename(columns={"actual_price": "se_forecast"})

# 3. Get our forecasts for 2025-07 to 2026-03
our_forecast_data = comparison_df[
    (forecast_dates >= "2025-07-01") & (forecast_dates <= "2026-03-01")
][["predicted_price"]].rename(columns={"predicted_price": "our_forecast"})

# Combine all data; outer joins keep dates that exist in only one source.
viz_df = actual_data.join(se_forecast_data, how="outer").join(
    our_forecast_data, how="outer"
)
viz_df = viz_df.reset_index()

In [None]:
import matplotlib.dates as mdates
import matplotlib.pyplot as plt


def plot_forecasts_vs_actual(
    viz_df: pd.DataFrame, forecast_start_date: str = "2025-07-01"
):
    """Draw one figure per PC type comparing actuals with both forecasts.

    Each figure shows the actual prices, the SE forecast and our forecast as
    separate lines, plus a dashed vertical marker where the forecasts begin.

    Args:
        viz_df: DataFrame with columns: date, pc_type, actual, se_forecast,
            our_forecast.
        forecast_start_date: Date where forecasts begin (default: "2025-07-01").
    """
    pc_types = sorted(viz_df["pc_type"].unique())
    start_marker = pd.to_datetime(forecast_start_date)

    # (column, legend label, line colour), listed in drawing order.
    series_specs = (
        ("actual", "Actual", "#003A70"),
        ("se_forecast", "SE Forecast", "#3CC956"),
        ("our_forecast", "Our Forecast", "#F10101"),
    )

    for pc_type in pc_types:
        # Every PC type gets its own figure.
        _fig, ax = plt.subplots(1, 1, figsize=(12, 6))

        # Rows of this PC type, ordered chronologically.
        subset = viz_df[viz_df["pc_type"] == pc_type].copy()
        subset["date"] = pd.to_datetime(subset["date"])
        subset = subset.sort_values("date")

        # Draw each series, skipping the dates on which it has no value.
        for column, label, colour in series_specs:
            available = subset[subset[column].notna()]
            ax.plot(
                available["date"],
                available[column],
                marker="o",
                linewidth=1,
                markersize=2,
                label=label,
                color=colour,
                alpha=0.8,
            )

        # Mark where the forecasts begin.
        ax.axvline(
            x=start_marker,
            color="#000000",
            linestyle="--",
            linewidth=1,
            label="Forecast Start",
            alpha=1,
        )

        # Labels, title, legend and grid.
        ax.set_xlabel("Date", fontsize=12)
        ax.set_ylabel("PC Price", fontsize=12)
        ax.set_title(
            f"PC Price Forecasts:Actual vs SE vs Our Model - {pc_type.capitalize()} PC",
            fontsize=14,
            fontweight="bold",
        )
        ax.legend(fontsize=10, loc="best")
        ax.grid(True, alpha=0.3, linestyle="--")
        ax.tick_params(axis="x", rotation=45)

        # Year-month tick labels, one major tick every six months.
        ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m"))
        ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6))

        plt.tight_layout()
        plt.show()

In [None]:
# Render one figure per PC type, using the default forecast start date.
plot_forecasts_vs_actual(viz_df)