This notebook is used to make predictions on data using the models trained in `global_model_training.ipynb`.

In [None]:
# Define parameters to retrieve the correct model from MLflow.
# Each value is compared (as a string where needed) against the run tags when
# the MLflow runs are filtered further down in this notebook.

# Prediction horizon in months; matched against `tags.horizon` and also passed
# to `load_and_prepare_data`.
horizon = 3
# Matched against `tags.function`.
function = "prediction"
# Matched against `tags.grouped_by_pc_types` and also passed to
# `load_and_prepare_data`.
group_by_pc_types = False

In [None]:
from typing import Any

import mlflow
import pandas as pd

from constants import processed_names
from constants.paths import SE_PREDICTIONS_DATA_DIR
from src.modeling.multivariate_data_prep import (
    load_and_prepare_data,
)

# 1. Load Predictions Data

In [None]:
# Read the four SE prediction exports (univariate / multivariate, each with
# and without grouping) from the SE predictions directory, in a fixed order.
uni_preds, uni_grouped_preds, multi_preds, multi_grouped_preds = (
    pd.read_csv(SE_PREDICTIONS_DATA_DIR / csv_name)
    for csv_name in (
        "se_predictions_uni.csv",
        "se_predictions_uni_grouped.csv",
        "se_predictions_multi.csv",
        "se_predictions_multi_grouped.csv",
    )
)

# 2. Load prediction models from MLflow

In [None]:
# Fetch the MLflow runs as a table; its `tags.*` columns are used below to
# find the runs matching the notebook parameters. No arguments are passed, so
# which experiment(s) are searched is left to MLflow's defaults.
mlflow_runs = mlflow.search_runs()

In [None]:
# Runs whose tags match the notebook parameters. Tags are compared as
# strings, hence the str() conversion of the non-string parameters.
matching_runs_mask = (
    mlflow_runs["tags.horizon"].eq(str(horizon))
    & mlflow_runs["tags.function"].eq(function)
    & mlflow_runs["tags.grouped_by_pc_types"].eq(str(group_by_pc_types))
)
# Show only the identifiers needed to pick a run.
mlflow_runs.loc[matching_runs_mask, ["run_id", "tags.mlflow.runName"]]

In [None]:
# Hard-coded MLflow model URIs for the two models used in this notebook.
# NOTE(review): presumably these are the best 3-month and 6-month models
# picked from the run listing above -- confirm when retraining.
model_3m_uri = "models:/m-7e484d84f5254e2082d2d740a77c653c"
model_6m_uri = "models:/m-83edac81cd0649bf81cb7253dc621053"

# Load both models through the generic pyfunc interface.
model_3m = mlflow.pyfunc.load_model(model_3m_uri)
model_6m = mlflow.pyfunc.load_model(model_6m_uri)

# 3. Prepare Data for Predictions

In [None]:
# Build the modelling table for the configured grouping and horizon.
# Only `data` and `feature_cols` are used later in this notebook; `target_col`
# and `mappings_dict` are unpacked but not referenced again in the cells below.
data, target_col, feature_cols, mappings_dict = load_and_prepare_data(
    group_by_pc_types=group_by_pc_types, horizon=horizon
)

# 4. Make Predictions

In [None]:
def multi_predict_pc_prices(
    model: Any,
    data_df: pd.DataFrame,
    feature_cols: list[str],
    data_date: str,
    target_date: str,
    predictions_df: pd.DataFrame,
) -> pd.DataFrame:
    """Predict PC prices for one date and pair them with the SE prices.

    The model is applied to the rows of ``data_df`` dated ``data_date``. The
    resulting predictions are paired **by row position** with the prices found
    in ``predictions_df`` at ``target_date``, then both are reduced to one
    value per PC type by taking the minimum.

    NOTE(review): positional pairing assumes both frames list their rows in
    the same order -- confirm against how the two inputs are produced.

    Args:
        model: Trained MLflow model exposing ``predict``.
        data_df: DataFrame containing the features for prediction.
        feature_cols: List of feature column names to be used for prediction.
        data_date: Date string selecting the input feature rows.
        target_date: Date string selecting the rows of ``predictions_df`` to
            compare against; also written to the ``date`` column.
        predictions_df: DataFrame holding the reference (SE) prices. Its
            values end up in the ``actual_price`` column.

    Returns:
        DataFrame indexed by PC type with the columns ``predicted_price``,
        ``actual_price`` and ``date``.

    Raises:
        ValueError: If the number of feature rows at ``data_date`` differs
            from the number of reference prices at ``target_date``.
    """
    # Select the input rows once; they feed both the model and the result frame.
    input_rows = data_df[data_df[processed_names.LONG_DATE] == data_date]

    reference_prices = predictions_df.loc[
        predictions_df[processed_names.LONG_DATE] == target_date,
        processed_names.LONG_PC_PRICE,
    ].to_numpy()

    # Fail early with a readable message instead of a generic pandas length
    # error raised after the model has already been run.
    if len(reference_prices) != len(input_rows):
        raise ValueError(
            f"Cannot pair rows by position: {len(input_rows)} feature rows for "
            f"{data_date} but {len(reference_prices)} prices for {target_date}."
        )

    # Only the PC type is needed for the aggregation below.
    results_df = input_rows[[processed_names.LONG_PC_TYPE]].copy()
    results_df["predicted_price"] = model.predict(input_rows[feature_cols])
    results_df["actual_price"] = reference_prices

    # One row per PC type: minimum over all rows of that type
    # (presumably one row per region -- confirm).
    results_df = results_df.groupby(processed_names.LONG_PC_TYPE).agg(
        {"predicted_price": "min", "actual_price": "min"}
    )
    results_df["date"] = target_date
    return results_df

In [None]:
# Spot check: use the 6-month model on the features dated 2025-04-01 to
# predict prices for 2025-10-01 and compare them with `multi_preds`.
# NOTE(review): `data` was prepared above with `horizon=horizon` (3) but is fed
# to the 6-month model here -- confirm the prepared features are valid for both.
multi_predict_pc_prices(
    model=model_6m,
    data_df=data,
    feature_cols=feature_cols,
    data_date="2025-04-01",
    target_date="2025-10-01",
    predictions_df=multi_preds,
)

In [None]:
def multi_predict_all_pc_prices(
    group_by_pc_types: bool, horizon: int, predictions: pd.DataFrame
) -> pd.DataFrame:
    """Make predictions for all PC prices from July to December 2025.

    July-September are predicted with the 3-month model from the features
    dated three months earlier; October-December are predicted with the
    6-month model from the features dated six months earlier. Both models are
    read from the module-level names ``model_3m`` and ``model_6m``.

    NOTE(review): the data is prepared once for the single ``horizon`` given
    here but is used with both models -- confirm this is intended.

    Args:
        group_by_pc_types: Whether to group data by PC types.
        horizon: Prediction horizon in months, forwarded to
            ``load_and_prepare_data``.
        predictions: DataFrame containing the reference prices for comparison.

    Returns:
        The per-month results of ``multi_predict_pc_prices`` stacked in
        chronological order (July first, December last).
    """
    data, _, feature_cols, _ = load_and_prepare_data(
        group_by_pc_types=group_by_pc_types, horizon=horizon
    )

    # (model, months between feature date and target date, target months)
    schedule = (
        (model_3m, 3, range(7, 10)),  # July, August, September
        (model_6m, 6, range(10, 13)),  # October, November, December
    )

    # Collect the monthly frames and concatenate once at the end, rather than
    # growing a frame that starts out as an empty DataFrame.
    monthly_results = []
    for model, lag_months, target_months in schedule:
        for target_month in target_months:
            monthly_results.append(
                multi_predict_pc_prices(
                    model=model,
                    data_df=data,
                    feature_cols=feature_cols,
                    data_date=f"2025-{target_month - lag_months:02d}-01",
                    target_date=f"2025-{target_month:02d}-01",
                    predictions_df=predictions,
                )
            )
    return pd.concat(monthly_results, axis=0)

In [None]:
# Run the full July-December 2025 comparison against the multivariate SE
# predictions, using the notebook-level grouping and horizon parameters.
comparison_df = multi_predict_all_pc_prices(
    group_by_pc_types=group_by_pc_types, horizon=horizon, predictions=multi_preds
)

In [None]:
# Display the comparison table (one row per PC type and target date).
comparison_df

# 5. Visualize Predictions vs Actual Prices

In [None]:
import matplotlib.pyplot as plt

# Prepare data for visualization: turn the PC-type index into a column, then
# reshape to one row per (PC type, date, price type).
viz_df = comparison_df.reset_index()
viz_df_melted = viz_df.melt(
    id_vars=["pc_type", "date"],
    value_vars=["predicted_price", "actual_price"],
    var_name="price_type",
    value_name="price",
)

# One panel per date, three panels per row. The grid is sized from the number
# of dates so that more (or fewer) than six dates still plot correctly.
dates = sorted(viz_df["date"].unique())
n_cols = 3
n_rows = max(1, (len(dates) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(18, 5 * n_rows), squeeze=False
)
panels = axes.flatten()  # row-major: panel i sits at row i // 3, column i % 3

# Grouped bar chart per date: our prediction next to the SE prediction
for ax, date in zip(panels, dates):
    date_data = viz_df_melted[viz_df_melted["date"] == date]

    # One pair of bars for each PC type
    pc_types = sorted(date_data["pc_type"].unique())
    x_pos = range(len(pc_types))
    width = 0.35

    predicted = date_data[date_data["price_type"] == "predicted_price"].set_index(
        "pc_type"
    )["price"]
    actual = date_data[date_data["price_type"] == "actual_price"].set_index("pc_type")[
        "price"
    ]

    # Bars are shifted half a width left/right of the tick so they sit side
    # by side; a PC type missing from a series is drawn with height 0.
    ax.bar(
        [x - width / 2 for x in x_pos],
        [predicted.get(pc, 0) for pc in pc_types],
        width,
        label="Our Prediction",
        alpha=0.8,
    )
    ax.bar(
        [x + width / 2 for x in x_pos],
        [actual.get(pc, 0) for pc in pc_types],
        width,
        label="SE Prediction",
        alpha=0.8,
    )

    ax.set_xlabel("PC Type")
    ax.set_ylabel("Price")
    ax.set_title(f"Date: {date}")
    ax.set_xticks(x_pos)
    ax.set_xticklabels(pc_types, rotation=45, ha="right")
    ax.legend()
    ax.grid(True, alpha=0.3)

# Hide panels left over when the number of dates does not fill the last row
for unused_ax in panels[len(dates):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.suptitle(
    "Our Predictions vs SE Predictions for PC Prices by Date and PC Type",
    y=1.02,
    fontsize=16,
)
plt.show()

# 6. Compute MAPE (Mean Absolute Percentage Error)

In [None]:
from sklearn.metrics import mean_absolute_percentage_error


def _mape_percent(frame: pd.DataFrame) -> float:
    """Return the MAPE of predicted vs. reference prices, in percent."""
    return (
        mean_absolute_percentage_error(
            frame["actual_price"], frame["predicted_price"]
        )
        * 100
    )


# MAPE over every (PC type, date) row of the comparison table
mape_overall = _mape_percent(comparison_df)
print(f"Overall MAPE: {mape_overall:.2f}%")

# MAPE per PC type (across all dates)
mape_by_pc_type = viz_df.groupby("pc_type").apply(
    _mape_percent, include_groups=False
)
print("\nMAPE by PC Type:")
for pc_type, mape in mape_by_pc_type.items():
    print(f"  {pc_type}: {mape:.2f}%")

# MAPE per target date (across all PC types)
mape_by_date = viz_df.groupby("date").apply(_mape_percent, include_groups=False)
print("\nMAPE by Date:")
for date, mape in mape_by_date.items():
    print(f"  {date}: {mape:.2f}%")