## Ensemble modelling notebook

In [None]:
from pathlib import Path
import matplotlib.pyplot as plt
from typing import List, Tuple
import numpy as np
import pandas as pd
from src.data_preprocessing.data_loader import load_data, time_split
from src.modeling.evaluation import smape, mae
from src.modeling.univariate_modeling import (
    get_best_cv_model,
)
from src.data_preprocessing.feature_engineering import preprocessor
from config.config_data import (
    GROUPING_FUNCS,
    GROUPING_NAMES,
    GROUPING_VARS,
    TIME_VARS,
    DROP_VARS,
)
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing

### Initializing the variables

In [None]:
# Relative location of the project data folder (three directory levels up).
DATA_DIR = Path(
    "..",
    "..",
    "..",
    "hfactory_magic_folders",
    "plastic_cost_prediction",
    "data",
)
# File name of the dataset loaded from DATA_DIR.
MAIN_FILE = "PA6_cleaned_dataset.csv"

In [None]:
# Load the main dataset file located in DATA_DIR.
df = load_data(DATA_DIR / MAIN_FILE)

### Best univariate models for each column

In [None]:
# Select the best univariate model for each column of df.
# NOTE(review): the result is used below as a dict keyed by column name whose
# values hold a "selected" model type plus that model's settings -- confirm
# against get_best_cv_model.
models = get_best_cv_model(df)

In [None]:
# Remove missing values before building the splits.
df = df.dropna()
# `split` is iterated below as (train_idx, test_idx) pairs of positional indexes.
split = time_split(df)

### Ensemble modelling to predict the best compound price

In [None]:
def univar_preds_per_split(
    df: pd.DataFrame,
    target: str,
    train_idx: np.ndarray[int],
    test_idx: np.ndarray[int],
    univar_models: dict[str, dict],
    horizons: tuple[int, ...] = (3, 6, 9),
) -> pd.DataFrame:
    """Predict every non-target column at the requested horizons of one split.

    For each column other than ``target``, the model type stored under
    ``univar_models[col]["selected"]`` is fitted on the training rows only and
    used to predict the column at each horizon of the test window (horizon
    ``h`` is the ``h``-th row of the test window).

    Parameters
    ----------
    df: pd.DataFrame
      Original dataset for which we want to get the univar model predictions.
    target: str
      Target variable of the dataset; no prediction is produced for it.
    train_idx: np.ndarray[int]
      Positional indexes of the elements in the training set.
    test_idx: np.ndarray[int]
      Positional indexes of the elements in the test set.
    univar_models: dict[str, dict]
      Dict of models with evaluation information for each column. Each value
      must contain a ``"selected"`` key naming the model type and a key with
      that name holding the model settings.
    horizons: tuple[int, ...]
      1-based forecast horizons inside the test window. Defaults to
      ``(3, 6, 9)``.

    Returns
    -------
    univar_preds_df: pd.DataFrame
      One row per horizon, indexed by the corresponding label of ``df.index``
      (index name ``"time"``), and one column per predicted variable. Columns
      whose selected model type is not one of "baseline", "ARIMA", "ETS" or
      "XGB" are omitted.
    """
    # 0-based offsets of the requested horizons inside the test window
    positions = [horizon - 1 for horizon in horizons]
    n_train = len(train_idx)

    best_models = {}
    for col, info in univar_models.items():
        if col == target:
            continue
        model_type = info["selected"]
        params = info[model_type]
        train = df[col].iloc[train_idx]
        test = df[col].iloc[test_idx]

        if model_type == "baseline":
            # constant forecast equal to the last *training* observation;
            # using the last row of df would leak information from the future
            best_models[col] = list(np.ones(len(horizons)) * train.iloc[-1])

        elif model_type == "ARIMA":
            # fit ARIMA with best order and seasonal order
            model_fit = ARIMA(
                train,
                order=params["order"],
                seasonal_order=params["seasonal_order"],
            ).fit()
            forecast = np.asarray(
                model_fit.predict(start=test.index[0], end=test.index[-1])
            )
            # positional selection, independent of the forecast's index type
            best_models[col] = [forecast[pos] for pos in positions]

        elif model_type == "ETS":
            # fit ETS with best trend, season and seasonal periods
            model_fit = ExponentialSmoothing(
                train,
                trend=params["trend"],
                seasonal=params["seasonal"],
                seasonal_periods=params["seasonal_periods"],
            ).fit()
            forecast = np.asarray(
                model_fit.predict(start=test.index[0], end=test.index[-1])
            )
            best_models[col] = [forecast[pos] for pos in positions]

        elif model_type == "XGB":
            combined_idx = np.append(train_idx, test_idx)
            preds_for_model = []
            for horizon in horizons:
                # copies, so the lag columns are never written into a view of df
                X_train = df.iloc[train_idx].copy()
                X_all = df.iloc[combined_idx].copy()

                # lags of at least `horizon` steps, so the features of a test
                # row only use values of `col` known at forecast time
                for lag in range(horizon, horizon + 12):
                    X_train[f"lag_{lag}"] = X_train[col].shift(lag)
                    X_all[f"lag_{lag}"] = X_all[col].shift(lag)

                # get train data
                X_train = X_train.dropna()
                y_train = X_train[col]
                X_train = X_train.drop(columns=col)

                # the test row for this horizon sits right after the training
                # rows in the combined frame; selecting it explicitly avoids
                # predicting on a training row
                X_test = X_all.iloc[[n_train + horizon - 1]].drop(columns=col)

                # fit model for fold and horizon
                model = XGBRegressor(max_depth=3)
                model.fit(X_train, y_train)
                preds_for_model.append(model.predict(X_test)[0])
            best_models[col] = preds_for_model

    # creating the dataframe
    univar_preds_df = pd.DataFrame.from_dict(best_models)
    # adding time as the index of the dataframe
    test_labels = df.index[test_idx]
    univar_preds_df["time"] = [test_labels[pos] for pos in positions]
    univar_preds_df = univar_preds_df.set_index("time")
    return univar_preds_df

In [None]:
def ensemble_method_univar_preds(
    df: pd.DataFrame,
    target: str,
    model_list: List[str],
    split: List[Tuple[np.ndarray[int], np.ndarray[int]]],
    univar_models: dict[str, dict],
) -> dict[str, dict]:
    """Cross-validate ensemble models fed with univariate predictions.

    For every model name and every split, the ensemble model is fitted on the
    feature-engineered training rows. It is then evaluated on the 3rd, 6th and
    9th test points, using as inputs the univariate predictions returned by
    ``univar_preds_per_split`` (passed through the same feature engineering).

    Parameters
    ----------
    df: pd.DataFrame
      Dataframe for which we want to make predictions.
    target: str
      Target variable.
    model_list: List[str]
      List of ensemble models we want to test. Supported names are
      "Ridge", "Lasso", "LR", "XGB" and "RF".
    split: List[Tuple[np.ndarray[int], np.ndarray[int]]]
      Different (train_idx, test_idx) splits used in cross validation.
    univar_models: dict[str, dict]
      Dict of models with evaluation information for each column.

    Returns
    -------
    results: dict[str, dict]
      Dictionary containing, for each ensemble model: the SMAPE and MAE
      averaged over the folds ("SMAPE", "MAE"), the SMAPE and MAE per horizon
      computed across folds ("SMAPE_3", "MAE_3", ...), the flat list of
      predictions ("preds") and the per-fold metrics ("smape_per_fold",
      "mae_per_fold").

    Raises
    ------
    ValueError
      If ``model_list`` contains a name that is not supported.
    """
    # zero-argument factories, so that every fold gets a freshly built estimator
    model_factories = {
        "Ridge": lambda: Ridge(),
        "Lasso": lambda: Lasso(),
        "LR": lambda: LinearRegression(),
        "XGB": lambda: XGBRegressor(max_depth=3),
        "RF": lambda: RandomForestRegressor(max_depth=3),
    }
    # fail early: an unknown name would otherwise hit an unbound `model` or
    # silently reuse the estimator of the previous model
    unknown = [mod for mod in model_list if mod not in model_factories]
    if unknown:
        raise ValueError(
            f"Unsupported ensemble model(s) {unknown}; "
            f"expected names among {sorted(model_factories)}"
        )

    # doing feature engineering
    processed_df = preprocessor(
        GROUPING_VARS,
        GROUPING_NAMES,
        GROUPING_FUNCS,
        TIME_VARS,
        DROP_VARS,
        df=df,
    )
    X = processed_df.drop(target, axis=1)
    Y = processed_df[target]
    # 0-based offsets inside the test window of the 3, 6 and 9 step horizons
    horizon_positions = [2, 5, 8]

    # initializing the results dictionary
    results = {}
    # calculating all the metrics for each ensemble model
    for mod in model_list:
        preds = []
        smape_per_fold = []
        mae_per_fold = []
        # one list per horizon, filled fold after fold
        preds_by_horizon = [[] for _ in horizon_positions]
        targets_by_horizon = [[] for _ in horizon_positions]

        # calculating smape and mae for each split
        for train_idx, test_idx in split:
            # fitting the model
            model = model_factories[mod]()
            model.fit(X.iloc[train_idx], Y.iloc[train_idx])

            # the test features are the univariate predictions of each column
            X_test = univar_preds_per_split(
                df, target, train_idx, test_idx, univar_models
            )
            X_test = preprocessor(
                GROUPING_VARS,
                GROUPING_NAMES,
                GROUPING_FUNCS,
                TIME_VARS,
                DROP_VARS,
                df=X_test,
            )

            # calculating the different metrics
            preds_for_metrics = model.predict(X_test)
            # .iloc keeps the selection positional whatever the index type is
            Y_for_metrics = (
                Y.iloc[test_idx].iloc[horizon_positions].to_numpy()
            )
            smape_per_fold.append(smape(Y_for_metrics, preds_for_metrics))
            mae_per_fold.append(mae(Y_for_metrics, preds_for_metrics))
            preds.extend(preds_for_metrics)
            for slot in range(len(horizon_positions)):
                preds_by_horizon[slot].append(preds_for_metrics[slot])
                targets_by_horizon[slot].append(Y_for_metrics[slot])

        target_3, target_6, target_9 = (
            np.array(values) for values in targets_by_horizon
        )
        preds_3, preds_6, preds_9 = (
            np.array(values) for values in preds_by_horizon
        )
        results[mod] = {
            "SMAPE": sum(smape_per_fold) / len(split),
            "MAE": sum(mae_per_fold) / len(split),
            "SMAPE_3": smape(target_3, preds_3),
            "MAE_3": mae(target_3, preds_3),
            "SMAPE_6": smape(target_6, preds_6),
            "MAE_6": mae(target_6, preds_6),
            "SMAPE_9": smape(target_9, preds_9),
            "MAE_9": mae(target_9, preds_9),
            "preds": preds,
            "smape_per_fold": smape_per_fold,
            "mae_per_fold": mae_per_fold,
        }
    return results

In [None]:
# Evaluate each listed ensemble model on the cross-validation splits, with
# "best_price_compound" as the target and the univariate models selected above.
results = ensemble_method_univar_preds(
    df,
    "best_price_compound",
    ["Ridge", "Lasso", "LR", "RF", "XGB"],
    split,
    models,
)

## Plotting predictions vs. true values

In [None]:
# Predictions of the XGB ensemble, stored fold after fold.
preds = results["XGB"]["preds"]
# Each fold contributes one prediction for its 3rd, 6th and 9th test point, so
# take exactly those timestamps. Selecting every third position instead would
# misalign times and predictions when a fold holds more than nine points.
pred_times = [
    df.index[test_idx[pos]] for _, test_idx in split for pos in (2, 5, 8)
]

# Building the frame from equal-length columns raises on any length mismatch
# instead of silently truncating the way zip() does.
pred_df = pd.DataFrame(
    {"Predictions": preds}, index=pd.Index(pred_times, name="Time")
)

In [None]:
# Observed target drawn as a line, with the ensemble predictions overlaid as points.
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(df.index, df["best_price_compound"], label="Target", color="blue")
ax.scatter(
    pred_df.index, pred_df["Predictions"], label="Predictions", color="orange"
)
ax.set_title("Target vs. Predictions per Fold")
ax.set_xlabel("Time")
ax.set_ylabel("Best Compound Price")
ax.legend();