# Forecasting of Asset Values Using OPLS-DA

#### In this notebook, we use machine learning to forecast the values of assets.
#### To achieve this, we use the Orthogonal Partial Least Squares Discriminant Analysis (OPLS-DA) technique.

#### We consider 3 different time horizons to forecast:

#### **- 1-day horizon**
#### **- 3-days horizon**
#### **- 7-days horizon**

### Import some packages

In [1]:
import os, sys
from time import sleep
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgbm
from sklearn.model_selection import cross_val_predict, LeaveOneOut
from sklearn.metrics import (
    r2_score,
    explained_variance_score,
    mean_squared_error,
    mean_absolute_percentage_error,
)

from pyopls import OPLS
from pyopls import OPLSValidator, OPLSDAValidator

from matplotlib import pyplot as plt

plt.rcParams["figure.figsize"] = (20, 10)

import warnings

warnings.filterwarnings("ignore")

### Global variable

In [2]:
# Base directory (relative path) of the pipeline artefacts: build_model_summary()
# reads the feature files from its "features" sub-folder and writes its CSV tables
# to the "predictions" and "summary" sub-folders.
RESULTS = "../results"

### Helper function

In [3]:
def truncate(num, digits):
    """
    Truncate (without rounding) a value to at most `digits` decimal places.

    The truncation is performed on the exact decimal representation of the
    printed value, so numbers displayed in scientific notation (e.g. 1.2345e-05)
    are handled correctly and no binary floating point error is introduced.

    Args:
        num: The value to truncate. Supported inputs are:
            - a float-like scalar (or a numeric string): a Python float is returned;
            - an integer: returned unchanged, type included;
            - a floating point `numpy.ndarray` or `pandas.Series`: a new object of the
              same shape (and, for a Series, the same index and name) is returned with
              every element truncated.
            Anything else (non-numeric objects, NaN, infinities, non-float arrays)
            is returned unchanged.
        digits: Number of decimal places to keep. It is converted with `int()`.
            A negative value truncates to the left of the decimal point.

    Return:
        The truncated value, or `num` itself when it cannot be truncated.

    Raises:
        TypeError / ValueError: if `digits` cannot be converted to an integer.
    """
    # Local imports: these standard-library names are not imported at the top of the notebook.
    from decimal import ROUND_DOWN, Decimal, InvalidOperation
    from numbers import Integral

    # 10 ** -digits expressed exactly, e.g. Decimal("0.001") for digits=3.
    quantum = Decimal(1).scaleb(-int(digits))

    def truncate_scalar(value):
        # Integers (booleans included) have no decimal part: keep value and type.
        if isinstance(value, Integral):
            return value
        try:
            # str() gives the shortest printed form, which is what the reader sees.
            exact = Decimal(str(value))
        except (InvalidOperation, ValueError, TypeError):
            # Not a number we know how to parse: leave it untouched.
            return value
        if not exact.is_finite():
            # NaN and +/-inf cannot be truncated.
            return value
        try:
            return float(exact.quantize(quantum, rounding=ROUND_DOWN))
        except InvalidOperation:
            # The result would need more digits than the decimal context allows
            # (huge magnitude or huge `digits`): nothing useful to truncate.
            return value

    def truncate_values(values):
        # Iterating the flattened array yields numpy scalars, whose str() is the short form.
        flat = [truncate_scalar(v) for v in values.ravel()]
        return np.array(flat, dtype=float).reshape(values.shape)

    if isinstance(num, pd.Series):
        if num.dtype.kind != "f":
            return num
        return pd.Series(truncate_values(num.to_numpy()), index=num.index, name=num.name)

    if isinstance(num, np.ndarray):
        if num.dtype.kind != "f":
            return num
        return truncate_values(num)

    return truncate_scalar(num)

In [4]:
# Assets and forecasting horizons processed by default by build_model_summary().
# Each horizon string is the file-name fragment used by the feature files; its
# leading integer is the horizon length in days.
_DEFAULT_ASSETS = (
    "AC.PA",
    "BNP.PA",
    "CAP.PA",
    "ENGI.PA",
    "G.MI",
    "RACE.MI",
    "SAN.PA",
    "TIT.MI",
)
_DEFAULT_HORIZONS = ("1-day-horizon--", "3-days-horizon--", "7-days-horizon--")

# Metric columns of every summary table, in output order.
_SUMMARY_COLUMNS = ["R2", "RMSE", "MAPE", "y_mean", "y_std", "ŷ_mean", "ŷ_std"]

# Number of components passed to the OPLS filter.
_OPLS_COMPONENTS = 20


def _fit_and_predict(X_train, y_train, X_test):
    """
    Fit the four models on the training set and predict the test set.

    Return:
        A dict mapping the model label ("PLS", "O-PLS", "XGBoost",
        "Augmented XGBoost") to a 1-D array of test predictions.
    """
    # PLS on the raw features
    pls_model = PLSRegression(1).fit(X_train, y_train)

    # OPLS filter followed by a one-component PLS on the filtered features
    opls = OPLS(_OPLS_COMPONENTS)
    Z_train = opls.fit_transform(X_train, y_train)
    Z_test = opls.transform(X_test)
    opls_model = PLSRegression(1).fit(Z_train, y_train)

    # Gradient boosting on the raw and on the OPLS-filtered features.
    # `max_features` is left at its default (None, i.e. all features): this is what the
    # former "auto" value meant for a regressor, and "auto" is rejected by recent
    # scikit-learn releases.
    boosting_params = {
        "n_estimators": 10000,
        "n_iter_no_change": 5,
        "random_state": 42,
    }
    boosting_model = GradientBoostingRegressor(**boosting_params).fit(X_train, y_train)
    augmented_boosting_model = GradientBoostingRegressor(**boosting_params).fit(
        Z_train, y_train
    )

    # np.ravel: depending on the scikit-learn version, PLS predictions may come back
    # with shape (n, 1); every model must contribute one flat column.
    return {
        "PLS": np.ravel(pls_model.predict(X_test)),
        "O-PLS": np.ravel(opls_model.predict(Z_test)),
        "XGBoost": np.ravel(boosting_model.predict(X_test)),
        "Augmented XGBoost": np.ravel(augmented_boosting_model.predict(Z_test)),
    }


def _regression_metrics(y_true, y_pred):
    """Compute the summary metrics of one model from its predictions and the ground truth."""
    return {
        "R2": r2_score(y_true, y_pred),
        # mean_squared_error returns the MSE; the column is an RMSE, hence the square root.
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "MAPE": mean_absolute_percentage_error(y_true, y_pred),
        "y_mean": np.mean(y_true),
        "y_std": np.std(y_true),
        "ŷ_mean": np.mean(y_pred),
        "ŷ_std": np.std(y_pred),
    }


def _summary_table(y_true, predictions):
    """Build the (model x metric) table, every metric truncated to 3 decimals."""
    rows = [
        {
            metric: truncate(value, 3)
            for metric, value in _regression_metrics(y_true, y_pred).items()
        }
        for y_pred in predictions.values()
    ]
    return pd.DataFrame(rows, index=list(predictions), columns=_SUMMARY_COLUMNS)


def build_model_summary(assets=None, horizons=None, results_dir=None):
    """
    Build and export the result tables that summarize the performance of the models.

    For every forecasting horizon and every asset, the train/test feature files
    `<results_dir>/features/features-<horizon><asset>-{train,test}.csv` are read
    (first column as index, `target` column as the value to forecast), four models
    are fitted (PLS, O-PLS, gradient boosting, gradient boosting on OPLS-filtered
    features) and two CSV files are written:

    - `<results_dir>/predictions/predictions-<horizon><asset>-test.csv`:
      ground truth and the predictions of every model on the test set;
    - `<results_dir>/summary/summary-<horizon><asset>-test.csv`:
      R2, RMSE, MAPE and mean/std of the ground truth and of the predictions.

    For every horizon, the test predictions of all the assets are then pooled and
    the same metrics are written to
    `<results_dir>/summary/summary-<horizon>-all-assets-test.csv`.

    Args:
        assets: Iterable of asset codes to process. Defaults to the 8 assets of the study.
        horizons: Iterable of horizon file-name fragments, each starting with the
            horizon length in days (e.g. "3-days-horizon--"). Defaults to the
            1-, 3- and 7-day horizons.
        results_dir: Directory containing the `features` folder and receiving the
            `predictions` and `summary` folders. Defaults to the global `RESULTS`.

    Return:
        None. The tables are only written to disk.
    """
    assets = list(_DEFAULT_ASSETS if assets is None else assets)
    horizons = list(_DEFAULT_HORIZONS if horizons is None else horizons)
    results_dir = RESULTS if results_dir is None else results_dir

    features_dir = os.path.join(results_dir, "features")
    predictions_dir = os.path.join(results_dir, "predictions")
    summary_dir = os.path.join(results_dir, "summary")
    # Make sure the output folders exist before the first to_csv call.
    os.makedirs(predictions_dir, exist_ok=True)
    os.makedirs(summary_dir, exist_ok=True)

    with tqdm(total=len(assets) * len(horizons), file=sys.stdout) as pbar:
        for horizon in horizons:
            horizon_days = int(horizon.split("-")[0])

            # Test-set ground truth and predictions of all the assets, for the pooled metrics
            pooled_truth = []
            pooled_predictions = {}

            for asset in assets:
                stem = horizon + asset

                train = pd.read_csv(
                    os.path.join(features_dir, "features-" + stem + "-train.csv"),
                    index_col=0,
                )
                test = pd.read_csv(
                    os.path.join(features_dir, "features-" + stem + "-test.csv"),
                    index_col=0,
                )

                feature_cols = [c for c in train.columns if c != "target"]
                X_train, y_train = train[feature_cols], train["target"]
                X_test, y_test = test[feature_cols], test["target"]

                predictions = _fit_and_predict(X_train, y_train, X_test)

                # Table with the predictions (indexed like the test set)
                results_test = pd.DataFrame({"Ground Truth": truncate(y_test, 3)})
                for model_name, y_pred in predictions.items():
                    results_test[model_name + " Prediction"] = truncate(y_pred, 3)
                results_test["forecasting_horizon_in_days"] = horizon_days
                results_test["asset_code"] = asset

                # Results summary table
                summary_table = _summary_table(y_test, predictions)
                summary_table["asset_code"] = asset

                # Keep the predictions for the pooled metrics
                pooled_truth.append(y_test.to_numpy())
                for model_name, y_pred in predictions.items():
                    pooled_predictions.setdefault(model_name, []).append(y_pred)

                # Save the tables
                results_test.to_csv(
                    os.path.join(predictions_dir, "predictions-" + stem + "-test.csv")
                )
                summary_table.to_csv(
                    os.path.join(summary_dir, "summary-" + stem + "-test.csv")
                )

                res = "predictions-" + stem + "-test.csv"
                pbar.write(f"processed: {res}")
                pbar.update(1)

            if not pooled_truth:
                # No asset processed: there is nothing to pool for this horizon.
                continue

            # Overall metrics on the pooled test sets of all the assets
            summary = _summary_table(
                np.concatenate(pooled_truth),
                {
                    model_name: np.concatenate(chunks)
                    for model_name, chunks in pooled_predictions.items()
                },
            )
            summary.to_csv(
                os.path.join(
                    summary_dir, "summary-" + horizon + "-all-assets-test.csv"
                )
            )

# Build and export the result tables

In [5]:
%%time
# Fit every model for every (horizon, asset) pair and write the prediction and
# summary CSV tables; %%time reports how long the whole run takes.
build_model_summary()

processed: predictions-1-day-horizon--AC.PA-test.csv                                                                                                   
processed: predictions-1-day-horizon--BNP.PA-test.csv                                                                                                  
processed: predictions-1-day-horizon--CAP.PA-test.csv                                                                                                  
processed: predictions-1-day-horizon--ENGI.PA-test.csv                                                                                                 
processed: predictions-1-day-horizon--G.MI-test.csv                                                                                                    
processed: predictions-1-day-horizon--RACE.MI-test.csv                                                                                                 
processed: predictions-1-day-horizon--SAN.PA-test.csv                                   