In [1]:
import sys
sys.path.append('../src')

from time_series import (

    # Prepare data
    load_data,
    generate_lags,
    generate_lagged_df,
    split_df,

    # Fit, Evaluate, Forecast
    fit_model,
    predict_evaluate_model,
    plot_forecast_vs_actual,
    plot_train_test_predictions,
    evaluate_methodology,

    # Save
    save_results_run,
)


In [2]:
# Read the dataset through the project loader and preview its first three rows
csv_path = '../data/data_original_m6.csv'
df = load_data(csv_path)
df.head(3)


Unnamed: 0_level_0,ABBV,ACN,AEP,AIZ,ALLE,AMAT,AMP,AMZN,AVB,AVY,...,XLC,XLE,XLF,XLI,XLK,XLP,XLU,XLV,XLY,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-02-01,125.672005,341.526031,81.144257,144.980698,116.244766,135.845093,295.97229,151.193497,225.756577,198.813828,...,73.001434,62.652004,37.921776,98.32856,159.164963,71.541222,63.674873,126.980408,182.482285,73.886658
2022-02-02,127.158058,347.372375,82.549454,146.648575,116.419128,137.952209,300.881073,150.612503,232.359009,193.332672,...,74.459702,62.853901,38.180336,99.012138,160.458786,72.398964,64.616043,128.688538,181.393707,73.694695
2022-02-03,129.093597,333.920837,82.83419,145.507904,112.476517,133.787003,292.273804,138.845505,231.04776,186.040558,...,69.478104,62.257393,37.758987,97.433182,155.803101,72.417839,64.327171,128.138443,175.842789,72.835434


In [None]:
# Load models

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

# Define hyperparameters (model classes and hyperparameters)
# Maps each estimator class to the keyword arguments it is run with: the
# experiments cell iterates over `model_configs.items()` and forwards the class
# as `model_class` and the dict as **kwargs to `evaluate_methodology`.
# Only uncommented entries take part in a run; uncomment a line to add that model.
model_configs = {
    LinearRegression: {},  # no random_state
    # Ridge: {"alpha": 1.0, "random_state": 42},
    # SVR: {"kernel": "rbf", "C": 1.0, "epsilon": 0.1},  # no random_state
    # KNeighborsRegressor: {"n_neighbors": 5},  # no random_state
    # DecisionTreeRegressor: {"max_depth": 5, "random_state": 42},
    # RandomForestRegressor: {"n_estimators": 100, "max_depth": 5, "random_state": 42},
    # GradientBoostingRegressor: {"n_estimators": 100, "learning_rate": 0.1, "max_depth": 3, "random_state": 42},
}


In [None]:
# Experiment settings
# Bounds of the future-lag sweep; the experiments cell treats both ends as inclusive
p_first, p_last = 1, 30
# Ratio handed to split_df when separating train and test rows
train_test_ratio = 0.8

In [5]:
# Experiments
#
# For every configured model class, every future-lag count in
# [p_first, p_last] (inclusive) and every column of `df`, run
# `evaluate_methodology` twice -- once with the AR feature set and once with
# the ARP feature set -- and append one record per (model, lag, stock) to
# `results_run`.

n_lags_future_range = range(p_first, p_last+1)
all_stocks = df.columns

# Initialize list to collect experiment results (one dict per model/lag/stock)
results_run = []

# Loop over each model class
for model_class, model_params in model_configs.items():
    model_class_name = model_class.__name__  # Store class name for tracking

    # Loop over number of future lags
    for n_lags_future in n_lags_future_range:
        n_lags_past = 2 * n_lags_future  # ARP rule: past lags = 2 * future lags

        # Generate lag names for AR and ARP models.
        # Hoisted out of the stock loop: the call takes only the two lag counts,
        # so recomputing it for every stock was redundant work.
        # Assumes generate_lags depends only on its arguments and that
        # evaluate_methodology does not mutate the feature lists -- TODO confirm.
        lags_past, lags_future, lags_ar, lags_arp = generate_lags(
            n_lags_past=n_lags_past,
            n_lags_future=n_lags_future,
        )

        # Test-time features for ARP: the first len(lags_past) // 2 entries of
        # lags_arp followed by n_lags_future copies of 'y-1', used as a
        # past-only substitute for the future terms.
        # NOTE(review): whether this list has the same length and ordering as
        # lags_arp (used for training) depends on what generate_lags returns,
        # which is not visible here -- verify.
        features_test_arp = lags_arp[:len(lags_past) // 2] + ['y-1'] * n_lags_future

        # Loop over each stock (column) in the dataset
        for stock in all_stocks:
            # Extract the univariate time series
            series = df[stock]

            # Create lagged dataframe with full target + context
            df_lagged = generate_lagged_df(
                series=series,
                n_lags_past=n_lags_past,
                n_lags_future=n_lags_future,
            )

            # Split into training and testing sets
            df_train, df_test = split_df(
                df=df_lagged,
                train_test_ratio=train_test_ratio
            )

            # Fit and evaluate AR model (same feature list for training and testing)
            y_pred_train_ar, y_pred_test_ar, mape_train_ar, mape_test_ar = evaluate_methodology(
                df_train=df_train,
                df_test=df_test,
                features_train=lags_ar,
                features_test=lags_ar,
                model_class=model_class,
                **model_params
            )

            # Fit and evaluate ARP model
            # Train on the full ARP feature list, test on the substituted list built above
            y_pred_train_arp, y_pred_test_arp, mape_train_arp, mape_test_arp = evaluate_methodology(
                df_train=df_train,
                df_test=df_test,
                features_train=lags_arp,
                features_test=features_test_arp,
                model_class=model_class,
                **model_params
            )

            # Store all experiment results in a structured and traceable format
            results_run.append({
                "stock": stock,
                "n_lags_future": n_lags_future,
                "model_class": model_class_name,

                "mape_train_ar": mape_train_ar,
                "mape_test_ar": mape_test_ar,
                "mape_train_arp": mape_train_arp,
                "mape_test_arp": mape_test_arp,

                "y_pred_train_ar": y_pred_train_ar,
                "y_pred_test_ar": y_pred_test_ar,
                "y_pred_train_arp": y_pred_train_arp,
                "y_pred_test_arp": y_pred_test_arp,
            })


In [8]:
# Save results
#
# The run label is built from the "model_class" values stored in results_run
# instead of from `model_class`, which is merely the variable left behind by
# the experiments loop: it names only the last model iterated and does not
# exist at all if the loop body never ran. With a single configured model the
# label is unchanged; with several, their names are joined with "_" in
# first-seen order.
# NOTE(review): how save_results_run uses model_class_name (e.g. in a file
# name) is not visible here -- confirm a joined label is acceptable.
if not results_run:
    raise ValueError("results_run is empty: run the experiments cell before saving")

# dict.fromkeys de-duplicates while preserving insertion order
model_class_names = list(dict.fromkeys(record["model_class"] for record in results_run))

save_results_run(
    results_run=results_run,
    model_class_name="_".join(model_class_names),
    p_first=p_first,
    p_last=p_last,
    folder='../results/',
)
