In [None]:
import sys
# Extend the module search path so the `time_series` import below can resolve.
# NOTE(review): '../src' is a relative path, so it is resolved against the
# kernel's working directory — presumably the notebook's own folder; confirm.
sys.path.append('../src')

# Project helpers used by the cells below (data loading, lag generation,
# train/test splitting, evaluation, and result persistence).
from time_series import (
    load_data,
    generate_lags,
    generate_lagged_df,
    split_df,
    fit_model,
    predict_evaluate_model,
    plot_forecast_vs_actual,
    plot_train_test_predictions,
    evaluate_methodology,
    count_lower_mape,
    save_results_run,
)


In [None]:
# Load data
# Read the source CSV into `df`; the experiment cell further down treats every
# column of this frame as one univariate series.
data_csv_path = '../data/data_original_m6.csv'
df = load_data(data_csv_path)

# Preview the first three rows as the cell output.
df.head(3)


In [None]:
# Import the candidate regression model classes

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

# Define hyperparameters
# Dictionary mapping each model class to the keyword arguments used for it.
# The experiment loop iterates over this dict and forwards each value as
# **model_params to evaluate_methodology, so an empty dict means "use the
# estimator's defaults". Uncomment an entry to add that model to the loop.
model_configs = {
    LinearRegression: {},  # no random_state
    # Ridge: {"alpha": 1.0, "random_state": 42},
    # SVR: {"kernel": "rbf", "C": 1.0, "epsilon": 0.1},  # no random_state
    # KNeighborsRegressor: {"n_neighbors": 5},  # no random_state
    # DecisionTreeRegressor: {"max_depth": 5, "random_state": 42},
    # RandomForestRegressor: {"n_estimators": 100, "max_depth": 5, "random_state": 42},
    # GradientBoostingRegressor: {"n_estimators": 100, "learning_rate": 0.1, "max_depth": 3, "random_state": 42},
}


In [None]:
# Parameters
# Ratio handed to split_df when dividing each lagged frame into train/test
# (presumably the training fraction — confirm against split_df).
train_test_ratio = 0.8

# Inclusive bounds of the n_lags_future values swept by the experiment cell.
p_first, p_last = 1, 5

In [None]:
# Experiments
# For every configured model, every lag order in [p_first, p_last] and every
# column of df, evaluate two feature sets (AR and ARP) with
# evaluate_methodology and record the train/test MAPE and predictions.

n_lags_future_range = range(p_first, p_last + 1)  # inclusive of p_last
all_stocks = df.columns  # each column is processed as its own series

# One dict per (model, n_lags_future, stock) combination is appended here
results_run = []

# Loop over each model class
for model_class, model_params in model_configs.items():
    model_class_name = model_class.__name__  # Store class name for tracking

    # Loop over number of future lags
    for n_lags_future in n_lags_future_range:
        n_lags_past = 2 * n_lags_future  # ARP rule: past lags = 2 * future lags

        # The lag names depend only on the two lag counts, not on the stock,
        # so they are generated once per lag order rather than once per stock.
        lags_past, lags_future, lags_ar, lags_arp = generate_lags(
            n_lags_past=n_lags_past,
            n_lags_future=n_lags_future,
        )

        # ARP test-time feature list: keep the first len(lags_past) // 2
        # entries of lags_arp and append 'y-1' repeated n_lags_future times.
        # NOTE(review): presumably 'y-1' stands in for the future-lag columns
        # that are not available at prediction time — confirm against
        # generate_lags / evaluate_methodology.
        features_test_arp = lags_arp[:len(lags_past) // 2] + ['y-1'] * n_lags_future

        # Loop over each stock (column) in the dataset
        for stock in all_stocks:
            # Extract the univariate time series
            series = df[stock]

            # Create lagged dataframe with full target + context
            df_lagged = generate_lagged_df(
                series=series,
                n_lags_past=n_lags_past,
                n_lags_future=n_lags_future,
            )

            # Split into training and testing sets
            df_train, df_test = split_df(
                df=df_lagged,
                train_test_ratio=train_test_ratio
            )

            # Fit and evaluate AR model (same past-lag features for training and testing)
            y_pred_train_ar, y_pred_test_ar, mape_train_ar, mape_test_ar = evaluate_methodology(
                df_train=df_train,
                df_test=df_test,
                features_train=lags_ar,
                features_test=lags_ar,
                model_class=model_class,
                **model_params
            )

            # Fit and evaluate ARP model: train on the full ARP feature list,
            # test on the substituted list built above
            y_pred_train_arp, y_pred_test_arp, mape_train_arp, mape_test_arp = evaluate_methodology(
                df_train=df_train,
                df_test=df_test,
                features_train=lags_arp,
                features_test=features_test_arp,
                model_class=model_class,
                **model_params
            )

            # Store all experiment results in a structured and traceable format
            results_run.append({
                "stock": stock,
                "n_lags_future": n_lags_future,
                "model_class": model_class_name,

                "mape_train_ar": mape_train_ar,
                "mape_test_ar": mape_test_ar,
                "mape_train_arp": mape_train_arp,
                "mape_test_arp": mape_test_arp,

                "y_pred_train_ar": y_pred_train_ar,
                "y_pred_test_ar": y_pred_test_ar,
                "y_pred_train_arp": y_pred_train_arp,
                "y_pred_test_arp": y_pred_test_arp,
            })


In [None]:
# Save results
# Label the saved run with the model classes that produced results_run instead
# of a hard-coded name, so the label stays correct when model_configs changes.
# For the single-model configuration this is exactly "LinearRegression".
run_model_label = "_".join(cls.__name__ for cls in model_configs)

save_results_run(
    results_run=results_run,
    model_class_name=run_model_label,
    p_first=p_first,
    p_last=p_last,
    folder='../results/',
)


In [None]:
# Load results (TODO: not implemented yet — this cell is empty)



In [None]:
# Count lower MAPE in train and test
# Summarise the run once per configured model class rather than for a
# hard-coded LinearRegression, so the report follows model_configs.
# NOTE(review): assumes count_lower_mape uses `model_class` to select the
# matching entries of results_run — confirm against its implementation.
for configured_model in model_configs:
    print(configured_model.__name__)  # label the table that follows
    results_count = count_lower_mape(results_run, model_class=configured_model)
    display(results_count)

# Optional plain-text report for the most recent `results_count`:
# # Print results cleanly
# for n_lags_future, stats in sorted(results_count.items()):
#     print(f"\nFor p = r = {n_lags_future}:")
#     print("-" * 30)
#     print(f"MAPE Train ARP lower (better) than MAPE Train AR in: {stats['train_count']} times ({stats['train_pct']:.1f}%).")
#     print(f"MAPE Test  ARP lower (better) than MAPE Test  AR in: {stats['test_count']} times ({stats['test_pct']:.1f}%).")

