In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.tsa.arima_process import ArmaProcess


In [2]:
# Load data
data_file = 'data_original_m6.csv'
data_original = pd.read_csv(data_file, index_col=0)

# Shrink data
shrink = 1
nrows = int(len(data_original) * shrink)
data_original = data_original.iloc[:nrows]

In [3]:
tickers_split = [
 ['ABBV', 'ACN', 'AEP', 'AIZ', 'ALLE', 'AMAT', 'AMP', 'AMZN', 'AVB'],
 ['AVY', 'AXP', 'BDX', 'BF-B', 'BMY', 'BR', 'CARR', 'CDW', 'CE', 'CHTR'],
 ['CNC', 'CNP', 'COP', 'CTAS', 'CZR', 'DG', 'DPZ', 'DXC', 'META', 'FTV'],
 ['GOOG', 'GPC', 'HIG', 'HST', 'JPM', 'KR', 'OGN', 'PG', 'PPL', 'PRU'],
 ['PYPL', 'ROL', 'ROST', 'UNH', 'URI', 'V', 'VRSK', 'WRK', 'XOM', 'IVV'],
 ['IWM', 'EWU', 'EWG', 'EWL', 'EWQ', 'IEUS', 'EWJ', 'EWT', 'MCHI'],
 ['INDA', 'EWY', 'EWA', 'EWH', 'EWZ', 'EWC', 'IEMG', 'LQD', 'HYG', 'SHY'],
 ['IEF', 'TLT', 'SEGA.L', 'IEAA.L', 'HIGH.L', 'JPEA.L', 'IAU', 'SLV', 'GSG', 'REET'],
 ['ICLN', 'IXN', 'IGF', 'IUVL.L', 'IUMO.L', 'SPMV.L', 'IEVL.L', 'IEFM.L', 'MVEU.L', 'XLK'],
 ['XLF', 'XLV', 'XLE', 'XLY', 'XLI', 'XLC', 'XLU', 'XLP', 'XLB', 'VXX']
 ]

# Just one list
tickers = [ticker for sublist in tickers_split for ticker in sublist]

# tickers = ['ABBV', 'ACN', 'AEP', 'AIZ', 'ALLE', 'AMAT', 'AMP', 'AMZN', 'AVB']

In [4]:
# Initialize a DataFrame to store MAPE values
mape_df = pd.DataFrame(columns=['MAPE_BTF', 'MAPE_AR1', 'MAPE_AR2'])
mse_df = pd.DataFrame(columns=['MSE_BTF', 'MSE_AR1', 'MSE_AR2'])

# Loop through time series
for ticker in tickers:

    # Create shifts
    y = data_original[ticker]
    y_plus_1 = y.shift(-1)
    y_minus_1 = y.shift(1)
    y_minus_2 = y.shift(2)

    # Prepare data for linear regression (drop NaN values due to shifting)
    df = pd.DataFrame({
        'y_plus_1': y_plus_1,
        'y': y,
        'y_minus_1': y_minus_1,
        'y_minus_2': y_minus_2
    }).dropna()

    # Estimation: BTF
    btf = sm.add_constant(df[['y_minus_1', 'y_plus_1']])
    model_btf = sm.OLS(df['y'], btf)
    results_btf = model_btf.fit()
    y_btf = results_btf.predict(btf)
    mape_btf = np.mean(np.abs((df['y'] - y_btf) / df['y'])) * 100

    # Estimation: AR(1)
    ar_1 = sm.add_constant(df[['y_minus_1']])
    model_ar_1 = sm.OLS(df['y'], ar_1)
    results_ar_1 = model_ar_1.fit()
    y_ar_1 = results_ar_1.predict(ar_1)
    mape_ar_1 = np.mean(np.abs((df['y'] - y_ar_1) / df['y'])) * 100

    # Estimation: AR(2)
    ar_2 = sm.add_constant(df[['y_minus_1', 'y_minus_2']])
    model_ar_2 = sm.OLS(df['y'], ar_2)
    results_ar_2 = model_ar_2.fit()
    y_ar_2 = results_ar_2.predict(ar_2)
    mape_ar_2 = np.mean(np.abs((df['y'] - y_ar_2) / df['y'])) * 100

    # Store MAPE
    mape_df.loc[ticker] = [mape_btf, mape_ar_1, mape_ar_2]

    # Store MSE
    mse_df.loc[ticker] = [results_btf.mse_model, results_ar_1.mse_model, results_ar_2.mse_model]

    # all_models = pd.DataFrame({
    #                 'y': y,
    #                 'y_btf': y_btf,
    #                 'y_ar_1': y_ar_1,
    #                 'y_ar_2': y_ar_2,
    #                 })
    
    # # Define custom styles for each series
    # styles = {
    #     'y': {'linestyle': '-', 'color': 'black', 'marker': 'd'},
    #     'y_btf': {'linestyle': '--', 'color': 'green', 'marker': 's'},
    #     'y_ar_1': {'linestyle': '--', 'color': 'blue', 'marker': 's'},
    #     'y_ar_2': {'linestyle': '-.', 'color': 'red', 'marker': 'x'},
    # }

    # # Plot all series in the same graph with custom styles
    # plt.figure(figsize=(14, 7))

    # for column in all_models.columns:
    #     plt.plot(all_models[column], 
    #             linestyle=styles[column]['linestyle'], 
    #             color=styles[column]['color'], 
    #             marker=styles[column]['marker'], 
    #             label=column)

    # # Add title and labels
    # plt.title('Comparison of All Models')
    # plt.xlabel('Index')
    # plt.ylabel('Values')

    # # Add legend
    # plt.legend()

    # # Show grid
    # plt.grid(True)

    # # Display the plot
    # plt.show()
    
# # Display the MAPE DataFrame
# display(mape_df)
# display(mse_df)


In [5]:
# Find the column with the minimum MAPE value for each row
min_mape_columns = mape_df.idxmin(axis=1)

# Count the occurrences of each column being the minimum
min_mape_counts = min_mape_columns.value_counts()

# Display the result
print(min_mape_counts)


MAPE_BTF    98
Name: count, dtype: int64


In [6]:
# Find the column with the minimum MASE value for each row
min_mse_columns = mse_df.idxmin(axis=1)

# Count the occurrences of each column being the minimum
min_mse_counts = min_mse_columns.value_counts()

# Display the result
print(min_mse_counts)

MSE_AR2    98
Name: count, dtype: int64
