**ReadME**  
**VAR Cleaned**

**Section 1: Required Libraries**

In [1]:
import numpy as np
import pandas as pd
import yfinance as yf
from statsmodels.tsa.api import VAR

**Section 2: Data Processing**  

In [9]:
def get_historical_returns(ticker, start_date, end_date, frequency="monthly"):
    """Download price history for one ticker and turn it into simple returns.

    Parameters
    ----------
    ticker
        Forwarded unchanged to ``yf.download``.
    start_date, end_date
        Forwarded as ``start`` / ``end`` to ``yf.download``.
    frequency : str, default "monthly"
        ``"daily"`` or ``"monthly"``; any other value selects the fallback
        described below.

    Returns
    -------
    ``"daily"``
        One-column frame (``Return``) with the percentage change between
        consecutive ``Close`` values, NaN rows dropped.
    ``"monthly"``
        Percentage change between the last ``Close`` of consecutive
        month-end (``'M'``) bins, NaNs dropped, wrapped in a DataFrame.
    anything else
        The resampled month-end ``Close`` values themselves - price levels,
        not returns.
    """
    prices = yf.download(ticker, start=start_date, end=end_date)

    # Day-over-day change of the close.
    by_day = prices.copy()
    by_day['Return'] = by_day['Close'].pct_change()
    day_returns = by_day[['Return']].dropna()

    # Last close in each month-end bin, then month-over-month change.
    by_month = prices.copy()
    by_month['Return'] = by_month['Close']
    month_end_close = by_month['Return'].resample('M').last()
    month_returns = month_end_close.pct_change().dropna()

    if frequency == "daily":
        return day_returns
    elif frequency == "monthly":
        return pd.DataFrame(month_returns)
    else:
        return month_end_close

In [10]:
def multi_df(ticker_list, start_date, end_date):
    """Stack the return frames of several tickers into one two-level frame.

    Each ticker in ``ticker_list`` is fetched with ``get_historical_returns``
    (using that function's default frequency). The ticker symbol becomes the
    outer index level, named ``Company``; the frame's original index becomes
    the inner level.

    Returns
    -------
    pandas.DataFrame
        The per-ticker frames concatenated in first-appearance order of the
        tickers.
    """
    # Keyed by symbol, so a ticker listed twice contributes a single frame.
    returns_by_ticker = {
        symbol: get_historical_returns(symbol, start_date, end_date)
        for symbol in ticker_list
    }

    stacked_parts = []
    for symbol, returns in returns_by_ticker.items():
        # Add the symbol as a column, then fold it into the index in front of
        # the existing index (set_index drops the helper column again).
        labelled = returns.assign(Company=symbol)
        stacked_parts.append(labelled.set_index(['Company', returns.index]))

    return pd.concat(stacked_parts)

**Section 3: VAR**  
Section 3.1 Pre-Processing

In [11]:
def create_sequences(features, targets, seq_length):
    """Pair sliding windows of ``features`` with the target that follows them.

    Window ``k`` is ``features[k:k + seq_length]`` and its label is
    ``targets.iloc[k + seq_length]``, i.e. the row right after the window
    (for example, 4 quarters of history used to predict the next quarter).

    Parameters
    ----------
    features
        Sliceable sequence supporting ``len()`` and ``[start:stop]``.
    targets
        Object indexed positionally through ``.iloc``.
    seq_length : int
        Number of consecutive rows in each window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(windows, labels)`` with ``len(features) - seq_length`` entries
        each; both arrays are empty when no full window fits.
    """
    starts = range(len(features) - seq_length)
    windows = [features[start:start + seq_length] for start in starts]
    labels = [targets.iloc[start + seq_length] for start in starts]
    return np.array(windows), np.array(labels)

**Section 4: Run Program**

Step 1: Define Input and Parameters

In [12]:
# 1. Folders holding the ticker files for each stock-selection method
path_to_file_gpt = "ChatGPT_Tickers/"
path_to_file_cluster = "Clustering_Tickers/"
path_to_file_svm = "SVM_Tickers/"

# 2. Ticker universes (15 symbols each), one per selection method
ChatGPT_stocks = [
    'AAPL', 'MSFT', 'AMZN', 'GOOGL', 'JNJ',
    'V', 'PG', 'JPM', 'UNH', 'MA',
    'INTC', 'VZ', 'GOOG', 'HD', 'T',
]
Cluster_stocks = [
    'AMC', 'AME', 'CRL', 'DVN', 'DFS',
    'DTE', 'FERG', 'GIS', 'HD', 'MSFT',
    'PRU', 'RUN', 'TMO', 'RARE', 'VIRT',
]
SVM_stocks = [
    'TRGP', 'AMC', 'EXAS', 'OXY', 'LYV',
    'JLL', 'FTI', 'OKE', 'SON', 'RRC',
    'DVN', 'COTY', 'AR', 'EQT', 'NOV',
]

# 3. Training window passed to the price download
start_date = '2016-09-30'
end_date = '2021-09-30'

# 4. Rows per input window handed to create_sequences
seq_length = 6

# 5. Share of the sample used for training; the remainder is held out
train_ratio = 0.8

# 6. Epoch count and batch size
num_epoch = 20
num_batch = 12

Step 2: Pre-Processing

In [13]:
# Returns for every ticker in Cluster_stocks over the training window,
# stacked into one frame indexed by (Company, date).
final_df_cluster = multi_df(Cluster_stocks, start_date, end_date)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

In [15]:
# Alias, not a copy: both names refer to the same DataFrame object.
multi_df_train = final_df_cluster

In [16]:
# Target side of the windows: the 'Return' column kept as a one-column frame.
multi_df_return = multi_df_train['Return'].to_frame()
multi_X, multi_y = create_sequences(
    features=multi_df_train,
    targets=multi_df_return,
    seq_length=seq_length,
)

# Wide layout for the VAR model: the Company index level becomes the columns.
# (date, company) pairs with no observation are filled with a return of 0.
targets = multi_df_train['Return'].unstack(level=0)
targets_input = targets.fillna(0)

Step 3: Train Model on Training and Validation Sets

In [17]:
# Chronological split: the first `train_ratio` share of the dates is used for
# fitting, the remainder is held out for validation.
train_size = int(len(targets_input) * train_ratio)
X_train, X_test = targets_input[:train_size], targets_input[train_size:]

model_var = VAR(X_train)
result_var = model_var.fit()

# forecast() treats its first argument as the most recent observations and
# predicts the periods that follow them. Seeding it with the tail of the
# training sample makes the forecasts line up with the X_test dates; seeding
# it with X_test itself would predict the periods *after* the hold-out set.
predictions_VAR = result_var.forecast(
    X_train.values[-result_var.k_ar:], steps=len(X_test)
)
forecast_df_VAR = pd.DataFrame(
    predictions_VAR, columns=targets_input.columns, index=X_test.index
)

# Overall mean squared error across every date and ticker, as one scalar.
# (np.mean on a DataFrame reduces differently across pandas versions, so the
# reduction is done on the underlying array instead.)
mse_VAR = ((X_test - forecast_df_VAR) ** 2).to_numpy().mean()
#print(f"Mean Squared Error (VAR): {mse_VAR}")

  self._init_dates(dates, freq)


Step 4: Evaluate the Model on Test Set

In [18]:
# Test Period 1: September 30 2021 to July 31 2023
# Test Period 2: March 14 2023 to July 31 2023
# Test Period 3: May 01 2023 to July 31 2023
#
# NOTE(review): these strings are forwarded as `start` / `end` to yf.download
# via multi_df; yfinance documents `end` as exclusive, so confirm whether
# July 31 itself is meant to be included.

start_t1 = '2021-09-30'
end_t1 = '2023-07-31'

start_t2 = '2023-03-14'
end_t2 = '2023-07-31'

start_t3 = '2023-05-01'
end_t3 = '2023-07-31'

In [20]:
# Loading Test Phase: Took a while to run this (Don't Rerun)
final_df_t1 = multi_df(Cluster_stocks, start_t1, end_t1)
final_df_t2 = multi_df(Cluster_stocks, start_t2, end_t2)
final_df_t3 = multi_df(Cluster_stocks, start_t3, end_t3)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

In [21]:
# Period-1 frame (alias, not a copy) and its 'Return' column as a one-column frame.
multi_df_t1 = final_df_t1
multi_return_t1 = multi_df_t1['Return'].to_frame()

# Same window length as the training configuration.
seq_length = 6
multi_X_t1, multi_y_t1 = create_sequences(
    features=multi_df_t1,
    targets=multi_return_t1,
    seq_length=seq_length,
)

In [22]:
def generate_sequence_mapping(df,freq):
    """List the (company, end date) label of every sliding window in ``df``.

    For each company on the outer index level, every run of ``freq``
    consecutive rows forms one window; the window is identified by the
    company and the inner-index label of its last row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a two-level index whose outer level identifies the company.
        Only the index is read, so no particular columns are required.
    freq : int
        Number of rows per window (e.g. 6 for 6-month windows on monthly
        data). Must be at least 1.

    Returns
    -------
    list of tuple
        ``(company, end_label)`` pairs, grouped by company in order of first
        appearance and ordered by row position within each company. Companies
        with fewer than ``freq`` rows contribute nothing.

    Raises
    ------
    ValueError
        If ``freq`` is smaller than 1.
    """
    if freq < 1:
        raise ValueError(f"freq must be a positive window length, got {freq!r}")

    sequence_mappings = []
    for company in df.index.get_level_values(0).unique():
        # Select by position 0 so the lookup matches get_level_values(0) above
        # regardless of how the outer level is named.
        company_data = df.xs(company, level=0)

        # The window starting at row i ends at row i + freq - 1, so the end
        # labels of all complete windows are exactly the labels from
        # position freq - 1 onwards.
        window_end_dates = company_data.index[freq - 1:]
        sequence_mappings.extend((company, end_date) for end_date in window_end_dates)

    return sequence_mappings

**Test Period Execution parts**

In [None]:
# Bind the period-1 objects to the generic names used by the cells below.
# Note that X_test is rebound here and no longer refers to the hold-out frame.
df = multi_df_t1
multi_return = multi_return_t1
X_test = multi_X_t1
y_test = multi_y_t1

# (company, window-end date) for every 6-row window of each company.
sequence_mappings = generate_sequence_mapping(df, 6)

# Wide layout for the model: the Company index level becomes the columns.
multi_return_transposed = multi_return['Return'].unstack(level=0)

In [24]:
# Refit a VAR on the test-period returns. This rebinds model_var, result_var
# and (below) forecast_df_VAR, replacing the objects created in the training cell.
# NOTE(review): unlike the training input, this frame is not passed through
# fillna(0) - confirm it contains no missing values.
model_var = VAR(multi_return_transposed)
result_var = model_var.fit()
# print(result_var.summary())

# NOTE(review): per the statsmodels docs, forecast() uses its first argument as
# the initial (most recent) observations and predicts the steps that follow.
# Seeded with the whole sample, these look like forecasts for the periods
# *after* the sample, which are then labelled with the in-sample dates below.
# Confirm that this is the intended construction.
predictions_VAR_t1 = result_var.forecast(multi_return_transposed.values, steps=len(multi_return_transposed))
forecast_df_VAR = pd.DataFrame(predictions_VAR_t1, columns=multi_return_transposed.columns,index=multi_return_transposed.index)

  self._init_dates(dates, freq)


**For Test Period 1: September 30, 2021 to July 31 2023**

In [41]:
# Column means of the forecast frame as a column vector (one row per column
# of forecast_df_VAR), and the covariance matrix between those columns.
average_returns_t1 = forecast_df_VAR.mean().values.reshape(-1, 1)
cov_matrix_t1 = forecast_df_VAR.cov()

# Persist both as CSV.
# NOTE(review): presumably the Saved_mu_Q/ folder must already exist - confirm.
pd.DataFrame(average_returns_t1).to_csv('Saved_mu_Q/VAR_Cluster_mu_1.csv')
cov_matrix_t1.to_csv('Saved_mu_Q/VAR_Cluster_Q_1.csv')

**For Test Period 2: March 14, 2023 to July 31 2023**


In [43]:
# Period 2 reuses the period-1 forecast frame, keeping only the rows labelled
# 2023-03-14 or later (assumes a date index - TODO confirm).
filtered_df_2 = forecast_df_VAR.loc['2023-03-14':]
# Column means as a column vector, and the covariance between the columns.
average_returns_t2 = filtered_df_2.mean().values.reshape(-1, 1)
cov_matrix_t2 = filtered_df_2.cov()

# Persist both as CSV.
# NOTE(review): presumably the Saved_mu_Q/ folder must already exist - confirm.
pd.DataFrame(average_returns_t2).to_csv('Saved_mu_Q/VAR_Cluster_mu_2.csv')
cov_matrix_t2.to_csv('Saved_mu_Q/VAR_Cluster_Q_2.csv')

**For Test Period 3: May 01 2023 to July 31 2023**


In [44]:
# Period 3 reuses the period-1 forecast frame, keeping only the rows labelled
# 2023-05-01 or later (assumes a date index - TODO confirm).
filtered_df_3 = forecast_df_VAR.loc['2023-05-01':]
# Column means as a column vector, and the covariance between the columns.
average_returns_t3 = filtered_df_3.mean().values.reshape(-1, 1)
cov_matrix_t3 = filtered_df_3.cov()

# Persist both as CSV.
# NOTE(review): presumably the Saved_mu_Q/ folder must already exist - confirm.
pd.DataFrame(average_returns_t3).to_csv('Saved_mu_Q/VAR_Cluster_mu_3.csv')
cov_matrix_t3.to_csv('Saved_mu_Q/VAR_Cluster_Q_3.csv')