In [2]:
# Mount Google Drive at /content/drive. The Excel workbooks read later in this
# notebook are addressed through this mount point, so this cell must run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Section 1: Required Libraries**

In [3]:
import pandas as pd
import yfinance as yf
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Dense
from tensorflow.keras.regularizers import l2

**Section 2: Data Processing**  
Section 2.1 Single Asset Data Processing

In [4]:
def get_historical_returns(ticker, start_date, end_date, frequency="monthly"):
    """Download price history for one ticker and derive simple returns from 'Close'.

    Parameters
    ----------
    ticker : forwarded unchanged to ``yf.download``.
    start_date, end_date : forwarded to ``yf.download`` as ``start`` / ``end``.
    frequency : ``"daily"`` or ``"monthly"`` (default).

    Returns
    -------
    * ``"daily"``   -> one-column frame ``'Return'`` holding the row-over-row
      percentage change of ``'Close'``, NaN rows removed.
    * ``"monthly"`` -> percentage change between the last ``'Close'`` of
      consecutive calendar months, NaN entries removed.
    * anything else -> the month-end ``'Close'`` levels themselves (prices,
      not returns), taken from the ``'Return'`` column that holds a copy of
      ``'Close'``.
    """
    prices = yf.download(ticker, start=start_date, end=end_date)

    # Daily: percentage change between consecutive rows of the close.
    with_daily = prices.copy()
    with_daily['Return'] = with_daily['Close'].pct_change()
    daily_returns = with_daily[['Return']].dropna()

    # Monthly: keep the last close seen in each calendar month, then take the
    # month-over-month percentage change of those levels.
    with_monthly = prices.copy()
    with_monthly['Return'] = with_monthly['Close']
    month_end_close = with_monthly['Return'].resample('M').last()
    monthly_returns = month_end_close.pct_change().dropna()

    if frequency == "daily":
        return daily_returns
    elif frequency == "monthly":
        return monthly_returns
    else:
        # Unrecognised frequency: hand back month-end price levels.
        return month_end_close

def resample_quaterly_data(quaterly_data, target_data):
    """Forward-fill quarterly ratios onto the dates of the target return series.

    Neither argument is modified: the date shift is applied to a copy, so the
    function can be called repeatedly with the same inputs and give the same
    result.

    Parameters
    ----------
    quaterly_data : pandas object whose index can be parsed by
        ``pd.to_datetime``; must be sorted by date for the forward fill.
    target_data : pandas object whose index supplies the output dates.

    Returns
    -------
    ``quaterly_data`` re-indexed on the target dates with the most recent
    earlier observation carried forward; rows containing NaN (for example
    target dates before the first shifted observation) are dropped.
    """
    target_index = pd.to_datetime(target_data.index)

    # Move every ratio one day forward before matching, so a ratio stamped D is
    # only picked up by target dates on or after D + 1 day.
    shifted = quaterly_data.copy()
    shifted.index = pd.to_datetime(shifted.index) + pd.DateOffset(days=1)

    aligned_quaterly_data = shifted.reindex(target_index, method='ffill')
    return aligned_quaterly_data.dropna()


def load_features(path_to_file, ticker, start_date, end_date):
    """Load the ratio features and returns for a single company.

    Reads sheet ``'<ticker>-US'`` of ``path_to_file + ticker + '.xlsx'``,
    keeps five ratio columns, aligns them to the return dates and joins them
    with the returns.

    Parameters
    ----------
    path_to_file : string prefix that is concatenated directly with the ticker
        (so a folder path needs its trailing separator).
    ticker : used for the workbook name, the sheet name and the price download.
    start_date, end_date : forwarded to ``get_historical_returns``.

    Returns
    -------
    DataFrame with the five ratio columns (numeric) plus the return data,
    concatenated column-wise on the date index.
    """
    workbook_path = path_to_file + ticker + '.xlsx'
    sheet_name = ticker + '-US'
    raw = pd.read_excel(workbook_path, sheet_name=sheet_name, engine='openpyxl')

    # The sheet is laid out with ratio names in the 'Date' column and one
    # column per period; transposing gives one row per period. NaNs are not
    # dropped at this stage because that would discard too much of the
    # longer time frame.
    by_period = raw.reset_index(drop=True).set_index('Date').T

    # Period labels carry month and year only (e.g. "Sep '16"), so parsing
    # lands on the first of the month; MonthEnd moves it to the month's end.
    by_period.index = pd.to_datetime(by_period.index, format='%b \'%y')
    by_period.index = by_period.index + pd.offsets.MonthEnd()

    # Select a few columns
    pe_column = 'Price/Earnings'
    pb_column = 'Price/Book Value'
    roa_column = 'Return on Assets'
    roe_column = 'Return on Equity '  # trailing space matches the sheet header
    fcf_column = 'Free Cash Flow per Share'
    selected_columns = [pe_column, pb_column, roa_column, roe_column, fcf_column]

    # Convert the selected ratios to numeric dtype. Previously the converted
    # frame was built and then overwritten by a selection from the unconverted
    # one, so the conversion never reached the returned features.
    ratio_data = by_period[selected_columns].apply(pd.to_numeric)

    # Keep only periods where all five ratios are available.
    ratio_data = ratio_data.dropna()

    # Process Return Data
    returns_data = get_historical_returns(ticker, start_date, end_date)
    adjusted_ratio_data = resample_quaterly_data(ratio_data, returns_data)
    features = pd.concat([adjusted_ratio_data, returns_data], axis=1)

    return features

In [5]:
#get_historical_returns('AM', '2015-08-30', '2023-09-30', "monthly")

Section 2.2 Multi Asset Data Processing

In [6]:
def multi_df(path_to_file, ticker_list, start_date, end_date):
    """Stack the per-company feature frames into one multi-index DataFrame.

    Each ticker in ``ticker_list`` is loaded with ``load_features``; the
    resulting frames are concatenated row-wise under a two-level index whose
    outer level is named ``'Company'`` and whose inner level is the frame's
    original index.
    """
    # Load everything first, keyed by ticker (a repeated ticker is kept once).
    per_ticker = {
        symbol: load_features(path_to_file, symbol, start_date, end_date)
        for symbol in ticker_list
    }

    # Tag every frame with its company and promote that tag to the outer
    # index level, keeping the existing index as the inner level.
    stacked_frames = [
        frame.assign(Company=symbol).set_index(['Company', frame.index])
        for symbol, frame in per_ticker.items()
    ]

    return pd.concat(stacked_frames)

**Section 3: LSTM Machine Learning Estimator**  
Section 3.1 Pre-Processing

In [7]:
def create_sequences(features, targets, seq_length):
    """Slice sliding windows out of ``features`` and pair each with the next target.

    Window ``k`` holds rows ``k`` .. ``k + seq_length - 1`` of ``features``;
    its label is the row of ``targets`` at position ``k + seq_length``
    (e.g. four quarters used to predict the following quarter).

    Returns a pair ``(windows, labels)`` of NumPy arrays. When ``features``
    has no more than ``seq_length`` rows both arrays are empty.
    """
    window_starts = range(len(features) - seq_length)

    windows = [features[start:start + seq_length] for start in window_starts]
    labels = [targets.iloc[start + seq_length] for start in window_starts]

    return np.array(windows), np.array(labels)

ChatGPT 15 Stock Test Case

Step 1: Define Input and Parameters

## Input Path

In [8]:
# 1. File Path: Drive folder holding one '<ticker>.xlsx' workbook per stock.
#    The trailing '/' matters because the ticker is appended by plain string
#    concatenation in load_features.
path_to_file_gpt = "/content/drive/My Drive/MIE479 Capstone Project/Machine Learning Estimator/LSTM/ChatGPT Tickers/"
# 2. Ticker List: the 15 stocks of this test case
ChatGPT_stocks = ['AAPL', 'MSFT', 'AMZN', 'GOOGL', 'JNJ', 'V', 'PG', 'JPM', 'UNH', 'MA', 'INTC', 'VZ', 'GOOG', 'HD', 'T']
# 3. Target Time Frame: start/end dates of the in-sample data passed to multi_df
start_date = '2016-09-30'
end_date = '2021-09-30'
# 4. Sequence Length: number of consecutive rows per window in create_sequences
seq_length = 6
# 5. Training and Validation Set Split Ratio
train_ratio = 0.8
# 6. Num Epoch and Num Batch
# NOTE(review): not referenced by any cell visible in this notebook; presumably
# left over from an LSTM training step. Confirm before removing.
num_epoch = 20
num_batch = 12

Step 2: Pre-Processing

In [9]:
# Loading Phase: build the multi-company feature frame for the in-sample window.
# Slow (one Excel read and one price download per ticker), so avoid re-running.
final_df_gpt = multi_df(path_to_file_gpt, ChatGPT_stocks, start_date, end_date)




[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

In [10]:
# Alias the loaded frame and pull its 'Return' column out as a one-column
# DataFrame that keeps the (Company, date) index.
multi_df_gpt = final_df_gpt
multi_return_gpt = pd.DataFrame(multi_df_gpt['Return'])

In [11]:
# Build fixed-length windows over the stacked multi-company frame.
# NOTE(review): the frame lists the companies one after another, so windows
# that start near the end of one company's rows also contain rows of the next
# company. Confirm this is intended.
features = multi_df_gpt
targets = multi_return_gpt
multi_X_gpt, multi_y_gpt = create_sequences(features, targets, seq_length)
# Pivot the returns to wide form: index level 0 ('Company') becomes the
# columns, leaving one row per date.
targets2 = targets['Return'].unstack(level=0)
# (date, company) pairs with no return are treated as a zero return.
targets_input = targets2.fillna(0)

Step 3: Train Model on Training and Validation Sets

In [12]:
from statsmodels.tsa.api import VAR

# Chronological split: the first `train_ratio` share of the dates fits the
# VAR, the remaining dates are held out for evaluation.
train_size = int(len(targets_input) * train_ratio)
X_train, X_test = targets_input[:train_size], targets_input[train_size:]

model_var = VAR(X_train)
result_var = model_var.fit()

# forecast() rolls forward from the last `k_ar` observations it is handed.
# Seeding it with the end of the TRAINING data makes the forecast cover the
# held-out dates. Seeding with X_test (as before) forecast the period after
# the test set and then scored it against the test set itself.
predictions_VAR = result_var.forecast(X_train.values[-result_var.k_ar:], steps=len(X_test))
forecast_df_VAR = pd.DataFrame(predictions_VAR, columns=targets_input.columns, index=X_test.index)

# Per-company mean squared error over the held-out dates. The explicit axis
# keeps the column-wise result that np.mean(DataFrame) produced and avoids its
# FutureWarning about axis=None.
mse_VAR = ((X_test - forecast_df_VAR) ** 2).mean(axis=0)

  self._init_dates(dates, freq)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


Step 4: Evaluate the Model on Test Set

In [13]:
# Start/end dates of the three evaluation windows (all end on July 31 2023).
# Test Period 1: September 30 2021 to July 31 2023
# Test Period 2: March 14, 2023 to July 31 2023
# Test Period 3: May 01 2023 to July 31 2023

start_t1 = '2021-09-30'
end_t1 = '2023-07-31'

start_t2 = '2023-03-14'
end_t2 = '2023-07-31'

start_t3 = '2023-05-01'
end_t3 = '2023-07-31'

In [None]:
# Loading Test Phase: Took a while to run this (Don't Rerun)
# One multi-company frame per test window, loaded in order t1, t2, t3.
final_df_gpt_t1, final_df_gpt_t2, final_df_gpt_t3 = (
    multi_df(path_to_file_gpt, ChatGPT_stocks, window_start, window_end)
    for window_start, window_end in (
        (start_t1, end_t1),
        (start_t2, end_t2),
        (start_t3, end_t3),
    )
)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

In [None]:
# Short aliases for the three loaded test-period frames.
multi_df_t1, multi_df_t2, multi_df_t3 = final_df_gpt_t1, final_df_gpt_t2, final_df_gpt_t3

# One-column 'Return' frame for each test period, same (Company, date) index.
multi_return_t1, multi_return_t2, multi_return_t3 = (
    pd.DataFrame(period_frame['Return'])
    for period_frame in (multi_df_t1, multi_df_t2, multi_df_t3)
)


**Evaluation for Each Test Period**

In [None]:
# Wide realised-return frame for test period 1 (rows: dates, columns:
# companies). It is used here only for its length and date labels.
multi_return_t1_transposed = multi_return_t1['Return'].unstack(level=0)

# Out-of-sample forecast: seed the fitted VAR with the last `k_ar` rows of the
# in-sample data, which ends on the date test period 1 starts. Seeding with
# the test-period returns themselves (as before) leaked the values being
# predicted and let their NaNs propagate into every forecast step.
predictions_VAR_t1 = result_var.forecast(
    targets_input.values[-result_var.k_ar:],
    steps=len(multi_return_t1_transposed),
)
# Columns follow the order the model was fitted on.
# NOTE: this rebinds forecast_df_VAR, replacing the hold-out forecast frame.
forecast_df_VAR = pd.DataFrame(
    predictions_VAR_t1,
    columns=targets_input.columns,
    index=multi_return_t1_transposed.index,
)

In [None]:
# Output the average forecast return per company and the covariance matrix.
# forecast_df_VAR has one column per company, so the column-wise mean gives one
# average forecast return per company; it is reshaped into a column vector.
# The covariance is taken between the companies' forecast series.
average_returns_t1 = forecast_df_VAR.mean().values.reshape(-1, 1)
cov_matrix_t1 = forecast_df_VAR.cov()
print(average_returns_t1)


**For Test Period 2: March 14, 2023 to July 31 2023**


In [None]:
# Restrict the test-period-1 forecast to the dates from the start of test
# period 2 onwards. The boundary comes from the config constant `start_t2`
# rather than a repeated date literal, so the two cannot drift apart.
filtered_df_2 = forecast_df_VAR.loc[start_t2:]
# Per-company average forecast return (column vector) and covariance matrix.
average_returns_t2 = filtered_df_2.mean().values.reshape(-1, 1)
cov_matrix_t2 = filtered_df_2.cov()


**For Test Period 3: May 01 2023 to July 31 2023**


In [None]:
# Restrict the test-period-1 forecast to the dates from the start of test
# period 3 onwards. The boundary comes from the config constant `start_t3`
# rather than a repeated date literal, so the two cannot drift apart.
filtered_df_3 = forecast_df_VAR.loc[start_t3:]
# Per-company average forecast return (column vector) and covariance matrix.
average_returns_t3 = filtered_df_3.mean().values.reshape(-1, 1)
cov_matrix_t3 = filtered_df_3.cov()