# Library

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import yfinance as yf

from sklearn.preprocessing import MinMaxScaler

from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense

import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import warnings
warnings.filterwarnings('ignore')

# Parameters

In [None]:
# Default experiment configuration (module-level globals).
ticker = "AAPL"  # default stock symbol
lag = 15  # number of past rows in each input window of LSTM_scaled_multi_model
start_date = "2018-01-01"  # same value as the prep_finance default start date
end_date = "2023-04-04"  # same value as the prep_finance default end date
train_test_ratio = 0.8  # share of the windowed samples used for training
num_epochs = 200  # epochs handed to model.fit by the summary_* functions

# Functions

## Data

### Finance Data

In [None]:
def prep_finance(ticker, start_date="2018-01-01", end_date="2023-04-04"):
    """Download daily price history for ``ticker`` plus the Dow Jones close.

    Args:
        ticker: Symbol handed to ``yf.Ticker``.
        start_date: Start of the requested date range.
        end_date: End of the requested date range.

    Returns:
        DataFrame indexed by the yfinance history index. It holds the
        ticker's history columns without "Dividends" and "Stock Splits",
        plus a "DJI" column with the ``^DJI`` close. Only dates present in
        both downloads are kept (inner join) and rows containing NaN are
        dropped.
    """
    def _daily_history(symbol):
        # The original also passed period="1d". With an explicit start/end
        # the period is redundant, and some yfinance releases may reject the
        # combination -- verify against the installed version. The bar size
        # is what "1d" was presumably meant to select, so it is passed as
        # ``interval`` instead.
        return yf.Ticker(symbol).history(
            interval="1d", start=start_date, end=end_date
        )

    # Corporate-action columns are not used as model features.
    ticker_df = _daily_history(ticker).drop(
        columns=["Dividends", "Stock Splits"], errors="ignore"
    )

    # Control variable: Dow Jones Industrial Average close, named "DJI".
    dji_close = _daily_history("^DJI")["Close"].rename("DJI")

    merged = pd.merge(ticker_df, dji_close, left_index=True, right_index=True)

    return merged.dropna()

### Text Data

In [None]:
# Locations of the pre-computed sentiment spreadsheets, one per sentiment
# model; each is loaded with pandas.read_excel.
# NOTE(review): "/content/..." looks like a Google Colab path -- adjust when
# running elsewhere.
finbert_path = "/content/output_FinBert_final.xlsx"
textblob_path = "/content/output_TextBlob.xlsx"
vader_path = "/content/output_VADER.xlsx"
flair_path = "/content/Result_Flair_v1.xlsx"

In [None]:
# FinBERT: the "Neutral" column is dropped to avoid multicollinearity.
finbert = pd.read_excel(finbert_path, index_col=0).drop(columns=["Neutral"])

# TextBlob: used as loaded.
textblob = pd.read_excel(textblob_path, index_col=0)

# VADER: only the columns left after removing the neg/pos/neu components are
# kept, again to avoid multicollinearity.
vader = pd.read_excel(vader_path, index_col=0).drop(
    columns=["vader_neg", "vader_pos", "vader_neu"]
)

# Flair: move the index back into a column and rename the headers to the
# "date" / "stock_name" / "report_type" names that prep_text expects.
flair = (
    pd.read_excel(flair_path, index_col=0)
    .reset_index()
    .rename(
        columns={
            "Date": "date",
            "Stock name": "stock_name",
            "Report type": "report_type",
        }
    )
)
# Fold label and score into one number: keep the score for "POSITIVE" rows,
# use 1 - score for every other label.
flair["flair_sentiment"] = flair["Sentiment score"].where(
    flair["Sentiment value"] == "POSITIVE",
    1 - flair["Sentiment score"],
)
flair = flair.drop(columns=["Sentiment score"])

In [None]:
def prep_text(df, feature_lst, ticker):
    """Average one ticker's sentiment features per date and report type.

    Args:
        df: DataFrame with "stock_name", "date" and "report_type" columns
            plus the columns named in ``feature_lst``.
        feature_lst: Names of the numeric feature columns to aggregate.
        ticker: Value matched against the "stock_name" column.

    Returns:
        DataFrame indexed by "date" that contains exactly the columns in
        ``feature_lst``, each averaged over the rows sharing the same date
        and report type. A date with several report types appears once per
        report type.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    group_keys = ["date", "report_type"]
    features = list(feature_lst)

    # Restrict to the requested features before aggregating: averaging every
    # column would hit non-numeric ones such as "stock_name" and would let
    # unrelated columns leak into the merged feature table.
    rows = df.loc[df["stock_name"] == ticker, group_keys + features]
    averaged = rows.groupby(group_keys, as_index=False)[features].mean()

    return averaged.set_index("date").drop(columns=["report_type"])

In [None]:
def prep_all_text(ticker):
    """Build one table with every sentiment feature for ``ticker``.

    Runs ``prep_text`` on the module-level ``finbert``, ``textblob``,
    ``vader`` and ``flair`` frames and outer-joins the four results on
    their index, in that order.

    Args:
        ticker: Stock symbol forwarded to ``prep_text``.

    Returns:
        DataFrame with the union of the index values of the four sources.
    """
    sources = [
        (finbert, ["Positive", "Negative"]),
        (textblob, ["tb_polarity", "tb_subjectivity"]),
        (vader, ["vader_compound"]),
        (flair, ["flair_sentiment"]),
    ]
    per_source = [prep_text(frame, columns, ticker) for frame, columns in sources]

    combined = per_source[0]
    for extra in per_source[1:]:
        combined = combined.merge(
            extra, left_index=True, right_index=True, how='outer'
        )

    return combined

### Combine Data

In [None]:
def prep_all_feature(ticker, start_date="2018-01-01", end_date="2023-04-04"):
    """Join price data and sentiment features for ``ticker`` by calendar day.

    Args:
        ticker: Stock symbol forwarded to ``prep_finance`` and
            ``prep_all_text``.
        start_date: Start of the price history, forwarded to
            ``prep_finance``.
        end_date: End of the price history, forwarded to ``prep_finance``.

    Returns:
        DataFrame indexed by "YYYY-MM-DD" strings with the price columns
        followed by the sentiment columns. Sentiment values are carried
        forward from the most recent earlier row, and rows that still
        contain NaN (those before the first sentiment value) are dropped.
    """
    fin_data = prep_finance(ticker, start_date, end_date)
    text_data = prep_all_text(ticker)

    # Both sides are keyed on plain date strings so that the timezone-aware
    # price index and the report dates can be matched.
    fin_data.index = fin_data.index.strftime("%Y-%m-%d")
    # to_datetime makes this work even if the report dates were loaded as
    # strings or generic objects rather than as a DatetimeIndex.
    text_data.index = pd.to_datetime(text_data.index).strftime("%Y-%m-%d")

    res = pd.merge(fin_data, text_data, left_index=True, right_index=True, how="left")

    # DataFrame.ffill replaces the deprecated fillna(method='ffill').
    res = res.ffill().dropna()

    return res

## Model

In [None]:
def LSTM_scaled_multi_model(df, lag, train_test_ratio, epoch):
    """Fit a two-layer LSTM on lagged, min-max-scaled features to predict "Close".

    Every sample consists of the ``lag`` rows preceding a target row; the
    target is the scaled "Close" of that row. The first ``cutoff`` samples
    are used for training and the rest for testing (chronological split,
    no shuffling of the split itself).

    Both scalers are fitted on the rows used by the training samples only,
    so the minimum/maximum of the test period does not leak into training.
    Scaled test values can therefore fall outside [0, 1].

    Args:
        df: DataFrame of numeric features that includes a "Close" column.
        lag: Number of past rows in each input window (>= 1).
        train_test_ratio: Fraction of the samples used for training.
        epoch: Number of training epochs.

    Returns:
        Tuple ``(y_true, y_pred, cutoff)``: the test targets and the test
        predictions, both as ``(n_test, 1)`` arrays in the original "Close"
        units, and the number of training samples.

    Raises:
        KeyError: If ``df`` has no "Close" column.
        ValueError: If ``df`` is too short for ``lag`` or the split leaves
            the training or the test set empty.
    """
    if "Close" not in df.columns:
        raise KeyError('df must contain a "Close" column')

    n_rows = df.shape[0]
    n_samples = n_rows - lag
    if lag < 1 or n_samples < 1:
        raise ValueError(
            f"need lag >= 1 and more than lag rows (lag={lag}, rows={n_rows})"
        )

    cutoff = int(train_test_ratio * n_samples)
    if not 0 < cutoff < n_samples:
        raise ValueError(
            f"train_test_ratio={train_test_ratio} leaves an empty train or "
            f"test set for {n_samples} samples"
        )

    values = df.to_numpy(dtype=float)
    close_idx = df.columns.get_loc("Close")

    # Training windows and targets touch rows 0 .. cutoff + lag - 1.
    n_train_rows = cutoff + lag

    # scale data (statistics from the training rows only)
    scaler = MinMaxScaler()
    scaler.fit(values[:n_train_rows])
    scaled = scaler.transform(values)

    # Separate scaler for "Close" so predictions can be mapped back to
    # prices; fitted on the same rows, so it matches the "Close" column of
    # ``scaler``.
    scaler_pred = MinMaxScaler()
    scaler_pred.fit(values[:n_train_rows, [close_idx]])

    # reshape inputs: one (num_features, lag) window per target row
    # NOTE(review): Keras LSTM layers read input as (timesteps, features);
    # the transposed layout makes the features act as timesteps. Kept as in
    # the original -- confirm whether this is intended.
    X = np.stack([scaled[i - lag:i].T for i in range(lag, n_rows)])
    y = scaled[lag:, close_idx]

    # split train and test
    X_train, X_test = X[:cutoff], X[cutoff:]
    y_train, y_test = y[:cutoff], y[cutoff:]

    # define model
    n_neurons = X_train.shape[1] * X_train.shape[2]  # num_features x lag
    model = Sequential()
    model.add(LSTM(n_neurons, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(LSTM(n_neurons, return_sequences=False))
    model.add(Dense(5))
    model.add(Dense(1))
    model.compile(optimizer="adam", loss="mse")

    # fit model
    model.fit(X_train, y_train, epochs=epoch, verbose=0)

    # predict
    y_pred = model.predict(X_test, verbose=0)

    # unscale
    y_true = scaler_pred.inverse_transform(y_test.reshape(-1, 1))
    y_pred = scaler_pred.inverse_transform(y_pred)

    return y_true, y_pred, cutoff

## Evaluation

In [None]:
def plot_pred(y_true, y_pred, df):
    """Plot true and predicted prices against the trailing dates of ``df``.

    Args:
        y_true: Sequence of true values.
        y_pred: Sequence of predicted values, same length as ``y_true``.
        df: DataFrame whose index holds the dates (date strings or
            datetimes); its last ``len(y_true)`` entries label the x-axis.

    Raises:
        ValueError: If ``df`` has fewer rows than ``y_true`` has values.
    """
    n_points = len(y_true)
    if n_points > len(df.index):
        raise ValueError("df has fewer rows than there are values to plot")

    # Use the ``df`` argument: the previous body read an undefined name
    # ``data``. The explicit start offset also copes with n_points == 0.
    tail = df.index[len(df.index) - n_points:]
    x = pd.to_datetime(tail).to_pydatetime()

    # create a new figure and axis
    fig, ax = plt.subplots()

    # plot the two arrays on the same axis
    ax.plot(x, y_true, label="true")
    ax.plot(x, y_pred, label="pred")

    # add legend and axis labels, one tick per month
    ax.legend()
    ax.set_xlabel("Date")
    ax.set_ylabel("Price")
    ax.xaxis.set_major_locator(mdates.MonthLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%m/%Y"))

    # display the plot
    plt.show()

In [None]:
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between two equally sized inputs.

    Both inputs are flattened before comparison, so an ``(n, 1)`` array and
    an ``(n,)`` array (or a plain list) are compared element by element
    instead of being broadcast against each other.

    Args:
        y_true: Array-like of true values.
        y_pred: Array-like of predicted values with the same number of
            elements as ``y_true``.

    Returns:
        The RMSE as a NumPy float.

    Raises:
        ValueError: If the inputs hold a different number of elements.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.size != predicted.size:
        raise ValueError(
            f"y_true has {actual.size} values but y_pred has {predicted.size}"
        )

    rmse = np.sqrt(np.mean((predicted - actual) ** 2))
    return rmse

# Results

In [None]:
# Configuration for the result runs. lag, train_test_ratio and num_epochs
# are read as globals by the summary_* functions; the values repeat the ones
# set in the Parameters section.
ticker_lst = ["AAPL", "MSFT", "V", "UNH", "JPM", "JNJ", "WMT", "PG", "CVX", "HD"]  # symbols evaluated
lag = 15  # input window length in rows
start_date = "2018-01-01"  # same value as the prep_finance default start date
end_date = "2023-04-04"  # same value as the prep_finance default end date
train_test_ratio = 0.8  # share of the samples used for training
num_epochs = 200  # training epochs per model

In [None]:
# Sentiment feature columns, in the order prep_all_text merges them:
# FinBERT, TextBlob, VADER, Flair.
feature_lst = ["Positive", "Negative", "tb_polarity", "tb_subjectivity", "vader_compound", "flair_sentiment"]

In [1]:
# Baseline: price, volume and DJI features only (no sentiment features)

def summary_baseline(ticker_lst):
    """Train the price-only model for each ticker and tabulate its RMSE.

    Reads the module-level ``lag``, ``train_test_ratio`` and ``num_epochs``.

    Args:
        ticker_lst: Stock symbols to evaluate.

    Returns:
        DataFrame with a 'Company' column and an 'RMSE' column, one row per
        ticker.
    """
    price_cols = ["Open", "High", "Low", "Close", "Volume", "DJI"]

    scores = []
    for symbol in ticker_lst:
        features = prep_all_feature(symbol)
        actual, predicted, _ = LSTM_scaled_multi_model(
            features[price_cols], lag, train_test_ratio, num_epochs
        )
        scores.append(RMSE(actual, predicted))

    return pd.DataFrame({'Company': ticker_lst, 'RMSE': scores})

# Train one baseline model per ticker and display the resulting RMSE table.
rmse_baseline = summary_baseline(ticker_lst)
rmse_baseline

In [None]:
# Write the baseline RMSE table to an Excel file in the working directory.
rmse_baseline.to_excel("rmse_baseline.xlsx") 

In [2]:
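# Baseline features plus sentiment from the 8-K, 10-K and 10-Q reports;
# sentiment values are held constant (forward-filled) between reports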

def summary_all_constant(ticker_lst):
    """Tabulate the RMSE of the price model extended by each sentiment source.

    For every ticker, four models are trained in turn -- price columns plus
    the FinBERT, TextBlob, VADER or Flair columns. Reads the module-level
    ``lag``, ``train_test_ratio`` and ``num_epochs``.

    Args:
        ticker_lst: Stock symbols to evaluate.

    Returns:
        DataFrame with a 'Company' column followed by 'RMSE_finbert',
        'RMSE_textblob', 'RMSE_vader' and 'RMSE_flair', one row per ticker.
    """
    price_cols = ["Open", "High", "Low", "Close", "Volume", "DJI"]
    # Result column -> sentiment columns appended to the price columns.
    sentiment_cols = {
        'RMSE_finbert': ["Positive", "Negative"],
        'RMSE_textblob': ["tb_polarity", "tb_subjectivity"],
        'RMSE_vader': ["vader_compound"],
        'RMSE_flair': ["flair_sentiment"],
    }
    scores = {column: [] for column in sentiment_cols}

    for symbol in ticker_lst:
        features = prep_all_feature(symbol)
        for column, extra_cols in sentiment_cols.items():
            actual, predicted, _ = LSTM_scaled_multi_model(
                features[price_cols + extra_cols],
                lag,
                train_test_ratio,
                num_epochs,
            )
            scores[column].append(RMSE(actual, predicted))

    return pd.DataFrame({'Company': ticker_lst, **scores})

# Train the four sentiment-augmented models per ticker and display the
# resulting RMSE table.
rmse_all_constant = summary_all_constant(ticker_lst)
rmse_all_constant

In [None]:
# Write the sentiment-model RMSE table to an Excel file in the working directory.
rmse_all_constant.to_excel("rmse_all_constant.xlsx") 