### Import Libraries

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import math
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

## Functions

### Finance Data

In [None]:
def prep_finance(ticker, start_date="2018-01-01", end_date="2023-04-04"):
    """Download daily price history for ``ticker`` and attach the Dow Jones close.

    Args:
        ticker: Symbol handed to ``yf.Ticker``.
        start_date: ``start`` argument of ``Ticker.history`` ("YYYY-MM-DD").
        end_date: ``end`` argument of ``Ticker.history`` ("YYYY-MM-DD").

    Returns:
        The ticker's history frame without the "Dividends" and "Stock Splits"
        columns, inner-joined on the index with the ``^DJI`` close (stored in
        a "DJI" column), with rows containing NaN removed.
    """

    def _daily_history(symbol):
        # `interval` selects the bar size. The original passed period="1d",
        # but `period` is a look-back range that is an alternative to an
        # explicit start/end window, not the bar size.
        return yf.Ticker(symbol).history(
            interval="1d", start=start_date, end=end_date
        )

    # Corporate-action columns are not used as model features.
    ticker_df = _daily_history(ticker).drop(columns=["Dividends", "Stock Splits"])

    # Control variable: the index close, renamed so it cannot clash with the
    # ticker's own "Close" column in the merge.
    dji_close = _daily_history("^DJI")["Close"].rename("DJI")
    ticker_df = pd.merge(ticker_df, dji_close, left_index=True, right_index=True)

    return ticker_df.dropna()

### Text Data

In [None]:
# Excel workbooks read with pd.read_excel in the next cell, one per
# sentiment-scoring tool (listed alphabetically).
finbert_path = "output_FinBert_final.xlsx"
flair_path = "Result_Flair_v1.xlsx"
textblob_path = "output_TextBlob.xlsx"
vader_path = "output_VADER.xlsx"

In [None]:
# FinBERT scores; "Neutral" is dropped to avoid multicollinearity with the
# remaining score columns.
finbert = pd.read_excel(finbert_path, index_col=0).drop(columns=["Neutral"])

# TextBlob scores are used as stored.
textblob = pd.read_excel(textblob_path, index_col=0)

# VADER scores; "vader_neu" is dropped for the same multicollinearity reason.
vader = pd.read_excel(vader_path, index_col=0).drop(columns=["vader_neu"])

# Flair: move the index back into a column and rename the key columns to the
# "date" / "stock_name" / "report_type" names that prep_text looks up.
flair = (
    pd.read_excel(flair_path, index_col=0)
    .reset_index()
    .rename(
        columns={
            "Date": "date",
            "Stock name": "stock_name",
            "Report type": "report_type",
        }
    )
)
# Fold label and score into one number: rows labelled "POSITIVE" keep their
# score, every other row becomes 1 - score.
flair["flair_sentiment"] = flair["Sentiment score"].where(
    flair["Sentiment value"] == "POSITIVE", 1 - flair["Sentiment score"]
)
flair = flair.drop(columns=["Sentiment score"])

In [None]:
def prep_text(df, feature_lst, ticker):
    """Average one ticker's numeric scores per (date, report type).

    Args:
        df: Frame with "stock_name", "date" and "report_type" columns plus
            the score columns.
        feature_lst: Names of the score columns. Not used for selection
            (every numeric column is averaged); kept so existing callers
            keep working.
        ticker: Value matched against the "stock_name" column.

    Returns:
        Frame indexed by "date" holding the mean of each numeric column.
        A date with several report types appears once per report type, so
        the index may contain duplicate dates.
    """
    ticker_rows = df[df["stock_name"] == ticker]

    # numeric_only=True: "stock_name" (and any other text column) cannot be
    # averaged. pandas >= 2.0 raises TypeError for such columns instead of
    # silently dropping them as older versions did.
    averaged = ticker_rows.groupby(["date", "report_type"], as_index=False).mean(
        numeric_only=True
    )

    return averaged.set_index("date").drop(columns=["report_type"])

In [None]:
def prep_all_text(ticker):
    """Build one frame holding every sentiment tool's scores for ``ticker``.

    Each source frame is reduced with ``prep_text`` and the results are
    outer-merged on their index one after another (FinBERT, TextBlob, VADER,
    Flair), so an index value present in any source is kept.
    """
    sources = (
        (finbert, ["Positive", "Negative"]),
        (textblob, ["tb_polarity", "tb_subjectivity"]),
        (vader, ["vader_neg", "vader_pos", "vader_compound"]),
        (flair, ["flair_sentiment"]),
    )
    per_tool = [prep_text(frame, columns, ticker) for frame, columns in sources]

    combined = per_tool[0]
    for scores in per_tool[1:]:
        combined = pd.merge(
            combined, scores, left_index=True, right_index=True, how="outer"
        )
    return combined

### Combine Data

In [None]:
def prep_all_feature(ticker):
    """Join price history and sentiment scores for ``ticker`` into one frame.

    Returns:
        The ``prep_finance`` frame left-joined with the ``prep_all_text``
        frame on a "YYYY-MM-DD" string index. Missing values are
        forward-filled and rows that still contain NaN are dropped.
    """
    fin_data = prep_finance(ticker)
    text_data = prep_all_text(ticker)

    # Both indexes are converted to "YYYY-MM-DD" strings so that the merge
    # keys have the same form on each side.
    fin_data.index = fin_data.index.strftime("%Y-%m-%d")
    text_data.index = text_data.index.strftime("%Y-%m-%d")

    # Left join: keep every row of the price frame.
    res = pd.merge(fin_data, text_data, left_index=True, right_index=True, how="left")

    # Carry the latest available values forward, then drop the leading rows
    # that have nothing to carry. DataFrame.ffill() replaces the deprecated
    # fillna(method="ffill") spelling.
    return res.ffill().dropna()

In [None]:
def split_data(stock, lookback, test_fraction=0.1):
    """Cut ``stock`` into sliding windows and split them chronologically.

    Every window holds ``lookback`` consecutive rows. The first
    ``lookback - 1`` rows of a window are the model input and its last row
    is the target.

    Args:
        stock: Object with ``to_numpy()`` returning a 2-D (rows, features)
            array, e.g. a DataFrame.
        lookback: Number of consecutive rows per window.
        test_fraction: Share of the windows, taken from the end, used as the
            test set. Defaults to 0.1.

    Returns:
        ``[x_train, y_train, x_test, y_test]`` where the ``x`` arrays have
        shape (windows, lookback - 1, features) and the ``y`` arrays have
        shape (windows, features).

    Raises:
        ValueError: If ``lookback`` is below 1 or ``stock`` has fewer than
            ``lookback`` rows, so no window can be formed.
    """
    data_raw = stock.to_numpy()
    n_rows = len(data_raw)
    if lookback < 1 or n_rows < lookback:
        raise ValueError(
            f"need at least lookback={lookback} rows (lookback >= 1), got {n_rows}"
        )

    # "+ 1" so the window ending on the very last row is included; without
    # it the newest observation is never used as a target.
    windows = np.stack(
        [data_raw[start:start + lookback] for start in range(n_rows - lookback + 1)]
    )

    test_set_size = int(np.round(test_fraction * windows.shape[0]))
    train_set_size = windows.shape[0] - test_set_size

    # Chronological split: the most recent windows form the test set.
    x_train = windows[:train_set_size, :-1, :]
    y_train = windows[:train_set_size, -1, :]
    x_test = windows[train_set_size:, :-1, :]
    y_test = windows[train_set_size:, -1, :]
    return [x_train, y_train, x_test, y_test]

### GRU Model

In [None]:
class GRU(nn.Module):
    """Stacked GRU followed by a linear layer applied to the last time step.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the GRU hidden state.
        output_dim: Number of values produced per sequence.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.gru = nn.GRU(input_dim, hidden_dim, num_layers, batch_first=True, dropout=0)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_dim) to (batch, output_dim)."""
        # With no initial state given, nn.GRU starts from zeros that match
        # the input's device and dtype. A hand-built torch.zeros(...) lives
        # on the default device/dtype and breaks for GPU or float64 inputs.
        out, _ = self.gru(x)
        # Only the hidden output of the final time step feeds the head.
        return self.fc(out[:, -1, :])

In [None]:
def train(model, train_inputs, train_targets, num_epochs, criterion, optimizer):
    """Run ``num_epochs`` full-batch optimisation steps.

    Args:
        model: Module called as ``model(train_inputs)``.
        train_inputs: Input tensor, passed to the model in one batch.
        train_targets: Target tensor compared with the model output.
        num_epochs: Number of optimisation steps.
        criterion: Loss callable ``criterion(outputs, targets)``.
        optimizer: Optimizer bound to the model's parameters.

    Returns:
        List with the loss value (Python float) of every epoch.
    """
    # evaluate() switches the model to eval mode; switch back so that
    # repeated train/evaluate cycles really train in training mode.
    model.train()

    train_losses = []
    for _ in range(num_epochs):
        optimizer.zero_grad()
        loss = criterion(model(train_inputs), train_targets)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
    return train_losses

In [None]:
def evaluate(model, test_inputs, test_targets, criterion):
    """Predict on the test inputs without tracking gradients.

    The model is switched to eval mode and left in that mode.

    Args:
        model: Module called as ``model(test_inputs)``.
        test_inputs: Input tensor.
        test_targets: Target tensor, returned unchanged.
        criterion: Unused. The loss was previously computed and thrown away;
            the parameter is kept so existing calls keep working.

    Returns:
        Tuple ``(test_targets, outputs)``.
    """
    model.eval()
    with torch.no_grad():
        outputs = model(test_inputs)
    return test_targets, outputs

### Parameters

In [None]:
# Network shape passed to GRU(...).
input_dim = 7       # features per time step for the model built in the next cell
output_dim = 1      # values predicted per sequence
hidden_dim = 64     # GRU hidden-state size
num_layers = 2      # stacked GRU layers

# Number of full-batch optimisation steps run by train().
num_epochs = 180

In [None]:
# Seed before building the model so its initial weights are reproducible.
torch.manual_seed(42)
model = GRU(input_dim, hidden_dim, output_dim, num_layers)
# Mean-squared-error loss ("mean" is the default reduction).
criterion = nn.MSELoss()
optimizer = torch.optim.RMSprop(params=model.parameters(), lr=1e-4)

### Results

In [None]:
def result(model, num_epochs, criterion, optimizer, feature_lst,
           company_name=None, lookback=15):
    """Train and score ``model`` on each company; return the test RMSEs.

    For every ticker the selected columns are min-max scaled to [0, 1],
    cut into windows by ``split_data``, used to train the model, and the
    test predictions of "Close" are scaled back before the RMSE is taken.

    Args:
        model: Network whose input size equals ``len(feature_lst)``.
        num_epochs: Epochs passed to ``train`` for each ticker.
        criterion: Loss callable passed to ``train`` / ``evaluate``.
        optimizer: Optimizer bound to ``model``.
        feature_lst: Columns fed to the model; must contain "Close", which
            is the prediction target.
        company_name: Tickers to process. Defaults to the ten tickers the
            notebook was written for.
        lookback: Window length passed to ``split_data`` (default 15).

    Returns:
        List with one RMSE per ticker, in unscaled "Close" units.

    Raises:
        ValueError: If "Close" is not in ``feature_lst``.

    NOTE(review): the same ``model`` and ``optimizer`` objects are reused for
    every ticker, so each ticker's training starts from the weights left by
    the previous one - confirm this is intended.
    NOTE(review): the scalers are fit on the whole series, test rows
    included, so test-period minima/maxima influence the training inputs.
    """
    if company_name is None:
        company_name = ["AAPL", "MSFT", "V", "UNH", "JPM", "JNJ", "WMT", "PG", "CVX", "HD"]

    feature_lst = list(feature_lst)
    # Locate the target by name instead of assuming it is the first column.
    target_idx = feature_lst.index("Close")

    output = []
    for ticker in company_name:
        data = prep_all_feature(ticker)[feature_lst].copy()

        # Each column gets its own scaler; only the "Close" scaler is kept
        # because it is needed to map predictions back to price units.
        target_scaler = MinMaxScaler(feature_range=(0, 1))
        for col in feature_lst:
            scaler = target_scaler if col == "Close" else MinMaxScaler(feature_range=(0, 1))
            data[col] = scaler.fit_transform(data[col].values.reshape(-1, 1)).ravel()

        x_train, y_train, x_test, y_test = split_data(data, lookback)
        x_train = torch.from_numpy(x_train).type(torch.Tensor)
        x_test = torch.from_numpy(x_test).type(torch.Tensor)
        y_train = torch.from_numpy(y_train[:, target_idx].reshape(-1, 1)).type(torch.Tensor)
        y_test = torch.from_numpy(y_test[:, target_idx].reshape(-1, 1)).type(torch.Tensor)

        train(model, x_train, y_train, num_epochs, criterion, optimizer)
        targets, predictions = evaluate(model, x_test, y_test, criterion)

        # Hand plain NumPy arrays to scikit-learn rather than torch tensors.
        actual = target_scaler.inverse_transform(targets.detach().cpu().numpy())
        predicted = target_scaler.inverse_transform(predictions.detach().cpu().numpy())
        output.append(math.sqrt(mean_squared_error(actual, predicted)))
    return output

In [None]:
# Sentiment feature columns, grouped by the frame they are taken from in
# prep_all_text.
feature_lst = [
    "Positive", "Negative",                      # finbert
    "tb_polarity", "tb_subjectivity",            # textblob
    "vader_neg", "vader_pos", "vader_compound",  # vader
    "flair_sentiment",                           # flair
]

In [None]:
# Price columns shared by every experiment; result() predicts "Close".
base_features = ["Close", "Open", "High", "Low", "Volume", "DJI"]

# Extra sentiment columns per experiment, in the order the experiments run
# (the order also fixes the column order of the exported table).
sentiment_features = {
    "Baseline": [],
    "VADER": ["vader_neg", "vader_pos", "vader_compound"],
    "Text Blob": ["tb_polarity", "tb_subjectivity"],
    "FinBERT": ["Positive", "Negative"],
    "Flair": ["flair_sentiment"],
}

final_output = {
    "Stock": ["AAPL", "MSFT", "V", "UNH", "JPM", "JNJ", "WMT", "PG", "CVX", "HD"],
}
for experiment, extra_columns in sentiment_features.items():
    experiment_features = base_features + extra_columns
    # A fresh model and optimizer per experiment; input_dim is derived from
    # the feature list so the two cannot drift apart.
    model = GRU(
        input_dim=len(experiment_features),
        hidden_dim=hidden_dim,
        output_dim=output_dim,
        num_layers=num_layers,
    )
    optimizer = torch.optim.RMSprop(model.parameters(), lr=0.0001)
    final_output[experiment] = result(
        model, num_epochs, criterion, optimizer, experiment_features
    )

In [None]:
# Turn the collected result lists into a table (one column per key of
# final_output) and write it to an Excel workbook.
final_output_df = pd.DataFrame.from_dict(final_output, orient="columns")
final_output_df.to_excel(excel_writer="GRU_stock_rmse.xlsx")