# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import json
import polars as pl

In [2]:
data_base_dir = "../Data"

# Load News Data - Fetched using Alpha Vantage Earlier

In [3]:
news_data = pl.read_json(f'{data_base_dir}/news_data.json')
news_data = news_data.to_pandas()

In [4]:
news_data.head(2)

Unnamed: 0,title,url,time_published,authors,summary,banner_image,source,category_within_source,source_domain,topics,overall_sentiment_score,overall_sentiment_label,ticker_sentiment,date
0,"Apple, AMC, Meta, And How Elon Musk Is Reactin...",https://www.benzinga.com/news/large-cap/22/12/...,20221231T190310,[Michael Cohen],Benzinga examined the prospects for many inves...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Trading,www.benzinga.com,"[{'topic': 'Financial Markets', 'relevance_sco...",-0.079733,Neutral,"[{'ticker': 'MSTR', 'relevance_score': '0.2478...",20221231
1,Bulls In A Bear Market: These 10 Stocks Clocke...,https://www.benzinga.com/analyst-ratings/analy...,20221231T161114,[Shanthi Rexaline],2022 would go down as one of the worst years f...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,General,www.benzinga.com,"[{'topic': 'Life Sciences', 'relevance_score':...",0.064077,Neutral,"[{'ticker': 'AAPL', 'relevance_score': '0.1137...",20221231


# Load Stock Data - To create Ground truth label for training

In [5]:
stock_data = pd.read_csv(f"{data_base_dir}/stock_data.csv")

In [6]:
stock_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2019-03-22,2859.5,2864.75,2805.25,2810.75,2810.75,2254842
1,2019-03-25,2813.0,2818.5,2789.5,2807.0,2807.0,1801326
2,2019-03-26,2808.5,2835.0,2806.0,2823.0,2823.0,1461068
3,2019-03-27,2823.5,2831.75,2791.75,2810.5,2810.5,1726746
4,2019-03-28,2807.0,2824.25,2795.0,2821.0,2821.0,1185882


In [7]:
stock_data.sort_values(by = 'Date', inplace=True)

In [8]:
stock_data['label'] = (stock_data['Close'] < stock_data['Close'].shift(-1)).astype(int)

In [9]:
stock_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,label
0,2019-03-22,2859.5,2864.75,2805.25,2810.75,2810.75,2254842,0
1,2019-03-25,2813.0,2818.5,2789.5,2807.0,2807.0,1801326,1
2,2019-03-26,2808.5,2835.0,2806.0,2823.0,2823.0,1461068,0
3,2019-03-27,2823.5,2831.75,2791.75,2810.5,2810.5,1726746,1
4,2019-03-28,2807.0,2824.25,2795.0,2821.0,2821.0,1185882,1


# Group the News Data - Based On Date

In [10]:
news_data.sort_values(by = 'time_published', inplace=True)

In [11]:
news_data = news_data[['date', 'title', 'summary']]

In [12]:
news_data.head()

Unnamed: 0,date,title,summary
6211,20220512,Pharma ETFs in Focus Post Q1 Earnings,Many industry bigwigs reported solid results w...
6210,20220519,4 Large Drug Stocks to Watch as the Industry R...,Drug/biotech companies are likely to see signi...
6209,20220520,"Zacks Industry Outlook Highlights Eli Lilly, N...","Eli Lilly, Novo Nordisk, Merck, and Glaxo are ..."
6208,20220523,Stocks making the biggest moves midday: JPMorg...,These are the stocks posting the largest moves...
6207,20220711,3 Red-Hot Dividends In A Down Market,It's not easy to be upbeat in a bear market...


In [13]:
news_data['title'] = news_data['title']
news_data['summary'] = news_data['summary'] + ' \n\n'

In [14]:
news_data['cumulative_title'] = news_data.groupby('date')['title'].transform(lambda x: x.cumsum())
news_data['cumulative_summary'] = news_data.groupby('date')['summary'].transform(lambda x: x.cumsum())

In [18]:
news_data

Unnamed: 0,date,title,summary,cumulative_title,cumulative_summary
6211,20220512,Pharma ETFs in Focus Post Q1 Earnings,Many industry bigwigs reported solid results w...,Pharma ETFs in Focus Post Q1 Earnings,Many industry bigwigs reported solid results w...
6210,20220519,4 Large Drug Stocks to Watch as the Industry R...,Drug/biotech companies are likely to see signi...,4 Large Drug Stocks to Watch as the Industry R...,Drug/biotech companies are likely to see signi...
6209,20220520,"Zacks Industry Outlook Highlights Eli Lilly, N...","Eli Lilly, Novo Nordisk, Merck, and Glaxo are ...","Zacks Industry Outlook Highlights Eli Lilly, N...","Eli Lilly, Novo Nordisk, Merck, and Glaxo are ..."
6208,20220523,Stocks making the biggest moves midday: JPMorg...,These are the stocks posting the largest moves...,Stocks making the biggest moves midday: JPMorg...,These are the stocks posting the largest moves...
6207,20220711,3 Red-Hot Dividends In A Down Market,It's not easy to be upbeat in a bear market......,3 Red-Hot Dividends In A Down Market,It's not easy to be upbeat in a bear market......
...,...,...,...,...,...
17393,20240322,"Dell, TJX And 2 Other Stocks Insiders Are Sell...",The Nasdaq 100 closed higher by around 80 poin...,"CHEMOURS INVESTOR NEWS: ROSEN, TOP RANKED INVE...","NEW YORK, March 21, 2024 ( GLOBE NEWSWIRE ) --..."
17392,20240322,Tesla Bull Ross Gerber Says DOJ's Apple Lawsui...,Tesla Inc. bull Ross Gerber has questioned the...,"CHEMOURS INVESTOR NEWS: ROSEN, TOP RANKED INVE...","NEW YORK, March 21, 2024 ( GLOBE NEWSWIRE ) --..."
13888,20240322,Tesla Bull Ross Gerber Says DOJ's Apple Lawsui...,Tesla Inc. bull Ross Gerber has questioned the...,"CHEMOURS INVESTOR NEWS: ROSEN, TOP RANKED INVE...","NEW YORK, March 21, 2024 ( GLOBE NEWSWIRE ) --..."
14574,20240322,Tesla Bull Ross Gerber Says DOJ's Apple Lawsui...,Tesla Inc. bull Ross Gerber has questioned the...,"CHEMOURS INVESTOR NEWS: ROSEN, TOP RANKED INVE...","NEW YORK, March 21, 2024 ( GLOBE NEWSWIRE ) --..."


# Load Text embedding model

In [None]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

checkpoint = 'sentence-transformers/paraphrase-TinyBERT-L6-v2' # 'sentence-transformers/all-MiniLM-L6-v2'

model = SentenceTransformer(checkpoint)

In [20]:
def generate_embedding(x):
    return model.encode(x)

In [21]:
tqdm.pandas()

news_data['cumulative_title_emb'] = news_data['cumulative_title'].progress_apply(generate_embedding)
news_data['cumulative_summary_emb'] = news_data['cumulative_summary'].progress_apply(generate_embedding)

100%|█████████████████████████████████████| 20384/20384 [06:46<00:00, 50.10it/s]
100%|█████████████████████████████████████| 20384/20384 [08:02<00:00, 42.21it/s]


# Merge News Data and Stock Data - Based on Date

In [25]:
news_data['Date'] = news_data['date'].apply(lambda x : x[:4] + '-' + x[4:6] + '-' + x[6:])
news_data.drop(columns = ['date'], inplace =True)

In [26]:
news_data.head()

Unnamed: 0,title,summary,cumulative_title,cumulative_summary,cumulative_title_emb,cumulative_summary_emb,Date
6211,Pharma ETFs in Focus Post Q1 Earnings,Many industry bigwigs reported solid results w...,Pharma ETFs in Focus Post Q1 Earnings,Many industry bigwigs reported solid results w...,"[-0.18760552, 0.25935495, -0.050723966, -0.092...","[0.110145286, 0.3496942, 0.24096532, 0.1784440...",2022-05-12
6210,4 Large Drug Stocks to Watch as the Industry R...,Drug/biotech companies are likely to see signi...,4 Large Drug Stocks to Watch as the Industry R...,Drug/biotech companies are likely to see signi...,"[-0.145372, -0.0041549024, 0.06780252, -0.0711...","[-0.11769274, -0.086127214, 0.040418006, -0.13...",2022-05-19
6209,"Zacks Industry Outlook Highlights Eli Lilly, N...","Eli Lilly, Novo Nordisk, Merck, and Glaxo are ...","Zacks Industry Outlook Highlights Eli Lilly, N...","Eli Lilly, Novo Nordisk, Merck, and Glaxo are ...","[0.2105611, 0.10151193, -0.11327079, 0.1774636...","[0.16020752, 0.07496309, -0.043075092, 0.19859...",2022-05-20
6208,Stocks making the biggest moves midday: JPMorg...,These are the stocks posting the largest moves...,Stocks making the biggest moves midday: JPMorg...,These are the stocks posting the largest moves...,"[-0.015674531, 0.0131594855, 0.2657702, 0.1695...","[-0.0655723, -0.47901604, 0.3199635, 0.0668567...",2022-05-23
6207,3 Red-Hot Dividends In A Down Market,It's not easy to be upbeat in a bear market......,3 Red-Hot Dividends In A Down Market,It's not easy to be upbeat in a bear market......,"[0.09879005, 0.13978297, 0.52456844, -0.058282...","[0.03242542, -0.2530861, 0.41749194, -0.197073...",2022-07-11


In [27]:
merged_df = pd.merge(news_data, stock_data, on = 'Date')

In [30]:
# Concatenate Features
series1 = merged_df['cumulative_title_emb'].apply(lambda x: x.reshape(1, -1))
series2 = merged_df['cumulative_summary_emb'].apply(lambda x: x.reshape(1, -1))

concatenated_features = np.concatenate([np.concatenate(list(series1.values), axis = 0), np.concatenate(list(series2.values), axis = 0)], axis = 1)

merged_df['features'] = list(concatenated_features)

In [32]:
final_df = merged_df[['features', 'Date', 'label']]

# Training

In [113]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle

import torch
import torch.nn as nn
import torch.optim as optim

In [82]:
split_date = '2024-01-01'
train_df = final_df[final_df['Date'] < split_date]
test_df = final_df[final_df['Date'] >= split_date]

In [83]:
X_train = list(train_df['features'])
y_train = train_df['label']
X_test = list(test_df['features'])
y_test = test_df['label']

In [84]:
X_train = np.array(X_train)
X_test = np.array(X_test)

In [85]:
y_train = y_train.values
y_test = y_test.values

## Random Forest Classifier

In [118]:
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state = 2024)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7095205003474635
Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.75      0.67      2317
           1       0.80      0.68      0.74      3439

    accuracy                           0.71      5756
   macro avg       0.71      0.72      0.71      5756
weighted avg       0.73      0.71      0.71      5756



In [119]:
model_base_dir = "../model"

# Save the model using pickle
with open(f"{model_base_dir}/random_model.pkl", "wb") as f:
    pickle.dump(model, f)

## Neural Network

In [120]:
class NN(nn.Module):
    def __init__(self, input_size):
        super(NN, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.relu = nn.LeakyReLU()
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc3(out)
        return out

In [121]:
input_size = 1536
model = NN(input_size)
criterion = nn.BCEWithLogitsLoss()  # Applies Sigmoid activation for calculating the loss
optimizer = optim.Adam(model.parameters(), lr=4e-4)

In [122]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)

X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

num_epochs = 50

for epoch in range(num_epochs):
    
    # Training
    model.train()
    inputs = torch.tensor(X_train, dtype=torch.float32)
    labels = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)

    outputs = model(inputs)
    loss = criterion(outputs, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # Validation
    with torch.no_grad():
        model.eval()
        test_outputs = model(X_test_tensor)
                
        predicted = (torch.sigmoid(test_outputs) > 0.5).float()
        correct = (predicted == y_test_tensor).sum().item()
        test_accuracy = 100 * correct / y_test_tensor.size(0)
    
    print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}, Test Accuracy: {test_accuracy:.2f}%')

Epoch 1, Loss: 0.6936, Test Accuracy: 40.25%
Epoch 2, Loss: 0.6822, Test Accuracy: 40.27%
Epoch 3, Loss: 0.6715, Test Accuracy: 40.46%
Epoch 4, Loss: 0.6597, Test Accuracy: 39.80%
Epoch 5, Loss: 0.6461, Test Accuracy: 62.18%
Epoch 6, Loss: 0.6308, Test Accuracy: 63.79%
Epoch 7, Loss: 0.6133, Test Accuracy: 65.32%
Epoch 8, Loss: 0.5931, Test Accuracy: 64.58%
Epoch 9, Loss: 0.5712, Test Accuracy: 63.72%
Epoch 10, Loss: 0.5466, Test Accuracy: 63.64%
Epoch 11, Loss: 0.5196, Test Accuracy: 68.52%
Epoch 12, Loss: 0.4912, Test Accuracy: 68.50%
Epoch 13, Loss: 0.4625, Test Accuracy: 68.19%
Epoch 14, Loss: 0.4320, Test Accuracy: 61.64%
Epoch 15, Loss: 0.4022, Test Accuracy: 61.19%
Epoch 16, Loss: 0.3728, Test Accuracy: 61.10%
Epoch 17, Loss: 0.3444, Test Accuracy: 61.08%
Epoch 18, Loss: 0.3185, Test Accuracy: 61.17%
Epoch 19, Loss: 0.2926, Test Accuracy: 62.72%
Epoch 20, Loss: 0.2704, Test Accuracy: 62.72%
Epoch 21, Loss: 0.2488, Test Accuracy: 63.76%
Epoch 22, Loss: 0.2292, Test Accuracy: 63.7