In [2]:
# Import libraries
import torch
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
import nltk, json 
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [8]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/phuongnguyen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/phuongnguyen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/phuongnguyen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [39]:
from func_dataprep import preprocess_text
from model_train import CNNModel, GRUModel, LSTMModel, BiLSTM

# Import data

In [3]:
data_path = "dataset/2025/financial_news_events.json"
# Load data
pandas_data = pd.read_json(data_path, lines=True)

In [14]:
print(pandas_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3024 entries, 0 to 3023
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Date                  3024 non-null   datetime64[ns]
 1   Headline              2876 non-null   object        
 2   Source                3024 non-null   object        
 3   Market_Event          3024 non-null   object        
 4   Market_Index          3024 non-null   object        
 5   Index_Change_Percent  2863 non-null   float64       
 6   Trading_Volume        3024 non-null   float64       
 7   Sentiment             2853 non-null   object        
 8   Sector                3024 non-null   object        
 9   Impact_Level          3024 non-null   object        
 10  Related_Company       3024 non-null   object        
 11  News_Url              2871 non-null   object        
dtypes: datetime64[ns](1), float64(2), object(9)
memory usage: 283.6+ KB
None


In [6]:
print(pandas_data.head())

        Date                                           Headline  \
0 2025-05-21        Nikkei 225 index benefits from a weaker yen   
1 2025-05-18  Government subsidy program gives a lift to the...   
2 2025-06-25  New housing data release shows a slowdown in m...   
3 2025-07-21  Massive stock buyback program announced by a c...   
4 2025-07-23  Government spending bill is expected to stimul...   

                    Source                Market_Event        Market_Index  \
0           Times of India       Commodity Price Shock                 DAX   
1          Financial Times        Central Bank Meeting  Shanghai Composite   
2  The Hindu Business Line  Consumer Confidence Report  Shanghai Composite   
3            The Economist       Commodity Price Shock           NSE Nifty   
4          The Motley Fool      Inflation Data Release    Nasdaq Composite   

   Index_Change_Percent  Trading_Volume Sentiment          Sector  \
0                  3.52          166.45      None      Tech

In [45]:
#unique category
categories = pandas_data['Sector'].unique()
print("Unique categories:", categories)
unique_categories = len(categories)
print("Length of unique categories:", unique_categories)

Unique categories: ['Technology' 'Retail' 'Consumer Goods' 'Transportation'
 'Telecommunications' 'Energy' 'Pharmaceuticals' 'Healthcare' 'Finance'
 'Industrials' 'Real Estate' 'Aerospace & Defense' 'Agriculture'
 'Utilities' 'Construction' 'Media & Entertainment' 'Materials'
 'Automotive']
Length of unique categories: 18


In [15]:
#Drop NaN values of Healine and Sector
pandas_data = pandas_data.dropna(subset=['Headline', 'Sector'])

# DATA PREPROCESSING

**INPUTS PREPROCESSING**

In [18]:
# Apply preprocessing to the Headline column
pandas_data['input'] = pandas_data['Headline'].apply(preprocess_text)

In [19]:
print(pandas_data['input'].head(10))

0            [nikkei, 225, index, benefit, weaker, yen]
1     [government, subsidy, program, give, lift, agr...
2     [new, housing, data, release, show, slowdown, ...
3     [massive, stock, buyback, program, announced, ...
4     [government, spending, bill, expected, stimula...
5     [central, bank, maintains, status, quo, intere...
6      [tech, giant, new, product, launch, spark, gain]
7     [massive, data, breach, sends, tech, company, ...
9     [market, sentiment, turn, positive, vaccine, t...
10    [global, trade, talk, collapse, causing, marke...
Name: input, dtype: object


In [20]:
# Dictionaries to store the word to index mappings and vice versa
all_tokens = [word for tokens in pandas_data['input'] for word in tokens]
unique_tokens = list(set(all_tokens))

word2idx = {o:i for i,o in enumerate(unique_tokens)}
idx2word = {i:o for i,o in enumerate(unique_tokens)}

In [21]:
print(len(unique_tokens))

232


In [22]:
print(idx2word)

{0: 'index', 1: 'requirement', 2: 'oil', 3: 'rattle', 4: 'policy', 5: 'smaller', 6: 'crude', 7: 'tone', 8: 'brace', 9: 'automotive', 10: 'pmi', 11: 'sign', 12: 'plummeting', 13: 'weaker', 14: 'eu', 15: 'boom', 16: 'hint', 17: 'bond', 18: 'expansion', 19: 'recovery', 20: 'governor', 21: 'cut', 22: 'rate', 23: 'benefit', 24: 'deal', 25: 'price', 26: 'streaming', 27: 'ftse', 28: 'activity', 29: 'announced', 30: 'firm', 31: 'sector', 32: 'composite', 33: 'yield', 34: 'estate', 35: 'favorable', 36: 'high', 37: 'shift', 38: 'report', 39: 'drug', 40: 'buyback', 41: 'result', 42: 'impact', 43: 'heavily', 44: 'nikkei', 45: 'slowdown', 46: 'cause', 47: 'inflation', 48: 'revenue', 49: 'cost', 50: 'data', 51: 'give', 52: 'aerospace', 53: 'asx', 54: 'upward', 55: 'interest', 56: 'tension', 57: 'rise', 58: 'restriction', 59: 'outlook', 60: 'overheating', 61: 'jones', 62: 'expectation', 63: 'unemployment', 64: 'utility', 65: 'good', 66: 'dividend', 67: 'potential', 68: 'drop', 69: 'government', 70: '

In [23]:
# Mapping function
def map_tokens(tokens, word2idx):
    return [word2idx.get(word, 0) for word in tokens]  # 0 nếu từ không có trong vocab

# Applied for tokens function
pandas_data['indexed'] = pandas_data['input'].apply(lambda tokens: map_tokens(tokens, word2idx))

In [24]:
print(pandas_data.head(1))

        Date                                     Headline          Source  \
0 2025-05-21  Nikkei 225 index benefits from a weaker yen  Times of India   

            Market_Event Market_Index  Index_Change_Percent  Trading_Volume  \
0  Commodity Price Shock          DAX                  3.52          166.45   

  Sentiment      Sector Impact_Level Related_Company  \
0      None  Technology         High   Goldman Sachs   

                                            News_Url  \
0  https://timesofindia.indiatimes.com/business/m...   

                                        input                    indexed  
0  [nikkei, 225, index, benefit, weaker, yen]  [44, 185, 0, 23, 13, 115]  


In [25]:
#Define a function that either shorten sentences or pads sentences with 0 to a fixed length
def pad_input(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

In [26]:
#apply padding to the input data
padded_inputs = pad_input(pandas_data['indexed'], 30)

In [27]:
print(padded_inputs.shape)

(2876, 30)


**LABELS PREPROCESSING**

In [28]:
#Encoding labels (text > int)
from sklearn.preprocessing import LabelEncoder
 
labels_text = pandas_data['Sector'].values
encoder = LabelEncoder()

#Fit and transform labels
labels_int = encoder.fit_transform(labels_text)

In [29]:
print(len(labels_int))

2876


In [30]:
print(encoder.classes_)

['Aerospace & Defense' 'Agriculture' 'Automotive' 'Construction'
 'Consumer Goods' 'Energy' 'Finance' 'Healthcare' 'Industrials'
 'Materials' 'Media & Entertainment' 'Pharmaceuticals' 'Real Estate'
 'Retail' 'Technology' 'Telecommunications' 'Transportation' 'Utilities']


In [31]:
#Mapping label dictionary
id2label = {i:o for i,o in enumerate(encoder.classes_)}
label2id = {o:i for i,o in enumerate(encoder.classes_)}

**Convert to Tensor and create DataLoader**

In [32]:
train_text, test_text, train_labels, test_labels = train_test_split(padded_inputs, labels_int, test_size=0.2, random_state=42)

train_data = TensorDataset(torch.tensor(train_text, dtype=torch.long), torch.tensor(train_labels, dtype=torch.long))
test_data = TensorDataset(torch.tensor(test_text, dtype=torch.long), torch.tensor(test_labels, dtype=torch.long))

In [33]:
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

# DEFINE THE MODELS

# HYPERPARAMETERS

In [47]:
vocab_size = len(word2idx)
embed_dim = 100
num_classes = unique_categories

In [48]:
print(type(vocab_size), vocab_size)
print(type(embed_dim), embed_dim)
print(type(num_classes), num_classes)


<class 'int'> 232
<class 'int'> 100
<class 'int'> 18


In [50]:
modelCNN = CNNModel(vocab_size, embed_dim, num_classes)
modelLSTM = LSTMModel(vocab_size = vocab_size, embed_size=embed_dim, hidden_size=128, output_size =num_classes)
modelGRU = GRUModel(input_size = vocab_size, hidden_size=128, output_size =num_classes)
modelBiLSTM = BiLSTM(vocab_size, embed_dim, 128, num_classes, 2, 0.5)

In [51]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(modelLSTM.parameters(), lr=0.001)

**TEST MODEL WITH TEST INPUT**

In [52]:
#Test input data model
batch_size = 64
input_test = train_data[:batch_size]
print(input_test)

(tensor([[  0,   0,   0,  ..., 122,  22,  21],
        [  0,   0,   0,  ..., 164, 225, 159],
        [  0,   0,   0,  ..., 174,  31, 129],
        ...,
        [  0,   0,   0,  ..., 122,  22,  21],
        [  0,   0,   0,  ...,  85, 190,  18],
        [  0,   0,   0,  ..., 172, 184,  82]]), tensor([15, 14, 14,  2,  5, 14,  8,  0,  7, 14, 12, 12,  4, 14, 12,  9,  4, 14,
         2, 13,  3, 16,  5, 10,  5, 16,  1, 13,  5,  9,  7,  4,  9, 17, 17,  3,
        16, 11, 15,  8,  9,  3,  0,  7, 17,  3, 16,  5,  6,  0, 17, 12,  0,  9,
         7, 14, 12,  0, 17,  8,  6,  8, 13, 17]))


In [53]:
output_test = modelBiLSTM(input_test[0])
print(output_test.shape)

torch.Size([64, 18])


In [54]:
label_test = train_data[:batch_size][1]
print(label_test.shape)

torch.Size([64])


In [55]:
loss_test = criterion(output_test, label_test)

In [56]:
#Test probabilities
preds_test = torch.argmax(output_test, dim=1)
print(preds_test)

tensor([17,  7, 13, 10,  0, 12, 13,  6, 16, 16, 17, 10, 10,  2, 11, 10,  2,  6,
        10, 13, 15,  8, 16,  3, 12, 11,  6, 10, 17,  8, 10, 16,  8, 16, 10,  6,
         7,  0, 13, 17,  0, 11, 16,  2, 13,  3,  0,  6,  6, 11,  3, 11, 11, 17,
        11,  0,  0, 13, 13,  0, 17, 11, 11, 10])


In [57]:
#Test probabilities
probs_test = F.softmax(output_test, dim=-1)
print('probabilities test', probs_test)
preds_test = torch.argmax(probs_test, dim=-1)
print('predictions test', preds_test)

probabilities test tensor([[0.0537, 0.0511, 0.0570,  ..., 0.0537, 0.0595, 0.0649],
        [0.0564, 0.0529, 0.0532,  ..., 0.0547, 0.0556, 0.0590],
        [0.0595, 0.0525, 0.0565,  ..., 0.0545, 0.0528, 0.0570],
        ...,
        [0.0579, 0.0514, 0.0560,  ..., 0.0558, 0.0550, 0.0572],
        [0.0573, 0.0540, 0.0585,  ..., 0.0534, 0.0559, 0.0571],
        [0.0559, 0.0512, 0.0559,  ..., 0.0572, 0.0557, 0.0601]],
       grad_fn=<SoftmaxBackward0>)
predictions test tensor([17,  7, 13, 10,  0, 12, 13,  6, 16, 16, 17, 10, 10,  2, 11, 10,  2,  6,
        10, 13, 15,  8, 16,  3, 12, 11,  6, 10, 17,  8, 10, 16,  8, 16, 10,  6,
         7,  0, 13, 17,  0, 11, 16,  2, 13,  3,  0,  6,  6, 11,  3, 11, 11, 17,
        11,  0,  0, 13, 13,  0, 17, 11, 11, 10])


# TRAIN THE MODELS

In [59]:
#Config Model before training
model = modelBiLSTM

In [60]:
#Training epoch
for epoch in range(10):
    model.train()
    training_loss = 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        training_loss += loss.item()
    epoch_loss = training_loss/ len(train_data)
    print(f"Epoch {epoch+1}, Training Loss: {epoch_loss}")

Epoch 1, Training Loss: 0.04528419950734014
Epoch 2, Training Loss: 0.045227501392364505
Epoch 3, Training Loss: 0.045284293838169266
Epoch 4, Training Loss: 0.04529779859211134
Epoch 5, Training Loss: 0.045263649173404856
Epoch 6, Training Loss: 0.045264561590941055
Epoch 7, Training Loss: 0.04525750222413436
Epoch 8, Training Loss: 0.04527691115503726
Epoch 9, Training Loss: 0.04527451338975326
Epoch 10, Training Loss: 0.04525009797966999


# TEST THE MODELS

In [61]:
# Valiation  
validation_loss = 0.0
model.eval()

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        validation_loss += loss.item()
epoch_val_loss = validation_loss / len(test_loader)
#model.train()

In [62]:
print(epoch_val_loss)

2.892735905117459


# EVALUATION

In [67]:
import torch.nn.functional as F
from torchmetrics.classification import MulticlassAccuracy, MulticlassPrecision, MulticlassRecall

num_classes = unique_categories
accuracy = MulticlassAccuracy(num_classes=num_classes, average="weighted").to("cpu")
precision = MulticlassPrecision(num_classes=num_classes, average="weighted").to("cpu")
recall = MulticlassRecall(num_classes=num_classes, average="weighted").to("cpu")

model.eval()
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to("cpu"), labels.to("cpu")

        outputs = model(inputs)
        probs = F.softmax(outputs, dim=1)
        preds = torch.argmax(probs, dim=1)

        # Update metrics each batch
        accuracy.update(outputs, labels)
        precision.update(outputs, labels)
        recall.update(outputs, labels)

# Compute metrics after going through the full dataloader
acc = accuracy.compute().item()
prec = precision.compute().item()
rec = recall.compute().item()

print(f"Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}")

# Reset for next evaluation
accuracy.reset()
precision.reset()
recall.reset()

Accuracy=0.0451, Precision=0.0135, Recall=0.0451


# SAVE THE MODEL

# GET PREDICTION