In [None]:
pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 torchmetrics==1.3.2

In [1]:
pip list | grep torch

torch                                    2.7.1
torchaudio                               2.7.1
torchmetrics                             1.3.2
torchvision                              0.22.1
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


**Import libraries**

In [2]:
# Import libraries
import torch
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
import nltk, json 
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import string

In [3]:
import os
from pathlib import Path

In [4]:
from torchmetrics.classification import MulticlassAccuracy, MulticlassPrecision, MulticlassRecall



**Import data and labels**

In [32]:
#pandas_data = pd.read_json("News_Category_Dataset_v3.json", lines=True)

In [7]:
#route to root folder
BASE_DIR = Path.cwd() #Route to root folder of project #.parent if in sub folder
data_path = os.path.join(BASE_DIR, "dataset", "huffpost", "news_updated.csv")

In [8]:
pandas_data = pd.read_csv(data_path)

In [9]:
pandas_data.head(5)

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [10]:
pandas_data.shape

(209514, 6)

In [11]:
unique_category = len(pandas_data['category'].unique())
print(unique_category) 

27


In [12]:
max_chars = pandas_data['headline'].apply(lambda x: len(str(x))).max()
print("Max length (characters):", max_chars)

Max length (characters): 320


In [13]:
max_len = pandas_data['headline'].apply(lambda x: len(str(x).split())).max()
print("Max length (words):", max_len)


Max length (words): 44


# **DATA PREPROCESSING**

In [14]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/phuongnguyen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/phuongnguyen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/phuongnguyen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    tokens = [
        lemmatizer.lemmatize(word) 
        for word in tokens 
        if word.isalnum()
            and not word.isdigit()
            and word not in stop_words
            and word not in string.punctuation]
    return tokens

In [16]:
pandas_data['headline'] = pandas_data['headline'].fillna("")

In [17]:
# Apply preprocessing to the headline column
pandas_data['input'] = pandas_data['headline'].apply(preprocess_text)

In [18]:
print(pandas_data['input'].head(30))

0     [4, million, american, roll, sleeve, covid, bo...
1     [american, airline, flyer, charged, banned, li...
2                 [23, funniest, tweet, cat, dog, week]
3                       [funniest, tweet, parent, week]
4           [woman, called, cop, black, loses, lawsuit]
5     [cleaner, dead, belk, bathroom, 4, day, body, ...
6     [reporter, get, adorable, surprise, boyfriend,...
7     [puerto, ricans, desperate, water, hurricane, ...
8     [new, documentary, capture, complexity, child,...
9     [biden, un, call, russian, war, affront, body,...
10    [world, cup, captain, want, wear, rainbow, arm...
11    [man, set, fire, apparent, protest, funeral, j...
12    [fiona, threatens, become, category, 4, storm,...
13    [twitch, ban, gambling, site, streamer, scam, ...
14      [virginia, thomas, agrees, interview, 6, panel]
15    [russian, cosmonaut, valery, polyakov, broke, ...
16                  [clever, look, inside, tv, reboots]
17             [maury, will, shortstop, dodger, 

In [19]:
# Dictionaries to store the word to index mappings and vice versa
all_tokens = [word for tokens in pandas_data['input'] for word in tokens]
unique_tokens = list(set(all_tokens))

word2idx = {o:i for i,o in enumerate(unique_tokens)}
idx2word = {i:o for i,o in enumerate(unique_tokens)}

In [20]:
print(len(unique_tokens))

48421


In [21]:
print(idx2word)



In [22]:
# Mapping function
def map_tokens(tokens, word2idx):
    return [word2idx.get(word, 0) for word in tokens]  # 0 nếu từ không có trong vocab

# Applied mapping function for 'tokens'
pandas_data['indexed'] = pandas_data['input'].apply(lambda tokens: map_tokens(tokens, word2idx))

In [23]:
print(pandas_data.head(1))

                                                link  \
0  https://www.huffpost.com/entry/covid-boosters-...   

                                            headline   category  \
0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS   

                                   short_description               authors  \
0  Health experts said it is too early to predict...  Carla K. Johnson, AP   

         date                                              input  \
0  2022-09-23  [4, million, american, roll, sleeve, covid, bo...   

                                           indexed  
0  [30923, 27768, 36041, 2140, 5210, 33174, 29828]  


In [24]:
#Define a function that either shorten sentences or pads sentences with 0 to a fixed length
def pad_input(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

In [25]:
#apply padding to the input data
padded_inputs = pad_input(pandas_data['indexed'], 44)

In [26]:
print(padded_inputs.shape)

(209514, 44)


#Preprocessing labels

In [27]:
#Encoding labels (text > int)
from sklearn.preprocessing import LabelEncoder
 
labels_text = pandas_data['category'].values
encoder = LabelEncoder()

#Fit and transform labels
labels_int = encoder.fit_transform(labels_text)

In [28]:
print(len(labels_int))

209514


In [29]:
print(encoder.classes_)

['ARTS & CULTURE' 'BUSINESS & FINANCES' 'COMEDY' 'CRIME' 'DIVORCE'
 'EDUCATION' 'ENTERTAINMENT' 'ENVIRONMENT' 'FOOD & DRINK' 'GROUPS VOICES'
 'HOME & LIVING' 'IMPACT' 'MEDIA' 'MISCELLANEOUS' 'PARENTING' 'POLITICS'
 'RELIGION' 'SCIENCE & TECH' 'SPORTS' 'STYLE & BEAUTY' 'TRAVEL'
 'U.S. NEWS' 'WEDDINGS' 'WEIRD NEWS' 'WELLNESS' 'WOMEN' 'WORLD NEWS']


In [30]:
#Mapping label dictionary
id2label = {i:o for i,o in enumerate(encoder.classes_)}
label2id = {o:i for i,o in enumerate(encoder.classes_)}

**Convert to tensor**

In [31]:
train_text, test_text, train_labels, test_labels = train_test_split(padded_inputs, labels_int, test_size=0.2, random_state=42)

train_data = TensorDataset(torch.tensor(train_text, dtype=torch.long), torch.tensor(train_labels, dtype=torch.long))
test_data = TensorDataset(torch.tensor(test_text, dtype=torch.long), torch.tensor(test_labels, dtype=torch.long))

In [32]:
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

# **DEFINE THE MODELS**

In [34]:
# CNN text classification model
class CNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv = nn.Conv1d(embed_dim, embed_dim, kernel_size = 3, stride=1, padding=1)
        self.fc = nn.Linear(embed_dim, num_classes)         # num_classes = unique categories
    def forward(self, text):
        embedded = self.embedding(text).permute(0, 2, 1)    # (batch_size, seq_len, embed_dim) -> (batch_size, embed_dim, seq_len)
        conved = F.relu(self.conv(embedded))                # (batch_size, embed_dim, seq_len)
        conved = conved.mean(dim=2)                         # (batch_size, embed_dim)
        return self.fc(conved)

In [35]:
print(CNNModel(len(word2idx), 100, unique_category))

CNNModel(
  (embedding): Embedding(48421, 100)
  (conv): Conv1d(100, 100, kernel_size=(3,), stride=(1,), padding=(1,))
  (fc): Linear(in_features=100, out_features=27, bias=True)
)


In [36]:
#LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)  # convert token IDs → embeddings
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = self.embedding(x)   # [batch, seq_len] → [batch, seq_len, embed_size]
        _, (hidden, _) = self.lstm(x)
        output = self.fc(hidden.squeeze(0))  # [batch, hidden] → [batch, output_size]
        return output
        

In [37]:
print(LSTMModel(300, 300, 128, 42))

LSTMModel(
  (embedding): Embedding(300, 300)
  (lstm): LSTM(300, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=42, bias=True)
)


In [38]:
#GRU Model
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRUModel, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    def forward(self, x):
        _, hidden = self.gru(x)
        output = self.fc(hidden.squeeze(0))
        return output

In [39]:
print(GRUModel(300, 128, 42))

GRUModel(
  (gru): GRU(300, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=42, bias=True)
)


In [40]:
#BiLSTMModel
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, dropout_rate):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                            bidirectional=True, dropout=dropout_rate, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)         # *2 for bidirectional output
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, text):
        # text = [batch size, seq len]
        embedded = self.dropout(self.embedding(text))           # embedded = [batch size, seq len, embedding dim]
        output, (hidden, cell) = self.lstm(embedded)
                                                                # output = [batch size, seq len, hidden dim * 2]
                                                                # hidden = [num layers * 2, batch size, hidden dim]
                                                                # cell = [num layers * 2, batch size, hidden dim]

        # Use the concatenated final hidden state from both directions
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))   # hidden = [batch size, hidden dim * 2]

        return self.fc(hidden)

In [41]:
print(BiLSTM(len(word2idx), 100, 128, unique_category, 2, 0.5))

BiLSTM(
  (embedding): Embedding(48421, 100)
  (lstm): LSTM(100, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=256, out_features=27, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


**HYPERPARAMETERS**

In [42]:
vocab_size = len(word2idx)
embed_dim = 300
num_classes = unique_category

In [43]:
modelCNN = CNNModel(vocab_size, embed_dim, num_classes)
modelLSTM = LSTMModel(vocab_size = vocab_size, embed_size=embed_dim, hidden_size=128, output_size =num_classes)
modelGRU = GRUModel(input_size = vocab_size, hidden_size=128, output_size =num_classes)
modelBiLSTM = BiLSTM(vocab_size, embed_dim, 128, num_classes, 2, 0.5)

In [44]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(modelLSTM.parameters(), lr=0.001)

**TEST MODEL WITH TEST INPUT**

In [45]:
#Test input data model
batch_size = 64
input_test = train_data[:batch_size]
print(input_test)

(tensor([[    0,     0,     0,  ...,  7169, 10956, 14791],
        [    0,     0,     0,  ..., 42016, 29633, 35416],
        [    0,     0,     0,  ..., 34961, 46772, 38089],
        ...,
        [    0,     0,     0,  ..., 37351, 12783,  1491],
        [    0,     0,     0,  ..., 38780, 41807, 20542],
        [    0,     0,     0,  ..., 25406, 48415, 33672]]), tensor([15, 19,  9, 22, 15, 13,  6, 20, 15,  6, 18, 20, 12,  8, 18,  3, 14, 10,
        16, 17, 25,  7, 26, 17, 10,  6, 15,  9, 15, 24, 19, 16, 14, 24, 15, 24,
        15,  1, 19,  9, 14, 15, 15, 24,  8, 15, 14, 15, 15, 18,  7,  8,  5, 24,
        24, 24, 25, 24, 14, 24, 15,  6,  6, 26]))


In [46]:
output_test = modelBiLSTM(input_test[0])
print(output_test.shape)

torch.Size([64, 27])


In [47]:
label_test = train_data[:batch_size][1]
print(label_test.shape)

torch.Size([64])


In [48]:
loss_test = criterion(output_test, label_test)

In [49]:
#Test probabilities
preds_test = torch.argmax(output_test, dim=1)
print(preds_test)

tensor([22, 26, 22,  6,  9, 13,  4, 22, 16, 23, 13, 22, 13,  4, 22, 14, 22,  7,
         5,  6, 22, 26, 22, 22,  9, 14, 22, 13, 16,  0,  1, 23, 15, 13,  4, 25,
        16, 14, 13,  4, 23, 26, 23, 22, 22, 12, 16, 22,  6, 13, 22,  4, 13,  4,
        26,  6, 23,  4, 13, 16, 16, 13, 22,  9])


In [50]:
#Test probabilities
probs_test = F.softmax(output_test, dim=-1)
print('probabilities test', probs_test)
preds_test = torch.argmax(probs_test, dim=-1)
print('predictions test', preds_test)

probabilities test tensor([[0.0360, 0.0381, 0.0385,  ..., 0.0377, 0.0354, 0.0366],
        [0.0318, 0.0404, 0.0367,  ..., 0.0356, 0.0363, 0.0445],
        [0.0382, 0.0325, 0.0362,  ..., 0.0401, 0.0347, 0.0386],
        ...,
        [0.0347, 0.0344, 0.0345,  ..., 0.0361, 0.0381, 0.0417],
        [0.0335, 0.0401, 0.0402,  ..., 0.0415, 0.0382, 0.0369],
        [0.0384, 0.0367, 0.0393,  ..., 0.0382, 0.0359, 0.0332]],
       grad_fn=<SoftmaxBackward0>)
predictions test tensor([22, 26, 22,  6,  9, 13,  4, 22, 16, 23, 13, 22, 13,  4, 22, 14, 22,  7,
         5,  6, 22, 26, 22, 22,  9, 14, 22, 13, 16,  0,  1, 23, 15, 13,  4, 25,
        16, 14, 13,  4, 23, 26, 23, 22, 22, 12, 16, 22,  6, 13, 22,  4, 13,  4,
        26,  6, 23,  4, 13, 16, 16, 13, 22,  9])


# **TRAINING THE MODEL**

In [51]:
#Config Model before training
model = modelBiLSTM

In [52]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [53]:
#Training epoch
for epoch in range(10):
    model.train()
    training_loss = 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        training_loss += loss.item()
    epoch_loss = training_loss/ len(train_data)
    print(f"Epoch {epoch+1}, Training Loss: {epoch_loss}")

Epoch 1, Training Loss: 0.03588230215689714
Epoch 2, Training Loss: 0.029810979458889882
Epoch 3, Training Loss: 0.02740504818680245
Epoch 4, Training Loss: 0.02586358850101903
Epoch 5, Training Loss: 0.024691164100608464
Epoch 6, Training Loss: 0.02377476120751008
Epoch 7, Training Loss: 0.022966511482710547
Epoch 8, Training Loss: 0.022373646035251778
Epoch 9, Training Loss: 0.021742680474257238
Epoch 10, Training Loss: 0.02124071724939952


# **TESTING THE MODEL**

In [54]:
# Valiation  
validation_loss = 0.0
model.eval()

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        validation_loss += loss.item()
epoch_val_loss = validation_loss / len(test_loader)
#model.train()

In [55]:
print(epoch_val_loss)

1.3790654277073517


# **EVALUATION**

In [56]:
import torch.nn.functional as F
from torchmetrics.classification import MulticlassAccuracy, MulticlassPrecision, MulticlassRecall

num_classes = unique_category
accuracy = MulticlassAccuracy(num_classes=num_classes, average="weighted").to("cpu")
precision = MulticlassPrecision(num_classes=num_classes, average="weighted").to("cpu")
recall = MulticlassRecall(num_classes=num_classes, average="weighted").to("cpu")

model.eval()
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to("cpu"), labels.to("cpu")

        outputs = model(inputs)
        probs = F.softmax(outputs, dim=1)
        preds = torch.argmax(probs, dim=1)

        # Update metrics each batch
        accuracy.update(outputs, labels)
        precision.update(outputs, labels)
        recall.update(outputs, labels)

# Compute metrics after going through the full dataloader
acc = accuracy.compute().item()
prec = precision.compute().item()
rec = recall.compute().item()

print(f"Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}")

# Reset for next evaluation
accuracy.reset()
precision.reset()
recall.reset()


Accuracy=0.6370, Precision=0.6227, Recall=0.6370


Note:
- CNNModel: Acc=0.0236, Prec=0.0005, Rec=0.0236
- LSTMModel: Acc=0.3946, Prec=0.4609, Rec=0.3946
- BiLSTM: Accuracy=0.3806, Precision=0.4873, Recall=0.3806

2nd (group nhóm category, data preprocessing)

- Accuracy=0.6385, Precision=0.6232, Recall=0.6385

# **PREDICTION**

In [103]:
import torch
import torch.nn.functional as F

def predict(model, text, word2idx, max_len, id2label, device="cpu"):
    # Tokenize
    tokens = preprocess_text(text)

    # Map to ids
    unk_id = word2idx.get("<UNK>", 0)
    token_ids = [word2idx.get(w, unk_id) for w in tokens]

    # Pad / truncate
    if len(token_ids) < max_len:
        token_ids += [0] * (max_len - len(token_ids))
    else:
        token_ids = token_ids[:max_len]

    # To tensor (Long for embedding)
    input_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)

    # Predict
    model.eval()
    with torch.no_grad():
        outputs = model(input_tensor)  # shape [1, num_classes]
        probs = F.softmax(outputs, dim=1).squeeze(0)
        pred_idx = torch.argmax(probs).item()
    
    predicted_label = id2label.get(pred_idx, "UNKNOWN")
    return predicted_label, probs.tolist()


In [96]:
from dataprep_func import preprocess_text, pad_input

In [106]:
# Suppose you trained with 42 classes

model = modelBiLSTM

sample_text = "IRS Launches Safety Review Amid Threats To Workers Linked To Conspiracy Theories"
label, probs = predict(model, sample_text, word2idx, max_len=30, id2label=id2label, device="cpu")

print("Predicted:", label)
print("Probabilities:", probs)


Predicted: POLITICS
Probabilities: [0.011666462756693363, 0.08129678666591644, 0.0261932834982872, 0.009402933530509472, 0.003729430725798011, 0.010401722975075245, 0.007352364249527454, 0.04697415232658386, 0.006885646842420101, 0.02988668531179428, 0.0070567987859249115, 0.04598565027117729, 0.019370272755622864, 0.006371327675879002, 0.020594680681824684, 0.21967674791812897, 0.03584476187825203, 0.025051148608326912, 0.015479865483939648, 0.005620117299258709, 0.019892839714884758, 0.006450417451560497, 0.0033804296981543303, 0.004511426202952862, 0.13877299427986145, 0.012286266312003136, 0.17986467480659485]


# SAVE AND LOAD MODEL

In [108]:
import torch
import pickle

In [112]:
model_train_path = "saved_model/news_categorize_bilstm.pth"
torch.save(model.state_dict(), model_train_path)

In [114]:
#Save word2idx, id2label, max_len
utils_bilstm = {
    "word2idx": word2idx,
    "id2label": id2label,
    "max_len": 30
}
with open("saved_model/utils_bilstm.pkl", "wb") as f:
    pickle.dump(utils_bilstm, f)