In [None]:
pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 torchmetrics==1.3.2

In [2]:
pip list | grep torch

torch                                    2.7.1
torchaudio                               2.7.1
torchmetrics                             1.3.2
torchvision                              0.22.1
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


**Import libraries**

In [3]:
# Import libraries
import torch
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
import nltk, json 
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import string

In [4]:
import os
from pathlib import Path

In [4]:
from torchmetrics.classification import MulticlassAccuracy, MulticlassPrecision, MulticlassRecall



**Import data and labels**

In [5]:
#pandas_data = pd.read_json("News_Category_Dataset_v3.json", lines=True)

In [6]:
#route to root folder
BASE_DIR = Path.cwd() #Route to root folder of project #.parent if in sub folder
data_path = os.path.join(BASE_DIR, "dataset", "huffpost", "news_updated.csv")

In [7]:
pandas_data = pd.read_csv(data_path)

In [8]:
pandas_data.head(5)

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [9]:
pandas_data.shape

(209514, 6)

In [10]:
unique_category = len(pandas_data['category'].unique())
print(unique_category) 

27


In [11]:
max_chars = pandas_data['headline'].apply(lambda x: len(str(x))).max()
print("Max length (characters):", max_chars)

Max length (characters): 320


In [12]:
max_len = pandas_data['headline'].apply(lambda x: len(str(x).split())).max()
print("Max length (words):", max_len)


Max length (words): 44


# **DATA PREPROCESSING**

In [13]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/phuongnguyen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/phuongnguyen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/phuongnguyen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    tokens = [
        lemmatizer.lemmatize(word) 
        for word in tokens 
        if word.isalnum()
            and not word.isdigit() #remove numbers
            and word not in stop_words #remove stop words
            and word not in string.punctuation] #remove punctuation
    return tokens

In [15]:
pandas_data['headline'] = pandas_data['headline'].fillna("")

In [16]:
# Apply preprocessing to the headline column
pandas_data['input'] = pandas_data['headline'].apply(preprocess_text)

In [17]:
print(pandas_data['input'].head(30))

0     [million, american, roll, sleeve, covid, booster]
1     [american, airline, flyer, charged, banned, li...
2                     [funniest, tweet, cat, dog, week]
3                       [funniest, tweet, parent, week]
4           [woman, called, cop, black, loses, lawsuit]
5     [cleaner, dead, belk, bathroom, day, body, fou...
6     [reporter, get, adorable, surprise, boyfriend,...
7     [puerto, ricans, desperate, water, hurricane, ...
8     [new, documentary, capture, complexity, child,...
9     [biden, un, call, russian, war, affront, body,...
10    [world, cup, captain, want, wear, rainbow, arm...
11    [man, set, fire, apparent, protest, funeral, j...
12    [fiona, threatens, become, category, storm, he...
13    [twitch, ban, gambling, site, streamer, scam, ...
14         [virginia, thomas, agrees, interview, panel]
15    [russian, cosmonaut, valery, polyakov, broke, ...
16                  [clever, look, inside, tv, reboots]
17                 [maury, will, shortstop, dodg

In [18]:
# Dictionaries to store the word to index mappings and vice versa
all_tokens = [word for tokens in pandas_data['input'] for word in tokens]
unique_tokens = list(set(all_tokens))

word2idx = {o:i for i,o in enumerate(unique_tokens)}
idx2word = {i:o for i,o in enumerate(unique_tokens)}

In [19]:
print(len(unique_tokens))

47900


In [20]:
print(idx2word)



In [21]:
# Mapping function
def map_tokens(tokens, word2idx):
    return [word2idx.get(word, 0) for word in tokens]  # 0 nếu từ không có trong vocab

# Applied mapping function for 'tokens'
pandas_data['indexed'] = pandas_data['input'].apply(lambda tokens: map_tokens(tokens, word2idx))

In [22]:
print(pandas_data.head(1))

                                                link  \
0  https://www.huffpost.com/entry/covid-boosters-...   

                                            headline   category  \
0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS   

                                   short_description               authors  \
0  Health experts said it is too early to predict...  Carla K. Johnson, AP   

         date                                              input  \
0  2022-09-23  [million, american, roll, sleeve, covid, booster]   

                                      indexed  
0  [31741, 40079, 12037, 38271, 10405, 35997]  


In [23]:
#Define a function that either shorten sentences or pads sentences with 0 to a fixed length
def pad_input(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

In [26]:
#apply padding to the input data
padded_inputs = pad_input(pandas_data['indexed'], 12)

In [27]:
print(padded_inputs.shape)

(209514, 12)


#Preprocessing labels

In [28]:
#Encoding labels (text > int)
from sklearn.preprocessing import LabelEncoder
 
labels_text = pandas_data['category'].values
encoder = LabelEncoder()

#Fit and transform labels
labels_int = encoder.fit_transform(labels_text)

In [29]:
print(len(labels_int))

209514


In [30]:
print(encoder.classes_)

['ARTS & CULTURE' 'BUSINESS & FINANCES' 'COMEDY' 'CRIME' 'DIVORCE'
 'EDUCATION' 'ENTERTAINMENT' 'ENVIRONMENT' 'FOOD & DRINK' 'GROUPS VOICES'
 'HOME & LIVING' 'IMPACT' 'MEDIA' 'MISCELLANEOUS' 'PARENTING' 'POLITICS'
 'RELIGION' 'SCIENCE & TECH' 'SPORTS' 'STYLE & BEAUTY' 'TRAVEL'
 'U.S. NEWS' 'WEDDINGS' 'WEIRD NEWS' 'WELLNESS' 'WOMEN' 'WORLD NEWS']


In [31]:
#Mapping label dictionary
id2label = {i:o for i,o in enumerate(encoder.classes_)}
label2id = {o:i for i,o in enumerate(encoder.classes_)}

**Convert to tensor**

In [32]:
train_text, test_text, train_labels, test_labels = train_test_split(padded_inputs, labels_int, test_size=0.2, random_state=42)

train_data = TensorDataset(torch.tensor(train_text, dtype=torch.long), torch.tensor(train_labels, dtype=torch.long))
test_data = TensorDataset(torch.tensor(test_text, dtype=torch.long), torch.tensor(test_labels, dtype=torch.long))

In [33]:
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

# **DEFINE THE MODELS**

In [34]:
# CNN text classification model
class CNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv = nn.Conv1d(embed_dim, embed_dim, kernel_size = 3, stride=1, padding=1)
        self.fc = nn.Linear(embed_dim, num_classes)         # num_classes = unique categories
    def forward(self, text):
        embedded = self.embedding(text).permute(0, 2, 1)    # (batch_size, seq_len, embed_dim) -> (batch_size, embed_dim, seq_len)
        conved = F.relu(self.conv(embedded))                # (batch_size, embed_dim, seq_len)
        conved = conved.mean(dim=2)                         # (batch_size, embed_dim)
        return self.fc(conved)

In [35]:
print(CNNModel(len(word2idx), 100, unique_category))

CNNModel(
  (embedding): Embedding(47900, 100)
  (conv): Conv1d(100, 100, kernel_size=(3,), stride=(1,), padding=(1,))
  (fc): Linear(in_features=100, out_features=27, bias=True)
)


In [36]:
#LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)  # convert token IDs → embeddings
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = self.embedding(x)   # [batch, seq_len] → [batch, seq_len, embed_size]
        _, (hidden, _) = self.lstm(x)
        output = self.fc(hidden.squeeze(0))  # [batch, hidden] → [batch, output_size]
        return output
        

In [37]:
print(LSTMModel(300, 300, 128, 42))

LSTMModel(
  (embedding): Embedding(300, 300)
  (lstm): LSTM(300, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=42, bias=True)
)


In [38]:
#GRU Model
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRUModel, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    def forward(self, x):
        _, hidden = self.gru(x)
        output = self.fc(hidden.squeeze(0))
        return output

In [39]:
print(GRUModel(300, 128, 42))

GRUModel(
  (gru): GRU(300, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=42, bias=True)
)


In [40]:
#BiLSTMModel
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, dropout_rate):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                            bidirectional=True, dropout=dropout_rate, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)         # *2 for bidirectional output
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, text):
        # text = [batch size, seq len]
        embedded = self.dropout(self.embedding(text))           # embedded = [batch size, seq len, embedding dim]
        output, (hidden, cell) = self.lstm(embedded)
                                                                # output = [batch size, seq len, hidden dim * 2]
                                                                # hidden = [num layers * 2, batch size, hidden dim]
                                                                # cell = [num layers * 2, batch size, hidden dim]

        # Use the concatenated final hidden state from both directions
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))   # hidden = [batch size, hidden dim * 2]

        return self.fc(hidden)

In [41]:
print(BiLSTM(len(word2idx), 100, 128, unique_category, 2, 0.5))

BiLSTM(
  (embedding): Embedding(47900, 100)
  (lstm): LSTM(100, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=256, out_features=27, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [91]:
from model_train_transformers import TransformerEncoder

In [83]:
print(TransformerEncoder(128, 8, 2, 27, 0.1))

TransformerEncoder(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc): Linear(in_features=128, out_features=27, bias=True)
)




In [92]:
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers, num_classes, max_len, dropout):
        super(TransformerModel, self).__init__()

        # Embedding + Positional Encoding
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = nn.Embedding(max_len, embed_dim)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=dropout,
            batch_first=True  # makes input shape (batch, seq, embed)
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Classification head
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        # x: (batch, seq_len)
        seq_len = x.size(1)
        positions = torch.arange(0, seq_len, device=x.device).unsqueeze(0).expand_as(x)

        x = self.embedding(x) + self.pos_embedding(positions)  # (batch, seq_len, embed)
        x = self.transformer_encoder(x)  # (batch, seq_len, embed)

        # Take [CLS]-like representation (mean pooling or first token)
        x = x.mean(dim=1)  
        out = self.fc(x)  # (batch, num_classes)
        return out

**HYPERPARAMETERS**

In [44]:
vocab_size = len(word2idx)
embed_dim = 300
num_classes = unique_category

In [49]:
modelCNN = CNNModel(vocab_size, embed_dim, num_classes)
modelLSTM = LSTMModel(vocab_size = vocab_size, embed_size=embed_dim, hidden_size=128, output_size =num_classes)
modelGRU = GRUModel(input_size = vocab_size, hidden_size=128, output_size =num_classes)
modelBiLSTM = BiLSTM(vocab_size, embed_dim, 128, num_classes, 2, 0.5)
modelTransformer = TransformerEncoder(128, heads=8, num_layers=2, num_classes=num_classes, dropout=0.1)



In [93]:
modelTransformer2 = TransformerModel(vocab_size = vocab_size, embed_dim = 128, num_heads=8, hidden_dim=512, num_layers=2, num_classes=num_classes, max_len=12, dropout=0.1)

In [50]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(modelLSTM.parameters(), lr=0.001)

**TEST MODEL WITH TEST INPUT**

In [95]:
#Test input data model
batch_size = 64
input_test = train_data[:batch_size]
print(input_test)

(tensor([[    0,     0,     0,     0, 24613, 37963,     5, 29023, 40598, 40649,
         24070, 41165],
        [    0,     0,     0,     0,     0,     0,     0, 15724, 19685, 18519,
          2772, 26118],
        [    0,     0,     0,     0,     0, 23983, 43145,  2847, 40417,  8338,
         30074, 12855],
        [    0,     0,     0,     0,     0,     0,     0,  2928, 40537, 21279,
         26220, 19454],
        [    0,     0,     0,     0,     0,     0,     0,  2236, 14589, 46235,
         12669, 20585],
        [    0,     0,     0,     0,     0,     0,  3132, 35584,  5221, 45177,
         32734, 17060],
        [    0,     0,     0,     0,     0,     0,     0, 14643, 43469,  4998,
          6568, 29852],
        [    0,     0,     0,     0,  9406, 11426, 15739, 29583, 17171, 42229,
         18825, 39753],
        [    0,     0,     0,     0,     0,  9978,  5675, 42643,  7292,  3669,
         15234, 41284],
        [    0,     0,     0, 22823, 45319,  2128, 45904, 18568, 40038, 

In [96]:
print(input_test[0].shape)  # Input shape

torch.Size([64, 12])


In [78]:
print(input_test[1].shape)  # Labels shape

torch.Size([64])


In [56]:
output_test = modelBiLSTM(input_test[0])
print(output_test.shape)

torch.Size([64, 27])


In [94]:
transformer_output = modelTransformer2(input_test[0])
print(transformer_output.shape)

torch.Size([64, 27])


In [57]:
label_test = train_data[:batch_size][1]
print(label_test.shape)

torch.Size([64])


In [58]:
loss_test = criterion(output_test, label_test)

In [59]:
#Test probabilities
preds_test = torch.argmax(output_test, dim=1)
print(preds_test)

tensor([21, 26, 23, 23, 21, 11, 22, 15, 11, 23, 15, 18, 14, 11,  4, 23, 23, 23,
        21, 20, 16, 11, 11, 23, 23, 22,  1, 23, 11, 22,  2, 20, 23, 21, 23, 23,
         2, 21, 23, 23, 21, 23, 11, 23, 21,  1,  3, 21, 21, 21, 21,  2, 23, 23,
        20, 18, 20, 21, 20, 23, 23, 21, 23, 23])


In [60]:
#Test probabilities
probs_test = F.softmax(output_test, dim=-1)
print('probabilities test', probs_test)
preds_test = torch.argmax(probs_test, dim=-1)
print('predictions test', preds_test)

probabilities test tensor([[0.0331, 0.0386, 0.0386,  ..., 0.0321, 0.0375, 0.0342],
        [0.0365, 0.0366, 0.0328,  ..., 0.0360, 0.0378, 0.0443],
        [0.0317, 0.0371, 0.0380,  ..., 0.0333, 0.0380, 0.0396],
        ...,
        [0.0331, 0.0391, 0.0405,  ..., 0.0356, 0.0373, 0.0359],
        [0.0387, 0.0357, 0.0342,  ..., 0.0324, 0.0357, 0.0336],
        [0.0382, 0.0356, 0.0370,  ..., 0.0313, 0.0412, 0.0337]],
       grad_fn=<SoftmaxBackward0>)
predictions test tensor([21, 26, 23, 23, 21, 11, 22, 15, 11, 23, 15, 18, 14, 11,  4, 23, 23, 23,
        21, 20, 16, 11, 11, 23, 23, 22,  1, 23, 11, 22,  2, 20, 23, 21, 23, 23,
         2, 21, 23, 23, 21, 23, 11, 23, 21,  1,  3, 21, 21, 21, 21,  2, 23, 23,
        20, 18, 20, 21, 20, 23, 23, 21, 23, 23])


# **TRAINING THE MODEL**

In [97]:
#Config Model before training
model = modelTransformer2

In [98]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [99]:
#Training epoch
for epoch in range(10):
    model.train()
    training_loss = 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        training_loss += loss.item()
    epoch_loss = training_loss/ len(train_data)
    print(f"Epoch {epoch+1}, Training Loss: {epoch_loss}")

Epoch 1, Training Loss: 0.0309234888988278
Epoch 2, Training Loss: 0.023523804085434936
Epoch 3, Training Loss: 0.02069996645317863
Epoch 4, Training Loss: 0.01872493262840334
Epoch 5, Training Loss: 0.017205103164195386
Epoch 6, Training Loss: 0.015798907731781647
Epoch 7, Training Loss: 0.014659296391011153
Epoch 8, Training Loss: 0.013612455360914086
Epoch 9, Training Loss: 0.012651889968637161
Epoch 10, Training Loss: 0.011813166627298657


# **TESTING THE MODEL**

In [100]:
# Valiation  
validation_loss = 0.0
model.eval()

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        validation_loss += loss.item()
epoch_val_loss = validation_loss / len(test_loader)
#model.train()

In [101]:
print(epoch_val_loss)

1.5614400548789338


# **EVALUATION**

In [102]:
import torch.nn.functional as F
from torchmetrics.classification import MulticlassAccuracy, MulticlassPrecision, MulticlassRecall

num_classes = unique_category
accuracy = MulticlassAccuracy(num_classes=num_classes, average="weighted").to("cpu")
precision = MulticlassPrecision(num_classes=num_classes, average="weighted").to("cpu")
recall = MulticlassRecall(num_classes=num_classes, average="weighted").to("cpu")

model.eval()
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to("cpu"), labels.to("cpu")

        outputs = model(inputs)
        probs = F.softmax(outputs, dim=1)
        preds = torch.argmax(probs, dim=1)

        # Update metrics each batch
        accuracy.update(outputs, labels)
        precision.update(outputs, labels)
        recall.update(outputs, labels)

# Compute metrics after going through the full dataloader
acc = accuracy.compute().item()
prec = precision.compute().item()
rec = recall.compute().item()

print(f"Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}")

# Reset for next evaluation
accuracy.reset()
precision.reset()
recall.reset()


Accuracy=0.6180, Precision=0.6085, Recall=0.6180


Note:
- CNNModel: Acc=0.0236, Prec=0.0005, Rec=0.0236
- LSTMModel: Acc=0.3946, Prec=0.4609, Rec=0.3946
- BiLSTM: Accuracy=0.3806, Precision=0.4873, Recall=0.3806

2nd (group nhóm category, data preprocessing)

- BiLSTM: Accuracy=0.6406, Precision=0.6262, Recall=0.6406
- Transformers: Accuracy=0.6180, Precision=0.6085, Recall=0.6180

# **PREDICTION**

In [103]:
import torch
import torch.nn.functional as F

def predict(model, text, word2idx, max_len, id2label, device="cpu"):
    # Tokenize
    tokens = preprocess_text(text)

    # Map to ids
    unk_id = word2idx.get("<UNK>", 0)
    token_ids = [word2idx.get(w, unk_id) for w in tokens]

    # Pad / truncate
    if len(token_ids) < max_len:
        token_ids += [0] * (max_len - len(token_ids))
    else:
        token_ids = token_ids[:max_len]

    # To tensor (Long for embedding)
    input_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)

    # Predict
    model.eval()
    with torch.no_grad():
        outputs = model(input_tensor)  # shape [1, num_classes]
        probs = F.softmax(outputs, dim=1).squeeze(0)
        pred_idx = torch.argmax(probs).item()
    
    predicted_label = id2label.get(pred_idx, "UNKNOWN")
    return predicted_label, probs.tolist()


In [105]:
from func_dataprep import preprocess_text, pad_input

In [106]:
# Suppose you trained with 42 classes

model = modelBiLSTM

sample_text = "IRS Launches Safety Review Amid Threats To Workers Linked To Conspiracy Theories"
label, probs = predict(model, sample_text, word2idx, max_len=10, id2label=id2label, device="cpu")

print("Predicted:", label)
print("Probabilities:", probs)


Predicted: BUSINESS & FINANCES
Probabilities: [0.00026518042432144284, 0.49663493037223816, 0.0014566309982910752, 0.0012426332104951143, 0.00036261870991438627, 0.003507173154503107, 0.003052482847124338, 0.0091775581240654, 0.00040141315548680723, 0.002648609457537532, 0.00031191491871140897, 0.009010563604533672, 0.011370893567800522, 0.003811159636825323, 0.00132908730302006, 0.28303173184394836, 0.00041427763062529266, 0.04187610745429993, 0.0006400030106306076, 0.0003194712335243821, 0.000508948287460953, 0.027165137231349945, 6.196845060912892e-05, 0.0006430988432839513, 0.09755177795886993, 0.001535338582471013, 0.0016693304060027003]


# SAVE AND LOAD MODEL

In [108]:
import torch
import pickle

In [107]:
# model_train_path = "saved_model/news_categorize_transformers.pth"
# torch.save(model.state_dict(), model_train_path)

In [108]:
# #Save word2idx, id2label, max_len
# utils_transformers = {
#     "word2idx": word2idx,
#     "id2label": id2label,
#     "max_len": 12
# }
# with open("saved_model/utils_transformers.pkl", "wb") as f:
#     pickle.dump(utils_transformers, f)

NameError: name 'pickle' is not defined