In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import os

In [2]:
seed = 43
torch.manual_seed(seed)
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

In [3]:
batch_size = 32

In [4]:
def simple_tokenizer(text):
    return re.findall(r'\b\w+\b', str(text).lower())

In [5]:
train_df = pd.read_csv("Datasets/TrainData.csv")
test_df = pd.read_csv("Datasets/TestLabels.csv")

train_df.dropna(subset=['Text', 'Category'], inplace=True)
test_df.dropna(subset=['Text', 'Label - (business, tech, politics, sport, entertainment)'], inplace=True)

print(f"Train samples: {len(train_df)}, Test samples: {len(test_df)}")

train_texts = train_df['Text'].tolist()
train_labels = train_df['Category'].tolist()
test_texts = test_df['Text'].tolist()
test_labels = test_df['Label - (business, tech, politics, sport, entertainment)'].tolist()

Train samples: 1490, Test samples: 735


In [6]:
train_tokenized = [simple_tokenizer(t) for t in train_texts]
test_tokenized = [simple_tokenizer(t) for t in test_texts]

In [7]:
lengths = [len(inner_array) for inner_array in train_tokenized]
max_len = int(np.percentile(lengths, 90))

In [8]:
class NewsDataset(Dataset):
    def __init__(self, tokenized_text, labels, vocab=None, label2idx=None, max_len=300):
        self.texts = tokenized_text
        self.max_len = max_len
        if vocab is None:
            words = [word for text in self.texts for word in text]
            word_freq = Counter(words)
            self.vocab = {'<PAD>': 0, '<UNK>': 1}
            for word in word_freq:
                self.vocab[word] = len(self.vocab)
        else:
            self.vocab = vocab

        self.texts = [self.encode(text) for text in self.texts]

        if label2idx is None:
            unique_labels = sorted(set(label for label in labels if pd.notna(label)))
            self.label2idx = {label: i for i, label in enumerate(unique_labels)}
        else:
            self.label2idx = label2idx

        self.labels = [self.label2idx[label] for label in labels if pd.notna(label)]

    def encode(self, tokens):
        encoded = [self.vocab.get(tok, self.vocab['<UNK>']) for tok in tokens]
        return encoded[:self.max_len] + [0]*(self.max_len - len(encoded))

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return torch.tensor(self.texts[idx]), torch.tensor(self.labels[idx])

In [9]:
train_data = NewsDataset(train_tokenized, train_labels)
test_data = NewsDataset(test_tokenized, test_labels, vocab=train_data.vocab, label2idx=train_data.label2idx)

In [10]:
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size)

In [11]:
class CLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes,ls_layer=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv = nn.Conv1d(embed_dim, 128, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.lstm = nn.LSTM(128, 128, batch_first=True, bidirectional=True,dropout=0.3)
        self.attn_fc = nn.Linear(256, 1)
        self.fc = nn.Linear(256, num_classes)
        self.dropout = nn.Dropout(0.3)  # Add dropout
        self.bn = nn.BatchNorm1d(128)  # After conv layer


    def forward(self, x):
        x = self.embedding(x)              
        x = x.permute(0, 2, 1)           
        x = self.relu(self.conv(x))    
        x = self.dropout(x)  # After embedding/conv layers
   
        x = x.permute(0, 2, 1)             
        lstm_out, _ = self.lstm(x)        # (B, T, 2H)
        x = torch.tanh(lstm_out)
        attn_weights = torch.softmax(self.attn_fc(x), dim=1)  # (B, T, 1)
        context = torch.sum(attn_weights * x, dim=1)          # (B, 2H)
        return self.fc(context)           # (B, num_classes)


In [12]:
def train_model(model, loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    correct_train = 0
    total_train = 0
    for texts, labels in loader:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        _, predicted = torch.max(outputs, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()
    avg_train_loss = total_loss / len(train_loader)
    train_accuracy = 100 * correct_train / total_train
    return avg_train_loss , train_accuracy

In [13]:
def evaluate_model(model, loader, criterion):
    model.eval()
    correct = 0
    total = 0
    total_loss = 0
    all_preds ,all_labels = [] ,[]
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.cpu().tolist())
            all_labels.extend(y_batch.cpu().tolist())

            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

    avg_loss = total_loss / len(test_loader)
    accuracy = 100 * correct / total
    f1 = f1_score(all_labels,all_preds,average='micro')
    return avg_loss, accuracy,f1

In [14]:
EPOCHS = 50
lr = 0.0003
weight_decay = 1e-3
ls_layer = 64
hidden = 64

In [15]:

# Model
model = CLSTM(len(train_data.vocab), embed_dim=512, num_classes=len(train_data.label2idx),
              ).to(device)




In [17]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=lr,weight_decay=weight_decay)

In [18]:
patience = 15 
best_val_loss = float('inf')
counter = 0  
for epoch in range(EPOCHS):
    avg_train_loss , train_accuracy = train_model(model, train_loader, criterion, optimizer, device)
    avg_val_loss, val_accuracy ,f1 = evaluate_model(model, test_loader, criterion)
    print(f"Epoch {epoch+1}/{EPOCHS} | "
              f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.2f}% | "
              f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.2f}% | f1: {f1:.2f}")
    
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        counter = 0  # Reset patience counter
        best_model = model.state_dict()  # Save best model
    else:
        counter += 1

    if counter >= patience:
        print(f"Early stopping at epoch {epoch}. Best val loss: {best_val_loss:.4f}")
        break

Epoch 1/50 | Train Loss: 1.5432 | Train Acc: 28.26% | Val Loss: 1.4705 | Val Acc: 37.28% | f1: 0.37
Epoch 2/50 | Train Loss: 1.2365 | Train Acc: 55.84% | Val Loss: 0.9742 | Val Acc: 58.78% | f1: 0.59
Epoch 3/50 | Train Loss: 0.6462 | Train Acc: 78.26% | Val Loss: 0.6424 | Val Acc: 73.61% | f1: 0.74
Epoch 4/50 | Train Loss: 0.3932 | Train Acc: 87.72% | Val Loss: 0.4253 | Val Acc: 85.44% | f1: 0.85
Epoch 5/50 | Train Loss: 0.2316 | Train Acc: 94.30% | Val Loss: 0.3918 | Val Acc: 86.67% | f1: 0.87
Epoch 6/50 | Train Loss: 0.1836 | Train Acc: 94.63% | Val Loss: 0.3414 | Val Acc: 88.98% | f1: 0.89
Epoch 7/50 | Train Loss: 0.1376 | Train Acc: 95.77% | Val Loss: 0.3418 | Val Acc: 89.25% | f1: 0.89
Epoch 8/50 | Train Loss: 0.0775 | Train Acc: 97.99% | Val Loss: 0.3478 | Val Acc: 88.71% | f1: 0.89
Epoch 9/50 | Train Loss: 0.0701 | Train Acc: 98.39% | Val Loss: 0.3116 | Val Acc: 90.61% | f1: 0.91
Epoch 10/50 | Train Loss: 0.1319 | Train Acc: 97.52% | Val Loss: 0.3190 | Val Acc: 89.25% | f1: 0.89

In [19]:
model.load_state_dict(best_model)

<All keys matched successfully>

In [20]:
# Final evaluation and confusion matrix
avg_val_loss, val_accuracy, f1 = evaluate_model(model, test_loader, criterion)
print(f"\nFinal Micro F1 Score = {f1:.4f}")
label_names = [label for label, _ in sorted(train_data.label2idx.items(), key=lambda x: x[1])]



Final Micro F1 Score = 0.9156
