In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import f1_score
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
import gensim.downloader as api
from nltk.corpus import stopwords
import string
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# Make sure the NLTK tokenizer data packages are installed locally (the logged
# output shows the download is skipped when a package is already up to date).
# Presumably these back word_tokenize, used in preprocess_text — TODO confirm
# both packages are still required.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/kash/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/kash/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Seed PyTorch's global RNG so weight initialisation and shuffling are repeatable.
torch.manual_seed(420)

# Pick the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [4]:
# CSV locations of the two labelled splits, relative to the working directory.
train_path, test_path = "Datasets/TrainData.csv", "Datasets/TestLabels.csv"

In [5]:
# Read the training split: document text (forced to str) and its category label.
train_df = pd.read_csv(train_path)
train_texts = train_df.loc[:, "Text"].astype(str).to_list()
train_labels = train_df.loc[:, "Category"].to_list()


In [6]:
# Read the test split; its label column uses a longer, descriptive header.
test_df = pd.read_csv(test_path)
test_texts = test_df.loc[:, "Text"].astype(str).to_list()
test_labels = test_df.loc[:, "Label - (business, tech, politics, sport, entertainment)"].to_list()


In [7]:
def preprocess_text(text):
    """Lower-case ``text`` and return its NLTK word tokens as a new list.

    No tokens are dropped: the result contains every token produced by
    ``word_tokenize`` for the lower-cased input.
    """
    tokens = word_tokenize(text.lower())
    return list(tokens)

In [8]:
# Width of the feature vectors fed to the classifier below (passed to
# TransformerTextEncoder as embedding_dim).
embedding_dim = 768  # BERT embedding size

In [9]:
from transformers import BertTokenizer, BertModel

# Pretrained uncased BERT-base: the tokenizer turns text into model inputs and
# the encoder (moved to the selected device) produces the hidden states.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased").to(device)



In [10]:
max_len = 512  # upper bound on tokens per text; longer inputs are truncated


def get_bert_embeddings(text):
    """Return the first-token ([CLS]) hidden state of ``text`` from ``bert_model``.

    Args:
        text: A single string (or a list of strings) accepted by ``tokenizer``.

    Returns:
        Tensor of shape ``(batch, hidden)`` on ``device``; ``batch`` is 1 when a
        single string is passed.

    The input is padded only up to the longest sequence in the call rather than
    always to ``max_len``. Padded positions are masked out by the attention
    mask, so padding every short text to ``max_len`` only added encoder work
    without changing the [CLS] vector (up to floating-point rounding).
    """
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
    ).to(device)
    # Embeddings are used as fixed features, so no autograd graph is needed.
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state[:, 0, :]  # CLS token representation
# Convert datasets


In [11]:
# Embed each document with one get_bert_embeddings call and stack the per-text
# results along a new leading axis (one entry per document).
train_encodings = torch.stack(list(map(get_bert_embeddings, train_texts)))
test_encodings = torch.stack(list(map(get_bert_embeddings, test_texts)))

# NOTE(review): this snippet looks like an earlier token-id pipeline that was
# superseded by get_bert_embeddings — confirm whether it is still meant to run.
# It repeats the BertTokenizer import and the embedding_dim / tokenizer
# assignments already made in earlier cells.
from transformers import BertTokenizer
embedding_dim = 768  # BERT embedding size

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Returns the tokenizer's "input_ids" for `text`, padded/truncated to a fixed length.
# NOTE(review): max_length is embedding_dim (768) here, whereas get_bert_embeddings
# uses max_len (512) — a feature width is being used as a sequence length; confirm.
def tokenize_with_transformer(text):
    return tokenizer(text, padding="max_length", truncation=True, max_length=embedding_dim, return_tensors="pt")["input_ids"].squeeze(0)# Tokenize datasets
# NOTE(review): if executed, the two assignments below replace the encodings
# stacked from get_bert_embeddings with token ids — verify this is intended.
train_encodings = tokenize_with_transformer(train_texts)
test_encodings = tokenize_with_transformer(test_texts)


In [33]:
# Encode the string categories as integer class ids.
# The encoder is fitted on the raw dataframe columns rather than on
# train_labels/test_labels themselves: those names are overwritten below, so
# fitting from them would make a second run of this cell re-fit on the integer
# codes and lose the original class names in label_encoder.classes_.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df["Category"])
test_labels = label_encoder.transform(
    test_df["Label - (business, tech, politics, sport, entertainment)"]
)

In [34]:
# Positional Encoding
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to a batch-first sequence tensor.

    Args:
        embedding_dim: Size of the last (feature) axis of the inputs. Odd sizes
            are supported; the last sine column then has no cosine partner.
        max_len: Number of positions to precompute. ``forward`` uses the first
            ``x.size(1)`` of them.

    The table is registered as a non-persistent buffer: it follows the module
    across ``.to(device)`` calls (instead of being copied to the input's device
    on every forward pass) and is not written to ``state_dict``, so the set of
    checkpoint keys is the same as when it was a plain attribute.
    """

    def __init__(self, embedding_dim, max_len=max_len):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim)
        )
        angles = position * div_term  # (max_len, ceil(embedding_dim / 2))

        pe = torch.zeros(max_len, embedding_dim)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd embedding_dim there is one fewer odd-indexed column than
        # even-indexed ones, so trim the angle table to match.
        pe[:, 1::2] = torch.cos(angles[:, : embedding_dim // 2])
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions."""
        # .to() is a no-op once the module lives on the same device as x; it is
        # kept so a module that was never moved still works as before.
        return x + self.pe[:, : x.size(1)].to(x.device)

In [35]:
class TransformerTextEncoder(nn.Module):
    """Classifier head: linear projection, positional encoding, Transformer encoder, linear output.

    Args:
        embedding_dim: Feature width of the inputs and of every internal layer.
        num_classes: Number of output logits.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout rate used inside the encoder layers and before the
            output layer.
    """

    def __init__(self, embedding_dim, num_classes, num_heads=8, num_layers=2, dropout=0.3):
        super().__init__()

        # Learned projection applied to the incoming feature vectors.
        self.embedding_layer = nn.Linear(embedding_dim, embedding_dim)
        self.pos_encoder = PositionalEncoding(embedding_dim)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=num_heads,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.fc = nn.Linear(embedding_dim, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map a batch of feature vectors to class logits.

        ``x`` gets a new axis inserted at position 1 before the encoder, so
        each input vector is treated as a sequence of length one
        (assumes ``x`` is ``(batch, embedding_dim)`` — TODO confirm with callers).
        """
        projected = self.embedding_layer(x)
        sequence = self.pos_encoder(projected.unsqueeze(1))
        encoded = self.transformer_encoder(sequence)
        first_token = encoded[:, 0, :]  # output at sequence position 0
        return self.fc(self.dropout(first_token))

In [36]:
class TextDataset(Dataset):
    """Pair pre-computed per-sample features with integer class labels.

    Args:
        tokenized_texts: Indexable, sized collection of per-sample features
            (e.g. a tensor whose first axis enumerates samples); stored as-is.
        labels: Sequence, array, or tensor of integer class ids, one per
            sample. Always copied into a new ``torch.long`` tensor.

    Raises:
        ValueError: If the number of samples and the number of labels differ.
    """

    def __init__(self, tokenized_texts, labels):
        self.data = tokenized_texts  # Already a tensor, no need to stack
        if isinstance(labels, torch.Tensor):
            # torch.tensor(<tensor>) emits a copy-construct warning; copy explicitly.
            self.labels = labels.detach().clone().to(torch.long)
        else:
            self.labels = torch.tensor(labels, dtype=torch.long)

        # Fail at construction time instead of with an IndexError mid-epoch.
        if len(self.data) != len(self.labels):
            raise ValueError(
                f"Got {len(self.data)} samples but {len(self.labels)} labels"
            )

    def __len__(self):
        """Number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position ``idx``."""
        return self.data[idx], self.labels[idx]

In [61]:
# Model and training hyperparameters used by the cells below.
num_classes = len(label_encoder.classes_)  # Ensure correct output size: one logit per encoded label
num_heads = 8  # attention heads per encoder layer
num_layers =6  # stacked encoder layers (overrides the class default of 2)
epochs = 15  # Reduced for testing
learning_rate = 0.0001  # initial Adam learning rate; the plateau scheduler may lower it
batch_size = 32  # samples per DataLoader batch

In [62]:
# Wrap the pre-computed encodings and integer labels as indexable datasets.
train_dataset = TextDataset(tokenized_texts=train_encodings, labels=train_labels)
test_dataset = TextDataset(tokenized_texts=test_encodings, labels=test_labels)

In [63]:
from torch.utils.data import random_split

# Hold out 20% of the labelled training data for validation.
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size

# Use a dedicated, seeded generator (same seed as the global one) so the
# train/validation membership is identical on every run of this cell, no
# matter how much of the global RNG stream earlier cells have consumed.
train_subset, val_subset = random_split(
    train_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(420),
)


In [64]:
# Only the training loader reshuffles each epoch; validation and test keep
# dataset order (DataLoader's default is shuffle=False).
train_loader = DataLoader(dataset=train_subset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(dataset=val_subset, batch_size=batch_size)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size)

In [65]:
# Build the classifier with the hyperparameters above and move it to the device.
model = TransformerTextEncoder(
    embedding_dim=embedding_dim,
    num_classes=num_classes,
    num_heads=num_heads,
    num_layers=num_layers,
    dropout=0.3,
).to(device)

In [66]:
# Cross-entropy over the class logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
# Cut the learning rate to a tenth when the monitored loss has not improved
# for more than `patience` consecutive scheduler steps.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.1,
    patience=3,
)

In [67]:
def evaluate(model, dataloader, criterion):
    """Run ``model`` over ``dataloader`` in eval mode and summarise its performance.

    Args:
        model: Module mapping a float feature batch to class logits.
        dataloader: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss called as ``criterion(logits, labels)``.

    Returns:
        Tuple ``(avg_loss, accuracy, micro_f1)``: the mean of the per-batch
        losses, the accuracy in percent, and the micro-averaged F1 score.
    """
    model.eval()
    running_loss = 0
    predictions, targets = [], []

    with torch.no_grad():
        for features, labels in dataloader:
            # Cast to float32 on the target device, then drop the singleton
            # axis at dim 1 (a no-op when that axis is not of size one).
            features = features.to(device).to(torch.float32).squeeze(1)
            labels = labels.to(device)

            logits = model(features)
            running_loss += criterion(logits, labels).item()

            predictions.extend(logits.argmax(dim=1).cpu().numpy())
            targets.extend(labels.cpu().numpy())

    avg_loss = running_loss / len(dataloader)
    accuracy = (np.array(predictions) == np.array(targets)).mean() * 100
    micro_f1 = f1_score(targets, predictions, average='micro')
    return avg_loss, accuracy, micro_f1

In [68]:
# Training loop: one optimisation pass over train_loader per epoch, followed by
# a validation pass whose loss drives the ReduceLROnPlateau scheduler.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_train = 0
    total_train = 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device).to(torch.float32), y_batch.to(device)
        X_batch = X_batch.squeeze(1)  # Remove the unnecessary dimension

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        # Running statistics come from the same forward pass used for the
        # update, i.e. with the model in train mode (dropout active).
        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total_train += y_batch.size(0)
        correct_train += (predicted == y_batch).sum().item()

    avg_train_loss = total_loss / len(train_loader)  # mean of per-batch losses
    train_accuracy = 100 * correct_train / total_train

    avg_val_loss, val_accuracy, micro_f1 = evaluate(model, val_loader, criterion)

    # The scheduler monitors validation loss, not training loss.
    scheduler.step(avg_val_loss)

    print(f"Epoch {epoch+1}/{epochs} | "
            f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.2f}% | "
            f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.2f}% | Val micro F1: {micro_f1:.2f}")




Epoch 1/15 | Train Loss: 1.0053 | Train Acc: 59.06% | Val Loss: 0.2104 | Val Acc: 94.30% | Val micro F1: 0.94
Epoch 2/15 | Train Loss: 0.2400 | Train Acc: 92.70% | Val Loss: 0.1197 | Val Acc: 96.98% | Val micro F1: 0.97
Epoch 3/15 | Train Loss: 0.2064 | Train Acc: 92.37% | Val Loss: 0.2892 | Val Acc: 92.62% | Val micro F1: 0.93
Epoch 4/15 | Train Loss: 0.1573 | Train Acc: 94.71% | Val Loss: 0.1233 | Val Acc: 96.31% | Val micro F1: 0.96
Epoch 5/15 | Train Loss: 0.1225 | Train Acc: 95.89% | Val Loss: 0.1607 | Val Acc: 95.30% | Val micro F1: 0.95
Epoch 6/15 | Train Loss: 0.1192 | Train Acc: 95.97% | Val Loss: 0.1043 | Val Acc: 97.32% | Val micro F1: 0.97
Epoch 7/15 | Train Loss: 0.0897 | Train Acc: 96.90% | Val Loss: 0.1127 | Val Acc: 96.64% | Val micro F1: 0.97
Epoch 8/15 | Train Loss: 0.0729 | Train Acc: 97.57% | Val Loss: 0.0964 | Val Acc: 97.99% | Val micro F1: 0.98
Epoch 9/15 | Train Loss: 0.0594 | Train Acc: 98.07% | Val Loss: 0.1567 | Val Acc: 96.31% | Val micro F1: 0.96
Epoch 10/1

In [69]:
# Final evaluation on the test split, reusing the same routine as validation.
# evaluate() returns (mean per-batch loss, accuracy in percent, micro-averaged F1).
test_loss, test_accuracy, test_f1 = evaluate(model, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_accuracy:.2f}% | Test F1: {test_f1:.4f}")

Test Loss: 0.3028 | Test Accuracy: 95.24% | Test F1: 0.9524
