In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import f1_score
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
import string
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/kash/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/kash/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
torch.manual_seed(420)
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

In [4]:
train_path ="Datasets/TrainData.csv"
test_path = "Datasets/TestLabels.csv"

In [5]:
# Load dataset
train_df = pd.read_csv(train_path)  # Change to actual file path
train_texts = train_df["Text"].astype(str).tolist()
train_labels = train_df["Category"].tolist()


In [6]:
test_df = pd.read_csv(test_path)  # Change to actual file path
test_texts = test_df["Text"].astype(str).tolist()
test_labels = test_df["Label - (business, tech, politics, sport, entertainment)"].tolist()


In [7]:
def preprocess_text(text):
    tokenized = word_tokenize(text.lower())
    filtered = [word for word in tokenized]
    return filtered

In [8]:
embedding_dim = 256  # BERT embedding size

In [9]:
# Build vocabulary from training data
vocab = {word: idx + 1 for idx, word in enumerate(set(word for text in train_texts for word in word_tokenize(text.lower())))}
vocab_size = len(vocab) + 1  # Add 1 for padding index

# Add an <OOV> token for out-of-vocabulary words
vocab["<OOV>"] = len(vocab) + 1

In [10]:
def text_to_indices(tokenized, vocab, max_len):
    indices = [vocab.get(word, vocab["<OOV>"]) for word in tokenized[:max_len]]
    if len(indices) < max_len:
        indices += [0] * (max_len - len(indices))  # Padding with 0
    return indices

In [11]:
tokenized_texts = [word_tokenize(text.lower()) for text in train_texts]
test_tokenized = [word_tokenize(text.lower()) for text in test_texts]

In [12]:
lengths = [len(inner_array) for inner_array in tokenized_texts]
max_len = 256#int(np.percentile(lengths, 90))

In [13]:
train_encodings = torch.tensor([text_to_indices(text, vocab, max_len) for text in train_texts], dtype=torch.long)
test_encodings = torch.tensor([text_to_indices(text, vocab, max_len) for text in test_texts], dtype=torch.long)


In [14]:
# Encode labels
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
test_labels = label_encoder.transform(test_labels)

In [15]:
class PositionalEncoding(nn.Module):
    def __init__(self, embedding_dim, max_len=max_len):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, embedding_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))

        pe[:, 0::2] = torch.sin(position * div_term)
        if embedding_dim % 2 == 0:
            pe[:, 1::2] = torch.cos(position * div_term)
        else:
            pe[:, 1:embedding_dim:2] = torch.cos(position * div_term)

        self.register_buffer('pe', pe.unsqueeze(0))  # Store positional encoding as a buffer

    def forward(self, x):
        # Ensure positional encoding is moved to the same device as x
        return x + self.pe[:, :x.size(1), :].to(x.device)


In [16]:
class TransformerTextEncoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes, num_heads=8, num_layers=2, dropout=0.3):
        super(TransformerTextEncoder, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)  # Learnable embeddings
        self.pos_encoder = PositionalEncoding(embedding_dim)

        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads, dropout=dropout, batch_first=True),
            num_layers=num_layers
        )

        self.fc = nn.Linear(embedding_dim, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x)  # Convert indices to embeddings
        x = self.pos_encoder(x)  
        x = self.transformer_encoder(x)  
        x = x[:, 0, :]  # Take CLS token output
        x = self.dropout(x)
        return self.fc(x)


In [17]:
class TextDataset(Dataset):
    def __init__(self, tokenized_texts, labels):
        self.data = tokenized_texts  # Tensor of indices
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [18]:
num_classes = len(label_encoder.classes_)  # Ensure correct output size
num_heads = 8
num_layers =4
epochs = 15  # Reduced for testing
learning_rate = 0.001
batch_size = 32

In [19]:
train_dataset = TextDataset(train_encodings, train_labels)
test_dataset = TextDataset(test_encodings, test_labels)

In [20]:
from torch.utils.data import random_split

train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size

train_subset, val_subset = random_split(train_dataset, [train_size, val_size])


In [21]:
train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [22]:
model = TransformerTextEncoder(vocab_size,embedding_dim, num_classes, num_heads, num_layers,0.3).to(device)

In [23]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3, factor=0.1)

In [24]:
def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            inputs, labels = batch  # Unpacking
            inputs = inputs.to(device)  # Convert to float32
            labels = labels.to(device)
            inputs = inputs.squeeze(1)  # Remove the unnecessary dimension
            outputs = model(inputs)             
            loss = criterion(outputs, labels)
            total_loss += loss.item()


            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    accuracy = (np.array(all_preds) == np.array(all_labels)).mean() * 100
    micro_f1 = f1_score(all_labels, all_preds, average='micro')
    return avg_loss, accuracy, micro_f1

In [25]:
# Training Loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    total_loss = 0
    correct_train = 0
    total_train = 0
    
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # Keep X_batch as LongTensor
        optimizer.zero_grad()
        X_batch = X_batch.squeeze(1)  # Remove the unnecessary dimension
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()


        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total_train += y_batch.size(0)
        correct_train += (predicted == y_batch).sum().item()

    avg_train_loss = total_loss / len(train_loader)
    train_accuracy = 100 * correct_train / total_train

    avg_val_loss, val_accuracy,micro_f1 = evaluate(model, val_loader, criterion)

    avg_train_loss = total_loss / len(train_loader)
    scheduler.step(avg_val_loss)

    print(f"Epoch {epoch+1}/{epochs} | "
            f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.2f}% | "
            f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.2f}% | Val micro F1: {micro_f1:.2f}")




Epoch 1/15 | Train Loss: 1.7787 | Train Acc: 21.56% | Val Loss: 1.6091 | Val Acc: 24.16% | Val micro F1: 0.24
Epoch 2/15 | Train Loss: 1.6583 | Train Acc: 23.24% | Val Loss: 1.6306 | Val Acc: 17.45% | Val micro F1: 0.17


KeyboardInterrupt: 

In [69]:
test_loss, test_accuracy, test_f1 = evaluate(model, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_accuracy:.2f}% | Test F1: {test_f1:.4f}")

Test Loss: 0.3028 | Test Accuracy: 95.24% | Test F1: 0.9524
