In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
import string

In [2]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/kash/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/kash/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
punctuation = set(string.punctuation)

In [4]:
batch_size = 256
EPOCHS=200

In [5]:
seed = 23
torch.manual_seed(seed)
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

In [6]:
train_path ="Datasets/TrainData.csv"
test_path = "Datasets/TestLabels.csv"

In [7]:
# Load dataset
train_df = pd.read_csv(train_path)  # Change to actual file path
train_texts = train_df["Text"].astype(str).tolist()
train_labels = train_df["Category"].tolist()


In [8]:
test_df = pd.read_csv(test_path)  # Change to actual file path
test_texts = test_df["Text"].astype(str).tolist()
test_labels = test_df["Label - (business, tech, politics, sport, entertainment)"].tolist()


In [9]:
def preprocess_text(text):
    tokenized = word_tokenize(text.lower())  # Tokenize and lowercase
    #filtered = [word for word in tokenized]
    filtered = [word for word in tokenized if word not in punctuation]

    return tokenized

In [10]:
tokenized_texts = [preprocess_text(text) for text in train_texts]
test_tokenized = [preprocess_text(text) for text in test_texts]

In [11]:
vocab = {word: idx + 1 for idx, word in enumerate(set(word for text in tokenized_texts for word in text))}
vocab_size = len(vocab) + 1  

In [12]:
vocab["<OOV>"] = len(vocab) + 1

def text_to_indices(text, vocab, max_len):
    indices = [vocab.get(word, vocab["<OOV>"]) for word in text[:max_len]]
    if len(indices) < max_len:
        indices += [0] * (max_len - len(indices)) # Padding with 0
    return indices


In [13]:
# Encode labels
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
test_labels = label_encoder.transform(test_labels)


In [14]:
lengths = [len(inner_array) for inner_array in tokenized_texts]
max_len = int(np.percentile(lengths, 97))

In [15]:
X_train = np.array([text_to_indices(text, vocab, max_len) for text in tokenized_texts])
y_train = np.array(train_labels)
X_test = np.array([text_to_indices(text, vocab, max_len) for text in test_tokenized])
y_test = np.array(test_labels)

In [16]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=seed)

In [17]:
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)  # Use long type for embedding lookup
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


In [18]:
train_dataset = TextDataset(X_train, y_train)
val_dataset = TextDataset(X_val, y_val)
test_dataset = TextDataset(X_test, y_test)


In [19]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [20]:
class CLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes, out_size=64, hidden_layer=64):
        super(CLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)  # Embedding layer
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=out_size, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(out_size)  # Batch normalization for convolution output
        self.lstm = nn.LSTM(input_size=out_size, hidden_size=out_size,
                            batch_first=True, bidirectional=True, dropout=0.3)
        
        # Add Layer Normalization before Attention
        self.layer_norm = nn.LayerNorm(out_size * 2)  # Normalize over features (bidirectional LSTM output)

        self.attention = nn.Linear(out_size * 2, 1)  
        self.fc1 = nn.Linear(out_size * 2, hidden_layer)
        self.dropout = nn.Dropout(p=0.3)
        self.fc2 = nn.Linear(hidden_layer, num_classes)
        self.fc3 = nn.Linear(hidden_layer//4, num_classes)

        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.embedding(x) 
        x = x.permute(0, 2, 1)  # Change shape to (batch_size, embedding_dim, seq_len)
        x = self.conv1(x)
        x = self.bn1(x)  # Apply Batch Normalization
        x = self.relu(x)
        x = x.permute(0, 2, 1)  # Shape: (batch_size, seq_len, out_channels)
        
        lstm_out, _ = self.lstm(x)  
        
        # Apply Layer Normalization before Attention
        lstm_out = self.layer_norm(lstm_out)

        attention_scores = self.attention(torch.tanh(lstm_out))  # Optional nonlinearity
        attn_weights = torch.softmax(attention_scores.squeeze(-1), dim=1)  # Shape: (batch_size, seq_len)
        
        context_vector = torch.sum(attn_weights.unsqueeze(-1) * lstm_out, dim=1)  # Shape: (batch_size, out_size * 2)
        
        x = torch.relu(self.fc1(context_vector))
        x = self.dropout(x)

        x = self.fc2(x)  
        
        return x


In [21]:
def evaluate(model, test_loader, criterion):
    model.eval()
    correct = 0
    total = 0
    total_loss = 0
    
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            total_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

    avg_loss = total_loss / len(test_loader)
    accuracy = 100 * correct / total
    return avg_loss, accuracy

In [22]:
def train(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=10):
    model.train()
    patience = 20  
    best_val_loss = float('inf')
    counter = 0  
    for epoch in range(epochs):
        total_loss = 0
        correct_train = 0
        total_train = 0
        
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total_train += y_batch.size(0)
            correct_train += (predicted == y_batch).sum().item()

        avg_train_loss = total_loss / len(train_loader)
        train_accuracy = 100 * correct_train / total_train
        avg_val_loss, val_accuracy = evaluate(model, val_loader, criterion)
        scheduler.step(avg_val_loss)  # Adjust learning rate based on validation loss

        print(f"Epoch {epoch+1}/{epochs} | "
              f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.2f}% | "
              f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.2f}%")
        
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            counter = 0  # Reset patience counter
            best_model = model.state_dict()  # Save best model
        else:
            counter += 1

        if counter >= patience:
            print(f"Early stopping at epoch {epoch}. Best val loss: {best_val_loss:.4f}")
            break
    return best_model
  

In [23]:
embedding_dim = 32
num_classes = len(set(y_train))
model = CLSTM(vocab_size=vocab_size,
              embedding_dim=embedding_dim,
              num_classes=num_classes,
              out_size=128,
              hidden_layer=64).to(device)



In [24]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.002, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=10, verbose=True
)



In [25]:
best_model = train(model, train_loader,val_loader ,criterion, optimizer,scheduler, epochs=EPOCHS)


Epoch 1/200 | Train Loss: 1.6137 | Train Acc: 21.56% | Val Loss: 1.5236 | Val Acc: 30.20%
Epoch 2/200 | Train Loss: 1.5592 | Train Acc: 27.60% | Val Loss: 1.4879 | Val Acc: 29.53%
Epoch 3/200 | Train Loss: 1.5346 | Train Acc: 29.28% | Val Loss: 1.4758 | Val Acc: 27.85%
Epoch 4/200 | Train Loss: 1.5468 | Train Acc: 27.77% | Val Loss: 1.4985 | Val Acc: 29.53%
Epoch 5/200 | Train Loss: 1.5294 | Train Acc: 29.61% | Val Loss: 1.4437 | Val Acc: 29.87%
Epoch 6/200 | Train Loss: 1.5126 | Train Acc: 30.79% | Val Loss: 1.4605 | Val Acc: 33.22%
Epoch 7/200 | Train Loss: 1.5233 | Train Acc: 29.87% | Val Loss: 1.4447 | Val Acc: 30.20%
Epoch 8/200 | Train Loss: 1.5054 | Train Acc: 31.71% | Val Loss: 1.4312 | Val Acc: 35.23%
Epoch 9/200 | Train Loss: 1.5012 | Train Acc: 33.05% | Val Loss: 1.4566 | Val Acc: 36.58%
Epoch 10/200 | Train Loss: 1.4971 | Train Acc: 34.40% | Val Loss: 1.4404 | Val Acc: 37.58%
Epoch 11/200 | Train Loss: 1.5007 | Train Acc: 32.55% | Val Loss: 1.4226 | Val Acc: 33.22%
Epoch 12

In [26]:
model.load_state_dict(best_model)

<All keys matched successfully>

In [27]:
test_loss, test_accuracy = evaluate(model, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_accuracy:.2f}%")

Test Loss: 1.2372 | Test Accuracy: 71.56%
