In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
#from gensim.models import Word2Vec
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
import nltk
import gensim.downloader as api
from nltk.corpus import stopwords
import string

In [27]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/kash/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/kash/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [50]:
punctuation = set(string.punctuation)

In [51]:
batch_size = 64
EPOCHS=200

In [52]:
seed = 420
torch.manual_seed(seed)
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

In [53]:
train_path ="Datasets/TrainData.csv"
test_path = "Datasets/TestLabels.csv"

In [54]:
# Load dataset
train_df = pd.read_csv(train_path)  # Change to actual file path
train_texts = train_df["Text"].astype(str).tolist()
train_labels = train_df["Category"].tolist()


In [55]:
test_df = pd.read_csv(test_path)  # Change to actual file path
test_texts = test_df["Text"].astype(str).tolist()
test_labels = test_df["Label - (business, tech, politics, sport, entertainment)"].tolist()


In [57]:
def preprocess_text(text):
    tokenized = word_tokenize(text.lower())  # Tokenize and lowercase
    #filtered = [word for word in tokenized]
    filtered = [word for word in tokenized if word not in punctuation]

    return filtered

In [58]:
tokenized_texts = [preprocess_text(text) for text in train_texts]
test_tokenized = [preprocess_text(text) for text in test_texts]

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)
embedding_dim = word2vec_model.vector_size


In [59]:
#word2vec_model = api.load('word2vec-google-news-300')
word2vec_model = api.load('fasttext-wiki-news-subwords-300')

embedding_dim = word2vec_model.vector_size


from gensim.models import KeyedVectors

# Convert GloVe format to Word2Vec format
glove_input_file = 'path_to_extracted_files/glove.6B.300d.txt'  # Use 300-dimensional embeddings
word2vec_output_file = 'glove.6B.300d.word2vec'

# Convert GloVe to Word2Vec format
gensim.scripts.glove2word2vec.glove2word2vec(glove_input_file, word2vec_output_file)

# Load converted Word2Vec model
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [60]:
# Encode labels
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
test_labels = label_encoder.transform(test_labels)



In [61]:
lengths = [len(inner_array) for inner_array in tokenized_texts]
max_len = int(np.percentile(lengths, 98))

In [62]:
oov_count = sum(1 for text in tokenized_texts for word in text if word not in word2vec_model)
print(f"Number of OOV words: {oov_count}")
embeddings_list = [word2vec_model[word] for word in word2vec_model.index_to_key]
average_embedding = np.mean(embeddings_list, axis=0)


Number of OOV words: 11389


In [63]:
def text_to_embedding(text,average_embedding=average_embedding, max_len=max_len):
    embedding = np.zeros((max_len, embedding_dim))
    
    for i, word in enumerate(text[:max_len]):
        if word in word2vec_model:
            embedding[i] = word2vec_model[word]
        else:
            embedding[i] = average_embedding
    
    return embedding

In [64]:
X_train = np.array([text_to_embedding(text) for text in tokenized_texts])
y_train = np.array(train_labels)

X_test = np.array([text_to_embedding(text) for text in test_tokenized])
y_test = np.array(test_labels)


In [65]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=seed)


In [66]:
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


In [67]:
train_dataset = TextDataset(X_train, y_train)
val_dataset = TextDataset(X_val, y_val)
test_dataset = TextDataset(X_test, y_test)


In [68]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


class CLSTM(nn.Module):
    def __init__(self, embedding_dim, num_classes,out_size=64,hidden_layer = 64):
        super(CLSTM, self).__init__()
        
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=out_size, kernel_size=3, padding=1)
        
        self.lstm = nn.LSTM(input_size=out_size, hidden_size=out_size, batch_first=True, bidirectional=True, dropout=0.3)
        
        self.attention = nn.Linear(out_size * 2, 1)  
        
        self.fc1 = nn.Linear(out_size * 2, hidden_layer)  
        self.dropout = nn.Dropout(p=0.4)
        self.fc2 = nn.Linear(hidden_layer, num_classes)
        self.relu  = nn.ReLU()

    def forward(self, x):
        x = x.permute(0, 2, 1)  # Change to (batch_size, embedding_dim, seq_len)
        x = torch.relu(self.conv1(x)) 
        
        x = x.permute(0, 2, 1)  # Shape: (batch_size, seq_len, 128)
        
        lstm_out, _ = self.lstm(x)  
        
        attn_weights = torch.softmax(self.attention(lstm_out), dim=1)  
        context_vector = torch.sum(attn_weights * lstm_out, dim=1)     
        
        x = torch.relu(self.fc1(context_vector))  
        x = self.dropout(x)
        x = self.fc2(x)  # Shape: (batch_size, num_classes)

        return x


In [69]:
class CLSTM(nn.Module):
    def __init__(self, embedding_dim, num_classes,out_size=64,hidden_layer = 64):
        super(CLSTM, self).__init__()
        
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=out_size, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(out_size)  # Batch Normalization for Conv1d output
        self.lstm = nn.LSTM(input_size=out_size, hidden_size=out_size, batch_first=True, bidirectional=True, dropout=0.3)
        
        self.attention = nn.Linear(out_size * 2, 1)  
        
        self.fc1 = nn.Linear(out_size * 2, hidden_layer)  
        self.dropout = nn.Dropout(p=0.4)
        self.fc2 = nn.Linear(hidden_layer, num_classes)
        self.relu  = nn.ReLU()

    def forward(self, x):
        x = x.permute(0, 2, 1)  # Change to (batch_size, embedding_dim, seq_len)
        x = self.conv1(x)
        x = self.bn1(x)  
        x = self.relu(x)       # Apply Batch Normalization
        x = x.permute(0, 2, 1)  # Shape: (batch_size, seq_len, out_size)

        lstm_out, _ = self.lstm(x)  # Shape: (batch_size, seq_len, out_size * 2)
        attention_scores = self.attention(torch.tanh(lstm_out))  # Optional nonlinearity
        #attention_scores = self.attention(lstm_out)  # Shape: (batch_size, seq_len, 1)
        attn_weights = torch.softmax(attention_scores.squeeze(-1), dim=1)  # Shape: (batch_size, seq_len)

        context_vector = torch.sum(attn_weights.unsqueeze(-1) * lstm_out, dim=1)  # Shape: (batch_size, out_size * 2)

        x = torch.relu(self.fc1(context_vector))  
        x = self.dropout(x)
        x = self.fc2(x)  # Shape: (batch_size, num_classes)

        return x



In [70]:
def evaluate(model, test_loader, criterion):
    model.eval()
    correct = 0
    total = 0
    total_loss = 0
    
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            total_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

    avg_loss = total_loss / len(test_loader)
    accuracy = 100 * correct / total
    return avg_loss, accuracy

In [71]:
def train(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=10):
    model.train()
    patience = 20  
    best_val_loss = float('inf')
    counter = 0  
    for epoch in range(epochs):
        total_loss = 0
        correct_train = 0
        total_train = 0
        
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total_train += y_batch.size(0)
            correct_train += (predicted == y_batch).sum().item()

        avg_train_loss = total_loss / len(train_loader)
        train_accuracy = 100 * correct_train / total_train

        # Evaluate on validation set
        avg_val_loss, val_accuracy = evaluate(model, val_loader, criterion)
        #scheduler.step()

        #scheduler.step(avg_val_loss)

        print(f"Epoch {epoch+1}/{epochs} | "
              f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.2f}% | "
              f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.2f}%")
        
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            counter = 0  # Reset patience counter
            best_model = model.state_dict()  # Save best model
        else:
            counter += 1

        if counter >= patience:
            print(f"Early stopping at epoch {epoch}. Best val loss: {best_val_loss:.4f}")
            #break
    return best_model
  

In [72]:
num_classes = len(set(y_train))
model = CLSTM(embedding_dim, num_classes,hidden_layer=16).to(device)



In [73]:
criterion = nn.CrossEntropyLoss()
#optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-4) #4
optimizer = optim.AdamW(model.parameters(), lr=0.0005, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-4)
scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.004, epochs=EPOCHS, steps_per_epoch=len(train_loader))
#scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=2, eta_min=1e-6)

In [74]:
best_model = train(model, train_loader,val_loader ,criterion, optimizer,scheduler, epochs=EPOCHS)


Epoch 1/200 | Train Loss: 1.6122 | Train Acc: 18.20% | Val Loss: 1.6105 | Val Acc: 19.13%
Epoch 2/200 | Train Loss: 1.6131 | Train Acc: 18.20% | Val Loss: 1.6098 | Val Acc: 19.13%
Epoch 3/200 | Train Loss: 1.6118 | Train Acc: 18.20% | Val Loss: 1.6090 | Val Acc: 19.13%
Epoch 4/200 | Train Loss: 1.6113 | Train Acc: 18.20% | Val Loss: 1.6084 | Val Acc: 19.13%
Epoch 5/200 | Train Loss: 1.6106 | Train Acc: 18.20% | Val Loss: 1.6076 | Val Acc: 19.13%
Epoch 6/200 | Train Loss: 1.6098 | Train Acc: 18.20% | Val Loss: 1.6065 | Val Acc: 19.13%
Epoch 7/200 | Train Loss: 1.6090 | Train Acc: 18.20% | Val Loss: 1.6055 | Val Acc: 19.13%
Epoch 8/200 | Train Loss: 1.6078 | Train Acc: 18.20% | Val Loss: 1.6043 | Val Acc: 19.13%
Epoch 9/200 | Train Loss: 1.6065 | Train Acc: 19.46% | Val Loss: 1.6023 | Val Acc: 22.82%
Epoch 10/200 | Train Loss: 1.6040 | Train Acc: 23.32% | Val Loss: 1.5997 | Val Acc: 22.82%
Epoch 11/200 | Train Loss: 1.5993 | Train Acc: 21.73% | Val Loss: 1.5915 | Val Acc: 23.83%
Epoch 12

In [56]:
model.load_state_dict(best_model)

<All keys matched successfully>

In [57]:
test_loss, test_accuracy = evaluate(model, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_accuracy:.2f}%")

Test Loss: 0.3650 | Test Accuracy: 88.44%
