In [14]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy as np
import os

nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [37]:
pd.set_option('display.max_colwidth', None) 

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

**Notes:**

* Downloaded the Glove pre-embeddings of 100 dimensions from https://nlp.stanford.edu/data/glove.6B.zip
* Convert the given input tokens according to downloaded Glove 100D representation
* As it is sequential data, we can start with varient of Reccurrent Neural Network, I have chosen LSTM model.

## Loading Dataset

In [16]:
# Load dataset
df = pd.read_csv("/kaggle/input/quora-spam-questions/train.csv")

In [17]:
# Rename columns
df = df.rename(columns={"question_text": "question", "target": "label"})

In [28]:
df['label'].value_counts()

label
0    1225312
1      80810
Name: count, dtype: int64

**Data is highly imbalanced hence using Stratified sampling while splitting between train, test, validation sets.**

In [18]:
# Split dataset
train_df, test_df = train_test_split(df, test_size=0.1, stratify=df['label'], random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1111, stratify=train_df['label'], random_state=42)  # to make 80-10-10 split

In [34]:
print(train_df['label'].value_counts())
print(test_df['label'].value_counts())
print(val_df['label'].value_counts())

label
0    980260
1     64649
Name: count, dtype: int64
label
0    122532
1      8081
Name: count, dtype: int64
label
0    122520
1      8080
Name: count, dtype: int64


## Data preperation

In [19]:
# Build vocab
def build_vocab(sentences, min_freq=1):
    freq = {}
    for sent in sentences:
        for word in word_tokenize(sent.lower()):
            freq[word] = freq.get(word, 0) + 1
    vocab = {word for word, count in freq.items() if count >= min_freq}
    word2idx = {word: idx+2 for idx, word in enumerate(vocab)}
    word2idx['<PAD>'] = 0
    word2idx['<UNK>'] = 1
    return word2idx

word2idx = build_vocab(train_df['question'].tolist())

In [20]:
# Load GloVe embeddings
def load_glove(path, word2idx, dim=100):
    embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), dim))
    embeddings[word2idx['<PAD>']] = np.zeros(dim)
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            parts = line.strip().split()
            word = parts[0]
            if word in word2idx:
                embeddings[word2idx[word]] = np.array(parts[1:], dtype=np.float32)
    return torch.tensor(embeddings, dtype=torch.float32)

glove_path = "/kaggle/input/quora-spam-questions/glove.6B.100d.txt"  # Download from https://nlp.stanford.edu/data/glove.6B.zip
embedding_matrix = load_glove(glove_path, word2idx, dim=100)

In [21]:
MAX_LEN = 30

def encode_sentence(sent, word2idx):
    tokens = word_tokenize(sent.lower())
    idxs = [word2idx.get(tok, word2idx['<UNK>']) for tok in tokens]
    if len(idxs) < MAX_LEN:
        idxs += [word2idx['<PAD>']] * (MAX_LEN - len(idxs))
    else:
        idxs = idxs[:MAX_LEN]
    return idxs

class QuoraDataset(Dataset):
    def __init__(self, df, word2idx):
        self.samples = [(encode_sentence(row['question'], word2idx), row['label']) for _, row in df.iterrows()]
    def __len__(self):
        return len(self.samples)
    def __getitem__(self, idx):
        x, y = self.samples[idx]
        return torch.tensor(x).to(device), torch.tensor(y).to(device)

In [None]:
train_data = QuoraDataset(train_df, word2idx)
val_data = QuoraDataset(val_df, word2idx)
test_data = QuoraDataset(test_df, word2idx)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)
test_loader = DataLoader(test_data, batch_size=32)

## Model Configuration

In [22]:
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim=128):
        super().__init__()
        num_embeddings, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
    
    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        out = self.fc(hidden[-1])
        return torch.sigmoid(out).squeeze(1)

## Training & Validation:

In [23]:
def train(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for x, y in tqdm(loader):
        x, y = x.to(device), y.float().to(device)
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader):
    model.eval()
    preds, labels = [], []
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            output = model(x)
            preds.extend((output > 0.5).cpu().numpy())
            labels.extend(y.cpu().numpy())
    acc = np.mean(np.array(preds) == np.array(labels))
    return acc

In [24]:
def train_validation_loop(model, num_epochs=5, lr=0.001):
    
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr)

    best_val_acc = 0.0

    for epoch in range(num_epochs):
        loss = train(model, train_loader, optimizer, criterion)
        train_acc = evaluate(model, train_loader)
        val_acc = evaluate(model, val_loader)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), "/kaggle/working/quora_spam_model_best.pth")
            print(f"Best model is saved at {epoch+1}")
        print(f"Epoch {epoch+1}/{num_epochs} - Loss: {loss:.4f} | Train Acc: {train_acc:.2f}% | Val Acc: {val_acc:.2f}%")

In [25]:
model = LSTMClassifier(embedding_matrix).to(device)

train_validation_loop(model, num_epochs=5)

100%|██████████| 32654/32654 [05:59<00:00, 90.84it/s]


Best model is saved at 1
Epoch 1/5 - Loss: 0.1132 | Train Acc: 0.96% | Val Acc: 0.96%


100%|██████████| 32654/32654 [06:00<00:00, 90.56it/s]


Epoch 2/5 - Loss: 0.0941 | Train Acc: 0.97% | Val Acc: 0.96%


100%|██████████| 32654/32654 [06:00<00:00, 90.49it/s]


Epoch 3/5 - Loss: 0.0822 | Train Acc: 0.97% | Val Acc: 0.96%


100%|██████████| 32654/32654 [06:01<00:00, 90.31it/s]


Epoch 4/5 - Loss: 0.0718 | Train Acc: 0.98% | Val Acc: 0.95%


100%|██████████| 32654/32654 [06:01<00:00, 90.33it/s]


Epoch 5/5 - Loss: 0.0624 | Train Acc: 0.98% | Val Acc: 0.95%


In [26]:
model = LSTMClassifier(embedding_matrix).to(device)

# Load best model
model.load_state_dict(torch.load("/kaggle/working/quora_spam_model_best.pth"))

# Test Accuracy
test_acc = evaluate(model, test_loader)
print(f"Test Accuracy: {test_acc:.4f}")

Test Accuracy: 0.9585


## Prediction:

In [39]:
# Inference
def predict(model, sentence):
    model.eval()
    encoded = encode_sentence(sentence, word2idx)
    tensor = torch.tensor(encoded).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(tensor)
        return "Spam" if output.item() > 0.5 else "Not Spam"

print(predict(model, "When will Texas finally execute it's last citizen? When they do, who will turn off the lights?"))

Not Spam


## Observations:

* **With LSTM model, we got 95.85 accuracy on test data set**
* **One observation is, accuracy is getting decreased on validation dataset as training iterations are getting increased, it seems model is overfitting.**
* **We can try to tweak the model configuration and experiment if we can increase the model performance on test dataset.**