In [5]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
import pickle
import re

print("âœ“ All imports successful!")

âœ“ All imports successful!


In [6]:
# Cell 2: Load and balance dataset
df = pd.read_csv("spam.csv", encoding='latin-1')
df = df[['Category', 'Message']]
df.columns = ['Category', 'Message']

In [7]:
# Downsample to balance
df_spam = df[df['Category']=='spam']
df_ham = df[df['Category']=='ham']
df_ham_downsampled = df_ham.sample(df_spam.shape[0], random_state=42)
df_balanced = pd.concat([df_ham_downsampled, df_spam]).sample(frac=1, random_state=42).reset_index(drop=True)
df_balanced['spam'] = df_balanced['Category'].apply(lambda x: 1 if x=='spam' else 0)

print(f"âœ“ Balanced dataset: {df_balanced.shape[0]} messages")
print(f"Distribution:\n{df_balanced['spam'].value_counts()}")

âœ“ Balanced dataset: 1494 messages
Distribution:
spam
1    747
0    747
Name: count, dtype: int64


In [8]:
# Cell 3: Preprocessing
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.split()

def build_vocabulary(texts, min_freq=1, max_vocab_size=3000):
    counter = Counter()
    for text in texts:
        tokens = preprocess_text(text)
        counter.update(tokens)
    
    most_common = counter.most_common(max_vocab_size)
    filtered_words = [word for word, freq in most_common if freq >= min_freq]
    
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for idx, word in enumerate(filtered_words, start=2):
        vocab[word] = idx
    
    return vocab

print("âœ“ Preprocessing defined")

âœ“ Preprocessing defined


In [9]:
# Cell 4: Dataset class
class EmailDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_len=50):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        tokens = preprocess_text(self.texts[idx])
        indices = [self.vocab.get(word, self.vocab['<UNK>']) for word in tokens]
        
        if len(indices) < self.max_len:
            indices = indices + [self.vocab['<PAD>']] * (self.max_len - len(indices))
        else:
            indices = indices[:self.max_len]
        
        return (
            torch.tensor(indices, dtype=torch.long),
            torch.tensor(int(self.labels[idx]), dtype=torch.float32)
        )

print("âœ“ Dataset defined")

âœ“ Dataset defined


In [10]:
# Cell 5: Simplified LSTM Model
class EmailClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=100):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=1)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (hidden, _) = self.lstm(embedded)
        dropped = self.dropout(hidden[-1])
        output = self.fc(dropped)
        return self.sigmoid(output)

print("âœ“ Model defined")

âœ“ Model defined


In [11]:
# Cell 6: Split data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_balanced['Message'].tolist(),
    df_balanced['spam'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df_balanced['spam']
)

print(f"âœ“ Train: {len(train_texts)}, Test: {len(test_texts)}")

âœ“ Train: 1195, Test: 299


In [12]:
# Cell 7: Build vocab and loaders
vocab = build_vocabulary(train_texts)
print(f"âœ“ Vocabulary: {len(vocab)} words")

train_dataset = EmailDataset(train_texts, train_labels, vocab, max_len=50)
test_dataset = EmailDataset(test_texts, test_labels, vocab, max_len=50)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

âœ“ Vocabulary: 3002 words


In [13]:

# Cell 8: Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = EmailClassifier(vocab_size=len(vocab)).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)  # Lower learning rate

print(f"âœ“ Device: {device}")
print(f"âœ“ Learning rate: 0.0005")

âœ“ Device: cuda
âœ“ Learning rate: 0.0005


In [14]:
# Cell 9: Training with detailed monitoring
num_epochs = 30

print("\n" + "="*80)
print("TRAINING")
print("="*80)

best_acc = 0

for epoch in range(num_epochs):
    # Train
    model.train()
    train_loss = 0
    train_correct = 0
    train_total = 0
    train_spam_preds = 0
    train_ham_preds = 0
    
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(texts).squeeze()
        loss = criterion(outputs, labels)
        loss.backward()
        
        # Gradient clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()
        
        train_loss += loss.item()
        preds = (outputs > 0.5).float()
        train_correct += (preds == labels).sum().item()
        train_total += labels.size(0)
        train_spam_preds += preds.sum().item()
        train_ham_preds += (preds == 0).sum().item()
    
    # Evaluate
    model.eval()
    test_correct = 0
    test_total = 0
    test_spam_correct = 0
    test_spam_total = 0
    test_ham_correct = 0
    test_ham_total = 0
    test_spam_preds = 0
    test_ham_preds = 0
    
    with torch.no_grad():
        for texts, labels in test_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts).squeeze()
            preds = (outputs > 0.5).float()
            
            test_correct += (preds == labels).sum().item()
            test_total += labels.size(0)
            test_spam_preds += preds.sum().item()
            test_ham_preds += (preds == 0).sum().item()
            
            spam_mask = labels == 1
            ham_mask = labels == 0
            
            if spam_mask.sum() > 0:
                test_spam_correct += (preds[spam_mask] == labels[spam_mask]).sum().item()
                test_spam_total += spam_mask.sum().item()
            
            if ham_mask.sum() > 0:
                test_ham_correct += (preds[ham_mask] == labels[ham_mask]).sum().item()
                test_ham_total += ham_mask.sum().item()
    
    train_acc = train_correct / train_total
    test_acc = test_correct / test_total
    spam_recall = test_spam_correct / test_spam_total if test_spam_total > 0 else 0
    ham_recall = test_ham_correct / test_ham_total if test_ham_total > 0 else 0
    
    print(f"Epoch {epoch+1:2d}/{num_epochs} | Loss: {train_loss/len(train_loader):.4f} | "
          f"Train: {train_acc:.3f} | Test: {test_acc:.3f} | "
          f"Spam: {spam_recall:.3f} | Ham: {ham_recall:.3f} | "
          f"Preds: S={test_spam_preds} H={test_ham_preds}")
    
    if test_acc > best_acc:
        best_acc = test_acc
        torch.save(model.cpu(), 'email_classifier_best.pth')
        model.to(device)

print("="*80)
print(f"âœ“ Best accuracy: {best_acc:.4f}")
print("="*80)


TRAINING
Epoch  1/30 | Loss: 0.6937 | Train: 0.486 | Test: 0.502 | Spam: 1.000 | Ham: 0.007 | Preds: S=298.0 H=1
Epoch  2/30 | Loss: 0.6922 | Train: 0.517 | Test: 0.498 | Spam: 0.000 | Ham: 0.993 | Preds: S=1.0 H=298
Epoch  3/30 | Loss: 0.6921 | Train: 0.495 | Test: 0.502 | Spam: 1.000 | Ham: 0.007 | Preds: S=298.0 H=1
Epoch  4/30 | Loss: 0.6915 | Train: 0.505 | Test: 0.505 | Spam: 1.000 | Ham: 0.013 | Preds: S=297.0 H=2
Epoch  5/30 | Loss: 0.6894 | Train: 0.500 | Test: 0.505 | Spam: 0.987 | Ham: 0.027 | Preds: S=293.0 H=6
Epoch  6/30 | Loss: 0.6983 | Train: 0.540 | Test: 0.502 | Spam: 1.000 | Ham: 0.007 | Preds: S=298.0 H=1
Epoch  7/30 | Loss: 0.6908 | Train: 0.490 | Test: 0.505 | Spam: 1.000 | Ham: 0.013 | Preds: S=297.0 H=2
Epoch  8/30 | Loss: 0.6899 | Train: 0.510 | Test: 0.505 | Spam: 1.000 | Ham: 0.013 | Preds: S=297.0 H=2
Epoch  9/30 | Loss: 0.6900 | Train: 0.509 | Test: 0.505 | Spam: 1.000 | Ham: 0.013 | Preds: S=297.0 H=2
Epoch 10/30 | Loss: 0.6898 | Train: 0.505 | Test: 0.50

In [15]:
# Cell 10: Save
torch.save(model.cpu(), 'email_classifier.pth')
with open('vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)

print("âœ… Saved: email_classifier.pth, vocab.pkl")

âœ… Saved: email_classifier.pth, vocab.pkl


In [16]:
# Cell 11: Test
def predict(text, model, vocab):
    model.eval()
    model.cpu()
    tokens = preprocess_text(text)
    indices = [vocab.get(w, vocab['<UNK>']) for w in tokens]
    
    if len(indices) < 50:
        indices += [vocab['<PAD>']] * (50 - len(indices))
    else:
        indices = indices[:50]
    
    with torch.no_grad():
        prob = model(torch.tensor([indices], dtype=torch.long)).item()
    
    label = "Spam" if prob > 0.5 else "Ham"
    conf = prob if prob > 0.5 else 1 - prob
    return label, conf, prob
    
tests = [
    "FREE! Win Â£1000 cash prize now! Call immediately",
    "Hey, meeting at 3pm tomorrow",
    "URGENT: Click here to verify your account now",
    "Can you send the report please",
    "Congratulations! You won a free iPhone!",
    "Thanks for your help yesterday"
]

print("\nðŸ“§ Predictions:\n" + "-"*80)
for msg in tests:
    label, conf, prob = predict(msg, model, vocab)
    emoji = "ðŸš«" if label == "Spam" else "âœ…"
    print(f"{emoji} {label} ({conf:.1%}, prob={prob:.3f}): {msg[:50]}")
print("-"*80)

print("\nðŸŽ‰ Done! Run: streamlit run app.py")



ðŸ“§ Predictions:
--------------------------------------------------------------------------------
ðŸš« Spam (83.0%, prob=0.830): FREE! Win Â£1000 cash prize now! Call immediately
âœ… Ham (96.8%, prob=0.032): Hey, meeting at 3pm tomorrow
ðŸš« Spam (83.0%, prob=0.830): URGENT: Click here to verify your account now
âœ… Ham (96.8%, prob=0.032): Can you send the report please
ðŸš« Spam (83.0%, prob=0.830): Congratulations! You won a free iPhone!
ðŸš« Spam (83.0%, prob=0.830): Thanks for your help yesterday
--------------------------------------------------------------------------------

ðŸŽ‰ Done! Run: streamlit run app.py
