In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

# Device setup: use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the dataset from the correct path (ensure you have emails.csv locally or provide the correct path)
df = pd.read_csv("emails.csv", encoding='latin-1')

# Check the columns
print(df.columns)  # Expecting 'text' and 'spam'

# We don't need to rename the columns as the file already has the correct headers: 'text' and 'spam'
# Check the distribution of labels
print(df['spam'].value_counts())  # Should print counts of 0s (ham) and 1s (spam)

# No need for label mapping since 'spam' is already 0 and 1
# Check for missing values
print(df.isnull().sum())

# Remove rows with missing values (if any)
df = df.dropna(subset=['text', 'spam'])

# Convert text to numeric values using CountVectorizer
vectorizer = CountVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['text']).toarray()
y = df['spam'].values  # Directly using the 'spam' column as labels (0 or 1)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32).unsqueeze(1).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).unsqueeze(1).to(device)
y_train = torch.tensor(y_train, dtype=torch.long).to(device)
y_test = torch.tensor(y_test, dtype=torch.long).to(device)

# Define a custom Dataset class
class EmailDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create DataLoader objects for batching the data
train_dataset = EmailDataset(X_train, y_train)
test_dataset = EmailDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(2, x.size(0), 128).to(device)
        c0 = torch.zeros(2, x.size(0), 128).to(device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])  # Only use the last time-step output
        return out

# Initialize the model, loss function, and optimizer
model = LSTMModel(input_size=5000, hidden_size=128, num_layers=2, output_size=2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

# Save the model
torch.save(model.state_dict(), "lstm_model.pth")

# Load the saved model
model.load_state_dict(torch.load("lstm_model.pth", weights_only=True))

model.eval()

# Evaluate the model on the test data
y_true, y_pred = [], []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)
        y_true.extend(y_batch.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

# Calculate performance metrics
conf_matrix = confusion_matrix(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='weighted')

print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:", accuracy)
print("F1 Score:", f1)


Index(['text', 'spam'], dtype='object')
spam
0    4360
1    1368
Name: count, dtype: int64
text    0
spam    0
dtype: int64
Epoch 1/10, Loss: 23.3664
Epoch 2/10, Loss: 1.5835
Epoch 3/10, Loss: 0.3432
Epoch 4/10, Loss: 0.1696
Epoch 5/10, Loss: 0.0931
Epoch 6/10, Loss: 0.0641
Epoch 7/10, Loss: 0.0502
Epoch 8/10, Loss: 0.0383
Epoch 9/10, Loss: 0.0307
Epoch 10/10, Loss: 0.0255
Confusion Matrix:
 [[852   4]
 [  4 286]]
Accuracy: 0.9930191972076788
F1 Score: 0.9930191972076788
