In [1]:
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import joblib

In [2]:
from pathlib import Path

BASE_DIR = Path("..").resolve()   # goes out of /notebooks to root folder

DATA_RAW = BASE_DIR / "data" / "raw" / "spam.csv"
TRAIN_OUT = BASE_DIR / "data" / "processed" / "train.csv"
TEST_OUT = BASE_DIR / "data" / "processed" / "test.csv"
MODEL_DIR = BASE_DIR / "models"

MODEL_DIR.mkdir(exist_ok=True, parents=True)

BASE_DIR, DATA_RAW.exists()

(WindowsPath('C:/Users/User/OneDrive/Desktop/SMS_SPAM_CLASSIFIER'), True)

In [3]:
import pandas as pd

# Load the CSV
df = pd.read_csv(DATA_RAW, encoding="latin-1")

# See the columns
print(df.columns)
df.head()

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Keep only the label and text columns
if "v1" in df.columns and "v2" in df.columns:
    df = df[["v1", "v2"]]
    df.columns = ["label", "text"]

# Drop any missing rows
df = df.dropna(subset=["label", "text"])

# Map ham/spam to 0/1
df["label"] = df["label"].map({"ham": 0, "spam": 1})

df.head(), df["label"].value_counts()

(   label                                               text
 0      0  Go until jurong point, crazy.. Available only ...
 1      0                      Ok lar... Joking wif u oni...
 2      1  Free entry in 2 a wkly comp to win FA Cup fina...
 3      0  U dun say so early hor... U c already then say...
 4      0  Nah I don't think he goes to usf, he lives aro...,
 label
 0    4825
 1     747
 Name: count, dtype: int64)

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)

train_df = pd.DataFrame({"text": X_train, "label": y_train})
test_df = pd.DataFrame({"text": X_test, "label": y_test})

train_df.to_csv(TRAIN_OUT, index=False)
test_df.to_csv(TEST_OUT, index=False)

train_df.head(), test_df.head()

(                                                   text  label
 184                          Going on nothing great.bye      0
 2171                      I wont. So wat's wit the guys      0
 5422            Ok k..sry i knw 2 siva..tats y i askd..      0
 4113  Where are you ? What do you do ? How can you s...      0
 4588       Have you not finished work yet or something?      0,
                                                    text  label
 2826  Oh right, ok. I'll make sure that i do loads o...      0
 3695                     I am in tirupur.  call you da.      0
 3906             No that just means you have a fat head      0
 575   You have won ?1,000 cash or a ?2,000 prize! To...      1
 2899  Come aftr  &lt;DECIMAL&gt; ..now i m cleaning ...      0)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2)
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

input_dim = X_train_vec.shape[1]
input_dim

5000

In [7]:
import numpy as np

X_train_dense = X_train_vec.toarray().astype("float32")
X_test_dense = X_test_vec.toarray().astype("float32")

y_train_arr = y_train.values.astype("float32")
y_test_arr = y_test.values.astype("float32")

X_train_dense.shape, X_test_dense.shape

((4457, 5000), (1115, 5000))

In [8]:
import torch
from torch.utils.data import Dataset, DataLoader

class SpamDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        x = self.X[idx]
        y = self.y[idx]
        return torch.from_numpy(x), torch.tensor(y, dtype=torch.float32)


In [9]:
train_dataset = SpamDataset(X_train_dense, y_train_arr)
test_dataset  = SpamDataset(X_test_dense, y_test_arr)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader  = DataLoader(test_dataset,  batch_size=128, shuffle=False)

len(train_dataset), len(test_dataset)

(4457, 1115)

In [10]:
# Define the PyTorch model
import torch.nn as nn

class SpamNet(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, dropout=0.3):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_dim, 1)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x.squeeze(1)

# Ensure input_dim is available
input_dim = X_train_dense.shape[1]
input_dim


5000

In [11]:
# Create model, loss, optimizer, device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

model = SpamNet(input_dim=input_dim, hidden_dim=128, dropout=0.3).to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


In [12]:
# Train & eval helper functions
from tqdm.auto import tqdm


def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0.0
    
    for X_batch, y_batch in loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        logits = model(X_batch)
        loss = criterion(logits, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * X_batch.size(0)
    
    return total_loss / len(loader.dataset)


def eval_epoch(model, loader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            logits = model(X_batch)
            probs = torch.sigmoid(logits)
            preds = (probs >= 0.5).float()

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())
    
    acc = accuracy_score(all_labels, all_preds)
    return acc, all_labels, all_preds


In [13]:
# Training loop
num_epochs = 10
best_acc = 0.0

for epoch in range(1, num_epochs + 1):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    val_acc, _, _ = eval_epoch(model, test_loader, device)
    best_acc = max(best_acc, val_acc)
    print(f"Epoch {epoch}/{num_epochs} | Loss: {train_loss:.4f} | Val Acc: {val_acc:.4f}")

print("Best validation accuracy:", best_acc)


Epoch 1/10 | Loss: 0.5573 | Val Acc: 0.9166
Epoch 2/10 | Loss: 0.1986 | Val Acc: 0.9722
Epoch 3/10 | Loss: 0.0778 | Val Acc: 0.9821
Epoch 4/10 | Loss: 0.0449 | Val Acc: 0.9848
Epoch 5/10 | Loss: 0.0297 | Val Acc: 0.9848
Epoch 6/10 | Loss: 0.0214 | Val Acc: 0.9848
Epoch 7/10 | Loss: 0.0164 | Val Acc: 0.9848
Epoch 8/10 | Loss: 0.0119 | Val Acc: 0.9857
Epoch 9/10 | Loss: 0.0094 | Val Acc: 0.9857
Epoch 10/10 | Loss: 0.0078 | Val Acc: 0.9857
Best validation accuracy: 0.9856502242152466


In [14]:
# Detailed evaluation
val_acc, labels, preds = eval_epoch(model, test_loader, device)

print("Final Accuracy:", val_acc)
print(classification_report(labels, preds))

cm = confusion_matrix(labels, preds)
cm


Final Accuracy: 0.9856502242152466
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99       966
         1.0       0.99      0.91      0.94       149

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115



array([[964,   2],
       [ 14, 135]])

In [15]:
# Helper function to test a custom message
def predict_text(text: str):
    vec = vectorizer.transform([text])
    X = vec.toarray().astype("float32")
    X_tensor = torch.from_numpy(X).to(device)

    model.eval()
    with torch.no_grad():
        logits = model(X_tensor)
        prob_spam = torch.sigmoid(logits).item()
    
    label = "spam" if prob_spam >= 0.5 else "ham"
    return label, prob_spam

predict_text("You have won 10 million dollars! Click this link now.")


('spam', 0.9811213612556458)

In [16]:
# Save model and vectorizer
from pathlib import Path
import joblib

MODEL_DIR = Path("..") / "models"
MODEL_DIR.mkdir(exist_ok=True, parents=True)

MODEL_PATH = MODEL_DIR / "spam_torch_model.pt"
VECTORIZER_PATH = MODEL_DIR / "tfidf_vectorizer.joblib"

torch.save(model.state_dict(), MODEL_PATH)
joblib.dump(vectorizer, VECTORIZER_PATH)

MODEL_PATH, VECTORIZER_PATH


(WindowsPath('../models/spam_torch_model.pt'),
 WindowsPath('../models/tfidf_vectorizer.joblib'))