# Homework: SMS Spam Classification

**Course:** Deep Learning

**Objective:** Train a model to classify SMS messages as spam or ham.

**Dataset:** SMS Spam Collection  
* **Source:** UCI ML Repository  
* **Download:** https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection  
* **Size:** ~5 500 messages (13 % spam, 87 % ham)  
* **Format:** TSV with columns  
  * `label`: “spam” (1) / “ham” (0)
  * `text`: raw SMS content  

**Tasks:**
1. Load and explore the dataset.
2. Preprocess the text.
3. Define and train a model (any method from the course).
4. Evaluate the model's performance using standard classification metrics on the test set.

> **Success:** achieve ≥ 0.90 F1-score on the test set.  


# Prerequisites
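The `%pip` cell below may print errors (for example, when `pip` is not available in the environment). They can be ignored as long as the required packages are already installed.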

In [59]:
%pip install datasets >> None
import os
import random
import numpy as np
import torch
import requests, zipfile, io
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments, set_seed
from datasets import Dataset

# Do not change ssid: the testing results are only accurate with this seed.
ssid = 42

# Seed every random number source used by the notebook, in a fixed order.
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
    torch.mps.manual_seed,
    set_seed,
):
    _seed_fn(ssid)


def download_data(data_dir="data", timeout=60):
    """Download the UCI SMS Spam Collection and return it as a DataFrame.

    The zip archive is fetched over HTTP, extracted into *data_dir*, and the
    tab-separated ``SMSSpamCollection`` file inside it is parsed.

    Args:
        data_dir: Directory the archive is extracted into.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A DataFrame with two columns: ``label`` (0 for "ham", 1 for "spam")
        and ``text`` (the raw message).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to ZipFile.
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        z.extractall(data_dir)
    # NOTE(review): with pandas' default quoting, a stray double quote inside a
    # message may cause several lines to be joined into one row. Passing
    # quoting=csv.QUOTE_NONE would avoid that, but it changes the row count and
    # therefore the seeded splits -- confirm before changing.
    df = pd.read_csv(
        os.path.join(data_dir, "SMSSpamCollection"),
        sep="\t",
        header=None,
        names=["label", "text"],
    )
    df["label"] = df["label"].map({"ham": 0, "spam": 1})
    return df


def train_val_test(df):
    """Split *df* into train, validation and test frames, stratified by ``label``.

    30 % of the rows are held out first; that hold-out is then halved into the
    validation and test sets. Both splits use the module-level ``ssid`` as
    their random state.

    Returns:
        A ``(train, validation, test)`` tuple of DataFrames.
    """
    train_part, holdout = train_test_split(
        df,
        test_size=0.3,
        stratify=df["label"],
        random_state=ssid,
    )
    val_part, test_part = train_test_split(
        holdout,
        test_size=0.5,
        stratify=holdout["label"],
        random_state=ssid,
    )
    return train_part, val_part, test_part

/Users/aigoncharov/dev/sktech/course-dl/.venv/bin/python: No module named pip


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


# Data

In [60]:
# Fetch the full dataset and derive the stratified train/validation/test splits.
df = download_data()
train_df, val_df, test_df = train_val_test(df)
# Last expression of the cell: the notebook renders the DataFrame.
df

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


# Your training code here

In [61]:
# Class balance of the training split (0 = ham, 1 = spam).
train_df.value_counts("label")

label
0    3377
1     523
Name: count, dtype: int64

In [62]:
from datasets import Dataset

# Wrap each pandas split in a Hugging Face Dataset; the three names are rebound in place.
train_df, val_df, test_df = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

The label counts above show that the dataset is imbalanced: only about 13 % of the training messages are spam. If the model does not reach the required performance, we may need to address this imbalance later on.

In [63]:
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import numpy as np

# Pick an accelerator that is actually present instead of assuming Apple MPS:
# prefer MPS, then CUDA, and fall back to the CPU so the notebook runs anywhere.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


# Tokenizer for the same DistilBERT checkpoint that the classifier is loaded from.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize the ``text`` field of *examples* with truncation enabled.

    No padding is requested here; batches are padded later by the data
    collator.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded


# Tokenize every split batch-wise; order (train, val, test) is preserved.
train_df, val_df, test_df = (
    split.map(preprocess_function, batched=True) for split in (train_df, val_df, test_df)
)

# Pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for an evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``; the predicted class is
            taken as the argmax of ``predictions`` along axis 1.

    Returns:
        A dict with the keys ``acc``, ``prec``, ``rec`` and ``f1``. Precision,
        recall and F1 use binary averaging.
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=1)
    accuracy = accuracy_score(labels, predicted)
    precision, recall, f_score = precision_recall_fscore_support(labels, predicted, average="binary")[:3]
    return {"acc": accuracy, "prec": precision, "rec": recall, "f1": f_score}


# Load DistilBERT with a two-class classification head, then move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=2,
)
model = model.to(device)

# Training configuration: three epochs, with logging, evaluation and
# checkpointing once per epoch.
# The deprecated `use_mps_device` flag is intentionally not passed: the Trainer
# selects an available accelerator (including MPS) on its own.
training_args = TrainingArguments(
    output_dir="./sms_fraud",
    num_train_epochs=3,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    logging_strategy="epoch",
    eval_strategy="epoch",
    report_to="none",  # no external experiment tracker
    save_strategy="epoch",
    overwrite_output_dir=True,
    save_total_limit=1,
)

# Wire the model, tokenized splits, tokenizer, padding collator and metric
# callback into a Trainer; the validation split is used for evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_df,
    eval_dataset=val_df,
)

# Fine-tune; as the cell's last expression, the returned TrainOutput is displayed.
trainer.train()

Map:   0%|          | 0/3900 [00:00<?, ? examples/s]

Map:   0%|          | 0/836 [00:00<?, ? examples/s]

Map:   0%|          | 0/836 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Acc,Prec,Rec,F1
1,0.177,0.045598,0.989234,0.981308,0.9375,0.958904
2,0.0347,0.035846,0.989234,0.990476,0.928571,0.958525
3,0.0203,0.028003,0.992823,0.973214,0.973214,0.973214




TrainOutput(global_step=93, training_loss=0.07730364222680369, metrics={'train_runtime': 116.6976, 'train_samples_per_second': 100.259, 'train_steps_per_second': 0.797, 'total_flos': 313983956369040.0, 'train_loss': 0.07730364222680369, 'epoch': 3.0})

# Evaluation

In [None]:
import gc
from torch.utils.data import DataLoader

# Switch to inference mode and run a garbage-collection pass before scoring.
model.eval()
gc.collect()

# Drop the columns the model does not consume; the collator pads each batch.
model_inputs = test_df.remove_columns(["text", "__index_level_0__"])
test_dataloader = DataLoader(model_inputs, batch_size=128, collate_fn=data_collator)

# Collect the predicted class of every test example, batch by batch.
y_pred_test = []
with torch.no_grad():
    for batch in test_dataloader:
        on_device = {name: tensor.to(device) for name, tensor in batch.items()}
        logits = model(**on_device).logits
        y_pred_test += logits.argmax(dim=-1).cpu().tolist()

# Score the predictions against the test labels with the same metrics used during training.
y_test = test_df["label"]
acc = accuracy_score(y_test, y_pred_test)
prec, rec, f1 = precision_recall_fscore_support(y_test, y_pred_test, average="binary")[:3]
print("\nTest —     acc: {:.3f}, prec: {:.3f}, rec: {:.3f}, f1: {:.3f}".format(acc, prec, rec, f1))


Test —     acc: 0.993, prec: 0.973, rec: 0.973, f1: 0.973
