In [None]:
import kagglehub
import numpy as np
import csv

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
import tensorflow as tf

import matplotlib as plt

In [2]:
# Fetch the SMS Spam Collection through kagglehub. `path` holds the location
# kagglehub reports for the downloaded files and is reused by later cells.
dataset_handle = "uciml/sms-spam-collection-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/uciml/sms-spam-collection-dataset?dataset_version_number=1...


100%|██████████| 211k/211k [00:00<00:00, 7.67MB/s]

Extracting files...
Path to dataset files: /Users/kennethwan/.cache/kagglehub/datasets/uciml/sms-spam-collection-dataset/versions/1





In [3]:
# Load and preprocess dataset
# spam.csv is read from the directory that kagglehub returned in `path`.
dataset_path = path + "/spam.csv"

# newline='' is what the csv module requires of file objects: without it,
# line breaks embedded in quoted fields may be interpreted incorrectly.
with open(dataset_path, 'r', encoding='latin-1', newline='') as f:
    reader = csv.reader(f)
    rows = list(reader)

# Drop the header row when present (its first column is named "v1").
# The extra truthiness checks keep an empty file or blank first line from
# raising IndexError here.
if rows and rows[0] and rows[0][0].lower() == "v1":
    rows = rows[1:]

# Keep only the first two columns (label, message text). Blank or truncated
# rows are skipped; indexing r[1] on them would otherwise raise IndexError.
data = np.array([[r[0], r[1]] for r in rows if len(r) >= 2], dtype=object)
if data.size == 0:
    raise ValueError(f"No (label, message) rows found in {dataset_path}")
labels = data[:, 0]
messages = data[:, 1]

# Binary target: 1 where the label is exactly "spam", 0 for everything else.
labels = np.where(labels == "spam", 1, 0)
print(labels.shape)
print(messages.shape)

(5572,)
(5572,)


In [6]:
# Split and vectorize dataset
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.2,
    random_state=42,
)

# Bag-of-words counts: the vocabulary is learned from the training messages
# only and then reused unchanged for the test messages.
vectorizer = CountVectorizer()
train_counts = vectorizer.fit_transform(X_train)
test_counts = vectorizer.transform(X_test)

# Convert the vectorizer output to dense NumPy arrays.
X_train_vec = train_counts.toarray()
X_test_vec = test_counts.toarray()

print("Vectorized shape:", X_train_vec.shape)

Vectorized shape: (4457, 7735)


In [None]:
# Fine-tune DistilBERT on the raw SMS texts and score it on the held-out split.

# Imported in this cell because the notebook's top-level import of these two
# classes is commented out; without it the names below raise NameError on a
# fresh kernel.
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification

MODEL_NAME = "distilbert-base-uncased"
MAX_LENGTH = 256  # passed as max_length together with truncation=True
BATCH_SIZE = 16

tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)
model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)


def encode_texts(texts):
    """Tokenize `texts` with the shared tokenizer settings.

    Args:
        texts: iterable of message strings (converted to a list before
            tokenizing).

    Returns:
        The tokenizer output for the whole collection, requested as
        TensorFlow tensors with truncation and padding enabled.
    """
    return tokenizer(
        list(texts),
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors="tf",
    )


train_enc = encode_texts(X_train)
test_enc = encode_texts(X_test)

# Build tf.data datasets
# The shuffle buffer spans the whole training set; a buffer smaller than the
# dataset only shuffles within a sliding window of examples.
train_ds = tf.data.Dataset.from_tensor_slices((
    dict(train_enc),
    y_train
)).shuffle(len(y_train)).batch(BATCH_SIZE)

# The evaluation data is not shuffled, so predictions stay aligned with y_test.
test_ds = tf.data.Dataset.from_tensor_slices((
    dict(test_enc),
    y_test
)).batch(BATCH_SIZE)

# Compile
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# from_logits=True because the predictions below are read from `.logits`.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ["accuracy"]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Train
# NOTE(review): the test split doubles as validation data here, so the
# validation metrics reported during training are computed on the same
# examples as the final scores below.
model.fit(train_ds, validation_data=test_ds, epochs=2)

# Evaluate
logits = model.predict(test_ds).logits
preds = tf.argmax(logits, axis=1).numpy()

acc = accuracy_score(y_test, preds)
prec, recall, f1, _ = precision_recall_fscore_support(y_test, preds, average="binary")

print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", recall)
print("F1:", f1)