In [1]:
import kagglehub
import numpy as np
import csv

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import tensorflow as tf
from tensorflow.keras import layers, models

import matplotlib.pyplot as plt

In [2]:
# Fetch the latest published version of the SMS Spam Collection dataset through
# kagglehub. `path` is whatever location dataset_download reports for the
# downloaded files; later cells build file paths from it.
DATASET_HANDLE = "uciml/sms-spam-collection-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /Users/justiniverson/.cache/kagglehub/datasets/uciml/sms-spam-collection-dataset/versions/1


In [3]:
# Load and preprocess dataset
#
# The CSV location is derived from `path` (the directory reported by
# kagglehub.dataset_download). The previous hard-coded "/kaggle/input/..."
# assignment was dead code: it was overwritten on the very next line.
dataset_path = path + "/spam.csv"

# newline='' is what the csv module documentation asks for, so that newlines
# embedded inside quoted fields are parsed by the csv reader itself rather than
# being translated by the text layer first.
# latin-1 is the encoding the notebook has always used for this file
# (presumably because the file is not valid UTF-8 -- TODO confirm).
with open(dataset_path, 'r', encoding='latin-1', newline='') as f:
    rows = list(csv.reader(f))

# Drop the header row if present (its first column is named "v1").
# The extra truthiness checks avoid an IndexError on an empty file or a
# leading blank line.
if rows and rows[0] and rows[0][0].lower() == "v1":
    rows = rows[1:]

# Keep only the first two columns (label, message text). Rows with fewer than
# two fields (e.g. blank lines, which csv.reader yields as []) are skipped
# instead of crashing on r[1].
data = np.array([[r[0], r[1]] for r in rows if len(r) >= 2], dtype=object)
labels = data[:, 0]
messages = data[:, 1]

# Binary target: 1 for "spam", 0 for anything else.
labels = np.where(labels == "spam", 1, 0)
print(labels.shape)
print(messages.shape)

(5572,)
(5572,)


In [6]:
# Split and vectorize dataset
#
# 80/20 train/test split with a fixed random_state so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.2,
    random_state=42,
)

# Bag-of-words counts. The vectorizer is fitted on the training messages only;
# the test messages are transformed with that same fitted vectorizer.
vectorizer = CountVectorizer()
train_counts = vectorizer.fit_transform(X_train)
test_counts = vectorizer.transform(X_test)

# Convert to dense arrays for the Keras model defined in a later cell.
X_train_vec = train_counts.toarray()
X_test_vec = test_counts.toarray()

print("Vectorized shape:", X_train_vec.shape)

Vectorized shape: (4457, 7735)


In [7]:
# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation (and the shuffling done later by model.fit) is repeatable
# across "Restart & Run All". 42 matches the random_state used for the
# train/test split.
# NOTE(review): bit-exact reproducibility on GPU may additionally need
# tf.config.experimental.enable_op_determinism() -- confirm if that matters here.
tf.keras.utils.set_random_seed(42)

input_dim = X_train_vec.shape[1]  # number of features (columns of the count matrix)

# Small fully-connected binary classifier: two ReLU hidden layers followed by a
# single sigmoid unit that outputs a value in [0, 1] per message.
model = models.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(64, activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dense(1, activation="sigmoid")
])

# Binary cross-entropy pairs with the sigmoid output; accuracy is tracked as a
# training/validation metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

model.summary()

In [8]:
# Train for 5 epochs in mini-batches of 32, holding out 20% of the training
# rows for validation. The returned History object is kept in `history`.
history = model.fit(
    x=X_train_vec,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9203 - loss: 0.2821 - val_accuracy: 0.9843 - val_loss: 0.0712
Epoch 2/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9938 - loss: 0.0291 - val_accuracy: 0.9854 - val_loss: 0.0601
Epoch 3/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9997 - loss: 0.0058 - val_accuracy: 0.9832 - val_loss: 0.0723
Epoch 4/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9997 - loss: 0.0019 - val_accuracy: 0.9843 - val_loss: 0.0663
Epoch 5/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 9.4445e-04 - val_accuracy: 0.9821 - val_loss: 0.0813


In [16]:
# Score the held-out test set: flatten the model's predictions to 1-D, then
# turn them into hard 0/1 labels using a 0.5 cut-off.
probs = model.predict(X_test_vec).reshape(-1)
preds = (probs >= 0.5).astype(int)

# Accuracy plus precision/recall/F1 for the positive class (label 1);
# zero_division=0 reports 0 instead of warning when a denominator is empty.
acc = accuracy_score(y_test, preds)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_test,
    preds,
    average="binary",
    zero_division=0,
)

for metric_label, metric_value in (
    ("Test accuracy:", acc),
    ("Test precision:", prec),
    ("Test recall:", rec),
    ("Test F1:", f1),
):
    print(metric_label, metric_value)

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 889us/step
Test accuracy: 0.979372197309417
Test precision: 1.0
Test recall: 0.8466666666666667
Test F1: 0.9169675090252708


In [37]:
# Error inspection: print the first five test messages whose predicted label
# differs from the true label (with the true label), then compare the fraction
# of messages predicted as spam with the fraction that actually are spam.

misclassified_idx = np.flatnonzero(y_test != preds)
for idx in misclassified_idx[:5]:
    true_label = "SPAM" if y_test[idx] == 1 else "NOT SPAM"
    print(X_test[idx])
    print(true_label, "\n")

predicted_spam_rate = np.count_nonzero(preds == 1) / len(preds)
actual_spam_rate = np.count_nonzero(y_test == 1) / len(y_test)

print(f"Spam guess freq: {round(predicted_spam_rate, 3)}")
print(f"Actual spam freq: {round(actual_spam_rate, 3)}")

Hi I'm sue. I am 20 years old and work as a lapdancer. I love sex. Text me live - I'm i my bedroom now. text SUE to 89555. By TextOperator G2 1DA 150ppmsg 18+
SPAM 

Loans for any purpose even if you have Bad Credit! Tenants Welcome. Call NoWorriesLoans.com on 08717111821
SPAM 

ringtoneking 84484
SPAM 

08714712388 between 10am-7pm Cost 10p
SPAM 

Can U get 2 phone NOW? I wanna chat 2 set up meet Call me NOW on 09096102316 U can cum here 2moro Luv JANE xx Callså£1/minmoremobsEMSPOBox45PO139WA
SPAM 

Spam guess freq: 0.114
Actual spam freq: 0.135
