In [1]:
import pandas as pd
from google.colab import files

uploaded = files.upload()
emails_df = pd.read_csv("spam.csv", encoding="latin1")

print(emails_df.head())


# ============================
# TEXT PREPROCESSING
# ============================

import re
import string
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def preprocess_text(text):

    text = text.lower()

    text = text.translate(str.maketrans("", "", string.punctuation))

    text = re.sub(r"\d+", " ", text)

    text = text.encode("ascii", errors="ignore").decode()

    words = text.split()

    cleaned_tokens = []
    for w in words:
        if w not in stop_words:
            cleaned_tokens.append(w)
    return cleaned_tokens


emails_df = emails_df.rename(columns={'v1': 'label', 'v2': 'email'})
emails_df["tokens"] = emails_df["email"].apply(preprocess_text)

print(emails_df[["label", "email", "tokens"]].head(), "\n")


# ============================
# VOCABULARY
# ============================

vocab_set = set()
for token_list in emails_df["tokens"]:
    for token in token_list:
        vocab_set.add(token)

vocab_list = list(vocab_set)
print("Vocabulary size:", len(vocab_list))
print("First 20 words:", vocab_list[:20])


# ============================
# BAG OF WORDS
# ============================

def create_bow(tokens, vocab):
    vector = [0] * len(vocab)
    for i, word in enumerate(vocab):
        vector[i] = tokens.count(word)
    return vector

emails_df["bow_vector"] = emails_df["tokens"].apply(lambda x: create_bow(x, vocab_list))
print(emails_df[["label", "bow_vector"]].head())


# ============================
# COUNT WORDS PER CLASS
# ============================

import numpy as np

vocab_index = {word: idx for idx, word in enumerate(vocab_list)}
V = len(vocab_list)

spam_counts = np.zeros(V)
ham_counts  = np.zeros(V)
total_spam_words = 0
total_ham_words  = 0
num_spam_emails = 0
num_ham_emails  = 0

for bow, lbl in zip(emails_df["bow_vector"], emails_df["label"]):
    if lbl == "spam":
        num_spam_emails += 1
        for i, c in enumerate(bow):
            spam_counts[i] += c
            total_spam_words += c
    else:
        num_ham_emails += 1
        for i, c in enumerate(bow):
            ham_counts[i] += c
            total_ham_words += c

# Class probabilities
P_spam = num_spam_emails / len(emails_df)
P_ham  = num_ham_emails / len(emails_df)

print("Total emails:", len(emails_df))
print("Spam emails:", num_spam_emails)
print("Ham emails:", num_ham_emails)
print(f"P(Spam) = {P_spam:.3f}, P(Ham) = {P_ham:.3f}")


# ============================
# LIKELIHOOD PROBABILITIES (with Laplace smoothing)
# ============================

spam_likelihood = [(count + 1) / (total_spam_words + V) for count in spam_counts]
ham_likelihood  = [(count + 1) / (total_ham_words  + V) for count in ham_counts]

print("\nLikelihoods for first 20 words:")
for i in range(20):
    print(f"{vocab_list[i]} | P(Spam)={spam_likelihood[i]:.8f}, P(Ham)={ham_likelihood[i]:.8f}")


# ============================
# EMAIL CLASSIFICATION
# ============================

import math

def classify(email_text):
    tokens = preprocess_text(email_text)
    log_prob_spam = math.log(P_spam)
    log_prob_ham  = math.log(P_ham)

    for word in tokens:
        if word in vocab_index:
            idx = vocab_index[word]
            log_prob_spam += math.log(spam_likelihood[idx])
            log_prob_ham  += math.log(ham_likelihood[idx])
        else:
            # unseen word, Laplace smoothing
            log_prob_spam += math.log(1 / (total_spam_words + V))
            log_prob_ham  += math.log(1 / (total_ham_words + V))

    print(f"\nEmail: '{email_text}'")
    print(f"log P(Spam|email) = {log_prob_spam:.3f}")
    print(f"log P(Ham|email)  = {log_prob_ham:.3f}")

    if log_prob_spam > log_prob_ham:
        print("Classification: SPAM")
        return "spam"
    else:
        print("Classification: HAM")
        return "ham"


# ============================
# TEST NEW EMAILS
# ============================

new_emails = [
    "Congratulations! You won a free iPhone. Click here to claim now!",
    "Get rich quick! Invest in our program and double your money!",
    "Limited time offer: Buy one, get one free on all products!",
    "You have been selected for a $1000 gift card. Act fast!",
    "Earn $5000 per week working from home. No experience required!",
    "Hey, are we still on for coffee tomorrow?",
    "Please find attached the report for our meeting.",
    "Happy birthday! Wishing you a wonderful day with your family.",
    "Can you review the document and send me your feedback?",
    "Let's schedule a call to discuss the project updates.",
]

predictions = []
for email in new_emails:
    lbl = classify(email)
    predictions.append(lbl)


# ============================
# EVALUATION
# ============================

true_labels = ["spam"]*5 + ["ham"]*5

correct = 0
for pred, true in zip(predictions, true_labels):
    if pred == true:
        correct += 1

accuracy = correct / len(true_labels) * 100
print(f"\nModel accuracy: {accuracy:.2f}%")


Saving spam.csv to spam.csv
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


  label                                              email  \
0   ham  Go until jurong point, crazy.. Available only ...   
1   ham                      Ok lar... Joking wif u oni...   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3   ham  U dun say so early hor... U c already then say...   
4   ham  Nah I don't think he goes to usf, he lives aro...   

                                              tokens  
0  [go, jurong, point, crazy, available, bugis, n...  
1                     [ok, lar, joking, wif, u, oni]  
2  [free, entry, wkly, comp, win, fa, cup, final,...  
3      [u, dun, say, early, hor, u, c, already, say]  
4  [nah, dont, think, goes, usf, lives, around, t...   

Vocabulary size: 8345
First 20 words: ['malaria', 'crucify', 'star', 'type', 'spontaneously', 'ghost', 'retrieve', 'jenne', 'robinson', 'southern', 'kb', 'anymore', 'toledo', 'boooo', 'towards', 'alternativehope', 'hunt', 'comp', 'answr', 'juicy']
  label                                        