In [65]:
import pandas as pd
import re
from collections import defaultdict

In [66]:
df = pd.read_csv('spam+-+Data.csv')

In [67]:
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

In [68]:
labels = []
for value in df['Label']:
    if value == "spam":
        labels.append(1)
    else:
        labels.append(0)

In [69]:
df["Label"] = labels

In [70]:
# Preprocessing function to tokenize and clean text
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    tokens = text.split()
    return tokens

In [71]:
# Create vocabulary and calculate word frequencies for spam and non-spam messages
spam_word_freq = defaultdict(int)
non_spam_word_freq = defaultdict(int)
spam_count = 0
non_spam_count = 0

for index, row in df.iterrows():
    tokens = preprocess_text(row['Message'])
    if row['Label'] == 1:
        spam_count += 1
        for token in tokens:
            spam_word_freq[token] += 1
    else:
        non_spam_count += 1
        for token in tokens:
            non_spam_word_freq[token] += 1

In [72]:
# Calculate prior probabilities
total_messages = len(df)
prior_spam = spam_count / total_messages
prior_non_spam = non_spam_count / total_messages

In [73]:
# Function to classify a message as spam or non-spam
def classify_message(message):
    tokens = preprocess_text(message)
    spam_prob = 1.0
    non_spam_prob = 1.0
    for token in tokens:
        # Laplace smoothing for unseen words
        spam_prob *= (spam_word_freq[token] + 1) / (spam_count + len(spam_word_freq))
        non_spam_prob *= (non_spam_word_freq[token] + 1) / (non_spam_count + len(non_spam_word_freq))
    spam_prob *= prior_spam
    non_spam_prob *= prior_non_spam
    if spam_prob > non_spam_prob:
        return 1  # Spam
    else:
        return 0  # Non-spam

In [74]:
def calculate_accuracy(inputted_predictions, actual):
    correct = sum(inputted_predictions == actual)
    total = len(actual)
    accuracy = correct / total
    return accuracy

In [75]:
preds = []
for message in df['Message']:
    prediction = classify_message(message)
    preds.append(prediction)

In [76]:
# Test accuracy
accuracy = calculate_accuracy(preds, df['Label'])
print("Naive Bayes Accuracy:", accuracy)

Naive Bayes Accuracy: 0.9816941852117731


In [77]:
classify_message("Hey, Swiftie! Thanks for preordering Taylor Swift's new album, Tortured Poets Department! Your PayPal account has been charged $34.99. If you think there has been a mistake, please give us a call at (707) 426-2689.")

0