In [48]:
import csv
import random
import sys
from collections import Counter
import nltk
import math
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# This symbols and words can be found in almost all emails, removing them for better results
symbols_and_numbers = {'!','â','subject','subject:', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}
stop_words.update(symbols_and_numbers)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andriihladkyi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [49]:
# Loading the dataset and balancing it (I can do this because I have a lot of data)
import sys
csv.field_size_limit(sys.maxsize)

X = []
y = []

with open("dataset.csv", "r") as file:
    reader = csv.reader(file)
    for row in reader:
        X.append(row[0])
        y.append(row[1])
    
paired = list(zip(X, y))

ham = []
spam = []

for email,label in paired:
    if label == "1":
        spam.append((email, label))
    else:
        ham.append((email, label))

print(len(ham))
print(len(spam))


min_count = min(len(ham), len(spam))

# Balancing and shuffle everything
balanced_spam = spam[:min_count]
balanced_ham = ham[:min_count]

balanced_emails = balanced_spam + balanced_ham

random.shuffle(balanced_emails)

X, y = zip(*balanced_emails)

X = list(X)
y = list(y)

17648
9140


In [57]:
# number of folds for cross-validation
num_folds = 10
alpha = 1

fold_size = len(X)//num_folds
folds_X = []
folds_y = []
for i in range(num_folds):
    start = i * fold_size
    end = (i+1)*fold_size if i < num_folds-1 else len(X)
    folds_X.append(X[start:end])
    folds_y.append(y[start:end])

In [58]:
def train(X_train, y_train):
    words_spam, words_ham = [], []
    for x, y in zip(X_train, y_train):
        
        words = x.lower().split()
        for word in words:
            if word not in stop_words and y == "0":
                words_ham.append(word)
            elif word not in stop_words and y == "1":
                words_spam.append(word)
    
    # Yes, it's just a regular bag of words algorithm I use, nothing complicated like tokenization
    # Still works just fine
    word_counts_spam = Counter(words_spam)
    word_counts_ham = Counter(words_ham)
    all_words = set(word_counts_spam.keys()) | set(word_counts_ham.keys())

    total_spam_words = sum(word_counts_spam.values()) + alpha * len(all_words)
    total_ham_words = sum(word_counts_ham.values()) + alpha * len(all_words)

    # Everything is scaled using log, also I applied Laplas smoothing techniques
    spam_probs_dict = {word: math.log((count + alpha) / total_spam_words) for word, count in word_counts_spam.items()}
    ham_probs_dict = {word: math.log((count + alpha) / total_ham_words) for word, count in word_counts_ham.items()}
    
    # Prior probability directly from Bayes formula
    spam_prior_probability = math.log(sum(y == "1" for y in y_train) + alpha) - math.log(len(y_train) + alpha * 2)
    ham_prior_probability = math.log(sum(y == "0" for y in y_train) + alpha) - math.log(len(y_train) + alpha * 2)
    
    return spam_probs_dict, ham_probs_dict, spam_prior_probability, ham_prior_probability, total_spam_words, total_ham_words


In [59]:
def inference(email, spam_probs_dict, ham_probs_dict, spam_prior_probability, ham_prior_probability, total_spam_words, total_ham_words):
    words = email.lower().split()
    spam_log_prob = spam_prior_probability  
    ham_log_prob = ham_prior_probability  
    for word in words:
        # In the original Bayes, this should be replaced by multiplication, but algorithm developers usually change it to addition
        # because multiplication makes the probabilities vanish near zero
        # also I'm using Laplas Smoothing for non-existing words which just works fine
        spam_log_prob += spam_probs_dict.get(word, math.log(alpha / total_spam_words))
        ham_log_prob += ham_probs_dict.get(word, math.log(alpha / total_ham_words))
    if spam_log_prob > ham_log_prob:
        return True
    else:
        return False

In [60]:
def test(X_test, y_test, spam_probs_dict, ham_probs_dict, spam_prior_probability, ham_prior_probability, total_spam_words, total_ham_words):
    success = 0
    total_emails = len(X_test)  
    for email, y in zip(X_test, y_test):
        result = inference(email, spam_probs_dict, ham_probs_dict, spam_prior_probability, ham_prior_probability, total_spam_words, total_ham_words)
        if result:
            result = "1"
        else:
            result = "0"
        if result == y:
            success += 1
    accuracy = success / total_emails
    print("Accuracy:", accuracy)


In [61]:
# cross-validation
for i in range(num_folds):
    X_train = []
    y_train = []
    
    for j, fold in enumerate(folds_X):
        if j != i:  
            X_train.extend(fold)  
    for j, fold in enumerate(folds_y):
        if j != i:  
            y_train.extend(fold) 
            
    X_test = folds_X[i]
    y_test = folds_y[i]
    spam_probs_dict, ham_probs_dict, spam_prior_probability, ham_prior_probability, total_spam_words, total_ham_words = train(X_train, y_train)
    test(X_train, y_train,spam_probs_dict, ham_probs_dict, spam_prior_probability, ham_prior_probability, total_spam_words, total_ham_words)
    

Accuracy: 0.9784220763433017
Accuracy: 0.9790299051787017
Accuracy: 0.9783612934597617
Accuracy: 0.9789691222951616
Accuracy: 0.9788475565280816
Accuracy: 0.9776318988572817
Accuracy: 0.9781789448091417
Accuracy: 0.9790906880622416
Accuracy: 0.9793338195964016
Accuracy: 0.9769632871383418


In [36]:
X_train = folds_X[0]
y_train = folds_y[0]
    
spam_probs_dict, ham_probs_dict, spam_prior_probability, ham_prior_probability, total_spam_words, total_ham_words = train(X_train, y_train)
test(X_train, y_train,spam_probs_dict, ham_probs_dict, spam_prior_probability, ham_prior_probability, total_spam_words, total_ham_words)

Accuracy: 0.9782275711159737


In [62]:
email = """ 
Subject: Exclusive Offer Inside! Claim Your Reward Now!

Dear [Recipient],

Congratulations! 🎉 You've been selected for an exclusive offer from our esteemed partners. As a valued member of our community, we're excited to extend this incredible opportunity to you.

Claim your reward now and enjoy fantastic benefits, including:

Free access to premium content
Exciting discounts on top brands
Special deals tailored just for you
But hurry, this offer won't last long! Simply click the link below to unlock your rewards:

[Link to claim your reward]

Don't miss out on this amazing opportunity! Act fast and elevate your experience with our exclusive offers.

Best regards,
[Your Company Name]
[Your Contact Information]
"""

print(inference(email, spam_probs_dict, ham_probs_dict, spam_prior_probability, ham_prior_probability, total_spam_words, total_ham_words))

True


In [38]:
sorted_dict = dict(sorted(ham_probs_dict.items(), key=lambda item: item[1], reverse=True))
sorted_dict

{'ect': -5.036481604193827,
 'would': -5.674702582642577,
 'hou': -5.772208228914405,
 '2001': -5.812609917080657,
 'new': -5.841976468229792,
 'please': -5.940353194442096,
 'com': -5.953046021240515,
 'company': -5.977288233451549,
 'said': -5.998545528921332,
 '2000': -6.022951717984826,
 'one': -6.065936846217966,
 'get': -6.182252623788401,
 '10': -6.215501530746621,
 'like': -6.268169309918848,
 'energy': -6.274126374094144,
 'list': -6.283922628767342,
 'also': -6.299605459395129,
 'know': -6.309655795248631,
 'may': -6.31259265492194,
 'pm': -6.325419668481694,
 'time': -6.359081547480626,
 'gas': -6.37948688975197,
 '01': -6.392499206287825,
 'cc': -6.411077804252589,
 'message': -6.460986516591107,
 'power': -6.494914211715895,
 'vince': -6.4992372681636885,
 '11': -6.499631201594034,
 'could': -6.522345866176566,
 'use': -6.533695805609877,
 'need': -6.53573620623615,
 'corp': -6.572769770460692,
 'thanks': -6.577443668920379,
 'business': -6.583852577846937,
 'information':

In [39]:
sorted_dict = dict(sorted(spam_probs_dict.items(), key=lambda item: item[1], reverse=True))
sorted_dict

{'email': -5.697775549083209,
 'com': -5.756217146410222,
 'company': -5.765643587057877,
 'please': -5.907625594171704,
 'get': -5.943125867211083,
 'http': -5.964596091070387,
 'free': -5.966853428372038,
 'information': -5.9693990385714715,
 'e': -5.995508684132414,
 'one': -6.056024300262277,
 'us': -6.098933090763373,
 'money': -6.140736116520127,
 'click': -6.15939653923753,
 'new': -6.172148736861455,
 'business': -6.173537143887682,
 'may': -6.225946213809177,
 'time': -6.227411415536505,
 'statements': -6.327572049474579,
 '00': -6.360105287174048,
 'report': -6.4120817696522145,
 'www': -6.414730777823792,
 'make': -6.420939223486275,
 'within': -6.461335003985051,
 'price': -6.4890428107883436,
 'mail': -6.551931899337277,
 'like': -6.57761384496962,
 'de': -6.6190430303833185,
 'send': -6.629948263865581,
 'order': -6.630496659821194,
 '000': -6.650441919340429,
 'address': -6.657743271630332,
 'people': -6.660565740340072,
 'best': -6.668511298599891,
 'want': -6.686913197