In [18]:
import sys
sys.path.append("./")

In [4]:
from collections import Counter, defaultdict
import math, random, re, glob

In [1]:
def tokenize(message):
    message = message.lower()
    all_words = re.findall("[a-z0-9']+", message)
    return set(all_words)

In [13]:
def count_words(training_set):
    counts = defaultdict(lambda: [0, 0])
    for message, is_spam in training_set:
        for word in tokenize(message):
            counts[word][0 if is_spam else 1] += 1
    return counts

In [33]:
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    return [(w,
             (spam+k) / (total_spams + 2*k),
             (non_spam+k) / (total_non_spams + 2*k))
            for w, (spam, non_spam) in counts.items()
           ]

In [37]:
def spam_probability(word_probabilities, message):
    message_words = tokenize(message)
    log_probability_if_spam = log_probability_if_not_spam = 0.0
    
    for word, prob_if_spam, prob_if_not_spam in word_probabilities:
        if word in message_words:
            log_probability_if_spam += math.log(prob_if_spam)
            log_probability_if_not_spam += math.log(prob_if_not_spam)
        else:            
            log_probability_if_spam += math.log(1.0 - prob_if_spam)
            log_probability_if_not_spam += math.log(1.0 - prob_if_not_spam)
    
    prob_if_spam = math.exp(log_probability_if_spam)
    prob_if_not_spam = math.exp(log_probability_if_not_spam)
    
    return prob_if_spam / (prob_if_spam / prob_if_not_spam)

In [31]:
class NaiveBayesClassifier:
    
    def __init__(self, k=0.5):
        self.k = k
        self.word_probs = []
        
    def train(self, training_set):
        num_spams = len([is_spam for message, is_spam in training_set if is_spam])
        num_non_spams = len(training_set) - num_spams
        
        word_counts = count_words(training_set)
        self.word_probs = word_probabilities(word_counts, num_spams, num_non_spams, self.k)
        
    def classifiy(self, message):
        return spam_probability(self.word_probs, message)

In [45]:
from lib.lin_alg_machine_learning import *

path = "./spam_data/*/*"
data = []

for filename in glob.glob(path):
    is_spam = "ham" not in filename
    with open(filename, "r", encoding='ISO-8859-1') as file:
        for line in file:
            if line.startswith("Subject:"):
                subject = re.sub(r"^Subject: ", "", line).strip()
                data.append((subject, is_spam))

random.seed(0)
train_data, test_data = split_data(data, 0.75)

classifier = NaiveBayesClassifier()
classifier.train(train_data)

classified = [(subject, is_spam, classifier.classifiy(subject)) for subject, is_spam in test_data]

counts = Counter((is_spam, spam_probability > 0.5) for _, is_spam, spam_probability in classified)

print(counts)

classified.sort(key=lambda row: row[2])

def p_spam_given_word(word_prob):
    word, prob_if_spam, prob_if_not_spam = word_prob
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

spammiest_hams = list(filter(lambda row: not row[1], classified))[-5:]
hammiest_spams = list(filter(lambda row: row[1], classified))[:5]

print("spammiest_hams", spammiest_hams)
print("hammiest_spams", hammiest_spams)

words = sorted(classifier.word_probs, key=p_spam_given_word)

spammiest_words = words[-5:]
hammiest_words = words[:5]

print("spammiest_words", spammiest_words)
print("hammiest_words", hammiest_words)


Counter({(False, False): 737, (True, False): 139})
spammiest_hams [('Feeling Lucky', False, 0.0009335424922000174), ('Whither vCard?', False, 0.0009335424922000174), ('Artist Gallery: Troy Paradise', False, 0.0009335424922000174), ('Warchalking FAQ', False, 0.0009335424922000174), ('Cobain dispute settled', False, 0.0009335424922000174)]
hammiest_spams [('Over $100,000 Per Year Possible On The Net!  No Illegal MLM Junk! Time:11:43:57 PM', True, 5.518959318980145e-41), ('Get your own fountain of youth! HGH human growth hormone from 21st Century!17441', True, 2.1719292200646804e-35), ('International calls for only 33 cents per minute with no subscription', True, 6.875356518975671e-34), ('Please help Home Entertainment Companies with Survey - Win a DVR!', True, 2.7707171044747665e-30), ('Attn: How about being a few pounds lighter?OLIBYKN', True, 3.798580255266521e-30)]
spammiest_words [('only', 0.028767123287671233, 0.00022893772893772894), ('systemworks', 0.028767123287671233, 0.00022893