In [3]:
import re
import random
import numpy as np
import math

from collections import defaultdict

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.tokenize import word_tokenize

In [4]:
def tokenize(message):
    message = message.lower()
    all_words = re.findall('[a-z0-9]+', message)
    return set(all_words)

In [5]:
def count_words(training_set):
    """Training set consists of pairs(message, is_spam)"""
    counts = defaultdict(lambda: [0, 0])
    for message, is_spam in training_set:
        for word in tokenize(message):
            counts[word][0 if is_spam else 1] += 1
    return counts

In [6]:
def word_probability(counts, total_spams, total_non_spams, k=0.5):
    """Turn the word_counts into a list of triplets
    w, p(w | spam) and p(w | ~spam)"""
    return [(w,
            (spam + k) / (total_spams + 2 * k),
            (non_spam + k) / (total_non_spams + 2 * k))
            for w, (spam, non_spam) in counts.items()]

In [7]:
def spam_probability(word_probs, message):
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0
    
    # Iterate through each word in out vocabulary.
    for word, prob_if_spam, prob_if_not_spam in word_probs:
        # If word appears in the message, add the log probability of seing it.
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        
        # If word does not appear in the message, add the log probability of not seeing it,
        # which is log(1 - probability of seeing it).
        else:
            log_prob_if_spam += math.log(1 - prob_if_spam)
            log_prob_if_not_spam += math.log(1 - prob_if_not_spam)
    prob_if_spam = math.exp(log_prob_if_spam)
    prob_if_not_spam = math.exp(log_prob_if_not_spam)
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

In [8]:
class NaiveBayesClassifier:
    def __init__(self, k=0.5):
        self.k = k
        self.word_probs = []
    
    def train(self, training_set):
        # Count spam and non-spam messages.
        num_spams = len([is_spam
                         for message, is_spam in training_set
                         if is_spam])
        num_non_spams = len(training_set) - num_spams
        
        # Run training data through our pipeline.
        word_counts = count_words(training_set)
        self.word_probs = word_probability(word_counts,
                                             num_spams,
                                             num_non_spams,
                                             self.k)
    def classify(self, message):
        return spam_probability(self.word_probs, message)

In [9]:
import glob, re

path = r'../data/*/*'

data = []

# glob.glob returns every filename that matches the wildcarded path.
for fn in glob.glob(path):
    is_spam = 'ham' not in fn
    
    with open(fn, 'r', encoding='ISO-8859-1') as file:
        for line in file:
            if line.startswith('Subject:'):
                # Remove the leading "Subject: " and keep what's left.
                subject = re.sub(r'Subject: ', '', line).strip()
                data.append((subject, is_spam))
len(data)

3423

In [10]:
def split_data(data, prob):
    """Split data into fractions [prob, 1 - prob]"""
    results = [], []
    for row in data:
        results[0 if random.random() < prob else 1].append(row)
    return results

In [11]:
random.seed(0)

train_data, test_data = split_data(data, 0.75)
classifier = NaiveBayesClassifier()
classifier.train(train_data)

In [12]:
# triplets (subject, actual is_spam, predicted spam probability)
classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

In [13]:
from collections import Counter
# Assume that spam_probability > 0.5 corresponds to spam prediction 
# and count the combinations of (actual is_spam, predicted is_spam).
counts = Counter((is_spam, spam_probability > 0.5) 
                 for _, is_spam, spam_probability in classified)

In [14]:
# Sort by spam_probability from smallest to largest.
classified.sort(key=lambda row: row[2])

In [15]:
# The highest predicted spam probabilities are among the non-spams.
spammiest_hams = list(filter(lambda row: not row[1], classified))[-5:]

In [16]:
def p_spam_given_word(word_prob):
    """Uses bayes theorem to compute p(spam|message contains words)"""
    
    # word_prob is one of the triplets produced by word_probabilities.
    word, prob_if_spam, prob_if_not_spam = word_prob
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

In [17]:
words = sorted(classifier.word_probs, key=p_spam_given_word)

In [18]:
spammiest_words = words[-5:]
hammiest_words = words[:5]

In [19]:
spammiest_words

[('year', 0.02837837837837838, 0.0002294630564479119),
 ('zzzz', 0.02837837837837838, 0.0002294630564479119),
 ('money', 0.033783783783783786, 0.0002294630564479119),
 ('systemworks', 0.033783783783783786, 0.0002294630564479119),
 ('adv', 0.0445945945945946, 0.0002294630564479119)]

In [20]:
hammiest_words

[('spambayes', 0.0013513513513513514, 0.04612207434603029),
 ('users', 0.0013513513513513514, 0.039238182652592934),
 ('was', 0.0013513513513513514, 0.03786140431390546),
 ('razor', 0.0013513513513513514, 0.034189995410738874),
 ('zzzzteana', 0.0013513513513513514, 0.030059660394676457)]

In [21]:
data = np.array(data)

In [22]:
data.shape

(3423, 2)

In [23]:
X_raw, y = data[:, 0], data[:, 1]

In [24]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(X_raw)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [26]:
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)

In [30]:
np.array(precision_recall_fscore_support(y_test, y_predict))[:,1]

array([ 0.925     ,  0.38947368,  0.54814815, 95.        ])

In [28]:
X_i = vectorizer.transform(['Want to make some quick money?'])
clf.predict(X_i)

array(['True'], dtype='<U5')

In [29]:
X_i = vectorizer.transform(['make $1000 fast'])
clf.predict(X_i)

array(['False'], dtype='<U5')