# Import Dependencies

In [2]:
import sys
sys.path.append("../code-python3")
from collections import Counter, defaultdict
from machine_learning import split_data
import math, random, re, glob

In [5]:
def tokenize(message):
    """Return the set of distinct tokens found in `message`.

    The text is lowercased first; a token is then any maximal run of
    the characters a-z, 0-9 or the apostrophe.
    """
    lowered = message.lower()
    # building a set collapses repeated words into a single entry
    return {match.group(0) for match in re.finditer("[a-z0-9']+", lowered)}


def count_words(training_set):
    """Tally how many spam and non-spam messages contain each word.

    training_set is an iterable of (message, is_spam) pairs.  The result
    maps every word to a two-item list [spam_count, non_spam_count]; a
    word is counted at most once per message because tokenize returns a set.
    """
    tallies = defaultdict(lambda: [0, 0])
    for text, spam_flag in training_set:
        for token in tokenize(text):
            pair = tallies[token]
            if spam_flag:
                pair[0] += 1    # slot 0: messages labelled spam
            else:
                pair[1] += 1    # slot 1: messages labelled non-spam
    return tallies

In [4]:
# Sample sentence with repeated words ("is", "a"): tokenize returns a set,
# so each distinct word appears only once in the result.
message = "this is a cow, that is a pig"

tokenize(message)

{'a', 'cow', 'is', 'pig', 'that', 'this'}

In [6]:
# NOTE: count_words expects an iterable of (message, is_spam) pairs.  A bare
# string is iterated one character at a time, so unpacking each character
# into two names raises the ValueError recorded for this cell.
count_words(message)

ValueError: not enough values to unpack (expected 2, got 1)

In [14]:




def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """Convert per-word counts into smoothed conditional probabilities.

    counts maps each word to a (spam_count, non_spam_count) pair.  The
    result is a list of (word, p(word | spam), p(word | ~spam)) triplets,
    with the pseudocount k added to every count and 2 * k to every total.
    """
    spam_denominator = total_spams + 2 * k
    non_spam_denominator = total_non_spams + 2 * k

    triplets = []
    for word, (spam_count, non_spam_count) in counts.items():
        triplets.append((word,
                         (spam_count + k) / spam_denominator,
                         (non_spam_count + k) / non_spam_denominator))
    return triplets

def spam_probability(word_probs, message):
    """Return the estimated probability that `message` is spam.

    word_probs is an iterable of (word, p(word | spam), p(word | ~spam))
    triplets.  Every vocabulary word contributes: the log probability of
    seeing it when it occurs in the message, otherwise the log probability
    of not seeing it.  No prior term is applied, so the result is
    p(words | spam) / (p(words | spam) + p(words | ~spam)).

    Raises ValueError (from math.log) if a probability is exactly 0 or 1
    in a position where its logarithm is needed.
    """
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0

    for word, prob_if_spam, prob_if_not_spam in word_probs:

        # for each word in the message,
        # add the log probability of seeing it
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)

        # for each word that's not in the message
        # add the log probability of _not_ seeing it
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    # Exponentiating the two sums separately can underflow both to 0.0 and
    # turn the final division into 0/0.  The ratio only depends on their
    # difference: p_s / (p_s + p_n) == 1 / (1 + exp(log_n - log_s)).
    log_ratio = log_prob_if_not_spam - log_prob_if_spam

    # Only ever exponentiate a non-positive number so math.exp cannot
    # overflow; an underflow to 0.0 simply yields a result of 0.0 or 1.0.
    if log_ratio >= 0:
        inverse = math.exp(-log_ratio)
        return inverse / (1.0 + inverse)
    return 1.0 / (1.0 + math.exp(log_ratio))


class NaiveBayesClassifier:
    """Spam classifier that wraps the word-counting helpers of this file."""

    def __init__(self, k=0.5):
        self.k = k              # pseudocount handed to word_probabilities
        self.word_probs = []    # (word, p(w | spam), p(w | ~spam)); set by train

    def train(self, training_set):
        """Fit the classifier on a sized collection of (message, is_spam) pairs."""
        # how many messages fall into each class
        spam_total = sum(1 for _, is_spam in training_set if is_spam)
        non_spam_total = len(training_set) - spam_total

        # per-word counts, then smoothed per-word probabilities
        counts_by_word = count_words(training_set)
        self.word_probs = word_probabilities(
            counts_by_word, spam_total, non_spam_total, self.k)

    def classify(self, message):
        """Return the spam probability of `message` under the trained model."""
        return spam_probability(self.word_probs, message)


def get_subject_data(path):
    """Collect (subject, is_spam) pairs from the files matching `path`.

    path is a glob pattern.  A file is labelled spam unless the name of the
    directory that directly contains it includes "ham".  For each file only
    the first line starting with "Subject:" is used; files without such a
    line contribute nothing.

    Returns a list of (subject, is_spam) tuples.
    """
    import os   # local import: only needed to pick the containing directory

    data = []

    # regex for stripping out the leading "Subject:" and any spaces after it
    subject_regex = re.compile(r"^Subject:\s+")

    # glob.glob returns every filename that matches the wildcarded path
    for fn in glob.glob(path):
        # Look only at the containing directory's name; testing the whole
        # path would mislabel everything when an ancestor directory happens
        # to contain "ham" (e.g. /home/graham/...).
        folder = os.path.basename(os.path.dirname(fn))
        is_spam = "ham" not in folder

        with open(fn, 'r', encoding='ISO-8859-1') as file:
            for line in file:
                if line.startswith("Subject:"):
                    subject = subject_regex.sub("", line).strip()
                    data.append((subject, is_spam))
                    # one subject per message: later "Subject:" lines belong
                    # to quoted or forwarded text in the body
                    break

    return data

def p_spam_given_word(word_prob):
    """Return p(spam | word) for one (word, p(w | spam), p(w | ~spam)) triplet.

    The word itself is ignored; only the two probabilities are used.
    """
    _, given_spam, given_not_spam = word_prob
    combined = given_spam + given_not_spam
    return given_spam / combined

def train_and_test_model(path):
    """Train a classifier on the subjects under `path` and print diagnostics.

    The subjects are split with split_data(data, 0.75), a NaiveBayesClassifier
    is trained on the first part and evaluated on the second.  Printed, in
    order: the (actual, predicted) counts, the five highest-scoring hams, the
    five lowest-scoring spams, and the five words with the highest and the
    lowest p(spam | word).  Nothing is returned.
    """
    labelled_subjects = get_subject_data(path)
    # fixed seed for reproducible output (presumably split_data draws from
    # the random module -- its definition is not in this file)
    random.seed(0)
    train_data, test_data = split_data(labelled_subjects, 0.75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    # rows of (subject, actual is_spam, predicted spam probability)
    classified = []
    for subject, is_spam in test_data:
        classified.append((subject, is_spam, classifier.classify(subject)))

    # tally of (actual, predicted) outcomes using a 0.5 decision threshold
    counts = Counter()
    for _, actual, probability in classified:
        counts[(actual, probability > 0.5)] += 1

    print(counts)

    # order rows from least to most spam-like
    classified.sort(key=lambda row: row[2])
    ham_rows = [row for row in classified if not row[1]]
    spam_rows = [row for row in classified if row[1]]
    spammiest_hams = ham_rows[-5:]
    hammiest_spams = spam_rows[:5]

    print("spammiest_hams", spammiest_hams)
    print("hammiest_spams", hammiest_spams)

    # vocabulary ordered by p(spam | word), ascending
    ranked_words = sorted(classifier.word_probs, key=p_spam_given_word)
    hammiest_words = ranked_words[:5]
    spammiest_words = ranked_words[-5:]

    print("spammiest_words", spammiest_words)
    print("hammiest_words", hammiest_words)


if __name__ == "__main__":
    # Windows-style alternative: train_and_test_model(r"c:\spam\*\*")
    # The pattern below matches every entry one directory level beneath
    # ../data/spam_examples (e.g. spam/<file>, hard_ham/<file>).
    train_and_test_model(r"../data/spam_examples/*/*")


../data/spam_examples/spam/0009.c05e264fbf18783099b53dbc9a9aacda
../data/spam_examples/spam/0007.859c901719011d56f8b652ea071c1f8b
../data/spam_examples/spam/0010.7f5fb525755c45eb78efc18d7c9ea5aa
../data/spam_examples/spam/0001.bfc8d64d12b325ff385cca8d07b84288
../data/spam_examples/spam/0005.1f42bb885de0ef7fc5cd09d34dc2ba54
../data/spam_examples/spam/0002.24b47bb3ce90708ae29d0aec1da08610
../data/spam_examples/spam/0004.1874ab60c71f0b31b580f313a3f6e777
../data/spam_examples/spam/0006.7a32642f8c22bbeb85d6c3b5f3890a2c
../data/spam_examples/spam/0008.9562918b57e044abfbce260cc875acde
../data/spam_examples/spam/0003.4b3d943b8df71af248d12f8b2e7a224a
../data/spam_examples/spam/0000.7b1b73cf36cf9dbc3d64e3f2ee2b91f1
../data/spam_examples/hard_ham/0009.2bc1d4efa31fc78edb6e4bd82f68023f
../data/spam_examples/hard_ham/0003.0aa92b5f121c27c6e094fd89c6c89448
../data/spam_examples/hard_ham/0007.7f2ea3a532284cff3321e5ba159cdb50
../data/spam_examples/hard_ham/0002.2fe846db6e3249836abdbfcae459bf2a
../data/s