In [156]:
%matplotlib inline
import pandas as pd

## Conditional Probabilities

$P(A|B) = \frac{P(A\cap B)}{P(B)}$

In [157]:
a, b = set([1,2,3]), set([1,4,5])

In [158]:
# Intersection
a & b

{1}

In [159]:
# Union
a | b

{1, 2, 3, 4, 5}

In [160]:
total = len(a) + len(b)
p_a_cap_b = len(a & b) / total # Cap means intersection.
p_b = len(b) / total
p_a_given_b = p_a_cap_b / p_b
p_a_given_b

0.3333333333333333

## Inverse Conditional Probabilities (Bayes Theorem)

$P(B|A) = \frac{P(A|B)P(B)}{P(A)}$

In [161]:
import unittest
import io
import re
import email
from bs4 import BeautifulSoup
from collections import defaultdict

In [162]:
class EmailObject:
    def __init__(self, filepath, category=None):
        self.filepath = filepath
        self.category = category
        self.mail = email.message_from_file(self.filepath)
    
    def subject(self):
        return self.mail.get('Subject')

    def body(self):
        content_type = self.mail.get_content_type()
        body = self.mail.get_payload(decode=True)
        
        if content_type == 'text/html':
            return BeautifulSoup(body, 'html.parser').text
        elif content_type == 'text/plain':
            return body
        else:
            return ''

In [163]:
class Tokenizer:
    NULL = u'\u0000'
    
    @staticmethod
    def tokenize(string):
        return re.findall('\w+', string.lower())
    
    @staticmethod
    def ngram(string, ngram):
        tokens = Tokenizer.tokenize(string)
        
        ngrams = []
        
        for i in range(len(tokens)):
            shift = i - ngram + 1
            padding = max(-shift, 0)
            first_idx = max(shift, 0)
            last_idx = first_idx + ngram - padding
            
            ngrams.append(Tokenizer.pad(tokens[first_idx:last_idx], padding))
        
        return ngrams

    @staticmethod
    def pad(tokens, padding):
        padded_tokens = []
        
        for i in range(padding):
            padded_tokens.append(Tokenizer.NULL)

        return padded_tokens + tokens

In [173]:
class SpamTrainer:
    def __init__(self, training_files):
        self.categories = set()
    
        for category, file in training_files:
            self.categories.add(category)
        
        self.totals = defaultdict(float)
        self.training = {c: defaultdict(float) 
                         for c in self.categories}
        self.to_train = training_files
    
    def total_for(self, category):
        return self.totals[category]
    
    def train(self):
        for category, file in self.to_train:
            email = EmailObject(io.open(file, 'r'))
            
            self.categories.add(category)
            
            for token in Tokenizer.unique_tokenizer(email.body()):
                self.training[category][token] += 1
                self.totals['_all'] += 1
                self.totals[category] += 1
        self.to_train = {}
    
    def score(self, email):
        self.train()
        cat_totals = self.totals
        
        aggregrates = {cat: cat_totals[c] / cat_totals['_all']
                       for c in self.categories}
        
        for token in Tokenizer.unique_tokenizer(email.body()):
            for cat in self.categories:
                value = self.training[cat][token]
                r = (value + 1)/(cat_totals[cat] + 1)
                aggregates[cat] *= r

        return aggregates
    
    def normalized_scores(self, email):
        score = self.score(email)
        scoresum = sum(score.values())
        
        normalized = {cat: (agg/scoresum)
                      for cat, agg in score.items()}
        
        return normalized
    
    def preference(self):
        return sorted(self.categories, key=lambda cat: self.total_for(cat))

In [165]:
class TestPlaintextEmailObject(unittest.TestCase):
    CLRF = '\n\n'
    
    # The spelling is setUp.
    def setUp(self):
        self.plain_file = './data/plain.eml'
        self.plaintext = io.open(self.plain_file, 'r')
        self.text = self.plaintext.read()
        self.plaintext.seek(0)
        self.plain_email = EmailObject(self.plaintext)
    
    def test_parse_plain_body(self):
        body = self.CLRF.join(self.text.split(self.CLRF)[1:])
        # Decode the byte to utf-8.
        self.assertEqual(str(self.plain_email.body(), 'utf-8'), body)
    
    def test_parses_the_subject(self):
        subject = re.search('Subject: (.*)', self.text).group(1)
        self.assertEqual(self.plain_email.subject(), subject)

In [166]:
class TestHTMLEmail(unittest.TestCase):
    CLRF = '\n\n'
    
    def setUp(self):
        self.html_file = io.open('./data/html.eml', 'r')
        self.html = self.html_file.read()
        self.html_file.seek(0)
        self.html_email = EmailObject(self.html_file)
    
    def test_parses_stores_inner_text_html(self):
        body = self.CLRF.join(self.html.split(self.CLRF)[1:])
        expected = BeautifulSoup(body, 'html.parser').text
        
        # str.encode('utf-8')
        self.assertEqual(self.html_email.body(), expected)
        
    def test_stores_subject(self):
        subject = re.search('Subject: (.*)', self.html).group(1)
        self.assertEqual(self.html_email.subject(), subject)
        
    class Classification:
        def __init__(self, guess, score):
            self.guess = guess
            self.score = score
            
        def __eq__(self, other):
            return self.guess == other.guess and self.score == other.score
    
        def classify(self, email):
            score = self.score(email)
            
            max_score = 0.0
            preference = self.preference()
            max_key = preference[-1]
            
            for k, v in score.items():
                if v > max_score:
                    max_key = k
                    max_score = v
                elif v == max_score and preference.index(k) > preference.index(max_key):
                    max_key = k
                    max_score = v
            return self.Classification(max_key, max_score)

In [167]:
class TestTokenizer(unittest.TestCase):
    def setUp(self):
        self.string = 'this is a test of the emergency broadcasting system'
    
    def test_downcasing(self):
        expected = ['this', 'is', 'all', 'caps']
        actual = Tokenizer.tokenize('THIS IS ALL CAPS')
        self.assertEqual(expected, actual)
    
    def test_ngram(self):
        expected = [
            [u'\u0000', 'quick'],
            ['quick', 'brown'],
            ['brown', 'fox']
        ]
        actual = Tokenizer.ngram('quick brown fox', 2)
        self.assertEqual(expected, actual)

In [175]:
class TestSpamTrainer(unittest.TestCase):
    def setUp(self):
        self.training = [['spam', './data/plain.eml'],
                         ['ham', './data/small.eml'],
                         ['scram', './data/plain.eml']]
        self.trainer = SpamTrainer(self.training)
        file = io.open('./data/plain.eml', 'r')
        self.email = EmailObject(file)
    
    def test_multiple_categories(self):
        categories = self.trainer.categories
        expected = set([k for k, v in self.training])
        self.assertEqual(categories, expected)
        
    def test_counts_all_at_zero(self):
        for cat in ['_all', 'spam', 'ham', 'scram']:
            self.assertEqual(self.trainer.total_for(cat), 0)
            
    def test_probability_being_1_over_n(self):
        trainer = self.trainer
        scores = trainer.score(self.email).values()
        
        self.assertAlmostEqual(scores[0], scores[-1])
        
        for i in range(len(scores) - 1):
            self.assertAlmostEqual(scores[i], scores[i+1])
            
    def test_adds_up_to_one(self):
        trainer = self.trainer
        scores = trainer.normalized_score(self.email).values()
        self.assertAlmostEqual(sum(scores), 1)
        self.assertAlmostEqual(scores[0], 1/2.0)
        
    def test_preference_category(self):
        trainer = self.trainer
        expected = sorted(trainer.categories, key=lambda cat: trainer.total_for(cat))
        self.assertEqual(trainer.preference(), expected)
        
    def test_give_preference_to_whatever_has_the_most(self):
        trainer = self.trainer
        score = trainer.score(self.email)
        
        preference = trainer.preference()[-1]
        preference_score = score[preference]
        
        expected = SpamTrainer.Classification(preference, preference_score)
        self.assertEqual(trainer.classify(self.email), expected)

In [169]:
if __name__ == '__main__':
    unittest.main(argv=['first-argument-is-excluded'], exit=False)

  return self.run(*args, **kwds)
  for index, test in enumerate(self):
  return self.run(*args, **kwds)
....
FAIL: test_parses_stores_inner_text_html (__main__.TestHTMLEmail)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-166-36814c579247>", line 15, in test_parses_stores_inner_text_html
    self.assertEqual(self.html_email.body(), expected)
AssertionError: '\n\n[1370 chars] you.\\u2020 No teasing. No grand testimonies![2036 chars]\n\n' != '\n\n[1370 chars] you.† No teasing. No grand testimonies! No\nk[2024 chars]\n\n'
Diff is 3878 characters long. Set self.maxDiff to None to see it.

----------------------------------------------------------------------
Ran 8 tests in 0.119s

FAILED (failures=1)


In [170]:
%%bash open .

UsageError: %%bash is a cell magic, but the cell body is empty.
