In [389]:
%matplotlib inline
import pandas as pd

## Conditional Probabilities

$P(A|B) = \frac{P(A\cap B)}{P(B)}$

In [390]:
a, b = set([1,2,3]), set([1,4,5])

In [391]:
# Intersection
a & b

{1}

In [392]:
# Union
a | b

{1, 2, 3, 4, 5}

In [393]:
total = len(a) + len(b)
p_a_cap_b = len(a & b) / total # Cap means intersection.
p_b = len(b) / total
p_a_given_b = p_a_cap_b / p_b
p_a_given_b

0.3333333333333333

## Inverse Conditional Probabilities (Bayes Theorem)

$P(B|A) = \frac{P(A|B)P(B)}{P(A)}$

## Reading email

In [394]:
import email
from bs4 import BeautifulSoup

In [395]:
with open('./data/plain.eml') as f:
    msg = email.message_from_file(f)
msg.get_content_type(), msg.get_payload(decode=False)[:100] # Decode=True will return bytes.

('text/plain',
 'Wanna see sexually curious teens playing with each other?\n\nhttp://www.site-personals.com <-- click h')

In [396]:
with open('./data/html.eml') as f:
    msg = email.message_from_file(f)
msg.get_content_type(), msg.get_payload(decode=False)[:100] # Decode=True will return bytes.

('text/html',
 "<body lang=EN-US>\n\n<div class=Section1>\n\n<p class=MsoBodyText style='text-align:justify'><b>CONSANTL")

In [397]:
# We can also get the email subject.
msg.get('Subject')

'One of a kind Money maker! Try it for free!'

In [448]:
# Parsing HTML email.
BeautifulSoup(msg.get_payload(decode=False), 'html.parser').text[:100]

'\n\nCONSANTLY being\nbombarded by so-called ìFREEî money-making systems that teases you with limited\nin'

## Email Spam Detection

In [399]:
import re
import email
import unittest
from bs4 import BeautifulSoup
from collections import defaultdict

In [400]:
class EmailObject:
    def __init__(self, file, category=None):
        self.mail = email.message_from_file(file)
        self.category = category

    def subject(self):
        """Returns the email subject."""
        return self.mail.get('Subject')

    def body(self):
        """Normalizes the content of the body to plain text if it is of type HTML."""
        content_type = self.mail.get_content_type()
        body = self.mail.get_payload(decode=False)
        
        if content_type == 'text/html':
            return BeautifulSoup(body, 'html.parser').text
        elif content_type == 'text/plain':
            return body
        else:
            return ''

In [441]:
class Tokenizer:
    NULL = u'\u0000'
    
    @staticmethod
    def tokenize(string):
        return re.findall('\w+', string.lower())
    
    @staticmethod
    def ngram(string, n=2):
        s = string.split(' ')
        result = []
        for i in range(1, n + 1):
            result.append([Tokenizer.NULL] * (n-i) + s)
        return list(zip(*result))
    
    @staticmethod
    def unique_tokenizer(string):
        tokens = Tokenizer.tokenize(string)
        return set(tokens)

In [483]:
class SpamTrainer:
    def __init__(self, training_files):
        self.categories = set()
    
        for category, file in training_files:
            self.categories.add(category)
        
        self.totals = defaultdict(float)
        self.training = {c: defaultdict(float) 
                         for c in self.categories}
        self.to_train = training_files
    
    def total_for(self, category):
        return self.totals[category]
    
    def train(self):
        for category, file in self.to_train:
            # It's not utf-8.
            with open(file, 'r', encoding='latin-1') as f:
                email = EmailObject(f)
            self.categories.add(category)
            
            for token in Tokenizer.unique_tokenizer(email.body()):
                self.training[category][token] += 1
                self.totals['_all'] += 1
                self.totals[category] += 1
        self.to_train = {}
    
    def score(self, email):
        self.train()
        cat_totals = self.totals
        
        aggregates = {c: cat_totals[c] / cat_totals['_all']
                      for c in self.categories}
        
        for token in Tokenizer.unique_tokenizer(email.body()):
            for cat in self.categories:
                value = self.training[cat][token]
                r = (value + 1)/(cat_totals[cat] + 1)
                aggregates[cat] *= r

        return aggregates
    
    def normalized_score(self, email):
        score = self.score(email)
        scoresum = sum(score.values())
        
        normalized = {cat: (agg/scoresum)
                      for cat, agg in score.items()}
        
        return normalized
    
    def preference(self):
        return sorted(self.categories, key=lambda cat: self.total_for(cat))
    
    class Classification:
        def __init__(self, guess, score):
            self.guess = guess
            self.score = score
            
        def __eq__(self, other):
            return self.guess == other.guess and self.score == other.score
    
    def classify(self, email):
        score = self.score(email)

        max_score = 0.0
        preference = self.preference()
        max_key = preference[-1]

        for k, v in score.items():
            if v > max_score:
                max_key = k
                max_score = v
            elif v == max_score and preference.index(k) > preference.index(max_key):
                max_key = k
                max_score = v
        return self.Classification(max_key, max_score)

In [484]:
class TestPlaintextEmailObject(unittest.TestCase):
    CLRF = '\n\n'
    
    # The spelling is setUp.
    def setUp(self):
        self.plain_file = './data/plain.eml'
        with open(self.plain_file, 'r') as f:
            self.plaintext = f
            self.text = f.read()
            self.plaintext.seek(0)
            self.plain_email = EmailObject(self.plaintext)
            
    def tearDown(self):
        self.plaintext.close()
    
    def test_parse_plain_body(self):
        body = self.CLRF.join(self.text.split(self.CLRF)[1:])
        self.assertEqual(self.plain_email.body(), body)
    
    def test_parses_the_subject(self):
        subject = re.search('Subject: (.*)', self.text).group(1)
        self.assertEqual(self.plain_email.subject(), subject)

In [485]:
class TestHTMLEmail(unittest.TestCase):
    CLRF = '\n\n'
    
    def setUp(self):
        with open('./data/html.eml', 'r') as f:
            self.html_file = f
            self.html = self.html_file.read()
            self.html_file.seek(0)
            self.html_email = EmailObject(self.html_file)

    def tearDown(self):
        self.html_file.close()
        
    def test_parses_stores_inner_text_html(self):
        body = self.CLRF.join(self.html.split(self.CLRF)[1:])
        expected = BeautifulSoup(body, 'html.parser').text
 
        self.assertEqual(self.html_email.body(), expected)
        
    def test_stores_subject(self):
        subject = re.search('Subject: (.*)', self.html).group(1)
        self.assertEqual(self.html_email.subject(), subject)

In [486]:
class TestTokenizer(unittest.TestCase):
    def setUp(self):
        self.string = 'this is a test of the emergency broadcasting system'
    
    def test_downcasing(self):
        expected = ['this', 'is', 'all', 'caps']
        actual = Tokenizer.tokenize('THIS IS ALL CAPS')
        self.assertEqual(expected, actual)
    
    def test_ngram(self):
        expected = [
            (u'\u0000', 'quick'),
            ('quick', 'brown'),
            ('brown', 'fox')
        ]
        actual = Tokenizer.ngram('quick brown fox', 2)
        self.assertEqual(expected, actual)

In [487]:
class TestSpamTrainer(unittest.TestCase):
    def setUp(self):
        self.training = [['spam', './data/plain.eml'],
                         ['ham', './data/small.eml'],
                         ['scram', './data/plain.eml']]
        self.trainer = SpamTrainer(self.training)
        with open('./data/plain.eml', 'r') as f:
            self.file = f
            self.email = EmailObject(self.file)
            
    def tearDown(self):
        self.file.close()
    
    def test_multiple_categories(self):
        categories = self.trainer.categories
        expected = set([k for k, v in self.training])
        self.assertEqual(categories, expected)
        
    def test_counts_all_at_zero(self):
        for cat in ['_all', 'spam', 'ham', 'scram']:
            self.assertEqual(self.trainer.total_for(cat), 0)
            
    def test_probability_being_1_over_n(self):
        trainer = self.trainer
        scores = list(trainer.score(self.email).values())
        
        self.assertAlmostEqual(scores[0], scores[-1])
        
        for i in range(len(scores) - 1):
            self.assertAlmostEqual(scores[i], scores[i+1])
            
    def test_adds_up_to_one(self):
        trainer = self.trainer
        scores = list(trainer.normalized_score(self.email).values())
        self.assertAlmostEqual(sum(scores), 1)
        self.assertAlmostEqual(scores[0], 1/2.0)
        
    def test_preference_category(self):
        trainer = self.trainer
        expected = sorted(trainer.categories, key=lambda cat: trainer.total_for(cat))
        self.assertEqual(trainer.preference(), expected)
        
    def test_give_preference_to_whatever_has_the_most(self):
        trainer = self.trainer
        score = trainer.score(self.email)
        
        preference = trainer.preference()[-1]
        preference_score = score[preference]
        
        expected = SpamTrainer.Classification(preference, preference_score)
        self.assertEqual(trainer.classify(self.email), expected)

In [488]:
if __name__ == '__main__':
    unittest.main(argv=['first-argument-is-excluded'], exit=False)

............
----------------------------------------------------------------------
Ran 12 tests in 0.048s

OK


In [489]:
%%bash open .

UsageError: %%bash is a cell magic, but the cell body is empty.


## Cross Validation

In [490]:
def label_to_training_data(fold_file):
    training_data = []
    with open(fold_file, 'r') as f:
        for line in f:
            target, filepath = line.rstrip().split(' ')
            # ./data/TRAINING/TRAIN_00002.eml
            # Remap the path
            filepath = filepath.replace('./data', './data/naive_bayes/data')
            training_data.append([target, filepath])
    print(training_data[0])
    return SpamTrainer(training_data)

In [491]:
trainer = label_to_training_data('./data/naive_bayes/fixtures/fold1.label')

['ham', './data/naive_bayes/data/TRAINING/TRAIN_00002.eml']


In [492]:
def parse_emails(keyfile):
    emails = []
    print(f'Parsing email for {keyfile}')
    
    with open(keyfile, 'r') as f:
        for line in f:
            label, file = line.rstrip().split(' ')
            file = file.replace('./data', './data/naive_bayes/data')
            with open(file, 'r', encoding='latin-1') as labelfile:
                emails.append(EmailObject(labelfile, category=label))
    print(f'Done parsing file for {keyfile}')
    return emails

In [493]:
emails = parse_emails('./data/naive_bayes/fixtures/fold2.label')
emails[0]

Parsing email for ./data/naive_bayes/fixtures/fold2.label
Done parsing file for ./data/naive_bayes/fixtures/fold2.label


<__main__.EmailObject at 0x1180a2a58>

In [494]:
%%bash open .

UsageError: %%bash is a cell magic, but the cell body is empty.


In [495]:
def validate(trainer, set_of_emails):
    correct = 0
    false_positives = 0.0
    false_negatives = 0.0
    confidence = 0.0
    
    for email in set_of_emails:
        classification = trainer.classify(email)
        confidence += classification.score
        
        if classification.guess == 'spam' and email.category == 'ham':
            false_positives += 1
        elif classification.guess == 'ham' and email.category == 'spam':
            false_negatives += 1
        else:
            correct += 1
    
    total = false_positives + false_negatives + correct
    
    false_positive_rate = false_positives / total
    false_negative_rate = false_negatives / total
    accuracy = (false_positives + false_negatives) / total
    message = f"""
        False Positives: {false_positive_rate}
        False Negatives: {false_negative_rate}
        Accuracy: {accuracy}
    """
    print(message)

In [496]:
validate(trainer, emails)


        False Positives: 0.0004623208506703652
        False Negatives: 0.21128062875635692
        Accuracy: 0.21174294960702728
    
