In [50]:
import unittest
import io
import re
import email

from bs4 import BeautifulSoup

In [51]:
class EmailObject(object):
    """Parsed view of one e-mail message exposing its subject and text body.

    Parameters
    ----------
    filepath : binary file object
        Despite the name, this must be an open file-like object in *binary*
        mode (not a path string); it is passed straight to
        ``email.message_from_binary_file``.
    category : optional
        Stored unchanged on the instance; this class never reads it.
    """

    # Separator inserted between the text of individual MIME parts by body().
    # NOTE(review): the sequence is LF CR LF CR, not the CR LF CR LF the name
    # hints at; it is left untouched so multipart output does not change.
    CLRF = "\n\r\n\r"

    def __init__(self, filepath, category = None):
        self.filepath = filepath
        self.category = category
        self.mail = email.message_from_binary_file(self.filepath)

    def subject(self):
        """Return the ``Subject`` header value, or ``None`` when it is absent."""
        return self.mail.get('Subject')

    def body(self):
        """Return the text of every text part, joined by ``CLRF``.

        The whole MIME tree is walked, so text that sits inside a nested
        multipart container is included as well. Containers themselves and
        parts that are neither ``text/plain`` nor ``text/html`` contribute
        nothing, and empty parts are skipped.
        """
        texts = []
        # walk() yields the message itself first, so the single-part and the
        # multipart case share one code path.
        for part in self.mail.walk():
            text = self._single_body(part)
            if isinstance(text, bytes):
                # Defensive: an overriding _single_body may still hand back bytes.
                text = text.decode('utf-8', errors = 'ignore')
            if text:
                texts.append(text)
        return self.CLRF.join(texts)

    @staticmethod
    def _single_body(part):
        """Return the text carried by one MIME part, or ``''`` if it has none.

        ``text/html`` parts are reduced to their inner text with
        BeautifulSoup; ``text/plain`` parts are decoded to ``str``.
        """
        content_type = part.get_content_type()
        try:
            body = part.get_payload(decode = True)
        except Exception:
            # Best effort: a part whose payload cannot be decoded is skipped.
            return ''

        if body is None:
            # get_payload(decode=True) returns None for multipart containers.
            return ''
        if content_type == 'text/html':
            return BeautifulSoup(body, 'html.parser').text
        if content_type == 'text/plain':
            return EmailObject._decode(body, part.get_content_charset())
        return ''

    @staticmethod
    def _decode(raw, charset):
        """Decode payload bytes using ``charset``, else lenient UTF-8.

        The declared charset is tried strictly first. If it is missing,
        unknown to Python, or does not fit the bytes, the payload is decoded
        as UTF-8 with undecodable bytes dropped. Non-bytes input is returned
        unchanged.
        """
        if not isinstance(raw, bytes):
            return raw
        if charset:
            try:
                return raw.decode(charset)
            except (LookupError, ValueError):
                # Unknown codec name or bytes that do not match the declared
                # charset: fall through to the lenient UTF-8 decode below.
                pass
        return raw.decode('utf-8', errors = 'ignore')

In [52]:
class Tokenizer:
    """Stateless helpers that turn text into lower-cased word tokens and n-grams."""

    # Placeholder token used to left-pad n-grams that start before the text does.
    NULL = u'\u0000'

    @staticmethod
    def tokenize(string):
        """Return the lower-cased ``\\w+`` runs of ``string`` as a list of tokens."""
        # Raw literal: '\w' in a plain string is an invalid escape sequence and
        # triggers a warning on recent Python versions.
        return re.findall(r'\w+', string.lower())

    @staticmethod
    def ngram(string, ngram):
        """Return one n-gram per token, each ending at that token.

        Parameters
        ----------
        string : str
            Text to tokenize with ``tokenize``.
        ngram : int
            Number of tokens per n-gram.

        Returns
        -------
        list of list of str
            The i-th entry holds the ``ngram`` tokens ending at token i;
            entries near the start are left-padded with ``NULL`` so every
            entry has the same length.
        """
        tokens = Tokenizer.tokenize(string)

        ngrams = []
        for i in range(len(tokens)):
            # Window of up to `ngram` tokens that ends at (and includes) token i.
            first_idx = max(i - ngram + 1, 0)
            # Number of positions that would fall before the first token.
            padding = max(ngram - 1 - i, 0)
            ngrams.append(Tokenizer.pad(tokens[first_idx:i + 1], padding))

        return ngrams

    @staticmethod
    def pad(tokens, padding):
        """Return a new list: ``padding`` copies of ``NULL`` followed by ``tokens``."""
        return [Tokenizer.NULL] * padding + tokens

In [53]:
class TestPlaintextEmailObject(unittest.TestCase):
    """Checks EmailObject against the plain-text fixture ``./plain.eml``."""

    # Blank line on which the raw fixture text is split into header and body.
    CLRF = '\n\n'

    def setUp(self):
        """Load the fixture both as decoded text and as a parsed EmailObject."""
        self.plain_file = './plain.eml'

        with io.open(self.plain_file, 'rb') as handle:
            raw_bytes = handle.read()
            self.text = raw_bytes.decode('utf-8')
            # Rewind so the parser reads the file from its first byte again.
            handle.seek(0)
            self.plain_email = EmailObject(handle)

    def test_parse_plain_body(self):
        # Everything after the first blank line is the expected body.
        _, _, expected_body = self.text.partition(self.CLRF)
        self.assertEqual(self.plain_email.body(), expected_body)

    def test_parses_the_subject(self):
        # The expected subject is read straight from the raw header line.
        header_match = re.search("Subject: (.*)", self.text)
        expected_subject = header_match.group(1)
        self.assertEqual(self.plain_email.subject(), expected_subject)

In [54]:
class TestHTMLEmail(unittest.TestCase):
    """Checks EmailObject against the HTML fixture ``./html.eml``."""

    # Blank line on which the raw fixture text is split into header and body.
    CLRF = '\n\n'

    def setUp(self):
        """Load the fixture both as decoded text and as a parsed EmailObject."""
        with io.open('./html.eml', 'rb') as handle:
            raw_bytes = handle.read()
            self.html = raw_bytes.decode('utf-8')
            # Rewind so the parser reads the file from its first byte again.
            handle.seek(0)
            self.html_email = EmailObject(handle)

    def test_parses_stores_inner_text_html(self):
        # The markup after the first blank line, reduced to its inner text,
        # is what body() is expected to return.
        _, _, markup = self.html.partition(self.CLRF)
        expected_text = BeautifulSoup(markup, 'html.parser').text
        parsed_text = self.html_email.body()
        self.assertEqual(parsed_text, expected_text)

    def test_stores_subject(self):
        # The expected subject is read straight from the raw header line.
        header_match = re.search('Subject: (.*)', self.html)
        expected_subject = header_match.group(1)
        self.assertEqual(self.html_email.subject(), expected_subject)

In [55]:
class TestTokenizer(unittest.TestCase):
    """Checks lower-casing and padded bigram generation of Tokenizer."""

    def setUp(self):
        # Sample sentence stored on the instance; the tests below do not read it.
        self.string = 'this is a test of the emergency broadcasting system'

    def test_downcasing(self):
        # Upper-case input must come back as lower-case tokens.
        tokens = Tokenizer.tokenize('THIS IS ALL CAPS')
        self.assertEqual(tokens, ['this', 'is', 'all', 'caps'])

    def test_ngrams(self):
        # The first bigram has no predecessor, so it is padded with the null token.
        bigrams = Tokenizer.ngram('quick brown fox', 2)
        wanted = [
            [u'\u0000', 'quick'],
            ['quick', 'brown'],
            ['brown', 'fox'],
        ]
        self.assertEqual(bigrams, wanted)

In [56]:
if __name__ == '__main__':
    # An explicit argv is supplied so unittest does not parse the process's
    # own command line (presumably the notebook kernel's arguments).
    # exit=False keeps unittest from calling sys.exit() once the run finishes.
    unittest.main(
        exit = False,
        argv = ['ignore-first-argv'],
    )

......
----------------------------------------------------------------------
Ran 6 tests in 0.039s

OK
