In [31]:
import csv
import os
import logging
import random
from collections import defaultdict
from decimal import Decimal
import nltk
import re
import email.parser
import sys
sys.path.append('/usr/local/lib/python3.5/site-packages')
import lxml.html

# Module-level logger named after this module; SpamHamDetector reports
# per-email training/classification failures through it.
logger = logging.getLogger(__name__)

def extract_body(filename):
    """Return the text content of the body of the email stored in *filename*.

    The file is read as UTF-8 (undecodable bytes are replaced rather than
    raising) and parsed as an email message.  When the message payload is a
    list, only its first element is used.  The payload's string form is then
    parsed as HTML with lxml and the document's text content is returned.

    Any exception raised while opening, parsing or converting the message
    propagates to the caller.
    """
    with open(filename, encoding='utf-8', errors='replace') as f:
        text = f.read()

    # The file is no longer needed once its contents are in memory, so the
    # parsing happens after the handle has been closed.
    msg = email.message_from_string(text)
    payload = msg.get_payload()
    if isinstance(payload, list):
        payload = payload[0]
    return lxml.html.document_fromstring(str(payload)).text_content()

class NaiveBayes(object):
    """Naive Bayes text classifier using add-one (Laplace) smoothing.

    State kept on the instance:

    * ``words`` -- ``{word: {category: occurrences}}`` seen during training.
    * ``categories`` -- ``{category: {'total': trained examples,
      'word_count': trained tokens}}``.
    * ``training_examples`` -- number of ``train`` calls so far.
    * ``unique_words`` -- set of every token seen during training.

    Probabilities are computed with ``Decimal`` so that the product of many
    small per-word likelihoods does not underflow the way floats would.
    """

    # English stopword set, loaded from nltk on first tokenization and then
    # reused; fetching it once avoids re-reading the corpus for every token.
    _stopwords = None

    def __init__(self, categories):
        """Create an untrained classifier for the given iterable of categories."""
        self.words = defaultdict(dict)
        self.categories = self._create_categories(categories)
        self.training_examples = 0
        self.unique_words = set()

    def _create_categories(self, categories):
        """Return a zeroed counter record for every category."""
        return {category: {'total': 0, 'word_count': 0}
                for category in categories}

    def train(self, category, text):
        """Update the model with one example *text* labelled *category*."""
        tokens = self._tokenize_text(text)  # TODO: stem words

        self._increment_unique_word_count(tokens)  # vocabulary size for Laplace smoothing
        self._increment_word_frequency(category, tokens)
        self._increment_category_count(category)
        self._increment_category_word_count(category, len(tokens))

        self.training_examples += 1

    def _tokenize_text(self, text):
        """Split *text* into word tokens and drop English stopwords.

        Tokens are runs of word characters and apostrophes.  The stopword
        comparison is case-sensitive, exactly as the tokens appear in *text*.
        """
        if self._stopwords is None:
            self._stopwords = frozenset(nltk.corpus.stopwords.words('english'))
        stopwords = self._stopwords
        return [word for word in re.findall(r"[\w']+", text)
                if word not in stopwords]

    def _increment_word_frequency(self, category, words):
        """Count one occurrence under *category* for each token in *words*."""
        for word in words:
            per_category = self.words[word]
            per_category[category] = per_category.get(category, 0) + 1

    def _increment_unique_word_count(self, text):
        """Add the tokens in *text* to the set of known words."""
        # In-place update instead of rebuilding the whole set on every call.
        self.unique_words.update(text)

    def _increment_category_count(self, category):
        """Record one more training example for *category*."""
        self.categories[category]['total'] += 1

    def _increment_category_word_count(self, category, number):
        """Add *number* tokens to the running token total of *category*."""
        # 'word_count' always exists (see _create_categories), so no
        # missing-key branch is needed.
        self.categories[category]['word_count'] += number

    def classify(self, text):
        """Return ``1`` or ``0`` for *text*.

        The score of each category is its prior multiplied by the smoothed
        likelihood of the tokens.  ``1`` is returned only when the score of
        category ``1`` is strictly greater than that of category ``0``; the
        classifier must therefore have been created with categories 0 and 1.
        """
        tokens = self._tokenize_text(text)

        probabilities = {}
        for cat, cat_data in self.categories.items():
            category_prob = self._get_category_probability(cat_data['total'])
            predictors_likelihood = self._get_predictors_probability(cat, tokens)
            probabilities[cat] = category_prob * predictors_likelihood

        return 1 if probabilities[1] > probabilities[0] else 0

    def _get_category_probability(self, count):
        """Return the prior for a category that has *count* training examples.

        The denominator is the number of training examples plus the number
        of categories.
        """
        # Logarithms could be used instead of Decimal to avoid floating
        # point underflow, e.g. return log(class_prior_prob).
        return Decimal(count) / Decimal(self.training_examples + len(self.categories))

    def _get_predictors_probability(self, category, text):
        """Return the smoothed likelihood of the token list *text* given *category*.

        Each token contributes ``(1 + occurrences in category) /
        (tokens in category + vocabulary size)``; unseen tokens therefore
        contribute a numerator of 1.
        See http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
        """
        word_count = Decimal(self.categories[category]['word_count'] + len(self.unique_words))
        likelihood = Decimal(1)
        for word in text:
            # .get() on the outer mapping keeps classification from inserting
            # empty entries into the defaultdict for unseen words.
            seen = self.words.get(word, {}).get(category, 0)
            likelihood *= Decimal(1 + seen) / word_count
        return likelihood

class SpamHamDetector(object):
    """Train a NaiveBayes model on labelled emails and classify test emails.

    The data directory *path* is expected to contain ``labels.csv`` (with
    ``Id`` and ``Prediction`` columns), training emails at
    ``TR/TRAIN_<Id>.eml`` and test emails at ``TT/TEST_<n>.eml``.
    Predictions are written to ``results.csv`` in the same directory.
    """

    def __init__(self, categories, path):
        """Create a detector for *categories* reading data from directory *path*."""
        self.naive_bayes = NaiveBayes(categories)
        self.path = path
        # Maps the test email number (as str) to its predicted category (as str).
        self.classified_examples = dict()

    def _labeled_examples(self):
        """Yield ``(id, label, filename)`` for every row of ``labels.csv``.

        *id* and *label* are the raw strings from the ``Id`` and
        ``Prediction`` columns; *filename* is the matching training email
        under this detector's own data directory.
        """
        with open('{0}/labels.csv'.format(self.path), 'r') as labels_csv:
            for row in csv.DictReader(labels_csv):
                filename = '{0}/TR/TRAIN_{1}.eml'.format(self.path, row['Id'])
                yield row['Id'], row['Prediction'], filename

    def train(self):
        """Train the classifier on every email listed in ``labels.csv``.

        Emails that cannot be read, parsed or trained on are logged and
        skipped.
        """
        for example_id, label, filename in self._labeled_examples():
            try:
                self.naive_bayes.train(int(label), extract_body(filename))
            except Exception as e:
                # Exceptions have no ``.message`` attribute in Python 3; log
                # the exception itself.  Logged as a warning so skipped
                # emails are visible under the default logging configuration.
                logger.warning("Error training email %s: %s", example_id, e)

    def train_and_evaluate(self, total_examples=2500, training_size=2250):
        """Train on a random subset of the labelled emails and score the rest.

        Ids ``1..total_examples`` are shuffled; the first *training_size* of
        them are used for training and the remainder for evaluation.  Emails
        that fail to load or classify are logged and left out of the tally.

        Returns the summary string built by ``_calculate_results``.
        """
        all_ids = list(range(1, total_examples + 1))
        random.shuffle(all_ids)
        # Sets give constant-time membership tests for every CSV row.
        training_ids = set(all_ids[:training_size])
        labeling_ids = set(all_ids[training_size:])

        for example_id, label, filename in self._labeled_examples():
            if int(example_id) not in training_ids:
                continue
            try:
                self.naive_bayes.train(int(label), extract_body(filename))
            except Exception as e:
                logger.warning("Error training email %s: %s", example_id, e)

        correct, incorrect = 0, 0
        # Second pass: evaluation must only start once training is complete.
        for example_id, label, filename in self._labeled_examples():
            if int(example_id) not in labeling_ids:
                continue
            try:
                result = self.naive_bayes.classify(extract_body(filename))
                expected = int(label)
            except Exception as e:
                logger.warning("Error classifying email %s: %s", example_id, e)
                continue
            if result == expected:
                correct += 1
            else:
                incorrect += 1
        return self._calculate_results(correct, incorrect)

    def classify(self, size):
        """Classify test emails ``1..size`` and write the predictions to disk.

        Emails that fail to load or classify are logged and get no entry in
        ``classified_examples``.
        """
        test = self.path + '/TT/TEST_{0}.eml'

        for counter in range(1, size + 1):
            try:
                test_body = extract_body(test.format(counter))
                self.classified_examples[str(counter)] = str(self.naive_bayes.classify(test_body))
            except Exception as e:
                logger.warning("Error classifying email %s: %s", counter, e)

        self._store_results()

    def display_results(self):
        """Return a summary of the stored predictions.

        Predictions equal to ``'0'`` are counted as spam and ``'1'`` as ham.
        The two "Percent" values are fractions of the number of classified
        emails (0.0 when nothing has been classified yet).
        """
        total = len(self.classified_examples)
        spam = sum(1 for category in self.classified_examples.values() if category == '0')
        ham = sum(1 for category in self.classified_examples.values() if category == '1')
        spam_fraction = float(spam) / total if total else 0.0
        ham_fraction = float(ham) / total if total else 0.0
        # All fields are numbered explicitly and passed as separate
        # arguments; mixing ``{0}`` with ``{}`` or passing one tuple makes
        # str.format raise.
        return "Spam Emails: {0}\nHam Emails: {1}\nSpam Percent: {2}\nHam Percent: {3}".format(
            spam, ham, spam_fraction, ham_fraction)

    def _calculate_results(self, correct, incorrect):
        """Return a summary string with the counts and the fraction correct.

        The fraction is 0.0 when no email was evaluated at all.
        """
        evaluated = correct + incorrect
        performance = float(correct) / evaluated if evaluated else 0.0
        return "correct {0}s, incorrect {1}s, performance measurement {2}s".format(
            correct, incorrect, performance)

    def _store_results(self):
        """Write ``classified_examples`` to ``results.csv`` in the data directory."""
        # newline='' lets the csv module control line endings itself.
        with open('{0}/results.csv'.format(self.path), 'w', newline='') as resultscsv:
            writer = csv.DictWriter(resultscsv, fieldnames=['id', 'Prediction'])
            writer.writeheader()
            for example_num, category in self.classified_examples.items():
                writer.writerow({'id': example_num, 'Prediction': category})

In [None]:
print("starting!")
path = '/Users/lorenamesa/Desktop/naive-bayes/naive-bayes' #os.path.dirname(__file__)

# Estimate accuracy with a separate detector: its classifier is trained on
# a random subset only and is thrown away afterwards.
evaluator = SpamHamDetector([0, 1], path)
print(evaluator.train_and_evaluate())

# Train the final model from scratch on all labelled emails.  Reusing the
# evaluation detector here would count its training subset a second time.
detector = SpamHamDetector([0, 1], path)
detector.train()
print("done training!")
detector.classify(1827)
print("done classifying!")
print(detector.display_results())

starting!
here is categories: {0: {'word_count': 0, 'total': 0}, 1: {'word_count': 0, 'total': 0}}


#### Porting Project from Python 2 -> Python 3

- `email.parser` triggers a `UnicodeError`; resolve it by following these [instructions](http://stackoverflow.com/questions/22216076/unicodedecodeerror-utf8-codec-cant-decode-byte-0xa5-in-position-0-invalid-s).
- Loading `lxml.html` fails with an error; resolve it by commenting out the Anaconda bin location in `~/.bash_profile` and following the `brew install` directions found [here](http://stackoverflow.com/questions/23172384/lxml-runtime-error-reason-incompatible-library-version-etree-so-requires-vers).