In [1]:
import os
import email
import string
import nltk
import codecs

In [2]:
DATA_DIR = 'trec07p/data'
LABELS_FILE = 'trec07p/full/index'
TRAINING_SET_RATIO = 0.7

In [3]:
punctuations = list(string.punctuation)
stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.PorterStemmer()

In [4]:
# helper methods for serializing our blacklist set
import json
import pickle

class PythonObjectEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, (list, dict, str, unicode, int, float, bool, type(None))):
            return json.JSONEncoder.default(self, obj)
        return {'_python_object': pickle.dumps(obj)}

def as_python_object(dct):
    if '_python_object' in dct:
        return pickle.loads(str(dct['_python_object']))
    return dct

In [5]:
labels = {}
spam_words = set()
ham_words = set()

In [6]:
def flatten_to_string(parts):
    ret = []
    if type(parts) == str:
        ret.append(parts)
    elif type(parts) == list:
        for part in parts:
            ret += flatten_to_string(part)
    elif parts.get_content_type == 'text/plain':
        ret += parts.get_payload()
    return ret

In [7]:
def extract_email_text(path):
    # Load a single email from an input file.
    with open(path) as f:
        msg = email.message_from_file(f)
    if not msg:
        return ""

    # Read the email subject.
    subject = msg['Subject']
    if subject:
        subject = subject.decode(encoding='utf-8', errors='ignore')
    else:
        subject = ""

    # Read the email body.
    body = ' '.join(m for m in flatten_to_string(msg.get_payload()) if type(m) == str)
    if body:
        body = body.decode(encoding='utf-8', errors='ignore')
    else:
        body = ""

    return subject + ' ' + body

In [8]:
def load(path):
    email_text = extract_email_text(path)
    if not email_text:
        return []

    # Tokenize the message.
    tokens = nltk.word_tokenize(email_text)

    # Remove punctuation from tokens.
    tokens = [i.strip("".join(punctuations)) for i in tokens if i not in punctuations]

    # Remove stopwords and stem tokens.
    if len(tokens) > 2:
        return [stemmer.stem(w) for w in tokens if w not in stopwords]
    return []

In [9]:
# Read the labels.
with open(LABELS_FILE) as f:
    for line in f:
        line = line.strip()
        label, key = line.split()
        labels[key.split('/')[-1]] = 1 if label.lower() == 'ham' else 0

# BUILDING SPAM_WORDS minus HAM_WORDS set

In [10]:
# Split corpus into train and test sets.
filelist = os.listdir(DATA_DIR)
X_train = filelist[:int(len(filelist)*TRAINING_SET_RATIO)]
X_test = filelist[int(len(filelist)*TRAINING_SET_RATIO):]

print "TOTAL FILES {}".format(len(filelist))
print "X_TRAIN: {}".format(len(X_train))
print "X_TEST: {}".format(len(X_test))

for filename in X_train:
    path = os.path.join(DATA_DIR, filename)
    if filename in labels:
        label = labels[filename]
        stems = load(path)
        if not stems:
            continue
        if label == 1:
            ham_words.update(stems)
        else:
            spam_words.update(stems)

blacklist = spam_words - ham_words

TOTAL FILES 75419
X_TRAIN: 52793
X_TEST: 22626


In [11]:
# clean non ascii blacklist words
print('before: {}'.format(len(blacklist)))
cleaned_blacklist = set() 
for w in iter(blacklist):
    try:
        w.decode('utf-8')
        cleaned_blacklist.add(w)
    except UnicodeError:
        pass
print('after: {}'.format(len(cleaned_blacklist)))

before: 107944
after: 100363


In [None]:
# Saving blacklist to disk
# with open('blacklist.json', 'w') as outfile:
#     json.dump(cleaned_blacklist, outfile, cls=PythonObjectEncoder)

In [12]:
with open ('blacklist.json', 'r') as infile:
    bl = json.load(infile, object_hook=as_python_object)

In [13]:
fp = 0
tp = 0
fn = 0
tn = 0

In [14]:
for filename in X_test:
    path = os.path.join(DATA_DIR, filename)
    if filename in labels:
        label = labels[filename]
        stems = load(path)
        if not stems:
            continue
        stems_set = set(stems)
        if stems_set & bl:
            if label == 1:
                fp = fp + 1
            else:
                tp = tp + 1
        else:
            if label == 1:
                tn = tn + 1
            else:
                fn = fn + 1

print "tp: " + str(tp)
print "tn: " + str(tn)
print "fp: " + str(fp)
print "fn: " + str(fn)

tp: 8251
tn: 6660
fp: 834
fn: 5124


# Using Machine Learning - Naive Bayes

In [15]:
# LOADING DATA
filelist = os.listdir(DATA_DIR)
X_train = filelist[:int(len(filelist)*TRAINING_SET_RATIO)]
X_test = filelist[int(len(filelist)*TRAINING_SET_RATIO):]

In [None]:
# PREPROCESSING DATA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split

# Loading messages
message_stems = [load(os.path.join(DATA_DIR, f)) for f in filelist]

In [25]:
joined_stems = [' '.join(l) for l in message_stems]

In [26]:
import numpy as np

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(joined_stems)
y = np.array([labels[f] for f in filelist])

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=TRAINING_SET_RATIO)

In [28]:
# NAIVE BAYES MACHINE LEARNING
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Initialize the classifier and make label predictions.
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Print results.
print(classification_report(y_test, y_pred, target_names=['Spam', 'Ham']))
print("Accuracy {:.3f}".format(accuracy_score(y_test, y_pred)))

             precision    recall  f1-score   support

       Spam       0.99      0.95      0.97     15068
        Ham       0.91      0.98      0.94      7558

avg / total       0.96      0.96      0.96     22626

Accuracy 0.962
