 ## Data 620, Week 5, Part 2 Assignment
 #### Team 4: John Grando, Nick Capofari, Ken Markus, Armenoush Aslanian-Persico, Andrew Goldberg
 
 #### Project Details: 
Use a dataset to predict a class of new documents (either withheld from the training dataset or from another source such as your own spam folder). 

For this project, we used a pre-processed Enron e-mail corpus (available here: http://www2.aueb.gr/users/ion/data/enron-spam/) to classify the documents as either spam or ham. 

 ### Import and normalize data

In [126]:
import os
import nltk
from IPython.display import display

def load_emails(folder):
    """Read every file in `folder` and return the contents as a list of unicode strings.

    Files are opened in binary mode and decoded as UTF-8; bytes that are not
    valid UTF-8 are dropped instead of raising.  File names are sorted because
    os.listdir returns entries in arbitrary order, and the seeded shuffle
    further down is only reproducible if the input order is stable.
    """
    emails = []
    for filename in sorted(os.listdir(folder)):
        with open(os.path.join(folder, filename), 'rb') as handle:
            emails.append(handle.read().decode('UTF8', errors='ignore'))
    return emails


# Root of the extracted enron1 corpus; set ENRON_DATA_DIR to point somewhere else.
DATA_DIR = os.environ.get('ENRON_DATA_DIR',
                          '/Users/andrew/Documents/School/Web Analytics/HW4/enron1')
spamfolder = os.path.join(DATA_DIR, 'spam')
hamfolder = os.path.join(DATA_DIR, 'ham')

spamdata = load_emails(spamfolder)
hamdata = load_emails(hamfolder)

In [127]:
#Sample e-mail data: show the decoded text of the second ham message as a sanity check
hamdata[1]

u'Subject: vastar resources , inc .\r\ngary , production from the high island larger block a - 1 # 2 commenced on\r\nsaturday at 2 : 00 p . m . at about 6 , 500 gross . carlos expects between 9 , 500 and\r\n10 , 000 gross for tomorrow . vastar owns 68 % of the gross production .\r\ngeorge x 3 - 6992\r\n- - - - - - - - - - - - - - - - - - - - - - forwarded by george weissman / hou / ect on 12 / 13 / 99 10 : 16\r\nam - - - - - - - - - - - - - - - - - - - - - - - - - - -\r\ndaren j farmer\r\n12 / 10 / 99 10 : 38 am\r\nto : carlos j rodriguez / hou / ect @ ect\r\ncc : george weissman / hou / ect @ ect , melissa graves / hou / ect @ ect\r\nsubject : vastar resources , inc .\r\ncarlos ,\r\nplease call linda and get everything set up .\r\ni \' m going to estimate 4 , 500 coming up tomorrow , with a 2 , 000 increase each\r\nfollowing day based on my conversations with bill fischer at bmar .\r\nd .\r\n- - - - - - - - - - - - - - - - - - - - - - forwarded by daren j farmer / hou / ect on 12 / 10

 ### Format and label e-mails spam/ham

In [128]:
import random

# Number of shuffled e-mails the vocabulary statistics are computed from.
TRAIN_SIZE = 500


def tokenize_words(words):
    """Run nltk.word_tokenize over each whitespace-delimited chunk and return one flat token list."""
    tokens = []
    for word in words:
        tokens.extend(nltk.word_tokenize(word))
    return tokens


# Each e-mail becomes (list of whitespace-split words, label).
labeled_emails = ([(ham_mail.split(), 'ham') for ham_mail in hamdata] +
                  [(spam_mail.split(), 'spam') for spam_mail in spamdata])
random.seed(222)
random.shuffle(labeled_emails)

# Only the first TRAIN_SIZE shuffled e-mails feed the word-frequency counts.
vocab_sample = labeled_emails[:TRAIN_SIZE]

all_emails = [email for email, classification in vocab_sample]
flattened_emails_all = [word for email in all_emails for word in email]

all_emails_spam = [email for email, classification in vocab_sample if classification == "spam"]
flattened_emails_spam = [word for email in all_emails_spam for word in email]

all_emails_ham = [email for email, classification in vocab_sample if classification == "ham"]
flattened_emails_ham = [word for email in all_emails_ham for word in email]

tokenized_emails_all = tokenize_words(flattened_emails_all)
tokenized_emails_spam = tokenize_words(flattened_emails_spam)
tokenized_emails_ham = tokenize_words(flattened_emails_ham)

 ### Define feature extractor

In [129]:
from nltk.corpus import stopwords

# Vocabulary: the 500 most common tokens overall plus the 250 most common tokens
# of each class, keeping only alphabetic tokens that are not English stopwords.

# Built once as a set; the stopword list does not change between candidate words.
english_stopwords = set(stopwords.words('english'))


def top_content_words(freq_dist, n):
    """Return the alphabetic, non-stopword tokens among the n most common in freq_dist.

    The result keeps the most-common-first order of freq_dist.most_common(n).
    """
    return [w for (w, c) in freq_dist.most_common(n)
            if w.isalpha() and w not in english_stopwords]


word_freq_all = nltk.FreqDist(tokenized_emails_all)
top_words_all = top_content_words(word_freq_all, 500)

word_freq_ham = nltk.FreqDist(tokenized_emails_ham)
top_words_ham = top_content_words(word_freq_ham, 250)

word_freq_spam = nltk.FreqDist(tokenized_emails_spam)
top_words_spam = top_content_words(word_freq_spam, 250)

# Merge the three lists and drop duplicates.
top_words = top_words_all + top_words_spam + top_words_ham
top_words = list(set(top_words))

In [130]:
#most common tokens across the sampled e-mails (before the alphabetic/stopword filtering)
# `word_freq` was never defined in this notebook; the overall distribution is `word_freq_all`.
# display() returns None, so it is called directly rather than printed.
display(word_freq_all.most_common(20))

[(u'-', 7721),
 (u'.', 5159),
 (u',', 3787),
 (u'/', 3321),
 (u':', 2745),
 (u'the', 2583),
 (u'to', 2025),
 (u'and', 1366),
 (u'ect', 1120),
 (u'of', 1051),
 (u'a', 1041),
 (u'for', 1037),
 (u'?', 931),
 (u'@', 922),
 (u'in', 816),
 (u'on', 791),
 (u'you', 780),
 (u'this', 735),
 (u'is', 700),
 (u'i', 641)]

None


In [131]:
#build feature extractor; uses both most common words and extremes in email length
from __future__ import division
import math
def document_features(document, vocabulary=None):
    """Build the boolean feature dict for one e-mail.

    Parameters
    ----------
    document : list of str
        The e-mail as a list of word tokens.
    vocabulary : iterable of str, optional
        Words to emit `contains(word)` features for.  Defaults to the
        module-level `top_words` list.

    Returns
    -------
    dict
        `len_check(short_mail)` / `len_check(long_mail)`: fewer than 20 / more
        than 1500 tokens.
        `contains(word)`: whether each vocabulary word occurs in the e-mail.
        `len_check(long_words)` / `len_check(short_words)`: average token
        length above 3.9 / below 2.5 characters.
        `len_check(long_lex_div)`: tokens per distinct token above 1.9.

    An empty document has no averages to compute, so its word-length and
    lexical-diversity flags are all False instead of raising ZeroDivisionError.
    """
    if vocabulary is None:
        vocabulary = top_words

    document_words = set(document)
    n_tokens = len(document)
    features = {}

    features['len_check(short_mail)'] = n_tokens < 20
    features['len_check(long_mail)'] = n_tokens > 1500

    for word in vocabulary:
        features['contains({})'.format(word)] = (word in document_words)

    if n_tokens:
        # float() keeps these true divisions even without `from __future__ import division`.
        avg_word_length = sum(len(word) for word in document) / float(n_tokens)
        lex_div = n_tokens / float(len(document_words))
        long_words = avg_word_length > 3.9
        short_words = avg_word_length < 2.5
        long_lex_div = lex_div > 1.9
    else:
        long_words = short_words = long_lex_div = False

    features['len_check(long_words)'] = long_words
    features['len_check(short_words)'] = short_words
    features['len_check(long_lex_div)'] = long_lex_div
    return features

 ### Train Classifier

In [132]:
# Featurize every labeled e-mail once, keeping each feature dict paired with its label.
featuresets = [(document_features(tokens), label) for (tokens, label) in labeled_emails]

# First 500 -> training, next 500 -> dev-test, everything after -> held-out test.
train_set = featuresets[:500]
dev_test_set = featuresets[500:1000]
test_set = featuresets[1000:]

classifier = nltk.NaiveBayesClassifier.train(train_set)

 ### Predictions

In [133]:
import pandas as pd
# One row per dev-test e-mail.  dev_test_set already holds the feature dict of
# each e-mail, so it is classified directly instead of being featurized again.
# Note: the 'spam or ham' column holds the feature dict, not a label.
preds = pd.DataFrame({'spam or ham': [features for (features, label) in dev_test_set],
                      'observed': [label for (features, label) in dev_test_set],
                      'predicted': [classifier.classify(features) for (features, label) in dev_test_set]})

In [134]:
# Rows are the observed labels, columns the predicted labels.
pd.crosstab(preds['observed'], preds['predicted'])

predicted,ham,spam
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,316,27
spam,0,157


Impressive sensitivity (every spam message in the dev-test set is caught) at the expense of some specificity: 27 of the 343 ham messages (about 8%) are predicted as spam. 

In [135]:
#Confusion matrix, sensitivity and specificity on the dev-test set (spam is the positive class)
from sklearn.metrics import confusion_matrix
# Pin the label order so index 0 is always ham and index 1 is always spam,
# even if one of the labels is missing from the observed or predicted values.
cm = confusion_matrix(preds.observed, preds.predicted, labels=['ham', 'spam'])

# Sensitivity: share of observed spam that was predicted as spam.
sensitivity1 = (float(cm[1,1])/(cm[1,1]+cm[1,0]))
# A single formatted string prints the same way under Python 2 and 3
# (print('a', b) prints a tuple under Python 2).
print('Sensitivity : %.4f' % sensitivity1)

# Specificity: share of observed ham that was predicted as ham.
specificity1 = (float(cm[0,0])/(cm[0,0]+cm[0,1]))
print('Specificity : %.4f' % specificity1)

('Sensitivity : ', 1.0)
('Specificity : ', 0.9212827988338192)


 ### Dev-Test Performance

In [136]:
# Overall accuracy on the dev-test set, then the 15 features with the largest likelihood ratios.
# The parenthesized form runs under both Python 2 and Python 3.
print('Accuracy: %4.2f' % nltk.classify.accuracy(classifier, dev_test_set))
classifier.show_most_informative_features(15)

Accuracy: 0.95
Most Informative Features
            contains(cc) = True              ham : spam   =     30.6 : 1.0
      contains(attached) = True              ham : spam   =     24.8 : 1.0
    contains(securities) = True             spam : ham    =     21.9 : 1.0
           contains(act) = True             spam : ham    =     21.9 : 1.0
          contains(save) = True             spam : ham    =     21.9 : 1.0
     contains(investing) = True             spam : ham    =     20.0 : 1.0
           contains(ect) = True              ham : spam   =     19.0 : 1.0
      contains(investor) = True             spam : ham    =     18.1 : 1.0
     contains(investors) = True             spam : ham    =     18.1 : 1.0
           contains(gas) = True              ham : spam   =     18.1 : 1.0
          contains(easy) = True             spam : ham    =     17.7 : 1.0
  contains(technologies) = True             spam : ham    =     14.3 : 1.0
        contains(target) = True             spam : ham    =

 ### Errors

In [137]:
# Collect one tuple per misclassified dev-test e-mail:
# (true label, predicted label, token count, tokens per distinct token,
#  average token length, modeled probability of spam).
errors = []
for (doc, tag) in labeled_emails[500:1000]:
    # Featurize once and reuse the result for both the label and the probability.
    features = document_features(doc)
    guess = classifier.classify(features)
    if guess == tag:
        continue

    spam_probability = classifier.prob_classify(features).prob("spam")
    n_tokens = len(doc)
    total_chars = sum(len(word) for word in doc)
    # float() keeps these true divisions without relying on another cell's
    # `from __future__ import division`; `or 1` avoids dividing by zero for an empty e-mail.
    lex_div = round(n_tokens / float(len(set(doc)) or 1), 1)
    avg_word_len = round(total_chars / float(n_tokens or 1), 2)
    errors.append((tag, guess, n_tokens, lex_div, avg_word_len, spam_probability))

In [138]:
# Column headers, in the same order as the fields of each tuple in `errors`.
col_names = [
    'tag',
    'guess',
    'length',
    'lex div',
    'avg word len',
    'modeled spam probability',
]
pd.DataFrame(data=errors, columns=col_names)

Unnamed: 0,tag,guess,length,lex div,avg word len,modeled spam probability
0,ham,spam,112,1.5,3.08,0.985873
1,ham,spam,24,1.3,3.21,0.580058
2,ham,spam,27,1.1,4.04,0.996669
3,ham,spam,91,1.3,3.65,0.929502
4,ham,spam,11,1.1,2.82,0.967109
5,ham,spam,152,1.8,4.27,0.996847
6,ham,spam,13,1.1,4.85,0.991888
7,ham,spam,858,2.4,3.89,0.99985
8,ham,spam,112,1.7,4.17,0.999996
9,ham,spam,434,2.5,3.79,0.999607


 ## Model Performance

In [139]:
# One row per held-out test e-mail.  test_set already holds the feature dict of
# each e-mail, so it is classified directly instead of being featurized again.
# Note: the 'spam or ham' column holds the feature dict, not a label.
perf = pd.DataFrame({'spam or ham': [features for (features, label) in test_set],
                     'observed': [label for (features, label) in test_set],
                     'predicted': [classifier.classify(features) for (features, label) in test_set]})
# Rows are the observed labels, columns the predicted labels.
pd.crosstab(perf.observed, perf.predicted)

predicted,ham,spam
observed,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,2670,288
spam,7,1207


We see that sensitivity remains very high (only 7 of the 1,214 spam messages are missed), but 288 of the 2,958 ham messages (about 10%) are unfortunately classified as spam. 

In [141]:
#Confusion matrix, sensitivity and specificity on the held-out test set (spam is the positive class)
from sklearn.metrics import confusion_matrix
# Pin the label order so index 0 is always ham and index 1 is always spam,
# even if one of the labels is missing from the observed or predicted values.
cm = confusion_matrix(perf.observed, perf.predicted, labels=['ham', 'spam'])

# Sensitivity: share of observed spam that was predicted as spam.
sensitivity1 = (float(cm[1,1])/(cm[1,1]+cm[1,0]))
# A single formatted string prints the same way under Python 2 and 3
# (print('a', b) prints a tuple under Python 2).
print('Sensitivity : %.4f' % sensitivity1)

# Specificity: share of observed ham that was predicted as ham.
specificity1 = (float(cm[0,0])/(cm[0,0]+cm[0,1]))
print('Specificity : %.4f' % specificity1)

('Sensitivity : ', 0.9942339373970346)
('Specificity : ', 0.9026369168356998)


In [142]:
# Overall accuracy on the held-out test set, then the 15 features with the largest likelihood ratios.
# The parenthesized form runs under both Python 2 and Python 3.
print('Accuracy: %4.2f' % nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(15)

Accuracy: 0.93
Most Informative Features
            contains(cc) = True              ham : spam   =     30.6 : 1.0
      contains(attached) = True              ham : spam   =     24.8 : 1.0
    contains(securities) = True             spam : ham    =     21.9 : 1.0
           contains(act) = True             spam : ham    =     21.9 : 1.0
          contains(save) = True             spam : ham    =     21.9 : 1.0
     contains(investing) = True             spam : ham    =     20.0 : 1.0
           contains(ect) = True              ham : spam   =     19.0 : 1.0
      contains(investor) = True             spam : ham    =     18.1 : 1.0
     contains(investors) = True             spam : ham    =     18.1 : 1.0
           contains(gas) = True              ham : spam   =     18.1 : 1.0
          contains(easy) = True             spam : ham    =     17.7 : 1.0
  contains(technologies) = True             spam : ham    =     14.3 : 1.0
        contains(target) = True             spam : ham    =