In [2]:
import numpy as np
import os
import pickle
import math

In [218]:
def _readMails(directory, fileNames):
    """Return the full text of every file in `fileNames`, read from `directory`.

    Each file is opened in a `with` block so its handle is closed as soon as
    the mail has been read, instead of staying open for the life of the kernel.
    """
    mails = []
    for fname in fileNames:
        with open(os.path.join(directory, fname)) as f:
            mails.append(f.read())
    return mails


# os.listdir returns entries in arbitrary order; sorting makes the later
# positional train/validation/test split reproducible between runs.
spamFiles = sorted(os.listdir('Spam'))
hamFiles = sorted(os.listdir('Ham'))

# One raw text string per mail.
spam = _readMails('Spam', spamFiles)
ham = _readMails('Ham', hamFiles)

In [219]:
# Bundle both raw corpora under their class label so they can be saved together.
data = dict(spam=spam, ham=ham)

In [220]:
# Persist the raw dataset. The `with` block guarantees the file is flushed and
# closed even if pickling fails part-way through.
with open('Dataset.pickle', 'wb') as pickleOut:
    pickle.dump(data, pickleOut)

In [221]:
def isNum(s):
    """Return True if `s` is a numeric literal, False otherwise.

    `float()` also accepts the spelled-out specials "nan", "inf" and
    "infinity" (any case, optional sign). Those are ordinary words in an
    e-mail, so they are reported as non-numeric here. Values that cannot be
    converted at all (including non-string input such as None) are
    non-numeric as well.
    """
    try:
        value = float(s)
    except (TypeError, ValueError):
        return False
    if math.isnan(value):
        # Only the literal text "nan" parses to NaN.
        return False
    if math.isinf(value):
        # An overflowing literal such as "1e999" is still a number; only the
        # spelled-out "inf"/"infinity" forms are treated as words.
        return "inf" not in str(s).lower()
    return True

### Preprocess the mails: strip punctuation, tokenize, and remove stopwords and numbers

In [222]:
# Common English words that carry no spam/ham signal and are dropped from
# every mail. "ours" and "ourselves" are separate entries: they were
# previously fused into one whitespace-joined string that could never match
# a token.
stopwords = set([
    " ", "a", "about", "above", "after", "again", "against", "all", "am", "an",
    "and", "any", "are", "aren't", "as", "at", "be", "because", "been",
    "before", "being", "below", "between", "both", "but", "by", "can't",
    "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't",
    "doing", "don't", "down", "during", "each", "few", "for", "from",
    "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having",
    "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself",
    "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've",
    "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's",
    "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of",
    "off", "on", "once", "only", "or", "other", "ought", "our", "ours",
    "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd",
    "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than",
    "that", "that's", "the", "their", "theirs", "them", "themselves", "then",
    "there", "there's", "these", "they", "they'd", "they'll", "they're",
    "they've", "this", "those", "through", "to", "too", "under", "until", "up",
    "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were",
    "weren't", "what", "what's", "when", "when's", "where", "where's", "which",
    "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would",
    "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours",
    "yourself", "yourselves",
])

In [223]:
# Characters treated as token separators: each occurrence is replaced by a
# space before the mail is split into words. (The two-character entry "__"
# can never equal a single character, so it has no effect.)
avoidChars = set([".", "+", "-", "|", "\r", "\n", "/", ":", "!", "\x00", "\xff", "=", "%", "'", ",", "*", "_", "__"])

# Tokenize every spam mail in place: raw text -> list of space-separated tokens.
# Filters on `avoidChars`; the loop previously referenced an undefined name
# (`unwanted`), which raises NameError on a fresh kernel.
for idx, document in enumerate(spam):
    cleaned = "".join([c if c not in avoidChars else " " for c in document])
    spam[idx] = cleaned.split(" ")

In [224]:
# Tokenize every ham mail in place: raw text -> list of space-separated tokens.
# Filters on `avoidChars` (the separator set defined in the previous cell);
# the loop previously referenced an undefined name (`unwanted`).
for idx, document in enumerate(ham):
    cleaned = "".join([c if c not in avoidChars else " " for c in document])
    ham[idx] = cleaned.split(" ")

In [225]:
# Normalise each ham mail: lower-case the tokens, then drop empty strings,
# stopwords and numeric tokens.
for idx, doc in enumerate(ham):
    lowered = [item.lower() for item in doc if item != ""]
    ham[idx] = [tok for tok in lowered if not (tok in stopwords or isNum(tok))]

In [226]:
# Normalise each spam mail: lower-case the tokens, then drop empty strings,
# stopwords and numeric tokens.
for idx, doc in enumerate(spam):
    lowered = [item.lower() for item in doc if item != ""]
    spam[idx] = [tok for tok in lowered if not (tok in stopwords or isNum(tok))]

### Split into train, validation and test sets

In [227]:
# Positional split of each class: first 85% train, next 5% validation,
# final 10% test.
spamTrainEnd = int(len(spam)*0.85)
spamValEnd = int(len(spam)*0.9)
hamTrainEnd = int(len(ham)*0.85)
hamValEnd = int(len(ham)*0.9)

# All three slices are taken in one statement, so `spam` / `ham` are only
# rebound to their training portion after the other parts have been cut
# from the full list.
spam, spamval, spamtest = spam[:spamTrainEnd], spam[spamTrainEnd:spamValEnd], spam[spamValEnd:]
ham, hamval, hamtest = ham[:hamTrainEnd], ham[hamTrainEnd:hamValEnd], ham[hamValEnd:]

In [242]:
# Sanity check: size of the spam validation set and the spam training set.
print("%d %d" % (len(spamval), len(spam)))

858 14596


### We calculate the class priors from the training split only (the validation and test mails were already set aside above)

In [228]:
# Class priors, estimated as the share of each class among the training mails.
nSpam, nHam = len(spam), len(ham)
Pspam = float(nSpam) / (nSpam + nHam)
Pham = 1 - Pspam

In [229]:
# Display both priors as the cell's output.
(Pspam, Pham)

(0.5092812281926029, 0.4907187718073971)

### We also need the conditional probability of each word given that the mail is spam, and given that it is non-spam.

In [230]:
# Vocabulary: every distinct token that occurs in the training spam or ham mails.
wordList = set()
for corpus in (spam, ham):
    for lst in corpus:
        wordList.update(lst)

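### P_wc = probability of finding word $w_i$ given document class $c$ (with additive smoothing; the code returns its logarithm).
### $P(w_i \mid c) = \dfrac{\mathrm{count}(w_i, c) + \alpha}{\sum_{w \in V} \left(\mathrm{count}(w, c) + \alpha\right)}$, where $V$ is the vocabulary.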

In [243]:
# Per-class word frequencies over the training mails.
countSpam = {}
countHam = {}

for lst in spam:
    for word in lst:
        countSpam[word] = countSpam.get(word, 0) + 1

for lst in ham:
    for word in lst:
        countHam[word] = countHam.get(word, 0) + 1

# Total number of tokens per class equals the sum of that class's word counts.
totalSpamWords = sum(countSpam.values())
totalHamWords = sum(countHam.values())

In [244]:
# Additive smoothing constant: P_wc adds it to every word count so that a word
# unseen in a class never gets probability zero.
alpha = 1

In [245]:
def P_wc(word, spam):
    """Return the smoothed log-probability of `word` given the document class.

    word -- the token to score.
    spam -- truthy to score against the spam class, falsy for the ham class.

    The estimate is (count + alpha) / (class token total + alpha * vocabulary
    size), read from the module-level counts; its natural log is returned.
    """
    if spam:
        counts, total = countSpam, totalSpamWords
    else:
        counts, total = countHam, totalHamWords
    occ = counts.get(word, 0)
    return np.log(float(occ + alpha) / (total + alpha*len(wordList)))

In [264]:
# Decision threshold on the ratio of log-scores computed in isSpam.
thresh = 0.495


def isSpam(email):
    """Return whether the tokenized mail `email` is classified as spam.

    Starting from the log class priors, the per-word log-likelihoods from
    P_wc are accumulated for both classes. The mail is flagged as spam when
    the spam score's share of the combined score, ps / (ps + ph), is at most
    `thresh`.
    """
    ps = np.log(Pspam)
    ph = np.log(Pham)

    for word in email:
        ps += P_wc(word, 1)
        ph += P_wc(word, 0)

    ratio = ps*1.0/(ps+ph)
    return ratio <= thresh
            
   

In [265]:
def getAcc(spam, ham):
    """Return the fraction of mails that isSpam classifies correctly.

    spam -- tokenized mails whose true label is spam.
    ham  -- tokenized mails whose true label is ham.
    """
    hamCorrect = sum(1 for mail in ham if not isSpam(mail))
    spamCorrect = sum(1 for mail in spam if isSpam(mail))
    return float(hamCorrect + spamCorrect) / (len(spam) + len(ham))

In [266]:
def getFP(spam, ham):
    """Return the false-positive rate: the share of ham mails flagged as spam.

    spam -- unused; kept so the signature mirrors getAcc.
    ham  -- tokenized mails whose true label is ham.

    The rate is returned rather than printed, so a caller that prints the
    result shows the number itself instead of a trailing "None". An empty
    `ham` list yields 0.0 instead of a division by zero.
    """
    if not ham:
        return 0.0
    fp = sum(1 for item in ham if isSpam(item))
    return fp*1.0/len(ham)

## Hyperparameter Selection 
### Validation Data

In [267]:
# Score the classifier on the held-out validation split (spamval / hamval).
print "Validation accuracy = ", getAcc(spamval, hamval)
print "Validation false positive rate = ", getFP(spamval, hamval)

Validation accuracy =  0.962017804154
Validation false positive rate =  0.00967351874244
None


In [268]:
# Score the classifier on the held-out test split (spamtest / hamtest).
print "Test accuracy = ", getAcc(spamtest, hamtest)
print "Test false positive rate = ", getFP(spamtest, hamtest)

Test accuracy =  0.970945745627
Test false positive rate =  0.00483383685801
None


### RESULTS: So we obtained 97% overall accuracy, but fp is 0.5%. This means that for every 1000 good mails, we will have about 5 good mails classified as spam. This isn't bad, but should be improved.

#### Where did we fail?

In [270]:
# Print every legitimate test mail that the classifier wrongly flags as spam,
# each followed by a blank line.
for item in hamtest:
    if not isSpam(item):
        continue
    print("%s \n" % (item,))

['subject', 'si', 'back', 'si', 'back', 'folsom'] 

['subject', 'congratulations', 'congratulations', 'expanded', 'role', 'hope', 'means', 'get', 'lots', 'money', 'fewer', 'hours'] 

['subject', 'please', 'note', 'new', 'email', 'address', 'effective', 'today', 'please', 'send', 'future', 'correspondence', 'staceykn', '@', 'yahoo', 'com', 'thanks'] 

['subject', 'registration', 'confirmation', 'spinner', 'com', 'thank', 'joining', 'spinner', 'com', 'web', 's', 'largest', 'source', 'free', 'streaming', 'music', 'just', 'wanted', 'confirm', 'registration', 'spinner', 'now', 'complete', 'access', 'spinner', 's', 'professionally', 'programmed', 'music', 'channels', 'entire', 'spinner', 'com', 'website', 'just', 'remind', 'player', 'website', '(', 'importantly', ')', 'music', 'totally', 'free', 'user', 'name', 'junglo', 'omitted', 'password', 'privacy', 'please', 'hang', 'email', 'can', 'easily', 'retrieve', 'user', 'name', 'forget', 'forget', 'password', '?', 'enter', 'user', 'name', 'emai

#### Here's what the failures seem like to me, in that order.
* Short mails are spam/only subject mails are spam.
* Get more money related mails are spam.
* Third one seems like a legitimate fail case.
* Legit fail.
* Legit fail.
* Legit fail - (Big fail)
* Seems like spam to me.
* Mostly fail. 