In [1]:
import numpy as np
import os
import pickle
import math

In [14]:
def _read_mails(directory, filenames):
    """Return the full text of every file in `filenames` under `directory`.

    Each file is opened in a `with` block so its handle is closed as soon as
    it has been read, instead of being left open until garbage collection.
    """
    mails = []
    for fname in filenames:
        with open(os.path.join(directory, fname)) as f:
            # f.read() yields the same string as concatenating every line.
            mails.append(f.read())
    return mails


# NOTE(review): os.listdir returns entries in arbitrary order, and the
# train/test split further down slices these lists by position, so the split
# can differ between machines. Confirm whether a fixed order is wanted.
spamFiles = os.listdir('Spam')
hamFiles = os.listdir('Ham')

# One raw mail string per file, in the same order as the file listings.
spam = _read_mails('Spam', spamFiles)
ham = _read_mails('Ham', hamFiles)

In [3]:
# Bundle the raw mails of both classes under their class names.
data = dict(spam=spam, ham=ham)

In [4]:
# Persist the raw dataset. The `with` block guarantees the file is flushed
# and closed once the dump finishes, so Dataset.pickle is complete on disk.
with open('Dataset.pickle', 'wb') as pickleOut:
    pickle.dump(data, pickleOut)

### Preprocess the mails: define stop words, tokenize and lowercase

In [5]:
# Lowercase English stop words that are dropped from every mail before
# counting. "ours" and "ourselves" are separate entries (they were previously
# fused into one tab-separated string, so neither word was ever filtered).
stopwords = set([
    "a", "about", "above", "after", "again", "against", "all", "am", "an",
    "and", "any", "are", "aren't", "as", "at", "be", "because", "been",
    "before", "being", "below", "between", "both", "but", "by", "can't",
    "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't",
    "doing", "don't", "down", "during", "each", "few", "for", "from",
    "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having",
    "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself",
    "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm",
    "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself",
    "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor",
    "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our",
    "ours", "ourselves", "out", "over", "own", "same", "shan't", "she",
    "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such",
    "than", "that", "that's", "the", "their", "theirs", "them", "themselves",
    "then", "there", "there's", "these", "they", "they'd", "they'll",
    "they're", "they've", "this", "those", "through", "to", "too", "under",
    "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're",
    "we've", "were", "weren't", "what", "what's", "when", "when's", "where",
    "where's", "which", "while", "who", "who's", "whom", "why", "why's",
    "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're",
    "you've", "your", "yours", "yourself", "yourselves",
])

In [15]:
# Tokenize every spam mail in place: punctuation and line breaks become
# spaces, "!" is padded so it survives as its own token, three stray control
# bytes are removed, and the text is then split on single spaces. Empty
# tokens produced by consecutive spaces are still present after this step.
for idx in range(len(spam)):
    text = spam[idx]
    for old, new in ((".", " "), ("-", " "),
                     ("(", " "), (")", " "),
                     (",", " "), ('\r', " "),
                     (":", " "), ("\n", " "),
                     ("/", " "), ("!", " ! "), ("\x00", ""),
                     ("\xff", ""), ("\x10", "")):
        text = text.replace(old, new)
    spam[idx] = text.split(" ")

In [16]:
# Tokenize every ham mail in place: punctuation and line breaks become
# spaces, "!" is padded so it survives as its own token, three stray control
# bytes are removed, and the text is then split on single spaces. Empty
# tokens produced by consecutive spaces are still present after this step.
for idx in range(len(ham)):
    text = ham[idx]
    for old, new in ((".", " "), ("-", " "),
                     ("(", " "), (")", " "),
                     (",", " "), ('\r', " "),
                     (":", " "), ("\n", " "),
                     ("/", " "), ("!", " ! "), ("\x00", ""),
                     ("\xff", ""), ("\x10", "")):
        text = text.replace(old, new)
    ham[idx] = text.split(" ")

In [17]:
# Lowercase the ham tokens, dropping empty strings and stop words.
for idx in range(len(ham)):
    kept = []
    for item in ham[idx]:
        token = item.lower()
        if item != "" and token not in stopwords:
            kept.append(token)
    ham[idx] = kept

In [18]:
# Lowercase the spam tokens, dropping empty strings and stop words.
for idx in range(len(spam)):
    kept = []
    for item in spam[idx]:
        token = item.lower()
        if item != "" and token not in stopwords:
            kept.append(token)
    spam[idx] = kept

### Split into train and test set

In [19]:
# Positional 70/30 split: the first 70% of each class stays in `spam`/`ham`
# for training, the remainder becomes the held-out test set.
spamCut = int(len(spam)*0.7)
hamCut = int(len(ham)*0.7)

spamtest, spam = spam[spamCut:], spam[:spamCut]
hamtest, ham = ham[hamCut:], ham[:hamCut]

### We calculate the class priors from the training samples (after the train/test split)

In [20]:
# Class priors: the fraction of mails currently in `spam`, and its complement.
Pspam = float(len(spam)) / (len(spam) + len(ham))
Pham = 1 - Pspam

In [21]:
# Display the two class priors as a (spam, ham) pair.
Pspam, Pham

(0.509278874671638, 0.490721125328362)

### We also need the conditional probability of each word given that the mail is spam, and given that it is ham (non-spam).

In [22]:
# Vocabulary: every distinct token seen in the spam or ham mails.
wordList = set()
for lst in spam:
    wordList.update(lst)
for lst in ham:
    wordList.update(lst)

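### P_wc = probability of finding word w given document class c, with additive smoothing alpha over the vocabulary V (the set of all words).
### $P(w_i \mid c) = \dfrac{\mathrm{count}(w_i, c) + \alpha}{\sum_{w \in V} \left(\mathrm{count}(w, c) + \alpha\right)}$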

In [24]:
# Per-class word statistics used by the conditional probabilities:
#   countSpam / countHam           -> occurrences of each word in that class
#   totalSpamWords / totalHamWords -> total number of tokens in that class
countSpam = {}
countHam = {}

for lst in spam:
    for word in lst:
        countSpam[word] = countSpam.get(word, 0) + 1

for lst in ham:
    for word in lst:
        countHam[word] = countHam.get(word, 0) + 1

# Every token contributes exactly one to its class total.
totalSpamWords = sum(len(lst) for lst in spam)
totalHamWords = sum(len(lst) for lst in ham)

In [33]:
# Additive smoothing pseudo-count that P_wc adds to every word count.
alpha = 10

In [34]:
def P_wc(word, spam):
    """Return the log of the smoothed probability of `word` given a class.

    Parameters:
        word: the token to look up.
        spam: truthy selects the spam statistics, falsy the ham statistics.
            Inside this function the name shadows the module-level `spam`
            list.

    Returns:
        log((count + alpha) / (class token total + alpha * vocabulary size)),
        where a word that was never counted for the class has count 0.
    """
    if spam:
        counts, total = countSpam, totalSpamWords
    else:
        counts, total = countHam, totalHamWords

    occ = counts.get(word, 0)
    return np.log((occ + alpha)*1.0 / (total + alpha*len(wordList)))

In [71]:
def guess(email, threshold=0.498):
    """Classify a tokenized mail; a truthy result means "spam".

    Parameters:
        email: iterable of word tokens.
        threshold: decision cut-off applied to ps / (ps + ph). It defaults
            to 0.498, the value that was previously hard-coded, so existing
            calls behave exactly as before.

    Returns:
        The result of `ps / (ps + ph) <= threshold`, where `ps` and `ph` are
        the log prior of each class plus the sum of P_wc over every token.

    NOTE(review): this compares a ratio of log scores rather than the scores
    themselves, so it is a tuned heuristic and not a posterior probability.
    """
    ps, ph = np.log(Pspam), np.log(Pham)

    # One pass accumulates both class scores; each sum sees the tokens in the
    # same order as before, so the floating-point results are unchanged.
    for word in email:
        ps += P_wc(word, 1)
        ph += P_wc(word, 0)

    return ps*1.0/(ps+ph) <= threshold
            
   

### Overall accuracy

In [73]:
# Count the correct predictions on the held-out mails: ham that is not
# flagged plus spam that is flagged.
acc = 0

for item in hamtest:
    if not guess(item):
        acc += 1

for item in spamtest:
    if guess(item):
        acc += 1

# Fraction of test mails classified correctly. print(...) with a single
# argument is valid on both Python 2 and Python 3.
accuracy = acc*1.0/(len(spamtest) + len(hamtest))
print(accuracy)

0.965302491103


### False positive

In [74]:
# Count the held-out ham mails that are wrongly flagged as spam.
fp = 0
for item in hamtest:
    if guess(item):
        fp += 1

# Fraction of ham test mails flagged as spam. print(...) with a single
# argument is valid on both Python 2 and Python 3.
fpRate = fp*1.0/len(hamtest)
print(fpRate)

0.00946817082998


### RESULTS: We obtained 96.5% overall accuracy with a false-positive rate of about 0.9%. This means that out of every 100 ham mails, about 1 legitimate mail is wrongly predicted as spam. We must try to bring this rate down as much as possible.