In [3]:
import numpy as np
import pandas as pd
import argparse, os, re, functools
from stemmer import stemDoc

#-----------------NAIVE BAYES-----------------

def initBag(ham):
    ''' (string) -> dict

        Read the text file at path ham and build a bag of words from it.
        Every line is split on whitespace; each resulting token becomes
        a key of the returned dictionary and its value is the number of
        times that token occurs in the whole file.
    '''
    counts = {}
    with open(ham, 'r') as stemmedFile:
        for row in stemmedFile:
            for token in row.split():
                # Missing tokens start from 0, so first sight yields 1
                counts[token] = counts.get(token, 0) + 1
    return counts

#--------------MCAP L2-------------------------------

def initBag1(doclist):
    ''' (list) -> dict

        Build a bag of words from doclist, an iterable of words.
        Unlike initBag, the value stored for each word is its relative
        frequency (occurrences of the word divided by the total number
        of words in doclist) rather than its raw count.
        An empty doclist yields an empty dictionary.
    '''
    freq = {}
    for token in doclist:
        freq[token] = freq.get(token, 0) + 1
    total = sum(freq.values())
    # Normalise counts to relative frequencies, keeping first-seen order
    return {token: count/total for token, count in freq.items()}

def sigmoid(z):
    ''' (number or ndarray) -> number or ndarray

        Return the logistic function 1 / (1 + e^(-z)).
        np.exp is applied to z, so an array argument is handled
        element by element and an array of the same shape is returned.
    '''
    # The original body referenced an undefined name `x`; the argument is z.
    return 1/(1+np.exp(-z))


def genDataArr(testHamDir, testSpamDir, uniqueWords, stopwords=""):
    ''' (string, string, list, string) -> DataFrame

        Build the document/attribute table used by the MCAP step.

        The returned DataFrame has one row per file found in testHamDir
        and testSpamDir, and one column per entry of uniqueWords followed
        by the two extra columns "THRESHOLD" and "CLASS". All cells start
        at 0.0.

        For every file in testHamDir the document is stemmed with stemDoc,
        turned into a bag of relative word frequencies with initBag1, and
        each frequency is written into the column named after the word.
        Words that are not in uniqueWords are ignored. The row's "CLASS"
        and "THRESHOLD" cells are then set to 1.

        Only the ham directory is processed; the rows reserved for the
        files in testSpamDir are left all-zero.

        uniqueWords is not modified. stopwords is forwarded to stemDoc as
        its second argument when it is non-empty; otherwise stemDoc is
        called with the document path only.
    '''
    hamDocs = os.listdir(testHamDir)
    numOfDocs = len(hamDocs) + len(os.listdir(testSpamDir))  # indexed rows

    # Work on a copy so the caller's list does not grow on every call
    # (appending in place produced duplicate columns on a re-run).
    columns = list(uniqueWords) + ["THRESHOLD", "CLASS"]
    colIndex = {name: pos for pos, name in enumerate(columns)}

    # Fill a plain array and wrap it once at the end; writing single cells
    # through df.loc on a very wide frame is far slower.
    data = np.zeros(shape=(numOfDocs, len(columns)))

    for ind, doc in enumerate(hamDocs):
        docPath = os.path.join(testHamDir, doc)
        # The original compared stopwords against a string literal broken
        # by a backslash-newline, i.e. a run of spaces, so the one-argument
        # branch could never be taken with the default "".
        if stopwords:
            listFromDoc = stemDoc(docPath, stopwords)
        else:
            listFromDoc = stemDoc(docPath)
        bag = initBag1(listFromDoc)

        # Move probabilities from the bag into this document's row
        for key, prob in bag.items():
            col = colIndex.get(key)
            if col is not None:
                data[ind, col] = prob

        # Set class to HAM and threshold to 1 (written last so a document
        # word spelled like one of these column names cannot override them)
        data[ind, colIndex["CLASS"]] = 1
        data[ind, colIndex["THRESHOLD"]] = 1

    # TODO: spam documents are not processed yet; their rows stay at zero.

    return pd.DataFrame(data, columns=columns)

In [4]:
#--------------MAIN--------------------
# Inputs are hard-coded here instead of being parsed from the command line:
# the two stemmed training files, the two document directories and the
# (empty) stop-word setting.
ham = "stemmedFiles/train-ham-stemmed.txt"
spam = "stemmedFiles/train-spam-stemmed.txt"
hamDir = "test/ham/"
spamDir = "test/spam/"
stopWords = ""

# Initialize word-count bags from the stemmed training email files
hamBag = initBag(ham)
spamBag = initBag(spam)

# Generate the list of all unique words seen in either bag
l = set(hamBag.keys())
l2 = set(spamBag.keys())
#total = len(hamBag) + len(spamBag) - len(l.intersection(l2))
tot = l.union(l2)
# attrlst is the vocabulary handed to genDataArr as its column names
attrlst = list(tot)
#print(total)
# Calculate priors for NB from the number of files in each directory.
# NOTE(review): hamDir/spamDir point at the test/ directories, so these
# priors reflect test-set document counts - confirm whether the training
# directories were intended. Both directories empty would divide by zero.
hamCount = len(os.listdir(hamDir))    
spamCount = len(os.listdir(spamDir))    
prior_ham = hamCount / (hamCount+spamCount) # number of hamDocs/totalDocs
prior_spam = spamCount / (hamCount+spamCount) # number of spamDocs/totalDocs

print("Running MCAP with L2")



Running MCAP with L2


In [5]:
# Build the per-document attribute table from the configured directories.
# A copy of attrlst is passed so the vocabulary list built earlier is never
# modified by the call and this cell gives the same result when re-run.
data_df = genDataArr(hamDir, spamDir, list(attrlst))
print(data_df)

     amend  southeastern  gushforbad  chandeli  3910  sold  cake  dipl  god  \
0      0.0           0.0         0.0       0.0   0.0   0.0   0.0   0.0  0.0   
1      0.0           0.0         0.0       0.0   0.0   0.0   0.0   0.0  0.0   
2      0.0           0.0         0.0       0.0   0.0   0.0   0.0   0.0  0.0   
3      0.0           0.0         0.0       0.0   0.0   0.0   0.0   0.0  0.0   
4      0.0           0.0         0.0       0.0   0.0   0.0   0.0   0.0  0.0   
5      0.0           0.0         0.0       0.0   0.0   0.0   0.0   0.0  0.0   
6      0.0           0.0         0.0       0.0   0.0   0.0   0.0   0.0  0.0   
7      0.0           0.0         0.0       0.0   0.0   0.0   0.0   0.0  0.0   
8      0.0           0.0         0.0       0.0   0.0   0.0   0.0   0.0  0.0   
9      0.0           0.0         0.0       0.0   0.0   0.0   0.0   0.0  0.0   
10     0.0           0.0         0.0       0.0   0.0   0.0   0.0   0.0  0.0   
11     0.0           0.0         0.0       0.0   0.0