# Naive Bayesian Anti-Spam Filtering

In [5]:
class Folder:
    """Container holding two separate lists of emails: spam and legitimate."""

    def __init__(self):
        # Both collections start out empty; the add* methods fill them.
        self.spamEmail, self.legitEmail = [], []

    def addSpamEmail(self, email):
        """Append *email* to the end of the spam list."""
        self.spamEmail += [email]

    def addLegitimateEmail(self, email):
        """Append *email* to the end of the legitimate list."""
        self.legitEmail += [email]

In [6]:
class Word:
    """Record for one token: its text plus a set of numeric statistics.

    Every statistic starts at zero; the code that builds the training set
    is responsible for filling them in.
    """

    def __init__(self, content):
        # The token text this record describes.
        self.content = content

        # All statistics are zero-initialised.
        for statName in (
            "mutualInfo",
            "notPresentSpamCount",
            "notPresentLegitCount",
            "presentSpamCount",
            "presentLegitCount",
            "spamDocumentCount",
            "legitDocumentCount",
        ):
            setattr(self, statName, 0)

In [7]:
import os

# Module-level state shared between the notebook cells.
trainingDistinctWords = dict()  # token text -> Word record
trainingSpamEmails = list()
trainingLegitEmails = list()
folderList = list()  # receives one Folder per part directory loaded
# Counters reserved for word totals; not touched by the cells shown here.
nWordsSpam = nWordsLegit = 0

In [8]:
# loads emails given a path
def loadEmails(path, numParts=10):
    """Read the numbered part directories and append them to ``folderList``.

    The directories ``path + "1"`` through ``path + str(numParts)`` are
    listed in turn.  Each one yields a new Folder: files whose name starts
    with ``'sp'`` are stored as spam, every other file as legitimate.  The
    Folder is then appended to the module-level ``folderList``.

    Args:
        path: Prefix of the part directories; the part number is appended
            directly to it.
        numParts: How many part directories to read.  Defaults to 10, the
            value that used to be hard-coded.

    Raises:
        OSError: If a part directory cannot be listed or a file cannot be
            opened.
    """
    print("Loading emails...")
    for i in range(1, numParts + 1):
        partPath = path + str(i)
        partFolder = Folder()
        print("Loading email[",i,"]...")
        for filename in os.listdir(partPath):
            # os.path.join uses the host separator instead of a literal
            # backslash, and the with-block closes every file right after
            # it has been read instead of leaking the handle.
            with open(os.path.join(partPath, filename)) as emailFile:
                content = emailFile.read()

            if filename.startswith('sp'):
                partFolder.addSpamEmail(content)
            else:
                partFolder.addLegitimateEmail(content)

        folderList.append(partFolder)

In [9]:
def _tallyDocumentCounts(emails, distinctWords, isSpam, spamTotal, legitTotal):
    """Update per-token document counts in *distinctWords* for one class.

    Each email is split on whitespace and reduced to its set of distinct
    tokens, so a token is counted at most once per email.  A token seen for
    the first time gets a Word whose "not present" counts start at the full
    class sizes; the count for the current class is then moved from
    "not present" to "present".
    """
    for email in emails:
        for token in set(email.split()):
            word = distinctWords.get(token)
            if word is None:
                # Word() already starts both "present" counts at zero.
                word = Word(token)
                word.notPresentSpamCount = spamTotal
                word.notPresentLegitCount = legitTotal
                distinctWords[token] = word

            if isSpam:
                word.presentSpamCount += 1
                word.notPresentSpamCount -= 1
            else:
                word.presentLegitCount += 1
                word.notPresentLegitCount -= 1


def preparingTrainingSet(testingIndex):
    """Build the training set from every folder except *testingIndex*.

    The module-level ``trainingSpamEmails``, ``trainingLegitEmails`` and
    ``trainingDistinctWords`` are rebuilt from scratch.  Previously these
    names were rebound as locals, so the module-level collections stayed
    empty and the computed training set was thrown away on return.

    For every distinct whitespace-separated token, the Word record stores
    in how many spam / legitimate training emails the token is present and
    in how many it is absent (document counts, not term frequencies).

    Args:
        testingIndex: Index into ``folderList`` of the folder to hold out.

    Returns:
        The tuple ``(trainingSpamEmails, trainingLegitEmails,
        trainingDistinctWords)``; the same objects are also stored in the
        module-level names.
    """
    global trainingSpamEmails, trainingLegitEmails, trainingDistinctWords

    print("Preparing training set (find distinct words and load training spam and legit emails)...")

    trainingSpamEmails = []
    trainingLegitEmails = []
    trainingDistinctWords = {}

    # Gather the emails of every folder except the held-out one.
    for i, folder in enumerate(folderList):
        if i != testingIndex:
            print("Preparing folder[",i,"]...")
            trainingSpamEmails += folder.spamEmail
            trainingLegitEmails += folder.legitEmail

    print("Spam: ", len(trainingSpamEmails))
    print("Legit: ", len(trainingLegitEmails))

    spamTotal = len(trainingSpamEmails)
    legitTotal = len(trainingLegitEmails)

    # Legitimate emails first, then spam, as in the original two loops.
    _tallyDocumentCounts(trainingLegitEmails, trainingDistinctWords,
                         False, spamTotal, legitTotal)
    _tallyDocumentCounts(trainingSpamEmails, trainingDistinctWords,
                         True, spamTotal, legitTotal)

    print("Training distinct words: ", len(trainingDistinctWords))

    return trainingSpamEmails, trainingLegitEmails, trainingDistinctWords

In [10]:
#start
# Build the directory prefix with os.path.join so the separator matches the
# host OS; on Windows this is still 'data\\bare\\part'.
loadEmails(os.path.join('data', 'bare', 'part'))

# Hold out each loaded folder in turn. The fold count follows folderList
# instead of repeating a hard-coded 10.
for i in range(len(folderList)):
    preparingTrainingSet(i)

Loading emails...
('Loading email[', 1, ']...')
('Loading email[', 2, ']...')
('Loading email[', 3, ']...')
('Loading email[', 4, ']...')
('Loading email[', 5, ']...')
('Loading email[', 6, ']...')
('Loading email[', 7, ']...')
('Loading email[', 8, ']...')
('Loading email[', 9, ']...')
('Loading email[', 10, ']...')
Preparing training set (find distinct words and load training spam and legit emails)...
('Preparing folder[', 1, ']...')
('Preparing folder[', 2, ']...')
('Preparing folder[', 3, ']...')
('Preparing folder[', 4, ']...')
('Preparing folder[', 5, ']...')
('Preparing folder[', 6, ']...')
('Preparing folder[', 7, ']...')
('Preparing folder[', 8, ']...')
('Preparing folder[', 9, ']...')
('Spam: ', 433)
('Legit: ', 2171)
('Training distinct words: ', 62333)
Preparing training set (find distinct words and load training spam and legit emails)...
('Preparing folder[', 0, ']...')
('Preparing folder[', 2, ']...')
('Preparing folder[', 3, ']...')
('Preparing folder[', 4, ']...')
('Pre