In [37]:
import os
import re

import numpy as np

from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import random

## Loading Data

The first step is to load our sample data for both spam and ham. For this, we created a utility called `loader` which, given an input directory path, walks the directory tree, reads the content of every file it finds, and appends each one to a Python list.

In [2]:
def loader(file_input):
    """Read every file under a directory tree into a list of strings.

    Args:
        file_input: Path of the root directory. It is walked recursively,
            so files in sub-directories are included as well.

    Returns:
        A list holding the full text of each file, decoded as latin-1.
        Files are visited directory by directory in sorted name order, so
        the result is the same on every run and every filesystem. The list
        is empty when the directory is missing or contains no files.
    """
    data = []
    for dirpath, dirnames, filenames in os.walk(file_input):
        # os.walk yields names in arbitrary, filesystem-dependent order.
        # Sorting dirnames in place also fixes the order of descent.
        dirnames.sort()
        for name in sorted(filenames):
            path = os.path.join(dirpath, name)
            # latin-1 maps every byte to a character, so decoding never fails.
            # The with-block closes the file; no explicit close() is needed.
            with open(path, encoding='latin-1') as f:
                data.append(f.read())
    return data

In [3]:
# Directory holding the "ham" messages of the enron1 sample.
file_input = './data/enron1/ham'
# One string per file found under that directory.
ham = loader(file_input)

In [4]:
# Directory holding the "spam" messages of the enron1 sample.
file_input = './data/enron1/spam'
# One string per file found under that directory.
spam = loader(file_input)

In [28]:
# Tokenize
# Matches any single non-word character; intended for the special-character
# filter in process_words.
patt = re.compile(r'\W')
# NLTK's English stop-word list, held in a set; intended for the stop-word
# filter in process_words.
stops = set(stopwords.words('english'))

def process_words(data, lowercase=False, remove_stopwords=False,
                  remove_special=False, remove_digits=False):
    """Turn a raw text into the feature dict expected by the classifier.

    The text is tokenized with ``word_tokenize`` and each distinct token
    becomes a key mapped to ``True``. All clean-up steps are opt-in and off
    by default, so calling ``process_words(data)`` keeps every token as-is.

    Args:
        data: The text to tokenize.
        lowercase: If true, lowercase every token first.
        remove_stopwords: If true, drop tokens found in the module-level
            ``stops`` set.
        remove_special: If true, drop tokens containing a character matched
            by the module-level ``patt`` regex.
        remove_digits: If true, drop tokens made up only of digits.

    Returns:
        A dict mapping each remaining distinct token to ``True``.
    """
    words = word_tokenize(data)

    # Lowercase first so that the stop-word check below sees normalised tokens.
    if lowercase:
        words = [word.lower() for word in words]

    if remove_stopwords:
        words = [word for word in words if word not in stops]

    if remove_special:
        words = [word for word in words if not patt.search(word)]

    if remove_digits:
        words = [word for word in words if not word.isdigit()]

    # Duplicates collapse to a single key; only token presence is recorded.
    return dict.fromkeys(words, True)

In [39]:
# Pair each message's feature dict with its class label:
# spam messages are tagged 'neg', ham messages are tagged 'pos'.
spam_data = [(process_words(message), 'neg') for message in spam]
ham_data = [(process_words(message), 'pos') for message in ham]

# Single training set: all spam examples followed by all ham examples.
all_data = spam_data + ham_data
print('done')

done


In [42]:
# Fit the Naive Bayes classifier on the labelled (features, label) pairs.
clf = NaiveBayesClassifier.train(all_data)
print('done')

done


In [43]:
# Print the tokens whose presence most strongly favours one label over the other.
clf.show_most_informative_features()

Most Informative Features
               forwarded = True              pos : neg    =    247.5 : 1.0
                     hou = True              pos : neg    =    234.2 : 1.0
                    2004 = True              neg : pos    =    198.2 : 1.0
            prescription = True              neg : pos    =    160.7 : 1.0
                     nom = True              pos : neg    =    154.3 : 1.0
                    pain = True              neg : pos    =    128.1 : 1.0
                    spam = True              neg : pos    =    108.5 : 1.0
                     ect = True              pos : neg    =    103.9 : 1.0
                     sex = True              neg : pos    =    103.6 : 1.0
                featured = True              neg : pos    =     92.2 : 1.0


In [44]:
# Sanity check: classify a short hand-written message.
text = 'fake babe is amazing'
# Build the features the same way as for the training messages.
text_data = process_words(text)
# Last expression of the cell, so the predicted label is displayed.
clf.classify(text_data)

'neg'

In [45]:
# Second hand-written message, to contrast with the previous example.
text = 'december is amazing'
# Build the features the same way as for the training messages.
text_data = process_words(text)
# Last expression of the cell, so the predicted label is displayed.
clf.classify(text_data)

'pos'