In [210]:
import os
import re

import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC

from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.classify.util import accuracy

from collections import Counter

## Loading Data

The first step is to load our sample data for both spam and ham. For this, we created a utility called loader which, given an input file path, reads the content from the file and append it to a python list.

In [211]:
def loader(file_input):
    data = []
    for (dirpath, dirnames, filenames) in os.walk(file_input):
        for file in filenames:
            path = os.path.join(dirpath, file)
            with open(path, encoding='latin-1') as f:
                data.append(f.read())
                f.close()
    return data

In [212]:
file_input = './data/enron1/ham'
ham = loader(file_input)

In [213]:
file_input = './data/enron1/spam'
spam = loader(file_input)

In [214]:
# Tokenize
patt = re.compile(r'\W')
stops = set(stopwords.words('english'))

def process_words(data):
    words = word_tokenize(data)

    # Remove stop words
    words = [word for word in words if word not in stops]

    # Remove special characters
    words = [word for word in words if not patt.search(word)]

    # Remove digit
    words = [word for word in words if not word.isdigit()]

    return ' '.join(words)

In [219]:
ham_data = [[process_words(words), 0] for words in ham]
spam_data = [[process_words(words), 1] for words in spam]
data = np.array(ham_data + spam_data)
print('done')

done


In [224]:
# X = np.random.randint(5, size = (6, 100))
# y = np.array([1, 2, 3, 4, 5, 6])
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data[:, 0])
y = data[:, 1]
clf = MultinomialNB()
clf.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [250]:
text = 'december see attached file'

In [251]:
v = vectorizer.transform([process_words(text)])
clf.predict(v)

array(['0'], 
      dtype='<U1')

In [234]:
clf2 = LinearSVC()
clf2.fit(X, y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [252]:
v = vectorizer.transform([process_words(text)])
clf2.predict(v)

array(['0'], 
      dtype='<U21442')