In [1]:
import glob
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Dataset layout: FOLDER holds one sub-folder per class, each containing
# one '.txt' file per email. Paths are relative to the working directory.
FOLDER = '../enron1/'
HAM_FOLDER = 'ham/'
SPAM_FOLDER = 'spam/'

# Class labels. naive_bayes also uses them as positional indices
# (e.g. N_TRAINING[HAM]), so they must stay 0 and 1.
HAM = 0
SPAM = 1

# glob.glob returns matches in arbitrary, filesystem-dependent order.
# The train/test split below is positional (first P% of each list), so the
# lists are sorted to make the split reproducible across machines and runs.
HAM_LIST = sorted(glob.glob(FOLDER + HAM_FOLDER + '*.txt'))
SPAM_LIST = sorted(glob.glob(FOLDER + SPAM_FOLDER + '*.txt'))

In [3]:
class naive_bayes:
    '''Multinomial naive Bayes ham/spam classifier over email text files.

    The first part of each file list is used for training and the rest for
    testing. The module-level labels HAM and SPAM double as positional
    indices into the per-class arrays (N_TRAINING, prior, conditionals), so
    they are expected to be 0 and 1 respectively.

    attributes:
        ham_list, spam_list: lists of file paths, one email per file
        N, N_HAM, N_SPAM: total / per-class number of emails
        labels: 1d numpy array of labels, all ham first, then all spam
        N_TRAINING: numpy array [n_ham_training, n_spam_training]
        vocab: list of vocabulary terms learnt by start_training
        nvocab: number of terms in vocab
    '''

    def __init__(self, ham_list, spam_list):
        '''store the file lists and set up a default 80% training split
            parameters:
                ham_list: sequence of paths to ham emails
                spam_list: sequence of paths to spam emails'''
        # Copy into real lists so that `+` below is always concatenation.
        self.ham_list = list(ham_list)
        self.spam_list = list(spam_list)
        self.N_HAM = len(self.ham_list)
        self.N_SPAM = len(self.spam_list)
        self.N = self.N_HAM + self.N_SPAM
        self.labels = np.repeat([HAM, SPAM], [self.N_HAM, self.N_SPAM])

        self.N_TRAINING = self._split_counts(0.8)

        self.vocab = []
        self.nvocab = 0

    def _split_counts(self, fraction):
        '''return [n_ham, n_spam] training counts for a training fraction'''
        if not 0 <= fraction <= 1:
            raise ValueError('training fraction must be within [0, 1], got %r' % (fraction,))
        return np.asarray([int(np.floor(self.N_HAM * fraction)),
                           int(np.floor(self.N_SPAM * fraction))])

    def _testing_counts(self):
        '''return [n_ham, n_spam] counts of emails held out for testing'''
        # Per-class totals minus per-class training counts. Subtracting from
        # the overall total self.N would over-count both classes.
        return np.asarray([self.N_HAM, self.N_SPAM]) - self.N_TRAINING

    def nwords(self, X):
        '''return the number of distinct words in input matrix X
            parameters:
                X: 2d numpy array'''
        return np.count_nonzero(X.sum(axis = 0))

    def start_training(self, P_TRAINING = None):
        '''fit the vocabulary and estimate the model parameters
            parameters:
                P_TRAINING: fraction in [0, 1] of each class used for
                    training; None keeps the current self.N_TRAINING
            returns:
                prior: 1d array [P(ham), P(spam)]
                conditionals: 2d array, row HAM / SPAM holds the smoothed
                    P(word | class) for every vocabulary term
            raises:
                ValueError: if P_TRAINING is out of range or a class has no
                    training emails'''
        if P_TRAINING is not None:
            self.N_TRAINING = self._split_counts(P_TRAINING)
        N_TRAINING = self.N_TRAINING
        n_ham = int(N_TRAINING[HAM])
        n_spam = int(N_TRAINING[SPAM])
        if n_ham == 0 or n_spam == 0:
            # An empty class would give a zero prior (log(0)) and makes the
            # ham/spam row split below meaningless.
            raise ValueError('each class needs at least one training email')

        training = CountVectorizer(input = 'filename', decode_error = 'ignore')
        # Keep the document-term matrix sparse; a dense copy of the whole
        # corpus is large and not needed for column sums.
        training_X = training.fit_transform(self.ham_list[:n_ham] + self.spam_list[:n_spam])
        # get_feature_names() was removed in scikit-learn 1.2.
        if hasattr(training, 'get_feature_names_out'):
            self.vocab = list(training.get_feature_names_out())
        else:
            self.vocab = list(training.get_feature_names())
        self.nvocab = len(self.vocab)

        # Rows are ordered ham first, then spam, matching the input order.
        ham_counts = np.asarray(training_X[:n_ham].sum(axis = 0)).ravel()
        spam_counts = np.asarray(training_X[n_ham:].sum(axis = 0)).ravel()

        prior = N_TRAINING / N_TRAINING.sum()
        # Laplace smoothing: (count(w, c) + 1) / (total tokens in c + |V|).
        # The denominator is the total token count of the class, not the
        # number of distinct words, so each row sums to 1.
        con_ham = (ham_counts + 1) / (ham_counts.sum() + self.nvocab)
        con_spam = (spam_counts + 1) / (spam_counts.sum() + self.nvocab)
        conditionals = np.asarray([con_ham, con_spam])
        return prior, conditionals

    def classifier(self, prior, conditionals):
        '''classify the held-out emails (those not used for training)
            parameters:
                prior: 1d array [P(ham), P(spam)]
                conditionals: 2d array of P(word | class), one row per class
            returns:
                list of HAM / SPAM labels, held-out ham emails first,
                then held-out spam emails'''
        n_ham = int(self.N_TRAINING[HAM])
        n_spam = int(self.N_TRAINING[SPAM])
        # Slice from the training boundary: a negative-count slice such as
        # lst[-0:] would select the whole list when nothing is held out.
        test_files = self.ham_list[n_ham:] + self.spam_list[n_spam:]
        if not test_files:
            return []

        testing = CountVectorizer(input = 'filename', vocabulary = self.vocab, decode_error = 'ignore')
        testing_X = testing.transform(test_files)

        # Log-posterior (up to a constant) for every email and both classes:
        # sum_w count(w) * log P(w | c) + log P(c).
        log_scores = testing_X.dot(np.log(np.asarray(conditionals)).T) + np.log(np.asarray(prior))
        # Ties go to SPAM, as ham is chosen only on a strictly larger score.
        return np.where(log_scores[:, HAM] > log_scores[:, SPAM], HAM, SPAM).tolist()

    def accuracy(self, results):
        '''return the fraction of correctly classified held-out emails
            parameters:
                results: sequence of HAM / SPAM labels in the order
                    produced by classifier (ham emails first, then spam)
            raises:
                ValueError: if results is empty or its length differs from
                    the number of held-out emails'''
        test_label = np.repeat([HAM, SPAM], self._testing_counts())
        results = np.asarray(results)
        if results.size == 0:
            raise ValueError('results is empty; there is nothing to score')
        if results.size != test_label.size:
            raise ValueError('expected %d results, got %d' % (test_label.size, results.size))
        return float(np.mean(results == test_label))

In [4]:
# Build the model with a 0.9 training fraction, run the classifier, and
# leave the accuracy as the last expression so the cell displays it.
test = naive_bayes(ham_list = HAM_LIST, spam_list = SPAM_LIST)
prior, conditionals = test.start_training(P_TRAINING = 0.9)
results = test.classifier(prior = prior, conditionals = conditionals)
test.accuracy(results = results)

NameError: name 'numpy' is not defined