In [85]:
import unittest
from functools import reduce
import numpy as np

In [86]:
def load_data():
    X = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'pleas'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmatian', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    y = [0, 1, 0, 1, 0, 1] # 1 is abusive, 0 is not
    return X, y

In [87]:
def create_vocab_list(dataset):
    if (len(dataset) == 0): return []
    vocabset = set(reduce(lambda x, y: x + y, dataset))
    return list(vocabset)

In [88]:
def word2vec(vocab_list, words):
    return [1 if vocab in words else 0 for vocab in vocab_list]

In [89]:
def create_train_matrix(vocab_list, X):
    return [word2vec(vocab_list, feat) for feat in X]

In [90]:
def train_naive_bayes(train_matrix, train_category):
    num_train_docs = len(train_matrix)
    num_words = len(train_matrix[0])
    
    # prefix p for probabilities
    p_abusive = np.sum(train_category) / float(num_train_docs)
    p0_num = np.ones(num_words) # np.zeros(num_words)
    p1_num = np.ones(num_words) # np.zeros(num_words)
    
    p0_den = 2.0 # 0.0
    p1_den = 2.0 # 0.0
    
    for i in range(num_train_docs):
        if train_category[i] == 1:
            p1_num += train_matrix[i]
            p1_den += sum(train_matrix[i])
        else:
            p0_num += train_matrix[i]
            p0_den += sum(train_matrix[i])
    p0_vect = np.log(p0_num / p0_den)
    p1_vect = np.log(p1_num / p1_den)
    return p0_vect, p1_vect, p_abusive

In [91]:
def classify_naive_bayes(vec2_classify, p0_vec, p1_vec, p_class1):
    p1 = sum(vec2_classify * p1_vec) + log(p_class1)
    p0 = sum(vec2_classify * p0_vec) + log(1.0 - p_class1)
    if p1 > p0:
        return 1
    else:
        return 0

In [101]:
class NaiveBayesTestCase(unittest.TestCase):    
    def test_load_data(self):
        X, y = load_data()
        self.assertEqual(len(X), len(y))

    def test_create_vocab_list_with_empty_params(self):
        out = create_vocab_list([])
        self.assertEqual(out, [])

    def test_create_vocab_list_with_params(self):
        X_test = [['a', 'b'], ['c', 'd']]
        out = create_vocab_list(X_test)
        self.assertEqual(len(out), 4)
        
    def test_word2vec(self):
        vocab_list = ['a', 'b', 'c', 'd']
        expected = [0, 1, 1, 0]
        got = word2vec(vocab_list, ['b', 'c'])
        self.assertEqual(expected, got)
    
    def test_create_train_matrix(self):
        X_test = [['a', 'b'], ['c', 'd']]
        vocab_list = create_vocab_list(X_test)
        got = create_train_matrix(vocab_list, X_test)
        self.assertEqual(len(got), 2)
    
    def test_train_naive_bayes(self):
        X_test = [['a', 'a'], ['a', 'b'], ['b', 'c'], ['b', 'c']]
        y = [1, 1, 0, 0]
        
        vocab_list = create_vocab_list(X_test)
        train_matrix = create_train_matrix(vocab_list, X_test)
        p0, p1, p_abusive = train_naive_bayes(train_matrix, y)
        self.assertEqual(p_abusive, 0.5)
    
    def test_naive_bayes(self):
        X, y = load_data()
        vocab_list = create_vocab_list(X)
        train_matrix = create_train_matrix(vocab_list, X)
        p0, p1, p_abusive = train_naive_bayes(train_matrix, y)
        
        # Test not abusive
        X_test = ['love', 'my', 'dalmatian']
        y_test = word2vec(vocab_list, X_test)
        expected = 0
        got = classify_naive_bayes(y_test, p0, p1, p_abusive)
        self.assertEqual(expected, got)
        
        # Test abusive
        X_test = ['stupid', 'garbage']
        y_test = word2vec(vocab_list, X_test)
        expected = 1
        got = classify_naive_bayes(y_test, p0, p1, p_abusive)
        self.assertEqual(expected, got)
        

    def test_unique(self):
        items = [1, 1, 2]
        unique = list(set(items))
        self.assertEqual(unique, [1,2])

if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

........
----------------------------------------------------------------------
Ran 8 tests in 0.012s

OK


In [93]:
X, y = load_data()
X, y

([['my', 'dog', 'has', 'flea', 'problems', 'help', 'pleas'],
  ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
  ['my', 'dalmatian', 'is', 'so', 'cute', 'I', 'love', 'him'],
  ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
  ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
  ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']],
 [0, 1, 0, 1, 0, 1])

In [94]:
vocab_list = create_vocab_list(X)
vocab_list

['flea',
 'mr',
 'so',
 'steak',
 'licks',
 'pleas',
 'him',
 'love',
 'take',
 'dog',
 'how',
 'stupid',
 'ate',
 'buying',
 'food',
 'worthless',
 'garbage',
 'quit',
 'help',
 'park',
 'my',
 'problems',
 'stop',
 'cute',
 'not',
 'is',
 'posting',
 'has',
 'maybe',
 'to',
 'I',
 'dalmatian']

In [95]:
for i, v in enumerate(X):
    print('row {} -> {}'.format(i, word2vec(vocab_list, v)))

row 0 -> [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
row 1 -> [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0]
row 2 -> [0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1]
row 3 -> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0]
row 4 -> [0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0]
row 5 -> [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [96]:
create_train_matrix(vocab_list, X)

[[1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  0],
 [0,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  1],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0],
 [0,
  1,
  0,
  1,
  1,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]]