In [140]:
%matplotlib inline

import matplotlib.pyplot as plt
import unittest
import numpy as np
import math

In [111]:
# Toy corpus: each entry is [sentence, label].
data = [['my dog has flea problems help please', 'not abusive'],
        ['maybe not take him to dog park stupid', 'abusive'],
        ['my dalmation is so cute I love him', 'not abusive'],
        ['stop posting stupid worthless garbage', 'abusive'],
        ['mr licks ate my steak how to stop him', 'not abusive'],
        ['quit buying worthless dog food stupid', 'abusive']]

# Tokenize every sentence on single spaces.
X = [text.split(' ') for text, _tag in data]

# Encode the labels: 1 for 'abusive', 0 for anything else.
y = [int(tag == 'abusive') for _text, tag in data]

In [112]:
def bag_of_words(documents):
    """Return the vocabulary of a corpus.

    `documents` is an iterable of token sequences. The result is a `set`
    holding every distinct token that appears in any of them.
    """
    return {token for tokens in documents for token in tokens}

In [113]:
def word2vec(bow, document):
    """Encode `document` as a 0/1 presence vector over the vocabulary `bow`.

    The result has one entry per item of `bow`, in the order `bow` is
    iterated: 1 when that item is contained in `document`, otherwise 0.
    """
    return [int(token in document) for token in bow]

In [114]:
class TestNaiveBayes(unittest.TestCase):
    """Unit tests for the vocabulary and vectorization helpers."""

    def setUp(self):
        self.documents = [['hello', 'world'], ['hi', 'world']]
        self.bag_of_words = bag_of_words(self.documents)

    def test_bag_of_words(self):
        expected = {'hello', 'world', 'hi'}
        self.assertEqual(expected, self.bag_of_words)

    def test_word2vec(self):
        # The vocabulary is a set, whose iteration order can differ between
        # interpreter runs (string hashing is randomized). Sorting it pins
        # the column order so the expected vector is deterministic.
        vocabulary = sorted(self.bag_of_words)  # ['hello', 'hi', 'world']
        expected = [1, 0, 1]
        got = word2vec(vocabulary, self.documents[0])
        self.assertEqual(expected, got)


# argv=[''] keeps unittest from parsing the notebook kernel's command line;
# exit=False stops it from calling sys.exit() when the run finishes.
if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)

..
----------------------------------------------------------------------
Ran 2 tests in 0.003s

OK


In [127]:
def train(X, y):
    """Estimate naive Bayes parameters from word-presence vectors.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_words)
        One row per document, one column per vocabulary word. Plain nested
        lists are accepted as well as NumPy arrays.
    y : array-like, length n_samples
        Class labels. Entries equal to 1 are counted as class 1 (abusive);
        every other value is counted as class 0.

    Returns
    -------
    p0, p1 : numpy.ndarray, shape (n_words,)
        Natural log of the smoothed per-word frequency in class 0 and in
        class 1. Smoothing starts every word count at 1 and every class
        total at 1, so neither a zero count nor an empty class can produce
        log(0) or a division by zero.
    p_abusive : float
        sum(y) / n_samples, i.e. the fraction of samples labelled 1 when
        the labels are 0/1.

    Raises
    ------
    ValueError
        If X is not 2-D, contains no samples, or y does not have exactly
        one label per row of X.
    """
    X = np.asarray(X)
    y = np.asarray(y).reshape(-1)

    if X.ndim != 2:
        raise ValueError('X must be 2-D (n_samples, n_words), got ndim=%d' % X.ndim)
    n_samples = X.shape[0]
    if n_samples == 0:
        raise ValueError('X must contain at least one sample')
    if y.shape[0] != n_samples:
        raise ValueError('X has %d samples but y has %d labels'
                         % (n_samples, y.shape[0]))

    p_abusive = float(y.sum() / n_samples)
    is_abusive = y == 1

    def log_word_frequency(rows):
        # Per-word counts (+1) over the class's total word count (+1).
        # An empty selection sums to zeros, giving log(1 / 1) == 0.
        return np.log((1.0 + rows.sum(axis=0)) / (1 + rows.sum()))

    p0 = log_word_frequency(X[~is_abusive])
    p1 = log_word_frequency(X[is_abusive])

    return p0, p1, p_abusive

In [128]:
# Vocabulary over the whole training corpus.
bow = bag_of_words(X)

# One presence vector per document, stacked as the rows of a 2-D array.
# Built in a single expression so X_vec is only ever an ndarray and no
# loop variable is left behind in the notebook namespace.
X_vec = np.array([word2vec(bow, tokens) for tokens in X])
X_vec.shape

(6, 32)

In [139]:
# Fit the model on the vectorized corpus: p0 / p1 are the per-word
# log-probability arrays for class 0 / class 1, and p_abusive is
# sum(y) / n_samples as computed by train().
p0, p1, p_abusive = train(X_vec, y)

In [141]:
def classify(X, p0, p1, p_abusive):
    """Classify one word-presence vector with the trained parameters.

    Parameters
    ----------
    X : array-like, shape (n_words,)
        Presence vector of the document to classify.
    p0, p1 : array-like, shape (n_words,)
        Per-word log-probabilities for class 0 and class 1.
    p_abusive : float
        Prior probability of class 1, in [0, 1].

    Returns
    -------
    int
        1 when the class-1 log-score is strictly greater than the class-0
        log-score, otherwise 0.

    Raises
    ------
    ValueError
        If p_abusive lies outside [0, 1].
    """
    if not 0.0 <= p_abusive <= 1.0:
        raise ValueError('p_abusive must be within [0, 1], got %r' % (p_abusive,))

    features = np.asarray(X)

    # A prior of exactly 0 or 1 (e.g. a single-class training set) has a
    # log of -inf. Let NumPy produce that quietly instead of raising, so the
    # impossible class simply loses the comparison below.
    with np.errstate(divide='ignore'):
        log_prior_abusive = np.log(p_abusive)
        log_prior_clean = np.log(1 - p_abusive)

    score_abusive = np.dot(p1, features) + log_prior_abusive
    score_clean = np.dot(p0, features) + log_prior_clean

    return 1 if score_abusive > score_clean else 0

In [142]:
# Vectorize a friendly message against the training vocabulary, then
# predict its class (1 = abusive, 0 = not abusive).
X_test = word2vec(bow, ['love', 'my', 'dalmation'])
classify(X_test, p0, p1, p_abusive)

0

In [143]:
# Same check with a message made only of words seen in abusive posts.
X_test = word2vec(bow, ['stupid', 'garbage'])
classify(X_test, p0, p1, p_abusive)

1

In [154]:
import re

# Raw string: in a normal literal '\W' is an invalid escape sequence, which
# newer Python versions warn about.
# Use '+' rather than '*': r'\W*' can match the empty string, and since
# Python 3.7 re.split() would then also split between every character.
regexp = re.compile(r'\W+')

# The sentence ends with '.', so the split leaves an empty trailing token.
regexp.split('This book is the best book on Python or M.L that I have.')

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M',
 'L',
 'that',
 'I',
 'have',
 '']