In [1]:
import numpy as np
import re
from os import listdir

In [22]:
def createVocabList(datasets):
    """
    build a set including only unique tokens from all of sub-Text 
    """
    vocabSet = []
    for data in datasets:
        vocabSet.extend(data)
    vocabSet = np.unique(vocabSet) 
    return vocabSet


def word2Vec(vocabSet, inputSet):  # set-of-words model
    vector = np.zeros(len(vocabSet))
    for token in inputSet:
        if token in vocabSet:
            vector[list(vocabSet).index(token)] = 1 # set the value to 1 at the corresponds position
    return vector

def trainNB(trainMat, trainLabels):
    """
    Compute the probability for each token in whole training set
    that this token/word/feature exists in the text sets with certain
    label 0(not abusive) or 1(abusive)
    
    Input: tokenized training texts and correponding class vectors
    return: pAbusive--the prob. of abusive texts in the training sets
            pNotAbusive--the prob. of non-abusive texts in the training sets
            p0Vect--a vector contains the prob. for each token that this token
                    exists in  non-abusive texts 
            p1Vect--a vector contains the prob. for each token that this token
                    exists in abusive texts
    """
    trainMat = np.array(trainMat)
    trainLabels = np.array(trainLabels)
    numTrainMat = len(trainMat)
    pAbusive = len(trainLabels[trainLabels == 1]) / numTrainMat
    pNotAbusive = 1 - pAbusive
    nonAbusiveSets = trainMat[trainLabels == 0]  # sub-traingsets with class 0: non-abusive
    abusiveSets = trainMat[trainLabels == 1]  # # sub-traingsets with class 1: abusive
    # Compute the probability for each token in whole training set
    # that this token/word/feature exists in the label=0 texts
    p0Vect = (np.sum(nonAbusiveSets, axis=0) + 0.5) / (len(nonAbusiveSets) + 0.5) # add a tiny fraction 0.5 to avoid numerical problem
    
    # Compute the probability for each token in whole training set
    # that this token/word/feature exists in the label=1 texts
    p1Vect = (np.sum(abusiveSets, axis=0) + 0.5) / (len(abusiveSets) + 0.5)
    return pNotAbusive, pAbusive, p0Vect, p1Vect


def testNB(testsets, p0Vect, p1Vect, pNotAbusive, pAbusive):
    testsets = np.array(testsets.reshape(-1, 1))
    length = len(testsets)
    # the prob. of being non-absuive text, using p0Vect
    likelihoodVect0 = np.zeros((length, 1))
    for i in range(length):
        if testsets[i] == 1:
            likelihoodVect0[i] = p0Vect[i]
        else:
            likelihoodVect0[i] = 1 - p0Vect[i]
    likelihoodNotAbusive = np.prod(likelihoodVect0)  # production for each element: satisfy the iid. assumption for naive bayes
    posteriorNotAbusive = likelihoodNotAbusive * pNotAbusive  # posterior prob. of sets 
    # the prob. of being absuive text, using p1Vect
    likelihoodVect1 = np.zeros((length, 1))
    for i in range(length):
        if testsets[i] == 1:
            likelihoodVect1[i] = p1Vect[i]
        else:
            likelihoodVect1[i] = 1 - p1Vect[i]
    likelihoodAbusive = np.prod(likelihoodVect1)  # production for each element: satisfy the iid. assumption for naive bayes
    posteriorAbusive = likelihoodAbusive * pAbusive  # posterior prob. of sets 
    
    if posteriorNotAbusive > posteriorAbusive:
        return 0
    else:
        return 1
    
def bagOfword2Vec(vocabSet, inputSet):  # bag-of-words model
    vector = np.zeros(len(vocabSet))
    for token in inputSet:
        if token in vocabSet:
            vector[list(vocabSet).index(token)] += 1 # set the value to 1 at the corresponds position
    return vector

def generateTokens(file):
    regEx = re.compile('\\W*') # filter any characters except words and numbers
    fileName = listdir(file)
    tokenSets = []
    for name in fileName:
        fr = open(file + name, encoding='gb18030', errors='ignore')
        lists = fr.read()
        tokens = regEx.split(lists.lower())
        tokens = [token for token in tokens if len(token) > 2]  # drop the tokens with too short length
        tokenSets.append(tokens)
    return tokenSets

def crossValidation(datasets, labels):
    # split datasets into training sets and testing sets
    n = len(datasets)
    datasets = np.array(datasets)
    accuracy = []
    for i in range(int(n / 5)):
        testIndex = np.random.permutation(n)[:int(n / 5)]  # pick n/5 sets at random as testing sets
        trainIndex = list(set(np.arange(0, n)).difference(set(testIndex)))  # the rest as training sets
        testSets = datasets[testIndex]
        testLabels = labels[testIndex]
        trainSets = datasets[trainIndex]
        trainLabels = labels[trainIndex]
        pHam, pSpam, p0Vect, p1Vect = trainNB(trainSets, trainLabels)
        
        errorCount = 0
        for i in range(len(testSets)):
            classLabel = testNB(testSets[i], p0Vect, p1Vect, pHam, pSpam)
            if classLabel != testLabels[i]:
                errorCount += 1
        accuracy.append(1 - errorCount / len(testLabels))
    accuracy = np.mean(accuracy)  # average value of 10 accuracy results
    return accuracy

In [6]:
def loadDataSet():
    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1]    #1 is abusive, 0 not
    return postingList,classVec

In [7]:
listOfPosts, listOfClasses = loadDataSet()
myVocabList = createVocabList(listOfPosts)
myVocabList

array(['I', 'ate', 'buying', 'cute', 'dalmation', 'dog', 'flea', 'food',
       'garbage', 'has', 'help', 'him', 'how', 'is', 'licks', 'love',
       'maybe', 'mr', 'my', 'not', 'park', 'please', 'posting',
       'problems', 'quit', 'so', 'steak', 'stop', 'stupid', 'take', 'to',
       'worthless'], dtype='<U9')

In [8]:
vector = word2Vec(myVocabList, listOfPosts[0])
vector

array([0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.])

In [9]:
trainMat = [bagOfword2Vec(myVocabList, listOfPost) for listOfPost in listOfPosts]
trainMat

[array([0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
        0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0.]),
 array([1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1.]),
 array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0.,
        1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0.]),
 array([0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1.])]

In [10]:
pNotAbusive, pAbusive, p0Vect, p1Vect = trainNB(trainMat, listOfClasses)
print("The prob. of non-abusive tests is:{}".format(pNotAbusive))
print("The prob. of abusive tests is:{}".format(pAbusive))
print(p0Vect)
print(p1Vect)

The prob. of non-abusive tests is:0.5
The prob. of abusive tests is:0.5
[0.42857143 0.42857143 0.14285714 0.42857143 0.42857143 0.42857143
 0.42857143 0.14285714 0.14285714 0.42857143 0.42857143 0.71428571
 0.42857143 0.42857143 0.42857143 0.42857143 0.14285714 0.42857143
 1.         0.14285714 0.14285714 0.42857143 0.14285714 0.42857143
 0.14285714 0.42857143 0.42857143 0.42857143 0.14285714 0.14285714
 0.42857143 0.14285714]
[0.14285714 0.14285714 0.42857143 0.14285714 0.14285714 0.71428571
 0.14285714 0.42857143 0.42857143 0.14285714 0.14285714 0.42857143
 0.14285714 0.14285714 0.14285714 0.14285714 0.42857143 0.14285714
 0.14285714 0.42857143 0.42857143 0.14285714 0.42857143 0.14285714
 0.42857143 0.14285714 0.14285714 0.42857143 1.         0.42857143
 0.42857143 0.71428571]


In [11]:
# test
testText1 = ['love', 'my', 'dalmation']
testText2 = ['stupid', 'garbage']
testsets1 = word2Vec(myVocabList, testText1)
testsets2 = word2Vec(myVocabList, testText2)
class1 = testNB(testsets1, p0Vect, p1Vect, pNotAbusive, pAbusive)
class2 = testNB(testsets2, p0Vect, p1Vect, pNotAbusive, pAbusive)
print(class1)
print(class2)

0
1


# Filtering spam e-mails

In [12]:
regEx = re.compile('\\W*') # filter any characters except words and numbers

hamFile = 'email/ham/'
spamFile = 'email/spam/'
hamTokenSets = generateTokens(hamFile)
spamTokenSets = generateTokens(spamFile)
emailLists = np.concatenate((hamTokenSets, spamTokenSets), axis=0)
emailClasses = np.concatenate((np.zeros(len(hamTokenSets)), np.ones(len(spamTokenSets))), axis=0)  # two classes,ham:0,spam:1
vocabSets = createVocabList(emailLists)
emailDatasets = [word2Vec(vocabSets, emailList) for emailList in emailLists]  # number:50 dimension: 694



In [29]:
accuracy = crossValidation(emailDatasets, emailClasses)
print("The accuracy is: %.2f" %(100 * accuracy) + '%')

The accuracy is: 97.00%
