## 第四章—朴素贝叶斯

### 基于两个假设：
**1、特征之间相互独立（即朴素的含义）**  
**2、每个特征同等重要**

### 1、使用python进行文本分类

### 1.1、词表到向量的转换

In [None]:
import numpy as np
import operator

In [None]:
def loadDataSet():
    """Return the toy message-board corpus used throughout this chapter.

    Returns:
        A tuple ``(postingList, classVec)`` where ``postingList`` is a list of
        six tokenized posts (each a list of words) and ``classVec`` holds the
        matching labels: 1 marks an abusive post, 0 a normal one.
    """
    # Keep every post next to its label so the two cannot drift apart.
    labelledPosts = [
        (['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 0),
        (['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 1),
        (['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 0),
        (['stop', 'posting', 'stupid', 'worthless', 'garbage'], 1),
        (['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 0),
        (['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'], 1),
    ]
    postingList = [words for words, _ in labelledPosts]
    classVec = [label for _, label in labelledPosts]
    return postingList, classVec

In [None]:
# Merge all documents into one de-duplicated word list
def createVocabList(dataSet):
    """Return a list of the distinct words found in any document of dataSet.

    Args:
        dataSet: iterable of documents, each an iterable of hashable words.

    Returns:
        A list with each distinct word exactly once. The order comes from set
        iteration and is therefore unspecified.
    """
    # Union of every document in one call; an empty dataSet gives an empty set.
    return list(set().union(*dataSet))

In [None]:
# Build a set-of-words (presence/absence) vector against the vocabulary
def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a 0/1 presence vector over vocabList.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to vocabList[i].
        inputSet: iterable of words making up the document.

    Returns:
        A list of len(vocabList) integers: 1 where the vocabulary word occurs
        in inputSet, 0 elsewhere. Words missing from the vocabulary are
        reported with a printed message and otherwise ignored.
    """
    # Index the vocabulary once so each word costs one dict lookup instead of
    # two linear scans. setdefault keeps the FIRST position of a repeated
    # word, which is what list.index would return.
    firstIndex = {}
    for position, vocabWord in enumerate(vocabList):
        firstIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = firstIndex.get(word)
        if position is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec

In [None]:
# Load the toy corpus and build the vocabulary from all of its posts.
listOP, listClasses = loadDataSet()
myVocabList = createVocabList(listOP)
print(myVocabList)

In [None]:
# Feeding the vocabulary to itself marks every position, so this prints all 1s.
print(setOfWords2Vec(myVocabList, myVocabList))
# Presence vector of the first post: 1 at each vocabulary word it contains.
print(setOfWords2Vec(myVocabList, listOP[0]))

### 1.2、朴素贝叶斯分类器训练函数

In [None]:
def trainNB0(trainMatrix, trainCategory):
    """Estimate the parameters of a two-class naive Bayes model.

    Args:
        trainMatrix: sequence of document vectors of equal length (word
            presence flags or word counts), one per training document.
        trainCategory: sequence of labels aligned with trainMatrix; 1 selects
            class 1, any other value is counted as class 0.

    Returns:
        A tuple ``(p0Vect, p1Vect, pAbusive)``:
        p0Vect / p1Vect are arrays with the natural log of each word's
        smoothed relative frequency within class 0 / class 1, and pAbusive is
        ``sum(trainCategory) / len(trainMatrix)``, i.e. the share of class-1
        documents when labels are 0/1.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = np.sum(trainCategory) / float(numTrainDocs)

    # Counts start at 1 (and the totals at 2.0) so that no word ends up with
    # a zero probability, whose log would be -inf.
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0

    for docVec, label in zip(trainMatrix, trainCategory):
        if label == 1:
            p1Num += docVec                 # per-word counts within class 1
            p1Denom += np.sum(docVec)       # total word count within class 1
        else:
            p0Num += docVec
            p0Denom += np.sum(docVec)

    p1Vect = np.log(p1Num / p1Denom)
    # Each class is normalised by its OWN total (the class-0 vector was
    # previously divided by the class-1 total).
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

In [None]:
# Turn every post into a presence vector over the vocabulary, then train on them.
trainMat = []
for postinDoc in listOP:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
# p0v / p1v: per-word log-probability vectors for class 0 / class 1; pAb: share of class-1 posts.
p0v, p1v, pAb = trainNB0(trainMat, listClasses)
print(p0v, p1v, pAb)

### 1.3、朴素贝叶斯分类函数

$$\ln p(w_0, w_1, \dots, w_n, c_1) = \ln p(w_0, w_1, \dots, w_n \mid c_1) + \ln p(c_1)$$

In [None]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the class with the larger log joint probability for a document.

    Args:
        vec2Classify: document vector to classify.
        p0Vec: per-word log-probabilities for class 0.
        p1Vec: per-word log-probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 when the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties therefore go to class 0).
    """
    # Score = sum of the log-likelihoods of the words present + log prior.
    logScore1 = (vec2Classify * p1Vec).sum() + np.log(pClass1)
    logScore0 = (vec2Classify * p0Vec).sum() + np.log(1.0 - pClass1)
    return 1 if logScore1 > logScore0 else 0

In [None]:
def testingNB():
    """Train on the toy corpus and print the predicted class of two sample posts."""
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    # One presence vector per training post.
    trainMat = [setOfWords2Vec(vocab, post) for post in posts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(labels))
    # Classify each sample with the trained model and report the result.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(vocab, testEntry))
        print(testEntry,'classified as: ',classifyNB(thisDoc, p0V, p1V, pAb))

In [None]:
# Run the end-to-end demo on the toy corpus.
testingNB()

### 1.4、朴素贝叶斯词袋模型

In [None]:
def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a document into a bag-of-words count vector over vocabList.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to vocabList[i].
        inputSet: iterable of words making up the document (repeats count).

    Returns:
        A list of len(vocabList) integers holding how many times each
        vocabulary word occurs in inputSet. Words missing from the vocabulary
        are silently ignored.
    """
    # Index the vocabulary once so each word costs one dict lookup instead of
    # two linear scans. setdefault keeps the FIRST position of a repeated
    # word, which is what list.index would return.
    firstIndex = {}
    for position, vocabWord in enumerate(vocabList):
        firstIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = firstIndex.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec

### 2、使用朴素贝叶斯过滤垃圾邮件

### 2.1、准备数据：切分文本

In [None]:
# Splitting on whitespace alone leaves punctuation glued to the tokens (e.g. 'M.L.', 'upon.').
mySent = "This book is the best book on Python or M.L. I have ever laid eyes upon."
print(mySent.split())

In [None]:
import re
# Split on runs of ONE OR MORE non-word characters. The former '\W*' can match
# the empty string, and from Python 3.7 on re.split also splits at empty
# matches, which breaks the sentence into single characters.
regEx = re.compile(r'\W+')
listofTokens = regEx.split(mySent)
listofTokens = [tok for tok in listofTokens if len(tok) > 0]          # drop empty strings left at the edges
print(listofTokens)
listofTokens = [tok.lower() for tok in listofTokens if len(tok) > 0]  # normalise to lower case
print(listofTokens)
listofTokens = [tok.upper() for tok in listofTokens if len(tok) > 0]  # normalise to upper case
print(listofTokens)

In [None]:
# Read one sample e-mail and tokenize it with the regEx compiled earlier.
# The with-block closes the file handle as soon as the text has been read.
with open('D:/data/study/AI/ML/MLcode/Ch04/email/ham/6.txt') as emailFile:
    emailText = emailFile.read()
listOfTokens = regEx.split(emailText)
listOfTokens = [tok for tok in listOfTokens if len(tok) > 0]  # drop empty strings
print(listOfTokens)

### 2.2、文件解析及完整的垃圾邮件测试函数

In [None]:
def textParse(bigString):
    """Tokenize a text into lower-cased words longer than two characters.

    Args:
        bigString: the raw text to split.

    Returns:
        A list of the tokens of bigString, split on runs of non-word
        characters, lower-cased, keeping only tokens with len(tok) > 2.
    """
    # '\W+' (not '\W*'): a pattern that can match the empty string makes
    # re.split cut between every character on Python 3.7+, so every token
    # would be a single character and the length filter would drop them all.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

In [None]:
import random
def spamTest(emailDir='D:/data/study/AI/ML/MLcode/Ch04/email'):
    """Train and evaluate the spam classifier with one random hold-out split.

    Reads ``<emailDir>/spam/1.txt`` .. ``25.txt`` (class 1) and
    ``<emailDir>/ham/1.txt`` .. ``25.txt`` (class 0), holds out 10 randomly
    chosen documents for testing, trains on the remaining 40 and prints every
    misclassified document followed by the error rate.

    Args:
        emailDir: directory containing the ``spam`` and ``ham`` sub-folders.
            Defaults to the location used originally.
    """
    docList = []             # one token list per e-mail
    classList = []           # label of each e-mail, aligned with docList
    for i in range(1, 26):
        # Spam first, then ham, so the documents alternate 1, 0, 1, 0, ...
        for subDir, label in (('spam', 1), ('ham', 0)):
            # The with-block closes each file right after it has been read.
            with open('%s/%s/%d.txt' % (emailDir, subDir, i)) as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)
    vocabList = createVocabList(docList)          # distinct words over all e-mails
    trainingSet = list(range(50))                 # indices of all 50 documents
    testSet = []
    # Move 10 random indices from the training set to the test set.
    for _ in range(10):
        # randrange never returns len(trainingSet). int(random.uniform(0, n))
        # can reach n through rounding, and random.randint(0, n) includes n,
        # so both could index one past the end.
        randIndex = random.randrange(len(trainingSet))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))       # bag-of-words count vector
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error",docList[docIndex])
    print('the error rate is: ',float(errorCount)/len(testSet))

In [None]:
# Run one random hold-out evaluation; the split is random, so the printed error rate varies between runs.
spamTest()

### 3、使用朴素贝叶斯分类器从个人广告中获取区域倾向

### 3.1、RSS源分类器及高频词去除

In [None]:
import feedparser

In [None]:
# Parse the New York RSS feed and print how many entries it returned.
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
print(len(ny['entries']))

### RSS源分类器及高频词去除函数

In [None]:
def calcMostFreq(vocabList, fullText, topN=30):
    """Return the most frequent vocabulary words together with their counts.

    Args:
        vocabList: iterable of candidate words.
        fullText: list of all tokens (with repeats) in which to count them.
        topN: how many entries to return at most; defaults to 30.

    Returns:
        A list of ``(word, count)`` tuples sorted by count in descending
        order, truncated to the first topN entries. Words with equal counts
        keep their vocabList order because the sort is stable.
    """
    from collections import Counter

    # Count every token once instead of rescanning fullText per vocabulary word.
    tokenCounts = Counter(fullText)
    freqDict = {token: tokenCounts[token] for token in vocabList}
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:topN]

In [None]:
# Demo: sorting a dict's (key, value) pairs by value with sorted()
dictNew = {"A":2, "Q":4, "B":7, "E":9, "C":1}
# Two equivalent key functions: operator.itemgetter(1) and a lambda both pick the value.
dictOut1 = sorted(dictNew.items(), key = operator.itemgetter(1), reverse = True)
dictOut2 = sorted(dictNew.items(), key = lambda x:x[1], reverse = True)
print(dictOut1,'\n',dictOut2)

In [None]:
def localWords(feed1, feed0):
    """Train and evaluate a naive Bayes classifier that tells two feeds apart.

    The same number of entries is taken from each feed, their 'summary' texts
    are tokenized, the 30 most frequent words are removed from the vocabulary,
    20 random documents are held out for testing and the rest are used for
    training. The hold-out error rate is printed.

    Args:
        feed1: parsed feed whose entries are labelled class 1.
        feed0: parsed feed whose entries are labelled class 0.
            Both are indexed as ``feed['entries'][i]['summary']``.

    Returns:
        A tuple ``(vocabList, p0V, p1V)``: the vocabulary without the 30 most
        frequent words and the per-word log-probability vectors of class 0
        and class 1.

    Note:
        The hold-out loop always draws 20 documents, so the two feeds must
        supply more than 20 documents in total.
    """
    import random
    docList = []             # one token list per feed entry
    classList = []           # label of each entry, aligned with docList
    fullText = []            # every token of every entry, with repeats
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # Alternate one class-1 entry and one class-0 entry.
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)
    vocabList = createVocabList(docList)
    top30Words = calcMostFreq(vocabList, fullText)
    # calcMostFreq yields (word, count) pairs; only the word identifies what
    # has to be dropped from the vocabulary.
    frequentWords = {pairW[0] for pairW in top30Words}
    vocabList = [word for word in vocabList if word not in frequentWords]
    trainingSet = list(range(2 * minLen))
    testSet = []
    # Move 20 random indices from the training set to the test set.
    for _ in range(20):
        # randrange never returns len(trainingSet), unlike
        # int(random.uniform(0, n)), which can reach n through rounding.
        randIndex = random.randrange(len(trainingSet))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))   # bag-of-words count vector
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        # np.array: the bare name `array` is not defined in this file.
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ',float(errorCount)/len(testSet))
    return vocabList, p0V, p1V

In [None]:
# Parse both city feeds and train on them.
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
# localWords labels its first argument (ny) class 1 and returns (vocabList, p0V, p1V),
# so the second value belongs to sf and the third to ny.
vocabList, pSF, pNY = localWords(ny, sf)

### 最具表征性的词汇显示函数

In [None]:
def getTopWords(ny, sf):
    """Print the words with the highest class log-probability for each feed.

    Trains via localWords(ny, sf), keeps for each class the words whose
    log-probability is greater than -6.0 and prints them from most to least
    probable, the SF list (class 0) first and the NY list (class 1) second.

    Args:
        ny: parsed feed passed to localWords as class 1.
        sf: parsed feed passed to localWords as class 0.
    """
    def _printRanked(banner, scoredWords):
        # Highest log-probability first; only the word itself is printed.
        ranked = sorted(scoredWords, key=lambda pair: pair[1], reverse=True)
        print(banner)
        for word, _ in ranked:
            print(word)

    vocabList, p0V, p1V = localWords(ny, sf)
    topSF = [(vocabList[i], logProb) for i, logProb in enumerate(p0V) if logProb > -6.0]
    topNY = [(vocabList[i], logProb) for i, logProb in enumerate(p1V) if logProb > -6.0]
    _printRanked("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**", topSF)
    _printRanked("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**", topNY)

In [None]:
# Retrains through localWords and prints the most indicative words for each city.
getTopWords(ny, sf)