# 第四章：基于概率论的分类方法： 朴素贝叶斯

## 4.5使用python进行文本分类

### 4.5.1准备数据： 从文本中构建词向量

In [10]:
# 词表到向量的转换函数
# input: 无
# return: 第一个是进行词条切分后的文档集合， 第二个是一个类别标签的集合。这里有两类， 侮辱性和非侮辱性

def loadDataSet():
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please' ],
        ['my', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid' ],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage' ],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him' ],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid' ]
    ]
    classVec = [0,1,0,1,0,1] # 1代表侮辱性文字， 0代表正常言论
    return postingList, classVec

In [11]:
# 创建一个包含所有文档中出现的不重复单词的列表
# input: dataSet
# return: 一个包含所有文档中出现的不重复单词的列表

def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document) # 创建两个集合的并集
    return list(vocabSet)

In [12]:
# 将文档转成文档向量， 向量的每个元素为0（该单词没出现）或1（该单词出现）
# input: 词汇表(vocabList)和某个文档(inputSet)
# return: 文档向量， 向量的每个元素为0（该单词没出现）或1（该单词出现）

def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        # 如果某个单词在词汇表中， 则在 returnVec 对应的位置写 1
        if word in inputSet:
            returnVec[ vocabList.index(word) ] = 1
        else:
            print ("The word: %s is not in my Vocabulary!") % word
    return returnVec
    

In [13]:
# 测试上面的3个函数
listOfPosts, listClasses = loadDataSet()

In [14]:
listOfPosts

[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
 ['my', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]

In [15]:
listClasses

[0, 1, 0, 1, 0, 1]

In [16]:
myVocabList = createVocabList(listOfPosts)
myVocabList
type(myVocabList)

list

In [17]:
setOfWords2Vec(myVocabList, listOfPosts[0])

[1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1]

### 4.5.2：从词向量计算概率

In [18]:
# 朴素贝叶斯分类器训练函数
# input: 训练样本(trainMatrix), 训练样本的类别(trainCategory)
# return: 每个类别的条件概率，和任意文档属于侮辱性文档的概率
import numpy as np
import math

def trainNB0(trainMatrix, trainCategory ):
    numTrainDocs = len(trainMatrix) #样本数量
    numWords = len(trainMatrix[0]) #特征数量
    pAbusive = sum(trainCategory)/float(numTrainDocs) #任意文档属于侮辱性文档的概率
    
    #初始化概率
    # p0Num = np.zeros(numWords); # 0 类别的各个单词（特征）的数量组成的向量
    # p1Num = np.zeros(numWords); # 1 类别的各个单词（特征）的数量组成的向量
    # p0Deom = 0.0; # 属于0类别的所有单词的总个数
    # p1Deom = 0.0; # 属于1类别的所有单词的总个数
    
    # 为了避免下溢出， 我们这样初始化
    p0Num = np.ones(numWords); # 0 类别的各个单词（特征）的数量组成的向量
    p1Num = np.ones(numWords); # 1 类别的各个单词（特征）的数量组成的向量
    p0Deom = 2.0; # 属于0类别的所有单词的总个数
    p1Deom = 2.0; # 属于1类别的所有单词的总个数
    
    # 遍历所有文档
    for i in range(numTrainDocs):
        # 如果这个文档属于类别 1 
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Deom += sum(trainMatrix[i])
        # 如果这个文档属于类别 0
        else:
            p0Num += trainMatrix[i]
            p0Deom += sum(trainMatrix[i])
        
        # 取对数， 为了避免下溢出
        p1Vect = np.ones(numWords);
        p0Vect = np.ones(numWords);
        length = len(p1Vect)
        for i in range(length):
            p1Vect[i] = math.log(p1Num[i]/p1Deom)
            p0Vect[i] = math.log(p0Num[i]/p0Deom)
            
        # p1Vect = np.array([ for x in p1Vect: log(x/p1Deom) ])
        #  p0Vect = np.array([ for x in p0Vect: log(x/p0Deom) ])
        #p1Vect = math.log(p1Num/p1Deom)
        #p0Vect = math.log(p0Num/p0Deom)
        
    return p0Vect, p1Vect, pAbusive

In [19]:
# 测试 trainNB0 函数
listOfPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOfPosts)

In [20]:
listOfPosts

[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
 ['my', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]

In [21]:
listClasses

[0, 1, 0, 1, 0, 1]

In [22]:
myVocabList

['help',
 'how',
 'posting',
 'garbage',
 'is',
 'flea',
 'quit',
 'steak',
 'please',
 'take',
 'to',
 'food',
 'dog',
 'worthless',
 'park',
 'licks',
 'problems',
 'has',
 'stupid',
 'dalmation',
 'him',
 'mr',
 'love',
 'I',
 'so',
 'buying',
 'not',
 'cute',
 'stop',
 'ate',
 'my']

In [23]:
trainMat = []
for postinDoc in listOfPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc) )

In [24]:
trainMat

[[1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1],
 [0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  0,
  1,
  0,
  0,
  1],
 [0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0],
 [0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1],
 [0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0]]

In [25]:
p0V, p1V, pAbusive = trainNB0(trainMat, listClasses )

In [26]:
p0V

array([-2.56494936, -2.56494936, -3.25809654, -3.25809654, -2.56494936,
       -2.56494936, -3.25809654, -2.56494936, -2.56494936, -3.25809654,
       -2.56494936, -3.25809654, -2.56494936, -3.25809654, -3.25809654,
       -2.56494936, -2.56494936, -2.56494936, -3.25809654, -2.56494936,
       -2.15948425, -2.56494936, -2.56494936, -2.56494936, -2.56494936,
       -3.25809654, -3.25809654, -2.56494936, -2.56494936, -2.56494936,
       -1.87180218])

In [27]:
p1V

array([-3.04452244, -3.04452244, -2.35137526, -2.35137526, -3.04452244,
       -3.04452244, -2.35137526, -3.04452244, -3.04452244, -2.35137526,
       -2.35137526, -2.35137526, -1.94591015, -1.94591015, -2.35137526,
       -3.04452244, -3.04452244, -3.04452244, -1.65822808, -3.04452244,
       -2.35137526, -3.04452244, -3.04452244, -3.04452244, -3.04452244,
       -2.35137526, -2.35137526, -3.04452244, -2.35137526, -3.04452244,
       -2.35137526])

In [28]:
pAbusive

0.5

In [29]:
type(np.ones(5))

numpy.ndarray

### 4.5.3 测试算法根据现实情况修改分类器

In [30]:
# 朴素贝叶斯分类函数
# input: 要分类的向量(vec2Classify), 以及由 trainNB0 函数得到的三个数
# return: vec2Classify 属于的类别

def classifyNB(vec2Classify, p0Vect, p1Vect, pClass1):
    # 元素相乘 得到概率
    p1 = sum(vec2Classify * p1Vect) + math.log(pClass1)
    p0 = sum(vec2Classify * p0Vect) + math.log(1 - pClass1)
    
    if p1 > p0:
        return 1
    else:
        return 0

In [31]:
# 测试 朴素贝叶斯分类器的 函数， 函数中测试两个 向量
# input: 无
# function: 测试两个 向量 的类别

def testingNB():
    listOfPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOfPosts)
    trainMat = []
    for postinDoc in listOfPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc) )
    p0V, p1V, pAbusive = trainNB0(trainMat, listClasses )
    
    # 测试第一个向量
    testEntry = ['love', 'my', 'dalmation' ]
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry ))
    string = str(testEntry) + " classified as: " + str(classifyNB(thisDoc, p0V, p1V, pAbusive))
    print(string)
                    
    # 测试第二个向量
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry ))
    string = str(testEntry) + " classified as: " + str(classifyNB(thisDoc, p0V, p1V, pAbusive))
    print(string)


In [32]:
testingNB()

['love', 'my', 'dalmation'] classified as: 0
['stupid', 'garbage'] classified as: 1


### 4.5.4 准备数据： 文档词带模型

In [72]:
# 将文档转成文档向量， 向量的每个元素为该单词出现的次数
# input: 词汇表(vocabList)和某个文档(inputSet)
# return: 文档向量， 向量的每个元素为该单词出现的次数

def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        # 如果某个单词在词汇表中， 则在 returnVec 对应的位置加 1
        if word in vocabList:
            returnVec[ vocabList.index(word) ] += 1
    return returnVec
    

## 4.6 示例： 使用朴素贝叶斯过滤垃圾邮件

### 4.6.1 准备数据：切分文本

In [34]:
mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'

In [35]:
mySent.split()

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M.L.',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon.']

In [36]:
import re
regExpre = re.compile('\\W*') # 正则表达式， 分隔符是除单词、数字外的任意字符串
listOfTokens = regExpre.split(mySent)
listOfTokens

  This is separate from the ipykernel package so we can avoid doing imports until


['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M',
 'L',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon',
 '']

In [37]:
[tok for tok in listOfTokens if len(tok) > 0]

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M',
 'L',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

In [38]:
[tok.lower() for tok in listOfTokens if len(tok) > 0]

['this',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'python',
 'or',
 'm',
 'l',
 'i',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

In [39]:
emailText = open('email/ham/6.txt').read()

In [40]:
listOfTokens = regExpre.split(emailText)

  """Entry point for launching an IPython kernel.


In [41]:
listOfTokens

['Hello',
 'Since',
 'you',
 'are',
 'an',
 'owner',
 'of',
 'at',
 'least',
 'one',
 'Google',
 'Groups',
 'group',
 'that',
 'uses',
 'the',
 'customized',
 'welcome',
 'message',
 'pages',
 'or',
 'files',
 'we',
 'are',
 'writing',
 'to',
 'inform',
 'you',
 'that',
 'we',
 'will',
 'no',
 'longer',
 'be',
 'supporting',
 'these',
 'features',
 'starting',
 'February',
 '2011',
 'We',
 'made',
 'this',
 'decision',
 'so',
 'that',
 'we',
 'can',
 'focus',
 'on',
 'improving',
 'the',
 'core',
 'functionalities',
 'of',
 'Google',
 'Groups',
 'mailing',
 'lists',
 'and',
 'forum',
 'discussions',
 'Instead',
 'of',
 'these',
 'features',
 'we',
 'encourage',
 'you',
 'to',
 'use',
 'products',
 'that',
 'are',
 'designed',
 'specifically',
 'for',
 'file',
 'storage',
 'and',
 'page',
 'creation',
 'such',
 'as',
 'Google',
 'Docs',
 'and',
 'Google',
 'Sites',
 'For',
 'example',
 'you',
 'can',
 'easily',
 'create',
 'your',
 'pages',
 'on',
 'Google',
 'Sites',
 'and',
 'share',


### 4.6.2 测试算法：使用朴素贝叶斯进行交叉验证

In [42]:
trainingSet = range(50)
trainingSet

range(0, 50)

In [43]:
import random
randIndex = int(random.uniform(0, len(trainingSet ) ) )
randIndex

35

In [44]:
#　文本解析，切分文本
# input: the string to split
# return: 文本切分后的 list

def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W*', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2 ]

In [45]:
mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'

In [46]:
textParse(mySent)

  return _compile(pattern, flags).split(string, maxsplit)


['this',
 'book',
 'the',
 'best',
 'book',
 'python',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

In [47]:
# 完整的垃圾邮件测试函数
# input: 无
# function: 对50封邮件(25封正常邮件，25封垃圾邮件)进行分类
# 方法：首先从文件夹中读取邮件并转化为训练集， 从训练集中随机抽取10封邮件作为测试集（这10封邮件在训练集中就不存在了）
#      然后用训练集得到朴素贝叶斯分类器， 最后用测试集测试并显示错误率

def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1, 26):
        # 导入并解析垃圾邮件的文本文件
        wordList = textParse(open('email/spam/%d.txt' %i).read())
        docList.append(wordList) # 用于保存整个训练集
        fullText.extend(wordList) # 用于存储所有的单词
        classList.append(1)
        
        # 导入并解析正常邮件的文本文件
        wordList = textParse(open('email/ham/%d.txt' %i).read())
        docList.append(wordList) # 用于保存整个训练集
        fullText.extend(wordList) # 用于存储所有的单词
        classList.append(0)

    # 创建词汇表
    vocabList = createVocabList(docList)
    # 训练集和测试集的 index
    
    trainingSetIndex = []
    for i in range(50):
        trainingSetIndex.append(i)
    
    testSetIndex = []
    # 从训练集中随机抽取10封邮件作为测试集（这10封邮件在训练集中就不存在了）
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSetIndex ) ) )
        testSetIndex.append(trainingSetIndex[randIndex ] )
        del(trainingSetIndex[randIndex] )
        
    # 构造训练集和训练集的 label
    trainMat = []; trainClasses = []
    for docIndex in trainingSetIndex:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex ]))
        trainClasses.append(classList[docIndex ])

    # 用训练集得到朴素贝叶斯分类器
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    
    # 用测试集进行测试
    errorCount = 0
    for docIndex in testSetIndex:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex ])
        # 如果分类错误， 则 errorCount + 1
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
        
        # 输出错误率
    string = "The error rate is: " + str(float(errorCount)/len(testSetIndex))
    print(string)

In [48]:
spamTest()

The error rate is: 0.1


  return _compile(pattern, flags).split(string, maxsplit)


# 4.7 示例： 使用朴素贝叶斯分类器从个人广告中获取区域倾向

## 4.7.1 收集数据， 导入 RSS 源

In [49]:
import feedparser

In [50]:
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss ' )

In [51]:
# 访问所有条目列表
ny['entries']

[{'dc_source': 'https://newyork.craigslist.org/brk/stp/d/some-posts-on-here-are-crazy/6338598490.html',
  'dc_type': 'text',
  'id': 'https://newyork.craigslist.org/brk/stp/d/some-posts-on-here-are-crazy/6338598490.html',
  'language': 'en-us',
  'link': 'https://newyork.craigslist.org/brk/stp/d/some-posts-on-here-are-crazy/6338598490.html',
  'links': [{'href': 'https://newyork.craigslist.org/brk/stp/d/some-posts-on-here-are-crazy/6338598490.html',
    'rel': 'alternate',
    'type': 'text/html'}],
  'published': '2017-10-09T03:01:37-04:00',
  'published_parsed': time.struct_time(tm_year=2017, tm_mon=10, tm_mday=9, tm_hour=7, tm_min=1, tm_sec=37, tm_wday=0, tm_yday=282, tm_isdst=0),
  'rights': 'copyright 2017 craiglist',
  'rights_detail': {'base': 'https://newyork.craigslist.org/search/stp?format=rss',
   'language': None,
   'type': 'text/plain',
   'value': 'copyright 2017 craiglist'},
  'summary': 'It\'s insane the amount of details people put into finding someone to communicate 

In [52]:
len(ny['entries'] )

25

In [53]:
ny['entries'][2]['summary']

"anyone else in love with that movie and utterly annoyed at the thought of a remake haha? i'm a witchy, bitchy when need be, nerdy little weirdo who loves gaming, horror and all things strange and unusual. my style of dress is sort of 90s goth witch,  ..."

In [73]:
# 高频词去除函数
# input: vocabulary list and the full text
# function: 该函数遍历词汇表中的每个词， 并统计它在文本中出现的次数， 然后根据出现次数
#           从高到低对字典进行排序， 最后返回排序最高的30个单词

def calcMostFreq(vocabList, fullText):
    import operator
    freqDict = {}
    
    # 统计词条在文本中出现的次数
    for token in vocabList:
        freqDict[token] = fullText.count(token)
        
    # 对字典进行排序
    sortedFreq = sorted(freqDict.items(), key = operator.itemgetter(1), reverse = True)
    return sortedFreq[:30]
    

In [74]:
# RSS 源分类器
# input: 两个RSS源
# return: 返回 词汇表，源 feed0 的向量 和 源 feed1 的向量，这里关注的是单词概率而不是实际分类

def localWords(feed1, feed0):
    import feedparser
    
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # 导入并解析 lable=1 的文本文件
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList) # 用于保存整个训练集
        fullText.extend(wordList) # 用于存储所有的单词
        classList.append(1)
        
        # 导入并解析 lable=0 的文本文件
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList) # 用于保存整个训练集
        fullText.extend(wordList) # 用于存储所有的单词
        classList.append(0)

    # 创建词汇表
    vocabList = createVocabList(docList)
    
    # 删除30个最高频的词汇
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList: vocabList.remove(pairW[0])
    
    # 训练集和测试集的 index
    trainingSetIndex = []
    for i in range(2*minLen):
        trainingSetIndex.append(i)
    
    testSetIndex = []
    # 从训练集中随机抽取20个样本作为测试集（这20个样本在训练集中就不存在了）
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSetIndex ) ) )
        testSetIndex.append(trainingSetIndex[randIndex ] )
        del(trainingSetIndex[randIndex] )
        
    # 构造训练集和训练集的 label
    trainMat = []; trainClasses = []
    for docIndex in trainingSetIndex:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex ]))
        trainClasses.append(classList[docIndex ])

    # 用训练集得到朴素贝叶斯分类器
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    
    # 用测试集进行测试
    errorCount = 0
    for docIndex in testSetIndex:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex ])
        # 如果分类错误， 则 errorCount + 1
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
        
    # 输出错误率
    string = "The error rate is: " + str(float(errorCount)/len(testSetIndex))
    print(string)
    
    return vocabList, p0V, p1V


In [75]:
# 测试
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss ' )
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss ' )

In [79]:
vocabList, pSF, pNY = localWords(ny, sf)

The error rate is: 0.35


  return _compile(pattern, flags).split(string, maxsplit)


### 这里的错误率要远高于垃圾邮件中的错误率。 由于这里关注的是单词概率而不是实际分类， 因此这个问题不是很严重

## 4.7.2 分析数据： 显示地域相关的用词

In [87]:
# 最具表征性的词汇显示函数
# input: 两个 RSS 源
# function: 训练并测试朴素贝叶斯分类器， 打印使用概率超过 10^(-6.0) 的词汇

def getTopWords(ny, sf):
    import operator
    vocabList, pSF, pNY = localWords(ny, sf)
    topNY = []; topSF = []
    # 将使用概率超过 10^(-6.0) 的词汇加入到列表中
    for i in range(len(pSF)):
        if pSF[i] > -6.0 : topSF.append((vocabList[i], pSF[i]))
        if pNY[i] > -6.0 : topNY.append((vocabList[i], pNY[i]))
    
    # 排序并打印 SF
    sortedSF = sorted(topSF, key = lambda pair: pair[1], reverse = True)
    print ("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF** ")
    for item in sortedSF:
        string = str(item[0]) + " : " + str(item[1])
        print(string)
        
    # 排序并打印  NY
    sortedNY = sorted(topNY, key = lambda pair: pair[1], reverse = True)
    print ("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY** ")
    for item in sortedNY:
        string = str(item[0]) + " : " + str(item[1])
        print(string)

In [88]:
getTopWords(ny, sf)

  return _compile(pattern, flags).split(string, maxsplit)


The error rate is: 0.3
SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF** 
male : -4.08597631255
female : -4.08597631255
but : -4.26829786935
meet : -4.26829786935
here : -4.26829786935
help : -4.49144142066
same : -4.49144142066
woman : -4.49144142066
out : -4.49144142066
separated : -4.49144142066
clean : -4.49144142066
very : -4.49144142066
work : -4.49144142066
love : -4.49144142066
friend : -4.49144142066
this : -4.77912349311
relationship : -4.77912349311
wants : -4.77912349311
enjoy : -4.77912349311
professional : -4.77912349311
area : -4.77912349311
prefer : -4.77912349311
day : -4.77912349311
workout : -4.77912349311
voice : -4.77912349311
during : -4.77912349311
bay : -4.77912349311
right : -4.77912349311
matter : -4.77912349311
divorced : -4.77912349311
friendship : -4.77912349311
walks : -4.77912349311
never : -4.77912349311
friends : -4.77912349311
coffee : -4.77912349311
guy : -4.77912349311
lady : -4.77912349311
even : -4.779

than : -5.87773578178
otherwise : -5.87773578178
conversations : -5.87773578178
style : -5.87773578178
places : -5.87773578178
over : -5.87773578178
great : -5.87773578178
will : -5.87773578178
bbq : -5.87773578178
continue : -5.87773578178
construction : -5.87773578178
hobby : -5.87773578178
either : -5.87773578178
ease : -5.87773578178
pleasure : -5.87773578178
going : -5.87773578178
pictures : -5.87773578178
oral : -5.87773578178
hubby : -5.87773578178
full : -5.87773578178
finding : -5.87773578178
marri : -5.87773578178
more : -5.87773578178
session : -5.87773578178
korean : -5.87773578178
netflix : -5.87773578178
nice : -5.87773578178
sense : -5.87773578178
age : -5.87773578178
apprentice : -5.87773578178
dress : -5.87773578178
actually : -5.87773578178
ads : -5.87773578178
plan : -5.87773578178
release : -5.87773578178
trip : -5.87773578178
anybody : -5.87773578178
act : -5.87773578178
rubdown : -5.87773578178
charge : -5.87773578178
strings : -5.87773578178
colombian : -5.877735