In [31]:
from numpy import *

In [26]:
def bagOfWords2VecMN(vocabList,inputSet):
    returnVec=[0]*len(vocabList)  #创建一个其中所含元素都为0的向量
    for word in inputSet:
        if word in vocabList:
                returnVec[vocabList.index(word)]+=1
    return returnVec

In [4]:
# 创建一个包含在所有文档中出现的不重复词的列表
def createVocabList(dataSet):
    vocabSet=set([])
    #创建一个空集
    for document in dataSet:
        vocabSet = vocabSet|set(document)
        #创建两个集合的并集
        return list(vocabSet)

#文件解析
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W+',bigString)
    return [tok.lower() for tok in listOfTokens if len(tok)>2]

In [2]:
def trainNB0(trainMatrix, trainCategory):
    """
    训练数据优化版本
    :param trainMatrix: 文件单词矩阵
    :param trainCategory: 文件对应的类别
    :return:
    """
    # 总文件数
    numTrainDocs = len(trainMatrix)
    # 总单词数
    numWords = len(trainMatrix[0])
    # 侮辱性文件的出现概率
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # 构造单词出现次数列表
    # p0Num 正常的统计
    # p1Num 侮辱的统计
    # 避免单词列表中的任何一个单词为0，而导致最后的乘积为0，所以将每个单词的出现次数初始化为 1
    p0Num = ones(numWords)#[0,0......]->[1,1,1,1,1.....]
    p1Num = ones(numWords)
    # 整个数据集单词出现总数，2.0根据样本/实际调查结果调整分母的值（2主要是避免分母为0，当然值可以调整）
    # p0Denom 正常的统计
    # p1Denom 侮辱的统计
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            # 累加辱骂词的频次
            p1Num += trainMatrix[i]
            # 对每篇文章的辱骂的频次 进行统计汇总
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # 类别1，即侮辱性文档的[log(P(F1|C1)),log(P(F2|C1)),log(P(F3|C1)),log(P(F4|C1)),log(P(F5|C1))....]列表
    p1Vect = log(p1Num / p1Denom)
    # 类别0，即正常文档的[log(P(F1|C0)),log(P(F2|C0)),log(P(F3|C0)),log(P(F4|C0)),log(P(F5|C0))....]列表
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
    

In [28]:
#RSS源分类器及高频词去除函数
def calcMostFreq(vocabList, fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        #遍历词汇表中的每个词
        freqDict[token] = fullText.count(token)
    #统计每个词在文本中出现的次数
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    #根据每个词出现的次数从高到底对字典进行排序
    return sortedFreq[:30] #返回出现次数最高的30个单词

    
def localWords(feed1, feed0):
    import feedparser
    import random
    docList=[]; classList=[]; fullText=[]
    minLen = min(len(feed1['entries']),len(feed0['entries']))
    for i in range(minLen):
        # 每次访问一条RSS源
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
        
    vocabList = createVocabList(docList)
    top30Words = calcMostFreq(vocabList, fullText)
    
    for pairW in top30Words:
        if pairW[0] in vocabList:
            # 去掉出现次数最高的那些词
            vocabList.remove(pairW[0])

    trainingSet = list(range(2*minLen)); testSet=[]

    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])

    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
        p0V,p1V,pSpam = trainNB0(array(trainMat), array(trainClasses))
        errorCount = 0
        
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList,docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount+=1
    print('the error rate is:', float(errorCount)/len(testSet))
    return vocabList, p0V, p1V 


#朴素贝叶斯分类函数
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    p1=sum(vec2Classify*p1Vec)+log(pClass1)
    p0=sum(vec2Classify*p0Vec)+log(1.0-pClass1)
    if p1>p0:
        return 1
    else:
        return 0


In [44]:
import feedparser
# ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
# sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
# http://www.nasa.gov/rss/dyn/image_of_the_day.rss
# ny = feedparser.parse('http://www.ftchinese.com/rss/news')
# sf = feedparser.parse('http://rss.tom.com/happy/happy.xml')

ny = feedparser.parse('http://www.nasa.gov/rss/dyn/image_of_the_day.rss')
sf = feedparser.parse('http://sports.yahoo.com/nba/teams/hou/rss.xml')

In [47]:
vocabList, pSF, pNY = localWords(ny, sf)

IndexError: list index out of range

In [49]:
len(sf['entries'])

4

In [45]:
sf['entries'][1]['summary']

'When last we left the NBA, the Golden State Warriors were wrapping up their third championship in four years and staking their claim as perhaps the game’s greatest dynasty. Then a bunch of players switched teams, including LeBron James (to the Lakers),'

In [46]:
ny['entries'][1]['summary']

"The largest fire in California's history, the Mendocino Complex, is still spewing clouds of smoke across the state."

In [42]:
ny2 = feedparser.parse('http://www.nasa.gov/rss/dyn/image_of_the_day.rss')

In [43]:
ny2['entries'][1]['summary']

"The largest fire in California's history, the Mendocino Complex, is still spewing clouds of smoke across the state."

In [None]:
#最具表征性的词汇显示函数
def getTopWords(ny,sf):
    import operator
    vocabList,p0V,p1V=localWords(ny,sf)    topNY=[];topSF=[]    for i in range(len(p0V)):        if p0V[i]>-6.0:topSF.append((vocabList[i],p0V[i]))        if p1V[i]>-6.0:topNY.append((vocabList[i],p1V[i]))    sortedSF=sorted(topSF,key=lambda pair:pair[1],reverse=True)    print "SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**"    for item in sortedSF:        print item[0]    sortedNY=sorted(topNY,key=lambda pair:pair[1],reverse=True)    print "NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**"    for item in sortedNY:        print item[0]




