In [1]:
# 文档分类
# 根据语料库中已经进行类别分类的语料，建立分类器，自动给新文档添加适当的类别标签。

In [2]:
# 确认语料库：选择nltk中的电影评论语料库，将每个评论归类为正面或负面。

In [33]:
import random
import nltk
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)),category) 
             for category in movie_reviews.categories() 
             for fileid in movie_reviews.fileids(category)]

In [34]:
# 两种不同的构造文档的链表的方式，
documents2 = [] 
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        ele = (movie_reviews.words(fileid), category)
        documents2.append(ele)

In [36]:
# 随机排列documents
random.shuffle(documents)
random.shuffle(documents2)

In [40]:
# 定义特征提取器
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

In [41]:
all_words

FreqDist({'plot': 1513,
          ':': 3042,
          'two': 1911,
          'teen': 151,
          'couples': 27,
          'go': 1113,
          'to': 31937,
          'a': 38106,
          'church': 69,
          'party': 183,
          ',': 77717,
          'drink': 32,
          'and': 35576,
          'then': 1424,
          'drive': 105,
          '.': 65876,
          'they': 4825,
          'get': 1949,
          'into': 2623,
          'an': 5744,
          'accident': 104,
          'one': 5852,
          'of': 34123,
          'the': 76529,
          'guys': 268,
          'dies': 104,
          'but': 8634,
          'his': 9587,
          'girlfriend': 218,
          'continues': 88,
          'see': 1749,
          'him': 2633,
          'in': 21822,
          'her': 4522,
          'life': 1586,
          'has': 4719,
          'nightmares': 26,
          'what': 3322,
          "'": 30585,
          's': 18513,
          'deal': 219,
          '?': 3771,
          'wa

In [46]:
word_features = []
for word in all_words.keys():
    if len(word_features) <2000:
        word_features.append(word)
    else:
        break

In [47]:
word_features # 取2000个word作为特征

['plot',
 ':',
 'two',
 'teen',
 'couples',
 'go',
 'to',
 'a',
 'church',
 'party',
 ',',
 'drink',
 'and',
 'then',
 'drive',
 '.',
 'they',
 'get',
 'into',
 'an',
 'accident',
 'one',
 'of',
 'the',
 'guys',
 'dies',
 'but',
 'his',
 'girlfriend',
 'continues',
 'see',
 'him',
 'in',
 'her',
 'life',
 'has',
 'nightmares',
 'what',
 "'",
 's',
 'deal',
 '?',
 'watch',
 'movie',
 '"',
 'sorta',
 'find',
 'out',
 'critique',
 'mind',
 '-',
 'fuck',
 'for',
 'generation',
 'that',
 'touches',
 'on',
 'very',
 'cool',
 'idea',
 'presents',
 'it',
 'bad',
 'package',
 'which',
 'is',
 'makes',
 'this',
 'review',
 'even',
 'harder',
 'write',
 'since',
 'i',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 'mess',
 'with',
 'your',
 'head',
 'such',
 '(',
 'lost',
 'highway',
 '&',
 'memento',
 ')',
 'there',
 'are',
 'good',
 'ways',
 'making',
 'all',
 'types',
 'these',
 'folks',
 'just',
 'didn',
 't',
 'snag',
 'correctly',
 'seem',
 'have',
 'taken',
 'pretty',


In [54]:
def document_features(document):
    '''
    获取文档特征，自定义特征提取器
    '''
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [55]:
# 验证document_features(document)
document_features(movie_reviews.words('neg/cv000_29416.txt'))

{'contains(plot)': True,
 'contains(:)': True,
 'contains(two)': True,
 'contains(teen)': True,
 'contains(couples)': True,
 'contains(go)': True,
 'contains(to)': True,
 'contains(a)': True,
 'contains(church)': True,
 'contains(party)': True,
 'contains(,)': True,
 'contains(drink)': True,
 'contains(and)': True,
 'contains(then)': True,
 'contains(drive)': True,
 'contains(.)': True,
 'contains(they)': True,
 'contains(get)': True,
 'contains(into)': True,
 'contains(an)': True,
 'contains(accident)': True,
 'contains(one)': True,
 'contains(of)': True,
 'contains(the)': True,
 'contains(guys)': True,
 'contains(dies)': True,
 'contains(but)': True,
 'contains(his)': True,
 'contains(girlfriend)': True,
 'contains(continues)': True,
 'contains(see)': True,
 'contains(him)': True,
 'contains(in)': True,
 'contains(her)': True,
 'contains(life)': True,
 'contains(has)': True,
 'contains(nightmares)': True,
 'contains(what)': True,
 "contains(')": True,
 'contains(s)': True,
 'contains

In [57]:
# 训练
featuresets = [(document_features(d),c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [58]:
# 准确率
nltk.classify.accuracy(classifier,test_set)

0.81

In [59]:
classifier.show_most_informative_features(5)

Most Informative Features
    contains(schumacher) = True              neg : pos    =     11.8 : 1.0
        contains(welles) = True              neg : pos    =      8.4 : 1.0
 contains(unimaginative) = True              neg : pos    =      8.4 : 1.0
        contains(shoddy) = True              neg : pos    =      7.1 : 1.0
          contains(mena) = True              neg : pos    =      7.1 : 1.0


In [61]:
# 从上面的结论显示，提高schumacher的评论中neg是pos的11.8倍...

({'contains(plot)': False,
  'contains(:)': True,
  'contains(two)': True,
  'contains(teen)': False,
  'contains(couples)': False,
  'contains(go)': False,
  'contains(to)': True,
  'contains(a)': True,
  'contains(church)': False,
  'contains(party)': False,
  'contains(,)': True,
  'contains(drink)': False,
  'contains(and)': True,
  'contains(then)': True,
  'contains(drive)': False,
  'contains(.)': True,
  'contains(they)': True,
  'contains(get)': True,
  'contains(into)': True,
  'contains(an)': True,
  'contains(accident)': False,
  'contains(one)': True,
  'contains(of)': True,
  'contains(the)': True,
  'contains(guys)': True,
  'contains(dies)': False,
  'contains(but)': True,
  'contains(his)': True,
  'contains(girlfriend)': False,
  'contains(continues)': False,
  'contains(see)': False,
  'contains(him)': False,
  'contains(in)': True,
  'contains(her)': True,
  'contains(life)': True,
  'contains(has)': True,
  'contains(nightmares)': False,
  'contains(what)': True,
 

In [62]:
# =========================结束=======================

In [63]:
# 训练一个分类器来算出哪个后缀最优信息量。

In [73]:
# 首先，找出最常见的后缀
from nltk.corpus import brown
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] +=1
    suffix_fdist[word[-2:]] +=1
    suffix_fdist[word[-3:]] +=1
    

In [76]:
len(suffix_fdist)

4975

In [77]:
common_suffixes=[]  # 获取suffix数量最大的前100个key，注意，此处未排序保证谁是最大值，如下面需要考虑这项，此处再做修改。
v = sorted(suffix_fdist.values(),reverse=True)[:100]
for suffix,count in suffix_fdist.items():
    if len(common_suffixes)<100 and count in v:
        common_suffixes.append(suffix)
        

In [87]:
common_suffixes[:10]

['e', 'he', 'the', 'n', 'on', 'ton', 'y', 'ty', 'nty', 'd']

In [88]:
def pos_features(word):
    '''
    自定义一个特征提取函数，检查给定的单子的这些后缀
    '''
    features = {}
    for suffix in common_suffixes:
        features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
    return features

In [90]:
# 特征提取函数的行为就像有色眼睛一样，强调我们的数据中的某些属性（颜色），并使其无法看到其他属性。

In [91]:
tagged_words = brown.tagged_words(categories='news')

In [92]:
tagged_words

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [93]:
featuresets = [(pos_features(n),g) for (n,g) in tagged_words]

In [94]:
size = int(len(featuresets)*0.1)
train_set, test_set = featuresets[size:], featuresets[:size]  # 训练集和测试集

In [95]:
# 训练
classifier = nltk.DecisionTreeClassifier.train(train_set)

In [96]:
# 准确率
nltk.classify.accuracy(classifier, test_set)

0.5689706613625062

In [97]:
classifier.classify(pos_features('cats'))

'NNS'

In [103]:
# 决策树模型的一个很好的性质是它们往往很容易解释。
# 我们甚至可以指示NLTK将它们以伪代码形式输出：
classifier.pseudocode(depth=4)

"if endswith(the) == False: \n  if endswith(,) == False: \n    if endswith(s) == False: \n      if endswith(.) == False: return '.'\n      if endswith(.) == True: return '.'\n    if endswith(s) == True: \n      if endswith(was) == False: return 'PP$'\n      if endswith(was) == True: return 'BEDZ'\n  if endswith(,) == True: return ','\nif endswith(the) == True: return 'AT'\n"

In [104]:
# 上面的结果中我们可以看到，分类器一开始检查一个词是否以the结尾--如果不是，它会得到判断是否以','结尾....
# 实际的分类器包含这里显示的if-then语句下面进一步的嵌套，参数depth=4只显示决策树的顶端部分。