In [34]:
import nltk
import re

In [35]:
>>> from nltk.corpus import movie_reviews
>>> import random
>>> documents = [(list(movie_reviews.words(fileid)), category)
...              for category in movie_reviews.categories()
...              for fileid in movie_reviews.fileids(category )]
>>> random.shuffle(documents)

In [45]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000] 

def document_features (document): 
    document_words = set(document)
    features = {}
    for word in word_features:
        features[ 'contains({})' .format(word)] = (word in document_words)
    return features

In [46]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.85

In [44]:
featuresets[:3]

[({'contains(plot)': True,
   'contains(:)': True,
   'contains(two)': True,
   'contains(teen)': False,
   'contains(couples)': False,
   'contains(go)': False,
   'contains(to)': True,
   'contains(a)': True,
   'contains(church)': False,
   'contains(party)': False,
   'contains(,)': True,
   'contains(drink)': False,
   'contains(and)': True,
   'contains(then)': True,
   'contains(drive)': False,
   'contains(.)': True,
   'contains(they)': False,
   'contains(get)': True,
   'contains(into)': False,
   'contains(an)': True,
   'contains(accident)': False,
   'contains(one)': True,
   'contains(of)': True,
   'contains(the)': True,
   'contains(guys)': False,
   'contains(dies)': False,
   'contains(but)': True,
   'contains(his)': True,
   'contains(girlfriend)': False,
   'contains(continues)': False,
   'contains(see)': True,
   'contains(him)': False,
   'contains(in)': True,
   'contains(her)': True,
   'contains(life)': True,
   'contains(has)': True,
   'contains(nightmares

In [39]:
[f[1] for f in featuresets].count('neg')/len(featuresets)

0.5

In [40]:
[re.search(r'\(\w+',w[0]).group()[1:] 
 for w in classifier.most_informative_features(30)]

['unimaginative',
 'schumacher',
 'turkey',
 'atrocious',
 'suvari',
 'mena',
 'shoddy',
 'jumbled',
 'surveillance',
 'canyon',
 'underwood',
 'singers',
 'unravel',
 'uninspired',
 'wasted',
 'justin',
 'poorly',
 'awful',
 'waste',
 'bronson',
 'welles',
 'fluffy',
 'ridiculous',
 'groan',
 'sexist',
 'ugh',
 'everyday',
 'kudos',
 'savages',
 'unimpressive']

In [22]:
classifier.show_most_informative_features(100)

Most Informative Features
 contains(unimaginative) = True              neg : pos    =      7.7 : 1.0
    contains(schumacher) = True              neg : pos    =      7.5 : 1.0
        contains(suvari) = True              neg : pos    =      7.1 : 1.0
          contains(mena) = True              neg : pos    =      7.1 : 1.0
        contains(shoddy) = True              neg : pos    =      7.1 : 1.0
     contains(atrocious) = True              neg : pos    =      6.7 : 1.0
        contains(turkey) = True              neg : pos    =      6.6 : 1.0
        contains(wasted) = True              neg : pos    =      5.9 : 1.0
  contains(surveillance) = True              neg : pos    =      5.7 : 1.0
        contains(canyon) = True              neg : pos    =      5.7 : 1.0
       contains(unravel) = True              pos : neg    =      5.6 : 1.0
           contains(ugh) = True              neg : pos    =      5.5 : 1.0
        contains(justin) = True              neg : pos    =      5.5 : 1.0

In [None]:
#選擇一個本章所描述的分類任務，如名字性別檢測、文檔分類、詞性標註或對話行為分類。使用相同的訓練和測試數據，相同的特徵提取器，建立該任務的三個分類器：：決策樹、樸素貝葉斯分類器和最大熵分類器。比較你所選任務上這三個分類器的準確性。你如何看待如果你使用了不同的特徵提取器，你的結果可能會不同？

In [23]:
from nltk.corpus import brown
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1 :]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

In [24]:
def pos_features (sentence, i, history): 
    features = { "suffix(1)" : sentence[i][-1:],
                  "suffix(2)" : sentence[i][-2:],
                  "suffix( 3)" : sentence[i][-3:]}
    if i == 0:
        features[ "prev-word" ] = "<START>" 
#        features[ "prev-tag" ] = "<START>" 
    else :
        features[ "prev-word" ] = sentence[i-1]
        features[ "prev-tag" ] = history#[i-1]
    return features

In [26]:
tagged_sents = brown.tagged_sents(categories= 'news' )
featuresets = []
for s in tagged_sents:
    sent = []
    history = '<start>'
    for w in s:
        sent.append(w[0])
        featuresets.append((pos_features(sent,s.index(w),history),w[1]))
        history=w[1]

In [28]:
def test_features(txt=str,i=int):
    txt = re.findall(r"[\w('’)\w]+|\S",txt)
#    pos_features(txt,i)
    return classifier.classify(pos_features( txt, i))

In [29]:
>>> size = int(len(featuresets) * 0.1)
>>> train_set, test_set = featuresets[size:], featuresets[:size]

0.7695673794132273

In [30]:
>>> classifier = nltk.DecisionTreeClassifier.train(train_set)
>>> nltk.classify.accuracy(classifier, test_set)

0.7695673794132273

In [31]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.8166086524117354

In [32]:
classifier = nltk.MaxentClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -5.36129        0.007
      Training stopped: keyboard interrupt
         Final          -1.26210        0.829


0.7894579811039284