In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

### 最基础的语句：

In [2]:
import nltk
from nltk import word_tokenize
s = 'I was watching TV'
print(nltk.pos_tag(word_tokenize(s)))

[('I', 'PRP'), ('was', 'VBD'), ('watching', 'VBG'), ('TV', 'NN')]


### To look for all the nouns

In [3]:
tagged = nltk.pos_tag(word_tokenize(s))
allnouns = [word for word,pos in tagged if pos in ['NN','NNP']]
print(allnouns)

['TV']


### Some simple examples for tagging 

In [4]:
# POS tags freq distribution in the Brown corpus:
from nltk.corpus import brown
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
print(nltk.FreqDist(tags)) # print 是另一种形式 字典排序？？

<FreqDist with 218 samples and 100554 outcomes>


### 创建一个tagger，将所有test data标注为“NN“词性

In [5]:
brown_tagged_sents = brown.tagged_sents(categories='news')
default_tagger = nltk.DefaultTagger("NN")
print(default_tagger.evaluate(brown_tagged_sents)) # 13% 的词性预测正确，也就是说13%是”NN“

0.13089484257215028


In [7]:
print(brown_tagged_sents)

[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('Atlant

## Sequential tagger

Tagger 会根据词的上下文建模，如果对该Tag的词性预测错误，那么采用备选的Tagger进行预测；


### N-gram 采用context中的前N 个词 来预测 给定的token

In [8]:

from nltk.tag import UnigramTagger 
from nltk.tag import DefaultTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

In [10]:
# test train split
num_train = int(len(brown_tagged_sents) * 0.9)
train_data = brown_tagged_sents[:num_train]
test_data = brown_tagged_sents[num_train:]

In [11]:
unigram_tagger = UnigramTagger(train_data,backoff=default_tagger)  # 只考虑条件频率，将给定的token预测为最频繁的tag
print(unigram_tagger.evaluate(test_data))

bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)  # 考虑给定的词和前一个词
print(bigram_tagger.evaluate(test_data))

trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger) #TrigramTagger 的覆盖面会小些，但准确率会高些；为了平衡，
print(trigram_tagger.evaluate(test_data))                         # precision／recall，一种不行就backoff



0.8361407355726104
0.8452108043456593
0.843317053722715


In [22]:
# test
trigram_tagger.tag_sents(test_data)

[[(('But', 'CC'), 'NN'),
  (('in', 'IN'), 'NN'),
  (('all', 'ABN'), 'NN'),
  (('its', 'PP$'), 'NN'),
  (('175', 'CD'), 'NN'),
  (('years', 'NNS'), 'NN'),
  ((',', ','), 'NN'),
  (('not', '*'), 'NN'),
  (('a', 'AT'), 'NN'),
  (('single', 'AP'), 'NN'),
  (('Negro', 'NP'), 'NN'),
  (('student', 'NN'), 'NN'),
  (('has', 'HVZ'), 'NN'),
  (('entered', 'VBN'), 'NN'),
  (('its', 'PP$'), 'NN'),
  (('classrooms', 'NNS'), 'NN'),
  (('.', '.'), 'NN')],
 [(('Last', 'AP'), 'NN'),
  (('week', 'NN'), 'NN'),
  (('Federal', 'JJ-TL'), 'NN'),
  (('District', 'NN-TL'), 'NN'),
  (('Judge', 'NN-TL'), 'NN'),
  (('William', 'NP'), 'NN'),
  (('A.', 'NP'), 'NN'),
  (('Bootle', 'NP'), 'NN'),
  (('ordered', 'VBD'), 'NN'),
  (('the', 'AT'), 'NN'),
  (('university', 'NN'), 'NN'),
  (('to', 'TO'), 'NN'),
  (('admit', 'VB'), 'NN'),
  (('immediately', 'RB'), 'NN'),
  (('a', 'AT'), 'NN'),
  (('``', '``'), 'NN'),
  (('qualified', 'VBN'), 'NN'),
  (("''", "''"), 'NN'),
  (('Negro', 'NP'), 'NN'),
  (('boy', 'NN'), 'NN'),
 

Regex tagger

In [26]:
from nltk.tag.sequential import RegexpTagger
regexp_tagger = RegexpTagger(
        [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'), # articles
         (r'.*able$', 'JJ'),              # adjectives
         (r'.*ness$', 'NN'),              # nouns formed from adj
         (r'.*ly$', 'RB'),                # adverbs
         (r'.*s$', 'NNS'),                # plural nouns
         (r'.*ing$', 'VBG'),              # gerunds
         (r'.*ed$', 'VBD'),               # past tense verbs
         (r'.*', "NN"),                    # nouns(default)
         ])

In [27]:
print(regexp_tagger.evaluate(test_data))

0.31306687929831556
