## 不能在词性标注之前删除停用词，否则语义不通

In [1]:
import nltk
from nltk import word_tokenize

# Tag the complete sentence, stop words included, as (word, tag) pairs.
s = "I was watching TV."
tokens = word_tokenize(s)
print(nltk.pos_tag(tokens))

[('I', 'PRP'), ('was', 'VBD'), ('watching', 'VBG'), ('TV', 'NN'), ('.', '.')]


# 其中
### PRP—人称代词
### VBD—动词的过去式
### VBG—动词的动名词或现在分词
### NN—普通名词（单数或不可数）；专有名词的标签是 NNP

## 筛选出名词

In [2]:
# Import the libraries
import nltk
from nltk import word_tokenize

s = "I was watching TV."
# POS-tag the tokenized sentence; the result is a list of (word, tag) pairs.
tagged = nltk.pos_tag(word_tokenize(s))
# Penn Treebank noun tags: singular/mass (NN), plural (NNS),
# proper singular (NNP) and proper plural (NNPS).
noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
# Keep only the words whose tag is one of the noun tags.
all_noun = [word for word, pos in tagged if pos in noun_tags]
print(all_noun)

['TV']


# N-gram标注器是一种顺序标注器：它以当前单词及其前 n-1 个单词的词性标签作为上下文，来预测当前单词的POS标签。

In [9]:
import nltk
from nltk.corpus import brown
# Unigram tagger: chooses a tag from the word itself
from nltk.tag import UnigramTagger
from nltk.tag import DefaultTagger
# Bigram tagger: also takes the preceding token into account
from nltk.tag import BigramTagger
# Trigram tagger: takes the two preceding tokens into account
from nltk.tag import TrigramTagger

# Tagged sentences from the 'news' category of the Brown corpus
brown_tagged_sents = brown.tagged_sents(categories='news')
# Last-resort tagger that labels every token 'NN'
default_tagger = DefaultTagger('NN')

# First 90% of the sentences for training, the remaining 10% for testing
split_at = int(len(brown_tagged_sents) * 0.9)
train_data = brown_tagged_sents[:split_at]
test_data = brown_tagged_sents[split_at:]

In [8]:
# `backoff` names the tagger that is consulted when this tagger cannot
# predict a tag itself, so the taggers form a chain:
# trigram -> bigram -> unigram -> default.
unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)

# Print the score of each model on the held-out data:
# unigram first, then bigram, then trigram.
for tagger in (unigram_tagger, bigram_tagger, trigram_tagger):
    print(tagger.evaluate(test_data))


0.8361407355726104
0.8452108043456593
0.843317053722715


In [26]:
# Split a sample sentence on single spaces to get a list of tokens.
sentence = "I think I can do it ?"
test = sentence.split(' ')
# Bare expression: shows the token list as the notebook cell's output.
test

['I', 'think', 'I', 'can', 'do', 'it', '?']

# 正则表达式标注器

In [4]:
# Corpus used for evaluation
from nltk.corpus import brown
# NLTK's regular-expression tagger
from nltk.tag.sequential import RegexpTagger

# Tagged sentences from the 'news' category of the Brown corpus
brown_tagged_sents = brown.tagged_sents(categories='news')
# Test set: the last 10% of the sentences
test_start = int(len(brown_tagged_sents) * 0.9)
test_data = brown_tagged_sents[test_start:]

# (pattern, tag) rules. Per the NLTK documentation the rules are tried in
# order, so the catch-all '.*' rule has to stay last.
# NOTE(review): the '.' inside the number pattern is unescaped, so it matches
# any character (e.g. the comma in '1,000'), not only a decimal point --
# confirm this is intended before tightening it to '\.'.
patterns = [
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
    (r'(The|the|A|a|An|an)$', 'AT'),
    (r'.*able$', 'JJ'),
    (r'.*ness$', 'NN'),   # words ending in "ness" are mostly nouns
    (r'.*ly$', 'RB'),     # words ending in "ly" are mostly adverbs
    (r'.*s$', 'NNS'),     # words ending in "s" are mostly plural nouns
    (r'.*ing$', 'VBG'),   # words ending in "ing" are mostly gerunds
    (r'.*ed$', 'VBD'),    # words ending in "ed" are mostly past-tense verbs
    (r'.*', 'NN'),
]
regexp_tagger = RegexpTagger(patterns)
print(regexp_tagger.evaluate(test_data))


0.31306687929831556


# 5.命名实体识别（NER）
NER主要识别人名、地点和组织等命名实体。NLTK库提供了ne_chunk方法。处理顺序是：先对语句进行标识化（分词）处理，再进行词性标注，最后用ne_chunk进行语块分解并标注命名实体。
简单看一看

In [32]:
import nltk
from nltk import word_tokenize
from nltk import ne_chunk

sent = "Mark is studying at Stanford University in California"
# Pipeline: tokenize -> POS-tag -> chunk named entities.
tokens = word_tokenize(sent)
pos_tags = nltk.pos_tag(tokens)
# binary=False yields per-category entity labels such as PERSON,
# ORGANIZATION and GPE in the printed tree.
print(ne_chunk(pos_tags, binary=False))

(S
  (PERSON Mark/NNP)
  is/VBZ
  studying/VBG
  at/IN
  (ORGANIZATION Stanford/NNP University/NNP)
  in/IN
  (GPE California/NNP))
