In [2]:
import nltk
from nltk.corpus import treebank

train_data = treebank.tagged_sents()[:3500]
test_data = treebank.tagged_sents()[3500:]
print(train_data[0])
print(len(treebank.tagged_sents()))

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
3914


In [5]:
# nltk.download('help/tagsets/PY3/upenn_tagset.pickle')
# nltk.help.upenn_tagset()

In [12]:
from nltk.tag import hmm
trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(train_data)
print(tagger)

<HiddenMarkovModelTagger 14 states and 74479 output symbols>


In [13]:
nodes = tagger._states
transitions = tagger._transitions_matrix()
words = tagger._symbols
priors = tagger._priors 
posteriors = tagger._outputs

triples = [[nodes[j], nodes[i], transitions[i][j]]  for i in range(len(nodes)) for j in range(len(nodes))]
triples.sort(key=lambda x: x[2], reverse=True)
print('Most probable transitions:')
for triple in triples[:20]:
    print('{} ---> {} (log(p) = {:1.4f})'.format(*triple))

Most probable transitions:
INTJ ---> PUNCT (log(p) = -0.2678)
ADJ ---> NOUN (log(p) = -0.6067)
NUM ---> NOUN (log(p) = -0.8638)
ADP ---> NOUN (log(p) = -1.0095)
DET ---> NOUN (log(p) = -1.0149)
X ---> PUNCT (log(p) = -1.1769)
PART ---> VERB (log(p) = -1.3060)
PROPN ---> PUNCT (log(p) = -1.3316)
PRON ---> VERB (log(p) = -1.3664)
NOUN ---> PUNCT (log(p) = -1.5516)
NUM ---> PUNCT (log(p) = -1.6549)
ADV ---> VERB (log(p) = -1.7225)
CONJ ---> NOUN (log(p) = -2.0315)
X ---> X (log(p) = -2.2559)
VERB ---> NOUN (log(p) = -2.2861)
VERB ---> ADP (log(p) = -2.4043)
ADP ---> ADJ (log(p) = -2.4953)
VERB ---> PUNCT (log(p) = -2.5101)
ADV ---> PUNCT (log(p) = -2.5800)
NOUN ---> NOUN (log(p) = -2.6034)


In [None]:
print('Most probable nouns:')
nouns = [[word, posteriors['NN'].prob(word)] for word in words if word.isalpha()]
nouns.sort(key=lambda x: x[1], reverse=True)
for noun in nouns[:20]:
    print('{} (p = {:1.4f})'.format(*noun))

In [None]:
print('Most probable verbs:')
verbs = [[word, posteriors['VB'].prob(word)] for word in words if word.isalpha()]
verbs.sort(key=lambda x: x[1], reverse=True)
for verb in verbs[:20]:
    print('{} (p = {:1.4f})'.format(*verb))

In [15]:
print('Most probable POS tags for "talks":')
word = [[tag, posteriors[tag].prob('talks')] for tag in nodes]
word.sort(key=lambda x: x[1], reverse=True)
for w in word[:20]:
    if w[1] > 0:
        print('{} (p = {:1.4f})'.format(*w))

Most probable POS tags for "talks":


In [99]:
print(tagger.tag("Today is a good day .".split()))

print(tagger.tag("Joe met Joanne in Delhi .".split()))

print(tagger.tag("Chicago is the birthplace of Ginny".split()))

print(tagger.tag("The chief talks".split()))

[('Today', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('good', 'JJ'), ('day', 'NN'), ('.', '.')]
[('Joe', 'NNP'), ('met', 'VBD'), ('Joanne', 'NNP'), ('in', 'IN'), ('Delhi', 'NNP'), ('.', 'NNP')]
[('Chicago', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('birthplace', 'NNP'), ('of', 'NNP'), ('Ginny', 'NNP')]
[('The', 'DT'), ('chief', 'NN'), ('talks', 'NNS')]


In [4]:
tagger.evaluate(test_data)

0.41872398102430053

In [100]:
from nltk.tag import tnt
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data)
tnt_pos_tagger.evaluate(test_data)

0.8777229160615742

In [113]:
print(tnt_pos_tagger.tag("Today is a good day .".split()))

print(tnt_pos_tagger.tag("Joe met Joanne in Delhi .".split()))

print(tnt_pos_tagger.tag("Chicago is the birthplace of Ginny".split()))

print(tnt_pos_tagger.tag("The chief talks".split()))

[('Today', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('good', 'JJ'), ('day', 'NN'), ('.', '.')]
[('Joe', 'NNP'), ('met', 'VBD'), ('Joanne', 'NNP'), ('in', 'IN'), ('Delhi', 'Unk'), ('.', '.')]
[('Chicago', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('birthplace', 'Unk'), ('of', 'IN'), ('Ginny', 'Unk')]
[('The', 'DT'), ('chief', 'JJ'), ('talks', 'NNS')]


# Задание

Повторите аналогичные шаги на русскоязычных данных. Будем использовать данные из MorphoRuEval: возьмем размеченные предложения из Открытого корпуса, разделим их на две части, обучающую (30000 предложений) и тестовую (все остальное). 

Ссылка на файл с данными: https://github.com/dialogue-evaluation/morphoRuEval-2017/blob/master/OpenCorpora_Texts.rar
    

In [9]:
train_file = 'unamb_sent_14_6.conllu'

In [10]:
from nltk.corpus.reader.conll import *
train_reader = ConllCorpusReader(root = '', fileids = [train_file], columntypes = ['ignore', 'words', 'ignore', 'pos', 'chunk'])


sents = list(train_reader.iob_sents())
train_sents = sents[:30000]
test_sents = sents[30000:]

print(len(train_sents))
print(len(test_sents))

30000
8508


In [11]:
train_sents[0]

[('«', 'PUNCT', '_'),
 ('Школа', 'NOUN', '_'),
 ('злословия', 'NOUN', '_'),
 ('»', 'PUNCT', '_'),
 ('учит', 'VERB', '_'),
 ('прикусить', 'VERB', '_'),
 ('язык', 'NOUN', '_')]

In [19]:
from nltk.tag import hmm
trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(train_sents)
print(tagger)

nodes = tagger._states
transitions = tagger._transitions_matrix()
words = tagger._symbols
priors = tagger._priors 
posteriors = tagger._outputs

triples = [[nodes[j], nodes[i], transitions[i][j]]  for i in range(len(nodes)) for j in range(len(nodes))]
triples.sort(key=lambda x: x[2], reverse=True)
print('Most probable transitions:')
for triple in triples[:20]:
    print('{} ---> {} (log(p) = {:1.4f})'.format(*triple))

<HiddenMarkovModelTagger 14 states and 74479 output symbols>
Most probable transitions:
INTJ ---> PUNCT (log(p) = -0.2678)
ADJ ---> NOUN (log(p) = -0.6067)
NUM ---> NOUN (log(p) = -0.8638)
ADP ---> NOUN (log(p) = -1.0095)
DET ---> NOUN (log(p) = -1.0149)
X ---> PUNCT (log(p) = -1.1769)
PART ---> VERB (log(p) = -1.3060)
PROPN ---> PUNCT (log(p) = -1.3316)
PRON ---> VERB (log(p) = -1.3664)
NOUN ---> PUNCT (log(p) = -1.5516)
NUM ---> PUNCT (log(p) = -1.6549)
ADV ---> VERB (log(p) = -1.7225)
CONJ ---> NOUN (log(p) = -2.0315)
X ---> X (log(p) = -2.2559)
VERB ---> NOUN (log(p) = -2.2861)
VERB ---> ADP (log(p) = -2.4043)
ADP ---> ADJ (log(p) = -2.4953)
VERB ---> PUNCT (log(p) = -2.5101)
ADV ---> PUNCT (log(p) = -2.5800)
NOUN ---> NOUN (log(p) = -2.6034)


In [23]:
print('Most probable nouns:')
nouns = [[word, posteriors['NOUN'].prob(word)] for word in words if word.isalpha()]
nouns.sort(key=lambda x: x[1], reverse=True)
for noun in nouns[:20]:
    print('{} (p = {:1.4f})'.format(*noun))

Most probable nouns:
года (p = 0.0087)
году (p = 0.0070)
время (p = 0.0043)
лет (p = 0.0039)
Статья (p = 0.0037)
человек (p = 0.0023)
страны (p = 0.0017)
год (p = 0.0017)
Федерации (p = 0.0017)
день (p = 0.0016)
жизни (p = 0.0016)
место (p = 0.0015)
долларов (p = 0.0015)
области (p = 0.0014)
вопрос (p = 0.0014)
числе (p = 0.0014)
времени (p = 0.0014)
власти (p = 0.0013)
мира (p = 0.0013)
млн (p = 0.0013)


In [24]:
print('Most probable verbs:')
verbs = [[word, posteriors['VERB'].prob(word)] for word in words if word.isalpha()]
verbs.sort(key=lambda x: x[1], reverse=True)
for verb in verbs[:20]:
    print('{} (p = {:1.4f})'.format(*verb))

Most probable verbs:
был (p = 0.0176)
будет (p = 0.0144)
было (p = 0.0116)
может (p = 0.0101)
быть (p = 0.0100)
были (p = 0.0100)
была (p = 0.0094)
является (p = 0.0067)
могут (p = 0.0055)
будут (p = 0.0051)
стал (p = 0.0051)
сказал (p = 0.0049)
нет (p = 0.0037)
заявил (p = 0.0033)
имеет (p = 0.0032)
стала (p = 0.0029)
сообщает (p = 0.0029)
стали (p = 0.0024)
являются (p = 0.0022)
говорит (p = 0.0022)


In [26]:
print('Most probable POS tags for "стали":')
word = [[tag, posteriors[tag].prob('стали')] for tag in nodes]
word.sort(key=lambda x: x[1], reverse=True)
for w in word[:20]:
    if w[1] > 0:
        print('{} (p = {:1.4f})'.format(*w))

Most probable POS tags for "стали":
VERB (p = 0.0024)
NOUN (p = 0.0000)


In [None]:
test_sents_1 = []
for sent in test_sents:
    sent_1 = []
    for word in sent:
        sent_1.append(word[:2])
    test_sents_1.append(sent_1)
test_sents_1[0]

In [36]:
%time
tagger.evaluate(test_sents_1)

0.47075232194907907

In [38]:
from nltk.tag import tnt
train_sents_1 = []
for sent in test_sents:
    sent_1 = []
    for word in sent:
        sent_1.append(word[:2])
    train_sents_1.append(sent_1)
test_sents_1[0]
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_sents_1)
tnt_pos_tagger.evaluate(test_sents_1)

0.9953090882497362