# Ejemplos POS Tagging

1.- Aprendizaje y evaluación de etiquetadores basados en unigramas y bigramas

In [14]:
import pprint
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger
from nltk.corpus import treebank
import nltk
nltk.download('treebank')


[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\alex_\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\treebank.zip.


True

In [15]:
talla = int(len(treebank.tagged_sents()) * 0.9)
train_sents = treebank.tagged_sents(tagset = 'universal')[:talla]

In [17]:
unigram_tagger = UnigramTagger(train_sents)

In [18]:
bigram_tagger = BigramTagger(train_sents)

In [19]:
test_sents = treebank.tagged_sents()[talla:]
okUni = round(unigram_tagger.evaluate(test_sents) * 100, 2);
okBi = round(bigram_tagger.evaluate(test_sents) * 100, 2);
print("Tasas de acierto:")
print("Unigramas: " + str(okUni) + "%")
print("Bigramas: " + str(okBi) + "%")

Tasas de acierto:
Unigramas: 3.91%
Bigramas: 0.11%


In [20]:
sent1 = treebank.sents()[talla+1]
print (sent1)
pprint.pprint(unigram_tagger.tag(sent1))
pprint.pprint(bigram_tagger.tag(sent1))

['First', 'of', 'America', 'said', '0', 'some', 'of', 'the', 'managers', 'will', 'take', 'other', 'jobs', 'with', 'First', 'of', 'America', '.']
[('First', 'NOUN'),
 ('of', 'ADP'),
 ('America', 'NOUN'),
 ('said', 'VERB'),
 ('0', 'X'),
 ('some', 'DET'),
 ('of', 'ADP'),
 ('the', 'DET'),
 ('managers', 'NOUN'),
 ('will', 'VERB'),
 ('take', 'VERB'),
 ('other', 'ADJ'),
 ('jobs', 'NOUN'),
 ('with', 'ADP'),
 ('First', 'NOUN'),
 ('of', 'ADP'),
 ('America', 'NOUN'),
 ('.', '.')]
[('First', 'NOUN'),
 ('of', 'ADP'),
 ('America', 'NOUN'),
 ('said', 'VERB'),
 ('0', 'X'),
 ('some', 'DET'),
 ('of', 'ADP'),
 ('the', 'DET'),
 ('managers', 'NOUN'),
 ('will', 'VERB'),
 ('take', 'VERB'),
 ('other', 'ADJ'),
 ('jobs', 'NOUN'),
 ('with', 'ADP'),
 ('First', 'NOUN'),
 ('of', 'ADP'),
 ('America', 'NOUN'),
 ('.', '.')]


2.- Definición y evaluación del etiquetador por defecto, usando la etiqueta más frecuente

In [21]:
import pprint
from nltk.corpus import treebank
from nltk.tag import DefaultTagger
from nltk.probability import FreqDist
tags = [tag for (word, tag) in treebank.tagged_words()]
masFTag = FreqDist(tags).max()
default_tagger = DefaultTagger(masFTag)

In [21]:
test_sents = treebank.tagged_sents()[3000:]
print(default_tagger.evaluate(test_sents))

0.14331966328512843


In [22]:
sent1 = treebank.sents()[talla+1]
print(sent1)
tag = default_tagger.tag(sent1)
pprint.pprint(tag)

['First', 'of', 'America', 'said', '0', 'some', 'of', 'the', 'managers', 'will', 'take', 'other', 'jobs', 'with', 'First', 'of', 'America', '.']
[('First', 'NN'),
 ('of', 'NN'),
 ('America', 'NN'),
 ('said', 'NN'),
 ('0', 'NN'),
 ('some', 'NN'),
 ('of', 'NN'),
 ('the', 'NN'),
 ('managers', 'NN'),
 ('will', 'NN'),
 ('take', 'NN'),
 ('other', 'NN'),
 ('jobs', 'NN'),
 ('with', 'NN'),
 ('First', 'NN'),
 ('of', 'NN'),
 ('America', 'NN'),
 ('.', 'NN')]


3.- Definición y evaluación de un etiquetador usando backoff

In [23]:
import pprint
from nltk.corpus import treebank
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger
from nltk.probability import FreqDist
tags = [tag for (word, tag) in treebank.tagged_words()]
masFTag = FreqDist(tags).max()
default_tagger = DefaultTagger(masFTag)

talla = int(len(treebank.tagged_sents()) * 0.9)
train_sents = treebank.tagged_sents()[:talla]
unigram_tagger = UnigramTagger(train_sents, backoff = default_tagger)
backoff_tagger = BigramTagger(train_sents, backoff= unigram_tagger)

In [24]:
test_sents = treebank.tagged_sents()[talla:]
okTriBO = round(backoff_tagger.evaluate(test_sents) * 100, 2);
print("Bigramas - backoff: " + str(okTriBO) + "%")

Bigramas - backoff: 89.06%


In [25]:
sent1 = treebank.sents()[talla+1]
print(sent1)
tag = backoff_tagger.tag(sent1)
pprint.pprint(tag)

['First', 'of', 'America', 'said', '0', 'some', 'of', 'the', 'managers', 'will', 'take', 'other', 'jobs', 'with', 'First', 'of', 'America', '.']
[('First', 'NNP'),
 ('of', 'IN'),
 ('America', 'NNP'),
 ('said', 'VBD'),
 ('0', '-NONE-'),
 ('some', 'DT'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('managers', 'NNS'),
 ('will', 'MD'),
 ('take', 'VB'),
 ('other', 'JJ'),
 ('jobs', 'NNS'),
 ('with', 'IN'),
 ('First', 'NNP'),
 ('of', 'IN'),
 ('America', 'NNP'),
 ('.', '.')]
