In [2]:
# Daniel Bandala @ oct 2022
import nltk
from nltk.corpus import treebank
from nltk.tag import hmm

# Sequence labelling
In machine learning, sequence labeling is a type of pattern recognition task that involves the algorithmic assignment of a categorical label to each member of a sequence of observed values. A common example of a sequence labeling task is part of speech tagging, which seeks to assign a part of speech to each word in an input sentence or document. Sequence labeling can be treated as a set of independent classification tasks, one per member of the sequence. However, accuracy is generally improved by making the optimal label for a given element dependent on the choices of nearby elements, using special algorithms to choose the globally best set of labels for the entire sequence at once.

As an example of why finding the globally best label sequence might produce better results than labeling one item at a time, consider the part-of-speech tagging task just described. Frequently, many words are members of multiple parts of speech, and the correct label of such a word can often be deduced from the correct label of the word to the immediate left or right. 

In [4]:
nltk.download('treebank')

[nltk_data] Downloading package treebank to /home/bandala/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [5]:
train_data = treebank.tagged_sents()[:3000]
print (train_data[0])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


In [6]:
trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(train_data)

In [7]:
print (tagger)
print (tagger.tag("Today is a good day .".split()))
print (tagger.tag("Joe met Joanne in Delhi .".split()))
print (tagger.tag("Chicago is the birthplace of Ginny".split()))

<HiddenMarkovModelTagger 46 states and 10779 output symbols>
[('Today', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('good', 'JJ'), ('day', 'NN'), ('.', '.')]
[('Joe', 'NNP'), ('met', 'VBD'), ('Joanne', 'NNP'), ('in', 'IN'), ('Delhi', 'NNP'), ('.', 'NNP')]
[('Chicago', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('birthplace', 'NNP'), ('of', 'NNP'), ('Ginny', 'NNP')]
