In [41]:
# Imports
import nltk
from nltk.corpus import brown
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer

In [29]:
# Consult nltk to for tag description
nltk.help.upenn_tagset('``')

``: opening quotation mark
    ` ``


In [3]:
headline = "British Left Waffles on Falkland Islands"

text = headline.split()
nltk.pos_tag(text)

[('British', 'JJ'),
 ('Left', 'NNP'),
 ('Waffles', 'NNP'),
 ('on', 'IN'),
 ('Falkland', 'NNP'),
 ('Islands', 'NNP')]

### 2. Vocabulary size and variability reduction

In [10]:
# Load corpus
brown_tagged = brown.tagged_words(tagset="universal")
print("Corpus sample", brown_tagged[0:10])

# Find the number of unique words
fdist_before = nltk.FreqDist(word.lower() for (word, tag) in brown_tagged)
unique_num = len(fdist_before)
print("There are {} unique words in the corpus.".format(unique_num))

Corpus sample [('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP')]
There are 49815 unique words in the corpus.


In [18]:
# Lemmatize the corpus
# converts a pos tag from the 'universal' format to wordnet format 
def get_wordnet_pos(universal_tag):
    if universal_tag == 'VERB':
        return wordnet.VERB
    elif universal_tag == 'ADJ':
        return wordnet.ADJ
    elif universal_tag == 'ADV':
        return wordnet.ADV
    else:
        return wordnet.NOUN


def lemmatize(corpus):
    lemmas = {}
    lemmas = set()
    lemmatizer = WordNetLemmatizer()
    for (token, tag) in corpus:
        # Convert univesal tag to wordnet tag
        pos = get_wordnet_pos(tag)
        lemma = lemmatizer.lemmatize(token, pos)
        lemmas.add(lemma)
    
    return lemmas


# Generate the lemmatized form of the corpus
lemma_corp = lemmatize(brown_tagged)
lemma_num = len(lemma_corp)
print("The lemmatized corpus has size {}.".format(lemma_num))
print("This leads to a percentage reduction of {}.".format((unique_num - lemma_num) / unique_num * 100))

The lemmatized corpus has size 46080.
This leads to a percentage reduction of 7.497741644083107


### 3. N-gram tagging

#### 3.1 Unigram tagger

In [19]:
#import the tagged and untagged sentences
brown_tagged_sents = brown.tagged_sents(categories=['news', 'fiction'])
print("Size of corpus: {0} sentences.".format(len(brown_tagged_sents)))

# split the sentences into training and test sets
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
print("Size of training set: {0} sentences.".format(size))

# Tagged test sentences (used as reference)
test_sents = brown_tagged_sents[size:]

# Untagged test sentences
raw_sents = brown.sents(categories=['news', 'fiction'])[size:]

Size of corpus: 8872 sentences
Size of training set: 7984 sentencces


In [23]:
# Train a unigram tagger
unigram_tagger = nltk.UnigramTagger(train_sents)

In [26]:
# Inspect how the tagger behaves on some samples
num_samples = 3
for i in range(num_samples):
    tags = unigram_tagger.tag(raw_sents[i])
    print("The tagging of the sentence:")
    print(' '.join(raw_sents[i]))
    print('is')
    print(tags)
    print()

# Evaluate the tagger
train_eval = unigram_tagger.evaluate(train_sents)
print("The performance of the tagger on the test set is {}.".format(train_eval))
test_eval = unigram_tagger.evaluate(test_sents)
print("The performance of the tagger on the test set is {}.".format(test_eval))

The tagging of the sentence:
To Mark : `` Please give my regards to Myra '' .
is
[('To', 'TO'), ('Mark', 'NP'), (':', ':'), ('``', '``'), ('Please', 'VB'), ('give', 'VB'), ('my', 'PP$'), ('regards', None), ('to', 'TO'), ('Myra', None), ("''", "''"), ('.', '.')]

The tagging of the sentence:
She signed the letters quickly , stamped them , and placed them on the hall table for Raphael to mail in town .
is
[('She', 'PPS'), ('signed', 'VBD'), ('the', 'AT'), ('letters', 'NNS'), ('quickly', 'RB'), (',', ','), ('stamped', 'VBD'), ('them', 'PPO'), (',', ','), ('and', 'CC'), ('placed', 'VBN'), ('them', 'PPO'), ('on', 'IN'), ('the', 'AT'), ('hall', 'NN'), ('table', 'NN'), ('for', 'IN'), ('Raphael', None), ('to', 'TO'), ('mail', 'NN'), ('in', 'IN'), ('town', 'NN'), ('.', '.')]

The tagging of the sentence:
Then she went back to the wicker chair and resolutely adjusted her eyes to the glare on the water .
is
[('Then', 'RB'), ('she', 'PPS'), ('went', 'VBD'), ('back', 'RB'), ('to', 'TO'), ('the', 'A

Some of the words are left untagged because they were not encoutered in the training set, so the tagger has no idea about how it should tag them.

#### 3.2 Guesser

We first test a default tagger.

In [39]:
# Initialize the default tagger
default_tagger = nltk.tag.DefaultTagger('NN')

# Evaluate the default tagger
test_eval = default_tagger.evaluate(test_sents)
print("The performance of the tagger on the test set is {}.".format(test_eval))

The performance of the tagger on the test set is 0.11230377861884966.


Now use the default tagger as a fallback for the unigram tagger.

In [49]:
# Create a unigram tagger with the default tagger as a fall-back
unigram_default_tagger = nltk.UnigramTagger(train_sents, backoff=default_tagger)

# Evaluate the baseline tagger
test_eval = unigram_default_tagger.evaluate(test_sents)
print("The performance of the tagger on the test set is {}.".format(test_eval))

The performance of the tagger on the test set is 0.868834150276106.


In [43]:
# Initialize the regular expression tagger
regexp_tagger = nltk.tag.RegexpTagger(
[(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
(r'(The|the|A|a|An|an)$', 'AT'),   # articles
(r'.*able$', 'JJ'),                # adjectives
(r'.*ness$', 'NN'),                # nouns formed from adjectives
(r'.*ly$', 'RB'),                  # adverbs
(r'.*s$', 'NNS'),                  # plural nouns
(r'.*ing$', 'VBG'),                # gerunds
(r'.*ed$', 'VBD'),                 # past tense verbs
(r'.*', 'NN')                      # nouns (default)
])

# Evaluate the regexp tagger
test_eval = regexp_tagger.evaluate(test_sents)
print("The performance of the tagger on the test set is {}.".format(test_eval))

The performance of the tagger on the test set is 0.2701495315505367.


Now use the regexp tagger as a fallback for the unigram tagger.

In [48]:
# Create a unigram tagger with the default tagger as a fall-back
unigram_regex_tagger = nltk.UnigramTagger(train_sents, backoff=regexp_tagger)

# Evaluate the baseline tagger
test_eval = unigram_regex_tagger.evaluate(test_sents)
print("The performance of the tagger on the test set is {}.".format(test_eval))

The performance of the tagger on the test set is 0.8877582676676801.


#### 3.3 Bigrams and more

In [45]:
# Train a bigram tagger
bigram_tagger = nltk.BigramTagger(train_sents)

In [47]:
# Test the tagger on a sample sentence
sentence = 'I did not object to the object'.split()
print('The tagging for the sentece:')
print(sentence)
print('is')
print(bigram_tagger.tag(sentence))
print()

# Test the tagger on the test set
test_eval = bigram_tagger.evaluate(test_sents)
print("The performance of the tagger on the test set is {}.".format(test_eval))

The tagging for the sentece:
['I', 'did', 'not', 'object', 'to', 'the', 'object']
is
[('I', 'PPSS'), ('did', 'DOD'), ('not', '*'), ('object', 'VB'), ('to', 'TO'), ('the', None), ('object', None)]

The performance of the tagger on the test set is 0.19631445058013278.


In [50]:
# Test the bigram tagger with fallback on unigram taggers
bigram_unigram_default_tagger = nltk.BigramTagger(train_sents, backoff=unigram_default_tagger)
test_eval = bigram_unigram_default_tagger.evaluate(test_sents)
print("The performance of the tagger with default backoff on the test set is {}.".format(test_eval))

bigram_unigram_regex_tagger = nltk.BigramTagger(train_sents, backoff=unigram_regex_tagger)
test_eval = bigram_unigram_regex_tagger.evaluate(test_sents)
print("The performance of the tagger with regex backoff on the test set is {}.".format(test_eval))

The performance of the tagger with default backoff on the test set is 0.8848420922007818.
The performance of the tagger with regex backoff on the test set is 0.9036421170192964.


### 4. Viterbi algorithm

In [None]:
def viterbi(sentence, init_prob, trans_prob, emis_prob):
    n = len(sentence)
    nr_tags = len(init_prob)
    rho = np.zeros((n, nr_tags))
    prev_tag = np.zeros_like(rho)
    res = []
    
    # Set the initial values
    for i, ip in enumerate(init_prob):
        rho[0, i] = ip * emis_prob[i, sentence[0]]
    
    # Compute the rest of the values, keeping track of the tags used
    for i in range(1, n):
        for t in range(nr_tags):
            rho[i, t] = 0
            for tprev in range(nr_tags):
                new_p = rho[i - 1, tprev] * trans_prob[tprev, t]
                if rho[i, t] < new_p:
                    rho[i, t] = new_p
                    prev_tag[i, t] = tprev
            rho[i, t] *= emis_prob[t, sentence[i]]
    
    # Backtrack to form the answer
    
            
    
tagset = {'H': 0, 'T': 1}
init_prob = [0.5, 0.5]
trans_prob = [[0.4, 0.6], [0.9, 0.1]]
emis_prob = [[0.49, 0.51], [0.85, 0.15]]
sentence = list('HTTHTTHHTTHTTTHHTHHTTHTTTTHTHHTHTHHTTTH')
sentence = list(map(lambda x: tagset[x]))
vit = viterbi(sentence, init_prob, trans_prob, emis_prob)
print('The most probable coin toss sequence that produced the string is',vit)