In [2]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

from nltk.corpus import brown

In [3]:
# Make sure the Brown corpus data is available locally before it is read.
nltk.download('brown')

[nltk_data] Downloading package brown to /home/ashish/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [4]:
# Peek at the corpus: a sequence of (word, tag) pairs in the corpus's own tagset.
print(brown.tagged_words()) 

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]


In [5]:
# Download the universal tagset mapping; the cells below request tagset='universal'.
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/ashish/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [6]:
# storing all the POS tags, words
#   tags[tag]            -> number of tokens carrying `tag`
#   words[word]          -> number of occurrences of `word` (case-sensitive)
#   tag_words[tag][word] -> number of times `word` was tagged as `tag`
# defaultdict() without a factory behaves like a plain dict (missing keys
# raise KeyError); the type is kept so later cells see the same objects.
tags = defaultdict()
words = defaultdict()
tag_words = defaultdict()
# Unpack each pair directly instead of binding the loop variable to the name
# `tuple`, which would shadow the builtin for every later cell in the kernel.
for word, tag in brown.tagged_words(tagset='universal'):
    tags[tag] = tags.get(tag, 0) + 1
    words[word] = words.get(word, 0) + 1

    if tag not in tag_words:
        tag_words[tag] = defaultdict()
    per_tag = tag_words[tag]
    per_tag[word] = per_tag.get(word, 0) + 1

In [7]:
# Sanity check on tag_words: how many times "this" was tagged as DET.
tag, word = "DET", "this"
words_for_tag = tag_words[tag]
if word in words_for_tag:
    print(words_for_tag[word])

3965


In [8]:
# Show how many tokens carry each tag.
for tag_name in tags:
    print(tag_name, tags[tag_name])

DET 137019
NOUN 275558
ADJ 83721
VERB 182750
ADP 144766
. 147565
ADV 56239
CONJ 38151
PRT 29829
PRON 49334
NUM 14874
X 1386


In [9]:
# Peek at the first 11 vocabulary entries together with their counts.
for position, (key, value) in enumerate(words.items()):
    if position > 10:
        break
    print(key, value)

The 7258
Fulton 17
County 85
Grand 18
Jury 4
said 1943
Friday 60
an 3542
investigation 43
of 36080
Atlanta's 4


## Probabilities to compute
    P(word|tag) = count(word, tag) / count(tag) -> emission probabilities
    P(tag1|tag2) = count(tag2 immediately followed by tag1) / count(tag2) -> transition probabilities
 


## Emission Probabilities

In [10]:
# computing emission probabilities:
#   emission[tag][word] = count(word tagged as tag) / count(tag)
# The table is dense on purpose: every tag gets an entry for every vocabulary
# word (0.0 when the pair never occurs), so emission[tag][word] can be indexed
# directly for any in-vocabulary word.
emission = defaultdict()
for tag in tags.keys():
    # Per-tag values are looked up once instead of once per vocabulary word.
    tag_total = tags[tag]
    seen_with_tag = tag_words[tag]
    row = defaultdict()
    for word in words.keys():
        row[word] = seen_with_tag.get(word, 0) / tag_total
    emission[tag] = row

In [11]:
# checking probabilities: P("the" | DET). Keys are case-sensitive, so this is
# the lowercase form only ("The" is a separate vocabulary entry).
tag = "DET"
word = "the"
emission[tag][word]

0.45767375327509324

## Computing bi-gram counts

In [12]:
# Materialise the tagged sentences once so later cells iterate a plain list.
sents = list(brown.tagged_sents(tagset='universal'))

print(len(sents))

57340


In [15]:
# Print the first 11 sentences that are exactly five tokens long.
shown = 0
for sent in sents:
    if shown > 10:
        break
    if len(sent) != 5:
        continue
    print(sent)
    shown += 1

[('``', '.'), ('Must', 'VERB'), ('solve', 'VERB'), ('problem', 'NOUN'), ("''", '.')]
[('The', 'DET'), ('hotel', 'NOUN'), ('owner', 'NOUN'), ('shrugged', 'VERB'), ('.', '.')]
[('Formula', 'NOUN'), ('is', 'VERB'), ('due', 'ADJ'), ('this', 'DET'), ('week', 'NOUN')]
[('Oak', 'NOUN'), ('Grove', 'NOUN'), ('(', '.'), ('special', 'ADJ'), (')', '.')]
[('--', '.'), ('emphasizes', 'VERB'), ('the', 'DET'), ('Virgin', 'NOUN'), ('birth', 'NOUN')]
[("'", '.'), ('church', 'NOUN'), ('meets', 'VERB'), ('change', 'NOUN'), ("'", '.')]
[('Seeks', 'VERB'), ('``', '.'), ('improved', 'VERB'), ('fielding', 'VERB'), ("''", '.')]
[('Duren', 'NOUN'), (',', '.'), ('Sheldon', 'NOUN'), ('on', 'ADP'), ('hill', 'NOUN')]
[('A', 'DET'), ('quick', 'ADJ'), ('touchdown', 'NOUN'), ('resulted', 'VERB'), ('.', '.')]
[('It', 'PRON'), ('made', 'VERB'), ('him', 'PRON'), ('human', 'NOUN'), ('.', '.')]
[('He', 'PRON'), ('had', 'VERB'), ('a', 'DET'), ('16', 'NUM'), ('.', '.')]


In [16]:
# Count adjacent tag pairs inside each sentence, keyed as "<tag>-<next tag>".
bigram = defaultdict()
for sent in sents:
    sent_tags = [pair[1] for pair in sent]
    # Pair every tag with its successor; the last tag has none.
    for curr, next_ in zip(sent_tags, sent_tags[1:]):
        bi = '-'.join((curr, next_))
        bigram[bi] = bigram.get(bi, 0) + 1

In [17]:
# test bi-grams: how often a NOUN is directly followed by a DET,
# next to the total number of NOUN tokens
print(bigram["NOUN-DET"])
print(tags["NOUN"])

4270
275558


In [18]:
# compute bi-gram counts for starting tag and next tag
# Sentence-start counts live in `bigram` under the pseudo-tag "^"
# (key "^-<first tag>"). They are tallied in a scratch dict and then written
# over any existing "^-" entries, so re-running this cell does not
# double-count them.
start_counts = {}
for sent in sents:
    if not sent:
        # an empty sentence has no first tag to count
        continue
    bi = "^-" + sent[0][1]
    start_counts[bi] = start_counts.get(bi, 0) + 1
bigram.update(start_counts)

## Transition Probabilities

In [19]:
# Transition probabilities between tags:
#   transition[prev][next] = count(prev immediately followed by next) / count(prev)
# A tag pair that never occurs in the corpus gets probability 0 instead of
# raising KeyError on the missing bigram key.
transition = defaultdict()
for tag in tags.keys():
    row = defaultdict()
    for tag1 in tags.keys():
        row[tag1] = bigram.get(tag + '-' + tag1, 0) / tags[tag]
    transition[tag] = row

In [20]:
# compute starting tag transition probability i.e P(X|^)
#   transition["^"][tag] = (number of sentences that start with tag) / (number of sentences)
transition["^"] = defaultdict()
for tag in tags.keys():
    # A tag that never opens a sentence has no "^-<tag>" count; treat it as 0
    # rather than failing on the missing key.
    transition["^"][tag] = bigram.get("^" + "-" + tag, 0) / len(sents)

In [21]:
# test transition: P(NOUN | previous tag is DET) and P(ADJ | sentence start)
print(transition["DET"]["NOUN"])
print(transition["^"]["ADJ"])

0.6264678621213117
0.034339030345308684


## Viterbi Algorithm

In [22]:
import copy

In [23]:
class TreeNode:
    """One node of the decoding tree: a tag, its score and a link to its parent."""

    def __init__(self, tag, prob, parent):
        # Link towards the root; a node built with parent=None has no ancestor.
        self.parent = parent
        # Child nodes attached to this node; starts out empty.
        self.tags = []
        self.tag, self.prob = tag, prob
        

class Viterbi:
    """Bigram HMM part-of-speech decoder.

    Reads the notebook-level tables ``tags`` (tag -> count), ``transition``
    (previous tag -> next tag -> probability, with "^" as the sentence-start
    pseudo-tag) and ``emission`` (tag -> word -> probability).
    """

    def find_tags(self, imp_nodes):
        """Return the tag path that leads to the best-scoring node.

        imp_nodes maps a key to a TreeNode. The node with the highest ``prob``
        is selected (the first one wins ties) and the tags of its ancestors
        are returned root-first. The selected node's own tag is NOT included.
        Returns an empty list when imp_nodes is empty.
        """
        max_node = None
        for tag_node in imp_nodes.values():
            if max_node is None or max_node.prob < tag_node.prob:
                max_node = tag_node
        if max_node is None:
            return []

        # Walk up to the root, then flip so the path reads root-first.
        path = []
        max_node = max_node.parent
        while max_node is not None:
            path.append(max_node.tag)
            max_node = max_node.parent
        path.reverse()
        return path

    def compute_states(self, sent):
        """Decode the most probable tag sequence for a sentence.

        sent is a whitespace-separated string. A period glued to the last
        token ("shrugged.") is split off into its own "." token.

        Returns a list starting with "^" followed by one tag per token;
        an empty or blank sentence yields ["^"].

        Notes:
        - A token with no positive emission probability under any tag
          (out of vocabulary) is given a neutral emission of 1 for every tag,
          so only the transition probabilities decide its tag.
        - Scores are plain probability products, so very long sentences can
          underflow to 0.0.
        """
        # split() without arguments tolerates leading/trailing/repeated
        # whitespace and never produces empty tokens.
        tokens = sent.split()
        if tokens and len(tokens[-1]) > 1 and tokens[-1].endswith("."):
            tokens[-1] = tokens[-1][:-1]
            tokens.append(".")

        root = TreeNode("^", 1, None)
        imp_nodes = {}

        # create first level of tree: one candidate per tag for the first token
        for tag in tags.keys():
            node = TreeNode(tag, transition["^"][tag], root)
            root.tags.append(node)
            imp_nodes[tag] = node

        # Nodes are shared rather than deep-copied: after creation they only
        # ever gain children, and deep-copying re-copied the whole ancestor
        # graph on every step.
        last_index = len(tokens) - 1
        for index, token in enumerate(tokens):
            in_vocab = any(emission[tag].get(token, 0) > 0 for tag in tags.keys())
            temp_best = {}
            for tag, tag_node in imp_nodes.items():
                emit = emission[tag].get(token, 0) if in_vocab else 1
                scored = tag_node.prob * emit
                if index == last_index:
                    # Nothing follows the last token, so no transition factor
                    # is applied; a single end marker closes the path.
                    candidates = [("$", scored)]
                else:
                    candidates = [
                        (child_tag, scored * transition[tag][child_tag])
                        for child_tag in tags.keys()
                    ]
                for child_tag, new_prob in candidates:
                    child = TreeNode(child_tag, new_prob, tag_node)
                    tag_node.tags.append(child)
                    # keep only the best-scoring way of reaching each tag
                    if child_tag not in temp_best or new_prob > temp_best[child_tag].prob:
                        temp_best[child_tag] = child
            imp_nodes = temp_best

        return self.find_tags(imp_nodes)
        
        

# Decode one example sentence; `states` receives the list returned by
# compute_states.
obj = Viterbi()
sent = "The hotel owner shrugged."
states = obj.compute_states(sent) 

In [24]:
# Display the decoded tag sequence.
states

['^', 'DET', 'NOUN', 'NOUN', 'VERB', '.']