In [394]:
# import libaries
import numpy as np
import pandas as pd
import nltk, pprint
import matplotlib.pyplot as plt
import random

import gzip, os, pickle # gzip for reading the gz files, pickle to save/dump trained model 
import _pickle as cPickle

import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV

# train-test split
from sklearn.model_selection import train_test_split

import time

# supress warnings
import warnings
warnings.filterwarnings('ignore')

In [395]:
# reading the Treebank tagged sentences
wsj = list(nltk.corpus.treebank.tagged_sents())

In [396]:
# Splitting into train and test
train_set, test_set = train_test_split(wsj,test_size=0.05, random_state=27)

In [397]:
# Getting list of tagged words
train_tagged_words = [tup for sent in train_set for tup in sent]
len(train_tagged_words)

95475

In [398]:
#train_tagged_words = [(tup[0],'VBN') if (tup[1]=='VBD' and [i-1] not in('VP','VB'))  else tup for i,tup in enumerate(train_tagged_words)]

In [399]:
# ING

In [400]:
train_tagged_words = [(tup[0],'VBG') if (tup[0].endswith('ing'))  else tup for tup in train_tagged_words]

In [401]:
# tokens 
tokens = [pair[0] for pair in train_tagged_words]
tokens[:10]

['Nissan',
 'Motor',
 'Co.',
 ',',
 'Japan',
 "'s",
 'second-largest',
 'car',
 'maker',
 ',']

In [402]:
# vocabulary
V = set(tokens)
print(len(V))

12042


In [403]:
# number of tags
T = set([pair[1] for pair in train_tagged_words])
len(T)

46

### Emission Probabilities

In [404]:
# computing P(w/t) and storing in T x V matrix
t = len(T)
v = len(V)
w_given_t = np.zeros((t, v))

In [405]:
# compute word given tag: Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
    
    return (count_w_given_tag, count_tag)

### Transition Probabilities

In [406]:
# compute tag given tag: tag2(t2) given tag1 (t1), i.e. Transition Probability

def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [407]:
# creating t x t transition matrix of tags
# each column is t2, each row is t1
# thus M(i, j) represents P(tj given ti)

tags_matrix = np.zeros((len(T), len(T)), dtype='float32')
for i, t1 in enumerate(list(T)):
    for j, t2 in enumerate(list(T)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

In [408]:
# convert the matrix to a df for better readability
tags_df = pd.DataFrame(tags_matrix, columns = list(T), index=list(T))

## Viterbi Algorithm

In [409]:
# Viterbi Heuristic
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))



In [410]:
# Running on entire test dataset would take more than 3-4hrs. 
# Let's test our Viterbi algorithm on a few sample sentences of test dataset

random.seed(27)

# choose random 5 sents
rndom = [random.randint(1,len(test_set)) for x in range(5)]

# list of sents
test_run = [test_set[i] for i in rndom]

# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]

# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]
test_run[0]

[('Sterling', 'NNP'),
 ("'s", 'POS'),
 ('firm', 'JJ'),
 ('tone', 'NN'),
 (',', ','),
 ('combined', 'VBN'),
 ('with', 'IN'),
 ('a', 'DT'),
 ('steady', 'JJ'),
 ('opening', 'NN'),
 ('on', 'IN'),
 ('Wall', 'NNP'),
 ('Street', 'NNP'),
 (',', ','),
 ('also', 'RB'),
 ('tempted', 'VBD'),
 ('some', 'DT'),
 ('investors', 'NNS'),
 ('to', 'TO'),
 ('come', 'VB'),
 ('back', 'RB'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('market', 'NN'),
 (',', ','),
 ('dealers', 'NNS'),
 ('said', 'VBD'),
 ('0', '-NONE-'),
 ('*T*-1', '-NONE-'),
 ('.', '.')]

In [411]:
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end-start

In [412]:
print("Time taken in seconds: ", difference)
print(tagged_seq)
#print(test_run_base)

Time taken in seconds:  104.9240825176239
[('Sterling', 'VBG'), ("'s", 'POS'), ('firm', 'NN'), ('tone', 'NN'), (',', ','), ('combined', 'VBN'), ('with', 'IN'), ('a', 'DT'), ('steady', 'JJ'), ('opening', 'VBG'), ('on', 'IN'), ('Wall', 'NNP'), ('Street', 'NNP'), (',', ','), ('also', 'RB'), ('tempted', '.'), ('some', 'DT'), ('investors', 'NNS'), ('to', 'TO'), ('come', 'VB'), ('back', 'RP'), ('to', 'TO'), ('the', 'DT'), ('market', 'NN'), (',', ','), ('dealers', 'NNS'), ('said', 'VBD'), ('0', '-NONE-'), ('*T*-1', '-NONE-'), ('.', '.'), ('Currently', 'RB'), (',', ','), ('margins', 'NNS'), ('on', 'IN'), ('stock', 'NN'), ('futures', 'NNS'), ('purchases', 'NNS'), ('are', 'VBP'), ('much', 'RB'), ('lower', 'JJR'), ('--', ':'), ('roughly', 'RB'), ('7', 'CD'), ('%', 'NN'), ('compared', 'VBN'), ('with', 'IN'), ('50', 'CD'), ('%', 'NN'), ('for', 'IN'), ('stocks', 'NNS'), ('--', ':'), ('*', '-NONE-'), ('making', 'VBG'), ('the', 'DT'), ('futures', 'NNS'), ('market', 'NN'), ('much', 'RB'), ('faster', 'J

In [413]:
# accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 
accuracy = len(check)/len(tagged_seq)
accuracy

0.851063829787234

In [414]:
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq, test_run_base)) if j[0]!=j[1]]
incorrect_tagged_cases

[[("''", "''"), (('Sterling', 'VBG'), ('Sterling', 'NNP'))],
 [("'s", 'POS'), (('firm', 'NN'), ('firm', 'JJ'))],
 [('steady', 'JJ'), (('opening', 'VBG'), ('opening', 'NN'))],
 [('also', 'RB'), (('tempted', '.'), ('tempted', 'VBD'))],
 [('come', 'VB'), (('back', 'RP'), ('back', 'RB'))],
 [('and', 'CC'), (('potentially', '.'), ('potentially', 'RB'))],
 [('potentially', 'RB'), (('more', 'JJR'), ('more', 'RBR'))],
 [('For', 'IN'), (('starters', '.'), ('starters', 'NNS'))],
 [('starters', 'NNS'), ((',', '.'), (',', ','))],
 [('H.', 'NNP'), (('Hudnut', '.'), ('Hudnut', 'NNP'))],
 [('an', 'DT'), (('evening', 'VBG'), ('evening', 'NN'))],
 [('Indianapolis', 'NNP'), (('Symphony', '.'), ('Symphony', 'NNP'))],
 [('Symphony', 'NNP'), (('Orchestra', '.'), ('Orchestra', 'NNP'))],
 [('a', 'DT'), (('guest', '.'), ('guest', 'NN'))],
 [('guest', 'NN'), (('pianist-comedian', '.'), ('pianist-comedian', 'NN'))],
 [('pianist-comedian', 'NN'), (('Victor', '.'), ('Victor', 'NNP'))],
 [('Victor', 'NNP'), (('Bor

In [None]:
# question 1: Find the number of unique POS tags in the corpus
incorrect_tags = [tags[1] for tags in incorrect_tagged_cases]
# question 2: Which is the most frequent tag in the corpus
from collections import Counter
incorrect_tags = Counter(incorrect_tags)
incorrect_tags

In [None]:
incorrect_tags.most_common(50)

In [None]:
def update_tags(tagged_list, word,tag):
    return [(tup[0],tag) if tup[0].endswith(word) else tup for tup in tagged_list]

In [None]:
#train_tagged_words = update_ing_words(train_tagged_words,'ing','VBG')
#len(train_tagged_words)

In [352]:
P1 = [j for i, tup in enumerate(train_tagged_words) if (tup[1]=='VBD' and [i-1] not in('VP','VB'))]
P1

[]

In [351]:
train_tagged_words = [(tup[0],'VBP') if (tup[1]=='VBD' and [i-1] not in('VP','VB'))  else tup for i,tup in enumerate(train_tagged_words)]

In [353]:
train_tagged_words

[('Nissan', 'NNP'),
 ('Motor', 'NNP'),
 ('Co.', 'NNP'),
 (',', ','),
 ('Japan', 'NNP'),
 ("'s", 'POS'),
 ('second-largest', 'JJ'),
 ('car', 'NN'),
 ('maker', 'NN'),
 (',', ','),
 ('announced', 'VBP'),
 ('Wednesday', 'NNP'),
 ('that', 'IN'),
 ('the', 'DT'),
 ('parent', 'NN'),
 ('concern', 'NN'),
 ("'s", 'POS'),
 ('pretax', 'NN'),
 ('earnings', 'NNS'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('first', 'JJ'),
 ('half', 'DT'),
 ('ended', 'VBP'),
 ('last', 'JJ'),
 ('Sept.', 'NNP'),
 ('30', 'CD'),
 ('rose', 'VBP'),
 ('14', 'CD'),
 ('%', 'NN'),
 ('to', 'TO'),
 ('88.32', 'CD'),
 ('billion', 'CD'),
 ('yen', 'NN'),
 ('-LRB-', '-LRB-'),
 ('$', '$'),
 ('618.1', 'CD'),
 ('million', 'CD'),
 ('*U*', '-NONE-'),
 ('-RRB-', '-RRB-'),
 ('from', 'IN'),
 ('77.6', 'CD'),
 ('billion', 'CD'),
 ('yen', 'NN'),
 ('a', 'DT'),
 ('year', 'NN'),
 ('earlier', 'JJR'),
 ('.', '.'),
 ('Sales', 'NNS'),
 ('were', 'VBP'),
 ('roughly', 'RB'),
 ('flat', 'JJ'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('1989', 'CD'),
 ('model', 'NN'),
 ('year