## POS tagging using modified Viterbi

### 1. Data Preparation

In [1]:
#Importing libraries
import numpy as np
import pandas as pd

import nltk

import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

In [2]:
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

#### Exploring corpus

In [3]:
# checking tagged sentences
nltk_data[:2]

[[('Pierre', 'NOUN'),
  ('Vinken', 'NOUN'),
  (',', '.'),
  ('61', 'NUM'),
  ('years', 'NOUN'),
  ('old', 'ADJ'),
  (',', '.'),
  ('will', 'VERB'),
  ('join', 'VERB'),
  ('the', 'DET'),
  ('board', 'NOUN'),
  ('as', 'ADP'),
  ('a', 'DET'),
  ('nonexecutive', 'ADJ'),
  ('director', 'NOUN'),
  ('Nov.', 'NOUN'),
  ('29', 'NUM'),
  ('.', '.')],
 [('Mr.', 'NOUN'),
  ('Vinken', 'NOUN'),
  ('is', 'VERB'),
  ('chairman', 'NOUN'),
  ('of', 'ADP'),
  ('Elsevier', 'NOUN'),
  ('N.V.', 'NOUN'),
  (',', '.'),
  ('the', 'DET'),
  ('Dutch', 'NOUN'),
  ('publishing', 'VERB'),
  ('group', 'NOUN'),
  ('.', '.')]]

In [4]:
# unique tags applied to the words in the corpus
print(set([tpl[1] for pair in nltk_data for tpl in pair]))

{'X', 'DET', 'PRT', 'CONJ', 'ADJ', 'ADP', 'NOUN', '.', 'VERB', 'ADV', 'NUM', 'PRON'}


In [5]:
# checking unique words in the corpus, which does not have 'X', 'NUM', or '.' tags.

print(len(set([tlp[0] for pair in nltk_data for tlp in pair 
                                                  if (tlp[1] != 'X' and tlp[1] != '.' and tlp[1] != 'NUM')
              ]
         )))

10992


In [6]:
# checking the text for which 'NOUN' tag is used

print(set(
    [tpl[0].lower() for pair in nltk_data for tpl in pair 
                                        if tpl[1] == 'NOUN']
         ))



In [7]:
# checking the text for which 'X' tag is used

print(set(
    [tpl[0].lower() for pair in nltk_data for tpl in pair 
                                        if tpl[1] == 'X']
         ))

# as per the documentation 'X' tag is used for foreign words.
# https://www.nltk.org/_modules/nltk/tag/mapping.html

{'*t*-44', '*-128', '*-1', '*t*-3', '*-36', '*t*-5', '*t*-46', '*ich*-4', '*t*-33', '*t*-229', '*t*-242', '*-97', '*-130', '*-11', '*-27', '*t*-244', '*-104', '*-3', '3', '*t*-164', '0', '*-84', '*t*-103', '*t*-28', '*t*-75', '*-137', '*t*-238', '*t*-54', '*t*-22', '*t*-13', '*t*-183', '*t*-121', '*t*-148', '*t*-68', '*-69', '*t*-111', '*t*-60', '*t*-40', '*t*-106', '*t*-139', '*t*-14', '*rnr*-4', '*-16', '*t*-15', '*-64', '*t*-253', '*t*-18', '*t*-32', '*t*-207', '*t*-115', '*t*-88', '5', '*-23', '*t*-102', '*t*-128', '*t*-38', '*-38', '*t*-156', '*-100', 'besuboru', '*t*-45', '*t*-126', '*t*-227', '*t*-119', '*-138', 'oh', '*-99', '*t*-6', '*t*-120', '*t*-221', '*t*-118', '*t*-151', '*-87', '*-115', '*t*-143', '*t*-138', '*t*-73', '*-53', '*t*-153', '*ppa*-1', '*t*-245', '*-54', 'no', '*-24', '*t*-224', 'etc.', '*-93', '*-111', '*-5', '*t*-55', '*t*-234', '*t*-114', '*t*-137', '*t*-37', '*t*-21', '*t*-97', '*t*-210', '*t*-214', '*t*-20', '*-78', '*-76', '*t*-61', '*t*-67', '*-12', '*

#### Splitting the data into train and test sets

In [8]:
# as per the instructions, splitting the data into 95% train set and 5% test set
random.seed(1234)
train_set, test_set = train_test_split(nltk_data,test_size=0.05)

print(len(train_set))
print(len(test_set))
print(train_set[:5])

3718
196
[[('I', 'PRON'), ('would', 'VERB'), ('predict', 'VERB'), ('that', 'ADP'), ('within', 'ADP'), ('a', 'DET'), ('short', 'ADJ'), ('time', 'NOUN'), ('most', 'ADJ'), ('of', 'ADP'), ('them', 'PRON'), ('would', 'VERB'), ('find', 'VERB'), ('Thunderbird', 'NOUN'), ('a', 'DET'), ('satisfactory', 'ADJ'), ('substitute', 'NOUN'), ('for', 'ADP'), ('Chivas', 'NOUN'), ('Regal', 'NOUN'), ('and', 'CONJ'), ('that', 'ADP'), ('their', 'PRON'), ('``', '.'), ('normal', 'ADJ'), ("''", '.'), ('phobias', 'NOUN'), (',', '.'), ('anxieties', 'NOUN'), (',', '.'), ('depressions', 'NOUN'), ('and', 'CONJ'), ('substance', 'NOUN'), ('abuse', 'NOUN'), ('would', 'VERB'), ('increase', 'VERB'), ('dramatically', 'ADV'), ('.', '.')], [('Newsweek', 'NOUN'), ('said', 'VERB'), ('0', 'X'), ('it', 'PRON'), ('will', 'VERB'), ('introduce', 'VERB'), ('the', 'DET'), ('Circulation', 'NOUN'), ('Credit', 'NOUN'), ('Plan', 'NOUN'), (',', '.'), ('which', 'DET'), ('*T*-1', 'X'), ('awards', 'VERB'), ('space', 'NOUN'), ('credits', 'NO

### 2. Build the vanilla Viterbi based POS tagger

In [9]:
# Getting list of tagged words
train_tagged_words = [tup for sent in train_set for tup in sent]

# number of tags
T = set([pair[1] for pair in train_tagged_words])

#### Emission probability

In [10]:
# compute word given tag: Emission Probability

def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1].lower()==tag.lower()]
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0].lower()==word.lower()]
    
    return (len(w_given_tag_list), len(tag_list))

#### Transition probability

In [11]:
# compute tag given tag: tag2(t2) given tag1 (t1), i.e. Transition Probability

def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

#### Viterbi Algorithm

In [12]:
# creating t x t transition matrix of tags
# each column is t2, each row is t1
# thus M(i, j) represents P(tj given ti)

tags_matrix = np.zeros((len(T), len(T)), dtype='float32')
for i, t1 in enumerate(list(T)):
    for j, t2 in enumerate(list(T)):
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

In [13]:
# convert the matrix to a df for better readability
tags_df = pd.DataFrame(tags_matrix, columns = list(T), index=list(T))

In [14]:
# Viterbi Heuristic
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    
    #extracting unique tags from the text
    T = list(set([pair[1] for pair in train_bag]))
    
    #enumerate creates a tuple with key and the individual value passed in the enumerate function.
    #in this case words. 
    
    #E.g. l1 = ["eat","sleep","repeat"] 
    # print(l1) will result in (0, 'eat'), (1, 'sleep'), (2, 'repeat')
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            #######
            #print("%s %s %s" % (emission_p, word, tag))
            #######
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
    # the loop executes for (words*tags) times. Therefore time is O(nt); where, n=words and t=tags.
            
        #print(p)
        pmax = max(p)
        ######
        #print('pmax = %f, %s, %s' % (pmax, word, tag))
        ######
        
        
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        ######
        #print('pmax = %f, %s, %s' % (pmax, word, state_max))
        ######
        
        state.append(state_max)
    return list(zip(words, state))

#### Evaluation on Test data set

In [15]:
# creating a function to show list of words which were incorrectly tagged.

def correction(output_of_viterbi_algo, testing_set):
    return [
                [
                    j
                ] 

                for i, j in enumerate(zip(output_of_viterbi_algo, testing_set))
                    if j[0]!=j[1]
            ]

### 3. Solve the problem of unknown words

#### 3.1 Analyzing how Viterbi Algoritm tags the new words

Identifying patterns which may help in improving the problem of unknown words.

In [16]:
# reading the Test_sentences.txt file and applying the viterbi algo, to see how the words are tagged.

test_text = open('Test_sentences.txt', 'r')
words = word_tokenize(test_text.read())
viterbi_tagged_wrds = Viterbi(words)

Here we can see that words (nouns) like 'Android', 'Google', 'Twitter', 'FIFA' etc. are tagged as 'ADP - adpositions (prepositions and postpositions)', because they were not present in the earlier corpus.

In [17]:
# checking unique words in the corpus, which are marked as 'ADP'. Most of the nouns are wrongly marked as ADP

print(
    set(
        [tlp for tlp in viterbi_tagged_wrds if tlp[1] == 'ADP']
    )
)

{('since', 'ADP'), ('about', 'ADP'), ('Before', 'ADP'), ('on', 'ADP'), ('at', 'ADP'), ('like', 'ADP'), ('with', 'ADP'), ('in', 'ADP'), ('of', 'ADP'), ('by', 'ADP'), ('as', 'ADP'), ('that', 'ADP'), ('from', 'ADP')}


From above we can see that the numbers and nouns are not tagged properly. They are by default tagged as 'ADP' because these words were not present in the universal corpus. Hence will create below patterns which tags the number, nouns properly.

<u>Analysis of tagging with Vanilla Viterbi algorithm</u>
1. Proper nouns like 'Android', 'Google', 'Twitter', 'FIFA' etc. start with a capital letter.
2. Plural nouns like 'tweets', 'trips', 'messages' etc which ends with an 's' are also in correctly marked as ADP

#### 3.2 Lexicon (Unigram) Tagger

In [18]:
lexicon_tagger = nltk.UnigramTagger(train_set)
wrds_only_lex_tagger = lexicon_tagger.tag(words)

<u>Analysis of tagging with UnigramTagger algorithm</u>
1. Since this tagger is trained with NLTK's Universal corpus, it is not able to tag new words like 'Android', 'Google', 'Twitter', 'domineering' etc.
2. Words like 'networking' are correctly tagged as Noun, where as other verbs like 'entering', 'leaving' are tagged correctly.

<u>Conclusion</u>
There are many new words in the Test_sentences.txt files which UnigramTagger is not able to identify. Hence we should use a secondary Rule based tagger for such words.

#### 3.3 Rule-Based (Regular Expression) Tagger

In [19]:
# specifying patterns for tagging as per the patterns found in the test sentences
# example from the NLTK book
patterns = [
    (r'.*ing$', 'VERB'),                             # gerund
    (r'.*ed$', 'VERB'),                              # past tense
    (r'.*es$', 'VERB'),                              # 3rd singular present
    (r'.*ould$', 'PRT'),                             # modals
    (r'[A-Z]{1}([a-z]{1,})?([A-Z]{1,})?', 'NOUN'),   # words with capital letters
    (r'.*\'s$', 'NOUN'),                             # possessive nouns
    (r'.*s$', 'NOUN'),                               # plural nouns
    (r'([0-9])+(\-|\/)?([a-z]{2,3})?(\-)?', 'NUM'),  # cardinal numbers and dates
    (r'.*', 'NOUN')                                  # any word not found will be tagged as a noun
]

In [20]:
# Rule based tagger
rule_based_tagger = nltk.RegexpTagger(patterns)
wrds_rule_tagger = rule_based_tagger.tag(words)

<u>Analysis of tagging with Rule Based Tagger</u>
1. Since the we have specified the final rule as a Noun, all the words that RegEx is not able to parse are tagged as 'NOUN'.
2. Words like 'networking' which are ending with 'ing' are tagged as Verb, where as it is a Noun.

<u>Conclusion</u>
Since the Rule Based (RegEx based) Tagger is not efficiently tagging the POS, we'll need another Tagger and we should use the RegEx tagger as a secondary tagger. 

#### 3.4 Combined Taggers

In [21]:
# initializing the variables again to avoid any confusion.

# rule based tagger
rule_based_tagger = nltk.RegexpTagger(patterns)

# lexicon tagger, backed up by the rule-based tagger
# lexicon tagger is trained on the NLTK's Universal corpus.

lexicon_tagger = nltk.UnigramTagger(train_set, backoff=rule_based_tagger)

wrds_comb_tagger = lexicon_tagger.tag(words)

### 4. Evaluating tagging accuracy

#### Evaluating tagging accuracy on Viterbi algorithm (checking Universal's train data vs test data)

In [22]:
# Running on entire test dataset would take more than 3-4hrs. 
# Let's test our Viterbi algorithm on a few sample sentences of test dataset

random.seed(1234)

# choose random 20 sents
rndom = [random.randint(1,len(test_set)) for x in range(20)]

# list of sents
test_run = [test_set[i] for i in rndom]

# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]

# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]

In [23]:
# tagging the test sentences
tagged_seq = Viterbi(test_tagged_words)

In [24]:
# accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j]
print(len(check)/len(tagged_seq))

0.9024390243902439


#### Evaluating tagging accuracy of Lexicon and Rule based taggers

In [25]:
lexicon_tagger.evaluate(test_set)

0.9494811043665558

In [26]:
rule_based_tagger.evaluate(test_set)

0.3436459761112199

### 5. List down cases which were incorrectly tagged by original POS tagger and got corrected by your modifications

In [27]:
correction(viterbi_tagged_wrds, wrds_comb_tagger)

[[(('Android', 'X'), ('Android', 'NOUN'))],
 [(('Google', 'X'), ('Google', 'NOUN'))],
 [(('Android', 'X'), ('Android', 'NOUN'))],
 [(('OS', 'X'), ('OS', 'NOUN'))],
 [(('worldwide', 'X'), ('worldwide', 'NOUN'))],
 [(('smartphones', 'X'), ('smartphones', 'VERB'))],
 [(('2011', 'X'), ('2011', 'NUM'))],
 [(('2013', 'X'), ('2013', 'NUM'))],
 [(('Google', 'X'), ('Google', 'NOUN'))],
 [(('Twitter', 'X'), ('Twitter', 'NOUN'))],
 [(('2015', 'X'), ('2015', 'NUM'))],
 [(('Google', 'X'), ('Google', 'NOUN'))],
 [(('Twitter', 'X'), ('Twitter', 'NOUN'))],
 [(('firehose', 'X'), ('firehose', 'NOUN'))],
 [(('Twitter', 'X'), ('Twitter', 'NOUN'))],
 [(('online', 'X'), ('online', 'NOUN'))],
 [(('interact', 'X'), ('interact', 'NOUN'))],
 [(('messages', 'X'), ('messages', 'VERB'))],
 [(('tweets', 'X'), ('tweets', 'NOUN'))],
 [(('domineering', 'X'), ('domineering', 'VERB'))],
 [(('personality', 'X'), ('personality', 'NOUN'))],
 [(('2018', 'X'), ('2018', 'NUM'))],
 [(('FIFA', 'X'), ('FIFA', 'NOUN'))],
 [(('Cup