### **POS Tagging Model**


In [1]:
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time

In [2]:
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [3]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [4]:
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [5]:
print(nltk_data[:2])

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]


In [6]:
train_set,test_set =train_test_split(nltk_data,train_size=0.80,test_size=0.20,random_state = 101)

In [7]:
train_tagged_words = [ tup for sent in train_set for tup in sent ]
test_tagged_words = [ tup for sent in test_set for tup in sent ]

In [8]:
train_tagged_words[:5]


[('Drink', 'NOUN'),
 ('Carrier', 'NOUN'),
 ('Competes', 'VERB'),
 ('With', 'ADP'),
 ('Cartons', 'NOUN')]

In [9]:
tags = {tag for word,tag in train_tagged_words}

In [10]:
vocab = {word for word,tag in train_tagged_words}

In [11]:
print("\nCalculating Emission Probablity...")
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)#total number of times the passed tag occurred in train_bag
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
    return (count_w_given_tag, count_tag)


Calculating Emission Probablity...


In [12]:
print("Calculating Transmission Probablity...")
#! compute  Transition Probability
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

Calculating Transmission Probablity...


In [13]:
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)):
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

In [14]:
tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))
print(tags_df)

           ADP       NUM       PRT       ADJ      CONJ         .      VERB  \
ADP   0.016958  0.063275  0.001266  0.107062  0.001012  0.038724  0.008479   
NUM   0.037487  0.184220  0.026062  0.035345  0.014281  0.119243  0.020707   
PRT   0.019569  0.056751  0.001174  0.082975  0.002348  0.045010  0.401174   
ADJ   0.080583  0.021748  0.011456  0.063301  0.016893  0.066019  0.011456   
CONJ  0.055982  0.040615  0.004391  0.113611  0.000549  0.035126  0.150384   
.     0.092908  0.078210  0.002789  0.046132  0.060079  0.092372  0.089690   
VERB  0.092357  0.022836  0.030663  0.066390  0.005433  0.034807  0.167956   
DET   0.009918  0.022855  0.000287  0.206411  0.000431  0.017393  0.040247   
NOUN  0.176827  0.009144  0.043935  0.012584  0.042454  0.240094  0.149134   
ADV   0.119472  0.029868  0.014740  0.130721  0.006982  0.139255  0.339022   
PRON  0.022323  0.006834  0.014123  0.070615  0.005011  0.041913  0.484738   
X     0.142226  0.003075  0.185086  0.017682  0.010379  0.160869

In [15]:
random.seed(1234)

In [16]:
rndom = [random.randint(1,len(test_set)) for x in range(10)]

In [17]:
test_run = [test_set[i] for i in rndom]

In [18]:
test_run_base = [tup for sent in test_run for tup in sent]

In [19]:
test_tagged_words = [tup[0] for sent in test_run for tup in sent]

In [20]:
patterns = [
    (r'.*ing$', 'VERB'),              # gerund
    (r'.*ed$', 'VERB'),               # past tense
    (r'.*es$', 'VERB'),               # verb
    (r'.*\'s$', 'NOUN'),              # possessive nouns
    (r'.*s$', 'NOUN'),                # plural nouns
    (r'\*T?\*?-[0-9]+$', 'X'),        # X
    (r'^-?[0-9]+(.[0-9]+)?$', 'NUM'), # cardinal numbers
    (r'.*', 'NOUN')                   # nouns
]

In [21]:
rule_based_tagger = nltk.RegexpTagger(patterns)

In [22]:
print("\nRunning Viterbi Algorithm...")
#modified Viterbi to include rule based tagger in it
def Viterbi_rule_based(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))

    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = []
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]

            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p
            p.append(state_probability)

        pmax = max(p)
        state_max = rule_based_tagger.tag([word])[0][1]


        if(pmax==0):
            state_max = rule_based_tagger.tag([word])[0][1] # assign based on rule based tagger
        else:
            if state_max != 'X':
                # getting state for which probability is maximum
                state_max = T[p.index(pmax)]


        state.append(state_max)
    return list(zip(words, state))


Running Viterbi Algorithm...


In [23]:
start = time.time()
tagged_seq = Viterbi_rule_based(test_tagged_words)
end = time.time()
difference = end-start

In [24]:
print("\nTime taken in seconds: ", round(difference,2))


Time taken in seconds:  48.63


In [25]:
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j]

In [26]:
accuracy = len(check)/len(tagged_seq)
print('Viterbi Algorithm Accuracy: ',round(accuracy*100,2))

Viterbi Algorithm Accuracy:  97.13


In [27]:
test_sent="The brown cat jumped over the grey fox."
pred_tags_rule=Viterbi_rule_based(test_sent.split())

In [28]:
print(pred_tags_rule)

[('The', 'DET'), ('brown', 'NOUN'), ('cat', 'NOUN'), ('jumped', 'VERB'), ('over', 'ADP'), ('the', 'DET'), ('grey', 'NOUN'), ('fox.', 'NOUN')]
