In [18]:
# Importing libraries
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time

## Read data & Processing

In [19]:
#download the treebank corpus from nltk
nltk.download('treebank')
 
#download the universal tagset from nltk
nltk.download('universal_tagset')
 
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

[nltk_data] Downloading package treebank to /Users/he/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/he/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [20]:
#example for first two sentences
for sent in nltk_data[:2]:
  for tuple in sent:
    print(tuple)

('Pierre', 'NOUN')
('Vinken', 'NOUN')
(',', '.')
('61', 'NUM')
('years', 'NOUN')
('old', 'ADJ')
(',', '.')
('will', 'VERB')
('join', 'VERB')
('the', 'DET')
('board', 'NOUN')
('as', 'ADP')
('a', 'DET')
('nonexecutive', 'ADJ')
('director', 'NOUN')
('Nov.', 'NOUN')
('29', 'NUM')
('.', '.')
('Mr.', 'NOUN')
('Vinken', 'NOUN')
('is', 'VERB')
('chairman', 'NOUN')
('of', 'ADP')
('Elsevier', 'NOUN')
('N.V.', 'NOUN')
(',', '.')
('the', 'DET')
('Dutch', 'NOUN')
('publishing', 'VERB')
('group', 'NOUN')
('.', '.')


In [21]:
# split data into training and test set  
train_set,test_set =train_test_split(nltk_data,train_size=0.80,test_size=0.20,random_state = 121)

In [22]:
print("There are {} sentences in the corpus.".format(len(nltk_data)))
print("There are {} sentences in the training set.".format(len(train_set)))
print("There are {} sentences in the testing set.".format(len(test_set)))

There are 3914 sentences in the corpus.
There are 3131 sentences in the training set.
There are 783 sentences in the testing set.


In [23]:
# create list of tagged words
tagged_words = [ tup for sent in nltk_data for tup in sent ]
train_tagged_words = [ tup for sent in train_set for tup in sent ]
test_tagged_words = [ tup for sent in test_set for tup in sent ]

In [24]:
# number unique tags 
tags = {tag for word,tag in tagged_words}
train_tags = {tag for word,tag in train_tagged_words}
test_tags = {tag for word,tag in test_tagged_words}
print(len(tags))
print(len(train_tags))
print(len(test_tags))
print(tags)

12
12
12
{'NUM', 'PRON', 'VERB', '.', 'X', 'CONJ', 'PRT', 'ADV', 'ADP', 'NOUN', 'DET', 'ADJ'}


In [25]:
# number unique words  
vocab = {word for word,tag in tagged_words}
train_vocab = {word for word,tag in train_tagged_words}
test_vocab = {word for word,tag in test_tagged_words}

In [26]:
print("There are a total of {} samples of {} unique words in the corpus."
      .format(len(tagged_words), len(vocab)))
print("There are {} samples of {} unique words in the training set."
      .format(len(train_tagged_words), len(train_vocab)))
print("There are {} samples of {} unique words in the testing set."
      .format(len(test_tagged_words), len(test_vocab)))
print("There are {} words in the test set that are missing in the training set."
      .format(len(test_vocab - train_vocab)))

There are a total of 100676 samples of 12408 unique words in the corpus.
There are 80582 samples of 11091 unique words in the training set.
There are 20094 samples of 4758 unique words in the testing set.
There are 1317 words in the test set that are missing in the training set.


## Emission Probability

### Suppose in all sentences, there are 10 nouns, of which 3 are Will, 3 are Mary and 4 are Jack. Then P(Will|noun)=3/10, P(Mary|noun)=3/10, P(Jack|noun)=4/10

In [27]:
# Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list) #total number of times the passed tag occurred in train_bag
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list) # total number of times the passed word occurred as the passed tag
     
    return (count_w_given_tag, count_tag)

## Transition Probability

### Suppose in all sentences,Noun is followed by Verb by 10 times, and is followed by Noun by 3 times. Then P(Verb|Noun)=10/13, P(Noun|Noun)=3/13

In [28]:
# Transition Probability
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [29]:
# creating t x t transition matrix of tags, t= no of tags
# Matrix(i, j) represents P(jth tag after the ith tag)
 
tags_matrix = np.zeros((len(train_tags), len(train_tags)), dtype='float32')
for i, t1 in enumerate(list(train_tags)):
    for j, t2 in enumerate(list(train_tags)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]
 
print(tags_matrix)

[[1.84005663e-01 3.53857031e-04 1.73389949e-02 1.17480539e-01
  2.10191086e-01 1.38004245e-02 2.76008490e-02 2.47699930e-03
  3.46779898e-02 3.53857040e-01 3.53857037e-03 3.46779898e-02]
 [6.03528321e-03 7.89229386e-03 4.80965644e-01 4.17827293e-02
  9.61002782e-02 6.03528321e-03 1.20705664e-02 3.34261842e-02
  2.32126284e-02 2.07520887e-01 9.74930357e-03 7.52089173e-02]
 [2.21624635e-02 3.63501497e-02 1.67284861e-01 3.50519270e-02
  2.17080861e-01 4.91468841e-03 3.08790803e-02 8.19732919e-02
  9.19881314e-02 1.11646883e-01 1.36591241e-01 6.40764087e-02]
 [8.17435756e-02 6.55440688e-02 8.84578526e-02 9.22945738e-02
  2.68570818e-02 5.92561029e-02 2.02493882e-03 5.17957993e-02
  8.85644257e-02 2.25194499e-01 1.72439516e-01 4.57209833e-02]
 [2.45283009e-03 5.26415110e-02 2.07358494e-01 1.63018867e-01
  7.58490562e-02 1.03773586e-02 1.84150949e-01 2.60377359e-02
  1.45471692e-01 6.18867911e-02 5.43396212e-02 1.64150950e-02]
 [3.73011529e-02 5.97915538e-02 1.50850251e-01 3.56555134e-02
  9

In [30]:
# convert the matrix to a df for better readability
tags_df = pd.DataFrame(tags_matrix, columns = list(train_tags), index=list(train_tags))
display(tags_df)

Unnamed: 0,NUM,PRON,VERB,.,X,CONJ,PRT,ADV,ADP,NOUN,DET,ADJ
NUM,0.184006,0.000354,0.017339,0.117481,0.210191,0.0138,0.027601,0.002477,0.034678,0.353857,0.003539,0.034678
PRON,0.006035,0.007892,0.480966,0.041783,0.0961,0.006035,0.012071,0.033426,0.023213,0.207521,0.009749,0.075209
VERB,0.022162,0.03635,0.167285,0.035052,0.217081,0.004915,0.030879,0.081973,0.091988,0.111647,0.136591,0.064076
.,0.081744,0.065544,0.088458,0.092295,0.026857,0.059256,0.002025,0.051796,0.088564,0.225194,0.17244,0.045721
X,0.002453,0.052642,0.207358,0.163019,0.075849,0.010377,0.184151,0.026038,0.145472,0.061887,0.05434,0.016415
CONJ,0.037301,0.059792,0.15085,0.035656,0.009325,0.0,0.004937,0.053209,0.055403,0.351618,0.121777,0.120132
PRT,0.056567,0.018597,0.396358,0.044169,0.013948,0.002712,0.001937,0.009686,0.017823,0.25184,0.102286,0.084076
ADV,0.032937,0.014683,0.350794,0.131746,0.022619,0.006746,0.015476,0.079762,0.118254,0.029365,0.068651,0.128968
ADP,0.062301,0.066879,0.009027,0.038398,0.033821,0.00089,0.00089,0.013223,0.016529,0.326256,0.327654,0.104132
NOUN,0.009591,0.004516,0.145536,0.240323,0.030019,0.042276,0.044297,0.016773,0.175899,0.266386,0.012816,0.011569


### Note: The third row above is regarded as Initial Probability, i.e., P(initial=noun)=P(noun| .)

## Viterbi alg

In [31]:
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
     
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                 
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
             
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

## HMM Train Accuracy 跑了1.5hr还没出来，放弃！

In [32]:
train_tagged_words = [tup for sent in train_set for tup in sent]
train_untagged_words = [tup[0] for sent in train_set for tup in sent]

In [33]:
start = time.time()
train_tagged_seq = Viterbi(train_untagged_words)
end = time.time()
difference = end-start
 
print("Time taken in seconds: ", difference)

Time taken in seconds:  76119.21822309494


In [34]:
# accuracy
train_check = [i for i, j in zip(train_tagged_seq, train_tagged_words) if i == j] 
 
train_accuracy = len(train_check)/len(train_tagged_seq)
print('Viterbi Algorithm Train Accuracy: ',train_accuracy*100)

Viterbi Algorithm Train Accuracy:  97.79231093792659


## HMM Test Accuracy (Running Time: 1 hr!!!)

In [35]:
# tagging the test sentences
test_tagged_words = [tup for sent in test_set for tup in sent]
test_untagged_words = [tup[0] for sent in test_set for tup in sent]
#test_untagged_words

In [36]:
start = time.time()
test_tagged_seq = Viterbi(test_untagged_words)
end = time.time()
difference = end-start
 
print("Time taken in seconds: ", difference)

Time taken in seconds:  3881.384532928467


In [37]:
print(test_tagged_seq)



In [38]:
# accuracy
test_check = [i for i, j in zip(test_tagged_seq, test_tagged_words) if i == j] 
 
test_accuracy = len(test_check)/len(test_tagged_seq)
print('Viterbi Algorithm Test Accuracy: ',accuracy*100)

NameError: name 'accuracy' is not defined