# Hidden Markov Model

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter, defaultdict,namedtuple
from pprint import pprint
%matplotlib inline

## Importing Data

In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the dataset
# paths used by the cells below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!ls -ltr "/content/drive/My Drive/Dataset/Seq_Learn_Assign_1"

total 6328
-rw------- 1 root root     120 Apr 14  2020 HMM_Train_NER.txt
-rw------- 1 root root     237 Apr 14  2020 HMM_Train_Sentences.txt
-rw------- 1 root root 6478648 Apr 14  2020 CRF_POS_dataset.csv


In [None]:
## Function to read the file and return list of words and sentences

def read_file(filename):
  """Read a whitespace-tokenised text file.

  Every line of ``filename`` has its trailing whitespace removed and is then
  split on runs of whitespace.

  Returns a list holding one token list per line, in file order; a blank
  line contributes an empty list.
  """
  with open(filename) as handle:
    return [raw_line.rstrip().split() for raw_line in handle]

In [None]:
## Load the training sentences: read_file returns one token list per line
filename = "/content/drive/My Drive/Dataset/Seq_Learn_Assign_1/HMM_Train_Sentences.txt"
sentence = read_file(filename)
print(sentence)

[['Bailey', 'named', 'Australia', 'captain'], ['Starc', 'player', 'of', '2015', 'World', 'Cup'], ['Australia', 'won', '2003', '2007', '2015', 'Cups'], ['Melbourne', 'Starc', 'Warner', 'knocks', 'etched', 'in', 'memory'], ['2003', 'SA', '2007', 'WI', '2015', 'Australia', 'were', 'venues'], ['Starc', 'Warner', 'Melbourne', 'go', 'as', 'great', 'combination']]


In [None]:
## Load the NER tags: one tag list per line, read the same way as the
## sentences file (the recorded outputs show matching lengths per line)
filename = "/content/drive/My Drive/Dataset/Seq_Learn_Assign_1/HMM_Train_NER.txt"
ner = read_file(filename)
print(ner)

[['PER', 'O', 'GEO', 'O'], ['PER', 'O', 'O', 'TIM', 'O', 'O'], ['GEO', 'O', 'TIM', 'TIM', 'TIM', 'O'], ['GEO', 'PER', 'PER', 'O', 'O', 'O', 'O'], ['TIM', 'GEO', 'TIM', 'GEO', 'TIM', 'GEO', 'O', 'O'], ['PER', 'PER', 'GEO', 'O', 'O', 'O', 'O']]


## Question 1.1

In [None]:
## First tag of every sentence
start_ner_list = [tags[0] for tags in ner]
print(start_ner_list)

['PER', 'PER', 'GEO', 'GEO', 'TIM', 'PER']


In [None]:
## Last tag of every sentence

end_ner_list = [tags[-1] for tags in ner]
print(end_ner_list)

['O', 'O', 'O', 'O', 'O', 'O']


In [None]:
print(sentence)

[['Bailey', 'named', 'Australia', 'captain'], ['Starc', 'player', 'of', '2015', 'World', 'Cup'], ['Australia', 'won', '2003', '2007', '2015', 'Cups'], ['Melbourne', 'Starc', 'Warner', 'knocks', 'etched', 'in', 'memory'], ['2003', 'SA', '2007', 'WI', '2015', 'Australia', 'were', 'venues'], ['Starc', 'Warner', 'Melbourne', 'go', 'as', 'great', 'combination']]


## Question 1.2 - Creating Unigram and Bigram Tokens. Also NER Word Count

In [None]:
## Creating Word Tag

def generate_word_tag(sentence,ner,ignore_tag=None):
  """Flatten the corpus into (sentence id, word, tag) triples.

  ``sentence`` and ``ner`` are parallel lists (one token list / tag list per
  sentence). Sentence ids are the strings "Sentence_1", "Sentence_2", ...
  in corpus order. Tokens whose tag equals ``ignore_tag`` are left out;
  with the default ``None`` nothing is filtered for string tags.

  Returns the triples as one flat list, in reading order.
  """
  triples = []
  for number, (words, tags) in enumerate(zip(sentence, ner), start=1):
    label = "Sentence_" + str(number)
    # The word is looked up by the tag's position, and only for kept tags.
    triples.extend(
      (label, words[position], tag)
      for position, tag in enumerate(tags)
      if tag != ignore_tag
    )
  return triples

In [None]:
generate_word_tag(sentence,ner)

[('Sentence_1', 'Bailey', 'PER'),
 ('Sentence_1', 'named', 'O'),
 ('Sentence_1', 'Australia', 'GEO'),
 ('Sentence_1', 'captain', 'O'),
 ('Sentence_2', 'Starc', 'PER'),
 ('Sentence_2', 'player', 'O'),
 ('Sentence_2', 'of', 'O'),
 ('Sentence_2', '2015', 'TIM'),
 ('Sentence_2', 'World', 'O'),
 ('Sentence_2', 'Cup', 'O'),
 ('Sentence_3', 'Australia', 'GEO'),
 ('Sentence_3', 'won', 'O'),
 ('Sentence_3', '2003', 'TIM'),
 ('Sentence_3', '2007', 'TIM'),
 ('Sentence_3', '2015', 'TIM'),
 ('Sentence_3', 'Cups', 'O'),
 ('Sentence_4', 'Melbourne', 'GEO'),
 ('Sentence_4', 'Starc', 'PER'),
 ('Sentence_4', 'Warner', 'PER'),
 ('Sentence_4', 'knocks', 'O'),
 ('Sentence_4', 'etched', 'O'),
 ('Sentence_4', 'in', 'O'),
 ('Sentence_4', 'memory', 'O'),
 ('Sentence_5', '2003', 'TIM'),
 ('Sentence_5', 'SA', 'GEO'),
 ('Sentence_5', '2007', 'TIM'),
 ('Sentence_5', 'WI', 'GEO'),
 ('Sentence_5', '2015', 'TIM'),
 ('Sentence_5', 'Australia', 'GEO'),
 ('Sentence_5', 'were', 'O'),
 ('Sentence_5', 'venues', 'O'),
 ('Se

In [None]:
# Creating a ngram function for NER tags only
def generate_ngram_token(sentence,ner,ngram = 1,default=True):
  """Build the list of tag n-grams for the tagged corpus.

  Parameters
  ----------
  sentence : list of token lists, one per sentence.
  ner : list of tag lists, parallel to ``sentence``.
  ngram : size of the n-gram; must be at least 1.
  default : when truthy, tokens tagged 'O' are dropped before the n-grams
    are formed; otherwise every token is kept.

  Returns
  -------
  For ``ngram == 1`` a list of tag strings; otherwise a list of tuples of
  ``ngram`` tags. An n-gram never spans two sentences.

  Raises
  ------
  ValueError if ``ngram`` is smaller than 1.

  Side effect: prints the requested n-gram size.
  """
  if ngram < 1:
    # Sizes below 1 used to yield meaningless empty tuples.
    raise ValueError("ngram must be >= 1, got {}".format(ngram))

  # (sentence id, word, tag) triples, optionally without the 'O' tag.
  tagged = generate_word_tag(sentence, ner, 'O' if default else None)

  print("For ngram = {}".format(ngram))

  if ngram == 1:
    return [entry[2] for entry in tagged]

  ngram_list = []
  for first in range(len(tagged) - ngram + 1):
    window = tagged[first:first + ngram]
    # Triples are in corpus order, so equal sentence ids at both ends of the
    # window mean the whole window lies inside one sentence.
    if window[0][0] == window[-1][0]:
      ngram_list.append(tuple(entry[2] for entry in window))

  return ngram_list



In [None]:
## Tag unigrams over the whole corpus. default=False keeps every tag,
## including the 'O' (non-entity) tag.
Unigram_token = generate_ngram_token(sentence,ner,1,default=False)
print(Unigram_token)

For ngram = 1
['PER', 'O', 'GEO', 'O', 'PER', 'O', 'O', 'TIM', 'O', 'O', 'GEO', 'O', 'TIM', 'TIM', 'TIM', 'O', 'GEO', 'PER', 'PER', 'O', 'O', 'O', 'O', 'TIM', 'GEO', 'TIM', 'GEO', 'TIM', 'GEO', 'O', 'O', 'PER', 'PER', 'GEO', 'O', 'O', 'O', 'O']


In [None]:
## Tag bigrams within each sentence. default=False keeps every tag,
## including the 'O' (non-entity) tag.
Bigram_token = generate_ngram_token(sentence,ner,2,default=False)
print(Bigram_token)

For ngram = 2
[('PER', 'O'), ('O', 'GEO'), ('GEO', 'O'), ('PER', 'O'), ('O', 'O'), ('O', 'TIM'), ('TIM', 'O'), ('O', 'O'), ('GEO', 'O'), ('O', 'TIM'), ('TIM', 'TIM'), ('TIM', 'TIM'), ('TIM', 'O'), ('GEO', 'PER'), ('PER', 'PER'), ('PER', 'O'), ('O', 'O'), ('O', 'O'), ('O', 'O'), ('TIM', 'GEO'), ('GEO', 'TIM'), ('TIM', 'GEO'), ('GEO', 'TIM'), ('TIM', 'GEO'), ('GEO', 'O'), ('O', 'O'), ('PER', 'PER'), ('PER', 'GEO'), ('GEO', 'O'), ('O', 'O'), ('O', 'O'), ('O', 'O')]


In [None]:
print(sentence)

[['Bailey', 'named', 'Australia', 'captain'], ['Starc', 'player', 'of', '2015', 'World', 'Cup'], ['Australia', 'won', '2003', '2007', '2015', 'Cups'], ['Melbourne', 'Starc', 'Warner', 'knocks', 'etched', 'in', 'memory'], ['2003', 'SA', '2007', 'WI', '2015', 'Australia', 'were', 'venues'], ['Starc', 'Warner', 'Melbourne', 'go', 'as', 'great', 'combination']]


In [None]:
## Calculating NER Word count
def calc_ner_count(sentence,ner):
  """Count how often each word occurs with each tag.

  Uses every (sentence id, word, tag) triple produced by generate_word_tag
  for the corpus; no tag is filtered out.

  Returns a two-level defaultdict mapping tag -> word -> count.
  """
  counts = defaultdict(lambda: defaultdict(int))

  for record in generate_word_tag(sentence, ner):
    word, tag = record[1], record[2]
    counts[tag][word] += 1

  return counts

In [None]:
POS_wordsFreq = calc_ner_count(sentence,ner)

In [None]:
pprint(POS_wordsFreq)

defaultdict(<function calc_ner_count.<locals>.<lambda> at 0x7f9e2fb6a0e0>,
            {'GEO': defaultdict(<class 'int'>,
                                {'Australia': 3,
                                 'Melbourne': 2,
                                 'SA': 1,
                                 'WI': 1}),
             'O': defaultdict(<class 'int'>,
                              {'Cup': 1,
                               'Cups': 1,
                               'World': 1,
                               'as': 1,
                               'captain': 1,
                               'combination': 1,
                               'etched': 1,
                               'go': 1,
                               'great': 1,
                               'in': 1,
                               'knocks': 1,
                               'memory': 1,
                               'named': 1,
                               'of': 1,
                               'player': 1,
       

## Question 1.3 - Create the Hidden Markov Model, with Start, End and Emission Probabilities

In [None]:
!pip install pomegranate



In [None]:
# Importing the libraries

from pomegranate import State, HiddenMarkovModel, DiscreteDistribution

In [None]:
#Build Hidden Markov Model
hmm_model = HiddenMarkovModel(name="POS-Tagger")

In [None]:
## One HMM state per tag. Its emission distribution is the relative
## frequency of each word observed with that tag.
to_states = []
for tag_name in POS_wordsFreq:
    word_counts = POS_wordsFreq[tag_name]
    n_tokens = float(sum(word_counts.values()))
    print('------------------------------------')
    print(tag_name,' total',n_tokens)
    emissions = {}
    for token, freq in word_counts.items():
        emissions[token] = freq/n_tokens
    print(emissions)
    to_states.append(State(DiscreteDistribution(emissions), name=tag_name))


------------------------------------
PER  total 6.0
{'Bailey': 0.16666666666666666, 'Starc': 0.5, 'Warner': 0.3333333333333333}
------------------------------------
O  total 18.0
{'named': 0.05555555555555555, 'captain': 0.05555555555555555, 'player': 0.05555555555555555, 'of': 0.05555555555555555, 'World': 0.05555555555555555, 'Cup': 0.05555555555555555, 'won': 0.05555555555555555, 'Cups': 0.05555555555555555, 'knocks': 0.05555555555555555, 'etched': 0.05555555555555555, 'in': 0.05555555555555555, 'memory': 0.05555555555555555, 'were': 0.05555555555555555, 'venues': 0.05555555555555555, 'go': 0.05555555555555555, 'as': 0.05555555555555555, 'great': 0.05555555555555555, 'combination': 0.05555555555555555}
------------------------------------
GEO  total 7.0
{'Australia': 0.42857142857142855, 'Melbourne': 0.2857142857142857, 'SA': 0.14285714285714285, 'WI': 0.14285714285714285}
------------------------------------
TIM  total 7.0
{'2015': 0.42857142857142855, '2003': 0.2857142857142857, '

In [None]:
print(type(to_states[0]))
print(len(to_states))
print([state.name for state  in to_states])
print(to_states[0])

<class 'pomegranate.base.State'>
4
['PER', 'O', 'GEO', 'TIM']
{
    "class" : "State",
    "distribution" : {
        "class" : "Distribution",
        "dtype" : "str",
        "name" : "DiscreteDistribution",
        "parameters" : [
            {
                "Bailey" : 0.16666666666666666,
                "Starc" : 0.5,
                "Warner" : 0.3333333333333333
            }
        ],
        "frozen" : false
    },
    "name" : "PER",
    "weight" : 1.0
}


In [None]:
## Frequency tables used for the HMM probabilities below
# How many sentences start with each tag
start_POS_count = Counter(start_ner_list)
print(start_POS_count)
# How many sentences end with each tag
end_POS_count = Counter(end_ner_list)
print(end_POS_count)
# How often each tag occurs overall
POS_count_ug = Counter(Unigram_token)
print(POS_count_ug)
# How often each (tag, next tag) pair occurs inside a sentence
POS_count_bg = Counter(Bigram_token)
print(POS_count_bg)

Counter({'PER': 3, 'GEO': 2, 'TIM': 1})
Counter({'O': 6})
Counter({'O': 18, 'GEO': 7, 'TIM': 7, 'PER': 6})
Counter({('O', 'O'): 9, ('GEO', 'O'): 4, ('PER', 'O'): 3, ('TIM', 'GEO'): 3, ('O', 'TIM'): 2, ('TIM', 'O'): 2, ('TIM', 'TIM'): 2, ('PER', 'PER'): 2, ('GEO', 'TIM'): 2, ('O', 'GEO'): 1, ('GEO', 'PER'): 1, ('PER', 'GEO'): 1})


In [None]:
## Calculating the start and end probabilities
pos_tags = generate_ngram_token(sentence,ner,1,default=False)

# Every sentence contributes exactly one start tag, so this is the number
# of sentences.
n_sentences = sum(start_POS_count.values())

start_prob, end_prob = {}, {}
# dict.fromkeys keeps the first-seen tag order while visiting each tag once.
for ps in dict.fromkeys(pos_tags):
    # P(tag | sentence start): share of sentences that begin with this tag.
    # Dividing by the tag's own unigram count would give P(start | tag),
    # which does not sum to 1 over the tags.
    start_prob[ps]=start_POS_count[ps]/n_sentences
    # P(sentence end | tag): share of this tag's occurrences that close a
    # sentence; together with the tag-to-tag transitions it sums to 1.
    end_prob[ps]=end_POS_count[ps]/POS_count_ug[ps]

print(start_prob)
print(end_prob)

For ngram = 1
{'PER': 0.5, 'O': 0.0, 'GEO': 0.2857142857142857, 'TIM': 0.14285714285714285}
{'PER': 0.0, 'O': 0.3333333333333333, 'GEO': 0.0, 'TIM': 0.0}


In [None]:
# Connect every tag state to the model's start and end states.
for tag_state in to_states:
    tag = tag_state.name
    p_begin = start_prob[tag]
    hmm_model.add_transition(hmm_model.start, tag_state, p_begin)
    p_finish = end_prob[tag]
    hmm_model.add_transition(tag_state, hmm_model.end, p_finish)

In [None]:
pprint(POS_count_bg)
POS_count_ug

Counter({('O', 'O'): 9,
         ('GEO', 'O'): 4,
         ('PER', 'O'): 3,
         ('TIM', 'GEO'): 3,
         ('O', 'TIM'): 2,
         ('TIM', 'O'): 2,
         ('TIM', 'TIM'): 2,
         ('PER', 'PER'): 2,
         ('GEO', 'TIM'): 2,
         ('O', 'GEO'): 1,
         ('GEO', 'PER'): 1,
         ('PER', 'GEO'): 1})


Counter({'GEO': 7, 'O': 18, 'PER': 6, 'TIM': 7})

In [None]:
# Transition probability P(next tag | previous tag): the bigram count
# divided by the unigram count of the bigram's first tag.
transition_prob_POS_word = {
    bigram: pair_count/POS_count_ug[bigram[0]]
    for bigram, pair_count in POS_count_bg.items()
}

transition_prob_POS_word

{('GEO', 'O'): 0.5714285714285714,
 ('GEO', 'PER'): 0.14285714285714285,
 ('GEO', 'TIM'): 0.2857142857142857,
 ('O', 'GEO'): 0.05555555555555555,
 ('O', 'O'): 0.5,
 ('O', 'TIM'): 0.1111111111111111,
 ('PER', 'GEO'): 0.16666666666666666,
 ('PER', 'O'): 0.5,
 ('PER', 'PER'): 0.3333333333333333,
 ('TIM', 'GEO'): 0.42857142857142855,
 ('TIM', 'O'): 0.2857142857142857,
 ('TIM', 'TIM'): 0.2857142857142857}

In [None]:
# If a certain pair of tags doesn't occur in the training set, give it
# probability zero. The pairs are derived from the observed tags rather than
# a hand-written list, so the all-pairs lookup done when wiring the states
# cannot hit a missing key when the training data changes.
for prev_tag in POS_count_ug:
    for next_tag in POS_count_ug:
        transition_prob_POS_word.setdefault((prev_tag, next_tag), 0)
pprint(transition_prob_POS_word)

{('GEO', 'GEO'): 0,
 ('GEO', 'O'): 0.5714285714285714,
 ('GEO', 'PER'): 0.14285714285714285,
 ('GEO', 'TIM'): 0.2857142857142857,
 ('O', 'GEO'): 0.05555555555555555,
 ('O', 'O'): 0.5,
 ('O', 'PER'): 0,
 ('O', 'TIM'): 0.1111111111111111,
 ('PER', 'GEO'): 0.16666666666666666,
 ('PER', 'O'): 0.5,
 ('PER', 'PER'): 0.3333333333333333,
 ('PER', 'TIM'): 0,
 ('TIM', 'GEO'): 0.42857142857142855,
 ('TIM', 'O'): 0.2857142857142857,
 ('TIM', 'PER'): 0,
 ('TIM', 'TIM'): 0.2857142857142857}


In [None]:
# Wire every ordered pair of tag states with its estimated transition
# probability.
for source_state in to_states:
    for target_state in to_states:
        pair = (source_state.name, target_state.name)
        hmm_model.add_transition(source_state, target_state, transition_prob_POS_word[pair])

In [None]:
## Baking the model
hmm_model.bake()

In [None]:
print(hmm_model)

POS-Tagger:{
    "class" : "State",
    "distribution" : {
        "class" : "Distribution",
        "dtype" : "str",
        "name" : "DiscreteDistribution",
        "parameters" : [
            {
                "Australia" : 0.42857142857142855,
                "Melbourne" : 0.2857142857142857,
                "SA" : 0.14285714285714285,
                "WI" : 0.14285714285714285
            }
        ],
        "frozen" : false
    },
    "name" : "GEO",
    "weight" : 1.0
}{
    "class" : "State",
    "distribution" : {
        "class" : "Distribution",
        "dtype" : "str",
        "name" : "DiscreteDistribution",
        "parameters" : [
            {
                "named" : 0.05555555555555555,
                "captain" : 0.05555555555555555,
                "player" : 0.05555555555555555,
                "of" : 0.05555555555555555,
                "World" : 0.05555555555555555,
                "Cup" : 0.05555555555555555,
                "won" : 0.05555555555555555,
     

In [None]:
hmm_model.dense_transition_matrix()

array([[0.        , 0.57142857, 0.14285714, 0.28571429, 0.        ,
        0.        ],
       [0.05555556, 0.5       , 0.        , 0.11111111, 0.        ,
        0.33333333],
       [0.16666667, 0.5       , 0.33333333, 0.        , 0.        ,
        0.        ],
       [0.42857143, 0.28571429, 0.        , 0.28571429, 0.        ,
        0.        ],
       [0.30769231, 0.        , 0.53846154, 0.15384615, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ]])

In [None]:
hmm_model.sample(length=20)

array(['Australia', '2007', 'Melbourne', '2007', 'Australia', 'Starc',
       'as', 'in', 'Melbourne', 'as', 'Australia', 'Starc', 'Warner',
       'player', 'in', 'were'], dtype='<U9')

## Question 1.3 - Predicting the sentence 

In [None]:
st = "Starc named 2015 Australia player"

In [None]:
# Decode POS for a new sentence
def POS_decoding(sentence, model):
    """Return the most likely tag sequence for ``sentence``.

    Parameters
    ----------
    sentence : sequence of tokens (already split into words).
    model : object exposing ``viterbi(sequence)`` that returns
        ``(log_probability, path)``, where ``path`` is a list of
        ``(index, state)`` pairs and every state has a ``name``.

    Returns
    -------
    list
        The names of the states on the Viterbi path, without the first and
        last path entries (presumably the model's start and end states;
        the model built in this notebook has transitions to both).

    Raises
    ------
    ValueError
        If the model reports no path (``None``) for the sentence.
    """
    _, state_path = model.viterbi(sentence)
    if state_path is None:
        # Without this check the slice below fails with an unhelpful
        # "'NoneType' object is not subscriptable".
        raise ValueError(
            "No valid tag sequence for {!r}: the model cannot produce it "
            "(for example a word that was never seen in training)".format(sentence)
        )
    return [entry[1].name for entry in state_path[1:-1]]

In [None]:
POS_tags_predict = POS_decoding(st.split(), hmm_model)
print(str(POS_tags_predict))

['PER', 'O', 'TIM', 'GEO', 'O']
