# Hidden Markov Model Hindi POS tagger

In [1]:
!pip install conllu

Collecting conllu
  Downloading conllu-4.4-py2.py3-none-any.whl (15 kB)
Installing collected packages: conllu
Successfully installed conllu-4.4


In [83]:
from conllu import parse_incr
from io import open
file=open('hi_hdtb-ud-train.conllu','r',encoding='utf-8')
ud_files=[]
for tokenlist in parse_incr(file):
    ud_files.append(tokenlist)

In [85]:
def dataset(ud_files):
    bank=[]
    for sentence in ud_files:
        tokens=[]
        tags=[]

        for token in sentence:
            tokens.append(token['form'])
            tags.append(token['upostag'])

        bank.append((tokens,tags))
    return bank

In [88]:
train=dataset(ud_files)

In [89]:
print(train[1][0])

['इसे', 'नवाब', 'शाहजेहन', 'ने', 'बनवाया', 'था', '।']


In [5]:
print(bank[1][1])

['PRON', 'NOUN', 'PROPN', 'ADP', 'VERB', 'AUX', 'PUNCT']


In [6]:
bank[13000][0]

['शरीफ',
 'ने',
 'पासपोर्ट',
 'के',
 'लिए',
 'अपना',
 'आवेदन',
 'मुशर्रफ',
 'की',
 'पिछले',
 'सप्ताह',
 'सऊदी',
 'अरब',
 'की',
 'यात्रा',
 'के',
 'समय',
 'किया',
 'था',
 '।']

In [7]:
def separate(bank):
    X,y=[],[]
    for index in range(len(bank)):
        X.append(bank[index][0])
        y.append(bank[index][1])
    return X,y
    

In [8]:
X,y=separate(bank)

In [9]:
len(X)

13304

In [10]:
len(y)

13304

In [11]:
X[0]

['यह', 'एशिया', 'की', 'सबसे', 'बड़ी', 'मस्जिदों', 'में', 'से', 'एक', 'है', '।']

In [12]:
y[0]

['DET',
 'PROPN',
 'ADP',
 'ADV',
 'ADJ',
 'NOUN',
 'ADP',
 'ADP',
 'NUM',
 'AUX',
 'PUNCT']

In [13]:
def flatten(list):
  for i in list:
    for j in i:
      yield j
L1=[[1,2,3],[4,5],[6,7,8,9]]
flat=flatten(L1)
print (list(flat))

[1, 2, 3, 4, 5, 6, 7, 8, 9]


In [14]:
from collections import Counter
tag_set = flatten(y)
count=Counter(tag_set)
count.items()

dict_items([('DET', 6081), ('PROPN', 34289), ('ADP', 59221), ('ADV', 2703), ('ADJ', 16459), ('NOUN', 62191), ('NUM', 5332), ('AUX', 20821), ('PUNCT', 18668), ('PRON', 11857), ('VERB', 27188), ('CCONJ', 5110), ('PART', 5610), ('SCONJ', 5389), ('X', 135), ('INTJ', 3)])

In [15]:
file=open('hi_hdtb-ud-test.conllu','r',encoding='utf-8')
ud_files=[]
for tokenlist in parse_incr(file):
    ud_files.append(tokenlist)
test_bank=[]
for sentence in ud_files:
    tokens=[]
    tags=[]
    
    for token in sentence:
        tokens.append(token['form'])
        tags.append(token['upostag'])
        
    test_bank.append((tokens,tags))

In [16]:
file=open('hi_hdtb-ud-dev.conllu','r',encoding='utf-8')
ud_files=[]
for tokenlist in parse_incr(file):
    ud_files.append(tokenlist)
dev_bank=[]
for sentence in ud_files:
    tokens=[]
    tags=[]
    
    for token in sentence:
        tokens.append(token['form'])
        tags.append(token['upostag'])
        
    dev_bank.append((tokens,tags))

In [17]:
Xtest,ytest=separate(test_bank)

In [18]:
Xdev,ydev=separate(dev_bank)

In [19]:
tag_set = flatten(ydev)
count=Counter(tag_set)
count.items()

dict_items([('PROPN', 4214), ('ADP', 7380), ('NOUN', 7928), ('PRON', 1473), ('ADJ', 2144), ('VERB', 3302), ('PUNCT', 2367), ('DET', 699), ('CCONJ', 682), ('PART', 722), ('AUX', 2613), ('ADV', 292), ('NUM', 715), ('SCONJ', 682), ('X', 4)])

In [23]:
tag_set = flatten(ytest)
count=Counter(tag_set)
count.items()

dict_items([('PRON', 1372), ('ADP', 7544), ('PROPN', 4438), ('PUNCT', 2420), ('CCONJ', 635), ('PART', 677), ('ADJ', 2043), ('NOUN', 8036), ('AUX', 2596), ('DET', 745), ('VERB', 3263), ('NUM', 693), ('ADV', 304), ('SCONJ', 655), ('X', 9)])

In [24]:
tag_list = set()
tag_count = {}
word_set = set()

def transition_count(X,y):
    global tag_list
    global word_set
    transition_dict = {}
    global tag_count
    for v in range(len(X)):
        previous="start"
        for data in range(len(X[v])):
            i=X[v][data]
            word = i
            word_set.add(word.lower())
            tag = y[v][data]
            tag_list.add(tag)

            if tag in tag_count:
                tag_count[tag]+=1
            else:
                tag_count[tag] = 1


            if (previous + "~tag~" + tag) in transition_dict:
                    transition_dict[previous + "~tag~" + tag] += 1
                    previous = tag
            else:
                    transition_dict[previous + "~tag~" + tag] = 1
                    previous = tag

    return transition_dict,tag_count,tag_list,word_set    

In [25]:
transmission_m,tag_count,tag_list,word_set = transition_count(X,y) 

In [26]:
def transition_probability(X,y):
    #count_dict = transition_count(X,y)
    count_dict = transmission_m
    prob_dict = {}
    for key in count_dict:
        den = 0
        val = key.split("~tag~")[0]
        # Probabilty of a tagA to be followed by tagB out of all possible tags # 
        for key_2 in count_dict:
            if key_2.split("~tag~")[0] == val:
                den += count_dict[key_2]
        prob_dict[key] = Decimal(count_dict[key])/(den)
    return prob_dict

In [27]:
def transition_smoothing(X,y):
    transition_prob = transition_probability(X,y)
    for tag in tag_list:
    	# if a tag does not occur as a start tag, then set its probability to be a start tag to minimum value #
        if "start" + tag not in  transition_prob:
            transition_prob[("start" + "~tag~" + tag)] = Decimal(1) / Decimal(len(word_set) + tag_count[tag])
    for tag1 in tag_list:
        for tag2 in tag_list:
        	# if a particular tag combination does not exist in the dictionary, we set its probability to minimum#
            if (tag1 +"~tag~" + tag2) not in transition_prob:
                transition_prob[(tag1+"~tag~"+tag2)] = Decimal(1)/Decimal(len(word_set) + tag_count[tag1])
    return transition_prob

In [28]:
def emission_count(X,y):  
    count_word = {}
    for v in range(len(X)):
        for data in range(len(X[v])):
    #for value in train_data:
        #for data in value:
            i = X[v][data]
            word = i
            tag = y[v][data]
            # map the words in the training set to their tagged POS #
            if word.lower() + "/" + tag in count_word:
                count_word[word.lower() + "/" + tag] +=1
            else:
                count_word[word.lower() + "/" + tag] = 1
    return count_word

In [29]:
def emission_probability(X,y):
    global tag_count
    word_count = emission_count(X,y)
    emission_prob_dict = {}
    # calculate probability of a word to be a certain Tag out of all the possible tags that it can be #
    for key in word_count:
        emission_prob_dict[key] = Decimal(word_count[key])/tag_count[key.split("/")[-1]]
    return emission_prob_dict

In [30]:
from decimal import *

In [31]:
transition_model = transition_smoothing(X,y)
emission_model = emission_probability(X,y)

In [32]:
def viterbi_algorithm(sentence, tag_list, transition_prob, emission_prob,tag_count, word_set):
    global tag_set
    # Get words from each sentence #
    sentence = sentence.strip("\n")
    word_list = sentence.split(" ")
    current_prob = {}
    for tag in tag_list:
        # transition probability #
        tp = Decimal(0)
        # Emission probability #
        em = Decimal(0)
        # Storing the probability of every tag to be starting tag #
        if "start~tag~"+tag in transition_prob:
            tp = Decimal(transition_prob["start~tag~"+tag])
        # Check for word in training data. If present, check the probability of the first word to be of given tag#
        if word_list[0].lower() in word_set:
            if (word_list[0].lower()+"/"+tag) in emission_prob:
                em = Decimal(emission_prob[word_list[0].lower()+"/"+tag])
                # Storing probability of current combination of tp and em #
                current_prob[tag] = tp * em
         # Check for word in training data. If absent then probability is just tp# 
        else:
            em = Decimal(1) /(tag_count[tag] +len(word_set))
            current_prob[tag] = tp

    if len(word_list) == 1:
        # Return max path if only one word in sentence #
        max_path = max(current_prob, key=current_prob.get)
        return max_path
    else:
        # Tracking from second word to last word #
        for i in range(1, len(word_list)):
            previous_prob = current_prob
            current_prob = {}
            locals()['dict{}'.format(i)] = {}
            previous_tag = ""
            for tag in tag_list:
                if word_list[i].lower() in word_set:
                    if word_list[i].lower()+"/"+tag in emission_prob:
                        em = Decimal(emission_prob[word_list[i].lower()+"/"+tag])
                        # Find the maximum probability using previous node's(tp*em)[i.e probability of reaching to the previous node] * tp * em (Bigram Model) #
                        max_prob, previous_state = max((Decimal(previous_prob[previous_tag]) * Decimal(transition_prob[previous_tag + "~tag~" + tag]) * em, previous_tag) for previous_tag in previous_prob)
                        current_prob[tag] = max_prob
                        locals()['dict{}'.format(i)][previous_state + "~" + tag] = max_prob
                        previous_tag = previous_state
                else:
                    em = Decimal(1) /(tag_count[tag] +len(word_set))
                    max_prob, previous_state = max((Decimal(previous_prob[previous_tag]) * Decimal(transition_prob[previous_tag+"~tag~"+tag]) * em, previous_tag) for previous_tag in previous_prob)
                    current_prob[tag] = max_prob
                    locals()['dict{}'.format(i)][previous_state + "~" + tag] = max_prob
                    previous_tag = previous_state

            # if last word of sentence, then return path dicts of all words #
            if i == len(word_list)-1:
                max_path = ""
                last_tag = max(current_prob, key=current_prob.get)
                max_path = max_path + last_tag + " " + previous_tag
                for j in range(len(word_list)-1,0,-1):
                    for key in locals()['dict{}'.format(j)]:
                        data = key.split("~")
                        if data[-1] == previous_tag:
                            max_path = max_path + " " +data[0]
                            previous_tag = data[0]
                            break
                result = max_path.split()
                result.reverse()
                return " ".join(result)

In [39]:
sentence = 'भारत एक देश है ।'

In [40]:
path = viterbi_algorithm(sentence, tag_list, transition_model, emission_model,tag_count, word_set)
word = sentence.split(" ")
tag = path.split(" ")
for j in range(0,len(word)):
    if(j==len(word)-1):
        print(word[j] + "/" + tag[j]+ u'\n')
    else:
        print(word[j] + "/" + tag[j] + " ")

भारत/PROPN 
एक/NUM 
देश/NOUN 
है/AUX 
।/PUNCT



In [111]:
Xtest[0]

['इसके',
 'अतिरिक्त',
 'गुग्गुल',
 'कुंड',
 ',',
 'भीम',
 'गुफा',
 'तथा',
 'भीमशिला',
 'भी',
 'दर्शनीय',
 'स्थल',
 'हैं',
 '।']

In [76]:
ytest[0]

['PRON',
 'ADP',
 'PROPN',
 'PROPN',
 'PUNCT',
 'PROPN',
 'PROPN',
 'CCONJ',
 'PROPN',
 'PART',
 'ADJ',
 'NOUN',
 'AUX',
 'PUNCT']

In [77]:
tag_count

{'DET': 6081,
 'PROPN': 34289,
 'ADP': 59221,
 'ADV': 2703,
 'ADJ': 16459,
 'NOUN': 62191,
 'NUM': 5332,
 'AUX': 20821,
 'PUNCT': 18668,
 'PRON': 11857,
 'VERB': 27188,
 'CCONJ': 5110,
 'PART': 5610,
 'SCONJ': 5389,
 'X': 135,
 'INTJ': 3}

In [78]:
tag_list

{'ADJ',
 'ADP',
 'ADV',
 'AUX',
 'CCONJ',
 'DET',
 'INTJ',
 'NOUN',
 'NUM',
 'PART',
 'PRON',
 'PROPN',
 'PUNCT',
 'SCONJ',
 'VERB',
 'X'}

# CRF model

In [10]:
def extract_features(sentence, index):
    return{
      'word':sentence[index],
      'is_first':index==0,
      'is_last':index ==len(sentence)-1,
      'prefix-1':sentence[index][0],
      'prefix-2':sentence[index][:2],
      'prefix-3':sentence[index][:3],
      'prefix-3':sentence[index][:4],
      'suffix-1':sentence[index][-1],
      'suffix-2':sentence[index][-2:],
      'suffix-3':sentence[index][-3:],
      'suffix-3':sentence[index][-4:],
      'next_word':sentence[index+1] if index<len(sentence)-1 else '',
      'prev_word':'' if index == 0 else sentence[index-1],
      'has_hyphen': '-' in sentence[index],
      'is_numeric': sentence[index].isdigit()
    }

In [11]:
xtrain=[]
for index in range(len(X)):
    arrange=[]
    for i in range(len(X[index])):
        arrange.append(extract_features(X[index],i))
    xtrain.append(arrange)

In [12]:
xtrain[0]

[{'word': 'यह',
  'is_first': True,
  'is_last': False,
  'prefix-1': 'य',
  'prefix-2': 'यह',
  'prefix-3': 'यह',
  'suffix-1': 'ह',
  'suffix-2': 'यह',
  'suffix-3': 'यह',
  'next_word': 'एशिया',
  'prev_word': '',
  'has_hyphen': False,
  'is_numeric': False},
 {'word': 'एशिया',
  'is_first': False,
  'is_last': False,
  'prefix-1': 'ए',
  'prefix-2': 'एश',
  'prefix-3': 'एशिय',
  'suffix-1': 'ा',
  'suffix-2': 'या',
  'suffix-3': 'शिया',
  'next_word': 'की',
  'prev_word': 'यह',
  'has_hyphen': False,
  'is_numeric': False},
 {'word': 'की',
  'is_first': False,
  'is_last': False,
  'prefix-1': 'क',
  'prefix-2': 'की',
  'prefix-3': 'की',
  'suffix-1': 'ी',
  'suffix-2': 'की',
  'suffix-3': 'की',
  'next_word': 'सबसे',
  'prev_word': 'एशिया',
  'has_hyphen': False,
  'is_numeric': False},
 {'word': 'सबसे',
  'is_first': False,
  'is_last': False,
  'prefix-1': 'स',
  'prefix-2': 'सब',
  'prefix-3': 'सबसे',
  'suffix-1': 'े',
  'suffix-2': 'से',
  'suffix-3': 'सबसे',
  'next_word': 

In [25]:
y[0]

['DET',
 'PROPN',
 'ADP',
 'ADV',
 'ADJ',
 'NOUN',
 'ADP',
 'ADP',
 'NUM',
 'AUX',
 'PUNCT']

In [79]:
import warnings
warnings.filterwarnings('ignore')
!pip install sklearn_crfsuite
from sklearn_crfsuite import CRF


hindi_crf = CRF(
    algorithm='lbfgs',
    c1=0.20,
    c2=0.3,
    max_iterations=100,
    all_possible_transitions=True
)

print("Started training ")
hindi_crf.fit(xtrain, y)
print("Finished training ")

Started training 
Finished training 


In [80]:
xtest=[]
for index in range(len(Xtest)):
    arrange=[]
    for i in range(len(Xtest[index])):
        arrange.append(extract_features(Xtest[index],i))
    xtest.append(arrange)

In [107]:
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
print("##nltk##")
y_pred = hindi_crf.predict(xtest)
print("F1 score on Test Data")
print(metrics.flat_f1_score(ytest, y_pred,average='weighted',labels=hindi_crf.classes_))
#For the sake of clarification, we do the same for train data.
y_pred_train=hindi_crf.predict(xtrain)
print("F1 score on Training Data ")
print(metrics.flat_f1_score(y, y_pred_train,average='weighted',labels=hindi_crf.classes_))

# This presents class wise score. Helps see which classes (tags) are the ones with most problems.
print("Class wise score:")
print(metrics.flat_classification_report(
    ytest, y_pred, labels=hindi_crf.classes_, digits=3
))

##nltk##
F1 score on Test Data
0.9582940108541691
Class wise score:
              precision    recall  f1-score   support

         DET      0.967     0.969     0.968       745
       PROPN      0.912     0.878     0.894      4438
         ADP      0.989     0.993     0.991      7544
         ADV      0.855     0.757     0.803       304
         ADJ      0.911     0.926     0.919      2043
        NOUN      0.931     0.946     0.939      8036
         NUM      0.985     0.975     0.980       693
         AUX      0.973     0.991     0.982      2596
       PUNCT      1.000     1.000     1.000      2420
        PRON      0.981     0.981     0.981      1372
        VERB      0.982     0.971     0.976      3263
       CCONJ      0.981     0.997     0.989       635
        PART      0.989     0.969     0.979       677
       SCONJ      0.988     0.989     0.989       655
           X      0.250     0.333     0.286         9
        INTJ      0.000     0.000     0.000         0

   micro avg

In [111]:
print(metrics.flat_accuracy_score(ytest, y_pred))

0.958453288173864


In [93]:
from sklearn.metrics import accuracy_score
accuracy_score(ytest,y_pred)

0.958453288173864

In [112]:
y_pred_train=hindi_crf.predict(xtrain)
print("F1 score on Training Data ")
print(metrics.flat_f1_score(y, y_pred_train,average='weighted',labels=hindi_crf.classes_))

F1 score on Training Data 
0.9832879019997677


In [114]:
print("Class wise score:")
print(metrics.flat_classification_report(
    y, y_pred_train, labels=hindi_crf.classes_, digits=3
))

Class wise score:
              precision    recall  f1-score   support

         DET      0.969     0.977     0.973      6081
       PROPN      0.969     0.971     0.970     34289
         ADP      0.994     0.996     0.995     59221
         ADV      0.905     0.846     0.875      2703
         ADJ      0.969     0.972     0.971     16459
        NOUN      0.981     0.978     0.979     62191
         NUM      0.983     0.992     0.988      5332
         AUX      0.978     0.995     0.986     20821
       PUNCT      1.000     1.000     1.000     18668
        PRON      0.989     0.979     0.984     11857
        VERB      0.992     0.980     0.986     27188
       CCONJ      0.986     0.996     0.991      5110
        PART      0.992     0.989     0.990      5610
       SCONJ      0.986     0.997     0.991      5389
           X      0.957     0.667     0.786       135
        INTJ      1.000     0.667     0.800         3

    accuracy                          0.983    281057
   macro

In [113]:
print(metrics.flat_accuracy_score(y, y_pred_train))

0.9833378994296531


In [18]:
y_pred=hindi_crf.predict(xtest)

In [91]:
y_pred=list(flatten(y_pred))

In [92]:
ytest=list(flatten(ytest))

In [99]:
sentence='पत्तेदार सब्जियां आपके स्वास्थ्य के लिए अच्छी होती हैं ।'
list1=[]
list1.append(sentence.split())
xtesting=[]
for index in range(len(list1)):
    arrange=[]
    for i in range(len(list1[index])):
        arrange.append(extract_features(list1[index],i))
    xtesting.append(arrange)


In [100]:
xtesting

[[{'word': 'पत्तेदार',
   'is_first': True,
   'is_last': False,
   'prefix-1': 'प',
   'prefix-2': 'पत',
   'prefix-3': 'पत्त',
   'suffix-1': 'र',
   'suffix-2': 'ार',
   'suffix-3': 'ेदार',
   'next_word': 'सब्जियां',
   'prev_word': '',
   'has_hyphen': False,
   'is_numeric': False},
  {'word': 'सब्जियां',
   'is_first': False,
   'is_last': False,
   'prefix-1': 'स',
   'prefix-2': 'सब',
   'prefix-3': 'सब्ज',
   'suffix-1': 'ं',
   'suffix-2': 'ां',
   'suffix-3': 'ियां',
   'next_word': 'आपके',
   'prev_word': 'पत्तेदार',
   'has_hyphen': False,
   'is_numeric': False},
  {'word': 'आपके',
   'is_first': False,
   'is_last': False,
   'prefix-1': 'आ',
   'prefix-2': 'आप',
   'prefix-3': 'आपके',
   'suffix-1': 'े',
   'suffix-2': 'के',
   'suffix-3': 'आपके',
   'next_word': 'स्वास्थ्य',
   'prev_word': 'सब्जियां',
   'has_hyphen': False,
   'is_numeric': False},
  {'word': 'स्वास्थ्य',
   'is_first': False,
   'is_last': False,
   'prefix-1': 'स',
   'prefix-2': 'स्',
   'prefix-

In [101]:
pred = hindi_crf.predict(xtesting)

In [102]:
sentence

'पत्तेदार सब्जियां आपके स्वास्थ्य के लिए अच्छी होती हैं ।'

In [103]:
pred

[['NOUN', 'NOUN', 'PRON', 'NOUN', 'ADP', 'ADP', 'ADJ', 'VERB', 'AUX', 'PUNCT']]

In [78]:
import pickle 
filename = 'crfhindi_1.sav'
pickle.dump(hindi_crf, open(filename, 'wb'))

### HMM

In [20]:
import numpy as np
import pandas as pd
import random
import pprint,time

In [21]:
def merge(list1, list2):
      
    merged_list = []
    for i in range(max((len(list1), len(list2)))):
  
        while True:
            try:
                tup = (list1[i], list2[i])
            except IndexError:
                if len(list1) > len(list2):
                    list2.append('')
                    tup = (list1[i], list2[i])
                elif len(list1) < len(list2):
                    list1.append('')
                    tup = (list1[i], list2[i])
                continue
  
            merged_list.append(tup)
            break
    return merged_list

In [23]:
xflat=list(flatten(X))
yflat = list(flatten(y))

In [25]:
merged=merge(xflat,yflat)

In [26]:
merged

[('यह', 'DET'),
 ('एशिया', 'PROPN'),
 ('की', 'ADP'),
 ('सबसे', 'ADV'),
 ('बड़ी', 'ADJ'),
 ('मस्जिदों', 'NOUN'),
 ('में', 'ADP'),
 ('से', 'ADP'),
 ('एक', 'NUM'),
 ('है', 'AUX'),
 ('।', 'PUNCT'),
 ('इसे', 'PRON'),
 ('नवाब', 'NOUN'),
 ('शाहजेहन', 'PROPN'),
 ('ने', 'ADP'),
 ('बनवाया', 'VERB'),
 ('था', 'AUX'),
 ('।', 'PUNCT'),
 ('इसका', 'PRON'),
 ('प्रवेश', 'NOUN'),
 ('द्वार', 'NOUN'),
 ('दो', 'NUM'),
 ('मंजिला', 'ADJ'),
 ('है', 'AUX'),
 ('।', 'PUNCT'),
 ('जिसमें', 'PRON'),
 ('चार', 'NUM'),
 ('मेहराबें', 'NOUN'),
 ('हैं', 'AUX'),
 ('और', 'CCONJ'),
 ('मुख्य', 'ADJ'),
 ('प्रार्थना', 'NOUN'),
 ('हॉल', 'NOUN'),
 ('में', 'ADP'),
 ('जाने', 'VERB'),
 ('के', 'ADP'),
 ('लिए', 'ADP'),
 ('9', 'NUM'),
 ('प्रवेश', 'NOUN'),
 ('द्वार', 'NOUN'),
 ('हैं', 'VERB'),
 ('।', 'PUNCT'),
 ('पूरी', 'ADJ'),
 ('इमारत', 'NOUN'),
 ('बेहद', 'ADV'),
 ('खूबसूरत', 'ADJ'),
 ('है', 'AUX'),
 ('।', 'PUNCT'),
 ('यहाँ', 'PRON'),
 ('लगने', 'VERB'),
 ('वाला', 'ADP'),
 ('तीन', 'NUM'),
 ('दिन', 'NOUN'),
 ('का', 'ADP'),
 ('इज्तिमा', 

In [27]:
tags = {tag for word,tag in merged}
print(len(tags))
print(tags)
 
# check total words in vocabulary
vocab = {word for word,tag in merged}

16
{'PRON', 'INTJ', 'PART', 'NUM', 'CCONJ', 'PROPN', 'ADJ', 'X', 'PUNCT', 'ADV', 'NOUN', 'SCONJ', 'DET', 'AUX', 'ADP', 'VERB'}


In [29]:
def word_given_tag(word, tag, train_bag = merged):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]

    count_w_given_tag = len(w_given_tag_list)
 
     
    return (count_w_given_tag, count_tag)

In [30]:
def t2_given_t1(t2, t1, train_bag = merged):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [31]:
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1] 
print(tags_matrix)

[[8.40010121e-02 0.00000000e+00 4.66391146e-02 2.39520967e-02
  2.02412088e-03 8.08804929e-02 9.91819203e-02 8.43383663e-04
  6.24103891e-03 1.57712735e-02 3.17112267e-01 4.21691831e-04
  4.14101370e-02 6.57839235e-03 1.55266926e-01 1.19676143e-01]
 [6.66666687e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 3.33333343e-01]
 [5.63279875e-02 0.00000000e+00 1.53297679e-02 6.22103401e-02
  5.34759369e-04 5.81105165e-02 9.42958966e-02 8.91265576e-04
  5.34759369e-03 1.17647061e-02 1.97504461e-01 1.42602494e-03
  4.22459878e-02 8.60962570e-02 2.29946524e-02 3.44919801e-01]
 [5.62640664e-04 0.00000000e+00 5.43885957e-03 1.00337587e-01
  5.25131263e-03 9.56489146e-03 1.20030008e-01 0.00000000e+00
  1.89422350e-02 1.12528133e-03 6.75731421e-01 0.00000000e+00
  9.75243840e-03 4.87621920e-03 3.91972996e-02 9.18979757e-03]
 [1.

In [32]:
# convert the matrix to a df for better readability
#the table is same as the transition table shown in section 3 of article
tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))
display(tags_df)

Unnamed: 0,PRON,INTJ,PART,NUM,CCONJ,PROPN,ADJ,X,PUNCT,ADV,NOUN,SCONJ,DET,AUX,ADP,VERB
PRON,0.084001,0.0,0.046639,0.023952,0.002024,0.08088,0.099182,0.000843,0.006241,0.015771,0.317112,0.000422,0.04141,0.006578,0.155267,0.119676
INTJ,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333
PART,0.056328,0.0,0.01533,0.06221,0.000535,0.058111,0.094296,0.000891,0.005348,0.011765,0.197504,0.001426,0.042246,0.086096,0.022995,0.34492
NUM,0.000563,0.0,0.005439,0.100338,0.005251,0.009565,0.12003,0.0,0.018942,0.001125,0.675731,0.0,0.009752,0.004876,0.039197,0.00919
CCONJ,0.148532,0.0,0.008023,0.03816,0.0,0.286106,0.135421,0.0,0.010959,0.015851,0.306262,0.003914,0.037769,0.0,0.0,0.009002
PROPN,0.005629,0.0,0.005483,0.002071,0.036688,0.369185,0.012803,0.000175,0.065969,0.001896,0.066231,0.000379,0.00315,0.001137,0.415089,0.014115
ADJ,0.002552,0.0,0.023331,0.014582,0.012091,0.043624,0.039188,0.0,0.01288,0.003645,0.550155,6.1e-05,0.006623,0.043563,6.1e-05,0.247646
X,0.007407,0.0,0.037037,0.0,0.0,0.007407,0.066667,0.007407,0.007407,0.007407,0.096296,0.0,0.022222,0.037037,0.0,0.703704
PUNCT,0.185719,0.000107,0.002839,0.021427,0.040658,0.320388,0.068834,0.001018,0.006589,0.02132,0.209449,0.025391,0.064335,0.000268,0.023784,0.007821
ADV,0.069182,0.0,0.105808,0.020348,0.00185,0.113947,0.157233,0.00037,0.032926,0.010729,0.18572,0.00074,0.064003,0.005549,0.073992,0.157603


In [81]:
def Viterbi(words, train_bag = merged):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
     
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['X', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                 
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
             
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [68]:
tags_df.loc[]

PRON     0.666667
INTJ     0.000000
PART     0.000000
NUM      0.000000
CCONJ    0.000000
PROPN    0.000000
ADJ      0.000000
X        0.000000
PUNCT    0.000000
ADV      0.000000
NOUN     0.000000
SCONJ    0.000000
DET      0.000000
AUX      0.000000
ADP      0.000000
VERB     0.333333
Name: INTJ, dtype: float32

In [58]:
T = list(set([pair[1] for pair in merged]))

In [34]:
xtest=list(flatten(Xtest))
Ytest=list(flatten(ytest))

In [65]:
state=[]
for key, word in enumerate(test_tagged_words):
        #initialise list of probability column for a given observation
        print(word)
        #for tag in T:
            #if key!=0:
               #transition_p = tags_df.loc[state[-1], tag] 

('इस', 'DET')
('बूटा', 'PROPN')
('आडवाणी', 'PROPN')
('पार्वती', 'PROPN')
('अमेरिकी', 'ADJ')
('इस', 'DET')
('गौरतलब', 'ADJ')
('इस', 'DET')
('उन्होंने', 'PRON')
('उपमुख्यमंत्री', 'PROPN')


In [60]:
tags_df

Unnamed: 0,PRON,INTJ,PART,NUM,CCONJ,PROPN,ADJ,X,PUNCT,ADV,NOUN,SCONJ,DET,AUX,ADP,VERB
PRON,0.084001,0.0,0.046639,0.023952,0.002024,0.08088,0.099182,0.000843,0.006241,0.015771,0.317112,0.000422,0.04141,0.006578,0.155267,0.119676
INTJ,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333
PART,0.056328,0.0,0.01533,0.06221,0.000535,0.058111,0.094296,0.000891,0.005348,0.011765,0.197504,0.001426,0.042246,0.086096,0.022995,0.34492
NUM,0.000563,0.0,0.005439,0.100338,0.005251,0.009565,0.12003,0.0,0.018942,0.001125,0.675731,0.0,0.009752,0.004876,0.039197,0.00919
CCONJ,0.148532,0.0,0.008023,0.03816,0.0,0.286106,0.135421,0.0,0.010959,0.015851,0.306262,0.003914,0.037769,0.0,0.0,0.009002
PROPN,0.005629,0.0,0.005483,0.002071,0.036688,0.369185,0.012803,0.000175,0.065969,0.001896,0.066231,0.000379,0.00315,0.001137,0.415089,0.014115
ADJ,0.002552,0.0,0.023331,0.014582,0.012091,0.043624,0.039188,0.0,0.01288,0.003645,0.550155,6.1e-05,0.006623,0.043563,6.1e-05,0.247646
X,0.007407,0.0,0.037037,0.0,0.0,0.007407,0.066667,0.007407,0.007407,0.007407,0.096296,0.0,0.022222,0.037037,0.0,0.703704
PUNCT,0.185719,0.000107,0.002839,0.021427,0.040658,0.320388,0.068834,0.001018,0.006589,0.02132,0.209449,0.025391,0.064335,0.000268,0.023784,0.007821
ADV,0.069182,0.0,0.105808,0.020348,0.00185,0.113947,0.157233,0.00037,0.032926,0.010729,0.18572,0.00074,0.064003,0.005549,0.073992,0.157603


In [46]:
random.seed(1234)      #define a random seed to get same sentences when run multiple times
 
# choose random 10 numbers
rndom = [random.randint(1,len(m)) for x in range(10)]
 
# list of 10 sents on which we test the model
test_run = [m[i] for i in rndom]
 
# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]
 
# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]

In [42]:
m=[]
for i in range(len(Xtest)):
    mer=[]
    mer.append(merge(Xtest[i],ytest[i]))
    m.append(mer)
    
        

In [71]:

start=time.time()
tagged_seq = Viterbi(test_tagged_words)
end=time.time()
difference = end-start
print("Time taken in seconds: ",difference)
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 
 
accuracy = len(check)/len(tagged_seq)
print('Viterbi Algorithm Accuracy: ',accuracy*100)

Time taken in seconds:  11.152700424194336
Viterbi Algorithm Accuracy:  0.0


In [48]:
test_tagged_words

[('इस', 'DET'),
 ('बूटा', 'PROPN'),
 ('आडवाणी', 'PROPN'),
 ('पार्वती', 'PROPN'),
 ('अमेरिकी', 'ADJ'),
 ('इस', 'DET'),
 ('गौरतलब', 'ADJ'),
 ('इस', 'DET'),
 ('उन्होंने', 'PRON'),
 ('उपमुख्यमंत्री', 'PROPN')]

In [82]:
test_sent="कार्पोरेट जगत घोटालों से भरा हुआ है ।"

pred_tags_withoutRules= Viterbi(Xtest[0])

print(pred_tags_withoutRules)

[('इसके', 'PRON'), ('अतिरिक्त', 'ADJ'), ('गुग्गुल', 'PRON'), ('कुंड', 'NOUN'), (',', 'PUNCT'), ('भीम', 'PRON'), ('गुफा', 'NOUN'), ('तथा', 'CCONJ'), ('भीमशिला', 'PRON'), ('भी', 'PART'), ('दर्शनीय', 'ADJ'), ('स्थल', 'NOUN'), ('हैं', 'AUX'), ('।', 'PUNCT')]


In [73]:
len(check)

0

In [78]:
Xtest[0]

['इसके',
 'अतिरिक्त',
 'गुग्गुल',
 'कुंड',
 ',',
 'भीम',
 'गुफा',
 'तथा',
 'भीमशिला',
 'भी',
 'दर्शनीय',
 'स्थल',
 'हैं',
 '।']

In [80]:
ytest[0]

['PRON',
 'ADP',
 'PROPN',
 'PROPN',
 'PUNCT',
 'PROPN',
 'PROPN',
 'CCONJ',
 'PROPN',
 'PART',
 'ADJ',
 'NOUN',
 'AUX',
 'PUNCT']