# **Morphological Analysis of Hindi Text**

**Dependencies:** `numpy, conllu, FastText, pandas, Keras, pyconll, sys, collections`

Authors :
Abhinav Kuruma 22111401,
Sanket Kale 22111052, 
Saqeeb 22111053

## POS tag with HMM and Viterbi Algorithm

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install conllu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting conllu
  Downloading conllu-4.5.2-py2.py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-4.5.2


In [39]:
import sys
import math
import codecs
from io import open
from decimal import *
from conllu import parse
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import matplotlib.pyplot as plt

### Getting the data in correct format

In [40]:
def getTrainData():
  with open("/content/drive/MyDrive/NLPassn1/hi_hdtb-ud-train.conllu", 'r') as file:
      temp = file.read()

  parsesentence = parse(temp)
  train_data = []
  for sent in parsesentence:
    eachSentence = []
    for word in sent:
      wordForm = word['form']
      POS = word['xpos']
      eachWord = wordForm + '/' + POS
      eachSentence.append(eachWord)
    train_data.append(eachSentence)
  return train_data

train_data = getTrainData()

In [41]:
tag_list = set()
tag_count = {}
word_set = set()

In [42]:
print(train_data[-300])

['सूत्रों/NN', 'का/PSP', 'कहना/VM', 'है/VM', 'कि/CC', 'आंतरिक/JJ', 'गुटबाजी/NN', 'एवं/CC', 'कलह/NN', 'के/PSP', 'कारण/PSP', 'ही/RP', 'अफ्रीकी/JJ', 'संघ/NN', 'ने/PSP', 'जी/NNPC', '-/SYM', '४/NNP', 'के/PSP', 'प्रस्ताव/NN', 'को/PSP', 'खारिज/JJ', 'किया/VM', 'है/VAUX', '।/SYM']


### Calculating Trainsition Probability

In [43]:
# value = train_data[0]
tag_list.clear()
word_set.clear()
tag_count.clear()
transition_dict = {}
for value in train_data:
  previous = "start"
  for data in value:
    # we store words and their corresponding tags #
      i = data[::-1]
      word = data[:-i.find("/") - 1]
      word_set.add(word.lower())
      data = data.split("/")
      tag = data[-1]
      tag_list.add(tag)
      # store frequency of each tag #
      if tag in tag_count:
          tag_count[tag] += 1
      else:
          tag_count[tag] = 1
      # store the frequency of each combination of tags #
      if (previous + "~tag~" + tag) in transition_dict:
          transition_dict[previous + "~tag~" + tag] += 1
          previous = tag
      else:
          transition_dict[previous + "~tag~" + tag] = 1
          previous = tag
print(transition_dict)

{'start~tag~DEM': 1010, 'DEM~tag~NNP': 18, 'NNP~tag~PSP': 14230, 'PSP~tag~INTF': 147, 'INTF~tag~JJ': 223, 'JJ~tag~NN': 8016, 'NN~tag~PSP': 30260, 'PSP~tag~PSP': 4630, 'PSP~tag~QC': 1995, 'QC~tag~VM': 75, 'VM~tag~SYM': 5440, 'start~tag~PRP': 2839, 'PRP~tag~NNC': 306, 'NNC~tag~NNP': 124, 'PSP~tag~VM': 6393, 'VM~tag~VAUX': 12661, 'VAUX~tag~SYM': 8789, 'NNC~tag~NN': 4661, 'NN~tag~QC': 361, 'QC~tag~JJ': 622, 'JJ~tag~VM': 4762, 'PRP~tag~QC': 266, 'QC~tag~NN': 3374, 'NN~tag~VM': 12045, 'VM~tag~CC': 4185, 'CC~tag~JJ': 1018, 'JJ~tag~NNC': 657, 'VM~tag~PSP': 4236, 'QC~tag~NNC': 220, 'start~tag~JJ': 851, 'NN~tag~INTF': 51, 'PRP~tag~VM': 1389, 'PSP~tag~NN': 19027, 'NN~tag~JJ': 2368, 'PSP~tag~JJ': 6156, 'start~tag~NNPC': 1150, 'NNPC~tag~NNP': 8037, 'PSP~tag~NST': 2754, 'NST~tag~JJ': 329, 'PSP~tag~DEM': 1178, 'DEM~tag~JJ': 179, 'NN~tag~NNPC': 1687, 'VM~tag~PRP': 470, 'PRP~tag~NNPC': 284, 'NNP~tag~RP': 178, 'RP~tag~VM': 640, 'VAUX~tag~VAUX': 5359, 'NNC~tag~CC': 1, 'CC~tag~NNC': 548, 'PRP~tag~NNP': 66

In [44]:
prob_dict = {}
for key in transition_dict.keys():
    den = 0
    val = key.split("~tag~")[0]
    # Probabilty of a tagA to be followed by tagB out of all possible tags # 
    for key_2 in transition_dict:
        if key_2.split("~tag~")[0] == val:
            den += transition_dict[key_2]
    prob_dict[key] = Decimal(transition_dict[key])/(den)
print(prob_dict)

{'start~tag~DEM': Decimal('0.07591701743836440168370414913'), 'DEM~tag~NNP': Decimal('0.004681404421326397919375812744'), 'NNP~tag~PSP': Decimal('0.6360060784839545901492804148'), 'PSP~tag~INTF': Decimal('0.002607631312862540577936246075'), 'INTF~tag~JJ': Decimal('0.6925465838509316770186335404'), 'JJ~tag~NN': Decimal('0.5127942681678607983623336745'), 'NN~tag~PSP': Decimal('0.5384916539132291703740612877'), 'PSP~tag~PSP': Decimal('0.08213151686090859099214162808'), 'PSP~tag~QC': Decimal('0.03538928210313447927199191102'), 'QC~tag~VM': Decimal('0.01575299306868304977945809704'), 'VM~tag~SYM': Decimal('0.1819702291353069075096169928'), 'start~tag~PRP': Decimal('0.2133944678292242934455802766'), 'PRP~tag~NNC': Decimal('0.02645456903259272067087403821'), 'NNC~tag~NNP': Decimal('0.02095302467049678945589726259'), 'PSP~tag~VM': Decimal('0.1134053536267362035016763344'), 'VM~tag~VAUX': Decimal('0.4235156380665663154373641077'), 'VAUX~tag~SYM': Decimal('0.4868712608021271881231996455'), 'NNC~

In [45]:
# Added the problities of the combination NOT in the dictionary with minimin prob
for tag in tag_list:
  # if a tag does not occur as a start tag, then set its probability to be a start tag to minimum value #
    if "start" + tag not in  prob_dict:
        prob_dict[("start" + "~tag~" + tag)] = Decimal(1) / Decimal(len(word_set) + tag_count[tag])
for tag1 in tag_list:
    for tag2 in tag_list:
      # if a particular tag combination does not exist in the dictionary, we set its probability to minimum#
        if (tag1 +"~tag~" + tag2) not in prob_dict:
            prob_dict[(tag1+"~tag~"+tag2)] = Decimal(1)/Decimal(len(word_set) + tag_count[tag1])
print(prob_dict)

{'start~tag~DEM': Decimal('0.00004825323296660876278710673615'), 'DEM~tag~NNP': Decimal('0.004681404421326397919375812744'), 'NNP~tag~PSP': Decimal('0.6360060784839545901492804148'), 'PSP~tag~INTF': Decimal('0.002607631312862540577936246075'), 'INTF~tag~JJ': Decimal('0.6925465838509316770186335404'), 'JJ~tag~NN': Decimal('0.5127942681678607983623336745'), 'NN~tag~PSP': Decimal('0.5384916539132291703740612877'), 'PSP~tag~PSP': Decimal('0.08213151686090859099214162808'), 'PSP~tag~QC': Decimal('0.03538928210313447927199191102'), 'QC~tag~VM': Decimal('0.01575299306868304977945809704'), 'VM~tag~SYM': Decimal('0.1819702291353069075096169928'), 'start~tag~PRP': Decimal('0.00003515432749771496871264852703'), 'PRP~tag~NNC': Decimal('0.02645456903259272067087403821'), 'NNC~tag~NNP': Decimal('0.02095302467049678945589726259'), 'PSP~tag~VM': Decimal('0.1134053536267362035016763344'), 'VM~tag~VAUX': Decimal('0.4235156380665663154373641077'), 'VAUX~tag~SYM': Decimal('0.4868712608021271881231996455')

In [46]:
# Total of 992 POS tag sequence (32 POS with start symbol * 31 POS tag)
len(prob_dict)

992

### Calculating Emission Probability 

In [47]:
count_word = {}
for value in train_data:
    for data in value:
        i = data[::-1]
        word = data[:-i.find("/") - 1]
        tag = data.split("/")[-1]
        # map the words in the training set to their tagged POS #
        if word.lower() + "/" + tag in count_word:
            count_word[word.lower() + "/" + tag] +=1
        else:
            count_word[word.lower() + "/" + tag] = 1
print(count_word)

{'यह/DEM': 660, 'एशिया/NNP': 20, 'की/PSP': 6476, 'सबसे/INTF': 135, 'बड़ी/JJ': 71, 'मस्जिदों/NN': 1, 'में/PSP': 8416, 'से/PSP': 4653, 'एक/QC': 1295, 'है/VM': 2426, '।/SYM': 12160, 'इसे/PRP': 140, 'नवाब/NNC': 1, 'शाहजेहन/NNP': 1, 'ने/PSP': 4889, 'बनवाया/VM': 12, 'था/VAUX': 785, 'इसका/PRP': 131, 'प्रवेश/NNC': 9, 'द्वार/NN': 11, 'दो/QC': 377, 'मंजिला/JJ': 4, 'जिसमें/PRP': 100, 'चार/QC': 141, 'मेहराबें/NN': 1, 'हैं/VM': 629, 'और/CC': 3096, 'मुख्य/JJ': 99, 'प्रार्थना/NNC': 4, 'हॉल/NN': 4, 'जाने/VM': 109, 'के/PSP': 12669, 'लिए/PSP': 1773, '9/QC': 8, 'पूरी/JJ': 167, 'इमारत/NN': 14, 'बेहद/INTF': 32, 'खूबसूरत/JJ': 43, 'यहाँ/PRP': 161, 'लगने/VM': 22, 'वाला/PSP': 51, 'तीन/QC': 232, 'दिन/NN': 232, 'का/PSP': 3514, 'इज्तिमा/NN': 1, 'पूरे/JJ': 91, 'देश/NN': 311, 'लोगों/NN': 511, 'को/PSP': 6006, 'आमंत्रित/JJ': 17, 'करता/VM': 44, 'है/VAUX': 4486, 'शौकत/NNPC': 19, 'महल/NNP': 23, 'सामने/NST': 125, 'झील/NN': 13, 'किनारे/NN': 21, 'स्थित/JJ': 153, 'वास्तुकला/NN': 2, 'नमूना/NN': 7, 'कुदसिया/NNPC': 1, 'बेगम/NN

In [48]:
emission_prob_dict = {}
# calculate probability of a word to be a certain Tag out of all the possible tags that it can be #
for key in count_word:
    emission_prob_dict[key] = Decimal(count_word[key])/tag_count[key.split("/")[-1]]

print(emission_prob_dict)

{'यह/DEM': Decimal('0.1716514954486345903771131339'), 'एशिया/NNP': Decimal('0.0008938547486033519553072625698'), 'की/PSP': Decimal('0.1148776896741347808347967999'), 'सबसे/INTF': Decimal('0.4192546583850931677018633540'), 'बड़ी/JJ': Decimal('0.004541965199590583418628454452'), 'मस्जिदों/NN': Decimal('0.00001779422755258194241787963984'), 'में/PSP': Decimal('0.1492913274085111666932751495'), 'से/PSP': Decimal('0.08253951359693470278324729924'), 'एक/QC': Decimal('0.2720016803192606595253098089'), 'है/VM': Decimal('0.08109643991308708006017048304'), '।/SYM': Decimal('0.6513820441397043068352260553'), 'इसे/PRP': Decimal('0.01210339759661104867294890637'), 'नवाब/NNC': Decimal('0.0001689760054072321730314295370'), 'शाहजेहन/NNP': Decimal('0.00004469273743016759776536312849'), 'ने/PSP': Decimal('0.08672591488833306724850549022'), 'बनवाया/VM': Decimal('0.0004011365535684439244526157446'), 'था/VAUX': Decimal('0.04340373769766670352759040142'), 'इसका/PRP': Decimal('0.01132532203682890982968790525

### Implementing Viterbi Algorithm

In [49]:
tag_list, prob_dict, emission_prob_dict, tag_count, word_set
transition_prob = prob_dict
emission_prob = emission_prob_dict

In [50]:
def viterbi_algorithm(sentence, tag_list, transition_prob, emission_prob,tag_count, word_set):
    global tag_set
    # Get words from each sentence #
    sentence = sentence.strip("\n")
    word_list = sentence.split()
    current_prob = {}
    for tag in tag_list:
        # transition probability #
        tp = Decimal(0)
        # Emission probability #
        em = Decimal(0)
        # Storing the probability of every tag to be starting tag #
        if "start~tag~"+tag in transition_prob:
            tp = Decimal(transition_prob["start~tag~"+tag])
        # Check for first word in training data. If present, check the probability of the first word to be of given tag
        if word_list[0].lower() in word_set:
            if (word_list[0].lower()+"/"+tag) in emission_prob:
                em = Decimal(emission_prob[word_list[0].lower()+"/"+tag])
                # Storing probability of current combination of tp and em #
                current_prob[tag] = tp * em
         # Check for word in training data. If absent then probability is just tp# 
        else:
            em = Decimal(1) /(tag_count[tag] +len(word_set))
            current_prob[tag] = tp

    if len(word_list) == 1:
        # Return max path if only one word in sentence #
        max_path = max(current_prob, key=current_prob.get)
        return max_path
    else:
        # Tracking from second word to last word #
        for i in range(1, len(word_list)):
            previous_prob = current_prob
            current_prob = {}
            locals()['dict{}'.format(i)] = {}
            previous_tag = ""
            for tag in tag_list:
                if word_list[i].lower() in word_set:
                    if word_list[i].lower()+"/"+tag in emission_prob:
                        em = Decimal(emission_prob[word_list[i].lower()+"/"+tag])
                        # Find the maximum probability using previous node's(tp*em)[i.e probability of reaching to the previous node] * tp * em (Bigram Model) #
                        max_prob, previous_state = max((Decimal(previous_prob[previous_tag]) * Decimal(transition_prob[previous_tag + "~tag~" + tag]) * em, previous_tag) for previous_tag in previous_prob)
                        current_prob[tag] = max_prob
                        locals()['dict{}'.format(i)][previous_state + "~" + tag] = max_prob
                        previous_tag = previous_state
                else:
                    em = Decimal(1) /(tag_count[tag] +len(word_set))
                    max_prob, previous_state = max((Decimal(previous_prob[previous_tag]) * Decimal(transition_prob[previous_tag+"~tag~"+tag]) * em, previous_tag) for previous_tag in previous_prob)
                    current_prob[tag] = max_prob
                    locals()['dict{}'.format(i)][previous_state + "~" + tag] = max_prob
                    previous_tag = previous_state

            # if last word of sentence, then return path dicts of all words #
            if i == len(word_list)-1:
                max_path = ""
                last_tag = max(current_prob, key=current_prob.get)
                max_path = max_path + last_tag + " " + previous_tag
                for j in range(len(word_list)-1,0,-1):
                    for key in locals()['dict{}'.format(j)]:
                        data = key.split("~")
                        if data[-1] == previous_tag:
                            max_path = max_path + " " +data[0]
                            previous_tag = data[0]
                            break
                result = max_path.split()
                result.reverse()
                return " ".join(result)


### Testing 

In [51]:
transition_model = prob_dict
emission_model = emission_prob_dict

def printPOS(sentence1):
  path = viterbi_algorithm(sentence1, tag_list, transition_model, emission_model,tag_count, word_set)
  # sentence1 = sentence1.strip("\n")
  word = sentence1.split()
  tag = path.split(" ")
  mytext = ''
  for j in range(0,len(word)):
      if j == len(word)-1:
        mytext = mytext+word[j]+"->" + tag[j]
        print(mytext)
        mytext = ""
      else:
        mytext = mytext+word[j]+"->" + tag[j] + " "

sentence = '''यहाँ लगने वाला तीन दिन का इज्तिमा पूरे देश के लोगों को आमंत्रित करता है ।'''
printPOS(sentence)

यहाँ->PRP लगने->VM वाला->PSP तीन->QC दिन->NN का->PSP इज्तिमा->NN पूरे->JJ देश->NN के->PSP लोगों->NN को->PSP आमंत्रित->JJ करता->VM है->VAUX ।->SYM


In [52]:
test_data = []
with open("/content/drive/MyDrive/NLPassn1/hi_hdtb-ud-dev.conllu", 'r') as file:
  temp = file.read()
parsesentence = parse(temp)

In [53]:
def predSentPOS(strSent):
  path = viterbi_algorithm(strSent, tag_list, transition_model, emission_model,tag_count, word_set)
  word = strSent.split()
  tag = path.split(" ")
  mytext = ''
  for j in range(0,len(word)):
      if j == len(word)-1:
        mytext = mytext+ tag[j]
        return mytext.split()
      else:
        mytext = mytext+ tag[j] + " "
  
def getTestPred():
  ypred = []
  ytest = []
  # a1 = parsesentence
  i = 0
  for sent in parsesentence:
    trainList = []
    strSent = str()
    for word in sent:
      trainList.append(word['xpos'])
      strSent = strSent + word['form'] + " "
    # Adding the sentence token from the train data
    ytest.extend(trainList)
    # print(strSent)
    # Now get the POS tag with respect to each word

    predList = predSentPOS(strSent)
    # print("train-> ", trainList, len(trainList))
    # print("pred-> ", predList, len(predList))
    ypred.extend(predList)

  return ytest, ypred
ytest, ypred = getTestPred()

In [54]:
print(accuracy_score(ytest, ypred)*100)
print(precision_score(ytest, ypred,average='macro')*100)

92.04645483715251
68.32359252453621


## Analysis of Gender Case Number in Hindi

### Setting up Dependencies

In [3]:
!pip install pyconll

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyconll
  Downloading pyconll-3.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyconll
Successfully installed pyconll-3.1.0


In [4]:
import pyconll
import pandas as pd
from collections import defaultdict

In [5]:
corpus = pyconll.load_from_file("/content/drive/MyDrive/NLPassn1/hi_hdtb-ud-train.conllu")

In [6]:
s = set()
for sentence in corpus:
  for token in sentence:
    s.add(token.form)
print(len(s))

16879


In [7]:
word_gender = {}
word_number = {}
word_case = {}

In [8]:
for sentence in corpus:
  for token in sentence:
    if 'Gender' in token.feats:
      if token.form not in word_gender:
        word_gender[token.form] = list(token.feats['Gender'])[0]  
    if 'Number' in token.feats:
      if token.form not in word_number:
        word_number[token.form] = list(token.feats['Number'])[0]
    if 'Case' in token.feats:
      if token.form not in word_case:
        word_case[token.form] = list(token.feats['Case'])[0]

In [9]:
print(len(word_gender))
print(len(word_number))
print(len(word_case))

13455
14130
14107


### Rules based approach for Gender, Number, Case

In [10]:
test_gender = {}
test_number = {}
test_case = {}
for sentence in corpus:
  for token in sentence:
    if 'Gender' in token.feats:
      if(token.form[-1] == "ी" or token.form[-1] == "ि" or token.form.endswith("ियाँ") or token.form.endswith("ियां")):
        if token.form not in test_gender:
          test_gender[token.form] = 'Fem'
      else:
        test_gender[token.form] = 'Masc'
    if 'Number' in token.feats:
      if token.form.endswith("ियाँ") or token.form.endswith("ियां") or token.form.endswith("ियों") or (token.form[-1] in ["ो","े","ों","ें"]) or token.form.endswith("ओं") or token.form.endswith("ाओं") or token.form.endswith("नों") or token.form.endswith("यो") :
        if token.form not in test_number:
          test_number[token.form] = 'Plur'
      else:
        if token.form not in test_number:
          test_number[token.form] = 'Sing'
    if 'Case' in token.feats:
      if token.form not in test_case:
        test_case[token.form] = list(token.feats['Case'])[0]

In [11]:
print(len(test_gender))
print(len(test_number))
print(test_number)

13455
14130
{'यह': 'Sing', 'एशिया': 'Sing', 'की': 'Sing', 'बड़ी': 'Sing', 'मस्जिदों': 'Sing', 'है': 'Sing', 'इसे': 'Plur', 'नवाब': 'Sing', 'शाहजेहन': 'Sing', 'बनवाया': 'Sing', 'था': 'Sing', 'इसका': 'Sing', 'प्रवेश': 'Sing', 'द्वार': 'Sing', 'जिसमें': 'Sing', 'मेहराबें': 'Sing', 'हैं': 'Sing', 'प्रार्थना': 'Sing', 'हॉल': 'Sing', 'पूरी': 'Sing', 'इमारत': 'Sing', 'लगने': 'Plur', 'वाला': 'Sing', 'दिन': 'Sing', 'का': 'Sing', 'इज्तिमा': 'Sing', 'पूरे': 'Plur', 'देश': 'Sing', 'के': 'Plur', 'लोगों': 'Sing', 'करता': 'Sing', 'शौकत': 'Sing', 'महल': 'Sing', 'सामने': 'Plur', 'झील': 'Sing', 'किनारे': 'Plur', 'वास्तुकला': 'Sing', 'नमूना': 'Sing', 'कुदसिया': 'Sing', 'बेगम': 'Sing', 'काल': 'Sing', 'जिन्हें': 'Sing', 'गोहर': 'Sing', 'कहा': 'Sing', 'जाता': 'Sing', 'हिंदू': 'Sing', 'मुगल': 'Sing', 'कला': 'Sing', 'संगम': 'Sing', 'भारत': 'Sing', 'अनूठी': 'Sing', 'संस्था': 'Sing', 'रूप': 'Sing', 'प्रदर्शन': 'Sing', 'दृश्य': 'Sing', 'केंद्र': 'Sing', 'चार्ल्स': 'Sing', 'कोरिया': 'Sing', 'डिजाइन': 'Sing', 'किय

Calculating Accuracy

In [12]:
correct = 0
total = len(test_gender)
for word in word_gender:
  if(word_gender[word] == test_gender[word]):
    correct += 1
accuracy = (correct/total)
print("Accuracy of Gender: ",accuracy*100)

Accuracy of Gender:  78.52099591230026


In [13]:
correct = 0
total = len(test_number)
for word in word_number:
  if(word_number[word] == test_number[word]):
    correct += 1
accuracy = (correct/total)
print("Accuracy of Number: ",accuracy*100)

Accuracy of Number:  82.9723991507431


### Deep Learning Based Prediction of Gender, Number, Case

**Dependencies:** `FastText, Keras, `

In [16]:
!git clone https://github.com/facebookresearch/fastText.git

Cloning into 'fastText'...
remote: Enumerating objects: 3930, done.[K
remote: Counting objects: 100% (944/944), done.[K
remote: Compressing objects: 100% (140/140), done.[K
remote: Total 3930 (delta 854), reused 804 (delta 804), pack-reused 2986[K
Receiving objects: 100% (3930/3930), 8.24 MiB | 21.64 MiB/s, done.
Resolving deltas: 100% (2505/2505), done.


In [17]:
!cd fastText

In [18]:
!pip install fastText

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fastText
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 4.1 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.10.1-py3-none-any.whl (216 kB)
Building wheels for collected packages: fastText
  Building wheel for fastText (setup.py) ... [?25l[?25hdone
  Created wheel for fastText: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3158211 sha256=483ef434470baeda70a3b813eeca03b9c9a90984d485962ff1844ba021ae258d
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345bab9b
Successfully built fastText
Installing collected packages: pybind11, fastText
Successfully installed fastText-0.9.2 pybind11-2.10.1


In [19]:
import fasttext.util
fasttext.util.download_model('hi', if_exists='ignore') 
model = fasttext.load_model('cc.hi.300.bin')

Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.bin.gz





In [None]:
import numpy as np
from fasttext.FastText import _FastText

model = _FastText(model_path='cc.hi.300.bin')

**Deep Learning Model for Number Prediction**

In [None]:
# X = Creating FastText embedding vector for each word
# y = Class labels of Number
X = []
y = []
for key in word_number:
  X.append(np.array(model.get_word_vector(key)))
  y.append(word_number[key])

In [None]:
# Changing Number label 
# Sing -> 0 & Plur -> 1
Y = []
for i in y:
  if i == 'Sing':
    Y.append(0)
  else:
    Y.append(1)
Y = np.array(Y)

In [None]:
# Creating Numpy array
X = np.array(X)
print(X.shape,Y.shape)

(14130, 300) (14130,)


In [None]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=0)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

In [None]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=300))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
hist = model.fit(X_train, Y_train, epochs=40, batch_size=10, validation_split=0.1)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [3]:
test_loss,test_acc = model.evaluate(X_test, Y_test)
print("Loss: ",test_loss)
print("Accuracy: ",test_acc*100)

Loss: 0.6610872745513916
Accuracy: 0.9246284365653992


**Deep Learning Model for Gender prediction**

In [None]:
model = _FastText(model_path='cc.hi.300.bin')

In [None]:
# X = Creating FastText embedding vector for each word
# y = Class labels of Number
X = []
y = []
for key in word_gender:
  X.append(np.array(model.get_word_vector(key)))
  y.append(word_gender[key])

In [None]:
# Changing Gender label 
# Masc -> 0 & Fem -> 1
Y = []
for i in y:
  if i == 'Masc':
    Y.append(0)
  else:
    Y.append(1)

In [None]:
X = np.array(X)
Y = np.array(Y)
print(X.shape,Y.shape)

(13455, 300) (13455,)


In [None]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=1)

In [None]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=300))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
hist = model.fit(X_train, Y_train, epochs=40, batch_size=10, validation_split=0.1)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [1]:
test_loss,test_acc = model.evaluate(X_test, Y_test)
print("Loss: ",test_loss)
print("Accuracy: ",test_acc)

Loss: 0.6775475144386292
Accuracy: 0.9059829115867615


**Deep Learning Model for Case Prediction**

In [None]:
# Calculating number of classes in Case
aaa = set()
for k in word_case:
  aaa.add(word_case[k])
num_classes = len(aaa)
print(num_classes)

7


In [None]:
model = _FastText(model_path='cc.hi.300.bin')

In [None]:
# X = Creating FastText embedding vector for each word
# y = Class labels of Number
X = []
y = []
for key in word_case:
  X.append(np.array(model.get_word_vector(key)))
  y.append(word_case[key])

In [None]:
# Changing Case label 
Y = []
for i in y:
  if(i == 'Erg'):
    Y.append(0)
  if(i == 'Nom'):
    Y.append(1)
  if(i == 'Ine'):
    Y.append(2)
  if(i == 'Ins'):
    Y.append(3)
  if(i == 'Dat'):
    Y.append(4)
  if(i == 'Gen'):
    Y.append(5)
  if(i == 'Acc'):
    Y.append(6)

In [None]:
X = np.array(X)
Y = np.array(Y)
X.shape,Y.shape

((14107, 300), (14107,))

In [None]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=1)

In [None]:
# Create onehotencoding/Categorical for each label
Y_train = to_categorical(Y_train, num_classes)
Y_test = to_categorical(Y_test, num_classes)
print(Y_train.shape,Y_test.shape)

(11285, 7) (2822, 7)


In [None]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=300))
model.add(Dense(7, activation='softmax'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
hist = model.fit(X_train, Y_train, epochs=30, batch_size=32, validation_split=0.1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [14]:
test_loss,test_acc = model.evaluate(X_test, Y_test)
print("Loss: ",test_loss)
print("Accuracy: ", test_acc)

Loss: 0.21299847960472107
Accuracy: 0.6537916660308838


## Rule based Lemmatizer 

In [None]:
conda install -c pyconll pyconll

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Retrieving notices: ...working... done

Note: you may need to restart the kernel to use updated packages.


In [None]:
import sys
import conllu
from conllu import parse

In [None]:
conllu=open('/content/drive/MyDrive/NLPassn1/hi_hdtb-ud-train.conllu','r', encoding="utf-8")
annotations = conllu.read()
sentences = parse(annotations)

In [None]:
text=open("/content/drive/MyDrive/NLPassn1/output.txt","r", encoding="utf-8", errors = 'ignore')
text=text.read(100000)

In [None]:
english=['a','b','c','d','e','f','g','h','i','g','k','l','m','n','o','p','q','r',u'।',"'",'"','“','”','`','s','t','u','v','w','x','y','z','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','1','2','3','4','5','6','7','8','9','0','!','@','#','$','%','^','&','*','(',')','_','+','=','-','{','}','[',']',',','.','?',':','"',';','\n','\u200c','/','\xa0','...']
for i in range(len(text)):
    if text[i] in english:  # DATA CLEANING when we encounter any symbols which are in english list
        text=text.replace(text[i],'')
#print(text)       

In [None]:
word=[]
lemma=[]
word_lemma_dict={}
for sentence in sentences:
    for token in sentence:
        word.append(token['form'])
        lemma.append(token['lemma'])
for i in range(len(word)):
    word_lemma_dict[word[i]]=lemma[i]
#print(word_lemma_dict)    

In [None]:
rules=open("/content/drive/MyDrive/NLPassn1/rules.txt","r", encoding="utf-8")
rules=rules.read()
rules=rules.split('\n')
for i in rules:
    if(i==''):
        break
    else:    
        rule=i
        rule=rule.split(' ')
        word_lemma_dict[rule[0]]=rule[2]
        lemma.append(rule[2])
rules2=open("/content/drive/MyDrive/NLPassn1/rules2.txt","r", encoding="utf-8")
rules2=rules2.read()
rules2=rules2.split('\n')
for i in rules2:
    if(i==''):
        break
    else:    
        rule=i
        rule=rule.split(' ')
        word_lemma_dict[rule[0]]=rule[3] 
        lemma.append(rule[3])
        
rules3=open("/content/drive/MyDrive/NLPassn1/lemma.txt","r", encoding="utf-8")
rules3=rules3.read()  
rules3=rules3.split('\n')
for i in rules3:
    rule=i
    rule=rule.split(':')
    word_lemma_dict[rule[0]]=rule[-1] 
    lemma.append(rule[-1])
word=set(word)
lemma=set(lemma)   

In [None]:
suffixes =["ो", "े", "ू", "ु", "ी", "ि", "ा","तृ","ान","ैत","ने","ाऊ","ाव","कर", "ाओ", "िए", "ाई", "ाए", "नी", "ना", "ते", "ीं", "ती","ता", "ाँ", "ां", "ों", "ें","ीय", "ति","या", "पन", "पा","ित","ीन","लु","यत","वट","लू", "ेरा","त्व","नीय","ौनी","ौवल","ौती","ौता","ापा","वास","हास","काल","पान","न्त","ौना","सार","पोश","नाक","ियल","ैया", "ौटी","ावा","ाहट","िया","हार", "ाकर", "ाइए", "ाईं", "ाया", "ेगी", "वान", "बीन","ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं","कला","िमा","कार","गार", "दान","खोर", "ावास","कलाप","हारा","तव्य","वैया", "वाला", "ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां","त्वा","तव्य","कल्प","िष्ठ","जादा","क्कड़", "ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां", "अक्कड़","तव्य:","निष्ठ""ो","े","ू","ु","ी","ि","ा","कर","ाओ","िए","ाई","ाए","ने","नी","ना","ते","ीं","ती","ता","ाँ","ां","ों","ें","ाकर","ाइए","ाईं","ाया","ेगी","ेगा","ोगी","ोगे","ाने","ाना","ाते","ाती","ाता","तीं","ाओं","ाएं","ुओं","ुएं","ुआं","ाएगी","ाएगा","ाओगी","ाओगे","एंगी","ेंगी","एंगे","ेंगे","ूंगी","ूंगा","ातीं","नाओं","नाएं","ताओं","ताएं","ियाँ","ियों","ियां","ाएंगी","ाएंगे","ाऊंगी","ाऊंगा","ाइयाँ","ाइयों","ाइयां"]


In [None]:

def words_lemmas(words):
    
    words_lemmas = {}
    lemmas=[]
    remaining = []
    remaining2 = []
    sett = []
    for i in words:
        sett.append(i)
        flag = 0
        if i in word:
            lemmas.append(word_lemma_dict[i])
            words_lemmas[i] = word_lemma_dict[i]
        elif i in lemma:
            lemmas.append(i)
            words_lemmas[i] = i
        else:
            for s in suffixes:
                if i.endswith(s):
                    t = i[:(len(i)-len(s))]
                    if t in lemma:
                        lemmas.append(t)
                        words_lemmas[i] = t
                        break
                    else:
                        remaining.append(i)
                        flag = 1
                        words_lemmas[i]=t
                        break

            if(flag == 0):
                remaining.append(i)
                words_lemmas[i]=i
                
    for k in remaining:
        flag=0
        for s in suffixes:
            if k.endswith(s):
                k=k[:len(i)-len(s)]
                for s1 in suffixes:
                    j=k+s1
                    if j in lemma:
                        lemmas.append(j)
                        words_lemmas[i]=j
                        break
                    else:
                        remaining2.append(i)
                        words_lemmas[i]=k
                        flag=1
                        break
                break         
        if(flag==0):
            remaining2.append(i)
            words_lemmas[i]=i                
    return words_lemmas
        

In [None]:
conllu=open('/content/drive/MyDrive/NLPassn1/hi_hdtb-ud-test.conllu','r', encoding="utf-8")
annotations =conllu.read()
sentences = parse(annotations)
word2=[]
lemma2=[]
word2_lemma2_dict={}
for sentence in sentences:
    for token in sentence:
        word2.append(token['form'])
        lemma2.append(token['lemma'])
for i in range(len(word2)):
    word2_lemma2_dict[word2[i]]=lemma2[i]
store=words_lemmas(word2) 
print(len(store.keys()),len(word2_lemma2_dict.keys()))

420 420
5298 5298


In [None]:
correct=0
total=len(word2_lemma2_dict.keys())
for i in word2_lemma2_dict.keys():
    if(store[i]==word2_lemma2_dict[i]):
        correct+=1
print("Accuracy:", correct/total)        

Accuracy: 0.44394110985277463
