# Hindi POS tagger using a neural network

### Importing libraries and Loading Data


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
!pip install conllu

Collecting conllu
  Downloading https://files.pythonhosted.org/packages/ae/be/be6959c3ff2dbfdd87de4be0ccdff577835b5d08b1d25bf7fd4aaf0d7add/conllu-4.4-py2.py3-none-any.whl
Installing collected packages: conllu
Successfully installed conllu-4.4


In [None]:
from conllu import parse_incr
from io import open
# Parse the training split into a list with one parsed sentence per element.
# The context manager closes the file as soon as parsing has finished
# (parse_incr is lazy, so it must be consumed inside the `with` block).
with open('/hi_hdtb-ud-train.conllu','r',encoding='utf-8') as train_file:
    train_files=list(parse_incr(train_file))

In [None]:
# Parse the development split into a list with one parsed sentence per element.
# The context manager closes the file as soon as parsing has finished
# (parse_incr is lazy, so it must be consumed inside the `with` block).
with open('/hi_hdtb-ud-dev.conllu','r',encoding='utf-8') as val_file:
    val_files=list(parse_incr(val_file))

In [None]:
# Parse the test split into a list with one parsed sentence per element.
# The context manager closes the file as soon as parsing has finished
# (parse_incr is lazy, so it must be consumed inside the `with` block).
with open('/hi_hdtb-ud-test.conllu','r',encoding='utf-8') as test_file:
    test_files=list(parse_incr(test_file))

In [None]:
def dataset(ud_files):
  """Turn parsed sentences into a list of (tokens, tags) pairs.

  Args:
    ud_files: iterable of sentences; each sentence is an iterable of
      mappings that provide the keys 'form' and 'upostag'.

  Returns:
    A list with one tuple per sentence: (list of 'form' values,
    list of 'upostag' values), both in token order.
  """
  bank=[]
  for sentence in ud_files:
      # Read both fields in a single pass over the sentence.
      pairs=[(token['form'],token['upostag']) for token in sentence]
      forms=[form for form,_ in pairs]
      pos_tags=[pos for _,pos in pairs]
      bank.append((forms,pos_tags))
  return bank

In [None]:
# Build the (tokens, tags) banks for the train, test and dev splits, in that order.
train_bank,test_bank,val_bank=(dataset(split) for split in (train_files,test_files,val_files))

In [None]:

len(train_bank),len(test_bank),len(val_bank)

(13304, 1684, 1659)

In [None]:
def separate(bank):
    """Split a bank of (tokens, tags) entries into two parallel lists.

    Args:
        bank: sequence whose entries hold the token list at position 0
            and the tag list at position 1.

    Returns:
        (X, y): X collects every entry's position-0 item, y every
        entry's position-1 item, in the original order.
    """
    X=[entry[0] for entry in bank]
    y=[entry[1] for entry in bank]
    return X,y

In [None]:
# Separate each bank into sentence-level token lists (X*) and tag lists (y*).
(Xtrain,ytrain),(Xdev,ydev),(Xtest,ytest)=(
    separate(bank) for bank in (train_bank,val_bank,test_bank)
)

### Loading word embeddings

In [None]:
!wget -O hi.tar.gz https://www.dropbox.com/s/pq50ca4o3phi9ks/hi.tar.gz?dl=0

--2021-05-27 03:50:13--  https://www.dropbox.com/s/pq50ca4o3phi9ks/hi.tar.gz?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.6.18, 2620:100:601c:18::a27d:612
Connecting to www.dropbox.com (www.dropbox.com)|162.125.6.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/pq50ca4o3phi9ks/hi.tar.gz [following]
--2021-05-27 03:50:14--  https://www.dropbox.com/s/raw/pq50ca4o3phi9ks/hi.tar.gz
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc24707568c9eba3ea88d2c5633e.dl.dropboxusercontent.com/cd/0/inline/BPTIAQ3TnPqtOvm4-orCwYtkY72oM4SwY0R76kBfEkxPlZ0QznTclxIqtE7C5MkHUl4TKx58OyoC0DIQKf5_ZU9WM_gAgn32PiZdLDiKeNejSfMbPq5ZSmN34AkXjdcO-RSx1WtRKt8Q-CpXfkDCcm-H/file# [following]
--2021-05-27 03:50:14--  https://uc24707568c9eba3ea88d2c5633e.dl.dropboxusercontent.com/cd/0/inline/BPTIAQ3TnPqtOvm4-orCwYtkY72oM4SwY0R76kBfEkxPlZ0QznTclxIqtE7C5MkHUl4TKx58OyoC0DIQKf5_ZU9WM_gAgn

In [None]:
!gzip -d /content/hi.tar.gz

In [None]:
!ls

fasttext.pkl  hi.bin  hi.tar  hi.tar.gz  hi.vec  sample_data


In [None]:
!tar -xvf /content/hi.tar

hi.bin
hi.vec


In [None]:
# Load the pretrained vectors from hi.vec into a {word: float32 vector} dict.
# (The handle is called glove_file for historical reasons; the file is the fastText hi.vec.)
embeddings_index={}
with open('/content/hi.vec',encoding="utf8") as glove_file:
    for line_number,line in enumerate(glove_file):
        values = line.split()
        if not values:
            continue  # blank line: nothing to parse
        # fastText text files usually start with a "<vocab_size> <dimension>" header.
        # It is not a word vector, so skip it instead of storing a 1-element "embedding".
        if line_number == 0 and len(values) == 2 and all(v.isdigit() for v in values):
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
import pickle
filename='fasttext.pkl'
# Cache the parsed embeddings on disk. The context manager guarantees the file is
# flushed and closed, which a bare open() passed to pickle.dump does not.
with open(filename,'wb') as pickle_file:
    pickle.dump(embeddings_index,pickle_file)

### Creating word and tag dictionaries

In [None]:
word=embeddings_index.keys()

In [None]:
word=list(word)

In [None]:
# Word -> index dictionary. enumerate() yields the same positions as word.index(k)
# (the words are unique dict keys) without rescanning the list for every word.
word2id={k:position for position,k in enumerate(word)}

In [None]:
def flatten(y):
  """Return one flat list holding the items of every inner iterable of ``y``, in order."""
  return [item for inner in y for item in inner]

In [None]:
xflat=list(flatten(Xtrain))

In [None]:
y=ytrain

In [None]:
# Flatten straight from ytrain so the cell is idempotent: flattening an already-flat
# list of tag strings would split every tag into single characters.
y=list(flatten(ytrain))

In [None]:
len(y)

281057

In [None]:
# Tag -> index dictionary: each tag gets the next free integer the first time it appears in y.
tag2int={}
for tag in y:
  if tag not in tag2int:
    tag2int[tag]=len(tag2int)

In [None]:
len(tag2int)

16

### Creating Embedding matrix and handling unknown words.

In [None]:
EMBEDDING_DIM=300
# One row per id in word2id, plus one extra row at the end that starts out all-zero.
embedding_matrix = np.zeros((len(word2id)+1,EMBEDDING_DIM))
# The loop variable is `token`, not `word`, so it does not overwrite the module-level
# `word` list that the word2id cell reads when it is re-run.
for token,i in word2id.items():
  embedding_vector = embeddings_index.get(token)
  # Copy only vectors of the expected size: NumPy would silently broadcast a shorter one
  # (e.g. a 1-element vector parsed from a header line) across the whole row.
  if embedding_vector is not None and np.shape(embedding_vector) == (EMBEDDING_DIM,):
    embedding_matrix[i]=embedding_vector

In [None]:
def add_new_word(new_word,new_vector,new_index,embedding_matrix,word2id):
  """Insert a word and its vector at a given row, shifting later ids up by one.

  Args:
    new_word: the word to register.
    new_vector: embedding row to insert for ``new_word``.
    new_index: row position (and id) the new word receives.
    embedding_matrix: 2-D array of existing embeddings; not modified.
    word2id: existing word -> id mapping; not modified.

  Returns:
    (matrix, mapping): a new matrix with ``new_vector`` inserted before row
    ``new_index``, and a new mapping in which every id >= ``new_index`` is
    incremented and ``new_word`` maps to ``new_index``.
  """
  grown_matrix=np.insert(embedding_matrix,[new_index],[new_vector],axis=0)
  shifted_ids={}
  for token,idx in word2id.items():
    # Ids at or after the insertion point move down one row.
    shifted_ids[token]=idx+1 if idx>=new_index else idx
  shifted_ids[new_word]=new_index
  return grown_matrix,shifted_ids
  

In [None]:
# Unknown-word handling: id 0 is reserved for the "UNK" token, whose embedding is the
# column-wise mean of the rows currently in the embedding matrix.
UNK_token = "UNK"
UNK_index = 0
unk_vector = embedding_matrix.mean(axis=0)
embedding_matrix,word2id = add_new_word(
    new_word=UNK_token,
    new_vector=unk_vector,
    new_index=UNK_index,
    embedding_matrix=embedding_matrix,
    word2id=word2id,
)

In [None]:
def prepare_data(X,Y,word2id,tag2int,unk_index=None):
  """Convert sentence-level words and tags into flat arrays of integer ids.

  Args:
    X: sequence of sentences, each a sequence of words.
    Y: sequence of tag sequences aligned with ``X`` (``Y[v][i]`` is the tag
      of ``X[v][i]``).
    word2id: word -> id mapping.
    tag2int: tag -> id mapping; a tag missing from it yields ``None``.
    unk_index: id used for words absent from ``word2id``. When omitted, the
      module-level ``UNK_index`` is used.

  Returns:
    (x, y): two NumPy arrays with one entry per token, holding the word ids
    and the tag ids in corpus order.

  Side effects:
    Prints the share of tokens mapped to the unknown id. The value is a
    fraction between 0 and 1, computed over the tokens of *this* call.
  """
  x,y=[],[]
  unk_count = 0
  for v,sentence in enumerate(X):
    tags = Y[v]
    for position,word in enumerate(sentence):
        y.append(tag2int.get(tags[position]))
        if word in word2id:
          x.append(word2id.get(word))
        else:
          # Resolve the fallback lazily so the global is only needed when no
          # explicit unk_index was supplied.
          x.append(UNK_index if unk_index is None else unk_index)
          unk_count+=1
  # Divide by the number of tokens processed here, not by the size of the training
  # set, and avoid a ZeroDivisionError on empty input.
  total_tokens = len(x)
  unk_ratio = unk_count/total_tokens if total_tokens else 0.0
  print("Data created.Unknown data percentage: %.3f" % unk_ratio)
  return np.array(x),np.array(y)

    

In [None]:
x_train,y_train=prepare_data(Xtrain,ytrain,word2id,tag2int)

Data created.Unknown data percentage: 0.059


In [None]:
x_test,y_test=prepare_data(Xtest,ytest,word2id,tag2int)


Data created.Unknown data percentage: 0.063


In [None]:
x_val,y_val = prepare_data(Xdev,ydev,word2id,tag2int)

Data created.Unknown data percentage: 0.059


### Creating Model and training

In [None]:
from keras.layers import Dense, Embedding, Activation, Flatten
from keras.models import Sequential
# to_categorical ships with Keras/TensorFlow, so nothing extra has to be installed:
# the PyPI package called "np_utils" is a separate project, not the keras.utils.np_utils module.
from tensorflow.keras.utils import to_categorical

Collecting np_utils
[?25l  Downloading https://files.pythonhosted.org/packages/b6/18/5704a782fd72727a9e63198fcc76fadb86975f45bcdf579c10f668329508/np_utils-0.5.12.1.tar.gz (61kB)
[K     |█████▍                          | 10kB 15.9MB/s eta 0:00:01[K     |██████████▊                     | 20kB 21.1MB/s eta 0:00:01[K     |████████████████                | 30kB 24.4MB/s eta 0:00:01[K     |█████████████████████▍          | 40kB 26.3MB/s eta 0:00:01[K     |██████████████████████████▊     | 51kB 28.6MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 6.8MB/s 
Building wheels for collected packages: np-utils
  Building wheel for np-utils (setup.py) ... [?25l[?25hdone
  Created wheel for np-utils: filename=np_utils-0.5.12.1-cp37-none-any.whl size=57126 sha256=c54f3e59572a473f8f08898f29d9d6fd46e8bddfa055eaed304ba46c0b5dda42
  Stored in directory: /root/.cache/pip/wheels/92/4b/81/206efd0d01330a96f3aebe5021d2d5f0b264b7ade827c306ef
Successfully built np-utils
Installing col

In [None]:
# One-hot encode the tag ids. num_classes pins the width to the size of the tag set so
# all three splits get the same number of columns even if one lacks the highest tag id.
y_train, y_test,y_val = (
    to_categorical(y_train, num_classes=len(tag2int)),
    to_categorical(y_test, num_classes=len(tag2int)),
    to_categorical(y_val, num_classes=len(tag2int)),
)

In [None]:
import collections

In [None]:
# Token-level gold tags for each split, used later as reference labels for accuracy.
ytrainflat,ytestflat,ydevflat=(list(flatten(tags)) for tags in (ytrain,ytest,ydev))

In [None]:

hidden_size = 50
batch_size = 128

def define_model(embedding_matrix, class_count):
    """
    Build and compile a model that takes one word id as input and returns a
    probability distribution over part-of-speech classes.

    Args:
        embedding_matrix: 2-D array of pretrained word vectors, one row per word id;
            used as the initial weights of the embedding layer.
        class_count: number of output classes (size of the softmax layer).

    Returns:
        A compiled Sequential model: Embedding -> Flatten -> Dense(tanh) -> Dense(softmax),
        trained with Adam on categorical cross-entropy. The hidden width is read
        from the module-level ``hidden_size``.
    """
    # Take both dimensions from the matrix itself so the layer always matches the
    # vectors that are loaded (no hard-coded embedding size).
    vocab_length, embedding_dim = np.shape(embedding_matrix)
    model = Sequential()
    # Input dimension is the vocabulary size, output is the embedding dimension.
    # The layer starts from the pretrained vectors and reads one word id per sample.
    model.add(Embedding(input_dim=vocab_length,output_dim=embedding_dim,weights=[embedding_matrix],input_length=1))
    model.add(Flatten())
    model.add(Dense(hidden_size,activation="tanh"))
    model.add(Dense(class_count,activation="softmax"))
    model.compile(optimizer="Adam",loss="categorical_crossentropy",metrics=["accuracy"])
    return model

pos_model = define_model(embedding_matrix, len(tag2int))
pos_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1, 300)            9119400   
_________________________________________________________________
flatten (Flatten)            (None, 300)               0         
_________________________________________________________________
dense (Dense)                (None, 50)                15050     
_________________________________________________________________
dense_1 (Dense)              (None, 16)                816       
Total params: 9,135,266
Trainable params: 9,135,266
Non-trainable params: 0
_________________________________________________________________


In [None]:
pos_model.fit(x_train,y_train,epochs=1,verbose=1,batch_size=batch_size)



<keras.callbacks.History at 0x7fcc9ae84210>

In [None]:
id2tag = {v:k for k,v in tag2int.items()}

### Evaluating performance

In [None]:
prediction = pos_model.predict(x_test) #test set

In [None]:
# Decode each row of class scores into the tag with the highest score.
num=prediction.shape[0]
l=[id2tag[np.argmax(scores)] for scores in prediction]

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(ytestflat,l)

0.8875529212531753

In [None]:
predictiontrain=pos_model.predict(x_train) #train set

In [None]:
ytrainflat=flatten(ytrain)

In [None]:
# Decode each row of class scores into the tag with the highest score.
num=predictiontrain.shape[0]
l=[id2tag[np.argmax(scores)] for scores in predictiontrain]

In [None]:
accuracy_score(ytrainflat,l)

0.903240267988344

In [None]:
predictiondev=pos_model.predict(x_val) #validation set

In [None]:
yvalflat=flatten(ydev)

In [None]:
# Decode each row of class scores into the tag with the highest score.
num=predictiondev.shape[0]
l=[id2tag[np.argmax(scores)] for scores in predictiondev]

In [None]:
accuracy_score(yvalflat,l)

0.8862197234290258

### Testing the model

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
sentence = "भारत की राजधानी दिल्ली है ।"

In [None]:
sentence=word_tokenize(sentence)

In [None]:
sentence

['भारत', 'की', 'राजधानी', 'दिल्ली', 'है', '।']

In [None]:
# Map every token of the sentence to its id, using the "UNK" id for tokens
# that are not in the vocabulary, and pack the ids into an array.
test_set=np.array(
    [word2id[ele] if ele in word2id else word2id["UNK"] for ele in sentence]
)

In [None]:
pred=pos_model.predict(test_set)

In [None]:
# Decode each row of class scores into the tag with the highest score.
num=pred.shape[0]
l=[id2tag[np.argmax(scores)] for scores in pred]


In [None]:
l

['PROPN', 'ADP', 'NOUN', 'PROPN', 'AUX', 'PUNCT']

In [None]:
pos_model.save('postag.h5')