POS tagger using bidirectinal LSTM

In [3]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional 
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import numpy as np

Read the data from the train file

In [4]:
data = pd.read_csv("train.tsv",delimiter="\t",on_bad_lines='skip')

check whether it has nay null columns

In [5]:
data=data.dropna()

In [6]:
data.columns=['word',"Tag"]

Exploring the dataset

In [7]:
data.describe()

Unnamed: 0,word,Tag
count,5057045,5057045
unique,126547,5
top,a,N
freq,203526,4339835


We create  train bag of words

In [10]:
train_bag=list(zip(data['word'],data['Tag']))

s tag refers to the end of a sentence ,we split and seggregate them to different ssentence

In [11]:
size = len(train_bag)
idx_list = [idx + 1 for idx, val in
            enumerate(train_bag) if val[0] == "<S>"]


In [12]:
res = [train_bag[i: j] for i, j in
        zip([0] + idx_list, idx_list + 
        ([size] if idx_list[-1] != size else []))]

we find the list of unique words 

In [13]:
vocab = list(set([w for sent in res for (w,t) in sent]))
vocab.append('<PAD>')
print(len(vocab))
tags = list(set([t for sent in res for (w,t) in sent]))
tags.append('<PAD>')
print(tags)
print(res[0]) 

126548
['N', 'T', 'S', 'U', 'H', '<PAD>']
[(')', 'N'), ('tá', 'N'), ('níos', 'N'), ('lú', 'N'), ('gaeilge', 'N'), ('ag', 'N'), ('na', 'N'), ('gardaí', 'N'), ('ná', 'N'), ('bí', 'S'), ('ariamh', 'N'), ('ainneoin', 'N'), ('na', 'N'), ('cearta', 'U'), ('.', 'N'), ('níl', 'N'), ('sé', 'N'), ('ach', 'N'), ('roinnt', 'N'), ('seachtainí', 'N'), ('ó', 'N'), ('sin', 'S'), ('a', 'N'), ('tógadh', 'N'), ('fear', 'N'), ('bocht', 'N'), ('a', 'N'), ('tug', 'S'), ('ainm', 'N'), ('gaeilge', 'N'), ('dóibh', 'N'), ('.', 'N'), ('<S>', 'N')]


padding the feature and the label to fit the data inside the model

In [14]:
num_tags =list(set((data['Tag'])))

In [15]:
max_len = 60
word2index = {w: i for i, w in enumerate(vocab)}
tag2index = {t: i for i, t in enumerate(tags)}
onehot = [[word2index[w[0]] for w in s] for s in res]
X = pad_sequences(maxlen=max_len, sequences=onehot, padding="post", value=len(vocab)-1)

In [16]:
onehot_y = [[tag2index[w[1]] for w in s] for s in res]
y = pad_sequences(maxlen=max_len, sequences=onehot_y, padding="post", value=tag2index["<PAD>"])
y = [to_categorical(i, num_classes=len(tags)) for i in y]

In [17]:
y = np.asarray(y)

In [18]:
Xtrain, ytrain = X, y

Hyper parameter tuning 

In [19]:
def tuning(batch_size,epochs):
  model = Sequential()
  model.add(Embedding(input_dim=len(vocab), output_dim=50, input_length=max_len))
  model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))
  model.add(TimeDistributed(Dense(len(tags), activation="softmax")))
  model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
  history = model.fit(Xtrain, ytrain, batch_size=batch_size, epochs=epochs, validation_split=0.1, verbose=1)
  print(max(history.history['val_accuracy']))

In [20]:
batch_size=[1000,5000,10000]
n_epochs=[5,10,15]
for i,j in zip(batch_size,n_epochs):
  tuning(i,j)


In [21]:
value=zip(batch_size,n_epochs)

we got high accuracy for epoechs= 5 and batch_size 1000.
we used them in the model

In [22]:
import numpy
# run fit on Colab or hopper
model = Sequential()
model.add(Embedding(input_dim=len(vocab), output_dim=50, input_length=max_len))
model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))
model.add(TimeDistributed(Dense(len(tags), activation="softmax")))
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
history = model.fit(Xtrain, ytrain, batch_size=1000, epochs=5, validation_split=0.1, verbose=1)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


we pad them test set and encode them and then make predictition

In [27]:
def evaluate():
  test_data = open('test.tsv', 'r')
  all_sentences = []
  sentence = []
  for line in test_data:
      pieces = line.rstrip("\n").split("\t")
      if pieces[0]=='<S>':
        all_sentences.append(sentence)
        sentence = []
      else:
        sentence.append(pieces)
  test_onehot = [[word2index[w[0]] if w[0] in word2index else len(vocab) - 1 for w in s] for s in all_sentences]
  X_final_test = pad_sequences(maxlen=max_len, sequences=test_onehot, padding="post", value=len(vocab)-1)
  onehot_y_test = [[tag2index[w[1]] for w in s] for s in all_sentences]
  y_final_test = pad_sequences(maxlen=max_len, sequences=onehot_y_test, padding="post", value=tag2index["<PAD>"])
  y_final_test = [to_categorical(i, num_classes=len(tags)) for i in y_final_test]
  test_result = model.evaluate(X_final_test, numpy.asarray(y_final_test),verbose=0)
  return test_result[1]

In [28]:
evaluate()

0.9815133810043335