# NER Tagging with Bidirectional LSTM

#### importing necessary packages

In [1]:
import pandas as pd
import numpy as np
data = pd.read_csv('S21-gene-train.txt', encoding= 'unicode_escape', sep="\\t", names = ["Length", "Word", "Tag"])
data.head()

  return func(*args, **kwargs)


Unnamed: 0,Length,Word,Tag
0,1,Comparison,O
1,2,with,O
2,3,alkaline,B
3,4,phosphatases,I
4,5,and,O


#### convert and map tokens and tags to integer values

In [2]:

from itertools import chain
def get_dict_map(data, token_or_tag):
     tok2idx = {}
     idx2tok = {}

     if token_or_tag == 'token':
         vocab = list(set(data['Word'].to_list()))
     else:
         vocab = list(set(data['Tag'].to_list()))

     idx2tok = {idx:tok for idx, tok in enumerate(vocab)}
     tok2idx = {tok:idx for idx, tok in enumerate(vocab)}
     return tok2idx, idx2tok
token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

#### Creating new column in the main data structure for word and tag indices

In [3]:
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)


#### Creating data groups on the basis of sentences

In [4]:
data["Sentence"] = data["Length"]
data["Sentence"][data["Sentence"] > 1] = None
data.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Sentence"][data["Sentence"] > 1] = None


Unnamed: 0,Length,Word,Tag,Word_idx,Tag_idx,Sentence
0,1,Comparison,O,7749,2,1.0
1,2,with,O,3091,2,
2,3,alkaline,B,1562,1,
3,4,phosphatases,I,2867,0,
4,5,and,O,8530,2,


In [5]:
sen_no = 1
for j, i in enumerate(data["Sentence"]):
     if i == 1:
         data["Sentence"][j] = sen_no
         sen_no += 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Sentence"][j] = sen_no


In [6]:
data.head()

Unnamed: 0,Length,Word,Tag,Word_idx,Tag_idx,Sentence
0,1,Comparison,O,7749,2,1.0
1,2,with,O,3091,2,
2,3,alkaline,B,1562,1,
3,4,phosphatases,I,2867,0,
4,5,and,O,8530,2,


In [7]:
empty = np.where(data["Word"].isna())

In [8]:
empty_rows = list(empty[0])

In [9]:
data = data.drop(empty_rows)


In [10]:
np.where(data["Word"].isna())

(array([], dtype=int64),)

In [11]:
data.isna().any()

Length      False
Word        False
Tag         False
Word_idx    False
Tag_idx     False
Sentence     True
dtype: bool

In [12]:
data_fillna = data.fillna(method='ffill', axis=0)
data_fillna.head()

Unnamed: 0,Length,Word,Tag,Word_idx,Tag_idx,Sentence
0,1,Comparison,O,7749,2,1.0
1,2,with,O,3091,2,1.0
2,3,alkaline,B,1562,1,1.0
3,4,phosphatases,I,2867,0,1.0
4,5,and,O,8530,2,1.0


In [13]:
data_group = data_fillna.groupby(
['Sentence'],as_index=False
)['Length', 'Word', 'Tag', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))
# Visualise data
data_group.head()

  data_group = data_fillna.groupby(


Unnamed: 0,Sentence,Length,Word,Tag,Word_idx,Tag_idx
0,1.0,"[1, 2, 3, 4, 5, 6, 7, 8, 9]","[Comparison, with, alkaline, phosphatases, and...","[O, O, B, I, O, B, I, I, O]","[7749, 3091, 1562, 2867, 8530, 1622, 9157, 959...","[2, 2, 1, 0, 2, 1, 0, 0, 2]"
1,2.0,"[1, 2, 3, 4, 5, 6]","[Pharmacologic, aspects, of, neonatal, hyperbi...","[O, O, O, O, O, O]","[5551, 6300, 5600, 5316, 3696, 2715]","[2, 2, 2, 2, 2, 2]"
2,3.0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[When, CSF, [, HCO3, -], is, shown, as, a, fun...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[8787, 6761, 6456, 6526, 6785, 4146, 1304, 381...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
3,4.0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Flurazepam, thus, appears, to, be, an, effect...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[1899, 2103, 1593, 5918, 3467, 1862, 3846, 663...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
4,5.0,"[1, 2, 3, 4]","[Beta, blocking, agents, .]","[O, O, O, O]","[3553, 6409, 4822, 2715]","[2, 2, 2, 2]"


#### Creating train and test tokens


In [14]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [15]:
def get_pad_train_test_val(data_group, data):
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))
    
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= n_token - 1)
    
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= tag2idx["O"])
    
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]
    
    
    train_tokens, test_tokens, train_tags, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)

    print(
        'train_tokens length:', len(train_tokens),
        '\ntest_tokens length:', len(test_tokens),
        '\ntrain_tags:', len(train_tags),
        '\ntest_tags:', len(test_tags)
    )
    
    return train_tokens, test_tokens, train_tags, test_tags

train_tokens, test_tokens, train_tags, test_tags = get_pad_train_test_val(data_group, data)

train_tokens length: 1759 
test_tokens length: 196 
train_tags: 1759 
test_tags: 196


# Model


In [16]:
import numpy as np
import tensorflow
from tensorflow import keras
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
import keras


In [17]:
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [18]:
input_dim = len(list(set(data['Word'].to_list())))+1
output_dim = 32
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
n_tags = len(tag2idx)
print('input_dim: ', input_dim, '\noutput_dim: ', output_dim, '\ninput_length: ', input_length, '\nn_tags: ', n_tags)

input_dim:  9233 
output_dim:  32 
input_length:  143 
n_tags:  3


In [19]:
def get_bilstm_lstm_model():
    model = Sequential()

    # Add Embedding layer
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

    # Add bidirectional LSTM
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))

    # Add LSTM
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Add timeDistributed Layer
    model.add(TimeDistributed(Dense(n_tags, activation='leaky_relu')))

    #Optimiser 
    adam = tensorflow.keras.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

    # Compile model
    model.compile(loss="categorical_crossentropy", optimizer=adam, metrics=['accuracy'])
    model.summary()
    
    return model

In [20]:
def train_model(X, y, model):
    loss = list()
    for _ in range(15):
        # fit model for one epoch on this sequence
        hist = model.fit(X, y, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss
model_bilstm_lstm = get_bilstm_lstm_model()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 143, 32)           295456    
                                                                 
 bidirectional (Bidirectiona  (None, 143, 64)          16640     
 l)                                                              
                                                                 
 lstm_1 (LSTM)               (None, 143, 32)           12416     
                                                                 
 time_distributed (TimeDistr  (None, 143, 3)           99        
 ibuted)                                                         
                                                                 
Total params: 324,611
Trainable params: 324,611
Non-trainable params: 0
_________________________________________________________________


  super(Adam, self).__init__(name, **kwargs)


#### Training the model

In [21]:
results = pd.DataFrame()
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm)

