# Named Entity Recognition using LSTM

Named Entity Recognition (NER) is a subtask of information extraction that locates and classifies named entities in a text. The named entities could be organizations, persons, locations, times, etc.

For example:

<img src='./media/ner.png' width=600px>

Is labeled as follows:

- French: geopolitical entity
- Morocco: geographic entity
- Christmas: time indicator

Everything else that is labeled with an O is not considered to be a named entity.

In [48]:
# Import required libraries
import trax
from trax import layers as tl
import numpy as np
from utils import (get_vocab_and_tags, build_word2idx_and_tag2idx_dict,
                  to_tensor)

## Load Dataset

A few NER tags which the model would be trained to identify:

- geo: geographical entity
- org: organization
- per: person
- gpe: geopolitical entity
- tim: time indicator
- art: artifact
- eve: event
- nat: natural phenomenon
- O: filler word

**Note:**

- B-\<tag>: Indicates the first occurrence (the beginning) of a 'tag' entity in the sentence. E.g. B-geo
- I-\<tag>: Indicates a subsequent occurrence (a continuation) of the 'tag' entity after the first has occurred in the sentence. E.g. I-geo

In [49]:
tags_path = 'ner-datasets/large/tags.txt'
words_path = 'ner-datasets/large/words.txt'

### Build vocabulary and NER Tag list

In [50]:
vocab, tags = get_vocab_and_tags(words_path, tags_path)

In [51]:
print('NER tags the model will be trained on:\n', tags)

NER tags the model will be trained on:
 ['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim', 'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve', 'I-eve', 'I-nat']


### Build the Word to Index and Tag to Index Dictionary

In [52]:
vocab_dict, tag_dict = build_word2idx_and_tag2idx_dict(vocab, tags)

In [53]:
vocab_dict.get('<PAD>')

35180

In [54]:
vocab_dict.get('UNK')

35179

### Convert Sentences and NER Labels to Tensors

In [55]:
train_sentences_path = 'ner-datasets/large/train/sentences.txt'
train_labels_path = 'ner-datasets/large/train/labels.txt'

test_sentences_path = 'ner-datasets/large/test/sentences.txt'
test_labels_path = 'ner-datasets/large/test/labels.txt'

val_sentences_path = 'ner-datasets/large/val/sentences.txt'
val_labels_path = 'ner-datasets/large/val/labels.txt'

In [56]:
# Convert each data split (train / test / val) from its sentences and labels
# files into a (sentences, labels) pair, using the same vocab_dict and
# tag_dict for every split so that ids are consistent across splits.
# NOTE(review): to_tensor comes from utils and is not visible here; judging
# by the sample printed in the next cell, each entry is a list of integer
# ids -- confirm against utils.
train_sentences_tensor, train_labels_tensor = to_tensor(train_sentences_path,
                                                        train_labels_path,
                                                        vocab_dict=vocab_dict,
                                                        tag_dict=tag_dict)

# Held-out test split, used for the final accuracy computation.
test_sentences_tensor, test_labels_tensor = to_tensor(test_sentences_path,
                                                      test_labels_path,
                                                      vocab_dict=vocab_dict,
                                                      tag_dict=tag_dict)
                                                      

# Validation split, used for evaluation during training.
val_sentences_tensor, val_labels_tensor = to_tensor(val_sentences_path,
                                                    val_labels_path,
                                                    vocab_dict=vocab_dict,
                                                    tag_dict=tag_dict)

In [57]:
print('Tensor of first sentence in train data:\n', train_sentences_tensor[0])
print('Tensor of labels associated with first sentence in train data:\n', train_labels_tensor[0])

Tensor of first sentence in train data:
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 9, 15, 1, 16, 17, 18, 19, 20, 21]
Tensor of labels associated with first sentence in train data:
 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0]


### Create a Data Generator

In [58]:
def data_generator(batch_size, sentences_tensor,
                   tags_tensor, pad, max_len, shuffle=False):
    """Yield an endless stream of fixed-width (sentences, tags) batches.

    Every example is cut to at most `max_len` items and then right-padded
    with `pad` up to `max_len`, so each yielded array has shape
    (batch_size, max_len). The same `pad` value is appended to the sentence
    row and to the tag row, which lets a downstream mask be derived from the
    tag array alone.

    The generator never terminates: after the last example it starts again
    from the first one (reshuffling when `shuffle` is true), so a batch may
    contain examples from two consecutive passes over the data.

    Args:
        batch_size: number of examples per yielded batch (must be >= 1).
        sentences_tensor: sequence of token-id sequences.
        tags_tensor: sequence of tag-id sequences, aligned with
            `sentences_tensor` row by row.
        pad: value used to fill rows shorter than `max_len`.
        max_len: width of every row in the yielded arrays.
        shuffle: when true, visit the examples in a random order that is
            redrawn (via np.random.shuffle) at the start of every pass.

    Yields:
        Tuple (X, y) of numpy arrays holding the padded sentence rows and
        the padded tag rows of one batch.

    Raises:
        ValueError: on the first `next()` call if `batch_size` < 1, if there
            are no examples, or if `tags_tensor` has fewer rows than
            `sentences_tensor`; later if a sentence and its tag row still
            differ in length after truncation to `max_len`.
    """
    # A batch size below 1 would make the loop below run forever without
    # ever yielding, so reject it up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    n_examples = len(sentences_tensor)
    if n_examples == 0:
        raise ValueError("sentences_tensor is empty; nothing to generate")
    if len(tags_tensor) < n_examples:
        raise ValueError(
            f"tags_tensor has {len(tags_tensor)} rows but sentences_tensor "
            f"has {n_examples}")

    index_list = list(range(n_examples))
    if shuffle:
        np.random.shuffle(index_list)

    index = 0
    batch_sentences_tensor, batch_tags_tensor = [], []
    while True:
        # End of a pass: start over, with a fresh order when shuffling.
        if index == n_examples:
            index = 0
            if shuffle:
                np.random.shuffle(index_list)

        row = index_list[index]
        # list(...) keeps the padding below a concatenation even when the
        # rows are numpy arrays (where `+` would be element-wise addition).
        sentence = list(sentences_tensor[row][:max_len])
        tags = list(tags_tensor[row][:max_len])

        if len(sentence) != len(tags):
            raise ValueError(
                f"example {row}: sentence and tag lengths differ "
                f"({len(sentence)} vs {len(tags)})")

        filler = [pad] * (max_len - len(sentence))
        batch_sentences_tensor.append(sentence + filler)
        batch_tags_tensor.append(tags + filler)

        index += 1

        if len(batch_sentences_tensor) == batch_size:
            X = np.array(batch_sentences_tensor)
            y = np.array(batch_tags_tensor)

            yield X, y

            batch_sentences_tensor, batch_tags_tensor = [], []

In [59]:
# Smoke-test the generator: one example per batch, drawn from the first two
# training examples in their original order (shuffle=False), with every row
# padded or truncated to 30 positions using the '<PAD>' id.
gen = data_generator(batch_size=1, 
                     sentences_tensor=train_sentences_tensor[:2], 
                     tags_tensor=train_labels_tensor[:2], 
                     pad=vocab_dict['<PAD>'],
                     max_len=30,
                     shuffle=False)

In [60]:
next(gen)

(array([[    0,     1,     2,     3,     4,     5,     6,     7,     8,
             9,    10,    11,    12,    13,    14,     9,    15,     1,
            16,    17,    18,    19,    20,    21, 35180, 35180, 35180,
         35180, 35180, 35180]]),
 array([[    0,     0,     0,     0,     0,     0,     1,     0,     0,
             0,     0,     0,     1,     0,     0,     0,     0,     0,
             2,     0,     0,     0,     0,     0, 35180, 35180, 35180,
         35180, 35180, 35180]]))

## NER - LSTM Model

We will create an LSTM model to carry out the NER task using the architecture below.

Framework used: **trax**

<img src='./media/architecture.png' width=600px>

### Define the model

In [61]:
def ner_model(vocab_size, d_model, output_dim):
    """Assemble the sequence-tagging network as a trax Serial combinator.

    The layers are applied in order: an embedding table with `vocab_size`
    rows of width `d_model`, an LSTM built with `d_model`, a dense
    projection to `output_dim` units, and a log-softmax.

    Args:
        vocab_size: first argument of the embedding layer.
        d_model: embedding width, also passed to the LSTM layer.
        output_dim: number of units of the dense layer.

    Returns:
        The `tl.Serial` model composed of the four layers above.
    """
    layers = [
        tl.Embedding(vocab_size, d_model),  # token id -> d_model vector
        tl.LSTM(d_model),
        tl.Dense(output_dim),               # one unit per output class
        tl.LogSoftmax(),
    ]
    return tl.Serial(*layers)

In [62]:
# Check the architecture of the defined model: d_model=50 is used for both
# the embedding width and the LSTM, and there is one output unit per NER tag.
# NOTE(review): the embedding is sized from len(vocab), yet the '<PAD>' id
# printed earlier (35180) equals the embedding row count shown in the
# printout below, so the pad id looks like it sits one past the last row.
# Sizing the table from len(vocab_dict) may be safer -- TODO confirm.
model = ner_model(vocab_size=len(vocab),
                 d_model=50,
                 output_dim=len(tags))
print(model)

Serial[
  Embedding_35180_50
  LSTM_50
  Dense_17
  LogSoftmax
]


### Create Training and Validation Data Generators

We need to create the data generators from training and validation data which will be used during model training and evaluation.

**Note:** 

It is important to mask the padding during training so that:
1. the training loss is calculated **properly**, and
2. the evaluation loss is calculated properly.

Otherwise, apart from the valid input sequences, the model would also be trained on the padded positions `<PAD>` (which we don't want), and this would cause the model to learn an artificial trend in the data that is not actually there.

Masking the padding can be done using the `id_to_mask` argument of **trax.data.inputs.add_loss_weights**.

This will add a masking tensor to the output of the generator by identifying the pad positions in the target tensor. Hence the output returned by the generator will be a tuple of (batch input tensors, batch label tensors, batch mask tensors) instead of (batch input tensors, batch label tensors).


In [63]:
# Number of examples per batch, shared by the training and validation streams.
batch_size = 64

# Training stream: shuffled batches padded/truncated to 30 positions.
# add_loss_weights is given id_to_mask = the '<PAD>' id (35180); the sample
# output displayed in the next cell shows the resulting third array, which is
# 0 where the label is the pad id and 1 elsewhere.
train_generator = trax.data.inputs.add_loss_weights(
    data_generator(batch_size, 
                   train_sentences_tensor, 
                   train_labels_tensor, 
                   vocab_dict['<PAD>'],
                   max_len=30, 
                   shuffle=True),
        id_to_mask=vocab_dict['<PAD>'])

# Validation stream: built the same way from the validation split, with the
# same pad id (35180) passed as id_to_mask.
eval_generator = trax.data.inputs.add_loss_weights(
    data_generator(batch_size,
                   val_sentences_tensor, 
                   val_labels_tensor, 
                   pad=vocab_dict['<PAD>'],
                   max_len=30,
                   shuffle=True),
        id_to_mask=vocab_dict['<PAD>'])

In [64]:
# Display the output of the train and validation generators.
# Outputs a tuple of (X, y, mask) tensors.
example = next(train_generator)
display(example)


(array([[28793,   180,  3668, ..., 35180, 35180, 35180],
        [  797, 10217,     9, ..., 35180, 35180, 35180],
        [   61,   575,    93, ...,    45,   950, 13262],
        ...,
        [  147,   677,   180, ..., 35180, 35180, 35180],
        [  595,    78,  2396, ..., 35180, 35180, 35180],
        [12002,   223,  3061, ..., 35180, 35180, 35180]]),
 array([[    0,     0,     0, ..., 35180, 35180, 35180],
        [    1,     0,     0, ..., 35180, 35180, 35180],
        [    0,     0,     0, ...,     0,     0,     0],
        ...,
        [    0,     0,     0, ..., 35180, 35180, 35180],
        [    0,     0,     0, ..., 35180, 35180, 35180],
        [    0,     0,     7, ..., 35180, 35180, 35180]]),
 array([[1., 1., 1., ..., 0., 0., 0.],
        [1., 1., 1., ..., 0., 0., 0.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [1., 1., 1., ..., 0., 0., 0.],
        [1., 1., 1., ..., 0., 0., 0.],
        [1., 1., 1., ..., 0., 0., 0.]], dtype=float32))

### Define the Training Pipeline

In [65]:
def train_model(model,
                train_generator,
                eval_generator,
                train_steps=1,
                output_dir='model',
                learning_rate=0.01,
                n_steps_per_checkpoint=100,
                n_eval_batches=20):
    """Train `model` with a trax training loop and return the loop.

    Args:
        model: the trax model to train.
        train_generator: generator passed to the TrainTask as its data.
        eval_generator: generator passed to the EvalTask as `labeled_data`.
        train_steps: number of steps to run the loop for.
        output_dir: directory handed to the training Loop as `output_dir`.
        learning_rate: value passed to the Adam optimizer.
        n_steps_per_checkpoint: forwarded to the TrainTask unchanged.
        n_eval_batches: number of batches used on each evaluation.

    Returns:
        The `trax.supervised.training.Loop` after `run` has completed.
    """
    # Training objective: cross-entropy loss optimized with Adam.
    train_task = trax.supervised.training.TrainTask(
        train_generator,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(learning_rate),
        n_steps_per_checkpoint=n_steps_per_checkpoint,
    )

    # Evaluation reports both cross-entropy loss and accuracy.
    eval_task = trax.supervised.training.EvalTask(
        labeled_data=eval_generator,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
        n_eval_batches=n_eval_batches,
    )

    training_loop = trax.supervised.training.Loop(
        model,
        train_task,
        eval_tasks=[eval_task],
        output_dir=output_dir,
    )

    training_loop.run(n_steps=train_steps)
    return training_loop

In [66]:
train_steps = 500            # Train for this many steps
!rm -f 'model/model.pkl.gz'  # Remove old model.pkl if it exists
!mkdir model

# Train the model
training_loop = train_model(model, train_generator, eval_generator, train_steps)

mkdir: cannot create directory ‘model’: File exists


  "jax.host_count has been renamed to jax.process_count. This alias "



Step      1: Total number of trainable weights: 1780067
Step      1: Ran 1 train steps in 2.40 secs
Step      1: train CrossEntropyLoss |  3.34563971
Step      1: eval  CrossEntropyLoss |  2.31452596
Step      1: eval          Accuracy |  0.00025674

Step    100: Ran 99 train steps in 4.19 secs
Step    100: train CrossEntropyLoss |  0.52717310
Step    100: eval  CrossEntropyLoss |  0.25589706
Step    100: eval          Accuracy |  0.93168457

Step    200: Ran 100 train steps in 4.19 secs
Step    200: train CrossEntropyLoss |  0.19711468
Step    200: eval  CrossEntropyLoss |  0.16314880
Step    200: eval          Accuracy |  0.95304129

Step    300: Ran 100 train steps in 4.17 secs
Step    300: train CrossEntropyLoss |  0.15919498
Step    300: eval  CrossEntropyLoss |  0.15533679
Step    300: eval          Accuracy |  0.95550829

Step    400: Ran 100 train steps in 4.27 secs
Step    400: train CrossEntropyLoss |  0.14897750
Step    400: eval  CrossEntropyLoss |  0.14043453
Step    400:

## Compute Accuracy

Compute the test accuracy using the entire test set.

This is done by creating a generator using the test data tensors and setting the `batch_size = len(test_sentences_tensor)`

In [67]:
# Pull one batch that contains every test example (batch_size equals the size
# of the test set). Rows are padded/truncated to 70 positions here, whereas
# the training and validation generators used 30. shuffle=True only changes
# the order of the rows within this single batch.
X_test, y_test = next(data_generator(len(test_sentences_tensor), 
                                     test_sentences_tensor, 
                                     test_labels_tensor, 
                                     pad=vocab_dict['<PAD>'],
                                     max_len=70, 
                                     shuffle=True))

### Get the Predictions from the model

In [68]:
# Run the trained model on the whole test batch; per the shapes printed in
# the next cell the result is laid out as (examples, positions, tags).
y_pred_tensor = model(X_test)
y_pred = np.argmax(y_pred_tensor, axis=2) # Choose the tag having the highest probability.

In [69]:
print('Shape of prediction tensor: ',y_pred_tensor.shape)
print('Shape of prediction labels: ', y_pred.shape)
print('Shape of actual targets: ',y_test.shape)

Shape of prediction tensor:  (7194, 70, 17)
Shape of prediction labels:  (7194, 70)
Shape of actual targets:  (7194, 70)


In [70]:
print('Prediction output for the first test input: ', (y_pred[0]))

Prediction output for the first test input:  [ 0  7  0  1  4  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  6  6  6  6  6  6  6  6  6  6  6  6 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10]


In [71]:
print('Actual output for the first test input: ', (y_test[0]))

Actual output for the first test input:  [    0     7     0     3    10     0     0     0     0     0     0     0
     0     0     0     0     7     0     0     0     0     0     0     0
     0     0 35180 35180 35180 35180 35180 35180 35180 35180 35180 35180
 35180 35180 35180 35180 35180 35180 35180 35180 35180 35180 35180 35180
 35180 35180 35180 35180 35180 35180 35180 35180 35180 35180 35180 35180
 35180 35180 35180 35180 35180 35180 35180 35180 35180 35180]


In [72]:
# Get accuracy score with and without the use of masking.
def evaluate_prediction(y_pred, y_truth, pad=None):
    """Compute tag accuracy with and without the padded positions.

    Args:
        y_pred: array of predicted tag ids, same shape as `y_truth`.
        y_truth: array of true tag ids in which padded positions hold `pad`.
        pad: id marking padded positions in `y_truth`. When omitted, the
            '<PAD>' id is looked up in the module-level `vocab_dict` at call
            time, so a rebuilt dictionary is picked up automatically.

    Returns:
        Tuple (pad_acc, non_pad_acc):
            pad_acc: accuracy over the non-padded positions only (padding
                masked out of both the matches and the position count).
            non_pad_acc: accuracy over every position, padding included.
    """
    if pad is None:
        pad = vocab_dict['<PAD>']

    y_pred = np.asarray(y_pred)
    y_truth = np.asarray(y_truth)

    matches = y_pred == y_truth
    mask = y_truth != pad

    # Unmasked score: every position counts, including the padding.
    non_pad_acc = np.sum(matches) / y_truth.size

    # Masked score: the mask is applied to the matches as well as to the
    # denominator, so a prediction that happens to equal the pad id can never
    # be counted as correct.
    pad_acc = np.sum(matches & mask) / np.sum(mask)

    return pad_acc, non_pad_acc

In [73]:
# pad_acc is computed over the non-padded label positions only, while
# non_pad_acc divides by every position, padding included.
pad_acc , non_pad_acc = evaluate_prediction(y_pred, y_test)
print(f"Accuracy without masking the pad sequences: {non_pad_acc*100:.2f}%")
print(f"Accuracy with masking the pad sequences: {pad_acc*100:.2f}%")

Accuracy without masking the pad sequences: 29.73%
Accuracy with masking the pad sequences: 95.84%


## Make Predictions

In [74]:
def get_NER(sentence,
            vocab_dict=vocab_dict,
            tag_dict=tag_dict,
            model=model):
    """Print the NER tag predicted for every whitespace-separated token.

    Note that the default `vocab_dict`, `tag_dict` and `model` are bound when
    this function is defined; re-run the definition (or pass them explicitly)
    after rebuilding any of them.

    Args:
        sentence: raw text; it is stripped and split on whitespace.
        vocab_dict: word -> id mapping; words that are missing fall back to
            the id stored under 'UNK'.
        tag_dict: tag -> id mapping, inverted here to decode predictions.
        model: callable taking an id array of shape (1, n_tokens) and
            returning scores whose last axis indexes the tags.

    Returns:
        None. The tokens and their predicted tags are printed.
    """
    sentence_tokenized = sentence.strip().split()

    # Nothing to tag: skip the model call, which would otherwise receive an
    # empty float array instead of integer ids.
    if not sentence_tokenized:
        print("Sentence: ", sentence_tokenized)
        print("NER Extracted: ", [])
        return

    unk_id = vocab_dict.get('UNK')
    token_ids = [vocab_dict.get(word, unk_id) for word in sentence_tokenized]

    # Make it a batch of size 1 so it can be fed to the model.
    sentence_tensor = np.array([token_ids])

    y_pred_tensor = np.array(model(sentence_tensor))

    # Take the single batch row instead of squeezing: np.squeeze would turn a
    # one-token prediction into a 0-d array, which cannot be iterated.
    y_pred = np.argmax(y_pred_tensor, axis=2)[0]

    # Index -> tag lookup for decoding the predicted ids.
    idx2tag = {idx: tag for tag, idx in tag_dict.items()}
    ner_output = [idx2tag.get(int(tag_idx)) for tag_idx in y_pred]

    print("Sentence: ", sentence_tokenized)
    print("NER Extracted: ", ner_output)


In [75]:
get_NER("Microsoft was banned from entering Germany")

Sentence:  ['Microsoft', 'was', 'banned', 'from', 'entering', 'Germany']
NER Extracted:  ['B-org', 'O', 'O', 'O', 'O', 'B-geo']
