## Imports

In [1]:
%matplotlib inline
import collections
import math
import numpy as np
import pandas as pd
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from itertools import chain
from six.moves import range
from six.moves.urllib.request import urlretrieve

seed = 54321

# %env TF_FORCE_GPU_ALLOW_GROWTH=true

## Read Data 

In [2]:
def read_data(filename):
    '''
    Read data from a file with given filename
    Returns a list of sentences (each sentence a string), 
    and list of ner labels for each string
    '''

    # print("Reading data ...")
    # master lists - Holds sentences (list of tokens), ner_labels (for each token an NER label)
    sentences, ner_labels = [], [] 
    
    # Open the file
    with open(filename,'r',encoding='latin-1') as f:        
        # Read each line
        is_sos = True # We record at each line if we are seeing the beginning of a sentence
        
        # Tokens and labels of a single sentence, flushed when encountered a new one
        sentence_tokens = []
        sentence_labels = []
        i = 0
        for row in f:
            # If we are seeing an empty line or -DOCSTART- that's a new line
            if len(row.strip()) == 0 or row.split(' ')[0] == '-DOCSTART-':
                is_sos = False
            # Otherwise keep capturing tokens and labels
            else:
                is_sos = True
                token, _, _, ner_label = row.split(' ')
                sentence_tokens.append(token.strip())
                sentence_labels.append(ner_label.strip())
            
            # When we reach the end / or reach the beginning of next
            # add the data to the master lists, flush the temporary one
            if not is_sos and len(sentence_tokens)>0:
                sentences.append(' '.join(sentence_tokens))
                ner_labels.append(sentence_labels)
                sentence_tokens, sentence_labels = [], []
    
    # print('\tDone')
    return sentences, ner_labels

In [3]:
# Train data
train_filepath = 'data\conllpp_train.txt'
train_sentences, train_labels = read_data(train_filepath) 
# Validation data
dev_filepath = 'data\conllpp_dev.txt'
valid_sentences, valid_labels = read_data(dev_filepath) 
# Test data
test_filepath = 'data\conllpp_test.txt'
test_sentences, test_labels = read_data(test_filepath) 

# Print some stats
print(f"Train size: {len(train_labels)}")
print(f"Valid size: {len(valid_labels)}")
print(f"Test size: {len(test_labels)}")

# Print some data
print('\nSample data\n')
for v_sent, v_labels in zip(train_sentences[:5], train_labels[:5]):
    print(f"Sentence: {v_sent}")
    print(f"Labels: {v_labels}\n")

Train size: 14041
Valid size: 3250
Test size: 3452

Sample data

Sentence: EU rejects German call to boycott British lamb .
Labels: ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

Sentence: Peter Blackburn
Labels: ['B-PER', 'I-PER']

Sentence: BRUSSELS 1996-08-22
Labels: ['B-LOC', 'O']

Sentence: The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep .
Labels: ['O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Sentence: Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer .
Labels: ['B-LOC', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 

## Data Processing

**Padding Labels**

In [4]:
def get_lable_id_map(train_labels):
    # Get the unique list of labels
    unique_train_labels = pd.Series(chain(*train_labels)).unique()
    # Create a class_label --> class_ID mapping
    label_map = dict(zip(unique_train_labels, np.arange(unique_train_labels.shape[0])))
    
    print("label_map: {}".format(label_map))
    return label_map

labels_map = get_lable_id_map(train_labels)

label_map: {'B-ORG': 0, 'O': 1, 'B-MISC': 2, 'B-PER': 3, 'I-PER': 4, 'B-LOC': 5, 'I-ORG': 6, 'I-MISC': 7, 'I-LOC': 8}


In [5]:
def get_padded_int_labels(labels:list[list[str]], labels_map:dict[str, int], 
                          max_seq_length:int, return_mask:bool = True):
    """
    This function takes sequences of class labels and return sequences of padded 
    class IDs, with the option to return a mask indicating padded labels.
    
    This function takes the following arguments:
        * labels (List[List[str]]) – A list of lists of strings, where each string is 
                                     a class label of the string type
        
        * labels_map (Dict[str, int]) – A dictionary mapping a string label to a 
                                        class ID of type integer
        
        * max_seq_length (int) – A maximum length to be padded to (longer sequences 
                                 will be truncated at this length)
        
        * return_mask (bool) – Whether to return the mask showing padded labels or not
    """
    
    # Convert string labels to integers
    int_labels = [[labels_map[x] for x in one_seq] for one_seq in labels]
    
    # Pad sequences
    if return_mask:
        # If we return mask, we first pad with a special value (-1) and
        # use that to create the mask and later replace -1 with 'O'
        padded_labels = np.array(
                           tf.keras.preprocessing.sequence.pad_sequences(
                                int_labels, maxlen=max_seq_length, padding='post',
                                truncating='post', value=-1
                           )
                        )
        # mask filter
        mask_filter = (padded_labels != -1)
        
        # replace -1 with 'O' s ID
        padded_labels[~mask_filter] = labels_map['O']
        
        return padded_labels, mask_filter.astype('int')
    else:
        # padded_labels = np.array(ner_pad_sequence_func(int_labels, 
        #                                                value=labels_map['O'])
        #                         )
        # return padded_labels
        return

In [6]:
max_seq_length = 40

# Convert string labels to integers for all train/validation/test data
# Pad train/validation/test data
padded_train_labels, train_mask = get_padded_int_labels(train_labels, labels_map, 
                                                        max_seq_length, return_mask=True)

padded_valid_labels, valid_mask = get_padded_int_labels(valid_labels, labels_map, 
                                                        max_seq_length, return_mask=True)

padded_test_labels, test_mask  = get_padded_int_labels(test_labels, labels_map, 
                                                       max_seq_length, return_mask=True)

print(padded_train_labels.shape, train_mask.shape)
print("\nLable Map:", labels_map)
print("\nPadded Label:",padded_train_labels[0])
print("Mask:\t", train_mask[0])

(14041, 40) (14041, 40)

Lable Map: {'B-ORG': 0, 'O': 1, 'B-MISC': 2, 'B-PER': 3, 'I-PER': 4, 'B-LOC': 5, 'I-ORG': 6, 'I-MISC': 7, 'I-LOC': 8}

Padded Label: [0 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1]
Mask:	 [1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]


## Define Hyperparameters

In [7]:
# The maximum length of sequences
max_seq_length = 40

# Size of token embeddings
embedding_size = 64

# Number of hidden units in the RNN layer
rnn_hidden_size = 64

# Number of output nodes in the last layer
n_classes = len(labels_map)

# Number of samples in a batch
batch_size = 64

# Number of epochs to train
epochs = 3

## NER with Character & Token Embeddings

- Here we will focus our discussion on a technique that provides the model embeddings at multiple scales, enabling it to understand language better. That is, instead of relying only on **token embeddings**, we can also use **character embeddings**. Then a token embedding is generated with the character embeddings by shifting a convolutional window over the characters in the token.

### Using Convolution to generate Token mbeddings

A combination of character embeddings and a convolutional kernel can be used to generate token embeddings. The method will be as follows:
- Pad each token (e.g. word) to a predefined length
- Look up the character embeddings for the characters in the token from an embedding layer
- Shift a convolutional kernel over the sequence of character embeddings to generate a token embedding

<div align='center'>
    <img src='images/char_token_embedding.png'/>
</div>

### Statistics about token lenghts (for char embeddings)

The very first thing we need to do is analyze the statistics around how many characters there are for a token in our corpus. Similar to how we did it previously

In [8]:
vocab_ser = pd.Series(pd.Series(train_sentences).str.split().explode().unique())
vocab_ser.str.len().describe(percentiles=[0.05, 0.95])

count    23623.000000
mean         6.832705
std          2.749288
min          1.000000
5%           3.000000
50%          7.000000
95%         12.000000
max         61.000000
dtype: float64

We can see around $95\%$ of our words have less than or equal to $12$ characters. 

Next, we will write a function to pad shorter tokens:

In [9]:
def prepare_corpus_for_char_embeddings(tokenized_sentences, max_seq_length):
    """
    Pads each sequence to a maximum length
    :args:
        - tokenized_sentences: The function takes a set of tokenized sentences
                               (i.e. each sentence as a list of tokens, not a string)
        
        - max_seq_length: maximum sequence length
        
    This function would then do the following:
        - For longer sentences, only return the max_seq_length tokens
        - For shorter sentences, append ‘‘ as a token until max_seq_length is reached
    """
    proc_sentences = []
    for token in tokenized_sentences:
        if len(token) >= max_seq_length:
            proc_sentences.append([[t] for t in token[:max_seq_length]])
        else:
            proc_sentences.append([[t] for t in token+['']*(max_seq_length-len(token))])
    
    return proc_sentences

# Define sample data
data = ['aaaa bb c', 'd eee', 'f gg hhh iiii jjjjj']

# Pad sequences
tokenized_sentences= [d.split() for d in data]
padded_sentences = prepare_corpus_for_char_embeddings(tokenized_sentences, 3)

print(f"tokenized_sentences:\n{tokenized_sentences}")
print(f"Padded sequence:\n{padded_sentences}")

tokenized_sentences:
[['aaaa', 'bb', 'c'], ['d', 'eee'], ['f', 'gg', 'hhh', 'iiii', 'jjjjj']]
Padded sequence:
[[['aaaa'], ['bb'], ['c']], [['d'], ['eee'], ['']], [['f'], ['gg'], ['hhh']]]


### Testing `TextVectorization` for char level

We define a new `TextVectorization` layer that can cope with the changes we introduced to the data. Instead of tokenizing on the token level, the new `TextVectorization` layer must tokenize on the character level.

For this we need to make a few changes. We will again write a function to contain this vectorization layer:

In [10]:
from tensorflow.keras.layers import TextVectorization

def split_char(token):
    """ Instead of splitting word by word, split each char"""
    return tf.strings.bytes_split(token)


# Define a vectorization layer that splits chars
vectorization_layer = TextVectorization(
        standardize=None,      
        split=split_char,
)

# Define sample data
data = ['aaaa bb c', 'd eee', 'f gg hhh iiii jjjjj']

# Pad sequences
tokenized_sentences= [d.split() for d in data]
padded_sentences = prepare_corpus_for_char_embeddings(tokenized_sentences, 3)

print(f"tokenized_sentences:\n{tokenized_sentences}\n")
print(f"Padded sequence:\n{padded_sentences}\n")

# Fit it on a corpus of data
vectorization_layer.adapt(padded_sentences)

# Print data
print(f"Vectorized output:\n{vectorization_layer(padded_sentences)}\n")
print(f"Vocabulary: {vectorization_layer.get_vocabulary()}")

tokenized_sentences:
[['aaaa', 'bb', 'c'], ['d', 'eee'], ['f', 'gg', 'hhh', 'iiii', 'jjjjj']]

Padded sequence:
[[['aaaa'], ['bb'], ['c']], [['d'], ['eee'], ['']], [['f'], ['gg'], ['hhh']]]

Vectorized output:
[[[2 2 2 2]
  [6 6 0 0]
  [9 0 0 0]]

 [[8 0 0 0]
  [4 4 4 0]
  [0 0 0 0]]

 [[7 0 0 0]
  [5 5 0 0]
  [3 3 3 0]]]

Vocabulary: ['', '[UNK]', 'a', 'h', 'e', 'g', 'b', 'f', 'd', 'c']


In [11]:
print(split_char(['abcd']))
print(split_char(tf.constant(['Amit'])))

<tf.RaggedTensor [[b'a', b'b', b'c', b'd']]>
<tf.RaggedTensor [[b'A', b'm', b'i', b't']]>


## Implementing the new NER model

### Defining an advance RNN model

- Token embeddings + Char embeddings
- Bidirectional RNN

### Define hyperparameters

In [12]:
max_seq_length = 40
max_token_length = 12

### Define the i/p layer

In [13]:
import tensorflow.keras.backend as K
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization

K.clear_session()

# Input layer(tokens)
word_input = layers.Input(shape=(1,), dtype=tf.string)

# The inputs to this layer would be a batch of sentences, where each sentence is a string. 

### Defining the token-based TextVectorization layer & token_embedding

In [14]:
def get_fitted_token_vectorization_layer(corpus, max_seq_length, vocabulary_size=None):
    """ Fit a TextVectorization layer on given data """
    
    # Define the layer
    vectorization_layer = TextVectorization(max_tokens=vocabulary_size, standardize=None,
                                           output_sequence_length=max_seq_length)
    
    # Fit on the text corpus
    vectorization_layer.adapt(corpus)
    
    # Get the vocabulary_size
    n_vocab = len(vectorization_layer.get_vocabulary())
    
    return vectorization_layer, n_vocab

# Text vectorize layer (token)
token_vectorize_layer, n_token_vocab = get_fitted_token_vectorization_layer(train_sentences, max_seq_length)

# Vectorized output (each word mapped to an int ID)
token_vectorized_out = token_vectorize_layer(word_input)

# Look up embeddings for the returned IDs
token_embedding_out = layers.Embedding(input_dim=n_token_vocab, 
                                       output_dim=64, 
                                       mask_zero=True)(token_vectorized_out)

### Defining the character-based TextVectorization layer & character_embedding

In [15]:
# character-based TextVectorization layer; 
# same as implemented in "Testing TextVectorization for char level"

def get_fitted_char_vectorization_layer(corpus, max_seq_length, max_token_length, 
                                        vocabulary_size=None):
    """
    Fit a TextVectorization layer on given data
    """
    def _split_char(token):
        """
        _split_char() that takes a token (as a tf.Tensor) and returns a char-tokenized tensor
        
        _funcName() means it is a private function.
        Link: https://www.datacamp.com/tutorial/role-underscore-python
        """
        return tf.strings.bytes_split(token)
    
    # Define a text vectorization layer
    vectorization_layer = tf.keras.layers.TextVectorization(standardize=None,
                                                            split=_split_char,
                                                            output_sequence_length=max_token_length)
    # Tokenize the sentences and pad it
    tokenized_sentences = [sent.split() for sent in corpus]
    padded_tokenized_sentences = prepare_corpus_for_char_embeddings(tokenized_sentences,
                                                                    max_seq_length)
    
    # Fit it on a corpus of data
    vectorization_layer.adapt(padded_tokenized_sentences)
    
    # Get the vocab size
    n_vocab = len(vectorization_layer.get_vocabulary())
    
    return vectorization_layer, n_vocab


char_vectorize_layer, n_char_vocab = get_fitted_char_vectorization_layer(train_sentences, 
                                                                         max_seq_length, 
                                                                         max_token_length)

Two key differences between the previous token-based text vectorizer and this char-based text vectorizer are in the input dimensions and the final output dimensions:

- **Token-based vectorizer** – Takes in a `[batch_size, 1]`-sized input and produces a `[batch_size, sequence_length]`-sized output.

- **Char-based vectorizer** – Takes in a `[batch_size, sequence_length, 1]`-sized input and produces a `[batch_size, sequence_length, token_length]`-sized output.

Now we are equipped with the ingredients to implement our new and improved NER classifier.

### Processing the inputs for the `char_vectorize_layer`

To use the same `word_input` for the character vectorization layer, we need to introduce some interim pre-processing to get the i/p to the correct format as intentded for this layer as the input shape needs to be `[batch_size, sequence_length, 1]`-sized tensor.

This means the sentences need to be tokenized to a list of tokens.

In [16]:
tokenized_word_input = layers.Lambda(lambda x: tf.strings.split(x).to_tensor(default_value='', 
                                                                             shape=[None, max_seq_length, 1])
                       )(word_input)

char_vectorized_out = char_vectorize_layer(tokenized_word_input)

<div align='center'>
    <b>Explantion of above, as given in book</b>
    <img src='images/lambda.png'/>
</div>

In [17]:
# Produces a [batch size, seq length, token_length, emb size]
char_embedding_layer = layers.Embedding(input_dim=n_char_vocab, 
                                        output_dim=32, 
                                        mask_zero=True)(char_vectorized_out)

This layer produces a `[batch_size, sequence_length, token_length, 32]`-sized tensor, with a char embedding vector for each character in the tensor. 

Now it’s time to perform convolution on top of this output.

### Performing convolution on the character embeddings

In [18]:
# A 1D convolutional layer that will generate token embeddings by shifting
# a convolutional kernel over the sequence of chars in each token (padded)
char_token_output = layers.Conv1D(filters=1, kernel_size=5, strides=1, padding='same',
                                  activation='relu')(char_embedding_layer)

<div align='center'>
    <b>Explantion of above, as given in book</b>
    <img src='images/conv1d.png'/>
</div>

In [19]:
# There is an additional dimension of size 1 (out channel dimension) that
# we need to remove
char_token_output = layers.Lambda(lambda x: x[:, :, :, 0])(char_token_output)

To get the final output embedding (i.e. a combination of token- and character-based embeddings), we concatenate the two embeddings on the last axis. 

This would result in a 48 element-long vector (i.e. 32 element-long token embedding + 12 element-long char-based token embedding):

In [20]:
# Concatenate the token and char embeddings
concat_embedding_out = layers.Concatenate()([token_embedding_out, char_token_output])

### Defining a simple bidirectional RNN layer

In [21]:
# Define a simple bidirectional RNN layer, it returns an output at each position
rnn_layer_1 = layers.Bidirectional(
                     layers.SimpleRNN(units=64, activation='tanh', 
                                      return_sequences=True, 
                                      #kernel_regularizer='l2', recurrent_regularizer='l2',
                                      #dropout=0.05, 
                                      recurrent_dropout=0.2,
                                     )
              )

rnn_out_1 = rnn_layer_1(concat_embedding_out)

seting `return_sequences=True`, which means it will produce an output at each time step, as opposed to only at the last time step. 

Next, we define the final Dense layer, which has `n_classes` output nodes (i.e. 9) and a `softmax` activation:

In [22]:
# Defines the final prediction layer
dense_layer = layers.Dense(n_classes, activation='softmax')
dense_out = dense_layer(rnn_out_1)

In [23]:
# defining metric to handle class-imbalance

def macro_accuracy(y_true, y_pred):
    
    #  [batch size, time] => [batch size * time]
    y_true = tf.cast(tf.reshape(y_true, [-1]), 'int32')
    
    # [batch size, sequence length, n_classes] => [batch size * time]
    y_pred = tf.cast(tf.reshape(tf.argmax(y_pred, axis=-1), [-1]), 'int32')
    
    sorted_y_true = tf.sort(y_true)
    sorted_inds = tf.argsort(y_true)
    
    sorted_y_pred = tf.gather(y_pred, sorted_inds)
    
    sorted_correct = tf.cast(tf.math.equal(sorted_y_true, sorted_y_pred), 'int32')
    
    # We are adding one to make sure there are no division by zero
    correct_for_each_label = tf.cast(tf.math.segment_sum(sorted_correct, sorted_y_true), 'float32') + 1
    all_for_each_label = tf.cast(tf.math.segment_sum(tf.ones_like(sorted_y_true), sorted_y_true), 'float32') + 1
    
    mean_accuracy = tf.reduce_mean(correct_for_each_label/all_for_each_label)
    
    return mean_accuracy


# mean_accuracy_metric = tf.keras.metrics.MeanMetricWrapper(fn=macro_accuracy,
#                                                           name='macro_accuracy')

In [24]:
# Define the model
char_token_embedding_rnn = tf.keras.Model(inputs=word_input, outputs=dense_out)

# Define a macro accuracy measure
mean_accuracy_metric = tf.keras.metrics.MeanMetricWrapper(fn=macro_accuracy, 
                                                          name='macro_accuracy')

# Compile the model with a loss optimizer and metrics
char_token_embedding_rnn.compile(loss='sparse_categorical_crossentropy', 
                                 optimizer='adam', metrics=[mean_accuracy_metric])

# Summary of the model
char_token_embedding_rnn.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 lambda (Lambda)                (None, 40, 1)        0           ['input_1[0][0]']                
                                                                                                  
 text_vectorization_1 (TextVect  (None, 40, 12)      0           ['lambda[0][0]']                 
 orization)                                                                                       
                                                                                                  
 embedding_1 (Embedding)        (None, 40, 12, 32)   2752        ['text_vectorization_1[0][0]'

This is our final model. The key difference in this model compared to the previous solution is that it used two different embedding types. A standard token-based embedding layer and a complex, char-based embedding that was leveraged to generate token embeddings using the convolution operation. 

Now let’s train the model.

### Model training and evaluation

In [25]:
def get_label_id_map(train_labels):
    # Get the unique list of labels
    unique_train_labels = pd.Series(chain(*train_labels)).unique()
    # Create a class label -> class ID mapping
    labels_map = dict(zip(unique_train_labels, np.arange(unique_train_labels.shape[0])))
    print(f"labels_map: {labels_map}")
    return labels_map

def get_class_weights(train_labels):
    
    label_count_ser = pd.Series(chain(*train_labels)).value_counts()
    label_count_ser = label_count_ser.sum()/label_count_ser
    label_count_ser /= label_count_ser.max()
    
    label_id_map = get_label_id_map(train_labels)
    label_count_ser.index = label_count_ser.index.map(label_id_map)
    return label_count_ser.to_dict()

def get_sample_weights_from_class_weights(labels, class_weights):
    """ From the class weights generate sample weights """
    return np.vectorize(class_weights.get)(labels)

train_class_weights = get_class_weights(train_labels)
print(f"\nClass weights: {train_class_weights}")

labels_map: {'B-ORG': 0, 'O': 1, 'B-MISC': 2, 'B-PER': 3, 'I-PER': 4, 'B-LOC': 5, 'I-ORG': 6, 'I-MISC': 7, 'I-LOC': 8}

Class weights: {1: 0.006811025015037328, 5: 0.16176470588235295, 3: 0.17500000000000002, 0: 0.18272425249169436, 4: 0.25507950530035334, 6: 0.31182505399568033, 2: 0.33595113438045376, 8: 0.9982713915298186, 7: 1.0}


In [26]:
# Make train_sequences an array
train_sentences = np.array(train_sentences)
# Get sample weights (we cannot use class_weight with TextVectorization layer)
train_sample_weights = get_sample_weights_from_class_weights(padded_train_labels, train_class_weights)

# Training the model
history = char_token_embedding_rnn.fit(train_sentences, padded_train_labels,
                                       sample_weight=train_sample_weights,
                                       batch_size=64,
                                       epochs=3, 
                                       validation_data=(np.array(valid_sentences), padded_valid_labels))

Epoch 1/3
Epoch 2/3
Epoch 3/3


### Evaluate the model on test data

Improvement over using simple embedding and SimpleRNN

In [27]:
char_token_embedding_rnn.evaluate(np.array(test_sentences), padded_test_labels)



[0.11309965699911118, 0.806908369064331]

### Visually analysing outputs

In [28]:
n_samples = 5
visual_test_sentences = test_sentences[:n_samples]
visual_test_labels = padded_test_labels[:n_samples]

visual_test_predictions = char_token_embedding_rnn.predict(np.array(visual_test_sentences))
visual_test_pred_labels = np.argmax(visual_test_predictions, axis=-1)

rev_labels_map = dict(zip(labels_map.values(), labels_map.keys()))

for i, (sentence, sent_labels, sent_preds) in enumerate(zip(visual_test_sentences, visual_test_labels, visual_test_pred_labels)):    
    n_tokens = len(sentence.split())
    print("Sample:\t", " ".join(sentence.split()))
    print("True:\t", " ".join([rev_labels_map[i] for i in sent_labels[:n_tokens]]))
    print("Pred:\t", " ".join([rev_labels_map[i] for i in sent_preds[:n_tokens]]), '\n')

Sample:	 SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .
True:	 O O B-LOC O O O O B-LOC O O O O
Pred:	 O O O O O O O B-LOC O O O O 

Sample:	 Nadim Ladki
True:	 B-PER I-PER
Pred:	 B-PER O 

Sample:	 AL-AIN , United Arab Emirates 1996-12-06
True:	 B-LOC O B-LOC I-LOC I-LOC O
Pred:	 B-LOC O B-LOC I-LOC I-LOC I-LOC 

Sample:	 Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday .
True:	 B-LOC O O O O O B-MISC I-MISC O O O O O O O B-LOC O O O O O O O O O
Pred:	 B-LOC O O O O O B-MISC I-MISC I-MISC O O B-ORG O O O B-LOC O O O O O O O O O 

Sample:	 But China saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers Uzbekistan .
True:	 O B-LOC O O O O O O O O O O O O O O O O O O O O O B-LOC O
Pred:	 O B-LOC O O O O O O O O O O O O O O O O O O O O O O O 



## Other improvements

- **More RNN layers** – Adding more stacked RNN layers. By adding more hidden RNN layers, we can allow the model to learn more refined latent representations, leading to better performance.

In [29]:
# rnn_layer_1 = layers.SimpleRNN(units=64, activation='tanh', 
#                                use_bias=True, return_sequences=True)

# rnn_out_1 = rnn_layer_1(concat_embedding_out)

# rnn_layer_2 = layers.SimpleRNN(units=32, activation='tanh', 
#                                use_bias=True, return_sequences=True)

# rnn_out_2 = rnn_layer_1(rnn_out_1)

- **Make the RNN layer bidirectional** – The RNN models we discussed so far are uni-directional, i.e. looks at the sequence of text from forward to backward. However a different variant known as **bi-directional RNNs looks at the sequence in both directions, i.e. forward to backward and backward to forward**. This leads to better language understanding in models and inevitably better performance.

In [30]:
# Already Implemented

- **Incorporate regularization techniques** – You can leverage L2 regularization and dropout techniques to avoid overfitting and improve generalization of the model.

- **Use early stopping and learning rate reduction to reduce overfitting** – During model training, use early stopping (i.e. training the model only until the validation accuracy is improving) and learning rate reduction (i.e. gradually reducing the learning rate over the epochs).

* **