In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
# Load the token-level NER dataset from a CSV file stored next to the notebook.
# NOTE(review): 'unicode_escape' presumably works around non-UTF-8 bytes in the file — confirm the intended encoding.
data = pd.read_csv('./data/ner_dataset.csv', encoding= 'unicode_escape')

In [2]:
# Report the installed TensorFlow version and the GPU devices TensorFlow can see.
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))


2.10.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
import os
# NOTE(review): presumably stops TensorFlow from raising on segment-reduction ops
# once op determinism is enabled later in the notebook — confirm against the TF docs.
os.environ['TF_DISABLE_SEGMENT_REDUCTION_OP_DETERMINISM_EXCEPTIONS'] = '1'

In [4]:
# Preview the first rows to check the column layout.
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In the data, each row holds a single word (the `Word` column), which will be our input feature X, while the `Tag` column on the right holds the label Y that we want to predict.

Data Preparation for Neural Networks
I will train a neural network for the task of Named Entity Recognition (NER), so the data must first be reshaped into a form that can easily be fed to a neural network. I will start this step by extracting the mappings (word ↔ index and tag ↔ index) that are required to train the network:

In [5]:
from itertools import chain
def get_dict_map(data, token_or_tag):
    """Build lookup tables between vocabulary entries and integer indices.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'Word' column and a 'Tag' column.
    token_or_tag : str
        'token' builds the mapping from the 'Word' column; any other value
        builds it from the 'Tag' column.

    Returns
    -------
    tuple[dict, dict]
        ``(tok2idx, idx2tok)``: entry -> index and index -> entry.

    Notes
    -----
    Indices follow the order of first appearance in the column, so the mapping
    is identical on every run. Iterating a ``set`` of strings instead yields an
    order that can change between interpreter sessions (string hash
    randomisation), which would silently renumber every word and tag.
    """
    column = 'Word' if token_or_tag == 'token' else 'Tag'

    # dict.fromkeys de-duplicates while preserving first-seen order.
    vocab = list(dict.fromkeys(data[column].to_list()))

    idx2tok = dict(enumerate(vocab))
    tok2idx = {tok: idx for idx, tok in idx2tok.items()}
    return tok2idx, idx2tok
# Build the word <-> index and tag <-> index lookup tables from the full dataset.
token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

Next, transform the columns in the data to extract the sequential data (one list of word indices and tag indices per sentence) for our neural network:

In [6]:
# Encode every word and tag as its integer index (adds two columns to `data`).
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)

# Forward-fill missing values so that every row carries a 'Sentence #' label,
# not only the first word of each sentence.
data_fillna = data.ffill()

# Collapse the word-level rows into one row per sentence, where each selected
# column becomes a list of that sentence's values.
sentence_columns = ['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx']
grouped_by_sentence = data_fillna.groupby(['Sentence #'], as_index=False)
data_group = grouped_by_sentence[sentence_columns].agg(list)

Now I will split the data into training, validation and test sets. I will create a function for this because the LSTM layers only accept sequences of the same length, so every sentence (stored as a list of integers) must first be padded to the same length:

In [7]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def get_pad_train_test_val(data_group, data):
    """Pad the per-sentence index sequences and split them into train/val/test sets.

    Parameters
    ----------
    data_group : pandas.DataFrame
        One row per sentence, with list-valued 'Word_idx' and 'Tag_idx' columns.
    data : pandas.DataFrame
        Word-level frame; its 'Word' column determines the vocabulary size.

    Returns
    -------
    tuple
        ``(train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags)``.
        Token arrays are int32 and padded to the longest sentence; tag arrays
        are the one-hot encoding of the padded tag indices.

    Notes
    -----
    Reads the module-level ``tag2idx`` mapping for the 'O' padding tag and the
    number of tag classes.
    """
    # Vocabulary size. Word indices run from 0 to n_token - 1, so n_token is the
    # first index that does not belong to any real word.
    n_token = len(set(data['Word'].to_list()))

    # Pad tokens (X var) with the dedicated index n_token. Padding with
    # n_token - 1 would reuse the index of a real word.
    tokens = data_group['Word_idx'].to_list()
    maxlen = max(len(s) for s in tokens)
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value=n_token)

    # Pad tags (y var) with the 'O' (outside) tag and convert to one-hot encoding
    tags = data_group['Tag_idx'].to_list()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value=tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = to_categorical(pad_tags, num_classes=n_tags)

    # Hold out 10% for testing, then split the remainder 75/25 into train/validation
    tokens_, test_tokens, tags_, test_tags = train_test_split(
        pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2026
    )
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(
        tokens_, tags_, test_size=0.25, train_size=0.75, random_state=2026
    )

    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tags length:', len(train_tags),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags', len(val_tags),
    )

    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags
    
# Pad the sentences and produce the train / validation / test arrays used below.
train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = get_pad_train_test_val(data_group, data)

train_tokens length: 32372 
train_tokens length: 32372 
test_tokens length: 4796 
test_tags: 4796 
val_tokens: 10791 
val_tags 10791


In [8]:
# Convert to consistent types
# Training split: features as int32 indices, one-hot labels as float32.
X_train_final = np.array(train_tokens, dtype='int32')
y_train_final = np.array(train_tags, dtype='float32')

# Wrap the arrays in a tf.data pipeline: shuffle, batch, and prefetch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train_final, y_train_final))
    .shuffle(buffer_size=1024)
    .batch(64)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation split: same dtypes and batch size, but no shuffling.
X_val_final = np.array(val_tokens, dtype='int32')
y_val_final = np.array(val_tags, dtype='float32')
val_slices = tf.data.Dataset.from_tensor_slices((X_val_final, y_val_final))
val_dataset = val_slices.batch(64)

Training Neural Network for Named Entity Recognition (NER)

In [9]:
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model

# Fix the random seed and ask TensorFlow for deterministic op implementations
# so that training runs are repeatable.
# NOTE(review): this runs after the tf.data pipelines were created — confirm the
# shuffle order is covered by this seed.
tf.keras.utils.set_random_seed(1)
tf.config.experimental.enable_op_determinism()

The cell below defines the dimensions the model needs: the vocabulary size for the embedding layer, the embedding size, the maximum sentence length, and the number of tags:

In [10]:
# Model dimensions derived from the data.
vocab_size = len(set(data['Word'].to_list()))
input_dim = vocab_size + 1   # distinct words plus one extra slot
output_dim = 64              # size of each word embedding vector
input_length = max(len(sentence) for sentence in data_group['Word_idx'].to_list())  # longest sentence
n_tags = len(tag2idx)        # number of output classes
n_words = len(token2idx) + 1 # embedding vocabulary size (mapping size plus one extra slot)


Now I will create a helper function that builds and compiles the neural network model for Named Entity Recognition (NER) and prints a summary of every layer:

In [11]:
model = None  # bound to the most recently built network by get_bilstm_lstm_model()
def get_bilstm_lstm_model():
    """Build, compile and summarise a fresh stacked bidirectional-LSTM tagger.

    Architecture: Embedding -> Bidirectional LSTM (128 units) ->
    Bidirectional LSTM (128 units) -> Dropout(0.3) -> Dense softmax over the tags.

    Reads the module-level ``n_words``, ``output_dim``, ``input_length`` and
    ``n_tags`` values.

    Returns
    -------
    tensorflow.keras.Sequential
        The compiled model. The module-level name ``model`` is rebound to the
        same object so that code referring to ``model`` keeps working.
    """
    global model

    # Start from a new Sequential on every call. Appending to a shared
    # module-level model would stack duplicate layers each time this runs.
    network = Sequential()

    # Embedding layer; the sequence length comes from the data instead of a
    # hard-coded constant.
    network.add(Embedding(input_dim=n_words, output_dim=output_dim, input_length=input_length))

    # First bidirectional LSTM (forward and backward outputs concatenated)
    network.add(Bidirectional(LSTM(units=128, return_sequences=True, dropout=0.2), merge_mode='concat'))

    # Second bidirectional LSTM
    network.add(Bidirectional(LSTM(units=128, return_sequences=True, dropout=0.2)))

    # Dropout layer
    network.add(Dropout(0.3))

    # Per-token softmax over the tag classes
    network.add(Dense(n_tags, activation='softmax'))

    # Compile model
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    network.summary()

    model = network
    return network

In [12]:
# Build the network, then train it through the handle the builder returns
# (the same object as the module-level `model`).
model_bilstm_lstm = get_bilstm_lstm_model()

history = model_bilstm_lstm.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3,
)
    

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 104, 64)           2251456   
                                                                 
 bidirectional (Bidirectiona  (None, 104, 256)         197632    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 104, 256)         394240    
 nal)                                                            
                                                                 
 dropout (Dropout)           (None, 104, 256)          0         
                                                                 
 dense (Dense)               (None, 104, 17)           4369      
                                                                 
Total params: 2,847,697
Trainable params: 2,847,697
Non-

In [21]:
import spacy
from spacy import displacy
# Load a spaCy pipeline so that `nlp` exists on a fresh kernel; without this the
# cell raises NameError.
# NOTE(review): 'en_core_web_sm' is an assumption — substitute the pipeline that
# was originally used if it differs. It must be installed beforehand.
nlp = spacy.load('en_core_web_sm')

# Turn the line breaks into sentence boundaries before running the pipeline.
clean_text = 'Hi, My name is Mavi \n I am from India \n I want to work with Google \n Steve Jobs is My Inspiration'.replace('\n', '. ')
text = nlp(clean_text)

# Highlight the recognised entities inline in the notebook.
displacy.render(text, style='ent', jupyter=True)

In [14]:
from tensorflow import keras
import pickle

In [17]:
# Create the output directory first so the saves below do not fail when it is missing.
os.makedirs("artifacts", exist_ok=True)

# Persist the trained network.
model.save("artifacts/model.h5")

# Persist the lookup tables needed to encode inputs and decode predictions.
# The `with` blocks make sure each file is flushed and closed.
with open("artifacts/word2idx.pkl", "wb") as word2idx_file:
    pickle.dump(token2idx, word2idx_file)
with open("artifacts/idx2tag.pkl", "wb") as idx2tag_file:
    pickle.dump(idx2tag, idx2tag_file)