In [1]:
import pandas as pd 
data = pd.read_csv('ner_dataset.csv' , encoding = 'unicode_escape')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [2]:
from itertools import chain 
def get_dict_map(data , token_or_tag):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    else:
        vocab = list(set(data['Tag'].to_list()))
        
        
    idx2tok = {idx:tok for idx,tok in enumerate(vocab)}
    tok2idx = {tok:idx for idx,tok in enumerate(vocab)}
    
    return tok2idx , idx2tok

token2idx , idx2token = get_dict_map(data, 'token')
tag2idx , idx2tag = get_dict_map(data ,'tag')

# Transform the column data to train for neural network

In [3]:
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)

data_fillna = data.fillna(method='ffill' , axis = 0)

#Group by and collect columns
data_group = data_fillna.groupby(
['Sentence #'],as_index=False
)['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))

  data_group = data_fillna.groupby(


In [4]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


def get_pad_train_test_val(data_group , data):
    
    #get max and token length 
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))
    
    #Pad token(X Var)
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s)for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen = maxlen , dtype = 'int32', padding='post' , value = n_token - 1)
    
    #Pad Tags (y var)  and convert it into one encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags , maxlen= maxlen , dtype = 'int32' , padding = 'post' ,value = tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i , num_classes=n_tags) for i in pad_tags]

    #split train test validation set
    token_,test_tokens, tag_ , test_tags =  train_test_split(pad_tokens , pad_tags , test_size =0.1, train_size = 0.9 , random_state = 2020)
    train_tokens , val_tokens,train_tags , val_tags = train_test_split(token_ , tag_ , test_size = 0.25 , train_size =0.75 ,random_state = 2020)
    
    print(
    'train_tokens length : ' , len(train_tokens),
    '\ntrain_tokens length : ' , len(train_tokens),
    '\ntest_tokens length : ' , len(test_tokens),
    '\ntest_tags : ' , len(test_tags),
    '\nval_tokens : ' , len(val_tokens),
    '\nval_tags : ' , len(val_tags)
    )
    
    return train_tokens , val_tokens, test_tokens, train_tags , val_tags , test_tags

train_tokens , val_tokens, test_tokens, train_tags , val_tags , test_tags = get_pad_train_test_val(data_group , data)

train_tokens length :  32372 
train_tokens length :  32372 
test_tokens length :  4796 
test_tags :  4796 
val_tokens :  10791 
val_tags :  10791


# Training Neural Network for Name Entity Recognition

In [5]:
import numpy as np 
import tensorflow 
from tensorflow.keras import Sequential , Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from numpy.random import seed 
seed(1)
tensorflow.random.set_seed(2)

In [6]:
input_dim = len(list(set(data['Word'].tolist()))) + 1
output_dim = 64
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
n_tags = len(tag2idx)

# Helper function to get the summary of Neural Network

In [7]:
def get_bilstm_lstm_model():
    
    model = Sequential()
    
    #Add Embedding Layer
    model.add(Embedding(input_dim = input_dim , output_dim = output_dim , input_length = input_length))
    
    #Add Bidirectional LSTM
    model.add(Bidirectional(LSTM(units = output_dim,return_sequences = True , dropout = 0.2 , recurrent_dropout = 0.2) ,merge_mode = 'concat'))
    
    #Add LSTM
    model.add(LSTM(units = output_dim , return_sequences = True , dropout = 0.5 , recurrent_dropout = 0.5))
    
    #Add Timedistributed layer 
    model.add(TimeDistributed(Dense(n_tags, activation = 'relu')))
    
    #Complie model 
    model.compile(loss = 'categorical_crossentropy' , optimizer = 'adam' ,metrics = ['accuracy'])
    model.summary()
    
    return model

In [8]:
def train_model(X,y, model):
    loss = list()
    for i in range(25):
        #fit model for one epoch on this sequence
        hist = model.fit(X,y, batch_size = 1000, verbose = 1, epochs =1, validation_split = 0.2)
        loss.append(hist.history['loss'][0])
        
    return loss

In [9]:
results = pd.DataFrame()
model_bilstm_lstm = get_bilstm_lstm_model()
plot_model(model_bilstm_lstm)
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags) , model_bilstm_lstm)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 104, 64)           2251456   
_________________________________________________________________
bidirectional (Bidirectional (None, 104, 128)          66048     
_________________________________________________________________
lstm_1 (LSTM)                (None, 104, 64)           49408     
_________________________________________________________________
time_distributed (TimeDistri (None, 104, 17)           1105      
Total params: 2,368,017
Trainable params: 2,368,017
Non-trainable params: 0
_________________________________________________________________
('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')

 1/26 [>.............................] - ETA: 0s - loss: 2.7992 - accuracy: 0.7728
 2/26 [=>...

 2/26 [=>............................] - ETA: 3:16 - loss: 0.2285 - accuracy: 0.9666
 3/26 [==>...........................] - ETA: 4:06 - loss: 0.2244 - accuracy: 0.9670
 4/26 [===>..........................] - ETA: 4:09 - loss: 0.2233 - accuracy: 0.9673
 5/26 [====>.........................] - ETA: 4:10 - loss: 0.2220 - accuracy: 0.9674
 6/26 [=====>........................] - ETA: 4:07 - loss: 0.2217 - accuracy: 0.9675

 1/26 [>.............................] - ETA: 0s - loss: 0.2029 - accuracy: 0.9677
 2/26 [=>............................] - ETA: 3:05 - loss: 0.2069 - accuracy: 0.9666
 3/26 [==>...........................] - ETA: 3:50 - loss: 0.2031 - accuracy: 0.9670
 4/26 [===>..........................] - ETA: 4:06 - loss: 0.2022 - accuracy: 0.9673
 5/26 [====>.........................] - ETA: 4:10 - loss: 0.2004 - accuracy: 0.9674
 6/26 [=====>........................] - ETA: 4:09 - loss: 0.2009 - accuracy: 0.9675

 1/26 [>.............................] - ETA: 0s - loss: 0.2061 -


 1/26 [>.............................] - ETA: 0s - loss: 0.2450 - accuracy: 0.9677
 2/26 [=>............................] - ETA: 2:55 - loss: 0.2386 - accuracy: 0.9666
 3/26 [==>...........................] - ETA: 3:59 - loss: 0.2289 - accuracy: 0.9669
 4/26 [===>..........................] - ETA: 4:14 - loss: 0.2233 - accuracy: 0.9672
 5/26 [====>.........................] - ETA: 4:22 - loss: 0.2192 - accuracy: 0.9674
 6/26 [=====>........................] - ETA: 4:17 - loss: 0.2172 - accuracy: 0.9675

 1/26 [>.............................] - ETA: 0s - loss: 0.1731 - accuracy: 0.9677
 2/26 [=>............................] - ETA: 3:04 - loss: 0.1761 - accuracy: 0.9666
 3/26 [==>...........................] - ETA: 4:03 - loss: 0.1722 - accuracy: 0.9670
 4/26 [===>..........................] - ETA: 4:24 - loss: 0.1710 - accuracy: 0.9673
 5/26 [====>.........................] - ETA: 4:33 - loss: 0.1693 - accuracy: 0.9674
 6/26 [=====>........................] - ETA: 4:31 - loss: 0.1685 -

 1/26 [>.............................] - ETA: 0s - loss: 0.1391 - accuracy: 0.9677
 2/26 [=>............................] - ETA: 3:31 - loss: 0.1408 - accuracy: 0.9666
 3/26 [==>...........................] - ETA: 4:24 - loss: 0.1387 - accuracy: 0.9670
 4/26 [===>..........................] - ETA: 4:41 - loss: 0.1394 - accuracy: 0.9673
 5/26 [====>.........................] - ETA: 4:44 - loss: 0.1416 - accuracy: 0.9674
 6/26 [=====>........................] - ETA: 4:42 - loss: 0.1455 - accuracy: 0.9675

 1/26 [>.............................] - ETA: 0s - loss: 0.1353 - accuracy: 0.9677
 2/26 [=>............................] - ETA: 3:26 - loss: 0.1376 - accuracy: 0.9666
 3/26 [==>...........................] - ETA: 4:27 - loss: 0.1345 - accuracy: 0.9670
 4/26 [===>..........................] - ETA: 4:47 - loss: 0.1334 - accuracy: 0.9673
 5/26 [====>.........................] - ETA: 4:51 - loss: 0.1317 - accuracy: 0.9675
 6/26 [=====>........................] - ETA: 4:48 - loss: 0.1311 - 


 1/26 [>.............................] - ETA: 0s - loss: 0.1190 - accuracy: 0.9678
 2/26 [=>............................] - ETA: 3:36 - loss: 0.1203 - accuracy: 0.9667
 3/26 [==>...........................] - ETA: 4:38 - loss: 0.1179 - accuracy: 0.9671
 4/26 [===>..........................] - ETA: 5:03 - loss: 0.1168 - accuracy: 0.9673
 5/26 [====>.........................] - ETA: 5:02 - loss: 0.1153 - accuracy: 0.9675
 6/26 [=====>........................] - ETA: 5:00 - loss: 0.1148 - accuracy: 0.9676

 1/26 [>.............................] - ETA: 0s - loss: 0.1131 - accuracy: 0.9679
 2/26 [=>............................] - ETA: 3:34 - loss: 0.1154 - accuracy: 0.9668
 3/26 [==>...........................] - ETA: 4:09 - loss: 0.1128 - accuracy: 0.9672
 4/26 [===>..........................] - ETA: 4:18 - loss: 0.1117 - accuracy: 0.9675
 5/26 [====>.........................] - ETA: 4:18 - loss: 0.1102 - accuracy: 0.9676
 6/26 [=====>........................] - ETA: 4:13 - loss: 0.1097 -


 1/26 [>.............................] - ETA: 0s - loss: 0.1126 - accuracy: 0.9682
 2/26 [=>............................] - ETA: 4:17 - loss: 0.1137 - accuracy: 0.9671
 3/26 [==>...........................] - ETA: 5:25 - loss: 0.1110 - accuracy: 0.9675
 4/26 [===>..........................] - ETA: 5:57 - loss: 0.1107 - accuracy: 0.9678
 5/26 [====>.........................] - ETA: 5:51 - loss: 0.1095 - accuracy: 0.9679
 6/26 [=====>........................] - ETA: 5:42 - loss: 0.1093 - accuracy: 0.9680

 1/26 [>.............................] - ETA: 0s - loss: 0.1941 - accuracy: 0.9680
 2/26 [=>............................] - ETA: 4:17 - loss: 0.1931 - accuracy: 0.9669
 3/26 [==>...........................] - ETA: 4:58 - loss: 0.1891 - accuracy: 0.9672
 4/26 [===>..........................] - ETA: 5:23 - loss: 0.1864 - accuracy: 0.9675
 5/26 [====>.........................] - ETA: 5:17 - loss: 0.1851 - accuracy: 0.9677
 6/26 [=====>........................] - ETA: 5:08 - loss: 0.1852 -


 1/26 [>.............................] - ETA: 0s - loss: 0.1067 - accuracy: 0.9684
 2/26 [=>............................] - ETA: 3:26 - loss: 0.1086 - accuracy: 0.9672
 3/26 [==>...........................] - ETA: 4:21 - loss: 0.1058 - accuracy: 0.9676
 4/26 [===>..........................] - ETA: 4:41 - loss: 0.1049 - accuracy: 0.9679
 5/26 [====>.........................] - ETA: 4:45 - loss: 0.1040 - accuracy: 0.9680
 6/26 [=====>........................] - ETA: 4:44 - loss: 0.1044 - accuracy: 0.9681

 1/26 [>.............................] - ETA: 0s - loss: 0.1882 - accuracy: 0.9687
 2/26 [=>............................] - ETA: 3:28 - loss: 0.1864 - accuracy: 0.9675
 3/26 [==>...........................] - ETA: 4:22 - loss: 0.1813 - accuracy: 0.9678
 4/26 [===>..........................] - ETA: 4:41 - loss: 0.1777 - accuracy: 0.9681
 5/26 [====>.........................] - ETA: 4:46 - loss: 0.1746 - accuracy: 0.9682
 6/26 [=====>........................] - ETA: 4:44 - loss: 0.1733 -


 1/26 [>.............................] - ETA: 0s - loss: 0.1081 - accuracy: 0.9684
 2/26 [=>............................] - ETA: 3:26 - loss: 0.1093 - accuracy: 0.9672
 3/26 [==>...........................] - ETA: 4:23 - loss: 0.1065 - accuracy: 0.9676
 4/26 [===>..........................] - ETA: 4:43 - loss: 0.1057 - accuracy: 0.9679
 5/26 [====>.........................] - ETA: 4:48 - loss: 0.1044 - accuracy: 0.9680
 6/26 [=====>........................] - ETA: 4:46 - loss: 0.1038 - accuracy: 0.9680


# Testing the Name Entity Recognition Model

In [10]:
import spacy 
from spacy import displacy 
nlp = spacy.load('en_core_web_sm')
text = nlp('Hi, My name is Anshul Mahajan \n I am from Indore, Madhya Pradesh \n I look up to Elon musk as my inspiration')

displacy.render(text , style = 'ent' , jupyter = True)