In [6]:
import pandas as pd

In [7]:
df = pd.read_csv('ner_dataset.csv', encoding='latin1')

In [8]:
df.shape

(1048575, 4)

In [9]:
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [10]:
df.tail()

Unnamed: 0,Sentence #,Word,POS,Tag
1048570,,they,PRP,O
1048571,,responded,VBD,O
1048572,,to,TO,O
1048573,,the,DT,O
1048574,,attack,NN,O


Exploratory Data Analysis

In [11]:
df.isnull().sum()

Sentence #    1000616
Word               10
POS                 0
Tag                 0
dtype: int64

In [12]:
df.dropna(subset=['Word'],inplace=True) 

In [13]:
df.isnull().sum()

Sentence #    1000616
Word                0
POS                 0
Tag                 0
dtype: int64

In [14]:
df['Sentence #']

0          Sentence: 1
1                  NaN
2                  NaN
3                  NaN
4                  NaN
              ...     
1048570            NaN
1048571            NaN
1048572            NaN
1048573            NaN
1048574            NaN
Name: Sentence #, Length: 1048565, dtype: object

In [15]:
df['Word']

0              Thousands
1                     of
2          demonstrators
3                   have
4                marched
               ...      
1048570             they
1048571        responded
1048572               to
1048573              the
1048574           attack
Name: Word, Length: 1048565, dtype: object

In [16]:
df['Word'].nunique()

35177

In [17]:
df['POS'].unique()

array(['NNS', 'IN', 'VBP', 'VBN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'CC',
       'JJ', '.', 'VBD', 'WP', '``', 'CD', 'PRP', 'VBZ', 'POS', 'VBG',
       'RB', ',', 'WRB', 'PRP$', 'MD', 'WDT', 'JJR', ':', 'JJS', 'WP$',
       'RP', 'PDT', 'NNPS', 'EX', 'RBS', 'LRB', 'RRB', '$', 'RBR', ';',
       'UH', 'FW'], dtype=object)

In [18]:
df['Tag'].unique()

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat'], dtype=object)

In [19]:
def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}
    if token_or_tag == 'token':
        vocab = set(data['Word'].to_list())
    else:
        vocab = set(data['Tag'].to_list())
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok

In [20]:
token2idx, idx2token = get_dict_map(df, 'token')
tag2idx, idx2tag = get_dict_map(df, 'tag')

In [21]:
df['Word_idx'] = df['Word'].map(token2idx)
df['Tag_idx'] = df['Tag'].map(tag2idx)

In [22]:
df

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,Thousands,NNS,O,19859,0
1,,of,IN,O,14069,0
2,,demonstrators,NNS,O,32811,0
3,,have,VBP,O,17112,0
4,,marched,VBN,O,22881,0
...,...,...,...,...,...,...
1048570,,they,PRP,O,33389,0
1048571,,responded,VBD,O,4917,0
1048572,,to,TO,O,33787,0
1048573,,the,DT,O,27390,0


In [23]:
df_fillna = df.fillna(method='ffill', axis=0)

  df_fillna = df.fillna(method='ffill', axis=0)


In [24]:
df_fillna

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,Thousands,NNS,O,19859,0
1,Sentence: 1,of,IN,O,14069,0
2,Sentence: 1,demonstrators,NNS,O,32811,0
3,Sentence: 1,have,VBP,O,17112,0
4,Sentence: 1,marched,VBN,O,22881,0
...,...,...,...,...,...,...
1048570,Sentence: 47959,they,PRP,O,33389,0
1048571,Sentence: 47959,responded,VBD,O,4917,0
1048572,Sentence: 47959,to,TO,O,33787,0
1048573,Sentence: 47959,the,DT,O,27390,0


In [25]:
# Groupby and collect columns
df_group = df_fillna.groupby(
    ['Sentence #'], as_index=False
)[['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx']].agg(lambda x: list(x))

In [26]:
df_group

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[19859, 14069, 32811, 17112, 22881, 13520, 128...","[0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 15, 0, 0..."
1,Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[32806, 34180, 16904, 33389, 18376, 33787, 960...","[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O...","[11094, 13853, 28060, 15351, 8556, 20404, 9874...","[0, 0, 14, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 1..."
3,Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[PRP, VBD, IN, DT, NN, JJ, NN, IN, NN, NNS, .]","[O, O, O, O, O, O, O, O, O, O, O]","[22328, 31926, 10185, 15671, 28997, 19536, 242...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[NNP, NN, NN, NNP, NNP, VBD, NNP, ,, NNP, ,, J...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo...","[7022, 15017, 11283, 3228, 4224, 21953, 26206,...","[15, 0, 0, 3, 13, 0, 14, 0, 15, 0, 2, 0, 2, 0,..."
...,...,...,...,...,...,...
47944,Sentence: 9995,"[Opposition, leader, Mir, Hossein, Mousavi, ha...","[NNP, NN, NNP, NNP, NNP, VBZ, VBN, PRP, VBZ, T...","[O, O, O, B-per, I-per, O, O, O, O, O, O, O, O...","[6867, 30450, 30453, 8709, 20208, 4703, 21953,...","[0, 0, 0, 3, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
47945,Sentence: 9996,"[On, Thursday, ,, Iranian, state, media, publi...","[IN, NNP, ,, JJ, NN, NNS, VBN, DT, NN, IN, DT,...","[O, B-tim, O, B-gpe, O, O, O, O, O, O, O, O, B...","[1091, 14519, 5044, 32806, 26280, 709, 28558, ...","[0, 14, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 16, 11, ..."
47946,Sentence: 9997,"[Following, Iran, 's, disputed, June, 12, elec...","[VBG, NNP, POS, JJ, NNP, CD, NNS, ,, NNS, NNS,...","[O, B-geo, O, O, B-tim, I-tim, O, O, O, O, O, ...","[11610, 22867, 24659, 21838, 9216, 16782, 2634...","[0, 15, 0, 0, 14, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
47947,Sentence: 9998,"[Since, then, ,, authorities, have, held, publ...","[IN, RB, ,, NNS, VBP, VBN, JJ, NNS, IN, DT, VB...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[16695, 4430, 5044, 21041, 17112, 26913, 17570...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [27]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical



In [28]:
def get_pad_train_test_val(data_group, data):

    #get max token and tag length
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))

    #Pad tokens (X var)    
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= n_token - 1)

    #Pad Tags (y var) and convert it into one hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]
    
    #Split train, test and validation set
    tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_,tags_,test_size = 0.25,train_size =0.75, random_state=2020)

    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tokens length:', len(train_tokens),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )
    
    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = get_pad_train_test_val(df_group, df)

train_tokens length: 32365 
train_tokens length: 32365 
test_tokens length: 4795 
test_tags: 4795 
val_tokens: 10789 
val_tags: 10789


In [29]:
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [113]:
input_dim = len(list(set(df['Word'].to_list())))+1
output_dim = 64
input_length = max([len(s) for s in df_group['Word_idx'].tolist()])
n_tags = len(tag2idx)

In [114]:
def get_bilstm_lstm_model():
    model = Sequential()

    # Add Embedding layer
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

    # Add bidirectional LSTM
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))

    # Add LSTM
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Add timeDistributed Layer
    model.add(TimeDistributed(Dense(n_tags, activation="relu")))

    #Optimiser 
    # adam = k.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    
    return model

In [117]:
def train_model(X, y, model):
    loss = list()
    for i in range(25):
        # fit model for one epoch on this sequence
        hist = model.fit(X, y, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss

In [None]:
results = pd.DataFrame()
model_bilstm_lstm = get_bilstm_lstm_model()
# Build the model by calling it on a batch of data before plotting
model_bilstm_lstm.build(input_shape=(None, input_length))
plot_model(model_bilstm_lstm)
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm)



You must install pydot (`pip install pydot`) for `plot_model` to work.
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 2s/step - accuracy: 0.7941 - loss: 2.6050 - val_accuracy: 0.9681 - val_loss: 0.3490
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 2s/step - accuracy: 0.9677 - loss: 0.3383 - val_accuracy: 0.9681 - val_loss: 0.2497
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m500s[0m 20s/step - accuracy: 0.9676 - loss: 0.2789 - val_accuracy: 0.9681 - val_loss: 0.2349
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 2s/step - accuracy: 0.9677 - loss: 0.2547 - val_accuracy: 0.9681 - val_loss: 0.2031
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m772s[0m 31s/step - accuracy: 0.9677 - loss: 0.2263 - val_accuracy: 0.9682 - val_loss: 0.1820
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 2s/step - accuracy: 0.9677 - loss: 0.2053 - val_accuracy: 0.9681 - val_loss: 0.1682
[1m26/26[0m [32m━━━━━━━━━━━━

In [127]:
model_bilstm_lstm.summary()

In [35]:
# Download the spaCy model if not already present
# import spacy.cli
# spacy.cli.download("en_core_web_sm")

In [36]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
text = nlp('Hi, My name is Aman Kharwal \n I am from India \n I want to work with Google \n Steve Jobs is My Inspiration')
displacy.render(text, style = 'ent', jupyter=True)