In [40]:
import pandas as pd

# Load the word-level NER corpus: one row per word with its POS and entity tag.
# In the preview below, 'Sentence #' is only filled on the first word of a
# sentence; the following rows leave it empty.
data = pd.read_csv(
    'ner_dataset.csv',
    encoding='unicode_escape',
)
# Show the first rows to confirm the columns parsed as expected.
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [41]:
from itertools import chain
def MapTokens(data, value):
    """Build lookup tables between vocabulary entries and integer indices.

    Args:
        data: DataFrame containing 'Word' and 'Tag' columns.
        value: 'token' to index the 'Word' column; any other value indexes
            the 'Tag' column.

    Returns:
        A tuple (token_to_index, index_to_token) of dicts that are inverse
        mappings of each other. Indices run from 0 to len(vocab) - 1.
    """
    column = 'Word' if value == 'token' else 'Tag'

    # De-duplicate in first-appearance order. Iterating a set() of strings
    # yields an order that changes from one interpreter run to the next, which
    # would silently re-number every word/tag after a kernel restart.
    vocab = list(dict.fromkeys(data[column].to_list()))

    index_to_token = dict(enumerate(vocab))
    token_to_index = {tok: idx for idx, tok in index_to_token.items()}
    return token_to_index, index_to_token

# Lookup tables for words and for entity tags (both directions).
token_to_index, index_to_token = MapTokens(data, 'token')
tag_to_index, index_to_tag = MapTokens(data, 'tag')

# Add an integer-encoded companion column next to each text column.
for column, lookup in (('Word', token_to_index), ('Tag', tag_to_index)):
    data[column + '_idx'] = data[column].map(lookup)


In [42]:
# Forward-fill down the rows so every word row carries its 'Sentence #'
# (ffill applies to every column, exactly as the previous fillna call did).
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
fixed_data = data.ffill(axis=0)

# Collapse the per-word rows into one row per sentence, with each selected
# column holding that sentence's values as a list. The columns are selected
# with a list: selecting them with a bare tuple is deprecated and is rejected
# by pandas 2.x.
dataGroup = (
    fixed_data
    .groupby(['Sentence #'], as_index=False)[['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx']]
    .agg(lambda x: list(x))
)

  dataGroup = fixed_data.groupby(


In [43]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
def getTrainTestSplit(data_group, data):
    """Pad the per-sentence index lists and split them into train/validation/test.

    Reads the module-level ``tag_to_index`` mapping for the number of tag
    classes and for the index of the "O" tag used to pad tag sequences.

    Args:
        data_group: DataFrame with one row per sentence and list-valued
            'Word_idx' and 'Tag_idx' columns.
        data: word-level DataFrame with a 'Word' column; used only to count
            the vocabulary.

    Returns:
        (train_tokens, validation_tokens, test_tokens,
         train_tags, validation_tags, test_tags).
        Token sets are int32 arrays padded to the longest sentence; tag sets
        are lists of one-hot encoded arrays, one per sentence.
    """
    vocab_size = len(set(data['Word'].to_list()))
    number_of_tags = len(tag_to_index)

    # Pad every sentence to the length of the longest one.
    tokens = data_group['Word_idx'].tolist()
    maxlen = max(len(s) for s in tokens)
    # Word indices occupy 0 .. vocab_size - 1, so vocab_size is the first id
    # that does not belong to a real word. Padding with vocab_size - 1 (as
    # before) made padding indistinguishable from that word. The embedding
    # layer is sized vocab_size + 1, so this id is a valid row.
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value=vocab_size)

    # Pad the tag sequences with the "O" tag, then one-hot encode each sentence.
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value=tag_to_index["O"])
    pad_tags = [to_categorical(i, num_classes=number_of_tags) for i in pad_tags]

    # Hold out 10% for test, then split the remainder 75/25 into
    # train/validation. Fixed seeds keep the partition reproducible.
    tokens_, test_tokens, tags_, test_tags = train_test_split(
        pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, validation_tokens, train_tags, validation_tags = train_test_split(
        tokens_, tags_, test_size=0.25, train_size=0.75, random_state=2020)

    print(
        'trainTokens length:', len(train_tokens),
        '\ntrainTags length', len(train_tags),
        '\ntestTokens length:', len(test_tokens),
        '\ntestTags length:', len(test_tags),
        '\nvalTokens length:', len(validation_tokens),
        '\nvalTags length:', len(validation_tags),
    )

    return train_tokens, validation_tokens, test_tokens, train_tags, validation_tags, test_tags

# Materialise the padded train / validation / test partitions used below.
(
    train_tokens, validation_tokens, testTokens,
    train_tags, validation_tags, test_tags,
) = getTrainTestSplit(dataGroup, data)
    

trainTokens length: 32372 
trainTags length 32372 
testTokens length: 4796 
testTags length: 4796 
valTokens length: 10791 
valTags length: 10791


In [44]:
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model

from numpy.random import seed
# Seed NumPy and TensorFlow so weight initialisation and shuffling repeat.
seed(1)
tensorflow.random.set_seed(2)

# Embedding vocabulary size: one row per distinct word plus one extra row.
input_dimensions = len(list(set(data['Word'].to_list())))+1
# Width of the embedding vectors; also reused as the LSTM unit count.
output_dimensions = 64
# Longest sentence, in words. The grouped frame is named `dataGroup`;
# `data_group` exists only as a parameter inside getTrainTestSplit, so
# referencing it here raised NameError on a fresh kernel.
input_length = max([len(s) for s in dataGroup['Word_idx'].tolist()])
number_of_tags = len(tag_to_index)



In [45]:
def get_bilstm_lstm_model():
    """Build and compile the Embedding -> BiLSTM -> LSTM -> Dense tagger.

    Reads the module-level ``input_dimensions``, ``output_dimensions``,
    ``input_length`` and ``number_of_tags`` settings. Prints the model
    summary as a side effect.

    Returns:
        The compiled Sequential model, producing one tag distribution per
        time step.
    """
    model = Sequential()
    # Map word indices to dense vectors.
    model.add(Embedding(input_dim=input_dimensions, output_dim=output_dimensions, input_length=input_length))
    # Bidirectional LSTM; forward and backward outputs are concatenated.
    model.add(Bidirectional(LSTM(units=output_dimensions, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))
    # Second, unidirectional LSTM that still emits one output per time step.
    model.add(LSTM(units=output_dimensions, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))
    # Per-time-step classifier. categorical_crossentropy expects a probability
    # distribution over the tags, so the output activation must be softmax;
    # relu produces unnormalised values and can emit all zeros.
    model.add(TimeDistributed(Dense(number_of_tags, activation="softmax")))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

In [46]:
def train_model(X, y, model, epochs=25, batch_size=1000, validation_split=0.2):
    """Fit ``model`` on (X, y) and return the training loss of every epoch.

    Args:
        X: model inputs, passed straight to ``model.fit``.
        y: training targets, passed straight to ``model.fit``.
        model: object exposing a Keras-style ``fit`` method.
        epochs: number of training epochs (default 25, the previous
            hard-coded value).
        batch_size: samples per gradient update (default 1000).
        validation_split: fraction of the data ``fit`` holds out for
            validation (default 0.2).

    Returns:
        A list with one training-loss value per epoch, in order.
    """
    # A single fit call covers all epochs; the returned History already
    # records the per-epoch loss, so there is no need to re-enter fit once
    # per epoch and collect the values by hand.
    hist = model.fit(
        X,
        y,
        batch_size=batch_size,
        verbose=1,
        epochs=epochs,
        validation_split=validation_split,
    )
    # With zero epochs no 'loss' entry is recorded; return an empty list then.
    return list(hist.history.get('loss', []))

# Build the tagger, train it, and keep its per-epoch loss curve in `results`
# under the 'with_add_lstm' column.
model_bilstm_lstm = get_bilstm_lstm_model()
results = pd.DataFrame()
results['with_add_lstm'] = train_model(
    train_tokens,
    np.array(train_tags),
    model_bilstm_lstm,
)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 104, 64)           2251456   
                                                                 
 bidirectional_4 (Bidirecti  (None, 104, 128)          66048     
 onal)                                                           
                                                                 
 lstm_9 (LSTM)               (None, 104, 64)           49408     
                                                                 
 time_distributed_4 (TimeDi  (None, 104, 17)           1105      
 stributed)                                                      
                                                                 
Total params: 2368017 (9.03 MB)
Trainable params: 2368017 (9.03 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [1]:
import spacy
from spacy import displacy
# Load spaCy's small English pipeline and run it over a sample paragraph.
nlp = spacy.load('en_core_web_sm')
Content = nlp(
    'My name is Andrew Dusa and I am from the United States. '
    'I hope to inspire the world to learn to code. '
    'I would like the opportunity to work at a tech company like '
    'Microsoft, Google, Apple, or Amazon.'
)
# Render the recognised entities inline in the notebook.
displacy.render(Content, style='ent', jupyter=True)