In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

np.random.seed(42)
plt.style.use("ggplot")
max_len = 50

In [2]:
data= pd.read_csv("./ner_dataset.csv",encoding="latin1")

In [3]:
# Fix the problem that Sentence number is missing a lot of times!
data = None # Magic here
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [4]:
print("Unique Words in corpus:",data['Word'].nunique())
print("Unique Tag in corpus:",data['Tag'].nunique())

Unique Words in corpus: 35178
Unique Tag in corpus: 17


In [5]:
#Get a vocabulary of available words and tags. Add the word ENDPAD to the vocab to represent padding
words = list()
words.append("ENDPAD")
num_words = len(words)
tags = list()
num_tags = len(tags)

In [7]:
class SentenceGetter(object):
  def __init__(self,data):
    self.n_sent = 1 #counter
    self.data = data
    agg_func = lambda s:[(w,p,t) for w,p,t in zip(s['Word'].tolist(),s['POS'].tolist(),s['Tag'].tolist())]
    self.grouped = self.data.groupby("Sentence #").apply(agg_func)
    self.sentences = [s for s in self.grouped]

getter = SentenceGetter(data)
sentences = getter.sentences   # This was each sentence is correctly formatted

In [8]:
sentences[0]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [9]:
#Construct a word to index map and a tag to index map
word2idx =  {}
tag2idx  =  {}

In [10]:
X = [[word2idx[w[0]] for w in s]for s in sentences]
X = pad_sequences(maxlen = max_len , sequences =X, padding='post', value =num_words-1)

y = [[tag2idx[w[2]] for w in s]for s in sentences]
y = pad_sequences(maxlen = max_len , sequences =y, padding='post', value =tag2idx["O"])
y = [to_categorical(i, num_classes=num_tags) for i in y]

# Split X and y into train and val sets.
x_train,x_test,y_train,y_test = None


In [11]:
# Create the model, first an Embedding layer. We will add dropout to avoid overfitting, this is a good technique to reproduce in NLP.
input_word = Input(shape = (max_len,))
model = Embedding( )(input_word)  # Fill
model = SpatialDropout1D(0.1)(model)
# Add a Bi-LSTM layer. Specify that the previous layer had recurrent_dropout=0.1 such that it is aware.
model = None # Fill
out = TimeDistributed(Dense(num_tags,activation='softmax'))(model)  # This layer allows to save space by reusing the weights on every word
model = None
model.summary()

2021-07-25 11:15:43.870389: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 50, 50)            1758950   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 50, 50)            0         
_________________________________________________________________
bidirectional (Bidirectional (None, 50, 200)           120800    
_________________________________________________________________
time_distributed (TimeDistri (None, 50, 17)            3417      
Total params: 1,883,167
Trainable params: 1,883,167
Non-trainable params: 0
_________________________________________________________________


In [12]:

# Compile and fit the model.
history = None

Epoch 1/5


2021-07-25 11:15:44.310281: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [13]:
# Evaluate the model to get the true accuracy
model.evaluate(x_test, np.array(y_test))



[0.05039190128445625, 0.9852502346038818]

In [14]:
# Perform a prediction on a sample sentence and evaluate if all tags were correct
i = np.random.randint(0, x_test.shape[0])
p = model.predict()  # Fill
# Fill

Word           True 	 Pred

------------------------------
NATO           B-org	B-org
spokesmen      O    	O
Thursday       B-tim	B-tim
declined       O    	O
to             O    	O
say            O    	O
where          O    	O
the            O    	O
search         O    	O
is             O    	O
taking         O    	O
place          O    	O
,              O    	O
citing         O    	O
security       O    	O
reasons        O    	O
.              O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate       O    	O
alienate     