In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.utils import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import Model, Input
from keras.layers import LSTM, Embedding, Dense
from keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional
from sklearn.model_selection import train_test_split

In [3]:
data=pd.read_csv("ner_datasetreference.csv", encoding="latin1")
data=data.fillna(method="ffill")
data.head(20)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [4]:
print("Unique words in corpus:",data['Word'].nunique())
print("Unique tags in corpus:",data['Tag'].nunique())

Unique words in corpus: 35178
Unique tags in corpus: 17


In [5]:
words=list(set(data["Word"].values))
words.append("ENDPAD")
num_words=len(words)

In [6]:
tags=list(set(data["Tag"].values))
num_tags=len(tags)

In [7]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent=1
        self.data=data
        self.empty=False
        agg_func=lambda s:[(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped=self.data.groupby("Sentence #").apply(agg_func)
        self.sentences=[s for s in self.grouped]
    
    def get_next(self):
        try:
            s=self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent+=1
            return s
        except:
            return None

In [8]:
getter=SentenceGetter(data)
sentences=getter.sentences

In [9]:
sentences[0]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [10]:
word2idx={w:i + 1 for i, w in enumerate(words)}
tag2idx={t:i for i, t in enumerate(tags)}

In [11]:
word2idx

{'Bud': 1,
 'blockades': 2,
 'unsaturated': 3,
 'padded': 4,
 'Publication': 5,
 'anti-government': 6,
 'torchbearer': 7,
 'regimes': 8,
 'volt': 9,
 'credited': 10,
 'shows': 11,
 'penalize': 12,
 'Realtors': 13,
 'Lausanne': 14,
 'Kahar': 15,
 'apologies': 16,
 'Restore': 17,
 'punctuated': 18,
 'dad': 19,
 '03-Jun': 20,
 'glided': 21,
 '706': 22,
 'Amplio': 23,
 'measuring': 24,
 'Finland': 25,
 'hindering': 26,
 'dozens': 27,
 'Bids': 28,
 'Guehenno': 29,
 'Six-party': 30,
 'partner': 31,
 'coercion': 32,
 'export-oriented': 33,
 'interfere': 34,
 'shepherd': 35,
 'Brigades': 36,
 'Sur': 37,
 'tradition': 38,
 'Merimee': 39,
 'everyday': 40,
 'Sepat': 41,
 '5,19,000': 42,
 'discontinued': 43,
 'accepts': 44,
 'export-led': 45,
 'Enkhbayar': 46,
 'playlists': 47,
 'climb': 48,
 'anticipate': 49,
 'Demand': 50,
 'Jalbire': 51,
 '150': 52,
 'Piracy': 53,
 'designates': 54,
 'couple': 55,
 'metric': 56,
 'Ma': 57,
 'fining': 58,
 'Democrat': 59,
 'Aubenas': 60,
 'Polynesian': 61,
 'Rav

In [12]:
max_len=50
X=[[word2idx[w[0]] for w in s] for s in sentences]
X=pad_sequences(maxlen=max_len, sequences=X, padding="post", value=num_words-1)
y=[[tag2idx[w[2]] for w in s] for s in sentences]
y=pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

In [13]:
x_train, x_test, y_train, y_test=train_test_split(X, y, test_size=0.25, random_state=1)

In [14]:
input_word=Input(shape=(max_len,))
model=Embedding(input_dim=num_words, output_dim=50, input_length=max_len)(input_word)
model=SpatialDropout1D(0.1)(model)
model=Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out=TimeDistributed(Dense(num_tags, activation="softmax"))(model)
model=Model(input_word, out)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 50)]              0         
                                                                 
 embedding (Embedding)       (None, 50, 50)            1758950   
                                                                 
 spatial_dropout1d (SpatialD  (None, 50, 50)           0         
 ropout1D)                                                       
                                                                 
 bidirectional (Bidirectiona  (None, 50, 200)          120800    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 50, 17)           3417      
 ibuted)                                                         
                                                             

In [15]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [16]:
chkpt=ModelCheckpoint("model_weights.h5", monitor='val_loss',verbose=1, save_best_only=True, save_weights_only=True, mode='min')
early_stopping=EarlyStopping(monitor='val_accuracy', min_delta=0, patience=1, verbose=0, mode='max', baseline=None, restore_best_weights=False)
history=model.fit(x=x_train, y=y_train, validation_data=(x_test,y_test), batch_size=32, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
model.evaluate(x_test, y_test)



[0.05117364227771759, 0.9857881665229797]

In [20]:
i=np.random.randint(0, x_test.shape[0]) 
p=model.predict(np.array([x_test[i]]))
p=np.argmax(p, axis=-1)
y_true = y_test[i]
print("{:15}{:5}\t {}\n".format("Word", "True", "Pred"))
print("-" *30)
for w, true, pred in zip(x_test[i], y_true, p[0]):
    print("{:15}{}\t{}".format(words[w-1], tags[true], tags[pred]))

Word           True 	 Pred

------------------------------
Meanwhile      O	O
,              O	O
Bulgaria       B-geo	B-geo
says           O	O
U.S.           B-geo	B-geo
forces         O	O
have           O	O
admitted       O	O
responsibility O	O
for            O	O
the            O	O
"              O	O
friendly-fire  O	O
"              O	O
death          O	O
of             O	O
a              O	O
Bulgarian      B-gpe	B-gpe
soldier        O	O
in             O	O
southern       B-geo	B-geo
Iraq           I-geo	I-geo
a              O	O
week           O	O
ago            O	O
.              O	O
Faisal         O	O
Faisal         O	O
Faisal         O	O
Faisal         O	O
Faisal         O	O
Faisal         O	O
Faisal         O	O
Faisal         O	O
Faisal         O	O
Faisal         O	O
Faisal         O	O
Faisal         O	O
Faisal         O	O
Faisal         O	O
Faisal         O	O
Faisal         O	O
Faisal         O	O
Faisal         O	O
Faisal         O	O
Faisal         O	O
Faisal         O	O
Faisal  