<a href="https://colab.research.google.com/github/blawok/Named_Entity_Recognition/blob/master/ner_keras_lime.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

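A full run of the example from https://www.depends-on-the-definition.com/interpretable-named-entity-recognition/: a Keras BiLSTM tagger is trained for named entity recognition, and its per-token predictions are then explained with LIME (via `eli5`).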

In [0]:
import torch

# Show which GPU this session was given. On a CPU-only runtime
# `get_device_name(0)` raises, so fall back to a plain message instead of
# aborting a Restart-and-Run-All at the very first cell.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No CUDA device available"

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

# Location of the token-level NER dataset (one row per token) on the mounted Drive.
DATA_PATH = "/content/drive/My Drive/Colab Notebooks/NER/ner_dataset.csv"

# Forward-fill missing cells with the value from the row above.
# NOTE(review): presumably "Sentence #" is only populated on the first token of
# each sentence in the raw CSV - confirm against the file.
# `.ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
data = pd.read_csv(DATA_PATH, encoding="latin1").ffill()
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [3]:
# Distinct word forms in order of first appearance. `Series.unique()` is
# deterministic, whereas `list(set(...))` yields a different order on each
# kernel start (string hashing is randomised), which made every index derived
# from `words` irreproducible between runs.
words = data["Word"].unique().tolist()
n_words = len(words)
n_words

35178

In [4]:
# Distinct tag labels in order of first appearance. Using `unique()` instead of
# `list(set(...))` keeps the tag order - and therefore every tag index built
# from it - identical across kernel restarts.
tags = data["Tag"].unique().tolist()
n_tags = len(tags)
n_tags

17

In [0]:
class SentenceGetter(object):
    """Group a token-level DataFrame into sentences.

    The frame must provide the columns ``"Sentence #"``, ``"Word"``, ``"POS"``
    and ``"Tag"``. Rows sharing a ``"Sentence #"`` value form one sentence,
    represented as a list of ``(word, pos, tag)`` tuples.

    Attributes:
        n_sent: 1-based number of the sentence ``get_next`` returns next.
        data: the DataFrame passed to the constructor.
        empty: always ``False``; not read by this class, kept so existing
            attribute access keeps working.
        grouped: Series indexed by ``"Sentence #"`` holding one tuple list per
            sentence.
        sentences: the values of ``grouped`` as a plain list.

    NOTE: ``groupby`` sorts its keys, and the keys are strings such as
    ``"Sentence: 10"``, so ``sentences`` is in lexicographic key order, not
    numeric order.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        self.grouped = self.data.groupby("Sentence #").apply(self._to_triples)
        self.sentences = [s for s in self.grouped]

    @staticmethod
    def _to_triples(s):
        """Return the rows of one sentence group as ``(word, pos, tag)`` tuples."""
        return list(zip(s["Word"].values.tolist(),
                        s["POS"].values.tolist(),
                        s["Tag"].values.tolist()))

    def get_next(self):
        """Return the next sentence by number, or ``None`` when there is none.

        Looks up the key ``"Sentence: <n_sent>"`` and advances ``n_sent`` only
        when the lookup succeeds.
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # Only a missing key means "no more sentences"; any other error
            # (including KeyboardInterrupt) is allowed to propagate.
            return None
        self.n_sent += 1
        return s

In [0]:
# Group the token-level frame into sentences once; each entry of
# `getter.sentences` is a list of (word, POS, tag) tuples.
getter = SentenceGetter(data)
sentences = getter.sentences

In [7]:
# Split every sentence into its tag sequence and its whitespace-joined text.
# Both are derived from `getter.sentences` rather than from `sentences`, because
# this cell rebinds `sentences` to plain strings: reading from the name it
# overwrites made the cell fail on a second execution.
labels = [[tag for _, _, tag in sent] for sent in getter.sentences]
sentences = [" ".join(word for word, _, _ in sent) for sent in getter.sentences]
sentences[0]

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

In [8]:
# Tag sequence of the first sentence, one tag per token.
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [9]:
from collections import Counter
from keras.preprocessing.sequence import pad_sequences

# Count how often each word form occurs, then keep the 5000 most frequent
# ones as the model vocabulary.
word_cnt = Counter(data["Word"].values)
vocabulary = {word for word, _count in word_cnt.most_common(5000)}

Using TensorFlow backend.


In [0]:
# Every sentence is padded / truncated to this many tokens.
max_len = 50

# Ids 0 and 1 are reserved for padding and out-of-vocabulary tokens.
word2idx = {"PAD": 0, "UNK": 1}

# Number the in-vocabulary words consecutively *after* the reserved ids.
# Enumerating `words` from 0 could hand a real word id 0 or 1, silently
# aliasing it with PAD or UNK.
kept_words = [w for w in words if w in vocabulary and w not in word2idx]
word2idx.update({w: i for i, w in enumerate(kept_words, start=len(word2idx))})

# The embedding layer is sized with `n_words`, so every id must stay below it.
assert max(word2idx.values()) < n_words, "word ids exceed the embedding input_dim"

tag2idx = {t: i for i, t in enumerate(tags)}

In [0]:
# Encode tokens as vocabulary ids (unknown words map to UNK), then pad at the
# end with PAD up to `max_len`.
unk_id = word2idx["UNK"]
token_ids = [[word2idx.get(tok, unk_id) for tok in sent.split()] for sent in sentences]
X = pad_sequences(sequences=token_ids, maxlen=max_len,
                  padding="post", value=word2idx["PAD"])

# Encode the tag sequences the same way; padded positions get the "O" tag.
tag_ids = [[tag2idx[tag] for tag in sent_tags] for sent_tags in labels]
y = pad_sequences(sequences=tag_ids, maxlen=max_len,
                  padding="post", value=tag2idx["O"])

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the sentences for testing. With shuffle=False the rows are
# not shuffled before splitting, so the test set is the final 10% of `X` / `y`.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1, shuffle=False)

In [0]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, SpatialDropout1D, Bidirectional

In [0]:
# Token-id input of fixed length `max_len`.
word_input = Input(shape=(max_len,))

# 50-dimensional embeddings followed by light spatial dropout.
embedded = Embedding(input_dim=n_words, output_dim=50, input_length=max_len)(word_input)
regularised = SpatialDropout1D(0.1)(embedded)

# Bidirectional LSTM that returns one output per time step. The name `model`
# is kept for this tensor so every binding this cell creates is unchanged.
bilstm = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))
model = bilstm(regularised)

# Per-token softmax over the tag set.
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)

In [0]:
# Wrap the layer graph in a Model and configure training. The targets are
# integer tag ids, hence the sparse categorical cross-entropy loss.
model = Model(inputs=word_input, outputs=out)
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

In [16]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 50, 50)            1758900   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 50, 50)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 200)           120800    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 50, 17)            3417      
Total params: 1,883,117
Trainable params: 1,883,117
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Train for 5 epochs, holding back 10% of the training sentences for validation.
# The integer targets get a trailing axis of size 1 before being passed to fit.
y_tr_expanded = np.expand_dims(y_tr, axis=-1)
history = model.fit(
    X_tr,
    y_tr_expanded,
    epochs=5,
    batch_size=32,
    validation_split=0.1,
    verbose=1,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 38846 samples, validate on 4317 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
!pip install eli5

Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)
[K     |███                             | 10kB 17.8MB/s eta 0:00:01[K     |██████▏                         | 20kB 1.7MB/s eta 0:00:01[K     |█████████▎                      | 30kB 2.2MB/s eta 0:00:01[K     |████████████▍                   | 40kB 2.5MB/s eta 0:00:01[K     |███████████████▌                | 51kB 2.0MB/s eta 0:00:01[K     |██████████████████▋             | 61kB 2.3MB/s eta 0:00:01[K     |█████████████████████▊          | 71kB 2.5MB/s eta 0:00:01[K     |████████████████████████▊       | 81kB 2.7MB/s eta 0:00:01[K     |███████████████████████████▉    | 92kB 2.9MB/s eta 0:00:01[K     |███████████████████████████████ | 102kB 2.8MB/s eta 0:00:01[K     |████████████████████████████████| 112kB 2.8MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.10.1


In [20]:
from eli5.lime import TextExplainer
from eli5.lime.samplers import MaskingTextSampler



In [0]:
class NERExplainerGenerator(object):
    """Build per-token prediction functions for a sequence-tagging model.

    Each function produced by ``get_predict_function`` takes a list of texts
    and returns the model's output for one fixed token position.

    Args:
        model: object with a ``predict(X)`` method; its result is indexed as
            ``p[:, word_index, :]``.
            # assumes shape (n_texts, max_len, n_tags) - TODO confirm
        word2idx: token -> id mapping; must contain ``"PAD"`` and ``"UNK"``.
        tag2idx: tag -> id mapping; ``idx2tag`` is its inverse.
        max_len: length every encoded text is padded / truncated to.
    """

    def __init__(self, model, word2idx, tag2idx, max_len):
        self.model = model
        self.word2idx = word2idx
        self.tag2idx = tag2idx
        self.idx2tag = {v: k for k, v in tag2idx.items()}
        self.max_len = max_len

    def _preprocess(self, texts):
        """Encode whitespace-tokenised texts as padded id sequences.

        Unknown tokens map to ``UNK``; short texts are padded at the end with
        ``PAD``. Over-long texts are cut at the *end* (``truncating="post"``):
        ``pad_sequences`` truncates from the front by default, which shifts
        every token and makes ``word_index`` point at the wrong word.
        """
        X = [[self.word2idx.get(w, self.word2idx["UNK"]) for w in t.split()]
             for t in texts]
        X = pad_sequences(maxlen=self.max_len, sequences=X,
                          padding="post", truncating="post",
                          value=self.word2idx["PAD"])
        return X

    def get_predict_function(self, word_index):
        """Return ``predict_func(texts)`` giving the output at ``word_index``.

        Raises:
            IndexError: if ``word_index`` lies outside the ``max_len`` window.
                Raised here rather than from deep inside the array indexing
                on the first prediction.
        """
        if not -self.max_len <= word_index < self.max_len:
            raise IndexError(
                "word_index {} is out of range for max_len {}".format(
                    word_index, self.max_len))

        def predict_func(texts):
            X = self._preprocess(texts)
            p = self.model.predict(X)
            # Keep only the output at the chosen token position.
            return p[:, word_index, :]
        return predict_func

In [22]:
# Pick one sentence to explain, then show it both raw and with the gold tag
# next to each token.
index = 46781
text, label = sentences[index], labels[index]
print(text)
print()
annotated = [f"{t} ({l})" for t, l in zip(text.split(), label)]
print(" ".join(annotated))

Nigeria 's President Olusegun Obasanjo expressed his condolences , noting the late pontiff promoted religious tolerance and democracy in the West African nation .

Nigeria (B-geo) 's (O) President (B-per) Olusegun (I-per) Obasanjo (I-per) expressed (O) his (O) condolences (O) , (O) noting (O) the (O) late (O) pontiff (O) promoted (O) religious (O) tolerance (O) and (O) democracy (O) in (O) the (O) West (O) African (B-gpe) nation (O) . (O)


In [23]:
# List each whitespace token with its position so one can be chosen as the
# token to explain.
tokens = text.split()
for i, w in enumerate(tokens):
    print(f"{i}: {w}")

0: Nigeria
1: 's
2: President
3: Olusegun
4: Obasanjo
5: expressed
6: his
7: condolences
8: ,
9: noting
10: the
11: late
12: pontiff
13: promoted
14: religious
15: tolerance
16: and
17: democracy
18: in
19: the
20: West
21: African
22: nation
23: .


In [0]:
# Bundle the trained model with its vocabulary / tag mappings so per-token
# prediction functions can be created from it.
explainer_generator = NERExplainerGenerator(model, word2idx, tag2idx, max_len)

In [0]:
# Explain the prediction for token 4 of the sentence ("Obasanjo" in the
# listing above).
word_index = 4
predict_func = explainer_generator.get_predict_function(word_index=word_index)

In [0]:
# Sampler that perturbs a text by masking tokens with the literal "UNK".
# NOTE(review): per the eli5 docs, `max_replace` caps the fraction of tokens
# replaced and `bow=False` masks occurrences individually - confirm against the
# installed version.
sampler_options = dict(
    token_pattern=None,
    bow=False,
    replacement="UNK",
    max_replace=0.7,
)
sampler = MaskingTextSampler(**sampler_options)

In [27]:
# Draw four perturbed variants of `text` as a sanity check of the sampler.
# `similarity` is returned alongside the samples but is not inspected here.
samples, similarity = sampler.sample_near(text, n_samples=4)
print(samples)

("UNK 's UNK UNK UNK UNK UNK UNK , noting the UNK pontiff promoted religious UNK UNK UNK UNK UNK West African UNK .", "Nigeria 'UNK President Olusegun Obasanjo expressed UNK condolences , noting UNK UNK UNK promoted UNK UNK UNK UNK in the UNK African nation .", "Nigeria 's President Olusegun UNK expressed his condolences , noting the late UNK promoted religious UNK and democracy UNK the West African nation .", "UNK 'UNK President Olusegun Obasanjo expressed UNK condolences , noting the late pontiff promoted UNK UNK UNK UNK in UNK West UNK nation .")


In [28]:
# Fit a LIME explainer around the per-token prediction function, then show the
# three top-scoring tags for the chosen token.
te = TextExplainer(sampler=sampler, position_dependent=True, random_state=42)
te.fit(text, predict_func)

# Tag names listed by ascending tag index. `idx2tag` was built in that order,
# so this is the same list its `.values()` yields.
tag_names = [tag for _idx, tag in sorted(explainer_generator.idx2tag.items())]

te.explain_prediction(target_names=tag_names, top_targets=3)

Contribution?,Feature
4.415,Highlighted in text (sum)
-0.828,<BIAS>

Contribution?,Feature
-0.838,<BIAS>
-3.869,Highlighted in text (sum)

Contribution?,Feature
-2.18,Highlighted in text (sum)
-2.564,<BIAS>
