## Sequence Tagging using Sequential Models

Sequence Tagging is an information extraction technique to identify and classify named entities in text. These entities can be pre-defined and generic like location names, organizations, time and etc...

In [1]:
%tensorflow_version 2.x
import tensorflow
tensorflow.__version__

TensorFlow 2.x selected.


'2.1.0-rc1'

#### Desired Sample Output

<img src="https://miro.medium.com/max/2400/1*8LOMipM-fmszClg-AwATkQ.png">

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
project_path = '/content/drive/My Drive/Mentor decks DL-20191205T083139Z-001/Mentor decks DL/NLP - Mentor deck/Week 3 - NLP  - Mentor deck/'
glove_file = '/content/drive/My Drive/glove.6B.zip'
ner_dataset_file = project_path + 'ner_dataset.csv.zip'

In [0]:
from zipfile import ZipFile

with ZipFile(glove_file) as z:
  z.extractall()

In [0]:
with ZipFile(ner_dataset_file) as z:
  z.extractall()

In [0]:
# vocab_size = 27873
# # load the whole embedding into memory
# embeddings_index = dict()
# f = open('./glove.6B.100d.txt')

# for line in f:
# 	values = line.split()
# 	word = values[0]
# 	coefs = np.asarray(values[1:], dtype='float32')
# 	embeddings_index[word] = coefs
# f.close()
# print('Loaded %s word vectors.' % len(embeddings_index))

# # create a weight matrix for words in training docs
# embedding_matrix = np.zeros((vocab_size, 100))


# for word, i in t.word_index.items():
# 	embedding_vector = embeddings_index.get(word)
# 	if embedding_vector is not None:
# 		embedding_matrix[i] = embedding_vector

##### Take each word from the glove embedding and pass into the input layer of the model.

### Files required are given in below link

https://drive.google.com/drive/folders/1m9JjfsAEN50flYwFPCgZWQ5nHXGt0ZwI?usp=sharing



In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use("ggplot")

# from google.colab import drive
# drive.mount('/content/drive/')

In [9]:
data = pd.read_csv("ner_dataset.csv", encoding="latin-1")
# data = data.drop(['POS'], axis =1)
# data = data.fillna(method="ffill")
data.head(30)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


### Fill the NaN with Sentence tag to easily identify the words of a single sentence.

In [0]:
data = data.fillna(method="ffill")

### Drop POS column from dataset as we are only interested in tags for sentence tagging.

In [0]:
data = data.drop(['POS'], axis =1)

In [12]:
data.head(30)

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,Thousands,O
1,Sentence: 1,of,O
2,Sentence: 1,demonstrators,O
3,Sentence: 1,have,O
4,Sentence: 1,marched,O
5,Sentence: 1,through,O
6,Sentence: 1,London,B-geo
7,Sentence: 1,to,O
8,Sentence: 1,protest,O
9,Sentence: 1,the,O


Now we can see from the above result that the same sentence words have same sentence id.

In [13]:
tags = list(set(data["Tag"].values))
n_tags = len(tags)
n_tags

17

In [14]:
print(tags)

['I-tim', 'I-art', 'B-art', 'B-eve', 'B-gpe', 'B-org', 'B-tim', 'B-nat', 'I-per', 'I-nat', 'O', 'B-per', 'I-geo', 'I-gpe', 'I-eve', 'I-org', 'B-geo']


In [15]:
words = set(list(data['Word'].values))
words.add('dummy')
n_words = len(words)
n_words

35179

## Import Glove embeddings

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer
MAX_NB_WORDS = 20000
t = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, document_count=0)
# t = Tokenizer(num_words=MAX_NB_WORDS) 
t.fit_on_texts(data['Word'])

In [17]:
vocab_size = n_words
# load the whole embedding into memory
embeddings_index = dict()
f = open('./glove.6B.100d.txt')

for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100))


for word, i in t.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

Loaded 400000 word vectors.


### Groupby sentences and combining words and tags for each setence using groupby and apply on dataframe

In [0]:
combining_words_tags = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),s["Tag"].values.tolist())]
d = data.groupby("Sentence #").apply(combining_words_tags)

In [0]:
sentences = [s for s in d]

In [20]:
sentences[0]

[('Thousands', 'O'),
 ('of', 'O'),
 ('demonstrators', 'O'),
 ('have', 'O'),
 ('marched', 'O'),
 ('through', 'O'),
 ('London', 'B-geo'),
 ('to', 'O'),
 ('protest', 'O'),
 ('the', 'O'),
 ('war', 'O'),
 ('in', 'O'),
 ('Iraq', 'B-geo'),
 ('and', 'O'),
 ('demand', 'O'),
 ('the', 'O'),
 ('withdrawal', 'O'),
 ('of', 'O'),
 ('British', 'B-gpe'),
 ('troops', 'O'),
 ('from', 'O'),
 ('that', 'O'),
 ('country', 'O'),
 ('.', 'O')]

In [21]:
print(len(sentences))
sentences = sentences[:3200]

47959


### Map words and tags to integers

In [22]:
words2index = {w:i for i,w in enumerate(words)}
tags2index = {t:i for i,t in enumerate(tags)}
print(words2index['India'])
print(tags2index['B-geo'])

8463
16


#### Make all sentences equal length by appending a `dummy` token at the end of the sentence if the sentence is short. And if the sentence is long consider only `max_length` number of words from that sentence. 

In [0]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence
import numpy as np

data1 = np.zeros((len(sentences), 30), dtype='int32')

In [24]:
max_len = 30
X = [[w[0]for w in s] for s in sentences]
new_X = []
for seq in X:
    new_seq = []
    for i in range(max_len):
        try:
            new_seq.append(seq[i])
        except:
            new_seq.append("dummy")
    new_X.append(new_seq)
new_X[0]

['Thousands',
 'of',
 'demonstrators',
 'have',
 'marched',
 'through',
 'London',
 'to',
 'protest',
 'the',
 'war',
 'in',
 'Iraq',
 'and',
 'demand',
 'the',
 'withdrawal',
 'of',
 'British',
 'troops',
 'from',
 'that',
 'country',
 '.',
 'dummy',
 'dummy',
 'dummy',
 'dummy',
 'dummy',
 'dummy']

In [0]:
for i, sentence in enumerate(new_X):
    for j, word in enumerate(sentence):
        # print(word)
        wordTokens = text_to_word_sequence(word)
        # print(wordTokens)
        if len(wordTokens):
            word = wordTokens[0]
            # print(word)
            if word == "dummy":
              data1[i,j] = 0
            else:
              if(j < max_len and t.word_index[word] < MAX_NB_WORDS):
                  data1[i,j] = t.word_index[word]
        else:
            data1[i,j] = 0

#### Similarly pad labels with `O` tag

In [26]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
y = [[tags2index[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tags2index["O"])
y[0]

array([10, 10, 10, 10, 10, 10, 16, 10, 10, 10, 10, 10, 16, 10, 10, 10, 10,
       10,  4, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10], dtype=int32)

### Split the dataset into train and test sets

In [0]:
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(data1, y, test_size=0.2, random_state=10)

In [28]:
len(X_tr)

2560

In [29]:
np.array(X_tr).shape

(2560, 30)

In [30]:
np.array(y_tr).shape

(2560, 30)

In [0]:
batch_size = 32

## Model

In [0]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda, Flatten, Input, Add
from tensorflow.keras import backend as K

In [0]:
input_text = Input(shape=(max_len,), dtype=tensorflow.int64)
embedding = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=True, input_length=max_len)(input_text)
# embedding = Lambda(ElmoEmbedding, output_shape=(max_len, 1024))(e)
x = Bidirectional(LSTM(units=50, return_sequences=True, dropout=0.2))(embedding)
# x = Bidirectional(LSTM(units=50, return_sequences=True))(x)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(x)

In [0]:
def custom_sparse_categorical_accuracy(y_true, y_pred):
    return K.cast(K.equal(K.max(y_true, axis=-1),
                          K.cast(K.argmax(y_pred, axis=-1), K.floatx())),
                  K.floatx())

In [0]:
model = Model(input_text, out)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=[custom_sparse_categorical_accuracy])

In [0]:
# y_tr = (np.arange(y_tr.max()+1) == y_tr[...,None]).astype(int)
y_tr = y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)

In [0]:
y_te = y_te.reshape(y_te.shape[0], y_te.shape[1], 1)

In [38]:
y_tr.shape

(2560, 30, 1)

In [39]:
y_te.shape

(640, 30, 1)

In [40]:
X_tr[1]

array([ 351, 6047, 2032,  229,    2,  115,  247,   79, 2926, 1222,  289,
        141,  382,    4,    5,   77,    3,  499,  676, 1334,   22,  115,
        247,    0,    6,   79, 1435,    7, 1242,    3], dtype=int32)

In [41]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 30)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 30, 100)           3517900   
_________________________________________________________________
bidirectional (Bidirectional (None, 30, 100)           60400     
_________________________________________________________________
time_distributed (TimeDistri (None, 30, 17)            1717      
Total params: 3,580,017
Trainable params: 3,580,017
Non-trainable params: 0
_________________________________________________________________


In [42]:
history = model.fit(np.array(X_tr), y_tr, batch_size=32, epochs=20, verbose=1)

Train on 2560 samples


In [43]:
!pip install seqeval

Collecting seqeval
  Downloading https://files.pythonhosted.org/packages/34/91/068aca8d60ce56dd9ba4506850e876aba5e66a6f2f29aa223224b50df0de/seqeval-0.0.12.tar.gz
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-0.0.12-cp36-none-any.whl size=7424 sha256=150c4e294fa114638e9bd26324e3caf2e96f316a3a5a2be734a220dfce726bc7
  Stored in directory: /root/.cache/pip/wheels/4f/32/0a/df3b340a82583566975377d65e724895b3fad101a3fb729f68
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-0.0.12


In [44]:
np.array(X_te[:]).shape

(640, 30)

In [0]:
idx2tag = {i: w for w, i in tags2index.items()}

In [46]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

test_pred = model.predict(np.array(X_te[:32*10]), verbose=1)

idx2tag = {i: w for w, i in tags2index.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("dummy", "O"))
        out.append(out_i)
    return out



In [0]:
def test2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            out_i.append(idx2tag[p[0]].replace("dummy", "O"))
        out.append(out_i)
    return out

In [0]:
pred_labels = pred2label(test_pred)

In [49]:
test_labels = test2label(y_te[:32*10])
print(classification_report(test_labels, pred_labels))

           precision    recall  f1-score   support

      geo       1.00      0.00      0.01       242
      per       0.00      0.00      0.00        95
      org       0.00      0.00      0.00       141
      gpe       0.00      0.00      0.00        97
      nat       0.00      0.00      0.00         2
      tim       0.00      0.00      0.00       138
      art       0.00      0.00      0.00         4
      eve       0.00      0.00      0.00         2

micro avg       1.00      0.00      0.00       721
macro avg       0.34      0.00      0.00       721



The score can be increased by considering full train data and increasing the epochs.

### To visualize tags on text

In [50]:
!pip install ipymarkup

Collecting ipymarkup
  Downloading https://files.pythonhosted.org/packages/d8/29/eaa1bcf649d6333dea829c05577c67f881d0555b6d77c1da72afda5c847d/ipymarkup-0.5.0-py2.py3-none-any.whl
Installing collected packages: ipymarkup
Successfully installed ipymarkup-0.5.0


In [0]:
from ipymarkup import show_box_markup
from ipymarkup.palette import palette, BLUE, RED, GREEN, ORANGE, PURPLE

In [52]:
test = new_X[2]
print(test)

# j = ''
# for i in test:
#   j = j + idx2tag[i]
# print(j) 
text = ' '.join(test)

spans = []
current_pos2 = 0
for index, i in enumerate(test):
  if index<max_len and i !="dummy":
    if len(idx2tag[y_te[2][index][0]].split('-'))>1:
      tag = idx2tag[y_te[2][index][0]].split('-')[1]
    else:
      tag = idx2tag[y_te[2][index][0]].split('-')[0]
    
    current_pos1 = current_pos2
    current_pos2 += len(i)+1
    if current_pos2 > current_pos1:
      spans.append( (current_pos1, current_pos2, tag) )

['Helicopter', 'gunships', 'Saturday', 'pounded', 'militant', 'hideouts', 'in', 'the', 'Orakzai', 'tribal', 'region', ',', 'where', 'many', 'Taliban', 'militants', 'are', 'believed', 'to', 'have', 'fled', 'to', 'avoid', 'an', 'earlier', 'military', 'offensive', 'in', 'nearby', 'South']


In [53]:
spans

[(0, 11, 'O'),
 (11, 20, 'O'),
 (20, 29, 'O'),
 (29, 37, 'O'),
 (37, 46, 'O'),
 (46, 55, 'O'),
 (55, 58, 'O'),
 (58, 62, 'O'),
 (62, 70, 'O'),
 (70, 77, 'O'),
 (77, 84, 'O'),
 (84, 86, 'tim'),
 (86, 92, 'O'),
 (92, 97, 'O'),
 (97, 105, 'O'),
 (105, 115, 'O'),
 (115, 119, 'O'),
 (119, 128, 'O'),
 (128, 131, 'geo'),
 (131, 136, 'O'),
 (136, 141, 'O'),
 (141, 144, 'O'),
 (144, 150, 'O'),
 (150, 153, 'gpe'),
 (153, 161, 'O'),
 (161, 170, 'O'),
 (170, 180, 'O'),
 (180, 183, 'O'),
 (183, 190, 'O'),
 (190, 196, 'O')]

In [54]:
show_box_markup(text, spans, palette=palette(tim=BLUE, geo=RED, gpe=ORANGE, O=PURPLE))