# 0. Configuration

In [1]:
from silence_tensorflow import silence_tensorflow
silence_tensorflow()

import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import os
import numpy as np
import pickle as pk
import itertools
from copy import deepcopy
from collections import defaultdict

from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Input
from keras.layers import Dense, Bidirectional, LSTM, TimeDistributed
from keras_contrib.layers import CRF
from keras.utils import to_categorical

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


# 1. Data Preparation

## 1.1 Labels

In [3]:
label_dict = {'NON': 0,     #None
              'PER': 1,     #PERSON
              'FOD': 2,}    #FOOD

In [4]:
label_list = deepcopy([l for l in label_dict.keys()])

cnt = deepcopy(len(label_dict))
label_dict['__PAD__'] = cnt
label_dict['__UNK__'] = cnt+1
        
label2id = label_dict
id2label = {int(i): str(l) for i, l in enumerate(label_dict.keys())}

## 1.2 Sentences

In [8]:
data_sents = {'sent1': 'Sam likes pizza',
              'sent2': 'Erik eats pizza',
              'sent3': 'Erik and Sam are drinking soda',
              'sent4': 'Flora cooks chicken',
              'sent5': 'Sam ordered a chicken',
              'sent6': 'Flora likes chicken sandwitch',
              'sent7': 'Erik likes to drink soda'}
data_labels = {'sent1': [1, 0, 2],
               'sent2': [1, 0, 2],
               'sent3': [1, 0, 1, 0, 0, 2],
               'sent4': [1, 0, 2],
               'sent5': [1, 0, 0, 2],
               'sent6': [1, 0, 2, 2],
               'sent7': [1, 0, 0, 0, 2]}

In [9]:
docs = []
for tag, sent in data_sents.items():
    words = [str(w) for w in sent.split()]
    labels = data_labels[tag]
    docs.append((tag, words, labels))

In [11]:
words = list(set(itertools.chain(*[doc[1] for doc in docs])))
words.append('__PAD__')
words.append('__UNK__')

word2id = {w: i for i, w in enumerate(words)}
id2word = {i: w for i, w in enumerate(words)}

## 1.3 Parameters

In [15]:
max_sent_len = 10
test_size = 0.3

# 2. Corpus

## 2.1 Sentence Padding

In [16]:
X_words = []
Y_labels = []
for doc in docs:
    X_words.append([word2id[w] for w in doc[1]])
    Y_labels.append(doc[2])

X_words_pad = pad_sequences(
    maxlen=max_sent_len,
    sequences=X_words,
    padding='post',
    value=word2id['__PAD__'])
Y_labels_pad = pad_sequences(
    maxlen=max_sent_len,
    sequences=Y_labels,
    padding='post',
    value=label2id['__PAD__'])

## 2.2 Word Embedding

In [24]:
docs_for_w2v = [doc[1] for doc in docs]
w2v_model = Word2Vec(sentences=docs_for_w2v,
                     size=100,
                     window=5,
                     min_count=0,
                     iter=5)

In [26]:
word2vector = {w: w2v_model.wv[w] for w in w2v_model.wv.vocab.keys()}
feature_size = w2v_model.vector_size

In [30]:
word2vector['__PAD__'] = np.zeros(feature_size)
word2vector['__UNK__'] = np.zeros(feature_size)

X_embedded = np.zeros((len(docs), max_sent_len, feature_size))
Y_embedded = np.zeros((len(docs), max_sent_len, len(label2id)))

for i in range(len(docs)):
    for j, word_id in enumerate(X_words_pad[i]):
        Y_embedded[i] = to_categorical(Y_labels_pad[i], num_classes=(len(label2id)))
        for k in range(feature_size):
            word = id2word[word_id]
            X_embedded[i, j, k] = word2vector[word][k]

## 2.3 Dataset

In [34]:
X_train, X_test, Y_train, Y_test = train_test_split(X_embedded, Y_embedded, test_size=test_size)

# 3. Model Development

## 3.1 Initialization

In [39]:
_input = Input(shape=(max_sent_len, feature_size))
ner_model = Bidirectional(LSTM(units=512,
                           return_sequences=True,
                           recurrent_dropout=0.2))(_input)
ner_model = TimeDistributed(Dense(units=100,
                              activation='relu'))(ner_model)
crf = CRF(len(label2id))
_output = crf(ner_model)

ner_model = Model(inputs=_input, outputs=_output)
ner_model.compile(optimizer='rmsprop',
                  loss=crf.loss_function,
                  metrics=[crf.accuracy])

## 3.2 Training

In [43]:
ner_model.fit(x=X_train,
              y=Y_train,
              batch_size=1,
              epochs=100,
              validation_split=0.1,
              verbose=True)

Train on 3 samples, validate on 1 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100


Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100


Epoch 100/100


<keras.callbacks.History at 0x7f17a0262950>

## 3.3 Evaluation

In [46]:
def pred2labels(sents, prediction):
    pred_labels = []
    for sent, pred in zip(sents, prediction):
        try:
            sent_len = np.where(sent==word2id['__PAD__'])[0][0]
        except:
            sent_len = max_sent_len

        labels = []
        for i in range(sent_len):
            labels.append(id2label[np.argmax(pred[i])])
        pred_labels.append(labels)
    return pred_labels

In [48]:
matrix_size = len(label_list)
confusion_matrix = np.zeros((matrix_size+1, matrix_size+1), dtype='int64')

prediction = ner_model.predict(X_test)
pred_labels = pred2labels(X_test, prediction)
test_labels = pred2labels(Y_test, Y_test)

for i in range(len(pred_labels)):
    for j, pred in enumerate(pred_labels[i]):
        row = label2id[test_labels[i][j]]
        col = label2id[pred]
        confusion_matrix[row, col] += 1

for i in range(matrix_size):
    confusion_matrix[i, matrix_size] = sum(confusion_matrix[i, 0:matrix_size])
    confusion_matrix[matrix_size, i] = sum(confusion_matrix[0:matrix_size, i])

confusion_matrix[matrix_size, matrix_size] = sum(confusion_matrix[matrix_size, 0:matrix_size])
confusion_matrix

array([[ 4,  0,  1,  5],
       [ 1,  3,  0,  4],
       [ 0,  0,  2,  2],
       [ 5,  3,  3, 11]])

In [51]:
def get_f1_score(p, r):
    if p != 0 or r != 0:
        return (2*p*r)/(p+r)
    else:
        return 0

In [52]:
f1_score_list = []
matrix_size = len(confusion_matrix)
for i in range(matrix_size):
    corr = confusion_matrix[i, i]
    pred = confusion_matrix[matrix_size-1, i]
    real = confusion_matrix[i, matrix_size-1]

    precision = corr/max(pred, 1)
    recall = corr/max(real, 1)
    f1_score_list.append(get_f1_score(p=precision, r=recall))

f1_score_average = np.mean(f1_score_list).round(3)

for category, f1_score in zip(label_list, f1_score_list):
    print('|    [{}]: {:.03f}'.format(category, f1_score))

|    [NON]: 0.800
|    [PER]: 0.857
|    [FOD]: 0.800


## 3.4 Save & Load

In [53]:
fpath_model = 'test/ner/model.pk'
ner_model.save(fpath_model)
# fpath_dataset = '{}-dataset.pk'.format(fpath_model.replace('.pk', ''))
# with open(fpath_dataset, 'wb') as f:
#     pk.dump(dataset, f)

OSError: Unable to create file (unable to open file: name = 'test/ner/model.pk', errno = 2, error message = 'No such file or directory', flags = 13, o_flags = 242)

In [45]:
_input = Input(shape=(max_sent_len, feature_size))
ner_model = Bidirectional(LSTM(units=512,
                           return_sequences=True,
                           recurrent_dropout=0.2))(_input)
ner_model = TimeDistributed(Dense(units=100,
                              activation='relu'))(ner_model)
crf = CRF(len(label2id))
_output = crf(ner_model)

ner_model = Model(inputs=_input, outputs=_output)
ner_model.compile(optimizer='rmsprop',
                  loss=crf.loss_function,
                  metrics=[crf.accuracy])

ner_model.load_weights(fpath_model)

# 4. Application

## 4.1 Prediction

In [54]:
new_sent = 'Tom eats apple'

In [55]:
sent_by_id = []
for w in [w.lower() for w in new_sent]:
    if w in word2id.keys():
        sent_by_id.append(word2id[w])
    else:
        sent_by_id.append(word2id['__UNK__'])

sent_pad = pad_sequences(maxlen=max_sent_len, sequences=[sent_by_id], padding='post', value=word2id['__PAD__'])
X_input = np.zeros((1, max_sent_len, feature_size), dtype=list)
for j, w_id in enumerate(sent_pad[0]):
    for k in range(feature_size):
        word = id2word[w_id]
        X_input[0, j, k] = word2vector[word][k]

prediction = ner_model.predict(X_input)
pred_labels = pred2labels(sents=sent_pad, prediction=prediction)[0]

In [58]:
pred_labels

['PER',
 'NON',
 'NON',
 'FOD',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__']

In [59]:
output_sent = []
for (word, label) in zip(new_sent.split(), pred_labels):
    output_sent.append('{}/{}'.format(word, label))
print(' '.join(output_sent))

Tom/PER eats/NON apple/NON
