Keras NER model: Embedding + BiLSTM + FC + CRF

In [0]:
# Numerical / tabular libraries and filesystem access.
import numpy as np 
import pandas as pd 
import os
# Show which files are present in the input directory.
print(os.listdir("../input"))
# Plotting and deep-learning libraries.
import matplotlib.pyplot as plt
import tensorflow as tf
import keras

['ner.csv', 'ner_dataset.csv']


In [0]:
# Read the NER dataset; the file is decoded as latin1.
data = pd.read_csv(
    filepath_or_buffer="../input/ner_dataset.csv",
    encoding="latin1",
)

In [0]:
# Forward-fill missing cells (presumably the per-token gaps in "Sentence #" --
# confirm against the CSV). DataFrame.ffill() is the supported spelling of
# fillna(method="ffill"), which is deprecated in recent pandas releases.
data = data.ffill()

# Vocabulary and tag inventory. The de-duplicated values are sorted so that the
# ordering -- and therefore every index mapping built from these lists -- is
# the same on every kernel restart; plain set iteration order for strings can
# differ between Python processes.
words = sorted(set(data["Word"].values))
n_words = len(words)
tags = sorted(set(data["Tag"].values))
n_tags = len(tags)

In [0]:
class SentenceGetter(object):
    """Group a token-level DataFrame into sentences.

    Each sentence is a list of ``(word, pos, tag)`` tuples built from the
    "Word", "POS" and "Tag" columns of the rows that share one "Sentence #"
    value.

    Attributes:
        n_sent: number of the sentence that ``get_next`` will look up next
            (starts at 1).
        data: the DataFrame passed to the constructor.
        empty: flag initialised to False; this class never changes it.
        grouped: per-sentence result of the groupby/apply, looked up by its
            "Sentence #" label.
        sentences: list of all sentences, in the iteration order of
            ``grouped``.
    """

    def __init__(self, data):
        """Build the per-sentence grouping for ``data``.

        Args:
            data: DataFrame with "Sentence #", "Word", "POS" and "Tag" columns.
        """
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # Zip the three columns of one sentence group into token tuples.
            return list(zip(s["Word"].values.tolist(),
                            s["POS"].values.tolist(),
                            s["Tag"].values.tolist()))

        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the next sentence, or None if it does not exist.

        Looks up the label ``"Sentence: <n_sent>"`` in ``grouped``. On success
        the internal counter is advanced; when the label is missing the
        counter is left unchanged and None is returned.
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # Only a missing sentence label means "no more sentences"; any
            # other error is a real problem and is allowed to propagate.
            return None
        self.n_sent += 1
        return s
        
# Group the token table into sentences once.
getter = SentenceGetter(data)
# All sentences, each a list of (word, POS, tag) tuples.
sentences = getter.sentences
# The first sentence on its own; this call also advances the getter's counter.
sent = getter.get_next()

In [0]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Fixed sequence length for padding. It is defined here because this is the
# first cell that needs it (previously it was only assigned in the later model
# cell, so a fresh Restart & Run All failed with NameError). Keep it equal to
# the `max_len` the model's Input/Embedding layers are built with.
max_len = 80

# Word index: 0 is reserved for padding, 1 for unknown words, real words
# start at 2.
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1
word2idx["PAD"] = 0

idx2word = {i: w for w, i in word2idx.items()}

# Tag index: 0 is reserved for padding, real tags start at 1.
tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0

idx2tag = {i: w for w, i in tag2idx.items()}

# Encode each sentence's words (tuple position 0) and pad to max_len.
X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=word2idx["PAD"])

# Encode each sentence's tags (tuple position 2) and pad to max_len.
y = [[tag2idx[w[2]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["PAD"])

# One-hot encode the tags; n_tags + 1 classes because of the PAD class.
y = [to_categorical(i, num_classes=n_tags + 1) for i in y]

# Fixed random_state so the held-out 10% is the same on every run.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1, random_state=42)

In [0]:
# NOTE(review): this value is reassigned to 512 in the model-configuration cell
# before model.fit reads it, so the 32 set here is never used for training.
batch_size = 32 ################# batch size
import tensorflow as tf
import tensorflow_hub as hub
from keras import backend as K
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
# NOTE(review): keras_contrib is installed by a pip cell further down the
# notebook; on a fresh environment that install has to run before this import.
from keras_contrib.layers import CRF
# Create a TensorFlow session and register it as the session Keras uses.
sess = tf.Session()
K.set_session(sess)

Neural network: setup, model definition, and training

In [0]:
pip install git+https://www.github.com/keras-team/keras-contrib.git

Collecting git+https://www.github.com/keras-team/keras-contrib.git
  Cloning https://www.github.com/keras-team/keras-contrib.git to /tmp/pip-req-build-sh2t50qi
  Running command git clone -q https://www.github.com/keras-team/keras-contrib.git /tmp/pip-req-build-sh2t50qi
Building wheels for collected packages: keras-contrib
  Building wheel for keras-contrib (setup.py) ... [?25ldone
[?25h  Created wheel for keras-contrib: filename=keras_contrib-2.0.8-cp36-none-any.whl size=101066 sha256=ef49863d8be64afd96e68a3affc392981befee77dd1cf28a36bd9d1ef71eb5f1
  Stored in directory: /tmp/pip-ephem-wheel-cache-uevju8wq/wheels/11/27/c8/4ed56de7b55f4f61244e2dc6ef3cdbaff2692527a2ce6502ba
Successfully built keras-contrib
Note: you may need to restart the kernel to use updated packages.


In [0]:
from keras.models import Model, Input
from keras.layers.merge import add
from keras import initializers, regularizers, constraints
from livelossplot.keras import PlotLossesCallback
from keras.layers.normalization import BatchNormalization
from keras_contrib.layers import CRF

In [0]:
# --- Hyperparameters ---
batch_size = 512
epochs = 20
max_len = 80
EMBEDDING = 40

# --- Architecture: Embedding -> BiLSTM -> BiLSTM -> per-timestep Dense -> CRF ---
input_text = Input(shape=(max_len,))

# n_words + 2 embedding rows: one per word plus the PAD and UNK entries.
embedded = Embedding(
    input_dim=n_words+2,
    output_dim=EMBEDDING,
    input_length=max_len,
    mask_zero=True,
)(input_text)

bilstm_wide = Bidirectional(
    LSTM(units=80, return_sequences=True, recurrent_dropout=0.1)
)(embedded)

bilstm_narrow = Bidirectional(
    LSTM(units=50, return_sequences=True, recurrent_dropout=0.1)
)(bilstm_wide)

dense_out = TimeDistributed(Dense(50, activation="relu"))(bilstm_narrow)

# CRF output layer with n_tags + 1 classes (the extra one is PAD).
crf = CRF(n_tags+1)
out = crf(dense_out)

model = Model(inputs=input_text, outputs=out)
model.compile(optimizer="adam", loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_32 (InputLayer)        (None, 80)                0         
_________________________________________________________________
embedding_14 (Embedding)     (None, 80, 40)            1407200   
_________________________________________________________________
bidirectional_94 (Bidirectio (None, 80, 160)           77440     
_________________________________________________________________
bidirectional_95 (Bidirectio (None, 80, 100)           84400     
_________________________________________________________________
time_distributed_21 (TimeDis (None, 80, 50)            5050      
_________________________________________________________________
crf_24 (CRF)                 (None, 80, 18)            1278      
Total params: 1,575,368
Trainable params: 1,575,368
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Fit on the training split, holding out 10% of it for validation;
# verbose=2 prints one summary line per epoch.
history = model.fit(
    x=X_tr,
    y=np.array(y_tr),
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    verbose=2,
)

Train on 38846 samples, validate on 4317 samples
Epoch 1/20
 - 97s - loss: 10.7374 - crf_viterbi_accuracy: 0.7440 - val_loss: 10.3918 - val_crf_viterbi_accuracy: 0.8424
Epoch 2/20
 - 61s - loss: 10.2273 - crf_viterbi_accuracy: 0.8474 - val_loss: 10.2389 - val_crf_viterbi_accuracy: 0.8417
Epoch 3/20
 - 61s - loss: 9.9402 - crf_viterbi_accuracy: 0.8756 - val_loss: 9.9433 - val_crf_viterbi_accuracy: 0.9066
Epoch 4/20
 - 61s - loss: 9.7742 - crf_viterbi_accuracy: 0.9265 - val_loss: 9.8567 - val_crf_viterbi_accuracy: 0.9315
Epoch 5/20
 - 60s - loss: 9.7133 - crf_viterbi_accuracy: 0.9440 - val_loss: 9.8286 - val_crf_viterbi_accuracy: 0.9387
Epoch 6/20
 - 60s - loss: 9.6844 - crf_viterbi_accuracy: 0.9526 - val_loss: 9.8139 - val_crf_viterbi_accuracy: 0.9435
Epoch 7/20
 - 60s - loss: 9.6643 - crf_viterbi_accuracy: 0.9596 - val_loss: 9.7989 - val_crf_viterbi_accuracy: 0.9486
Epoch 8/20
 - 60s - loss: 9.6476 - crf_viterbi_accuracy: 0.9647 - val_loss: 9.7798 - val_crf_viterbi_accuracy: 0.9537
Epo

In [0]:
from sklearn_crfsuite.metrics import flat_classification_report

# Class indices per token: argmax over the last (class) axis for both the
# model output and the one-hot ground truth.
pred_cat = model.predict(X_te)
pred = np.argmax(pred_cat, axis=-1)
y_te_true = np.argmax(y_te, -1)

# Map class indices back to tag strings, one list per sentence.
pred_tag = [list(map(idx2tag.__getitem__, row)) for row in pred]
y_te_true_tag = [list(map(idx2tag.__getitem__, row)) for row in y_te_true]

report = flat_classification_report(y_pred=pred_tag, y_true=y_te_true_tag)
print(report)

              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        33
       B-eve       0.00      0.00      0.00        32
       B-geo       0.82      0.90      0.86      3690
       B-gpe       0.96      0.93      0.95      1567
       B-nat       0.00      0.00      0.00        18
       B-org       0.70      0.70      0.70      2111
       B-per       0.85      0.76      0.80      1638
       B-tim       0.87      0.88      0.88      2056
       I-art       0.00      0.00      0.00        31
       I-eve       0.50      0.03      0.06        33
       I-geo       0.73      0.79      0.76       719
       I-gpe       0.80      0.33      0.47        12
       I-nat       0.00      0.00      0.00         2
       I-org       0.71      0.77      0.74      1800
       I-per       0.87      0.78      0.82      1698
       I-tim       0.74      0.73      0.73       699
           O       0.99      0.99      0.99     88113
         PAD       1.00    