In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

## Parameters

In [2]:
class Config:
    # Data loading params
    max_sentence_length = 40
    dev_sample_percentage = 0.1
    
    # Embeddings
    embedding_dim = 300
    
    # RNN
    hidden_size = 300
    
    # Training parameters
    batch_size = 40
    num_epochs = 20
    display_every = 500
    evaluate_every = 1000
    num_checkpoints = 5
    learning_rate = 0.001
    decay_rate = 0.9
    
    # Testing parameters
    checkpoint_dir = ''
    
    UNK = "$UNK$"
    NUM = "$NUM$"
    NONE = "O"
    PAD = '$PAD$'

## Dataset 

Load annotated corpus for named entity recognition

https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus

In [3]:
import nltk
import re
import os

class Dataset:
    def __init__(self):
        self.all_tags, self.all_words = [], [] 
        
    def processing_word(self, word):
        word = word.lower()
        if word.isdigit():
            word = Config.NUM
        return word
        
    def load_dataset(self, path):
        words_col, tags_col = [], []
        with open(path) as f:
            words, tags = [], []
            for line in f:
                line = line.strip()
                if (len(line) == 0 or line.startswith("-DOCSTART-")):
                    if len(words) != 0:
                        words_col.append(' '.join(words))
                        tags_col.append(' '.join(tags))
                        words, tags = [], []
                else:
                    ls = line.split(' ')
                    word, tag = ls[0],ls[3]
                    word = self.processing_word(word)
                    
                    words.append(word)
                    tags.append(tag)
                    
                    self.all_words.append(word)
                    self.all_tags.append(tag)
                    
        return pd.DataFrame({'words': words_col, 'tags': tags_col})
        
    def download_and_load_datasets(self):
        self.all_tags, self.all_words = [], [] 
        
        dataset = tf.keras.utils.get_file(
          fname="CoNLL-2003.zip", 
          origin="https://s3.ap-northeast-2.amazonaws.com/bowbowbow-storage/dataset/CoNLL-2003.zip", 
          extract=True)
        
        dir_path = os.path.join(os.path.dirname(dataset), 'CoNLL-2003')
        train_df = self.load_dataset(os.path.join(dir_path, 'eng.train'))
        dev_df = self.load_dataset(os.path.join(dir_path, 'eng.testa'))
        test_df = self.load_dataset(os.path.join(dir_path, 'eng.testb'))
        return train_df, dev_df, test_df

dataset = Dataset()
train_df, dev_df, test_df = dataset.download_and_load_datasets()
train_df.head(10)

Unnamed: 0,tags,words
0,I-ORG O I-MISC O O O I-MISC O O,eu rejects german call to boycott british lamb .
1,I-PER I-PER,peter blackburn
2,I-LOC O,brussels 1996-08-22
3,O I-ORG I-ORG O O O O O O I-MISC O O O O O I-M...,the european commission said on thursday it di...
4,I-LOC O O O O I-ORG I-ORG O O O I-PER I-PER O ...,germany 's representative to the european unio...
5,O O O O O O O O O O O O O O O O O O O O I-ORG ...,""" we do n't support any such recommendation be..."
6,O O O O O O O O O O O O O O O O O O O O O O I-...,he said further scientific study was required ...
7,O O O O O O O I-ORG O O I-PER I-PER O O O O O ...,he said a proposal last month by eu farm commi...
8,I-PER O I-MISC O O O O I-LOC O I-LOC O O O O O...,fischler proposed eu-wide measures after repor...
9,O I-PER O O O O O O O I-ORG O O O O O O O O O ...,but fischler agreed to review his proposal aft...


In [4]:
word_list = list(set(dataset.all_words)) + [Config.PAD, Config.UNK]
word2idx = {w: i for i, w in enumerate(word_list)}
idx2word = {i: w for i, w in enumerate(word_list)}

tag_list = list(set(dataset.all_tags))
tag2idx = {w: i for i, w in enumerate(tag_list)}
idx2tag = {i: w for i, w in enumerate(tag_list)}


def get_data(df):
    x, lengths, y = [], [], []
    for index, row in train_df.iterrows():
        sentence = row['words'].split(' ')
        tags = row['tags'].split(' ')

        sentence = sentence[:Config.max_sentence_length]
        tags = tags[:Config.max_sentence_length]

        lengths.append(Config.max_sentence_length)
        x_row, y_row = [], []
        for word in sentence:
            x_row.append(word2idx[word])
        for tag in tags:
            y_row.append(tag2idx[tag])

        if len(sentence) < Config.max_sentence_length:
            lengths[-1] = len(sentence)
            x_row += [word2idx[Config.PAD]]* (Config.max_sentence_length - len(sentence))
            y_row += [tag2idx[Config.NONE]]* (Config.max_sentence_length - len(sentence))
        
        x.append(x_row)
        y.append(y_row)
        
    x = np.array(x)
    y = np.array(y)
    lengths = np.array(lengths)
    return x, y, lengths
    

In [5]:
x_train, y_train, lengths_train = get_data(train_df)
x_dev, y_dev, lengths_dev = get_data(dev_df)
x_test, y_test, lengths_test = get_data(test_df)

## Model

In [6]:
class Model:
    def __init__(self, 
               sequence_length, 
               num_classes, 
               vocab_size, 
               embedding_size, 
               hidden_size):
        
        self.input_x = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_x')
        self.input_y = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_y')
        self.sequence_lengths = tf.placeholder(tf.int32, shape=[None], name="sequence_lengths")
        self.dropout = tf.placeholder(dtype=tf.float32, shape=[],name="dropout")
        
        initializer = tf.contrib.layers.xavier_initializer()
        
        # Embedding layer
        with tf.variable_scope('text-embedding'):
            self.W_text = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -0.25, 0.25), name='W_text', trainable=False)
            self.embedded_chars = tf.nn.embedding_lookup(self.W_text, self.input_x) # [batch_size, sequence_length, embedding_size]
            
        # Bidirectional LSTM
        with tf.variable_scope("bi-lstm"):
            fw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size)
            bw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size)
            (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell,
                                                                  cell_bw=bw_cell,
                                                                  inputs=self.embedded_chars,
                                                                  sequence_length= self.sequence_lengths, # [batch_size],
                                                                  dtype=tf.float32)
            
            self.rnn_outputs = tf.concat([output_fw, output_bw], axis=-1)  # [batch_size, sequence_length, 2*hidden_size]
            self.rnn_outputs = tf.nn.dropout(self.rnn_outputs, self.dropout)
    
    
        # Fully connected layer
        with tf.variable_scope('output'):
            self.W_output = tf.get_variable('W_output', shape=[2*hidden_size, num_classes],  dtype=tf.float32)
            self.b_output = tf.get_variable('b_output', shape=[num_classes], dtype=tf.float32, initializer=tf.zeros_initializer())
            
            rnn_outputs_flat = tf.reshape(self.rnn_outputs, [-1, 2*hidden_size])
            pred = tf.matmul(rnn_outputs_flat, self.W_output) + self.b_output
            
            self.logits = tf.reshape(pred, [-1, sequence_length, num_classes]) # [batch_size, sequence_length, num_classes]
    
        # Calculate mean corss-entropy loss
        with tf.variable_scope('loss'):
            self.losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
            mask = tf.sequence_mask(self.sequence_lengths)
            losses = tf.boolean_mask(self.losses, mask)
            
            self.loss = tf.reduce_mean(losses) 
        
        # Accuracy    
        with tf.name_scope('accuracy'):
            self.predictions = tf.argmax(self.logits, 2, name='predictions')
            correct_predictions = tf.equal(self.predictions, tf.cast(self.input_y, tf.int64))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name='accuracy')
    
    # Length of the sequence data
    @staticmethod
    def _length(seq):
        relevant = tf.sign(tf.abs(seq))
        length = tf.reduce_sum(relevant, reduction_indices=1)
        length = tf.cast(length, tf.int32)
        return length

## Function for train

In [7]:
# Pre-trained glove
def load_glove(embedding_dim, word2idx):
    download_path = tf.keras.utils.get_file(
      fname="glove.6B.zip", 
      origin="http://nlp.stanford.edu/data/glove.6B.zip", 
      extract=True)
    
    embedding_path = os.path.join(os.path.dirname(download_path), 'glove.6B.300d.txt')
    print('embedding_path :', embedding_path)

    # initial matrix with random uniform
    initW = np.random.randn(len(word2idx), embedding_dim).astype(np.float32) / np.sqrt(len(word2idx))
    # load any vectors from the glove
    print("Load glove file {0}".format(embedding_path))
    f = open(embedding_path, 'r', encoding='utf8')
    for line in f:
        splitLine = line.split(' ')
        word = splitLine[0]
        embedding = np.asarray(splitLine[1:], dtype='float32')
        if word in word2idx:
            initW[word2idx[word]] = embedding
    return initW

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1
    for epoch in tqdm(range(num_epochs)):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]

def evaluation(y, preds, lengths):
    from sklearn.metrics import classification_report
    arg_answer, arg_pred = [], []
    for i in range(len(y)):
        for j in range(lengths[i]):
            arg_answer.append(idx2tag[y[i][j]])
            arg_pred.append(idx2tag[preds[i][j]])

    print(classification_report(arg_answer, arg_pred))

## Train

In [8]:
import datetime
import time

from sklearn.metrics import f1_score
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

tf.reset_default_graph()
sess = tf.Session()
with sess.as_default():
    model = Model(
        sequence_length=x_train.shape[1],
        num_classes=len(tag_list),
        vocab_size=len(word_list),
        embedding_size=Config.embedding_dim,
        hidden_size=Config.hidden_size
    )
    
    global_step = tf.Variable(0, name='global_step', trainable=False)
    train_op = tf.train.AdamOptimizer(Config.learning_rate).minimize(model.loss, global_step=global_step)
    
    # Output directory for models and summary
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "30.runs", timestamp))
    print("Writing to {}\n".format(out_dir))

    sess.run(tf.global_variables_initializer())

    pretrain_W = load_glove(Config.embedding_dim, word2idx)
    sess.run(model.W_text.assign(pretrain_W))
    print("Success to load pre-trained glove model!\n")
    
    # Generate batches
    batches = batch_iter(list(zip(x_train, lengths_train, y_train)), Config.batch_size, Config.num_epochs)
    
    for batch in batches:
        x_batch, lengths_batch, y_batch = zip(*batch)

        # Train
        feed_dict = {
            model.input_x: x_batch,
            model.input_y: y_batch,
            model.sequence_lengths: lengths_batch,
            model.dropout: 0.5,
        }

        _, step, loss, accuracy, predictions = sess.run(
            [train_op, global_step, model.loss, model.accuracy, model.predictions], feed_dict)

        
        # Training log display
        if step % Config.display_every == 0:
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            
        # Evaluation
        if step % Config.evaluate_every == 0:
            print("\nDev Evaluation:")
            feed_dict = {
                model.input_x: x_dev,
                model.input_y: y_dev,
                model.sequence_lengths: lengths_dev,
                model.dropout: 1.0,
            }
            loss, accuracy, predictions = sess.run(
                [model.loss, model.accuracy, model.predictions], feed_dict)
            evaluation(y_dev, predictions, lengths_dev)

            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:.4f}\n".format(time_str, step, loss, accuracy))


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Writing to /home/seungwon/project/tf-notes/30.runs/1558023759

embedding_path : /home/seungwon/.keras/datasets/glove.6B.300d.txt
Load glove file /home/seungwon/.keras/datasets/glove.6B.300d.txt


  0%|          | 0/20 [00:00<?, ?it/s]

Success to load pre-trained glove model!



  5%|▌         | 1/20 [00:11<03:45, 11.87s/it]

2019-05-17T01:23:36.239300: step 500, loss 0.179014, acc 0.985625


 10%|█         | 2/20 [00:23<03:31, 11.77s/it]

2019-05-17T01:23:52.606882: step 1000, loss 0.0815813, acc 0.9925

Dev Evaluation:
              precision    recall  f1-score   support

       B-LOC       0.60      0.27      0.37        11
      B-MISC       0.00      0.00      0.00        37
       B-ORG       1.00      1.00      1.00        24
       I-LOC       0.96      0.91      0.93      8225
      I-MISC       0.92      0.79      0.85      4530
       I-ORG       0.92      0.82      0.87      9926
       I-PER       0.98      0.96      0.97     11031
           O       0.98      1.00      0.99    167606

   micro avg       0.98      0.98      0.98    201390
   macro avg       0.80      0.72      0.75    201390
weighted avg       0.98      0.98      0.98    201390

2019-05-17T01:23:54.969905: step 1000, loss 0.0750772, acc 0.9919



 20%|██        | 4/20 [00:48<03:14, 12.16s/it]

2019-05-17T01:24:11.401229: step 1500, loss 0.751189, acc 0.99375


 25%|██▌       | 5/20 [01:00<02:59, 11.99s/it]

2019-05-17T01:24:27.892104: step 2000, loss 0.0867381, acc 0.985

Dev Evaluation:
              precision    recall  f1-score   support

       B-LOC       1.00      0.64      0.78        11
      B-MISC       1.00      0.14      0.24        37
       B-ORG       1.00      1.00      1.00        24
       I-LOC       0.97      0.97      0.97      8225
      I-MISC       0.98      0.87      0.92      4530
       I-ORG       0.96      0.92      0.94      9926
       I-PER       0.98      0.99      0.99     11031
           O       0.99      1.00      1.00    167606

   micro avg       0.99      0.99      0.99    201390
   macro avg       0.99      0.82      0.85    201390
weighted avg       0.99      0.99      0.99    201390

2019-05-17T01:24:30.179855: step 2000, loss 0.0352761, acc 0.9962



 35%|███▌      | 7/20 [01:25<02:39, 12.27s/it]

2019-05-17T01:24:46.623662: step 2500, loss 0.360474, acc 0.9975


 40%|████      | 8/20 [01:37<02:24, 12.07s/it]

2019-05-17T01:25:03.129294: step 3000, loss 0.0126071, acc 0.99875

Dev Evaluation:
              precision    recall  f1-score   support

       B-LOC       1.00      0.82      0.90        11
      B-MISC       0.94      0.43      0.59        37
       B-ORG       1.00      1.00      1.00        24
       I-LOC       0.99      0.99      0.99      8225
      I-MISC       0.98      0.97      0.97      4530
       I-ORG       0.99      0.97      0.98      9926
       I-PER       1.00      0.99      1.00     11031
           O       1.00      1.00      1.00    167606

   micro avg       1.00      1.00      1.00    201390
   macro avg       0.99      0.90      0.93    201390
weighted avg       1.00      1.00      1.00    201390

2019-05-17T01:25:05.373110: step 3000, loss 0.0146638, acc 0.9986



 45%|████▌     | 9/20 [01:51<02:18, 12.60s/it]

2019-05-17T01:25:21.863307: step 3500, loss 0.0230563, acc 0.9975


 55%|█████▌    | 11/20 [02:14<01:48, 12.09s/it]

2019-05-17T01:25:38.321058: step 4000, loss 0.182173, acc 0.99875

Dev Evaluation:
              precision    recall  f1-score   support

       B-LOC       1.00      0.82      0.90        11
      B-MISC       1.00      0.57      0.72        37
       B-ORG       1.00      1.00      1.00        24
       I-LOC       0.99      0.99      0.99      8225
      I-MISC       0.99      0.99      0.99      4530
       I-ORG       0.99      0.97      0.98      9926
       I-PER       1.00      1.00      1.00     11031
           O       1.00      1.00      1.00    167606

   micro avg       1.00      1.00      1.00    201390
   macro avg       1.00      0.92      0.95    201390
weighted avg       1.00      1.00      1.00    201390

2019-05-17T01:25:40.596633: step 4000, loss 0.00980013, acc 0.9991



 60%|██████    | 12/20 [02:28<01:40, 12.61s/it]

2019-05-17T01:25:57.027431: step 4500, loss 0.136358, acc 0.998125


 70%|███████   | 14/20 [02:51<01:12, 12.10s/it]

2019-05-17T01:26:13.517254: step 5000, loss 0.00424521, acc 0.999375

Dev Evaluation:
              precision    recall  f1-score   support

       B-LOC       1.00      0.91      0.95        11
      B-MISC       1.00      0.86      0.93        37
       B-ORG       1.00      1.00      1.00        24
       I-LOC       1.00      1.00      1.00      8225
      I-MISC       1.00      0.99      0.99      4530
       I-ORG       0.99      0.99      0.99      9926
       I-PER       1.00      1.00      1.00     11031
           O       1.00      1.00      1.00    167606

   micro avg       1.00      1.00      1.00    201390
   macro avg       1.00      0.97      0.98    201390
weighted avg       1.00      1.00      1.00    201390

2019-05-17T01:26:15.786567: step 5000, loss 0.00464551, acc 0.9996



 75%|███████▌  | 15/20 [03:05<01:03, 12.62s/it]

2019-05-17T01:26:32.218181: step 5500, loss 0.00488206, acc 0.99875


 85%|████████▌ | 17/20 [03:28<00:36, 12.10s/it]

2019-05-17T01:26:48.681353: step 6000, loss 0.0114078, acc 0.99875

Dev Evaluation:
              precision    recall  f1-score   support

       B-LOC       1.00      1.00      1.00        11
      B-MISC       0.97      0.89      0.93        37
       B-ORG       1.00      1.00      1.00        24
       I-LOC       1.00      1.00      1.00      8225
      I-MISC       1.00      1.00      1.00      4530
       I-ORG       1.00      0.99      1.00      9926
       I-PER       1.00      1.00      1.00     11031
           O       1.00      1.00      1.00    167606

   micro avg       1.00      1.00      1.00    201390
   macro avg       1.00      0.98      0.99    201390
weighted avg       1.00      1.00      1.00    201390

2019-05-17T01:26:50.971277: step 6000, loss 0.00300757, acc 0.9997



 90%|█████████ | 18/20 [03:42<00:25, 12.62s/it]

2019-05-17T01:27:07.420553: step 6500, loss 0.00349693, acc 1


 95%|█████████▌| 19/20 [03:54<00:12, 12.32s/it]

2019-05-17T01:27:23.876156: step 7000, loss 0.00711772, acc 0.999375

Dev Evaluation:
              precision    recall  f1-score   support

       B-LOC       1.00      1.00      1.00        11
      B-MISC       0.97      0.95      0.96        37
       B-ORG       1.00      1.00      1.00        24
       I-LOC       1.00      1.00      1.00      8225
      I-MISC       1.00      1.00      1.00      4530
       I-ORG       1.00      0.99      1.00      9926
       I-PER       1.00      1.00      1.00     11031
           O       1.00      1.00      1.00    167606

   micro avg       1.00      1.00      1.00    201390
   macro avg       1.00      0.99      0.99    201390
weighted avg       1.00      1.00      1.00    201390

2019-05-17T01:27:26.168493: step 7000, loss 0.00233567, acc 0.9998



100%|██████████| 20/20 [04:07<00:00, 12.78s/it]


In [15]:
print("\nTest Evaluation:")
predictions = sess.run([model.predictions], feed_dict={
    model.input_x: x_test,
    model.input_y: y_test,
    model.sequence_lengths: lengths_test,
    model.dropout: 1.0,
})
evaluation(y_test, predictions[0], lengths_test)


Test Evaluation:
              precision    recall  f1-score   support

       B-LOC       1.00      1.00      1.00        11
      B-MISC       1.00      0.92      0.96        37
       B-ORG       1.00      1.00      1.00        24
       I-LOC       1.00      1.00      1.00      8225
      I-MISC       1.00      1.00      1.00      4530
       I-ORG       1.00      0.99      1.00      9926
       I-PER       1.00      1.00      1.00     11031
           O       1.00      1.00      1.00    167606

   micro avg       1.00      1.00      1.00    201390
   macro avg       1.00      0.99      0.99    201390
weighted avg       1.00      1.00      1.00    201390



## Tensorboard

```
tensorboard --logdir=./30.runs --host 0.0.0.0
```