an example from coursera projects

In [1]:

import trax 
from trax import layers as tl
import os 
import numpy as np
import pandas as pd
import random as rnd

## EDA
* geo: geographical entity
* org: organization
* per: person 
* gpe: geopolitical entity
* tim: time indicator
* art: artifact
* eve: event
* nat: natural phenomenon
* O: filler word


In [2]:
def get_vocab(vocab_path, tags_path):
    vocab = {}
    with open(vocab_path) as f:
        for i, l in enumerate(f.read().splitlines()):
            vocab[l] = i  
    vocab['<PAD>'] = len(vocab)
    tag_map = {}
    with open(tags_path) as f:
        for i, t in enumerate(f.read().splitlines()):
            tag_map[t] = i 
    
    return vocab, tag_map

def get_params(vocab, tag_map, sentences_file, labels_file):
    sentences = []
    labels = []

    with open(sentences_file) as f:
        for sentence in f.read().splitlines():
            s = [vocab[token] if token in vocab 
                 else vocab['UNK']
                 for token in sentence.split(' ')]
            sentences.append(s)

    with open(labels_file) as f:
        for sentence in f.read().splitlines():
            l = [tag_map[label] for label in sentence.split(' ')] 
            labels.append(l) 
    return sentences, labels, len(sentences)

In [3]:
vocab, tag_map = get_vocab('data/large/words.txt', 'data/large/tags.txt')
t_sentences, t_labels, t_size = get_params(vocab, tag_map, 'data/large/train/sentences.txt', 'data/large/train/labels.txt')
v_sentences, v_labels, v_size = get_params(vocab, tag_map, 'data/large/val/sentences.txt', 'data/large/val/labels.txt')
test_sentences, test_labels, test_size = get_params(vocab, tag_map, 'data/large/test/sentences.txt', 'data/large/test/labels.txt')

In [4]:
print(t_sentences[4],t_labels[4])

[61, 8, 62, 63, 9, 64, 1, 9, 65, 66, 1, 67, 68, 69, 70, 71, 11, 9, 72, 73, 74, 75, 1, 76, 21] [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 5, 6, 0, 0, 0, 2, 0, 0, 0, 1, 0]


## batch data generator

In [5]:
def batch_data_generator(batch_size, max_length, x, y, padding):
    index =[]
    while True:
        if len(index)<len(x):
            #find index of lines less than maxlength
            index = np.where([1 if len(line)<max_length else 0 for line in x])[0]
        batch_index = np.random.choice(index,batch_size)
        #remove used index 
        index = [x for x in index if x not in batch_index]
        #make a batch
        batch_x = [x[i] for i in batch_index]  
        batch_y = [y[i] for i in batch_index]  
        batch_x_ = []
        batch_y_ = []
        # make a tensor
        for li in batch_x:
            pad = [padding] * (max_length - len(li))
            li_pad = li + pad
            batch_x_.append(li_pad)
            
        for li in batch_y:
            pad = [padding] * (max_length - len(li))
            li_pad = li + pad
            batch_y_.append(li_pad)

        X = np.array(batch_x_)
        Y = np.array(batch_y_)
        yield X,Y

In [6]:
batch_size = 2
mini_sentences = t_sentences[0: 8]
mini_labels = t_labels[0: 8]
dg = batch_data_generator(batch_size, 60,mini_sentences, mini_labels, vocab["<PAD>"])
X1, Y1 = next(dg)

In [7]:
X1

array([[   61,     6,    85,    86,    87,     1,    88,    89,    90,
           11,    91,    92,    93,    94,    95,    93,    96,    93,
           13,    97,    21, 35180, 35180, 35180, 35180, 35180, 35180,
        35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180,
        35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180,
        35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180,
        35180, 35180, 35180, 35180, 35180, 35180],
       [   61,    77,    78,    79,    80,    67,    68,    81,    11,
            9,    12,    25,    13,     9,    82,    83,     1,    84,
           16,    17,    11,    19,    20,    21, 35180, 35180, 35180,
        35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180,
        35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180,
        35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180, 35180,
        35180, 35180, 35180, 35180, 35180, 35180]])

## define model

In [8]:
def NER(vocab_size=35181, d_model=50, tags=tag_map):

    model = tl.Serial(
      tl.Embedding(vocab_size, d_model), 
      tl.LSTM(d_model), 
      tl.Dense(len(tags)), 
      tl.LogSoftmax()  
      )
    return model

In [9]:
model = NER()

print(model)

Serial[
  Embedding_35181_50
  LSTM_50
  Dense_17
  LogSoftmax
]


## train model

In [10]:
from trax.supervised import training

rnd.seed(33)

batch_size = 64

train_generator = trax.data.inputs.add_loss_weights(
    batch_data_generator(batch_size,70, t_sentences, t_labels, vocab['<PAD>']),
    id_to_mask=vocab['<PAD>'])


eval_generator = trax.data.inputs.add_loss_weights(
    batch_data_generator(batch_size,70, v_sentences, v_labels, vocab['<PAD>']),
    id_to_mask=vocab['<PAD>'])

In [11]:
def train_model(NER, train_generator, eval_generator, train_steps=1, output_dir='model'):

    train_task = training.TrainTask(
      train_generator, 
      loss_layer = tl.CrossEntropyLoss(),
      optimizer = trax.optimizers.Adam(0.01), 
    )

    eval_task = training.EvalTask(
      labeled_data = eval_generator, 
      metrics = [tl.CrossEntropyLoss(), tl.Accuracy()], 
      n_eval_batches = 10  
    )

    training_loop = training.Loop(
        NER, 
        train_task, 
        eval_tasks = eval_task,
        output_dir = output_dir) 

    
    training_loop.run(n_steps = train_steps)

    return training_loop

In [12]:
training_loop = train_model(NER(), train_generator, eval_generator, 2000)




Step   1300: Ran 100 train steps in 10.47 secs
Step   1300: train CrossEntropyLoss |  0.09662773
Step   1300: eval  CrossEntropyLoss |  0.14989577
Step   1300: eval          Accuracy |  0.95700048

Step   1400: Ran 100 train steps in 9.25 secs
Step   1400: train CrossEntropyLoss |  0.09209779
Step   1400: eval  CrossEntropyLoss |  0.14010059
Step   1400: eval          Accuracy |  0.95857831

Step   1500: Ran 100 train steps in 9.25 secs
Step   1500: train CrossEntropyLoss |  0.08418070
Step   1500: eval  CrossEntropyLoss |  0.13821157
Step   1500: eval          Accuracy |  0.95946410

Step   1600: Ran 100 train steps in 9.29 secs
Step   1600: train CrossEntropyLoss |  0.07763737
Step   1600: eval  CrossEntropyLoss |  0.13990689
Step   1600: eval          Accuracy |  0.95940822

Step   1700: Ran 100 train steps in 9.43 secs
Step   1700: train CrossEntropyLoss |  0.07715777
Step   1700: eval  CrossEntropyLoss |  0.14950747
Step   1700: eval          Accuracy |  0.95550586

Step   1800: 

In [13]:
model = NER()
model.init(trax.shapes.ShapeDtype((1, 1), dtype=np.int32))
model.init_from_file('model/model.pkl.gz', weights_only=True)

In [15]:
def evaluate_prediction(pred, labels, pad):

    outputs = np.argmax(pred, axis=2)
    mask = labels != pad
    accuracy = np.sum(outputs == labels) / float(np.sum(mask))
    return accuracy

x, y = next(batch_data_generator(len(test_sentences),70, test_sentences, test_labels, vocab['<PAD>']))
accuracy = evaluate_prediction(model(x), y, vocab['<PAD>'])
print("accuracy: ", accuracy)

accuracy:  0.9596766
