## NER: BiLSTM-LSTM Model
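Use an LSTM as the decoder. (Note: the code below currently uses `nn.GRU` layers for both the encoder and the decoder.)

For the time being, no batching is used: sentences are processed one at a time.
Batching will be implemented later.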

In [1]:
import logging
import os

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from tqdm import trange
from tqdm import tqdm_notebook as tqdm
import random

import utils
import model.net as net
from model.data_loader import DataLoader
from evaluate import evaluate, f_score_simple

In [2]:
# Dataset location and experiment directory (CoNLL English is the active configuration).
data_dir = 'data/coNLL/eng/'
model_dir = 'experiments/coNLL/lstm_model/'
# Alternative dataset (Kaggle); uncomment both lines to switch.
# data_dir = 'data/kaggle/'
# model_dir = 'experiments/kaggle/lstm_model/'
# Hyper-parameters are read from the params.json stored in the experiment directory
# (presumably parsed by utils.Params -- see utils for details).
json_path = os.path.join(model_dir, 'params.json')
params = utils.Params(json_path)
# use GPU if available
params.cuda = torch.cuda.is_available()
# In-memory override of the number of training epochs for this notebook run.
params.num_epochs = 15
# Last expression of the cell: displays the resulting hyper-parameters.
params.dict

{'learning_rate': 0.001,
 'batch_size': 5,
 'num_epochs': 15,
 'lstm_hidden_dim': 50,
 'embedding_dim': 50,
 'save_summary_steps': 100,
 'cuda': True}

In [3]:
# load data: build the project DataLoader and fetch the three splits in one call
data_loader = DataLoader(data_dir, params)
data = data_loader.load_data(['train', 'val', 'test'])
train_data = data['train']
val_data = data['val']
test_data = data['test']

# record the train, val and test dataset sizes on params
# (the training loop uses train_size / val_size as the number of steps per epoch)
params.train_size = train_data['size']
params.val_size = val_data['size']
params.test_size = test_data['size']

# index of the padding tag in the tag map
# (params.pad_tag is not set in this notebook -- presumably provided by DataLoader; TODO confirm)
params.pad_tag_ind = data_loader.tag_map[params.pad_tag]
# no batching yet: the model code below processes one sentence at a time
params.batch_size = 1
# sentinel "start of sequence" tag fed to the decoder at the first step;
# DecoderRNN.forward encodes any negative previous tag as an all-zero vector
SOS_token = -1

14041 14041
3250 3250
3453 3453


## Model

In [6]:
class EncoderRNN(nn.Module):
    """Sentence encoder: an embedding lookup followed by a single GRU layer.

    `params` must expose `vocab_size`, `embedding_dim` and `lstm_hidden_dim`
    (despite its name, `lstm_hidden_dim` is used as the GRU hidden size).
    """

    def __init__(self, params):
        super(EncoderRNN, self).__init__()

        embed_dim = params.embedding_dim
        hidden_dim = params.lstm_hidden_dim

        # lookup table: one embed_dim-sized vector per vocabulary index
        self.embedding = nn.Embedding(params.vocab_size, embed_dim)

        # recurrent layer; batch_first=True means tensors are laid out as
        # batch_size x seq_len x features
        self.encoder = nn.GRU(embed_dim, hidden_dim, batch_first=True)

    def forward(self, s):
        """Encode a batch of token-index sequences.

        Args:
            s: integer tensor of token indices, dim: batch_size x seq_len

        Returns:
            (output, hidden) exactly as produced by the GRU:
            output dim: batch_size x seq_len x lstm_hidden_dim
            hidden dim: 1 x batch_size x lstm_hidden_dim
        """
        # map every token index to its embedding
        embedded = self.embedding(s)    # dim: batch_size x seq_len x embedding_dim

        # run the GRU over the whole sequence
        return self.encoder(embedded)


    
class DecoderRNN(nn.Module):
    """Single-step GRU decoder that predicts one tag per input token.

    At every step the decoder receives the one-hot encoding of the previous
    tag concatenated with the encoder output for the current token, advances
    the GRU by one step and projects the result onto the tag set.

    `params` must expose `lstm_hidden_dim` (used as the GRU hidden size) and
    `number_of_tags`.
    """

    def __init__(self, params):

        super(DecoderRNN, self).__init__()

        self.hidden_size = params.lstm_hidden_dim
        self.tag_size = params.number_of_tags # SOS is represented as all zero vector
        # input = one-hot previous tag (tag_size) + encoder output (hidden_size)
        self.decoder = nn.GRU(self.tag_size + self.hidden_size, self.hidden_size, batch_first=True)

        # the fully connected layer transforms the GRU output into one score per tag
        self.fc = nn.Linear(self.hidden_size, self.tag_size)

    def forward(self, prev_tag, hidden, encoder_output):
        '''Run one decoding step for a single token of a single sentence.

           prev_tag: integer index of the previously emitted tag; any negative
                     value (e.g. the SOS marker) is encoded as an all-zero vector
           hidden: previous GRU hidden state, dim: 1 x 1 x hidden_size
           encoder_output: encoder output for the current token; it is
                           flattened to 1 x hidden_size

           returns: (output, hidden) where output holds the log-probabilities
                    over tags (dim: 1 x tag_size) and hidden is the updated
                    GRU hidden state
        '''
        encoder_output = encoder_output.view(1, -1)

        # Allocate the one-hot vector on the same device (and with the same
        # dtype) as the encoder output instead of hard-coding .cuda(), so the
        # decoder also works on CPU-only machines.
        tag = encoder_output.new_zeros(1, self.tag_size)
        if prev_tag >= 0:
            tag[0, prev_tag] = 1 # 1 hot encoding

        output = torch.cat((tag, encoder_output), dim=1)    # dim: 1 x (tag_size + hidden_size)
        output = output.unsqueeze(1)                        # dim: 1 x 1 x (tag_size + hidden_size)

        # apply one cell of GRU
        output, hidden = self.decoder(output, hidden)

        # apply the fully connected layer and obtain the output (before softmax)
        output = self.fc(output[0])                         # dim: 1 x tag_size

        # apply log softmax to obtain log-probabilities over the tags
        output = F.log_softmax(output, dim=1)               # dim: 1 x tag_size

        return output, hidden

In [7]:
class RunningAverage():
    """Accumulates a total and a step count and reports their ratio.

    Example:
    ```
    loss_avg = RunningAverage()
    loss_avg.update(2)
    loss_avg.update(4)
    loss_avg()  # -> 3.0
    ```
    """

    def __init__(self):
        # nothing accumulated yet
        self.steps = 0
        self.total = 0

    def update(self, val, step=1):
        """Add `val` to the running total and `step` to the step count."""
        self.total += val
        self.steps += step

    def __call__(self):
        """Return total / steps, or 0 when no steps have been recorded."""
        # guard against division by zero before any update
        if self.steps == 0:
            return 0
        return self.total / float(self.steps)

In [8]:
# Set the logger: pass the path of train.log inside the experiment directory to
# the project's logging helper (presumably so logging.info output is also
# written to that file -- see utils.set_logger)
utils.set_logger(os.path.join(model_dir, 'train.log'))

## Train

In [9]:
# Probability of feeding the gold tag (instead of the model's own prediction)
# back into the decoder for a given training sentence.
teacher_forcing_ratio = 0.5

def train(data_iterator, encoder, decoder, encoder_optimizer, decoder_optimizer, num_steps):
    """Train the encoder/decoder pair for `num_steps` sentences.

    Each step draws one example from `data_iterator`, decodes it tag by tag
    (with teacher forcing chosen at random per sentence), and performs one
    optimizer step on the summed per-token loss. Running loss, accuracy and
    F1 are shown on the progress bar.

    The code assumes a batch size of 1 (it calls `.item()` on per-step labels
    and writes predictions to row 0 only). It relies on the notebook-level
    names `data_loader`, `criterion`, `SOS_token` and `teacher_forcing_ratio`.

    Args:
        data_iterator: iterator yielding (inputs, labels, <ignored>) triples
        encoder: encoder module
        decoder: decoder module, called once per token
        encoder_optimizer: optimizer for the encoder parameters
        decoder_optimizer: optimizer for the decoder parameters
        num_steps: number of examples to draw from `data_iterator`
    """
    # switch both networks to training mode
    encoder.train()
    decoder.train()

    # running statistics shown on the progress bar
    loss_avg = RunningAverage()
    acc_avg = RunningAverage()
    recall = RunningAverage()
    precision = RunningAverage()
    other_ind = data_loader.tag_map['O']

    progress = trange(num_steps)

    for step in progress:

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        train_batch, labels_batch, _ = next(data_iterator)

        # the decoder starts from the encoder's final hidden state
        encoder_outputs, decoder_hidden = encoder(train_batch)
        predicted_tokens = torch.zeros_like(labels_batch)
        prev_token = SOS_token # -1
        loss = 0

        # decide once per sentence whether the gold tags are fed back
        use_teacher_forcing = random.random() < teacher_forcing_ratio

        for di in range(labels_batch.shape[1]):
            gold = labels_batch[:, di]
            decoder_output, decoder_hidden = decoder(prev_token, decoder_hidden, encoder_outputs[:, di, :])

            # greedy prediction for this position
            best = torch.max(decoder_output.view(-1), dim=0)[1]
            predicted_tokens[0][di] = best

            # next decoder input: gold tag (teacher forcing) or own prediction
            prev_token = gold.item() if use_teacher_forcing else best.item()

            loss += criterion(decoder_output, gold)

        # token-level accuracy
        correct = (predicted_tokens == labels_batch).view(-1).cpu().numpy()
        acc_avg.update(int(correct.sum()), len(correct))

        # recall: correct predictions among tokens whose gold tag is not 'O'
        ne_labels = labels_batch.view(-1).cpu().numpy() != other_ind
        recall.update(int((correct & ne_labels).sum()), int(ne_labels.sum()))

        # precision: correct predictions among tokens predicted as not 'O'
        ne_preds = predicted_tokens.view(-1).cpu().numpy() != other_ind
        precision.update(int((correct & ne_preds).sum()), int(ne_preds.sum()))

        # harmonic mean of the running recall and precision
        rec, prec = recall(), precision()
        f_score = 0 if (rec + prec) == 0 else 2 * rec * prec / (rec + prec)

        # update the average loss
        loss_avg.update(loss.item())

        progress.set_postfix(loss='{:05.3f}'.format(loss_avg()), acc='{:05.3f}'.format(acc_avg()), f1='{:05.3f}'.format(f_score))

        # back-propagate and update both networks
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()

In [10]:
def evaluate(data_iterator, encoder, decoder, num_steps):
    """Greedily decode `num_steps` examples and return the running F1 score.

    No teacher forcing is used: the decoder is always fed its own previous
    prediction. Running loss, accuracy and F1 are shown on the progress bar.

    The code assumes a batch size of 1 and relies on the notebook-level names
    `data_loader`, `criterion` and `SOS_token`.

    NOTE: this definition shadows the `evaluate` imported from the `evaluate`
    module in the notebook's import cell.

    Args:
        data_iterator: iterator yielding (inputs, labels, <ignored>) triples
        encoder: encoder module
        decoder: decoder module, called once per token
        num_steps: number of examples to draw from `data_iterator`

    Returns:
        The F1 score computed from the running recall and precision over
        non-'O' tags after the last step (0 when `num_steps` is 0).
    """

    print('Evaluating:')
    # set model to eval mode
    encoder.eval()
    decoder.eval()

    # Running average object for loss, accuracy and ne_accuracy
    # ne_accuracy: omit the 'O' tag for computing accuracy
    loss_avg = RunningAverage()
    acc_avg = RunningAverage()
    other_ind = data_loader.tag_map['O']
    recall = RunningAverage()
    precision = RunningAverage()

    # defined up front so that num_steps == 0 returns 0 instead of raising
    f_score = 0

    t = trange(num_steps)

    # evaluation never back-propagates, so skip building the autograd graph
    with torch.no_grad():
        for i in t:

            test_batch, labels_batch, _ = next(data_iterator)

            # the decoder starts from the encoder's final hidden state
            encoder_outputs, encoder_hidden = encoder(test_batch)
            decoder_hidden = encoder_hidden
            loss = 0
            prev_token = SOS_token # -1
            predicted_tokens = torch.zeros_like(labels_batch)

            for di in range(labels_batch.shape[1]):
                decoder_output, decoder_hidden = decoder(prev_token, decoder_hidden, encoder_outputs[:,di,:])
                # greedy decoding: feed the most likely tag back as next input
                topv, topi = torch.max(decoder_output.view(-1), dim=0)
                prev_token = topi.item()
                predicted_tokens[0][di] = topi
                loss += criterion(decoder_output, labels_batch[:,di])

            # find accuracy of prediction
            correct = (predicted_tokens == labels_batch).view(-1).cpu().numpy()
            acc_avg.update(sum(correct), len(correct))

            # find accuracy (recall) for only named entities other than O
            ne_labels = labels_batch.view(-1).cpu().numpy() != other_ind
            recall.update(sum(correct & ne_labels), sum(ne_labels))

            # find precision of ne predictions
            ne_preds = predicted_tokens.view(-1).cpu().numpy() != other_ind
            precision.update(sum(correct & ne_preds), sum(ne_preds))

            # find f-score
            if (recall() + precision()) == 0:
                f_score = 0
            else:
                f_score = 2*recall()*precision() / (recall() + precision())

            # update the average loss; float() also copes with the plain
            # integer 0 left in `loss` when the sequence is empty
            loss_avg.update(float(loss))

            t.set_postfix(loss='{:05.3f}'.format(loss_avg()), acc='{:05.3f}'.format(acc_avg()), f1='{:05.3f}'.format(f_score))

    return f_score

In [11]:
# Define the model and optimizer
# Use the GPU when one is available and fall back to the CPU otherwise, instead
# of unconditionally calling .cuda() (which raises on CPU-only machines).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
encoder1 = EncoderRNN(params).to(device)
decoder1 = DecoderRNN(params).to(device)

# negative log-likelihood loss: DecoderRNN.forward already returns log-probabilities
criterion = nn.NLLLoss()

# one SGD optimizer (with momentum) per network, sharing the same learning rate
encoder_optimizer = optim.SGD(encoder1.parameters(), lr=params.learning_rate, momentum=0.9)
decoder_optimizer = optim.SGD(decoder1.parameters(), lr=params.learning_rate, momentum=0.9)

In [12]:
import os
import shutil

def save_checkpoint(state, is_best, checkpoint, extra=''):
    """Saves model and training parameters at checkpoint + extra + 'last.pth.tar'. If is_best==True, also
    saves checkpoint + extra + 'best.pth.tar'
    Args:
        state: (dict) contains model's state_dict, may contain other keys such as epoch, optimizer state_dict
        is_best: (bool) True if it is the best model seen till now
        checkpoint: (string) folder where parameters are to be saved; it is created (including any
            missing parent folders) when it does not exist yet
        extra: (string) prefix prepended to the file names, e.g. 'encoder_' or 'decoder_'
    """
    filepath = os.path.join(checkpoint, extra + 'last.pth.tar')
    if not os.path.exists(checkpoint):
        print("Checkpoint Directory does not exist! Making directory {}".format(checkpoint))
        # makedirs (not mkdir) so that nested paths such as
        # 'experiments/coNLL/lstm_model/' can be created in one go
        os.makedirs(checkpoint)
    else:
        print("Checkpoint Directory exists! ")
    torch.save(state, filepath)
    if is_best:
        # keep a separate copy of the best-scoring checkpoint
        shutil.copyfile(filepath, os.path.join(checkpoint, extra + 'best.pth.tar'))

In [None]:
# Best validation score seen so far. The tracked value is the F1 score
# returned by evaluate(); the "acc" in the name is historical.
best_val_acc = 0.0

for epoch in range(params.num_epochs):
    logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

    # --- training: one pass over the training set, one sentence per step ---
    train_data_iterator = data_loader.data_iterator(train_data, params, shuffle=True)
    train(train_data_iterator, encoder1, decoder1, encoder_optimizer, decoder_optimizer, params.train_size)

    # --- validation: one pass over the validation set, in order ---
    val_data_iterator = data_loader.data_iterator(val_data, params, shuffle=False)
    val_acc = evaluate(val_data_iterator, encoder1, decoder1, params.val_size)
    val_metrics = dict(f1=val_acc)

    # ties count as a new best
    is_best = val_acc >= best_val_acc

    # --- checkpoints: encoder first, then decoder, each with its optimizer ---
    state_encoder = dict(
        epoch=epoch + 1,
        state_dict=encoder1.state_dict(),
        optim_dict=encoder_optimizer.state_dict(),
    )
    save_checkpoint(state_encoder, is_best=is_best, checkpoint=model_dir, extra='encoder_')

    state_decoder = dict(
        epoch=epoch + 1,
        state_dict=decoder1.state_dict(),
        optim_dict=decoder_optimizer.state_dict(),
    )
    save_checkpoint(state_decoder, is_best=is_best, checkpoint=model_dir, extra='decoder_')

    if is_best:
        logging.info("- Found new best accuracy")
        best_val_acc = val_acc

        # metrics belonging to the best weights so far
        best_json_path = os.path.join(model_dir, "f1_score_best_weights.json")
        utils.save_dict_to_json(val_metrics, best_json_path)

    # metrics belonging to the most recent weights (written every epoch)
    last_json_path = os.path.join(model_dir, "f1_score_last_weights.json")
    utils.save_dict_to_json(val_metrics, last_json_path)

Epoch 1/15
 30%|██▉       | 4156/14041 [01:05<02:35, 63.75it/s, acc=0.840, f1=0.173, loss=8.363] 