In [1]:
%load_ext autoreload
%autoreload 2 

import os
import numpy as np
import pickle

from tqdm import tqdm

import torch
import torch.utils.data
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

from collections import OrderedDict

import nltk
from nltk.tokenize import word_tokenize

from embeddings import load_embeddings, load_vocab
from load_conll import load_conll03
from loader import prepare_sentence, tag_mapping, cap_feature, CoNLLDataset, pad_list
from model_char import Tagger, cuda
from torch_utils import prepare_sequence, prepare_sequence_float, tensor
from utils import sent2seq, sent2chars, word_index, char_index, add_unknown_last, zero_digits
from eval import eval, micro_precision_recall_f1_accuracy, eval_metrics, eval_metrics_crf, save_plot

Using TensorFlow backend.


In [2]:
#Parameters
torch.manual_seed(0)

parameters = OrderedDict()

parameters["lr"] = 0.1
parameters["optimizer"] = "SGD"
parameters["hidden_size"] = 200
parameters["pre_emb"] = "glove"

parameters["w_embed_size"] = 300
# parameters["dim_cap"] = 10

parameters["batch_size"] = 4

parameters["c_embed_size"] = 25
parameters["char_hidden_size"] = 100

parameters["load_embeds"] = True
parameters["dropout"] = 0.5
parameters["gradient_clipping"] = 0
parameters["crf"] = True

epochs = 1000
zero_digit = True

assert parameters["pre_emb"] in ["glove", "google"]
assert not parameters["pre_emb"] == "google" or parameters["w_embed_size"] == 300

param_str = "-".join(["%s:%s" % (str(k), str(v)) for (k,v) in parameters.items()]).lower()
print(param_str)

lr:0.1-optimizer:sgd-hidden_size:200-pre_emb:glove-w_embed_size:300-batch_size:4-c_embed_size:25-char_hidden_size:100-load_embeds:true-dropout:0.5-gradient_clipping:0-crf:true


# 1. Data Preprocessing

### Load pretrained embeddings

In [3]:
if parameters["pre_emb"] == "glove":
    embeddings_path = "word_embeddings/glove.6B/glove.6B.%sd_w2vformat.txt" % parameters["w_embed_size"]
    binary = False
else:
    embeddings_path = "word_embeddings/google/GoogleNews-vectors-negative300.bin"
    binary = True
    
if parameters["load_embeds"]:
    loaded_embeddings, (w2idx, idx2w) = load_embeddings(embeddings_path, binary=binary)
else:
    parameters["freeze"] = 0

Loading from saved embeddings
Loading vocab


### Load CoNLL

In [4]:
# CoNLL03
sents_train_03, pos_train_03, chunk_train_03, ner_train_03 = load_conll03(["cleaned_eng.train"])
sents_dev_03, pos_dev_03, chunk_train_03, ner_dev_03 = load_conll03(["cleaned_eng.testa"])
sents_test_03, pos_test_03, chunk_train_03, ner_test_03 = load_conll03(["cleaned_eng.testb"])

print("Train %s, Dev %s, Test %s" % (len(sents_train_03), len(sents_dev_03), len(sents_test_03)))

Loaded CoNLL03 in 1.79341459274292 seconds
Loaded CoNLL03 in 0.43091535568237305 seconds
Loaded CoNLL03 in 0.4309520721435547 seconds
Train 14041, Dev 3250, Test 3453


In [5]:
if zero_digit:
    sents_train_03 = [[zero_digits(w) for w in s] for s in sents_train_03]
    sents_test_03 = [[zero_digits(w) for w in s] for s in sents_test_03]    
    sents_dev_03 = [[zero_digits(w) for w in s] for s in sents_dev_03]    

In [6]:
sents_train = np.concatenate([sents_train_03, sents_dev_03, sents_test_03])

In [7]:
w2idx_train, idx2w_train = word_index(sents_train)
w2idx_train, idx2w_train = add_unknown_last(w2idx_train, idx2w_train)

X_train_03 = sent2seq(sents_train_03, w2idx_train)
X_dev_03 = sent2seq(sents_dev_03, w2idx_train)
X_test_03 = sent2seq(sents_test_03, w2idx_train)

idner_train, ner2idx, idx2ner = tag_mapping(ner_train_03)
idner_dev = tag_mapping(ner_dev_03, ner2idx)
idner_test = tag_mapping(ner_test_03, ner2idx)

num_ner_classes = len(ner2idx)

Found 9 unique named entity tags


## Character embeddings

In [8]:
c2idx, idx2c = char_index(sents_train)
char_embeddings = np.random.normal(scale=0.001, size=(len(w2idx_train), parameters["c_embed_size"]))

In [55]:
chars_train_03 = sent2chars(sents_train_03, c2idx)
chars_dev_03 = sent2chars(sents_dev_03, c2idx)
chars_test_03 = sent2chars(sents_test_03, c2idx)

### Data Pipeline

In [56]:
class CoNLLDataset_chars(torch.utils.data.Dataset):
    def __init__(self, X, chars, y, lens):
        self.words = X
        self.chars = chars
        self.labels = y
        self.lens = lens
        
    def __getitem__(self, idx):
        return self.words[idx], self.chars[idx],self.labels[idx], self.lens[idx]

    def __len__(self):
        return len(self.words)

In [77]:
def pad_chars(chars, pad_index=0):
    lens_sents = [len(s) for s in chars]
    lens_words = [[len(w) for w in s] for s in chars]
    
    maxlen_sent = max(lens_sents)
    maxlen = max(np.concatenate(lens_words))
    
    
    unrolled = []
    for s in chars:
        for w in s:
            unrolled.append(w)
    
    batch = pad_index * torch.ones(len(chars), int(maxlen_sent), int(maxlen)).long()
    
    for i, s in enumerate(chars):
        for j, w in enumerate(s):
            batch[i, j, :lens_words[i][j]] = torch.LongTensor(w)
        
    return batch, lens_words

In [79]:
batch, lens_words = pad_chars(chars_dev_03)

In [81]:
print(batch[0])
print(len(lens_words[0]))
print(lens_words[0])


   25    14    23  ...      0     0     0
   56    39     3  ...      0     0     0
   28     0     0  ...      0     0     0
       ...          ⋱          ...       
   27    27     0  ...      0     0     0
   48     0     0  ...      0     0     0
   20     0     0  ...      0     0     0
[torch.LongTensor of size 109x27]

109
[5, 6, 1, 2, 1, 5, 7, 1, 2, 1, 6, 5, 1, 2, 1, 5, 6, 1, 4, 8, 1, 2, 4, 1, 2, 1, 5, 3, 9, 1, 2, 1, 6, 8, 1, 2, 1, 5, 8, 1, 1, 1, 5, 3, 3, 10, 1, 1, 1, 4, 9, 1, 7, 1, 1, 1, 1, 5, 6, 1, 5, 5, 1, 2, 1, 1, 1, 1, 5, 5, 1, 1, 1, 4, 7, 1, 5, 3, 7, 1, 2, 1, 1, 1, 1, 5, 5, 1, 1, 1, 6, 6, 1, 1, 1, 5, 6, 1, 1, 1, 5, 6, 1, 5, 5, 1, 2, 1, 1]


In [78]:
words_train, lens_train, sorted_train = pad_list(X_train_03)
chars_train, wlens_train = pad_chars(chars_train_03)
labels_train, _, _ = pad_list(idner_train)

words_dev, lens_dev, sorted_dev = pad_list(X_dev_03)
chars_dev, wlens_dev = pad_chars(chars_dev_03)
labels_dev, _, _ = pad_list(idner_dev)

words_test, lens_test, sorted_test = pad_list(X_test_03)
chars_test, wlens_test = pad_chars(chars_test_03)
labels_test, _, _ = pad_list(idner_test)

In [83]:
chars_train[:4]


( 0 ,.,.) = 
   1   2   0  ...    0   0   0
   3   4   5  ...    0   0   0
   9   4   3  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0

( 1 ,.,.) = 
  21   4   7  ...    0   0   0
  17  13  11  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0

( 2 ,.,.) = 
  17  24   2  ...    0   0   0
  27  27  27  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0

( 3 ,.,.) = 
  29  19   4  ...    0   0   0
   1  23   3  ...    0   0   0
  31  14  10  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
[torch.LongTensor of size 4x113x61]

In [84]:
dataset_train = CoNLLDataset_chars(words_train, chars_train, labels_train, lens_train)
loader_train = torch.utils.data.DataLoader(dataset_train, batch_size=parameters["batch_size"], num_workers=0,
                                           pin_memory=True)

dataset_dev = CoNLLDataset_chars(words_dev, chars_dev,labels_dev, lens_dev)
loader_dev = torch.utils.data.DataLoader(dataset_dev, batch_size=parameters["batch_size"], num_workers=0,
                                         pin_memory=True)

dataset_test = CoNLLDataset_chars(words_test, chars_test, labels_test, lens_test)
loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=parameters["batch_size"], num_workers=0,
                                          pin_memory=True)

In [85]:
idner_dev = [idner_dev[i] for i in sorted_dev]
idner_test = [idner_test[i] for i in sorted_test]

sents_dev_03 = [sents_dev_03[i] for i in sorted_dev]
sents_test_03 = [sents_test_03[i] for i in sorted_test]

In [86]:
# for x, y, l in loader:
#     packed = pack_padded_sequence(autograd.Variable(x), l.numpy(), batch_first=True)
#     batch, lens = pad_packed_sequence(packed, batch_first=True)

In [None]:
for 

### Fit word embeddings to vocabulary

In [87]:
embeddings = np.random.normal(scale=0.001, size=(len(w2idx_train), parameters["w_embed_size"]))

if parameters["load_embeds"]:
    for w, i in w2idx_train.items():
        idx = w2idx.get(w)
        if idx is not None:
            embeddings[i] = loaded_embeddings[idx][:parameters["w_embed_size"]]

### Capitalization features

In [88]:
# cap_train_03 = [[cap_feature(w) for w in s] for s in sents_train_03]
# cap_test_03 = [[cap_feature(w) for w in s] for s in sents_test_03]
# cap_dev_03 = [[cap_feature(w) for w in s] for s in sents_dev_03]    

In [89]:
# if parameters["dim_cap"]:
#     n_cap = 4
#     cap_embeddings = np.random.normal(scale=0.001, size=(n_cap, parameters["dim_cap"]))

# 2. Training 

### Defining model

In [18]:
if not "<START>" in idx2ner.values():
    idx2ner[len(idx2ner)] = "<START>"
    idx2ner[len(idx2ner)] = "<STOP>"
    ner2idx = {v:k for (k,v) in idx2ner.items()}

idx2ner

{0: 'O',
 1: 'B-LOC',
 2: 'B-PER',
 3: 'B-ORG',
 4: 'I-PER',
 5: 'I-ORG',
 6: 'B-MISC',
 7: 'I-LOC',
 8: 'I-MISC',
 9: '<START>',
 10: '<STOP>'}

In [19]:
model = Tagger(tensor(embeddings),  parameters["hidden_size"], idx2ner, char_embeddings=tensor(char_embeddings),
               char_hidden_dim = parameters["char_hidden_size"], dropout=parameters["dropout"], 
               crf=parameters["crf"])

trainable_parameters = filter(lambda p: p.requires_grad, model.parameters())

if parameters["optimizer"].lower() == "adam":
    optimizer = optim.Adam(trainable_parameters, lr= parameters["lr"])
elif parameters["optimizer"].lower() == "sgd":
    optimizer = optim.SGD(trainable_parameters, lr= parameters["lr"])

if torch.cuda.is_available():
    model.cuda()

In [20]:
reload = 1
model_path = "models/ner_crf_batch/%s/" % param_str

if not os.path.exists(model_path):
    os.makedirs(model_path)
    
if reload and os.path.exists(model_path + "last_state_dict"):
    model.load_state_dict(torch.load(model_path + "last_state_dict"))
    model = model.cuda()
    with open(model_path + "metrics.p", "rb") as file:
        metrics = pickle.load(file)
    best_ner = np.max(metrics["ner"]["val_loss_dev"])
    
else:
    metrics = {"ner":{"precision":[], "recall":[], "f1":[], "accuracy":[], "ent_f1":[], "loss": [], "val_loss_dev": [],
                      "precision_test":[], "recall_test":[], "f1_test":[], "accuracy_test":[], "ent_f1_test":[], 
                      "val_loss_test": []}}
    best_ner = np.inf

In [92]:
for i, (words, chars, tags, lens) in enumerate(loader_train):
    print(i)
    print(words.size())
    print(chars)
    print(tags.size())
    print(lens.size())
    break
    
    

0
torch.Size([4, 113])

( 0 ,.,.) = 
   1   2   0  ...    0   0   0
   3   4   5  ...    0   0   0
   9   4   3  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0

( 1 ,.,.) = 
  21   4   7  ...    0   0   0
  17  13  11  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0

( 2 ,.,.) = 
  17  24   2  ...    0   0   0
  27  27  27  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0

( 3 ,.,.) = 
  29  19   4  ...    0   0   0
   1  23   3  ...    0   0   0
  31  14  10  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
[torch.LongTensor of size 4x113x61]

torch.Size([4, 1

### Training 

In [None]:
for epoch in range(epochs):
    print("Epoch %s/%s :" % (epoch+1, epochs))

    losses = []
    val_loss_epoch = []
    loss_epoch = []
    
    test_lim = 3500 
    for i, (sentences, tags, lens) in enumerate(tqdm(loader_train)):
#     test_lim = 10
#     for i in tqdm(range(10)):
                      
        sentences_in = autograd.Variable(cuda(sentences[:,:lens.numpy()[0]]))
        targets = autograd.Variable(cuda(tags[:,:lens.numpy()[0]]))
        
        packed_targets = pack_padded_sequence(targets, lens.numpy(), batch_first=True) 
        
        if parameters["crf"]:        
            loss = model.neg_log_likelihood(sentences_in, lens, targets,
                                            gradient_clipping=parameters["gradient_clipping"])
        else:
            scores = model(sentences_in, lens)
            padded_scores = pad_packed_sequence(scores, batch_first=True) 
            
#             print(padded_scores[0].size())
#             print(targets.size())
            
            loss = nn.CrossEntropyLoss()(padded_scores[0].contiguous().view(-1,len(ner2idx)),
                                                          targets.contiguous().view(-1))

        loss.backward()        
        optimizer.step()
        
        losses.append(loss.cpu().data.numpy())
    
    preds_dev, ner_loss_dev = model.test(loader_dev)  
    
    eval_metrics_crf(preds_dev, metrics, idner_dev[:test_lim], sents_dev_03[:test_lim], 
                     ner2idx, idx2ner, model_path, dev=True)
    
    val_loss_epoch = ner_loss_dev
    
    preds_test, ner_loss_test = model.test(loader_test)     
    
    eval_metrics_crf(preds_test, metrics, idner_test[:test_lim], sents_test_03[:test_lim], 
                         ner2idx, idx2ner, model_path)
    
    loss_epoch = np.mean(losses)
    
    print("Loss :  NER %s" % (loss_epoch))
    print("Dev loss : NER %s" % (val_loss_epoch))
    print("Test loss : NER %s" % (ner_loss_test))
    
    torch.save(model.state_dict(), model_path + "last_state_dict")
    if ner_loss_dev < best_ner: 
        print("New best score on dev.")
        print("Saving model...")
        torch.save(model.state_dict(), model_path + "best_state_dict")
        
        best_ner = ner_loss_dev
    
    metrics["ner"]["val_loss_dev"].append(val_loss_epoch)
    metrics["ner"]["val_loss_test"].append(ner_loss_test)    
    metrics["ner"]["loss"].append(loss_epoch)    

    # Save learning curve
    save_plot(metrics, model_path)
    with open(model_path + "metrics.p", "wb") as file:
        pickle.dump(metrics, file)

print("Done")

# Debug Tests

In [None]:
for x,y,l in loader_train:
    X = autograd.Variable(x)
    Y = autograd.Variable(y)
    L = l
    break

In [None]:
word_embeddings = nn.Embedding(len(w2idx_train), 300)

In [None]:
packed = pack_padded_sequence(X, l.numpy(), batch_first=True)

In [None]:
w = word_embeddings(X)

In [None]:
w.size()

In [None]:
p = pack_padded_sequence(w, l.numpy(), batch_first=True)

In [None]:
p

In [None]:
np.sum(p.batch_sizes)

In [None]:
np.sum(l.numpy())

In [None]:
print(l.numpy())
print(p.batch_sizes)