In [1]:
%load_ext autoreload
%autoreload 2 

import os
import numpy as np
import pickle

from tqdm import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from collections import OrderedDict

import nltk
from nltk.tokenize import word_tokenize

from embeddings import load_embeddings, load_vocab
from load_conll import load_conll03
from loader import prepare_sentence, tag_mapping, cap_feature
from model import Tagger
from torch_utils import prepare_sequence, prepare_sequence_float, tensor
from utils import sent2seq, sent2chars, word_index, char_index, add_unknown_last, zero_digits
from eval import eval, micro_precision_recall_f1_accuracy, eval_metrics, save_plot

Using TensorFlow backend.


In [2]:
#Parameters
torch.manual_seed(0)

parameters = OrderedDict()

parameters["lr"] = 0.005
parameters["optimizer"] = "SGD"
parameters["hidden_size"] = 100

parameters["w_embed_size"] = 50
parameters["dim_cap"] = 10

parameters["load_embeds"] = True
parameters["dropout"] = 0.5

epochs = 20
zero_digit = True
gradient_clipping = 5

param_str = "-".join(["%s:%s" % (str(k), str(v)) for (k,v) in parameters.items()]).lower()
print(param_str)

lr:0.005-optimizer:sgd-hidden_size:100-w_embed_size:50-dim_cap:10-load_embeds:true-dropout:0.5


# 1. Data Preprocessing

### Load pretrained embeddings

In [3]:
embeddings_path = "word_embeddings/glove.6B/glove.6B.%sd_w2vformat.txt" % parameters["w_embed_size"]
if parameters["load_embeds"]:
    loaded_embeddings, (w2idx, idx2w) = load_embeddings(embeddings_path)
else:
    parameters["freeze"] = 0

Loading from saved embeddings
Loading vocab


### Load CoNLL

In [4]:
# CoNLL03
sents_train_03, pos_train_03, chunk_train_03, ner_train_03 = load_conll03(["cleaned_eng.train"])
sents_dev_03, pos_dev_03, chunk_train_03, ner_dev_03 = load_conll03(["cleaned_eng.testa"])
sents_test_03, pos_test_03, chunk_train_03, ner_test_03 = load_conll03(["cleaned_eng.testb"])

print("Train %s, Dev %s, Test %s" % (len(sents_train_03), len(sents_dev_03), len(sents_test_03)))

Loaded CoNLL03 in 1.8871440887451172 seconds
Loaded CoNLL03 in 0.4340505599975586 seconds
Loaded CoNLL03 in 0.4249567985534668 seconds
Train 14041, Dev 3250, Test 3453


In [5]:
if zero_digit:
    sents_train_03 = [[zero_digits(w) for w in s] for s in sents_train_03]
    sents_test_03 = [[zero_digits(w) for w in s] for s in sents_test_03]    
    sents_dev_03 = [[zero_digits(w) for w in s] for s in sents_dev_03]    

In [6]:
sents_train = np.concatenate([sents_train_03, sents_dev_03, sents_test_03])

In [7]:
w2idx_train, idx2w_train = word_index(sents_train)
w2idx_train, idx2w_train = add_unknown_last(w2idx_train, idx2w_train)

X_train_03 = sent2seq(sents_train_03, w2idx_train)
X_dev_03 = sent2seq(sents_dev_03, w2idx_train)
X_test_03 = sent2seq(sents_test_03, w2idx_train)

idner_train, ner2idx, idx2ner = tag_mapping(ner_train_03)
idner_dev = tag_mapping(ner_dev_03, ner2idx)
idner_test = tag_mapping(ner_test_03, ner2idx)

num_ner_classes = len(ner2idx)

Found 9 unique named entity tags


### Fit word embeddings to vocabulary

In [8]:
embeddings = np.random.normal(scale=0.001, size=(len(w2idx_train), parameters["w_embed_size"]))

if parameters["load_embeds"]:
    for w, i in w2idx_train.items():
        idx = w2idx.get(w)
        if idx is not None:
            embeddings[i] = loaded_embeddings[idx][:parameters["w_embed_size"]]

### Capitalization features

In [9]:
cap_train_03 = [[cap_feature(w) for w in s] for s in sents_train_03]
cap_test_03 = [[cap_feature(w) for w in s] for s in sents_test_03]
cap_dev_03 = [[cap_feature(w) for w in s] for s in sents_dev_03]    

In [10]:
if parameters["dim_cap"]:
    n_cap = 4
    cap_embeddings = np.random.normal(scale=0.001, size=(n_cap, parameters["dim_cap"]))

# 2. Training 

### Defining model

In [11]:
model = Tagger(tensor(embeddings),  parameters["hidden_size"], idx2ner, dropout=parameters["dropout"])

trainable_parameters = filter(lambda p: p.requires_grad, model.parameters())

if parameters["optimizer"].lower() == "adam":
    optimizer = optim.Adam(trainable_parameters, lr= parameters["lr"])
elif parameters["optimizer"].lower() == "sgd":
    optimizer = optim.SGD(trainable_parameters, lr= parameters["lr"])

if torch.cuda.is_available():
    model.cuda()

In [12]:
reload = 0
model_path = "models/ner/%s/" % param_str

if not os.path.exists(model_path):
    os.makedirs(model_path)
    
if reload and os.path.exists(model_path + "last_state_dict"):
    model.load_state_dict(torch.load(model_path + "last_state_dict"))
    model = model.cuda()
    with open(model_path + "metrics.p", "rb") as file:
        metrics = pickle.load(file)
    best_ner = np.max(metrics["ner"]["val_loss"])
    
else:
    metrics = {"ner":{"precision":[], "recall":[], "f1":[], "accuracy":[], "ent_f1":[], "loss": [],
                      "val_loss": []}}
    best_ner = np.inf

### Training 

In [30]:
for epoch in range(epochs):
    print("epoch %s/%s :" % (epoch+1, epochs))

    losses = []
    val_loss_epoch = []
    loss_epoch = []
    
    for i in tqdm(range(len(X_train_03))):
        sentence, tags = list(zip(X_train_03, idner_train))[i]
        
        model.zero_grad()
        model.hidden = model.init_hidden()
        
        sentence_in = prepare_sequence(sentence)
        targets = prepare_sequence(tags)
        
        scores = model(sentence_in)
        
        loss = nn.CrossEntropyLoss()(scores, targets)
        
        loss.backward()        
        optimizer.step()
        
        losses.append(loss.cpu().data.numpy())
        
    preds_test, val_loss_list = model.test(X_test_03, idner_test)
    eval_metrics(preds_test, metrics, idner_test, sents_test_03, ner2idx, idx2ner, model_path)
    
    ner_loss = np.mean(val_loss_list)
    
    val_loss_epoch = ner_loss
    loss_epoch = np.mean(losses)
    
    print("Loss :  NER %s" % (loss_epoch))
    print("Val loss : NER %s" % (val_loss_epoch))
    
    torch.save(model.state_dict(), model_path + "last_state_dict")
    if ner_loss < best_ner: 
        print("New best score on dev.")
        print("Saving model...")
        torch.save(model.state_dict(), model_path + "best_state_dict")
        
        best_ner = ner_loss

    metrics["ner"]["val_loss"].append(val_loss_epoch)
    metrics["ner"]["loss"].append(loss_epoch)    

    # Save learning curve
    save_plot(metrics, model_path)
    with open(model_path + "metrics.p", "wb") as file:
        pickle.dump(metrics, file)

print("Done")
plt.show()

  0%|          | 8/14041 [00:00<04:24, 53.08it/s]

epoch 1/20 :


100%|██████████| 14041/14041 [04:06<00:00, 56.88it/s]
100%|██████████| 3453/3453 [00:05<00:00, 667.18it/s]


ner : p 0.836696457414, r 0.840606676908, f 0.838647009303, acc 0.8366964574135889
ID     NE  Total      O  B-LOC  B-PER  B-ORG  I-PER  I-ORG B-MISC  I-LOC I-MISC  Percent
 0      O  38323  38267      0      6     45      0      5      0      0      0   99.854
 1  B-LOC   1668   1614      0      0     51      0      3      0      0      0    0.000
 2  B-PER   1617   1533      0     81      3      0      0      0      0      0    5.009
 3  B-ORG   1661   1158      0     14    479      1      9      0      0      0   28.838
 4  I-PER   1156   1156      0      0      0      0      0      0      0      0    0.000
 5  I-ORG    835    658      0      1    140     11     25      0      0      0    2.994
 6 B-MISC    702    701      0      0      1      0      0      0      0      0    0.000
 7  I-LOC    257    251      0      0      4      0      2      0      0      0    0.000
 8 I-MISC    216    215      0      0      1      0      0      0      0      0    0.000
38852/46435 (83.66965%)
NER

  0%|          | 2/14041 [00:00<12:45, 18.33it/s]

epoch 2/20 :


100%|██████████| 14041/14041 [03:47<00:00, 61.72it/s]
100%|██████████| 3453/3453 [00:05<00:00, 660.87it/s]


ner : p 0.842360288575, r 0.846296977434, f 0.844324044294, acc 0.842360288575428
ID     NE  Total      O  B-LOC  B-PER  B-ORG  I-PER  I-ORG B-MISC  I-LOC I-MISC  Percent
 0      O  38323  38265      0      6     45      0      7      0      0      0   99.849
 1  B-LOC   1668   1613      0      2     50      0      3      0      0      0    0.000
 2  B-PER   1617   1408      0    204      4      1      0      0      0      0   12.616
 3  B-ORG   1661   1137      0     35    477      3      9      0      0      0   28.718
 4  I-PER   1156   1131      0      0      0     25      0      0      0      0    2.163
 5  I-ORG    835    648      0      2     28     13    144      0      0      0   17.246
 6 B-MISC    702    700      0      0      1      0      1      0      0      0    0.000
 7  I-LOC    257    251      0      0      2      0      4      0      0      0    0.000
 8 I-MISC    216    214      0      0      1      0      1      0      0      0    0.000
39115/46435 (84.23603%)
NER 

  0%|          | 3/14041 [00:00<09:36, 24.36it/s]

epoch 3/20 :


 14%|█▎        | 1902/14041 [00:32<03:32, 57.11it/s]

KeyboardInterrupt: 

           14%|█▎        | 1902/14041 [00:50<05:20, 37.90it/s]

In [29]:
save_plot(metrics, model_path)