In [1]:
%load_ext autoreload
%autoreload 2 

import os
import numpy as np
import pickle

from tqdm import tqdm

import torch
import torch.utils.data
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

from collections import OrderedDict

import nltk
from nltk.tokenize import word_tokenize

from embeddings import load_embeddings, load_vocab
from load_conll import load_conll03
from loader import prepare_sentence, tag_mapping, cap_feature, CoNLLDataset, pad_list, CoNLLDataset_chars
from loader import loader, pad_chars
from model_char5 import Tagger
from torch_utils import prepare_sequence, prepare_sequence_float, tensor
from utils import sent2seq, sent2chars, word_index, char_index, add_unknown_last, zero_digits, save_mappings, load_mappings, reverse_dict
from eval import eval, micro_precision_recall_f1_accuracy, eval_metrics, eval_metrics_crf, save_plot

Using TensorFlow backend.


In [2]:
# Parameters
# Seed every RNG this notebook draws from so "Restart & Run All" is repeatable.
# torch alone is not enough: the embedding matrices further down are initialised
# with np.random.normal, which uses NumPy's own global generator.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

# Ordered so that param_str (built below) lists the settings in a stable order.
parameters = OrderedDict()

parameters["lr"] = 0.001
parameters["optimizer"] = "Adam"
parameters["hidden_size"] = 200
parameters["pre_emb"] = "glove"

parameters["w_embed_size"] = 300
# parameters["dim_cap"] = 10

parameters["batch_size"] = 20

parameters["c_embed_size"] = 25
parameters["char_hidden_size"] = 100

parameters["load_embeds"] = True
parameters["dropout"] = 0.2
# NOTE(review): "gradient_clipping" is recorded here but no later cell in this
# notebook reads it - confirm whether it should be forwarded to the model.
parameters["gradient_clipping"] = 0
parameters["crf"] = True
parameters["freeze"] = False

# NOTE(review): `epochs` is not passed to model.train in the training cell, which
# hard-codes epochs=1000 - confirm which value is intended.
epochs = 20
# When True, every token is passed through zero_digits() right after loading.
zero_digit = True

assert parameters["pre_emb"] in ["glove", "google"], "pre_emb must be 'glove' or 'google'"
# The Google News vector file used by this notebook is the 300-dimensional one.
if parameters["pre_emb"] == "google":
    assert parameters["w_embed_size"] == 300, "google embeddings require w_embed_size == 300"

# Lower-cased "key:value-key:value-..." summary of the configuration.
param_str = "-".join("%s:%s" % (str(k), str(v)) for (k, v) in parameters.items()).lower()


In [3]:
# Path handed to the Tagger constructor.
# A per-configuration location ("models/ner/%s/" % param_str) used to be assigned
# here and was overwritten on the very next line, so only the fixed path below ever
# took effect. The dead assignment is dropped; restore it in place of this line to
# get one directory per hyper-parameter setting.
model_path = "models/char5_crf_chars/"

# When truthy, model.reload() is called right after the Tagger is constructed.
reload = False

### Load CoNLL03

In [4]:
# CoNLL03
# One load_conll03 call per split. Each split keeps its own chunk variable: the dev
# and test chunks were previously unpacked into chunk_train_03, silently replacing
# the training chunks.
sents_train_03, pos_train_03, chunk_train_03, ner_train_03 = load_conll03(["cleaned_eng.train"])
sents_dev_03, pos_dev_03, chunk_dev_03, ner_dev_03 = load_conll03(["cleaned_eng.testa"])
sents_test_03, pos_test_03, chunk_test_03, ner_test_03 = load_conll03(["cleaned_eng.testb"])

print("Train %s, Dev %s, Test %s" % (len(sents_train_03), len(sents_dev_03), len(sents_test_03)))

if zero_digit:
    # Apply zero_digits to every token of every sentence in all three splits.
    sents_train_03 = [[zero_digits(w) for w in s] for s in sents_train_03]
    sents_test_03 = [[zero_digits(w) for w in s] for s in sents_test_03]
    sents_dev_03 = [[zero_digits(w) for w in s] for s in sents_dev_03]

# All sentences from train + dev + test, in that order. Despite the name this is the
# union of the three splits; it is what the word/char index cell builds its
# vocabularies from.
# Plain list concatenation is used instead of np.concatenate: sentences have
# different lengths, and recent NumPy refuses to build an array from ragged lists.
sents_train = [
    sent
    for split in (sents_train_03, sents_dev_03, sents_test_03)
    for sent in split
]

Loaded CoNLL03 in 2.0433363914489746 seconds
Loaded CoNLL03 in 0.5268721580505371 seconds
Loaded CoNLL03 in 0.4434194564819336 seconds
Train 14041, Dev 3250, Test 3453


### Compute or load mappings

In [5]:
# Use the cached "conll03" index mappings when they exist, otherwise build them
# from the data and cache them. load_mappings is called once and its result reused
# (it used to be called twice: once for the check, once for the values).
mappings = load_mappings("conll03")
if mappings:
    w2idx, c2idx, ner2idx = mappings
    idx2w = reverse_dict(w2idx)
    idx2c = reverse_dict(c2idx)
    idx2ner = reverse_dict(ner2idx)
    # Defined on this path too, so the name exists whichever branch ran.
    num_ner_classes = len(ner2idx)

else:
    # Word vocabulary over sents_train, with an unknown entry appended by
    # add_unknown_last.
    w2idx, idx2w = word_index(sents_train)
    w2idx, idx2w = add_unknown_last(w2idx, idx2w)

    # Tag vocabulary from the training NER tags only. idner_train is recomputed in
    # the next cell with the final ner2idx.
    idner_train, ner2idx, idx2ner = tag_mapping(ner_train_03)
    num_ner_classes = len(ner2idx)

    # Rebuild tag -> id as the exact inverse of id -> tag.
    ner2idx = {v: k for (k, v) in idx2ner.items()}

    # Character vocabulary over sents_train, again with an unknown entry appended.
    c2idx, idx2c = char_index(sents_train)
    c2idx, idx2c = add_unknown_last(c2idx, idx2c)
    save_mappings(w2idx, c2idx, ner2idx, "conll03")

In [6]:
# Run tag_mapping with the shared ner2idx over each split's NER tags,
# in train / dev / test order.
idner_train, idner_dev, idner_test = [
    tag_mapping(split_tags, ner2idx)
    for split_tags in (ner_train_03, ner_dev_03, ner_test_03)
]

### Load pretrained embeddings

In [7]:
# Anything other than "glove" selects the Google News file, which is loaded with
# binary=True; the GloVe text file is loaded with binary=False.
binary = parameters["pre_emb"] != "glove"
embeddings_path = (
    "word_embeddings/google/GoogleNews-vectors-negative300.bin"
    if binary
    else "word_embeddings/glove.6B/glove.6B.%sd_w2vformat.txt" % parameters["w_embed_size"]
)

if not parameters["load_embeds"]:
    # No pretrained vectors are loaded, so the freeze flag is switched off.
    parameters["freeze"] = 0
else:
    loaded_embeddings, (w2idx_embeds, idx2w_embeds) = load_embeddings(embeddings_path, binary=binary)

Loading from saved embeddings
Loading vocab


### Fit word embeddings to vocabulary

In [8]:
# Start both tables from small Gaussian noise: one row per character / word index.
w_dim = parameters["w_embed_size"]
char_embeddings = np.random.normal(scale=0.001, size=(len(c2idx), parameters["c_embed_size"]))
embeddings = np.random.normal(scale=0.001, size=(len(w2idx), w_dim))

if parameters["load_embeds"]:
    # Overwrite the random row of every vocabulary word whose lower-cased form is
    # present in the pretrained vocabulary; words without a match keep their noise.
    for word, row in w2idx.items():
        pretrained_row = w2idx_embeds.get(word.lower())
        if pretrained_row is None:
            continue
        embeddings[row] = loaded_embeddings[pretrained_row][:w_dim]

### Build the model (and optionally reload it)

In [9]:
# Construct the tagger from the model path, the word-embedding table, the hidden
# size and the three index mappings; character-level settings, dropout and the CRF
# switch are passed by keyword.
model = Tagger(
    model_path,
    tensor(embeddings),
    parameters["hidden_size"],
    w2idx,
    c2idx,
    ner2idx,
    char_embeddings=tensor(char_embeddings),
    char_hidden_dim=parameters["char_hidden_size"],
    dropout=parameters["dropout"],
    crf=parameters["crf"],
)

# Only call model.reload() when the `reload` flag set earlier is truthy.
if reload:
    model.reload()

### Train

In [10]:
# Keyword options for the training run.
# NOTE(review): epochs is hard-coded to 1000 here; the `epochs = 20` set in the
# parameters cell is not used - confirm which is intended.
# NOTE(review): eps_noimprov presumably is an early-stopping patience - confirm
# against Tagger.train.
train_options = dict(
    lr_method=parameters["optimizer"],
    lr_decay=0.9,
    batch_size=parameters["batch_size"],
    eps_noimprov=20,
    epochs=1000,
    freeze_embeddings=parameters["freeze"],
)

# Positional arguments: train/dev/test sentences, train/dev/test tag ids, then the
# learning rate.
model.train(
    sents_train_03, sents_dev_03, sents_test_03,
    idner_train, idner_dev, idner_test,
    parameters["lr"],
    **train_options
)

  0%|          | 0/703 [00:00<?, ?it/s]

Epoch 1/1000 :


100%|██████████| 703/703 [05:44<00:00,  2.04it/s]  
100%|██████████| 163/163 [00:32<00:00,  4.86it/s]


ner : p 0.960430107527, r 0.962952799122, f 0.961689798955, acc 0.9564658697091235


  0%|          | 0/173 [00:00<?, ?it/s]

ID     NE  Total      O  B-LOC  B-PER  B-ORG  I-PER  I-ORG B-MISC  I-LOC I-MISC  Percent
 0      O  42759  42474     32     20     52     24     92     42      1     22   99.333
 1  B-LOC   1837    113   1596     13     67      2      9     29      6      2   86.881
 2  B-PER   1842    145     23   1585     10     76      0      3      0      0   86.048
 3  B-ORG   1341    175     27     26   1058      7     28     19      0      1   78.896
 4  I-PER   1307     59      0     50      0   1194      4      0      0      0   91.354
 5  I-ORG    751    143     16      4     73     25    435      4     12     39   57.923
 6 B-MISC    922    158     16     11     29      0      4    702      0      2   76.139
 7  I-LOC    257     37     74      1      0     10     41      0     82     12   31.907
 8 I-MISC    346    122      3      1      4      4     36     44      0    132   38.150
49258/51362 (95.90359%)
NER f1 : 75.61


100%|██████████| 173/173 [00:28<00:00,  7.12it/s]


ner : p 0.948075259516, r 0.94850602566, f 0.948290593669, acc 0.9440938946915043
ID     NE  Total      O  B-LOC  B-PER  B-ORG  I-PER  I-ORG B-MISC  I-LOC I-MISC  Percent
 0      O  38323  37875     42     49     85     34    116     81      7     34   98.831
 1  B-LOC   1668     95   1404     17     86      2     20     39      3      2   84.173
 2  B-PER   1617    163     26   1315     29     77      2      5      0      0   81.323
 3  B-ORG   1661    278     87     17   1162      4     74     38      0      1   69.958
 4  I-PER   1156     43      3     65      1   1027     13      2      2      0   88.841
 5  I-ORG    835    137     24      2    110      7    494      3     24     34   59.162
 6 B-MISC    702    153     13      6     30      5      1    494      0      0   70.370
 7  I-LOC    257     45     69      0      2     11     51      1     69      9   26.848
 8 I-MISC    216     59      1      0      1      2     17     26      1    109   50.463
43949/46435 (94.64628%)
NER 

  0%|          | 0/703 [00:00<?, ?it/s]

New best score on dev.
Saving model...
Epoch 2/1000 :


100%|██████████| 703/703 [03:21<00:00,  3.50it/s]
100%|██████████| 163/163 [00:30<00:00,  4.82it/s]


ner : p 0.972759295499, r 0.974360984789, f 0.973559481374, acc 0.9677972041587165


  1%|          | 1/173 [00:00<00:31,  5.48it/s]

ID     NE  Total      O  B-LOC  B-PER  B-ORG  I-PER  I-ORG B-MISC  I-LOC I-MISC  Percent
 0      O  42759  42555     37     16     27      6     52     51      5     10   99.523
 1  B-LOC   1837     50   1707      3     42      0      8     17      9      1   92.923
 2  B-PER   1842     96     34   1649     11     43      0      8      0      1   89.522
 3  B-ORG   1341    119     28     21   1116      3     26     27      0      1   83.221
 4  I-PER   1307     69      0     33      0   1194      8      0      1      2   91.354
 5  I-ORG    751     98      9      1     44     11    542      3     12     31   72.170
 6 B-MISC    922    104      7      9     20      0      5    774      0      3   83.948
 7  I-LOC    257     26     26      0      0      6     23      0    171      5   66.537
 8 I-MISC    346     85      3      1      1      4     16     24      4    208   60.116
49916/51362 (97.18469%)
NER f1 : 83.36


100%|██████████| 173/173 [00:29<00:00,  6.26it/s]


ner : p 0.957806820149, r 0.957744650468, f 0.9577757343, acc 0.9532895445246042
ID     NE  Total      O  B-LOC  B-PER  B-ORG  I-PER  I-ORG B-MISC  I-LOC I-MISC  Percent
 0      O  38323  37884     47     32     83     11    104    102      9     51   98.854
 1  B-LOC   1668     52   1478      4     65      1     22     34     10      2   88.609
 2  B-PER   1617    134     39   1336     52     41      5      9      0      1   82.622
 3  B-ORG   1661    229    100     11   1236      2     42     40      1      0   74.413
 4  I-PER   1156     67      1     30      2   1035     17      2      1      1   89.533
 5  I-ORG    835    111      9      1     58      7    601      3     24     21   71.976
 6 B-MISC    702    110     17      9     25      0      5    533      0      3   75.926
 7  I-LOC    257     28     21      0      3      4     34      1    163      3   63.424
 8 I-MISC    216     56      0      0      1      1      7     11      3    137   63.426
44403/46435 (95.62399%)
NER f

  0%|          | 0/703 [00:00<?, ?it/s]

New best score on dev.
Saving model...
Epoch 3/1000 :


100%|██████████| 703/703 [03:16<00:00,  3.58it/s]
100%|██████████| 163/163 [00:30<00:00,  4.82it/s]


ner : p 0.975614533346, r 0.976360357535, f 0.975987302956, acc 0.9697831081344185


  1%|          | 1/173 [00:00<00:31,  5.48it/s]

ID     NE  Total      O  B-LOC  B-PER  B-ORG  I-PER  I-ORG B-MISC  I-LOC I-MISC  Percent
 0      O  42759  42507     36     13     50      5     41     70      2     35   99.411
 1  B-LOC   1837     36   1718      3     46      1      5     24      2      2   93.522
 2  B-PER   1842     92     29   1671     26     12      0     12      0      0   90.717
 3  B-ORG   1341     94     21     16   1171      2     18     19      0      0   87.323
 4  I-PER   1307     68      0     44      0   1184      7      0      2      2   90.589
 5  I-ORG    751     81      7      2     50     10    561      5      6     29   74.700
 6 B-MISC    922     74      5      9     18      0      3    809      0      4   87.744
 7  I-LOC    257     17     25      1      0      3     17      0    189      5   73.541
 8 I-MISC    346     64      2      1      1      4     15     26      3    230   66.474
50040/51362 (97.42611%)
NER f1 : 84.6


100%|██████████| 173/173 [00:29<00:00,  7.59it/s]


ner : p 0.960966099859, r 0.959843354465, f 0.96040439903, acc 0.9553784860557769
ID     NE  Total      O  B-LOC  B-PER  B-ORG  I-PER  I-ORG B-MISC  I-LOC I-MISC  Percent
 0      O  38323  37832     49     19    114     13     71    143      4     78   98.719
 1  B-LOC   1668     37   1495      7     76      0      9     40      2      2   89.628
 2  B-PER   1617    112     30   1374     59     25      3     11      1      2   84.972
 3  B-ORG   1661    162     93      8   1325      0     29     42      1      1   79.771
 4  I-PER   1156     69      3     40      4   1016     14      3      5      2   87.889
 5  I-ORG    835     92     19      1     72      8    590      3     22     28   70.659
 6 B-MISC    702     92     14      6     27      0      2    556      0      5   79.202
 7  I-LOC    257     21     22      0      3      1     30      0    175      5   68.093
 8 I-MISC    216     42      0      0      2      1     10     13      2    146   67.593
44509/46435 (95.85227%)
NER 

  0%|          | 0/703 [00:00<?, ?it/s]

New best score on dev.
Saving model...
Epoch 4/1000 :


100%|██████████| 703/703 [03:18<00:00,  3.55it/s]
100%|██████████| 163/163 [00:32<00:00,  4.94it/s]


ner : p 0.974964739069, r 0.975576289791, f 0.975270418561, acc 0.9690043222615942


  0%|          | 0/173 [00:00<?, ?it/s]

ID     NE  Total      O  B-LOC  B-PER  B-ORG  I-PER  I-ORG B-MISC  I-LOC I-MISC  Percent
 0      O  42759  42287     56     52     51     14    112    130     15     42   98.896
 1  B-LOC   1837     22   1745      2     40      1      4     16      7      0   94.992
 2  B-PER   1842     48     27   1717     11     30      1      8      0      0   93.214
 3  B-ORG   1341     61     31     31   1170      1     17     26      3      1   87.248
 4  I-PER   1307     41      0     29      0   1226      8      0      1      2   93.803
 5  I-ORG    751     56      9      1     40     16    597      4     11     17   79.494
 6 B-MISC    922     50     11     10     25      0      7    815      0      4   88.395
 7  I-LOC    257     13     11      1      0      4     15      0    213      0   82.879
 8 I-MISC    346     49      2      1      1      4     14     19      8    248   71.676
50018/51362 (97.38328%)
NER f1 : 85.23


100%|██████████| 173/173 [00:28<00:00,  7.61it/s]


ner : p 0.962071654464, r 0.960968432896, f 0.961519727228, acc 0.956498331000323
ID     NE  Total      O  B-LOC  B-PER  B-ORG  I-PER  I-ORG B-MISC  I-LOC I-MISC  Percent
 0      O  38323  37617     70     57    109     22    159    179     19     91   98.158
 1  B-LOC   1668     27   1529      7     63      1     11     22      6      2   91.667
 2  B-PER   1617     72     34   1422     42     30      4     10      2      1   87.941
 3  B-ORG   1661    125    100     23   1327      4     36     44      0      2   79.892
 4  I-PER   1156     25      1     26      1   1086     12      0      2      3   93.945
 5  I-ORG    835     53     13      2     43      9    674      2     27     12   80.719
 6 B-MISC    702     63     16      9     36      2      2    568      0      6   80.912
 7  I-LOC    257     16     13      0      1      4     29      0    192      2   74.708
 8 I-MISC    216     39      0      0      1      0     15      6      6    149   68.981
44564/46435 (95.97071%)
NER 

  0%|          | 0/703 [00:00<?, ?it/s]

New best score on dev.
Saving model...
Epoch 5/1000 :


100%|██████████| 703/703 [03:12<00:00,  3.65it/s]
100%|██████████| 163/163 [00:29<00:00,  5.45it/s]


ner : p 0.976754678489, r 0.976027128744, f 0.976390768084, acc 0.9694521241384682


  0%|          | 0/173 [00:00<?, ?it/s]

ID     NE  Total      O  B-LOC  B-PER  B-ORG  I-PER  I-ORG B-MISC  I-LOC I-MISC  Percent
 0      O  42759  42369     29     44     46      7     61    131      2     70   99.088
 1  B-LOC   1837     36   1690      5     62      1      4     33      1      5   91.998
 2  B-PER   1842     38     19   1745     12     15      1     11      0      1   94.734
 3  B-ORG   1341     61     17     29   1184      1     21     27      0      1   88.292
 4  I-PER   1307     55      0     39      1   1195     12      1      0      4   91.431
 5  I-ORG    751     57      4      0     40     11    597      3      5     34   79.494
 6 B-MISC    922     54      5      8     26      0      5    818      0      6   88.720
 7  I-LOC    257     16     12      0      0      6     23      0    195      5   75.875
 8 I-MISC    346     46      0      1      0      4     12     21      4    258   74.566
50051/51362 (97.44753%)
NER f1 : 85.06


100%|██████████| 173/173 [00:28<00:00,  6.61it/s]


ner : p 0.962378640777, r 0.96081698003, f 0.961597176358, acc 0.9563475826424034
ID     NE  Total      O  B-LOC  B-PER  B-ORG  I-PER  I-ORG B-MISC  I-LOC I-MISC  Percent
 0      O  38323  37688     37     62    115     14    111    189      6    101   98.343
 1  B-LOC   1668     41   1480      7     81      0     15     40      2      2   88.729
 2  B-PER   1617     78     22   1441     40     16      4     12      1      3   89.116
 3  B-ORG   1661    142     68     26   1340      0     35     48      0      2   80.674
 4  I-PER   1156     43      0     45      2   1041     17      3      1      4   90.052
 5  I-ORG    835     63      7      1     46      7    673      2     17     19   80.599
 6 B-MISC    702     75     13     10     28      0      0    571      0      5   81.339
 7  I-LOC    257     15     12      0      2      4     42      0    174      8   67.704
 8 I-MISC    216     41      0      0      0      0     14     11      3    147   68.056
44555/46435 (95.95133%)
NER 

  0%|          | 0/703 [00:00<?, ?it/s]

Epoch 6/1000 :


 18%|█▊        | 125/703 [00:34<02:38,  3.64it/s]

KeyboardInterrupt: 