### Conll 2003 nmt evaluation

Data downloaded from [here](https://github.com/kyzhouhzau/BERT-NER/tree/master/NERdata).

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import warnings
import os
import sys

sys.path.append("../")

warnings.filterwarnings("ignore")

In [65]:
data_path = "/datadrive/conll-2003/"

train_path = data_path + "train.txt"
dev_path = data_path + "dev.txt"
test_path = data_path + "test.txt"

### 0. Prc data for csv format

In [66]:
import codecs


def read_data(input_file):
    """Reads a BIO data."""
    with codecs.open(input_file, "r", encoding="utf-8") as f:
        lines = []
        words = []
        labels = []
        for line in f:
            contends = line.strip()
            word = line.strip().split(' ')[0]
            label = line.strip().split(' ')[-1]
            if contends.startswith("-DOCSTART-"):
                words.append('')
                continue
            
            if len(contends) == 0 and not len(words):
                words.append("")
            
            if len(contends) == 0 and words[-1] == '.':
                l = ' '.join([label for label in labels if len(label) > 0])
                w = ' '.join([word for word in words if len(word) > 0])
                lines.append([l, w])
                words = []
                labels = []
                continue
            words.append(word)
            labels.append(label.replace("-", "_"))
        return lines


In [67]:
train_f = read_data(train_path)
dev_f = read_data(dev_path)
test_f = read_data(test_path)

In [None]:
[l for l in train_f]

In [68]:
len(train_f), len(dev_f), len(test_f)

(6973, 1739, 1559)

In [69]:
train_f[0]

['B_ORG O B_MISC O O O B_MISC O O',
 'EU rejects German call to boycott British lamb .']

In [70]:
import pandas as pd

In [71]:
train_df = pd.DataFrame(train_f, columns=["0", "1"])
train_df.to_csv(data_path + "train.csv", index=False)

In [72]:
valid_df = pd.DataFrame(dev_f, columns=["0", "1"])
valid_df.to_csv(data_path + "valid.csv", index=False)

In [73]:
test_df = pd.DataFrame(test_f, columns=["0", "1"])
test_df.to_csv(data_path + "test.csv", index=False)

### 1. Create data loaders

In [2]:
import os

data_path = "/datadrive/conll-2003/"
train_path = data_path + "train.csv"
valid_path = data_path + "valid.csv"
test_path = data_path + "test.csv"

model_dir = " /datadrive/models/multi_cased_L-12_H-768_A-12/"
init_checkpoint_pt = os.path.join("/datadrive/models/multi_cased_L-12_H-768_A-12/", "pytorch_model.bin")
bert_config_file = os.path.join("/datadrive/bert/multi_cased_L-12_H-768_A-12/", "bert_config.json")
vocab_file = os.path.join("/datadrive/bert/multi_cased_L-12_H-768_A-12/", "vocab.txt")

In [3]:
import torch
torch.cuda.set_device(1)
torch.cuda.is_available(), torch.cuda.current_device()

(True, 1)

In [4]:
from modules import BertNerData as NerData

INFO:summarizer.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English


In [5]:
data = NerData.create(train_path, valid_path, vocab_file)

In [6]:
len(data.train_dl.dataset), len(data.valid_dl.dataset)

(6973, 1739)

In [7]:
print(data.id2label)

['<pad>', '[CLS]', '[SEP]', 'B_ORG', 'B_O', 'I_O', 'B_MISC', 'B_PER', 'I_PER', 'B_LOC', 'I_LOC', 'I_ORG', 'I_MISC']


In [8]:
sup_labels = ['B_ORG', 'B_MISC', 'B_PER', 'I_PER', 'B_LOC', 'I_LOC', 'I_ORG', 'I_MISC']

In [9]:
max([len(f.labels_ids) for f in data.train_dl.dataset])

424

### 2. Create model

In [10]:
from modules.models.bert_models import BertBiLSTMAttnNMT

In [11]:
model = BertBiLSTMAttnNMT.create(len(data.label2idx), bert_config_file, init_checkpoint_pt,
                                 enc_hidden_dim=128, dec_hidden_dim=128, dec_embedding_dim=16)

In [12]:
model.get_n_trainable_params()

652906

#### TODO: fix bug with len

### 3. Create Learner

In [13]:
from modules import NerLearner

In [14]:
num_epochs = 100
learner = NerLearner(model, data,
                     best_model_path="/datadrive/models/conll-2003/bilstm_attn_cased.cpt",
                     lr=0.01, clip=1.0, sup_labels=data.id2label[5:],
                     t_total=num_epochs * len(data.train_dl))

INFO:root:Don't use lr scheduler...


### 4. Start learning

In [None]:
learner.fit(num_epochs, target_metric='prec')

### 5. Evaluate dev set

In [16]:
from modules.data.bert_data import get_bert_data_loader_for_predict
dl = get_bert_data_loader_for_predict(data_path + "valid.csv", learner)

In [17]:
learner.load_model()

In [18]:
preds = learner.predict(dl)

HBox(children=(IntProgress(value=0, max=109), HTML(value='')))

IOB precision

In [19]:
from modules.train.train import validate_step
print(validate_step(learner.data.valid_dl, learner.model, learner.data.id2label, learner.sup_labels))

HBox(children=(IntProgress(value=0, max=109), HTML(value='')))

              precision    recall  f1-score   support

       B_ORG      0.922     0.934     0.928      1282
      B_MISC      0.924     0.892     0.908       905
       B_PER      0.973     0.970     0.972      1686
       I_PER      0.985     0.974     0.980      3488
       B_LOC      0.953     0.958     0.956      1669
       I_LOC      0.956     0.936     0.946      1913
       I_ORG      0.910     0.927     0.918      2129
      I_MISC      0.860     0.838     0.849      1061

   micro avg      0.946     0.940     0.943     14133
   macro avg      0.936     0.928     0.932     14133
weighted avg      0.946     0.940     0.943     14133



Span precision

In [20]:
from modules.utils.plot_metrics import get_bert_span_report
clf_report = get_bert_span_report(dl, preds, [])
print(clf_report)

              precision    recall  f1-score   support

        MISC      0.870     0.863     0.866       905
         ORG      0.815     0.836     0.826      1282
         PER      0.930     0.928     0.929      1686
           O      0.990     0.989     0.990     41801
         LOC      0.895     0.904     0.899      1669

   micro avg      0.977     0.977     0.977     47343
   macro avg      0.900     0.904     0.902     47343
weighted avg      0.978     0.977     0.978     47343



### 6. Evaluate test set

In [21]:
from modules.data.bert_data import get_bert_data_loader_for_predict
dl = get_bert_data_loader_for_predict(data_path + "test.csv", learner)

In [22]:
preds = learner.predict(dl)

HBox(children=(IntProgress(value=0, max=98), HTML(value='')))

IOB precision

In [23]:
from modules.train.train import validate_step
print(validate_step(learner.data.valid_dl, learner.model, learner.data.id2label, learner.sup_labels))

HBox(children=(IntProgress(value=0, max=109), HTML(value='')))

              precision    recall  f1-score   support

       B_ORG      0.922     0.934     0.928      1282
      B_MISC      0.924     0.892     0.908       905
       B_PER      0.973     0.970     0.972      1686
       I_PER      0.985     0.974     0.980      3488
       B_LOC      0.953     0.958     0.956      1669
       I_LOC      0.956     0.936     0.946      1913
       I_ORG      0.910     0.927     0.918      2129
      I_MISC      0.860     0.838     0.849      1061

   micro avg      0.946     0.940     0.943     14133
   macro avg      0.936     0.928     0.932     14133
weighted avg      0.946     0.940     0.943     14133



Span precision

In [24]:
from modules.utils.plot_metrics import get_bert_span_report
clf_report = get_bert_span_report(dl, preds, [])
print(clf_report)

              precision    recall  f1-score   support

        MISC      0.758     0.778     0.768       688
         ORG      0.656     0.683     0.669      1533
         PER      0.864     0.859     0.861      1566
           O      0.980     0.977     0.979     37690
         LOC      0.834     0.851     0.843      1570

   micro avg      0.955     0.955     0.955     43047
   macro avg      0.818     0.830     0.824     43047
weighted avg      0.955     0.955     0.955     43047

