In [23]:
import os
import argparse
import pickle
import torch
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from inference_helper import inference 
from model import MyModel
from dataloader import load_t1_data
from transformers import AutoTokenizer

# Split the checkpoint path into its run directory and checkpoint file name;
# the pickled training arguments ('args') are read from the same directory.
model_dir, file = os.path.split("./checkpoints/zalo/2020_11_19_16_54_41/checkpoint_9.ckpt")

# NOTE(security): pickle.load can execute arbitrary code while unpickling.
# Only load 'args' files that were produced by a trusted training run.
# The `with` block guarantees the file handle is closed after loading.
with open(os.path.join(model_dir, 'args'), 'rb') as args_file:
    config = pickle.load(args_file)

# Load the checkpoint onto the CPU first so this cell works with or without a GPU.
checkpoint = torch.load(os.path.join(model_dir, file), map_location=torch.device("cpu"))
model_state_dict = checkpoint['model_state_dict']

In [24]:
# Override the threshold stored in the loaded config for this inference run.
config.threshold = 1
tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model_path)
mymodel = MyModel(config)

# strict=False tolerates key mismatches between checkpoint and model. Report
# them explicitly so a wrong checkpoint cannot silently leave layers at their
# freshly initialised weights.
load_result = mymodel.load_state_dict(model_state_dict, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print("load_state_dict ignored keys -> missing: {} | unexpected: {}".format(
        list(load_result.missing_keys), list(load_result.unexpected_keys)))

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
mymodel.to(device)
# This notebook only runs inference: switch off dropout etc. Harmless if
# inference() also sets eval mode. eval() returns the module, so the cell
# still displays the model summary as before.
mymodel.eval()

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


MyModel(
  (bert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=

In [25]:
# Build the loader for the test split, then run the model over it.
# NOTE(review): the four trailing integers are passed positionally to
# load_t1_data; their meaning is not visible from this notebook -- confirm
# against dataloader.load_t1_data before changing them.
test_dataloader = load_t1_data(
    config.dataset_tag,
    "./data/cleaned_data/zalo/test.json",
    config.pretrained_model_path,
    150, 45, 20, 512,
)

# inference() yields two prediction collections, consumed below as
# t1_predict and t2_predict.
predictions = inference(mymodel, test_dataloader, config.threshold, True)
t1_predict, t2_predict = predictions

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
t1_dataset: 100%|██████████| 211/211 [00:00<00:00, 218.63it/s]
t1 predict: 100%|██████████| 267/267 [00:23<00:00, 11.20it/s]
t2 dataset: 100%|█████████▉| 5340/5348 [00:03<00:00, 1517.27it/s]
t2 predict: 100%|██████████| 1477/1477 [01:12<00:00, 20.25it/s]


In [26]:
# Decode the predicted spans back to text and write one prediction file per
# passage: entity lines ("type<TAB>text") followed by relation lines
# ("relation<TAB>head<TAB>end").
passages = test_dataloader.dataset.passages
ids = test_dataloader.dataset.ids
entities, relations = {}, {}

# Group predicted entity mentions by passage index.
# passages[p_id] is sliced and handed to convert_tokens_to_string, so it is
# presumably a token sequence -- verify against the dataset class.
for p_id, (ner_type, start_pos, end_pos) in t1_predict:
    span_tokens = passages[p_id][start_pos:end_pos]
    entities.setdefault(p_id, []).append(
        (ner_type, tokenizer.convert_tokens_to_string(span_tokens))
    )

# Group predicted relations by passage index.
# The head span is decoded into head_entity and the end span into end_entity;
# previously the two slices were assigned to each other's variable, so the
# written columns were reversed.
for p_id, ((head_entity_type, head_start, head_stop), relation_type, (end_entity_type, end_start, end_stop)) in t2_predict:
    head_entity = tokenizer.convert_tokens_to_string(passages[p_id][head_start:head_stop])
    end_entity = tokenizer.convert_tokens_to_string(passages[p_id][end_start:end_stop])
    relations.setdefault(p_id, []).append((relation_type, head_entity, end_entity))

# Make sure the output directory exists before writing into it.
os.makedirs("./predictions", exist_ok=True)

for p_id in range(len(passages)):
    # Explicit UTF-8 so non-ASCII text is written identically on every platform.
    # A file is created for every passage, even when nothing was predicted.
    with open("./predictions/{}.txt".format(ids[p_id]), "w", encoding="utf-8") as fout:
        for entity in entities.get(p_id, []):
            fout.write("{}\t{}\n".format(entity[0], entity[1]))
        for relation in relations.get(p_id, []):
            fout.write("{}\t{}\t{}\n".format(relation[0], relation[1], relation[2]))
