## Validation Data Miss Labels

In [2]:
import json
import random
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import AutoModel, AutoTokenizer, AutoConfig, AdamW, get_linear_schedule_with_warmup
from data_utils import (
    load_dataset, 
    get_examples_from_dialogues, 
    convert_state_dict, 
    DSTInputExample, 
    OpenVocabDSTFeature, 
    DSTPreprocessor, 
    WOSDataset,
    set_seed)

from preprocessor import TRADEPreprocessor
from model import TRADE
from inference import inference
from eval_ucompute_acc

from prettyprinter import cpprint

In [3]:
# set random seed
set_seed(42)

In [10]:
# set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
train_data_file = "../input/data/train_dataset/train_dials.json"
slot_meta = json.load(open("../input/data/train_dataset/slot_meta.json"))
train_data, dev_data, dev_labels = load_dataset(train_data_file)

train_examples = get_examples_from_dialogues(train_data,
                                             user_first=False,
                                             dialogue_level=False)
dev_examples = get_examples_from_dialogues(dev_data,
                                           user_first=False,
                                           dialogue_level=False)

100%|██████████| 6301/6301 [00:00<00:00, 8355.98it/s] 
100%|██████████| 699/699 [00:00<00:00, 15636.87it/s]


In [5]:
tokenizer = AutoTokenizer.from_pretrained('dsksd/bert-ko-small-minimal')
processor = TRADEPreprocessor(slot_meta, tokenizer, max_seq_length=512)

In [6]:
# Extracting Featrues
dev_features = processor.convert_examples_to_features(dev_examples)
dev_data = WOSDataset(dev_features)
dev_sampler = SequentialSampler(dev_data)
dev_loader = DataLoader(
    dev_data,
    batch_size=8,
    sampler=dev_sampler,
    collate_fn=processor.collate_fn,
)

Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors


In [8]:
# Slot Meta tokenizing for the decoder initial inputs
tokenized_slot_meta = []
for slot in slot_meta:
    tokenized_slot_meta.append(
        tokenizer.encode(slot.replace("-", " "), add_special_tokens=False)
    )

# Model 선언
config = AutoConfig.from_pretrained('dsksd/bert-ko-small-minimal')
config.model_name_or_path = 'dsksd/bert-ko-small-minimal'
config.n_gate = 5
config.proj_dim = None


In [11]:
model = TRADE(config, tokenized_slot_meta)
ckpt = torch.load('./result/electric-spaceship-30-best-26.pth', map_location="cpu")
model.load_state_dict(ckpt)
model.to(device)

TRADE(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
   

#### 수정한 부분
- self.miss_labels = {} 추가
- update_labels 함수 추가

In [12]:
# eval_utils.py
class DSTEvaluator:
    def __init__(self, slot_meta):
        self.slot_meta = slot_meta
        self.miss_labels = {}
        self.init()

    def init(self):
        self.joint_goal_hit = 0
        self.all_hit = 0
        self.slot_turn_acc = 0
        self.slot_F1_pred = 0
        self.slot_F1_count = 0
        
    def update_labels(self, guid, gold, pred):
        pred_set = set(pred)
        gold_set = set(gold)
        if pred_set != gold_set:
            unpred_labels = sorted(list(gold_set - pred_set))
            wrong_preds = sorted(list(pred_set-gold_set))
            self.miss_labels[guid] = {'unpred_labels':unpred_labels, 'wrong_preds':wrong_preds}
        
        
    def update(self, gold, pred):
        # 매 turn마다 호출되는 함수
        # gold, pred: List of state (=slot-value)
        self.all_hit += 1
        if set(pred) == set(gold):
            self.joint_goal_hit += 1

        temp_acc = compute_acc(gold, pred, self.slot_meta)
        self.slot_turn_acc += temp_acc

        temp_f1, _, _, count = compute_prf(gold, pred)
        self.slot_F1_pred += temp_f1
        self.slot_F1_count += count

    def compute(self):
        turn_acc_score = self.slot_turn_acc / self.all_hit
        slot_F1_score = self.slot_F1_pred / self.slot_F1_count
        joint_goal_accuracy = self.joint_goal_hit / self.all_hit
        eval_result = {
            "joint_goal_accuracy": joint_goal_accuracy,
            "turn_slot_accuracy": turn_acc_score,
            "turn_slot_f1": slot_F1_score,
        }
        return eval_result

#### 수정한 부분
- evaluator.update_labels(k, l, p) 추가
- miss_labels = evaluator.miss_labels 추가
- return result, miss_labels

In [13]:
# evaluation.py
def _evaluation(preds, labels, slot_meta): # predictions, dev_labels, slot_meta)
    evaluator = DSTEvaluator(slot_meta)

    evaluator.init()
    assert len(preds) == len(labels)

    # k: guid, l: [state, state, ...]
    for k, l in labels.items():
        p = preds.get(k)
        if p is None: # predictions에 에측된 states가 None인 경우
            raise Exception(f"{k} is not in the predictions!")
        evaluator.update(l, p)
        evaluator.update_labels(k, l, p)
        
    miss_labels = evaluator.miss_labels
    result = evaluator.compute()
    #print(result)
    return result, miss_labels


In [15]:
predictions = inference(model, dev_loader, processor, device)

100%|██████████| 635/635 [02:06<00:00,  5.03it/s]


In [16]:
eval_result, miss_labels = _evaluation(predictions, dev_labels, slot_meta)

NameError: name 'compute_acc' is not defined

In [None]:
cpprint(miss_labels)