### sumbt_baseline(for wandb sweep)
- checkpoint
    - add checkpoint
    - add checkpoint saving process
    - update checkpoint type(available countinous training)
- update validation per epoch or minimal loss
- add wandb

In [1]:
import sys
sys.path.append('..')

In [2]:
import os
from pathlib import Path
import json
from tqdm import tqdm
import random

import torch
import numpy as np
from transformers import BertTokenizer
from data_utils import get_examples_from_dialogues, convert_state_dict, load_dataset
from data_utils import OntologyDSTFeature, DSTPreprocessor, _truncate_seq_pair

### wandb

In [3]:
import wandb
# !wandb login  # run once

In [4]:
def increment_output_dir(output_path, exist_ok=False):
  path = Path(output_path)
  if (path.exists() and exist_ok) or (not path.exists()):
    return str(path)
  else:
    dirs = glob.glob(f"{path}*")
    matches = [re.search(rf"%s(\d+)" %path.stem, d) for d in dirs]
    i = [int(m.groups()[0]) for m in matches if m]
    n = max(i) + 1 if i else 2
    return f"{path}{n}"

### argparse setting

In [5]:
from argparse import Namespace

args = {
    'batch_size': 8,  # 8
    'hidden_dim': 300,
    'num_rnn_layers': 1,
    'zero_init_rnn': False,
    'max_seq_length': 64,
    'max_label_length': 12,
    'attn_head': 4,  # 4
    'fix_utterance_encoder': False,
    'task_name': 'sumbtgru',
    'distance_metric': 'euclidean',
    'model_name_or_path': 'dsksd/bert-ko-small-minimal',
    'warmup_ratio': 0.1,
    'learning_rate': 5e-5,  # 5e-5
    'weight_decay': 0.01,  # 0.01
    'num_train_epochs': 10
}

args = Namespace(**args)

In [6]:
# wandb sweep 생성 시 parameters에 전달하는 config 설정
hyperparameter_defaults = dict(
    batch_size = args.batch_size,
    learning_rate = args.learning_rate,
    epochs = args.num_train_epochs,
    weight_decay = args.weight_decay,
    attn_head = args.attn_head,
    distance_metric = args.distance_metric,
    
#     dropout = 0.1,
#     smoothing = 0.2
#     model_name = 'BertForSequenceClassification',
#     tokenizer_name = 'BertTokenizer',
    )

# wandb.init(config=hyperparameter_defaults, project="SUMBT-sweep")
# config = wandb.config

- wandb sweep config

In [7]:
sweep_config = {
  "name": "SUMBT-sweep",
  "method": "bayes",
  "metric": {
      "goal": "maximize",
      "name": "Joint Goal Accuracy"},
  "parameters": {
      "attn_head": {
          "distribution": "int_uniform",
          "max": 12,
          "min": 4
      },
      "batch_size": {
          "distribution": "int_uniform",
          "max": 12,
          "min": 8
      },
      "distance_metric": {
          "distribution": "categorical",
          "values": ["euclidean", "cosine"]
      },
      "learning_rate": {
          "distribution": "uniform",
          "max": 1e-03,
          "min": 5e-05
      },
      "weight_decay": {
          "distribution": "uniform",
          "max": 0.02,
          "min": 0.005
      }
    }
}

sweep_id = wandb.sweep(sweep_config, project="SUMBT-sweep")
sweep_id

Create sweep with ID: 4yym5838
Sweep URL: https://wandb.ai/taepd/SUMBT-sweep/sweeps/4yym5838


'4yym5838'

In [8]:
def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU        
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)

seed_everything(42)

## Data Loading 

In [9]:
train_data_file = "/opt/ml/repo/taepd/input/data/train_dataset/train_dials.json"
slot_meta = json.load(open("/opt/ml/repo/taepd/input/data/train_dataset/slot_meta.json"))
ontology = json.load(open("/opt/ml/repo/taepd/input/data/train_dataset/ontology.json"))
train_data, dev_data, dev_labels = load_dataset(train_data_file)

In [10]:
train_examples = get_examples_from_dialogues(data=train_data,
                                             user_first=True,
                                             dialogue_level=True)

dev_examples = get_examples_from_dialogues(data=dev_data,
                                           user_first=True,
                                           dialogue_level=True)

100%|██████████| 6301/6301 [00:00<00:00, 8456.91it/s] 
100%|██████████| 699/699 [00:00<00:00, 11855.31it/s]


In [11]:
len(train_data)

6301

In [12]:
max_turn = max([len(e['dialogue']) for e in train_data])
tokenizer = BertTokenizer.from_pretrained('dsksd/bert-ko-small-minimal')

In [13]:
print(max_turn)

34


## TODO-1: SUMBT Preprocessor 정의 

Ontology-based DST model인 SUMBT의 InputFeature를 만들기 위한 Preprocessor를 정의해야 합니다. <br>

1. `_convert_examples_to_features` 함수의 빈칸을 매워 완성하세요.
2. `recover_state` 함수의 빈칸을 매워 완성하세요.

In [14]:
class SUMBTPreprocessor(DSTPreprocessor):
    def __init__(
        self,
        slot_meta,
        src_tokenizer,
        trg_tokenizer=None,
        ontology=None,
        max_seq_length=64,
        max_turn_length=12,
    ):
        self.slot_meta = slot_meta
        self.src_tokenizer = src_tokenizer
        self.trg_tokenizer = trg_tokenizer if trg_tokenizer else src_tokenizer
        self.ontology = ontology
        self.max_seq_length = max_seq_length
        self.max_turn_length = max_turn_length

    def _convert_example_to_feature(self, example):
        guid = example[0].guid.rsplit("-", 1)[0]  # dialogue_idx
        turns = []
        token_types = []
        labels = []
        num_turn = None
        for turn in example[: self.max_turn_length]:
            assert len(turn.current_turn) == 2
            uttrs = []
            for segment_idx, uttr in enumerate(turn.current_turn):
                token = self.src_tokenizer.encode(uttr, add_special_tokens=False)
                uttrs.append(token)

            _truncate_seq_pair(uttrs[0], uttrs[1], self.max_seq_length - 3)
            tokens = (
                [self.src_tokenizer.cls_token_id]
                + uttrs[0]
                + [self.src_tokenizer.sep_token_id]
                + uttrs[1]
                + [self.src_tokenizer.sep_token_id]
            )
            token_type = [0] * (len(uttrs[0]) + 2) + [1] * (len(uttrs[1]) + 1)
            if len(tokens) < self.max_seq_length:
                gap = self.max_seq_length - len(tokens)
                tokens.extend([self.src_tokenizer.pad_token_id] * gap)
                token_type.extend([0] * gap)
            turns.append(tokens)
            token_types.append(token_type)
            label = []
            if turn.label:
                slot_dict = convert_state_dict(turn.label)
            else:
                slot_dict = {}
            for slot_type in self.slot_meta:
                value = slot_dict.get(slot_type, "none")
                # TODO
                # raise Exception('label_idx를 ontology에서 꺼내오는 코드를 작성하세요!')
#                 label_idx = self.ontology[slot_type].index(value)  # 이렇게 해도 될듯한데?
                if value in self.ontology[slot_type]:
                    label_idx = self.ontology[slot_type].index(value)
                else:
                    label_idx = self.ontology[slot_type].index("none")
                label.append(label_idx)
            labels.append(label)
        num_turn = len(turns)
        if len(turns) < self.max_turn_length:
            gap = self.max_turn_length - len(turns)
            for _ in range(gap):
                dummy_turn = [self.src_tokenizer.pad_token_id] * self.max_seq_length
                turns.append(dummy_turn)
                token_types.append(dummy_turn)
                dummy_label = [-1] * len(self.slot_meta)
                labels.append(dummy_label)
        return OntologyDSTFeature(
            guid=guid,
            input_ids=turns,
            segment_ids=token_types,
            num_turn=num_turn,
            target_ids=labels,
        )

    def convert_examples_to_features(self, examples):
        return list(map(self._convert_example_to_feature, examples))

    def recover_state(self, pred_slots, num_turn):
        states = []
        for pred_slot in pred_slots[:num_turn]:
            state = []
            for s, p in zip(self.slot_meta, pred_slot):
                v = self.ontology[s][p]
                if v != "none":
                    state.append(f"{s}-{v}")
            states.append(state)
        return states

    def collate_fn(self, batch):
        guids = [b.guid for b in batch]
        input_ids = torch.LongTensor([b.input_ids for b in batch])
        segment_ids = torch.LongTensor([b.segment_ids for b in batch])
        input_masks = input_ids.ne(self.src_tokenizer.pad_token_id)
        target_ids = torch.LongTensor([b.target_ids for b in batch])
        num_turns = [b.num_turn for b in batch]
        return input_ids, segment_ids, input_masks, target_ids, num_turns, guids

## Convert_Examples_to_Features 

In [15]:
processor = SUMBTPreprocessor(slot_meta,
                              tokenizer,
                              ontology=ontology,  # predefined ontology
                              max_seq_length=64,  # 각 turn마다 최대 길이
                              max_turn_length=max_turn)  # 각 dialogue의 최대 turn 길이
train_features = processor.convert_examples_to_features(train_examples)
dev_features = processor.convert_examples_to_features(dev_examples)

In [16]:
print(len(train_features))  # 대화 level의 features
print(len(dev_features))

6301
699


In [17]:
f = train_features[0]

print(f.guid)  # 대화 unique_id
print(f.num_turn)  # 실제 대화의 turn 길이 == T
print(len(f.input_ids))  # input_ids의 턴 길이 (max_turn_length == 현재 34)
print(len(f.input_ids[0]))  # input_ids에서 각 턴의 최대 길이 (max_seq_length == 64)
print(len(f.segment_ids))  # segment_ids의 턴 길이 (max_turn_length == 34)
print(len(f.target_ids))  # target_ids의 갯수 (턴마다의 State == max_turn_length == 34)
print(len(f.target_ids[0]))  # 각 턴마다 target의 갯수 == number of Slot Meta (== 45)

snowy-hat-8324:관광_식당_11
8
34
64
34
34
45


## SUMBT 모델 선언 

In [18]:
"""
Most of code is from https://github.com/SKTBrain/SUMBT
"""

import math
import os.path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CosineEmbeddingLoss, CrossEntropyLoss
from transformers import BertModel, BertPreTrainedModel


class BertForUtteranceEncoding(BertPreTrainedModel):
    def __init__(self, config):
        super(BertForUtteranceEncoding, self).__init__(config)

        self.config = config
        self.bert = BertModel(config)

    def forward(self, input_ids, token_type_ids, attention_mask):
        return self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=False,
            output_hidden_states=False,
            return_dict=False,
        )


class MultiHeadAttention(nn.Module):
    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()

        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

        self.scores = None

    def attention(self, q, k, v, d_k, mask=None, dropout=None):

        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)

        if mask is not None:
            mask = mask.unsqueeze(1)
            scores = scores.masked_fill(mask == 0, -1e9)
        scores = F.softmax(scores, dim=-1)

        if dropout is not None:
            scores = dropout(scores)

        self.scores = scores
        output = torch.matmul(scores, v)
        return output

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)

        # perform linear operation and split into h heads
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)

        # transpose to get dimensions bs * h * sl * d_model
        k = k.transpose(1, 2)
        q = q.transpose(1, 2)
        v = v.transpose(1, 2)

        scores = self.attention(q, k, v, self.d_k, mask, self.dropout)

        # concatenate heads and put through final linear layer
        concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        output = self.out(concat)
        return output

    def get_scores(self):
        return self.scores


class SUMBT(nn.Module):
    def __init__(self, args, num_labels, device):  # num_labels : # of Candidate values per each Slot
        super(SUMBT, self).__init__()

        self.hidden_dim = args.hidden_dim
        self.rnn_num_layers = args.num_rnn_layers
        self.zero_init_rnn = args.zero_init_rnn
        self.max_seq_length = args.max_seq_length
        self.max_label_length = args.max_label_length
        self.num_labels = num_labels
        self.num_slots = len(num_labels)
        self.attn_head = args.attn_head
        self.device = device

        ### Utterance Encoder
        self.utterance_encoder = BertForUtteranceEncoding.from_pretrained(
            args.model_name_or_path
        )
        self.bert_output_dim = self.utterance_encoder.config.hidden_size
        self.hidden_dropout_prob = self.utterance_encoder.config.hidden_dropout_prob
        if args.fix_utterance_encoder:
            for p in self.utterance_encoder.bert.pooler.parameters():
                p.requires_grad = False

        ### slot, slot-value Encoder (not trainable)
        self.sv_encoder = BertForUtteranceEncoding.from_pretrained(
            args.model_name_or_path
        )
        # os.path.join(args.bert_dir, 'bert-base-uncased.model'))
        for p in self.sv_encoder.bert.parameters():
            p.requires_grad = False

        self.slot_lookup = nn.Embedding(self.num_slots, self.bert_output_dim)
        self.value_lookup = nn.ModuleList(
            [nn.Embedding(num_label, self.bert_output_dim) for num_label in num_labels]
        )

        ### Attention layer
        self.attn = MultiHeadAttention(self.attn_head, self.bert_output_dim, dropout=0)

        ### RNN Belief Tracker
        self.nbt = nn.GRU(
            input_size=self.bert_output_dim,
            hidden_size=self.hidden_dim,
            num_layers=self.rnn_num_layers,
            dropout=self.hidden_dropout_prob,
            batch_first=True,
        )
        self.init_parameter(self.nbt)

        if not self.zero_init_rnn:
            self.rnn_init_linear = nn.Sequential(
                nn.Linear(self.bert_output_dim, self.hidden_dim),
                nn.ReLU(),
                nn.Dropout(self.hidden_dropout_prob),
            )

        self.linear = nn.Linear(self.hidden_dim, self.bert_output_dim)
        self.layer_norm = nn.LayerNorm(self.bert_output_dim)

        ### Measure
#         self.metric = torch.nn.PairwiseDistance(p=2.0, eps=1e-06, keepdim=False)
        self.distance_metric = args.distance_metric
        if self.distance_metric == "cosine":
            self.metric = torch.nn.CosineSimilarity(dim=-1, eps=1e-08)
        elif self.distance_metric == "euclidean":
            self.metric = torch.nn.PairwiseDistance(p=2.0, eps=1e-06, keepdim=False)

        ### Classifier
        self.nll = CrossEntropyLoss(ignore_index=-1)

        ### Etc.
        self.dropout = nn.Dropout(self.hidden_dropout_prob)

    def initialize_slot_value_lookup(self, label_ids, slot_ids):

        self.sv_encoder.eval()

        # Slot encoding
        slot_type_ids = torch.zeros(slot_ids.size(), dtype=torch.long).to(
            slot_ids.device
        )
        slot_mask = slot_ids > 0
        hid_slot, _ = self.sv_encoder(
            slot_ids.view(-1, self.max_label_length),
            slot_type_ids.view(-1, self.max_label_length),
            slot_mask.view(-1, self.max_label_length),
        )
        hid_slot = hid_slot[:, 0, :]
        hid_slot = hid_slot.detach()
        self.slot_lookup = nn.Embedding.from_pretrained(hid_slot, freeze=True)

        for s, label_id in enumerate(label_ids):
            label_type_ids = torch.zeros(label_id.size(), dtype=torch.long).to(
                label_id.device
            )
            label_mask = label_id > 0
            hid_label, _ = self.sv_encoder(
                label_id.view(-1, self.max_label_length),
                label_type_ids.view(-1, self.max_label_length),
                label_mask.view(-1, self.max_label_length),
            )
            hid_label = hid_label[:, 0, :]
            hid_label = hid_label.detach()
            self.value_lookup[s] = nn.Embedding.from_pretrained(hid_label, freeze=True)
            self.value_lookup[s].padding_idx = -1

        print("Complete initialization of slot and value lookup")
        self.sv_encoder = None

    def forward(
        self,
        input_ids,
        token_type_ids,
        attention_mask,
        labels=None,
        n_gpu=1,
        target_slot=None,
    ):
        # B: Batch size, M: Max turn length, N: Max seq length, 
        # J: # of Slot Meta, H: Hidden dimension
        
        # input_ids: [B, M, N]
        # token_type_ids: [B, M, N]
        # attention_mask: [B, M, N]
        # labels: [B, M, J]

        # if target_slot is not specified, output values corresponding all slot-types
        if target_slot is None:
            target_slot = list(range(0, self.num_slots))

        ds = input_ids.size(0)  # Batch size (B)
        ts = input_ids.size(1)  # Max turn size (M)
        bs = ds * ts
        slot_dim = len(target_slot)  # J

        # Utterance encoding
        # Utterence-level로 독립적으로 인코딩하므로 flatten필요
        hidden, _ = self.utterance_encoder(
            input_ids.view(-1, self.max_seq_length),
            token_type_ids.view(-1, self.max_seq_length),
            attention_mask.view(-1, self.max_seq_length),
        )
        hidden = torch.mul(
            hidden,
            attention_mask.view(-1, self.max_seq_length, 1)
            .expand(hidden.size())
            .float(),
        )
        hidden = hidden.repeat(slot_dim, 1, 1)  # [J*M*B, N, H]

        hid_slot = self.slot_lookup.weight[target_slot, :]  # Select target slot embedding
        hid_slot = hid_slot.repeat(1, bs).view(bs * slot_dim, -1)  # [J*M*B, N, H]

        # Attended utterance vector
        hidden = self.attn(
            hid_slot,  # q^s  [J*M*B, N, H]
            hidden,  # U [J*M*B, N, H]
            hidden,  # U [J*M*B, N, H]
            mask=attention_mask.view(-1, 1, self.max_seq_length).repeat(slot_dim, 1, 1),
        )
        hidden = hidden.squeeze()  # h [J*M*B, H] Aggregated Slot Context
        hidden = hidden.view(slot_dim, ds, ts, -1).view(-1, ts, self.bert_output_dim)  # [J*B, M, H]

        # NBT
        if self.zero_init_rnn:
            h = torch.zeros(
                self.rnn_num_layers, input_ids.shape[0] * slot_dim, self.hidden_dim
            ).to(
                self.device
            )  # [1, slot_dim*ds, hidden]
        else:
            h = hidden[:, 0, :].unsqueeze(0).repeat(self.rnn_num_layers, 1, 1)
            h = self.rnn_init_linear(h)

        if isinstance(self.nbt, nn.GRU):
            rnn_out, _ = self.nbt(hidden, h)  # [J*B, M, H_GRU]
        elif isinstance(self.nbt, nn.LSTM):
            c = torch.zeros(
                self.rnn_num_layers, input_ids.shape[0] * slot_dim, self.hidden_dim
            ).to(
                self.device
            )  # [1, slot_dim*ds, hidden]
            rnn_out, _ = self.nbt(hidden, (h, c))  # [slot_dim*ds, turn, hidden]
        rnn_out = self.layer_norm(self.linear(self.dropout(rnn_out)))

        hidden = rnn_out.view(slot_dim, ds, ts, -1)  # [J, B, M, H_GRU]

        # Label (slot-value) encoding
        loss = 0
        loss_slot = []
        pred_slot = []
        output = []
        for s, slot_id in enumerate(target_slot):  ## note: target_slots are successive
            # loss calculation
            hid_label = self.value_lookup[slot_id].weight
            num_slot_labels = hid_label.size(0)

            _hid_label = (
                hid_label.unsqueeze(0)
                .unsqueeze(0)
                .repeat(ds, ts, 1, 1)
                .view(ds * ts * num_slot_labels, -1)
            )
            _hidden = (
                hidden[s, :, :, :]
                .unsqueeze(2)
                .repeat(1, 1, num_slot_labels, 1)
                .view(ds * ts * num_slot_labels, -1)
            )
            _dist = self.metric(_hid_label, _hidden).view(ds, ts, num_slot_labels)
            if self.distance_metric == "euclidean":
                _dist = -_dist
            _, pred = torch.max(_dist, -1)  # taget_ids에서 ignore index 즉, padding일 경우 -1로 setting했었음
            pred_slot.append(pred.view(ds, ts, 1))
            output.append(_dist)

            if labels is not None:
                _loss = self.nll(_dist.view(ds * ts, -1), labels[:, :, s].view(-1))
                loss_slot.append(_loss.item())
                loss += _loss

        pred_slot = torch.cat(pred_slot, 2)
        if labels is None:
            return output, pred_slot

        # calculate joint accuracy
        accuracy = (pred_slot == labels).view(-1, slot_dim)
        acc_slot = (
            torch.sum(accuracy, 0).float()
            / torch.sum(labels.view(-1, slot_dim) > -1, 0).float()
        )
        acc = (
            sum(torch.sum(accuracy, 1) / slot_dim).float()
            / torch.sum(labels[:, :, 0].view(-1) > -1, 0).float()
        )  # joint accuracy

        if n_gpu == 1:
            return loss, loss_slot, acc, acc_slot, pred_slot
        else:
            return (
                loss.unsqueeze(0),
                None,
                acc.unsqueeze(0),
                acc_slot.unsqueeze(0),
                pred_slot.unsqueeze(0),
            )

    @staticmethod
    def init_parameter(module):
        if isinstance(module, nn.Linear):
            torch.nn.init.xavier_normal_(module.weight)
            torch.nn.init.constant_(module.bias, 0.0)
        elif isinstance(module, nn.GRU) or isinstance(module, nn.LSTM):
            torch.nn.init.xavier_normal_(module.weight_ih_l0)
            torch.nn.init.xavier_normal_(module.weight_hh_l0)
            torch.nn.init.constant_(module.bias_ih_l0, 0.0)
            torch.nn.init.constant_(module.bias_hh_l0, 0.0)

## TODO-2: Ontology Pre-Encoding 

Ontology의 slot type들과 이에 속하는 slot_value들을 tokenizing하는 `tokenize_ontology`를 작성하세요. <br>
[CLS] Pooling하여 `slot_lookup` 과 `value_lookup` embedding matrix들을 초기화하는 <br>
`initialize_slot_value_lookup`에 인자로 넘겨주세요. <br>

In [19]:
def tokenize_ontology(ontology, tokenizer, max_seq_length=12):
    slot_types = []
    slot_values = []
    for k, v in ontology.items():
        tokens = tokenizer.encode(k)
        if len(tokens) < max_seq_length:
            gap = max_seq_length - len(tokens)
            tokens.extend([tokenizer.pad_token_id] *  gap)
        slot_types.append(tokens)
        slot_value = []
        for vv in v:
            tokens = tokenizer.encode(vv)
            if len(tokens) < max_seq_length:
                gap = max_seq_length - len(tokens)
                tokens.extend([tokenizer.pad_token_id] *  gap)
            slot_value.append(tokens)
        slot_values.append(torch.LongTensor(slot_value))
    return torch.LongTensor(slot_types), slot_values

In [20]:
slot_type_ids, slot_values_ids = tokenize_ontology(ontology, tokenizer, 12)
num_labels = [len(s) for s in slot_values_ids]  # 각 Slot 별 후보 Values의 갯수
print(num_labels)
print("Tokenized Slot: ", slot_type_ids.size())
for slot, slot_value_id in zip(slot_meta, slot_values_ids):
    print(f"Tokenized Value of {slot}", slot_value_id.size())

[4, 4, 4, 4, 4, 103, 13, 4, 7, 5, 4, 4, 4, 12, 12, 9, 67, 4, 4, 7, 4, 7, 4, 4, 5, 4, 4, 12, 569, 9, 44, 4, 10, 4, 4, 7, 4, 60, 12, 60, 190, 298, 5, 431, 286]
Tokenized Slot:  torch.Size([45, 12])
Tokenized Value of 관광-경치 좋은 torch.Size([4, 12])
Tokenized Value of 관광-교육적 torch.Size([4, 12])
Tokenized Value of 관광-도보 가능 torch.Size([4, 12])
Tokenized Value of 관광-문화 예술 torch.Size([4, 12])
Tokenized Value of 관광-역사적 torch.Size([4, 12])
Tokenized Value of 관광-이름 torch.Size([103, 12])
Tokenized Value of 관광-종류 torch.Size([13, 12])
Tokenized Value of 관광-주차 가능 torch.Size([4, 12])
Tokenized Value of 관광-지역 torch.Size([7, 12])
Tokenized Value of 숙소-가격대 torch.Size([5, 12])
Tokenized Value of 숙소-도보 가능 torch.Size([4, 12])
Tokenized Value of 숙소-수영장 유무 torch.Size([4, 12])
Tokenized Value of 숙소-스파 유무 torch.Size([4, 12])
Tokenized Value of 숙소-예약 기간 torch.Size([12, 12])
Tokenized Value of 숙소-예약 명수 torch.Size([12, 12])
Tokenized Value of 숙소-예약 요일 torch.Size([9, 12])
Tokenized Value of 숙소-이름 torch.Size([67, 12])

## Model 선언 

In [21]:
# argsparse 있던 위치

num_labels = [len(s) for s in slot_values_ids]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_gpu = 1 if torch.cuda.device_count() < 2 else torch.cuda.device_count()
n_epochs = args.num_train_epochs

In [22]:
# model = SUMBT(args, num_labels, device)
# model.initialize_slot_value_lookup(slot_values_ids, slot_type_ids)  # Tokenized Ontology의 Pre-encoding using BERT_SV
# model.to(device)

# wandb.watch(model)
# print()

## 데이터 로더 정의

In [23]:
from data_utils import WOSDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
import random


train_data = WOSDataset(train_features)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, batch_size=args.batch_size, sampler=train_sampler, collate_fn=processor.collate_fn)

dev_data = WOSDataset(dev_features)
dev_sampler = SequentialSampler(dev_data)
dev_loader = DataLoader(dev_data, batch_size=8, sampler=dev_sampler, collate_fn=processor.collate_fn)

## Optimizer & Scheduler 선언 

In [24]:
# no_decay = ["bias", "LayerNorm.weight"]
# optimizer_grouped_parameters = [
#         {
#             "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
#             "weight_decay": args.weight_decay,
#         },
#         {
#             "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
#             "weight_decay": 0.0,
#         },
#     ]

# t_total = len(train_loader) * n_epochs
# optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=1e-8)
# scheduler = get_linear_schedule_with_warmup(
#     optimizer, num_warmup_steps=int(t_total * args.warmup_ratio), num_training_steps=t_total
# )

## TODO-3: Inference code 작성 

In [25]:
from evaluation import _evaluation

In [26]:
def inference(model, eval_loader, processor, device):
    model.eval()
    predictions = {}
    for batch in eval_loader:
        input_ids, segment_ids, input_masks, target_ids, num_turns, guids = \
        [b.to(device) if not isinstance(b, list) else b for b in batch]

        with torch.no_grad():
            _, pred_slot = model(
                input_ids, segment_ids, input_masks, labels=None, n_gpu=1
            )
        
        batch_size = input_ids.size(0)
        for i in range(batch_size):
            guid = guids[i]
            states = processor.recover_state(pred_slot.tolist()[i], num_turns[i])
            for tid, state in enumerate(states):
                predictions[f"{guid}-{tid}"] = state
    return predictions

## Training 

In [27]:
# model.load_state_dict(torch.load('20epoch.pth'))

In [28]:
def train(config=None):
    # Initialize a new wandb run
    with wandb.init(config=config):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
    
        model = SUMBT(args, num_labels, device)
        model.initialize_slot_value_lookup(slot_values_ids, slot_type_ids)  # Tokenized Ontology의 Pre-encoding using BERT_SV
        model.to(device)

        wandb.watch(model)
        
        
        # for checkpoint management
        chk_list = []
        output_dir = increment_output_dir(wandb.run.name)
        
        if not os.path.exists(f"checkpoint/{output_dir}"):
            os.makedirs(f"checkpoint/{output_dir}")

        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
                {
                    "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                    "weight_decay": args.weight_decay,
                },
                {
                    "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                    "weight_decay": 0.0,
                },
            ]

        t_total = len(train_loader) * n_epochs
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=1e-8)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=int(t_total * args.warmup_ratio), num_training_steps=t_total
        )            
        
        
        best_score, best_checkpoint = 0, 0
        for epoch in range(n_epochs):
            batch_loss = []
            batch_loss_per_100step = []
            for step, batch in enumerate(tqdm(train_loader)):
                model.train()
                input_ids, segment_ids, input_masks, target_ids, num_turns, guids  = \
                [b.to(device) if not isinstance(b, list) else b for b in batch]

                # Forwabatch_size        
                if n_gpu == 1:
                    loss, loss_slot, acc, acc_slot, _ = model(input_ids, segment_ids, input_masks, target_ids, n_gpu)
                else:
                    loss, _, acc, acc_slot, _ = model(input_ids, segment_ids, input_masks, target_ids, n_gpu)

                batch_loss.append(loss.item())

                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                if step % 100 == 0 or step == len(train_loader):
                    batch_loss_per_100step.append(loss.item())
                    print('[%d/%d] [%d/%d] %f' % (epoch, n_epochs, step, len(train_loader), loss.item()))
                    # epoch를 마쳤거나, 최저 loss 갱신했을 때 추론
                    if step == len(train_loader) or min(batch_loss_per_100step) >= loss.item():
                        print('inferencing')
                        predictions = inference(model, dev_loader, processor, device)
                        eval_result = _evaluation(predictions, dev_labels, slot_meta)
                        current_score = eval_result['joint_goal_accuracy']

                        if best_score < current_score:
                            best_score = current_score
                            print('new best JGA score! ', best_score)
                            
                            # checkpoint 수 관리
                            if len(chk_list) >= 1:
                                os.remove(chk_list.pop(0))
                            
                            output_path = f"checkpoint/{output_dir}/{epoch}_{step}_{best_score}.pth"
                            chk_list.append(output_path)
                            
                            torch.save({
                                        'epoch': epoch,
                                        'model_state_dict': model.state_dict(),
                                        'optimizer_state_dict': optimizer.state_dict(),
                                        'loss': loss,
                                        }, output_path)
                        for k, v in eval_result.items():
                            print(f"{k}: {v}")
                        wandb.log({
                            "loss": loss.item(),
                            "Joint Goal Accuracy": eval_result['joint_goal_accuracy'],
                            "Turn Slot_Accuracy": eval_result['turn_slot_accuracy'],
                            "Turn Slot F1": eval_result['turn_slot_f1']
                            })

In [29]:
wandb.agent(sweep_id, train, count=10)
# wandb.agent('kxb5gf1d', train, count=1)

# noraml train
# train()

[34m[1mwandb[0m: Currently logged in as: [33mtaepd[0m (use `wandb login --relogin` to force relogin)


Some weights of the model checkpoint at dsksd/bert-ko-small-minimal were not used when initializing BertForUtteranceEncoding: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForUtteranceEncoding from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForUtteranceEncoding from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at dsksd/bert-ko-small-minimal 

Complete initialization of slot and value lookup


  0%|          | 0/701 [00:00<?, ?it/s]

[0/10] [0/701] 122.612656
inferencing


  0%|          | 1/701 [00:31<6:06:28, 31.41s/it]

{'joint_goal_accuracy': 0.0, 'turn_slot_accuracy': 0.03879146141215199, 'turn_slot_f1': 0.05740321942804178}
joint_goal_accuracy: 0.0
turn_slot_accuracy: 0.03879146141215199
turn_slot_f1: 0.05740321942804178


 14%|█▍        | 100/701 [02:27<11:42,  1.17s/it]

[0/10] [100/701] 35.868423
inferencing
{'joint_goal_accuracy': 0.019310344827586208, 'turn_slot_accuracy': 0.8189512862616227, 'turn_slot_f1': 0.07097171949143095}
new best JGA score!  0.019310344827586208


 14%|█▍        | 101/701 [03:00<1:47:54, 10.79s/it]

joint_goal_accuracy: 0.019310344827586208
turn_slot_accuracy: 0.8189512862616227
turn_slot_f1: 0.07097171949143095


 29%|██▊       | 200/701 [04:57<09:50,  1.18s/it]  

[0/10] [200/701] 23.888100
inferencing


 29%|██▊       | 201/701 [05:28<1:25:03, 10.21s/it]

{'joint_goal_accuracy': 0.019310344827586208, 'turn_slot_accuracy': 0.8444575807334431, 'turn_slot_f1': 0.18802284592637017}
joint_goal_accuracy: 0.019310344827586208
turn_slot_accuracy: 0.8444575807334431
turn_slot_f1: 0.18802284592637017


 43%|████▎     | 301/701 [07:25<07:47,  1.17s/it]  

[0/10] [300/701] 25.586157


 57%|█████▋    | 401/701 [09:22<05:54,  1.18s/it]

[0/10] [400/701] 24.964899


 71%|███████▏  | 501/701 [11:19<03:53,  1.17s/it]

[0/10] [500/701] 31.384569


 86%|████████▌ | 601/701 [13:16<01:56,  1.17s/it]

[0/10] [600/701] 35.662365


100%|██████████| 701/701 [15:12<00:00,  1.30s/it]
  0%|          | 0/701 [00:00<?, ?it/s]

[0/10] [700/701] 36.874531
[1/10] [0/701] 39.812473
inferencing


  0%|          | 1/701 [00:31<6:02:21, 31.06s/it]

{'joint_goal_accuracy': 0.019310344827586208, 'turn_slot_accuracy': 0.8205276409414228, 'turn_slot_f1': 0.01939238901852332}
joint_goal_accuracy: 0.019310344827586208
turn_slot_accuracy: 0.8205276409414228
turn_slot_f1: 0.01939238901852332


 14%|█▍        | 100/701 [02:26<11:41,  1.17s/it]

[1/10] [100/701] 32.761971
inferencing


 14%|█▍        | 101/701 [02:57<1:41:24, 10.14s/it]

{'joint_goal_accuracy': 0.019310344827586208, 'turn_slot_accuracy': 0.8205363984674218, 'turn_slot_f1': 0.019310344827586208}
joint_goal_accuracy: 0.019310344827586208
turn_slot_accuracy: 0.8205363984674218
turn_slot_f1: 0.019310344827586208


 29%|██▊       | 201/701 [04:54<09:44,  1.17s/it]  

[1/10] [200/701] 34.916893


 43%|████▎     | 301/701 [06:51<07:52,  1.18s/it]

[1/10] [300/701] 34.771412


 57%|█████▋    | 401/701 [08:48<05:50,  1.17s/it]

[1/10] [400/701] 37.532112


 71%|███████▏  | 500/701 [10:43<03:54,  1.17s/it]

[1/10] [500/701] 32.596970
inferencing


 71%|███████▏  | 501/701 [11:14<33:49, 10.15s/it]

{'joint_goal_accuracy': 0.019310344827586208, 'turn_slot_accuracy': 0.8205363984674218, 'turn_slot_f1': 0.019310344827586208}
joint_goal_accuracy: 0.019310344827586208
turn_slot_accuracy: 0.8205363984674218
turn_slot_f1: 0.019310344827586208


 86%|████████▌ | 601/701 [13:11<01:56,  1.17s/it]

[1/10] [600/701] 34.144550


100%|█████████▉| 700/701 [15:07<00:01,  1.17s/it]

[1/10] [700/701] 24.290779
inferencing


100%|██████████| 701/701 [15:37<00:00,  1.34s/it]
  0%|          | 0/701 [00:00<?, ?it/s]

{'joint_goal_accuracy': 0.019310344827586208, 'turn_slot_accuracy': 0.8205363984674218, 'turn_slot_f1': 0.019310344827586208}
joint_goal_accuracy: 0.019310344827586208
turn_slot_accuracy: 0.8205363984674218
turn_slot_f1: 0.019310344827586208
[2/10] [0/701] 35.246906
inferencing


  0%|          | 1/701 [00:31<6:02:46, 31.10s/it]

{'joint_goal_accuracy': 0.019310344827586208, 'turn_slot_accuracy': 0.8205363984674218, 'turn_slot_f1': 0.019310344827586208}
joint_goal_accuracy: 0.019310344827586208
turn_slot_accuracy: 0.8205363984674218
turn_slot_f1: 0.019310344827586208


 14%|█▍        | 101/701 [02:28<11:41,  1.17s/it]

[2/10] [100/701] 36.169003


 29%|██▊       | 200/701 [04:24<09:45,  1.17s/it]

[2/10] [200/701] 34.149017
inferencing


 29%|██▊       | 201/701 [04:55<1:24:33, 10.15s/it]

{'joint_goal_accuracy': 0.019310344827586208, 'turn_slot_accuracy': 0.8205363984674218, 'turn_slot_f1': 0.019310344827586208}
joint_goal_accuracy: 0.019310344827586208
turn_slot_accuracy: 0.8205363984674218
turn_slot_f1: 0.019310344827586208


 43%|████▎     | 301/701 [06:52<07:46,  1.17s/it]  

[2/10] [300/701] 35.122711


 57%|█████▋    | 401/701 [08:49<05:55,  1.18s/it]

[2/10] [400/701] 36.470959


 71%|███████▏  | 500/701 [10:45<03:55,  1.17s/it]

[2/10] [500/701] 27.432068
inferencing


 71%|███████▏  | 501/701 [11:16<33:49, 10.15s/it]

{'joint_goal_accuracy': 0.019310344827586208, 'turn_slot_accuracy': 0.8205363984674218, 'turn_slot_f1': 0.019310344827586208}
joint_goal_accuracy: 0.019310344827586208
turn_slot_accuracy: 0.8205363984674218
turn_slot_f1: 0.019310344827586208


 86%|████████▌ | 601/701 [13:13<01:57,  1.18s/it]

[2/10] [600/701] 35.423752


100%|██████████| 701/701 [15:09<00:00,  1.30s/it]
  0%|          | 0/701 [00:00<?, ?it/s]

[2/10] [700/701] 34.213127
[3/10] [0/701] 33.465466
inferencing


  0%|          | 1/701 [00:31<6:02:43, 31.09s/it]

{'joint_goal_accuracy': 0.019310344827586208, 'turn_slot_accuracy': 0.8205363984674218, 'turn_slot_f1': 0.019310344827586208}
joint_goal_accuracy: 0.019310344827586208
turn_slot_accuracy: 0.8205363984674218
turn_slot_f1: 0.019310344827586208


 14%|█▍        | 101/701 [02:27<11:38,  1.16s/it]

[3/10] [100/701] 36.342976


 29%|██▊       | 200/701 [04:23<09:44,  1.17s/it]

[3/10] [200/701] 32.358261
inferencing


 29%|██▊       | 201/701 [04:54<1:24:28, 10.14s/it]

{'joint_goal_accuracy': 0.019310344827586208, 'turn_slot_accuracy': 0.8205363984674218, 'turn_slot_f1': 0.019310344827586208}
joint_goal_accuracy: 0.019310344827586208
turn_slot_accuracy: 0.8205363984674218
turn_slot_f1: 0.019310344827586208


 43%|████▎     | 301/701 [06:51<07:47,  1.17s/it]  

[3/10] [300/701] 35.618248


 57%|█████▋    | 401/701 [08:48<05:50,  1.17s/it]

[3/10] [400/701] 34.637234


 71%|███████▏  | 501/701 [10:45<03:53,  1.17s/it]

[3/10] [500/701] 36.655827


 86%|████████▌ | 601/701 [12:41<01:56,  1.17s/it]

[3/10] [600/701] 35.151821


100%|█████████▉| 700/701 [14:37<00:01,  1.17s/it]

[3/10] [700/701] 30.428123
inferencing


100%|██████████| 701/701 [15:07<00:00,  1.30s/it]
  0%|          | 0/701 [00:00<?, ?it/s]

{'joint_goal_accuracy': 0.019310344827586208, 'turn_slot_accuracy': 0.8205363984674218, 'turn_slot_f1': 0.019310344827586208}
joint_goal_accuracy: 0.019310344827586208
turn_slot_accuracy: 0.8205363984674218
turn_slot_f1: 0.019310344827586208
[4/10] [0/701] 32.695244
inferencing


  0%|          | 1/701 [00:31<6:02:29, 31.07s/it]

{'joint_goal_accuracy': 0.019310344827586208, 'turn_slot_accuracy': 0.8205363984674218, 'turn_slot_f1': 0.019310344827586208}
joint_goal_accuracy: 0.019310344827586208
turn_slot_accuracy: 0.8205363984674218
turn_slot_f1: 0.019310344827586208


 14%|█▍        | 101/701 [02:28<11:42,  1.17s/it]

[4/10] [100/701] 37.511578


 29%|██▊       | 201/701 [04:25<09:45,  1.17s/it]

[4/10] [200/701] 36.246857


 43%|████▎     | 301/701 [06:22<07:47,  1.17s/it]

[4/10] [300/701] 36.969002


 57%|█████▋    | 401/701 [08:18<05:50,  1.17s/it]

[4/10] [400/701] 34.116360


 71%|███████▏  | 501/701 [10:15<03:52,  1.16s/it]

[4/10] [500/701] 35.689556


 86%|████████▌ | 601/701 [12:12<01:56,  1.17s/it]

[4/10] [600/701] 33.802326


100%|██████████| 701/701 [14:08<00:00,  1.21s/it]
  0%|          | 0/701 [00:00<?, ?it/s]

[4/10] [700/701] 39.050247
[5/10] [0/701] 35.670254
inferencing


  0%|          | 1/701 [00:31<6:02:40, 31.09s/it]

{'joint_goal_accuracy': 0.019310344827586208, 'turn_slot_accuracy': 0.8205363984674218, 'turn_slot_f1': 0.019310344827586208}
joint_goal_accuracy: 0.019310344827586208
turn_slot_accuracy: 0.8205363984674218
turn_slot_f1: 0.019310344827586208


 14%|█▍        | 100/701 [02:26<11:45,  1.17s/it]

[5/10] [100/701] 30.464821
inferencing


 14%|█▍        | 101/701 [02:57<1:41:30, 10.15s/it]

{'joint_goal_accuracy': 0.019310344827586208, 'turn_slot_accuracy': 0.8205363984674218, 'turn_slot_f1': 0.019310344827586208}
joint_goal_accuracy: 0.019310344827586208
turn_slot_accuracy: 0.8205363984674218
turn_slot_f1: 0.019310344827586208


 29%|██▊       | 201/701 [04:54<09:49,  1.18s/it]  

[5/10] [200/701] 33.168449


 43%|████▎     | 300/701 [06:50<07:46,  1.16s/it]

[5/10] [300/701] 28.833149
inferencing


 43%|████▎     | 301/701 [07:21<1:07:34, 10.14s/it]

{'joint_goal_accuracy': 0.019310344827586208, 'turn_slot_accuracy': 0.8205363984674218, 'turn_slot_f1': 0.019310344827586208}
joint_goal_accuracy: 0.019310344827586208
turn_slot_accuracy: 0.8205363984674218
turn_slot_f1: 0.019310344827586208


 57%|█████▋    | 401/701 [09:18<05:50,  1.17s/it]  

[5/10] [400/701] 31.744993


 71%|███████▏  | 501/701 [11:14<03:53,  1.17s/it]

[5/10] [500/701] 33.557621


 86%|████████▌ | 601/701 [13:11<01:56,  1.16s/it]

[5/10] [600/701] 35.683952


100%|██████████| 701/701 [15:06<00:00,  1.29s/it]
  0%|          | 0/701 [00:00<?, ?it/s]

[5/10] [700/701] 32.338913
[6/10] [0/701] 32.260563
inferencing


  0%|          | 1/701 [00:31<6:02:45, 31.09s/it]

{'joint_goal_accuracy': 0.019310344827586208, 'turn_slot_accuracy': 0.8205363984674218, 'turn_slot_f1': 0.019310344827586208}
joint_goal_accuracy: 0.019310344827586208
turn_slot_accuracy: 0.8205363984674218
turn_slot_f1: 0.019310344827586208


 14%|█▍        | 101/701 [02:27<11:40,  1.17s/it]

[6/10] [100/701] 34.853958


 29%|██▊       | 201/701 [04:24<09:44,  1.17s/it]

[6/10] [200/701] 35.475788


 43%|████▎     | 301/701 [06:21<07:47,  1.17s/it]

[6/10] [300/701] 35.530464


 57%|█████▋    | 401/701 [08:18<05:51,  1.17s/it]

[6/10] [400/701] 33.600800


 71%|███████▏  | 501/701 [10:15<03:53,  1.17s/it]

[6/10] [500/701] 33.968369


 86%|████████▌ | 601/701 [12:12<01:56,  1.17s/it]

[6/10] [600/701] 37.164165


100%|██████████| 701/701 [14:08<00:00,  1.21s/it]
  0%|          | 0/701 [00:00<?, ?it/s]

[6/10] [700/701] 34.885460
[7/10] [0/701] 36.625496
inferencing


  0%|          | 1/701 [00:31<6:02:38, 31.08s/it]

{'joint_goal_accuracy': 0.019310344827586208, 'turn_slot_accuracy': 0.8205363984674218, 'turn_slot_f1': 0.019310344827586208}
joint_goal_accuracy: 0.019310344827586208
turn_slot_accuracy: 0.8205363984674218
turn_slot_f1: 0.019310344827586208


 14%|█▍        | 100/701 [02:27<11:46,  1.18s/it]

[7/10] [100/701] 30.464228
inferencing


 14%|█▍        | 101/701 [02:58<1:41:31, 10.15s/it]

{'joint_goal_accuracy': 0.019310344827586208, 'turn_slot_accuracy': 0.8205363984674218, 'turn_slot_f1': 0.019310344827586208}
joint_goal_accuracy: 0.019310344827586208
turn_slot_accuracy: 0.8205363984674218
turn_slot_f1: 0.019310344827586208


 29%|██▊       | 201/701 [04:55<09:44,  1.17s/it]  

[7/10] [200/701] 34.585785


 43%|████▎     | 300/701 [06:50<07:48,  1.17s/it]

[7/10] [300/701] 27.087782
inferencing


 43%|████▎     | 301/701 [07:21<1:07:37, 10.14s/it]

{'joint_goal_accuracy': 0.019310344827586208, 'turn_slot_accuracy': 0.8205363984674218, 'turn_slot_f1': 0.019310344827586208}
joint_goal_accuracy: 0.019310344827586208
turn_slot_accuracy: 0.8205363984674218
turn_slot_f1: 0.019310344827586208


 57%|█████▋    | 401/701 [09:18<05:50,  1.17s/it]  

[7/10] [400/701] 32.752735


 71%|███████▏  | 501/701 [11:15<03:53,  1.17s/it]

[7/10] [500/701] 32.557796


 86%|████████▌ | 601/701 [13:12<01:56,  1.17s/it]

[7/10] [600/701] 35.538612


 94%|█████████▍| 658/701 [14:20<00:56,  1.31s/it]


VBox(children=(Label(value=' 0.02MB of 0.02MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
loss,27.08778
Joint Goal Accuracy,0.01931
Turn Slot_Accuracy,0.82054
Turn Slot F1,0.01931
_runtime,6736.0
_timestamp,1619962681.0
_step,20.0


0,1
loss,█▂▁▂▂▂▁▂▂▁▂▂▁▂▂▁▁▂▂▁▁
Joint Goal Accuracy,▁████████████████████
Turn Slot_Accuracy,▁████████████████████
Turn Slot F1,▃▃█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_runtime,▁▁▁▂▂▃▃▃▃▄▄▄▅▅▆▆▆▇███
_timestamp,▁▁▁▂▂▃▃▃▃▄▄▄▅▅▆▆▆▇███
_step,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇▇██


KeyboardInterrupt: 

In [27]:
# model = SUMBT(args, num_labels, device)
# model.initialize_slot_value_lookup(slot_values_ids, slot_type_ids)  # Tokenized Ontology의 Pre-encoding using BERT_SV
# model.to(device)


# no_decay = ["bias", "LayerNorm.weight"]
# optimizer_grouped_parameters = [
#         {
#             "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
#             "weight_decay": args.weight_decay,
#         },
#         {
#             "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
#             "weight_decay": 0.0,
#         },
#     ]
# optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=1e-8)

# PATH = 'checkpoint/solar-sweep-3/9_700_0.7885714285714286.pth'

# checkpoint = torch.load(PATH)
# model.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# epoch = checkpoint['epoch']
# loss = checkpoint['loss']

# model1.load_state_dict(torch.load('checkpoint/solar-sweep-3/9_700_0.7885714285714286.pth'))


Some weights of the model checkpoint at dsksd/bert-ko-small-minimal were not used when initializing BertForUtteranceEncoding: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForUtteranceEncoding from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForUtteranceEncoding from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at dsksd/bert-ko-small-minimal 

Complete initialization of slot and value lookup


## Inference

In [33]:
eval_data = json.load(open(f"/opt/ml/repo/taepd/input/data/eval_dataset/eval_dials.json", "r"))

eval_examples = get_examples_from_dialogues(
    eval_data, user_first=True, dialogue_level=True
)

# Extracting Featrues
eval_features = processor.convert_examples_to_features(eval_examples)
eval_data = WOSDataset(eval_features)
eval_sampler = SequentialSampler(eval_data)
eval_loader = DataLoader(
    eval_data,
    batch_size=8,
    sampler=eval_sampler,
    collate_fn=processor.collate_fn,
)

100%|██████████| 2000/2000 [00:00<00:00, 2493.76it/s]


In [34]:
predictions = inference(model, eval_loader, processor, device)

In [35]:
json.dump(predictions, open('predictions.csv', 'w'), indent=2, ensure_ascii=False) 