### Transformer-DST `train.py`

In [1]:
import sys
sys.path.append('..')

In [2]:
from model import TransformerDST
# from pytorch_transformers import BertTokenizer, AdamW, WarmupLinearSchedule, BertConfig
from transformers import BertModel, BertTokenizer, BertConfig, ElectraModel, ElectraTokenizer, ElectraConfig, AdamW, get_linear_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup
from utils.data_utils import prepare_dataset, MultiWozDataset
from utils.data_utils import  domain2id, OP_SET, make_turn_label, postprocessing # make_slot_meta,
from utils.eval_utils import compute_prf, compute_acc, per_domain_join_accuracy
# from utils.ckpt_utils import download_ckpt, convert_ckpt_compatible
from evaluation import model_evaluation


import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.cuda.amp as amp


import numpy as np
import argparse
import random
import os
import json
import time
import pickle
from tqdm import tqdm

In [3]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Device Name : {device}')

# torch.cuda.empty_cache()

Device Name : cuda


In [4]:
def masked_cross_entropy_for_value(logits, target, pad_idx=0):
    """Mean negative log-likelihood over the non-padding positions of ``target``.

    Despite the parameter name, ``logits`` is passed directly to ``torch.log``,
    so it must already contain probabilities (not raw scores).

    Args:
        logits: tensor whose last dimension indexes the candidate classes and
            whose leading dimensions match ``target``.
        target: integer tensor of gold class ids.
        pad_idx: target id that marks padding; those positions are ignored.

    Returns:
        A scalar tensor: the sum of ``-log p(target)`` over non-padding
        positions divided by their count. Returns 0 when every position is
        padding (instead of the NaN a 0/0 division would produce).
    """
    mask = target.ne(pad_idx)
    probs_flat = logits.reshape(-1, logits.size(-1))
    target_flat = target.reshape(-1, 1)

    # Pick the probability of the gold id first, then take the log of only
    # those values (same result as log-then-gather).
    picked = torch.gather(probs_flat, dim=1, index=target_flat).view(*target.size())

    # Neutralise padded positions *before* the log: a zero probability there
    # would otherwise give inf * 0 = NaN in the loss and NaN gradients.
    picked = picked.masked_fill(~mask, 1.0)
    losses = -torch.log(picked)

    # Guard the denominator so an all-padding batch yields 0 rather than NaN.
    n_valid = mask.sum().float().clamp(min=1.0)
    return losses.sum() / n_valid

In [5]:
def save(args, epoch, model, enc_optimizer, dec_optimizer=None):
    """Write the model weights for ``epoch`` into ``args.save_dir``.

    The checkpoint is named ``model.e<epoch>.bin``. Only the model's
    ``state_dict`` is stored; ``enc_optimizer`` and ``dec_optimizer`` are
    accepted for interface compatibility but their state is not persisted.
    """
    # A wrapped model (e.g. DataParallel) exposes the real network as `.module`;
    # save that so the checkpoint keys carry no wrapper prefix.
    bare_model = getattr(model, 'module', model)

    checkpoint_name = "model.e{:}.bin".format(epoch)
    checkpoint_path = os.path.join(args.save_dir, checkpoint_name)
    torch.save(bare_model.state_dict(), checkpoint_path)

In [6]:
def _load_optim_state(path):
    """Return the optimizer state dict stored at ``path``, or ``None`` if the file is absent."""
    if not os.path.exists(path):
        return None
    recovered = torch.load(path, map_location='cpu')
    # Older checkpoints may hold the optimizer object itself rather than its state dict.
    if hasattr(recovered, 'state_dict'):
        recovered = recovered.state_dict()
    return recovered


def load(args, epoch):
    """Load the checkpoint files of ``epoch`` from ``args.save_dir`` onto the CPU.

    Args:
        args: object with a ``save_dir`` attribute (checkpoint directory).
        epoch: epoch tag used in the file names ``model.e<epoch>.bin``,
            ``enc_optim.e<epoch>.bin`` and ``dec_optim.e<epoch>.bin``.

    Returns:
        ``(model_recover, enc_recover, dec_recover)``. The optimizer entries are
        ``None`` when the corresponding file does not exist: ``save`` in this
        notebook only writes the model file, so requiring the optimizer files
        would make every checkpoint it produces unloadable.

    Raises:
        FileNotFoundError: if the model checkpoint itself is missing.
    """
    model_file = os.path.join(
        args.save_dir, "model.e{:}.bin".format(epoch))
    model_recover = torch.load(model_file, map_location='cpu')

    enc_recover = _load_optim_state(os.path.join(
        args.save_dir, "enc_optim.e{:}.bin".format(epoch)))
    dec_recover = _load_optim_state(os.path.join(
        args.save_dir, "dec_optim.e{:}.bin".format(epoch)))

    return model_recover, enc_recover, dec_recover

In [7]:
import wandb
# !wandb login  # run once

In [8]:
# Config that would be passed as `parameters` when creating a wandb sweep
# (currently disabled; kept for reference).
# hyperparameter_defaults = dict(
#     batch_size = args.batch_size,
#     learning_rate = args.learning_rate,
#     epochs = args.num_train_epochs,
#     weight_decay = args.weight_decay,
#     attn_head = args.attn_head,
#     distance_metric = args.distance_metric,
    
#     dropout = 0.1,
#     smoothing = 0.2
#     model_name = 'BertForSequenceClassification',
#     tokenizer_name = 'BertTokenizer',
#     )

# wandb.init(config=hyperparameter_defaults, project="TRADE")
# Initialise Weights & Biases under the "Transformer-DST" project; `config` is
# bound to wandb's config object.
wandb.init(project="Transformer-DST")
config = wandb.config

[34m[1mwandb[0m: Currently logged in as: [33mtaepd[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.30 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


### args setting

In [9]:
from argparse import Namespace

parser = argparse.ArgumentParser()

# Every setting is a plain attribute on one Namespace, mimicking parsed CLI arguments.
args = Namespace(
    # --- debugging / ablation switches ---
    use_cpu=False,          # Just for debugging; not tested for actual training.
    use_cls_only=False,     # Using only [CLS]
    no_dial=False,          # w/o re-using dialogue
    use_dt_only=False,      # Using only D_t in generation
    # By default the "decoder" only attends to a specific [SLOT] position.
    # With this option it can access the whole "[SLOT] domain slot - value" group.
    # NEW: exclude "- value"
    use_full_slot=False,
    only_pred_op=False,     # only train to predict state operation (debugging)
    use_one_optim=True,     # a single optimizer is used
    recover_e=0,

    # --- data / checkpoint locations ---
    data_root='data',       # default='data/mwz2.1'
    train_data='train_dials.json',
    dev_data='dev_dials.json',
    test_data='test_dials.json',
    ontology_data='ontology.json',
    slot_meta='slot_meta.json',
    # "--vocab_path":'assets/vocab.txt',
    bert_config_path='assets/bert_config_base_uncased.json',
    bert_ckpt_path='assets/bert-base-uncased-pytorch_model.bin',
    save_dir='outputs',

    # --- optimisation ---
    random_seed=42,
    num_workers=4,
    batch_size=2,
    enc_warmup=0.1,
    dec_warmup=0.1,
    enc_lr=3e-5,            # my Transformer-AR uses 3e-5
    dec_lr=1e-4,
    n_epochs=30,
    eval_epoch=1,

    # --- model / input construction ---
    op_code="4",
    slot_token="[SLOT]",
    dropout=0.1,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    decoder_teacher_forcing=1,
    word_dropout=0.1,
    not_shuffle_state=False,
    shuffle_p=0.5,

    n_history=1,
    max_seq_length=512,
    msg=None,
    exclude_domain=False,

    # --- generator ---
    beam_size=1,                    # Beam size for searching
    min_len=1,
    length_penalty=0,               # Length penalty for beam search
    forbid_duplicate_ngrams=False,
    forbid_ignore_word=None,        # Ignore the word during forbid_duplicate_ngrams
    ngram_size=2,
)

# Resolve every data file against data_root. Note that ontology_data and
# slot_meta are overwritten in place with their full paths.
for _field, _file_name in [
    ("train_data_path", args.train_data),
    ("dev_data_path", args.dev_data),
    ("test_data_path", args.test_data),
    ("ontology_data", args.ontology_data),
    ("slot_meta", args.slot_meta),
]:
    setattr(args, _field, os.path.join(args.data_root, _file_name))

# shuffle_state is simply the negation of the not_shuffle_state switch.
args.shuffle_state = not args.not_shuffle_state


print('pytorch version: ', torch.__version__)
# print(args)

pytorch version:  1.7.0+cu101


In [10]:
# This notebook only supports the single-optimizer configuration.
assert args.use_one_optim is True

# [CLS]-only mode forces the no-dialogue setting on as well.
if args.use_cls_only:
    args.no_dial = True

print("### use_cls_only: {:}".format(args.use_cls_only))
print("### no_dial: {:}".format(args.no_dial))

# Resuming from a saved epoch is a leftover option that is deliberately rejected.
if args.recover_e > 0:
    raise NotImplementedError("This option is from my oldest code version. "
                              "I have not checked it for this code version.")

# Make sure the checkpoint directory exists, reporting when it had to be created.
save_dir_missing = not os.path.exists(args.save_dir)
if save_dir_missing:
    os.mkdir(args.save_dir)
    print("### mkdir {:}".format(args.save_dir))

### use_cls_only: False
### no_dial: False


In [11]:
def worker_init_fn(worker_id):
    """Seed NumPy's global RNG with ``args.random_seed + worker_id``.

    Passed to the training DataLoader as its ``worker_init_fn``.
    """
    worker_seed = args.random_seed + worker_id
    np.random.seed(worker_seed)

# Pick the compute device; n_gpu stays 0 whenever the CPU path is taken.
if not torch.cuda.is_available() or args.use_cpu:
    n_gpu = 0
    print("### Use CPU (Debugging)")
    device = torch.device("cpu")
else:
    n_gpu = torch.cuda.device_count()
    device = torch.device('cuda')
    print("### Device: {:}".format(device))

# A negative seed means "draw one at random".
if args.random_seed < 0:
    print("### Pick a random seed")
    args.random_seed = random.sample(list(range(0, 100000)), 1)[0]

print("### Random Seed: {:}".format(args.random_seed))

# Seed every RNG in play: NumPy, Python's global `random`, a dedicated
# `random.Random` instance handed to the dataset, and PyTorch.
np.random.seed(args.random_seed)
random.seed(args.random_seed)
rng = random.Random(args.random_seed)
torch.manual_seed(args.random_seed)

if n_gpu > 0:
    if args.random_seed >= 0:
        torch.cuda.manual_seed(args.random_seed)
        torch.cuda.manual_seed_all(args.random_seed)

    # Disable the cuDNN auto-tuner and request deterministic algorithms.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Ensure the checkpoint directory exists.
if not os.path.exists(args.save_dir):
    os.mkdir(args.save_dir)

### Device: cuda
### Random Seed: 42


In [12]:
# Read the ontology and slot metadata with context managers so the file handles
# are closed deterministically; JSON is read as UTF-8 regardless of the
# platform's default encoding.
with open(args.ontology_data, encoding='utf-8') as f:
    ontology = json.load(f)
# slot_meta, ontology = make_slot_meta(ontology)
with open(args.slot_meta, encoding='utf-8') as f:
    slot_meta = json.load(f)

# Mapping from state-operation name to id for the configured operation set.
op2id = OP_SET[args.op_code]
print(op2id)

# Tokenizer of the pretrained Korean BERT, extended with the three special tokens.
tokenizer = BertTokenizer.from_pretrained('dsksd/bert-ko-small-minimal', additional_special_tokens = ['[SLOT]', '[NULL]','[EOS]'])
# tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator", additional_special_tokens = ['[SLOT]', '[NULL]','[EOS]'])

{'delete': 0, 'update': 1, 'dontcare': 2, 'carryover': 3}


Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


### MODEL CONFIG

In [13]:
# Build the BERT config from the pretrained checkpoint, override the dropout
# settings from `args`, then construct TransformerDST and move it to `device`.
# model_config = BertConfig.from_json_file(args.bert_config_path)
model_config = BertConfig.from_pretrained('dsksd/bert-ko-small-minimal')
# model_config = BertConfig.from_pretrained("monologg/koelectra-base-v3-discriminator")
model_config.dropout = args.dropout
model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
model_config.hidden_dropout_prob = args.hidden_dropout_prob
model_config.vocab_size += 3  # account for the added special tokens ([SLOT], [NULL], [EOS])
model_config.output_hidden_states=True  # apparently required for training
model_config.output_attentions=True
print(model_config)
# NOTE(review): 4 segment types are requested here, but the printed config and
# model below show type_vocab_size / token_type_embeddings of size 2 (the
# resizing code further down is commented out) — confirm TransformerDST copes
# with segment ids 2 and 3.
type_vocab_size = 4
# The decoder is configured from the same Namespace as everything else.
dec_config = args
# Positional arguments after the two configs: number of operations, number of
# domains, id of the 'update' operation, then the token ids of [MASK], [SEP],
# [PAD] and '-'.
model = TransformerDST(model_config, dec_config, len(op2id), len(domain2id),
                       op2id['update'],
                       tokenizer.convert_tokens_to_ids(['[MASK]'])[0],
                       tokenizer.convert_tokens_to_ids(['[SEP]'])[0],
                       tokenizer.convert_tokens_to_ids(['[PAD]'])[0],
                       tokenizer.convert_tokens_to_ids(['-'])[0],
                       type_vocab_size, args.exclude_domain)

wandb.watch(model)
# if not os.path.exists(args.bert_ckpt_path):
#     args.bert_ckpt_path = download_ckpt(args.bert_ckpt_path, args.bert_config_path, 'assets')

# state_dict = torch.load(args.bert_ckpt_path, map_location='cpu')
# _k = 'embeddings.token_type_embeddings.weight'
# print("config.type_vocab_size != state_dict[bert.embeddings.token_type_embeddings.weight] ({0} != {1})".format(
#         type_vocab_size, state_dict[_k].shape[0]))
# state_dict[_k].resize_(
#     type_vocab_size, state_dict[_k].shape[1])
# state_dict[_k].data[2, :].copy_(state_dict[_k].data[0, :])
# state_dict[_k].data[3, :].copy_(state_dict[_k].data[0, :])
# model.bert.load_state_dict(state_dict)
# print("\n### Done Load BERT")
# sys.stdout.flush()


# re-initialize added special tokens ([SLOT], [NULL], [EOS])
# model.encoder.bert.embeddings.word_embeddings.weight.data[1].normal_(mean=0.0, std=0.02)
# model.encoder.bert.embeddings.word_embeddings.weight.data[2].normal_(mean=0.0, std=0.02)
# model.encoder.bert.embeddings.word_embeddings.weight.data[3].normal_(mean=0.0, std=0.02)

# re-initialize seg-2, seg-3
# model.bert.embeddings.token_type_embeddings.weight.data[2].normal_(mean=0.0, std=0.02)
# model.bert.embeddings.token_type_embeddings.weight.data[3].normal_(mean=0.0, std=0.02)
# model.to(device)

# tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)
# From here on use the tokenizer instance owned by the model; this rebinds the
# `tokenizer` name created in the earlier cell.
tokenizer = model.tokenizer
# tokenizer = BertTokenizer.from_pretrained('dsksd/bert-ko-small-minimal', additional_special_tokens = ['[SLOT]', '[NULL]','[EOS]'])
# tokenizer = BertTokenizer.from_pretrained('dsksd/bert-ko-small-minimal')

model.to(device)

BertConfig {
  "architectures": [
    "BertForPretraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "dropout": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "output_attentions": true,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 35003
}

### word index of '-',  17


Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


TransformerDST(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(35003, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            

In [14]:
# Debug check: display the class of the object bound to `model.encoder`.
type(model.encoder)

model.Encoder

In [15]:
# Sanity check: list all special tokens and the ids assigned to the
# additional ones ([SLOT], [NULL], [EOS]).
print(tokenizer.all_special_tokens)
print(tokenizer.additional_special_tokens_ids)

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]', '[SLOT]', '[NULL]', '[EOS]']
[35000, 35001, 35002]


### DATA PREPARATION

In [16]:
def _load_or_prepare_raw(cache_path, data_path):
    """Return the preprocessed examples for ``data_path``, cached at ``cache_path``.

    If the cache file exists it is unpickled and returned; otherwise the data
    is built with ``prepare_dataset`` using the current ``tokenizer``,
    ``slot_meta`` and ``args`` and then pickled to ``cache_path``.

    NOTE(review): the cache is keyed by file name only. After changing the
    tokenizer, ``n_history``, ``max_seq_length`` or ``op_code`` the files under
    ``raw_data/`` must be deleted by hand, or stale examples will be reused.
    SECURITY: ``pickle.load`` executes arbitrary code; only load cache files
    that this notebook produced itself.
    """
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    data_raw = prepare_dataset(data_path=data_path,
                               tokenizer=tokenizer,
                               slot_meta=slot_meta,
                               n_history=args.n_history,
                               max_seq_length=args.max_seq_length,
                               op_code=args.op_code)
    with open(cache_path, 'wb') as f:
        pickle.dump(data_raw, f)
    return data_raw


# Directory holding the pickled preprocessing caches.
os.makedirs('raw_data', exist_ok=True)

print('Making Train_Data_Raw....')
train_data_raw = _load_or_prepare_raw('./raw_data/train_data_raw', args.train_data_path)

# Only the training split is wrapped in a Dataset here; the dev and test splits
# are kept as raw example lists for evaluation.
train_data = MultiWozDataset(train_data_raw,
                             tokenizer,
                             slot_meta,
                             args.max_seq_length,
                             rng,
                             ontology,
                             args.word_dropout,
                             args.shuffle_state,
                             args.shuffle_p, pad_id=tokenizer.convert_tokens_to_ids(['[PAD]'])[0],
                             slot_id=tokenizer.convert_tokens_to_ids(['[SLOT]'])[0],
                             decoder_teacher_forcing=args.decoder_teacher_forcing,
                             use_full_slot=args.use_full_slot,
                             use_dt_only=args.use_dt_only, no_dial=args.no_dial,
                             use_cls_only=args.use_cls_only)
print("# train examples %d" % len(train_data_raw))

print('Making Dev_Data_Raw....')
# print(f'max_seq_length : {args.max_seq_length}')
dev_data_raw = _load_or_prepare_raw('./raw_data/dev_data_raw', args.dev_data_path)
print("# dev examples %d" % len(dev_data_raw))

test_data_raw = _load_or_prepare_raw('./raw_data/test_data_raw', args.test_data_path)
print("# test examples %d" % len(test_data_raw))

Making Train_Data_Raw....
### decoder_teacher_forcing: 1
# train examples 45320
Making Dev_Data_Raw....
# dev examples 5067
# test examples 14771


### Optimizer & Scheduler

In [17]:
######## STEPS ########
######## STEPS ########
# The schedule must span exactly the number of optimizer steps taken during
# training: one per batch. The training DataLoader does not drop the last
# partial batch, so batches per epoch are rounded up rather than truncated.
steps_per_epoch = -(-len(train_data_raw) // args.batch_size)  # ceiling division
num_train_steps = steps_per_epoch * args.n_epochs

# Parameters whose names contain any of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']


######## ENC/DEC OPTIMIZER ########
if args.use_one_optim:
    # One AdamW over all parameters, using the encoder learning rate and warmup.
    print("### Use One Optim")
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(
            nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(
            nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.enc_lr)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(num_train_steps * args.enc_warmup),
                                         num_training_steps=num_train_steps)

else:
    # Separate optimizers: BERT parameters vs. every parameter without 'bert' in its name.
    enc_param_optimizer = list(model.bert.named_parameters())  # TODO: For BERT only
    print('### Optim BERT: {:}'.format(len(enc_param_optimizer)))
    enc_optimizer_grouped_parameters = [
        {'params': [p for n, p in enc_param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in enc_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

    enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr)
    enc_scheduler = get_linear_schedule_with_warmup(enc_optimizer, num_warmup_steps=int(num_train_steps * args.enc_warmup),
                                     num_training_steps=num_train_steps)

    dec_param_optimizer = list(model.named_parameters())  # TODO:  For other parameters
    print('### Optim All: {:}'.format(len(dec_param_optimizer)))
    dec_param_optimizer = [p for (n, p) in dec_param_optimizer if 'bert' not in n]
    print('### Optim OTH: {:}'.format(len(dec_param_optimizer)))
    dec_optimizer = AdamW(dec_param_optimizer, lr=args.dec_lr)
    dec_scheduler = get_linear_schedule_with_warmup(dec_optimizer, num_warmup_steps=int(num_train_steps * args.dec_warmup),
                                    num_training_steps=num_train_steps)

if args.recover_e > 0:
    model_recover, enc_recover, dec_recover = load(args, str(args.recover_e))
    print("### Recover Model E{:}".format(args.recover_e))
    sys.stdout.flush()
    model.load_state_dict(model_recover)
    print("### Recover Optim E{:}".format(args.recover_e))
    sys.stdout.flush()
    if args.use_one_optim:
        # In single-optimizer mode `save` receives the optimizer in the
        # "enc_optimizer" slot, so that is the state that belongs to it.
        if enc_recover is not None:
            optimizer.load_state_dict(enc_recover)
    else:
        if enc_recover is not None:
            enc_optimizer.load_state_dict(enc_recover)
        # Restore the decoder optimizer from the recovered state (previously
        # the optimizer object itself was passed here by mistake).
        if dec_recover is not None:
            dec_optimizer.load_state_dict(dec_recover)

# Wrap for multi-GPU only after any weights have been restored into the bare model.
if n_gpu > 1:
    model = torch.nn.DataParallel(model)

### Use One Optim


### Train DataLoading

In [18]:
print('Making Train DataLoader...')
# Training batches are drawn through a RandomSampler and assembled by the
# dataset's own collate function; each worker is seeded via worker_init_fn.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=args.batch_size,
    sampler=train_sampler,
    num_workers=args.num_workers,
    collate_fn=train_data.collate_fn,
    worker_init_fn=worker_init_fn,
)

Making Train DataLoader...


### Train

In [19]:
def get_lr(scheduler):
    """Return the first entry of the scheduler's last computed learning rates."""
    last_lrs = scheduler.get_last_lr()
    return last_lrs[0]

In [20]:
######## TRAINING ########
######## TRAINING ########
print("Let's Do the Training!")

loss_fnc = nn.CrossEntropyLoss()
best_score = {'epoch': 0, 'joint_acc': 0, 'op_acc': 0, 'final_slot_f1': 0}

# Mixed precision is only active on CUDA; with enabled=False both autocast and
# GradScaler are pass-throughs, so the CPU debugging path is unaffected.
use_amp = device.type == 'cuda'
# Loss scaling keeps small fp16 gradients from underflowing to zero.
scaler = amp.GradScaler(enabled=use_amp)

log_every = 100        # print / log training statistics every N steps
eval_start_epoch = 8   # skip evaluation during the first epochs to save time

start_time = time.time()

for epoch in range(args.n_epochs):
    batch_loss = []
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader)):

        # Move everything except plain Python / NumPy containers to the device.
        batch = [b if isinstance(b, (int, dict, list, np.ndarray)) else b.to(device) for b in batch]

        input_ids_p, segment_ids_p, input_mask_p, \
        state_position_ids, op_ids, domain_ids, input_ids_g, segment_ids_g, position_ids_g, input_mask_g, \
        masked_pos, masked_weights, lm_label_ids, id_n_map, gen_max_len, n_total_pred = batch

        # Only the forward pass and the loss computation run under autocast.
        with amp.autocast(enabled=use_amp):
            domain_scores, state_scores, loss_g = model(input_ids_p, segment_ids_p, input_mask_p, state_position_ids,
                input_ids_g, segment_ids_g, position_ids_g, input_mask_g,
                masked_pos, masked_weights, lm_label_ids, id_n_map, gen_max_len, only_pred_op=args.only_pred_op, n_gpu=n_gpu)

            # Normalise the generation loss by the number of predicted tokens;
            # a batch without any generation target contributes nothing.
            if n_total_pred > 0:
                loss_g = loss_g.sum() / n_total_pred
            else:
                loss_g = 0

            loss_s = loss_fnc(state_scores.view(-1, len(op2id)), op_ids.view(-1))

            if args.only_pred_op:
                loss = loss_s
            else:
                loss = loss_s + loss_g

            if args.exclude_domain is not True:
                loss_d = loss_fnc(domain_scores.view(-1, len(domain2id)), domain_ids.view(-1))
                loss = loss + loss_d

        batch_loss.append(loss.item())

        # Backward runs outside autocast, on the scaled loss.
        scaler.scale(loss).backward()

        # scaler.step() unscales the gradients and skips the update if they
        # contain inf/NaN; update() is called once per iteration after all steps.
        if args.use_one_optim:
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
        else:
            scaler.step(enc_optimizer)
            scaler.step(dec_optimizer)
            scaler.update()
            enc_scheduler.step()
            dec_scheduler.step()

        model.zero_grad()

        if step % log_every == 0:
            # loss_g is the int 0 when the batch had no generation targets.
            gen_loss = loss_g.item() if torch.is_tensor(loss_g) else loss_g
            mean_loss = np.mean(batch_loss)

            log_payload = {
                "train/mean_loss": mean_loss,
                "train/state_loss": loss_s.item(),
                "train/gen_loss": gen_loss,
                "train/epoch": epoch+1,
            }
            # Only the schedulers that exist in the active optimizer mode are logged.
            if args.use_one_optim:
                log_payload["train/learning rate"] = get_lr(scheduler)
            else:
                log_payload["train/enc_learning rate"] = get_lr(enc_scheduler)
                log_payload["train/dec_learning rate"] = get_lr(dec_scheduler)

            if args.exclude_domain is not True:
                print("time %.1f min, [%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f, dom_loss : %.3f" \
                      % ((time.time()-start_time)/60, epoch+1, args.n_epochs, step,
                         len(train_dataloader), mean_loss,
                         loss_s.item(), gen_loss, loss_d.item()))
                log_payload["train/dom_loss"] = loss_d.item()
            else:
                print("time %.1f min, [%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f" \
                      % ((time.time()-start_time)/60, epoch+1, args.n_epochs, step,
                         len(train_dataloader), mean_loss,
                         loss_s.item(), gen_loss))
            wandb.log(log_payload)
            sys.stdout.flush()
            # The running mean restarts after every report.
            batch_loss = []

    # Checkpoint the model weights at the end of every epoch.
    if args.use_one_optim:
        save(args, epoch + 1, model, optimizer)
    else:
        save(args, epoch + 1, model, enc_optimizer, dec_optimizer)

    if (epoch+1) % args.eval_epoch == 0 and (epoch+1 >= eval_start_epoch):
        eval_res = model_evaluation(model, dev_data_raw, tokenizer, slot_meta, epoch+1, args.op_code,
                                    use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, use_cls_only=args.use_cls_only, n_gpu=n_gpu)
        print("### Epoch {:} Score : ".format(epoch+1), eval_res)
        wandb.log({
            'eval/epoch': epoch+1,
            'eval/joint_acc': eval_res['joint_acc'],
            'eval/slot_acc':eval_res['slot_acc'],
            'eval/slot_f1': eval_res['slot_f1'],
            'eval/op_acc': eval_res['op_acc'],
            'eval/op_f1': eval_res['op_f1'],
            'eval/final_slot_f1':eval_res['final_slot_f1'],
        })
        if eval_res['joint_acc'] > best_score['joint_acc']:
            best_score = eval_res

            # Keep a copy of the best dev model, without any DataParallel wrapper.
            model_to_save = model.module if hasattr(model, 'module') else model
            save_path = os.path.join(args.save_dir, 'model_best.bin')
            torch.save(model_to_save.state_dict(), save_path)

            # Score the test split only when the dev score improved. This
            # branch is already restricted to epochs >= eval_start_epoch.
            eval_res_test = model_evaluation(model, test_data_raw, tokenizer, slot_meta, epoch + 1, args.op_code,
                                             use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, use_cls_only=args.use_cls_only, n_gpu=n_gpu)
            print("### Epoch {:} Test Score : ".format(epoch + 1), eval_res_test)

        print("### Best Joint Acc: {:} ###".format(best_score['joint_acc']))
        print('\n')

  0%|          | 0/22660 [00:00<?, ?it/s]

Let's Do the Training!


  0%|          | 0/22660 [00:00<?, ?it/s]


TypeError: forward() got an unexpected keyword argument 'position_ids'

In [None]:
###### TEST DATA EVALUATION ######
print("Test using best model...")
# Before modification (disabled): reload the best checkpoint into a SomDST
# model and evaluate the test split under every combination of ground-truth
# operation / previous state / generation flags.
# best_epoch = best_score['epoch']
# ckpt_path = os.path.join(args.save_dir, 'model_best.bin')
# model = SomDST(model_config, len(op2id), len(domain2id), op2id['update'], args.exclude_domain)
# ckpt = torch.load(ckpt_path, map_location='cpu')
# model.load_state_dict(ckpt)
# model.to(device)

# model_evaluation(model, test_data_raw, tokenizer, slot_meta, 29, args.op_code,
#                  is_gt_op=False, is_gt_p_state=False, is_gt_gen=False)
# model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
#                  is_gt_op=False, is_gt_p_state=False, is_gt_gen=True)
# model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
#                  is_gt_op=False, is_gt_p_state=True, is_gt_gen=False)
# model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
#                  is_gt_op=False, is_gt_p_state=True, is_gt_gen=True)
# model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
#                  is_gt_op=True, is_gt_p_state=False, is_gt_gen=False)
# model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
#                  is_gt_op=True, is_gt_p_state=True, is_gt_gen=False)
# model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
#                  is_gt_op=True, is_gt_p_state=False, is_gt_gen=True)
# model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
#                  is_gt_op=True, is_gt_p_state=True, is_gt_gen=True)

In [21]:
# Load the raw prediction file; the context manager closes the handle even if
# parsing fails, and UTF-8 is requested explicitly rather than relying on the
# platform default.
with open('preds_999.json', encoding='utf-8') as f:
    pred = json.load(f)

FileNotFoundError: [Errno 2] No such file or directory: 'preds_999.json'

In [24]:
# Keep only the first element of every entry in `pred`.
predictions = {k: v[0] for k, v in pred.items()}

# NOTE(review): the content is JSON although the file is named "predictions.csv";
# the name is kept as-is in case downstream tooling expects it.
# The context manager guarantees the file is flushed and closed, and UTF-8 is
# explicit because ensure_ascii=False writes non-ASCII characters verbatim.
#         open(f"{args.output_dir}/predictions.csv", "w"),
with open("predictions.csv", "w", encoding="utf-8") as f:
    json.dump(
        predictions,
        f,
        indent=2,
        ensure_ascii=False,
    )