In [None]:
#pip install prettyprinter

In [None]:
#pip install ruamel.yaml

In [1]:
import argparse
import json
import os
import pickle
import random
import numpy as np

import torch
import torch.nn as nn

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup, AutoTokenizer, AutoConfig

from data_utils import (YamlConfigManager, WOSDataset, get_examples_from_dialogues, load_dataset, extract_label,
                        set_seed, custom_to_mask, custom_get_examples_from_dialogues, custom_load_dataset)

from evaluation import _evaluation
from inference import inference
from model import TRADE, masked_cross_entropy_for_value
from preprocessor import TRADEPreprocessor
from prettyprinter import cpprint

from pathlib import Path
import glob
import re

import wandb
import time

from torch.cuda.amp import GradScaler, autocast

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
cfg = YamlConfigManager('./config.yml', 'base').values
cpprint(cfg)

easydict.EasyDict({
    'data_dir': '../input',
    'model_dir': 'results',
    'train_batch_size': 4,
    'eval_batch_size': 8,
    'learning_rate': 1e-05,
    'adam_epsilon': 1e-08,
    'max_grad_norm': 1.0,
    'num_train_epochs': 30,
    'warmup_ratio': 0.0,
    'random_seed': 42,
    'n_gate': 5,
    'teacher_forcing_ratio': 0.5,
    'model_name_or_path': 'klue/roberta-large',
    'proj_dim': 'None',
    'tag': ['trade'],
    'use_kfold': False,
    'num_k': 0,
    'val_ratio': 0.1,
    'scheduler': 'Linear',
    'mask': False
})


In [4]:
# Get current learning rate
def get_lr(scheduler):
    return scheduler.get_last_lr()[0]

In [5]:
# random seed 고정
set_seed(cfg.random_seed)

# Data Loading
# train_data_file = f"{cfg.data_dir}/wos-v1_train.json"
train_data = json.load(open(f"{cfg.data_dir}/wos-v1_train.json"))
dev_data = json.load(open(f"{cfg.data_dir}/wos-v1_dev.json"))
dev_labels = extract_label(dev_data)
slot_meta = json.load(open(f"{cfg.data_dir}/slot_meta.json"))

# train_data, dev_data, dev_labels = load_dataset(train_data_file, cfg.val_ratio)
# train_data, dev_data, dev_labels = custom_load_dataset(train_data_file, cfg.val_ratio, k=0)

train_examples = custom_get_examples_from_dialogues(
    train_data, user_first=False, dialogue_level=False
)
dev_examples = custom_get_examples_from_dialogues(
    dev_data, user_first=False, dialogue_level=False
)

100%|██████████| 8000/8000 [00:00<00:00, 8552.51it/s] 
100%|██████████| 1000/1000 [00:00<00:00, 13331.84it/s]


In [6]:
train_examples[0]

DSTInputExample(guid='wos-v1_train_00000-0', context_turns=[], current_turn=['', ' # ', '서울 중앙에 있는 박물관을 찾아주세요', ' * '], label=['관광-종류-박물관', '관광-지역-서울 중앙'])

In [6]:
# Define Preprocessor
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name_or_path)

# Dealing with long texts The maximum sequence length of BERT is 512.
if 'roberta' in cfg.model_name_or_path:
    processor = TRADEPreprocessor(slot_meta, tokenizer, max_seq_length=510, n_gate=cfg.n_gate)  # roberta 특성상 510까지만 가능  https://github.com/pytorch/fairseq/issues/1177
    print('roberta')
else:
    processor = TRADEPreprocessor(slot_meta, tokenizer, max_seq_length=512, n_gate=cfg.n_gate)

roberta


In [7]:
# Extracting Featrues
cpprint('Extracting Features...')
train_features = processor.sep_custom_convert_examples_to_features(train_examples)
dev_features = processor.sep_custom_convert_examples_to_features(dev_examples)

Token indices sequence length is longer than the specified maximum sequence length for this model (528 > 512). Running this sequence through the model will result in indexing errors


'Extracting Features...'


KeyboardInterrupt: 

In [9]:
# # 전체 train data InputFeatur 저장
# with open('custom_train_features', 'wb') as f:
#     pickle.dump(train_features, f)
# with open('custom_dev_features', 'wb') as f:
#     pickle.dump(dev_features, f)

In [10]:
# 저장된 파일 사용
with open('custom_train_features', 'rb') as f:
    train_features = pickle.load(f)
with open('custom_dev_features', 'rb') as f:
    dev_features = pickle.load(f)

FileNotFoundError: [Errno 2] No such file or directory: 'custom_train_features'

In [7]:
# Slot Meta tokenizing for the decoder initial inputs
tokenized_slot_meta = []
for slot in slot_meta:
    tokenized_slot_meta.append(
        tokenizer.encode(slot.replace("-", " "), add_special_tokens=False)
    )

In [8]:
# Model 선언
config = AutoConfig.from_pretrained(cfg.model_name_or_path)
config.model_name_or_path = cfg.model_name_or_path
config.n_gate = cfg.n_gate
config.proj_dim = None

model = TRADE(config, tokenized_slot_meta)

model.to(device)
print("Model is initialized")

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

Model is initialized


In [None]:
# --wandb initialize with configuration
wandb.init(project='KLUE-DST', tags=cfg.tag, config=cfg)

In [9]:
train_data = WOSDataset(train_features)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(
    train_data,
    batch_size=cfg.train_batch_size,
    sampler=train_sampler,
    collate_fn=processor.collate_fn,
    num_workers=4,  # num_worker = 4 * num_GPU
    pin_memory=True,
)
print("# train:", len(train_data))

dev_data = WOSDataset(dev_features)
dev_sampler = SequentialSampler(dev_data)
dev_loader = DataLoader(
    dev_data,
    batch_size=cfg.eval_batch_size,
    sampler=dev_sampler,
    collate_fn=processor.collate_fn,
    num_workers=4,
    pin_memory=True,
)
print("# dev:", len(dev_data))




NameError: name 'train_features' is not defined

In [9]:
# Optimizer 및 Scheduler 선언
n_epochs = cfg.num_train_epochs

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.01,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]

t_total = len(train_loader) * n_epochs
optimizer = AdamW(optimizer_grouped_parameters, lr=cfg.learning_rate, eps=cfg.adam_epsilon)
warmup_steps = int(t_total * cfg.warmup_ratio)
# learning rate decreases linearly from the initial lr set in the optimizer to 0
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
)
teacher_forcing = cfg.teacher_forcing_ratio

loss_fnc_1 = masked_cross_entropy_for_value  # generation
loss_fnc_2 = nn.CrossEntropyLoss()  # gating
loss_fnc_pretrain = nn.CrossEntropyLoss()  # MLM pretrain

NameError: name 'model' is not defined

In [17]:
# 모델 저장될 파일 위치 생성
if not os.path.exists(f"{cfg.model_dir}"):
    os.mkdir(f"{cfg.model_dir}")
if not os.path.exists(f"{cfg.model_dir}/{wandb.run.name}"):
    os.mkdir(f"{cfg.model_dir}/{wandb.run.name}")

AttributeError: 'NoneType' object has no attribute 'name'

In [17]:
json.dump(
    vars(cfg),
    open(f"{cfg.model_dir}/{wandb.run.name}/exp_config.json", "w"),
    indent=2,
    ensure_ascii=False,
)
json.dump(
    slot_meta,
    open(f"{cfg.model_dir}/slot_meta.json", "w"),
    indent=2,
    ensure_ascii=False,
)

### Pretraining

In [18]:
# eval_data = json.load(open(f"../input/data/eval_dataset/eval_dials.json", "r"))

# eval_examples = get_examples_from_dialogues(
#     eval_data, user_first=False, dialogue_level=False
# )

# # Extracting Featrues
# eval_features = processor.convert_examples_to_features(eval_examples)
# eval_data = WOSDataset(eval_features)
# eval_sampler = SequentialSampler(eval_data)
# eval_loader = DataLoader(
#     eval_data,
#     batch_size=8,
#     sampler=eval_sampler,
#     collate_fn=processor.collate_fn,
# )

In [19]:
# MLM_PRE = True

# scaler = GradScaler()
# n_pretrain_epochs = 10

# def mlm_pretrain(loader, n_epochs):
#     model.train()
#     for step, batch in enumerate(tqdm(loader)):
#         input_ids, segment_ids, input_masks, gating_ids, target_ids, guids = [b.to(device) if not isinstance(b, list) else b for b in batch]
        
#         with autocast(): # 밑에 해당하는 코드를 자동으로 mixed precision으로 변환시켜서 실행
#             logits, labels = model.forward_pretrain(input_ids, tokenizer)
#             loss = loss_fnc_pretrain(logits.view(-1, config.vocab_size), labels.view(-1))

#         scaler.scale(loss).backward()
#         nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#         scaler.step(optimizer)
#         scaler.update()
#         scheduler.step()
#         optimizer.zero_grad()

#         if step % 100 == 0:
#             print('[%d/%d] [%d/%d] %f' % (epoch, n_epochs, step, len(loader), loss.item()))

# if MLM_PRE:
#     for epoch in range(n_pretrain_epochs):
#         mlm_pretrain(eval_loader, n_pretrain_epochs)

### Training

In [None]:
# backward pass시 gradient 정보가 손실되지 않게 하려고 사용(loss에 scale factor를 곱해서 gradient 값이 너무 작아지는 것을 방지)
scaler = GradScaler()
best_score, best_checkpoint = 0, 0

for epoch in range(n_epochs):
    start_time = time.time()
    batch_loss = []
    model.train()
    for step, batch in enumerate(train_loader):
        input_ids, segment_ids, input_masks, gating_ids, target_ids, guids = [
            b.to(device) if not isinstance(b, list) else b for b in batch
        ]
        # mask
        if cfg.mask:
            change_mask_prop = 0.8
            mask_p = random.random()
            if cfg.mask and mask_p < change_mask_prop:
                input_ids = custom_to_mask(input_ids)
        # teacher forcing
        if (
            teacher_forcing > 0.0
            and random.random() < teacher_forcing
        ):
            tf = target_ids
        else:
            tf = None

        optimizer.zero_grad()  # optimizer는 input으로 model parameter를 가진다 -> zero_grad()로 파라미터 컨드롤 가능

        with autocast():  # 밑에 해당하는 코드를 자동으로 mixed precision으로 변환시켜서 실행
            all_point_outputs, all_gate_outputs = model(
                input_ids, segment_ids, input_masks, target_ids.size(-1), tf
            )
            # generation loss
            loss_1 = loss_fnc_1(
                all_point_outputs.contiguous(),
                target_ids.contiguous().view(-1),
                tokenizer.pad_token_id,
            )
            
            # gating loss
            loss_2 = loss_fnc_2(
                all_gate_outputs.contiguous().view(-1, cfg.n_gate),
                gating_ids.contiguous().view(-1),
            )
            loss = loss_1 + loss_2
        
        batch_loss.append(loss.item())

        scaler.scale(loss).backward()
        nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        # global_step 추가 부분
        wandb.log({"train/learning_rate": get_lr(scheduler),
                   "train/epoch": epoch
                   })
        if step % 100 == 0:
            print(
                f"[{epoch}/{n_epochs}] [{step}/{len(train_loader)}] loss: {loss.item()} gen: {loss_1.item()} gate: {loss_2.item()}"
            )

            # -- train 단계에서 Loss, Accuracy 로그 저장
            wandb.log({
                "train/loss": loss.item(),
                "train/gen_loss": loss_1.item(),
                "train/gate_loss": loss_2.item(),
            })

#     predictions, p_logits, p_idx, g_logits = inference(model, dev_loader, processor, device, cfg.n_gate)
    predictions = inference(model, dev_loader, processor, device, cfg.n_gate)
    eval_result = _evaluation(predictions, dev_labels, slot_meta)

    # -- eval 단계에서 Loss, Accuracy 로그 저장
    wandb.log({
        "eval/join_goal_acc": eval_result["joint_goal_accuracy"],
        "eval/turn_slot_f1": eval_result["turn_slot_f1"],
        "eval/turn_slot_acc": eval_result["turn_slot_accuracy"],
    })

    for k, v in eval_result.items():
        print(f"{k}: {v}")

    if best_score < eval_result['joint_goal_accuracy']:
        cpprint(f"--Update Best checkpoint!, epoch: {epoch+1}")
        best_score = eval_result['joint_goal_accuracy']
        best_checkpoint = epoch
        if not os.path.isdir(cfg.model_dir):
            os.makedirs(cfg.model_dir)
        print("--Saving best model checkpoint")
        torch.save(model.state_dict(), f"{cfg.model_dir}/{wandb.run.name}/best.pth")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler.state_dict': scheduler.state_dict(),
            'loss': loss.item(),
            'gen_loss': loss_1.item(),
            'gate_loss': loss_2.item(),
        }, os.path.join(f"{cfg.model_dir}/{wandb.run.name}", "training_best_checkpoint.bin"))
        
        # logit 저장
#         np.save(os.path.join(f"{cfg.model_dir}/{wandb.run.name}", r'p_logits.npy'), p_logits)
#         np.save(os.path.join(f"{cfg.model_dir}/{wandb.run.name}", r'p_idx.npy'), p_idx)
#         np.save(os.path.join(f"{cfg.model_dir}/{wandb.run.name}", r'g_logits.npy'), g_logits)

        
    torch.save(model.state_dict(), f"{cfg.model_dir}/{wandb.run.name}/last.pth")
    print(f"time for 1 epoch: {time.time() - start_time}")



[0/30] [0/14698] loss: 10.327384948730469 gen: 8.79484748840332 gate: 1.5325372219085693
[0/30] [100/14698] loss: 7.341711044311523 gen: 6.595045566558838 gate: 0.746665358543396
[0/30] [200/14698] loss: 5.657776355743408 gen: 4.9181108474731445 gate: 0.7396653294563293
[0/30] [300/14698] loss: 4.197666168212891 gen: 3.5040550231933594 gate: 0.6936109066009521
[0/30] [400/14698] loss: 3.966791868209839 gen: 3.1811580657958984 gate: 0.7856337428092957
[0/30] [500/14698] loss: 2.7266788482666016 gen: 2.219189405441284 gate: 0.5074893236160278
[0/30] [600/14698] loss: 3.014867067337036 gen: 2.3325817584991455 gate: 0.6822852492332458
[0/30] [700/14698] loss: 2.2547967433929443 gen: 1.696719765663147 gate: 0.5580769777297974
[0/30] [800/14698] loss: 2.1247658729553223 gen: 1.6216602325439453 gate: 0.5031057000160217
[0/30] [900/14698] loss: 1.988816261291504 gen: 1.441561222076416 gate: 0.5472550392150879
[0/30] [1000/14698] loss: 2.1181466579437256 gen: 1.5566035509109497 gate: 0.56154316

### Inference

In [10]:
eval_data = json.load(open(f"{cfg.data_dir}/wos-v1_dev_fixed_v2.json"))
eval_labels = extract_label(eval_data)

eval_examples = custom_get_examples_from_dialogues(
    eval_data, user_first=False, dialogue_level=False
)

# Extracting Featrues
eval_features = processor.sep_custom_convert_examples_to_features(eval_examples)
eval_data = WOSDataset(eval_features)
eval_sampler = SequentialSampler(eval_data)
eval_loader = DataLoader(
    eval_data,
    batch_size=8,
    sampler=eval_sampler,
    collate_fn=processor.collate_fn,
)

model.load_state_dict(torch.load('results/TRADE_roberta-large/best.pth'))

100%|██████████| 1000/1000 [00:00<00:00, 3472.50it/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors


<All keys matched successfully>

In [11]:
predictions = inference(model, eval_loader, processor, device, cfg.n_gate)
eval_result = _evaluation(predictions, eval_labels, slot_meta)

100%|██████████| 903/903 [02:53<00:00,  5.20it/s]

{'joint_goal_accuracy': 0.7314507198228128, 'turn_slot_accuracy': 0.9913252122554636, 'turn_slot_f1': 0.9655316727871294}





In [None]:
json.dump(predictions, open('predictions.csv', 'w'), indent=2, ensure_ascii=False)