In [1]:
import json
import random
from copy import deepcopy
from dataclasses import dataclass
from typing import List
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler

from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BartForConditionalGeneration
from transformers import PreTrainedTokenizerFast

from data_utils import split_slot

In [2]:
data = json.load(open( "../input/data/train_dataset/train_dials.json"))

In [3]:
model = BartForConditionalGeneration.from_pretrained("hyunwoongko/kobart")
tokenizer = PreTrainedTokenizerFast.from_pretrained("hyunwoongko/kobart")

## InputExample 및 Feature 정의, 추출 

In [6]:
@dataclass
class CoCoGenInputExample:
    guid: str
    system_utter: str
    turn_state: List[str]
    user_utter: str

@dataclass
class CoCoGenInputFeature:
    input_id: List[int]
    target_id: List[int]

In [7]:
def get_coco_examples_from_dialogue(dialogue):
    """ Dialogue 데이터셋 파일 -> CoCoGenInputExamples """
    guid = dialogue["dialogue_idx"]
    examples = []
    d_idx = 0
    previous_state = []
    for idx, turn in enumerate(dialogue["dialogue"]):
        if turn["role"] != "user":
            continue

        if idx:
            sys_utter = dialogue["dialogue"][idx - 1]["text"]
        else:
            sys_utter = ""

        user_utter = turn["text"]
        state = turn.get("state")

        turn_state = sorted(list(set(state) - set(previous_state)))
        examples.append(CoCoGenInputExample(guid=f"{guid}-{d_idx}",
                                            system_utter=sys_utter,
                                            turn_state=turn_state,
                                            user_utter=user_utter))

        d_idx += 1
        previous_state = state

    return examples


def convert_example_to_feature(example, tokenizer):
    """ CoCoGenInputExamples -> CoCoGenInputFeature """
    sys = tokenizer.tokenize(example.system_utter)
    turn_state = ', '.join([s.replace('-', ' ') for s in example.turn_state])
    state = tokenizer.tokenize(turn_state)
    user = [tokenizer.bos_token] + tokenizer.tokenize(example.user_utter) + [tokenizer.eos_token]

    input_tokens = [tokenizer.bos_token] + sys + [tokenizer.eos_token] + state + [tokenizer.eos_token]
    input_id = tokenizer.convert_tokens_to_ids(input_tokens)
    target_id = tokenizer.convert_tokens_to_ids(user)

    return CoCoGenInputFeature(input_id=input_id, target_id=target_id)


def pad_ids(arrays, pad_idx, max_length=-1):
    if max_length < 0:
        max_length = max(list(map(len, arrays)))

    arrays = [array + [pad_idx] * (max_length - len(array)) for array in arrays]
    return arrays


def collate_fn(batch):
    input_ids = torch.LongTensor(pad_ids([b.input_id for b in batch], tokenizer.pad_token_id))
    target_ids = torch.LongTensor(pad_ids([b.target_id for b in batch], -100))
    input_masks = input_ids.ne(tokenizer.pad_token_id).float()
    return input_ids, target_ids, input_masks


In [8]:
examples = []
for dialogue in tqdm(data):
    examples.extend(get_coco_examples_from_dialogue(dialogue))

features = []
for example in tqdm(examples):
    features.append(convert_example_to_feature(example, tokenizer))

100%|██████████| 7000/7000 [00:00<00:00, 18069.44it/s]
100%|██████████| 51245/51245 [00:20<00:00, 2539.17it/s]


## Data Loader 및 Optimization 준비 

In [9]:
num_train_epochs = 50
batch_size = 32
lr = 5e-5
warmup_ratio = 0.1
weight_decay = 0.01

n_gpu = torch.cuda.device_count()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
class CoCoGenDataset(Dataset):
    def __init__(self, features):
        self.features = features
        self.length = len(self.features)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        return self.features[idx]

train_data = CoCoGenDataset(features)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler, collate_fn=collate_fn)

In [11]:
t_total = len(train_loader) * num_train_epochs
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=int(t_total * warmup_ratio), num_training_steps=t_total
)

model.to(device)
print("")




## 50epoch 학습된 모델 체크포인트 불러오기

In [12]:
checkpoint = torch.load('./coco/training_best_checkpoint.bin')

In [13]:
print(checkpoint.keys())

dict_keys(['epoch', 'model_state_dict', 'optimizer_state_dict', 'scheduler.state_dict', 'loss'])


In [14]:
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler.load_state_dict(checkpoint['scheduler.state_dict'])

epoch = checkpoint['epoch']
loss = checkpoint['loss']



In [15]:
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(30000, 768, padding_idx=3)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(30000, 768, padding_idx=3)
      (embed_positions): LearnedPositionalEmbedding(1028, 768, padding_idx=3)
      (layers): ModuleList(
        (0): EncoderLayer(
          (self_attn): Attention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
     

## Inference

In [16]:
def coco_generator(example, slot_value_dict, slot_comb_dict={}, verbose=False):
    if not example.turn_state:
        return example

    coco = deepcopy(example)
    num_state = len(coco.turn_state)
    is_drop = False

    # drop: dialogue state 중 하나를 제거합니다. (e.g., [식당-종류-양식당, 식당-예약 시간-18:00] -> [식당-종류-양식당])
    if num_state > 1:
        drop_idx = random.choice(range(num_state))
        coco.turn_state.pop(drop_idx)
        num_state -= 1
        is_drop = True

    # change: dialogue state의 value 중 하나를 다른 value로 대체합니다. (e.g., [식당-종류-양식당] -> [식당-종류-중식당])
    change_idx = random.choice(range(num_state))
    origin_slot_value = coco.turn_state[change_idx]
    st, sv = split_slot(origin_slot_value, True)
    candidates = slot_value_dict.get(st, [sv])
    new_sv = random.choice(candidates[1:])
    new_slot_value = f"{st}-{new_sv}"
    coco.turn_state[change_idx] = new_slot_value

    # add: slot_comb_dict에서 하나의 slot-value를 생성합니다. (e.g., [식당-종류-중식당] -> [식당-종류-중식당, 식당-예약 인원-2])
    combinations = slot_comb_dict.get(st)

    if not combinations:
        return coco

    co_st = random.choice(combinations)
    candidates = slot_value_dict.get(co_st, ['dontcare'])
    co_sv = random.choice(candidates[1:])
    new_slot_value = f"{co_st}-{co_sv}"
    coco.turn_state.append(new_slot_value)

    if verbose:
        print("Before:", example.turn_state)
        print("After:", coco.turn_state)

    return coco

## Dictionary

In [17]:
slot_value_dict = json.load(open('../input/data/train_dataset/ontology.json'))

In [20]:
slot_comb_dict = {
    '숙소-가격대': ['숙소-예약 명수', '숙소-예약 요일', '숙소-예약 기간'],
    '숙소-종류': ['숙소-예약 명수', '숙소-예약 요일', '숙소-예약 기간'],
    '숙소-지역': ['숙소-예약 명수', '숙소-예약 요일', '숙소-예약 기간'],
    '숙소-인터넷 가능': ['숙소-예약 명수', '숙소-예약 요일', '숙소-예약 기간'],
    '숙소-주차 가능': ['숙소-예약 명수', '숙소-예약 요일', '숙소-예약 기간'],
    '숙소-흡연 가능': ['숙소-예약 명수', '숙소-예약 요일', '숙소-예약 기간'],
    '숙소-조식 가능': ['숙소-예약 명수', '숙소-예약 요일', '숙소-예약 기간'],
    '숙소-헬스장 유무': ['숙소-예약 명수', '숙소-예약 요일', '숙소-예약 기간'],
    '숙소-수영장 유무': ['숙소-예약 명수', '숙소-예약 요일', '숙소-예약 기간'],
    '숙소-스파 유무': ['숙소-예약 명수', '숙소-예약 요일', '숙소-예약 기간']
}

In [38]:
x = random.choice(examples)
print(x)

CoCoGenInputExample(guid='morning-silence-2458:관광_식당_지하철_11-0', system_utter='', turn_state=['관광-종류-공원', '관광-지역-서울 중앙'], user_utter='안녕하세요. 서울 중앙에 있는 공원을 방문하고 싶은데요. 추천 좀 해주세요.')


In [39]:
x = coco_generator(x, slot_value_dict, slot_comb_dict, verbose=True)
print(x)

CoCoGenInputExample(guid='morning-silence-2458:관광_식당_지하철_11-0', system_utter='', turn_state=['관광-지역-서울 서쪽'], user_utter='안녕하세요. 서울 중앙에 있는 공원을 방문하고 싶은데요. 추천 좀 해주세요.')


In [40]:
x = convert_example_to_feature(x, tokenizer)
print(x)

CoCoGenInputFeature(input_id=[0, 1, 15310, 14331, 14245, 28155, 1], target_id=[0, 27616, 25161, 14245, 14957, 11786, 14082, 14061, 14810, 14941, 14058, 16441, 9828, 14543, 16764, 14813, 19024, 25161, 1])


In [41]:
print("input:", tokenizer.decode(x.input_id))

input: <s></s> 관광 지역 서울 서쪽</s>


In [42]:
#Huggingface의 BartForConditionalGeneration은 model.generate를 통해 쉽게 generation을 진행할 수 있습니다 🙌🏻
input_id = torch.LongTensor([x.input_id]).to(device)
o = model.generate(input_id,
                   decoder_start_token_id=tokenizer.bos_token_id,
                   bos_token_id=tokenizer.bos_token_id,
                   eos_token_id=tokenizer.eos_token_id,
                   pad_token_id=tokenizer.pad_token_id,
                   max_length=30,
                   early_stopping=True,
                   num_beams=8,
                   top_k=30,
                   temperature=1.5,
                   do_sample=True)
q = tokenizer.decode(o.tolist()[0], skip_special_tokens=True)

In [43]:
print("Before:", tokenizer.decode(x.target_id, skip_special_tokens=True))
print("After:", q)

Before: 안녕하세요. 서울 중앙에 있는 공원을 방문하고 싶은데요. 추천 좀 해주세요.
After: 안녕하세요. 서울 서쪽에 유명한 관광지가 있나요?
