In [1]:
# Give this kernel's OS process a recognisable title so it can be told apart in process listings.
from setproctitle import setproctitle
setproctitle("Hodong_Transformer")

In [2]:
import os
import json
import pickle
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

from transformer.preprocessors.blender_bot_preprocessor import GeneratorFinetuningPreprocessor
from transformer.data.dataset import DatasetInterface, DatasetFromDir
from transformer.data.blender_bot_data_loader import GeneratorFinetuningDataLoader
from transformer.models.transformer import Transformer
from transformer.trainers.blender_bot_trainer import GeneratorFinetuningTransformerTrainer
from transformer.trainers.utils import *



### Set Dataset Root & Load File Paths

In [1]:
# Root directory (datasets, SentencePiece models, saved models) on each machine this
# notebook has been run on. Switch machines by changing DATASET_ENV instead of
# commenting whole blocks in and out.
DATASET_DIRS = {
    "aibud_dev": "/Users/aibud_dev/_jupyter",
    "picas_server": "/home/picas/_jupyter",
    "korea_server": "/home/mnt/guest1",
    "bigshane_local": r"D:\_jupyter",  # raw string: "\_" is not a valid escape sequence
    "aws": "/home/ubuntu/data",
}
DATASET_ENV = "korea_server"
dataset_dir = DATASET_DIRS[DATASET_ENV]

# The file-path config sits at the same relative location on every machine, so it is
# loaded exactly once. This cell needs `json` from the imports cell: run that cell first
# (the recorded run of this cell failed with a NameError because it had not been run).
path = "./config/file_path.json"
with open(path, "r", encoding="utf-8") as fp:
    file_path = json.load(fp)

NameError: name 'json' is not defined

### Load Preprocessor

In [4]:
# Experiment configuration; later cells read its "data", "model", "criterion",
# "optimizer" and "data_loader" sections.
config_path = "./scripts/transformer/config/generator_finetuning_korea.json"
with open(config_path, "r", encoding="utf-8") as fp:
    config = json.load(fp)

In [5]:
# src_language = "kor"
# tgt_language = "kor"
# encoding = "utf-8"
# src_vocab_size = tgt_vocab_size = 150000

# src_spm_model_path = dataset_dir + "/spm_model/{language}/spoken_pretrain_spm_v{vocab_size}".format(language=src_language, vocab_size=config["model"]["src_vocab_size"])
# tgt_spm_model_path = dataset_dir + "/spm_model/{language}/spoken_pretrain_spm_v{vocab_size}".format(language=tgt_language, vocab_size=config["model"]["tgt_vocab_size"])
# trfr_prep = GeneratorFinetuningPreprocessor(src_language=src_language, tgt_language=tgt_language, src_spm_model_path=src_spm_model_path, tgt_spm_model_path=tgt_spm_model_path, embedding_dict=config["model"]["embedding_dict"])

In [6]:
# Both SentencePiece models live under
# <dataset_dir>/spm_model/<language>/spoken_pretrain_spm_v<vocab_size>.
# Only the suffix is formatted; dataset_dir is prepended untouched.
spm_model_suffix_template = "/spm_model/{language}/spoken_pretrain_spm_v{vocab_size}"
src_spm_model_path = dataset_dir + spm_model_suffix_template.format(
    language=config["data"]["src_language"],
    vocab_size=config["model"]["src_vocab_size"],
)
tgt_spm_model_path = dataset_dir + spm_model_suffix_template.format(
    language=config["data"]["tgt_language"],
    vocab_size=config["model"]["tgt_vocab_size"],
)
# Alternative: build the paths from templates stored in the config itself.
# src_spm_model_path = config["data"]["src_spm_model_path"].format(root_dir=config["data"]["root_dir"], language=config["data"]["src_language"], vocab_size=config["model"]["src_vocab_size"])
# tgt_spm_model_path = config["data"]["tgt_spm_model_path"].format(root_dir=config["data"]["root_dir"], language=config["data"]["tgt_language"], vocab_size=config["model"]["tgt_vocab_size"])

# Preprocessor owning the source- and target-side tokenizers used by every later cell.
preprocessor = GeneratorFinetuningPreprocessor(
    src_language=config["data"]["src_language"],
    tgt_language=config["data"]["tgt_language"],
    src_spm_model_path=src_spm_model_path,
    tgt_spm_model_path=tgt_spm_model_path,
    embedding_dict=config["model"]["embedding_dict"],
)

Imported konlpy.tag.Mecab successfully
loaded spm_model: '/Users/aibud_dev/_jupyter/spm_model/kor/spoken_pretrain_spm_v30000/'


## Set Trainer

In [7]:
# Create the fine-tuning trainer. `temp_dir` is where models are saved while training;
# the recorded output shows the trainer appending a timestamped sub-directory to it.
trainer = GeneratorFinetuningTransformerTrainer(temp_dir=dataset_dir+"/model/temp/")
# Alternative: take the save directory and the learning-rate warm-up settings from `config`.
# trainer = GeneratorFinetuningTransformerTrainer(temp_dir=config["train"]["temp_save_path"])
# trainer.set_lr_update(initial_learning_rate=config["optimizer"]["initial_learning_rate"], num_warmup_steps=config["train"]["num_warmup_steps"])

'temp_dir' has been set to '/Users/aibud_dev/_jupyter/model/temp/20210826_180449/' to save model while training
LearningRate schedule has been set to 'transformer_lambda'


## Single-GPU Training

### Build Transformer

In [8]:
# Build the model: padding ids come from the tokenizers, every other
# hyper-parameter from the "model" section of the config.
transformer = Transformer(
    src_pad_token_id=preprocessor.src_spm_tokenizer.special_token_dict["pad"]["id"],
    tgt_pad_token_id=preprocessor.tgt_spm_tokenizer.special_token_dict["pad"]["id"],
    **config["model"]
)

In [None]:
# Warm-start: load the saved state dict from the dialog_pretrain run (20210821, epoch 100)
# into the freshly built model.
# NOTE(review): `load_state_dict` and `ModelFilenameConstants` are not imported by name in
# this notebook; presumably they come from `from transformer.trainers.utils import *` — confirm.
model_dir = dataset_dir + "/model/transformer/dialog_pretrain/20210821/epoch_100/"
transformer = load_state_dict(object=transformer, path=model_dir + ModelFilenameConstants.MODEL_STATE_DICT_FILENAME)

### Set criterions & optimizer

In [9]:
# Loss functions (with their weights) and the optimizer. Explicit arguments come from
# the model config and the target tokenizer; the rest is forwarded from the
# "criterion" / "optimizer" sections of the config.
criterions, criterion_weights = trainer.get_criterions(
    tgt_timesteps=config["model"]["tgt_timesteps"],
    tgt_vocab_size=config["model"]["tgt_vocab_size"],
    tgt_pad_token_id=preprocessor.tgt_spm_tokenizer.special_token_dict["pad"]["id"],
    **config["criterion"]
)
optimizer = trainer.get_optimizer(
    model=transformer,
    **config["optimizer"]
)

### Set Device

In [10]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Move model, optimizer and criterions to that device, in this order.
transformer, optimizer, criterions = (
    GeneratorFinetuningTransformerTrainer.set_device(obj=component, device=device)
    for component in (transformer, optimizer, criterions)
)

Setting model device: cpu
Setting criterions device: cpu


## Load Dataset & DataLoader

In [15]:
# data_loader_params
batch_size = 16
nprocs = 1

# Every split of the multi-turn fine-tuning corpus lives under one root directory.
total_data_dir = dataset_dir + "/dataset/preprocessed/dialog_finetuning/kor/multi_turn_v3/"
sample_data_dir = total_data_dir + "sample/"
train_data_dir = total_data_dir + "train/"
val_data_dir = total_data_dir + "val/"


def build_data_loader(data_dir, batch_size, nprocs):
    """Build the dataset, the data-loader parameters and the data loader for one split.

    Reads the notebook globals ``config``, ``device``, ``preprocessor`` and ``trainer``,
    so the cells defining them must have been run first.

    Args:
        data_dir: Directory holding the files of the split.
        batch_size: Batch size given to both the dataset and the data-loader parameters.
        nprocs: Forwarded as ``nprocs`` to the dataset and the data-loader parameters.

    Returns:
        Tuple ``(dataset, data_loader_params, data_loader)``.
    """
    dataset = DatasetFromDir(
        data_dir=data_dir,
        batch_size=batch_size,
        device=device,
        nprocs=nprocs,
        encoding=config["data"]["encoding"],
        extension=config["data"]["extension"],
    )
    data_loader_params = trainer.get_data_loader_params(
        dataset=dataset,
        preprocessor=preprocessor,
        batch_size=batch_size,
        device=device,
        nprocs=nprocs,
        **config["model"],
        **config["data_loader"]
    )
    data_loader = trainer.create_data_loader(**data_loader_params)
    return dataset, data_loader_params, data_loader


# Train and validation splits go through the same pipeline; only the directory differs.
train_dataset, train_data_loader_params, train_data_loader = build_data_loader(data_dir=train_data_dir, batch_size=batch_size, nprocs=nprocs)
val_dataset, val_data_loader_params, val_data_loader = build_data_loader(data_dir=val_data_dir, batch_size=batch_size, nprocs=nprocs)

In [12]:
# ngram = 5
# utterances = [utterance for row in val_dataset.get_all_data() for utterance in row["utterances"]]
# target_prev_token_distribution, special_token_ids = preprocessor.extract_prev_token_distribution(sentences=utterances, ngram=ngram)
# trainer.set_prev_token_distribution(prev_token_distribution=target_prev_token_distribution, special_token_ids=special_token_ids)

### Dataset summary

In [16]:
# Print a decoded sample and length statistics for the training loader.
# Per the recorded output, the call returns one (min, max, Q1, Q2, Q3, avg) tuple for
# "context & condition" and one for "candidate".
train_data_loader.summary(show_sample=True)

Extracting context_rows & candidate_rows: 100%|██████████| 51779/51779 [00:00<00:00, 141980.46it/s]
Extracting length_list:   1%|          | 284/51779 [00:00<00:35, 1430.90it/s]

context & condition sample: [['맞아요. 돌아오지 않는 시간들이니까요. ', '다시 돌아가고 싶어요.'], ['저도 돌아가고 싶다는 생각을 한 적 이있었는데 지금은 계속 과거를 아쉬워하는 것보단 지금 행복을 충분히 느끼면 좋겠다고 생각해요.']]
candidate sample: ['저도 돌아가고 싶다는 생각을 한 적 이있었는데 지금은 계속 과거를 아쉬워하는 것보단 지금 행복을 충분히 느끼면 좋겠다고 생각해요.']
context & condition 

Extracting length_list: 100%|██████████| 51779/51779 [00:38<00:00, 1360.19it/s]
Extracting length_list:   1%|          | 591/51779 [00:00<00:08, 5906.75it/s]

Min: 1.000	Max: 969.000	Avg: 91.102	Q1: 48.000	Q2: 78.000	Q3: 120.000
candidate 

Extracting length_list: 100%|██████████| 51779/51779 [00:08<00:00, 5860.32it/s]

Min: 1.000	Max: 201.000	Avg: 21.516	Q1: 12.000	Q2: 18.000	Q3: 27.000





((1, 969, 48.0, 78.0, 120.0, 91.10185596477336),
 (1, 201, 12.0, 18.0, 27.0, 21.516135885204427))

### DataLoader encode test

In [15]:
row_idx = 2
target_idx = 0
# Advance the loader up to batch `target_idx`; the parts unpacked last are inspected below.
for batch_idx, batch in enumerate(train_data_loader):
    src_inputs, tgt_inputs, tgt_outputs = batch
    if batch_idx >= target_idx:
        break

# Positions in row `row_idx` that hold a "sep" token (position 0 is always listed).
sep_token_id = preprocessor.src_spm_tokenizer.special_token_dict["sep"]["id"]
row_tokens = src_inputs["token"][row_idx]
sep_positions = [
    position
    for position, token_id in enumerate(row_tokens)
    if position == 0 or token_id == sep_token_id
]
print("src_input_token:\t", sep_positions)

# Positions after which the segment id changes (position 0 is always listed).
row_segments = src_inputs["segment"][row_idx]
segment_change_positions = [
    position
    for position in range(len(row_segments) - 1)
    if position == 0 or row_segments[position] != row_segments[position + 1]
]
print("src_input_segment:\t", segment_change_positions)
# print("src_input_turn:\t", [token_idx for token_idx in range(0, len(src_inputs["turn"][row_idx])-1) if token_idx==0 or src_inputs["turn"][row_idx][token_idx]!=src_inputs["turn"][row_idx][token_idx+1]])

# Decode the whole batch back to text and show source / target-input / target-output per row.
decoded_src_inputs = preprocessor.src_decode(src_inputs["token"])
decoded_tgt_inputs = preprocessor.tgt_decode(tgt_inputs["token"])
decoded_tgt_outputs = preprocessor.tgt_decode(tgt_outputs["lm"])
row_labels = ("src_input_token:\t", "tgt_input_token:\t", "tgt_output_token:\t")
for decoded_row in zip(decoded_src_inputs, decoded_tgt_inputs, decoded_tgt_outputs):
    for label, text in zip(row_labels, decoded_row):
        print(label, text)
    print()

src_input_token:	 [0, 49, 59]
src_input_segment:	 [0, 49, 59]
src_input_token:	 <cls><spk1> 요즘 은 그런 분 들 많 잖아요 . 자신 의 얘기 를 연기 하 는 것 도 독보 적 인 능력 이 죠 .<spk2> 제 가 그럴 능력 까진 .<spk1> 일단 해 보 는 거 죠 . 사람 일 모르 잖 아요 ?<spk2> 할 이야기 가 많 긴 해요 .<spk1> 그것 들 이 다 자신 만 의 이야기 가 되 는 거 죠 .<spk2> 제 경험 들 ?<sep> test condition 입니다 .<sep>
tgt_input_token:	 <spk1> 그렇 죠 . 누군가 는 재밌 어 하 지 않 을까요 ?
tgt_output_token:	 그렇 죠 . 누군가 는 재밌 어 하 지 않 을까요 ?

src_input_token:	 <cls><spk2> 이렇게 얘기 를 하 다 보 니 제 가 뭘 원하 는지 조금 씩 보이 기 시작 하 네요 .<sep> test condition 입니다 .<sep>
tgt_input_token:	 <spk1> 조금 이 라도 도움 이 되 셨 다니 다행 이 예요
tgt_output_token:	 조금 이 라도 도움 이 되 셨 다니 다행 이 예요

src_input_token:	 <cls><spk1> 오 징어 , 삼겹살 , 치킨 정말 맛있 죠<spk2> 맞 아요 정말 맛있 어요<spk1> 오징어 를 좋아하 는 이유 있 으세요 ?<spk2> 어 일단 쫄깃 하 고 씹 는 식감 이 좋 더라구요<sep> test condition 입니다 .<sep>
tgt_input_token:	 <spk1> 맞 아요 오징어 는 그런 점 이 좋 더라구요 . 삼겹살 은 좋 아 하 시 는 이유 가 있 으세요 ?
tgt_output_token:	 맞 아요 오징어 는 그런 점 이 좋 더라구요 . 삼겹살 은 좋 아 하 시 는 이유 가 있 으세요 ?

src_input_token:	 <cls><spk2> 학교 도서관 리모 델 링 하 

## Train test

In [16]:
# Settings shared by the trainer.fit / train_epoch / iteration cells below.
epoch = 20
amp = True
# A gradient scaler exists only when mixed precision is requested.
scaler = torch.cuda.amp.GradScaler() if amp else None
# NOTE(review): -1 presumably disables periodic saving / per-batch logging — confirm in the trainer.
save_per_epoch = -1
save_per_batch = -1
keep_last = True
verbose_per_epoch = 1
verbose_per_batch = -1



### trainer.fit

In [3]:
# Full training run over `epoch` epochs with validation.
# NOTE(review): `scaler` is not passed here although the train_epoch / iteration cells
# pass it; presumably fit() creates its own when amp is on — confirm.
history = trainer.fit(
    model=transformer,
    train_data_loader=train_data_loader,
    val_data_loader=val_data_loader,
    criterions=criterions,
    criterion_weights=criterion_weights,
    optimizer=optimizer,
    device=device,
    epoch=epoch,
    amp=amp,
    save_per_epoch=save_per_epoch,
    save_per_batch=save_per_batch,
    keep_last=keep_last,
    verbose_per_epoch=verbose_per_epoch,
    verbose_per_batch=verbose_per_batch,
)

### trainer.train_epoch

In [None]:
# Wrap the training loader in a progress bar that starts at the loader's own start offset.
data_iter = tqdm(
    train_data_loader,
    initial=train_data_loader.iter_start,
    total=len(train_data_loader),
)
# Number of batches in the loader's [iter_start, iter_end) window, attached to the bar object.
data_iter.iter_size = train_data_loader.iter_end - train_data_loader.iter_start

# Run a single training epoch directly, bypassing trainer.fit.
epoch_train_history = trainer.train_epoch(
    model=transformer,
    data_loader=data_iter,
    criterions=criterions,
    criterion_weights=criterion_weights,
    optimizer=optimizer,
    device=device,
    amp=amp,
    scaler=scaler,
    save_per_batch=save_per_batch,
    verbose_per_batch=verbose_per_batch,
)

### trainer.iteration

In [8]:
# Run one training iteration (train=True) on the first batch only, then stop.
for batch_idx, batch in enumerate(train_data_loader, start=1):
    # Convert every field of each batch part to a tensor on `device`.
    batch = [
        {key: trainer.convert_to_tensor(data=value, device=device) for key, value in part.items()}
        for part in batch
    ]

    loss_dict, acc_dict = trainer.iteration(
        model=transformer,
        batch=batch,
        criterions=criterions,
        criterion_weights=criterion_weights,
        optimizer=optimizer,
        train=True,
        amp=amp,
        scaler=scaler,
    )

    print(loss_dict)
    print(acc_dict)
    break
    

NameError: name 'train_data_loader' is not defined

### trainer.iteration & data_loader.collate_fn

In [None]:
# Take `batch_size` consecutive rows from ONE iterator over the dataset. Asking the
# dataset for a new iterator per row (as this cell used to) restarts the iteration each
# time whenever __iter__ returns a fresh iterator, yielding the same first row repeatedly.
row_iterator = iter(train_data_loader.dataset)
_batch = [next(row_iterator) for _ in range(batch_size)]
batch_idx = 1

# Collate the raw rows by hand, then convert every field of each batch part to a
# tensor on `device`, mirroring what the trainer.iteration cell does with loader output.
batch = train_data_loader.collate_fn(batch=_batch)
batch = [
    {key: trainer.convert_to_tensor(data=value, device=device) for key, value in part.items()}
    for part in batch
]

loss_dict, acc_dict = trainer.iteration(
    model=transformer,
    batch=batch,
    criterions=criterions,
    criterion_weights=criterion_weights,
    optimizer=optimizer,
    train=True,
    amp=amp,
    scaler=scaler,
)

## Inference Test

In [None]:
# Re-create both loaders with batch_size=1 so each batch carries a single dialog.
# NOTE(review): this rebinds `train_data_loader` / `val_data_loader`; training cells
# re-run after this one will use the batch_size=1 loaders unless the
# "Load Dataset & DataLoader" cell is run again first.
train_data_loader_params = trainer.get_data_loader_params(
    dataset=train_dataset,
    preprocessor=preprocessor,
    batch_size=1,
    device=device,
    nprocs=nprocs,
    **config["model"],
    **config["data_loader"]
)
train_data_loader = trainer.create_data_loader(**train_data_loader_params)

val_data_loader_params = trainer.get_data_loader_params(
    dataset=val_dataset,
    preprocessor=preprocessor,
    batch_size=1,
    device=device,
    nprocs=nprocs,
    **config["model"],
    **config["data_loader"]
)
val_data_loader = trainer.create_data_loader(**val_data_loader_params)

In [None]:
import re
def loader_iter(data_loader):
    """Yield the batches of ``data_loader`` one by one, in iteration order."""
    yield from data_loader

# One generator per split so batches can be pulled on demand with next().
train_gen = loader_iter(train_data_loader)
val_gen = loader_iter(val_data_loader)

transformer.eval()

# Special-token ids needed by the inference methods; the "speaker_1" token is used
# as the begin-of-sequence token on the target side.
src_special_tokens = preprocessor.src_spm_tokenizer.special_token_dict
tgt_special_tokens = preprocessor.tgt_spm_tokenizer.special_token_dict
src_pad_token_id = src_special_tokens["pad"]["id"]
tgt_pad_token_id = tgt_special_tokens["pad"]["id"]
tgt_bos_token_id = tgt_special_tokens["speaker_1"]["id"]
tgt_eos_token_id = tgt_special_tokens["eos"]["id"]

In [None]:
gen = train_gen
src_inputs = {"token": []}
# Pull batches until one has a non-empty source side.
while len(src_inputs["token"]) < 1:
    batch = next(gen)
    src_inputs, tgt_inputs, tgt_outputs = batch

# Arguments shared by all three decoding strategies.
inference_kwargs = {
    "src_inputs": src_inputs,
    "src_pad_token_id": src_pad_token_id,
    "tgt_pad_token_id": tgt_pad_token_id,
    "tgt_bos_token_id": tgt_bos_token_id,
    "tgt_eos_token_id": tgt_eos_token_id,
}

context = preprocessor.src_decode(src_inputs["token"])[0]

# Decoding needs no gradients; disabling autograd avoids building graphs for every
# generated token. (Harmless if the inference_* methods already do this internally.)
with torch.no_grad():
    greedy_prediction = transformer.inference_greedy(**inference_kwargs)
    greedy_reply = preprocessor.tgt_decode(greedy_prediction)[0]

    # Beam-search and sampling probabilities get separate names so neither overwrites the other.
    beam_prediction, beam_probs = transformer.inference_beam_search(**inference_kwargs)
    beam_replies = preprocessor.tgt_decode(beam_prediction)

    sampling_prediction, sampling_probs = transformer.inference_random_sampling(num_samples=5, temperature=0.7, **inference_kwargs)
    sampling_replies = preprocessor.tgt_decode(sampling_prediction)

# re.split with a capturing group keeps the speaker tags, so after dropping the text
# before the first tag the list alternates: tag, utterance, tag, utterance, ...
ctxt_list = re.split("(<spk1>|<spk2>)", context)[1:]
for speaker_tag, utterance in zip(ctxt_list[0::2], ctxt_list[1::2]):
    print("{}: {}".format(speaker_tag, utterance))

# Model replies per decoding strategy (only the first sampled reply is shown),
# followed by the reference answer.
print("{}: {}".format("<spk1>(greedy)", greedy_reply))
for beam_reply in beam_replies:
    print("{}: {}".format("<spk1>(beam)", beam_reply))
print("{}: {}".format("<spk1>(sampling)", sampling_replies[0]))
print("({}: {})".format("ans", preprocessor.tgt_decode(tgt_outputs["lm"])[0]))

## Service Test

In [None]:
# Directory from which the Service Test cells below load the model.
# NOTE(review): "epoch_13/" is hard-coded and the save call below is commented out, so the
# directory presumably has to exist already from an earlier training run — confirm before running.
_model_dir = trainer.temp_dir + "epoch_13/"
# trainer.save(path=_model_dir, model=transformer, optimizer=optimizer, history=None, config=config, preprocessor=preprocessor, save_model_hyperparams=True, save_optimizer_hyperparams=False, ddp=False)

In [None]:
# Create the dialog-generation service object and load the model stored in `_model_dir`.
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell.
from transformer.services.dialog_generator.transformer import DialogGenerator
dg = DialogGenerator(temp_dir="./")
dg.load_model(model_dir=_model_dir)

In [None]:
# Dialog history as (speaker_id, utterance) pairs in conversation order.
dialog_turns = [
    (1, "여기 있는 사람들이 다 롤러코스터 타려고 기다리는 사람들이야?"),
    (0, "그런 것 같아요."),
    (1, "얼마나 기다려야 할까?"),
    (0, "최소한 한 시간 반 정도 기다려야 될 것 같아요."),
    (1, "한 시간 반?"),
    (1, "줄이 너무 기니까 우리 다른 것부터 탈까?"),
]
utterances = [utterance for _, utterance in dialog_turns]
speaker_ids = [speaker_id for speaker_id, _ in dialog_turns]
conditions = None  # or a list of condition sentences, e.g. ["condition 문장입니다."]

# Decoding options passed to the greedy / beam-search service calls below.
beam_size, min_length = 5, 5
lp_alpha, lp_min_length = 1.2, 5
return_probs = False
max_retry = 5

In [None]:
# greedy: ask the service for the next utterance of the dialog defined above,
# passing `max_retry` through to the service call.
dg.infer_next_utterance_greedy(utterances=utterances, speaker_ids=speaker_ids, conditions=conditions, max_retry=max_retry)

In [None]:
# beam_search: same dialog as the greedy call, decoded with the beam-search options
# (beam size, minimum length, lp_alpha / lp_min_length) set in the parameter cell.
dg.infer_next_utterance_beam_search(
    utterances=utterances,
    speaker_ids=speaker_ids,
    conditions=conditions,
    beam_size=beam_size,
    min_length=min_length,
    lp_alpha=lp_alpha,
    lp_min_length=lp_min_length,
    return_probs=return_probs,
    max_retry=max_retry,
)