In [1]:
from setproctitle import setproctitle
# Give the kernel process a recognisable title so it can be told apart in process listings.
setproctitle("Hodong_Transformer")

In [2]:
import os
import json
import pickle
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

from transformer.preprocessors.blender_bot_preprocessor import GeneratorPretrainingPreprocessor
from transformer.data.dataset import DatasetInterface, DatasetFromDir
from transformer.data.blender_bot_data_loader import GeneratorPretrainingDataLoader
from transformer.models.transformer import Transformer
from transformer.trainers.blender_bot_trainer import GeneratorPretrainingTransformerTrainer
from transformer.trainers.utils import *



## Set Dataset Root Directory

In [3]:
# # AIBUD_DEV
# dataset_dir = "/Users/aibud_dev/_jupyter"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

# # Picas_Server
# dataset_dir = "/home/picas/_jupyter"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

# Korea_Server is the default dataset root. Set the DATASET_DIR environment
# variable to run the notebook on another machine without editing this cell.
dataset_dir = os.environ.get("DATASET_DIR", "/home/mnt/guest1")

# File-path configuration shipped with the repository (relative to the working directory).
path = "./config/file_path.json"
file_path = None
with open(path, "r", encoding="utf-8") as fp:
    file_path = json.load(fp)

# # bigshane_local
# dataset_dir = "D:\_jupyter"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

# # AWS
# dataset_dir = "/home/ubuntu/data"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

## Load Configuration

In [4]:
# Read the run configuration (data / model / criterion / optimizer sections are used below).
with open(
    "./scripts/transformer/config/generator_finetuning_korea.json",
    "r",
    encoding="utf-8",
) as fp:
    config = json.loads(fp.read())

## Load Preprocessor

In [5]:
# spm tokenizer model paths: resolved under the dataset root from the language
# codes and vocabulary sizes given in the config.
src_spm_model_path = dataset_dir + "/spm_model/{language}/spoken_pretrain_spm_v{vocab_size}".format(
    language=config["data"]["src_language"],
    vocab_size=config["model"]["src_vocab_size"],
)
tgt_spm_model_path = dataset_dir + "/spm_model/{language}/spoken_pretrain_spm_v{vocab_size}".format(
    language=config["data"]["tgt_language"],
    vocab_size=config["model"]["tgt_vocab_size"],
)
# Alternative kept for reference: take the path templates from the config instead.
# src_spm_model_path = config["data"]["src_spm_model_path"].format(root_dir=config["data"]["root_dir"], language=config["data"]["src_language"], vocab_size=config["model"]["src_vocab_size"])
# tgt_spm_model_path = config["data"]["tgt_spm_model_path"].format(root_dir=config["data"]["root_dir"], language=config["data"]["tgt_language"], vocab_size=config["model"]["tgt_vocab_size"])

# The preprocessor owns the source and target spm tokenizers used throughout the notebook.
preprocessor = GeneratorPretrainingPreprocessor(
    src_language=config["data"]["src_language"],
    tgt_language=config["data"]["tgt_language"],
    src_spm_model_path=src_spm_model_path,
    tgt_spm_model_path=tgt_spm_model_path,
    embedding_dict=config["model"]["embedding_dict"],
)

Imported konlpy.tag.Mecab successfully
loaded spm_model: '/Users/aibud_dev/_jupyter/spm_model/kor/spoken_pretrain_spm_v30000/'


## Set Trainer

In [6]:
# The trainer saves models written during training under this temp directory.
trainer = GeneratorPretrainingTransformerTrainer(
    temp_dir=dataset_dir + "/model/temp/",
)
# Alternatives kept for reference:
# trainer = GeneratorPretrainingTransformerTrainer(temp_dir=config["train"]["temp_save_path"])
# trainer.set_lr_update(initial_learning_rate=config["optimizer"]["initial_learning_rate"], num_warmup_steps=config["train"]["num_warmup_steps"])

'temp_dir' has been set to '/Users/aibud_dev/_jupyter/model/temp/20210827_174129/' to save model while training


## Single-GPU Training

### Build Transformer

In [7]:
# Build the model from the "model" section of the config; the pad ids come from
# the source and target tokenizers so padding is consistent with preprocessing.
transformer = Transformer(
    src_pad_token_id=preprocessor.src_spm_tokenizer.special_token_dict["pad"]["id"],
    tgt_pad_token_id=preprocessor.tgt_spm_tokenizer.special_token_dict["pad"]["id"],
    **config["model"],
)

In [8]:
# Directory of the checkpoint whose weights are loaded into the freshly built model.
model_dir = dataset_dir + "/model/transformer/dialog_pretrain/20210821/epoch_100/"
# model_dir = dataset_dir + "/model/temp/20210826_104849/epoch_140/"

transformer = load_state_dict(
    object=transformer,
    path=model_dir + ModelFilenameConstants.MODEL_STATE_DICT_FILENAME,
)

### Set criterions & optimizer

In [9]:
# Override the learning rate from the config file for this run (mutates `config` in place).
config["optimizer"]["initial_learning_rate"] = 1e-4

criterions, criterion_weights = trainer.get_criterions(
    tgt_timesteps=config["model"]["tgt_timesteps"],
    tgt_vocab_size=config["model"]["tgt_vocab_size"],
    tgt_pad_token_id=preprocessor.tgt_spm_tokenizer.special_token_dict["pad"]["id"],
    **config["criterion"],
)
optimizer = trainer.get_optimizer(
    model=transformer,
    **config["optimizer"],
)

### Set Device

In [10]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Hand the model, optimizer and criterions to the trainer's device helper in turn.
transformer = GeneratorPretrainingTransformerTrainer.set_device(
    obj=transformer,
    device=device,
)
optimizer = GeneratorPretrainingTransformerTrainer.set_device(
    obj=optimizer,
    device=device,
)
criterions = GeneratorPretrainingTransformerTrainer.set_device(
    obj=criterions,
    device=device,
)

Setting model device: cpu
Setting criterions device: cpu


## Load Dataset & DataLoader

In [1]:
# data_loader_params
batch_size = 4 # 96
nprocs = 1

# Every split lives under one preprocessed-data root; the split directories are derived from it.
total_data_dir = dataset_dir + "/dataset/preprocessed/dialog_finetuning/kor/multi_turn_v1/"
sample_data_dir = total_data_dir + "sample/"
train_data_dir = total_data_dir + "train/"
val_data_dir = total_data_dir + "val/"

# Training split: dataset -> loader parameters -> loader.
train_dataset = DatasetFromDir(
    data_dir=train_data_dir,
    batch_size=batch_size,
    device=device,
    nprocs=nprocs,
    encoding=config["data"]["encoding"],
    extension=config["data"]["extension"],
)
train_data_loader_params = trainer.get_data_loader_params(
    dataset=train_dataset,
    preprocessor=preprocessor,
    batch_size=batch_size,
    device=device,
    nprocs=nprocs,
    **config["model"],
    **config["data_loader"],
)
train_data_loader = trainer.create_data_loader(**train_data_loader_params)

# Validation split: built the same way as the training split.
val_dataset = DatasetFromDir(
    data_dir=val_data_dir,
    batch_size=batch_size,
    device=device,
    nprocs=nprocs,
    encoding=config["data"]["encoding"],
    extension=config["data"]["extension"],
)
val_data_loader_params = trainer.get_data_loader_params(
    dataset=val_dataset,
    preprocessor=preprocessor,
    batch_size=batch_size,
    device=device,
    nprocs=nprocs,
    **config["model"],
    **config["data_loader"],
)
val_data_loader = trainer.create_data_loader(**val_data_loader_params)

# Disabled: previous-token distribution extraction over the validation utterances.
# ngram = 5
# utterances = [utterance for row in val_dataset.get_all_data() for utterance in row["utterances"]]
# target_prev_token_distribution, special_token_ids = preprocessor.extract_prev_token_distribution(sentences=utterances, ngram=ngram)
# trainer.set_prev_token_distribution(prev_token_distribution=target_prev_token_distribution, special_token_ids=special_token_ids)

NameError: name 'dataset_dir' is not defined

### Dataset summary

In [12]:
# train_data_loader.summary(show_sample=True)

### DataLoader encode test

In [13]:
# row_idx = 2
# src_inputs, tgt_inputs, tgt_outputs = train_data_loader.get_batch()

# print("src_input_token:\t", [token_idx for token_idx in range(0, len(src_inputs["token"][row_idx])) if token_idx==0 or src_inputs["token"][row_idx][token_idx]==preprocessor.src_spm_tokenizer.special_token_dict["sep"]["id"]])
# print("src_input_segment:\t", [token_idx for token_idx in range(0, len(src_inputs["segment"][row_idx])-1) if token_idx==0 or src_inputs["segment"][row_idx][token_idx]!=src_inputs["segment"][row_idx][token_idx+1]])
# # print("src_input_turn:\t", [token_idx for token_idx in range(0, len(src_inputs["turn"][row_idx])-1) if token_idx==0 or src_inputs["turn"][row_idx][token_idx]!=src_inputs["turn"][row_idx][token_idx+1]])

## Train test

In [14]:
# Number of epochs passed to trainer.fit.
epoch = 3

# Mixed-precision switch; a gradient scaler is only created when it is on.
amp = True
scaler = torch.cuda.amp.GradScaler() if amp else None

# Saving cadence handed to the trainer.
# NOTE(review): -1 for save_per_batch presumably disables per-batch saving — confirm in the trainer.
save_per_epoch = 1
save_per_batch = -1
keep_last = True

# Logging cadence handed to the trainer.
verbose_per_epoch = 1
verbose_per_batch = 100



### trainer.fit

In [3]:
# Full training run. No validation loader is passed here (val_data_loader=None).
history = trainer.fit(
    model=transformer,
    train_data_loader=train_data_loader,
    val_data_loader=None,
    criterions=criterions,
    criterion_weights=criterion_weights,
    optimizer=optimizer,
    device=device,
    epoch=epoch,
    amp=amp,
    save_per_epoch=save_per_epoch,
    save_per_batch=save_per_batch,
    keep_last=keep_last,
    verbose_per_epoch=verbose_per_epoch,
    verbose_per_batch=verbose_per_batch,
)

### trainer.train_epoch

In [None]:
# Wrap the loader in a progress bar that starts at the loader's own start offset.
data_iter = tqdm(
    train_data_loader,
    initial=train_data_loader.iter_start,
    total=len(train_data_loader),
)
# The trainer reads the number of iterations from this attribute on the wrapped iterator.
# NOTE(review): presumed from the attribute being set right before train_epoch — confirm.
data_iter.iter_size = train_data_loader.iter_end - train_data_loader.iter_start

# Run exactly one training epoch over the wrapped loader.
epoch_train_history = trainer.train_epoch(
    model=transformer,
    data_loader=data_iter,
    criterions=criterions,
    criterion_weights=criterion_weights,
    optimizer=optimizer,
    device=device,
    amp=amp,
    scaler=scaler,
    save_per_batch=save_per_batch,
    verbose_per_batch=verbose_per_batch,
)

### trainer.iteration

In [8]:
# Smoke-test a single training step: take the first batch, run one iteration, then stop.
for batch_idx, batch in enumerate(train_data_loader, start=1):
    # Convert every field of every part of the batch to a tensor on the target device.
    batch = [
        {name: trainer.convert_to_tensor(data=value, device=device) for name, value in part.items()}
        for part in batch
    ]

    loss_dict, acc_dict = trainer.iteration(
        model=transformer,
        batch=batch,
        criterions=criterions,
        criterion_weights=criterion_weights,
        optimizer=optimizer,
        train=True,
        amp=amp,
        scaler=scaler,
    )

    print(loss_dict)
    print(acc_dict)
    break
    

NameError: name 'train_data_loader' is not defined

### trainer.iteration & data_loader.collate_fn

In [16]:
# Draw `batch_size` consecutive rows from ONE dataset iterator. Creating a new
# iterator for every row (calling `__iter__()` inside the comprehension) returns
# the first row again each time whenever the dataset restarts on `__iter__`.
dataset_iter = iter(train_data_loader.dataset)
_batch = [next(dataset_iter) for _ in range(batch_size)]
batch_idx = 1

# Collate the raw rows by hand, exactly as the data loader would.
batch = train_data_loader.collate_fn(batch=_batch)
# Convert every field of every part of the batch to a tensor on the target device.
batch = [
    {name: trainer.convert_to_tensor(data=value, device=device) for name, value in part.items()}
    for part in batch
]

# Run one training iteration on the hand-built batch.
loss_dict, acc_dict = trainer.iteration(
    model=transformer,
    batch=batch,
    criterions=criterions,
    criterion_weights=criterion_weights,
    optimizer=optimizer,
    train=True,
    amp=amp,
    scaler=scaler,
)

RuntimeError: Expected object of scalar type Long but got scalar type Int for argument #2 'target' in call to _thnn_nll_loss_forward

## Inference Test

In [None]:
# Rebuild both loaders with batch_size=1 so each batch is a single example for inference.
# Note: this rebinds train_data_loader / val_data_loader created earlier.
train_data_loader_params = trainer.get_data_loader_params(
    dataset=train_dataset,
    preprocessor=preprocessor,
    batch_size=1,
    device=device,
    nprocs=nprocs,
    **config["model"],
    **config["data_loader"],
)
train_data_loader = trainer.create_data_loader(**train_data_loader_params)

val_data_loader_params = trainer.get_data_loader_params(
    dataset=val_dataset,
    preprocessor=preprocessor,
    batch_size=1,
    device=device,
    nprocs=nprocs,
    **config["model"],
    **config["data_loader"],
)
val_data_loader = trainer.create_data_loader(**val_data_loader_params)

In [None]:
import re
def loader_iter(data_loader):
    """Lazily yield the batches of ``data_loader`` one at a time.

    Wrapping the loader in a generator lets callers pull single batches with
    ``next()`` across notebook cells.
    """
    yield from data_loader

# Inference from here on: switch the model to evaluation mode.
transformer.eval()

# One lazy batch generator per split.
train_gen = loader_iter(train_data_loader)
val_gen = loader_iter(val_data_loader)

# Special-token ids passed to the decoding routines below.
# Decoding starts from the "speaker_1" token rather than a dedicated BOS token.
tgt_bos_token_id = preprocessor.tgt_spm_tokenizer.special_token_dict["speaker_1"]["id"]
tgt_eos_token_id = preprocessor.tgt_spm_tokenizer.special_token_dict["eos"]["id"]
src_pad_token_id = preprocessor.src_spm_tokenizer.special_token_dict["pad"]["id"]
tgt_pad_token_id = preprocessor.tgt_spm_tokenizer.special_token_dict["pad"]["id"]

In [None]:
# Pull batches from the chosen generator until one with a non-empty source arrives.
gen = train_gen
src_inputs = {"token":[]}
while len(src_inputs["token"]) == 0:
    batch = next(gen)
    src_inputs, tgt_inputs, tgt_outputs = batch

# Decode the dialogue context back to text.
context = preprocessor.src_decode(src_inputs["token"])[0]

# Greedy decoding: a single reply.
greedy_prediction = transformer.inference_greedy(
    src_inputs=src_inputs,
    src_pad_token_id=src_pad_token_id,
    tgt_pad_token_id=tgt_pad_token_id,
    tgt_bos_token_id=tgt_bos_token_id,
    tgt_eos_token_id=tgt_eos_token_id,
)
greedy_reply = preprocessor.tgt_decode(greedy_prediction)[0]

# Beam search: every returned hypothesis is decoded.
beam_prediction, probs = transformer.inference_beam_search(
    src_inputs=src_inputs,
    src_pad_token_id=src_pad_token_id,
    tgt_pad_token_id=tgt_pad_token_id,
    tgt_bos_token_id=tgt_bos_token_id,
    tgt_eos_token_id=tgt_eos_token_id,
)
beam_replies = preprocessor.tgt_decode(beam_prediction)

# Random sampling: `probs` from beam search is overwritten here.
sampling_prediction, probs = transformer.inference_random_sampling(
    src_inputs=src_inputs,
    src_pad_token_id=src_pad_token_id,
    tgt_pad_token_id=tgt_pad_token_id,
    tgt_bos_token_id=tgt_bos_token_id,
    tgt_eos_token_id=tgt_eos_token_id,
    num_samples=5,
    temperature=0.7,
)
sampling_replies = preprocessor.tgt_decode(sampling_prediction)

# Splitting on a captured speaker tag yields [prefix, tag, text, tag, text, ...];
# dropping the prefix leaves alternating (tag, text) pairs.
ctxt_list = re.split("(<spk1>|<spk2>)", context)[1:]
for speaker_tag, utterance in zip(ctxt_list[0::2], ctxt_list[1::2]):
    print("{}: {}".format(speaker_tag, utterance))

print("{}: {}".format("<spk1>(greedy)", greedy_reply))
for beam_reply in beam_replies:
    print("{}: {}".format("<spk1>(beam)", beam_reply))
print("{}: {}".format("<spk1>(sampling)", sampling_replies[0]))
# Reference reply decoded from the target outputs, for comparison.
print("({}: {})".format("ans", preprocessor.tgt_decode(tgt_outputs["lm"])[0]))

## Service Test

In [None]:
# Checkpoint directory that DialogGenerator.load_model reads in the next cell.
# NOTE(review): the trainer.save call below is commented out, so this directory
# must already exist from an earlier run — confirm before running the service test.
_model_dir = trainer.temp_dir + "epoch_13/"
# trainer.save(path=_model_dir, model=transformer, optimizer=optimizer, history=None, config=config, preprocessor=preprocessor, save_model_hyperparams=True, save_optimizer_hyperparams=False, ddp=False)

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from transformer.services.dialog_generator.transformer import DialogGenerator
# Service wrapper under test; it is given the current directory as its temp directory.
dg = DialogGenerator(temp_dir="./")
# Load the model from the checkpoint directory chosen in the previous cell.
dg.load_model(model_dir=_model_dir)

In [None]:
# Dialogue history as (speaker id, utterance) turns; keeping each pair together
# guarantees the two lists passed to the service stay aligned.
dialog_turns = [
    (1, "여기 있는 사람들이 다 롤러코스터 타려고 기다리는 사람들이야?"),
    (0, "그런 것 같아요."),
    (1, "얼마나 기다려야 할까?"),
    (0, "최소한 한 시간 반 정도 기다려야 될 것 같아요."),
    (1, "한 시간 반?"),
    (1, "줄이 너무 기니까 우리 다른 것부터 탈까?"),
]
utterances = [text for _, text in dialog_turns]
speaker_ids = [speaker for speaker, _ in dialog_turns]

# Optional conditioning sentences; None disables them (otherwise a list of sentences).
conditions = None

# Beam-search settings.
beam_size = 5
min_length = 5
lp_alpha = 1.2
lp_min_length = 5
return_probs = False

# Retry budget passed to both inference calls.
max_retry = 5

In [None]:
# Greedy decoding of the next utterance for the dialogue defined above.
dg.infer_next_utterance_greedy(
    utterances=utterances,
    speaker_ids=speaker_ids,
    conditions=conditions,
    max_retry=max_retry,
)

In [None]:
# Beam-search decoding of the next utterance, using the settings from the inputs cell.
dg.infer_next_utterance_beam_search(
    utterances=utterances,
    speaker_ids=speaker_ids,
    conditions=conditions,
    beam_size=beam_size,
    min_length=min_length,
    lp_alpha=lp_alpha,
    lp_min_length=lp_min_length,
    return_probs=return_probs,
    max_retry=max_retry,
)