In [1]:
from setproctitle import setproctitle

# Rename this kernel's process so it can be told apart from other jobs on the machine.
PROCESS_TITLE = "Hodong_PolyEncoder"
setproctitle(PROCESS_TITLE)

In [2]:
import os
import json
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, IterableDataset, DataLoader

from transformer.assertions.object_assertion import DataAssertion
from transformer.utils.tokenizer import MecabTokenizer, SpmTokenizer
from transformer.data.dataset import DatasetInterface, DatasetFromDir
from transformer.models.bert import Bert
from transformer.models.poly_encoder import PolyEncoder

from transformer.preprocessors.blender_bot_preprocessor import RetrieverFinetuningPreprocessor
from transformer.data.blender_bot_data_loader import RetrieverFinetuningDataLoader
from transformer.trainers.bert_trainer import BertTrainer
from transformer.trainers.blender_bot_trainer import RetrieverEncoderBertTrainer, RetrieverFinetuningPolyEncoderTrainer
from transformer.trainers.utils import *



## Set Directories

In [3]:
# Root data directory of every machine this notebook has been run on.
# Select one by editing DEFAULT_ENVIRONMENT, or by exporting POLY_ENCODER_ENV
# before the kernel starts, instead of commenting stanzas in and out.
DATASET_DIR_BY_ENVIRONMENT = {
    "aibud_dev": "/Users/aibud_dev/_jupyter",
    "picas_server": "/home/picas/_jupyter",
    "korea_server": "/home/mnt/guest1",
    # Raw string: "\_" is not a valid escape sequence in a regular string literal.
    "bigshane_local": r"D:\_jupyter",
}
DEFAULT_ENVIRONMENT = "aibud_dev"

environment = os.environ.get("POLY_ENCODER_ENV", DEFAULT_ENVIRONMENT)
if environment not in DATASET_DIR_BY_ENVIRONMENT:
    raise ValueError(
        "Unknown environment {!r}; expected one of {}".format(
            environment, sorted(DATASET_DIR_BY_ENVIRONMENT)
        )
    )

# Prefix used by the dataset / model paths built later in the notebook.
dataset_dir = DATASET_DIR_BY_ENVIRONMENT[environment]

# The path config sits at the same relative location on every machine.
path = "./config/file_path.json"
with open(path, "r", encoding="utf-8") as fp:
    file_path = json.load(fp)

## Load Configuration

In [4]:
# poly_encoder config: the fine-tuning settings used throughout the notebook.
poly_encoder_config_path = "./scripts/poly_encoder/config/retriever_finetuning_korea.json"
with open(poly_encoder_config_path, "r", encoding="utf-8") as fp:
    config = json.load(fp)

# encoder config: every artefact of the pretrained BERT encoder lives under one directory,
# so each path is that directory plus the file name constant for the artefact.
encoder_model_dir = "{}/model/bert/dialog_pretrain/20210722/".format(dataset_dir)
(
    encoder_config_path,
    encoder_model_state_dict_path,
    encoder_optimizer_state_dict_path,
    encoder_spm_model_path,
) = (
    encoder_model_dir + artefact_name
    for artefact_name in (
        ModelFilenameConstants.TRAIN_CONFIG_FILENAME,
        ModelFilenameConstants.MODEL_STATE_DICT_FILENAME,
        ModelFilenameConstants.OPTIMIZER_STATE_DICT_FILENAME,
        ModelFilenameConstants.SPM_MODEL_DIR,
    )
)

# Stays None if the training config below cannot be opened.
encoder_config = None
with open(encoder_config_path, "r", encoding="utf-8") as fp:
    encoder_config = json.load(fp)

## Load Preprocessor

In [5]:
# The SentencePiece model is chosen by data language and by the encoder's vocabulary size.
data_language = config["data"]["language"]
encoder_vocab_size = encoder_config["model"]["vocab_size"]
spm_model_path = dataset_dir + "/spm_model/{language}/spoken_pretrain_spm_v{vocab_size}".format(
    language=data_language,
    vocab_size=encoder_vocab_size,
)
# Alternative: take the path template from the config instead of `dataset_dir`.
# spm_model_path = config["data"]["spm_model_path"].format(root_dir=config["data"]["root_dir"], language=config["data"]["language"], vocab_size=encoder_config["model"]["vocab_size"])

# The preprocessor reuses the encoder's embedding_dict setting.
preprocessor = RetrieverFinetuningPreprocessor(
    language=data_language,
    spm_model_path=spm_model_path,
    embedding_dict=encoder_config["model"]["embedding_dict"],
)

Imported konlpy.tag.Mecab successfully
loaded spm_model: '/Users/aibud_dev/_jupyter/spm_model/kor/spoken_pretrain_spm_v15000/'


## Set Trainer

In [6]:
# Directory handed to the trainer as `temp_dir`.
trainer_temp_dir = dataset_dir + "/model/temp/"
# Alternative: take the directory from the config.
# trainer = RetrieverFinetuningPolyEncoderTrainer(temp_dir=config["train"]["temp_save_path"])
trainer = RetrieverFinetuningPolyEncoderTrainer(temp_dir=trainer_temp_dir)

# Learning-rate schedule: initial rate from the optimizer section, warm-up steps from the train section.
trainer.set_lr_update(
    initial_learning_rate=config["optimizer"]["initial_learning_rate"],
    num_warmup_steps=config["train"]["num_warmup_steps"],
)

'temp_dir' has been set to '/Users/aibud_dev/_jupyter/model/temp/20210826_193041/' to save model while training
LearningRate schedule has been set to 'transformer_lambda'


## Single-GPU Training

### Build PolyEncoder

- Initial Build

In [7]:
# # context_encoder
# context_encoder = Bert(pad_token_id=preprocessor.spm_tokenizer.special_token_dict["pad"]["id"], **encoder_config["model"])
# context_encoder = load_state_dict(object=context_encoder, path=encoder_model_state_dict_path)
# # candidate_encoder
# candidate_encoder = Bert(pad_token_id=preprocessor.spm_tokenizer.special_token_dict["pad"]["id"], **encoder_config["model"])
# candidate_encoder = load_state_dict(object=candidate_encoder, path=encoder_model_state_dict_path)
# # poly_encoder
# poly_encoder = PolyEncoder(context_encoder=context_encoder, candidate_encoder=candidate_encoder, **config["model"])

- Load

In [8]:
model_dir = dataset_dir + "/model/poly_encoder/dialog_retriever/20210803/"
# Both encoders are created from the same pretrained BERT directory.
pretrained_bert_dir = dataset_dir + "/model/bert/dialog_pretrain/20210722/"

# context_encoder
context_encoder_model_path = pretrained_bert_dir
# context_encoder_model_path = model_dir + "context_encoder/"
# context_encoder_model_path = config["model"]["context_encoder"]["model_path"]
context_encoder = trainer.create_encoder(
    model_type=config["model"]["context_encoder"]["model_type"],
    encoder_model_path=context_encoder_model_path,
)

# candidate_encoder
candidate_encoder_model_path = pretrained_bert_dir
# candidate_encoder_model_path = model_dir + "candidate_encoder/"
# candidate_encoder_model_path = config["model"]["candidate_encoder"]["model_path"]
candidate_encoder = trainer.create_encoder(
    model_type=config["model"]["candidate_encoder"]["model_type"],
    encoder_model_path=candidate_encoder_model_path,
)

# poly_encoder: combines the two encoders with the configured m_code and aggregation method.
poly_encoder = trainer.create_model(
    context_encoder=context_encoder,
    candidate_encoder=candidate_encoder,
    m_code=config["model"]["m_code"],
    aggregation_method=config["model"]["aggregation_method"],
)

- load model_state_dict

In [10]:
# Load the saved poly-encoder state dict found under `model_dir` into the freshly built model.
poly_encoder_state_dict_path = model_dir + ModelFilenameConstants.MODEL_STATE_DICT_FILENAME
poly_encoder = load_state_dict(object=poly_encoder, path=poly_encoder_state_dict_path)

### Set criterions & optimizer

In [11]:
# Loss functions and their weights come from the "criterion" config section.
criterion_kwargs = config["criterion"]
criterions, criterion_weights = trainer.get_criterions(**criterion_kwargs)

# The optimizer is built for the poly-encoder from the "optimizer" config section.
optimizer_kwargs = config["optimizer"]
optimizer = trainer.get_optimizer(model=poly_encoder, **optimizer_kwargs)

### Set Device

In [12]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Pass model, optimizer and criterions through the same device setter, in that order.
poly_encoder, optimizer, criterions = (
    BertTrainer.set_device(obj=training_object, device=device)
    for training_object in (poly_encoder, optimizer, criterions)
)

Setting model device: cpu
Setting criterions device: cpu


## Load Dataset & DataLoader

In [13]:
# data_loader_params
batch_size = 4
nprocs = 1

# Every split lives in its own sub-directory of the same preprocessed data root.
total_data_dir = dataset_dir + "/dataset/preprocessed/dialog_retriever/kor/multi_turn/"
sample_data_dir = total_data_dir + "sample/"
train_data_dir = total_data_dir + "train/"
val_data_dir = total_data_dir + "val/"


def build_data_loader(data_dir):
    """Create the dataset, loader parameters and data loader for one data directory.

    The notebook-level ``batch_size``, ``device``, ``nprocs``, ``preprocessor``,
    ``config`` and ``encoder_config`` are shared by every split, so only the
    directory varies between calls.

    Args:
        data_dir: Directory handed to ``DatasetFromDir``.

    Returns:
        Tuple ``(dataset, data_loader_params, data_loader)``.
    """
    dataset = DatasetFromDir(
        data_dir=data_dir,
        batch_size=batch_size,
        device=device,
        nprocs=nprocs,
        encoding=config["data"]["encoding"],
        extension=config["data"]["extension"],
    )
    data_loader_params = trainer.get_data_loader_params(
        dataset=dataset,
        preprocessor=preprocessor,
        batch_size=batch_size,
        device=device,
        nprocs=nprocs,
        **config["data_loader"],
        **encoder_config["model"],
    )
    data_loader = trainer.create_data_loader(**data_loader_params)
    return dataset, data_loader_params, data_loader


# NOTE(review): the training loader reads `sample_data_dir`; `train_data_dir` is defined
# but unused. Presumably a deliberate small-sample setup — confirm before a full run.
train_dataset, train_data_loader_params, train_data_loader = build_data_loader(sample_data_dir)
val_dataset, val_data_loader_params, val_data_loader = build_data_loader(val_data_dir)

### Dataset summary

In [14]:
# train_data_loader.summary(show_sample=True)

### DataLoader encode test

In [15]:
# row_idx = 2
# context_inputs, candidate_inputs, outputs = train_data_loader.get_batch()

# print("ctxt_token:\t", [token_idx for token_idx in range(0, len(context_inputs["token"][row_idx])) if token_idx==0 or context_inputs["token"][row_idx][token_idx]==preprocessor.spm_tokenizer.special_token_dict["sep"]["id"]])
# print("ctxt_segment:\t", [token_idx for token_idx in range(0, len(context_inputs["segment"][row_idx])-1) if token_idx==0 or context_inputs["segment"][row_idx][token_idx]!=context_inputs["segment"][row_idx][token_idx+1]])
# print("ctxt_turn:\t", [token_idx for token_idx in range(0, len(context_inputs["turn"][row_idx])-1) if token_idx==0 or context_inputs["turn"][row_idx][token_idx]!=context_inputs["turn"][row_idx][token_idx+1]])
# print()
# print("cdnd_token:\t", [token_idx for token_idx in range(0, len(candidate_inputs["token"][row_idx])) if token_idx==0 or candidate_inputs["token"][row_idx][token_idx]==preprocessor.spm_tokenizer.special_token_dict["sep"]["id"]])
# print("cdnd_segment:\t", [token_idx for token_idx in range(0, len(candidate_inputs["segment"][row_idx])-1) if token_idx==0 or candidate_inputs["segment"][row_idx][token_idx]!=candidate_inputs["segment"][row_idx][token_idx+1]])
# print("cdnd_turn:\t", [token_idx for token_idx in range(0, len(candidate_inputs["turn"][row_idx])-1) if token_idx==0 or candidate_inputs["turn"][row_idx][token_idx]!=candidate_inputs["turn"][row_idx][token_idx+1]])

# for ctxt_token, cdnd_token, ce_label in zip(preprocessor.decode(context_inputs["token"]), preprocessor.decode(candidate_inputs["token"]), outputs["ce"]):
#     print("ce_label:\t", ce_label)
#     print("ctxt_token:\t", ctxt_token)
#     print("cdnd_token:\t", cdnd_token)
#     print()

## Train Test

In [16]:
# Number of epochs and the mixed-precision switch.
epoch = 200
amp = True
# A gradient scaler is created only when mixed precision is switched on.
scaler = torch.cuda.amp.GradScaler() if amp else None

# Checkpointing cadence (-1 presumably disables the per-batch variant — confirm against the trainer).
save_per_epoch, save_per_batch = 1, -1
keep_last = False

# Logging cadence (same convention as the checkpoint settings).
verbose_per_epoch, verbose_per_batch = 1, -1



### trainer.fit

In [None]:
# Collect every argument of the full training run in one place, then launch it.
fit_kwargs = dict(
    model=poly_encoder,
    train_data_loader=train_data_loader,
    val_data_loader=val_data_loader,
    criterions=criterions,
    criterion_weights=criterion_weights,
    optimizer=optimizer,
    device=device,
    epoch=epoch,
    amp=amp,
    save_per_epoch=save_per_epoch,
    save_per_batch=save_per_batch,
    keep_last=keep_last,
    verbose_per_epoch=verbose_per_epoch,
    verbose_per_batch=verbose_per_batch,
)
history = trainer.fit(**fit_kwargs)

### trainer.train_epoch

In [None]:
# Wrap the training loader in a progress bar that starts at the loader's own start offset.
loader_iter_start = train_data_loader.iter_start
data_iter = tqdm(
    train_data_loader,
    initial=loader_iter_start,
    total=len(train_data_loader),
)
# Attach the span between the loader's end and start offsets to the progress bar.
data_iter.iter_size = train_data_loader.iter_end - loader_iter_start

# Run a single training epoch over the wrapped loader.
train_epoch_kwargs = dict(
    model=poly_encoder,
    data_loader=data_iter,
    criterions=criterions,
    criterion_weights=criterion_weights,
    optimizer=optimizer,
    device=device,
    amp=amp,
    scaler=scaler,
    save_per_batch=save_per_batch,
    verbose_per_batch=verbose_per_batch,
)
epoch_train_history = trainer.train_epoch(**train_epoch_kwargs)

### trainer.iteration

In [None]:
# Run exactly one training step on the first batch and show its losses and accuracies.
for batch_idx, batch in enumerate(train_data_loader, start=1):
    # Each element of the batch is a dict; convert every value to a tensor on `device`.
    batch = [
        {
            field: trainer.convert_to_tensor(data=values, device=device)
            for field, values in batch_part.items()
        }
        for batch_part in batch
    ]

    loss_dict, acc_dict = trainer.iteration(
        model=poly_encoder,
        batch=batch,
        criterions=criterions,
        criterion_weights=criterion_weights,
        optimizer=optimizer,
        train=True,
        amp=amp,
        scaler=scaler,
    )

    print(loss_dict)
    print(acc_dict)
    # Only the first batch is needed for this smoke test.
    break



## Service Test

In [None]:
# Directory of the saved poly-encoder used by the service test.
# Built from `dataset_dir`, like the other model paths in this notebook, instead of
# pinning the Korea server's absolute root, so it resolves on whichever machine is selected.
_model_dir = dataset_dir + "/model/poly_encoder/dialog_retriever/20210813/"
# Uncomment to export the current model to `_model_dir` first (the model object in this notebook is `poly_encoder`).
# trainer.save(path=_model_dir, model=poly_encoder, optimizer=optimizer, history=None, config=config, preprocessor=preprocessor, save_model_hyperparams=True, save_optimizer_hyperparams=False, ddp=False)

In [None]:
from transformer.services.dialog_retriever.poly_encoder import DialogRetriever

# Create the retrieval service with the current directory as its temp_dir,
# then load the saved model from `_model_dir`.
service_temp_dir = "./"
dr = DialogRetriever(temp_dir=service_temp_dir)
dr.load_model(model_dir=_model_dir)

In [None]:
# Sample dialog history as (speaker id, utterance) pairs, oldest turn first.
dialog_turns = [
    (1, "여기 있는 사람들이 다 롤러코스터 타려고 기다리는 사람들이야?"),
    (0, "그런 것 같아요."),
    (1, "얼마나 기다려야 할까?"),
    (0, "최소한 한 시간 반 정도 기다려야 될 것 같아요."),
    (1, "한 시간 반?"),
    (1, "줄이 너무 기니까 우리 다른 것부터 탈까?"),
]
# The retriever call takes the texts and the speaker ids as two parallel lists.
utterances = [text for _, text in dialog_turns]
speaker_ids = [speaker for speaker, _ in dialog_turns]

# Inference settings passed to the retriever.
top_n = 5
max_retry = 5

In [None]:
# greedy
# Ask the retriever for the next utterance given the sample dialog history.
inference_kwargs = dict(
    utterances=utterances,
    speaker_ids=speaker_ids,
    top_n=top_n,
    max_retry=max_retry,
)
output = dr.infer_next_utterance(**inference_kwargs)