In [None]:
import os
import json
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, IterableDataset, DataLoader

from transformer.assertions.object_assertion import DataAssertion
from transformer.utils.tokenizer import MecabTokenizer, SpmTokenizer
from transformer.preprocessor.bert_preprocessor import DialogPretrainPreprocessor
from transformer.preprocessor.sentence_bert_preprocessor import SentenceBertPreprocessor
from transformer.data.dataset import DatasetInterface, DatasetFromDir
from transformer.data.bert_data_loader import DialogRetrieverDataLoader
from transformer.layers.attention import MultiheadAttention, PositionwiseFeedForward, CodeAttention
from transformer.layers.embedding import EmbeddingAggregation
from transformer.layers.transformer import EncoderLayer, DecoderLayer
from transformer.layers.head import LanguageModelingHead, PolyEncoderHead, NextSentencePredictionHead
from transformer.layers.utils import get_pad_mask, get_sub_mask, dot_attention
from transformer.models.transformer import Encoder, Decoder, Transformer
from transformer.models.bert import Bert
from transformer.models.poly_encoder import PolyEncoder
from transformer.trainer.bert_trainer import BlenderBotDialogEncoderTrainer
from transformer.trainer.poly_encoder_trainer import BlenderBotDialogRetrieverTrainer
from transformer.trainer.utils import *

## Set Directories

In [None]:
# # AIBUD_DEV
# dataset_dir = "/Users/aibud_dev/_jupyter"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

# # Picas_Server
# dataset_dir = "/home/picas/_jupyter"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

# Korea_Server
# Root directory that holds the datasets and saved models. It defaults to the
# Korea_Server location; set the DATASET_DIR environment variable to run on a
# different machine without editing this cell.
dataset_dir = os.environ.get("DATASET_DIR", "/home/mnt/guest1")
path = "./config/file_path.json"
file_path = None
with open(path, "r", encoding="utf-8") as fp:
    file_path = json.load(fp)

# # bigshane_local
# dataset_dir = "D:\_jupyter"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

In [None]:
# bert_model_dir = dataset_dir + "/model/bert_dialog_pretrain/20210722/"
# bert_train_config_path = bert_model_dir + ModelFilenameConstants.TRAIN_CONFIG_FILENAME
# bert_model_state_dict_path = bert_model_dir + ModelFilenameConstants.MODEL_STATE_DICT_FILENAME
# bert_optimizer_state_dict_path = bert_model_dir + ModelFilenameConstants.OPTIMIZER_STATE_DICT_FILENAME
# bert_spm_model_path = bert_model_dir + ModelFilenameConstants.SPM_MODEL_DIR
# bert_history_path = bert_model_dir + ModelFilenameConstants.HISTORY_FILENAME

# # Load config
# bert_config = None
# with open(bert_train_config_path, "r", encoding="utf-8") as fp:
#     bert_config = json.load(fp)

In [None]:
# Model directory under dataset_dir; each artifact path below is model_dir
# followed by the corresponding ModelFilenameConstants name.
model_dir = f"{dataset_dir}/model/poly_encoder/dialog_retriever/20210803/"
train_config_path = f"{model_dir}{ModelFilenameConstants.TRAIN_CONFIG_FILENAME}"
model_hyperparams_path = f"{model_dir}{ModelFilenameConstants.MODEL_HYPERPARAMS_FILENAME}"
model_state_dict_path = f"{model_dir}{ModelFilenameConstants.MODEL_STATE_DICT_FILENAME}"
optimizer_state_dict_path = f"{model_dir}{ModelFilenameConstants.OPTIMIZER_STATE_DICT_FILENAME}"
spm_model_path = f"{model_dir}{ModelFilenameConstants.SPM_MODEL_DIR}"
history_path = f"{model_dir}{ModelFilenameConstants.HISTORY_FILENAME}"

# Hyperparameters saved with the model; later cells read the
# "context_encoder_hyperparams" entry from this mapping.
model_hyperparams = load_hyperparams(path=model_hyperparams_path)
# # Load config
# config = None
# with open(train_config_path, "r", encoding="utf-8") as fp:
#     config = json.load(fp)

## Settings

In [None]:
# Use the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Passed to the dataset and the data loader parameters below.
nprocs = 1
batch_size = 4

## Load Trainer & Preprocessor

In [1]:
# Trainer instance: later cells use it to build the data loader, load the
# model and convert between tensors and numpy arrays.
dialog_retriever_trainer = BlenderBotDialogRetrieverTrainer()

# Preprocessor configured from the saved spm model path and the context
# encoder's embedding_dict.
sentence_bert_prep = SentenceBertPreprocessor(
    language="kor",
    spm_model_path=spm_model_path,
    embedding_dict=model_hyperparams["context_encoder_hyperparams"]["embedding_dict"],
)

NameError: name 'BlenderBotDialogRetrieverTrainer' is not defined

## Load Dataset & DataLoader

In [7]:
# File encoding used when constructing the dataset.
encoding = "utf-8"

# Separator-token settings for the left (context) and right (candidate) inputs.
# The two lists hold equal values but are kept as separate objects.
left_sep_tokens = [
    ["cls", "sep"],
    [None, None],
]
right_sep_tokens = [
    ["cls", "sep"],
    [None, None],
]

# Fixed segment ids for each side.
left_fixed_segment_id = 0
right_fixed_segment_id = 1  # original inline note reads "0" - presumably an alternative value; confirm

approach = "ignore"

In [8]:
# Dataset location; the commented lines are alternative directories.
# multi_turn_data_dir = dataset_dir + "/dataset/preprocessed/dialog_pretrain/kor/multi_turn/"
multi_turn_data_dir = f"{dataset_dir}/dataset/preprocessed/dialog_pretrain/kor/single_dataset/KaggleConversation"
# multi_turn_data_dir = dataset_dir + "/dataset/conversation/SelectStar/kor/multi_turn"
multi_turn_data_extension = "json"

# Dataset over the files with the given extension in multi_turn_data_dir.
dialog_dataset = DatasetFromDir(
    data_dir=multi_turn_data_dir,
    batch_size=batch_size,
    encoding=encoding,
    extension=multi_turn_data_extension,
    device=device,
    nprocs=nprocs,
)

# Keyword arguments for the data loader, derived from the saved timesteps and
# the separator / segment settings defined in the previous cell.
dialog_retriever_data_loader_params = BlenderBotDialogRetrieverTrainer.get_data_loader_params(
    timesteps=model_hyperparams["context_encoder_hyperparams"]["timesteps"],
    left_sep_tokens=left_sep_tokens,
    right_sep_tokens=right_sep_tokens,
    left_fixed_segment_id=left_fixed_segment_id,
    right_fixed_segment_id=right_fixed_segment_id,
    approach=approach,
    nprocs=nprocs,
)

dialog_retriever_data_loader = dialog_retriever_trainer.create_data_loader(
    dataset=dialog_dataset,
    batch_size=batch_size,
    preprocessor=sentence_bert_prep,
    embedding_dict=model_hyperparams["context_encoder_hyperparams"]["embedding_dict"],
    num_workers=dialog_retriever_trainer.num_workers,
    pin_memory=dialog_retriever_trainer.pin_memory,
    device=device,
    **dialog_retriever_data_loader_params,
)

## Build Poly-Encoder

#### initialize model & load state_dict

In [9]:
# NOTE(review): bert_config is only assigned in a commented-out cell, and
# m_code / aggregation_method are not assigned in any visible cell. Confirm
# where they come from (possibly the star import from transformer.trainer.utils)
# before relying on Restart & Run All.

# Both encoders use the same model type and the same hyperparameters:
# the tokenizer's pad token id plus everything under bert_config["model"].
context_encoder_model = "bert"
context_encoder_hyperparams = {
    "pad_token_id": sentence_bert_prep.spm_tokenizer.special_token_dict["pad"]["id"],
    **bert_config["model"],
}
candidate_encoder_model = "bert"
candidate_encoder_hyperparams = {
    "pad_token_id": sentence_bert_prep.spm_tokenizer.special_token_dict["pad"]["id"],
    **bert_config["model"],
}
poly_encoder = PolyEncoder(
    context_encoder_model=context_encoder_model,
    context_encoder_hyperparams=context_encoder_hyperparams,
    candidate_encoder_model=candidate_encoder_model,
    candidate_encoder_hyperparams=candidate_encoder_hyperparams,
    m_code=m_code,
    aggregation_method=aggregation_method,
)

# This rebinds model_dir, replacing the value built from dataset_dir earlier.
model_dir = "/home/guest1/torch-transformer/20210806_213831/epoch_9/"
poly_encoder = load_state_dict(
    object=poly_encoder,
    path=model_dir,
    map_location=device,
)

#### load model (with saved hyperparams)

In [None]:
# Load the model through the trainer instead of building it by hand.
model_dir = "/home/guest1/torch-transformer/20210806_213831/epoch_9/"
loaded = dialog_retriever_trainer.load(path=model_dir)
poly_encoder = loaded["model"]

# "history" and "optimizer" are optional entries; when one is missing the
# corresponding name is left untouched.
if "history" in loaded:
    history = loaded["history"]
if "optimizer" in loaded:
    optimizer = loaded["optimizer"]

### Set Device

In [10]:
# Call eval() on the model (presumably torch's nn.Module.eval - confirm), then
# hand it to the trainer's set_device helper together with the chosen device.
poly_encoder.eval()
poly_encoder = BlenderBotDialogRetrieverTrainer.set_device(
    obj=poly_encoder,
    device=device,
)

Setting model device: cpu


## Load dialog_response_set

In [11]:
# path = dialog_retriever_trainer.save_dialog_history_set(path=model_dir, model=poly_encoder, data_loader=dialog_retriever_data_loader, device=device)
# print(path)
# path = dialog_retriever_trainer.save_dialog_response_set(path=model_dir, model=poly_encoder, data_loader=dialog_retriever_data_loader, device=device)
# print(path)

# dialog_history_set = dialog_retriever_trainer.load_dialog_response_set(path=model_dir)
# contexts, encoded_contexts = dialog_history_set
# Load the saved response set from model_dir and unpack its two parts:
# the candidates and their encoded form.
dialog_response_set = dialog_retriever_trainer.load_dialog_response_set(
    path=model_dir,
)
candidates, encoded_candidates = dialog_response_set

## Inference Test

In [19]:
def check_result(dataset, top_n=5, max_retry=5):
    """Lazily score each dataset row against the stored candidate responses.

    Each row is parsed into a context part and a candidate part, the context
    is encoded with the poly-encoder, and the best-scoring stored candidates
    are selected. Rows whose encoded context has no tokens are skipped.

    Args:
        dataset: Iterable of rows accepted by
            ``dialog_retriever_data_loader.parse_row``.
        top_n: Number of best-scoring candidates to keep for each row.
        max_retry: Unused; kept so existing calls keep working.

    Yields:
        Tuples ``(context_input_row, candidate_input_row, scores)`` where
        ``scores`` is the result of ``sentence_bert_prep.get_top_n_probs``
        (consumed elsewhere in this notebook as ``(index, probability)`` pairs).
    """
    # The stored candidate embeddings are the same for every row, so convert
    # them once instead of once per iteration.
    candidate_tensor = dialog_retriever_trainer.convert_to_tensor(data=encoded_candidates, device=device)
    candidate_array = dialog_retriever_trainer.convert_to_numpy(tensor=candidate_tensor)

    # Read the sequence length from the model itself; this notebook defines no
    # module-level `timesteps`.
    timesteps = poly_encoder.context_encoder.timesteps

    for row in dataset:
        context_input_row, candidate_input_row = dialog_retriever_data_loader.parse_row(row=row)
        context_inputs, _, _ = sentence_bert_prep.encode(
            left_inputs=[context_input_row],
            right_inputs=[candidate_input_row],
            timesteps=timesteps,
            left_sep_tokens=left_sep_tokens,
            right_sep_tokens=right_sep_tokens,
            left_fixed_segment_id=left_fixed_segment_id,
            right_fixed_segment_id=right_fixed_segment_id,
            approach="ignore",
        )
        context_inputs = {
            key: dialog_retriever_trainer.convert_to_tensor(data=value, device=device)
            for key, value in context_inputs.items()
        }
        # Nothing was encoded for this row; move on to the next one.
        if len(context_inputs["token"]) <= 0:
            continue

        context_embed = poly_encoder.encode_context(context_inputs=context_inputs, candidate_embed=candidate_tensor)
        context_embed = dialog_retriever_trainer.convert_to_numpy(tensor=context_embed)

        probs = sentence_bert_prep.get_candidate_probs(context_embed=context_embed, candidate_embed=candidate_array)
        # Only one context was encoded, so its probabilities are in probs[0].
        scores = sentence_bert_prep.get_top_n_probs(probs=probs[0], top_n=top_n)
        yield context_input_row, candidate_input_row, scores
        
# Create the generator only; rows are scored one at a time as next() is called on it.
check_result_iter = check_result(
    dataset=dialog_dataset,
    top_n=5,
)

In [144]:
# Pull the next scored sample: context row, candidate row and top scores.
row, _row, scores = next(check_result_iter)

# Dialog context, one line per turn, prefixed with the speaker id.
for speaker_id, utterance in zip(row["turn"][0], row["token"][0]):
    print("{}: {}".format(speaker_id, utterance))

# Retrieved candidates numbered from 1, followed by the reference answer.
for rank, (idx, prob) in enumerate(scores, start=1):
    print("\t{}) {} ({})".format(rank, candidates[idx], prob))
print("\tAns) {}".format(_row["token"][0][0]))

1: 여기 있는 사람들이 다 롤러코스터 타려고 기다리는 사람들이야?
0: 그런 것 같아요.
1: 얼마나 기다려야 할까?
0: 최소한 한 시간 반 정도 기다려야 될 것 같아요.
1: 한 시간 반? 줄이 너무 기니까 우리 다른 것부터 탈까?
	1) 그게 좋을 것 같아요. (0.8594509232046246)
	2) 성격도 좋고요. (0.12345854404417286)
	3) 쓰레기를 버리는 일이 제일 귀찮아요. (0.007075825937301573)
	4) 고마워요. 편해서 좋아요. (0.0019271001033580588)
	5) 좋아요. 같이 갑시다. (0.0018554869207873684)
	Ans) 그게 좋을 것 같아요.


In [None]:
# Scratch cell: the dialog context from the sample printed above, with its
# final turn split into two utterances, followed by one speaker id per
# utterance. Neither literal is assigned to a name; only the last expression
# is displayed as the cell output.
[
    "여기 있는 사람들이 다 롤러코스터 타려고 기다리는 사람들이야?", 
    "그런 것 같아요.", 
    "얼마나 기다려야 할까?", 
    "최소한 한 시간 반 정도 기다려야 될 것 같아요.", 
    "한 시간 반?", 
    "줄이 너무 기니까 우리 다른 것부터 탈까?"
]
[1, 0, 1, 0, 1, 1]

In [23]:
def inference_test(utterances, speaker_ids, top_n=5, max_retry=5):
    """Print the best-scoring stored candidates for a hand-written dialog.

    The utterances are encoded as a context with the poly-encoder, scored
    against the stored candidate embeddings, and the dialog plus the selected
    candidates are printed. Nothing is returned.

    Args:
        utterances: Dialog utterances, in order.
        speaker_ids: Speaker id for each utterance, in the same order.
        top_n: Number of best-scoring candidates to print.
        max_retry: Forwarded to ``sentence_bert_prep.encode_utterances``.
    """
    candidate_embed = dialog_retriever_trainer.convert_to_tensor(data=encoded_candidates, device=device)

    context_inputs = sentence_bert_prep.encode_utterances(
        utterances=utterances,
        speaker_ids=speaker_ids,
        timesteps=poly_encoder.context_encoder.timesteps,
        left_sep_tokens=left_sep_tokens,
        max_retry=max_retry,
    )
    context_inputs = {
        key: dialog_retriever_trainer.convert_to_tensor(data=value, device=device)
        for key, value in context_inputs.items()
    }
    context_embed = poly_encoder.encode_context(context_inputs=context_inputs, candidate_embed=candidate_embed)

    context_embed = dialog_retriever_trainer.convert_to_numpy(tensor=context_embed)
    candidate_embed = dialog_retriever_trainer.convert_to_numpy(tensor=candidate_embed)
    probs = sentence_bert_prep.get_candidate_probs(context_embed=context_embed, candidate_embed=candidate_embed)
    # Only one context was encoded, so its probabilities are in probs[0].
    scores = sentence_bert_prep.get_top_n_probs(probs=probs[0], top_n=top_n)

    for speaker_id, utterance in zip(speaker_ids, utterances):
        print("{}: {}".format(speaker_id, utterance))

    # Ranks start at 1, matching the numbering used when displaying dataset samples.
    for rank, (idx, prob) in enumerate(scores, start=1):
        print("\t{}) {} ({})".format(rank, candidates[idx], prob))

In [28]:
# Hand-written dialog: four utterances with alternating speaker ids.
utterances = [
    "휴 늦어서 미안해",
    "왜 이렇게 늦었어?",
    "생각보다 차가 많이 막혔어, 미안",
    "저 사람 누군지 알아?",
]
speaker_ids = [0, 1, 0, 1]

# Prints the dialog and the retrieved candidates.
inference_test(
    utterances=utterances,
    speaker_ids=speaker_ids,
    top_n=5,
    max_retry=5,
)

0: 휴 늦어서 미안해
1: 왜 이렇게 늦었어?
0: 생각보다 차가 많이 막혔어, 미안
1: 저 사람 누군지 알아?
	0) 아니요, 사람이 너무 많아서 못 받았어요. 다음 콘서트에 가면 꼭 사인을 받을 거예요. (0.2763103464327441)
	1) 네, 서울보다 해물이 신선하고 맛있었어요. (0.09542561162156468)
	2) 재미있긴 뭐가 재미있어? 지하철 붐빌 땐 얼마나 불편한데. (0.0833303590960072)
	3) 나는 방학 때 아무 계획도 없는데. 난 뭐 하지? (0.06033457965981522)
	4) 네, 윗집 아주머니가 시험이 끝날 때까지만 양해해 달라고 하셔서 더 불평할 수가 없었어요. (0.06006766015290987)
