In [1]:
import os
# Must be set before any tokenizer is created. Presumably this silences the
# HuggingFace `tokenizers` parallelism/fork warning — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from setproctitle import setproctitle
# Give the kernel process a recognizable title in process listings.
setproctitle("Hodong_PolyEncoder")

In [2]:
import json
import torch
from tqdm import tqdm
import numpy as np
from transformer.data.retriever_dataset import ElectraDatasetFromDir, RetrieverDataLoader
from transformer.tokenizer.utils import make_custom_tokenizer_from_pretrained, load_tokenizer_from_pretrained
from transformer.layers.embedding import EmbeddingAggregation
from transformer.models.interface import TrainHistory
from transformer.models.utils import load_state_dict, init_path, get_score_json
from transformer.utils.information_retrieval import BM25Okapi
from transformer.models.poly_encoder import PolyEncoder
from transformer.utils.common import set_device, convert_to_tensor, convert_to_numpy

### Set Working Directory

In [3]:
# Root directory that holds the tokenizers, datasets and model checkpoints.
# Set the POLY_ENCODER_DATASET_DIR environment variable to switch machines
# without editing this cell. Known locations:
#   AIBUD_DEV      : /Users/aibud_dev/_jupyter
#   Korea_Server   : /home/mnt/guest1
#   bigshane_local : D:\_jupyter
#   AWS (default)  : /home/ubuntu/data
dataset_dir = os.environ.get("POLY_ENCODER_DATASET_DIR", "/home/ubuntu/data")

# Project path configuration, resolved relative to the notebook's working directory.
path = "./config/file_path.json"
file_path = None
with open(path, "r", encoding="utf-8") as fp:
    file_path = json.load(fp)

### Load Tokenizer

In [4]:
# Local directory of the saved tokenizer, under the dataset root.
tokenizer_file_path = dataset_dir + "/huggingface_tokenizer/kor/koelectra-vanila"

# One-time export of the pretrained tokenizer to the local directory above:
# tokenizer = make_custom_tokenizer_from_pretrained(
#     model_type="electra",
#     name_or_path="monologg/koelectra-base-discriminator",
#     add_special_token=True,
# )
# tokenizer.save_pretrained(tokenizer_file_path)

# Load the tokenizer from the local copy.
tokenizer = load_tokenizer_from_pretrained(
    model_type="electra",
    name_or_path=tokenizer_file_path,
)
print("vocab_size:", len(tokenizer))

loaded pretrained huggingface_tokenizer: 'D:\_jupyter/huggingface_tokenizer/kor/koelectra-vanila'
vocab_size: 32200


### Load Dataset & DataLoader

In [5]:
# Sequence / batching settings shared by every split.
timesteps = 128
batch_size = 64
nprocs = 1
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Directory layout: <dataset_dir>/dataset/.../retriever/<dataset_name>/<split>/
dataset_name = "four_n2x8_both"
total_data_dir = dataset_dir + "/dataset/preprocessed/dialog_finetuning/retriever/{}/".format(dataset_name)
sample_data_dir, train_data_dir, val_data_dir, test_data_dir = (
    total_data_dir + split + "/" for split in ("sample", "train", "val", "test")
)

# Constructor arguments common to every ElectraDatasetFromDir below.
_dataset_kwargs = dict(tokenizer=tokenizer, timesteps=timesteps, batch_size=batch_size, device=device, nprocs=nprocs)

# total_dataset = ElectraDatasetFromDir(data_dir=total_data_dir, **_dataset_kwargs)

# The "Fit" cell uses train_dataset / train_data_loader / val_dataset / val_data_loader;
# uncomment the four lines below before training.
# train_dataset = ElectraDatasetFromDir(data_dir=train_data_dir, **_dataset_kwargs)
# train_data_loader = RetrieverDataLoader(dataset=train_dataset, batch_size=batch_size, device=device)
# val_dataset = ElectraDatasetFromDir(data_dir=val_data_dir, **_dataset_kwargs)
# val_data_loader = RetrieverDataLoader(dataset=val_dataset, batch_size=batch_size, device=device)

# Evaluation split (the only one loaded by default).
test_dataset = ElectraDatasetFromDir(data_dir=test_data_dir, **_dataset_kwargs)
test_data_loader = RetrieverDataLoader(dataset=test_dataset, batch_size=batch_size, device=device)

Preprocessing data: 100%|█████████████████████████████████████████████████████████| 3481/3481 [00:06<00:00, 548.21it/s]


In [6]:
# test_data_loader.check()

### Define Model

In [12]:
# BM25 baseline built from the test dataset.
bm25 = BM25Okapi(
    tokenizer=tokenizer,
    dataset=test_dataset,
)

# PolyEncoder on an ELECTRA encoder, plus its optimizer.
poly_encoder = PolyEncoder(
    encoder_type="electra",
    vocab_size=len(tokenizer),
    m_code=64,
)
optimizer = poly_encoder.get_optimizer(lr=3e-5)

# Place the model and the optimizer on the selected device.
poly_encoder = set_device(poly_encoder, device=device)
optimizer = set_device(optimizer, device=device)

'temp_dir' has been set to './20211006_160520/' to save model while training


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bigshane\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Extracting responses: 100%|█████████████████████████████████████████████████████| 3481/3481 [00:00<00:00, 62158.34it/s]


Total 3218 candidates has been extracted
'temp_dir' has been set to './20211006_160525/' to save model while training


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bigshane\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Some weights of the model checkpoint at monologg/koelectra-base-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at monologg/koelectra-base-discrim

Setting model device: cuda:0


### Fit

In [None]:
epoch = 30
model_dir = dataset_dir + "/model/poly_encoder/v3/{}/".format(dataset_name)

# Fail fast, BEFORE touching model_dir: the train/val datasets and loaders are
# commented out in the "Load Dataset & DataLoader" cell, so on a fresh kernel the
# loop below would hit a NameError only after init_path(model_dir, True) has run.
_required_names = ("train_dataset", "train_data_loader", "val_dataset", "val_data_loader")
_missing_names = [_name for _name in _required_names if _name not in globals()]
if _missing_names:
    raise NameError(
        "Fit requires {} - enable their construction in the "
        "'Load Dataset & DataLoader' cell first".format(", ".join(_missing_names))
    )

# NOTE(review): the second positional argument is presumably `reset`
# (cf. init_path(log_dir, reset=True) in the scoring cell) — confirm.
init_path(model_dir, True)

metrics = ["hits", "semantic_score"]
hits_k = [1, 2, 5, 10]
name_or_path = "beomi/kcbert-base"

# Histories accumulated across all epochs.
train_history = TrainHistory()
val_history = TrainHistory()
for _epoch in range(1, epoch+1):
    # train
    epoch_train_history = poly_encoder.iteration_epoch(data_loader=train_data_loader, optimizer=optimizer, device=device, train=True, verbose_per_batch=-1)
    # compute_scores
    train_candidates, train_candidate_embeds = poly_encoder.extract_candidates(device=device, dataset=train_dataset, additional_responses=None, concat_candidates=True, verbose=False)
    train_scores = poly_encoder.compute_scores(metrics=metrics, tokenizer=tokenizer, data_loader=train_data_loader, device=device, hits_k=hits_k, name_or_path=name_or_path, candidates=train_candidates, candidate_embeds=train_candidate_embeds)
    for metric, metric_score in train_scores.items():
        epoch_train_history._add_acc(name=metric, value=metric_score)

    epoch_train_history_str = poly_encoder.verbose_template.format(mode="Epoch_train", device=device, idx=_epoch, num_iters=epoch) + str(epoch_train_history)
    print(epoch_train_history_str)
    train_history += epoch_train_history

    # val (train=False)
    epoch_val_history = poly_encoder.iteration_epoch(data_loader=val_data_loader, optimizer=optimizer, device=device, train=False, verbose_per_batch=-1)
    # compute_scores
    val_candidates, val_candidate_embeds = poly_encoder.extract_candidates(device=device, dataset=val_dataset, additional_responses=None, concat_candidates=True, verbose=False)
    val_scores = poly_encoder.compute_scores(metrics=metrics, tokenizer=tokenizer, data_loader=val_data_loader, device=device, hits_k=hits_k, name_or_path=name_or_path, candidates=val_candidates, candidate_embeds=val_candidate_embeds)
    for metric, metric_score in val_scores.items():
        epoch_val_history._add_acc(name=metric, value=metric_score)

    epoch_val_history_str = poly_encoder.verbose_template.format(mode="Epoch_val", device=device, idx=_epoch, num_iters=epoch) + str(epoch_val_history)
    print(epoch_val_history_str)
    val_history += epoch_val_history

    # Checkpoint: reuse the candidates already extracted from the train split this epoch.
    candidates, candidate_embeds = train_candidates, train_candidate_embeds
    poly_encoder.save(path=model_dir + "epoch_{}/".format(_epoch), optimizer=optimizer, tokenizer=tokenizer, candidates=(candidates, candidate_embeds))
    # Append this epoch's train/val summaries to the run log.
    with open(model_dir+"log.txt", "a", encoding="utf-8") as fp:
        fp.write(epoch_train_history_str + "\n")
        fp.write(epoch_val_history_str + "\n")

In [None]:
# # Extract Candidates
# epoch = 30
# input_model_dir = dataset_dir + "/model/poly_encoder/v3/concat/{}/".format(dataset_name)
# output_model_dir = dataset_dir + "/model/poly_encoder/v3/seperate/{}/".format(dataset_name)
# from transformer.models.utils import load_state_dict
# for _epoch in range(1, epoch + 1):
#     poly_encoder = load_state_dict(object=poly_encoder, path=input_model_dir+"epoch_{}/".format(_epoch))
#     candidates, candidate_embeds = poly_encoder.extract_candidates(device=device, dataset=train_dataset, additional_responses=None, concat_candidates=False, verbose=True)
#     poly_encoder.save(path=output_model_dir+"epoch_{}/".format(_epoch), optimizer=optimizer, tokenizer=tokenizer, candidates=(candidates, candidate_embeds))

### Compute Scores

#### PolyEncoder

In [10]:
epoch = 50
metrics = ["hits", "semantic_score"]
hits_k = [1, 2, 5, 10]
name_or_path = "beomi/kcbert-base"
model_name = "PolyEncoder"

# NOTE(review): the "Fit" cell saves checkpoints under ".../model/poly_encoder/v3/<dataset_name>/"
# for 30 epochs, while this cell reads ".../model/poly_encoder/<dataset_name>/" for 50 —
# confirm the intended checkpoint directory and epoch count.
model_dir = dataset_dir + "/model/poly_encoder/{dataset_name}/".format(dataset_name=dataset_name)
log_dir = dataset_dir + "/essay/poly_encoder/{dataset_name}/".format(dataset_name=dataset_name)
init_path(log_dir, reset=True)
for _epoch in range(1, epoch+1):
    # Load the checkpoint of this epoch and score it on the test split.
    poly_encoder = load_state_dict(object=poly_encoder, path=model_dir+"epoch_{_epoch}/".format(_epoch=_epoch))
    candidates, candidate_embeds = poly_encoder.extract_candidates(device=device, dataset=test_dataset, additional_responses=None, verbose=True)
    scores = poly_encoder.compute_scores(metrics=metrics, tokenizer=tokenizer, data_loader=test_data_loader, device=device, hits_k=hits_k, name_or_path=name_or_path, candidates=candidates, candidate_embeds=candidate_embeds)
    output_json = get_score_json(model_name=model_name, dataset_name=dataset_name, test_data_size=len(test_data_loader.dataset), batch_size=batch_size, scores=scores)

    # verbose & append log
    eval_history = TrainHistory()
    loss_dict = dict()
    acc_dict = {metric: metric_score for metric, metric_score in scores.items()}
    eval_history.update(loss_dict=loss_dict, acc_dict=acc_dict, lr=-1)
    eval_str = poly_encoder.verbose_template.format(mode="Eval", device=device, idx=_epoch, num_iters=epoch) + str(eval_history)
    print(eval_str)

    with open(log_dir + "/score_logs.txt", "a", encoding="utf-8") as fp:
        fp.write(eval_str + "\n")

    # write detailed logs
    init_path(log_dir + "/detailed/", reset=False)
    with open(log_dir + "/detailed/score_logs_{_epoch}.json".format(_epoch=_epoch), "w", encoding="utf-8") as fp:
        # json.dump writes to the file; the previous json.dumps(...) only built a
        # string and discarded it, leaving every detailed log file empty.
        json.dump(output_json, fp, ensure_ascii=False)

Computing scores: 100%|████████████████████████████████████████████████████████████████| 46/46 [00:14<00:00,  3.18it/s]


#### BM25

In [None]:
metrics = ["hits", "semantic_score"]
hits_k = [1, 2, 5, 10]
name_or_path = "beomi/kcbert-base"
# This section scores the BM25 baseline (the label was copy-pasted as "PolyEncoder").
model_name = "BM25"

# BM25 ranks the same candidate responses that the PolyEncoder extracts from the test split.
candidates, candidate_embeds = poly_encoder.extract_candidates(device=device, dataset=test_dataset, additional_responses=None, verbose=True)
bm25.set_candidates(candidates=candidates)
scores = bm25.compute_scores(metrics=metrics, tokenizer=tokenizer, data_loader=test_data_loader, device=device, hits_k=hits_k)

# Wrap the scores in a TrainHistory so they are formatted like the PolyEncoder results.
eval_history = TrainHistory()
loss_dict = dict()
acc_dict = {metric: metric_score for metric, metric_score in scores.items()}
eval_history.update(loss_dict=loss_dict, acc_dict=acc_dict, lr=-1)
eval_str = bm25.verbose_template.format(mode="Eval", device=device, idx=-1, num_iters=-1) + str(eval_history)
# The summary was previously built but never shown.
print(eval_str)

### Test Service

In [None]:
from transformer.services.dialog_retriever.poly_encoder import PolyEncoderDialogRetriever

# Checkpoint to serve.
_epoch = 10
_model_dir = dataset_dir + "/model/poly_encoder/{dataset_name}/epoch_{_epoch}/".format(dataset_name=dataset_name, _epoch=_epoch)

# Build the retriever service and load the checkpoint into it.
service = PolyEncoderDialogRetriever()
service.verbose = False
service.set_device(device=device)
service.load_model(model_dir=_model_dir)

# Optional: replace the candidates stored with the checkpoint by ones extracted from train_dataset.
# print("prev candidates size: {} / {}".format(len(service.candidates), len(service.candidate_embeds)))
# candidates, candidate_embeds = service.model.extract_candidates(device=device, dataset=train_dataset, additional_responses=None, concat_candidates=False, verbose=True)
# service.set_candidates(candidates=candidates, candidate_embeds=candidate_embeds)
print("cur candidates size: {} / {}".format(len(service.candidates), len(service.candidate_embeds)))

In [22]:
# Dialogue context fed to the retriever; uncomment more turns to extend it.
utterances = [
    "안녕하세요",
#     "무슨 일로 저에게 상담을 신청하셨나요?",
#     "요즘 인간관계가 고민이에요.",
#     "어떤 고민이죠?",
#     "친구들이랑 연락도 뜸해지고 자주 못만나서 서먹해지는 것 같아요",
#     "이래저래 연락하기 힘드신가봐요",
#     "네, 코로나 때문에 만나질 못해서 더 혼자가 된 느낌이에요.",
#     "저도 지쳐요.",
#     "당신도 사람들을 자주 못 만나시나봐요"
]
# Alternating speaker ids, starting with 1: 1, 0, 1, 0, ...
speaker_ids = [(turn + 1) % 2 for turn, _ in enumerate(utterances)]
min_length = 10
top_n = 5

# Run the same query without and then with BM25 weighting.
for _label, _use_bm25 in (("vanila:\n", False), ("weighted:\n", True)):
    outputs = service.infer_next_utterance(
        utterances=utterances,
        speaker_ids=speaker_ids,
        min_length=min_length,
        top_n=top_n,
        weight_bm25=_use_bm25,
        prev_utterance=None,
        intersection_tolerance=0.9,
        max_retry=5,
    )
    print(_label, outputs)

before generate
input_ids: 1
input_ids: torch.Size([1, 4])
output: tensor([[    1,     0, 22465, 22465, 23935, 23935, 23935, 27667, 23935, 23935,
         17275, 23935, 23935, 10619,     0,     0, 22465,     0,     0, 23935,
             0,     0,     1]], device='cuda:0')


'안녕 안녕하세요하세요하세요와이하세요하세요세요하세요하세요못 안녕하세요'

### Compute Metrics

In [None]:
from transformer.services.dialog_retriever.poly_encoder import PolyEncoderDialogRetriever

# Checkpoint to evaluate.
_epoch = 34
_model_dir = dataset_dir + "/model/poly_encoder/v2/{dataset_name}/epoch_{_epoch}/".format(dataset_name=dataset_name, _epoch=_epoch)

# Build the retriever service and load the checkpoint into it.
service = PolyEncoderDialogRetriever()
service.verbose = False
service.set_device(device=device)
service.load_model(model_dir=_model_dir)

# Swap the candidates that came with the checkpoint for ones extracted from the test split.
print("prev candidates size: {} / {}".format(len(service.candidates), len(service.candidate_embeds)))
candidates, candidate_embeds = service.model.extract_candidates(
    device=device,
    dataset=test_dataset,
    additional_responses=None,
    concat_candidates=False,
    verbose=True,
)
service.set_candidates(candidates=candidates, candidate_embeds=candidate_embeds)
print("cur candidates size: {} / {}".format(len(service.candidates), len(service.candidate_embeds)))

In [None]:
from transformer.data.utils import simplify_speaker_ids
from transformer.utils.common import get_last_index
from KoBERTScore import BERTScore
from transformer.models.utils import compute_bleu, compute_meteor, compute_rouge, compute_hits, compute_semantic_score

def get_metric_inputs(dataset, min_length=1, top_n=10):
    """Yield one prediction bundle per row of ``dataset.raw_data``.

    Each dialogue is cut after the last turn whose simplified speaker id is 1:
    the turns up to and including it form the context, the remaining turns are
    joined into the reference. The module-level ``service`` is then queried
    three ways: BM25 only, PolyEncoder without BM25 weighting, and PolyEncoder
    with BM25 weighting.

    Args:
        dataset: object exposing ``raw_data``, an indexable sequence of dicts
            with ``"utterances"`` and ``"speaker_ids"`` entries.
        min_length: forwarded to ``service.infer_next_utterance``.
        top_n: number of candidates requested from every retriever.

    Yields:
        dict with keys ``context``, ``reference``, ``bm25_prediction``,
        ``unweighted_prediction`` and ``weighted_prediction``; or ``None`` for a
        row whose retrieval raised (a warning is emitted and the row is skipped
        by the caller).
    """
    import warnings

    for row_idx in range(len(dataset.raw_data)):
        row = dataset.raw_data[row_idx]
        _utterances = row["utterances"]
        _speaker_ids = simplify_speaker_ids(row["speaker_ids"], user_id=1, model_id=0)
        last_index = get_last_index(_speaker_ids, value=1)
        utterances = _utterances[:last_index+1]
        speaker_ids = _speaker_ids[:last_index+1]
        reference = " ".join(_utterances[last_index+1:])

        try:
            context = " ".join(utterances)
            bm25_prediction = list(service.bm25.get_top_n(context=context, n=top_n))

            # unweighted
            outputs = service.infer_next_utterance(utterances=utterances, speaker_ids=speaker_ids,
                                                   min_length=min_length, top_n=top_n, weight_bm25=False,
                                                   prev_utterance=None, intersection_tolerance=0.9, max_retry=5)
            unweighted_prediction = [candidate[0] for candidate in outputs]

            # weighted
            outputs = service.infer_next_utterance(utterances=utterances, speaker_ids=speaker_ids,
                                                   min_length=min_length, top_n=top_n, weight_bm25=True,
                                                   prev_utterance=None, intersection_tolerance=0.9, max_retry=5)
            weighted_prediction = [candidate[0] for candidate in outputs]

            output = {
                "context": context,
                "reference": reference,
                "bm25_prediction": bm25_prediction,
                "unweighted_prediction": unweighted_prediction,
                "weighted_prediction": weighted_prediction
            }
        except Exception as error:
            # Best-effort: a failing row is reported and yielded as None.
            # `except Exception` (not a bare except) lets KeyboardInterrupt and
            # GeneratorExit propagate.
            warnings.warn("get_metric_inputs: row {} skipped ({!r})".format(row_idx, error))
            output = None

        # Yield outside the try block so that closing the generator here is not
        # intercepted by the error handler above.
        yield output

In [None]:
metrics = ["hits", "semantic_score"]
hits_k = [1, 2, 5, 10]
name_or_path = "beomi/kcbert-base"
min_length = 1
# Retrieve enough candidates to evaluate the largest hits@k.
top_n = max(hits_k)

metric_input_gen = get_metric_inputs(dataset=test_dataset, min_length=min_length, top_n=top_n)

# Collect per-row predictions of the three retrieval variants; rows that failed come back as None.
bm25_predictions = []
unweighted_predictions = []
weighted_predictions = []
references = []
# The generator yields exactly one item per row of test_dataset.raw_data.
for gen_output in tqdm(metric_input_gen, total=len(test_dataset.raw_data)):
    if gen_output is None:
        continue
    references.append(gen_output["reference"])
    bm25_predictions.append(gen_output["bm25_prediction"])
    unweighted_predictions.append(gen_output["unweighted_prediction"])
    weighted_predictions.append(gen_output["weighted_prediction"])

# Build the BERTScore metric once and share it across the three evaluations
# (it was previously re-created before each one).
if "semantic_score" in metrics:
    poly_encoder.metrics["semantic_score"] = BERTScore(model_name_or_path=name_or_path, best_layer=-1, device=device)


def _score_predictions(predictions):
    """Score one retrieval variant against the collected `references`.

    `predictions` is a list (one entry per row) of ranked candidate lists.
    Returns a dict holding an entry for every metric enabled in `metrics`.
    """
    variant_scores = dict()
    if "hits" in metrics:
        # hits requires predictions: List[List[any]], references: List[str]
        variant_scores["hits"] = compute_hits(predictions=predictions, references=references, k=hits_k)
    if "semantic_score" in metrics:
        # BERTScore requires predictions: List[str], references: List[str] -> use the top-1 candidate
        top1_predictions = [prediction[0] for prediction in predictions]
        variant_scores["semantic_score"] = compute_semantic_score(metric=poly_encoder.metrics["semantic_score"], tokenizer=tokenizer, predictions=top1_predictions, references=references)
    return variant_scores


# bm25_scores
bm25_scores = _score_predictions(bm25_predictions)
print("bm25_scores:", bm25_scores)

# unweighted_scores
unweighted_scores = _score_predictions(unweighted_predictions)
print("unweighted_scores:", unweighted_scores)

# weighted_scores
weighted_scores = _score_predictions(weighted_predictions)
print("weighted_scores:", weighted_scores)