In [1]:
import os
# Turn off the tokenizers library's internal parallelism; set before the transformers imports in the next cell.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from setproctitle import setproctitle
# Give the kernel process a recognizable title in process listings.
setproctitle("Hodong_BART")

In [2]:
import json
import torch
from tqdm import tqdm
import numpy as np
from transformers import BartModel
from transformers import BartForConditionalGeneration
from transformers import PreTrainedTokenizerFast
from transformer.data.generator_dataset import BartDatasetFromDir, GeneratorDataLoader
from transformer.tokenizer.utils import make_custom_tokenizer_from_pretrained, load_tokenizer_from_pretrained
from transformer.models.interface import TrainHistory
from transformer.models.bart import Bart
from transformer.models.utils import load_state_dict, get_score_json
from transformer.utils.common import set_device, convert_to_tensor, init_path

### Set Working Directory

In [3]:
# Machine-specific dataset root. Keep exactly one environment block below active;
# every block reads the same ./config/file_path.json into `file_path`.

# # AIBUD_DEV
# dataset_dir = "/Users/aibud_dev/_jupyter"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

# Korea_Server
dataset_dir = "/home/mnt/guest1"
path = "./config/file_path.json"
file_path = None
with open(path, "r", encoding="utf-8") as fp:
    file_path = json.load(fp)

# # bigshane_local
# (raw string: "\_" is not a valid escape sequence in a normal string literal)
# dataset_dir = r"D:\_jupyter"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

# # AWS
# dataset_dir = "/home/ubuntu/data"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

### Load Tokenizer

In [4]:
# Local directory the tokenizer is loaded from (and saved to by the one-time snippet below).
tokenizer_file_path = dataset_dir + "/huggingface_tokenizer/kor/kobart-vanila"

# One-time setup: build the tokenizer from the "hyunwoongko/kobart" checkpoint with the
# project's special tokens and save it under `tokenizer_file_path`.
# # save tokenizer to local
# tokenizer_path = "hyunwoongko/kobart"
# add_special_token = True
# tokenizer = make_custom_tokenizer_from_pretrained(model_type="bart", name_or_path=tokenizer_path, add_special_token=add_special_token)
# tokenizer.save_pretrained(tokenizer_file_path)

# Load the locally saved tokenizer; its length is later used as the model's vocab_size.
tokenizer = load_tokenizer_from_pretrained(model_type="bart", name_or_path=tokenizer_file_path)
print("vocab_size:", len(tokenizer))

update unregistered special_tokens to class_variables:{'num_token': '<num>', 'num_token_id': 30000, 'cls_token': '<cls>', 'cls_token_id': 30001, 'sep_token': '<sep>', 'sep_token_id': 30002, 'turn_token': '<turn>', 'turn_token_id': 30003, 'topic_token': '<tpc>', 'topic_token_id': 30004, 'situation_token': '<situ>', 'situation_token_id': 30005, 'context_token': '<ctxt>', 'context_token_id': 30006, 'condition_token': '<cond>', 'condition_token_id': 30007, 'candidate_token': '<cand>', 'candidate_token_id': 30008, 'persona_token': '<pers>', 'persona_token_id': 30009, 'speaker_1_token': '<spk1>', 'speaker_1_token_id': 30010, 'speaker_2_token': '<spk2>', 'speaker_2_token_id': 30011}
loaded pretrained huggingface_tokenizer: '/Users/aibud_dev/_jupyter/huggingface_tokenizer/kor/kobart-customed'
vocab_size: 30012


### Load Dataset & DataLoader

In [7]:
# --- runtime / batching configuration ---
timesteps = 128
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = 64
nprocs = 1
# Conditioning options; passed to the training split only (val/test disable them below).
use_condition = True
alpha_blending = 0.5

# --- dataset locations: every split is a sub-directory of the dataset root ---
dataset_name = "four_n2x8_both"
total_data_dir = dataset_dir + "/dataset/preprocessed/dialog_finetuning/kor/condition/{}/".format(dataset_name)
sample_data_dir = total_data_dir + "sample/"
train_data_dir = total_data_dir + "train/"
val_data_dir = total_data_dir + "val/"
test_data_dir = total_data_dir + "test/"

# Arguments shared by the datasets / loaders of all three splits.
_dataset_kwargs = dict(tokenizer=tokenizer, timesteps=timesteps, batch_size=batch_size, device=device, nprocs=nprocs)
_loader_kwargs = dict(batch_size=batch_size, device=device)

# Training split: uses the condition settings configured above.
train_dataset = BartDatasetFromDir(data_dir=train_data_dir, use_condition=use_condition, alpha_blending=alpha_blending, **_dataset_kwargs)
train_data_loader = GeneratorDataLoader(dataset=train_dataset, **_loader_kwargs)

# Validation split: condition disabled (use_condition=False, alpha_blending=-1).
val_dataset = BartDatasetFromDir(data_dir=val_data_dir, use_condition=False, alpha_blending=-1, **_dataset_kwargs)
val_data_loader = GeneratorDataLoader(dataset=val_dataset, **_loader_kwargs)

# Test split: same settings as validation.
test_dataset = BartDatasetFromDir(data_dir=test_data_dir, use_condition=False, alpha_blending=-1, **_dataset_kwargs)
test_data_loader = GeneratorDataLoader(dataset=test_dataset, **_loader_kwargs)

Preprocessing data: 100%|██████████| 5000/5000 [00:09<00:00, 534.64it/s]
Preprocessing data: 100%|██████████| 5000/5000 [00:08<00:00, 572.00it/s]


In [None]:
# Run the project loader's `check()` helper on the test split (behaviour defined by GeneratorDataLoader).
test_data_loader.check()

### Define Model

In [6]:
# Build the project's Bart wrapper sized to the tokenizer vocabulary, plus its optimizer.
bart = Bart(vocab_size=len(tokenizer))
optimizer = bart.get_optimizer(lr=5e-5)

# Move both the model and the optimizer to the selected device.
bart = set_device(bart, device=device)
optimizer = set_device(optimizer, device=device)

# Optional: resume from a previously saved checkpoint instead of starting fresh.
# # continue learning
# _model_dir = dataset_dir + "/model/bart/four_n2x8_both/epoch_{}/".format(7)
# bart = load_state_dict(object=bart, path=_model_dir)

'temp_dir' has been set to './20210909_192739/' to save model while training
Setting model device: cuda:0


### Fit

In [None]:
def count_parameters(model):
    """Return the total element count of the parameters of ``model`` that require gradients."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total
# Display the number of trainable parameters of the BART model.
count_parameters(bart)

In [None]:
# Fine-tuning loop: for every epoch run one training pass and one validation pass,
# score both splits with the configured text metrics, save a checkpoint and append to log.txt.
epoch = 30
model_dir = dataset_dir + "/model/bart/v2/{dataset_name}/".format(dataset_name=dataset_name)
# NOTE(review): second positional argument is presumably `reset` (init_path is called with
# reset=True/False elsewhere in this notebook) - if so this wipes earlier checkpoints; confirm.
init_path(model_dir, True)

# Metric configuration forwarded to bart.compute_scores.
metrics = ["bleu", "meteor", "rouge", "semantic_score"]
bleu_ngrams = [3, 4]
rouge_types = ["1", "2", "L"]
name_or_path = "beomi/kcbert-base"
decoding_method = "beam_search"

# Histories accumulated over all epochs.
train_history = TrainHistory()
val_history = TrainHistory()
for _epoch in range(1, epoch+1):
    # train
    epoch_train_history = bart.iteration_epoch(data_loader=train_data_loader, optimizer=optimizer, device=device, train=True, verbose_per_batch=-1)
    # compute scores
    train_scores = bart.compute_scores(metrics=metrics, data_loader=train_data_loader, device=device, tokenizer=tokenizer, timesteps=timesteps, bleu_ngrams=bleu_ngrams, rouge_types=rouge_types, name_or_path=name_or_path, decoding_method=decoding_method)
    # Attach every metric score to this epoch's training history.
    for metric, metric_score in train_scores.items():
        epoch_train_history._add_acc(name=metric, value=metric_score)
        
    epoch_train_history_str = bart.verbose_template.format(mode="Epoch_train", device=device, idx=_epoch, num_iters=epoch) + str(epoch_train_history)
    print(epoch_train_history_str)
    train_history += epoch_train_history
    
    # val
    # Same call as the training pass but with train=False; the optimizer is still passed in.
    epoch_val_history = bart.iteration_epoch(data_loader=val_data_loader, optimizer=optimizer, device=device, train=False, verbose_per_batch=-1)
    # compute scores
    val_scores = bart.compute_scores(metrics=metrics, data_loader=val_data_loader, device=device, tokenizer=tokenizer, timesteps=timesteps, bleu_ngrams=bleu_ngrams, rouge_types=rouge_types, name_or_path=name_or_path, decoding_method=decoding_method)
    # Attach every metric score to this epoch's validation history.
    for metric, metric_score in val_scores.items():
        epoch_val_history._add_acc(name=metric, value=metric_score)
        
    epoch_val_history_str = bart.verbose_template.format(mode="Epoch_val", device=device, idx=_epoch, num_iters=epoch) + str(epoch_val_history)
    print(epoch_val_history_str)
    val_history += epoch_val_history
    
    # Save a per-epoch checkpoint (model, optimizer, tokenizer) and append both summary lines to the log.
    bart.save(path=model_dir + "epoch_{}/".format(_epoch), optimizer=optimizer, tokenizer=tokenizer)
    with open(model_dir+"log.txt", "a", encoding="utf-8") as fp: 
        fp.write(epoch_train_history_str + "\n")
        fp.write(epoch_val_history_str + "\n")

### Compute Scores

In [None]:
# Evaluate every saved checkpoint (epoch_1 .. epoch_{epoch}) on the test split,
# appending a one-line summary per checkpoint and writing a detailed JSON file per checkpoint.
epoch = 20  # number of checkpoints to evaluate; rebinds the `epoch` used by the training cell
metrics = ["bleu", "meteor", "rouge", "semantic_score"]
bleu_ngrams = [3, 4]
rouge_types = ["1", "2", "L"]
name_or_path = "beomi/kcbert-base"
decoding_method = "beam_search"
model_name = "KoBART"

# NOTE(review): the training cell saves checkpoints under "/model/bart/v2/{dataset_name}/",
# while this cell reads from "/model/bart/{dataset_name}/" - confirm which directory is intended.
model_dir = dataset_dir + "/model/bart/{}/".format(dataset_name)
log_dir = dataset_dir + "/essay/bart/ft_o_rt_x/{dataset_name}/".format(dataset_name=dataset_name)
init_path(log_dir, reset=True)
for _epoch in range(1, epoch+1):
    bart = load_state_dict(object=bart, path=model_dir+"epoch_{}/".format(_epoch))
    scores = bart.compute_scores(metrics=metrics, data_loader=test_data_loader, device=device, tokenizer=tokenizer, timesteps=timesteps, bleu_ngrams=bleu_ngrams, rouge_types=rouge_types, name_or_path=name_or_path, decoding_method=decoding_method)
    output_json = get_score_json(model_name=model_name, dataset_name=dataset_name, test_data_size=len(test_data_loader.dataset), batch_size=batch_size, scores=scores)

    # verbose & append log
    # Wrap the scores in a TrainHistory only to reuse its string formatting (no losses, lr=-1).
    eval_history = TrainHistory()
    loss_dict = dict()
    acc_dict = dict(scores)
    eval_history.update(loss_dict=loss_dict, acc_dict=acc_dict, lr=-1)
    eval_str = bart.verbose_template.format(mode="Eval", device=device, idx=_epoch, num_iters=epoch) + str(eval_history)
    print(eval_str)

    with open(log_dir + "/score_logs.txt", "a", encoding="utf-8") as fp:
        fp.write(eval_str + "\n")

    # write detailed logs
    init_path(log_dir + "/detailed/", reset=False)
    with open(log_dir + "/detailed/score_logs_{_epoch}.json".format(_epoch=_epoch), "w", encoding="utf-8") as fp:
        # Fix: the original called json.dumps(...) and discarded the result, leaving the file empty.
        # Assumes get_score_json returns a JSON-serializable object (not a pre-encoded string) - confirm.
        json.dump(output_json, fp, ensure_ascii=False)

In [None]:
# Evaluate one saved checkpoint on the test split.
# NOTE(review): this cell originally used an undefined `gpt2` object and a GPT-2 checkpoint
# directory ("/model/gpt2/v2/condition/..."), so it raised NameError in this notebook. It now
# targets the BART model and the checkpoint directory written by the training cell - confirm
# that this is the intended checkpoint location.
_epoch = 5
model_dir = dataset_dir + "/model/bart/v2/{}/".format(dataset_name)
bart = load_state_dict(object=bart, path=model_dir+"epoch_{}/".format(_epoch))
# train=False: evaluation pass only; the optimizer is passed because iteration_epoch takes it.
epoch_test_history = bart.iteration_epoch(data_loader=test_data_loader, optimizer=optimizer, device=device, train=False, verbose_per_batch=-1)
print(epoch_test_history)

### Test Service

In [None]:
# Load the BART dialogue-generation service from a saved checkpoint (epoch `_epoch`).
from transformer.services.dialog_generator.bart import BartDialogGenerator
service = BartDialogGenerator()
service.verbose = False
_epoch = 6
service.set_device(device=device)
_model_dir = dataset_dir + "/model/bart/v2/condition/{dataset_name}/epoch_{_epoch}".format(dataset_name=dataset_name, _epoch=_epoch)
service.load_model(model_dir=_model_dir)

In [None]:
# Dialogue context for the demo; uncomment further turns to extend it.
# (A trailing comma was added to the first commented turn so that uncommenting the block
#  does not silently concatenate it with the following string.)
utterances = [
    "안녕하세요",
#     "무슨 일로 저에게 상담을 신청하셨나요?",
#     "요즘 인간관계가 고민이에요.",
#     "어떤 고민이죠?",
#     "친구들이랑 연락도 뜸해지고 자주 못만나서 서먹해지는 것 같아요",
#     "이래저래 연락하기 힘드신가봐요",
#     "네, 코로나 때문에 만나질 못해서 더 혼자가 된 느낌이에요.",
#     "저도 지쳐요.",
#     "당신도 사람들을 자주 못 만나시나봐요"
 ]
# Alternating speaker ids starting with 1: 1, 0, 1, 0, ...
speaker_ids = [(i+1)%2 for i in range(len(utterances))]

# Generate candidate next utterances with beam search, without a condition.
outputs = service.infer_next_utterance_beam_search(utterances=utterances, speaker_ids=speaker_ids, conditions=None,
                                                   min_length=10, top_n=5, repetition_penalty=2.0, no_repeat_ngram_size=3,
                                                   beam_size=10, prev_utterance=None, intersection_tolerance=0.9, max_retry=5, return_probs=True)
# Display the first element of the first returned candidate.
outputs[0][0]

In [None]:
# Second demo context; uncomment further turns to extend it.
# (A trailing comma was added to the first commented turn so that uncommenting the block
#  does not silently concatenate it with the following string.)
utterances = [
    "오늘 하루가 정말 피곤하네요",
#     "무슨 일이 있으셨죠? 오늘 하루는 어떠셨어요?",
#     "회사에서 일이 너무 많았어요.",
#     "많이 힘드셨겠어요 힘내세요",
#     "고마워요. 게다가 요즘 상사에게 자꾸 혼나요.",
#     "왜  혼나는 일들이 쌓이셨나요?",
#     "저번에 시키신 일을 제대로 못했거든요.",
#     "어떤 일이 있었는지 말해주실 수 있나요?",
#     "제가 서류를 잘못 가져다드렸어요."
 ]
# Alternating speaker ids starting with 1: 1, 0, 1, 0, ...
speaker_ids = [(i+1)%2 for i in range(len(utterances))]

# Generate candidate next utterances with beam search, without a condition.
outputs = service.infer_next_utterance_beam_search(utterances=utterances, speaker_ids=speaker_ids, conditions=None,
                                                   min_length=10, top_n=5, repetition_penalty=2.0, no_repeat_ngram_size=3,
                                                   beam_size=10, prev_utterance=None, intersection_tolerance=0.9, max_retry=5, return_probs=True)
# Display the first element of the first returned candidate.
outputs[0][0]

### Conditional Generation Test

In [None]:
# Generator: BART dialogue generator loaded from a condition-trained checkpoint.
from transformer.services.dialog_generator.bart import BartDialogGenerator
generator = BartDialogGenerator()
generator.verbose = False
_epoch = 6
generator.set_device(device=device)
_model_dir = dataset_dir + "/model/bart/v2/condition/{dataset_name}/epoch_{_epoch}".format(dataset_name=dataset_name, _epoch=_epoch)
generator.load_model(model_dir=_model_dir)

# Retriever: poly-encoder dialogue retriever; its top result is used as the generation condition below.
# Note that `_epoch` and `_model_dir` are rebound here for the retriever checkpoint.
from transformer.services.dialog_retriever.poly_encoder import PolyEncoderDialogRetriever
retriever = PolyEncoderDialogRetriever()
retriever.verbose = False
_epoch = 34
retriever.set_device(device=device)
_model_dir = dataset_dir + "/model/poly_encoder/v2/{dataset_name}/epoch_{_epoch}/".format(dataset_name=dataset_name, _epoch=_epoch)
retriever.load_model(model_dir=_model_dir)

#### manual input test

In [None]:
# Hand-written dialogue context used to compare unconditioned vs. conditioned generation.
utterances = [
    "하루가 너무 피곤하네요",
    "너무 피곤하지 않으셨으면 좋게썽요",
    "그럴 수 있을까요? 그치만 업무가 너무 많아서요",
    "업무가 너무 많아서 힘드시군요",
    "네... 도무지 끝날 기미가 안보이고 계속 일이 들어와요",
    "너무 안타깝네요 빨리 업무가 익숙해져야 할텐데요,,",
    "제가 익숙하지 않아서 그런걸까요? 보면 선배들은 빨리빨리 하긴 하더라구요.",
    "어떤 점이 가장 힘드신가요?",
    "계속 멀어지는 것 같고, 그러다보니 외로워서요"
 ]
# Alternating speaker ids starting with 1: 1, 0, 1, 0, ...
speaker_ids = [(i+1)%2 for i in range(len(utterances))]
min_length = 10
top_n = 5

# Retrieve candidates; the first element of the top candidate becomes the condition.
outputs = retriever.infer_next_utterance(utterances=utterances, speaker_ids=speaker_ids,
                                       min_length=min_length, top_n=top_n, weight_bm25=False,
                                       prev_utterance=None, intersection_tolerance=0.9, max_retry=5)
print("condition:", outputs[0])
condition = outputs[0][0]

# Non-condition
outputs = generator.infer_next_utterance_beam_search(utterances=utterances, speaker_ids=speaker_ids, conditions=None,
                                                   min_length=min_length, top_n=top_n, repetition_penalty=2.0, no_repeat_ngram_size=3,
                                                   beam_size=10, prev_utterance=None, intersection_tolerance=0.9, max_retry=5, return_probs=True)
non_condition_output = outputs[0][0]
print("non-condition-gen:", non_condition_output)

# Condition
# Same generation call, but with the retrieved utterance passed as the single condition.
outputs = generator.infer_next_utterance_beam_search(utterances=utterances, speaker_ids=speaker_ids, conditions=[condition],
                                                   min_length=min_length, top_n=top_n, repetition_penalty=2.0, no_repeat_ngram_size=3,
                                                   beam_size=10, prev_utterance=None, intersection_tolerance=0.9, max_retry=5, return_probs=True)
condition_output = outputs[0][0]
print("condition-gen:", condition_output)

#### dataset test

In [None]:
from transformer.data.utils import simplify_speaker_ids
from transformer.utils.common import get_last_index

def show_dataset_result(dataset, begin_idx=0, verbose=True, min_length=10, top_n=5):
    """Yield retrieval / generation results for rows of ``dataset.raw_data``.

    Each row's dialogue is cut after the index returned by
    ``get_last_index(speaker_ids, value=1)``; the utterances after the cut are the
    reference. The notebook-level ``retriever`` supplies a condition and the
    notebook-level ``generator`` produces one reply without and one reply with it.

    Args:
        dataset: object exposing ``raw_data``, a sequence of rows with
            "utterances" and "speaker_ids" entries.
        begin_idx: index of the first row to process.
        verbose: when True, print the context, reference and every model output.
        min_length: forwarded to the retriever and generator (default matches the
            value that used to be hard-coded).
        top_n: forwarded to the retriever and generator (default matches the value
            that used to be hard-coded).

    Yields:
        One item per row: a dict with keys "reference", "condition",
        "non-condition-gen" and "condition-gen", or ``None`` when retrieval or
        generation raised for that row.
    """
    # Beam-search arguments shared by the unconditioned and the conditioned call.
    beam_search_kwargs = dict(min_length=min_length, top_n=top_n, repetition_penalty=2.0, no_repeat_ngram_size=3,
                              beam_size=10, prev_utterance=None, intersection_tolerance=0.9, max_retry=5, return_probs=True)

    for row_idx in range(begin_idx, len(dataset.raw_data)):
        row = dataset.raw_data[row_idx]
        _utterances = row["utterances"]
        _speaker_ids = simplify_speaker_ids(row["speaker_ids"], user_id=1, model_id=0)
        last_index = get_last_index(_speaker_ids, value=1)
        utterances = _utterances[:last_index+1]
        speaker_ids = _speaker_ids[:last_index+1]
        reference = _utterances[last_index+1:]

        if verbose:
            print("# {}".format(row_idx))
            for speaker_id, utterance in zip(speaker_ids, utterances):
                print("{}: {}".format(speaker_id, utterance))
            print("\nreference:", reference)

        output = None
        try:
            outputs = retriever.infer_next_utterance(utterances=utterances, speaker_ids=speaker_ids,
                                                     min_length=min_length, top_n=top_n, weight_bm25=False,
                                                     prev_utterance=None, intersection_tolerance=0.9, max_retry=5)
            if verbose: print("condition:", outputs[0])
            condition = outputs[0][0]

            # Non-condition
            outputs = generator.infer_next_utterance_beam_search(utterances=utterances, speaker_ids=speaker_ids,
                                                                 conditions=None, **beam_search_kwargs)
            non_condition_output = outputs[0][0]
            if verbose: print("non-condition-gen:", non_condition_output)

            # Condition
            outputs = generator.infer_next_utterance_beam_search(utterances=utterances, speaker_ids=speaker_ids,
                                                                 conditions=[condition], **beam_search_kwargs)
            condition_output = outputs[0][0]
            if verbose: print("condition-gen:", condition_output)

            output = {
                "reference": reference,
                "condition": condition,
                "non-condition-gen": non_condition_output,
                "condition-gen": condition_output
            }
        except Exception as exc:
            # Best effort: a failing row yields None. Only Exception is caught so that
            # KeyboardInterrupt / GeneratorExit are not swallowed.
            if verbose: print("row {} failed: {!r}".format(row_idx, exc))

        # The yield sits outside the try block: the original bare `except:` around the
        # yield caught GeneratorExit and yielded again, which made closing the generator
        # raise "RuntimeError: generator ignored GeneratorExit".
        yield output
        
# Create the (lazy) result generator starting at row 62 of the test split; nothing runs until next(gen).
gen = show_dataset_result(dataset=test_dataset, begin_idx=62)

In [None]:
# Process and print the next row; re-run this cell to step through the dataset one row at a time.
gen_output = next(gen)

In [None]:
# Bucket every test row into one of four cases by token overlap:
#   case 1: generation ~ condition      and generation matches the reference
#   case 2: generation ~ condition      and generation misses the reference
#   case 3: generation differs from the condition and matches the reference
#   case 4: generation differs from the condition and misses the reference
# Cases 1/2 were annotated in the original as a sign of excessive alpha blending.
OVERLAP_THRESHOLD = 0.7


def _overlap_ratio(base_tokens, other_tokens):
    """Return the fraction of ``base_tokens`` also found in ``other_tokens`` (0.0 if ``base_tokens`` is empty)."""
    if not base_tokens:
        # Guard: an empty token set (e.g. a row with no reference) would divide by zero.
        return 0.0
    return len(base_tokens & other_tokens) / len(base_tokens)


test_gen = show_dataset_result(dataset=test_dataset, begin_idx=0, verbose=False)

cnt_list = [0 for i in range(4)]
item_list = [[] for i in range(4)]

for gen_output in tqdm(test_gen, initial=0, total=len(test_dataset.raw_data)):
    # Rows for which retrieval / generation failed are yielded as None.
    if gen_output is None: continue

    reference = gen_output["reference"]
    condition = gen_output["condition"]
    non_condition_gen = gen_output["non-condition-gen"]
    condition_gen = gen_output["condition-gen"]

    # Compare at the level of unique token ids.
    reference_token_set = set(tokenizer.encode(" ".join(reference)))
    condition_token_set = set(tokenizer.encode(condition))
    non_condition_gen_token_set = set(tokenizer.encode(non_condition_gen))
    condition_gen_token_set = set(tokenizer.encode(condition_gen))

    condition_gen_intersection = _overlap_ratio(condition_token_set, condition_gen_token_set)
    reference_cond_gen_intersection = _overlap_ratio(reference_token_set, condition_gen_token_set)
    # Not used for bucketing; kept so it can be inspected after the loop.
    reference_non_cond_gen_intersection = _overlap_ratio(reference_token_set, non_condition_gen_token_set)

    # case index: 0/1 when generation ~ condition, 2/3 otherwise; even = reference matched, odd = missed.
    case_idx = (0 if condition_gen_intersection > OVERLAP_THRESHOLD else 2) \
        + (0 if reference_cond_gen_intersection > OVERLAP_THRESHOLD else 1)
    cnt_list[case_idx] += 1
    item_list[case_idx].append(gen_output)

print(cnt_list)

#### dataset validation

In [None]:
# Collect the rows of the chosen dataset whose joined utterances contain every target string.
dataset = train_dataset
target_str_list = [
#     "주위에 지금 감정을 나눌 수 있는 사람이",
#     "회사에서 겪는 대인관계"
    "사람들은 눈에 보여야 인지하는 경향이"
]
targets = []
for row in dataset.raw_data:
    utterances = row["utterances"]
    concated = " ".join(utterances)

    # A row matches only when all target strings occur in its concatenated text.
    flag = all(target_str in concated for target_str in target_str_list)
    if flag:
        targets.append(row)
print("len:", len(targets))

In [None]:
# four_n2x8_one -> epoch_3가 베스트 (2)
# selectstar_n2x8_one -> epoch_3가 베스트 (3)
# four_n2x8_both -> epoch_7 > 4/5/6가 베스트 (1)
# selectstar_n2x8_both -> epoch_4가 베스트 (4)

In [None]:
# input_batch = ["<s>It <mask> retriever. My <mask> cute </s>", ... ]
# decoder_input_batch = ["</s><s>My dog is cute. It is a golden retriever", ...]
# labels_batch = ["<s>My dog is cute. It is a golden retriever</s>", ...]

### Compute Metrics

In [None]:
# Generator used for the metric computation; note the checkpoint directory differs from the
# earlier test cells ("condition_bm25" instead of "condition").
from transformer.services.dialog_generator.bart import BartDialogGenerator
generator = BartDialogGenerator()
generator.verbose = False
_epoch = 6
generator.set_device(device=device)
_model_dir = dataset_dir + "/model/bart/v2/condition_bm25/{dataset_name}/epoch_{_epoch}".format(dataset_name=dataset_name, _epoch=_epoch)
generator.load_model(model_dir=_model_dir)

# Retriever supplying the conditions; `_epoch` and `_model_dir` are rebound for its checkpoint.
from transformer.services.dialog_retriever.poly_encoder import PolyEncoderDialogRetriever
retriever = PolyEncoderDialogRetriever()
retriever.verbose = False
_epoch = 34
retriever.set_device(device=device)
_model_dir = dataset_dir + "/model/poly_encoder/v2/{dataset_name}/epoch_{_epoch}/".format(dataset_name=dataset_name, _epoch=_epoch)
retriever.load_model(model_dir=_model_dir)

In [None]:
from transformer.data.utils import simplify_speaker_ids
from transformer.utils.common import get_last_index
from KoBERTScore import BERTScore
from transformer.models.utils import compute_bleu, compute_meteor, compute_rouge, compute_hits, compute_semantic_score

def get_metric_inputs(dataset, min_length=1, top_n=10):
    """Yield, for every row of ``dataset.raw_data``, the texts needed to score generation.

    Each row's dialogue is cut after the index returned by
    ``get_last_index(speaker_ids, value=1)``; the utterances after the cut, joined by
    spaces, form the reference. The notebook-level ``generator`` produces a reply
    without a condition, the notebook-level ``retriever`` supplies a condition, and the
    generator then produces a reply with that condition.

    Args:
        dataset: object exposing ``raw_data``, a sequence of rows with
            "utterances" and "speaker_ids" entries.
        min_length: forwarded to the retriever and generator.
        top_n: forwarded to the retriever and generator.

    Yields:
        One item per row: a dict with keys "context", "reference", "condition",
        "non_condition_prediction" and "condition_prediction", or ``None`` when
        retrieval or generation raised for that row.
    """
    # Beam-search arguments shared by the unconditioned and the conditioned call.
    beam_search_kwargs = dict(min_length=min_length, top_n=top_n, repetition_penalty=2.0, no_repeat_ngram_size=3,
                              beam_size=10, prev_utterance=None, intersection_tolerance=0.9, max_retry=5, return_probs=True)

    for row in dataset.raw_data:
        _utterances = row["utterances"]
        _speaker_ids = simplify_speaker_ids(row["speaker_ids"], user_id=1, model_id=0)
        last_index = get_last_index(_speaker_ids, value=1)
        utterances = _utterances[:last_index+1]
        speaker_ids = _speaker_ids[:last_index+1]
        reference = " ".join(_utterances[last_index+1:])

        output = None
        try:
            context = " ".join(utterances)

            # non_condition_prediction: first element of the top candidate
            outputs = generator.infer_next_utterance_beam_search(utterances=utterances, speaker_ids=speaker_ids,
                                                                 conditions=None, **beam_search_kwargs)
            non_condition_prediction = outputs[0][0]

            # condition_prediction: retrieve a condition, then generate with it
            outputs = retriever.infer_next_utterance(utterances=utterances, speaker_ids=speaker_ids,
                                                     min_length=min_length, top_n=top_n, weight_bm25=False,
                                                     prev_utterance=None, intersection_tolerance=0.9, max_retry=5)
            condition = outputs[0][0]

            outputs = generator.infer_next_utterance_beam_search(utterances=utterances, speaker_ids=speaker_ids,
                                                                 conditions=[condition], **beam_search_kwargs)
            condition_prediction = outputs[0][0]

            output = {
                "context": context,
                "reference": reference,
                "condition": condition,
                "non_condition_prediction": non_condition_prediction,
                "condition_prediction": condition_prediction
            }
        except Exception:
            # Best effort: a failing row yields None. Only Exception is caught so that
            # KeyboardInterrupt / GeneratorExit are not swallowed.
            pass

        # The yield sits outside the try block: the original bare `except:` around the
        # yield caught GeneratorExit and yielded again, which made closing the generator
        # raise "RuntimeError: generator ignored GeneratorExit".
        yield output

In [None]:
def compute_scores(model, tokenizer, predictions, references):
    """Compute the configured text metrics for ``predictions`` against ``references``.

    Which metrics are computed and reported is controlled by the notebook-level globals
    ``metrics``, ``bleu_ngrams`` and ``rouge_types``, which must be defined before the
    call.

    Args:
        model: object whose ``metrics`` mapping holds the metric object for each
            requested metric name ("bleu", "meteor", "rouge", "semantic_score").
        tokenizer: forwarded to the project's ``compute_*`` helpers.
        predictions: generated texts.
        references: reference texts, aligned with ``predictions``.

    Returns:
        dict mapping display names ("BLEU-n", "METEOR", "ROUGE-x", "BERTScore") to
        scores rounded to 4 decimals.

    Raises:
        ValueError: if "hits" is requested. This helper never computes hits scores
            (the original branch read an undefined ``hits_k`` and a missing
            ``_scores["hits"]`` entry and therefore always failed).
    """
    if "hits" in metrics:
        raise ValueError("'hits' is not supported by compute_scores: it needs ranked candidates, not predictions")

    # Raw metric outputs, computed only for the requested metrics.
    metric_functions = {
        "bleu": compute_bleu,
        "meteor": compute_meteor,
        "rouge": compute_rouge,
        "semantic_score": compute_semantic_score,
    }
    _scores = {
        metric_name: metric_function(metric=model.metrics[metric_name], tokenizer=tokenizer,
                                     predictions=predictions, references=references)
        for metric_name, metric_function in metric_functions.items()
        if metric_name in metrics
    }

    scores = dict()
    if "bleu" in metrics:
        # "precisions" is indexed by n-gram order minus one.
        _bleu_scores = _scores["bleu"]["precisions"]
        for ngram in bleu_ngrams:
            scores["BLEU-{n}".format(n=ngram)] = round(_bleu_scores[ngram-1], 4)
    if "meteor" in metrics:
        scores["METEOR"] = round(_scores["meteor"]["meteor"], 4)
    if "rouge" in metrics:
        for r in rouge_types:
            key = "rouge{r}".format(r=r)
            # Rouge types missing from the result are skipped silently.
            if key in _scores["rouge"]:
                scores["ROUGE-{r}".format(r=r)] = round(_scores["rouge"][key].mid.fmeasure, 4)
    if "semantic_score" in metrics:
        # The original `"BERTScore".format(name_or_path=...)` had no placeholder: it was a no-op
        # that only added a dependency on the global `name_or_path`.
        scores["BERTScore"] = round(_scores["semantic_score"], 4)

    return scores

In [None]:
# Metric configuration read (as globals) by the notebook-level compute_scores helper.
metrics = ["bleu", "meteor", "rouge", "semantic_score"]
bleu_ngrams = [3, 4]
rouge_types = ["1", "2", "L"]
name_or_path = "beomi/kcbert-base"
decoding_method = "beam_search"
min_length = 10
top_n = 5

metric_input_gen = get_metric_inputs(dataset=test_dataset, min_length=min_length, top_n=top_n)

# Collect aligned references and predictions; rows that failed are yielded as None and skipped.
non_condition_predictions = []
condition_predictions = []
references = []
# get_metric_inputs yields exactly one item per row, so the row count is the progress total.
for gen_output in tqdm(metric_input_gen, total=len(test_dataset.raw_data)):
    if gen_output is None: continue
    references.append(gen_output["reference"])
    non_condition_predictions.append(gen_output["non_condition_prediction"])
    condition_predictions.append(gen_output["condition_prediction"])

non_condition_scores = compute_scores(model=bart, tokenizer=tokenizer, predictions=non_condition_predictions, references=references)
# Fix: the original printed the undefined name `non_condition_score` (NameError).
print("non_condition_scores:", non_condition_scores)

condition_scores = compute_scores(model=bart, tokenizer=tokenizer, predictions=condition_predictions, references=references)
print("condition_scores:", condition_scores)