In [1]:
import os
# Set before the tokenizer-related imports in the following cells so the value is
# already in the environment when they load. "false" is intended to switch off the
# HuggingFace tokenizers' internal parallelism (NOTE(review): confirm it is still needed).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from setproctitle import setproctitle
# Give the kernel process a recognizable title.
setproctitle("Hodong_GPT2")

In [2]:
import json
import torch
from tqdm import tqdm
import numpy as np
from transformers import PreTrainedTokenizerFast
from transformer.data.generator_dataset import GptDatasetFromDir, GeneratorDataLoader
from transformer.tokenizer.utils import make_custom_tokenizer_from_pretrained, load_tokenizer_from_pretrained
from transformer.models.interface import TrainHistory
from transformer.models.gpt import Gpt2
from transformer.models.utils import load_state_dict, get_score_json
from transformer.utils.common import set_device, convert_to_tensor, init_path

### Set Working Directory

In [3]:
# Root directory that holds the tokenizers, datasets and model checkpoints.
# Override it per machine with the DATASET_DIR environment variable instead of
# editing this cell. Roots that have been used so far:
#   AIBUD_DEV      : /Users/aibud_dev/_jupyter
#   Korea_Server   : /home/mnt/guest1      (default)
#   bigshane_local : D:\_jupyter
#   AWS            : /home/ubuntu/data
dataset_dir = os.environ.get("DATASET_DIR", "/home/mnt/guest1")

# JSON file with the project's file paths; every machine used the same relative location.
path = "./config/file_path.json"
# Reset first so a stale value from an earlier run cannot survive a failed load.
file_path = None
with open(path, "r", encoding="utf-8") as fp:
    file_path = json.load(fp)

### Load Tokenizer

In [4]:
# Local directory that holds the saved tokenizer files.
tokenizer_file_path = "{}/huggingface_tokenizer/kor/kogpt2-vanila".format(dataset_dir)

# One-time setup, kept for reference: build the tokenizer from the
# "skt/kogpt2-base-v2" checkpoint and save it to the directory above.
# tokenizer_path = "skt/kogpt2-base-v2"
# add_special_token = True
# tokenizer = make_custom_tokenizer_from_pretrained(model_type="gpt2", name_or_path=tokenizer_path, add_special_token=add_special_token)
# tokenizer.save_pretrained(tokenizer_file_path)

# Load the locally saved tokenizer and report its vocabulary size.
tokenizer = load_tokenizer_from_pretrained(name_or_path=tokenizer_file_path, model_type="gpt2")
print("vocab_size: {}".format(len(tokenizer)))

loaded pretrained huggingface_tokenizer: 'D:\_jupyter/huggingface_tokenizer/kor/kogpt2-vanila'
vocab_size: 51200


### Load Dataset & DataLoader

In [5]:
# Settings shared by every split.
timesteps = 128
batch_size = 64
nprocs = 1
device = torch.device("cuda:3") if torch.cuda.is_available() else torch.device("cpu")
# Conditioning options; only the training split receives them below,
# validation and test are built with use_condition=False and alpha_blending=-1.
use_condition = True
alpha_blending = 0.5

dataset_name = "four_n2x8_both"
# All split directories live under one dataset-specific root.
_condition_root = "{}/dataset/preprocessed/dialog_finetuning/kor/condition/{}/".format(dataset_dir, dataset_name)
total_data_dir = _condition_root
sample_data_dir = _condition_root + "sample/"
train_data_dir = _condition_root + "train/"
val_data_dir = _condition_root + "val/"
test_data_dir = _condition_root + "test/"


def _build_split(data_dir, split_use_condition, split_alpha_blending):
    """Create the dataset for one split directory together with its data loader."""
    split_dataset = GptDatasetFromDir(
        data_dir=data_dir,
        tokenizer=tokenizer,
        timesteps=timesteps,
        batch_size=batch_size,
        device=device,
        nprocs=nprocs,
        use_condition=split_use_condition,
        alpha_blending=split_alpha_blending,
    )
    split_loader = GeneratorDataLoader(dataset=split_dataset, batch_size=batch_size, device=device)
    return split_dataset, split_loader


train_dataset, train_data_loader = _build_split(train_data_dir, use_condition, alpha_blending)
val_dataset, val_data_loader = _build_split(val_data_dir, False, -1)
test_dataset, test_data_loader = _build_split(test_data_dir, False, -1)

Preprocessing data: 100%|█████████████████████████████████████████████████████████| 3907/3907 [00:06<00:00, 593.76it/s]


In [None]:
# Run the loader's project-defined check() on the test split.
# NOTE(review): presumably a sanity check of the produced batches — confirm in GeneratorDataLoader.
test_data_loader.check()

### Define Model

In [7]:
# Build the model with a vocabulary size equal to the tokenizer's length,
# then ask the model for its optimizer at a learning rate of 5e-5.
gpt2 = Gpt2(vocab_size=len(tokenizer))
optimizer = gpt2.get_optimizer(lr=5e-5)

# set_device is a project helper applied to both objects with the same target device.
# NOTE(review): presumably it moves model weights / optimizer state to `device` — confirm.
gpt2 = set_device(gpt2, device=device)
optimizer = set_device(optimizer, device=device)

'temp_dir' has been set to './20210927_155813/' to save model while training
Setting model device: cuda:0


### Fit

In [None]:
def count_parameters(model):
    """Return the total element count of the model's parameters that require gradients."""
    trainable_total = 0
    for param in model.parameters():
        # Parameters with requires_grad=False are left out of the count.
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total
# Count the trainable parameters of the model built in this notebook.
# (The previous argument `bart` is not defined anywhere in this notebook.)
count_parameters(gpt2)

In [None]:
# Number of training epochs and the directory that receives one checkpoint per epoch.
epoch = 30
model_dir = "{}/model/gpt2/v2/non_condition/{}/".format(dataset_dir, dataset_name)
init_path(model_dir, True)

# Scoring configuration forwarded to gpt2.compute_scores.
metrics = ["bleu", "meteor", "rouge", "semantic_score"]
bleu_ngrams = [3, 4]
rouge_types = ["1", "2", "L"]
name_or_path = "beomi/kcbert-base"
decoding_method = "beam_search"

# Keyword arguments shared by every compute_scores call in this cell.
_score_kwargs = dict(
    metrics=metrics,
    device=device,
    tokenizer=tokenizer,
    timesteps=timesteps,
    bleu_ngrams=bleu_ngrams,
    rouge_types=rouge_types,
    name_or_path=name_or_path,
    decoding_method=decoding_method,
)


def _run_phase(mode, data_loader, train, idx):
    """Run one pass over ``data_loader``, attach the metric scores, and return (history, summary line)."""
    phase_history = gpt2.iteration_epoch(data_loader=data_loader, optimizer=optimizer, device=device, train=train, verbose_per_batch=-1)
    phase_scores = gpt2.compute_scores(data_loader=data_loader, **_score_kwargs)
    for metric_name, metric_value in phase_scores.items():
        phase_history._add_acc(name=metric_name, value=metric_value)
    summary = gpt2.verbose_template.format(mode=mode, device=device, idx=idx, num_iters=epoch) + str(phase_history)
    return phase_history, summary


train_history = TrainHistory()
val_history = TrainHistory()
for _epoch in range(1, epoch + 1):
    # Training pass followed by scoring on the training loader.
    epoch_train_history, epoch_train_history_str = _run_phase("Epoch_train", train_data_loader, True, _epoch)
    print(epoch_train_history_str)
    train_history += epoch_train_history

    # Validation pass (train=False) followed by scoring on the validation loader.
    epoch_val_history, epoch_val_history_str = _run_phase("Epoch_val", val_data_loader, False, _epoch)
    print(epoch_val_history_str)
    val_history += epoch_val_history

    # Save a checkpoint for this epoch and append both summary lines to the run log.
    gpt2.save(path="{}epoch_{}/".format(model_dir, _epoch), optimizer=optimizer, tokenizer=tokenizer)
    with open(model_dir + "log.txt", "a", encoding="utf-8") as fp:
        fp.write("{}\n{}\n".format(epoch_train_history_str, epoch_val_history_str))

### Compute Scores

In [1]:
# Score every saved checkpoint (epochs 1..epoch) on the test split.
epoch = 20
metrics = ["bleu", "meteor", "rouge", "semantic_score"]
bleu_ngrams = [3, 4]
rouge_types = ["1", "2", "L"]
name_or_path = "beomi/kcbert-base"
decoding_method = "beam_search"
model_name = "KoGPT2"

# NOTE(review): the training cell saves checkpoints under ".../v2/non_condition/<dataset>/",
# while this cell reads from ".../v2/<dataset>/" — confirm which directory is intended.
model_dir = dataset_dir + "/model/gpt2/v2/{dataset_name}/".format(dataset_name=dataset_name)
log_dir = dataset_dir + "/essay/gpt2/ft_o_rt_o/condition/{dataset_name}/".format(dataset_name=dataset_name)
init_path(log_dir, reset=True)
for _epoch in range(1, epoch+1):
    # Load this epoch's checkpoint into the existing model object and score it.
    gpt2 = load_state_dict(object=gpt2, path=model_dir+"epoch_{}/".format(_epoch))
    scores = gpt2.compute_scores(metrics=metrics, data_loader=test_data_loader, device=device, tokenizer=tokenizer, timesteps=timesteps, bleu_ngrams=bleu_ngrams, rouge_types=rouge_types, name_or_path=name_or_path, decoding_method=decoding_method)
    output_json = get_score_json(model_name=model_name, dataset_name=dataset_name, test_data_size=len(test_data_loader.dataset), batch_size=batch_size, scores=scores)

    # verbose & append log: the scores are recorded as accuracies, with no losses and lr=-1
    eval_history = TrainHistory()
    loss_dict = dict()
    acc_dict = dict(scores)
    eval_history.update(loss_dict=loss_dict, acc_dict=acc_dict, lr=-1)
    eval_str = gpt2.verbose_template.format(mode="Eval", device=device, idx=_epoch, num_iters=epoch) + str(eval_history)
    print(eval_str)

    with open(log_dir + "/score_logs.txt", "a", encoding="utf-8") as fp:
        fp.write(eval_str + "\n")

    # write detailed logs
    init_path(log_dir + "/detailed/", reset=False)
    with open(log_dir + "/detailed/score_logs_{_epoch}.json".format(_epoch=_epoch), "w", encoding="utf-8") as fp:
        # json.dump writes into the file; the former json.dumps only built a string
        # that was thrown away, leaving every detailed log file empty.
        json.dump(output_json, fp, ensure_ascii=False, indent=2)

NameError: name 'dataset_dir' is not defined

In [None]:
# Evaluate one saved checkpoint on the test split without training (train=False).
_epoch = 5
model_dir = "{}/model/gpt2/v2/condition/{}/".format(dataset_dir, dataset_name)
_checkpoint_dir = "{}epoch_{}/".format(model_dir, _epoch)
gpt2 = load_state_dict(object=gpt2, path=_checkpoint_dir)
epoch_val_history = gpt2.iteration_epoch(
    data_loader=test_data_loader,
    optimizer=optimizer,
    device=device,
    train=False,
    verbose_per_batch=-1,
)
print(epoch_val_history)

### Test Service

In [None]:
from transformer.services.dialog_generator.gpt2 import Gpt2DialogGenerator

# Create the dialog-generation service with verbose output switched off and
# place it on the same device as the rest of the notebook.
service = Gpt2DialogGenerator()
service.verbose = False
service.set_device(device=device)

# Checkpoint to serve. The path previously contained an accidental double slash
# ("condition//"); it now matches the layout used by the evaluation cell.
_epoch = 5
_model_dir = dataset_dir + "/model/gpt2/v2/condition/{dataset_name}/epoch_{_epoch}".format(dataset_name=dataset_name, _epoch=_epoch)
service.load_model(model_dir=_model_dir)

In [None]:
# Dialogue history passed to the service. The commented-out entries are further
# turns that can be enabled to lengthen the conversation.
utterances = [
    "안녕하세요",
#     "무슨 일로 저에게 상담을 신청하셨나요?",
#     "요즘 인간관계가 고민이에요.",
#     "어떤 고민이죠?",
#     "친구들이랑 연락도 뜸해지고 자주 못만나서 서먹해지는 것 같아요",
#     "이래저래 연락하기 힘드신가봐요",
#     "네, 코로나 때문에 만나질 못해서 더 혼자가 된 느낌이에요.",
#     "저도 지쳐요.",
#     "당신도 사람들을 자주 못 만나시나봐요"
]
# Speaker ids alternate 1, 0, 1, ... starting with 1 for the first utterance.
speaker_ids = [(turn + 1) % 2 for turn, _ in enumerate(utterances)]

# Beam-search decoding options for this request.
_beam_search_options = dict(
    conditions=None,
    min_length=10,
    top_n=5,
    repetition_penalty=2.0,
    no_repeat_ngram_size=3,
    beam_size=10,
    prev_utterance=None,
    intersection_tolerance=0.9,
    max_retry=5,
    return_probs=True,
)
outputs = service.infer_next_utterance_beam_search(utterances=utterances, speaker_ids=speaker_ids, **_beam_search_options)
# Display the first field of the first returned candidate.
outputs[0][0]

In [None]:
# Dialogue history passed to the service. The commented-out entries are further
# turns that can be enabled to lengthen the conversation.
utterances = [
    "오늘 하루가 정말 피곤하네요",
#     "무슨 일이 있으셨죠? 오늘 하루는 어떠셨어요?",
#     "회사에서 일이 너무 많았어요.",
#     "많이 힘드셨겠어요 힘내세요",
#     "고마워요. 게다가 요즘 상사에게 자꾸 혼나요.",
#     "왜  혼나는 일들이 쌓이셨나요?",
#     "저번에 시키신 일을 제대로 못했거든요.",
#     "어떤 일이 있었는지 말해주실 수 있나요?",
#     "제가 서류를 잘못 가져다드렸어요."
]
# Speaker ids alternate 1, 0, 1, ... starting with 1 for the first utterance.
speaker_ids = [(turn + 1) % 2 for turn, _ in enumerate(utterances)]

# Beam-search decoding options for this request.
_beam_search_options = dict(
    conditions=None,
    min_length=10,
    top_n=5,
    repetition_penalty=2.0,
    no_repeat_ngram_size=3,
    beam_size=10,
    prev_utterance=None,
    intersection_tolerance=0.9,
    max_retry=5,
    return_probs=True,
)
outputs = service.infer_next_utterance_beam_search(utterances=utterances, speaker_ids=speaker_ids, **_beam_search_options)
# Display the first field of the first returned candidate.
outputs[0][0]

In [23]:
# Dialogue history used as the prompt. The commented-out entries are further
# turns that can be enabled to lengthen the conversation.
utterances = [
    "안녕하세요",
#     "무슨 일로 저에게 상담을 신청하셨나요?",
#     "요즘 인간관계가 고민이에요.",
#     "어떤 고민이죠?",
#     "친구들이랑 연락도 뜸해지고 자주 못만나서 서먹해지는 것 같아요",
#     "이래저래 연락하기 힘드신가봐요",
#     "네, 코로나 때문에 만나질 못해서 더 혼자가 된 느낌이에요.",
#     "저도 지쳐요.",
#     "당신도 사람들을 자주 못 만나시나봐요"
]

# Join the turns into one prompt, encode it, and append the BOS token id.
text = " ".join(utterances)
input_ids = tokenizer.encode(text) + [tokenizer.bos_token_id]
input_ids = convert_to_tensor([input_ids], device=device)

# Length of the prompt; the decoded slice below starts at this position.
begin_idx = len(input_ids[0])
_generation_options = dict(
    max_length=timesteps,
    min_length=15,
    no_repeat_ngram_size=3,
    num_beams=10,
    early_stopping=True,
)
beam_output = gpt2.generate(input_ids=input_ids, **_generation_options)

# Take the first returned sequence, skipping the prompt and the final token, and decode it.
_generated_ids = beam_output[:, begin_idx:-1].tolist()[0]
tokenizer.decode(_generated_ids, skip_special_tokens=True)

'말하신대로 해볼게요. 제가 할 수 있을까요?'

In [None]:
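# Best checkpoint per dataset (the number in parentheses is kept from the original notes):
# four_n2x8_one        -> epoch_3 is best (1)
# selectstar_n2x8_one  -> epoch_4 is best (2)
# four_n2x8_both       -> epoch_3 is best (4)
# selectstar_n2x8_both -> epoch_2 is best (3)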

### Compute Metrics

In [None]:
from transformer.services.dialog_generator.bart import BartDialogGenerator

# BART dialog generator: quiet mode, notebook device, checkpoint of epoch 6.
_epoch = 6
generator = BartDialogGenerator()
generator.verbose = False
generator.set_device(device=device)
_model_dir = "{}/model/bart/v2/condition_bm25/{}/epoch_{}".format(dataset_dir, dataset_name, _epoch)
generator.load_model(model_dir=_model_dir)

from transformer.services.dialog_retriever.poly_encoder import PolyEncoderDialogRetriever

# Poly-encoder dialog retriever: quiet mode, notebook device, checkpoint of epoch 34.
_epoch = 34
retriever = PolyEncoderDialogRetriever()
retriever.verbose = False
retriever.set_device(device=device)
_model_dir = "{}/model/poly_encoder/v2/{}/epoch_{}/".format(dataset_dir, dataset_name, _epoch)
retriever.load_model(model_dir=_model_dir)

In [12]:
from transformer.data.utils import simplify_speaker_ids
from transformer.utils.common import get_last_index
from KoBERTScore import BERTScore
from transformer.models.utils import compute_bleu, compute_meteor, compute_rouge, compute_hits, compute_semantic_score

def get_metric_inputs(dataset, min_length=1, top_n=10):
    """Yield one evaluation record per dialogue in ``dataset.raw_data``.

    Each dialogue is split at the last turn whose simplified speaker id is 1
    (the ``user_id`` handed to ``simplify_speaker_ids``): the turns up to and
    including it form the context, the remaining turns are joined with spaces
    into the reference. The notebook-level ``generator`` and ``retriever``
    objects are then used to produce a prediction without a condition, a
    retrieved condition, and a prediction generated with that condition.

    Args:
        dataset: object exposing ``raw_data``, an indexable sequence of dicts
            with the keys ``"utterances"`` and ``"speaker_ids"``.
        min_length: forwarded to the generator and the retriever.
        top_n: forwarded to the generator and the retriever.

    Yields:
        A dict with the keys ``context``, ``reference``, ``condition``,
        ``non_condition_prediction`` and ``condition_prediction``; or ``None``
        when inference for that row raised an exception, so exactly one item
        is produced per row.
    """
    # Decoding options shared by both generator calls.
    beam_kwargs = dict(
        min_length=min_length, top_n=top_n, repetition_penalty=2.0, no_repeat_ngram_size=3,
        beam_size=10, prev_utterance=None, intersection_tolerance=0.9, max_retry=5, return_probs=True,
    )

    for row_idx in range(len(dataset.raw_data)):
        row = dataset.raw_data[row_idx]
        _utterances = row["utterances"]
        _speaker_ids = simplify_speaker_ids(row["speaker_ids"], user_id=1, model_id=0)
        last_index = get_last_index(_speaker_ids, value=1)
        utterances = _utterances[:last_index + 1]
        speaker_ids = _speaker_ids[:last_index + 1]
        reference = " ".join(_utterances[last_index + 1:])

        try:
            context = " ".join(utterances)

            # Prediction without any condition.
            outputs = generator.infer_next_utterance_beam_search(
                utterances=utterances, speaker_ids=speaker_ids, conditions=None, **beam_kwargs)
            non_condition_prediction = outputs[0][0]

            # Retrieve a condition, then generate again with it.
            outputs = retriever.infer_next_utterance(
                utterances=utterances, speaker_ids=speaker_ids,
                min_length=min_length, top_n=top_n, weight_bm25=False,
                prev_utterance=None, intersection_tolerance=0.9, max_retry=5)
            condition = outputs[0][0]

            outputs = generator.infer_next_utterance_beam_search(
                utterances=utterances, speaker_ids=speaker_ids, conditions=[condition], **beam_kwargs)
            condition_prediction = outputs[0][0]

            output = {
                "context": context,
                "reference": reference,
                "condition": condition,
                "non_condition_prediction": non_condition_prediction,
                "condition_prediction": condition_prediction
            }
        except Exception:
            # Best effort: a row whose inference fails is reported as None.
            # Only Exception is caught so KeyboardInterrupt and GeneratorExit propagate.
            output = None

        # The yield sits outside the try block: with the former bare ``except:``
        # around it, closing the generator raised "generator ignored GeneratorExit".
        yield output

dict_keys(['bleu', 'rouge1', 'rouge2', 'rougeL'])

In [None]:
def compute_scores(model, tokenizer, predictions, references):
    """Compute the metrics selected in the notebook-level ``metrics`` list.

    Reads the notebook-level settings ``metrics``, ``bleu_ngrams`` and
    ``rouge_types``. Every reported value is rounded to four decimals.

    Args:
        model: object whose ``metrics`` mapping holds the metric objects under
            the keys "bleu", "meteor", "rouge" and "semantic_score".
        tokenizer: forwarded unchanged to the metric helpers.
        predictions: generated texts.
        references: reference texts, aligned with ``predictions``.

    Returns:
        dict mapping display names ("BLEU-n", "METEOR", "ROUGE-x", "BERTScore")
        to rounded scores.

    Raises:
        NotImplementedError: if "hits" is requested. This helper never computed
            hit scores, so the request used to fail with an unrelated
            NameError/KeyError only after the other metrics had been computed.
    """
    if "hits" in metrics:
        raise NotImplementedError("'hits' is not supported by compute_scores: no hit scores are computed here")

    scores = dict()
    if "bleu" in metrics:
        bleu = compute_bleu(metric=model.metrics["bleu"], tokenizer=tokenizer, predictions=predictions, references=references)
        precisions = bleu["precisions"]
        for ngram in bleu_ngrams:
            # precisions is indexed by n-gram order minus one.
            scores["BLEU-{n}".format(n=ngram)] = round(precisions[ngram - 1], 4)
    if "meteor" in metrics:
        meteor = compute_meteor(metric=model.metrics["meteor"], tokenizer=tokenizer, predictions=predictions, references=references)
        scores["METEOR"] = round(meteor["meteor"], 4)
    if "rouge" in metrics:
        rouge = compute_rouge(metric=model.metrics["rouge"], tokenizer=tokenizer, predictions=predictions, references=references)
        for r in rouge_types:
            key = "rouge{r}".format(r=r)
            # Requested ROUGE types that are missing from the result are skipped.
            if key in rouge:
                scores["ROUGE-{r}".format(r=r)] = round(rouge[key].mid.fmeasure, 4)
    if "semantic_score" in metrics:
        semantic = compute_semantic_score(metric=model.metrics["semantic_score"], tokenizer=tokenizer, predictions=predictions, references=references)
        scores["BERTScore"] = round(semantic, 4)

    return scores

In [None]:
# Settings read by compute_scores and by the inference calls.
metrics = ["bleu", "meteor", "rouge", "semantic_score"]
bleu_ngrams = [3, 4]
rouge_types = ["1", "2", "L"]
name_or_path = "beomi/kcbert-base"
decoding_method = "beam_search"
min_length = 10
top_n = 5

metric_input_gen = get_metric_inputs(dataset=test_dataset, min_length=min_length, top_n=top_n)

non_condition_predictions = []
condition_predictions = []
references = []
# get_metric_inputs yields one item per row of raw_data, so the total is known
# and tqdm can show a real progress bar.
for gen_output in tqdm(metric_input_gen, total=len(test_dataset.raw_data)):
    # Rows whose inference failed are yielded as None and skipped.
    if gen_output is None:
        continue
    references.append(gen_output["reference"])
    non_condition_predictions.append(gen_output["non_condition_prediction"])
    condition_predictions.append(gen_output["condition_prediction"])

# NOTE(review): `bart` is not defined anywhere in the visible notebook; compute_scores
# only reads `model.metrics[...]` from it — confirm which model object should be passed.
non_condition_scores = compute_scores(model=bart, tokenizer=tokenizer, predictions=non_condition_predictions, references=references)
# Fixed: this previously printed the undefined name `non_condition_score`.
print("non_condition_score:", non_condition_scores)

condition_scores = compute_scores(model=bart, tokenizer=tokenizer, predictions=condition_predictions, references=references)
print("condition_scores:", condition_scores)