In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from setproctitle import setproctitle
setproctitle("Hodong_BART")

In [2]:
import json
import torch
from tqdm import tqdm
import numpy as np
from transformers import BartModel
from transformers import BartForConditionalGeneration
from transformers import PreTrainedTokenizerFast
from transformer.data.generator_dataset import BartDatasetFromDir, GeneratorDataLoader
from transformer.tokenizer.utils import make_custom_tokenizer_from_pretrained, load_tokenizer_from_pretrained
from transformer.models.interface import TrainHistory
from transformer.models.bart import Bart
from transformer.models.utils import load_state_dict
from transformer.utils.common import set_device, convert_to_tensor, init_path



### Set WorkingDirectory

In [3]:
# # AIBUD_DEV
# dataset_dir = "/Users/aibud_dev/_jupyter"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

# # Korea_Server
# dataset_dir = "/home/mnt/guest1"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

# # bigshane_local
# dataset_dir = "D:\_jupyter"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

# AWS
dataset_dir = "/home/ubuntu/data"
path = "./config/file_path.json"
file_path = None
with open(path, "r", encoding="utf-8") as fp:
    file_path = json.load(fp)

### Load Tokenizer

In [4]:
tokenizer_file_path = dataset_dir + "/huggingface_tokenizer/kor/kobart-customed"

# # save tokenizer to local
# tokenizer_path = "hyunwoongko/kobart"
# add_special_token = True
# tokenizer = make_custom_tokenizer_from_pretrained(model_type="bart", name_or_path=tokenizer_path, add_special_token=add_special_token)
# tokenizer.save_pretrained(tokenizer_file_path)

tokenizer = load_tokenizer_from_pretrained(model_type="bart", name_or_path=tokenizer_file_path)
print("vocab_size:", len(tokenizer))

for _id, _token in zip(tokenizer.all_special_ids, tokenizer.all_special_tokens):
    print(_id, ":", _token)

loaded pretrained huggingface_tokenizer: 'D:\_jupyter/huggingface_tokenizer/kor/kobart-customed'
vocab_size: 30012
0 : <s>
1 : </s>
5 : <unk>
3 : <pad>
6 : <mask>
30000 : <num>
30001 : <cls>
30002 : <sep>
30003 : <turn>
30004 : <tpc>
30005 : <situ>
30006 : <ctxt>
30007 : <cond>
30008 : <cand>
30009 : <pers>
30010 : <spk1>
30011 : <spk2>


### Load Dataset & DataLoader

In [5]:
timesteps = 128
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = 24
nprocs = 1
use_condition = True
alpha_blending = 0.5

total_data_dir = dataset_dir + "/dataset/preprocessed/dialog_finetuning/kor/selectstar_n2x8_one/"
sample_data_dir = dataset_dir + "/dataset/preprocessed/dialog_finetuning/kor/selectstar_n2x8_one/sample/"
train_data_dir = dataset_dir + "/dataset/preprocessed/dialog_finetuning/kor/selectstar_n2x8_one/train/"
val_data_dir = dataset_dir + "/dataset/preprocessed/dialog_finetuning/kor/selectstar_n2x8_one/val/"

train_dataset = BartDatasetFromDir(data_dir=train_data_dir, tokenizer=tokenizer, timesteps=timesteps, batch_size=batch_size, device=device, nprocs=nprocs, use_condition=use_condition, alpha_blending=alpha_blending)
train_data_loader = GeneratorDataLoader(dataset=train_dataset, batch_size=batch_size, device=device)

val_dataset = BartDatasetFromDir(data_dir=val_data_dir, tokenizer=tokenizer, timesteps=timesteps, batch_size=batch_size, device=device, nprocs=nprocs, use_condition=use_condition, alpha_blending=-1)
val_data_loader = GeneratorDataLoader(dataset=val_dataset, batch_size=batch_size, device=device)

Preprocessing data: 100%|████████████████████████████████████████████████████| 192784/192784 [02:08<00:00, 1503.05it/s]
Preprocessing data: 100%|██████████████████████████████████████████████████████| 36109/36109 [00:25<00:00, 1418.23it/s]


In [6]:
train_data_loader.check()

input_ids:	<ctxt> 요즘도 낚시를 즐기시나요? 예. 코로나 거리두기 4단계로 격상되고는 아예 가게 문닫고 낚시터에서 살아요. 저런 많이 힘드시겠어요. 저보다 제 집사람이 많이 힘들어 하죠. 그렇군요. 부인께서 살림을 하신다고 했죠?<cond>
decoder_input_ids:	<s> 예. 다행인건 집사람의 성격이 긍정적이라는 거에요.
labels:	예. 다행인건 집사람의 성격이 긍정적이라는 거에요.</s>

header: ['input_ids', 'token_type_ids', 'attention_mask', 'decoder_input_ids', 'decoder_token_type_ids', 'decoder_attention_mask', 'labels']
[[30006     1     1     0     0     1 14858]
 [16896     1     1 14858     0     1 19505]
 [ 9866     1     1 19505     0     1 12037]
 [22759     1     1 12037     0     1  9034]
 [16632     1     1  9034     0     1 14230]
 [23526     1     1 14230     0     1 17726]
 [11465     1     1 17726     0     1 12024]
 [ 9495     1     1 12024     0     1 25023]
 [17784     1     1 25023     0     1 23898]
 [14858     0     1 23898     0     1 14394]
 [14469     0     1 14394     0     1 14118]
 [10338     0     1 14118     0     1 11786]
 [ 9495     0     1 11786     0     1 14543]
 [18323     0     1 14543     0     1

### Define Model

In [7]:
bart = Bart(vocab_size=len(tokenizer))
optimizer = bart.get_optimizer(lr=5e-5)

bart = set_device(bart, device=device)
optimizer = set_device(optimizer, device=device)

'temp_dir' has been set to './20210912_194938/' to save model while training
Setting model device: cuda:0


### Fit

In [8]:
epoch = 5
model_dir = dataset_dir + "/model/bart/selectstar_n2x8_one/"
init_path(model_dir, True)

train_history = TrainHistory()
val_history = TrainHistory()
for _epoch in range(1, epoch+1):
    # train
    epoch_train_history = bart.iteration_epoch(data_loader=train_data_loader, optimizer=optimizer, device=device, train=True, verbose_per_batch=-1)
    epoch_train_history_str = bart.verbose_template.format(mode="Epoch_train", device=device, idx=_epoch, num_iters=len(train_data_loader)) + str(epoch_train_history)
    print(epoch_train_history_str)
    train_history += epoch_train_history
    
    # val
    epoch_val_history = bart.iteration_epoch(data_loader=val_data_loader, optimizer=optimizer, device=device, train=False, verbose_per_batch=-1)
    epoch_val_history_str = bart.verbose_template.format(mode="Epoch_val", device=device, idx=_epoch, num_iters=len(val_data_loader)) + str(epoch_val_history)
    print(epoch_val_history_str)
    val_history += epoch_val_history
    
    bart.save(path=model_dir + "epoch_{}/".format(_epoch), optimizer=optimizer, tokenizer=tokenizer)
    with open(model_dir+"log.txt", "a", encoding="utf-8") as fp: 
        fp.write(epoch_train_history_str + "\n")
        fp.write(epoch_val_history_str + "\n")

train: 100%|███████████████████████████████████████████████████████████████████| 16066/16066 [1:04:19<00:00,  4.16it/s]
val:   0%|                                                                                    | 0/3010 [00:00<?, ?it/s]

Epoch_train (cuda:0) [ 1 /16066]: (loss) lm: 2.591e+00,  | (acc) lm: 4.867e-01, ppl: 2.677e+01,  | train_time: 3859.0s, last_lr:  0.0000500000


val: 100%|█████████████████████████████████████████████████████████████████████████| 3010/3010 [04:34<00:00, 10.97it/s]


Epoch_val (cuda:0) [ 1 /3010]: (loss) lm: 3.832e+00,  | (acc) lm: 3.557e-01, ppl: 4.891e+01,  | train_time: 274.0s, last_lr:  0.0000500000


train:   0%|                                                                                 | 0/16066 [00:00<?, ?it/s]

Saved into D:\_jupyter/model/bart/selectstar_n2x8_one/epoch_1/


train: 100%|███████████████████████████████████████████████████████████████████| 16066/16066 [1:28:23<00:00,  3.03it/s]
val:   0%|                                                                                    | 0/3010 [00:00<?, ?it/s]

Epoch_train (cuda:0) [ 2 /16066]: (loss) lm: 1.208e+00,  | (acc) lm: 7.418e-01, ppl: 3.722e+00,  | train_time: 5303.0s, last_lr:  0.0000500000


val: 100%|█████████████████████████████████████████████████████████████████████████| 3010/3010 [06:22<00:00,  7.86it/s]


Epoch_val (cuda:0) [ 2 /3010]: (loss) lm: 4.455e+00,  | (acc) lm: 3.427e-01, ppl: 9.307e+01,  | train_time: 382.0s, last_lr:  0.0000500000


train:   0%|                                                                                 | 0/16066 [00:00<?, ?it/s]

Saved into D:\_jupyter/model/bart/selectstar_n2x8_one/epoch_2/


train: 100%|███████████████████████████████████████████████████████████████████| 16066/16066 [1:27:41<00:00,  3.05it/s]
val:   0%|                                                                                    | 0/3010 [00:00<?, ?it/s]

Epoch_train (cuda:0) [ 3 /16066]: (loss) lm: 6.672e-01,  | (acc) lm: 8.568e-01, ppl: 2.088e+00,  | train_time: 5261.0s, last_lr:  0.0000500000


val: 100%|█████████████████████████████████████████████████████████████████████████| 3010/3010 [08:50<00:00,  5.67it/s]


Epoch_val (cuda:0) [ 3 /3010]: (loss) lm: 4.877e+00,  | (acc) lm: 3.371e-01, ppl: 1.436e+02,  | train_time: 530.0s, last_lr:  0.0000500000


train:   0%|                                                                                 | 0/16066 [00:00<?, ?it/s]

Saved into D:\_jupyter/model/bart/selectstar_n2x8_one/epoch_3/


train: 100%|███████████████████████████████████████████████████████████████████| 16066/16066 [1:37:49<00:00,  2.74it/s]
val:   0%|                                                                                    | 0/3010 [00:00<?, ?it/s]

Epoch_train (cuda:0) [ 4 /16066]: (loss) lm: 4.360e-01,  | (acc) lm: 9.054e-01, ppl: 1.617e+00,  | train_time: 5869.0s, last_lr:  0.0000500000


val: 100%|█████████████████████████████████████████████████████████████████████████| 3010/3010 [04:42<00:00, 10.67it/s]


Epoch_val (cuda:0) [ 4 /3010]: (loss) lm: 5.195e+00,  | (acc) lm: 3.348e-01, ppl: 1.995e+02,  | train_time: 282.0s, last_lr:  0.0000500000


train:   0%|                                                                                 | 0/16066 [00:00<?, ?it/s]

Saved into D:\_jupyter/model/bart/selectstar_n2x8_one/epoch_4/


train: 100%|█████████████████████████████████████████████████████████████████████| 16066/16066 [52:07<00:00,  5.14it/s]
val:   0%|                                                                                    | 0/3010 [00:00<?, ?it/s]

Epoch_train (cuda:0) [ 5 /16066]: (loss) lm: 3.182e-01,  | (acc) lm: 9.296e-01, ppl: 1.415e+00,  | train_time: 3127.0s, last_lr:  0.0000500000


val: 100%|█████████████████████████████████████████████████████████████████████████| 3010/3010 [03:26<00:00, 14.60it/s]


Epoch_val (cuda:0) [ 5 /3010]: (loss) lm: 5.454e+00,  | (acc) lm: 3.343e-01, ppl: 2.604e+02,  | train_time: 206.0s, last_lr:  0.0000500000
Saved into D:\_jupyter/model/bart/selectstar_n2x8_one/epoch_5/


In [9]:
bart = load_state_dict(object=bart, path=model_dir+"epoch_{}/".format(4))

In [15]:
utterances = [
    "안녕하세요",
    "여쭤봐도 될까요?"
    "요즘 인간관계가 고민이에요.",
    "어떤 고민인지 여쭤봐도 될까요?",
    "친구들이랑 연락도 뜸해지고 자주 못만나서 서먹해지는 것 같아요",
    "그러셨군요.. 좀 더 자세히 말씀해주시겠어요?",
    "코로나 때문에 만나질 못해서 더 혼자가 된 느낌이에요.",
    "저도 지쳐요.",
    "당신도 사람들을 자주 못 만나시나봐요"
 ]

text = " ".join(utterances)
input_ids = tokenizer.encode(text)
# input_ids = [tokenizer.bos_token_id] + input_ids + [tokenizer.eos_token_id]
input_ids = [30006] + input_ids + [30007]
decoder_input_ids = [tokenizer.eos_token_id, tokenizer.bos_token_id]
input_ids = convert_to_tensor([input_ids], device=device)
decoder_input_ids = convert_to_tensor([decoder_input_ids], device=device)

beam_output = bart.generate(input_ids=input_ids, 
                            decoder_input_ids=decoder_input_ids,
                            max_length=128,
                            min_length=10,
                            no_repeat_ngram_size=3,
                            num_beams=10, 
                            early_stopping=True)

tokenizer.decode(beam_output.tolist()[0], skip_special_tokens=True)

'아니에요. 그럴 그럴때는 어떻게 하세요?'

In [19]:
utterances = [
    "오늘 하루가 정말 피곤하네요",
#     "무슨 일 때문에 그런지 여쭤봐도 될까요?"
#     "회사에서 일이 너무 많았어요.",
#     "회사에서 안 좋은 일이 있으신가요?",
#     "실수가 잦아져서인지 요즘 상사에게 자꾸 혼나요.",
#     "왜  혼나는 일들이 쌓이셨나요?",
#     "저번에 시키신 일을 제대로 못했거든요.",
#     "어떤 일이 있었는지 말해주실 수 있나요?",
#     "제가 서류를 잘못 가져다드렸어요."
 ]

text = " ".join(utterances)
input_ids = tokenizer.encode(text)
input_ids = [tokenizer.bos_token_id] + input_ids + [tokenizer.eos_token_id]
decoder_input_ids = [tokenizer.eos_token_id, tokenizer.bos_token_id]
input_ids = convert_to_tensor([input_ids], device=device)
decoder_input_ids = convert_to_tensor([decoder_input_ids], device=device)

beam_output = bart.generate(input_ids=input_ids, 
                            decoder_input_ids=decoder_input_ids,
                            max_length=128,
                            min_length=10,
                            no_repeat_ngram_size=3,
                            num_beams=10, 
                            early_stopping=True)

tokenizer.decode(beam_output.tolist()[0], skip_special_tokens=True)

'그러시군요.  많이 힘드시겠어요.'

In [None]:
# four_n2x8_one -> epoch_3가 베스트 (2)
# selectstar_n2x8_one -> epoch_3가 베스트 (3)
# four_n2x8_both -> epoch_7 > 4/5/6가 베스트 (1)
# selectstar_n2x8_both -> epoch_4가 베스트 (4)

In [None]:
# input_batch = ["<s>It <mask> retriever. My <mask> cute </s>", ... ]
# decoder_input_batch = ["</s><s>My dog is cute. It is a golden retriever", ...]
# labels_batch = ["<s>My dog is cute. It is a golden retriever</s>", ...]