In [1]:
from setproctitle import setproctitle

# Give the kernel process a recognizable title.
process_title = "Hodong_Bert"
setproctitle(process_title)

In [2]:
import os
import json
import pickle
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

from transformer.models.bert import Bert
from transformer.preprocessors.bert_preprocessor import BertPreprocessor
from transformer.preprocessors.blender_bot_preprocessor import RetrieverEncoderPreprocessor
from transformer.data.dataset import DatasetInterface, DatasetFromDir
from transformer.data.blender_bot_data_loader import RetrieverEncoderDataLoader
from transformer.trainers.bert_trainer import BertTrainer
from transformer.trainers.blender_bot_trainer import RetrieverEncoderBertTrainer
from transformer.trainers.utils import *



## Set Dataset Paths

In [3]:
# # AIBUD_DEV
# dataset_dir = "/Users/aibud_dev/_jupyter"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

# # Picas_Server
# dataset_dir = "/home/picas/_jupyter"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

# Korea_Server (default).
# The dataset root can be overridden per machine by exporting BERT_DATASET_DIR
# before starting the kernel, so switching hosts does not require editing this cell.
dataset_dir = os.environ.get("BERT_DATASET_DIR", "/home/mnt/guest1")

# Parse the project's file-path configuration (JSON) into `file_path`.
path = "./config/file_path.json"
with open(path, "r", encoding="utf-8") as fp:
    file_path = json.load(fp)

# # bigshane_local
# dataset_dir = "D:\_jupyter"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

## Load Configuration

In [4]:
# Parse the JSON configuration whose sections ("data", "model", "optimizer",
# "train", "criterion", "data_loader") are read by the cells below.
config_path = "./scripts/bert/config/retriever_encoder_pretraining_korea.json"
with open(config_path, mode="r", encoding="utf-8") as fp:
    config = json.loads(fp.read())

## Load Preprocessor

In [5]:
data_config = config["data"]
model_config = config["model"]

# The spm model lives under the dataset root, keyed by language and vocabulary size.
spm_model_path = dataset_dir + "/spm_model/{language}/spoken_pretrain_spm_v{vocab_size}".format(
    language=data_config["language"],
    vocab_size=model_config["vocab_size"],
)
# spm_model_path = config["data"]["spm_model_path"].format(root_dir=config["data"]["root_dir"], language=config["data"]["language"], vocab_size=config["model"]["vocab_size"])
preprocessor = RetrieverEncoderPreprocessor(
    language=data_config["language"],
    spm_model_path=spm_model_path,
    embedding_dict=model_config["embedding_dict"],
)

Imported konlpy.tag.Mecab successfully
loaded spm_model: '/Users/aibud_dev/_jupyter/spm_model/kor/spoken_pretrain_spm_v30000/'


## Set Trainer

In [6]:
# The trainer saves the model under this directory while training.
temp_dir = dataset_dir + "/model/temp/"
trainer = RetrieverEncoderBertTrainer(temp_dir=temp_dir)
# trainer = RetrieverEncoderBertTrainer(temp_dir=config["train"]["temp_save_path"])

# Learning-rate schedule: initial rate from the optimizer section, warm-up steps from the train section.
trainer.set_lr_update(
    initial_learning_rate=config["optimizer"]["initial_learning_rate"],
    num_warmup_steps=config["train"]["num_warmup_steps"],
)

'temp_dir' has been set to '/Users/aibud_dev/_jupyter/model/temp/20210824_095302/' to save model while training
LearningRate schedule has been set to 'transformer_lambda'


## Single-GPU Training

### Build Bert

In [7]:
# The padding id comes from the tokenizer's special-token table; everything else from the model config.
pad_token_id = preprocessor.spm_tokenizer.special_token_dict["pad"]["id"]
bert = Bert(pad_token_id=pad_token_id, **config["model"])

### Set criterions & optimizer

In [8]:
# Loss functions (with their weights) and the optimizer are both built by the trainer from config sections.
pad_token_id = preprocessor.spm_tokenizer.special_token_dict["pad"]["id"]
criterions, criterion_weights = trainer.get_criterions(
    pad_token_id=pad_token_id,
    **config["criterion"],
)
optimizer = trainer.get_optimizer(model=bert, **config["optimizer"])

### Set Device

In [9]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Place the model, optimizer and criterions on the selected device.
bert = BertTrainer.set_device(obj=bert, device=device)
optimizer = BertTrainer.set_device(obj=optimizer, device=device)
criterions = BertTrainer.set_device(obj=criterions, device=device)

Setting model device: cpu
Setting criterions device: cpu


## Load Dataset & DataLoader

In [10]:
# data_loader_params
batch_size = 4
nprocs = 1

total_data_dir = dataset_dir + "/dataset/preprocessed/dialog_pretrain/kor/multi_turn/"
sample_data_dir = dataset_dir + "/dataset/preprocessed/dialog_pretrain/kor/multi_turn/sample/"
train_data_dir = dataset_dir + "/dataset/preprocessed/dialog_pretrain/kor/multi_turn/train/"
val_data_dir = dataset_dir + "/dataset/preprocessed/dialog_pretrain/kor/multi_turn/val/"

# Keyword arguments shared by both datasets.
dataset_kwargs = dict(
    batch_size=batch_size,
    device=device,
    nprocs=nprocs,
    encoding=config["data"]["encoding"],
    extension=config["data"]["extension"],
)
# Keyword arguments shared by both data-loader parameter requests.
loader_kwargs = dict(
    preprocessor=preprocessor,
    batch_size=batch_size,
    device=device,
    nprocs=nprocs,
    **config["model"],
    **config["data_loader"],
)

# NOTE: the training loader reads `sample_data_dir`, not `train_data_dir`.
train_dataset = DatasetFromDir(data_dir=sample_data_dir, **dataset_kwargs)
train_data_loader_params = trainer.get_data_loader_params(dataset=train_dataset, **loader_kwargs)
train_data_loader = trainer.create_data_loader(**train_data_loader_params)

val_dataset = DatasetFromDir(data_dir=val_data_dir, **dataset_kwargs)
val_data_loader_params = trainer.get_data_loader_params(dataset=val_dataset, **loader_kwargs)
val_data_loader = trainer.create_data_loader(**val_data_loader_params)

### Dataset summary

In [None]:
# val_data_loader.summary(show_sample=True)

### DataLoader encode test

In [13]:
# row_idx = 6
# inputs, outputs = train_data_loader.get_batch()

# print("input_token:\t", [token_idx for token_idx in range(0, len(inputs["token"][row_idx])) if token_idx==0 or inputs["token"][row_idx][token_idx]==preprocessor.spm_tokenizer.special_token_dict["sep"]["id"]])
# print("output_token:\t", [token_idx for token_idx in range(0, len(outputs["mlm"][row_idx])) if token_idx==0 or outputs["mlm"][row_idx][token_idx]==preprocessor.spm_tokenizer.special_token_dict["sep"]["id"]])
# print("input_segment:\t", [token_idx for token_idx in range(0, len(inputs["segment"][row_idx])-1) if token_idx==0 or inputs["segment"][row_idx][token_idx]!=inputs["segment"][row_idx][token_idx+1]])
# print("input_turn:\t", [token_idx for token_idx in range(0, len(inputs["turn"][row_idx])-1) if token_idx==0 or inputs["turn"][row_idx][token_idx]!=inputs["turn"][row_idx][token_idx+1]])

# for input_token, output_token in zip(preprocessor.decode(inputs["token"]), preprocessor.decode(outputs["mlm"])):
#     print("input_token:\t", input_token)
#     print("output_token:\t", output_token)
#     print()

input_token:	 [0, 105, 119]
output_token:	 [0, 105, 119]
input_segment:	 [0, 105, 119]
input_turn:	 [0, 5, 11, 23, 33, 42, 49, 54, 64, 76, 94, 105]


## Train Test

In [36]:
# Settings passed to trainer.fit / train_epoch / iteration in the cells below.
epoch = 5

# Mixed precision: a gradient scaler is created only when `amp` is switched on.
amp = True
scaler = torch.cuda.amp.GradScaler() if amp else None

# Checkpoint-saving options.
save_per_epoch = -1
save_per_batch = -1
keep_last = True

# Logging options.
verbose_per_epoch = 1
verbose_per_batch = -1



### trainer.fit

In [None]:
# Full training run; no validation loader is passed (val_data_loader=None).
history = trainer.fit(
    model=bert,
    train_data_loader=train_data_loader,
    val_data_loader=None,
    criterions=criterions,
    criterion_weights=criterion_weights,
    optimizer=optimizer,
    device=device,
    epoch=epoch,
    amp=amp,
    save_per_epoch=save_per_epoch,
    save_per_batch=save_per_batch,
    keep_last=keep_last,
    verbose_per_epoch=verbose_per_epoch,
    verbose_per_batch=verbose_per_batch,
)

### trainer.train_epoch

In [None]:
# Wrap the training loader in a progress bar that starts at the loader's own start offset.
data_iter = tqdm(
    train_data_loader,
    initial=train_data_loader.iter_start,
    total=len(train_data_loader),
)
# Attach the number of iterations covered by the loader's [iter_start, iter_end) window.
data_iter.iter_size = train_data_loader.iter_end - train_data_loader.iter_start

# Run one training epoch over the wrapped loader.
epoch_train_history = trainer.train_epoch(
    model=bert,
    data_loader=data_iter,
    criterions=criterions,
    criterion_weights=criterion_weights,
    optimizer=optimizer,
    device=device,
    amp=amp,
    scaler=scaler,
    save_per_batch=save_per_batch,
    verbose_per_batch=verbose_per_batch,
)

### trainer.iteration

In [37]:
# Run a single training iteration on the first batch yielded by the loader.
# (An unconditional `break` used to sit at the top of the loop body, which made
# the tensor conversion and the trainer.iteration call unreachable.)
for batch_idx, batch in enumerate(train_data_loader, start=1):
    # Convert every value of every dict in the batch to a tensor on `device`.
    batch = [{k: trainer.convert_to_tensor(data=v, device=device) for k, v in _batch.items()} for _batch in batch]

    loss_dict, acc_dict = trainer.iteration(model=bert, batch=batch,
                                            criterions=criterions, criterion_weights=criterion_weights, optimizer=optimizer,
                                            train=True, amp=amp, scaler=scaler)

    print(loss_dict)
    print(acc_dict)
    # Only the first batch is exercised in this cell.
    break

### trainer.iteration & data_loader.collate_fn

In [None]:
# Assemble one batch by hand to exercise collate_fn and trainer.iteration without the loader loop.
# The dataset iterator is created once so that successive next() calls advance through it;
# requesting a new iterator for every row may yield the first row each time
# (depends on the dataset's __iter__ implementation — TODO confirm).
dataset_iter = iter(train_data_loader.dataset)
_batch = [next(dataset_iter) for _ in range(batch_size)]
batch_idx = 1
batch = train_data_loader.collate_fn(batch=_batch)
# Convert every value of every dict in the collated batch to a tensor on `device`.
batch = [{k: trainer.convert_to_tensor(data=v, device=device) for k, v in item.items()} for item in batch]

# The model built in this notebook is `bert`; no object named `transformer` is defined here.
loss_dict, acc_dict = trainer.iteration(model=bert, batch=batch,
                                        criterions=criterions, criterion_weights=criterion_weights, optimizer=optimizer,
                                        train=True, amp=amp, scaler=scaler)