In [1]:
import os
# Set before any tokenizer is loaded. Presumably this disables the HuggingFace
# `tokenizers` thread pool to avoid its fork/parallelism warning — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from setproctitle import setproctitle
# Give the kernel process a recognisable title in process listings.
# NOTE(review): the title mentions PolyEncoder, but this notebook evaluates a
# BM25 baseline (see model_name below) — confirm the title is still intended.
setproctitle("Hodong_PolyEncoder")

In [2]:
import json
import torch
from tqdm import tqdm
import numpy as np
from rank_bm25 import BM25Okapi

from transformer.data.retriever_dataset import ElectraDatasetFromDir, RetrieverDataLoader
from transformer.tokenizer.utils import make_custom_tokenizer_from_pretrained, load_tokenizer_from_pretrained
from transformer.models.interface import TrainHistory
from transformer.models.utils import get_score_json
# NOTE(review): this import rebinds `BM25Okapi`, replacing the `rank_bm25` class
# imported above. From here on the name refers to the project implementation;
# the earlier `rank_bm25` import is effectively unused.
from transformer.utils.information_retrieval import BM25Okapi
from transformer.utils.common import set_device, convert_to_tensor, convert_to_numpy, init_path, get_last_index

### Set Working Directory

In [3]:
# # AIBUD_DEV
# dataset_dir = "/Users/aibud_dev/_jupyter"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

# # Korea_Server
# dataset_dir = "/home/mnt/guest1"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

# bigshane_local
# Raw string: "\_" is not a valid escape sequence, so the non-raw literal
# triggers an invalid-escape warning on newer Python versions. The resulting
# value is unchanged (a literal backslash followed by "_jupyter").
dataset_dir = r"D:\_jupyter"
# JSON file with project file paths, resolved relative to the notebook's
# current working directory.
path = "./config/file_path.json"
file_path = None
with open(path, "r", encoding="utf-8") as fp:
    file_path = json.load(fp)

# # AWS
# dataset_dir = "/home/ubuntu/data"
# path = "./config/file_path.json"
# file_path = None
# with open(path, "r", encoding="utf-8") as fp:
#     file_path = json.load(fp)

### Load Tokenizer

In [4]:
# Local directory that holds the previously saved tokenizer files.
tokenizer_file_path = f"{dataset_dir}/huggingface_tokenizer/kor/koelectra-vanila"

# One-time setup, kept for reference: build the tokenizer from the pretrained
# checkpoint and save it to `tokenizer_file_path`.
# # save tokenizer to local
# tokenizer_path = "monologg/koelectra-base-discriminator"
# add_special_token = True
# tokenizer = make_custom_tokenizer_from_pretrained(model_type="electra", name_or_path=tokenizer_path, add_special_token=add_special_token)
# tokenizer.save_pretrained(tokenizer_file_path)

# Load the saved tokenizer from disk and report its vocabulary size.
tokenizer = load_tokenizer_from_pretrained(
    name_or_path=tokenizer_file_path,
    model_type="electra",
)
print(f"vocab_size: {len(tokenizer)}")

loaded pretrained huggingface_tokenizer: 'D:\_jupyter/huggingface_tokenizer/kor/koelectra-vanila'
vocab_size: 32200


### Load Dataset & DataLoader

In [5]:
# --- Run configuration -------------------------------------------------------
timesteps = 128
batch_size = 64
nprocs = 1
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# --- Dataset locations -------------------------------------------------------
dataset_name = "four_n2x8_both"
# Root directory of the preprocessed dataset; each split is a sub-directory.
total_data_dir = dataset_dir + "/dataset/preprocessed/dialog_finetuning/kor/{}/".format(dataset_name)
sample_data_dir = total_data_dir + "sample/"
train_data_dir = total_data_dir + "train/"
val_data_dir = total_data_dir + "val/"
test_data_dir = total_data_dir + "test/"

# Keyword arguments shared by every ElectraDatasetFromDir built in this notebook.
dataset_kwargs = dict(
    tokenizer=tokenizer,
    timesteps=timesteps,
    batch_size=batch_size,
    device=device,
    nprocs=nprocs,
)

# Only the test split is needed for this evaluation; the other splits are kept
# commented out for reference.
# total_dataset = ElectraDatasetFromDir(data_dir=total_data_dir, **dataset_kwargs)

# train_dataset = ElectraDatasetFromDir(data_dir=train_data_dir, **dataset_kwargs)
# train_data_loader = RetrieverDataLoader(dataset=train_dataset, batch_size=batch_size, device=device)

# val_dataset = ElectraDatasetFromDir(data_dir=val_data_dir, **dataset_kwargs)
# val_data_loader = RetrieverDataLoader(dataset=val_dataset, batch_size=batch_size, device=device)

test_dataset = ElectraDatasetFromDir(data_dir=test_data_dir, **dataset_kwargs)
test_data_loader = RetrieverDataLoader(dataset=test_dataset, batch_size=batch_size, device=device)

Preprocessing data: 100%|█████████████████████████████████████████████████████████| 6872/6872 [00:12<00:00, 533.63it/s]


### Define Model

In [6]:
# Build one candidate string per test dialogue: every utterance that follows
# the index returned by get_last_index for the user speaker id, joined with
# single spaces.
user_speaker_id = test_dataset.user_speaker_id
candidates = [
    " ".join(
        dialog["utterances"][get_last_index(dialog["speaker_ids"], value=user_speaker_id) + 1:]
    )
    for dialog in test_dataset.raw_data
]

# `BM25Okapi` is the project class from transformer.utils.information_retrieval
# (it shadows the rank_bm25 import of the same name).
bm25 = BM25Okapi(tokenizer=tokenizer, candidates=candidates)

'temp_dir' has been set to './20211006_100628/' to save model while training


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bigshane\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Compute Scores

In [7]:
# Evaluate the BM25 retriever on the test split, print a one-line summary,
# append it to a text log and store the detailed scores as JSON.
metrics = ["hits", "semantic_score"]
hits_k = [1,2,5,10]
model_name = "BM25Okapi"

model_dir = dataset_dir + "/model/bm25/{dataset_name}/".format(dataset_name=dataset_name)
log_dir = dataset_dir + "/essay/bm25/{dataset_name}/".format(dataset_name=dataset_name)
# NOTE(review): reset=True presumably clears any existing log directory, so
# earlier results for this dataset would be lost — confirm against init_path.
init_path(log_dir, reset=True)

scores = bm25.compute_scores(metrics=metrics, tokenizer=tokenizer, data_loader=test_data_loader, device=device, hits_k=hits_k)
output_json = get_score_json(model_name=model_name, dataset_name=dataset_name, test_data_size=len(test_dataset.raw_data), batch_size=batch_size, scores=scores)

# verbose & append log
# TrainHistory is reused only to format the scores. There is no training here,
# so loss_dict stays empty; lr / idx / num_iters are passed as -1, apparently
# as "not applicable" placeholders.
eval_history = TrainHistory()
loss_dict = dict()
acc_dict = dict(scores.items())
eval_history.update(loss_dict=loss_dict, acc_dict=acc_dict, lr=-1)
eval_str = bm25.verbose_template.format(mode="Eval", device=device, idx=-1, num_iters=-1) + str(eval_history)
print(eval_str)

with open(log_dir + "/score_logs.txt", "a", encoding="utf-8") as fp:
    fp.write(eval_str + "\n")

# write detailed logs
init_path(log_dir, reset=False)
init_path(log_dir + "/detailed/", reset=True)
with open(log_dir + "/detailed/score_logs.json", "w", encoding="utf-8") as fp:
    # json.dump serialises straight into the open file. The previous
    # json.dumps(...) call only built a string and discarded it, which left
    # score_logs.json empty.
    json.dump(output_json, fp)

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Load beomi/kcbert-base with 4 layers


Computing scores: 100%|████████████████████████████████████████████████████████████| 6872/6872 [11:55<00:00,  9.61it/s]


Eval (cuda:0) [-1 /-1 ]: (loss)  | (acc) HITS@1: 6.370e-02, HITS@2: 8.440e-02, HITS@5: 1.267e-01, HITS@10: 1.781e-01, BERTScore: 6.782e-01,  | train_time: 0.0s, last_lr: -1.0000000000


In [None]:
# Dialogue context used for the ad-hoc inference below. Only the first turn is
# active; the remaining turns are kept commented out so the context can be
# extended one line at a time. The Korean strings are model input and are kept
# verbatim.
utterances = [
    "안녕하세요",
#     "무슨 일로 저에게 상담을 신청하셨나요?",
#     "요즘 인간관계가 고민이에요.",
#     "어떤 고민이죠?",
#     "친구들이랑 연락도 뜸해지고 자주 못만나서 서먹해지는 것 같아요",
#     "이래저래 연락하기 힘드신가봐요",
#     "네, 코로나 때문에 만나질 못해서 더 혼자가 된 느낌이에요.",
#     "저도 지쳐요.",
#     "당신도 사람들을 자주 못 만나시나봐요"
]
# Speaker ids alternate turn by turn, starting with 1: 1, 0, 1, 0, ...
speaker_ids = [(turn + 1) % 2 for turn, _ in enumerate(utterances)]

# NOTE(review): `service` is not defined in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel unless it is created elsewhere.
# The positional arguments (10, 5, None, 0.5, 5) are passed through as-is; their
# meaning depends on infer_next_utterance's signature, which is not shown here —
# TODO confirm and switch to keyword arguments.
outputs = service.infer_next_utterance(utterances, speaker_ids, 10, 5, None, 0.5, 5)
# Show the first element of the first output entry as the cell result.
outputs[0][0]