In [1]:
import logging
import os
import sys
import random
import numpy as np
import torch
import wandb
from typing import NoReturn
from tqdm import tqdm

from arguments import DataTrainingArguments, ModelArguments
from datasets import DatasetDict, load_from_disk, load_metric, load_dataset
from trainer_qa import QuestionAnsweringTrainer
from transformers import (
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    TrainingArguments,
    set_seed,
    EarlyStoppingCallback,
)
from utils_qa import check_no_error, postprocess_qa_predictions

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.environ['WANDB_DISABLED'] = 'false'
os.environ['WANDB_PROJECT'] = 'level2-MRC'
os.environ['WANDB_ENTITY'] = 'm2f'
os.environ['WANDB_NOTEBOOK_NAME'] = 'JHW_JUPYTER_SERVER'

In [3]:
seed = 28
deterministic = False

random.seed(seed) # python random seed 고정
np.random.seed(seed) # numpy random seed 고정
torch.manual_seed(seed) # torch random seed 고정
torch.cuda.manual_seed_all(seed)
if deterministic: # cudnn random seed 고정 - 고정 시 학습 속도가 느려질 수 있습니다. 
	torch.backends.cudnn.deterministic = True
	torch.backends.cudnn.benchmark = False

In [4]:
model_args = ModelArguments()
data_args = DataTrainingArguments()

In [5]:
training_args = TrainingArguments(
    output_dir='./models/train_dataset2',
	seed=seed,
    do_train=True,
    do_eval=True,

    save_strategy='steps',
    save_steps=100,
	logging_dir='./logs',
    logging_strategy='steps',
    logging_steps=1,

    num_train_epochs=10,
    learning_rate=1e-5,
    lr_scheduler_type='cosine',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    weight_decay=0.01,

    evaluation_strategy='steps',
    eval_steps=100,

    save_total_limit=1,
    load_best_model_at_end = True,
    metric_for_best_model='eval_exact_match',
    greater_is_better=True,

    report_to='wandb',
    run_name='JHW_28',

    fp16=True,
)

In [6]:
logger = logging.getLogger(__name__)

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -    %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)

# verbosity 설정 : Transformers logger의 정보로 사용합니다 (on main process only)
logger.info("Training/evaluation parameters %s", training_args)

# 모델을 초기화하기 전에 난수를 고정합니다.
set_seed(training_args.seed)

## Dataset Load

In [7]:
datasets = load_from_disk(data_args.dataset_name)

In [9]:
datasets['train'] = datasets['train'].remove_columns(['document_id','__index_level_0__'])
datasets['validation'] = datasets['validation'].remove_columns(['document_id','__index_level_0__'])

In [10]:
# for korquad
# korquad = load_dataset('parquet', data_files=['../data/train_aug.parquet'])
# datasets['train'] = korquad['train']
# datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 70133
    })
    validation: Dataset({
        features: ['title', 'context', 'question', 'id', 'answers'],
        num_rows: 240
    })
})

In [12]:
## only use if you want to finetune korquad
# datasets = load_dataset('squad_kor_v1')

### Add Negative Samples

In [10]:
from datasets import concatenate_datasets
# Train 내의 Random Context 증강
n_negative_samples = 2
total_iteration = len(datasets['train'])

for i in tqdm(range(total_iteration)):
    temp_dict = datasets['train'][i]
    temp_dict['answers'] = {'answer_start': [], 'text': []}
    select_idx = np.random.choice([idx for idx in range(len(datasets['train'])) if idx != i], size=n_negative_samples, replace=False)
    negative_datasets = datasets['train'].select(select_idx)
    for n_context in negative_datasets['context']:
        temp_dict['context'] = n_context
        datasets['train'] = datasets['train'].add_item(temp_dict)

100%|██████████| 3952/3952 [49:15<00:00,  1.34it/s]  


In [30]:
datasets.save_to_disk('../data/negativetrain')

Saving the dataset (1/1 shards): 100%|██████████| 11856/11856 [00:01<00:00, 10857.31 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 240/240 [00:00<00:00, 8979.86 examples/s] 


In [8]:
datasets = load_from_disk('../data/negativetrain')

In [9]:
datasets

DatasetDict({
    train: Dataset({
        features: ['title', 'context', 'question', 'id', 'answers'],
        num_rows: 11856
    })
    validation: Dataset({
        features: ['title', 'context', 'question', 'id', 'answers'],
        num_rows: 240
    })
})

## Model, Tokenizer, Config

In [8]:
model_args.model_name_or_path = 'CurtisJeon/klue-roberta-large-korquad_v1_qa-finetuned_42'

In [9]:
config = AutoConfig.from_pretrained(
    model_args.config_name
    if model_args.config_name is not None
    else model_args.model_name_or_path,
    cache_dir='/data/ephemeral/huggingface'
)
tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_name
    if model_args.tokenizer_name is not None
    else model_args.model_name_or_path,
    # 'use_fast' argument를 True로 설정할 경우 rust로 구현된 tokenizer를 사용할 수 있습니다.
    # False로 설정할 경우 python으로 구현된 tokenizer를 사용할 수 있으며,
    # rust version이 비교적 속도가 빠릅니다.
    use_fast=True,
    cache_dir='/data/ephemeral/huggingface'
)
model = AutoModelForQuestionAnswering.from_pretrained(
    model_args.model_name_or_path,
    from_tf=bool(".ckpt" in model_args.model_name_or_path),
    config=config,
    cache_dir='/data/ephemeral/huggingface'
)

## token_type_ids 를 쓰는지 안쓰는지
use_token_type_ids = False

config.json: 100%|██████████| 765/765 [00:00<00:00, 2.96MB/s]
tokenizer_config.json: 100%|██████████| 1.39k/1.39k [00:00<00:00, 548kB/s]
vocab.txt: 100%|██████████| 248k/248k [00:00<00:00, 483kB/s]
tokenizer.json: 100%|██████████| 752k/752k [00:00<00:00, 1.12MB/s]
special_tokens_map.json: 100%|██████████| 971/971 [00:00<00:00, 2.46MB/s]
model.safetensors: 100%|██████████| 1.34G/1.34G [00:58<00:00, 23.1MB/s]


In [10]:
# For Hub Upload
MODEL_SAVE_REPO = 'klue-roberta-large-korquad_v1_qa-finetuned'
API_KEY = ''

# model.push_to_hub(
#     MODEL_SAVE_REPO , 
#     use_temp_dir=True, 
#     use_auth_token=API_KEY
# )

# tokenizer.push_to_hub(
#     MODEL_SAVE_REPO , 
#     use_temp_dir=True, 
#     use_auth_token=API_KEY
# )

model.safetensors: 100%|██████████| 1.34G/1.34G [00:55<00:00, 24.0MB/s]
README.md: 100%|██████████| 5.18k/5.18k [00:00<00:00, 9.13MB/s]


CommitInfo(commit_url='https://huggingface.co/CurtisJeon/klue-roberta-large-korquad_v1_qa-finetuned/commit/60fea41d546df062d8e98c20e2a66864d7e08558', commit_message='Upload tokenizer', commit_description='', oid='60fea41d546df062d8e98c20e2a66864d7e08558', pr_url=None, pr_revision=None, pr_num=None)

## MRC-Preprocess

In [10]:
column_names = datasets["train"].column_names
question_column_name = "question" if "question" in column_names else column_names[0]
context_column_name = "context" if "context" in column_names else column_names[1]
answer_column_name = "answers" if "answers" in column_names else column_names[2]

In [11]:
# Padding에 대한 옵션을 설정합니다.
# (question|context) 혹은 (context|question)로 세팅 가능합니다.
pad_on_right = tokenizer.padding_side == "right"

In [12]:
# 오류가 있는지 확인합니다.
last_checkpoint, max_seq_length = check_no_error(
    data_args, training_args, datasets, tokenizer
)

In [13]:
 # Train preprocessing / 전처리를 진행합니다.
def prepare_train_features(examples):
    # truncation과 padding(length가 짧을때만)을 통해 toknization을 진행하며, stride를 이용하여 overflow를 유지합니다.
    # 각 example들은 이전의 context와 조금씩 겹치게됩니다.
    tokenized_examples = tokenizer(
        examples[question_column_name if pad_on_right else context_column_name],
        examples[context_column_name if pad_on_right else question_column_name],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_seq_length,
        stride=data_args.doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        return_token_type_ids=use_token_type_ids, # roberta모델을 사용할 경우 False, bert를 사용할 경우 True로 표기해야합니다.
        padding="max_length" if data_args.pad_to_max_length else False,
    )

    # 길이가 긴 context가 등장할 경우 truncate를 진행해야하므로, 해당 데이터셋을 찾을 수 있도록 mapping 가능한 값이 필요합니다.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # token의 캐릭터 단위 position를 찾을 수 있도록 offset mapping을 사용합니다.
    # start_positions과 end_positions을 찾는데 도움을 줄 수 있습니다.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # 데이터셋에 "start position", "enc position" label을 부여합니다.
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)  # cls index

        # sequence id를 설정합니다 (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # 하나의 example이 여러개의 span을 가질 수 있습니다.
        sample_index = sample_mapping[i]
        answers = examples[answer_column_name][sample_index]

        # answer가 없을 경우 cls_index를 answer로 설정합니다(== example에서 정답이 없는 경우 존재할 수 있음).
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # text에서 정답의 Start/end character index
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # text에서 current span의 Start token index
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # text에서 current span의 End token index
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # 정답이 span을 벗어났는지 확인합니다(정답이 없는 경우 CLS index로 label되어있음).
            if not (
                offsets[token_start_index][0] <= start_char
                and offsets[token_end_index][1] >= end_char
            ):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # token_start_index 및 token_end_index를 answer의 끝으로 이동합니다.
                # Note: answer가 마지막 단어인 경우 last offset을 따라갈 수 있습니다(edge case).
                while (
                    token_start_index < len(offsets)
                    and offsets[token_start_index][0] <= start_char
                ):
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

# Validation preprocessing
def prepare_validation_features(examples):
    # truncation과 padding(length가 짧을때만)을 통해 toknization을 진행하며, stride를 이용하여 overflow를 유지합니다.
    # 각 example들은 이전의 context와 조금씩 겹치게됩니다.
    tokenized_examples = tokenizer(
        examples[question_column_name if pad_on_right else context_column_name],
        examples[context_column_name if pad_on_right else question_column_name],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_seq_length,
        stride=data_args.doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        return_token_type_ids=use_token_type_ids, # roberta모델을 사용할 경우 False, bert를 사용할 경우 True로 표기해야합니다.
        padding="max_length" if data_args.pad_to_max_length else False,
    )

    # 길이가 긴 context가 등장할 경우 truncate를 진행해야하므로, 해당 데이터셋을 찾을 수 있도록 mapping 가능한 값이 필요합니다.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # evaluation을 위해, prediction을 context의 substring으로 변환해야합니다.
    # corresponding example_id를 유지하고 offset mappings을 저장해야합니다.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # sequence id를 설정합니다 (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0

        # 하나의 example이 여러개의 span을 가질 수 있습니다.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping을 None으로 설정해서 token position이 context의 일부인지 쉽게 판별 할 수 있습니다.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]
    return tokenized_examples

In [14]:
train_dataset = datasets["train"]

# dataset에서 train feature를 생성합니다.
train_dataset = train_dataset.map(
    prepare_train_features,
    batched=True,
    num_proc=data_args.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not data_args.overwrite_cache,
)

eval_dataset = datasets["validation"]

# Validation Feature 생성
eval_dataset = eval_dataset.map(
    prepare_validation_features,
    batched=True,
    num_proc=data_args.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not data_args.overwrite_cache,
)

Map: 100%|██████████| 3952/3952 [00:05<00:00, 758.11 examples/s]
Map: 100%|██████████| 240/240 [00:00<00:00, 548.35 examples/s]


In [15]:
data_collator = DataCollatorWithPadding(
    tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None
)

## MRC-Postprocess

In [16]:
# Post-processing:
def post_processing_function(examples, features, predictions, training_args):
    # Post-processing: start logits과 end logits을 original context의 정답과 match시킵니다.
    predictions = postprocess_qa_predictions(
        examples=examples,
        features=features,
        predictions=predictions,
        max_answer_length=data_args.max_answer_length,
        output_dir=training_args.output_dir,
    )
    # Metric을 구할 수 있도록 Format을 맞춰줍니다.
    formatted_predictions = [
        {"id": k, "prediction_text": v} for k, v in predictions.items()
    ]
    if training_args.do_predict:
        return formatted_predictions

    elif training_args.do_eval:
        references = [
            {"id": ex["id"], "answers": ex[answer_column_name]}
            for ex in datasets["validation"]
        ]
        return EvalPrediction(
            predictions=formatted_predictions, label_ids=references
        )

## Metrics

In [17]:
metric = load_metric("squad")

def compute_metrics(p: EvalPrediction):
    return metric.compute(predictions=p.predictions, references=p.label_ids)

  metric = load_metric("squad")


## Trainer

In [19]:
wandb.login(key='')



[34m[1mwandb[0m: Currently logged in as: [33mgusdnr122997[0m ([33mm2af[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /data/ephemeral/home/.netrc


True

In [20]:
early_stopping = EarlyStoppingCallback(
     early_stopping_patience=3,
)

In [21]:
# Trainer 초기화
trainer = QuestionAnsweringTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    eval_examples=datasets["validation"] if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    post_process_function=post_processing_function,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)



In [44]:
# Training
if training_args.do_train:
    if last_checkpoint is not None:
        checkpoint = last_checkpoint
    elif os.path.isdir(model_args.model_name_or_path):
        checkpoint = model_args.model_name_or_path
    else:
        checkpoint = None
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    trainer.save_model()  # Saves the tokenizer too for easy upload

    metrics = train_result.metrics
    metrics["train_samples"] = len(train_dataset)

    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    output_train_file = os.path.join(training_args.output_dir, "train_results.txt")

    with open(output_train_file, "w") as writer:
        logger.info("***** Train results *****")
        for key, value in sorted(train_result.metrics.items()):
            logger.info(f"  {key} = {value}")
            writer.write(f"{key} = {value}\n")

    # State 저장
    trainer.state.save_to_json(
        os.path.join(training_args.output_dir, "trainer_state.json")
    )

# Evaluation
if training_args.do_eval:
    logger.info("*** Evaluate ***")
    metrics = trainer.evaluate()

    metrics["eval_samples"] = len(eval_dataset)

    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)

[34m[1mwandb[0m: Currently logged in as: [33mgusdnr122997[0m ([33mm2f[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss


In [25]:
wandb.finish()



0,1
eval/exact_match,▁▃▆█▆▆▅█
eval/f1,▁▂▇█▆▅▆█
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,▁▂▃▅▆██████████████████████████████████▇
train/loss,▄█▇▅▃▅▆▆▆▆▄▂▂▇▄▄▄▂▇▆▅▂▄▂▃▃▃▅▃▃▁▅▁▄▂▂▆▃▃▃
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁

0,1
eval/exact_match,70.83333
eval/f1,80.93161
train/epoch,1.94
train/global_step,700.0
train/learning_rate,1e-05
train/loss,0.1296
train/total_flos,2.071736340364339e+16
train/train_loss,0.30219
train/train_runtime,1038.4126
train/train_samples_per_second,110.977


## Inference

In [18]:
from typing import Callable, Dict, List, NoReturn, Tuple
from datasets import (
    Dataset,
    DatasetDict,
    Features,
    Sequence,
    Value,
    load_from_disk,
    load_metric,
)

In [19]:
model_args = ModelArguments(
    model_name_or_path='./models/train_dataset/',
)
data_args = DataTrainingArguments(
    dataset_name='../data/test_dataset/',
)

In [20]:
training_args = TrainingArguments(
    output_dir='./outputs/test_dataset3/',
	seed=seed,

    do_train=True,
    do_predict=True,
)

In [21]:
test_datasets = load_from_disk(data_args.dataset_name)
test_datasets

DatasetDict({
    validation: Dataset({
        features: ['question', 'id'],
        num_rows: 600
    })
})

### Retrieval (Sparse)

In [22]:
# if True
data_args.eval_retrieval

True

In [25]:
from retrieval import SparseRetrieval
from bm25 import BM25

# retriever = SparseRetrieval(
#     tokenize_fn=tokenizer.tokenize, data_path="../data", context_path="wiki_preprocessed.json"
# )
passage_tokenizer = AutoTokenizer.from_pretrained('monologg/koelectra-base-v3-finetuned-korquad', cache_dir='/data/ephemeral/huggingface')

retriever = BM25(
    tokenize_fn=passage_tokenizer.tokenize, data_path="../data", context_path="wiki_preprocessed_v2.json"
)
retriever.get_sparse_embedding()

Lengths of unique contexts : 55963
Embedding pickle load.


In [25]:
print('Faiss:', data_args.use_faiss)
if data_args.use_faiss:
    retriever.build_faiss(num_clusters=data_args.num_clusters)
    df = retriever.retrieve_faiss(
        test_datasets["validation"], topk=data_args.top_k_retrieval
    )
else:
    df = retriever.retrieve(test_datasets["validation"], topk=data_args.top_k_retrieval)

Faiss: False


100%|██████████| 600/600 [12:39<00:00,  1.27s/it]


[query exhaustive search] done in 762.739 s


Sparse retrieval: 100%|██████████| 600/600 [00:00<00:00, 10147.63it/s]


In [26]:
df.head()

Unnamed: 0,question,id,context
0,유령'은 어느 행성에서 지구로 왔는가?,mrc-1-000653,목성의 대기에서 보이는 줄무늬는 적도와 평행하면서 행성을 둘러싸는 대(zone)와 ...
1,용병회사의 경기가 좋아진 것은 무엇이 끝난 이후부터인가?,mrc-1-001113,냉전 종식 이후 전 세계적으로 소규모의 끊임없는 국지 분쟁들이 생겨나고 강대국들의 ...
2,돌푸스에게 불특정 기간동안 하원이 잠시 쉬는 것을 건의 받았던 인물은?,mrc-0-002191,"1933년 3월, 투표 과정의 위법성에 대한 문제제기가 불거졌다. 당시 오스트리아 ..."
3,"마오리언어와 영어, 뉴질랜드 수화를 공식 언어로 사용하는 나라는?",mrc-0-003951,"유럽인들의 아메리카와 오세아니아 식민지화로 인해 아메리카와 오세아니아의 문화적, 민..."
4,디엔비엔푸 전투에서 보응우옌잡이 상대한 국가는?,mrc-1-001272,1926년 학생 시절 베트남청년혁명당에 가입했고 1930년에 학생 파업을 지지했다는...


In [27]:
df.to_csv('../data/bm25_k25_v2.csv',index=False)

In [28]:
# test data 에 대해선 정답이 없으므로 id question context 로만 데이터셋이 구성됩니다.
if training_args.do_predict:
    f = Features(
        {
            "context": Value(dtype="string", id=None),
            "id": Value(dtype="string", id=None),
            "question": Value(dtype="string", id=None),
        }
    )
elif training_args.do_eval:
    f = Features(
        {
            "answers": Sequence(
                feature={
                    "text": Value(dtype="string", id=None),
                    "answer_start": Value(dtype="int32", id=None),
                },
                length=-1,
                id=None,
            ),
            "context": Value(dtype="string", id=None),
            "id": Value(dtype="string", id=None),
            "question": Value(dtype="string", id=None),
        }
    )
f

{'context': Value(dtype='string', id=None),
 'id': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None)}

In [29]:
test_datasets = DatasetDict({"validation": Dataset.from_pandas(df, features=f)})
test_datasets

DatasetDict({
    validation: Dataset({
        features: ['context', 'id', 'question'],
        num_rows: 600
    })
})

### Retrieval(DPR)

In [24]:
from dpr import DenseRetrieval
from transformers import TrainingArguments, AutoTokenizer
from inference import *

args = TrainingArguments(
    output_dir="dense_retrieval",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01
)

train_dataset = datasets['train']
cache_dir = '/data/ephemeral/huggingface'

p_encoder_path = '/data/ephemeral/huggingface/p_encoder_bert'
q_encoder_path = '/data/ephemeral/huggingface/q_encoder_bert'
ep_tokenizer = AutoTokenizer.from_pretrained('klue/bert-base', cache_dir=cache_dir)
p_encoder = BertEncoder.from_pretrained(p_encoder_path, cache_dir=cache_dir).to(args.device)
q_encoder = BertEncoder.from_pretrained(q_encoder_path, cache_dir=cache_dir).to(args.device)

retriever = DenseRetrieval(
    args=args,
    dataset=train_dataset,
    num_neg=2,
    tokenizer=ep_tokenizer,
    p_encoder=p_encoder,
    q_encoder=q_encoder,
    do_train=False
)

In [22]:
if retriever.do_train:
    retriever.train()

AttributeError: 'DenseRetrieval' object has no attribute 'do_train'

In [25]:
retriever.get_dense_embeddings('../data/dense.bin', corpus_path="../data/wiki_preprocessed_v2.json")

Embedding pickle load.


In [26]:
df = retriever.retrieve(test_datasets["validation"], topk=data_args.top_k_retrieval)

100%|██████████| 600/600 [00:09<00:00, 60.70it/s]
100%|██████████| 600/600 [00:02<00:00, 257.60it/s]


[query exhaustive search] done in 12.616 s


Dense retrieval: 100%|██████████| 600/600 [00:03<00:00, 163.50it/s]


In [28]:
f = Features(
    {
        "context": Value(dtype="string", id=None),
        "id": Value(dtype="string", id=None),
        "question": Value(dtype="string", id=None),
    }
)

In [29]:
test_datasets = DatasetDict({"validation": Dataset.from_pandas(df, features=f)})

In [30]:
test_datasets

DatasetDict({
    validation: Dataset({
        features: ['context', 'id', 'question'],
        num_rows: 600
    })
})

### Retrieval(BM25 + DPR)
- ver2 : add SentTran Retrieval -> Remove (Low Prediction)

In [46]:
from custom_retriever import CustomRetriever

p_encoder_path = 'CurtisJeon/klue-bert-base-context'
q_encoder_path = 'CurtisJeon/klue-bert-base-question'

retriever = CustomRetriever(
    p_encoder_path=p_encoder_path, q_encoder_path=q_encoder_path, weights=(0.5, 0.5) # (bm25, dpr)
)
# retriever.dpr_tokenizer = AutoTokenizer.from_pretrained('kykim/bert-kor-base', cache_dir='/data/ephemeral/huggingface')
retriever.get_embeddings()

Lengths of unique contexts : 55963
Embedding pickle load.
Embedding pickle load.


In [47]:
df = retriever.retrieve(test_datasets["validation"], topk=data_args.top_k_retrieval)

BM25 Result pickle load.


100%|██████████| 600/600 [00:11<00:00, 52.32it/s]
100%|██████████| 600/600 [00:07<00:00, 79.08it/s] 


[query exhaustive search] done in 91.225 s


Sparse retrieval: 100%|██████████| 600/600 [00:00<00:00, 3458.59it/s]


In [36]:
df.to_csv('../data/bm25_dpr(5_5)_v2.csv', index=False)

In [39]:
df.head()

Unnamed: 0,question,id,context
0,유령'은 어느 행성에서 지구로 왔는가?,mrc-1-000653,목성의 대기에서 보이는 줄무늬는 적도와 평행하면서 행성을 둘러싸는 대(zone)와 ...
1,용병회사의 경기가 좋아진 것은 무엇이 끝난 이후부터인가?,mrc-1-001113,냉전 종식 이후 전 세계적으로 소규모의 끊임없는 국지 분쟁들이 생겨나고 강대국들의 ...
2,돌푸스에게 불특정 기간동안 하원이 잠시 쉬는 것을 건의 받았던 인물은?,mrc-0-002191,"1933년 3월, 투표 과정의 위법성에 대한 문제제기가 불거졌다. 당시 오스트리아 ..."
3,"마오리언어와 영어, 뉴질랜드 수화를 공식 언어로 사용하는 나라는?",mrc-0-003951,"유럽인들의 아메리카와 오세아니아 식민지화로 인해 아메리카와 오세아니아의 문화적, 민..."
4,디엔비엔푸 전투에서 보응우옌잡이 상대한 국가는?,mrc-1-001272,1926년 학생 시절 베트남청년혁명당에 가입했고 1930년에 학생 파업을 지지했다는...


In [48]:
# test data 에 대해선 정답이 없으므로 id question context 로만 데이터셋이 구성됩니다.
if training_args.do_predict:
    f = Features(
        {
            "context": Value(dtype="string", id=None),
            "id": Value(dtype="string", id=None),
            "question": Value(dtype="string", id=None),
        }
    )
elif training_args.do_eval:
    f = Features(
        {
            "answers": Sequence(
                feature={
                    "text": Value(dtype="string", id=None),
                    "answer_start": Value(dtype="int32", id=None),
                },
                length=-1,
                id=None,
            ),
            "context": Value(dtype="string", id=None),
            "id": Value(dtype="string", id=None),
            "question": Value(dtype="string", id=None),
        }
    )
f

{'context': Value(dtype='string', id=None),
 'id': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None)}

In [49]:
test_datasets = DatasetDict({"validation": Dataset.from_pandas(df, features=f)})
test_datasets

DatasetDict({
    validation: Dataset({
        features: ['context', 'id', 'question'],
        num_rows: 600
    })
})

### Retrieval(SentTran)

In [23]:
from sent_retrieval import STRetrieval

retriever = STRetrieval()
retriever.get_embeddings()

Build passage embedding
Embedding pickle saved.


In [24]:
df = retriever.retrieve(test_datasets["validation"], topk=data_args.top_k_retrieval)

[query exhaustive search] done in 1.822 s


Dense retrieval: 100%|██████████| 600/600 [00:02<00:00, 276.08it/s]


In [25]:
# test data 에 대해선 정답이 없으므로 id question context 로만 데이터셋이 구성됩니다.
if training_args.do_predict:
    f = Features(
        {
            "context": Value(dtype="string", id=None),
            "id": Value(dtype="string", id=None),
            "question": Value(dtype="string", id=None),
        }
    )
elif training_args.do_eval:
    f = Features(
        {
            "answers": Sequence(
                feature={
                    "text": Value(dtype="string", id=None),
                    "answer_start": Value(dtype="int32", id=None),
                },
                length=-1,
                id=None,
            ),
            "context": Value(dtype="string", id=None),
            "id": Value(dtype="string", id=None),
            "question": Value(dtype="string", id=None),
        }
    )
f

{'context': Value(dtype='string', id=None),
 'id': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None)}

In [26]:
test_datasets = DatasetDict({"validation": Dataset.from_pandas(df, features=f)})
test_datasets

DatasetDict({
    validation: Dataset({
        features: ['context', 'id', 'question'],
        num_rows: 600
    })
})

### Inference

In [50]:
column_names = test_datasets["validation"].column_names

question_column_name = "question" if "question" in column_names else column_names[0]
context_column_name = "context" if "context" in column_names else column_names[1]
answer_column_name = "answers" if "answers" in column_names else column_names[2]

In [52]:
# Padding에 대한 옵션을 설정합니다.
# (question|context) 혹은 (context|question)로 세팅 가능합니다.
pad_on_right = tokenizer.padding_side == "right"

# 오류가 있는지 확인합니다.
last_checkpoint, max_seq_length = check_no_error(
    data_args, training_args, test_datasets, tokenizer
)

In [53]:
test_dataset = test_datasets["validation"]

# Validation Feature 생성
test_dataset = test_dataset.map(
    prepare_validation_features,
    batched=True,
    num_proc=data_args.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not data_args.overwrite_cache,
)

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map: 100%|██████████| 600/600 [00:29<00:00, 20.59 examples/s]


In [54]:
print("init trainer...")
# Trainer 초기화
trainer = QuestionAnsweringTrainer(
    model=model,
    args=training_args,
    train_dataset=None,
    eval_dataset=test_dataset,
    eval_examples=test_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    post_process_function=post_processing_function,
    compute_metrics=compute_metrics,
)

init trainer...


In [55]:
#### eval dataset & eval example - predictions.json 생성됨
predictions = trainer.predict(
    test_dataset=test_dataset, test_examples=test_datasets["validation"]
)

# predictions.json 은 postprocess_qa_predictions() 호출시 이미 저장됩니다.
print(
    "No metric can be presented because there is no correct answer given. Job done!"
)

02/22/2024 08:56:47 - INFO - utils_qa -    Post-processing 600 example predictions split into 17347 features.


100%|██████████| 600/600 [01:17<00:00,  7.79it/s]

02/22/2024 08:58:05 - INFO - utils_qa -    Saving predictions to ./outputs/test_dataset3/predictions.json.
02/22/2024 08:58:05 - INFO - utils_qa -    Saving nbest_preds to ./outputs/test_dataset3/nbest_predictions.json.





No metric can be presented because there is no correct answer given. Job done!


## PUSH TO HUB

In [28]:
MODEL_SAVE_REPO = 'klue-roberta-large-korquad_v1_qa'
API_KEY = ''

In [29]:
model.push_to_hub(
    MODEL_SAVE_REPO , 
    use_temp_dir=True, 
    use_auth_token=API_KEY
)

Configuration saved in /tmp/tmpthjcymnr/config.json
Model weights saved in /tmp/tmpthjcymnr/pytorch_model.bin
Uploading the following files to CurtisJeon/klue-roberta-large-korquad_v1_qa: pytorch_model.bin,config.json
pytorch_model.bin: 100%|██████████| 1.34G/1.34G [00:56<00:00, 23.9MB/s]


CommitInfo(commit_url='https://huggingface.co/CurtisJeon/klue-roberta-large-korquad_v1_qa/commit/1cdbdc888287e1549931bcc40f6f4fe68ecb8d7b', commit_message='Upload RobertaForQuestionAnswering', commit_description='', oid='1cdbdc888287e1549931bcc40f6f4fe68ecb8d7b', pr_url=None, pr_revision=None, pr_num=None)

In [30]:
tokenizer.push_to_hub(
    MODEL_SAVE_REPO, 
    use_temp_dir=True, 
    use_auth_token=API_KEY
)

tokenizer config file saved in /tmp/tmpixan8nqw/tokenizer_config.json
Special tokens file saved in /tmp/tmpixan8nqw/special_tokens_map.json
Uploading the following files to CurtisJeon/klue-roberta-large-korquad_v1_qa: tokenizer_config.json,vocab.txt,tokenizer.json,special_tokens_map.json


CommitInfo(commit_url='https://huggingface.co/CurtisJeon/klue-roberta-large-korquad_v1_qa/commit/01447181b4ff69e793b439604ad8797725205899', commit_message='Upload tokenizer', commit_description='', oid='01447181b4ff69e793b439604ad8797725205899', pr_url=None, pr_revision=None, pr_num=None)