In [1]:
import json
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pprint import pprint

from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    BertModel, BertPreTrainedModel,
    AdamW, get_linear_schedule_with_warmup,
    TrainingArguments, EvalPrediction, default_data_collator, DataCollatorWithPadding
)

from trainer_qa import QuestionAnsweringTrainer
from utils_qa import postprocess_qa_predictions

2022-05-09 16:02:27.042327: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-09 16:02:27.042370: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Fix the random seeds
def set_seed(random_seed):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Args:
        random_seed: Value handed unchanged to every seeding function below.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # for the multi-GPU case
        random.seed,
        np.random.seed,
    )
    for seed_fn in seeders:
        seed_fn(random_seed)


set_seed(42)  # seed applied when this cell runs

In [3]:
# Report the installed PyTorch version and pick the compute device:
# the first CUDA GPU when CUDA is available, otherwise the CPU.
print("PyTorch version:[%s]." % (torch.__version__,))

if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

print("device:[%s]." % (device,))

PyTorch version:[1.10.0+cu102].
device:[cuda:0].


In [4]:
# Load the data and the evaluation metric.
# This cell loads the "squad_kor_v1" dataset; the metric is loaded in a later cell.
# NOTE(review): load_dataset is already imported in the first cell, so this import is redundant.
from datasets import load_dataset

datasets = load_dataset("squad_kor_v1")

Reusing dataset squad_kor_v1 (/opt/ml/.cache/huggingface/datasets/squad_kor_v1/squad_kor_v1/1.0.0/31982418accc53b059af090befa81e68880acc667ca5405d30ce6fa7910950a7)


In [5]:
# Number of examples in the train split.
len(datasets["train"])

60407

In [6]:
# Load the "squad" metric; compute_metrics calls metric.compute on it.
from datasets import load_metric

metric = load_metric("squad")

In [7]:
# Load the pre-trained model
from transformers import (
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer
)

# Checkpoint name handed to every from_pretrained call in the next cell.
model_name = "klue/bert-base" 

In [8]:
# Build the config, the fast tokenizer and the question-answering model
# from the same checkpoint name.
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForQuestionAnswering.from_pretrained(model_name, config=config)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model chec

In [9]:
# Parameter settings

# Parameters for data preprocessing
max_seq_length = 384 # passed to the tokenizer as max_length: upper bound for question + context + special tokens per feature
pad_to_max_length = True # NOTE(review): not referenced in the visible cells; the tokenizer calls hard-code padding="max_length"
doc_stride = 128 # passed to the tokenizer as stride: overlap between consecutive chunks when a long context is split

# Parameters for training (feel free to change these)
max_train_samples = 16 # number of train examples selected before tokenization
max_val_samples = 16 # only used by the commented-out validation sampling line
preprocessing_num_workers = 4 # num_proc for the Dataset.map calls
batch_size = 16 # per-device train and eval batch size
num_train_epochs = 30
n_best_size = 20 # forwarded to postprocess_qa_predictions
max_answer_length = 30 # forwarded to postprocess_qa_predictions

In [10]:
# Prepare the data for training
def prepare_train_features(examples):
    """Tokenize a batch of QA examples and attach answer token positions.

    Every question/context pair is tokenized with the module-level
    ``tokenizer``. Only the context (second sequence) is truncated; a context
    that does not fit in ``max_seq_length`` is split into several overlapping
    features (overlap set by ``doc_stride``), so a single example can produce
    more than one feature.

    Args:
        examples: Batch mapping with "question", "context" and "answers"
            columns. Each "answers" entry is a mapping with "answer_start"
            (character offsets into the context) and "text" lists; only the
            first answer of each example is used.

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" and
        "offset_mapping" removed and "start_positions" / "end_positions"
        added (one token index per feature). A feature is labelled with the
        index of the CLS token when its span does not fully contain the
        answer, or when its example has no answer at all.
    """
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",  # truncate only the second part of the pair (the context)
        max_length=max_seq_length,
        stride=doc_stride,
        return_overflowing_tokens=True,  # return the chunks that overflow max_length as extra features
        return_offsets_mapping=True,  # return (char_start, char_end) for every token
        padding="max_length",
    )

    # One example may map to several features, so keep the feature -> example
    # mapping, e.g. [0, 0, 1, 1].
    overflow_to_sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Offsets tell which characters of the original text each token covers,
    # e.g. [(0, 0), (0, 1), ...].
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Label lists, filled with one entry per feature.
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Sequence id of every token in this feature (1 marks the second
        # sequence of the pair, i.e. the context).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # Example this feature was cut from.
        example_index = overflow_to_sample_mapping[i]
        answers = examples["answers"][example_index]

        # An example without any answer has nothing to locate; label it with
        # the CLS index instead of failing on answers["answer_start"][0].
        if len(answers["answer_start"]) == 0 or len(answers["text"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
            continue

        # Character span of the answer inside the context.
        answer_start_offset = answers["answer_start"][0]
        answer_end_offset = answer_start_offset + len(answers["text"][0])

        # First context token of this feature.
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1

        # Last context token of this feature.
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        # Is the answer fully inside this feature's context span?
        answer_in_span = (
            offsets[token_start_index][0] <= answer_start_offset
            and offsets[token_end_index][1] >= answer_end_offset
        )
        if not answer_in_span:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Move token_start_index / token_end_index onto the two ends of
            # the answer. The start scan may run past the last offset when
            # the answer is the final token, hence the length guard.
            while (
                token_start_index < len(offsets)
                and offsets[token_start_index][0] <= answer_start_offset
            ):
                token_start_index += 1
            tokenized_examples["start_positions"].append(token_start_index - 1)
            while offsets[token_end_index][1] >= answer_end_offset:
                token_end_index -= 1
            tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [11]:
# Start from the raw (untokenized) train split.
train_dataset = datasets["train"]

In [12]:
# This example trains on a sampled subset rather than the full train split;
# adjust max_train_samples (or drop the select) as needed.
# The sample is always taken from datasets["train"], so re-running this cell
# never tries to tokenize an already tokenized dataset.
column_names = datasets["train"].column_names
num_train_samples = min(max_train_samples, len(datasets["train"]))  # never select past the end
train_dataset = datasets["train"].select(range(num_train_samples)).map(
    prepare_train_features,
    batched=True,
    num_proc=preprocessing_num_workers,
    remove_columns=column_names,  # drop the raw text columns, keep only model inputs and labels
    load_from_cache_file=True,
)

    

HBox(children=(FloatProgress(value=0.0, description='#2', max=1.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='#1', max=1.0, style=ProgressStyle(description_width='init…




HBox(children=(FloatProgress(value=0.0, description='#3', max=1.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='#0', max=1.0, style=ProgressStyle(description_width='init…






In [13]:
# Prepare the data for validation
def prepare_validation_features(examples):
    """Tokenize a batch of QA examples for evaluation.

    Uses the module-level ``tokenizer`` with the same settings as training
    (context-only truncation, ``max_seq_length``, ``doc_stride``, padding to
    max length), so one example can produce several features.

    Args:
        examples: Batch mapping with "id", "question" and "context" columns.

    Returns:
        The tokenizer output without "overflow_to_sample_mapping", with an
        added "example_id" list (id of the source example of each feature)
        and with every "offset_mapping" entry replaced by None unless the
        token's sequence id is 1.
    """
    context_sequence_id = 1

    encoded = tokenizer(
        examples['question'],
        examples['context'],
        truncation="only_second",
        max_length=max_seq_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Feature -> source example index.
    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    encoded["example_id"] = [
        examples["id"][feature_to_example[feature_idx]]
        for feature_idx in range(len(encoded["input_ids"]))
    ]

    # Blank out offsets of tokens outside the context so only context tokens
    # keep a character span.
    all_offsets = encoded["offset_mapping"]
    for feature_idx in range(len(encoded["input_ids"])):
        token_sequence_ids = encoded.sequence_ids(feature_idx)
        masked = []
        for position, span in enumerate(all_offsets[feature_idx]):
            if token_sequence_ids[position] == context_sequence_id:
                masked.append(span)
            else:
                masked.append(None)
        all_offsets[feature_idx] = masked

    return encoded

In [14]:
# Evaluate on the full validation split
eval_examples = datasets["validation"]

# Evaluate on a sample instead
# eval_examples = eval_examples.select(range(max_val_samples)) 

# Tokenize the validation examples. column_names is the train split's column
# list defined in an earlier cell; those raw columns are removed from the output.
eval_dataset = eval_examples.map(
        prepare_validation_features,
        batched=True,
        num_proc=preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=True,
    )

    

HBox(children=(FloatProgress(value=0.0, description='#0', max=2.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='#2', max=2.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='#3', max=2.0, style=ProgressStyle(description_width='init…

HBox(children=(FloatProgress(value=0.0, description='#1', max=2.0, style=ProgressStyle(description_width='init…







In [15]:
def post_processing_function(examples, features, predictions):
    """Post-process model predictions into the format the metric expects.

    Delegates to ``postprocess_qa_predictions`` to match start/end logits
    against the original contexts, then pairs the resulting answer texts with
    reference answers taken from the very same ``examples``. Building the
    references from ``examples`` (not from the full validation split) keeps
    predictions and references aligned when only a subset is evaluated.

    Relies on the module-level ``n_best_size``, ``max_answer_length``,
    ``training_args`` and ``trainer``, which must exist by the time this is
    called.

    Args:
        examples: The raw evaluation examples; each must expose "id" and
            "answers".
        features: The tokenized features built from ``examples``.
        predictions: Raw model predictions, forwarded unchanged to
            ``postprocess_qa_predictions``.

    Returns:
        EvalPrediction whose ``predictions`` is a list of
        ``{"id", "prediction_text"}`` dicts and whose ``label_ids`` is a list
        of ``{"id", "answers"}`` dicts.
    """
    # Post-processing: match start logits and end logits against the original context.
    predicted_texts = postprocess_qa_predictions(
        examples=examples,
        features=features,
        predictions=predictions,
        version_2_with_negative=False,
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        null_score_diff_threshold=0.0,
        output_dir=training_args.output_dir,
        is_world_process_zero=trainer.is_world_process_zero(),
    )

    # Reshape into the format the metric can consume.
    formatted_predictions = [
        {"id": example_id, "prediction_text": text}
        for example_id, text in predicted_texts.items()
    ]
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return EvalPrediction(predictions=formatted_predictions, label_ids=references)

In [16]:
def compute_metrics(p: EvalPrediction):
    """Score post-processed predictions with the module-level ``metric``.

    Args:
        p: Object exposing ``predictions`` and ``label_ids``; both are passed
            to ``metric.compute`` as predictions and references.

    Returns:
        Whatever ``metric.compute`` returns.
    """
    predicted, expected = p.predictions, p.label_ids
    return metric.compute(predictions=predicted, references=expected)

In [17]:
# Define the arguments needed for training.
# Batch size and epoch count come from the parameter cell; save_strategy='no'
# is passed so that, presumably, no checkpoints are written — confirm against
# the TrainingArguments documentation.
training_args = TrainingArguments(
    output_dir="outputs",
    do_train=True, 
    do_eval=True, 
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=0.01,
    save_strategy='no'
)

In [18]:
# Build the QA trainer (QuestionAnsweringTrainer comes from the local trainer_qa module).
# eval_dataset holds the tokenized features and eval_examples the raw validation
# examples; both are passed, along with the post-processing and metric callbacks.
trainer = QuestionAnsweringTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    eval_examples=datasets["validation"],
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    post_process_function=post_processing_function,
    compute_metrics=compute_metrics,
)

In [25]:
# Log in to Weights & Biases and start a run for this experiment.
# NOTE(review): the recorded output of this cell is a TypeError, and its
# execution count (25) is out of sequence with the previous cell (18) —
# re-check on a fresh kernel.
import wandb
wandb.login()
wandb.init(project='MRC', entity='boostcamp_nlp06', name='TEST')

TypeError: get() takes no keyword arguments

In [26]:
# Run training and keep the value returned by trainer.train().
# NOTE(review): the recorded output is a wandb initialization traceback, so this run did not complete.
train_result = trainer.train()

Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/wandb/sdk/wandb_init.py", line 996, in init
    wi.setup(kwargs)
  File "/opt/conda/lib/python3.8/site-packages/wandb/sdk/wandb_init.py", line 133, in setup
    self._wl = wandb_setup.setup()
  File "/opt/conda/lib/python3.8/site-packages/wandb/sdk/wandb_setup.py", line 318, in setup
    ret = _setup(settings=settings)
  File "/opt/conda/lib/python3.8/site-packages/wandb/sdk/wandb_setup.py", line 313, in _setup
    wl = _WandbSetup(settings=settings)
  File "/opt/conda/lib/python3.8/site-packages/wandb/sdk/wandb_setup.py", line 299, in __init__
    _WandbSetup._instance = _WandbSetup__WandbSetup(settings=settings, pid=pid)
  File "/opt/conda/lib/python3.8/site-packages/wandb/sdk/wandb_setup.py", line 107, in __init__
    self._settings = self._settings_setup(settings, self._early_logger)
  File "/opt/conda/lib/python3.8/site-packages/wandb/sdk/wandb_setup.py", line 134, in _settings_setup
    s._infer_set

Exception: problem