In [2]:
import json
import random
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
from datasets import load_from_disk, load_dataset, concatenate_datasets

import seaborn as sns
import matplotlib.pyplot as plt
from Data.preprocessing import *
from arguments import *

from transformers import (
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    TrainingArguments,
    set_seed,
)

In [3]:
# Build the model / data / training argument bundles used throughout the notebook.
# return_arg is brought in by one of the star imports above
# (presumably `arguments` — confirm).
model_args, data_args, training_args = return_arg()

In [4]:
# KorQuAD v1, resolved by name through `load_dataset`.
korquad = load_dataset('squad_kor_v1')
# Project dataset, read back from the directory path stored in data_args.dataset_name.
# NOTE: `datasets` is also the library's package name; only `from datasets import ...`
# is used in the import cell, so no module binding is shadowed by this variable.
datasets = load_from_disk(data_args.dataset_name)

Reusing dataset squad_kor_v1 (/opt/ml/.cache/huggingface/datasets/squad_kor_v1/squad_kor_v1/1.0.0/31982418accc53b059af090befa81e68880acc667ca5405d30ce6fa7910950a7)


In [5]:
# Show the splits, columns and row counts of both corpora, KorQuAD first.
print(korquad, datasets, sep="\n")

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 60407
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 5774
    })
})
DatasetDict({
    train: Dataset({
        features: ['__index_level_0__', 'answers', 'context', 'document_id', 'id', 'question', 'title'],
        num_rows: 3952
    })
    validation: Dataset({
        features: ['__index_level_0__', 'answers', 'context', 'document_id', 'id', 'question', 'title'],
        num_rows: 240
    })
})


In [6]:
# Pull the train / validation splits out of each DatasetDict.
train_korquad, valid_korquad = korquad['train'], korquad['validation']
train_data, valid_data = datasets['train'], datasets['validation']

In [7]:
# Use the explicitly configured tokenizer when one is given; otherwise fall
# back to the tokenizer that ships with the model checkpoint.
tokenizer_source = (
    model_args.tokenizer_name
    if model_args.tokenizer_name is not None
    else model_args.model_name_or_path
)

# use_fast=True selects the Rust-backed tokenizer; use_fast=False would select
# the pure-Python implementation, which is comparatively slower.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, use_fast=True)

In [8]:
# Resolve which columns hold the question, context and answers. When a standard
# name is missing, fall back to the column at position 0, 1 or 2 respectively.
column_names = train_data.column_names
question_column_name = "question" if "question" in column_names else column_names[0]
context_column_name = "context" if "context" in column_names else column_names[1]
answer_column_name = "answers" if "answers" in column_names else column_names[2]

print(column_names)

# True when the tokenizer pads on the right; forwarded to the feature helper below.
pad_on_right = tokenizer.padding_side == "right"
print(pad_on_right)

# check_no_error comes from a star import; its result is unpacked as
# (last_checkpoint, max_seq_length). What it validates is not visible here.
last_checkpoint, max_seq_length = check_no_error(
        data_args, training_args, datasets, tokenizer
)
print(last_checkpoint, max_seq_length)

# Convert the raw project examples into model inputs, batch by batch.
# All original columns are removed, so only the columns returned by
# prepare_train_features remain. A cached result is reused unless
# data_args.overwrite_cache is set.
train_dataset = train_data.map(
    function=lambda x: prepare_train_features(x, tokenizer=tokenizer, pad_on_right=pad_on_right,
                                              context_column_name=context_column_name, question_column_name=question_column_name,
                                              answer_column_name=answer_column_name,
                                              data_args=data_args, max_seq_length=max_seq_length),
    batched=True,
    num_proc=data_args.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not data_args.overwrite_cache,
)

Loading cached processed dataset at ../../data/train_dataset/train/cache-15f2deffcd472bce.arrow


['title', 'context', 'question', 'id', 'answers', 'document_id', '__index_level_0__']
True
./qa_train/roberta_korquad/checkpoint-19000 384


In [9]:
# Same column resolution as for the project dataset, kept in `_k`-suffixed names
# so the KorQuAD column choices do not overwrite the project ones.
column_names_korquad = train_korquad.column_names
question_column_name_k = "question" if "question" in column_names_korquad else column_names_korquad[0]
context_column_name_k = "context" if "context" in column_names_korquad else column_names_korquad[1]
answer_column_name_k = "answers" if "answers" in column_names_korquad else column_names_korquad[2]

print(column_names_korquad)

# Recomputed from the same tokenizer; rebinds the name set in the previous cell.
pad_on_right = tokenizer.padding_side == "right"
print(pad_on_right)

# Same check as before, but run against the KorQuAD DatasetDict.
# NOTE: this rebinds last_checkpoint and max_seq_length from the previous cell.
last_checkpoint, max_seq_length = check_no_error(
        data_args, training_args, korquad, tokenizer
)
print(last_checkpoint, max_seq_length)

# Convert the KorQuAD training split into model inputs with the same helper and
# settings. Original columns are dropped and the cache is reused unless
# data_args.overwrite_cache is set.
korquad_train_dataset = train_korquad.map(
    function=lambda x: prepare_train_features(x, tokenizer=tokenizer, pad_on_right=pad_on_right,
                                              context_column_name=context_column_name_k, question_column_name=question_column_name_k,
                                              answer_column_name=answer_column_name_k,
                                              data_args=data_args, max_seq_length=max_seq_length),
    batched=True,
    num_proc=data_args.preprocessing_num_workers,
    remove_columns=column_names_korquad,
    load_from_cache_file=not data_args.overwrite_cache,
)

Loading cached processed dataset at /opt/ml/.cache/huggingface/datasets/squad_kor_v1/squad_kor_v1/1.0.0/31982418accc53b059af090befa81e68880acc667ca5405d30ce6fa7910950a7/cache-31218f9574a997cc.arrow


['answers', 'context', 'id', 'question', 'title']
True
./qa_train/roberta_korquad/checkpoint-19000 384


In [10]:
# Build evaluation features for the KorQuAD validation split.
# Relies on pad_on_right, max_seq_length, column_names_korquad and the
# `_k` column names assigned in the KorQuAD training-feature cell above.
# Original columns are dropped; the cache is reused unless
# data_args.overwrite_cache is set.
korquad_eval_dataset = valid_korquad.map(
            function=lambda x: prepare_validation_features(x, tokenizer=tokenizer, pad_on_right=pad_on_right,
                                                      context_column_name=context_column_name_k, question_column_name=question_column_name_k,
                                                      answer_column_name=answer_column_name_k,
                                                      data_args=data_args, max_seq_length=max_seq_length),
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names_korquad,
            load_from_cache_file=not data_args.overwrite_cache,
        )

Loading cached processed dataset at /opt/ml/.cache/huggingface/datasets/squad_kor_v1/squad_kor_v1/1.0.0/31982418accc53b059af090befa81e68880acc667ca5405d30ce6fa7910950a7/cache-c2ee6dd3fd36e5e0.arrow


In [11]:
# Sanity-check the evaluation features against the raw validation split.
# The row count can exceed the number of distinct example_ids when one example
# yields several feature rows (presumably long contexts split into windows —
# confirm in prepare_validation_features).
feature_row_count = len(korquad_eval_dataset)
example_ids = korquad_eval_dataset['example_id']

print(korquad_eval_dataset)
print(feature_row_count)
print(len(set(example_ids)))
# The first raw id and the first feature's example_id, printed side by side for comparison.
print(valid_korquad['id'][0])
print(example_ids[0])

Dataset({
    features: ['attention_mask', 'example_id', 'input_ids', 'offset_mapping', 'token_type_ids'],
    num_rows: 6865
})
6865
5774
6548850-0-0
6548850-0-0


In [12]:
# Compare the tokenized training sets: project data first, then KorQuAD.
print(train_dataset, korquad_train_dataset, sep="\n")

Dataset({
    features: ['attention_mask', 'end_positions', 'input_ids', 'start_positions'],
    num_rows: 7978
})
Dataset({
    features: ['attention_mask', 'end_positions', 'input_ids', 'start_positions'],
    num_rows: 69399
})


In [13]:
# Merge the two tokenized training sets, project rows first.
new_train = concatenate_datasets([train_dataset, korquad_train_dataset])

In [14]:
# Compare the raw (untokenized) training splits: their column sets differ.
print(train_data, train_korquad, sep="\n")

Dataset({
    features: ['__index_level_0__', 'answers', 'context', 'document_id', 'id', 'question', 'title'],
    num_rows: 3952
})
Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 60407
})


In [15]:
def dict2str(example):
    """Replace ``example['answers']`` with its ``str()`` representation.

    The example is modified in place and also returned, which is the shape
    ``Dataset.map`` expects from a per-row function.
    """
    serialized_answers = str(example['answers'])
    example['answers'] = serialized_answers
    return example
def str2dict(example):
    """Parse a stringified ``example['answers']`` back into a Python object.

    This is the inverse of ``str()`` on a plain literal structure (dicts, lists,
    strings, numbers). The example is modified in place and also returned.

    The string is parsed with ``ast.literal_eval`` rather than ``eval`` so that
    only Python literals are accepted and no expression in the data is executed.
    A value that is not a string (for example, one that was already parsed) is
    left untouched, so applying this function twice is harmless.

    Raises:
        ValueError or SyntaxError: if the string is not a valid Python literal.
    """
    import ast  # local import keeps this definition self-contained

    raw_answers = example['answers']
    if isinstance(raw_answers, str):
        example['answers'] = ast.literal_eval(raw_answers)
    return example

In [16]:
# Stringify `answers` in both raw training sets, and drop the two columns that
# only the project dataset has, so both end up with the same column names.
new_train_data = (
    train_data
    .map(dict2str)
    .remove_columns(['__index_level_0__', 'document_id'])
)
new_train_korquad = train_korquad.map(dict2str)

HBox(children=(FloatProgress(value=0.0, max=3952.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=60407.0), HTML(value='')))




In [17]:
# Re-type the project dataset's columns to the KorQuAD feature schema so the
# two datasets can be concatenated.
korquad_schema = new_train_korquad.features
new_set = new_train_data.cast(korquad_schema)

In [18]:
# Both datasets should now list the same features.
print(new_set, new_train_korquad, sep="\n")

Dataset({
    features: ['answers', 'context', 'id', 'question', 'title'],
    num_rows: 3952
})
Dataset({
    features: ['answers', 'context', 'id', 'question', 'title'],
    num_rows: 60407
})


In [19]:
# Stack the re-typed project rows on top of the KorQuAD rows.
# `answers` is still a string in both inputs at this point.
concated = concatenate_datasets([new_set, new_train_korquad])

In [20]:
# Turn the stringified `answers` column back into structured values.
# `concated` is rebound to the parsed result, so the string-typed dataset from
# the previous cell is no longer reachable under this name.
concated = concated.map(str2dict)

HBox(children=(FloatProgress(value=0.0, max=64359.0), HTML(value='')))




In [21]:
# Display the merged dataset summary (features and row count).
concated

Dataset({
    features: ['answers', 'context', 'id', 'question', 'title'],
    num_rows: 64359
})