### Augmentation - Final

In [1]:
from datasets import load_from_disk, Dataset, DatasetDict
from sklearn.utils import shuffle
import pandas as pd
import shutil
import os

# Load the raw dataset and convert both splits to pandas DataFrames
dataset = load_from_disk("../data/raw/train_dataset")
train_df, valid_df = (dataset[split].to_pandas() for split in ("train", "validation"))

# Fraction of all rows held by each split; used later to size per-split samples
train_ratio = len(train_df) / sum(map(len, (train_df, valid_df)))
valid_ratio = 1 - train_ratio

def combine_datasets(df1, df2):
    """Stack ``df2`` underneath ``df1`` and renumber the rows from zero.

    Args:
        df1: DataFrame whose rows come first in the result.
        df2: DataFrame whose rows are appended after those of ``df1``.

    Returns:
        A new DataFrame holding the rows of both inputs with a fresh
        0..n-1 index (the original index values are discarded).
    """
    # ignore_index=True discards the input indexes and assigns 0..n-1
    return pd.concat((df1, df2), ignore_index=True)

def df_to_datasetdict(train_df, valid_df):
    """Wrap a train/validation pair of DataFrames in a ``DatasetDict``.

    Args:
        train_df: DataFrame stored under the ``"train"`` key.
        valid_df: DataFrame stored under the ``"validation"`` key.

    Returns:
        A ``DatasetDict`` with ``"train"`` and ``"validation"`` entries, each
        built with ``Dataset.from_pandas``.
    """
    frames_by_split = {"train": train_df, "validation": valid_df}
    return DatasetDict(
        {split: Dataset.from_pandas(frame) for split, frame in frames_by_split.items()}
    )

def save_dataset(dataset, save_path_base="../data/preprocessed"):
    """Save ``dataset`` into the first unused ``data<N>`` folder of ``save_path_base``.

    Folder names are probed in order (``data1``, ``data2``, ...) and the first
    path that does not exist yet is used, so existing folders are not reused.
    After saving, the new folder and everything inside it is given mode
    ``0o777`` (read/write/execute for every user).

    Args:
        dataset: Object exposing a ``save_to_disk(path)`` method.
        save_path_base: Directory that holds the numbered output folders.

    Returns:
        None. The chosen path is printed.
    """
    counter = 1
    save_path = f"{save_path_base}/data{counter}"

    # If the folder already exists, bump the number until a free name is found
    while os.path.exists(save_path):
        counter += 1
        save_path = f"{save_path_base}/data{counter}"

    dataset.save_to_disk(save_path)

    # Open up permissions in-process instead of shelling out to `sudo chmod -R`:
    # an unquoted path inside a shell string breaks on spaces/metacharacters,
    # and sudo may block on a password prompt inside a notebook kernel. The
    # files were just created by this process, so plain os.chmod is sufficient.
    targets = [save_path]
    for root, dirnames, filenames in os.walk(save_path):
        targets.extend(os.path.join(root, name) for name in dirnames + filenames)

    for target in targets:
        if os.path.islink(target):
            # Like `chmod -R`, do not follow symlinks found during traversal
            continue
        try:
            os.chmod(target, 0o777)
        except OSError as exc:
            # Permission changes stay best-effort, as with the unchecked shell call
            print(f"Warning: could not change permissions of {target}: {exc}")

    print(f"Dataset saved at: {save_path}")

def shuffle_df(df, random_state=42):
    """Shuffle the rows of ``df`` and renumber the index from zero.

    Args:
        df: DataFrame to shuffle.
        random_state: Seed forwarded to ``sklearn.utils.shuffle``.

    Returns:
        The shuffled rows with the previous index dropped.
    """
    permuted = shuffle(df, random_state=random_state)
    return permuted.reset_index(drop=True)

def sample_df(df, sample_size, random_state=42):
    """Draw ``sample_size`` rows from ``df`` and renumber the index from zero.

    Args:
        df: DataFrame to sample from.
        sample_size: Number of rows to draw (passed to ``DataFrame.sample`` as ``n``).
        random_state: Seed forwarded to ``DataFrame.sample``.

    Returns:
        A DataFrame with the sampled rows and the previous index dropped.
    """
    picked = df.sample(n=sample_size, random_state=random_state)
    return picked.reset_index(drop=True)

def augmentation(external_datasets, sample_size=None, sampling_method="separate"):
    """Augment the base dataset with rows from external datasets and save the result.

    Args:
        external_datasets: Iterable of on-disk dataset paths to load and merge.
        sample_size: Number of external rows to add. When ``None``, all
            external rows are added.
        sampling_method: Strategy name forwarded to the handler
            (``"separate"`` or ``"combine"``).

    Returns:
        Whatever the selected handler returns (``handle_sample_size`` when a
        ``sample_size`` is given, otherwise ``handle_no_sample_size``).
    """
    # Load every external DatasetDict and merge them split by split
    external_train, external_valid = load_and_combine_external_datasets(external_datasets)

    # A sample size was given: sample the external rows before merging
    if sample_size is not None:
        return handle_sample_size(external_train, external_valid, sample_size, sampling_method)

    # No sample size: merge the external rows in full
    return handle_no_sample_size(external_train, external_valid, sampling_method)

def load_and_combine_external_datasets(dataset_paths):
    """Load several on-disk datasets and merge them split by split.

    Args:
        dataset_paths: Iterable of paths accepted by ``load_from_disk``. Each
            loaded dataset is indexed with ``"train"`` and ``"validation"``.

    Returns:
        A ``(combined_train, combined_valid)`` tuple of DataFrames, each with a
        fresh 0..n-1 index.
    """
    train_frames, valid_frames = [], []

    for path in dataset_paths:
        # Load the dataset and pull each split out as a DataFrame
        loaded = load_from_disk(path)
        train_frames.append(loaded["train"].to_pandas())
        valid_frames.append(loaded["validation"].to_pandas())
        # The on-disk source is left in place (removal via shutil.rmtree is disabled)

    # Merge the train frames and the validation frames independently
    merged_train = pd.concat(train_frames, ignore_index=True)
    merged_valid = pd.concat(valid_frames, ignore_index=True)
    return merged_train, merged_valid

def handle_no_sample_size(combined_train, combined_valid, sampling_method):
    """Append all external rows to the base dataset (no sampling), then save.

    Reads the module-level ``train_df`` and ``valid_df`` as the base splits.

    Args:
        combined_train: DataFrame of external training rows.
        combined_valid: DataFrame of external validation rows.
        sampling_method: ``"separate"`` appends the external train rows to the
            base train split and the external validation rows to the base
            validation split. ``"combine"`` appends both external splits to
            the base train split and keeps the base validation split as is.

    Returns:
        The value returned by ``save_and_return_dataset`` for the augmented splits.

    Raises:
        ValueError: If ``sampling_method`` is neither ``"separate"`` nor
            ``"combine"``.
    """
    # Reject unknown methods up front; otherwise neither branch would assign
    # the result frames and the return below would fail with UnboundLocalError.
    if sampling_method not in ("separate", "combine"):
        raise ValueError(
            f"Unknown sampling_method {sampling_method!r}; expected 'separate' or 'combine'."
        )

    if sampling_method == "separate":
        # separate: external train -> base train, external valid -> base valid
        augmented_train_df = combine_datasets(train_df, combined_train)
        augmented_valid_df = combine_datasets(valid_df, combined_valid)
    else:
        # combine: all external rows -> base train; base valid stays unchanged
        combined_external = combine_datasets(combined_train, combined_valid)
        augmented_train_df = combine_datasets(train_df, combined_external)
        augmented_valid_df = valid_df

    return save_and_return_dataset(augmented_train_df, augmented_valid_df)

def handle_sample_size(combined_train, combined_valid, sample_size, sampling_method):
    """Sample external rows, append them to the base dataset, then save.

    Reads the module-level ``train_df``, ``valid_df``, ``train_ratio`` and
    ``valid_ratio``.

    Args:
        combined_train: DataFrame of external training rows.
        combined_valid: DataFrame of external validation rows.
        sample_size: Total number of external rows to add.
        sampling_method: ``"separate"`` splits ``sample_size`` between train
            and validation according to ``train_ratio`` / ``valid_ratio``,
            samples each external split on its own and appends it to the
            matching base split. ``"combine"`` merges both external splits,
            draws ``sample_size`` rows from the merged frame, appends them to
            the base train split and keeps the base validation split as is.

    Returns:
        The value returned by ``save_and_return_dataset`` for the augmented splits.

    Raises:
        ValueError: If ``sampling_method`` is neither ``"separate"`` nor
            ``"combine"``.
    """
    # Reject unknown methods up front; otherwise neither branch would assign
    # the result frames and the return below would fail with UnboundLocalError.
    if sampling_method not in ("separate", "combine"):
        raise ValueError(
            f"Unknown sampling_method {sampling_method!r}; expected 'separate' or 'combine'."
        )

    if sampling_method == "separate":
        # Per-split sizes are only needed here, so they are computed in this branch
        train_sample_size = round(sample_size * train_ratio)
        valid_sample_size = round(sample_size * valid_ratio)

        # separate: sample each external split, then append to the matching base split
        sampled_train = sample_df(combined_train, train_sample_size)
        sampled_valid = sample_df(combined_valid, valid_sample_size)

        augmented_train_df = combine_datasets(train_df, sampled_train)
        augmented_valid_df = combine_datasets(valid_df, sampled_valid)
    else:
        # combine: merge external splits, sample once, append to base train only
        combined_external = combine_datasets(combined_train, combined_valid)
        combined_sampled = sample_df(combined_external, sample_size)
        augmented_train_df = combine_datasets(train_df, combined_sampled)
        augmented_valid_df = valid_df

    return save_and_return_dataset(augmented_train_df, augmented_valid_df)

def save_and_return_dataset(augmented_train_df, augmented_valid_df):
    """Shuffle both splits, convert them to a ``DatasetDict``, save it and return it.

    Args:
        augmented_train_df: DataFrame for the train split.
        augmented_valid_df: DataFrame for the validation split.

    Returns:
        The object built by ``df_to_datasetdict`` from the shuffled splits,
        after it has been passed to ``save_dataset``.
    """
    # Shuffle train first, then validation
    shuffled_train, shuffled_valid = (
        shuffle_df(frame) for frame in (augmented_train_df, augmented_valid_df)
    )

    # Convert to a DatasetDict and persist it
    result = df_to_datasetdict(shuffled_train, shuffled_valid)
    save_dataset(result)
    return result


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Augment with KorQuAD only. No sample_size is passed, so every external row is
# added; "combine" appends external train + validation rows to the base train split.
augmentation(
    external_datasets=["../data/external/korquad"],  # optional extra source: "../data/external/ko_wiki"
    sampling_method="combine",  # pass sample_size=30000 to limit the number of external rows
)

Saving the dataset (1/1 shards): 100%|██████████| 29985/29985 [00:00<00:00, 416164.65 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 240/240 [00:00<00:00, 50536.32 examples/s]

Dataset saved at: ../data/preprocessed/data1





DatasetDict({
    train: Dataset({
        features: ['title', 'context', 'question', 'id', 'answers', 'document_id', '__index_level_0__'],
        num_rows: 29985
    })
    validation: Dataset({
        features: ['title', 'context', 'question', 'id', 'answers', 'document_id', '__index_level_0__'],
        num_rows: 240
    })
})