In [16]:
import datasets
import pandas as pd
import os
from question_answering.paths import extractive_qa_paths

# Quiet the `datasets` library for this notebook: restrict its logging to
# errors and turn off its progress bars so cell output stays small.
datasets.logging.set_verbosity_error()
datasets.logging.disable_progress_bar()

In [17]:
def preprocess_squad_dataset_sample(sample, keep_multiple_answers: bool):
    """Flatten the nested ``answers`` field of one SQuAD sample.

    Adds two top-level keys, ``answer_text`` and ``answer_start``, to
    ``sample``. The mapping is modified in place and also returned.

    Args:
        sample: Mapping with an ``"answers"`` entry that itself holds the
            parallel sequences ``"text"`` and ``"answer_start"``.
        keep_multiple_answers: When true, the full sequences are stored.
            When false, only the first answer (text and start) is stored.

    Returns:
        The same ``sample`` object with ``answer_text`` and ``answer_start``
        set. If ``keep_multiple_answers`` is false and the sample has no
        answers, both values are ``None`` instead of raising ``IndexError``.
    """
    answer_texts = sample["answers"]["text"]
    answer_starts = sample["answers"]["answer_start"]

    if keep_multiple_answers:
        sample["answer_text"] = answer_texts
        sample["answer_start"] = answer_starts
    else:
        # Use len() rather than truthiness so array-like sequences work too.
        # An unanswered sample yields None rather than crashing on [0].
        sample["answer_text"] = answer_texts[0] if len(answer_texts) > 0 else None
        sample["answer_start"] = answer_starts[0] if len(answer_starts) > 0 else None

    return sample


def save_squad_dataset(dataset, set_type, keep_multiple_answers: bool = False, save_as_json=False):
    """Flatten the answers of a SQuAD-style dataset and write it to disk.

    Every sample is run through ``preprocess_squad_dataset_sample`` to add the
    ``answer_text`` / ``answer_start`` columns. The ``title`` and ``answers``
    columns are then removed, the dataset is converted to a pandas DataFrame,
    rows containing missing values are dropped, and the frame is written into
    ``extractive_qa_paths.squad_dataset_dir`` (created if it does not exist).

    Args:
        dataset: Dataset object providing ``map``, ``remove_columns`` and
            ``to_pandas``, with ``title`` and ``answers`` columns.
        set_type: File name stem; the output is ``<set_type>.json`` or
            ``<set_type>.csv``.
        keep_multiple_answers: Forwarded to
            ``preprocess_squad_dataset_sample``.
        save_as_json: Write JSON when true; otherwise write CSV with the
            DataFrame index stored in a column named ``index``.
    """
    dataset = dataset.map(
        lambda sample: preprocess_squad_dataset_sample(sample, keep_multiple_answers=keep_multiple_answers)
    )
    dataset = dataset.remove_columns(["title", "answers"])

    dataset_df = dataset.to_pandas()
    dataset_df = dataset_df.dropna()

    path = extractive_qa_paths.squad_dataset_dir
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the directory in between.
    os.makedirs(path, exist_ok=True)

    if save_as_json:
        dataset_df.to_json(path / f"{set_type}.json")
    else:
        dataset_df.to_csv(path / f"{set_type}.csv", index=True, index_label="index")

def load_squad_dataset_and_save_as_csv():
    """Load SQuAD and persist two variants of it via ``save_squad_dataset``.

    1. The original train and validation splits, each shuffled with seed 42,
       keeping every answer per sample, written as JSON under the names
       ``orig_train`` and ``orig_val``.
    2. A 70:15:15 train/val/test re-split of the combined data (shuffled once
       with seed 42), keeping one answer per sample, written as CSV under the
       names ``train``, ``val`` and ``test``.
    """
    raw_splits = datasets.load_dataset("squad")
    train_frame = raw_splits["train"].to_pandas()
    validation_frame = raw_splits["validation"].to_pandas()

    # Variant 1: original splits, shuffled, all answers kept, stored as JSON.
    original_sets = {
        "orig_train": datasets.Dataset.from_pandas(train_frame, preserve_index=False).shuffle(seed=42),
        "orig_val": datasets.Dataset.from_pandas(validation_frame, preserve_index=False).shuffle(seed=42),
    }
    for set_name, original_set in original_sets.items():
        save_squad_dataset(original_set, set_type=set_name, keep_multiple_answers=True, save_as_json=True)

    # Variant 2: merge everything, shuffle once, then cut 70% / 15% / 15%.
    # The splits themselves use shuffle=False so the single seeded shuffle
    # alone determines which sample lands where.
    merged_frame = pd.concat((train_frame, validation_frame))
    shuffled_all = datasets.Dataset.from_pandas(merged_frame, preserve_index=False).shuffle(seed=42)
    outer_split = shuffled_all.train_test_split(test_size=0.3, shuffle=False)
    inner_split = outer_split["test"].train_test_split(test_size=0.5, shuffle=False)

    resplit_sets = (
        ("train", outer_split["train"]),
        ("val", inner_split["train"]),
        ("test", inner_split["test"]),
    )
    for set_name, resplit_set in resplit_sets:
        save_squad_dataset(resplit_set, set_type=set_name)

In [18]:
# Run the whole pipeline: load SQuAD and write the prepared files to disk.
load_squad_dataset_and_save_as_csv()