In [1]:
import datasets
import pandas as pd
import os
from question_answering.paths import extractive_qa_paths

datasets.logging.set_verbosity_error()
datasets.logging.disable_progress_bar()

In [2]:
def preprocess_medical_dataset_sample(sample):
    sample["answer_text"] = sample["answers"]["text"][0]
    sample["answer_start"] = sample["answers"]["answer_start"][0]

    return sample


def save_medical_dataset_as_csv(dataset, set_type):
    dataset = dataset.map(preprocess_medical_dataset_sample)
    dataset = dataset.remove_columns(["document_id", "answers"])

    dataset_df = dataset.to_pandas()
    dataset_df = dataset_df.dropna()

    path = extractive_qa_paths.medical_dataset_dir
    dir_exists = os.path.exists(path)
    if not dir_exists:
        os.makedirs(path)
    dataset_df.to_csv(path / f"{set_type}.csv", index=True, index_label="index")


def load_medical_dataset_and_save_as_csv():
    medical_dataset = datasets.load_dataset("covid_qa_deepset")["train"]
    medical_dataset = medical_dataset.shuffle(seed=42)
    train_valtest_dataset = medical_dataset.train_test_split(
        test_size=0.3, shuffle=False
    )
    val_test_dataset = train_valtest_dataset["test"].train_test_split(
        test_size=0.5, shuffle=False
    )

    train_dataset = train_valtest_dataset["train"]
    val_dataset = val_test_dataset["train"]
    test_dataset = val_test_dataset["test"]

    save_medical_dataset_as_csv(train_dataset, set_type="train")
    save_medical_dataset_as_csv(val_dataset, set_type="val")
    save_medical_dataset_as_csv(test_dataset, set_type="test")

In [3]:
def preprocess_squad_dataset_sample(sample):
    sample["answer_text"] = sample["answers"]["text"][0]
    sample["answer_start"] = sample["answers"]["answer_start"][0]

    return sample


def save_squad_dataset_as_csv(dataset, set_type):
    dataset = dataset.map(preprocess_squad_dataset_sample)
    dataset = dataset.remove_columns(["title", "answers"])

    dataset_df = dataset.to_pandas()
    dataset_df = dataset_df.dropna()

    path = extractive_qa_paths.squad_dataset_dir
    dir_exists = os.path.exists(path)
    if not dir_exists:
        os.makedirs(path)
    dataset_df.to_csv(path / f"{set_type}.csv", index=True, index_label="index")


def load_squad_dataset_and_save_as_csv():
    squad_dataset = datasets.load_dataset("squad")
    train_temp_pd = squad_dataset["train"].to_pandas()
    validation_temp_pd = squad_dataset["validation"].to_pandas()
    combined_temp_pd = pd.concat((train_temp_pd, validation_temp_pd))
    squad_dataset = datasets.Dataset.from_pandas(combined_temp_pd, preserve_index=False)
    squad_dataset = squad_dataset.shuffle(seed=42)
    train_valtest_dataset = squad_dataset.train_test_split(test_size=0.3, shuffle=False)
    val_test_dataset = train_valtest_dataset["test"].train_test_split(
        test_size=0.5, shuffle=False
    )

    train_dataset = train_valtest_dataset["train"]
    val_dataset = val_test_dataset["train"]
    test_dataset = val_test_dataset["test"]

    save_squad_dataset_as_csv(train_dataset, set_type="train")
    save_squad_dataset_as_csv(val_dataset, set_type="val")
    save_squad_dataset_as_csv(test_dataset, set_type="test")

In [4]:
load_medical_dataset_and_save_as_csv()

In [5]:
load_squad_dataset_and_save_as_csv()

FileNotFoundError: No (supported) data files or dataset script found in squad. 