In [1]:
from pathlib import Path

import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:


# Registry of the source parquet files, keyed by dataset name and then by
# split ("train" / "test"). Every value is an `hf://` URL that is handed to
# `pd.read_parquet`. For squad_v2 and boolq the "test" entry points at the
# upstream *validation* file.
datasets = {
    "squad_v2": {
        "train": "hf://datasets/rajpurkar/squad_v2/squad_v2/train-00000-of-00001.parquet",
        "test": "hf://datasets/rajpurkar/squad_v2/squad_v2/validation-00000-of-00001.parquet",
    },
    "ai2_arc": {
        "train": "hf://datasets/allenai/ai2_arc/ARC-Challenge/train-00000-of-00001.parquet",
        "test": "hf://datasets/allenai/ai2_arc/ARC-Challenge/test-00000-of-00001.parquet",
    },
    "boolq": {
        "train": "hf://datasets/google/boolq/data/train-00000-of-00001.parquet",
        "test": "hf://datasets/google/boolq/data/validation-00000-of-00001.parquet",
    },
}

def clean_data(dataset_name: str, df, type_of_dataset: str):
    match dataset_name:
        case "squad_v2":
            df["answers"] = df["answers"].apply(
                lambda x: x["text"][0] if len(x.get("text")) > 0 else ""
            )
            df.rename(
                columns={
                    "answers": "answer",
                },
                inplace=True,
            )
            df.drop(columns="id", inplace=True)
        case "ai2_arc":
            df["choices"] = df["choices"].apply(
                lambda x: " ".join(
                    [f"{label}. {text}" for label, text in zip(x["label"], x["text"])]
                )
            )
            df["question"] = df["question"] + " " + df["choices"]
            df.rename(
                columns={
                    "id": "title",
                    "answerKey": "answer",
                },
                inplace=True,
            )
            df.drop(columns="choices", inplace=True)
        case "boolq":
            df.rename(
                columns={
                    "passage": "context",
                },
                inplace=True,
            )

    df.to_parquet(f"../Datasets/{type_of_dataset}-{dataset_name}.parquet")

In [3]:
# Read every registered split from its parquet URL and pass it to
# `clean_data`, which writes the cleaned result to disk.
for dataset_name, dataset in datasets.items():
    for type_of_dataset, parquet_url in dataset.items():
        df = pd.read_parquet(parquet_url)
        clean_data(dataset_name, df, type_of_dataset)

In [4]:
# Build a small numeric-answer subset of OpenMathInstruct-2 and split it into
# train/test parquet files.
N_SAMPLES = 10_000      # number of streamed examples to keep
TRAIN_FRACTION = 0.8    # share of the filtered rows that goes to the train split
SPLIT_SEED = 42         # fixed seed so the split is reproducible
OUTPUT_DIR = Path("../Datasets")

dataset = load_dataset("nvidia/OpenMathInstruct-2", split="train", streaming=True)

# Take only the first N_SAMPLES examples without downloading the whole dataset.
subset = list(dataset.take(N_SAMPLES))

df = (
    pd.DataFrame(subset)
    # Drop columns that are not needed (ignored when absent).
    .drop(columns=["generated_solution", "problem_source"], errors="ignore")
    # Use the same "answer" column name as the other exported datasets.
    .rename(columns={"expected_answer": "answer"})
)

# Keep only rows whose answer is a finite number. Parsing with `to_numeric`
# (instead of a `str.isdigit` check) also keeps negative and scientific-notation
# values, and rejects digit-like Unicode characters that `float()` cannot parse.
answer_numeric = pd.to_numeric(df["answer"], errors="coerce")
is_finite_number = answer_numeric.notna() & answer_numeric.abs().ne(float("inf"))

# `.assign` builds a new frame, which avoids writing into a filtered slice
# (SettingWithCopyWarning).
df = df.loc[is_finite_number].assign(
    answer=answer_numeric[is_finite_number].astype(float)
)

# Split into train and test (80% train, 20% test with the defaults above).
train_df = df.sample(frac=TRAIN_FRACTION, random_state=SPLIT_SEED)
test_df = df.drop(train_df.index)
assert len(train_df) + len(test_df) == len(df)

# Save to disk, creating the output folder if needed.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
train_df.to_parquet(OUTPUT_DIR / "train-OpenMathInstruct-2.parquet")
test_df.to_parquet(OUTPUT_DIR / "test-OpenMathInstruct-2.parquet")