In [1]:
!pip install datasets



In [4]:
# Full script (HF datasets) with robust SQuAD2.0 unanswerable detection
# Outputs JSONL lines like: {"id": ..., "input": ..., "output": ..., "split": "train"|"dev"}

import json, random
from datasets import load_dataset

# Label written as the "output" of every unanswerable question.
NO_ANS = "<NO-ANSWER>"
# Seed for the random.Random instance that drives all sampling and shuffling.
SEED = 42

# Dataset 1: number of answerable rows sampled from SQuAD 1.1 train.
DATASET1_SIZE = 10000
# Dataset 2: answerable rows re-sampled from Dataset 1, plus unanswerable rows
# sampled from SQuAD 2.0 train.
DATASET2_ANS_SIZE = 5000
DATASET2_NOANS_SIZE = 5000
# Dataset 4: answerable / unanswerable rows sampled from the validation splits.
DATASET4_ANS_SIZE = 1000
DATASET4_NOANS_SIZE = 1000

# Output paths; each file is overwritten when the script runs.
OUT_DATASET1 = "dataset1.jsonl"
OUT_DATASET2 = "dataset2.jsonl"
OUT_DATASET4 = "dataset4.jsonl"


def write_jsonl(path, rows, split):
    """
    Write `rows` to `path` as JSON Lines (one JSON object per line).

    Each row must provide the keys "id", "question" and "label"; they are
    emitted as "id", "input" and "output" respectively, together with the
    given `split` tag. Any existing file at `path` is overwritten, and an
    empty `rows` produces an empty file.
    """
    # Pin the encoding so the file does not depend on the platform default.
    # json.dumps escapes non-ASCII characters by default, so the bytes written
    # are the same as before on any ASCII-compatible locale.
    with open(path, "w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps({
                "id": r["id"],
                "input": r["question"],
                "output": r["label"],
                "split": split
            }) + "\n")


def squad_answerable_rows(hf_ds):
    """
    Collect answerable questions from a SQuAD 1.1 style split.

    For every example whose answers["text"] has a first entry that is
    non-blank after stripping, emit {"id", "question", "label"} where the
    label is that stripped first answer. Examples with a missing, non-dict
    or empty answers field are skipped.
    """
    collected = []
    for example in hf_ds:
        answer_field = example.get("answers", {})
        candidates = (
            answer_field.get("text", []) if isinstance(answer_field, dict) else []
        )
        # Only the first listed answer is used; a None entry counts as blank.
        first_answer = (candidates[0] or "").strip() if candidates else ""
        if first_answer:
            collected.append({
                "id": example["id"],
                "question": example["question"].strip(),
                "label": first_answer,
            })
    return collected


def squad_v2_unanswerable_rows(hf_ds):
    """
    Collect unanswerable questions from a SQuAD 2.0 style split.

    An example counts as unanswerable when either signal is present:
      - a truthy ex["is_impossible"] (True/1), or
      - an empty answers["text"] list (a missing or non-dict answers field
        is treated as empty).
    Each match is emitted as {"id", "question", "label"} with label NO_ANS.
    """
    unanswerable = []
    for example in hf_ds:
        answer_field = example.get("answers", {})
        if isinstance(answer_field, dict):
            answer_texts = answer_field.get("text", [])
        else:
            answer_texts = []

        # Both signals are always computed; either one is sufficient.
        flagged = bool(example.get("is_impossible", False))
        no_texts = len(answer_texts) == 0
        if not (flagged or no_texts):
            continue

        unanswerable.append({
            "id": example["id"],
            "question": example["question"].strip(),
            "label": NO_ANS,
        })
    return unanswerable


def main():
    """
    Build Dataset 1, Dataset 2 and Dataset 4 and write them as JSONL files.

    Loads rajpurkar/squad and rajpurkar/squad_v2, extracts answerable rows
    from the former and unanswerable rows from the latter, then samples with
    a random.Random seeded by SEED. Raises ValueError when a source pool is
    smaller than the requested sample size. The order of sampling/shuffling
    calls determines the output, so it must not be rearranged.
    """
    sampler = random.Random(SEED)

    # -------- Fetch both corpora from Hugging Face (train + validation) --------
    squad_v1 = load_dataset("rajpurkar/squad")
    squad_v2 = load_dataset("rajpurkar/squad_v2")

    # (Optional) schema probe: show which unanswerable signals this build exposes.
    probe = squad_v2["train"][0]
    probe_answers = probe.get("answers", {})
    if isinstance(probe_answers, dict):
        probe_texts = probe_answers.get("text", [])
    else:
        probe_texts = []
    print("SQuAD2 train example keys:", list(probe.keys()))
    print("SQuAD2 train example has is_impossible:", "is_impossible" in probe)
    print("SQuAD2 train example answers[text] len:", len(probe_texts))

    train_answerable = squad_answerable_rows(squad_v1["train"])
    dev_answerable = squad_answerable_rows(squad_v1["validation"])
    train_unanswerable = squad_v2_unanswerable_rows(squad_v2["train"])
    dev_unanswerable = squad_v2_unanswerable_rows(squad_v2["validation"])

    for caption, pool in (
        ("SQuAD 1.1 train answerables:", train_answerable),
        ("SQuAD 1.1 validation answerables:", dev_answerable),
        ("SQuAD 2.0 train unanswerables:", train_unanswerable),
        ("SQuAD 2.0 validation unanswerables:", dev_unanswerable),
    ):
        print(caption, len(pool))

    # -------- Dataset 1 (train): answerables sampled from SQuAD 1.1 train --------
    n_train_answerable = len(train_answerable)
    if n_train_answerable < DATASET1_SIZE:
        raise ValueError(
            f"Not enough SQuAD 1.1 train answerables for Dataset 1. "
            f"Have {n_train_answerable}, need {DATASET1_SIZE}."
        )
    dataset1 = sampler.sample(train_answerable, DATASET1_SIZE)
    write_jsonl(OUT_DATASET1, dataset1, split="train")
    print(f"Wrote Dataset 1: {len(dataset1)} -> {OUT_DATASET1}")

    # -------- Dataset 2 (train): subset of Dataset 1 + SQuAD 2.0 train unanswerables --------
    if len(dataset1) < DATASET2_ANS_SIZE:
        raise ValueError("Dataset 1 too small to sample answerables for Dataset 2.")
    n_train_unanswerable = len(train_unanswerable)
    if n_train_unanswerable < DATASET2_NOANS_SIZE:
        raise ValueError(
            f"Not enough SQuAD 2.0 train unanswerables for Dataset 2. "
            f"Have {n_train_unanswerable}, need {DATASET2_NOANS_SIZE}.\n"
            f"If this is unexpected, check the printed schema signals above."
        )

    d2_answerable = sampler.sample(dataset1, DATASET2_ANS_SIZE)
    d2_unanswerable = sampler.sample(train_unanswerable, DATASET2_NOANS_SIZE)

    dataset2 = d2_answerable + d2_unanswerable
    sampler.shuffle(dataset2)
    write_jsonl(OUT_DATASET2, dataset2, split="train")
    print(f"Wrote Dataset 2: {len(dataset2)} -> {OUT_DATASET2} "
          f"(answerable={len(d2_answerable)}, unanswerable={len(d2_unanswerable)})")

    # -------- Dataset 4 (dev held-out): validation splits only --------
    n_dev_answerable = len(dev_answerable)
    if n_dev_answerable < DATASET4_ANS_SIZE:
        raise ValueError(
            f"Not enough SQuAD 1.1 validation answerables for Dataset 4. "
            f"Have {n_dev_answerable}, need {DATASET4_ANS_SIZE}."
        )
    n_dev_unanswerable = len(dev_unanswerable)
    if n_dev_unanswerable < DATASET4_NOANS_SIZE:
        raise ValueError(
            f"Not enough SQuAD 2.0 validation unanswerables for Dataset 4. "
            f"Have {n_dev_unanswerable}, need {DATASET4_NOANS_SIZE}."
        )

    d4_answerable = sampler.sample(dev_answerable, DATASET4_ANS_SIZE)
    d4_unanswerable = sampler.sample(dev_unanswerable, DATASET4_NOANS_SIZE)

    dataset4 = d4_answerable + d4_unanswerable
    sampler.shuffle(dataset4)
    write_jsonl(OUT_DATASET4, dataset4, split="dev")
    print(f"Wrote Dataset 4: {len(dataset4)} -> {OUT_DATASET4} "
          f"(answerable={len(d4_answerable)}, unanswerable={len(d4_unanswerable)})")


# Run the dataset build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


SQuAD2 train example keys: ['id', 'title', 'context', 'question', 'answers']
SQuAD2 train example has is_impossible: False
SQuAD2 train example answers[text] len: 1
SQuAD 1.1 train answerables: 87599
SQuAD 1.1 validation answerables: 10570
SQuAD 2.0 train unanswerables: 43498
SQuAD 2.0 validation unanswerables: 5945
Wrote Dataset 1: 10000 -> dataset1.jsonl
Wrote Dataset 2: 10000 -> dataset2.jsonl (answerable=5000, unanswerable=5000)
Wrote Dataset 4: 2000 -> dataset4.jsonl (answerable=1000, unanswerable=1000)
