In [1]:
!git clone --depth 1 https://github.com/google-deepmind/AQuA.git
!ls -la AQuA

import json, os, random
from pathlib import Path

Cloning into 'AQuA'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 11 (delta 2), reused 9 (delta 2), pack-reused 0 (from 0)[K
Receiving objects: 100% (11/11), 30.30 MiB | 24.54 MiB/s, done.
Resolving deltas: 100% (2/2), done.
total 97532
drwxr-xr-x 3 root root     4096 Jan  1 21:36 .
drwxr-xr-x 1 root root     4096 Jan  1 21:36 ..
-rw-r--r-- 1 root root     1452 Jan  1 21:36 CONTRIBUTING
-rw-r--r-- 1 root root   132008 Jan  1 21:36 dev.json
-rw-r--r-- 1 root root   142236 Jan  1 21:36 dev.tok.json
drwxr-xr-x 8 root root     4096 Jan  1 21:36 .git
-rw-r--r-- 1 root root      552 Jan  1 21:36 LICENSE
-rw-r--r-- 1 root root     1812 Jan  1 21:36 README.md
-rw-r--r-- 1 root root   130192 Jan  1 21:36 test.json
-rw-r--r-- 1 root root   139683 Jan  1 21:36 test.tok.json
-rw-r--r-- 1 root root 47570935 Jan  1 21:36 train.json
-rw-r--r-- 1 root root 51721975 Jan  1 21:36 train.tok.js

In [2]:
# Directory of the cloned AQuA checkout; split files are looked up directly inside it.
DATA_DIR = Path("AQuA")

def _load_jsonl(path: Path):
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                data.append(json.loads(line))
    return data

# The repo typically contains: train.json, dev.json, test.json (JSONL format)
splits = {}
for name in ["train", "dev", "test"]:
    p = DATA_DIR / f"{name}.json"
    if p.exists():
        splits[name] = _load_jsonl(p)
        print(f"Loaded {name}: {len(splits[name])} examples from {p}")
    else:
        print(f"Could not find {p}. Listing repo contents:")
        !find AQuA -maxdepth 2 -type f | sed 's|^| - |'
        break

Loaded train: 97467 examples from AQuA/train.json
Loaded dev: 254 examples from AQuA/dev.json
Loaded test: 254 examples from AQuA/test.json


In [3]:
# Pull one training record at random and print its fields for a quick look at the schema.
ex = random.choice(splits["train"])

print("\nKeys:", ex.keys())
for heading, field in (
    ("\nQuestion:\n", "question"),
    ("\nOptions:\n", "options"),
    ("\nCorrect:\n", "correct"),
):
    print(heading, ex.get(field))

# Show only the first 500 characters of the rationale; the "..." suffix is always printed.
rationale_preview = ex.get("rationale", "")[:500]
print("\nRationale (often present):\n", rationale_preview, "...")


Keys: dict_keys(['question', 'options', 'rationale', 'correct'])

Question:
 In an election between two candidates first candidate got 50% of votes polled and second Candidate got 2800 votes. The total number of votes polled was?

Options:
 ['A)5600 votes', 'B)5800 votes', 'C)5900 votes', 'D)5980 votes', 'E)5990 votes']

Correct:
 A

Rationale (often present):
 Total = 100 %,
First person got 50%
second person got remaining 50 % of votes.
than 50 % = 2800
50% = 50Ã—56 = 2800
100% =100Ã—56 =5600 votes
A ...


In [4]:
# Sample 100 examples (deterministic seed)
# The module-level RNG is re-seeded immediately before the draw, so the same 100
# records are selected on every run; the two statements must stay in this order.
random.seed(42)
# random.sample draws without replacement from the training split.
sample100 = random.sample(splits["train"], 100)

# Normalize into a simple MCQ schema
def to_mcq(ex):
    """Map a raw AQuA record onto a flat multiple-choice-question dict.

    Args:
        ex: Mapping with the keys ``"question"``, ``"options"`` and
            ``"correct"``; ``"rationale"`` is optional.

    Returns:
        A new dict with the keys ``"question"``, ``"choices"``, ``"answer"``
        and ``"rationale"`` (in that order). ``"choices"`` is the record's
        option list (e.g. ``["A) ...", "B) ...", ...]``), ``"answer"`` is the
        correct option label (e.g. ``"A"``), and ``"rationale"`` is ``None``
        when the record has no such key.

    Raises:
        KeyError: If ``"question"``, ``"options"`` or ``"correct"`` is missing.
    """
    question, choices, answer = ex["question"], ex["options"], ex["correct"]
    rationale = ex.get("rationale")  # optional field; None when absent
    return {
        "question": question,
        "choices": choices,
        "answer": answer,
        "rationale": rationale,
    }

# Convert every sampled record to the MCQ schema.
sample100_mcq = list(map(to_mcq, sample100))

# Save for reuse: one JSON object per line, with non-ASCII characters written as-is.
out_path = "aqua_rat_sample100.jsonl"
with open(out_path, "w", encoding="utf-8") as f:
    for x in sample100_mcq:
        # print() supplies the trailing newline that terminates each record.
        print(json.dumps(x, ensure_ascii=False), file=f)

print("Saved:", out_path, "with", len(sample100_mcq), "examples")
# Pretty-print the first record, capped at 1000 characters to keep the output short.
first_example_json = json.dumps(sample100_mcq[0], indent=2, ensure_ascii=False)
print("First example:\n", first_example_json[:1000])


Saved: aqua_rat_sample100.jsonl with 100 examples
First example:
 {
  "question": "In how many different ways can the letters of the word 'MATHEMATICS' be arranged so that the vowels always come together ?",
  "choices": [
    "A)120960",
    "B)135650",
    "C)115850",
    "D)142560",
    "E)185260"
  ],
  "answer": "A",
  "rationale": "In the word 'MATHEMATICS' we treat the two vowels AEAI as one letter. Thus,we have MTHMTCS (AEAI)\nNow, we have to arranged 8 letters, out of which M occurs twice, T occurs twice and the rest are different.\nNumber of ways of arranging these letters= 8!/(2!)(2!)=10080.\nNow AEAI has 4 letters in which A occurs 2 times and the rest are different.\nnumber of ways of arranging these letters =4!/2!=12.\nRequired number of ways =(10080*12)=120960.\nAnswer is A"
}


In [None]:
# Sanity-check the export: show the file's size, then its first two JSONL records.
!ls -lh aqua_rat_sample100.jsonl
!head -n 2 aqua_rat_sample100.jsonl

-rw-r--r-- 1 root root 48K Jan  1 21:37 aqua_rat_sample100.jsonl
{"question": "In how many different ways can the letters of the word 'MATHEMATICS' be arranged so that the vowels always come together ?", "choices": ["A)120960", "B)135650", "C)115850", "D)142560", "E)185260"], "answer": "A", "rationale": "In the word 'MATHEMATICS' we treat the two vowels AEAI as one letter. Thus,we have MTHMTCS (AEAI)\nNow, we have to arranged 8 letters, out of which M occurs twice, T occurs twice and the rest are different.\nNumber of ways of arranging these letters= 8!/(2!)(2!)=10080.\nNow AEAI has 4 letters in which A occurs 2 times and the rest are different.\nnumber of ways of arranging these letters =4!/2!=12.\nRequired number of ways =(10080*12)=120960.\nAnswer is A"}
{"question": "an operation * is defined by the equation:\na*b=a-b/a+b, for all numbers a and b such that a is not equal to -b, if a is not equal to -t and a*t=0, then =?", "choices": ["A)a) -a", "B)b) -1/a", "C)c) 1/a", "D)d)0", "E)