In [1]:
# Cell 1: Imports + paths
from pathlib import Path
import json

# Input and output both live in the same dataset-instance directory, so the
# directory is spelled out once and each file path is derived from it.
INSTANCE_DIR = Path("data/instances/bike_sharing_10k/seed_42")
manifest_path = INSTANCE_DIR / "manifest.json"
out_path = INSTANCE_DIR / "questions_and_answers.json"

In [2]:
# Cell 2: Load manifest.json
# Decode the file explicitly as UTF-8 rather than with the platform's locale
# encoding: a bare read_text() would mis-decode (or fail on) non-ASCII text in
# the manifest on systems whose default encoding is not UTF-8.
manifest = json.loads(manifest_path.read_text(encoding="utf-8"))

# Displayed as the cell result: the manifest's top-level keys and the number of
# entries under "phenomena" (0 when that key is absent).
manifest.keys(), len(manifest.get("phenomena", []))

(dict_keys(['dataset_instance_id', 'seed', 'base_dataset', 'phenomena']), 3)

In [3]:
# Cell 3: Extract questions + answers (in order)
# Flatten every phenomenon's question entries, in manifest order, into two
# parallel lists: questions[k] belongs with answers[k].
questions = []
answers = []

for phenomenon in manifest.get("phenomena", []):
    entries = phenomenon.get("questions", [])

    if isinstance(entries, dict):
        # Backward compatibility: a dict mapping question -> answer.
        qa_pairs = list(entries.items())
    elif isinstance(entries, list):
        # Expected layout: a list of objects like
        # {"question": "...", "answer": ..., ...}.
        # Items that are not dicts or have no "question" key are skipped;
        # a missing "answer" is recorded as None.
        qa_pairs = [
            (entry["question"], entry.get("answer"))
            for entry in entries
            if isinstance(entry, dict) and "question" in entry
        ]
    else:
        # Any other value contributes no questions.
        qa_pairs = []

    questions.extend(pair[0] for pair in qa_pairs)
    answers.extend(pair[1] for pair in qa_pairs)

# Displayed as the cell result: both lengths, which should always match.
len(questions), len(answers)

(9, 9)

In [4]:
# Cell 4: Write questions_and_answers.json
# The output is a single JSON object holding the two parallel lists.
payload = dict(questions=questions, answers=answers)

# Create the target directory first (no error if it already exists), then
# serialize with 2-space indentation and write the text in one go.
out_path.parent.mkdir(parents=True, exist_ok=True)
payload_json = json.dumps(payload, indent=2)
with out_path.open("w") as out_file:
    out_file.write(payload_json)

print(f"Wrote: {out_path}")

Wrote: data/instances/bike_sharing_10k/seed_42/questions_and_answers.json


In [5]:
# Cell 5: Quick sanity check (show first few)
# Print at most the first five question/answer pairs, numbered from 1, with a
# blank line after each pair.
preview_count = min(5, len(questions))
for number, question in enumerate(questions[:preview_count], start=1):
    print(f"{number}. Q: {question}")
    print(f"   A: {answers[number - 1]}")
    print()

1. Q: Which column (excluding row_id) has the highest fraction of missing values? Return the single column name.
   A: workingday

2. Q: Considering only columns ['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered', 'cnt'], how many rows have at least one missing value? Return the count.
   A: 2919

3. Q: Considering only columns ['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered', 'cnt'], which row_id has the largest number of missing values? If tied, return the smallest row_id.
   A: 7405

4. Q: Which column is a duplicate of 'temp'?
   A: feature_0429

5. Q: Which pair of columns are identical for all rows (excluding row_id)? Return the two column names sorted alphabetically.
   A: ['feature_0429', 'temp']

