In [4]:
"""
Reduce and overwrite test datasets.

- Run this script from a directory that sits beside the `Datasets/` folder
  (files are resolved as `../Datasets` relative to the working directory)
- Requires: pandas, pyarrow
    pip install pandas pyarrow
"""

from pathlib import Path
from typing import Optional, Tuple
import pandas as pd

# -----------------------------------------------------------------------------
# Configuration
# -----------------------------------------------------------------------------
# Row budget per dataset:
#   * None          -> keep the file's rows as they are (no sampling)
#   * positive int  -> randomly keep that many rows and OVERWRITE the file
N_AI2_ARC: Optional[int] = None  # test-ai2_arc.parquet
N_BOOLQ: Optional[int] = None  # test-boolq.parquet
N_SQUADV2: Optional[int] = None  # test-squad_v2.parquet
N_OPENMATH: Optional[int] = None  # test-OpenMathInstruct-2.parquet

# When True, a column rename is written back to disk even if no sampling
# was requested for that file.
PERSIST_RENAME_IF_CHANGED: bool = True

# Folder holding the parquet files (relative to the working directory).
DATASETS_DIR = Path("../Datasets")

# Parquet file name -> number of rows to keep (None disables sampling).
# Insertion order is the processing order.
targets = {
    "test-ai2_arc.parquet": N_AI2_ARC,
    "test-boolq.parquet": N_BOOLQ,
    "test-squad_v2.parquet": N_SQUADV2,
    "test-OpenMathInstruct-2.parquet": N_OPENMATH,
}

def load_parquet(path: Path) -> pd.DataFrame:
    """
    Load the parquet file at `path` and return its contents as a DataFrame.

    Thin wrapper around `pd.read_parquet`; any error it raises propagates.
    """
    frame = pd.read_parquet(path)
    return frame

def atomic_overwrite(df: pd.DataFrame, path: Path) -> None:
    """
    Replace the parquet file at `path` with the contents of `df`.

    The frame is first written to a sibling temporary file
    (`<name><suffix>.tmp`) and then moved over the original, so an
    interrupted write does not leave a truncated file at `path`.

    If writing or moving fails, the temporary file is removed (best effort)
    and the original exception is re-raised; `path` is left untouched.

    Parameters:
        df: frame to persist (written without its index).
        path: destination parquet file to overwrite.
    """
    tmp = path.with_suffix(path.suffix + ".tmp")
    try:
        df.to_parquet(tmp, index=False)
        tmp.replace(path)  # atomic on most OSes; fine for our use case
    except BaseException:
        # Do not leave a stale/partial temp file next to the dataset.
        # BaseException so Ctrl-C during a long write is cleaned up as well.
        try:
            tmp.unlink()
        except OSError:
            # Temp file was never created (or cannot be removed); the
            # original error below is the one worth reporting.
            pass
        raise

def rename_problem_to_question(df: pd.DataFrame, file_name: str) -> Tuple[pd.DataFrame, bool]:
    """
    Rename the 'problem' column to 'question' when that is safe to do.

    Returns a `(frame, renamed)` pair:
    - no 'problem' column: the input frame, False.
    - both 'problem' and 'question' present: the input frame, False
      (a warning is printed; renaming would create duplicate columns).
    - otherwise: a renamed copy of the frame, True.

    `file_name` is only used in the printed messages.
    """
    column_names = df.columns

    # Guard 1: nothing to rename.
    if "problem" not in column_names:
        return df, False

    # Guard 2: target name already taken.
    if "question" in column_names:
        print(f"⚠️  '{file_name}': found both 'problem' and 'question'. Skipping rename to avoid duplicates.")
        return df, False

    print(f"🔤 Renaming column in '{file_name}': problem → question")
    renamed_frame = df.rename(columns={"problem": "question"})
    return renamed_frame, True

def process(file_name: str, n_sample: Optional[int]) -> None:
    """
    Report on one dataset file and, if requested, shrink it in place.

    Steps:
    1. Resolve `file_name` under DATASETS_DIR; print a warning and return
       if the file is missing.
    2. Load it and rename 'problem' -> 'question' where applicable.
    3. Print the shape and a random preview of up to 5 rows (seed 42).
    4. Overwrite the file when `n_sample` is a positive int (sampled rows),
       or when only a rename happened and PERSIST_RENAME_IF_CHANGED is set;
       otherwise leave the file untouched.
    """
    target = DATASETS_DIR / file_name
    if not target.exists():
        print(f"⚠️  Missing file: {target}")
        return

    # Load, then normalise the column name before anything is reported.
    frame, was_renamed = rename_problem_to_question(load_parquet(target), file_name)

    n_rows, n_cols = frame.shape
    banner = "\n" + "=" * 80
    print(banner)
    print(f"📄 Dataset: {file_name}")
    print(f"🔢 Rows: {n_rows:,} | Columns: {n_cols}")

    # Preview: never ask for more rows than exist; fixed seed keeps it stable.
    print("\n🧪 Sample:")
    preview = frame.sample(n=min(5, n_rows), random_state=42)
    print(preview)

    # Case 1: sampling requested -> write the reduced frame (rename included).
    sampling_requested = isinstance(n_sample, int) and n_sample > 0
    if sampling_requested:
        keep = min(n_sample, n_rows)
        reduced = frame.sample(n=keep, random_state=42)
        reduced = reduced.reset_index(drop=True)
        atomic_overwrite(reduced, target)
        print(f"\n💾 Overwritten (sampled): {target} with {keep} rows.")
        return

    # Case 2: no sampling, but a rename that should be persisted.
    if was_renamed and PERSIST_RENAME_IF_CHANGED:
        atomic_overwrite(frame, target)
        print(f"\n💾 Overwritten (rename only): {target} with {n_rows} rows (no sampling).")
        return

    # Case 3: nothing to write.
    print("\n⏭️  Skipped overwrite (no sampling and no rename to persist).")

# Run every configured dataset through `process`, in the order listed in `targets`.
for dataset_file, keep_rows in targets.items():
    process(dataset_file, keep_rows)



📄 Dataset: test-ai2_arc.parquet
🔢 Rows: 200 | Columns: 3

🧪 Sample:
                 title                                           question  \
95     Mercury_7042945  A student measures the acceleration of a stone...   
15     Mercury_7108990  Which object occupies the greatest amount of s...   
30     Mercury_7030555  Four significantly different household applian...   
158     MEAP_2005_5_15  Organisms have many methods of surviving winte...   
128  NAEP_2005_8_S11+1  Household appliances convert electricity into ...   

    answer  
95       B  
15       A  
30       C  
158      B  
128      C  

⏭️  Skipped overwrite (no sampling and no rename to persist).

📄 Dataset: test-boolq.parquet
🔢 Rows: 200 | Columns: 3

🧪 Sample:
                                              question  answer  \
95      is there more than one national treasure movie    True   
15                is alex from 13 reasons why in nerve    True   
30   is age of ultron connected to guardians of the...    True