In [3]:
"""
Reduce and overwrite test datasets.

- Run this script from a folder that sits beside the `Datasets/` folder
  (files are read from `../Datasets`, relative to the working directory)
- Requires: pandas, pyarrow
    pip install pandas pyarrow
"""

from pathlib import Path
from typing import Optional
import pandas as pd

# ---------------------------------------------------------------------------
# CONFIG — number of rows to KEEP (randomly sampled) for each dataset
#   * None            -> leave the file alone (nothing is saved/overwritten)
#   * int (e.g. 500)  -> sample that many rows and OVERWRITE the file
# ---------------------------------------------------------------------------
N_AI2_ARC: Optional[int] = None  # applies to test-ai2_arc.parquet
N_BOOLQ: Optional[int] = None  # applies to test-boolq.parquet
N_SQUADV2: Optional[int] = None  # applies to test-squad_v2.parquet

# Folder holding the parquet files (relative to the working directory)
DATASETS_DIR = Path("../Datasets")

# Parquet file name -> number of rows to keep (None means "do not touch")
targets = {
    "test-ai2_arc.parquet": N_AI2_ARC,
    "test-boolq.parquet": N_BOOLQ,
    "test-squad_v2.parquet": N_SQUADV2,
}

def load_parquet(path: Path) -> pd.DataFrame:
    """Load the parquet file at ``path`` and return its contents as a DataFrame."""
    frame = pd.read_parquet(path)
    return frame

def atomic_overwrite(df: pd.DataFrame, path: Path) -> None:
    """
    Write ``df`` to a temporary sibling file, then swap it in place of ``path``.

    The data is first written to ``<path>.tmp`` in the same directory and only
    then renamed over the original, which reduces the chance of a corrupted
    file if the process is interrupted mid-write.

    If writing or renaming fails, the temporary file is removed so no stale
    ``*.tmp`` file is left next to the dataset; the original exception is
    propagated unchanged.

    Args:
        df: Frame to persist (written without its index).
        path: Destination parquet file to overwrite.
    """
    tmp = path.with_suffix(path.suffix + ".tmp")
    try:
        df.to_parquet(tmp, index=False)
        tmp.replace(path)  # atomic on most OSes; fine for our use case
    finally:
        # After a successful replace the temp name no longer exists, so this
        # only deletes something when the write or the rename failed.
        try:
            tmp.unlink()
        except FileNotFoundError:
            pass

def process(file_name: str, n_sample: Optional[int]) -> None:
    """
    Report on one dataset and, when configured, replace it with a random subset.

    Prints the row/column counts and a seeded preview of up to five rows.
    When ``n_sample`` is a positive int, the file is then overwritten with a
    seeded random sample of at most that many rows; otherwise the file is left
    untouched. A file that does not exist is reported and skipped.
    """
    target = DATASETS_DIR / file_name
    if not target.exists():
        print(f"⚠️  Missing file: {target}")
        return

    frame = load_parquet(target)
    n_rows, n_cols = frame.shape
    print("\n" + "=" * 80)
    print(f"📄 Dataset: {file_name}")
    print(f"🔢 Rows: {n_rows:,} | Columns: {n_cols}")

    # Preview: at most 5 random rows, seeded so repeated runs show the same ones
    print("\n🧪 Sample:")
    preview_size = min(5, n_rows)
    print(frame.sample(n=preview_size, random_state=42))

    # Anything other than a positive int means "leave the file as it is"
    wants_overwrite = isinstance(n_sample, int) and n_sample > 0
    if not wants_overwrite:
        print("\n⏭️  Skipped overwrite (config = None or invalid).")
        return

    # Never ask for more rows than the dataset actually holds
    keep = min(n_sample, n_rows)
    reduced = frame.sample(n=keep, random_state=42)
    reduced = reduced.reset_index(drop=True)
    atomic_overwrite(reduced, target)
    print(f"\n💾 Overwritten: {target} with {keep} rows.")


# Run every configured dataset through the inspect / optional-reduce step
for dataset_file, sample_size in targets.items():
    process(dataset_file, sample_size)


📄 Dataset: test-ai2_arc.parquet
🔢 Rows: 30 | Columns: 3

🧪 Sample:
                title                                           question  \
27    Mercury_7133245  Tonya plugged in a toaster. She placed two sli...   
15    Mercury_7108990  Which object occupies the greatest amount of s...   
23  Mercury_SC_405296  Which change would be harmful to the habitat o...   
17    Mercury_7138390  The number of living organisms in and around a...   
8      TAKS_2009_8_32  Which of the following traits is most influenc...   

   answer  
27      B  
15      A  
23      C  
17      B  
8       A  

⏭️  Skipped overwrite (config = None or invalid).

📄 Dataset: test-boolq.parquet
🔢 Rows: 200 | Columns: 3

🧪 Sample:
                                              question  answer  \
95      is there more than one national treasure movie    True   
15                is alex from 13 reasons why in nerve    True   
30   is age of ultron connected to guardians of the...    True   
158         is boric 