In [2]:
"""
Reduce and overwrite TRAIN datasets.

- Run this from a folder that sits next to the `Datasets/` folder (files are read from and written to `../Datasets`)
- Requires: pandas, pyarrow
    pip install pandas pyarrow
"""

from pathlib import Path
from typing import Optional, Tuple
import pandas as pd

# =========================
# CONFIG
# =========================
# How many rows to KEEP (sample) per dataset?
#   - None → do NOT sample/size-reduce
#   - int (e.g., 50_000) → sample that many rows and OVERWRITE the file in place
#   - zero or negative ints are treated like None by `process` (no sampling)
#   - if the file has fewer rows than requested, all of its rows are kept
N_TRAIN_AI2_ARC: Optional[int]  = 1000      # train-ai2_arc.parquet
N_TRAIN_BOOLQ: Optional[int]    = 1000      # train-boolq.parquet
N_TRAIN_SQUADV2: Optional[int]  = 1000      # train-squad_v2.parquet
N_TRAIN_OPENMATH: Optional[int] = 1000      # train-OpenMathInstruct-2.parquet

# When no sampling is requested, should a 'problem' -> 'question' column rename
# still be written back to the file?
PERSIST_RENAME_IF_CHANGED: bool = True

# Paths (relative, so resolved against the current working directory)
DATASETS_DIR = Path("../Datasets")

# Map file name (inside DATASETS_DIR) -> sampling size for that file
targets = {
    "train-ai2_arc.parquet": N_TRAIN_AI2_ARC,
    "train-boolq.parquet": N_TRAIN_BOOLQ,
    "train-squad_v2.parquet": N_TRAIN_SQUADV2,
    "train-OpenMathInstruct-2.parquet": N_TRAIN_OPENMATH,
}

def load_parquet(path: Path) -> pd.DataFrame:
    """Load the parquet file at ``path`` and return its contents as a DataFrame."""
    frame = pd.read_parquet(path)
    return frame

def atomic_overwrite(df: pd.DataFrame, path: Path) -> None:
    """
    Replace the parquet file at ``path`` with the contents of ``df``.

    The data is first written to a sibling ``<name>.tmp`` file and then moved
    over the original, so an interrupted write does not leave a half-written
    dataset at ``path``.

    Args:
        df: Frame to persist (written without its index).
        path: Destination parquet file; overwritten on success.

    Raises:
        Whatever ``df.to_parquet`` or the final rename raises. In that case the
        temporary file is removed (best effort) and ``path`` is left untouched.
    """
    # Sibling of `path`, so the final rename stays inside the same directory.
    tmp = path.with_suffix(path.suffix + ".tmp")
    try:
        df.to_parquet(tmp, index=False)
        tmp.replace(path)  # atomic on most OSes; fine for our use case
    except BaseException:
        # Also covers KeyboardInterrupt: do not leave a stale *.tmp next to the datasets.
        try:
            tmp.unlink()
        except OSError:
            pass  # nothing to clean up, or cleanup failed; surface the original error
        raise

def rename_problem_to_question(df: pd.DataFrame, file_name: str) -> Tuple[pd.DataFrame, bool]:
    """
    Rename the 'problem' column to 'question' when that is unambiguous.

    Returns a ``(frame, renamed)`` pair:
    - no 'problem' column            -> the input frame, False
    - both 'problem' and 'question'  -> the input frame, False (a warning is printed)
    - only 'problem'                 -> a renamed copy, True
    """
    column_names = df.columns

    # Nothing to rename.
    if "problem" not in column_names:
        return df, False

    # Renaming would create a duplicate 'question' column, so leave the frame alone.
    if "question" in column_names:
        print(f"⚠️  '{file_name}': found both 'problem' and 'question'. Skipping rename to avoid duplicates.")
        return df, False

    print(f"🔤 Renaming column in '{file_name}': problem → question")
    renamed_frame = df.rename(columns={"problem": "question"})
    return renamed_frame, True

def process(file_name: str, n_sample: Optional[int]) -> None:
    """
    Inspect one dataset file and, if requested, shrink it in place.

    Steps: load ``DATASETS_DIR / file_name``, apply the 'problem' -> 'question'
    rename when applicable, print the shape and a preview of up to 5 rows, then
    either overwrite the file with a fixed-seed random sample of ``n_sample``
    rows, overwrite it just to persist the rename, or leave it untouched.
    A missing file is reported and skipped.
    """
    path = DATASETS_DIR / file_name
    if not path.exists():
        print(f"⚠️  Missing file: {path}")
        return

    # Load and normalise the column name in one go.
    df, renamed = rename_problem_to_question(load_parquet(path), file_name)

    rows, cols = df.shape
    separator = "=" * 80
    print("\n" + separator)
    print(f"📄 Dataset: {file_name}")
    print(f"🔢 Rows: {rows:,} | Columns: {cols}")

    # Preview: at most 5 rows, fixed seed so the same rows show on every run.
    print("\n🧪 Sample:")
    preview_size = min(5, rows)
    print(df.sample(n=preview_size, random_state=42))

    # Case 1: a positive sample size was requested -> write the reduced frame.
    sampling_requested = isinstance(n_sample, int) and n_sample > 0
    if sampling_requested:
        take = min(n_sample, rows)
        reduced = df.sample(n=take, random_state=42)
        reduced = reduced.reset_index(drop=True)  # carries the renamed column, if any
        atomic_overwrite(reduced, path)
        print(f"\n💾 Overwritten (sampled): {path} with {take} rows.")
        return

    # Case 2: no sampling, but the rename should still reach the file.
    if renamed and PERSIST_RENAME_IF_CHANGED:
        atomic_overwrite(df, path)
        print(f"\n💾 Overwritten (rename only): {path} with {rows} rows (no sampling).")
        return

    # Case 3: nothing to write.
    print("\n⏭️  Skipped overwrite (no sampling and no rename to persist).")

# Entry point: runs only when this code executes as the main module.
if __name__ == "__main__":
    # Handle every configured file with its own sampling size.
    for fname, n in targets.items():
        process(fname, n)



📄 Dataset: train-ai2_arc.parquet
🔢 Rows: 1,119 | Columns: 3

🧪 Sample:
                    title                                           question  \
243       Mercury_7030083  The day before the class is going to do a lab ...   
101     Mercury_SC_401589  Which mixture contains ingredients that can be...   
961        Mercury_417154  In 2005, a team of scientists discovered a pho...   
1060  Mercury_SC_LBS10384  Vegetables can be scientifically classified by...   
522    TIMSS_2007_8_pg128  A sound is heard when you pluck a string on a ...   

     answer  
243       B  
101       B  
961       B  
1060      D  
522       B  

💾 Overwritten (sampled): ..\Datasets\train-ai2_arc.parquet with 1000 rows.

📄 Dataset: train-boolq.parquet
🔢 Rows: 9,427 | Columns: 3

🧪 Sample:
                                               question  answer  \
8681     did jurassic world fallen kingdom come out yet    True   
2362  has there ever been a host team in the super bowl   False   
6232      is it 

In [2]:
"""
Jupyter helper — suggest `max_length` per dataset by measuring tokenized lengths.

What it does
- Auto-discovers ../Datasets/*.parquet (both train-*.parquet and test-*.parquet)
- Guesses relevant text columns (`question`/`answer` first, otherwise any string-typed columns). Set TEXT_COLS_OVERRIDE to choose them yourself.
- Tokenizes with your model’s tokenizer; falls back to char-based heuristic if needed.
- Reports p50/p90/p95/p98/p99/max and suggested max_length values (cover_90/95/98/near_max).

How to use
- Just run this cell. Optionally adjust the CONFIG block.
"""

# ========================
# CONFIG (edit as needed)
# ========================
from pathlib import Path

DATASETS_DIR = Path("../Datasets")  # folder scanned for parquet files (relative to the working directory)
MODEL_NAME = "Qwen/Qwen3-0.6B"   # tokenizer name handed to AutoTokenizer.from_pretrained
SAMPLE_MAX = 30_000              # max rows to analyze per dataset; larger files are randomly sampled (fixed seed)
TEMPLATE_OVERHEAD = 32           # tokens added by your prompt template/system; added to every row's count
ROUND_MULTIPLE = 128             # round suggestions up to this multiple
TEXT_COLS_OVERRIDE = None        # e.g., ["question","context","choices","response"] or None to auto-guess
GLOB_PATTERN = "*-*.parquet"     # which files to analyze inside DATASETS_DIR (parquet names containing a hyphen)

# ========================
# Implementation
# ========================
import math
import ast
import numpy as np
import pandas as pd

def guess_text_columns(df: pd.DataFrame):
    """
    Pick the columns of ``df`` to treat as text.

    Well-known names ('question', 'answer') win, in that order. If none of them
    is present, every string-typed column is used instead. At most six column
    names are returned.
    """
    preferred = ("question", "answer")
    chosen = [name for name in preferred if name in df.columns]

    if len(chosen) == 0:
        # Fallback: any column pandas considers string-typed, in frame order.
        chosen = [name for name in df.columns if pd.api.types.is_string_dtype(df[name])]

    return chosen[:6]

def _to_list_if_serialized(obj):
    if isinstance(obj, list):
        return obj
    if isinstance(obj, str):
        s = obj.strip()
        if (s.startswith("[") and s.endswith("]")) or (s.startswith("(") and s.endswith(")")):
            try:
                return list(ast.literal_eval(s))
            except Exception:
                return [obj]
    return [obj]

def _row_to_text(row, cols):
    parts = []
    for c in cols:
        if c not in row or pd.isna(row[c]):
            continue
        val = row[c]
        if c == "choices":
            items = _to_list_if_serialized(val)
            parts.append("Choices:\n- " + "\n- ".join(str(x) for x in items))
        else:
            parts.append(f"{c.capitalize()}: {val}")
    return "\n".join(parts).strip()

def _build_texts(df, cols):
    """Lazily yield one rendered text per row of ``df``, using only the columns in ``cols``."""
    selected = df[cols]
    for pair in selected.iterrows():
        # iterrows() gives (index, row); only the row is needed.
        yield _row_to_text(pair[1], cols)

# Token counters already built in this session, keyed by model name.
_TOKEN_COUNTER_CACHE = {}

def load_token_counter(model_name):
    """
    Return a ``count_tokens(text) -> int`` callable for ``model_name``.

    The Hugging Face tokenizer is tried first. If it cannot be imported or
    loaded (for example no internet and no local cache), a character-based
    estimate of about 3.7 characters per token is used instead.

    The callable is built once per model name and reused on later calls, so
    analysing several datasets neither reloads the tokenizer nor repeats the
    status message. Re-run the cell that defines this function to force a
    fresh load.
    """
    cached = _TOKEN_COUNTER_CACHE.get(model_name)
    if cached is not None:
        return cached

    # Try HF tokenizer; if it fails (no internet/cache), fall back to a chars-per-token rule
    try:
        from transformers import AutoTokenizer
        # NOTE(review): trust_remote_code=True lets the model repository run its own
        # Python code while loading. Keep it only for repositories you trust.
        tok = AutoTokenizer.from_pretrained(model_name, use_fast=True, trust_remote_code=True)
        def count_tokens(text: str) -> int:
            return len(tok(text, add_special_tokens=True,
                           return_attention_mask=False,
                           return_token_type_ids=False)["input_ids"])
        print(f"✅ Loaded tokenizer: {model_name}")
    except Exception as e:
        # Deliberate best effort: any failure downgrades to the heuristic.
        print(f"⚠️ Could not load tokenizer ({e}). Using char-based estimate.")
        avg_chars_per_token = 3.7
        def count_tokens(text: str) -> int:
            return int(math.ceil(len(text) / avg_chars_per_token))

    _TOKEN_COUNTER_CACHE[model_name] = count_tokens
    return count_tokens

def round_up(x: float, base: int = 128) -> int:
    """Round ``x`` up to the nearest multiple of ``base``; exact multiples are returned unchanged."""
    multiples_needed = math.ceil(x / base)
    return int(base * multiples_needed)

def analyze_dataset(path: Path, model_name: str, text_cols_override=None,
                    sample_max: int = 30_000, template_overhead: int = 32,
                    round_multiple: int = 128):
    """
    Measure tokenized row lengths for one parquet file and suggest ``max_length`` values.

    Args:
        path: Parquet file to analyze.
        model_name: Tokenizer name passed to ``load_token_counter``.
        text_cols_override: Columns to use; when falsy they are guessed from the data.
        sample_max: If truthy and the file has more rows, a fixed-seed random
            sample of this many rows is analyzed instead of the full file.
        template_overhead: Constant added to every row's token count.
        round_multiple: Suggestions are rounded up to a multiple of this value.

    Returns:
        ``(stats, rec, pretty)``: a dict of length statistics, a dict of suggested
        lengths with the share of rows each one covers, and a one-line summary string.

    Raises:
        ValueError: The file has no rows, or no text columns could be determined.
    """
    frame = pd.read_parquet(path)
    if not len(frame):
        raise ValueError(f"{path.name} is empty.")

    text_cols = text_cols_override if text_cols_override else guess_text_columns(frame)
    if not text_cols:
        raise ValueError(f"Could not guess text columns for {path.name}. Set TEXT_COLS_OVERRIDE.")

    # Cap the amount of work on large files.
    if sample_max:
        if len(frame) > sample_max:
            frame = frame.sample(n=sample_max, random_state=42).reset_index(drop=True)

    count_tokens = load_token_counter(model_name)
    texts = list(_build_texts(frame, text_cols))
    lengths = np.fromiter(
        (count_tokens(text) + template_overhead for text in texts),
        dtype=np.int32,
    )

    # Percentiles, truncated to whole tokens.
    pct = {}
    for q in (50, 90, 95, 98, 99):
        pct[q] = int(np.percentile(lengths, q))

    stats = {
        "dataset": path.name,
        "rows_scanned": len(lengths),
        "min": int(lengths.min()),
        "p50": pct[50],
        "p90": pct[90],
        "p95": pct[95],
        "p98": pct[98],
        "p99": pct[99],
        "max": int(lengths.max()),
        "mean": float(lengths.mean()),
        "std": float(lengths.std()),
        "cols_used": ", ".join(text_cols),
    }

    def _share_within(limit):
        # Percentage of scanned rows whose length fits within `limit`, one decimal.
        fraction = float((lengths <= limit).mean())
        return round(fraction * 100, 1)

    # Candidate max_length values, each rounded up to the configured multiple.
    fit_90 = round_up(stats["p90"], round_multiple)
    fit_95 = round_up(stats["p95"], round_multiple)
    fit_98 = round_up(stats["p98"], round_multiple)
    fit_all = round_up(stats["max"], round_multiple)

    rec = {
        "suggest_cover_90": fit_90,
        "coverage_90_%": _share_within(fit_90),
        "suggest_cover_95": fit_95,
        "coverage_95_%": _share_within(fit_95),
        "suggest_cover_98": fit_98,
        "coverage_98_%": _share_within(fit_98),
        "suggest_near_max": fit_all,
        "coverage_near_max_%": _share_within(fit_all),
        "recommended_max_length": fit_95,  # default recommendation
        "template_overhead": template_overhead,
        "round_multiple": round_multiple,
    }

    # One-line summary meant for copy-paste.
    headline = f"[{path.name}] max_length ≈ {rec['recommended_max_length']}  "
    details = f"(covers ~{rec['coverage_95_%']}% | p95={stats['p95']} | cols={stats['cols_used']})"
    pretty = headline + details

    return stats, rec, pretty

# Run over discovered datasets and show a table
# Files are matched inside DATASETS_DIR and processed in sorted path order.
paths = sorted(DATASETS_DIR.glob(GLOB_PATTERN))
if not paths:
    # Fail loudly instead of producing an empty report.
    raise FileNotFoundError(f"No datasets matched {DATASETS_DIR / GLOB_PATTERN}")

all_rows = []   # one merged stats+recommendation dict per dataset (table rows)
pretties = []   # one-line summaries, in processing order
for p in paths:
    stats, rec, pretty = analyze_dataset(
        p,
        model_name=MODEL_NAME,
        text_cols_override=TEXT_COLS_OVERRIDE,
        sample_max=SAMPLE_MAX,
        template_overhead=TEMPLATE_OVERHEAD,
        round_multiple=ROUND_MULTIPLE,
    )
    all_rows.append({**stats, **rec})
    pretties.append(pretty)

# The table is re-sorted by the dataset name string, so its row order can differ
# from the order of the one-line summaries printed below.
res_df = pd.DataFrame(all_rows).sort_values("dataset").reset_index(drop=True)

# Display results
from IPython.display import display
print("✅ Analysis complete. Recommended `max_length` per dataset (cover_95):")
for line in pretties:
    print(" •", line)

print("\nDetailed table:")
display(res_df)


  from .autonotebook import tqdm as notebook_tqdm


✅ Loaded tokenizer: Qwen/Qwen3-0.6B
✅ Loaded tokenizer: Qwen/Qwen3-0.6B
✅ Loaded tokenizer: Qwen/Qwen3-0.6B
✅ Loaded tokenizer: Qwen/Qwen3-0.6B
✅ Loaded tokenizer: Qwen/Qwen3-0.6B
✅ Loaded tokenizer: Qwen/Qwen3-0.6B
✅ Loaded tokenizer: Qwen/Qwen3-0.6B
✅ Loaded tokenizer: Qwen/Qwen3-0.6B
✅ Analysis complete. Recommended `max_length` per dataset (cover_95):
 • [test-ai2_arc.parquet] max_length ≈ 256  (covers ~100.0% | p95=141 | cols=question, answer)
 • [test-boolq.parquet] max_length ≈ 128  (covers ~100.0% | p95=51 | cols=question, answer)
 • [test-OpenMathInstruct-2.parquet] max_length ≈ 256  (covers ~98.0% | p95=188 | cols=question, answer)
 • [test-squad_v2.parquet] max_length ≈ 128  (covers ~100.0% | p95=60 | cols=question, answer)
 • [train-ai2_arc.parquet] max_length ≈ 256  (covers ~100.0% | p95=138 | cols=question, answer)
 • [train-boolq.parquet] max_length ≈ 128  (covers ~100.0% | p95=51 | cols=question, answer)
 • [train-OpenMathInstruct-2.parquet] max_length ≈ 256  (covers ~9

Unnamed: 0,dataset,rows_scanned,min,p50,p90,p95,p98,p99,max,mean,...,coverage_90_%,suggest_cover_95,coverage_95_%,suggest_cover_98,coverage_98_%,suggest_near_max,coverage_near_max_%,recommended_max_length,template_overhead,round_multiple
0,test-OpenMathInstruct-2.parquet,200,55,93,152,188,231,327,571,107.645,...,98.0,256,98.0,256,98.0,640,100.0,256,32,128
1,test-ai2_arc.parquet,200,56,92,128,141,165,192,226,96.83,...,90.0,256,100.0,256,100.0,256,100.0,256,32,128
2,test-boolq.parquet,200,46,47,50,51,52,52,54,47.835,...,100.0,128,100.0,128,100.0,128,100.0,128,32,128
3,test-squad_v2.parquet,200,42,50,57,60,63,63,65,50.925,...,100.0,128,100.0,128,100.0,128,100.0,128,32,128
4,train-OpenMathInstruct-2.parquet,1000,50,93,141,161,196,265,476,100.785,...,98.9,256,98.9,256,98.9,512,100.0,256,32,128
5,train-ai2_arc.parquet,1000,58,91,123,138,153,161,216,94.463,...,92.7,256,100.0,256,100.0,256,100.0,256,32,128
6,train-boolq.parquet,1000,46,48,50,51,53,54,61,47.964,...,100.0,128,100.0,128,100.0,128,100.0,128,32,128
7,train-squad_v2.parquet,1000,41,51,59,63,67,76,100,52.023,...,100.0,128,100.0,128,100.0,128,100.0,128,32,128
