# Turn datasets into eval-ready format

In [None]:
import pandas as pd

# ───────▸ EDIT THESE ──────────────────────────────────────────────────────────
parquet_files = [...] # List of Parquet files to combine
output_csv = "" # Path to output CSV file
# ───────────────────────────────────────────────────────────────────────────────


def _load_parquet(path):
    """Announce and read a single Parquet file, returning its DataFrame."""
    print(f"Loading {path} …")
    return pd.read_parquet(path)


# One DataFrame per input file, in the order listed above
dfs = [_load_parquet(path) for path in parquet_files]

# Stack the frames vertically; ignore_index=True discards the per-file indexes
combined_df = pd.concat(dfs, ignore_index=True)
n_rows, n_cols = combined_df.shape
print(f"Combined DataFrame has {n_rows} rows and {n_cols} columns.")

# Persist the combined table without writing the index as a column
combined_df.to_csv(output_csv, index=False)
print(f"Wrote CSV to: {output_csv}")


In [None]:
import pandas as pd

# ───────▸ EDIT THESE ──────────────────────────────────────────────────────────
input_csv   = ""      # your input file
output_csv  = ""    # where to save the reformatted CSV
# ───────────────────────────────────────────────────────────────────────────────


def _count_prompt(label):
    """Phrase the counting question for a single label value."""
    return f"How many {label} are in this picture?"


# 1. Read the source table
df = pd.read_csv(input_csv)

# 2. Derive the question text from 'label' and expose 'count' as 'num_points'
df['prompt'] = df['label'].apply(_count_prompt)
df['num_points'] = df['count']

# 3. Keep only the output columns, in exactly this order
columns_out = ['image_url', 'prompt', 'num_points', 'image_sha256']
out_df = df[columns_out]

# 4. Write the result, report the row count, and preview the first rows
out_df.to_csv(output_csv, index=False)
print(f"Saved {len(out_df)} rows to {output_csv}")
out_df.head()


In [None]:
import pandas as pd
from pathlib import Path

# ─────── EDIT THESE ────────────────────────────────────────────────────────────
input_files = [] # List of CSV files to combine
output_csv = "dataset.csv"
# ───────────────────────────────────────────────────────────────────────────────

# Combine several annotation CSVs into one table with the columns
# image_url / prompt / answer, keeping only whole-number answers and one row
# per (image_url, prompt) pair.

# Fail early with an actionable message: pd.concat() on an empty list raises
# a terse "No objects to concatenate" ValueError.
if not input_files:
    raise ValueError("input_files is empty; list at least one CSV file to combine")

# Columns every input file must provide for the steps below
required_cols = ['answer', 'model', 'image_id', 'question']

all_dfs = []
for fn in input_files:
    df = pd.read_csv(fn)

    # Name the offending file instead of surfacing a bare KeyError later
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise KeyError(f"{fn} is missing required column(s): {missing}")

    # (1) only integer answers: non-numeric values become NaN and are dropped,
    #     as are numbers with a fractional part
    ans_num = pd.to_numeric(df['answer'], errors='coerce')
    mask = ans_num.notna() & (ans_num % 1 == 0)
    df = df.loc[mask].copy()
    df['answer'] = ans_num[mask].astype(int)

    # (2) build URL: <bucket>/<model>/<image_id>.png
    #     NOTE(review): plain string concatenation, so 'model' and 'image_id'
    #     must have been parsed as text — confirm for purely numeric ids.
    df['image_url'] = (
        "https://storage.googleapis.com/geckonum_t2i_benchmark/"
        + df['model'] + "/" + df['image_id'] + ".png"
    )

    # (3) prompt = text of the 'question' column
    df['prompt'] = df['question']

    # (4) select columns
    all_dfs.append(df[['image_url', 'prompt', 'answer']])

# concatenate all files under a fresh index
combined = pd.concat(all_dfs, ignore_index=True)

# (5) collapse duplicates: for each (image_url, prompt) pair keep the first
#     row in input order
unique = combined.drop_duplicates(subset=['image_url', 'prompt'], keep='first')

# (6) write out
unique.to_csv(output_csv, index=False)
print(f"Wrote {len(unique)} unique rows to {output_csv}")

# show a sample
unique.head()


In [None]:
import pandas as pd
from pathlib import Path

# ───────▸ EDIT THESE ──────────────────────────────────────────────────────────
csv_path    = Path("")       # CSV with column of relative paths
col_name    = "file_name"                     # that column (e.g. 'val2014/XXX.jpg')
images_dir  = Path("")   # contains subfolders like 'val2014/'
exts        = {".jpg", ".png"}                # restrict to these extensions or None
dry_run     = False                         # True: print only; False: actually delete
# ───────────────────────────────────────────────────────────────────────────────

# Delete every file under images_dir whose path (relative to images_dir, with
# forward slashes) is not listed in column col_name of the CSV.
# WARNING: with dry_run = False the deletion is permanent.

# Load allowed relative‑paths
df = pd.read_csv(csv_path)
if col_name not in df.columns:
    raise ValueError(f"Column {col_name!r} not found in {csv_path}")
allowed = set(df[col_name].astype(str))

# An empty allow-list would make every matching file a deletion candidate;
# treat that as a configuration mistake rather than wiping the directory.
# Dry runs are still permitted so the effect can be inspected.
if not allowed and not dry_run:
    raise ValueError(
        f"Column {col_name!r} in {csv_path} has no rows; "
        f"refusing to delete every image under {images_dir}"
    )

# rglob() on something that is not an existing directory yields nothing, so a
# mistyped images_dir would otherwise end in a misleading "Done.".
if not images_dir.is_dir():
    raise NotADirectoryError(f"images_dir is not an existing directory: {images_dir}")

# Walk recursively
for img_path in images_dir.rglob("*"):
    if not img_path.is_file():
        continue
    # Suffix comparison is case-insensitive on the file side only; entries in
    # exts are expected to be lower-case. A falsy exts disables the filter.
    if exts and img_path.suffix.lower() not in exts:
        continue

    # Compute path *relative* to the root, with forward‑slashes
    rel = img_path.relative_to(images_dir).as_posix()

    # If that rel‑path isn't in CSV, delete (or dry‑run)
    if rel not in allowed:
        if dry_run:
            print(f"[DRY RUN] would delete: {rel}")
        else:
            print(f"Deleting: {rel}")
            img_path.unlink()

print("Done.")
