# Turn datasets into eval-ready format

In [None]:
import pandas as pd

# ───────▸ EDIT THESE ──────────────────────────────────────────────────────────
parquet_files = [...]  # Parquet files to merge; rows are stacked in list order
output_csv = ""  # Destination path of the merged CSV
# ───────────────────────────────────────────────────────────────────────────────


def _load_parquet(path):
    """Announce and read a single Parquet file into a DataFrame."""
    print(f"Loading {path} …")
    return pd.read_parquet(path)


# Read the files one after another, then stack all rows under a fresh
# 0..n-1 index (the per-file indices are discarded).
dfs = [_load_parquet(path) for path in parquet_files]
combined_df = pd.concat(dfs, ignore_index=True)

n_rows, n_cols = combined_df.shape
print(f"Combined DataFrame has {n_rows} rows and {n_cols} columns.")

# Persist the merged table without the index column.
combined_df.to_csv(output_csv, index=False)
print(f"Wrote CSV to: {output_csv}")


In [None]:
import pandas as pd

# ───────▸ EDIT THESE ──────────────────────────────────────────────────────────
input_csv = ""   # CSV to reformat; must contain the columns read below
output_csv = ""  # where the reformatted CSV is written
# ───────────────────────────────────────────────────────────────────────────────

# Columns of the exported file, in order: the "old" format plus the image hash.
export_columns = ['image_url', 'prompt', 'num_points', 'image_sha256']


def _count_question(label):
    """Phrase a counting question about *label*."""
    return f"How many {label} are in this picture?"


df = pd.read_csv(input_csv)

# Derive the question text from 'label' and expose 'count' under the name
# used by the export layout.
df['prompt'] = df['label'].map(_count_question)
df['num_points'] = df['count']

out_df = df[export_columns]

# Write the result and show the first rows as the cell output.
out_df.to_csv(output_csv, index=False)
print(f"Saved {len(out_df)} rows to {output_csv}")
out_df.head()


In [None]:
import pandas as pd
from pathlib import Path

# ─────── EDIT THESE ────────────────────────────────────────────────────────────
input_files = [] # List of CSV files to combine
output_csv = "dataset.csv"
# ───────────────────────────────────────────────────────────────────────────────

# Every image URL is <prefix><model>/<image_id>.png
image_url_prefix = "https://storage.googleapis.com/geckonum_t2i_benchmark/"


def _to_eval_rows(raw):
    """Convert one raw annotation table into image_url / prompt / answer rows.

    Reads the columns 'answer', 'model', 'image_id' and 'question' of *raw*.
    Rows whose 'answer' is not a whole number (non-numeric text, missing
    values, fractional numbers) are dropped; the remaining answers are stored
    as integers.  *raw* itself is left untouched.
    """
    # (1) only integer answers: unparsable values become NaN and are filtered
    ans_num = pd.to_numeric(raw['answer'], errors='coerce')
    mask = ans_num.notnull() & (ans_num % 1 == 0)
    kept = raw.loc[mask].copy()
    kept['answer'] = ans_num[mask].astype(int)

    # (2) build URL
    kept['image_url'] = (
        image_url_prefix
        + kept['model'] + "/" + kept['image_id'] + ".png"
    )

    # (3) prompt = text of the 'question' column
    kept['prompt'] = kept['question']

    # (4) select columns
    return kept[['image_url', 'prompt', 'answer']]


# Fail early with an actionable message; otherwise pd.concat below would raise
# a ValueError that does not mention the configuration.
if not input_files:
    raise ValueError("input_files is empty: list at least one CSV file to combine")

all_dfs = [_to_eval_rows(pd.read_csv(fn)) for fn in input_files]

# concatenate
combined = pd.concat(all_dfs, ignore_index=True)

# (5) collapse duplicates: for each (image_url, prompt) pair keep the row that
#     comes first in the combined table, i.e. the first annotator's answer
unique = combined.drop_duplicates(subset=['image_url','prompt'], keep='first')

# (6) write out
unique.to_csv(output_csv, index=False)
print(f"Wrote {len(unique)} unique rows to {output_csv}")

# show a sample
unique.head()


In [None]:
import pandas as pd
from pathlib import Path

# ───────▸ EDIT THESE ──────────────────────────────────────────────────────────
csv_path    = Path("/cluster/project/sachan/pmlr/grounding-vlms/eval/datasets/TallyQA/original_dataset.csv")       # CSV listing the images to keep
col_name    = "file_name"                     # column with paths relative to images_dir (e.g. 'val2014/XXX.jpg')
images_dir  = Path("/cluster/project/sachan/pmlr/grounding-vlms/eval/datasets/TallyQA/images")   # root folder; contains subfolders like 'val2014/'
exts        = {".jpg", ".png"}                # only these suffixes are considered; None/empty means every file
dry_run     = False                         # True: only report; False: really delete
# ───────────────────────────────────────────────────────────────────────────────

# Collect the relative paths that must survive the clean-up.
df = pd.read_csv(csv_path)
if col_name not in df.columns:
    raise ValueError(f"Column {col_name!r} not found in {csv_path}")
allowed = set(df[col_name].astype(str))

# Visit everything below images_dir; anything that is not a regular file, or
# whose (lower-cased) suffix is filtered out, is left alone.
for img_path in images_dir.rglob("*"):
    if not img_path.is_file() or (exts and img_path.suffix.lower() not in exts):
        continue

    # Forward-slash path relative to the root, comparable to the CSV entries.
    rel = img_path.relative_to(images_dir).as_posix()
    if rel in allowed:
        continue

    # Not listed in the CSV: report it, and remove it unless this is a dry run.
    if dry_run:
        print(f"[DRY RUN] would delete: {rel}")
        continue
    print(f"Deleting: {rel}")
    img_path.unlink()

print("Done.")


In [None]:
import pandas as pd

# Input CSV and destination of the annotated copy  (← change to your own files)
source_csv = '/Users/timeilers/Documents/ETH/Material/AICP/Results/data/FSC-147_dataset.csv'
target_csv = '/Users/timeilers/Documents/ETH/Material/AICP/Results/data/new_FSC-147_dataset.csv'

# Captures the phrase between "How many" and "are in this picture";
# prompts that do not match the pattern yield a missing value.
subject_pattern = r'How many\s+(.+?)\s+are in this picture'

df = pd.read_csv(source_csv)
df['subject'] = df['prompt'].str.extract(subject_pattern)

# Save the table, now including the 'subject' column, to a new CSV.
df.to_csv(target_csv, index=False)


In [None]:
import re
import pandas as pd
import spacy

# spaCy's small English pipeline, used here for noun-chunk parsing.
nlp = spacy.load("en_core_web_sm")

source_csv = '/Users/timeilers/Documents/ETH/Material/AICP/Results/data/TallyQA_dataset.csv'
target_csv = '/Users/timeilers/Documents/ETH/Material/AICP/Results/data/new_TallyQA_dataset.csv'


def extract_head(text):
    """Return the root word of the first noun chunk headed by a noun.

    *text* is parsed with the module-level ``nlp`` pipeline.  Noun chunks are
    scanned in document order and the text of the first chunk root tagged
    NOUN or PROPN is returned; ``None`` is returned when no chunk qualifies.
    """
    chunk_roots = (chunk.root for chunk in nlp(text).noun_chunks)
    return next(
        (root.text for root in chunk_roots if root.pos_ in ("NOUN", "PROPN")),
        None,
    )


df = pd.read_csv(source_csv)

# Label every prompt with its head noun and store the result as a new CSV.
df["label"] = df["prompt"].apply(extract_head)
df.to_csv(target_csv, index=False)

In [None]:
# Sampling for TallyQA
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd

source_csv = '/cluster/project/sachan/pmlr/grounding-vlms/eval/datasets/TallyQA/original_dataset.csv'
sample_csv = '/cluster/project/sachan/pmlr/grounding-vlms/eval/datasets/TallyQA/dataset_sampled.csv'

# Columns that exist only to define the strata; removed again before saving.
helper_cols = ['strata', 'prefix']

df = pd.read_csv(source_csv)

# 'prefix' is the part of file_name before the first '/'.  A stratum is the
# combination of that prefix, the ground-truth value and the label.
df['prefix'] = df['file_name'].str.split('/', n=1).str[0]
df['strata'] = df['prefix'] + '___' + df['truth'].astype(str) + '___' + df['label']

# One reproducible split; its 10% "test" side is the sample that gets saved.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
splits = splitter.split(df, df['strata'])
train_idx, sample_idx = next(splits)

sample_df = (
    df.iloc[sample_idx]
    .drop(columns=helper_cols)
    .reset_index(drop=True)
)
sample_df.to_csv(sample_csv, index=False)

In [None]:
# Sampling for GeckoNum
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd

source_csv = '/cluster/project/sachan/pmlr/grounding-vlms/eval/datasets/GeckoNum/original_dataset.csv'
sample_csv = '/cluster/project/sachan/pmlr/grounding-vlms/eval/datasets/GeckoNum/dataset.csv'

df = pd.read_csv(source_csv)

# One reproducible split stratified on the 'truth' column; its 30% "test"
# side is the sample that gets saved.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
splits = splitter.split(df, df['truth'])
train_idx, sample_idx = next(splits)

sample_df = df.iloc[sample_idx].reset_index(drop=True)
sample_df.to_csv(sample_csv, index=False)

In [14]:
import pandas as pd

results_csv = '/cluster/project/sachan/pmlr/grounding-vlms/eval/valid_results/o4-mini/TallyQA_results.csv'
deduped_csv = '/cluster/project/sachan/pmlr/grounding-vlms/eval/valid_results/o4-mini/TallyQA_results_new.csv'

df = pd.read_csv(results_csv)

# Mark every row whose 'idx' occurs again further down the file, so that only
# the last occurrence of each idx survives.  This is a single vectorized pass
# instead of re-filtering and copying the whole frame once per duplicated idx.
# Rows with a missing idx are never treated as duplicates of one another.
is_stale = df['idx'].notna() & df.duplicated(subset='idx', keep='last')

# Report the rows that are removed, one group per idx, in file order.
for idx, stale_rows in df[is_stale].groupby('idx', sort=False):
    print(f"Deleted rows for idx {idx}:")
    print(stale_rows)

df = df[~is_stale]
df.to_csv(deduped_csv, index=False)


Deleted rows for idx 15024:
         idx  result  raw_result
15021  15024       0           0
Deleted rows for idx 8284:
       idx  result  raw_result
8281  8284       0           0
Deleted rows for idx 2404:
       idx  result  raw_result
2401  2404       0           0
Deleted rows for idx 10456:
         idx  result  raw_result
10453  10456       0           0
Deleted rows for idx 2594:
       idx  result  raw_result
2591  2594       0           0
Deleted rows for idx 5762:
       idx  result  raw_result
5759  5762       0           0
Deleted rows for idx 11164:
         idx  result  raw_result
11161  11164       0           0
Deleted rows for idx 9789:
       idx  result  raw_result
9786  9789       0           0
Deleted rows for idx 14301:
         idx  result  raw_result
14298  14301       0           0
Deleted rows for idx 21029:
         idx  result  raw_result
21026  21029       0           0
Deleted rows for idx 8038:
       idx  result  raw_result
8035  8038       0         