In [1]:
# Input locations (relative paths, resolved against the working directory).
eval_stats_path = "eval_stats.csv"  # CSV with the per-dataset evaluation statistics
contam_stats_dir = "./my_stats"  # directory the contamination CSVs are read from

In [2]:
import pandas as pd


In [3]:
# Read the evaluation statistics CSV, report how many rows (datasets) it has,
# and show the frame as the cell's output.
eval_stats = pd.read_csv(filepath_or_buffer=eval_stats_path)
print(f"Loaded {len(eval_stats)} evaluation datasets")
eval_stats


Loaded 217 evaluation datasets


Unnamed: 0,eval_name,questions,answers,passages,min_q_len,avg_q_len,max_q_len,min_a_len,avg_a_len,max_a_len,min_p_len,avg_p_len,max_p_len
0,agi_eval_aqua_rat,1524,1524,0,12,192,491,4.0,39.0,830.0,,,
1,agi_eval_gaokao_english,1224,1224,1224,23,59,162,7.0,36.0,113.0,307.0,1808.0,4047.0
2,agi_eval_logiqa_en,2604,2604,2604,35,86,231,4.0,90.0,499.0,44.0,407.0,1004.0
3,agi_eval_lsat_ar,1150,1150,1150,21,98,283,4.0,40.0,189.0,449.0,593.0,771.0
4,agi_eval_lsat_lr,2550,2550,2550,24,95,311,24.0,123.0,401.0,108.0,387.0,797.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
212,winogrande,86864,83330,0,58,99,190,2.0,6.0,31.0,,,
213,xstest,450,450,0,12,44,99,4.0,5.0,6.0,,,
214,zebra_logic_grid,1000,1000,0,394,1223,2946,92.0,268.0,601.0,,,
215,zebra_logic_mc,13242,13242,13242,47,51,57,3.0,6.0,19.0,394.0,1531.0,2946.0


In [4]:
# Read the overall contamination metrics from the stats directory and show them.
summary_csv = f"{contam_stats_dir}/summary.csv"
summary = pd.read_csv(summary_csv)
print("Contamination Summary:")
summary


Contamination Summary:


Unnamed: 0,metric,value
0,training_docs_contaminated,600601
1,total_contamination_instances,1626141
2,unique_eval_instances,350487


In [5]:
# Read the per-suite table of contaminated training documents, report how many
# suites it lists, and preview the first ten rows.
training_docs_csv = f"{contam_stats_dir}/training_docs_by_suite.csv"
training_docs_by_suite = pd.read_csv(training_docs_csv)
print(f"Training documents contaminated across {len(training_docs_by_suite)} eval suites")
training_docs_by_suite.head(n=10)


Training documents contaminated across 108 eval suites


Unnamed: 0,eval_suite,training_docs_contaminated
0,squad_v2,213312
1,squad,188275
2,gsm8k,120644
3,drop,72176
4,trivia_qa,69218
5,jeopardy,29693
6,hendrycks_math_algebra,9640
7,hendrycks_math_intermediate_algebra,9303
8,hellaswag,7949
9,hendrycks_math_prealgebra,6319


In [6]:
# Read the per-suite table of unique contaminated eval instances, report how
# many suites it lists, and preview the first five rows.
eval_instances_csv = f"{contam_stats_dir}/eval_instances_by_suite.csv"
eval_instances_by_suite = pd.read_csv(eval_instances_csv)
print(f"Unique eval instances found across {len(eval_instances_by_suite)} suites")
eval_instances_by_suite.head(n=5)


Unique eval instances found across 108 suites


Unnamed: 0,eval_suite,unique_eval_instances
0,squad_v2,117371
1,squad,81718
2,drop,58140
3,trivia_qa,24606
4,jeopardy,11651


In [7]:
# Read the n-gram match distribution table and show it.
# NOTE(review): in the output recorded for this cell, one data row reads
# "match_range" / "contamination_instances", which looks like a second header
# embedded in the file (summary statistics above it, bucket counts below).
# If so, `value` is not purely numeric - confirm before doing arithmetic on it.
ngram_dist_csv = f"{contam_stats_dir}/ngram_distribution.csv"
ngram_dist = pd.read_csv(ngram_dist_csv)
print("N-gram Match Distribution:")
ngram_dist


N-gram Match Distribution:


Unnamed: 0,statistic,value
0,min_matches,1
1,max_matches,12442
2,median_matches,12
3,avg_matches,24.0
4,match_range,contamination_instances
5,1-5,184855
6,6-10,517583
7,11-20,460863
8,21-50,336295
9,51-100,103200


In [8]:
# Recap of everything loaded so far: a banner, then one numbered line per
# table giving its row count.
banner = "=" * 60
print(banner)
print("LOADED DATASETS SUMMARY")
print(banner)
print(f"\n1. Eval Stats: {len(eval_stats)} evaluation datasets")
print(f"2. Contamination Summary: {len(summary)} metrics")
print(f"3. Training Docs by Suite: {len(training_docs_by_suite)} eval suites")
print(f"4. Eval Instances by Suite: {len(eval_instances_by_suite)} eval suites")
print(f"5. N-gram Distribution: {len(ngram_dist)} rows")
print("\nAll CSV files loaded successfully!")


LOADED DATASETS SUMMARY

1. Eval Stats: 217 evaluation datasets
2. Contamination Summary: 3 metrics
3. Training Docs by Suite: 108 eval suites
4. Eval Instances by Suite: 108 eval suites
5. N-gram Distribution: 13 rows

All CSV files loaded successfully!


In [9]:
# Percent of each eval suite's instances that were found in the training data.
#
# `eval_instances_by_suite` supplies the number of unique contaminated instances
# per suite; `eval_stats` supplies the total number of questions per suite (its
# 'eval_name' column holds the suite name). The left join keeps every suite from
# `eval_instances_by_suite`, including those with no row in `eval_stats`.
top_n = 10  # rows to display; also drives the printed heading so the two cannot drift

merged = pd.merge(
    eval_instances_by_suite,
    eval_stats[['eval_name', 'questions']],
    left_on='eval_suite',
    right_on='eval_name',
    how='left',
)

# A suite with no match gets NaN for 'eval_name' and 'questions', hence a NaN
# percentage, which sort_values() places last. Report those suites explicitly
# instead of letting them silently drop out of the ranking.
unmatched_suites = merged.loc[merged['eval_name'].isna(), 'eval_suite'].tolist()
if unmatched_suites:
    print(
        f"Warning: {len(unmatched_suites)} suites have no matching eval_name in "
        f"eval_stats (percent is NaN): {unmatched_suites}"
    )

# Contaminated instances as a percentage of the suite's total questions.
merged['percent_contaminated'] = merged['unique_eval_instances'] / merged['questions'] * 100

# Rank suites from most to least contaminated and show the top rows.
pct_table = merged[['eval_suite', 'unique_eval_instances', 'questions', 'percent_contaminated']].sort_values(
    'percent_contaminated', ascending=False
)
print(f"Percent of eval instances contaminated by suite (top {top_n}):")
display(pct_table.head(top_n))


Percent of eval instances contaminated by suite (top 10):


Unnamed: 0,eval_suite,unique_eval_instances,questions,percent_contaminated
46,bbh_causal_judgement,181,187.0,96.791444
5,gsm8k,7472,8792.0,84.986351
1,squad,81718,98169.0,83.242164
0,squad_v2,117371,142192.0,82.544025
44,tulu3_do_anything_now,219,300.0,73.0
2,drop,58140,86946.0,66.869091
34,gpqa_extended,311,546.0,56.959707
19,hendrycks_math_geometry,732,1349.0,54.262417
14,hendrycks_math_intermediate_algebra,1188,2198.0,54.049136
21,hendrycks_math_precalculus,666,1292.0,51.547988


In [10]:
# Pull out the "drop_mc" suite's entry from the ranked percentage table.
is_drop_mc = pct_table['eval_suite'].eq("drop_mc")
drop_mc_row = pct_table.loc[is_drop_mc]
print("Row for eval_suite == 'drop_mc':")
display(drop_mc_row)


Row for eval_suite == 'drop_mc':


Unnamed: 0,eval_suite,unique_eval_instances,questions,percent_contaminated
51,drop_mc,130,6352.0,2.046599
