In [1]:
import pprint
import string
import itertools

import datasets
import pandas as pd
import numpy as np
import torch
import joblib
import scipy.sparse
import sklearn.feature_extraction


# Hub ids of every Calc-X dataset taking part in the leak analysis; all of them
# live under the same namespace and share the "Calc-" prefix.
ds_names = [
    f"anonym-repos/Calc-{short_name}"
    for short_name in (
        "gsm8k",
        "aqua_rat",
        "math_qa",
        "ape210k",
        "mawps",
        "svamp",
        "asdiv_a",
    )
]

In [2]:
# Characters that survive simplification: lowercase ASCII letters and the space.
keep_symbols = set(string.ascii_lowercase + " ")
# (dataset name, split name) -> DataFrame with the question/chain/result columns
# plus the derived "question_simplified" column.
dss = {}
# Every split name encountered across the loaded datasets.
split_names = set()

for full_name in ds_names:
    ds = datasets.load_dataset(full_name, "original-splits")
    # "anonym-repos/Calc-gsm8k" -> "calc-gsm8k"
    ds_name = full_name.split("/")[-1].lower()
    for split_name, split in ds.items():
        split_names.add(split_name)
        key = ds_name, split_name
        # Work on an explicit copy: adding a column to a column-subset of another
        # frame may otherwise trigger pandas' SettingWithCopyWarning.
        frame = split.to_pandas()[["question", "chain", "result"]].copy()
        frame["question_simplified"] = (
            frame["question"]
            .str.encode("ascii", errors="ignore")  # drop non-ASCII characters
            .str.decode("ascii")
            .str.lower()
            .str.split()  # turn every whitespace run into a single space
            .str.join(" ")
            .apply(lambda text: "".join(c for c in text if c in keep_symbols))
            .str.split()  # removed characters can leave repeated spaces behind
            .str.join(" ")
        )
        dss[key] = frame

Downloading readme:   0%|          | 0.00/3.46k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.09k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/4.34k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.58k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

In [3]:
# Binary bag-of-words over unigrams and bigrams: a question is represented by the
# set of n-grams it contains (presence only, no counts).
bow_ngrams_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    ngram_range=(1, 2),
    binary=True,
    dtype=np.int32,
)

# Fit one vocabulary on the simplified questions of every dataset split.
all_simplified_questions = itertools.chain.from_iterable(
    frame["question_simplified"] for frame in dss.values()
)
bow_ngrams_vectorizer.fit(all_simplified_questions)

# (dataset name, split name) -> sparse bag-of-words matrix of that split.
bows = {
    split_key: bow_ngrams_vectorizer.transform(frame["question_simplified"])
    for split_key, frame in dss.items()
}

In [4]:
def pairwise_jaccard_sim(bows_1: scipy.sparse.csr_matrix, bows_2: scipy.sparse.csr_matrix) -> np.ndarray:
    """
    Computes the Jaccard similarity between each row of `bows_1` and each row of `bows_2`.

    Every row is interpreted as a set: its size is the number of stored entries and
    the intersection size is the dot product of two rows. This is only a valid
    Jaccard similarity for binary (0/1) matrices with the same number of columns.

    Args:
        bows_1: sparse matrix of shape (n_1, n_features).
        bows_2: sparse matrix of shape (n_2, n_features).

    Returns:
        Dense float32 array of shape (n_1, n_2) where entry (i, j) is
        |row_i & row_j| / |row_i | row_j|. Pairs whose union is empty get 0.
    """
    sizes_of_1 = bows_1.getnnz(axis=1).astype(np.float32)
    sizes_of_2 = bows_2.getnnz(axis=1).astype(np.float32)
    # Multiply in float32 so that the dense (n_1, n_2) result does not have to be
    # materialised once as integers and then copied again as floats.
    intersect = (bows_1.astype(np.float32) @ bows_2.astype(np.float32).T).toarray()
    union = sizes_of_1.reshape(-1, 1) + sizes_of_2.reshape(1, -1)
    union -= intersect
    # Divide in place, and only where the union is non-empty; this avoids both a
    # further dense allocation and the NaN/inf clean-up after a division by zero.
    has_union = union != 0
    np.divide(intersect, union, out=intersect, where=has_union)
    intersect[~has_union] = 0
    return intersect


def get_highest_k_matches(scores: torch.Tensor, k: int):
    """
    Finds the (up to) `k` largest scores in every row and in every column of `scores`.

    Args:
        scores: 2-D tensor of pairwise scores with shape (n_1, n_2).
        k: number of best matches to keep. It is capped at the number of available
            candidates, because `torch.topk` raises when `k` exceeds the size of
            the dimension it selects from.

    Returns:
        A pair `(top_in_rows, top_in_cols)` of `torch.topk` results, each exposing
        `.values` and `.indices`:
        - `top_in_rows` has shape (n_1, min(k, n_2)); indices are column positions.
        - `top_in_cols` has shape (n_2, min(k, n_1)); indices are row positions.
        The selected entries are not sorted (`sorted=False`).
    """
    n_rows, n_cols = scores.shape
    top_in_rows = torch.topk(scores, k=min(k, n_cols), dim=1, largest=True, sorted=False)
    top_in_cols = torch.topk(scores.T, k=min(k, n_rows), dim=1, largest=True, sorted=False)
    return top_in_rows, top_in_cols


def check_leak(bows_1, bows_2, top_k=10):
    """
    Scores every row of `bows_1` against every row of `bows_2` and keeps the best matches.

    Args:
        bows_1: sparse bag-of-words matrix, passed to `pairwise_jaccard_sim`.
        bows_2: sparse bag-of-words matrix, passed to `pairwise_jaccard_sim`.
        top_k: how many highest-scoring matches to request per row and per column.

    Returns:
        The pair returned by `get_highest_k_matches`: the top matches for each row
        of `bows_1` and the top matches for each row of `bows_2`.
    """
    scores = pairwise_jaccard_sim(bows_1, bows_2)
    # as_tensor wraps the ndarray without copying it; torch.tensor would duplicate
    # the whole dense score matrix just to run topk on it.
    return get_highest_k_matches(torch.as_tensor(scores), k=top_k)

In [5]:
# Every (train split, non-train split) combination across all datasets, including
# pairs that come from two different datasets.
check_leaks = [
    (train_key, eval_key)
    for train_key in dss.keys()
    for eval_key in dss.keys()
    if train_key[1] == "train" and eval_key[1] != "train"
]


pprint.pprint(check_leaks)
print(len(check_leaks))

[(('calc-gsm8k', 'train'), ('calc-gsm8k', 'test')),
 (('calc-gsm8k', 'train'), ('calc-aqua_rat', 'test')),
 (('calc-gsm8k', 'train'), ('calc-aqua_rat', 'validation')),
 (('calc-gsm8k', 'train'), ('calc-math_qa', 'test')),
 (('calc-gsm8k', 'train'), ('calc-math_qa', 'validation')),
 (('calc-gsm8k', 'train'), ('calc-ape210k', 'test')),
 (('calc-gsm8k', 'train'), ('calc-ape210k', 'validation')),
 (('calc-gsm8k', 'train'), ('calc-mawps', 'test')),
 (('calc-gsm8k', 'train'), ('calc-mawps', 'validation')),
 (('calc-gsm8k', 'train'), ('calc-svamp', 'test')),
 (('calc-gsm8k', 'train'), ('calc-asdiv_a', 'test')),
 (('calc-aqua_rat', 'train'), ('calc-gsm8k', 'test')),
 (('calc-aqua_rat', 'train'), ('calc-aqua_rat', 'test')),
 (('calc-aqua_rat', 'train'), ('calc-aqua_rat', 'validation')),
 (('calc-aqua_rat', 'train'), ('calc-math_qa', 'test')),
 (('calc-aqua_rat', 'train'), ('calc-math_qa', 'validation')),
 (('calc-aqua_rat', 'train'), ('calc-ape210k', 'test')),
 (('calc-aqua_rat', 'train'), ('ca

In [6]:
# Run the similarity search for every (train, eval) pair in parallel worker jobs.
with joblib.Parallel(n_jobs=-1) as parallel:
    results = parallel(
        joblib.delayed(check_leak)(bows[train_key], bows[eval_key])
        for train_key, eval_key in check_leaks
    )

# (train key, eval key) -> (top matches per train example, top matches per eval example)
candidates = dict(zip(check_leaks, results))


In [7]:
# Jaccard similarity above which two questions are treated as near-duplicates.
threshold = 0.5
# Set to True to also print a random sample of suspicious eval/train question pairs.
print_examples = False

for (ds_train, ds_eval), (train_sim, eval_sim) in candidates.items():
    # Fraction of characters that survived simplification. When less than half is
    # left, the question is mostly a formula, for example:
    #   Solve 2x + 3x^2 + 8/5 = 1295
    # On those examples, we don't want to check for similarity on words.
    kept_chars_ratio = dss[ds_eval]["question_simplified"].str.len() / dss[ds_eval]["question"].str.len()
    # Convert to a boolean tensor explicitly instead of indexing a tensor with a
    # pandas Series, which depends on implicit sequence conversion.
    is_mostly_formula_problem = torch.tensor((kept_chars_ratio < 0.5).to_numpy(dtype=bool), dtype=torch.bool)
    sus_mask = (eval_sim.values > threshold) # has shape (len_eval, top_k)
    sus_mask[is_mostly_formula_problem] = False
    suspicious_frac = sus_mask.any(dim=1).float().mean().item()
    if suspicious_frac > 0.05:
        print(f"{suspicious_frac:.2%} of {'/'.join(ds_eval):<30} examples appear similar to some examples in {'/'.join(ds_train)}")
        # Despite its name this is a fraction: the share of train examples that have
        # at least one eval match above the threshold (no formula filter applied).
        sus_mask_in_train = (train_sim.values > threshold).any(dim=1).float().mean().item()
        print(f"-> {sus_mask_in_train:.2%} of {'/'.join(ds_train):<27} examples would have to be dropped")
        print()
        if not print_examples:
            continue
        # Coordinates of suspicious entries: the eval row and which of its top-k
        # train matches exceeded the threshold.
        all_sus_eval_idxs, train_nth_similar = sus_mask.nonzero(as_tuple=True)
        # 10 pairs drawn with replacement, so a pair can be printed more than once.
        sample = torch.randint(0, len(all_sus_eval_idxs), (10,))
        sampled_sus_eval_idxs = all_sus_eval_idxs[sample]
        sampled_train_nth_similar = train_nth_similar[sample]
        # Translate "n-th best match" back into row positions of the train split.
        sampled_train_idxs = eval_sim.indices[sampled_sus_eval_idxs, sampled_train_nth_similar]
        # Hand plain numpy index arrays to pandas rather than torch tensors.
        sampled_eval_questions = dss[ds_eval]["question"].iloc[sampled_sus_eval_idxs.numpy()]
        sampled_train_questions = dss[ds_train]["question"].iloc[sampled_train_idxs.numpy()]
        sampled_similarities = eval_sim.values[sampled_sus_eval_idxs, sampled_train_nth_similar]
        for eval_question, train_question, similarity in zip(sampled_eval_questions, sampled_train_questions, sampled_similarities):
            print("  eval: ", eval_question)
            print("  train:", train_question)
            print(f"  {similarity=:.2f}")
            print()

        print()
        print("-" * 100)


29.92% of calc-aqua_rat/test             examples appear similar to some examples in calc-aqua_rat/train
-> 2.20% of calc-aqua_rat/train         examples would have to be dropped

24.80% of calc-aqua_rat/validation       examples appear similar to some examples in calc-aqua_rat/train
-> 0.36% of calc-aqua_rat/train         examples would have to be dropped

98.52% of calc-math_qa/test              examples appear similar to some examples in calc-aqua_rat/train
-> 24.65% of calc-aqua_rat/train         examples would have to be dropped

98.13% of calc-math_qa/validation        examples appear similar to some examples in calc-aqua_rat/train
-> 30.98% of calc-aqua_rat/train         examples would have to be dropped

9.06% of calc-aqua_rat/test             examples appear similar to some examples in calc-math_qa/train
-> 1.76% of calc-math_qa/train          examples would have to be dropped

6.69% of calc-aqua_rat/validation       examples appear similar to some examples in calc-math_qa/tra

In [8]:
# Data leaks:
# aqua_rat train -> math_qa test + validation (whole math_qa seems to be a subset of aqua_rat train) 
# math_qa train -> math_qa test + validation
# aqua_rat train -> aqua_rat test + validation
# ape210k train -> ape210k test + validation
# mawps train -> mawps test + validation + asdiv_a/test

# What will differ in Calc-X collection from original datasets:
# gsm8k: create validation sample from train, because it has none originally
# svamp: it's ok, will be the same
# asdiv: it's ok, will be the same
# mawps: will have filtered train set
# ape210k: will have filtered test + validation sets (both are large enough anyway, and this loses less data than filtering train)
# math_qa: won't have any test and validation set - they are leaked in both math_qa/train and aqua_rat/train 
# aqua_rat: will have filtered train set because we lose little data and validation and test sets are small originally

In [9]:
# For each evaluation split that MAWPS train leaks into: a boolean mask over the
# train examples that have at least one match above the threshold.
mawps_leak_with_test, mawps_leak_with_val, mawps_leak_with_asdiv = (
    (candidates[("calc-mawps", "train"), eval_key][0].values > threshold).any(dim=1)
    for eval_key in (
        ("calc-mawps", "test"),
        ("calc-mawps", "validation"),
        ("calc-asdiv_a", "test"),
    )
)
# A train example is flagged when it leaks into any of the three splits.
mawps_train_leak = mawps_leak_with_test | mawps_leak_with_val | mawps_leak_with_asdiv
print(mawps_train_leak.shape)
mawps_train_leak.float().mean()

torch.Size([3636])


tensor(0.7005)

In [10]:
# Preview the MAWPS train examples that are not flagged as leaked.
mawps_train_keep = ~mawps_train_leak.numpy()
dss["calc-mawps", "train"].loc[mawps_train_keep]

Unnamed: 0,question,chain,result,question_simplified
2,Mark had 2 Doll. Roland proffered him some mor...,"<gadget id=""calculator"">161 - 2</gadget>\n<out...",159,mark had doll roland proffered him some more n...
4,Gloria had some raspberry. Margaret gave him 7...,"<gadget id=""calculator"">33 - 7</gadget>\n<outp...",26,gloria had some raspberry margaret gave him mo...
7,Tina had 7 raspberry . He hash each raspberry ...,"<gadget id=""calculator"">10 * 7</gadget>\n<outp...",70,tina had raspberry he hash each raspberry into...
10,Ernesto had some pear. Jimmie gave him 8 more....,"<gadget id=""calculator"">35 - 8</gadget>\n<outp...",27,ernesto had some pear jimmie gave him more now...
11,Elizabeth had some cherry. Don took 74 from hi...,"<gadget id=""calculator"">74 + 74</gadget>\n<out...",148,elizabeth had some cherry don took from him no...
...,...,...,...,...
3617,Crystal had 178 blackberry. Michael took 55 fr...,"<gadget id=""calculator"">178 - 55</gadget>\n<ou...",123,crystal had blackberry michael took from him n...
3622,Josephine had some blackberry. Katharine gave ...,"<gadget id=""calculator"">44 - 5</gadget>\n<outp...",39,josephine had some blackberry katharine gave h...
3623,"On Friday, Fred paid $5.92 each on 2 tickets t...","<gadget id=""calculator"">2 * 5.92</gadget>\n<ou...",1.37,on friday fred paid each on tickets to a movie...
3625,Jerome had 209 strawberry. Janet grabbed some ...,"<gadget id=""calculator"">209 - 56</gadget>\n<ou...",153,jerome had strawberry janet grabbed some straw...


In [12]:
mawps = datasets.load_dataset("anonym-repos/Calc-mawps", "original-splits")

# Only the train split is filtered: examples flagged in `mawps_train_leak` are
# removed, while validation and test are passed through unchanged.
mawps_train_kept = mawps["train"].to_pandas().loc[~mawps_train_leak.numpy()]
mawps_splits = {
    "train": datasets.Dataset.from_pandas(mawps_train_kept, preserve_index=False),
    "validation": mawps["validation"],
    "test": mawps["test"],
}
mawps_new_split = datasets.DatasetDict(mawps_splits)
# mawps_new_split.push_to_hub("anonym-repos/Calc-mawps")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

In [28]:
# Load the "original-splits" configuration of Calc-asdiv_a. Nothing is filtered
# here; the upload call below is kept commented out.
asdiv = datasets.load_dataset("anonym-repos/Calc-asdiv_a", "original-splits")
# asdiv.push_to_hub("anonym-repos/Calc-asdiv_a")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/752 [00:00<?, ?B/s]

In [29]:
# Load the "original-splits" configuration of Calc-svamp. Nothing is filtered
# here; the upload call below is kept commented out.
svamp = datasets.load_dataset("anonym-repos/Calc-svamp", "original-splits")
# svamp.push_to_hub("anonym-repos/Calc-svamp")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

In [30]:
gsm8k_orig_split = datasets.load_dataset("anonym-repos/Calc-gsm8k", "original-splits")

print("ORIGINAL SPLIT:")
print(gsm8k_orig_split)

# Hold out 200 train examples as a validation split; the fixed seed keeps the
# selection reproducible.
seed = 0
gsm8k_resplit = gsm8k_orig_split["train"].train_test_split(test_size=200, seed=seed)
gsm8k_train_new_split = gsm8k_resplit["train"]
gsm8k_valid_new_split = gsm8k_resplit["test"]

gsm8k_splits = {
    "train": gsm8k_train_new_split,
    "validation": gsm8k_valid_new_split,
    "test": gsm8k_orig_split["test"],
}
gsm8k_new_split = datasets.DatasetDict(gsm8k_splits)
print("NEW SPLIT:")
print(gsm8k_new_split)

ORIGINAL SPLIT:
DatasetDict({
    test: Dataset({
        features: ['id', 'question', 'chain', 'result', 'result_float'],
        num_rows: 1319
    })
    train: Dataset({
        features: ['id', 'question', 'chain', 'result', 'result_float'],
        num_rows: 7473
    })
})
NEW SPLIT:
DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'chain', 'result', 'result_float'],
        num_rows: 7273
    })
    validation: Dataset({
        features: ['id', 'question', 'chain', 'result', 'result_float'],
        num_rows: 200
    })
    test: Dataset({
        features: ['id', 'question', 'chain', 'result', 'result_float'],
        num_rows: 1319
    })
})


In [31]:
# gsm8k_new_split.push_to_hub("anonym-repos/Calc-gsm8k")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/3.46k [00:00<?, ?B/s]

In [34]:
mathqa = datasets.load_dataset("anonym-repos/Calc-math_qa", "original-splits")

# Remove both evaluation splits so that only the train split remains.
for dropped_split in ("validation", "test"):
    del mathqa[dropped_split]

mathqa

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'chain', 'result', 'result_float', 'question_without_options', 'options', 'annotated_formula', 'linear_formula', 'rationale', 'category'],
        num_rows: 20868
    })
})

In [35]:
# mathqa.push_to_hub("anonym-repos/Calc-math_qa")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/4.15k [00:00<?, ?B/s]

In [10]:
# Number of ape210k test examples whose top train matches all stay strictly below
# the similarity threshold (uses the notebook-wide `threshold` instead of a
# repeated literal, so the count follows the same setting as the report).
(candidates[("calc-ape210k", "train"), ("calc-ape210k", "test")][1].values < threshold).all(dim=1).float().sum()

tensor(1785.)

In [36]:
# Boolean masks over the ape210k evaluation examples that are safe to keep: every
# one of their top train matches is strictly below `threshold`, so a similarity
# exactly equal to the threshold counts as a leak here. The notebook-wide
# `threshold` replaces the repeated 0.50 literal so the filter cannot drift away
# from the value used in the report.
ape_train_key = ("calc-ape210k", "train")
ape_test_ok = (candidates[ape_train_key, ("calc-ape210k", "test")][1].values < threshold).all(dim=1)
ape_val_ok = (candidates[ape_train_key, ("calc-ape210k", "validation")][1].values < threshold).all(dim=1)

In [43]:
ape210k = datasets.load_dataset("anonym-repos/Calc-ape210k", "original-splits")

# The train split is kept whole; the validation and test splits keep only the
# examples selected by their respective masks.
ape210k_filtered_eval = {
    split_name: datasets.Dataset.from_pandas(
        ape210k[split_name].to_pandas()[keep_mask.numpy()],
        preserve_index=False,
    )
    for split_name, keep_mask in (("validation", ape_val_ok), ("test", ape_test_ok))
}
ape210k_new_split = datasets.DatasetDict({
    "train": ape210k["train"],
    **ape210k_filtered_eval,
})
ape210k_new_split

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'question_chinese', 'chain', 'result', 'result_float', 'equation'],
        num_rows: 195179
    })
    validation: Dataset({
        features: ['id', 'question', 'question_chinese', 'chain', 'result', 'result_float', 'equation'],
        num_rows: 1783
    })
    test: Dataset({
        features: ['id', 'question', 'question_chinese', 'chain', 'result', 'result_float', 'equation'],
        num_rows: 1785
    })
})

In [44]:
# ape210k_new_split.push_to_hub("anonym-repos/Calc-ape210k")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/196 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/3.62k [00:00<?, ?B/s]

In [51]:
# Boolean mask over the aqua_rat train examples that are safe to keep: all of their
# top matches in both the test and the validation split are strictly below
# `threshold`. The notebook-wide `threshold` replaces the repeated 0.5 literal so
# the filter stays in sync with the value used in the report.
aqua_train_key = ("calc-aqua_rat", "train")
aqua_train_ok = (
    (candidates[aqua_train_key, ("calc-aqua_rat", "test")][0].values < threshold).all(dim=1)
    &
    (candidates[aqua_train_key, ("calc-aqua_rat", "validation")][0].values < threshold).all(dim=1)
)

# (number of train examples kept, total number of train examples)
aqua_train_ok.sum().item(), len(aqua_train_ok)

(94760, 97467)

In [53]:
aqua = datasets.load_dataset("anonym-repos/Calc-aqua_rat", "original-splits")

# Only the train split is filtered with `aqua_train_ok`; validation and test are
# passed through unchanged.
aqua_train_kept = aqua["train"].to_pandas().loc[aqua_train_ok.numpy()]
aqua_splits = {
    "train": datasets.Dataset.from_pandas(aqua_train_kept, preserve_index=False),
    "validation": aqua["validation"],
    "test": aqua["test"],
}
aqua_new_split = datasets.DatasetDict(aqua_splits)
aqua_new_split

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'chain', 'result', 'options', 'question_without_options'],
        num_rows: 94760
    })
    validation: Dataset({
        features: ['id', 'question', 'chain', 'result', 'options', 'question_without_options'],
        num_rows: 254
    })
    test: Dataset({
        features: ['id', 'question', 'chain', 'result', 'options', 'question_without_options'],
        num_rows: 254
    })
})

In [54]:
# aqua_new_split.push_to_hub("anonym-repos/Calc-aqua_rat")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/95 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/5.25k [00:00<?, ?B/s]