In [2]:
import datasets
# Load the "test" split of livecodebench/code_generation_lite and display its summary.
ds = datasets.load_dataset(
    "livecodebench/code_generation_lite",
    split="test",
)
ds


Downloading data:  34%|██████████████████████████████████████▊                                                                          | 430M/1.25G [00:07<00:14, 55.7MB/s][A
Downloading data:  35%|███████████████████████████████████████▋                                                                         | 440M/1.25G [00:07<00:14, 55.7MB/s][A
Downloading data:  36%|████████████████████████████████████████▋                                                                        | 451M/1.25G [00:08<00:14, 56.1MB/s][A
Downloading data:  37%|█████████████████████████████████████████▌                                                                       | 461M/1.25G [00:08<00:13, 57.5MB/s][A
Downloading data:  38%|██████████████████████████████████████████▌                                                                      | 472M/1.25G [00:08<00:13, 56.3MB/s][A
Downloading data:  39%|███████████████████████████████████████████▌                                                    

Dataset({
    features: ['question_title', 'question_content', 'platform', 'question_id', 'contest_id', 'contest_date', 'starter_code', 'difficulty', 'public_test_cases', 'private_test_cases', 'metadata'],
    num_rows: 400
})

In [4]:
import datetime
# Keep only the rows whose contest_date is on or after the cutoff (midnight, 2023-09-30).
cutoff = datetime.datetime(year=2023, month=9, day=30)
ds_decont = ds.filter(lambda row: row["contest_date"] >= cutoff)
ds_decont


Filter:   0%|                                                                                                                                | 0/400 [00:00<?, ? examples/s][A
Filter: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [00:00<00:00, 496.33 examples/s][A


Dataset({
    features: ['question_title', 'question_content', 'platform', 'question_id', 'contest_id', 'contest_date', 'starter_code', 'difficulty', 'public_test_cases', 'private_test_cases', 'metadata'],
    num_rows: 202
})

In [5]:
import json
import pickle
import zlib
import base64


def decode_tests(tests):
    """Decode an encoded test-case blob back into Python objects.

    The decoding stages are applied in this order:
    base64 decode -> zlib decompress -> pickle load -> JSON parse.

    Args:
        tests: base64-encoded payload (anything ``base64.b64decode`` accepts).

    Returns:
        Whatever ``json.loads`` yields for the unpickled JSON document.
    """
    compressed = base64.b64decode(tests)
    pickled = zlib.decompress(compressed)
    # NOTE(review): pickle.loads can execute arbitrary code; only feed this
    # function payloads that come from a trusted source.
    json_document = pickle.loads(pickled)
    return json.loads(json_document)

In [11]:
from tqdm import tqdm
# format we want:
# - question: has prompt
# - starter_code: has starter code, if any
# - difficulty: has difficulty
# - input_output: has tests, with fn_name key if needed
# - title: just for metadata
# - source: just for metadata
# - date: just for metadata
# - id: for unique id
def _convert_example(ex):
    """Build the cleaned record for a single dataset row.

    Private tests (decoded with ``decode_tests``) come first, followed by the
    JSON-decoded public tests. When the row's metadata contains a
    ``func_name`` key, every test input is split on newlines with each line
    JSON-decoded, every output is JSON-decoded, and the function name is
    stored under ``fn_name``. Otherwise inputs and outputs are kept as-is.
    """
    all_cases = decode_tests(ex["private_test_cases"]) + json.loads(ex["public_test_cases"])
    metadata = json.loads(ex["metadata"])
    # The metadata is per-row, so this check does not change between tests.
    is_functional = "func_name" in metadata

    inputs, outputs = [], []
    for case in all_cases:
        case_in, case_out = case["input"], case["output"]
        if is_functional:
            case_in = [json.loads(line) for line in case_in.split("\n")]
            case_out = json.loads(case_out)
        inputs.append(case_in)
        outputs.append(case_out)

    # Key order matters for the serialized string: inputs, outputs, then fn_name.
    io_spec = {"inputs": inputs, "outputs": outputs}
    if is_functional:
        io_spec["fn_name"] = metadata["func_name"]

    return {
        "question": ex["question_content"],
        "starter_code": ex["starter_code"],
        "difficulty": ex["difficulty"],
        "input_output": json.dumps(io_spec),
        "title": ex["question_title"],
        "source": ex["platform"],
        "date": ex["contest_date"],
        "id": ex["question_id"],
    }


def clean_and_push(ds, reponame):
    """Convert every row of ``ds`` to the cleaned format and push it to the Hub.

    Args:
        ds: sized iterable of rows exposing the keys read by the conversion
            (``question_content``, ``starter_code``, ``difficulty``,
            ``public_test_cases``, ``private_test_cases``, ``metadata``,
            ``question_title``, ``platform``, ``contest_date``, ``question_id``).
        reponame: destination repository passed to ``push_to_hub``.

    The cleaned rows are uploaded as the ``"test"`` split; the destination is
    printed right before the upload starts. Returns ``None``.
    """
    records = [_convert_example(ex) for ex in tqdm(ds, total=len(ds))]
    hub_ds = datasets.Dataset.from_list(records)
    print("pushing to: ", reponame)
    hub_ds.push_to_hub(reponame, split="test")

In [None]:
# Clean the on-or-after-cutoff subset and upload it.
clean_and_push(ds=ds_decont, reponame="cassanof/livecodebench_lite_filtered")

In [12]:
# Keep only the rows whose contest_date is strictly before the cutoff (midnight, 2023-09-30).
cutoff = datetime.datetime(year=2023, month=9, day=30)
ds_cont = ds.filter(lambda row: row["contest_date"] < cutoff)
ds_cont

Dataset({
    features: ['question_title', 'question_content', 'platform', 'question_id', 'contest_id', 'contest_date', 'starter_code', 'difficulty', 'public_test_cases', 'private_test_cases', 'metadata'],
    num_rows: 198
})

In [13]:
# Clean the before-cutoff subset and upload it.
clean_and_push(ds=ds_cont, reponame="cassanof/livecodebench_lite_contaminated")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 198/198 [00:18<00:00, 10.56it/s]


pushing to:  cassanof/livecodebench_lite_contaminated


Uploading the dataset shards:   0%|                                                                                                                   | 0/3 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|                                                                                                             | 0/1 [00:00<?, ?ba/s][A
Creating parquet from Arrow format: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.46ba/s][A
Uploading the dataset shards:  33%|███████████████████████████████████▋                                                                       | 1/3 [00:03<00:07,  3.88s/it]
Creating parquet from Arrow format:   0%|                                                                                                             | 0/1 [00:00<?, ?ba/s][A
Creating parquet from Arrow format: 100%|█████████████████████████████████████████████████████████████████████████████████████