In [1]:
import datasets
# Load the "test" split of the LiveCodeBench code_generation_lite dataset.
# trust_remote_code=True is passed explicitly: the library warns that this flag
# will become mandatory for this dataset, and setting it keeps a fresh
# "Restart & Run All" from stopping on a confirmation message.
# NOTE(security): this opts in to running the dataset repository's loading
# script, so only use it with a repository you trust.
ds = datasets.load_dataset(
    "livecodebench/code_generation_lite",
    split="test",
    trust_remote_code=True,
)
ds

  from .autonotebook import tqdm as notebook_tqdm
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Dataset({
    features: ['question_title', 'question_content', 'platform', 'question_id', 'contest_id', 'contest_date', 'starter_code', 'difficulty', 'public_test_cases', 'private_test_cases', 'metadata'],
    num_rows: 400
})

In [2]:
import datetime
def parse_date(date):
    """Parse a timestamp string such as ``"2023-09-30T00:00:00"``.

    Args:
        date: Text in the ``YYYY-MM-DDTHH:MM:SS`` layout.

    Returns:
        The corresponding naive ``datetime.datetime``.

    Raises:
        ValueError: If ``date`` does not match the expected layout.
    """
    return datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")

In [3]:
import datetime
# Keep only the problems whose contest date falls on or after the cutoff.
cutoff = datetime.datetime(year=2023, month=9, day=30)
ds_decont = ds.filter(lambda row: cutoff <= parse_date(row["contest_date"]))
ds_decont

Dataset({
    features: ['question_title', 'question_content', 'platform', 'question_id', 'contest_id', 'contest_date', 'starter_code', 'difficulty', 'public_test_cases', 'private_test_cases', 'metadata'],
    num_rows: 202
})

In [4]:
import json
import pickle
import zlib
import base64


def decode_tests(tests):
    """Decode an encoded test-case blob into Python objects.

    The encoding layers are undone one at a time: base64 text ->
    zlib-compressed bytes -> pickled object -> JSON document.

    NOTE(security): ``pickle.loads`` can execute arbitrary code while
    unpickling, so this must only be called on data from a trusted source.

    Args:
        tests: The base64-encoded blob (``str`` or ``bytes``).

    Returns:
        Whatever the innermost JSON document deserializes to.
    """
    compressed = base64.b64decode(tests)
    pickled = zlib.decompress(compressed)
    serialized = pickle.loads(pickled)
    return json.loads(serialized)

In [5]:
# Peek at the first problem's public tests as stored in the dataset
# (the output below shows them as a JSON-encoded string).
ds[0]["public_test_cases"]

'[{"input": "6\\nabc\\nacb\\nbac\\nbca\\ncab\\ncba\\n", "output": "YES\\nYES\\nYES\\nNO\\nNO\\nYES\\n", "testtype": "stdin"}]'

In [6]:
# Sanity-check decode_tests() on the first problem's private tests; the output
# below is a list of dicts with "input", "output" and "testtype" keys.
decode_tests(ds[0]["private_test_cases"])

[{'input': '1\nabc\n', 'output': 'YES\n', 'testtype': 'stdin'},
 {'input': '3\nabc\nabc\nabc\n',
  'output': 'YES\nYES\nYES\n',
  'testtype': 'stdin'},
 {'input': '5\ncab\nacb\ncba\nbac\nbca\n',
  'output': 'NO\nYES\nYES\nYES\nNO\n',
  'testtype': 'stdin'},
 {'input': '6\nabc\nabc\nabc\nabc\nabc\nabc\n',
  'output': 'YES\nYES\nYES\nYES\nYES\nYES\n',
  'testtype': 'stdin'}]

In [7]:
from tqdm import tqdm
# format we want:
# - question: has prompt
# - starter_code: has starter code, if any
# - difficulty: has difficulty
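# - input_output: has all tests (private + public), with fn_name key if needed
# - public_input_output: has only the public tests, same layout as input_output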
# - title: just for metadata
# - source: just for metadata
# - date: just for metadata
# - id: for unique id
def _convert_tests(raw_tests, metadata):
    """Reshape raw test dicts into the ``{"inputs": [...], "outputs": [...]}`` layout.

    Args:
        raw_tests: Iterable of dicts with ``"input"`` and ``"output"`` entries.
        metadata: Parsed problem metadata. When it has a ``"func_name"`` key,
            each line of an input and the whole output are JSON-decoded, and
            the function name is stored under ``"fn_name"``.

    Returns:
        A dict with ``"inputs"`` and ``"outputs"`` lists, plus ``"fn_name"``
        when the metadata names a function.
    """
    has_fn_name = "func_name" in metadata
    converted = {"inputs": [], "outputs": []}

    for test in raw_tests:
        inp = test["input"]
        out = test["output"]

        if has_fn_name:
            # One JSON value per line of the input; the output is one JSON value.
            inp = [json.loads(line) for line in inp.split("\n")]
            out = json.loads(out)

        converted["inputs"].append(inp)
        converted["outputs"].append(out)

    if has_fn_name:
        # Added last so the key order matches {"inputs", "outputs", "fn_name"}.
        converted["fn_name"] = metadata["func_name"]

    return converted


def clean_and_push(ds, reponame):
    """Convert every example of ``ds`` to the target layout and push it to the Hub.

    For each example the private tests are decoded with ``decode_tests`` and
    concatenated with the public tests to form ``input_output``; the public
    tests alone form ``public_input_output``. Both are stored as JSON strings.
    The resulting rows are turned into a ``datasets.Dataset`` and uploaded to
    ``reponame`` as the ``"test"`` split.

    Args:
        ds: Sized iterable of examples with the keys read below
            (``question_content``, ``starter_code``, ``difficulty``,
            ``public_test_cases``, ``private_test_cases``, ``metadata``,
            ``question_title``, ``platform``, ``contest_date``, ``question_id``).
        reponame: Hub repository id passed to ``push_to_hub``.

    Returns:
        None.
    """
    rows = []
    for ex in tqdm(ds, total=len(ds)):
        public_raw_tests = json.loads(ex["public_test_cases"])
        # Private tests come first, followed by the public ones.
        all_raw_tests = decode_tests(ex["private_test_cases"]) + public_raw_tests
        metadata = json.loads(ex["metadata"])

        tests = _convert_tests(all_raw_tests, metadata)
        public_tests = _convert_tests(public_raw_tests, metadata)

        rows.append({
            "question": ex["question_content"],
            "starter_code": ex["starter_code"],
            "difficulty": ex["difficulty"],
            "input_output": json.dumps(tests),
            "public_input_output": json.dumps(public_tests),
            "title": ex["question_title"],
            "source": ex["platform"],
            "date": ex["contest_date"],
            "id": ex["question_id"],
        })

    cleaned_ds = datasets.Dataset.from_list(rows)
    print("pushing to: ", reponame)
    cleaned_ds.push_to_hub(reponame, split="test")

In [14]:
# Convert the date-filtered subset and upload it to the Hub repo named below.
# NOTE(review): `ds_decont` is reassigned by a later cell in this notebook, so
# this cell must run right after the first cutoff filter to push the intended data.
clean_and_push(ds_decont, "codegenning/livecodebench_lite_filtered")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 202/202 [00:09<00:00, 20.71it/s]


pushing to:  codegenning/livecodebench_lite_filtered


Uploading the dataset shards:   0%|                                                                                                                                                             | 0/3 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|                                                                                                                                                       | 0/1 [00:00<?, ?ba/s][A
Creating parquet from Arrow format: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.01ba/s][A
Uploading the dataset shards:  33%|█████████████████████████████████████████████████▋                                                                                                   | 1/3 [00:03<00:06,  3.41s/it]
Creating parquet from Arrow format:   0%|                                                                                             

In [None]:
# Complement of the `>= cutoff` filter: problems dated strictly before the cutoff.
# `contest_date` is stored as a string, so it has to go through parse_date()
# before it can be compared with a datetime; comparing the raw string against
# `cutoff` raises TypeError.
# NOTE(review): `ds` is rebound by the later release_v2 load, so run this cell
# before that one to split the first-loaded dataset.
cutoff = datetime.datetime(2023, 9, 30, 0, 0)
ds_cont = ds.filter(lambda ex: parse_date(ex["contest_date"]) < cutoff)
ds_cont

In [None]:
# Convert the before-cutoff subset (`ds_cont`) and upload it to the Hub repo named below.
clean_and_push(ds_cont, "codegenning/livecodebench_lite_contaminated")

# DeepSeekCoder V2 Cutoffs

In [8]:
import datasets
# Load the "test" split of the release_v2 version of the dataset.
# trust_remote_code=True is passed explicitly: the library warns that this flag
# will become mandatory for this dataset, and setting it keeps a fresh
# "Restart & Run All" from stopping on a confirmation message.
# NOTE(security): this opts in to running the dataset repository's loading
# script, so only use it with a repository you trust.
# NOTE(review): this rebinds `ds`, replacing the dataset loaded at the top of
# the notebook.
ds = datasets.load_dataset(
    "livecodebench/code_generation_lite",
    split="test",
    version_tag="release_v2",
    trust_remote_code=True,
)
ds

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Dataset({
    features: ['question_title', 'question_content', 'platform', 'question_id', 'contest_id', 'contest_date', 'starter_code', 'difficulty', 'public_test_cases', 'private_test_cases', 'metadata'],
    num_rows: 511
})

In [9]:
import datetime
# Keep the problems in `ds` whose contest date is on or after 1 December 2023.
cutoff = datetime.datetime(year=2023, month=12, day=1)
ds_decont = ds.filter(lambda row: cutoff <= parse_date(row["contest_date"]))
ds_decont

Dataset({
    features: ['question_title', 'question_content', 'platform', 'question_id', 'contest_id', 'contest_date', 'starter_code', 'difficulty', 'public_test_cases', 'private_test_cases', 'metadata'],
    num_rows: 226
})

In [None]:
# Convert the current `ds_decont` and upload it to the Hub repo named below.
# NOTE(review): `ds_decont` is also assigned by an earlier cell with a different
# cutoff; run the release_v2 filter cell immediately before this one.
clean_and_push(ds_decont, "codegenning/livecodebench_lite_v2")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 226/226 [00:24<00:00,  9.38it/s]


pushing to:  codegenning/livecodebench_lite_v2


Uploading the dataset shards:   0%|                                                                                                                                                             | 0/5 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|                                                                                                                                                       | 0/1 [00:00<?, ?ba/s][A
Creating parquet from Arrow format: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.02ba/s][A
Uploading the dataset shards:  20%|█████████████████████████████▊                                                                                                                       | 1/5 [00:00<00:01,  2.01it/s]
Creating parquet from Arrow format:   0%|                                                                                             