In [3]:
from datasets import load_dataset, Dataset
from IPython.display import display, Markdown
from tqdm import tqdm
from rich.syntax import Syntax
from rich.console import Console
import pandas as pd
from enum import Enum

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import sys

sys.path.append("..")
from evaluation.code_execution import (
    evaluate_sample_exec,
    print_code_snippet,
    ProblemSource,
    Difficulty,
    Language,
)

In [3]:
# Dataset card: https://huggingface.co/datasets/deepmind/code_contests
# Only the "train" split is used in this notebook.
code_contests_splits = load_dataset("deepmind/code_contests")
cc_dataset = code_contests_splits["train"]

In [None]:
console = Console()

# Scanning stops once the number of kept problems exceeds this value.
# NOTE(review): the stop test below uses `>`, so up to MAX_KEPT_SAMPLES + 1
# problems end up in the result — confirm whether `>=` was intended.
MAX_KEPT_SAMPLES = 2500

# Fields copied unchanged from the raw sample into each kept record.
PASSTHROUGH_KEYS = (
    "name",
    "description",
    "public_tests",
    "private_tests",
    "cf_rating",
    "cf_points",
)


def _first_python3_solution(solutions):
    """Return the first Python 3 solution in `solutions`, or None if there is none.

    Args:
        solutions: the sample's "solutions" field, read here as two parallel
            lists under the keys "language" and "solution".

    Returns:
        The solution source paired with the first language entry that maps to
        `Language.PYTHON3`, or None when no such entry exists.
    """
    for lang_txt, solution_code in zip(solutions["language"], solutions["solution"]):
        if Language(lang_txt) == Language.PYTHON3:
            return solution_code
    return None


first_n_samples = []

iterator = tqdm(cc_dataset)
for sample in iterator:
    problem_source = ProblemSource(sample["source"])
    difficulty = Difficulty(sample["difficulty"])

    # Single pass over the solutions: gives both the "has a Python 3 solution"
    # answer and the solution that is stored on the kept record.
    python_solution = _first_python3_solution(sample["solutions"])

    should_keep_problem = (
        problem_source == ProblemSource.CODEFORCES
        and not sample["is_description_translated"]
        # Skip problems that name an input or output file.
        and not (sample["input_file"] or sample["output_file"])
        # Require at least one private test.
        and len(sample["private_tests"]["input"]) > 0
        and python_solution is not None
    )

    if should_keep_problem:
        sample_to_keep = {
            "source": problem_source,
            "difficulty": difficulty,
        }
        for key in PASSTHROUGH_KEYS:
            sample_to_keep[key] = sample[key]
        sample_to_keep["solution"] = python_solution
        first_n_samples.append(sample_to_keep)

    # Show the running count of kept problems on the progress bar.
    iterator.set_postfix({"n_samples": len(first_n_samples)})

    if len(first_n_samples) > MAX_KEPT_SAMPLES:
        break

first_n_df = pd.DataFrame(first_n_samples)

In [None]:
# Render every problem in the dataset as Markdown: a numbered heading with the
# problem name, followed by the problem description.
for problem_idx, entry in enumerate(cc_dataset):
    heading = f"### Problem {problem_idx}: {entry['name']}"
    body = f"{entry['description']}"
    display(Markdown(heading + "\n" + body))

In [None]:
def _enum_to_value(cell):
    """Return `cell.value` for Enum members; pass any other object through."""
    if isinstance(cell, Enum):
        return cell.value
    return cell


# Columns considered Enum-valued: object dtype and an Enum member in the first row.
enum_column_names = [
    name
    for name in first_n_df.columns
    if pd.api.types.is_object_dtype(first_n_df[name])
    and isinstance(first_n_df[name].iloc[0], Enum)
]

# Replace Enum members with their underlying values before writing to parquet.
for name in enum_column_names:
    first_n_df[name] = first_n_df[name].apply(_enum_to_value)

subset_as_dataset = Dataset.from_pandas(first_n_df)
subset_as_dataset.to_parquet("dataset_samples/codeforces_problems_subset.parquet")

In [None]:
# Read the parquet file back and show the first record as a sanity check.
subset_parquet_path = "dataset_samples/codeforces_problems_subset.parquet"
test_df = Dataset.from_parquet(subset_parquet_path)
test_df[0]

In [None]:
# Upload the reloaded subset to the Hugging Face Hub.
test_df.push_to_hub(repo_id="codeforces_problems_subset")

In [None]:
# Source of a brute-force candidate `solution(problem_input)`, kept as a string
# so it can be handed to the code-execution evaluator in the loop below.
# The first input line is parsed as `n m k`; the remaining lines are edges.
# After adding each edge it enumerates all 2**n subsets of people and records
# the size of the largest subset in which every member has at least k friends
# inside that subset; it returns one such result per edge.
test_code = """
def solution(problem_input):
    n, m, k = map(int, problem_input[0].split())
    edges = [list(map(int, line.split())) for line in problem_input[1:]]
    print(problem_input)

    adj = [[] for _ in range(n + 1)]

    results = []
    for i in range(m):
        adj[edges[i][0]].append(edges[i][1])
        adj[edges[i][1]].append(edges[i][0])

        max_people = 0
        for j in range(1 << n):
            group = []
            for bit in range(n):
                if (j >> bit) & 1:
                    group.append(bit + 1)

            valid_group = True
            for person in group:
                friend_count = 0
                for friend in adj[person]:
                    if friend in group:
                        friend_count += 1
                if friend_count < k:
                    valid_group = False
                    break

            if valid_group:
                max_people = max(max_people, len(group))
        results.append(max_people)
    return results
"""

# Sample cases for the candidate above: "input" and "output" are parallel lists
# (entry i of "output" is the expected text for entry i of "input").
tests = {
    "input": [
        "4 4 2\n2 3\n1 2\n1 3\n1 4\n",
        "5 8 2\n2 1\n4 2\n5 4\n5 2\n4 3\n5 1\n4 1\n3 2\n",
        "5 7 2\n1 5\n3 2\n2 5\n3 4\n1 2\n5 3\n1 3\n",
    ],
    "output": ["0\n0\n3\n3\n", "0\n0\n0\n3\n3\n4\n4\n5\n", "0\n0\n0\n0\n3\n4\n4\n"],
}

# Run the candidate against each sample input and print the evaluator's output
# next to the expected output lines for a manual comparison.
# NOTE(review): `evaluate_python_code_exec` is not among the names this notebook
# imports from `evaluation.code_execution` (it imports `evaluate_sample_exec`);
# confirm which function is intended before relying on this cell.
for case_input, case_expected in zip(tests["input"], tests["output"]):
    err, out = evaluate_python_code_exec(test_code, case_input)
    expected_lines = case_expected.strip().split("\n")
    print(out, expected_lines)

In [None]:
# Pull the published subset from the Hub and convert its "train" split to pandas.
subset_ds = load_dataset("roborovski/codeforces_problems_subset")
subset_train_split = subset_ds["train"]
subset_df = subset_train_split.to_pandas()

In [None]:
console = Console()

# Iteration over problems stops once more than this many solutions were printed.
MAX_PRINTED_SOLUTIONS = 5

n_printed = 0
for problem in cc_dataset:
    solutions = problem["solutions"]

    # Print every Python 3 solution of this problem; other languages are skipped.
    for lang_txt, solution_code in zip(solutions["language"], solutions["solution"]):
        if Language(lang_txt) != Language.PYTHON3:
            continue
        print_code_snippet(solution_code, console)
        n_printed += 1

    # The limit is checked once per problem, not per solution, so all Python 3
    # solutions of the problem that crosses the threshold are still printed.
    if n_printed > MAX_PRINTED_SOLUTIONS:
        break

In [7]:
# Load the codecontests_dpo_v2 parquet file from the parent directory and
# display the dataset summary.
v2_parquet_path = "../codecontests_dpo_v2.parquet"
v2_dataset = Dataset.from_parquet(v2_parquet_path)
v2_dataset

Dataset({
    features: ['chosen', 'chosen_score', 'rejected', 'rejected_score', 'name', 'description'],
    num_rows: 2892
})

In [13]:
def _filter_problem(problem):
    print(problem.keys())
    if any([x is None for x in problem.values()]):
        return False
    return True


# Drop every row that has at least one missing (None) field.
v2_dataset = v2_dataset.filter(function=_filter_problem)

Filter: 100%|██████████| 2668/2668 [00:00<00:00, 46808.06 examples/s]

dict_keys(['chosen', 'chosen_score', 'rejected', 'rejected_score', 'name', 'description'])
dict_keys(['chosen', 'chosen_score', 'rejected', 'rejected_score', 'name', 'description'])
dict_keys(['chosen', 'chosen_score', 'rejected', 'rejected_score', 'name', 'description'])
dict_keys(['chosen', 'chosen_score', 'rejected', 'rejected_score', 'name', 'description'])
dict_keys(['chosen', 'chosen_score', 'rejected', 'rejected_score', 'name', 'description'])
dict_keys(['chosen', 'chosen_score', 'rejected', 'rejected_score', 'name', 'description'])
dict_keys(['chosen', 'chosen_score', 'rejected', 'rejected_score', 'name', 'description'])
dict_keys(['chosen', 'chosen_score', 'rejected', 'rejected_score', 'name', 'description'])
dict_keys(['chosen', 'chosen_score', 'rejected', 'rejected_score', 'name', 'description'])
dict_keys(['chosen', 'chosen_score', 'rejected', 'rejected_score', 'name', 'description'])
dict_keys(['chosen', 'chosen_score', 'rejected', 'rejected_score', 'name', 'description'])




In [16]:
# Write the filtered dataset to a parquet file in the current working directory.
v2_filtered_path = "codecontests_dpo_v2_filtered.parquet"
v2_dataset.to_parquet(v2_filtered_path)

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 36.25ba/s]


10067600