In [3]:
from datasets import load_dataset, Dataset
from IPython.display import display, Markdown
from tqdm import tqdm
from rich.syntax import Syntax
from rich.console import Console
import pandas as pd
from enum import Enum
%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import sys
sys.path.append('..')
from evaluation.code_execution import evaluate_sample_codecontests, evaluate_python_code_exec, print_code_snippet, ProblemSource, Difficulty, Language

In [3]:
# Source dataset: https://huggingface.co/datasets/deepmind/code_contests
# Load every split, then keep only the "train" split for the rest of the notebook.
code_contests_splits = load_dataset("deepmind/code_contests")
cc_dataset = code_contests_splits["train"]

In [7]:
# Upper bound on how many filtered problems are collected before the scan stops early.
MAX_KEPT_SAMPLES = 10000
# Raw dataset fields copied unchanged into every kept record.
KEPT_FIELDS = ("name", "description", "public_tests", "private_tests", "cf_rating", "cf_points")

console = Console()

first_n_samples = []

iterator = tqdm(cc_dataset)
for sample in iterator:
    # Convert the raw codes to the project enums for every sample (even ones that
    # end up rejected), so an unexpected code raises instead of being skipped.
    problem_source = ProblemSource(sample["source"])
    difficulty = Difficulty(sample["difficulty"])

    # any() stops at the first Python 3 entry, like an explicit loop with `break`.
    has_python_solution = any(
        Language(lang_txt) == Language.PYTHON3
        for lang_txt in sample["solutions"]["language"]
    )

    # A problem is kept only when all of the following hold:
    #   * it comes from Codeforces,
    #   * its description is not flagged as translated,
    #   * it declares neither an input file nor an output file,
    #   * it has at least one private test,
    #   * it has at least one Python 3 reference solution.
    should_keep_problem = (
        problem_source == ProblemSource.CODEFORCES
        and not sample["is_description_translated"]
        and not (sample["input_file"] or sample["output_file"])
        and len(sample["private_tests"]["input"]) > 0
        and has_python_solution
    )

    if should_keep_problem:
        # Enum members are stored as-is here; they are converted to plain
        # values later, right before the frame is exported.
        sample_to_keep = {
            "source": problem_source,
            "difficulty": difficulty,
        }
        for key in KEPT_FIELDS:
            sample_to_keep[key] = sample[key]
        first_n_samples.append(sample_to_keep)

    # Show the running count of kept problems next to the progress bar.
    iterator.set_postfix({"n_samples": len(first_n_samples)})

    # `>=` so that exactly MAX_KEPT_SAMPLES problems are collected at most
    # (the previous `>` comparison let one extra sample through).
    if len(first_n_samples) >= MAX_KEPT_SAMPLES:
        break

first_n_df = pd.DataFrame(first_n_samples)

100%|██████████| 13328/13328 [01:03<00:00, 210.40it/s, n_samples=5213]


In [None]:
# Render every problem of the dataset as its own Markdown section:
# a level-3 heading with the running index and name, followed by the description.
# Note: this iterates the whole of cc_dataset, so the output is very large.
for problem_index, entry in enumerate(cc_dataset):
    heading = f"### Problem {problem_index}: {entry['name']}"
    body = f"{entry['description']}"
    display(Markdown(heading + "\n" + body))

In [8]:

# Swap Enum members for their underlying `.value` before exporting
# (presumably so the columns can be serialized — TODO confirm).
for column_name in first_n_df.columns:
    column = first_n_df[column_name]
    holds_objects = pd.api.types.is_object_dtype(column)
    # Only the first entry is inspected to decide whether the column holds Enums;
    # inside such a column, non-Enum cells are passed through unchanged.
    if holds_objects and isinstance(column.iloc[0], Enum):
        first_n_df[column_name] = column.apply(
            lambda cell: cell.value if isinstance(cell, Enum) else cell
        )

# Convert the frame to a `datasets.Dataset` and write it out as parquet.
# The trailing expression is left bare so the cell shows the call's return value.
subset_dataset = Dataset.from_pandas(first_n_df)
subset_dataset.to_parquet("dataset_samples/codeforces_problems_subset.parquet")

Creating parquet from Arrow format: 100%|██████████| 6/6 [00:00<00:00, 24.01ba/s]


204624800

In [4]:
# Read the exported subset back from disk and show its first record as a sanity check.
# Despite the `_df` suffix, `test_df` is a `datasets.Dataset`, not a pandas DataFrame.
subset_parquet_path = "../dataset_samples/codeforces_problems_subset.parquet"
test_df = Dataset.from_parquet(subset_parquet_path)
test_df[0]

{'source': 2,
 'difficulty': 11,
 'name': '1037_E. Trips',
 'description': "There are n persons who initially don't know each other. On each morning, two of them, who were not friends before, become friends.\n\nWe want to plan a trip for every evening of m days. On each trip, you have to select a group of people that will go on the trip. For every person, one of the following should hold: \n\n  * Either this person does not go on the trip, \n  * Or at least k of his friends also go on the trip. \n\n\n\nNote that the friendship is not transitive. That is, if a and b are friends and b and c are friends, it does not necessarily imply that a and c are friends.\n\nFor each day, find the maximum number of people that can go on the trip on that day.\n\nInput\n\nThe first line contains three integers n, m, and k (2 ≤ n ≤ 2 ⋅ 10^5, 1 ≤ m ≤ 2 ⋅ 10^5, 1 ≤ k < n) — the number of people, the number of days and the number of friends each person on the trip should have in the group.\n\nThe i-th (1 ≤ 

In [5]:
# Upload the reloaded subset to the Hugging Face Hub under this repository name.
# The call is the cell's last expression, so its return value is displayed.
hub_repo_name = "codeforces_problems_subset"
test_df.push_to_hub(hub_repo_name)


Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 12.73ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.61s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/roborovski/codeforces_problems_subset/commit/cbfb4e8e4e87b814a2dc27dcd3ba81608cb2ddbf', commit_message='Upload dataset', commit_description='', oid='cbfb4e8e4e87b814a2dc27dcd3ba81608cb2ddbf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/roborovski/codeforces_problems_subset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='roborovski/codeforces_problems_subset'), pr_revision=None, pr_num=None)

In [None]:
# Source text of a brute-force `solution(problem_input)` function, kept as a string
# so it can be handed to `evaluate_python_code_exec` further down in this cell.
#
# What the embedded code does:
#   * reads `n, m, k` from `problem_input[0]` and one integer pair per remaining entry
#     (assumes `problem_input` is a sequence of text lines — TODO confirm against
#     how `evaluate_python_code_exec` builds it);
#   * for each of the first `m` pairs, in order, adds the pair to an adjacency list,
#     then enumerates all `2**n` subsets of people via a bitmask and records the size
#     of the largest subset in which every member has at least `k` friends inside it;
#   * returns the list of those per-step maxima.
#
# The subset enumeration is exponential in `n`, so this is only usable on tiny inputs.
# The embedded code also prints `problem_input` on every call (debug output).
test_code = """
def solution(problem_input):
    n, m, k = map(int, problem_input[0].split())
    edges = [list(map(int, line.split())) for line in problem_input[1:]]
    print(problem_input)

    adj = [[] for _ in range(n + 1)]

    results = []
    for i in range(m):
        adj[edges[i][0]].append(edges[i][1])
        adj[edges[i][1]].append(edges[i][0])

        max_people = 0
        for j in range(1 << n):
            group = []
            for bit in range(n):
                if (j >> bit) & 1:
                    group.append(bit + 1)

            valid_group = True
            for person in group:
                friend_count = 0
                for friend in adj[person]:
                    if friend in group:
                        friend_count += 1
                if friend_count < k:
                    valid_group = False
                    break

            if valid_group:
                max_people = max(max_people, len(group))
        results.append(max_people)
    return results
"""

# (stdin text, expected stdout text) pairs used to try out `test_code`
# (presumably the sample cases of the problem the brute force targets — confirm).
sample_cases = [
    ("4 4 2\n2 3\n1 2\n1 3\n1 4\n", "0\n0\n3\n3\n"),
    ("5 8 2\n2 1\n4 2\n5 4\n5 2\n4 3\n5 1\n4 1\n3 2\n", "0\n0\n0\n3\n3\n4\n4\n5\n"),
    ("5 7 2\n1 5\n3 2\n2 5\n3 4\n1 2\n5 3\n1 3\n", "0\n0\n0\n0\n3\n4\n4\n"),
]

# Same data in column form: parallel "input" and "output" lists.
tests = {
    "input": [case[0] for case in sample_cases],
    "output": [case[1] for case in sample_cases],
}

# Run the candidate code on each input and print what it produced next to the
# expected output lines; the two are only printed, not compared.
for stdin_text, expected_stdout in sample_cases:
    exec_error, produced = evaluate_python_code_exec(test_code, stdin_text)
    expected_lines = expected_stdout.strip().split("\n")
    print(produced, expected_lines)

In [None]:
# Pull the published subset back from the Hub and materialize its "train" split
# as a pandas DataFrame.
subset_ds = load_dataset("roborovski/codeforces_problems_subset")
subset_train_split = subset_ds["train"]
subset_df = subset_train_split.to_pandas()

In [None]:
# Maximum number of Python 3 solutions to pretty-print in this preview.
MAX_SNIPPETS_TO_PRINT = 5

console = Console()
n_printed = 0
for problem in cc_dataset:
    solutions = problem["solutions"]
    # "language" and "solution" are parallel lists: entry k of one describes entry k of the other.
    for lang_txt, solution_code in zip(solutions["language"], solutions["solution"]):
        if Language(lang_txt) != Language.PYTHON3:
            continue
        print_code_snippet(solution_code, console)
        n_printed += 1
        # Check the cap after every printed snippet: checking it only once per
        # problem would let a single problem with many Python 3 solutions print
        # all of them before the loop stops.
        if n_printed >= MAX_SNIPPETS_TO_PRINT:
            break

    # Propagate the inner break to the outer loop once the cap is reached.
    if n_printed >= MAX_SNIPPETS_TO_PRINT:
        break