In [33]:
from IPython.display import display, Markdown
import polars as pl
import mistune

In [5]:
# Read the raw problem/completion dataset; the path is relative to the
# notebook's working directory.
dataset_pl = pl.read_parquet(
    "../codecontests_cot_sft.parquet",
)

In [None]:
# Flatten the dataset into two PARALLEL lists: entry k of `all_problems` is the
# problem that entry k of `all_completions` answers.
#
# The problem is appended once per completion (not once per row) so the lists
# stay aligned when they are zipped together later; a row with zero (or null)
# completions contributes nothing to either list instead of shifting every
# subsequent pair by one.
all_problems = []
all_completions = []
for row in dataset_pl.iter_rows(named=True):
    # `or []` covers a null completions cell, which would otherwise break iteration.
    for completion in row["completions"] or []:
        all_problems.append(row["problem"])
        all_completions.append(completion)

In [None]:
# Render each problem next to its completion, pairing the two flattened lists
# positionally. `display` shows its arguments one after another, in order.
for problem_text, completion_text in zip(all_problems, all_completions):
    display(Markdown("### Problem:"), problem_text, completion_text)

In [None]:
import pprint
import mistune.renderers
import mistune.renderers.markdown
from collections import defaultdict


# Running tallies filled in by the extraction loop below:
#   n_with_steps  - number of headings recognised as a "steps"/"reasoning" section
#   heading_freqs - how often each such heading text occurs
n_with_steps = 0
heading_freqs = defaultdict(int)

# With renderer=None the parser hands back the parsed token list (dicts with a
# "type" key) rather than rendered output; the loop below walks those tokens.
markdown_renderer = mistune.create_markdown(renderer=None)


def recursive_get_all_children(element, children_list):
    """Collect every leaf token beneath ``element`` in document order.

    A leaf is any token without a ``"children"`` key. ``element`` itself is
    collected when it is a leaf. A token whose ``"children"`` list is empty
    contributes nothing.

    Args:
        element: A token dict, optionally holding nested tokens under
            ``"children"``.
        children_list: Accumulator list; leaves are appended to it in place.

    Returns:
        The same ``children_list`` object that was passed in.
    """
    # Explicit stack instead of recursion. Children are pushed in reverse so
    # they are popped (and therefore visited) left to right.
    pending = [element]
    while pending:
        node = pending.pop()
        if "children" in node:
            pending.extend(reversed(list(node["children"])))
        else:
            children_list.append(node)
    return children_list


# One dict per usable row: {"problem": str, "code": str, "thoughts": [str, ...]}.
out_rows = []

# A heading marks the step-by-step section when it contains one of these
# phrases and does not mention "code".
_STEP_HEADING_PHRASES = ("reasoning", "steps", "approach")


def _heading_text(heading):
    """Return the lowercased text of a heading token's first inline child, or None for an empty heading."""
    children = heading.get("children") or []
    if not children:
        return None
    first_child = children[0]
    if "raw" in first_child:
        return first_child["raw"].lower()
    # The first child is a container without its own "raw" (e.g. a heading that
    # is entirely bold); fall back to the text of the leaves beneath it.
    leaves = recursive_get_all_children(first_child, [])
    return "".join(leaf.get("raw", "") for leaf in leaves).lower()


def _is_steps_heading(heading_text):
    """Tell whether a lowercased heading introduces the reasoning/steps section."""
    return "code" not in heading_text and any(
        phrase in heading_text for phrase in _STEP_HEADING_PHRASES
    )


def _steps_list_after(tokens, heading_idx):
    """Return the token right after the heading (skipping one blank_line), or None if there is none."""
    steps_list_idx = heading_idx + 1
    if steps_list_idx < len(tokens) and tokens[steps_list_idx]["type"] == "blank_line":
        steps_list_idx += 1
    if steps_list_idx >= len(tokens):
        return None
    return tokens[steps_list_idx]


def _collect_thoughts(steps_list_element):
    """Extract one plain-text string per nested bullet found under each top-level step."""
    collected = []
    for step in steps_list_element.get("children", []):
        # Child 0 is presumably the step's own label text ("prefix:") and
        # child 1 the nested list carrying the details - verify against the data.
        if "children" not in step or len(step["children"]) < 2:
            print(step)
            continue
        bullet_point_text = step["children"][1]
        if "children" not in bullet_point_text:
            print(bullet_point_text)
            continue
        for sub_step in bullet_point_text["children"]:
            # Leaf tokens have no children to walk; skip them instead of raising.
            for sub_sub_step in sub_step.get("children", []):
                all_text = recursive_get_all_children(sub_sub_step, [])
                if any("raw" not in leaf for leaf in all_text):
                    print(all_text)
                    continue
                collected.append("".join(leaf["raw"] for leaf in all_text))
    return collected


for row in dataset_pl.iter_rows(named=True):
    problem_str = row["problem"]
    completions = row["completions"]
    # Only the first completion of each row is used; rows without one are skipped.
    if not completions or completions[0] is None:
        print("warn: no solution")
        continue
    solution_str = completions[0]

    solution_md = markdown_renderer(solution_str)
    thoughts = []
    solution_code = None
    for token_idx, element in enumerate(solution_md):
        if element["type"] == "heading":
            heading_text = _heading_text(element)
            if heading_text is not None and _is_steps_heading(heading_text):
                n_with_steps += 1
                heading_freqs[heading_text] += 1
                steps_list_element = _steps_list_after(solution_md, token_idx)
                if steps_list_element is not None:
                    thoughts.extend(_collect_thoughts(steps_list_element))
        elif element["type"] == "block_code" and solution_code is None:
            # Keep only the first code block of the solution.
            if "raw" not in element:
                print(element)
                continue
            solution_code = element["raw"]

        # Emit the row as soon as both pieces are available, then move on to
        # the next dataset row.
        if solution_code is not None and len(thoughts) > 0:
            out_rows.append(
                {"problem": problem_str, "code": solution_code, "thoughts": thoughts}
            )
            break

In [246]:
# Turn the extracted records into a DataFrame and persist it as parquet in the
# current working directory.
out_rows_pl = pl.DataFrame(data=out_rows)
out_rows_pl.write_parquet(
    "codecontests_cot_sft_formatted_thoughts.parquet",
)

In [None]:
# (heading, count) pairs ordered from most to least frequent. Negating the
# integer count gives descending order; ties keep their insertion order.
sorted_freqs_table = sorted(
    heading_freqs.items(),
    key=lambda heading_and_count: -heading_and_count[1],
)
sorted_freqs_table