In [1]:
from IPython.display import display
import polars as pl
import mistune
from tqdm import tqdm
import pprint
import mistune.renderers
import mistune.renderers.markdown
from collections import defaultdict
from typing import Literal

In [2]:
# Raw input for the notebook. The cells below read the "problem", "name" and
# "completions" columns from this frame.
dataset_pl = pl.read_parquet(source="../codecontests_cot_sft_v2.parquet")

In [None]:
# Show the loaded frame as the cell's output (rich DataFrame repr).
dataset_pl

In [93]:

# Which model produced the completions. The extraction loop below branches on
# this value because the two models lay out their reasoning differently.
ModelChoice = Literal["gpt-4o", "deepseek"]
model_choice: ModelChoice = "gpt-4o"

# Markdown parser. The extraction loop consumes its result as a sequence of
# token dicts (each with a "type" key and optionally "children" / "raw").
markdown_renderer = mistune.create_markdown(renderer=None)

# Bookkeeping that is only updated on the "deepseek" path:
#   n_with_steps  - number of headings recognised as introducing the steps
#   heading_freqs - lower-cased heading text -> number of times it was seen
n_with_steps = 0
heading_freqs = defaultdict(int)


def recursive_get_all_children(element, children_list, disallowed_types=()):
    """Collect the leaf nodes of a nested token tree, depth first.

    A node is treated as a leaf when it has no ``"children"`` key; leaves are
    appended to ``children_list`` in document order. Any node (leaf or not)
    whose ``"type"`` is listed in ``disallowed_types`` is skipped together with
    its whole subtree.

    Args:
        element: Mapping describing one node; may carry ``"type"`` and
            ``"children"`` keys.
        children_list: List that receives the leaves. It is mutated in place.
        disallowed_types: Container of ``"type"`` values to prune. Defaults to
            an empty tuple so that no mutable default is shared between calls.

    Returns:
        The same ``children_list`` object that was passed in.
    """
    if "type" in element and element["type"] in disallowed_types:
        return children_list
    if "children" in element:
        for child in element["children"]:
            recursive_get_all_children(child, children_list, disallowed_types)
    else:
        children_list.append(element)
    return children_list


# Headings containing any of these phrases (and not "code") introduce the steps
# in deepseek-style completions.
_STEP_HEADING_PHRASES = ("reasoning", "steps", "approach")


def _deepseek_step_thoughts(solution_md, heading_idx):
    """Return the thought strings found in the token that follows a steps heading.

    ``solution_md`` is the parsed token sequence of one completion and
    ``heading_idx`` the position of the heading inside it. A single
    ``blank_line`` token directly after the heading is skipped. Tokens that do
    not have the expected nesting are printed for inspection and skipped; an
    empty list is returned when nothing follows the heading.
    """
    steps_list_idx = heading_idx + 1
    if (
        steps_list_idx < len(solution_md)
        and solution_md[steps_list_idx]["type"] == "blank_line"
    ):
        steps_list_idx += 1
    if steps_list_idx >= len(solution_md):
        # The heading was the last token: there is no list to read.
        return []

    steps_list_element = solution_md[steps_list_idx]
    if "children" not in steps_list_element:
        print(steps_list_element)
        return []

    found = []
    for step in steps_list_element["children"]:
        # 0 is the prefix + :
        if "children" not in step or len(step["children"]) < 2:
            print(step)
            continue
        bullet_point_text = step["children"][1]
        if "children" not in bullet_point_text:
            print(bullet_point_text)
            continue
        for sub_step in bullet_point_text["children"]:
            for sub_sub_step in sub_step["children"]:
                leaves = recursive_get_all_children(sub_sub_step, [])
                if any("raw" not in leaf for leaf in leaves):
                    print(leaves)
                    continue
                found.append("".join(leaf["raw"] for leaf in leaves))
    return found


def _gpt4o_list_thoughts(list_element):
    """Return one thought string per non-empty block inside a list token.

    Text under ``strong`` nodes is dropped, each remaining fragment has leading
    ``":"`` / space characters stripped, empty and bare-newline fragments are
    discarded, and the rest are joined with single spaces.
    """
    found = []
    for list_item in list_element["children"]:
        for block in list_item["children"]:
            leaves = recursive_get_all_children(block, [], ["strong"])
            fragments = [leaf["raw"].lstrip(": ") for leaf in leaves if "raw" in leaf]
            fragments = [frag for frag in fragments if len(frag) > 0 and frag != "\n"]
            joined = " ".join(fragments)
            if len(joined) > 0:
                found.append(joined)
    return found


cot_formatted_rows = []

for row_idx in range(len(dataset_pl)):
    row_df = dataset_pl[row_idx]
    problem_str = row_df["problem"][0]
    problem_name = row_df["name"][0]
    try:
        solution_str = row_df["completions"][0][0]
    except Exception:
        # Best effort: rows without a usable first completion are skipped.
        # `Exception` (not a bare except) lets KeyboardInterrupt through.
        print("warn: no solution")
        continue

    # The problem statement is stored verbatim; only the solution is parsed.
    solution_md = markdown_renderer(solution_str)
    thoughts = []
    solution_code = None
    for elem_idx, element in enumerate(solution_md):
        # Deepseek has a preamble, then the steps; 4o prints a paragraph then the steps, so the first
        # list is always the steps
        if model_choice == "deepseek":
            if element["type"] == "heading":
                # Only the first inline child is inspected; a heading whose first
                # child carries no raw text yields "" and is therefore ignored.
                heading_children = element.get("children") or [{}]
                heading_text = heading_children[0].get("raw", "").lower()
                has_steps = (
                    any(phrase in heading_text for phrase in _STEP_HEADING_PHRASES)
                    and "code" not in heading_text
                )
                if has_steps:
                    n_with_steps += 1
                    heading_freqs[heading_text] += 1
                    thoughts.extend(_deepseek_step_thoughts(solution_md, elem_idx))
        elif model_choice == "gpt-4o":
            if element["type"] == "list":
                thoughts.extend(_gpt4o_list_thoughts(element))

        # Only the first code block of a completion is kept as the solution.
        if element["type"] == "block_code" and solution_code is None:
            if "raw" not in element:
                print(element)
                continue
            solution_code = element["raw"]

        # Emit the row as soon as both parts are available and stop scanning
        # this completion; rows missing either part are never emitted.
        if solution_code is not None and len(thoughts) > 0:
            cot_formatted_rows.append(
                {
                    "problem": problem_str,
                    "code": solution_code,
                    "thoughts": thoughts,
                    "problem_name": problem_name,
                }
            )
            break


In [None]:
# Preview only the first few rows; dumping the whole list bloats the notebook output.
cot_formatted_rows[:3]

In [96]:
# Turn the extracted rows (problem, code, thoughts, problem_name) into a frame
# and persist it.
# NOTE(review): the file name is hard-coded to "_gpt" regardless of
# `model_choice` - confirm before running with another model.
out_rows_pl = pl.DataFrame(data=cot_formatted_rows)
out_rows_pl.write_parquet(file="codecontests_cot_sft_formatted_thoughts_v2_gpt.parquet")

In [None]:
# Preview only the first few rows; dumping the whole list bloats the notebook output.
cot_formatted_rows[:3]

In [None]:
from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam
from typing import Sequence


def format_out_row_str(row: dict) -> Sequence[ChatCompletionMessageParam]:
    """Build a two-message chat conversation from one formatted row.

    Args:
        row: Mapping with the keys ``"problem"``, ``"code"`` and ``"thoughts"``
            (an iterable of thought strings).

    Returns:
        A list with a user message asking to solve the problem in Python and an
        assistant message made of one ``<thought>...</thought>`` line per
        thought, a blank line, then the code wrapped in
        ``<solution>...</solution>``.
    """
    problem_text = row["problem"]
    solution_text = row["code"]
    thought_items = row["thoughts"]

    tagged_thoughts = "\n".join(f"<thought>{item}</thought>" for item in thought_items)
    tagged_solution = f"<solution>{solution_text}</solution>"

    user_message = {
        "role": "user",
        "content": f"Solve the following programming problem in Python.\n{problem_text}",
    }
    assistant_message = {
        "role": "assistant",
        "content": f"{tagged_thoughts}\n\n{tagged_solution}",
    }
    return [user_message, assistant_message]


# Pretty-print the conversation built from the first formatted row.
# `pprint` is imported as a module, so the function is `pprint.pprint`; it
# prints directly and returns None, hence no `display(...)` wrapper.
pprint.pprint(format_out_row_str(cot_formatted_rows[0]))

In [None]:
# One record per formatted row: the chat conversation plus the problem name.
conv_out = [
    {
        "conversation": format_out_row_str(row),
        "problem_name": row["problem_name"],
    }
    for row in tqdm(cot_formatted_rows)
]
out_rows_pl = pl.DataFrame(conv_out)

In [None]:
# Show the conversation frame, then persist it next to the notebook.
display(out_rows_pl)
out_rows_pl.write_parquet(
    file="codecontests_cot_sft_formatted_thoughts_conversations.parquet"
)