In [1]:
!pip install -q datasets pandas

In [2]:
import re
import pandas as pd
from datasets import load_dataset

# -----------------------------------------------------------
# 1. Load GSM8K train split
# -----------------------------------------------------------
# Request only the "train" split of the "main" configuration.
gsm = load_dataset(path="gsm8k", name="main", split="train")
print("Loaded GSM8K train size:", len(gsm))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Loaded GSM8K train size: 7473


In [None]:
def extract_final_answer(answer_text: str) -> str | None:
    m = re.search(r"####\s*(.+)", answer_text)
    if not m:
        return None
    return m.group(1).strip()


def extract_rationale(answer_text: str) -> str:
    """Return the text that precedes the first ``####`` marker, stripped.

    When the marker is absent, the whole (stripped) input is returned.
    """
    before_marker, _sep, _after = answer_text.partition("####")
    return before_marker.strip()


def split_rationale_into_steps(rationale: str) -> list[str]:
    """Break a rationale into an ordered list of step strings.

    Strategy:
      1. If the rationale has two or more non-blank lines, each stripped
         line is one step.
      2. Otherwise the text is flattened onto a single line and split after
         every '.', '!' or '?' that is followed by whitespace.

    Returns:
        The stripped, non-empty steps in their original order (possibly an
        empty list).
    """
    non_blank_lines = []
    for line in rationale.split("\n"):
        stripped = line.strip()
        if stripped:
            non_blank_lines.append(stripped)

    if len(non_blank_lines) >= 2:
        return non_blank_lines

    # Zero or one usable line: fall back to sentence boundaries. The
    # lookbehind keeps the terminating punctuation attached to its sentence.
    flattened = rationale.replace("\n", " ").strip()
    candidates = (
        chunk.strip() for chunk in re.split(r'(?<=[\.!?])\s+', flattened)
    )
    return [chunk for chunk in candidates if chunk]


# -----------------------------------------------------------
# 3. Build stepwise training rows (input, target)
# -----------------------------------------------------------

def _build_prompt(question: str, completed_steps: list[str]) -> str:
    """Render the model input for one training row.

    The prompt is the problem text followed by a "Steps completed so far:"
    section that lists ``completed_steps`` as ``STEP 1: ...``, ``STEP 2: ...``,
    one per line. With no completed steps the section body is empty.
    """
    history_str = "\n".join(
        f"STEP {i}: {h}" for i, h in enumerate(completed_steps, start=1)
    )
    return (
        f"Problem: {question}\n\n"
        f"Steps completed so far:\n{history_str}"
    )


# Emit one progress line every this many problems.
PROGRESS_EVERY = 100

rows = []

for idx, row in enumerate(gsm):
    question = row["question"]
    full_answer = row["answer"]

    final_answer = extract_final_answer(full_answer)
    if final_answer is None:
        steps = []
    else:
        rationale = extract_rationale(full_answer)
        steps = split_rationale_into_steps(rationale)

    # A problem contributes rows only when it has both a final answer and at
    # least one rationale step; otherwise it is skipped entirely.
    if steps:
        # One row per step: the prompt shows the steps before it, the target
        # is the step itself.
        for n_done, step in enumerate(steps):
            rows.append({
                "Problem": _build_prompt(question, steps[:n_done]),
                "Next Step": f"STEP: {step}",
            })

        # Closing row: every step is in the history, the target is the answer.
        rows.append({
            "Problem": _build_prompt(question, steps),
            "Next Step": f"FINAL_ANSWER: {final_answer}",
        })

    # Reported for skipped problems too, so no progress line goes missing.
    if (idx + 1) % PROGRESS_EVERY == 0:
        print(f"Processed {idx+1}/{len(gsm)} problems, total rows so far: {len(rows)}")

print("Total training rows:", len(rows))


# -----------------------------------------------------------
# 4. Save to CSV
# -----------------------------------------------------------

# Build the frame from the accumulated row dicts, naming the two columns
# explicitly so their order in the output is fixed.
column_order = ["Problem", "Next Step"]
df = pd.DataFrame(rows, columns=column_order)

# index=False keeps the DataFrame index out of the written file.
df.to_csv("gsm8k_stepwise.csv", index=False)
print("Saved to gsm8k_stepwise.csv")

# Last expression of the cell: display the first ten rows.
df.head(10)

Processed 100/7473 problems, total rows so far: 474
Processed 200/7473 problems, total rows so far: 942
Processed 300/7473 problems, total rows so far: 1425
Processed 400/7473 problems, total rows so far: 1889
Processed 500/7473 problems, total rows so far: 2342
Processed 600/7473 problems, total rows so far: 2788
Processed 700/7473 problems, total rows so far: 3216
Processed 800/7473 problems, total rows so far: 3672
Processed 900/7473 problems, total rows so far: 4111
Processed 1000/7473 problems, total rows so far: 4548
Processed 1100/7473 problems, total rows so far: 4986
Processed 1200/7473 problems, total rows so far: 5463
Processed 1300/7473 problems, total rows so far: 5917
Processed 1400/7473 problems, total rows so far: 6379
Processed 1500/7473 problems, total rows so far: 6843
Processed 1600/7473 problems, total rows so far: 7297
Processed 1700/7473 problems, total rows so far: 7719
Processed 1800/7473 problems, total rows so far: 8176
Processed 1900/7473 problems, total row

Unnamed: 0,Problem,Next Step
0,Problem: Natalia sold clips to 48 of her frien...,STEP: Natalia sold 48/2 = <<48/2=24>>24 clips ...
1,Problem: Natalia sold clips to 48 of her frien...,STEP: Natalia sold 48+24 = <<48+24=72>>72 clip...
2,Problem: Natalia sold clips to 48 of her frien...,FINAL_ANSWER: 72
3,Problem: Weng earns $12 an hour for babysittin...,STEP: Weng earns 12/60 = $<<12/60=0.2>>0.2 per...
4,Problem: Weng earns $12 an hour for babysittin...,"STEP: Working 50 minutes, she earned 0.2 x 50 ..."
5,Problem: Weng earns $12 an hour for babysittin...,FINAL_ANSWER: 10
6,Problem: Betty is saving money for a new walle...,"STEP: In the beginning, Betty has only 100 / 2..."
7,Problem: Betty is saving money for a new walle...,STEP: Betty's grandparents gave her 15 * 2 = $...
8,Problem: Betty is saving money for a new walle...,"STEP: This means, Betty needs 100 - 50 - 30 - ..."
9,Problem: Betty is saving money for a new walle...,FINAL_ANSWER: 5
