In [42]:
import json
import random
import re
from pathlib import Path
from typing import List, Dict, Set, Tuple

import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from tqdm.auto import tqdm

# --- Path and Directory Definitions ---

def find_project_root(marker: str = ".git") -> Path:
    """
    Walk upwards from the current working directory to find the project root.

    The project root is the first directory (starting at the working directory
    itself and moving towards the filesystem root) that contains an entry
    named `marker`.

    Args:
        marker: Name of the file or directory that identifies the project root.

    Returns:
        The resolved path of the first directory containing `marker`.

    Raises:
        FileNotFoundError: If no directory on the way up contains `marker`.
    """
    start = Path.cwd().resolve()
    # `parents` ends with the filesystem root, so the root itself is also
    # checked (a `while path != path.parent` loop stops one level too early).
    for candidate in (start, *start.parents):
        if (candidate / marker).exists():
            return candidate
    raise FileNotFoundError(f"Could not find project root. Marker '{marker}' not found.")

# --- Global Constants and Paths ---
# Every path is derived from the repository root so the notebook does not
# depend on the directory it is launched from.
PROJECT_ROOT = find_project_root()
DATA_DIR = PROJECT_ROOT / 'data'
OUTPUT_DIR = DATA_DIR / "sft-datasets/verifier-v2-two-task"
PROCESSED_TEMPLATE_DIR = DATA_DIR / "template-generated-processed"

# --- Ensure output directory exists ---
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Configuration ---
# Seed for reproducibility of shuffling and sampling
RANDOM_SEED = 42
# Actually apply the seed: without this call the `random` module stays
# unseeded and every `random.sample` / `random.shuffle` differs between runs.
random.seed(RANDOM_SEED)

# Define the number of core problems to use as a base
NUM_CONCEPTUAL_PROBLEMS = 1000
NUM_COMPUTATIONAL_PROBLEMS = 1000

print(f"Project root: {PROJECT_ROOT}")
print(f"Dataset output directory: {OUTPUT_DIR}")
print(f"Random seed set to: {RANDOM_SEED}")

# --- Input locations ---
# Catalog of programmatically generated computational errors; it points to
# individual JSON files with generated flawed solutions.
PROGRAMMATIC_COMPUTATIONAL_DIR = DATA_DIR / "computational-errors-generated"
PROGRAMMATIC_CATALOG_PATH = PROGRAMMATIC_COMPUTATIONAL_DIR / "computational_error_catalog.csv"

# Final, human-approved conceptual/computational error text and explanations.
MANUAL_ERRORS_CSV_PATH = DATA_DIR / "manually_generated_errors_final.csv"

# Original GSM8K training split: source of the 'Correct' examples and problem text.
GSM8K_DATASET: Dataset = load_dataset("gsm8k", "main")["train"]

# --- Loading and Basic Validation ---
# Rows without an "erroneous_line_number" cannot be turned into a label, so
# both tables are filtered right after loading.

# 1. Programmatic Computational Errors Catalog
programmatic_comp_df = pd.read_csv(PROGRAMMATIC_CATALOG_PATH)
print(f"Loaded {len(programmatic_comp_df):,} records from programmatic computational error catalog.")
programmatic_comp_df = programmatic_comp_df.dropna(subset=["erroneous_line_number"])
print(f"Programmatic error records after dropping missing line numbers: {len(programmatic_comp_df):,}")

# 2. Manually Generated Conceptual/Computational Errors
manual_errors_df = pd.read_csv(MANUAL_ERRORS_CSV_PATH)
print(f"Loaded {len(manual_errors_df):,} records from manually validated errors CSV.")
manual_errors_df = manual_errors_df.dropna(subset=["erroneous_line_number"])
print(f"Manual error records after dropping missing line numbers: {len(manual_errors_df):,}")

Project root: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math
Dataset output directory: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/sft-datasets/verifier-v2-two-task
Random seed set to: 42
Loaded 22,623 records from programmatic computational error catalog.
Programmatic error records after dropping missing line numbers: 21,952
Loaded 1,963 records from manually validated errors CSV.
Manual error records after dropping missing line numbers: 1,740


In [43]:
def sanitize_commas(text: str) -> str:
    """
    Removes thousands-separator commas from numbers to prevent model artifacts.

    A comma is removed only when it sits between a digit and a group of
    exactly three digits (e.g. "1,234,567" -> "1234567"). Commas that merely
    separate distinct numbers (e.g. "1,2,3") are left untouched, so separate
    values are never fused into one number.

    Args:
        text: The text to clean.

    Returns:
        The text with thousands separators removed.
    """
    # Lookarounds do not consume the neighbouring digits, so consecutive
    # separators ("1,234,567") are all matched in a single pass.
    return re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', text)

def sanitize_text(text: str) -> str:
    """
    Map a fixed set of problematic Unicode characters (math operators, curly
    quotes, dashes, ellipsis, no-break space) to ASCII equivalents.

    Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""

    # Single-pass character translation table; none of the ASCII outputs is
    # itself a key, so the order of substitution is irrelevant.
    ascii_equivalents = str.maketrans({
        "\u2212": "-",    # Minus Sign
        "\u00d7": "*",    # Multiplication Sign
        "\u00f7": "/",    # Division Sign
        "\u22c5": "*",    # Dot Operator
        "\u201c": '"',    # Left Double Quotation Mark
        "\u201d": '"',    # Right Double Quotation Mark
        "\u2018": "'",    # Left Single Quotation Mark
        "\u2019": "'",    # Right Single Quotation Mark
        "\u2014": "-",    # Em Dash
        "\u2013": "-",    # En Dash
        "\u2026": "...",  # Horizontal Ellipsis
        "\u00a0": " ",    # No-Break Space
    })
    return text.translate(ascii_equivalents)

def clean_and_split_solution(raw_text: str) -> Tuple[str, str | None]:
    """
    Takes a raw solution text, sanitizes it, and separates the reasoning
    lines from the final answer line.
    
    Returns:
        A tuple containing (cleaned_reasoning_text, final_answer_string).
        final_answer_string is None if not found.
    """
    if not isinstance(raw_text, str):
        return "", None
        
    # 1. Sanitize all characters first
    sanitized_text = sanitize_text(raw_text)
    
    # 2. Remove calculator annotations
    text_no_annotations = re.sub(r'<<.*?>>', '', sanitized_text)
    
    # 3. Remove comma separators from numbers
    text_no_commas = sanitize_commas(text_no_annotations)
    
    lines = text_no_commas.split('\n')
    final_answer = None
    
    # 4. Find and extract the final answer line
    if lines and re.match(r'^\s*####\s*.*$', lines[-1]):
        final_answer_line = lines.pop().strip()
        # Extract just the number part after ####
        match = re.search(r'####\s*(.*)', final_answer_line)
        if match:
            final_answer = match.group(1).strip()

    # 5. Process the remaining reasoning lines
    cleaned_lines = [line.strip() for line in lines if line.strip()]
    reasoning_text = '\n'.join(cleaned_lines)
    
    return reasoning_text, final_answer

def convert_solution_to_json_str(cleaned_reasoning: str, final_answer: str | None) -> str:
    """
    Takes cleaned reasoning text and a final answer, and converts them into
    a single JSON-formatted string.
    """
    lines = cleaned_reasoning.split('\n')
    solution_dict = {f"L{i+1}": line for i, line in enumerate(lines) if line}
    
    if final_answer is not None:
        solution_dict["FA"] = final_answer
        
    return json.dumps(solution_dict, indent=2)

In [44]:
# Show 5 randomly chosen GSM8K problems, raw and after cleaning
print("Sample GSM8K problems:")
random_indices = random.sample(range(len(GSM8K_DATASET)), 5)
for idx in random_indices:
    sample = GSM8K_DATASET[idx]
    print(f"### Question {idx}:", sample['question'], "", sep="\n")
    print("### Original, raw answer:", sample['answer'], "", sep="\n")

    # Run the raw answer through the cleaning + JSON conversion helpers
    reasoning, final_answer = clean_and_split_solution(sample['answer'])
    solution_str = convert_solution_to_json_str(reasoning, final_answer)
    print("### Cleaned solution JSON:", solution_str, "-" * 80, sep="\n")

Sample GSM8K problems:
### Question 6660:
James started a food fight in his school's cafeteria, and the school administration is making him work off the cost of all the food he wasted at minimum wage ($8). James has to pay for 20 wasted pounds of meat at $5/pound, 15 wasted pounds of fruits and vegetables at $4/pound, 60 wasted pounds of bread products at $1.50/pound, and 10 hours of time-and-a-half pay for the janitorial staff, who normally make $10/hour. How many hours will James have to work to pay for everything?

### Original, raw answer:
First find the total cost of the wasted meat: $5/pound * 20 pounds = $<<5*20=100>>100
Then find the total cost of the wasted vegetables: $4/pound * 15 pounds = $<<4*15=60>>60
Then find the total cost of the wasted bread products: $1.50/pound * 60 pounds = $<<1.5*60=90>>90
Then find the time-and-a-half rate for the janitors: $10/hour * 1.5 = $<<10*1.5=15>>15/hour
Then multiply that by the number of hours to find the total janitorial cost: $15/hour

In [45]:
# Inspect which columns the programmatic computational-error catalog provides.
programmatic_comp_df.columns

Index(['index', 'tier', 'model', 'erroneous_line_number', 'explanation',
       'wrong_answer', 'correct_trace_generated', 'target_variable',
       'error_type', 'correct_value', 'flawed_value', 'repro_seed', 'date_utc',
       'time_utc', 'filepath'],
      dtype='object')

In [46]:
# Inspect which columns the manually generated errors table provides.
manual_errors_df.columns

Index(['answer', 'erroneous_line_number', 'error_type', 'explanation',
       'filepath', 'index', 'question', 'wrong_answer'],
      dtype='object')

In [47]:
# Assistant content (i.e. label) format: a fenced JSON object with the
# verdict, the erroneous line number and an explanation.
ASSISTANT_CONTENT_FORMAT = "\n".join([
    '```json',
    '{',
    '  "verdict": "correct" | "conceptual" | "computational",',
    '  "erroneous_line_number": str | None,',
    '  "explanation": str | None',
    '}',
    '```',
])

In [48]:
# Anchor problems: every distinct problem index that has at least one
# manual error row of type 'conceptual', in ascending order.
ANCHOR_INDICES = sorted(
    manual_errors_df.loc[manual_errors_df['error_type'] == 'conceptual', 'index'].unique()
)

print(f"Identified {len(ANCHOR_INDICES)} anchor indices with manual conceptual errors.")

Identified 956 anchor indices with manual conceptual errors.


In [None]:
def format_user_content(
        question: str,
        raw_answer: str
    ) -> str:
    """
    Build the user turn of an SFT sample.

    The raw solution is cleaned, converted to a line-numbered JSON string and
    placed under an "### Answer" header, below the problem statement under a
    "### Question" header.

    Args:
        question (str): The problem statement.
        raw_answer (str): The (raw, unprocessed) solution to the problem.

    Returns:
        str: Formatted user content string.
    """
    solution_json = convert_solution_to_json_str(*clean_and_split_solution(raw_answer))
    return f"### Question\n{question}\n\n### Answer\n{solution_json}"

def format_assistant_content(
        verdict: str,
        erroneous_line_number: str,
        explanation: str
    ) -> str:
    """
    Build the assistant turn (the label) of an SFT sample.

    Args:
        verdict (str): The verdict of the solution, one of "correct", "conceptual", or "computational".
        erroneous_line_number (str | None): The line number where the error occurred, if applicable.
        explanation (str | None): Explanation of the error, if applicable.

    Returns:
        str: The three fields serialized as 2-space-indented JSON, wrapped in
        a fenced ```json code block.
    """
    label_json = json.dumps(
        {
            "verdict": verdict,
            "erroneous_line_number": erroneous_line_number,
            "explanation": explanation,
        },
        indent=2,
    )
    return f"```json\n{label_json}\n```"

SyntaxError: unterminated string literal (detected at line 45) (4018768123.py, line 45)

In [50]:
# Illustrative template of one chat-format record. The `{question}` and
# `{solution_json}` markers are literal text here (plain strings, not f-strings).
sample = dict(
    messages=[
        dict(
            role="system",
            content="You are a mathematical solution verifier. Analyze the given problem and solution, then provide a verdict in JSON format.",
        ),
        dict(
            role="user",
            content="### Question:\n{question}\n\n### Answer:\n{solution_json}",
        ),
        dict(
            role="assistant",
            content='{"verdict": "correct", "erroneous_line_number": null, "explanation": null}',
        ),
    ],
    # Optional metadata for tracking
    index=123,
    error_type="correct",
)

In [51]:
def _build_sft_sample(system_message: str, user_content: str, assistant_content: str,
                      index: int, verdict: str) -> dict:
    """Assemble one chat-format SFT record together with its tracking metadata."""
    return {
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": assistant_content},
        ],
        "index": index,
        "verdict": verdict,
    }


def prepare_conceptual_and_correct_samples():
    """
    For each anchor index, prepares a 'correct' sample, a 'conceptual' sample, and (optionally) a 'computational' sample.

    The computational sample is taken from the manual errors DataFrame when one
    exists for the anchor index. Otherwise only the 'correct' and 'conceptual'
    samples are produced and the index is reported back, so that a
    computational error can be added later from the programmatic catalog.

    Every sample uses the same record layout: 'messages' (system, user,
    assistant), 'index' (the GSM8K problem index) and 'verdict' (one of
    "correct", "conceptual", "computational" - the same value that is written
    into the assistant label).

    Returns:
        samples: List of dictionaries with 'messages' key for each sample
        missing_computational_indices: List of anchor indices missing computational errors
    """
    # System message for all samples
    SYSTEM_MESSAGE = "You are a mathematical solution verifier. Analyze the given problem and solution, then provide a verdict in JSON format indicating whether the solution is correct, contains a conceptual error, or contains a computational error."

    samples = []
    missing_computational_indices = []

    # Split the manual errors once by type; rows are selected per index below.
    conceptual_df = manual_errors_df[manual_errors_df['error_type'] == 'conceptual']
    computational_df = manual_errors_df[manual_errors_df['error_type'] == 'computational']

    for idx in ANCHOR_INDICES:
        idx = int(idx)
        original_problem = GSM8K_DATASET[idx]
        question = original_problem['question']

        # 1. Add the "Correct" sample (the original GSM8K solution).
        # NOTE(review): the string "None" (not a JSON null) is emitted for the
        # empty fields - confirm this is the intended label format.
        samples.append(_build_sft_sample(
            SYSTEM_MESSAGE,
            format_user_content(question, original_problem['answer']),
            format_assistant_content(
                verdict="correct",
                erroneous_line_number="None",
                explanation="None"
            ),
            idx,
            "correct",
        ))

        # 2. Add the "Conceptual" sample (first manual conceptual error for this index).
        conceptual_row = conceptual_df[conceptual_df['index'] == idx].iloc[0]
        samples.append(_build_sft_sample(
            SYSTEM_MESSAGE,
            format_user_content(question, conceptual_row['wrong_answer']),
            format_assistant_content(
                verdict="conceptual",
                erroneous_line_number=str(conceptual_row["erroneous_line_number"]),
                explanation=conceptual_row["explanation"]
            ),
            idx,
            "conceptual",
        ))

        # 3. Add the "Computational" sample if a manual one exists for this index.
        computational_rows = computational_df[computational_df['index'] == idx]
        if computational_rows.empty:
            missing_computational_indices.append(idx)
            continue

        comp_row = computational_rows.iloc[0]
        samples.append(_build_sft_sample(
            SYSTEM_MESSAGE,
            format_user_content(question, comp_row['wrong_answer']),
            format_assistant_content(
                verdict="computational",
                # str() keeps the label type identical to the conceptual branch.
                erroneous_line_number=str(comp_row["erroneous_line_number"]),
                explanation=comp_row["explanation"]
            ),
            idx,
            "computational",
        ))

    return samples, missing_computational_indices

In [52]:
# Build the samples for every anchor index and collect the anchor indices
# that have no manual computational error.
samples, missing_computational_indices = prepare_conceptual_and_correct_samples()

In [55]:
# Preview the user prompt and the assistant label of the first 5 samples
examples = samples[:5]
print("Example samples:")
for sample_number, sample in enumerate(examples, start=1):
    user_content = sample['messages'][1]['content']
    assistant_content = sample['messages'][2]['content']
    print(f"Sample {sample_number}:")
    print("User Content:", user_content, "", sep="\n")
    print("Assistant Content:", assistant_content, "=" * 80, sep="\n")

Example samples:
Sample 1:
User Content:
### Question
Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?

### Answer
{
  "L1": "In the beginning, Betty has only 100 / 2 = $50.",
  "L2": "Betty's grandparents gave her 15 * 2 = $30.",
  "L3": "This means, Betty needs 100 - 50 - 30 - 15 = $5 more.",
  "FA": "5"
}

Assistant Content:
{
  "verdict": "correct",
  "erroneous_line_number": "None",
  "explanation": "None"
}
Sample 2:
User Content:
### Question
Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?

### Answer
{
  "L1": "In the beginning, Betty has only 100 / 2 = $50.",
