In [1]:
# --- Imports and Path Definitions ---

import json
import re
import ast
import inspect
import importlib.util
from pathlib import Path
from types import ModuleType
from typing import Callable, Any, Dict, List, Set
from fractions import Fraction as BuiltinFraction
import copy
import datetime

import pandas as pd
from tqdm.notebook import tqdm
from datasets import load_dataset, Dataset
from joblib import Parallel, delayed

def find_project_root(marker: str = ".git") -> Path:
    """
    Locates the project root by walking upwards from the current working
    directory until a directory containing `marker` is found.

    Every ancestor is inspected, including the filesystem root itself, so a
    project that lives directly at the top of a filesystem (for example a
    repository mounted at '/') is still detected.

    Args:
        marker: The filename or directory name that marks the project root.

    Returns:
        A Path object to the project root directory.

    Raises:
        FileNotFoundError: If no directory on the way up contains the marker.
    """
    start_path = Path.cwd().resolve()
    # `Path.parents` ends with (and includes) the filesystem root; the starting
    # directory is prepended so that it is the first candidate checked.
    for candidate in (start_path, *start_path.parents):
        if (candidate / marker).exists():
            return candidate
    raise FileNotFoundError(f"Could not find project root. Marker '{marker}' not found.")

# Resolve the repository root once; every data path below is derived from it.
PROJECT_ROOT = find_project_root()
DATA_DIR = PROJECT_ROOT.joinpath('data')

# --- Directory Paths ---
# Input directory (processed templates), output directory (conceptual
# candidates), and the catalog CSV stored inside the output directory.
PROCESSED_TEMPLATE_DIR = DATA_DIR.joinpath("template-generated-processed")
CONCEPTUAL_CANDIDATES_DIR = DATA_DIR.joinpath("conceptual-error-candidates")
CATALOG_PATH = CONCEPTUAL_CANDIDATES_DIR.joinpath("conceptual_candidate_catalog.csv")

# --- Models ---
MODELS = ['google_gemini-2.5-flash', 'openai_gpt-4.1']

# Echo the resolved locations, one per line, so the cell output documents
# which paths this run is using.
print(
    f"Project root: {PROJECT_ROOT}",
    f"Input (Processed Templates): {PROCESSED_TEMPLATE_DIR}",
    f"Output (Conceptual Candidates): {CONCEPTUAL_CANDIDATES_DIR}",
    f"Catalog Path: {CATALOG_PATH}",
    sep="\n",
)

# --- Ensure Directories Exist ---
# `exist_ok=True` keeps re-running this cell harmless.
PROCESSED_TEMPLATE_DIR.mkdir(parents=True, exist_ok=True)
CONCEPTUAL_CANDIDATES_DIR.mkdir(parents=True, exist_ok=True)

Project root: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math
Input (Processed Templates): /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/template-generated-processed
Output (Conceptual Candidates): /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/conceptual-error-candidates
Catalog Path: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/conceptual-error-candidates/conceptual_candidate_catalog.csv


In [2]:
# --- Load GSM8K Dataset ---
# Load the "main" configuration of GSM8K and keep only its "train" split.
# The tiering and solution-mapping helpers in this notebook read from it.
GSM8K_TRAIN: Dataset = load_dataset("gsm8k", "main")["train"]

# --- Tier Definition Functions ---
def has_computational_division(solution_text: str):
    """
    Reports whether the solution text contains a '/' that is followed,
    after optional whitespace, by a digit.
    """
    division_match = re.search(r'/\s*\d', solution_text)
    return division_match is not None

def has_float(solution_text: str):
    """
    Reports whether the solution text contains a decimal number: either
    digits on both sides of a dot ('3.14') or a leading-dot form ('.5')
    whose dot is not directly preceded by a digit.
    """
    decimal_regex = r'(?<!\d)\.\d+|\d+\.\d+'
    return re.search(decimal_regex, solution_text) is not None

def is_symbolic(solution_text: str):
    """
    Reports whether any line of the solution text starts with 'Let ',
    a single ASCII letter, and a space (e.g., 'Let x be ...').
    """
    return re.search(r'^Let [a-zA-Z] ', solution_text, flags=re.MULTILINE) is not None

def mutually_disjoint_tiers(dataset: Dataset) -> Dict[str, List[int]]:
    """
    Categorizes all problems in the dataset into mutually disjoint tiers
    based on the mathematical operations present in their solution text.

    The dataset is traversed once and each row's "answer" text is classified
    exactly once, instead of re-reading every row and re-running the regex
    checks separately for each tier.

    Tiers:
        tier1: not symbolic, no float, no division.
        tier2: not symbolic, float, no division.
        tier3: not symbolic, no float, division.
        tier4: not symbolic, float and division.
        tier5: symbolic (takes precedence over the float/division checks).

    Args:
        dataset: Iterable of row mappings; a missing "answer" key is treated
            as an empty solution text.

    Returns:
        A dict with keys "tier1" .. "tier5" (in that order), each mapping to
        the ascending list of row indices belonging to that tier.
    """
    tiers: Dict[str, List[int]] = {f"tier{n}": [] for n in range(1, 6)}

    # (has_float, has_computational_division) -> tier for non-symbolic rows.
    non_symbolic_tier = {
        (False, False): "tier1",
        (True, False): "tier2",
        (False, True): "tier3",
        (True, True): "tier4",
    }

    # Rows are visited in index order, so every list is already sorted.
    for idx, sample in enumerate(dataset):
        answer_text = sample.get("answer", "")
        if is_symbolic(answer_text):
            tier_name = "tier5"
        else:
            tier_name = non_symbolic_tier[
                (has_float(answer_text), has_computational_division(answer_text))
            ]
        tiers[tier_name].append(idx)
    return tiers

# Compute the tier -> row-index lists for the GSM8K training split once,
# when this cell runs, and keep them in a module-level constant.
TIER_LISTS = mutually_disjoint_tiers(GSM8K_TRAIN)
print("Tier definitions loaded.")

Tier definitions loaded.


In [3]:
def sanitize_text(text: str):
    """
    Returns `text` with a fixed set of typographic Unicode characters
    (math operators, curly quotes, dashes, ellipsis, no-break space)
    swapped for ASCII equivalents, so that model generation and string
    parsing do not trip over them.
    """
    ascii_by_char = str.maketrans({
        "\u2212": "-",    # minus sign
        "\u00d7": "*",    # multiplication sign
        "\u00f7": "/",    # division sign
        "\u22c5": "*",    # dot operator
        "\u201c": '"',    # left double quotation mark
        "\u201d": '"',    # right double quotation mark
        "\u2018": "'",    # left single quotation mark
        "\u2019": "'",    # right single quotation mark
        "\u2014": "-",    # em dash
        "\u2013": "-",    # en dash
        "\u2026": "...",  # horizontal ellipsis
        "\u00a0": " ",    # no-break space
    })
    # No replacement contains any of the mapped characters, so one
    # translation pass gives the same result as replacing them one by one.
    return text.translate(ascii_by_char)

def build_solution_mapping(index: int) -> Dict[str, str]:
    """
    Extracts the original natural language solution from the dataset, sanitizes
    it, and structures it into a line-numbered dictionary including the 'FA'
    (Final Answer) line.

    Args:
        index: Row index into GSM8K_TRAIN.

    Returns:
        A dict whose first key is "FA" (the trailing '#### <number>' line, when
        the solution ends with one), followed by "L1", "L2", ... for the
        remaining non-blank lines in order. Returns an empty dict when `index`
        is out of range.
    """
    # Only the dataset lookup can raise IndexError, so only it is guarded.
    try:
        solution_text = GSM8K_TRAIN[index]["answer"]
    except IndexError:
        return {}

    sanitized_text = sanitize_text(solution_text)
    lines = [ln.strip() for ln in sanitized_text.splitlines() if ln.strip()]

    solution_mapping: Dict[str, str] = {}
    # The optional leading '-' lets negative final answers (e.g. '#### -5') be
    # recognised; without it such a line would be filed as an ordinary step
    # and the mapping would have no "FA" entry.
    if lines and re.match(r"^####\s*-?[\d\.,]+$", lines[-1]):
        solution_mapping["FA"] = lines.pop(-1)

    for i, line in enumerate(lines, 1):
        solution_mapping[f"L{i}"] = line

    return solution_mapping

# Status message emitted once the helper definitions in this cell have run.
print("Sanitization and Solution Mapping utilities defined.")

Sanitization and Solution Mapping utilities defined.


In [4]:
# Load the conceptual-candidate catalog CSV stored at CATALOG_PATH.
catalog = pd.read_csv(CATALOG_PATH)

# Preview the first rows with the notebook's rich table rendering.
display(catalog.head())

# DataFrame.info() prints its summary itself and returns None, so it is called
# bare; wrapping it in display() would render a stray "None" after the summary.
catalog.info()

Unnamed: 0,index,tier,model,mutation_type,target_variable,correct_value,flawed_value,repro_seed,date_utc,time_utc,mutation_details,filepath
0,4,tier1,google_gemini-2.5-flash,input_misrepresentation,pages_per_friend_per_week,6,4.0,-5110171652223615853,2025-07-20,17:24:50,"{""type"": ""input_misrepresentation"", ""target_va...",data/conceptual-error-candidates/tier1/4/googl...
1,4,tier1,google_gemini-2.5-flash,input_misrepresentation,total_pages_per_year,624,72.0,-5110171652223615853,2025-07-20,17:24:50,"{""type"": ""input_misrepresentation"", ""target_va...",data/conceptual-error-candidates/tier1/4/googl...
2,4,tier1,google_gemini-2.5-flash,operator_swap,total_pages_per_week,12,3.0,-5110171652223615853,2025-07-20,17:24:50,"{""type"": ""operator_swap"", ""target_variable"": ""...",data/conceptual-error-candidates/tier1/4/googl...
3,4,tier1,google_gemini-2.5-flash,incorrect_final_answer_selection,answer,624,12.0,-5110171652223615853,2025-07-20,17:24:50,"{""type"": ""incorrect_final_answer_selection"", ""...",data/conceptual-error-candidates/tier1/4/googl...
4,4,tier1,google_gemini-2.5-flash,incorrect_final_answer_selection,answer,624,6.0,-5110171652223615853,2025-07-20,17:24:50,"{""type"": ""incorrect_final_answer_selection"", ""...",data/conceptual-error-candidates/tier1/4/googl...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33738 entries, 0 to 33737
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   index             33738 non-null  int64 
 1   tier              33738 non-null  object
 2   model             33738 non-null  object
 3   mutation_type     33738 non-null  object
 4   target_variable   33738 non-null  object
 5   correct_value     33738 non-null  object
 6   flawed_value      33738 non-null  object
 7   repro_seed        33738 non-null  int64 
 8   date_utc          33738 non-null  object
 9   time_utc          33738 non-null  object
 10  mutation_details  33738 non-null  object
 11  filepath          33738 non-null  object
dtypes: int64(2), object(10)
memory usage: 3.1+ MB


None