In [36]:
# ---------------------------------------------------------------------- #
#  Imports
# ---------------------------------------------------------------------- #

from datasets import load_dataset
import re
import json
import importlib.util
import copy
import ast
import inspect
from pathlib import Path
from typing import Dict, Any, Tuple, Optional, List
from num2words import num2words

# ---------------------------------------------------------------------- #
#  Global constants & Configuration
# ---------------------------------------------------------------------- #

def find_project_root():
    """Traverse upwards to find the project root, marked by the .git folder."""
    current_path = Path.cwd()
    while current_path != current_path.parent:
        if (current_path / ".git").is_dir():
            return current_path
        current_path = current_path.parent
    raise FileNotFoundError("Could not find project root. Is this a git repository?")


PROJECT_ROOT = find_project_root()

# Define project paths
CORRECT_CODE_DIR = PROJECT_ROOT / 'data' / 'code_gen_outputs_traced'
# The 'FLAWED_CODE_DIR' now serves as the location for both the flawed
# code files and their corresponding metadata files.
FLAWED_CODE_DIR = PROJECT_ROOT / 'data' / 'code_with_error_traced'

# Confirm the paths
print(f"Project root found: {PROJECT_ROOT}")
print(f"Path to correct code: {CORRECT_CODE_DIR}")
print(f"Path to flawed code & metadata: {FLAWED_CODE_DIR}")


# Load dataset and define model lists
gsm8k_train = load_dataset("gsm8k", "main", split="train")

MODEL_DICT = {
  "anthropic": ["claude-3-5-haiku-20241022"],
  "openai": ["gpt-4.1-mini"],
  "google": ["gemini-2.0-flash-thinking-exp",
             "gemini-2.5-flash-lite-preview-06-17",
             "gemini-2.5-flash"]
}

MODELS = [f"{provider}_{model}" for provider, sublist in MODEL_DICT.items() for model in sublist]


# ==============================================================================
# Utility Functions
# ==============================================================================

def build_solution_mapping(index: int, dataset: "datasets.Dataset") -> Dict[str, str]:
    """
    Extracts the natural language solution for a given problem index,
    cleans it, and structures it into a line-numbered dictionary.
    """
    solution_mapping = {}
    solution_text = dataset[index]["answer"]
    lines = [ln.strip() for ln in solution_text.splitlines() if ln.strip()]

    # Improved regex to handle commas in the final answer
    if lines and re.match(r"^####\s*[\d\.,]+$", lines[-1]):
        solution_mapping["FA"] = lines.pop(-1).strip()

    # Normalize calculator annotation brackets for consistent parsing
    angle = re.compile(r"<<([^>]+)>>")
    lines = [angle.sub(r"[[\1]]", ln) for ln in lines]

    for i, line in enumerate(lines, 1):
        solution_mapping[f"L{i}"] = line

    return solution_mapping

def execution_trace(func) -> Dict[str, Any]:
    """Simulates execution of a function and returns a variable-to-value map."""
    src = inspect.getsource(func)
    tree = ast.parse(src)
    func_def = tree.body[0]
    env = {}
    
    # Get default args
    arg_names = [arg.arg for arg in func_def.args.args]
    defaults = func_def.args.defaults
    for name, val_node in zip(arg_names[-len(defaults):], defaults):
        env[name] = eval(compile(ast.Expression(val_node), '', 'eval'))

    # Execute body
    for stmt in func_def.body:
        if isinstance(stmt, ast.Assign):
            code_obj = compile(ast.Module([stmt], []), '', 'exec')
            exec(code_obj, {}, env)
    return env

Project root found: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math
Path to correct code: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/code_gen_outputs_traced
Path to flawed code & metadata: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/code_with_error_traced


In [37]:
class NaturalLanguageErrorInjector:
    """
    Generates a flawed natural language (NL) solution and its corresponding
    structured JSON label by injecting a programmatic error from a source
    code function.
    """

    def __init__(self, problem_index: int, model_name: str, error_type: str,
                 correct_code_dir: Path, flawed_code_dir: Path,
                 dataset: "datasets.Dataset"):
        """
        Initializes the injector for a specific error instance.
        """
        self.problem_index = problem_index
        self.model_name = model_name
        self.error_type = error_type
        
        self.correct_code_dir = correct_code_dir
        self.flawed_code_dir = flawed_code_dir
        self.dataset = dataset

        self.f_oracle = None
        self.f_flawed = None
        self.correct_trace = None
        self.flawed_trace = None
        self.metadata = None
        self.original_nl_solution = None
        self.deleted_nl_line_text = ""
        self.operator_swap_successful = False

        self.op_map = {'Mult': '*', 'Add': '+', 'Sub': '-', 'Div': '/'}
        self.op_synonyms = {'Mult': ['*', 'x'], 'Add': ['+'], 'Sub': ['-'], 'Div': ['/']}

    def _load_module_from_path(self, file_path: Path, module_name: str):
        if not file_path.exists():
            print(f"❌ Error: File not found at {file_path}")
            return None
        spec = importlib.util.spec_from_file_location(module_name, file_path)
        if spec and spec.loader:
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
            return module
        return None

    def _load_data_sources(self) -> bool:
        # Load correct code and trace
        correct_path = self.correct_code_dir / str(self.problem_index) / f"{self.model_name}.py"
        self.f_oracle = self._load_module_from_path(correct_path, "f_oracle")
        if not self.f_oracle: return False
        self.correct_trace = execution_trace(self.f_oracle.solve)

        # Load flawed code and trace
        base_flawed_dir = self.flawed_code_dir / self.error_type / str(self.problem_index)
        flawed_path = base_flawed_dir / f"{self.model_name}.py"
        self.f_flawed = self._load_module_from_path(flawed_path, "f_flawed")
        if self.f_flawed:
            try:
                self.flawed_trace = execution_trace(self.f_flawed.solve)
            except Exception:
                if self.error_type != 'skipped_step': print(f"⚠️ Warning: Could not trace flawed function for {self.error_type}")
                self.flawed_trace = {}
        else:
            if self.error_type != 'skipped_step': return False
            self.flawed_trace = {}

        # Load metadata
        metadata_path = base_flawed_dir / f"metadata_{self.problem_index}_{self.error_type}.json"
        if not metadata_path.exists():
            print(f"❌ Error: Metadata file not found at {metadata_path}")
            return False
        with open(metadata_path, 'r') as f:
            full_metadata = json.load(f)
            self.metadata = full_metadata.get(self.model_name)
        if not self.metadata:
            print(f"❌ Error: No metadata found for model '{self.model_name}' in {metadata_path}")
            return False

        # Load NL solution
        self.original_nl_solution = build_solution_mapping(self.problem_index, self.dataset)
        if not self.original_nl_solution:
            print(f"❌ Error: Could not build NL solution for index {self.problem_index}")
            return False
            
        return True

    # --- BUG FIX #1: Handles numbers with and without commas ---
    def _replace_number_in_string(self, text: str, old_num: float, new_num: float) -> str:
        if abs(old_num - new_num) < 1e-9:
            return text
        
        new_num_str = str(int(new_num)) if new_num.is_integer() else f"{new_num:.2f}".rstrip('0').rstrip('.')
        
        # Define multiple string versions of the old number to find and replace
        old_num_strs_to_find = []
        if old_num.is_integer():
            old_num_int = int(old_num)
            old_num_strs_to_find.append(str(old_num_int))
            # Handle comma-separated version
            old_num_strs_to_find.append(f"{old_num_int:,}")
            # Handle word form
            old_num_strs_to_find.append(num2words(old_num_int))
        else:
            old_num_strs_to_find.append(str(old_num))

        modified_text = text
        for old_str in old_num_strs_to_find:
            pattern = re.compile(r'\b' + re.escape(old_str) + r'\b', re.IGNORECASE)
            modified_text = pattern.sub(new_num_str, modified_text)
        
        return modified_text

    def _modify_nl_line(self, line_text: str) -> str:
        # Pre-process line by removing commas to simplify number finding
        text_for_search = line_text.replace(',', '')
        try:
            numbers_in_text = set(re.findall(r'\b\d+\.?\d*\b', text_for_search))
            unique_numbers = sorted([float(n) for n in numbers_in_text], reverse=True)
        except (ValueError, TypeError):
            return line_text

        if not unique_numbers:
            return line_text

        num_to_var_map = {}
        trace_copy = self.correct_trace.copy()
        for num in unique_numbers:
            found_var = None
            for var, val in trace_copy.items():
                if isinstance(val, (int, float)) and abs(val - num) < 1e-6:
                    found_var = var
                    break
            if found_var:
                num_to_var_map[num] = found_var
                del trace_copy[found_var]

        if not num_to_var_map:
            return line_text

        modified_line = line_text
        for old_num, var_name in num_to_var_map.items():
            if var_name in self.flawed_trace:
                new_flawed_value = float(self.flawed_trace[var_name])
                modified_line = self._replace_number_in_string(modified_line, old_num, new_flawed_value)
        
        return modified_line

    def _recalculate_annotation(self, line_text: str) -> str:
        """
        Finds a calculator annotation, re-evaluates its expression, and updates it.
        This version uses a corrected regex to match annotations without spaces.
        """
        
        def replacer(match):
            prefix, expression, equals, old_result, suffix = match.groups()
            try:
                # Security Note: Using eval is a risk if input isn't trusted.
                # Here, we trust it as it comes from our controlled NL solution.
                # The replace(' ', '') call handles extra spaces for more robust evaluation.
                new_result = eval(expression.replace('x', '*').replace(' ', ''))
                new_result_str = str(int(new_result)) if float(new_result).is_integer() else f"{new_result:.2f}".rstrip('0').rstrip('.')
                # Reconstruct with the original equals sign, but the new result.
                return f"{prefix}{expression}{equals}{new_result_str}{suffix}"
            except:
                # If eval fails (e.g., contains variables like 'S'), return original match.
                return match.group(0)

        # CORRECTED REGEX: Removed spaces around the equals sign.
        pattern = re.compile(r'(\[\[)(.*?)(=)([\d\.\-]+)(\]\])')
        return pattern.sub(replacer, line_text)

    def _cleanup_double_operators(self, line_text: str) -> str:
        """Replaces string artifacts like '+-' with '-' and '--' with '+'."""
        # Corrected replacement removes unwanted space after '+'
        return line_text.replace('+-', '-').replace('--', '+')

    def _generate_flawed_nl_solution(self) -> Dict[str, str]:
        """
        Creates the flawed natural language solution mapping by following a
        robust, multi-stage modification pipeline for each line.
        """
        flawed_nl = copy.deepcopy(self.original_nl_solution)
        self.operator_swap_successful = False

        if self.error_type == 'skipped_step':
            line_to_delete = self.metadata['line_label']
            if line_to_delete in flawed_nl:
                self.deleted_nl_line_text = flawed_nl[line_to_delete]
                del flawed_nl[line_to_delete]
            return flawed_nl

        sorted_keys = sorted(flawed_nl.keys(), key=lambda k: (k[0] != 'L', int(k[1:]) if k.startswith('L') else float('inf')))
        
        for key in sorted_keys:
            line_text = flawed_nl.get(key, "")
            if not line_text: continue

            # --- Stage 1: Propagate flawed numerical values to operands ---
            modified_line = self._modify_nl_line(line_text)

            # --- Stage 2: Inject operator error if applicable ---
            if self.error_type == 'incorrect_operation' and key == self.metadata.get('line_label'):
                original_op_name = self.metadata.get('original_op')
                new_op_name = self.metadata.get('new_op')
                symbols_to_replace = self.op_synonyms.get(original_op_name, [])
                new_op_symbol = self.op_map.get(new_op_name, '?')
                is_op_present = any(op in modified_line for op in symbols_to_replace)

                if is_op_present and new_op_symbol != '?':
                    for old_symbol in symbols_to_replace:
                        modified_line = modified_line.replace(old_symbol, new_op_symbol)
                    self.operator_swap_successful = True
            
            # --- Stage 3: Make Annotation and Prose Consistent ---
            recalculated_line = self._recalculate_annotation(modified_line)
            
            # Extract the new, definitive result from the recalculated annotation
            match_recalc = re.search(r'\[\[.*?=([\d\.\-]+)\]\]', recalculated_line)
            if match_recalc:
                new_result_str = match_recalc.group(1)
                
                # Create a precise regex to find and replace only the number immediately after the annotation
                # This avoids accidentally changing other numbers in the prose.
                # It looks for `]]` followed by optional currency symbols/spaces and then the number.
                prose_result_pattern = re.compile(r'(\]\][\s\$]*)([\d\.\,]+)')
                
                # Replace the prose result with the new definitive result from the annotation
                final_line = prose_result_pattern.sub(r'\g<1>' + new_result_str, recalculated_line, 1)
            else:
                # If there's no annotation, just use the line as is
                final_line = recalculated_line

            # --- Stage 4: Final Formatting Cleanup ---
            flawed_nl[key] = self._cleanup_double_operators(final_line)
        
        return flawed_nl

    def _generate_json_label(self) -> Dict[str, Any]:
        m = self.metadata
        explanation, correction = "Error", "Error"

        if self.error_type == 'computational_error':
            explanation = f"There is a computational error. The solution states the result is {m['new_value']}, but the correct value is {m['original_value']}."
            correction = f"To correct this, replace the incorrect value {m['new_value']} with the correct value {m['original_value']}."

        elif self.error_type == 'incorrect_operation':
            original_op_symbol = self.op_map.get(m.get('original_op'), '?')
            new_op_symbol = self.op_map.get(m.get('new_op'), '?')
            explanation = f"The solution incorrectly uses a '{new_op_symbol}' operation where a '{original_op_symbol}' operation was needed."
            correction = f"To correct this, the operator should be changed from '{new_op_symbol}' to '{original_op_symbol}'."

        elif self.error_type == 'skipped_step':
            explanation = f"A necessary calculation step is missing. The solution fails to perform the step that would have defined the variable '{m['deleted_variable']}'."
            correction = f"To correct this, the following step must be inserted back into the solution: '{self.deleted_nl_line_text}'"
        
        elif self.error_type == 'incorrect_operand':
            explanation = f"The calculation uses an incorrect variable. It incorrectly references '{m['new_operand']}' instead of '{m['original_operand']}'."
            correction = f"To correct this, the variable '{m['new_operand']}' should be replaced with the correct variable, '{m['original_operand']}'."

        return {
            "verdict": "Flawed",
            "error_details": {
                "error_type": self.error_type,
                "erroneous_line_number": m['line_label'],
                "explanation": explanation,
                "correction": correction
            }
        }

    def inject_nl_error(self) -> Optional[Tuple[Dict[str, str], Dict[str, Any]]]:
        if not self._load_data_sources():
            return None
        
        flawed_nl_solution = self._generate_flawed_nl_solution()
        
        if self.error_type == 'incorrect_operation' and not self.operator_swap_successful:
            return None

        json_label = self._generate_json_label()
        
        return flawed_nl_solution, json_label

In [38]:
def test_single_injection(
        problem_index: int, 
        model_name: str, 
        error_type: str, 
        verbose: bool = True):
    """
    A simple wrapper to test the NaturalLanguageErrorInjector for a single case.
    Uses the global path and dataset variables defined in Cell 1.
    """
    if verbose:
        print(f"============================================================")
        print(f"  Testing Injection for: {error_type.upper()}")
        print(f"  Problem Index: {problem_index}, Model: {model_name}")
        print(f"============================================================\n")

    injector = NaturalLanguageErrorInjector(
        problem_index=problem_index,
        model_name=model_name,
        error_type=error_type,
        correct_code_dir=CORRECT_CODE_DIR,
        flawed_code_dir=FLAWED_CODE_DIR,
        dataset=gsm8k_train
    )

    result = injector.inject_nl_error()

    if result:
        flawed_solution, final_label = result
        if verbose:
            print("--- Original NL Solution ---")
            print(json.dumps(injector.original_nl_solution, indent=4))
            
            print("\n--- Generated Flawed NL Solution ---")
            print(json.dumps(flawed_solution, indent=4))
            
            print("\n--- Generated JSON Label ---")
            print(json.dumps(final_label, indent=4))
            print("\n✅ Injection Successful.\n")
        return result
    else:
        if verbose:
            print(f"❌ Injection Failed for {error_type}.\n")
        return None

In [39]:
# --- Example Usage ---
# Assuming 'metadata.json' files exist in the specified directories.
# E.g., 'data/code_with_error_traced/computational_error/0/metadata.json'

# test_problem_index = 0
test_model = "anthropic_claude-3-5-haiku-20241022"
test_error_types = ['computational_error', 'incorrect_operation', 'skipped_step']

for test_problem_index in range(10):
    for error_type in test_error_types:
        test_single_injection(
            problem_index=test_problem_index,
            model_name=test_model,
            error_type=error_type,
        )

  Testing Injection for: COMPUTATIONAL_ERROR
  Problem Index: 0, Model: anthropic_claude-3-5-haiku-20241022

--- Original NL Solution ---
{
    "FA": "#### 72",
    "L1": "Natalia sold 48/2 = [[48/2=24]]24 clips in May.",
    "L2": "Natalia sold 48+24 = [[48+24=72]]72 clips altogether in April and May."
}

--- Generated Flawed NL Solution ---
{
    "FA": "#### 82",
    "L1": "Natalia sold 48/2 = [[48/2=24]]24 clips in May.",
    "L2": "Natalia sold 48+24 = [[48+24=72]]72 clips altogether in April and May."
}

--- Generated JSON Label ---
{
    "verdict": "Flawed",
    "error_details": {
        "error_type": "computational_error",
        "erroneous_line_number": "L2",
        "explanation": "There is a computational error. The solution states the result is 82.0, but the correct value is 72.0.",
        "correction": "To correct this, replace the incorrect value 82.0 with the correct value 72.0."
    }
}

✅ Injection Successful.

  Testing Injection for: INCORRECT_OPERATION
  Problem I

In [40]:
for test_problem_index in range(11, 21):
    for error_type in test_error_types:
        test_single_injection(
            problem_index=test_problem_index,
            model_name=test_model,
            error_type=error_type,
        )

  Testing Injection for: COMPUTATIONAL_ERROR
  Problem Index: 11, Model: anthropic_claude-3-5-haiku-20241022

--- Original NL Solution ---
{
    "FA": "#### 5",
    "L1": "He saved up $110 total because 95 + 15 = [[95+15=110]]110",
    "L2": "He saved $15 from his allowance because 3 x 5 = [[3*5=15]]15",
    "L3": "He earned $60 mowing lawns because 4 x 15 = [[4*15=60]]60",
    "L4": "He earned $35 shoveling driveways because 110 - 60 - 15 = [[110-60-15=35]]35",
    "L5": "He shoveled 5 driveways because 35 / 7 = [[35/7=5]]5"
}

--- Generated Flawed NL Solution ---
{
    "FA": "#### 5",
    "L1": "He saved up $110 total because 95 + 15 = [[95+15=110]]110",
    "L2": "He saved $15 from his allowance because 3 x 5 = [[3*5=15]]15",
    "L3": "He earned $60 mowing lawns because 4 x 15 = [[4*15=60]]60",
    "L4": "He earned $25 shoveling driveways because 110 - 60 - 15 = [[110-60-15=35]]35",
    "L5": "He shoveled 5 driveways because 25 / 7 = [[25/7=3.57]]3.57"
}

--- Generated JSON Label -

In [41]:
def test_single_injection(problem_index: int, model_name: str, error_type: str,
                          print_original_solution: bool = True, verbose: bool = True):
    """
    A simple wrapper to test the NaturalLanguageErrorInjector for a single case.
    Uses the global path and dataset variables.

    Args:
        problem_index: The GSM8K index to test.
        model_name: The model whose output will be used.
        error_type: The error type to inject.
        print_original_solution: If True, prints the original NL solution.
        verbose: If True, prints detailed step-by-step outputs.
    """
    if verbose:
        print(f"------------------------------------------------------------")
        print(f"  Testing Injection for: {error_type.upper()}")
        print(f"------------------------------------------------------------\n")

    injector = NaturalLanguageErrorInjector(
        problem_index=problem_index,
        model_name=model_name,
        error_type=error_type,
        correct_code_dir=CORRECT_CODE_DIR,
        flawed_code_dir=FLAWED_CODE_DIR,
        dataset=gsm8k_train
    )

    result = injector.inject_nl_error()

    if result:
        flawed_solution, final_label = result
        if verbose:
            if print_original_solution:
                print("--- Original NL Solution ---")
                print(json.dumps(injector.original_nl_solution, indent=4))
            
            print("--- Generated Flawed NL Solution ---")
            print(json.dumps(flawed_solution, indent=4))
            
            print("\n--- Generated JSON Label ---")
            print(json.dumps(final_label, indent=4))
            print("\n✅ Injection Successful.\n")
        return result
    else:
        if verbose:
            print(f"❌ Injection Failed for {error_type}.\n")
        return None

# ==============================================================================
# New, more concise testing loop for multiple problems
# ==============================================================================

# --- Configuration ---
# You can change this list to test any range of problems
indices_to_test = range(50) 
test_model = "anthropic_claude-3-5-haiku-20241022"
test_error_types = ['computational_error', 'incorrect_operation', 'skipped_step']

# --- Main Loop ---
for index in indices_to_test:
    print(f"\n\n============================================================")
    print(f"  ANALYZING PROBLEM INDEX: {index}  ")
    print(f"============================================================\n")
    
    # Print the original solution once per problem
    original_solution = build_solution_mapping(index, gsm8k_train)
    print("--- Original NL Solution ---")
    print(json.dumps(original_solution, indent=4))
    print("\n")
    
    # Test all error types for this problem
    for error_type in test_error_types:
        test_single_injection(
            problem_index=index,
            model_name=test_model,
            error_type=error_type,
            # Pass False to prevent re-printing the original solution
            print_original_solution=False 
        )



  ANALYZING PROBLEM INDEX: 0  

--- Original NL Solution ---
{
    "FA": "#### 72",
    "L1": "Natalia sold 48/2 = [[48/2=24]]24 clips in May.",
    "L2": "Natalia sold 48+24 = [[48+24=72]]72 clips altogether in April and May."
}


------------------------------------------------------------
  Testing Injection for: COMPUTATIONAL_ERROR
------------------------------------------------------------

--- Generated Flawed NL Solution ---
{
    "FA": "#### 82",
    "L1": "Natalia sold 48/2 = [[48/2=24]]24 clips in May.",
    "L2": "Natalia sold 48+24 = [[48+24=72]]72 clips altogether in April and May."
}

--- Generated JSON Label ---
{
    "verdict": "Flawed",
    "error_details": {
        "error_type": "computational_error",
        "erroneous_line_number": "L2",
        "explanation": "There is a computational error. The solution states the result is 82.0, but the correct value is 72.0.",
        "correction": "To correct this, replace the incorrect value 82.0 with the correct value 72.

In [42]:
# --- Bug #1: Testing Comma-Separated Number Handling ---
# These problems involve large numbers with commas.
# Key Test: Index 15
comma_test_indices = [15, 42]

# --- Bug #2: Testing Annotation Consistency ---
# These problems test if the expression inside [[...]] is correctly recalculated.
# Key Tests: Index 4, 11, 27
annotation_consistency_indices = [4, 11, 27, 36]

# --- Bug #3: Testing Double Negative Formatting ---
# These problems test if '+-' and '--' are cleaned up.
# Key Test: Index 18
double_negative_indices = [3, 18]

# --- Quality Control Gate Confirmation ---
# These problems should fail for 'incorrect_operation' to confirm
# that low-quality examples are being correctly discarded.
quality_gate_indices = [0, 6, 17]

# --- General Sanity Checks ---
# A selection of standard problems to ensure no new bugs were introduced.
# Includes problems with algebraic variables (10) and complex expressions (30, 44).
sanity_check_indices = [2, 10, 21, 28, 30, 41, 44]


# --- Combined List for a Comprehensive Test Run ---
# This list contains all unique indices from the groups above, sorted.
all_test_indices = sorted(list(set(
    comma_test_indices +
    annotation_consistency_indices +
    double_negative_indices +
    quality_gate_indices +
    sanity_check_indices
)))

# --- Run the comprehensive test for all selected indices ---
for index in all_test_indices:
    print(f"\n\n============================================================")
    print(f"  ANALYZING PROBLEM INDEX: {index}  ")
    print(f"============================================================\n")
    
    # Print the original solution once per problem
    original_solution = build_solution_mapping(index, gsm8k_train)
    print("--- Original NL Solution ---")
    print(json.dumps(original_solution, indent=4))
    print("\n")
    
    # Test all error types for this problem
    for error_type in test_error_types:
        test_single_injection(
            problem_index=index,
            model_name=test_model,
            error_type=error_type,
            # Pass False to prevent re-printing the original solution
            print_original_solution=False 
        )



  ANALYZING PROBLEM INDEX: 0  

--- Original NL Solution ---
{
    "FA": "#### 72",
    "L1": "Natalia sold 48/2 = [[48/2=24]]24 clips in May.",
    "L2": "Natalia sold 48+24 = [[48+24=72]]72 clips altogether in April and May."
}


------------------------------------------------------------
  Testing Injection for: COMPUTATIONAL_ERROR
------------------------------------------------------------

--- Generated Flawed NL Solution ---
{
    "FA": "#### 82",
    "L1": "Natalia sold 48/2 = [[48/2=24]]24 clips in May.",
    "L2": "Natalia sold 48+24 = [[48+24=72]]72 clips altogether in April and May."
}

--- Generated JSON Label ---
{
    "verdict": "Flawed",
    "error_details": {
        "error_type": "computational_error",
        "erroneous_line_number": "L2",
        "explanation": "There is a computational error. The solution states the result is 82.0, but the correct value is 72.0.",
        "correction": "To correct this, replace the incorrect value 82.0 with the correct value 72.

In [43]:
for index in range(50,100):
    print(f"\n\n============================================================")
    print(f"  ANALYZING PROBLEM INDEX: {index}  ")
    print(f"============================================================\n")
    
    # Print the original solution once per problem
    original_solution = build_solution_mapping(index, gsm8k_train)
    print("--- Original NL Solution ---")
    print(json.dumps(original_solution, indent=4))
    print("\n")
    
    # Test all error types for this problem
    for error_type in test_error_types:
        test_single_injection(
            problem_index=index,
            model_name=test_model,
            error_type=error_type,
            # Pass False to prevent re-printing the original solution
            print_original_solution=False 
        )



  ANALYZING PROBLEM INDEX: 50  

--- Original NL Solution ---
{
    "FA": "#### 5",
    "L1": "He needs to save up $400 because 4 x 100 = [[4*100=400]]400",
    "L2": "He has 8 months to earn this money because 12 - 4 = [[12-4=8]]8",
    "L3": "He needs to earn $50 a month because 400 / 8 = [[400/8=50]]50",
    "L4": "He needs to do 5 tasks a month because 50 / 10 = [[50/10=5]]5"
}


------------------------------------------------------------
  Testing Injection for: COMPUTATIONAL_ERROR
------------------------------------------------------------

--- Generated Flawed NL Solution ---
{
    "FA": "#### +5",
    "L1": "He needs to save up $400 because 4 x 100 = [[4*100=400]]400",
    "L2": "He has 8 months to earn this money because 12 - 4 = [[12-4=8]]8",
    "L3": "He needs to earn $50 a month because 400 / 8 = [[400/8=50]]50",
    "L4": "He needs to do +5 tasks a month because 50 / 10 = [[50/10=5]]+5"
}

--- Generated JSON Label ---
{
    "verdict": "Flawed",
    "error_details": {
