In [2]:
# ---------------------------------------------------------------------- #
#  Imports
# ---------------------------------------------------------------------- #

from datasets import load_dataset
import re
import json
import importlib.util
import copy
import ast
import inspect
from pathlib import Path
from typing import Dict, Any, Tuple, Optional, List
from num2words import num2words

# ---------------------------------------------------------------------- #
#  Global constants & Configuration
# ---------------------------------------------------------------------- #

def find_project_root():
    """Traverse upwards to find the project root, marked by the .git folder."""
    current_path = Path.cwd()
    while current_path != current_path.parent:
        if (current_path / ".git").is_dir():
            return current_path
        current_path = current_path.parent
    raise FileNotFoundError("Could not find project root. Is this a git repository?")


PROJECT_ROOT = find_project_root()

# Define project paths
CORRECT_CODE_DIR = PROJECT_ROOT / 'data' / 'code_gen_outputs_traced'
# The 'FLAWED_CODE_DIR' now serves as the location for both the flawed
# code files and their corresponding metadata files.
FLAWED_CODE_DIR = PROJECT_ROOT / 'data' / 'code_with_error_traced'

# Confirm the paths
print(f"Project root found: {PROJECT_ROOT}")
print(f"Path to correct code: {CORRECT_CODE_DIR}")
print(f"Path to flawed code & metadata: {FLAWED_CODE_DIR}")


# Load dataset and define model lists
gsm8k_train = load_dataset("gsm8k", "main", split="train")

MODEL_DICT = {
  "anthropic": ["claude-3-5-haiku-20241022"],
  "openai": ["gpt-4.1-mini"],
  "google": ["gemini-2.0-flash-thinking-exp",
             "gemini-2.5-flash-lite-preview-06-17",
             "gemini-2.5-flash"]
}

MODELS = [f"{provider}_{model}" for provider, sublist in MODEL_DICT.items() for model in sublist]


# ==============================================================================
# Utility Functions
# ==============================================================================

def build_solution_mapping(index: int, dataset: "datasets.Dataset") -> Dict[str, str]:
    """
    Extracts the natural language solution for a given problem index,
    cleans it, and structures it into a line-numbered dictionary.
    """
    solution_mapping = {}
    solution_text = dataset[index]["answer"]
    lines = [ln.strip() for ln in solution_text.splitlines() if ln.strip()]

    # Improved regex to handle commas in the final answer
    if lines and re.match(r"^####\s*[\d\.,]+$", lines[-1]):
        solution_mapping["FA"] = lines.pop(-1).strip()

    # Normalize calculator annotation brackets for consistent parsing
    angle = re.compile(r"<<([^>]+)>>")
    lines = [angle.sub(r"[[\1]]", ln) for ln in lines]

    for i, line in enumerate(lines, 1):
        solution_mapping[f"L{i}"] = line

    return solution_mapping

def execution_trace(func) -> Dict[str, Any]:
    """Simulates execution of a function and returns a variable-to-value map."""
    src = inspect.getsource(func)
    tree = ast.parse(src)
    func_def = tree.body[0]
    env = {}
    
    # Get default args
    arg_names = [arg.arg for arg in func_def.args.args]
    defaults = func_def.args.defaults
    for name, val_node in zip(arg_names[-len(defaults):], defaults):
        env[name] = eval(compile(ast.Expression(val_node), '', 'eval'))

    # Execute body
    for stmt in func_def.body:
        if isinstance(stmt, ast.Assign):
            code_obj = compile(ast.Module([stmt], []), '', 'exec')
            exec(code_obj, {}, env)
    return env

  from .autonotebook import tqdm as notebook_tqdm


Project root found: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math
Path to correct code: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/code_gen_outputs_traced
Path to flawed code & metadata: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/code_with_error_traced


In [3]:
class NaturalLanguageErrorInjector:
    """
    Generates a flawed natural language (NL) solution and its corresponding
    structured JSON label by injecting a programmatic error from a source
    code function.
    """

    def __init__(self, problem_index: int, model_name: str, error_type: str,
                 correct_code_dir: Path, flawed_code_dir: Path,
                 dataset: "datasets.Dataset"):
        """
        Initializes the injector for a specific error instance.
        """
        self.problem_index = problem_index
        self.model_name = model_name
        self.error_type = error_type
        
        # Paths and data sources are passed in directly
        self.correct_code_dir = correct_code_dir
        self.flawed_code_dir = flawed_code_dir
        self.dataset = dataset

        # Initialize data attributes
        self.f_oracle = None
        self.f_flawed = None
        self.correct_trace = None
        self.flawed_trace = None
        self.metadata = None
        self.original_nl_solution = None
        self.deleted_nl_line_text = ""

        # Map metadata strings to symbols for explanations
        self.op_map = {
            'Mult': '*', 'Add': '+', 'Sub': '-', 'Div': '/'
        }
        # Map AST op classes to symbols for explanations
        self.op_synonyms = {
            'Mult': ['*', 'x'],
            'Add': ['+'],
            'Sub': ['-'],
            'Div': ['/']
        }

        # Add a flag to track successful operator swaps
        self.operator_swap_successful = False

    def _load_module_from_path(self, file_path: Path, module_name: str):
        if not file_path.exists():
            print(f"❌ Error: File not found at {file_path}")
            return None
        spec = importlib.util.spec_from_file_location(module_name, file_path)
        if spec and spec.loader:
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
            return module
        return None

    def _load_data_sources(self) -> bool:
        # 1. Load correct/oracle code and trace
        correct_path = self.correct_code_dir / str(self.problem_index) / f"{self.model_name}.py"
        self.f_oracle = self._load_module_from_path(correct_path, "f_oracle")
        if not self.f_oracle: return False
        self.correct_trace = execution_trace(self.f_oracle.solve)

        # 2. Load flawed code and trace
        base_flawed_dir = self.flawed_code_dir / self.error_type / str(self.problem_index)
        flawed_path = base_flawed_dir / f"{self.model_name}.py"
        self.f_flawed = self._load_module_from_path(flawed_path, "f_flawed")
        if self.f_flawed:
            try:
                self.flawed_trace = execution_trace(self.f_flawed.solve)
            except Exception:
                if self.error_type != 'skipped_step':
                    print(f"⚠️ Warning: Could not trace flawed function for {self.error_type}")
                self.flawed_trace = {}
        else:
            if self.error_type != 'skipped_step': return False
            self.flawed_trace = {}

        # 3. Load injection metadata from the same directory as the flawed code
        metadata_path = base_flawed_dir / f"metadata_{self.problem_index}_{self.error_type}.json"
        if not metadata_path.exists():
            print(f"❌ Error: Metadata file not found at {metadata_path}")
            return False
        with open(metadata_path, 'r') as f:
            full_metadata = json.load(f)
            self.metadata = full_metadata.get(self.model_name)
        if not self.metadata:
            print(f"❌ Error: No metadata found for model '{self.model_name}' in {metadata_path}")
            return False

        # 4. Load original NL solution from the dataset
        self.original_nl_solution = build_solution_mapping(self.problem_index, self.dataset)
        if not self.original_nl_solution:
            print(f"❌ Error: Could not build NL solution for index {self.problem_index}")
            return False
            
        return True

    def _replace_number_in_string(self, text: str, old_num: float, new_num: float) -> str:
        if abs(old_num - new_num) < 1e-9:
            return text
        
        # Improved formatting to avoid trailing decimals
        new_num_str = str(int(new_num)) if new_num.is_integer() else f"{new_num:.2f}".rstrip('0').rstrip('.')
        
        if old_num.is_integer():
            old_num_word = num2words(int(old_num))
            word_pattern = re.compile(r'\b' + re.escape(old_num_word) + r'\b', re.IGNORECASE)
            text = word_pattern.sub(new_num_str, text)

        old_num_str = str(int(old_num)) if old_num.is_integer() else str(old_num)
        numeral_pattern = re.compile(r'\b' + re.escape(old_num_str) + r'\b')
        text = numeral_pattern.sub(new_num_str, text)
        
        return text

    def _modify_nl_line(self, line_text: str) -> str:
        """
        Modifies a single NL line by replacing all numbers (operands and results)
        with their corresponding flawed values from the execution trace.
        """
        # 1. Find all unique numbers in the line text. The regex finds integers and floats.
        try:
            # Use a more robust regex to find standalone numbers.
            # Use a set for automatic deduplication.
            numbers_in_text = set(re.findall(r'\b\d+\.?\d*\b', line_text))
            unique_numbers = sorted([float(n) for n in numbers_in_text], reverse=True)
        except (ValueError, TypeError):
            return line_text # Return original if parsing fails

        if not unique_numbers:
            return line_text

        # 2. Map these numbers to variables from the correct trace.
        num_to_var_map = {}
        # Make a copy of correct_trace values to avoid re-mapping the same value to different vars.
        trace_copy = self.correct_trace.copy()
        
        for num in unique_numbers:
            found_var = None
            for var, val in trace_copy.items():
                if isinstance(val, (int, float)) and abs(val - num) < 1e-6:
                    found_var = var
                    break
            if found_var:
                num_to_var_map[num] = found_var
                # Remove from copy to prevent a value like '2' from mapping to multiple variables.
                del trace_copy[found_var]

        if not num_to_var_map:
            return line_text

        # 3. Iteratively replace each number with its flawed counterpart.
        modified_line = line_text
        for old_num, var_name in num_to_var_map.items():
            if var_name in self.flawed_trace:
                new_flawed_value = float(self.flawed_trace[var_name])
                modified_line = self._replace_number_in_string(modified_line, old_num, new_flawed_value)
        
        return modified_line

    def _generate_flawed_nl_solution(self) -> Dict[str, str]:
        """
        Creates the flawed natural language solution mapping by propagating numerical
        errors and handling specific error type modifications.
        """
        flawed_nl = copy.deepcopy(self.original_nl_solution)
        self.operator_swap_successful = False # Reset flag for each run

        if self.error_type == 'skipped_step':
            # ... (no changes to this block)
            line_to_delete = self.metadata['line_label']
            if line_to_delete in flawed_nl:
                self.deleted_nl_line_text = flawed_nl[line_to_delete]
                del flawed_nl[line_to_delete]
            return flawed_nl

        sorted_keys = sorted(flawed_nl.keys(), key=lambda k: (k[0] != 'L', int(k[1:]) if k.startswith('L') else float('inf')))
        
        for key in sorted_keys:
            line_text = flawed_nl.get(key, "")
            modified_line = self._modify_nl_line(line_text)

            if self.error_type == 'incorrect_operation' and key == self.metadata.get('line_label'):
                original_op_name = self.metadata.get('original_op')
                new_op_name = self.metadata.get('new_op')
                
                symbols_to_replace = self.op_synonyms.get(original_op_name, [])
                new_op_symbol = self.op_map.get(new_op_name, '?')

                is_op_present_in_line = any(op in modified_line for op in symbols_to_replace)

                if is_op_present_in_line and new_op_symbol != '?':
                    for old_symbol in symbols_to_replace:
                        modified_line = modified_line.replace(old_symbol, new_op_symbol)
                    # --- SET FLAG ON SUCCESS ---
                    self.operator_swap_successful = True
            
            flawed_nl[key] = modified_line
        
        return flawed_nl

    def _generate_json_label(self) -> Dict[str, Any]:
        """
        Constructs the final structured JSON label using metadata-driven
        templates based on the error type.
        """
        m = self.metadata
        explanation = "Error: Could not generate explanation."
        correction = "Error: Could not generate correction."

        if self.error_type == 'computational_error':
            explanation = f"There is a computational error. The solution states the result is {m['new_value']}, but the correct value is {m['original_value']}."
            correction = f"To correct this, replace the incorrect value {m['new_value']} with the correct value {m['original_value']}."

        elif self.error_type == 'incorrect_operation':
            # This is the corrected logic that fixes the '?' bug.
            original_op_symbol = self.op_map.get(m.get('original_op'), '?')
            new_op_symbol = self.op_map.get(m.get('new_op'), '?')
            
            explanation = f"The solution incorrectly uses a '{new_op_symbol}' operation where a '{original_op_symbol}' operation was needed."
            correction = f"To correct this, the operator should be changed from '{new_op_symbol}' to '{original_op_symbol}'."

        elif self.error_type == 'skipped_step':
            explanation = f"A necessary calculation step is missing. The solution fails to perform the step that would have defined the variable '{m['deleted_variable']}'."
            correction = f"To correct this, the following step must be inserted back into the solution: '{self.deleted_nl_line_text}'"
        
        elif self.error_type == 'incorrect_operand':
            explanation = f"The calculation uses an incorrect variable. It incorrectly references '{m['new_operand']}' instead of '{m['original_operand']}'."
            correction = f"To correct this, the variable '{m['new_operand']}' should be replaced with the correct variable, '{m['original_operand']}'."

        return {
            "verdict": "Flawed",
            "error_details": {
                "error_type": self.error_type,
                # This ensures the line number from metadata is always used correctly.
                "erroneous_line_number": m['line_label'],
                "explanation": explanation,
                "correction": correction
            }
        }

    def inject_nl_error(self) -> Optional[Tuple[Dict[str, str], Dict[str, Any]]]:
        """
        Orchestrates the end-to-end process of generating a flawed NL solution
        and its corresponding JSON label, with quality control checks.
        """
        if not self._load_data_sources():
            # print("--- Process halted due to data loading failure. ---")
            return None
        
        flawed_nl_solution = self._generate_flawed_nl_solution()
        
        # --- FINAL QUALITY GATE ---
        # For incorrect_operation, if the swap did not actually happen in the NL text
        # (due to code/NL mismatch or lack of annotation), discard this example.
        if self.error_type == 'incorrect_operation' and not self.operator_swap_successful:
            # print(f"Discarding sample for index {self.problem_index}: Operator swap failed.")
            return None

        json_label = self._generate_json_label()
        
        return flawed_nl_solution, json_label

In [4]:
def test_single_injection(
        problem_index: int, 
        model_name: str, 
        error_type: str, 
        verbose: bool = True):
    """
    A simple wrapper to test the NaturalLanguageErrorInjector for a single case.
    Uses the global path and dataset variables defined in Cell 1.
    """
    if verbose:
        print(f"============================================================")
        print(f"  Testing Injection for: {error_type.upper()}")
        print(f"  Problem Index: {problem_index}, Model: {model_name}")
        print(f"============================================================\n")

    injector = NaturalLanguageErrorInjector(
        problem_index=problem_index,
        model_name=model_name,
        error_type=error_type,
        correct_code_dir=CORRECT_CODE_DIR,
        flawed_code_dir=FLAWED_CODE_DIR,
        dataset=gsm8k_train
    )

    result = injector.inject_nl_error()

    if result:
        flawed_solution, final_label = result
        if verbose:
            print("--- Original NL Solution ---")
            print(json.dumps(injector.original_nl_solution, indent=4))
            
            print("\n--- Generated Flawed NL Solution ---")
            print(json.dumps(flawed_solution, indent=4))
            
            print("\n--- Generated JSON Label ---")
            print(json.dumps(final_label, indent=4))
            print("\n✅ Injection Successful.\n")
        return result
    else:
        if verbose:
            print(f"❌ Injection Failed for {error_type}.\n")
        return None

In [5]:
# --- Example Usage ---
# Assuming 'metadata.json' files exist in the specified directories.
# E.g., 'data/code_with_error_traced/computational_error/0/metadata.json'

# test_problem_index = 0
test_model = "anthropic_claude-3-5-haiku-20241022"
test_error_types = ['computational_error', 'incorrect_operation', 'skipped_step']

for test_problem_index in range(10):
    for error_type in test_error_types:
        test_single_injection(
            problem_index=test_problem_index,
            model_name=test_model,
            error_type=error_type,
        )

  Testing Injection for: COMPUTATIONAL_ERROR
  Problem Index: 0, Model: anthropic_claude-3-5-haiku-20241022

--- Original NL Solution ---
{
    "FA": "#### 72",
    "L1": "Natalia sold 48/2 = [[48/2=24]]24 clips in May.",
    "L2": "Natalia sold 48+24 = [[48+24=72]]72 clips altogether in April and May."
}

--- Generated Flawed NL Solution ---
{
    "FA": "#### 82",
    "L1": "Natalia sold 48/2 = [[48/2=24]]24 clips in May.",
    "L2": "Natalia sold 48+24 = [[48+24=82]]82 clips altogether in April and May."
}

--- Generated JSON Label ---
{
    "verdict": "Flawed",
    "error_details": {
        "error_type": "computational_error",
        "erroneous_line_number": "L2",
        "explanation": "There is a computational error. The solution states the result is 82.0, but the correct value is 72.0.",
        "correction": "To correct this, replace the incorrect value 82.0 with the correct value 72.0."
    }
}

✅ Injection Successful.

  Testing Injection for: INCORRECT_OPERATION
  Problem I

In [6]:
for test_problem_index in range(11, 21):
    for error_type in test_error_types:
        test_single_injection(
            problem_index=test_problem_index,
            model_name=test_model,
            error_type=error_type,
        )

  Testing Injection for: COMPUTATIONAL_ERROR
  Problem Index: 11, Model: anthropic_claude-3-5-haiku-20241022

--- Original NL Solution ---
{
    "FA": "#### 5",
    "L1": "He saved up $110 total because 95 + 15 = [[95+15=110]]110",
    "L2": "He saved $15 from his allowance because 3 x 5 = [[3*5=15]]15",
    "L3": "He earned $60 mowing lawns because 4 x 15 = [[4*15=60]]60",
    "L4": "He earned $35 shoveling driveways because 110 - 60 - 15 = [[110-60-15=35]]35",
    "L5": "He shoveled 5 driveways because 35 / 7 = [[35/7=5]]5"
}

--- Generated Flawed NL Solution ---
{
    "FA": "#### 5",
    "L1": "He saved up $110 total because 95 + 15 = [[95+15=110]]110",
    "L2": "He saved $15 from his allowance because 3 x 5 = [[3*5=15]]15",
    "L3": "He earned $60 mowing lawns because 4 x 15 = [[4*15=60]]60",
    "L4": "He earned $25 shoveling driveways because 110 - 60 - 15 = [[110-60-15=25]]25",
    "L5": "He shoveled 5 driveways because 25 / 7 = [[25/7=5]]5"
}

--- Generated JSON Label ---
{
 

In [7]:
def test_single_injection(problem_index: int, model_name: str, error_type: str,
                          print_original_solution: bool = True, verbose: bool = True):
    """
    A simple wrapper to test the NaturalLanguageErrorInjector for a single case.
    Uses the global path and dataset variables.

    Args:
        problem_index: The GSM8K index to test.
        model_name: The model whose output will be used.
        error_type: The error type to inject.
        print_original_solution: If True, prints the original NL solution.
        verbose: If True, prints detailed step-by-step outputs.
    """
    if verbose:
        print(f"------------------------------------------------------------")
        print(f"  Testing Injection for: {error_type.upper()}")
        print(f"------------------------------------------------------------\n")

    injector = NaturalLanguageErrorInjector(
        problem_index=problem_index,
        model_name=model_name,
        error_type=error_type,
        correct_code_dir=CORRECT_CODE_DIR,
        flawed_code_dir=FLAWED_CODE_DIR,
        dataset=gsm8k_train
    )

    result = injector.inject_nl_error()

    if result:
        flawed_solution, final_label = result
        if verbose:
            if print_original_solution:
                print("--- Original NL Solution ---")
                print(json.dumps(injector.original_nl_solution, indent=4))
            
            print("--- Generated Flawed NL Solution ---")
            print(json.dumps(flawed_solution, indent=4))
            
            print("\n--- Generated JSON Label ---")
            print(json.dumps(final_label, indent=4))
            print("\n✅ Injection Successful.\n")
        return result
    else:
        if verbose:
            print(f"❌ Injection Failed for {error_type}.\n")
        return None

# ==============================================================================
# New, more concise testing loop for multiple problems
# ==============================================================================

# --- Configuration ---
# You can change this list to test any range of problems
indices_to_test = range(50) 
test_model = "anthropic_claude-3-5-haiku-20241022"
test_error_types = ['computational_error', 'incorrect_operation', 'skipped_step']

# --- Main Loop ---
for index in indices_to_test:
    print(f"\n\n============================================================")
    print(f"  ANALYZING PROBLEM INDEX: {index}  ")
    print(f"============================================================\n")
    
    # Print the original solution once per problem
    original_solution = build_solution_mapping(index, gsm8k_train)
    print("--- Original NL Solution ---")
    print(json.dumps(original_solution, indent=4))
    print("\n")
    
    # Test all error types for this problem
    for error_type in test_error_types:
        test_single_injection(
            problem_index=index,
            model_name=test_model,
            error_type=error_type,
            # Pass False to prevent re-printing the original solution
            print_original_solution=False 
        )



  ANALYZING PROBLEM INDEX: 0  

--- Original NL Solution ---
{
    "FA": "#### 72",
    "L1": "Natalia sold 48/2 = [[48/2=24]]24 clips in May.",
    "L2": "Natalia sold 48+24 = [[48+24=72]]72 clips altogether in April and May."
}


------------------------------------------------------------
  Testing Injection for: COMPUTATIONAL_ERROR
------------------------------------------------------------

--- Generated Flawed NL Solution ---
{
    "FA": "#### 82",
    "L1": "Natalia sold 48/2 = [[48/2=24]]24 clips in May.",
    "L2": "Natalia sold 48+24 = [[48+24=82]]82 clips altogether in April and May."
}

--- Generated JSON Label ---
{
    "verdict": "Flawed",
    "error_details": {
        "error_type": "computational_error",
        "erroneous_line_number": "L2",
        "explanation": "There is a computational error. The solution states the result is 82.0, but the correct value is 72.0.",
        "correction": "To correct this, replace the incorrect value 82.0 with the correct value 72.

In [8]:
import re

# Pattern: one or more digits, slash, one or more digits (not part of a larger number)
fraction_pattern = re.compile(r'\b\d+/\d+\b')

indices_with_fraction = []
for idx, sample in enumerate(gsm8k_train):
    answer = sample.get("answer", "")
    if fraction_pattern.search(answer):
        indices_with_fraction.append(idx)

print("Indices with num/num in answer:", indices_with_fraction)

Indices with num/num in answer: [0, 1, 2, 3, 5, 10, 11, 12, 13, 14, 16, 19, 23, 25, 27, 29, 33, 35, 36, 37, 38, 40, 42, 47, 48, 49, 50, 52, 54, 55, 57, 58, 61, 62, 64, 66, 67, 68, 70, 72, 73, 74, 80, 81, 83, 89, 90, 91, 92, 94, 99, 100, 106, 107, 109, 112, 115, 117, 121, 122, 123, 124, 127, 128, 129, 130, 133, 136, 137, 140, 141, 144, 145, 148, 149, 152, 154, 155, 156, 157, 160, 161, 162, 164, 165, 166, 173, 175, 176, 180, 181, 182, 187, 189, 192, 197, 198, 199, 201, 202, 203, 204, 205, 212, 219, 220, 222, 224, 226, 227, 229, 230, 231, 234, 236, 237, 238, 239, 241, 244, 246, 247, 249, 252, 253, 254, 256, 258, 259, 261, 262, 266, 267, 268, 269, 271, 273, 274, 276, 277, 281, 285, 290, 295, 296, 297, 299, 301, 302, 303, 304, 308, 312, 313, 315, 316, 319, 321, 323, 326, 327, 328, 329, 331, 332, 334, 336, 338, 340, 341, 343, 345, 346, 352, 358, 360, 361, 362, 370, 371, 372, 374, 380, 381, 382, 383, 385, 387, 389, 390, 395, 399, 400, 403, 404, 405, 406, 407, 410, 414, 422, 424, 425, 429, 432

In [9]:
len(indices_with_fraction)

3655

In [10]:
indices_with_slash = []
for idx, sample in enumerate(gsm8k_train):
    answer = sample.get("answer", "")
    if "/" in answer:
        indices_with_slash.append(idx)

print(f"Total indices with '/' in answer: {len(indices_with_slash)}")
print("Indices with '/' in answer:", indices_with_slash)

Total indices with '/' in answer: 4146
Indices with '/' in answer: [0, 1, 2, 3, 5, 9, 10, 11, 12, 13, 14, 16, 19, 21, 23, 25, 26, 27, 29, 31, 33, 35, 36, 37, 38, 39, 40, 42, 46, 47, 48, 49, 50, 52, 54, 55, 57, 58, 61, 62, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 76, 80, 81, 83, 84, 86, 89, 90, 91, 92, 94, 95, 99, 100, 106, 107, 109, 112, 115, 117, 118, 119, 121, 122, 123, 124, 127, 128, 129, 130, 133, 136, 137, 138, 140, 141, 143, 144, 145, 148, 149, 152, 154, 155, 156, 157, 160, 161, 162, 164, 165, 166, 173, 175, 176, 180, 181, 182, 187, 189, 192, 193, 194, 197, 198, 199, 201, 202, 203, 204, 205, 211, 212, 219, 220, 222, 224, 226, 227, 229, 230, 231, 233, 234, 236, 237, 238, 239, 240, 241, 244, 246, 247, 249, 252, 253, 254, 256, 258, 259, 261, 262, 264, 266, 267, 268, 269, 271, 273, 274, 276, 277, 281, 285, 290, 295, 296, 297, 299, 301, 302, 303, 304, 306, 308, 310, 312, 313, 315, 316, 319, 321, 323, 326, 327, 328, 329, 331, 332, 334, 336, 338, 340, 341, 343, 344, 345, 346, 352, 356, 3

In [11]:
for index in indices_with_fraction[:20]:
    print(f"Index {index}: {gsm8k_train[index]['answer']}")
    print()

Index 0: Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72

Index 1: Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.
Working 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.
#### 10

Index 2: In the beginning, Betty has only 100 / 2 = $<<100/2=50>>50.
Betty's grandparents gave her 15 * 2 = $<<15*2=30>>30.
This means, Betty needs 100 - 50 - 30 - 15 = $<<100-50-30-15=5>>5 more.
#### 5

Index 3: Maila read 12 x 2 = <<12*2=24>>24 pages today.
So she was able to read a total of 12 + 24 = <<12+24=36>>36 pages since yesterday.
There are 120 - 36 = <<120-36=84>>84 pages left to be read.
Since she wants to read half of the remaining pages tomorrow, then she should read 84/2 = <<84/2=42>>42 pages.
#### 42

Index 5: There are 80/100 * 10 = <<80/100*10=8>>8 more purple flowers than yellow flowers.
So in Mark's garden, there are 10 + 8 = <<10+8=18>>18 purple flowers.
Purple and yellow flowers sum up to 10 + 18 = <<10+1

In [12]:
import re

# Pattern matches:
# - num * num/num (e.g., 3 * 4/5)
# - num*(num/num) (e.g., 3*(4/5))
pattern = re.compile(
    r'\b\d+\s*\*\s*\d+/\d+\b'         # num * num/num
    r'|\b\d+\s*\*\s*\(\s*\d+/\d+\s*\)',  # num * (num/num)
    re.IGNORECASE
)

indices_with_mult_fraction = []
for idx, sample in enumerate(gsm8k_train):
    answer = sample.get("answer", "")
    if pattern.search(answer):
        indices_with_mult_fraction.append(idx)

print("Indices with num * num/num or num*(num/num) in answer:", indices_with_mult_fraction)

Indices with num * num/num or num*(num/num) in answer: [33, 35, 36, 40, 58, 66, 67, 72, 89, 133, 148, 152, 181, 187, 197, 198, 226, 316, 319, 370, 399, 424, 474, 480, 538, 559, 571, 581, 606, 627, 658, 661, 667, 704, 733, 751, 806, 811, 817, 847, 858, 927, 939, 950, 971, 1025, 1031, 1075, 1104, 1132, 1140, 1159, 1173, 1213, 1215, 1236, 1245, 1271, 1272, 1325, 1328, 1385, 1405, 1450, 1455, 1493, 1498, 1500, 1523, 1530, 1536, 1544, 1561, 1611, 1623, 1659, 1663, 1700, 1712, 1717, 1726, 1727, 1757, 1759, 1786, 1803, 1824, 1867, 1906, 1917, 1942, 1953, 1958, 1970, 1986, 1992, 1997, 2010, 2036, 2087, 2095, 2127, 2149, 2158, 2174, 2205, 2215, 2217, 2220, 2234, 2255, 2276, 2278, 2280, 2282, 2285, 2319, 2342, 2351, 2360, 2389, 2402, 2436, 2458, 2465, 2472, 2518, 2541, 2560, 2658, 2660, 2684, 2711, 2722, 2775, 2827, 2852, 2859, 2861, 2867, 2875, 2907, 2936, 2956, 2965, 2986, 3021, 3036, 3045, 3047, 3115, 3140, 3142, 3159, 3163, 3164, 3174, 3182, 3194, 3203, 3232, 3279, 3304, 3337, 3341, 3375, 33

In [13]:
for index in indices_with_mult_fraction[:20]:
    print(f"Index {index}: {gsm8k_train[index]['answer']}")
    print()

Index 33: To make the pizza, Kimber half as many teaspoons of salt as the number of cups of flour, meaning she needs 1/2*16 = <<16*1/2=8>>8 teaspoons of salt.
The total number of cups of flour and teaspoons of salt she needs is 8+16 = <<8+16=24>>24
She also needs 10 cups of water, which means the total number of cups of water and flour and teaspoons of salt she needs is 24+10 = <<24+10=34>>34
#### 34

Index 35: Since 40% of his students got below B, 100% - 40% = 60% of Mr. Sanchez's students got B and above.
Thus, 60 x 60/100 = <<60*60/100=36>>36 students got B and above in their final grade.
#### 36

Index 36: Lisa earned $60 * 1/2 = $<<60*1/2=30>>30.
Tommy earned $30 * 1/2 = $<<30*1/2=15>>15.
Lisa earned $30 - $15 = $<<30-15=15>>15 more than Tommy.
#### 15

Index 40: Mr. Benson had a 5% discount for each of the 12 - 10 = <<12-10=2>>2 tickets.
So, those two tickets had a $40 x 5/100 = $<<40*5/100=2>>2 discount each.
Hence, each ticket cost $40 - $2 = $<<40-2=38>>38 each.
Thus, two dis

In [14]:
import re

# Pattern matches:
# - (num/num)*num (e.g., (3/4)*5)
# - num/num * num (e.g., 3/4 * 5)
pattern = re.compile(
    r'\(\s*\d+/\d+\s*\)\s*\*\s*\d+'   # (num/num)*num
    r'|\b\d+/\d+\s*\*\s*\d+\b',       # num/num * num
    re.IGNORECASE
)

indices_with_fraction_mult = []
for idx, sample in enumerate(gsm8k_train):
    answer = sample.get("answer", "")
    if pattern.search(answer):
        indices_with_fraction_mult.append(idx)

print("Indices with (num/num)*num or num/num * num in answer:", indices_with_fraction_mult)

Indices with (num/num)*num or num/num * num in answer: [5, 27, 33, 55, 61, 68, 72, 83, 92, 107, 112, 115, 122, 130, 154, 155, 156, 162, 202, 246, 259, 271, 273, 276, 301, 304, 319, 321, 332, 361, 374, 400, 403, 414, 425, 429, 434, 441, 459, 468, 483, 488, 533, 537, 549, 561, 562, 571, 610, 646, 669, 671, 687, 698, 720, 750, 756, 764, 776, 785, 790, 791, 811, 824, 841, 849, 850, 902, 909, 914, 921, 922, 927, 931, 957, 961, 967, 969, 973, 1003, 1035, 1039, 1055, 1073, 1077, 1081, 1083, 1097, 1105, 1121, 1125, 1130, 1173, 1178, 1198, 1217, 1238, 1247, 1274, 1316, 1321, 1324, 1356, 1395, 1398, 1407, 1413, 1446, 1512, 1516, 1521, 1525, 1529, 1532, 1550, 1560, 1579, 1587, 1592, 1601, 1605, 1609, 1659, 1662, 1671, 1689, 1692, 1702, 1754, 1783, 1786, 1788, 1795, 1805, 1819, 1826, 1846, 1861, 1901, 1902, 1903, 1911, 1924, 1963, 1971, 1974, 1987, 1992, 1995, 1997, 2028, 2039, 2070, 2107, 2109, 2130, 2131, 2147, 2151, 2168, 2175, 2178, 2199, 2214, 2220, 2225, 2248, 2257, 2269, 2288, 2292, 2311, 2

In [15]:
for index in indices_with_fraction_mult[:20]:
    print(f"Index {index}: {gsm8k_train[index]['answer']}")
    print()

Index 5: There are 80/100 * 10 = <<80/100*10=8>>8 more purple flowers than yellow flowers.
So in Mark's garden, there are 10 + 8 = <<10+8=18>>18 purple flowers.
Purple and yellow flowers sum up to 10 + 18 = <<10+18=28>>28 flowers.
That means in Mark's garden there are 25/100 * 28 = <<25/100*28=7>>7 green flowers.
So in total Mark has 28 + 7 = <<28+7=35>>35 plants in his garden.
#### 35

Index 27: The number of non-valuable files Brennan downloaded in the first round is 70/100*800 = <<70/100*800=560>>560 files.
The number of valuable files Brennan downloaded in the first round is 800-560 = <<800-560=240>>240
When he downloaded 400 new files, there were 3/5*400= <<3/5*400=240>>240 non-useful files, which he deleted again.
The total number of valuable files he downloaded in the second round is 400-240 = <<400-240=160>>160
To write his research, Brennan had 160+240 = <<160+240=400>>400 useful files to reference to write his research.
#### 400

Index 33: To make the pizza, Kimber half as ma

In [16]:
# Assume the following lists are already defined:
# indices_with_fraction
# indices_with_mult_fraction
# indices_with_fraction_mult

set_slash = set(indices_with_slash)
set_fraction = set(indices_with_fraction)
set_mult_fraction = set(indices_with_mult_fraction)
set_fraction_mult = set(indices_with_fraction_mult)

# 0. indices with slash - indices with fraction
diff0 = sorted(set_slash - set_fraction)

# 1. indices with fraction - indices with mult fraction
diff1 = sorted(set_fraction - set_mult_fraction)

# 2. indices with fraction - indices with fraction mult
diff2 = sorted(set_fraction - set_fraction_mult)

# 3. Union of indices with mult fraction and indices with fraction mult
union3 = sorted(set_mult_fraction | set_fraction_mult)

# 4. indices with fraction minus the set in point 3
diff4 = sorted(set_fraction - (set_mult_fraction | set_fraction_mult))

print(f"There are {len(diff0)} samples in indices_with_slash - indices_with_fraction")
print(f"There are {len(diff1)} samples in indices_with_fraction - indices_with_mult_fraction")
print(f"There are {len(diff2)} samples in indices_with_fraction - indices_with_fraction_mult")
print(f"There are {len(union3)} samples in the union of indices_with_mult_fraction and indices_with_fraction_mult")
print(f"There are {len(diff4)} samples in indices_with_fraction minus the set in point 3")

There are 491 samples in indices_with_slash - indices_with_fraction
There are 3274 samples in indices_with_fraction - indices_with_mult_fraction
There are 3093 samples in indices_with_fraction - indices_with_fraction_mult
There are 887 samples in the union of indices_with_mult_fraction and indices_with_fraction_mult
There are 2768 samples in indices_with_fraction minus the set in point 3


In [17]:
len(gsm8k_train) - len(indices_with_fraction)

3818

In [18]:
len(gsm8k_train)

7473

In [19]:
indices_without_fraction = sorted(set(range(len(gsm8k_train))) - set(indices_with_fraction))

print(len(indices_without_fraction), "indices without fraction found.")
print(indices_without_fraction)

3818 indices without fraction found.
[4, 6, 7, 8, 9, 15, 17, 18, 20, 21, 22, 24, 26, 28, 30, 31, 32, 34, 39, 41, 43, 44, 45, 46, 51, 53, 56, 59, 60, 63, 65, 69, 71, 75, 76, 77, 78, 79, 82, 84, 85, 86, 87, 88, 93, 95, 96, 97, 98, 101, 102, 103, 104, 105, 108, 110, 111, 113, 114, 116, 118, 119, 120, 125, 126, 131, 132, 134, 135, 138, 139, 142, 143, 146, 147, 150, 151, 153, 158, 159, 163, 167, 168, 169, 170, 171, 172, 174, 177, 178, 179, 183, 184, 185, 186, 188, 190, 191, 193, 194, 195, 196, 200, 206, 207, 208, 209, 210, 211, 213, 214, 215, 216, 217, 218, 221, 223, 225, 228, 232, 233, 235, 240, 242, 243, 245, 248, 250, 251, 255, 257, 260, 263, 264, 265, 270, 272, 275, 278, 279, 280, 282, 283, 284, 286, 287, 288, 289, 291, 292, 293, 294, 298, 300, 305, 306, 307, 309, 310, 311, 314, 317, 318, 320, 322, 324, 325, 330, 333, 335, 337, 339, 342, 344, 347, 348, 349, 350, 351, 353, 354, 355, 356, 357, 359, 363, 364, 365, 366, 367, 368, 369, 373, 375, 376, 377, 378, 379, 384, 386, 388, 391, 392, 3

In [20]:
for index in diff0[:20]:
    print(f"Index {index}: {gsm8k_train[index]['answer']}")
    print()

Index 9: She works 8 hours a day for $18 per hour so she makes 8*18 = $<<8*18=144.00>>144.00 per 8-hour shift
She works 10 hours a day and anything over 8 hours is eligible for overtime, so she gets 10-8 = <<10-8=2>>2 hours of overtime
Overtime is calculated as time and a half so and she makes $18/hour so her overtime pay is 18*.5 = $<<18*.5=9.00>>9.00
Her overtime pay is 18+9 = $<<18+9=27.00>>27.00
Her base pay is $144.00 per 8-hour shift and she works 5 days and makes 5 * $144 = $<<144*5=720.00>>720.00
Her overtime pay is $27.00 per hour and she works 2 hours of overtime per day and makes 27*2 = $<<27*2=54.00>>54.00 in overtime pay
2 hours of overtime pay for 5 days means she makes 54*5 = $270.00
In 5 days her base pay is $720.00 and she makes $270.00 in overtime pay so she makes $720 + $270 = $<<720+270=990.00>>990.00
#### 990

Index 21: First find the total number of snakes eaten: 5 snakes/jaguar * 6 jaguars = <<5*6=30>>30 snakes
Then find the total number of birds eaten per day: 3

In [21]:
import re

# Matches -5, -3.2, -0.75, etc. (hyphen, optional space, digits, optional decimal)
pattern = re.compile(r'-\s*\d+(\.\d+)?')

indices_with_negative_number = []
for idx, sample in enumerate(gsm8k_train):
    question = sample.get("question", "")
    if pattern.search(question):
        indices_with_negative_number.append(idx)

print("Indices with negative number in question:", indices_with_negative_number)

Indices with negative number in question: [107, 492, 1102, 1402, 1731, 2182, 2406, 2717, 3110, 3320, 3390, 5127, 5715, 5844, 6035, 6189, 6543, 7324]


In [22]:
for index in indices_with_negative_number:
    print(f"Index {index}: {gsm8k_train[index]['question']}")
    print()

Index 107: There is very little car traffic on Happy Street. During the week, most cars pass it on Tuesday - 25. On Monday, 20% less than on Tuesday, and on Wednesday, 2 more cars than on Monday. On Thursday and Friday, it is about 10 cars each day. On the weekend, traffic drops to 5 cars per day. How many cars travel down Happy Street from Monday through Sunday?

Index 492: There are 40 Judges in the state of Rhode Island.  10 percent of Judges are under 30 years old.  60 percent of Judges are 30-50 years old.  The rest of the Judges are over 50 years old.  How many Judges are over 50 years old?

Index 1102: To pass the time while she is waiting somewhere Carla likes to count things around her. While she is waiting for school to start on Monday she counts the tiles on the ceiling--38. While she is waiting for everyone to finish their tests after she has handed in hers, she counts the books in the room--75. On Tuesday Carla counts all the tiles twice in a row and she counts the books t

In [23]:
import re

calc_annotation_pattern = re.compile(r'<<.*?>>')

indices_without_calc_annotation = []
for idx, sample in enumerate(gsm8k_train):
    answer = sample.get("answer", "")
    if not calc_annotation_pattern.search(answer):
        indices_without_calc_annotation.append(idx)

print(f"There are {len(indices_without_calc_annotation)} Indices without calculator annotation in answer:", indices_without_calc_annotation)

There are 95 Indices without calculator annotation in answer: [29, 109, 135, 150, 193, 302, 339, 375, 393, 473, 492, 618, 675, 691, 744, 802, 807, 894, 1017, 1234, 1420, 1552, 1628, 1680, 1707, 1786, 1941, 1990, 2059, 2076, 2133, 2241, 2287, 2394, 2429, 2452, 2456, 2472, 2517, 2582, 2822, 2899, 2921, 3055, 3089, 3148, 3157, 3210, 3332, 3339, 3346, 3361, 3383, 3392, 3423, 3484, 3554, 3558, 3562, 3712, 3746, 3821, 3842, 4102, 4121, 4159, 4349, 4358, 4881, 4920, 4984, 4994, 5352, 5434, 5629, 5815, 5886, 5974, 6152, 6268, 6281, 6362, 6514, 6539, 6638, 6718, 6726, 6899, 6958, 7115, 7222, 7274, 7293, 7340, 7380]


In [24]:
for index in indices_without_calc_annotation:
    print(f"Index {index}: {gsm8k_train[index]['answer']}")
    print()

Index 29: Let her previous monthly income be p
The cost of her rent and utilities was 40% of p which is (40/100)*p = 2p/5
Her income was increased by $600 so it is now p+$600
The cost of her rent and utilities now amount to 25% of (p+$600) which is (25/100)*(p+$600) = (p+$600)/4
Equating both expressions for cost of rent and utilities: 2p/5 = (p+$600)/4
Multiplying both sides of the equation by 20 gives 8p = 5p+$3000
Subtracting 5p from both sides gives: 3p = $3000
Dividing both sides by 3 gives p = $1000
#### 1000

Index 109: To figure out this problem we need to turn it into an equation. Let’s make A the number of nuggets Alyssa ate. We know all three girls ate 100 nuggets total, so 100 nuggets = A (how many Alyssa ate) + 2A (how many Keely ate) + 3A (how many Kendall ate) or 100 = A + 2A + 3A which is 100 = 5A
Then we will divide each side by 5 to figure out how many nuggets Alyssa ate, 100/5 = 5A/5 or 20 = A.
#### 20

Index 135: Let the number of square feet in Benedict's house be 

In [25]:
import re

pattern = re.compile(r'^Let [a-zA-Z] ', re.MULTILINE)

indices_with_let_var = []
for idx, sample in enumerate(gsm8k_train):
    answer = sample.get("answer", "")
    if pattern.search(answer):
        indices_with_let_var.append(idx)

print(f"There are {len(indices_with_let_var)} Indices with a line starting with 'Let @ ' in answer:", indices_with_let_var)

There are 212 Indices with a line starting with 'Let @ ' in answer: [8, 10, 150, 253, 289, 349, 373, 375, 397, 422, 464, 473, 559, 565, 581, 666, 696, 718, 744, 752, 792, 807, 890, 1007, 1051, 1087, 1106, 1137, 1142, 1183, 1259, 1310, 1337, 1358, 1367, 1368, 1369, 1383, 1420, 1432, 1433, 1537, 1776, 1858, 1899, 1969, 1999, 2009, 2026, 2059, 2089, 2106, 2116, 2133, 2222, 2244, 2287, 2328, 2394, 2439, 2452, 2453, 2456, 2492, 2504, 2603, 2638, 2659, 2686, 2687, 2899, 2900, 2921, 2972, 2979, 3030, 3098, 3148, 3151, 3157, 3162, 3176, 3210, 3283, 3331, 3332, 3339, 3346, 3371, 3392, 3400, 3419, 3534, 3554, 3562, 3583, 3699, 3746, 3785, 3806, 3821, 3826, 3842, 3886, 3915, 3922, 3944, 3963, 3971, 4094, 4099, 4117, 4126, 4159, 4177, 4218, 4332, 4375, 4448, 4564, 4596, 4644, 4653, 4654, 4683, 4684, 4688, 4713, 4775, 4814, 4881, 4920, 4940, 4952, 4984, 4994, 5002, 5099, 5107, 5208, 5315, 5319, 5352, 5374, 5400, 5406, 5421, 5422, 5431, 5434, 5457, 5484, 5494, 5563, 5666, 5678, 5757, 5810, 5844, 585

In [26]:
for index in indices_with_let_var:
    print(f"Index {index}: {gsm8k_train[index]['answer']}")
    print()

Index 8: Let S be the amount Alexis paid for the shoes.
She spent S + 30 + 46 + 38 + 11 + 18 = S + <<+30+46+38+11+18=143>>143.
She used all but $16 of her budget, so S + 143 = 200 - 16 = 184.
Thus, Alexis paid S = 184 - 143 = $<<184-143=41>>41 for the shoes.
#### 41

Index 10: Let S be the number of people on the first hundred years’ ship.
The second hundred years’ ship had twice as many as the first, so it had 2S people.
The third hundred years’ ship had twice as many as the second, so it had 2 * 2S = <<2*2=4>>4S people.
All the ships had S + 2S + 4S = 7S = 847 people.
Thus, the ship that the monster ate in the first hundred years had S = 847 / 7 = <<847/7=121>>121 people on it.
#### 121

Index 150: Let p be the number of packages Angela delivers and m be the number of meals. We know that p + m = 27 and p = 8m.
Substituting the second equation into the first equation, we get 8m + m = 27
Combining like terms, we get 9m = 27
Dividing both sides by 9, we get m = 3
#### 3

Index 253: Let 

In [27]:
import re

pattern = re.compile(r'divide|divided', re.IGNORECASE)

indices_divide_no_slash = []
for idx, sample in enumerate(gsm8k_train):
    answer = sample.get("answer", "")
    if "/" not in answer and pattern.search(answer):
        indices_divide_no_slash.append(idx)

print(f"There are {len(indices_divide_no_slash)} Indices where answer does NOT contain '/' but contains 'divide' or 'divided':", indices_divide_no_slash)

There are 7 Indices where answer does NOT contain '/' but contains 'divide' or 'divided': [455, 744, 2287, 3762, 4131, 6305, 6493]


In [28]:
for index in indices_divide_no_slash:
    print(f"Index {index}: {gsm8k_train[index]['answer']}")
    print()

Index 455: The amoeba will divide into 1 * 2 = <<1*2=2>>2 amoebae after 2 days.
The amoeba will divide into 2 * 2 = <<2*2=4>>4 amoebae after 4 days.
The amoeba will divide into 4 * 2 = <<4*2=8>>8 amoebae after 6 days.
The amoeba will divide into 8 * 2 = <<8*2=16>>16 amoebae after 8 days.
#### 8

Index 744: Let x be the number of minutes it takes Yolanda's husband to catch her.
We know that Yolanda will spend a total of x + 15 minutes riding her bike since she left 15 minutes before her husband.
The distance each person travels is equal to their travel speed times the number of minutes they spend traveling. That means Yolanda's distance is equal to 20 mph * (x + 15) and her husband's distance is equal to 40 mph * x
Yolanda's husband catches up to her when they've both traveled the same distance, which is when 20(x + 15) = 40x
We can simplify this equation by multiplying 20 through the parentheses to get 20x + 300 = 40x
Then we can subtract 20x from each side to get 300 = 20x
Finally, we

In [29]:
import re

calc_annotation_pattern = re.compile(r'<<.*?>>')
indices_line_with_eq_no_calc = []

for idx, sample in enumerate(gsm8k_train):
    answer = sample.get("answer", "")
    for line in answer.splitlines():
        if "=" in line and not calc_annotation_pattern.search(line):
            indices_line_with_eq_no_calc.append(idx)
            break  # Only need one such line per answer

print(f"There are {len(indices_line_with_eq_no_calc)} indices where at least one line contains '=' and no calculator annotation:", indices_line_with_eq_no_calc)

There are 1322 indices where at least one line contains '=' and no calculator annotation: [8, 9, 10, 23, 25, 29, 32, 35, 39, 44, 48, 53, 71, 72, 85, 88, 103, 109, 123, 135, 150, 151, 154, 167, 168, 184, 190, 193, 197, 198, 201, 216, 217, 227, 228, 232, 237, 242, 243, 247, 253, 256, 258, 261, 264, 275, 289, 293, 298, 302, 303, 304, 306, 315, 318, 323, 325, 328, 339, 349, 359, 360, 363, 365, 370, 373, 375, 382, 389, 393, 397, 400, 414, 415, 422, 425, 426, 428, 442, 447, 449, 459, 460, 464, 467, 473, 480, 488, 492, 499, 502, 504, 515, 517, 518, 529, 537, 546, 548, 552, 553, 554, 559, 565, 566, 567, 581, 593, 601, 609, 618, 623, 630, 636, 638, 654, 657, 666, 672, 675, 691, 696, 700, 716, 718, 719, 722, 724, 732, 734, 735, 744, 752, 753, 754, 757, 760, 764, 772, 778, 792, 795, 801, 802, 807, 808, 814, 816, 833, 837, 866, 867, 868, 869, 874, 890, 894, 904, 913, 925, 927, 940, 953, 959, 964, 966, 971, 978, 981, 983, 991, 993, 1002, 1007, 1008, 1017, 1028, 1034, 1041, 1049, 1051, 1052, 1057, 1

In [30]:
for index in indices_line_with_eq_no_calc[:30]:
    print(f"Index {index}: {gsm8k_train[index]['answer']}")
    print()

Index 8: Let S be the amount Alexis paid for the shoes.
She spent S + 30 + 46 + 38 + 11 + 18 = S + <<+30+46+38+11+18=143>>143.
She used all but $16 of her budget, so S + 143 = 200 - 16 = 184.
Thus, Alexis paid S = 184 - 143 = $<<184-143=41>>41 for the shoes.
#### 41

Index 9: She works 8 hours a day for $18 per hour so she makes 8*18 = $<<8*18=144.00>>144.00 per 8-hour shift
She works 10 hours a day and anything over 8 hours is eligible for overtime, so she gets 10-8 = <<10-8=2>>2 hours of overtime
Overtime is calculated as time and a half so and she makes $18/hour so her overtime pay is 18*.5 = $<<18*.5=9.00>>9.00
Her overtime pay is 18+9 = $<<18+9=27.00>>27.00
Her base pay is $144.00 per 8-hour shift and she works 5 days and makes 5 * $144 = $<<144*5=720.00>>720.00
Her overtime pay is $27.00 per hour and she works 2 hours of overtime per day and makes 27*2 = $<<27*2=54.00>>54.00 in overtime pay
2 hours of overtime pay for 5 days means she makes 54*5 = $270.00
In 5 days her base pay i

In [36]:
for index in [310, 3822, 7371]:
    print(f"Index: {index}")
    print("Question:")
    print(f"{gsm8k_train[index]['question']}")
    print("Solution mapping:")
    print(build_solution_mapping(index, gsm8k_train))
    print()

Index: 310
Question:
Janet hires six employees. Four of them are warehouse workers who make $15/hour, and the other two are managers who make $20/hour. Janet has to pay 10% of her workers' salaries in FICA taxes. If everyone works 25 days a month and 8 hours a day, how much does Janet owe total for their wages and taxes for one month?
Solution mapping:
{'FA': '#### 22000', 'L1': 'First figure out how many hours each worker works per month by multiplying the number of days they work by the number of hours a day they work: 25 days * 8 hours/day = [[25*8=200]]200 hours', 'L2': 'Then calculate how much one warehouse worker makes per month by multiplying their hourly rate by the number of hours they work: 200 hours * $15/hour = $[[200*15=3000]]3000', 'L3': 'Then multiply that number by 4 to find out how much all the warehouse workers make: $3000/worker * 4 workers = $[[3000*4=12000]]12,000', 'L4': 'Now multiply the hours each manager works (also 200) by their hourly wage to find out how muc