In [2]:
import json
import random
import re
from pathlib import Path
from typing import List, Dict, Set, Tuple

import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from tqdm.auto import tqdm

# --- Path and Directory Definitions ---

def find_project_root(marker: str = ".git") -> Path:
    """Traverse upwards to find the project root, marked by the git repository."""
    current_path = Path.cwd().resolve()
    while current_path != current_path.parent:
        if (current_path / marker).exists():
            return current_path
        current_path = current_path.parent
    raise FileNotFoundError(f"Could not find project root. Marker '{marker}' not found.")

# Names used to locate the per-validator catalog CSVs (validation_catalog_<name>.csv).
VALIDATORS = ["ali", "arvind", "mauro", "ling", "yewei"]

# Every data location is derived from the repository root.
PROJECT_ROOT = find_project_root()
DATA_DIR = PROJECT_ROOT / 'data'
CONCEPTUAL_ERRORS_DIR = DATA_DIR / 'conceptual-errors-accepted'
COMPUTATIONAL_ERRORS_DIR = DATA_DIR / 'computational-errors-generated'
CONCEPTUAL_CATALOG_DIR = DATA_DIR / 'conceptual-error-candidates'

# Catalog name -> CSV path: the two fixed catalogs first, then one entry per validator.
CATALOG_FILEPATH_DICT = {
    "manual": DATA_DIR / 'manually_generated_errors_final.csv',
    "computational": COMPUTATIONAL_ERRORS_DIR / 'computational_error_catalog.csv',
    **{
        f"conceptual_{validator}": CONCEPTUAL_CATALOG_DIR / f'validation_catalog_{validator}.csv'
        for validator in VALIDATORS
    },
}

# Display the filepaths
for name, path in CATALOG_FILEPATH_DICT.items():
    print(f"{name}: {path}")

# Catalog name -> DataFrame, keyed identically to CATALOG_FILEPATH_DICT.
CATALOG_DICT = {
    catalog_name: pd.read_csv(csv_path)
    for catalog_name, csv_path in CATALOG_FILEPATH_DICT.items()
}

manual: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/manually_generated_errors_final.csv
computational: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/computational-errors-generated/computational_error_catalog.csv
conceptual_ali: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/conceptual-error-candidates/validation_catalog_ali.csv
conceptual_arvind: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/conceptual-error-candidates/validation_catalog_arvind.csv
conceptual_mauro: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/conceptual-error-candidates/validation_catalog_mauro.csv
conceptual_ling: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/conceptual-error-candidates/validation_catalog_ling.csv
conceptual_yewei: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/conceptual-error-candidates/validation_catalog_yewei.csv


In [3]:
# Sanity check: report the row count and the column names of every loaded catalog.
for key, df in CATALOG_DICT.items():
    print(f"{key}: {len(df)} rows")
    print("columns:", df.columns.tolist())

manual: 1963 rows
columns: ['answer', 'erroneous_line_number', 'error_type', 'explanation', 'filepath', 'index', 'question', 'wrong_answer']
computational: 22623 rows
columns: ['index', 'tier', 'model', 'erroneous_line_number', 'explanation', 'wrong_answer', 'correct_trace_generated', 'target_variable', 'error_type', 'correct_value', 'flawed_value', 'repro_seed', 'date_utc', 'time_utc', 'filepath']
conceptual_ali: 398 rows
columns: ['index', 'tier', 'model', 'mutation_type', 'target_variable', 'correct_value', 'flawed_value', 'repro_seed', 'decision_date_utc', 'decision_time_utc', 'status', 'manual_edits', 'filepath', 'validator', 'tier_numeric', 'priority']
conceptual_arvind: 394 rows
columns: ['index', 'tier', 'model', 'mutation_type', 'target_variable', 'correct_value', 'flawed_value', 'repro_seed', 'decision_date_utc', 'decision_time_utc', 'status', 'manual_edits', 'filepath', 'validator', 'tier_numeric', 'priority']
conceptual_mauro: 381 rows
columns: ['index', 'tier', 'model', 'm

In [11]:
# "train" split of the GSM8K "main" configuration. The catalog-processing
# functions in this notebook index into it with each catalog row's 'index' value.
GSM8K_TRAIN = load_dataset("gsm8k", "main")["train"]

In [14]:
# --- Tier Definition Functions (copied from arvind-july-25.ipynb) ---

def has_computational_division(solution_text: str):
    """Return True when the text contains a '/' followed (after optional whitespace) by a digit."""
    return re.search(r'/\s*\d', solution_text) is not None

def has_float(solution_text: str):
    """Return True when the text contains a decimal number such as '3.5' or a bare '.5'."""
    decimal_hit = re.search(r'(?<!\d)\.\d+|\d+\.\d+', solution_text)
    return decimal_hit is not None

def is_symbolic(solution_text: str):
    """
    Return True when some line of the text starts with 'Let ', a single ASCII
    letter and a space (e.g. 'Let x be ...'), i.e. the solution introduces a variable.
    """
    variable_intro = re.search(r'^Let [a-zA-Z] ', solution_text, flags=re.MULTILINE)
    return variable_intro is not None

def mutually_disjoint_tiers(dataset):
    """
    Partition the problems of `dataset` into five mutually disjoint tiers based
    on the content of each sample's "answer" text.

    Tier assignment (checked in this order):
        tier5: is_symbolic(answer) is true
        tier1: no float and no division
        tier2: float, no division
        tier3: division, no float
        tier4: both float and division

    Args:
        dataset: Iterable of mapping-like samples; a missing "answer" key is
            treated as an empty string.

    Returns:
        Dict with keys "tier1".."tier5" (in that order), each mapping to the
        ascending list of positional indices assigned to that tier.
    """
    # (has_float, has_computational_division) -> tier for non-symbolic samples.
    tier_by_flags = {
        (False, False): "tier1",
        (True, False): "tier2",
        (False, True): "tier3",
        (True, True): "tier4",
    }
    tiers = {f"tier{number}": [] for number in range(1, 6)}

    # Single pass: every sample is fetched and scanned once, instead of being
    # re-indexed and re-matched for each tier. Indices are appended in
    # enumeration order, so every list is already sorted.
    for idx, sample in enumerate(dataset):
        answer = sample.get("answer", "")
        if is_symbolic(answer):
            tiers["tier5"].append(idx)
        else:
            flags = (has_float(answer), has_computational_division(answer))
            tiers[tier_by_flags[flags]].append(idx)
    return tiers

def sanitize_commas(text: str) -> str:
    """Drop any comma that sits directly between two digits (e.g. '1,000' -> '1000')."""
    digit_comma_digit = re.compile(r'(\d),(\d)')
    return digit_comma_digit.sub(lambda hit: hit.group(1) + hit.group(2), text)

def sanitize_text(text: str) -> str:
    """
    Map a fixed set of typographic Unicode characters (math operators, curly
    quotes, dashes, ellipsis, no-break space) to ASCII equivalents.

    Returns an empty string when `text` is not a str.
    """
    if not isinstance(text, str):
        return ""

    # One translation table applied in a single pass; every source is a single
    # character and no replacement contains a mapped character.
    ascii_table = str.maketrans({
        "\u2212": "-",  # Minus Sign
        "\u00d7": "*",  # Multiplication Sign
        "\u00f7": "/",  # Division Sign
        "\u22c5": "*",  # Dot Operator
        "\u201c": '"',  # Left Double Quotation Mark
        "\u201d": '"',  # Right Double Quotation Mark
        "\u2018": "'",  # Left Single Quotation Mark
        "\u2019": "'",  # Right Single Quotation Mark
        "\u2014": "-",  # Em Dash
        "\u2013": "-",  # En Dash
        "\u2026": "...",# Horizontal Ellipsis
        "\u00a0": " ",  # No-Break Space
    })
    return text.translate(ascii_table)

def clean_and_split_solution(raw_text: str) -> Tuple[str, str | None]:
    """
    Sanitize a raw solution text and separate its reasoning lines from the
    final answer line.

    Processing steps: Unicode sanitization (sanitize_text), removal of
    '<<...>>' annotations, removal of digit-group commas (sanitize_commas),
    then splitting on newlines.

    Args:
        raw_text: The raw solution text. Non-string input yields ("", None).

    Returns:
        (reasoning_text, final_answer) where
        - reasoning_text is the non-blank reasoning lines, each stripped, joined
          with '\n';
        - final_answer is the text after '####' on the last non-blank line, or
          None when that line is not a '####' line.
    """
    if not isinstance(raw_text, str):
        return "", None

    sanitized_text = sanitize_text(raw_text)
    text_no_annotations = re.sub(r'<<.*?>>', '', sanitized_text)
    text_no_commas = sanitize_commas(text_no_annotations)

    lines = text_no_commas.split('\n')

    # Drop trailing blank lines first. Otherwise a text ending in "\n" hides the
    # '####' line behind an empty last element: the answer would be reported as
    # missing and the '####' line would be counted as a reasoning line.
    while lines and not lines[-1].strip():
        lines.pop()

    final_answer = None
    if lines:
        answer_match = re.match(r'^\s*####\s*(.*)$', lines[-1])
        if answer_match:
            lines.pop()
            final_answer = answer_match.group(1).strip()

    cleaned_lines = [line.strip() for line in lines if line.strip()]
    reasoning_text = '\n'.join(cleaned_lines)

    return reasoning_text, final_answer

# Visible confirmation that the helper definitions above were executed.
print("✓ Helper functions loaded successfully")

✓ Helper functions loaded successfully


In [15]:
# --- Create Tier Mappings ---

def add_tier_column(df, tier_lists):
    """
    Write a 'tier' column into `df` (in place) by looking up each row's 'index'
    value in `tier_lists`, and return the same dataframe.

    Args:
        df: DataFrame with an 'index' column.
        tier_lists: Mapping of tier name -> iterable of indices in that tier.

    Rows whose index appears in no tier get the value 'unknown', and a warning
    with their count is printed.
    """
    # Flatten {tier: [indices]} into {index: tier}; if an index is listed under
    # several tiers, the tier iterated last wins.
    index_to_tier = {
        problem_idx: tier_name
        for tier_name, members in tier_lists.items()
        for problem_idx in members
    }

    df['tier'] = df['index'].map(index_to_tier)

    unmapped_count = df['tier'].isna().sum()
    if unmapped_count > 0:
        print(f"Warning: {unmapped_count} indices could not be mapped to tiers")
        df['tier'] = df['tier'].fillna('unknown')

    return df

# Generate tier mappings for the entire GSM8K dataset
# Generate tier mappings for the GSM8K training split
# (TIER_LISTS maps each tier name to the list of problem indices in that tier).
TIER_LISTS = mutually_disjoint_tiers(GSM8K_TRAIN)
print("Tier distribution in GSM8K:")
for tier, indices in TIER_LISTS.items():
    print(f"  {tier}: {len(indices)} problems")

Tier distribution in GSM8K:
  tier1: 2767 problems
  tier2: 837 problems
  tier3: 3113 problems
  tier4: 544 problems
  tier5: 212 problems


In [16]:
# --- Solution Analysis Functions ---

def validate_line_number(erroneous_line_number: str, gsm8k_answer: str) -> bool:
    """
    Check that a line label refers to a line that exists in the parsed solution.

    Args:
        erroneous_line_number: "FA" for the final answer line, or "L<n>" for
            the n-th reasoning line (1-based).
        gsm8k_answer: Raw solution text, parsed with clean_and_split_solution.

    Returns:
        True when the label is "FA" and a final answer was found, or when it is
        "L<n>" with 1 <= n <= number of reasoning lines. False for every other
        label, including non-string or non-numeric ones.
    """
    try:
        reasoning, final_answer = clean_and_split_solution(gsm8k_answer)

        if erroneous_line_number == "FA":
            return final_answer is not None
        if erroneous_line_number.startswith("L"):
            reasoning_lines = [line for line in reasoning.split('\n') if line.strip()]
            return 1 <= int(erroneous_line_number[1:]) <= len(reasoning_lines)
        return False
    except Exception:
        # Expected failures: a non-string label (no .startswith) or a
        # non-numeric suffix such as "Lx". `Exception` rather than a bare
        # `except` so that KeyboardInterrupt/SystemExit still stop a long run.
        return False

def calculate_solution_metrics(gsm8k_answer: str) -> Tuple[int, Dict[str, str]]:
    """
    Count the lines of a parsed solution and map line labels to their content.

    Args:
        gsm8k_answer: Raw solution text, parsed with clean_and_split_solution.

    Returns:
        (solution_length, line_mapping) where line_mapping has one "L<n>" key
        per non-blank reasoning line (1-based) plus an "FA" key holding the
        final answer when one was found, and solution_length is the number of
        entries in that mapping. Returns (0, {}) if parsing fails.
    """
    try:
        reasoning, final_answer = clean_and_split_solution(gsm8k_answer)
        reasoning_lines = [line for line in reasoning.split('\n') if line.strip()]

        line_mapping = {
            f"L{position}": line
            for position, line in enumerate(reasoning_lines, start=1)
        }
        if final_answer is not None:
            line_mapping["FA"] = final_answer  # Include FA in count

        # Labels are unique, so the mapping size equals reasoning lines (+1 for FA).
        return len(line_mapping), line_mapping
    except Exception:
        # Best-effort fallback. `Exception` rather than a bare `except` so that
        # KeyboardInterrupt/SystemExit still stop a long run.
        return 0, {}

def calculate_relative_line_position(erroneous_line_number: str, gsm8k_answer: str) -> float:
    """
    Compute where the erroneous line sits within the parsed solution.

    Args:
        erroneous_line_number: "FA" for the final answer line, or "L<n>" for
            the n-th reasoning line (1-based).
        gsm8k_answer: Raw solution text, parsed with clean_and_split_solution.

    Returns:
        - 1.0 for "FA";
        - n / total_lines for "L<n>", where total_lines counts the reasoning
          lines plus one for the final answer if present. The first line of an
          N-line solution therefore maps to 1/N, not 0;
        - 0.5 as a fallback for any other label, a non-string or non-numeric
          label, or a solution with no lines.
    """
    try:
        reasoning, final_answer = clean_and_split_solution(gsm8k_answer)
        total_lines = len([line for line in reasoning.split('\n') if line.strip()])
        if final_answer is not None:
            total_lines += 1

        if erroneous_line_number == "FA":
            return 1.0
        if erroneous_line_number.startswith("L"):
            return int(erroneous_line_number[1:]) / total_lines
        return 0.5  # Default fallback
    except Exception:
        # Expected failures: non-string label, non-numeric suffix, or division
        # by zero for an empty solution. `Exception` rather than a bare `except`
        # so that KeyboardInterrupt/SystemExit still stop a long run.
        return 0.5

# Visible confirmation that the solution-analysis definitions above were executed.
print("✓ Solution analysis functions loaded successfully")

✓ Solution analysis functions loaded successfully


In [17]:
# --- Process Manual Catalog ---

def process_manual_catalog(catalog_dict, gsm8k_train, tier_lists) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Convert the manual error catalog into standardized rows.

    Args:
        catalog_dict: Mapping that holds the manual catalog DataFrame under the
            key 'manual'. Columns read: 'index', 'erroneous_line_number',
            'wrong_answer', 'error_type', 'explanation'.
        gsm8k_train: Indexable collection of problems with 'question' and
            'answer' fields; catalog 'index' values are positions into it.
        tier_lists: Mapping of tier name -> indices, passed to add_tier_column.

    Returns:
        (clean_rows_df, problematic_rows_df). Clean rows carry the standardized
        columns (index, tier, question, correct_answer, wrong_answer,
        error_type, erroneous_line_number, explanation, error_subtype, source,
        solution_length, relative_line_position). Problematic rows are the
        original catalog rows plus 'error_reason' and 'source_catalog'.
        All tiers are kept for the manual catalog (including tier5).
    """
    print("=== Processing Manual Catalog ===")

    df = catalog_dict['manual'].copy()
    print(f"Initial rows: {len(df)}")

    clean_rows = []
    problematic_rows = []
    n_problems = len(gsm8k_train)

    def reject(row, reason):
        """Record `row` as problematic together with the reason it was rejected."""
        problematic_rows.append({
            **row.to_dict(),
            'error_reason': reason,
            'source_catalog': 'manual'
        })

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing manual catalog"):
        try:
            problem_index = int(row['index'])

            # Check if erroneous_line_number is missing
            if pd.isna(row['erroneous_line_number']):
                reject(row, 'Missing erroneous_line_number')
                continue

            # Reject negative indices as well: with sequence-style indexing they
            # would wrap around and silently select an unrelated problem.
            if not 0 <= problem_index < n_problems:
                reject(row, f'Index {problem_index} out of range')
                continue

            gsm8k_problem = gsm8k_train[problem_index]

            # Validate line number
            if not validate_line_number(row['erroneous_line_number'], gsm8k_problem['answer']):
                reject(row, f'Invalid line number: {row["erroneous_line_number"]}')
                continue

            # Calculate metrics
            solution_length, _ = calculate_solution_metrics(gsm8k_problem['answer'])
            relative_position = calculate_relative_line_position(row['erroneous_line_number'], gsm8k_problem['answer'])

            # Create standardized row
            clean_rows.append({
                'index': problem_index,
                'tier': None,  # Filled in by add_tier_column below
                'question': gsm8k_problem['question'],
                'correct_answer': gsm8k_problem['answer'],
                'wrong_answer': row['wrong_answer'],
                'error_type': row['error_type'] + '_error',
                'erroneous_line_number': row['erroneous_line_number'],
                'explanation': row['explanation'],
                'error_subtype': 'NA',
                'source': 'manual',
                'solution_length': solution_length,
                'relative_line_position': relative_position
            })

        except Exception as e:
            # Anything unexpected (e.g. a non-numeric 'index' or a missing
            # 'error_type') is recorded rather than aborting the whole run.
            reject(row, f'Processing error: {str(e)}')

    clean_df = pd.DataFrame(clean_rows)
    problematic_df = pd.DataFrame(problematic_rows)

    # Add tier information
    if not clean_df.empty:
        clean_df = add_tier_column(clean_df, tier_lists)
        # Keep all tiers for manual catalog (including tier5)

    print(f"Clean rows: {len(clean_df)}")
    print(f"Problematic rows: {len(problematic_df)}")

    return clean_df, problematic_df

In [18]:
# Standardize the manual catalog; rejected rows are kept separately with a reason.
manual_clean, manual_problematic = process_manual_catalog(CATALOG_DICT, GSM8K_TRAIN, TIER_LISTS)

=== Processing Manual Catalog ===
Initial rows: 1963


Processing manual catalog:   0%|          | 0/1963 [00:00<?, ?it/s]

Clean rows: 1740
Problematic rows: 223


In [19]:
# --- Process Computational Catalog ---

def process_computational_catalog(catalog_dict, gsm8k_train, tier_lists) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Convert the computational error catalog into standardized rows.

    Args:
        catalog_dict: Mapping that holds the computational catalog DataFrame
            under the key 'computational'. Columns read: 'index',
            'erroneous_line_number', 'wrong_answer', 'explanation', 'error_type'.
        gsm8k_train: Indexable collection of problems with 'question' and
            'answer' fields; catalog 'index' values are positions into it.
        tier_lists: Mapping of tier name -> indices; used to label each row and
            to exclude problems listed under 'tier5'.

    Returns:
        (clean_rows_df, problematic_rows_df). Clean rows have
        error_type='computational_error', the catalog's original 'error_type'
        as 'error_subtype', and source='programmatic'. Problematic rows are the
        original catalog rows plus 'error_reason' and 'source_catalog'.
    """
    print("=== Processing Computational Catalog ===")

    df = catalog_dict['computational'].copy()
    print(f"Initial rows: {len(df)}")

    clean_rows = []
    problematic_rows = []
    n_problems = len(gsm8k_train)

    # Build the index -> tier lookup once instead of scanning every tier list
    # for every row. setdefault keeps the first tier that lists an index.
    index_to_tier = {}
    for tier_name, indices in tier_lists.items():
        for member in indices:
            index_to_tier.setdefault(member, tier_name)

    def reject(row, reason):
        """Record `row` as problematic together with the reason it was rejected."""
        problematic_rows.append({
            **row.to_dict(),
            'error_reason': reason,
            'source_catalog': 'computational'
        })

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing computational catalog"):
        try:
            problem_index = int(row['index'])

            # Check if erroneous_line_number is missing
            if pd.isna(row['erroneous_line_number']):
                reject(row, 'Missing erroneous_line_number')
                continue

            # Reject negative indices as well: with sequence-style indexing they
            # would wrap around and silently select an unrelated problem.
            if not 0 <= problem_index < n_problems:
                reject(row, f'Index {problem_index} out of range')
                continue

            gsm8k_problem = gsm8k_train[problem_index]

            # Check if this is tier5 and exclude it (None when in no tier)
            problem_tier = index_to_tier.get(problem_index)
            if problem_tier == 'tier5':
                reject(row, 'Excluded tier5 problem')
                continue

            # Validate line number
            if not validate_line_number(row['erroneous_line_number'], gsm8k_problem['answer']):
                reject(row, f'Invalid line number: {row["erroneous_line_number"]}')
                continue

            # Calculate metrics
            solution_length, _ = calculate_solution_metrics(gsm8k_problem['answer'])
            relative_position = calculate_relative_line_position(row['erroneous_line_number'], gsm8k_problem['answer'])

            # Create standardized row
            clean_rows.append({
                'index': problem_index,
                'tier': problem_tier,
                'question': gsm8k_problem['question'],
                'correct_answer': gsm8k_problem['answer'],
                'wrong_answer': row['wrong_answer'],
                'error_type': 'computational_error',
                'erroneous_line_number': row['erroneous_line_number'],
                'explanation': row['explanation'],
                'error_subtype': row['error_type'],  # Use original error_type as subtype
                'source': 'programmatic',
                'solution_length': solution_length,
                'relative_line_position': relative_position
            })

        except Exception as e:
            # Anything unexpected (e.g. a non-numeric 'index') is recorded
            # rather than aborting the whole run.
            reject(row, f'Processing error: {str(e)}')

    clean_df = pd.DataFrame(clean_rows)
    problematic_df = pd.DataFrame(problematic_rows)

    print(f"Clean rows: {len(clean_df)}")
    print(f"Problematic rows: {len(problematic_df)}")

    return clean_df, problematic_df

In [20]:
# Standardize the computational catalog; rejected rows are kept separately with a reason.
computational_clean, computational_problematic = process_computational_catalog(CATALOG_DICT, GSM8K_TRAIN, TIER_LISTS)

=== Processing Computational Catalog ===
Initial rows: 22623


Processing computational catalog:   0%|          | 0/22623 [00:00<?, ?it/s]

Clean rows: 21768
Problematic rows: 855


In [36]:
# --- Process Validator Catalogs (Updated with Fixed Path Handling and Answer Formatting) ---

def process_validator_catalogs(catalog_dict, gsm8k_train, tier_lists, validators, project_root) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Build standardized conceptual-error rows from every validator catalog.

    For each validator only the rows whose 'status' is 'accepted' are
    processed. Each such row points (via 'filepath') to a JSON file from which
    the flawed solution, explanation, erroneous line label and error type are
    read.

    Args:
        catalog_dict: Mapping holding one DataFrame per validator under the key
            'conceptual_<validator>'. Columns read: 'status', 'index', 'filepath'.
        gsm8k_train: Indexable collection of problems with 'question' and
            'answer' fields; catalog 'index' values are positions into it.
        tier_lists: Mapping of tier name -> indices; used to label each row and
            to exclude problems listed under 'tier5'.
        validators: Validator names to process, in order.
        project_root: Base directory against which relative JSON paths are
            resolved.

    Returns:
        (clean_rows_df, problematic_rows_df). Clean rows have
        error_type='conceptual_error' and source='programmatic'. Problematic
        rows are the original catalog rows plus 'error_reason' and
        'source_catalog' ('conceptual_<validator>').
    """
    print("=== Processing Validator Catalogs ===")

    all_clean_rows = []
    all_problematic_rows = []
    n_problems = len(gsm8k_train)

    # Build the index -> tier lookup once instead of scanning every tier list
    # for every row. setdefault keeps the first tier that lists an index.
    index_to_tier = {}
    for tier_name, indices in tier_lists.items():
        for member in indices:
            index_to_tier.setdefault(member, tier_name)

    def reject(row, source_catalog, reason):
        """Record `row` as problematic together with the reason it was rejected."""
        all_problematic_rows.append({
            **row.to_dict(),
            'error_reason': reason,
            'source_catalog': source_catalog
        })

    def resolve_json_path(filepath):
        """
        Turn a catalog 'filepath' value into a Path, or None when it is missing/empty.
        Backslashes are replaced with forward slashes so paths recorded on
        Windows resolve elsewhere; relative paths are anchored at project_root.
        """
        if pd.isna(filepath) or filepath == "":
            return None
        candidate = Path(str(filepath).replace('\\', '/'))
        return candidate if candidate.is_absolute() else project_root / candidate

    def fix_answer_formatting(wrong_answer: str) -> str:
        """
        Move the '####' final-answer line to the end of the text and drop blank
        lines. If several '####' lines exist, only the last one is kept.
        Non-string input and text without a '####' line are returned unchanged.
        """
        if not isinstance(wrong_answer, str):
            return wrong_answer

        final_answer_line = None
        other_lines = []
        for line in wrong_answer.strip().split('\n'):
            stripped = line.strip()
            if re.match(r'^\s*####\s*.*$', stripped):
                final_answer_line = stripped
            elif stripped:  # Skip empty lines
                other_lines.append(line)

        # Reconstruct with final answer at the end
        if final_answer_line and other_lines:
            return '\n'.join(other_lines) + '\n' + final_answer_line
        if final_answer_line:
            return final_answer_line
        return wrong_answer

    for validator in validators:
        print(f"\nProcessing validator: {validator}")
        source_catalog = f'conceptual_{validator}'
        df = catalog_dict[source_catalog].copy()

        # Filter to only accepted samples
        df = df[df['status'] == 'accepted']
        print(f"Accepted rows for {validator}: {len(df)}")

        for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Processing {validator}"):
            try:
                problem_index = int(row['index'])

                # Reject negative indices as well: with sequence-style indexing
                # they would wrap around and silently select an unrelated problem.
                if not 0 <= problem_index < n_problems:
                    reject(row, source_catalog, f'Index {problem_index} out of range')
                    continue

                gsm8k_problem = gsm8k_train[problem_index]

                # Check if this is tier5 and exclude it (None when in no tier)
                problem_tier = index_to_tier.get(problem_index)
                if problem_tier == 'tier5':
                    reject(row, source_catalog, 'Excluded tier5 problem')
                    continue

                # Load JSON file with proper path handling
                try:
                    json_path = resolve_json_path(row['filepath'])
                    if json_path is None or not json_path.exists():
                        reject(row, source_catalog, f'JSON file not found: {row["filepath"]}')
                        continue

                    with open(json_path, 'r', encoding='utf-8') as f:
                        json_data = json.load(f)
                except Exception as e:
                    reject(row, source_catalog, f'JSON loading error: {str(e)}')
                    continue

                # Extract data from JSON
                try:
                    raw_wrong_answer = json_data['context']['flawed_solution']
                    explanation = json_data['error_details']['explanation']
                    erroneous_line_number = json_data['error_details']['erroneous_line_number']
                    error_subtype = json_data['error_details']['error_type']
                except KeyError as e:
                    reject(row, source_catalog, f'Missing JSON field: {str(e)}')
                    continue

                # Fix the answer formatting
                wrong_answer = fix_answer_formatting(raw_wrong_answer)

                # A JSON null or the literal string "null" both count as missing
                if erroneous_line_number is None or erroneous_line_number == "null":
                    reject(row, source_catalog, 'Null erroneous_line_number in JSON')
                    continue

                # Validate line number
                if not validate_line_number(erroneous_line_number, gsm8k_problem['answer']):
                    reject(row, source_catalog, f'Invalid line number: {erroneous_line_number}')
                    continue

                # Calculate metrics
                solution_length, _ = calculate_solution_metrics(gsm8k_problem['answer'])
                relative_position = calculate_relative_line_position(erroneous_line_number, gsm8k_problem['answer'])

                # Create standardized row
                all_clean_rows.append({
                    'index': problem_index,
                    'tier': problem_tier,
                    'question': gsm8k_problem['question'],
                    'correct_answer': gsm8k_problem['answer'],
                    'wrong_answer': wrong_answer,  # Final answer line moved to the end
                    'error_type': 'conceptual_error',
                    'erroneous_line_number': erroneous_line_number,
                    'explanation': explanation,
                    'error_subtype': error_subtype,
                    'source': 'programmatic',
                    'solution_length': solution_length,
                    'relative_line_position': relative_position
                })

            except Exception as e:
                # Anything unexpected (e.g. a non-numeric 'index' or JSON whose
                # sections are not objects) is recorded rather than aborting.
                reject(row, source_catalog, f'Processing error: {str(e)}')

    clean_df = pd.DataFrame(all_clean_rows)
    problematic_df = pd.DataFrame(all_problematic_rows)

    print(f"\nTotal clean rows: {len(clean_df)}")
    print(f"Total problematic rows: {len(problematic_df)}")

    return clean_df, problematic_df

In [37]:
# Standardize the accepted rows of every validator catalog; rejected rows are
# kept separately with a reason.
validator_clean, validator_problematic = process_validator_catalogs(
    CATALOG_DICT, GSM8K_TRAIN, TIER_LISTS, VALIDATORS, PROJECT_ROOT
)

=== Processing Validator Catalogs ===

Processing validator: ali
Accepted rows for ali: 341


Processing ali:   0%|          | 0/341 [00:00<?, ?it/s]


Processing validator: arvind
Accepted rows for arvind: 91


Processing arvind:   0%|          | 0/91 [00:00<?, ?it/s]


Processing validator: mauro
Accepted rows for mauro: 312


Processing mauro:   0%|          | 0/312 [00:00<?, ?it/s]


Processing validator: ling
Accepted rows for ling: 110


Processing ling:   0%|          | 0/110 [00:00<?, ?it/s]


Processing validator: yewei
Accepted rows for yewei: 290


Processing yewei:   0%|          | 0/290 [00:00<?, ?it/s]


Total clean rows: 1144
Total problematic rows: 0


In [38]:
# --- Combine All Catalogs ---

def create_master_catalogs(manual_clean, computational_clean, validator_clean,
                          manual_problematic, computational_problematic, validator_problematic):
    """
    Stack the per-source results into two combined tables.

    Args:
        manual_clean, computational_clean, validator_clean: Standardized clean
            rows from each source.
        manual_problematic, computational_problematic, validator_problematic:
            Rejected rows from each source.

    Returns:
        (master_catalog, catalog_problematic). Empty inputs are skipped; if all
        inputs of a group are empty the result is an empty DataFrame. A
        non-empty master catalog is restricted to the standardized columns in a
        fixed order.
    """
    print("=== Creating Master Catalogs ===")

    def stack(frames):
        """Concatenate the non-empty frames in order, renumbering the row index."""
        non_empty = [frame for frame in frames if not frame.empty]
        if not non_empty:
            return pd.DataFrame()
        return pd.concat(non_empty, ignore_index=True)

    master_catalog = stack([manual_clean, computational_clean, validator_clean])
    catalog_problematic = stack([manual_problematic, computational_problematic, validator_problematic])

    # Ensure consistent column order for master catalog
    if not master_catalog.empty:
        master_catalog = master_catalog[[
            'index', 'tier', 'question', 'correct_answer', 'wrong_answer',
            'error_type', 'erroneous_line_number', 'explanation', 'error_subtype',
            'source', 'solution_length', 'relative_line_position'
        ]]

    print(f"Master catalog rows: {len(master_catalog)}")
    print(f"Problematic rows: {len(catalog_problematic)}")

    return master_catalog, catalog_problematic

# Merge the three per-source results into one clean catalog and one table of
# rejected rows.
master_catalog, catalog_problematic = create_master_catalogs(
    manual_clean, 
    computational_clean, 
    validator_clean,
    manual_problematic, 
    computational_problematic, 
    validator_problematic
)

print("✓ Master catalogs created successfully")

=== Creating Master Catalogs ===
Master catalog rows: 24652
Problematic rows: 1078
✓ Master catalogs created successfully


In [39]:
# --- Generate Summary Statistics ---

def _print_share_table(counts, total):
    """Print one indented ``label: count (pct%)`` line per entry of ``counts``.

    Args:
        counts: Series mapping a label to its occurrence count (e.g. the result
            of ``value_counts()``); rows are printed in the Series' own order.
        total: Denominator for the percentage. It is passed explicitly because
            callers report each count as a share of the whole frame, which may
            differ from ``counts.sum()`` (e.g. when only ``head(n)`` is passed).
    """
    for label, count in counts.items():
        pct = (count / total) * 100
        print(f"  {label}: {count:,} ({pct:.1f}%)")


def _print_master_statistics(master_catalog, max_subtypes):
    """Print every statistics section for a non-empty master catalog."""
    total_rows = len(master_catalog)

    # Basic statistics
    print("\n📊 BASIC STATISTICS")
    unique_indices = master_catalog['index'].nunique()
    print(f"Total rows: {total_rows:,}")
    print(f"Unique GSM8K indices: {unique_indices:,}")
    if unique_indices:
        print(f"Average entries per index: {total_rows / unique_indices:.2f}")
    else:
        # nunique() ignores missing values, so it can be 0 for a non-empty frame.
        print("Average entries per index: n/a")

    # Source distribution
    print("\n🔧 SOURCE DISTRIBUTION")
    _print_share_table(master_catalog['source'].value_counts(), total_rows)

    # Error type distribution
    print("\n🐛 ERROR TYPE DISTRIBUTION")
    _print_share_table(master_catalog['error_type'].value_counts(), total_rows)

    # Tier distribution (sorted by tier label rather than by frequency)
    print("\n🎯 TIER DISTRIBUTION")
    _print_share_table(master_catalog['tier'].value_counts().sort_index(), total_rows)

    # Error subtype distribution: only the most frequent subtypes are listed
    print("\n🔍 ERROR SUBTYPE DISTRIBUTION")
    subtype_counts = master_catalog['error_subtype'].value_counts()
    print(f"Total unique subtypes: {len(subtype_counts)}")
    _print_share_table(subtype_counts.head(max_subtypes), total_rows)
    if len(subtype_counts) > max_subtypes:
        print(f"  ... and {len(subtype_counts) - max_subtypes} more")

    # Solution length statistics (min/max are printed unformatted on purpose)
    solution_length = master_catalog['solution_length']
    print("\n📏 SOLUTION LENGTH STATISTICS")
    print(f"  Mean: {solution_length.mean():.1f} lines")
    print(f"  Median: {solution_length.median():.1f} lines")
    print(f"  Min: {solution_length.min()} lines")
    print(f"  Max: {solution_length.max()} lines")
    print(f"  Std: {solution_length.std():.1f} lines")

    # Relative line position statistics
    relative_position = master_catalog['relative_line_position']
    print("\n📍 RELATIVE LINE POSITION STATISTICS")
    print(f"  Mean: {relative_position.mean():.3f}")
    print(f"  Median: {relative_position.median():.3f}")
    print(f"  Min: {relative_position.min():.3f}")
    print(f"  Max: {relative_position.max():.3f}")
    print(f"  Std: {relative_position.std():.3f}")

    # Cross-tabulation: rows are tiers, columns are error types
    print("\n📊 CROSS-TABULATION: ERROR TYPE vs TIER")
    print(pd.crosstab(master_catalog['tier'], master_catalog['error_type'], margins=True))

    # Cross-tabulation: rows are sources, columns are error types
    print("\n📊 CROSS-TABULATION: SOURCE vs ERROR TYPE")
    print(pd.crosstab(master_catalog['source'], master_catalog['error_type'], margins=True))


def _print_problematic_summary(catalog_problematic):
    """Print the row count and per-reason / per-source breakdown of problematic rows."""
    print("\n" + "=" * 80)
    print("PROBLEMATIC ENTRIES SUMMARY")
    print("=" * 80)

    if catalog_problematic.empty:
        print("✅ No problematic entries found!")
        return

    total_problematic = len(catalog_problematic)
    print(f"Total problematic rows: {total_problematic:,}")

    # Both breakdown columns are optional, so each section is printed only
    # when its column is present.
    if 'error_reason' in catalog_problematic.columns:
        print("\n❌ ERROR REASONS")
        _print_share_table(catalog_problematic['error_reason'].value_counts(), total_problematic)

    if 'source_catalog' in catalog_problematic.columns:
        print("\n📁 PROBLEMATIC BY SOURCE CATALOG")
        _print_share_table(catalog_problematic['source_catalog'].value_counts(), total_problematic)


def print_catalog_summary(master_catalog, catalog_problematic, max_subtypes=10):
    """
    Prints comprehensive summary statistics for the master catalog and problematic entries.

    Args:
        master_catalog: DataFrame with at least the columns 'index', 'source',
            'error_type', 'tier', 'error_subtype', 'solution_length' and
            'relative_line_position' (all are read when the frame is non-empty).
        catalog_problematic: DataFrame of rejected rows. The 'error_reason' and
            'source_catalog' columns are summarised when present.
        max_subtypes: How many of the most frequent error subtypes to list
            before collapsing the rest into an "... and N more" line.

    Returns:
        None. Everything is written to stdout.

    Note:
        An empty master catalog no longer short-circuits the whole report: the
        problematic-entries section is still printed, because that is the case
        in which it is most needed.
    """
    print("=" * 80)
    print("MASTER CATALOG SUMMARY STATISTICS")
    print("=" * 80)

    if master_catalog.empty:
        print("❌ Master catalog is empty!")
    else:
        _print_master_statistics(master_catalog, max_subtypes)

    _print_problematic_summary(catalog_problematic)

# Print the summary report for the two catalogs to the cell output.
# The function only writes to stdout, so there is no return value to capture.
print_catalog_summary(master_catalog, catalog_problematic)

MASTER CATALOG SUMMARY STATISTICS

📊 BASIC STATISTICS
Total rows: 24,652
Unique GSM8K indices: 6,777
Average entries per index: 3.64

🔧 SOURCE DISTRIBUTION
  programmatic: 22,912 (92.9%)
  manual: 1,740 (7.1%)

🐛 ERROR TYPE DISTRIBUTION
  computational_error: 22,542 (91.4%)
  conceptual_error: 2,110 (8.6%)

🎯 TIER DISTRIBUTION
  tier1: 11,188 (45.4%)
  tier2: 3,458 (14.0%)
  tier3: 8,288 (33.6%)
  tier4: 1,662 (6.7%)
  tier5: 56 (0.2%)

🔍 ERROR SUBTYPE DISTRIBUTION
Total unique subtypes: 15
  generate_digit_transposition_error: 9,505 (38.6%)
  generate_off_by_n_error: 7,083 (28.7%)
  generate_off_by_factor_of_10_error: 2,561 (10.4%)
  NA: 1,740 (7.1%)
  generate_stem_off_by_n_error: 1,512 (6.1%)
  generate_multiplication_by_reciprocal_error: 696 (2.8%)
  incorrect_final_answer_selection: 340 (1.4%)
  generate_decimal_shift_error: 284 (1.2%)
  operator_swap: 222 (0.9%)
  incomplete_calculation: 222 (0.9%)
  ... and 5 more

📏 SOLUTION LENGTH STATISTICS
  Mean: 4.9 lines
  Median: 5.0 lin

In [43]:
# --- Pretty Print Sample Rows by Error Type and Source ---

def _print_answer_lines(answer):
    """Print every non-blank line of a multi-line answer with a three-space indent.

    A missing value (NaN/None) prints nothing instead of raising on ``.split``;
    any other non-string value is printed via its ``str()`` form.
    """
    if not isinstance(answer, str):
        if pd.isna(answer):
            return
        answer = str(answer)
    for line in answer.split('\n'):
        if line.strip():
            print(f"   {line}")


def pretty_print_sample_rows(master_catalog):
    """
    Pretty prints one sample row from each combination of error_type and source.

    Args:
        master_catalog: DataFrame with the columns 'error_type', 'source',
            'index', 'tier', 'error_subtype', 'solution_length',
            'erroneous_line_number', 'relative_line_position', 'question',
            'correct_answer', 'wrong_answer' and 'explanation'.

    Returns:
        None. For every (error_type, source) pair the first matching row, in
        the frame's existing order, is written to stdout.
    """
    print("=" * 100)
    print("SAMPLE ROWS BY ERROR TYPE AND SOURCE")
    print("=" * 100)

    # Series indexed by (error_type, source) -> number of rows. Iterating it
    # directly avoids addressing the count through the implicit column label 0
    # that reset_index() would create.
    combo_sizes = master_catalog.groupby(['error_type', 'source']).size()

    for (error_type, source), count in combo_sizes.items():
        print(f"\n{'='*20} {str(error_type).upper()} + {str(source).upper()} {'='*20}")
        print(f"Total samples: {count:,}")
        print("-" * 80)

        matching_rows = master_catalog[
            (master_catalog['error_type'] == error_type)
            & (master_catalog['source'] == source)
        ]

        if matching_rows.empty:
            # Kept as a guard for combinations that the groupby reports with a
            # zero count.
            print("No samples found for this combination.")
        else:
            sample = matching_rows.iloc[0]

            print(f"📍 INDEX: {sample['index']}")
            print(f"🎯 TIER: {sample['tier']}")
            print(f"🔧 ERROR SUBTYPE: {sample['error_subtype']}")
            print(f"📏 SOLUTION LENGTH: {sample['solution_length']} lines")
            print(f"📍 ERROR LINE: {sample['erroneous_line_number']} (relative position: {sample['relative_line_position']:.3f})")
            print()

            print("❓ QUESTION:")
            print(f"   {sample['question']}")
            print()

            print("✅ CORRECT ANSWER:")
            _print_answer_lines(sample['correct_answer'])
            print()

            print("❌ WRONG ANSWER:")
            _print_answer_lines(sample['wrong_answer'])
            print()

            print("💡 EXPLANATION:")
            print(f"   {sample['explanation']}")
            print()

        print("=" * 80)

# Print one example row per (error_type, source) combination of the master
# catalog to the cell output, as a quick visual sanity check of the data.
pretty_print_sample_rows(master_catalog)

SAMPLE ROWS BY ERROR TYPE AND SOURCE

Total samples: 774
--------------------------------------------------------------------------------
📍 INDEX: 1000
🎯 TIER: tier3
🔧 ERROR SUBTYPE: NA
📏 SOLUTION LENGTH: 3 lines
📍 ERROR LINE: L1 (relative position: 0.333)

❓ QUESTION:
   John buys a heating pad for $30.  He uses it 3 times a week for 2 weeks.  How much does he spend on each use?

✅ CORRECT ANSWER:
   He uses it 3*2=<<3*2=6>>6 times
   So he pays 30/6=$<<30/6=5>>5
   #### 5

❌ WRONG ANSWER:
   He uses it 3*2=<<3*2=5>>5 times
   So he pays 30/5=$<<30/5=6>>6
   #### 6

💡 EXPLANATION:
   computational error: John uses it 3*2=<<3*2=6>>6 times.


Total samples: 21,768
--------------------------------------------------------------------------------
📍 INDEX: 4
🎯 TIER: tier1
🔧 ERROR SUBTYPE: generate_digit_transposition_error
📏 SOLUTION LENGTH: 4 lines
📍 ERROR LINE: L2 (relative position: 0.500)

❓ QUESTION:
   James writes a 3-page letter to 2 different friends twice a week.  How many pages d

In [44]:
# Persist the master catalog as 'master_catalog.csv' inside DATA_DIR
# (PROJECT_ROOT / 'data'). index=False keeps the DataFrame's row index out of
# the file. An existing file at that path is overwritten.
master_catalog.to_csv(DATA_DIR / 'master_catalog.csv', index=False)

In [45]:
# Persist the problematic rows as 'catalog_problematic.csv' inside DATA_DIR
# (PROJECT_ROOT / 'data'). index=False keeps the DataFrame's row index out of
# the file. An existing file at that path is overwritten.
catalog_problematic.to_csv(DATA_DIR / 'catalog_problematic.csv', index=False)