In [75]:
import json
import random
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple

import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from tqdm.auto import tqdm

# --- Path and Directory Definitions ---

def find_project_root(marker: str = ".git") -> Path:
    """
    Locate the project root by walking upwards from the current working directory.

    Args:
        marker: Name of a file or directory whose presence identifies the root.

    Returns:
        The first directory (starting at the resolved cwd, then each ancestor)
        that contains `marker`.

    Raises:
        FileNotFoundError: If no directory on the path up to and including the
            filesystem root contains `marker`.
    """
    start = Path.cwd().resolve()
    # `parents` ends with the filesystem root, so the root itself is tested too
    # (a `while path != path.parent` loop stops one level short of it).
    for candidate in (start, *start.parents):
        if (candidate / marker).exists():
            return candidate
    raise FileNotFoundError(f"Could not find project root. Marker '{marker}' not found.")

# Validator names; each one has a `validation_catalog_<name>.csv` file.
VALIDATORS = ["ali", "arvind", "mauro", "ling", "yewei"]

# Directory layout, anchored at the repository root.
PROJECT_ROOT = find_project_root()
DATA_DIR = PROJECT_ROOT / 'data'
CONCEPTUAL_ERRORS_DIR = DATA_DIR / 'conceptual-errors-accepted'
COMPUTATIONAL_ERRORS_DIR = DATA_DIR / 'computational-errors-generated'
CONCEPTUAL_CATALOG_DIR = DATA_DIR / 'conceptual-error-candidates'

# Catalog name -> CSV path: the two fixed catalogs first, then one per validator.
CATALOG_FILEPATH_DICT = {
    "manual": DATA_DIR / 'manually_generated_errors_final.csv',
    "computational": COMPUTATIONAL_ERRORS_DIR / 'computational_error_catalog.csv',
}
CATALOG_FILEPATH_DICT.update(
    (f"conceptual_{name}", CONCEPTUAL_CATALOG_DIR / f'validation_catalog_{name}.csv')
    for name in VALIDATORS
)

# Show every resolved path before reading anything.
for name, path in CATALOG_FILEPATH_DICT.items():
    print(f"{name}: {path}")

# Catalog name -> DataFrame, read eagerly from the paths above.
CATALOG_DICT = {
    key: pd.read_csv(path)
    for key, path in CATALOG_FILEPATH_DICT.items()
}

manual: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/manually_generated_errors_final.csv
computational: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/computational-errors-generated/computational_error_catalog.csv
conceptual_ali: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/conceptual-error-candidates/validation_catalog_ali.csv
conceptual_arvind: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/conceptual-error-candidates/validation_catalog_arvind.csv
conceptual_mauro: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/conceptual-error-candidates/validation_catalog_mauro.csv
conceptual_ling: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/conceptual-error-candidates/validation_catalog_ling.csv
conceptual_yewei: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/conceptual-error-candidates/validation_catalog_yewei.csv


In [76]:
# Quick sanity check: print the row count and column names of every loaded catalog.
for key, df in CATALOG_DICT.items():
    print(f"{key}: {len(df)} rows")
    print("columns:", df.columns.tolist())

manual: 1963 rows
columns: ['answer', 'erroneous_line_number', 'error_type', 'explanation', 'filepath', 'index', 'question', 'wrong_answer']
computational: 22623 rows
columns: ['index', 'tier', 'model', 'erroneous_line_number', 'explanation', 'wrong_answer', 'correct_trace_generated', 'target_variable', 'error_type', 'correct_value', 'flawed_value', 'repro_seed', 'date_utc', 'time_utc', 'filepath']
conceptual_ali: 398 rows
columns: ['index', 'tier', 'model', 'mutation_type', 'target_variable', 'correct_value', 'flawed_value', 'repro_seed', 'decision_date_utc', 'decision_time_utc', 'status', 'manual_edits', 'filepath', 'validator', 'tier_numeric', 'priority']
conceptual_arvind: 394 rows
columns: ['index', 'tier', 'model', 'mutation_type', 'target_variable', 'correct_value', 'flawed_value', 'repro_seed', 'decision_date_utc', 'decision_time_utc', 'status', 'manual_edits', 'filepath', 'validator', 'tier_numeric', 'priority']
conceptual_mauro: 381 rows
columns: ['index', 'tier', 'model', 'm

In [77]:
# Train split of the "main" configuration of the "gsm8k" dataset.
# Catalog `index` values are used further down as positions into this split.
GSM8K_TRAIN = load_dataset("gsm8k", "main")["train"]

In [78]:
# --- Tier Definition Functions (copied from arvind-july-25.ipynb) ---

def has_computational_division(solution_text: str):
    """
    Report whether the text contains a slash followed, after optional
    whitespace, by a digit (e.g. "10/2" or "10 / 2").
    """
    return re.search(r'/\s*\d', solution_text) is not None

def has_float(solution_text: str):
    """
    Report whether the text contains a decimal number: either digits on both
    sides of a dot ("3.5") or a leading-dot form not preceded by a digit (".5").
    """
    return re.search(r'(?<!\d)\.\d+|\d+\.\d+', solution_text) is not None

def is_symbolic(solution_text: str):
    """
    Report whether any line of the text starts with "Let <single letter> "
    (e.g. "Let x be ..."), which is taken as the sign of symbolic algebra.
    """
    return re.search(r'^Let [a-zA-Z] ', solution_text, flags=re.MULTILINE) is not None

def mutually_disjoint_tiers(dataset):
    """
    Partition every problem in `dataset` into five mutually disjoint tiers based
    on what appears in its solution text (the sample's "answer" field).

    Tier rules, checked in this order:
        tier5: symbolic solutions (`is_symbolic`)
        tier1: no float and no division
        tier2: float, no division
        tier3: division, no float
        tier4: float and division

    Args:
        dataset: Iterable of dict-like samples; a missing "answer" is treated as "".

    Returns:
        Dict mapping "tier1".."tier5" to ascending lists of sample positions.
    """
    tiers = {f"tier{level}": [] for level in range(1, 6)}

    # One pass: each sample is read once and each predicate evaluated at most once.
    # Positions are appended in increasing order, so every list is already sorted.
    for idx, sample in enumerate(dataset):
        solution = sample.get("answer", "")
        if is_symbolic(solution):
            tiers["tier5"].append(idx)
            continue

        uses_float = has_float(solution)
        uses_division = has_computational_division(solution)
        if uses_float and uses_division:
            tiers["tier4"].append(idx)
        elif uses_division:
            tiers["tier3"].append(idx)
        elif uses_float:
            tiers["tier2"].append(idx)
        else:
            tiers["tier1"].append(idx)

    return tiers

# --- Create Tier Mappings ---

def add_tier_column(df, tier_lists):
    """
    Attach a 'tier' column to `df` by looking up each value of its 'index'
    column in `tier_lists` (tier name -> list of indices).

    The frame is modified in place and also returned. Indices found in no tier
    are reported with a printed warning and labelled 'unknown'.
    """
    # Flatten {tier: [indices]} into {index: tier}; a later tier overrides an earlier one.
    tier_by_index = {
        member: tier_name
        for tier_name, members in tier_lists.items()
        for member in members
    }

    df['tier'] = df['index'].map(tier_by_index)

    n_unmapped = df['tier'].isna().sum()
    if n_unmapped > 0:
        print(f"Warning: {n_unmapped} indices could not be mapped to tiers")
        df['tier'] = df['tier'].fillna('unknown')

    return df

# Generate tier mappings for the entire GSM8K training split.
# TIER_LISTS maps each tier name to the list of dataset positions in that tier;
# the loop below just reports how many problems landed in each one.
TIER_LISTS = mutually_disjoint_tiers(GSM8K_TRAIN)
print("Tier distribution in GSM8K:")
for tier, indices in TIER_LISTS.items():
    print(f"  {tier}: {len(indices)} problems")

Tier distribution in GSM8K:
  tier1: 2767 problems
  tier2: 837 problems
  tier3: 3113 problems
  tier4: 544 problems
  tier5: 212 problems


In [79]:
def sanitize_text(text: str) -> str:
    r"""
    Normalize raw question/answer text in three steps, applied in order:

    1. Two-character literal sequences backslash + "n" become real newlines.
    2. A fixed set of Unicode characters (math signs, curly quotes, dashes,
       ellipsis, no-break / zero-width spaces, n-tilde) is mapped to ASCII.
    3. A comma sitting directly between two digits is removed ("1,000" -> "1000").

    This prevents model generation and string parsing errors.
    """
    # Single-character substitutions expressed as a translation table.
    # All targets are ASCII, so one pass equals replacing them one at a time.
    ascii_table = str.maketrans({
        "\u2212": "-",    # Minus Sign
        "\u00d7": "*",    # Multiplication Sign
        "\u00f7": "/",    # Division Sign
        "\u22c5": "*",    # Dot Operator
        "\u201c": '"',    # Left Double Quotation Mark
        "\u201d": '"',    # Right Double Quotation Mark
        "\u2018": "'",    # Left Single Quotation Mark
        "\u2019": "'",    # Right Single Quotation Mark
        "\u2014": "-",    # Em Dash
        "\u2013": "-",    # En Dash
        "\u2026": "...",  # Horizontal Ellipsis
        "\u00a0": " ",    # No-Break Space
        "\u00f1": "n",    # Spanish n-tilde -> n
        "\u200b": "",     # Zero Width Space -> removed
    })

    unescaped = text.replace('\\n', '\n')
    normalized = unescaped.translate(ascii_table)

    # Matches do not overlap, so in "1,2,3" only the first comma is dropped.
    return re.sub(r'(\d),(\d)', r'\1\2', normalized)

def make_answer_mapping(answer: str) -> Dict[str, str]:
    """
    Map line identifiers to solution lines for a sanitized answer text.

    Args:
        answer: Solution text with one step per line, optionally ending in a
            "#### <final answer>" line.

    Returns:
        Dict with keys "L1", "L2", ... for each non-blank solution line (stripped,
        with `<<...>>` calculator annotations removed) and, when a final-answer
        line is present, "FA" holding the text after "####".
    """
    lines = answer.split('\n')

    # Trailing blank lines (e.g. from a terminating newline) would otherwise hide
    # the "####" line from the last-line check and turn it into an "L<n>" entry.
    while lines and not lines[-1].strip():
        lines.pop()

    final_answer = None
    if lines:
        fa_match = re.match(r'^\s*####\s*(.*)$', lines[-1])
        if fa_match:
            lines.pop()
            final_answer = fa_match.group(1).strip()

    answer_mapping = {}
    solution_lines = [line.strip() for line in lines if line.strip()]
    for position, line in enumerate(solution_lines, start=1):
        # Remove calculator annotations from each line
        answer_mapping[f"L{position}"] = re.sub(r'<<.*?>>', '', line).strip()

    if final_answer is not None:
        answer_mapping["FA"] = final_answer

    return answer_mapping

def make_separate_mappings(answer: str) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Split an answer text into an equation mapping and an annotation-free line mapping.

    Args:
        answer: Solution text with one step per line, optionally ending in a
            "#### <final answer>" line.

    Returns:
        (eqn_mapping, clean_answer_mapping), both keyed by "L1", "L2", ... for
        each non-blank solution line:
          * eqn_mapping holds the content of the FIRST `<<...>>` calculator
            annotation on the line, or "" when the line has none.
          * clean_answer_mapping holds the stripped line with all annotations removed.
        When a final-answer line is present, both get an "FA" key (the text after
        "####" in the clean mapping, "" in the equation mapping).
    """
    lines = answer.split('\n')

    # Trailing blank lines (e.g. from a terminating newline) would otherwise hide
    # the "####" line from the last-line check and turn it into an "L<n>" entry.
    while lines and not lines[-1].strip():
        lines.pop()

    # Handle final answer line
    final_answer = None
    if lines:
        fa_match = re.match(r'^\s*####\s*(.*)$', lines[-1])
        if fa_match:
            lines.pop()
            final_answer = fa_match.group(1).strip()

    eqn_mapping = {}
    clean_answer_mapping = {}

    solution_lines = [line.strip() for line in lines if line.strip()]
    for position, line in enumerate(solution_lines, start=1):
        line_id = f"L{position}"

        # Only the first annotation on a line is recorded as its equation.
        first_annotation = re.search(r'<<(.*?)>>', line)
        eqn_mapping[line_id] = first_annotation.group(1) if first_annotation else ""

        # Clean text drops every annotation on the line.
        clean_answer_mapping[line_id] = re.sub(r'<<.*?>>', '', line).strip()

    if final_answer is not None:
        clean_answer_mapping["FA"] = final_answer
        eqn_mapping["FA"] = ""

    return eqn_mapping, clean_answer_mapping

def reconstruct_answer_from_clean_mapping(clean_mapping: Dict[str, str]) -> str:
    """
    Rebuild a newline-joined answer from a clean line mapping.

    Lines are taken from "L1", "L2", ... until the first missing key; entries
    that are blank after stripping are skipped. A non-blank "FA" entry is
    appended last as "FINAL ANSWER: <value>". An empty mapping yields "".
    """
    if not clean_mapping:
        return ""

    pieces = []
    # A run L1..Lk can have at most len(clean_mapping) members, so the break
    # below always fires within this range.
    for position in range(1, len(clean_mapping) + 2):
        key = f"L{position}"
        if key not in clean_mapping:
            break
        stripped = clean_mapping[key].strip()
        if stripped:
            pieces.append(stripped)

    if "FA" in clean_mapping:
        final_text = clean_mapping["FA"].strip()
        if final_text:
            pieces.append(f"FINAL ANSWER: {final_text}")

    return '\n'.join(pieces)

def process_answer_with_full_mappings(
        answer_text: str,
        answer_prefix: str) -> Optional[Dict[str, Any]]:
    """
    Sanitize an answer text and derive its line mappings and reconstructed form.

    Args:
        answer_text: Raw answer text.
        answer_prefix: Prefix for the returned keys; callers pass "correct" or "wrong".

    Returns:
        None when the input is not a string, when no lines or final answer could
        be extracted, or when any processing step raises. Otherwise a dict with:
          * '<prefix>_answer': annotation-free answer text ending in "FINAL ANSWER: ..."
          * '<prefix>_mapping': JSON string of the clean line mapping
          * '<prefix>_eqn_mapping': JSON string of the per-line equation mapping
          * '<prefix>_answer_length': number of entries in the clean mapping
            (this count includes the "FA" entry when present)
    """
    # Reject non-text input up front (e.g. a NaN from an empty CSV cell) instead
    # of relying on the broad handler below to swallow the resulting AttributeError.
    if not isinstance(answer_text, str):
        return None

    try:
        sanitized_text = sanitize_text(answer_text)

        # Extract equations and clean mappings from the sanitized text
        eqn_mapping, clean_answer_mapping = make_separate_mappings(sanitized_text)
        if not clean_answer_mapping:
            return None

        reconstructed_answer = reconstruct_answer_from_clean_mapping(clean_answer_mapping)

        # Mappings are stored as JSON strings so they fit in flat table cells.
        answer_mapping_json = json.dumps(clean_answer_mapping, ensure_ascii=False, indent=2)
        eqn_mapping_json = json.dumps(eqn_mapping, ensure_ascii=False, indent=2)

        return {
            answer_prefix + '_answer': reconstructed_answer,
            answer_prefix + '_mapping': answer_mapping_json,
            answer_prefix + '_eqn_mapping': eqn_mapping_json,
            answer_prefix + '_answer_length': len(clean_answer_mapping)
        }

    except Exception:
        # Deliberate best effort: any processing error marks the row as
        # problematic (callers treat None as failure) rather than aborting the run.
        return None

def validate_line_number_with_mapping(erroneous_line_number: str, answer_mapping: Dict[str, str]) -> bool:
    """
    Check that `erroneous_line_number` is a key of `answer_mapping`.

    A falsy line identifier or a falsy/empty mapping always yields False.
    """
    has_inputs = bool(erroneous_line_number) and bool(answer_mapping)
    return has_inputs and erroneous_line_number in answer_mapping

In [None]:
def process_single_catalog_row(
        gsm8k_problem: Dict, 
        wrong_answer_text: str,
        erroneous_line_number: str, 
        explanation: str,
        error_type: str, 
        error_subtype: str, 
        source: str,
        tier_lists: Dict,
        catalog_index: int) -> Tuple[Dict[str, Any], bool]:
    """
    Turn one catalog entry into a master-catalog row.

    Args:
        gsm8k_problem: GSM8K sample; its 'question' and 'answer' fields are read.
        wrong_answer_text: Raw text of the flawed solution.
        erroneous_line_number: Line identifier of the error ("L1", "L2", ..., or "FA").
        explanation: Error explanation, copied through unchanged.
        error_type: Type of error (computational_error/conceptual_error).
        error_subtype: Subtype of error, copied through unchanged.
        source: Source of data (manual/programmatic).
        tier_lists: Tier name -> collection of GSM8K indices.
        catalog_index: The GSM8K index from the source catalog.

    Returns:
        (row, True) on success, where `row` holds the master-catalog columns.
        ({}, False) when either answer cannot be processed, when
        `erroneous_line_number` is not a line of the CORRECT answer, or when any
        step raises.
    """
    try:
        problem_index = catalog_index

        # First tier whose index collection contains this problem; None if no tier does.
        problem_tier = next(
            (tier_name for tier_name, indices in tier_lists.items() if problem_index in indices),
            None,
        )

        cleaned_question = sanitize_text(gsm8k_problem['question'])

        correct_processed = process_answer_with_full_mappings(gsm8k_problem['answer'], "correct")
        if not correct_processed:
            return {}, False

        wrong_processed = process_answer_with_full_mappings(wrong_answer_text, "wrong")
        if not wrong_processed:
            return {}, False

        # The processed mappings are JSON strings; decode them for lookups.
        correct_mapping = json.loads(correct_processed['correct_mapping'])
        wrong_mapping = json.loads(wrong_processed['wrong_mapping'])
        wrong_eqn_mapping = json.loads(wrong_processed['wrong_eqn_mapping'])

        # NOTE(review): the identifier is validated against the CORRECT answer but
        # the line is read from the WRONG answer below. If the wrong answer has
        # fewer lines, the row still succeeds with an empty erroneous_line.
        # Confirm this is intended.
        if not validate_line_number_with_mapping(erroneous_line_number, correct_mapping):
            return {}, False

        if erroneous_line_number == "FA":
            # The final answer is stored bare; restore the "FINAL ANSWER:" prefix
            # so it matches the corresponding line of the reconstructed answer.
            fa_content = wrong_mapping.get("FA", "")
            erroneous_line = f"FINAL ANSWER: {fa_content}" if fa_content else ""
        else:
            erroneous_line = wrong_mapping.get(erroneous_line_number, "")

        erroneous_line_eqn = wrong_eqn_mapping.get(erroneous_line_number, "")

        processed_row = {
            'index': problem_index,
            'tier': problem_tier,
            'question': cleaned_question,
            'correct_answer': correct_processed['correct_answer'],
            'wrong_answer': wrong_processed['wrong_answer'],
            'error_type': error_type,
            'explanation': explanation,
            'erroneous_line_number': erroneous_line_number,
            'erroneous_line': erroneous_line,
            'erroneous_line_eqn': erroneous_line_eqn,
            'correct_answer_mapping': correct_processed['correct_mapping'],
            'wrong_answer_mapping': wrong_processed['wrong_mapping'],
            'correct_eqn_mapping': correct_processed['correct_eqn_mapping'],
            'wrong_eqn_mapping': wrong_processed['wrong_eqn_mapping'],
            'correct_answer_length': correct_processed['correct_answer_length'],
            'wrong_answer_length': wrong_processed['wrong_answer_length'],
            'source': source,
            'error_subtype': error_subtype
        }

        return processed_row, True

    except Exception:
        # Best effort: callers record a failed row as problematic and keep going.
        return {}, False

In [81]:
def process_manual_catalog_with_new_pipeline(catalog_dict, gsm8k_train, tier_lists) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Process the manual catalog using the new pipeline with full mappings.

    Args:
        catalog_dict: Catalog name -> DataFrame; only the 'manual' entry is read.
            Its rows must provide 'index', 'erroneous_line_number',
            'wrong_answer', 'explanation' and 'error_type'.
        gsm8k_train: Sequence of GSM8K samples addressed by the catalog 'index'.
        tier_lists: Tier name -> collection of GSM8K indices.

    Returns:
        (clean_df, problematic_df). Problematic rows keep the original catalog
        columns plus 'error_reason' and 'source_catalog'.
    """
    print("=== Processing Manual Catalog with New Pipeline ===")

    df = catalog_dict['manual'].copy()
    print(f"Initial rows: {len(df)}")

    clean_rows = []
    problematic_rows = []
    n_problems = len(gsm8k_train)

    def reject(row, reason):
        """Record `row` as problematic together with the reason it was rejected."""
        problematic_rows.append({
            **row.to_dict(),
            'error_reason': reason,
            'source_catalog': 'manual'
        })

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing manual catalog"):
        try:
            problem_index = int(row['index'])

            if pd.isna(row['erroneous_line_number']):
                reject(row, 'Missing erroneous_line_number')
                continue

            # The lower bound matters: a negative index would otherwise be read
            # from the end of the dataset and silently pair the row with the
            # wrong problem.
            if not 0 <= problem_index < n_problems:
                reject(row, f'Index {problem_index} out of range')
                continue

            processed_row, success = process_single_catalog_row(
                gsm8k_problem=gsm8k_train[problem_index],
                wrong_answer_text=row['wrong_answer'],
                erroneous_line_number=row['erroneous_line_number'],
                explanation=row['explanation'],
                # The manual catalog stores a bare type; add the suffix used downstream.
                error_type=row['error_type'] + '_error',
                error_subtype='NA',
                source='manual',
                tier_lists=tier_lists,
                catalog_index=problem_index
            )

            if success:
                clean_rows.append(processed_row)
            else:
                reject(row, 'Processing failed in pipeline')

        except Exception as e:
            # Keep going on a bad row, but preserve the reason for later inspection.
            reject(row, f'Processing error: {str(e)}')

    clean_df = pd.DataFrame(clean_rows)
    problematic_df = pd.DataFrame(problematic_rows)

    print(f"Clean rows: {len(clean_df)}")
    print(f"Problematic rows: {len(problematic_df)}")

    return clean_df, problematic_df

def process_computational_catalog_with_new_pipeline(catalog_dict, gsm8k_train, tier_lists) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Process the computational catalog using the new pipeline with full mappings.

    Rows whose problem belongs to 'tier5' are excluded and reported as problematic.

    Args:
        catalog_dict: Catalog name -> DataFrame; only the 'computational' entry
            is read. Its rows must provide 'index', 'erroneous_line_number',
            'wrong_answer', 'explanation' and 'error_type' (used as the subtype).
        gsm8k_train: Sequence of GSM8K samples addressed by the catalog 'index'.
        tier_lists: Tier name -> collection of GSM8K indices.

    Returns:
        (clean_df, problematic_df). Problematic rows keep the original catalog
        columns plus 'error_reason' and 'source_catalog'.
    """
    print("=== Processing Computational Catalog with New Pipeline ===")

    df = catalog_dict['computational'].copy()
    print(f"Initial rows: {len(df)}")

    clean_rows = []
    problematic_rows = []
    n_problems = len(gsm8k_train)

    # Index -> tier lookup built once instead of scanning every tier list per row.
    # `setdefault` keeps the FIRST tier that lists an index, like a first-match scan.
    index_to_tier = {}
    for tier_name, indices in tier_lists.items():
        for tier_index in indices:
            index_to_tier.setdefault(tier_index, tier_name)

    def reject(row, reason):
        """Record `row` as problematic together with the reason it was rejected."""
        problematic_rows.append({
            **row.to_dict(),
            'error_reason': reason,
            'source_catalog': 'computational'
        })

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing computational catalog"):
        try:
            problem_index = int(row['index'])

            if pd.isna(row['erroneous_line_number']):
                reject(row, 'Missing erroneous_line_number')
                continue

            # The lower bound matters: a negative index would otherwise be read
            # from the end of the dataset and silently pair the row with the
            # wrong problem.
            if not 0 <= problem_index < n_problems:
                reject(row, f'Index {problem_index} out of range')
                continue

            if index_to_tier.get(problem_index) == 'tier5':
                reject(row, 'Excluded tier5 problem')
                continue

            processed_row, success = process_single_catalog_row(
                gsm8k_problem=gsm8k_train[problem_index],
                wrong_answer_text=row['wrong_answer'],
                erroneous_line_number=row['erroneous_line_number'],
                explanation=row['explanation'],
                error_type='computational_error',
                # The catalog's own 'error_type' becomes the subtype.
                error_subtype=row['error_type'],
                source='programmatic',
                tier_lists=tier_lists,
                catalog_index=problem_index
            )

            if success:
                clean_rows.append(processed_row)
            else:
                reject(row, 'Processing failed in pipeline')

        except Exception as e:
            # Keep going on a bad row, but preserve the reason for later inspection.
            reject(row, f'Processing error: {str(e)}')

    clean_df = pd.DataFrame(clean_rows)
    problematic_df = pd.DataFrame(problematic_rows)

    print(f"Clean rows: {len(clean_df)}")
    print(f"Problematic rows: {len(problematic_df)}")

    return clean_df, problematic_df

def process_validator_catalogs_with_new_pipeline(catalog_dict, gsm8k_train, tier_lists, validators, project_root) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Process all validator catalogs using the new pipeline with full mappings.

    For each validator, only rows with status 'accepted' are considered. Each
    row points (via 'filepath') to a JSON file from which the flawed solution,
    explanation, erroneous line identifier and error subtype are read.
    Rows whose problem belongs to 'tier5' are excluded and reported as problematic.

    Args:
        catalog_dict: Catalog name -> DataFrame; entries 'conceptual_<validator>'
            are read and must provide 'index', 'status' and 'filepath'.
        gsm8k_train: Sequence of GSM8K samples addressed by the catalog 'index'.
        tier_lists: Tier name -> collection of GSM8K indices.
        validators: Iterable of validator names.
        project_root: Base `Path` used to resolve relative 'filepath' values.

    Returns:
        (clean_df, problematic_df) accumulated over all validators. Problematic
        rows keep the original catalog columns plus 'error_reason' and
        'source_catalog'.
    """
    print("=== Processing Validator Catalogs with New Pipeline ===")

    all_clean_rows = []
    all_problematic_rows = []
    n_problems = len(gsm8k_train)

    # Index -> tier lookup built once instead of scanning every tier list per row.
    # `setdefault` keeps the FIRST tier that lists an index, like a first-match scan.
    index_to_tier = {}
    for tier_name, indices in tier_lists.items():
        for tier_index in indices:
            index_to_tier.setdefault(tier_index, tier_name)

    def check_file_exists_and_get_path(filepath, base_dir=None):
        """
        Resolve a catalog 'filepath' and report whether it exists.

        Backslashes are converted to forward slashes so paths recorded on Windows
        resolve elsewhere; relative paths are anchored at `project_root`.
        `base_dir` is accepted but not used.

        Returns:
            (None, False) for a missing/empty value, else (resolved_path, exists).
        """
        if pd.isna(filepath) or filepath == "":
            return None, False

        file_path = Path(str(filepath).replace('\\', '/'))
        full_path = file_path if file_path.is_absolute() else project_root / file_path
        return full_path, full_path.exists()

    def fix_answer_formatting(wrong_answer: str) -> str:
        """
        Move the "####" final-answer line to the end of the answer, wherever it was.

        Blank lines are dropped. If several "####" lines exist, only the last
        one is kept. Non-string input and text without a "####" line are
        returned unchanged.
        """
        if not isinstance(wrong_answer, str):
            return wrong_answer

        final_answer_line = None
        other_lines = []
        for line in wrong_answer.strip().split('\n'):
            stripped = line.strip()
            if re.match(r'^\s*####\s*.*$', stripped):
                final_answer_line = stripped
            elif stripped:
                other_lines.append(line)

        if not final_answer_line:
            return wrong_answer
        if other_lines:
            return '\n'.join(other_lines) + '\n' + final_answer_line
        return final_answer_line

    def reject(row, reason, source_catalog):
        """Record `row` as problematic together with the reason it was rejected."""
        all_problematic_rows.append({
            **row.to_dict(),
            'error_reason': reason,
            'source_catalog': source_catalog
        })

    for validator in validators:
        print(f"\nProcessing validator: {validator}")
        source_catalog = f'conceptual_{validator}'
        df = catalog_dict[source_catalog].copy()

        # Filter to only accepted samples
        df = df[df['status'] == 'accepted']
        print(f"Accepted rows for {validator}: {len(df)}")

        for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Processing {validator}"):
            try:
                problem_index = int(row['index'])

                # The lower bound matters: a negative index would otherwise be
                # read from the end of the dataset and silently pair the row
                # with the wrong problem.
                if not 0 <= problem_index < n_problems:
                    reject(row, f'Index {problem_index} out of range', source_catalog)
                    continue

                if index_to_tier.get(problem_index) == 'tier5':
                    reject(row, 'Excluded tier5 problem', source_catalog)
                    continue

                # Load the per-sample JSON file
                try:
                    filepath, file_exists = check_file_exists_and_get_path(row['filepath'])

                    if not file_exists or filepath is None:
                        reject(row, f'JSON file not found: {row["filepath"]}', source_catalog)
                        continue

                    with open(filepath, 'r', encoding='utf-8') as f:
                        json_data = json.load(f)

                except Exception as e:
                    reject(row, f'JSON loading error: {str(e)}', source_catalog)
                    continue

                # Extract the fields needed by the pipeline
                try:
                    raw_wrong_answer = json_data['context']['flawed_solution']
                    explanation = json_data['error_details']['explanation']
                    erroneous_line_number = json_data['error_details']['erroneous_line_number']
                    error_subtype = json_data['error_details']['error_type']
                except KeyError as e:
                    reject(row, f'Missing JSON field: {str(e)}', source_catalog)
                    continue

                wrong_answer = fix_answer_formatting(raw_wrong_answer)

                # The identifier may be JSON null or the literal string "null".
                if erroneous_line_number is None or erroneous_line_number == "null":
                    reject(row, 'Null erroneous_line_number in JSON', source_catalog)
                    continue

                processed_row, success = process_single_catalog_row(
                    gsm8k_problem=gsm8k_train[problem_index],
                    wrong_answer_text=wrong_answer,
                    erroneous_line_number=erroneous_line_number,
                    explanation=explanation,
                    error_type='conceptual_error',
                    error_subtype=error_subtype,
                    source='programmatic',
                    tier_lists=tier_lists,
                    catalog_index=problem_index
                )

                if success:
                    all_clean_rows.append(processed_row)
                else:
                    reject(row, 'Processing failed in pipeline', source_catalog)

            except Exception as e:
                # Keep going on a bad row, but preserve the reason for later inspection.
                reject(row, f'Processing error: {str(e)}', source_catalog)

    clean_df = pd.DataFrame(all_clean_rows)
    problematic_df = pd.DataFrame(all_problematic_rows)

    print(f"\nTotal clean rows: {len(clean_df)}")
    print(f"Total problematic rows: {len(problematic_df)}")

    return clean_df, problematic_df

In [82]:
# Column order of the final master catalog. The names match the keys of the
# row dict built by process_single_catalog_row; the master catalog is
# re-indexed to exactly this list after the clean frames are concatenated.
MASTER_CATALOG_COLUMNS = [
    'index', 
    'tier', 
    'question', 
    'correct_answer', 
    'wrong_answer', 
    'error_type', 
    'explanation', 
    'erroneous_line_number', 
    'erroneous_line', 
    'erroneous_line_eqn', 
    'correct_answer_mapping', 
    'wrong_answer_mapping',
    'correct_eqn_mapping', 
    'wrong_eqn_mapping', 
    'correct_answer_length',
    'wrong_answer_length', 
    'source', 
    'error_subtype'
]

def create_master_catalogs_with_new_structure(manual_clean, computational_clean, validator_clean, 
                          manual_problematic, computational_problematic, validator_problematic):
    """
    Merge the per-source dataframes into one master catalog and one problematic catalog.

    Args:
        manual_clean, computational_clean, validator_clean: Clean rows per source.
        manual_problematic, computational_problematic, validator_problematic:
            Rejected rows per source.

    Returns:
        (master_catalog, catalog_problematic). Empty inputs are skipped; when every
        input of a group is empty the corresponding result is a bare empty DataFrame.
        A non-empty master catalog is restricted to MASTER_CATALOG_COLUMNS, in that order.
    """
    print("=== Creating Master Catalogs with New Structure ===")

    def _stack(frames):
        # Concatenate only the frames that actually contain rows.
        populated = [frame for frame in frames if not frame.empty]
        if populated:
            return pd.concat(populated, ignore_index=True)
        return pd.DataFrame()

    master_catalog = _stack((manual_clean, computational_clean, validator_clean))
    catalog_problematic = _stack(
        (manual_problematic, computational_problematic, validator_problematic)
    )

    # Enforce the canonical column order (only meaningful when there are rows)
    if not master_catalog.empty:
        master_catalog = master_catalog[MASTER_CATALOG_COLUMNS]

    print(f"Master catalog rows: {len(master_catalog)}")
    print(f"Problematic rows: {len(catalog_problematic)}")

    return master_catalog, catalog_problematic

In [83]:
# 1. Run the full pipeline
# Each process_* call below is unpacked into a (clean, problematic) pair; the six
# results are then merged by create_master_catalogs_with_new_structure.
print("🚀 Starting Full Pipeline Execution")
print("=" * 80)

# Process manual catalog
manual_clean, manual_problematic = process_manual_catalog_with_new_pipeline(
    CATALOG_DICT, GSM8K_TRAIN, TIER_LISTS
)

# Process computational catalog
computational_clean, computational_problematic = process_computational_catalog_with_new_pipeline(
    CATALOG_DICT, GSM8K_TRAIN, TIER_LISTS
)

# Process validator catalogs (one catalog per name in VALIDATORS)
validator_clean, validator_problematic = process_validator_catalogs_with_new_pipeline(
    CATALOG_DICT, GSM8K_TRAIN, TIER_LISTS, VALIDATORS, PROJECT_ROOT
)

# Create master catalogs: clean frames first, problematic frames second
master_catalog, catalog_problematic = create_master_catalogs_with_new_structure(
    manual_clean, computational_clean, validator_clean,
    manual_problematic, computational_problematic, validator_problematic
)

# Final row counts, formatted with thousands separators
print("\n🎉 Pipeline Execution Complete!")
print(f"✅ Master catalog: {len(master_catalog):,} rows")
print(f"❌ Problematic rows: {len(catalog_problematic):,} rows")

🚀 Starting Full Pipeline Execution
=== Processing Manual Catalog with New Pipeline ===
Initial rows: 1963


Processing manual catalog:   0%|          | 0/1963 [00:00<?, ?it/s]

Clean rows: 1740
Problematic rows: 223
=== Processing Computational Catalog with New Pipeline ===
Initial rows: 22623


Processing computational catalog:   0%|          | 0/22623 [00:00<?, ?it/s]

Clean rows: 21768
Problematic rows: 855
=== Processing Validator Catalogs with New Pipeline ===

Processing validator: ali
Accepted rows for ali: 341


Processing ali:   0%|          | 0/341 [00:00<?, ?it/s]


Processing validator: arvind
Accepted rows for arvind: 91


Processing arvind:   0%|          | 0/91 [00:00<?, ?it/s]


Processing validator: mauro
Accepted rows for mauro: 312


Processing mauro:   0%|          | 0/312 [00:00<?, ?it/s]


Processing validator: ling
Accepted rows for ling: 110


Processing ling:   0%|          | 0/110 [00:00<?, ?it/s]


Processing validator: yewei
Accepted rows for yewei: 290


Processing yewei:   0%|          | 0/290 [00:00<?, ?it/s]


Total clean rows: 1144
Total problematic rows: 0
=== Creating Master Catalogs with New Structure ===
Master catalog rows: 24652
Problematic rows: 1078

🎉 Pipeline Execution Complete!
✅ Master catalog: 24,652 rows
❌ Problematic rows: 1,078 rows


In [84]:
# 2. Pretty print one randomly chosen row from each catalog source
import json

def pretty_print_sample_rows_by_source(master_catalog, n_conceptual_samples=5, random_state=None):
    """
    Pretty print randomly chosen rows from each kind of catalog source.

    Shows one row with source 'manual', one programmatic computational-error row,
    and up to ``n_conceptual_samples`` *distinct* programmatic conceptual-error rows.
    The master catalog does not record which validator a conceptual row came from,
    so those rows are labelled as numbered conceptual samples rather than being
    attributed to a validator.

    Args:
        master_catalog: DataFrame with at least 'source' and 'error_type' columns,
            plus every column read by print_sample_row.
        n_conceptual_samples: Maximum number of conceptual-error rows to display.
        random_state: Optional seed forwarded to DataFrame.sample for reproducible
            output; None keeps the selection random.

    Returns:
        None. All output goes to stdout.
    """
    print("🎲 RANDOM SAMPLE FROM EACH SOURCE CATALOG")
    print("=" * 80)

    if master_catalog.empty:
        print("❌ Master catalog is empty!")
        return

    displayed = 0
    is_programmatic = master_catalog['source'] == 'programmatic'

    # First priority: manual source
    manual_rows = master_catalog[master_catalog['source'] == 'manual']
    if not manual_rows.empty:
        print_sample_row(manual_rows.sample(1, random_state=random_state).iloc[0], 'Manual Catalog')
        displayed += 1

    # Second priority: computational (programmatic source + computational_error)
    programmatic_computational = master_catalog[
        is_programmatic & (master_catalog['error_type'] == 'computational_error')
    ]
    if not programmatic_computational.empty:
        print_sample_row(
            programmatic_computational.sample(1, random_state=random_state).iloc[0],
            'Computational Catalog'
        )
        displayed += 1

    # Third priority: conceptual validators (programmatic source + conceptual_error)
    programmatic_conceptual = master_catalog[
        is_programmatic & (master_catalog['error_type'] == 'conceptual_error')
    ]

    # Draw without replacement so no row is shown twice, and never ask for more
    # rows than exist.
    n_conceptual = min(n_conceptual_samples, len(programmatic_conceptual))
    if n_conceptual > 0:
        conceptual_rows = programmatic_conceptual.sample(n=n_conceptual, random_state=random_state)
        for position in range(n_conceptual):
            print_sample_row(conceptual_rows.iloc[position], f"Conceptual Error Sample #{position + 1}")
            displayed += 1
    else:
        n_conceptual = 0

    print(f"\n📊 Summary: Displayed {displayed} sample rows ({n_conceptual} conceptual)")

def print_sample_row(sample, source_name):
    """
    Print every field of a single catalog row as a labelled report.

    Args:
        sample: Mapping-like row (e.g. a pandas Series) holding the master-catalog fields.
        source_name: Label for the report banner; it is upper-cased before printing.
    """
    def _titled_block(heading, key):
        # Blank line, heading, then the raw field value on its own line.
        print(f"\n{heading}")
        print(f"{sample[key]}")

    banner = source_name.upper()
    print(f"\n🔍 {banner}")
    print("-" * 60)
    print(f"📋 Index: {sample['index']} | Tier: {sample['tier']} | Error Type: {sample['error_type']}")

    # One-line "label: value" fields describing where the error sits
    for label, key in (
        ("📝 Error Subtype", 'error_subtype'),
        ("🎯 Erroneous Line Number", 'erroneous_line_number'),
        ("🔴 Erroneous Line", 'erroneous_line'),
        ("🧮 Erroneous Line Equation", 'erroneous_line_eqn'),
    ):
        print(f"{label}: {sample[key]}")

    # Full-text fields
    for heading, key in (
        ("❓ Question:", 'question'),
        ("✅ Correct Answer (Full):", 'correct_answer'),
        ("❌ Wrong Answer (Full):", 'wrong_answer'),
        ("💡 Explanation:", 'explanation'),
    ):
        _titled_block(heading, key)

    print(f"\n📊 Answer Lengths:")
    for side, key in (("Correct", 'correct_answer_length'), ("Wrong", 'wrong_answer_length')):
        print(f"   {side}: {sample[key]} lines")

    # Line/equation mappings, printed verbatim
    for heading, key in (
        ("🗂️ Correct Answer Mapping (JSON):", 'correct_answer_mapping'),
        ("🗂️ Wrong Answer Mapping (JSON):", 'wrong_answer_mapping'),
        ("🧮 Correct Equation Mapping (JSON):", 'correct_eqn_mapping'),
        ("🧮 Wrong Equation Mapping (JSON):", 'wrong_eqn_mapping'),
    ):
        _titled_block(heading, key)

    print(f"\n🏷️ Source: {sample['source']}")

    print("=" * 80)

# Print randomly sampled rows from the master catalog built above as a quick visual sanity check
pretty_print_sample_rows_by_source(master_catalog)

🎲 RANDOM SAMPLE FROM EACH SOURCE CATALOG

🔍 MANUAL CATALOG
------------------------------------------------------------
📋 Index: 1184 | Tier: tier1 | Error Type: computational_error
📝 Error Subtype: NA
🎯 Erroneous Line Number: L3
🔴 Erroneous Line: A medium bed can hold 3 rows with 20 seeds sown per row, 3 * 20=50 seeds per medium bed.
🧮 Erroneous Line Equation: 3*20=50

❓ Question:
Grace is looking to plant some lettuce in her raised bed garden. Her raised bed is comprised of 2 large beds on top with 2 medium beds on the bottom. The top bed can hold 4 rows of lettuce with 25 seeds being sown per row. The medium bed can house 3 rows with 20 seeds being sown per row. How many seeds can Grace plant in all four beds of her raised bed garden?

✅ Correct Answer (Full):
A large bed can hold 4 rows with 25 seeds per row, 4 * 25=100 seeds per large bed
100 seeds per large bed and there are 2 beds, 100 * 2= 200 seeds needed in total for both large beds.
A medium bed can hold 3 rows with 20 seeds

In [85]:
# 3. Save the master catalog to a new folder "../data/aug-5-dataset"
from pathlib import Path
import shutil

# Create the new directory
AUG_5_DATASET_DIR = DATA_DIR / "aug-5-dataset"
AUG_5_DATASET_DIR.mkdir(parents=True, exist_ok=True)

print(f"💾 SAVING MASTER CATALOG TO: {AUG_5_DATASET_DIR}")
print("=" * 80)

# Save master catalog
master_catalog_path = AUG_5_DATASET_DIR / "master_catalog.csv"
master_catalog.to_csv(master_catalog_path, index=False)
print(f"✅ Master catalog saved: {master_catalog_path}")
print(f"   📊 {len(master_catalog):,} rows with {len(MASTER_CATALOG_COLUMNS)} columns")

# Save problematic catalog
problematic_catalog_path = AUG_5_DATASET_DIR / "catalog_problematic.csv"
catalog_problematic.to_csv(problematic_catalog_path, index=False)
print(f"✅ Problematic catalog saved: {problematic_catalog_path}")
print(f"   📊 {len(catalog_problematic):,} problematic rows")


def _write_distribution(handle, header, counts, total):
    """Write `header`, then one "  label: count (pct%)" line per entry of `counts`."""
    handle.write(header)
    for label, count in counts.items():
        pct = (count / total) * 100
        handle.write(f"  {label}: {count:,} ({pct:.1f}%)\n")


# Save summary statistics
summary_path = AUG_5_DATASET_DIR / "dataset_summary.txt"

# Compute every statistic before opening the file, so a missing column raises
# before any partial summary has been written to disk.
total_rows = len(master_catalog)
unique_gsm8k_indices = master_catalog['index'].nunique()
source_counts = master_catalog['source'].value_counts()
error_type_counts = master_catalog['error_type'].value_counts()
tier_counts = master_catalog['tier'].value_counts().sort_index()

with open(summary_path, 'w', encoding='utf-8') as f:
    f.write("AUG-5 DATASET SUMMARY\n")
    f.write("=" * 50 + "\n\n")
    f.write(f"Generated on: August 5, 2025\n")
    f.write(f"Total master catalog rows: {total_rows:,}\n")
    f.write(f"Total problematic rows: {len(catalog_problematic):,}\n")
    f.write(f"Unique GSM8K indices: {unique_gsm8k_indices:,}\n\n")

    _write_distribution(f, "SOURCE DISTRIBUTION:\n", source_counts, total_rows)
    _write_distribution(f, "\nERROR TYPE DISTRIBUTION:\n", error_type_counts, total_rows)
    _write_distribution(f, "\nTIER DISTRIBUTION:\n", tier_counts, total_rows)

    f.write(f"\nCOLUMNS ({len(MASTER_CATALOG_COLUMNS)}):\n")
    for i, col in enumerate(MASTER_CATALOG_COLUMNS, 1):
        f.write(f"  {i:2d}. {col}\n")

print(f"✅ Summary statistics saved: {summary_path}")

def get_column_description(col_name):
    """
    Look up the human-readable description of a master-catalog column.

    Args:
        col_name: Column name to describe.

    Returns:
        The description string, or 'No description available' for unknown names.
    """
    described_columns = (
        ('index', 'GSM8K problem index'),
        ('tier', 'Problem difficulty tier (tier1-tier5)'),
        ('question', 'Math problem question text'),
        ('correct_answer', 'Correct solution with FINAL ANSWER prefix'),
        ('wrong_answer', 'Flawed solution with FINAL ANSWER prefix'),
        ('error_type', 'Type of error (computational_error/conceptual_error)'),
        ('explanation', 'Human explanation of the error'),
        ('erroneous_line_number', 'Line identifier containing the error (L1, L2, FA)'),
        ('erroneous_line', 'Text content of the erroneous line'),
        ('erroneous_line_eqn', 'Calculator equation from erroneous line'),
        ('correct_answer_mapping', 'JSON mapping of line IDs to correct solution lines'),
        ('wrong_answer_mapping', 'JSON mapping of line IDs to wrong solution lines'),
        ('correct_eqn_mapping', 'JSON mapping of line IDs to correct calculator equations'),
        ('wrong_eqn_mapping', 'JSON mapping of line IDs to wrong calculator equations'),
        ('correct_answer_length', 'Number of lines in correct solution'),
        ('wrong_answer_length', 'Number of lines in wrong solution'),
        ('source', 'Data source (manual/programmatic)'),
        ('error_subtype', 'Detailed error subtype classification'),
    )
    lookup = dict(described_columns)
    try:
        return lookup[col_name]
    except KeyError:
        # Unknown column: fall back to the generic placeholder
        return 'No description available'

# Save column schema: one entry per name in MASTER_CATALOG_COLUMNS, keyed by column name
schema_path = AUG_5_DATASET_DIR / "column_schema.json"
schema_info = {
    "dataset_name": "aug-5-dataset", 
    "creation_date": "2025-08-05",
    "total_columns": len(MASTER_CATALOG_COLUMNS),
    "columns": {
        col: {
            # 1-based position within MASTER_CATALOG_COLUMNS
            "position": i + 1,
            # dtype as currently held in memory; "unknown" if the column is absent from master_catalog
            "data_type": str(master_catalog[col].dtype) if col in master_catalog.columns else "unknown",
            "description": get_column_description(col)
        }
        for i, col in enumerate(MASTER_CATALOG_COLUMNS)
    }
}

with open(schema_path, 'w') as f:
    json.dump(schema_info, f, indent=2)

print(f"✅ Column schema saved: {schema_path}")

# List all files in the new directory
# (every entry returned by iterdir() is listed, including files this cell did not write)
print(f"\n📁 FILES IN {AUG_5_DATASET_DIR.name}/:")
for file_path in sorted(AUG_5_DATASET_DIR.iterdir()):
    file_size = file_path.stat().st_size / (1024 * 1024)  # bytes -> MB; one decimal, so small files print as 0.0 MB
    print(f"   📄 {file_path.name} ({file_size:.1f} MB)")

print(f"\n🎉 Dataset successfully saved to: {AUG_5_DATASET_DIR}")
print("✨ Ready for downstream processing and analysis!")

💾 SAVING MASTER CATALOG TO: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/aug-5-dataset
✅ Master catalog saved: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/aug-5-dataset/master_catalog.csv
   📊 24,652 rows with 18 columns
✅ Problematic catalog saved: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/aug-5-dataset/catalog_problematic.csv
   📊 1,078 problematic rows
✅ Summary statistics saved: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/aug-5-dataset/dataset_summary.txt
✅ Column schema saved: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/aug-5-dataset/column_schema.json

📁 FILES IN aug-5-dataset/:
   📄 catalog_problematic.csv (0.4 MB)
   📄 column_schema.json (0.0 MB)
   📄 dataset_summary.txt (0.0 MB)
   📄 manual_length_mismatch.csv (0.3 MB)
   📄 master_catalog.csv (47.0 MB)

🎉 Dataset successfully saved to: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/aug-5-dataset
✨ Ready for downstrea

In [86]:
# Read the CSV and examine the data types and a sample row
import pandas as pd
import json

# Read the saved master catalog back from disk to inspect what the CSV round-trip
# did to the column dtypes and to the JSON-encoded mapping columns.
df = pd.read_csv(master_catalog_path)

print("📊 DATAFRAME INFO:")
print("=" * 50)
df.info()

print("\n📋 COLUMN DTYPES:")
print("=" * 50)
for col in df.columns:
    print(f"{col}: {df[col].dtype}")

print("\n🔍 SAMPLE ROW (First Row):")
print("=" * 50)
sample_row = df.iloc[0]
for col, value in sample_row.items():
    print(f"\n{col}:")
    print(f"  Type: {type(value)}")
    print(f"  Value: {repr(value)}")  # repr shows the actual string representation
    
    # Special handling for JSON columns: check that the stored text still parses as JSON
    if col.endswith('_mapping'):
        print(f"  First 100 chars: {str(value)[:100]}...")
        try:
            parsed = json.loads(value)
            print(f"  JSON Parse: SUCCESS - {type(parsed)}")
        except Exception as e:
            # Broad catch on purpose: any failure is reported instead of aborting the loop
            print(f"  JSON Parse: FAILED - {str(e)}")

print("\n🧪 JSON PARSING TEST:")
print("=" * 50)
# Test parsing the JSON columns (first row only)
json_columns = ['correct_answer_mapping', 'wrong_answer_mapping', 'correct_eqn_mapping', 'wrong_eqn_mapping']

for col in json_columns:
    print(f"\nTesting {col}:")
    test_value = df.iloc[0][col]
    print(f"Raw value: {repr(test_value)}")
    
    # NOTE(review): only json.JSONDecodeError is caught here; a non-string value
    # (for example a NaN read back from an empty cell) would raise TypeError and
    # stop the cell, unlike the broad except in the loop above.
    try:
        parsed = json.loads(test_value)
        print(f"✅ Parsed successfully: {type(parsed)}")
        print(f"   Sample content: {list(parsed.keys())[:3]}")
    except json.JSONDecodeError as e:
        print(f"❌ JSON parsing failed: {e}")
        
        # Try to fix the escaping issue by collapsing doubled quotes
        # (presumably left over from CSV quote escaping - TODO confirm)
        try:
            fixed_value = test_value.replace('""', '"')
            parsed = json.loads(fixed_value)
            print(f"✅ Fixed and parsed: {type(parsed)}")
        except Exception as e2:
            print(f"❌ Still failed after fixing: {e2}")

📊 DATAFRAME INFO:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24652 entries, 0 to 24651
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   index                   24652 non-null  int64 
 1   tier                    24652 non-null  object
 2   question                24652 non-null  object
 3   correct_answer          24652 non-null  object
 4   wrong_answer            24652 non-null  object
 5   error_type              24652 non-null  object
 6   explanation             24651 non-null  object
 7   erroneous_line_number   24652 non-null  object
 8   erroneous_line          24382 non-null  object
 9   erroneous_line_eqn      22616 non-null  object
 10  correct_answer_mapping  24652 non-null  object
 11  wrong_answer_mapping    24652 non-null  object
 12  correct_eqn_mapping     24652 non-null  object
 13  wrong_eqn_mapping       24652 non-null  object
 14  correct_answer_length   24652 non-nu

In [90]:
# Drop rows that lack an erroneous line or an explanation; the df.info() output
# recorded above reports nulls in both columns. Rebinds `df`, so later cells see
# only the filtered rows.
df = df.dropna(subset=['erroneous_line', 'explanation'])

In [None]:
import pandas as pd
import json
import numpy as np
from collections import defaultdict

def _select_by_tier_priority(candidates, tier_priorities, total_slots, random_state):
    """
    Sample rows tier by tier, in priority order, until `total_slots` rows are taken.

    Args:
        candidates: DataFrame with a 'tier' column to sample from.
        tier_priorities: Ordered (tier, per-tier limit) pairs.
        total_slots: Maximum number of rows to select across all tiers.
        random_state: Seed passed to DataFrame.sample for every tier.

    Returns:
        (selected_frames, remaining_slots): the per-tier samples, and how many of
        the requested slots could not be filled.
    """
    selected_frames = []
    remaining_slots = total_slots

    for tier, tier_limit in tier_priorities:
        if remaining_slots <= 0:
            break

        tier_candidates = candidates[candidates['tier'] == tier]
        available_count = len(tier_candidates)
        actual_count = min(tier_limit, available_count, remaining_slots)

        if actual_count > 0:
            # Randomly sample from this tier
            selected_frames.append(
                tier_candidates.sample(n=actual_count, random_state=random_state).copy()
            )
            remaining_slots -= actual_count
            print(f"   {tier}: {actual_count:,} selected (available: {available_count:,}, limit: {tier_limit:,})")

    return selected_frames, remaining_slots


def _assign_train_test_split(dataset, error_types, train_fraction=0.8, base_seed=44):
    """
    Overwrite dataset['split'] in place with a per-error-type train/test split.

    Each error type is shuffled with its own local generator, seeded from
    `base_seed` plus the type's position in `error_types`. The seed therefore
    does not depend on Python's per-process string hashing, and the global
    NumPy random state is left untouched.
    """
    for offset, error_type in enumerate(error_types):
        type_indices = dataset.index[dataset['error_type'] == error_type].to_numpy()
        if len(type_indices) == 0:
            continue

        rng = np.random.default_rng(base_seed + offset)  # Different seed per type
        shuffled_indices = rng.permutation(type_indices)
        n_train = int(train_fraction * len(shuffled_indices))

        dataset.loc[shuffled_indices[:n_train], 'split'] = 'train'
        dataset.loc[shuffled_indices[n_train:], 'split'] = 'test'


def _print_dataset_summary(final_dataset, error_types):
    """Print size, per-type, per-split, per-tier, per-source and index-overlap statistics."""
    total = len(final_dataset)

    print(f"\n🎯 FINAL DATASET SUMMARY")
    print("=" * 50)
    print(f"Total samples: {total:,}")

    # Summary by error type
    print(f"\nBy Error Type:")
    for error_type in error_types:
        type_data = final_dataset[final_dataset['error_type'] == error_type]
        if len(type_data) > 0:
            train_count = len(type_data[type_data['split'] == 'train'])
            test_count = len(type_data[type_data['split'] == 'test'])
            unique_indices = type_data['index'].nunique()
            print(f"  {error_type}: {len(type_data):,} (train: {train_count:,}, test: {test_count:,}, unique indices: {unique_indices:,})")

    # Summary by split
    print(f"\nBy Split:")
    for split, count in final_dataset['split'].value_counts().items():
        pct = (count / total) * 100
        print(f"  {split}: {count:,} ({pct:.1f}%)")

    # Summary by tier
    print(f"\nBy Tier:")
    for tier, count in final_dataset['tier'].value_counts().sort_index().items():
        pct = (count / total) * 100
        print(f"  {tier}: {count:,} ({pct:.1f}%)")

    # Summary by source
    print(f"\nBy Source:")
    for source, count in final_dataset['source'].value_counts().items():
        pct = (count / total) * 100
        print(f"  {source}: {count:,} ({pct:.1f}%)")

    # Check overlap between error types
    conceptual_idx = set(final_dataset[final_dataset['error_type'] == 'conceptual_error']['index'])
    computational_idx = set(final_dataset[final_dataset['error_type'] == 'computational_error']['index'])
    correct_idx = set(final_dataset[final_dataset['error_type'] == 'correct']['index'])

    print(f"\nIndex Overlap Analysis:")
    print(f"  Conceptual ∩ Computational: {len(conceptual_idx & computational_idx):,} indices")
    print(f"  Conceptual ∩ Correct: {len(conceptual_idx & correct_idx):,} indices")
    print(f"  Computational ∩ Correct: {len(computational_idx & correct_idx):,} indices")


def create_error_detection_dataset_simplified(df):
    """
    Create error detection dataset with simplified correct sample logic.

    The dataset is assembled in three steps:
      1. every row of `df` whose error_type is 'conceptual_error' (N rows);
      2. up to N 'computational_error' rows whose 'index' does not occur among the
         conceptual rows, picked by tier priority (tier4, tier2, tier3, tier1,
         at most 500 per tier);
      3. up to 2000 rows drawn from all of `df` by the same tier priority and
         relabelled with error_type 'correct' (their other columns are unchanged).

    A 'split' column is then assigned with a 4:1 train/test ratio inside each
    error type. The split uses fixed, process-independent seeds, so it is
    reproducible across kernel restarts and does not reseed the global NumPy RNG.

    Args:
        df: Master-catalog DataFrame with at least the columns 'index', 'tier',
            'error_type' and 'source'.

    Returns:
        DataFrame containing the selected rows plus the 'split' column.
    """
    print("🔧 CREATING ERROR DETECTION DATASET (SIMPLIFIED)")
    print("=" * 70)

    # Tier priorities: tier4 (500), tier2 (500), tier3 (500), tier1 (500)
    tier_priorities = [
        ('tier4', 500),
        ('tier2', 500),
        ('tier3', 500),
        ('tier1', 500)
    ]
    error_types = ['correct', 'conceptual_error', 'computational_error']

    selected_samples = []

    # Step 1: Get all conceptual error samples
    print(f"\n📋 Step 1: Processing CONCEPTUAL ERROR samples...")
    conceptual_samples = df[df['error_type'] == 'conceptual_error'].copy()
    N = len(conceptual_samples)

    print(f"   Found N = {N:,} conceptual error samples")

    # GSM8K indices already covered by conceptual samples
    conceptual_indices = set(conceptual_samples['index'].tolist())

    # Add split column (will assign later)
    conceptual_samples['split'] = 'train'  # Default, will be reassigned
    selected_samples.append(conceptual_samples)

    print(f"   ✅ Added {len(conceptual_samples):,} conceptual error samples")

    # Step 2: Add N computational error samples (distinct problem indices)
    print(f"\n📋 Step 2: Processing COMPUTATIONAL ERROR samples...")
    print(f"   Target: {N:,} samples with distinct indices from conceptual samples")

    # Get computational candidates excluding conceptual indices
    computational_candidates = df[
        (df['error_type'] == 'computational_error') &
        (~df['index'].isin(conceptual_indices))
    ].copy()

    print(f"   Available distinct computational candidates: {len(computational_candidates):,}")

    selected_computational, computational_shortfall = _select_by_tier_priority(
        computational_candidates, tier_priorities, N, random_state=42
    )

    # Combine computational samples
    if selected_computational:
        computational_samples = pd.concat(selected_computational, ignore_index=True)
        computational_samples['split'] = 'train'  # Default, will be reassigned
        selected_samples.append(computational_samples)

        print(f"   ✅ Added {len(computational_samples):,} computational error samples")

        # Show tier distribution
        tier_dist = computational_samples['tier'].value_counts().sort_index()
        print(f"      Tier distribution: {dict(tier_dist)}")

        # The per-tier limits (or a lack of candidates) can leave the target unmet
        if computational_shortfall > 0:
            print(f"   ⚠️  Only {len(computational_samples):,} of {N:,} requested computational samples could be selected")
    else:
        print(f"   ❌ Could not select enough computational samples")

    # Step 3: Add 2000 randomly chosen correct samples (following tier priority)
    print(f"\n📋 Step 3: Processing CORRECT samples...")
    print(f"   Target: 2000 randomly chosen correct samples (following tier priority)")

    # Get all available candidates (don't worry about used indices)
    correct_candidates = df.copy()

    print(f"   Available correct candidates: {len(correct_candidates):,}")

    selected_correct, _ = _select_by_tier_priority(
        correct_candidates, tier_priorities, 2000, random_state=45
    )

    # Combine correct samples
    if selected_correct:
        correct_samples = pd.concat(selected_correct, ignore_index=True)
        correct_samples['error_type'] = 'correct'
        correct_samples['split'] = 'train'  # Default, will be reassigned
        selected_samples.append(correct_samples)

        print(f"   ✅ Added {len(correct_samples):,} correct samples")

        # Show tier distribution
        tier_dist = correct_samples['tier'].value_counts().sort_index()
        print(f"      Tier distribution: {dict(tier_dist)}")

        # Check for distinctness within correct samples
        unique_correct_indices = correct_samples['index'].nunique()
        total_correct_samples = len(correct_samples)
        print(f"      Unique indices in correct samples: {unique_correct_indices:,} / {total_correct_samples:,}")
        if unique_correct_indices == total_correct_samples:
            print("      ✅ All correct samples have distinct indices")
        else:
            print(f"      ⚠️  {total_correct_samples - unique_correct_indices} duplicate indices in correct samples")
    else:
        print(f"   ❌ Could not select correct samples")

    # Combine all samples. selected_samples always holds at least the (possibly
    # empty) conceptual frame, so there is always something to concatenate.
    final_dataset = pd.concat(selected_samples, ignore_index=True)

    # Add proper train/test split (4:1 ratio within each error type)
    print(f"\n📋 Adding train/test splits...")
    _assign_train_test_split(final_dataset, error_types, train_fraction=0.8, base_seed=44)

    _print_dataset_summary(final_dataset, error_types)

    return final_dataset

# Create the simplified error detection dataset from the filtered catalog `df`
error_detection_dataset = create_error_detection_dataset_simplified(df)

# Save the dataset next to the master catalog; nothing is written when the result is empty
if not error_detection_dataset.empty:
    output_path = AUG_5_DATASET_DIR / "error_detection_dataset.csv"
    error_detection_dataset.to_csv(output_path, index=False)
    print(f"\n💾 Dataset saved to: {output_path}")
else:
    print("❌ Dataset creation failed!")

🔧 CREATING ERROR DETECTION DATASET (SIMPLIFIED)

📋 Step 1: Processing CONCEPTUAL ERROR samples...
   Found N = 2,067 conceptual error samples
   ✅ Added 2,067 conceptual error samples

📋 Step 2: Processing COMPUTATIONAL ERROR samples...
   Target: 2,067 samples with distinct indices from conceptual samples
   Available distinct computational candidates: 16,204
   tier4: 500 selected (available: 787, limit: 500)
   tier2: 500 selected (available: 2,051, limit: 500)
   tier3: 500 selected (available: 5,505, limit: 500)
   tier1: 500 selected (available: 7,851, limit: 500)
   ✅ Added 2,000 computational error samples
      Tier distribution: {'tier1': 500, 'tier2': 500, 'tier3': 500, 'tier4': 500}

📋 Step 3: Processing CORRECT samples...
   Target: 2000 randomly chosen correct samples (following tier priority)
   Available correct candidates: 24,376
   tier4: 500 selected (available: 1,637, limit: 500)
   tier2: 500 selected (available: 3,436, limit: 500)
   tier3: 500 selected (available

TypeError: Object of type int64 is not JSON serializable

In [114]:
def check_annotation_coverage(dataset_df, verbose=True):
    """
    Check how many samples in the dataset are missing complete calculator annotation coverage.

    A sample has complete coverage when the number of non-empty equations in its
    equation mapping (ignoring the 'FA' key) equals the number of solution lines:
    the answer length, minus one when the mapping contains an 'FA' entry.
    Rows whose error_type is 'correct' are checked against 'correct_eqn_mapping'
    and 'correct_answer_length'; every other error_type is checked against
    'wrong_eqn_mapping' and 'wrong_answer_length'.

    Args:
        dataset_df: DataFrame containing the dataset to check. Must have an
            'error_type' column plus the mapping/length columns named above.
        verbose: Whether to print detailed breakdown

    Returns:
        dict: Summary of annotation coverage statistics with keys
            'total_samples', 'by_error_type' (per-type 'total', 'with_coverage',
            'missing_coverage', 'coverage_rate') and 'missing_coverage'
            ('total' and per-type counts under 'by_error_type').
            All counts are built-in ints and rates built-in floats, so the
            summary can be passed straight to json.dumps.
    """

    def has_complete_annotation_coverage(row, eqn_mapping_col):
        """Check if all solution lines except FA have calculator annotations."""
        try:
            eqn_mapping = json.loads(row[eqn_mapping_col])

            # Count total lines (excluding FA)
            answer_length = row['correct_answer_length'] if 'correct' in eqn_mapping_col else row['wrong_answer_length']
            expected_lines = answer_length - 1 if 'FA' in eqn_mapping else answer_length

            # Count non-empty equations (excluding FA)
            non_empty_equations = sum(1 for key, value in eqn_mapping.items()
                                    if key != 'FA' and value and str(value).strip())

            # bool() keeps the per-row result a plain Python bool regardless of
            # whether the length column yielded a NumPy scalar.
            return bool(non_empty_equations == expected_lines)

        except (json.JSONDecodeError, KeyError, TypeError, AttributeError):
            # Unparseable JSON, missing column, non-string cell (e.g. NaN) or a
            # JSON value that is not a dict all count as "no coverage".
            return False

    # Initialize results
    results = {
        'total_samples': len(dataset_df),
        'by_error_type': {},
        'missing_coverage': {
            'total': 0,
            'by_error_type': {}
        }
    }

    # Group by error_type for analysis
    for error_type in dataset_df['error_type'].unique():
        type_data = dataset_df[dataset_df['error_type'] == error_type]

        # Determine which equation mapping column to use
        if error_type == 'correct':
            eqn_col = 'correct_eqn_mapping'
        else:
            eqn_col = 'wrong_eqn_mapping'

        # Check coverage for this error type
        has_coverage = type_data.apply(
            lambda row: has_complete_annotation_coverage(row, eqn_col), axis=1
        )

        # Series.sum() returns NumPy integers, which json.dumps rejects;
        # convert to built-in ints before storing them.
        covered_count = int(has_coverage.sum())
        missing_count = int((~has_coverage).sum())
        total_count = len(type_data)

        results['by_error_type'][error_type] = {
            'total': total_count,
            'with_coverage': covered_count,
            'missing_coverage': missing_count,
            'coverage_rate': float(covered_count / total_count) if total_count > 0 else 0
        }

        results['missing_coverage']['by_error_type'][error_type] = missing_count
        results['missing_coverage']['total'] += missing_count

    # Print results if verbose
    if verbose:
        total_samples = results['total_samples']
        total_missing = results['missing_coverage']['total']
        # Guard the overall rate so an empty dataset reports 0.0% instead of
        # raising ZeroDivisionError.
        overall_rate = (total_samples - total_missing) / total_samples * 100 if total_samples > 0 else 0.0

        print("🔍 Calculator Annotation Coverage Analysis")
        print("=" * 50)
        print(f"Total samples in dataset: {total_samples:,}")
        print(f"Total samples missing coverage: {total_missing:,}")
        print(f"Overall coverage rate: {overall_rate:.1f}%")
        print()

        print("By Error Type:")
        for error_type, stats in results['by_error_type'].items():
            print(f"  {error_type}:")
            print(f"    Total samples: {stats['total']:,}")
            print(f"    With complete coverage: {stats['with_coverage']:,}")
            print(f"    Missing coverage: {stats['missing_coverage']:,}")
            print(f"    Coverage rate: {stats['coverage_rate']:.1%}")
            print()

    return results

# Run the coverage check on the error detection dataset and keep the summary dict;
# `verbose` is left at its default of True, so the report is printed as well.
coverage_results = check_annotation_coverage(error_detection_dataset)

🔍 Calculator Annotation Coverage Analysis
Total samples in dataset: 6,067
Total samples missing coverage: 1,297
Overall coverage rate: 78.6%

By Error Type:
  conceptual_error:
    Total samples: 2,067
    With complete coverage: 1,591
    Missing coverage: 476
    Coverage rate: 77.0%

  computational_error:
    Total samples: 2,000
    With complete coverage: 1,571
    Missing coverage: 429
    Coverage rate: 78.5%

  correct:
    Total samples: 2,000
    With complete coverage: 1,608
    Missing coverage: 392
    Coverage rate: 80.4%



In [116]:
# Display the dataset's column labels as the cell output
error_detection_dataset.columns

Index(['index', 'tier', 'question', 'correct_answer', 'wrong_answer',
       'error_type', 'explanation', 'erroneous_line_number', 'erroneous_line',
       'erroneous_line_eqn', 'correct_answer_mapping', 'wrong_answer_mapping',
       'correct_eqn_mapping', 'wrong_eqn_mapping', 'correct_answer_length',
       'wrong_answer_length', 'source', 'error_subtype', 'split'],
      dtype='object')