In [1]:
import json
import random
import re
from pathlib import Path
from typing import List, Dict, Set, Tuple

import pandas as pd
import numpy as np
from tqdm.auto import tqdm

# --- Path and Directory Definitions ---
def find_project_root(marker: str = ".git") -> Path:
    """Traverse upwards to find the project root, marked by the git repository."""
    current_path = Path.cwd().resolve()
    while current_path != current_path.parent:
        if (current_path / marker).exists():
            return current_path
        current_path = current_path.parent
    raise FileNotFoundError(f"Could not find project root. Marker '{marker}' not found.")

# --- Global Constants and Paths ---
PROJECT_ROOT = find_project_root()
DATA_DIR = PROJECT_ROOT / 'data'
OUTPUT_DIR = DATA_DIR / "line-classification"

# --- Ensure output directory exists ---
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Configuration ---
RANDOM_SEED = 42
LINE_SEP_TOKEN = "<|LINE_SEP|>"  # Special token for line separation

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

print(f"Project root: {PROJECT_ROOT}")
print(f"Data directory: {DATA_DIR}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Random seed set to: {RANDOM_SEED}")
print(f"Line separator token: {LINE_SEP_TOKEN}")

Project root: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math
Data directory: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data
Output directory: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/line-classification
Random seed set to: 42
Line separator token: <|LINE_SEP|>


In [2]:
# Load the master catalog
master_catalog_path = DATA_DIR / "master_catalog_sanitized.csv"
if not master_catalog_path.exists():
    raise FileNotFoundError(f"Master catalog not found: {master_catalog_path}")

master_df = pd.read_csv(master_catalog_path)
print(f"Loaded master catalog with {len(master_df):,} records")
print(f"Columns: {list(master_df.columns)}")

print("\n=== Master Catalog Overview ===")
print(f"Total samples: {len(master_df):,}")
print(f"Unique indices: {master_df['index'].nunique():,}")

print("\n--- Error Type Distribution ---")
error_type_counts = master_df['error_type'].value_counts()
print(error_type_counts)

print("\n--- Source Distribution ---") 
source_counts = master_df['source'].value_counts()
print(source_counts)

Loaded master catalog with 24,652 records
Columns: ['index', 'tier', 'question', 'correct_answer', 'wrong_answer', 'error_type', 'erroneous_line_number', 'explanation', 'error_subtype', 'source', 'solution_length', 'relative_line_position']

=== Master Catalog Overview ===
Total samples: 24,652
Unique indices: 6,777

--- Error Type Distribution ---
error_type
computational_error    22542
conceptual_error        2110
Name: count, dtype: int64

--- Source Distribution ---
source
programmatic    22912
manual           1740
Name: count, dtype: int64


In [3]:
def select_balanced_computational_indices_simple(master_df, conceptual_indices, target_count):
    """
    Simplified computational error sampling:
    1. Form 3 groups based on relative line position (early, middle, late)
    2. From each group, take as many distinct tier 4 samples as possible
    3. For remaining, randomly sample distinct indices from other tiers
    """
    
    print("=== Simple Computational Error Sampling ===")
    
    # Get available computational errors (excluding conceptual indices)
    computational_df = master_df[
        (master_df['error_type'] == 'computational_error') & 
        (~master_df['index'].isin(conceptual_indices))
    ].copy()
    
    if len(computational_df) == 0:
        print("Warning: No computational errors available after excluding conceptual indices")
        return []
    
    print(f"Available computational error samples: {len(computational_df)}")
    print(f"Available unique indices: {computational_df['index'].nunique()}")
    
    # Step 1: Form 3 groups based on relative line position
    # Simple tertile split
    positions = computational_df['relative_line_position'].dropna()
    tertile_33 = positions.quantile(0.33)
    tertile_67 = positions.quantile(0.67)
    
    print(f"Position tertiles: 33%={tertile_33:.3f}, 67%={tertile_67:.3f}")
    
    # Create groups
    early_group = computational_df[computational_df['relative_line_position'] <= tertile_33]
    middle_group = computational_df[
        (computational_df['relative_line_position'] > tertile_33) & 
        (computational_df['relative_line_position'] <= tertile_67)
    ]
    late_group = computational_df[computational_df['relative_line_position'] > tertile_67]
    
    print(f"Early group: {len(early_group)} samples, {early_group['index'].nunique()} unique indices")
    print(f"Middle group: {len(middle_group)} samples, {middle_group['index'].nunique()} unique indices")
    print(f"Late group: {len(late_group)} samples, {late_group['index'].nunique()} unique indices")
    
    selected_indices = []
    
    # Step 2: From each group, take as many distinct tier 4 samples as possible
    for group_name, group_df in [("Early", early_group), ("Middle", middle_group), ("Late", late_group)]:
        tier4_indices = group_df[group_df['tier'] == 'tier4']['index'].unique()
        print(f"{group_name} group tier 4 unique indices: {len(tier4_indices)}")
        selected_indices.extend(tier4_indices.tolist())
    
    print(f"Total tier 4 indices selected: {len(selected_indices)}")
    
    # Step 3: For remaining, randomly sample distinct indices from other tiers
    remaining_needed = target_count - len(selected_indices)
    
    if remaining_needed > 0:
        print(f"Need {remaining_needed} more indices from other tiers")
        
        # Get all non-tier4 indices that aren't already selected
        non_tier4_df = computational_df[computational_df['tier'] != 'tier4']
        available_indices = non_tier4_df['index'].unique()
        available_indices = [idx for idx in available_indices if idx not in selected_indices]
        
        print(f"Available non-tier4 indices: {len(available_indices)}")
        
        if len(available_indices) >= remaining_needed:
            additional_indices = random.sample(available_indices, remaining_needed)
            selected_indices.extend(additional_indices)
        else:
            print(f"Warning: Only {len(available_indices)} available, adding all")
            selected_indices.extend(available_indices)
    
    print(f"Final selection: {len(selected_indices)} unique indices")
    
    # Verify uniqueness
    if len(selected_indices) != len(set(selected_indices)):
        print(f"ERROR: Duplicate indices detected!")
        selected_indices = list(set(selected_indices))
        print(f"After deduplication: {len(selected_indices)} unique indices")
    
    # Show final distribution
    selected_df = computational_df[computational_df['index'].isin(selected_indices)]
    selected_unique = selected_df.drop_duplicates('index')
    
    print(f"\n--- Final Distribution ---")
    tier_counts = selected_unique['tier'].value_counts().sort_index()
    print("Tier distribution:")
    for tier in ['tier1', 'tier2', 'tier3', 'tier4']:
        count = tier_counts.get(tier, 0)
        pct = (count / len(selected_unique)) * 100
        print(f"  {tier}: {count} indices ({pct:.1f}%)")
    
    # Position distribution of selected indices
    print("\nPosition distribution:")
    for group_name, (min_pos, max_pos) in [
        ("Early", (0, tertile_33)),
        ("Middle", (tertile_33, tertile_67)), 
        ("Late", (tertile_67, 1.0))
    ]:
        group_selected = selected_unique[
            (selected_unique['relative_line_position'] >= min_pos) & 
            (selected_unique['relative_line_position'] <= max_pos)
        ]
        count = len(group_selected)
        pct = (count / len(selected_unique)) * 100
        print(f"  {group_name}: {count} indices ({pct:.1f}%)")
    
    return selected_indices

def get_disjoint_sets_simple(master_df):
    """
    Create three disjoint sets with simplified computational sampling.
    """
    
    print("=== Creating Disjoint Sets with Simple Sampling ===")
    
    # Set A: All indices with conceptual errors
    conceptual_indices = master_df[
        master_df['error_type'] == 'conceptual_error'
    ]['index'].unique()
    conceptual_indices = sorted(conceptual_indices)
    N = len(conceptual_indices)
    
    print(f"Set A (Conceptual errors): {N} unique indices")
    
    # Set B: Simple computational error sampling
    computational_indices = select_balanced_computational_indices_simple(
        master_df, conceptual_indices, N
    )
    
    print(f"Set B (Computational errors): {len(computational_indices)} unique indices")
    
    # Set C: Correct samples from remaining disjoint indices
    used_indices = set(conceptual_indices) | set(computational_indices)
    all_indices = set(master_df['index'].unique())
    remaining_indices = list(all_indices - used_indices)
    
    print(f"Available indices for correct samples: {len(remaining_indices)}")
    
    if len(remaining_indices) < len(computational_indices):
        print(f"WARNING: Only {len(remaining_indices)} remaining indices available, need {len(computational_indices)}")
        correct_indices = remaining_indices
    else:
        correct_indices = random.sample(remaining_indices, len(computational_indices))
    
    print(f"Set C (Correct samples): {len(correct_indices)} unique indices")
    
    # Final verification that all sets are truly disjoint
    set_A = set(conceptual_indices)
    set_B = set(computational_indices)
    set_C = set(correct_indices)
    
    overlap_AB = len(set_A & set_B)
    overlap_AC = len(set_A & set_C)
    overlap_BC = len(set_B & set_C)
    
    print(f"\n--- Disjoint Verification ---")
    print(f"Conceptual ∩ Computational: {overlap_AB} indices (should be 0)")
    print(f"Conceptual ∩ Correct: {overlap_AC} indices (should be 0)")
    print(f"Computational ∩ Correct: {overlap_BC} indices (should be 0)")
    
    total_unique_indices = len(set_A | set_B | set_C)
    expected_total = len(conceptual_indices) + len(computational_indices) + len(correct_indices)
    print(f"Total unique indices: {total_unique_indices} (should equal {expected_total})")
    
    if total_unique_indices != expected_total:
        print("ERROR: Sets are not properly disjoint!")
    else:
        print("✓ All sets are properly disjoint")
    
    return conceptual_indices, computational_indices, correct_indices

conceptual_indices, computational_indices, correct_indices = get_disjoint_sets_simple(master_df)

=== Creating Disjoint Sets with Simple Sampling ===
Set A (Conceptual errors): 1881 unique indices
=== Simple Computational Error Sampling ===
Available computational error samples: 16287
Available unique indices: 4896
Position tertiles: 33%=0.250, 67%=0.600
Early group: 5599 samples, 3676 unique indices
Middle group: 5680 samples, 3654 unique indices
Late group: 5008 samples, 2866 unique indices
Early group tier 4 unique indices: 181
Middle group tier 4 unique indices: 183
Late group tier 4 unique indices: 180
Total tier 4 indices selected: 544
Need 1337 more indices from other tiers
Available non-tier4 indices: 4627
Final selection: 1881 unique indices
ERROR: Duplicate indices detected!
After deduplication: 1606 unique indices

--- Final Distribution ---
Tier distribution:
  tier1: 581 indices (36.2%)
  tier2: 164 indices (10.2%)
  tier3: 590 indices (36.7%)
  tier4: 269 indices (16.7%)

Position distribution:
  Early: 1156 indices (72.0%)
  Middle: 387 indices (24.1%)
  Late: 157 in

In [4]:
def add_line_separator_tokens(solution_text: str, line_sep_token: str = LINE_SEP_TOKEN) -> str:
    """
    Since preprocessing is already done in master catalog, just add line separator tokens.
    
    Args:
        solution_text: Already preprocessed solution text from sanitized master catalog
        line_sep_token: Special token to use for line separation
        
    Returns:
        Solution text with line separator tokens for tokenization
    """
    if not isinstance(solution_text, str):
        return ""
    
    # Split by newlines, filter empty lines, and join with special token
    lines = solution_text.split('\n')
    non_empty_lines = [line.strip() for line in lines if line.strip()]  # Filter empty lines
    
    processed = line_sep_token.join(non_empty_lines) + line_sep_token
    return processed

def create_user_prompt_simple(question: str, solution: str) -> str:
    """
    Creates user prompt from already sanitized question and solution.
    
    Args:
        question: Already sanitized GSM8K problem statement
        solution: Already preprocessed solution text with line separators
        
    Returns:
        Formatted user prompt for line-level error detection
    """
    system_instruction = "Analyze the following mathematical problem and solution to identify the line containing the error."
    
    user_prompt = f"""{system_instruction}

### Problem:
{question}

### Solution:
{solution}"""
    
    return user_prompt

In [5]:
# Simple debugging: print raw solution vs line sep version
print("🔍 SIMPLE LINE SEPARATOR DEBUG")
print("=" * 50)

# Get a few samples that showed mismatches
problem_indices = [143, 140, 145]

for idx in problem_indices:
    sample = master_df[master_df['index'] == idx].iloc[0]
    
    print(f"\n--- INDEX {idx} ---")
    print(f"Stored solution_length: {sample['solution_length']}")
    
    # Raw solution from master catalog
    raw_solution = sample['wrong_answer']
    print(f"\nRAW SOLUTION:")
    print(repr(raw_solution))
    
    # After adding line separators
    solution_with_tokens = add_line_separator_tokens(raw_solution)
    print(f"\nAFTER LINE SEPARATOR TOKENS:")
    print(repr(solution_with_tokens))
    
    # Count tokens
    token_count = solution_with_tokens.count(LINE_SEP_TOKEN)
    print(f"\nTOKEN COUNT: {token_count}")
    print(f"STORED LENGTH: {sample['solution_length']}")
    print(f"DIFFERENCE: {token_count - sample['solution_length']}")
    
    # Split manually to see what's happening
    lines = raw_solution.split('\n')
    print(f"\nMANUAL LINE SPLIT:")
    for i, line in enumerate(lines):
        print(f"  {i}: {repr(line)}")
    
    non_empty = [line.strip() for line in lines if line.strip()]
    print(f"\nNON-EMPTY LINES: {len(non_empty)}")
    for i, line in enumerate(non_empty):
        print(f"  {i}: {repr(line)}")

🔍 SIMPLE LINE SEPARATOR DEBUG

--- INDEX 143 ---
Stored solution_length: 4

RAW SOLUTION:
'There are 60 goldfish because 15 / .25 = 60\n75% of the fish are below the surface because 100 - 25 = 75\nThere are 35 goldfish below the surface because 60 x .75 = 35\n#### 35'

AFTER LINE SEPARATOR TOKENS:
'There are 60 goldfish because 15 / .25 = 60<|LINE_SEP|>75% of the fish are below the surface because 100 - 25 = 75<|LINE_SEP|>There are 35 goldfish below the surface because 60 x .75 = 35<|LINE_SEP|>#### 35<|LINE_SEP|>'

TOKEN COUNT: 4
STORED LENGTH: 4
DIFFERENCE: 0

MANUAL LINE SPLIT:
  0: 'There are 60 goldfish because 15 / .25 = 60'
  1: '75% of the fish are below the surface because 100 - 25 = 75'
  2: 'There are 35 goldfish below the surface because 60 x .75 = 35'
  3: '#### 35'

NON-EMPTY LINES: 4
  0: 'There are 60 goldfish because 15 / .25 = 60'
  1: '75% of the fish are below the surface because 100 - 25 = 75'
  2: 'There are 35 goldfish below the surface because 60 x .75 = 35'
  3:

In [6]:


# ==============================================================================
# Updated Dataset Creation Functions
# ==============================================================================
def create_line_labels_optimized(solution_with_tokens, relative_line_position, solution_length, sample_index=None):
    """
    Create one-hot encoded line labels using pre-computed relative position and length.
    Updated to work with sanitized master catalog and line separator tokens.
    
    Args:
        solution_with_tokens (str): Solution text with line separator tokens added
        relative_line_position (float or None): Relative position of error (0.0-1.0), 
                                              or None for correct solutions
        solution_length (int): Pre-computed number of lines (from sanitized catalog)
        sample_index (int, optional): Sample index for debugging mismatch reports
    
    Returns:
        list: One-hot encoded labels where 1 indicates the error line, 0 otherwise.
              For correct solutions, all labels are 0.
    """
    
    if pd.isna(solution_with_tokens) or solution_with_tokens is None or solution_with_tokens == "":
        return []
    
    # Use the pre-computed solution_length from sanitized catalog
    num_lines = int(solution_length)
    
    # Verify actual line count matches stored length (for debugging)
    if LINE_SEP_TOKEN in solution_with_tokens:
        # Count line separator tokens to verify consistency
        actual_line_count = solution_with_tokens.count(LINE_SEP_TOKEN)
        if actual_line_count != num_lines:
            index_info = f" (Index: {sample_index})" if sample_index is not None else ""
            print(f"Warning: Line count mismatch{index_info}! Stored: {num_lines}, Actual: {actual_line_count}")
            # Use actual count to prevent index errors
            num_lines = actual_line_count
    
    # Handle edge case of empty solutions
    if num_lines <= 0:
        return []
    
    # Initialize all labels as 0 (no error)
    line_labels = [0] * num_lines
    
    # For error samples, set the error line to 1
    if relative_line_position is not None and not pd.isna(relative_line_position):
        # Convert relative position (0.0-1.0) to absolute line number (0-based)
        if num_lines == 1:
            error_line_number = 0  # Only one line, must be line 0
        else:
            error_line_number = int(relative_line_position * (num_lines - 1))
        
        # Ensure error line is within valid bounds
        error_line_number = max(0, min(error_line_number, num_lines - 1))
        line_labels[error_line_number] = 1
    
    return line_labels

def select_best_sample(idx_samples):
    """
    Select the best sample from a group of samples for the same index.
    Priority: manual > programmatic
    """
    # Prioritize manual over programmatic
    manual_samples = idx_samples[idx_samples['source'] == 'manual']
    if len(manual_samples) > 0:
        return manual_samples.iloc[0]  # Take first manual sample
    else:
        return idx_samples.iloc[0]  # Take first programmatic sample

def create_flawed_only_line_classification_dataset_simple(master_df, conceptual_indices, computational_indices):
    """
    SIMPLIFIED: Create dataset from preprocessed master catalog.
    No additional preprocessing needed since master catalog is already sanitized.
    """
    
    print("=== Creating Flawed-Only Dataset from Sanitized Master Catalog ===")
    
    dataset = []
    
    # Process conceptual error samples
    print(f"Processing {len(conceptual_indices)} conceptual error samples...")
    for idx in tqdm(conceptual_indices, desc="Conceptual errors"):
        idx_samples = master_df[
            (master_df['index'] == idx) & 
            (master_df['error_type'] == 'conceptual_error')
        ]
        
        if len(idx_samples) == 0:
            continue
            
        selected_sample = select_best_sample(idx_samples)
        
        # Text is already preprocessed, just add line separator tokens
        preprocessed_solution = selected_sample['wrong_answer']  # Already sanitized!
        solution_with_tokens = add_line_separator_tokens(preprocessed_solution)
        
        # Question is already preprocessed too
        preprocessed_question = selected_sample['question']  # Already sanitized!
        user_prompt = create_user_prompt_simple(preprocessed_question, solution_with_tokens)
        
        # Correct answer is also already preprocessed
        correct_answer = selected_sample['correct_answer']  # Already sanitized!
        
        # Use the stored values directly (they're already calculated correctly)
        line_labels = create_line_labels_optimized(
            solution_with_tokens, 
            selected_sample['relative_line_position'],
            selected_sample['solution_length'],
            sample_index=idx  # Add this parameter
        )
        
        # This should now work perfectly since everything is pre-aligned
        if len(line_labels) > 0 and sum(line_labels) == 1:
            sample = {
                'text': user_prompt,
                'correct_answer': correct_answer,
                'line_labels': line_labels,
                'error_type': 'conceptual_error',
                'index': idx,
                'tier': selected_sample['tier'],
                'source': selected_sample['source'],
                'relative_line_position': selected_sample['relative_line_position'],
                'solution_length': selected_sample['solution_length']
            }
            dataset.append(sample)
    
    # Process computational error samples
    print(f"Processing {len(computational_indices)} computational error samples...")
    for idx in tqdm(computational_indices, desc="Computational errors"):
        idx_samples = master_df[
            (master_df['index'] == idx) & 
            (master_df['error_type'] == 'computational_error')
        ]
        
        if len(idx_samples) == 0:
            continue
            
        selected_sample = select_best_sample(idx_samples)
        
        # Text is already preprocessed, just add line separator tokens
        preprocessed_solution = selected_sample['wrong_answer']  # Already sanitized!
        solution_with_tokens = add_line_separator_tokens(preprocessed_solution)
        
        # Question is already preprocessed too
        preprocessed_question = selected_sample['question']  # Already sanitized!
        user_prompt = create_user_prompt_simple(preprocessed_question, solution_with_tokens)
        
        # Correct answer is also already preprocessed
        correct_answer = selected_sample['correct_answer']  # Already sanitized!
        
        # Use the stored values directly (they're already calculated correctly)
        line_labels = create_line_labels_optimized(
            solution_with_tokens, 
            selected_sample['relative_line_position'],
            selected_sample['solution_length'],
            sample_index=idx  # Add this parameter
        )
        
        # This should now work perfectly since everything is pre-aligned
        if len(line_labels) > 0 and sum(line_labels) == 1:
            sample = {
                'text': user_prompt,
                'correct_answer': correct_answer,
                'line_labels': line_labels,
                'error_type': 'computational_error',
                'index': idx,
                'tier': selected_sample['tier'],
                'source': selected_sample['source'],
                'relative_line_position': selected_sample['relative_line_position'],
                'solution_length': selected_sample['solution_length']
            }
            dataset.append(sample)
    
    dataset_df = pd.DataFrame(dataset)
    
    print(f"\nTotal samples created: {len(dataset_df)}")
    
    # This validation should now show 100% success!
    print("\n--- Label Validation ---")
    label_sums = dataset_df['line_labels'].apply(sum)
    valid_samples = (label_sums == 1).sum()
    print(f"Samples with exactly 1 error line: {valid_samples}/{len(dataset_df)} ({valid_samples/len(dataset_df)*100:.1f}%)")
    
    if (label_sums != 1).any():
        print(f"WARNING: {(label_sums != 1).sum()} samples don't have exactly 1 error line")
        invalid_samples = dataset_df[label_sums != 1]
        print("This shouldn't happen with sanitized catalog!")
        
    print("✅ SUCCESS: Using preprocessed master catalog eliminates all preprocessing inconsistencies!")
    
    return dataset_df

def save_flawed_only_dataset(dataset_df, output_dir):
    """
    Save the flawed-only line classification dataset with updated metadata.
    """
    
    print("=== Saving Flawed-Only Line Classification Dataset ===")
    
    output_dir.mkdir(parents=True, exist_ok=True)
    
    # Save main dataset
    dataset_path = output_dir / "flawed_only_line_classification_dataset.csv"
    dataset_df.to_csv(dataset_path, index=False)
    print(f"✓ Dataset saved to: {dataset_path}")
    print(f"  Size: {len(dataset_df):,} samples")
    print(f"  Unique indices: {dataset_df['index'].nunique():,}")
    
    # Calculate class balance for metadata
    total_line_labels = sum(len(labels) for labels in dataset_df['line_labels'])
    total_error_lines = sum(sum(labels) for labels in dataset_df['line_labels'])
    total_correct_lines = total_line_labels - total_error_lines
    imbalance_ratio = total_correct_lines / total_error_lines if total_error_lines > 0 else 0
    
    # Create metadata
    metadata = {
        "creation_info": {
            "creation_date": pd.Timestamp.now().isoformat(),
            "random_seed": RANDOM_SEED,
            "source_catalog": "master_catalog.csv",
            "creator": "make-line-classification-dataset.ipynb",
            "dataset_strategy": "flawed_only_with_preprocessing"
        },
        "preprocessing_applied": {
            "newline_fixes": f"Converted literal \\n to actual newlines, then replaced with special token: {LINE_SEP_TOKEN}",
            "line_separation": f"Used special token '{LINE_SEP_TOKEN}' for reliable line boundary detection",
            "unicode_sanitization": "Converted problematic Unicode chars to ASCII equivalents",
            "user_prompt_formatting": "Combined question + solution into standardized user prompt format",
            "text_cleaning": "Removed comma separators from numbers"
        },
        "dataset_info": {
            "description": "Flawed-only line-level error detection dataset with comprehensive preprocessing",
            "strategy": "flawed_only_balanced_preprocessed", 
            "total_samples": len(dataset_df),
            "unique_problems": dataset_df['index'].nunique(),
            "composition": dict(dataset_df['error_type'].value_counts()),
            "source_distribution": dict(dataset_df['source'].value_counts()),
            "tier_distribution": dict(dataset_df['tier'].value_counts()),
            "class_balance": {
                "total_line_positions": total_line_labels,
                "error_line_positions": total_error_lines,
                "correct_line_positions": total_correct_lines,
                "imbalance_ratio": f"{imbalance_ratio:.1f}:1",
                "error_percentage": f"{total_error_lines/total_line_labels*100:.1f}%"
            }
        },
        "column_descriptions": {
            "text": "Complete user prompt: system instruction + problem + preprocessed solution",
            "correct_answer": "Original GSM8K correct answer for reference",
            "line_labels": "One-hot encoded labels [0,0,1,0,...] indicating error line",
            "error_type": "'conceptual_error' or 'computational_error' (no correct samples)",
            "index": "Original GSM8K problem index",
            "tier": "Problem difficulty tier (tier1-tier4)",
            "source": "'manual' or 'programmatic'"
        }
    }
    
    metadata_path = output_dir / "flawed_only_dataset_metadata.json"
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2, default=str)
    print(f"✓ Metadata saved to: {metadata_path}")
    
    # Create README
    readme_content = f"""# Flawed-Only Line Classification Dataset (Preprocessed)

Created on: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
Random seed: {RANDOM_SEED}
Strategy: **Flawed-Only with Comprehensive Preprocessing**

## Overview

This dataset addresses class imbalance by using **only flawed samples** for line-level error detection, with comprehensive preprocessing applied to fix tokenization issues and standardize input format.

## Key Improvements

### 1. Class Balance
- **Original approach**: ~85-90% zeros (severe imbalance, ratio ~5-6:1)
- **Flawed-only approach**: ~{100-total_error_lines/total_line_labels*100:.1f}% zeros (better balance, ratio ~{imbalance_ratio:.1f}:1)

### 2. Preprocessing Applied
- **Line separation**: Replaced newlines with special token `{LINE_SEP_TOKEN}` for reliable tokenization
- **Unicode sanitization**: Converted problematic Unicode characters to ASCII equivalents
- **User prompt formatting**: Combined question + solution into standardized format
- **Text cleaning**: Removed comma separators from numbers

## Dataset Structure

- **Total samples**: {len(dataset_df):,} (all flawed)
- **Unique problems**: {dataset_df['index'].nunique():,}
- **Strategy**: Balanced conceptual + computational errors with preprocessing

### Composition
- **Conceptual errors**: {dict(dataset_df['error_type'].value_counts()).get('conceptual_error', 0)} samples
- **Computational errors**: {dict(dataset_df['error_type'].value_counts()).get('computational_error', 0)} samples
- **Correct solutions**: 0 samples (removed to eliminate imbalance)

### Class Balance Analysis
- **Total line positions**: {total_line_labels:,}
- **Error lines (label=1)**: {total_error_lines:,} ({total_error_lines/total_line_labels*100:.1f}%)
- **Correct lines (label=0)**: {total_correct_lines:,} ({total_correct_lines/total_line_labels*100:.1f}%)
- **Imbalance ratio**: {imbalance_ratio:.1f}:1

## Column Descriptions

- `text`: Complete user prompt with system instruction + problem + preprocessed solution
- `correct_answer`: Original GSM8K correct answer for reference
- `line_labels`: One-hot encoded labels `[0,0,1,0,...]` indicating error line
- `error_type`: 'conceptual_error' or 'computational_error'
- `index`: Original GSM8K problem index
- `tier`: Problem difficulty tier (tier1-tier4)
- `source`: 'manual' or 'programmatic'

## Usage Example

```python
import pandas as pd
import ast

# Load dataset
df = pd.read_csv('flawed_only_line_classification_dataset.csv')

# Parse line_labels from string representation
df['line_labels'] = df['line_labels'].apply(ast.literal_eval)

# Example: Check a sample with error
sample = df.iloc[0]
print("User prompt:")
print(sample['text'])
print(f"\\nError type: {{sample['error_type']}}")
print(f"Error line position: {{sample['line_labels'].index(1)}}")
```

## Training Considerations

- **Tokenization**: Preprocessing ensures reliable newline token detection
- **Loss function**: Standard cross-entropy works well (no complex weighting needed)
- **Data collator**: Custom collator for variable-length `line_labels`
- **Evaluation**: Line-level accuracy, precision, recall, F1
- **Much more stable training** than severely imbalanced dataset"""
    
    readme_path = output_dir / "README.md"
    with open(readme_path, 'w') as f:
        f.write(readme_content)
    print(f"✓ README saved to: {readme_path}")

    return output_dir

In [7]:
# Execute the flawed-only strategy
print("🚀 Creating flawed-only line classification dataset...")
flawed_only_dataset_df = create_flawed_only_line_classification_dataset_simple(  # ← NEW FUNCTION NAME
    master_df, conceptual_indices, computational_indices
)

# Save the flawed-only dataset
flawed_only_output_dir = OUTPUT_DIR / "flawed-only"
saved_flawed_dir = save_flawed_only_dataset(flawed_only_dataset_df, flawed_only_output_dir)
print(f"\n📁 Flawed-only dataset saved to: {saved_flawed_dir}")

🚀 Creating flawed-only line classification dataset...
=== Creating Flawed-Only Dataset from Sanitized Master Catalog ===
Processing 1881 conceptual error samples...


Conceptual errors:   0%|          | 0/1881 [00:00<?, ?it/s]

Processing 1606 computational error samples...


Computational errors:   0%|          | 0/1606 [00:00<?, ?it/s]


Total samples created: 3487

--- Label Validation ---
Samples with exactly 1 error line: 3487/3487 (100.0%)
✅ SUCCESS: Using preprocessed master catalog eliminates all preprocessing inconsistencies!
=== Saving Flawed-Only Line Classification Dataset ===
✓ Dataset saved to: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/line-classification/flawed-only/flawed_only_line_classification_dataset.csv
  Size: 3,487 samples
  Unique indices: 3,487
✓ Metadata saved to: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/line-classification/flawed-only/flawed_only_dataset_metadata.json
✓ README saved to: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/line-classification/flawed-only/README.md

📁 Flawed-only dataset saved to: /Users/arvindsuresh/Documents/Github/Erdos-DL-June25-Math/data/line-classification/flawed-only


In [8]:
# Quick diagnosis of the mismatch issue
print("🔍 DIAGNOSING LINE COUNT MISMATCHES")
print("=" * 60)

# Check a few samples to see what's happening
sample_indices = [140, 143, 145, 7193, 7339]

for idx in sample_indices[:3]:
    sample = master_df[master_df['index'] == idx].iloc[0]
    
    print(f"\n--- INDEX {idx} ---")
    print(f"Stored solution_length: {sample['solution_length']}")
    
    # Count actual lines in the sanitized text
    raw_solution = sample['wrong_answer']
    lines = raw_solution.split('\n')
    actual_lines = [line.strip() for line in lines if line.strip()]
    actual_count = len(actual_lines)
    
    print(f"Actual line count: {actual_count}")
    print(f"Difference: {actual_count - sample['solution_length']}")
    
    print(f"Sample solution text:")
    print(repr(raw_solution))

# Check overall mismatch rate
print(f"\n--- OVERALL MISMATCH CHECK ---")
mismatches = 0
total_checked = 0

for _, sample in master_df.head(50).iterrows():
    raw_solution = sample['wrong_answer']
    stored_length = int(sample['solution_length'])
    
    lines = raw_solution.split('\n')
    actual_lines = [line.strip() for line in lines if line.strip()]
    actual_length = len(actual_lines)
    
    if actual_length != stored_length:
        mismatches += 1
    total_checked += 1

mismatch_rate = (mismatches / total_checked) * 100
print(f"Mismatch rate in first 50 samples: {mismatches}/{total_checked} ({mismatch_rate:.1f}%)")

if mismatch_rate > 10:
    print("❌ HIGH MISMATCH RATE - The solution_length field in master_catalog_sanitized.csv is inconsistent!")
    print("💡 SOLUTION: You need to recalculate solution_length after text sanitization")
else:
    print("✅ Low mismatch rate - issue might be elsewhere")

🔍 DIAGNOSING LINE COUNT MISMATCHES

--- INDEX 140 ---
Stored solution_length: 5
Actual line count: 5
Difference: 0
Sample solution text:
'First find how many kids from Riverside High are rejected: 20% * 120 kids = 24 kids\nThen find how many kids from West Side High are rejected: 70% * 90 kids = 63 kids\nThen find how many kids from Mountaintop High are rejected: 50 kids / 2 = 25 kids\nThen add the number of rejected kids from each school to find the total number of rejected kids: 24 kids + 63 kids + 25 kids = 112 kids\n#### 112'

--- INDEX 143 ---
Stored solution_length: 4
Actual line count: 4
Difference: 0
Sample solution text:
'There are 60 goldfish because 15 / .25 = 60\n75% of the fish are below the surface because 100 - 25 = 75\nThere are 35 goldfish below the surface because 60 x .75 = 35\n#### 35'

--- INDEX 145 ---
Stored solution_length: 5
Actual line count: 5
Difference: 0
Sample solution text:
'He watched 2*20=40 minutes of Jeopardy.\nWheel of Fortune is 2*20=40 minutes eac

In [9]:
# Print sample inspection code
print("=== Dataset Sample Inspection ===")

# Check dataset structure
print(f"Dataset shape: {flawed_only_dataset_df.shape}")
print(f"Columns: {list(flawed_only_dataset_df.columns)}")

# Print first 3 samples with detailed formatting
for i in range(min(3, len(flawed_only_dataset_df))):
    sample = flawed_only_dataset_df.iloc[i]
    
    print(f"\n{'='*60}")
    print(f"SAMPLE {i+1} (Index: {sample['index']})")
    print(f"{'='*60}")
    
    print(f"Error Type: {sample['error_type']}")
    print(f"Source: {sample['source']}")
    print(f"Tier: {sample['tier']}")
    print(f"Solution Length: {sample['solution_length']} lines")
    print(f"Relative Error Position: {sample['relative_line_position']:.3f}")
    
    # Parse line labels
    import ast
    line_labels = ast.literal_eval(sample['line_labels']) if isinstance(sample['line_labels'], str) else sample['line_labels']
    error_line_idx = line_labels.index(1) if 1 in line_labels else -1
    print(f"Error Line Index: {error_line_idx} (0-based)")
    print(f"Line Labels: {line_labels}")
    
    print(f"\n--- USER PROMPT ---")
    print(sample['text'])
    
    print(f"\n--- CORRECT ANSWER (GSM8K) ---")
    print(sample['correct_answer'][:200] + "..." if len(sample['correct_answer']) > 200 else sample['correct_answer'])
    
    # Show which specific line contains the error
    solution_section = sample['text'].split("### Solution:")[-1].strip()
    solution_lines = solution_section.split('\n')
    
    print(f"\n--- SOLUTION LINES WITH ERROR MARKING ---")
    for idx, line in enumerate(solution_lines):
        if line.strip():  # Only show non-empty lines
            marker = " ← ERROR" if idx == error_line_idx else ""
            print(f"Line {idx}: {line.strip()}{marker}")

# Show preprocessing effects
print(f"\n{'='*60}")
print("PREPROCESSING VERIFICATION")
print(f"{'='*60}")

# Check for calculator annotations removal
has_calculator_annotations = flawed_only_dataset_df['text'].str.contains(r'<<.*?>>', regex=True).any()
print(f"Calculator annotations found: {has_calculator_annotations} (should be False)")

# Check for literal \n vs actual newlines
has_literal_newlines = flawed_only_dataset_df['text'].str.contains(r'\\n', regex=False).any()
print(f"Literal \\n found: {has_literal_newlines} (should be False)")

# Check for Unicode characters with detailed reporting
unicode_chars = {
    "\u2212": "Minus Sign (−)",
    "\u00d7": "Multiplication Sign (×)", 
    "\u00f7": "Division Sign (÷)",
    "\u22c5": "Dot Operator (⋅)",
    "\u201c": "Left Double Quotation Mark",
    "\u201d": "Right Double Quotation Mark", 
    "\u2018": "Left Single Quotation Mark",
    "\u2019": "Right Single Quotation Mark",
    "\u2014": "Em Dash (—)",
    "\u2013": "En Dash (–)", 
    "\u2026": "Horizontal Ellipsis (…)",
    "\u00a0": "No-Break Space"
}

print(f"\n--- UNICODE CHARACTER DETECTION ---")
found_unicode = []
for char, description in unicode_chars.items():
    char_found = flawed_only_dataset_df['text'].str.contains(char, regex=False).any()
    if char_found:
        # Count occurrences
        total_occurrences = flawed_only_dataset_df['text'].str.count(char).sum()
        samples_with_char = flawed_only_dataset_df['text'].str.contains(char, regex=False).sum()
        found_unicode.append(f"  ❌ {description}: {total_occurrences} occurrences in {samples_with_char} samples")
        
        # Show first few examples
        examples = flawed_only_dataset_df[flawed_only_dataset_df['text'].str.contains(char, regex=False)]['text'].head(2)
        for idx, example in enumerate(examples):
            excerpt = example.replace(char, f"**{char}**")[:150] + "..."
            print(f"      Example {idx+1}: {excerpt}")

if found_unicode:
    print("Unicode characters found:")
    for issue in found_unicode:
        print(issue)
else:
    print("✅ No problematic Unicode characters found")

# Additional Unicode scan for any non-ASCII characters
print(f"\n--- COMPREHENSIVE NON-ASCII SCAN ---")
non_ascii_found = False
non_ascii_summary = {}

print(f"Scanning all {len(flawed_only_dataset_df)} samples for non-ASCII characters...")

for idx, text in enumerate(flawed_only_dataset_df['text']):
    non_ascii_chars = [char for char in text if ord(char) > 127]
    if non_ascii_chars:
        unique_non_ascii = list(set(non_ascii_chars))
        if not non_ascii_found:
            print("Non-ASCII characters found:")
            non_ascii_found = True
        
        # Track summary statistics
        for char in unique_non_ascii:
            unicode_point = f"U+{ord(char):04X}"
            if unicode_point not in non_ascii_summary:
                non_ascii_summary[unicode_point] = {
                    'char': char,
                    'count': 0,
                    'samples': []
                }
            non_ascii_summary[unicode_point]['count'] += text.count(char)
            if len(non_ascii_summary[unicode_point]['samples']) < 3:  # Store first 3 sample indices
                non_ascii_summary[unicode_point]['samples'].append(idx)

if non_ascii_found:
    print(f"\n--- NON-ASCII CHARACTER SUMMARY ---")
    for unicode_point, info in sorted(non_ascii_summary.items()):
        char = info['char']
        count = info['count']
        sample_count = len(info['samples'])
        print(f"  {unicode_point} ('{char}'): {count} occurrences in {sample_count}+ samples")
        
        # Show first few sample indices
        if info['samples']:
            sample_indices = [flawed_only_dataset_df.iloc[i]['index'] for i in info['samples'][:3]]
            print(f"    Found in problem indices: {sample_indices}")
else:
    print("✅ No non-ASCII characters found in any samples")

# Summary statistics
if non_ascii_found:
    total_non_ascii_chars = sum(info['count'] for info in non_ascii_summary.values())
    total_affected_samples = len(set(sample_idx for info in non_ascii_summary.values() for sample_idx in info['samples']))
    print(f"\n--- SCAN SUMMARY ---")
    print(f"Total non-ASCII characters found: {total_non_ascii_chars}")
    print(f"Unique Unicode characters: {len(non_ascii_summary)}")
    print(f"Samples affected: {total_affected_samples}+ out of {len(flawed_only_dataset_df)} ({total_affected_samples/len(flawed_only_dataset_df)*100:.1f}%)")

=== Dataset Sample Inspection ===
Dataset shape: (3487, 9)
Columns: ['text', 'correct_answer', 'line_labels', 'error_type', 'index', 'tier', 'source', 'relative_line_position', 'solution_length']

SAMPLE 1 (Index: 1)
Error Type: conceptual_error
Source: programmatic
Tier: tier4
Solution Length: 3 lines
Relative Error Position: 0.500
Error Line Index: 1 (0-based)
Line Labels: [0, 1, 0]

--- USER PROMPT ---
Analyze the following mathematical problem and solution to identify the line containing the error.

### Problem:
Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?

### Solution:
Weng earns 12/60 = $0.2 per minute.<|LINE_SEP|>Working 50 minutes, she earned 50 x 50 = $2500.<|LINE_SEP|>#### 2500<|LINE_SEP|>

--- CORRECT ANSWER (GSM8K) ---
Weng earns 12/60 = $0.2 per minute.
Working 50 minutes, she earned 0.2 x 50 = $10.
#### 10

--- SOLUTION LINES WITH ERROR MARKING ---
Line 0: Weng earns 12/60 = $0.2 per minute.<|LINE_SEP|>

In [10]:
# ==============================================================================
# DEBUGGING: Line Count Mismatch Investigation
# ==============================================================================
def debug_line_count_mismatch(master_df, sample_indices=None, max_samples=20):
    """
    Debug line count mismatches between stored solution_length and actual preprocessed lines.
    """
    print("🔍 DEBUGGING LINE COUNT MISMATCH")
    print("=" * 70)
    
    if sample_indices is None:
        # Use the problematic indices we found earlier
        sample_indices = [9, 18, 25, 29, 30]  # From the debugging output
    
    problematic_cases = []
    
    for idx in sample_indices[:max_samples]:
        print(f"\n{'='*60}")
        print(f"DEBUGGING INDEX {idx}")
        print(f"{'='*60}")
        
        # Get the sample from master catalog
        sample_data = master_df[master_df['index'] == idx]
        if len(sample_data) == 0:
            print(f"No data found for index {idx}")
            continue
        
        # Take the first available sample (conceptual or computational)
        sample = sample_data.iloc[0]
        
        print(f"📍 Error type: {sample['error_type']}")
        print(f"📍 Source: {sample['source']}")
        print(f"📍 Tier: {sample['tier']}")
        print(f"📍 Stored solution_length: {sample['solution_length']}")
        print(f"📍 Stored relative_line_position: {sample['relative_line_position']}")
        
        # Get the raw solution
        raw_solution = sample['wrong_answer']
        print(f"\n--- RAW SOLUTION ---")
        print(repr(raw_solution))
        
        # Process the solution step by step
        print(f"\n--- PREPROCESSING STEPS ---")
        
        # Step 1: Convert literal \n to actual newlines
        step1 = raw_solution.replace('\\n', '\n')
        print(f"Step 1 - After \\n conversion:")
        print(repr(step1))
        
        # Step 2: Remove calculator annotations
        import re
        step2 = re.sub(r'<<.*?>>', '', step1)
        print(f"Step 2 - After calculator annotation removal:")
        print(repr(step2))
        
        # Step 3: Sanitize Unicode and commas (simplified)
        step3 = step2  # Skip for now to focus on line counting
        
        # Step 4: Split by newlines and count
        lines_before_filter = step3.split('\n')
        print(f"Step 4 - Lines before filtering:")
        for i, line in enumerate(lines_before_filter):
            print(f"  Line {i}: {repr(line)}")
        
        # Step 5: Filter empty lines
        non_empty_lines = [line.strip() for line in lines_before_filter if line.strip()]
        print(f"Step 5 - Non-empty lines after filtering:")
        for i, line in enumerate(non_empty_lines):
            print(f"  Line {i}: {repr(line)}")
        
        # Calculate actual vs expected
        actual_line_count = len(non_empty_lines)
        stored_line_count = int(sample['solution_length'])
        
        print(f"\n--- LINE COUNT COMPARISON ---")
        print(f"📊 Stored solution_length: {stored_line_count}")
        print(f"📊 Actual preprocessed lines: {actual_line_count}")
        print(f"📊 Difference: {actual_line_count - stored_line_count}")
        
        # Show what the original line counting logic would have done
        print(f"\n--- ORIGINAL LINE COUNTING SIMULATION ---")
        # The original logic probably counted lines in the raw text
        original_lines = raw_solution.replace('\\n', '\n').split('\n')
        original_non_empty = [line.strip() for line in original_lines if line.strip()]
        print(f"Original raw line count: {len(original_non_empty)}")
        
        # Check if calculator annotations affected line count
        with_calc_lines = step1.split('\n')
        without_calc_lines = step2.split('\n')
        calc_lines_before = len([line.strip() for line in with_calc_lines if line.strip()])
        calc_lines_after = len([line.strip() for line in without_calc_lines if line.strip()])
        
        print(f"Lines before calculator removal: {calc_lines_before}")
        print(f"Lines after calculator removal: {calc_lines_after}")
        print(f"Calculator annotations removed {calc_lines_before - calc_lines_after} lines")
        
        # Calculate what the error position should be
        if sample['relative_line_position'] is not None:
            # Original calculation (wrong)
            original_error_line = int(sample['relative_line_position'] * (stored_line_count - 1))
            
            # Corrected calculation
            corrected_error_line = int(sample['relative_line_position'] * (actual_line_count - 1))
            
            print(f"\n--- ERROR POSITION COMPARISON ---")
            print(f"Original error line (using stored length): {original_error_line}")
            print(f"Corrected error line (using actual length): {corrected_error_line}")
            print(f"Valid range for actual lines: 0 to {actual_line_count - 1}")
            
            # Check if original position is out of bounds
            if original_error_line >= actual_line_count:
                print(f"❌ Original position {original_error_line} is OUT OF BOUNDS for {actual_line_count} lines!")
            else:
                print(f"✅ Original position {original_error_line} is within bounds")
        
        problematic_cases.append({
            'index': idx,
            'stored_length': stored_line_count,
            'actual_length': actual_line_count,
            'difference': actual_line_count - stored_line_count,
            'error_type': sample['error_type'],
            'relative_position': sample['relative_line_position']
        })
    
    # Summary analysis
    print(f"\n{'='*70}")
    print("SUMMARY ANALYSIS")
    print(f"{'='*70}")
    
    differences = [case['difference'] for case in problematic_cases]
    
    print(f"📊 Analyzed {len(problematic_cases)} problematic cases")
    print(f"📊 Line count differences: {differences}")
    print(f"📊 Average difference: {np.mean(differences):.2f}")
    print(f"📊 Most common difference: {max(set(differences), key=differences.count)}")
    
    # Show patterns by error type
    conceptual_diffs = [case['difference'] for case in problematic_cases if case['error_type'] == 'conceptual_error']
    computational_diffs = [case['difference'] for case in problematic_cases if case['error_type'] == 'computational_error']
    
    print(f"\n--- PATTERNS BY ERROR TYPE ---")
    if conceptual_diffs:
        print(f"Conceptual errors: avg difference = {np.mean(conceptual_diffs):.2f}")
    if computational_diffs:
        print(f"Computational errors: avg difference = {np.mean(computational_diffs):.2f}")
    
    return problematic_cases

# Run the debugging
debug_results = debug_line_count_mismatch(master_df)

🔍 DEBUGGING LINE COUNT MISMATCH

DEBUGGING INDEX 9
📍 Error type: computational_error
📍 Source: manual
📍 Tier: tier2
📍 Stored solution_length: 9
📍 Stored relative_line_position: 0.75

--- RAW SOLUTION ---
'She works 8 hours a day for $18 per hour so she makes 8*18 = $144.00 per 8-hour shift\nShe works 10 hours a day and anything over 8 hours is eligible for overtime, so she gets 10-8 = 2 hours of overtime\nOvertime is calculated as time and a half so and she makes $18/hour so her overtime pay is 18*.5 = $9.00\nHer overtime pay is 18+9 = $27.00\nHer base pay is $144.00 per 8-hour shift and she works 5 days and makes 5 * $144 = $720.00\nHer overtime pay is $27.00 per hour and she works 2 hours of overtime per day and makes 27*2 = $54.00 in overtime pay\n2 hours of overtime pay for 5 days means she makes 54*5 = $250.00\nIn 5 days her base pay is $720.00 and she makes $250.00 in overtime pay so she makes $720 + $250 = $970.00\n#### 970'

--- PREPROCESSING STEPS ---
Step 1 - After \n convers

In [11]:
# ==============================================================================
# COMPREHENSIVE SOLUTION LENGTH ANALYSIS
# ==============================================================================
def analyze_solution_length_accuracy(master_df, sample_size=100):
    """
    Analyze how accurate the stored solution_length values are across the entire dataset.
    """
    print("🔍 COMPREHENSIVE SOLUTION LENGTH ANALYSIS")
    print("=" * 70)
    
    # Sample from the dataset
    sample_data = master_df.sample(min(sample_size, len(master_df)), random_state=42)
    
    mismatches = []
    perfect_matches = 0
    
    print(f"Analyzing {len(sample_data)} samples...")
    
    for _, sample in tqdm(sample_data.iterrows(), total=len(sample_data), desc="Analyzing"):
        raw_solution = sample['wrong_answer']
        stored_length = int(sample['solution_length'])
        
        # Apply the same preprocessing as in the dataset creation
        processed = raw_solution.replace('\\n', '\n')
        processed = re.sub(r'<<.*?>>', '', processed)  # Remove calculator annotations
        
        # Count actual lines
        lines = processed.split('\n')
        actual_lines = [line.strip() for line in lines if line.strip()]
        actual_length = len(actual_lines)
        
        if actual_length != stored_length:
            mismatches.append({
                'index': sample['index'],
                'error_type': sample['error_type'],
                'source': sample['source'],
                'tier': sample['tier'],
                'stored_length': stored_length,
                'actual_length': actual_length,
                'difference': actual_length - stored_length
            })
        else:
            perfect_matches += 1
    
    # Analysis results
    mismatch_rate = len(mismatches) / len(sample_data) * 100
    perfect_rate = perfect_matches / len(sample_data) * 100
    
    print(f"\n📊 ANALYSIS RESULTS:")
    print(f"   Perfect matches: {perfect_matches}/{len(sample_data)} ({perfect_rate:.1f}%)")
    print(f"   Mismatches: {len(mismatches)}/{len(sample_data)} ({mismatch_rate:.1f}%)")
    
    if mismatches:
        differences = [m['difference'] for m in mismatches]
        print(f"\n📊 MISMATCH STATISTICS:")
        print(f"   Average difference: {np.mean(differences):.2f}")
        print(f"   Std deviation: {np.std(differences):.2f}")
        print(f"   Range: {min(differences)} to {max(differences)}")
        print(f"   Most common difference: {max(set(differences), key=differences.count)}")
        
        # Show distribution of differences
        from collections import Counter
        diff_counts = Counter(differences)
        print(f"\n📊 DIFFERENCE DISTRIBUTION:")
        for diff, count in sorted(diff_counts.items()):
            percentage = count / len(mismatches) * 100
            print(f"   Difference {diff:+d}: {count} cases ({percentage:.1f}%)")
        
        # Show patterns by error type
        print(f"\n📊 PATTERNS BY ERROR TYPE:")
        for error_type in ['conceptual_error', 'computational_error']:
            type_mismatches = [m for m in mismatches if m['error_type'] == error_type]
            if type_mismatches:
                type_diffs = [m['difference'] for m in type_mismatches]
                print(f"   {error_type}: {len(type_mismatches)} mismatches, avg diff = {np.mean(type_diffs):.2f}")
        
        # Show a few examples
        print(f"\n📋 EXAMPLE MISMATCHES:")
        for i, mismatch in enumerate(mismatches[:5]):
            print(f"   {i+1}. Index {mismatch['index']} ({mismatch['error_type']}): "
                  f"stored={mismatch['stored_length']}, actual={mismatch['actual_length']}, "
                  f"diff={mismatch['difference']:+d}")
    
    return mismatches, perfect_matches

# Run comprehensive analysis
mismatches, perfect_matches = analyze_solution_length_accuracy(master_df, sample_size=len(master_df))

🔍 COMPREHENSIVE SOLUTION LENGTH ANALYSIS
Analyzing 24652 samples...


Analyzing:   0%|          | 0/24652 [00:00<?, ?it/s]


📊 ANALYSIS RESULTS:
   Perfect matches: 24652/24652 (100.0%)
   Mismatches: 0/24652 (0.0%)
