In [1]:
#

In [10]:
import pandas as pd
import regex as re
import os
from pathlib import Path
import json
import xml.sax.saxutils as saxutils


In [None]:
import xml.etree.ElementTree as ET

def parse_xml(xml_content):
    """Parse an XML string into its root element.

    Returns the root ``Element`` when ``xml_content`` is well-formed.
    When the parser raises ``ET.ParseError`` the error is printed and
    ``None`` is returned.
    """
    try:
        parsed_root = ET.fromstring(xml_content)
    except ET.ParseError as parse_error:
        print(f"‚ùå XML parsing error: {parse_error}")
        return None
    return parsed_root
        


In [11]:
import xml.etree.ElementTree as ET
import html

# Markup whose content is literal: '&' and '<' inside it must never be rewritten.
_LITERAL_MARKUP = ('<![CDATA[', '<!--', '<?')


def _escape_bare_ampersands(text):
    """Replace every '&' that does not start an XML entity reference with '&amp;'."""
    import re  # standard-library re, imported locally as the original function did

    return re.sub(r'&(?!(?:amp|lt|gt|quot|apos|#\d+|#x[0-9a-fA-F]+);)', '&amp;', text)


def _split_markup(xml_content):
    """Split XML text into alternating character-data and markup pieces.

    Even indexes of the returned list hold character data; odd indexes hold
    one markup token (CDATA section, comment, processing instruction, or a
    tag-like ``<name ...>`` / ``</name>`` / ``<!NAME ...>`` span). The split
    is regex-based and therefore heuristic on malformed input.
    """
    import re  # standard-library re, imported locally as the original function did

    return re.split(
        r'(<!\[CDATA\[.*?\]\]>|<!--.*?-->|<\?.*?\?>|<[!/]?[A-Za-z_][^<>]*>)',
        xml_content,
        flags=re.DOTALL,
    )


def _fix_by_escaping_ampersands(xml_content):
    """Strategy 1: escape bare ampersands in text and tags, leaving literal markup alone."""
    fixed = []
    for index, piece in enumerate(_split_markup(xml_content)):
        # '&' is literal inside CDATA/comments/PIs; escaping it there would
        # change the parsed text (e.g. 'a && b' -> 'a &amp;&amp; b').
        is_literal = index % 2 == 1 and piece.startswith(_LITERAL_MARKUP)
        fixed.append(piece if is_literal else _escape_bare_ampersands(piece))
    return ''.join(fixed)


def _fix_by_decoding_html_entities(xml_content):
    """Strategy 2: decode HTML entities in character data, then re-escape it for XML.

    Only character data is decoded and re-escaped. Tags keep their angle
    brackets (escaping them would leave no elements to parse) and only get
    bare ampersands escaped; CDATA, comments and processing instructions are
    passed through unchanged.
    """
    fixed = []
    for index, piece in enumerate(_split_markup(xml_content)):
        if index % 2 == 0:
            decoded = html.unescape(piece)
            # '&' must be escaped first so the '&lt;'/'&gt;' added next stay intact.
            decoded = decoded.replace('&', '&amp;')
            fixed.append(decoded.replace('<', '&lt;').replace('>', '&gt;'))
        elif piece.startswith(_LITERAL_MARKUP):
            fixed.append(piece)
        else:
            fixed.append(_escape_bare_ampersands(piece))
    return ''.join(fixed)


def _print_error_context(xml_content, error):
    """Print the lines surrounding the position recorded on a ``ParseError``."""
    try:
        # ParseError.position is a (line, column) tuple set by ElementTree.
        error_line, error_col = error.position
        lines = xml_content.split('\n')

        print(f"\nüîç Context around line {error_line}:")
        start = max(0, error_line - 2)
        end = min(len(lines), error_line + 1)

        for i in range(start, end):
            is_error_line = i == error_line - 1
            marker = ">>> " if is_error_line else "    "
            print(f"{marker}Line {i+1}: {lines[i][:100]}")
            if is_error_line:
                pad = ' ' * (error_col - 1)
                print(f"    {pad}^--- Error here")
    except (AttributeError, TypeError, ValueError):
        # Best-effort diagnostics only: never let them mask the parse failure.
        print("Could not extract error context")


def parse_xml(xml_content):
    """Parse an XML string and return its root element, with automatic error recovery.

    The content is first parsed as-is. If that raises ``ET.ParseError`` the
    following repairs are tried in order, each on the original content:

    1. Escape ampersands that do not start an XML entity reference
       (CDATA sections, comments and processing instructions are left as-is).
    2. Decode HTML entities in character data and re-escape ``&``, ``<`` and
       ``>`` there.

    Progress and diagnostics are printed.

    Args:
        xml_content: XML document as a string.

    Returns:
        The root ``Element``, or ``None`` when the content cannot be parsed
        even after the repairs.
    """
    try:
        # First attempt: parse as-is
        return ET.fromstring(xml_content)
    except ET.ParseError as e:
        # Keep the first error: its position refers to the unmodified content.
        initial_error = e

    print(f"‚ö†Ô∏è  Initial parse failed: {initial_error}")
    print("üîß Attempting to fix common XML issues...")

    strategies = (
        (_fix_by_escaping_ampersands, "‚úÖ Fixed by escaping ampersands"),
        (_fix_by_decoding_html_entities, "‚úÖ Fixed by decoding HTML entities"),
    )
    for repair, success_message in strategies:
        try:
            root = ET.fromstring(repair(xml_content))
        except ET.ParseError:
            continue
        print(success_message)
        return root

    print("‚ùå Could not automatically fix XML. Manual inspection required.")
    print(f"üìç Error location: {initial_error}")

    # Print context around error for debugging
    _print_error_context(xml_content, initial_error)

    return None

In [12]:

def _strip_xml_fence(raw_result):
    """Return ``raw_result`` as text without a leading ```xml / trailing ``` marker or outer whitespace."""
    text = str(raw_result).strip()
    text = text.removeprefix("```xml").removesuffix("```")
    return text.strip()


def _head_and_tail(text):
    """Return the first and last 10 characters of ``text`` as a tuple."""
    return (text[:10], text[-10:])


# NOTE(review): unpickling can execute arbitrary code; only load RESULTS.pkl from a trusted source.
df = pd.read_pickle('./RESULTS.pkl')
df['RESULT'] = df['RESULT'].apply(_strip_xml_fence)

# Tally how each cleaned result starts and ends.
df['RESULT'].apply(_head_and_tail).value_counts()


RESULT
(<?xml vers, \n</module>)    48
Name: count, dtype: int64

In [13]:
# Parse every cleaned RESULT string into an element tree root and store it in the XML column.
df['XML'] = df['RESULT'].apply(parse_xml)

‚ö†Ô∏è  Initial parse failed: not well-formed (invalid token): line 3, column 44
üîß Attempting to fix common XML issues...
‚úÖ Fixed by escaping ampersands
‚ö†Ô∏è  Initial parse failed: not well-formed (invalid token): line 3, column 42
üîß Attempting to fix common XML issues...
‚úÖ Fixed by escaping ampersands
‚ö†Ô∏è  Initial parse failed: not well-formed (invalid token): line 3, column 34
üîß Attempting to fix common XML issues...
‚úÖ Fixed by escaping ampersands


In [14]:
# Display the rows whose XML column is missing.
xml_missing = df['XML'].isna()
df[xml_missing]


Unnamed: 0,IDX,DATA,PROMPT,PROMPT_ID,NARRATIVE,RESULT,STATUS,ERROR,XML


In [9]:

# Print the RESULT text of the row at position 8 for manual inspection.
sample_result = df['RESULT'].iloc[8]
print(sample_result)

<?xml version="1.0" encoding="UTF-8"?>
<module>
  <module_name>Back to the Future: Props & Re-renders</module_name>
  <slug>back-to-future-prop-changes</slug>
  
  <plan><![CDATA[
## Component Architecture

**Core Structure:**
- Single-page component with chapter-based navigation (5 chapters)
- Fixed header with title and concept
- Main content area with narrative text and interactive demonstrations
- Fixed footer with chapter navigation controls
- Responsive layout: mobile-first, breakpoints at 768px and 1024px

**State Management:**
- `useState` for current chapter index (0-4)
- `useState` for interactive demo states (photo fade level, prop values)
- `useEffect` for smooth transitions between chapters
- `useMemo` for chapter content to avoid re-creating objects

**Visual Design Implementation:**
- Color scheme: Blue (#3b82f6, #1e40af) and Orange (#f97316, #ea580c)
- Background: Dark blue gradient (from slate-950 to blue-950)
- Accent colors: Electric blue for primary actions, orange 