In [1]:
#

In [2]:
import pandas as pd
import regex as re
import os
from pathlib import Path
import json

In [3]:
from extract_persona_b_output import main


def clean_xml_string(s: str) -> str:
    """Strip markdown fencing and surrounding chatter from an XML payload.

    The clean-up runs in this order:

    1. Trim surrounding whitespace.
    2. Remove a code fence (``` or ```xml) that wraps the whole string. Only
       the very first and very last line are considered, so fence lines that
       are part of the document itself (for example markdown inside a CDATA
       section) are left alone.
    3. Strip any backticks still sitting at either edge.
    4. Drop everything before the ``<?xml`` declaration; when there is no
       declaration, drop everything before the first ``<module`` tag instead.
    5. Drop everything after the first ``</module>`` closing tag.

    Args:
        s: Raw text expected to contain one ``<module>...</module>`` document.

    Returns:
        The trimmed XML text. If no declaration, ``<module`` tag or
        ``</module>`` tag is found, the corresponding trimming step is
        skipped and the text is returned as far as it could be cleaned.
    """
    s = s.strip()

    # The string is already stripped, so a wrapping fence can only be the
    # first or last line. Anchoring with \A / \Z (instead of MULTILINE ^ / $)
    # keeps the substitution from deleting fence lines inside the payload.
    s = re.sub(r"\A```(?:xml)?\s*\n", "", s)
    s = re.sub(r"\n```\s*\Z", "", s)

    # Remove any remaining ``` markers at the edges
    s = s.strip("`").strip()

    # Find the actual XML start if there's junk before it. Without a
    # declaration, fall back to the opening tag of the root element.
    xml_start = s.find("<?xml")
    if xml_start == -1:
        xml_start = s.find("<module")
    if xml_start > 0:
        s = s[xml_start:]

    # Find the XML end if there's junk after it
    xml_end = s.find("</module>")
    if xml_end > 0:
        s = s[: xml_end + len("</module>")]

    return s.strip()


def fix_xml_encoding_issues(s: str) -> str:
    """Normalise typographic quotes and escape bare ampersands in XML text.

    Two repairs are applied:

    * Curly ("smart") double and single quotes are replaced by their ASCII
      equivalents everywhere in the string.
    * Outside CDATA sections, every ``&`` that does not already begin one of
      the five predefined entities (``amp``, ``lt``, ``gt``, ``quot``,
      ``apos``) or a numeric character reference is rewritten to ``&amp;``.
      CDATA content is literal text to an XML parser, so it is passed through
      unchanged; escaping there would corrupt embedded code such as ``a && b``.

    Args:
        s: XML text to repair.

    Returns:
        The repaired text. An empty string is returned unchanged.
    """
    # Written as \u escapes so the mapping cannot be flattened into a no-op by
    # an editor or export step that converts curly quotes to ASCII.
    for curly, plain in (
        ("\u201c", '"'),  # left double quotation mark
        ("\u201d", '"'),  # right double quotation mark
        ("\u2018", "'"),  # left single quotation mark
        ("\u2019", "'"),  # right single quotation mark
    ):
        s = s.replace(curly, plain)

    # Match & that's NOT followed by amp; lt; gt; quot; apos; or a numeric ref
    bare_ampersand = r"&(?!(?:amp|lt|gt|quot|apos|#\d+|#x[0-9a-fA-F]+);)"

    # Splitting on a capturing group keeps the CDATA sections in the result:
    # even indices are ordinary markup/text, odd indices are CDATA sections.
    pieces = re.split(r"(<!\[CDATA\[.*?\]\]>)", s, flags=re.DOTALL)
    for i in range(0, len(pieces), 2):
        pieces[i] = re.sub(bare_ampersand, "&amp;", pieces[i])

    return "".join(pieces)

# Load the pickled results table and run every RESULT entry through both
# clean-up passes: fence/junk removal first, character-level fixes second.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle('./RESULTS.pkl')
df['RESULT'] = (
    df['RESULT']
    .apply(clean_xml_string)
    .apply(fix_xml_encoding_issues)
)
df

Unnamed: 0,IDX,DATA,PROMPT,PROMPT_ID,NARRATIVE,RESULT,STATUS,ERROR
0,0,{'title': 'Mean Girls is The Context API (The ...,# The Implementation Translator\n\n## Your Tas...,1,"xml\n<?xml version=""1.0"" encoding=""UTF-8""?>\n<...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<modul...",succeeded,
1,1,{'title': 'Frankenstein is Controlled vs Uncon...,# The Implementation Translator\n\n## Your Tas...,2,"xml\n<?xml version=""1.0"" encoding=""UTF-8""?>\n<...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<modul...",succeeded,
2,2,"{'title': 'Fight Club is React.StrictMode', 'c...",# The Implementation Translator\n\n## Your Tas...,3,"xml\n<?xml version=""1.0"" encoding=""UTF-8""?>\n<...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<modul...",succeeded,
3,3,{'title': 'The Matrix is useEffect Dependencie...,# The Implementation Translator\n\n## Your Tas...,4,"xml\n<?xml version=""1.0"" encoding=""UTF-8""?>\n<...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<modul...",succeeded,
4,4,{'title': 'Inception is Component Composition'...,# The Implementation Translator\n\n## Your Tas...,5,"xml\n<?xml version=""1.0"" encoding=""UTF-8""?>\n<...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<modul...",succeeded,
5,5,{'title': 'Memento is State Management Without...,# The Implementation Translator\n\n## Your Tas...,6,"xml\n<?xml version=""1.0"" encoding=""UTF-8""?>\n<...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<modul...",succeeded,
6,6,{'title': 'Groundhog Day is Component Re-rende...,# The Implementation Translator\n\n## Your Tas...,7,"xml\n<?xml version=""1.0"" encoding=""UTF-8""?>\n<...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<modul...",succeeded,
7,7,"{'title': '1984 is State Mutation', 'concept':...",# The Implementation Translator\n\n## Your Tas...,8,"xml\n<?xml version=""1.0"" encoding=""UTF-8""?>\n<...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<modul...",succeeded,
8,8,{'title': 'Back to the Future is Prop Changes ...,# The Implementation Translator\n\n## Your Tas...,9,"xml\n<?xml version=""1.0"" encoding=""UTF-8""?>\n<...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<modul...",succeeded,
9,9,{'title': 'The Truman Show is Component Lifecy...,# The Implementation Translator\n\n## Your Tas...,10,"xml\n<?xml version=""1.0"" encoding=""UTF-8""?>\n<...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<modul...",succeeded,


In [4]:
# Run the extractor once per row of the RESULT column, handing it the XML
# text itself rather than a file name.
df['RESULT'].apply(
    lambda xml_text: main(
        input_file=xml_text,
        path=False,  # Key parameter! Presumably marks input_file as XML content, not a path (confirm in main)
        output_dir='./EXPORT',
        save_plan=False,
        save_snippets=False,
    )
)

📖 Processing XML string

📦 EXTRACTION SUMMARY
Module Name: Mean Girls: Context API
Slug: mean-girls-context-api

Files extracted: 1
  - src/modules/mean-girls-context-api/index.tsx

Integration snippets: 3
  - route
  - import
  - home_card

✅ Written: EXPORT\modules\mean-girls-context-api\index.tsx

✨ Extraction complete!

📋 Next steps:
1. Add import to App.tsx:
   import MeanGirlsContextAPI from "@modules/mean-girls-context-api";

2. Add route to App.tsx:
   <Route
  path="/mean-girls-context-api"
  element={
    <ModuleWrapper
      bgClass="bg-white"
      textClass="text-slate-700"
      fontClass="font-sans"
    >
      <MeanGirlsContextAPI />
    </ModuleWrapper>
  }
/>

3. Add card to src/modules/home/index.tsx:
   {
  path: "/mean-girls-context-api",
  title: "Mean Girls",
  subtitle: "North Shore High, 2004",
  concept: "Context API &amp; Prop Drilling",
  icon: Sparkles,
  colorClass: "text-pink-500",
  bgClass: "bg-pink-50 border-pink-300 hover:border-pink-500"

0     {'success': True, 'files_written': 1, 'output_...
1     {'success': True, 'files_written': 1, 'output_...
2     {'success': True, 'files_written': 1, 'output_...
3     {'success': True, 'files_written': 1, 'output_...
4     {'success': True, 'files_written': 1, 'output_...
5     {'success': True, 'files_written': 1, 'output_...
6     {'success': True, 'files_written': 1, 'output_...
7     {'success': True, 'files_written': 1, 'output_...
8     {'success': True, 'files_written': 1, 'output_...
9     {'success': True, 'files_written': 1, 'output_...
10    {'success': True, 'files_written': 1, 'output_...
11    {'success': True, 'files_written': 1, 'output_...
12    {'success': True, 'files_written': 1, 'output_...
13    {'success': True, 'files_written': 1, 'output_...
14    {'success': True, 'files_written': 1, 'output_...
15    {'success': True, 'files_written': 1, 'output_...
16    {'success': True, 'files_written': 1, 'output_...
17    {'success': True, 'files_written': 1, 'out