In [None]:
#

In [None]:
import pandas as pd
import regex as re
import os

In [None]:
from pathlib import Path
import json

def debug_placeholder_replacement():
    """Print a diagnostic report for the white paper placeholder workflow.

    The report covers, in order: whether ``./WHITE_PAPER_v01.md`` can be
    read, every line of it that mentions a placeholder, the ``APPENDIX_*``
    files found under ``./appendices``, which of the expected placeholders
    occur in the text, and which appendix file each placeholder maps to.

    Nothing is modified. The function returns ``None``, and returns early
    when the white paper or the appendices directory is missing.
    """
    # Expected placeholders, keyed by the appendix filename prefix feeding them.
    prefix_to_placeholder = {
        'APPENDIX_A': 'PYTHON_SCRIPT_A_PLACEHOLDER',
        'APPENDIX_B': 'PYTHON_SCRIPT_B_PLACEHOLDER',
        'APPENDIX_C': 'LLM_CONVERSATION_C_PLACEHOLDER',
        'APPENDIX_D': 'TOOLKIT_D_PLACEHOLDER',
    }

    # Step 1: load the white paper, bailing out if it is absent.
    try:
        with open('./WHITE_PAPER_v01.md', encoding='utf-8') as handle:
            document = handle.read()
    except FileNotFoundError:
        print("✗ WHITE_PAPER_v01.md not found")
        return
    print("✓ Successfully read WHITE_PAPER_v01.md")

    # Step 2: list every line mentioning a placeholder (case-insensitive).
    print("\n=== PLACEHOLDERS FOUND IN WHITE PAPER ===")
    for number, text in enumerate(document.split('\n'), 1):
        if '_placeholder' in text.lower():
            print(f"Line {number}: {text.strip()}")

    # Step 3: make sure the appendices directory is there.
    appendix_root = Path('./appendices')
    if not appendix_root.exists():
        print(f"\n✗ Appendices directory '{appendix_root}' does not exist")
        return
    print(f"\n✓ Appendices directory exists: {appendix_root}")

    # Step 4: list the appendix files in sorted order.
    found_files = sorted(appendix_root.glob('APPENDIX_*'))
    print("\n=== APPENDIX FILES FOUND ===")
    for entry in found_files:
        print(f"- {entry.name}")

    # Step 5: report which expected placeholders occur in the text.
    print("\n=== CHECKING PLACEHOLDER MATCHES ===")
    for token in prefix_to_placeholder.values():
        if token in document:
            print(f"✓ Found '{token}' in white paper")
        else:
            print(f"✗ '{token}' NOT found in white paper")

    # Step 6: pair each placeholder with the appendix file carrying its
    # prefix; when several files share a prefix the last one (sorted) wins.
    mapping = {}
    for entry in found_files:
        for prefix, token in prefix_to_placeholder.items():
            if entry.name.startswith(prefix):
                mapping[token] = entry
                break

    print("\n=== FILE MAPPING ===")
    for token, entry in mapping.items():
        print(f"'{token}' -> '{entry.name}'")
        print("  ✓ File exists" if entry.exists() else "  ✗ File does not exist")

def read_file(file_path, *, max_chunks=9):
    """Return the text of *file_path*, rendering ``APPENDIX_C`` as a JSON block.

    Args:
        file_path: Location of the UTF-8 text file, as a ``str`` or any
            ``os.PathLike`` object.
        max_chunks: How many leading entries of the ``chunks`` list to keep
            when rendering ``APPENDIX_C``. ``None`` keeps every entry.

    Returns:
        The file content as a string. For a file named exactly
        ``APPENDIX_C`` the content is parsed as JSON; the ``chunks`` list
        (looked up under ``chunkedPrompt`` when that key exists, otherwise
        at the top level) is truncated to *max_chunks* entries and returned
        inside a fenced ``json`` code block. If the JSON is invalid or does
        not have that structure, the raw content is returned inside the same
        kind of fence. An empty string is returned when the file cannot be
        read; the error is printed rather than raised.
    """
    # Only I/O, decoding and bad-argument failures are treated as "unreadable";
    # anything else is a programming error and should surface.
    try:
        path = Path(file_path)  # accept plain strings as well as Path objects
        content = path.read_text(encoding='utf-8')
    except (OSError, ValueError, TypeError) as e:
        print(f"Error reading {file_path}: {e}")
        return ""

    if path.name != 'APPENDIX_C':
        return content

    # Special handling for APPENDIX_C (JSON file).
    try:
        json_data = json.loads(content)
        chunked_prompt = json_data.get('chunkedPrompt', json_data)
        chunks = chunked_prompt['chunks'][:max_chunks]
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON from {file_path}: {e}")
    except (AttributeError, KeyError, TypeError) as e:
        # Valid JSON, but not the expected {"chunkedPrompt": {"chunks": [...]}}
        # layout: keep the content instead of silently dropping it.
        print(f"Unexpected JSON structure in {file_path}: {e!r}")
    else:
        return f"```json\n{json.dumps(chunks, indent=2)}\n```"

    # Fallback: raw content in a code block.
    return f"```json\n{content}\n```"

def fix_placeholder_replacement():
    """Fill the appendix placeholders of the white paper and write the result.

    Reads ``./WHITE_PAPER_v01.md``, pairs every ``./appendices/APPENDIX_*``
    file with its placeholder by filename prefix, substitutes each
    placeholder with the file content obtained from ``read_file`` and writes
    the outcome to ``./white_paper.md``. Progress, problems and a final
    summary are printed to stdout.

    All substitutions are made in a single pass over the original text, so
    placeholder names that merely appear inside inserted appendix content
    (for instance an appendix that is this very script) are left alone. For
    the same reason the final "remaining placeholders" check only looks at
    the white paper's own text, not at the inserted appendices.

    Returns:
        None.

    Raises:
        OSError: If the white paper cannot be read or the output cannot be
            written (``FileNotFoundError`` when the white paper is missing).
    """
    # Standard-library regex, imported locally so this function does not
    # depend on whatever the module-level ``re`` alias points to.
    import re

    prefix_to_placeholder = {
        'APPENDIX_A': 'PYTHON_SCRIPT_A_PLACEHOLDER',
        'APPENDIX_B': 'PYTHON_SCRIPT_B_PLACEHOLDER',
        'APPENDIX_C': 'LLM_CONVERSATION_C_PLACEHOLDER',
        'APPENDIX_D': 'TOOLKIT_D_PLACEHOLDER',
    }

    # Read the white paper
    with open('./WHITE_PAPER_v01.md', encoding='utf-8') as f:
        template = f.read()

    # Create explicit mapping; when several files share a prefix the last
    # one in sorted order wins.
    replacements = {}
    for file in sorted(Path('./appendices').glob('APPENDIX_*')):
        for prefix, placeholder in prefix_to_placeholder.items():
            if file.name.startswith(prefix):
                replacements[placeholder] = file
                break

    # Collect the replacement text for every placeholder that can be filled.
    contents = {}
    for placeholder, file_path in replacements.items():
        placeholder_present = placeholder in template
        file_present = file_path.exists()
        if not placeholder_present:
            print(f"✗ Placeholder '{placeholder}' not found in white paper")
        if not file_present:
            print(f"✗ File '{file_path.name}' does not exist")
        if not (placeholder_present and file_present):
            continue

        content = read_file(file_path)
        if content:
            contents[placeholder] = content
            print(f"✓ Replaced '{placeholder}' with content from '{file_path.name}'")
        else:
            print(f"✗ Could not read content from '{file_path.name}'")

    if contents:
        # Longest names first so a placeholder can never be shadowed by a
        # shorter one that happens to be its prefix.
        names = sorted(contents, key=len, reverse=True)
        pattern = re.compile('|'.join(re.escape(name) for name in names))
        # A callable replacement inserts the content verbatim (no backslash
        # or group-reference processing) and never rescans inserted text.
        white_paper = pattern.sub(lambda match: contents[match.group(0)], template)
        # The white paper's own text with the filled placeholders removed;
        # used below to look for placeholders that were NOT filled.
        unresolved_text = pattern.sub('', template)
    else:
        white_paper = unresolved_text = template
    replacement_count = len(contents)

    # Write the updated white paper
    with open('./white_paper.md', mode='w', encoding='utf-8') as f:
        f.write(white_paper)

    print(f"\n✓ Completed! Made {replacement_count} replacements")
    print("✓ Output written to 'white_paper.md'")

    # Verify no placeholders remain
    remaining_placeholders = [
        word for word in unresolved_text.split()
        if '_placeholder' in word.lower()
    ]

    if remaining_placeholders:
        print(f"\n⚠ Warning: {len(remaining_placeholders)} placeholders still remain:")
        for placeholder in sorted(set(remaining_placeholders)):
            print(f"  - {placeholder}")
    else:
        print("\n✓ All placeholders successfully replaced!")

# Diagnostics first: report what the replacement step would be working with.
print("=== DEBUGGING PLACEHOLDER REPLACEMENT ===")
debug_placeholder_replacement()

# Separator plus banner for the replacement step. The replacement call itself
# is currently commented out, so only the banner is printed.
print("\n" + "=" * 50, "=== RUNNING FIXED REPLACEMENT ===", sep="\n")
# fix_placeholder_replacement()

In [None]:
import json
# Load APPENDIX_C for interactive inspection. `content` keeps the raw text,
# and `json_data` ends up holding only the 'chunkedPrompt' section of the
# parsed JSON (a KeyError is raised if that key is absent).
with open('./appendices/APPENDIX_C', encoding='utf-8') as f:
    content = f.read()
    json_data = json.loads(content)
    json_data = json_data['chunkedPrompt']



In [None]:
# Put the 'chunks' entries into a DataFrame for easier browsing; relies on
# `json_data` having been loaded by an earlier cell.
df = pd.DataFrame(json_data['chunks'])
# print(df['text'].iloc[9])

In [None]:
# Display the chunk at index 8 (the ninth entry of the 'chunks' list).
json_data['chunks'][8]

In [None]:
# Scratch cell: evaluate the upper-cased form of this name.
"Developer_Simulation_Starter_Kit".upper()