In [1]:
# Setup Environment
import os
import json
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import warnings
import openai
from dotenv import load_dotenv
# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Read variables from the local .env file
load_dotenv()

# Startup banner
print("🔬 Simple LLM Migration Tool Initialized")
print(f"⚙️  Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Build the OpenRouter client. `client` stays None whenever setup fails,
# so later cells can detect an unusable configuration.
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')
client = None
try:
    if not OPENROUTER_API_KEY:
        raise ValueError("OPENROUTER_API_KEY not found in environment variables. Please check your .env file.")

    # OpenRouter is reached through the OpenAI client with a custom base URL
    client = openai.OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=OPENROUTER_API_KEY,
    )
except Exception as init_error:
    print(f"❌ Error initializing OpenRouter client: {init_error}")
    print("💡 Make sure you have a .env file with OPENROUTER_API_KEY=your_key_here")
else:
    print("✅ OpenRouter client initialized successfully with environment variable")

print("\n🌐 Visit https://openrouter.ai/models to browse available models")
print("📋 Use model IDs like 'anthropic/claude-3.5-sonnet', 'meta-llama/llama-3.1-8b-instruct', etc.")


🔬 Simple LLM Migration Tool Initialized
⚙️  Timestamp: 2025-09-02 00:49:10
✅ OpenRouter client initialized successfully with environment variable

🌐 Visit https://openrouter.ai/models to browse available models
📋 Use model IDs like 'anthropic/claude-3.5-sonnet', 'meta-llama/llama-3.1-8b-instruct', etc.


In [70]:
# LOAD TEST FILES
# Maps PHP file name -> file contents; filled by the loading cell further down.
test_files = {}
# Join the path components with pathlib instead of embedding a backslash:
# "\e" is an invalid string escape and a literal backslash is only a path
# separator on Windows, so the old spelling pointed at a non-existent
# single-component path everywhere else.
old_version_path = Path('selected_100_files') / 'extra_large_1000_plus'

In [71]:
# BASIC PROMPTING STRATEGY
# Whole-file prompt with no explicit migration checklist. `{code}` is the only
# placeholder; create_prompt() fills it through str.format(). The template asks
# the model to wrap its answer in MIGRATION_START / MIGRATION_END comment
# markers placed outside the PHP code.
BASIC_PROMPT_TEMPLATE = """You are a senior PHP developer with expertise in legacy code modernization. Your task is to migrate this PHP code to PHP 8.3 standards while maintaining functional equivalence.

Please migrate the following PHP code to PHP 8.3:

{code}

Your response should follow this EXACT format:

// MIGRATION_START
[your migrated PHP code here]
// MIGRATION_END

CRITICAL FORMATTING REQUIREMENT: 
- Place the MIGRATION_START marker BEFORE the opening <?php tag
- Place the MIGRATION_END marker AFTER the closing PHP code
- Do NOT place these markers inside the PHP code itself

Provide only the migrated PHP code with the markers placed correctly outside the PHP code block, no additional commentary."""

# Status message only; nothing is validated here.
print("✅ Basic prompting strategy configured with fixed marker placement")

✅ Basic prompting strategy configured with fixed marker placement


In [72]:
# COMPREHENSIVE PROMPTING STRATEGY
# Whole-file prompt that adds an explicit seven-point list of migration
# requirements on top of the basic instructions. `{code}` is the only
# placeholder; create_prompt() fills it through str.format(). The response
# format (MIGRATION_START / MIGRATION_END markers outside the PHP code) is the
# same as in the basic template.
COMPREHENSIVE_PROMPT_TEMPLATE = """You are a senior PHP developer with expertise in legacy code modernization. Your task is to migrate old PHP code up to PHP 8.3 standards while maintaining the functionality of the original code.

Migration Requirements:
1. Update deprecated syntax
2. Replace deprecated functions
3. Implement modern PHP features
4. Improve security and code quality
5. Maintain functional equivalence
6. Enforce strict typing
7. Adopt core PHP 8.3 constructs

Please migrate the following PHP code to PHP 8.3:

{code}



Your response should follow this EXACT format:

// MIGRATION_START
[your migrated PHP code here]
// MIGRATION_END

CRITICAL FORMATTING REQUIREMENT: 
- Place the MIGRATION_START marker BEFORE the opening <?php tag
- Place the MIGRATION_END marker AFTER the closing PHP code
- Do NOT place these markers inside the PHP code itself

Include the markers as comments OUTSIDE the PHP code block. Keep the original comments as they are.
Do not add any other text, explanations, or commentary outside the markers. Make sure you give the COMPLETE migrated code."""

# CHUNKING PROMPTS FOR LARGE FILES
# Basic prompt for one segment of a file that was split by line count.
# Placeholders filled by create_prompt(): {code}, {filename}, {start_line},
# {end_line}, {total_lines}, {chunk_number}, {total_chunks}.
# The instructions tell the model not to add <?php / ?> tags or to complete
# code that lies outside the segment.
CHUNK_BASIC_PROMPT_TEMPLATE = """You are a senior PHP developer with expertise in legacy code modernization. Your task is to migrate this PARTIAL SEGMENT of a larger PHP file up to PHP 8.3 standards.

CONTEXT:
- Original file: {filename}
- Processing lines: {start_line} to {end_line} (of {total_lines} total lines)
- This is chunk {chunk_number} of {total_chunks}

CRITICAL INSTRUCTIONS:
1. This is only a SEGMENT of the complete file
2. Do NOT add opening <?php tags if the code segment doesn't start with one
3. Do NOT add closing ?> tags 
4. Do NOT try to complete missing parts or add code that isn't provided
5. Preserve the exact structure - if it starts with a method, start with that method
6. If it starts mid-class, do NOT add class opening braces

Please migrate ONLY the following PHP code segment to PHP 8.3:

{code}

Your response should follow this EXACT format:

// MIGRATION_START
[your migrated code segment here - exactly as provided, no extra <?php tags]
// MIGRATION_END

CRITICAL FORMATTING REQUIREMENT: 
- Place the MIGRATION_START marker BEFORE the code segment
- Place the MIGRATION_END marker AFTER the code segment
- Do NOT place these markers inside the PHP code itself

Migrate only the provided code segment. Do not add missing functions, classes, or try to complete the file."""

# Comprehensive prompt for one segment of a chunked file: the same segment
# context and "do not complete the file" instructions as the basic chunk
# prompt, plus the seven-point migration requirement list.
# Placeholders filled by create_prompt(): {code}, {filename}, {start_line},
# {end_line}, {total_lines}, {chunk_number}, {total_chunks}.
CHUNK_COMPREHENSIVE_PROMPT_TEMPLATE = """You are a senior PHP developer with expertise in legacy code modernization. Your task is to migrate this PARTIAL SEGMENT of a larger PHP file to PHP 8.3 standards while maintaining functional equivalence.

CONTEXT:
- Original file: {filename}
- Processing lines: {start_line} to {end_line} (of {total_lines} total lines)  
- This is chunk {chunk_number} of {total_chunks}

CRITICAL INSTRUCTIONS:
1. This is only a SEGMENT of the complete file
2. Do NOT add opening <?php tags if the code segment doesn't start with one
3. Do NOT add closing ?> tags
4. Do NOT try to complete missing parts or add code that isn't provided
5. Preserve the exact structure - if it starts with a method, start with that method
6. If it starts mid-class, do NOT add class opening braces

Migration Requirements for this segment:
1. Update deprecated syntax
2. Replace deprecated functions
3. Implement modern PHP features
4. Improve security and code quality
5. Maintain functional equivalence
6. Enforce strict typing
7. Adopt core PHP 8.3 constructs


Please migrate ONLY the following PHP code segment to PHP 8.3:

{code}

Your response should follow this EXACT format:

// MIGRATION_START
[your migrated code segment here - exactly as provided, no extra <?php tags]
// MIGRATION_END

CRITICAL FORMATTING REQUIREMENT: 
- Place the MIGRATION_START marker BEFORE the code segment
- Place the MIGRATION_END marker AFTER the code segment
- Do NOT place these markers inside the PHP code itself

Include the markers as comments OUTSIDE the code segment. Keep the original comments as they are.
Migrate only the provided code segment. Do not add missing functions, classes, or try to complete the file."""

# Status messages only; the templates themselves are not validated here.
print("✅ Comprehensive and chunking prompting strategies configured with fixed marker placement")
print("🔧 Updated all prompts to prevent placing MIGRATION markers inside PHP code")

✅ Comprehensive and chunking prompting strategies configured with fixed marker placement
🔧 Updated all prompts to prevent placing MIGRATION markers inside PHP code


In [73]:
# Maximum number of source lines per chunk. Used as the fallback whenever a
# chunk_size / chunk_threshold argument is left as None.
DEFAULT_CHUNK_SIZE = 500  # Set your preferred chunk size here

In [74]:
# PROMPT HELPER FUNCTION
# Registry of strategy name -> prompt template. Names starting with "chunk_"
# are the per-segment variants used for files split by line count.
PROMPT_TEMPLATES = dict(
    basic=BASIC_PROMPT_TEMPLATE,
    comprehensive=COMPREHENSIVE_PROMPT_TEMPLATE,
    chunk_basic=CHUNK_BASIC_PROMPT_TEMPLATE,
    chunk_comprehensive=CHUNK_COMPREHENSIVE_PROMPT_TEMPLATE,
)


def create_prompt(code: str, strategy: str = "basic", **kwargs) -> str:
    """Render the prompt template registered for ``strategy``.

    Args:
        code: PHP source text substituted for the ``{code}`` placeholder.
        strategy: Key into ``PROMPT_TEMPLATES``.
        **kwargs: Extra template fields. Strategies whose name starts with
            ``chunk_`` require ``filename``, ``start_line``, ``end_line``,
            ``total_lines``, ``chunk_number`` and ``total_chunks``.

    Returns:
        The formatted prompt string.

    Raises:
        ValueError: If the strategy is not registered, or a chunking strategy
            is missing any of its required fields.
    """
    if strategy not in PROMPT_TEMPLATES:
        available = list(PROMPT_TEMPLATES.keys())
        raise ValueError(f"Unknown prompting strategy: {strategy}. Available: {available}")

    prompt_template = PROMPT_TEMPLATES[strategy]

    # Chunk templates carry positional context about the segment.
    if strategy.startswith('chunk_'):
        missing_params = []
        for field in ('filename', 'start_line', 'end_line', 'total_lines', 'chunk_number', 'total_chunks'):
            if field not in kwargs:
                missing_params.append(field)
        if missing_params:
            raise ValueError(f"Chunking strategy requires parameters: {missing_params}")

    return prompt_template.format(code=code, **kwargs)

def chunk_code(code: str, chunk_size: int = None) -> list:
    """Split code into consecutive chunks of at most ``chunk_size`` lines.

    Every element of the result has the same shape, including the case where
    the whole input fits into a single chunk, so callers can index
    ``chunk['start_line']`` etc. without special-casing small files.

    Args:
        code: Source text; lines are separated on ``'\\n'``.
        chunk_size: Maximum lines per chunk. ``None`` selects
            ``DEFAULT_CHUNK_SIZE``.

    Returns:
        A list of dicts with the keys ``code`` (the chunk text),
        ``start_line`` and ``end_line`` (1-based, inclusive) and
        ``total_lines`` (line count of the whole input).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size is None:
        chunk_size = DEFAULT_CHUNK_SIZE
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    lines = code.split('\n')
    total_lines = len(lines)

    chunks = []
    for offset in range(0, total_lines, chunk_size):
        segment = lines[offset:offset + chunk_size]
        chunks.append({
            'code': '\n'.join(segment),
            'start_line': offset + 1,
            # The last chunk may be shorter than chunk_size.
            'end_line': min(offset + chunk_size, total_lines),
            'total_lines': total_lines,
        })

    return chunks

# Summary of what this cell configured (one message per output line).
print(
    "🎯 All prompting strategies configured and ready",
    f"📋 Available strategies: {list(PROMPT_TEMPLATES)}",
    "🔧 Added chunking utilities for large files",
    f"📦 Default chunk size: {DEFAULT_CHUNK_SIZE} lines",
    sep="\n",
)

🎯 All prompting strategies configured and ready
📋 Available strategies: ['basic', 'comprehensive', 'chunk_basic', 'chunk_comprehensive']
🔧 Added chunking utilities for large files
📦 Default chunk size: 500 lines


In [75]:
# NEW ORGANIZED CHUNKING SYSTEM
import requests
import json

def migrate_file_chunked(filename: str, original_code: str, model_name: str, strategy: str, api_key: str, chunk_size: int):
    """Migrate one file chunk by chunk, one API call per chunk.

    Raw responses are written by ``make_api_call_chunked`` into
    ``chunked_model_output/<model>/<file name without .php>/``.

    Args:
        filename: Name of the PHP file; used in prompts and for the folder name.
        original_code: Full source text of the file.
        model_name: OpenRouter model id; the part after the last ``/`` becomes
            the model folder name (``-``, ``:`` and ``.`` replaced by ``_``,
            lower-cased).
        strategy: Prompt strategy; prefixed with ``chunk_`` unless it already is.
        api_key: API key forwarded to the request helper.
        chunk_size: Maximum lines per chunk, forwarded to ``chunk_code``.

    Returns:
        A list with one entry per chunk: the raw model response, or ``None``
        for a chunk whose request failed.
    """
    chunks = chunk_code(original_code, chunk_size)
    total_chunks = len(chunks)

    print(f" Split into {total_chunks} chunks of ~{chunk_size} lines each")

    # chunked_model_output / <model> / <file>
    output_root = Path('chunked_model_output')
    output_root.mkdir(exist_ok=True)

    model_short = model_name.split('/')[-1]
    for separator in ('-', ':', '.'):
        model_short = model_short.replace(separator, '_')
    model_dir = output_root / model_short.lower()
    model_dir.mkdir(exist_ok=True)

    file_dir = model_dir / filename.replace('.php', '')
    file_dir.mkdir(exist_ok=True)

    print(f"📁 Saving chunks to: {file_dir}")

    # Always use the chunk-aware variant of the requested strategy
    chunk_strategy = strategy if strategy.startswith('chunk_') else f"chunk_{strategy}"

    all_responses = []
    for chunk_number, chunk_info in enumerate(chunks, start=1):
        first_line = chunk_info['start_line']
        last_line = chunk_info['end_line']
        print(f"\n[Chunk {chunk_number}/{total_chunks}] Processing lines {first_line}-{last_line}...")

        prompt = create_prompt(
            chunk_info['code'],
            chunk_strategy,
            filename=filename,
            start_line=first_line,
            end_line=last_line,
            total_lines=chunk_info['total_lines'],
            chunk_number=chunk_number,
            total_chunks=total_chunks,
        )
        print(f"📏 Chunk prompt length: {len(prompt):,} characters")

        response = make_api_call_chunked(
            f"{filename}_chunk_{chunk_number}", prompt, model_name, api_key,
            chunk_strategy, file_dir, chunk_number,
        )

        # Failed chunks keep their slot as None so positions match chunk numbers.
        all_responses.append(response)
        if response is None:
            print(f"❌ Failed to process chunk {chunk_number}")
        else:
            print(f"✅ Chunk {chunk_number} processed successfully")

    successful_chunks = len([entry for entry in all_responses if entry is not None])
    print("\n🎉 Chunked migration completed!")
    print(f"✅ Successful chunks: {successful_chunks}/{total_chunks}")
    print(f"📁 All chunks saved in: {file_dir}")

    return all_responses

def make_api_call_chunked(filename: str, prompt: str, model_name: str, api_key: str, strategy: str, file_dir: Path, chunk_number: int):
    """Send one chunk prompt to OpenRouter and store the raw reply.

    The reply is written to ``<file_dir>/<chunk_number>.txt`` preceded by a
    short metadata header.

    Args:
        filename: Label written into the header (the caller passes
            ``<file>_chunk_<n>``).
        prompt: Full prompt text sent as the single user message.
        model_name: OpenRouter model id.
        api_key: Bearer token for the Authorization header.
        strategy: Strategy name, recorded in the header only.
        file_dir: Existing directory that receives the chunk file.
        chunk_number: 1-based chunk index; also the output file name.

    Returns:
        The raw response text, or ``None`` on any HTTP error, parsing error,
        empty/too-short reply, or request exception (a message is printed).
    """
    request_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://github.com/research-project",
        "X-Title": "LLM PHP Migration Research"
    }
    request_body = {
        "model": model_name,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 80000,
        "temperature": 0.3
    }

    try:
        print("🔗 Making API call to OpenRouter...")
        http_response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers=request_headers,
            data=json.dumps(request_body),
            timeout=300
        )

        status = http_response.status_code
        body_text = http_response.text
        print(f"📊 HTTP Status: {status}")
        print(f"📏 Response length: {len(body_text)} characters")

        if status != 200:
            print(f"❌ API Error: {status}")
            print(f"📄 Response: {body_text[:1000]}...")
            return None

        # Extract the assistant message from the chat-completions payload
        try:
            raw_response = http_response.json()['choices'][0]['message']['content']
        except (json.JSONDecodeError, KeyError, IndexError) as parse_error:
            print(f"❌ Response parsing error: {parse_error}")
            return None

        # Replies under 10 non-blank characters are treated as failures
        if not raw_response or len(raw_response.strip()) < 10:
            print("❌ Model response is empty or too short")
            return None

        # Output file is named after the chunk number only
        chunk_file = file_dir / f"{chunk_number}.txt"
        with open(chunk_file, 'w', encoding='utf-8') as out:
            header_lines = [
                "=== RAW MODEL RESPONSE ===",
                f"File: {filename}",
                f"Model: {model_name}",
                f"Strategy: {strategy}",
                f"Chunk: {chunk_number}",
                f"Length: {len(raw_response)} characters",
                f"Timestamp: {datetime.now()}",
                "=" * 50,
            ]
            out.write("\n".join(header_lines) + "\n\n")
            out.write(raw_response)

        print(f"✅ Chunk saved to: {chunk_file}")
        return raw_response

    except requests.exceptions.RequestException as request_error:
        print(f"❌ Request error: {request_error}")
        return None
    except Exception as unexpected_error:
        print(f"❌ Unexpected error: {unexpected_error}")
        return None

# Keep the existing functions for single files
def migrate_file(filename: str, model_name: str, strategy: str = "basic", api_key: str = None,
                chunk_size: int = None, auto_chunk: bool = True):
    """Migrate one loaded test file, chunking it automatically when large.

    Args:
        filename: Key into ``test_files``.
        model_name: OpenRouter model id.
        strategy: Prompt strategy name (see ``PROMPT_TEMPLATES``).
        api_key: API key for the request. ``None`` means "use the current
            value of ``OPENROUTER_API_KEY``", looked up when the function is
            called rather than when it was defined, so a key loaded or
            refreshed after this cell ran is still picked up.
        chunk_size: Maximum lines per chunk; ``None`` selects
            ``DEFAULT_CHUNK_SIZE``.
        auto_chunk: When true, files with more lines than ``chunk_size`` are
            migrated chunk by chunk.

    Returns:
        The raw response string for a single-file migration, a list of
        per-chunk responses for a chunked migration, or ``None`` when the
        file is unknown, no API key is available, or the request failed.
    """
    if chunk_size is None:
        chunk_size = DEFAULT_CHUNK_SIZE

    if filename not in test_files:
        print(f"❌ File '{filename}' not found")
        return None

    # Resolve the key at call time; a def-time default would freeze whatever
    # value (possibly None) the variable had when this cell was executed.
    if api_key is None:
        api_key = OPENROUTER_API_KEY
    if not api_key:
        print("❌ No API key available. Set OPENROUTER_API_KEY or pass api_key explicitly.")
        return None

    original_code = test_files[filename]
    line_count = len(original_code.split('\n'))

    print(f"🚀 Migrating {filename} using {model_name} with {strategy} strategy...")
    print(f"📏 Input code length: {len(original_code):,} characters ({line_count:,} lines)")

    # Decide whether to chunk
    should_chunk = auto_chunk and line_count > chunk_size

    if should_chunk:
        print(f"📦 Large file detected ({line_count} lines) - using NEW organized chunking")
        return migrate_file_chunked(filename, original_code, model_name, strategy, api_key, chunk_size)

    print(f"📄 Processing as single file ({line_count} lines, chunk limit: {chunk_size})")
    return migrate_file_single(filename, original_code, model_name, strategy, api_key)

def migrate_file_single(filename: str, original_code: str, model_name: str, strategy: str, api_key: str):
    """Migrate a whole file with one request (no chunking).

    Builds the prompt for ``strategy`` from the complete source and hands it
    to ``make_api_call``, whose result (raw response or ``None``) is returned.
    """
    full_prompt = create_prompt(original_code, strategy)
    prompt_size = len(full_prompt)
    print(f"📏 Prompt length: {prompt_size:,} characters")

    result = make_api_call(filename, full_prompt, model_name, api_key, strategy)
    return result

def make_api_call(filename: str, prompt: str, model_name: str, api_key: str, strategy: str):
    """Send a whole-file prompt to OpenRouter and store the raw reply.

    The reply is written to ``model_output/<model>/<file name without .php>.txt``
    preceded by a short metadata header. Validation mirrors
    ``make_api_call_chunked``: an empty or very short reply is treated as a
    failure and is neither saved nor returned, so callers that test
    ``result is not None`` do not count it as a success.

    Args:
        filename: Name of the PHP file being migrated.
        prompt: Full prompt text sent as the single user message.
        model_name: OpenRouter model id; the part after the last ``/`` becomes
            the model folder name (``-``, ``:`` and ``.`` replaced by ``_``,
            lower-cased).
        api_key: Bearer token for the Authorization header.
        strategy: Strategy name, recorded in the header only.

    Returns:
        The raw response text, or ``None`` on an HTTP error, an empty or
        too-short reply, or any exception (a message is printed).
    """
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://github.com/research-project",
        "X-Title": "LLM PHP Migration Research"
    }

    payload = {
        "model": model_name,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 80000,
        "temperature": 0.3
    }

    try:
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers=headers,
            data=json.dumps(payload),
            timeout=300
        )

        if response.status_code != 200:
            print(f"❌ API Error: {response.status_code}")
            # Show the start of the body so the cause is visible in the notebook
            print(f"📄 Response: {response.text[:1000]}...")
            return None

        result = response.json()
        raw_response = result['choices'][0]['message']['content']

        # Same threshold as the chunked path: under 10 non-blank characters
        # cannot be a migrated file.
        if not raw_response or len(raw_response.strip()) < 10:
            print("❌ Model response is empty or too short")
            return None

        # model_output / <model> / <file>.txt
        model_short = model_name.split('/')[-1].replace('-', '_').replace(':', '_').replace('.', '_').lower()
        model_folder = Path('model_output') / model_short
        model_folder.mkdir(parents=True, exist_ok=True)

        base_name = filename.replace('.php', '')
        output_file = model_folder / f"{base_name}.txt"

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write("=== RAW MODEL RESPONSE ===\n")
            f.write(f"File: {filename}\n")
            f.write(f"Model: {model_name}\n")
            f.write(f"Strategy: {strategy}\n")
            f.write(f"Length: {len(raw_response)} characters\n")
            f.write(f"Timestamp: {datetime.now()}\n")
            f.write("=" * 50 + "\n\n")
            f.write(raw_response)

        print(f"✅ Response saved to: {output_file}")
        return raw_response

    except Exception as e:
        # Network failures and malformed payloads both end up here.
        print(f"❌ Error: {e}")
        return None

# Summary of the output layout used by the migration helpers above
# (one message per output line).
print(
    "✅ NEW Organized Chunking System Ready!",
    "🎯 Updated Features:",
    "   • Chunked files: chunked_model_output/model_name/filename/1.txt, 2.txt, etc.",
    "   • Single files: model_output/model_name/filename.txt (CLEAN STRUCTURE)",
    "   • Clean folder organization for both chunked and single files",
    "   • Simple filenames without strategy suffixes",
    "📁 New model_output structure: model_output/model_name/filename.txt",
    sep="\n",
)

✅ NEW Organized Chunking System Ready!
🎯 Updated Features:
   • Chunked files: chunked_model_output/model_name/filename/1.txt, 2.txt, etc.
   • Single files: model_output/model_name/filename.txt (CLEAN STRUCTURE)
   • Clean folder organization for both chunked and single files
   • Simple filenames without strategy suffixes
📁 New model_output structure: model_output/model_name/filename.txt


In [76]:


# Load every non-empty PHP file below old_version_path into test_files,
# keyed by bare file name.
if old_version_path.exists():
    # Path each file name was first loaded from during THIS run of the cell;
    # kept separate from test_files so re-running the cell does not report
    # every file as a duplicate.
    _loaded_from = {}

    # Recursively find all PHP files in all subfolders. Sorted so the result
    # does not depend on filesystem enumeration order.
    for php_file in sorted(old_version_path.rglob('*.php')):
        try:
            # Undecodable bytes are dropped rather than aborting the load.
            content = php_file.read_text(encoding='utf-8', errors='ignore')
        except Exception as e:
            print(f"⚠️  Could not load {php_file.name}: {e}")
            continue

        if not content.strip():
            continue

        # Files are keyed by name only, so two files with the same name in
        # different subfolders would silently replace each other. Keep the
        # first one and say so.
        if php_file.name in _loaded_from:
            print(f"⚠️  Skipping {php_file}: a file named {php_file.name} was already loaded from {_loaded_from[php_file.name]}")
            continue

        _loaded_from[php_file.name] = php_file
        test_files[php_file.name] = content

    print(f"📁 Loaded {len(test_files)} PHP files from selected_100_files:")
    for filename in sorted(test_files.keys()):
        size = len(test_files[filename])
        print(f"   📄 {filename} ({size:,} chars)")
else:
    print("❌ selected_100_files directory not found")
    print("💡 Make sure the selected 100 files are in 'selected_100_files/' directory")


📁 Loaded 12 PHP files from selected_100_files:
   📄 001_getid3.lib.php (43,494 chars)
   📄 002_module.audio-video.asf.php (129,115 chars)
   📄 003_wp-db.php (60,679 chars)
   📄 004_class-IXR.php (32,854 chars)
   📄 005_class-snoopy.php (37,776 chars)
   📄 006_widgets.php (47,668 chars)
   📄 009_getid3.php (62,683 chars)
   📄 010_class-wp-theme.php (39,449 chars)
   📄 012_module.audio-video.riff.php (111,145 chars)
   📄 013_file.php (45,191 chars)
   📄 014_module.tag.id3v2.php (134,159 chars)
   📄 057_class-wp-customize-manager.php (32,639 chars)


In [77]:
# HELPER FUNCTIONS WITH CHUNKING SUPPORT
def quick_migrate(filename: str, model: str = "anthropic/claude-3.5-sonnet", strategy: str = "basic", 
                 chunk_size: int = None, auto_chunk: bool = True):
    """Convenience wrapper around ``migrate_file`` with a default model.

    ``chunk_size=None`` is replaced by ``DEFAULT_CHUNK_SIZE`` before the call;
    the result of ``migrate_file`` is returned unchanged.
    """
    effective_chunk_size = DEFAULT_CHUNK_SIZE if chunk_size is None else chunk_size
    return migrate_file(
        filename,
        model,
        strategy,
        chunk_size=effective_chunk_size,
        auto_chunk=auto_chunk,
    )

def batch_migrate(filenames: list, model: str = "anthropic/claude-3.5-sonnet", strategy: str = "basic", 
                 chunk_size: int = None, auto_chunk: bool = True):
    """Migrate several files in sequence and print a summary.

    Args:
        filenames: Keys into ``test_files``, processed in the given order.
        model: OpenRouter model id.
        strategy: Prompt strategy name.
        chunk_size: Maximum lines per chunk; ``None`` selects
            ``DEFAULT_CHUNK_SIZE``.
        auto_chunk: Forwarded to ``migrate_file``.

    Returns:
        A list parallel to ``filenames`` holding each ``migrate_file`` result:
        a string (single request), a list of per-chunk results (chunked), or
        ``None`` (failure).
    """
    if chunk_size is None:
        chunk_size = DEFAULT_CHUNK_SIZE

    print(f"🔄 Batch migrating {len(filenames)} files...")
    if auto_chunk:
        print(f"📦 Auto-chunking enabled for files > {chunk_size} lines")

    results = []
    for i, filename in enumerate(filenames, 1):
        print(f"\n[{i}/{len(filenames)}] Processing {filename}...")
        result = migrate_file(filename, model, strategy, chunk_size=chunk_size, auto_chunk=auto_chunk)
        results.append(result)

    # Tally results (for chunked files, result is a list of per-chunk results)
    total_files = len(filenames)
    successful_files = 0
    total_chunks = 0
    successful_chunks = 0
    # Tracked explicitly: comparing total_chunks with the number of files
    # under-reports chunking whenever some files fail and add no chunks.
    any_chunked = False

    for result in results:
        if result is None:
            continue
        if isinstance(result, list):  # Chunked file
            any_chunked = True
            chunk_successes = sum(1 for r in result if r is not None)
            total_chunks += len(result)
            successful_chunks += chunk_successes
            if chunk_successes:  # At least one chunk succeeded
                successful_files += 1
        else:  # Single file
            successful_files += 1
            total_chunks += 1
            successful_chunks += 1

    print("\n🎉 Batch migration completed!")
    print(f"✅ Successful files: {successful_files}/{total_files}")
    if any_chunked:
        print(f"📦 Total chunks processed: {successful_chunks}/{total_chunks}")

    return results

def analyze_file_sizes(chunk_threshold: int = None):
    """Print which loaded files fit in one request and which need chunking.

    Files in ``test_files`` are split by line count against
    ``chunk_threshold`` (``None`` selects ``DEFAULT_CHUNK_SIZE``). The ten
    longest small files and all large files are listed, largest first, with
    the number of chunks each large file would need. Returns ``None``.
    """
    limit = DEFAULT_CHUNK_SIZE if chunk_threshold is None else chunk_threshold

    if not test_files:
        print("❌ No test files loaded")
        return

    def chunks_for(line_total):
        # Ceiling division without floats
        return (line_total + limit - 1) // limit

    print("📊 File Size Analysis")
    print("=" * 50)

    # (name, line count, character count), longest first. The sort is stable,
    # so filtering afterwards keeps the per-group order intact.
    stats = [(name, len(text.split('\n')), len(text)) for name, text in test_files.items()]
    longest_first = sorted(stats, key=lambda entry: entry[1], reverse=True)
    small_files = [entry for entry in longest_first if entry[1] <= limit]
    large_files = [entry for entry in longest_first if entry[1] > limit]

    print(f"📄 Small files (≤{limit} lines): {len(small_files)}")
    for name, line_total, char_total in small_files[:10]:
        print(f"   {name}: {line_total:,} lines, {char_total:,} chars")

    not_shown = len(small_files) - 10
    if not_shown > 0:
        print(f"   ... and {not_shown} more")

    print(f"\n📦 Large files (>{limit} lines): {len(large_files)}")
    for name, line_total, char_total in large_files:
        print(f"   {name}: {line_total:,} lines, {char_total:,} chars → {chunks_for(line_total)} chunks")

    if large_files:
        line_sum = sum(entry[1] for entry in large_files)
        chunk_sum = sum(chunks_for(entry[1]) for entry in large_files)
        print("\n📊 Summary for large files:")
        print(f"   Total lines: {line_sum:,}")
        print(f"   Total chunks needed: {chunk_sum}")

def browse_saved_chunks(model_name: str = None):
    """Print an overview of the files stored under ``chunks_sent_to_model/``.

    Each sub-directory is treated as one model. Files inside it are grouped by
    the part of their stem before ``_chunk_`` or ``_single_file_``; for
    chunked groups every ``*_metadata.json`` file is read and summarised.

    Args:
        model_name: Optional model id. When given, only the directory whose
            name equals the normalised last path segment of the id is shown.

    Returns:
        ``None``; all information is printed.
    """
    root = Path('chunks_sent_to_model')

    if not root.exists():
        print("❌ No chunks directory found. Run some migrations first!")
        return

    print("📁 Browsing saved chunks...")
    print("=" * 50)

    candidates = list(root.iterdir())
    if not candidates:
        print("❌ No model directories found in chunks_sent_to_model/")
        return

    # Optional restriction to a single model directory
    if model_name:
        wanted = model_name.split('/')[-1].replace('-', '_').replace(':', '_').replace('.', '_').lower()
        candidates = [entry for entry in candidates if entry.name == wanted]
        if not candidates:
            print(f"❌ No chunks found for model: {model_name}")
            print(f"Available models: {[d.name for d in root.iterdir() if d.is_dir()]}")
            return

    for model_path in sorted(candidates):
        if not model_path.is_dir():
            continue

        print(f"\n📂 Model: {model_path.name}")

        # base name -> {'chunks': [...], 'single': [...], 'other': [...]}
        grouped = {}
        for entry in model_path.iterdir():
            if not entry.is_file():
                continue

            stem = entry.stem
            if '_chunk_' in stem:
                category, base = 'chunks', stem.split('_chunk_')[0]
            elif '_single_file_' in stem:
                category, base = 'single', stem.split('_single_file_')[0]
            else:
                category, base = 'other', stem

            bucket = grouped.setdefault(base, {'chunks': [], 'single': [], 'other': []})
            bucket[category].append(entry)

        for base, bucket in sorted(grouped.items()):
            if bucket['chunks']:
                # One metadata file per chunk, so counting them counts chunks
                metadata_paths = [p for p in bucket['chunks'] if p.name.endswith('_metadata.json')]
                print(f"   📦 {base}.php - {len(metadata_paths)} chunks")

                for metadata_path in sorted(metadata_paths):
                    try:
                        with open(metadata_path, 'r') as handle:
                            meta = json.load(handle)
                        chunk_num = meta.get('chunk_number', 'unknown')
                        chunk_total = meta.get('total_chunks', 'unknown')
                        line_span = f"{meta.get('start_line', '?')}-{meta.get('end_line', '?')}"
                        code_len = meta.get('code_length', 0)
                        prompt_len = meta.get('prompt_length', 0)
                        print(f"      📄 Chunk {chunk_num}/{chunk_total}: lines {line_span}, {code_len:,} chars code, {prompt_len:,} chars prompt")
                    except Exception as read_error:
                        print(f"      ❌ Error reading {metadata_path.name}: {read_error}")

            elif bucket['single']:
                print(f"   📄 {base}.php - single file (no chunking)")

def inspect_chunk(model_name: str, filename: str, chunk_number: int = None):
    """Print a detailed report about what was saved for one file of one model.

    Data is looked up in ``chunks_sent_to_model/<model>/`` where ``<model>`` is
    the last ``/``-separated part of ``model_name``, lower-cased, with ``-``,
    ``:`` and ``.`` replaced by ``_``.

    Args:
        model_name: Model identifier, e.g. ``'vendor/model-name:free'``.
        filename: Original PHP file name; every ``.php`` occurrence is removed
            to obtain the base name used in the saved file names.
        chunk_number: Chunk to inspect. When omitted, a saved single-file dump
            is reported if one exists; otherwise all chunks are listed and the
            first one (in sorted file-name order) is shown in detail.

    Returns:
        None. Everything, including "not found" and error messages, is printed.
    """

    def _shorten(text):
        # Cap a displayed source line at 100 characters and mark the cut.
        suffix = '...' if len(text) > 100 else ''
        return f"{text[:100]}{suffix}"

    def _load_metadata(path):
        # Metadata files are plain JSON documents.
        with open(path, 'r') as handle:
            return json.load(handle)

    folder_name = model_name.split('/')[-1].translate(str.maketrans('-:.', '___')).lower()
    model_dir = Path('chunks_sent_to_model') / folder_name

    if not model_dir.exists():
        print(f"❌ No chunks found for model: {model_name}")
        return

    base_name = filename.replace('.php', '')

    if chunk_number is None:
        # No explicit chunk requested: a single-file dump takes precedence.
        if list(model_dir.glob(f"{base_name}_single_file_code.php")):
            print(f"📄 Inspecting single file: {filename}")
            code_file = model_dir / f"{base_name}_single_file_code.php"
            prompt_file = model_dir / f"{base_name}_single_file_prompt.txt"

            if code_file.exists():
                print(f"📂 Code file: {code_file}")
                code_text = code_file.read_text(encoding='utf-8')
                print(f"   📏 Size: {len(code_text):,} characters")
                print("   📄 First 15 lines:")
                for number, text in enumerate(code_text.split('\n')[:15], 1):
                    print(f"      {number:2d}: {_shorten(text)}")

            if prompt_file.exists():
                print(f"\n📝 Prompt file: {prompt_file}")
                prompt_text = prompt_file.read_text(encoding='utf-8')
                print(f"   📏 Size: {len(prompt_text):,} characters")
            return

        # Otherwise fall back to the chunk metadata files for this base name.
        candidates = sorted(model_dir.glob(f"{base_name}_chunk_*_metadata.json"))
        if not candidates:
            print(f"❌ No chunks or single file found for {filename}")
            return

        print(f"📦 Found {len(candidates)} chunks for {filename}")
        for candidate in candidates:
            listed_number = _load_metadata(candidate).get('chunk_number', 'unknown')
            print(f"   📄 Chunk {listed_number}")

        # The first chunk (sorted by file name) is the one shown in detail.
        metadata_file = candidates[0]
    else:
        wanted = list(model_dir.glob(f"{base_name}_chunk_{chunk_number:03d}_of_*_metadata.json"))
        if not wanted:
            print(f"❌ Chunk {chunk_number} not found for {filename}")
            return
        metadata_file = wanted[0]

    # Detailed report for the selected chunk; any failure is reported, not raised.
    try:
        metadata = _load_metadata(metadata_file)

        print(f"\n📊 Chunk {metadata['chunk_number']} of {metadata['total_chunks']} - {filename}")
        print("=" * 50)
        print(f"📏 Lines: {metadata['start_line']} to {metadata['end_line']} (of {metadata['total_lines']} total)")
        print(f"📄 Chunk lines: {metadata['chunk_lines']}")
        print(f"💻 Code length: {metadata['code_length']:,} characters")
        print(f"📝 Prompt length: {metadata['prompt_length']:,} characters")
        print(f"⏰ Timestamp: {metadata['timestamp']}")

        # Sibling files share the metadata file's stem minus the '_metadata' part.
        stem = metadata_file.stem.replace('_metadata', '')
        code_file = model_dir / f"{stem}_code.php"
        prompt_file = model_dir / f"{stem}_prompt.txt"

        if code_file.exists():
            print(f"\n📂 Code file: {code_file}")
            source_lines = code_file.read_text(encoding='utf-8').split('\n')
            # Start right after the first line that consists solely of '*/'
            # (the end of the header comment); start at the top if there is none.
            first_shown = next(
                (idx + 1 for idx, text in enumerate(source_lines) if text.strip() == '*/'),
                0,
            )
            print("   📄 First 15 lines of actual code:")
            preview = source_lines[first_shown:first_shown + 15]
            for number, text in enumerate(preview, first_shown + 1):
                print(f"      {number:2d}: {_shorten(text)}")

        if prompt_file.exists():
            print(f"\n📝 Prompt file: {prompt_file}")
            print("   💡 Use 'with open(prompt_file) as f: print(f.read())' to see full prompt")

    except Exception as e:
        print(f"❌ Error inspecting chunk: {e}")

# Summary of the chunk-browsing helpers defined in this cell.
print(
    "✅ Enhanced helper functions with chunk browsing ready!",
    "💡 New chunk inspection features:",
    "   browse_saved_chunks() - Browse all saved chunks",
    "   browse_saved_chunks('model/name') - Browse chunks for specific model",
    "   inspect_chunk('model/name', 'filename.php') - Inspect all chunks for a file",
    "   inspect_chunk('model/name', 'filename.php', 2) - Inspect specific chunk number",
    sep="\n",
)
# Kept as its own call: it is the only line that reads a notebook-level constant.
print(f"   Default chunk size: {DEFAULT_CHUNK_SIZE} lines (configurable)")
print(
    "   auto_chunk parameter - Enable/disable automatic chunking",
    "\n💡 Usage examples:",
    "   migrate_file('large_file.php', 'qwen/qwen3-coder:free', 'basic', chunk_size=800)",
    "   quick_migrate('large_file.php', auto_chunk=False)  # Force single file",
    "   batch_migrate(['file1.php', 'file2.php'], chunk_size=1200)",
    "   browse_saved_chunks('deepseek/deepseek-chat-v3.1:free')  # Browse specific model chunks",
    sep="\n",
)

✅ Enhanced helper functions with chunk browsing ready!
💡 New chunk inspection features:
   browse_saved_chunks() - Browse all saved chunks
   browse_saved_chunks('model/name') - Browse chunks for specific model
   inspect_chunk('model/name', 'filename.php') - Inspect all chunks for a file
   inspect_chunk('model/name', 'filename.php', 2) - Inspect specific chunk number
   Default chunk size: 500 lines (configurable)
   auto_chunk parameter - Enable/disable automatic chunking

💡 Usage examples:
   migrate_file('large_file.php', 'qwen/qwen3-coder:free', 'basic', chunk_size=800)
   quick_migrate('large_file.php', auto_chunk=False)  # Force single file
   batch_migrate(['file1.php', 'file2.php'], chunk_size=1200)
   browse_saved_chunks('deepseek/deepseek-chat-v3.1:free')  # Browse specific model chunks


In [78]:
# ANALYZE YOUR FILES FOR CHUNKING
# Part 1: overall size report for everything that was loaded.
print("🔍 Analyzing loaded files to see chunking requirements...")
analyze_file_sizes()

# Part 2: run the chunker on the first three loaded files and show its boundaries.
print("\n🧪 Testing chunking logic on sample files...")
for filename, code in list(test_files.items())[:3]:
    line_count = code.count('\n') + 1  # same value as len(code.split('\n'))
    chunks = chunk_code(code, DEFAULT_CHUNK_SIZE)

    print(f"\n📄 {filename}:")
    print(f"   Lines: {line_count:,}")
    print(f"   Chunks: {len(chunks)}")

    # A breakdown is only interesting when the file was actually split.
    if len(chunks) <= 1:
        continue

    print("   Chunk breakdown:")
    for i, chunk_info in enumerate(chunks, 1):
        print(
            f"     Chunk {i}: lines {chunk_info['start_line']}-{chunk_info['end_line']} "
            f"({chunk_info['end_line'] - chunk_info['start_line'] + 1} lines)"
        )

print("\n✅ File analysis complete!")

🔍 Analyzing loaded files to see chunking requirements...
📊 File Size Analysis
📄 Small files (≤500 lines): 0

📦 Large files (>500 lines): 12
   014_module.tag.id3v2.php: 3,415 lines, 134,159 chars → 7 chunks
   012_module.audio-video.riff.php: 2,436 lines, 111,145 chars → 5 chunks
   003_wp-db.php: 2,187 lines, 60,679 chars → 5 chunks
   002_module.audio-video.asf.php: 2,020 lines, 129,115 chars → 5 chunks
   009_getid3.php: 1,776 lines, 62,683 chars → 4 chunks
   006_widgets.php: 1,515 lines, 47,668 chars → 4 chunks
   001_getid3.lib.php: 1,342 lines, 43,494 chars → 3 chunks
   057_class-wp-customize-manager.php: 1,273 lines, 32,639 chars → 3 chunks
   005_class-snoopy.php: 1,257 lines, 37,776 chars → 3 chunks
   010_class-wp-theme.php: 1,236 lines, 39,449 chars → 3 chunks
   013_file.php: 1,151 lines, 45,191 chars → 3 chunks
   004_class-IXR.php: 1,101 lines, 32,854 chars → 3 chunks

📊 Summary for large files:
   Total lines: 20,709
   Total chunks needed: 48

🧪 Testing chunking logic

In [79]:
# READY TO USE!
print("🎉 SIMPLE MIGRATION TOOL READY!", "=" * 40, sep="\n")

if not test_files:
    # Nothing was loaded: point the user at the expected input directory.
    print(
        "❌ No test files loaded - check selected_100_files directory",
        "💡 Make sure the selected 100 files are in 'selected_100_files/' directory",
        sep="\n",
    )
else:
    print(f"📁 Available files: {len(test_files)} loaded")
    print(f"📋 Sample files: {list(test_files)[:5]}")
    print(
        "\n💡 Usage examples:",
        "   migrate_file('014_module.tag.id3v2.php', 'qwen/qwen3-coder:free', 'basic')",
        "   quick_migrate('014_module.tag.id3v2.php')",
        "   batch_migrate(['file1.php', 'file2.php'], 'meta-llama/llama-3.1-8b-instruct')",
        "\n📁 Raw responses will be saved to model_output/ folder",
        "🔬 Examine the raw model responses to see what they return!",
        sep="\n",
    )

🎉 SIMPLE MIGRATION TOOL READY!
📁 Available files: 12 loaded
📋 Sample files: ['001_getid3.lib.php', '002_module.audio-video.asf.php', '003_wp-db.php', '004_class-IXR.php', '005_class-snoopy.php']

💡 Usage examples:
   migrate_file('014_module.tag.id3v2.php', 'qwen/qwen3-coder:free', 'basic')
   quick_migrate('014_module.tag.id3v2.php')
   batch_migrate(['file1.php', 'file2.php'], 'meta-llama/llama-3.1-8b-instruct')

📁 Raw responses will be saved to model_output/ folder
🔬 Examine the raw model responses to see what they return!


In [80]:
# DEMONSTRATE CHUNK SAVING MECHANISM
print("🔍 CHUNK SAVING DEMONSTRATION", "=" * 50, sep="\n")

# Folder holding the code/prompt/metadata files described below.
chunks_dir = Path('chunks_sent_to_model')
if not chunks_dir.exists():
    print("📁 No chunks directory found yet - will be created when you run migrations")
else:
    print(f"📁 Chunks directory exists: {chunks_dir}")

    # List whatever has been saved so far.
    print("\n📊 Current saved chunks:")
    browse_saved_chunks()

    print(
        "\n💡 To inspect a specific chunk:",
        "   inspect_chunk('deepseek_deepseek_chat_v3_1_free', '007_class-wp.php', 1)",
        "   inspect_chunk('deepseek_deepseek_chat_v3_1_free', 'some_single_file.php')",
        sep="\n",
    )

# Static description of the on-disk layout and why it is useful.
print(
    "\n🎯 What gets saved for each chunk:",
    "   📄 *_code.php - The actual PHP code chunk sent to the model",
    "   📝 *_prompt.txt - The complete prompt including context and instructions",
    "   📊 *_metadata.json - Chunk metadata (line numbers, lengths, timestamps)",
    "\n🔧 For single files (no chunking):",
    "   📄 *_single_file_code.php - The complete file code",
    "   📝 *_single_file_prompt.txt - The complete prompt",
    "\n💡 This allows you to:",
    "   ✅ See exactly what code was sent to each model",
    "   ✅ Compare prompts across different strategies",
    "   ✅ Debug chunking behavior",
    "   ✅ Verify chunk boundaries and context",
    "   ✅ Analyze prompt effectiveness",
    sep="\n",
)

🔍 CHUNK SAVING DEMONSTRATION
📁 No chunks directory found yet - will be created when you run migrations

🎯 What gets saved for each chunk:
   📄 *_code.php - The actual PHP code chunk sent to the model
   📝 *_prompt.txt - The complete prompt including context and instructions
   📊 *_metadata.json - Chunk metadata (line numbers, lengths, timestamps)

🔧 For single files (no chunking):
   📄 *_single_file_code.php - The complete file code
   📝 *_single_file_prompt.txt - The complete prompt

💡 This allows you to:
   ✅ See exactly what code was sent to each model
   ✅ Compare prompts across different strategies
   ✅ Debug chunking behavior
   ✅ Verify chunk boundaries and context
   ✅ Analyze prompt effectiveness


In [81]:
# Single-file smoke test: migrate one file with the 'basic' strategy on a
# different model that's hopefully more reliable.
# The call is the last expression of the cell, so whatever migrate_file returns
# is echoed as the cell's output value.
# NOTE(review): the saved output of this cell shows an HTTP 429 "free-models-per-day"
# rate-limit response — confirm the API quota before starting a batch run.
migrate_file('001_getid3.lib.php', 'mistralai/mistral-small-3.2-24b-instruct:free', 'basic')

# UNCOMMENT THE LINE BELOW TO START BATCH MIGRATION (all loaded files)
# batch_migrate(list(test_files.keys()) , model='mistralai/mistral-small-3.2-24b-instruct:free', strategy='basic')

# Or use a smaller test first (only the first five loaded files):
# batch_migrate(list(test_files.keys())[:5] , model='mistralai/mistral-small-3.2-24b-instruct:free', strategy='basic')

🚀 Migrating 001_getid3.lib.php using mistralai/mistral-small-3.2-24b-instruct:free with basic strategy...
📏 Input code length: 43,494 characters (1,342 lines)
📦 Large file detected (1342 lines) - using NEW organized chunking
 Split into 3 chunks of ~500 lines each
📁 Saving chunks to: chunked_model_output\mistral_small_3_2_24b_instruct_free\001_getid3.lib

[Chunk 1/3] Processing lines 1-500...
📏 Chunk prompt length: 17,179 characters
🔗 Making API call to OpenRouter...
📊 HTTP Status: 429
📏 Response length: 314 characters
❌ API Error: 429
📄 Response: {"error":{"message":"Rate limit exceeded: free-models-per-day. Add 10 credits to unlock 1000 free model requests per day","code":429,"metadata":{"headers":{"X-RateLimit-Limit":"50","X-RateLimit-Remaining":"0","X-RateLimit-Reset":"1756771200000"},"provider_name":null}},"user_id":"user_2xMT55snBKFgKi83Qy4yYtb4ain"}...
❌ Failed to process chunk 1

[Chunk 2/3] Processing lines 501-1000...
📏 Chunk prompt length: 18,129 characters
🔗 Making API call

[None, None, None]

In [82]:
# SIMPLIFIED OUTPUT PARSER - NO CHUNKING FOR NOW
import re
from pathlib import Path

class OutputParser:
    """Simple parser for model responses - single files only.

    Reads raw response files (``*.txt``) from ``model_output/``, either from
    per-model subfolders or, as a fallback, directly from that directory.
    From each file it takes a small metadata header plus the code found between
    the ``// MIGRATION_START`` and ``// MIGRATION_END`` markers, and writes the
    code to ``new-version/<model>/<original file name>``.
    """

    # Header line prefixes understood by extract_metadata and the metadata key
    # each of them fills.
    _HEADER_FIELDS = (
        ('File:', 'original_file'),
        ('Model:', 'model'),
        ('Strategy:', 'strategy'),
    )

    def __init__(self):
        """Set the input/output locations and make sure the output folder exists."""
        self.model_output_path = Path('model_output')
        self.parsed_path = Path('new-version')
        self.parsed_path.mkdir(exist_ok=True)

    def extract_migrated_code(self, response_content: str) -> str:
        """Extract code between MIGRATION_START and MIGRATION_END markers.

        The first start marker is used, together with the first end marker that
        follows it. An end marker occurring *before* the start marker is
        ignored, so it can no longer turn a valid response into an empty result.

        Args:
            response_content: Full text of a response file.

        Returns:
            The stripped text between the markers, or ``""`` when either marker
            is missing.
        """
        start_match = re.search(r'//\s*MIGRATION_START\s*\n', response_content, re.IGNORECASE)
        if not start_match:
            return ""

        end_regex = re.compile(r'\n//\s*MIGRATION_END', re.IGNORECASE)
        # Begin at the newline that terminates the start-marker line: the end
        # pattern itself starts with a newline, so an end marker on the very
        # next line must still be found.
        end_match = end_regex.search(response_content, start_match.end() - 1)
        if not end_match:
            return ""

        return response_content[start_match.end():end_match.start()].strip()

    def extract_metadata(self, response_content: str) -> dict:
        """Extract metadata from response file header.

        Only the first 15 lines are inspected. For each recognised prefix
        (``File:``, ``Model:``, ``Strategy:``) the first occurrence wins, so a
        later line inside that window cannot overwrite the header value.

        Args:
            response_content: Full text of a response file.

        Returns:
            Dict with any of the keys ``original_file``, ``model``, ``strategy``.
        """
        metadata = {}

        for line in response_content.split('\n')[:15]:
            for prefix, key in self._HEADER_FIELDS:
                if line.startswith(prefix):
                    metadata.setdefault(key, line.split(':', 1)[1].strip())
                    break

        return metadata

    def parse_single_file(self, response_file: Path) -> dict:
        """Parse a single response file.

        Args:
            response_file: Path of the response file to read (UTF-8).

        Returns:
            ``{'success': True, 'metadata': ..., 'migrated_code': ...}`` when both
            the ``File:`` header and the marked code were found;
            ``{'success': False}`` when one of them is missing; and
            ``{'success': False, 'error': <message>}`` when reading raised.
        """
        try:
            print(f"Processing {response_file.name}")

            with open(response_file, 'r', encoding='utf-8') as f:
                content = f.read()

            metadata = self.extract_metadata(content)
            if not metadata.get('original_file'):
                print("   ERROR: No original file found in metadata")
                return {'success': False}

            migrated_code = self.extract_migrated_code(content)
            if not migrated_code:
                print("   ERROR: No migrated code found between markers")
                return {'success': False}

            print(f"   SUCCESS: Found {len(migrated_code)} chars of migrated code")
            return {
                'success': True,
                'metadata': metadata,
                'migrated_code': migrated_code
            }

        except Exception as e:
            print(f"   ERROR: {e}")
            return {'success': False, 'error': str(e)}

    def save_parsed_file(self, result: dict, response_filename: str, model_folder_name: str = None) -> bool:
        """Save parsed result to organized structure.

        Args:
            result: Successful result of ``parse_single_file``.
            response_filename: Name of the response file; used to derive the
                output name when the metadata has no ``original_file``.
            model_folder_name: Target subfolder name. When omitted it is derived
                from the ``model`` metadata entry.

        Returns:
            True when the file was written, False when anything went wrong (the
            error is printed).
        """
        try:
            metadata = result['metadata']
            migrated_code = result['migrated_code']

            if model_folder_name:
                # New structure: the folder name is dictated by the caller.
                model_clean = model_folder_name
            else:
                # Old structure: derive a filesystem-friendly name from the header.
                model_name = metadata.get('model', 'unknown_model')
                model_clean = model_name.replace('/', '_').replace('-', '_').replace(':', '_').replace('.', '_').lower()

            model_folder = self.parsed_path / model_clean
            # parents=True: the output root may have been removed since __init__.
            model_folder.mkdir(parents=True, exist_ok=True)

            original_filename = metadata.get('original_file')
            if original_filename:
                # The name comes from file content, so keep only its last path
                # component; '../x.php' or an absolute path must not escape the
                # model folder.
                output_name = Path(original_filename).name
            elif response_filename.endswith('.txt'):
                output_name = response_filename[:-4] + '.php'  # Replace .txt with .php
            else:
                output_name = response_filename + '.php'
            output_file = model_folder / output_name

            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(migrated_code)

            print(f"   ✅ SAVED: {output_file}")
            return True

        except Exception as e:
            print(f"   ❌ SAVE ERROR: {e}")
            return False

    def _process_batch(self, response_files, model_folder_name: str = None):
        """Parse and save each response file; return ``(saved, failed)`` counts."""
        success_count = 0
        failed_count = 0

        for response_file in response_files:
            result = self.parse_single_file(response_file)
            if result['success'] and self.save_parsed_file(result, response_file.name, model_folder_name):
                success_count += 1
            else:
                failed_count += 1

        return success_count, failed_count

    def process_all_responses(self):
        """Process all response files in model_output directory.

        Model subfolders are processed in sorted order; when there are none,
        ``*.txt`` files lying directly in ``model_output/`` are processed
        instead (old structure). Progress and totals are printed.
        """
        print("🔄 Processing all model responses...")

        if not self.model_output_path.exists():
            print(f"❌ Directory {self.model_output_path} not found")
            return

        # Sorted so that runs are reproducible regardless of filesystem order.
        model_folders = sorted(d for d in self.model_output_path.iterdir() if d.is_dir())

        if not model_folders:
            # Fallback: look for .txt files directly in model_output (old structure)
            response_files = sorted(self.model_output_path.glob('*.txt'))
            if response_files:
                print(f"📁 Found {len(response_files)} response files in old structure")
                self._process_files_directly(response_files)
            else:
                print("❌ No model folders or .txt files found in model_output/")
            return

        print(f"📁 Found {len(model_folders)} model folders:")
        for folder in model_folders:
            print(f"   📂 {folder.name}/")

        total_success = 0
        total_failed = 0

        for model_folder in model_folders:
            print(f"\n🔄 Processing model: {model_folder.name}")

            response_files = sorted(model_folder.glob('*.txt'))
            print(f"   📄 Found {len(response_files)} response files")

            if not response_files:
                print("   ⚠️  No .txt files found in this model folder")
                continue

            success_count, failed_count = self._process_batch(response_files, model_folder.name)

            print(f"   ✅ Successfully processed: {success_count} files")
            print(f"   ❌ Failed to process: {failed_count} files")

            total_success += success_count
            total_failed += failed_count

        print("\n🎉 Overall processing completed!")
        print(f"✅ Total successfully processed: {total_success} files")
        print(f"❌ Total failed to process: {total_failed} files")

        # Show what was created
        if total_success > 0:
            print(f"\n📁 Results saved to '{self.parsed_path}':")
            for model_folder in sorted(self.parsed_path.iterdir()):
                if model_folder.is_dir():
                    php_files = list(model_folder.glob('*.php'))
                    print(f"   📂 {model_folder.name}/ ({len(php_files)} files)")

    def _process_files_directly(self, response_files):
        """Process files directly from model_output (fallback for old structure)."""
        success_count, failed_count = self._process_batch(response_files)

        print("\n🎉 Processing completed!")
        print(f"✅ Successfully processed: {success_count} files")
        print(f"❌ Failed to process: {failed_count} files")

# Create the notebook-wide parser instance used by the cells below
parser = OutputParser()
print(
    "Simplified output parser ready!",
    "Features:",
    "   • Simple single-file processing only",
    "   • Reliable MIGRATION_START/END marker extraction",
    "   • Clear error reporting",
    "   • No chunking complexity",
    "Usage: parser.process_all_responses()",
    sep="\n",
)

Simplified output parser ready!
Features:
   • Simple single-file processing only
   • Reliable MIGRATION_START/END marker extraction
   • Clear error reporting
   • No chunking complexity
Usage: parser.process_all_responses()


In [83]:
# FILE RECONSTRUCTOR - COMBINES PARSED CHUNKS INTO COMPLETE FILES
class FileReconstructor:
    """Reconstructs complete files from parsed chunk files.

    Expected layout: ``chunked_model_output/<model>/<file name>/<N>.txt`` where
    ``N`` is the chunk number. Every chunk response is parsed with the supplied
    parser, the extracted code pieces are joined in chunk order, and the result
    is written to ``new-version/<model>/<file name>.php``.
    """

    def __init__(self, parser):
        """Remember the chunk parser and make sure the output folder exists.

        Args:
            parser: Object providing ``parse_single_file(path) -> dict`` with the
                keys ``success``, ``migrated_code`` and ``metadata``.
        """
        self.parser = parser  # Use the simple parser for individual chunks
        self.chunked_output_path = Path('chunked_model_output')
        self.final_output_path = Path('new-version')
        self.final_output_path.mkdir(exist_ok=True)

    def find_chunked_files(self):
        """Find all chunked file directories.

        Returns:
            List of dicts (``model``, ``filename``, ``directory``,
            ``chunk_count``), one per ``<model>/<file>`` directory containing at
            least one ``.txt`` file, in sorted directory order. ``chunk_count``
            is the number of ``.txt`` files found there.
        """
        if not self.chunked_output_path.exists():
            print("No chunked_model_output directory found")
            return []

        chunked_files = []

        # Sorted so that the processing order does not depend on the filesystem.
        for model_dir in sorted(self.chunked_output_path.iterdir()):
            if not model_dir.is_dir():
                continue
            for file_dir in sorted(model_dir.iterdir()):
                if not file_dir.is_dir():
                    continue
                chunk_files = list(file_dir.glob('*.txt'))
                if chunk_files:
                    chunked_files.append({
                        'model': model_dir.name,
                        'filename': file_dir.name,
                        'directory': file_dir,
                        'chunk_count': len(chunk_files)
                    })

        return chunked_files

    def get_chunk_files(self, directory: Path):
        """Get all chunk files from a directory, sorted by number.

        Files whose stem is not an integer are skipped with a warning.

        Returns:
            List of ``(chunk_number, path)`` tuples in ascending chunk order.
        """
        chunk_files = []

        for file in directory.glob('*.txt'):
            try:
                # Extract number from filename (1.txt -> 1)
                chunk_files.append((int(file.stem), file))
            except ValueError:
                print(f"WARNING: Skipping non-numeric chunk file: {file.name}")

        # Numeric sort: 10.txt must come after 2.txt.
        chunk_files.sort(key=lambda x: x[0])
        return chunk_files

    def reconstruct_file(self, file_info: dict):
        """Reconstruct a complete file from its chunks.

        Chunks that cannot be parsed are replaced by a one-line PHP comment so
        the gap is visible in the result. Pieces are joined with a newline:
        the parser strips each piece, so joining them directly would fuse the
        last line of one chunk with the first line of the next (and a
        placeholder comment would comment out the code that follows it).

        Args:
            file_info: One entry of ``find_chunked_files()``.

        Returns:
            True when the file was written; False when no chunk file was usable
            or saving failed.
        """
        print(f"\nReconstructing {file_info['filename']}.php from {file_info['chunk_count']} chunks")
        print(f"Model: {file_info['model']}")
        print(f"Directory: {file_info['directory']}")

        chunk_files = self.get_chunk_files(file_info['directory'])

        if not chunk_files:
            print("   ERROR: No valid chunk files found")
            return False

        # Gaps are measured against the highest chunk number present; counting
        # the files instead would under-report them (e.g. chunks 1 and 4 would
        # only flag chunk 2).
        actual_numbers = [num for num, _ in chunk_files]
        missing = set(range(1, max(actual_numbers) + 1)) - set(actual_numbers)

        if missing:
            print(f"   WARNING: Missing chunks: {sorted(missing)}")

        print(f"   Found chunks: {actual_numbers}")

        # Parse each chunk
        parsed_chunks = []
        metadata = None

        for chunk_num, chunk_file in chunk_files:
            print(f"   Processing chunk {chunk_num}...")
            result = self.parser.parse_single_file(chunk_file)

            if result['success']:
                parsed_chunks.append({
                    'number': chunk_num,
                    'code': result['migrated_code'],
                    'metadata': result['metadata']
                })

                # Use metadata from first successful chunk
                if metadata is None:
                    metadata = result['metadata']

                print(f"      SUCCESS: {len(result['migrated_code'])} chars")
            else:
                print(f"      ERROR: Failed to parse chunk {chunk_num}")
                parsed_chunks.append({
                    'number': chunk_num,
                    'code': None,
                    'metadata': None
                })

        if not any(chunk['code'] for chunk in parsed_chunks):
            print("   ERROR: No chunks could be parsed successfully")
            return False

        # Combine chunks
        combined_code = []
        successful_chunks = 0

        for chunk in parsed_chunks:
            if chunk['code']:
                combined_code.append(chunk['code'])
                successful_chunks += 1
            else:
                print(f"   WARNING: Chunk {chunk['number']} failed - adding placeholder comment")
                combined_code.append(f"// ERROR: Chunk {chunk['number']} failed to parse")

        final_code = '\n'.join(combined_code)
        print(f"   Combined {successful_chunks}/{len(parsed_chunks)} chunks successfully")
        print(f"   Final code length: {len(final_code)} characters")

        return self.save_reconstructed_file(file_info, final_code, metadata)

    def save_reconstructed_file(self, file_info: dict, code: str, metadata: dict):
        """Save the reconstructed complete file.

        Args:
            file_info: Entry describing the file (``model`` and ``filename`` are used).
            code: Complete code to write.
            metadata: Metadata of the first successfully parsed chunk; accepted
                for interface compatibility, not written to the file.

        Returns:
            True on success, False when writing failed (the error is printed).
        """
        try:
            model_folder = self.final_output_path / file_info['model']
            # parents=True: the output root may have been removed since __init__.
            model_folder.mkdir(parents=True, exist_ok=True)

            output_file = model_folder / f"{file_info['filename']}.php"

            with open(output_file, 'w', encoding='utf-8') as f:
                # Write clean PHP code without metadata header
                f.write(code)

            print(f"   SAVED: {output_file}")
            return True

        except Exception as e:
            print(f"   ERROR saving file: {e}")
            return False

    def reconstruct_all_files(self):
        """Reconstruct all chunked files found and print a summary."""
        print("🔧 Starting file reconstruction...")

        chunked_files = self.find_chunked_files()

        if not chunked_files:
            print("No chunked files found to reconstruct")
            return

        print(f"Found {len(chunked_files)} chunked files to reconstruct:")
        for file_info in chunked_files:
            print(f"   {file_info['model']}/{file_info['filename']}.php ({file_info['chunk_count']} chunks)")

        successful = 0
        failed = 0

        for file_info in chunked_files:
            if self.reconstruct_file(file_info):
                successful += 1
            else:
                failed += 1

        print("\n🎉 Reconstruction completed!")
        print(f"✅ Successfully reconstructed: {successful} files")
        print(f"❌ Failed to reconstruct: {failed} files")

        if successful > 0:
            print(f"\n📁 Reconstructed files saved to: {self.final_output_path}")
            for model_folder in sorted(self.final_output_path.iterdir()):
                if model_folder.is_dir():
                    php_files = list(model_folder.glob('*.php'))
                    print(f"   {model_folder.name}/ ({len(php_files)} files)")

# Create the notebook-wide reconstructor; it reuses the simple parser for each chunk
reconstructor = FileReconstructor(parser)
print(
    "✅ File Reconstructor ready!",
    "🎯 Features:",
    "   • Finds all chunked files in chunked_model_output/",
    "   • Uses simple parser to parse individual chunks",
    "   • Combines chunks in correct order",
    "   • Handles missing chunks gracefully",
    "   • Saves complete files to new-version/",
    "💡 Usage: reconstructor.reconstruct_all_files()",
    sep="\n",
)

✅ File Reconstructor ready!
🎯 Features:
   • Finds all chunked files in chunked_model_output/
   • Uses simple parser to parse individual chunks
   • Combines chunks in correct order
   • Handles missing chunks gracefully
   • Saves complete files to new-version/
💡 Usage: reconstructor.reconstruct_all_files()


In [84]:
# Step 1: parse the response files found under model_output/ and write the
# extracted code to new-version/<model>/.
parser.process_all_responses()
# Step 2: stitch the per-chunk responses under chunked_model_output/ back into
# whole files, also written to new-version/<model>/.
reconstructor.reconstruct_all_files()

🔄 Processing all model responses...
📁 Found 1 model folders:
   📂 mistral_small_3_2_24b_instruct_free/

🔄 Processing model: mistral_small_3_2_24b_instruct_free
   📄 Found 20 response files
Processing 007_class-wp.txt
   SUCCESS: Found 28022 chars of migrated code
   ✅ SAVED: new-version\mistral_small_3_2_24b_instruct_free\007_class-wp.php
Processing 008_wp-login.txt
   SUCCESS: Found 35944 chars of migrated code
   ✅ SAVED: new-version\mistral_small_3_2_24b_instruct_free\008_wp-login.php
Processing 017_press-this.txt
   SUCCESS: Found 32665 chars of migrated code
   ✅ SAVED: new-version\mistral_small_3_2_24b_instruct_free\017_press-this.php
Processing 020_class-ftp.txt
   SUCCESS: Found 36555 chars of migrated code
   ✅ SAVED: new-version\mistral_small_3_2_24b_instruct_free\020_class-ftp.php
Processing 021_class-pop3.txt
   SUCCESS: Found 16721 chars of migrated code
   ✅ SAVED: new-version\mistral_small_3_2_24b_instruct_free\021_class-pop3.php
Processing 022_class-wp-customize-setting