# CORPUS-INFORMED AUTO-TRANSLATOR: VECTOR CREATION NOTEBOOK
Converting JSON Corpus Contents into Semantic Vectors

## Notebook Outline

**Part 1: Data Preparation and Planning**

- STEP 1: LOAD THE REQUIRED LIBRARIES
- STEP 2: LOAD AND EXAMINE DATABASE STRUCTURE
- STEP 3: LOAD UNPROCESSED DOCUMENTS FOR BATCH PROCESSING (WITHOUT RE-VECTORIZATION)

**Part 2: Function and Model Setup**

- STEP 4: DEFINE TEXT EXTRACTION FUNCTIONS (STANDARDIZED)
- STEP 5: DEFINE VECTORIZATION FUNCTIONS
- STEP 6: INITIALIZE THE MULTILINGUAL EMBEDDING MODEL

**Part 3: Execution and Output**

- STEP 7: BATCH PROCESS DOCUMENTS
- STEP 8: GENERATE HTML VISUALIZATION REPORT

## =============================================================================

## Local Environment Setup

In [None]:
# ==============================================================================
# LOCAL ENVIRONMENT SETUP - Run this FIRST!
# ==============================================================================
# Locates the project root (the directory that contains scripts/config.py),
# makes it the working directory, puts scripts/ on sys.path, star-imports the
# project configuration and checks the corpus databases. Later cells use names
# that come from that star-import (e.g. LANGUAGES, DOMAIN, PATHS).

print("🌍 Setting up local Pragmatic Auto-Translator environment...")

import os
import sys
from pathlib import Path

# The search starts from the kernel's current working directory
# (original note: the notebook is assumed to live in scripts/vectorization/)
print("📁 Setting up project paths...")
current_notebook_dir = Path.cwd()

# Find project root by looking for config.py in scripts folder
project_root = None
search_dir = current_notebook_dir

# Walk up the directory tree: the starting directory plus at most four
# ancestors are checked for scripts/config.py
for _ in range(5):  # Bounded number of levels
    scripts_dir = search_dir / 'scripts'
    config_file = scripts_dir / 'config.py'
    
    if config_file.exists():
        project_root = search_dir
        break
    
    parent = search_dir.parent
    if parent == search_dir:  # Reached filesystem root
        break
    search_dir = parent

# Without a project root nothing below can work, so stop this cell
# (sys.exit raises SystemExit)
if project_root is None:
    print("❌ Could not find project root with scripts/config.py")
    print(f"💡 Current directory: {current_notebook_dir}")
    print("💡 Make sure you're running this notebook from within the project structure")
    sys.exit(1)

# Set working directory and Python path so that `config` is importable
os.chdir(project_root)
scripts_path = project_root / 'scripts'
if str(scripts_path) not in sys.path:
    sys.path.insert(0, str(scripts_path))

print(f"✅ Project root: {project_root}")
print(f"✅ Scripts path added: {scripts_path}")

# Import and validate configuration
print("📋 Loading project configuration...")
try:
    # Star-import: every public name of config.py becomes a notebook global
    from config import *
    
    # Initialize directories (ensure_directories comes from config)
    ensure_directories(DOMAIN)
    
    # Verify corpus databases and show results (helper comes from config)
    corpus_ready = verify_corpus_databases(DOMAIN)
    
    if corpus_ready:
        print(f"\n✅ Configuration loaded: {DOMAIN.upper()} domain, {len(LANGUAGES)} languages, {MODEL_NAME}")
        setup_success = True
    else:
        print(f"\n⚠️ Missing corpus files - check DOMAIN setting or run corpus collection")
        setup_success = False

# NOTE: failures are only reported and recorded in setup_success; the cell does
# not stop, so names from config may be undefined for the following cells.
except ImportError as e:
    print(f"❌ Configuration import failed: {e}")
    setup_success = False
except Exception as e:
    print(f"❌ Configuration error: {e}")
    setup_success = False

# Final status
if setup_success:
    print("\n✅ Environment ready for STEP 1: Load Required Libraries")
else:
    print("\n❌ Setup failed - resolve issues before proceeding")
    print("💡 Check that:")
    print("   - You're in the correct project directory")
    print("   - config.py exists in scripts/ folder")
    print("   - Corpus database files exist for configured languages")


## Part 1: Data Preparation and Planning

### STEP 1: LOAD THE REQUIRED LIBRARIES

In [None]:
# ==============================================================================
# STEP 1: LOAD THE REQUIRED LIBRARIES
# ==============================================================================
# Imports the standard-library and third-party modules used by this notebook.
# Third-party packages that cannot be imported are installed with pip into the
# running interpreter's environment before the real imports happen.

print("📚 Loading required libraries...")

# Core Python libraries (built-in)
import json
import logging
from typing import Dict, List, Optional
from datetime import datetime
import re
from collections import defaultdict

# Check and install required libraries (pip distribution names)
required_libs = ["numpy", "sentence-transformers", "scikit-learn", "matplotlib", "seaborn", "tqdm"]
missing_libs = []

# Distribution names whose importable module name differs from the pip name
import_names = {
    "sentence-transformers": "sentence_transformers",
    "scikit-learn": "sklearn",
}

# Test imports and collect missing libraries
import importlib

for lib in required_libs:
    try:
        importlib.import_module(import_names.get(lib, lib))
    except ImportError:
        missing_libs.append(lib)

# Install missing libraries
if missing_libs:
    print(f"📦 Installing missing libraries: {', '.join(missing_libs)}")
    import subprocess
    import sys
    # sys.executable ensures pip installs into the environment of this kernel
    subprocess.check_call([sys.executable, "-m", "pip", "install"] + missing_libs)
    # Modules installed after interpreter start may stay invisible to the
    # import system until its finder caches are cleared
    importlib.invalidate_caches()
    print("✅ Installation complete")

# Now import everything
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# LANGUAGES and DOMAIN come from the config star-import in the setup cell
print("🚀 All libraries loaded - ready for vectorization")
print(f"📊 Ready to process {len(LANGUAGES)} languages: {', '.join(LANGUAGES)}")
print(f"🎯 Domain: {DOMAIN}")

### STEP 2: LOAD AND EXAMINE DATABASE STRUCTURE

In [None]:
# ==============================================================================
# STEP 2: LOAD CORPUS DATABASES AND DEFINE CORPUS ITEM LOADING HELPER FUNCTION
# ==============================================================================

# Load document metadata from corpus databases.
# load_all_databases() is not defined in this notebook (presumably provided by
# the config star-import - TODO confirm). The result is used below as a mapping
# language -> {document_id -> metadata}.
document_metadata = load_all_databases()  # Note: This loads metadata, not actual content

# An empty/falsy result means there is nothing to vectorize, so stop here
if not document_metadata:
    raise Exception("❌ No document metadata loaded - check corpus database files")

# Create summary for processing: document count and a few example IDs per language.
# NOTE(review): document_summary is not read anywhere in the reviewed part of
# this notebook.
document_summary = {}
for language, docs in document_metadata.items():
    document_summary[language] = {
        'count': len(docs),
        'sample_ids': list(docs.keys())[:3]  # Show first 3 IDs as examples
    }

print(f"✅ Document metadata loaded - ready to load unprocessed documents for vectorization")

# Helper for loading a single corpus item (requires document_metadata loaded above)

def load_corpus_item(language, doc_id):
    """
    Return one corpus document as a single dict that combines its database
    metadata with the content stored in its per-document JSON file

    Args:
        language: Language code; key into document_metadata and PATHS
        doc_id: Document ID; also the stem of the content file name

    Returns:
        Dictionary with the keys 'document_metadata', 'processing_metadata',
        'document_id' and 'content'

    Raises:
        ValueError: Unknown language or document ID, or the content file
            could not be read or parsed
        FileNotFoundError: The content file does not exist
    """
    # Metadata lookup in the already loaded databases
    if language not in document_metadata:
        raise ValueError(f"Language '{language}' not found in loaded databases")
    language_docs = document_metadata[language]

    if doc_id not in language_docs:
        raise ValueError(f"Document '{doc_id}' not found in {language} database")
    db_entry = language_docs[doc_id]

    # The content lives in <processed dir of the language>/<doc_id>.json
    content_path = PATHS[language]['processed'] / f'{doc_id}.json'
    if not content_path.exists():
        raise FileNotFoundError(f"Content file not found: {content_path}")

    try:
        with open(content_path, 'r', encoding='utf-8') as handle:
            file_payload = json.load(handle)
    except Exception as err:
        raise ValueError(f"Error loading content file {content_path}: {err}")

    # Metadata sections come from the database entry, the document ID and
    # content from the file (falling back to the requested ID / empty content)
    return {
        'document_metadata': db_entry.get('document_metadata', {}),
        'processing_metadata': db_entry.get('processing_metadata', {}),
        'document_id': file_payload.get('document_id', doc_id),
        'content': file_payload.get('content', {})
    }

# Confirmation shown once this cell has defined load_corpus_item()
print(f"\n✅ Helper function loaded - load_corpus_item() ready to use")

### STEP 3: LOAD UNPROCESSED DOCUMENTS FOR BATCH PROCESSING (WITHOUT RE-VECTORIZATION)

In [None]:
# ==============================================================================
# STEP 3: LOAD UNPROCESSED DOCUMENTS FOR BATCH PROCESSING (NO RE-VECTORIZATION)
# ==============================================================================

def load_existing_vectors():
    """
    Scan the existing vector files and collect the IDs of documents that
    already have vectors

    Only OUTPUT_FILES entries whose key ends in '_vectors' are inspected.
    Files that cannot be read or parsed are reported and skipped.

    Returns:
        set: document IDs found in the 'document_id' field of stored vectors
    """
    vectors_dir = PATHS['vectors']
    already_vectorized = set()

    print(f"🔍 CHECKING EXISTING VECTORS IN: {vectors_dir}")

    for vector_type, filename in OUTPUT_FILES.items():
        # Entries of OUTPUT_FILES that are not vector files are ignored
        if not vector_type.endswith('_vectors'):
            continue

        filepath = vectors_dir / filename
        if not filepath.exists():
            print(f"  📝 {vector_type}: No existing file")
            continue

        try:
            with open(filepath, 'r', encoding='utf-8') as handle:
                stored = json.load(handle)

            entries = stored.get('vectors', [])

            # NOTE(review): append_vectors_to_file() stores document-level
            # entries with 'id' but without a 'document_id' key, so the
            # document vector file contributes no IDs here; detection relies
            # on the section and paragraph files.
            for entry in entries:
                owner_id = entry.get('document_id', '')
                if owner_id:
                    already_vectorized.add(owner_id)

            print(f"  ✅ {vector_type}: {len(entries)} vectors found")

        except Exception as err:
            print(f"  ❌ Error reading {filepath}: {err}")

    print(f"\n📊 SUMMARY: {len(already_vectorized)} documents already have vectors")
    return already_vectorized

def find_unprocessed_documents(all_documents, processed_doc_ids):
    """
    List the documents that do not have vectors yet

    Args:
        all_documents: Mapping language -> {document_id -> metadata entry}
        processed_doc_ids: Collection of document IDs that are already vectorized

    Returns:
        list: One summary dict per pending document with the keys
        'document_id', 'language', 'title', 'text_type' and 'word_count'
    """
    def summarize(language, doc_id, entry):
        # Missing metadata sections or fields fall back to placeholder values
        descriptive = entry.get('document_metadata', {})
        processing = entry.get('processing_metadata', {})
        return {
            'document_id': doc_id,
            'language': language,
            'title': descriptive.get('title', 'No title'),
            'text_type': descriptive.get('text_type', 'Unknown'),
            'word_count': processing.get('word_count', 0)
        }

    return [
        summarize(language, doc_id, entry)
        for language, docs in all_documents.items()
        for doc_id, entry in docs.items()
        if doc_id not in processed_doc_ids
    ]

def load_documents_for_processing(documents_to_process):
    """
    Load the full content of every pending document

    Args:
        documents_to_process: Summary dicts with at least 'document_id' and
            'language' (as returned by find_unprocessed_documents())

    Returns:
        list: Documents from load_corpus_item(), each extended with a
        'processing_info' key holding its summary dict. Documents that fail
        to load are reported and left out.
    """
    ready = []
    failures = 0

    print(f"\n📖 LOADING DOCUMENT CONTENT:")

    for summary in documents_to_process:
        doc_id, language = summary['document_id'], summary['language']

        try:
            item = load_corpus_item(language, doc_id)
            item['processing_info'] = summary
        except Exception as err:
            # One broken document must not stop the whole batch
            print(f"  ❌ {doc_id}: Error - {err}")
            failures += 1
        else:
            ready.append(item)

    if failures:
        print(f"  ✅ {len(ready)} loaded, {failures} failed")
    else:
        print(f"  ✅ All {len(ready)} documents loaded successfully")

    return ready

# Execute the loading process: determine which documents still need vectors
# and load their content into loaded_docs (read again by the Step 4 test code)
print("🚀 STARTING DOCUMENT LOADING PROCESS")
print("="*50)

# Check for existing vectors
existing_processed_ids = load_existing_vectors()

# Find unprocessed documents
documents_to_process = find_unprocessed_documents(document_metadata, existing_processed_ids)

# Show processing summary.
# "Already processed" counts the IDs found in the vector files, whether or not
# they occur in document_metadata, so the three numbers need not add up.
total_docs = sum(len(docs) for docs in document_metadata.values())
print(f"\n📊 PROCESSING SUMMARY:")
print(f"  • Total documents: {total_docs} | Already processed: {len(existing_processed_ids)} | Need processing: {len(documents_to_process)}")

# Load documents for processing (if any)
if documents_to_process:
    print(f"\n📝 DOCUMENTS TO PROCESS ({len(documents_to_process)} total):")
    for i, doc in enumerate(documents_to_process, 1):
        print(f"  {i:2d}. {doc['document_id']} ({doc['language'].upper()}) - {doc['word_count']:,} words")
        # Titles are cut to 60 characters for display
        print(f"      {doc['title'][:60]}{'...' if len(doc['title']) > 60 else ''}")
    
    # Load the actual content
    loaded_docs = load_documents_for_processing(documents_to_process)
    
    print(f"\n✅ LOADING COMPLETE: {len(loaded_docs)} documents ready for text extraction")

else:
    print(f"\n🎉 ALL DOCUMENTS ALREADY PROCESSED!")
    # Keep loaded_docs defined so that later cells can test it safely
    loaded_docs = []

print(f"\n✅ STEP 3 COMPLETE - Ready for text extraction functions!")

## =============================================================================

## Part 2: Function and Model Setup

### STEP 4: DEFINE TEXT EXTRACTION FUNCTIONS (STANDARDIZED)

In [None]:
# ==============================================================================
# STEP 4: STANDARDIZED TEXT EXTRACTION FUNCTIONS
# ==============================================================================

def extract_clean_text(text):
    """
    Clean and normalize text content

    Whitespace runs (including newlines and tabs) are collapsed to single
    spaces, leading/trailing whitespace is removed and typographic (curly)
    quotation marks are replaced by straight ones.

    Args:
        text: Raw text; non-string values are converted with str()

    Returns:
        Cleaned text string; "" for falsy input and for the literal "null"
    """
    if not text or text == "null":
        return ""

    # Convert to string if not already, then normalize whitespace
    # (split/join also removes leading and trailing whitespace)
    text = ' '.join(str(text).split())

    # Normalize quotation marks (straight quotes only as per QA checklist).
    # The curly quotes are written as escapes so that they cannot be lost or
    # mangled by editors and copy/paste.
    curly_to_straight = str.maketrans({
        '\u201c': '"',   # left double quotation mark
        '\u201d': '"',   # right double quotation mark
        '\u2018': "'",   # left single quotation mark
        '\u2019': "'",   # right single quotation mark / typographic apostrophe
    })
    return text.translate(curly_to_straight)

def extract_paragraph_content(paragraph):
    """
    Build the text that represents one paragraph object

    The text consists of the cleaned 'text' field followed by one
    "[Equation: <marker> = <latex>]" entry for every inline equation that has
    both a marker and a LaTeX source.

    Args:
        paragraph: Paragraph object from standardized JSON structure

    Returns:
        tuple: (paragraph_id, combined_text); ("unknown", "") if the
        paragraph is not a dict
    """
    if not isinstance(paragraph, dict):
        return "unknown", ""

    pieces = []

    # Main paragraph text (includes anchor text for links)
    body = extract_clean_text(paragraph.get('text', ''))
    if body:
        pieces.append(body)

    # Inline equations carry semantic content in technical domains
    for eq in paragraph.get('inline_equations', []):
        eq_marker = eq.get('marker', '')
        eq_latex = eq.get('latex', '')
        if not (eq_marker and eq_latex):
            continue
        pieces.append(f"[Equation: {eq_marker} = {eq_latex}]")

    return paragraph.get('id', 'unknown'), ' '.join(pieces)

def extract_section_content(section, document_id, parent_section_id="", nesting_level=0):
    """
    Recursively collect the text of a section and of everything nested in it

    The section text is the title, the section's own paragraphs and the text
    of all nested sections, joined by single spaces. Nested sections are read
    from the keys 'subsections' ... 'subsubsubsubsections' and from any other
    key that contains 'section' and holds a list.

    Args:
        section: Section object from standardized JSON structure
        document_id: Document identifier
        parent_section_id: Full ID of the enclosing section ("" at top level)
        nesting_level: Depth of this section (0 at top level)

    Returns:
        dict: 'id', 'title', 'text', 'document_id', 'paragraphs' (this section
        and all descendants), 'subsections' (all descendants as a flat list)
        and 'nesting_level'
    """
    own_id = section.get('id', 'unknown')
    heading = extract_clean_text(section.get('title', ''))

    # Nested sections get "<parent full id>_<last '_'-separated part of own id>"
    if parent_section_id:
        qualified_id = f"{parent_section_id}_{own_id.split('_')[-1]}"
    else:
        qualified_id = own_id

    text_chunks = [heading] if heading else []
    collected_paragraphs = []
    flattened_descendants = []

    # Paragraphs that belong directly to this section; empty ones are dropped
    for raw_paragraph in section.get('paragraphs', []):
        paragraph_id, paragraph_text = extract_paragraph_content(raw_paragraph)
        if not paragraph_text:
            continue

        text_chunks.append(paragraph_text)
        collected_paragraphs.append({
            'id': paragraph_id,
            'text': paragraph_text,
            'section_id': qualified_id,
            'section_title': heading,
            'document_id': document_id,
            'nesting_level': nesting_level
        })

    # Known container keys first, then any further list-valued key whose name
    # contains 'section' (in the order the keys appear in the section object)
    known_keys = [
        'subsections',
        'subsubsections',
        'subsubsubsections',
        'subsubsubsubsections'
    ]
    discovered_keys = [
        key for key in section.keys()
        if 'section' in key.lower() and key not in known_keys and isinstance(section[key], list)
    ]

    for container_key in known_keys + discovered_keys:
        for child in section.get(container_key, []):
            child_data = extract_section_content(
                child,
                document_id,
                qualified_id,
                nesting_level + 1
            )

            # The parent's text contains the text of every nested section
            text_chunks.append(child_data['text'])

            # Paragraphs and sections of all deeper levels are passed upwards;
            # each child is followed by its own (already flat) descendants
            collected_paragraphs.extend(child_data['paragraphs'])
            flattened_descendants.append(child_data)
            flattened_descendants.extend(child_data['subsections'])

    return {
        'id': qualified_id,
        'title': heading,
        'text': ' '.join(text_chunks),
        'document_id': document_id,
        'paragraphs': collected_paragraphs,
        'subsections': flattened_descendants,
        'nesting_level': nesting_level
    }

def extract_document_content(corpus_item):
    """
    Flatten one corpus document into text at document, section and paragraph
    level

    The document text is built from the title, the abstract, the text of all
    top-level sections (which already contains their nested sections) and the
    captions of top-level figures and tables, joined by single spaces.

    Args:
        corpus_item: Complete document object from load_corpus_item()

    Returns:
        dict: 'document_id', 'title', 'text_type', 'language',
        'document_text', 'sections' (flat list over all nesting levels),
        'paragraphs' (flat list, abstract first if present), 'statistics'
        and 'processing_metadata'
    """
    # Descriptive metadata
    document_id = corpus_item.get('document_id', 'unknown')
    meta = corpus_item.get('document_metadata', {})

    title = extract_clean_text(meta.get('title', ''))
    text_type = meta.get('text_type', 'unknown')
    family = meta.get('language_family', 'unknown')
    variant = meta.get('language_variant', 'unknown')
    language_label = f"{family}-{variant}"

    print(f"📄 Processing: {document_id} ({text_type})")

    # The title opens the document text
    text_chunks = [title] if title else []
    section_records = []
    paragraph_records = []

    content = corpus_item.get('content', {})

    # Abstract (if present); the string "null" counts as absent
    abstract = content.get('abstract')
    abstract_present = bool(abstract and abstract != "null")
    if abstract_present:
        abstract_text = extract_clean_text(abstract)
        if abstract_text:
            text_chunks.append(abstract_text)

            # The abstract is stored as a paragraph of a pseudo section
            paragraph_records.append({
                'id': f"{document_id}_abstract",
                'text': abstract_text,
                'section_id': 'abstract',
                'section_title': 'Abstract',
                'document_id': document_id
            })

    def flat_record(section_data):
        # Section entry without the nested 'paragraphs'/'subsections' lists
        return {
            'id': section_data['id'],
            'title': section_data['title'],
            'text': section_data['text'],
            'document_id': document_id,
            'nesting_level': section_data['nesting_level']
        }

    # Sections: every top-level section is followed by all of its descendants
    for top_section in content.get('sections', []):
        extracted = extract_section_content(top_section, document_id, "", 0)

        text_chunks.append(extracted['text'])
        section_records.append(flat_record(extracted))
        paragraph_records.extend(extracted['paragraphs'])
        section_records.extend(flat_record(nested) for nested in extracted['subsections'])

    # Captions of top-level figures (if present)
    figures = content.get('figures', [])
    for fig in figures:
        fig_caption = extract_clean_text(fig.get('caption', ''))
        if fig_caption:
            text_chunks.append(f"Figure {fig.get('id', '')}: {fig_caption}")

    # Captions of top-level tables (if present)
    tables = content.get('tables', [])
    for tbl in tables:
        tbl_caption = extract_clean_text(tbl.get('caption', ''))
        if tbl_caption:
            text_chunks.append(f"Table {tbl.get('id', '')}: {tbl_caption}")

    full_document_text = ' '.join(text_chunks)

    # Content statistics
    stats = {
        'total_sections': len(section_records),
        'total_paragraphs': len(paragraph_records),
        'document_length_chars': len(full_document_text),
        'document_length_words': len(full_document_text.split()),
        'text_type': text_type,
        'has_abstract': abstract_present,
        'has_figures': len(figures) > 0,
        'has_tables': len(tables) > 0,
        'language': language_label
    }

    print(f"  ✅ Extracted: {stats['total_sections']} sections, {stats['total_paragraphs']} paragraphs")
    print(f"  📊 Length: {stats['document_length_words']:,} words")

    return {
        'document_id': document_id,
        'title': title,
        'text_type': text_type,
        'language': language_label,
        'document_text': full_document_text,
        'sections': section_records,
        'paragraphs': paragraph_records,
        'statistics': stats,
        'processing_metadata': {
            'extracted_at': datetime.now().isoformat(),
            'extraction_method': 'standardized_schema_v1'
        }
    }

def validate_extracted_content(extracted_content):
    """
    Check that an extraction result is complete and not empty

    Validation fails (with a printed reason) if a required field is missing,
    if the document text is blank or if no paragraphs were extracted.

    Args:
        extracted_content: Result from extract_document_content()

    Returns:
        bool: True if validation passes
    """
    required = ('document_id', 'document_text', 'sections', 'paragraphs', 'statistics')

    # Only the first missing field is reported
    first_missing = next((name for name in required if name not in extracted_content), None)
    if first_missing is not None:
        print(f"❌ Missing required field: {first_missing}")
        return False

    doc_id = extracted_content['document_id']

    # The document must contain actual text ...
    if not extracted_content['document_text'].strip():
        print(f"❌ Empty document text for {doc_id}")
        return False

    # ... and at least one paragraph
    if len(extracted_content['paragraphs']) < 1:
        print(f"❌ No paragraphs extracted for {doc_id}")
        return False

    print(f"✅ Content validation passed for {doc_id}")
    return True

# Test the extraction functions with a sample document: the first document
# loaded in Step 3 is extracted and validated as a smoke test
print("🧪 Testing extraction functions...")

# loaded_docs is empty when every document already has vectors
if document_metadata and len(loaded_docs) > 0:
    # Test with first loaded document
    test_doc = loaded_docs[0]
    test_result = extract_document_content(test_doc)
    
    if validate_extracted_content(test_result):
        print("✅ Step 4 extraction functions ready for batch processing")
        print(f"📋 Test document: {test_result['document_id']}")
        print(f"📊 Test stats: {test_result['statistics']['total_sections']} sections, {test_result['statistics']['total_paragraphs']} paragraphs")
    else:
        print("❌ Extraction function validation failed")
else:
    print("⚠️ No documents available for testing - functions defined but not tested")
    print("✅ Step 4 extraction functions ready (untested)")

# Summary banner for this step.
# NOTE(review): the "Extracts:" line below is incomplete -
# extract_document_content() also adds the abstract and the captions of
# top-level figures and tables to the document text.
print("\n" + "="*60)
print("📋 STEP 4 COMPLETE: Standardized text extraction functions ready")
print("🎯 Functions defined:")
print("   • extract_clean_text() - Text normalization")
print("   • extract_paragraph_content() - Semantic paragraph processing")
print("   • extract_section_content() - Recursive section processing (arbitrary depth)")
print("   • extract_document_content() - Complete document processing")
print("   • validate_extracted_content() - Content validation")
print("🔄 Supports unlimited nesting: sections → subsections → subsubsections → subsubsubsections → ...")
print("📝 Extracts: Main text + section titles + equations (excludes footnotes & URLs)")
print("="*60)

### STEP 5: DEFINE VECTORIZATION FUNCTIONS

In [None]:
# ==============================================================================
# STEP 5: VECTORIZATION FUNCTIONS 
# ==============================================================================

def create_vector_metadata(model):
    """
    Build the 'metadata' section that is stored in every vector file

    All values come from the configuration constants and the current time;
    the model argument is accepted but not used.

    Args:
        model: Loaded embedding model (unused)

    Returns:
        dict: Model name, dimension, task, normalization flag, creation
        timestamp (ISO format) and model parameters
    """
    return dict(
        model=MODEL_NAME,
        dimension=MODEL_DIMENSIONS,
        task=MODEL_TASK,
        normalization=True,
        created=datetime.now().isoformat(),
        model_parameters=dict(
            trust_remote_code=MODEL_TRUST_REMOTE_CODE,
            normalize_embeddings=True,
        ),
    )

def create_document_vectors(extracted_content, model):
    """
    Embed the complete document text as one vector

    Args:
        extracted_content: Output from extract_document_content()
        model: Embedding model; encode() is called with task=MODEL_TASK and
            normalize_embeddings=True

    Returns:
        dict: 'id' (the document ID), 'title', 'text', 'word_count',
        'character_count' and 'vector' (as a list)
    """
    document_id = extracted_content['document_id']
    print(f"🎯 Creating document vector for: {document_id}")

    full_text = extracted_content['document_text']
    n_words = len(full_text.split())

    # One embedding for the whole document text
    embedding = model.encode(full_text, task=MODEL_TASK, normalize_embeddings=True)

    return {
        'id': document_id,
        'title': extracted_content['title'],
        'text': full_text,
        'word_count': n_words,
        'character_count': len(full_text),
        'vector': embedding.tolist()
    }

def create_section_vectors(extracted_content, model):
    """
    Embed every non-empty section of a document

    Args:
        extracted_content: Output from extract_document_content()
        model: Embedding model; encode() is called once per section with
            task=MODEL_TASK and normalize_embeddings=True

    Returns:
        list: One dict per section with ID, document ID and title, section
        title, text, word/character counts, nesting level and vector
    """
    print(f"📚 Creating section vectors for: {extracted_content['document_id']}")

    # The document title is repeated on every section entry
    document_title = extracted_content.get('title', 'No title')
    results = []

    for section in extracted_content['sections']:
        body = section['text']
        if not body.strip():
            continue  # blank sections get no vector

        n_words = len(body.split())
        embedding = model.encode(body, task=MODEL_TASK, normalize_embeddings=True)

        results.append({
            'id': section['id'],
            'document_id': section['document_id'],
            'document_title': document_title,
            'title': section['title'],
            'text': body,
            'word_count': n_words,
            'character_count': len(body),
            'nesting_level': section['nesting_level'],
            'vector': embedding.tolist()
        })

    print(f"  ✅ Created {len(results)} section vectors (with document titles)")
    return results

def create_paragraph_vectors(extracted_content, model):
    """
    Embed every non-empty paragraph of a document

    Args:
        extracted_content: Output from extract_document_content()
        model: Embedding model; encode() is called once per paragraph with
            task=MODEL_TASK and normalize_embeddings=True

    Returns:
        list: One dict per paragraph with ID, document ID and title, text,
        word/character counts, section ID and title, and vector
    """
    print(f"📝 Creating paragraph vectors for: {extracted_content['document_id']}")

    # The document title is repeated on every paragraph entry
    document_title = extracted_content.get('title', 'No title')
    results = []

    for paragraph in extracted_content['paragraphs']:
        body = paragraph['text']
        if not body.strip():
            continue  # blank paragraphs get no vector

        n_words = len(body.split())
        embedding = model.encode(body, task=MODEL_TASK, normalize_embeddings=True)

        results.append({
            'id': paragraph['id'],
            'document_id': paragraph['document_id'],
            'document_title': document_title,
            'text': body,
            'word_count': n_words,
            'character_count': len(body),
            'section_id': paragraph['section_id'],
            'section_title': paragraph['section_title'],
            'vector': embedding.tolist()
        })

    print(f"  ✅ Created {len(results)} paragraph vectors (with document titles)")
    return results

def append_vectors_to_file(new_vectors, vector_type, model):
    """
    Append vectors to the JSON vector file of the given type

    The file has the structure {"metadata": {...}, "vectors": [...]}. Existing
    vectors are kept, the new ones are added with a running 'count', and the
    metadata section is regenerated on every call. The file is written to a
    temporary sibling file first and then swapped into place, so that an
    interrupted write cannot destroy the vectors stored so far.

    Args:
        new_vectors: List of vectors to append
        vector_type: 'document', 'section', or 'paragraph'
        model: Embedding model; only passed on to create_vector_metadata()

    Returns:
        dict: File statistics ('file', 'vectors_added', 'total_vectors',
        'file_size_mb')

    Raises:
        ValueError: vector_type is not one of the three supported types
    """
    # Use correct filename format expected by JS
    filename_map = {
        'document': OUTPUT_FILES['document_vectors'],
        'section': OUTPUT_FILES['section_vectors'],
        'paragraph': OUTPUT_FILES['paragraph_vectors']
    }

    if vector_type not in filename_map:
        raise ValueError(f"Invalid vector_type: {vector_type}")

    filepath = PATHS['vectors'] / filename_map[vector_type]

    # Load existing file or create new structure
    existing_data = {"metadata": {}, "vectors": []}
    if filepath.exists():
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers invalid JSON and undecodable bytes; an
            # unreadable file is replaced by a fresh structure (best effort)
            print(f"  ⚠️ Error reading existing file, creating new: {e}")
            existing_data = {"metadata": {}, "vectors": []}

    # Ensure proper structure
    if 'vectors' not in existing_data:
        existing_data['vectors'] = []
    if 'metadata' not in existing_data:
        existing_data['metadata'] = {}

    # Numbering continues after the vectors that are already stored
    current_count = len(existing_data['vectors'])
    timestamp = datetime.now().isoformat()

    # Add vectors with JS-compatible schema
    for count, vector in enumerate(new_vectors, start=current_count + 1):
        # Fields shared by all vector types
        vector_obj = {
            'id': vector['id'],
            'count': count,  # Sequential count (required by JS)
            'created': timestamp,
            'text': vector['text'],
            'word_count': vector['word_count'],
            'character_count': vector['character_count'],
            'vector': vector['vector']
        }

        # Add type-specific fields
        if vector_type == 'document':
            vector_obj['title'] = vector.get('title', 'No title')
        elif vector_type == 'section':
            vector_obj['document_id'] = vector['document_id']
            vector_obj['document_title'] = vector.get('document_title', 'No title')
            vector_obj['title'] = vector.get('title', 'No title')  # Section title
            vector_obj['level'] = vector.get('nesting_level', 0)  # JS expects 'level' not 'nesting_level'
        elif vector_type == 'paragraph':
            vector_obj['document_id'] = vector['document_id']
            vector_obj['document_title'] = vector.get('document_title', 'No title')
            # Include section context for paragraphs
            if 'section_id' in vector:
                vector_obj['section_id'] = vector['section_id']
            if 'section_title' in vector:
                vector_obj['section_title'] = vector['section_title']

        existing_data['vectors'].append(vector_obj)

    # Update metadata
    existing_data['metadata'] = create_vector_metadata(model)

    # Save file: write the complete content next to the target and swap it in
    # only after the write succeeded. The temporary file lives in the same
    # directory so that the replacement never crosses a filesystem boundary.
    temp_path = filepath.with_name(filepath.name + '.tmp')
    try:
        with open(temp_path, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=2)
        temp_path.replace(filepath)
    finally:
        # Only present if writing or replacing failed
        if temp_path.exists():
            temp_path.unlink()

    # Calculate stats
    file_size_mb = filepath.stat().st_size / (1024*1024)

    print(f"  💾 Saved to: {filepath.name}")
    print(f"  📊 Added {len(new_vectors)} vectors (total: {len(existing_data['vectors'])})")
    print(f"  📁 File size: {file_size_mb:.2f} MB")

    return {
        'file': str(filepath),
        'vectors_added': len(new_vectors),
        'total_vectors': len(existing_data['vectors']),
        'file_size_mb': file_size_mb
    }

def process_document_vectors(extracted_content, model):
    """
    Vectorize one document at every enabled granularity and persist the results.

    Each granularity is gated by its CREATE_*_VECTORS flag. Whatever is
    produced is handed to append_vectors_to_file(). Section and paragraph
    granularities that yield nothing are reported on stdout and left out of
    the returned statistics.

    Args:
        extracted_content: Output from extract_document_content()
        model: Loaded SentenceTransformer model

    Returns:
        dict: Maps 'document' / 'section' / 'paragraph' to the value returned
        by append_vectors_to_file() for that granularity
    """
    document_id = extracted_content['document_id']
    title = extracted_content.get('title', 'No title')
    print(f"\n🔄 PROCESSING VECTORS FOR: {document_id}")
    print(f"📖 Document: {title}")
    print("="*60)

    results = {}

    # Document level: always exactly one vector, wrapped in a list for the writer
    if CREATE_DOCUMENT_VECTORS:
        whole_document = create_document_vectors(extracted_content, model)
        results['document'] = append_vectors_to_file([whole_document], 'document', model)

    # Section level: may legitimately be empty
    if CREATE_SECTION_VECTORS:
        by_section = create_section_vectors(extracted_content, model)
        if not by_section:
            print("  📚 No sections to vectorize")
        else:
            results['section'] = append_vectors_to_file(by_section, 'section', model)

    # Paragraph level: may legitimately be empty
    if CREATE_PARAGRAPH_VECTORS:
        by_paragraph = create_paragraph_vectors(extracted_content, model)
        if not by_paragraph:
            print("  📝 No paragraphs to vectorize")
        else:
            results['paragraph'] = append_vectors_to_file(by_paragraph, 'paragraph', model)

    print(f"✅ COMPLETED: {document_id}")
    return results

def verify_vector_files():
    """
    Check each vector output file against the schema the JS code expects.

    For every vector type the file named in OUTPUT_FILES is loaded from
    PATHS['vectors'] and only its FIRST vector is inspected: required keys,
    embedding length versus MODEL_DIMENSIONS, and (for section/paragraph
    files) the document title. Findings are printed as they are made.

    Returns:
        bool: False when an existing file could not be read or its first
        vector failed a check; True otherwise. A file that does not exist
        is reported but does not count as a failure.
    """
    print(f"\n🔍 VERIFYING VECTOR FILES (WITH DOCUMENT TITLES)")
    print("="*40)

    # Keys every vector entry must carry, whatever its type
    common_fields = ['id', 'count', 'created', 'text', 'vector', 'word_count']
    passed = True

    for vector_type in ('document', 'section', 'paragraph'):
        filename = OUTPUT_FILES[f'{vector_type}_vectors']
        filepath = PATHS['vectors'] / filename
        is_document = vector_type == 'document'

        if not filepath.exists():
            print(f"📄 {filename}: Not found")
            print()
            continue

        try:
            with open(filepath, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)

            # Check structure
            metadata = payload.get('metadata', {})
            entries = payload.get('vectors', [])

            print(f"📄 {filename}:")
            print(f"   Model: {metadata.get('model', 'MISSING')}")
            print(f"   Dimension: {metadata.get('dimension', 'MISSING')}")
            print(f"   Vector count: {len(entries)}")

            if entries:
                sample = entries[0]

                # Type-specific keys on top of the common ones
                extra_fields = ['title'] if is_document else ['document_id', 'document_title']
                expected = common_fields + extra_fields
                absent = [name for name in expected if name not in sample]

                if absent:
                    print(f"   ❌ Missing fields: {absent}")
                    passed = False
                else:
                    print(f"   ✅ Schema complete (with document titles)")

                    # Embedding length must agree with the configured model
                    actual_dim = len(sample['vector'])
                    if actual_dim == MODEL_DIMENSIONS:
                        print(f"   ✅ Vector dimension: {actual_dim}")
                    else:
                        print(f"   ❌ Vector dimension: {actual_dim} (expected {MODEL_DIMENSIONS})")
                        passed = False

                    # Section/paragraph entries must name their parent document
                    if not is_document:
                        doc_title = sample.get('document_title', 'MISSING')
                        if doc_title != 'MISSING':
                            print(f"   ✅ Document title: {doc_title[:30]}...")
                        else:
                            print(f"   ❌ Document title missing")
                            passed = False

        except Exception as e:
            print(f"   ❌ Error: {e}")
            passed = False

        print()

    if passed:
        print("✅ All files compatible with JS code and include document titles!")
    else:
        print("⚠️ Some compatibility issues found")

    return passed

# STEP 5 banner: list the helpers defined by this cell
print(
    "✅ STEP 5 COMPLETE: Enhanced vectorization functions ready",
    "🎯 Functions defined:",
    "   • create_document_vectors() - Document-level vectorization",
    "   • create_section_vectors() - Section-level vectorization (+ document title)",
    "   • create_paragraph_vectors() - Paragraph-level vectorization (+ document title)",
    "   • process_document_vectors() - Complete document processing",
    "   • verify_vector_files() - JS compatibility verification",
    sep="\n",
)
# Kept as its own call: it is the only line that depends on configuration (DOMAIN)
print(f"📁 Output files: {DOMAIN}-corpus-[type]-vectors.json")
print(
    "💡 Includes word_count, character_count, and document_title metadata",
    "📖 Section and paragraph vectors now include document title for easier reference",
    "🔗 Fully compatible with existing JS translation pipeline",
    sep="\n",
)

### STEP 6: INITIALIZE THE MULTILINGUAL EMBEDDING MODEL

In [None]:
import subprocess
import sys

# Run pip through the interpreter executing this notebook (sys.executable)
# so einops is installed into the kernel's own environment.
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "einops"]
)

In [None]:
# ==============================================================================
# STEP 6: INITIALIZE THE MULTILINGUAL EMBEDDING MODEL
# ==============================================================================
# Loads MODEL_NAME through SentenceTransformer, compares the reported embedding
# size with MODEL_DIMENSIONS and encodes one test sentence. If anything in that
# sequence raises, `model` is set to None so later steps can detect the failure.

print(f"🤖 Loading {MODEL_NAME}...")
print("📥 First run may take a moment to download model (~2GB)")

try:
    # Load jina-embeddings-v3 model
    model = SentenceTransformer(
        MODEL_NAME,
        trust_remote_code=MODEL_TRUST_REMOTE_CODE
    )

    # Verify model loaded correctly
    actual_dimension = model.get_sentence_embedding_dimension()
    max_length = model.max_seq_length

    print(f"✅ Model loaded successfully!")
    print(f"📊 Dimension: {actual_dimension} | Max length: {max_length}")

    # Check dimension matches config (warning only; the model is still kept)
    if actual_dimension != MODEL_DIMENSIONS:
        print(f"⚠️ WARNING: Expected {MODEL_DIMENSIONS}D, got {actual_dimension}D")
        print("Check your config.py MODEL_DIMENSIONS setting")

    # Quick functionality test
    print("🧪 Testing model...")
    test_vector = model.encode(
        "Test sentence for model verification.",
        task=MODEL_TASK,
        normalize_embeddings=True
    )

    # Verify output
    if len(test_vector) == MODEL_DIMENSIONS:
        print(f"✅ Model test passed - ready for vectorization!")
    else:
        print(f"❌ Test failed - vector dimension: {len(test_vector)}")

    print(f"🎯 Model optimized for: {MODEL_TASK}")
    print(f"🌐 Supports: English, Spanish and Simplified Chinese")

except Exception as e:
    # Notebook boundary: report the problem instead of aborting the kernel.
    # This also covers failures of the test encode above, not only loading.
    print(f"❌ Model loading failed: {e}")
    # The requirement is quoted on purpose: pasted into a shell unquoted, ">"
    # is an output redirection, so the version constraint would be dropped.
    print('💡 Try: pip install "sentence-transformers>=2.7.0"')
    model = None

if model is not None:
    print(f"\n✅ STEP 6 COMPLETE: Model ready for batch processing")
else:
    print(f"\n❌ STEP 6 FAILED: Fix model loading before proceeding")

## =============================================================================

## Part 3: Execution and Output

### STEP 7: BATCH PROCESS DOCUMENTS

In [None]:
# ==============================================================================
# STEP 7: BATCH PROCESS DOCUMENTS INTO VECTORS
# ==============================================================================
# Runs every pending document through extraction and vectorization, then
# reports the totals for this run and the overall corpus coverage.

print("🚀 STARTING BATCH VECTORIZATION PROCESS")
print("=" * 60)

# Check if we have the model and documents ready
if 'model' not in locals() or model is None:
    print("❌ Model not loaded - run STEP 6 first")
    sys.exit(1)

if 'documents_to_process' not in locals():
    print("❌ Documents not identified - run STEP 3 first")
    sys.exit(1)

# Track processing statistics. Created BEFORE the branch below so the final
# summary can still read them when there is nothing to process (otherwise the
# name would be undefined, or left over from an earlier run of this cell).
processing_stats = {
    'documents_processed': 0,
    'documents_failed': 0,
    'total_vectors_created': 0,
    'files_updated': []
}

# Process documents if we have any to process
if documents_to_process:
    print(f"📋 Processing {len(documents_to_process)} new documents")
    print(f"🤖 Model: {MODEL_NAME}")
    print(f"📐 Dimensions: {MODEL_DIMENSIONS}")
    print(f"🎯 Task: {MODEL_TASK}")
    print("-" * 40)

    # Load document content if not already loaded
    if 'loaded_docs' not in locals() or not loaded_docs:
        print("📖 Loading document content...")
        loaded_docs = load_documents_for_processing(documents_to_process)
        print(f"✅ Loaded {len(loaded_docs)} documents for processing")

    # Process each document
    for i, corpus_item in enumerate(loaded_docs, 1):
        doc_id = corpus_item.get('document_id', f'unknown_{i}')

        # Show progress for every document
        print(f"\n📄 {i}/{len(loaded_docs)}: {doc_id}")

        try:
            # Extract text content using established function
            extracted_content = extract_document_content(corpus_item)

            # Validate extraction worked
            if not validate_extracted_content(extracted_content):
                print(f"  ❌ Content extraction failed")
                processing_stats['documents_failed'] += 1
                continue

            # Process into vectors using established function
            doc_stats = process_document_vectors(extracted_content, model)

            # Update statistics
            processing_stats['documents_processed'] += 1

            # Count vectors created and remember which output files were touched
            vectors_created = 0
            for vector_type in ('document', 'section', 'paragraph'):
                if vector_type not in doc_stats:
                    continue
                type_stats = doc_stats[vector_type]
                vectors_created += type_stats['vectors_added']
                updated_file = type_stats.get('file')
                if updated_file and updated_file not in processing_stats['files_updated']:
                    processing_stats['files_updated'].append(updated_file)

            processing_stats['total_vectors_created'] += vectors_created
            print(f"  ✅ Created {vectors_created} vectors")

        except Exception as e:
            # One bad document must not stop the batch; record it and move on
            print(f"  ❌ Error processing {doc_id}: {e}")
            processing_stats['documents_failed'] += 1
            continue

    # Final processing summary
    print("\n" + "=" * 60)
    print("📊 BATCH PROCESSING COMPLETE")
    print("=" * 60)
    print(f"✅ Documents processed: {processing_stats['documents_processed']}")
    print(f"🎯 Total vectors created: {processing_stats['total_vectors_created']}")

    if processing_stats['documents_failed'] > 0:
        print(f"⚠️  Documents failed: {processing_stats['documents_failed']}")

    # Show language breakdown (counts every loaded document, including failures)
    lang_counts = {}
    for doc in loaded_docs:
        # Extract language from processing_info
        lang = doc.get('processing_info', {}).get('language', 'unknown')
        lang_counts[lang] = lang_counts.get(lang, 0) + 1

    if lang_counts:
        lang_summary = ", ".join([f"{lang.upper()}: {count}" for lang, count in lang_counts.items()])
        print(f"🌐 Languages processed: {lang_summary}")

    # Verify vector files
    print(f"\n🔍 VERIFYING OUTPUT FILES")
    print("-" * 30)
    if verify_vector_files():
        print("✅ All vector files are properly formatted")
    else:
        print("⚠️  Some vector files may have issues")

else:
    print("🎉 NO PROCESSING NEEDED")
    print("All documents already have vectors")

# Show final corpus status
print(f"\n📊 FINAL CORPUS STATUS")
print("-" * 30)

# Reload existing vectors to get current totals
current_vectors = load_existing_vectors()
total_corpus_docs = sum(len(docs) for docs in document_metadata.values())
processed_docs = len(current_vectors)
coverage = (processed_docs / total_corpus_docs * 100) if total_corpus_docs > 0 else 0

print(f"Documents: {processed_docs}/{total_corpus_docs} ({coverage:.1f}% coverage)")
print(f"Model: {MODEL_NAME}")
print(f"Dimensions: {MODEL_DIMENSIONS}")

# Check if corpus is complete
if processed_docs >= total_corpus_docs:
    print("🎉 CORPUS VECTORIZATION COMPLETE!")
    print("🌐 Ready for cross-lingual analysis and clustering")
else:
    remaining = total_corpus_docs - processed_docs
    print(f"📋 Remaining: {remaining} documents to process")

# Store final results for next steps
final_processing_summary = {
    'documents_processed': processing_stats['documents_processed'],
    'total_vectors_created': processing_stats['total_vectors_created'],
    'corpus_coverage_percent': round(coverage, 1),
    'total_documents_in_corpus': total_corpus_docs,
    'processed_documents_count': processed_docs,
    'model_used': MODEL_NAME,
    'vector_dimensions': MODEL_DIMENSIONS,
    'task_optimization': MODEL_TASK,
    'output_directory': str(PATHS['vectors'])
}

print(f"\n✅ STEP 7 COMPLETE")
print(f"📝 Results stored in 'final_processing_summary' variable")
print(f"📁 Vector files saved to: {PATHS['vectors']}")
print(f"🚀 Ready for STEP 8: Generate visualization report")

### STEP 8: GENERATE HTML VISUALIZATION REPORT

In [None]:
import sys
import subprocess

# Install plotly and scikit-learn through the interpreter running this
# notebook (sys.executable) so both land in the kernel's own environment.
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "plotly", "scikit-learn"]
)
print("✅ Plotly installed in current environment")

In [None]:
# ==============================================================================
# STEP 8: GENERATE OPTIMIZED JSON VISUALIZATION DATA
# Exports clean JSON data for frontend visualization system
# ==============================================================================

import plotly.graph_objects as go
import plotly.express as px
import numpy as np
from sklearn.decomposition import PCA
from datetime import datetime
import json
from pathlib import Path

def extract_language_from_id(doc_id):
    """
    Extract the language code from a document ID.

    The ID is expected to look like '<domain>-<language>_<item>'. The language
    is everything between the first hyphen and the first underscore, so
    hyphenated codes survive intact:

        'gai-eng_item001'     -> 'eng'
        'gai-zho-chn_item001' -> 'zho-chn'

    Args:
        doc_id: Document identifier; anything that is not a string is
            treated as unrecognizable

    Returns:
        str: The language code, or 'unknown' when the ID has no underscore,
        its prefix has no hyphen, or nothing follows the hyphen
    """
    if not isinstance(doc_id, str) or '_' not in doc_id:
        return 'unknown'

    prefix = doc_id.split('_')[0]
    # Split on the FIRST hyphen only: the colour tables in this file are keyed
    # on 'zho-chn', which taking just the last hyphen-separated piece could
    # never produce.
    _domain, hyphen, language = prefix.partition('-')
    if not hyphen or not language:
        return 'unknown'
    return language

def get_language_color_scheme():
    """
    Build the colour lookup used by the visualization.

    Returns:
        dict: language code -> {granularity label -> hex colour}. Each
        language gets four shades running from darkest ('Document') to
        lightest ('Paragraph'); 'unknown' uses grays. A new dict is built
        on every call.
    """
    granularities = ('Document', 'Section (L0)', 'Section (L1+)', 'Paragraph')

    # One row per language, shades listed in the same order as `granularities`
    shades_by_language = (
        ('eng', ('#1f77b4', '#5299c4', '#7db8d4', '#a8d1e8')),      # blues
        ('esp', ('#2ca02c', '#5cb85c', '#8cc98c', '#b8dab8')),      # greens
        ('zho-chn', ('#d62728', '#e55858', '#f08888', '#fbb8b8')),  # reds
        ('unknown', ('#666666', '#888888', '#aaaaaa', '#cccccc')),  # grays
    )

    return {
        language: dict(zip(granularities, shades))
        for language, shades in shades_by_language
    }

def _first_words_excerpt(text, max_words=10):
    """Return the first `max_words` whitespace-separated words of `text`, followed by "..." when truncated."""
    words = text.split()
    excerpt = ' '.join(words[:max_words])
    if len(words) > max_words:
        excerpt += "..."
    return excerpt


def load_all_vectors_for_visualization(section_stride=3, paragraph_stride=8):
    """
    Load document, section and paragraph vectors and prepare per-point plot data.

    Every document vector is loaded; sections and paragraphs are sub-sampled
    (every `section_stride`-th / `paragraph_stride`-th entry) to keep the
    charts readable. Files that do not exist are skipped.

    Args:
        section_stride: Keep every n-th section vector (default 3)
        paragraph_stride: Keep every n-th paragraph vector (default 8)

    Returns:
        tuple: (vectors, labels, types, languages, colors, details) where
        `vectors` is np.array() of the collected embeddings and the other
        five are lists with one entry per collected vector
    """
    vector_dir = PATHS['vectors']

    all_vectors = []
    all_labels = []
    all_types = []
    all_languages = []
    all_colors = []
    all_details = []

    color_scheme = get_language_color_scheme()

    def add_point(embedding, label, vector_type, language, detail):
        """Append one plot point to all six parallel lists."""
        all_vectors.append(embedding)
        all_labels.append(label)
        all_types.append(vector_type)
        all_languages.append(language)
        # A language code missing from the scheme gets the neutral 'unknown'
        # palette instead of raising KeyError.
        palette = color_scheme.get(language, color_scheme['unknown'])
        all_colors.append(palette[vector_type])
        all_details.append(detail)

    # Load document vectors
    doc_file = vector_dir / OUTPUT_FILES['document_vectors']
    if doc_file.exists():
        print(f"📄 Loading document vectors from {doc_file.name}")
        with open(doc_file, 'r', encoding='utf-8') as f:
            doc_data = json.load(f)

        for vector in doc_data.get('vectors', []):
            doc_id = vector['id']
            language = extract_language_from_id(doc_id)

            # Create readable label
            title = vector.get('title', 'No title')
            short_title = title[:25] + "..." if len(title) > 25 else title

            # Document popup data (NO excerpt for documents)
            add_point(
                vector['vector'],
                f"DOC ({language.upper()}): {short_title}",
                'Document',
                language,
                {
                    'corpus_item': doc_id,
                    'document_title': title,
                    'type': 'DOCUMENT',
                    'language': language.upper(),
                    'word_count': vector.get('word_count', 'N/A')
                },
            )

    # Load section vectors (sample to avoid clutter)
    section_file = vector_dir / OUTPUT_FILES['section_vectors']
    if section_file.exists():
        print(f"📚 Loading section vectors from {section_file.name}")
        with open(section_file, 'r', encoding='utf-8') as f:
            section_data = json.load(f)

        sections = section_data.get('vectors', [])
        for vector in sections[::section_stride]:
            doc_id = vector.get('document_id', 'unknown')
            language = extract_language_from_id(doc_id)
            level = vector.get('level', 0)

            # Group all sub-levels (L1, L2, L3...) together
            if level == 0:
                vector_type = 'Section (L0)'
                level_display = 'L0'
            else:
                vector_type = 'Section (L1+)'
                level_display = f'L{level}'

            # Create readable label
            title = vector.get('title', 'No title')
            short_title = title[:20] + "..." if len(title) > 20 else title

            # Section popup data
            add_point(
                vector['vector'],
                f"SEC-{level_display} ({language.upper()}): {short_title}",
                vector_type,
                language,
                {
                    'corpus_item': doc_id,
                    'document_title': vector.get('document_title', 'No title'),
                    'type': 'SECTION',
                    'language': language.upper(),
                    'section_id': vector['id'],
                    'section_title': title,
                    'excerpt': _first_words_excerpt(vector.get('text', '')),
                    'level': level
                },
            )

    # Load paragraph vectors (smaller sample)
    para_file = vector_dir / OUTPUT_FILES['paragraph_vectors']
    if para_file.exists():
        print(f"📝 Loading paragraph vectors from {para_file.name}")
        with open(para_file, 'r', encoding='utf-8') as f:
            para_data = json.load(f)

        paragraphs = para_data.get('vectors', [])
        for vector in paragraphs[::paragraph_stride]:
            doc_id = vector.get('document_id', 'unknown')
            language = extract_language_from_id(doc_id)

            # Create readable label
            text = vector.get('text', '')
            short_text = text[:25] + "..." if len(text) > 25 else text

            # Paragraph popup data
            add_point(
                vector['vector'],
                f"PARA ({language.upper()}): {short_text}",
                'Paragraph',
                language,
                {
                    'corpus_item': doc_id,
                    'document_title': vector.get('document_title', 'No title'),
                    'type': 'PARAGRAPH',
                    'language': language.upper(),
                    'paragraph_id': vector['id'],
                    'section_title': vector.get('section_title', 'No section'),
                    'excerpt': _first_words_excerpt(text)
                },
            )

    print(f"✅ Loaded {len(all_vectors)} vectors for visualization")
    return np.array(all_vectors), all_labels, all_types, all_languages, all_colors, all_details

def create_comprehensive_corpus_statistics():
    """
    Count the stored vectors per granularity and derive corpus coverage.

    Each of the three vector files is read if it exists; a file that cannot
    be read is reported on stdout and leaves its counters untouched from
    that point on. Language counts and the set of processed document IDs
    come from the document-vector file only.

    Returns:
        dict: Totals per granularity, 'vector_types' counts, per-language
        document counts, 'coverage_percent', 'corpus_documents', and
        'processed_documents' (number of distinct document IDs seen)
    """
    vector_dir = PATHS['vectors']

    stats = {
        'total_documents': 0,
        'total_sections': 0,
        'total_paragraphs': 0,
        'total_vectors': 0,
        'languages': {},
        'vector_types': {
            'document_vectors': 0,
            'section_vectors': 0,
            'paragraph_vectors': 0
        },
        'coverage_percent': 0,
        'processed_documents': set(),
        'corpus_documents': 0
    }

    # Same read-and-count routine for each granularity, in file order
    for kind in ('document', 'section', 'paragraph'):
        source = vector_dir / OUTPUT_FILES[f'{kind}_vectors']
        if not source.exists():
            continue

        try:
            with open(source, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)
            entries = payload.get('vectors', [])
            stats[f'total_{kind}s'] = len(entries)
            stats['vector_types'][f'{kind}_vectors'] = len(entries)

            # Only document entries feed the language and coverage figures
            if kind == 'document':
                for entry in entries:
                    entry_id = entry.get('id', '')
                    stats['processed_documents'].add(entry_id)
                    language = extract_language_from_id(entry_id)
                    stats['languages'][language] = stats['languages'].get(language, 0) + 1
        except Exception as e:
            print(f"Error reading {kind} vectors: {e}")

    stats['total_vectors'] = sum(
        stats[key] for key in ('total_documents', 'total_sections', 'total_paragraphs')
    )

    # Coverage needs the original corpus listing, which exists only if an
    # earlier cell defined `document_metadata`
    if 'document_metadata' in globals():
        corpus_size = sum(len(docs) for docs in document_metadata.values())
        stats['corpus_documents'] = corpus_size
        if corpus_size > 0:
            share = len(stats['processed_documents']) / corpus_size
            stats['coverage_percent'] = round(share * 100, 1)

    # A set is not JSON serializable; keep only its size
    stats['processed_documents'] = len(stats['processed_documents'])

    return stats

def create_visualization_data(vectors, labels, types, languages, colors, details):
    """
    Project the vectors with PCA and assemble the chart payloads.

    Two separate PCA models (2 and 3 components) are fitted on `vectors`.
    The per-point sequences are walked in lockstep, so point i combines
    row i of each projection with the i-th label/type/language/color/detail.

    Args:
        vectors: Embeddings passed to PCA.fit_transform()
        labels: Display label per point
        types: Granularity label per point
        languages: Language code per point (upper-cased in the output)
        colors: Colour per point
        details: Popup dict per point (the same object is referenced by the
            2D and the 3D entry)

    Returns:
        dict: 'pca_2d' and 'pca_3d' (points, explained variance ratios,
        title) plus 'language_distribution' (count and colour per language)
    """
    reducer_2d = PCA(n_components=2)
    reducer_3d = PCA(n_components=3)

    plane = reducer_2d.fit_transform(vectors)
    volume = reducer_3d.fit_transform(vectors)

    chart_2d_data = []
    chart_3d_data = []

    per_point = zip(labels, types, languages, colors, details)
    for row, (label, type_name, lang, color, detail) in enumerate(per_point):
        # Fields shared by the 2D and 3D representation of this point
        shared = {
            "label": label,
            "type": type_name,
            "language": lang.upper(),
            "color": color,
            "popup": detail
        }
        chart_2d_data.append({
            "x": float(plane[row, 0]),
            "y": float(plane[row, 1]),
            **shared
        })
        chart_3d_data.append({
            "x": float(volume[row, 0]),
            "y": float(volume[row, 1]),
            "z": float(volume[row, 2]),
            **shared
        })

    # Tally points per language, keeping first-seen order
    tally = {}
    for lang in languages:
        tally.setdefault(lang, 0)
        tally[lang] += 1

    lang_colors = {'eng': '#1f77b4', 'esp': '#2ca02c', 'zho-chn': '#d62728', 'unknown': '#666666'}

    dist_data = [
        {
            "language": lang.upper(),
            "count": count,
            "color": lang_colors.get(lang, '#666666')
        }
        for lang, count in tally.items()
    ]

    return {
        "pca_2d": {
            "data": chart_2d_data,
            "variance_explained": [float(ratio) for ratio in reducer_2d.explained_variance_ratio_[:2]],
            "title": f"Cross-lingual Vector Space (2D) - {MODEL_DIMENSIONS}D Embeddings"
        },
        "pca_3d": {
            "data": chart_3d_data,
            "variance_explained": [float(ratio) for ratio in reducer_3d.explained_variance_ratio_[:3]],
            "title": f"Cross-lingual Vector Space (3D) - {MODEL_DIMENSIONS}D Embeddings"
        },
        "language_distribution": {
            "data": dist_data,
            "title": "Vector Distribution by Language"
        }
    }

def generate_json_visualization_export():
    """
    Write one timestamped JSON file holding everything the frontend charts need.

    Loads the vectors, gathers corpus statistics, builds the chart data and
    dumps metadata + statistics + charts into PATHS['visualizations'] (the
    directory is created when missing). Progress is printed throughout.

    Returns:
        dict | None: None when no vectors could be loaded; otherwise a dict
        with 'json_file', 'json_filename', 'size_mb', 'stats' and 'timestamp'
    """
    print("🎨 GENERATING JSON VISUALIZATION EXPORT")
    print("=" * 60)

    # Use config paths
    vector_dir = PATHS['vectors']
    output_dir = PATHS['visualizations']
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"📁 Vector source: {vector_dir}")
    print(f"📁 Output directory: {output_dir}")

    print("📊 Loading vectors for visualization...")
    vectors, labels, types, languages, colors, details = load_all_vectors_for_visualization()

    # Nothing to plot: bail out before any statistics or PCA work
    if len(vectors) == 0:
        print("❌ No vectors found to visualize!")
        print(f"💡 Check that vector files exist in: {vector_dir}")
        return None

    print(f"✅ Loaded {len(vectors)} vectors for visualization")
    print(f"📐 Vector dimensions: {len(vectors[0])} (expected: {MODEL_DIMENSIONS})")

    print("📈 Computing comprehensive corpus statistics...")
    stats = create_comprehensive_corpus_statistics()

    print("🎯 Creating visualization data...")
    charts = create_visualization_data(vectors, labels, types, languages, colors, details)

    # One timestamp drives the metadata, the version tag and the file name
    timestamp = datetime.now()
    readable_stamp = timestamp.strftime('%Y-%m-%d %H:%M:%S')
    compact_stamp = timestamp.strftime('%Y%m%d_%H%M%S')

    # Flatten the statistics into the layout the frontend reads
    corpus_statistics = {
        name: stats[name]
        for name in ('total_documents', 'total_sections', 'total_paragraphs', 'total_vectors')
    }
    for name in ('document_vectors', 'section_vectors', 'paragraph_vectors'):
        corpus_statistics[name] = stats['vector_types'][name]
    for name in ('coverage_percent', 'processed_documents', 'corpus_documents', 'languages'):
        corpus_statistics[name] = stats[name]

    visualization_export = {
        "metadata": {
            "model": MODEL_NAME,
            "dimensions": MODEL_DIMENSIONS,
            "task": MODEL_TASK,
            "domain": DOMAIN.upper(),
            "generated": timestamp.isoformat(),
            "generated_readable": readable_stamp,
            "version": compact_stamp
        },
        "corpus_statistics": corpus_statistics,
        "charts": charts
    }

    # Save JSON file with timestamp
    json_filename = f"{DOMAIN}_visualization_data_{compact_stamp}.json"
    json_path = output_dir / json_filename
    with open(json_path, 'w', encoding='utf-8') as out:
        json.dump(visualization_export, out, ensure_ascii=False, indent=2)

    json_size_mb = json_path.stat().st_size / (1024*1024)

    print(f"✅ JSON VISUALIZATION EXPORT COMPLETE")
    print(f"📁 Saved to: {json_path}")
    print(f"📊 File size: {json_size_mb:.2f}MB")
    print(f"📈 Statistics included:")
    print(f"   • Total documents: {stats['total_documents']}")
    print(f"   • Total vectors: {stats['total_vectors']}")
    print(f"   • Coverage: {stats['coverage_percent']}%")
    print(f"   • Languages: {dict(stats['languages'])}")
    print(f"🎯 Charts included: 2D PCA, 3D PCA, Language Distribution")
    print(f"🔧 Document popups: NO excerpt (cleaned as requested)")
    print(f"📱 Ready for frontend integration")

    return {
        'json_file': str(json_path),
        'json_filename': json_filename,
        'size_mb': json_size_mb,
        'stats': stats,
        'timestamp': readable_stamp
    }

# Run the export, then tell the user what to do with the resulting file
print("🚀 Creating JSON visualization export...")
export_result = generate_json_visualization_export()

if not export_result:
    # The export returned None: there were no vectors to visualize
    print(f"\n❌ Could not generate JSON export")
    print(f"💡 Make sure Step 7 batch processing completed successfully")
else:
    print(
        f"\n🎉 JSON EXPORT COMPLETE!",
        f"💡 Frontend Integration:",
        f"   1. Upload {export_result['json_filename']} to your website",
        f"   2. Update visualizations.js to load this file",
        f"   3. Corpus statistics and charts will auto-populate",
        f"",
        f"📊 Export Summary:",
        f"   • File: {export_result['json_filename']}",
        f"   • Size: {export_result['size_mb']:.2f}MB",
        f"   • Generated: {export_result['timestamp']}",
        f"   • Documents: {export_result['stats']['total_documents']}",
        f"   • Vectors: {export_result['stats']['total_vectors']}",
        f"",
        f"🔄 Re-run this step after processing new documents",
        f"📈 Each export contains complete corpus state",
        f"🕐 Historical comparison: Compare multiple JSON files over time",
        sep="\n",
    )