# In [None]:
"""
KG-Enhanced Indonesian Legal RAG System - Complete Integrated Version v2
Enhanced with advanced Knowledge Graph features from new dataset
Maintains all original functionality + team simulation + new KG capabilities
"""

import torch
import pandas as pd
import numpy as np
import json
import gradio as gr
from collections import defaultdict, Counter
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, TextIteratorStreamer, BitsAndBytesConfig
import torch.nn.functional as F
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from datetime import datetime, timedelta
import gc
from datasets import load_dataset
from typing import Dict, List, Set, Tuple, Optional, Any
from collections import deque
import warnings
import threading
from threading import Thread
import time
import json
import markdown
import igraph as ig
from community import community_louvain
warnings.filterwarnings('ignore')

# =============================================================================
# ENHANCED CONFIGURATION WITH NEW KG DATASET
# =============================================================================

# Root directories of the local (Windows) workspace that holds the dataset,
# the encoder/reranker checkpoints and the generator checkpoints.
dataset_source = 'D:/AI_Workspace/10_Dataset_HF'
embed_source = 'D:/AI_Workspace/10_Encoder_Model_HF'
model_source = 'D:/AI_Workspace/10_Model_HF'

# Dataset configuration - UPDATED TO NEW DATASET
DATASET_NAME = dataset_source + "/ID_REG_KG_2511"
HF_TOKEN = None  # Set if needed

# Model configurations
EMBEDDING_MODEL = embed_source + '/Qwen3-Embedding-0.6B'
RERANKER_MODEL = embed_source + '/Qwen3-Reranker-0.6B'
# Alternative checkpoint in the same directory: "Deepseek_ID_Legal_Preview_FP4".
# It used to be assigned to LLM_MODEL on the line directly above the assignment
# below and was immediately overwritten, so only this checkpoint ever took effect.
LLM_MODEL = model_source + "/Qwen3-4B-Thinking-2507-unsloth-bnb-4bit"
MAX_LENGTH = 32768

# Enhanced default configurations (unchanged)
# Baseline runtime settings, grouped below by apparent purpose.
# NOTE(review): the code that reads these keys is outside this chunk; the
# group labels are inferred from the key names only - confirm against callers.
DEFAULT_CONFIG = {
    # Retrieval / research-round settings.
    'final_top_k': 3,
    'max_rounds': 5,
    'initial_quality': 0.95,
    'quality_degradation': 0.1,
    'min_quality': 0.5,
    'parallel_research': True,
    'research_team_size': 4,
    # Text-generation sampling settings (presumably forwarded to the LLM).
    'temperature': 0.7,
    'max_new_tokens': 2048,
    'top_p': 1.0,
    'top_k': 20,
    'min_p': 0.1,
    # Team-review switches.
    'enable_cross_validation': True,
    'enable_devil_advocate': True,
    'consensus_threshold': 0.6
}

# Enhanced search phases (unchanged)
# Per-phase search settings, keyed by phase id.
#
# Every phase carries the same seven fields: a candidate budget, a semantic
# and a keyword threshold, a human-readable description, a time limit, a list
# of focus areas and an on/off switch.
#
# Across the four enabled phases (in declaration order) the candidate budget
# shrinks 400 -> 150 -> 60 -> 30 while both thresholds rise, i.e. each phase
# is narrower and stricter than the one before it.
# NOTE(review): the unit of 'time_limit' (seconds?) and the way 'focus_areas'
# is consumed are not visible in this chunk - confirm against the search code.
DEFAULT_SEARCH_PHASES = {
    'initial_scan': {
        'candidates': 400,
        'semantic_threshold': 0.20,
        'keyword_threshold': 0.06,
        'description': 'Quick broad scan like human initial reading',
        'time_limit': 30,
        'focus_areas': ['regulation_type', 'enacting_body'],
        'enabled': True
    },
    'focused_review': {
        'candidates': 150,
        'semantic_threshold': 0.35,
        'keyword_threshold': 0.12,
        'description': 'Focused review of promising candidates',
        'time_limit': 45,
        'focus_areas': ['content', 'chapter', 'article'],
        'enabled': True
    },
    'deep_analysis': {
        'candidates': 60,
        'semantic_threshold': 0.45,
        'keyword_threshold': 0.18,
        'description': 'Deep contextual analysis like careful reading',
        'time_limit': 60,
        'focus_areas': ['kg_entities', 'cross_references'],
        'enabled': True
    },
    'verification': {
        'candidates': 30,
        'semantic_threshold': 0.55,
        'keyword_threshold': 0.22,
        'description': 'Final verification and cross-checking',
        'time_limit': 30,
        'focus_areas': ['authority_score', 'temporal_score'],
        'enabled': True
    },
    # The only phase that is switched off by default.
    'expert_review': {
        'candidates': 45,
        'semantic_threshold': 0.50,
        'keyword_threshold': 0.20,
        'description': 'Expert specialist review for complex cases',
        'time_limit': 40,
        'focus_areas': ['legal_richness', 'completeness_score'],
        'enabled': False
    }
}

# Research team personas (unchanged)
# Simulated research-team members, keyed by persona id.
#
# Every persona has the same fields: a display name, descriptive attributes
# (experience_years, specialties, approach, strengths, weaknesses,
# bias_towards), a 'search_style' weight table, the phase ids it prefers
# (ids from DEFAULT_SEARCH_PHASES), and two numeric modifiers.
#
# The four 'search_style' weights of each persona sum to 1.0.
# NOTE(review): how 'speed_multiplier' and 'accuracy_bonus' are applied is not
# visible in this chunk - confirm against the team-simulation code.
RESEARCH_TEAM_PERSONAS = {
    # Authority-weighted persona (authority_weight is its largest weight).
    'senior_legal_researcher': {
        'name': 'üë®‚Äç‚öñÔ∏è Senior Legal Researcher',
        'experience_years': 15,
        'specialties': ['constitutional_law', 'procedural_law', 'precedent_analysis'],
        'approach': 'systematic_thorough',
        'strengths': ['authority_analysis', 'hierarchy_understanding', 'precedent_matching'],
        'weaknesses': ['modern_technology', 'informal_language'],
        'bias_towards': 'established_precedents',
        'search_style': {
            'semantic_weight': 0.25,
            'authority_weight': 0.35,
            'kg_weight': 0.25,
            'temporal_weight': 0.15
        },
        'phases_preference': ['verification', 'deep_analysis'],
        'speed_multiplier': 0.8,
        'accuracy_bonus': 0.15
    },
    # Semantic-weighted persona (semantic_weight is its largest weight).
    'junior_legal_researcher': {
        'name': 'üë©‚Äç‚öñÔ∏è Junior Legal Researcher',
        'experience_years': 3,
        'specialties': ['research_methodology', 'digital_search', 'comprehensive_coverage'],
        'approach': 'broad_comprehensive',
        'strengths': ['semantic_search', 'keyword_matching', 'broad_coverage'],
        'weaknesses': ['authority_evaluation', 'precedent_weighting'],
        'bias_towards': 'comprehensive_results',
        'search_style': {
            'semantic_weight': 0.45,
            'authority_weight': 0.15,
            'kg_weight': 0.25,
            'temporal_weight': 0.15
        },
        'phases_preference': ['initial_scan', 'focused_review'],
        'speed_multiplier': 1.2,
        'accuracy_bonus': 0.0
    },
    # Knowledge-graph-weighted persona (kg_weight is half of the total).
    'specialist_researcher': {
        'name': 'üìö Knowledge Graph Specialist',
        'experience_years': 8,
        'specialties': ['knowledge_graphs', 'semantic_analysis', 'entity_relationships'],
        'approach': 'relationship_focused',
        'strengths': ['kg_analysis', 'entity_extraction', 'relationship_mapping'],
        'weaknesses': ['traditional_legal_hierarchy', 'formal_procedures'],
        'bias_towards': 'interconnected_concepts',
        'search_style': {
            'semantic_weight': 0.20,
            'authority_weight': 0.15,
            'kg_weight': 0.50,
            'temporal_weight': 0.15
        },
        'phases_preference': ['deep_analysis', 'expert_review'],
        'speed_multiplier': 0.9,
        'accuracy_bonus': 0.1
    },
    # Balanced persona: semantic and KG weights are equal.
    'procedural_expert': {
        'name': '‚öñÔ∏è Procedural Law Expert',
        'experience_years': 12,
        'specialties': ['procedural_law', 'administrative_law', 'process_analysis'],
        'approach': 'step_by_step_methodical',
        'strengths': ['procedure_analysis', 'step_identification', 'requirement_mapping'],
        'weaknesses': ['abstract_concepts', 'philosophical_law'],
        'bias_towards': 'clear_procedures',
        'search_style': {
            'semantic_weight': 0.30,
            'authority_weight': 0.25,
            'kg_weight': 0.30,
            'temporal_weight': 0.15
        },
        'phases_preference': ['focused_review', 'verification'],
        'speed_multiplier': 1.0,
        'accuracy_bonus': 0.08
    },
    # Critical-review persona; has the lowest speed_multiplier of the team.
    'devils_advocate': {
        'name': 'üîç Devil\'s Advocate Reviewer',
        'experience_years': 10,
        'specialties': ['critical_analysis', 'alternative_interpretations', 'edge_cases'],
        'approach': 'critical_challenging',
        'strengths': ['weakness_identification', 'alternative_perspectives', 'critical_thinking'],
        'weaknesses': ['positive_reinforcement', 'consensus_building'],
        'bias_towards': 'challenging_assumptions',
        'search_style': {
            'semantic_weight': 0.35,
            'authority_weight': 0.20,
            'kg_weight': 0.30,
            'temporal_weight': 0.15
        },
        'phases_preference': ['verification', 'expert_review'],
        'speed_multiplier': 0.7,
        'accuracy_bonus': 0.12
    }
}

# Query-specific team compositions (unchanged)
# Persona ids (keys of RESEARCH_TEAM_PERSONAS) assigned to each query type.
# The keys are the same query-type names used by QUERY_PATTERNS. Every
# specialised type lists three personas; 'general' lists four.
QUERY_TEAM_COMPOSITIONS = {
    'specific_article': ['senior_legal_researcher', 'specialist_researcher', 'devils_advocate'],
    'procedural': ['procedural_expert', 'junior_legal_researcher', 'senior_legal_researcher'],
    'definitional': ['senior_legal_researcher', 'specialist_researcher', 'junior_legal_researcher'],
    'sanctions': ['senior_legal_researcher', 'procedural_expert', 'devils_advocate'],
    'general': ['senior_legal_researcher', 'junior_legal_researcher', 'specialist_researcher', 'procedural_expert']
}

# Human priorities (unchanged)
# Default weight given to each of the seven ranking priorities.
# The weights sum to 1.0. QUERY_PATTERNS['general'] points at this very dict,
# so mutating it in place also changes the 'general' query pattern.
DEFAULT_HUMAN_PRIORITIES = {
    'authority_hierarchy': 0.20,
    'temporal_relevance': 0.18,
    'semantic_match': 0.18,
    'knowledge_graph': 0.15,
    'keyword_precision': 0.12,
    'legal_completeness': 0.09,
    'cross_validation': 0.08
}

# Query patterns (unchanged)
# Query types, each with the Indonesian indicator words/phrases that signal it
# and the priority weights to use for it.
#
# Each specialised type weights five of the seven priorities named in
# DEFAULT_HUMAN_PRIORITIES, and each weight table sums to 1.0.
# NOTE(review): how indicators are matched (substring vs. whole word) is not
# visible in this chunk - confirm against the query-analysis code.
QUERY_PATTERNS = {
    'specific_article': {
        'indicators': ['pasal', 'ayat', 'huruf', 'angka', 'butir'],
        'priority_weights': {'authority_hierarchy': 0.30, 'semantic_match': 0.25, 'knowledge_graph': 0.20, 'keyword_precision': 0.15, 'temporal_relevance': 0.10}
    },
    'procedural': {
        'indicators': ['prosedur', 'tata cara', 'persyaratan', 'cara', 'langkah'],
        'priority_weights': {'semantic_match': 0.25, 'knowledge_graph': 0.20, 'legal_completeness': 0.20, 'temporal_relevance': 0.20, 'authority_hierarchy': 0.15}
    },
    'definitional': {
        'indicators': ['definisi', 'pengertian', 'dimaksud dengan', 'adalah'],
        'priority_weights': {'authority_hierarchy': 0.35, 'semantic_match': 0.25, 'knowledge_graph': 0.15, 'keyword_precision': 0.15, 'temporal_relevance': 0.10}
    },
    'sanctions': {
        'indicators': ['sanksi', 'pidana', 'denda', 'hukuman', 'larangan'],
        'priority_weights': {'authority_hierarchy': 0.30, 'knowledge_graph': 0.25, 'keyword_precision': 0.20, 'temporal_relevance': 0.15, 'semantic_match': 0.10}
    },
    # No indicators of its own. The weights are the DEFAULT_HUMAN_PRIORITIES
    # object itself (not a copy), so the two stay in sync and must not be
    # mutated independently.
    'general': {
        'indicators': [],
        'priority_weights': DEFAULT_HUMAN_PRIORITIES
    }
}

# NEW: Enhanced KG weights for advanced features
# Weight per knowledge-graph signal, on a 0.3 - 1.0 scale: 'direct_match' is
# the strongest (1.0) and 'connectivity_boost' the weakest (0.3).
# NOTE(review): the scoring code that combines these weights is outside this
# chunk; the meaning of each signal is inferred from its key name only.
KG_WEIGHTS = {
    'direct_match': 1.0,
    'one_hop': 0.8,
    'two_hop': 0.6,
    'concept_cluster': 0.7,
    'hierarchy_boost': 0.5,
    'temporal_relevance': 0.4,
    'cross_reference': 0.6,
    'domain_match': 0.5,
    'legal_action_match': 0.7,
    'sanction_relevance': 0.8,
    'citation_impact': 0.4,
    'connectivity_boost': 0.3
}

# Indonesian stopwords (unchanged)
# Set of lower-case Indonesian function words (conjunctions, prepositions,
# auxiliaries). Stored as a set, so membership tests are O(1).
# NOTE(review): presumably used to drop non-informative tokens before keyword
# matching - the consumer is not visible in this chunk.
INDONESIAN_STOPWORDS = {
    'yang', 'dan', 'di', 'ke', 'dari', 'dalam', 'untuk', 'pada', 'dengan', 'adalah',
    'ini', 'itu', 'atau', 'jika', 'maka', 'akan', 'telah', 'dapat', 'harus', 'tidak',
    'ada', 'oleh', 'sebagai', 'karena', 'sehingga', 'bahwa', 'tentang', 'antara',
    'seperti', 'setelah', 'sebelum', 'sampai', 'hingga', 'namun', 'tetapi', 'juga'
}

# System prompt (unchanged)
# Indonesian-language system prompt. In summary it tells the model to:
#   - act as an AI assistant specialised in Indonesian law (consultation,
#     question answering, analysis grounded in the relevant regulations);
#   - think and answer in Indonesian, using a "<think> ... </think>" section
#     followed by a clear, concise, professional and, where needed, empathetic
#     final answer in accessible legal language;
#   - cite relevant Indonesian legal references and always recommend
#     consulting a legal expert for final decisions;
#   - use semantic relationships between legal concepts to enrich context.
# The text is sent to the model verbatim, so wording changes alter behaviour.
SYSTEM_PROMPT = '''Anda adalah asisten AI yang ahli di bidang hukum Indonesia. Anda dapat membantu konsultasi hukum, menjawab pertanyaan, dan memberikan analisis berdasarkan peraturan perundang-undangan yang relevan. Untuk setiap respons, Anda harus berfikir dan menjawab dengan Bahasa Indonesia, serta gunakan format: <think> ... </think> Tuliskan jawaban akhir secara jelas, ringkas, profesional, dan berempati jika diperlukan. Gunakan bahasa hukum yang mudah dipahami. Sertakan referensi hukum Indonesia yang relevan. Selalu rekomendasikan konsultasi dengan ahli hukum untuk keputusan final. Manfaatkan hubungan semantik antar konsep hukum untuk memberikan konteks yang lebih kaya.'''

# ============================================================================
# CONFIGURATION: Regulation Patterns (Easy to adjust)
# ============================================================================

# Indonesian regulation type patterns
# Canonical regulation-type key -> lower-case spellings and abbreviations
# that refer to it.
# NOTE(review): several aliases are very short ('uu', 'pp') and 'pres' is
# itself the tail of 'keppres'/'kepres' listed under 'keputusan_presiden';
# whether that causes false matches depends on the matching code, which is not
# visible in this chunk.
REGULATION_TYPE_PATTERNS = {
    'undang-undang': ['undang-undang', 'uu', 'undang undang'],
    'peraturan_pemerintah': ['peraturan pemerintah', 'pp', 'perpem'],
    'peraturan_presiden': ['peraturan presiden', 'perpres', 'pres'],
    'peraturan_menteri': ['peraturan menteri', 'permen', 'permenkeu', 'permendikbud'],
    'peraturan_daerah': ['peraturan daerah', 'perda', 'peraturan daerah provinsi', 'peraturan daerah kabupaten'],
    'keputusan_presiden': ['keputusan presiden', 'keppres', 'kepres'],
    'peraturan_gubernur': ['peraturan gubernur', 'pergub'],
    'peraturan_bupati': ['peraturan bupati', 'perbup'],
    'peraturan_walikota': ['peraturan walikota', 'perwali']
}

# Tokens that may sit between a regulation number and its year
# (e.g. "tahun", "th.", "/" or "-").
YEAR_SEPARATORS = ['tahun', 'th', 'th.', '/', '-']

# Phrases ("that regulation", "this law", ...) that point back at a
# regulation mentioned earlier in the conversation.
REGULATION_PRONOUNS = [
    'peraturan tersebut', 'peraturan ini', 'pp tersebut', 'pp ini',
    'uu tersebut', 'uu ini', 'regulasi tersebut', 'regulasi ini',
    'ketentuan tersebut', 'ketentuan ini', 'undang-undang tersebut',
    'undang-undang ini', 'perda tersebut', 'perda ini'
]

# Words and phrases that mark a follow-up question.
# NOTE(review): the last entries ('terus', 'kemudian', 'selanjutnya', 'dan',
# 'serta') are very common words; confirm the matcher does not treat every
# query containing "dan" as a follow-up.
FOLLOWUP_INDICATORS = [
    'apa yang diatur', 'mengatur apa', 'isi dari', 'membahas apa',
    'tentang apa', 'mengenai apa', 'berisi apa', 'materi apa',
    'ketentuan apa', 'pasal apa', 'bagaimana dengan', 'lalu bagaimana',
    'terus', 'kemudian', 'selanjutnya', 'dan', 'serta'
]

# Words and phrases that mark a clarification or disagreement with a
# previous answer.
CLARIFICATION_INDICATORS = [
    'tidak melihat', 'tidak ada', 'tidak menemukan', 'bukan tentang',
    'seharusnya', 'maksud saya', 'yang saya maksud', 'saya kira',
    'tetapi', 'namun', 'tapi', 'kok', 'kenapa', 'mengapa'
]

# Words that mark a question about what a regulation contains or governs.
CONTENT_QUERY_KEYWORDS = [
    'mengatur', 'diatur', 'pengaturan', 'ketentuan', 'isi', 'materi',
    'membahas', 'berisi', 'tentang', 'mengenai', 'menyangkut', 'terkait'
]

# =============================================================================
# GLOBAL VARIABLES
# =============================================================================

# Global variables for models and data
# Module-level placeholders for the loaded models, tokenizers and pipeline
# components. All start as None.
# NOTE(review): presumably filled in by an initialization routine defined
# elsewhere in the file - not visible in this chunk.
embedding_model = None
embedding_tokenizer = None
reranker_model = None
reranker_tokenizer = None
llm_model = None
llm_tokenizer = None
dataset_loader = None
search_engine = None
knowledge_graph = None
reranker = None
llm_generator = None
conversation_manager = None

# Model setup values, also unset until initialization.
# NOTE(review): token_true_id / token_false_id / prefix_tokens / suffix_tokens
# look like reranker prompt scaffolding - confirm against the reranker code.
device = None
EMBEDDING_DIM = None
token_false_id = None
token_true_id = None
prefix_tokens = None
suffix_tokens = None

# Progress tracking: a status/details dict, a completion flag, and a
# threading.Lock available to guard initialization.
current_progress = {"status": "Not started", "details": ""}
initialization_complete = False
initialization_lock = threading.Lock()

# =============================================================================
# MEMORY MANAGEMENT
# =============================================================================

def clear_cache():
    """Collect garbage, then release PyTorch's unused cached CUDA memory.

    The garbage collector runs first: objects that are only kept alive by
    reference cycles are freed by ``gc.collect()``, and only after that can
    ``torch.cuda.empty_cache()`` hand their now-unoccupied blocks back. In the
    opposite order those blocks would stay cached until the next call.

    This is a best-effort helper: any failure is reported with ``print`` and
    swallowed so that cleanup never interrupts the caller.

    Returns:
        None
    """
    try:
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    except Exception as e:
        print(f"Warning: Cache clear failed: {e}")


# =============================================================================
# EXPORT CONVERSATION FUNCTIONS
# =============================================================================

def _search_meta_number(value: Any) -> float:
    """Return *value* as a float, mapping None and non-numeric values to 0.0."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0


def _search_meta_document_lines(idx: int, candidate: Dict, include_scores: bool) -> List[str]:
    """Build the markdown lines that describe one retrieved candidate document."""
    record = candidate.get('record') or {}

    # Basic info
    reg_type = record.get('regulation_type', 'N/A')
    reg_num = record.get('regulation_number', 'N/A')
    year = record.get('year', 'N/A')
    about = record.get('about', 'N/A')
    # A stored None (or a non-string value) must not break the slice below.
    about = 'N/A' if about is None else str(about)

    lines = [
        f"**{idx}. {reg_type} No. {reg_num}/{year}**",
        f"   - About: {about[:100]}{'...' if len(about) > 100 else ''}",
    ]

    if include_scores:
        composite_score = _search_meta_number(candidate.get('composite_score'))
        semantic_score = _search_meta_number(candidate.get('semantic_score'))
        keyword_score = _search_meta_number(candidate.get('keyword_score'))
        kg_score = _search_meta_number(candidate.get('kg_score'))

        lines.append(f"   - **Scores:** Composite: {composite_score:.4f} | Semantic: {semantic_score:.4f} | Keyword: {keyword_score:.4f} | KG: {kg_score:.4f}")

        # KG metadata is only shown for documents that received a KG score.
        if kg_score > 0:
            kg_domain = record.get('kg_primary_domain', '')
            kg_hierarchy = record.get('kg_hierarchy_level', 0)
            kg_authority = _search_meta_number(record.get('kg_authority_score'))

            lines.append(f"   - **KG Metadata:** Domain: {kg_domain} | Hierarchy: {kg_hierarchy} | Authority: {kg_authority:.3f}")

        # Team consensus
        if candidate.get('team_consensus'):
            agreement = candidate.get('researcher_agreement', 0)
            lines.append(f"   - **Team Consensus:** ‚úÖ Yes ({agreement} researchers)")

        # Researcher bias
        if candidate.get('researcher_bias_applied'):
            lines.append(f"   - **Researcher:** {candidate['researcher_bias_applied']}")

    # Content snippet (first 200 characters)
    content = record.get('content', '')
    if content:
        content = str(content)
        snippet = content[:200] + "..." if len(content) > 200 else content
        lines.append(f"   - **Content:** {snippet}")

    lines.append("")
    return lines


def format_complete_search_metadata(rag_result: Dict, include_scores: bool = True) -> str:
    """
    Format complete search metadata including ALL documents retrieved.

    Entries of ``rag_result['all_retrieved_metadata']`` are grouped by their
    ``'phase'`` value. The five standard phases are rendered first, in their
    fixed order; any other phase name (including the ``'unknown'`` fallback)
    is rendered after them in first-seen order, so every counted phase and
    document also appears in the listing and in the totals.

    Missing or ``None`` values for ``about``, the scores, ``confidence`` and
    ``candidates`` are treated as ``'N/A'``, ``0`` and an empty list
    respectively instead of raising.

    Args:
        rag_result: Complete RAG result with all_retrieved_metadata
        include_scores: Include detailed scoring information

    Returns:
        Formatted string with all search results. When there is nothing to
        show, the string ``"No search metadata available."``. If formatting
        fails as a whole, a string carrying the error message and traceback
        (this function does not raise).
    """
    try:
        if not rag_result or not rag_result.get('all_retrieved_metadata'):
            return "No search metadata available."

        output = [
            "# üìä COMPLETE SEARCH RESULTS METADATA",
            "=" * 80,
            "",
        ]

        all_metadata = rag_result['all_retrieved_metadata']

        # Group by phase
        phase_order = ['initial_scan', 'focused_review', 'deep_analysis', 'verification', 'expert_review']
        phase_groups = {}

        for phase_data in all_metadata.values():
            phase_name = str(phase_data.get('phase') or 'unknown')
            phase_groups.setdefault(phase_name, []).append(phase_data)

        # Known phases first, then whatever else was recorded, so nothing
        # that is counted in the summary is missing from the listing.
        ordered_phases = [name for name in phase_order if name in phase_groups]
        ordered_phases.extend(name for name in phase_groups if name not in phase_order)

        total_docs = 0

        # Process each phase
        for phase_name in ordered_phases:
            output.append(f"\n## üîç PHASE: {phase_name.upper()}")
            output.append("-" * 80)

            for phase_data in phase_groups[phase_name]:
                researcher_name = phase_data.get('researcher_name', 'Unknown Researcher')
                researcher_id = phase_data.get('researcher', 'unknown')
                candidates = phase_data.get('candidates') or []
                confidence = _search_meta_number(phase_data.get('confidence'))

                output.append(f"\n### Researcher: {researcher_name}")
                output.append(f"- **ID:** `{researcher_id}`")
                output.append(f"- **Documents Found:** {len(candidates)}")
                output.append(f"- **Confidence:** {confidence:.2%}")
                output.append("")

                if candidates:
                    output.append(f"#### Retrieved Documents ({len(candidates)} total):")
                    output.append("")

                    # Show all documents; one malformed entry must not hide the rest.
                    for idx, candidate in enumerate(candidates, 1):
                        try:
                            output.extend(_search_meta_document_lines(idx, candidate, include_scores))
                        except Exception as e:
                            output.append(f"   Error formatting document {idx}: {e}")
                            output.append("")

                    total_docs += len(candidates)

                output.append("")

        unique_researchers = {
            phase_data.get('researcher', 'unknown')
            for entries in phase_groups.values()
            for phase_data in entries
        }

        # Summary
        output.append("\n## üìà SEARCH SUMMARY")
        output.append("=" * 80)
        output.append(f"- **Total Documents Retrieved:** {total_docs}")
        output.append(f"- **Phases Executed:** {len(phase_groups)}")
        output.append(f"- **Unique Researchers:** {len(unique_researchers)}")

        return "\n".join(output)

    except Exception as e:
        # Top-level boundary: the caller gets a readable report, never an exception.
        import traceback
        return f"Error formatting search metadata: {str(e)}\n\n{traceback.format_exc()}"


def _md_export_number(value: Any) -> float:
    """Return *value* as a float, mapping None and non-numeric values to 0.0."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0


def _md_export_source_lines(position: int, source: Dict) -> List[str]:
    """Build the markdown lines for one entry of an exchange's legal references."""
    regulation = f"{source.get('regulation_type', 'N/A')} No. {source.get('regulation_number', 'N/A')}/{source.get('year', 'N/A')}"
    lines = [
        f"\n**{position}. {regulation}**",
        f"   - About: {source.get('about', 'N/A')}",
        f"   - Enacting Body: {source.get('enacting_body', 'N/A')}",
    ]

    # Optional details are only printed when present and truthy.
    if source.get('final_score'):
        lines.append(f"   - Score: {_md_export_number(source.get('final_score')):.4f}")
    if source.get('kg_score'):
        lines.append(f"   - KG Score: {_md_export_number(source.get('kg_score')):.4f}")
    if source.get('kg_primary_domain'):
        lines.append(f"   - Domain: {source.get('kg_primary_domain')}")
    if source.get('team_consensus'):
        lines.append("   - Team Consensus: ‚úì Yes")

    content = source.get('content', '')
    if content:
        content = str(content)
        snippet = content[:200] + "..." if len(content) > 200 else content
        lines.append(f"   - Content: {snippet}")

    return lines


def _md_export_research_lines(research_log: Dict) -> List[str]:
    """Build the "Research Process Details" section for one exchange."""
    lines = [
        "### üîç Research Process Details",
        "-" * 80,
        f"- **Team Members:** {len(research_log.get('team_members') or [])}",
        f"- **Total Documents Retrieved:** {research_log.get('total_documents_retrieved', 0)}",
    ]

    phase_results = research_log.get('phase_results', {})
    if phase_results:
        lines.append(f"- **Phases Executed:** {len(phase_results)}")

        # Detailed phase breakdown
        phase_order = ['initial_scan', 'focused_review', 'deep_analysis', 'verification', 'expert_review']
        phase_groups = {}

        for phase_key, phase_data in phase_results.items():
            key_text = str(phase_key)
            # A key that contains a standard phase name belongs to that phase;
            # otherwise fall back to the text after the first underscore.
            phase_name = next((base for base in phase_order if base in key_text), None)
            if phase_name is None:
                phase_name = key_text.split('_', 1)[-1]
            phase_groups.setdefault(phase_name, []).append(phase_data)

        # Standard phases first, then any others, so that every phase counted
        # in "Phases Executed" is also listed.
        ordered_phases = [name for name in phase_order if name in phase_groups]
        ordered_phases.extend(name for name in phase_groups if name not in phase_order)

        total_retrieved = 0

        for phase_name in ordered_phases:
            lines.append(f"\n#### üìÇ PHASE: {phase_name.upper()}")

            for phase_data in phase_groups[phase_name]:
                researcher_name = phase_data.get('researcher_name', 'Unknown')
                candidates = phase_data.get('candidates') or []
                confidence = _md_export_number(phase_data.get('confidence'))
                total_retrieved += len(candidates)

                lines.append(f"\n**{researcher_name}:** {len(candidates)} documents (Confidence: {confidence:.2%})")

                # Show top 5 documents per researcher
                for idx_doc, candidate in enumerate(candidates[:5], 1):
                    if not isinstance(candidate, dict):
                        # Keep the numbering honest instead of silently skipping.
                        lines.append(f"   {idx_doc}. (unreadable document entry)")
                        continue
                    record = candidate.get('record') or {}
                    reg_info = f"{record.get('regulation_type', 'N/A')} No. {record.get('regulation_number', 'N/A')}/{record.get('year', 'N/A')}"
                    composite = _md_export_number(candidate.get('composite_score'))
                    kg_score = _md_export_number(candidate.get('kg_score'))

                    lines.append(f"   {idx_doc}. {reg_info} (Score: {composite:.3f}, KG: {kg_score:.3f})")

                if len(candidates) > 5:
                    lines.append(f"   ... and {len(candidates) - 5} more documents")

        lines.append(f"\n**Total Documents Retrieved:** {total_retrieved}")

    lines.append("")
    return lines


def export_conversation_to_markdown(conversation_history: List[Dict], include_metadata: bool = True, include_research_process: bool = True) -> str:
    """Export a conversation as a markdown document.

    For every exchange the export contains the question, the query type and
    up to five key entities, the thinking process, the answer, the legal
    references used and, optionally, a per-phase breakdown of the research
    log.

    Query entities are converted with ``str()`` before joining, matching the
    JSON exporter, so a non-string entity no longer aborts the whole export.
    Research phases whose key matches none of the five standard phase names
    are listed after the standard ones instead of being dropped.

    Args:
        conversation_history: Exchange dicts. Keys read here: 'query',
            'query_type', 'query_entities', 'thinking', 'response',
            'sources_used' and 'research_log'.
        include_metadata: Together with ``include_research_process``, enables
            the research-log section.
        include_research_process: Enables the thinking-process section and,
            together with ``include_metadata``, the research-log section.

    Returns:
        The markdown text. If the export fails as a whole, a markdown
        document containing the traceback (this function does not raise).
    """
    try:
        md_parts = [
            "# Legal Consultation Export",
            f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            "=" * 80,
            "",
        ]

        for idx, entry in enumerate(conversation_history, 1):
            md_parts.append(f"\n## Exchange {idx}")
            md_parts.append("-" * 80)

            # Query
            query = entry.get('query', '')
            md_parts.append(f"\n**Question:** {query}\n")

            # Query metadata
            query_type = entry.get('query_type', 'general')
            query_entities = entry.get('query_entities') or []
            if query_type:
                md_parts.append(f"**Query Type:** {query_type}")
                if query_entities:
                    # str() keeps one odd entity from failing the whole export.
                    shown = ', '.join(str(entity) for entity in list(query_entities)[:5])
                    md_parts.append(f"**Key Entities:** {shown}")
                md_parts.append("")

            # Thinking process
            thinking = entry.get('thinking', '')
            if thinking and include_research_process:
                md_parts.append("### üß† Thinking Process")
                md_parts.append("-" * 40)
                thinking_clean = str(thinking).strip()
                if thinking_clean:
                    md_parts.append(thinking_clean)
                md_parts.append("")

            # Response
            response = entry.get('response', '')
            if response:
                md_parts.append("### ‚úÖ Answer")
                md_parts.append("-" * 40)
                md_parts.append(response)
                md_parts.append("")

            # Legal References
            sources_used = entry.get('sources_used', [])
            if sources_used:
                md_parts.append(f"### üìö Legal References ({len(sources_used)} documents)")
                md_parts.append("-" * 80)

                for i, source in enumerate(sources_used, 1):
                    try:
                        md_parts.extend(_md_export_source_lines(i, source))
                    except Exception as e:
                        # Record the failure in the export itself so a missing
                        # reference is visible to the reader.
                        md_parts.append(f"   Error formatting source {i}: {e}")

                md_parts.append("")

            # Research process details with complete metadata
            if include_research_process and include_metadata and entry.get('research_log'):
                md_parts.extend(_md_export_research_lines(entry['research_log']))

        md_parts.append("\n" + "=" * 80)
        md_parts.append("\n*Export completed successfully*")

        return "\n".join(md_parts)

    except Exception:
        # Top-level boundary: return the traceback as a document, never raise.
        import traceback
        error_details = traceback.format_exc()
        return f"# Error Generating Markdown Export\n\n```\n{error_details}\n```"


def _json_export_float(value: Any, default: float = 0.0) -> float:
    """Coerce *value* to a finite float, returning *default* when impossible.

    None, non-numeric strings, NaN and +/-inf all map to *default* so that a
    single bad score neither drops the surrounding record nor produces the
    non-standard ``NaN``/``Infinity`` tokens in the JSON output.
    """
    import math

    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return default
    return number if math.isfinite(number) else default


def _json_export_int(value: Any, default: int = 0) -> int:
    """Coerce *value* to int, returning *default* for None/NaN/unparseable input."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return default


def _json_export_source(source: Dict, include_full_content: bool) -> Dict:
    """Build the JSON-ready description of one entry of ``sources_used``."""
    source_info = {
        "regulation": {
            "type": str(source.get('regulation_type', '')),
            "number": str(source.get('regulation_number', '')),
            "year": str(source.get('year', '')),
            "citation": f"{source.get('regulation_type', '')} No. {source.get('regulation_number', '')}/{source.get('year', '')}"
        },
        "about": str(source.get('about', ''))[:500],  # Limit length
        "enacting_body": str(source.get('enacting_body', '')),
        "scores": {
            "final": _json_export_float(source.get('final_score', 0)),
            "kg": _json_export_float(source.get('kg_score', 0))
        },
        "kg_metadata": {
            "primary_domain": str(source.get('kg_primary_domain', '')),
            "hierarchy_level": _json_export_int(source.get('kg_hierarchy_level', 0)),
            "team_consensus": bool(source.get('team_consensus', False))
        }
    }

    if include_full_content:
        # Content is capped at 1000 characters to keep the export compact.
        source_info["content"] = str(source.get('content', ''))[:1000]

    return source_info


def _json_export_candidate(candidate: Dict, include_full_content: bool) -> Dict:
    """Build the JSON-ready description of one research candidate document."""
    record = candidate.get('record', {})

    doc_data = {
        "regulation": {
            "type": str(record.get('regulation_type', '')),
            "number": str(record.get('regulation_number', '')),
            "year": str(record.get('year', '')),
        },
        "scores": {
            "composite": _json_export_float(candidate.get('composite_score', 0)),
            "semantic": _json_export_float(candidate.get('semantic_score', 0)),
            "keyword": _json_export_float(candidate.get('keyword_score', 0)),
            "kg": _json_export_float(candidate.get('kg_score', 0))
        },
        "team_consensus": bool(candidate.get('team_consensus', False))
    }

    if include_full_content:
        # Candidate content is capped at 500 characters.
        doc_data["content"] = str(record.get('content', ''))[:500]

    return doc_data


def _json_export_research(phase_results: Dict, metadata: Dict, include_full_content: bool) -> None:
    """Fill ``metadata['phases']`` and the retrieved-document total in place."""
    phases = metadata["phases"]

    for phase_key, phase_data in phase_results.items():
        # The phase name is whatever follows the first underscore of the key.
        phase_name = str(phase_key.split('_', 1)[-1] if '_' in phase_key else phase_key)
        bucket = phases.setdefault(phase_name, {"researchers": [], "total_documents": 0})

        # A present-but-None candidate list is treated as empty.
        candidates = phase_data.get('candidates') or []

        researcher_data = {
            "researcher_id": str(phase_data.get('researcher', 'unknown')),
            "researcher_name": str(phase_data.get('researcher_name', 'Unknown')),
            "documents_found": len(candidates),
            "confidence": _json_export_float(phase_data.get('confidence', 0)),
            "documents": []
        }

        # Only the first ten candidates are exported per researcher.
        for candidate in candidates[:10]:
            try:
                researcher_data["documents"].append(
                    _json_export_candidate(candidate, include_full_content)
                )
            except Exception as e:
                print(f"Error processing document: {e}")

        bucket["researchers"].append(researcher_data)
        bucket["total_documents"] += len(candidates)

    metadata["total_documents_retrieved"] = sum(
        phase["total_documents"] for phase in phases.values()
    )


def export_conversation_to_json(conversation_history: List[Dict], include_full_content: bool = True) -> str:
    """Serialize a conversation history to a pretty-printed JSON string.

    Args:
        conversation_history: One dict per exchange. The keys read here are
            ``query``, ``query_type``, ``query_entities``, ``thinking``,
            ``response``, ``sources_used`` and ``research_log``.
        include_full_content: When True, truncated document text is included
            (1000 characters for sources, 500 for research candidates).

    Returns:
        A JSON document with ``export_info`` and ``conversation`` keys. An
        exchange that cannot be processed is replaced by a placeholder carrying
        an ``error`` field; if the export as a whole fails, a JSON object with
        ``error``, ``message`` and ``traceback`` is returned instead. The
        function never raises.

    Missing, None or non-finite numeric fields are exported as 0 rather than
    causing the surrounding source/document to be dropped, and None-valued
    list fields are treated as empty.
    """
    try:
        export_data = {
            "export_info": {
                "timestamp": datetime.now().isoformat(),
                "total_exchanges": len(conversation_history),
                "version": "2.1-fixed"
            },
            "conversation": []
        }

        for idx, entry in enumerate(conversation_history, 1):
            try:
                sources = entry.get('sources_used') or []
                research_log = entry.get('research_log') or {}

                exchange = {
                    "exchange_number": idx,
                    "query": str(entry.get('query', '')),
                    "query_type": str(entry.get('query_type', 'general')),
                    "query_entities": [str(e) for e in (entry.get('query_entities') or [])],
                    "response": {
                        "thinking": str(entry.get('thinking', '')),
                        "answer": str(entry.get('response', ''))
                    },
                    "legal_references": {
                        "count": len(sources),
                        "sources": []
                    },
                    "research_metadata": {
                        "available": bool(entry.get('research_log')),
                        "total_documents_retrieved": 0,
                        "phases": {}
                    }
                }

                # Add legal references; a malformed source is skipped, not fatal.
                for source in sources:
                    try:
                        exchange["legal_references"]["sources"].append(
                            _json_export_source(source, include_full_content)
                        )
                    except Exception as e:
                        print(f"Error processing source: {e}")

                # Add research metadata; on failure keep whatever was collected.
                phase_results = research_log.get('phase_results')
                if phase_results:
                    try:
                        _json_export_research(
                            phase_results,
                            exchange["research_metadata"],
                            include_full_content,
                        )
                    except Exception as e:
                        print(f"Error processing research metadata: {e}")

                export_data["conversation"].append(exchange)

            except Exception as e:
                print(f"Error processing exchange {idx}: {e}")
                # The entry may not even be a dict, so guard the lookup: the
                # placeholder itself must never raise.
                fallback_query = 'Error processing query'
                if isinstance(entry, dict):
                    fallback_query = entry.get('query', fallback_query)
                export_data["conversation"].append({
                    "exchange_number": idx,
                    "error": str(e),
                    "query": str(fallback_query)
                })

        # default=str stringifies any remaining non-JSON-native values.
        return json.dumps(export_data, indent=2, ensure_ascii=False, default=str)

    except Exception as e:
        # Return valid JSON even when the export as a whole fails.
        import traceback
        error_details = traceback.format_exc()
        return json.dumps({
            "error": "Export failed",
            "message": str(e),
            "traceback": error_details
        }, indent=2, ensure_ascii=False)


def export_conversation_to_html(conversation_history: List[Dict], include_metadata: bool = True) -> str:
    """Render a conversation history as a standalone, styled HTML document.

    Args:
        conversation_history: One dict per exchange. The keys read here are
            ``query``, ``thinking``, ``response``, ``sources_used`` and
            ``research_log``.
        include_metadata: When True, the per-phase research results found in
            ``research_log['phase_results']`` are rendered as nested
            collapsible sections.

    Returns:
        The complete HTML page as a string. If rendering fails, a minimal
        HTML error page containing the traceback is returned instead; the
        function never raises.

    All plain-text values (query, thinking, regulation fields, researcher
    names, ...) are HTML-escaped before being inserted. The answer is
    converted from Markdown (tables, fenced code, newline-to-<br>) and is
    inserted as produced by the ``markdown`` package.
    """
    import html
    import traceback

    def esc(value):
        """Return *value* as text that is safe inside HTML element content."""
        return html.escape(str(value))

    try:
        html_content = []
        
        # Enhanced CSS with table support
        html_content.append("""<!DOCTYPE html>
<html lang="id">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Legal Consultation Export</title>
    <style>
        * { box-sizing: border-box; }
        
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            max-width: 1200px;
            margin: 0 auto;
            padding: 30px 20px;
            background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
            line-height: 1.8;
            color: #2c3e50;
        }
        
        .container {
            background: white;
            padding: 30px;
            border-radius: 12px;
            box-shadow: 0 4px 6px rgba(0,0,0,0.1);
        }
        
        h1 { color: #1a237e; border-bottom: 4px solid #3f51b5; padding-bottom: 15px; margin-bottom: 30px; }
        h2 { color: #283593; margin-top: 40px; padding: 15px; background: linear-gradient(to right, #e8eaf6, transparent); border-left: 5px solid #3f51b5; }
        h3 { color: #3949ab; margin-top: 25px; padding-left: 10px; border-left: 3px solid #5c6bc0; }
        
        .exchange {
            background: #f8f9fa;
            padding: 25px;
            margin: 30px 0;
            border-radius: 12px;
            border-left: 4px solid #3f51b5;
        }
        
        .question {
            background: #fff3e0;
            padding: 15px;
            border-radius: 8px;
            margin: 15px 0;
            border-left: 4px solid #ff9800;
        }
        
        /* *** FIXED: Thinking process styling *** */
        .thinking {
            background: #e3f2fd;
            padding: 20px;
            border-radius: 8px;
            margin: 20px 0;
            border-left: 4px solid #2196f3;
            font-family: 'Courier New', monospace;
            white-space: pre-wrap;
        }
        
        .thinking h4 {
            margin-top: 0;
            color: #1976d2;
        }
        
        .answer {
            background: #f0f7ff;
            padding: 20px;
            border-radius: 8px;
            margin: 20px 0;
            border-left: 4px solid #4caf50;
        }
        
        /* *** FIXED: Table styling *** */
        table {
            width: 100%;
            border-collapse: collapse;
            margin: 20px 0;
            background: white;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        
        table thead {
            background: #3f51b5;
            color: white;
        }
        
        table th, table td {
            padding: 12px 15px;
            text-align: left;
            border-bottom: 1px solid #e0e0e0;
        }
        
        table tbody tr:hover {
            background: #f5f5f5;
        }
        
        table th {
            font-weight: 600;
            text-transform: uppercase;
            font-size: 0.9em;
            letter-spacing: 0.5px;
        }
        
        /* Collapsible details */
        details {
            background: #f5f5f5;
            padding: 15px;
            border-radius: 8px;
            margin: 15px 0;
            border: 1px solid #e0e0e0;
        }
        
        details summary {
            font-weight: 600;
            color: #1565c0;
            cursor: pointer;
            user-select: none;
        }
        
        details[open] summary {
            margin-bottom: 15px;
            border-bottom: 2px solid #e0e0e0;
            padding-bottom: 10px;
        }
        
        .doc-item {
            background: #fafafa;
            padding: 12px;
            margin: 10px 0;
            border-radius: 6px;
            border-left: 3px solid #1976d2;
        }
        
        .score-badge {
            display: inline-block;
            background: #4caf50;
            color: white;
            padding: 3px 10px;
            border-radius: 12px;
            font-size: 0.85em;
            margin-right: 5px;
            font-weight: 600;
        }
        
        code {
            background-color: #f5f5f5;
            padding: 3px 8px;
            border-radius: 4px;
            font-family: 'Courier New', monospace;
            color: #d32f2f;
        }
        
        @media print {
            body { background: white; }
            .exchange { page-break-inside: avoid; }
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>üìã Legal Consultation Export</h1>
        <p><strong>Generated:</strong> """ + datetime.now().strftime('%Y-%m-%d %H:%M:%S') + """</p>
        <hr>
""")
        
        # Process each exchange
        for idx, entry in enumerate(conversation_history, 1):
            html_content.append('<div class="exchange">')
            html_content.append(f'<h2>Exchange {idx}</h2>')
            
            # Query is user-supplied text, so it must be escaped.
            query = entry.get('query', '')
            html_content.append(f'<div class="question"><strong>Question:</strong> {esc(query)}</div>')
            
            # Thinking process, shown verbatim (escaped, whitespace preserved by CSS)
            thinking = entry.get('thinking', '')
            if thinking:
                html_content.append('<details open>')
                html_content.append('<summary>üß† Thinking Process</summary>')
                html_content.append(f'<div class="thinking">{esc(thinking)}</div>')
                html_content.append('</details>')
            
            # Response - convert markdown to HTML including tables
            response = entry.get('response', '')
            if response:
                response_html = markdown.markdown(
                    response, 
                    extensions=['tables', 'fenced_code', 'nl2br']
                )
                html_content.append(f'<div class="answer"><strong>Answer:</strong><br>{response_html}</div>')
            
            # Legal References
            sources_used = entry.get('sources_used', [])
            if sources_used:
                html_content.append('<details>')
                html_content.append(f'<summary>üìö Legal References ({len(sources_used)} documents)</summary>')
                
                for i, source in enumerate(sources_used, 1):
                    # Build the fragment separately and commit it only when it
                    # is complete, so a bad source never leaves an open <div>.
                    try:
                        regulation = f"{source.get('regulation_type', 'N/A')} No. {source.get('regulation_number', 'N/A')}/{source.get('year', 'N/A')}"
                        item = ['<div class="doc-item">']
                        item.append(f'<strong>{i}. {esc(regulation)}</strong>')
                        
                        if source.get('about'):
                            item.append(f'<br><em>{esc(source.get("about"))}</em>')
                        
                        scores_html = ''
                        if source.get('final_score'):
                            scores_html += f'<span class="score-badge">Score: {source.get("final_score", 0):.4f}</span>'
                        if source.get('kg_score'):
                            scores_html += f'<span class="score-badge">KG: {source.get("kg_score", 0):.4f}</span>'
                        if scores_html:
                            item.append(f'<br>{scores_html}')
                        
                        item.append('</div>')
                    except Exception:
                        continue
                    html_content.extend(item)
                
                html_content.append('</details>')

            # All search results metadata - DETAILED with COLLAPSIBLE PHASES
            if include_metadata and entry.get('research_log') and entry['research_log'].get('phase_results'):
                html_content.append('<details>')
                html_content.append('<summary>üîç Complete Search Results (All Retrieved Documents)</summary>')
                
                phase_results = entry['research_log']['phase_results']
                phase_order = ['initial_scan', 'focused_review', 'deep_analysis', 'verification', 'expert_review']
                phase_groups = {}
                
                for phase_key, phase_data in phase_results.items():
                    # Default: text after the first underscore; overridden by
                    # the first known phase name contained in the key.
                    phase_name = phase_key.split('_', 1)[-1] if '_' in phase_key else phase_key
                    for base_phase in phase_order:
                        if base_phase in phase_key:
                            phase_name = base_phase
                            break
                    
                    phase_groups.setdefault(phase_name, []).append((phase_key, phase_data))
                
                total_retrieved = 0
                
                # NOTE: only phases listed in phase_order are rendered; groups
                # with any other name are collected above but not displayed.
                for phase_name in phase_order:
                    if phase_name not in phase_groups:
                        continue
                    
                    html_content.append('<div class="phase-section">')
                    html_content.append(f'<div class="phase-title">üîç PHASE: {phase_name.upper()}</div>')
                    
                    phase_entries = phase_groups[phase_name]
                    total_retrieved += sum(len(data.get('candidates', [])) for _, data in phase_entries)
                    
                    for phase_key, phase_data in phase_entries:
                        researcher_name = phase_data.get('researcher_name', 'Unknown')
                        candidates = phase_data.get('candidates', [])
                        confidence = phase_data.get('confidence', 0)
                        
                        html_content.append('<details>')
                        html_content.append(f'<summary>{esc(researcher_name)}: {len(candidates)} documents (Confidence: {confidence:.2%})</summary>')
                        
                        html_content.append('<div class="researcher-section">')
                        
                        # Display ALL documents with complete details
                        for idx_doc, candidate in enumerate(candidates, 1):
                            # Same build-then-commit approach as for sources.
                            try:
                                record = candidate.get('record', {})
                                reg_info = f"{record.get('regulation_type', 'N/A')} No. {record.get('regulation_number', 'N/A')}/{record.get('year', 'N/A')}"
                                about = record.get('about', 'N/A')
                                composite = candidate.get('composite_score', 0)
                                kg_score = candidate.get('kg_score', 0)
                                
                                doc_html = ['<div class="document-item">']
                                
                                # Main document line with scores
                                score_badges = f'<span class="score-badge">{composite:.3f}</span>'
                                score_badges += f'<span class="score-badge">KG: {kg_score:.3f}</span>'
                                
                                doc_html.append(f'<div class="doc-title"><span class="doc-number">{idx_doc}.</span> {esc(reg_info)}</div>')
                                doc_html.append(f'<div class="doc-detail">{score_badges}</div>')
                                
                                # Additional details (truncate first, then escape)
                                if about:
                                    snippet = about[:100] + "..." if len(about) > 100 else about
                                    doc_html.append(f'<div class="doc-detail"><strong>About:</strong> {esc(snippet)}</div>')
                                
                                if record.get('enacting_body'):
                                    doc_html.append(f'<div class="doc-detail"><strong>Body:</strong> {esc(record.get("enacting_body"))}</div>')
                                
                                # KG metadata
                                kg_details = []
                                if record.get('kg_primary_domain'):
                                    kg_details.append(f"Domain: {record.get('kg_primary_domain')}")
                                if record.get('kg_hierarchy_level'):
                                    kg_details.append(f"Hierarchy: Level {record.get('kg_hierarchy_level')}")
                                if kg_details:
                                    doc_html.append(f'<div class="doc-detail"><strong>KG:</strong> {esc(", ".join(kg_details))}</div>')
                                
                                # Team consensus
                                if candidate.get('team_consensus'):
                                    doc_html.append(f'<div class="doc-detail"><span class="consensus-badge">‚úì Team Consensus ({esc(candidate.get("researcher_agreement", 0))} researchers)</span></div>')
                                
                                doc_html.append('</div>')
                            except Exception as e:
                                print(f"Error processing document {idx_doc}: {e}")
                                continue
                            html_content.extend(doc_html)
                        
                        html_content.append('</div>')
                        html_content.append('</details>')
                    
                    html_content.append('</div>')
                
                html_content.append(f'<p><strong>Total Documents Retrieved:</strong> {total_retrieved}</p>')
                html_content.append('</details>')
            
            html_content.append('</div>')
        
        html_content.append("""
        <div style="text-align: center; margin-top: 50px; padding-top: 20px; border-top: 2px solid #e0e0e0; color: #757575;">
            <p><strong>Generated by Enhanced KG Indonesian Legal RAG System</strong></p>
        </div>
    </div>
</body>
</html>
""")
        
        return "\n".join(html_content)
        
    except Exception:
        # Tracebacks contain '<' and '>' (e.g. "<module>"), so escape them too.
        error_details = html.escape(traceback.format_exc())
        return f"""<!DOCTYPE html>
<html>
<head><meta charset="UTF-8"><title>Export Error</title></head>
<body>
    <h1>Error Generating HTML Export</h1>
    <pre>{error_details}</pre>
</body>
</html>"""

def validate_config(config):
    """Check a configuration dict and report blocking issues and warnings.

    Returns a dict with three keys: ``valid`` (True when no issues were
    found), ``issues`` (list of blocking problems) and ``warnings`` (list of
    non-blocking advisories). Any exception raised while inspecting the
    configuration is recorded as an issue instead of propagating.
    """
    problems = []
    advisories = []

    def out_of_bounds(value, low, high=None):
        # high=None means the setting only has a lower bound.
        if value < low:
            return True
        return high is not None and value > high

    def run_checks(table):
        # Each row: (key, default, low, high, destination list, message).
        for key, fallback, low, high, sink, text in table:
            if out_of_bounds(config.get(key, fallback), low, high):
                sink.append(text)

    try:
        # Basic and team settings
        run_checks([
            ('final_top_k', 0, 1, None, problems,
             "final_top_k must be >= 1"),
            ('temperature', 0, 0, 2, problems,
             "temperature must be between 0 and 2"),
            ('max_new_tokens', 0, 128, None, problems,
             "max_new_tokens must be >= 128"),
            ('research_team_size', 0, 1, 5, problems,
             "research_team_size must be between 1 and 5"),
            ('consensus_threshold', 0, 0.3, 0.9, advisories,
             "consensus_threshold outside recommended range (0.3-0.9)"),
        ])

        # Search phases: only enabled phases are inspected
        phases = config.get('search_phases', {})
        if not phases:
            problems.append("search_phases configuration missing")
        else:
            active = 0
            for name, settings in phases.items():
                if not settings.get('enabled', False):
                    continue
                active += 1

                limit = settings.get('candidates', 0)
                if limit < 10:
                    problems.append(f"{name}: candidates must be >= 10")
                elif limit > 1000:
                    advisories.append(f"{name}: high candidate count ({limit}) may impact performance")

                if out_of_bounds(settings.get('semantic_threshold', 0), 0.1, 0.9):
                    advisories.append(f"{name}: semantic_threshold outside normal range (0.1-0.9)")

                if out_of_bounds(settings.get('keyword_threshold', 0), 0.02, 0.5):
                    advisories.append(f"{name}: keyword_threshold outside normal range (0.02-0.5)")

            if active == 0:
                problems.append("At least one search phase must be enabled")

        # LLM sampling and quality-degradation parameters
        run_checks([
            ('top_p', 1.0, 0.1, 1.0, problems,
             "top_p must be between 0.1 and 1.0"),
            ('top_k', 20, 1, 100, advisories,
             "top_k outside recommended range (1-100)"),
            ('min_p', 0.1, 0.01, 0.5, advisories,
             "min_p outside recommended range (0.01-0.5)"),
            ('initial_quality', 0.8, 0.5, 1.0, advisories,
             "initial_quality outside recommended range (0.5-1.0)"),
            ('quality_degradation', 0.15, 0.05, 0.3, advisories,
             "quality_degradation outside recommended range (0.05-0.3)"),
            ('min_quality', 0.3, 0.2, 0.5, advisories,
             "min_quality outside recommended range (0.2-0.5)"),
        ])

    except Exception as exc:
        problems.append(f"Configuration validation error: {str(exc)}")

    return {
        'valid': not problems,
        'issues': problems,
        'warnings': advisories
    }


def apply_validated_config(config):
    """Validate *config* and hand it back unchanged when it is acceptable.

    Raises:
        ValueError: if validation reports any issue. The message lists every
            issue, followed by the warnings when there are any.

    Warnings alone do not block the configuration; they are printed.
    """
    report = validate_config(config)

    if not report['valid']:
        message = "Configuration validation failed:\n" + "\n".join(
            f"‚ùå {issue}" for issue in report['issues']
        )
        if report['warnings']:
            message += "\n\nWarnings:\n" + "\n".join(
                f"‚ö†Ô∏è {warning}" for warning in report['warnings']
            )
        raise ValueError(message)

    cautions = report['warnings']
    if cautions:
        print("Configuration warnings:")
        for caution in cautions:
            print(f"‚ö†Ô∏è {caution}")

    return config

class ProgressTracker:
    """Thread-safe progress tracking for research process.

    Messages are stored with the elapsed time (seconds since construction or
    the last ``clear()``) and rendered one per line.
    """
    
    def __init__(self):
        self.messages = []
        # threading.Lock is NOT re-entrant: a method holding it must never
        # call another method that acquires it (see _format_locked).
        self.lock = threading.Lock()
        self.start_time = time.time()
    
    def _format_locked(self):
        """Render all messages; the caller must already hold ``self.lock``."""
        return "\n".join([f"üîÑ {m}" for m in self.messages])
    
    def add(self, message):
        """Add a progress message and return the full formatted log."""
        with self.lock:
            elapsed = time.time() - self.start_time
            timestamp = f"[{elapsed:.1f}s]"
            self.messages.append(f"{timestamp} {message}")
            # Render without re-acquiring the lock; calling format() here
            # would deadlock on the non-reentrant lock.
            return self._format_locked()
    
    def format(self):
        """Format all messages for display"""
        with self.lock:
            return self._format_locked()
    
    def clear(self):
        """Clear all messages and restart the elapsed-time clock."""
        with self.lock:
            self.messages = []
            self.start_time = time.time()
    
    def get_duration(self):
        """Get elapsed time in seconds since creation or the last clear()."""
        return time.time() - self.start_time


def check_initialization_status():
    """Report which module-level system components are currently loaded.

    Returns a dict with ``initialized`` (the global completion flag),
    ``components`` (component name -> whether its global is not None) and
    ``missing_components`` (names of the components that are still None).
    """
    initialized = initialization_complete

    loaded = {
        'embedding_model': embedding_model is not None,
        'reranker_model': reranker_model is not None,
        'llm_model': llm_model is not None,
        'dataset_loader': dataset_loader is not None,
        'search_engine': search_engine is not None,
        'knowledge_graph': knowledge_graph is not None,
        'reranker': reranker is not None,
        'llm_generator': llm_generator is not None,
        'conversation_manager': conversation_manager is not None
    }

    return {
        'initialized': initialized,
        'components': loaded,
        'missing_components': [name for name, ready in loaded.items() if not ready]
    }


def wait_for_initialization(timeout=300, check_interval=2):
    """Block until the global initialization flag is set.

    Polls every *check_interval* seconds, printing the components that are
    still missing on each pass. Returns True once initialization completes.

    Raises:
        TimeoutError: if more than *timeout* seconds elapse first.
    """
    import time
    began = time.time()

    while True:
        if initialization_complete:
            break

        if time.time() - began > timeout:
            raise TimeoutError(f"System initialization timeout after {timeout} seconds")

        pending = check_initialization_status()['missing_components']
        if pending:
            print(f"Waiting for: {', '.join(pending)}")

        time.sleep(check_interval)

    print("‚úÖ System initialization complete!")
    return True

def safe_chat_wrapper(message, history, config_dict, show_thinking=True, show_sources=True, show_metadata=True, init_timeout=300):
    """Safe wrapper for the chat function with error recovery.

    Generator that forwards everything yielded by ``chat_with_legal_rag``.
    Status and error messages produced by the wrapper itself are yielded as
    ``(history + [[message, text]], "")`` tuples.

    Args:
        message: The user's message.
        history: Chat history as a list of ``[user, assistant]`` pairs.
        config_dict: Configuration; validated with ``apply_validated_config``
            before the chat function is called.
        show_thinking, show_sources, show_metadata: Passed through unchanged.
        init_timeout: Maximum number of seconds to wait for system
            initialization before giving up. Previously the wait was
            unbounded, so a failed initialization made this generator loop
            forever.

    Behaviour:
        * While the system is not initialized, a "still initializing" message
          is yielded every 2 seconds until *init_timeout* elapses.
        * A configuration error is reported once and ends the generator.
        * Any other exception triggers up to ``max_retries`` attempts, with a
          best-effort cache cleanup between attempts.
    """
    max_retries = 2
    retry_count = 0
    init_deadline = time.time() + init_timeout
    
    while retry_count < max_retries:
        try:
            # Check initialization (bounded wait; does not consume a retry)
            if not initialization_complete:
                if time.time() >= init_deadline:
                    yield history + [[message, f"System initialization timed out after {init_timeout} seconds. Please try again later."]], ""
                    return
                yield history + [[message, "‚è≥ System is still initializing. Please wait..."]], ""
                time.sleep(2)
                continue
            
            # Validate configuration
            try:
                apply_validated_config(config_dict)
            except ValueError as e:
                yield history + [[message, f"‚ùå Configuration error:\n\n{str(e)}"]], ""
                return
            
            # Call main chat function
            for result in chat_with_legal_rag(message, history, config_dict, show_thinking, show_sources, show_metadata):
                yield result
            
            # Success - break retry loop
            break
            
        except Exception as e:
            retry_count += 1
            error_msg = f"‚ùå Error (attempt {retry_count}/{max_retries}): {str(e)}"
            
            if retry_count < max_retries:
                error_msg += "\n\nüîÑ Retrying..."
                yield history + [[message, error_msg]], ""
                time.sleep(1)
                
                # Try to recover; cleanup is best-effort and must not mask
                # the retry itself.
                try:
                    clear_cache()
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                except Exception:
                    pass
            else:
                # Log before yielding: if the consumer stops iterating after
                # the final message, code after the yield would never run.
                import traceback
                traceback.print_exc()
                
                error_msg += "\n\n‚ùå Maximum retries reached. Please try again or contact support."
                yield history + [[message, error_msg]], ""

def system_health_check():
    """Collect a snapshot of system health and return it as a dict.

    The report has the keys ``timestamp`` (ISO string), ``status``,
    ``checks`` (name -> measured value), ``warnings`` and ``errors``
    (lists of messages).

    ``status`` ends up as one of:

    * ``'unhealthy'`` - at least one error was recorded
    * ``'degraded'``  - RAM usage above 95% and no errors
    * ``'warning'``   - warnings only
    * ``'healthy'``   - nothing to report
    * ``'error'``     - the health check itself raised; the exception text
      is appended to ``errors``

    Reads the module-level ``initialization_complete``, ``embedding_model``,
    ``reranker_model``, ``llm_model``, ``dataset_loader``,
    ``knowledge_graph`` and ``conversation_manager`` objects.
    """
    checks = {}
    warning_list = []
    error_list = []
    report = {
        'timestamp': datetime.now().isoformat(),
        'status': 'healthy',
        'checks': checks,
        'warnings': warning_list,
        'errors': error_list,
    }

    try:
        # --- Initialization flag -------------------------------------
        checks['initialization'] = initialization_complete

        if initialization_complete:
            # --- Models: only presence is verified --------------------
            checks['embedding_model'] = embedding_model is not None
            checks['reranker_model'] = reranker_model is not None
            checks['llm_model'] = llm_model is not None

            # --- Dataset ----------------------------------------------
            if dataset_loader:
                stats = dataset_loader.get_statistics()
                record_total = stats.get('total_records', 0)
                checks['dataset_records'] = record_total
                checks['kg_enhanced'] = stats.get('kg_enhanced', 0)

                if record_total == 0:
                    error_list.append("No records in dataset")
                    report['status'] = 'unhealthy'

            # --- Knowledge-graph parse cache --------------------------
            if knowledge_graph:
                cache_stats = knowledge_graph.get_cache_stats()
                hit_rate = cache_stats.get('hit_rate', 0)
                checks['kg_cache_hit_rate'] = hit_rate

                # Only complain once there is enough traffic to judge.
                if hit_rate < 20 and cache_stats.get('total_requests', 0) > 100:
                    warning_list.append("Low KG cache hit rate")
        else:
            error_list.append("System not initialized")
            report['status'] = 'unhealthy'

        # --- Host RAM (psutil is optional) ----------------------------
        try:
            import psutil
            ram_percent = psutil.virtual_memory().percent
            checks['memory_percent'] = ram_percent

            if ram_percent > 90:
                warning_list.append(f"High memory usage: {ram_percent:.1f}%")
                if ram_percent > 95:
                    report['status'] = 'degraded'
        except ImportError:
            warning_list.append("Cannot check memory (psutil not installed)")

        # --- GPU memory, when CUDA is present -------------------------
        if torch.cuda.is_available():
            try:
                bytes_per_gb = 1024**3
                allocated_gb = torch.cuda.memory_allocated() / bytes_per_gb
                reserved_gb = torch.cuda.memory_reserved() / bytes_per_gb

                checks['gpu_memory_allocated_gb'] = allocated_gb
                checks['gpu_memory_reserved_gb'] = reserved_gb

                if allocated_gb > 10:  # warn above 10 GB allocated
                    warning_list.append(f"High GPU memory usage: {allocated_gb:.1f}GB")
            except Exception as gpu_error:
                warning_list.append(f"Cannot check GPU memory: {str(gpu_error)}")

        # --- Conversation history size --------------------------------
        if conversation_manager:
            exchange_count = len(conversation_manager.conversation_history)
            checks['conversation_history_count'] = exchange_count

            if exchange_count > 100:
                warning_list.append(f"Large conversation history: {exchange_count} exchanges")

        # --- Final status: errors win, 'degraded' outranks 'warning' --
        if error_list:
            report['status'] = 'unhealthy'
        elif warning_list and report['status'] != 'degraded':
            report['status'] = 'warning'

    except Exception as failure:
        report['status'] = 'error'
        error_list.append(f"Health check failed: {str(failure)}")

    return report


def format_health_report(health_report):
    """Render a health report dict as a Markdown string.

    Expects the keys ``status``, ``timestamp``, ``checks``, ``warnings``
    and ``errors``. The output starts with a title, the upper-cased status
    and the timestamp; then one section each for checks, warnings and
    errors. A section is emitted only when it has entries and is followed
    by a blank line.
    """
    def render_check(name, value):
        # bool is tested first because bool is a subclass of int.
        if isinstance(value, bool):
            marker = "‚úÖ" if value else "‚ùå"
            return f"- {marker} **{name}**: {value}"
        if isinstance(value, (int, float)):
            return f"- üìä **{name}**: {value}"
        return f"- **{name}**: {value}"

    lines = [
        "## üè• System Health Report",
        "",
        f"**Status:** {health_report['status'].upper()}",
        f"**Timestamp:** {health_report['timestamp']}",
        "",
    ]

    # Checks section
    checks = health_report['checks']
    if checks:
        lines.append("### ‚úÖ System Checks")
        lines.extend(render_check(name, value) for name, value in checks.items())
        lines.append("")

    # Warnings and errors are both plain bullet lists under a heading.
    for heading, key in (("### ‚ö†Ô∏è Warnings", 'warnings'), ("### ‚ùå Errors", 'errors')):
        messages = health_report[key]
        if messages:
            lines.append(heading)
            lines.extend(f"- {text}" for text in messages)
            lines.append("")

    return "\n".join(lines)
    
# =============================================================================
# FIXED: EFFICIENT DATASET LOADER WITH MEMORY OPTIMIZATION
# =============================================================================

class EnhancedKGDatasetLoader:
    """Load the KG-enhanced regulation dataset into memory and index it.

    After a successful :meth:`load_from_huggingface` the instance holds:

    * ``all_records``  - list of record dicts built by :meth:`_create_record`
    * ``embeddings``   - CPU float32 tensor with one row per record
    * ``tfidf_matrix`` - CSR matrix with one row per record, or ``None``
      when the dataset carries no TF-IDF vectors
    * ``tfidf_vectorizer`` - placeholder object exposing ``transform``
    * ``kg_*_lookup``  - ``global_id -> raw JSON string`` (parsed lazily by
      consumers); only records whose JSON is non-empty are present
    * ``*_index``      - tier / domain -> list of positions in ``all_records``
    """

    # Rows converted to pandas and processed per pass.
    CHUNK_SIZE = 1000
    # Feature width of the placeholder vectorizer when no TF-IDF data exists.
    DEFAULT_TFIDF_FEATURES = 20000
    # (record field, lookup attribute, value that means "no data")
    _JSON_LOOKUP_FIELDS = (
        ('kg_entities_json', 'kg_entities_lookup', '[]'),
        ('kg_cross_references_json', 'kg_cross_references_lookup', '[]'),
        ('kg_legal_domains_json', 'kg_domains_lookup', '[]'),
        ('kg_concept_clusters_json', 'kg_concept_clusters_lookup', '{}'),
        ('kg_legal_actions_json', 'kg_legal_actions_lookup', '{}'),
        ('kg_sanctions_json', 'kg_sanctions_lookup', '{}'),
        ('kg_concept_vector_json', 'kg_concept_vectors_lookup', '[]'),
    )

    def __init__(self, dataset_name, embedding_dim):
        """Remember the dataset location and embedding width; load nothing yet.

        Args:
            dataset_name: name or path handed to ``datasets.load_dataset``.
            embedding_dim: length of the zero vector substituted for records
                that have no stored embedding.
        """
        self.dataset_name = dataset_name
        self.embedding_dim = embedding_dim
        self.all_records = []
        self.embeddings = None
        self.tfidf_matrix = None
        self.tfidf_vectorizer = None

        # KG lookups: global_id -> raw JSON string
        self.kg_entities_lookup = {}
        self.kg_cross_references_lookup = {}
        self.kg_domains_lookup = {}
        self.kg_concept_clusters_lookup = {}
        self.kg_legal_actions_lookup = {}
        self.kg_sanctions_lookup = {}
        self.kg_concept_vectors_lookup = {}

        # Numeric indexes: tier / domain -> positions in all_records
        self.authority_index = {}
        self.temporal_index = {}
        self.kg_connectivity_index = {}
        self.hierarchy_index = {}
        self.domain_index = {}

    # ------------------------------------------------------------------
    # Loading
    # ------------------------------------------------------------------

    def load_from_huggingface(self, progress_callback=None, limit=100000):
        """Load up to ``limit`` training rows and build every index.

        The split is loaded non-streaming and then walked in blocks of
        ``CHUNK_SIZE`` rows, releasing each block before the next one is
        converted to pandas.

        Records, embeddings and TF-IDF rows are appended in lock-step, so
        row ``i`` of ``embeddings`` and of ``tfidf_matrix`` always belongs
        to ``all_records[i]``. A record without an embedding gets a zero
        vector; a record without a TF-IDF vector gets a zero row when at
        least one other record has one.

        Args:
            progress_callback: optional ``callable(str)`` for status text.
            limit: maximum number of rows to read; ``None`` reads the
                whole ``train`` split.

        Returns:
            ``True`` on success. ``False`` if anything raised; the error is
            reported through ``progress_callback`` and the traceback is
            printed.
        """
        def notify(message):
            if progress_callback:
                progress_callback(message)

        try:
            notify("üì• Loading enhanced KG dataset with streaming...")

            from datasets import load_dataset
            import gc

            print(f"   Limit: {limit} records")

            # `train[:None]` is not a valid split expression.
            split = 'train' if limit is None else f'train[:{limit}]'
            dataset = load_dataset(self.dataset_name, split=split, streaming=False)

            total_rows = len(dataset)
            chunk_size = self.CHUNK_SIZE

            notify(f"üìä Processing {total_rows:,} records in chunks of {chunk_size:,}...")

            records = []
            embeddings = []
            tfidf_rows = []  # None marks "this record has no TF-IDF vector"

            for start_idx in range(0, total_rows, chunk_size):
                end_idx = min(start_idx + chunk_size, total_rows)

                chunk = dataset.select(range(start_idx, end_idx))
                df_chunk = chunk.to_pandas()

                for idx, row in df_chunk.iterrows():
                    try:
                        record = self._create_record(row, start_idx + idx)

                        if 'embedding' in row and row['embedding'] is not None:
                            embedding = row['embedding']
                        else:
                            embedding = np.zeros(self.embedding_dim, dtype=np.float32)

                        if 'tfidf_vector' in row and row['tfidf_vector'] is not None:
                            tfidf_vector = row['tfidf_vector']
                        else:
                            tfidf_vector = None
                    except Exception as e:
                        notify(f"   ‚ö†Ô∏è Skipping record {start_idx + idx}: {e}")
                        continue

                    # Append all three together so the containers can never
                    # get out of step with each other.
                    records.append(record)
                    embeddings.append(embedding)
                    tfidf_rows.append(tfidf_vector)

                # Drop the pandas copy of this block before the next one.
                del df_chunk, chunk
                self._release_memory()

            self.all_records = records

            notify("üìä Converting embeddings to numpy array...")
            self.embeddings = torch.tensor(np.array(embeddings, dtype=np.float32), device='cpu')
            del embeddings
            self._release_memory()

            notify("üîç Processing TF-IDF vectors...")
            self.tfidf_matrix = self._stack_tfidf_rows(tfidf_rows, block_size=chunk_size)
            del tfidf_rows

            if self.tfidf_matrix is not None:
                self._create_working_vectorizer(self.tfidf_matrix.shape[1])
            else:
                self._create_dummy_vectorizer()
            self._release_memory()

            notify("üóÉÔ∏è Building enhanced KG indexes...")
            self._build_enhanced_kg_indexes()

            del dataset
            self._release_memory()

            kg_count = len(self.kg_entities_lookup)
            notify(f"‚úÖ Ready: {len(self.all_records):,} records with {kg_count:,} KG-enhanced")

            return True

        except Exception as e:
            notify(f"‚ùå Loading failed: {str(e)}")
            import traceback
            traceback.print_exc()
            return False

    @staticmethod
    def _release_memory():
        """Run the garbage collector and, if CUDA is present, empty its cache."""
        import gc
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    @staticmethod
    def _stack_tfidf_rows(rows, block_size=1000):
        """Stack per-record TF-IDF vectors into one float32 CSR matrix.

        ``rows`` holds one entry per record: a dense vector, or ``None``
        when the record has none. ``None`` entries become all-zero rows so
        the matrix keeps exactly ``len(rows)`` rows. The dense intermediate
        is built ``block_size`` rows at a time rather than in one
        allocation.

        Returns ``None`` when no entry carries a vector.
        Raises ``ValueError`` if the vectors differ in length.
        """
        from scipy.sparse import csr_matrix, vstack

        width = next((len(vector) for vector in rows if vector is not None), None)
        if width is None:
            return None

        blocks = []
        for start in range(0, len(rows), block_size):
            block_rows = rows[start:start + block_size]
            dense = np.zeros((len(block_rows), width), dtype=np.float32)
            for offset, vector in enumerate(block_rows):
                if vector is not None:
                    dense[offset] = vector
            blocks.append(csr_matrix(dense))

        return vstack(blocks, format='csr')

    # ------------------------------------------------------------------
    # Record construction
    # ------------------------------------------------------------------

    @staticmethod
    def _is_missing(value):
        """Return True for ``None`` and scalar NaN/NA/NaT values."""
        if value is None:
            return True
        if isinstance(value, (str, bytes, list, tuple, dict, set, np.ndarray)):
            return False
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            return False

    @classmethod
    def _field(cls, row, key, default, cast=None):
        """Read ``row[key]``, substituting ``default`` when absent or missing.

        pandas reports absent cells as NaN/None. Left untreated these make
        ``int()`` raise (which used to discard the entire record), make
        ``bool()`` return True, and make ``str()`` produce ``'nan'`` or
        ``'None'``. Values that are present but not convertible still
        raise, so the caller can report and skip the record.
        """
        value = row.get(key, default)
        if cls._is_missing(value):
            value = default
        return value if cast is None else cast(value)

    def _create_record(self, row, idx):
        """Build the normalized record dict for one dataset row.

        Args:
            row: mapping-like object with ``.get`` (pandas Series or dict).
            idx: fallback ``global_id`` when the row has none.

        ``about`` is truncated to 200 characters and ``content`` to 500.
        The ``kg_*_json`` fields are kept as raw strings and parsed later
        on demand.
        """
        f = self._field
        return {
            # Original fields
            'global_id': f(row, 'global_id', idx),
            'local_id': f(row, 'local_id', 1),
            'regulation_type': f(row, 'regulation_type', 'Unknown', str),
            'enacting_body': f(row, 'enacting_body', 'Unknown', str),
            'regulation_number': f(row, 'regulation_number', 'N/A', str),
            'year': f(row, 'year', '2023', str),
            'about': f(row, 'about', '', str)[:200],
            'effective_date': f(row, 'effective_date', '2023-01-01', str),
            'chapter': f(row, 'chapter', 'N/A', str),
            'article': f(row, 'article', 'N/A', str),
            'content': f(row, 'content', '', str)[:500],
            'chunk_id': f(row, 'chunk_id', 1),

            # KG numeric features
            'kg_entity_count': f(row, 'kg_entity_count', 0, int),
            'kg_cross_ref_count': f(row, 'kg_cross_ref_count', 0, int),
            'kg_primary_domain': f(row, 'kg_primary_domain', 'Unknown', str),
            'kg_domain_confidence': f(row, 'kg_domain_confidence', 0.0, float),
            'kg_cluster_count': f(row, 'kg_cluster_count', 0, int),
            'kg_cluster_diversity': f(row, 'kg_cluster_diversity', 0.0, float),
            'kg_authority_score': f(row, 'kg_authority_score', 0.5, float),
            'kg_hierarchy_level': f(row, 'kg_hierarchy_level', 5, int),
            'kg_temporal_score': f(row, 'kg_temporal_score', 0.6, float),
            'kg_years_old': f(row, 'kg_years_old', 1, int),
            'kg_legal_richness': f(row, 'kg_legal_richness', 0.0, float),
            'kg_legal_complexity': f(row, 'kg_legal_complexity', 0.0, float),
            'kg_completeness_score': f(row, 'kg_completeness_score', 0.0, float),
            'kg_connectivity_score': f(row, 'kg_connectivity_score', 0.0, float),
            'kg_has_obligations': f(row, 'kg_has_obligations', False, bool),
            'kg_has_prohibitions': f(row, 'kg_has_prohibitions', False, bool),
            'kg_has_permissions': f(row, 'kg_has_permissions', False, bool),
            'kg_pagerank': f(row, 'kg_pagerank', 0.0, float),
            'kg_degree_centrality': f(row, 'kg_degree_centrality', 0.0, float),

            # Raw JSON strings, parsed lazily by consumers
            'kg_entities_json': f(row, 'kg_entities_json', '[]', str),
            'kg_cross_references_json': f(row, 'kg_cross_references_json', '[]', str),
            'kg_legal_domains_json': f(row, 'kg_legal_domains_json', '[]', str),
            'kg_concept_clusters_json': f(row, 'kg_concept_clusters_json', '{}', str),
            'kg_legal_actions_json': f(row, 'kg_legal_actions_json', '{}', str),
            'kg_sanctions_json': f(row, 'kg_sanctions_json', '{}', str),
            'kg_concept_vector_json': f(row, 'kg_concept_vector_json', '[]', str),
            'kg_citation_impact_json': f(row, 'kg_citation_impact_json', '{}', str),
        }

    # ------------------------------------------------------------------
    # Placeholder vectorizers
    # ------------------------------------------------------------------

    def _create_working_vectorizer(self, n_features):
        """Install a placeholder vectorizer that is ``n_features`` wide.

        The object mimics a few attributes of a fitted scikit-learn
        vectorizer, but ``transform`` always returns an all-zero sparse
        matrix of shape ``(len(texts), n_features)``. No vocabulary is
        available to encode new text.
        """
        class WorkingVectorizer:
            def __init__(self, features):
                self.vocabulary_ = {}
                self._tfidf = True
                self.idf_ = None
                self.stop_words_ = set()
                self.n_features = features

            def transform(self, texts):
                from scipy.sparse import csr_matrix
                if isinstance(texts, str):
                    texts = [texts]
                return csr_matrix((len(texts), self.n_features))

        self.tfidf_vectorizer = WorkingVectorizer(n_features)

    def _create_dummy_vectorizer(self):
        """Install the placeholder vectorizer at the default feature width.

        Used when the dataset has no TF-IDF vectors at all.
        """
        self._create_working_vectorizer(self.DEFAULT_TFIDF_FEATURES)

    # ------------------------------------------------------------------
    # Index construction
    # ------------------------------------------------------------------

    @staticmethod
    def _score_tier(score):
        """Map a score to an integer tier: ``int(score * 10)`` clamped to 0..10."""
        return max(0, min(10, int(score * 10)))

    def _build_enhanced_kg_indexes(self):
        """Rebuild the KG lookups and numeric indexes from ``all_records``.

        The lookup dicts are cleared in place rather than replaced, so
        other objects that hold a reference to them see the refreshed
        content and no ids from an earlier load survive.

        Records that cannot be indexed are skipped. Indexing one record is
        all-or-nothing across the five numeric indexes.
        """
        lookups = [
            (field, getattr(self, attr), empty)
            for field, attr, empty in self._JSON_LOOKUP_FIELDS
        ]
        for _, lookup, _ in lookups:
            lookup.clear()

        # Lookups keep only JSON strings that actually carry data.
        for record in self.all_records:
            try:
                doc_id = record['global_id']
                for field, lookup, empty in lookups:
                    raw_json = record.get(field, empty)
                    if raw_json != empty:
                        lookup[doc_id] = raw_json
            except (KeyError, TypeError):
                continue

        authority = defaultdict(list)
        temporal = defaultdict(list)
        connectivity = defaultdict(list)
        hierarchy = defaultdict(list)
        domains = defaultdict(list)

        for position, record in enumerate(self.all_records):
            try:
                authority_tier = self._score_tier(record['kg_authority_score'])
                temporal_tier = self._score_tier(record['kg_temporal_score'])
                connectivity_tier = self._score_tier(record['kg_connectivity_score'])
                hierarchy_tier = max(1, min(10, record['kg_hierarchy_level']))
                domain = record['kg_primary_domain']
            except (KeyError, TypeError, ValueError, OverflowError):
                continue

            authority[authority_tier].append(position)
            temporal[temporal_tier].append(position)
            connectivity[connectivity_tier].append(position)
            hierarchy[hierarchy_tier].append(position)
            domains[domain].append(position)

        # Plain dicts, so a later lookup of an unknown tier cannot silently
        # create an empty entry.
        self.authority_index = dict(authority)
        self.temporal_index = dict(temporal)
        self.kg_connectivity_index = dict(connectivity)
        self.hierarchy_index = dict(hierarchy)
        self.domain_index = dict(domains)

    # ------------------------------------------------------------------
    # Statistics
    # ------------------------------------------------------------------

    def get_statistics(self):
        """Summarize the loaded dataset.

        Returns:
            ``{}`` when nothing is loaded, ``{'error': message}`` if
            computing the summary raised, otherwise a dict of counts,
            shapes, index sizes and per-field averages.
        """
        if not self.all_records:
            return {}

        try:
            records = self.all_records
            total_records = len(records)
            kg_enhanced = len(self.kg_entities_lookup)

            def mean_of(field):
                return np.mean([record[field] for record in records])

            def count_flag(field):
                return sum(1 for record in records if record[field])

            return {
                'total_records': total_records,
                'kg_enhanced': kg_enhanced,
                'kg_enhancement_rate': kg_enhanced / total_records,
                'embeddings_shape': self.embeddings.shape if self.embeddings is not None else None,
                'tfidf_shape': self.tfidf_matrix.shape if self.tfidf_matrix is not None else None,
                'tfidf_enabled': self.tfidf_matrix is not None,
                'memory_optimized': True,
                'authority_tiers': len(self.authority_index),
                'temporal_tiers': len(self.temporal_index),
                'kg_connectivity_tiers': len(self.kg_connectivity_index),
                'hierarchy_tiers': len(self.hierarchy_index),
                'unique_domains': len(self.domain_index),
                'avg_authority_score': mean_of('kg_authority_score'),
                'avg_temporal_score': mean_of('kg_temporal_score'),
                'avg_connectivity_score': mean_of('kg_connectivity_score'),
                'avg_entities_per_doc': mean_of('kg_entity_count'),
                'avg_cross_refs_per_doc': mean_of('kg_cross_ref_count'),
                'has_obligations': count_flag('kg_has_obligations'),
                'has_prohibitions': count_flag('kg_has_prohibitions'),
                'has_permissions': count_flag('kg_has_permissions'),
            }
        except Exception as e:
            return {'error': str(e)}

# =============================================================================
# ENHANCED KNOWLEDGE GRAPH WITH NEW FEATURES
# =============================================================================

class EnhancedKnowledgeGraph:
    """Enhanced KG class with caching"""
    
    def __init__(self, dataset_loader):
        self.dataset_loader = dataset_loader
        self.entities_lookup = dataset_loader.kg_entities_lookup
        self.cross_refs_lookup = dataset_loader.kg_cross_references_lookup
        self.domains_lookup = dataset_loader.kg_domains_lookup
        self.clusters_lookup = dataset_loader.kg_concept_clusters_lookup
        self.legal_actions_lookup = dataset_loader.kg_legal_actions_lookup
        self.sanctions_lookup = dataset_loader.kg_sanctions_lookup
        self.concept_vectors_lookup = dataset_loader.kg_concept_vectors_lookup
        
        # Caching
        self._parse_cache = {}
        self._cache_hits = 0
        self._cache_misses = 0
    
    # NOTE: extract_entities_from_text() is implemented further down in this
    # class, after clear_cache(). An identical copy used to sit here. Because
    # a later `def` with the same name rebinds the attribute while the class
    # body executes, this earlier copy could never be called. It was removed
    # so that only one definition has to be maintained.
    
    def get_parsed_kg_data(self, doc_id, data_type='entities'):
        """Parse KG JSON data on demand WITH CACHING"""
        # Create cache key
        cache_key = f"{doc_id}_{data_type}"
        
        # Check cache first
        if cache_key in self._parse_cache:
            self._cache_hits += 1
            return self._parse_cache[cache_key]
        
        self._cache_misses += 1
        
        try:
            result = None
            
            if data_type == 'entities' and doc_id in self.entities_lookup:
                result = json.loads(self.entities_lookup[doc_id])
            elif data_type == 'cross_refs' and doc_id in self.cross_refs_lookup:
                result = json.loads(self.cross_refs_lookup[doc_id])
            elif data_type == 'domains' and doc_id in self.domains_lookup:
                result = json.loads(self.domains_lookup[doc_id])
            elif data_type == 'clusters' and doc_id in self.clusters_lookup:
                result = json.loads(self.clusters_lookup[doc_id])
            elif data_type == 'legal_actions' and doc_id in self.legal_actions_lookup:
                result = json.loads(self.legal_actions_lookup[doc_id])
            elif data_type == 'sanctions' and doc_id in self.sanctions_lookup:
                result = json.loads(self.sanctions_lookup[doc_id])
            elif data_type == 'concept_vector' and doc_id in self.concept_vectors_lookup:
                result = json.loads(self.concept_vectors_lookup[doc_id])
            
            # Store in cache (limit cache size)
            if result is not None:
                if len(self._parse_cache) > 1000:  # Max 1000 cached entries
                    # Remove oldest 100 entries
                    keys_to_remove = list(self._parse_cache.keys())[:100]
                    for key in keys_to_remove:
                        del self._parse_cache[key]
                
                self._parse_cache[cache_key] = result
            
            return result
            
        except Exception as e:
            print(f"Error parsing KG data for {doc_id}/{data_type}: {e}")
            return None
    
    def get_cache_stats(self):
        """Get cache performance statistics"""
        total_requests = self._cache_hits + self._cache_misses
        hit_rate = (self._cache_hits / total_requests * 100) if total_requests > 0 else 0
        
        return {
            'cache_size': len(self._parse_cache),
            'cache_hits': self._cache_hits,
            'cache_misses': self._cache_misses,
            'hit_rate': hit_rate,
            'total_requests': total_requests
        }
    
    def clear_cache(self):
        """Clear the parse cache"""
        self._parse_cache = {}
        self._cache_hits = 0
        self._cache_misses = 0
        
    def extract_entities_from_text(self, text):
        """Enhanced entity extraction for ANY Indonesian regulation"""
        if not text or pd.isna(text):
            return []
        
        try:
            text_lower = str(text).lower()
            entities = []
            
            # Pattern 1: Standard format - "Type No. X Tahun YYYY"
            for reg_type, patterns in REGULATION_TYPE_PATTERNS.items():
                for pattern in patterns:
                    # Build flexible regex pattern
                    # Matches: "PP No. 41 Tahun 2009", "pp no 41 tahun 2009", "PP 41/2009", etc.
                    regex_pattern = (
                        rf'{re.escape(pattern)}\s*'  # Regulation type
                        r'(?:nomor|no\.?|num\.?)?\s*'  # Optional "nomor"/"no"
                        r'(\d+)\s*'  # Number
                        r'(?:' + '|'.join([re.escape(sep) for sep in YEAR_SEPARATORS]) + r')?\s*'  # Separator
                        r'(\d{4})?'  # Optional year
                    )
                    
                    matches = re.finditer(regex_pattern, text_lower, re.IGNORECASE)
                    for match in matches:
                        number = match.group(1)
                        year = match.group(2) if match.group(2) else ''
                        
                        # Create normalized entity
                        entity_text = f"{pattern} {number}"
                        if year:
                            entity_text += f" tahun {year}"
                        
                        entities.append((entity_text, 'regulation_reference'))
            
            # Pattern 2: Pasal (Article) references
            pasal_pattern = r'pasal\s*(\d+)(?:\s*ayat\s*\((\d+)\))?(?:\s*huruf\s*([a-z]))?'
            matches = re.finditer(pasal_pattern, text_lower)
            for match in matches:
                entities.append((match.group(0), 'article_reference'))
            
            # Pattern 3: Bab (Chapter) references
            bab_pattern = r'bab\s+([IVX]+|\d+)'
            matches = re.finditer(bab_pattern, text_lower)
            for match in matches:
                entities.append((match.group(0), 'chapter_reference'))
            
            return entities
        except Exception as e:
            print(f"Error in entity extraction: {e}")
            return []
    
    def calculate_enhanced_kg_score(self, query_entities, record, query_type='general'):
        """Enhanced KG scoring with new dataset features"""
        try:
            total_score = 0.0
            doc_id = record['global_id']
            
            # 1. Entity matching (enhanced)
            doc_entities = self.get_parsed_kg_data(doc_id, 'entities')
            if doc_entities and query_entities:
                entity_score = self._calculate_entity_match(query_entities, doc_entities)
                total_score += entity_score * KG_WEIGHTS['direct_match']
            
            # 2. Cross-reference boost
            cross_refs = self.get_parsed_kg_data(doc_id, 'cross_refs')
            if cross_refs:
                cross_ref_score = self._calculate_cross_ref_relevance(query_entities, cross_refs)
                total_score += cross_ref_score * KG_WEIGHTS['cross_reference']
            
            # 3. Domain matching
            domains = self.get_parsed_kg_data(doc_id, 'domains')
            if domains:
                domain_score = self._calculate_domain_relevance(query_type, domains, record)
                total_score += domain_score * KG_WEIGHTS['domain_match']
            
            # 4. Legal actions matching (for procedural/sanctions queries)
            if query_type in ['procedural', 'sanctions']:
                legal_actions = self.get_parsed_kg_data(doc_id, 'legal_actions')
                if legal_actions:
                    action_score = self._calculate_legal_action_relevance(query_type, legal_actions, record)
                    total_score += action_score * KG_WEIGHTS['legal_action_match']
            
            # 5. Sanctions matching (for sanctions queries)
            if query_type == 'sanctions':
                sanctions = self.get_parsed_kg_data(doc_id, 'sanctions')
                if sanctions:
                    sanction_score = self._calculate_sanction_relevance(sanctions)
                    total_score += sanction_score * KG_WEIGHTS['sanction_relevance']
            
            # 6. Concept clusters matching
            clusters = self.get_parsed_kg_data(doc_id, 'clusters')
            if clusters and query_entities:
                cluster_score = self._calculate_cluster_relevance(query_entities, clusters)
                total_score += cluster_score * KG_WEIGHTS['concept_cluster']
            
            # 7. Hierarchy boost
            hierarchy_score = self._calculate_hierarchy_boost(record)
            total_score += hierarchy_score * KG_WEIGHTS['hierarchy_boost']
            
            # 8. Connectivity boost
            connectivity_score = record.get('kg_connectivity_score', 0.0)
            total_score += connectivity_score * KG_WEIGHTS['connectivity_boost']
            
            # 9. Citation impact (if available)
            if record.get('kg_pagerank', 0.0) > 0:
                citation_score = record['kg_pagerank']
                total_score += citation_score * KG_WEIGHTS['citation_impact']
            
            return min(1.0, total_score)
        except Exception:
            return 0.0
    
    def _calculate_entity_match(self, query_entities, doc_entities):
        """Calculate entity matching score"""
        try:
            if not query_entities or not doc_entities:
                return 0.0
            
            query_entity_set = {str(entity).lower() for entity in query_entities}
            
            # Handle both list of strings and list of dicts
            doc_entity_set = set()
            for entity in doc_entities:
                if isinstance(entity, dict):
                    doc_entity_set.add(str(entity.get('text', '')).lower())
                else:
                    doc_entity_set.add(str(entity).lower())
            
            overlap = query_entity_set & doc_entity_set
            if overlap:
                return min(1.0, len(overlap) / len(query_entity_set))
            
            # Partial matching
            partial_score = 0.0
            for q_entity in query_entities:
                for d_entity in doc_entity_set:
                    if str(q_entity).lower() in str(d_entity).lower():
                        partial_score += 0.5
                        break
            
            return min(1.0, partial_score / len(query_entities)) if query_entities else 0.0
        except Exception:
            return 0.0
    
    def _calculate_cross_ref_relevance(self, query_entities, cross_refs):
        """Calculate cross-reference relevance"""
        try:
            if not cross_refs:
                return 0.0
            
            # More cross-references = better connected document
            ref_count = len(cross_refs) if isinstance(cross_refs, list) else 0
            return min(1.0, ref_count / 10.0)  # Normalize to 0-1
        except Exception:
            return 0.0
    
    def _calculate_domain_relevance(self, query_type, domains, record):
        """Calculate domain relevance"""
        try:
            # Map query types to relevant domains
            query_domain_map = {
                'procedural': ['administrative', 'procedural', 'governance'],
                'sanctions': ['criminal', 'administrative', 'sanctions'],
                'definitional': ['general', 'definitions', 'terminology'],
                'specific_article': ['all'],
                'general': ['all']
            }
            
            relevant_domains = query_domain_map.get(query_type, ['all'])
            
            if 'all' in relevant_domains:
                return record.get('kg_domain_confidence', 0.5)
            
            # Check if document domains match relevant domains
            if isinstance(domains, list):
                for domain_info in domains:
                    if isinstance(domain_info, dict):
                        domain_name = domain_info.get('domain', '').lower()
                        if any(rel_dom in domain_name for rel_dom in relevant_domains):
                            return domain_info.get('confidence', 0.5)
            
            return 0.0
        except Exception:
            return 0.0
    
    def _calculate_legal_action_relevance(self, query_type, legal_actions, record):
        """Calculate legal action relevance"""
        try:
            if query_type == 'procedural':
                # Check for procedural actions
                if record.get('kg_has_obligations', False):
                    return 0.8
                if record.get('kg_has_permissions', False):
                    return 0.6
            elif query_type == 'sanctions':
                # Check for prohibitions/sanctions
                if record.get('kg_has_prohibitions', False):
                    return 0.9
            
            return 0.0
        except Exception:
            return 0.0
    
    def _calculate_sanction_relevance(self, sanctions):
        """Calculate sanction relevance"""
        try:
            if not sanctions:
                return 0.0
            
            # Check if sanctions data is available
            if isinstance(sanctions, dict):
                if sanctions.get('has_sanctions', False):
                    return 0.9
                sanction_count = len(sanctions.get('sanctions', []))
                return min(1.0, sanction_count / 3.0)
            
            return 0.0
        except Exception:
            return 0.0
    
    def _calculate_cluster_relevance(self, query_entities, clusters):
        """Calculate concept cluster relevance"""
        try:
            if not clusters or not query_entities:
                return 0.0
            
            # Extract concepts from clusters
            cluster_concepts = set()
            if isinstance(clusters, dict):
                for cluster_name, concepts in clusters.items():
                    if isinstance(concepts, list):
                        cluster_concepts.update([str(c).lower() for c in concepts])
            
            # Check overlap with query entities
            query_set = {str(e).lower() for e in query_entities}
            overlap = query_set & cluster_concepts
            
            if overlap:
                return min(1.0, len(overlap) / len(query_set))
            
            return 0.0
        except Exception:
            return 0.0
    
    def _calculate_hierarchy_boost(self, record):
        """Calculate hierarchy-based boost"""
        try:
            # Lower hierarchy level = higher authority
            hierarchy_level = record.get('kg_hierarchy_level', 5)
            # Normalize: level 1 = 1.0, level 10 = 0.1
            return max(0.1, (11 - hierarchy_level) / 10.0)
        except Exception:
            return 0.5


    def follow_citation_chain(self, seed_document_ids, max_depth=2):
        """Follow citation chains outward from a set of seed documents.

        For each seed, the document's ``'cross_refs'`` (read through
        ``self.get_parsed_kg_data``) are walked depth-first. At most the first
        five references of any document are followed. A single ``visited``
        set is shared by all seeds, so a document is expanded only once per
        call.

        Args:
            seed_document_ids: Document ids, or record dicts carrying a
                ``'global_id'``. Seeds without a usable id are skipped.
            max_depth: Maximum citation distance from a seed (seed's direct
                references are depth 1).

        Returns:
            Dict mapping each seed id to a list of
            ``{'doc_id', 'citation_depth', 'cited_by'}`` dicts. A document may
            be listed more than once when it is reachable through several
            paths. Returns ``{}`` if an unexpected error aborts the walk.
        """
        try:
            citation_network = {}
            visited = set()

            def traverse_citations(doc_id, depth):
                if depth > max_depth or doc_id in visited:
                    return []

                visited.add(doc_id)

                # Get cross-references
                cross_refs = self.get_parsed_kg_data(doc_id, 'cross_refs')

                # Only a real sequence can be followed. Slicing a dict raises,
                # and slicing a string would treat single characters as ids.
                if not isinstance(cross_refs, (list, tuple)):
                    return []

                related_docs = []
                for ref in cross_refs[:5]:  # Limit to top 5 per document
                    try:
                        ref_id = ref.get('target_id') if isinstance(ref, dict) else str(ref)
                        if not ref_id or ref_id in visited:
                            continue

                        related_docs.append({
                            'doc_id': ref_id,
                            'citation_depth': depth,
                            'cited_by': doc_id
                        })

                        # Recursive traversal
                        if depth < max_depth:
                            related_docs.extend(traverse_citations(ref_id, depth + 1))
                    except Exception:
                        # A malformed reference must not stop its siblings.
                        continue

                return related_docs

            # Start traversal from each seed
            for seed in seed_document_ids:
                seed_id = seed.get('global_id') if isinstance(seed, dict) else seed
                if seed_id is None:
                    # A record dict without an id cannot serve as a dict key.
                    continue

                citation_network[seed_id] = traverse_citations(seed_id, 1)

            return citation_network

        except Exception as e:
            print(f"Error following citations: {e}")
            return {}
    
    def boost_cited_documents(self, candidates, citation_network):
        """Raise the scores of candidates that occur in a citation network.

        Every candidate whose ``record['global_id']`` is listed in
        ``citation_network`` receives a bonus of ``0.15 / depth`` (flat 0.15
        for a non-positive depth), using the smallest depth at which the
        document was cited. The bonus is added to ``'final_consensus_score'``
        and ``'composite_score'`` where present, each capped at 1.0, and the
        candidate is tagged with ``'in_citation_chain'`` and
        ``'citation_depth'``.

        The candidate dicts are modified in place and the same list is
        returned. On any error the list is returned as it stands at that
        point, after printing the error.
        """
        try:
            # Smallest citation depth seen per document (closer = better).
            nearest_depth = {}
            for chain in citation_network.values():
                for entry in chain:
                    cited_id = entry['doc_id']
                    hop = entry['citation_depth']
                    if cited_id not in nearest_depth or hop < nearest_depth[cited_id]:
                        nearest_depth[cited_id] = hop

            for cand in candidates:
                cited_id = cand['record'].get('global_id')
                if cited_id not in nearest_depth:
                    continue

                hop = nearest_depth[cited_id]
                # Closer citations earn the larger share of the 0.15 bonus.
                bonus = 0.15 / hop if hop > 0 else 0.15

                for score_key in ('final_consensus_score', 'composite_score'):
                    if score_key in cand:
                        cand[score_key] = min(1.0, cand[score_key] + bonus)

                cand['in_citation_chain'] = True
                cand['citation_depth'] = hop

            return candidates

        except Exception as e:
            print(f"Error boosting cited documents: {e}")
            return candidates


    def extract_regulation_references_with_confidence(self, text):
        """
        Extract regulation references from free text, each with a confidence score.

        The lower-cased text is scanned with every pattern listed in
        ``REGULATION_TYPE_PATTERNS``, in three passes:

        1. type + number + year ("UU No. 13 Tahun 2003", "PP 41/2009")
           -> confidence 1.0, specificity ``'complete'``
        2. type + number without a year ("UU No. 13")
           -> confidence 0.7, specificity ``'partial'``; skipped when the same
           pattern and number already produced a complete reference
        3. bare type mention ("undang-undang tersebut")
           -> confidence 0.3, specificity ``'vague'``; only when passes 1 and 2
           found nothing, at most one entry per regulation type

        Args:
            text: Text to scan. ``None``, NaN/``pd.NA`` and empty values give ``[]``.

        Returns:
            List of dicts sorted by confidence (highest first), each shaped as
            ``{'regulation': {'type', 'number', 'year', 'full_text'},
            'confidence': float, 'specificity': str}``. ``'type'`` holds the
            matched pattern string. Returns ``[]`` if extraction fails.
        """
        if text is None:
            return []
        try:
            # pd.isna comes first: bool(pd.NA) raises, so `not text` alone is unsafe.
            if pd.isna(text) or not text:
                return []
        except (TypeError, ValueError):
            # Non-scalar input (list/array) has no single truth value; its
            # string form is scanned below instead.
            pass

        try:
            text_lower = str(text).lower()
            references = []

            # Shared regex fragments: optional "nomor"/"no." + the number, and
            # the separator that introduces a year.
            number_part = r'\s*(?:nomor|no\.?|num\.?|number)?\s*(\d+)'
            year_link = r'\s*(?:tahun|th\.?|\/)\s*'

            # Pattern 1: Complete reference with year (HIGHEST CONFIDENCE)
            complete_keys = set()
            for patterns in REGULATION_TYPE_PATTERNS.values():
                for pattern in patterns:
                    complete_pattern = (
                        re.escape(pattern) + number_part + year_link + r'(\d{4})'
                    )
                    for match in re.finditer(complete_pattern, text_lower, re.IGNORECASE):
                        number = match.group(1)
                        complete_keys.add((pattern, number))
                        references.append({
                            'regulation': {
                                'type': pattern,
                                'number': number,
                                'year': match.group(2),
                                'full_text': match.group(0)
                            },
                            'confidence': 1.0,  # HIGHEST - has type, number, and year
                            'specificity': 'complete'
                        })

            # Pattern 2: Reference with number but no year (MEDIUM CONFIDENCE)
            for patterns in REGULATION_TYPE_PATTERNS.values():
                for pattern in patterns:
                    # (?!\d) stops the number from backtracking to a shorter
                    # prefix; the second lookahead rejects numbers followed by
                    # a year, which pass 1 already handled.
                    partial_pattern = (
                        re.escape(pattern) + number_part
                        + r'(?!\d)(?!' + year_link + r'\d{4})'
                    )
                    for match in re.finditer(partial_pattern, text_lower, re.IGNORECASE):
                        number = match.group(1)

                        # Check if not already captured with year
                        if (pattern, number) in complete_keys:
                            continue

                        references.append({
                            'regulation': {
                                'type': pattern,
                                'number': number,
                                'year': '',
                                'full_text': match.group(0)
                            },
                            'confidence': 0.7,  # MEDIUM - has type and number
                            'specificity': 'partial'
                        })

            # Pattern 3: Just regulation type mentioned (LOW CONFIDENCE)
            if not references:  # Only if no specific references found
                for patterns in REGULATION_TYPE_PATTERNS.values():
                    for pattern in patterns:
                        if pattern in text_lower:
                            references.append({
                                'regulation': {
                                    'type': pattern,
                                    'number': '',
                                    'year': '',
                                    'full_text': pattern
                                },
                                'confidence': 0.3,  # LOW - only type mentioned
                                'specificity': 'vague'
                            })
                            break  # Only add once per type

            # Sort by confidence (highest first)
            references.sort(key=lambda ref: ref['confidence'], reverse=True)

            return references

        except Exception as e:
            print(f"Error extracting regulation references: {e}")
            return []

# =============================================================================
# ADVANCED QUERY ANALYZER
# =============================================================================

class AdvancedQueryAnalyzer:
    """
    Intelligent query analyzer that differentiates between:
    1. Specific legal phrases/entities (keyword-first search)
    2. Broad conceptual queries (semantic-first search)

    Queries carrying a regulation type followed by a number are routed to a
    metadata-first search; everything else falls back to a balanced hybrid.
    """

    def __init__(self, knowledge_graph):
        """Store the knowledge graph handle and build the static lookup tables."""
        self.kg = knowledge_graph

        # Common law names mapping (expandable): canonical name -> aliases
        self.common_law_names = {
            'kepabeanan': ['bea cukai', 'customs', 'pabean'],
            'ketenagakerjaan': ['tenaga kerja', 'pekerja', 'buruh'],
            'perpajakan': ['pajak', 'tax', 'pph', 'ppn'],
            'cipta kerja': ['job creation', 'omnibus law cipta kerja'],
            'perkawinan': ['nikah', 'marriage', 'kawin'],
            'agraria': ['tanah', 'land', 'pertanahan'],
            'keimigrasian': ['imigrasi', 'immigration', 'visa', 'paspor'],
            'kewarganegaraan': ['citizenship', 'warga negara'],
            'kehutanan': ['hutan', 'forest', 'forestry'],
            'pertambangan': ['tambang', 'mining', 'minerba'],
            'perikanan': ['ikan', 'fishery', 'nelayan'],
            'kesehatan': ['health', 'rumah sakit', 'obat'],
            'pendidikan': ['education', 'sekolah', 'universitas'],
            'perbankan': ['bank', 'banking', 'kredit'],
            'asuransi': ['insurance', 'pertanggungan'],
            'lingkungan hidup': ['environment', 'pencemaran', 'limbah'],
            'perlindungan konsumen': ['consumer protection', 'konsumen'],
            'hak cipta': ['copyright', 'intellectual property', 'paten'],
            'merek': ['trademark', 'brand'],
            'kepailitan': ['bankruptcy', 'pailit', 'insolvensi'],
            'arbitrase': ['arbitration', 'alternatif penyelesaian sengketa'],
            'pidana': ['criminal', 'tindak pidana', 'kejahatan'],
            'perdata': ['civil', 'gugatan', 'perkara perdata']
        }

        # Multi-word legal phrases that should be treated as single entities
        self.legal_phrases = {
            'cipta kerja': {
                'type': 'specific_concept',
                'priority': 0.95,
                'avoid_splitting': True,
                'context': 'job creation law, not copyright'
            },
            'hak cipta': {
                'type': 'specific_concept',
                'priority': 0.95,
                'avoid_splitting': True,
                'context': 'copyright, intellectual property'
            },
            'tenaga kerja': {
                'type': 'specific_concept',
                'priority': 0.90,
                'avoid_splitting': True,
                'context': 'labor, workforce'
            },
            'bea cukai': {
                'type': 'specific_concept',
                'priority': 0.90,
                'avoid_splitting': True,
                'context': 'customs duties'
            },
            'tindak pidana': {
                'type': 'specific_concept',
                'priority': 0.90,
                'avoid_splitting': True,
                'context': 'criminal offense'
            },
            'penanaman modal': {
                'type': 'specific_concept',
                'priority': 0.90,
                'avoid_splitting': True,
                'context': 'investment'
            },
            'tanggung jawab': {
                'type': 'specific_concept',
                'priority': 0.85,
                'avoid_splitting': True,
                'context': 'liability, responsibility'
            },
            'lingkungan hidup': {
                'type': 'specific_concept',
                'priority': 0.90,
                'avoid_splitting': True,
                'context': 'environment'
            }
        }

        # Conceptual query indicators (triggers semantic-first search)
        self.conceptual_indicators = [
            'bagaimana', 'apa itu', 'mengapa', 'kenapa', 'jelaskan',
            'apa saja', 'berapa', 'kapan', 'dimana', 'siapa',
            'prosedur', 'cara', 'langkah', 'tata cara',
            'definisi', 'pengertian', 'maksud', 'arti',
            'sanksi apa', 'hukuman apa', 'denda berapa',
            'syarat apa', 'ketentuan apa',
            'perbedaan', 'perbandingan', 'hubungan antara'
        ]

        # Specific query indicators (triggers keyword-first search)
        self.specific_indicators = [
            'tentang', 'mengenai', 'mengatur', 'diatur dalam',
            'berdasarkan', 'sesuai', 'menurut',
            'undang-undang', 'peraturan', 'pp', 'perpres', 'permen'
        ]

    def analyze_query(self, query: str) -> Dict[str, Any]:
        """
        Main analysis function that determines search strategy.

        The checks run in priority order and the first one that fires decides
        the result: regulation reference, known legal phrase, common law
        name, specific/conceptual indicator words, query length.

        Args:
            query: Raw user query. ``None`` or an empty string is treated as
                an empty query.

        Returns:
            {
                'search_strategy': 'metadata_first' | 'keyword_first' |
                                   'semantic_first' | 'hybrid_balanced',
                'key_phrases': [...],
                'law_name_detected': bool,
                'specific_entities': [...],
                'confidence': float,
                'reasoning': str,
                'metadata_hints': {...},
                'keyword_boost': float,
                'semantic_boost': float
            }
        """
        query_lower = (query or '').lower().strip()

        analysis = {
            'search_strategy': 'semantic_first',  # default
            'key_phrases': [],
            'law_name_detected': False,
            'specific_entities': [],
            'confidence': 0.5,
            'reasoning': '',
            'metadata_hints': {},
            'keyword_boost': 0.0,
            'semantic_boost': 0.0
        }

        # Step 1: Check for exact regulation reference (already handled by hybrid search)
        if self._has_exact_regulation_ref(query_lower):
            analysis['search_strategy'] = 'metadata_first'
            analysis['confidence'] = 0.95
            analysis['reasoning'] = 'Exact regulation reference detected'
            return analysis

        # Step 2: Check for specific legal phrases (HIGHEST PRIORITY)
        detected_phrases = self._detect_legal_phrases(query_lower)
        if detected_phrases:
            analysis['key_phrases'] = detected_phrases
            analysis['search_strategy'] = 'keyword_first'
            analysis['confidence'] = max(p['priority'] for p in detected_phrases)
            analysis['keyword_boost'] = 0.40  # Strong keyword emphasis
            analysis['semantic_boost'] = 0.10
            analysis['reasoning'] = f"Specific legal phrase detected: {', '.join([p['phrase'] for p in detected_phrases])}"

            # Add context to prevent semantic drift
            for phrase_info in detected_phrases:
                analysis['metadata_hints'][phrase_info['phrase']] = phrase_info['context']

            return analysis

        # Step 3: Check for common law names
        detected_law_name = self._detect_common_law_name(query_lower)
        if detected_law_name:
            analysis['law_name_detected'] = True
            analysis['specific_entities'].append(detected_law_name)
            analysis['search_strategy'] = 'keyword_first'
            analysis['confidence'] = 0.85
            analysis['keyword_boost'] = 0.35
            analysis['semantic_boost'] = 0.15
            analysis['reasoning'] = f"Common law name detected: {detected_law_name['name']}"
            analysis['metadata_hints']['about'] = detected_law_name['name']
            return analysis

        # Step 4: Check query structure patterns
        has_conceptual = any(ind in query_lower for ind in self.conceptual_indicators)
        has_specific = any(ind in query_lower for ind in self.specific_indicators)

        if has_specific and not has_conceptual:
            # Specific wording ("tentang ...") without a question word
            analysis['search_strategy'] = 'keyword_first'
            analysis['confidence'] = 0.70
            analysis['keyword_boost'] = 0.30
            analysis['semantic_boost'] = 0.20
            analysis['reasoning'] = 'Specific indicator detected without conceptual question'

            # Extract potential key terms after specific indicator
            key_terms = self._extract_key_terms_after_indicator(query_lower)
            if key_terms:
                analysis['key_phrases'] = [{'phrase': term, 'priority': 0.70, 'context': 'extracted term'} for term in key_terms]

            return analysis

        if has_conceptual:
            # "bagaimana prosedur..." - conceptual question
            analysis['search_strategy'] = 'semantic_first'
            analysis['confidence'] = 0.75
            analysis['keyword_boost'] = 0.15
            analysis['semantic_boost'] = 0.35
            analysis['reasoning'] = 'Conceptual question detected'
            return analysis

        # Step 5: Length-based heuristic (conceptual queries returned above)
        words = query_lower.split()
        if len(words) <= 4:
            # Short queries are often specific
            analysis['search_strategy'] = 'hybrid_balanced'
            analysis['confidence'] = 0.60
            analysis['keyword_boost'] = 0.25
            analysis['semantic_boost'] = 0.25
            analysis['reasoning'] = 'Short query - balanced approach'

            # Treat entire short query as potential key phrase; an empty
            # query contributes no phrase.
            if query_lower:
                analysis['key_phrases'] = [{'phrase': query_lower, 'priority': 0.60, 'context': 'short query'}]
            return analysis

        # Default: balanced hybrid
        analysis['search_strategy'] = 'hybrid_balanced'
        analysis['confidence'] = 0.50
        analysis['keyword_boost'] = 0.20
        analysis['semantic_boost'] = 0.30
        analysis['reasoning'] = 'No clear indicators - using balanced hybrid search'

        return analysis

    @staticmethod
    def _contains_term(text: str, term: str) -> bool:
        """Return True when *term* occurs in *text* as a whole word or phrase."""
        return re.search(rf'(?<!\w){re.escape(term)}(?!\w)', text) is not None

    def _has_exact_regulation_ref(self, query: str) -> bool:
        """Check if query names a regulation type followed by a number.

        Matches e.g. "UU No. 13 Tahun 2003". The "tahun <year>" part is
        optional in the pattern, so "UU No. 13" alone is also accepted.
        NOTE(review): confirm whether a year should be mandatory for a
        reference to count as exact.
        """
        for patterns in REGULATION_TYPE_PATTERNS.values():
            for pattern in patterns:
                if pattern not in query:
                    continue
                # Type, optional "nomor"/"no.", number, optional year
                regex = rf'{re.escape(pattern)}\s*(?:nomor|no\.?)?\s*(\d+)\s*(?:tahun|th\.?)?\s*(\d{{4}})?'
                if re.search(regex, query, re.IGNORECASE):
                    return True
        return False

    def _detect_legal_phrases(self, query: str) -> List[Dict]:
        """Detect specific multi-word legal phrases that should not be split.

        Returns one dict per phrase of ``self.legal_phrases`` contained in
        the query, with its 'phrase', 'priority', 'context' and
        'avoid_splitting' values. Each detection is also printed for debugging.
        """
        detected = []

        for phrase, info in self.legal_phrases.items():
            if phrase in query:
                detected.append({
                    'phrase': phrase,
                    'priority': info['priority'],
                    'context': info['context'],
                    'avoid_splitting': info['avoid_splitting']
                })

                print(f"DEBUG: Detected legal phrase: '{phrase}' - {info['context']}")

        return detected

    def _detect_common_law_name(self, query: str) -> Optional[Dict]:
        """Detect if query contains a common law name (e.g., 'kepabeanan').

        Names and aliases must appear as whole words or phrases. Plain
        substring matching would report 'perikanan' for "pendidikan" (alias
        'ikan'), 'agraria' for "landasan" (alias 'land') and 'merek' for
        "mereka". Affixed forms such as "pajaknya" therefore do not match.

        Returns:
            ``{'name', 'aliases', 'confidence'}`` for the first law, in
            table order, whose name (confidence 0.85) or alias (0.75) is
            found, or ``None`` when nothing matches.
        """
        for law_name, aliases in self.common_law_names.items():
            # Check main name
            if self._contains_term(query, law_name):
                return {
                    'name': law_name,
                    'aliases': aliases,
                    'confidence': 0.85
                }

            # Check aliases
            if any(self._contains_term(query, alias) for alias in aliases):
                return {
                    'name': law_name,
                    'aliases': aliases,
                    'confidence': 0.75
                }

        return None

    def _extract_key_terms_after_indicator(self, query: str) -> List[str]:
        """Extract key terms that appear after specific indicators.

        For every indicator of ``self.specific_indicators`` found in the
        query, up to four words following its first occurrence are taken
        and trailing stop words are removed. A term made up entirely of stop
        words is dropped rather than reported.
        """
        key_terms = []

        for indicator in self.specific_indicators:
            if indicator not in query:
                continue

            # Text after the first occurrence of the indicator
            after_text = query.split(indicator, 1)[1].strip()
            words = after_text.split()[:4]

            # Remove common stop words from end
            while words and words[-1] in INDONESIAN_STOPWORDS:
                words.pop()

            if words:
                key_terms.append(' '.join(words))

        return key_terms

# =============================================================================
# FIXED: IMPROVED RAG ACCURACY - SCORING AND FILTERING
# =============================================================================

class EnhancedKGSearchEngine:
    def __init__(self, records, embeddings, embedding_model, embedding_tokenizer, knowledge_graph, dataset_loader):
        """Store the retrieval resources and create the per-session helpers.

        Args:
            records: Indexable collection of regulation records (other methods
                call ``record.get(...)`` on its items).
            embeddings: Document embedding matrix compared against query
                embeddings -- presumably one row per record; confirm with caller.
            embedding_model: Model used to embed queries.
            embedding_tokenizer: Tokenizer paired with ``embedding_model``.
            knowledge_graph: Knowledge-graph object, also handed to the context
                manager, community detector and query analyzer.
            dataset_loader: Loader whose prebuilt indexes are attached by
                ``_build_indexes``.
        """
        # Retrieval resources
        self.records, self.embeddings = records, embeddings
        self.embedding_model, self.embedding_tokenizer = embedding_model, embedding_tokenizer
        self.kg = knowledge_graph
        self.dataset_loader = dataset_loader

        # Per-session bookkeeping
        self.all_phase_results = {}
        self.research_session_log = []
        self._build_indexes()

        # Learning state for every persona, updated by update_persona_performance()
        self.persona_performance = {}
        for researcher_id in RESEARCH_TEAM_PERSONAS.keys():
            self.persona_performance[researcher_id] = {
                'query_types': {},          # per-query-type success/total counters
                'success_count': 0,
                'total_queries': 0,
                'accuracy_adjustment': 0.0,  # learned offset applied to accuracy_bonus
                'recent_performance': [],    # sliding window of recent 0/1 outcomes
            }

        # Collaborators that share the knowledge graph
        self.context_manager = ConversationContextManager(knowledge_graph_instance=knowledge_graph)
        self.community_detector = DynamicCommunityDetector(knowledge_graph)
        self.query_analyzer = AdvancedQueryAnalyzer(knowledge_graph)

    def _get_indices_for_filter(self, regulation_filter):
        if not regulation_filter:
            return None # Search all indices
    
        filtered_indices = []
        # This loop is slow, you should pre-build an index for faster lookups,
        # but for a proof-of-concept this demonstrates the logic.
        for i, record in enumerate(self.records):
            rec_type = str(record.get('regulation_type', '')).lower()
            rec_number = str(record.get('regulation_number', ''))
            rec_year = str(record.get('year', ''))
    
            filter_type = str(regulation_filter.get('regulation_type', '')).lower()
            filter_number = str(regulation_filter.get('regulation_number', ''))
            filter_year = str(regulation_filter.get('year', ''))
    
            # Use your existing matching logic
            type_match = any(p in rec_type for p in REGULATION_TYPE_PATTERNS.get(filter_type, [filter_type]))
            number_match = (filter_number == rec_number)
            year_match = (not filter_year or filter_year == rec_year)
    
            if type_match and number_match and year_match:
                filtered_indices.append(i)
    
        return filtered_indices if filtered_indices else None
    
    def _apply_regulation_filter(self, candidates, regulation_filter):
        """Filter candidates to specific regulation"""
        if not regulation_filter:
            return candidates
        
        filtered = []
        for candidate in candidates:
            try:
                record = candidate['record']
                
                # Normalize for comparison
                rec_type = str(record.get('regulation_type', '')).lower()
                rec_number = str(record.get('regulation_number', ''))
                rec_year = str(record.get('year', ''))
                
                filter_type = str(regulation_filter.get('regulation_type', '')).lower()
                filter_number = str(regulation_filter.get('regulation_number', ''))
                filter_year = str(regulation_filter.get('year', ''))
                
                # Match regulation type (flexible)
                type_match = False
                for patterns in REGULATION_TYPE_PATTERNS.values():
                    if any(p in rec_type for p in patterns) and any(p in filter_type for p in patterns):
                        type_match = True
                        break
                
                # Match number (exact)
                number_match = (filter_number in rec_number or rec_number in filter_number)
                
                # Match year (if provided)
                year_match = (not filter_year or filter_year in rec_year)
                
                if type_match and number_match and year_match:
                    filtered.append(candidate)
            except Exception:
                continue
        
        return filtered
    
    def _boost_contextual_documents(self, candidates):
        """Raise the score of candidates whose regulation was mentioned recently.

        Regulations are taken from
        ``self.context_manager.get_relevant_regulations(max_age=2)``. The first
        context regulation that matches a candidate on type family, number and
        year adds a boost of ``0.15 * max(0, (3 - age) / 3)`` to the
        candidate's ``composite_score`` (capped at 1.0) and records it under
        ``contextual_boost``; ``age`` is the number of exchanges since the
        regulation was mentioned.

        Args:
            candidates: Iterable of candidate dicts with a ``'record'`` mapping;
                matching candidates are modified in place.

        Returns:
            The same ``candidates`` object. A candidate that raises while being
            processed is left as it was.
        """
        context_regs = self.context_manager.get_relevant_regulations(max_age=2)
        if not context_regs:
            return candidates

        for cand in candidates:
            try:
                rec = cand['record']
                kind = str(rec.get('regulation_type', '')).lower()
                number = str(rec.get('regulation_number', ''))
                year = str(rec.get('year', ''))

                for reg in context_regs:
                    age = self.context_manager.exchange_count - reg.get('mentioned_in_exchange', 0)

                    # Same type family: both strings hit a pattern of one group
                    same_family = any(
                        any(p in kind for p in group) and any(p in reg['type'] for p in group)
                        for group in REGULATION_TYPE_PATTERNS.values()
                    )
                    same_number = reg['number'] in number
                    same_year = (not reg['year']) or (reg['year'] in year)

                    if not (same_family and same_number and same_year):
                        continue

                    # Conversation boost decays linearly with age
                    boost = 0.15 * max(0, (3 - age) / 3)
                    cand['composite_score'] = min(1.0, cand.get('composite_score', 0) + boost)
                    cand['contextual_boost'] = boost
                    break  # only the first matching regulation counts
            except Exception:
                continue

        return candidates

    def update_persona_performance(self, query_type, successful_researchers, all_researchers):
        """
        Update persona performance metrics based on query results.
        Called after consensus building to learn from success/failure.
        
        Args:
            query_type: Type of query (e.g., 'specific_article', 'procedural')
            successful_researchers: List of researcher IDs who contributed to top results
            all_researchers: List of all researcher IDs on the team
        """
        try:
            for researcher_id in all_researchers:
                if researcher_id not in self.persona_performance:
                    continue
                
                perf = self.persona_performance[researcher_id]
                perf['total_queries'] += 1
                
                # Track query type specific performance
                if query_type not in perf['query_types']:
                    perf['query_types'][query_type] = {'success': 0, 'total': 0}
                
                perf['query_types'][query_type]['total'] += 1
                
                # Record success if this researcher contributed
                is_successful = researcher_id in successful_researchers
                perf['recent_performance'].append(1 if is_successful else 0)
                
                # Keep only last 10 queries
                if len(perf['recent_performance']) > 10:
                    perf['recent_performance'].pop(0)
                
                if is_successful:
                    perf['success_count'] += 1
                    perf['query_types'][query_type]['success'] += 1
                
                # Calculate success rates
                overall_success_rate = perf['success_count'] / perf['total_queries']
                query_type_stats = perf['query_types'][query_type]
                query_type_success_rate = query_type_stats['success'] / query_type_stats['total']
                
                # Recent trend
                recent_trend = sum(perf['recent_performance']) / len(perf['recent_performance']) if perf['recent_performance'] else 0.5
                
                # ADAPTIVE LEARNING: Gradual adjustment
                if overall_success_rate > 0.7:
                    # High performers get gradual boost (max +0.10)
                    adjustment_delta = 0.005 * (1 + recent_trend)
                    perf['accuracy_adjustment'] = min(0.10, perf['accuracy_adjustment'] + adjustment_delta)
                
                elif overall_success_rate < 0.4:
                    # Low performers get slight penalty (max -0.05)
                    adjustment_delta = 0.003 * (1 - recent_trend)
                    perf['accuracy_adjustment'] = max(-0.05, perf['accuracy_adjustment'] - adjustment_delta)
                
                else:
                    # Middle performers: slow decay toward baseline
                    perf['accuracy_adjustment'] *= 0.98
                
                # Query-type specialization bonus
                if query_type_stats['total'] >= 3 and query_type_success_rate > 0.75:
                    if 'specialization_bonus' not in perf:
                        perf['specialization_bonus'] = {}
                    perf['specialization_bonus'][query_type] = min(0.08, query_type_success_rate * 0.1)
            
            # Log performance update
            if hasattr(self, 'research_session_log'):
                self.research_session_log.append({
                    'query_type': query_type,
                    'successful_researchers': successful_researchers,
                    'performance_snapshot': {
                        rid: {
                            'success_rate': perf['success_count'] / perf['total_queries'],
                            'accuracy_adjustment': perf['accuracy_adjustment']
                        }
                        for rid, perf in self.persona_performance.items()
                        if perf['total_queries'] > 0
                    }
                })
        
        except Exception as e:
            print(f"Error updating persona performance: {e}")
    
    def get_adjusted_persona(self, researcher_id, query_type):
        """
        Return a copy of a base persona with the learned adjustments applied.

        The shared entry in ``RESEARCH_TEAM_PERSONAS`` is never modified and
        never handed out directly: every return path yields a (shallow) copy.
        Adjustments applied when performance data exists for the researcher:
          * ``accuracy_bonus`` gains the learned ``accuracy_adjustment`` and,
            if present, the specialization bonus for ``query_type``;
          * every ``search_style`` weight is scaled by 1.05 (success rate
            above 70%) or 0.95 (below 40%) once the query type has at least
            3 recorded attempts;
          * bookkeeping keys prefixed with ``_`` describe what was applied.

        Args:
            researcher_id: ID of the researcher (key of ``RESEARCH_TEAM_PERSONAS``)
            query_type: Current query type

        Returns:
            Adjusted persona dictionary, or an unadjusted copy of the base
            persona when the researcher has no performance entry or the
            adjustment fails.

        Raises:
            KeyError: if ``researcher_id`` is not a known persona.
        """
        try:
            # Work on a copy of the base persona (never modify the original!)
            persona = RESEARCH_TEAM_PERSONAS[researcher_id].copy()

            if researcher_id not in self.persona_performance:
                return persona

            perf = self.persona_performance[researcher_id]

            # Apply learned accuracy adjustment
            persona['accuracy_bonus'] += perf['accuracy_adjustment']

            # Apply query-type specialization bonus if one was earned
            if 'specialization_bonus' in perf and query_type in perf['specialization_bonus']:
                persona['accuracy_bonus'] += perf['specialization_bonus'][query_type]

            # Scale the search-style weights by the track record on this query type
            if query_type in perf['query_types']:
                query_stats = perf['query_types'][query_type]
                if query_stats['total'] >= 3:
                    success_rate = query_stats['success'] / query_stats['total']

                    if success_rate > 0.7:
                        scale = 1.05  # boost confidence
                    elif success_rate < 0.4:
                        scale = 0.95  # be more conservative
                    else:
                        scale = None

                    if scale is not None:
                        # Build a new dict: the nested search_style is still
                        # shared with the base persona after the shallow copy.
                        persona['search_style'] = {
                            key: weight * scale
                            for key, weight in persona['search_style'].items()
                        }

            # Add metadata
            persona['_adjusted'] = True
            persona['_original_accuracy_bonus'] = RESEARCH_TEAM_PERSONAS[researcher_id]['accuracy_bonus']
            persona['_learned_adjustment'] = perf['accuracy_adjustment']
            persona['_total_queries'] = perf['total_queries']
            persona['_success_rate'] = perf['success_count'] / perf['total_queries'] if perf['total_queries'] > 0 else 0

            return persona

        except Exception as e:
            print(f"Error getting adjusted persona: {e}")
            # Fall back to the unadjusted persona, but still as a copy so the
            # caller cannot mutate the shared module-level definition.
            return RESEARCH_TEAM_PERSONAS[researcher_id].copy()

    # --- Index wiring, query typing, query embedding and scoring-bonus helpers ---

    def _build_indexes(self):
        """Build search indexes - MISSING METHOD"""
        try:
            print("Building search indexes...")
            # Already built in dataset_loader, just reference them
            self.authority_index = self.dataset_loader.authority_index
            self.temporal_index = self.dataset_loader.temporal_index
            self.kg_connectivity_index = self.dataset_loader.kg_connectivity_index
            self.hierarchy_index = self.dataset_loader.hierarchy_index
            self.domain_index = self.dataset_loader.domain_index
            print(f"‚úÖ Indexes built: {len(self.authority_index)} authority tiers")
        except Exception as e:
            print(f"Error building indexes: {e}")
            self.authority_index = {}
            self.temporal_index = {}
            self.kg_connectivity_index = {}
            self.hierarchy_index = {}
            self.domain_index = {}
    
    def _detect_query_type(self, query):
        """Detect query type - MISSING METHOD"""
        try:
            query_lower = query.lower()
            
            # Check each pattern
            for query_type, pattern_info in QUERY_PATTERNS.items():
                if query_type == 'general':
                    continue
                indicators = pattern_info['indicators']
                if any(indicator in query_lower for indicator in indicators):
                    return query_type
            
            return 'general'
        except Exception as e:
            print(f"Error detecting query type: {e}")
            return 'general'
    
    def _embed_query_with_context_and_kg(self, query, query_type):
        """Embed query with context - MODIFIED FOR CPU"""
        try:
            # Add query type context
            context_map = {
                'specific_article': 'pasal dan ayat spesifik',
                'procedural': 'prosedur dan tata cara',
                'definitional': 'definisi dan pengertian',
                'sanctions': 'sanksi dan hukuman',
                'general': 'informasi hukum'
            }
            
            context = context_map.get(query_type, 'informasi hukum')
            enhanced_query = f"Mencari {context}: {query}"
            
            # Embed on CPU
            with torch.no_grad():
                inputs = self.embedding_tokenizer(
                    [enhanced_query], 
                    padding=True, 
                    truncation=True,
                    max_length=MAX_LENGTH, 
                    return_tensors="pt"
                )
                inputs = {k: v.to('cpu') for k, v in inputs.items()}  # Force CPU
                outputs = self.embedding_model(**inputs)
                
                attention_mask = inputs['attention_mask']
                last_hidden_states = outputs.last_hidden_state
                sequence_lengths = attention_mask.sum(dim=1) - 1
                batch_size = last_hidden_states.shape[0]
                
                embedding = last_hidden_states[
                    torch.arange(batch_size, device='cpu'), 
                    sequence_lengths
                ]
                
                return embedding[0]
        except Exception as e:
            print(f"Error embedding query: {e}")
            return torch.zeros(self.embeddings.shape[1])
    
    def _apply_enhanced_domain_bonus(self, record, query_type, kg_score):
        """Apply domain-specific bonuses - MISSING METHOD"""
        try:
            bonus = 0.0
            
            # Domain matching bonus
            domain = record.get('kg_primary_domain', '').lower()
            domain_confidence = record.get('kg_domain_confidence', 0)
            
            if domain_confidence > 0.7:
                bonus += 0.05
            
            # Query-type specific bonuses
            if query_type == 'sanctions':
                if 'criminal' in domain or 'administrative' in domain:
                    bonus += 0.08
                if record.get('kg_has_prohibitions', False):
                    bonus += 0.10
            elif query_type == 'procedural':
                if 'administrative' in domain or 'procedural' in domain:
                    bonus += 0.08
                if record.get('kg_has_obligations', False):
                    bonus += 0.10
            elif query_type == 'specific_article':
                if record.get('article', 'N/A') != 'N/A':
                    bonus += 0.12
            
            # KG enhancement bonus
            if kg_score > 0.5:
                bonus += 0.06
            
            return min(0.20, bonus)  # Cap at 0.20
        except Exception as e:
            return 0.0
    
    def _apply_researcher_expertise_bonus(self, record, persona, query_type):
        """Apply researcher expertise bonus - MISSING METHOD"""
        try:
            bonus = 0.0
            
            # Check if query type matches researcher specialty
            specialties = persona.get('specialties', [])
            
            if query_type == 'specific_article' and 'precedent_analysis' in specialties:
                bonus += 0.05
            elif query_type == 'procedural' and 'procedural_law' in specialties:
                bonus += 0.06
            elif query_type == 'definitional' and 'constitutional_law' in specialties:
                bonus += 0.04
            elif 'knowledge_graphs' in specialties:
                if record.get('kg_entity_count', 0) > 5:
                    bonus += 0.05
            
            # Authority preference bonus
            if persona.get('bias_towards') == 'established_precedents':
                if record.get('kg_authority_score', 0) > 0.8:
                    bonus += 0.04
            
            return min(0.10, bonus)  # Cap at 0.10
        except Exception as e:
            return 0.0
    
    # ========================
    # Individual research: strict metadata match with semantic-search fallback
    # ========================
    
    def _conduct_individual_research(
        self, researcher_id, persona, query, query_type,
        query_embedding, query_entities, config, regulation_filter=None,
        query_analysis=None
    ):
        """
        Run one researcher's retrieval pass for a query.

        Two paths:
          1. Strict metadata path -- when the knowledge graph extracts a
             complete regulation reference (type, number and year) with
             confidence >= 0.9 from the query, every record whose type family,
             number and year match it exactly becomes a candidate with a
             composite score of 1.0. If at least one record matches, the
             semantic path is skipped.
          2. Semantic path -- otherwise (no such reference, or no record
             matched it) the persona's preferred search phases are scored by
             ``_score_candidates_with_enhanced_kg``, with the persona's
             search-style weights adapted to ``query_analysis``.

        Args:
            researcher_id: Key of the researcher persona.
            persona: Persona supplied by the caller; it is replaced by the
                performance-adjusted persona from ``get_adjusted_persona``.
            query: Raw query text.
            query_type: Detected query type.
            query_embedding: Query embedding compared against ``self.embeddings``.
            query_entities: Entities extracted from the query, forwarded to
                the scorer.
            config: Search configuration; ``config['search_phases']`` overrides
                ``DEFAULT_SEARCH_PHASES``.
            regulation_filter: Optional regulation filter forwarded to the scorer.
            query_analysis: Optional analysis mapping with ``search_strategy``
                and optional ``keyword_boost`` / ``semantic_boost``.

        Returns:
            dict with ``researcher_id``, ``phase_results``, ``all_candidates``,
            ``exact_match_mode``, ``query_type`` and ``persona_metadata``.
            If anything raises, the traceback is printed and a dict with
            ``researcher_id``, empty ``phase_results`` / ``all_candidates``
            and an ``error`` message is returned instead.
        """
        try:
            # Swap in the performance-adjusted persona
            persona = self.get_adjusted_persona(researcher_id, query_type)

            # Bound before either path runs: the unified return at the bottom
            # reads all three, including when the semantic path is skipped
            # because exact matches were found.
            metadata_candidates = []
            all_candidates = []
            phase_results = {}

            # ================================================================
            # Strict regulation-reference extraction
            # ================================================================
            exact_regulation_ref = None

            if self.kg:
                regulation_refs = self.kg.extract_regulation_references_with_confidence(query)

                if regulation_refs:
                    best_ref = regulation_refs[0]

                    # Strict requirement: confidence >= 0.9 AND all three components present
                    if (best_ref['confidence'] >= 0.9 and
                            best_ref['specificity'] == 'complete' and
                            best_ref['regulation'].get('type') and
                            best_ref['regulation'].get('number') and
                            best_ref['regulation'].get('year')):

                        exact_regulation_ref = best_ref['regulation']

                        print(f"üéØ STRICT METADATA MODE ACTIVATED: {exact_regulation_ref['type']} "
                              f"{exact_regulation_ref['number']}/{exact_regulation_ref['year']} "
                              f"(confidence: {best_ref['confidence']:.2%})")

            # ================================================================
            # PATH 1: strict metadata-only search (perfect score override)
            # ================================================================
            if exact_regulation_ref:
                print(f"   üîí Executing STRICT metadata filter with PERFECT SCORE override...")

                # Record fields are compared as strings, so normalise the
                # reference the same way (a numeric number/year would
                # otherwise never compare equal).
                ref_type = str(exact_regulation_ref['type']).lower()
                ref_number = str(exact_regulation_ref['number'])
                ref_year = str(exact_regulation_ref['year'])

                # Pattern groups the referenced type falls into (loop-invariant)
                ref_type_groups = [
                    patterns for patterns in REGULATION_TYPE_PATTERNS.values()
                    if any(p in ref_type for p in patterns)
                ]

                for i, record in enumerate(self.records):
                    try:
                        rec_type = str(record.get('regulation_type', '')).lower()
                        rec_number = str(record.get('regulation_number', ''))
                        rec_year = str(record.get('year', ''))

                        # Number and year must match exactly (cheap checks first)
                        if ref_number != rec_number or ref_year != rec_year:
                            continue

                        # Type matches when the record falls into one of the
                        # reference's pattern groups
                        if not any(
                            any(p in rec_type for p in patterns)
                            for patterns in ref_type_groups
                        ):
                            continue

                        print(f"   ‚úÖ EXACT MATCH FOUND: {rec_type} {rec_number}/{rec_year}")

                        # Score 1.0 guarantees top ranking for exact matches
                        metadata_candidates.append({
                            'record': record,
                            'composite_score': 1.0,
                            'semantic_score': 0.0,
                            'keyword_score': 0.0,
                            'kg_score': float(record.get('kg_connectivity_score', 0.5)),
                            'metadata_match': True,
                            'exact_metadata_mode': True,
                            'match_type': 'STRICT_TRIPLE_MATCH',
                            'researcher_bias_applied': persona['name'],
                            'perfect_score_override': True  # Flag for transparency
                        })

                    except Exception as e:
                        print(f"   Error processing record {i}: {e}")
                        continue

                print(f"   üìä Strict filter result: {len(metadata_candidates)} exact matches")

                if not metadata_candidates:
                    # No early return: fall through to the semantic path
                    print(f"   ‚ö†Ô∏è NO EXACT MATCH FOUND for {ref_type} {ref_number}/{ref_year}")
                    print(f"   üîÑ FALLING BACK to semantic search for this researcher...")

            # ================================================================
            # PATH 2: semantic search (no exact reference, or it matched nothing)
            # ================================================================
            if not metadata_candidates:
                print(f"   üîç Executing normal semantic search...")

                # Adjust search weights based on query analysis
                if query_analysis:
                    search_style = persona['search_style'].copy()
                    strategy = query_analysis['search_strategy']

                    if strategy == 'keyword_first':
                        keyword_boost = query_analysis.get('keyword_boost', 0.30)
                        semantic_boost = query_analysis.get('semantic_boost', 0.15)

                        search_style['semantic_weight'] = semantic_boost
                        search_style['kg_weight'] = search_style.get('kg_weight', 0.25) + 0.10
                        # Authority takes whatever weight remains
                        search_style['authority_weight'] = 1.0 - (semantic_boost + search_style['kg_weight'] + keyword_boost + search_style.get('temporal_weight', 0.15))

                    elif strategy == 'semantic_first':
                        search_style['semantic_weight'] = query_analysis.get('semantic_boost', 0.35)
                        search_style['authority_weight'] = max(0.10, search_style.get('authority_weight', 0.25) - 0.05)

                    else:  # hybrid_balanced
                        search_style['semantic_weight'] = query_analysis.get('semantic_boost', 0.25)
                else:
                    search_style = persona['search_style']

                # Semantic similarity of the query against every record embedding
                semantic_sims = F.cosine_similarity(
                    query_embedding.unsqueeze(0),
                    self.embeddings,
                    dim=1
                )
                semantic_sims = semantic_sims.cpu().numpy()

                phase_preference = persona['phases_preference']
                speed_mult = persona['speed_multiplier']

                search_phases = config.get('search_phases', DEFAULT_SEARCH_PHASES)

                # Execute the persona's preferred phases, in order
                for phase_name in phase_preference:
                    if phase_name not in search_phases:
                        continue

                    phase_config = search_phases[phase_name]
                    if not phase_config.get('enabled', True):
                        continue

                    # Candidate budget scaled by the persona's speed
                    adjusted_config = phase_config.copy()
                    adjusted_config['candidates'] = int(phase_config['candidates'] * speed_mult)

                    # TF-IDF keyword similarities.
                    # NOTE(review): this is phase-independent and could be
                    # computed once before the loop, provided the scorer does
                    # not modify the array in place -- confirm before hoisting.
                    keyword_sims = None
                    if self.dataset_loader.tfidf_matrix is not None:
                        try:
                            query_tfidf = self.dataset_loader.tfidf_vectorizer.transform([query])
                            keyword_sims = cosine_similarity(
                                query_tfidf,
                                self.dataset_loader.tfidf_matrix
                            )[0]
                        except Exception:
                            keyword_sims = None

                    # Score candidates (normal semantic scoring)
                    phase_candidates = self._score_candidates_with_enhanced_kg(
                        semantic_sims, keyword_sims, query_entities, query_type,
                        search_style, adjusted_config, persona,
                        regulation_filter=regulation_filter,
                        query_analysis=query_analysis
                    )

                    phase_key = f"{researcher_id}_{phase_name}"
                    phase_results[phase_key] = {
                        'phase': phase_name,
                        'researcher': researcher_id,
                        'researcher_name': persona['name'],
                        'candidates': phase_candidates,
                        'confidence': self._calculate_phase_confidence(phase_candidates, phase_config),
                        'persona_adjusted': persona.get('_adjusted', False),
                        'learned_bonus': persona.get('_learned_adjustment', 0.0),
                        'query_strategy': query_analysis.get('search_strategy') if query_analysis else 'default',
                        'exact_match_mode': False
                    }

                    all_candidates.extend(phase_candidates)

            # ================================================================
            # Unified return: metadata candidates (if any) + semantic candidates
            # ================================================================
            combined_candidates = metadata_candidates + all_candidates

            # Exact matches are reported as their own phase
            if metadata_candidates:
                phase_results[f"{researcher_id}_metadata_strict"] = {
                    'phase': 'metadata_strict',
                    'researcher': researcher_id,
                    'researcher_name': persona['name'],
                    'candidates': metadata_candidates,
                    'confidence': 1.0,  # Perfect confidence
                    'exact_metadata_mode': True,
                    'query_strategy': 'strict_metadata_filter_perfect_score'
                }

            return {
                'researcher_id': researcher_id,
                'phase_results': phase_results,
                'all_candidates': combined_candidates,
                'exact_match_mode': bool(metadata_candidates),
                'query_type': query_type,
                'persona_metadata': {
                    'adjusted': persona.get('_adjusted', False),
                    'success_rate': persona.get('_success_rate', 0),
                    'learned_adjustment': persona.get('_learned_adjustment', 0)
                }
            }

        except Exception as e:
            print(f"Error in individual research for {researcher_id}: {e}")
            import traceback
            traceback.print_exc()
            return {
                'researcher_id': researcher_id,
                'phase_results': {},
                'all_candidates': [],
                'error': str(e)
            }
            
    def _calculate_phase_confidence(self, candidates, phase_config):
        """Calculate confidence for phase results"""
        try:
            if not candidates:
                return 0.0
            
            # Average score above threshold
            avg_score = np.mean([c.get('composite_score', 0) for c in candidates])
            
            # Normalize by thresholds
            sem_threshold = phase_config.get('semantic_threshold', 0.3)
            confidence = min(1.0, avg_score / (sem_threshold + 0.1))
            
            return confidence
        except Exception:
            return 0.5
    
    # ========================
    # Cross-validation and devil's advocate review
    # ========================
    
    def _conduct_cross_validation(self, individual_results, team_members, query_entities):
        """Cross-validate findings between researchers"""
        try:
            consensus_docs = {}
            
            # Collect all document IDs and their appearances
            doc_appearances = defaultdict(list)
            
            for researcher_id, results in individual_results.items():
                for phase_key, phase_data in results.get('phase_results', {}).items():
                    for candidate in phase_data.get('candidates', []):
                        doc_id = candidate['record']['global_id']
                        doc_appearances[doc_id].append({
                            'researcher': researcher_id,
                            'score': candidate.get('composite_score', 0),
                            'candidate': candidate
                        })
            
            # Find documents with agreement from multiple researchers
            for doc_id, appearances in doc_appearances.items():
                if len(appearances) >= 2:  # At least 2 researchers found it
                    avg_score = np.mean([a['score'] for a in appearances])
                    researchers = [a['researcher'] for a in appearances]
                    
                    consensus_docs[doc_id] = {
                        'agreement_count': len(appearances),
                        'researchers': researchers,
                        'avg_score': avg_score,
                        'candidate': appearances[0]['candidate']  # Use first instance
                    }
            
            return consensus_docs
            
        except Exception as e:
            print(f"Error in cross-validation: {e}")
            return {}
    
    def _conduct_devils_advocate_review(self, individual_results, query, query_type):
        """Devil's advocate critical review"""
        try:
            challenges = {}
            
            # Collect all high-scoring candidates
            all_candidates = []
            for researcher_id, results in individual_results.items():
                for phase_key, phase_data in results.get('phase_results', {}).items():
                    all_candidates.extend(phase_data.get('candidates', []))
            
            # Sort by score
            all_candidates.sort(key=lambda x: x.get('composite_score', 0), reverse=True)
            
            # Challenge top candidates
            for candidate in all_candidates[:10]:  # Review top 10
                doc_id = candidate['record']['global_id']
                record = candidate['record']
                
                challenge_points = []
                
                # Check for potential issues
                if candidate.get('kg_score', 0) < 0.3:
                    challenge_points.append("Low KG relevance")
                
                if candidate.get('semantic_score', 0) < 0.4:
                    challenge_points.append("Low semantic match")
                
                if record.get('kg_authority_score', 0) < 0.5:
                    challenge_points.append("Lower authority source")
                
                if record.get('kg_temporal_score', 0) < 0.4:
                    challenge_points.append("Potentially outdated")
                
                if query_type == 'specific_article' and record.get('article', 'N/A') == 'N/A':
                    challenge_points.append("No specific article reference")
                
                if challenge_points:
                    challenges[doc_id] = {
                        'candidate': candidate,
                        'challenge_points': challenge_points,
                        'severity': len(challenge_points) / 5.0  # Normalize to 0-1
                    }
            
            return challenges
            
        except Exception as e:
            print(f"Error in devil's advocate review: {e}")
            return {}
    
    def _apply_devils_advocate_filters(self, individual_results, challenges):
        """Apply devil's advocate filters to results"""
        try:
            if not challenges:
                return individual_results
            
            # Apply penalties to challenged documents
            for researcher_id, results in individual_results.items():
                for phase_key, phase_data in results.get('phase_results', {}).items():
                    filtered_candidates = []
                    
                    for candidate in phase_data.get('candidates', []):
                        doc_id = candidate['record']['global_id']
                        
                        if doc_id in challenges:
                            challenge_info = challenges[doc_id]
                            severity = challenge_info['severity']
                            
                            # Apply penalty
                            penalty = severity * 0.15  # Up to 15% penalty
                            candidate['composite_score'] *= (1 - penalty)
                            candidate['devils_advocate_challenged'] = True
                            candidate['challenge_points'] = challenge_info['challenge_points']
                            
                            # Only keep if still above threshold
                            if candidate['composite_score'] >= 0.4:
                                filtered_candidates.append(candidate)
                        else:
                            filtered_candidates.append(candidate)
                    
                    phase_data['candidates'] = filtered_candidates
            
            return individual_results
            
        except Exception as e:
            print(f"Error applying devil's advocate filters: {e}")
            return individual_results
    
    def _score_candidates_with_enhanced_kg(self, semantic_sims, keyword_sims,
                                 query_entities, query_type, search_style,
                                 phase_config, persona, regulation_filter=None,
                                 query_analysis=None):  # NEW PARAMETER
        """
        MODIFIED: Scoring with query-analysis-driven adjustments
        """
        try:
            candidates = []
            
            if keyword_sims is None:
                keyword_sims = np.zeros_like(semantic_sims)
            
            query_entity_set = set(str(e).lower() for e in query_entities) if query_entities else set()
            
            # *** NEW: Extract key phrases for exact matching ***
            key_phrases_set = set()
            if query_analysis and query_analysis.get('key_phrases'):
                key_phrases_set = {p['phrase'].lower() for p in query_analysis['key_phrases']}
            
            for i, (sem_score, key_score) in enumerate(zip(
                semantic_sims.tolist() if torch.is_tensor(semantic_sims) else semantic_sims, 
                keyword_sims
            )):
                try:
                    if sem_score < phase_config['semantic_threshold'] * 0.8 and key_score < phase_config['keyword_threshold'] * 0.8:
                        continue
                    
                    record = self.records[i]
                    
                    # *** NEW: Key phrase exact matching bonus ***
                    key_phrase_bonus = 0.0
                    if key_phrases_set:
                        content_lower = record.get('content', '').lower()
                        about_lower = record.get('about', '').lower()
                        
                        for phrase in key_phrases_set:
                            if phrase in content_lower:
                                key_phrase_bonus += 0.25  # Strong bonus for content match
                            elif phrase in about_lower:
                                key_phrase_bonus += 0.20  # Good bonus for about match
                    
                    # Calculate comprehensive KG score
                    kg_score = self.kg.calculate_enhanced_kg_score(query_entities, record, query_type)
                    
                    # Normalize scores
                    norm_sem_score = max(0, min(1, sem_score))
                    norm_key_score = max(0, min(1, (key_score + 1) / 2))
                    
                    # *** MODIFIED: Weighted combination respects query analysis ***
                    keyword_weight = search_style.get('semantic_weight', 0.25)  # Will be adjusted if keyword_first
                    if query_analysis and query_analysis['search_strategy'] == 'keyword_first':
                        # For keyword-first queries, increase keyword contribution
                        keyword_weight = 0.35
                    
                    composite_score = (
                        norm_sem_score * search_style['semantic_weight'] +
                        norm_key_score * keyword_weight +  # Dynamic keyword weight
                        record['kg_authority_score'] * search_style['authority_weight'] +
                        record['kg_temporal_score'] * search_style['temporal_weight'] +
                        kg_score * search_style['kg_weight'] +
                        record['kg_legal_richness'] * 0.08 +
                        record.get('kg_pagerank', 0.0) * 0.05 +
                        record['kg_completeness_score'] * 0.07 +
                        key_phrase_bonus  # *** NEW: Add key phrase bonus ***
                    )
                    
                    # Apply other bonuses (domain, expertise, entity overlap)
                    domain_bonus = self._apply_enhanced_domain_bonus(record, query_type, kg_score)
                    expertise_bonus = self._apply_researcher_expertise_bonus(record, persona, query_type)
                    
                    entity_overlap_bonus = 0.0
                    if query_entity_set and record.get('kg_entity_count', 0) > 0:
                        doc_entities_json = record.get('kg_entities_json', '[]')
                        if doc_entities_json != '[]':
                            try:
                                doc_entities = json.loads(doc_entities_json)
                                doc_entity_set = set()
                                for entity in doc_entities[:10]:
                                    if isinstance(entity, dict):
                                        doc_entity_set.add(str(entity.get('text', '')).lower())
                                    else:
                                        doc_entity_set.add(str(entity).lower())
                                
                                overlap = query_entity_set & doc_entity_set
                                if overlap:
                                    entity_overlap_bonus = min(0.15, len(overlap) * 0.05)
                            except:
                                pass
                    
                    composite_score += domain_bonus + expertise_bonus + entity_overlap_bonus
                    
                    # Contextual boost (regulation filter)
                    contextual_boost = 0.0
                    if regulation_filter:
                        try:
                            rec_type = str(record.get('regulation_type', '')).lower()
                            rec_number = str(record.get('regulation_number', ''))
                            rec_year = str(record.get('year', ''))
    
                            filter_type = str(regulation_filter.get('type') or regulation_filter.get('regulation_type', '')).lower()
                            filter_number = str(regulation_filter.get('number') or regulation_filter.get('regulation_number', ''))
                            filter_year = str(regulation_filter.get('year', ''))
    
                            type_match = any(p in rec_type for p in REGULATION_TYPE_PATTERNS.get(filter_type, [filter_type]))
                            number_match = (filter_number == rec_number)
                            year_match = (not filter_year or filter_year == rec_year)
    
                            if type_match and number_match and year_match:
                                contextual_boost = 0.25
                            elif type_match and number_match:
                                contextual_boost = 0.15
                        except Exception as e:
                            print(f"Error applying contextual boost: {e}")
                            contextual_boost = 0.0
                    
                    composite_score += contextual_boost
                    
                    # *** NEW: Law name metadata matching bonus ***
                    law_name_bonus = 0.0
                    if query_analysis and query_analysis.get('law_name_detected'):
                        law_name = query_analysis['specific_entities'][0]['name']
                        about_lower = record.get('about', '').lower()
                        reg_type_lower = record.get('regulation_type', '').lower()
                        
                        if law_name in about_lower or law_name in reg_type_lower:
                            law_name_bonus = 0.20  # Strong bonus for law name match
                    
                    composite_score += law_name_bonus
                    composite_score = min(1.0, composite_score)
    
                    candidate_data = {
                        'record': record,
                        'composite_score': composite_score,
                        'semantic_score': norm_sem_score,
                        'keyword_score': norm_key_score,
                        'kg_score': kg_score,
                        'researcher_bias_applied': persona['name'],
                        'entity_overlap_bonus': entity_overlap_bonus,
                        'contextual_boost': contextual_boost,
                        'key_phrase_bonus': key_phrase_bonus,  # NEW
                        'law_name_bonus': law_name_bonus,  # NEW
                        'query_strategy': query_analysis.get('search_strategy') if query_analysis else 'default'  # NEW
                    }
    
                    candidates.append(candidate_data)
                    
                except Exception as e:
                    print(f"Error processing candidate {i}: {e}")
                    continue
            
            # Sort and apply diversity
            candidates.sort(key=lambda x: x['composite_score'], reverse=True)
            diverse_candidates = self._apply_diversity_filter(
                candidates[:phase_config['candidates'] * 2],
                phase_config['candidates']
            )
            
            return diverse_candidates
            
        except Exception as e:
            print(f"Error in candidate scoring: {e}")
            import traceback
            traceback.print_exc()
            return []
    
    def _apply_diversity_filter(self, candidates, target_count):
        """FIXED: Ensure diversity in results"""
        if len(candidates) <= target_count:
            return candidates
        
        diverse_results = []
        seen_reg_types = set()
        seen_domains = set()
        seen_hierarchy = set()
        
        # First pass: high-scoring diverse candidates
        for candidate in candidates:
            if len(diverse_results) >= target_count:
                break
            
            record = candidate['record']
            reg_type = record['regulation_type']
            domain = record.get('kg_primary_domain', 'Unknown')
            hierarchy = record.get('kg_hierarchy_level', 5)
            
            # Add if brings diversity
            if (reg_type not in seen_reg_types or 
                domain not in seen_domains or 
                hierarchy not in seen_hierarchy or
                len(diverse_results) < target_count // 2):
                
                diverse_results.append(candidate)
                seen_reg_types.add(reg_type)
                seen_domains.add(domain)
                seen_hierarchy.add(hierarchy)
        
        # Second pass: fill remaining slots with highest scores
        remaining = target_count - len(diverse_results)
        for candidate in candidates:
            if remaining <= 0:
                break
            if candidate not in diverse_results:
                diverse_results.append(candidate)
                remaining -= 1
        
        return diverse_results[:target_count]
    
    def parallel_legal_research(self, query, query_type, config, progress_callback=None):
        """
        Run the multi-researcher retrieval pipeline for one query.

        Steps, in order:
          1. Analyse the query with ``self.query_analyzer``.
          2. Detect the conversational intent and expand the query through
             ``self.context_manager`` (each step falls back to a default on error).
          3. Extract entities from the expanded query via ``self.kg`` and push
             them into the conversation context.
          4. For follow-up style intents, fetch a regulation filter from the
             context manager.
          5. Select the research team from ``QUERY_TEAM_COMPOSITIONS`` and,
             if enabled, make sure ``devils_advocate`` is part of it.
          6. Embed the expanded query once and let every researcher search.
          7. Optionally cross-validate, optionally apply the devil's advocate
             review, then build the team consensus.
          8. Feed the top five consensus results back into the context manager.

        Args:
            query: the user's query text.
            query_type: key into ``QUERY_TEAM_COMPOSITIONS``; unknown values
                fall back to the ``'general'`` team.
            config: settings dict. Reads ``research_team_size`` (required) and
                the optional flags ``enable_devil_advocate`` and
                ``enable_cross_validation`` (both treated as True when absent);
                it is also passed on to the research and consensus helpers.
            progress_callback: optional callable that receives status strings.

        Returns:
            The value returned by ``_build_team_consensus``, or ``[]`` when an
            unhandled exception occurs anywhere in the pipeline.

        Side effects:
            Resets ``self.all_phase_results`` and fills it with every phase
            result of every researcher; updates the conversation context.
        """
        try:
            if progress_callback:
                progress_callback(f"üîç Assembling legal research team for {query_type} query...")
            
            # Analyse the original (unexpanded) query; the result is handed to every researcher
            query_analysis = self.query_analyzer.analyze_query(query)
            
            if progress_callback:
                progress_callback(
                    f"   üß† Query Strategy: {query_analysis['search_strategy']} "
                    f"(confidence: {query_analysis['confidence']:.0%})"
                )
                if query_analysis['reasoning']:
                    progress_callback(f"   üí° Reasoning: {query_analysis['reasoning']}")
                if query_analysis.get('key_phrases'):
                    phrases = [p['phrase'] for p in query_analysis['key_phrases']]
                    progress_callback(f"   üéØ Key Phrases: {', '.join(phrases)}")
            
            # Detect query intent; a failure is treated as a fresh query
            try:
                query_intent = self.context_manager.detect_query_type(query)
            except Exception as e:
                print(f"Error detecting query intent: {e}")
                query_intent = 'new_query'
            
            if progress_callback and query_intent != 'new_query':
                progress_callback(f"   üîÑ Detected {query_intent} - using conversation context...")
            
            # Expand query with conversation context; fall back to the raw query on error
            try:
                expanded_query = self.context_manager.expand_query_with_context(query)
            except Exception as e:
                print(f"Error expanding query: {e}")
                expanded_query = query
            
            if expanded_query != query and progress_callback:
                progress_callback(f"   ‚ûï Query expanded with context")
            
            # Extract entities from the expanded query (only the entity part of each pair is kept)
            query_entities = [entity for entity, _ in self.kg.extract_entities_from_text(expanded_query)]
            
            # Record the query in the conversation context; every entity is
            # labelled 'regulation_reference'. Best effort: errors are only logged.
            try:
                self.context_manager.update_from_query(
                    expanded_query, 
                    [(e, 'regulation_reference') for e in query_entities]
                )
            except Exception as e:
                print(f"Error updating context: {e}")
            
            # A regulation filter is only requested for follow-up style intents
            regulation_filter = None
            try:
                if query_intent in ['followup', 'pronoun_reference', 'continuing_discussion']:
                    regulation_filter = self.context_manager.get_regulation_filter()
                    if regulation_filter:
                        print(f"DEBUG: Got regulation filter: {regulation_filter}")
                        
                        if progress_callback:
                            # The filter may use either the short or the 'regulation_' prefixed keys
                            reg_type = regulation_filter.get('type') or regulation_filter.get('regulation_type', 'N/A')
                            reg_number = regulation_filter.get('number') or regulation_filter.get('regulation_number', 'N/A')
                            progress_callback(f"   üéØ Context filter: {reg_type} {reg_number}")
            except Exception as e:
                print(f"Error getting regulation filter: {e}")
                import traceback
                traceback.print_exc()
                regulation_filter = None
            
            # Select the team for this query type, truncated to the configured size
            team_composition = QUERY_TEAM_COMPOSITIONS.get(query_type, QUERY_TEAM_COMPOSITIONS['general'])
            selected_researchers = team_composition[:config['research_team_size']]
            
            # Ensure the devil's advocate is on the team: appended while the team
            # has fewer than 5 members, otherwise it replaces the last member
            if config.get('enable_devil_advocate', True) and 'devils_advocate' not in selected_researchers:
                if len(selected_researchers) < 5:
                    selected_researchers.append('devils_advocate')
                else:
                    selected_researchers[-1] = 'devils_advocate'
            
            if progress_callback:
                team_names = [RESEARCH_TEAM_PERSONAS[r]['name'] for r in selected_researchers]
                progress_callback(f"   üë• Team: {', '.join(team_names)}")
            
            # Start from a clean slate; filled below with every researcher's phase results
            self.all_phase_results = {}
            
            # One embedding of the expanded query is shared by all researchers
            query_embedding = self._embed_query_with_context_and_kg(expanded_query, query_type)
            
            # Phase 1: individual research, one researcher after another
            individual_research_results = {}
            for researcher_id in selected_researchers:
                researcher_persona = RESEARCH_TEAM_PERSONAS[researcher_id]
                
                if progress_callback:
                    progress_callback(f"     {researcher_persona['name']} conducting research...")
                
                individual_results = self._conduct_individual_research(
                    researcher_id, researcher_persona, expanded_query, query_type, 
                    query_embedding, query_entities, config,
                    regulation_filter=regulation_filter,
                    query_analysis=query_analysis  # analysis of the original query
                )
                individual_research_results[researcher_id] = individual_results
                
                # Collect the phase results under their phase keys
                for phase_key, phase_data in individual_results.get('phase_results', {}).items():
                    self.all_phase_results[phase_key] = phase_data
            
            # Phase 2: cross-validation (needs more than one researcher)
            consensus_documents = {}
            if config.get('enable_cross_validation', True) and len(selected_researchers) > 1:
                if progress_callback:
                    progress_callback("   üîÑ Cross-validation between researchers...")
                
                consensus_documents = self._conduct_cross_validation(
                    individual_research_results, selected_researchers, query_entities
                )
            
            # Phase 3: devil's advocate review, then penalise/drop challenged documents
            if config.get('enable_devil_advocate', True) and 'devils_advocate' in selected_researchers:
                if progress_callback:
                    progress_callback("   üëø Devil's advocate review...")
                
                challenge_results = self._conduct_devils_advocate_review(
                    individual_research_results, expanded_query, query_type
                )
                individual_research_results = self._apply_devils_advocate_filters(
                    individual_research_results, challenge_results
                )
            
            # Phase 4: build the team consensus
            if progress_callback:
                progress_callback("   ü§ù Building team consensus...")
            
            final_consensus = self._build_team_consensus(
                individual_research_results, selected_researchers, config, 
                consensus_documents, query_type
            )
            
            # Feed the top five results back into the conversation context (best effort)
            try:
                if final_consensus and len(final_consensus) > 0:
                    self.context_manager.update_from_results(final_consensus[:5])
            except Exception as e:
                print(f"Warning: Could not update context from results: {e}")
            
            return final_consensus
            
        except Exception as e:
            # Any unhandled failure is reported and turned into an empty result
            if progress_callback:
                progress_callback(f"Error in research: {e}")
            import traceback
            traceback.print_exc()
            return []
    
    def _build_team_consensus(
        self, individual_results, team_members, config, consensus_docs=None, query_type='general'
    ):
        """
        Merge all researchers' findings into one ranked consensus list and
        report which researchers contributed to the top results.

        Each document's score is the weighted mean of the per-researcher
        scores, with weight ``experience_years / 15 + accuracy_bonus`` taken
        from ``RESEARCH_TEAM_PERSONAS``. Bonuses are added for support by
        several researchers (0.03 per extra researcher, at most 0.10), for
        support by more than one research approach (0.05) and for
        cross-validated documents (0.08). A document is kept when its score
        reaches ``config['consensus_threshold']`` (default 0.6, relaxed by 10%
        when three or more researchers support it).

        A researcher is counted once per document even if they found it in
        several phases; their best score for the document is used.

        Args:
            individual_results: mapping ``researcher_id -> result dict`` with a
                ``phase_results`` mapping of phase dicts holding ``candidates``.
            team_members: researcher ids on the team; passed to the review,
                conflict-resolution and learning steps.
            config: reads ``consensus_threshold`` and ``final_top_k``.
            consensus_docs: optional mapping of cross-validated document ids.
            query_type: passed to ``update_persona_performance``.

        Returns:
            Consensus candidates after hierarchical review and conflict
            resolution, or ``[]`` on error. Each kept candidate is a shallow
            copy extended with ``final_consensus_score`` (capped at 1.0),
            ``team_consensus``, ``supporting_researchers``,
            ``researcher_agreement`` and ``cross_validated``.
        """
        try:
            all_candidates_with_attribution = {}

            for researcher_id, results in individual_results.items():
                researcher_persona = RESEARCH_TEAM_PERSONAS[researcher_id]

                for phase_data in results.get('phase_results', {}).values():
                    for candidate in phase_data.get('candidates', []):
                        doc_id = candidate['record']['global_id']

                        if doc_id not in all_candidates_with_attribution:
                            all_candidates_with_attribution[doc_id] = {
                                'candidate': candidate,
                                'researcher_scores': {},
                                'supporting_researchers': [],
                                'researcher_types': set(),
                                'cross_validated': doc_id in (consensus_docs or {})
                            }

                        attribution = all_candidates_with_attribution[doc_id]
                        score = candidate['composite_score']
                        researcher_scores = attribution['researcher_scores']

                        # List each researcher once per document; repeated hits
                        # in other phases must not look like extra agreement.
                        if researcher_id not in researcher_scores:
                            attribution['supporting_researchers'].append(researcher_id)
                            researcher_scores[researcher_id] = score
                        elif score > researcher_scores[researcher_id]:
                            researcher_scores[researcher_id] = score

                        attribution['researcher_types'].add(researcher_persona['approach'])

            final_candidates = []
            consensus_threshold = config.get('consensus_threshold', 0.6)

            for doc_id, attribution in all_candidates_with_attribution.items():
                try:
                    total_weight = 0
                    weighted_score = 0

                    for researcher_id, score in attribution['researcher_scores'].items():
                        researcher_persona = RESEARCH_TEAM_PERSONAS[researcher_id]
                        weight = (researcher_persona['experience_years'] / 15.0) + researcher_persona.get('accuracy_bonus', 0)
                        weighted_score += score * weight
                        total_weight += weight

                    # Fall back to a plain mean if the weights sum to zero
                    if total_weight > 0:
                        final_weighted_score = weighted_score / total_weight
                    else:
                        final_weighted_score = np.mean(list(attribution['researcher_scores'].values()))

                    supporter_count = len(attribution['supporting_researchers'])

                    # Consensus bonuses
                    if supporter_count > 1:
                        final_weighted_score += min(0.10, 0.03 * (supporter_count - 1))

                    if len(attribution['researcher_types']) > 1:
                        final_weighted_score += 0.05

                    if attribution.get('cross_validated', False):
                        final_weighted_score += 0.08

                    # Broad support lowers the bar slightly
                    adjusted_threshold = consensus_threshold
                    if supporter_count >= 3:
                        adjusted_threshold *= 0.9

                    if final_weighted_score >= adjusted_threshold:
                        # NOTE(review): _resolve_team_conflicts reads
                        # candidate['researcher_scores'], which is not copied onto
                        # the consensus candidate here - confirm whether intended.
                        candidate = attribution['candidate'].copy()
                        candidate['final_consensus_score'] = min(1.0, final_weighted_score)
                        candidate['team_consensus'] = True
                        candidate['supporting_researchers'] = attribution['supporting_researchers']
                        candidate['researcher_agreement'] = supporter_count
                        candidate['cross_validated'] = attribution.get('cross_validated', False)

                        final_candidates.append(candidate)

                except Exception as e:
                    print(f"Error processing candidate consensus: {e}")
                    continue

            final_candidates.sort(key=lambda x: x.get('final_consensus_score', 0), reverse=True)

            final_candidates = self._apply_hierarchical_review(final_candidates, team_members)
            final_candidates = self._resolve_team_conflicts(final_candidates, team_members, config)

            # Adaptive learning: researchers behind the top-k results count as successful
            successful_researchers = []
            for candidate in final_candidates[:config.get('final_top_k', 3)]:
                successful_researchers.extend(candidate.get('supporting_researchers', []))

            successful_researchers = list(set(successful_researchers))

            # The learning hook is optional on this class
            if hasattr(self, 'update_persona_performance'):
                self.update_persona_performance(query_type, successful_researchers, team_members)

            return final_candidates

        except Exception as e:
            print(f"Error building consensus: {e}")
            import traceback
            traceback.print_exc()
            return []


    def _apply_hierarchical_review(self, consensus_candidates, team_members):
        """Apply hierarchical review: junior findings reviewed by seniors"""
        try:
            if not consensus_candidates:
                return consensus_candidates
            
            # Classify researchers by experience
            junior_researchers = []
            senior_researchers = []
            
            for researcher_id in team_members:
                persona = RESEARCH_TEAM_PERSONAS[researcher_id]
                exp_years = persona['experience_years']
                if exp_years <= 5:
                    junior_researchers.append(researcher_id)
                elif exp_years >= 10:
                    senior_researchers.append(researcher_id)
            
            # If no hierarchy, skip
            if not junior_researchers or not senior_researchers:
                return consensus_candidates
            
            # Senior validation: flag documents only found by juniors
            reviewed_candidates = []
            
            for candidate in consensus_candidates:
                supporting = candidate.get('supporting_researchers', [])
                
                # Check if any senior found this
                has_senior_support = any(r in senior_researchers for r in supporting)
                only_junior_support = all(r in junior_researchers for r in supporting)
                
                if only_junior_support and not has_senior_support:
                    # Apply penalty for lack of senior validation
                    original_score = candidate.get('final_consensus_score', 0)
                    candidate['final_consensus_score'] = original_score * 0.85
                    candidate['needs_senior_review'] = True
                    candidate['review_note'] = 'Found by junior researchers only'
                else:
                    candidate['senior_validated'] = has_senior_support
                
                reviewed_candidates.append(candidate)
            
            # Re-sort after review adjustments
            reviewed_candidates.sort(key=lambda x: x.get('final_consensus_score', 0), reverse=True)
            
            return reviewed_candidates
            
        except Exception as e:
            print(f"Error in hierarchical review: {e}")
            return consensus_candidates

    def _resolve_team_conflicts(self, candidates, team_members, config):
        """Resolve conflicts when team strongly disagrees on results"""
        try:
            consensus_threshold = config.get('consensus_threshold', 0.6)
            conflict_cases = []
            
            for candidate in candidates:
                supporting = candidate.get('supporting_researchers', [])
                scores = candidate.get('researcher_scores', {})
                
                if len(scores) < 2:
                    continue
                
                # Calculate score variance
                score_values = list(scores.values())
                mean_score = np.mean(score_values)
                variance = np.var(score_values)
                
                # High variance = conflict
                if variance > 0.04:  # Significant disagreement
                    # Get senior opinion as tiebreaker
                    senior_scores = []
                    for researcher_id, score in scores.items():
                        persona = RESEARCH_TEAM_PERSONAS[researcher_id]
                        if persona['experience_years'] >= 10:
                            senior_scores.append(score)
                    
                    if senior_scores:
                        # Use senior consensus as tiebreaker
                        senior_consensus = np.mean(senior_scores)
                        candidate['conflict_resolved'] = True
                        candidate['resolution_method'] = 'senior_tiebreaker'
                        candidate['original_score'] = candidate.get('final_consensus_score', 0)
                        candidate['final_consensus_score'] = senior_consensus * 0.95  # Slight penalty for conflict
                    else:
                        # No senior opinion - use supermajority
                        if len(supporting) >= len(team_members) * 0.66:  # 2/3 support
                            candidate['conflict_resolved'] = True
                            candidate['resolution_method'] = 'supermajority'
                        else:
                            # Insufficient support - penalize
                            candidate['final_consensus_score'] *= 0.80
                            candidate['conflict_unresolved'] = True
            
            return candidates
            
        except Exception as e:
            print(f"Error resolving conflicts: {e}")
            return candidates
            
    def research_rounds_with_quality_degradation(
        self, query, query_type, initial_candidates, config, progress_callback=None
    ):
        """
        ENHANCED: Multi-round research with community detection and boosting.

        Initial candidates scoring at least ``config['initial_quality']`` are
        kept. When fewer than ``config['final_top_k']`` of them qualify, up to
        ``config['max_rounds']`` rounds of ``parallel_legal_research`` are run.
        Each round keeps results at or above the current threshold, applies
        citation-chain boosting through ``self.kg`` and, if the de-duplicated
        pool is still too small, lowers the threshold by
        ``config['quality_degradation'] * 0.7`` (never below
        ``config['min_quality']``). Finally the pool is de-duplicated,
        optionally boosted by ``self.community_detector`` and sorted by score.

        A candidate's score is its ``final_consensus_score`` when present,
        otherwise its ``composite_score`` (0 when both are missing).

        Args:
            query: Query forwarded to ``parallel_legal_research``.
            query_type: Query type forwarded to ``parallel_legal_research``.
            initial_candidates: Candidate dicts from the initial search.
            config: Mapping providing ``max_rounds``, ``initial_quality``,
                ``final_top_k``, ``quality_degradation`` and ``min_quality``.
            progress_callback: Optional callable receiving progress strings.

        Returns:
            Tuple ``(results, rounds, phase_metadata, community_summary)``.
            On success ``rounds`` is ``config['max_rounds']`` and
            ``phase_metadata`` holds the phase results that existed on entry
            plus each round's results under ``round<N>_<phase>`` keys. On
            failure the first ``final_top_k`` initial candidates, ``0``, the
            current phase results (or ``{}``) and ``"Research failed"`` are
            returned; the error is reported and its traceback printed.
        """
        def candidate_score(candidate):
            # Consensus score when present, otherwise the composite score.
            return candidate.get('final_consensus_score', candidate.get('composite_score', 0))

        try:
            if progress_callback:
                progress_callback(f"üîÑ Starting research rounds (max: {config['max_rounds']})...")

            # Start from a copy of whatever phase results already exist.
            all_rounds_metadata = dict(getattr(self, 'all_phase_results', {}))

            current_quality = config['initial_quality']
            all_rounds_results = []

            # Check if initial results are sufficient
            high_quality_candidates = [
                c for c in initial_candidates
                if candidate_score(c) >= current_quality
            ]

            # Initial high-quality candidates are kept whether or not more
            # rounds are needed.
            all_rounds_results.extend(high_quality_candidates)

            if len(high_quality_candidates) >= config['final_top_k']:
                if progress_callback:
                    progress_callback(f"   ‚úÖ Initial search sufficient: {len(high_quality_candidates)} high-quality results")
            else:
                # Conduct additional rounds if needed
                for round_num in range(config['max_rounds']):
                    if progress_callback:
                        progress_callback(f"   Round {round_num + 1}/{config['max_rounds']} (quality threshold: {current_quality:.2f})...")

                    # Reset so that only this round's phases are merged below.
                    self.all_phase_results = {}

                    round_results = self.parallel_legal_research(
                        query, query_type, config, progress_callback
                    )

                    # Merge metadata
                    for phase_key, phase_data in self.all_phase_results.items():
                        round_phase_key = f"round{round_num + 1}_{phase_key}"
                        all_rounds_metadata[round_phase_key] = phase_data

                    quality_filtered = [
                        c for c in round_results
                        if candidate_score(c) >= current_quality
                    ]

                    if progress_callback:
                        progress_callback(f"      Found {len(quality_filtered)} candidates above threshold")

                    all_rounds_results.extend(quality_filtered)

                    # Citation chain boosting, seeded with the first five
                    # accumulated results
                    if all_rounds_results:
                        top_seed_docs = [c['record']['global_id'] for c in all_rounds_results[:5]]
                        citation_network = self.kg.follow_citation_chain(top_seed_docs, max_depth=2)
                        all_rounds_results = self.kg.boost_cited_documents(all_rounds_results, citation_network)

                    unique_results = self._deduplicate_candidates(all_rounds_results)

                    if len(unique_results) >= config['final_top_k']:
                        if progress_callback:
                            progress_callback(f"   ‚úÖ Research successful after {round_num + 1} rounds ({len(unique_results)} unique results)")
                        break

                    current_quality = max(
                        config['min_quality'],
                        current_quality - (config['quality_degradation'] * 0.7)
                    )

                    if progress_callback:
                        progress_callback(f"      Lowering threshold to {current_quality:.2f} for next round")

                    if current_quality <= config['min_quality']:
                        if progress_callback:
                            progress_callback(f"   ‚ö†Ô∏è Minimum quality threshold reached")
                        break

            # Deduplicate
            unique_results = self._deduplicate_candidates(all_rounds_results)

            # *** NEW: DYNAMIC COMMUNITY DETECTION ***
            community_summary = "Community detection disabled"

            if hasattr(self, 'community_detector') and self.community_detector.enabled:
                if progress_callback:
                    progress_callback(f"üîç Running community detection on {len(unique_results)} documents...")

                try:
                    # Detect communities
                    community_analysis = self.community_detector.detect_communities(
                        unique_results,
                        top_n=min(100, len(unique_results))
                    )

                    # Boost documents in main community
                    if community_analysis.get('largest_communities'):
                        unique_results = self.community_detector.boost_community_documents(
                            unique_results,
                            community_analysis
                        )

                        if progress_callback:
                            progress_callback(f"   ‚úÖ Boosted {community_analysis['largest_communities'][0][1]} docs in main community")

                    # Get summary
                    community_summary = community_analysis.get('summary', 'Community detection completed')

                    if progress_callback:
                        progress_callback(f"   üìä {community_summary}")

                except Exception as e:
                    # Community detection is best-effort; keep the results.
                    if progress_callback:
                        progress_callback(f"   ‚ö†Ô∏è Community detection failed: {e}")
                    community_summary = f"Community detection failed: {str(e)}"

            # Sort by final score
            unique_results.sort(key=candidate_score, reverse=True)

            if progress_callback:
                progress_callback(f"   üèÅ Research completed: {len(unique_results)} final unique results")

            # NOTE(review): this reports the configured maximum, not the number
            # of rounds actually executed - confirm what callers expect.
            return unique_results, config['max_rounds'], all_rounds_metadata, community_summary

        except Exception as e:
            if progress_callback:
                progress_callback(f"‚ùå Error in research rounds: {e}")
            import traceback
            traceback.print_exc()

            # The fallback must not raise on its own: `config` may be what
            # failed, and `all_phase_results` may never have been assigned.
            try:
                fallback_top_k = config['final_top_k']
            except Exception:
                fallback_top_k = 3  # same value as DEFAULT_CONFIG['final_top_k']

            fallback_results = (initial_candidates or [])[:fallback_top_k]
            fallback_metadata = getattr(self, 'all_phase_results', None) or {}

            return fallback_results, 0, fallback_metadata, "Research failed"
    

    def _deduplicate_candidates(self, candidates):
        """Remove duplicates - ORIGINAL"""
        try:
            seen_ids = {}
            unique_candidates = []
            
            for candidate in candidates:
                doc_id = candidate['record']['global_id']
                score = candidate.get('final_consensus_score', candidate.get('composite_score', 0))
                
                if doc_id not in seen_ids or score > seen_ids[doc_id]['score']:
                    seen_ids[doc_id] = {'candidate': candidate, 'score': score}
            
            for doc_id, data in seen_ids.items():
                unique_candidates.append(data['candidate'])
            
            return unique_candidates
        except Exception:
            return candidates
    
    def direct_metadata_search(self, regulation_ref, top_k=20):
        """
        Direct search by regulation metadata.

        Scans ``self.records`` for records whose regulation type belongs to the
        same ``REGULATION_TYPE_PATTERNS`` group as the requested type, whose
        ``regulation_number`` equals the requested number and, when a year is
        requested, whose ``year`` equals it. Values are compared as strings.

        Args:
            regulation_ref: Dict describing the regulation. The type is read
                from ``type`` or ``regulation_type``, the number from
                ``number`` or ``regulation_number`` and the year from
                ``year``. Missing or ``None`` values count as "not provided".
            top_k: Maximum number of candidates to return.

        Returns:
            List of candidate dicts (``record``, ``composite_score`` fixed at
            0.95, ``semantic_score``, ``keyword_score``, ``kg_score``,
            ``metadata_match``, ``match_quality``), best first. ``match_quality``
            is ``'exact'`` when a year was requested, else ``'partial'``.
            An empty list is returned when no number is given, nothing matches
            or an error occurs.
        """
        def _as_text(value):
            # None must mean "not provided": str(None) would yield the literal
            # 'None', which then never matches any record.
            return '' if value is None else str(value)

        try:
            filter_type = ''
            filter_number = ''
            filter_year = ''

            if regulation_ref:
                # Try both naming conventions
                filter_type = _as_text(
                    regulation_ref.get('type') or
                    regulation_ref.get('regulation_type')
                ).lower()

                filter_number = _as_text(
                    regulation_ref.get('number') or
                    regulation_ref.get('regulation_number')
                )

                filter_year = _as_text(regulation_ref.get('year'))

            if not filter_number:
                print("Warning: No regulation number provided for metadata search")
                print(f"DEBUG: regulation_ref structure: {regulation_ref}")
                return []

            print(f"DEBUG: Searching for - Type: '{filter_type}', Number: '{filter_number}', Year: '{filter_year}'")

            # Only pattern groups that recognise the requested type can ever
            # produce a type match, so resolve them once instead of per record.
            type_pattern_groups = [
                patterns for patterns in REGULATION_TYPE_PATTERNS.values()
                if any(p in filter_type for p in patterns)
            ]

            # Identical for every hit: it only depends on the request.
            match_quality = 'exact' if filter_year else 'partial'

            matching_candidates = []

            for i, record in enumerate(self.records):
                try:
                    rec_type = _as_text(record.get('regulation_type')).lower()
                    rec_number = _as_text(record.get('regulation_number'))
                    rec_year = _as_text(record.get('year'))

                    if not rec_type or not rec_number:
                        continue

                    # Type matching (flexible): record and request must fall
                    # into the same pattern group
                    type_match = any(
                        p in rec_type
                        for patterns in type_pattern_groups
                        for p in patterns
                    )
                    if not type_match:
                        continue

                    # Number matching (exact)
                    if filter_number != rec_number:
                        continue

                    # Year matching (exact if provided, otherwise accept)
                    if filter_year and filter_year != rec_year:
                        continue

                    # This is a match!
                    matching_candidates.append({
                        'record': record,
                        'composite_score': 0.95,
                        'semantic_score': 0.0,
                        'keyword_score': 0.0,
                        'kg_score': float(record.get('kg_connectivity_score', 0.5)),
                        'metadata_match': True,
                        'match_quality': match_quality
                    })

                except Exception as e:
                    # One malformed record must not end the scan.
                    print(f"Error processing record {i} in metadata search: {e}")
                    continue

            print(f"DEBUG: Found {len(matching_candidates)} metadata matches")

            if not matching_candidates:
                print(f"No metadata matches found for: {filter_type} {filter_number} {filter_year}")

            # Sort by relevance
            matching_candidates.sort(
                key=lambda x: (
                    x['match_quality'] == 'exact',
                    x.get('kg_score', 0),
                    x['record'].get('kg_authority_score', 0)
                ),
                reverse=True
            )

            return matching_candidates[:top_k]

        except Exception as e:
            print(f"Error in direct_metadata_search: {e}")
            import traceback
            traceback.print_exc()
            return []
    
    def hybrid_search_strategy(self, query, query_type, config, progress_callback=None):
        """
        Route a query to metadata-first or semantic-first retrieval.

        The knowledge graph is asked for regulation references in the query.
        The best (first) reference triggers the metadata-first path when its
        specificity is 'complete', or 'partial' with confidence of at least
        0.7, and its regulation carries a number. Everything else, including a
        failing metadata search, goes through semantic search. Should the
        routing itself fail, semantic search is tried once more and an empty
        list is returned if that fails too.
        """
        try:
            # Step 1: regulation references, best one first
            regulation_refs = self.kg.extract_regulation_references_with_confidence(query)

            if progress_callback and regulation_refs:
                best_ref = regulation_refs[0]
                reg_info = best_ref.get('regulation', {})
                progress_callback(
                    f"   üéØ Detected specific regulation: {reg_info.get('type', 'N/A')} "
                    f"{reg_info.get('number', 'N/A')} "
                    f"(confidence: {best_ref.get('confidence', 0):.0%})"
                )

            # Step 2: keep the regulation only if the reference is trustworthy
            # enough and actually names a number
            primary_regulation = None

            if regulation_refs:
                top_ref = regulation_refs[0]
                confidence = top_ref.get('confidence', 0)
                specificity = top_ref.get('specificity', 'vague')

                is_specific = specificity == 'complete' or (
                    specificity == 'partial' and confidence >= 0.7
                )
                if is_specific:
                    primary_regulation = top_ref.get('regulation', {})

                    if not primary_regulation.get('number'):
                        print(f"Warning: Regulation reference missing number field")
                        primary_regulation = None

            # Step 3: conceptual queries go straight to semantic search
            if not primary_regulation:
                if progress_callback:
                    progress_callback(f"   üîç Using SEMANTIC search for conceptual query")

                return self._semantic_first_search(
                    query, query_type, config, progress_callback
                )

            if progress_callback:
                progress_callback(f"   üîç Using METADATA search for specific regulation")

            try:
                return self._metadata_first_search(
                    query, query_type, primary_regulation, config, progress_callback
                )
            except Exception as e:
                print(f"Metadata search failed, falling back to semantic: {e}")
                if progress_callback:
                    progress_callback(f"   ‚ö†Ô∏è Metadata search failed, using semantic search")

                return self._semantic_first_search(
                    query, query_type, config, progress_callback
                )

        except Exception as e:
            if progress_callback:
                progress_callback(f"‚ùå Error in hybrid search: {e}")
            print(f"Error in hybrid search strategy: {e}")
            import traceback
            traceback.print_exc()

            # Last resort: plain semantic search
            try:
                return self._semantic_first_search(query, query_type, config, progress_callback)
            except Exception as fallback_error:
                print(f"Fallback also failed: {fallback_error}")
                return []
    
    def _metadata_first_search(self, query, query_type, regulation_ref, config, progress_callback=None):
        """
        Metadata-first search: Find the specific regulation, then find relevant chunks.
        """
        try:
            if progress_callback:
                progress_callback(f"   Step 1/3: Direct metadata search...")
            
            # Validate regulation_ref
            if not regulation_ref or not isinstance(regulation_ref, dict):
                if progress_callback:
                    progress_callback(f"   ‚ö†Ô∏è Invalid regulation reference, falling back to semantic search")
                return self._semantic_first_search(query, query_type, config, progress_callback)
            
            # Check if regulation_ref has required fields
            if not regulation_ref.get('number'):
                if progress_callback:
                    progress_callback(f"   ‚ö†Ô∏è Missing regulation number, falling back to semantic search")
                return self._semantic_first_search(query, query_type, config, progress_callback)
            
            # Step 1: Direct metadata search
            metadata_matches = self.direct_metadata_search(regulation_ref, top_k=50)
            
            if not metadata_matches:
                if progress_callback:
                    progress_callback(f"   ‚ö†Ô∏è No direct matches found, falling back to semantic search")
                
                # Fallback to semantic search
                return self._semantic_first_search(query, query_type, config, progress_callback)
            
            if progress_callback:
                progress_callback(f"   ‚úÖ Found {len(metadata_matches)} matching documents")
            
            # Step 2: Semantic ranking within matched documents
            if progress_callback:
                progress_callback(f"   Step 2/3: Semantic ranking within matched documents...")
            
            try:
                # Extract indices of matched documents
                matched_indices = []
                for candidate in metadata_matches:
                    try:
                        idx = self.records.index(candidate['record'])
                        matched_indices.append(idx)
                    except ValueError:
                        # Record not in list, skip
                        continue
                
                if not matched_indices:
                    if progress_callback:
                        progress_callback(f"   ‚ö†Ô∏è Could not index matched documents, using base scores")
                    # Just return metadata matches as-is
                    return metadata_matches[:config.get('final_top_k', 10)]
                
                # Embed query
                query_embedding = self._embed_query_with_context_and_kg(query, query_type)
                
                # Get embeddings for matched documents only
                matched_embeddings = self.embeddings[matched_indices]
                
                # Calculate semantic similarity within matches
                semantic_sims = F.cosine_similarity(
                    query_embedding.unsqueeze(0),
                    matched_embeddings,
                    dim=1
                ).cpu().numpy()
                
                # Update candidates with semantic scores
                for i, candidate in enumerate(metadata_matches):
                    if i < len(semantic_sims):
                        candidate['semantic_score'] = float(semantic_sims[i])
                        
                        # Recalculate composite score
                        candidate['composite_score'] = (
                            0.60 +  # Base score for metadata match
                            candidate['semantic_score'] * 0.25 +
                            candidate.get('kg_score', 0) * 0.15
                        )
                
                # Sort by composite score
                metadata_matches.sort(key=lambda x: x['composite_score'], reverse=True)
                
            except Exception as e:
                print(f"Error in semantic ranking: {e}")
                # Continue with metadata matches only
                if progress_callback:
                    progress_callback(f"   ‚ö†Ô∏è Semantic ranking failed, using metadata scores only")
            
            # Step 3: Light team review for quality assurance
            if progress_callback:
                progress_callback(f"   Step 3/3: Team consensus on top candidates...")
            
            # Select top candidates for team review
            top_candidates = metadata_matches[:min(30, len(metadata_matches))]
            
            # Simple team consensus
            query_entities = [entity for entity, _ in self.kg.extract_entities_from_text(query)]
            final_candidates = self._light_team_consensus(
                top_candidates, query_entities, query_type, config
            )
            
            if progress_callback:
                progress_callback(f"   ‚úÖ Metadata-first search completed: {len(final_candidates)} results")
            
            return final_candidates
            
        except Exception as e:
            if progress_callback:
                progress_callback(f"   ‚ùå Error in metadata-first search: {e}")
            print(f"Error in metadata-first search: {e}")
            import traceback
            traceback.print_exc()
            
            # Final fallback
            return self._semantic_first_search(query, query_type, config, progress_callback)
    
    def _light_team_consensus(self, candidates, query_entities, query_type, config):
        """
        Lightweight team consensus for metadata matches.
        Focus on validation rather than discovery.
        """
        try:
            # Apply KG scoring
            for candidate in candidates:
                record = candidate['record']
                kg_score = self.kg.calculate_enhanced_kg_score(
                    query_entities, record, query_type
                )
                candidate['kg_score'] = kg_score
                
                # Update composite score with KG
                candidate['composite_score'] = min(1.0, (
                    candidate['composite_score'] * 0.7 +
                    kg_score * 0.3
                ))
            
            # Sort by final score
            candidates.sort(key=lambda x: x['composite_score'], reverse=True)
            
            # Mark as team validated
            for candidate in candidates:
                candidate['team_consensus'] = True
                candidate['metadata_match'] = True
            
            return candidates[:config.get('final_top_k', 10)]
            
        except Exception as e:
            print(f"Error in light team consensus: {e}")
            return candidates[:config.get('final_top_k', 10)]
    
    def _semantic_first_search(self, query, query_type, config, progress_callback=None):
        """
        Original semantic search - for conceptual queries.
        This calls the existing parallel_legal_research logic.
        """
        try:
            return self.parallel_legal_research(query, query_type, config, progress_callback)
        except Exception as e:
            if progress_callback:
                progress_callback(f"Error in semantic search: {e}")
            import traceback
            traceback.print_exc()
            return []

# =============================================================================
# RERANKER WITH ENHANCED KG (KEEP ORIGINAL + ENHANCE)
# =============================================================================

class EnhancedKGReranker:
    """Relevance reranker that blends model scores with knowledge-graph signals.

    Each (query, document) pair is rendered as an instruction prompt and scored
    by comparing the model's last-position logits for the "true" and "false"
    tokens. The resulting probability is then adjusted with additive bonuses
    read from ``kg_*`` fields on the candidate record.
    """

    # Neutral score used whenever a candidate cannot be scored by the model.
    FALLBACK_SCORE = 0.4
    # Number of prompt pairs sent through the model per forward pass.
    RERANK_BATCH_SIZE = 2

    def __init__(self, model, tokenizer, prefix_tokens, suffix_tokens,
                 token_true_id, token_false_id, max_length, knowledge_graph):
        """Store the scoring model, its tokenizer and the knowledge graph.

        Args:
            model: Model called as ``model(**inputs)``; its ``.logits`` are read.
            tokenizer: Tokenizer used to encode and pad the prompt pairs.
            prefix_tokens: Token ids prepended to every encoded pair.
            suffix_tokens: Token ids appended to every encoded pair.
            token_true_id: Vocabulary index whose logit counts as "relevant".
            token_false_id: Vocabulary index whose logit counts as "not relevant".
            max_length: Maximum total sequence length, prefix and suffix included.
            knowledge_graph: Object providing ``extract_entities_from_text`` and
                ``get_parsed_kg_data``.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.prefix_tokens = prefix_tokens
        self.suffix_tokens = suffix_tokens
        self.token_true_id = token_true_id
        self.token_false_id = token_false_id
        self.max_length = max_length
        self.kg = knowledge_graph

    def format_instruction_with_kg(self, query, doc, query_type):
        """Render the ``<Instruct>/<Query>/<Document>`` prompt for one document.

        Args:
            query: User query text.
            doc: Either a record dict (KG connections are looked up through its
                ``global_id``) or any other object, which is stringified and
                used as the document text without KG connections.
            query_type: Key selecting the instruction text; unknown values use
                the ``'general'`` instruction.

        Returns:
            The prompt string. Document text is cut to 1000 characters. If
            anything fails, a minimal generic prompt is returned instead.
        """
        try:
            kg_connections = []

            if isinstance(doc, dict):
                record = doc
                doc_id = record.get('global_id')

                # Entities are only needed for dict documents, so extraction is
                # skipped for plain-text documents.
                query_entities = [entity for entity, _ in self.kg.extract_entities_from_text(query)]

                doc_entities = self.kg.get_parsed_kg_data(doc_id, 'entities') or []
                cross_refs = self.kg.get_parsed_kg_data(doc_id, 'cross_refs') or []

                # Entity matches (case-insensitive, first three of each side)
                for q_entity in query_entities[:3]:
                    for d_entity in doc_entities[:3]:
                        try:
                            d_entity_text = d_entity.get('text', '') if isinstance(d_entity, dict) else str(d_entity)
                            if str(q_entity).lower() == str(d_entity_text).lower():
                                kg_connections.append(f"Exact match: {q_entity}")
                        except Exception:
                            continue

                # Cross-reference info
                if cross_refs:
                    kg_connections.append(f"{len(cross_refs)} cross-references")

                # Domain info
                if record.get('kg_primary_domain'):
                    kg_connections.append(f"Domain: {record['kg_primary_domain']}")

            instruction_map = {
                'specific_article': '''Sebagai ahli hukum, evaluasi apakah dokumen peraturan ini berisi pasal/ayat spesifik yang diminta. 
                Fokus pada: (1) Kecocokan nomor pasal/ayat, (2) Relevansi isi ketentuan, (3) Kesesuaian konteks hukum.''',
                'procedural': '''Sebagai praktisi hukum, evaluasi apakah dokumen ini menjelaskan prosedur yang ditanyakan. 
                Fokus pada: (1) Kelengkapan langkah-langkah, (2) Kejelasan persyaratan, (3) Ketepatan implementasi.''',
                'definitional': '''Sebagai akademisi hukum, evaluasi apakah dokumen ini memberikan definisi yang ditanyakan. 
                Fokus pada: (1) Kejelasan definisi, (2) Otoritas sumber, (3) Konteks penggunaan istilah.''',
                'sanctions': '''Sebagai penegak hukum, evaluasi apakah dokumen ini mengatur sanksi yang ditanyakan. 
                Fokus pada: (1) Jenis sanksi, (2) Besaran/bentuk hukuman, (3) Kondisi penerapan.''',
                'general': '''Sebagai konsultan hukum, evaluasi relevansi dokumen peraturan ini terhadap pertanyaan hukum. 
                Pertimbangkan: (1) Kesesuaian topik, (2) Otoritas regulasi, (3) Kemutakhiran ketentuan, (4) Kelengkapan informasi.'''}

            instruction = instruction_map.get(query_type, instruction_map['general'])

            kg_context = ""
            if kg_connections:
                kg_context = f"\nHubungan konseptual: {'; '.join(kg_connections[:3])}"

            if isinstance(doc, dict):
                doc_text = doc.get('content', '')
            else:
                doc_text = str(doc)

            if len(doc_text) > 1000:
                doc_text = doc_text[:1000] + "..."

            return f"<Instruct>: {instruction}{kg_context}\n<Query>: {query}\n<Document>: {doc_text}"
        except Exception as e:
            print(f"Error in KG formatting: {e}")
            return f"<Instruct>: Evaluate document relevance\n<Query>: {query}\n<Document>: {str(doc)[:1000]}"

    def _build_rerank_pairs(self, query, candidates, query_type, query_entities):
        """Build one prompt string per candidate, falling back to a bare prompt on error.

        The fallback uses only the current candidate's own record. ``record`` is
        reset for every candidate, so a failure before the record is read can
        neither reuse the previous candidate's content nor hit an unbound name.
        """
        pairs = []
        for candidate in candidates:
            record = {}
            try:
                record = candidate['record']
                # NOTE(review): the rich context is a string, so
                # format_instruction_with_kg takes its non-dict branch here and
                # adds no "Hubungan konseptual" line on this path.
                doc_context = self._create_rich_document_context(record, query_entities)
                pairs.append(self.format_instruction_with_kg(query, doc_context, query_type))
            except Exception as e:
                print(f"Error creating pair: {e}")
                if isinstance(record, dict):
                    fallback_text = str(record.get('content', '') or '')
                else:
                    fallback_text = ''
                pairs.append(f"<Query>: {query}\n<Document>: {fallback_text[:500]}")
        return pairs

    def _score_batch(self, batch_pairs):
        """Return the model's probability of "true" for each prompt in the batch."""
        inputs = self.tokenizer(
            batch_pairs, padding=False, truncation='longest_first',
            return_attention_mask=False,
            # Leave room for the prefix/suffix tokens added below.
            max_length=self.max_length - len(self.prefix_tokens) - len(self.suffix_tokens)
        )

        for j, ele in enumerate(inputs['input_ids']):
            inputs['input_ids'][j] = self.prefix_tokens + ele + self.suffix_tokens

        inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt", max_length=self.max_length)
        inputs = {k: v.to('cpu') for k, v in inputs.items()}  # Force CPU

        last_logits = self.model(**inputs).logits[:, -1, :]
        true_vector = last_logits[:, self.token_true_id]
        false_vector = last_logits[:, self.token_false_id]
        # Two-way softmax over (false, true); column 1 is P(true).
        stacked = torch.stack([false_vector, true_vector], dim=1)
        log_probs = torch.nn.functional.log_softmax(stacked, dim=1)
        return log_probs[:, 1].exp().tolist()

    @torch.no_grad()
    def rerank_with_kg(self, query, candidates, query_type, config, top_k=None, progress_callback=None):
        """Rerank with ENHANCED KG - MODIFIED FOR CPU

        Args:
            query: User query text.
            candidates: List of candidate dicts, each expected to carry a
                ``'record'`` dict. The dicts are updated in place with
                ``base_rerank_score``, ``enhanced_rerank_score`` and
                ``final_score``.
            query_type: Query category passed to prompt building and bonuses.
            config: Accepted for interface compatibility; not read here.
            top_k: If truthy, only the first ``top_k`` candidates are reranked.
            progress_callback: Optional callable receiving a status message.

        Returns:
            The reranked candidates sorted by ``final_score`` descending. An
            empty input is returned as is; on an unexpected error the
            (possibly truncated) candidate list is returned unsorted.
        """
        try:
            if not candidates:
                return candidates

            if top_k:
                candidates = candidates[:top_k]

            if progress_callback:
                progress_callback(f"   ‚öñÔ∏è KG-enhanced reranking {len(candidates)} candidates...")

            query_entities = [entity for entity, _ in self.kg.extract_entities_from_text(query)]
            pairs = self._build_rerank_pairs(query, candidates, query_type, query_entities)

            all_rerank_scores = []
            for start in range(0, len(pairs), self.RERANK_BATCH_SIZE):
                batch_pairs = pairs[start:start + self.RERANK_BATCH_SIZE]

                try:
                    all_rerank_scores.extend(self._score_batch(batch_pairs))
                except Exception as e:
                    # A failed batch gets the neutral score so the remaining
                    # batches still line up with their candidates.
                    print(f"Reranking batch error: {e}")
                    all_rerank_scores.extend([self.FALLBACK_SCORE] * len(batch_pairs))

                clear_cache()

            enhanced_candidates = []
            for i, candidate in enumerate(candidates):
                try:
                    base_rerank_score = all_rerank_scores[i] if i < len(all_rerank_scores) else self.FALLBACK_SCORE

                    # ENHANCED: Apply new KG bonuses
                    enhanced_score = self._apply_enhanced_kg_bonuses(
                        candidate, base_rerank_score, query_type, query_entities
                    )

                    candidate['base_rerank_score'] = base_rerank_score
                    candidate['enhanced_rerank_score'] = enhanced_score
                    candidate['final_score'] = enhanced_score
                    enhanced_candidates.append(candidate)
                except Exception as e:
                    print(f"Error enhancing candidate: {e}")
                    candidate['final_score'] = self.FALLBACK_SCORE
                    enhanced_candidates.append(candidate)

            enhanced_candidates.sort(key=lambda x: x.get('final_score', 0), reverse=True)
            return enhanced_candidates
        except Exception as e:
            print(f"Reranking error: {e}")
            return candidates

    def _create_rich_document_context(self, record, query_entities):
        """Join a record's metadata and content into one ``" | "``-separated string.

        ``query_entities`` is accepted but not used. If a required field is
        missing, the record's ``content`` (or ``'No content'``) is returned.
        """
        try:
            context_parts = []

            context_parts.append(f"[{record['regulation_type']} No. {record['regulation_number']}/{record['year']}]")
            context_parts.append(f"Badan: {record['enacting_body']}")
            context_parts.append(f"Tentang: {record['about']}")

            if record.get('chapter', 'N/A') != 'N/A' or record.get('article', 'N/A') != 'N/A':
                context_parts.append(f"Referensi: Bab {record.get('chapter', 'N/A')} - Pasal {record.get('article', 'N/A')}")

            # ENHANCED: Add KG metadata
            if record.get('kg_primary_domain'):
                context_parts.append(f"Domain: {record['kg_primary_domain']}")

            if record.get('kg_cross_ref_count', 0) > 0:
                context_parts.append(f"Cross-refs: {record['kg_cross_ref_count']}")

            context_parts.append(f"Isi: {record['content']}")

            return " | ".join(context_parts)
        except Exception as e:
            print(f"Error creating document context: {e}")
            return record.get('content', 'No content')

    def _apply_enhanced_kg_bonuses(self, candidate, base_score, query_type, query_entities):
        """Add KG-derived bonuses to ``base_score`` and clamp the result to [0, 1].

        ``query_entities`` is accepted but not used. The fields
        ``kg_authority_score``, ``kg_temporal_score``, ``kg_legal_richness``,
        ``kg_completeness_score`` and (for some query types) ``content`` are
        read without defaults; if any of them is missing the error is printed
        and ``base_score`` is returned unchanged.
        """
        try:
            record = candidate['record']
            enhanced_score = base_score

            # Original bonuses
            enhanced_score += record['kg_authority_score'] * 0.15

            if record['kg_temporal_score'] > 0.8:
                enhanced_score += 0.1
            elif record['kg_temporal_score'] < 0.4:
                enhanced_score -= 0.05

            enhanced_score += record['kg_legal_richness'] * 0.08

            # ENHANCED: New feature bonuses
            if record.get('kg_hierarchy_level', 5) <= 3:
                enhanced_score += 0.10

            if record.get('kg_cross_ref_count', 0) > 5:
                enhanced_score += 0.08

            if record.get('kg_pagerank', 0.0) > 0.01:
                enhanced_score += min(0.12, record['kg_pagerank'] * 10)

            if record.get('kg_connectivity_score', 0) > 0.7:
                enhanced_score += 0.09

            if record.get('kg_domain_confidence', 0) > 0.8:
                enhanced_score += 0.07

            # Query-type specific bonuses
            kg_score = candidate.get('kg_score', 0)
            if query_type == 'specific_article':
                if any(term in record['content'].lower() for term in ['pasal', 'ayat']):
                    enhanced_score += 0.1
                    if kg_score > 0.4:
                        enhanced_score += 0.05
            elif query_type == 'sanctions':
                if record.get('kg_has_prohibitions', False):
                    enhanced_score += 0.15
                if any(term in record['content'].lower() for term in ['pidana', 'denda', 'sanksi']):
                    enhanced_score += 0.12
            elif query_type == 'procedural':
                if record.get('kg_has_obligations', False):
                    enhanced_score += 0.12

            # Team consensus bonus
            if candidate.get('team_consensus', False):
                enhanced_score += 0.08

            if record['kg_completeness_score'] > 0.8:
                enhanced_score += 0.06

            return max(0.0, min(1.0, enhanced_score))
        except Exception as e:
            print(f"Error applying bonuses: {e}")
            return base_score

# LLM Generator and Conversation Manager remain UNCHANGED from original
# (keeping them exactly as in original code to maintain compatibility)

# =============================================================================
# LLM GENERATOR (UNCHANGED - exactly as original)
# =============================================================================

class KGEnhancedLLMGenerator:
    """Builds LLM prompts (as token ids) from a query and its retrieved documents.

    The class formats the retrieved records into a guarded context block and
    encodes it with the tokenizer's chat template. It does not run generation
    itself; callers receive ``input_ids`` plus a small metadata dict.
    """

    # Appended to SYSTEM_PROMPT for every KG-enhanced request.
    KG_PROMPT_SUFFIX = " Manfaatkan hubungan semantik dan konseptual antar elemen hukum untuk memberikan analisis yang lebih komprehensif dan kontekstual."
    # Additionally appended when prior conversation context is supplied.
    CONTEXT_PROMPT_SUFFIX = " Pertimbangkan konteks percakapan sebelumnya untuk memberikan jawaban yang relevan dan berkesinambungan."

    def __init__(self, model, tokenizer, knowledge_graph):
        """Store the model (used for its ``device``), tokenizer and knowledge graph."""
        self.model = model
        self.tokenizer = tokenizer
        self.kg = knowledge_graph

    def format_context_with_kg_protection(self, query, results, query_type, config, all_phase_metadata=None):
        """Format context with KG enhancement and BALANCED metadata protection

        Args:
            query: User query text.
            results: Ranked result dicts, each with a ``'record'`` dict and an
                optional ``'kg_score'``.
            query_type: Key selecting the task instruction; unknown values use
                the ``'general'`` instruction.
            config: Must contain ``'final_top_k'``, the number of results to include.
            all_phase_metadata: Accepted for interface compatibility; not read here.

        Returns:
            The full context string. A record that cannot be formatted is
            skipped as a whole, so the prompt never contains a document header
            without its content. ``chapter`` and ``article`` are optional. If
            formatting fails altogether, a minimal prompt containing only the
            query is returned.
        """
        try:
            context_parts = []

            # UPDATED: More nuanced anti-hallucination instruction
            context_parts.append("""CRITICAL INSTRUCTIONS - READ CAREFULLY:
    
    You are a professional Indonesian legal assistant. Your primary duty is to be accurate, factual, and helpful.
    
    RULE 1 - SOURCE RESTRICTION (STRICT):
    You MUST base your entire answer only on the text from the "Legal References" provided below. You MUST NOT mention, cite, or allude to any law, regulation, or external information that is not in this provided list.
    
    RULE 2 - SYNTHESIS RULE (IMPORTANT):
    If a user's question is conceptual (e.g., "is there equality?", "what is the relationship?", "how does this work?"), you MUST synthesize a comprehensive answer by combining and reasoning over the provided facts. Do not just look for exact keyword matches from the question. Use logical reasoning to connect the dots between multiple pieces of information in the provided documents.
    
    RULE 3 - FACT-FINDING RULE (IMPORTANT):
    If a user asks for a specific, hard fact (e.g., "what is the exact penalty amount?", "what is the effective date?", "what is Article 25?") and that specific fact is not explicitly stated in the provided text, you MUST clearly state that the specific information is not present in the available documents.
    
    RULE 4 - METADATA ACCURACY (STRICT):
    You MUST NOT modify, invent, or alter any metadata such as regulation numbers, years, enacting bodies, or article numbers. Use ONLY the exact information provided in the documents below.
    
    RULE 5 - WHEN CONTEXT IS INSUFFICIENT:
    If the provided documents are genuinely insufficient to answer the user's question (either conceptually or factually), clearly state: "The provided legal references do not contain sufficient information to answer this question." Then suggest the user rephrase or ask a different question.
    
    BALANCE: Be confident in synthesizing and reasoning when the facts support it. Be honest when specific facts are missing.""")

            query_entities = [entity for entity, _ in self.kg.extract_entities_from_text(query)]

            instruction_map = {
                'specific_article': "Anda akan memberikan penjelasan detail tentang pasal/ayat spesifik berdasarkan peraturan hukum Indonesia yang tersedia. Manfaatkan hubungan antar konsep struktural dalam analisis.",
                'procedural': "Anda akan menjelaskan prosedur dan tata cara berdasarkan peraturan hukum Indonesia yang tersedia. Pertimbangkan keterkaitan antar tahapan prosedur.",
                'definitional': "Anda akan memberikan definisi komprehensif berdasarkan ketentuan hukum Indonesia yang tersedia. Jelaskan hubungan antar konsep terkait.",
                'sanctions': "Anda akan menjelaskan sanksi dan konsekuensi hukum berdasarkan peraturan Indonesia yang tersedia. Analisis keterkaitan jenis sanksi dengan pelanggaran.",
                'general': "Anda akan memberikan penjelasan komprehensif berdasarkan peraturan hukum Indonesia yang tersedia. Manfaatkan hubungan semantik antar konsep hukum."
            }

            context_parts.append(instruction_map.get(query_type, instruction_map['general']))

            if query_entities:
                context_parts.append(f"\nKonsep kunci yang relevan: {', '.join(query_entities[:5])}")

            context_parts.append(f"\nDokumen hukum yang relevan dengan pertanyaan: '{query}'\n")

            for i, result in enumerate(results[:config['final_top_k']], 1):
                try:
                    record = result['record']
                    # Collect this document's lines separately and commit them
                    # only once all of them were built, so a malformed record
                    # cannot leave a truncated entry in the prompt.
                    doc_lines = [
                        f"DOKUMEN {i}:",
                        f"Jenis: {record['regulation_type']} No. {record['regulation_number']}/{record['year']}",
                        f"Tentang: {record['about']}",
                        f"Badan: {record['enacting_body']}",
                    ]

                    chapter = record.get('chapter', 'N/A')
                    if chapter != 'N/A':
                        doc_lines.append(f"Bab: {chapter}")
                    article = record.get('article', 'N/A')
                    if article != 'N/A':
                        doc_lines.append(f"Pasal: {article}")

                    kg_score = result.get('kg_score', 0)
                    if kg_score > 0.3:
                        doc_lines.append(f"Relevansi konseptual: {kg_score:.2f}")

                    doc_lines.append(f"Isi: {record['content']}")
                    doc_lines.append("")
                except Exception as e:
                    print(f"Error formatting document {i}: {e}")
                    continue
                context_parts.extend(doc_lines)

            context_parts.append("INSTRUKSI AKHIR:")
            context_parts.append("1. Gunakan format <think>...</think> untuk proses berpikir Anda")
            context_parts.append("2. Dalam <think>, analisis apakah pertanyaan memerlukan sintesis konseptual atau fakta spesifik")
            context_parts.append("3. Jika konseptual: gabungkan dan sintesis informasi dari dokumen untuk menjawab")
            context_parts.append("4. Jika fakta spesifik: cari fakta tersebut secara eksplisit dalam teks")
            context_parts.append("5. JANGAN kutip atau rujuk peraturan yang tidak ada dalam daftar dokumen di atas")
            context_parts.append("6. Jelaskan konteks dan implikasi hukum dengan mempertimbangkan keterkaitan konsep")
            context_parts.append("7. Berikan jawaban dalam bahasa Indonesia yang jelas dan profesional")

            return "\n".join(context_parts)
        except Exception as e:
            print(f"Error formatting context: {e}")
            return f"Berikan penjelasan tentang: {query}\n\nGunakan format <think>...</think> untuk proses berpikir."

    def _encode_chat(self, system_prompt, user_content):
        """Encode a system+user exchange with the chat template, on the model's device."""
        return self.tokenizer.apply_chat_template([
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': user_content}
        ], tokenize=True, add_generation_prompt=True, return_tensors='pt').to(self.model.device)

    def generate_with_kg(self, query, results, query_type, config, all_phase_metadata=None):
        """Generate response with KG enhancement using streaming - UNCHANGED

        Returns:
            ``(input_ids, info)`` where ``input_ids`` is the encoded prompt and
            ``info`` describes the request. If prompt construction fails, a
            bare one-line prompt is encoded instead and ``info`` carries the
            error text with ``num_sources`` set to 0.
        """
        try:
            context = self.format_context_with_kg_protection(query, results, query_type, config, all_phase_metadata)

            kg_enhanced_system_prompt = SYSTEM_PROMPT + self.KG_PROMPT_SUFFIX

            input_ids = self._encode_chat(kg_enhanced_system_prompt, f"{context}\n\nPertanyaan: {query}")

            return input_ids, {
                'query_type': query_type,
                'num_sources': len(results),
                'kg_enhanced': True,
                'metadata_protected': True
            }
        except Exception as e:
            print(f"Error in LLM generation setup: {e}")
            simple_prompt = f"Jelaskan tentang: {query}"
            input_ids = self.tokenizer([simple_prompt], return_tensors='pt')['input_ids'].to(self.model.device)
            return input_ids, {'query_type': query_type, 'num_sources': 0, 'error': str(e)}

    def generate_with_kg_and_context(self, query, results, query_type, config, conversation_context="", all_phase_metadata=None):
        """Generate response with KG enhancement and conversation context - UNCHANGED

        ``conversation_context`` may be empty or ``None``; when it contains
        non-whitespace text it is placed before the document context,
        separated by a ``---`` rule.

        Returns:
            ``(input_ids, info)`` as in ``generate_with_kg``, with an extra
            ``'context_aware'`` flag. On failure the call is retried through
            ``generate_with_kg`` without the conversation context.
        """
        try:
            # Treat None like an empty context instead of failing on .strip().
            conversation_context = conversation_context or ""
            has_conversation = bool(conversation_context.strip())

            context = self.format_context_with_kg_protection(query, results, query_type, config, all_phase_metadata)

            if has_conversation:
                context = f"{conversation_context}\n\n---\n\n{context}"

            kg_enhanced_system_prompt = SYSTEM_PROMPT + self.KG_PROMPT_SUFFIX + self.CONTEXT_PROMPT_SUFFIX

            input_ids = self._encode_chat(kg_enhanced_system_prompt, f"{context}\n\nPertanyaan: {query}")

            return input_ids, {
                'query_type': query_type,
                'num_sources': len(results),
                'kg_enhanced': True,
                'metadata_protected': True,
                'context_aware': has_conversation
            }
        except Exception as e:
            print(f"Error in context-aware LLM generation setup: {e}")
            return self.generate_with_kg(query, results, query_type, config, all_phase_metadata)


# ============================================================================
# Dynamic Community Detection Module
# ============================================================================

class DynamicCommunityDetector:
    """
    Discovers hidden thematic clusters in search results using network analysis.
    Uses cross-references between documents to build a graph and detect communities.
    """

    def __init__(self, knowledge_graph):
        """Keep the knowledge graph (queried for cross-references) and enable detection."""
        self.kg = knowledge_graph
        self.enabled = True

    def detect_communities(self, candidates, top_n=100):
        """
        Detect thematic communities in retrieved documents.

        Args:
            candidates: List of candidate documents with records
            top_n: Number of top candidates to analyze

        Returns:
            Dict with community assignments and metadata. It always contains
            ``'communities'`` and ``'summary'``; the remaining keys are only
            present when a graph with at least one edge could be analyzed.
        """
        if not self.enabled:
            return {'communities': {}, 'summary': 'Community detection disabled'}

        try:
            # Step 1: Select top candidates
            top_candidates = candidates[:min(top_n, len(candidates))]

            if len(top_candidates) < 3:
                return {'communities': {}, 'summary': 'Insufficient documents for community detection'}

            print(f"üîç Analyzing {len(top_candidates)} documents for communities...")

            # Step 2: Build document graph
            graph_data = self._build_document_graph(top_candidates)

            if graph_data['edge_count'] == 0:
                return {'communities': {}, 'summary': 'No cross-references found between documents'}

            # Step 3: Detect communities using Louvain algorithm
            communities = self._run_louvain_algorithm(graph_data)

            # Step 4: Analyze communities
            community_analysis = self._analyze_communities(communities, top_candidates)

            # .get: a degraded analysis result must not be turned into a failure here.
            print(f"‚úÖ Found {len(community_analysis.get('community_sizes', {}))} communities")

            return community_analysis

        except Exception as e:
            print(f"Error in community detection: {e}")
            import traceback
            traceback.print_exc()
            return {'communities': {}, 'summary': f'Community detection failed: {str(e)}'}

    def _build_document_graph(self, candidates):
        """Build an igraph graph from document cross-references

        Nodes are the candidates (by position); an edge is added for every
        cross-reference whose target is another candidate in the same set.
        References from a document to itself are ignored, so ``edge_count``
        only counts links between distinct documents.

        Returns:
            Dict with the undirected ``'graph'``, both id/index mappings,
            ``'edge_count'`` and ``'node_count'``.

        Raises:
            Exception: any error from graph construction is printed and re-raised.
        """
        try:
            # Create document ID to index mapping
            doc_id_to_idx = {}
            idx_to_doc_id = {}

            for idx, candidate in enumerate(candidates):
                doc_id = candidate['record']['global_id']
                doc_id_to_idx[doc_id] = idx
                idx_to_doc_id[idx] = doc_id

            # Build edge list from cross-references
            edges = []
            edge_weights = []

            for candidate in candidates:
                doc_id = candidate['record']['global_id']
                source_idx = doc_id_to_idx[doc_id]

                # Get cross-references for this document
                cross_refs = self.kg.get_parsed_kg_data(doc_id, 'cross_refs')

                if not cross_refs:
                    continue

                for ref in cross_refs:
                    try:
                        # Extract target document ID
                        if isinstance(ref, dict):
                            target_id = ref.get('target_id')
                        else:
                            target_id = str(ref)

                        # Check if target is in our candidate set
                        if target_id in doc_id_to_idx:
                            target_idx = doc_id_to_idx[target_id]
                            # A self-loop links no two documents; it would pass
                            # the "has edges" check without forming a cluster.
                            if target_idx == source_idx:
                                continue
                            edges.append((source_idx, target_idx))
                            edge_weights.append(1.0)
                    except Exception:
                        continue

            # Create igraph graph
            g = ig.Graph(n=len(candidates), edges=edges, directed=True)
            g.es['weight'] = edge_weights

            # Convert to undirected for community detection
            g_undirected = g.as_undirected(mode='collapse', combine_edges='sum')

            return {
                'graph': g_undirected,
                'doc_id_to_idx': doc_id_to_idx,
                'idx_to_doc_id': idx_to_doc_id,
                'edge_count': len(edges),
                'node_count': len(candidates)
            }

        except Exception as e:
            print(f"Error building document graph: {e}")
            raise

    def _run_louvain_algorithm(self, graph_data):
        """Run Louvain community detection algorithm

        Returns:
            Dict with ``'assignments'`` (doc_id -> community id),
            ``'modularity'``, ``'num_communities'`` and ``'sizes'``.

        Raises:
            Exception: any error from igraph is printed and re-raised.
        """
        try:
            g = graph_data['graph']

            # Run Louvain algorithm (built into igraph)
            communities = g.community_multilevel(weights='weight')

            # Convert to dict: doc_id -> community_id
            idx_to_doc_id = graph_data['idx_to_doc_id']
            community_assignments = {
                idx_to_doc_id[member_idx]: comm_id
                for comm_id, members in enumerate(communities)
                for member_idx in members
            }

            return {
                'assignments': community_assignments,
                'modularity': communities.modularity,
                'num_communities': len(communities),
                'sizes': [len(comm) for comm in communities]
            }

        except Exception as e:
            print(f"Error running Louvain algorithm: {e}")
            raise

    def _analyze_communities(self, communities, candidates):
        """Analyze detected communities and extract insights

        Args:
            communities: Result dict of ``_run_louvain_algorithm``.
            candidates: The candidates the graph was built from.

        Returns:
            Dict with assignments, per-community sizes, communities ordered by
            size, themes for the three largest, modularity, community count
            and a text summary. If analysis fails, the same size-related keys
            are returned empty together with whatever assignments were
            available, so callers can read them without a KeyError.
        """
        # Bound before the try so the error path can always refer to it.
        assignments = communities.get('assignments', {}) if isinstance(communities, dict) else {}

        try:
            # Group candidates by community
            community_docs = {}
            for candidate in candidates:
                doc_id = candidate['record']['global_id']
                if doc_id in assignments:
                    community_docs.setdefault(assignments[doc_id], []).append(candidate)

            # Find largest communities
            community_sizes = {k: len(v) for k, v in community_docs.items()}
            largest_communities = sorted(community_sizes.items(), key=lambda x: x[1], reverse=True)

            # Extract themes from largest communities
            themes = {}
            for comm_id, size in largest_communities[:3]:  # Top 3 communities
                docs = community_docs[comm_id]
                themes[comm_id] = {
                    'size': size,
                    'theme': self._extract_community_theme(docs),
                    'documents': docs
                }

            return {
                'communities': assignments,
                'community_sizes': community_sizes,
                'largest_communities': largest_communities,
                'themes': themes,
                'modularity': communities['modularity'],
                'num_communities': communities['num_communities'],
                'summary': self._generate_community_summary(themes)
            }

        except Exception as e:
            print(f"Error analyzing communities: {e}")
            return {
                'communities': assignments,
                'community_sizes': {},
                'largest_communities': [],
                'themes': {},
                'summary': 'Could not analyze community themes'
            }

    def _extract_community_theme(self, docs):
        """Extract common theme from a community of documents

        Returns a dict with the most frequent ``kg_primary_domain``, the most
        frequent ``regulation_type`` and the first 50 characters of the first
        available ``about`` text; placeholder values are used where nothing is
        available or an error occurs.
        """
        try:
            # Collect domains, regulation types, and key terms
            domains = []
            reg_types = []
            topics = []

            for doc in docs:
                record = doc['record']

                # Domain
                if record.get('kg_primary_domain'):
                    domains.append(record['kg_primary_domain'])

                # Regulation type
                if record.get('regulation_type'):
                    reg_types.append(record['regulation_type'])

                # Topic from "about"
                if record.get('about'):
                    topics.append(record['about'][:50])

            # Find most common elements
            most_common_domain = Counter(domains).most_common(1)
            most_common_type = Counter(reg_types).most_common(1)

            return {
                'primary_domain': most_common_domain[0][0] if most_common_domain else 'Mixed',
                'regulation_type': most_common_type[0][0] if most_common_type else 'Various',
                'topic_sample': topics[0] if topics else 'Legal documents'
            }

        except Exception:
            return {'primary_domain': 'Unknown', 'regulation_type': 'Various', 'topic_sample': 'Legal documents'}

    def _generate_community_summary(self, themes):
        """Generate human-readable summary of communities"""
        try:
            if not themes:
                return "No distinct thematic communities detected."

            summary_parts = [f"Detected {len(themes)} major thematic clusters:"]

            for comm_id, theme_data in themes.items():
                size = theme_data['size']
                theme = theme_data['theme']

                summary_parts.append(
                    f"\n‚Ä¢ **Cluster {comm_id + 1}** ({size} docs): "
                    f"{theme['primary_domain']} - {theme['regulation_type']} "
                    f"({theme['topic_sample']}...)"
                )

            return " ".join(summary_parts)

        except Exception:
            return "Community analysis completed but summary generation failed."

    def boost_community_documents(self, candidates, community_analysis):
        """Boost scores of documents in the largest community

        Adds 0.10 (capped at 1.0) to ``final_consensus_score`` and
        ``composite_score`` where present, and tags the boosted candidates
        with ``in_main_community`` and ``community_id``. Candidates are
        modified in place and also returned; without a usable analysis they
        are returned untouched.
        """
        try:
            if not community_analysis or not community_analysis.get('largest_communities'):
                return candidates

            # Get largest community ID
            largest_comm_id = community_analysis['largest_communities'][0][0]
            assignments = community_analysis.get('communities', {})

            boost_amount = 0.10  # 10% boost for being in main community

            for candidate in candidates:
                doc_id = candidate['record']['global_id']

                if doc_id not in assignments or assignments[doc_id] != largest_comm_id:
                    continue

                # Apply boost to whichever score fields this candidate carries
                for score_key in ('final_consensus_score', 'composite_score'):
                    if score_key in candidate:
                        candidate[score_key] = min(1.0, candidate[score_key] + boost_amount)

                candidate['in_main_community'] = True
                candidate['community_id'] = largest_comm_id

            print(f"‚úÖ Boosted documents in community {largest_comm_id}")

            return candidates

        except Exception as e:
            print(f"Error boosting community documents: {e}")
            return candidates


# ============================================================================
# Automated Context Management (No Hard-Coded Keywords)
# ============================================================================

class AutomatedConversationContextManager:
    """
    Conversation context tracker driven by semantic similarity and regulation
    references extracted by the knowledge graph. No hard-coded keyword lists.

    State kept between exchanges:
      * ``primary_regulation`` / ``regulation_confidence`` - the regulation the
        conversation is currently about and how confident the extraction was.
      * ``active_regulations`` / ``active_entities`` - everything mentioned
        since the last topic change.
      * ``last_query_embedding`` / ``recent_topic_embeddings`` - embeddings of
        the latest queries, used to detect topic shifts.
      * ``exchange_count`` - number of queries passed to ``update_from_query``.
    """

    # Minimum extraction confidence for a regulation reference to be treated
    # as an explicit mention (and promoted to primary regulation).
    _MIN_REGULATION_CONFIDENCE = 0.5
    # Query expansion and the search filter stop reusing the primary
    # regulation once it is more than this many exchanges old.
    _CONTEXT_REUSE_MAX_AGE = 3

    def __init__(self, knowledge_graph_instance=None):
        self.active_regulations = []
        self.active_entities = []
        self.exchange_count = 0
        self.primary_regulation = None
        self.regulation_confidence = 0.0
        self.kg = knowledge_graph_instance

        # Semantic tracking for automated detection
        self.last_query_embedding = None
        self.recent_topic_embeddings = []
        self.max_topic_history = 3

        # Thresholds (adjustable)
        self.topic_shift_threshold = 0.65  # Cosine similarity threshold
        self.context_age_limit = 4  # Exchanges before context expires

    def set_knowledge_graph(self, kg_instance):
        """Set KG instance after initialization"""
        self.kg = kg_instance

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _to_flat_vector(embedding):
        """Return ``embedding`` as a 1-D float32 CPU tensor."""
        if not isinstance(embedding, torch.Tensor):
            embedding = torch.tensor(embedding)
        return embedding.detach().to(device='cpu', dtype=torch.float32).reshape(-1)

    def _calculate_semantic_similarity(self, emb1, emb2):
        """Calculate cosine similarity between two embeddings.

        Both inputs may be tensors or anything ``torch.tensor`` accepts, of
        any shape: they are flattened first, so a ``(dim,)`` vector and a
        ``(1, dim)`` batch-of-one are handled the same way.

        Returns:
            The cosine similarity as a float, or 0.0 when either input is
            ``None`` or the computation fails (e.g. different lengths).
        """
        try:
            if emb1 is None or emb2 is None:
                return 0.0

            vec1 = self._to_flat_vector(emb1)
            vec2 = self._to_flat_vector(emb2)
            return F.cosine_similarity(vec1, vec2, dim=0).item()

        except Exception as e:
            print(f"Error calculating similarity: {e}")
            return 0.0

    def _extract_regulation_refs(self, query):
        """Return the KG's regulation references for ``query`` ([] without a KG)."""
        if not self.kg:
            return []
        return self.kg.extract_regulation_references_with_confidence(query) or []

    def _mentions_different_regulation(self, regulation_refs):
        """True when the best reference confidently names a regulation other
        than the current primary one (compared by type and number)."""
        if not regulation_refs:
            return False
        best_ref = regulation_refs[0]
        if best_ref['confidence'] < self._MIN_REGULATION_CONFIDENCE:
            return False
        if not self.primary_regulation:
            return False

        mentioned = best_ref['regulation']
        same_type = mentioned['type'] == self.primary_regulation['type']
        same_number = mentioned['number'] == self.primary_regulation['number']
        return not (same_type and same_number)

    def _primary_regulation_age(self):
        """Exchanges elapsed since the primary regulation was last mentioned."""
        return self.exchange_count - self.primary_regulation.get('mentioned_in_exchange', 0)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def detect_query_type(self, query, query_embedding=None):
        """
        AUTOMATED: Detect if query is new topic or follow-up.
        Uses ONLY semantic similarity and entity extraction.
        NO hard-coded keyword lists.

        Does not modify the stored context.

        Returns: 'new_query' or 'followup'
        """

        # Check 1: Explicit different regulation mentioned
        if self._mentions_different_regulation(self._extract_regulation_refs(query)):
            print(f"üÜï Different regulation detected ‚Üí NEW QUERY")
            return 'new_query'

        # Check 2: Semantic similarity (primary automated method)
        if query_embedding is not None and self.last_query_embedding is not None:
            similarity = self._calculate_semantic_similarity(
                query_embedding,
                self.last_query_embedding
            )

            print(f"üìä Semantic similarity: {similarity:.3f}")

            # Low similarity = topic change
            if similarity < self.topic_shift_threshold:
                print(f"üîÑ Semantic topic shift detected ‚Üí NEW QUERY")
                return 'new_query'

            # Check with recent history (needs at least two stored queries)
            if len(self.recent_topic_embeddings) > 1:
                avg_similarity = np.mean([
                    self._calculate_semantic_similarity(query_embedding, topic_emb)
                    for topic_emb in self.recent_topic_embeddings
                ])

                print(f"üìä Avg similarity with history: {avg_similarity:.3f}")

                if avg_similarity < self.topic_shift_threshold:
                    print(f"üîÑ Query diverges from topic history ‚Üí NEW QUERY")
                    return 'new_query'

        # Check 3: Context age. With a fresh-enough primary regulation the
        # query is a follow-up; without any primary regulation it is new.
        if self.primary_regulation:
            age = self._primary_regulation_age()
            if age > self.context_age_limit:
                print(f"‚è∞ Context too old ({age} exchanges) ‚Üí NEW QUERY")
                return 'new_query'
            return 'followup'

        return 'new_query'

    def update_from_query(self, query, query_entities, query_embedding=None):
        """Update context with automated topic change detection.

        Args:
            query: Raw user query text.
            query_entities: Iterable of ``(entity, entity_type)`` pairs
                (``None`` is treated as empty).
            query_embedding: Optional embedding of ``query``.
        """
        # The KG extraction is needed both for topic detection and for the
        # context update below, so run it once.
        regulation_refs = self._extract_regulation_refs(query)

        # Detect if new topic BEFORE updating
        # Check 1: Different regulation
        is_new_topic = self._mentions_different_regulation(regulation_refs)

        # Check 2: Semantic similarity
        if (not is_new_topic
                and query_embedding is not None
                and self.last_query_embedding is not None):
            similarity = self._calculate_semantic_similarity(
                query_embedding,
                self.last_query_embedding
            )
            is_new_topic = similarity < self.topic_shift_threshold

        # Check 3: Context age
        if not is_new_topic and self.primary_regulation:
            is_new_topic = self._primary_regulation_age() > self.context_age_limit

        # Increment exchange count
        self.exchange_count += 1

        # Clear context if new topic
        if is_new_topic:
            print(f"‚ú® CLEARING CONTEXT for new topic")
            self.active_regulations = []
            self.active_entities = []
            self.primary_regulation = None
            self.regulation_confidence = 0.0
            self.recent_topic_embeddings = []

        # Update embeddings (keep only the last ``max_topic_history``)
        if query_embedding is not None:
            self.last_query_embedding = query_embedding
            self.recent_topic_embeddings.append(query_embedding)
            if len(self.recent_topic_embeddings) > self.max_topic_history:
                self.recent_topic_embeddings.pop(0)

        # Add entities that are not tracked yet
        known_entities = [tracked[0] for tracked in self.active_entities]
        for entity, entity_type in (query_entities or []):
            if entity not in known_entities:
                self.active_entities.append((entity, entity_type, self.exchange_count))
                known_entities.append(entity)

        # Record the best regulation reference, if any
        if not regulation_refs:
            return

        best_ref = regulation_refs[0]
        confidence = best_ref['confidence']
        regulation = best_ref['regulation']

        if confidence >= self._MIN_REGULATION_CONFIDENCE:
            self.primary_regulation = {
                'type': regulation['type'],
                'number': regulation['number'],
                'year': regulation['year'],
                'mentioned_in_exchange': self.exchange_count,
                'full_text': regulation['full_text']
            }
            self.regulation_confidence = confidence
            print(f"‚úÖ Primary regulation: {regulation['type']} {regulation['number']} (confidence: {confidence:.0%})")

        # Add to active regulations
        reg_info = {
            'type': regulation['type'],
            'number': regulation['number'],
            'year': regulation['year'],
            'mentioned_in_exchange': self.exchange_count,
            'confidence': confidence
        }

        for i, active_reg in enumerate(self.active_regulations):
            if (active_reg['type'] == reg_info['type'] and
                    active_reg['number'] == reg_info['number']):
                # Refresh the entry unless the stored one is more confident.
                if confidence >= active_reg.get('confidence', 0):
                    self.active_regulations[i] = reg_info
                break
        else:
            self.active_regulations.append(reg_info)

    def expand_query_with_context(self, query):
        """SMART: Only expand if genuinely needed.

        Appends ``(mengacu pada <regulation>)`` to short queries (fewer than
        8 words) that do not already name the primary regulation, provided
        the primary regulation is confident and recent enough.
        """
        if (not self.primary_regulation
                or self.regulation_confidence < self._MIN_REGULATION_CONFIDENCE):
            return query

        if self._primary_regulation_age() > self._CONTEXT_REUSE_MAX_AGE:
            return query

        reg_text = f"{self.primary_regulation['type']} {self.primary_regulation['number']}"
        if self.primary_regulation.get('year'):
            reg_text += f" tahun {self.primary_regulation['year']}"

        if reg_text.lower() in query.lower():
            return query

        if len(query.split()) < 8:
            return f"{query} (mengacu pada {reg_text})"

        return query

    def get_regulation_filter(self):
        """Get regulation filter if context is still relevant.

        Returns:
            A dict of string fields describing the primary regulation (plus
            a float ``confidence``), or ``None`` when there is no confident,
            recent primary regulation or an error occurs.
        """
        try:
            if (not self.primary_regulation
                    or self.regulation_confidence < self._MIN_REGULATION_CONFIDENCE):
                return None

            if self._primary_regulation_age() > self._CONTEXT_REUSE_MAX_AGE:
                return None

            return {
                'regulation_type': str(self.primary_regulation.get('type', '')),
                'regulation_number': str(self.primary_regulation.get('number', '')),
                'year': str(self.primary_regulation.get('year', '')),
                'type': str(self.primary_regulation.get('type', '')),
                'number': str(self.primary_regulation.get('number', '')),
                'confidence': float(self.regulation_confidence)
            }

        except Exception as e:
            print(f"Error in get_regulation_filter: {e}")
            return None

    def get_relevant_regulations(self, max_age=3):
        """Get regulations mentioned in recent exchanges.

        Returns:
            Active regulations at most ``max_age`` exchanges old, ordered
            most recently mentioned first; ties are broken by higher
            confidence.
        """
        current_exchange = self.exchange_count

        def age_of(reg):
            return current_exchange - reg.get('mentioned_in_exchange', 0)

        relevant = [reg for reg in self.active_regulations if age_of(reg) <= max_age]

        # Ascending sort: smallest age (most recent) first, then the
        # negated confidence puts the most confident entry first.
        relevant.sort(key=lambda reg: (abs(age_of(reg)), -reg.get('confidence', 0)))

        return relevant

    def clear(self):
        """Clear all context"""
        self.active_regulations = []
        self.active_entities = []
        self.exchange_count = 0
        self.primary_regulation = None
        self.regulation_confidence = 0.0
        self.last_query_embedding = None
        self.recent_topic_embeddings = []

# ============================================================================
# CONVERSATION CONTEXT MANAGER (NEW CLASS)
# ============================================================================

class ConversationContextManager:
    """Manages conversation context across exchanges with AUTOMATIC topic change detection.

    Tracks the regulation the conversation is currently about
    (``primary_regulation`` / ``regulation_confidence``), every regulation and
    entity mentioned since the last topic change, and the embeddings of the
    latest queries (``last_query_embedding`` / ``recent_topics``) that are
    used to spot semantic topic shifts.
    """

    # Minimum extraction confidence for a regulation reference to be treated
    # as an explicit mention (and promoted to primary regulation).
    _MIN_REGULATION_CONFIDENCE = 0.5
    # A primary regulation older than this many exchanges means a new topic.
    _CONTEXT_MAX_AGE = 4
    # Query expansion / search filters stop reusing the primary regulation
    # once it is more than this many exchanges old.
    _CONTEXT_REUSE_MAX_AGE = 3

    def __init__(self, knowledge_graph_instance=None):
        self.active_regulations = []
        self.active_entities = []
        self.topic_keywords = []
        self.last_query_type = None
        self.exchange_count = 0
        self.primary_regulation = None
        self.regulation_confidence = 0.0
        self.kg = knowledge_graph_instance

        # Track semantic topic shifts
        self.last_query_embedding = None
        self.topic_shift_threshold = 0.65  # Cosine similarity threshold for topic change
        self.recent_topics = []  # Embeddings of the most recent queries
        self.max_topic_history = 3

    def set_knowledge_graph(self, kg_instance):
        """Set the knowledge graph instance after initialization"""
        self.kg = kg_instance

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _to_flat_vector(embedding):
        """Return ``embedding`` as a 1-D float32 CPU tensor."""
        if not isinstance(embedding, torch.Tensor):
            embedding = torch.tensor(embedding)
        return embedding.detach().to(device='cpu', dtype=torch.float32).reshape(-1)

    def _calculate_semantic_similarity(self, query1_embedding, query2_embedding):
        """Calculate semantic (cosine) similarity between two query embeddings.

        Inputs may be tensors or anything ``torch.tensor`` accepts, of any
        shape: they are flattened first, so ``(dim,)`` and ``(1, dim)``
        embeddings are handled the same way.

        Returns:
            The cosine similarity as a float, or 0.0 when either input is
            ``None`` or the computation fails (e.g. different lengths).
        """
        try:
            if query1_embedding is None or query2_embedding is None:
                return 0.0

            vec1 = self._to_flat_vector(query1_embedding)
            vec2 = self._to_flat_vector(query2_embedding)
            return F.cosine_similarity(vec1, vec2, dim=0).item()

        except Exception as e:
            print(f"Error calculating semantic similarity: {e}")
            return 0.0

    def _extract_regulation_refs(self, query):
        """Return the KG's regulation references for ``query`` ([] without a KG)."""
        if not self.kg:
            return []
        return self.kg.extract_regulation_references_with_confidence(query) or []

    def _is_same_as_primary(self, regulation):
        """True when ``regulation`` has the primary regulation's type and number."""
        return (regulation['type'] == self.primary_regulation['type']
                and regulation['number'] == self.primary_regulation['number'])

    def _primary_regulation_age(self):
        """Exchanges elapsed since the primary regulation was last mentioned."""
        return self.exchange_count - self.primary_regulation.get('mentioned_in_exchange', 0)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def detect_query_type(self, query, query_embedding=None):
        """Automatic topic detection using regulation mentions and semantic similarity.

        Does not modify the stored context.

        Returns:
            'new_query', 'continuing_discussion' (the query explicitly names
            the current primary regulation) or 'followup'.
        """
        # Step 1: An explicit, confident regulation mention decides directly.
        regulation_refs = self._extract_regulation_refs(query)

        if regulation_refs and regulation_refs[0]['confidence'] >= self._MIN_REGULATION_CONFIDENCE:
            mentioned_reg = regulation_refs[0]['regulation']

            if not self.primary_regulation:
                # First regulation mentioned - treat as new
                print(f"DEBUG: First regulation mention: {mentioned_reg['type']} {mentioned_reg['number']}")
                return 'new_query'

            if self._is_same_as_primary(mentioned_reg):
                print(f"DEBUG: Same regulation as context: {mentioned_reg['type']} {mentioned_reg['number']}")
                return 'continuing_discussion'

            print(f"DEBUG: NEW regulation detected: {mentioned_reg['type']} {mentioned_reg['number']} (different from {self.primary_regulation['type']} {self.primary_regulation['number']})")
            return 'new_query'

        # Step 2: No regulation mentioned - compare with the previous query.
        if query_embedding is not None and self.last_query_embedding is not None:
            semantic_similarity = self._calculate_semantic_similarity(
                query_embedding,
                self.last_query_embedding
            )

            print(f"DEBUG: Semantic similarity with last query: {semantic_similarity:.3f}")

            # Low similarity = topic change
            if semantic_similarity < self.topic_shift_threshold:
                print(f"DEBUG: SEMANTIC topic shift detected (similarity: {semantic_similarity:.3f} < {self.topic_shift_threshold})")
                return 'new_query'

            # Check with recent topic history (needs at least two stored queries)
            if len(self.recent_topics) > 1:
                avg_similarity = np.mean([
                    self._calculate_semantic_similarity(query_embedding, topic_emb)
                    for topic_emb in self.recent_topics
                ])

                print(f"DEBUG: Avg similarity with recent topics: {avg_similarity:.3f}")

                if avg_similarity < self.topic_shift_threshold:
                    print(f"DEBUG: Query diverges from recent topic history")
                    return 'new_query'

        # Step 3: With a primary regulation the query is a follow-up unless
        # the context has become too old.
        if self.primary_regulation:
            age = self._primary_regulation_age()
            if age > self._CONTEXT_MAX_AGE:
                print(f"DEBUG: Context too old ({age} exchanges), treating as new query")
                return 'new_query'
            return 'followup'

        # Step 4: No context to follow up on.
        if self.exchange_count == 0:
            print(f"DEBUG: No context exists, first query")
        else:
            print(f"DEBUG: Default case - treating as new query")
        return 'new_query'

    def update_from_query(self, query, query_entities, query_embedding=None):
        """Update context with automatic topic change detection.

        Args:
            query: Raw user query text.
            query_entities: Iterable of ``(entity, entity_type)`` pairs
                (``None`` is treated as empty).
            query_embedding: Optional embedding of ``query``.
        """
        # The KG extraction feeds both the topic detection and the context
        # update below, so run it once.
        regulation_refs = self._extract_regulation_refs(query)

        # FIRST: Detect if this is a new topic BEFORE updating anything
        is_new_topic = False

        # Check 1: Explicit different regulation
        if (regulation_refs
                and regulation_refs[0]['confidence'] >= self._MIN_REGULATION_CONFIDENCE
                and self.primary_regulation
                and not self._is_same_as_primary(regulation_refs[0]['regulation'])):
            print(f"DEBUG: Different regulation mentioned - NEW TOPIC")
            is_new_topic = True

        # Check 2: Semantic similarity (if we have history)
        if (not is_new_topic
                and query_embedding is not None
                and self.last_query_embedding is not None):
            semantic_similarity = self._calculate_semantic_similarity(
                query_embedding,
                self.last_query_embedding
            )

            if semantic_similarity < self.topic_shift_threshold:
                print(f"DEBUG: Low semantic similarity ({semantic_similarity:.3f}) - NEW TOPIC")
                is_new_topic = True

        # Check 3: Context age
        if not is_new_topic and self.primary_regulation:
            age = self._primary_regulation_age()
            if age > self._CONTEXT_MAX_AGE:
                print(f"DEBUG: Context too old ({age} exchanges) - NEW TOPIC")
                is_new_topic = True

        # NOW increment exchange count
        self.exchange_count += 1

        # Clear context if new topic detected
        if is_new_topic:
            print(f"DEBUG: ‚ú® CLEARING CONTEXT for new topic")
            self.active_regulations = []
            self.active_entities = []
            self.primary_regulation = None
            self.regulation_confidence = 0.0
            self.recent_topics = []

        # Update query embedding history (keep last ``max_topic_history``)
        if query_embedding is not None:
            self.last_query_embedding = query_embedding
            self.recent_topics.append(query_embedding)
            if len(self.recent_topics) > self.max_topic_history:
                self.recent_topics.pop(0)

        # Add entities that are not tracked yet
        known_entities = [tracked[0] for tracked in self.active_entities]
        for entity, entity_type in (query_entities or []):
            if entity not in known_entities:
                self.active_entities.append((entity, entity_type, self.exchange_count))
                known_entities.append(entity)

        # Record the best regulation reference, if any
        if not regulation_refs:
            return

        best_ref = regulation_refs[0]
        confidence = best_ref['confidence']
        regulation = best_ref['regulation']

        # Update primary regulation if confidence is high enough
        if confidence >= self._MIN_REGULATION_CONFIDENCE:
            self.primary_regulation = {
                'type': regulation['type'],
                'number': regulation['number'],
                'year': regulation['year'],
                'mentioned_in_exchange': self.exchange_count,
                'full_text': regulation['full_text']
            }
            self.regulation_confidence = confidence
            print(f"DEBUG: ‚úÖ Set primary regulation: {regulation['type']} {regulation['number']} (confidence: {confidence:.0%})")

        # Add to active regulations
        reg_info = {
            'type': regulation['type'],
            'number': regulation['number'],
            'year': regulation['year'],
            'mentioned_in_exchange': self.exchange_count,
            'confidence': confidence
        }

        for i, active_reg in enumerate(self.active_regulations):
            if (active_reg['type'] == reg_info['type'] and
                    active_reg['number'] == reg_info['number']):
                # Refresh the entry unless the stored one is more confident.
                if confidence >= active_reg.get('confidence', 0):
                    self.active_regulations[i] = reg_info
                break
        else:
            self.active_regulations.append(reg_info)

    def expand_query_with_context(self, query):
        """SMART: Only expand if genuinely needed (not for new topics).

        Appends ``(mengacu pada <regulation>)`` to short queries (fewer than
        8 words) that do not already name the primary regulation, provided
        the primary regulation is confident and recent enough.
        """
        if (not self.primary_regulation
                or self.regulation_confidence < self._MIN_REGULATION_CONFIDENCE):
            return query

        # Too old, don't expand
        if self._primary_regulation_age() > self._CONTEXT_REUSE_MAX_AGE:
            return query

        # Build regulation reference
        reg_text = f"{self.primary_regulation['type']} {self.primary_regulation['number']}"
        if self.primary_regulation.get('year'):
            reg_text += f" tahun {self.primary_regulation['year']}"

        # Already explicit, no need to expand
        if reg_text.lower() in query.lower():
            return query

        # Only expand for genuinely short/ambiguous queries
        if len(query.split()) < 8:
            return f"{query} (mengacu pada {reg_text})"

        return query

    def get_regulation_filter(self):
        """Get regulation filter for search - only if context is still relevant.

        Returns:
            A dict of string fields describing the primary regulation (plus
            a float ``confidence``), or ``None`` when there is no confident,
            recent primary regulation, its number is empty, or an error
            occurs.
        """
        try:
            if (not self.primary_regulation
                    or self.regulation_confidence < self._MIN_REGULATION_CONFIDENCE):
                return None

            # Check age - don't use if too old
            age = self._primary_regulation_age()
            if age > self._CONTEXT_REUSE_MAX_AGE:
                print(f"DEBUG: Primary regulation too old for filter (age: {age})")
                return None

            reg_filter = {
                'regulation_type': str(self.primary_regulation.get('type', '')),
                'regulation_number': str(self.primary_regulation.get('number', '')),
                'year': str(self.primary_regulation.get('year', '')),
                'type': str(self.primary_regulation.get('type', '')),
                'number': str(self.primary_regulation.get('number', '')),
                'confidence': float(self.regulation_confidence)
            }

            if not reg_filter['number']:
                return None

            print(f"DEBUG: Using regulation filter: {reg_filter['type']} {reg_filter['number']}")
            return reg_filter

        except Exception as e:
            print(f"Error in get_regulation_filter: {e}")
            return None

    def update_from_results(self, results):
        """Update context from search results.

        Only the first three results are considered. Each result's
        ``record`` contributes a regulation entry (confidence 0.8) to
        ``active_regulations``; the primary regulation is set when there is
        none yet, or replaced when an already-known regulation comes back
        with a higher confidence than the current one. Errors are printed
        and the offending result is skipped.
        """
        if not results:
            return

        try:
            for result in results[:3]:
                try:
                    record = result.get('record', {})

                    if not record:
                        continue

                    reg_type = str(record.get('regulation_type', ''))
                    reg_number = str(record.get('regulation_number', ''))
                    reg_year = str(record.get('year', ''))

                    if not reg_type or not reg_number:
                        continue

                    # Normalize regulation type: the first entry of the first
                    # pattern group with a match, else the lower-cased type.
                    reg_type_lower = reg_type.lower()
                    normalized_type = next(
                        (patterns[0]
                         for patterns in REGULATION_TYPE_PATTERNS.values()
                         if any(p in reg_type_lower for p in patterns)),
                        None,
                    ) or reg_type_lower

                    reg_info = {
                        'type': normalized_type,
                        'number': reg_number,
                        'year': reg_year,
                        'about': str(record.get('about', '')),
                        'mentioned_in_exchange': self.exchange_count,
                        'confidence': 0.8
                    }

                    # Check if already exists
                    existing = False
                    for i, active_reg in enumerate(self.active_regulations):
                        try:
                            if (str(active_reg.get('type', '')) == reg_info['type'] and
                                    str(active_reg.get('number', '')) == reg_info['number'] and
                                    str(active_reg.get('year', '')) == reg_info['year']):

                                self.active_regulations[i] = reg_info
                                existing = True

                                if not self.primary_regulation or reg_info['confidence'] > self.regulation_confidence:
                                    self.primary_regulation = reg_info.copy()
                                    self.regulation_confidence = reg_info['confidence']

                                break
                        except Exception as e:
                            print(f"Error comparing regulations: {e}")
                            continue

                    if not existing:
                        self.active_regulations.append(reg_info)

                        if not self.primary_regulation:
                            self.primary_regulation = reg_info.copy()
                            self.regulation_confidence = reg_info['confidence']

                except Exception as e:
                    print(f"Error processing result for context: {e}")
                    continue

        except Exception as e:
            print(f"Error in update_from_results: {e}")

    def get_relevant_regulations(self, max_age=3):
        """Get regulations mentioned in recent exchanges.

        Returns:
            Active regulations at most ``max_age`` exchanges old, ordered
            most recently mentioned first; ties are broken by higher
            confidence.
        """
        current_exchange = self.exchange_count

        def age_of(reg):
            return current_exchange - reg.get('mentioned_in_exchange', 0)

        relevant = [reg for reg in self.active_regulations if age_of(reg) <= max_age]

        # Ascending sort: smallest age (most recent) first, then the
        # negated confidence puts the most confident entry first.
        relevant.sort(key=lambda reg: (abs(age_of(reg)), -reg.get('confidence', 0)))

        return relevant

    def get_primary_regulation_filter(self):
        """Get the primary regulation for follow-up queries.

        NOTE: this getter has a side effect. When the primary regulation is
        more than 3 exchanges old, every call halves
        ``regulation_confidence`` and drops the primary regulation once the
        confidence falls below 0.3.

        Returns:
            ``(primary_regulation, regulation_confidence)`` or
            ``(None, 0.0)`` when there is no primary regulation (any more).
        """
        if not self.primary_regulation:
            return None, 0.0

        if self._primary_regulation_age() > self._CONTEXT_REUSE_MAX_AGE:
            self.regulation_confidence *= 0.5
            if self.regulation_confidence < 0.3:
                self.primary_regulation = None
                return None, 0.0

        return self.primary_regulation, self.regulation_confidence

    def clear(self):
        """Clear conversation context"""
        self.active_regulations = []
        self.active_entities = []
        self.topic_keywords = []
        self.last_query_type = None
        self.exchange_count = 0
        self.primary_regulation = None
        self.regulation_confidence = 0.0
        self.last_query_embedding = None
        self.recent_topics = []

# =============================================================================
# CONVERSATION MANAGER (UNCHANGED)
# =============================================================================

class KGEnhancedConversationManager:
    """Conversation state for the KG-enhanced legal RAG chat.

    Three stores are fed by :meth:`add_to_history`:

    * ``conversation_history`` - one dict per exchange (query, response,
      thinking, source details, optional research log). It is not trimmed
      here and is what the export helpers receive.
    * ``context_memory`` - short regulation summaries built from the top
      three results of each exchange, capped at the 15 most recent.
    * ``entity_memory`` - entities taken from each query, capped at the 30
      most recent.
    """

    def __init__(self, llm_generator, knowledge_graph):
        """Store the LLM generator and knowledge graph and start with empty state."""
        self.llm_generator = llm_generator
        self.kg = knowledge_graph
        self.conversation_history = []
        self.context_memory = []
        self.entity_memory = []
        self.search_context_manager = None

    def set_search_context_manager(self, context_manager):
        """Link to search engine's context manager"""
        self.search_context_manager = context_manager

    def add_to_history(self, query, rag_result):
        """Record one exchange and refresh the context/entity memories.

        Args:
            query: The user's question.
            rag_result: Result dict of the RAG pipeline. All keys are
                optional; ``llm_response``, ``results`` and ``query_entities``
                may also be ``None``.

        Any unexpected error is printed and swallowed so that a malformed
        result never breaks the chat flow.
        """
        try:
            # `or` guards: a key that is present but None must not abort the
            # whole exchange (the pipeline's success path carries no
            # llm_response until the caller fills it in).
            query_entities = rag_result.get('query_entities') or []
            llm_response = rag_result.get('llm_response') or {}
            results = rag_result.get('results') or []

            # Create history entry
            history_entry = {
                'query': query,
                'query_type': rag_result.get('query_type', 'general'),
                'query_entities': query_entities,
                'response': llm_response.get('response', ''),
                'thinking': llm_response.get('thinking', ''),
                'sources': len(results),
                'research_rounds': rag_result.get('research_rounds', 0),
                'kg_enhanced': rag_result.get('kg_enhanced', False),
                'timestamp': datetime.now().isoformat(),
                'sources_used': []  # For export functionality
            }

            # Extract and save source details (top 10 for export)
            for result in results[:10]:
                try:
                    record = result['record']
                    history_entry['sources_used'].append({
                        'regulation_type': record.get('regulation_type', ''),
                        'regulation_number': record.get('regulation_number', ''),
                        'year': record.get('year', ''),
                        'about': record.get('about', ''),
                        'enacting_body': record.get('enacting_body', ''),
                        'content': record.get('content', '')[:1000],
                        'final_score': result.get('final_score', 0),
                        'kg_score': result.get('kg_score', 0),
                        'kg_primary_domain': record.get('kg_primary_domain', ''),
                        'kg_hierarchy_level': record.get('kg_hierarchy_level', 0),
                        'team_consensus': result.get('team_consensus', False),
                        'researcher_agreement': result.get('researcher_agreement', 0)
                    })
                except Exception as e:
                    # Best effort: one bad source must not lose the exchange.
                    print(f"Error saving source: {e}")
                    continue

            # Save research log if available
            phase_metadata = rag_result.get('all_retrieved_metadata')
            if phase_metadata:
                history_entry['research_log'] = {
                    'team_members': [p['name'] for p in RESEARCH_TEAM_PERSONAS.values()],
                    'phase_results': phase_metadata,
                    'total_documents_retrieved': sum(
                        len(phase.get('candidates', []))
                        for phase in phase_metadata.values()
                    )
                }

            # Add to conversation history
            self.conversation_history.append(history_entry)

            # Build context memory (for follow-up queries) - keep top 3
            for result in results[:3]:
                try:
                    record = result['record']
                    content = record['content']
                    snippet = content[:200] + "..." if len(content) > 200 else content
                    self.context_memory.append({
                        'regulation': f"{record['regulation_type']} No. {record['regulation_number']}/{record['year']}",
                        'about': record['about'],
                        'content_snippet': snippet
                    })
                except Exception:
                    # Records lacking the core fields are left out of memory.
                    continue

            # Update entity memory
            self.entity_memory.extend(query_entities)
            if len(self.entity_memory) > 30:
                self.entity_memory = self.entity_memory[-30:]

            # Trim context memory
            if len(self.context_memory) > 15:
                self.context_memory = self.context_memory[-15:]

        except Exception as e:
            print(f"Error adding to history: {e}")

    def get_conversation_context(self):
        """Build the textual recap of the last three exchanges for the prompt.

        Returns:
            A newline-joined string with the recent questions, up to three
            entities per question, a 100-character thinking preview, a
            150-character answer preview and, if any, the distinct entities
            among the last ten remembered. Returns ``""`` when there is no
            history or when formatting fails.
        """
        try:
            if not self.conversation_history:
                return ""

            context_parts = ["KONTEKS PERCAKAPAN SEBELUMNYA:"]

            recent_history = self.conversation_history[-3:]
            for i, entry in enumerate(recent_history, 1):
                context_parts.append(f"Q{i}: {entry['query']}")

                if entry.get('query_entities'):
                    entities_text = ', '.join(entry['query_entities'][:3])
                    context_parts.append(f"Entitas Q{i}: {entities_text}")

                if entry['thinking']:
                    context_parts.append(f"Thinking{i}: {entry['thinking'][:100]}...")
                response = entry['response']
                response_preview = response[:150] + "..." if len(response) > 150 else response
                context_parts.append(f"A{i}: {response_preview}")
                context_parts.append("")

            if self.entity_memory:
                # dict.fromkeys de-duplicates while keeping first-seen order;
                # set() ordering for strings varies with the hash seed, which
                # made the generated prompt differ between runs.
                unique_entities = list(dict.fromkeys(self.entity_memory[-10:]))
                context_parts.append(f"ENTITAS YANG TELAH DIBAHAS: {', '.join(unique_entities)}")

            return "\n".join(context_parts)
        except Exception as e:
            print(f"Error getting conversation context: {e}")
            return ""

    def generate_followup_response(self, query, query_type, config):
        """Prepare tokenized model input for a follow-up question.

        Args:
            query: The follow-up question.
            query_type: Query type label, passed through into the metadata.
            config: Accepted for interface compatibility; not read here.

        Returns:
            ``(input_ids, metadata)``. ``input_ids`` is moved to the LLM's
            device. On failure a minimal prompt is tokenized instead and the
            metadata carries an ``error`` message.
        """
        try:
            current_entities = [entity for entity, _ in self.kg.extract_entities_from_text(query)]

            conversation_context = self.get_conversation_context()

            memory_context = ""
            if self.context_memory:
                memory_context = "DOKUMEN YANG TELAH DIBAHAS:\n"
                for mem in self.context_memory[-5:]:
                    memory_context += f"- {mem['regulation']}: {mem['about']}\n"

            user_prompt = f"""{conversation_context}
{memory_context}

PERTANYAAN BARU: {query}

Berikan jawaban berdasarkan konteks percakapan dan hubungan antar konsep hukum yang relevan."""

            kg_enhanced_system_prompt = SYSTEM_PROMPT + " Manfaatkan hubungan semantik berdasarkan konteks percakapan sebelumnya."

            input_ids = self.llm_generator.tokenizer.apply_chat_template([
                {'role': 'system', 'content': kg_enhanced_system_prompt},
                {'role': 'user', 'content': user_prompt}
            ], tokenize=True, add_generation_prompt=True, return_tensors='pt').to(self.llm_generator.model.device)

            return input_ids, {
                'query_type': query_type,
                'query_entities': current_entities,
                'is_followup': True,
                'kg_enhanced': True,
                'context_used': True
            }
        except Exception as e:
            print(f"Error in followup generation: {e}")
            # Fallback: plain prompt without chat template or history.
            simple_prompt = f"Pertanyaan lanjutan: {query}"
            input_ids = self.llm_generator.tokenizer([simple_prompt], return_tensors='pt')['input_ids'].to(self.llm_generator.model.device)
            return input_ids, {'query_type': query_type, 'error': str(e)}

    def prepare_export_data(self):
        """Return the data handed to the exporters (the history list itself)."""
        # Everything needed for export is already stored by add_to_history,
        # so the history is returned directly (not copied).
        return self.conversation_history

    def export_to_markdown(self, include_metadata=True, include_research_process=True):
        """Export conversation to Markdown format"""
        export_data = self.prepare_export_data()
        return export_conversation_to_markdown(export_data, include_metadata, include_research_process)

    def export_to_json(self, include_full_content=True):
        """Export conversation to JSON format"""
        export_data = self.prepare_export_data()
        return export_conversation_to_json(export_data, include_full_content)

    def export_to_html(self, include_metadata=True):
        """Export conversation to HTML format"""
        export_data = self.prepare_export_data()
        return export_conversation_to_html(export_data, include_metadata)

    def clear_conversation(self):
        """Drop history, context memory and entity memory, then call clear_cache()."""
        try:
            self.conversation_history = []
            self.context_memory = []
            self.entity_memory = []
            clear_cache()
        except Exception as e:
            print(f"Error clearing conversation: {e}")

# =============================================================================
# INITIALIZATION FUNCTIONS (UPDATED FOR NEW DATASET)
# =============================================================================

def initialize_models_and_data(progress_callback=None) -> bool:
    """Load all models and the enhanced KG dataset and wire up the pipeline.

    The whole routine runs while holding ``initialization_lock``. If
    ``initialization_complete`` is already set, it returns immediately.
    Otherwise it loads, in order: the embedding model, the reranker model,
    the 4-bit quantized LLM, the KG dataset, then builds the knowledge graph,
    search engine, reranker, LLM generator and conversation manager, storing
    each of them in module-level globals.

    Args:
        progress_callback: Optional callable taking one status string; it is
            invoked at each major step and on failure.

    Returns:
        True when the system is (or already was) initialized, False if any
        step raised. On failure the traceback is printed and
        ``initialization_complete`` is set to False.
    """
    global embedding_model, embedding_tokenizer, reranker_model, reranker_tokenizer
    global llm_model, llm_tokenizer, dataset_loader, search_engine, knowledge_graph
    global reranker, llm_generator, conversation_manager
    global device, EMBEDDING_DIM, token_false_id, token_true_id, prefix_tokens, suffix_tokens
    global initialization_complete
    
    with initialization_lock:
        # Re-entry guard: a second caller waits on the lock, then exits here.
        if initialization_complete:
            if progress_callback:
                progress_callback("System already initialized")
            return True
    
        try:
            if progress_callback:
                progress_callback("Initializing enhanced system...")
            
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            
            if progress_callback:
                progress_callback("Loading embedding model...")
            
            # --- Embedding model (kept on CPU in float32) ---
            embedding_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL, padding_side='left')
            try:
                # NOTE(review): flash_attention_2 presumably cannot be used for
                # a float32 CPU load, so the fallback below is likely the
                # normal path - confirm.
                embedding_model = AutoModel.from_pretrained(
                    EMBEDDING_MODEL, 
                    attn_implementation="flash_attention_2", 
                    dtype=torch.float32,
                    device_map="cpu"
                )
            except Exception:
                # Retry with the default attention implementation.
                embedding_model = AutoModel.from_pretrained(EMBEDDING_MODEL, dtype=torch.float32, device_map="cpu")
            
            embedding_model = embedding_model.eval()
            
            # Get embedding dimension by running one probe input through the model
            def get_embedding_dim():
                """Return the hidden size of the last-token embedding, or 768 on error."""
                try:
                    with torch.no_grad():
                        test_input = embedding_tokenizer(["test"], padding=True, truncation=True, 
                                                       max_length=MAX_LENGTH, return_tensors="pt")
                        test_input = {k: v.to('cpu') for k, v in test_input.items()}
                        test_output = embedding_model(**test_input)
                        attention_mask = test_input['attention_mask']
                        last_hidden_states = test_output.last_hidden_state
                        # Index of the last attended token per row (the probe is
                        # a single input, so no padding is involved here).
                        sequence_lengths = attention_mask.sum(dim=1) - 1
                        batch_size = last_hidden_states.shape[0]
                        embedding = last_hidden_states[torch.arange(batch_size, device='cpu'), sequence_lengths]
                        return embedding.shape[1]
                except Exception as e:
                    print(f"Error getting embedding dim: {e}")
                    # NOTE(review): hard-coded fallback; confirm it matches the
                    # configured embedding model's dimension.
                    return 768
            
            EMBEDDING_DIM = get_embedding_dim()
            
            if progress_callback:
                progress_callback("Loading reranker model...")
            
            # --- Reranker model (kept on CPU in float32), same load/fallback pattern ---
            reranker_tokenizer = AutoTokenizer.from_pretrained(RERANKER_MODEL, padding_side='left')
            try:
                reranker_model = AutoModelForCausalLM.from_pretrained(
                    RERANKER_MODEL,
                    dtype=torch.float32,
                    attn_implementation="flash_attention_2",
                    device_map="cpu"
                )
            except Exception:
                reranker_model = AutoModelForCausalLM.from_pretrained(RERANKER_MODEL, dtype=torch.float32, device_map="cpu")
            
            reranker_model = reranker_model.eval()
            
            # The reranker is prompted to answer "yes"/"no"; the ids of those
            # two tokens and the fixed prompt prefix/suffix token lists are
            # handed to EnhancedKGReranker further down.
            try:
                token_false_id = reranker_tokenizer.convert_tokens_to_ids("no")
                token_true_id = reranker_tokenizer.convert_tokens_to_ids("yes")
                prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
                suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
                prefix_tokens = reranker_tokenizer.encode(prefix, add_special_tokens=False)
                suffix_tokens = reranker_tokenizer.encode(suffix, add_special_tokens=False)
            except Exception as e:
                print(f"Error setting up reranker tokens: {e}")
                # NOTE(review): placeholder ids 0/1 keep initialization going,
                # but reranker scores are presumably not meaningful with them.
                token_false_id = 0
                token_true_id = 1
                prefix_tokens = []
                suffix_tokens = []
            
            if progress_callback:
                progress_callback("Loading LLM model on GPU with 4-bit quantization...")

            # 4-bit quantization config (NF4, double quantization, fp16 compute)
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16
            )
            
            # --- LLM: no CPU fallback here; a failure aborts initialization ---
            llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
            llm_model = AutoModelForCausalLM.from_pretrained(
                LLM_MODEL,
                quantization_config=bnb_config,
                device_map="cuda",  # NOTE(review): "cuda" names a single device; "auto" would be the setting that distributes across GPUs - confirm intent
                dtype=torch.float16,
                low_cpu_mem_usage=True
            )
            
            if progress_callback:
                progress_callback("üìä Loading ENHANCED KG dataset...")
            
            dataset_loader = EnhancedKGDatasetLoader(DATASET_NAME, EMBEDDING_DIM)
            if not dataset_loader.load_from_huggingface(progress_callback):
                raise Exception("Failed to load enhanced KG dataset")
            
            if progress_callback:
                progress_callback("üóÉÔ∏è Building enhanced search indexes...")
            
            knowledge_graph = EnhancedKnowledgeGraph(dataset_loader)
            
            # Search engine over the loaded records and their embeddings
            search_engine = EnhancedKGSearchEngine(
                dataset_loader.all_records,
                dataset_loader.embeddings,
                embedding_model,
                embedding_tokenizer,
                knowledge_graph,
                dataset_loader
            )
            
            # Attach a query analyzer only if the search engine did not create one itself
            if not hasattr(search_engine, 'query_analyzer') or search_engine.query_analyzer is None:
                if progress_callback:
                    progress_callback("üß† Initializing Advanced Query Analyzer...")
                search_engine.query_analyzer = AdvancedQueryAnalyzer(knowledge_graph)
            
            reranker = EnhancedKGReranker(
                reranker_model, reranker_tokenizer, prefix_tokens, suffix_tokens,
                token_true_id, token_false_id, MAX_LENGTH, knowledge_graph
            )
            
            llm_generator = KGEnhancedLLMGenerator(llm_model, llm_tokenizer, knowledge_graph)
            
            if progress_callback:
                progress_callback("üîó Linking context manager with knowledge graph...")
            
            # Ensure context manager has KG reference
            search_engine.context_manager.set_knowledge_graph(knowledge_graph)
            
            # Create conversation manager and share the search engine's context manager with it
            conversation_manager = KGEnhancedConversationManager(llm_generator, knowledge_graph)
            conversation_manager.set_search_context_manager(search_engine.context_manager)
            
            clear_cache()
            
            if progress_callback:
                progress_callback("‚úÖ Enhanced system initialization complete!")
                progress_callback("üìå Embedding model: CPU")
                progress_callback("üìå Reranker model: CPU")
                progress_callback("üìå LLM model: GPU (4-bit)")
            
            # Set last, so the flag only flips after every component exists.
            initialization_complete = True
            return True
            
        except Exception as e:
            # Top-level boundary: report, dump the traceback, and leave the
            # system marked as uninitialized so a later call can retry.
            if progress_callback:
                progress_callback(f"Initialization failed: {str(e)}")
            print(f"Full initialization error: {e}")
            import traceback
            traceback.print_exc()
            initialization_complete = False
            return False

# =============================================================================
# MAIN RAG FUNCTIONS (KEEP ORIGINAL LOGIC)
# =============================================================================

def complete_kg_enhanced_rag_pipeline(query, config, progress_callback=None):
    """Run retrieval for ``query``: hybrid search, research rounds, rerank, select.

    Args:
        query: The user's question.
        config: Pipeline configuration dict, forwarded to the search engine,
            the reranker and ``final_selection_with_kg``.
        progress_callback: Optional callable taking one status string.

    Returns:
        A dict that always contains ``query``, ``query_type``,
        ``query_entities``, ``results``, ``all_retrieved_metadata``,
        ``research_rounds``, ``community_summary`` and ``kg_enhanced``.
        The success path additionally carries ``input_ids_and_metadata``
        (``None``); the "not initialized", "nothing found" and error paths
        carry a ready-made ``llm_response`` instead. This function never
        raises: failures are reported through the returned dict.
    """
    # Local import, matching how the rest of this module pulls in traceback.
    import traceback

    def report(message):
        if progress_callback:
            progress_callback(message)

    if not initialization_complete:
        # Same key set as the error result below so callers can read
        # query_type / query_entities / research_rounds on every path.
        return {
            'query': query,
            'query_type': 'error',
            'query_entities': [],
            'results': [],
            'all_retrieved_metadata': {},
            'research_rounds': 0,
            'community_summary': 'System not initialized',
            'kg_enhanced': False,
            'llm_response': {
                'thinking': 'System belum siap',
                'response': 'Sistem sedang dalam proses inisialisasi.',
                'query_type': 'error',
                'num_sources': 0
            }
        }

    try:
        # Reset per-query phase results left over from the previous query.
        if hasattr(search_engine, 'all_phase_results'):
            search_engine.all_phase_results = {}

        query_type = search_engine._detect_query_type(query)
        query_entities = [entity for entity, _ in knowledge_graph.extract_entities_from_text(query)]

        report(f"Query type detected: {query_type}")

        # Hybrid search: a failure here is tolerated and the research rounds
        # simply start from an empty candidate list.
        try:
            initial_candidates = search_engine.hybrid_search_strategy(
                query, query_type, config, progress_callback
            )
        except Exception as e:
            report(f"Error in hybrid search: {str(e)}")
            traceback.print_exc()
            initial_candidates = []

        report(f"Initial search completed: {len(initial_candidates)} candidates found")

        # The research rounds also return the community summary.
        research_candidates, rounds_conducted, all_phase_metadata, community_summary = search_engine.research_rounds_with_quality_degradation(
            query, query_type, initial_candidates, config, progress_callback
        )

        if not research_candidates:
            return {
                'query': query,
                'query_type': query_type,
                'query_entities': query_entities,
                'results': [],
                'all_retrieved_metadata': all_phase_metadata,
                'research_rounds': rounds_conducted,
                'community_summary': community_summary,
                'kg_enhanced': True,
                'llm_response': {
                    'thinking': 'Tidak ada dokumen yang ditemukan',
                    'response': 'Maaf, tidak ditemukan dokumen hukum yang relevan dengan pertanyaan Anda.',
                    'query_type': query_type,
                    'num_sources': 0,
                    'kg_enhanced': True,
                    'metadata_protected': True
                }
            }

        reranked_candidates = reranker.rerank_with_kg(
            query, research_candidates, query_type, config,
            top_k=min(30, len(research_candidates)),
            progress_callback=progress_callback
        )

        final_results = final_selection_with_kg(reranked_candidates, query_type, config)

        report(f"Final selection completed: {len(final_results)} results")

        return {
            'query': query,
            'query_type': query_type,
            'query_entities': query_entities,
            'results': final_results,
            'all_retrieved_metadata': all_phase_metadata,
            'research_rounds': rounds_conducted,
            'community_summary': community_summary,
            'kg_enhanced': True,
            'input_ids_and_metadata': None
        }
    except Exception as e:
        # Top-level boundary: keep the chat alive, but leave a full traceback
        # in the log (the message alone rarely identifies the failing stage).
        print(f"RAG pipeline error: {e}")
        traceback.print_exc()
        return {
            'query': query,
            'query_type': 'error',
            'query_entities': [],
            'results': [],
            'all_retrieved_metadata': {},
            'research_rounds': 0,
            'community_summary': 'Error in processing',
            'kg_enhanced': False,
            'llm_response': {
                'thinking': f'Error: {str(e)}',
                'response': 'Terjadi kesalahan dalam pemrosesan. Silakan coba lagi.',
                'query_type': 'error',
                'num_sources': 0
            }
        }

def final_selection_with_kg(candidates, query_type, config):
    """Pick the final ``final_top_k`` results, favouring quality then diversity.

    Steps:
      1. Keep candidates whose ``final_score`` reaches a per-query-type
         threshold (0.6 for ``specific_article``/``sanctions``, 0.55 for
         ``definitional``, otherwise 0.5). If too few survive, the threshold
         is relaxed once by 15%.
      2. Walk the survivors in their given order and take a candidate when it
         adds a new regulation type, authority bucket or connectivity bucket
         (or while fewer than half the slots are filled).
      3. Back-fill remaining slots with the not-yet-selected survivors.
      4. Return the selection sorted by ``final_score``, highest first.

    Args:
        candidates: List of candidate dicts, each with a ``record`` dict and
            usually a ``final_score``.
        query_type: Query type label used to choose the threshold.
        config: Dict providing ``final_top_k`` (defaults to 3 if absent).

    Returns:
        A list of at most ``final_top_k`` candidates. On an unexpected error
        the first ``final_top_k`` input candidates are returned unchanged.
    """
    try:
        if not candidates:
            return []

        top_k = config.get('final_top_k', 3)

        quality_threshold = 0.5
        if query_type in ['specific_article', 'sanctions']:
            quality_threshold = 0.6
        elif query_type == 'definitional':
            quality_threshold = 0.55

        def meeting(threshold):
            return [c for c in candidates if c.get('final_score', 0) >= threshold]

        quality_candidates = meeting(quality_threshold)
        if len(quality_candidates) < max(2, top_k // 2):
            quality_threshold *= 0.85
            quality_candidates = meeting(quality_threshold)

        diverse_results = []
        selected_ids = set()  # identity of chosen candidates, for O(1) membership
        seen_regulation_types = set()
        seen_authorities = set()
        seen_kg_clusters = set()

        for candidate in quality_candidates:
            if len(diverse_results) >= top_k:
                break

            try:
                record = candidate['record']
                reg_type = record['regulation_type']
                # KG scores are optional metadata: a missing/None value counts
                # as 0 instead of raising. Previously a missing authority
                # score raised inside this loop, the candidate was skipped
                # silently, and lower-scoring candidates could take its slot.
                authority_level = int((record.get('kg_authority_score') or 0) * 10)
                kg_connectivity = int((record.get('kg_connectivity_score') or 0) * 10)
            except (KeyError, TypeError, ValueError, AttributeError):
                # Malformed candidate: leave it to the back-fill pass below.
                continue

            adds_diversity = (
                reg_type not in seen_regulation_types
                or authority_level not in seen_authorities
                or kg_connectivity not in seen_kg_clusters
            )
            if adds_diversity or len(diverse_results) < top_k // 2:
                diverse_results.append(candidate)
                selected_ids.add(id(candidate))
                seen_regulation_types.add(reg_type)
                seen_authorities.add(authority_level)
                seen_kg_clusters.add(kg_connectivity)

        # Back-fill by object identity rather than deep dict equality
        # (`candidate not in list`), which is O(n) per test and compares
        # every field of every selected candidate.
        for candidate in quality_candidates:
            if len(diverse_results) >= top_k:
                break
            if id(candidate) not in selected_ids:
                diverse_results.append(candidate)
                selected_ids.add(id(candidate))

        diverse_results.sort(key=lambda x: x.get('final_score', 0), reverse=True)
        return diverse_results[:top_k]
    except Exception as e:
        print(f"Final selection error: {e}")
        return candidates[:config.get('final_top_k', 3)]

# =============================================================================
# GRADIO INTERFACE FUNCTIONS (KEEP ALL ORIGINAL + UPDATE STATS)
# =============================================================================

def format_retrieved_metadata(all_metadata, config):
    """Render every retrieved document, grouped by phase and researcher, as Markdown.

    Args:
        all_metadata: Mapping of phase key -> dict with ``phase`` (name),
            optional ``researcher`` (defaults to ``'unknown'``) and
            ``candidates`` (list of candidate dicts).
        config: Accepted for interface compatibility; not read here.

    Returns:
        A Markdown string listing up to five documents per researcher for
        each phase, followed by a retrieval summary. Returns ``""`` for empty
        input and an ``"Error formatting metadata: ..."`` string if the
        metadata is malformed.
    """
    if not all_metadata:
        return ""

    try:
        output = ["## üìö ALL RETRIEVED DOCUMENTS METADATA", ""]

        phase_order = ['initial_scan', 'focused_review', 'deep_analysis', 'verification', 'expert_review']

        # phase name -> researcher -> merged candidate list
        phase_groups = {}
        total_kg_enhanced = 0

        for phase_data in all_metadata.values():
            phase_name = phase_data['phase']
            researcher = phase_data.get('researcher', 'unknown')
            phase_candidates = phase_data['candidates']

            total_kg_enhanced += sum(1 for c in phase_candidates if c.get('kg_score', 0) > 0.3)
            phase_groups.setdefault(phase_name, {}).setdefault(researcher, []).extend(phase_candidates)

        # Known phases in canonical order, then any other phase in first-seen
        # order. Unlisted phases used to be dropped from the listing and the
        # total while still being counted in "Research Phases Used".
        ordered_phases = [name for name in phase_order if name in phase_groups]
        ordered_phases += [name for name in phase_groups if name not in phase_order]

        total_retrieved = 0
        for phase_name in ordered_phases:
            output.append(f"### üîç PHASE: {phase_name.upper()}")
            output.append("")

            for researcher, candidates in phase_groups[phase_name].items():
                if researcher in RESEARCH_TEAM_PERSONAS:
                    researcher_name = RESEARCH_TEAM_PERSONAS[researcher]['name']
                else:
                    researcher_name = researcher
                output.append(f"**{researcher_name}:** {len(candidates)} documents")
                total_retrieved += len(candidates)

                for i, candidate in enumerate(candidates[:5], 1):
                    try:
                        record = candidate['record']
                        score = candidate.get('composite_score', 0)
                        kg_score = candidate.get('kg_score', 0)

                        if kg_score > 0:
                            score_display = f"Score: {score:.3f}, KG: {kg_score:.3f}"
                        else:
                            score_display = f"Score: {score:.3f}"

                        about = record['about']
                        # Ellipsis only when the title was actually cut.
                        about_preview = about[:80] + "..." if len(about) > 80 else about

                        output.append(f"   {i}. **{record['regulation_type']} No. {record['regulation_number']}/{record['year']}** ({score_display})")
                        output.append(f"      About: {about_preview}")
                        output.append("")
                    except Exception:
                        # Best effort: skip entries that cannot be rendered.
                        continue

                if len(candidates) > 5:
                    output.append(f"      ... and {len(candidates) - 5} more documents")
                output.append("")

        output.append("### üìà RETRIEVAL SUMMARY")
        output.append(f"- **Total Documents Retrieved:** {total_retrieved:,}")
        if total_kg_enhanced > 0:
            output.append(f"- **KG-Enhanced Documents:** {total_kg_enhanced:,}")
        output.append(f"- **Research Phases Used:** {len(phase_groups)}")
        if total_retrieved > 0 and total_kg_enhanced > 0:
            output.append(f"- **KG Enhancement Rate:** {total_kg_enhanced/total_retrieved*100:.1f}%")

        return "\n".join(output)
    except Exception as e:
        return f"Error formatting metadata: {e}"

def format_sources_info(results, config):
    """Render the final source documents as a Markdown section.

    Args:
        results: List of result dicts, each with a ``record`` dict plus
            optional score, team-consensus and devil's-advocate fields.
        config: Accepted for interface compatibility; not read here.

    Returns:
        A Markdown string with one "SUMBER n" subsection per result. A result
        that cannot be rendered contributes an "Error formatting source n"
        line instead of aborting the whole output. Returns a fixed message
        when ``results`` is empty.
    """
    if not results:
        return "Tidak ada sumber yang ditemukan."

    try:
        output = [f"## üìñ SUMBER HUKUM UTAMA ({len(results)} dokumen)", ""]

        for i, result in enumerate(results, 1):
            try:
                record = result['record']

                output.append(f"### SUMBER {i}")
                output.append(f"**{record['regulation_type']} No. {record['regulation_number']}/{record['year']}**")
                output.append(f"**Ditetapkan oleh:** {record['enacting_body']}")
                output.append(f"**Tentang:** {record['about']}")

                chapter = record.get('chapter', 'N/A')
                article = record.get('article', 'N/A')
                if chapter != 'N/A' or article != 'N/A':
                    output.append(f"**Referensi:** Bab {chapter} - Pasal {article}")

                # Score line: only the scores that are present on the result
                metadata_parts = []
                if 'final_score' in result:
                    metadata_parts.append(f"Final: {result['final_score']:.3f}")
                if 'enhanced_rerank_score' in result:
                    metadata_parts.append(f"Rerank: {result['enhanced_rerank_score']:.3f}")
                if 'composite_score' in result:
                    metadata_parts.append(f"Search: {result['composite_score']:.3f}")
                if (result.get('kg_score') or 0) > 0:
                    metadata_parts.append(f"KG: {result['kg_score']:.3f}")

                if metadata_parts:
                    output.append(f"**Skor:** {' | '.join(metadata_parts)}")

                # Additional KG metadata
                additional_info = []
                if record.get('kg_primary_domain'):
                    additional_info.append(f"Domain: {record['kg_primary_domain']}")
                # Show the hierarchy level only when the record has one. The
                # old check defaulted a missing level to 0 (which passed the
                # <= 3 test) and then indexed the absent key, so the KeyError
                # replaced the rest of this source with an error line.
                hierarchy_level = record.get('kg_hierarchy_level')
                if hierarchy_level is not None and hierarchy_level <= 3:
                    additional_info.append(f"Hierarchy: Level {hierarchy_level}")
                if record.get('kg_cross_ref_count', 0) > 0:
                    additional_info.append(f"Cross-refs: {record['kg_cross_ref_count']}")
                if record.get('kg_pagerank', 0) > 0:
                    additional_info.append(f"PageRank: {record['kg_pagerank']:.4f}")
                if record.get('kg_connectivity_score', 0) > 0:
                    additional_info.append(f"Connectivity: {record['kg_connectivity_score']:.3f}")

                if additional_info:
                    output.append(f"**Enhanced KG Metadata:** {' | '.join(additional_info)}")

                # Team consensus info
                if result.get('team_consensus', False):
                    consensus_info = "Team Consensus: Yes"
                    if 'researcher_agreement' in result:
                        consensus_info += f" (Agreement: {result['researcher_agreement']})"
                    if 'supporting_researchers' in result:
                        researchers = result['supporting_researchers']
                        # Fall back to the raw researcher key when no persona is registered.
                        researcher_names = [RESEARCH_TEAM_PERSONAS.get(r, {}).get('name', r) for r in researchers[:3]]
                        consensus_info += f" | Researchers: {', '.join(researcher_names)}"
                    output.append(f"**{consensus_info}**")

                # Devil's advocate info (first two challenge points)
                if result.get('devils_advocate_challenged', False):
                    challenge_points = result.get('challenge_points', [])
                    output.append(f"**üîç Challenged by Devil's Advocate:** {'; '.join(challenge_points[:2])}")

                content = record.get('content', '')
                if len(content) > 500:
                    content = content[:500] + "..."

                output.append(f"**Isi:** {content}")
                output.append("")

            except Exception as e:
                # Best effort per source: report and move on to the next one.
                output.append(f"Error formatting source {i}: {e}")
                continue

        return "\n".join(output)
    except Exception as e:
        return f"Error formatting sources: {e}"

# NOTE: chat_with_legal_rag is defined in full in the section below. It is the
# modified version of the chat function, extended with a query-analysis display.

# =============================================================================
# MODIFIED CHAT FUNCTION - Add Query Analysis Display
# =============================================================================

def chat_with_legal_rag(message, history, config_dict, show_thinking=True, show_sources=True, show_metadata=True):
    """Stream a KG-enhanced legal RAG answer for one chat turn.

    This is a generator: every item it yields is a ``(chat_history, "")``
    tuple, where ``chat_history`` is ``history`` plus one
    ``[message, reply_so_far]`` pair and the empty string is the second
    output value (presumably the input textbox -- confirm against the
    Gradio event wiring).

    Stages, each reported to the user as a progress line:
      1. Query analysis via ``search_engine.query_analyzer``.
      2. Intent detection and context update via
         ``search_engine.context_manager`` and ``knowledge_graph``.
      3. Hybrid search, reranking and final selection.
      4. Streamed LLM generation, separating ``<think>...</think>`` content
         from the answer.
      5. Assembly of the final message (collapsible progress, thinking,
         sources and metadata sections) and saving the exchange through
         ``conversation_manager``.

    Args:
        message: The user's question.
        history: Previous chat turns as a list of ``[user, assistant]``
            pairs; ``None`` is treated as an empty history.
        config_dict: Runtime configuration. This function reads
            ``max_new_tokens``, ``temperature``, ``min_p``, ``top_p`` and
            ``top_k`` and forwards the whole dict to the search, rerank,
            selection, generation and formatting helpers.
        show_thinking: Include the model's thinking text in the output.
        show_sources: Append a collapsible section with the selected sources.
        show_metadata: Append a collapsible section with all retrieved
            metadata.

    Yields:
        tuple: ``(chat_history, "")`` after every progress step and every
        streamed chunk, ending with the final formatted message.
    """
    import traceback

    history = history or []

    # This function is a generator, so a plain ``return value`` would never
    # reach the caller; the unchanged history has to be yielded explicitly.
    if not message or not message.strip():
        yield history, ""
        return

    try:
        current_progress = []

        def add_progress(msg):
            """Record one progress line and return the history showing all lines so far."""
            current_progress.append(msg)
            progress_display = "\n".join([f"üîÑ {m}" for m in current_progress])
            return history + [[message, f"**Mencari dan menganalisis...**\n\n{progress_display}"]]

        yield add_progress("üöÄ Memulai analisis query..."), ""

        # ---------------------------------------------------------------
        # Stage 1: query analysis (displayed as progress, stored in result)
        # ---------------------------------------------------------------
        try:
            query_analysis = search_engine.query_analyzer.analyze_query(message)

            yield add_progress(f"üß† Strategy: {query_analysis['search_strategy']} ({query_analysis['confidence']:.0%})"), ""

            if query_analysis['reasoning']:
                yield add_progress(f"üí° {query_analysis['reasoning']}"), ""

            if query_analysis.get('key_phrases'):
                phrases = [p['phrase'] for p in query_analysis['key_phrases']]
                yield add_progress(f"üéØ Key phrases: {', '.join(phrases)}"), ""

            if query_analysis.get('law_name_detected'):
                law_name = query_analysis['specific_entities'][0]['name']
                yield add_progress(f"üìú Law name detected: {law_name}"), ""

        except Exception as e:
            # Any failure here (including a display-only failure) drops the
            # analysis; the pipeline continues without it.
            print(f"Error in query analysis display: {e}")
            query_analysis = None

        # ---------------------------------------------------------------
        # Stage 2: conversation context (intent, entities, regulation filter)
        # ---------------------------------------------------------------
        # Ensure context manager has KG reference
        if not search_engine.context_manager.kg:
            search_engine.context_manager.set_knowledge_graph(knowledge_graph)

        try:
            query_intent = search_engine.context_manager.detect_query_type(message)
            yield add_progress(f"üéØ Query intent: {query_intent}"), ""
        except Exception as e:
            print(f"Error detecting query intent: {e}")
            traceback.print_exc()
            query_intent = 'new_query'

        # Update context BEFORE search
        try:
            query_entities_raw = knowledge_graph.extract_entities_from_text(message)
            search_engine.context_manager.update_from_query(message, query_entities_raw)
        except Exception as e:
            print(f"Error updating context from query: {e}")
            traceback.print_exc()
            query_entities_raw = []

        # NOTE(review): regulation_filter is only reported as a progress line
        # here; it is not passed to the search calls below -- confirm whether
        # it was meant to constrain retrieval.
        regulation_filter = None
        try:
            if query_intent in ['followup', 'pronoun_reference', 'continuing_discussion']:
                regulation_filter, confidence = search_engine.context_manager.get_primary_regulation_filter()
                if regulation_filter and confidence >= 0.5:
                    print(f"DEBUG: regulation_filter = {regulation_filter}")

                    yield add_progress(
                        f"üìå Primary regulation: {regulation_filter.get('regulation_type', 'N/A')} "
                        f"{regulation_filter.get('regulation_number', 'N/A')} "
                        f"(confidence: {confidence:.0%})"
                    ), ""
        except Exception as e:
            print(f"Error getting regulation filter: {e}")
            traceback.print_exc()
            regulation_filter = None

        # ---------------------------------------------------------------
        # Stage 3: retrieval (hybrid search -> rerank -> final selection)
        # ---------------------------------------------------------------
        yield add_progress("üîç Conducting intelligent search..."), ""

        rag_result = None
        # NOTE(review): this dict is never filled in this function, so the
        # generator below always receives an empty mapping; the search
        # metadata lives in rag_result['all_retrieved_metadata'] -- confirm
        # which one the generator is supposed to get.
        all_phase_metadata = {}

        try:
            # Detect the query type once and reuse it for every stage.
            query_type = search_engine._detect_query_type(message)

            # Search/rerank helpers report progress by appending directly to
            # current_progress; those lines show up on the next add_progress.
            initial_candidates = search_engine.hybrid_search_strategy(
                message,
                query_type,
                config_dict,
                current_progress.append
            )
            yield add_progress(f"‚úÖ Initial search completed: {len(initial_candidates)} candidates"), ""

            # Collect metadata
            initial_metadata = dict(getattr(search_engine, 'all_phase_results', {}))

            if initial_candidates:
                yield add_progress(f"‚öñÔ∏è Reranking candidates..."), ""

                reranked_candidates = reranker.rerank_with_kg(
                    message, initial_candidates,
                    query_type,
                    config_dict,
                    top_k=min(30, len(initial_candidates)),
                    progress_callback=current_progress.append
                )

                yield add_progress(f"‚úÖ Reranking completed"), ""

                final_results = final_selection_with_kg(reranked_candidates,
                                                        query_type,
                                                        config_dict)

                yield add_progress(f"üéØ Final selection: {len(final_results)} results"), ""
                research_rounds = 1
            else:
                yield add_progress("‚ö†Ô∏è No suitable candidates found"), ""
                final_results = []
                research_rounds = 0

            rag_result = {
                'query': message,
                'query_type': query_type,
                'query_entities': [e for e, _ in query_entities_raw],
                'results': final_results,
                'all_retrieved_metadata': initial_metadata,
                'research_rounds': research_rounds,
                'kg_enhanced': True,
                'query_analysis': query_analysis
            }

        except Exception as e:
            yield add_progress(f"‚ùå Error in search: {str(e)}"), ""
            traceback.print_exc()

            # Fall back to an empty result so the "no results" reply is shown.
            rag_result = {
                'query': message,
                'query_type': 'general',
                'query_entities': [],
                'results': [],
                'all_retrieved_metadata': {},
                'research_rounds': 0,
                'kg_enhanced': False
            }

        # ---------------------------------------------------------------
        # Stage 4: LLM generation
        # ---------------------------------------------------------------
        yield add_progress("ü§ñ Generating KG-enhanced response..."), ""

        final_progress = "\n".join(current_progress)
        # Collapsible progress log shared by the success, error and no-result replies.
        progress_details = f'<details><summary>üìã <b>Proses Penelitian Selesai (klik untuk melihat)</b></summary>\n\n{final_progress}\n</details>\n\n'

        if rag_result and rag_result['results']:
            try:
                conversation_context = conversation_manager.get_conversation_context()

                input_ids, metadata = llm_generator.generate_with_kg_and_context(
                    message,
                    rag_result['results'],
                    rag_result['query_type'],
                    config_dict,
                    conversation_context,
                    all_phase_metadata
                )

                streamer = TextIteratorStreamer(
                    llm_tokenizer,
                    skip_prompt=True,
                    skip_special_tokens=True
                )

                generate_kwargs = {
                    'input_ids': input_ids,
                    'streamer': streamer,
                    'max_new_tokens': config_dict['max_new_tokens'],
                    'do_sample': config_dict['temperature'] > 0,
                    'temperature': config_dict['temperature'] if config_dict['temperature'] > 0 else 1.0,
                    'min_p': config_dict['min_p'],
                    'top_p': config_dict['top_p'],
                    'top_k': config_dict['top_k']
                }

                # Generation runs in a background thread; this generator
                # consumes the streamer and forwards partial text to the UI.
                thread = Thread(target=llm_model.generate, kwargs=generate_kwargs)
                thread.start()

                thinking_content = []
                final_answer = []
                live_output = []
                # Chunks received before we know whether the model opens with
                # a <think> block; flushed into the answer once we decide it
                # is a direct answer, so its first characters are not lost.
                pending_chunks = []
                in_thinking_block = False
                saw_think_tag = False
                thinking_header_shown = False
                accumulated_text = ''

                for new_text in streamer:
                    accumulated_text += new_text

                    if '<think>' in new_text:
                        in_thinking_block = True
                        saw_think_tag = True
                        new_text = new_text.replace('<think>', '')
                        if not thinking_header_shown and show_thinking:
                            live_output = [f"**Proses Penelitian:**\n\n{final_progress}\n\n---\n\nüß† **Sedang berfikir...**\n"]
                            thinking_header_shown = True

                    if '</think>' in new_text:
                        in_thinking_block = False
                        new_text = new_text.replace('</think>', '')
                        if show_thinking:
                            live_output.append('\n\n-----\n‚úÖ **Jawaban:**\n')

                    if saw_think_tag:
                        if in_thinking_block:
                            thinking_content.append(new_text)
                            if show_thinking:
                                live_output.append(new_text)
                        else:
                            final_answer.append(new_text)
                            live_output.append(new_text)
                    elif len(accumulated_text) > 20:
                        # No <think> within the first 20 characters: treat the
                        # output as a direct answer.
                        if not thinking_header_shown:
                            live_output = [f"**Proses Penelitian:**\n\n{final_progress}\n\n---\n\n‚≠ê **Jawaban langsung:**\n\n"]
                            thinking_header_shown = True
                            final_answer.extend(pending_chunks)
                            live_output.extend(pending_chunks)
                            pending_chunks = []
                        final_answer.append(new_text)
                        live_output.append(new_text)
                    else:
                        # Mode still undecided: buffer the text and show it
                        # under the progress log.
                        pending_chunks.append(new_text)
                        progress_with_generation = final_progress + f"\n\nü§ñ Generating response..."
                        live_output = [f"**Proses Penelitian:**\n\n{progress_with_generation}\n\n{''.join(pending_chunks)}"]

                    yield history + [[message, ''.join(live_output)]], ""

                thread.join()

                # A very short reply (<= 20 characters, no <think>) never
                # leaves the buffering stage; it is still the answer.
                if pending_chunks and not saw_think_tag:
                    final_answer.extend(pending_chunks)

                response_text = ''.join(final_answer).strip()

                # -------------------------------------------------------
                # Stage 5: final message assembly
                # -------------------------------------------------------
                final_output = progress_details

                if thinking_content and show_thinking:
                    thinking_text = ''.join(thinking_content).strip()
                    final_output += (
                        '<details><summary>üß† <b>Proses berfikir (klik untuk melihat)</b></summary>\n\n'
                        + thinking_text +
                        '\n</details>\n\n'
                        + '-----\n‚úÖ **Jawaban:**\n'
                        + response_text
                    )
                else:
                    final_output += f"‚úÖ **Jawaban:**\n{response_text}"

                # Community detection results, read from the research log.
                # NOTE(review): rag_result as built in this function has no
                # 'research_log' key, so this section only renders if another
                # code path adds one -- confirm.
                community_analysis_data = None
                if rag_result.get('research_log') and rag_result['research_log'].get('final_consensus_log'):
                    final_consensus_log = rag_result['research_log']['final_consensus_log']
                    community_analysis_data = final_consensus_log.get('community_analysis')

                if community_analysis_data and isinstance(community_analysis_data, list) and len(community_analysis_data) > 0:
                    final_output += "\n\n---\n\n### üåê Discovered Thematic Clusters\n\n"
                    final_output += "_The research team identified these interconnected legal themes in the retrieved documents:_\n\n"

                    for cluster_idx, cluster_data in enumerate(community_analysis_data, 1):
                        try:
                            size = cluster_data.get('size', 0)

                            final_output += f"**Cluster {cluster_idx}** ({size} documents)\n"

                            # Keywords take precedence over the free-text summary.
                            if 'top_keywords' in cluster_data and cluster_data['top_keywords']:
                                keywords = cluster_data['top_keywords']
                                if isinstance(keywords, list):
                                    keywords_str = ", ".join([f"`{kw}`" for kw in keywords[:8]])  # Top 8 keywords
                                    final_output += f"- **Key Terms:** {keywords_str}\n"

                            elif 'summary' in cluster_data and cluster_data['summary']:
                                summary = str(cluster_data['summary'])
                                final_output += f"- {summary}\n"

                            if 'primary_domain' in cluster_data and cluster_data['primary_domain']:
                                final_output += f"- **Domain:** {cluster_data['primary_domain']}\n"

                            if 'regulation_types' in cluster_data and cluster_data['regulation_types']:
                                reg_types = cluster_data['regulation_types']
                                if isinstance(reg_types, list):
                                    final_output += f"- **Types:** {', '.join(reg_types[:3])}\n"

                            if 'coherence_score' in cluster_data:
                                coherence = float(cluster_data['coherence_score'])
                                final_output += f"- **Coherence:** {coherence:.2%}\n"

                            final_output += "\n"

                        except Exception as e:
                            # A malformed cluster is skipped; lines already
                            # written for it stay in the output.
                            print(f"Error formatting cluster {cluster_idx}: {e}")
                            continue

                    final_output += "_These clusters were automatically discovered using network analysis of cross-references and semantic relationships between legal documents._\n\n"

                # Optional collapsible sections (sources, metadata)
                collapsible_sections = []

                if show_sources and rag_result['results']:
                    sources_info = format_sources_info(rag_result['results'], config_dict)
                    collapsible_sections.append(
                        f'<details><summary>üìñ <b>Sumber Hukum Utama ({len(rag_result["results"])} dokumen)</b></summary>\n\n{sources_info}\n</details>'
                    )

                if show_metadata and rag_result.get('all_retrieved_metadata'):
                    metadata_info = format_retrieved_metadata(rag_result['all_retrieved_metadata'], config_dict)
                    if metadata_info.strip():
                        collapsible_sections.append(
                            f'<details><summary>üìö <b>Semua Metadata Dokumen yang Ditemukan</b></summary>\n\n{metadata_info}\n</details>'
                        )

                if collapsible_sections:
                    final_output += f"\n\n---\n\n" + "\n\n".join(collapsible_sections)

                yield history + [[message, final_output]], ""

                # Persist the exchange so it can be exported later.
                rag_result['llm_response'] = {
                    'thinking': ''.join(thinking_content).strip(),
                    'response': response_text
                }
                conversation_manager.add_to_history(message, rag_result)

                if len(conversation_manager.conversation_history) > 0:
                    print(f"‚úÖ Conversation saved. {len(conversation_manager.conversation_history)} exchanges ready for export.")

            except Exception as e:
                error_output = progress_details
                error_output += f"‚ùå **Error generating response:** {str(e)}\n\n"
                error_output += "Maaf, terjadi kesalahan saat membuat respons. Silakan coba lagi."

                yield history + [[message, error_output]], ""

                traceback.print_exc()

        else:
            final_output = progress_details
            final_output += "‚ùå **Tidak ada hasil ditemukan**\n\n"
            final_output += "Maaf, tidak ditemukan dokumen hukum yang relevan dengan pertanyaan Anda. Silakan coba:\n"
            final_output += "- Menggunakan kata kunci yang berbeda\n"
            final_output += "- Memperjelas pertanyaan Anda\n"
            final_output += "- Menggunakan istilah hukum yang lebih spesifik"

            yield history + [[message, final_output]], ""

            if rag_result:
                rag_result['llm_response'] = {
                    'thinking': 'Tidak ada dokumen yang ditemukan',
                    'response': 'Maaf, tidak ditemukan dokumen hukum yang relevan dengan pertanyaan Anda.'
                }
                conversation_manager.add_to_history(message, rag_result)

    except Exception as e:
        # Last-resort handler: show the error in the chat instead of raising.
        error_msg = f"‚ùå **Terjadi kesalahan sistem:**\n\n{str(e)}\n\n"
        error_msg += "Silakan coba lagi atau hubungi administrator jika masalah berlanjut."
        yield history + [[message, error_msg]], ""

        traceback.print_exc()

def clear_conversation():
    """Reset the chat: wipe the stored conversation and the search context.

    Failures are printed and otherwise ignored.

    Returns:
        tuple: ``([], "")`` in every case -- an empty history and an empty
        string.
    """
    try:
        conversation_manager.clear_conversation()

        # The search engine keeps its own context across turns; reset it
        # too when the engine and its context manager are available.
        if search_engine and hasattr(search_engine, 'context_manager'):
            search_engine.context_manager.clear()
    except Exception as exc:
        print(f"Error clearing conversation: {exc}")

    return [], ""

def get_system_info():
    """Build the Markdown system-information report.

    Returns:
        str: A fixed notice while ``initialization_complete`` is falsy;
        otherwise a Markdown report combining static feature descriptions,
        the configured model paths and the values returned by
        ``dataset_loader.get_statistics()``. If building the report raises,
        an ``"Error getting system info: ..."`` string is returned instead.
    """
    if not initialization_complete:
        return "Sistem belum selesai inisialisasi."

    try:
        stats = dataset_loader.get_statistics()
        # Missing statistics fall back to 0 / False through ``dict.get``.
        return f"""## üìä Enhanced KG Legal RAG System Information

**Enhanced Features:**
- **Realistic Research Team**: 5 distinct researcher personas with unique expertise
- **Query-Specific Assembly**: Optimal team selection based on query type
- **Multi-Stage Process**: Individual ‚Üí Cross-validation ‚Üí Devil's Advocate ‚Üí Consensus
- **Advanced Customization**: Granular control over all search phases

**Research Team Personas:**
- **üë®‚Äç‚öñÔ∏è Senior Legal Researcher**: 15 years exp, authority-focused
- **üë©‚Äç‚öñÔ∏è Junior Legal Researcher**: 3 years exp, comprehensive coverage
- **üìö Knowledge Graph Specialist**: 8 years exp, relationship-focused  
- **‚öñÔ∏è Procedural Law Expert**: 12 years exp, methodical analysis
- **üîç Devil's Advocate**: 10 years exp, critical challenges

**Models:**
- **Embedding:** {EMBEDDING_MODEL}
- **Reranker:** {RERANKER_MODEL}
- **LLM:** {LLM_MODEL}

**Dataset Statistics:**
- **Total Documents:** {stats.get('total_records', 0):,}
- **KG-Enhanced:** {stats.get('kg_enhanced', 0):,} ({stats.get('kg_enhancement_rate', 0):.1%})
- **Avg Entities/Doc:** {stats.get('avg_entities_per_doc', 0):.1f}
- **Avg Authority Score:** {stats.get('avg_authority_score', 0):.3f}
- **Avg KG Connectivity:** {stats.get('avg_kg_connectivity', 0):.3f}

**Performance Metrics:**
- **Authority Tiers:** {stats.get('authority_tiers', 0)}
- **Temporal Tiers:** {stats.get('temporal_tiers', 0)}  
- **KG Connectivity Tiers:** {stats.get('kg_connectivity_tiers', 0)}
- **Memory Optimized:** {stats.get('memory_optimized', False)}

**Research Process Features:**
- **Cross-Validation:** Team member validation of findings
- **Devil's Advocate:** Critical challenge of assumptions  
- **Consensus Building:** Weighted agreement based on expertise
- **Conflict Resolution:** Automatic handling of disagreements
"""
    except Exception as exc:
        return f"Error getting system info: {exc}"

# =============================================================================
# ENHANCED GRADIO INTERFACE
# =============================================================================

def create_gradio_interface():
    """Create enhanced Gradio interface with full customization"""
    
    custom_css = """
    /* Base container - responsive to zoom */
    .gradio-container {
        max-width: 100%;
        width: 100%;
        margin: 0 auto;
        padding: 0;
        overflow-x: hidden;
    }
    
    /* Main chat area - scalable dimensions */
    .main-chat-area {
        width: 100%;
        max-width: 75em; /* Changed from rem to em for zoom scaling */
        margin: 0 auto;
        padding: 1.25em; /* Changed from rem to em */
        box-sizing: border-box;
    }
    
    /* Chatbot container - responsive sizing */
    .chat-container {
        height: 75vh; /* Viewport height scales with zoom */
        min-height: 25em; /* Reduced and changed to em */
        max-height: none; /* Remove max-height to allow scaling */
        width: 100%;
        overflow-y: auto;
        border: 0.0625em solid #e0e0e0; /* Changed to em */
        border-radius: 0.75em; /* Changed to em */
        background: white;
        box-sizing: border-box;
        resize: vertical; /* Allow manual resizing if needed */
    }
    
    /* Prevent width changes from content expansion */
    .chatbot {
        width: 100%;
        max-width: none;
        min-width: 0;
    }
    
    /* Chat messages - scalable overflow handling */
    .message-wrap {
        max-width: 100%;
        word-wrap: break-word;
        overflow-wrap: break-word;
    }
    
    /* Center the chatbot placeholder */
    .chatbot .wrap {
        display: flex;
        align-items: center;
        justify-content: center;
        text-align: center;
    }

    .chatbot .placeholder {
        text-align: center;
        display: flex;
        align-items: center;
        justify-content: center;
        height: 100%;
        width: 100%;
    }

    .chatbot .empty {
        display: flex;
        align-items: center;
        justify-content: center;
        height: 100%;
        width: 100%;
        text-align: center;
        color: #666;
        font-size: 1em; /* Changed to em for scaling */
    }
    
    /* Input area styling */
    .input-row {
        margin-top: 0.9375em; /* Changed to em */
        width: 100%;
    }
    
    .input-row .form {
        width: 100%;
    }
    
    /* Settings panels - scalable */
    .settings-panel {
        background-color: #f8f9fa;
        padding: 1.25em; /* Changed to em */
        border-radius: 0.75em; /* Changed to em */
        margin-bottom: 0.9375em; /* Changed to em */
        box-shadow: 0 0.125em 0.25em rgba(0,0,0,0.1); /* Changed to em */
        width: 100%;
        box-sizing: border-box;
    }
    
    .status-panel {
        background-color: #e8f4fd;
        padding: 0.9375em; /* Changed to em */
        border-radius: 0.5em; /* Changed to em */
        border-left: 0.25em solid #2196F3; /* Changed to em */
        margin-bottom: 0.625em; /* Changed to em */
    }
    
    /* Responsive breakpoints - using em for zoom-friendly breakpoints */
    @media (max-width: 87.5em) {
        .main-chat-area {
            max-width: 95%;
            padding: 0.9375em;
        }
    }
    
    @media (max-width: 64em) {
        .chat-container {
            height: 70vh;
            min-height: 20em; /* Reduced for better mobile experience */
        }
        
        .main-chat-area {
            padding: 0.9375em;
        }
    }
    
    @media (max-width: 48em) {
        .chat-container {
            height: 65vh;
            min-height: 18em; /* Further reduced */
        }
        
        .main-chat-area {
            padding: 0.625em;
        }
        
        .settings-panel {
            padding: 0.9375em;
        }
    }
    
    @media (max-width: 30em) {
        .chat-container {
            height: 60vh;
            min-height: 15em; /* Minimum for usability */
        }
        
        .main-chat-area {
            padding: 0.5em;
        }
        
        .settings-panel {
            padding: 0.75em;
            margin-bottom: 0.625em;
        }
    }
    
    /* Prevent layout shifts from dynamic content */
    .block {
        min-width: 0;
    }
    
    /* Tab content - centered tabs */
    .tab-nav {
        margin-bottom: 1.25em; /* Changed to em */
        display: flex;
        justify-content: center;
        align-items: center;
        width: 100%;
    }
    
    /* Center the tab navigation */
    .tabs {
        display: flex;
        flex-direction: column;
        align-items: center;
        width: 100%;
    }
    
    /* Style the tab buttons - scalable */
    .tab-nav button {
        margin: 0 0.5em; /* Changed to em */
        padding: 0.75em 1.5em; /* Changed to em */
        border-radius: 0.5em; /* Changed to em */
        font-weight: 500;
        transition: all 0.2s ease;
    }
    
    /* Center tab container */
    .tabitem {
        width: 100%;
        max-width: 75em; /* Changed to em */
        margin: 0 auto;
    }
    
    /* Examples styling */
    .examples {
        margin-top: 0.9375em; /* Changed to em */
    }
    
    /* Button styling */
    .clear-btn {
        margin-left: auto;
    }
    
    /* Ensure consistent column widths in settings */
    .settings-columns {
        display: grid;
        grid-template-columns: 1fr 1fr;
        gap: 1.25em; /* Changed to em */
        width: 100%;
    }
    
    @media (max-width: 48em) {
        .settings-columns {
            grid-template-columns: 1fr;
        }
    }
    
    /* Fix for expandable content not affecting layout */
    .prose {
        max-width: 100%;
    }
    
    /* Prevent horizontal scroll */
    * {
        box-sizing: border-box;
    }
    
    /* Enhanced zoom support */
    html {
        -webkit-text-size-adjust: 100%;
        -ms-text-size-adjust: 100%;
    }
    
    /* Ensure text scales properly with browser zoom */
    body, .gradio-container, .chatbot {
        font-size: 1em; /* Base font size that scales with zoom */
    }
    """
    
    with gr.Blocks(
        title="Enhanced Indonesian Legal Assistant",
        theme=gr.themes.Default(),
        css=custom_css
    ) as interface:
        
        with gr.Tabs():
            # Main Chat Tab
            with gr.TabItem("üí¨ Konsultasi Hukum", id="chat"):
                with gr.Column(elem_classes="main-chat-area"):
                    chatbot = gr.Chatbot(
                        height="75vh",
                        show_label=False,
                        container=True,
                        bubble_full_width=True,
                        elem_classes="chat-container",
                        show_copy_button=True,
                        sanitize_html=True,
                        render_markdown=True,
                    )
                    
                    with gr.Row(elem_classes="input-row"):
                        msg_input = gr.Textbox(
                            placeholder="Tanyakan tentang hukum Indonesia...",
                            show_label=False,
                            container=False,
                            scale=10,
                            submit_btn=True,
                            lines=1,
                            max_lines=3,
                            interactive=True
                        )
                    
                    with gr.Row():
                        with gr.Column():
                            gr.Examples(
                                examples=[
                                    "Apakah ada pengaturan yang menjamin kesetaraan hak antara guru dan dosen dalam memperoleh tunjangan profesi?",
                                    "Apakah terdapat mekanisme pengawasan terhadap penyimpanan uang negara agar terhindar dari penyalahgunaan atau kebocoran keuangan?", 
                                    "Bagaimana mekanisme hukum untuk memperoleh izin resmi bagi pihak yang menjalankan usaha sebagai pengusaha pabrik, penyimpanan, importir, penyalur, maupun penjual eceran barang kena cukai?",
                                    "Apakah terdapat kewajiban pemerintah untuk menyediakan dana khusus bagi penyuluhan, atau dapat melibatkan sumber pendanaan alternatif seperti swasta dan masyarakat?",
                                    "Bagaimana prosedur hukum yang harus ditempuh sebelum sanksi denda administrasi di bidang cukai dapat dikenakan kepada pelaku usaha?",
                                    "Bagaimana sistem perencanaan kas disusun agar mampu mengantisipasi kebutuhan mendesak negara/daerah tanpa mengganggu stabilitas fiskal?",
                                    "syarat dan prosedur perceraian menurut hukum Indonesia",
                                    "hak dan kewajiban pekerja dalam UU Ketenagakerjaan"
                                ],
                                inputs=msg_input,
                                examples_per_page=2,
                                label=""
                            )
            
            # Enhanced Settings Tab
            with gr.TabItem("‚öôÔ∏è Pengaturan Sistem", id="settings"):
                #gr.Markdown("### üîß Enhanced RAG Configuration with Research Team")
                
                with gr.Row():
                    with gr.Column():
                        # Basic Settings
                        with gr.Group(elem_classes="settings-panel"):
                            gr.Markdown("#### üéØ Basic Settings")
                            final_top_k = gr.Slider(1, 10, value=3, step=1, label="Final Top K Results")
                            temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.1, label="LLM Temperature")
                            max_new_tokens = gr.Slider(512, 4096, value=2048, step=256, label="Max New Tokens")
                        
                        # Research Team Settings
                        with gr.Group(elem_classes="settings-panel researcher-settings"):
                            gr.Markdown("#### üë• Research Team Configuration")
                            research_team_size = gr.Slider(1, 5, value=4, step=1, label="Team Size")
                            enable_cross_validation = gr.Checkbox(label="Enable Cross-Validation", value=True)
                            enable_devil_advocate = gr.Checkbox(label="Enable Devil's Advocate", value=True)
                            consensus_threshold = gr.Slider(0.3, 0.9, value=0.6, step=0.05, label="Consensus Threshold")
                        
                        # Display Settings
                        with gr.Group(elem_classes="settings-panel"):
                            gr.Markdown("#### üí¨ Display Settings")
                            show_thinking = gr.Checkbox(label="Show Thinking Process", value=True)
                            show_sources = gr.Checkbox(label="Show Legal Sources", value=True)
                            show_metadata = gr.Checkbox(label="Show All Retrieved Metadata", value=True)

                        with gr.Group(elem_classes="settings-panel"):
                            gr.Markdown("#### üß† LLM Generation Settings")
                            top_p = gr.Slider(0.1, 1.0, value=1.0, step=0.1, label="Top P")
                            top_k = gr.Slider(1, 100, value=20, step=1, label="Top K")
                            min_p = gr.Slider(0.01, 0.3, value=0.1, step=0.01, label="Min P")

                        with gr.Group(elem_classes="settings-panel"):
                            gr.Markdown("#### üìä System Information")
                            system_info_btn = gr.Button("üìà View System Stats", variant="primary")
                            reset_defaults_btn = gr.Button("üîÑ Reset to Defaults", variant="secondary")
                            system_info_output = gr.Markdown("")

                        # System health panel: button that runs an on-demand health check and shows the report
                        with gr.Group(elem_classes="settings-panel"):
                            gr.Markdown("#### üè• System Health")
                            health_check_btn = gr.Button("üîç Run Health Check", variant="secondary")
                            health_report_output = gr.Markdown("")
                        
                        # Connect health check button
                        health_check_btn.click(
                            lambda: format_health_report(system_health_check()),
                            outputs=health_report_output
                        )
                    
                    with gr.Column():
                        # Enhanced Search Phase Configuration
                        with gr.Group(elem_classes="settings-panel phase-settings"):
                            gr.Markdown("#### üîç Search Phase Configuration")
                            
                            gr.Markdown("**Initial Scan Phase**")
                            initial_scan_enabled = gr.Checkbox(label="Enable Initial Scan", value=True)
                            initial_scan_candidates = gr.Slider(100, 800, value=400, step=50, label="Candidates")
                            initial_scan_semantic = gr.Slider(0.1, 0.5, value=0.20, step=0.05, label="Semantic Threshold")
                            initial_scan_keyword = gr.Slider(0.02, 0.15, value=0.06, step=0.01, label="Keyword Threshold")
                            
                            gr.Markdown("**Focused Review Phase**")
                            focused_review_enabled = gr.Checkbox(label="Enable Focused Review", value=True)
                            focused_review_candidates = gr.Slider(50, 300, value=150, step=25, label="Candidates")
                            focused_review_semantic = gr.Slider(0.2, 0.6, value=0.35, step=0.05, label="Semantic Threshold")
                            focused_review_keyword = gr.Slider(0.05, 0.2, value=0.12, step=0.01, label="Keyword Threshold")
                            
                            gr.Markdown("**Deep Analysis Phase**")
                            deep_analysis_enabled = gr.Checkbox(label="Enable Deep Analysis", value=True)
                            deep_analysis_candidates = gr.Slider(20, 120, value=60, step=10, label="Candidates")
                            deep_analysis_semantic = gr.Slider(0.3, 0.7, value=0.45, step=0.05, label="Semantic Threshold")
                            deep_analysis_keyword = gr.Slider(0.1, 0.3, value=0.18, step=0.01, label="Keyword Threshold")
                            
                            gr.Markdown("**Verification Phase**")
                            verification_enabled = gr.Checkbox(label="Enable Verification", value=True)
                            verification_candidates = gr.Slider(10, 60, value=30, step=5, label="Candidates")
                            verification_semantic = gr.Slider(0.4, 0.8, value=0.55, step=0.05, label="Semantic Threshold")
                            verification_keyword = gr.Slider(0.15, 0.35, value=0.22, step=0.01, label="Keyword Threshold")
                            
                            gr.Markdown("**Expert Review Phase (Optional)**")
                            expert_review_enabled = gr.Checkbox(label="Enable Expert Review", value=True)
                            expert_review_candidates = gr.Slider(15, 80, value=45, step=5, label="Candidates")
                            expert_review_semantic = gr.Slider(0.35, 0.75, value=0.50, step=0.05, label="Semantic Threshold")
                            expert_review_keyword = gr.Slider(0.12, 0.3, value=0.20, step=0.01, label="Keyword Threshold")

            
            with gr.TabItem("üì• Export Conversation", id="export"):
                with gr.Column(elem_classes="main-chat-area"):
                    gr.Markdown("""
                    ## Export Your Conversation
                    
                    Download your complete consultation history including:
                    - All questions and answers
                    - Research team process details
                    - Legal sources consulted
                    - Metadata and analysis
                    """)
                    
                    with gr.Row():
                        export_format = gr.Radio(
                            choices=["Markdown", "JSON", "HTML"],
                            value="Markdown",
                            label="Export Format"
                        )
                    
                    with gr.Row():
                        include_metadata_export = gr.Checkbox(
                            label="Include Technical Metadata",
                            value=True
                        )
                        include_research_process_export = gr.Checkbox(
                            label="Include Research Team Process",
                            value=True
                        )
                        include_full_content_export = gr.Checkbox(
                            label="Include Full Document Content (JSON only)",
                            value=False
                        )
                    
                    with gr.Row():
                        export_button = gr.Button("üì• Generate Export", variant="primary", size="lg")
                    
                    export_output = gr.Textbox(
                        label="Export Output",
                        lines=20,
                        max_lines=30,
                        show_copy_button=True
                    )
                    
                    download_file = gr.File(
                        label="Download Export File",
                        visible=True
                    )
                    
                    gr.Markdown("""
                    ### Export Format Guide
                    
                    - **Markdown**: Human-readable format, great for reading and sharing
                    - **JSON**: Structured data, perfect for processing or archiving
                    - **HTML**: Styled webpage, best for printing or presentation
                    """)
            
            # Export function
            def export_conversation_handler(export_format, include_metadata, include_research, include_full):
                """Render the conversation in the chosen format and stage it for download.

                Args:
                    export_format: "Markdown", "JSON", or anything else (treated as HTML).
                    include_metadata: Forwarded to the Markdown and HTML exporters.
                    include_research: Forwarded to the Markdown exporter only.
                    include_full: Forwarded to the JSON exporter only.

                Returns:
                    A ``(text, path)`` pair: the exported text for the preview box and
                    the path of the file written for the download widget. ``path`` is
                    ``None`` when there is nothing to export or the export failed, in
                    which case ``text`` carries the message to show.
                """
                import os
                import tempfile

                try:
                    if not conversation_manager.conversation_history:
                        return "No conversation to export.", None

                    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

                    if export_format == "Markdown":
                        content = conversation_manager.export_to_markdown(include_metadata, include_research)
                        extension = "md"
                    elif export_format == "JSON":
                        content = conversation_manager.export_to_json(include_full)
                        extension = "json"
                    else:  # HTML
                        content = conversation_manager.export_to_html(include_metadata)
                        extension = "html"

                    filename = f"legal_consultation_{timestamp}.{extension}"

                    # Write under the descriptive filename so the downloaded file is
                    # not called "tmpXXXX.<ext>". A fresh directory per export keeps
                    # two exports made within the same second from colliding.
                    export_dir = tempfile.mkdtemp(prefix="legal_export_")
                    temp_path = os.path.join(export_dir, filename)
                    with open(temp_path, 'w', encoding='utf-8') as f:
                        f.write(content)

                    return content, temp_path

                except Exception as e:
                    # UI boundary: report the failure in the preview box instead of raising.
                    return f"Export failed: {str(e)}", None
            
            # Connect export button
            export_button.click(
                export_conversation_handler,
                inputs=[export_format, include_metadata_export, include_research_process_export, include_full_content_export],
                outputs=[export_output, download_file]
            )
            # About Tab
            with gr.TabItem("‚ÑπÔ∏è About Enhanced System", id="about"):
                with gr.Column(elem_classes="main-chat-area"):
                    gr.Markdown("""
                    # üèõÔ∏è Enhanced KG-Indonesian Legal RAG System
                    
                    ## üÜï Enhanced Features
                    
                    ### üë• Realistic Research Team Simulation
                    The system now features **5 distinct researcher personas** with unique characteristics:
                    
                    - **üë®‚Äç‚öñÔ∏è Senior Legal Researcher**: 15 years experience, authority-focused, systematic approach
                    - **üë©‚Äç‚öñÔ∏è Junior Legal Researcher**: 3 years experience, broad comprehensive coverage
                    - **üìö Knowledge Graph Specialist**: 8 years experience, relationship and semantic focus
                    - **‚öñÔ∏è Procedural Law Expert**: 12 years experience, methodical step-by-step analysis
                    - **üîç Devil's Advocate**: 10 years experience, critical analysis and challenges
                    
                    ### üéØ Query-Specific Team Assembly
                    Teams are automatically assembled based on query type:
                    - **Specific Article**: Senior + Specialist + Devil's Advocate
                    - **Procedural**: Procedural Expert + Junior + Senior
                    - **Definitional**: Senior + Specialist + Junior
                    - **Sanctions**: Senior + Procedural Expert + Devil's Advocate
                    - **General**: All researchers (customizable team size)
                    
                    ### üîÑ Multi-Stage Research Process
                    1. **Individual Research**: Each researcher conducts research based on their expertise
                    2. **Cross-Validation**: Team members validate each other's findings
                    3. **Devil's Advocate Review**: Critical challenges to prevent groupthink
                    4. **Consensus Building**: Weighted consensus based on experience and accuracy
                    
                    ### ‚öôÔ∏è Advanced Customization
                    - **Granular Phase Control**: Enable/disable and adjust each search phase individually
                    - **Team Configuration**: Control team size, cross-validation, devil's advocate
                    - **Consensus Thresholds**: Adjust agreement requirements for final results
                    - **Real-time Updates**: All settings apply immediately to the research process
                    
                    ## üîß Configuration Guide
                    
                    ### Recommended Settings
                    - **Team Size**: 3-4 for optimal balance between coverage and efficiency
                    - **Consensus Threshold**: 0.6 for balanced precision/recall
                    - **Cross-Validation**: Enable for complex queries requiring validation
                    - **Devil's Advocate**: Enable for critical decisions and sanctions queries
                    
                    ### Search Phase Optimization
                    - **Initial Scan**: High candidate count, low thresholds for broad coverage
                    - **Focused Review**: Moderate filtering for promising candidates
                    - **Deep Analysis**: Strict thresholds for quality documents
                    - **Verification**: Highest standards for final validation
                    - **Expert Review**: Optional phase for complex specialized queries
                    
                    ### Performance Tuning
                    - **Lower thresholds**: Increase recall but may reduce precision
                    - **Higher candidate counts**: More comprehensive but slower processing
                    - **Team size optimization**: Larger teams for complex queries, smaller for simple ones
                    
                    ## üìä Research Analytics
                    
                    The enhanced system provides detailed insights into the research process:
                    - **Per-Researcher Metrics**: Success rates and specialization effectiveness
                    - **Phase Analysis**: Which phases contribute most to final results
                    - **Consensus Tracking**: Team agreement patterns and conflict resolution
                    - **Query Success Patterns**: Learning from successful query-answer pairs
                    
                    ## üöÄ Technical Improvements
                    
                    - **Memory Optimization**: Efficient handling of large legal document collections
                    - **Parallel Processing**: Multiple researchers work simultaneously
                    - **Smart Caching**: Researchers build on each other's work
                    - **Error Handling**: Robust fallback mechanisms for edge cases
                    - **Streaming Responses**: Real-time progress updates during research
                    
                    **Note**: This enhanced system combines human-like legal research methodology with AI efficiency, providing transparency into the research process while maintaining high accuracy and comprehensive coverage.
                    """)
        
        # Hidden state for enhanced configuration
        config_state = gr.State(DEFAULT_CONFIG)
        
        def update_enhanced_config(*args):
            """Build the complete configuration dict from the Settings-tab widget values.

            ``args`` holds the widget values in the same positional order as
            ``config_inputs``: 0-3 basic settings, 4-23 the five search phases
            (enabled, candidates, semantic threshold, keyword threshold for each),
            24-26 team options, 27-29 sampling parameters.

            Returns the new configuration, or ``DEFAULT_CONFIG`` if any value is
            missing or cannot be converted (the error is printed).
            """
            # One row per phase: name, index of its "enabled" checkbox (the three
            # numeric widgets follow it), description, time limit, focus areas.
            phase_table = (
                ('initial_scan', 4, 'Quick broad scan like human initial reading',
                 30, ['regulation_type', 'enacting_body']),
                ('focused_review', 8, 'Focused review of promising candidates',
                 45, ['content', 'chapter', 'article']),
                ('deep_analysis', 12, 'Deep contextual analysis like careful reading',
                 60, ['kg_entities', 'cross_references']),
                ('verification', 16, 'Final verification and cross-checking',
                 30, ['authority_score', 'temporal_score']),
                ('expert_review', 20, 'Expert specialist review for complex cases',
                 40, ['legal_richness', 'completeness_score']),
            )

            try:
                search_phases = {}
                for phase_name, first, summary, seconds, areas in phase_table:
                    search_phases[phase_name] = {
                        'candidates': int(args[first + 1]),
                        'semantic_threshold': float(args[first + 2]),
                        'keyword_threshold': float(args[first + 3]),
                        'description': summary,
                        'time_limit': seconds,
                        'focus_areas': list(areas),
                        'enabled': bool(args[first]),
                    }

                new_config = {
                    'final_top_k': int(args[0]),
                    'temperature': float(args[1]),
                    'max_new_tokens': int(args[2]),
                    'research_team_size': int(args[3]),
                    'enable_cross_validation': bool(args[24]),
                    'enable_devil_advocate': bool(args[25]),
                    'consensus_threshold': float(args[26]),
                    'top_p': float(args[27]),
                    'top_k': int(args[28]),
                    'min_p': float(args[29]),
                    'search_phases': search_phases,
                }
                # Values with no widget of their own.
                # NOTE(review): these differ from DEFAULT_CONFIG's 0.95 / 0.1 / 0.5
                # quality settings - confirm the difference is intended.
                new_config.update({
                    'max_rounds': 5,
                    'initial_quality': 0.8,
                    'quality_degradation': 0.15,
                    'min_quality': 0.3,
                    'parallel_research': True,
                })

                # Hand the new settings to the search engine when one exists.
                engine = globals().get('search_engine')
                if engine is not None:
                    engine.current_config = new_config

                return new_config

            except Exception as e:
                print(f"Error updating enhanced config: {e}")
                return DEFAULT_CONFIG
        
        def reset_to_enhanced_defaults():
            """Return the default value for every Settings-tab widget.

            The tuple has 30 entries in the same positional order as
            ``config_inputs``: four basic settings from ``DEFAULT_CONFIG``, then
            (enabled, candidates, semantic threshold, keyword threshold) for each
            of the five phases in ``DEFAULT_SEARCH_PHASES``, then the three team
            options and three sampling parameters from ``DEFAULT_CONFIG``.

            If a default cannot be looked up, the error is printed and a no-op
            update is returned for every widget, so the current values stay as
            they are. The previous fallback wrote ``0.5`` into every widget,
            including checkboxes and integer sliders.
            """
            leading_keys = ('final_top_k', 'temperature', 'max_new_tokens', 'research_team_size')
            phase_names = ('initial_scan', 'focused_review', 'deep_analysis',
                           'verification', 'expert_review')
            phase_fields = ('enabled', 'candidates', 'semantic_threshold', 'keyword_threshold')
            trailing_keys = ('enable_cross_validation', 'enable_devil_advocate',
                             'consensus_threshold', 'top_p', 'top_k', 'min_p')
            widget_count = (len(leading_keys)
                            + len(phase_names) * len(phase_fields)
                            + len(trailing_keys))  # 30, one per config_inputs entry

            try:
                values = [DEFAULT_CONFIG[key] for key in leading_keys]          # 0-3
                for phase in phase_names:                                       # 4-23
                    values.extend(DEFAULT_SEARCH_PHASES[phase][field] for field in phase_fields)
                values.extend(DEFAULT_CONFIG[key] for key in trailing_keys)     # 24-29
                return tuple(values)
            except Exception as e:
                print(f"Error resetting to defaults: {e}")
                # Leave every widget untouched rather than pushing invalid values.
                return tuple(gr.update() for _ in range(widget_count))
        
        # All configuration inputs
        config_inputs = [
            final_top_k, temperature, max_new_tokens, research_team_size,  # 0-3
            initial_scan_enabled, initial_scan_candidates, initial_scan_semantic, initial_scan_keyword,  # 4-7
            focused_review_enabled, focused_review_candidates, focused_review_semantic, focused_review_keyword,  # 8-11
            deep_analysis_enabled, deep_analysis_candidates, deep_analysis_semantic, deep_analysis_keyword,  # 12-15
            verification_enabled, verification_candidates, verification_semantic, verification_keyword,  # 16-19
            expert_review_enabled, expert_review_candidates, expert_review_semantic, expert_review_keyword,  # 20-23
            enable_cross_validation, enable_devil_advocate, consensus_threshold,  # 24-26
            top_p, top_k, min_p  # 27-29
        ]
        
        # Connect all inputs to config update
        for input_component in config_inputs:
            try:
                input_component.change(
                    update_enhanced_config,
                    inputs=config_inputs,
                    outputs=config_state
                )
            except Exception as e:
                print(f"Error connecting config input: {e}")
        
        # Reset button
        try:
            reset_defaults_btn.click(
                reset_to_enhanced_defaults,
                outputs=config_inputs
            )
        except Exception as e:
            print(f"Error setting up reset button: {e}")
        
        # Chat functionality
        try:
            msg_input.submit(
                chat_with_legal_rag,
                inputs=[msg_input, chatbot, config_state, show_thinking, show_sources, show_metadata],
                outputs=[chatbot, msg_input]
            )
        except Exception as e:
            print(f"Error setting up chat: {e}")
        
        # System info
        try:
            system_info_btn.click(
                get_system_info,
                outputs=system_info_output
            )
        except Exception as e:
            print(f"Error setting up system info: {e}")
    
    return interface

# =============================================================================
# DEBUGGING AND DIAGNOSTIC FUNCTIONS
# =============================================================================

def debug_research_process():
    """Run a few quick diagnostics and collect anything that looks wrong.

    Checks, in order: system memory usage (needs ``psutil``), whether
    initialization has finished, the dataset's KG enhancement rate, the number
    of researcher personas, and the total candidate count across the default
    search phases.

    Returns:
        dict with ``'potential_issues'`` and ``'suggestions'`` (lists of
        strings) and ``'performance_metrics'`` (a dict this function never
        fills in).
    """
    issues = []
    advice = []

    def record(issue, suggestion=None):
        # Log a finding and, when given, the matching remediation hint.
        issues.append(issue)
        if suggestion is not None:
            advice.append(suggestion)

    # Memory pressure
    try:
        import psutil
        if psutil.virtual_memory().percent > 85:
            record("High memory usage detected",
                   "Consider reducing batch sizes or team size")
    except ImportError:
        record("Cannot monitor memory usage - install psutil")

    # Has startup finished?
    if not initialization_complete:
        record("System not fully initialized",
               "Wait for complete initialization")

    # Dataset quality
    if dataset_loader:
        enhancement_rate = dataset_loader.get_statistics().get('kg_enhancement_rate', 0)
        if enhancement_rate < 0.3:
            record("Low KG enhancement rate",
                   "Verify KG preprocessing quality")

    # Enough distinct researcher personas?
    if len(RESEARCH_TEAM_PERSONAS) < 3:
        record("Insufficient researcher diversity",
               "Add more researcher personas for better coverage")

    # Overall candidate load across the default phases
    candidate_load = 0
    for phase_cfg in DEFAULT_SEARCH_PHASES.values():
        candidate_load += phase_cfg['candidates']
    if candidate_load > 1000:
        record("High candidate processing load",
               "Consider reducing candidate counts in early phases")

    return {
        'potential_issues': issues,
        'suggestions': advice,
        'performance_metrics': {},
    }

# =============================================================================
# MAIN EXECUTION (UNCHANGED AS REQUESTED)
# =============================================================================

# Script entry point: build the Gradio UI, optionally load models and data up
# front, print a short status summary, then start the web server.
if __name__ == "__main__":
    print("=" * 80)
    print("üèõÔ∏è Enhanced KG Indonesian Legal RAG System")
    print("=" * 80)
    
    # Configuration
    ENABLE_SHARING = False  # Set to True for public sharing
    AUTO_INITIALIZE = True  # Set to True to initialize on startup
    
    # Create interface first
    print("\nüì± Creating Gradio interface...")
    interface = create_gradio_interface()
    print("‚úÖ Interface created")
    
    # Initialize system if needed
    # (sharing also forces initialization, even when AUTO_INITIALIZE is off)
    if AUTO_INITIALIZE or ENABLE_SHARING:
        print("\nüîß Initializing system...")
        print("-" * 80)
        
        def init_progress(msg):
            """Print one indented progress line for each initialization message."""
            print(f"   {msg}")
        
        try:
            success = initialize_models_and_data(init_progress)
            
            if not success:
                print("\n‚ùå System initialization failed!")
                print("Please check the error messages above and try again.")
                # exit() raises SystemExit, which is not an Exception subclass,
                # so the handler below does not intercept it.
                exit(1)
            
            print("\n" + "=" * 80)
            print("‚úÖ System initialized successfully!")
            print("=" * 80)
            
            # Print system info
            stats = dataset_loader.get_statistics()
            print(f"\nüìä Dataset Statistics:")
            print(f"   - Total Records: {stats.get('total_records', 0):,}")
            print(f"   - KG Enhanced: {stats.get('kg_enhanced', 0):,}")
            print(f"   - Enhancement Rate: {stats.get('kg_enhancement_rate', 0):.1%}")
            
            # Run health check
            # Only the status and the warning/error counts are printed here.
            print(f"\nüè• Running health check...")
            health = system_health_check()
            print(f"   Status: {health['status'].upper()}")
            if health['warnings']:
                print(f"   Warnings: {len(health['warnings'])}")
            if health['errors']:
                print(f"   Errors: {len(health['errors'])}")
            
        except Exception as e:
            # Any failure during startup is fatal: show the traceback and stop.
            print(f"\n‚ùå Initialization error: {str(e)}")
            import traceback
            traceback.print_exc()
            exit(1)
    
    # Launch interface
    print("\nüöÄ Launching interface...")
    print("-" * 80)
    
    try:
        launch_kwargs = {
            # Listen on all interfaces only when sharing; otherwise loopback only.
            'server_name': "0.0.0.0" if ENABLE_SHARING else "127.0.0.1",
            'server_port': 7860,
            'share': ENABLE_SHARING,
            'debug': False,
            'show_error': True,
            'quiet': False,
            'max_threads': 10,
            'auth': None,
            # Open a local browser tab only for local (non-shared) runs.
            'inbrowser': not ENABLE_SHARING,
            'show_api': False
        }
        
        print(f"   Server: {launch_kwargs['server_name']}:{launch_kwargs['server_port']}")
        print(f"   Share: {launch_kwargs['share']}")
        print(f"   Auto-open browser: {launch_kwargs['inbrowser']}")
        
        interface.launch(**launch_kwargs)
        
    except KeyboardInterrupt:
        # Ctrl+C: call clear_cache() before exiting.
        print("\n\n‚ö†Ô∏è Shutting down gracefully...")
        clear_cache()
        print("‚úÖ Shutdown complete")
        
    except Exception as e:
        print(f"\n‚ùå Failed to launch interface: {str(e)}")
        import traceback
        traceback.print_exc()
        exit(1)

üèõÔ∏è Enhanced KG Indonesian Legal RAG System

üì± Creating Gradio interface...
‚úÖ Interface created

üîß Initializing system...
--------------------------------------------------------------------------------
   Initializing enhanced system...
   Loading embedding model...
   Loading reranker model...
   Loading LLM model on GPU with 4-bit quantization...


Unrecognized keys in `rope_scaling` for 'rope_type'='yarn': {'attn_factor'}


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

   üìä Loading ENHANCED KG dataset...
   üì• Loading enhanced KG dataset with streaming...
   Limit: 100000 records


Resolving data files:   0%|          | 0/75 [00:00<?, ?it/s]

   üìä Processing 100,000 records in chunks of 1,000...
   üìä Converting embeddings to numpy array...
   üîç Processing TF-IDF vectors...
   üóÉÔ∏è Building enhanced KG indexes...
   ‚úÖ Ready: 100,000 records with 0 KG-enhanced
   üóÉÔ∏è Building enhanced search indexes...
Building search indexes...
‚úÖ Indexes built: 10 authority tiers
   üîó Linking context manager with knowledge graph...
   ‚úÖ Enhanced system initialization complete!
   üìå Embedding model: CPU
   üìå Reranker model: CPU
   üìå LLM model: GPU (4-bit)

‚úÖ System initialized successfully!

üìä Dataset Statistics:
   - Total Records: 100,000
   - KG Enhanced: 0
   - Enhancement Rate: 0.0%

üè• Running health check...
   Status: HEALTHY

üöÄ Launching interface...
--------------------------------------------------------------------------------
   Server: 127.0.0.1:7860
   Share: False
   Auto-open browser: True
* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.

DEBUG: No context exists, first query
DEBUG: Default case - treating as new query
   üîç Executing normal semantic search...
   üîç Executing normal semantic search...
   üîç Executing normal semantic search...
   üîç Executing normal semantic search...
   üîç Executing normal semantic search...


You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
