In [2]:
# Education Domain RAG Builder
# Build FAISS index specifically for education domain with page information

import json
import os
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from typing import List, Dict, Any
from datetime import datetime
from pathlib import Path

class EducationRAGBuilder:
    """Build and persist a FAISS retrieval index for the education domain.

    The pipeline is: load structured sections from JSON, turn them into text
    chunks that carry page metadata, embed the chunks with a sentence
    transformer, add the vectors to an exact L2 FAISS index, and write the
    index, embeddings, chunk metadata and a config file to disk.
    """

    def __init__(self, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
        """
        Initialize Education RAG Builder

        Args:
            embedding_model: HuggingFace sentence transformer model
        """
        print(f"🔄 Loading embedding model: {embedding_model}")
        # Keep the identifier so the saved config records the model actually used.
        self.embedding_model_name = embedding_model
        self.embedding_model = SentenceTransformer(embedding_model)
        self.embedding_dim = self.embedding_model.get_sentence_embedding_dimension()
        print(f"✅ Embedding dimension: {self.embedding_dim}")

    def load_education_data(self, file_path: str) -> List[Dict]:
        """Load education structured data from a JSON file.

        Args:
            file_path: Path of the UTF-8 encoded JSON file.

        Returns:
            The parsed top-level list, or an empty list when the JSON document
            is not a list.
        """
        print(f"📂 Loading education data from: {file_path}")
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if not isinstance(data, list):
            print("❌ Expected list format for education data")
            return []

        print(f"✅ Loaded {len(data)} education sections")
        return data

    @staticmethod
    def _split_words(content: str, max_chunk_size: int) -> List[str]:
        """Greedily pack whitespace-separated words into pieces of bounded length.

        Each returned piece is the words joined by single spaces and is at most
        ``max_chunk_size`` characters long, except when a single word is itself
        longer than the limit (that word then forms its own piece).
        """
        pieces = []
        current_words = []
        current_length = 0  # exact length of ' '.join(current_words)

        for word in content.split():
            # Length the joined piece would have if this word were appended.
            if current_words:
                candidate_length = current_length + 1 + len(word)
            else:
                candidate_length = len(word)

            if current_words and candidate_length > max_chunk_size:
                pieces.append(' '.join(current_words))
                current_words = [word]
                current_length = len(word)
            else:
                current_words.append(word)
                current_length = candidate_length

        if current_words:
            pieces.append(' '.join(current_words))
        return pieces

    @staticmethod
    def _page_range(chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Return the smallest positive page_start and largest positive page_end.

        Either value is ``None`` when no chunk carries a positive numeric page,
        so building the config never fails on data without page information.
        """
        def positive_pages(key):
            values = (chunk["metadata"].get(key) for chunk in chunks)
            return [
                v for v in values
                if isinstance(v, (int, float)) and not isinstance(v, bool) and v > 0
            ]

        return {
            "min_page": min(positive_pages("page_start"), default=None),
            "max_page": max(positive_pages("page_end"), default=None),
        }

    def create_education_chunks(self, sections: List[Dict],
                                max_chunk_size: int = 500) -> List[Dict[str, Any]]:
        """
        Create text chunks from education data with page information

        For every dict section this emits, in order: a main-heading chunk, a
        sub-heading chunk, one or more content chunks, and one chunk per image
        caption. Headings and captions need more than 5 non-blank characters;
        content needs more than 20. Non-dict sections are skipped.

        Args:
            sections: List of education sections
            max_chunk_size: Maximum number of content characters per content
                chunk (the "heading - subheading: " prefix is not counted).
                Content longer than this is split on whitespace and every
                piece gets a 1-based ``chunk_part`` in its metadata.

        Returns:
            List of chunk dictionaries with text, metadata, and page info

        Raises:
            ValueError: If ``max_chunk_size`` is not positive.
        """
        if max_chunk_size <= 0:
            raise ValueError("max_chunk_size must be a positive number of characters")

        chunks = []

        def add_chunk(text, base_metadata, **extra_metadata):
            # chunk_id is the position in the output list, i.e. the row the
            # chunk's embedding will occupy in the FAISS index.
            chunks.append({
                "chunk_id": len(chunks),
                "text": text,
                "metadata": {**base_metadata, **extra_metadata},
            })

        print(f"🔄 Processing {len(sections)} education sections...")

        for section_idx, section in enumerate(sections):
            if not isinstance(section, dict):
                continue

            # Extract page information
            page_start = section.get('page_start', 0)
            page_end = section.get('page_end', 0)
            main_heading = section.get('main_heading', 'Unknown')
            sub_heading = section.get('sub_heading', '')

            # Metadata shared by every chunk of this section
            base_metadata = {
                "domain": "education",
                "section_id": section_idx,
                "page_start": page_start,
                "page_end": page_end,
                "main_heading": main_heading,
                "sub_heading": sub_heading
            }

            # 1. Main heading
            if main_heading and len(main_heading.strip()) > 5:
                add_chunk(f"Main Topic: {main_heading}", base_metadata,
                          chunk_type="main_heading", content_type="heading")

            # 2. Sub heading
            if sub_heading and len(sub_heading.strip()) > 5:
                add_chunk(f"Subtopic: {sub_heading}", base_metadata,
                          chunk_type="sub_heading", content_type="heading")

            # 3. Main content, prefixed with its headings for retrieval context
            content = section.get('content', '')
            if content and len(content.strip()) > 20:
                if len(content) > max_chunk_size:
                    pieces = self._split_words(content, max_chunk_size)
                    # A running counter replaces rescanning all chunks per piece.
                    for part_number, piece in enumerate(pieces, start=1):
                        add_chunk(f"{main_heading} - {sub_heading}: {piece}", base_metadata,
                                  chunk_type="content", content_type="text",
                                  chunk_part=part_number)
                else:
                    # Short content stays a single chunk without chunk_part
                    add_chunk(f"{main_heading} - {sub_heading}: {content}".strip(),
                              base_metadata,
                              chunk_type="content", content_type="text")

            # 4. Image captions ("images": null in the JSON is treated as no images)
            images = section.get('images') or []
            for img_idx, image in enumerate(images):
                if not isinstance(image, dict):
                    continue
                caption = image.get('caption', '')
                if caption and len(caption.strip()) > 5:
                    add_chunk(f"Image Caption: {caption}", base_metadata,
                              chunk_type="image_caption", content_type="caption",
                              image_page=image.get('page', page_start),
                              image_path=image.get('path', ''),
                              image_index=img_idx)

        print(f"✅ Created {len(chunks)} chunks from {len(sections)} sections")

        # Print chunk type summary
        chunk_types = {}
        for chunk in chunks:
            chunk_type = chunk["metadata"]["chunk_type"]
            chunk_types[chunk_type] = chunk_types.get(chunk_type, 0) + 1

        print("📊 Chunk type breakdown:")
        for chunk_type, count in chunk_types.items():
            print(f"   {chunk_type}: {count} chunks")

        return chunks

    def generate_embeddings(self, chunks: List[Dict[str, Any]]) -> np.ndarray:
        """Generate embeddings for all chunks.

        Args:
            chunks: Chunk dictionaries; only the "text" field is encoded.

        Returns:
            The array returned by the embedding model's ``encode`` call, in the
            same order as ``chunks``.
        """
        texts = [chunk["text"] for chunk in chunks]
        print(f"🔄 Generating embeddings for {len(texts)} chunks...")

        # Generate embeddings in batches for efficiency
        embeddings = self.embedding_model.encode(
            texts,
            batch_size=32,
            show_progress_bar=True,
            convert_to_numpy=True
        )

        print(f"✅ Generated embeddings shape: {embeddings.shape}")
        return embeddings

    def build_faiss_index(self, embeddings: np.ndarray) -> "faiss.Index":
        """Build an exact-search FAISS index from embeddings.

        Args:
            embeddings: 2-D array with one row per chunk and
                ``self.embedding_dim`` columns.

        Returns:
            An ``IndexFlatL2`` containing every row of ``embeddings``.

        Raises:
            ValueError: If ``embeddings`` is not 2-D or its width differs from
                ``self.embedding_dim``.
        """
        print("🔄 Building FAISS index...")

        # FAISS expects a C-contiguous float32 matrix.
        vectors = np.ascontiguousarray(embeddings, dtype='float32')
        if vectors.ndim != 2 or vectors.shape[1] != self.embedding_dim:
            raise ValueError(
                f"Expected embeddings of shape (n, {self.embedding_dim}), "
                f"got {vectors.shape}"
            )

        # Use IndexFlatL2 for exact search (good for education dataset size)
        index = faiss.IndexFlatL2(self.embedding_dim)
        index.add(vectors)

        print(f"✅ FAISS index built with {index.ntotal} vectors")
        return index

    def save_rag_index(self, index: "faiss.Index", chunks: List[Dict[str, Any]],
                      embeddings: np.ndarray, output_dir: str):
        """Save complete RAG index to disk.

        Writes four files into ``output_dir`` (created if missing):
        ``faiss_index.bin``, ``embeddings.npy``, ``chunks_metadata.json`` and
        ``rag_config.json``.

        Args:
            index: FAISS index to serialize.
            chunks: Chunk dictionaries aligned with the index rows.
            embeddings: Embedding matrix to store alongside the index.
            output_dir: Destination directory.
        """
        # Create output directory
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        print(f"💾 Saving RAG index to: {output_dir}")

        # Save FAISS index
        faiss_path = output_path / "faiss_index.bin"
        faiss.write_index(index, str(faiss_path))
        print(f"✅ Saved FAISS index: {faiss_path}")

        # Save embeddings
        embeddings_path = output_path / "embeddings.npy"
        np.save(str(embeddings_path), embeddings)
        print(f"✅ Saved embeddings: {embeddings_path}")

        # Save chunk metadata
        metadata_path = output_path / "chunks_metadata.json"
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(chunks, f, indent=2, ensure_ascii=False)
        print(f"✅ Saved chunks metadata: {metadata_path}")

        # Save configuration
        config = {
            "domain": "education",
            # Record the model this builder was constructed with, not a fixed name.
            "embedding_model": getattr(self, "embedding_model_name", str(self.embedding_model)),
            "embedding_dim": self.embedding_dim,
            "num_chunks": len(chunks),
            "index_type": "IndexFlatL2",
            "created_at": datetime.now().isoformat(),
            "model_name": str(self.embedding_model),
            # Sorted so the config file is reproducible between runs.
            "chunk_types": sorted({chunk["metadata"]["chunk_type"] for chunk in chunks}),
            "page_range": self._page_range(chunks)
        }

        config_path = output_path / "rag_config.json"
        with open(config_path, 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=2)
        print(f"✅ Saved configuration: {config_path}")

        # Calculate and display sizes
        total_size = sum(entry.stat().st_size for entry in output_path.iterdir() if entry.is_file())
        print(f"📊 Total RAG index size: {total_size / (1024*1024):.2f} MB")

    def build_education_rag(self, structured_data_path: str, output_dir: str = "../../datasets/education/rag_index"):
        """Build complete RAG index for education domain.

        Args:
            structured_data_path: JSON file with the structured sections.
            output_dir: Directory that receives the index files.

        Returns:
            ``output_dir`` on success, or ``None`` when no sections could be
            loaded or no chunks were produced.
        """
        print(f"\n{'='*60}")
        print(f"🎓 BUILDING EDUCATION RAG INDEX")
        print(f"{'='*60}")

        # Load education data
        sections = self.load_education_data(structured_data_path)

        if not sections:
            print("❌ No sections loaded. Cannot build RAG index.")
            return

        # Create text chunks with page information
        chunks = self.create_education_chunks(sections)

        if not chunks:
            print("❌ No chunks created. Cannot build RAG index.")
            return

        # Generate embeddings
        embeddings = self.generate_embeddings(chunks)

        # Build FAISS index
        index = self.build_faiss_index(embeddings)

        # Save everything
        self.save_rag_index(index, chunks, embeddings, output_dir)

        print(f"\n🎉 Education RAG index completed!")
        print(f"📂 Saved to: {output_dir}")
        return output_dir

# Test the RAG builder
def test_education_rag():
    """Build the education RAG index from the default dataset location.

    Constructs a builder (which loads the embedding model), then builds the
    index only when the structured-data JSON file is present; otherwise a
    short notice is printed and nothing is written.
    """
    data_file = "../datasets/education/education_structured_data_extract.json"
    index_dir = "../datasets/education/rag_index"

    # The builder is created before the existence check, so the model is
    # loaded even when the data file turns out to be missing.
    rag_builder = EducationRAGBuilder()

    if not Path(data_file).exists():
        print(f"❌ Education data file not found: {data_file}")
        print("Please ensure the file exists before building RAG index.")
        return

    rag_builder.build_education_rag(data_file, index_dir)

# Sample chunk inspector
def inspect_sample_chunks(output_dir: str = "../datasets/education/rag_index"):
    """Print up to two example chunks of every chunk type from a saved index.

    Args:
        output_dir: Directory containing ``chunks_metadata.json``.

    Prints the total chunk count followed by, for each chunk type in order of
    first appearance, the first 100 characters of text, the page span and the
    main heading of at most two chunks. Prints a notice and returns when the
    metadata file does not exist.
    """
    chunks_file = Path(output_dir) / "chunks_metadata.json"

    if not chunks_file.exists():
        print(f"❌ Chunks file not found: {chunks_file}")
        return

    # The metadata file is written as UTF-8 with non-ASCII characters kept
    # verbatim, so it must be read back as UTF-8 rather than with the
    # platform default encoding.
    with open(chunks_file, 'r', encoding='utf-8') as f:
        chunks = json.load(f)

    print(f"\n🔍 SAMPLE CHUNKS INSPECTION")
    print(f"{'='*50}")
    print(f"Total chunks: {len(chunks)}")

    # Collect the first two chunks of each type
    samples_per_type = 2
    samples_by_type = {}
    for chunk in chunks:
        samples = samples_by_type.setdefault(chunk["metadata"]["chunk_type"], [])
        if len(samples) < samples_per_type:
            samples.append(chunk)

    for chunk_type, samples in samples_by_type.items():
        print(f"\n📝 {chunk_type.upper()} samples:")
        for i, chunk in enumerate(samples, 1):
            metadata = chunk['metadata']
            print(f"  Sample {i}:")
            print(f"    Text: {chunk['text'][:100]}...")
            print(f"    Pages: {metadata['page_start']}-{metadata['page_end']}")
            print(f"    Section: {metadata['main_heading']}")

# ========================
# MAIN EXECUTION
# ========================

if __name__ == "__main__":
    # Script entry point: announce, build the index, then show sample chunks.
    banner_lines = (
        "🚀 Education Domain RAG Builder",
        "Building FAISS index specifically for education with page information...",
    )
    for banner_line in banner_lines:
        print(banner_line)

    # Build education RAG index
    test_education_rag()

    # Inspect the results, separated from the build log by a rule
    print(f"\n{'=' * 60}")
    inspect_sample_chunks()

    print("\n✅ Education RAG index ready!")
    print("Now you can use this index for accurate page-based image retrieval!")

# Uncomment to run
# test_education_rag()

🚀 Education Domain RAG Builder
Building FAISS index specifically for education with page information...
🔄 Loading embedding model: sentence-transformers/all-MiniLM-L6-v2
✅ Embedding dimension: 384

🎓 BUILDING EDUCATION RAG INDEX
📂 Loading education data from: ../datasets/education/education_structured_data_extract.json
✅ Loaded 137 education sections
🔄 Processing 137 education sections...
✅ Created 2090 chunks from 137 sections
📊 Chunk type breakdown:
   main_heading: 116 chunks
   sub_heading: 137 chunks
   content: 1203 chunks
   image_caption: 634 chunks
🔄 Generating embeddings for 2090 chunks...


Batches:   0%|          | 0/66 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


✅ Generated embeddings shape: (2090, 384)
🔄 Building FAISS index...
✅ FAISS index built with 2090 vectors
💾 Saving RAG index to: ../datasets/education/rag_index
✅ Saved FAISS index: ../datasets/education/rag_index/faiss_index.bin
✅ Saved embeddings: ../datasets/education/rag_index/embeddings.npy
✅ Saved chunks metadata: ../datasets/education/rag_index/chunks_metadata.json
✅ Saved configuration: ../datasets/education/rag_index/rag_config.json
📊 Total RAG index size: 7.52 MB

🎉 Education RAG index completed!
📂 Saved to: ../datasets/education/rag_index


🔍 SAMPLE CHUNKS INSPECTION
Total chunks: 2090

📝 MAIN_HEADING samples:
  Sample 1:
    Text: Main Topic: CROP PRODUCTION   AND MANAGEMENT...
    Pages: 14-15
    Section: CROP PRODUCTION   AND MANAGEMENT
  Sample 2:
    Text: Main Topic: CROP PRODUCTION   AND MANAGEMENT...
    Pages: 15-15
    Section: CROP PRODUCTION   AND MANAGEMENT

📝 SUB_HEADING samples:
  Sample 1:
    Text: Subtopic: 1.1 Agricultural Practices...
    Pages: 14-15
  