In [1]:
# RAG Index Builder for Gemma Family Assistant
# Build FAISS indexes for all 5 domains from structured data

import json
import os
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from typing import List, Dict, Any
import pickle
from datetime import datetime

# Install required packages (run once)
# !pip install sentence-transformers faiss-cpu

class RAGIndexBuilder:
    """Build and persist a per-domain FAISS retrieval index from structured JSON.

    Pipeline: load a JSON file, flatten it into text chunks, embed the chunk
    texts with a sentence-transformers model, add the vectors to an exact
    ``IndexFlatL2`` and write the index, the raw embeddings, the chunk
    metadata and a small configuration file to disk.
    """

    # Recorded in rag_config.json; must match the index type created in
    # build_faiss_index().
    INDEX_TYPE = "IndexFlatL2"

    def __init__(self, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
        """
        Initialize RAG Index Builder

        Args:
            embedding_model: HuggingFace sentence transformer model name
        """
        print(f"Loading embedding model: {embedding_model}")
        # Keep the identifier next to the loaded model so the saved config can
        # record which model produced the vectors.
        self.embedding_model_name = embedding_model
        self.embedding_model = SentenceTransformer(embedding_model)
        self.embedding_dim = self.embedding_model.get_sentence_embedding_dimension()
        print(f"Embedding dimension: {self.embedding_dim}")

    def load_structured_data(self, file_path: str) -> Any:
        """Load structured data from a UTF-8 encoded JSON file.

        Args:
            file_path: Path of the JSON file to read.

        Returns:
            Whatever ``json.load`` produced (a list or a dict for the usual
            structured extracts).

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        print(f"Loading structured data from: {file_path}")
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        print(f"Loaded {len(data) if isinstance(data, list) else 'structured'} entries")
        return data

    def create_text_chunks(self, data: Any, domain: str,
                           min_text_length: int = 20) -> List[Dict[str, Any]]:
        """
        Create text chunks from structured data

        The structure is walked depth-first. A string stored under a dict key
        becomes a ``"text_content"`` chunk whose text is ``"<key>: <value>"``;
        a bare string (for example a list element) becomes a ``"direct_text"``
        chunk. Strings that are too short and non-string scalars are ignored.

        Args:
            data: Structured data (nested dicts / lists / strings)
            domain: Domain name (ayurveda, education, etc.)
            min_text_length: A string is chunked only when its stripped length
                is strictly greater than this value.

        Returns:
            List of chunk dictionaries with ``chunk_id``, ``text`` and
            ``metadata``; chunk ids are consecutive integers starting at 0.
        """
        chunks: List[Dict[str, Any]] = []

        def is_substantial(text) -> bool:
            return isinstance(text, str) and len(text.strip()) > min_text_length

        def add_chunk(text, metadata):
            # The id is the chunk's position in the list, so it also matches
            # the row of its embedding once the texts are encoded in order.
            chunks.append({
                "chunk_id": len(chunks),
                "text": text,
                "metadata": {"domain": domain, **metadata},
            })

        def walk(node, parent_context=""):
            if isinstance(node, dict):
                for key, value in node.items():
                    context = f"{parent_context} > {key}" if parent_context else key
                    if is_substantial(value):
                        add_chunk(f"{key}: {value}", {
                            "context": context,
                            "key": key,
                            "chunk_type": "text_content",
                        })
                    elif isinstance(value, (dict, list)):
                        # Recursively process nested structures
                        walk(value, context)
            elif isinstance(node, list):
                for i, child in enumerate(node):
                    context = f"{parent_context}[{i}]" if parent_context else f"item_{i}"
                    walk(child, context)
            elif is_substantial(node):
                # Direct string content
                add_chunk(node, {
                    "context": parent_context,
                    "chunk_type": "direct_text",
                })

        walk(data)

        print(f"Created {len(chunks)} text chunks for {domain}")
        return chunks

    def generate_embeddings(self, chunks: List[Dict[str, Any]],
                            batch_size: int = 32) -> np.ndarray:
        """Generate embeddings for all chunks.

        Args:
            chunks: Chunk dictionaries; only their ``"text"`` field is used.
            batch_size: Number of texts passed to the model per batch.

        Returns:
            Array with one row per chunk, in chunk order. For an empty chunk
            list an empty ``(0, embedding_dim)`` float32 array is returned
            without calling the model.
        """
        texts = [chunk["text"] for chunk in chunks]
        print(f"Generating embeddings for {len(texts)} chunks...")

        if not texts:
            # Nothing to encode; return a well-formed empty matrix instead of
            # handing an empty batch to the model.
            return np.empty((0, self.embedding_dim), dtype=np.float32)

        # Generate embeddings in batches for efficiency
        embeddings = self.embedding_model.encode(
            texts,
            batch_size=batch_size,
            show_progress_bar=True,
            convert_to_numpy=True
        )

        print(f"Generated embeddings shape: {embeddings.shape}")
        return embeddings

    def build_faiss_index(self, embeddings: np.ndarray) -> "faiss.Index":
        """Build FAISS index from embeddings.

        Args:
            embeddings: 2-D array of shape ``(n, embedding_dim)``.

        Returns:
            An ``IndexFlatL2`` containing all rows of ``embeddings``.

        Raises:
            ValueError: If ``embeddings`` is not 2-D or its width differs from
                ``self.embedding_dim``.
        """
        print("Building FAISS index...")

        # FAISS works on float32 row-major data; normalise dtype and layout
        # once here.
        vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
        if vectors.ndim != 2 or vectors.shape[1] != self.embedding_dim:
            raise ValueError(
                f"Expected embeddings of shape (n, {self.embedding_dim}), "
                f"got {vectors.shape}"
            )

        # Use IndexFlatL2 for exact search (good for small datasets)
        index = faiss.IndexFlatL2(self.embedding_dim)
        index.add(vectors)

        print(f"FAISS index built with {index.ntotal} vectors")
        return index

    def _build_config(self, domain: str, num_chunks: int) -> Dict[str, Any]:
        """Return the JSON-serialisable description stored in rag_config.json."""
        return {
            "domain": domain,
            # Model identifier (not the dimension), so a reader of the config
            # knows which model produced the vectors.
            "embedding_model": self.embedding_model_name,
            "embedding_dim": self.embedding_dim,
            "num_chunks": num_chunks,
            "index_type": self.INDEX_TYPE,
            "created_at": datetime.now().isoformat(),
            "model_name": self.embedding_model_name
        }

    def save_rag_index(self, index: "faiss.Index", chunks: List[Dict[str, Any]],
                      embeddings: np.ndarray, domain: str, output_dir: str):
        """Save complete RAG index to disk.

        Writes four files into ``output_dir`` (created if missing):
        ``faiss_index.bin``, ``embeddings.npy``, ``chunks_metadata.json`` and
        ``rag_config.json``.

        Args:
            index: FAISS index to serialise.
            chunks: Chunk dictionaries, dumped as JSON.
            embeddings: Embedding matrix, saved in NumPy ``.npy`` format.
            domain: Domain name recorded in the configuration file.
            output_dir: Destination directory.
        """
        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        # Save FAISS index
        faiss_path = os.path.join(output_dir, "faiss_index.bin")
        faiss.write_index(index, faiss_path)
        print(f"Saved FAISS index: {faiss_path}")

        # Save embeddings
        embeddings_path = os.path.join(output_dir, "embeddings.npy")
        np.save(embeddings_path, embeddings)
        print(f"Saved embeddings: {embeddings_path}")

        # Save chunk metadata
        metadata_path = os.path.join(output_dir, "chunks_metadata.json")
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(chunks, f, indent=2, ensure_ascii=False)
        print(f"Saved chunks metadata: {metadata_path}")

        # Save configuration
        config_path = os.path.join(output_dir, "rag_config.json")
        with open(config_path, 'w', encoding='utf-8') as f:
            json.dump(self._build_config(domain, len(chunks)), f, indent=2)
        print(f"Saved configuration: {config_path}")

        # Report the size of the artifacts written above only; other entries
        # that happen to live in output_dir are not part of the index.
        artifact_paths = (faiss_path, embeddings_path, metadata_path, config_path)
        total_size = sum(os.path.getsize(path) for path in artifact_paths)
        print(f"Total RAG index size: {total_size / (1024*1024):.2f} MB")

    def build_domain_rag(self, domain: str, structured_data_path: str, datasets_dir: str = "datasets"):
        """Build complete RAG index for a domain.

        Args:
            domain: Domain name; also the sub-directory used for the output.
            structured_data_path: JSON file with the domain's structured data.
            datasets_dir: Root directory; the index is written to
                ``<datasets_dir>/<domain>/rag_index``.

        Returns:
            The output directory path, or ``None`` when the data produced no
            chunks and nothing was written.
        """
        print(f"\n{'='*50}")
        print(f"Building RAG index for domain: {domain.upper()}")
        print(f"{'='*50}")

        # Load structured data
        data = self.load_structured_data(structured_data_path)

        # Create text chunks
        chunks = self.create_text_chunks(data, domain)

        if not chunks:
            print(f"No chunks created for {domain}. Skipping...")
            return None

        # Generate embeddings
        embeddings = self.generate_embeddings(chunks)

        # Build FAISS index
        index = self.build_faiss_index(embeddings)

        # Save everything
        output_dir = os.path.join(datasets_dir, domain, "rag_index")
        self.save_rag_index(index, chunks, embeddings, domain, output_dir)

        print(f"✅ RAG index for {domain} completed!")
        return output_dir

def build_all_rag_indexes(datasets_dir: str = "../datasets", domains=None):
    """Build RAG indexes for all configured domains and print a summary.

    Args:
        datasets_dir: Root directory; each domain's data file is expected at
            ``<datasets_dir>/<domain>/<filename>``.
        domains: Optional mapping of domain name to structured-data file name.
            Defaults to the five built-in domains.

    Returns:
        Tuple ``(successful_builds, failed_builds)`` of domain-name lists. A
        domain is counted as failed when its data file is missing, when the
        build raises, or when the build was skipped because no chunks were
        produced.
    """
    # Initialize RAG builder
    builder = RAGIndexBuilder()

    # Define domains and their structured data files
    if domains is None:
        domains = {
            "ayurveda": "ayurveda_structured_data_extract.json",
            "depression": "depression_structured_data_extract.json",
            "disaster_management": "disaster_management_structured_data_extract.json",
            "education": "education_structured_data_extract.json",
            "rice_diseases": "rice_diseases_structured_data_extract.json"
        }

    successful_builds = []
    failed_builds = []

    # Build RAG index for each domain
    for domain, filename in domains.items():
        structured_data_path = os.path.join(datasets_dir, domain, filename)

        # Check if file exists
        if not os.path.exists(structured_data_path):
            print(f"❌ File not found: {structured_data_path}")
            failed_builds.append(domain)
            continue

        # One broken domain must not abort the remaining builds, so any error
        # is reported and recorded instead of propagating.
        try:
            output_dir = builder.build_domain_rag(domain, structured_data_path, datasets_dir)
        except Exception as e:
            print(f"❌ Error building RAG for {domain}: {str(e)}")
            failed_builds.append(domain)
            continue

        if output_dir is None:
            # build_domain_rag returns None when it skipped the domain because
            # no chunks were created; nothing was written, so this is not a
            # successful build.
            failed_builds.append(domain)
        else:
            successful_builds.append(domain)

    # Summary
    print(f"\n{'='*60}")
    print("RAG INDEX BUILD SUMMARY")
    print(f"{'='*60}")
    print(f"✅ Successfully built: {len(successful_builds)} domains")
    for domain in successful_builds:
        print(f"   - {domain}")

    if failed_builds:
        print(f"❌ Failed builds: {len(failed_builds)} domains")
        for domain in failed_builds:
            print(f"   - {domain}")

    print("\n🎉 RAG index building completed!")
    return successful_builds, failed_builds

# Example usage for testing a single domain
def test_single_domain():
    """Build the RAG index for one domain (ayurveda) as a quick smoke run.

    Looks for the structured-data file relative to the current working
    directory and only reports when it is absent.
    """
    domain = "ayurveda"
    # Adjust this path if the datasets directory lives elsewhere.
    structured_data_path = f"datasets/{domain}/{domain}_structured_data_extract.json"

    indexer = RAGIndexBuilder()

    if not os.path.exists(structured_data_path):
        print(f"Test file not found: {structured_data_path}")
        return

    indexer.build_domain_rag(domain, structured_data_path)

# ========================
# MAIN EXECUTION
# ========================

if __name__ == "__main__":
    # Script entry point: announce the run, build every domain index, then
    # print the closing notes.
    print(
        "🚀 Starting RAG Index Builder for Gemma Family Assistant\n"
        "Building FAISS indexes for all 5 domains..."
    )

    build_all_rag_indexes()

    print(
        "\n✅ All RAG indexes built and saved!\n"
        "You can now use these indexes for fast retrieval in your Streamlit app."
    )

# Uncomment to test single domain first
# test_single_domain()



🚀 Starting RAG Index Builder for Gemma Family Assistant
Building FAISS indexes for all 5 domains...
Loading embedding model: sentence-transformers/all-MiniLM-L6-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding dimension: 384

Building RAG index for domain: AYURVEDA
Loading structured data from: ../datasets/ayurveda/ayurveda_structured_data_extract.json
Loaded 25 entries
Created 164 text chunks for ayurveda
Generating embeddings for 164 chunks...


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Generated embeddings shape: (164, 384)
Building FAISS index...
FAISS index built with 164 vectors
Saved FAISS index: ../datasets/ayurveda/rag_index/faiss_index.bin
Saved embeddings: ../datasets/ayurveda/rag_index/embeddings.npy
Saved chunks metadata: ../datasets/ayurveda/rag_index/chunks_metadata.json
Saved configuration: ../datasets/ayurveda/rag_index/rag_config.json
Total RAG index size: 0.53 MB
✅ RAG index for ayurveda completed!

Building RAG index for domain: DEPRESSION
Loading structured data from: ../datasets/depression/depression_structured_data_extract.json
Loaded 25 entries
Created 81 text chunks for depression
Generating embeddings for 81 chunks...


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Generated embeddings shape: (81, 384)
Building FAISS index...
FAISS index built with 81 vectors
Saved FAISS index: ../datasets/depression/rag_index/faiss_index.bin
Saved embeddings: ../datasets/depression/rag_index/embeddings.npy
Saved chunks metadata: ../datasets/depression/rag_index/chunks_metadata.json
Saved configuration: ../datasets/depression/rag_index/rag_config.json
Total RAG index size: 0.29 MB
✅ RAG index for depression completed!

Building RAG index for domain: DISASTER_MANAGEMENT
Loading structured data from: ../datasets/disaster_management/disaster_management_structured_data_extract.json
Loaded 8 entries
Created 23 text chunks for disaster_management
Generating embeddings for 23 chunks...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings shape: (23, 384)
Building FAISS index...
FAISS index built with 23 vectors
Saved FAISS index: ../datasets/disaster_management/rag_index/faiss_index.bin
Saved embeddings: ../datasets/disaster_management/rag_index/embeddings.npy
Saved chunks metadata: ../datasets/disaster_management/rag_index/chunks_metadata.json
Saved configuration: ../datasets/disaster_management/rag_index/rag_config.json
Total RAG index size: 0.10 MB
✅ RAG index for disaster_management completed!

Building RAG index for domain: EDUCATION
Loading structured data from: ../datasets/education/education_structured_data_extract.json
Loaded 137 entries
Created 1286 text chunks for education
Generating embeddings for 1286 chunks...


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

Generated embeddings shape: (1286, 384)
Building FAISS index...
FAISS index built with 1286 vectors
Saved FAISS index: ../datasets/education/rag_index/faiss_index.bin
Saved embeddings: ../datasets/education/rag_index/embeddings.npy
Saved chunks metadata: ../datasets/education/rag_index/chunks_metadata.json
Saved configuration: ../datasets/education/rag_index/rag_config.json
Total RAG index size: 4.63 MB
✅ RAG index for education completed!

Building RAG index for domain: RICE_DISEASES
Loading structured data from: ../datasets/rice_diseases/rice_diseases_structured_data_extract.json
Loaded 69 entries
Created 194 text chunks for rice_diseases
Generating embeddings for 194 chunks...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Generated embeddings shape: (194, 384)
Building FAISS index...
FAISS index built with 194 vectors
Saved FAISS index: ../datasets/rice_diseases/rag_index/faiss_index.bin
Saved embeddings: ../datasets/rice_diseases/rag_index/embeddings.npy
Saved chunks metadata: ../datasets/rice_diseases/rag_index/chunks_metadata.json
Saved configuration: ../datasets/rice_diseases/rag_index/rag_config.json
Total RAG index size: 0.64 MB
✅ RAG index for rice_diseases completed!

RAG INDEX BUILD SUMMARY
✅ Successfully built: 5 domains
   - ayurveda
   - depression
   - disaster_management
   - education
   - rice_diseases

🎉 RAG index building completed!

✅ All RAG indexes built and saved!
You can now use these indexes for fast retrieval in your Streamlit app.
