In [15]:
# Education Q&A System with RAG and Multimodal Processing
# Working with Existing RAG Index and Real Educational Data

# ## Step 1: Install Required Packages
# Run this cell first to install all dependencies

# %pip install sentence-transformers faiss-cpu requests pillow numpy
# (pathlib is part of the Python 3 standard library and must not be pip-installed)

# ## Step 2: Import Libraries
import json
import os
import re
from typing import Dict, List, Tuple, Optional
import requests
import base64
from pathlib import Path
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from PIL import Image

# Reached only when every import above succeeded (a failed import raises before this line).
print("✅ All libraries imported successfully!")

# ## Step 3: Load and Understand Your Data Structure
# Let's first examine your existing data structure

def explore_data_structure(base_path: str = "../datasets/education"):
    """
    Print a summary of the education dataset found under ``base_path``.

    Purely diagnostic: nothing is returned. The report covers

    * ``education_structured_data_extract.json`` - whether it holds a list or
      a dict, plus a preview of the fields of its first section;
    * ``rag_index/`` - the files it contains, the contents of
      ``rag_config.json`` and a preview of the first entry of
      ``chunks_metadata.json``.

    Every file is read as UTF-8 (the encoding ``EducationQASystem._load_json``
    uses) and each read is guarded separately, so one unreadable or malformed
    file is reported and the remaining checks still run.

    Args:
        base_path: Directory that contains the dataset files.
    """
    base_path = Path(base_path)

    def read_json(json_path):
        """Parse ``json_path`` as UTF-8 encoded JSON."""
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(json_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def preview_section(section):
        """Print one line per field of a structured-data section."""
        if not isinstance(section, dict):
            print(f"  {section}")
            return
        for key, value in section.items():
            if isinstance(value, str) and len(value) > 100:
                # Long text fields are cut to their first 100 characters.
                print(f"  {key}: {value[:100]}...")
            elif isinstance(value, list):
                print(f"  {key}: List with {len(value)} items")
                if value and isinstance(value[0], dict):
                    print(f"    Sample item keys: {list(value[0].keys())}")
            else:
                print(f"  {key}: {value}")

    print("📁 Exploring data structure...")
    print(f"Base path: {base_path}")

    # --- Structured data -------------------------------------------------
    structured_file = base_path / "education_structured_data_extract.json"
    try:
        if structured_file.exists():
            structured_data = read_json(structured_file)

            if isinstance(structured_data, list):
                print(f"\n📊 Structured data is a LIST with {len(structured_data)} sections")
                if structured_data:
                    print(f"Sample structure from first section:")
                    preview_section(structured_data[0])
            else:
                print(f"\n📊 Structured data is a DICT with {len(structured_data)} sections")
                if structured_data:
                    first_key = next(iter(structured_data))
                    print(f"Sample structure from '{first_key}':")
                    preview_section(structured_data[first_key])
    except Exception as e:
        print(f"❌ Error loading structured data: {e}")

    # --- RAG index ---------------------------------------------------------
    rag_path = base_path / "rag_index"
    if not rag_path.exists():
        print("\n❌ RAG index directory not found")
        return

    print(f"\n🔍 RAG index directory found:")
    for file_path in rag_path.iterdir():
        print(f"  📄 {file_path.name}")

    config_file = rag_path / "rag_config.json"
    if config_file.exists():
        try:
            config = read_json(config_file)
            print(f"  📊 RAG Config: {config}")
        except Exception as e:
            print(f"❌ Error loading RAG config: {e}")

    chunks_file = rag_path / "chunks_metadata.json"
    if chunks_file.exists():
        try:
            chunks = read_json(chunks_file)
            print(f"\n🔍 RAG chunks sample (first chunk keys):")
            if chunks:
                first_chunk = chunks[0]
                print(f"  Keys in first chunk: {list(first_chunk.keys())}")
                for key, value in first_chunk.items():
                    if isinstance(value, str) and len(value) > 50:
                        # Chunk text is previewed more tightly: first 50 characters.
                        print(f"  {key}: {value[:50]}...")
                    else:
                        print(f"  {key}: {value}")
        except Exception as e:
            print(f"❌ Error loading RAG chunks: {e}")

# Debug helper to check file locations
def check_file_locations(base_path: str = "../datasets/education"):
    """
    Report which of the required dataset / RAG index files exist.

    For every expected file a line with a ✅ or ❌ marker is printed. JSON
    files that exist are additionally parsed (as UTF-8) and their top-level
    size is shown; a file that cannot be read or parsed is reported together
    with the reason instead of being silently ignored.

    Args:
        base_path: Directory that contains the dataset files. Defaults to the
            location the rest of the notebook uses.
    """
    print("🔍 CHECKING FILE LOCATIONS")
    print("=" * 40)

    # Paths are relative to ``base_path``.
    relative_files = [
        "education_structured_data_extract.json",
        "rag_index/faiss_index.bin",
        "rag_index/chunks_metadata.json",
        "rag_index/rag_config.json",
    ]

    for relative in relative_files:
        file_path = f"{base_path}/{relative}"
        path = Path(file_path)
        exists = "✅" if path.exists() else "❌"
        print(f"{exists} {file_path}")

        if path.exists() and path.suffix == '.json':
            try:
                with open(path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers both malformed JSON and undecodable bytes.
                # Anything else (e.g. KeyboardInterrupt) must propagate.
                print(f"    ⚠️ Could not read JSON: {e}")
                continue

            if isinstance(data, list):
                print(f"    📊 Contains {len(data)} items")
            elif isinstance(data, dict):
                print(f"    📊 Contains {len(data)} keys")

# Check files before initializing system: prints a ✅/❌ line for each expected dataset and RAG index file.
check_file_locations()

# ## Step 4: Education Q&A System Class
# This class works with your existing RAG index and data structure

class EducationQASystem:
    def __init__(self, base_path: str = "../datasets/education"):
        """
        Set up the Q&A system from data that already exists on disk.

        Resolves the dataset directories, loads the optional QA dataset and
        the structured extract, creates the sentence-embedding model, stores
        the Ollama endpoint URLs and loads the prebuilt RAG index when both of
        its files are present.

        Args:
            base_path: Directory holding the education dataset files.
        """
        self.base_path = Path(base_path)
        self.images_path = self.base_path / "images"
        self.rag_index_path = self.base_path / "rag_index"

        print("🔄 Initializing Education Q&A System...")
        print("🔄 Loading datasets...")

        # The QA dataset is optional; an empty dict stands in when it is absent.
        qa_filename = "education_qa_dataset.json"
        if not (self.base_path / qa_filename).exists():
            print("ℹ️ QA dataset not found - skipping (optional)")
            self.qa_dataset = {}
        else:
            self.qa_dataset = self._load_json(qa_filename)

        # The structured extract is always requested.
        self.structured_data = self._load_json("education_structured_data_extract.json")

        # Model that turns questions into vectors for the RAG search.
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.rag_index = None
        self.chunks_metadata = None

        # Text and multimodal models share one local Ollama generate endpoint.
        ollama_generate_url = "http://localhost:11434/api/generate"
        self.text_model_url = ollama_generate_url
        self.multimodal_model_url = ollama_generate_url

        # The index is only usable when both of its files are on disk.
        required_rag_files = ("chunks_metadata.json", "faiss_index.bin")
        if all((self.rag_index_path / name).exists() for name in required_rag_files):
            self._load_existing_rag()
        else:
            print("❌ RAG index not found. Please build it first using the RAG builder.")
            print(f"Expected location: {self.rag_index_path}")
            print("💡 Tip: Make sure you ran the Education RAG Builder and saved to the correct location")
            self.chunks_metadata = []
            self.rag_index = None

        print("✅ Education Q&A System initialized!")
        data_format = "list" if isinstance(self.structured_data, list) else "dict"
        print(f"📊 Loaded {len(self.structured_data)} structured sections ({data_format} format)")
        rag_status = 'Yes' if self.rag_index else 'No'
        print(f"🔍 RAG index loaded: {rag_status}")
    
    def _load_json(self, filename: str):
        """Load JSON file from the base path - handles both dict and list"""
        try:
            file_path = self.base_path / filename
            if file_path.exists():
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                    # Handle case where data is a list (your actual structure)
                    if isinstance(data, list):
                        print(f"✅ Loaded list with {len(data)} items from {filename}")
                        return data
                    else:
                        print(f"✅ Loaded dict with {len(data)} keys from {filename}")
                        return data
            else:
                print(f"⚠️ File not found: {filename}")
                return [] if filename == "education_structured_data_extract.json" else {}
        except Exception as e:
            print(f"❌ Error loading {filename}: {e}")
            return [] if filename == "education_structured_data_extract.json" else {}
    
    def _load_existing_rag(self):
        """Load existing RAG index and metadata"""
        try:
            # Load chunks metadata
            metadata_file = self.rag_index_path / "chunks_metadata.json"
            if metadata_file.exists():
                with open(metadata_file, 'r') as f:
                    self.chunks_metadata = json.load(f)
                print(f"✅ Loaded {len(self.chunks_metadata)} RAG chunks")
            
            # Load FAISS index
            index_file = self.rag_index_path / "faiss_index.bin"
            if index_file.exists():
                self.rag_index = faiss.read_index(str(index_file))
                print(f"✅ FAISS index loaded with {self.rag_index.ntotal} vectors")
            else:
                print("❌ FAISS index file not found")
                
        except Exception as e:
            print(f"❌ Error loading RAG index: {e}")
            self.chunks_metadata = []
            self.rag_index = None
    
    def is_page_based_question(self, question: str) -> Tuple[bool, Optional[int]]:
        """
        Decide whether a question refers to a specific page.

        Recognised forms (case-insensitive):
        - "in page 15", "on page 16", "from page 17", "page 15"
        - "pg 15", "pg. 15"
        - "p 15", "p. 15", "p15"

        Every pattern is anchored at a word boundary so that a letter "p"
        inside another word ("step 3", "top 5", "mp3") is not mistaken for a
        page reference.

        Args:
            question: The question text.

        Returns:
            ``(True, page_number)`` for the first matching pattern, otherwise
            ``(False, None)``.
        """
        # Ordered from most to least explicit; "(in|on|from|at) page N" is
        # already covered by the plain "page N" pattern.
        page_patterns = [
            r'\bpage\s+(\d+)',
            r'\bpg\.?\s*(\d+)',
            r'\bp\.?\s*(\d+)',
        ]

        question_lower = question.lower()
        for pattern in page_patterns:
            match = re.search(pattern, question_lower)
            if match:
                page_number = int(match.group(1))
                print(f"🔍 Detected page-based question for page {page_number}")
                return True, page_number

        return False, None
    
    def perform_rag_search(self, question: str, top_k: int = 5) -> List[Dict]:
        """
        Perform RAG search using existing index
        """
        if self.rag_index is None or not self.chunks_metadata:
            print("❌ RAG index not available")
            return []
        
        print(f"🔍 Searching RAG for: '{question[:50]}...'")
        
        # Encode question
        question_embedding = self.embedding_model.encode([question])
        faiss.normalize_L2(question_embedding.astype('float32'))
        
        # Search
        distances, indices = self.rag_index.search(question_embedding.astype('float32'), top_k)
        
        # Get relevant chunks
        relevant_chunks = []
        for i, idx in enumerate(indices[0]):
            if idx < len(self.chunks_metadata):
                chunk_data = self.chunks_metadata[idx].copy()
                chunk_data['relevance_score'] = float(distances[0][i])
                relevant_chunks.append(chunk_data)
        
        print(f"✅ Found {len(relevant_chunks)} relevant chunks")
        return relevant_chunks
    
    def get_page_based_info(self, page_number: int) -> Dict:
        """
        Get information for a specific page from structured data
        
        Your data structure is a list of sections, each with page_start and page_end
        """
        print(f"📖 Looking for content on page {page_number}")
        
        matching_sections = []
        
        # Handle list format (your actual structure)
        if isinstance(self.structured_data, list):
            for i, section_data in enumerate(self.structured_data):
                if not isinstance(section_data, dict):
                    continue
                    
                # Check if this section covers the requested page
                page_start = section_data.get('page_start', 0)
                page_end = section_data.get('page_end', 0)
                
                if page_start <= page_number <= page_end:
                    matching_sections.append(section_data)
        
        # Handle dict format (fallback)
        elif isinstance(self.structured_data, dict):
            for section_id, section_data in self.structured_data.items():
                if not isinstance(section_data, dict):
                    continue
                    
                page_start = section_data.get('page_start', 0)
                page_end = section_data.get('page_end', 0)
                
                if page_start <= page_number <= page_end:
                    matching_sections.append(section_data)
        
        if matching_sections:
            # Combine all matching sections
            combined_text = "\n\n".join([
                f"Section: {section.get('main_heading', '')} - {section.get('sub_heading', '')}\n{section.get('content', '')}" 
                for section in matching_sections
            ])
            
            combined_images = []
            
            # Collect all images from matching sections
            for section in matching_sections:
                section_images = section.get('images', [])
                for img in section_images:
                    # Handle your image structure
                    if isinstance(img, dict):
                        img_page = img.get('page', 0)
                        img_path = img.get('path', '')
                        if img_page == page_number and img_path:
                            combined_images.append(img_path)
                    elif isinstance(img, str):
                        # Handle case where images are just paths
                        combined_images.append(img)
            
            print(f"✅ Found {len(matching_sections)} sections covering page {page_number}")
            print(f"🖼️ Found {len(combined_images)} images for page {page_number}")
            
            return {
                'text': combined_text,
                'images': combined_images,
                'page_number': page_number,
                'sections': matching_sections
            }
        
        return {
            'text': f"No content found for page {page_number}",
            'images': [],
            'page_number': page_number,
            'sections': []
        }
    
    def get_images_from_rag_results(self, relevant_chunks: List[Dict]) -> List[str]:
        """
        Smart image extraction: Match RAG chunk content with structured JSON sections
        Only process images from the top 2 most relevant chunks to keep it manageable
        """
        all_images = []
        
        # Limit to top 2 chunks to avoid processing too many images
        top_chunks = relevant_chunks[:2]
        print(f"🔍 Processing images from top {len(top_chunks)} relevant chunks")
        
        for i, chunk in enumerate(top_chunks, 1):
            print(f"📄 Processing chunk {i}/{len(top_chunks)}")
            
            chunk_content = chunk.get('content', chunk.get('text', ''))
            if not chunk_content:
                continue
            
            # Method 1: Use section_id if available (from new RAG)
            chunk_metadata = chunk.get('metadata', {})
            section_id = chunk_metadata.get('section_id')
            
            if section_id is not None and isinstance(self.structured_data, list):
                if 0 <= section_id < len(self.structured_data):
                    section = self.structured_data[section_id]
                    section_images = section.get('images', [])
                    
                    for img in section_images:
                        if isinstance(img, dict) and img.get('path'):
                            all_images.append(img['path'])
                    
                    print(f"   ✅ Found {len(section_images)} images using section_id {section_id}")
                    continue
            
            # Method 2: Content matching fallback
            print(f"   🔍 Using content matching for chunk")
            chunk_preview = chunk_content[:150].strip()
            
            # Search through structured data sections
            if isinstance(self.structured_data, list):
                for section_idx, section in enumerate(self.structured_data):
                    if not isinstance(section, dict):
                        continue
                    
                    section_content = section.get('content', '')
                    if not section_content:
                        continue
                    
                    # Check if chunk content matches this section
                    # Try multiple matching strategies
                    match_found = False
                    
                    # Strategy 1: Direct substring match
                    if len(chunk_preview) > 50 and chunk_preview in section_content:
                        match_found = True
                    
                    # Strategy 2: Reverse match - section content in chunk
                    elif len(section_content) > 50 and section_content[:150] in chunk_content:
                        match_found = True
                    
                    # Strategy 3: Common significant words (fallback)
                    else:
                        chunk_words = set(word.lower() for word in chunk_content.split() if len(word) > 5)
                        section_words = set(word.lower() for word in section_content.split() if len(word) > 5)
                        common_words = chunk_words.intersection(section_words)
                        
                        if len(common_words) >= 3:  # At least 3 significant common words
                            match_found = True
                    
                    if match_found:
                        # Get images from this specific section
                        section_images = section.get('images', [])
                        section_image_count = 0
                        
                        for img in section_images:
                            if isinstance(img, dict) and img.get('path'):
                                all_images.append(img['path'])
                                section_image_count += 1
                        
                        print(f"   ✅ Matched section {section_idx}, found {section_image_count} images")
                        break  # Found matching section, move to next chunk
        
        # Remove duplicates while preserving order and limit to 3 images
        unique_images = []
        seen = set()
        for img_path in all_images:
            if img_path not in seen and len(unique_images) < 3:  # Limit to 3 images
                unique_images.append(img_path)
                seen.add(img_path)
        
        print(f"🖼️ Selected {len(unique_images)} images (max 3) from {len(top_chunks)} chunks")
        return unique_images
    
    def encode_image_to_base64(self, image_path: str) -> str:
        """
        Return the base64 text of an image file, or ``""`` when it is unavailable.

        The path is tried in two forms: first with a single leading ``../``
        (image paths in the JSON are written without it, while the notebook
        runs one folder deeper), then without that prefix. A missing file or
        any error while reading is reported and yields an empty string.
        """
        try:
            # Drop an existing '../' first so the prefix is never doubled.
            clean_path = image_path[3:] if image_path.startswith('../') else image_path
            fixed_path = f"../{clean_path}"

            # Prefixed location first, then the path without the prefix.
            for candidate in (Path(fixed_path), Path(clean_path)):
                if candidate.exists():
                    with open(candidate, "rb") as image_file:
                        return base64.b64encode(image_file.read()).decode('utf-8')

            print(f"❌ Image not found: {fixed_path} or {clean_path}")
            return ""
        except Exception as e:
            print(f"❌ Error encoding image {image_path}: {e}")
            return ""
    
    def query_gemma_text(self, prompt: str, model_name: str = "gemma-family:latest") -> str:
        """Query your fine-tuned Gemma text model"""
        payload = {
            "model": model_name,
            "prompt": prompt,
            "stream": False,
            "options": {
                "temperature": 0.7,
                "top_p": 0.9
            }
        }
        
        try:
            print(f"🤖 Querying text model: {model_name}")
            response = requests.post(self.text_model_url, json=payload, timeout=120)
            response.raise_for_status()
            result = response.json().get('response', '')
            print(f"✅ Got response ({len(result)} characters)")
            return result
        except Exception as e:
            error_msg = f"Error querying text model: {e}"
            print(f"❌ {error_msg}")
            return error_msg
    
    def query_gemma_multimodal(self, prompt: str, image_base64: str, 
                              model_name: str = "gemma3n:e4b") -> str:
        """Query your multimodal Gemma model"""
        payload = {
            "model": model_name,
            "prompt": prompt,
            "images": [image_base64],
            "stream": False,
            "options": {
                "temperature": 0.7,
                "top_p": 0.9
            }
        }
        
        try:
            print(f"🎨 Querying multimodal model: {model_name}")
            response = requests.post(self.multimodal_model_url, json=payload, timeout=120)
            response.raise_for_status()
            result = response.json().get('response', '')
            print(f"✅ Got image analysis ({len(result)} characters)")
            return result
        except Exception as e:
            error_msg = f"Error querying multimodal model: {e}"
            print(f"❌ {error_msg}")
            return error_msg
    
    def process_images_with_context(self, image_paths: List[str], context: str, 
                                  question: str) -> List[Dict]:
        """Process images with multimodal model"""
        image_explanations = []
        
        for i, image_path in enumerate(image_paths, 1):
            print(f"🖼️ Processing image {i}/{len(image_paths)}: {Path(image_path).name}")
            
            image_base64 = self.encode_image_to_base64(image_path)
            if not image_base64:
                continue
            
            prompt = f"""
            Educational Context: {context[:1000]}...
            
            Student Question: {question}
            
            Please analyze this educational image and explain:
            1. What you see in the image
            2. How it relates to the educational content
            3. How it helps answer the student's question
            
            Provide a clear, educational explanation that would help a student understand the topic better.
            """
            
            explanation = self.query_gemma_multimodal(prompt, image_base64)
            image_explanations.append({
                'image_path': image_path,
                'explanation': explanation
            })
        
        return image_explanations
    
    def answer_general_question(self, question: str) -> Dict:
        """Answer general question using RAG + multimodal processing"""
        
        print("\n" + "="*60)
        print("🔍 PROCESSING GENERAL QUESTION")
        print("="*60)
        print(f"Question: {question}")
        
        # Step 1: RAG search
        relevant_chunks = self.perform_rag_search(question, top_k=5)
        
        if not relevant_chunks:
            return {
                'answer': "I couldn't find relevant information for your question in the educational materials.",
                'sources': [],
                'images': [],
                'type': 'general'
            }
        
        # Step 2: Combine text context
        context_text = "\n\n".join([
            f"Source: {chunk.get('source', 'Educational Material')}\n{chunk.get('text', chunk.get('content', ''))}"
            for chunk in relevant_chunks
        ])
        
        print(f"📝 Combined context from {len(relevant_chunks)} sources")
        
        # Step 3: Get images from relevant pages
        relevant_images = self.get_images_from_rag_results(relevant_chunks)
        
        # Step 4: Process images if found
        image_explanations = []
        if relevant_images:
            print(f"🎨 Processing {len(relevant_images)} relevant images...")
            image_explanations = self.process_images_with_context(
                relevant_images, context_text, question
            )
        
        # Step 5: Generate final answer
        print("🤖 Generating comprehensive answer...")
        
        image_context = ""
        if image_explanations:
            image_context = "\n\nVisual Information:\n" + "\n\n".join([
                f"Image Analysis {i+1}: {img['explanation']}"
                for i, img in enumerate(image_explanations)
            ])
        
        final_prompt = f"""
        Student Question: {question}
        
        Educational Content:
        {context_text}
        {image_context}
        
        Instructions:
        - Provide a comprehensive, educational answer to the student's question
        - Use the provided educational content and visual information
        - Structure your answer clearly with proper explanations
        - Make it suitable for a student learning this topic
        - Reference specific information from the materials when relevant
        
        Answer:
        """
        
        final_answer = self.query_gemma_text(final_prompt)
        
        return {
            'answer': final_answer,
            'sources': relevant_chunks,
            'images': image_explanations,
            'type': 'general',
            'context_used': len(context_text),
            'images_processed': len(image_explanations)
        }
    
    def answer_page_based_question(self, question: str, page_number: int) -> Dict:
        """Answer page-based question"""
        
        print("\n" + "="*60)
        print("📖 PROCESSING PAGE-BASED QUESTION")
        print("="*60)
        print(f"Question: {question}")
        print(f"Page: {page_number}")
        
        # Step 1: Get page information
        page_info = self.get_page_based_info(page_number)
        
        if not page_info['text'] or 'No content found' in page_info['text']:
            return {
                'answer': f"I couldn't find any content for page {page_number} in the educational materials.",
                'page_info': page_info,
                'images': [],
                'page_number': page_number,
                'type': 'page_based'
            }
        
        # Step 2: Process images if found
        image_explanations = []
        if page_info['images']:
            print(f"🎨 Processing {len(page_info['images'])} images from page {page_number}...")
            image_explanations = self.process_images_with_context(
                page_info['images'], page_info['text'], question
            )
        
        # Step 3: Generate answer
        print("🤖 Generating page-specific answer...")
        
        image_context = ""
        if image_explanations:
            image_context = "\n\nVisual Information from Page:\n" + "\n\n".join([
                f"Image {i+1}: {img['explanation']}"
                for i, img in enumerate(image_explanations)
            ])
        
        final_prompt = f"""
        Student Question: {question}
        Page Number Requested: {page_number}
        
        Content from Page {page_number}:
        {page_info['text']}
        {image_context}
        
        Instructions:
        - Answer the student's question specifically based on the content from page {page_number}
        - Reference the page content directly in your explanation
        - If there are images, explain how they support the content
        - Make your answer educational and easy to understand
        - Focus specifically on what's covered on this page
        
        Answer:
        """
        
        final_answer = self.query_gemma_text(final_prompt)
        
        return {
            'answer': final_answer,
            'page_info': page_info,
            'images': image_explanations,
            'page_number': page_number,
            'type': 'page_based',
            'sections_found': len(page_info.get('sections', []))
        }
    
    def process_question(self, question: str) -> Dict:
        """Main method to process any question"""
        
        # Determine question type
        is_page_based, page_number = self.is_page_based_question(question)
        
        if is_page_based and page_number:
            return self.answer_page_based_question(question, page_number)
        else:
            return self.answer_general_question(question)

# ## Step 5: Initialize the System
# Create an instance of our Q&A system

# Banner announcing that system initialization is starting.
print("\n" + "="*60)
print("🚀 INITIALIZING EDUCATION Q&A SYSTEM")
print("="*60)

# Module-level instance used by test_question() and ask_question() below.
qa_system = EducationQASystem("../datasets/education")

# ## Step 6: Test the System
# Let's test with both types of questions

def test_question(question: str):
    """Test a single question and display results"""
    print(f"\n{'='*80}")
    print(f"🎯 TESTING QUESTION")
    print(f"{'='*80}")
    print(f"❓ Question: {question}")
    
    # Process the question
    result = qa_system.process_question(question)
    
    # Display results
    print(f"\n📊 RESULTS:")
    print(f"   Type: {result['type']}")
    print(f"   Answer length: {len(result['answer'])} characters")
    
    if result['type'] == 'general':
        print(f"   Sources used: {len(result.get('sources', []))}")
        print(f"   Context size: {result.get('context_used', 0)} characters")
    else:
        print(f"   Page number: {result['page_number']}")
        print(f"   Sections found: {result.get('sections_found', 0)}")
    
    print(f"   Images processed: {len(result.get('images', []))}")
    
    print(f"\n📝 ANSWER:")
    print("-" * 50)
    print(result['answer'])
    
    if result.get('images'):
        print(f"\n🖼️ IMAGES ANALYZED:")
        for i, img in enumerate(result['images'], 1):
            print(f"   {i}. {Path(img['image_path']).name}")
            print(f"      {img['explanation'][:100]}...")
    
    return result

# Test questions based on your data.
# The first four have no page reference and exercise the general (RAG) pipeline;
# the last four name a page and exercise the page-based pipeline.
test_questions = [
    # General questions
    "Explain the importance of irrigation in crop production. What are different irrigation methods?",
    "What are the basic practices of crop production?",
    "How do we prepare soil for cultivation?",
    "What is the difference between manure and fertilizers?",
    
    # Page-based questions
    "In page 15, explain about agricultural practices",
    "On page 16, what tools are used for soil preparation?",
    "From page 18, explain about sowing methods",
    "In page 20, what are the advantages of manure?"
]

print(f"\n🧪 RUNNING {len(test_questions)} TEST QUESTIONS...")

# Test each question and keep every (question, result) pair in `results`.
results = []
for i, question in enumerate(test_questions, 1):
    print(f"\n{'#'*60}")
    print(f"TEST {i}/{len(test_questions)}")
    print(f"{'#'*60}")
    
    result = test_question(question)
    results.append({
        'question': question,
        'result': result
    })

# NOTE(review): the count below is the number of questions that were run; a model
# error is returned as the answer text by the query methods, so it is not detected here.
print(f"\n🎉 TESTING COMPLETE!")
print(f"✅ Processed {len(results)} questions successfully")

# ## Step 7: Interactive Testing
# Function for interactive testing

def ask_question():
    """Run an interactive question/answer loop against the global ``qa_system``.

    Questions are read with ``input()`` until the user types ``quit``, ``exit``
    or ``q`` (case-insensitive), or until input is closed or interrupted.
    Every non-empty question is passed to ``qa_system.process_question`` and
    the ``'type'`` and ``'answer'`` entries of the returned dict are printed,
    plus the number of entries under ``'images'`` when that key is non-empty.
    An exception raised while processing one question is printed and the loop
    moves on to the next prompt.

    Returns:
        None
    """
    print(f"\n{'='*60}")
    print("💬 INTERACTIVE Q&A SESSION")
    print("="*60)
    print("Enter your question (or 'quit' to exit):")

    while True:
        try:
            question = input("\n❓ Your question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # A kernel interrupt or a closed stdin ends the session cleanly
            # instead of surfacing a traceback in the notebook.
            print("\n👋 Goodbye!")
            break

        if question.lower() in ['quit', 'exit', 'q']:
            print("👋 Goodbye!")
            break

        if not question:
            print("Please enter a question.")
            continue

        try:
            result = qa_system.process_question(question)

            print(f"\n🤖 Answer ({result['type']} question):")
            print("-" * 40)
            print(result['answer'])

            if result.get('images'):
                print(f"\n🖼️ Analyzed {len(result['images'])} images")

        except Exception as e:
            # Keep the session alive: one failing question should not end it.
            print(f"❌ Error processing question: {e}")

# Uncomment the line below to start interactive session
# ask_question()

# Closing summary for this cell: names the two helper functions a reader can
# call next and the two Ollama model tags mentioned for text and multimodal use.
print("\n✅ Notebook setup complete! You can now:")
print("1. Test individual questions using test_question('your question')")
print("2. Start interactive session using ask_question()")
print("3. Modify the system for your specific needs")
print("\n🔧 Make sure your Ollama models are running:")
print("   - gemma-family:latest (text model)")
print("   - gemma3n:e4b (multimodal model)")

✅ All libraries imported successfully!
🔍 CHECKING FILE LOCATIONS
✅ ../datasets/education/education_structured_data_extract.json
    📊 Contains 137 items
✅ ../datasets/education/rag_index/faiss_index.bin
✅ ../datasets/education/rag_index/chunks_metadata.json
    📊 Contains 2090 items
✅ ../datasets/education/rag_index/rag_config.json
    📊 Contains 9 keys

🚀 INITIALIZING EDUCATION Q&A SYSTEM
🔄 Initializing Education Q&A System...
🔄 Loading datasets...
✅ Loaded list with 155 items from education_qa_dataset.json
✅ Loaded list with 137 items from education_structured_data_extract.json
✅ Loaded 2090 RAG chunks
✅ FAISS index loaded with 2090 vectors
✅ Education Q&A System initialized!
📊 Loaded 137 structured sections (list format)
🔍 RAG index loaded: Yes

🧪 RUNNING 8 TEST QUESTIONS...

############################################################
TEST 1/8
############################################################

🎯 TESTING QUESTION
❓ Question: Explain the importance of irrigation in crop pr

## RAG test

In [8]:
# RAG Index Tester - Simple script to inspect what RAG is returning

import json
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from pathlib import Path

class RAGTester:
    """Small harness for inspecting what the saved FAISS RAG index returns.

    On construction it loads the ``all-MiniLM-L6-v2`` sentence-transformer,
    the FAISS index at ``<base_path>/rag_index/faiss_index.bin`` and the chunk
    list at ``<base_path>/rag_index/chunks_metadata.json``. The methods then
    search the index for a question and print the matching chunk records.

    Attributes:
        base_path: ``Path`` of the dataset directory.
        rag_index_path: ``base_path / "rag_index"``.
        embedding_model: The loaded ``SentenceTransformer``.
        rag_index: The loaded FAISS index, or ``None`` if loading failed.
        chunks_metadata: The parsed chunk list, or ``None`` if loading failed.
    """

    def __init__(self, base_path: str = "../datasets/education"):
        """Load the embedding model, then the index and chunk metadata.

        Args:
            base_path: Directory that contains the ``rag_index`` folder.
        """
        self.base_path = Path(base_path)
        self.rag_index_path = self.base_path / "rag_index"

        print("🔄 Loading RAG components...")

        # Load embedding model
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Load RAG index and metadata
        self.rag_index = None
        self.chunks_metadata = None
        self._load_rag()

        # Compare against None explicitly rather than relying on the
        # truthiness of a third-party index object.
        if self.rag_index is not None:
            print(f"✅ RAG Tester ready! Index has {self.rag_index.ntotal} vectors")
        else:
            print("❌ Failed to load RAG index")

    def _load_rag(self):
        """Read ``chunks_metadata.json`` and ``faiss_index.bin`` from disk.

        Failures are printed, not raised; whichever attribute was not loaded
        keeps its previous value (``None`` after ``__init__``).
        """
        try:
            # Load chunks metadata (explicit encoding so the result does not
            # depend on the platform's default text encoding)
            metadata_file = self.rag_index_path / "chunks_metadata.json"
            with open(metadata_file, 'r', encoding='utf-8') as f:
                self.chunks_metadata = json.load(f)
            print(f"✅ Loaded {len(self.chunks_metadata)} chunk metadata")

            # Load FAISS index
            index_file = self.rag_index_path / "faiss_index.bin"
            self.rag_index = faiss.read_index(str(index_file))
            print(f"✅ Loaded FAISS index with {self.rag_index.ntotal} vectors")

            # search_rag maps index positions onto chunks_metadata, so the two
            # files must describe the same number of vectors.
            if self.rag_index.ntotal != len(self.chunks_metadata):
                print(
                    f"⚠️ Index/metadata size mismatch: {self.rag_index.ntotal} vectors "
                    f"vs {len(self.chunks_metadata)} chunks"
                )

        except Exception as e:
            print(f"❌ Error loading RAG: {e}")

    def search_rag(self, question: str, top_k: int = 5):
        """Embed ``question``, query the index and return the matching chunks.

        Args:
            question: Text to search for.
            top_k: Number of neighbours requested from the index.

        Returns:
            A list of shallow copies of the matching chunk dicts, each with
            two extra keys: ``'similarity_score'`` (the value FAISS returned
            for that hit, as ``float``) and ``'rank'`` (1-based position in
            the FAISS result row). Returns ``[]`` when the index or metadata
            is not loaded.
        """
        if self.rag_index is None or not self.chunks_metadata:
            print("❌ RAG not loaded")
            return []

        print(f"\n🔍 Searching for: '{question}'")
        print("=" * 60)

        # Build ONE contiguous float32 matrix and normalise that same object.
        # normalize_L2 works in place, so normalising a temporary
        # ``.astype(...)`` copy would leave the searched query un-normalised.
        query = np.ascontiguousarray(
            self.embedding_model.encode([question]), dtype='float32'
        )
        faiss.normalize_L2(query)

        # Search
        distances, indices = self.rag_index.search(query, top_k)

        # Get results
        results = []
        for i, idx in enumerate(indices[0]):
            idx = int(idx)
            # FAISS pads missing neighbours with -1; without the lower bound
            # that would silently index the LAST chunk.
            if 0 <= idx < len(self.chunks_metadata):
                chunk = self.chunks_metadata[idx].copy()
                chunk['similarity_score'] = float(distances[0][i])
                chunk['rank'] = i + 1
                results.append(chunk)

        return results

    def print_results(self, results):
        """Print each result's rank, score and remaining fields.

        String fields longer than 200 characters are cut to 200 and suffixed
        with ``...``; every other value is printed as-is.

        Args:
            results: List of dicts as returned by ``search_rag``.
        """
        if not results:
            print("❌ No results found")
            return

        print(f"📊 Found {len(results)} results:")
        print("=" * 60)

        for result in results:
            print(f"\n🏆 RANK {result['rank']} (Score: {result['similarity_score']:.4f})")
            print("-" * 40)

            # Print all available fields except the two shown in the header
            for key, value in result.items():
                if key in ('rank', 'similarity_score'):
                    continue

                if isinstance(value, str) and len(value) > 200:
                    print(f"{key}: {value[:200]}...")
                else:
                    print(f"{key}: {value}")

            print("-" * 40)

    def inspect_chunk_structure(self):
        """Print the fields of the first chunk and per-field coverage.

        For the first chunk each field is listed with its type (and length or
        item count for ``str``/``list`` values). Then, across all chunks, the
        number and percentage of chunks containing each field is printed.
        """
        if not self.chunks_metadata:
            print("❌ No chunk metadata loaded")
            return

        total = len(self.chunks_metadata)

        print(f"\n🔍 CHUNK STRUCTURE ANALYSIS")
        print("=" * 60)
        print(f"Total chunks: {total}")

        # Analyze first chunk (the guard above guarantees there is one)
        first_chunk = self.chunks_metadata[0]
        print(f"\nFirst chunk fields:")
        for key, value in first_chunk.items():
            value_type = type(value).__name__
            if isinstance(value, str):
                print(f"  {key}: {value_type} (length: {len(value)})")
            elif isinstance(value, list):
                print(f"  {key}: {value_type} (items: {len(value)})")
            else:
                print(f"  {key}: {value_type} = {value}")

        # Check what fields are common across chunks
        print(f"\nField analysis across all {total} chunks:")
        field_counts = {}
        for chunk in self.chunks_metadata:
            for key in chunk.keys():
                field_counts[key] = field_counts.get(key, 0) + 1

        for field, count in sorted(field_counts.items()):
            percentage = (count / total) * 100
            print(f"  {field}: {count}/{total} chunks ({percentage:.1f}%)")

    def test_question(self, question: str, top_k: int = 5):
        """Search for ``question``, print the hits and return them.

        Args:
            question: Text to search for.
            top_k: Number of neighbours requested from the index.

        Returns:
            The list produced by ``search_rag``.
        """
        results = self.search_rag(question, top_k)
        self.print_results(results)
        return results

# Initialize the tester
# Constructing RAGTester loads the embedding model, the FAISS index and the
# chunk metadata, printing its own progress messages.
print("🚀 Initializing RAG Tester...")
rag_tester = RAGTester("../datasets/education")

# First, let's understand the chunk structure
rag_tester.inspect_chunk_structure()

# Test questions
# NOTE: this rebinds the global `test_questions`, which the Q&A test cell
# earlier in the notebook also assigns; re-running that loop after this cell
# would iterate over these five questions instead.
test_questions = [
    "What is irrigation?",
    "How to prepare soil?", 
    "What are different irrigation methods?",
    "Basic practices of crop production",
    "Difference between manure and fertilizers"
]

print(f"\n\n🧪 TESTING {len(test_questions)} QUESTIONS")
print("=" * 80)

# Test each question
# top_k=3 keeps the printed output short; the return value of test_question is
# discarded here because print_results has already displayed the hits.
for i, question in enumerate(test_questions, 1):
    print(f"\n\n{'#' * 20} TEST {i}/{len(test_questions)} {'#' * 20}")
    rag_tester.test_question(question, top_k=3)

# Interactive testing
def interactive_rag_test():
    """Interactive RAG testing"""
    print(f"\n\n💬 INTERACTIVE RAG TESTING")
    print("=" * 60)
    print("Enter questions to test RAG (or 'quit' to exit):")
    
    while True:
        question = input("\n❓ Question: ").strip()
        
        if question.lower() in ['quit', 'exit', 'q']:
            print("👋 Goodbye!")
            break
        
        if not question:
            continue
        
        try:
            rag_tester.test_question(question, top_k=5)
        except Exception as e:
            print(f"❌ Error: {e}")

# Uncomment to start interactive testing
# interactive_rag_test()

# Closing messages for the cell; the interactive loop is opt-in so that running
# the whole notebook does not block on input().
print("\n✅ RAG testing complete!")
print("\n🔧 To test interactively, uncomment the last line and run:")
print("interactive_rag_test()")

🚀 Initializing RAG Tester...
🔄 Loading RAG components...
✅ Loaded 1286 chunk metadata
✅ Loaded FAISS index with 1286 vectors
✅ RAG Tester ready! Index has 1286 vectors

🔍 CHUNK STRUCTURE ANALYSIS
Total chunks: 1286

First chunk fields:
  chunk_id: int = 0
  text: str (length: 46)
  metadata: dict = {'domain': 'education', 'context': 'item_0 > main_heading', 'key': 'main_heading', 'chunk_type': 'text_content'}

Field analysis across all 1286 chunks:
  chunk_id: 1286/1286 chunks (100.0%)
  metadata: 1286/1286 chunks (100.0%)
  text: 1286/1286 chunks (100.0%)


🧪 TESTING 5 QUESTIONS


#################### TEST 1/5 ####################

🔍 Searching for: 'What is irrigation?'
📊 Found 3 results:

🏆 RANK 1 (Score: 0.9098)
----------------------------------------
chunk_id: 7
text: content:  to as  agricultural practices  which are listed below: (i) Preparation of soil (ii) Sowing (iii) Adding manure and fertilisers (iv) Irrigation (v) Protecting from weeds (vi) Harvesting (vii)...
metadata: {'

## simple test

In [16]:
# Simple test to debug Gemma response
import requests
import json

def test_gemma_simple():
    """Send one fixed prompt to the local Ollama text model and dump the reply.

    Posts a non-streaming request for ``gemma-family:latest`` to
    ``http://localhost:11434/api/generate``, then prints the HTTP status, the
    whole response JSON and the ``response`` field with its length. On a
    non-200 status the status and body text are printed instead. Any exception
    is printed rather than raised.

    Returns:
        None
    """
    rule = "=" * 40

    # Question, one-sentence context and instruction, separated by blank lines
    simple_prompt = (
        "Question: What is irrigation?\n"
        "\n"
        "Context: Irrigation is the artificial application of water to crops for growth.\n"
        "\n"
        "Please answer the question."
    )

    print("🔧 SENDING TO GEMMA:")
    print(rule)
    print(simple_prompt)
    print(rule)

    # Request body for the generate endpoint
    request_body = {
        "model": "gemma-family:latest",
        "prompt": simple_prompt,
        "stream": False,
        "options": {"temperature": 0.7, "top_p": 0.9},
    }

    try:
        reply = requests.post(
            "http://localhost:11434/api/generate",
            json=request_body,
            timeout=60,
        )
        print(f"Status Code: {reply.status_code}")

        # Anything other than 200: show what the server said and stop
        if reply.status_code != 200:
            print(f"Error: {reply.status_code}")
            print(reply.text)
            return

        reply_json = reply.json()
        print("🔧 FULL RESPONSE JSON:")
        print(json.dumps(reply_json, indent=2))

        generated_text = reply_json.get('response', '')
        print("\n🔧 RESPONSE CONTENT:")
        print(rule)
        print(f"Length: {len(generated_text)}")
        print(f"Content: '{generated_text}'")
        print(rule)

    except Exception as e:
        print(f"Error: {e}")

# Run the test
# test_gemma_simple prints everything itself and returns None, so nothing is
# echoed as the cell's value.
print("🚀 Testing Gemma Response...")
test_gemma_simple()

🚀 Testing Gemma Response...
🔧 SENDING TO GEMMA:
Question: What is irrigation?

Context: Irrigation is the artificial application of water to crops for growth.

Please answer the question.
Status Code: 200
🔧 FULL RESPONSE JSON:
{
  "model": "gemma-family:latest",
  "created_at": "2025-08-05T16:25:26.524135Z",
  "response": "Irrigation is the practice of supplying water to crops through artificial means, such as canals, pumps, and sprinklers, to ensure they have adequate moisture for healthy growth.",
  "done": true,
  "done_reason": "stop",
  "context": [
    236820,
    236909,
    1906,
    236779,
    1340,
    236779,
    1005,
    111038,
    105,
    2364,
    107,
    14977,
    236787,
    2900,
    563,
    33314,
    236881,
    107,
    107,
    3637,
    236787,
    107161,
    563,
    506,
    16477,
    3739,
    529,
    1813,
    531,
    20859,
    573,
    3877,
    236761,
    107,
    107,
    9366,
    3890,
    506,
    2934,
    236761,
    106,
    107,
    105,

## end to end test

In [25]:
# Simple end-to-end test
import json
import requests
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from pathlib import Path

def simple_end_to_end_test():
    """Run one fixed question through retrieval and generation, printing each step.

    Steps:
        1. Load the ``all-MiniLM-L6-v2`` embedder, ``chunks_metadata.json``
           and ``faiss_index.bin`` from ``../datasets/education/rag_index``.
        2. Embed the question and fetch the 3 nearest chunks from the index.
        3. Join the chunks' ``'text'`` fields into one context string.
        4. Post a non-streaming prompt for ``gemma3n:e4b`` to the local Ollama
           ``/api/generate`` endpoint and print the ``response`` field.

    Returns:
        The generated answer string; ``""`` when the model call fails
        (including a non-2xx HTTP status); ``None`` when the RAG files cannot
        be loaded.
    """

    question = "What is irrigation?"
    print(f"🔍 Question: {question}")

    # Step 1: Load RAG
    print("\n📚 Step 1: Loading RAG...")
    try:
        embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Load chunks (explicit encoding so the result does not depend on the
        # platform's default text encoding)
        with open("../datasets/education/rag_index/chunks_metadata.json", 'r', encoding='utf-8') as f:
            chunks = json.load(f)

        # Load FAISS index
        rag_index = faiss.read_index("../datasets/education/rag_index/faiss_index.bin")

        print(f"✅ RAG loaded: {len(chunks)} chunks, {rag_index.ntotal} vectors")
    except Exception as e:
        print(f"❌ RAG loading failed: {e}")
        return

    # Step 2: Search RAG
    print("\n🔍 Step 2: Searching RAG...")
    # Build ONE contiguous float32 matrix and normalise that same object.
    # normalize_L2 works in place, so normalising a temporary ``.astype(...)``
    # copy would leave the searched query un-normalised.
    query = np.ascontiguousarray(embedding_model.encode([question]), dtype='float32')
    faiss.normalize_L2(query)

    distances, indices = rag_index.search(query, 3)

    # FAISS pads missing neighbours with -1; without the lower bound that
    # would silently pick the LAST chunk.
    relevant_chunks = [
        chunks[int(idx)] for idx in indices[0] if 0 <= int(idx) < len(chunks)
    ]

    print(f"✅ Found {len(relevant_chunks)} relevant chunks")
    for i, chunk in enumerate(relevant_chunks):
        text = chunk.get('text', '')
        print(f"  {i+1}. {text[:80]}...")

    # Step 3: Combine context
    print("\n📝 Step 3: Building context...")
    context = "\n\n".join([chunk.get('text', '') for chunk in relevant_chunks])
    print(f"✅ Context length: {len(context)} characters")

    # Step 4: Call Gemma
    print("\n🤖 Step 4: Calling Gemma...")

    prompt = f"""Context: {context}

Question: {question}

Answer the question using the context provided."""

    print(f"Prompt length: {len(prompt)}")

    payload = {
        "model": "gemma3n:e4b",
        "prompt": prompt,
        "stream": False
    }

    try:
        response = requests.post("http://localhost:11434/api/generate", json=payload, timeout=60)
        # Surface HTTP errors through the except branch instead of reporting
        # an error body as a successful, empty answer.
        response.raise_for_status()
        response_json = response.json()
        final_answer = response_json.get('response', '')

        print(f"✅ Gemma response length: {len(final_answer)}")
        print("🎯 FINAL ANSWER:")
        print("=" * 50)
        print(final_answer)
        print("=" * 50)

        return final_answer

    except Exception as e:
        print(f"❌ Gemma call failed: {e}")
        return ""

# Run the test
# The bare call is the cell's last expression, so the notebook also echoes the
# returned answer string below the printed output.
simple_end_to_end_test()

🔍 Question: What is irrigation?

📚 Step 1: Loading RAG...
✅ RAG loaded: 2090 chunks, 2090 vectors

🔍 Step 2: Searching RAG...
✅ Found 3 relevant chunks
  1. CROP PRODUCTION   AND MANAGEMENT - 1.6 Irrigation: (ii) Drip system : In this sy...
  2. CROP PRODUCTION   AND MANAGEMENT - 1.6 Irrigation: efficient. The various tradit...
  3. CROP PRODUCTION   AND MANAGEMENT - 1.6 Irrigation: uneven land where sufficient ...

📝 Step 3: Building context...
✅ Context length: 1459 characters

🤖 Step 4: Calling Gemma...
Prompt length: 1548
✅ Gemma response length: 305
🎯 FINAL ANSWER:
The context describes irrigation as **modern methods that help us use water economically.** It also mentions traditional methods like moat, pulley-system, chain pump, dhekli, and rahat (lever system) for lifting water. The modern methods discussed are the **Sprinkler System** and the **Drip System**.






'The context describes irrigation as **modern methods that help us use water economically.** It also mentions traditional methods like moat, pulley-system, chain pump, dhekli, and rahat (lever system) for lifting water. The modern methods discussed are the **Sprinkler System** and the **Drip System**.\n\n\n\n'

## test image

In [30]:
# Test single image with Gemma multimodal
import requests
import base64
from pathlib import Path

def test_single_image():
    """Ask the local Ollama ``gemma3n:e4b`` model to describe one sample image.

    The PNG at a fixed relative path is base64-encoded and posted, together
    with a short description prompt, to ``http://localhost:11434/api/generate``
    (non-streaming, 60 s timeout). Progress, the HTTP status and the model's
    ``response`` text are printed. A missing file, an encoding failure or a
    request failure is printed and ends the function.

    Returns:
        None
    """
    image_path = "../datasets/education/images/pagenumber_21_image2.png"
    print(f"🖼️ Testing image: {image_path}")

    # Nothing to do without the sample file
    if not Path(image_path).exists():
        print("❌ Image not found!")
        return
    print("✅ Image found")

    # Base64-encode the raw file bytes (plain text, no prefix)
    try:
        with open(image_path, "rb") as image_file:
            raw_bytes = image_file.read()
        image_base64 = base64.b64encode(raw_bytes).decode('utf-8')
        print(f"✅ Image encoded: {len(image_base64)} characters")
    except Exception as e:
        print(f"❌ Encoding failed: {e}")
        return

    prompt = "What is shown in this image? Describe only what you can see."

    # Low temperature / top_p and a 500-token cap for the generation
    generation_options = {
        "num_predict": 500,
        "temperature": 0.1,
        "top_p": 0.5,
    }
    payload = {
        "model": "gemma3n:e4b",
        "prompt": prompt,
        "images": [image_base64],
        "stream": False,
        "options": generation_options,
    }

    print("🤖 Sending to Gemma multimodal...")
    print(f"Prompt: '{prompt}'")

    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json=payload,
            timeout=60,
        )
        print(f"Status: {response.status_code}")

        # Non-200: show status and body, then stop
        if response.status_code != 200:
            print(f"❌ Error: {response.status_code}")
            print(response.text)
            return

        answer = response.json().get('response', '')
        rule = "=" * 40
        print(f"✅ Response length: {len(answer)}")
        print("\n🎯 GEMMA SAYS:")
        print(rule)
        print(answer)
        print(rule)

    except Exception as e:
        print(f"❌ Request failed: {e}")

# Run the test
# test_single_image prints its own progress and returns None.
test_single_image()

❌ Error 400: {"error":"illegal base64 data at input byte 34"}


In [32]:
import requests
import base64
from pathlib import Path

def test_single_image_with_base64():
    """Post one base64-encoded sample image to the local Ollama multimodal model.

    The PNG at a fixed relative path is read, base64-encoded and sent with a
    short description prompt to ``http://localhost:11434/api/generate`` for
    ``gemma3n:e4b`` (non-streaming, 120 s timeout). Progress, the HTTP status
    and the model's ``response`` text are printed. A missing file, an encoding
    failure or a request failure is printed and ends the function.

    Returns:
        None
    """
    prompt = "What is shown in this image? Describe only what you can see."
    image_path = "../datasets/education/images/pagenumber_21_image2.png"

    print(f"🖼️ Testing image: {image_path}")

    # Nothing to do without the sample file
    if not Path(image_path).exists():
        print("❌ Image not found!")
        return
    print("✅ Image found")

    # Plain base64 text of the file bytes -- no prefix is added
    try:
        with open(image_path, "rb") as image_file:
            raw_bytes = image_file.read()
        image_base64 = base64.b64encode(raw_bytes).decode("utf-8")
        print(f"✅ Image encoded: {len(image_base64)} characters")
    except Exception as e:
        print(f"❌ Encoding failed: {e}")
        return

    # Low temperature / top_p and a 300-token cap for the generation
    generation_options = {
        "num_predict": 300,
        "temperature": 0.1,
        "top_p": 0.5,
    }
    payload = {
        "model": "gemma3n:e4b",
        "prompt": prompt,
        "images": [image_base64],
        "stream": False,
        "options": generation_options,
    }

    print("🤖 Sending to Ollama multimodal...")
    print(f"Prompt: '{prompt}'")

    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json=payload,
            timeout=120,
        )
        print(f"Status: {response.status_code}")

        # Non-200: report status and body on one line, then stop
        if response.status_code != 200:
            print(f"❌ Error {response.status_code}: {response.text}")
            return

        answer = response.json().get("response", "")
        rule = "=" * 40
        print(f"✅ Response length: {len(answer)}")
        print("\n🎯 GEMMA SAYS:")
        print(rule)
        print(answer)
        print(rule)

    except Exception as e:
        print(f"❌ Request failed: {e}")

# Run the test
# test_single_image_with_base64 prints its own progress and returns None.
test_single_image_with_base64()

🖼️ Testing image: ../datasets/education/images/pagenumber_21_image2.png
✅ Image found
✅ Image encoded: 696844 characters
🤖 Sending to Ollama multimodal...
Prompt: 'What is shown in this image? Describe only what you can see.'
Status: 200
✅ Response length: 315

🎯 GEMMA SAYS:
The image shows a close-up of a person's hands holding a small, round, brown object. The object appears to be a seed or a small nut. The hands are gently cupped, supporting the object. The background is blurred and out of focus, but appears to be a neutral, light-colored surface. 

The lighting is soft and even. 



In [38]:
import ollama
from pathlib import Path

# Same image check as the raw-HTTP cells, but through the `ollama` Python client.
# The path is relative to the notebook's working directory, like the other image
# cells, so the notebook does not depend on one developer's home directory.
image_path = "../datasets/education/images/pagenumber_33_image1.png"

# NOTE(review): the description recorded for this cell is the same text that was
# recorded for pagenumber_21_image2.png in the raw-HTTP test, although the two
# files differ -- verify that the model is actually receiving the image.
if not Path(image_path).exists():
    # Fail visibly here instead of handing a non-existent path to the client
    print("❌ Image not found!")
else:
    response = ollama.chat(
        model="gemma3n:e4b",  # must match the tag shown by `ollama list` exactly
        messages=[
            {
                "role": "user",
                "content": "What is shown in this image? Describe only what you can see.",
                "images": [image_path]
            }
        ],
        options={'temperature': 0},  # temperature 0 for a repeatable description
    )

    print(response["message"]["content"])

The image shows a close-up of a person's hands holding a small, round, brown object. The object appears to be a seed or a small nut. The hands are gently cupped, supporting the object. The background is blurred and out of focus, but appears to be a neutral, light-colored surface. 

The lighting is soft and even. The focus is sharply on the hands and the object they are holding. 

