In [1]:
# 1. Install system dependencies first
!sudo apt-get update
!sudo apt-get install -y libvips

# 2. Install PyTorch and its ecosystem for your specific CUDA version (cu121)
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121

# 3. Install flash-attn AFTER PyTorch
!pip install flash-attn

# 4. Install the remaining Python packages (CORRECTED LINE)
!pip install PyMuPDF weasyprint gradio transformers sentence-transformers faiss-cpu accelerate bitsandbytes Pillow pyvips

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]      
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease                         
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]           
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]        
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,702 kB]
Get:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,728 kB]
Get:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]     
Get:12 http://security.ubuntu.com/ubuntu jammy-sec

In [2]:
# Authenticate with the Hugging Face Hub through the interactive notebook widget.
# NOTE(review): presumably needed so later cells can download gated checkpoints — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
#!/usr/bin/env python3
"""
Enhanced PDF Processing Pipeline: RAG Chat & Summarization 
Application
======================================================================
Version 5: Dynamic Attention Mechanism & Robust Memory Management
Key changes:
- Automatically detects the GPU and its compute capability to select the
  attention mechanism ('sdpa' on any CUDA GPU, 'eager' on CPU). Flash
  Attention 2 is not currently selected, even on Ampere+ GPUs.
- Implemented VLM unloading to fix CUDA out-of-memory errors.
- Activated gradient checkpointing for memory-efficient model operation.
- Added more robust memory management with gc.collect() and torch.cuda.empty_cache().
"""

import os
import re
import time
import json
import gc
from datetime import datetime
from typing import List, Dict, Tuple, Optional, Any
from dataclasses import dataclass, field
import tempfile
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Core dependencies
import torch
import torch.nn as nn
import fitz  # PyMuPDF
import numpy as np
from PIL import Image
import io
import markdown
import gradio as gr

try:
    from weasyprint import HTML
    WEASYPRINT_AVAILABLE = True
except ImportError:
    WEASYPRINT_AVAILABLE = False

from tqdm import tqdm

# AI/ML dependencies
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, AutoProcessor,
    BitsAndBytesConfig, GenerationConfig
)
from sentence_transformers import SentenceTransformer

try:
    import faiss
    FAISS_AVAILABLE = True
    print("✅ FAISS library found. RAG will use FAISS for vector indexing.")
except ImportError:
    FAISS_AVAILABLE = False
    print("⚠️ FAISS library not found. RAG functionality will be disabled.")

try:
    import bitsandbytes
    BNB_AVAILABLE = True
    print("✅ bitsandbytes library found. Will attempt 4-bit model loading.")
except ImportError:
    BNB_AVAILABLE = False
    print("⚠️ bitsandbytes library not found.")

# Kaggle/T4 optimizations: process-wide switches applied once when this cell runs.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})
# The allocator hint is only exported when a CUDA device is visible.
if torch.cuda.is_available():
    os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})
# Allow TF32 for both cuDNN kernels and CUDA matmuls.
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

# ===========================================
# CONFIGURATION
# ===========================================

@dataclass
class PipelineConfig:
    """Configuration for the PDF processing and RAG pipeline.

    Every plain field can be overridden through the constructor.
    ``torch_dtype`` and ``attention_implementation`` are derived in
    ``__post_init__`` from ``precision`` and the detected hardware, and are not
    constructor arguments.
    """
    # RAG chunking
    rag_max_chars_per_chunk: int = 1500
    rag_overlap_chars: int = 200

    # Summarization chunking - adjusted for Gemma model
    summarization_max_chars_per_chunk: int = 20000
    summarization_overlap_chars: int = 2500

    # Summarization LLM settings - updated for Gemma
    map_summary_max_tokens: int = 800
    reduce_summary_max_tokens: int = 3000

    skip_vlm: bool = False
    use_cache: bool = True

    # GPU/Model optimizations
    precision: str = "auto"
    compile_model: bool = False  # Disabled for better compatibility

    # Diagnostic toggle
    save_diagnostics: bool = True

    # Force single GPU usage for better performance
    force_single_gpu: bool = True

    # Updated LLM choice to Gemma
    llm_models_priority: List[str] = field(default_factory=lambda: [
        "google/gemma-3-4b-it"
    ])

    embedding_model: str = "NovaSearch/stella_en_400M_v5"
    moondream_model: str = "vikhyatk/moondream2"
    moondream_revision: str = "2025-04-14"

    # RAG settings
    rag_top_k_chunks: int = 4

    torch_dtype: Any = field(init=False)
    attention_implementation: str = field(init=False)

    def __post_init__(self):
        """Derive ``torch_dtype`` and ``attention_implementation`` from the hardware."""
        cuda_available = torch.cuda.is_available()

        # Set torch dtype
        if self.precision == "auto":
            # Half precision only when a GPU is present; full precision on CPU,
            # matching the dtype choice the VLM loader makes.
            self.torch_dtype = torch.float16 if cuda_available else torch.float32
        elif self.precision == "fp16":
            self.torch_dtype = torch.float16
        elif self.precision == "bf16" and cuda_available and torch.cuda.is_bf16_supported():
            self.torch_dtype = torch.bfloat16
        else:
            # Any other value (including an unsupported "bf16") uses float32.
            self.torch_dtype = torch.float32

        # Auto-select attention mechanism based on GPU capability
        cuda_major = torch.cuda.get_device_capability()[0] if cuda_available else None
        self.attention_implementation = self._select_attention_implementation(cuda_major)

    @staticmethod
    def _select_attention_implementation(cuda_major: Optional[int]) -> str:
        """Pick the attention backend name for a GPU generation and log the choice.

        Args:
            cuda_major: Major compute capability of the CUDA device, or ``None``
                when no CUDA device is available.

        Returns:
            ``"sdpa"`` for any CUDA device, ``"eager"`` without CUDA.
        """
        if cuda_major is None:
            print("ℹ️ No CUDA GPU found. Using default 'eager' attention mechanism.")
            return "eager"
        if cuda_major >= 8:
            # The log line used to claim 'flash_attention_2' while 'sdpa' was
            # what actually got configured; report the real selection.
            # NOTE(review): switch the return value here if Flash Attention 2
            # is meant to be enabled on these GPUs.
            print("✅ GPU (Compute Capability >= 8.0) detected. Using 'sdpa' attention (Flash Attention 2 is not enabled).")
            return "sdpa"
        print("⚠️ GPU does not support Flash Attention 2. Falling back to 'sdpa' for optimization.")
        return "sdpa"


# ===========================================
# GPU UTILITIES
# ===========================================
def setup_gpu_environment():
    """Print an inventory of the visible CUDA devices.

    Returns:
        bool: ``False`` when CUDA is unavailable, otherwise ``True`` after one
        line per device (name, total memory in GB, compute capability) is printed.
    """
    if torch.cuda.is_available():
        device_total = torch.cuda.device_count()
        print(f"🔍 Detected {device_total} GPU(s)")
        for device_idx in range(device_total):
            name = torch.cuda.get_device_name(device_idx)
            memory_gb = torch.cuda.get_device_properties(device_idx).total_memory / 1e9
            cc_major, cc_minor = torch.cuda.get_device_capability(device_idx)
            print(f"  GPU {device_idx}: {name} ({memory_gb:.1f} GB), Compute Capability: {cc_major}.{cc_minor}")
        return True

    print("⚠️ CUDA not available! Running on CPU.")
    return False

def get_device_map(model_name: str, force_single_gpu: bool = False) -> str | Dict:
    gpu_count = torch.cuda.device_count()
    if gpu_count == 0:
        return "cpu"
    if force_single_gpu or gpu_count == 1:
        return {"": 0}  # Force everything to GPU 0
    return "auto"

# ===========================================
# PERFORMANCE MONITORING
# ===========================================
class PerformanceMonitor:
    """Keeps named wall-clock timers and free-form metrics, echoing each to stdout."""

    def __init__(self):
        self.timings = dict()   # timer name -> time.time() at start
        self.metrics = dict()   # metric name -> recorded value

    def start_timer(self, name: str):
        """Remember the current ``time.time()`` under ``name``."""
        self.timings[name] = time.time()

    def stop_timer(self, name: str) -> float:
        """Record and print the seconds elapsed since ``start_timer(name)``.

        The result is stored in ``metrics`` as ``"<name>_time_sec"``. A timer
        that was never started falls back to a fresh clock reading, so the
        reported duration is approximately zero.
        """
        finished_at = time.time()
        # The fallback clock reading is taken after `finished_at`, as before.
        started_at = self.timings.get(name, time.time())
        elapsed = finished_at - started_at
        self.metrics[f"{name}_time_sec"] = elapsed
        print(f"⏱️ {name} took: {elapsed:.2f}s")
        return elapsed

    def log_metric(self, name: str, value: Any):
        """Store ``value`` under ``name`` and print it."""
        self.metrics[name] = value
        print(f"📊 Metric {name}: {value}")

# ===========================================
# DIAGNOSTIC LOGGER
# ===========================================
class DiagnosticLogger:
    """Writes diagnostic artifacts into a timestamped directory when enabled.

    When ``config.save_diagnostics`` is false no directory is created and every
    ``log`` call is a no-op.
    """

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.log_dir = ""
        if not self.config.save_diagnostics:
            return
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        self.log_dir = f"diagnostics_{stamp}"
        os.makedirs(self.log_dir, exist_ok=True)
        print(f"🔬 Diagnostics enabled. Saving logs to: {self.log_dir}")

    def log(self, filename: str, content: Any):
        """Write ``content`` to ``filename`` inside the diagnostics directory.

        Strings are written verbatim; anything else is serialized as indented
        JSON. Failures are reported on stdout and never raised.
        """
        enabled = self.config.save_diagnostics and self.log_dir
        if not enabled:
            return
        filepath = os.path.join(self.log_dir, filename)
        try:
            with open(filepath, "w", encoding="utf-8") as handle:
                if not isinstance(content, str):
                    json.dump(content, handle, indent=4)
                else:
                    handle.write(content)
            print(f"📄 Diagnostic log saved: {filepath}")
        except Exception as e:
            print(f"⚠️ Failed to save diagnostic log {filename}: {e}")

    def log_performance(self, metrics: Dict[str, Any]):
        """Persist a metrics mapping as ``performance_metrics.json``."""
        self.log("performance_metrics.json", metrics)

# ===========================================
# VLM - MOONDREAM (With Unloading Capability)
# ===========================================
class MoondreamVLM:
    """Wrapper around the Moondream vision-language model used to caption images.

    The model is loaded during construction unless ``config.skip_vlm`` is set,
    and can be released again with ``unload_vlm``.
    """

    def __init__(self, config: PipelineConfig, perf_monitor: Optional[PerformanceMonitor] = None):
        self.config = config
        self.perf_monitor = perf_monitor or PerformanceMonitor()
        self.model = None
        self.processor = None
        if not self.config.skip_vlm:
            self._load_vlm()
        else:
            print("⏭️ VLM (Moondream) is skipped by config.")

    def _load_vlm(self):
        """Load tokenizer and model; on any failure both attributes are reset to ``None``."""
        self.perf_monitor.start_timer("load_vlm")
        model_id = self.config.moondream_model
        print(f"🌙 Loading VLM: {model_id}")
        try:
            target_device = "cuda:0" if torch.cuda.is_available() else "cpu"
            self.processor = AutoTokenizer.from_pretrained(
                model_id,
                revision=self.config.moondream_revision,
            )
            # Half precision on GPU, full precision on CPU.
            load_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
            loaded = AutoModelForCausalLM.from_pretrained(
                model_id,
                revision=self.config.moondream_revision,
                trust_remote_code=True,
                torch_dtype=load_dtype,
            )
            self.model = loaded.to(target_device)
            self.model.eval()
            print(f"✅ VLM {model_id} loaded on {target_device}.")
        except Exception as e:
            print(f"❌ Failed to load VLM {model_id}: {e}")
            self.model, self.processor = None, None
        self.perf_monitor.stop_timer("load_vlm")

    def caption_images_batch(self, images: List[Image.Image]) -> List[str]:
        """Caption each image in turn.

        Returns one string per input image. When the VLM is skipped or not
        loaded a fixed placeholder is returned for every image; a per-image
        failure yields an error placeholder instead of raising.
        """
        vlm_ready = self.model and self.processor and not self.config.skip_vlm
        if not vlm_ready:
            return ["Image (VLM skipped or not loaded)" for _ in images]

        self.perf_monitor.start_timer("vlm_captioning_batch")
        captions = []
        for picture in tqdm(images, desc="Captioning images with VLM"):
            try:
                rgb_picture = picture if picture.mode == "RGB" else picture.convert("RGB")
                embedded = self.model.encode_image(rgb_picture)
                answer = self.model.answer_question(
                    image_embeds=embedded,
                    question="Describe the content of this image.",
                    tokenizer=self.processor,
                )
                captions.append(answer or "Could not generate caption.")
            except Exception as e:
                print(f"⚠️ Error captioning image: {e}")
                captions.append("Error generating caption.")
        self.perf_monitor.stop_timer("vlm_captioning_batch")
        return captions

    def unload_vlm(self):
        """Explicitly unload the VLM to free up GPU memory."""
        self.perf_monitor.start_timer("unload_vlm")
        if self.model:
            print("🗑️ Unloading VLM to free up GPU memory...")
            # Drop the references, then leave the attributes present as None.
            del self.model
            del self.processor
            self.model = None
            self.processor = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            print("✅ VLM unloaded and GPU memory cleared.")
        self.perf_monitor.stop_timer("unload_vlm")

# ===========================================
# OPTIMIZED TEXT GENERATOR (LLM) - UPDATED FOR GEMMA
# ===========================================
class OptimizedTextGenerator:
    """Loads an instruction-tuned LLM and exposes single-prompt text generation.

    On construction the ids in ``config.llm_models_priority`` are tried in
    order and the first one that loads is kept. Ids containing "gemma" are
    loaded with an ``AutoProcessor`` and prompted through its chat template;
    any other id uses a plain ``AutoTokenizer``.

    Raises:
        RuntimeError: from the constructor when none of the configured models
            can be loaded.
    """

    # "Answer:" / "Summary:" labels (possibly repeated) at the very start of a reply.
    _LEADING_LABEL_RE = re.compile(r'^\s*(?:(?:Answer|Summary):\s*)+')

    def __init__(self, config: PipelineConfig, perf_monitor: Optional[PerformanceMonitor] = None):
        self.config = config
        self.perf_monitor = perf_monitor if perf_monitor else PerformanceMonitor()
        self.model, self.tokenizer, self.processor = None, None, None
        self.is_gemma = False
        self._load_llm()

    def _load_llm(self):
        """Try each configured model id in priority order and keep the first that loads.

        Uses 4-bit NF4 quantization when bitsandbytes was importable.

        Raises:
            RuntimeError: when every candidate fails to load.
        """
        self.perf_monitor.start_timer("load_llm")
        print("🚀 Loading LLM (Gemma-3-4b-it)...")

        quantization_config = None
        if BNB_AVAILABLE:
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16,
                bnb_4bit_use_double_quant=True
            )
            print("Using BitsAndBytes 4-bit quantization config.")

        for model_id in self.config.llm_models_priority:
            print(f"Attempting to load LLM: {model_id}")

            self.is_gemma = "gemma" in model_id.lower()

            try:
                if self.is_gemma:
                    self.processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
                    self.tokenizer = self.processor.tokenizer if hasattr(self.processor, 'tokenizer') else self.processor
                else:
                    self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=True)
                    if self.tokenizer.pad_token is None:
                        self.tokenizer.pad_token = self.tokenizer.eos_token

                model_kwargs = {
                    "trust_remote_code": True,
                    "low_cpu_mem_usage": True,
                    "device_map": get_device_map(model_id, self.config.force_single_gpu),
                    # Attention backend chosen by the pipeline configuration
                    "attn_implementation": self.config.attention_implementation,
                }

                if quantization_config:
                    model_kwargs["quantization_config"] = quantization_config
                    model_kwargs["torch_dtype"] = torch.bfloat16
                else:
                    model_kwargs["torch_dtype"] = self.config.torch_dtype

                if self.is_gemma:
                    try:
                        from transformers import Gemma3ForConditionalGeneration
                        model_class = Gemma3ForConditionalGeneration
                    except ImportError:
                        # Installed transformers has no Gemma 3 class; use the generic loader.
                        model_class = AutoModelForCausalLM

                    self.model = model_class.from_pretrained(model_id, **model_kwargs)
                    # NOTE(review): gradient checkpointing trades compute for memory
                    # during backpropagation; this class only generates under
                    # eval()/inference_mode, so the call probably has no effect
                    # here — confirm before relying on or removing it.
                    self.model.gradient_checkpointing_enable()
                else:
                    self.model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

                print(f"✅ LLM {model_id} loaded successfully.")
                self.model.eval()

                if hasattr(self.model, 'hf_device_map'):
                    print(f"📊 LLM distribution: {self.model.hf_device_map}")

                self.perf_monitor.stop_timer("load_llm")
                return

            except Exception as e:
                print(f"❌ Failed to load LLM {model_id}: {e}")
                # Drop anything partially loaded before trying the next candidate.
                self.model, self.tokenizer, self.processor = None, None, None
                # Collect garbage first so the cache flush can actually release
                # the memory held by the discarded objects.
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

        self.perf_monitor.stop_timer("load_llm")
        raise RuntimeError("Failed to load any specified LLM.")

    def generate_text(self, prompt: str, max_new_tokens: int = 512, system_prompt: Optional[str] = None) -> str:
        """Generate a sampled completion for ``prompt``.

        Args:
            prompt: User message.
            max_new_tokens: Upper bound on generated tokens.
            system_prompt: Optional instructions. Sent as a system message for
                Gemma; for other models it is prepended to the user message so
                it also works with chat templates that have no system role.

        Returns:
            The cleaned generated text, or ``"LLM not loaded."`` when no model
            is available.
        """
        # Check availability before starting the timer so the early return
        # does not leave a timer that is never stopped.
        if not self.model or not (self.tokenizer or self.processor):
            return "LLM not loaded."
        self.perf_monitor.start_timer("llm_generation")

        if self.is_gemma and self.processor:
            messages = []
            if system_prompt:
                messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
            messages.append({"role": "user", "content": [{"type": "text", "text": prompt}]})

            inputs = self.processor.apply_chat_template(
                messages,
                tokenize=True,
                return_dict=True,
                return_tensors="pt",
                add_generation_prompt=True
            ).to(self.model.device)
        else:
            # The system prompt used to be dropped on this path; fold it into
            # the user turn instead.
            user_content = f"{system_prompt}\n\n{prompt}" if system_prompt else prompt
            messages = [{"role": "user", "content": user_content}]

            if hasattr(self.tokenizer, 'apply_chat_template'):
                text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            else:
                text = user_content

            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=4096 - max_new_tokens
            ).to(self.model.device)

        input_token_count = inputs.input_ids.shape[1]

        # `early_stopping` is a beam-search option; with num_beams=1 it is not
        # applicable, so it is no longer passed.
        gen_kwargs = {
            "max_new_tokens": max_new_tokens,
            "num_beams": 1,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
            "repetition_penalty": 1.1,
        }

        with torch.inference_mode():
            outputs = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs.get("attention_mask"),
                **gen_kwargs
            )

        # Keep only the newly generated tokens (everything after the prompt).
        generated_ids = outputs[0][input_token_count:]

        if self.processor and hasattr(self.processor, 'decode'):
            generated_text = self.processor.decode(generated_ids, skip_special_tokens=True).strip()
        else:
            generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

        elapsed = self.perf_monitor.stop_timer("llm_generation")
        tokens_per_second = len(generated_ids) / elapsed if elapsed > 0 else 0

        self.perf_monitor.log_metric("llm_output_tokens", len(generated_ids))
        self.perf_monitor.log_metric("tokens_per_second", tokens_per_second)

        del inputs, outputs
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return self._clean_output(generated_text)

    def _clean_output(self, text: str) -> str:
        """Remove ``<|...|>`` control tokens and a leading "Answer:"/"Summary:" label.

        Only labels at the start of the reply are removed, so occurrences inside
        the body (e.g. "Executive Summary:") are left intact.
        """
        text = re.sub(r'<\|.*?\|>', '', text)
        text = self._LEADING_LABEL_RE.sub('', text)
        return text.strip()

# ===========================================
# PDF PROCESSOR & CHUNKING (Unchanged)
# ===========================================
@dataclass
class ExtractedContent:
    """One indexable piece of document content (a text chunk or an image caption)."""
    id: str                      # identifier assigned by the code that creates the item
    text: str                    # the content itself
    type: str                    # kind of content, e.g. "text_chunk" or "image_caption"
    page_num: Optional[int] = field(default=None)   # source page, when known
    metadata: Dict = field(default_factory=lambda: {})  # extra key/value details

class PDFProcessor:
    """Extracts text and images from a PDF and splits the result into chunks
    for retrieval (small chunks) and summarization (large chunks)."""

    def __init__(self, config: PipelineConfig, perf_monitor: Optional[PerformanceMonitor] = None):
        self.config = config
        self.perf_monitor = perf_monitor if perf_monitor else PerformanceMonitor()

    def extract_content_from_pdf(self, pdf_path: str) -> Tuple[List[Dict], List[Image.Image], List[int]]:
        """Read page text and embedded images from ``pdf_path``.

        Returns:
            ``(raw_text_by_page, pil_images, image_page_numbers)`` where
            ``raw_text_by_page`` holds ``{'page_num', 'text'}`` dicts for
            non-empty pages (1-based page numbers) and ``image_page_numbers[i]``
            is the page of ``pil_images[i]``. Images are skipped when
            ``config.skip_vlm`` is set. Three empty lists are returned when the
            file cannot be opened.
        """
        self.perf_monitor.start_timer("pdf_raw_extraction")
        print(f"📄 Extracting raw content from PDF: {pdf_path}")
        raw_text_by_page, pil_images, image_page_numbers = [], [], []
        try:
            doc = fitz.open(pdf_path)
        except Exception as e:
            print(f"❌ Error opening PDF {pdf_path}: {e}")
            # Close out the timer on the failure path as well.
            self.perf_monitor.stop_timer("pdf_raw_extraction")
            return [], [], []

        try:
            for page_num_idx in tqdm(range(len(doc)), desc="Extracting pages"):
                page = doc[page_num_idx]
                page_num_actual = page_num_idx + 1
                text = page.get_text("text")
                if text.strip():
                    raw_text_by_page.append({'page_num': page_num_actual, 'text': text})

                if self.config.skip_vlm:
                    continue
                for img_info in page.get_images(full=True):
                    xref = img_info[0]
                    try:
                        base_image = doc.extract_image(xref)
                        image_bytes = base_image["image"]
                        pil_image = Image.open(io.BytesIO(image_bytes))
                        # Downscale large images in place so the longest side is at most 1024 px.
                        if pil_image.size[0] > 1024 or pil_image.size[1] > 1024:
                            pil_image.thumbnail((1024, 1024), Image.Resampling.LANCZOS)
                        pil_images.append(pil_image)
                        image_page_numbers.append(page_num_actual)
                    except Exception as e_img:
                        print(f"⚠️ Could not extract image xref {xref} on page {page_num_actual}: {e_img}")
        finally:
            # Release the document even if page iteration raises.
            doc.close()

        self.perf_monitor.stop_timer("pdf_raw_extraction")
        self.perf_monitor.log_metric("num_raw_images_extracted", len(pil_images))
        print(f"✅ Raw content extraction complete. {len(raw_text_by_page)} text pages, {len(pil_images)} images.")
        return raw_text_by_page, pil_images, image_page_numbers

    def chunk_for_rag(self, raw_text_by_page: List[Dict], image_captions_with_pages: List[Dict]) -> List[ExtractedContent]:
        """Build retrieval chunks from page text plus one chunk per image caption.

        Text chunks take the lowest ``[Page N]`` marker they contain as their
        page number. A chunk without any marker (the middle of a long page)
        inherits the most recent page seen in earlier chunks instead of being
        left without a page.
        """
        self.perf_monitor.start_timer("rag_chunking")
        all_rag_content = []
        chunk_id_counter = 0

        full_doc_text_annotated = "".join(
            f"[Page {item['page_num']}]\n{item['text']}\n\n" for item in raw_text_by_page
        )
        text_segments = self._chunk_text_strategy(
            full_doc_text_annotated,
            self.config.rag_max_chars_per_chunk,
            self.config.rag_overlap_chars,
        )

        last_seen_page = None
        for segment in text_segments:
            pages_in_segment = [int(p) for p in re.findall(r"\[Page (\d+)\]", segment)]
            if pages_in_segment:
                first_page = min(pages_in_segment)
                # Page in effect at the end of this segment, for marker-less successors.
                last_seen_page = max(pages_in_segment)
            else:
                first_page = last_seen_page
            clean_segment = re.sub(r"\[Page \d+\]\n?", "", segment).strip()
            if not clean_segment:
                continue
            all_rag_content.append(ExtractedContent(
                id=f"text_chunk_{chunk_id_counter}",
                text=clean_segment,
                type="text_chunk",
                page_num=first_page,
                metadata={'source': 'text'}
            ))
            chunk_id_counter += 1

        for cap_info in image_captions_with_pages:
            caption_text = f"Image on page {cap_info['page_num']}: {cap_info['caption']}"
            all_rag_content.append(ExtractedContent(
                id=f"img_cap_{chunk_id_counter}",
                text=caption_text,
                type="image_caption",
                page_num=cap_info['page_num'],
                metadata={'source': 'image_caption'}
            ))
            chunk_id_counter += 1

        self.perf_monitor.stop_timer("rag_chunking")
        self.perf_monitor.log_metric("num_rag_chunks", len(all_rag_content))
        return all_rag_content

    def chunk_for_summarization(self, raw_text_by_page: List[Dict], image_captions_with_pages: List[Dict]) -> List[str]:
        """Merge page text and image captions page by page, then split into large chunks.

        Each page contributes a ``[Page N]`` header, its text (if any) and its
        captions as ``(Image: ...)`` entries, followed by a blank line.
        """
        self.perf_monitor.start_timer("summarization_chunking")
        page_texts_map = {item['page_num']: item['text'] for item in raw_text_by_page}
        page_captions_map = {}
        for cap_info in image_captions_with_pages:
            page_captions_map.setdefault(cap_info['page_num'], []).append(f"(Image: {cap_info['caption']})")

        parts = []
        for p_num in sorted(set(page_texts_map) | set(page_captions_map)):
            parts.append(f"[Page {p_num}]\n")
            if p_num in page_texts_map:
                parts.append(page_texts_map[p_num] + "\n")
            if p_num in page_captions_map:
                parts.append(" ".join(page_captions_map[p_num]) + "\n")
            parts.append("\n")
        full_text_for_summary = "".join(parts)

        summarization_chunks = self._chunk_text_strategy(
            full_text_for_summary,
            self.config.summarization_max_chars_per_chunk,
            self.config.summarization_overlap_chars
        )
        self.perf_monitor.stop_timer("summarization_chunking")
        self.perf_monitor.log_metric("num_summarization_chunks", len(summarization_chunks))
        return summarization_chunks

    def _chunk_text_strategy(self, text: str, max_chars: int, overlap: int) -> List[str]:
        """Split ``text`` into windows of ``max_chars`` characters that overlap by ``overlap``.

        The final window ends exactly at the end of the text. Empty text yields
        an empty list. A negative ``overlap`` is treated as no overlap.

        Raises:
            ValueError: if ``max_chars`` is not positive or ``overlap`` is not
                smaller than ``max_chars``; such settings previously made the
                loop run forever because the window never advanced.
        """
        if not text:
            return []
        if max_chars <= 0:
            raise ValueError(f"max_chars must be positive, got {max_chars}")
        if overlap >= max_chars:
            raise ValueError(
                f"overlap ({overlap}) must be smaller than max_chars ({max_chars})"
            )

        step = max_chars - max(overlap, 0)
        text_len = len(text)
        chunks = []
        start_idx = 0
        while True:
            end_idx = min(start_idx + max_chars, text_len)
            chunks.append(text[start_idx:end_idx])
            if end_idx == text_len:
                break
            start_idx += step
        return chunks

# ===========================================
# EMBEDDING AND VECTOR STORE (Unchanged)
# ===========================================
class RAGEmbeddingStore:
    """Embeds content with a SentenceTransformer model and serves similarity
    search over it through a FAISS inner-product index.

    Embeddings are L2-normalized at encode time, so the inner-product scores
    of the index are cosine similarities.
    """

    def __init__(self, config: PipelineConfig, perf_monitor: Optional[PerformanceMonitor] = None):
        self.config = config
        self.perf_monitor = perf_monitor if perf_monitor else PerformanceMonitor()
        self.embedding_model, self.index, self.content_map = None, None, []
        if not FAISS_AVAILABLE:
            print("⚠️ FAISS not available. RAG Store cannot be initialized.")
            return
        self._load_embedding_model()

    def _load_embedding_model(self):
        """Load the configured embedding model; leaves it as ``None`` on failure."""
        self.perf_monitor.start_timer("load_embedding_model")
        try:
            print(f"🌟 Loading embedding model: {self.config.embedding_model}")
            self.embedding_model = SentenceTransformer(
                self.config.embedding_model,
                device="cuda" if torch.cuda.is_available() else "cpu",
                trust_remote_code=True
            )
            print(f"✅ Embedding model '{self.config.embedding_model}' loaded.")
        except Exception as e:
            print(f"❌ Failed to load embedding model: {e}")
            self.embedding_model = None
        self.perf_monitor.stop_timer("load_embedding_model")

    def build_index(self, all_content: List[ExtractedContent]):
        """Embed ``all_content`` and replace the current index with a new one.

        Returns:
            ``True`` when the index was built, ``False`` when the model or
            FAISS is unavailable or there is nothing to index.
        """
        if self.embedding_model is None or not FAISS_AVAILABLE:
            print("⚠️ Cannot build index.")
            return False
        if not all_content:
            print("⚠️ No content to build index from.")
            return False

        self.perf_monitor.start_timer("build_faiss_index")
        content_texts = [content.text for content in all_content]
        print(f"⏳ Generating embeddings for {len(content_texts)} content pieces...")
        # Unit-length vectors make IndexFlatIP scores equal to cosine similarity.
        embeddings = self.embedding_model.encode(
            content_texts,
            show_progress_bar=True,
            batch_size=32,
            normalize_embeddings=True,
        )
        embeddings = np.array(embeddings).astype('float32')
        new_index = faiss.IndexFlatIP(embeddings.shape[1])
        new_index.add(embeddings)
        # Publish index and content map together, only after embedding succeeded,
        # so a failure above cannot leave them out of sync.
        self.index = new_index
        self.content_map = list(all_content)
        self.perf_monitor.stop_timer("build_faiss_index")
        self.perf_monitor.log_metric("faiss_index_size", self.index.ntotal)
        print(f"✅ FAISS index built with {self.index.ntotal} vectors.")
        return True

    def retrieve_relevant_content(self, query: str, top_k: int) -> List[ExtractedContent]:
        """Return up to ``top_k`` content items most similar to ``query``.

        Returns an empty list when no index has been built or ``top_k`` is not
        positive. ``top_k`` is capped at the number of indexed items.
        """
        if self.index is None or self.embedding_model is None or not self.content_map:
            return []
        k = min(int(top_k), self.index.ntotal)
        if k <= 0:
            return []

        self.perf_monitor.start_timer("retrieve_content")
        encode_kwargs = {"normalize_embeddings": True}
        # Use the retrieval query prompt only when the loaded model defines it,
        # so other embedding models do not fail on an unknown prompt name.
        available_prompts = getattr(self.embedding_model, "prompts", None) or {}
        if "s2p_query" in available_prompts:
            encode_kwargs["prompt_name"] = "s2p_query"
        query_embedding = self.embedding_model.encode([query], **encode_kwargs)
        query_embedding = np.array(query_embedding).astype('float32')
        distances, indices = self.index.search(query_embedding, k)
        # FAISS pads missing results with -1; keep only valid positions.
        retrieved = [self.content_map[idx] for idx in indices[0] if 0 <= idx < len(self.content_map)]
        self.perf_monitor.stop_timer("retrieve_content")
        return retrieved

# ===========================================
# GRADIO APPLICATION STATE & LOGIC
# ===========================================
# Shared configuration and diagnostics sink for the Gradio application.
APP_CONFIG = PipelineConfig(force_single_gpu=True, save_diagnostics=True)
DIAGNOSTIC_LOGGER = DiagnosticLogger(APP_CONFIG)

PDF_PROCESSOR_INSTANCE = PDFProcessor(APP_CONFIG)

# Model singletons; they stay None until initialize_all_models_for_gradio() creates them.
LLM_GENERATOR_INSTANCE = None
VLM_CAPTIONER_INSTANCE = None
EMBEDDING_STORE_INSTANCE = None

# Per-upload session data, replaced by process_uploaded_pdf_for_app().
SESSION_RAW_TEXT_BY_PAGE = []
SESSION_PIL_IMAGES = []
SESSION_IMAGE_PAGE_NUMBERS = []
SESSION_IMAGE_CAPTIONS_WITH_PAGES = []

def initialize_all_models_for_gradio():
    """Create the LLM, VLM and embedding-store singletons that are missing.

    The LLM and the embedding store are created once. The VLM wrapper is
    created when absent and re-created when it exists but holds no model,
    which is the state ``unload_vlm()`` leaves behind after a PDF has been
    captioned. Without this, every PDF after the first would get placeholder
    captions. A wrapper whose load failed is retried the same way.
    """
    global LLM_GENERATOR_INSTANCE, VLM_CAPTIONER_INSTANCE, EMBEDDING_STORE_INSTANCE
    if LLM_GENERATOR_INSTANCE is None:
        print("Initializing LLM...")
        LLM_GENERATOR_INSTANCE = OptimizedTextGenerator(APP_CONFIG)
    if not APP_CONFIG.skip_vlm:
        vlm_needs_load = (
            VLM_CAPTIONER_INSTANCE is None
            or getattr(VLM_CAPTIONER_INSTANCE, "model", None) is None
        )
        if vlm_needs_load:
            print("Initializing VLM...")
            VLM_CAPTIONER_INSTANCE = MoondreamVLM(APP_CONFIG)
    if EMBEDDING_STORE_INSTANCE is None and FAISS_AVAILABLE:
        print("Initializing Embedding Store...")
        EMBEDDING_STORE_INSTANCE = RAGEmbeddingStore(APP_CONFIG)

def _resolve_uploaded_pdf_path(pdf_file_obj):
    """Return the filesystem path for an uploaded file value.

    Accepts a plain path (``str`` / ``os.PathLike``) as well as an object that
    exposes its path through a ``.name`` attribute (e.g. a tempfile wrapper).
    """
    if isinstance(pdf_file_obj, (str, os.PathLike)):
        return os.fspath(pdf_file_obj)
    return pdf_file_obj.name


def process_uploaded_pdf_for_app(pdf_file_obj, progress=gr.Progress(track_tqdm=True)):
    """Extract, caption and index an uploaded PDF for the chat and summary tabs.

    Side effects: initialises the shared models if needed, replaces the
    module-level ``SESSION_*`` state with the content of this PDF, writes
    diagnostic logs, unloads the VLM after captioning and rebuilds the RAG
    index of ``EMBEDDING_STORE_INSTANCE``.

    Args:
        pdf_file_obj: Value delivered by the upload component: ``None``, a path,
            or an object whose ``.name`` is the path of the uploaded file.
        progress: Gradio progress tracker used to report pipeline stages.

    Returns:
        A 5-tuple ``(pdf_basename, status_text, rag_ready, summary_ready, chat_history)``
        where ``status_text`` is the newline-joined list of status messages,
        ``rag_ready`` tells whether the search index was built,
        ``summary_ready`` is true when any text was extracted, and
        ``chat_history`` is always an empty list (resets the chatbot).
    """
    global SESSION_RAW_TEXT_BY_PAGE, SESSION_PIL_IMAGES, SESSION_IMAGE_PAGE_NUMBERS, SESSION_IMAGE_CAPTIONS_WITH_PAGES
    if pdf_file_obj is None:
        return None, "Please upload a PDF file.", False, False, []

    initialize_all_models_for_gradio()
    pdf_path = _resolve_uploaded_pdf_path(pdf_file_obj)
    pdf_basename = os.path.basename(pdf_path)
    status_updates = [f"Processing '{pdf_basename}'..."]
    progress(0.1, desc=status_updates[-1])

    local_raw_text, local_pil_images, local_image_page_numbers = PDF_PROCESSOR_INSTANCE.extract_content_from_pdf(pdf_path)

    SESSION_RAW_TEXT_BY_PAGE = local_raw_text
    SESSION_PIL_IMAGES = local_pil_images
    SESSION_IMAGE_PAGE_NUMBERS = local_image_page_numbers
    # Drop captions of a previously processed PDF immediately so that an early
    # return below cannot leave stale captions next to the new page text.
    SESSION_IMAGE_CAPTIONS_WITH_PAGES = []

    status_updates.append(f"Extracted {len(local_raw_text)} text pages and {len(local_pil_images)} images.")
    progress(0.3, desc=status_updates[-1])

    if not local_raw_text and not local_pil_images:
        return pdf_basename, "\n".join(status_updates) + "\nNo content extracted.", False, False, []

    local_image_captions_with_pages = []
    if not APP_CONFIG.skip_vlm and VLM_CAPTIONER_INSTANCE and local_pil_images:
        status_updates.append("Generating image captions...")
        progress(0.5, desc=status_updates[-1])

        captions = list(VLM_CAPTIONER_INSTANCE.caption_images_batch(local_pil_images))
        if len(captions) != len(local_image_page_numbers):
            # Pairing by position would otherwise raise IndexError or silently
            # mislabel pages; make the mismatch visible instead.
            status_updates.append(
                f"Warning: received {len(captions)} captions for {len(local_image_page_numbers)} images; unmatched entries were ignored."
            )

        for i, (page_num, caption_text) in enumerate(zip(local_image_page_numbers, captions)):
            local_image_captions_with_pages.append({
                'page_num': page_num,
                'caption': caption_text,
                # NOTE(review): this is the image's position in the whole
                # document, not its position on the page as the key suggests.
                # Left unchanged because the consumers are not visible here.
                'image_index_on_page': i
            })

        status_updates.append(f"Generated {len(local_image_captions_with_pages)} image captions.")
        DIAGNOSTIC_LOGGER.log("1_image_captions.json", local_image_captions_with_pages)

        # OPTIMIZATION: Unload VLM to free up memory before summarization
        VLM_CAPTIONER_INSTANCE.unload_vlm()

    elif APP_CONFIG.skip_vlm:
        status_updates.append("Image captioning (VLM) is skipped by configuration.")

    SESSION_IMAGE_CAPTIONS_WITH_PAGES = local_image_captions_with_pages
    progress(0.7, desc=status_updates[-1])

    rag_ready = False
    if EMBEDDING_STORE_INSTANCE and EMBEDDING_STORE_INSTANCE.embedding_model:
        rag_content_pieces = PDF_PROCESSOR_INSTANCE.chunk_for_rag(local_raw_text, local_image_captions_with_pages)
        if rag_content_pieces:
            status_updates.append("Building RAG search index...")
            progress(0.8, desc=status_updates[-1])
            rag_ready = EMBEDDING_STORE_INSTANCE.build_index(rag_content_pieces)
            if rag_ready:
                status_updates.append("RAG index built. Ready for chat.")
            else:
                status_updates.append("Failed to build RAG index.")
        else:
            status_updates.append("No content to build RAG index from.")
    else:
        status_updates.append("RAG store/embedding model not available. Chat will be limited.")

    final_status = "\n".join(status_updates)
    progress(1.0, desc="Processing complete.")
    # Summarization only needs page text; captions alone are not enough.
    summary_ready = bool(local_raw_text)

    if LLM_GENERATOR_INSTANCE:
        DIAGNOSTIC_LOGGER.log_performance(LLM_GENERATOR_INSTANCE.perf_monitor.metrics)

    return pdf_basename, final_status, rag_ready, summary_ready, []

def handle_rag_chat_response(message: str, chat_history: List[Tuple[str,str]], processed_pdf_name: Optional[str], rag_ready_state: bool):
    """Answer a chat question from the processed PDF using retrieval + the LLM.

    Args:
        message: The user's question. Blank input is ignored.
        chat_history: List of ``(user, assistant)`` tuples shown in the chatbot;
            ``None`` is treated as an empty history.
        processed_pdf_name: Name of the processed PDF, or ``None`` if none yet.
        rag_ready_state: Whether the search index was built for that PDF.

    Returns:
        ``("", chat_history)`` — an empty string to clear the input box and the
        history with the new exchange (or an error message) appended.
    """
    if chat_history is None:
        # The chatbot value may arrive as None; appending to it would crash.
        chat_history = []
    if not message or not message.strip():
        # Nothing to answer: skip retrieval and a costly LLM call.
        return "", chat_history
    if not rag_ready_state or processed_pdf_name is None:
        chat_history.append((message, "Error: PDF not processed for RAG or RAG not ready."))
        return "", chat_history
    if LLM_GENERATOR_INSTANCE is None or EMBEDDING_STORE_INSTANCE is None:
        chat_history.append((message, "Error: LLM or Embedding store not initialized."))
        return "", chat_history

    retrieved_content = EMBEDDING_STORE_INSTANCE.retrieve_relevant_content(message, top_k=APP_CONFIG.rag_top_k_chunks)
    if not retrieved_content:
        response_text = "I couldn't find relevant information in the document to answer your question."
    else:
        context_parts = [content_item.text for content_item in retrieved_content]
        # Falsy page numbers (None, 0) are left out of the source list.
        # NOTE(review): if pages can be 0-based, page 0 is dropped here — confirm.
        source_pages = {content_item.page_num for content_item in retrieved_content if content_item.page_num}
        context_str_for_prompt = "\n\n---\n\n".join(context_parts)
        prompt = f"""Based on the following context from the document '{processed_pdf_name}' ONLY, answer the question. If the answer is not found in the context, clearly state that.

Context:
{context_str_for_prompt}

Question: {message}

Answer:"""

        # Take one timestamp so the log entry and its file name always agree.
        now = datetime.now()
        rag_interaction_data = {
            "timestamp": now.isoformat(),
            "question": message,
            "retrieved_context": context_str_for_prompt
        }
        DIAGNOSTIC_LOGGER.log(f"rag_interaction_{now.strftime('%H%M%S')}.json", rag_interaction_data)

        response_text = LLM_GENERATOR_INSTANCE.generate_text(prompt, max_new_tokens=512)
        if source_pages:
            response_text += f"\n\n(Sources: Approx. Page(s) {', '.join(map(str, sorted(source_pages)))})"

    chat_history.append((message, response_text))
    return "", chat_history

def handle_summarize_document(processed_pdf_name: Optional[str], summary_ready_state: bool, progress=gr.Progress(track_tqdm=True)):
    """Summarize the processed PDF with a map-reduce pass over the shared LLM.

    The session text (plus image captions) is split into large chunks, each
    chunk is summarized on its own ("map"), and the chunk summaries are then
    merged into one Markdown document ("reduce"). When the combined chunk
    summaries are too long for the reduce prompt, they are returned as
    numbered sections instead.

    Args:
        processed_pdf_name: Name of the processed PDF, or ``None`` if none yet.
        summary_ready_state: Whether text was extracted for the current PDF.
        progress: Gradio progress tracker used to report pipeline stages.

    Returns:
        The final summary as Markdown text, or a plain error/status message
        when summarization cannot be performed.
    """
    if not summary_ready_state or processed_pdf_name is None:
        return "Error: PDF not processed or no text content available for summarization."
    if LLM_GENERATOR_INSTANCE is None:
        return "Error: LLM not initialized for summarization."
    if not SESSION_RAW_TEXT_BY_PAGE:
        return "No text content was extracted from the PDF to summarize."

    # Above this many characters the combined summaries are not sent through
    # the reduce prompt; the per-section fallback is used instead.
    max_reduce_input_chars = 40000

    progress(0.1, desc=f"Starting summarization for '{processed_pdf_name}'...")

    summarization_chunks = PDF_PROCESSOR_INSTANCE.chunk_for_summarization(SESSION_RAW_TEXT_BY_PAGE, SESSION_IMAGE_CAPTIONS_WITH_PAGES)
    DIAGNOSTIC_LOGGER.log("2a_summarization_chunks.json", summarization_chunks)

    if not summarization_chunks:
        return "Could not prepare any chunks for summarization."

    total_chunks = len(summarization_chunks)
    progress(0.2, desc=f"Created {total_chunks} large chunks for map-reduce summarization.")

    # Word targets are derived from the token budgets; clamp them so a small
    # budget can never produce a zero or negative target in the prompt.
    map_target_words = max(1, (APP_CONFIG.map_summary_max_tokens // 4) - 30)
    reduce_target_words = max(1, (APP_CONFIG.reduce_summary_max_tokens // 4) - 50)
    system_prompt = "You are a helpful assistant specialized in summarizing document segments."

    chunk_summaries_with_source = []

    for i, chunk_text in enumerate(summarization_chunks):
        map_progress = 0.2 + (0.6 * (i + 1) / total_chunks)
        progress(map_progress, desc=f"Summarizing chunk {i+1}/{total_chunks}...")

        prompt = f"""Please generate a comprehensive and detailed summary of the following text segment. This summary should capture ALL the key points, main arguments and any significant conclusions presented *within this segment only*. Maintain a neutral, objective, and informative tone. The target length for this segment's summary is approximately {map_target_words} words. Do not add any introductory or concluding phrases that are not part of the summary content itself (e.g., avoid 'Here is the summary:').

TEXT SEGMENT TO SUMMARIZE:
-------------------------
{chunk_text[:APP_CONFIG.summarization_max_chars_per_chunk]}
-------------------------
Begin the summary directly."""

        summary = LLM_GENERATOR_INSTANCE.generate_text(
            prompt,
            max_new_tokens=APP_CONFIG.map_summary_max_tokens + 200,
            system_prompt=system_prompt
        )

        chunk_summaries_with_source.append({
            "chunk_index": i,
            "chunk_text": chunk_text,
            "summary": summary
        })

    # Log every attempt (including failed ones) for diagnostics.
    DIAGNOSTIC_LOGGER.log("2b_chunk_summaries.json", chunk_summaries_with_source)

    # Keep only usable summaries: a missing or blank result would otherwise
    # break the join below or add empty sections to the output.
    usable_summaries = [
        (item["chunk_index"], item["summary"])
        for item in chunk_summaries_with_source
        if isinstance(item["summary"], str) and item["summary"].strip()
    ]
    chunk_summaries = [summary for _, summary in usable_summaries]

    if not chunk_summaries:
        return "Failed to generate summaries for any chunk."

    progress(0.9, desc="Combining chunk summaries into a final document summary...")

    combined_chunk_summaries_text = "\n\n".join(chunk_summaries)
    DIAGNOSTIC_LOGGER.log("2c_combined_chunk_summaries.txt", combined_chunk_summaries_text)

    if len(combined_chunk_summaries_text) > max_reduce_input_chars:
        # Too much text to merge in one prompt: present the chunk summaries as
        # sections, numbered after the chunk they came from.
        final_summary = (
            f"# Summary of {processed_pdf_name}\n\n"
            "The document is extensive. Here are the key points from each section:\n\n"
            + "".join(
                f"## Section {chunk_index + 1}\n\n{summary}\n\n"
                for chunk_index, summary in usable_summaries
            )
        )
    else:
        system_prompt_overall = (
            "You are an expert AI assistant tasked with synthesizing multiple summaries of document segments into a single, comprehensive, and coherent final summary. "
            "Your primary objective is to integrate the information from the provided segment summaries, eliminate redundancy, and produce a well-structured final document that accurately reflects the core content of the original source, based *only* on the summaries provided."
        )

        user_prompt_overall = f"""You have been provided with a collection of sequential summaries, where each summary covers a distinct segment of a larger document. Your task is to synthesize these individual segment summaries into one cohesive and comprehensive final summary of the original document. The final summary must be based entirely on the information present in the segment summaries provided below.

**Input: Collection of Segment Summaries:**
---------------------------------------
{combined_chunk_summaries_text}
---------------------------------------

**Output Requirements for the Final Consolidated Summary:**
1.  **Content Focus:** Integrate ALL the information, key themes, arguments, and conclusions from the provided segment summaries. Identify and remove any redundancies or overlapping points.
2.  **Structure & Formatting:** The output MUST be in well-structured Markdown format and include the following distinct sections:
    a.  **Main Title:** A suitable H1 or H2 Markdown heading for the overall summary (e.g., `# Comprehensive Summary of [Document Topic]`).
    b.  **Introductory Paragraph:** A brief introduction that outlines the main topic and overall scope of the original document, as can be inferred from the collective summaries.
    c.  **Key Terms and Descriptions Section:** A dedicated section with a subheading (e.g., `## Key Terms and Concepts`) that lists significant key terms or concepts encountered across all the segment summaries. Each term should be followed by a brief, clear description based on its context in the summaries.
    d.  **Main Body:** The core of the summary, presenting a logical and flowing narrative of the document's content by weaving together the information from the segment summaries. Ensure smooth transitions between topics derived from different segments.
    e.  **Concluding Paragraph:** A final paragraph that wraps up the main points and provides a sense of closure, reflecting the overall essence captured in the segment summaries.
3.  **No Extraneous Text:** The output MUST ONLY be the summary itself, structured as described above. Do NOT include any conversational text, meta-comments, or remarks outside of the summary content (e.g., avoid phrases like 'Here is the consolidated summary:' or 'I have followed your instructions.').
4.  **Target Length:** Aim for a total output length of approximately {reduce_target_words} words for this final summary or about 5-6 pages.

Begin the final consolidated summary directly with the Main Title."""

        final_summary = LLM_GENERATOR_INSTANCE.generate_text(
            user_prompt_overall,
            max_new_tokens=APP_CONFIG.reduce_summary_max_tokens + 200,
            system_prompt=system_prompt_overall
        )

    DIAGNOSTIC_LOGGER.log("2d_final_summary.txt", final_summary)

    progress(1.0, desc="Summarization complete.")

    if LLM_GENERATOR_INSTANCE:
        DIAGNOSTIC_LOGGER.log_performance(LLM_GENERATOR_INSTANCE.perf_monitor.metrics)

    return final_summary

def create_gradio_interface():
    """Assemble the Gradio Blocks UI and connect it to the app callbacks.

    Layout: a left column for uploading/processing a PDF plus an "about"
    panel, and a right column with two tabs (RAG chat and summarization).
    Three ``gr.State`` values carry the processed file name and the two
    readiness flags between callbacks.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    print("Creating Gradio interface...")
    setup_gpu_environment()

    with gr.Blocks(theme=gr.themes.Glass()) as demo:
        gr.Markdown("# 📄 PDF Assistant: Chat (RAG) & Summarize")
        gr.Markdown("Upload a PDF to chat about its content or get a comprehensive summary.")

        # Per-session values handed from the processing step to the handlers.
        processed_pdf_name_state = gr.State(None)
        rag_ready_state = gr.State(False)
        summary_ready_state = gr.State(False)

        with gr.Row():
            # ---- Left column: upload, status and configuration overview ----
            with gr.Column(scale=1):
                gr.Markdown("### 1. Upload & Process PDF")
                pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
                process_button = gr.Button("Process PDF for Chat & Summary", variant="primary")
                status_display = gr.Markdown("Status: Waiting for PDF...")
                gr.Markdown("---")
                gr.Markdown("### About This Assistant")

                vlm_label = 'Disabled' if APP_CONFIG.skip_vlm else APP_CONFIG.moondream_model
                diagnostics_label = 'Enabled' if APP_CONFIG.save_diagnostics else 'Disabled'
                gpu_label = 'Single GPU mode' if APP_CONFIG.force_single_gpu else 'Multi-GPU mode'
                about_lines = [
                    f"- **LLMs Used**: {', '.join(APP_CONFIG.llm_models_priority)} (4-bit quantized)",
                    f"- **Embeddings**: {APP_CONFIG.embedding_model}",
                    "- **Vector Store**: FAISS (for RAG)",
                    f"- **Image Understanding**: {vlm_label}",
                    f"- **Diagnostics**: {diagnostics_label}",
                    f"- **GPU Usage**: {gpu_label}",
                ]
                gr.Markdown("\n".join(about_lines))

                if not FAISS_AVAILABLE:
                    gr.Markdown("⚠️ **FAISS library not found. RAG chat functionality is disabled.**")
                if not BNB_AVAILABLE:
                    gr.Markdown("⚠️ **BitsAndBytes not found. 4-bit models might fail to load.**")

            # ---- Right column: chat and summary tabs ----
            with gr.Column(scale=2):
                with gr.Tabs():
                    with gr.TabItem("💬 Chat with PDF (RAG)"):
                        chatbot = gr.Chatbot(label="Chat History", height=550)
                        msg_textbox = gr.Textbox(label="Your Question:", placeholder="Ask about text or images in the PDF...", lines=2)
                        with gr.Row():
                            submit_chat_button = gr.Button("Send Question", variant="primary", elem_id="send_button_rag")
                            clear_chat_button = gr.Button("Clear Chat")

                    with gr.TabItem("📜 Summarize Document"):
                        summarize_button = gr.Button("Generate Full Document Summary", variant="primary")
                        summary_output_display = gr.Markdown(label="Document Summary", value="Summary will appear here...")

        # ---- Event wiring ----
        process_button.click(
            fn=process_uploaded_pdf_for_app,
            inputs=[pdf_upload],
            outputs=[processed_pdf_name_state, status_display, rag_ready_state, summary_ready_state, chatbot]
        )

        # Pressing Enter in the textbox and clicking "Send" run the same handler.
        chat_event = dict(
            fn=handle_rag_chat_response,
            inputs=[msg_textbox, chatbot, processed_pdf_name_state, rag_ready_state],
            outputs=[msg_textbox, chatbot],
        )
        msg_textbox.submit(**chat_event)
        submit_chat_button.click(**chat_event)

        clear_chat_button.click(lambda: (None, []), outputs=[msg_textbox, chatbot])
        summarize_button.click(
            fn=handle_summarize_document,
            inputs=[processed_pdf_name_state, summary_ready_state],
            outputs=[summary_output_display]
        )

    return demo

# ===========================================
# MAIN EXECUTION
# ===========================================
if __name__ == "__main__":
    # Build the UI first so configuration problems surface before launching.
    gradio_app = create_gradio_interface()
    print("Launching Gradio app...")
    # Binds to all interfaces on port 7980 and also requests a public share link.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7980, "share": True}
    gradio_app.launch(**launch_options)

✅ FAISS library found. RAG will use FAISS for vector indexing.
✅ bitsandbytes library found. Will attempt 4-bit model loading.
⚠️ GPU does not support Flash Attention 2. Falling back to 'sdpa' for optimization.
🔬 Diagnostics enabled. Saving logs to: diagnostics_20250528_095423
Creating Gradio interface...
🔍 Detected 2 GPU(s)
  GPU 0: Tesla T4 (15.8 GB), Compute Capability: 7.5
  GPU 1: Tesla T4 (15.8 GB), Compute Capability: 7.5
Launching Gradio app...
* Running on local URL:  http://0.0.0.0:8796
* Running on public URL: https://f920a4199275121600.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Initializing LLM...
🚀 Loading LLM (Gemma-3-4b-it)...
Using BitsAndBytes 4-bit quantization config.
Attempting to load LLM: google/gemma-3-4b-it


processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?steps/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

✅ LLM google/gemma-3-4b-it loaded successfully.
📊 LLM distribution: {'': 0}
⏱️ load_llm took: 45.71s
Initializing VLM...
🌙 Loading VLM: vikhyatk/moondream2


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/276 [00:00<?, ?B/s]

hf_moondream.py:   0%|          | 0.00/3.96k [00:00<?, ?B/s]

config.py:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

region.py:   0%|          | 0.00/3.08k [00:00<?, ?B/s]

layers.py:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

image_crops.py:   0%|          | 0.00/7.53k [00:00<?, ?B/s]

moondream.py:   0%|          | 0.00/26.4k [00:00<?, ?B/s]

utils.py:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

vision.py:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

text.py:   0%|          | 0.00/6.20k [00:00<?, ?B/s]

rope.py:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

✅ VLM vikhyatk/moondream2 loaded on cuda:0.
⏱️ load_vlm took: 23.43s
Initializing Embedding Store...
🌟 Loading embedding model: NovaSearch/stella_en_400M_v5


modules.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/170k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/892 [00:00<?, ?B/s]

configuration.py:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/NovaSearch/stella_en_400M_v5:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py:   0%|          | 0.00/57.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/NovaSearch/stella_en_400M_v5:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

Some weights of the model checkpoint at NovaSearch/stella_en_400M_v5 were not used when initializing NewModel: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/186 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.20M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.20M [00:00<?, ?B/s]

✅ Embedding model 'NovaSearch/stella_en_400M_v5' loaded.
⏱️ load_embedding_model took: 14.14s
📄 Extracting raw content from PDF: /tmp/gradio/13d4aa47db180586efbbf79f3808ad8c92cb2618c220d44a6bca7323a30ad4ed/170609_student.pdf
⏱️ pdf_raw_extraction took: 0.65s
📊 Metric num_raw_images_extracted: 2
✅ Raw content extraction complete. 67 text pages, 2 images.
⏱️ vlm_captioning_batch took: 6.72s
📄 Diagnostic log saved: diagnostics_20250528_095423/1_image_captions.json
🗑️ Unloading VLM to free up GPU memory...
✅ VLM unloaded and GPU memory cleared.
⏱️ unload_vlm took: 0.60s
⏱️ rag_chunking took: 0.00s
📊 Metric num_rag_chunks: 141
⏳ Generating embeddings for 141 content pieces...


Batches:   0%|          | 0/5 [00:00<?, ?steps/s]

⏱️ build_faiss_index took: 10.06s
📊 Metric faiss_index_size: 141
✅ FAISS index built with 141 vectors.
📄 Diagnostic log saved: diagnostics_20250528_095423/performance_metrics.json


Batches:   0%|          | 0/1 [00:00<?, ?steps/s]

⏱️ retrieve_content took: 0.13s
📄 Diagnostic log saved: diagnostics_20250528_095423/rag_interaction_100604.json
⏱️ llm_generation took: 19.72s
📊 Metric llm_output_tokens: 129
📊 Metric tokens_per_second: 6.541860862056994
⏱️ summarization_chunking took: 0.00s
📊 Metric num_summarization_chunks: 11
📄 Diagnostic log saved: diagnostics_20250528_095423/2a_summarization_chunks.json
⏱️ llm_generation took: 59.61s
📊 Metric llm_output_tokens: 344
📊 Metric tokens_per_second: 5.770483482447557
⏱️ llm_generation took: 57.77s
📊 Metric llm_output_tokens: 334
📊 Metric tokens_per_second: 5.781604927156025
⏱️ llm_generation took: 52.94s
📊 Metric llm_output_tokens: 299
📊 Metric tokens_per_second: 5.647557783449537
⏱️ llm_generation took: 59.54s
📊 Metric llm_output_tokens: 355
📊 Metric tokens_per_second: 5.96267139360413
⏱️ llm_generation took: 54.75s
📊 Metric llm_output_tokens: 307
📊 Metric tokens_per_second: 5.607529443004869
⏱️ llm_generation took: 54.84s
📊 Metric llm_output_tokens: 315
📊 Metric tokens