In [1]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import warnings
warnings.filterwarnings('ignore')

print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory // 1e9:.1f} GB")

PyTorch version: 2.9.0+cu128
Transformers version: 4.57.1
CUDA available: True
GPU: NVIDIA GeForce RTX 5050 Laptop GPU
GPU Memory: 8.0 GB


In [2]:
with open('../Dataset/StudentHandbookDataset.txt', 'r', encoding='utf-8') as f:
    dataset = f.read()

print(f"Dataset loaded: {len(dataset):,} characters")
print(f"Estimated pages: ~{len(dataset) // 2000}")

# Import retrieval dependencies
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings
import faiss
import numpy as np
import pickle
import os
import json
import time

Dataset loaded: 171,284 characters
Estimated pages: ~85


In [5]:
# RAG Configuration - Using your optimal settings from retrieval experiments
RAG_CONFIG = {
    'chunking': {
        'strategy': 'semantic',
        'percentile_threshold': 50,
        'buffer_size': 1
    },
    'embedding': {
        'model_name': 'sentence-transformers/multi-qa-mpnet-base-dot-v1',
        'device': 'cuda' if torch.cuda.is_available() else 'cpu'
    },
    'generation': {
        'model_name': 'mistralai/Mistral-7B-Instruct-v0.1',
        'max_length': 2048,
        'temperature': 0.3,
        'top_p': 0.9,
        'do_sample': True
    },
    'retrieval': {
        'top_k': 5,
        'score_threshold': 0.3
    }
}

print("RAG Configuration:")
for section, config in RAG_CONFIG.items():
    print(f"\n{section.upper()}:")
    for key, value in config.items():
        print(f"  {key}: {value}")

RAG Configuration:

CHUNKING:
  strategy: semantic
  percentile_threshold: 50
  buffer_size: 1

EMBEDDING:
  model_name: sentence-transformers/multi-qa-mpnet-base-dot-v1
  device: cuda

GENERATION:
  model_name: mistralai/Mistral-7B-Instruct-v0.1
  max_length: 2048
  temperature: 0.3
  top_p: 0.9
  do_sample: True

RETRIEVAL:
  top_k: 5
  score_threshold: 0.3


In [3]:
# def load_or_create_chunks():
#     """Load existing chunks or create new ones with optimal config"""
#     chunk_filename = f"../Retriever/saved_chunks/chunks_p{RAG_CONFIG['chunking']['percentile_threshold']}_b{RAG_CONFIG['chunking']['buffer_size']}.pkl"
    
#     if os.path.exists(chunk_filename):
#         print(f"Loading existing chunks from {chunk_filename}")
#         with open(chunk_filename, 'rb') as f:
#             chunks = pickle.load(f)
#         print(f"Loaded {len(chunks)} chunks")
#         return chunks
    
#     print("Creating new chunks with optimal configuration...")
#     embedding_model = HuggingFaceEmbeddings(
#         model_name="sentence-transformers/all-MiniLM-L6-v2",  # Lightweight for chunking
#         model_kwargs={'device': RAG_CONFIG['embedding']['device']},
#         encode_kwargs={'normalize_embeddings': True}
#     )
    
#     text_splitter = SemanticChunker(
#         embeddings=embedding_model,
#         breakpoint_threshold_type="percentile",
#         breakpoint_threshold_amount=RAG_CONFIG['chunking']['percentile_threshold'],
#         buffer_size=RAG_CONFIG['chunking']['buffer_size'],
#         add_start_index=True
#     )
    
#     chunks = text_splitter.create_documents([dataset])
    
#     # Save chunks
#     os.makedirs("../Retriever/saved_chunks", exist_ok=True)
#     with open(chunk_filename, 'wb') as f:
#         pickle.dump(chunks, f)
    
#     print(f"Created and saved {len(chunks)} chunks")
#     return chunks

# chunks = load_or_create_chunks()

# # Analyze chunks
# chunk_sizes = [len(chunk.page_content) for chunk in chunks]
# print(f"\nChunk Analysis:")
# print(f"  Total chunks: {len(chunks)}")
# print(f"  Average size: {np.mean(chunk_sizes):.0f} characters")
# print(f"  Size range: {min(chunk_sizes)} - {max(chunk_sizes)} characters")

import json

def load_or_create_chunks():
    """Load custom chunks from Chunks.json"""
    chunks_file = "../Dataset/Chunks.json"
    
    print(f"Loading custom chunks from {chunks_file}")
    
    try:
        with open(chunks_file, 'r', encoding='utf-8') as f:
            chunks_data = json.load(f)
        
        # Convert JSON chunks to LangChain Document format
        from langchain.schema import Document
        
        chunks = []
        for chunk in chunks_data:
            doc = Document(
                page_content=chunk['content'],
                metadata={
                    'chunk_id': chunk['chunk_id'],
                    'source_document': chunk['metadata']['source_document'],
                    'section_hierarchy': chunk['metadata']['section_hierarchy']
                }
            )
            chunks.append(doc)
        
        print(f"Loaded {len(chunks)} custom chunks")
        return chunks
        
    except FileNotFoundError:
        print(f"Error: {chunks_file} not found!")
        return None
    except Exception as e:
        print(f"Error loading chunks: {e}")
        return None

# Load chunks
chunks = load_or_create_chunks()

if chunks:
    # Analyze chunks
    chunk_sizes = [len(chunk.page_content) for chunk in chunks]
    print(f"\nChunk Analysis:")
    print(f"  Total chunks: {len(chunks)}")
    print(f"  Average size: {np.mean(chunk_sizes):.0f} characters")
    print(f"  Size range: {min(chunk_sizes)} - {max(chunk_sizes)} characters")
else:
    print("Failed to load chunks!")

Loading custom chunks from ../Dataset/Chunks.json
Loaded 125 custom chunks

Chunk Analysis:
  Total chunks: 125
  Average size: 1336 characters
  Size range: 213 - 3785 characters


In [6]:
class OptimalRetriever:
    """Retrieval system using your best-performing configuration"""
    
    def __init__(self, chunks, config):
        self.chunks = chunks
        self.config = config
        self.embedding_model = None
        self.index = None
        self.setup_embeddings()
    
    def setup_embeddings(self):
        """Initialize embedding model and FAISS index"""
        print("Setting up embedding model...")
        self.embedding_model = HuggingFaceEmbeddings(
            model_name=self.config['embedding']['model_name'],
            model_kwargs={
                'device': self.config['embedding']['device'],
                'trust_remote_code': True   
            },
            encode_kwargs={'normalize_embeddings': True}
        )
        
        print("Generating embeddings for all chunks...")
        chunk_texts = [chunk.page_content for chunk in self.chunks]
        
        # Batch processing to avoid memory issues
        batch_size = 32
        all_embeddings = []
        
        for i in range(0, len(chunk_texts), batch_size):
            batch = chunk_texts[i:i+batch_size]
            batch_embeddings = self.embedding_model.embed_documents(batch)
            all_embeddings.extend(batch_embeddings)
            if (i // batch_size + 1) % 10 == 0:
                print(f"  Processed {i + len(batch)}/{len(chunk_texts)} chunks")
        
        # Build FAISS index
        print("Building FAISS index...")
        dimension = len(all_embeddings[0])
        self.index = faiss.IndexFlatIP(dimension)
        
        embeddings_array = np.array(all_embeddings).astype('float32')
        faiss.normalize_L2(embeddings_array)
        self.index.add(embeddings_array)
        
        print(f"Retrieval system ready: {self.index.ntotal:,} vectors ({dimension}D)")
    
    def retrieve(self, query, top_k=None, score_threshold=None):
        """Retrieve relevant chunks for a query"""
        top_k = top_k or self.config['retrieval']['top_k']
        score_threshold = score_threshold or self.config['retrieval']['score_threshold']
        
        # Embed query
        query_embedding = self.embedding_model.embed_query(query)
        query_vector = np.array([query_embedding]).astype('float32')
        faiss.normalize_L2(query_vector)
        
        # Search
        scores, indices = self.index.search(query_vector, top_k)
        
        # Filter by threshold and format results
        relevant_chunks = []
        for idx, score in zip(indices[0], scores[0]):
            if score >= score_threshold and idx < len(self.chunks):
                relevant_chunks.append({
                    'text': self.chunks[idx].page_content,
                    'score': float(score),
                    'chunk_id': int(idx)
                })
        
        return relevant_chunks

# Initialize retriever
print("Initializing retrieval system...")
retriever = OptimalRetriever(chunks, RAG_CONFIG)
print("Retrieval system ready!")

Initializing retrieval system...
Setting up embedding model...


  self.embedding_model = HuggingFaceEmbeddings(


Generating embeddings for all chunks...
Building FAISS index...
Retrieval system ready: 125 vectors (768D)
Retrieval system ready!


In [7]:
from huggingface_hub import login

# Login to Hugging Face to download Mistral-7B
print("üîê Logging in to Hugging Face...")
print("You'll be prompted to enter your HF token.")
print("Get your token from: https://huggingface.co/settings/tokens")
print("\nNote: The token input will be hidden for security.")

try:
    login()
    print("‚úÖ Successfully logged in to Hugging Face!")
except Exception as e:
    print(f"‚ùå Login failed: {e}")
    print("\nAlternative: You can set your token in environment variable:")
    print("HF_TOKEN=your_token_here")

üîê Logging in to Hugging Face...
You'll be prompted to enter your HF token.
Get your token from: https://huggingface.co/settings/tokens

Note: The token input will be hidden for security.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

‚úÖ Successfully logged in to Hugging Face!


## üîê Step 6: Hugging Face Authentication

Before downloading Mistral-7B, you need to authenticate with Hugging Face. This is required to access gated models.

In [8]:

import asyncio
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
# ==========================================================
# Optimized Model Loading for RTX 5050 (8GB VRAM) ‚Äî with Live Output
# ==========================================================
import sys, os, time, threading
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers.utils import logging
import torch



# === Enable live print output and progress ===
os.environ["PYTHONUNBUFFERED"] = "1"
logging.set_verbosity_info()
logging.enable_progress_bar()

def heartbeat():
    while True:
        print("‚è≥ Still working... please wait...", flush=True)
        time.sleep(60)

threading.Thread(target=heartbeat, daemon=True).start()

# ==========================================================
print("üöÄ LOADING MISTRAL-7B FOR RTX 5050 (8GB VRAM)", flush=True)
print("=" * 70, flush=True)

model_name = RAG_CONFIG['generation']['model_name']
model_loaded = False
mistral_model = None
mistral_tokenizer = None

# Clear GPU memory before starting
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(f"GPU: {torch.cuda.get_device_name(0)}", flush=True)
    print(f"Total VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB", flush=True)
    print(f"Initial GPU memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB", flush=True)

try:
    # Step 1: Load Tokenizer
    print(f"\nüìù Loading tokenizer for {model_name}...", flush=True)
    mistral_tokenizer = AutoTokenizer.from_pretrained(
        model_name, 
        trust_remote_code=True
    )
    if mistral_tokenizer.pad_token is None:
        mistral_tokenizer.pad_token = mistral_tokenizer.eos_token
    print("‚úÖ Tokenizer loaded successfully", flush=True)
    
    # Step 2: Configure 4-bit Quantization (Essential for 8GB VRAM)
    print("\n‚öôÔ∏è Configuring 4-bit quantization for 8GB VRAM...", flush=True)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,                      # Use 4-bit quantization
        bnb_4bit_compute_dtype=torch.float16,   # Compute in float16
        bnb_4bit_use_double_quant=True,         # Double quantization for extra memory savings
        bnb_4bit_quant_type="nf4"               # NormalFloat 4-bit quantization
    )
    print("‚úÖ Quantization config ready (4-bit NF4 + double quant)", flush=True)
    
    # Step 3: Load Model with Quantization
    print(f"\nüîÑ Loading {model_name} with 4-bit quantization...", flush=True)
    print("üì¶ This will download ~13GB if not cached (may take 5-10 minutes)", flush=True)
    print("‚è≥ Please wait...", flush=True)
    
    mistral_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",                      # Automatically distribute across GPU
        torch_dtype=torch.float16,              # Use float16 for memory efficiency
        trust_remote_code=True,
        low_cpu_mem_usage=True                  # Minimize CPU memory during loading
    )
    
    # Clear cache after loading
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    model_loaded = True
    
    # Display Success and Memory Stats
    print("\n" + "=" * 70, flush=True)
    print("‚úÖ MODEL LOADED SUCCESSFULLY!", flush=True)
    print("=" * 70, flush=True)
    
    if torch.cuda.is_available():
        memory_allocated = torch.cuda.memory_allocated() / 1e9
        memory_reserved = torch.cuda.memory_reserved() / 1e9
        memory_free = 8 - memory_reserved
        
        print(f"üíæ GPU Memory Status:", flush=True)
        print(f"   Allocated:  {memory_allocated:.2f} GB", flush=True)
        print(f"   Reserved:   {memory_reserved:.2f} GB", flush=True)
        print(f"   Free:       {memory_free:.2f} GB", flush=True)
        print(f"   Total:      8.00 GB", flush=True)
        print(f"   Usage:      {(memory_reserved/8)*100:.1f}%", flush=True)
        
        # Memory status indicator
        if memory_reserved < 6.4:  # < 80%
            print(f"   Status:     üü¢ Excellent - Plenty of headroom", flush=True)
        elif memory_reserved < 7.2:  # < 90%
            print(f"   Status:     üü° Good - Monitor memory usage", flush=True)
        else:
            print(f"   Status:     üî¥ Tight - Be cautious with batch sizes", flush=True)
    
    print("\nüéØ Optimizations Applied for 8GB VRAM:", flush=True)
    print("   ‚úÖ 4-bit NF4 quantization (~4GB model size)", flush=True)
    print("   ‚úÖ Double quantization for extra savings", flush=True)
    print("   ‚úÖ Float16 compute dtype", flush=True)
    print("   ‚úÖ Automatic device mapping", flush=True)
    print("   ‚úÖ Low CPU memory usage mode", flush=True)
    
    print("\nüìä Model Configuration:", flush=True)
    print(f"   Model:       {model_name}", flush=True)
    print(f"   Precision:   4-bit (quantized from 16-bit)", flush=True)
    print(f"   Device:      {next(mistral_model.parameters()).device}", flush=True)
    print(f"   Parameters:  ~7B (quantized)", flush=True)
    
except Exception as e:
    print("\n" + "=" * 70, flush=True)
    print("‚ùå MODEL LOADING FAILED", flush=True)
    print("=" * 70, flush=True)
    print(f"Error: {str(e)}", flush=True)
    print("\nüí° Troubleshooting Steps:", flush=True)
    print("1. ‚úÖ Check that you're logged in to Hugging Face (run previous cell)", flush=True)
    print("2. ‚úÖ Ensure you have internet connection for download", flush=True)
    print("3. ‚úÖ Verify you have ~15GB free disk space", flush=True)
    print("4. ‚úÖ Check that CUDA is available: torch.cuda.is_available()", flush=True)
    print("5. ‚úÖ Try restarting the notebook kernel", flush=True)
    print("\nüí° If issues persist:", flush=True)
    print("   - Install/update: pip install -U transformers accelerate bitsandbytes", flush=True)
    print("   - Clear HF cache: rm -rf ~/.cache/huggingface/", flush=True)
    
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    model_loaded = False

print("\n" + "=" * 70, flush=True)
print(f"üèÅ Final Status: {'‚úÖ READY TO USE' if model_loaded else '‚ùå NOT LOADED'}", flush=True)
print("=" * 70, flush=True)


‚è≥ Still working... please wait...üöÄ LOADING MISTRAL-7B FOR RTX 5050 (8GB VRAM)

GPU: NVIDIA GeForce RTX 5050 Laptop GPU
Total VRAM: 8.5 GB
Initial GPU memory: 0.45 GB

üìù Loading tokenizer for mistralai/Mistral-7B-Instruct-v0.1...


loading file tokenizer.model from cache at C:\Users\tebats\.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\snapshots\ec5deb64f2c6e6fa90c1abf74a91d5c93a9669ca\tokenizer.model
loading file tokenizer.json from cache at C:\Users\tebats\.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\snapshots\ec5deb64f2c6e6fa90c1abf74a91d5c93a9669ca\tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at C:\Users\tebats\.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\snapshots\ec5deb64f2c6e6fa90c1abf74a91d5c93a9669ca\special_tokens_map.json
loading file tokenizer_config.json from cache at C:\Users\tebats\.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\snapshots\ec5deb64f2c6e6fa90c1abf74a91d5c93a9669ca\tokenizer_config.json
loading file chat_template.jinja from cache at None


‚úÖ Tokenizer loaded successfully

‚öôÔ∏è Configuring 4-bit quantization for 8GB VRAM...
‚úÖ Quantization config ready (4-bit NF4 + double quant)

üîÑ Loading mistralai/Mistral-7B-Instruct-v0.1 with 4-bit quantization...
üì¶ This will download ~13GB if not cached (may take 5-10 minutes)
‚è≥ Please wait...


loading configuration file config.json from cache at C:\Users\tebats\.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\snapshots\ec5deb64f2c6e6fa90c1abf74a91d5c93a9669ca\config.json
`torch_dtype` is deprecated! Use `dtype` instead!
Model config MistralConfig {
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "dtype": "float16",
  "eos_token_id": 2,
  "head_dim": null,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "transformers_version": "4.57.1",
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file model.safetensors from cache at C:\Users\tebats\.cache\huggingface\hub\models--mistralai--Mistr

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

loading configuration file generation_config.json from cache at C:\Users\tebats\.cache\huggingface\hub\models--mistralai--Mistral-7B-Instruct-v0.1\snapshots\ec5deb64f2c6e6fa90c1abf74a91d5c93a9669ca\generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}

Could not locate the custom_generate/generate.py inside mistralai/Mistral-7B-Instruct-v0.1.



‚úÖ MODEL LOADED SUCCESSFULLY!
üíæ GPU Memory Status:
   Allocated:  4.58 GB
   Reserved:   4.70 GB
   Free:       3.30 GB
   Total:      8.00 GB
   Usage:      58.8%
   Status:     üü¢ Excellent - Plenty of headroom

üéØ Optimizations Applied for 8GB VRAM:
   ‚úÖ 4-bit NF4 quantization (~4GB model size)
   ‚úÖ Double quantization for extra savings
   ‚úÖ Float16 compute dtype
   ‚úÖ Automatic device mapping
   ‚úÖ Low CPU memory usage mode

üìä Model Configuration:
   Model:       mistralai/Mistral-7B-Instruct-v0.1
   Precision:   4-bit (quantized from 16-bit)
   Device:      cuda:0
   Parameters:  ~7B (quantized)

üèÅ Final Status: ‚úÖ READY TO USE


‚è≥ Still working... please wait...
‚è≥ Still working... please wait...
‚è≥ Still working... please wait...
‚è≥ Still working... please wait...
‚è≥ Still working... please wait...


In [9]:
class MistralRAGGenerator:
    """Memory-optimized RAG system using Mistral-7B-Instruct for RTX 5050 (8GB VRAM)"""
    
    def __init__(self, model, tokenizer, retriever, config):
        self.model = model
        self.tokenizer = tokenizer
        self.retriever = retriever
        self.config = config

        # üîß Optimized system prompt (uses second version's [INST] style)
        self.system_prompt = """<s>[INST] You are a university student advisor with access to the official student handbook.
Your task is to answer the student's question accurately using only the provided handbook context.

Guidelines:
- Use only the provided context to answer.
- If the context doesn‚Äôt include enough information, say so clearly.
- Be specific about policies, procedures, and requirements.
- Keep your response concise and factual.

HANDBOOK CONTEXT:
{context}

STUDENT QUESTION:
{question}

Provide a clear, helpful answer based on the handbook context above. 
If the handbook lacks enough information, explicitly say so. [/INST]
Answer:"""
    
    def format_context(self, retrieved_chunks):
        """Format retrieved chunks into context string"""
        if not retrieved_chunks:
            return "No relevant information found."
        
        context_parts = []
        for i, chunk in enumerate(retrieved_chunks, 1):
            context_parts.append(f"[Section {i}] {chunk['text']}")
        
        return "\n\n".join(context_parts)
    
    def generate_response(self, question, max_new_tokens=200, temperature=None, top_p=None, verbose=True):
        """
        Generate RAG response optimized for 8GB VRAM GPUs.
        - Uses reduced context and token limits
        - Clears GPU memory between steps
        - Enables KV cache for faster generation
        """
        temperature = temperature or self.config['generation']['temperature']
        top_p = top_p or self.config['generation']['top_p']
        
        if verbose:
            print(f"\n‚ùì Question: {question}")
            print("=" * 70)
        
        # üßπ Clear GPU cache before generation
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        
        # üîç Step 1: Retrieve relevant context
        if verbose:
            print("üîç Retrieving relevant context...")
        retrieved_chunks = self.retriever.retrieve(question)
        
        if not retrieved_chunks:
            if verbose:
                print("‚ö†Ô∏è No relevant context found.")
            return {
                'question': question,
                'response': "I couldn‚Äôt find relevant information in the student handbook to answer your question.",
                'retrieved_chunks': [],
                'context_used': ""
            }
        
        if verbose:
            print(f"üìö Found {len(retrieved_chunks)} relevant chunks:")
            for i, chunk in enumerate(retrieved_chunks, 1):
                print(f"   {i}. Relevance score: {chunk['score']:.4f}")
        
        # üß† Step 2: Format context (limit length for VRAM efficiency)
        context = self.format_context(retrieved_chunks)
        context_limited = context[:1200]  # 1.2k chars keeps VRAM below 8GB
        
        # üß© Step 3: Construct optimized prompt
        prompt = self.system_prompt.format(context=context_limited, question=question)
        
        # ‚úÇÔ∏è Step 4: Tokenize input efficiently
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=1024  # Lower to prevent OOM on 8GB GPUs
        )
        
        if torch.cuda.is_available():
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
        
        # üìä GPU memory check before generation
        if verbose and torch.cuda.is_available():
            mem_before = torch.cuda.memory_allocated() / 1e9
            print(f"üíæ GPU memory before generation: {mem_before:.2f} GB")
        
        # ‚öôÔ∏è Step 5: Generate response (no grad, cache on)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,      # keep small for VRAM
                temperature=temperature,
                top_p=top_p,
                do_sample=self.config['generation']['do_sample'],
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
                use_cache=True                       # enables faster generation
            )
        
        # üßæ Step 6: Decode and extract answer
        full_output = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        if "Answer:" in full_output:
            response = full_output.split("Answer:")[-1].strip()
        else:
            response = full_output[len(prompt):].strip()
        
        # üßπ Step 7: Cleanup GPU memory
        del inputs, outputs
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            if verbose:
                mem_after = torch.cuda.memory_allocated() / 1e9
                print(f"üíæ GPU memory after cleanup: {mem_after:.2f} GB")
        
        # ü™Ñ Step 8: Display and return
        if verbose:
            print("\nüí° Answer:")
            print("-" * 70)
            print(response)
            print("-" * 70)
        
        return {
            'question': question,
            'response': response,
            'retrieved_chunks': retrieved_chunks,
            'context_used': context_limited
        }


In [10]:
# Initialize the RAG system
if model_loaded and mistral_model is not None and mistral_tokenizer is not None:
    print("=" * 70)
    print("üîß INITIALIZING RAG SYSTEM")
    print("=" * 70)
    
    rag_system = MistralRAGGenerator(
        model=mistral_model,
        tokenizer=mistral_tokenizer,
        retriever=retriever,
        config=RAG_CONFIG
    )
    
    print("‚úÖ RAG system initialized successfully!")
    print("=" * 70)
else:
    print("‚ùå Cannot initialize RAG system - model or tokenizer not loaded")
    rag_system = None

üîß INITIALIZING RAG SYSTEM
‚úÖ RAG system initialized successfully!


In [11]:
# Test the RAG system with sample questions
test_questions = [
    "What are the admission requirements?",
    "What is the grading system?",
    "How do I apply for financial aid?",
    "What are the library hours?",
    "What is the academic calendar?"
]

def test_rag_system(questions=None):
    """Test RAG system with multiple questions"""
    if not model_loaded:
        print("Cannot test - model not loaded")
        return
    
    questions = questions or test_questions
    results = []
    
    print(f"Testing RAG system with {len(questions)} questions...")
    print("=" * 70)
    
    for i, question in enumerate(questions, 1):
        print(f"\nTEST {i}/{len(questions)}")
        print("=" * 70)
        
        start_time = time.time()
        result = rag_system.generate_response(question)
        elapsed_time = time.time() - start_time
        
        result['elapsed_time'] = elapsed_time
        results.append(result)
        
        print(f"\nTime taken: {elapsed_time:.2f} seconds")
        print("=" * 70)
    
    return results

# Run quick test (uncomment to test)
# test_results = test_rag_system(test_questions[:2])  # Test first 2 questions

print("RAG system ready for testing!")



RAG system ready for testing!


In [13]:
def interactive_rag_chat():
    """Enhanced interactive chat interface optimized for RTX 5050"""
    if not model_loaded:
        print("‚ùå Cannot start chat - model not loaded")
        return
    
    print("=" * 70)
    print("ü§ñ INTERACTIVE RAG CHAT - RTX 5050 OPTIMIZED")
    print("=" * 70)
    print("Ask questions about the student handbook!")
    print("\nüìã Commands:")
    print("  'quit' or 'exit' - End chat")
    print("  'help' - Show commands")
    print("  'memory' - Show GPU memory usage")
    print("  'history' - Show chat history")
    print("  'clear' - Clear chat history")
    print("  'save' - Save chat history to file")
    print("=" * 70)
    
    chat_history = []
    
    while True:
        try:
            # Show memory status
            if torch.cuda.is_available():
                memory_used = torch.cuda.memory_allocated() / 1e9
                memory_percent = (memory_used / 8) * 100
                status = "üü¢" if memory_percent < 80 else "üü°" if memory_percent < 90 else "üî¥"
                print(f"\n{status} GPU Memory: {memory_used:.1f}GB ({memory_percent:.0f}%)")
            
            user_input = input("\nüí¨ Your question: ").strip()
            
            if user_input.lower() in ['quit', 'exit', 'q']:
                print("üëã Chat ended. Goodbye!")
                break
            
            elif user_input.lower() == 'help':
                print("\nüìã Available Commands:")
                print("  help - Show this help")
                print("  quit/exit/q - End chat")
                print("  memory - Show detailed GPU memory info")
                print("  history - Show chat history")
                print("  clear - Clear chat history")
                print("  save - Save chat history to file")
                print("  settings - Show current RAG settings")
                continue
            
            elif user_input.lower() == 'memory':
                if torch.cuda.is_available():
                    memory_allocated = torch.cuda.memory_allocated() / 1e9
                    memory_reserved = torch.cuda.memory_reserved() / 1e9
                    memory_free = 8 - memory_reserved
                    print(f"\nüîç GPU Memory Status:")
                    print(f"  Allocated: {memory_allocated:.2f} GB")
                    print(f"  Reserved:  {memory_reserved:.2f} GB")
                    print(f"  Free:      {memory_free:.2f} GB")
                    print(f"  Total:     8.0 GB")
                else:
                    print("‚ùå CUDA not available")
                continue
            
            elif user_input.lower() == 'settings':
                print(f"\n‚öôÔ∏è Current RAG Settings:")
                print(f"  Temperature: {RAG_CONFIG['generation']['temperature']}")
                print(f"  Top-p: {RAG_CONFIG['generation']['top_p']}")
                print(f"  Max new tokens: 256 (optimized for RTX 5050)")
                print(f"  Retrieval top-k: {RAG_CONFIG['retrieval']['top_k']}")
                continue
            
            elif user_input.lower() == 'history':
                if chat_history:
                    print(f"\nüìú Chat History ({len(chat_history)} questions):")
                    for i, item in enumerate(chat_history, 1):
                        print(f"  {i}. {item['question'][:60]}{'...' if len(item['question']) > 60 else ''}")
                else:
                    print("\nüìú No chat history yet.")
                continue
            
            elif user_input.lower() == 'clear':
                chat_history.clear()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                print("üóëÔ∏è Chat history cleared and GPU cache cleaned.")
                continue
            
            elif user_input.lower() == 'save':
                if chat_history:
                    filename = f"chat_history_{time.strftime('%Y%m%d_%H%M%S')}.json"
                    with open(filename, 'w', encoding='utf-8') as f:
                        json.dump([{
                            'question': item['question'],
                            'response': item['response'],
                            'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
                        } for item in chat_history], f, indent=2)
                    print(f"üíæ Chat history saved to {filename}")
                else:
                    print("üìú No chat history to save.")
                continue
            
            elif not user_input:
                print("‚ùì Please enter a question.")
                continue
            
            # Generate response with timing
            start_time = time.time()
            result = rag_system.generate_response(user_input, max_new_tokens=256, verbose=True)
            elapsed_time = time.time() - start_time
            
            result['timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S')
            result['response_time'] = elapsed_time
            chat_history.append(result)
            
            print(f"\n‚è±Ô∏è Response time: {elapsed_time:.1f}s")
            
            # Memory cleanup after each response
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            
        except KeyboardInterrupt:
            print("\n\n‚ö†Ô∏è Chat interrupted. Goodbye!")
            break
        except Exception as e:
            print(f"‚ùå Error: {e}")
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
    
    return chat_history

# Start interactive chat (uncomment to use)
# chat_history = interactive_rag_chat()

print("üöÄ Enhanced interactive chat function ready!")

interactive_rag_chat()

üöÄ Enhanced interactive chat function ready!
ü§ñ INTERACTIVE RAG CHAT - RTX 5050 OPTIMIZED
Ask questions about the student handbook!

üìã Commands:
  'quit' or 'exit' - End chat
  'help' - Show commands
  'memory' - Show GPU memory usage
  'history' - Show chat history
  'clear' - Clear chat history
  'save' - Save chat history to file

üü¢ GPU Memory: 4.6GB (57%)

‚ùì Question: Who is the president of EARIST?
üîç Retrieving relevant context...
üìö Found 5 relevant chunks:
   1. Relevance score: 0.5430
   2. Relevance score: 0.5364
   3. Relevance score: 0.5019
   4. Relevance score: 0.4890
   5. Relevance score: 0.4865
üíæ GPU memory before generation: 4.58 GB
üíæ GPU memory after cleanup: 4.58 GB

üí° Answer:
----------------------------------------------------------------------
The handbook context does not provide information about the current president of EARIST.
----------------------------------------------------------------------

‚è±Ô∏è Response time: 33.9s

üü¢ GPU

[{'question': 'Who is the president of EARIST?',
  'response': 'The handbook context does not provide information about the current president of EARIST.',
  'retrieved_chunks': [{'text': 'HISTORY OF EARIST\n\nThe Eulogio "Amang" Rodriguez Institute of Science and Technology (EARIST) was established after the liberation of Manila in 1945. EARIST traces back its development from Vocational High School with only a room at the second floor of the Mapa High School, nine teachers, a clerk, and 147 students under Mr. Pantaleon Regala, its principal. Its former name was Eulogio Rodriguez Vocational High School (ERVHS).\n\nOn July 1, 1946, EARIST acquired its present site at Nagtahan, Sampaloc, Manila. Mr. Apolinario Apilado was appointed as principal. He was succeeded by Dr. Hilario G. Nudas in 1949.\n\nEARIST\'s growth and development were made possible via three Republic Acts and Presidential Decree, to wit:\n\n- **Republic Act No. 4072**, jointly sponsored by Congressman Ramon D. Bagatsing 

In [12]:
def batch_process_questions(questions, output_file=None, memory_cleanup_interval=5):
    """
    Batch process multiple questions with memory optimization for RTX 5050
    
    Args:
        questions: List of questions (strings) or list of dicts with 'question' key
        output_file: Optional filename to save results
        memory_cleanup_interval: Clean GPU memory every N questions
    """
    if not model_loaded:
        print("‚ùå Cannot process - model not loaded")
        return None
    
    # Handle different input formats
    if isinstance(questions, str):
        questions = [questions]
    
    question_list = []
    for q in questions:
        if isinstance(q, str):
            question_list.append(q)
        elif isinstance(q, dict) and 'question' in q:
            question_list.append(q['question'])
        else:
            print(f"‚ö†Ô∏è Skipping invalid question format: {q}")
    
    print("=" * 70)
    print("üîÑ BATCH PROCESSING MODE - RTX 5050 OPTIMIZED")
    print("=" * 70)
    print(f"üìä Processing {len(question_list)} questions")
    print(f"üßπ Memory cleanup every {memory_cleanup_interval} questions")
    print(f"üíæ Output file: {output_file or 'None (in-memory only)'}")
    print("=" * 70)
    
    results = []
    start_time = time.time()
    
    for i, question in enumerate(question_list, 1):
        print(f"\nüìù Processing {i}/{len(question_list)}: {question[:60]}{'...' if len(question) > 60 else ''}")
        
        # Memory status before processing
        if torch.cuda.is_available():
            memory_before = torch.cuda.memory_allocated() / 1e9
            print(f"   üíæ GPU Memory: {memory_before:.1f}GB")
        
        try:
            # Generate response with reduced verbosity for batch mode
            question_start = time.time()
            result = rag_system.generate_response(
                question, 
                max_new_tokens=256, 
                verbose=False  # Reduce output for batch processing
            )
            question_time = time.time() - question_start
            
            # Add metadata
            result.update({
                'batch_index': i,
                'processing_time': question_time,
                'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
                'gpu_memory_before': memory_before if torch.cuda.is_available() else None
            })
            
            results.append(result)
            
            print(f"   ‚úÖ Completed in {question_time:.1f}s")
            print(f"   üìã Found {len(result['retrieved_chunks'])} relevant chunks")
            
            # Memory cleanup at intervals
            if i % memory_cleanup_interval == 0:
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    memory_after = torch.cuda.memory_allocated() / 1e9
                    print(f"   üßπ Memory cleaned: {memory_after:.1f}GB")
            
        except Exception as e:
            print(f"   ‚ùå Error processing question {i}: {e}")
            error_result = {
                'question': question,
                'response': f"Error: {str(e)}",
                'retrieved_chunks': [],
                'context_used': "",
                'batch_index': i,
                'processing_time': 0,
                'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
                'error': str(e)
            }
            results.append(error_result)
            
            # Force memory cleanup on error
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
    
    total_time = time.time() - start_time
    
    # Final cleanup
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    # Calculate statistics
    successful_results = [r for r in results if 'error' not in r]
    error_count = len(results) - len(successful_results)
    avg_time = np.mean([r['processing_time'] for r in successful_results]) if successful_results else 0
    total_chunks = sum(len(r['retrieved_chunks']) for r in successful_results)
    
    print(f"\n{'='*70}")
    print(f"üìä BATCH PROCESSING COMPLETE")
    print(f"{'='*70}")
    print(f"‚úÖ Successful: {len(successful_results)}/{len(question_list)}")
    print(f"‚ùå Errors: {error_count}")
    print(f"‚è±Ô∏è Total time: {total_time:.1f}s")
    print(f"‚ö° Average time per question: {avg_time:.1f}s")
    print(f"üìã Total chunks retrieved: {total_chunks}")
    print(f"üß† Average chunks per question: {total_chunks/len(successful_results) if successful_results else 0:.1f}")
    
    # Save results if requested
    if output_file:
        try:
            # Create serializable version
            serializable_results = []
            for result in results:
                serialized = {
                    'question': result['question'],
                    'response': result['response'],
                    'batch_index': result['batch_index'],
                    'processing_time': result['processing_time'],
                    'timestamp': result['timestamp'],
                    'num_chunks_retrieved': len(result['retrieved_chunks']),
                    'retrieval_scores': [chunk['score'] for chunk in result['retrieved_chunks']],
                    'error': result.get('error', None)
                }
                serializable_results.append(serialized)
            
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump({
                    'metadata': {
                        'total_questions': len(question_list),
                        'successful_results': len(successful_results),
                        'error_count': error_count,
                        'total_processing_time': total_time,
                        'average_time_per_question': avg_time,
                        'processing_date': time.strftime('%Y-%m-%d %H:%M:%S'),
                        'model_config': RAG_CONFIG
                    },
                    'results': serializable_results
                }, f, indent=2)
            
            print(f"üíæ Results saved to {output_file}")
            
        except Exception as e:
            print(f"‚ùå Error saving results: {e}")
    
    print(f"{'='*70}\n")
    return results

def load_questions_from_file(filepath):
    """Load questions from various file formats"""
    try:
        if filepath.endswith('.json'):
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
                if isinstance(data, list):
                    return data
                elif isinstance(data, dict):
                    # Handle Questions.json format
                    questions = []
                    for category, q_list in data.items():
                        questions.extend(q_list)
                    return questions
        
        elif filepath.endswith('.txt'):
            with open(filepath, 'r', encoding='utf-8') as f:
                return [line.strip() for line in f if line.strip()]
        
        else:
            print(f"‚ùå Unsupported file format: {filepath}")
            return None
            
    except Exception as e:
        print(f"‚ùå Error loading questions from {filepath}: {e}")
        return None

# Example usage functions
def batch_process_test_questions():
    """Process the predefined test questions"""
    test_questions = [
        "What are the admission requirements?",
        "What is the grading system?",
        "How do I apply for financial aid?",
        "What are the library hours?",
        "What is the academic calendar?",
        "What are the graduation requirements?",
        "How do I register for classes?",
        "What academic support services are available?"
    ]
    
    return batch_process_questions(
        test_questions, 
        output_file=f"batch_test_results_{time.strftime('%Y%m%d_%H%M%S')}.json"
    )

def batch_process_questions_json():
    """Process all questions from Questions.json file"""
    questions = load_questions_from_file('../Retriever/Questions.json')
    if questions:
        return batch_process_questions(
            questions, 
            output_file=f"batch_all_questions_{time.strftime('%Y%m%d_%H%M%S')}.json",
            memory_cleanup_interval=3  # More frequent cleanup for larger batches
        )
    else:
        print("‚ùå Could not load questions from Questions.json")
        return None

print("üîÑ Advanced batch processing functions ready!")
print("\nUsage examples:")
print("  batch_process_test_questions()  # Process 8 test questions")
print("  batch_process_questions_json()  # Process all Questions.json")
print("  batch_process_questions(['Q1', 'Q2'], 'results.json')  # Custom questions")

üîÑ Advanced batch processing functions ready!

Usage examples:
  batch_process_test_questions()  # Process 8 test questions
  batch_process_questions_json()  # Process all Questions.json
  batch_process_questions(['Q1', 'Q2'], 'results.json')  # Custom questions


In [13]:
def evaluate_rag_with_questions_json():
    """Evaluate RAG system using your Questions.json file"""
    if not model_loaded:
        print("Cannot evaluate - model not loaded")
        return
    
    try:
        with open('../Retriever/Questions.json', 'r', encoding='utf-8') as f:
            questions_data = json.load(f)
    except FileNotFoundError:
        print("Questions.json not found. Make sure it exists in ../Retriever/")
        return
    
    print("=" * 70)
    print("RAG SYSTEM EVALUATION")
    print("=" * 70)
    
    all_results = []
    total_questions = sum(len(questions) for questions in questions_data.values())
    current_q = 0
    
    for category, questions in questions_data.items():
        print(f"\nEvaluating Category: {category}")
        print(f"Questions: {len(questions)}")
        print("-" * 50)
        
        category_results = []
        
        for q_item in questions:
            current_q += 1
            question = q_item['question']
            expected_ref = q_item['expected_reference']
            
            print(f"\n[{current_q}/{total_questions}] {question}")
            
            start_time = time.time()
            result = rag_system.generate_response(question)
            elapsed_time = time.time() - start_time
            
            result['expected_reference'] = expected_ref
            result['category'] = category
            result['elapsed_time'] = elapsed_time
            
            category_results.append(result)
            all_results.append(result)
            
            print(f"Time: {elapsed_time:.2f}s")
            print("-" * 30)
        
        # Category summary
        avg_time = np.mean([r['elapsed_time'] for r in category_results])
        avg_chunks = np.mean([len(r['retrieved_chunks']) for r in category_results])
        print(f"\nCategory Summary:")
        print(f"  Average response time: {avg_time:.2f}s")
        print(f"  Average chunks retrieved: {avg_chunks:.1f}")
    
    # Overall summary
    print(f"\n{'='*70}")
    print(f"EVALUATION COMPLETE")
    print(f"{'='*70}")
    print(f"Total questions: {len(all_results)}")
    print(f"Average response time: {np.mean([r['elapsed_time'] for r in all_results]):.2f}s")
    print(f"Total evaluation time: {sum([r['elapsed_time'] for r in all_results]):.1f}s")
    
    return all_results

def save_evaluation_results(results, filename="rag_evaluation_results.json"):
    """Save evaluation results to file"""
    if not results:
        print("No results to save")
        return
    
    # Convert to serializable format
    serializable_results = []
    for result in results:
        serialized = {
            'question': result['question'],
            'response': result['response'],
            'expected_reference': result.get('expected_reference', ''),
            'category': result.get('category', ''),
            'elapsed_time': result['elapsed_time'],
            'num_chunks_retrieved': len(result['retrieved_chunks']),
            'retrieval_scores': [chunk['score'] for chunk in result['retrieved_chunks']]
        }
        serializable_results.append(serialized)
    
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(serializable_results, f, indent=2)
    
    print(f"Evaluation results saved to {filename}")

# Run evaluation (uncomment to evaluate)
# evaluation_results = evaluate_rag_with_questions_json()
# save_evaluation_results(evaluation_results)

print("Evaluation functions ready!")

Evaluation functions ready!


In [None]:
# üöÄ QUICK START - Test Your RAG System on RTX 5050

if model_loaded and rag_system is not None:
    print("=" * 70)
    print("üöÄ TESTING RAG SYSTEM ON RTX 5050 (8GB VRAM)")
    print("=" * 70)
    
    # Show current GPU memory status
    if torch.cuda.is_available():
        memory_used = torch.cuda.memory_allocated() / 1e9
        memory_percent = (memory_used / 8) * 100
        
        if memory_percent < 80:
            status = "? Excellent"
        elif memory_percent < 90:
            status = "üü° Good"
        else:
            status = "üî¥ High"
        
        print(f"\nüíæ GPU Memory: {memory_used:.1f} GB / 8 GB ({memory_percent:.0f}%) - {status}")
    
    # Test with a sample question
    sample_question = "What are the admission requirements?"
    
    print(f"\nüß™ Testing with sample question:")
    print(f"'{sample_question}'")
    print("-" * 70)
    
    # Generate response with timing
    start_time = time.time()
    result = rag_system.generate_response(sample_question, max_new_tokens=200, verbose=True)
    elapsed_time = time.time() - start_time
    
    print(f"\n‚è±Ô∏è Response generated in {elapsed_time:.1f} seconds")
    print("=" * 70)
    print("‚úÖ SUCCESS! Your RAG system is working on RTX 5050!")
    print("=" * 70)
    
    # Final memory cleanup
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        final_memory = torch.cuda.memory_allocated() / 1e9
        print(f"üßπ Memory after cleanup: {final_memory:.1f} GB / 8 GB")
    
    # Usage guide
    print("\n" + "=" * 70)
    print("? HOW TO USE YOUR RAG SYSTEM")
    print("=" * 70)
    
    print("\n1Ô∏è‚É£  Ask a Single Question:")
    print("   result = rag_system.generate_response('your question here')")
    
    print("\n2Ô∏è‚É£  Interactive Chat Mode:")
    print("   interactive_rag_chat()")
    
    print("\n3Ô∏è‚É£  Batch Process Multiple Questions:")
    print("   batch_process_test_questions()")
    
    print("\n4Ô∏è‚É£  Check GPU Memory:")
    print("   torch.cuda.memory_allocated() / 1e9  # Shows GB used")
    print("   torch.cuda.empty_cache()             # Frees unused memory")
    
    print("\n" + "=" * 70)
    print("? TIPS FOR 8GB VRAM:")
    print("=" * 70)
    print("‚úÖ Keep max_new_tokens ‚â§ 250 for safety")
    print("‚úÖ Process questions one at a time or use small batches")
    print("‚úÖ Run torch.cuda.empty_cache() if you get OOM errors")
    print("‚úÖ Monitor memory with the 'memory' command in interactive mode")
    print("‚úÖ Close other GPU-using applications for best performance")
    
    print("\n" + "=" * 70)
    print("üéâ YOU'RE ALL SET! Start asking questions about the student handbook!")
    print("=" * 70)
    
else:
    print("=" * 70)
    print("‚ùå RAG SYSTEM NOT READY")
    print("=" * 70)
    print("\nPlease run all cells in order:")
    print("1. ‚úÖ Cell 1: Import libraries and check GPU")
    print("2. ‚úÖ Cell 2: Load dataset")
    print("3. ‚úÖ Cell 3: Configure RAG settings")
    print("4. ‚úÖ Cell 4: Load/create chunks")
    print("5. ‚úÖ Cell 5: Initialize retriever")
    print("6. ‚úÖ Cell 6: Login to Hugging Face")
    print("7. ‚úÖ Cell 7: Load Mistral-7B model")
    print("8. ‚úÖ Cell 8: Initialize RAG generator")
    print("\nThen run this cell again!")
    print("=" * 70)

In [12]:
def process_and_save_retrieved_chunks(question, output_txt_file=None):
    """
    Process a single question through the RAG workflow and save retrieved chunks to a .txt file.
    Args:
        question (str): The question to process.
        output_txt_file (str): Path to the output .txt file. If None, uses a timestamped default.
    Returns:
        dict: The RAG response result.
    """
    if not model_loaded or rag_system is None:
        print("‚ùå RAG system not ready. Please ensure the model is loaded.")
        return None

    print("=" * 70)
    print(f"Processing question: {question}")
    print("=" * 70)
    result = rag_system.generate_response(question, verbose=True)

    # Save retrieved chunks to .txt file
    if output_txt_file is None:
        safe_question = question[:40].replace(' ', '_').replace('?', '')
        output_txt_file = f"retrieved_chunks_{safe_question}_{time.strftime('%Y%m%d_%H%M%S')}.txt"

    try:
        with open(output_txt_file, 'w', encoding='utf-8') as f:
            f.write(f"Question: {question}\n\n")
            f.write("Retrieved Chunks:\n\n")
            for i, chunk in enumerate(result['retrieved_chunks'], 1):
                f.write(f"[Chunk {i}] (Score: {chunk['score']:.4f})\n")
                f.write(chunk['text'] + "\n\n")
        print(f"\nüíæ Retrieved chunks saved to {output_txt_file}")
    except Exception as e:
        print(f"‚ùå Error saving retrieved chunks: {e}")

    return result

print("\nFunction 'process_and_save_retrieved_chunks' is ready! Use it to process a question and save retrieved chunks to a .txt file.")


process_and_save_retrieved_chunks("Who is the Current President of Earist?")


Function 'process_and_save_retrieved_chunks' is ready! Use it to process a question and save retrieved chunks to a .txt file.
Processing question: Who is the Current President of Earist?

‚ùì Question: Who is the Current President of Earist?
üîç Retrieving relevant context...
üìö Found 5 relevant chunks:
   1. Relevance score: 0.7226
   2. Relevance score: 0.7042
   3. Relevance score: 0.6187
   4. Relevance score: 0.6060
   5. Relevance score: 0.5853
üíæ GPU memory before generation: 4.58 GB
üìö Found 5 relevant chunks:
   1. Relevance score: 0.7226
   2. Relevance score: 0.7042
   3. Relevance score: 0.6187
   4. Relevance score: 0.6060
   5. Relevance score: 0.5853
üíæ GPU memory before generation: 4.58 GB
üíæ GPU memory after cleanup: 4.58 GB

üí° Answer:
----------------------------------------------------------------------
According to the handbook, the current President of EARIST is Dr. Editha V. Caillao.
------------------------------------------------------------------

{'question': 'Who is the Current President of Earist?',
 'response': 'According to the handbook, the current President of EARIST is Dr. Editha V. Caillao.',
 'retrieved_chunks': [{'text': 'Caillao in 2006 to 2010. The 7th and current President of EARIST is Dr. Editha V.',
   'score': 0.7226303815841675,
   'chunk_id': 43},
  {'text': 'PILLO\nPresident, EARIST\nVice ‚Äì Chair\nHON.',
   'score': 0.7041721343994141,
   'chunk_id': 447},
  {'text': 'President ‚Äì the head of the school, college or university.',
   'score': 0.6186845302581787,
   'chunk_id': 587},
  {'text': 'Elected officers...',
   'score': 0.6059854030609131,
   'chunk_id': 293},
  {'text': 'Arturo P. Casuga is the 4th president followed by the 6th President Dr. Eduardo S.',
   'score': 0.5853207111358643,
   'chunk_id': 42}],
 'context_used': '[Section 1] Caillao in 2006 to 2010. The 7th and current President of EARIST is Dr. Editha V.\n\n[Section 2] PILLO\nPresident, EARIST\nVice ‚Äì Chair\nHON.\n\n[Section 3] Preside