<a href="https://colab.research.google.com/github/aiguru-pro/k-ai-engineers/blob/main/RAG_w_Gradio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🧠 RAG Workshop with Gradio Interface - Visual Educational Demo
# Kognitic AI for Engineers - Progressive Examples


ENHANCED WORKSHOP WITH GRADIO UI:
1. 📚 BASIC RAG: Simple document Q&A with clinical trial protocols
2. 🔍 INTERMEDIATE RAG: Multi-document analysis with reranking
3. 🎨 GRADIO UI: Beautiful visual interface for better learning experience

All original functionality is preserved; this version simply adds a visual interface!



In [1]:
# Install required packages for Google Colab
import os
import sys

# Install packages if not already installed.
# Every third-party package imported further down is probed here, so a
# missing `rank_bm25` also triggers the install instead of failing later.
try:
    import sentence_transformers
    import faiss
    import openai
    import rank_bm25
    import gradio as gr
except ImportError:
    import subprocess

    print("Installing required packages...")
    # Run pip through the current interpreter so the packages are installed
    # into the environment this kernel is actually using.
    install_result = subprocess.run(
        [
            sys.executable, '-m', 'pip', 'install',
            'openai', 'sentence-transformers', 'faiss-cpu', 'rank-bm25',
            'numpy', 'pandas', 'matplotlib', 'seaborn', 'scikit-learn',
            'gradio', '-q',
        ],
        check=False,
    )
    if install_result.returncode != 0:
        print(f"⚠️  pip install exited with status {install_result.returncode}")

import os
from typing import List, Dict, Any, Optional, Tuple
import numpy as np
import pandas as pd
from dataclasses import dataclass
import json
import re
import time
from datetime import datetime

from openai import OpenAI
from sentence_transformers import SentenceTransformer
import faiss
from rank_bm25 import BM25Okapi
import gradio as gr

# Startup banner: one header line followed by one status line per component.
print("🔧 EMBEDDING & VECTOR DB SETUP:")
for component_name in ("SentenceTransformers", "FAISS", "OpenAI", "Gradio"):
    print(f"✅ {component_name}: Available")


Installing required packages...
🔧 EMBEDDING & VECTOR DB SETUP:
✅ SentenceTransformers: Available
✅ FAISS: Available
✅ OpenAI: Available
✅ Gradio: Available


In [2]:
class BasicRAG:
    """
    Simplest RAG implementation for clinical trial protocol analysis.
    Shows core concepts without complexity.

    🎯 EDUCATIONAL FOCUS:
    - Basic RAG pipeline: Embed → Store → Retrieve → Generate
    - Using HuggingFace embedding models
    - Simple vector similarity search with FAISS
    """

    def __init__(self, api_key: str, embedding_model: str = 'all-MiniLM-L6-v2'):
        """Create the OpenAI client, load the embedding model and start with an empty store.

        Args:
            api_key: OpenAI API key passed to the client.
            embedding_model: Name of the SentenceTransformer model to load.
        """
        self.client = OpenAI(api_key=api_key)

        print(f"🤖 Loading embedding model: {embedding_model}")
        self.encoder = SentenceTransformer(embedding_model)  # HuggingFace model

        # Knowledge base starts empty; the index is only created once
        # at least one embedding has been stored.
        self.documents, self.embeddings, self.index = [], [], None

        embedding_dim = self.encoder.get_sentence_embedding_dimension()
        print(f"✅ Model loaded: {embedding_dim}-dimensional embeddings")

    def add_document(self, text: str, metadata: Dict = None):
        """Add a document to the knowledge base with chunking demonstration"""

        print(f"📄 Processing document ({len(text)} characters)...")

        # Demonstrate chunking impact
        chunks = self._smart_chunk_document(text)
        print(f"📚 Created {len(chunks)} chunks using sentence-based chunking")

        for i, chunk in enumerate(chunks):
            chunk_metadata = {**(metadata or {}), 'chunk_id': i, 'chunk_text_preview': chunk[:50]}

            self.documents.append({
                'text': chunk,
                'metadata': chunk_metadata
            })

            # Create embedding using HuggingFace model
            print(f"🔄 Creating embedding for chunk {i+1}...")
            embedding = self.encoder.encode(chunk)
            self.embeddings.append(embedding)

        # Rebuild FAISS index
        self._build_faiss_index()
        print(f"🗄️ FAISS index updated with {len(self.documents)} total chunks")

    def _smart_chunk_document(self, text: str, sentences_per_chunk: int = 3) -> List[str]:
        """Demonstrate sentence-based chunking (better than naive character splitting)"""

        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]

        chunks = []
        for i in range(0, len(sentences), sentences_per_chunk):
            chunk = '. '.join(sentences[i:i+sentences_per_chunk])
            if chunk:
                chunks.append(chunk + '.')

        return chunks

    def _build_faiss_index(self):
        """Build FAISS index for fast similarity search"""
        if self.embeddings:
            embeddings_array = np.array(self.embeddings).astype('float32')

            # Using Inner Product (IP) for cosine similarity
            self.index = faiss.IndexFlatIP(embeddings_array.shape[1])

            # Normalize embeddings for cosine similarity
            faiss.normalize_L2(embeddings_array)
            self.index.add(embeddings_array)

            print(f"🗄️ FAISS index built: {self.index.ntotal} vectors, {self.index.d} dimensions")

    def retrieve(self, query: str, top_k: int = 3) -> List[Dict]:
        """Retrieve most relevant documents using vector similarity"""
        if not self.index:
            return []

        print(f"🔍 Searching for: '{query}'")

        # Encode query using same model
        query_embedding = self.encoder.encode(query).astype('float32').reshape(1, -1)
        faiss.normalize_L2(query_embedding)  # Normalize for cosine similarity

        # Search using FAISS
        scores, indices = self.index.search(query_embedding, min(top_k, len(self.documents)))

        print(f"📊 Retrieved {len(indices[0])} results")

        # Return results with similarity scores
        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx < len(self.documents):
                results.append({
                    'document': self.documents[idx],
                    'similarity_score': float(score),
                    'embedding_model': self.encoder._modules['0'].auto_model.name_or_path
                })
                print(f"   📄 Chunk {idx}: similarity={score:.3f}")

        return results

    def generate_answer(self, query: str, context_docs: List[Dict]) -> str:
        """Generate answer using retrieved context"""

        # Prepare context
        context = "\n\n".join([
            f"Document {i+1}: {doc['document']['text']}"
            for i, doc in enumerate(context_docs)
        ])

        prompt = f"""
You are a clinical research specialist analyzing trial protocols.

CONTEXT DOCUMENTS:
{context}

QUESTION: {query}

INSTRUCTIONS:
- Answer based ONLY on the provided context
- If information isn't in the context, say "Based on the provided documents, I cannot find..."
- Be specific and cite which document your answer comes from
- Focus on clinical accuracy and regulatory compliance

ANSWER:
"""

        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=400,
            temperature=0.1
        )

        return response.choices[0].message.content

    def query(self, question: str) -> Dict[str, Any]:
        """Complete RAG pipeline: retrieve + generate"""

        # Retrieve relevant documents
        retrieved_docs = self.retrieve(question, top_k=3)

        if not retrieved_docs:
            return {
                'answer': "No relevant documents found in the knowledge base.",
                'sources': [],
                'method': 'basic_rag'
            }

        # Generate answer
        answer = self.generate_answer(question, retrieved_docs)

        return {
            'answer': answer,
            'sources': [doc['document']['metadata'] for doc in retrieved_docs],
            'scores': [doc['similarity_score'] for doc in retrieved_docs],
            'method': 'basic_rag'
        }


## Intermediate RAG

In [3]:
@dataclass
class Document:
    """Enhanced document structure with metadata.

    Bundles an identifier, the text, a metadata dictionary and an optional
    embedding vector into one record.
    """
    # Identifier of this record.
    id: str
    # Text content of the record.
    text: str
    # Free-form key/value metadata describing the text.
    metadata: Dict[str, Any]
    # Embedding vector for `text`; defaults to None when not supplied.
    embedding: Optional[np.ndarray] = None

class IntermediateRAG:
    """
    More sophisticated RAG with PDF processing, reranking and multi-document synthesis.
    Demonstrates cross-document querying across multiple clinical trial protocols.
    """

    def __init__(self, api_key: str):
        """Set up the LLM client, the embedding model and empty retrieval state.

        Args:
            api_key: OpenAI API key passed to the client.
        """
        self.client = OpenAI(api_key=api_key)
        self.encoder = SentenceTransformer('all-MiniLM-L6-v2')

        # Semantic side: stored documents plus the vector index (not built yet)
        self.documents: List[Document] = []
        self.index = None

        # Keyword side: BM25 model (not built yet) and its tokenized corpus
        self.bm25, self.tokenized_docs = None, []

    def create_sample_pdfs(self) -> List[str]:
        """Write the bundled sample protocols to disk and return their file names.

        Despite the name, the samples are written as UTF-8 ``.txt`` files
        (not PDFs) using the relative file names given as dictionary keys.
        A file that cannot be written is reported and left out of the result.

        Returns:
            The names of the files that were written successfully.
        """

        # Sample protocol texts keyed by the file name they are written to
        pdf_contents = {
            "KEYNOTE-189_Protocol.txt": "KEYNOTE-189 Clinical Trial Protocol\nMK-3475-189: Phase III Pembrolizumab + Pemetrexed + Platinum Study\n\nSTUDY SUMMARY:\nThis randomized, double-blind, placebo-controlled Phase III study compares pembrolizumab in combination with pemetrexed and carboplatin or cisplatin versus placebo in combination with pemetrexed and carboplatin or cisplatin in participants with previously untreated metastatic nonsquamous NSCLC.\n\nPRIMARY OBJECTIVES:\n- Compare overall survival (OS) of pembrolizumab combination vs placebo combination\n- Compare progression-free survival (PFS) of pembrolizumab combination vs placebo combination\n\nSTUDY DESIGN:\n- Phase III, randomized, double-blind, placebo-controlled\n- Population: Metastatic nonsquamous NSCLC, first-line treatment\n- Sample Size: Approximately 600 participants\n- Randomization: 2:1 (pembrolizumab:placebo)\n\nKEY STUDY RESULTS:\nPrimary Endpoint Results:\n- Overall Survival: 22.0 vs 10.7 months (HR 0.56, 95% CI 0.45-0.70, p<0.001)\n- Progression-Free Survival: 9.0 vs 4.9 months (HR 0.48, 95% CI 0.40-0.58, p<0.001)\n\nSecondary Endpoints:\n- Objective Response Rate: 48% vs 19% (p<0.001)\n- Duration of Response: 11.2 vs 7.8 months\n- Disease Control Rate: 71% vs 52%\n\nSafety Profile:\n- Grade 3+ Treatment-Related AEs: 71% vs 66%\n- Most common AEs: Fatigue (56%), nausea (45%), anemia (34%)\n- Immune-related AEs: Hypothyroidism (9%), pneumonitis (5%), hepatitis (2%)\n- Treatment discontinuation due to AEs: 13% vs 8%",

            "CheckMate-227_Protocol.txt": "CheckMate-227 Clinical Trial Protocol\nCA209-227: Phase III Nivolumab + Ipilimumab vs Chemotherapy Study\n\nSTUDY OVERVIEW:\nThis randomized, open-label, multi-center Phase III study evaluates nivolumab combined with ipilimumab versus platinum-based doublet chemotherapy in first-line treatment of stage IV or recurrent non-small cell lung cancer.\n\nPRIMARY OBJECTIVES:\n- Evaluate OS of nivolumab + ipilimumab vs chemotherapy in TMB ≥10 mut/Mb patients\n- Evaluate OS of nivolumab + ipilimumab vs chemotherapy in PD-L1 ≥1% patients\n\nSTUDY DESIGN:\n- Phase III, randomized, open-label, multi-center\n- Population: Previously untreated advanced/metastatic NSCLC\n- Total Enrollment: 1,739 participants\n- Stratification: Histology, sex, geographic region\n\nKEY STUDY RESULTS:\nPrimary Endpoint Results (TMB ≥10 mut/Mb):\n- Overall Survival: 17.3 vs 14.9 months (HR 0.79, 95% CI 0.65-0.96, p=0.0154)\n- Progression-Free Survival: 7.2 vs 5.5 months (HR 0.82, 95% CI 0.69-0.97)\n- Objective Response Rate: 36% vs 30%\n\nSafety Profile:\n- Grade 3-4 Treatment-Related AEs: 31% vs 36%\n- Most common irAEs: Skin reactions (34%), diarrhea (17%), hepatitis (8%)\n- Pneumonitis: 7% (Grade 3-4: 2%)\n- Treatment discontinuation due to AEs: 18%\n- Corticosteroid use for irAE management: 35%",

            "BEACON-CRC_Protocol.txt": "BEACON CRC Clinical Trial Protocol\nBRF117019: Phase III Encorafenib + Binimetinib + Cetuximab Study\n\nSTUDY RATIONALE:\nBackground:\nBRAF V600E mutations occur in approximately 8-12% of metastatic colorectal cancer (mCRC) patients and are associated with poor prognosis, resistance to anti-EGFR therapy when used as monotherapy, and limited treatment options in later-line settings.\n\nSTUDY OBJECTIVES:\nPrimary Objective:\nDemonstrate superior overall survival (OS) of encorafenib + cetuximab versus standard of care in patients with BRAF V600E-mutant mCRC who have received 1-2 prior systemic therapies.\n\nSTUDY DESIGN:\nOverall Design:\n- Phase III, randomized, open-label, active-controlled\n- Population: BRAF V600E-mutant mCRC, 1-2 prior therapies\n- Randomization: 1:1:1 across three treatment arms\n- Stratification: ECOG PS (0 vs 1), prior bevacizumab, region\n\nKEY STUDY RESULTS:\nPrimary Endpoint Results:\n- Overall Survival (Doublet vs Control): 9.0 vs 5.4 months (HR 0.52, p<0.001)\n- Overall Survival (Triplet vs Control): 9.3 vs 5.4 months (HR 0.54, p<0.001)\n\nSecondary Endpoints:\n- Progression-Free Survival: 4.5 vs 1.5 months (HR 0.42)\n- Objective Response Rate: 26% vs 2% (doublet), 27% vs 2% (triplet)\n- Disease Control Rate: 61% vs 22% (doublet), 65% vs 22% (triplet)\n\nClinical Significance:\n- First targeted therapy success in BRAF-mutant mCRC\n- Represents major breakthrough for historically poor prognosis population\n- Demonstrates importance of combination approach to overcome resistance"
        }

        print("📄 Creating sample clinical trial protocol files for Google Colab...")

        # Write each sample as a plain-text file (Colab-friendly alternative to PDFs)
        created_files = []
        for filename, content in pdf_contents.items():
            try:
                # Opening with 'w' overwrites an existing file of the same name
                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(content)
                created_files.append(filename)
                print(f"   ✅ Created {filename}")
            except Exception as e:
                # Best effort: report the failure and continue with the next file
                print(f"   ❌ Error creating {filename}: {e}")

        print(f"✅ Created {len(created_files)} protocol files ready for RAG processing")
        return created_files

    def load_pdf_content(self, file_path: str) -> str:
        """Load content from protocol file (Colab-friendly)"""

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            print(f"📖 Loaded {len(content)} characters from {file_path}")
            return content
        except FileNotFoundError:
            print(f"⚠️  File not found: {file_path}")
            return ""
        except Exception as e:
            print(f"❌ Error loading {file_path}: {e}")
            return ""

    def chunk_document(self, text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
        """Advanced chunking with overlap for better retrieval"""

        print(f"🔪 Chunking document: {len(text)} chars → chunks of ~{chunk_size} chars with {overlap} overlap")

        # Split by sections first (assuming sections are separated by multiple newlines)
        sections = text.split('\n\n')

        chunks = []
        current_chunk = ""

        for section in sections:
            # If adding this section would exceed chunk size, finalize current chunk
            if len(current_chunk) + len(section) > chunk_size and current_chunk:
                chunks.append(current_chunk.strip())
                # Start new chunk with overlap from previous chunk
                words = current_chunk.split()
                overlap_text = ' '.join(words[-overlap//10:]) if len(words) > overlap//10 else ""
                current_chunk = overlap_text + '\n\n' + section
            else:
                current_chunk += '\n\n' + section if current_chunk else section

        # Add the last chunk
        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        print(f"📚 Created {len(chunks)} chunks (avg length: {np.mean([len(c) for c in chunks]):.0f} chars)")
        return chunks

    def load_pdf_documents(self, pdf_files: List[str]):
        """Load and process multiple protocol documents"""

        print(f"\n📚 LOADING {len(pdf_files)} CLINICAL TRIAL DOCUMENTS")
        print("=" * 60)

        total_chunks = 0

        for pdf_file in pdf_files:
            print(f"\n📄 Processing: {pdf_file}")

            # Extract basic metadata from filename
            study_name = pdf_file.replace('_Protocol.txt', '').replace('.txt', '')

            # Load content
            content = self.load_pdf_content(pdf_file)
            if not content:
                continue

            # Extract enhanced metadata from content
            metadata = self.extract_metadata_from_content(content, study_name)
            print(f"🏷️  Extracted metadata: {metadata}")

            # Chunk the document with overlap
            chunks = self.chunk_document(content)

            print(f"📑 {study_name}: {len(chunks)} chunks created")

            # Add each chunk as a separate document with rich metadata
            for i, chunk in enumerate(chunks):
                chunk_id = f"{study_name}_chunk_{i+1}"
                chunk_metadata = {
                    **metadata,
                    'chunk_id': chunk_id,
                    'chunk_number': i+1,
                    'total_chunks': len(chunks),
                    'source_file': pdf_file,
                    'chunk_size': len(chunk),
                    'has_results': 'results' in chunk.lower(),
                    'has_safety': any(term in chunk.lower() for term in ['adverse', 'safety', 'toxicity']),
                    'has_efficacy': any(term in chunk.lower() for term in ['survival', 'response', 'efficacy'])
                }

                self.add_document(chunk_id, chunk, chunk_metadata)
                total_chunks += 1

        print(f"\n✅ DOCUMENT LOADING COMPLETE")
        print(f"📊 Total chunks indexed: {total_chunks}")
        print(f"🤖 Embedding model: sentence-transformers/all-MiniLM-L6-v2")
        print(f"🗄️ Vector database: FAISS (IndexFlatIP)")

    def extract_metadata_from_content(self, content: str, study_name: str) -> Dict[str, Any]:
        """Extract metadata from document content using pattern matching"""

        metadata = {
            'study': study_name,
            'document_type': 'clinical_protocol'
        }

        # Extract key information using simple regex patterns

        # Extract indication
        if 'NSCLC' in content or 'lung cancer' in content.lower():
            metadata['indication'] = 'NSCLC'
        elif 'colorectal' in content.lower() or 'CRC' in content:
            metadata['indication'] = 'Colorectal Cancer'

        # Extract line of therapy
        if 'first-line' in content.lower():
            metadata['line_of_therapy'] = 'first-line'
        elif 'second-line' in content.lower():
            metadata['line_of_therapy'] = 'second-line'
        else:
            metadata['line_of_therapy'] = 'later-line'

        # Extract drug information
        if 'pembrolizumab' in content.lower():
            metadata['drug_class'] = 'anti-PD-1'
            metadata['primary_drug'] = 'pembrolizumab'
        elif 'nivolumab' in content.lower():
            metadata['drug_class'] = 'anti-PD-1'
            metadata['primary_drug'] = 'nivolumab'
        elif 'encorafenib' in content.lower():
            metadata['drug_class'] = 'targeted'
            metadata['primary_drug'] = 'encorafenib'

        # Extract endpoints
        if 'overall survival' in content.lower():
            metadata['primary_endpoint'] = 'overall_survival'
        if 'progression-free survival' in content.lower():
            metadata['includes_pfs'] = True

        return metadata

    def add_document(self, doc_id: str, text: str, metadata: Dict[str, Any]):
        """Embed one text, store it as a Document and rebuild both indices.

        Args:
            doc_id: Identifier stored on the document.
            text: Text to embed and index.
            metadata: Metadata stored alongside the text.
        """
        # Semantic side: keep the text together with its embedding
        self.documents.append(
            Document(
                id=doc_id,
                text=text,
                metadata=metadata,
                embedding=self.encoder.encode(text),
            )
        )

        # Keyword side: lower-cased whitespace tokens for BM25
        self.tokenized_docs.append(text.lower().split())

        # Both indices are rebuilt after every insertion
        self._build_indices()

    def _build_indices(self):
        """Build both semantic and keyword indices"""

        # FAISS for semantic search
        if self.documents:
            embeddings = np.array([doc.embedding for doc in self.documents]).astype('float32')
            self.index = faiss.IndexFlatIP(embeddings.shape[1])
            self.index.add(embeddings)

        # BM25 for keyword search
        if self.tokenized_docs:
            self.bm25 = BM25Okapi(self.tokenized_docs)

    def hybrid_retrieve(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Hybrid retrieval: semantic + keyword search with reranking"""

        # Semantic search
        query_embedding = self.encoder.encode(query).astype('float32').reshape(1, -1)
        semantic_scores, semantic_indices = self.index.search(query_embedding, min(top_k * 2, len(self.documents)))

        # Keyword search
        query_tokens = query.lower().split()
        keyword_scores = self.bm25.get_scores(query_tokens)

        # Combine scores (normalize and weight)
        results = []
        for i, doc in enumerate(self.documents):
            semantic_score = 0
            if i in semantic_indices[0]:
                idx_pos = np.where(semantic_indices[0] == i)[0][0]
                semantic_score = semantic_scores[0][idx_pos]

            keyword_score = keyword_scores[i] if i < len(keyword_scores) else 0

            # Weighted combination (60% semantic, 40% keyword)
            combined_score = 0.6 * semantic_score + 0.4 * keyword_score

            results.append({
                'document': doc,
                'semantic_score': float(semantic_score),
                'keyword_score': float(keyword_score),
                'combined_score': float(combined_score)
            })

        # Sort by combined score and return top-k
        results.sort(key=lambda x: x['combined_score'], reverse=True)
        return results[:top_k]

    def rerank_with_llm(self, query: str, candidates: List[Dict]) -> List[Dict]:
        """Use LLM to rerank retrieved candidates"""

        if len(candidates) <= 1:
            return candidates

        # Prepare candidates for reranking
        candidate_texts = []
        for i, candidate in enumerate(candidates):
            doc = candidate['document']
            candidate_texts.append(f"Document {i+1}: {doc.text[:200]}...")

        rerank_prompt = f"""
You are evaluating document relevance for a clinical research query.

QUERY: {query}

CANDIDATE DOCUMENTS:
{chr(10).join(candidate_texts)}

TASK: Rank these documents by relevance to the query (most relevant first).
Consider:
- Direct answer to the question
- Clinical accuracy and specificity
- Completeness of information

OUTPUT: Only respond with the ranking as numbers separated by commas (e.g., "3,1,4,2")
"""

        try:
            response = self.client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": rerank_prompt}],
                max_tokens=50,
                temperature=0
            )

            # Parse ranking
            ranking_text = response.choices[0].message.content.strip()
            rankings = [int(x.strip()) - 1 for x in ranking_text.split(',')]

            # Reorder candidates
            reranked = [candidates[i] for i in rankings if i < len(candidates)]
            return reranked

        except:
            # Fallback to original order if reranking fails
            return candidates

    def synthesize_answer(self, query: str, docs: List[Dict]) -> str:
        """Generate comprehensive answer from multiple documents"""

        # Prepare context with document metadata
        context_parts = []
        for i, doc_result in enumerate(docs):
            doc = doc_result['document']
            metadata_str = ", ".join([f"{k}: {v}" for k, v in doc.metadata.items()])
            context_parts.append(
                f"Document {i+1} ({metadata_str}):\n{doc.text}"
            )

        context = "\n\n".join(context_parts)

        prompt = f"""
You are a senior clinical research analyst synthesizing information from multiple trial protocols.

CONTEXT DOCUMENTS:
{context}

QUERY: {query}

INSTRUCTIONS:
- Synthesize information from multiple documents when relevant
- Compare and contrast findings across different studies if applicable
- Highlight any contradictions or important differences
- Provide specific citations (e.g., "According to Document 2...")
- If comparing multiple trials, create a structured comparison

COMPREHENSIVE ANALYSIS:
"""

        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=600,
            temperature=0.2
        )

        return response.choices[0].message.content

    def query(self, question: str) -> Dict[str, Any]:
        """Enhanced RAG pipeline with hybrid retrieval and reranking"""

        # Step 1: Hybrid retrieval
        candidates = self.hybrid_retrieve(question, top_k=5)

        if not candidates:
            return {
                'answer': "No relevant documents found.",
                'sources': [],
                'method': 'intermediate_rag'
            }

        # Step 2: LLM reranking
        reranked_candidates = self.rerank_with_llm(question, candidates[:3])

        # Step 3: Generate comprehensive answer
        answer = self.synthesize_answer(question, reranked_candidates)

        return {
            'answer': answer,
            'sources': [doc['document'].metadata for doc in reranked_candidates],
            'retrieval_scores': {
                'semantic': [doc['semantic_score'] for doc in reranked_candidates],
                'keyword': [doc['keyword_score'] for doc in reranked_candidates],
                'combined': [doc['combined_score'] for doc in reranked_candidates]
            },
            'method': 'intermediate_rag'
        }



# 🎨 GRADIO INTERFACE WRAPPER


In [4]:
class RAGWorkshopInterface:
    """
    Gradio interface wrapper for the RAG workshop.
    Preserves ALL original functionality while adding beautiful visual interface.
    """

    def __init__(self):
        self.basic_rag = None
        self.intermediate_rag = None
        self.api_key = None
        self.setup_status = {
            'basic': False,
            'intermediate': False
        }

    def setup_basic_rag(self, api_key: str) -> str:
        """Setup Basic RAG system"""
        try:
            if not api_key.strip():
                return "❌ Please enter your OpenAI API key"

            self.api_key = api_key
            self.basic_rag = BasicRAG(api_key)

            # Load sample clinical trial documents
            trial_docs = [
                {
                    'text': """
Phase III randomized, double-blind, placebo-controlled study of Drug X
in patients with metastatic colorectal cancer. Primary endpoint is
overall survival. Secondary endpoints include progression-free survival
and objective response rate. Study population: 600 patients with
confirmed metastatic colorectal cancer, ECOG performance status 0-1,
adequate organ function. Exclusion criteria include prior anti-VEGF
therapy, uncontrolled hypertension, recent surgery within 28 days.
                    """.strip(),
                    'metadata': {'study_id': 'CRC-301', 'phase': 'III', 'indication': 'Colorectal Cancer'}
                },
                {
                    'text': """
Inclusion criteria for the colorectal cancer study: Age ≥18 years,
histologically confirmed adenocarcinoma of colon or rectum, measurable
disease per RECIST 1.1, life expectancy >3 months, signed informed
consent. Laboratory requirements: ANC ≥1500/μL, platelets ≥100,000/μL,
hemoglobin ≥9 g/dL, total bilirubin ≤1.5× ULN, AST/ALT ≤2.5× ULN.
                    """.strip(),
                    'metadata': {'study_id': 'CRC-301', 'section': 'Inclusion Criteria'}
                },
                {
                    'text': """
Drug X dosing regimen: 10mg/kg IV every 2 weeks until disease
progression or unacceptable toxicity. Dose modifications allowed for
grade 3+ toxicities. Concomitant medications: Standard supportive care
allowed, no other anticancer therapy permitted during study treatment.
Regular safety monitoring with lab assessments every 2 weeks, imaging
every 6 weeks per RECIST 1.1.
                    """.strip(),
                    'metadata': {'study_id': 'CRC-301', 'section': 'Treatment Protocol'}
                }
            ]

            # Add documents to knowledge base
            for doc in trial_docs:
                self.basic_rag.add_document(doc['text'], doc['metadata'])

            self.setup_status['basic'] = True
            return f"✅ Basic RAG initialized successfully!\n📚 Loaded {len(trial_docs)} clinical trial documents\n🤖 Using HuggingFace sentence-transformers model\n🗄️ FAISS vector database ready"

        except Exception as e:
            return f"❌ Error setting up Basic RAG: {str(e)}"

    def setup_intermediate_rag(self, api_key: str) -> str:
        """Setup Intermediate RAG system"""
        try:
            if not api_key.strip():
                return "❌ Please enter your OpenAI API key"

            self.api_key = api_key
            self.intermediate_rag = IntermediateRAG(api_key)

            # Create and load sample PDF files
            pdf_files = self.intermediate_rag.create_sample_pdfs()
            self.intermediate_rag.load_pdf_documents(pdf_files)

            studies = set([doc.metadata.get('study', 'Unknown') for doc in self.intermediate_rag.documents])

            self.setup_status['intermediate'] = True
            return f"✅ Intermediate RAG initialized successfully!\n📚 Loaded {len(self.intermediate_rag.documents)} document chunks from {len(pdf_files)} clinical protocols\n🎯 Available studies: {', '.join(studies)}\n🔍 Hybrid search (semantic + keyword) ready\n🤖 LLM reranking enabled"

        except Exception as e:
            return f"❌ Error setting up Intermediate RAG: {str(e)}"

    def query_basic_rag(self, question: str) -> Tuple[str, str, str]:
        """Query Basic RAG system"""
        if not self.setup_status['basic']:
            return "❌ Please setup Basic RAG first", "", ""

        if not question.strip():
            return "❌ Please enter a question", "", ""

        try:
            result = self.basic_rag.query(question)

            # Format answer
            answer = result['answer']

            # Format sources
            sources_info = []
            for i, (source, score) in enumerate(zip(result['sources'], result['scores'])):
                study_id = source.get('study_id', 'Unknown')
                section = source.get('section', 'Main Content')
                sources_info.append(f"📄 **Source {i+1}:** {study_id} - {section} (Relevance: {score:.3f})")

            sources_display = "\n".join(sources_info)

            # Format technical details
            tech_details = f"""**🔧 Technical Details:**
- **Method:** {result['method']}
- **Retrieved Documents:** {len(result['sources'])}
- **Embedding Model:** sentence-transformers/all-MiniLM-L6-v2
- **Vector Database:** FAISS IndexFlatIP
- **Similarity Metric:** Cosine Similarity"""

            return answer, sources_display, tech_details

        except Exception as e:
            return f"❌ Error querying Basic RAG: {str(e)}", "", ""

    def query_intermediate_rag(self, question: str) -> Tuple[str, str, str]:
        """Query Intermediate RAG system"""
        if not self.setup_status['intermediate']:
            return "❌ Please setup Intermediate RAG first", "", ""

        if not question.strip():
            return "❌ Please enter a question", "", ""

        try:
            result = self.intermediate_rag.query(question)

            # Format answer
            answer = result['answer']

            # Format sources with enhanced metadata
            sources_info = []
            for i, source in enumerate(result['sources']):
                study = source.get('study', 'Unknown')
                indication = source.get('indication', 'Unknown')
                chunk_num = source.get('chunk_number', '?')

                # Get scores
                semantic_score = result['retrieval_scores']['semantic'][i]
                keyword_score = result['retrieval_scores']['keyword'][i]
                combined_score = result['retrieval_scores']['combined'][i]

                sources_info.append(f"""📄 **Source {i+1}:** {study} - {indication}
   - **Chunk:** {chunk_num}
   - **Semantic Score:** {semantic_score:.3f}
   - **Keyword Score:** {keyword_score:.3f}
   - **Combined Score:** {combined_score:.3f}""")

            sources_display = "\n\n".join(sources_info)

            # Format technical details
            tech_details = f"""**🔧 Technical Details:**
- **Method:** {result['method']} (Hybrid Search + LLM Reranking)
- **Retrieved Documents:** {len(result['sources'])}
- **Search Strategy:** 60% Semantic + 40% Keyword (BM25)
- **Embedding Model:** sentence-transformers/all-MiniLM-L6-v2
- **Vector Database:** FAISS IndexFlatIP
- **Reranking:** GPT-4o-mini
- **Total Indexed Chunks:** {len(self.intermediate_rag.documents)}"""

            return answer, sources_display, tech_details

        except Exception as e:
            return f"❌ Error querying Intermediate RAG: {str(e)}", "", ""

    def create_interface(self):
        """Build the Gradio Blocks UI for the workshop and return it.

        The layout has a header and three tabs (Basic RAG, Intermediate RAG,
        Educational Notes). The setup and query buttons on the first two tabs
        are wired to this object's ``setup_*`` / ``query_*`` methods. The
        interface is only constructed here; it is not launched.

        Returns:
            The ``gr.Blocks`` object created by this method.
        """

        # Root container. The css string caps the container width and targets
        # the `.tab-nav` selector for the tab bar background.
        with gr.Blocks(
            title="🧠 RAG Workshop - Kognitic AI for Engineers",
            theme=gr.themes.Soft(),
            css="""
            .gradio-container {
                max-width: 1200px !important;
            }
            .tab-nav {
                background: linear-gradient(90deg, #667eea 0%, #764ba2 100%) !important;
            }
            """
        ) as interface:

            # Header
            gr.Markdown("""
            # 🧠 RAG (Retrieval-Augmented Generation) Workshop
            ## Kognitic AI for Engineers - Progressive Examples

            **🎯 Learning Objectives:**
            - ✅ Understand RAG architecture and components
            - ✅ See impact of chunking strategies and embedding models
            - ✅ Learn hybrid search techniques (semantic + keyword)
            - ✅ Experience reranking for improved precision
            - ✅ Apply RAG to real clinical research problems
            """)

            # NOTE: `tabs` is bound here but not referenced again in this method.
            with gr.Tabs() as tabs:

                # ==================== BASIC RAG TAB ====================
                with gr.TabItem("📚 Basic RAG", id="basic_rag"):
                    gr.Markdown("""
                    ## 📚 Basic RAG - Clinical Trial Protocol Q&A

                    **🎯 Educational Focus:**
                    - Basic RAG pipeline: Embed → Store → Retrieve → Generate
                    - Using HuggingFace embedding models
                    - Simple vector similarity search with FAISS
                    """)

                    # Setup row: API key + setup button + status on the left,
                    # static description of the sample data on the right.
                    with gr.Row():
                        with gr.Column(scale=2):
                            basic_api_key = gr.Textbox(
                                label="🔑 OpenAI API Key",
                                type="password",
                                placeholder="Enter your OpenAI API key...",
                                info="Your API key is used only for this session and not stored."
                            )
                            basic_setup_btn = gr.Button("🚀 Setup Basic RAG", variant="primary")
                            basic_setup_status = gr.Textbox(
                                label="📊 Setup Status",
                                value="⏳ Click 'Setup Basic RAG' to initialize...",
                                interactive=False,
                                lines=4
                            )

                        with gr.Column(scale=3):
                            gr.Markdown("""
                            **📋 Sample Documents Included:**
                            - 🧬 **CRC-301:** Phase III colorectal cancer study
                            - 📑 **Inclusion Criteria:** Patient eligibility requirements
                            - 💊 **Treatment Protocol:** Drug dosing and monitoring

                            **🔧 Technical Stack:**
                            - **Embeddings:** sentence-transformers/all-MiniLM-L6-v2
                            - **Vector DB:** FAISS (IndexFlatIP)
                            - **Chunking:** Sentence-based with overlap
                            - **Similarity:** Cosine similarity
                            """)

                    gr.Markdown("---")

                    # Question row: input + ask button, with example questions alongside.
                    with gr.Row():
                        with gr.Column():
                            basic_question = gr.Textbox(
                                label="❓ Your Clinical Research Question",
                                placeholder="e.g., What is the primary endpoint of the colorectal cancer study?",
                                lines=2
                            )
                            basic_query_btn = gr.Button("🔍 Ask Basic RAG", variant="secondary")

                        with gr.Column():
                            gr.Markdown("""
                            **💡 Try These Example Questions:**
                            - What is the primary endpoint of the colorectal cancer study?
                            - What are the dosing requirements for Drug X?
                            - What laboratory values are required for patient eligibility?
                            - What are the exclusion criteria for the study?
                            """)

                    # Read-only output fields filled by query_basic_rag.
                    # NOTE(review): the query methods return Markdown-style text
                    # (e.g. **bold**); a Textbox presumably shows it verbatim -
                    # confirm whether gr.Markdown outputs were intended.
                    with gr.Row():
                        with gr.Column(scale=2):
                            basic_answer = gr.Textbox(
                                label="💡 RAG Answer",
                                lines=8,
                                interactive=False
                            )

                        with gr.Column(scale=1):
                            basic_sources = gr.Textbox(
                                label="📚 Retrieved Sources",
                                lines=6,
                                interactive=False
                            )

                    basic_tech_details = gr.Textbox(
                        label="🔧 Technical Details",
                        lines=4,
                        interactive=False
                    )

                # ==================== INTERMEDIATE RAG TAB ====================
                with gr.TabItem("🔍 Intermediate RAG", id="intermediate_rag"):
                    gr.Markdown("""
                    ## 🔍 Intermediate RAG - Multi-Document Analysis with Reranking

                    **🎯 Educational Focus:**
                    - Multi-document PDF processing and cross-document synthesis
                    - Hybrid search: semantic (FAISS) + keyword (BM25)
                    - LLM reranking for improved precision
                    - Rich metadata extraction and filtering
                    """)

                    # Setup row; this tab has its own API key field, separate
                    # from the one on the Basic RAG tab.
                    with gr.Row():
                        with gr.Column(scale=2):
                            inter_api_key = gr.Textbox(
                                label="🔑 OpenAI API Key",
                                type="password",
                                placeholder="Enter your OpenAI API key...",
                                info="Your API key is used only for this session and not stored."
                            )
                            inter_setup_btn = gr.Button("🚀 Setup Intermediate RAG", variant="primary")
                            inter_setup_status = gr.Textbox(
                                label="📊 Setup Status",
                                value="⏳ Click 'Setup Intermediate RAG' to initialize...",
                                interactive=False,
                                lines=6
                            )

                        with gr.Column(scale=3):
                            gr.Markdown("""
                            **📋 Clinical Trial Protocols Included:**
                            - 🫁 **KEYNOTE-189:** Pembrolizumab + Chemo in NSCLC
                            - 🫁 **CheckMate-227:** Nivolumab + Ipilimumab in NSCLC
                            - 🎯 **BEACON CRC:** BRAF-targeted therapy in colorectal cancer

                            **🔧 Advanced Technical Stack:**
                            - **Hybrid Search:** 60% Semantic + 40% Keyword (BM25)
                            - **Reranking:** GPT-4o-mini for precision improvement
                            - **Chunking:** Advanced with overlap and metadata
                            - **Cross-Document:** Multi-study synthesis and comparison
                            """)

                    gr.Markdown("---")

                    # Question row: input + ask button, with example questions alongside.
                    with gr.Row():
                        with gr.Column():
                            inter_question = gr.Textbox(
                                label="❓ Your Cross-Document Research Question",
                                placeholder="e.g., Compare the overall survival results across all three studies",
                                lines=3
                            )
                            inter_query_btn = gr.Button("🔍 Ask Intermediate RAG", variant="secondary")

                        with gr.Column():
                            gr.Markdown("""
                            **💡 Try These Cross-Document Questions:**
                            - Compare the overall survival results across all three studies
                            - What are the different biomarker strategies used in lung cancer vs colorectal cancer studies?
                            - How do the safety profiles differ between the lung cancer immunotherapy studies?
                            - Which study shows the best objective response rate and why?
                            - What are the key differences in patient eligibility criteria across these trials?
                            """)

                    # Read-only output fields filled by query_intermediate_rag.
                    with gr.Row():
                        with gr.Column(scale=2):
                            inter_answer = gr.Textbox(
                                label="💡 Synthesized Cross-Document Answer",
                                lines=10,
                                interactive=False
                            )

                        with gr.Column(scale=1):
                            inter_sources = gr.Textbox(
                                label="📚 Retrieved Sources with Scores",
                                lines=8,
                                interactive=False
                            )

                    inter_tech_details = gr.Textbox(
                        label="🔧 Advanced Technical Details",
                        lines=5,
                        interactive=False
                    )

                # ==================== EDUCATIONAL NOTES TAB ====================
                # Static reference content only; no inputs or event handlers.
                with gr.TabItem("🎓 Educational Notes", id="educational"):
                    gr.Markdown("""
                    # 🎓 RAG Workshop - Educational Notes

                    ## 🏗️ RAG Architecture Overview

                    ```
                    📝 Documents → 🔪 Chunking → 🤖 Embedding → 🗄️ Vector DB
                                                                      ↓
                    💡 Answer ← 🧠 LLM Generation ← 📚 Context ← 🔍 Retrieval ← ❓ Query
                    ```

                    ## 📚 Basic RAG vs 🔍 Intermediate RAG

                    | Feature | Basic RAG | Intermediate RAG |
                    |---------|-----------|------------------|
                    | **Search Method** | Semantic only (FAISS) | Hybrid (Semantic + Keyword) |
                    | **Documents** | 3 simple chunks | Multiple PDFs with metadata |
                    | **Reranking** | None | LLM-based reranking |
                    | **Cross-Document** | Single document focus | Multi-document synthesis |
                    | **Metadata** | Basic | Rich extraction + filtering |
                    | **Complexity** | Educational baseline | Production-ready approach |

                    ## 🔧 Technical Implementation Details

                    ### 🤖 Embedding Models
                    - **Model:** `sentence-transformers/all-MiniLM-L6-v2`
                    - **Dimensions:** 384-dimensional embeddings
                    - **Speed:** Fast inference, good for prototyping
                    - **Quality:** Balanced performance across domains

                    ### 🗄️ Vector Databases
                    - **FAISS:** High-performance similarity search
                    - **IndexFlatIP:** Inner product for cosine similarity
                    - **Normalization:** L2 normalization for proper cosine similarity

                    ### 🔍 Hybrid Search Strategy
                    - **Semantic Search:** Captures meaning and context
                    - **Keyword Search (BM25):** Captures exact term matches
                    - **Combination:** 60% semantic + 40% keyword weighting
                    - **Benefits:** Better recall and precision balance

                    ### 🎯 Reranking Benefits
                    - **Purpose:** Improve precision of top results
                    - **Method:** LLM evaluates relevance to specific query
                    - **Trade-off:** Higher accuracy vs increased latency
                    - **When to Use:** Complex queries requiring nuanced understanding

                    ## 🚀 Production Considerations

                    ### 📊 Evaluation Metrics
                    - **Retrieval Quality:** Precision@K, Recall@K, MRR
                    - **Answer Quality:** Human evaluation, factual accuracy
                    - **End-to-End:** Response relevance, user satisfaction

                    ### ⚡ Performance Optimization
                    - **Chunking Strategy:** Balance context vs granularity
                    - **Index Type:** HNSW for large scale, IVF for memory efficiency
                    - **Caching:** Query results and embeddings
                    - **Batch Processing:** Bulk document ingestion

                    ### 🏭 Scaling to Production
                    1. **Start Simple:** Basic RAG with good chunking
                    2. **Add Hybrid Search:** Improve recall with keyword matching
                    3. **Implement Reranking:** Boost precision for complex queries
                    4. **Rich Metadata:** Enable filtering and boosting
                    5. **Monitor & Evaluate:** Continuous improvement cycle

                    ## 🔬 Advanced RAG Techniques (Beyond This Workshop)

                    ### 🎯 Query Enhancement
                    - **Query Expansion:** Add related terms automatically
                    - **Query Rewriting:** Rephrase for better retrieval
                    - **Multi-Query:** Generate multiple query variants

                    ### 📈 Advanced Retrieval
                    - **Two-Stage Retrieval:** Fast first-pass + precise reranking
                    - **Dense + Sparse:** ColBERT, SPLADE for better precision
                    - **Graph RAG:** Knowledge graph integration

                    ### 🧠 Generation Enhancement
                    - **Chain-of-Thought:** Step-by-step reasoning
                    - **Self-RAG:** Model decides when to retrieve
                    - **Adaptive RAG:** Route queries to different strategies

                    ## 📚 Recommended Resources

                    - **HuggingFace Sentence Transformers:** https://huggingface.co/sentence-transformers
                    - **FAISS Documentation:** https://faiss.ai/
                    - **ChromaDB (Production Vector DB):** https://docs.trychroma.com/
                    - **LangChain RAG Guide:** https://python.langchain.com/docs/use_cases/question_answering
                    - **Pinecone RAG Handbook:** https://www.pinecone.io/learn/retrieval-augmented-generation/
                    """)

            # Event handlers
            # Each click passes the listed input component value(s) to the
            # bound method and writes its return value(s) to the listed
            # outputs, in order.
            basic_setup_btn.click(
                self.setup_basic_rag,
                inputs=[basic_api_key],
                outputs=[basic_setup_status]
            )

            inter_setup_btn.click(
                self.setup_intermediate_rag,
                inputs=[inter_api_key],
                outputs=[inter_setup_status]
            )

            basic_query_btn.click(
                self.query_basic_rag,
                inputs=[basic_question],
                outputs=[basic_answer, basic_sources, basic_tech_details]
            )

            inter_query_btn.click(
                self.query_intermediate_rag,
                inputs=[inter_question],
                outputs=[inter_answer, inter_sources, inter_tech_details]
            )

        return interface


# 🚀 MAIN RUNNER

In [5]:
def launch_rag_workshop():
    """Build the workshop UI and start serving it with Gradio.

    Prints progress banners, creates a ``RAGWorkshopInterface``, builds its
    interface, and launches it with the fixed options listed below.
    """
    banner_rule = "=" * 60
    print("🧠 RAG Workshop - Launching Gradio Interface...")
    print(banner_rule)

    # Build the UI (construction only; serving starts at launch() below).
    interface = RAGWorkshopInterface().create_interface()

    print("🎨 Gradio interface created successfully!")
    print("🚀 Launching workshop...")

    # Options gathered in one mapping, then handed to launch() as keywords.
    launch_options = {
        'share': True,  # Create public link for Colab
        'server_name': "0.0.0.0",  # Allow external access
        'server_port': 7860,  # Standard Gradio port
        'show_api': False,  # Hide API docs for cleaner interface
        'favicon_path': None,  # Use default favicon
        'inbrowser': True,  # Auto-open in browser
    }
    interface.launch(**launch_options)

# Entry point: start the workshop only when this code runs as the main module,
# not when it is imported.
if __name__ == "__main__":
    launch_rag_workshop()

🧠 RAG Workshop - Launching Gradio Interface...
🎨 Gradio interface created successfully!
🚀 Launching workshop...
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a485ebf48383b2e372.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
