In [None]:
# Installation and Imports

!pip install -q ragas google-generativeai langchain-google-genai langchain-huggingface datasets evaluate pandas nbconvert

import getpass
import importlib.util
import os
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import google.generativeai as genai
import nbformat
import numpy as np
import pandas as pd
from datasets import Dataset, load_dataset
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings
from nbconvert import PythonExporter
from ragas import evaluate
from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness
from tqdm.auto import tqdm


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/303.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.6/303.6 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/45.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# =============================================================================
# 1. Mount Google Drive and Define Paths
# =============================================================================

# Make the user's Google Drive visible under /content/drive (Colab only).
from google.colab import drive

drive.mount('/content/drive')

# Every notebook and artifact used in this step lives below this folder.
DRIVE_ROOT = Path("/content/drive/MyDrive/Colab Notebooks")

# Source notebooks for the two systems being compared.
STEP6_NAIVE_NOTEBOOK = DRIVE_ROOT.joinpath("Naive_RAG_&_Evaluation.ipynb")
STEP6_ENHANCED_NOTEBOOK = DRIVE_ROOT.joinpath("Advanced_RAG.ipynb")

# Fail fast (naive first, then enhanced) before any expensive work starts.
for _notebook, _missing_msg in (
    (STEP6_NAIVE_NOTEBOOK, f"Naive notebook not found: {STEP6_NAIVE_NOTEBOOK}"),
    (STEP6_ENHANCED_NOTEBOOK, f"Enhanced notebook not found: {STEP6_ENHANCED_NOTEBOOK}"),
):
    if not _notebook.exists():
        raise FileNotFoundError(_missing_msg)

print(f"Naive notebook:    {STEP6_NAIVE_NOTEBOOK}")
print(f"Enhanced notebook: {STEP6_ENHANCED_NOTEBOOK}")

# Folder that receives the converted scripts and the CSV results.
STEP6_OUTPUT_DIR = DRIVE_ROOT.joinpath("step6_outputs")
STEP6_OUTPUT_DIR.mkdir(exist_ok=True)

print(f"\nArtifacts will be saved to: {STEP6_OUTPUT_DIR}")

Mounted at /content/drive
Naive notebook:    /content/drive/MyDrive/Colab Notebooks/Naive_RAG_&_Evaluation.ipynb
Enhanced notebook: /content/drive/MyDrive/Colab Notebooks/Advanced_RAG.ipynb

Artifacts will be saved to: /content/drive/MyDrive/Colab Notebooks/step6_outputs


In [None]:
# =============================================================================
# 2. Convert Notebooks to Python Scripts
# =============================================================================

def notebook_to_python(nb_path: Path, output_path: Path):
    """Export a Jupyter notebook as Python source code.

    Args:
        nb_path: Notebook file to read; it is parsed as nbformat version 4.
        output_path: File that receives the exported source (overwritten if
            it already exists).

    Returns:
        ``output_path``, exactly as it was passed in.
    """
    print(f"\nConverting {nb_path.name}...")

    with open(nb_path, mode='r', encoding='utf-8') as notebook_file:
        notebook_node = nbformat.read(notebook_file, as_version=4)

    # The exporter returns (source, resources); only the source is kept.
    exported = PythonExporter().from_notebook_node(notebook_node)
    source_code = exported[0]

    with open(output_path, mode='w', encoding='utf-8') as script_file:
        script_file.write(source_code)

    print(f"  Saved to: {output_path}")
    return output_path

# Destination scripts, written next to the other step-6 artifacts.
NAIVE_PY = STEP6_OUTPUT_DIR.joinpath("naive_system.py")
ENHANCED_PY = STEP6_OUTPUT_DIR.joinpath("enhanced_system.py")

# Export both notebooks; the naive system is converted first.
notebook_to_python(nb_path=STEP6_NAIVE_NOTEBOOK, output_path=NAIVE_PY)
notebook_to_python(nb_path=STEP6_ENHANCED_NOTEBOOK, output_path=ENHANCED_PY)


Converting Naive_RAG_&_Evaluation.ipynb...
  Saved to: /content/drive/MyDrive/Colab Notebooks/step6_outputs/naive_system.py

Converting Advanced_RAG.ipynb...
  Saved to: /content/drive/MyDrive/Colab Notebooks/step6_outputs/enhanced_system.py


PosixPath('/content/drive/MyDrive/Colab Notebooks/step6_outputs/enhanced_system.py')

In [None]:
# =============================================================================
# 3. Import Converted Modules
# =============================================================================

def import_python_module(module_name: str, file_path: Path):
    """Load and execute a Python source file as a module.

    The module is registered in ``sys.modules`` under ``module_name`` before
    its code runs, so the file can be resolved by that name while executing.

    Args:
        module_name: Name to register the module under.
        file_path: Path to the source file to execute.

    Returns:
        The fully executed module object.

    Raises:
        ImportError: If no import spec/loader can be built for ``file_path``
            (for example, an unsupported file suffix).
        Exception: Anything raised by the file's top-level code is propagated;
            the partially initialised module is removed from ``sys.modules``
            first so later code never picks up a broken module.
    """
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(
            f"Cannot build an import spec for {module_name!r} from {file_path}"
        )

    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-executed module registered under this name.
        sys.modules.pop(module_name, None)
        raise
    return module

print("\nImporting modules...")
# Importing executes each converted notebook top to bottom, so this step
# also runs whatever those notebooks do at module level.
naive_mod = import_python_module(module_name="naive_rag_system", file_path=NAIVE_PY)
enhanced_mod = import_python_module(module_name="enhanced_rag_system", file_path=ENHANCED_PY)
print("✓ Modules imported")


Importing modules...
Collecting pymilvus
  Downloading pymilvus-2.6.2-py3-none-any.whl.metadata (6.5 kB)
Collecting ujson>=2.0.0 (from pymilvus)
  Downloading ujson-5.11.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (9.4 kB)
Downloading pymilvus-2.6.2-py3-none-any.whl (258 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.8/258.8 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ujson-5.11.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (57 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.4/57.4 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ujson, pymilvus
Successfully installed pymilvus-2.6.2 ujson-5.11.0
Collecting milvus-lite>=2.4.0 (from pymilvus[milvus_lite])
  Downloading milvus_lite-2.5.1-py3-none-manylinux2014_x86_64.whl.metadata (10.0 kB)
Downloading milvus_lite-2.5.1-py3-none-manylinux2014_x86_64.whl (55.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



DATASET STATISTICS
Passage length statistics:
  Min: 1 characters
  Max: 2515 characters
  Mean: 389.8 characters
  Median: 299.0 characters

Null values per column:
passage    0
dtype: int64

Shape before dropping nulls: (3200, 1)
Shape after dropping nulls: (3200, 1)

Using 1000 passages for RAG system

Loading QA dataset...


README.md:   0%|          | 0.00/719 [00:00<?, ?B/s]

data/test.parquet/part.0.parquet:   0%|          | 0.00/54.4k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/918 [00:00<?, ? examples/s]

Loaded 918 Q&A pairs

SAMPLE DATA
Sample passage:
Uruguay (official full name in  ; pron.  , Eastern Republic of  Uruguay) is a country located in the southeastern part of South America.  It is home to 3.3 million people, of which 1.7 million live in the capital Montevideo and its metropolitan area....

Sample question: Was Abraham Lincoln the sixteenth President of the United States?
Sample answer: yes

CREATING CHUNKS
Total chunks created: 1289
Average chunks per passage: 1.29
Sample chunk: Uruguay (official full name in  ; pron.  , Eastern Republic of  Uruguay) is a country located in the southeastern part of South America.  It is home to 3.3 million people, of which 1.7 million live in the capital Montevideo and its metropolitan area....

GENERATING EMBEDDINGS


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/21 [00:00<?, ?it/s]


Embeddings shape: (1289, 384)
Embedding dimension: 384

SETTING UP MILVUS
Schema defined with fields: id, passage, embedding

CREATING COLLECTION AND INSERTING DATA
Collection 'rag_mini' created
Insert response: {'insert_count': 1289, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 1

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


FLAN-T5 model 'google/flan-t5-small' loaded successfully

TESTING RETRIEVAL
Query: What are the three sections of a beetle?

ID: 1281 | Score: 0.3709
Text: s as generally assumed, which would necessitate splitting the traditional Pelecaniformes in three....

ID: 1274 | Score: 0.3135
Text: The Megadyptes - Eudyptes clade occurs at similar latitudes (though not as far north as the Galapagos Penguin), has its highest diversity in the New Z...

ID: 1269 | Score: 0.2790
Text: Pygoscelis contains species with a fairly simple black-and-white head pattern; their distribution is intermediate, centered on Antarctic coasts but ex...

TESTING FULL RAG PIPELINE
Question: Was Abraham Lincoln the sixteenth President of the United States?



Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


=== GENERATED ANSWER ===
Yes

=== TOP 5 RETRIEVED CHUNKS ===

[1] ID: 339 | Score: 0.7095
Text: Young Abraham Lincoln...

[2] ID: 320 | Score: 0.6434
Text: Abraham Lincoln (February 12, 1809 â April 15, 1865) was the sixteenth President of the United States, serving from Ma...

[3] ID: 381 | Score: 0.5896
Text: On November 6, 1860, Lincoln was elected as the 16th President of the United States, beating Democrat Stephen A. Douglas...

[4] ID: 882 | Score: 0.5569
Text: Sixteen months before his death, his son, John Quincy Adams, became the sixth President of the United States (1825 1829)...

[5] ID: 480 | Score: 0.5484
Text: * American School, Lincoln's economic views....

RAG SYSTEM READY

STEP 3: PROMPTING STRATEGY EVALUATION
✓ Defined 3 prompting strategies

→ Evaluating chain_of_thought on 100 samples (GPU) - started at 00:32:14

Evaluating 'chain_of_thought' on 100 samples...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

✓ Completed chain_of_thought at 00:33:12

→ Evaluating persona on 100 samples (GPU) - started at 00:33:12

Evaluating 'persona' on 100 samples...
✓ Completed persona at 00:33:39

→ Evaluating instruction on 100 samples (GPU) - started at 00:33:39

Evaluating 'instruction' on 100 samples...
✓ Completed instruction at 00:34:05

=== Results ===
chain_of_thought {'exact_match': 8.0, 'f1': 12.332352903527536}
persona {'exact_match': 28.0, 'f1': 31.380048266706773}
instruction {'exact_match': 23.0, 'f1': 26.121037102432453}

BEST PERFORMING STRATEGIES

→ Best by F1 Score:
  Strategy: persona
  F1: 31.3800
  EM: 28.0000

→ Best by Exact Match:
  Strategy: persona
  F1: 31.3800
  EM: 28.0000

=== STEP 3 EVALUATION COMPLETE ===
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
[?25hAll packages imported successfully

CONFIGURATION
  embedding_model: sentence-transformers/all-MiniLM-L6-v2
  vector_dim: 384
  chunk_size: 600
  

data/passages.parquet/part.0.parquet:   0%|          | 0.00/797k [00:00<?, ?B/s]

Generating passages split:   0%|          | 0/3200 [00:00<?, ? examples/s]

Working with 1000 documents
Created 1289 chunks

Generating embeddings...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Embeddings shape: (1289, 384)
FAISS index contains 1289 vectors

Loading Flan-T5...


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0


LLM ready

Loading CrossEncoder for reranking...


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Reranker ready

────────────────────────────────────────────────────────────────────────────────
TESTING RAG SYSTEM - MULTIPLE QUERIES
────────────────────────────────────────────────────────────────────────────────

TEST QUERY #1
Query: Was Abraham Lincoln the sixteenth President of the United States?

Question: Was Abraham Lincoln the sixteenth President of the United States?


Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors



Query variations (2):
  1. Was Abraham Lincoln the sixteenth President of the United States?
  2. Abraham Lincoln was the sixteenth president of the United States.
Retrieved 35 unique candidates
Reranked to top 5

Context assembled: 1936 chars from 4 chunks

────────────────────────────────────────────────────────────────────────────────
RESULTS
────────────────────────────────────────────────────────────────────────────────

Answer: Abraham Lincoln (February 12, 1809 â April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination.

Source Citations:
┌──────┬─────────────┬──────────────┐
│ Rank │  Chunk ID   │   Relevance  │
├──────┼─────────────┼──────────────┤
│  1   │ 278-0       │      10.3089 │
│  2   │ 319-0       │       8.1220 │
│  3   │ 198-0       │      -0.4495 │
│  4   │ 383-0       │      -0.7348 │
└──────┴─────────────┴──────────────┘

────────────────────────────────────────────────────────────────────────────────



In [None]:
# =============================================================================
# 4. Extract Components from Notebooks
# =============================================================================

print("\nExtracting components from notebooks...")

# Each lookup falls back to None when the imported notebook does not define
# the attribute, so a missing piece shows up in the summary below instead of
# raising here.

# From Naive RAG (Milvus-based)
milvus_client = getattr(naive_mod, 'milvus_client', None)
naive_chunks = getattr(naive_mod, 'document_chunks', None)
naive_embedder = getattr(naive_mod, 'embedding_model', None)
naive_pipeline = getattr(naive_mod, 'flan_pipeline', None)
naive_search_fn = getattr(naive_mod, 'search_similar_chunks', None)

# From Enhanced RAG (FAISS-based)
enhanced_chunks = getattr(enhanced_mod, 'all_chunks', None)
enhanced_encoder = getattr(enhanced_mod, 'encoder', None)
enhanced_pipeline = getattr(enhanced_mod, 'text_gen', None)
enhanced_rag_fn = getattr(enhanced_mod, 'generate_rag_answer', None)

# Get prompting strategy class. getattr with a default only absorbs a missing
# attribute, unlike the previous bare `except:` which also hid unrelated
# errors and KeyboardInterrupt.
PromptStrategies = getattr(naive_mod, 'PromptingStrategies', None)
persona_prompt = getattr(PromptStrategies, 'persona_prompt', None)

# Compare against None explicitly: an object may define its own truthiness.
print(f"  Naive chunks: {len(naive_chunks) if naive_chunks is not None else 0}")
print(f"  Enhanced chunks: {len(enhanced_chunks) if enhanced_chunks is not None else 0}")
print(f"  Milvus client: {'✓' if milvus_client is not None else '✗'}")
print(f"  Functions: {'✓' if all(fn is not None for fn in (naive_search_fn, enhanced_rag_fn, persona_prompt)) else '✗'}")


Extracting components from notebooks...
  Naive chunks: 1289
  Enhanced chunks: 1289
  Milvus client: ✓
  Functions: ✓


In [None]:
# =============================================================================
# 5. Environment Setup and LLM Configuration
# =============================================================================

# Judge model used by RAGAs.
GEMINI_MODEL = "gemini-2.5-pro"

# Never paste the key into the notebook: take it from the environment, and
# fall back to a hidden prompt so it is not stored in the saved file.
GEMINI_API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Gemini API key: ")
if not GEMINI_API_KEY.strip():
    raise ValueError("A Gemini API key is required (set GOOGLE_API_KEY or enter it when prompted).")
os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY

gemini_llm = ChatGoogleGenerativeAI(
    model=GEMINI_MODEL,
    temperature=0.0,
    google_api_key=GEMINI_API_KEY
)

print(f"✓ Gemini configured with {GEMINI_MODEL}")

# Load test questions
qa_dataset = load_dataset("rag-datasets/rag-mini-wikipedia", "question-answer")
test_questions = qa_dataset["test"]

# Upper bound on the number of questions evaluated; clamped to the split size
# so a larger value cannot index past the end of the dataset.
NUM_EVAL_SAMPLES = 100
eval_questions = test_questions.select(range(min(NUM_EVAL_SAMPLES, len(test_questions))))

print(f"✓ Loaded {len(eval_questions)} evaluation questions")

✓ Gemini configured with gemini-2.5-pro
✓ Loaded 100 evaluation questions


In [None]:
# =============================================================================
# 6. Define RAG Response Functions
# =============================================================================

def naive_rag_response(question: str) -> Tuple[str, List[str]]:
    """Answer ``question`` with the naive system: one retrieved chunk + persona prompt.

    Args:
        question: The question to answer.

    Returns:
        ``(answer, contexts)``. ``contexts`` holds the single retrieved chunk.
        When retrieval returns nothing the result is ``("I don't know.", [""])``;
        when any step raises, the error is printed and ``("Error", [""])`` is
        returned.
    """
    try:
        hits = naive_search_fn(question, num_results=1)
        if hits:
            # The chunk text is taken from position 1 of the top hit.
            top_context = hits[0][1]
            generations = naive_pipeline(
                persona_prompt(top_context, question),
                max_new_tokens=128,
                temperature=0.2,
            )
            return generations[0]["generated_text"].strip(), [top_context]
        return "I don't know.", [""]
    except Exception as e:
        print(f"Naive error: {e}")
        return "Error", [""]

def enhanced_rag_response(question: str) -> Tuple[str, List[str]]:
    """Answer ``question`` with the enhanced system and recover its context texts.

    The enhanced pipeline returns an answer plus references. Only the first
    five references are considered; each dict reference is matched by its
    ``'id'`` against ``enhanced_chunks`` and the first matching chunk's text
    is collected. Non-dict and unmatched references are skipped.

    Args:
        question: The question to answer.

    Returns:
        ``(answer, contexts)``; ``contexts`` is ``[""]`` when nothing matched.
        When any step raises, the error is printed and ``("Error", [""])`` is
        returned.
    """
    try:
        answer, refs = enhanced_rag_fn(question)

        matched_texts = []
        for reference in refs[:5]:
            if not isinstance(reference, dict):
                continue
            wanted_id = reference.get('id', '')
            first_match = next(
                (candidate for candidate in enhanced_chunks if candidate['id'] == wanted_id),
                None,
            )
            if first_match is not None:
                matched_texts.append(first_match['text'])

        return answer, (matched_texts or [""])
    except Exception as e:
        print(f"Enhanced error: {e}")
        return "Error", [""]

print("✓ Response functions ready")

✓ Response functions ready


In [None]:
# =============================================================================
# 7. Collect Evaluation Data
# =============================================================================

def process_question(idx: int, system_type: str) -> Optional[Dict]:
    """Run one evaluation question through the selected RAG system.

    Args:
        idx: Position of the question in ``eval_questions``.
        system_type: ``"naive"`` selects the naive system; any other value
            selects the enhanced system.

    Returns:
        A dict with ``question``, ``answer``, ``contexts`` and
        ``ground_truth`` keys, or ``None`` when the record could not be
        processed (the failure is printed, not raised).
    """
    try:
        # Fetch the record once instead of re-indexing the dataset per field.
        record = eval_questions[idx]
        question = record["question"]

        answer_data = record.get("answer") or record.get("answers")
        if not answer_data:
            # Report the real cause rather than an opaque subscript error.
            raise ValueError("record has no ground-truth answer")
        # A plain string is used as-is; otherwise the first listed answer wins.
        ground_truth = answer_data if isinstance(answer_data, str) else answer_data[0]

        if system_type == "naive":
            answer, contexts = naive_rag_response(question)
        else:
            answer, contexts = enhanced_rag_response(question)

        return {
            "question": question,
            "answer": answer,
            "contexts": contexts,
            "ground_truth": ground_truth
        }
    except Exception as e:
        print(f"Error Q{idx}: {e}")
        return None

def collect_data(system_type: str, num_workers: int = 10) -> List[Dict]:
    """Run every evaluation question through one system using a thread pool.

    Args:
        system_type: Passed through to ``process_question`` (``"naive"`` or
            anything else for the enhanced system).
        num_workers: Maximum number of worker threads.

    Returns:
        The successful results ordered by question index. Questions whose
        processing returned a falsy value are left out. Sorting makes the
        output independent of thread completion order, so repeated runs and
        the two systems list their results in the same order.
    """
    print(f"\nCollecting {system_type.upper()} data...")

    indexed_results = []
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Remember which question each future belongs to.
        futures = {executor.submit(process_question, i, system_type): i
                   for i in range(len(eval_questions))}

        for future in tqdm(as_completed(futures), total=len(futures), desc=system_type):
            result = future.result()
            if result:
                indexed_results.append((futures[future], result))

    # Futures finish in arbitrary order; restore dataset order.
    indexed_results.sort(key=lambda pair: pair[0])
    results = [result for _, result in indexed_results]

    print(f"✓ Collected {len(results)} responses")
    return results

# Collect the naive system first, then the enhanced one, with 10 workers each.
naive_data, enhanced_data = (
    collect_data(system, num_workers=10) for system in ("naive", "enhanced")
)


Collecting NAIVE data...


naive:   0%|          | 0/100 [00:00<?, ?it/s]

✓ Collected 100 responses

Collecting ENHANCED data...

Question: Was Abraham Lincoln the sixteenth President of the United States?

Question: Did his mother die of pneumonia?

Question: Did Lincoln sign the National Banking Act of 1863?

Question: When did Lincoln begin his political career?

Question: How many long was Lincoln's formal education?

Question: What did The Legal Tender Act of 1862 establish?

Question: Who suggested Lincoln grow a beard?

Question: When did the Gettysburg address argue that America was born?

Question: Did Lincoln beat John C. Breckinridge in the 1860 election?

Question: Was Abraham Lincoln the first President of the United States?


enhanced:   0%|          | 0/100 [00:00<?, ?it/s]


Query variations (1):
  1. Did Lincoln sign the National Banking Act of 1863?
Retrieved 30 unique candidates
Reranked to top 5

Context assembled: 2009 chars from 4 chunks

Query variations (1):
  1. When did the Gettysburg address argue that America was born?

Query variations (1):
  1. How many long was Lincoln's formal education?
Retrieved 30 unique candidates

Query variations (1):
  1. When did Lincoln begin his political career?
Retrieved 30 unique candidates
Reranked to top 5

Context assembled: 2009 chars from 4 chunks
Retrieved 30 unique candidates
Reranked to top 5

Context assembled: 2009 chars from 4 chunks
Reranked to top 5

Context assembled: 1922 chars from 4 chunks

Query variations (2):
  1. What did The Legal Tender Act of 1862 establish?
  2. legal tendering

Question: Did Lincoln start his political career in 1832?
Retrieved 50 unique candidates
Reranked to top 5

Context assembled: 1913 chars from 5 chunks

Query variations (2):
  1. Did his mother die of pneumoni

In [None]:
# ============================================================================
# 8. RAGAs Evaluation
# ============================================================================

def to_ragas(data: List[Dict]) -> Dataset:
    """Turn collected per-question records into a column-oriented ``Dataset``.

    Args:
        data: Records that each provide the ``question``, ``answer``,
            ``contexts`` and ``ground_truth`` keys.

    Returns:
        A ``Dataset`` with one column per key, rows in the order of ``data``.
    """
    column_names = ("question", "answer", "contexts", "ground_truth")
    columns = {name: [record[name] for record in data] for name in column_names}
    return Dataset.from_dict(columns)

naive_ds = to_ragas(naive_data)
enhanced_ds = to_ragas(enhanced_data)

# Configure embeddings for RAGAs
ragas_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

print("\nRunning RAGAs evaluation...")
# Both systems are scored with the same metrics, judge LLM and embeddings.
metrics = [faithfulness, answer_relevancy, context_precision, context_recall]

print("Evaluating naive system...")
naive_scores = evaluate(
    naive_ds,
    metrics=metrics,
    llm=gemini_llm,
    embeddings=ragas_embeddings
)  # closing parenthesis was missing, which made the whole cell a SyntaxError

print("Evaluating enhanced system...")
enhanced_scores = evaluate(
    enhanced_ds,
    metrics=metrics,
    llm=gemini_llm,
    embeddings=ragas_embeddings
)

print("✓ Evaluation complete")


Running RAGAs evaluation...
Evaluating naive system...


Evaluating:   0%|          | 0/400 [00:00<?, ?it/s]



Evaluating enhanced system...


Evaluating:   0%|          | 0/400 [00:00<?, ?it/s]



✓ Evaluation complete


In [None]:
# =============================================================================
# 9. Save Raw RAGAs Scores
# =============================================================================

# One row per system; each metric cell holds whatever the score object
# returns for that metric key, stored without aggregation.
df = pd.DataFrame({
    'System': ['Naive', 'Enhanced'],
    **{
        column: [naive_scores[key], enhanced_scores[key]]
        for column, key in (
            ('Faithfulness', 'faithfulness'),
            ('Answer Relevancy', 'answer_relevancy'),
            ('Context Precision', 'context_precision'),
            ('Context Recall', 'context_recall'),
        )
    },
})

print("\n" + df.to_string(index=False))

# Persist the unaggregated table next to the other step-6 artifacts.
df.to_csv(STEP6_OUTPUT_DIR / 'ragas_results_raw.csv', index=False)
print(f"\n✓ Saved to {STEP6_OUTPUT_DIR / 'ragas_results_raw.csv'}")


  System                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         Faithfulness                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

In [None]:
# =============================================================================
# 10. Aggregate RAGAs Metrics
# =============================================================================

# (column label, score key) pairs shared by the mean table and the NaN report.
RAGAS_METRIC_COLUMNS = (
    ('Faithfulness', 'faithfulness'),
    ('Answer Relevancy', 'answer_relevancy'),
    ('Context Precision', 'context_precision'),
    ('Context Recall', 'context_recall'),
)

# Calculate mean scores from the arrays. nanmean ignores NaN values, so a
# sample whose metric could not be computed does not affect the average.
df = pd.DataFrame({
    'System': ['Naive', 'Enhanced'],
    **{
        column: [np.nanmean(naive_scores[key]), np.nanmean(enhanced_scores[key])]
        for column, key in RAGAS_METRIC_COLUMNS
    },
})

# nanmean drops NaN samples silently; count them so the reader can see how
# many questions each average is actually based on.
nan_counts = pd.DataFrame({
    'System': ['Naive', 'Enhanced'],
    **{
        column: [
            int(np.isnan(np.asarray(naive_scores[key], dtype=float)).sum()),
            int(np.isnan(np.asarray(enhanced_scores[key], dtype=float)).sum()),
        ]
        for column, key in RAGAS_METRIC_COLUMNS
    },
})

print("\n" + "="*80)
print("RAGAS EVALUATION RESULTS (AGGREGATED)")
print("="*80)
print("\n" + df.to_string(index=False))

print("\nSamples excluded from each mean (NaN scores):")
print(nan_counts.to_string(index=False))

df.to_csv(STEP6_OUTPUT_DIR / 'ragas_results.csv', index=False)
print(f"\n✓ Saved to {STEP6_OUTPUT_DIR / 'ragas_results.csv'}")


RAGAS EVALUATION RESULTS (AGGREGATED)

  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall
   Naive      0.427083          0.189883           0.380000            0.36
Enhanced      0.670000          0.270800           0.602458            0.63

✓ Saved to /content/drive/MyDrive/Colab Notebooks/step6_outputs/ragas_results.csv
