In [3]:
# Load all required Libraries
import pandas as pd
import numpy as np
import transformers, torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset

from pymilvus import MilvusClient, FieldSchema, CollectionSchema, DataType

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

In [4]:
COLLECTION_NAME = "rag_mini" #config["COLLECTION_NAME"]
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
VECTOR_DIMENSION = 384
GENERATION_MODEL_NAME = "google/flan-t5-base"
EMBEDDING_FIELD = "embedding"

# Read Passages from the Datasets and Drop rows if they are NA or empty

In [5]:
passages = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/passages.parquet/part.0.parquet")

print(passages.shape)
passages.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


(3200, 1)


Unnamed: 0_level_0,passage
id,Unnamed: 1_level_1
0,"Uruguay (official full name in ; pron. , Eas..."
1,"It is bordered by Brazil to the north, by Arge..."
2,Montevideo was founded by the Spanish in the e...
3,The economy is largely based in agriculture (m...
4,"According to Transparency International, Urugu..."


# Do EDA on the passage dataset
- You can try to find the maximum and minimum length of the passages before indexing (just a direction)

In [None]:
# Code for EDA
import numpy as np
#length in terms of characters
passages['length_char'] = passages['passage'].apply(len)

print(f"Total number of passages: {len(passages)}")
print(f"Min passage length (chars): {passages['length_char'].min()}")
print(f"Max passage length (chars): {passages['length_char'].max()}")
print(f"Avg passage length (chars): {passages['length_char'].mean():.2f}")
print(f"SD of length (chars): {passages['length_char'].std():.2f}")


# length in terms of tokens-word count
passages['word_count'] = passages['passage'].apply(lambda x: len(str(x).split()))

print(f"Min word count: {passages['word_count'].min()}")
print(f"Max word count: {passages['word_count'].max()}")
print(f"Avg word count: {passages['word_count'].mean():.2f}")

# Remove the temporary columns added for analysis
passages = passages.drop(columns=['length_char', 'word_count'])

# Display the head again to confirm the DataFrame is clean
print("\nPassages DataFrame head:")
print(passages.head())

Total number of passages: 3200
Min passage length (chars): 1
Max passage length (chars): 2515
Avg passage length (chars): 389.85
SD of length (chars): 348.37
Min word count: 1
Max word count: 425
Avg word count: 62.10

Passages DataFrame head:
                                              passage
id                                                   
0   Uruguay (official full name in  ; pron.  , Eas...
1   It is bordered by Brazil to the north, by Arge...
2   Montevideo was founded by the Spanish in the e...
3   The economy is largely based in agriculture (m...
4   According to Transparency International, Urugu...


# Tokenize Text and Generate Embeddings using Sentence Transformers

In [6]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Encode Text
embeddings = embedding_model.encode(passages['passage'].tolist()).tolist()

print(f"Embeddings row: ({len(embeddings)})")
print(f"Embeddings column: ({len(embeddings[0])})")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embeddings row: (3200)
Embeddings column: (384)


In [7]:
# Update passages DataFrame with embeddings
passages['embedding'] = embeddings

# Create Milvus Client and Insert your Embeddings to your DB
- Make sure you define a schema for your collection (Points will be deducted if you fail to define a proper schema with ids, passage text, embedding)

In [9]:
# Define every column of your schema
id_ = FieldSchema(
    name="id",
    dtype=DataType.INT64,
    is_primary=True,
    auto_id=False,
    description="Passage ID"
)
#max_length can adjust, put large number just in case
passage = FieldSchema(
    name="passage",
    dtype=DataType.VARCHAR,
    max_length=10000,
    description="Raw passage text"
)
# vector field. all-MiniLM-L6-v2 is 384 dimensions
embedding = FieldSchema(
    name="embedding",
    dtype=DataType.FLOAT_VECTOR,
    dim=VECTOR_DIMENSION,
    description="Vector embedding"
)

In [10]:
# define a proper schema with ids, passage text, embedding mentioned above
schema = CollectionSchema(
    fields=[id_, passage, embedding],
    description="RAG-Mini-Wikipedia Collection"
)

In [11]:
client = MilvusClient("rag_wikipedia_mini.db")
client
# Create the Collection with Collection Name = "rag_mini". Make sure you define the schema variable while creating the collection


<pymilvus.milvus_client.milvus_client.MilvusClient at 0x77fcbe0a7200>

In [None]:
# print(client.list_collections())

In [12]:

# Remove the collection if it already exists for a fresh run

if client.has_collection(collection_name=COLLECTION_NAME):
    client.drop_collection(collection_name=COLLECTION_NAME)
    print(f"Dropped existing collection: {COLLECTION_NAME}")

# add error handling below as well
try:
  client.create_collection(
      collection_name=COLLECTION_NAME,
      schema=schema
  )
  print(f"Created new collection: {COLLECTION_NAME}")
except Exception as e:
    print(f"Error creating collection: {e}")

Created new collection: rag_mini


**Convert your Pandas Dataframe to a list of dictionaries**
- The Dictionary at least have 3 keys [id, passage, embedding]

In [13]:
passages_reset = passages.reset_index()

rag_data = rag_data = passages_reset[['id', 'passage', 'embedding']].to_dict('records')

print(f"First data point for insertion: {rag_data[0]}")

First data point for insertion: {'id': 0, 'passage': 'Uruguay (official full name in  ; pron.  , Eastern Republic of  Uruguay) is a country located in the southeastern part of South America.  It is home to 3.3 million people, of which 1.7 million live in the capital Montevideo and its metropolitan area.', 'embedding': [0.00698534632101655, -0.06149812042713165, -0.06683704257011414, -0.008286005817353725, 0.040500346571207047, -0.025822967290878296, 0.08005103468894958, 0.07768561691045761, -0.002070049289613962, 0.10649284720420837, 0.08770647644996643, -0.02937556803226471, 0.0329684279859066, -0.028817754238843918, 0.02904241904616356, -0.02618853561580181, -0.00023743067868053913, -0.054334718734025955, 0.05992431193590164, -0.0519942045211792, 0.036157187074422836, 0.0025632453616708517, 0.060441501438617706, 0.05508989095687866, -0.02602364681661129, 0.06074458360671997, 0.006496518850326538, 0.017178891226649284, -0.030812205746769905, 0.08718796074390411, -0.010918666608631611,

In [14]:
# Code to insert the data to your DB
res = client.insert(collection_name=COLLECTION_NAME, data=rag_data)
print(res)

{'insert_count': 3200, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215,

- Do a Sanity Check on your database

**Do not delete the below line during your submission**

In [None]:
print("Entity count:", client.get_collection_stats("rag_mini")["row_count"])
print("Collection schema:", client.describe_collection("rag_mini"))

Entity count: 3200
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': 'RAG-Mini-Wikipedia Collection', 'fields': [{'field_id': 100, 'name': 'id', 'description': 'Passage ID', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': 'Raw passage text', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 10000}}, {'field_id': 102, 'name': 'embedding', 'description': 'Vector embedding', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': False}


# Steps to Fetch Results
- Read the Question Dataset
- Clean the Question Dataset if necessary (Drop Questions with NaN etc.)
- Convert Each Query to a Vector Embedding (Use the same embedding model you used to embed your document)
- Try for a Single Question First
- Load Collection into Memory after creating Index for Search on your embedding field (This is an essential step before you can search in your db)
- Search and Fetch Top N Results

In [15]:
import pandas as pd

queries = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/test.parquet/part.0.parquet")
queries

Unnamed: 0_level_0,question,answer
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Was Abraham Lincoln the sixteenth President of...,yes
2,Did Lincoln sign the National Banking Act of 1...,yes
4,Did his mother die of pneumonia?,no
6,How many long was Lincoln's formal education?,18 months
8,When did Lincoln begin his political career?,1832
...,...,...
1710,Was Wilson president of the American Political...,Yes
1711,Did he not cast his ballot for John M. Palmer ...,Yes
1712,Did Wilson not spend 1914 through the beginnin...,Yes
1713,"Was Wilson , a staunch opponent of antisemitis...",Yes


In [16]:
#drop na in the dataset
queries_df = queries.dropna(subset=['question', 'answer'])
query = queries_df['question'].iloc[0]                  # Your single query

query_embedding = embedding_model.encode(query).tolist()

# print(query_embedding.shape)

print(f"Query: {query}")
print(f"Shape of query embedding: {np.array(query_embedding).shape}")

Query: Was Abraham Lincoln the sixteenth President of the United States?
Shape of query embedding: (384,)


#### Create Index on the embedding column on your DB

In [17]:
index_params = MilvusClient.prepare_index_params()

# Add an index on the embedding field
index_params.add_index(
    field_name=EMBEDDING_FIELD,
    index_name="vector_index",
    index_type="AUTOINDEX",  # Use AUTOINDEX for simplicity and Milvus Lite compatibility
    metric_type="COSINE"     # Use COSINE similarity, common for sentence embeddings
)

In [18]:
# Create the index
try:
    client.create_index(collection_name=COLLECTION_NAME, index_params=index_params)
    print("Index created successfully.")
except Exception as e:
    print(f"Index creation result error: {e}")

# Load collection into memory (required for search)
client.load_collection(collection_name=COLLECTION_NAME)
print("Collection loaded into memory")

Index created successfully.
Collection loaded into memory


In [19]:
# Search the db with your query embedding
search_params = {
    "data": [query_embedding],
    "collection_name": COLLECTION_NAME,
    "limit": 1,  # Fetch Top result
    "output_fields": ["passage"], # We only need the passage text
}


In [20]:
# Search the db with your query embedding

output_ = client.search(**search_params)

print(output_)

data: [[{'id': 288, 'distance': 0.7095187902450562, 'entity': {'passage': 'Young Abraham Lincoln'}}]]


## Now get the Context
- Initially use the first passage ONLY as your context
- In Later Experiments, you must try at least 2 different passage selection strategies (Top 3 / Top 5 / Top 10) and pass to your prompt

In [21]:
context = "No relevant context found."

# observed output structure: data: [[{..., 'entity': {'passage': '...'}}]]

# 1. Check if results were returned at all
if output_ and len(output_) > 0:

    # Access the result list for the first query (usually output_[0])
    first_query_result_list = output_[0]

    # 2. Check if the inner list contains any hits
    if isinstance(first_query_result_list, list) and len(first_query_result_list) > 0:

        # Get the Top-1 hit (the first dictionary in the inner list)
        top_1_hit = first_query_result_list[0]

        # 3. Extract the passage nested under the 'entity' key
        # This structure is specific to Milvus Lite in some environments.
        if 'entity' in top_1_hit and 'passage' in top_1_hit['entity']:
            context = top_1_hit['entity']['passage']

            print("Selected Context (Top-1):")
            print(context)
        else:
            # Should not happen if data was found, but handles structure change
            print("Error: Context found, but 'passage' key missing in 'entity'.")
    else:
        # Context list was empty (no documents were close enough)
        print("Search returned an empty hits list.")
else:
    # Output_ was empty or not a list (indicates retrieval failure)
    print("No search results returned at all (Retrieval may have failed).")

Selected Context (Top-1):
Young Abraham Lincoln


**Develop your Prompt**

In [22]:
# prompt test
system_prompt = (
    "You are an intelligent, helpful and highy accurate QA assistant. "
    "Please based on and only based on my provided context to answer questions. "
    "If any answer is not present in the context, say you didn't find the answer. "
    "No external knowledge usage allowed."
)

prompt = f"""{system_prompt} \n\nContext: {context} \n\nQuestion: {query}"""
print(prompt)

You are an intelligent, helpful and highy accurate QA assistant. Please based on and only based on my provided context to answer questions. If any answer is not present in the context, say you didn't find the answer. No external knowledge usage allowed. 

Context: Young Abraham Lincoln 

Question: Was Abraham Lincoln the sixteenth President of the United States?


# RAG Response for a Single Query

In [23]:
# Load the LLM Model you want to use
MODEL_NAME  = GENERATION_MODEL_NAME

In [24]:

# Use AutoModelForSeq2SeqLM for encoder-decoder models like T5
llm_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Prepare input for the model
input_ids = tokenizer(prompt, return_tensors="pt").input_ids


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

In [25]:
# Generate answer
output_tokens = llm_model.generate(
    input_ids,
    max_length=50,
    do_sample=False,
    temperature=0.0
)

# Decode and extract answer.
rag_answer = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

print("RAG Generated Answer Shown Below")
print(f"Question: {query}")
print(f"Answer: {rag_answer}")

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


RAG Generated Answer Shown Below
Question: Was Abraham Lincoln the sixteenth President of the United States?
Answer: yes.


# Generate Responses for all the Queries in the Dataset

In [26]:
def context_retrieve(question, client, k=1):
    """Retrieves context"""

    # Retrieval + Error Handling
    try:
        query_embedding = embedding_model.encode(question).tolist()
        search_params = {
            "data": [query_embedding],
            "collection_name": COLLECTION_NAME,
            "limit": k,
            "output_fields": ["passage"],
        }
        search_results = client.search(**search_params)
    except Exception as e:
        print(f"Retrieval Error: {e}")
        return "Retrieval failed.", []

    # Context Extraction + Error Handling
    contexts = []

    # 1. Check if results were returned at all
    if search_results and len(search_results) > 0:

        # Access the result list for the first query
        first_query_result_list = search_results[0]

        # 2. Check if the inner list contains any hits
        if isinstance(first_query_result_list, list) and len(first_query_result_list) > 0:

            # Loop over all hits in the inner list
            for hit in first_query_result_list:
                if 'passage' in hit:
                    contexts.append(hit['passage'])
                elif 'entity' in hit and 'passage' in hit['entity']:
                    # Fallback for nested structure
                    contexts.append(hit['entity']['passage'])

            if contexts:
                context_str = "\n".join(contexts)
            else:
                context_str = "No relevant context found in hits."
        else:
            context_str = "Search returned an empty hits list."
    else:
        context_str = "No search results returned at all (Retrieval may have failed)."

    print("All Collected Contexts Shown Below")
    print(context_str)

    return contexts, context_str

In [27]:
#instruct prompt
def RAG_generator(question, client, llm_model, tokenizer, k=1):
    """constructs prompt, and generates RAG response."""

    contexts, context_str = context_retrieve(question, client, k)

    # Prompt Construction
    system_prompt = (
        "You are an intelligent, helpful and highy accurate QA assistant. "
        "Please based on and only based on my provided context to answer questions. "
        "If any answer is not present in the context, say you didn't find the answer. "
        "No external knowledge usage allowed."
    )

    prompt = f"""{system_prompt} \n\nContext: {context_str} \n\nQuestion: {question}"""

    # Generation + Error Handling
    try:
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
        output_tokens = llm_model.generate(input_ids, max_length=50, do_sample=False, temperature=0.0)
        rag_answer = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    except Exception as e:
        rag_answer = f"Generation failed: {e}"

    return rag_answer, contexts


# Code to run the loop remains the same
all_questions = queries_df['question'].tolist()
all_ground_truths = queries_df['answer'].tolist()
all_generated_answers = []
all_retrieved_contexts = []
sample_size = min(100, len(queries_df))

for i in range(sample_size):
    question = all_questions[i]
    answer, contexts = RAG_generator(question, client, llm_model, tokenizer, k=1)
    all_generated_answers.append(answer)
    all_retrieved_contexts.append(contexts)

all_ground_truths_sampled = all_ground_truths[:sample_size]
all_questions_sampled = all_questions[:sample_size]

All Collected Contexts Shown Below
Young Abraham Lincoln
All Collected Contexts Shown Below
Lincoln believed in the Whig theory of the presidency, which left Congress to write the laws while he signed them, vetoing only those bills that threatened his war powers. Thus, he signed the Homestead Act in 1862, making millions of acres of government-held land in the West available for purchase at very low cost. The Morrill Land-Grant Colleges Act, also signed in 1862, provided government grants for agricultural universities in each state. The Pacific Railway Acts of 1862 and 1864 granted federal support for the construction of the United States' First Transcontinental Railroad, which was completed in 1869. Other important legislation involved economic matters, including the first income tax and higher tariffs. Also included was the creation of the system of national banks by the National Banking Acts of 1863, 1864, and 1865, which allowed the creation of a strong national financial system.  

In [None]:
#persona prompt
def RAG_persona_generator(question, client, llm_model, tokenizer, k=1):
    """constructs prompt, and generates RAG response."""

    contexts, context_str = context_retrieve(question, client, k)

    # Prompt Construction
    system_prompt = (
        "You are a distinguished history professor in a top-tier university. You must answer students' RAG relevant questions accurately and concisely, and you should strictly based on the provided historical information."
        "If the context is not enough for you to get the final answer, say 'More historical information are needed'."

    )

    prompt = f"""{system_prompt} \n\nContext: {context_str} \n\nQuestion: {question}"""

    # Generation + Error Handling
    try:
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
        output_tokens = llm_model.generate(input_ids, max_length=50, do_sample=False, temperature=0.0)
        rag_answer = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    except Exception as e:
        rag_answer = f"Generation failed: {e}"

    return rag_answer, contexts

# Code to run the loop remains the same
all_questions_persona = queries_df['question'].tolist()
all_ground_truths_persona = queries_df['answer'].tolist()
all_generated_answers_persona = []
all_retrieved_contexts_persona = []

for i in range(sample_size):
    question_persona = all_questions_persona[i]
    answer_persona, contexts_persona = RAG_persona_generator(question_persona, client, llm_model, tokenizer, k=1)
    all_generated_answers_persona.append(answer_persona)
    all_retrieved_contexts_persona.append(contexts_persona)

all_ground_truths_sampled_persona = all_ground_truths_persona[:sample_size]
all_questions_sampled_persona = all_questions_persona[:sample_size]

All Collected Contexts Shown Below
Young Abraham Lincoln
All Collected Contexts Shown Below
Lincoln believed in the Whig theory of the presidency, which left Congress to write the laws while he signed them, vetoing only those bills that threatened his war powers. Thus, he signed the Homestead Act in 1862, making millions of acres of government-held land in the West available for purchase at very low cost. The Morrill Land-Grant Colleges Act, also signed in 1862, provided government grants for agricultural universities in each state. The Pacific Railway Acts of 1862 and 1864 granted federal support for the construction of the United States' First Transcontinental Railroad, which was completed in 1869. Other important legislation involved economic matters, including the first income tax and higher tariffs. Also included was the creation of the system of national banks by the National Banking Acts of 1863, 1864, and 1865, which allowed the creation of a strong national financial system.  

# Finding out the Basic QA Metrics (F1 score, EM score)

In [None]:
from evaluate import load

# Load the SQuAD evaluation metric (Requires 'evaluate' library)
squad_metric = load("squad")

# Prepare data for SQuAD metric
question_ids = [str(i) for i in range(sample_size)]
predictions = [{'prediction_text': pred, 'id': q_id} for q_id, pred in zip(question_ids, all_generated_answers)]
references = [{'answers': {'answer_start': [0], 'text': [gt]}, 'id': q_id} for q_id, gt in zip(question_ids, all_ground_truths_sampled)]

# Calculate the metric
results = squad_metric.compute(predictions=predictions, references=references)

print("Instruction Prompt Metrics (SQuAD EM/F1)")
print(f"Exact Match (EM): {results['exact_match']:.2f}%")
print(f"F1 Score: {results['f1']:.2f}%")

Instruction Prompt Metrics (SQuAD EM/F1)
Exact Match (EM): 56.00%
F1 Score: 60.59%


In [None]:
# Prepare data for SQuAD metric
question_ids = [str(i) for i in range(sample_size)]
predictions_persona = [{'prediction_text': pred, 'id': q_id} for q_id, pred in zip(question_ids, all_generated_answers_persona)]
references_persona = [{'answers': {'answer_start': [0], 'text': [gt]}, 'id': q_id} for q_id, gt in zip(question_ids, all_ground_truths_sampled_persona)]

# Calculate the metric
results_persona = squad_metric.compute(predictions=predictions_persona, references=references_persona)

print("Persona Prompt Metrics (SQuAD EM/F1)")
print(f"Exact Match (EM): {results_persona['exact_match']:.2f}%")
print(f"F1 Score: {results_persona['f1']:.2f}%")

Persona Prompt Metrics (SQuAD EM/F1)
Exact Match (EM): 50.00%
F1 Score: 53.29%


In [29]:
# add for step 4
def calculate_squad_metrics(generated_answers, ground_truths):
    squad_metric = load("squad")
    question_ids = [str(i) for i in range(len(generated_answers))]
    predictions = [{'prediction_text': pred, 'id': q_id} for q_id, pred in zip(question_ids, generated_answers)]
    references = [{'answers': {'answer_start': [0], 'text': [gt]}, 'id': q_id} for q_id, gt in zip(question_ids, ground_truths)]
    return squad_metric.compute(predictions=predictions, references=references)

In [None]:
#step 4
def RAG_generator_exp(question, client, llm_model, tokenizer, k, emb_model, coll_name, prompt_strategy="instruction"):
    """
    Retrieves context, constructs prompt, and generates RAG response for experiments.
    Accepts embedding model and collection name as arguments.
    """

    # 1. Retrieval
    try:
        query_embedding = emb_model.encode(question).tolist()
        search_params = {
            "data": [query_embedding],
            "collection_name": coll_name,
            "limit": k,
            "output_fields": ["passage"],
        }
        search_results = client.search(**search_params)
    except Exception as e:
        return "Retrieval failed.", []

    # 2. Context Extraction
    contexts = []
    if search_results and len(search_results) > 0:
        first_query_result_list = search_results[0]
        if isinstance(first_query_result_list, list) and len(first_query_result_list) > 0:
            for hit in first_query_result_list:
                if 'entity' in hit and 'passage' in hit['entity']:
                    contexts.append(hit['entity']['passage'])

    context_str = "\n".join(contexts)

    # 3. Prompt Construction
    system_prompt = (
        "You are an intelligent, helpful and highy accurate QA assistant. "
        "Please based on and only based on my provided context to answer questions. "
        "If any answer is not present in the context, say you didn't find the answer. "
        "No external knowledge usage allowed."
    )
    prompt = f"""{system_prompt} \n\nContext: {context_str} \n\nQuestion: {question}"""

    # 4. Generation
    try:
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
        output_tokens = llm_model.generate(input_ids, max_length=50, do_sample=False, temperature=0.0)
        rag_answer = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    except Exception as e:
        rag_answer = f"Generation failed: {e}"

    return rag_answer, contexts

In [None]:
#step 4
def run_parameter_experiment(config_name, config_details, queries_df, passages_df):
    """Orchestrates the indexing, generation, and evaluation for one configuration."""

    model_name = config_details['model_name']
    dimension = config_details['dimension']
    k_values = config_details['k_values']

    exp_collection_name = f"rag_exp_{dimension}"

    # Re-indexing
    print(f"Starting Indexing for {config_name} (Dim: {dimension})")
    exp_embedding_model = SentenceTransformer(model_name)

    # create new collections
    if client.has_collection(exp_collection_name):
        client.drop_collection(exp_collection_name)

    schema = CollectionSchema(
        fields=[
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=False),
            FieldSchema(name="passage", dtype=DataType.VARCHAR, max_length=8192),
            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dimension)
        ]
    )
    client.create_collection(collection_name=exp_collection_name, schema=schema)

    # embedding and inserting
    exp_embeddings = exp_embedding_model.encode(passages_df['passage'].tolist()).tolist()
    passages_exp = passages_df.reset_index().copy()
    passages_exp['embedding'] = exp_embeddings
    rag_data_exp = passages_exp[['id', 'passage', 'embedding']].to_dict('records')
    client.insert(collection_name=exp_collection_name, data=rag_data_exp)

    # create index
    index_params = MilvusClient.prepare_index_params()
    index_params.add_index(field_name="embedding", index_type="AUTOINDEX", metric_type="COSINE")
    client.create_index(collection_name=exp_collection_name, index_params=index_params)
    client.load_collection(collection_name=exp_collection_name)
    print("Indexing Complete.")

    # Evaluation Loop
    all_experiment_results = []

    for k in k_values:
        print(f" Testing {config_name} with k={k}...")

        all_generated_answers_exp = []

        for question in queries_df['question'].tolist()[:sample_size]:
            answer, _ = RAG_generator_exp(
                question, client, llm_model, tokenizer, k,
                emb_model=exp_embedding_model,
                coll_name=exp_collection_name
            )
            all_generated_answers_exp.append(answer)

        # F1/EM Metrics
        results = calculate_squad_metrics(all_generated_answers_exp, all_ground_truths_sampled)

        all_experiment_results.append({
            "Config": config_name,
            "Dimension": dimension,
            "Retrieval_k": k,
            "EM": results['exact_match'],
            "F1": results['f1'],
        })

    client.release_collection(exp_collection_name)
    return all_experiment_results



In [None]:
EXPERIMENT_CONFIG = {
    "384_dim": {
        "model_name": "all-MiniLM-L6-v2",
        "dimension": 384,
        "k_values": [3, 5]
    },
    "512_dim": {
        "model_name": "sentence-transformers/distiluse-base-multilingual-cased-v2",
        "dimension": 512,
        "k_values": [3, 5]
    }
}


In [None]:
final_experiment_results = []
for name, config in EXPERIMENT_CONFIG.items():

    results = run_parameter_experiment(name, config, queries_df, passages)
    final_experiment_results.extend(results)

final_df = pd.DataFrame(final_experiment_results)
print(final_df)
# final_df.to_csv("comparison_analysis.csv", index=False)

Starting Indexing for 384_dim (Dim: 384)
Indexing Complete.
 Testing 384_dim with k=3...


Token indices sequence length is longer than the specified maximum sequence length for this model (693 > 512). Running this sequence through the model will result in indexing errors


 Testing 384_dim with k=5...
Starting Indexing for 512_dim (Dim: 512)
Indexing Complete.
 Testing 512_dim with k=3...
 Testing 512_dim with k=5...
    Config  Dimension  Retrieval_k    EM         F1
0  384_dim        384            3  55.0  62.475142
1  384_dim        384            5  60.0  65.990294
2  512_dim        512            3  54.0  60.660889
3  512_dim        512            5  54.0  59.665057


Task 5

In [None]:
# Cell for Reranking Model Setup
from sentence_transformers import CrossEncoder
import torch.nn.functional as F

# Initialize Cross-Encoder Reranker
RERANKER_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
reranker = CrossEncoder(RERANKER_MODEL_NAME)

print(f"Reranker Model Loaded: {RERANKER_MODEL_NAME}. Proceed with Enhanced RAG function.")

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Reranker Model Loaded: cross-encoder/ms-marco-MiniLM-L-6-v2. Proceed with Enhanced RAG function.


In [None]:
# ENHANCEMENT 1: Query Rewriting
def rewrite_query(question, llm_model, tokenizer):
    """Rewrites an ambiguous question into a better search query using the LLM."""
    rewrite_prompt = (
        f"Rewrite the following question to be an effective search query. Original Question: '{question}'"
    )

    input_ids = tokenizer(rewrite_prompt, return_tensors="pt").input_ids
    output_tokens = llm_model.generate(input_ids, max_length=1000)
    rewritten_query = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

    # Simple check to avoid empty or garbage output
    if len(rewritten_query) < 5 or rewritten_query.startswith("Rewrite"):
        return question # Return original question if rewriting fails
    return rewritten_query

# ENHANCEMENT 2: Reranking
def rerank_documents(query, contexts, top_k=1):
    """Reranks retrieved documents using a Cross-Encoder and selects the top k."""

    # Prepare (query, context_i) pairs for cross-encoder
    model_inputs = [[query, context] for context in contexts]

    # Get scores from the Cross-Encoder
    scores = reranker.predict(model_inputs) # output is a logit score representing relevance

    # Pair scores with contexts and sort
    ranked_results = sorted(zip(scores, contexts), key=lambda x: x[0], reverse=True)

    # Return the top k doc
    return [context for score, context in ranked_results[:top_k]]


def RAG_enhanced_generator(question, client, llm_model, tokenizer, k_retrieve=5, k_final=1):
    """ Enhanced RAG function with above Query Rewriting and Reranking """
    # Query Rewriting (Enhancement 1)
    search_query = rewrite_query(question, llm_model, tokenizer)

    # Initial Retrieval (similar to previous coding cell)
    try:
        query_embedding = embedding_model.encode(search_query).tolist()
        search_params = {
            "data": [query_embedding],
            "collection_name": COLLECTION_NAME,
            "limit": k_retrieve,
            "output_fields": ["passage"],
        }
        search_results = client.search(**search_params)
    except Exception as e:
        return "Retrieval failed.", []

    initial_contexts = []

    # checking a before
    if search_results and len(search_results) > 0:
        first_query_result = search_results[0]

        hits_list = None
        if isinstance(first_query_result, list):
            hits_list = first_query_result
        elif isinstance(first_query_result, dict) and 'hits' in first_query_result:
            # Fallback
            hits_list = first_query_result['hits']

        if hits_list:
            for hit in hits_list:
                # extract passage text
                if 'entity' in hit and 'passage' in hit['entity']:
                    initial_contexts.append(hit['entity']['passage'])
                elif 'passage' in hit:
                    initial_contexts.append(hit['passage'])

    if not initial_contexts:
        return "No relevant context retrieved.", []

    # Reranking (Enhancement 2)
    final_contexts = rerank_documents(question, initial_contexts, top_k=k_final)

    context_str = "\n".join(final_contexts)

    # Generation (using Instruction Prompt as verified in step 3)
    system_prompt = ("You are an intelligent, helpful and highy accurate QA assistant. "
                    "Please based on and only based on my provided context to answer questions. "
                    "If any answer is not present in the context, say you didn't find the answer. "
                    "No external knowledge usage allowed.")

    prompt = f"""{system_prompt} \n\nContext: {context_str} \n\nQuestion: {question}"""

    # Generation
    try:
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
        output_tokens = llm_model.generate(input_ids, max_length=50, do_sample=False)
        rag_answer = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    except Exception as e:
        rag_answer = f"Generation failed: {e}"

    return rag_answer, final_contexts

In [None]:
# Initialize Cross-Encoder Reranker
RERANKER_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
reranker = CrossEncoder(RERANKER_MODEL_NAME)

# Re-initialize models/client for execution context stability
client = MilvusClient("rag_wikipedia_mini.db")
COLLECTION_NAME = "rag_mini"
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
MODEL_NAME = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [None]:
# STEP 5 EVALUATION

# Set the optimal retrieval pool size based on Step 4's findings (k=5)
K_RETRIEVE = 5
K_FINAL = 1 # Rerank down to the single best document for the LLM

all_enhanced_answers = []
all_enhanced_contexts = []
sample_size = min(100, len(queries_df))

print(f"Running Enhanced RAG (Query Rewrite + Rerank) with k_retrieve={K_RETRIEVE}...")

for i in range(sample_size):
    question = queries_df['question'].iloc[i]

    answer, contexts = RAG_enhanced_generator(
        question, client, llm_model, tokenizer,
        k_retrieve=K_RETRIEVE,
        k_final=K_FINAL
    )
    all_enhanced_answers.append(answer)
    all_enhanced_contexts.append(contexts)

# Calculate Metrics (using the same SQuAD function from Step 3/4)
enhanced_results = calculate_squad_metrics(all_enhanced_answers, all_ground_truths_sampled)

print("Enhanced RAG Evaluation ")
print(f"Enhanced EM: {enhanced_results['exact_match']:.2f}%")
print(f"Enhanced F1: {enhanced_results['f1']:.2f}%")

Running Enhanced RAG (Query Rewrite + Rerank) with k_retrieve=5...
Enhanced RAG Evaluation 
Enhanced EM: 60.00%
Enhanced F1: 67.47%


In [None]:
# STEP 5 EVALUATION

# Set the optimal retrieval pool size based on Step 4's findings (k=5)
K_RETRIEVE = 5
K_FINAL = 3 # Rerank down to 3 best document for the LLM

all_enhanced_answers_3 = []
all_enhanced_contexts_3 = []
sample_size = min(100, len(queries_df))

print(f"Running Enhanced RAG (Query Rewrite + Rerank) with k_retrieve={K_RETRIEVE}...")

for i in range(sample_size):
    question = queries_df['question'].iloc[i]

    answer, contexts = RAG_enhanced_generator(
        question, client, llm_model, tokenizer,
        k_retrieve=K_RETRIEVE,
        k_final=K_FINAL
    )
    all_enhanced_answers_3.append(answer)
    all_enhanced_contexts_3.append(contexts)

# Calculate Metrics (using the same SQuAD function from Step 3/4)
enhanced_results_3 = calculate_squad_metrics(all_enhanced_answers_3, all_ground_truths_sampled)

print("\n--- Enhanced RAG Evaluation (F1/EM) ---")
print(f"Enhanced EM: {enhanced_results_3['exact_match']:.2f}%")
print(f"Enhanced F1: {enhanced_results_3['f1']:.2f}%")

Running Enhanced RAG (Query Rewrite + Rerank) with k_retrieve=5...


Token indices sequence length is longer than the specified maximum sequence length for this model (525 > 512). Running this sequence through the model will result in indexing errors



--- Enhanced RAG Evaluation (F1/EM) ---
Enhanced EM: 62.00%
Enhanced F1: 68.45%


In [None]:
# STEP 5 EVALUATION

# Set the optimal retrieval pool size based on Step 4's findings (k=5)
K_RETRIEVE = 5
K_FINAL = 5 # Rerank down to 5 best document for the LLM

all_enhanced_answers_5 = []
all_enhanced_contexts_5 = []
sample_size = min(100, len(queries_df))

print(f"Running Enhanced RAG (Query Rewrite + Rerank) with k_retrieve={K_RETRIEVE}...")

for i in range(sample_size):
    question = queries_df['question'].iloc[i]

    answer, contexts = RAG_enhanced_generator(
        question, client, llm_model, tokenizer,
        k_retrieve=K_RETRIEVE,
        k_final=K_FINAL
    )
    all_enhanced_answers_5.append(answer)
    all_enhanced_contexts_5.append(contexts)

# Calculate Metrics (using the same SQuAD function from Step 3/4)
enhanced_results_5 = calculate_squad_metrics(all_enhanced_answers_5, all_ground_truths_sampled)

print("\n--- Enhanced RAG Evaluation (F1/EM) ---")
print(f"Enhanced EM: {enhanced_results_5['exact_match']:.2f}%")
print(f"Enhanced F1: {enhanced_results_5['f1']:.2f}%")

Running Enhanced RAG (Query Rewrite + Rerank) with k_retrieve=5...

--- Enhanced RAG Evaluation (F1/EM) ---
Enhanced EM: 61.00%
Enhanced F1: 67.07%


# Advanced Evaluation using RAGAs

follow instructions to run RAGAS

In [30]:
pip install openai ragas langchain-openai



In [63]:
import os
from openai import OpenAI
os.environ["OPENAI_API_KEY"] = "masked api call here"
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])


In [64]:
#test one message after using the model and implment ragas test
resp = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role":"user","content":"Hello from RAGAS test"}]
)
print(resp.choices[0].message.content)


Hello! How can I assist you today with your RAGAS test?


In [65]:
#built llm for later llm call
from langchain_openai import ChatOpenAI
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy

llm = ChatOpenAI(model="gpt-4o-mini", api_key=os.environ["OPENAI_API_KEY"])

In [66]:
all_questions = queries_df['question'].tolist()
all_ground_truths = queries_df['answer'].tolist()
all_ground_truths_sampled = all_ground_truths[:sample_size]
all_questions_sampled = all_questions[:sample_size]

In [None]:
# the naive baseline evaluation
data_naive = {
    "question": all_questions_sampled,
    "answer": all_generated_answers,
    "contexts": all_retrieved_contexts,
    "reference": all_ground_truths_sampled}

dataset_naive = Dataset.from_dict(data_naive)

In [53]:
# Evaluate naive RAG
results_naive = evaluate(
    dataset=dataset_naive,
    metrics=[faithfulness, answer_relevancy, context_recall, context_precision],
    llm = llm
)

naive_df = results_naive.to_pandas().mean(numeric_only=True)
print("\n Naive RAG (K=5) RAGAs Metrics Calculated Below")
print(naive_df)

Evaluating:   0%|          | 0/400 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[1]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[5]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[9]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[13]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[17]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[25]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[21]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[29]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[33]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[37]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[41]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[49]: IndexEr


 Naive RAG (K=5) RAGAs Metrics Calculated Below
faithfulness         0.555000
answer_relevancy     0.673916
context_recall       0.440000
context_precision    0.600000
dtype: float64


In [69]:
# the enhanced RAG evaluation with K_final = 1
data_enhanced = {
    "question": all_questions_sampled,
    "answer": all_enhanced_answers,      # Answers from Enhanced RAG
    "contexts": all_enhanced_contexts,   # Contexts after Reranking (K_FINAL=1)
    "reference": all_ground_truths_sampled
}
dataset_enhanced = Dataset.from_dict(data_enhanced)

In [72]:
# Evaluate Enhanced RAG (K_FINAL=1)
results_enhanced = evaluate(
    dataset=dataset_enhanced,
    metrics=[faithfulness, answer_relevancy, context_recall, context_precision],
    llm = llm
)

print(results_enhanced)

Evaluating:   0%|          | 0/400 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[0]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[1]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[3]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[4]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[5]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[7]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[9]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[10]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[6]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[8]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[11]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[12]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[18]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[19]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[22]: TimeoutError()
ERROR:ragas.executor:Exception rai

{'faithfulness': 0.5714, 'answer_relevancy': 0.7268, 'context_recall': 0.7447, 'context_precision': 0.7818}


In [73]:
enhanced_df = results_enhanced.to_pandas().mean(numeric_only=True)
print(enhanced_df)

faithfulness         0.571429
answer_relevancy     0.726801
context_recall       0.744681
context_precision    0.781818
dtype: float64


In [77]:
# the enhanced RAG evaluation with K_final = 3

data_enhanced_3 = {
    "question": all_questions_sampled,
    "answer": all_enhanced_answers_3,      # Answers from Enhanced RAG with K_FINAL=3
    "contexts": all_enhanced_contexts_3,   # Contexts after Reranking (K_FINAL=3)
    "reference": all_ground_truths_sampled
}

dataset_enhanced_3 = Dataset.from_dict(data_enhanced_3)

In [78]:
# Evaluate Enhanced RAG (K_FINAL=3)
results_enhanced_3 = evaluate(
    dataset=dataset_enhanced_3,
    metrics=[faithfulness, answer_relevancy, context_recall, context_precision],
    llm = llm
)

print(results_enhanced_3)

Evaluating:   0%|          | 0/400 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[1]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[5]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[13]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[9]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[17]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[25]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[21]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[29]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[33]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[27]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[31]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[35]: TimeoutError()
ERROR:ragas.executor:Exception rai

{'faithfulness': 0.7024, 'answer_relevancy': 0.7749, 'context_recall': 0.5366, 'context_precision': 0.9333}


In [82]:
# Convert to DataFrame and take mean of numeric metrics
enhanced_df_3 = results_enhanced_3.to_pandas().mean(numeric_only=True)
print(enhanced_df_3)

faithfulness         0.702381
answer_relevancy     0.774894
context_recall       0.536585
context_precision    0.933333
dtype: float64


In [87]:
# the enhanced RAG evaluation with K_final = 5
data_enhanced_5 = {
    "question": all_questions_sampled,
    "answer": all_enhanced_answers_5,      # Answers from Enhanced RAG with K_FINAL=5
    "contexts": all_enhanced_contexts_5,   # Contexts after Reranking (K_FINAL=5)
    "reference": all_ground_truths_sampled
}

dataset_enhanced_5 = Dataset.from_dict(data_enhanced_5)


In [88]:

# Evaluate Enhanced RAG (K_FINAL=5)
results_enhanced_5 = evaluate(
    dataset=dataset_enhanced_5,
    metrics=[faithfulness, answer_relevancy, context_recall, context_precision],
    llm = llm
)

print(results_enhanced_5)


Evaluating:   0%|          | 0/400 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[13]: IndexError(list index out of range)
ERROR:ragas.executor:Exception raised in Job[0]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[1]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[2]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[4]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[5]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[3]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[7]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[8]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[11]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[12]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[15]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[16]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[17]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[19]: TimeoutError()
ERROR:ragas.

{'faithfulness': 0.7500, 'answer_relevancy': 0.7596, 'context_recall': 0.6216, 'context_precision': 1.0000}


In [89]:
# Convert to DataFrame and take mean of numeric metrics
enhanced_df_5 = results_enhanced_5.to_pandas().mean(numeric_only=True)
print(enhanced_df_5)

faithfulness         0.750000
answer_relevancy     0.759628
context_recall       0.621622
context_precision    1.000000
dtype: float64
