# Description

## This aim of this notbook to show and compare Contextual Retrieval implementation of RAG vs. simple/traditional implemintation
### Steps:
- Chucking
- Summarization
- BM25 embedding
- BM25 model saving to file
- Model embedding
- Storage of dense and sparse vectors
- Retrieval of sparse and dense vectors
- Fusion of Ranking
- Simple Retrieval


In [1]:
!pip install sentence_transformers -qU
!pip install rank_bm25 -qU
!pip install datasets -qU
!pip install pinecone -qU
!pip install langchain -qU
!pip install langchain_core -qU
!pip install langchain_groq -qU

# Importing libraries

In [2]:
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu
# from rouge import Rouge
from datasets import load_dataset
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pinecone
import pandas as pd # for dataframe

# Loading dataset

In [7]:
# Step 1: Load and Chunk the Knowledge Base
# Load dataset from Hugging Face
nltk.download('punkt')
dataset = load_dataset("m-ric/huggingface_doc_qa_eval")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/893 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/289k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/65 [00:00<?, ? examples/s]

In [8]:
df = pd.DataFrame(dataset['train'])
print(df.head())

                                             context  \
0   `tokenizers-linux-x64-musl`\n\nThis is the **...   
1  !--Copyright 2023 The HuggingFace Team. All ri...   
2   Paper Pages\n\nPaper pages allow people to fi...   
3   Datasets server API\n\n> API on 🤗 datasets\n\...   
4  !--Copyright 2022 The HuggingFace Team. All ri...   

                                            question  \
0  What architecture is the `tokenizers-linux-x64...   
1  What is the purpose of the BLIP-Diffusion mode...   
2  How can a user claim authorship of a paper on ...   
3  What is the purpose of the /healthcheck endpoi...   
4  What is the default context window size for Lo...   

                                              answer  \
0                          x86_64-unknown-linux-musl   
1  The BLIP-Diffusion model is designed for contr...   
2  By clicking their name on the corresponding Pa...   
3                          Ensure the app is running   
4                                         127 

In [9]:
best_answers_df = df[df['standalone_score'] >= 4]
print(best_answers_df.head())

                                             context  \
0   `tokenizers-linux-x64-musl`\n\nThis is the **...   
1  !--Copyright 2023 The HuggingFace Team. All ri...   
2   Paper Pages\n\nPaper pages allow people to fi...   
3   Datasets server API\n\n> API on 🤗 datasets\n\...   
4  !--Copyright 2022 The HuggingFace Team. All ri...   

                                            question  \
0  What architecture is the `tokenizers-linux-x64...   
1  What is the purpose of the BLIP-Diffusion mode...   
2  How can a user claim authorship of a paper on ...   
3  What is the purpose of the /healthcheck endpoi...   
4  What is the default context window size for Lo...   

                                              answer  \
0                          x86_64-unknown-linux-musl   
1  The BLIP-Diffusion model is designed for contr...   
2  By clicking their name on the corresponding Pa...   
3                          Ensure the app is running   
4                                         127 

In [10]:
best_answers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65 entries, 0 to 64
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   context            65 non-null     object
 1   question           65 non-null     object
 2   answer             65 non-null     object
 3   source_doc         65 non-null     object
 4   standalone_score   65 non-null     int64 
 5   standalone_eval    65 non-null     object
 6   relatedness_score  65 non-null     int64 
 7   relatedness_eval   65 non-null     object
 8   relevance_score    65 non-null     int64 
 9   relevance_eval     65 non-null     object
dtypes: int64(3), object(7)
memory usage: 5.2+ KB


# Extract contexts from the dataset and create Langchain documents

In [11]:
# Extract contexts from the dataset and create Langchain documents
documents = [Document(page_content=context) for context in best_answers_df['context']]  # Assuming we're using the 'train' split
print(documents)



In [12]:
from transformers import AutoTokenizer, AutoModel
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_seq_length = tokenizer.model_max_length
embedding_model = AutoModel.from_pretrained(model_name)
#load ' sentence-transformers/all-MiniLM-L6-v2' embedding model from Hugging Face
# embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [13]:
def get_seq_length(text: str):
    tokens = tokenizer.encode(text, add_special_tokens=True)
    return len(tokens)


In [14]:
print(max_seq_length)

512


# Defining text splitter

In [15]:
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]
# Use RecursiveCharacterTextSplitter to split documents into chunks
chunk_overlap = 50
chunk_size = max_seq_length - chunk_overlap
print('chunk_size',chunk_size)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=get_seq_length,
    add_start_index=True,
    separators=MARKDOWN_SEPARATORS,
)

chunk_size 462


In [17]:
class Chunk:
    def __init__(self, text: str):
        self.text = text
        self.context = None

class ProcessedDocument:
    def __init__(self, document: Document, chunks: list[Chunk]):
        self.document = document
        self.chunks = chunks


In [22]:
docs_processed: list[ProcessedDocument] = []
for doc in documents:
    text = doc.page_content  # Extract the text content from the Document
    chunks = text_splitter.split_text(text)  # Split the text into chunks (strings)
    processed_doc = ProcessedDocument(
        doc,
        [Chunk(chunk_text) for chunk_text in chunks]
    )
    docs_processed.append(processed_doc)
# chunks = [doc.page_content for doc in docs_processed]

In [23]:
# for doc in docs_processed:
#     for chunk in doc.chunks:
#         try:
#           chunk_length = get_seq_length(chunk.text)
#           if chunk_length > max_seq_length:
#               print(f"Chunk exceeds max length: {chunk_length} tokens")
#         except Exception as e:
#           print(f"Error processing chunk: {e}")
#           print("===========================")
#           print(f"Chunk: {chunk.text}")



In [24]:
# Count total chunks
total_chunks = sum(len(doc.chunks) for doc in docs_processed)
print(f"Total number of chunks across all documents: {total_chunks}")

Total number of chunks across all documents: 533


# Define summary chain

In [25]:
from pydantic import BaseModel, Field
from typing import Optional
class Context(BaseModel):
    context: Optional[str] = Field(description="Summary of the chunk in the context of the document")


In [26]:
from langchain.prompts import PromptTemplate
from google.colab import userdata

In [27]:
from langchain_groq import ChatGroq

# MODEL_GROQ = "llama-3.1-8b-instant"
MODEL_GROQ = "llama-3.2-90b-text-preview"
groq_api_key = userdata.get("GROQ_API_KEY")
if not groq_api_key:
  groq_api_key = input("Please enter your GROQ API KEY: ")

llm = ChatGroq(api_key=groq_api_key, model=MODEL_GROQ,
                        temperature=0,
                        max_tokens=None,
                        timeout=None,
                        max_retries=2,)

In [28]:
prompt_template = PromptTemplate(
    input_variables=["document", "chunk"],
    template=
       """You are an AI assistant specializing in summarization of documents.
          Your are provide brief, relevant context for a chunk of text
            based on the following document.

            Here is the document:
            <doc>
            {document}
            </doc>

            Here is the chunk we want to situate within the whole document:
            <chunk>
            {chunk}
            </chunk>

            Provide a concise context (3-4 sentences max) for this chunk,
            considering the following guidelines:
            - Give a short succinct context Be this chunk within the overall
            document for the purposes of improving search retrieval of the chunk.
            - Answer only with the succinct context and nothing else.
            - Context should be mentioned like ‘Focuses on ....'
            do not mention 'this chunk or section focuses on...'
            """)

def create_context_chain(llm):
    # Configure the LLM to produce structured output
    structured_llm = llm.with_structured_output(Context)

    # Create the chain using the pipe operator
    chain = prompt_template | structured_llm
    return chain

context_chain = create_context_chain(llm)

In [29]:
doc = docs_processed[30]
print("page:\n",doc.document.page_content)
# for chunk in doc.chunks:
chunk = doc.chunks[0]
print('chunk:\n', chunk.text)

page:
  Convert weights to safetensors

PyTorch model weights are commonly saved and stored as `.bin` files with Python's [`pickle`](https://docs.python.org/3/library/pickle.html) utility. To save and store your model weights in the more secure `safetensor` format, we recommend converting your weights to `.safetensors`.

The easiest way to convert your model weights is to use the [Convert Space](https://huggingface.co/spaces/diffusers/convert), given your model weights are already stored on the Hub. The Convert Space downloads the pickled weights, converts them, and opens a Pull Request to upload the newly converted `.safetensors` file to your repository.


For larger models, the Space may be a bit slower because its resources are tied up in converting other models. You can also try running the [convert.py](https://github.com/huggingface/safetensors/blob/main/bindings/python/convert.py) script (this is what the Space is running) locally to convert your weights.

Feel free to ping [@Nar

In [30]:
context: Context = context_chain.invoke({"document": doc.document.page_content, "chunk": chunk})

In [31]:
print(f"chunk with context: Context: \n\n {context.context} \n\n Chunk: {chunk.text}")

chunk with context: Context: 

 Converting PyTorch model weights to safetensors format using the Convert Space or a local script. 

 Chunk: Convert weights to safetensors

PyTorch model weights are commonly saved and stored as `.bin` files with Python's [`pickle`](https://docs.python.org/3/library/pickle.html) utility. To save and store your model weights in the more secure `safetensor` format, we recommend converting your weights to `.safetensors`.

The easiest way to convert your model weights is to use the [Convert Space](https://huggingface.co/spaces/diffusers/convert), given your model weights are already stored on the Hub. The Convert Space downloads the pickled weights, converts them, and opens a Pull Request to upload the newly converted `.safetensors` file to your repository.


For larger models, the Space may be a bit slower because its resources are tied up in converting other models. You can also try running the [convert.py](https://github.com/huggingface/safetensors/blob/m

In [32]:
import time
from datetime import datetime

# Initialize counters
calls_per_minute = 0
last_reset_time = time.time()

for doc in docs_processed:
    for chunk in doc.chunks:
        current_time = time.time()

        # Check if a minute has passed since last reset
        if current_time - last_reset_time >= 60:
            print(f"Made {calls_per_minute} calls in the last minute")
            calls_per_minute = 0
            last_reset_time = current_time
        else:
            # If we're still within the same minute and hit rate limit
            if calls_per_minute >= 30:  # Assuming 30 calls per minute limit
                wait_time = 60 - (current_time - last_reset_time)
                print(f"Rate limit reached. Waiting {wait_time:.2f} seconds...")
                time.sleep(wait_time)
                calls_per_minute = 0
                last_reset_time = time.time()

        # Make the API call
        context: Context = context_chain.invoke({"document": doc.document.page_content, "chunk": chunk})
        doc.context = context.context

        # Increment counter
        calls_per_minute += 1

        # Optional: print progress
        print(f"Processed chunk {calls_per_minute} in current minute. Total chunks processed: {sum(len(d.chunks) for d in docs_processed[:docs_processed.index(doc)]) + len(doc.chunks[:doc.chunks.index(chunk) + 1])}")


Processed chunk 1 in current minute. Total chunks processed: 1
Processed chunk 2 in current minute. Total chunks processed: 2
Processed chunk 3 in current minute. Total chunks processed: 3
Processed chunk 4 in current minute. Total chunks processed: 4
Processed chunk 5 in current minute. Total chunks processed: 5
Processed chunk 6 in current minute. Total chunks processed: 6
Processed chunk 7 in current minute. Total chunks processed: 7
Processed chunk 8 in current minute. Total chunks processed: 8
Processed chunk 9 in current minute. Total chunks processed: 9
Processed chunk 10 in current minute. Total chunks processed: 10
Processed chunk 11 in current minute. Total chunks processed: 11
Made 11 calls in the last minute
Processed chunk 1 in current minute. Total chunks processed: 12
Processed chunk 2 in current minute. Total chunks processed: 13
Processed chunk 3 in current minute. Total chunks processed: 14
Processed chunk 4 in current minute. Total chunks processed: 15
Processed chun

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.2-90b-text-preview` in organization `org_01j1ce2962fx1rw1x30eavn9h5` on : Limit 500000, Used 500530, Requested 3799. Please try again in 12m28.17s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': '', 'code': 'rate_limit_exceeded'}}

In [None]:
from datasets import Dataset
from huggingface_hub import login

# Create lists to store the data
chunk_texts = []
document_texts = []
contexts = []

# Extract data from docs_processed
for doc in docs_processed:
    for chunk in doc.chunks:
        chunk_texts.append(chunk.text)
        contexts.append(chunk.context)
        document_texts.append(doc.document.page_content)

# Create dictionary for dataset
dataset_dict = {
    'chunk': chunk_texts,
    'document': document_texts,
    'context': contexts
}

# Convert to Hugging Face Dataset
dataset = Dataset.from_dict(dataset_dict)

hf_token = userdata.get("HuggingFace")
if not hf_token:
  # Login to Hugging Face (you'll need your token)
  hf_token = input("Please enter your Hugging Face token: ")
login(hf_token)

# Push to Hugging Face Hub
dataset.push_to_hub(
    f"AIEnthusiast369/hf_doc_qa_eval_chunk_size_{chunk_size}_llama_3_2_90b",  # Replace with your username and desired dataset name
    private=False  # Set to False if you want it public
)

## Save processed documents to file
## Downloading processed documents in case notebook times out


In [None]:
import joblib
from datetime import datetime
from google.colab import files
import glob
import os

def save_download_object(object, filename):
    joblib.dump(object, filename)
    print(f"Saved object to {filename}")
    files.download(filename)
    print(f"Downloaded {filename}")

def create_timestamp() -> str:
    return datetime.now().strftime("%Y%m%d_%H%M%S")


def create_filename_timestamp(filename, extension = "joblib") -> str:
    timestamp = create_timestamp()
    return f"{filename}_{timestamp}.{extension}"

def load_bm25_model(filename):
    try:
        return joblib.load(filename)
    except (FileNotFoundError, OSError):
        return None

def get_latest_bm25_file():
    # Look for files matching the pattern bm25_*.joblib
    files = glob.glob("bm25_*.joblib")
    if not files:
        return None
    # Return the most recent file
    return max(files, key=os.path.getctime)

In [None]:
# Create filename with timestamp
bm25_filename = create_filename_timestamp("docs_processed")

# Save the processed documents
save_download_object(docs_processed, bm25_filename)

In [None]:
# prompt: print chunks from docs_processed where context has value

for doc in docs_processed:
  for chunk in doc.chunks:
    if chunk.context:
      print(f"chunk with context: Context: \n\n {chunk.context} \n\n Chunk: {chunk.text}")


In [None]:
# Create list of chunks with their contexts
chunks_with_context = []
for doc in docs_processed:
    for chunk in doc.chunks:
        if chunk.context:  # Only include chunks that have a context
            chunks_with_context.append(
              f"{chunk.text} \n\n {chunk.context}"
            )

In [None]:
pinecone_api_key = userdata.get("PINECONE_API_KEY")
if not pinecone_api_key:
  pinecone_api_key = input("Please enter your PINECONE API KEY: ")

pinecone_env = userdata.get("PINECONE_ENV")
if not pinecone_env:
  pinecone_env = input("Please enter your PINECONE ENV: ")

SPARSE_INDEX_NAME: str = "sparse_index"
EMBEDDING_INDEX_NAME: str = "embedding_index"
EMBEDING_MODEL = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDING_MODEL)

pinecone.init(api_key=pinecone_api_key, environment=pinecone_env)

def create_bm25(chunks: list[str]):
    # Try to load existing BM25 model
    latest_bm25_file = get_latest_bm25_file()
    if latest_bm25_file:
        bm25 = load_bm25_model(latest_bm25_file)
        if bm25 is not None:
            print(f"Loaded existing BM25 model from {latest_bm25_file}")
            return bm25

    # If no existing model found or loading failed, create a new one
    print("Creating new BM25 model...")
    tokenized_chunks = [nltk.word_tokenize(chunk) for chunk in chunks]
    bm25 = BM25Okapi(tokenized_chunks)

    # Save the new model
    bm25_filename = create_filename_timestamp("bm25")
    save_download_object(bm25, bm25_filename)

    return bm25

def create_pinecone_indexes(pinecone, embedding_model, bm25, chunks: list[str]):

    # Create Pinecone indexes for TF-IDF and Embeddings
    max_seq_length = model.max_seq_length

    if SPARSE_INDEX_NAME not in pinecone.list_indexes():
        pinecone.create_index(SPARSE_INDEX_NAME, dimension=len(bm25.), metric="cosine")
        time.sleep(1) # giving time to pinecode to create the index

    sparse_index = pinecone.Index(SPARSE_INDEX_NAME)
    if EMBEDDING_INDEX_NAME not in pinecone.list_indexes():
        pinecone.create_index(EMBEDDING_INDEX_NAME, dimension=max_seq_length, metric="cosine")
        time.sleep(1) # giving time to pinecode to create the index

    # Connect to Pinecone indexes
    sparse_index = pinecone.Index(SPARSE_INDEX_NAME)
    embedding_index = pinecone.Index(EMBEDDING_INDEX_NAME)

    # Store Vectors/Embeddings in Pinecone with Metadata (Chunk Text)
    # Store BM25 vectors
    for i, chunk in enumerate(chunks):
        bm25_scores = bm25.get_scores(nltk.word_tokenize(chunk))
        sparse_index.upsert([(str(i), bm25_scores.tolist(), {"text": chunk})])

    # Semantic Embeddings using a Pre-trained Transformer Model
    embeddings = embedding_model.encode(chunks, convert_to_tensor=False)
    # Store embeddings in Pinecone
    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        embedding_index.upsert([(str(i), embedding, {"text": chunk})])


In [None]:
from sentence_transformers import CrossEncoder

def fusion_rank_search(
    query: str,
    bm25,
    chunks: list[str],
    model,
    embedding_index,
    reranker_model: CrossEncoder = None,
    k: int = 5,
    weight_sparse: float = 0.5,
    reranker_cutoff: int = 20  # Number of top results to rerank
):
    # Get BM25 results
    tokenized_query = nltk.word_tokenize(query)
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:reranker_cutoff]

    # Normalize BM25 scores using min-max normalization
    bm25_scores_norm = (bm25_scores - np.min(bm25_scores)) / (np.max(bm25_scores) - np.min(bm25_scores))
    bm25_results = [
        {
            'id': str(i),
            'score': bm25_scores_norm[i],
            'metadata': {'text': chunks[i]}
        }
        for i in bm25_top_indices
    ]

    # Get embedding results
    query_embedding = model.encode(query, convert_to_tensor=False).tolist()
    embedding_results = embedding_index.query(query_embedding, top_k=reranker_cutoff, include_metadata=True)

    # Extract and normalize embedding scores
    dense_scores = np.array([match['score'] for match in embedding_results['matches']])
    dense_scores_norm = (dense_scores - np.min(dense_scores)) / (np.max(dense_scores) - np.min(dense_scores))

    # Create dictionaries to store normalized scores
    fusion_scores = defaultdict(lambda: {'sparse': 0.0, 'dense': 0.0, 'text': ''})

    # Store normalized BM25 scores
    for result in bm25_results:
        doc_id = result['id']
        fusion_scores[doc_id]['sparse'] = result['score']
        fusion_scores[doc_id]['text'] = result['metadata']['text']

    # Store normalized embedding scores
    for match, norm_score in zip(embedding_results['matches'], dense_scores_norm):
        doc_id = match['id']
        fusion_scores[doc_id]['dense'] = norm_score
        fusion_scores[doc_id]['text'] = match['metadata']['text']

    # Combine scores using weighted average
    weight_dense = 1.0 - weight_sparse
    initial_results = [
        {
            'id': doc_id,
            'score': (
                weight_sparse * scores['sparse'] +
                weight_dense * scores['dense']
            ),
            'metadata': {
                'text': scores['text'],
                'sparse_score': scores['sparse'],
                'dense_score': scores['dense']
            }
        }
        for doc_id, scores in fusion_scores.items()
    ]

    # Sort by combined score
    initial_results.sort(key=lambda x: x['score'], reverse=True)
    initial_results = initial_results[:reranker_cutoff]

    # Apply reranking if reranker model is provided
    if reranker_model is not None:
        # Prepare pairs for reranking
        pairs = [(query, result['metadata']['text']) for result in initial_results]

        # Get reranker scores - use them directly for final ranking
        rerank_scores = reranker_model.predict(pairs)

        # Update results with reranker scores
        for result, rerank_score in zip(initial_results, rerank_scores):
            result['metadata']['rerank_score'] = float(rerank_score)
            # Use reranker score as the final score
            result['score'] = float(rerank_score)

        # Resort based on reranker scores
        initial_results.sort(key=lambda x: x['score'], reverse=True)

    return initial_results[:k]

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from tqdm import tqdm
import pandas as pd

def evaluate_rag_system(
    best_answers_df: pd.DataFrame,
    bm25,
    chunks: list[str],
    embedding_model,
    embedding_index,
    llm_chain,
    n_samples: int = None  # Optional: limit number of samples for testing
):
    # Initialize ROUGE scorer
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Initialize results storage
    results = []

    # Get subset of dataframe if n_samples is specified
    eval_df = best_answers_df.head(n_samples) if n_samples else best_answers_df

    # Iterate through questions and answers
    for idx, row in tqdm(eval_df.iterrows(), total=len(eval_df), desc="Evaluating Questions"):
        query = row['question']
        reference_answer = row['answer']

        try:
            # Get relevant context using fusion ranking
            retrieved_results = fusion_rank_search(
                query=query,
                bm25=bm25,
                chunks=chunks,
                model=embedding_model,
                embedding_index=embedding_index,
                k=20
            )

            # Prepare context for LLM
            context = "\n".join([res['metadata']['text'] for res in retrieved_results])

            # Generate answer using LLM
            llm_response = llm_chain.invoke({
                "context": context,
                "question": query
            })
            generated_answer = llm_response.content if hasattr(llm_response, 'content') else llm_response

            # Calculate BLEU score
            reference_tokens = [reference_answer.split()]
            candidate_tokens = generated_answer.split()
            bleu_score = sentence_bleu(reference_tokens, candidate_tokens)

            # Calculate ROUGE scores
            rouge_scores = rouge_scorer_instance.score(reference_answer, generated_answer)

            # Store results
            result = {
                'question': query,
                'reference_answer': reference_answer,
                'generated_answer': generated_answer,
                'bleu_score': bleu_score,
                'rouge1_f1': rouge_scores['rouge1'].fmeasure,
                'rouge2_f1': rouge_scores['rouge2'].fmeasure,
                'rougeL_f1': rouge_scores['rougeL'].fmeasure,
                'retrieved_contexts': [res['metadata']['text'] for res in retrieved_results],
                'context_scores': [res['score'] for res in retrieved_results]
            }
            results.append(result)

        except Exception as e:
            print(f"Error processing question {idx}: {str(e)}")
            continue

    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    # Calculate and print average scores
    avg_scores = {
        'Average BLEU': results_df['bleu_score'].mean(),
        'Average ROUGE-1': results_df['rouge1_f1'].mean(),
        'Average ROUGE-2': results_df['rouge2_f1'].mean(),
        'Average ROUGE-L': results_df['rouge2_f1'].mean()
    }

    return results_df, avg_scores

# Example usage:
def print_evaluation_results(results_df, avg_scores):
    print("\nAverage Scores:")
    for metric, score in avg_scores.items():
        print(f"{metric}: {score:.4f}")

    print("\nDetailed Results Sample (first 3):")
    for idx, row in results_df.head(3).iterrows():
        print("\nQuestion:", row['question'])
        print("Reference Answer:", row['reference_answer'])
        print("Generated Answer:", row['generated_answer'])
        print(f"BLEU Score: {row['bleu_score']:.4f}")
        print(f"ROUGE-1 F1: {row['rouge1_f1']:.4f}")
        print(f"ROUGE-2 F1: {row['rouge2_f1']:.4f}")
        print(f"ROUGE-L F1: {row['rougeL_f1']:.4f}")
        print("\nRetrieved Contexts:")
        for context, score in zip(row['retrieved_contexts'], row['context_scores']):
            print(f"Score: {score:.4f}")
            print(f"Context: {context[:200]}...")

In [None]:
# Run evaluation
results_df, avg_scores = evaluate_rag_system(
    best_answers_df=best_answers_df,
    bm25=bm25,
    chunks=chunks,
    model=embedding_model,
    embedding_index=embedding_index,
    llm_chain=llm_chain,
    n_samples=10  # Optional: start with a small sample for testing
)

# Print results
print_evaluation_results(results_df, avg_scores)

# Save results to CSV (optional)
results_df.to_csv('rag_evaluation_results.csv', index=False)