# Description

## The aim of this notebook is to show and compare the Contextual Retrieval implementation of RAG with a simple/traditional implementation
### Steps:
- Chunking
- Summarization
- BM25 embedding
- BM25 model saving to file
- Model embedding
- Storage of dense and sparse vectors
- Retrieval of sparse and dense vectors
- Fusion of Ranking
- Simple Retrieval


In [None]:
# !pip install sentence_transformers -qU
!pip install rank_bm25 -qU
!pip install datasets -qU
# !pip install -U FlagEmbedding
# !pip install pinecone-plugin-inference -qU
!pip install pinecone[grpc] -qU
# !pip install pinecone-client  -qU
!pip install langchain -qU
!pip install langchain_core -qU
!pip install langchain_groq -qU
!pip install langchain-google-genai -qU
!pip install langchain-openai -qU
!pip install rouge-score  -qU


# Importing libraries

In [None]:
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu
# from rouge import Rouge
from datasets import load_dataset
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pinecone
import pandas as pd # for dataframe
import getpass
from google.colab import userdata
import os

In [None]:
# nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Loading dataset

In [None]:
# Step 1: Load and Chunk the Knowledge Base
# Load dataset from Hugging Face

dataset = load_dataset("m-ric/huggingface_doc_qa_eval")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
df = pd.DataFrame(dataset['train'])
print(df.head())

                                             context  \
0   `tokenizers-linux-x64-musl`\n\nThis is the **...   
1  !--Copyright 2023 The HuggingFace Team. All ri...   
2   Paper Pages\n\nPaper pages allow people to fi...   
3   Datasets server API\n\n> API on 🤗 datasets\n\...   
4  !--Copyright 2022 The HuggingFace Team. All ri...   

                                            question  \
0  What architecture is the `tokenizers-linux-x64...   
1  What is the purpose of the BLIP-Diffusion mode...   
2  How can a user claim authorship of a paper on ...   
3  What is the purpose of the /healthcheck endpoi...   
4  What is the default context window size for Lo...   

                                              answer  \
0                          x86_64-unknown-linux-musl   
1  The BLIP-Diffusion model is designed for contr...   
2  By clicking their name on the corresponding Pa...   
3                          Ensure the app is running   
4                                         127 

Taking only best question/answer pairs

In [None]:
best_answers_df = df[df['standalone_score'] >= 4]
print(best_answers_df.head())

                                             context  \
0   `tokenizers-linux-x64-musl`\n\nThis is the **...   
1  !--Copyright 2023 The HuggingFace Team. All ri...   
2   Paper Pages\n\nPaper pages allow people to fi...   
3   Datasets server API\n\n> API on 🤗 datasets\n\...   
4  !--Copyright 2022 The HuggingFace Team. All ri...   

                                            question  \
0  What architecture is the `tokenizers-linux-x64...   
1  What is the purpose of the BLIP-Diffusion mode...   
2  How can a user claim authorship of a paper on ...   
3  What is the purpose of the /healthcheck endpoi...   
4  What is the default context window size for Lo...   

                                              answer  \
0                          x86_64-unknown-linux-musl   
1  The BLIP-Diffusion model is designed for contr...   
2  By clicking their name on the corresponding Pa...   
3                          Ensure the app is running   
4                                         127 

In [None]:
best_answers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65 entries, 0 to 64
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   context            65 non-null     object
 1   question           65 non-null     object
 2   answer             65 non-null     object
 3   source_doc         65 non-null     object
 4   standalone_score   65 non-null     int64 
 5   standalone_eval    65 non-null     object
 6   relatedness_score  65 non-null     int64 
 7   relatedness_eval   65 non-null     object
 8   relevance_score    65 non-null     int64 
 9   relevance_eval     65 non-null     object
dtypes: int64(3), object(7)
memory usage: 5.2+ KB


# Extract contexts from the dataset and create Langchain documents

In [None]:
# Extract contexts from the dataset and create Langchain documents
# documents = [Document(page_content=context) for context in best_answers_df['context']]  # Assuming we're using the 'train' split
# print(documents)

texts = best_answers_df['context'].tolist()

# **Setting up Embedding model**

## **sentence-transformers**

In [None]:
# # load ' sentence-transformers/all-MiniLM-L6-v2' embedding model from Hugging Face
# from transformers import AutoTokenizer, AutoModel
# model_name = 'sentence-transformers/all-MiniLM-L6-v2'
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# max_seq_length = tokenizer.model_max_length
# embedding_model = AutoModel.from_pretrained(model_name)

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

## **openai**

In [None]:
# Read the OpenAI key from the Colab secrets; fall back to a hidden prompt.
openai_api_key = userdata.get("OPENAI_API_KEY")
if not openai_api_key:
  # `getpass` is imported as a module, so the prompt function is
  # `getpass.getpass`; calling the module object itself raises TypeError.
  openai_api_key = getpass.getpass("Please enter your OPENAI API KEY: ")

# Exported to the environment because the OpenAI clients created further down
# are constructed without an explicit key argument.
os.environ["OPENAI_API_KEY"] = openai_api_key

In [None]:
from langchain_openai import OpenAIEmbeddings

embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

max_seq_length = embedding_model.embedding_ctx_length
# index_dimensions = embedding_model.dimensions
index_dimensions = 1536 # default setting of text-embedding-3-small
print(f'max_seq_length:{max_seq_length}, index_dimensions:{index_dimensions}')

max_seq_length:8191, index_dimensions:1536


## **Google**

In [None]:
# from langchain_google_genai import GoogleGenerativeAIEmbeddings

# MODEL_GEMINI_EMBED = "text-embedding-004"
# embedding_model = GoogleGenerativeAIEmbeddings(model=MODEL_GEMINI_EMBED)


In [None]:
# print(f'max_seq_length:{max_seq_length}, index_dimensions:{index_dimensions}')

# Defining text splitter

### openai

In [None]:
# Candidate split points handed to the splitter, listed from coarse (headings,
# code fences, horizontal rules) down to single characters and the empty string.
# NOTE(review): several entries look like regex patterns ("#{1,6}", "\\*\\*\\*+",
# "---+", "___+"), but the splitter below is created without
# `is_separator_regex=True`, so they are presumably matched as literal text and
# would rarely (if ever) fire -- confirm against the langchain docs. Changing
# this alters chunk boundaries and therefore every downstream artifact.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]
# Use RecursiveCharacterTextSplitter to split documents into chunks
# The overlap is subtracted so that chunk_size + chunk_overlap stays at 1000.
# No `length_function` is passed, so sizes use the splitter's default length
# measure (assumed to be characters, not tokens -- TODO confirm).
chunk_overlap = 200
chunk_size = 1000 - chunk_overlap
print('chunk_size',chunk_size)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=MARKDOWN_SEPARATORS,
)

chunk_size 800


#### Other

In [None]:
# def get_seq_length(text: str):
#     tokens = tokenizer.encode(text, add_special_tokens=True)
#     return len(tokens)


In [None]:
# MARKDOWN_SEPARATORS = [
#     "\n#{1,6} ",
#     "```\n",
#     "\n\\*\\*\\*+\n",
#     "\n---+\n",
#     "\n___+\n",
#     "\n\n",
#     "\n",
#     " ",
#     "",
# ]
# # Use RecursiveCharacterTextSplitter to split documents into chunks
# chunk_overlap = 50
# chunk_size = max_seq_length - chunk_overlap
# print('chunk_size',chunk_size)
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=chunk_size,
#     chunk_overlap=chunk_overlap,
#     length_function=get_seq_length,
#     add_start_index=True,
#     separators=MARKDOWN_SEPARATORS,
# )

chunk_size 462


# **Defining ProcessedDocument & Chunk**

In [None]:
class Chunk:
    """One piece of a split document plus its optional generated context.

    Attributes are plain instance fields:
      text    -- the chunk text passed to the constructor.
      context -- starts as ``None``; assigned after construction.
    """

    def __init__(self, text: str):
        # Both fields are set in one statement; `context` is filled in later.
        self.text, self.context = text, None

class ProcessedDocument:
    """A source document together with the chunks it was split into.

    Attributes are plain instance fields:
      text   -- the full, unsplit document text.
      chunks -- the list of ``Chunk`` objects, stored as given (not copied).
    """

    def __init__(self, text: str, chunks: list[Chunk]):
        self.text, self.chunks = text, chunks


In [None]:
docs_processed: list[ProcessedDocument] = []
for text in texts:
    # text = doc.page_content  # Extract the text content from the Document
    chunks = text_splitter.split_text(text)  # Split the text into chunks (strings)
    print(f"Number of chunks for document #{len(docs_processed)}: {len(chunks)}")
    processed_doc = ProcessedDocument(
        text,
        [Chunk(chunk_text) for chunk_text in chunks]
    )
    docs_processed.append(processed_doc)
print(f"Number of Processed document: {len(docs_processed)}")

Number of chunks for document #0: 1
Number of chunks for document #1: 6
Number of chunks for document #2: 5
Number of chunks for document #3: 2
Number of chunks for document #4: 12
Number of chunks for document #5: 5
Number of chunks for document #6: 29
Number of chunks for document #7: 2
Number of chunks for document #8: 40
Number of chunks for document #9: 26
Number of chunks for document #10: 5
Number of chunks for document #11: 3
Number of chunks for document #12: 16
Number of chunks for document #13: 3
Number of chunks for document #14: 7
Number of chunks for document #15: 1
Number of chunks for document #16: 22
Number of chunks for document #17: 2
Number of chunks for document #18: 20
Number of chunks for document #19: 27
Number of chunks for document #20: 24
Number of chunks for document #21: 20
Number of chunks for document #22: 40
Number of chunks for document #23: 16
Number of chunks for document #24: 1
Number of chunks for document #25: 2
Number of chunks for document #26: 1

In [None]:
# for doc in docs_processed:
#     for chunk in doc.chunks:
#         try:
#           chunk_length = get_seq_length(chunk.text)
#           if chunk_length > max_seq_length:
#               print(f"Chunk exceeds max length: {chunk_length} tokens")
#         except Exception as e:
#           print(f"Error processing chunk: {e}")
#           print("===========================")
#           print(f"Chunk: {chunk.text}")



In [None]:
# Count total chunks
total_chunks = sum(len(doc.chunks) for doc in docs_processed)
print(f"Total number of chunks across all documents: {total_chunks}")

Total number of chunks across all documents: 882


# **Define summary chain**

In [None]:
from langchain.prompts import PromptTemplate
from google.colab import userdata

### **OPENAI**

In [None]:
from langchain_openai import ChatOpenAI


model_chat_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model=model_chat_name)
sum_provider = 'OPENAI'

In [None]:
from langchain_core.prompts import ChatPromptTemplate

prompt_template = ChatPromptTemplate.from_messages([
    ("system",
            """You are an AI assistant specializing in document summarization and contextualization. Your task is to provide brief, relevant context for a specific chunk of text based on a larger document. Here's how to proceed:

First, carefully read and analyze the following document:

<document>
{document}
</document>

Now, consider this specific chunk of text from the document:

<chunk>
{chunk}
</chunk>

Your goal is to provide a concise context for this chunk, situating it within the whole document. Follow these guidelines:

1. Analyze how the chunk relates to the overall document's themes, arguments, or narrative.
2. Identify the chunk's role or significance within the broader context of the document.
3. Determine what information from the rest of the document is most relevant to understanding this chunk.

Compose your response as follows:
- Provide 3-4 sentences maximum of context.
- Begin directly with the context, without any introductory phrases.
- Use language like "Focuses on..." or "Addresses..." to describe the chunk's content.
- Ensure the context would be helpful for improving search retrieval of the chunk.

Important notes:
- Do not use phrases like "this chunk" or "this section" in your response.
- Do not repeat the chunk's content verbatim; provide context from the rest of the document.
- Avoid unnecessary details; be succinct and relevant.
- Do not include any additional commentary or meta-discussion about the task itself.

 Remember, your goal is to provide clear, concise, and relevant context that situates the given chunk within the larger document.
            """

     )
])


In [None]:
def create_context_chain(llm):
    """Pipe the module-level ``prompt_template`` into ``llm`` and return the result.

    Args:
        llm: The object placed on the right-hand side of the ``|`` operator.

    Returns:
        Whatever ``prompt_template | llm`` evaluates to.
    """
    chain = prompt_template | llm
    return chain

context_chain = create_context_chain(llm)

In [None]:
def get_context(text: str, chunk: str) -> str:
    """Ask the module-level ``context_chain`` for the context of one chunk.

    Args:
        text: Full document text, sent to the chain as ``document``.
        chunk: Chunk text, sent to the chain as ``chunk``.

    Returns:
        The ``content`` attribute of the chain's response.

    Raises:
        Exception: If ``chunk`` or ``text`` is empty or whitespace-only.
    """
    # The chunk is checked first; the document is only inspected when the
    # chunk is non-blank.
    if not chunk.strip() or not text.strip():
        message = "Chunk or text is empty"
        print(message)
        raise Exception(message)

    response = context_chain.invoke({"document": text, "chunk": chunk})
    return response.content

In [None]:
def generate_context(docs_processed: list[ProcessedDocument]):
    """Fill in ``chunk.context`` for every chunk of every document, in place.

    Args:
        docs_processed: Documents whose chunks are updated. For each chunk,
            ``get_context`` is called with the owning document's text and the
            chunk's text, and the result is stored on the chunk.

    Returns:
        None. A progress line is printed once per document.
    """
    for doc_index, document in enumerate(docs_processed):
        print(f'processing document index {doc_index}')
        for piece in document.chunks:
            piece.context = get_context(text=document.text, chunk=piece.text)

# **Testing chain**

In [None]:
page = """
 Convert weights to safetensors

PyTorch model weights are commonly saved and stored as `.bin` files with Python's [`pickle`](https://docs.python.org/3/library/pickle.html) utility. To save and store your model weights in the more secure `safetensor` format, we recommend converting your weights to `.safetensors`.
The easiest way to convert your model weights is to use the [Convert Space](https://huggingface.co/spaces/diffusers/convert), given your model weights are already stored on the Hub. The Convert Space downloads the pickled weights, converts them, and opens a Pull Request to upload the newly converted `.safetensors` file to your repository.
<Tip warning={true}>
For larger models, the Space may be a bit slower because its resources are tied up in converting other models. You can also try running the [convert.py](https://github.com/huggingface/safetensors/blob/main/bindings/python/convert.py) script (this is what the Space is running) locally to convert your weights.
Feel free to ping [@Narsil](https://huggingface.co/Narsil) for any issues with the Space.
</Tip>
"""
chunk = """
Convert weights to safetensors
PyTorch model weights are commonly saved and stored as `.bin` files with Python's [`pickle`](https://docs.python.org/3/library/pickle.html) utility. To save and store your model weights in the more secure `safetensor` format, we recommend converting your weights to `.safetensors`.
The easiest way to convert your model weights is to use the [Convert Space](https://huggingface.co/spaces/diffusers/convert), given your model weights are already stored on the Hub. The Convert Space downloads the pickled weights, converts them, and opens a Pull Request to upload the newly converted `.safetensors` file to your repository.
<Tip warning={true}>
For larger models, the Space may be a bit slower because its resources are tied up in converting other models. You can also try running the [convert.py](https://github.com/huggingface/safetensors/blob/main/bindings/python/convert.py) script (this is what the Space is running) locally to convert your weights.
Feel free to ping [@Narsil](https://huggingface.co/Narsil) for any issues with the Space.
</Tip>
"""

In [None]:
test_context = get_context(text = page, chunk=chunk)

In [None]:
print(test_context)

The document discusses converting PyTorch model weights saved as `.bin` files with `pickle` to a more secure `safetensor` format by using the Convert Space tool or running a conversion script locally. It emphasizes the importance of converting weights to `.safetensors` for security reasons. Additionally, the document provides a tip about potential delays in using the Convert Space tool due to resource constraints and offers an alternative method for conversion.


In [None]:
# temp_docs = docs_processed[1:2]
# generate_context(temp_docs)
generate_context(docs_processed)

processing document index 0
processing document index 1
processing document index 2
processing document index 3
processing document index 4
processing document index 5
processing document index 6
processing document index 7
processing document index 8
processing document index 9
processing document index 10
processing document index 11
processing document index 12
processing document index 13
processing document index 14
processing document index 15
processing document index 16
processing document index 17
processing document index 18
processing document index 19
processing document index 20
processing document index 21
processing document index 22
processing document index 23
processing document index 24
processing document index 25
processing document index 26
processing document index 27
processing document index 28
processing document index 29
processing document index 30
processing document index 31
processing document index 32
processing document index 33
processing document inde

In [None]:
# print(temp_docs[0].chunks[0].context)

None


### **GROQ**

In [None]:
# from pydantic import BaseModel, Field
# from typing import Optional
# class Context(BaseModel):
#     context: Optional[str] = Field(description="Summary of the chunk in the context of the document")

In [None]:
#It hits the limit even thoug lower than daily
# from langchain_groq import ChatGroq

# # MODEL_GROQ = "llama-3.1-8b-instant"
# MODEL_GROQ = "llama-3.2-90b-text-preview"
# groq_api_key = userdata.get("GROQ_API_KEY")
# if not groq_api_key:
#   groq_api_key = getpass("Please enter your GROQ API KEY: ")

# llm = ChatGroq(api_key=groq_api_key, model=MODEL_GROQ,
#                         temperature=0,
#                         max_tokens=None,
#                         timeout=None,
#                         max_retries=2,)
# sum_provider = 'GROQ'

### **GOOGLE**

In [None]:
# MODEL_GEMINI_CHAT = "gemini-1.5-flash"

# gemini_api_key = userdata.get("GEMINI_API_KEY")
# if not gemini_api_key:
#   gemini_api_key = getpass("Please enter your GEMINI API KEY: ")

# os.environ["GOOGLE_API_KEY"] = gemini_api_key

In [None]:
# from langchain_google_genai import GoogleGenerativeAI
# llm = GoogleGenerativeAI(model=MODEL_GEMINI_CHAT)
# sum_provider = 'GOOGLE'

In [None]:
# prompt_template = PromptTemplate(
#     input_variables=["document", "chunk"],
#     template=
#        """You are an AI assistant specializing in document summarization and contextualization. Your task is to provide brief, relevant context for a specific chunk of text based on a larger document. Here's how to proceed:

# First, carefully read and analyze the following document:

# <document>
# {{DOCUMENT}}
# </document>

# Now, consider this specific chunk of text from the document:

# <chunk>
# {{CHUNK}}
# </chunk>

# Your goal is to provide a concise context for this chunk, situating it within the whole document. Follow these guidelines:

# 1. Analyze how the chunk relates to the overall document's themes, arguments, or narrative.
# 2. Identify the chunk's role or significance within the broader context of the document.
# 3. Determine what information from the rest of the document is most relevant to understanding this chunk.

# Compose your response as follows:
# - Provide 3-4 sentences maximum of context.
# - Begin directly with the context, without any introductory phrases.
# - Use language like "Focuses on..." or "Addresses..." to describe the chunk's content.
# - Ensure the context would be helpful for improving search retrieval of the chunk.

# Important notes:
# - Do not use phrases like "this chunk" or "this section" in your response.
# - Do not repeat the chunk's content verbatim; provide context from the rest of the document.
# - Avoid unnecessary details; be succinct and relevant.
# - Do not include any additional commentary or meta-discussion about the task itself.

#  Remember, your goal is to provide clear, concise, and relevant context that situates the given chunk within the larger document.)

In [None]:
# def create_context_chain(llm, structure: bool = True):
#     # Configure the LLM to produce structured output
#     if structure:
#         l_llm = llm.with_structured_output(Context)
#     else:
#         l_llm = llm
#     # Create the chain using the pipe operator
#     chain = prompt_template | l_llm
#     return chain

# context_chain = create_context_chain(llm, structure = False)

In [None]:
# doc = docs_processed[30]
# print("page:\n",doc.document.page_content)
# # for chunk in doc.chunks:
# chunk = doc.chunks[0]
# print('chunk:\n', chunk.text)

page:
  Convert weights to safetensors

PyTorch model weights are commonly saved and stored as `.bin` files with Python's [`pickle`](https://docs.python.org/3/library/pickle.html) utility. To save and store your model weights in the more secure `safetensor` format, we recommend converting your weights to `.safetensors`.

The easiest way to convert your model weights is to use the [Convert Space](https://huggingface.co/spaces/diffusers/convert), given your model weights are already stored on the Hub. The Convert Space downloads the pickled weights, converts them, and opens a Pull Request to upload the newly converted `.safetensors` file to your repository.


For larger models, the Space may be a bit slower because its resources are tied up in converting other models. You can also try running the [convert.py](https://github.com/huggingface/safetensors/blob/main/bindings/python/convert.py) script (this is what the Space is running) locally to convert your weights.

Feel free to ping [@Nar

In [None]:
# def get_context(doc: ProcessedDocument, chunk: Chunk, provider: str):
#   if (provider == 'OPEMAI')
#     context= context_chain.invoke({"document": doc.document.page_content, "chunk": chunk})
#     return context.content
#   else:
#     context: Context = context_chain.invoke({"document": doc.document.page_content, "chunk": chunk})
#     return context.context

In [None]:
# print(f"chunk with context: Context: \n\n {context.context} \n\n Chunk: {chunk.text}")

In [None]:
# import time
# from datetime import datetime

# def generate_context(docs_processed: list[ProcessedDocument]):
#     # Initialize counters
#     calls_per_minute = 0
#     last_reset_time = time.time()

#     for doc in docs_processed:
#         for chunk in doc.chunks:
#             current_time = time.time()

#             # Check if a minute has passed since last reset
#             if current_time - last_reset_time >= 60:
#                 print(f"Made {calls_per_minute} calls in the last minute")
#                 calls_per_minute = 0
#                 last_reset_time = current_time
#             else:
#                 # If we're still within the same minute and hit rate limit
#                 if calls_per_minute >= 15:  # Assuming 30 calls per minute limit
#                     wait_time = 60 - (current_time - last_reset_time)
#                     print(f"Rate limit reached. Waiting {wait_time:.2f} seconds...")
#                     time.sleep(wait_time)
#                     calls_per_minute = 0
#                     last_reset_time = time.time()

#             # Make the API call
#             context: str = get_context(doc= doc.document.page_content, chunk= chunk, provider=???)
#             doc.context = context.context

#             # Increment counter
#             calls_per_minute += 1

#             # Optional: print progress
#             print(f"Processed chunk {calls_per_minute} in current minute. Total chunks processed: {sum(len(d.chunks) for d in docs_processed[:docs_processed.index(doc)]) + len(doc.chunks[:doc.chunks.index(chunk) + 1])}")


## Save processed documents to file
## Downloading processed documents in case notebook times out


In [None]:
import joblib
from datetime import datetime
from google.colab import files
import glob
import os

def save_download_object(object, filename):
    """Serialize ``object`` to ``filename`` with joblib, then download the file.

    Args:
        object: The value handed to ``joblib.dump``. (The parameter name
            shadows the ``object`` builtin inside this function.)
        filename: Path the dump is written to and then passed to
            ``files.download``.
    """
    # Write to disk first: the download step below takes a filename.
    joblib.dump(object, filename)
    print(f"Saved object to {filename}")
    # `files` is google.colab.files, so this step only works inside Colab.
    files.download(filename)
    print(f"Downloaded {filename}")

def create_timestamp() -> str:
    """Return the current local time formatted as ``YYYYMMDD_HHMMSS``."""
    return f"{datetime.now():%Y%m%d_%H%M%S}"


def create_filename_timestamp(filename, extension = "joblib") -> str:
    """Build ``<filename>_<timestamp>.<extension>``.

    Args:
        filename: Base name placed before the timestamp.
        extension: Suffix placed after the dot; defaults to ``"joblib"``.

    Returns:
        The combined file name, using the value of ``create_timestamp()``.
    """
    return "{}_{}.{}".format(filename, create_timestamp(), extension)

# def load_bm25_model(filename):
#     try:
#         return joblib.load(filename)
#     except (FileNotFoundError, OSError):
#         return None

# def get_latest_bm25_file():
#     # Look for files matching the pattern bm25_*.joblib
#     files = glob.glob("bm25_*.joblib")
#     if not files:
#         return None
#     # Return the most recent file
#     return max(files, key=os.path.getctime)

In [None]:
# Create filename with timestamp
docs_processed_filename = create_filename_timestamp("docs_processed")

# Save the processed documents
save_download_object(docs_processed, docs_processed_filename)

Saved object to docs_processed_20241203_134704.joblib


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded docs_processed_20241203_134704.joblib


In [None]:
chunk_texts = []
document_texts = []
contexts = []

# Extract data from docs_processed
for doc in docs_processed:
    for chunk in doc.chunks:
        chunk_texts.append(chunk.text)
        contexts.append(chunk.context)
        document_texts.append(doc.text)

# Create dictionary for dataset
dataset_dict = {
    'chunk': chunk_texts,
    'document': document_texts,
    'context': contexts
}

In [None]:
index_doc = 2
doc = docs_processed[index_doc]
print('len(doc.chuncks)',len(doc.chunks))
chunk = doc.chunks[index_doc]
print(chunk.context)
# print('len(doc.chunks)',len(doc.chunks))
# print(doc.chunks)

len(doc.chuncks) 5
Focuses on the process of claiming authorship to a paper within the Hugging Face Hub, where the system tries to match papers to users based on their email addresses. Users can manually claim authorship by clicking on their name on the corresponding Paper page and following the steps to validate the request. Once approved by the admin team, the Paper page will be marked as verified.


In [None]:
index = 3
print('len(contexts)',len(contexts))
print(contexts[index])
print('len(chunk_texts)',len(chunk_texts))
print(chunk_texts[index])

len(contexts) 882
None
len(chunk_texts) 882
*Subject-driven text-to-image generation models create novel renditions of an input subject based on text prompts. Existing models suffer from lengthy fine-tuning and difficulties preserving the subject fidelity. To overcome these limitations, we introduce BLIP-Diffusion, a new subject-driven image generation model that supports multimodal control which consumes inputs of subject images and text prompts. Unlike other subject-driven generation models, BLIP-Diffusion introduces a new multimodal encoder which is pre-trained to provide subject representation. We first pre-train the multimodal encoder following BLIP-2 to produce visual representation aligned with the text. Then we design a subject representation learning task which enables a diffusion model to leverage such visual


# **Saving Context + Chunks to dataset**

In [None]:
from datasets import Dataset
from huggingface_hub import login

# Create lists to store the data


# Convert to Hugging Face Dataset
dataset = Dataset.from_dict(dataset_dict)

# Prefer the Colab secret; otherwise prompt without echo. A plain `input()`
# would leave the token visible in the saved cell output.
hf_token = userdata.get("HuggingFace")
if not hf_token:
  # Login to Hugging Face (you'll need your token)
  hf_token = getpass.getpass("Please enter your Hugging Face token: ")
login(hf_token)

# Push to Hugging Face Hub
dataset.push_to_hub(
    f"AIEnthusiast369/hf_doc_qa_eval_chunk_size_{chunk_size}_open_ai",  # Replace with your username and desired dataset name
    private=False  # Set to False if you want it public
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/AIEnthusiast369/hf_doc_qa_eval_chunk_size_800_open_ai/commit/7be3854af236da891ed8ecbd7299e0c9f0a3299a', commit_message='Upload dataset', commit_description='', oid='7be3854af236da891ed8ecbd7299e0c9f0a3299a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/AIEnthusiast369/hf_doc_qa_eval_chunk_size_800_open_ai', endpoint='https://huggingface.co', repo_type='dataset', repo_id='AIEnthusiast369/hf_doc_qa_eval_chunk_size_800_open_ai'), pr_revision=None, pr_num=None)

In [None]:
# prompt: print chunks from docs_processed where context has value

# for doc in docs_processed:
#   for chunk in doc.chunks:
#     if chunk.context:
#       print(f"chunk with context: Context: \n\n {chunk.context} \n\n Chunk: {chunk.text}")


# **Loading chunks with context dataset**
*You need to run this only if the notebook timed out and you lost its state*

In [None]:
chunked_dataset = load_dataset("AIEnthusiast369/hf_doc_qa_eval_chunk_size_800_open_ai")
chunks_from_ds=True

# **Creating contextualized chunks**

In [None]:
# from multiprocessing import context
# prompt: from chunked_dataset populate docs_processed

# Assuming 'chunked_dataset' is already loaded as in your provided code
chunks_with_context = []
chunks_regular=[]

if chunks_from_ds:
  chuncked_ds = chunked_dataset['train']
  for i in range(len(chuncked_ds)):
      row = chuncked_ds[i]
      chunk = row['chunk']
      chunks_regular.append(chunk)
      context = row['context']
      if context:
              chunks_with_context.append(
                f"{context} \n\n {chunk}"
              )
else:
  for doc in docs_processed:
      for chunk in doc.chunks:
          chunks_regular.append(chunk.text)
          if chunk.context:  # Only include chunks that have a context
              chunks_with_context.append(
                f"{chunk.context} \n\n {chunk.text}"
              )
print(f'Len of regular chunks: {len(chunks_regular)}')
print(f'Len of chunks with context: {len(chunks_with_context)}')


# Now docs_processed is populated from the chunked_dataset
# You can proceed with the rest of your code using the loaded data

Len of regular chunks: 882
Len of chunks with context: 882


# **Setting up Indexes**

In [None]:
def create_bm25(chunks: list[str]):
    """Build a ``BM25Okapi`` model over ``chunks``.

    Each chunk is tokenized with ``nltk.word_tokenize`` and the resulting
    token lists are used as the BM25 corpus.

    Args:
        chunks: Chunk texts, one corpus entry per element.

    Returns:
        The newly built ``BM25Okapi`` instance.
    """
    # A fresh model is always built: loading a previously saved model and
    # saving the new one to disk are both currently disabled.
    print("Creating BM25 model...")
    corpus_tokens = list(map(nltk.word_tokenize, chunks))
    return BM25Okapi(corpus_tokens)

In [None]:
# from pinecone.grpc import PineconeGRPC as Pinecone
# from pinecone import ServerlessSpec

from pinecone import Pinecone, ServerlessSpec

# Colab secret first; otherwise a hidden prompt, so the key is not echoed into
# the cell output the way it would be with `input()`.
pinecone_api_key = userdata.get("PINECONE_API_KEY")
if not pinecone_api_key:
  pinecone_api_key = getpass.getpass("Please enter your PINECONE API KEY: ")

spec=ServerlessSpec(
    cloud="aws",
    region="us-east-1"
  )

EMBEDDING_INDEX_CONTEXTUAL: str = "test-rag-openai-contextual"
EMBEDDING_INDEX_REGULAR: str = "test-rag-openai-regular"

pc = Pinecone(api_key=pinecone_api_key)

def get_index_names(index_data):
  """
  Collects the ``name`` of every index description in ``index_data``.

  Args:
    index_data: An iterable of index descriptions that support
      ``item['name']`` lookup (plain dicts work).

  Returns:
    A list of index names in iteration order. Items that do not expose a
    ``name`` are reported on stdout and skipped.
  """
  index_names = []
  for index_item in index_data:
    try:
      index_names.append(index_item['name'])
    except (KeyError, TypeError, AttributeError):
      # KeyError: no 'name' entry. TypeError: item cannot be indexed by key.
      # AttributeError: presumably what object-style descriptions raise for a
      # missing field - TODO confirm against the Pinecone client.
      print(f"Skipping invalid index data: {index_item}")
  return index_names

In [None]:
from typing import Any, List
from time import sleep

def wait_for_index(index_name, poll_interval: float = 5):
    """Block until Pinecone reports the index ``index_name`` as ready.

    Polls the module-level client ``pc`` and sleeps between attempts.

    Args:
        index_name: Name of the index to wait for.
        poll_interval: Seconds to sleep between two readiness checks.
    """
    while True:
        desc = pc.describe_index(index_name)
        # Readiness lives under 'status' ({'ready': ..., 'state': ...}); the
        # description has no top-level 'ready' entry.
        if desc['status']['ready']:
            print("Index is ready!")
            break
        sleep(poll_interval)

def create_pinecone_indexes(pinecone, embedding_model, index_name: str, chunks: list[str], specs: ServerlessSpec, dimensions, index_names: List[str], batch_size: int = 100) -> Any:
    """Ensure a Pinecone index exists and fill it with embeddings of ``chunks``.

    Args:
        pinecone: Pinecone client used to create and open the index.
        embedding_model: Object exposing ``embed_documents(list[str])`` that
            returns one vector per chunk.
        index_name: Name of the index to create and/or populate.
        chunks: Chunk texts; chunk ``i`` is stored under id ``str(i)`` with
            the text kept in the ``text`` metadata field.
        specs: Deployment spec passed to ``create_index``.
        dimensions: Vector dimension of the index.
        index_names: Names of the indexes that already exist; creation is
            skipped when ``index_name`` is among them.
        batch_size: Number of vectors sent per upsert request.

    Returns:
        The handle of the populated index.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    if index_name not in index_names:
        # Use the client that was passed in rather than a global one.
        pinecone.create_index(index_name, dimension=dimensions, metric="cosine", spec=specs)
        # NOTE: wait_for_index polls the module-level client `pc`.
        wait_for_index(index_name)

    # Connect to the Pinecone index
    embedding_index = pinecone.Index(index_name)

    # Semantic embeddings, one vector per chunk
    embeddings = embedding_model.embed_documents(chunks)

    # Upsert in batches instead of issuing one request per vector.
    vectors = [
        (str(i), embedding, {"text": chunk})
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))
    ]
    for start in range(0, len(vectors), batch_size):
        embedding_index.upsert(vectors[start:start + batch_size])

    # Guard the dimension lookup so an empty chunk list does not raise.
    embedding_dim = len(embeddings[0]) if len(embeddings) else 0
    print(f'len(embeddings)={len(embeddings)}, len(embeddings[0])={embedding_dim}')
    return embedding_index


In [None]:
# Snapshot the indexes that already exist for this Pinecone client; the
# result is used further down to decide whether an index still has to be
# created and populated.
indeces =pc.list_indexes()
print(indeces)

{'indexes': [{'deletion_protection': 'disabled',
              'dimension': 1536,
              'host': 'test-rag-openai-regular-g8hsdn4.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'test-rag-openai-regular',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'deletion_protection': 'disabled',
              'dimension': 1536,
              'host': 'test-rag-openai-contextual-g8hsdn4.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'test-rag-openai-contextual',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}


# **Creating Indexes**

In [None]:
# Names of the indexes that already exist (from the listing above).
index_names = get_index_names(indeces)
print(index_names)
# Embedding and upserting only happens for an index that is missing, so
# re-running this cell does not re-embed chunks of an existing index.
# 1536 is the vector dimension both indexes are created with.
if EMBEDDING_INDEX_CONTEXTUAL not in index_names:
   create_pinecone_indexes(pc, embedding_model, EMBEDDING_INDEX_CONTEXTUAL, chunks_with_context, spec, 1536, index_names)
if EMBEDDING_INDEX_REGULAR not in index_names:
   create_pinecone_indexes(pc, embedding_model, EMBEDDING_INDEX_REGULAR, chunks_regular, spec, 1536, index_names)
# The BM25 models are kept in memory only, so they are rebuilt on every run.
bm25_regular = create_bm25(chunks_regular)
bm25_contextual = create_bm25(chunks_with_context)

['test-rag-openai-regular', 'test-rag-openai-contextual']
Creating BM25 model...
Creating BM25 model...


# **Defining Reranker**

### **Hugging Face**

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint used to re-score (query, passage) pairs.
RERANKER_MODEL = 'BAAI/bge-reranker-v2-m3'
# Tokenizer and sequence-classification model are loaded once at module level
# and reused by get_reranker_score.
tokenizer = AutoTokenizer.from_pretrained(RERANKER_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(RERANKER_MODEL)
# Put the model into evaluation mode; it is only used for scoring here.
model.eval()

def get_reranker_score(pairs):
    """Score (query, passage) pairs with the module-level reranker model.

    Args:
        pairs: Iterable of two-item ``(query, passage)`` sequences.

    Returns:
        A list of floats, one raw logit per pair in input order. An empty
        input yields an empty list without calling the model.
    """
    # Materialize once so any iterable is accepted and emptiness can be checked.
    pairs = [list(pair) for pair in pairs]
    if not pairs:
        return []

    with torch.no_grad():
        inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
        scores = model(**inputs, return_dict=True).logits.view(-1, ).float()

    # Return the scores (previously they were only printed) as plain floats.
    return scores.tolist()


tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/795 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

### **FlagEmbedding**

In [None]:
# from FlagEmbedding import FlagReranker

# reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation

# score = reranker.compute_score(['query', 'passage'])
# print(score) # -5.65234375

# # You can map the scores into 0-1 by set "normalize=True", which will apply sigmoid function to the score
# score = reranker.compute_score(['query', 'passage'], normalize=True)
# print(score) # 0.003497010252573502

# scores = reranker.compute_score([['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']])
# print(scores) # [-8.1875, 5.26171875]

# # You can map the scores into 0-1 by set "normalize=True", which will apply sigmoid function to the score
# scores = reranker.compute_score([['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']], normalize=True)
# print(scores) # [0.00027803096387751553, 0.9948403768236574]


In [None]:
# from sentence_transformers import CrossEncoder
from collections import defaultdict
def _min_max_normalize(values):
    """Rescale ``values`` to [0, 1]; all zeros when the input is constant."""
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        return values
    lowest = values.min()
    spread = values.max() - lowest
    if spread == 0:
        # Constant input: min-max is undefined, so avoid the 0/0 -> NaN result.
        return np.zeros_like(values)
    return (values - lowest) / spread


def fusion_rank_search(
    query: str,
    bm25,
    chunks: list[str],
    model,
    embedding_index,
    k: int = 5,
    weight_sparse: float = 0.5,
    reranker_cutoff: int = 20  # Number of top results to rerank
):
    """Hybrid retrieval: fuse BM25 and dense scores, then rerank.

    Args:
        query: Search query.
        bm25: BM25 model exposing ``get_scores(tokens)``; score ``i`` is taken
            to belong to ``chunks[i]``.
        chunks: Chunk texts the BM25 model was built from.
        model: Embedding model. ``embed_query(query)`` is used when available,
            otherwise ``encode(query, convert_to_tensor=False)``.
        embedding_index: Vector index exposing
            ``query(vector=..., top_k=..., include_metadata=True)`` whose
            matches carry ``id``, ``score`` and ``metadata['text']``.
        k: Number of results to return.
        weight_sparse: Weight of the normalized BM25 score; the dense score
            gets ``1 - weight_sparse``.
        reranker_cutoff: Number of candidates taken from each retriever and
            passed to the reranker.

    Returns:
        Up to ``k`` dicts with ``id``, ``score`` and ``metadata`` (``text``,
        ``sparse_score``, ``dense_score`` and, when reranking produced scores,
        ``rerank_score``), sorted by ``score`` descending.
    """
    # Sparse side: BM25 over the tokenized query
    tokenized_query = nltk.word_tokenize(query)
    bm25_scores = np.asarray(bm25.get_scores(tokenized_query), dtype=float)
    bm25_scores_norm = _min_max_normalize(bm25_scores)
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:reranker_cutoff]

    # Dense side: embed the query with whichever interface the model offers
    if hasattr(model, 'embed_query'):
        query_embedding = list(model.embed_query(query))
    else:
        query_embedding = model.encode(query, convert_to_tensor=False).tolist()
    # The vector is passed by keyword, not positionally.
    embedding_results = embedding_index.query(
        vector=query_embedding, top_k=reranker_cutoff, include_metadata=True
    )
    matches = embedding_results['matches']
    dense_scores_norm = _min_max_normalize([match['score'] for match in matches])

    # Per-document accumulator; a document missing from one retriever keeps 0.0 there
    fusion_scores = defaultdict(lambda: {'sparse': 0.0, 'dense': 0.0, 'text': ''})

    # BM25 ids are chunk positions, so the text comes straight from `chunks`.
    for chunk_idx in bm25_top_indices:
        doc_id = str(chunk_idx)
        fusion_scores[doc_id]['sparse'] = float(bm25_scores_norm[chunk_idx])
        fusion_scores[doc_id]['text'] = chunks[chunk_idx]

    for match, norm_score in zip(matches, dense_scores_norm):
        doc_id = match['id']
        fusion_scores[doc_id]['dense'] = float(norm_score)
        fusion_scores[doc_id]['text'] = match['metadata']['text']

    # Combine scores using a weighted average
    weight_dense = 1.0 - weight_sparse
    initial_results = [
        {
            'id': doc_id,
            'score': (
                weight_sparse * scores['sparse'] +
                weight_dense * scores['dense']
            ),
            'metadata': {
                'text': scores['text'],
                'sparse_score': scores['sparse'],
                'dense_score': scores['dense']
            }
        }
        for doc_id, scores in fusion_scores.items()
    ]

    # Sort by combined score and keep the rerank candidates
    initial_results.sort(key=lambda x: x['score'], reverse=True)
    initial_results = initial_results[:reranker_cutoff]
    if not initial_results:
        return []

    # Rerank: the reranker score replaces the fused score as the final score
    pairs = [(query, result['metadata']['text']) for result in initial_results]
    rerank_scores = get_reranker_score(pairs)

    # If the reranker yields no scores, keep the fused ordering instead of failing.
    if rerank_scores is not None:
        for result, rerank_score in zip(initial_results, rerank_scores):
            result['metadata']['rerank_score'] = float(rerank_score)
            result['score'] = float(rerank_score)
        initial_results.sort(key=lambda x: x['score'], reverse=True)

    return initial_results[:k]

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from tqdm import tqdm
import pandas as pd

def evaluate_rag_system(
    best_answers_df: pd.DataFrame,
    bm25,
    chunks: list[str],
    embedding_model,
    embedding_index,
    llm_chain,
    n_samples: int = None,  # Optional: limit number of samples for testing
    reranker_cutoff: int = 20,
    k: int = 5,
    weight_sparse: float = 0.5
):
    """Run retrieval + generation per question and score answers with BLEU/ROUGE.

    Args:
        best_answers_df: Frame with ``question`` and ``answer`` columns.
        bm25: BM25 model forwarded to ``fusion_rank_search``.
        chunks: Chunk texts forwarded to ``fusion_rank_search``.
        embedding_model: Forwarded to ``fusion_rank_search`` as ``model``.
        embedding_index: Vector index forwarded to ``fusion_rank_search``.
        llm_chain: Object exposing ``invoke({"context", "question"})``; the
            result's ``content`` attribute is used when present, otherwise the
            result itself.
        n_samples: Evaluate only the first ``n_samples`` rows; a falsy value
            (``None`` or ``0``) evaluates every row.
        reranker_cutoff: Forwarded to ``fusion_rank_search``.
        k: Number of retrieved contexts handed to the LLM.
        weight_sparse: BM25 weight forwarded to ``fusion_rank_search``.

    Returns:
        ``(results_df, avg_scores)``: one row per successfully processed
        question, and a dict of mean BLEU / ROUGE-1 / ROUGE-2 / ROUGE-L. The
        means are NaN when no question could be processed.
    """
    # Initialize ROUGE scorer
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Initialize results storage
    results = []

    # Get subset of dataframe if n_samples is specified
    eval_df = best_answers_df.head(n_samples) if n_samples else best_answers_df

    # Iterate through questions and answers
    for idx, row in tqdm(eval_df.iterrows(), total=len(eval_df), desc="Evaluating Questions"):
        query = row['question']
        reference_answer = row['answer']

        try:
            # Get relevant context using fusion ranking
            retrieved_results = fusion_rank_search(
                query=query,
                bm25=bm25,
                chunks=chunks,
                model=embedding_model,
                embedding_index=embedding_index,
                k=k,
                weight_sparse=weight_sparse,
                reranker_cutoff=reranker_cutoff
            )

            # Prepare context for LLM
            retrieved_texts = [res['metadata']['text'] for res in retrieved_results]
            context = "\n".join(retrieved_texts)

            # Generate answer using LLM
            llm_response = llm_chain.invoke({
                "context": context,
                "question": query
            })
            generated_answer = llm_response.content if hasattr(llm_response, 'content') else llm_response

            # Calculate BLEU score on whitespace tokens
            reference_tokens = [reference_answer.split()]
            candidate_tokens = generated_answer.split()
            bleu_score = sentence_bleu(reference_tokens, candidate_tokens)

            # Calculate ROUGE scores
            rouge_scores = rouge_scorer_instance.score(reference_answer, generated_answer)

            # Store results
            results.append({
                'question': query,
                'reference_answer': reference_answer,
                'generated_answer': generated_answer,
                'bleu_score': bleu_score,
                'rouge1_f1': rouge_scores['rouge1'].fmeasure,
                'rouge2_f1': rouge_scores['rouge2'].fmeasure,
                'rougeL_f1': rouge_scores['rougeL'].fmeasure,
                'retrieved_contexts': retrieved_texts,
                'context_scores': [res['score'] for res in retrieved_results]
            })

        except Exception as e:
            # Best effort: report the failing question and keep evaluating.
            print(f"Error processing question {idx}: {str(e)}")
            continue

    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    # Each average reads its own column (ROUGE-L previously reused rouge2_f1).
    # When every question failed the frame has no columns, so report NaN
    # instead of raising KeyError.
    metric_columns = {
        'Average BLEU': 'bleu_score',
        'Average ROUGE-1': 'rouge1_f1',
        'Average ROUGE-2': 'rouge2_f1',
        'Average ROUGE-L': 'rougeL_f1'
    }
    avg_scores = {
        label: (results_df[column].mean() if column in results_df else float('nan'))
        for label, column in metric_columns.items()
    }

    return results_df, avg_scores

# Reporting helper: pretty-print the averages and a sample of detailed results
def print_evaluation_results(results_df, avg_scores):
    """Print the averaged metrics followed by the first three detailed rows.

    Args:
        results_df: Per-question results with the columns ``question``,
            ``reference_answer``, ``generated_answer``, ``bleu_score``,
            ``rouge1_f1``, ``rouge2_f1``, ``rougeL_f1``,
            ``retrieved_contexts`` and ``context_scores``.
        avg_scores: Mapping of metric label to its numeric average.
    """
    print("\nAverage Scores:")
    for label in avg_scores:
        value = avg_scores[label]
        print(f"{label}: {value:.4f}")

    print("\nDetailed Results Sample (first 3):")
    preview = results_df.head(3)
    for _, sample in preview.iterrows():
        print("\nQuestion:", sample["question"])
        print("Reference Answer:", sample["reference_answer"])
        print("Generated Answer:", sample["generated_answer"])

        bleu = sample["bleu_score"]
        rouge1 = sample["rouge1_f1"]
        rouge2 = sample["rouge2_f1"]
        rouge_l = sample["rougeL_f1"]
        print(f"BLEU Score: {bleu:.4f}")
        print(f"ROUGE-1 F1: {rouge1:.4f}")
        print(f"ROUGE-2 F1: {rouge2:.4f}")
        print(f"ROUGE-L F1: {rouge_l:.4f}")

        print("\nRetrieved Contexts:")
        scored_contexts = zip(sample["retrieved_contexts"], sample["context_scores"])
        for passage, passage_score in scored_contexts:
            print(f"Score: {passage_score:.4f}")
            # Only the first 200 characters of each context are shown.
            print(f"Context: {passage[:200]}...")

In [None]:
# Run evaluation of the contextual pipeline: the BM25 model and the Pinecone
# index must both be the ones built from `chunks_with_context`.
results_df, avg_scores = evaluate_rag_system(
    best_answers_df=best_answers_df,
    bm25=bm25_contextual,
    chunks=chunks_with_context,
    embedding_model=embedding_model,  # the parameter is `embedding_model`, not `model`
    embedding_index=pc.Index(EMBEDDING_INDEX_CONTEXTUAL),  # index handle, not just its name
    llm_chain=llm_chain,
    reranker_cutoff=20,
    # n_samples=10  # Optional: start with a small sample for testing
)

# Print results
print_evaluation_results(results_df, avg_scores)

# Save results to CSV (optional)
results_df.to_csv('rag_evaluation_results.csv', index=False)