# GOAL



In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from langchain_community.document_loaders import PyPDFLoader
from dotenv import load_dotenv

load_dotenv()

file_path = (
    "../practicos-rag/data/benchmark_data/Reglamento 1333 2008.pdf"
)
loader = PyPDFLoader(file_path)
docs = loader.load()

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=50)
splits = text_splitter.split_documents(docs)

In [4]:
from langchain_ollama import OllamaEmbeddings
from langchain_openai import OpenAIEmbeddings
import os
#embeddings = OllamaEmbeddings(model="llama3")
embeddings = OpenAIEmbeddings(model="text-embedding-3-small",
                            )

In [5]:
from langchain_qdrant import QdrantVectorStore
from langchain_qdrant import RetrievalMode

qdrant = QdrantVectorStore.from_documents(
    splits,
    embedding=embeddings,
    location=":memory:",
    collection_name="my_documents",
    retrieval_mode=RetrievalMode.DENSE,
)

In [6]:
retriever = qdrant.as_retriever()

In [7]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain_ollama import OllamaLLM
from langchain_openai import ChatOpenAI


llm = ChatOpenAI(model="gpt-4-turbo-preview") 

# Define prompt template
template = """Utilize the retrieved context below to answer the question.
If you're unsure of the answer, simply state you don't know and apologies
Keep your response concise, limited to two sentences.
Question: {question}
Context: {context}
"""

prompt = ChatPromptTemplate.from_template(template)

# Setup RAG pipeline
rag_chain = (
    {"context": retriever,  "question": RunnablePassthrough()} 
    | prompt 
    | llm
    | StrOutputParser() 
)

In [8]:
rag_chain.invoke("What is the purpose of the regulation?")

'The purpose of Regulation (EC) No 1333/2008 is to lay down rules on food additives used in foods, aiming to ensure food safety and protect public health.'

In [9]:
QA_generation_prompt = ChatPromptTemplate.from_template("""
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}
Output:::""")

# Create a chain to create a question
question_chain = (
    {"context": RunnablePassthrough()}
    | QA_generation_prompt
    | llm
    | StrOutputParser()
)

In [10]:
import random
from tqdm import tqdm

sampled_docs = random.sample(docs, 15)
sampled_docs_processed = [doc.page_content for doc in sampled_docs]

In [11]:
questions = [question_chain.invoke({"context": sampled_context}) for sampled_context in tqdm(sampled_docs_processed)]

100%|██████████| 15/15 [00:21<00:00,  1.46s/it]


In [12]:
questions_processed = []
ground_truth = []
for question in questions:
    questions_processed.append(question.split("Factoid question: ")[-1].split("Answer: ")[0])
    ground_truth.append(question.split("Factoid question: ")[-1].split("Answer: ")[1])

In [13]:
contexts = []
answers = []
# Inference
for query in questions:
    answers.append(rag_chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])


  contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])


In [14]:
data = {
    "question": questions,
    "answer": answers,
    "reference": ground_truth,
    "retrieved_contexts": contexts
}

In [15]:
from datasets import Dataset

# Convert dict to dataset
dataset = Dataset.from_dict(data)

In [16]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

In [17]:
result = evaluate(
    dataset = dataset,
    llm=llm,
    embeddings=embeddings,
    metrics=[
        context_recall,
        faithfulness,
        answer_relevancy,
        context_precision,
    ],)

df = result.to_pandas()
df

Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]

Exception raised in Job[33]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4-turbo-preview in organization org-1IccE2reLGyjkhxuj0CwkN90 on tokens per min (TPM): Limit 30000, Used 29410, Requested 1162. Please try again in 1.144s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Exception raised in Job[25]: TimeoutError()
Exception raised in Job[28]: TimeoutError()


Unnamed: 0,user_input,retrieved_contexts,response,reference,context_recall,faithfulness,answer_relevancy,context_precision
0,Factoid question: What is the document number ...,[Regulation (EC) No 1333/2008 of the European ...,The document number of the regulation mentione...,Regulation (EC) No 1333/2008,1.0,1.0,0.668864,1.0
1,Factoid question: What is the E number for Asc...,[E 290 Carbon dioxide\nE 296 Malic acid\nE 300...,The E number for Ascorbic acid according to Re...,E 300,1.0,1.0,0.874058,0.805556
2,Factoid question: What is the E number assigne...,[E 529 Calcium oxide\nE 530 Magnesium oxide\n[...,The E number assigned to Calcium oxide accordi...,E 529,1.0,0.5,0.897507,1.0
3,Factoid question: What regulation covers food ...,[are covered by Regulation (EC) No 1332/2008 o...,Regulation (EC) No 1332/2008 covers food enzym...,Regulation (EC) No 1332/2008,1.0,1.0,0.629246,1.0
4,Factoid question: What is the E number for iso...,[E 943b Isobutane\nE 944 Propane\nE 948 Oxygen...,The E number for isobutane is E 943b.,E 943b,1.0,1.0,0.860928,0.75
5,Factoid question: What is the E-number for Tit...,[E 172 Iron oxides and hydroxides\n(3) Group I...,The E-number for Titanium dioxide as per Regul...,E 171,1.0,1.0,0.874524,0.583333
6,Factoid question: What is the E-number assigne...,"[[F79(k)\nE 310–320: PROPYL GALLATE, TBHQ AND ...",The E-number assigned to Butylated hydroxyanis...,E 320,1.0,,0.872277,1.0
7,Factoid question: What is the regulation numbe...,[REGULATION (EC) No 1333/2008 OF THE\nEUROPEAN...,The regulation number is Regulation (EC) No 13...,Regulation (EC) No 1333/2008,,1.0,0.483636,1.0
8,Factoid question: What is the maximum level of...,[nutrientsInfant formulae and follow-on formul...,The maximum level of silicon dioxide (E 551) a...,10 000 mg/kg,1.0,,0.888573,0.833333
9,Factoid question: What is the obligation of a ...,[any new scientific or technical information w...,They must immediately inform the Authority of ...,They must immediately inform the Authority of ...,1.0,1.0,0.727461,1.0


In [22]:
df.to_csv("baseline_ragas_results.csv", index=False)

In [29]:
# get mean of the metrics column by column
print("Mean Faithfulness: ", round(df["faithfulness"].mean(), 4))
print("Mean Answer relevancy: ", round(df["answer_relevancy"].mean(), 4))
print("Mean Context recall: ", round(df["context_recall"].mean(), 4))
print("Mean Context precision: ", round(df["context_precision"].mean(), 4))


Mean Faithfulness:  0.9423
Mean Answer relevancy:  0.7836
Mean Context recall:  1.0
Mean Context precision:  0.8944


## Adding a reranker step

In [18]:
query = "What is the purpose of the regulation?"

retrieved_docs = retriever.get_relevant_documents(query, kwargs={"k": 10})

In [19]:
retrieved_docs

[Document(metadata={'source': '../practicos-rag/data/benchmark_data/Reglamento 1333 2008.pdf', 'page': 3, '_id': '009810801d8a4da28d42c752624e9e3e', '_collection_name': 'my_documents'}, page_content='Regulation and to adopt appropriate transitional measures. Since those measures are\nof general scope and are designed to amend non-essential elements of this Regulation,'),
 Document(metadata={'source': '../practicos-rag/data/benchmark_data/Reglamento 1333 2008.pdf', 'page': 5, '_id': '8abb64a3aa7f4e768e993b256ed47a3d', '_collection_name': 'my_documents'}, page_content='HAVE ADOPTED THIS REGULATION:\nCHAPTER I\nSUBJECT MATTER, SCOPE AND DEFINITIONS\nArticle 1\nSubject matter\nThis Regulation lays down rules on food additives used in foods with a view to ensuring'),
 Document(metadata={'source': '../practicos-rag/data/benchmark_data/Reglamento 1333 2008.pdf', 'page': 6, '_id': '12467865f29c4033b63bcca25e844817', '_collection_name': 'my_documents'}, page_content='Regulation (EC) No 1333/200

In [99]:
import cohere as co
cohere_client = co.Client(os.getenv("COHERE_API_KEY"))
def rerank_docs(query, retrieved_docs):
    reranked_docs = cohere_client.rerank(
        model="rerank-english-v3.0",
        query=query,
        documents=retrieved_docs,
        rank_fields=["page_content"],
        return_documents=True
    )
    return reranked_docs

from rerankers import Reranker

def open_source_reranker(query, retrieved_docs):
    #reranker = Reranker('cross-encoder', verbose=0,model_type='cross-encoder')
    reranker = Reranker("colbert")
    retrieved_docs = [doc.page_content for doc in retrieved_docs]
    reranked_docs = reranker.rank(query, retrieved_docs)
    return reranked_docs


In [100]:
reranked_docs = open_source_reranker(query, retrieved_docs)

Loading default colbert model for language en
Default Model: colbert-ir/colbertv2.0
Loading ColBERTRanker model colbert-ir/colbertv2.0 (this message can be suppressed by setting verbose=0)
No device set
Using device mps
No dtype set
Using dtype torch.float32
Loading model colbert-ir/colbertv2.0, this might take a while...
Linear Dim set to: 128 for downcasting


In [101]:
reranked_docs.results

[Result(document=Document(document_type='text', text='E 170 Calcium carbonate\nE 260 Acetic acid\n[F64E 261 Potassium acetates]\nE 262 Sodium acetates\nE 263 Calcium acetate\nE 270 Lactic acid', base64=None, image_path=None, doc_id=2, metadata={}), score=0.8182083368301392, rank=1),
 Result(document=Document(document_type='text', text='E 968 Erythritol\n[F60E 969 Advantame]\n3. Additives other than colours and sweeteners\nE-number Name\nE 170 Calcium carbonate\n[F45E 172 Iron oxides and hydroxides]\nE 200 Sorbic acid\nE 202 Potassium sorbate\nF62\nE 210 Benzoic acid \na', base64=None, image_path=None, doc_id=3, metadata={}), score=0.8180067539215088, rank=2),
 Result(document=Document(document_type='text', text='E 529 Calcium oxide\nE 530 Magnesium oxide\n[F70E 534 Iron tartrate]\nE 535 Sodium ferrocyanide\nE 536 Potassium ferrocyanide\nE 538 Calcium ferrocyanide\nE 541 Sodium aluminium phosphate acidic\nE 551 Silicon dioxide\nE 552 Calcium silicate', base64=None, image_path=None, doc_

In [102]:
contexts = []
answers = []
# Inference
for query in questions:
    answers.append(rag_chain.invoke(query))
    retrieved_docs = retriever.get_relevant_documents(query)
    reranked_docs = open_source_reranker(query, retrieved_docs)
    if reranked_docs.results:  # Check if there are any results
        contexts.append([reranked_docs.results[0].document.text])

data = {
    "question": questions,
    "answer": answers,
    "reference": ground_truth,
    "retrieved_contexts": contexts
}

Loading default colbert model for language en
Default Model: colbert-ir/colbertv2.0
Loading ColBERTRanker model colbert-ir/colbertv2.0 (this message can be suppressed by setting verbose=0)
No device set
Using device mps
No dtype set
Using dtype torch.float32
Loading model colbert-ir/colbertv2.0, this might take a while...
Linear Dim set to: 128 for downcasting
Loading default colbert model for language en
Default Model: colbert-ir/colbertv2.0
Loading ColBERTRanker model colbert-ir/colbertv2.0 (this message can be suppressed by setting verbose=0)
No device set
Using device mps
No dtype set
Using dtype torch.float32
Loading model colbert-ir/colbertv2.0, this might take a while...
Linear Dim set to: 128 for downcasting
Loading default colbert model for language en
Default Model: colbert-ir/colbertv2.0
Loading ColBERTRanker model colbert-ir/colbertv2.0 (this message can be suppressed by setting verbose=0)
No device set
Using device mps
No dtype set
Using dtype torch.float32
Loading model c

In [103]:
reranked_dataset = Dataset.from_dict(data)
result = evaluate(
    dataset = reranked_dataset,
    llm=llm,
    embeddings=embeddings,
    metrics=[
        context_recall,
        faithfulness,
        answer_relevancy,
        context_precision,
    ],)
reranked_df = result.to_pandas()
reranked_df.to_csv("reranked_ragas_results.csv", index=False)


Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]

In [104]:
# get mean of the metrics column by column
print("Mean Faithfulness: ", round(reranked_df["faithfulness"].mean(), 4))
print("Mean Answer relevancy: ", round(reranked_df["answer_relevancy"].mean(), 4))
print("Mean Context recall: ", round(reranked_df["context_recall"].mean(), 4))
print("Mean Context precision: ", round(reranked_df["context_precision"].mean(), 4))

Mean Faithfulness:  0.75
Mean Answer relevancy:  0.8118
Mean Context recall:  0.9333
Mean Context precision:  1.0
