In [35]:
import os
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.evaluation import RelevancyEvaluator
from llama_index.core.evaluation.eval_utils import get_responses
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core import StorageContext
from qdrant_client import QdrantClient
from duckduckgo_search import DDGS
from google import genai
from llama_index.core.prompts import PromptTemplate
import typing_extensions as typing
from typing import TypedDict, Literal
from google.genai import types
import json
import markdown
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
genai.__version__

'1.7.0'

In [3]:
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

client = genai.Client(api_key=GOOGLE_API_KEY)

In [4]:
question = "How is Flat white different from cappuccino?"
DATA_DIR = 'data'
COLLECTION_NAME = "coffee-recipes"

LlamaIndex will go to the `data` folder, load the PDFs and convert it to indexes which are basically vector embeddings.

- `VectorStoreIndex`: responsible for converting all the text into vectors and it'll index those vectors
- `SimpleDirectoryReader`: used to read from a directory

In [5]:
# load PDF 
reader = SimpleDirectoryReader(DATA_DIR)
documents = reader.load_data()

In [6]:
#contains metadata of the PDF
documents

[Document(id_='f7ca6476-b792-45e5-82a7-be6a9c9546e9', embedding=None, metadata={'page_label': '1', 'file_name': 'Brochure_Basic-Creative-coffee-recipes.pdf', 'file_path': '/Users/aasth/Desktop/Linkedin Post/corrective_rag/data/Brochure_Basic-Creative-coffee-recipes.pdf', 'file_type': 'application/pdf', 'file_size': 926043, 'creation_date': '2025-04-03', 'last_modified_date': '2025-04-03'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Ultimate coffee pleasure\nCOFFEE ACADEMY\n', path=None, url=None, mimetype=None), image_resource=None, audio_resource=None, video_resource=None, text_template='{metadata_str}\n\n{content}'),
 Document(id_='

In [7]:
#Loads a lightweight embedding model to convert text into vector format for similarity search
embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")

#Initializes an in-memory Qdrant database to store and search vector embedding
qdrant_client = QdrantClient(location=":memory:")

#Connects the Qdrant client to LlamaIndex as a vector store using the specified collection name
vector_store = QdrantVectorStore(client=qdrant_client, collection_name=COLLECTION_NAME)

#Creates a storage context that LlamaIndex will use to manage and store indexed data.
#Tells llama-index how/where to store and retrieve data.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

Fetching 5 files: 100%|██████████| 5/5 [00:01<00:00,  3.42it/s]


In [8]:
# Creates a vector index from the loaded documents by embedding them and storing them in the Qdrant vector store
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context, embed_model=embed_model)

#Sets up a retriever to fetch the top 4 most similar documents from the index for any given query
retriever = VectorIndexRetriever(index=index, similarity_top_k=4)

  self._client.create_payload_index(


In [66]:
#role prompting
DEFAULT_RELEVANCY_PROMPT_TEMPLATE = PromptTemplate(
    template="""As a grader, your task is to evaluate the relevance of a document retrieved in response to a user's question.

    Retrieved Document:
    -------------------
    {context_str}

    User Question:
    --------------
    {query_str}

    Evaluation Criteria:
    - Consider whether the document contains keywords or topics related to the user's question.
    - The evaluation should not be overly stringent; the primary objective is to identify and filter out clearly irrelevant retrievals.

    Decision:
    - Assign a binary score to indicate the document's relevance.
    - Use 'yes' if the document is relevant to the question, or 'no' if it is not.

    Please provide your binary score ('yes' or 'no') below to indicate the document's relevance to the user question."""
)

In [67]:
def web_search(query):
    with DDGS() as ddgs:
        results = ddgs.text(query)
        # print("result:")
        # print(results[0])
        # print("****")
        return [r['body'] for r in results if 'body' in r][:3]

In [78]:
class Answer(TypedDict):
    answer: str
    reasoning: str
    source_name: Literal["vector database", "web search"]
    citations: str

In [None]:
#system and CoT prompting with JSON format
STRUCTURED_ANSWER_PROMPT = """
Based on the documents below, answer the user's question. Respond in JSON format:

Documents:
{context}

User Question:
{query}

Example Response:
{{
  "answer": "your answer here",
  "reasoning": "step-by-step thought process here",
  "source_name": "vector database or web search",
  "citations": "Quoted content or doc ID or file name"
}}

Let's think step by step.
"""

In [80]:
def answer_query(query):
    nodes = retriever.retrieve(query)
    # print("node:")
    # print(nodes[0])
    relevant_nodes = []
    relevancy_results = []

    for node in nodes:
        prompt = DEFAULT_RELEVANCY_PROMPT_TEMPLATE.format(
            context_str=node.text,
            query_str=query
        )
        response = client.models.generate_content(
            model="gemini-1.5-flash-001",
            config=types.GenerateContentConfig(
            response_mime_type='application/json',
            response_schema=Answer,
            ),
            contents=prompt,
        )
        decision = response.text.strip().lower()
        relevancy_results.append(decision)
        if decision == "yes":
            relevant_nodes.append(node)

    if not relevant_nodes:
        web_results = web_search(query)
        combined = "\n".join(web_results)
        final_prompt = STRUCTURED_ANSWER_PROMPT.format(context=combined, query=query)
        response = client.models.generate_content(
            model="gemini-1.5-flash-001",
            config=types.GenerateContentConfig(
                response_mime_type = 'application/json',
                response_schema = Answer,
                temperature=0.0
            ),
            contents=final_prompt
        )
        return json.loads(response.text), combined
    else:
        relevant_text = "\n\n".join([f"[Source: {n.metadata.get('file_name')}] {n.text}" for n in relevant_nodes])
        final_prompt = STRUCTURED_ANSWER_PROMPT.format(context=relevant_text, query=query)
        response = client.models.generate_content(
            model="gemini-1.5-flash-001",
            config=types.GenerateContentConfig(
                response_mime_type = 'application/json',
                response_schema = Answer,
                temperature=0.0
            ),
            contents=final_prompt
        )
        return json.loads(response.text), relevant_text


In [81]:
final_response,relevant_text = answer_query(question)
print(json.dumps(final_response, indent=4))

{
    "answer": "The main difference between a flat white and a cappuccino is the milk-to-coffee ratio. A flat white has a 1:3 ratio of coffee to steamed milk, while a cappuccino has a 1:2 or 1:3 ratio. This means a cappuccino is stronger than a flat white. Additionally, the texture of the milk is different. A cappuccino has a thick and dense microfoam, while a flat white has a thin layer of foam.",
    "reasoning": "The question asks about the difference between a flat white and a cappuccino. The documents provide information about the milk-to-coffee ratio and the texture of the milk in each drink. The first document states that a cappuccino has a 1:1 ratio of coffee to steamed milk, while a flat white has a 1:3 ratio. This means a cappuccino is stronger. The second document also mentions the milk-to-coffee ratio and states that a cappuccino has a 1:2 or 1:3 ratio. The first document also mentions that a cappuccino has a thick and dense microfoam, while a flat white has a thin layer o

In [82]:
import enum

# Define the evaluation prompt for RAG
# role prompting
RAG_EVAL_PROMPT = """\
# Instruction
You are an expert evaluator for Retrieval-Augmented Generation (RAG) systems. Your task is to evaluate the quality of the AI-generated response based on the given user prompt and the retrieved context.

You will assess how well the response:
- Follows the instructions in the user prompt
- Is grounded in the context
- Is complete and provides a helpful answer
- Is fluent and easy to read

Please give step-by-step reasoning and assign a score using the Rating Rubric.

# Evaluation
## Metric Definition
You will assess question answering quality in a RAG setting, where the model is expected to answer the user's query using only the provided context. Responses should be relevant, well-structured, and avoid hallucinations.

## Criteria
- Instruction Following: Does the response fulfill the prompt's requirements (e.g., format, answer type, word limits)?
- Groundedness: Does the response rely solely on the context provided? No outside or hallucinated info?
- Completeness: Does it fully and correctly answer the user query using the context?
- Fluency: Is the response well-written, clear, and grammatically correct?

## Rating Rubric
5: (Very good) Follows instructions, grounded, complete, and fluent.
4: (Good) Mostly grounded and complete, minor issues in fluency or relevance.
3: (Fair) Partially complete, some hallucination or vague phrasing, moderate fluency issues.
2: (Bad) Lacks completeness or relevance; possible hallucinations or missed instructions.
1: (Very bad) Hallucinated, off-topic, ignores prompt/context.

## Evaluation Steps
STEP 1: Review the prompt, context, and response.
STEP 2: Assess the 4 criteria.
STEP 3: Justify your score.
STEP 4: Select a score from 5, 4, 3, 2, or 1.

# User Inputs and AI-generated Response
## User Prompt
{prompt}

## Retrieved Context
{context}

## AI-generated Response
{response}
"""


def eval_rag_response(prompt, context, ai_response):
    """Evaluate a RAG-generated response using Gemini chat."""
    chat = client.chats.create(model='gemini-2.0-flash')

    # Construct the full evaluation prompt
    full_prompt = RAG_EVAL_PROMPT.format(prompt=prompt, context=context, response=ai_response)

    # Get explanation and verbose score
    response = chat.send_message(message=full_prompt)
    verbose_eval = response.text


    return verbose_eval


In [83]:
text_eval = eval_rag_response(
    prompt=question,
    context=relevant_text,
    ai_response=final_response['answer']
)

print(text_eval)   


STEP 1: Review the prompt, context, and response.
The prompt asks for the differences between flat white and cappuccino. The context provides information about the ratios of milk to coffee, texture, and history of both drinks. The response summarizes the differences in the milk-to-coffee ratio and the texture.

STEP 2: Assess the 4 criteria.
- Instruction Following: The response correctly identifies and explains the differences between flat white and cappuccino as described in the context.
- Groundedness: The response is entirely based on the information provided in the context.
- Completeness: The response covers the main differences in ratio and texture, providing a helpful answer.
- Fluency: The response is well-written and easy to understand.

STEP 3: Justify your score.
The response successfully follows instructions, is grounded in the context, provides a complete answer, and is fluent. There are no issues.

STEP 4: Select a score from 5, 4, 3, 2, or 1.
5

