#### Evaluation of RAG

In [1]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Blog posts that make up the knowledge base for the RAG pipeline.
urls=[
    'https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/',
    'https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/',
    'https://lilianweng.github.io/posts/2023-06-23-agent/',
]


# Load each page exactly once. Passing the whole `urls` list to the loader
# inside the loop fetched every page len(urls) times and indexed each chunk
# in triplicate, so retrieval returned duplicate documents.
docs=[WebBaseLoader(url).load() for url in urls]

# Flatten the per-URL lists into a single list of documents.
docs_list=[item for sublist in docs for item in sublist]
text_splitter=RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=250,chunk_overlap=0
)

docs_splits=text_splitter.split_documents(docs_list)

vector_store=InMemoryVectorStore.from_documents(
    documents=docs_splits,
    embedding=OpenAIEmbeddings(model='text-embedding-3-small')
)

# The number of results is configured through `search_kwargs`; a bare `k=6`
# keyword is not a retriever setting and the default result count was used.
retriever=vector_store.as_retriever(search_kwargs={'k':6})

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
# Smoke-test the retriever with a sample query; the cell displays the returned Documents.
retriever.invoke('What is the agents')

[Document(id='0b8ae5cd-645d-4d52-8779-b3d59ef3092b', metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'title': "LLM Powered Autonomous Agents | Lil'Log", 'description': 'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\n\nPlanning\n\nSubgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\nReflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, 

In [None]:
from langchain.chat_models import init_chat_model
# Chat model that rag_bot calls to generate the final answer.
llm=init_chat_model(model='openai:gpt-4o-mini')

In [4]:
from langsmith import traceable

@traceable
def rag_bot(question:str)->dict:
    """Answer `question` from retrieved context.

    Retrieves documents for the question, concatenates their page contents
    into the system prompt, and asks the chat model for an answer.

    Returns a dict with:
      - 'answer': the content of the model's reply
      - 'documents': the retrieved documents, as returned by the retriever
    """
    retrieved=retriever.invoke(question)
    context=" ".join([chunk.page_content for chunk in retrieved])

    system_prompt=f"""You are a helpful assistant who is good at analyzing source information and answering questions.
     use three sentences maximum and keep the answer concise
     
     Documents:{context}"""

    # System message carries the instructions and context; the user message is the raw question.
    messages=[
        {'role':'system','content':system_prompt},
        {'role':'user','content':question},
    ]
    reply=llm.invoke(messages)

    return {'answer':reply.content,'documents':retrieved}

In [5]:
# Run the RAG bot once on a sample question; the cell displays the answer and retrieved documents.
rag_bot("What is agents?")

{'answer': 'Agents, in this context, refer to virtual characters controlled by LLM-powered systems that simulate human behavior and interactions within a constructed environment, such as a sandbox. They utilize mechanisms like memory, planning, and reflection to behave based on past experiences and interact with other agents.',
 'documents': [Document(id='0b8ae5cd-645d-4d52-8779-b3d59ef3092b', metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'title': "LLM Powered Autonomous Agents | Lil'Log", 'description': 'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, comple

In [53]:
from langsmith import Client
# LangSmith client used below to create the dataset and run the evaluation.
client=Client()

# Evaluation examples: each entry pairs a question ("inputs") with a
# reference answer ("outputs") that the correctness evaluator compares against.
examples = [
    {
        "inputs": {"question": "What is a threat model in adversarial attacks on large language models?"},
        "outputs": {"answer": "According to Lilian Weng, a threat model in adversarial attacks on LLMs refers to the assumptions about what the adversary knows, what access they have (e.g. inference only vs. white-box), and what kind of behavior (classification or generation) is being attacked at inference time while the model weights are fixed."}
    },
    {
        "inputs": {"question": "What is the difference between white-box and black-box adversarial attacks on LLMs?"},
        "outputs": {"answer": "White-box attacks assume access to model weights, architecture, or training process, allowing use of gradient signals, while black-box attacks only allow interaction through input/output, with no direct access to internal parameters."}
    },
    {
        "inputs": {"question": "What is token manipulation as a type of adversarial attack?"},
        "outputs": {"answer": "Token manipulation is a black-box adversarial method where a small number of tokens in text input are altered—using synonyms, insertions, deletions or swaps—while preserving the overall semantics, in order to trigger erroneous or unsafe outputs."}
    },
    {
        "inputs": {"question": "What is a jailbreak prompt and how is it used in adversarial attacks?"},
        "outputs": {"answer": "A jailbreak prompt is a heuristic based prompt designed to bypass or subvert the model’s built-in safety or refusal behavior—things like prefix injection, style injection, suppression of refusal formats, role-playing attacks are examples, all intended to induce the model to produce content it should refuse under normal safety constraints."}
    },
    {
        "inputs": {"question": "How does human red-teaming work in mitigating adversarial attacks?"},
        "outputs": {"answer": "Human red-teaming involves people crafting adversarial examples to test model safety, often with tools that highlight token importance or allow prompt rewrites, and results of red teams are used to train classifiers or adjust safety logic."}
    },
    {
        "inputs": {"question": "What is prompt engineering according to Lilian Weng’s blog?"},
        "outputs": {"answer": "Prompt engineering is the practice of designing the wording, format, and structure of prompts—for example, chain-of-thought, zero-shot, few-shot, or instruction templates—in order to guide performance of LLMs more reliably and reduce errors or ambiguity."}
    },
    {
        "inputs": {"question": "What are agents in the context of LLMs as explained in the 'Agent' blog by Lilian Weng?"},
        "outputs": {"answer": "Agents are systems built on LLMs that can use external tools, perform actions, plan, retrieve knowledge, combine reasoning, maintain context, and act adaptively rather than just responding passively to prompts."}
    }
]



dataset_name='rag-evaluation'

# Make the cell safe to re-run: creating a dataset whose name already exists
# raises a conflict error, and re-uploading would duplicate every example.
# Reuse the existing dataset instead and only upload on first creation.
if client.has_dataset(dataset_name=dataset_name):
    dataset=client.read_dataset(dataset_name=dataset_name)
    print(f"Dataset '{dataset_name}' already exists; skipping upload.")
else:
    dataset=client.create_dataset(dataset_name=dataset_name)

    for ex in examples:
        client.create_example(
            dataset_id=dataset.id,
            inputs=ex["inputs"],
            outputs=ex["outputs"]
        )

    print("All examples uploaded successfully!")


All examples uploaded successfully!


### Metrics

##### 1. Correctness: Response vs. Reference Answer

In [54]:
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from typing_extensions import Annotated

# ======= 1. Correctness =======
class CorrectnessGrade(BaseModel):
    """Structured verdict returned by the correctness grader."""

    # A bare string inside Annotated is ignored by pydantic, so the guidance
    # never reached the JSON schema sent to the model. Field(description=...)
    # puts it there. `explanation` stays first so reasoning precedes the verdict.
    explanation: Annotated[str, Field(description="Explain your reasoning")]
    correct: Annotated[bool, Field(description="True if correct, False otherwise")]

# System prompt for the correctness grader.
correctness_instruction = """You are grading a quiz.
Given a QUESTION, GROUND TRUTH ANSWER, and STUDENT ANSWER:

1) Grade the student answer based ONLY on factual accuracy.
2) It is OK if the student answer has extra correct info.
3) No conflicting statements allowed.

Explain reasoning step-by-step."""

# Grader model (temperature 0) whose output is parsed into CorrectnessGrade.
correctness_llm = ChatOpenAI(
    temperature=0,
    model='gpt-4o-mini',
).with_structured_output(
    CorrectnessGrade,
    strict=True,
    method='json_schema',
)

def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    """Grade the generated answer against the reference answer.

    Reads `inputs['question']`, `reference_outputs['answer']` and
    `outputs['answer']`, sends them to the correctness grader together with
    the grading instructions, and returns the grader's `correct` field.
    """
    question = inputs['question']
    expected = reference_outputs['answer']
    actual = outputs['answer']
    user_message = f"""
Question: {question}
Ground Truth Answer: {expected}
Student Answer: {actual}
"""
    messages = [
        {'role': 'system', 'content': correctness_instruction},
        {'role': 'user', 'content': user_message},
    ]
    verdict = correctness_llm.invoke(messages)
    return verdict.correct

##### 2. Relevance: Response vs. Input

In [55]:

class RelevanceGrade(BaseModel):
    """Structured verdict returned by the answer-relevance grader."""

    # A bare string inside Annotated is ignored by pydantic, so the guidance
    # never reached the JSON schema sent to the model. Field(description=...)
    # puts it there. `explanation` stays first so reasoning precedes the verdict.
    explanation: Annotated[str, Field(description="Explain your reasoning")]
    relevant: Annotated[bool, Field(description="True if relevant, False otherwise")]

# System prompt for the answer-relevance grader.
relevance_instruction = """You are grading a quiz.
Given QUESTION and STUDENT ANSWER:

1) Check if the answer is concise and relevant.
2) Answer must help answer the question.

Explain reasoning step-by-step."""

# Grader model (temperature 0) whose output is parsed into RelevanceGrade.
relevance_llm = ChatOpenAI(
    temperature=0,
    model='gpt-4o-mini',
).with_structured_output(
    RelevanceGrade,
    strict=True,
    method='json_schema',
)

def relevance(inputs: dict, outputs: dict) -> bool:
    """Grade whether the generated answer addresses the question.

    Reads `inputs['question']` and `outputs['answer']`, sends them to the
    relevance grader together with the grading instructions, and returns the
    grader's `relevant` field.
    """
    question = inputs['question']
    answer = outputs['answer']
    user_message = f"""
Question: {question}
Student Answer: {answer}
"""
    messages = [
        {'role': 'system', 'content': relevance_instruction},
        {'role': 'user', 'content': user_message},
    ]
    verdict = relevance_llm.invoke(messages)
    return verdict.relevant


##### 3. Groundedness: Response vs. Retrieved Docs

In [56]:
# ======= 3. Groundedness =======
class GroundGrade(BaseModel):
    """Structured verdict returned by the groundedness grader."""

    # A bare string inside Annotated is ignored by pydantic, so the guidance
    # never reached the JSON schema sent to the model. Field(description=...)
    # puts it there. `explanation` stays first so reasoning precedes the verdict.
    explanation: Annotated[str, Field(description="Explain your reasoning")]
    grounded: Annotated[bool, Field(description="True if grounded, False if hallucinates")]

# System prompt for the groundedness grader.
grounded_instruction = """You are grading a quiz.
Given FACTS and STUDENT ANSWER:

1) Check that the answer is grounded in the facts.
2) No hallucinated information allowed.

Explain reasoning step-by-step."""

# Grader model (temperature 0) whose output is parsed into GroundGrade.
grounded_llm = ChatOpenAI(
    temperature=0,
    model='gpt-4o-mini',
).with_structured_output(
    GroundGrade,
    strict=True,
    method='json_schema',
)

def groundedness(inputs: dict, outputs: dict) -> bool:
    """Grade whether the generated answer is supported by the retrieved documents.

    Joins the `page_content` of every item in `outputs['documents']` with
    blank lines, sends that text and `outputs['answer']` to the groundedness
    grader, and returns the grader's `grounded` field. `inputs` is accepted
    but not read.
    """
    facts = "\n\n".join([chunk.page_content for chunk in outputs['documents']])
    answer = outputs['answer']
    user_message = f"""
Facts: {facts}
Student Answer: {answer}
"""
    messages = [
        {'role': 'system', 'content': grounded_instruction},
        {'role': 'user', 'content': user_message},
    ]
    verdict = grounded_llm.invoke(messages)
    return verdict.grounded

##### 4. Retrieval Relevance: Retrieved Docs vs. Input

In [57]:
# ======= 4. Retrieval Relevance =======
class RetrievalRelevanceGrade(BaseModel):
    """Structured verdict returned by the retrieval-relevance grader."""

    # A bare string inside Annotated is ignored by pydantic, so the guidance
    # never reached the JSON schema sent to the model. Field(description=...)
    # puts it there. `explanation` stays first so reasoning precedes the verdict.
    explanation: Annotated[str, Field(description="Explain your reasoning")]
    relevant: Annotated[bool, Field(description="True if retrieved documents relevant, False otherwise")]

# System prompt for the retrieval-relevance grader.
retrieval_relevance_instruction = """You are grading a quiz.
Given QUESTION and FACTS:

1) Identify facts completely unrelated to the question.
2) If any keywords or semantic meaning match the question, mark as relevant.

Explain reasoning step-by-step."""

# Grader model (temperature 0) whose output is parsed into RetrievalRelevanceGrade.
retrieval_relevance_llm = ChatOpenAI(
    temperature=0,
    model='gpt-4o-mini',
).with_structured_output(
    RetrievalRelevanceGrade,
    strict=True,
    method='json_schema',
)

def retrieval_relevance(inputs: dict, outputs: dict) -> bool:
    """Grade whether the retrieved documents relate to the question.

    Joins the `page_content` of every item in `outputs['documents']` with
    blank lines, sends `inputs['question']` and that text to the
    retrieval-relevance grader, and returns the grader's `relevant` field.
    """
    facts = "\n\n".join([chunk.page_content for chunk in outputs['documents']])
    question = inputs['question']
    user_message = f"""
Question: {question}
Facts: {facts}
"""
    messages = [
        {'role': 'system', 'content': retrieval_relevance_instruction},
        {'role': 'user', 'content': user_message},
    ]
    verdict = retrieval_relevance_llm.invoke(messages)
    return verdict.relevant

In [58]:
def target(input:dict) -> dict:
    """Evaluation target: run rag_bot on the example's 'question' and return its result."""
    question=input['question']
    result=rag_bot(question)
    return result

# Run the target over every example in the dataset and score each run with
# the four evaluators.
experimental_results=client.evaluate(
    target,
    data=dataset_name,
    evaluators=[correctness,groundedness,relevance,retrieval_relevance],
    experiment_prefix='rag-doc-relevance',
    # Describe the pipeline actually under test: this notebook uses an
    # in-memory vector store and gpt-4o-mini, not LCEL with gpt-4-0125-preview.
    metadata={'version':'InMemoryVectorStore context,gpt-4o-mini'},
)
experimental_results

View the evaluation results for experiment: 'rag-doc-relevance-6e241a47' at:
https://smith.langchain.com/o/6a377c92-e664-4557-95a3-a1e917bdfc35/datasets/72a14497-cd85-41d3-9601-15bc10c14845/compare?selectedSessions=6cf65507-0100-41f4-a3cf-fdf18bf3d206




7it [02:15, 19.30s/it]
