# Context Relevance Prompt Experiment

In [1]:
import _global
from typing import List
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
from langchain_openai import ChatOpenAI
from langsmith import Client
from langsmith.evaluation import evaluate, evaluate_existing
from langsmith.schemas import Example, Run

## Experimental LLM Evaluator being tested

In [2]:
# Data model
class GradeDocuments(BaseModel):
    """ Pydantic object used to format LLM output
    * 0: irrelevant diagnosis \n
    * 1: correct diagnosis, but does not contain information to anser the user question \n
    * 2: correct diagnosis and contains information to answer the user question). \n    
    """
    score: int = Field(description="Documents grade based on correct diagnosis and relevant information")


llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
#llm = ChatOpenAI(model="gpt-4o", temperature=0)

structured_llm_grader = llm.with_structured_output(GradeDocuments)

In [3]:
# few shots prompt
client = Client()
ds = client.list_examples(dataset_name="prompt_test_GAS")

examples = []
required_eg_score = [0,1,2]*2
for example in ds:
    if int(example.outputs["score"]) in required_eg_score:
        examples.append({
            "context":example.inputs["context"],
            "query":example.inputs["query"],
            "score":example.outputs["score"],
        })
        required_eg_score.remove(int(example.outputs["score"]))

In [5]:
system = """
You are a grader assessing the relevance of a retrieved document content to a query. \n
The query is a question about a medical diagnosis. \n
The document is a Python dictionary. The content of the document is under the "page_content" key in the dictionary. \n
Give a score for the document using the scoring system below. \n

# Scoring
- **0**: The content does not contain any information about the queried diagnosis \n
- **1**: The content contains information about the queried diagnosis, but the information does not answer the query \n
- **2**: The content contains information about the queried diagnosis and the information answers the query). \n
"""

human = """
# RETRIEVED DOCUMENT
{context} 

# QUERY
{query}
"""

prompt_gradedoc = ChatPromptTemplate.from_messages(
    [
        ("human", human),
        ("ai", "{score}")
    ]
)

few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=prompt_gradedoc,
    examples=examples,
)

final_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        few_shot_prompt,
        ("human", human),
    ]
)

retrieval_grader = final_prompt | structured_llm_grader

def grade_docs(run, example) -> dict:
    grade = retrieval_grader.invoke({"query": example.inputs["query"], "context": example.inputs["context"]})
    return {"key": "grade", "score": int(grade.score), "comment": "grade for doc"}

## Metrics for evaluating the LLM evaluator

In [6]:
# compare generated eval scores with ground truth eval scores
def avg_diff(runs: list[Run], examples: list[Example]) -> dict:
    score = 0
    for i, (run, example) in enumerate(zip(runs, examples)):
        score += run.feedback_stats["grade"]["avg"] - int(example.outputs["score"])

    score = score / len(runs)
    
    return {"key": "avg diff from true score", "score": score}


In [11]:
# num of correct scoring
def correctness(runs: list[Run], examples: list[Example]) -> dict:
    score = 0
    for i, (run, example) in enumerate(zip(runs, examples)):
        if run.feedback_stats["grade"]["avg"] == int(example.outputs["score"]):
            score += 1

    score = score / len(runs)
    
    return {"key": "perc of correct scoring", "score": score}


## Full pipeline for evaluating llm evaluator for context relevance

In [7]:
# generate eval scores
exp = evaluate(
    lambda x:x,
    data="prompt_test_GAS",
    evaluators=[grade_docs],
    experiment_prefix="Prompt_testing",
)

# generate eval score for the evaluator
evaluate_existing(exp.experiment_name, summary_evaluators=[avg_diff, correctness])


View the evaluation results for experiment: 'Prompt_testing-1369472f' at:
https://smith.langchain.com/o/edfbc8bb-c3a3-5c1e-8b48-11b5a8cfd8ac/datasets/4c17fe49-6797-40aa-a5fc-36f4809034f5/compare?selectedSessions=bc44329e-1dfc-441d-8ff0-d149bf2b94c5




0it [00:00, ?it/s]

View the evaluation results for experiment: 'Prompt_testing-1369472f' at:
https://smith.langchain.com/o/edfbc8bb-c3a3-5c1e-8b48-11b5a8cfd8ac/datasets/4c17fe49-6797-40aa-a5fc-36f4809034f5/compare?selectedSessions=bc44329e-1dfc-441d-8ff0-d149bf2b94c5




0it [00:00, ?it/s]

<ExperimentResults Prompt_testing-1369472f>

# Faithfulness Prompt Experiment 

## Experimental LLM Evaluator being tested

In [14]:
class Faithfulness(BaseModel):
    """List facts in handout not based on ground truth"""
    list_of_false: List[str] = Field(description="List of facts in handout not based on ground truth")

# LLM with function call 
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
structured_llm_grader = llm.with_structured_output(Faithfulness)

# Prompt 
system = """
You are an expert assessor tasked with evaluating whether the generated text (provided by the user between the XML tags GENERATED TEXT) is factually based on the provided context (provided by the user between the XML tags CONTEXT).  

Follow these steps:
    Step 1: Read the provided context carefully. Understand the information presented in the context.
    Step 2: Analyze each sentence in the GENERATED TEXT. Compare it with the provided CONTEXT to determine its factual basis. Sentences in the GENERATED TEXT that are similar (but not verbatim) to the sentences in provided CONTEXT, but are still factually aligned are considered factually based on the CONTEXT.
    Step 3: Identify sentences in the GENERATED TEXT that are not factually based on the context (not factually supported by the provided CONTEXT or directly contradicts the provided CONTEXT.) 

"""

human = """
<GENERATED TEXT>
{handout} 
</GENERATED TEXT>

<CONTEXT>
{contexts}
</CONTEXT>
"""

prompt_faithfulness = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", human),
    ]
)

grader_faithfulness = prompt_faithfulness | structured_llm_grader

def grade_faithfulness(run, example):
    try: # try to get outputs from run, otherwise it is from dataset
        handout = run.outputs["handout"]
        contexts = run.outputs["contexts"]
    except KeyError:
        handout = example.outputs["handout"]
        contexts = example.inputs["contexts"]
        
    result = grader_faithfulness.invoke({"handout": handout, "contexts": contexts}).list_of_false
    count = len(result)
    
    return {
        "key": "count", "score": count, "comment": "num infactual sentences",
        "sentences": result
    }


## Metrics for evaluating the LLM evaluator

In [15]:
# average difference of generated count vs ground truth count
def avg_diff_count(runs: list[Run], examples: list[Example]) -> dict:
    score = 0
    for i, (run, example) in enumerate(zip(runs, examples)):
        score += run.feedback_stats["count"]["avg"] - int(example.outputs["count"])

    score = score / len(runs)
    
    return {"key": "avg diff from true count", "score": score}



## Full pipeline for evaluating llm evaluator for context relevance

In [16]:
# generate eval scores
exp = evaluate(
    lambda x:x,
    data="Prompt_test_faithfulness",
    evaluators=[grade_faithfulness],
    experiment_prefix="Prompt_testing",
)

# generate eval score for the evaluator
evaluate_existing(exp.experiment_name, summary_evaluators=[avg_diff_count])


View the evaluation results for experiment: 'Prompt_testing-dd3cd07e' at:
https://smith.langchain.com/o/edfbc8bb-c3a3-5c1e-8b48-11b5a8cfd8ac/datasets/8c0209e9-1fa7-4eae-9ee6-657491c07e75/compare?selectedSessions=9483b5b6-9039-472d-b1bd-85c1bd3bc88d




0it [00:00, ?it/s]

View the evaluation results for experiment: 'Prompt_testing-dd3cd07e' at:
https://smith.langchain.com/o/edfbc8bb-c3a3-5c1e-8b48-11b5a8cfd8ac/datasets/8c0209e9-1fa7-4eae-9ee6-657491c07e75/compare?selectedSessions=9483b5b6-9039-472d-b1bd-85c1bd3bc88d




0it [00:00, ?it/s]

Error running summary evaluator <function avg_diff_count at 0x29b5e24d0>: 'count'


<ExperimentResults Prompt_testing-dd3cd07e>