# Last left off
- How to give structured input to the LLM to improve accuracy
- Few-shot prompting strategy

# Context Relevance Prompt Experiment

In [4]:
import _global
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langsmith.evaluation import evaluate, evaluate_existing
from langsmith.schemas import Example, Run

In [8]:
# Data model
class GradeDocuments(BaseModel):
    """Pydantic object used to format LLM output.

    * 0: irrelevant diagnosis
    * 1: correct diagnosis, but does not contain information to answer the user question
    * 2: correct diagnosis and contains information to answer the user question
    """
    # NOTE: this class is handed to `with_structured_output`, which uses the
    # docstring above as the schema description shown to the model, so the
    # docstring is kept free of typos and stray punctuation.
    # The rubric only defines grades 0, 1 and 2; the bounds reject any other value.
    score: int = Field(
        ge=0,
        le=2,
        description="Documents grade based on correct diagnosis and relevant information",
    )


# Chat model used as the grader.
# temperature=0 is presumably chosen to keep grading as repeatable as possible.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
# Bind the GradeDocuments schema so the model's reply comes back as an object
# exposing `.score` (read by grade_docs).
structured_llm_grader = llm.with_structured_output(GradeDocuments)

## Experimental LLM Evaluator being tested

In [9]:
# System message: states the grading task and the 0-2 rubric.
# The `\n` escapes sit inside a normal (non-raw) string, so each one is a real
# newline in addition to the line break that follows it.
system = """
You are a grader assessing the relevance of a retrieved document content to a query. \n
The query is a question about a medical diagnosis. \n
The document is a Python dictionary. The content of the document is under the "page_content" key in the dictionary. \n
Give a score for the document using the scoring system below. \n

# Scoring
- **0**: The content does not contain information about the queried diagnosis \n
- **1**: The content contains information about the queried diagnosis, but the information does not answer the query \n
- **2**: The content contains information about the queried diagnosis and the information answers the query. \n
"""

# Human message template; `{document}` and `{query}` are filled in when the
# chain is invoked (grade_docs passes exactly these two keys).
human = """
# RETRIEVED DOCUMENT
{document} 

# QUERY
{query}
"""
prompt_gradedoc = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", human),
    ]
)

# Full grader chain: the formatted prompt is piped into the structured-output model.
retrieval_grader = prompt_gradedoc | structured_llm_grader

def grade_docs(run, example) -> dict:
    """Grade one dataset example's context against its query (used in eval type 1).

    The example's ``inputs["query"]`` and ``inputs["context"]`` are sent to
    ``retrieval_grader``; ``run`` is accepted but not read.

    Returns:
        A feedback dict with key ``"grade"``, the grader's score as an ``int``,
        and a fixed comment.
    """
    grader_inputs = {
        "query": example.inputs["query"],
        "document": example.inputs["context"],
    }
    result = retrieval_grader.invoke(grader_inputs)
    feedback = {"key": "grade", "score": int(result.score), "comment": "grade for doc"}
    return feedback

## Metrics for evaluating the LLM evaluator

In [11]:
# compare generated eval scores with ground truth eval scores
def summary_eval(runs: list[Run], examples: list[Example]) -> dict:
    """Return the mean signed difference between LLM grades and true grades.

    Each run is paired with the example at the same position, and the run's
    average "grade" feedback is compared with the example's reference score.

    Args:
        runs: Runs exposing ``feedback_stats["grade"]["avg"]``.
        examples: Examples whose ``outputs["score"]`` holds the ground-truth
            grade (any value ``int()`` accepts).

    Returns:
        A dict with ``key`` "avg diff from true score" and ``score`` equal to
        the mean of ``grade - true_score`` over the compared pairs. If there
        are no pairs to compare, ``score`` is ``None`` and a ``comment``
        explains why, instead of raising ``ZeroDivisionError``.
    """
    key = "avg diff from true score"
    diffs = [
        run.feedback_stats["grade"]["avg"] - int(example.outputs["score"])
        for run, example in zip(runs, examples)
    ]
    if not diffs:
        return {"key": key, "score": None, "comment": "no runs to compare"}

    # Divide by the number of pairs actually compared, so a length mismatch
    # between `runs` and `examples` cannot skew the mean.
    # NOTE(review): differences are signed, so over- and under-grading cancel
    # out; this measures bias, not absolute error - confirm that is intended.
    return {"key": key, "score": sum(diffs) / len(diffs)}


In [12]:
# Hard-coded name of an existing experiment; it matches the experiment named in
# this cell's output. NOTE(review): no other visible cell reads this variable.
experiment_name = "Prompt_testing-eccc400e"


View the evaluation results for experiment: 'Prompt_testing-eccc400e' at:
https://smith.langchain.com/o/edfbc8bb-c3a3-5c1e-8b48-11b5a8cfd8ac/datasets/4c17fe49-6797-40aa-a5fc-36f4809034f5/compare?selectedSessions=8184c2d8-0f13-4f7c-9142-15400e1a1557




0it [00:00, ?it/s]

<ExperimentResults Prompt_testing-eccc400e>

## Full pipeline for evaluating llm evaluator for context relevance

In [None]:
# generate eval scores
# The target is an identity function, so no model is run over the dataset here;
# grade_docs reads only example.inputs and performs the LLM grading itself.
exp = evaluate(
    lambda x:x,
    data="prompt_test_GAS",  # dataset name - presumably must already exist in LangSmith
    evaluators=[grade_docs],
    experiment_prefix="Prompt_testing",
)

# generate eval score for the evaluator
# Re-open the experiment created above and attach summary_eval, which compares
# each run's "grade" feedback with the example's ground-truth score.
evaluate_existing(exp.experiment_name, summary_evaluators=[summary_eval])


# Ground Truth Prompt Experiment 