# Task List
- GPT3.5 LLM evaluator for grading context relevance is inaccurate
- for eval on all cases, a way to group both experiments in one place; https://docs.smith.langchain.com/how_to_guides/evaluation/evaluate_on_intermediate_steps

# Setup

In [1]:
from typing import List
from datetime import datetime
from operator import itemgetter

from _global import path_to_resources, hf_embed
import templates

from langchain.callbacks.tracers import LangChainTracer
from langchain_community.llms import Ollama
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers.string import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_openai import ChatOpenAI

import langsmith
from langsmith import traceable, trace
from langsmith.evaluation import LangChainStringEvaluator, evaluate

In [2]:
# set up retriever
db = Chroma(collection_name="main_collection", persist_directory=f"{path_to_resources}/db_wiki", embedding_function=hf_embed)
retriever = db.as_retriever(
                search_type = "similarity",
                search_kwargs = {"k":4},
            )

In [3]:
# langsmith setup
project_name = "ED-handout"

# RAG Class

In [35]:
class RagBot:
    def __init__(self, retriever, templates, model: str = "gpt-3.5-turbo-1106"):
        self._retriever = retriever
        self._llm_gpt = ChatOpenAI(model_name=model, temperature=0)
        self._llm_llama = Ollama(model="llama2:13b", temperature=0)
        self.templates = templates
        self.queries = {
            "definition": "definition of {diagnosis}",
            "presentation": "manifestations of {diagnosis}",
            "course": "natural history of {diagnosis}",
            "management": "treatment and management for {diagnosis}",
            "follow_up": "follow-up plan for {diagnosis}",
            "redflags": "signs and symptoms that indicate the need for urgent medical attention for patients with {diagnosis}",
        }

    
    @traceable
    def diagnosis_extraction(self, assessment):
        """Extracts diagnosis from physician's assessment of the patient"""
        prompt_extract_diagnosis = ChatPromptTemplate.from_messages([
            ("system",self.templates.extract_diagnosis_system),
            ("human", "{assessment}")
        ])
        chain_diagnosis = prompt_extract_diagnosis | self._llm_gpt
        
        return chain_diagnosis.invoke({"assessment":assessment}).content

    
    def make_queries(self, diagnosis):
        """Uses the diagnosis to populate dict of queries that will be used to retreive context from db"""
        return {key: value.format(diagnosis=diagnosis) for key, value in self.queries.items()}

    
    @traceable(run_type="retriever")
    def _retrieve_docs(self, query):
        return self._retriever.invoke(query)

    
    def get_contexts(self, queries):
        """returns dict with tuples of (query, contexts)"""
        contexts = {}
        for k, query in queries.items():
            contexts[k] = (query, self._retrieve_docs(query))
        
        return contexts


    def compress_contexts(self, q_c):
        prompt_compress = ChatPromptTemplate.from_messages([
            ("system", self.templates.compress_context_system),
            ("human", self.templates.compress_context_human)
        ])
        chain_compress = prompt_compress | self._llm_gpt

        return chain_compress.invoke({"query": q_c[0], "context": q_c[1]}).content

    
    @traceable()
    def retrieval_steps(self, assessment, eval=False):
        """All the steps to prep the contexts for final handout generation"""    
        diagnosis = self.diagnosis_extraction(assessment)
        queries = self.make_queries(diagnosis)
        contexts = self.get_contexts(queries)

        return {"contexts": contexts, "diagnosis": diagnosis}
        
    
    @traceable()
    def make_handout(self, assessment, md_plan):
        _run_input = self.retrieval_steps(assessment)
        _contexts = _run_input["contexts"]
        diagnosis = _run_input["diagnosis"]

        return self._make_handout(_contexts, diagnosis, md_plan)
        
    @traceable()
    def _make_handout(self, _contexts, diagnosis, md_plan):
        """separate this part from the complete make_handout chain so can be used in langsmith for testing"""
        # compression
        contexts = {}
        for k, q_c in _contexts.items():
            contexts[k] = self.compress_contexts(q_c)

        # make handout
        prompt_make_handout = ChatPromptTemplate.from_messages([
            ("system",self.templates.handout_generation_system),
            ("human", self.templates.handout_generation_human),
        ])
        chain_make_handout = prompt_make_handout | self._llm_gpt
        response = chain_make_handout.invoke({
            "context_definition": contexts["definition"],
            "context_presentation": contexts["presentation"],
            "context_course": contexts["course"],
            "context_management": contexts["management"],
            "context_follow_up": contexts["follow_up"],
            "context_redflags": contexts["redflags"],
            "context_md_plan": md_plan,
        })
        
        # Evaluators will expect "answer" and "contexts"
        return {
            "diagnosis": diagnosis,
            "contexts": "\n".join(contexts.values()) + "\n" + md_plan,
            "handout": response.content
        }

In [36]:
rag_bot = RagBot(retriever, templates)

In [21]:
# test that extraction works and works with langsmith

with trace("Diagnosis extraction", "chain", project_name=project_name, inputs={"assessment": "5yo M with viral-triggered asthma"}) as rt:
    output = rag_bot.diagnosis_extraction(inputs["assessment"])
    rt.end(outputs={"output": output})

NameError: name 'inputs' is not defined

# Eval

ways to use evaluations
- to evaluate one part of the RAG pipeline
    1. this runs part of the rag pipeline -> create a dataset -> evaluate based on the data; used when optimizing each section (e.g prompt engineering, experimenting with retrieval strategies)
    2. run on one example (dataset with assessment and plan as input for one diagnosis); used to test debug for individual case
    3. run for all common diagnoses dataset (Pt_cases); evaluate the RAG chain on a database
- evaluate the whole pipeline

## Doc grader
- given a diagnosis, create dataset of query + doc for each doc retrieved from each query
- run experiement on the dataset

In [15]:
# Data model
class GradeDocuments(BaseModel):
    """0-2 score based on relevance of doc.
    * 0: irrelevant diagnosis \n
    * 1: correct diagnosis, but does not contain information to anser the user question \n
    * 2: correct diagnosis and contains information to answer the user question). \n    
    """

    score: str = Field(description="Documents grade based on correct diagnosis and relevant information")

# LLM with function call 
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
structured_llm_grader = llm.with_structured_output(GradeDocuments)

# Prompt 
system = """
    You are a grader assessing relevance of a retrieved document to a user question. \n 
    The content of the document can be found in page_content. Give a score for the document using the scoring system below. 
    Scoring: 
    * 0: irrelevant diagnosis \n
    * 1: correct diagnosis, but does not contain information to anser the user question \n
    * 2: correct diagnosis and contains information to answer the user question). \n
    
    
"""
prompt_gradedoc = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "RETRIEVED DOCUMENT: \n\n {document} \n\n USER QUESTION: {query}"),
    ]
)

retrieval_grader = prompt_gradedoc | structured_llm_grader

def grade_doc(query, doc) -> dict:
    grade = retrieval_grader.invoke({"query": query, "document": doc})
    return {"key": "grade", "score": int(grade.score), "comment": "grade for doc"}

def grade_docs(run, example) -> dict:
    grade = retrieval_grader.invoke({"query": example.inputs["query"], "document": example.inputs["context"]})
    return {"key": "grade", "score": int(grade.score), "comment": "grade for doc"}

In [7]:
def create_dataset_relevance(diagnosis, context_dict, dataset_name):
    """Takes query_context dictionary and create a dataset for {diagnosis} to evaluate the relevance of retrieved context"""
    client = langsmith.Client()
    
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description=f"Test context relevance for docs retreiived for {diagnosis}",
    )

    for query, q_c in context_dict.values(): #each document should be an example in the dataset        
        for doc in q_c:
            client.create_examples(
                inputs=[{"query": query, "context": doc}],
                dataset_id=dataset.id,
            )

In [29]:
# eval type: 1

def eval_context_relevance(rag_bot, assessment):
    retrieved = rag_bot.retrieval_steps(assessment) # dict of query:context
    context_dict = retrieved["contexts"]
    diagnosis = retrieved["diagnosis"]

    current_time = datetime.now().strftime('%Y-%m-%d, %H:%M:%S')
    dataset_name = f"Queries_Docs_{diagnosis}_{current_time}"
    create_dataset_relevance(diagnosis, context_dict, dataset_name)
        
    evaluate(
        lambda x:x,
        data=dataset_name,
        evaluators=[grade_docs],
        experiment_prefix="Context-relevance-",
        metadata={
            "model": "oai",
            "diagnosis":diagnosis
        },
    )

In [28]:
eval_context_relevance(rag_bot, "5yo M, viral triggered asthma")

View the evaluation results for experiment: 'Context-relevance--feef56a0' at:
https://smith.langchain.com/o/edfbc8bb-c3a3-5c1e-8b48-11b5a8cfd8ac/datasets/2c62c230-5ffb-4be1-a160-67b532c3d537/compare?selectedSessions=2d4ce06b-d884-4f5f-bca6-6dc74cff05da




0it [00:00, ?it/s]

## Ground truth checker
** LLM gives different evaluation each time?

In [9]:
# not in use; used once to create dataset for eval_groundtruth_evaluator_prompt
def create_dataset_groundtruth(handout, context, dataset_name):
    """Takes handout and contexts used to make a dataset"""
    client = langsmith.Client()
    
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description=f"Test whether handout for {diagnosis} is based on provided context",
    )

    # **Preprocess context to remove headings
    client.create_examples(
        inputs=[{"contexts": contexts}],
        outputs=[{"handout": handout}],
        dataset_id=dataset.id,
    )

In [48]:
class GroundTruth(BaseModel):
    """List facts in handout not based on ground truth"""
    list_of_false: List[str] = Field(description="List of facts in handout not based on ground truth")

# LLM with function call 
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
structured_llm_grader = llm.with_structured_output(GroundTruth)

# Prompt 
system = """
You are an expert assessor tasked with evaluating whether the generated text (provided by the user between the XML tags GENERATED TEXT) is factually based on the provided context (provided by the user between the XML tags CONTEXT).  

Follow these steps:
    Step 1: Read the provided context carefully. Understand the information presented in the context.
    Step 2: Analyze each sentence in the GENERATED TEXT. Compare it with the provided CONTEXT to determine its factual basis. Sentences in the GENERATED TEXT that are similar (but not verbatim) to the sentences in provided CONTEXT, but are still factually aligned are considered factually based on the CONTEXT.
    Step 3: Identify sentences in the GENERATED TEXT that are not factually based on the context (not factually supported by the provided CONTEXT or directly contradicts the provided CONTEXT.) 

"""

human = """
<GENERATED TEXT>
{handout} 
</GENERATED TEXT>

<CONTEXT>
{contexts}
</CONTEXT>
"""

prompt_groundtruth = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", human),
    ]
)

grader_groundtruth = prompt_groundtruth | structured_llm_grader

def grade_groundtruth(run, example):
    try: # try to get outputs from run, otherwise it is from dataset
        handout = run.outputs["handout"]
        contexts = run.outputs["contexts"]
    except KeyError:
        handout = example.outputs["handout"]
        contexts = example.inputs["contexts"]
        
    result = grader_groundtruth.invoke({"handout": handout, "contexts": contexts}).list_of_false
    count = len(result)
    
    return {
        "key": "count", "score": count, "comment": "num infactual sentences",
        "sentences": result
    }


In [10]:
# eval type: 1; to experiment with prompt engineering for LLM evaluator using dataset on context and handout

def eval_groundtruth_evaluator_prompt(rag_bot):
    """to evaluate prompt for LLM assessor for ground truth evaluation task"""
    dataset_name = "Context_groundtruth"
        
    evaluate(
        lambda x:x,
        data=dataset_name,
        evaluators=[grade_groundtruth],
        experiment_prefix="Groundtruth-prompt-",
        metadata={
            "model": "oai",
        },
    )

In [11]:
eval_groundtruth_evaluator_prompt(
    rag_bot,
)

View the evaluation results for experiment: 'Groundtruth-prompt--21269dce' at:
https://smith.langchain.com/o/edfbc8bb-c3a3-5c1e-8b48-11b5a8cfd8ac/datasets/8c0209e9-1fa7-4eae-9ee6-657491c07e75/compare?selectedSessions=bbef5376-38e7-4ae5-9b97-889c19d0c4b9




0it [00:00, ?it/s]

In [53]:
# eval type: 2; to evaluate context ground truth for one diagnosis by running the whole rag chain. No dataset created in this process
def eval_context_groundtruth(rag_bot, assessment, md_plan):
    evaluate(
        lambda x: rag_bot.make_handout(assessment, md_plan),
        data="Ground_truth", # dummy dataset
        evaluators=[grade_groundtruth],
        experiment_prefix="Groundtruth-",
        metadata={
            "model": "oai",
        },
    )

In [52]:
eval_context_groundtruth(
    rag_bot,
    assessment="5yo M, first asthma exacerbation, virally triggered", 
    md_plan="-continue ventolin q4h \n -continue flovent 125mcg qdaily \n -follow-up with your family doctor in 2 days",
)

View the evaluation results for experiment: 'Groundtruth--52310ebe' at:
https://smith.langchain.com/o/edfbc8bb-c3a3-5c1e-8b48-11b5a8cfd8ac/datasets/6ce351fc-f177-4f74-9322-8c8cd25579d4/compare?selectedSessions=cfa71343-37ed-4085-a1f9-7651080d4e0d




0it [00:00, ?it/s]

KeyboardInterrupt: 

Error running evaluator <DynamicRunEvaluator grade_groundtruth> on run fa5b4bb6-3c62-4ad0-a5a2-4acf92e1a0ac: KeyError('handout')
Traceback (most recent call last):
  File "/Users/a_wei/miniconda3/envs/llm/lib/python3.10/site-packages/langsmith/evaluation/_runner.py", line 1216, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(
  File "/Users/a_wei/miniconda3/envs/llm/lib/python3.10/site-packages/langsmith/evaluation/evaluator.py", line 279, in evaluate_run
    result = self.func(
  File "/Users/a_wei/miniconda3/envs/llm/lib/python3.10/site-packages/langsmith/run_helpers.py", line 565, in wrapper
    raise e
  File "/Users/a_wei/miniconda3/envs/llm/lib/python3.10/site-packages/langsmith/run_helpers.py", line 560, in wrapper
    function_result = run_container["context"].run(
  File "/var/folders/rc/1l8xrxl962j_sth8dsw54l6h0000gn/T/ipykernel_11038/3166412940.py", line 40, in grade_groundtruth
    result = grader_groundtruth.invoke({"handout": example.outputs["handout"],

In [67]:
context_groundtruth(rag_bot, None, None)

View the evaluation results for experiment: 'Groundtruth--091700f9' at:
https://smith.langchain.com/o/edfbc8bb-c3a3-5c1e-8b48-11b5a8cfd8ac/datasets/8c0209e9-1fa7-4eae-9ee6-657491c07e75/compare?selectedSessions=71f30369-3ab9-4fab-97ad-34e61d6e4790




0it [00:00, ?it/s]

In [30]:
# **not used** eval type: 3; to run whole rag chain on all diagnoses in Pt_cases and generate dataset for possible eval
import time
def eval_llm_allcases(rag_bot):
    client = langsmith.Client()
    cases = client.list_examples(dataset_id="e3957f7c-e232-4541-beef-d7216ab12241")

    i = 0
    for case in cases:
        # eval for grading the document
        assessment = case.inputs["assessment"]
        md_plan = case.inputs["plan"]
        
        retrieved = rag_bot.retrieval_steps(assessment) # dict of query:context
        context_dict = retrieved["contexts"]
        diagnosis = retrieved["diagnosis"]
    
        current_time = datetime.now().strftime('%Y-%m-%d, %H:%M:%S')
        dataset_name = f"LLM_{diagnosis}_{current_time}"
        create_dataset_relevance(diagnosis, context_dict, dataset_name)

        time.sleep(90)
        evaluate(
            lambda x:x,
            data=dataset_name,
            evaluators=[grade_docs],
            experiment_prefix="LLM-context-relevance-",
            metadata={
                "model": "oai",
                "eval": "doc_grade",
            },
        )

        time.sleep(90)
        # eval for ground truth
        evaluate(
            lambda x: rag_bot._make_handout(context_dict, diagnosis, md_plan),
            data="Ground_truth",
            evaluators=[grade_groundtruth],
            experiment_prefix="LLM-groundtruth",
            metadata={
                "model": "oai",
                "eval": "groundtruth",
            },
        )
        i+=1
        if i ==2:
            break

In [49]:
def document_relevance(root_run: Run, example: Example) -> dict:
    """
    A very simple evaluator that checks to see if the input of the retrieval step exists
    in the retrieved docs.
    """
    rag_pipeline_run = next(run for run in root_run.child_runs if run.name == "make_handout")
    retrieve_run = next(run for run in rag_pipeline_run.child_runs if run.name == "retrieval_steps")

    context_dict = retrieve_run.outputs["contexts"]

    scores = []
    for query, q_c in context_dict.values(): #each document should be an example in the dataset        
        for doc in q_c:
            scores.append(grade_doc(query, doc))
            
    return {"results": scores}
    

def ground_truth(root_run: Run, example: Example) -> dict:
    """
    A simple evaluator that checks to see the answer is grounded in the documents
    """
    # Get documents and answer
    rag_pipeline_run = next(run for run in root_run.child_runs if run.name == "make_handout")

    score = grade_groundtruth(rag_pipeline_run, None)
    return {"key": "num_false_facts", "score": score}

In [50]:
def eval_llm_allcases_1(rag_bot):
    experiment_results = evaluate(
        lambda inputs: rag_bot.make_handout(assessment=inputs["assessment"], md_plan=inputs["plan"]),
        data="Dummy_pt_case",
        evaluators=[document_relevance, ground_truth],
        experiment_prefix=datetime.now().strftime('%Y-%m-%d, %H:%M:%S')
    )

In [51]:
eval_llm_allcases_1(rag_bot)

View the evaluation results for experiment: '2024-06-02, 18:33:25-0caa1194' at:
https://smith.langchain.com/o/edfbc8bb-c3a3-5c1e-8b48-11b5a8cfd8ac/datasets/281774a1-94f9-47f1-9846-88086ceec7d5/compare?selectedSessions=dd801a42-7c33-45f6-a899-e091ef89b6d7




0it [00:00, ?it/s]

Error running evaluator <DynamicRunEvaluator ground_truth> on run 774a37b1-5990-4eb2-a3d6-61d9b5e6d99e: ValueError("Expected an EvaluationResult object, or dict with a metric 'key' and optional 'score'; got {'key': 'num_false_facts', 'score': {'key': 'count', 'score': 7, 'comment': 'num infactual sentences', 'sentences': ['- Administer the prescribed dexamethasone at 3pm tomorrow as directed by the doctor.', '- Use over-the-counter medications for pain and fever to keep your child comfortable.', '- Keep your child calm and comfortable, and consider using cool or warm mist, although its effectiveness is not clear.', '- Seek immediate medical attention if your child experiences:', '- Inspiratory stridor (a high-pitched sound when breathing in)', '- Blue or bluish-colored lips', '- Decrease in the level of alertness']}}")
Traceback (most recent call last):
  File "/Users/a_wei/miniconda3/envs/llm/lib/python3.10/site-packages/langsmith/evaluation/evaluator.py", line 204, in _coerce_evaluat

## LLM grading based on custom metrics
- jargon
- reference list
- template format

## Human feedback of output