# Task List
- The GPT-3.5 LLM evaluator used for grading context relevance is inaccurate
- In progress: cleaning up the code and adding comments

# Setup

In [1]:
from typing import List
from datetime import datetime
from operator import itemgetter

from _global import path_to_resources, hf_embed
import templates

from langchain.callbacks.tracers import LangChainTracer
from langchain_community.llms import Ollama
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers.string import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_openai import ChatOpenAI

import langsmith
from langsmith import traceable, trace
from langsmith.evaluation import LangChainStringEvaluator, evaluate

In [2]:
# Open the persisted "main_collection" Chroma store and expose it as a
# similarity retriever that returns the top 4 matches per query.
db = Chroma(
    collection_name="main_collection",
    persist_directory=f"{path_to_resources}/db_main",
    embedding_function=hf_embed,
)
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [3]:
# langsmith setup
# NOTE(review): nothing visible in this notebook reads `project_name` — confirm
# it is consumed elsewhere (e.g. exported as an environment variable) or drop it.
project_name = "ED-handout"

# RAG Class

In [4]:
class RagBot:
    """Runs the RAG steps that turn a physician's assessment into a patient handout.

    Flow: extract the diagnosis -> build one query per handout section ->
    retrieve documents for each query -> compress each section's documents with
    the LLM -> generate the handout from the compressed contexts and the MD plan.
    """

    def __init__(self, retriever, templates, model: str = "gpt-3.5-turbo-1106"):
        self._retriever = retriever
        self.templates = templates
        self._llm_gpt = ChatOpenAI(model_name=model, temperature=0)
        # Not referenced by any method of this class.
        self._llm_llama = Ollama(model="llama2:13b", temperature=0)
        # One query template per handout section; make_queries() fills in "{diagnosis}".
        self.queries = dict(
            definition="definition of {diagnosis}",
            presentation="manifestations of {diagnosis}",
            course="natural history of {diagnosis}",
            management="treatment and management for {diagnosis}",
            follow_up="follow-up plan for {diagnosis}",
            redflags="signs and symptoms that indicate the need for urgent medical attention for patients with {diagnosis}",
        )

    @traceable
    def diagnosis_extraction(self, assessment):
        """Ask the GPT model for the diagnosis contained in the physician's assessment.

        Returns the `.content` of the model's reply.
        """
        messages = [
            ("system", self.templates.extract_diagnosis_system),
            ("human", "{assessment}"),
        ]
        chain = ChatPromptTemplate.from_messages(messages) | self._llm_gpt
        reply = chain.invoke({"assessment": assessment})
        return reply.content

    def make_queries(self, diagnosis):
        """Fill every query template with `diagnosis`; returns {section: query string}."""
        filled = {}
        for section, template in self.queries.items():
            filled[section] = template.format(diagnosis=diagnosis)
        return filled

    @traceable(run_type="retriever")
    def _retrieve_docs(self, query):
        """Run one query through the retriever (traced as a retriever run)."""
        return self._retriever.invoke(query)

    def get_contexts(self, queries):
        """Retrieve documents for each query; returns {section: (query, retrieved docs)}."""
        return {
            section: (query, self._retrieve_docs(query))
            for section, query in queries.items()
        }

    def compress_contexts(self, q_c):
        """Contextual compression with the LLM.

        `q_c` is indexed as (query, context): element 0 is the query and
        element 1 is the retrieved context for it. Returns the `.content` of
        the model's reply.
        """
        prompt = ChatPromptTemplate.from_messages([
            ("system", self.templates.compress_context_system),
            ("human", self.templates.compress_context_human),
        ])
        chain = prompt | self._llm_gpt
        reply = chain.invoke({"query": q_c[0], "context": q_c[1]})
        return reply.content

    @traceable()
    def retrieval_steps(self, assessment, eval=False):
        """Run every step up to (not including) compression and handout generation.

        `eval` is accepted but not used by this method.
        Returns {"contexts": {section: (query, docs)}, "diagnosis": str}.
        """
        diagnosis = self.diagnosis_extraction(assessment)
        contexts = self.get_contexts(self.make_queries(diagnosis))
        return {"contexts": contexts, "diagnosis": diagnosis}

    @traceable()
    def make_handout(self, assessment, md_plan):
        """Generate the handout for `assessment`, using `md_plan` as extra context.

        Returns a dict with "diagnosis", "contexts" (the compressed contexts
        and the MD plan joined by newlines) and "handout" (the generated text).
        """
        retrieved = self.retrieval_steps(assessment)
        raw_contexts = retrieved["contexts"]
        diagnosis = retrieved["diagnosis"]

        # Compress each section's (query, docs) pair into a single string.
        compressed = {
            section: self.compress_contexts(query_and_docs)
            for section, query_and_docs in raw_contexts.items()
        }

        # Build the generation prompt: one "context_<section>" variable per section.
        prompt = ChatPromptTemplate.from_messages([
            ("system", self.templates.handout_generation_system),
            ("human", self.templates.handout_generation_human),
        ])
        sections = ("definition", "presentation", "course", "management", "follow_up", "redflags")
        prompt_values = {f"context_{section}": compressed[section] for section in sections}
        prompt_values["context_md_plan"] = md_plan
        generated = (prompt | self._llm_gpt).invoke(prompt_values)

        # The faithfulness evaluator reads "handout" and "contexts" from this output.
        combined_contexts = "\n".join(compressed.values()) + "\n" + md_plan
        return {
            "diagnosis": diagnosis,
            "contexts": combined_contexts,
            "handout": generated.content,
        }

In [5]:
# Single RagBot instance (default GPT model) shared by the evaluation cells.
rag_bot = RagBot(retriever, templates)

# Eval

Ways to use evaluations:
- to evaluate part of the RAG pipeline
    1. **before handout generation**: run part of the RAG pipeline -> create a dataset -> evaluate on that dataset; used when optimizing an individual step (e.g. prompt engineering, experimenting with retrieval strategies)
    2. **after handout generation**: run on one example (a dataset with the assessment and plan as input for one diagnosis); used to test and debug an individual case
- to evaluate the whole pipeline: run the RAG chain on the dataset of all common diagnoses (Pt_cases)

## Doc grader
- given a diagnosis, create a dataset with one (query, doc) example for each doc retrieved by each query
- run an experiment on the dataset

In [6]:
# Data model
class GradeDocuments(BaseModel):
    """Relevance grade for one retrieved document.

    Scoring:
    * 0: irrelevant diagnosis
    * 1: correct diagnosis, but does not contain information to answer the user question
    * 2: correct diagnosis and contains information to answer the user question
    """
    # NOTE: this model is passed to `with_structured_output`, so the docstring
    # and the field description below are schema text the LLM sees — keep them
    # accurate and typo-free.
    # The score stays a `str`; the graders convert it with int(), so the
    # description asks for a bare digit.
    score: str = Field(
        description=(
            "Document grade based on correct diagnosis and relevant information. "
            "Must be exactly one of: 0, 1, 2"
        )
    )


# Grader LLM whose output is parsed into a GradeDocuments object.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
structured_llm_grader = llm.with_structured_output(GradeDocuments)

# System prompt for the document grader. The closing instruction asks for a
# bare digit because the graders parse the score with int().
system = """
    You are a grader assessing relevance of a retrieved document to a user question. \n 
    The content of the document can be found in page_content. Give a score for the document using the scoring system below. 
    Scoring: 
    * 0: irrelevant diagnosis \n
    * 1: correct diagnosis, but does not contain information to answer the user question \n
    * 2: correct diagnosis and contains information to answer the user question \n
    Respond with only the number 0, 1, or 2. \n
"""
prompt_gradedoc = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "RETRIEVED DOCUMENT: \n\n {document} \n\n USER QUESTION: {query}"),
    ]
)

# Chain: fill the prompt with {document, query} -> structured GradeDocuments result.
retrieval_grader = prompt_gradedoc | structured_llm_grader

def grade_doc(query, doc) -> dict:
    """Grade one (query, document) pair with the LLM relevance grader.

    Called once per retrieved document by the full-pipeline
    `document_relevance` evaluator.

    Returns a feedback dict: key "grade", the grader's score converted to
    int, and a fixed comment.
    """
    verdict = retrieval_grader.invoke({"query": query, "document": doc})
    feedback = {"key": "grade"}
    feedback["score"] = int(verdict.score)
    feedback["comment"] = "grade for doc"
    return feedback

def grade_docs(run, example) -> dict:
    """LangSmith evaluator for eval type 1: grade one dataset example.

    Each example holds a single query and a single retrieved document under
    `example.inputs["query"]` and `example.inputs["context"]`. `run` is not
    used.

    Returns the same feedback dict as `grade_doc`.
    """
    example_inputs = example.inputs
    return grade_doc(example_inputs["query"], example_inputs["context"])

In [7]:
def create_dataset_relevance(diagnosis, context_dict, dataset_name):
    """Create a LangSmith dataset for grading the relevance of retrieved context.

    Args:
        diagnosis: Diagnosis the contexts were retrieved for (used in the
            dataset description only).
        context_dict: Mapping whose values are (query, docs) pairs, as produced
            by `RagBot.get_contexts`.
        dataset_name: Name of the dataset to create.

    One example is created per retrieved document, with inputs
    {"query": ..., "context": ...}.
    """
    client = langsmith.Client()

    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description=f"Test context relevance for docs retreiived for {diagnosis}",
    )

    # Flatten to one example per (query, doc) pair and upload them in a single
    # request instead of one network call per document.
    example_inputs = [
        {"query": query, "context": doc}
        for query, docs in context_dict.values()
        for doc in docs
    ]
    if example_inputs:  # nothing retrieved -> leave the dataset empty
        client.create_examples(inputs=example_inputs, dataset_id=dataset.id)

In [8]:
# eval type: 1; used to assess the relevance of the retrieved context

def eval_context_relevance(rag_bot, assessment):
    """Grade the relevance of every document retrieved for `assessment`.

    Runs the retrieval part of the pipeline, stores each (query, doc) pair in a
    new timestamped LangSmith dataset, then runs `grade_docs` over that dataset.
    The evaluation target is an identity function because the examples already
    hold everything the grader needs.
    """
    # {"contexts": {section: (query, docs)}, "diagnosis": str}
    retrieval = rag_bot.retrieval_steps(assessment)
    context_dict = retrieval["contexts"]
    diagnosis = retrieval["diagnosis"]

    timestamp = datetime.now().strftime('%Y-%m-%d, %H:%M:%S')
    dataset_name = "Queries_Docs_{}_{}".format(diagnosis, timestamp)
    create_dataset_relevance(diagnosis, context_dict, dataset_name)

    run_metadata = {"model": "oai", "diagnosis": diagnosis}
    evaluate(
        lambda x: x,
        data=dataset_name,
        evaluators=[grade_docs],
        experiment_prefix="Context-relevance-",
        metadata=run_metadata,
    )

In [9]:
# Run the context-relevance evaluation (eval type 1) on an example asthma assessment.
eval_context_relevance(rag_bot, "5yo M, viral triggered asthma")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

View the evaluation results for experiment: 'Context-relevance--a75c23db' at:
https://smith.langchain.com/o/edfbc8bb-c3a3-5c1e-8b48-11b5a8cfd8ac/datasets/849d8b2c-52d8-40de-99d6-f414a23ae3f6/compare?selectedSessions=9c656256-5590-470c-904a-f0217f28748f




0it [00:00, ?it/s]

## Faithful checker

In [10]:
# not in use; used once to create dataset for eval_faithfulness_evaluator_prompt
def create_dataset_faithfulness(handout, context, dataset_name, diagnosis=None):
    """Create a one-example LangSmith dataset pairing a handout with its source context.

    Args:
        handout: Generated handout text; stored as the example output
            {"handout": ...}.
        context: Context the handout was generated from; stored as the example
            input {"contexts": ...} (the key `grade_faithfulness` reads).
        dataset_name: Name of the dataset to create.
        diagnosis: Optional diagnosis, mentioned in the dataset description
            when given.
    """
    client = langsmith.Client()

    # The original body referenced undefined names `diagnosis` and `contexts`;
    # `diagnosis` is now an optional argument and the `context` argument is used.
    subject = f"handout for {diagnosis}" if diagnosis else "handout"
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description=f"Test whether {subject} is based on provided context",
    )

    # TODO: preprocess context to remove headings
    client.create_examples(
        inputs=[{"contexts": context}],
        outputs=[{"handout": handout}],
        dataset_id=dataset.id,
    )

In [11]:
class Faithfulness(BaseModel):
    """List facts in handout not based on ground truth"""
    # Structured-output schema for the faithfulness grader. `grade_faithfulness`
    # reads `list_of_false` and reports its length as the score.
    # NOTE(review): this model is passed to `with_structured_output`, so the
    # docstring and field description presumably reach the LLM as schema text —
    # treat wording changes as prompt changes.
    list_of_false: List[str] = Field(description="List of facts in handout not based on ground truth")

# Grader LLM whose output is parsed into a Faithfulness object.
# NOTE: this rebinds `llm`, `structured_llm_grader` and `system`, which the
# document-grader cell also defines. `retrieval_grader` was built before this
# cell runs, so it keeps the earlier objects — but re-running cells out of order
# changes which prompt/LLM these names point to.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
structured_llm_grader = llm.with_structured_output(Faithfulness)

# System prompt: step-by-step instructions for finding sentences in the
# generated text that the context does not support.
system = """
You are an expert assessor tasked with evaluating whether the generated text (provided by the user between the XML tags GENERATED TEXT) is factually based on the provided context (provided by the user between the XML tags CONTEXT).  

Follow these steps:
    Step 1: Read the provided context carefully. Understand the information presented in the context.
    Step 2: Analyze each sentence in the GENERATED TEXT. Compare it with the provided CONTEXT to determine its factual basis. Sentences in the GENERATED TEXT that are similar (but not verbatim) to the sentences in provided CONTEXT, but are still factually aligned are considered factually based on the CONTEXT.
    Step 3: Identify sentences in the GENERATED TEXT that are not factually based on the context (not factually supported by the provided CONTEXT or directly contradicts the provided CONTEXT.) 

"""

# Human message: the handout and its contexts, wrapped in the tags the system
# prompt refers to.
human = """
<GENERATED TEXT>
{handout} 
</GENERATED TEXT>

<CONTEXT>
{contexts}
</CONTEXT>
"""

prompt_faithfulness= ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", human),
    ]
)

# Chain: fill the prompt with {handout, contexts} -> structured Faithfulness result.
grader_faithfulness= prompt_faithfulness| structured_llm_grader

def grade_faithfulness(run, example):
    """Count the handout sentences that the grader finds unsupported by the context.

    The handout and contexts are taken from `run.outputs` when both keys are
    present; on a KeyError they are read from the dataset example instead
    (`example.outputs["handout"]`, `example.inputs["contexts"]`).

    Returns a feedback dict whose score is the number of flagged sentences; the
    sentences themselves are included under "sentences".
    """
    try:
        run_outputs = run.outputs
        handout, contexts = run_outputs["handout"], run_outputs["contexts"]
    except KeyError:
        # The run did not produce both values -> use the stored example.
        handout, contexts = example.outputs["handout"], example.inputs["contexts"]

    verdict = grader_faithfulness.invoke({"handout": handout, "contexts": contexts})
    unsupported = verdict.list_of_false

    # NOTE(review): confirm LangSmith keeps the non-standard "sentences" key
    # rather than dropping it when it builds the feedback record.
    return {
        "key": "count",
        "score": len(unsupported),
        "comment": "num infactual sentences",
        "sentences": unsupported,
    }


In [13]:
# eval type: 1; to experiment with prompt engineering for LLM evaluator using dataset on context and handout

def eval_faithfulness_evaluator_prompt(rag_bot):
    """Evaluate the faithfulness-grader prompt against a stored dataset.

    Runs `grade_faithfulness` over the "Context_faithfulness" LangSmith dataset
    with an identity target, so the grader falls back to the handout and
    contexts stored in each example. `rag_bot` is accepted but not used.
    """
    settings = dict(
        data="Context_faithfulness",
        evaluators=[grade_faithfulness],
        experiment_prefix="Faithfulness-prompt-",
        metadata=dict(model="oai"),
    )
    evaluate(lambda x: x, **settings)

In [14]:
# Requires the LangSmith dataset "Context_faithfulness" to exist; the recorded
# run failed with LangSmithNotFoundError because it did not.
eval_faithfulness_evaluator_prompt(rag_bot)

LangSmithNotFoundError: Dataset Context_faithfulness not found

In [15]:
# eval type: 2; to evaluate context faithfulness for one diagnosis by running the whole rag chain. No dataset created in this process
def eval_context_faithfulness(rag_bot, assessment, md_plan):
    """Check one generated handout for faithfulness by running the whole RAG chain.

    The target ignores the dataset example it is given and always generates the
    handout for `assessment` / `md_plan`; the "Faithfulness" dataset only serves
    as a placeholder to drive `evaluate`.
    """
    def target(x):
        return rag_bot.make_handout(assessment, md_plan)

    target.__name__ = "<lambda>"  # keep the name the original lambda target had
    target.__qualname__ = "eval_context_faithfulness.<locals>.<lambda>"

    evaluate(
        target,
        data="Faithfulness",  # dummy dataset
        evaluators=[grade_faithfulness],
        experiment_prefix="Faithfulness-",
        metadata=dict(model="oai"),
    )

In [16]:
# Faithfulness check (eval type 2) for one example case: the full RAG chain is
# run on this assessment and MD plan, then the handout is graded against its contexts.
eval_context_faithfulness(
    rag_bot,
    assessment="5yo M, first asthma exacerbation, virally triggered", 
    md_plan="-continue ventolin q4h \n -continue flovent 125mcg qdaily \n -follow-up with your family doctor in 2 days",
)

View the evaluation results for experiment: 'Faithfulness--64dd2079' at:
https://smith.langchain.com/o/edfbc8bb-c3a3-5c1e-8b48-11b5a8cfd8ac/datasets/6ce351fc-f177-4f74-9322-8c8cd25579d4/compare?selectedSessions=07d6053c-5bfc-459c-9f8e-42bf08d1f644




0it [00:00, ?it/s]

## Full pipeline

In [19]:
def document_relevance(root_run, example):
    """Grade every document retrieved during a full-pipeline run.

    Finds the "make_handout" child of the root run, then its "retrieval_steps"
    child, and grades each (query, doc) pair in that run's "contexts" output
    with `grade_doc`. `example` is not used.

    Returns {"results": [one feedback dict per retrieved document]}.
    """
    handout_run = next(
        filter(lambda child: child.name == "make_handout", root_run.child_runs)
    )
    retrieval_run = next(
        filter(lambda child: child.name == "retrieval_steps", handout_run.child_runs)
    )

    contexts_by_section = retrieval_run.outputs["contexts"]
    grades = [
        grade_doc(query, doc)
        for query, docs in contexts_by_section.values()
        for doc in docs
    ]
    return {"results": grades}
    

def faithfulness(root_run, example):
    """Grade whether the handout from a full-pipeline run is grounded in its contexts.

    Locates the "make_handout" child of the root run and hands it to
    `grade_faithfulness` with no example, so the handout and contexts must come
    from that run's outputs. `example` is not used.
    """
    handout_run = next(
        filter(lambda child: child.name == "make_handout", root_run.child_runs)
    )
    return grade_faithfulness(handout_run, None)

In [20]:
def eval_llm_allcases_1(rag_bot):
    """Run the full pipeline over the "Dummy_pt_case" dataset and grade each run.

    Each example's "assessment" and "plan" inputs are passed to
    `rag_bot.make_handout`; every run is then scored by the
    `document_relevance` and `faithfulness` evaluators. The experiment prefix
    is the current timestamp.

    Returns:
        The object returned by `evaluate` (previously it was bound to an unused
        local and discarded), so callers can inspect the results.
    """
    return evaluate(
        lambda inputs: rag_bot.make_handout(assessment=inputs["assessment"], md_plan=inputs["plan"]),
        data="Dummy_pt_case",
        evaluators=[document_relevance, faithfulness],
        experiment_prefix=datetime.now().strftime('%Y-%m-%d, %H:%M:%S'),
    )

In [21]:
# Full-pipeline evaluation on the "Dummy_pt_case" dataset.
eval_llm_allcases_1(rag_bot)

View the evaluation results for experiment: '2024-06-20, 17:00:36-21b860ba' at:
https://smith.langchain.com/o/edfbc8bb-c3a3-5c1e-8b48-11b5a8cfd8ac/datasets/281774a1-94f9-47f1-9846-88086ceec7d5/compare?selectedSessions=0c07e3b7-890f-474b-b515-6f84833ccafb




0it [00:00, ?it/s]

In [24]:
def eval_llm_allcases_2(rag_bot, max_concurrency=2):
    """Run the full pipeline over the "Pt_cases" dataset and grade each run.

    Each example's "assessment" and "plan" inputs are passed to
    `rag_bot.make_handout`; every run is then scored by the
    `document_relevance` and `faithfulness` evaluators. The experiment prefix
    is the current timestamp.

    Args:
        rag_bot: Object exposing `make_handout(assessment=..., md_plan=...)`.
        max_concurrency: Upper bound on examples evaluated in parallel. The
            recorded run of this notebook hit OpenAI 429 tokens-per-minute
            errors, so the default is deliberately small; raise it if the
            account's rate limit allows.

    Returns:
        The object returned by `evaluate` (previously it was bound to an unused
        local and discarded), so callers can inspect the results.
    """
    return evaluate(
        lambda inputs: rag_bot.make_handout(assessment=inputs["assessment"], md_plan=inputs["plan"]),
        data="Pt_cases",
        evaluators=[document_relevance, faithfulness],
        experiment_prefix=datetime.now().strftime('%Y-%m-%d, %H:%M:%S'),
        max_concurrency=max_concurrency,
    )

In [25]:
# Full-pipeline evaluation on the "Pt_cases" dataset.
# NOTE: the recorded run hit OpenAI 429 (tokens-per-minute) rate-limit errors
# in the target function, so some examples were not evaluated.
eval_llm_allcases_2(rag_bot)

View the evaluation results for experiment: '2024-06-20, 17:08:12-276d0867' at:
https://smith.langchain.com/o/edfbc8bb-c3a3-5c1e-8b48-11b5a8cfd8ac/datasets/e3957f7c-e232-4541-beef-d7216ab12241/compare?selectedSessions=37eba5ab-c749-465c-8aec-3b12e19602b3




0it [00:00, ?it/s]

Error running target function: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo-1106 in organization org-hIcQ6WHRm8UbxJ53MIHwRhVG on tokens per min (TPM): Limit 60000, Used 59065, Requested 2196. Please try again in 1.261s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Error running target function: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo-1106 in organization org-hIcQ6WHRm8UbxJ53MIHwRhVG on tokens per min (TPM): Limit 60000, Used 58819, Requested 2079. Please try again in 898ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Error running target function: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo-1106 in organization org-hIcQ6WHRm8UbxJ53MIHwRhVG on tokens per min (TPM): Limit 60000, Used 59645, Requested 1391

## LLM grading based on custom metrics
- jargon
- reference list
- template format

## Human feedback of output