# Setup

In [1]:
from operator import itemgetter
from _global import path_to_resources, hf_embed
import templates
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers.string import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
import langsmith
from langsmith import traceable, trace
from langsmith.evaluation import LangChainStringEvaluator, evaluate
from langchain.callbacks.tracers import LangChainTracer

In [2]:
# set up retriever
db = Chroma(collection_name="main_collection", persist_directory=f"{path_to_resources}/db_wiki", embedding_function=hf_embed)
retriever = db.as_retriever(
                search_type = "similarity",
                search_kwargs = {"k":4},
            )

In [3]:
# langsmith setup
project_name = "ED-handout"

# RAG Class

In [4]:
class RagBot:
    def __init__(self, retriever, templates, model: str = "gpt-3.5-turbo-1106"):
        self._retriever = retriever
        self._llm_gpt = ChatOpenAI(model_name=model, temperature=0)
        self._llm_llama = Ollama(model="llama2:13b", temperature=0)
        self.templates = templates
        self.queries = {
            "definition": "definition of {diagnosis}",
            "presentation": "manifestations of {diagnosis}",
            "course": "natural history of {diagnosis}",
            "management": "treatment and management for {diagnosis}",
            "follow_up": "follow-up plan for {diagnosis}",
            "redflags": "signs and symptoms that indicate the need for urgent medical attention for patients with {diagnosis}",
        }

    
    @traceable
    def diagnosis_extraction(self, assessment):
        """Extracts diagnosis from physician's assessment of the patient"""
        prompt_extract_diagnosis = ChatPromptTemplate.from_messages([
            ("system",self.templates.extract_diagnosis_system),
            ("human", "{assessment}")
        ])
        chain_diagnosis = prompt_extract_diagnosis | self._llm_gpt
        
        return chain_diagnosis.invoke({"assessment":assessment}).content

    
    def make_queries(self, diagnosis):
        """Uses the diagnosis to populate dict of queries that will be used to retreive context from db"""
        return {key: value.format(diagnosis=diagnosis) for key, value in self.queries.items()}

    
    @traceable(run_type="retriever")
    def _retrieve_docs(self, query):
        return self._retriever.invoke(query)

    
    def get_contexts(self, queries):
        """returns a tuple with (query, contexts)"""
        contexts = {}
        for k, query in queries.items():
            contexts[k] = (query, self._retrieve_docs(query))
        
        return contexts


    def compress_contexts(self, q_c):
        prompt_compress = ChatPromptTemplate.from_messages([
            ("system", self.templates.compress_context_system),
            ("human", self.templates.compress_context_human)
        ])
        chain_compress = prompt_compress | self._llm_gpt

        return chain_compress.invoke({"query": q_c[0], "context": q_c[1]}).content

    
    @traceable()
    def retrieval_steps(self, assessment):
        """All the steps to prep the contexts for final handout generation"""    
        diagnosis = self.diagnosis_extraction(assessment)
        queries = self.make_queries(diagnosis)
        contexts = self.get_contexts(queries)

        return {"contexts": contexts, "diagnosis": diagnosis}
        
    
    @traceable()
    def make_handout(self, assessment, md_plan):
        _run_input = self.retrieval_steps(assessment)
        _contexts = _run_input["contexts"]
        diagnosis = _run_input["diagnosis"]

        # compression
        contexts = {}
        for k, q_c in _contexts.items():
            contexts[k] = self.compress_contexts(q_c)

        # make handout
        prompt_make_handout = ChatPromptTemplate.from_messages([
            ("system",self.templates.handout_generation_system),
            ("human", self.templates.handout_generation_human),
        ])
        chain_make_handout = prompt_make_handout | self._llm_gpt
        response = chain_make_handout.invoke({
            "context_definition": contexts["definition"],
            "context_presentation": contexts["presentation"],
            "context_course": contexts["course"],
            "context_management": contexts["management"],
            "context_follow_up": contexts["follow_up"],
            "context_redflags": contexts["redflags"],
            "context_md_plan": md_plan,
        })
        
        # Evaluators will expect "answer" and "contexts"
        return {
            "diagnosis": diagnosis,
            "contexts": "\n".join(contexts.values()) + "\n" + md_plan,
            "handout": response.content,
        }



In [5]:
bot = RagBot(retriever, templates)

In [9]:
# test that extraction works and works with langsmith

with trace("Diagnosis extraction", "chain", project_name=project_name, inputs={"assessment": "5yo M with viral-triggered asthma"}) as rt:
    output = bot.diagnosis_extraction(inputs["assessment"])
    rt.end(outputs={"output": output})

In [36]:
# need to wrap the bot so it can be used with langsmith evaluate()
def make_handout_with_context(rag_bot):
    def _make_handout_with_context(example: dict):
        """Use this for evaluation of retrieved documents and hallucinations"""
        response = rag_bot.make_handout(example["assessment"], example["plan"])
        print(response)
        return {"handout": response["handout"], "contexts": response["contexts"]}

    return _make_handout_with_context

# Eval

## Doc grader

In [85]:
### OpenAI Grader

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

# Data model
class GradeDocuments(BaseModel):
    """0-2 score based on relevance of doc."""

    score: str = Field(description="Documents grade based on correct diagnosis and relevant information")

# LLM with function call 
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
structured_llm_grader = llm.with_structured_output(GradeDocuments)

# Prompt 
system = """
    You are a grader assessing relevance of a retrieved document to a user question. \n 
    The content of the document can be found in page_content. Give a score for the document using the scoring system below. 
    Scoring: 
    * 0: irrelevant diagnosis \n
    * 1: correct diagnosis, but does not contain information to anser the user question \n
    * 2: correct diagnosis and contains information to answer the user question). \n
    
    
"""
prompt_gradedoc = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "RETRIEVED DOCUMENT: \n\n {document} \n\n USER QUESTION: {query}"),
    ]
)

retrieval_grader = prompt_gradedoc | structured_llm_grader

def grade_docs(run, example) -> dict:
    grade = retrieval_grader.invoke({"query": example.inputs["query"], "document": example.inputs["context"]})
    return {"key": "grade", "score": grade.score, "comment": "grade for doc"}


- given a diagnosis
- create dataset of query + doc for each doc retrieved from each query
- run experiement on the dataset

In [86]:
def create_dataset_relevance(diagnosis, context_dict, dataset_name):
    """Takes query_context dictionary and create a dataset for {diagnosis} to evaluate the relevance of retrieved context"""
    client = langsmith.Client()
    
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description=f"Test context relevance for docs retreiived for {diagnosis}",
    )

    for query, q_c in context_dict.values(): #each document should be an example in the dataset        
        for doc in q_c:
            client.create_examples(
                inputs=[{"query": query, "context": doc}],
                dataset_id=dataset.id,
            )

In [87]:
def context_relevance(rag_bot, assessment):
    retrieved = rag_bot.retrieval_steps(assessment) # dict of query:context
    context_dict = retrieved["contexts"]
    diagnosis = retrieved["diagnosis"]

    dataset_name = f"Queries_Docs_{diagnosis}"
    #create_dataset_relevance(diagnosis, context_dict, dataset_name)
        
    evaluate(
        lambda x:x,
        data=dataset_name,
        evaluators=[grade_docs],
        experiment_prefix="Context-relevance-",
        metadata={
            "model": "oai",
            "diagnosis":diagnosis
        },
    )

In [88]:
context_relevance(bot, "5yo M, viral triggered asthma")

View the evaluation results for experiment: 'Context-relevance--ddec2471' at:
https://smith.langchain.com/o/edfbc8bb-c3a3-5c1e-8b48-11b5a8cfd8ac/datasets/2c62c230-5ffb-4be1-a160-67b532c3d537/compare?selectedSessions=53cc885e-a871-4aa7-bf70-fa1cf31bdcd3




0it [00:00, ?it/s]

{'type': 'Document', 'metadata': {'Title': 'Acute severe asthma', 'source': 'https://en.wikipedia.org/wiki/Acute_severe_asthma', 'Header2': 'Recent research'}, 'page_content': 'A recent study proposed that the interaction between host airway epithelial cells and respiratory viruses is another aspect of innate immunity that is also a critical determination of asthma. It was also proposed that a rationale for how antiviral performance at the epithelial cell level might be improved to prevent acute infectious illness and chronic inflammatory disease caused by respiratory viruses.  \nAnother study aimed to show that experimental asthma after viral infection inmate depended on Type I IFN-driven up-regulation of the high-affinity receptor for IgE \\(FcεRI\\) on conventional dendritic cells \\(cDCs\\) in the lungs. The study found that a Novell PMN-cDc interaction in the lung is necessary for a viral infection to induce atopic disease.'}{'type': 'Document', 'metadata': {'Title': 'Asthma', 'so

Error running evaluator <DynamicRunEvaluator grade_docs> on run ed4b637b-bf1a-4258-bc03-a73dde6904d4: ValueError("Expected an EvaluationResult object, or dict with a metric 'key' and optional 'score'; got {'key': 'grade', 'score': '1', 'comment': 'grade for doc'}")
Traceback (most recent call last):
  File "/Users/a_wei/miniconda3/envs/llm/lib/python3.10/site-packages/langsmith/evaluation/evaluator.py", line 204, in _coerce_evaluation_result
    return EvaluationResult(**{"source_run_id": source_run_id, **result})
  File "/Users/a_wei/miniconda3/envs/llm/lib/python3.10/site-packages/pydantic/v1/main.py", line 341, in __init__
    raise validation_error
pydantic.v1.error_wrappers.ValidationError: 3 validation errors for EvaluationResult
score
  value is not a valid boolean (type=value_error.strictbool)
score
  value is not a valid integer (type=type_error.integer)
score
  value is not a valid float (type=type_error.float)

The above exception was the direct cause of the following except

{'type': 'Document', 'metadata': {'Title': 'Acute severe asthma', 'source': 'https://en.wikipedia.org/wiki/Acute_severe_asthma', 'Header2': 'Recent research'}, 'page_content': 'A recent study proposed that the interaction between host airway epithelial cells and respiratory viruses is another aspect of innate immunity that is also a critical determination of asthma. It was also proposed that a rationale for how antiviral performance at the epithelial cell level might be improved to prevent acute infectious illness and chronic inflammatory disease caused by respiratory viruses.  \nAnother study aimed to show that experimental asthma after viral infection inmate depended on Type I IFN-driven up-regulation of the high-affinity receptor for IgE \\(FcεRI\\) on conventional dendritic cells \\(cDCs\\) in the lungs. The study found that a Novell PMN-cDc interaction in the lung is necessary for a viral infection to induce atopic disease.'}
{'type': 'Document', 'metadata': {'Title': 'Asthma', 's

Error running evaluator <DynamicRunEvaluator grade_docs> on run c817e9ad-4dcb-407e-8f89-baefd01c481b: ValueError("Expected an EvaluationResult object, or dict with a metric 'key' and optional 'score'; got {'key': 'grade', 'score': '1', 'comment': 'grade for doc'}")
Traceback (most recent call last):
  File "/Users/a_wei/miniconda3/envs/llm/lib/python3.10/site-packages/langsmith/evaluation/evaluator.py", line 204, in _coerce_evaluation_result
    return EvaluationResult(**{"source_run_id": source_run_id, **result})
  File "/Users/a_wei/miniconda3/envs/llm/lib/python3.10/site-packages/pydantic/v1/main.py", line 341, in __init__
    raise validation_error
pydantic.v1.error_wrappers.ValidationError: 3 validation errors for EvaluationResult
score
  value is not a valid boolean (type=value_error.strictbool)
score
  value is not a valid integer (type=type_error.integer)
score
  value is not a valid float (type=type_error.float)

The above exception was the direct cause of the following except

{'type': 'Document', 'metadata': {'Title': 'Asthma-COPD overlap', 'source': 'https://en.wikipedia.org/wiki/Asthma-COPD_overlap', 'Header2': 'Treatment'}, 'page_content': 'Treatment of ACO is based on expert opinion as there are no universally accepted clinical guidelines. Treatment is usually based on whether clinical features of asthma or COPD predominate. Inhaled corticosteroids are the primary treatment in those with ACOS. Inhaled corticosteroids \\(ICS\\) should be continued in those with asthma who develop decreased airway responsiveness to bronchodilators consistent with ACO. Therapy can be escalated to include a long acting beta-agonist \\(LABA\\) and inhaled steroid combination \\(ICS-LABA\\) or by adding on a long-acting anti-muscarinic inhaler \\(LAMA\\), known as triple therapy, in those with more severe or resistant disease.  \nMonoclonal antibodies targeting type 2 inflammation \\(which is predominant in asthma\\) have been used to treat severe asthma, and may also be used

Error running evaluator <DynamicRunEvaluator grade_docs> on run 49c309e8-1563-4db8-b514-1a2775634ea1: ValueError("Expected an EvaluationResult object, or dict with a metric 'key' and optional 'score'; got {'key': 'grade', 'score': '2', 'comment': 'grade for doc'}")
Traceback (most recent call last):
  File "/Users/a_wei/miniconda3/envs/llm/lib/python3.10/site-packages/langsmith/evaluation/evaluator.py", line 204, in _coerce_evaluation_result
    return EvaluationResult(**{"source_run_id": source_run_id, **result})
  File "/Users/a_wei/miniconda3/envs/llm/lib/python3.10/site-packages/pydantic/v1/main.py", line 341, in __init__
    raise validation_error
pydantic.v1.error_wrappers.ValidationError: 3 validation errors for EvaluationResult
score
  value is not a valid boolean (type=value_error.strictbool)
score
  value is not a valid integer (type=type_error.integer)
score
  value is not a valid float (type=type_error.float)

The above exception was the direct cause of the following except

## Ground truth checker
** LLM gives different evaluation each time?

In [65]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import List

# Data model
class GroundTruth(BaseModel):
    """List facts in handout not based on ground truth"""
    list_of_false: List[str] = Field(description="List of facts in handout not based on ground truth")

# LLM with function call 
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
structured_llm_grader = llm.with_structured_output(GroundTruth)

# Prompt 
system = """
You are an expert assessor tasked with evaluating whether the generated text (provided by the user between the XML tags GENERATED TEXT) is factually based on the provided context (provided by the user between the XML tags CONTEXT).  

Follow these steps:
    Step 1: Read the provided context carefully. Understand the information presented in the context.
    Step 2: Analyze each sentence in the GENERATED TEXT. Compare it with the provided CONTEXT to determine its factual basis. Sentences in the GENERATED TEXT that are similar (but not verbatim) to the sentences in provided CONTEXT, but are still factually aligned are considered factually based on the CONTEXT.
    Step 3: Identify sentences in the GENERATED TEXT that are not factually based on the context (not factually supported by the provided CONTEXT or directly contradicts the provided CONTEXT.) 

"""

human = """
<GENERATED TEXT>
{handout} 
</GENERATED TEXT>

<CONTEXT>
{contexts}
</CONTEXT>
"""

prompt_groundtruth = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", human),
    ]
)

grader_groundtruth = prompt_groundtruth | structured_llm_grader

def tmp(inputs) -> dict:
    return inputs

def grade_groundtruth(run, example):
    result = grader_groundtruth.invoke({"handout": example.inputs["handout"], "contexts": example.inputs["contexts"]}).list_of_false
    count = len(result)
    
    return {
        "key": "count", "score": count, "comment": "num infactual sentences",
        "sentences": result
    }


In [57]:
def create_dataset_groundtruth(handout, context, dataset_name):
    """Takes handout and contexts used to make a dataset"""
    client = langsmith.Client()
    
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description=f"Test whether handout for {diagnosis} is based on provided context",
    )

    # **Preprocess context to remove headings
    client.create_examples(
        inputs=[{"handout": handout, "contexts": contexts}],
        dataset_id=dataset.id,
    )

In [66]:
def context_groundtruth(rag_bot, assessment, md_plan):
    """
    retrieved = rag_bot.retrieval_steps(assessment) # dict of query:context
    context_dict = retrieved["contexts"]
    diagnosis = retrieved["diagnosis"]
    """
    #outputs = rag_bot.make_handout(assessment, md_plan)
    #dataset_name = f"Groundtruth_{diagnosis}"
    #create_dataset_relevance(diagnosis, context_dict, dataset_name=f"Queries_Docs_{diagnosis}") # also makes a dataset for retrieval relevance but not evaluate on it
    #create_dataset_groundtruth(outputs["handout"], outputs["contexts"], dataset_name)


    # tmp - using existing context ground truth to test the llm evaluator
    dataset_name = "Context_groundtruth"
        
    evaluate(
        lambda x:x,
        data=dataset_name,
        evaluators=[grade_groundtruth],
        experiment_prefix="Groundtruth-",
        metadata={
            "model": "oai",
            "diagnosis":"tmp"
        },
    )

In [67]:
context_groundtruth(bot, None, None)

View the evaluation results for experiment: 'Groundtruth--091700f9' at:
https://smith.langchain.com/o/edfbc8bb-c3a3-5c1e-8b48-11b5a8cfd8ac/datasets/8c0209e9-1fa7-4eae-9ee6-657491c07e75/compare?selectedSessions=71f30369-3ab9-4fab-97ad-34e61d6e4790




0it [00:00, ?it/s]

## LLM grading based on custom metrics
- jargon
- reference list
- template format

## Human feedback of output