# Installing Dependencies

In [None]:
pip install --upgrade --quiet  langchain langchain-huggingface sentence_transformers langchain-community faiss-gpu

# Importing Dependencies

In [None]:
import getpass
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.faiss import FAISS
from langchain_huggingface.llms import HuggingFacePipeline
from langchain_huggingface import ChatHuggingFace
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import getpass
import os
from langsmith import Client
import pandas as pd
from langsmith import traceable
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate

In [None]:
# Prompt for the Hugging Face Inference API key without echoing it, so the secret is not saved in the notebook.
# NOTE(review): `inference_api_key` is not referenced by any later cell visible here — confirm whether it is still needed.
inference_api_key = getpass.getpass("Enter your HF Inference API Key:\n\n")

Enter your HF Inference API Key:

··········



# Loading Embedding Model

In [None]:
# Embedding model handed to FAISS.load_local below; model_kwargs pins it to the 'cuda' device.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs={'device': 'cuda'})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Loading Vector Store

In [None]:
# Load the saved FAISS index from the local "vector_store_index" path, using the embedding model above.
# NOTE(review): allow_dangerous_deserialization=True opts in to deserializing the stored index,
# which presumably involves pickle — only load an index file you created or fully trust.
vector_store = FAISS.load_local("vector_store_index", embeddings, allow_dangerous_deserialization=True)

In [None]:
# Retriever configured with search_type="mmr" and k=4.
# NOTE(review): `qa_chain` further down calls vector_store.as_retriever() with default settings
# instead of using this object — confirm which retriever the evaluation is meant to use.
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 4})

# Loading the LLM

In [None]:
# Build a text-generation pipeline for zephyr-7b-beta from its model id.
# Generation settings: at most 512 new tokens, sampling disabled (do_sample=False),
# and a mild repetition penalty of 1.03.
llm = HuggingFacePipeline.from_model_id(
    model_id="HuggingFaceH4/zephyr-7b-beta",
    task="text-generation",
    pipeline_kwargs=dict(
        max_new_tokens=512,
        do_sample=False,
        repetition_penalty=1.03,
    ),
)

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# Wrap the text-generation pipeline in LangChain's chat-model interface.
# The isinstance guard keeps this cell idempotent: because it rebinds `llm` to a
# wrapper around itself, re-running it without the guard would wrap an
# already-wrapped model.
if not isinstance(llm, ChatHuggingFace):
    llm = ChatHuggingFace(llm=llm)

# Defining Q&A Chain used for answer generation

In [None]:
# See full prompt at https://smith.langchain.com/hub/rlm/rag-prompt
prompt = hub.pull("rlm/rag-prompt")



In [None]:
def format_docs(docs):
    """Concatenate the `page_content` of each document, separated by a blank line."""
    contents = []
    for document in docs:
        contents.append(document.page_content)
    return "\n\n".join(contents)

# RAG chain. The incoming question is (a) sent to the retriever, whose documents are
# joined into one string by format_docs, and (b) passed through unchanged; both values
# fill the prompt, which is run by the LLM and parsed into a plain string.
# NOTE(review): this uses vector_store.as_retriever() with default settings rather than
# the MMR `retriever` defined earlier in the notebook — confirm which one is intended.
qa_chain = (
    {
        "context": vector_store.as_retriever() | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

## Testing Q&A Chain

In [None]:
question = "What is (are) Glaucoma ?"

In [None]:
response = qa_chain.invoke(question)

In [None]:
print('question:', question)
print('answer:', response.split('<|assistant|>')[1].strip())

question: What is (are) Glaucoma ?
answer: Glaucoma is a group of eye diseases that damage the optic nerve, leading to vision loss and blindness. The most common type is open-angle glaucoma, where the fluid in the eye drains too slowly through the meshwork at the angle between the cornea and iris, causing pressure and potential damage to the optic nerve. Early treatment can help prevent serious vision loss, especially for those at higher risk, such as African Americans over 40, people over 60, especially Mexican Americans, and those with a family history of glaucoma. Without treatment, glaucoma can cause loss of peripheral vision and eventually straight-ahead vision. There is currently no cure for glaucoma, but it can be managed with prescription eyedrops and/or surgery. Early detection through regular eye exams is crucial for protecting against vision loss.


In [None]:
question = "What is (are) Medicare and Continuing Care ?"

In [None]:
# Use the `question` variable set in the previous cell (as the first test does) so the
# printed question and the invoked question cannot drift apart.
response = qa_chain.invoke(question)

In [None]:
print('question:', question)
print('answer:', response.split('<|assistant|>')[1].strip())

question: What is (are) Medicare and Continuing Care ?
answer: Medicare is a federal health insurance program for individuals over 65, those under 65 with certain disabilities, and those with end-stage renal disease. It covers some healthcare costs but not all. Original Medicare is managed by the federal government and allows people to choose any healthcare provider who accepts Medicare. Medicare Part A, also known as hospital insurance, helps cover inpatient care in hospitals, skilled nursing facility care, hospice care, and home health care under certain conditions. Medicaid is a state-run program that provides hospital and medical coverage for people with low income and little or no resources. Some people may qualify for both Medicare and Medicaid. For more information about Medicare and Medicaid, visit the Medicare website or call 1-800-MEDICARE.


In [None]:
!pip install --quiet -U langsmith openai

# Connecting to LangSmith for the Offline Evaluation

In [None]:
# Turn on tracing and point at the hosted LangSmith API endpoint.
# The API key is read interactively so it is never written into the notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("Enter your LangSmith API key: ")

Enter your LangSmith API key: ··········


In [None]:
ls_client = Client(api_key=os.environ["LANGCHAIN_API_KEY"])

### Loading Evaluation Data

In [None]:
eval_df = pd.read_csv('intern_screening_dataset_eval.csv').sample(200, random_state=123)

In [None]:
labeled_texts = list(eval_df.itertuples(index=False, name=None))

In [None]:
labeled_texts[:6]

[('What are the treatments for Age-related Macular Degeneration ?',
  'Wet AMD can be treated with laser surgery, photodynamic therapy, and drugs that are injected into the eye. None of these treatments is a cure for wet AMD. The disease and loss of vision may progress despite treatment.'),
 ('What is (are) Kidney Disease ?',
  "Instead of dialysis, some people with kidney failure -- including older adults -- may be able to receive a kidney transplant. This involves having a healthy kidney from another person surgically placed into your body. The new, donated kidney does the work that your two failed kidneys used to do. The donated kidney can come from an anonymous donor who has recently died, or from a living person -- usually a relative. But you might also be able to receive a kidney from an unrelated donor, including your spouse or a friend. (Watch the video to learn more about kidney transplantation. To enlarge the video, click the brackets in the lower right-hand corner. To reduce

### Defining LangSmith dataset

In [None]:
dataset_name = "Supportiv__QnA__Offline_Eval"

# Keep this cell idempotent: creating a dataset whose name already exists is expected
# to fail, and re-uploading the examples would duplicate them. Reuse the existing
# dataset when there is one, and only create + populate it on the first run.
# NOTE(review): an existing dataset is assumed to already hold the examples.
if ls_client.has_dataset(dataset_name=dataset_name):
    dataset = ls_client.read_dataset(dataset_name=dataset_name)
else:
    dataset = ls_client.create_dataset(dataset_name=dataset_name)
    # Each (question, reference answer) pair becomes one example:
    # inputs carry the question under "text", outputs the label under "answer".
    ls_client.create_examples(
        inputs=[{"text": text} for text, _ in labeled_texts],
        outputs=[{"answer": label} for _, label in labeled_texts],
        dataset_id=dataset.id,
    )

# Creating Q&A Function using Q&A Chain

In [None]:
# 'traceable' decorator to trace the inputs/outputs of this function.
@traceable
def question_and_answering(question: dict) -> dict:
    """Answer one dataset example with the Q&A chain.

    Args:
        question: Example inputs; the question string is read from ``question['text']``.

    Returns:
        A dict with:
          * ``'answer'``   - the stripped text following the first ``<|assistant|>``
            marker in the chain output, or ``''`` when the marker is absent;
          * ``'question'`` - the question string that was asked;
          * ``'context'``  - the stripped text preceding the first marker.
    """
    query = question['text']
    # split() always yields at least one piece, so the head is safe to unpack;
    # `generated` is empty when the marker never appears in the output.
    prompt_part, *generated = qa_chain.invoke(query).split('<|assistant|>')
    if generated:
        answer = generated[0].strip()
    else:
        answer = ''
    return {'answer': answer, 'question': query, 'context': prompt_part.strip()}

## Using OpenAI gpt-4-turbo as the LLM Judge

In [None]:
!pip install --quiet langchain-openai

In [None]:
OPENAI_API_KEY=getpass.getpass("Enter your OpenAI API key: ")

Enter your OpenAI API key: ··········


In [None]:
llm_oai = ChatOpenAI(model="gpt-4-turbo", temperature=0, api_key=OPENAI_API_KEY)

In [None]:
# correctness_prompt = PromptTemplate.from_template("You are a teacher grading a quiz. \n\nYou will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER. \n\nHere is the grade criteria to follow:\n(1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer. \n(2) Ensure that the student answer does not contain any conflicting statements.\n(3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the  ground truth answer.\n\nScore:\nA score of 1 means that the student's answer meets all of the criteria. This is the highest (best) score. \nA score of 0 means that the student's answer does not meet all of the criteria. This is the lowest possible score you can give.\n\nExplain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. \n\nAvoid simply stating the correct answer at the outset.\n\nQUESTION: {question}\nGROUND TRUTH ANSWER: {correct_answer}\nSTUDENT ANSWER: {student_answer}")

In [None]:
# correctness_prompt

PromptTemplate(input_variables=['correct_answer', 'question', 'student_answer'], input_types={}, partial_variables={}, template="You are a teacher grading a quiz. \n\nYou will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER. \n\nHere is the grade criteria to follow:\n(1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer. \n(2) Ensure that the student answer does not contain any conflicting statements.\n(3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the  ground truth answer.\n\nScore:\nA score of 1 means that the student's answer meets all of the criteria. This is the highest (best) score. \nA score of 0 means that the student's answer does not meet all of the criteria. This is the lowest possible score you can give.\n\nExplain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct

# Evaluators

## Correctness Evaluator
Measures whether the answer is correct compared with the reference answer.

In [None]:
# See full prompt at https://smith.langchain.com/hub/langchain-ai/rag-answer-vs-reference
correctness_prompt = hub.pull("langchain-ai/rag-answer-vs-reference")

def answer_evaluator(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    """Grade the chain's answer against the reference answer with the LLM judge.

    Args:
        inputs: Example inputs; the question is read from ``inputs["text"]``.
        outputs: Chain outputs; the prediction is read from ``outputs["answer"]``.
        reference_outputs: Example outputs; the ground truth is read from
            ``reference_outputs["answer"]``.

    Returns:
        Feedback dict ``{"key": "answer_v_reference_score", "score": ...}`` where the
        score is the ``"Score"`` field of the grader's result.
    """
    grader_payload = {
        "question": inputs["text"],
        "correct_answer": reference_outputs["answer"],
        "student_answer": outputs["answer"],
    }
    # Compose the grading prompt with the judge model and run it on this example.
    grade = (correctness_prompt | llm_oai).invoke(grader_payload)
    return {"key": "answer_v_reference_score", "score": grade["Score"]}



Correctness Chain Testing

In [None]:
answer_grader = correctness_prompt | llm_oai

## Helpfulness Evaluator
Measures whether the answer actually addresses the question that was asked.

In [None]:
# See full prompt at https://smith.langchain.com/hub/langchain-ai/rag-answer-helpfulness
helpfulness_prompt = hub.pull("langchain-ai/rag-answer-helpfulness")

def answer_helpfulness_evaluator(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    """Grade how well the chain's answer addresses the question with the LLM judge.

    Args:
        inputs: Example inputs; the question is read from ``inputs["text"]``.
        outputs: Chain outputs; the prediction is read from ``outputs["answer"]``.
        reference_outputs: Accepted for signature parity with the other evaluator;
            it is not read here.

    Returns:
        Feedback dict ``{"key": "answer_helpfulness_score", "score": ...}`` where the
        score is the ``"Score"`` field of the grader's result.
    """
    grader_payload = {
        "question": inputs["text"],
        "student_answer": outputs["answer"],
    }
    # Compose the grading prompt with the judge model and run it on this example.
    verdict = (helpfulness_prompt | llm_oai).invoke(grader_payload)
    return {"key": "answer_helpfulness_score", "score": verdict["Score"]}



## Hallucination Evaluator
Measures whether the answer contains hallucinations, i.e. statements that are not supported by the provided context.

In [None]:
# See full prompt at https://smith.langchain.com/prompts/rag-answer-hallucination?organizationId=1729c2d2-0d51-41fa-bdac-845786869e0e
hallucination_prompt = hub.pull("factoredaiexperiments/rag-answer-hallucination")

def answer_hallucination_evaluator(answer: str, context: str) -> dict:
    """Ask the LLM judge whether `answer` is grounded in `context`.

    Args:
        answer: The generated answer to check.
        context: The text the answer should be supported by; sent to the
            grader as its ``"documents"`` input.

    Returns:
        The grader's result, returned unchanged.
    """
    grader_payload = {"documents": context, "answer": answer}
    # Compose the hallucination prompt with the judge model and run it.
    return (hallucination_prompt | llm_oai).invoke(grader_payload)



In [None]:
hallucination_grader = hallucination_prompt | llm_oai

## Document Relevance Evaluator
Measures whether the documents included in the context are relevant to answering the question.

In [None]:
# See full prompt at https://smith.langchain.com/prompts/rag-document-relevance?organizationId=1729c2d2-0d51-41fa-bdac-845786869e0e
document_relevance_prompt = hub.pull("factoredaiexperiments/rag-document-relevance")

def document_relevance_evaluator(context: str, question: str) -> dict:
    """Ask the LLM judge whether `context` is relevant to `question`.

    Args:
        context: The retrieved text to assess; sent to the grader as its
            ``"documents"`` input.
        question: The question the documents should help answer.

    Returns:
        The grader's result, returned unchanged.
    """
    grader_payload = {"documents": context, "question": question}
    # Compose the relevance prompt with the judge model and run it.
    return (document_relevance_prompt | llm_oai).invoke(grader_payload)



## Testing Evaluators

Getting Single Example

In [None]:
# Build one (input, output, reference) triple from the first evaluation row,
# in the same dict shapes the evaluators receive, to smoke-test them by hand.
# NOTE(review): `input` shadows the Python builtin of the same name for the rest of the session.
input = {"text":labeled_texts[0][0]}
output = question_and_answering({"text":labeled_texts[0][0]})
reference_output = {"answer":labeled_texts[0][1]}

In [None]:
input

{'text': 'What is (are) Danon disease ?'}

In [None]:
output['answer']

'Danon disease is a genetic disorder that affects both the heart muscle (cardiomyopathy) and skeletal muscles (myopathy), leading to weakness and intellectual disability. Males with Danon disease typically develop symptoms earlier and more severely than females. Cardiomyopathy is the most common symptom and can lead to heart failure and premature death. Other symptoms may include palpitations, arrhythmias, chest pain, and conduction abnormalities. Women with Danon disease may also develop cardiomyopathy, but later in life and with less severity. The disorder is inherited in an X-linked manner.'

In [None]:
reference_output

{'answer': 'Danon disease is a condition characterized by weakening of the heart muscle (cardiomyopathy); weakening of the muscles used for movement, called skeletal muscles, (myopathy); and intellectual disability. Males with Danon disease usually develop the condition earlier than females and are more severely affected. Signs and symptoms begin in childhood or adolescence in most affected males and in early adulthood in most affected females. Affected males, on average, live to age 19, while affected females live to an average age of 34.  Cardiomyopathy is the most common symptom of Danon disease and occurs in all males with the condition. Most affected men have hypertrophic cardiomyopathy, which is a thickening of the heart muscle that may make it harder for the heart to pump blood. Other affected males have dilated cardiomyopathy, which is a condition that weakens and enlarges the heart, preventing it from pumping blood efficiently. Some affected men with hypertrophic cardiomyopath

Testing Correctness Evaluation

In [None]:
answer_evaluator(input, output, reference_output)

{'key': 'answer_v_reference_score', 'score': 1}

In [None]:
# Call the correctness grader directly to inspect its full result (score + explanation).
# The reference answer is the ground truth and the chain's answer is the one being
# graded — the same mapping answer_evaluator uses. (These two were previously swapped.)
answer_grader.invoke(
    {
        "question": input['text'],
        "correct_answer": reference_output['answer'],
        "student_answer": output['answer'],
    }
)

{'Score': 1,
 'Explanation': "The student's answer provides a detailed and accurate description of Danon disease, aligning well with the ground truth answer. It expands on the ground truth by providing additional details about the types of cardiomyopathy (hypertrophic and dilated), specific symptoms like Wolff-Parkinson-White syndrome, and other potential symptoms such as gastrointestinal and visual abnormalities. The student also includes information about the average lifespan of affected males and females, which, while not mentioned in the ground truth, is factually relevant and adds depth to the understanding of the disease's impact. The student's answer does not contradict any information from the ground truth and enhances the explanation with more specific details, thus earning a score of 1."}

Testing Hallucination Evaluation

In [None]:
answer_hallucination_evaluator(answer=output['answer'], context=output['context'])

{'Score': 1,
 'Explanation': 'The ANSWER accurately reflects the CONTEXT provided. It correctly identifies Danon disease as a genetic disorder characterized by cardiomyopathy and myopathy, and mentions the intellectual disability associated with it. The ANSWER also correctly notes the differences in symptom severity and onset between males and females, as well as the inheritance pattern of the disease. There is no hallucinated information; all details in the ANSWER are supported by the CONTEXT. Therefore, the ANSWER meets all the grading criteria and deserves a score of 1.'}

In [None]:
mock_wrong_context = "Glaucoma is a group of diseases that can damage the eye's optic nerve and result in vision loss and blindness. The most common form of the disease is open-angle glaucoma. With early treatment, you can often protect your eyes against serious vision loss. (Watch the video to learn more about glaucoma. To enlarge the video, click the brackets in the lower right-hand corner. To reduce the video, press the Escape (Esc) button on your keyboard.)  See this graphic for a quick overview of glaucoma, including how many people it affects, whos at risk, what to do if you have it, and how to learn more.  See a glossary of glaucoma terms."
answer_hallucination_evaluator(answer=output['answer'], context=mock_wrong_context)

{'Score': 0,
 'Explanation': "The ANSWER provided discusses Danon disease, which is a genetic disorder affecting the heart and skeletal muscles. This is entirely unrelated to the CONTEXT, which focuses on glaucoma, a group of diseases that damage the eye's optic nerve and can lead to vision loss and blindness. The ANSWER does not address any aspect of glaucoma, its risk factors, symptoms, or treatments, and instead introduces information about a completely different medical condition. Therefore, the ANSWER does not meet the criteria of being grounded in the CONTEXT and contains information outside the scope of the CONTEXT."}

Testing Document Relevance Evaluation

In [None]:
document_relevance_evaluator(context=output['context'], question=input['text'])

{'Score': 1,
 'Explanation': 'The documents provided are directly relevant to the question about Danon disease. They contain detailed descriptions of Danon disease, including its genetic basis, symptoms, and inheritance patterns. The documents discuss the lysosomal storage disorder aspect, the mutations in the LAMP2 gene, and the specific symptoms such as cardiomyopathy, skeletal myopathy, and intellectual disability. All these details are pertinent to understanding what Danon disease is, thus meeting the criteria for a score of 1.'}

In [None]:
mock_wrong_input = "What is (are) Glaucoma ?"
document_relevance_evaluator(context=output['context'], question=mock_wrong_input)

{'Score': 0,
 'Explanation': 'The provided documents focus exclusively on Danon disease, discussing its symptoms, genetic causes, and effects on the body. There is no mention of Glaucoma, its symptoms, causes, or treatments. Glaucoma is a group of eye conditions that damage the optic nerve, crucial for good vision, and is unrelated to Danon disease, which primarily affects the heart, muscles, and intellectual abilities. Therefore, the documents are completely unrelated to the question about Glaucoma, and thus receive a score of 0.'}

# Offline Evaluation

In [None]:
# Run question_and_answering over the LangSmith dataset named by `dataset_name` and
# score each run with the correctness and helpfulness evaluators.
results = ls_client.evaluate(
    question_and_answering,
    data=dataset_name,
    evaluators=[answer_evaluator, answer_helpfulness_evaluator],
    max_concurrency=4, # optional, add concurrency
)

View the evaluation results for experiment: 'virtual-system-63' at:
https://smith.langchain.com/o/1729c2d2-0d51-41fa-bdac-845786869e0e/datasets/f7d5d060-9ff7-4ba4-8c9d-ca1d8fa159c9/compare?selectedSessions=d2d76b5e-217a-4157-a3cf-3b947abea408




0it [00:00, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
results

Unnamed: 0,inputs.text,outputs.answer,outputs.question,outputs.context,error,reference.answer,feedback.answer_v_reference_score,feedback.answer_helpfulness_score,execution_time,example_id,id
0,Is Tay-Sachs disease inherited ?,"Yes, Tay-Sachs disease is inherited in an auto...",Is Tay-Sachs disease inherited ?,<|user|>\nYou are an assistant for question-an...,,This condition is inherited in an autosomal re...,1,1,11.640936,6fefee0b-4462-44b0-ab68-ed9934ff51e3,7ea40cc0-d6f4-4f4c-80da-18f44ed94d29
1,What are the treatments for Proctitis ?,1. Treatment for proctitis depends on its caus...,What are the treatments for Proctitis ?,<|user|>\nYou are an assistant for question-an...,,Proctitis that is not treated or does not resp...,0,1,27.814781,a5c3cfc9-c8f6-40a9-b801-720cbc3353f4,33391451-9422-4fac-952e-c0fcab11d3ce
2,Is Prader-Willi syndrome inherited ?,Most cases of Prader-Willi syndrome are not in...,Is Prader-Willi syndrome inherited ?,<|user|>\nYou are an assistant for question-an...,,Most cases of Prader-Willi syndrome are not in...,1,1,30.235728,02985be4-74e2-4772-8fec-79525b848d5c,dcb5e968-e15a-489d-a9a5-487e80a105d7
3,What is (are) Brachial Plexus Injuries ?,Brachial plexus injuries are damage to the ner...,What is (are) Brachial Plexus Injuries ?,<|user|>\nYou are an assistant for question-an...,,The brachial plexus is a network of nerves tha...,1,1,43.220116,aa9f1285-bf0d-4ba8-95ba-664b812a974c,ddeac030-5a00-4896-9882-94e7aa5693bb
4,What are the treatments for Stroke ?,"1. For stroke, treatments include prevention m...",What are the treatments for Stroke ?,<|user|>\nYou are an assistant for question-an...,,Generally there are three treatment stages for...,0,1,45.316889,0478ec1e-ea0c-4980-8c9c-63fc53188757,e07fe532-46a9-478b-8f40-e66ce1748838
...,...,...,...,...,...,...,...,...,...,...,...
195,What are the treatments for Acromegaly ?,The treatments for Acromegaly include surgery ...,What are the treatments for Acromegaly ?,<|user|>\nYou are an assistant for question-an...,,No single treatment is effective for all patie...,1,1,34.748666,1ccdcd9e-b6e4-4c6d-939c-592b3c632509,6239bb5a-14b9-4712-8eb7-7567143a3b0b
196,What is (are) Down syndrome ?,Down syndrome is a genetic condition caused by...,What is (are) Down syndrome ?,<|user|>\nYou are an assistant for question-an...,,Down syndrome is a chromosomal condition that ...,1,1,39.678329,eebeb9bd-c3a5-4f39-b763-53918a93d646,d74753b4-836d-4888-b029-75567241626f
197,What is (are) Kidney Disease ?,"1. Kidney disease, also known as chronic kidne...",What is (are) Kidney Disease ?,<|user|>\nYou are an assistant for question-an...,,"Instead of dialysis, some people with kidney f...",0,1,49.325931,9d10f132-f55b-4f15-a6ed-5f71155b758a,94eecfe9-d9f5-4919-86bd-5c27fc3ecf33
198,What are the treatments for Age-related Macula...,"1. For intermediate AMD, treatment can delay a...",What are the treatments for Age-related Macula...,<|user|>\nYou are an assistant for question-an...,,"Wet AMD can be treated with laser surgery, pho...",1,1,26.227031,e40e3dcc-4770-4be2-8a0e-0ab954eca5a7,4fcb2b80-c096-49e4-8c0e-e6b4e56b7ff1


In [None]:
metrics_columns = ['feedback.answer_v_reference_score', 'feedback.answer_helpfulness_score']
results.to_pandas()[metrics_columns].mean()

Unnamed: 0,0
feedback.answer_v_reference_score,0.755
feedback.answer_helpfulness_score,0.955


The Hallucination and Document Relevance evaluators are not part of the offline run above; they are used during the Online Evaluation.