# Installing Dependencies

In [1]:
!pip install --upgrade --quiet  langchain langchain-huggingface sentence_transformers langchain-community faiss-gpu langsmith openai gradio langchain-openai

# Importing Libraries

In [2]:
import getpass
import gradio as gr
from langchain import hub
from langchain_community.vectorstores.faiss import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.vectorstores import VectorStoreRetriever
from langchain_huggingface import ChatHuggingFace
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_huggingface.llms import HuggingFacePipeline
from langchain_openai import ChatOpenAI
from langsmith import Client
from langsmith import traceable
import os

# Getting API Keys

In [3]:
# Secrets are read interactively with getpass so they never appear in the notebook.

# Hugging Face token. The original variable name is kept for backward
# compatibility, and the value is mirrored to HF_TOKEN, which is the variable
# huggingface_hub looks up when authenticating Hub downloads.
os.environ["HUGGINGFACE_API_KEY"] = getpass.getpass("Enter your HF Inference API Key: ")
os.environ["HF_TOKEN"] = os.environ["HUGGINGFACE_API_KEY"]

# LangSmith tracing configuration and credentials.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("Enter your LangSmith API key: ")

# OpenAI key used by the grader LLM.
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

Enter your HF Inference API Key: ··········
Enter your LangSmith API key: ··········
Enter your OpenAI API key: ··········


# Functions to Load Artifacts (embedding model, vector store and system LLM)

In [4]:
def load_embedding_model(model_id: str, device: str = 'cuda') -> HuggingFaceEmbeddings:
  """Create the embedding model used to query the vector index.

  Args:
    model_id: Model name passed to ``HuggingFaceEmbeddings`` as ``model_name``.
    device: Device string forwarded in ``model_kwargs``. Defaults to ``'cuda'``
      (the previous hard-coded value); pass ``'cpu'`` on machines without a GPU.

  Returns:
    The configured ``HuggingFaceEmbeddings`` instance.
  """
  return HuggingFaceEmbeddings(model_name=model_id, model_kwargs={'device': device})

In [5]:
def load_vector_index(
    embedding_model: HuggingFaceEmbeddings,
    k: int,
    index_path: str = "vector_store_index",
) -> VectorStoreRetriever:
  """Load a saved FAISS index from disk and expose it as an MMR retriever.

  Args:
    embedding_model: Embedding model used to embed queries against the index.
    k: Value passed as ``search_kwargs["k"]`` to the retriever.
    index_path: Folder containing the saved index. Defaults to
      ``"vector_store_index"`` (the previous hard-coded value).

  Returns:
    A retriever built with ``search_type="mmr"``.
  """
  # SECURITY: allow_dangerous_deserialization=True lets FAISS.load_local unpickle
  # the stored index. Only load index folders you created or fully trust.
  vector_store = FAISS.load_local(
      index_path,
      embedding_model,  # use the argument, not the notebook-global `embeddings`
      allow_dangerous_deserialization=True,
  )
  return vector_store.as_retriever(search_type="mmr", search_kwargs={"k": k})

In [6]:
def load_chat_model(model_id: str) -> ChatHuggingFace:
  """Build a local text-generation pipeline and wrap it as a chat model.

  Args:
    model_id: Model identifier passed to ``HuggingFacePipeline.from_model_id``.

  Returns:
    A ``ChatHuggingFace`` wrapper around the created pipeline.
  """
  llm = HuggingFacePipeline.from_model_id(
      model_id=model_id,
      task="text-generation",
      # Generation settings forwarded to the underlying pipeline.
      pipeline_kwargs=dict(
          max_new_tokens=512,
          do_sample=False,
          repetition_penalty=1.03,
      ),
  )
  return ChatHuggingFace(llm=llm)

# Loading Artifacts

In [7]:
# Embedding model used to embed queries against the saved vector index.
embeddings = load_embedding_model("sentence-transformers/all-mpnet-base-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
# MMR retriever over the saved FAISS index, configured with k=4.
retriever = load_vector_index(embeddings, 4)

In [9]:
# Locally hosted chat model that generates the RAG answers.
chat_model = load_chat_model(model_id="HuggingFaceH4/zephyr-7b-beta")

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use cuda:0


In [10]:
# OpenAI chat model used as the grader LLM by the evaluator functions (temperature=0).
llm_oai = ChatOpenAI(model="gpt-4-turbo", temperature=0, api_key=os.environ["OPENAI_API_KEY"])

# System Prompts

In [11]:
# Prompt used by the Q&A chain (fed with "context" and "question").
# See full prompt at https://smith.langchain.com/hub/rlm/rag-prompt
prompt = hub.pull("rlm/rag-prompt")

# Grader prompt for answer helpfulness.
# See full prompt at https://smith.langchain.com/hub/langchain-ai/rag-answer-helpfulness
helpfulness_prompt = hub.pull("langchain-ai/rag-answer-helpfulness")

# Grader prompt for answer hallucination.
# See full prompt at https://smith.langchain.com/prompts/rag-answer-hallucination?organizationId=1729c2d2-0d51-41fa-bdac-845786869e0e
hallucination_prompt = hub.pull("factoredaiexperiments/rag-answer-hallucination")

# Grader prompt for retrieved-document relevance.
# See full prompt at https://smith.langchain.com/prompts/rag-document-relevance?organizationId=1729c2d2-0d51-41fa-bdac-845786869e0e
document_relevance_prompt = hub.pull("factoredaiexperiments/rag-document-relevance")

# Defining Online Evaluation Functions

In [12]:
def answer_helpfulness_evaluator(question: str, answer: str) -> dict:
    """
    Grade how helpful an answer is for the given question.

    Pipes ``helpfulness_prompt`` into ``llm_oai``, invokes it with the question
    and the answer (as ``student_answer``), and reads the ``"Score"`` entry of
    the grader output.

    Returns a dict with ``"key"`` set to ``"answer_helpfulness_score"`` and
    ``"score"`` set to the grader's score.
    """
    grader_inputs = {
        "question": question,
        "student_answer": answer,
    }
    grading = (helpfulness_prompt | llm_oai).invoke(grader_inputs)
    return {"key": "answer_helpfulness_score", "score": grading["Score"]}

In [13]:
def answer_hallucination_evaluator(answer: str, context: str) -> dict:
    """
    Grade an answer against the supporting documents for hallucination.

    Pipes ``hallucination_prompt`` into ``llm_oai`` and invokes it with the
    context (as ``documents``) and the answer. The grader output is returned
    unchanged; the caller in this notebook reads its ``"Score"`` entry.
    """
    grader_inputs = {
        "documents": context,
        "answer": answer,
    }
    return (hallucination_prompt | llm_oai).invoke(grader_inputs)

In [14]:
def document_relevance_evaluator(context: str, question: str) -> dict:
    """
    Grade how relevant the retrieved documents are to the question.

    Pipes ``document_relevance_prompt`` into ``llm_oai`` and invokes it with the
    context (as ``documents``) and the question. The grader output is returned
    unchanged; the caller in this notebook reads its ``"Score"`` entry.
    """
    grader_inputs = {
        "documents": context,
        "question": question,
    }
    return (document_relevance_prompt | llm_oai).invoke(grader_inputs)

# Defining Q&A Function

In [15]:
def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by blank lines."""
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

def question_and_answering(question: dict) -> dict:
    """Answer a question with the RAG chain and gate the answer on three LLM graders.

    Args:
        question: Mapping holding the user's question under the ``'text'`` key.

    Returns:
        ``{'answer': <generated answer>}`` when the helpfulness, hallucination
        and document-relevance graders all return a score of 1; otherwise
        ``{'answer': "It wasn't possible to answer your question"}``.
    """
    question_text = question['text']

    # Retrieve explicitly so the graders receive exactly the documents that were
    # shown to the model, instead of recovering "context" from echoed prompt text.
    context = format_docs(retriever.invoke(question_text))

    qa_chain = prompt | chat_model | StrOutputParser()
    response = qa_chain.invoke({"context": context, "question": question_text})

    # The generated text may include the prompt followed by the '<|assistant|>'
    # marker. Keep the segment after the first marker; if the marker is absent,
    # treat the whole response as the answer rather than discarding it.
    segments = response.split('<|assistant|>')
    answer = (segments[1] if len(segments) > 1 else segments[0]).strip()

    answer_helpfulness_evaluation = answer_helpfulness_evaluator(question_text, answer)
    answer_hallucination_evaluation = answer_hallucination_evaluator(answer, context)
    # Argument order is (context, question); the previous call passed
    # (question, answer), so the documents were never actually graded.
    document_relevance_evaluation = document_relevance_evaluator(context, question_text)

    all_checks_passed = (
        answer_helpfulness_evaluation["score"] == 1
        and answer_hallucination_evaluation["Score"] == 1
        and document_relevance_evaluation["Score"] == 1
    )
    if all_checks_passed:
        return {'answer': answer}
    return {'answer': "It wasn't possible to answer your question"}

# Creating Function for Interface App

In [16]:
def question_and_answering_fn(question: str) -> str:
    """String-in/string-out adapter around ``question_and_answering`` for the UI.

    Wraps the question under the ``'text'`` key and returns the ``'answer'``
    entry of the result.
    """
    payload = {'text': question}
    return question_and_answering(payload)['answer']

## Testing Function

In [17]:
# Smoke test: run the full pipeline (generation plus graders) on a sample question.
question_and_answering_fn("What are the treatments for Glaucoma ?")

'1. Treatments for glaucoma include medications, laser surgery, conventional surgery, or a combination of these. Early diagnosis and treatment can delay progression of the disease.\n2. Research at the National Eye Institute is focused on finding better ways to detect, treat, and prevent vision loss in people with glaucoma. This includes studying genes that contribute to the disease, identifying individuals at high risk, and determining the best initial treatment.\n3. While treatments for glaucoma can slow the rate of vision loss, they do not restore sight already lost from the condition. Immediate treatment is crucial for those with early-stage, open-angle glaucoma.'

In [18]:
# Smoke test on a second sample question.
question_and_answering_fn("What are the treatments for Diabetes ?")

'For diabetes, managing blood glucose, blood pressure, and cholesterol is crucial to prevent serious complications. Treatments for diabetes include taking medications, monitoring blood glucose levels, checking blood pressure (if advised by a doctor), checking feet, brushing teeth, quitting smoking, eating well, and being active. For diabetic kidney disease, treatment options include dialysis, a kidney transplant, or a combination of both. For heart disease, a heart-healthy diet, physical activity, medications to treat heart damage and lower blood glucose, blood pressure, and cholesterol, aspirin (if not already taking a low dose), and possibly surgery or medical procedures are recommended. For nerve damage from diabetes, treatment is based on symptoms and may include low doses of certain medications, creams or patches, over-the-counter pain medicines, acupuncture, physical therapy, relaxation exercises, and quitting smoking and alcohol consumption. No treatment can reverse nerve damage

In [19]:
# Smoke test on a definitional sample question.
question_and_answering_fn("What is Blood Pressure ?")

'Blood pressure is the force of blood pushing against the walls of blood vessels as the heart pumps blood. High blood pressure, also known as hypertension, occurs when this force remains high over time, putting extra strain on the heart and potentially damaging other organs such as the kidneys, brain, and eyes. A normal blood pressure for adults is below 120/80 mmHg, but it can vary based on factors such as age and activity level. Pre-hypertension is a systolic or diastolic blood pressure between 120-139 mmHg or 80-89 mmHg, respectively. Pulmonary hypertension, on the other hand, is high blood pressure in the arteries leading to the lungs, which can lead to heart weakness and failure over time. Treatment for pulmonary hypertension involves managing underlying conditions, medications, oxygen therapy, and in severe cases, lung transplantation. (NIH: National Heart, Lung, and Blood Institute)'

# Creating Interface for the System

In [20]:
# Input widget for the user's question.
question_box = gr.Textbox(lines=2, placeholder="Enter your question here...")

# Single-function Gradio app: text question in, text answer out.
iface = gr.Interface(
    fn=question_and_answering_fn,
    inputs=question_box,
    outputs="text",
    title="Question Answering",
    description="Ask a question and get an answer.",
    submit_btn="Submit Question",
)

# Start the web UI for the app.
iface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://26579595f8046b1bac.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


