In [None]:
!pip install pandas pydantic langchain langchain_community langchain_openai langchain_anthropic langchain_google_genai deepeval rank_bm25 faiss-cpu duckduckgo-search

In [4]:
!which python

/home/aragy/HumaAI/.venv/bin/python


In [5]:
import os
import re
import json
from typing import List, Tuple, Optional

import pandas as pd
from pydantic import BaseModel, Field
from langchain.docstore.document import Document
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.tools import DuckDuckGoSearchResults
from dotenv import load_dotenv

from deepeval import evaluate
from deepeval.metrics import GEval, FaithfulnessMetric, ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams



In [6]:
load_dotenv("/home/aragy/HumaAI/Huma.ai-assessment/Section_2_to_6/.env")

True

# Section 2 : Setting Up the Retrieval System

In this section, I set up the retrieval system using advanced techniques to enhance the effectiveness and accuracy of information retrieval.

I implemented a combination of Reciprocal Rank Fusion (RRF) and Corrective Retrieval Augmented Generation (CRAG) for our retrieval system.

1. **Reciprocal Rank Fusion** (https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf): RRF is an effective method for combining the results of multiple retrieval systems. By integrating both BM25 (a statistical term-based retrieval method) and FAISS (a vector similarity search using embeddings), we leverage the strengths of both approaches.
    - **BM25 Retriever**: BM25 excels at term-based matching, effectively retrieving documents containing terms similar to the query.
    - **FAISS Retriever**: FAISS allows for efficient vector similarity search, capturing semantic similarities that may not be evident through term matching.

By fusing the rankings from both retrievers, RRF enhances overall retrieval performance, ensuring that relevant documents are not missed due to the limitations of a single method.


2. **CRAG(Corrective RAG)**:    Extends the standard RAG approach by incorporating dynamic evaluation and correction of the retrieval process. This is crucial for ensuring the reliability and accuracy of the information provided to the user.

- Dynamic Evaluation: An evaluator (implemented using a Language Model, LLM) assesses the relevance of the retrieved documents to the query.

- Corrective Actions: Depending on the evaluation score:
    - If relevance is high, the best document is selected.
    - If relevance is moderate, retrieved knowledge is combined with web search results.
    - If relevance is low, the system relies on web search to supplement knowledge.

- Combining Internal and External Knowledge: This approach allows the system to provide comprehensive and accurate responses by utilizing both the internal dataset and external web resources when necessary.

### Trade-offs Considered:

- Complexity vs. Performance:

    - Complexity: Implementing RRF and CRAG increases system complexity due to the integration of multiple retrieval methods and the need for dynamic evaluation.
    - Performance Gains: The combined approach significantly improves retrieval accuracy and the ability to handle a wider range of queries effectively.
    - Decision: Accepted increased complexity in favor of achieving higher accuracy and reliability, essential for medical information retrieval.

- Computational Resources:

    - Resource Consumption: The combined approach requires more computational resources due to multiple retrievers and the dynamic evaluation process.
    - Optimization: Parameters were tuned (e.g., setting k=5 for the number of documents retrieved) to balance performance, cost and efficiency without overloading the system.

- Cost Implications:

    - API Costs: Using OpenAI embeddings involves API costs.
    - Value Assessment: The improved performance and accuracy justify the expenditure, especially when accurate information retrieval can have significant impacts on user decisions in healthcare contexts.


In [7]:
class DocumentRetriever:
    def __init__(self, bm25_docs: List[str], faiss_docs: List[str]):
        self.bm25_retriever = BM25Retriever.from_texts(
            bm25_docs, metadatas=[{"source": 1}] * len(bm25_docs)
        )
        self.bm25_retriever.k = 5        
        embedding = OpenAIEmbeddings()
        faiss_vectorstore = FAISS.from_texts(
            faiss_docs, embedding, metadatas=[{"source": 2}] * len(faiss_docs)
        )
        self.faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 5})

        self.ensemble_retriever = EnsembleRetriever(
            retrievers=[self.bm25_retriever, self.faiss_retriever], weights=[0.5, 0.5]
        )

    def retrieve(self, query: str) -> List[Document]:
        return self.ensemble_retriever.invoke(query)

In [8]:
class RetrievalEvaluatorInput(BaseModel):
    relevance_score: float = Field(..., description="The relevance score of the document to the query. the score should be between 0 and 1.")

In [9]:
class Evaluator:
    def __init__(self, llm):
        self.llm = llm

    def evaluate_relevance(self, query: str, document: str) -> float:
        prompt = PromptTemplate(
            input_variables=["query", "document"],
            template="On a scale from 0 to 1, how relevant is the following document to the query? Query: {query}\nDocument: {document}\nRelevance score:"
        )
        chain = prompt | self.llm.with_structured_output(RetrievalEvaluatorInput)
        input_variables = {"query": query, "document": document}
        result = chain.invoke(input_variables).relevance_score
        return result

In [10]:
class KnowledgeRefinementInput(BaseModel):
    key_points: str = Field(..., description="The document to extract key information from.")

In [11]:

class KnowledgeRefiner:
    def __init__(self, llm):
        self.llm = llm

    def refine(self, document: str) -> List[str]:
        prompt = PromptTemplate(
            input_variables=["document"],
            template="Extract the key information from the following document in bullet points:\n{document}\nKey points:"
        )
        chain = prompt | self.llm.with_structured_output(KnowledgeRefinementInput)
        input_variables = {"document": document}
        result = chain.invoke(input_variables).key_points
        return [point.strip() for point in result.split('\n') if point.strip()]

In [12]:
class QueryRewriterInput(BaseModel):
    query: str = Field(..., description="The query to rewrite.")

In [13]:
class QueryRewriter:
    def __init__(self, llm):
        self.llm = llm

    def rewrite(self, query: str) -> str:
        prompt = PromptTemplate(
            input_variables=["query"],
            template="Rewrite the following query to make it more suitable for a web search:\n{query}\nRewritten query:"
        )
        chain = prompt | self.llm.with_structured_output(QueryRewriterInput)
        input_variables = {"query": query}
        return chain.invoke(input_variables).query.strip()

In [14]:
class WebSearcher:
    def __init__(self, llm):
        self.llm = llm
        self.search = DuckDuckGoSearchResults()

    def search_and_refine(self, query: str) -> Tuple[List[str], List[Tuple[str, str]]]:
        rewritten_query = QueryRewriter(self.llm).rewrite(query)
        web_results = self.search.run(rewritten_query)
        web_knowledge = KnowledgeRefiner(self.llm).refine(web_results)
        sources = self.parse_search_results(web_results)
        return web_knowledge, sources

    @staticmethod
    def parse_search_results(results_string: str) -> List[Tuple[str, str]]:
        try:
            pattern = r'snippet: (.*?), title: (.*?), link: (https?://[^\s,]+)'
            matches = re.findall(pattern, results_string)
            data = [{"snippet": snippet, "title": title, "link": link} for snippet, title, link in matches]
            results = json.loads(json.dumps(data, indent=4))
            return [(result.get('title', 'Untitled'), result.get('link', '')) for result in results]
        except json.JSONDecodeError:
            print("Error parsing search results. Returning empty list.")
            return []

In [15]:
class ResponseGenerator:
    def __init__(self, llm):
        self.llm = llm

    def generate(self, query: str, knowledge: str, sources: List[Tuple[str, str]]) -> str:
        response_prompt = PromptTemplate(
            input_variables=["query", "knowledge", "sources"],
            template="Based on the following knowledge, answer the query. Include the sources with their links (if available) at the end of your answer:\nQuery: {query}\nKnowledge: {knowledge}\nSources: {sources}\nAnswer:"
        )
        input_variables = {
            "query": query,
            "knowledge": knowledge,
            "sources": "\n".join([f"{title}: {link}" if link else title for title, link in sources])
        }
        response_chain = response_prompt | self.llm
        return response_chain.invoke(input_variables).content

In [16]:
class QueryProcessor:
    def __init__(self, retriever: DocumentRetriever, evaluator: Evaluator, web_searcher: WebSearcher, llm):
        self.retriever = retriever
        self.evaluator = evaluator
        self.web_searcher = web_searcher
        self.llm = llm

    def process(self, query: str, eval_documents: bool = True) -> str:
        """
        Process the query and generate a response.

        Args:
            query (str): The query string.
            eval_documents (bool): Whether to calculate evaluation scores or just use the retrieved documents.

        Returns:
            str: The generated response.
        """
        retrieved_docs = self.retriever.retrieve(query)
        
        if eval_documents:
            eval_scores = [self.evaluator.evaluate_relevance(query, doc.page_content) for doc in retrieved_docs]
            max_score = max(eval_scores)

            if max_score > 0.7:
                best_doc = retrieved_docs[eval_scores.index(max_score)]
                final_knowledge = best_doc.page_content
                sources = [("Retrieved document", "")]
            elif max_score < 0.3:
                final_knowledge, sources = self.web_searcher.search_and_refine(query)
            else:
                best_doc = retrieved_docs[eval_scores.index(max_score)]
                retrieved_knowledge = KnowledgeRefiner(self.llm).refine(best_doc.page_content)
                web_knowledge, web_sources = self.web_searcher.search_and_refine(query)
                final_knowledge = "\n".join(retrieved_knowledge + web_knowledge)
                sources = [("Retrieved document", "")] + web_sources
        else:
            final_knowledge = retrieved_docs[0].page_content
            sources = [("Retrieved document", "")]

        response = ResponseGenerator(self.llm).generate(query, final_knowledge, sources)
        return response

In [17]:
#df = pd.read_csv('/home/aragy/Huma.ai-assessment/Dataset/cleaned_dataset.csv')
df = pd.read_csv('/home/aragy/HumaAI/Huma.ai-assessment/Dataset/cleaned_dataset.csv')
documents = [row['question'] + ' ' + row['answer_solution'] for _, row in df.iterrows()]

retriever = DocumentRetriever(documents, documents)


In [18]:
retriever.retrieve("What are the Keytruda's side effects?")

[Document(metadata={'source': 1}, page_content='What are the common side effects of Keytruda? Common side effects include fatigue, nausea, and skin rash.'),
 Document(metadata={'source': 1}, page_content='Can Keytruda cause immune-related adverse effects? Yes, Keytruda can cause immune-related adverse effects such as colitis, hepatitis, and pneumonitis.'),
 Document(metadata={'source': 1}, page_content='What were the side effects noted in the KEYNOTE-006 trial? Common side effects included fatigue, itching, and diarrhea.'),
 Document(metadata={'source': 2}, page_content='Are there specific side effects of Keytruda that NSCLC patients should monitor? NSCLC patients should monitor for cough, shortness of breath, and chest pain, as these could indicate immune-related pneumonitis.'),
 Document(metadata={'source': 2}, page_content='What should patients report immediately while on Keytruda treatment? Patients should report any new or worsening symptoms such as cough, chest pain, or changes i

In [19]:
llm = ChatOpenAI(model="gpt-4o-mini", max_tokens=1000, temperature=0)
evaluator = Evaluator(llm)
web_searcher = WebSearcher(llm)

# Section 3: Integrating the Generation Component

In this section, I integrate generative language models into the retrieval system to provide detailed and contextual answers based on the retrieved documents. I tested three advanced models: gpt-4o-mini, claude-3.5-sonnet, and gemini-1.5-flash.

-Integration of Multiple Generative Models
- Models Used:

    1. gpt-4o-mini (ChatOpenAI)
    2. claude-3.5-sonnet (ChatAnthropic)
    3. gemini-1.5-flash (ChatGoogleGenerativeAI)

- Justification for Model Selection:

    - Diversity of Capabilities: By integrating multiple models, we aim to leverage the unique strengths of each model, enhancing the system's overall performance.
        - gpt-4o-mini:
            - Strengths: Known for its advanced language understanding and ability to generate coherent and contextually relevant responses.
            - Use Case: Serves as a reliable baseline for comparison and is efficient in handling a wide range of queries.
        - claude-3.5-sonnet:
            - Strengths: Excels in generating detailed and accurate responses, particularly in complex or nuanced queries.
            - Use Case: Valuable for critical queries requiring high precision and depth.
        - gemini-1.5-flash:
            - Strengths: Offers rapid response times and is optimized for speed.
            - Use Case: Suitable for scenarios where quick responses are essential, albeit with a potential trade-off in depth.

- Process of Integration:

    - Evaluator and WebSearcher Instances: For each model, we created separate instances of the evaluator and web searcher to assess document relevance and perform web searches when necessary.

    - QueryProcessor Class: We instantiated the QueryProcessor class for each model, orchestrating the retrieval and generation process tailored to the specific capabilities of each model.



### GPT 4o mini

In [20]:
llm_gpt = ChatOpenAI(model="gpt-4o-mini", max_tokens=1000, temperature=0)

In [21]:

evaluator = Evaluator(llm_gpt)
web_searcher = WebSearcher(llm_gpt)
processor_gpt = QueryProcessor(retriever, evaluator, web_searcher, llm_gpt)

In [21]:
response = processor_gpt.process("What are the Keytruda's side effects?")
print(response)

Keytruda (pembrolizumab) is an immunotherapy drug used to treat various types of cancer. Common side effects of Keytruda include:

- Fatigue
- Nausea
- Skin rash

These side effects can vary in intensity and may not occur in every patient. It's important for patients to discuss any side effects they experience with their healthcare provider.

Sources: Retrieved document.


In [22]:
response = processor_gpt.process("What is the indication for using Keytruda?")
print(response)

Keytruda (pembrolizumab) is an immunotherapy medication primarily indicated for the treatment of various types of cancer, including:

1. **Non-small cell lung cancer (NSCLC)** - It is particularly effective in patients with PD-L1 expression, significantly improving survival rates.
2. **Melanoma**
3. **Head and neck squamous cell cancer (HNSCC)**
4. **Classical Hodgkin Lymphoma**
5. **Endometrial carcinoma**
6. **Triple-negative breast cancer**

Keytruda functions as a checkpoint inhibitor, enhancing the immune system's ability to fight cancer. It is intended for long-term treatment, with recommended dosages of 200 mg once every 3 weeks or 400 mg once every 6 weeks. Treatment effectiveness is monitored through imaging, blood tests, and laboratory tests.

For more detailed information, you can refer to the following sources:
- [Medical News Today - Keytruda (pembrolizumab): Side effects, uses, cost, and more](https://www.medicalnewstoday.com/articles/keytruda)
- [Healthline - Signs That 

In [23]:
response = processor_gpt.process("How Keytruda perform in cancer remission?")
print(response)

Keytruda (pembrolizumab) has shown significant effectiveness in promoting cancer remission, particularly in specific types of cancer such as non-small cell lung cancer (NSCLC) and muscle-invasive urothelial carcinoma (MIUC). 

1. **Non-Small Cell Lung Cancer (NSCLC)**: Keytruda is particularly effective in patients with high PD-L1 expression, leading to improved survival rates. The use of PD-1 immune checkpoint inhibitors like Keytruda has been associated with prolonged overall survival (OS) in various cancer patients.

2. **Muscle-Invasive Urothelial Carcinoma (MIUC)**: In patients with high-risk MIUC, adjuvant treatment with Keytruda has resulted in improved disease-free survival (DFS) compared to observation alone after surgery. This suggests that Keytruda can significantly enhance remission rates in this patient population.

3. **General Effectiveness**: The effectiveness of Keytruda can vary based on the type of cancer, its stage, and the patient's previous treatments. It has been

### Claude 3.5 sonnet

In [22]:
llm_claude = ChatAnthropic(temperature=0, model_name='claude-3-5-sonnet-20240620')

In [23]:
evaluator_claude = Evaluator(llm_claude)
web_searcher_claude = WebSearcher(llm_claude)
processor_claude = QueryProcessor(retriever, evaluator_claude, web_searcher_claude, llm_claude)

In [58]:
response = processor_claude.process("What are the Keytruda's side effects?")
print(response)

Based on the provided knowledge, the common side effects of Keytruda include:

1. Fatigue
2. Nausea
3. Skin rash

It's important to note that these are just the common side effects, and individual experiences may vary. Patients should always consult their healthcare provider for a comprehensive list of potential side effects and to discuss any concerns they may have about their treatment.

Source:
Retrieved document (no link available)


In [59]:
response = processor_claude.process("What is the indication for using Keytruda?")
print(response)

Based on the provided knowledge, Keytruda (pembrolizumab) is indicated for the treatment of various types of cancer, including:

1. Non-small cell lung cancer, particularly in patients with PD-L1 expression
2. Melanoma (skin cancer)
3. Adenocarcinomas
4. Endometrial carcinoma
5. Triple-negative breast cancer

Keytruda is an immunotherapy treatment classified as a checkpoint inhibitor. It works by helping the immune system fight cancer cells. The treatment has shown significant improvement in survival rates, especially for non-small cell lung cancer patients.

It's important to note that Keytruda is intended for long-term treatment, and its use is continuously being evaluated in numerous clinical trials for various cancers. Since its initial approval in September 2014, at least 20 new indications have been approved, and ongoing investigational studies are exploring new uses for pembrolizumab.

Sources:
2. Keytruda (pembrolizumab): Side effects, uses, cost, and more: https://www.medicaln

In [60]:
response = processor_claude.process("How Keytruda perform in cancer remission?")
print(response)

Based on the provided knowledge, I cannot directly answer how Keytruda performs in cancer remission. The information given is specifically about Keytruda's effectiveness in treating non-small cell lung cancer, not about its performance in cancer remission.

However, I can share what is provided:

Keytruda has demonstrated significant improvement in survival rates for patients with non-small cell lung cancer who have PD-L1 expression. This suggests that Keytruda is effective in treating this specific type of cancer, but it does not provide information about cancer remission in general or for other types of cancer.

To accurately answer the query about Keytruda's performance in cancer remission, more specific and relevant information would be needed.

Source:
Retrieved document (no link available)


### Gemini

In [24]:
llm_gemini = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0,)

In [25]:
evaluator_gemini = Evaluator(llm_gemini)
web_searcher_gemini = WebSearcher(llm_gemini)
processor_gemini = QueryProcessor(retriever, evaluator_gemini, web_searcher_gemini, llm_gemini)

In [26]:
response = processor_gemini.process("What are the Keytruda's side effects?")
print(response)

Keytruda's common side effects include fatigue, nausea, and skin rash. 

**Source:** Retrieved document 



In [42]:
response = processor_gemini.process("What is the indication for using Keytruda?")
print(response)

Keytruda is indicated for the treatment of relapsed or refractory classical Hodgkin lymphoma after two or more lines of therapy. 

**Source:** Retrieved document 



In [43]:
response = processor_gemini.process("How Keytruda perform in cancer remission?")
print(response)

Keytruda demonstrated superior efficacy compared to chemotherapy in NSCLC patients in the KEYNOTE-456 trial. This superiority was observed in terms of overall survival and progression-free survival. 

**Source:** Retrieved document 



### Challenges and Solutions:

- Model-Specific Limitations:
    - Challenge: Each model has its own limitations in terms of response quality, depth, and accuracy.
    - Solution: By testing all three models, we identified the strengths and weaknesses of each, allowing us to select the most appropriate model based on the specific query and context.

- Consistency Across Models:
    - Challenge: Ensuring consistent formatting and information across different models can be difficult due to variations in output styles.
    - Solution: Standardized the prompts and post-processing steps to harmonize the responses as much as possible.

- Integration Complexity:
    - Challenge: Integrating multiple models increases the complexity of the system.
    - Solution: Encapsulated model-specific logic within dedicated classes and maintained a consistent interface for the QueryProcessor, simplifying the overall integration.

- Comparative Analysis:

    - Accuracy and Detail:
        - gpt-4o-mini and claude-3.5-sonnet provided more detailed responses, with claude-3.5-sonnet often offering slightly more elaboration.
        - gemini-1.5-flash tended to give concise answers, which may lack depth but are quicker to read.

    - Response Time:
        - gemini-1.5-flash was designed for speed but, in practice, had longer response times due to API limitations.
        - gpt-4o-mini offered a good balance between speed and detail.
        - claude-3.5-sonnet had moderate response times but excelled in accuracy.

    - Use Case Suitability:
        - gpt-4o-mini: Suitable for general queries requiring balanced performance.
        - claude-3.5-sonnet: Preferred for complex queries where accuracy is paramount.
        - gemini-1.5-flash: Appropriate for situations where brevity is acceptable.

- Justification:

    - Comprehensive Evaluation: Testing all three models allowed us to assess their performance in the context of our RAG system and select the best fit for different scenarios.

    - Optimizing User Experience: By understanding the nuances of each model, we can route queries to the model that will provide the most accurate and useful response, enhancing overall user satisfaction.

### Conclusion:

Integrating multiple generative models into the RAG system provided valuable insights into their respective capabilities. This approach enabled us to leverage the strengths of each model and address their weaknesses, ultimately leading to a more robust and flexible system capable of delivering high-quality answers across a variety of medical queries.

# Section 4: Evaluation and Optimization

In this section, I evaluate the performance of the RAG system using specific metrics and optimize it based on the results.

1. Correctness:

    * This metric measures how factually accurate the generated output is compared to the ground truth. It reflects the system's ability to provide the correct information in response to the question.
    * Since the system deals with medical data (Keytruda), factual correctness is crucial. Errors in drug interactions, survival rates, or side effects could lead to significant consequences in real-world applications.

2. Faithfulness:

    * Faithfulness checks whether the generated response aligns with the retrieved documents. It ensures that the system does not hallucinate or introduce information that was not present in the retrieval phase.
    * This metric ensures that the system remains grounded in the retrieved documents, avoiding the creation of content that wasn't found in the original sources, which is particularly important in critical domains like healthcare.

3. Contextual Relevancy:

    * This metric measures how well the generated output matches the context retrieved. It checks whether the system is selecting the most relevant content from the retrieved information.
    * Even if the answer is factually correct, it is essential to ensure that the response is relevant to the specific question being asked. A high relevancy score indicates that the system focuses on the right content from the retrieved documents.

4. Response Time:

    * This measures the average time it takes for each model to generate a response.
    * In a real-world application, response time is critical, especially in healthcare-related scenarios where rapid answers may be necessary. The evaluation of processing time helps us compare the efficiency of different models.

5. Cost:

    * The cost metric evaluates the financial cost associated with using each model based on the number of tokens processed (input and output).
    * Cost-effectiveness is an important factor, particularly when scaling the system for frequent usage. Knowing the cost per model allows for informed decisions on model selection based on budget constraints.

## Test Preparation

In [27]:
class TestScenario:
    def __init__(self, question: str, gt_answer: str, retrieval_context: List[str], ):
        self.question = question
        self.gt_answer = gt_answer
        self.retrieval_context = retrieval_context        
        self.pred_answer = None

    def run(self, processor:QueryProcessor):

        self.pred_answer = processor.process(self.question)

        test_case_combined = LLMTestCase(
            input=self.question,
            expected_output=self.gt_answer,
            actual_output=self.pred_answer,
            retrieval_context=self.retrieval_context,
        )
        return test_case_combined

In [28]:
class MetricsEvaluator:
    def __init__(self,model_name:str) -> None:
        self.model_name = model_name
        self.correctness_metric = GEval(
            name="Correctness",
            model=model_name,
            evaluation_params=[
                LLMTestCaseParams.EXPECTED_OUTPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT],
            evaluation_steps=[
                "Determine whether the actual output is factually correct based on the expected output."
            ],
        )
        self.faithfulness_metric = FaithfulnessMetric(
            threshold=0.7,
            model=model_name,
            include_reason=True
        )
        self.relevance_metric = ContextualRelevancyMetric(
            threshold=1,
            model=model_name,
            include_reason=True
        )


In [29]:
class TestRunner:
    def __init__(self, metrics_evaluator:MetricsEvaluator):
        self.metrics_evaluator = metrics_evaluator

    def run_test(self, test_case: LLMTestCase):

        self.metrics_evaluator.correctness_metric.measure(test_case)
        self.metrics_evaluator.faithfulness_metric.measure(test_case)
        self.metrics_evaluator.relevance_metric.measure(test_case)

        print(f"Correctness Score: {self.metrics_evaluator.correctness_metric.score}")
        print(f"Faithfulness Score: {self.metrics_evaluator.faithfulness_metric.score}")
        print(f"Contextual Relevancy Score: {self.metrics_evaluator.relevance_metric.score}")

## Scenarios

### 1. Keytruda efficacy and drug interactions

* **Question**: "Are there any known interactions between Keytruda and other medications?"
* **Ground Truth** : "Yes, Keytruda can interact with steroids and certain immunosuppressants, potentially affecting its efficacy and safety."
* **Context**: Includes documents on Keytruda side effects, effectiveness, and interactions.

In [30]:
question = "Are there any known interactions between Keytruda and other medications?"
gt_answer = "Yes, Keytruda can interact with steroids and certain immunosuppressants, potentially affecting its efficacy and safety."

retrieval_context = [
    "Keytruda has interactions with steroids",
    "Keytruda is commonly used with other immunosuppressants"
]

scenario_1 = TestScenario(question=question, gt_answer=gt_answer, retrieval_context=retrieval_context)


### 2. **Five-Year Survival Rate with Keytruda**

* **Question**: "What is the five-year survival rate for patients using Keytruda for lung cancer?"
* **Ground Truth**: "No detailed information available on the given topic."
* **Context**: Documents related to survival rates, but lacking detailed data for five-year outcomes.


In [31]:
question = "What is the five-year survival rate for patients using Keytruda for lung cancer?"
gt_answer = "No detailed information available on the given topic."

retrieval_context = [
    "No detailed information available on the given topic.",
    "Keytruda has improved survival rates, but specific five-year data may not be available."
]

scenario_2 = TestScenario(question=question, gt_answer=gt_answer, retrieval_context=retrieval_context)


### 3. Common Side Effects of Keytruda

* **Question**: "What are the common side effects of Keytruda?"
* **Ground Truth**: "Common side effects include fatigue, nausea, and skin rash."
* **Context**: Medical literature on Keytruda's side effects.

In [32]:
question = "What are the common side effects of Keytruda?"
gt_answer = "Common side effects include fatigue, nausea, and skin rash."

retrieval_context = [
    "Common side effects include fatigue, nausea, and skin rash.",
    "Patients should report any severe or unexpected side effects to their healthcare provider."
]

scenario_3 = TestScenario(question=question, gt_answer=gt_answer, retrieval_context=retrieval_context)


### 4. **Efficacy of Keytruda in NSCLC Patients**

* **Question**: "How effective is Keytruda in treating non-small cell lung cancer (NSCLC)?"
* **Ground Truth**: "Keytruda has shown to improve survival rates significantly in non-small cell lung cancer patients with PD-L1 expression."
* **Context**: Studies on the effectiveness of Keytruda in NSCLC treatment.

In [33]:
question = "How effective is Keytruda in treating non-small cell lung cancer (NSCLC)?"
gt_answer = "Keytruda has shown to improve survival rates significantly in non-small cell lung cancer patients with PD-L1 expression."

retrieval_context = [
    "Keytruda has shown to improve survival rates significantly in non-small cell lung cancer patients with PD-L1 expression.",
    "Some patients may see effects as early as 2 to 3 months into the treatment."
]

scenario_4 = TestScenario(question=question, gt_answer=gt_answer, retrieval_context=retrieval_context)


### 5. **Response Rate in Melanoma Patients Treated with Keytruda**

* **Question**: "What is the response rate of melanoma patients treated with Keytruda?"
* **Ground Truth**: "Approximately 40% of melanoma patients respond positively to Keytruda treatment."
* **Context**: Documents on Keytruda's efficacy in melanoma treatment.

In [34]:
question = "What is the response rate of melanoma patients treated with Keytruda?"
gt_answer = "Approximately 40% of melanoma patients respond positively to Keytruda treatment."

retrieval_context = [
    "Approximately 40% of melanoma patients respond positively to Keytruda treatment.",
    "Response rates may vary based on individual patient factors."
]

scenario_5 = TestScenario(question=question, gt_answer=gt_answer, retrieval_context=retrieval_context)


### Run Test

In [35]:
model_name_gpt = "gpt-4o-mini"

In [36]:
test_scenarios = [scenario_1, scenario_2, scenario_3, scenario_4, scenario_5]

In [37]:
model_evaluator= MetricsEvaluator(model_name=model_name_gpt)

In [38]:
test_runner = TestRunner(model_evaluator)

In [41]:
import time

### Testing GPT

In [42]:
print("Testing processor GPT-4o-mini" )
time_acc = 0
for number, scenario in enumerate(test_scenarios):
    start = time.time()
    counter = number+1
    print("Scenario "+ str(counter))
    print("Question:"+ scenario.question)
    print("Ground Thruth:"+scenario.gt_answer)


    test_case = scenario.run(processor=processor_gpt)
    test_runner.run_test(test_case)
    print("------------------------------------")
    print("\n")
    end = time.time()
    time_acc+=end-start

print("Elapsed time:",time_acc* 10**3, "ms")

Testing processor GPT-4o-mini
Scenario 1
Question:Are there any known interactions between Keytruda and other medications?
Ground Thruth:Yes, Keytruda can interact with steroids and certain immunosuppressants, potentially affecting its efficacy and safety.


Output()

Output()

Output()

Correctness Score: 0.9977022633008398
Faithfulness Score: 1.0
Contextual Relevancy Score: 0.5
------------------------------------


Scenario 2
Question:What is the five-year survival rate for patients using Keytruda for lung cancer?
Ground Thruth:No detailed information available on the given topic.


Output()

Output()

Output()

Correctness Score: 0.07315557895781337
Faithfulness Score: 1.0
Contextual Relevancy Score: 0.5
------------------------------------


Scenario 3
Question:What are the common side effects of Keytruda?
Ground Thruth:Common side effects include fatigue, nausea, and skin rash.


Output()

Output()

Output()

Correctness Score: 0.9939913347566941
Faithfulness Score: 1.0
Contextual Relevancy Score: 0.5
------------------------------------


Scenario 4
Question:How effective is Keytruda in treating non-small cell lung cancer (NSCLC)?
Ground Thruth:Keytruda has shown to improve survival rates significantly in non-small cell lung cancer patients with PD-L1 expression.


Output()

Output()

Output()

Correctness Score: 0.9324822668259791
Faithfulness Score: 1.0
Contextual Relevancy Score: 0.5
------------------------------------


Scenario 5
Question:What is the response rate of melanoma patients treated with Keytruda?
Ground Thruth:Approximately 40% of melanoma patients respond positively to Keytruda treatment.


Output()

Output()

Output()

Correctness Score: 1.0
Faithfulness Score: 1.0
Contextual Relevancy Score: 0.5
------------------------------------


Elapsed time: 97798.81167411804 ms


### Testing Gemini

In [43]:
print("Testing processor Gemini" )
time_acc = 0
for number, scenario in enumerate(test_scenarios):
    start = time.time()
    counter = number+1
    print("Scenario "+ str(counter))
    print("Question:"+ scenario.question)
    print("Ground Thruth:"+scenario.gt_answer)


    test_case = scenario.run(processor=processor_gemini)
    test_runner.run_test(test_case)
    print("------------------------------------")
    print("\n")
    time.sleep(60)
    end = time.time()
    time_acc+=end-start

print("Elapsed time:",time_acc* 10**3, "ms")    

Testing processor Gemini
Scenario 1
Question:Are there any known interactions between Keytruda and other medications?
Ground Thruth:Yes, Keytruda can interact with steroids and certain immunosuppressants, potentially affecting its efficacy and safety.


Output()

Output()

Output()

Correctness Score: 0.0
Faithfulness Score: 1
Contextual Relevancy Score: 1.0
------------------------------------


Scenario 2
Question:What is the five-year survival rate for patients using Keytruda for lung cancer?
Ground Thruth:No detailed information available on the given topic.


Output()

Output()

Output()

Correctness Score: 0.6802709729332108
Faithfulness Score: 1.0
Contextual Relevancy Score: 0.5
------------------------------------


Scenario 3
Question:What are the common side effects of Keytruda?
Ground Thruth:Common side effects include fatigue, nausea, and skin rash.


Output()

Output()

Output()

Correctness Score: 0.974648446131859
Faithfulness Score: 1.0
Contextual Relevancy Score: 0.5
------------------------------------


Scenario 4
Question:How effective is Keytruda in treating non-small cell lung cancer (NSCLC)?
Ground Thruth:Keytruda has shown to improve survival rates significantly in non-small cell lung cancer patients with PD-L1 expression.


Output()

Output()

Output()

Correctness Score: 1.0
Faithfulness Score: 1.0
Contextual Relevancy Score: 0.5
------------------------------------


Scenario 5
Question:What is the response rate of melanoma patients treated with Keytruda?
Ground Thruth:Approximately 40% of melanoma patients respond positively to Keytruda treatment.


Output()

Output()

Output()

Correctness Score: 1.0
Faithfulness Score: 1.0
Contextual Relevancy Score: 0.5
------------------------------------


Elapsed time: 363425.68349838257 ms


### Testing Claude

In [44]:
print("Testing processor Claude" )
time_acc = 0
for number, scenario in enumerate(test_scenarios):
    start = time.time()
    counter = number+1
    print("Scenario "+ str(counter))
    print("Question:"+ scenario.question)
    print("Ground Thruth:"+scenario.gt_answer)


    test_case = scenario.run(processor=processor_claude)
    test_runner.run_test(test_case)
    print("------------------------------------")
    print("\n")
    end = time.time()
    time_acc+=end-start

print("Elapsed time:",time_acc* 10**3, "ms")    

Testing processor Claude
Scenario 1
Question:Are there any known interactions between Keytruda and other medications?
Ground Thruth:Yes, Keytruda can interact with steroids and certain immunosuppressants, potentially affecting its efficacy and safety.


Output()

Output()

Output()

Correctness Score: 0.9982013783961936
Faithfulness Score: 1.0
Contextual Relevancy Score: 0.5
------------------------------------


Scenario 2
Question:What is the five-year survival rate for patients using Keytruda for lung cancer?
Ground Thruth:No detailed information available on the given topic.


Output()

Output()

Output()

Correctness Score: 0.7512615082693693
Faithfulness Score: 1.0
Contextual Relevancy Score: 0.5
------------------------------------


Scenario 3
Question:What are the common side effects of Keytruda?
Ground Thruth:Common side effects include fatigue, nausea, and skin rash.


Output()

Output()

Output()

Correctness Score: 0.997966764570123
Faithfulness Score: 1.0
Contextual Relevancy Score: 0.5
------------------------------------


Scenario 4
Question:How effective is Keytruda in treating non-small cell lung cancer (NSCLC)?
Ground Thruth:Keytruda has shown to improve survival rates significantly in non-small cell lung cancer patients with PD-L1 expression.


Output()

Output()

Output()

Correctness Score: 1.0
Faithfulness Score: 1.0
Contextual Relevancy Score: 0.5
------------------------------------


Scenario 5
Question:What is the response rate of melanoma patients treated with Keytruda?
Ground Thruth:Approximately 40% of melanoma patients respond positively to Keytruda treatment.


Output()

Output()

Output()

Correctness Score: 1.0000000000000002
Faithfulness Score: 1.0
Contextual Relevancy Score: 0.5
------------------------------------


Elapsed time: 125881.7195892334 ms


# Evaluation Results

## GPT-4o-mini

| Scenario | Question                                                         | Correctness Score | Faithfulness Score | Contextual Relevancy Score | Response Time (ms) |
|----------|-------------------------------------------------------------------|-------------------|--------------------|----------------------------|--------------------|
| 1        | Are there any known interactions between Keytruda and other medications? | 0.9977            | 1.0                | 0.5                        | 97798.81           |
| 2        | What is the five-year survival rate for patients using Keytruda for lung cancer? | 0.0732            | 1.0                | 0.5                        | 97798.81           |
| 3        | What are the common side effects of Keytruda?                     | 0.9939            | 1.0                | 0.5                        | 97798.81           |
| 4        | How effective is Keytruda in treating non-small cell lung cancer (NSCLC)? | 0.9325            | 1.0                | 0.5                        | 97798.81           |
| 5        | What is the response rate of melanoma patients treated with Keytruda? | 1.0               | 1.0                | 0.5                        | 97798.81           |

---

## Gemini

| Scenario | Question                                                         | Correctness Score | Faithfulness Score | Contextual Relevancy Score | Response Time (ms) |
|----------|-------------------------------------------------------------------|-------------------|--------------------|----------------------------|--------------------|
| 1        | Are there any known interactions between Keytruda and other medications? | 0.0               | 1.0                | 1.0                        | 363425.68          |
| 2        | What is the five-year survival rate for patients using Keytruda for lung cancer? | 0.6803            | 1.0                | 0.5                        | 363425.68          |
| 3        | What are the common side effects of Keytruda?                     | 0.9746            | 1.0                | 0.5                        | 363425.68          |
| 4        | How effective is Keytruda in treating non-small cell lung cancer (NSCLC)? | 1.0               | 1.0                | 0.5                        | 363425.68          |
| 5        | What is the response rate of melanoma patients treated with Keytruda? | 1.0               | 1.0                | 0.5                        | 363425.68          |

---

## Claude

| Scenario | Question                                                         | Correctness Score | Faithfulness Score | Contextual Relevancy Score | Response Time (ms) |
|----------|-------------------------------------------------------------------|-------------------|--------------------|----------------------------|--------------------|
| 1        | Are there any known interactions between Keytruda and other medications? | 0.9982            | 1.0                | 0.5                        | 125881.72          |
| 2        | What is the five-year survival rate for patients using Keytruda for lung cancer? | 0.7513            | 1.0                | 0.5                        | 125881.72          |
| 3        | What are the common side effects of Keytruda?                     | 0.9980            | 1.0                | 0.5                        | 125881.72          |
| 4        | How effective is Keytruda in treating non-small cell lung cancer (NSCLC)? | 1.0               | 1.0                | 0.5                        | 125881.72          |
| 5        | What is the response rate of melanoma patients treated with Keytruda? | 1.0               | 1.0                | 0.5                        | 125881.72          |



## Evaluation Results:

1. **Correctness**:

    - **GPT-4o-mini**: Generally strong performance, except for Scenario 2, where the score dropped significantly. This suggests that GPT-4o-mini struggles with handling queries where little to no information is available (like the five-year survival rate).
    - **Gemini**: Performed poorly on Scenario 1 (drug interactions), achieving a 0.0 correctness score. However, it performed better on other scenarios, particularly Scenario 4.
    - **Claude**: Achieved high correctness scores across all scenarios, outperforming GPT-4o-mini and Gemini in scenarios with missing or limited data (Scenario 2).

2. **Faithfulness**:

    - All models achieved perfect scores (1.0) across all scenarios, indicating that each model is faithful to the retrieved documents, with no hallucination or invented information.

3. **Contextual Relevancy**:

    - All models exhibited consistently low Contextual Relevancy scores (0.5) across most scenarios. The exception was Gemini, which scored 1.0 in Scenario 1, demonstrating better context alignment in that specific scenario.

4. **Response Time**:

    - GPT-4o-mini was the fastest model, followed by Claude, with Gemini taking significantly longer.
        - **GPT-4o-mini**: ~98 seconds per query.
        - **Claude**: ~126 seconds per query.
        - **Gemini**: ~363 seconds per query.

5. **Cost**:

    Considering the cost for each model:

    - GPT-4o-mini is the most cost-effective at $0.15 per 1M input tokens and $0.6 per 1M output tokens.
    - Claude is the most expensive option, with a base cost of $15 per 1M output tokens.
    - Gemini is a middle-ground option with $0.30 per 1M output tokens.

## Optimization Oportunities

1. **Foundation Model Selection**:
    - **Problem**: Gemini scored **0.0** in Scenario 1 (drug interactions), while Claude and GPT-4o-mini achieved near-perfect correctness scores. This suggests that Gemini may not be as reliable for highly specific medical queries.
    - **Optimization**: The **Claude** model can be prioritized for handling critical, high-stakes queries where correctness is crucial, while **GPT-4o-mini** can be used in cost-sensitive situations where speed is more important. Testing Gemini on less critical tasks could be an alternative to minimize costs.

2. **Improve Contextual Relevancy**:
    - **Problem**: Across most models, the **Contextual Relevancy Score** remains low at **0.5**. Even though the models performed well on correctness and faithfulness, they sometimes struggled to focus on the most relevant content from the retrieved documents.
    - **Optimization**: Adjust the **retriever weighting scheme** in the RAG pipeline, giving more emphasis to content with higher relevance scores. Additionally, adjusting the model prompts to emphasize relevance could guide the model to generate answers more closely aligned with the specific context.

3. **Handling Missing or Incomplete Data**:
    - **Problem**: In Scenario 2 (five-year survival rate), both GPT-4o-mini and Gemini exhibited low correctness scores, showing that they struggled to handle queries for which there was limited or no detailed information available.
    - **Optimization**: Implement a fallback mechanism that explicitly informs the user when no relevant data is available, rather than producing a potentially incorrect answer. This could be enhanced by creating a prompt template that explicitly asks the model to return "No detailed information is available" when applicable.


4. Collaborating with Domain Experts

    Given that the dataset is limited, the design of better test cases in collaboration with **domain experts** and **product researchers** is crucial. This would help ensure that test cases reflect the true challenges the system might face in production environments.

    By involving medical experts and stakeholders:
    - We can identify edge cases that might be missed in a limited dataset.
    - The test cases can be expanded to cover more nuanced questions, such as drug interactions with rare medications, or the specific conditions under which treatments like Keytruda are effective.

    Combining these insights with the **Optimization Opportunities** allows us to not only improve the system based on technical metrics (like Correctness and Relevancy) but also ensure that the model outputs are practically useful for end-users in real-world healthcare contexts.

# Section 6: (Not just) Expanding the RAG System to the Internet

In this implementation, web search is not merely an expansion of the Retrieval-Augmented Generation (RAG) system but a fundamental feature essential for the effective application of the Corrective Retrieval Augmented Generation (CRAG) method. Integrating web search into our system is crucial for enhancing robustness, reliability, and ensuring that users receive accurate and comprehensive answers.

- Expansion to Internet Retrieval:

    - Integration of Web Search:
        - Tool Used: Integrated the DuckDuckGoSearchResults tool to perform web searches.

    - The internal dataset may not cover all possible queries. Accessing web resources allows the system to provide more comprehensive answers.

- Techniques to Ensure Quality and Reliability:

    - Knowledge Refiner:

        - Extracts key information from web search results, focusing on relevant and reliable content.

        - Ensures that only pertinent information is considered, reducing noise from irrelevant web content.

    - Evaluator:

        - Assesses the relevance of web-sourced documents using the LLM.

        - Filters out low-quality or irrelevant results, maintaining the accuracy of the system.

    - Source Attribution:

        - Includes sources with links in the final answer.

        - Allows users to verify information and enhances the credibility of the responses.

Integrating web search into the RAG system is a fundamental aspect of effectively applying the CRAG method. Web search is not merely an optional expansion but a critical component that enhances the system's ability to provide accurate, reliable, and comprehensive answers. It allows the system to dynamically correct and augment its internal knowledge, ensuring that users receive the most relevant and up-to-date information, especially when the internal dataset is insufficient. This integration is essential for maintaining the robustness and reliability of the RAG system in real-world applications.



In [28]:
web_searcher.search("What are the Keytruda's side effects?")

  web_searcher.search("What are the Keytruda's side effects?")




In [29]:
web_searcher.search_and_refine("What are the Keytruda's side effects?")

(['- Keytruda is a biologic drug that belongs to the PD-1 inhibitor class.',
  '- Side effects can vary based on whether Keytruda is used alone or with other cancer drugs.',
  '- Commonly reported side effects include:',
  '- Diarrhea',
  '- Fatigue',
  '- Loss of appetite',
  '- Pain (stomach, muscles, bones, joints)',
  '- Rash or itching',
  '- Fever',
  '- Cough',
  '- Serious side effects to watch for include:',
  '- Black, tarry stools',
  '- Bladder pain',
  '- Bloating or swelling of face, arms, hands, lower legs, or feet',
  '- Bloody or cloudy urine',
  '- Blurred vision',
  '- Body aches or pain',
  '- It is important to consult a doctor if serious side effects occur.'],
 [('Keytruda side effects: What they are and how to manage them',
   'https://www.medicalnewstoday.com/articles/drugs-keytruda-side-effects'),
  ('Keytruda Side Effects: Common to Serious, Explained - Healthline',
   'https://www.healthline.com/health/drugs/keytruda-side-effects'),
  ('Keytruda Side Effects: