In [None]:
from weave_example_demo.llm_types.prompts import PromptTemplate
from weave_example_demo.llm_types.models.generic_model import GenericLLMModel
import numpy as np


In [None]:
import weave

In [None]:
# Initialise Weave for the 'visa-bioasq-rag' project.
# NOTE(review): the weave.ref(...) lookups further down presumably resolve against this project — confirm.
weave.init('visa-bioasq-rag')

In [None]:
# Example question used by the step-by-step cells below.
question = "Is Hirschsprung disease a mendelian or a multifactorial disorder?"

In [None]:
# System prompt for the query-rewriting step: instructs the LLM to turn a
# biomedical question into a search query and to return only that query.
question_2_query_system_prompt = """
### Instruction ###
You are an expert biomedical researcher tasked with converting biomedical questions into optimized semantic search queries. Your goal is to generate queries that will retrieve the most relevant documents from the BioASQ dataset to answer the given question.

### Process ###
Follow these steps to create the semantic search query:
1. Carefully analyze the biomedical question to identify the most important keywords, concepts, and entities
2. Construct a search query using those keywords, aiming to retrieve all potentially relevant documents
3. Optimize the query by incorporating synonyms, related terms, and expanding acronyms if applicable
4. Double check that the query captures the core intent of the question and will match pertinent documents
5. Provide only the final semantic search query in your response, without any additional commentary

### Context ###
The BioASQ dataset consists of biomedical questions along with relevant documents. Your semantic search queries will be used to find the most relevant documents from this dataset to answer each question. The ideal answers have been removed, so your query should focus solely on the question text.

### Examples ###
Question: Is Hirschsprung disease a mendelian or a multifactorial disorder?
Semantic Search Query: Hirschsprung disease AND (mendelian OR multifactorial OR complex) AND (inheritance OR genetics OR genes)

Question: List signaling molecules (ligands) that interact with the receptor EGFR?  
Semantic Search Query: EGFR AND (ligands OR "signaling molecules") AND (EGF OR BTC OR EPR OR HB-EGF OR TGF-α OR AREG OR EPG)

Question: Is the protein Papilin secreted?
Semantic Search Query: Papilin AND (secreted OR extracellular OR "secretory pathway")

### Evaluation ###
Your performance will be evaluated on:  
- Inclusion of the most salient keywords, concepts and entities from the biomedical question
- Appropriate use of synonyms and related terms to improve retrieval
- Ability of the query to capture the full scope and intent of the question
- Overall likelihood of the query retrieving documents that can answer the question
- Adherence to the response format instructions

You MUST provide a well-constructed query that fulfills the given criteria. You will be penalized for queries that are too narrow, off-topic, or poorly formulated.
"""

In [None]:
# Human-turn template for the query-rewriting step; `{question}` is the only
# placeholder and is supplied via `human_prompt_args` at predict time.
question_2_query_human_prompt = """
### Human Prompt ###
Biomedical Question: "{question}"

Semantic Search Query:
"""



In [None]:
# Model that rewrites a biomedical question into a search query, built from
# the two query-rewriting prompts defined above.
question_2_query_model = GenericLLMModel(
    human_prompt=question_2_query_human_prompt,
    system_prompt=question_2_query_system_prompt,
)



In [None]:
# Rewrite the example question into a search query; the model's response is a
# mapping and only its "answer" entry is kept.
transformed_query = question_2_query_model.predict(
    human_prompt_args={"question": question},
)["answer"]



In [None]:
# Inspect the rewritten search query.
transformed_query

In [None]:
# Fetch the latest published vector store and embedding model objects from
# Weave, in that order.
vector_store, embedding_model = (
    weave.ref(ref_name).get()
    for ref_name in ("VectorStore:latest", "SentenceTransformersModel:latest")
)

In [None]:
# Hand the fetched embedding model to the vector store.
# NOTE(review): presumably used to embed queries at search time — confirm in VectorStore.
vector_store.set_embedding_model(embedding_model)

In [None]:
# Rebuild the store's embeddings matrix as a NumPy array from the "embedding"
# entry of every stored article embedding.
# NOTE(review): presumably needed because the matrix is not restored when the
# store is loaded from a Weave ref — confirm.
vector_store.embeddings_matrix = np.array(
    [entry["embedding"] for entry in vector_store.article_embeddings]
)

In [None]:
# Retrieve the 5 documents the vector store ranks as most relevant to the
# rewritten query.
_context = vector_store.get_most_relevant_documents(
    n=5,
    query=transformed_query,
)

In [None]:
# System prompt for the relevance-filtering step: asks the LLM for a strict
# yes/no verdict on whether an abstract may be relevant to the question.
article_relevance_system_prompt = """
### Instruction ###
You are an expert medical researcher librarian. Your task is to determine whether articles from the BioASQ dataset may be relevant to questions from clinicians based on the articles' abstracts. You MUST provide a yes or no answer. You will be penalized for answers that are not a clear yes or no.

### Process ###
1. Carefully read the provided clinical question. 
2. Analyze the given article abstract in the context of the question.
3. Determine if the abstract contains information potentially relevant to answering the question. 
4. Provide a definitive yes or no answer. Do not hedge or equivocate.

### Evaluation ###
Your performance will be evaluated on:
- Ability to identify abstracts with information relevant to the clinical question
- Providing a clear, unambiguous yes or no answer 
- Avoiding reliance on stereotypes or biases in your determination
- Adherence to the required answer format

You MUST provide a yes or no answer. Any other response will be penalized.
"""

In [None]:
# Human-turn template for the relevance-filtering step; placeholders are
# `{question}` and `{article_text}`.
article_relevance_human_prompt = """
### Question ###
Clinical question: "{question}"

### Abstract ###
{article_text}

### Answer ###
"""

In [None]:
# Model that issues the yes/no relevance verdict for a single abstract, built
# from the two relevance prompts defined above.
article_relevance_model = GenericLLMModel(
    human_prompt=article_relevance_human_prompt,
    system_prompt=article_relevance_system_prompt,
)

In [None]:
# Ask the relevance model about every retrieved passage and store its answer
# on the document itself under "relevance" (the documents are mutated in place).
for doc in _context:
    doc["relevance"] = article_relevance_model.predict(
        human_prompt_args={
            "question": question,
            "article_text": doc["document"]["passage"],
        }
    )["answer"]

In [None]:
# Keep only the documents the relevance model accepted. The verdict is free
# text from an LLM, so it is normalised first: surrounding whitespace is
# stripped and anything that starts with "yes" (e.g. "Yes." or "Yes\n") counts
# as a positive answer instead of requiring an exact "yes".
relevant_context = [
    doc
    for doc in _context
    if doc["relevance"].strip().lower().startswith("yes")
]


In [None]:
# Number of retrieved documents that passed the relevance filter.
len(relevant_context)

In [None]:
# Show the first relevant document, or None when the relevance filter rejected
# everything (indexing an empty list would otherwise abort a Run All).
relevant_context[0] if relevant_context else None

In [None]:
#TODO: Add reranking using BM25

In [None]:
# System prompt for the summarization step: asks the LLM to summarise the
# scored excerpts with respect to the question, in a fixed response format.
summarization_system_prompt = """
### Instruction ###
You are an expert medical researcher tasked with summarizing relevant excerpts from biomedical literature to provide background information necessary to answer clinicians' questions. Your summary should be concise yet informative, capturing the key points from the provided context.

### Process ###
1. Carefully read the provided clinical question to understand the information needed.
2. Analyze the given context, which includes excerpts from biomedical literature along with relevance scores.
3. Identify the most pertinent information from the context in relation to the question.
4. Summarize the key points from the relevant excerpts, considering their relevance scores.
5. Synthesize the individual summaries into a coherent overview addressing the question.
6. If the context is not sufficient to answer the question, indicate that more information is needed.

### Format ###
Question: <question>
Summary: <summary_of_relevant_information>
Relevant Excerpts: <excerpts_in_order_of_relevance>

### Evaluation ###
Your performance will be evaluated on:
- Ability to identify and summarize relevant information from the provided context
- Synthesis of individual excerpt summaries into a coherent overview
- Consideration of excerpt relevance scores in the final summary
- Clarity and conciseness of the summary
- Adherence to the specified response format

You MUST provide a summary that directly addresses the given question using the most relevant excerpts from the context. If the provided context is insufficient to answer the question, state "Insufficient information to answer the question."
"""

In [None]:
# Human-turn template for the summarization step; placeholders are
# `{question}` and `{context_str}`.
summarization_human_prompt = """
### Question ###
{question}

### Context ###
{context_str}

### Summary ###
"""

In [None]:
# Model that summarises the relevant excerpts for a question, built from the
# two summarization prompts defined above.
summarization_model = GenericLLMModel(
    human_prompt=summarization_human_prompt,
    system_prompt=summarization_system_prompt,
)



In [None]:
# Render the relevant documents as one text block: each passage followed by
# its retrieval score, with a blank line between documents.
context_str = "\n\n".join(
    f"{doc['document']['passage']} (Score: {doc['score']})"
    for doc in relevant_context
)

In [None]:
# Inspect the context text that will be sent to the summarization model.
context_str

In [None]:
# Summarise the relevant context for the example question; only the "answer"
# entry of the model's response is kept.
summary = summarization_model.predict(
    human_prompt_args={
        "question": question,
        "context_str": context_str,
    }
)["answer"]


In [None]:
# Inspect the generated summary.
summary

In [None]:
# System prompt for the final answer-synthesis step: asks the LLM to answer
# the question using only the provided summary, in a fixed response format.
synthesis_system_prompt = """
### Instruction ###
You are an expert medical assistant. Your task is to provide accurate, concise answers to medical questions based on summaries of relevant biomedical literature. You MUST ensure responses are clear, informative, unbiased, and avoid stereotypes. Answer in a natural, human-like manner. You will be penalized for answers that are unclear, inaccurate, biased, or overly verbose.

### Process ###
1. Carefully analyze the provided question to understand the key information needed. 
2. Review the summary of relevant excerpts from biomedical literature.
3. Identify the most pertinent information in the summary for answering the question.
4. Synthesize the key points into a coherent, concise answer.
5. If the summary lacks sufficient information to conclusively answer the question, state "There is insufficient information provided to conclusively answer the question."

### Format ###
Question: <question>
Answer: <final_answer_based_on_summary>

### Example ###
Question: Is Hirschsprung disease a mendelian or a multifactorial disorder?

Summary: Hirschsprung disease, particularly in the context of Mowat-Wilson syndrome (MWS) associated with ZFHX1B mutations or deletions, shows variations in enteric neural plexus abnormalities. The pathologies in MWS are attributed to variations in ZFHX1B abnormalities and epigenetic factors.

Relevant Excerpts: 
- Patients with ZFHX1B mutations or deletions develop multiple congenital anomalies including Hirschsprung disease, known as Mowat-Wilson syndrome (MWS). (Score: 0.6024968654169915)

Answer: Based on the summary, Hirschsprung disease in Mowat-Wilson syndrome appears to have both genetic and multifactorial components. Variations in ZFHX1B abnormalities suggest a genetic basis, while the role of epigenetic factors points to a multifactorial etiology. However, the provided information is limited in conclusively determining if Hirschsprung disease more broadly is purely Mendelian or multifactorial.

### Evaluation ###
Your performance will be evaluated on:
- Accuracy and relevance of the answer based on the provided summary
- Clarity and conciseness of the response 
- Ability to identify when the summary is insufficient to conclusively answer the question
- Avoidance of bias and stereotyping
- Adherence to the specified format

You MUST provide an answer that directly addresses the question using only the information in the summary. If the summary is insufficient, state that conclusively answering is not possible. Produce the answer in a clear, natural style.
"""

In [None]:
# Human-turn template for the answer-synthesis step; placeholders are
# `{question}` and `{summary}`.
synthesis_human_prompt = """
### Question ###
{question}

### Summary ###
{summary}

### Answer ###
"""

In [None]:
# Model that writes the final answer from a question and its summary, built
# from the two synthesis prompts defined above.
synthesis_model = GenericLLMModel(
    human_prompt=synthesis_human_prompt,
    system_prompt=synthesis_system_prompt,
)

In [None]:
# Produce and display the final answer for the example question from the
# summary generated above.
synthesis_model.predict(
    human_prompt_args={
        "question": question,
        "summary": summary,
    }
)["answer"]

In [None]:
from weave_example_demo.llm_types.rag.rag import RAGModel

In [None]:
class BioASQAdvancedRAGModel(RAGModel):
    """Multi-step RAG pipeline for BioASQ questions.

    ``predict`` chains the notebook-level models: ``question_2_query_model``
    (query rewriting), ``article_relevance_model`` (yes/no relevance filter),
    ``summarization_model`` and ``synthesis_model``. Those models are looked up
    as globals, so they must be defined before the model is used.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @weave.op()
    def score_context(self, _context, question: str = None) -> None:
        """Label every retrieved document with the relevance model's verdict.

        The documents are mutated in place: each gets a ``"relevance"`` key
        holding the ``"answer"`` returned by ``article_relevance_model``.

        Args:
            _context: Iterable of retrieved documents; each must expose
                ``doc["document"]["passage"]``.
            question: The question the passages are judged against. When
                omitted, the notebook-level ``question`` variable is used,
                which preserves the behaviour of the original one-argument
                call.
        """
        if question is None:
            # Backward compatibility with callers that pass only the context.
            question = globals()["question"]
        for doc in _context:
            doc["relevance"] = article_relevance_model.predict(
                human_prompt_args={
                    "question": question,
                    "article_text": doc["document"]["passage"],
                }
            )["answer"]

    @weave.op()
    def predict(self, question: str, n_documents: int = 5) -> dict:
        """Answer ``question`` using retrieval, relevance filtering and summarization.

        Args:
            question: The biomedical question to answer.
            n_documents: How many documents to request from the vector store.

        Returns:
            A dict with ``"answer"`` (the synthesized answer), ``"context"``
            (passages of the documents that were used) and ``"all_context"``
            (every retrieved document, including its relevance verdict).
        """
        # NOTE(review): presumably re-applies store setup defined in RAGModel — confirm.
        self.set_vector_store(self.vector_store)
        transformed_query = question_2_query_model.predict(
            human_prompt_args={"question": question}
        )["answer"]
        _context = self.vector_store.get_most_relevant_documents(
            query=transformed_query, n=n_documents
        )
        # Score against the question being answered, not the notebook global;
        # otherwise every evaluation row is filtered for the wrong question.
        self.score_context(_context, question=question)
        # The verdict is free text, so tolerate whitespace and trailing
        # punctuation ("Yes.", "yes\n") rather than requiring an exact "yes".
        relevant_context = [
            doc
            for doc in _context
            if doc["relevance"].strip().lower().startswith("yes")
        ]
        # If no relevant context, use the most relevant document
        # this is probably not the best but good for demonstrative
        # purposes. When retrieval returned nothing at all, continue with an
        # empty context instead of raising IndexError.
        if len(relevant_context) == 0:
            relevant_context = [_context[0]] if len(_context) > 0 else []
        context_str = "\n\n".join(
            [f"{doc['document']['passage']} (Score: {doc['score']})" for doc in relevant_context]
        )
        summary = summarization_model.predict(
            human_prompt_args={"question": question, "context_str": context_str}
        )["answer"]
        answer = synthesis_model.predict(
            human_prompt_args={"question": question, "summary": summary}
        )["answer"]
        return {
            "answer": answer,
            "context": [doc["document"]["passage"] for doc in relevant_context],
            "all_context": _context,
        }

In [None]:
# Instantiate the pipeline on top of the vector store loaded earlier.
rag_model = BioASQAdvancedRAGModel(vector_store=vector_store)


In [None]:
# Smoke-test the full pipeline on the example question with 5 retrieved documents.
rag_model.predict(question=question, n_documents=5)



In [None]:
# Fetch the latest 'QuestionAnswerPairsTrainFiltered' object from Weave; its
# `.rows` are used as the evaluation dataset below.
qap = weave.ref('QuestionAnswerPairsTrainFiltered:latest').get()

In [None]:
from weave_example_demo.scorers.llm_guard_scorer import LLMGuardScorer
from weave_example_demo.scorers.tonic_validate_scorer import TonicValidateScorer

In [None]:
# Scorers applied to every evaluated example: three Tonic Validate metrics and
# three LLM Guard metrics, in this order.
scorers = [
    TonicValidateScorer(metrics=["AnswerSimilarityMetric", "AugmentationPrecisionMetric", "AnswerConsistencyMetric"]),
    LLMGuardScorer(metrics=["NoRefusal", "Relevance", "Sensitive"]),
]


In [None]:
# Evaluate on the first 10 rows only, to keep the run small.
sub_qap = qap.rows[:10]

In [None]:
# Inspect the evaluation subset.
sub_qap

In [41]:
# Build the evaluation over the subset with the scorers defined above and run
# it against the RAG model. `evaluate` is awaited directly, which relies on the
# notebook's top-level await support.
evaluation = weave.Evaluation(dataset=sub_qap, scorers=scorers)
await evaluation.evaluate(rag_model)