In [1]:
from weave_example_demo.llm_types.prompts import PromptTemplate
from weave_example_demo.llm_types.models.generic_model import GenericLLMModel


In [2]:
import weave

In [3]:
weave.init('bioasq-rag-data')

Logged in as Weights & Biases user: a-sh0ts.
View Weave data at https://wandb.ai/a-sh0ts/bioasq-rag-data/weave




In [4]:
question = "Is Hirschsprung disease a mendelian or a multifactorial disorder?"

In [5]:
question_2_query_system_prompt = """
### Instruction ###
You are an expert biomedical researcher tasked with converting biomedical questions into optimized semantic search queries. Your goal is to generate queries that will retrieve the most relevant documents from the BioASQ dataset to answer the given question.

### Process ###
Follow these steps to create the semantic search query:
1. Carefully analyze the biomedical question to identify the most important keywords, concepts, and entities
2. Construct a search query using those keywords, aiming to retrieve all potentially relevant documents
3. Optimize the query by incorporating synonyms, related terms, and expanding acronyms if applicable
4. Double check that the query captures the core intent of the question and will match pertinent documents
5. Provide only the final semantic search query in your response, without any additional commentary

### Context ###
The BioASQ dataset consists of biomedical questions along with relevant documents. Your semantic search queries will be used to find the most relevant documents from this dataset to answer each question. The ideal answers have been removed, so your query should focus solely on the question text.

### Examples ###
Question: Is Hirschsprung disease a mendelian or a multifactorial disorder?
Semantic Search Query: Hirschsprung disease AND (mendelian OR multifactorial OR complex) AND (inheritance OR genetics OR genes)

Question: List signaling molecules (ligands) that interact with the receptor EGFR?  
Semantic Search Query: EGFR AND (ligands OR "signaling molecules") AND (EGF OR BTC OR EPR OR HB-EGF OR TGF-α OR AREG OR EPG)

Question: Is the protein Papilin secreted?
Semantic Search Query: Papilin AND (secreted OR extracellular OR "secretory pathway")

### Evaluation ###
Your performance will be evaluated on:  
- Inclusion of the most salient keywords, concepts and entities from the biomedical question
- Appropriate use of synonyms and related terms to improve retrieval
- Ability of the query to capture the full scope and intent of the question
- Overall likelihood of the query retrieving documents that can answer the question
- Adherence to the response format instructions

You MUST provide a well-constructed query that fulfills the given criteria. You will be penalized for queries that are too narrow, off-topic, or poorly formulated.
"""

In [6]:
question_2_query_human_prompt = """
### Human Prompt ###
Biomedical Question: "{question}"

Semantic Search Query:
"""



In [7]:
question_2_query_model = GenericLLMModel(
    system_prompt=question_2_query_system_prompt,
    human_prompt=question_2_query_human_prompt
)



In [8]:
transformed_query = question_2_query_model.predict(human_prompt_args={"question": question})['answer']



🍩 https://wandb.ai/a-sh0ts/bioasq-rag-data/r/call/07bc35c5-33fe-421e-a5e9-dd4247c33cde


In [9]:
transformed_query

'Hirschsprung disease AND (mendelian OR multifactorial OR complex) AND (genetic OR inheritance)'

In [10]:
vector_store = weave.ref('VectorStore:latest').get()
embedding_model = weave.ref('SentenceTransformersModel:latest').get()

In [11]:
vector_store.set_embedding_model(embedding_model)

  from tqdm.autonotebook import tqdm, trange


🍩 https://wandb.ai/a-sh0ts/bioasq-rag-data/r/call/9f6b45a1-1811-478c-a841-2d6bfafbab10


In [12]:
_context = vector_store.get_most_relevant_documents(query=transformed_query, n=5)



🍩 https://wandb.ai/a-sh0ts/bioasq-rag-data/r/call/0a1ac835-d6a7-41e5-ba1d-ef598ab5f332


In [13]:
article_relevance_system_prompt = """
### Instruction ###
You are an expert medical researcher librarian. Your task is to determine whether articles from the BioASQ dataset may be relevant to questions from clinicians based on the articles' abstracts. You MUST provide a yes or no answer. You will be penalized for answers that are not a clear yes or no.

### Process ###
1. Carefully read the provided clinical question. 
2. Analyze the given article abstract in the context of the question.
3. Determine if the abstract contains information potentially relevant to answering the question. 
4. Provide a definitive yes or no answer. Do not hedge or equivocate.

### Evaluation ###
Your performance will be evaluated on:
- Ability to identify abstracts with information relevant to the clinical question
- Providing a clear, unambiguous yes or no answer 
- Avoiding reliance on stereotypes or biases in your determination
- Adherence to the required answer format

You MUST provide a yes or no answer. Any other response will be penalized.
"""

In [14]:
article_relevance_human_prompt = """
### Question ###
Clinical question: "{question}"

### Abstract ###
{article_text}

### Answer ###
"""

In [15]:
article_relevance_model = GenericLLMModel(
    system_prompt=article_relevance_system_prompt,
    human_prompt=article_relevance_human_prompt
)

In [16]:
for doc in _context:
    doc["relevance"] = article_relevance_model.predict(human_prompt_args={"question": question, "article_text": doc["document"]["passage"]})['answer']

🍩 https://wandb.ai/a-sh0ts/bioasq-rag-data/r/call/50213149-8d37-4885-afa4-4fee8e0318ee
🍩 https://wandb.ai/a-sh0ts/bioasq-rag-data/r/call/1aaac9d5-26c9-415d-9564-ff8642ba7a3a
🍩 https://wandb.ai/a-sh0ts/bioasq-rag-data/r/call/1f5bd5b6-b23c-4656-8b4d-fef861c0a630
🍩 https://wandb.ai/a-sh0ts/bioasq-rag-data/r/call/dc8f97d1-b8ec-4939-91ee-ecb5d8e36ee9
🍩 https://wandb.ai/a-sh0ts/bioasq-rag-data/r/call/62e15eec-69d8-4a6c-af4b-ae8838634c3f


In [17]:
relevant_context = [doc for doc in _context if doc["relevance"].lower() == "yes"]


In [18]:
len(relevant_context)

1

In [19]:
relevant_context[0]

{'document': TraceDict({'passage': ObjectRef(entity='a-sh0ts', project='bioasq-rag-data', name='TextCorpusFiltered', digest='77OxkEhXzsBqaN73ElmxTdZW8iWMhJAOqaG9TqmGT9E', extra=['attr', 'rows', 'id', 'SrPvc43iYzmMjVUw0jHmKvAdpOMYZtJrDJG8KOYXNDY', 'key', 'passage']), 'id': ObjectRef(entity='a-sh0ts', project='bioasq-rag-data', name='TextCorpusFiltered', digest='77OxkEhXzsBqaN73ElmxTdZW8iWMhJAOqaG9TqmGT9E', extra=['attr', 'rows', 'id', 'SrPvc43iYzmMjVUw0jHmKvAdpOMYZtJrDJG8KOYXNDY', 'key', 'id'])}),
 'score': 0.6059975759468366,
 'relevance': 'Yes'}

In [20]:
#TODO: Add reranking using BM25

In [21]:
summarization_system_prompt = """
### Instruction ###
You are an expert medical researcher tasked with summarizing relevant excerpts from biomedical literature to provide background information necessary to answer clinicians' questions. Your summary should be concise yet informative, capturing the key points from the provided context.

### Process ###
1. Carefully read the provided clinical question to understand the information needed.
2. Analyze the given context, which includes excerpts from biomedical literature along with relevance scores.
3. Identify the most pertinent information from the context in relation to the question.
4. Summarize the key points from the relevant excerpts, considering their relevance scores.
5. Synthesize the individual summaries into a coherent overview addressing the question.
6. If the context is not sufficient to answer the question, indicate that more information is needed.

### Format ###
Question: <question>
Summary: <summary_of_relevant_information>
Relevant Excerpts: <excerpts_in_order_of_relevance>

### Evaluation ###
Your performance will be evaluated on:
- Ability to identify and summarize relevant information from the provided context
- Synthesis of individual excerpt summaries into a coherent overview
- Consideration of excerpt relevance scores in the final summary
- Clarity and conciseness of the summary
- Adherence to the specified response format

You MUST provide a summary that directly addresses the given question using the most relevant excerpts from the context. If the provided context is insufficient to answer the question, state "Insufficient information to answer the question."
"""

In [22]:
summarization_human_prompt = """
### Question ###
{question}

### Context ###
{context_str}

### Summary ###
"""

In [23]:
summarization_model = GenericLLMModel(
    system_prompt=summarization_system_prompt,
    human_prompt=summarization_human_prompt
)



In [24]:
context_str = "\n\n".join([f"{doc['document']['passage']} (Score: {doc['score']})" for doc in relevant_context])

In [25]:
context_str

"BACKGROUND/PURPOSE: Patients with zinc finger homeo box 1B (ZFHX1B) mutations or \ndeletions develop multiple congenital anomalies including Hirschsprung disease, \nknown as Mowat-Wilson syndrome (MWS). In this study, we investigated variations \nin the enteric neural plexus abnormalities in MWS using morphometry-based \nhistopathologic analysis.\nMETHODS: Seven patients with MWS (3 with mutations in exon 8 of ZFHX1B and 4 \nwith deletions) who had undergone modified Duhamel's operations for Hirschsprung \ndisease were examined. Surgically resected rectosigmoid specimens were analyzed \nmorphometrically.\nRESULTS: The length of the aganglionic segment was longer than 3 cm in all the \npatients with deletions. In 3 patients with mutations, the aganglionic region \nwas not detected in the surgically resected specimens; however, the parameters \nof the ganglions and plexus were significantly smaller than those of controls \n(cloaca and aproctia), indicative of a transitional zone. Variat

In [26]:
summary = summarization_model.predict(human_prompt_args={"question": question, "context_str": context_str})['answer']


🍩 https://wandb.ai/a-sh0ts/bioasq-rag-data/r/call/94dcb374-bb90-419a-abf6-b9763ca1bc52


In [27]:
summary

'Question: Is Hirschsprung disease a mendelian or a multifactorial disorder?\nSummary: Hirschsprung disease, particularly in the context of Mowat-Wilson syndrome (MWS) associated with ZFHX1B mutations or deletions, shows variations in enteric neural plexus abnormalities that contribute to the pathology. The study suggests that the pathologies in MWS are influenced by both ZFHX1B abnormalities and epigenetic factors.\nRelevant Excerpts:\n1. The variations in myenteric plexus pathologies in MWS appear to be caused by both variations in ZFHX1B abnormalities and epigenetic factors.'

In [28]:
synthesis_system_prompt = """
### Instruction ###
You are an expert medical assistant. Your task is to provide accurate, concise answers to medical questions based on summaries of relevant biomedical literature. You MUST ensure responses are clear, informative, unbiased, and avoid stereotypes. Answer in a natural, human-like manner. You will be penalized for answers that are unclear, inaccurate, biased, or overly verbose.

### Process ###
1. Carefully analyze the provided question to understand the key information needed. 
2. Review the summary of relevant excerpts from biomedical literature.
3. Identify the most pertinent information in the summary for answering the question.
4. Synthesize the key points into a coherent, concise answer.
5. If the summary lacks sufficient information to conclusively answer the question, state "There is insufficient information provided to conclusively answer the question."

### Format ###
Question: <question>
Answer: <final_answer_based_on_summary>

### Example ###
Question: Is Hirschsprung disease a mendelian or a multifactorial disorder?

Summary: Hirschsprung disease, particularly in the context of Mowat-Wilson syndrome (MWS) associated with ZFHX1B mutations or deletions, shows variations in enteric neural plexus abnormalities. The pathologies in MWS are attributed to variations in ZFHX1B abnormalities and epigenetic factors.

Relevant Excerpts: 
- Patients with ZFHX1B mutations or deletions develop multiple congenital anomalies including Hirschsprung disease, known as Mowat-Wilson syndrome (MWS). (Score: 0.6024968654169915)

Answer: Based on the summary, Hirschsprung disease in Mowat-Wilson syndrome appears to have both genetic and multifactorial components. Variations in ZFHX1B abnormalities suggest a genetic basis, while the role of epigenetic factors points to a multifactorial etiology. However, the provided information is limited in conclusively determining if Hirschsprung disease more broadly is purely Mendelian or multifactorial.

### Evaluation ###
Your performance will be evaluated on:
- Accuracy and relevance of the answer based on the provided summary
- Clarity and conciseness of the response 
- Ability to identify when the summary is insufficient to conclusively answer the question
- Avoidance of bias and stereotyping
- Adherence to the specified format

You MUST provide an answer that directly addresses the question using only the information in the summary. If the summary is insufficient, state that conclusively answering is not possible. Produce the answer in a clear, natural style.
"""

In [29]:
synthesis_human_prompt = """
### Question ###
{question}

### Summary ###
{summary}

### Answer ###
"""

In [30]:
synthesis_model = GenericLLMModel(
    system_prompt=synthesis_system_prompt,
    human_prompt=synthesis_human_prompt
)

In [31]:
synthesis_model.predict(human_prompt_args={"question": question, "summary": summary})['answer']

🍩 https://wandb.ai/a-sh0ts/bioasq-rag-data/r/call/6d9b0a88-4eed-4394-b72f-7ea8ba6ad1ff


'Based on the provided summary, Hirschsprung disease, especially in the context of Mowat-Wilson syndrome, seems to have contributions from both genetic factors like ZFHX1B abnormalities and epigenetic factors. This suggests that the development of Hirschsprung disease in this specific syndrome involves a combination of genetic and environmental influences. However, the summary does not provide enough information to definitively classify Hirschsprung disease as purely Mendelian or multifactorial.'

In [32]:
from weave_example_demo.llm_types.rag.rag import RAGModel

In [33]:
class BioASQAdvancedRAGModel(RAGModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
    
    @weave.op()
    def score_context(self, _context) -> str:
        for doc in _context:
            doc["relevance"] = article_relevance_model.predict(human_prompt_args={"question": question, "article_text": doc["document"]["passage"]})['answer']
        

    @weave.op()
    def predict(self, question: str, n_documents: int = 5) -> str:
        self.set_vector_store(self.vector_store)
        transformed_query = question_2_query_model.predict(human_prompt_args={"question": question})['answer']
        _context = self.vector_store.get_most_relevant_documents(query=transformed_query, n=n_documents)
        self.score_context(_context)
        relevant_context = [doc for doc in _context if doc["relevance"].lower() == "yes"]
        # If no relevant context, use the most relevant document
        # this is probably not the best but good for demonstrative
        # purposes
        if len(relevant_context) == 0:
            relevant_context = [_context[0]]
        context_str = "\n\n".join([f"{doc['document']['passage']} (Score: {doc['score']})" for doc in relevant_context])
        summary = summarization_model.predict(human_prompt_args={"question": question, "context_str": context_str})['answer']
        answer = synthesis_model.predict(human_prompt_args={"question": question, "summary": summary})['answer']
        return {"answer": answer, "context": [doc["document"]["passage"] for doc in relevant_context], "all_context": _context}

In [34]:
rag_model = BioASQAdvancedRAGModel(vector_store=vector_store)


🍩 https://wandb.ai/a-sh0ts/bioasq-rag-data/r/call/e60b060a-6429-4ab8-9b05-c16158e3d80f


In [35]:
rag_model.predict(question=question, n_documents=5)





🍩 https://wandb.ai/a-sh0ts/bioasq-rag-data/r/call/95fa9630-1f37-4de9-8d11-d974a841b08e


{'answer': 'Based on the summary provided, Hirschsprung disease, especially in the context of Mowat-Wilson syndrome (MWS) associated with ZFHX1B mutations or deletions, appears to have a multifactorial etiology. The variations in enteric neural plexus abnormalities in MWS are influenced by both ZFHX1B abnormalities and epigenetic factors. Therefore, Hirschsprung disease is likely a disorder with both genetic and environmental factors contributing to its development.',
 'context': ["BACKGROUND/PURPOSE: Patients with zinc finger homeo box 1B (ZFHX1B) mutations or \ndeletions develop multiple congenital anomalies including Hirschsprung disease, \nknown as Mowat-Wilson syndrome (MWS). In this study, we investigated variations \nin the enteric neural plexus abnormalities in MWS using morphometry-based \nhistopathologic analysis.\nMETHODS: Seven patients with MWS (3 with mutations in exon 8 of ZFHX1B and 4 \nwith deletions) who had undergone modified Duhamel's operations for Hirschsprung \nd

In [36]:
qap = weave.ref('QuestionAnswerPairsTrainFiltered:latest').get()

In [37]:
from weave_example_demo.scorers.llm_guard_scorer import LLMGuardScorer
from weave_example_demo.scorers.tonic_validate_scorer import TonicValidateScorer

In [38]:
scorers = [
    TonicValidateScorer(
        metrics=[
            "AnswerSimilarityMetric",
            "AugmentationPrecisionMetric",
            "AnswerConsistencyMetric",
        ]
    ),
    LLMGuardScorer(
        metrics=["NoRefusal", "Relevance", "Sensitive"]),
]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [39]:
sub_qap = qap.rows[:10]

In [40]:
sub_qap

[TraceDict({'question': 'What are the side effects during statins administration in patients with atherosclerosis?', 'ground_truth': 'The side effects during statins administration in patients with atherosclerosis are:\n1) Myopathy\n2) Transaminase elevations\n3) Diabetes mellitus \n4) Renal and neurologic adverse effects.', 'relevant_passage_ids': [12891851], 'id': 1938}),
 TraceDict({'question': 'Which are the predominant rotavirus genotypes around the world?', 'ground_truth': 'The predominant RV genotypes circulating all over the world are G1P[8], G2P[4], G3P[8], G4P[8], and G9P[8], while G12[P6] and G12[P8] are emerging genotypes.', 'relevant_passage_ids': [30156344], 'id': 3750}),
 TraceDict({'question': 'Describe Achenbach’s syndrome.', 'ground_truth': 'Achenbach’s syndrome is Paroxysmal finger haematoma. It is benign condition resulting in the sudden appearance of bruising on one or more fingers, either spontaneously or after minimal trauma, and resolving without treatment.It ca

In [41]:
evaluation = weave.Evaluation(dataset=sub_qap, scorers=scorers)
await evaluation.evaluate(rag_model)

Retrieving responses: 100%|██████████| 1/1 [00:00<00:00, 16980.99it/s]
Retrieving responses: 100%|██████████| 1/1 [00:00<00:00, 18477.11it/s]
[92m23:40:46 - LiteLLM:INFO[0m: utils.py:3341 - Wrapper: Completed Call, calling success_handler
[92m23:40:46 - LiteLLM:INFO[0m: utils.py:3341 - Wrapper: Completed Call, calling success_handler
[92m23:40:46 - LiteLLM:INFO[0m: utils.py:1298 - [92m

POST Request Sent from LiteLLM:
curl -X POST \
https://api.openai.com/v1/ \
-d '{'model': 'gpt-3.5-turbo', 'messages': [{'role': 'system', 'content': '\n### Instruction ###\nYou are an expert medical assistant. Your task is to provide accurate, concise answers to medical questions based on summaries of relevant biomedical literature. You MUST ensure responses are clear, informative, unbiased, and avoid stereotypes. Answer in a natural, human-like manner. You will be penalized for answers that are unclear, inaccurate, biased, or overly verbose.\n\n### Process ###\n1. Carefully analyze the provided

[2m2024-06-10 23:40:51[0m [[32m[1mdebug    [0m] [1mInitialized classification model[0m [36mdevice[0m=[35mdevice(type='mps')[0m [36mmodel[0m=[35mModel(path='ProtectAI/distilroberta-base-rejection-v1', subfolder='', revision='65584967c3f22ff7723e5370c65e0e76791e6055', onnx_path='ProtectAI/distilroberta-base-rejection-v1', onnx_revision='65584967c3f22ff7723e5370c65e0e76791e6055', onnx_subfolder='onnx', onnx_filename='model.onnx', kwargs={}, pipeline_kwargs={'batch_size': 1, 'device': device(type='mps'), 'return_token_type_ids': False, 'max_length': 128, 'truncation': True}, tokenizer_kwargs={})[0m


[92m23:40:52 - LiteLLM:INFO[0m: utils.py:3341 - Wrapper: Completed Call, calling success_handler


[2m2024-06-10 23:40:52[0m [[32m[1mdebug    [0m] [1mInitialized model             [0m [36mdevice[0m=[35mdevice(type='mps')[0m [36mmodel[0m=[35mModel(path='BAAI/bge-base-en-v1.5', subfolder='', revision='a5beb1e3e68b9ab74eb54cfd186867f64f240e1a', onnx_path='BAAI/bge-base-en-v1.5', onnx_revision='a5beb1e3e68b9ab74eb54cfd186867f64f240e1a', onnx_subfolder='onnx', onnx_filename='model.onnx', kwargs={}, pipeline_kwargs={'batch_size': 1, 'device': device(type='mps')}, tokenizer_kwargs={})[0m
[2m2024-06-10 23:40:52[0m [[32m[1mdebug    [0m] [1mNo entity types provided, using default[0m [36mdefault_entity_types[0m=[35m['CREDIT_CARD', 'CRYPTO', 'EMAIL_ADDRESS', 'IBAN_CODE', 'IP_ADDRESS', 'PERSON', 'PHONE_NUMBER', 'US_SSN', 'US_BANK_NUMBER', 'CREDIT_CARD_RE', 'UUID', 'EMAIL_ADDRESS_RE', 'US_SSN_RE'][0m


[92m23:40:53 - LiteLLM:INFO[0m: utils.py:3341 - Wrapper: Completed Call, calling success_handler
[92m23:40:53 - LiteLLM:INFO[0m: utils.py:1298 - [92m

POST Request Sent from LiteLLM:
curl -X POST \
https://api.openai.com/v1/ \
-d '{'model': 'gpt-3.5-turbo', 'messages': [{'role': 'system', 'content': '\n### Instruction ###\nYou are an expert medical assistant. Your task is to provide accurate, concise answers to medical questions based on summaries of relevant biomedical literature. You MUST ensure responses are clear, informative, unbiased, and avoid stereotypes. Answer in a natural, human-like manner. You will be penalized for answers that are unclear, inaccurate, biased, or overly verbose.\n\n### Process ###\n1. Carefully analyze the provided question to understand the key information needed. \n2. Review the summary of relevant excerpts from biomedical literature.\n3. Identify the most pertinent information in the summary for answering the question.\n4. Synthesize the key points

[2m2024-06-10 23:40:54[0m [[32m[1mdebug    [0m] [1mInitialized NER model         [0m [36mdevice[0m=[35mdevice(type='mps')[0m [36mmodel[0m=[35mModel(path='Isotonic/deberta-v3-base_finetuned_ai4privacy_v2', subfolder='', revision='9ea992753ab2686be4a8f64605ccc7be197ad794', onnx_path='Isotonic/deberta-v3-base_finetuned_ai4privacy_v2', onnx_revision='9ea992753ab2686be4a8f64605ccc7be197ad794', onnx_subfolder='onnx', onnx_filename='model.onnx', kwargs={}, pipeline_kwargs={'batch_size': 1, 'device': device(type='mps'), 'aggregation_strategy': 'simple'}, tokenizer_kwargs={'model_input_names': ['input_ids', 'attention_mask']})[0m



Scoring responses: 100%|██████████| 1/1 [00:06<00:00,  6.78s/it]


[2m2024-06-10 23:40:54[0m [[32m[1mdebug    [0m] [1mLoaded regex pattern          [0m [36mgroup_name[0m=[35mCREDIT_CARD_RE[0m
[2m2024-06-10 23:40:54[0m [[32m[1mdebug    [0m] [1mLoaded regex pattern          [0m [36mgroup_name[0m=[35mUUID[0m
[2m2024-06-10 23:40:54[0m [[32m[1mdebug    [0m] [1mLoaded regex pattern          [0m [36mgroup_name[0m=[35mEMAIL_ADDRESS_RE[0m
[2m2024-06-10 23:40:54[0m [[32m[1mdebug    [0m] [1mLoaded regex pattern          [0m [36mgroup_name[0m=[35mUS_SSN_RE[0m
[2m2024-06-10 23:40:54[0m [[32m[1mdebug    [0m] [1mLoaded regex pattern          [0m [36mgroup_name[0m=[35mBTC_ADDRESS[0m
[2m2024-06-10 23:40:54[0m [[32m[1mdebug    [0m] [1mLoaded regex pattern          [0m [36mgroup_name[0m=[35mURL_RE[0m
[2m2024-06-10 23:40:54[0m [[32m[1mdebug    [0m] [1mLoaded regex pattern          [0m [36mgroup_name[0m=[35mCREDIT_CARD[0m
[2m2024-06-10 23:40:54[0m [[32m[1mdebug    [0m] [1mLoaded regex patte



Scoring responses: 100%|██████████| 1/1 [00:06<00:00,  6.20s/it]
Scoring responses: 100%|██████████| 1/1 [00:07<00:00,  7.70s/it]
[92m23:40:54 - LiteLLM:INFO[0m: utils.py:3341 - Wrapper: Completed Call, calling success_handler



Scoring responses: 100%|██████████| 1/1 [00:05<00:00,  5.64s/it]

[2m2024-06-10 23:40:54[0m [[32m[1mdebug    [0m] [1mInitialized classification model[0m [36mdevice[0m=[35mdevice(type='mps')[0m [36mmodel[0m=[35mModel(path='ProtectAI/distilroberta-base-rejection-v1', subfolder='', revision='65584967c3f22ff7723e5370c65e0e76791e6055', onnx_path='ProtectAI/distilroberta-base-rejection-v1', onnx_revision='65584967c3f22ff7723e5370c65e0e76791e6055', onnx_subfolder='onnx', onnx_filename='model.onnx', kwargs={}, pipeline_kwargs={'batch_size': 1, 'device': device(type='mps'), 'return_token_type_ids': False, 'max_length': 128, 'truncation': True}, tokenizer_kwargs={})[0m







Scoring responses: 100%|██████████| 1/1 [00:05<00:00,  5.61s/it]


[2m2024-06-10 23:40:55[0m [[32m[1mdebug    [0m] [1mInitialized classification model[0m [36mdevice[0m=[35mdevice(type='mps')[0m [36mmodel[0m=[35mModel(path='ProtectAI/distilroberta-base-rejection-v1', subfolder='', revision='65584967c3f22ff7723e5370c65e0e76791e6055', onnx_path='ProtectAI/distilroberta-base-rejection-v1', onnx_revision='65584967c3f22ff7723e5370c65e0e76791e6055', onnx_subfolder='onnx', onnx_filename='model.onnx', kwargs={}, pipeline_kwargs={'batch_size': 1, 'device': device(type='mps'), 'return_token_type_ids': False, 'max_length': 128, 'truncation': True}, tokenizer_kwargs={})[0m


[92m23:40:55 - LiteLLM:INFO[0m: utils.py:1298 - [92m

POST Request Sent from LiteLLM:
curl -X POST \
https://api.openai.com/v1/ \
-d '{'model': 'gpt-3.5-turbo', 'messages': [{'role': 'system', 'content': "\n### Instruction ###\nYou are an expert medical researcher librarian. Your task is to determine whether articles from the BioASQ dataset may be relevant to questions from clinicians based on the articles' abstracts. You MUST provide a yes or no answer. You will be penalized for answers that are not a clear yes or no.\n\n### Process ###\n1. Carefully read the provided clinical question. \n2. Analyze the given article abstract in the context of the question.\n3. Determine if the abstract contains information potentially relevant to answering the question. \n4. Provide a definitive yes or no answer. Do not hedge or equivocate.\n\n### Evaluation ###\nYour performance will be evaluated on:\n- Ability to identify abstracts with information relevant to the clinical question\n- Providing 

[2m2024-06-10 23:40:55[0m [[32m[1mdebug    [0m] [1mInitialized classification model[0m [36mdevice[0m=[35mdevice(type='mps')[0m [36mmodel[0m=[35mModel(path='ProtectAI/distilroberta-base-rejection-v1', subfolder='', revision='65584967c3f22ff7723e5370c65e0e76791e6055', onnx_path='ProtectAI/distilroberta-base-rejection-v1', onnx_revision='65584967c3f22ff7723e5370c65e0e76791e6055', onnx_subfolder='onnx', onnx_filename='model.onnx', kwargs={}, pipeline_kwargs={'batch_size': 1, 'device': device(type='mps'), 'return_token_type_ids': False, 'max_length': 128, 'truncation': True}, tokenizer_kwargs={})[0m
[2m2024-06-10 23:40:55[0m [[32m[1mdebug    [0m] [1mInitialized classification model[0m [36mdevice[0m=[35mdevice(type='mps')[0m [36mmodel[0m=[35mModel(path='ProtectAI/distilroberta-base-rejection-v1', subfolder='', revision='65584967c3f22ff7723e5370c65e0e76791e6055', onnx_path='ProtectAI/distilroberta-base-rejection-v1', onnx_revision='65584967c3f22ff7723e5370c65e0e76

[92m23:40:55 - LiteLLM:INFO[0m: utils.py:3341 - Wrapper: Completed Call, calling success_handler


[2m2024-06-10 23:40:55[0m [[32m[1mdebug    [0m] [1mInitialized classification model[0m [36mdevice[0m=[35mdevice(type='mps')[0m [36mmodel[0m=[35mModel(path='ProtectAI/distilroberta-base-rejection-v1', subfolder='', revision='65584967c3f22ff7723e5370c65e0e76791e6055', onnx_path='ProtectAI/distilroberta-base-rejection-v1', onnx_revision='65584967c3f22ff7723e5370c65e0e76791e6055', onnx_subfolder='onnx', onnx_filename='model.onnx', kwargs={}, pipeline_kwargs={'batch_size': 1, 'device': device(type='mps'), 'return_token_type_ids': False, 'max_length': 128, 'truncation': True}, tokenizer_kwargs={})[0m
[2m2024-06-10 23:40:56[0m [[32m[1mdebug    [0m] [1mNo rejection detected         [0m [36mhighest_score[0m=[35m0.0[0m
[2m2024-06-10 23:40:56[0m [[32m[1mdebug    [0m] [1mScanner completed             [0m [36melapsed_time_seconds[0m=[35m1.523868[0m [36mis_valid[0m=[35mTrue[0m [36mscanner[0m=[35mNoRefusal[0m


Retrieving responses: 100%|██████████| 1/1 [00:00<00:00, 27776.85it/s]
Scoring responses:   0%|          | 0/1 [00:00<?, ?it/s][92m23:40:56 - LiteLLM:INFO[0m: utils.py:1298 - [92m

POST Request Sent from LiteLLM:
curl -X POST \
https://api.openai.com/v1/ \
-d '{'model': 'gpt-3.5-turbo', 'messages': [{'role': 'system', 'content': "\n### Instruction ###\nYou are an expert medical researcher librarian. Your task is to determine whether articles from the BioASQ dataset may be relevant to questions from clinicians based on the articles' abstracts. You MUST provide a yes or no answer. You will be penalized for answers that are not a clear yes or no.\n\n### Process ###\n1. Carefully read the provided clinical question. \n2. Analyze the given article abstract in the context of the question.\n3. Determine if the abstract contains information potentially relevant to answering the question. \n4. Provide a definitive yes or no answer. Do not hedge or equivocate.\n\n### Evaluation ###\nYour perf

[2m2024-06-10 23:40:58[0m [[32m[1mdebug    [0m] [1mInitialized classification model[0m [36mdevice[0m=[35mdevice(type='mps')[0m [36mmodel[0m=[35mModel(path='ProtectAI/distilroberta-base-rejection-v1', subfolder='', revision='65584967c3f22ff7723e5370c65e0e76791e6055', onnx_path='ProtectAI/distilroberta-base-rejection-v1', onnx_revision='65584967c3f22ff7723e5370c65e0e76791e6055', onnx_subfolder='onnx', onnx_filename='model.onnx', kwargs={}, pipeline_kwargs={'batch_size': 1, 'device': device(type='mps'), 'return_token_type_ids': False, 'max_length': 128, 'truncation': True}, tokenizer_kwargs={})[0m
[2m2024-06-10 23:40:59[0m [[32m[1mdebug    [0m] [1mInitialized model             [0m [36mdevice[0m=[35mdevice(type='mps')[0m [36mmodel[0m=[35mModel(path='BAAI/bge-base-en-v1.5', subfolder='', revision='a5beb1e3e68b9ab74eb54cfd186867f64f240e1a', onnx_path='BAAI/bge-base-en-v1.5', onnx_revision='a5beb1e3e68b9ab74eb54cfd186867f64f240e1a', onnx_subfolder='onnx', onnx_fil

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[2m2024-06-10 23:41:00[0m [[32m[1mdebug    [0m] [1mInitialized model             [0m [36mdevice[0m=[35mdevice(type='mps')[0m [36mmodel[0m=[35mModel(path='BAAI/bge-base-en-v1.5', subfolder='', revision='a5beb1e3e68b9ab74eb54cfd186867f64f240e1a', onnx_path='BAAI/bge-base-en-v1.5', onnx_revision='a5beb1e3e68b9ab74eb54cfd186867f64f240e1a', onnx_subfolder='onnx', onnx_filename='model.onnx', kwargs={}, pipeline_kwargs={'batch_size': 1, 'device': device(type='mps')}, tokenizer_kwargs={})[0m
[2m2024-06-10 23:41:00[0m [[32m[1mdebug    [0m] [1mNo entity types provided, using default[0m [36mdefault_entity_types[0m=[35m['CREDIT_CARD', 'CRYPTO', 'EMAIL_ADDRESS', 'IBAN_CODE', 'IP_ADDRESS', 'PERSON', 'PHONE_NUMBER', 'US_SSN', 'US_BANK_NUMBER', 'CREDIT_CARD_RE', 'UUID', 'EMAIL_ADDRESS_RE', 'US_SSN_RE'][0m








Scoring responses: 100%|██████████| 1/1 [00:09<00:00,  9.92s/it]
[92m23:41:03 - LiteLLM:INFO[0m: utils.py:3341 - Wrapper: Completed Call, calling success_handler
[92m23:41:03 - LiteLLM:INFO[0m: utils.py:1298 - [92m

POST Request Sent from LiteLLM:
curl -X POST \
https://api.openai.com/v1/ \
-d '{'model': 'gpt-3.5-turbo', 'messages': [{'role': 'system', 'content': '\n### Instruction ###\nYou are an expert medical assistant. Your task is to provide accurate, concise answers to medical questions based on summaries of relevant biomedical literature. You MUST ensure responses are clear, informative, unbiased, and avoid stereotypes. Answer in a natural, human-like manner. You will be penalized for answers that are unclear, inaccurate, biased, or overly verbose.\n\n### Process ###\n1. Carefully analyze the provided question to understand the key information needed. \n2. Review the summary of relevant excerpts from biomedical literature.\n3. Identify the most pertinent information i

[2m2024-06-10 23:41:03[0m [[32m[1mdebug    [0m] [1mInitialized classification model[0m [36mdevice[0m=[35mdevice(type='mps')[0m [36mmodel[0m=[35mModel(path='ProtectAI/distilroberta-base-rejection-v1', subfolder='', revision='65584967c3f22ff7723e5370c65e0e76791e6055', onnx_path='ProtectAI/distilroberta-base-rejection-v1', onnx_revision='65584967c3f22ff7723e5370c65e0e76791e6055', onnx_subfolder='onnx', onnx_filename='model.onnx', kwargs={}, pipeline_kwargs={'batch_size': 1, 'device': device(type='mps'), 'return_token_type_ids': False, 'max_length': 128, 'truncation': True}, tokenizer_kwargs={})[0m


Scoring responses: 100%|██████████| 1/1 [00:08<00:00,  8.12s/it]
[92m23:41:06 - LiteLLM:INFO[0m: utils.py:3341 - Wrapper: Completed Call, calling success_handler


[2m2024-06-10 23:41:06[0m [[32m[1mdebug    [0m] [1mNo sensitive data found in the output[0m
[2m2024-06-10 23:41:06[0m [[32m[1mdebug    [0m] [1mScanner completed             [0m [36melapsed_time_seconds[0m=[35m6.00361[0m [36mis_valid[0m=[35mTrue[0m [36mscanner[0m=[35mSensitive[0m
[2m2024-06-10 23:41:06[0m [[32m[1minfo     [0m] [1mScanned output                [0m [36melapsed_time_seconds[0m=[35m11.463628[0m [36mscores[0m=[35m{'NoRefusal': 0.0, 'Relevance': 0.51, 'Sensitive': 0.0}[0m
[2m2024-06-10 23:41:06[0m [[32m[1mdebug    [0m] [1mInitialized NER model         [0m [36mdevice[0m=[35mdevice(type='mps')[0m [36mmodel[0m=[35mModel(path='Isotonic/deberta-v3-base_finetuned_ai4privacy_v2', subfolder='', revision='9ea992753ab2686be4a8f64605ccc7be197ad794', onnx_path='Isotonic/deberta-v3-base_finetuned_ai4privacy_v2', onnx_revision='9ea992753ab2686be4a8f64605ccc7be197ad794', onnx_subfolder='onnx', onnx_filename='model.onnx', kwargs={}, pipe

[2m2024-06-10 23:41:06[0m [[32m[1mdebug    [0m] [1mInitialized classification model[0m [36mdevice[0m=[35mdevice(type='mps')[0m [36mmodel[0m=[35mModel(path='ProtectAI/distilroberta-base-rejection-v1', subfolder='', revision='65584967c3f22ff7723e5370c65e0e76791e6055', onnx_path='ProtectAI/distilroberta-base-rejection-v1', onnx_revision='65584967c3f22ff7723e5370c65e0e76791e6055', onnx_subfolder='onnx', onnx_filename='model.onnx', kwargs={}, pipeline_kwargs={'batch_size': 1, 'device': device(type='mps'), 'return_token_type_ids': False, 'max_length': 128, 'truncation': True}, tokenizer_kwargs={})[0m


Retrieving responses: 100%|██████████| 1/1 [00:00<00:00, 8322.03it/s]
Scoring responses:   0%|          | 0/1 [00:00<?, ?it/s]

[2m2024-06-10 23:41:12[0m [[32m[1mdebug    [0m] [1mLoaded regex pattern          [0m [36mgroup_name[0m=[35mCREDIT_CARD_RE[0m
[2m2024-06-10 23:41:12[0m [[32m[1mdebug    [0m] [1mLoaded regex pattern          [0m [36mgroup_name[0m=[35mCREDIT_CARD_RE[0m
[2m2024-06-10 23:41:12[0m [[32m[1mdebug    [0m] [1mLoaded regex pattern          [0m [36mgroup_name[0m=[35mUUID[0m
[2m2024-06-10 23:41:12[0m [[32m[1mdebug    [0m] [1mLoaded regex pattern          [0m [36mgroup_name[0m=[35mCREDIT_CARD_RE[0m
[2m2024-06-10 23:41:12[0m [[32m[1mdebug    [0m] [1mLoaded regex pattern          [0m [36mgroup_name[0m=[35mCREDIT_CARD_RE[0m
[2m2024-06-10 23:41:12[0m [[32m[1mdebug    [0m] [1mLoaded regex pattern          [0m [36mgroup_name[0m=[35mCREDIT_CARD_RE[0m
[2m2024-06-10 23:41:12[0m [[32m[1mdebug    [0m] [1mLoaded regex pattern          [0m [36mgroup_name[0m=[35mCREDIT_CARD_RE[0m
[2m2024-06-10 23:41:12[0m [[32m[1mdebug    [0m] [1mL

Scoring responses: 100%|██████████| 1/1 [00:11<00:00, 11.44s/it]


[2m2024-06-10 23:41:20[0m [[32m[1mdebug    [0m] [1mInitialized classification model[0m [36mdevice[0m=[35mdevice(type='mps')[0m [36mmodel[0m=[35mModel(path='ProtectAI/distilroberta-base-rejection-v1', subfolder='', revision='65584967c3f22ff7723e5370c65e0e76791e6055', onnx_path='ProtectAI/distilroberta-base-rejection-v1', onnx_revision='65584967c3f22ff7723e5370c65e0e76791e6055', onnx_subfolder='onnx', onnx_filename='model.onnx', kwargs={}, pipeline_kwargs={'batch_size': 1, 'device': device(type='mps'), 'return_token_type_ids': False, 'max_length': 128, 'truncation': True}, tokenizer_kwargs={})[0m
[2m2024-06-10 23:41:32[0m [[32m[1mdebug    [0m] [1mNo rejection detected         [0m [36mhighest_score[0m=[35m0.05[0m
[2m2024-06-10 23:41:32[0m [[32m[1mdebug    [0m] [1mNo rejection detected         [0m [36mhighest_score[0m=[35m0.0[0m
[2m2024-06-10 23:41:32[0m [[32m[1mdebug    [0m] [1mNo rejection detected         [0m [36mhighest_score[0m=[35m0.0[0

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[2m2024-06-10 23:42:45[0m [[32m[1mdebug    [0m] [1mResult is similar to the prompt[0m [36msimilarity_score[0m=[35m0.8046054[0m
[2m2024-06-10 23:42:45[0m [[32m[1mdebug    [0m] [1mScanner completed             [0m [36melapsed_time_seconds[0m=[35m30.019465[0m [36mis_valid[0m=[35mTrue[0m [36mscanner[0m=[35mRelevance[0m


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[2m2024-06-10 23:42:50[0m [[32m[1mdebug    [0m] [1mResult is similar to the prompt[0m [36msimilarity_score[0m=[35m0.83699644[0m
[2m2024-06-10 23:42:50[0m [[32m[1mdebug    [0m] [1mScanner completed             [0m [36melapsed_time_seconds[0m=[35m35.301057[0m [36mis_valid[0m=[35mTrue[0m [36mscanner[0m=[35mRelevance[0m


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[2m2024-06-10 23:43:11[0m [[32m[1mdebug    [0m] [1mResult is similar to the prompt[0m [36msimilarity_score[0m=[35m0.84559655[0m
[2m2024-06-10 23:43:11[0m [[32m[1mdebug    [0m] [1mScanner completed             [0m [36melapsed_time_seconds[0m=[35m47.764896[0m [36mis_valid[0m=[35mTrue[0m [36mscanner[0m=[35mRelevance[0m


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[2m2024-06-10 23:43:16[0m [[32m[1mdebug    [0m] [1mNo sensitive data found in the output[0m
[2m2024-06-10 23:43:16[0m [[32m[1mdebug    [0m] [1mScanner completed             [0m [36melapsed_time_seconds[0m=[35m53.399463[0m [36mis_valid[0m=[35mTrue[0m [36mscanner[0m=[35mSensitive[0m
[2m2024-06-10 23:43:16[0m [[32m[1minfo     [0m] [1mScanned output                [0m [36melapsed_time_seconds[0m=[35m119.820852[0m [36mscores[0m=[35m{'NoRefusal': 0.0, 'Relevance': 0.0, 'Sensitive': 0.0}[0m
[2m2024-06-10 23:43:16[0m [[32m[1mdebug    [0m] [1mScanner completed             [0m [36melapsed_time_seconds[0m=[35m53.295529[0m [36mis_valid[0m=[35mFalse[0m [36mscanner[0m=[35mSensitive[0m
[2m2024-06-10 23:43:16[0m [[32m[1minfo     [0m] [1mScanned output                [0m [36melapsed_time_seconds[0m=[35m119.935433[0m [36mscores[0m=[35m{'NoRefusal': 0.0, 'Relevance': 0.0, 'Sensitive': 1.0}[0m
[2m2024-06-10 23:43:16[0m [[32m[1m

[2m2024-06-10 23:43:31[0m [[32m[1mdebug    [0m] [1mIgnoring entity               [0m [36mentity_group[0m=[35mCURRENCY[0m
[2m2024-06-10 23:43:31[0m [[32m[1mdebug    [0m] [1mIgnoring entity               [0m [36mentity_group[0m=[35mCURRENCY[0m
[2m2024-06-10 23:43:31[0m [[32m[1mdebug    [0m] [1mNo sensitive data found in the output[0m
[2m2024-06-10 23:43:31[0m [[32m[1mdebug    [0m] [1mScanner completed             [0m [36melapsed_time_seconds[0m=[35m46.247075[0m [36mis_valid[0m=[35mTrue[0m [36mscanner[0m=[35mSensitive[0m
[2m2024-06-10 23:43:31[0m [[32m[1minfo     [0m] [1mScanned output                [0m [36melapsed_time_seconds[0m=[35m102.002173[0m [36mscores[0m=[35m{'NoRefusal': 0.0, 'Relevance': 0.0, 'Sensitive': 0.0}[0m
[2m2024-06-10 23:43:31[0m [[32m[1mdebug    [0m] [1mIgnoring entity               [0m [36mentity_group[0m=[35mEYECOLOR[0m
[2m2024-06-10 23:43:31[0m [[32m[1mdebug    [0m] [1mNo sensitive data

[2m2024-06-10 23:43:32[0m [[32m[1mdebug    [0m] [1mIgnoring entity               [0m [36mentity_group[0m=[35mORGANIZATION[0m
[2m2024-06-10 23:43:32[0m [[32m[1mdebug    [0m] [1mIgnoring entity               [0m [36mentity_group[0m=[35mORGANIZATION[0m
[2m2024-06-10 23:43:32[0m [[32m[1mdebug    [0m] [1mNo sensitive data found in the output[0m
[2m2024-06-10 23:43:32[0m [[32m[1mdebug    [0m] [1mScanner completed             [0m [36melapsed_time_seconds[0m=[35m21.117702[0m [36mis_valid[0m=[35mTrue[0m [36mscanner[0m=[35mSensitive[0m
[2m2024-06-10 23:43:32[0m [[32m[1minfo     [0m] [1mScanned output                [0m [36melapsed_time_seconds[0m=[35m87.997612[0m [36mscores[0m=[35m{'NoRefusal': 0.0, 'Relevance': 0.0, 'Sensitive': 0.0}[0m


🍩 https://wandb.ai/a-sh0ts/bioasq-rag-data/r/call/c20f29bd-453c-4f80-9d5e-b58c4f7afe7a


{'TonicValidateScorer': {'answer_similarity': {'mean': 1.3},
  'augmentation_precision': {'mean': 0.3},
  'answer_consistency': {'mean': 0.44666666666666666}},
 'LLMGuardScorer': {'results_valid': {'NoRefusal': {'true_count': 10,
    'true_fraction': 1.0},
   'Relevance': {'true_count': 9, 'true_fraction': 0.9},
   'Sensitive': {'true_count': 8, 'true_fraction': 0.8}},
  'results_score': {'NoRefusal': {'mean': 0.0},
   'Relevance': {'mean': 0.051000000000000004},
   'Sensitive': {'mean': 0.2}}},
 'model_latency': {'mean': 21.244274926185607}}