# Generating the Data Sample for Retrieval Evaluation

In [27]:
from IPython.display import display, HTML
import json
import os
import time
from tqdm.auto import tqdm
import pandas as pd
import google.auth
from google.oauth2 import service_account
import vertexai
from vertexai.generative_models import GenerativeModel
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

# Authenticate to Vertex AI with a service-account key file; the project ID
# is read from the GCP_PROJECT_ID environment variable.
PROJECT_ID = os.environ["GCP_PROJECT_ID"]
service_account_key_file = "../pacific-ethos-428312-n5-eb4864ff3add.json"
credentials = service_account.Credentials.from_service_account_file(
    service_account_key_file
)
vertexai.init(
    project=PROJECT_ID,
    location="us-central1",
    credentials=credentials,
)

### Load and Sample the Source Data

In [28]:
# Path to your data JSONL file
file_path = '../data/bq-results-20240829-041517-1724904953827.jsonl'

# Number of articles to draw, and the seed that makes the draw repeatable so
# the generated ground truth can be regenerated from the same articles.
SAMPLE_SIZE = 200
RANDOM_SEED = 42

# Read the JSONL file (one JSON record per line) into a Pandas DataFrame
df = pd.read_json(file_path, lines=True)

# Get a random sample. DataFrame.sample raises ValueError when asked for more
# rows than exist (without replacement), so cap the size at the row count.
df = df.sample(
    n=min(SAMPLE_SIZE, len(df)),
    random_state=RANDOM_SEED,
).reset_index(drop=True)

# One dict per sampled row, keyed by column name
documents = df.to_dict(orient='records')

### Evaluation Data Generation Prompt

In [29]:
# Prompt sent to the LLM for each article. The single-brace fields
# ({abstract}, {authors}, {keywords}, {organization_affiliated}, {title})
# are filled in with str.format from one document dict. The doubled braces
# around the example output are escapes, so str.format renders them as
# literal { and } instead of treating them as placeholders.
prompt_template = """
You emulate a user of our Biomedical Research database.
Formulate 5 questions this user might ask based on a provided Biomedical article.
Make the questions very specific to this article.
The article record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

abstract: {abstract}
authors: {authors}
keywords: {keywords}
organization_affiliated: {organization_affiliated}
title: {title}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

### Sanity Check on First Article

In [30]:
# Render the prompt for the first sampled article.
first_document = documents[0]
prompt = prompt_template.format(**first_document)

In [31]:
def llm(prompt, model="gemini-1.5-flash-001"):
    """Send ``prompt`` to a Vertex AI generative model and return its text.

    Args:
        prompt: The prompt passed to ``generate_content``.
        model: Name of the model to instantiate.

    Returns:
        The ``text`` attribute of the generation response.
    """
    # Keep the client object under its own name instead of rebinding `model`.
    client = GenerativeModel(model)
    return client.generate_content(prompt).text

In [32]:
# Ask the model about the first article and parse its reply.
# NOTE: the cleanup removes every "json" substring and every backtick from
# the reply, not only those that form a wrapping code fence.
questions = llm(prompt)
json_string = questions.strip()
for unwanted in ('json', '`'):
    json_string = json_string.replace(unwanted, '')
json.loads(json_string)

{'questions': ['What are the systemic factors that should be considered when assessing residual periodontal probing depths?',
  'What are the non-surgical and surgical treatment options available for periodontal reassessment?',
  'How do general, practical, and local site factors influence the choice of treatment option?',
  'What are the specific systemic factors that are associated with residual periodontal probing depths?',
  'What information should be gathered at the reassessment appointment, as outlined in the first article of this series?']}

### Generate the Questions for the Sampled Records

In [33]:
def _strip_code_fence(text):
    """Return ``text`` without a wrapping Markdown code fence or ``json`` label.

    Only the two ends of the reply are touched. Removing "json" and backticks
    everywhere would also delete them from inside the generated questions.
    """
    # A JSON document never starts or ends with a backtick, so trimming
    # backticks from both ends cannot damage the payload.
    cleaned = text.strip().strip('`').strip()
    # Likewise it never starts with the bare word "json", so a leading label
    # (from an opening fence such as ```json) can be dropped safely.
    if cleaned[:4].lower() == 'json':
        cleaned = cleaned[4:]
    return cleaned.strip()


def generate_questions(doc):
    """Ask the LLM for questions about one article and return the raw JSON text.

    Args:
        doc: Mapping that supplies every placeholder used in
            ``prompt_template``.

    Returns:
        The model reply as a string, with any wrapping code fence and leading
        ``json`` label removed. The string is not parsed here.
    """
    prompt = prompt_template.format(**doc)
    return _strip_code_fence(llm(prompt))

def handle_rate_limit_error(wait_seconds=60):
    """Report a rate-limit hit and pause before the caller retries.

    Args:
        wait_seconds: How long to sleep, in seconds. Defaults to 60, which
            keeps the behaviour of calls that pass no argument.

    Returns:
        None.
    """
    print(f"Rate limit exceeded. Sleeping for {wait_seconds} seconds...")
    time.sleep(wait_seconds)

In [34]:
# Maximum attempts per document when the API keeps reporting an exhausted
# quota. Bounded so a quota that never recovers surfaces as an error
# instead of sleeping forever.
MAX_RATE_LIMIT_ATTEMPTS = 10

# Maps each document id to its list of generated questions.
results = {}
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id in results:
        # `results` starts empty above, so this only skips an id that occurs
        # more than once in `documents`.
        continue

    for attempt in range(1, MAX_RATE_LIMIT_ATTEMPTS + 1):
        try:
            # Only the API call can hit the quota; parsing stays outside.
            questions_raw = generate_questions(doc)
        except Exception as e:
            # The quota error is recognised by its message text. Anything
            # else, or a quota error on the last attempt, is re-raised with
            # its original traceback.
            if "Quota exceeded" not in str(e) or attempt == MAX_RATE_LIMIT_ATTEMPTS:
                raise
            handle_rate_limit_error()
            continue

        questions = json.loads(questions_raw)
        results[doc_id] = questions['questions']
        break  # Success: move on to the next document

  0%|          | 0/200 [00:00<?, ?it/s]

Rate limit exceeded. Sleeping for 60 seconds...
Rate limit exceeded. Sleeping for 60 seconds...
Rate limit exceeded. Sleeping for 60 seconds...
Rate limit exceeded. Sleeping for 60 seconds...
Rate limit exceeded. Sleeping for 60 seconds...
Rate limit exceeded. Sleeping for 60 seconds...
Rate limit exceeded. Sleeping for 60 seconds...
Rate limit exceeded. Sleeping for 60 seconds...
Rate limit exceeded. Sleeping for 60 seconds...
Rate limit exceeded. Sleeping for 60 seconds...
Rate limit exceeded. Sleeping for 60 seconds...
Rate limit exceeded. Sleeping for 60 seconds...
Rate limit exceeded. Sleeping for 60 seconds...
Rate limit exceeded. Sleeping for 60 seconds...
Rate limit exceeded. Sleeping for 60 seconds...
Rate limit exceeded. Sleeping for 60 seconds...
Rate limit exceeded. Sleeping for 60 seconds...
Rate limit exceeded. Sleeping for 60 seconds...
Rate limit exceeded. Sleeping for 60 seconds...
Rate limit exceeded. Sleeping for 60 seconds...
Rate limit exceeded. Sleeping for 60 sec

In [35]:
# Flatten {doc_id: [question, ...]} into a list of (doc_id, question) pairs,
# keeping the order of `results` and of each question list.
final_results = []
for doc_id, questions in results.items():
    final_results.extend((doc_id, q) for q in questions)

In [36]:
# Inspect the first (id, question) pair.
final_results[0]

('c3ea29df-6683-4443-a2c7-3f027137c1d8',
 'What are some systemic factors that influence treatment decisions for residual periodontal probing depths?')

In [37]:
# Inspect the second (id, question) pair.
final_results[1]

('c3ea29df-6683-4443-a2c7-3f027137c1d8',
 'What types of systemic factors should be considered when a patient presents with residual probing depths?')

### Save the Data as Ground Truth for Evaluation

In [38]:
# One row per (id, question) pair.
ground_truth_columns = ['id', 'question']
df_results = pd.DataFrame(data=final_results, columns=ground_truth_columns)

In [39]:
# Write the ground truth to CSV without the DataFrame index column.
ground_truth_path = '../data/ground-truth-retrieval.csv'
df_results.to_csv(ground_truth_path, index=False)