In [1]:
import google.generativeai as genai
import json
import os
import time
from pathlib import Path
from tqdm import tqdm

In [2]:
# Configure the google.generativeai client with the API key taken from the
# GEMINI_API_KEY environment variable. Indexing os.environ raises KeyError when
# the variable is unset, and the key itself never appears in the notebook.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Generation and safety settings passed to genai.GenerativeModel when the
# model is built.
# See https://ai.google.dev/api/python/google/generativeai/GenerativeModel
generation_config = {
    "temperature": 0.3,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "application/json",
}

# Every harm category uses the same blocking threshold, so the list is derived
# from the category names instead of spelling out each entry by hand.
safety_settings = [
    {"category": harm_category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

In [3]:
# Load each raw API documentation file in data_dir into memory as one string.
data_dir = "data/eutils_raw/"

# Path.iterdir() yields entries in arbitrary, filesystem-dependent order, so the
# paths are sorted to keep the document order (and therefore the order of the
# generated responses) reproducible across runs and machines. Non-file entries
# (for example a notebook checkpoint folder) are skipped because they cannot be
# read as text. The encoding is explicit so decoding does not depend on the
# platform's default locale.
api_docs = [
    doc_path.read_text(encoding="utf-8")
    for doc_path in sorted(Path(data_dir).iterdir())
    if doc_path.is_file()
]

In [4]:
# Prompt template into which each API document is substituted before it is
# sent to the model. The file is read in a single call, with an explicit
# encoding so the text does not depend on the platform's default locale.
with open("data/prompts/apidoc2json.md", encoding="utf-8") as f:
    base_prompt = f.read()

In [6]:

# Model that converts each API document into structured JSON.
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash-latest",
    safety_settings=safety_settings,
    generation_config=generation_config,
    system_instruction="You are a helpful, meticulous, scientific programming assistant. You carefully read API documentation and generate JSON.",
)

# The template must contain the placeholder: str.replace() is a silent no-op
# when it is missing, and every request would then go out without its document.
doc_placeholder = "{API_DOC}"
if doc_placeholder not in base_prompt:
    raise ValueError(f"prompt template does not contain {doc_placeholder!r}")

# Pause between consecutive requests, in seconds.
request_pause_s = 0.5

responses = []    # parsed JSON, one entry per successfully processed document
failed_docs = []  # (index into api_docs, error description) for unusable replies
for doc_index, doc in enumerate(tqdm(api_docs)):
    prompt = base_prompt.replace(doc_placeholder, doc)

    response = model.generate_content(prompt)
    try:
        # response.text raises ValueError when the reply carries no usable
        # content (e.g. it was blocked), and json.JSONDecodeError is itself a
        # ValueError subclass (e.g. output cut off at max_output_tokens), so
        # one handler covers both without aborting the remaining documents.
        parsed = json.loads(response.text)
    except ValueError as err:
        failed_docs.append((doc_index, repr(err)))
    else:
        responses.append(parsed)
    time.sleep(request_pause_s)

# Surface failures explicitly so a partial result is never mistaken for a full one.
if failed_docs:
    print(f"{len(failed_docs)} of {len(api_docs)} documents produced no usable JSON: {failed_docs}")

100%|██████████| 6/6 [00:52<00:00,  8.67s/it]


In [7]:
# Display the parsed JSON collected from the model for the processed documents.
responses

[[{'user_name': 'template_usrname',
   'api_name': 'EFetch',
   'api_call': 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db={db}&id={id}&retmode={retmode}&rettype={rettype}',
   'api_version': '1.0',
   'api_arguments': {'db': 'string (The database to retrieve records from. Must be a valid Entrez database name. Default is pubmed)',
    'id': 'string (A single UID or a comma-delimited list of UIDs to retrieve. All UIDs must be from the database specified by db)',
    'retmode': 'string (The data format of the records returned. Options include plain text, HTML, or XML. See Table 1 in Chapter 2 for a full list of allowed values for each database)',
    'rettype': 'string (The record view returned. Options include Abstract or MEDLINE from PubMed, or GenPept or FASTA from protein. See Table 1 in Chapter 2 for a full list of allowed values for each database)'},
   'functionality': 'Retrieves formatted data records for a list of input UIDs',
   'env_requirements': ['requests'],
