In [1]:
import os

import google.generativeai as genai
import json

from pathlib import Path

In [2]:
# Authenticate the Gemini client with the key read from the GEMINI_API_KEY
# environment variable; os.environ[...] raises KeyError if it is not set.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Options handed to the GenerativeModel constructor in the next cell.
# See https://ai.google.dev/api/python/google/generativeai/GenerativeModel
generation_config = dict(
    temperature=0.3,
    top_p=0.95,
    top_k=64,
    max_output_tokens=8192,
    # The response text is later parsed with json.loads.
    response_mime_type="application/json",
)

# Every harm category uses the same blocking threshold, so build the list
# from the category names instead of spelling out four near-identical dicts.
safety_settings = [
    {"category": harm_category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

In [3]:
# Instantiate the Gemini model wrapper used by the generation cell below.
model = genai.GenerativeModel(
    # Which model to call and the persona it is given.
    model_name="gemini-1.5-flash-latest",
    system_instruction="You are a helpful, meticulous, scientific research assistant.",
    # Sampling/output options and content filters defined in the previous cell.
    generation_config=generation_config,
    safety_settings=safety_settings,
)

In [4]:
# Assemble the prompt: a fixed instruction followed by the full text of the
# database-description document.
_file = "data/prompts/entrez_dbs.md"

# Read the whole file in one call. The encoding is passed explicitly so the
# result does not depend on the platform's default locale encoding
# (assumes the markdown file is saved as UTF-8 — confirm if it is not).
doc = Path(_file).read_text(encoding="utf-8")

prompt = (
    "Instruction: Condense each database description to an informative one sentence summary.\n\n"
    + doc
)

In [5]:
# Send the assembled prompt to the model and keep the response object.
# NOTE(review): presumably a remote API request, so re-running this cell
# issues it again — confirm before adding to a Run All workflow.
response = model.generate_content(prompt)

In [7]:
# Decode the response body as JSON (generation_config requests
# "application/json" output); json.loads raises json.JSONDecodeError if the
# text is not valid JSON.
# NOTE(review): despite the name `text`, the displayed output looks like a
# dict mapping database name -> one-sentence summary — confirm.
text = json.loads(response.text)

{'Assembly': 'Assembly provides versioned accession identifiers for submitted and RefSeq assemblies, linking to components in the Nucleotide system and providing direct access to downloads on the NCBI FTP site.', 'BioProject': 'BioProject is a searchable collection of large-scale molecular projects including genome sequencing, transcriptome, metagenomic, annotation, expression, and mapping projects, providing a central point to link to all associated data in NCBI databases.', 'BioSample': 'BioSample contains descriptions of biological source materials used in studies with data in other NCBI molecular databases such as Assembly, Nucleotide, and SRA.', 'BioSystems': 'BioSystems collects information on interacting sets of biomolecules involved in metabolic and signaling pathways, disease states, and other biological processes, currently containing pathways from KEGG and EcoCyc and designed to accommodate other data in the future.', 'Bookshelf': 'NCBI Bookshelf contains a collection of ful