<a href="https://colab.research.google.com/github/anngvu/bioc-curation/blob/main/Bioconductor_package_curation_with_OpenAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install openai
!pip install jsonschema



In [3]:
from openai import OpenAI
import os

"""
Checks if the code is running in Google Colab or a local environment
to determine how to retrieve the OpenAI API key.
get_ipython() returns the current interactive shell instance
when running in a Jupyter/IPython environment.
"""
# Outside Colab the key comes from the environment; inside Colab it is read
# through google.colab's userdata module.
if 'google.colab' not in str(get_ipython()):
    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
else:
    from google.colab import userdata
    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

# Model name passed to the chat-completion requests made below.
MODEL = "gpt-4o"
client = OpenAI(api_key=OPENAI_API_KEY)

In [4]:
import requests

def get_text_from_url(url, timeout=30):
  """Fetch a URL with HTTP GET and return the response body as text.

  Args:
    url: Address to request.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout, requests.get can block indefinitely on an unresponsive host.

  Returns:
    The response text, or None if the request failed (connection error,
    timeout, or an HTTP error status); the error is printed in that case.
  """
  try:
    response = requests.get(url, timeout=timeout)
    # Turn HTTP error statuses into exceptions so they are reported below.
    response.raise_for_status()
    return response.text
  except requests.exceptions.RequestException as e:
    print(f"Error fetching URL: {e}")
    return None

# Retrieve text from example sources for the package chromVAR.
# Sources to curate from can be the Bioconductor homepage, READMEs, vignettes,
# the paper (if accessible), function docs, ...

# Change these URLs to use selected material for different packages.
# Note: get_text_from_url returns None when a request fails, so either
# *_content value may be None if its source could not be fetched.

# Text used for the base-schema completion.
source_for_base = "https://bioconductor.org/packages/release/bioc/html/chromVAR.html"
base_content = get_text_from_url(source_for_base)

# Text used for the EDAM-schema completion.
source_for_edam = "https://raw.githubusercontent.com/GreenleafLab/chromVAR/refs/heads/master/README.md"
edam_content = get_text_from_url(source_for_edam)


In [5]:
from jsonschema import validate, ValidationError
import json

# Retrieve schemas
# Each schema is kept as raw text: it is embedded as-is in the prompt and
# parsed with json.loads only when validating. A failed download yields None.

# Base
base_schema = get_text_from_url("https://raw.githubusercontent.com/anngvu/bioc-curation/refs/heads/main/base.json")

# EDAM
edam_schema = get_text_from_url("https://raw.githubusercontent.com/anngvu/bioc-curation/refs/heads/main/edammap.json")

In [6]:
# Base schema completion

def schema_completion(content, schema):
  """Ask the chat model to represent `content` as JSON compliant with `schema`.

  Args:
    content: Source text describing the tool (str).
    schema: JSON Schema, as text, that the returned object should satisfy (str).

  Returns:
    The object returned by client.chat.completions.create.

  Raises:
    TypeError: If `content` or `schema` is not a string (for example None
      after a failed download). Raised before any API request is made.
  """
  if not isinstance(content, str) or not isinstance(schema, str):
    raise TypeError(
      "content and schema must both be strings; got "
      f"{type(content).__name__} and {type(schema).__name__} (did a download fail?)"
    )

  # The sentences are joined with an explicit space so the prompt does not
  # read "...JSON data.You return...".
  system_prompt = (
    "You are a helpful expert in data curation and data modeling, especially with structured JSON data. "
    "You return only valid JSON string, not in a code block, and without any other explanation "
    "so that the string can be decoded and inserted into a database."
  )
  user_prompt = (
    "Given content about a bioinformatics tool, represent it as a JSON object compliant with the provided schema:"
    "\nCONTENT:\n\n" + content + "\nSCHEMA:\n\n" + schema
  )

  completion = client.chat.completions.create(
    model=MODEL,
    messages=[
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": user_prompt},
    ],
  )
  return completion

# Request the base-schema completion. The whole completion object is kept
# (not just its text) because its `.usage` is read later to estimate cost.
base_completion = schema_completion(base_content, base_schema)


In [8]:
# Validate the completion; on failure, send the validation error back to the model for correction (default: at most 3 times)

def fix_completion(content, error):
  """Ask the chat model for a corrected version of a JSON string.

  Args:
    content: The JSON text that was rejected.
    error: Text of the error reported for `content`.

  Returns:
    The object returned by client.chat.completions.create.
  """
  instructions = {
    "role": "system",
    "content": "You are debugging an API. Review the given JSON object and schema error and return the corrected JSON object only.",
  }
  report = {
    "role": "user",
    "content": "JSON:\n\n" + content + "\nERROR:\n\n" + error,
  }
  return client.chat.completions.create(model=MODEL, messages=[instructions, report])

def validate_json_with_retries(json_string, schema, max_retries=3, attempts=0):
    """Parse and validate JSON text, asking the model to repair it on failure.

    Args:
        json_string: Candidate JSON text.
        schema: JSON Schema as text; parsed once with json.loads.
        max_retries: Maximum number of correction prompts sent via fix_completion.
        attempts: Number of correction prompts already used; kept so existing
            callers that pass a starting count keep working.

    Returns:
        The parsed JSON value that passed schema validation. This is also
        returned when one or more corrections were needed.

    Raises:
        Exception: If `attempts` already exceeds `max_retries` on entry.
        json.JSONDecodeError, ValidationError: The most recent failure,
            re-raised once `max_retries` correction prompts have been used.
    """
    validation = json.loads(schema)

    if attempts > max_retries:
        raise Exception(f"Failed to validate JSON after {max_retries} attempts")

    # Iterate instead of recursing so the validated value is always returned
    # to the caller and the schema is parsed only once.
    while True:
        try:
            parsed_json = json.loads(json_string)
            validate(instance=parsed_json, schema=validation)
        except (json.JSONDecodeError, ValidationError) as e:
            if attempts >= max_retries:
                # Retry budget exhausted: surface the last parse/validation error.
                raise
            attempts += 1
            print("JSON not valid, trying QC/correction prompt, attempt", attempts)
            response = fix_completion(json_string, str(e))
            json_string = response.choices[0].message.content
        else:
            # Both JSON parsing and validation succeeded
            print("Success after", attempts, "attempts")
            return parsed_json
    

# Take the generated text from the first choice of the base completion, then
# parse it and validate it against the base schema.
base_json = base_completion.choices[0].message.content
base_final = validate_json_with_retries(base_json, base_schema)

Success after 0 attempts


In [10]:
# EDAM schema completion

# Same steps as for the base schema, using the EDAM source text and schema:
# request a completion, take the first choice's text, then parse and validate it.
edam_completion = schema_completion(edam_content, edam_schema)
edam_json = edam_completion.choices[0].message.content
edam_final = validate_json_with_retries(edam_json, edam_schema)

Success after 0 attempts


In [29]:
# https://openai.com/api/pricing/
# Note: minimum cost, ignores cached tokens and completions for QC re-prompts
def openai_completion_cost(usage, input_pricing_per_token=0.0000025, output_pricing_per_token=0.00001):
    """Estimate the USD cost of one completion from its token usage.

    Args:
        usage: Object exposing `prompt_tokens` and `completion_tokens`
            (for example the `.usage` attribute of a completion).
        input_pricing_per_token: USD charged per prompt token. The default is
            the rate this notebook was written with; pass another rate when
            pricing or the model changes.
        output_pricing_per_token: USD charged per completion token (same caveat).

    Returns:
        prompt_tokens * input rate + completion_tokens * output rate.
        Cached-token pricing is not taken into account.
    """
    prompt_cost = usage.prompt_tokens * input_pricing_per_token
    completion_cost = usage.completion_tokens * output_pricing_per_token
    return prompt_cost + completion_cost

# Report the estimated spend of the two initial inference calls.
print(
    "-- Costs in USD -- ",
    "\nBase inference:", openai_completion_cost(base_completion.usage),
    "\nEdam inference:", openai_completion_cost(edam_completion.usage),
)


-- Costs in USD --  
Base inference: 0.0399325 
Edam inference: 0.043965000000000004


In [11]:
# View final results
# Each validated result is printed on a single line in its plain Python repr.
print(base_final)
print(edam_final)

{'name': 'chromVAR', 'description': 'Determine variation in chromatin accessibility across sets of annotations or peaks. Designed primarily for single-cell or sparse chromatin accessibility data, e.g. from scATAC-seq or sparse bulk ATAC or DNAse-seq experiments.', 'homepage': 'http://bioconductor.org/packages/chromVAR/', 'version': ['1.26.0'], 'otherID': [{'type': 'doi', 'value': '10.18129/B9.bioc.chromVAR', 'version': '1.26.0'}], 'toolType': ['Library'], 'operatingSystem': ['Linux', 'Windows', 'Mac'], 'language': ['R'], 'license': 'MIT', 'collectionID': [], 'maturity': 'Mature', 'cost': 'Free of charge', 'accessibility': 'Open access', 'link': [{'url': 'https://support.bioconductor.org/tag/chromvar', 'type': ['Discussion forum'], 'note': 'Bioconductor Support site for chromVAR.'}, {'url': 'https://bioconductor.org/checkResults/release/bioc-LATEST/chromVAR/', 'type': ['Technical monitoring'], 'note': 'Build report page for chromVAR.'}], 'download': [{'url': 'http://bioconductor.org/pac

In [None]:
# For EDAM, get class ids
