<a href="https://colab.research.google.com/github/anngvu/bioc-curation/blob/main/Bioconductor_package_curation_with_OpenAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
!pip install openai
!pip install jsonschema



In [3]:
from openai import OpenAI
from google.colab import userdata

OPENAI_API_KEY=userdata.get('OPENAI_API_KEY')
MODEL="gpt-4o"
client = OpenAI(api_key=OPENAI_API_KEY)

In [27]:
import requests

def get_text_from_url(url):
  try:
    response = requests.get(url)
    response.raise_for_status()
    return response.text
  except requests.exceptions.RequestException as e:
    print(f"Error fetching URL: {e}")
    return None

# Retrieve text from example sources for the package chromVAR
# Sources to curate from can be Bioconductor homepage, READMEs, vignettes, paper (if acccessible), function docs, ...

# Change urls to use selected material for different packages
source_for_base = "https://bioconductor.org/packages/release/bioc/html/chromVAR.html"
base_content = get_text_from_url(source_for_base)

source_for_edam = "https://raw.githubusercontent.com/GreenleafLab/chromVAR/refs/heads/master/README.md"
edam_content = get_text_from_url(source_for_edam)


In [31]:
from jsonschema import validate
import json

# Retrieve schemas

# Base
base_schema = get_text_from_url("https://raw.githubusercontent.com/anngvu/bioc-curation/refs/heads/main/base.json")
base_validation = json.loads(base_schema)

# EDAM
edam_schema = get_text_from_url("https://raw.githubusercontent.com/anngvu/bioc-curation/refs/heads/main/edammap.json")
edam_validation = json.loads(edam_schema)

In [29]:
# Base schema completion

def schema_completion(content, schema):
  completion=client.chat.completions.create(
    model=MODEL,
    messages=[
      {"role": "system", "content": "You are a helpful expert in data curation and data modeling, especially with structured JSON data. You return only valid JSON without any other explanation so that it can be inserted into a database."},
      {"role": "user", "content": "Given content about a bioformatics tool, represent it as a JSON object compliant with the provided schema:" +
       "\nCONTENT:\n\n" + content + '\nSCHEMA:\n\n' + schema}]
  )
  return(completion)

base_completion = schema_completion(base_content, base_schema)
base_json = base_completion.choices[0].message.content


In [26]:
json.loads(base_json)

{'name': 'chromVAR',
 'description': 'chromVAR is an R package for the analysis of sparse chromatin accessibility data from single cell or bulk ATAC or DNAse-seq data. The package aims to identify motifs or other genomic annotations associated with variability in chromatin accessibility between individual cells or samples.',
 'homepage': 'https://greenleaflab.github.io/chromVAR/',
 'version': ['1.0.0'],
 'otherID': [{'type': 'doi', 'value': '10.1038/nmeth.4401', 'version': '1.0'}],
 'toolType': ['Library', 'Command-line tool'],
 'operatingSystem': ['Linux', 'Windows', 'Mac'],
 'language': ['R'],
 'license': 'GPL-3.0',
 'collectionID': 'bioinformatics',
 'maturity': 'Mature',
 'cost': 'Free of charge',
 'accessibility': 'Open access',
 'elixirPlatform': ['Tools'],
 'elixirNode': ['EMBL'],
 'elixirCommunity': ['Galaxy'],
 'link': [{'url': 'https://github.com/GreenleafLab/chromVAR/issues',
   'type': ['Issue tracker'],
   'note': 'For installation issues and other technical queries.'}],
 

In [32]:
# Validate and sending result to be corrected (default: max of 3 times), based on validation error

validate(instance=json.loads(base_json), schema=base_validation)

ValidationError: ['Web application'] is not of type 'string', 'null'

Failed validating 'type' in schema['properties']['toolType']:
    {'type': ['string', 'null'],
     'items': {'title': 'Tool type',
               'description': 'A type of application software: a discrete '
                              'software entity can have more than one '
                              'type.',
               'type': 'string',
               'enum': ['Bioinformatics portal',
                        'Command-line tool',
                        'Database portal',
                        'Desktop application',
                        'Library',
                        'Ontology',
                        'Plug-in',
                        'Script',
                        'SPARQL endpoint',
                        'Suite',
                        'Web application',
                        'Web API',
                        'Web service',
                        'Workbench',
                        'Workflow'],
               '$comment': 'bio.tools includes all types of bioinformatics '
                           'tools: application software with well-defined '
                           'data processing functions (inputs, outputs and '
                           'operations). When registering a tool, one or '
                           'more tool types may be assigned, reflecting '
                           'the different facets of the software being '
                           'described.'}}

On instance['toolType']:
    ['Web application']

In [14]:
# EDAM schema completion

edam_completion = schema_completion(edam_content, edam_schema)
edam_json = edam_completion.choices[0].message.content
print(edam_json)

{
  "name": "EDAM mapping",
  "description": "Description of topics and function for a bioinformatics tool chromVAR using the EDAM vocabulary.",
  "topic": [
    {
      "term": "Gene regulation"
    },
    {
      "term": "Gene expression"
    },
    {
      "term": "Functional genomics"
    },
    {
      "term": "Epigenomics"
    },
    {
      "term": "Transcription factors and regulatory sites"
    },
    {
      "term": "DNA"
    }
  ],
  "function": [
    {
      "operation": [
        {
          "term": "Gene expression profiling"
        },
        {
          "term": "Gene regulatory network analysis"
        },
        {
          "term": "Genome annotation"
        }
      ],
      "input": [
        {
          "data": {
            "term": "Nucleic acid sequence"
          },
          "format": [
            {
              "term": "BAM"
            },
            {
              "term": "BED"
            }
          ]
        }
      ],
      "output": [
        {
    