# MPS Annotation Pipeline #

In [None]:
import os
import subprocess
from urllib.parse import unquote

import pipe
import yaml
from dotenv import load_dotenv
from pprintpp import pprint as pp

load_dotenv()  # load variables from .env

# Load the OpenAI API key and fail fast when it is missing: a later cell
# interpolates this variable into `runoak set-apikey`, and an unset key would
# otherwise be passed along as the text "None".
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or environment."
    )

OntoGPT examples: [here](https://github.com/monarch-initiative/ontogpt/blob/main/notebooks/)

OntoGPT templates: [here](https://github.com/monarch-initiative/ontogpt/blob/main/src/ontogpt/templates/)

CurateGPT examples: [here](https://github.com/monarch-initiative/curategpt/blob/main/notebooks/command-line/)

In [None]:
!pip install --upgrade pip
!pip install onnxruntime
!pip install pyyaml
!pip install pprintpp
!pip install pipe

### Install LLM Tools ###

OntoGPT is based on Structured Prompt Interrogation and Recursive Extraction of Semantics (SPIRES), a method for extracting ontological content from text or structured data, described by [Caufield et al., 2024](https://doi.org/10.1093/bioinformatics/btae104).

CurateGPT is another library; it uses LLM embeddings to rank ontology content by semantic similarity to text or structured data input. CurateGPT also enables users to suggest new ontology content and programmatically interact with GitHub issue trackers. The CurateGPT preprint is available [here](https://doi.org/10.48550/arXiv.2411.00046).

In [None]:
!pip install ontogpt
!pip install curategpt

### Set OpenAI API Key ###

In [None]:
# Store the OpenAI API key via `runoak set-apikey`. `$OPENAI_API_KEY` is
# expected to be filled in from the variable assigned in the setup cell.
!runoak set-apikey -e openai $OPENAI_API_KEY

### Show OntoGPT and CurateGPT Options ###

In [None]:
# Print the top-level help text of the `ontogpt` command-line tool.
!ontogpt --help

In [None]:
# Print the top-level help text of the `curategpt` command-line tool.
!curategpt --help

In [None]:
!ontogpt -vvv extract -i example1.txt -t templates/human_phenotype.yaml -o output.yaml --model-provider openai

In [None]:
# Parse the YAML results file, then pretty-print the parsed object.
with open("output/output.yaml") as results_file:
    output1 = yaml.safe_load(results_file)

pp(output1)

### Index HPO For AUTO Prefix Terms ###

In [None]:
# Index the ontology source `sqlite:obo:hp` into the `terms_hp` collection.
# `-m openai:` presumably selects the default OpenAI embedding model — confirm
# against the CurateGPT documentation.
!curategpt ontology index  -m openai: -c terms_hp sqlite:obo:hp

In [None]:
# Read the extraction output from disk so this cell can run independently of
# the variables created by earlier cells.
with open("output/output.yaml", "r") as f:
    data = yaml.safe_load(f)

AUTO_PREFIX = "AUTO:"

# The phenotypes entry may be absent or null when nothing was extracted;
# treat both cases as an empty list instead of raising.
phenotypes = data["extracted_object"].get("phenotypes") or []

# Keep only the string entries whose identifier starts with the AUTO: prefix.
raw_auto_terms = [
    item
    for item in phenotypes
    if isinstance(item, str) and item.startswith(AUTO_PREFIX)
]

# Strip the prefix (only at the start of the id) and decode every
# percent-escape, not just %20, to recover the human-readable label.
auto_terms = [unquote(item[len(AUTO_PREFIX):]) for item in raw_auto_terms]

print(auto_terms)


# Possible follow-up: search the indexed collection for a decoded term, e.g.
#!curategpt search -c terms_hp "alginate transport"

In [None]:
# Ask a free-text question against the `phenopackets_384` collection.
# NOTE(review): no visible cell creates that collection — confirm it has been
# indexed before running this command.
!curategpt ask -c phenopackets_384 "what genes are associated with renal phenotypes?"