In [1]:
import logging
import re
import sys

from oaklib import get_adapter
from oaklib.datamodels.text_annotator import TextAnnotation
from oaklib.interfaces.text_annotator_interface import TextAnnotatorInterface
from oaklib.utilities.lexical.lexical_indexer import load_lexical_index
from rdflib import Dataset, Graph, Namespace, URIRef, BNode, RDF, Literal
from rdflib.namespace import OWL, RDFS

from tqdm.notebook import tqdm

# from oaklib.datamodels.vocabulary import OBONamespace

In [2]:
# Notebook-scoped logger that reports INFO and above.
log = logging.getLogger("chebi_annotation")
log.setLevel(logging.INFO)

# Attach the stdout handler only when the logger has none yet, so re-running
# this cell does not stack up duplicate handlers.
if len(log.handlers) == 0:
    formatter = logging.Formatter('%(levelname)s:%(name)s: %(message)s')
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(formatter)
    handler.setLevel(logging.INFO)
    log.addHandler(handler)

In [3]:
# Path of the lexical index file that is handed to load_lexical_index.
LEX_INDEX_FILE = "chebi.lex"

In [4]:
# Path of the input N-Quads file (parsed into an rdflib Dataset further down)
file_path = "DrugResponseProperty_of_ncbi_taxon.nq"

In [5]:
# Destination path for the Turtle serialization of the enriched graph.
output_file = "DrugResponseProperty_enriched_with_subjects.ttl"

In [6]:
# Target named graph: identifier of the graph inside the dataset whose triples
# are processed by the main loop.
TARGET_GRAPH_URI = URIRef("http://example.com/metpo/DrugResponseProperty_of_ncbi_taxon")

In [7]:
# Selector string passed to oaklib's get_adapter to build the OAK adapter
# (the adapter itself is created in a later cell).
oak_adapter_string = "sqlite:obo:chebi"

In [8]:
def normalize_text(text):
    """Swap each U+2005 (four-per-em space) for an ASCII space, then trim both ends."""
    four_per_em_space = "\u2005"
    # Splitting on the character and re-joining with " " is a one-for-one swap.
    cleaned = " ".join(text.split(four_per_em_space))
    return cleaned.strip()

In [9]:
def normalize_all_spaces(text):
    """Replace Unicode space-like characters with ASCII spaces and trim the ends.

    Each matched character is replaced one-for-one (runs are not collapsed).
    The character class covers the Unicode space separators other than the
    ASCII space itself -- U+00A0 (no-break space), U+1680, U+2000-U+200A,
    U+202F, U+205F, U+3000 -- plus U+200B (zero-width space).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with leading/trailing whitespace removed.
    """
    # U+00A0 and U+1680 were previously missing, so a no-break space in the
    # middle of a label survived "normalization".
    return re.sub(r"[\u00A0\u1680\u2000-\u200B\u202F\u205F\u3000]", " ", text).strip()

In [10]:
def filter_and_label_longest_annotations(annotations, oak_adapter):
    """
    Keep only the annotations that share the greatest span length, ignoring
    any span shorter than 3 characters, and attach a ``canonical_label``
    attribute (looked up via ``oak_adapter.label``) to each survivor.

    Span length is ``subject_end - subject_start + 1``. Returns an empty list
    when nothing reaches the 3-character minimum. The returned annotation
    objects are the input objects themselves, mutated in place.
    """
    def span_length(annotation):
        return annotation.subject_end - annotation.subject_start + 1

    # Discard matches that are too short to be meaningful.
    candidates = [candidate for candidate in annotations if span_length(candidate) >= 3]
    if len(candidates) == 0:
        return []

    widest = max(map(span_length, candidates))

    # Collect every annotation tied for the widest span, labelling as we go.
    winners = []
    for candidate in candidates:
        if span_length(candidate) != widest:
            continue
        candidate.canonical_label = oak_adapter.label(candidate.object_id)
        winners.append(candidate)

    return winners


In [11]:
# Helper: Parse free-text entries like "penicillin (resistant)"
def parse_drug_response(value):
    """Split a comma-separated drug-response string into structured entries.

    Commas inside a parenthesised group do not split, so
    ``"a (1, 2), b"`` yields two entries. Each entry is a dict with a
    ``"label"`` key and, when the item ends in ``(...)``, a ``"details"`` key
    holding the parenthesised text.

    Items that are empty after trimming (empty input, doubled or dangling
    commas) are skipped instead of producing ``{"label": ""}`` entries.

    Args:
        value: Free-text string such as ``"penicillin (resistant), ampicillin"``.

    Returns:
        A list of dicts, in input order; empty when there is nothing to parse.
    """
    # Split on commas that are not followed by a closing parenthesis before
    # any other parenthesis, i.e. commas outside "( ... )".
    items = re.split(r",\s*(?![^()]*\))", value.strip(", "))
    parsed = []
    for item in items:
        item = item.strip()
        if not item:
            # Nothing between two separators -- not a real entry.
            continue
        match = re.match(r"^(.*?)\s*\(([^)]+)\)$", item)
        if match:
            parsed.append({"label": match[1].strip(), "details": match[2].strip()})
        else:
            parsed.append({"label": item})
    return parsed

In [12]:
def cached_annotate(label: str, annotator) -> list[TextAnnotation]:
    """Return the annotations for ``label``, memoised in ``_annotate_cache``.

    On a cache miss the result of ``annotator.annotate_text(label)`` is
    materialised into a list, stored under ``label``, and returned; later
    calls with the same label return that same list object.
    """
    if label not in _annotate_cache:
        _annotate_cache[label] = list(annotator.annotate_text(label))
    return _annotate_cache[label]

In [13]:
def cached_label(curie: str, oak_adapter) -> str:
    """Return ``oak_adapter.label(curie)``, memoised in ``_label_cache``.

    Whatever the adapter returns is cached as-is under ``curie`` and reused
    on subsequent calls.
    """
    try:
        return _label_cache[curie]
    except KeyError:
        # First request for this CURIE: fall through to the adapter lookup.
        pass
    resolved = oak_adapter.label(curie)
    _label_cache[curie] = resolved
    return resolved

In [14]:
# Memo used by cached_annotate: label text -> list of annotations.
# Re-running this cell empties the cache.
_annotate_cache = {}

In [15]:
# Memo used by cached_label: CURIE -> label returned by the OAK adapter.
# Re-running this cell empties the cache.
_label_cache = {}

In [16]:
# Build the OAK adapter from the selector string defined above; it is used
# for label lookups on matched identifiers.
oak_adapter = get_adapter(oak_adapter_string)

In [17]:
# Load the lexical index from LEX_INDEX_FILE.
lexical_index = load_lexical_index(LEX_INDEX_FILE)
# NOTE: the author recorded "6 minutes" here -- presumably how long this load
# takes; confirm before relying on it.

In [18]:
# Set up the TextAnnotatorInterface with the lexical index.
# The interface is instantiated directly and the preloaded index is assigned
# to its lexical_index attribute; this annotator is what cached_annotate calls.
annotator = TextAnnotatorInterface()
annotator.lexical_index = lexical_index

In [19]:
# Parse the N-Quads input into an rdflib Dataset so its named graphs can be
# accessed individually. The parse call is the cell's last expression, so its
# return value is echoed as the cell output.
dataset = Dataset()
dataset.parse(file_path, format="nquads")

<Graph identifier=urn:x-rdflib:default (<class 'rdflib.graph.Graph'>)>

In [20]:
# Define constants for METPO IRIs

# Classes
CLASS_IRI = URIRef("https://w3id.org/metpo/1000527")              # antimicrobial susceptibility assay result
DETAILS_CLASS = URIRef("https://w3id.org/metpo/1000528")          # structured susceptibility detail
COMPOUND_CLASS = URIRef("https://w3id.org/metpo/1000526")         # chemical entity
TAXON_CLASS = URIRef("https://w3id.org/metpo/1000525")            # microbe

# Object properties linking the nodes together
HAS_DETAILS = URIRef("https://w3id.org/metpo/2000002")            # has structured assay detail
ABOUT_COMPOUND = URIRef("https://w3id.org/metpo/2000003")         # has antimicrobial substance
HAS_SUBJECT = URIRef("https://w3id.org/metpo/2000004")            # has organism identified

# Literal-valued properties
SOURCE_STRING = URIRef("https://w3id.org/metpo/2000007")          # has normalized compound string
ASSAY_DETAILS = URIRef("https://w3id.org/metpo/2000006")          # has assay measurement value
RAW_TEXT = URIRef("https://w3id.org/metpo/2000005")               # has original response string

SUSCEPTIBLE_TO = URIRef("https://w3id.org/metpo/2000000")         # has susceptibility profile
RESISTANT_TO = URIRef("https://w3id.org/metpo/2000001")           # has resistance profile

# Predicate under which the input graph stores the raw free-text value; the
# main loop reads it and re-emits the value under RAW_TEXT.
# (RAW_TEXT used to be assigned a second time here with the same IRI; the
# redundant definition has been removed.)
HAS_VALUE_INPUT = URIRef("http://example.com/metpo/has_value")

In [21]:
# Output RDF graph (you could also make this a named graph if needed).
# Starts empty; the main loop adds every enriched triple to it.
enriched_graph = Graph()

# Access the named graph identified by TARGET_GRAPH_URI inside the parsed dataset.
named_graph = dataset.graph(TARGET_GRAPH_URI)

In [22]:
# Main processing loop
#
# For every triple in the target graph whose object is a blank node carrying a
# literal HAS_VALUE_INPUT value, copy it into enriched_graph with METPO typing
# and split the free text into one detail node per listed compound, grounding
# each compound to ChEBI when the annotator finds a match.
#
# The triples are materialised into a list first so tqdm knows the total.
triples_iter = list(named_graph.triples((None, None, None)))
for ncbi_taxon, predicate, response_node in tqdm(triples_iter, desc="Processing DrugResponseProperty triples"):
    # Only blank-node objects are treated as drug-response records.
    if not isinstance(response_node, BNode):
        continue

    raw_value = named_graph.value(subject=response_node, predicate=HAS_VALUE_INPUT)
    subject_uri = named_graph.value(subject=response_node, predicate=HAS_SUBJECT)

    # Without a literal free-text value there is nothing to parse.
    if not isinstance(raw_value, Literal):
        continue

    enriched_graph.add((response_node, RDF.type, CLASS_IRI))
    enriched_graph.add((ncbi_taxon, predicate, response_node))
    enriched_graph.add((ncbi_taxon, RDF.type, TAXON_CLASS))
    enriched_graph.add((response_node, RAW_TEXT, raw_value))  # write with canonical IRI

    if subject_uri:
        enriched_graph.add((response_node, HAS_SUBJECT, subject_uri))

    for entry in parse_drug_response(str(raw_value)):
        # One detail node and one compound node per parsed entry.
        detail_node = BNode()
        enriched_graph.add((detail_node, RDF.type, DETAILS_CLASS))
        enriched_graph.add((response_node, HAS_DETAILS, detail_node))

        compound_bnode = BNode()
        enriched_graph.add((compound_bnode, RDF.type, COMPOUND_CLASS))
        enriched_graph.add((detail_node, ABOUT_COMPOUND, compound_bnode))

        normalized_label = normalize_text(entry["label"])
        matches = cached_annotate(normalized_label, annotator)
        best_matches = filter_and_label_longest_annotations(matches, oak_adapter)

        if best_matches:
            # Several annotations may tie for the longest span; the first is used.
            match = best_matches[0]
            chebi_curie = match.object_id
            chebi_label = cached_label(chebi_curie, oak_adapter)
            chebi_iri = URIRef(f"http://purl.obolibrary.org/obo/{chebi_curie.replace(':', '_')}")
            # The adapter may return no label for a matched CURIE; fall back to
            # the source text rather than building a Literal from None.
            enriched_graph.add((compound_bnode, RDFS.label, Literal(chebi_label or normalized_label)))
            enriched_graph.add((compound_bnode, OWL.sameAs, chebi_iri))
        else:
            enriched_graph.add((compound_bnode, RDFS.label, Literal(normalized_label)))

        # Always keep the string the compound was recognised from.
        enriched_graph.add((compound_bnode, SOURCE_STRING, Literal(normalized_label)))

        if "details" in entry:
            normalized_detail = normalize_text(entry["details"])
            enriched_graph.add((detail_node, ASSAY_DETAILS, Literal(normalized_detail)))


Processing DrugResponseProperty triples:   0%|          | 0/2916 [00:00<?, ?it/s]

In [None]:
# Write the enriched graph to output_file in Turtle format, then report the
# destination path.
enriched_graph.serialize(destination=output_file, format="turtle")
print(f"Serialized RDF with enrichment to: {output_file}")

In [None]:
# annotations = list(annotator.annotate_text("vancomycin"))

In [None]:
# import pprint

In [None]:
# pprint.pprint(annotations)

In [None]:
# oak_adapter.label("CHEBI:16414")