In [1]:
import os
import pickle
import logging
import typing as t
import pandas as pd

import networkx as nx
import owlready2
from smart_open import open

from cellarium.ml.utilities.inference.metadata_benchmarking.utils import \
    build_nx_graph_from_owl_ontology, \
    get_all_ancestors

from cellarium.ml.utilities.inference.metadata_benchmarking.build_resources import \
    build_ontology_ancestor_dictionary, \
    build_benchmarking_ontology_dictionary_resource

logging.basicConfig(level=logging.INFO)

In [2]:
# Root of the local data directory. Set the CELLARIUM_DATA_ROOT environment
# variable to run this notebook on another machine; when it is unset (or empty)
# the original author's path is used, so existing runs are unaffected.
ROOT_PATH = os.environ.get("CELLARIUM_DATA_ROOT") or "/home/mehrtash/data"
# Directory that holds the datastore term CSVs read by this notebook and that
# receives the generated ontology resource pickles.
ONTOLOGY_ROOT_PATH = os.path.join(ROOT_PATH, "data", "cellariumgpt_artifacts", "ontology")

In [3]:
PARTOF_RELATIONSHIP = "BFO_0000050"  # part_of

def build_nx_graph_from_owl_ontology_legacy(
    owl_ontology: owlready2.Ontology,
    owl_classes: t.List[owlready2.ThingClass],
    prefix: str,
    extra_nodes: t.Optional[t.List[str]] = None,
) -> nx.DiGraph:
    """Build a directed graph whose nodes are the given OWL classes.

    Node ids are the class names with every ``_`` replaced by ``:``. For each
    class in ``owl_classes`` the following edges are added, provided the other
    endpoint is also one of ``owl_classes``:

    * ``parent -> class`` for each class from ``owl_ontology.get_parents_of``;
    * ``class -> child`` for each class from ``owl_ontology.get_children_of``;
    * ``related -> class`` for each value of a class property whose name
      contains ``PARTOF_RELATIONSHIP`` and whose own name starts with ``prefix``;
    * ``substitute -> class`` when the class has a property named
      ``deprecated``: for each value of a property whose name contains
      ``consider`` and whose string form starts with ``prefix``.

    Args:
        owl_ontology: Ontology queried for parents and children.
        owl_classes: Classes that become nodes; edges never leave this set.
        prefix: Required prefix of related / substitute terms (the callers in
            this notebook pass e.g. ``"MONDO_"``).
        extra_nodes: Optional node ids added to the graph as-is, without edges.

    Returns:
        The populated ``nx.DiGraph`` (named ``"OWL graph"``).
    """

    def _term_id(raw_name: str) -> str:
        # OWL class names use "_" where the graph's node ids use ":".
        return raw_name.replace("_", ":")

    graph = nx.DiGraph(name="OWL graph")

    if extra_nodes is not None:
        for extra_node in extra_nodes:
            graph.add_node(extra_node)

    term_ids = [_term_id(owl_class.name) for owl_class in owl_classes]
    known_term_ids = set(term_ids)
    known_classes = set(owl_classes)

    # Register every node up front so classes without edges are still present.
    for term_id in term_ids:
        graph.add_node(term_id)

    for owl_class, class_id in zip(owl_classes, term_ids):
        # is-a hierarchy, seen from both directions
        for parent in owl_ontology.get_parents_of(owl_class):
            if parent in known_classes:
                graph.add_edge(_term_id(parent.name), class_id)
        for child in owl_ontology.get_children_of(owl_class):
            if child in known_classes:
                graph.add_edge(class_id, _term_id(child.name))

        class_properties = owl_class.get_class_properties()

        # part_of relations
        for class_property in class_properties:
            if PARTOF_RELATIONSHIP not in class_property.name:
                continue
            for related in class_property[owl_class]:
                if not related.name.startswith(prefix):
                    continue
                related_id = _term_id(related.name)
                if related_id in known_term_ids:
                    graph.add_edge(related_id, class_id)

        # Deprecated terms: link each "consider" substitute to the deprecated class.
        # NOTE(review): the original author marked this branch "WHY???!!"; the
        # rationale is not visible here, so the behaviour is kept unchanged.
        if "deprecated" not in {class_property.name for class_property in class_properties}:
            continue
        for class_property in class_properties:
            if "consider" not in class_property.name:
                continue
            for substitute in class_property[owl_class]:
                substitute_text = str(substitute)
                if not substitute_text.startswith(prefix):
                    continue
                substitute_id = _term_id(substitute_text)
                if substitute_id in known_term_ids:
                    graph.add_edge(substitute_id, class_id)

    return graph

### Cell ontology resources

In [18]:
# Configuration for the Cell Ontology (CL) section.

# Pinned CL release (v2024-01-04) that is loaded with owlready2.
cl_ontology_owl_file_url = "https://github.com/obophenotype/cell-ontology/releases/download/v2024-01-04/cl.owl"
# CSV listing the CL term ids present in the datastore.
cl_datastore_terms_csv = os.path.join(ONTOLOGY_ROOT_PATH, "datastore_cl_terms.csv")

# OWL class names must start with this prefix to be kept.
cl_prefix = "CL_"
# Passed as `n_hops` to build_benchmarking_ontology_dictionary_resource.
# NOTE: this global is re-assigned (to the same value) by every ontology section.
n_hops = 4

# Output pickles: propagation resource and benchmarking resource.
cl_propagation_resource_file_path = os.path.join(ONTOLOGY_ROOT_PATH, "cl_propagation_resource.pkl")
cl_benchmarking_resource_file_path = os.path.join(ONTOLOGY_ROOT_PATH, "cl_benchmarking_resource.pkl")

In [19]:
# Read the datastore term table and keep the distinct values of its
# `cell_type_ontology_term_id` column; these are the terms whose ancestors
# define the filtered CL graph.
cl_datastore_terms_df = pd.read_csv(cl_datastore_terms_csv)
cl_datastore_terms_set = set(cl_datastore_terms_df['cell_type_ontology_term_id'].values)
print(f"Number of CL terms in datastore: {len(cl_datastore_terms_set)}")

Number of CL terms in datastore: 804


In [20]:
logging.info("Loading ontology ...")

owl_ontology = owlready2.get_ontology(cl_ontology_owl_file_url).load()

# Candidate classes: CL-prefixed and carrying exactly one label.
owl_classes = [
    owl_class
    for owl_class in owl_ontology.classes()
    if owl_class.name.startswith(cl_prefix) and len(owl_class.label) == 1
]

logging.info(f"All classes: {len(owl_classes)}")

# First pass: graph over every candidate class.
owl_graph = build_nx_graph_from_owl_ontology(owl_ontology=owl_ontology, owl_classes=owl_classes)

# Union of what get_all_ancestors returns for each datastore term.
filtered_owl_names_set = set().union(
    *(get_all_ancestors(owl_graph, datastore_term) for datastore_term in cl_datastore_terms_set)
)

logging.info(f"Filtered classes: {len(filtered_owl_names_set)}")

# Restrict the classes to that union ...
owl_classes = [
    owl_class
    for owl_class in owl_classes
    if owl_class.name.replace("_", ":") in filtered_owl_names_set
]

# ... and rebuild the graph over the restricted set (second pass).
owl_graph = build_nx_graph_from_owl_ontology(owl_ontology=owl_ontology, owl_classes=owl_classes)

owl_names = []
owl_labels = []
for owl_class in owl_classes:
    owl_names.append(owl_class.name.replace("_", ":"))
    owl_labels.append(owl_class.label[0])

# For CL, distinct labels and distinct names must match in number.
if len(set(owl_labels)) != len(set(owl_names)):
    raise ValueError("Number of unique labels doesn't correspond to number of unique names")

# Lookup tables consumed by the resource-building cells.
owl_names_to_labels_map = dict(zip(owl_names, owl_labels))
owl_names_to_idx_map = {owl_name: position for position, owl_name in enumerate(owl_names)}

INFO:root:Loading ontology ...
INFO:root:All classes: 2914
INFO:root:Building nx graph from OWL ontology...
INFO:root:Filtered classes: 1026
INFO:root:Building nx graph from OWL ontology...


In [21]:
# Build and persist the CL propagation resource.
# NOTE: `owl_graph`, `owl_names` and the two `owl_names_to_*` maps are globals
# that every ontology section rebinds; run this cell only right after the CL
# build cell, otherwise it silently uses another ontology's graph.
logging.info("Generating cell type ontology propagation resource from CL ontology...")
owl_ancestors_dictionary = build_ontology_ancestor_dictionary(
    owl_graph=owl_graph,
    owl_names=owl_names,
    owl_names_to_idx_map=owl_names_to_idx_map
)
# Resource layout: the ancestor dictionary plus a term-id -> label map.
cell_ontology_resource = {
    "ancestors_dictionary": owl_ancestors_dictionary,
    "ontology_term_id_to_label": owl_names_to_labels_map,
}

logging.info(f"Writing output file to {cl_propagation_resource_file_path}")
# `open` is smart_open.open (imported at the top of the notebook), not the builtin.
with open(cl_propagation_resource_file_path, "wb") as output_file:
    pickle.dump(cell_ontology_resource, output_file)

INFO:root:Generating cell type ontology propagation resource from CL ontology...
INFO:root:Building cell ontology ancestor dictionary...
INFO:root:Writing output file to /home/mehrtash/data/data/cellariumgpt_artifacts/ontology/cl_propagation_resource.pkl


In [22]:
# Build and persist the CL benchmarking resource.
# NOTE: relies on the `owl_graph` / `owl_names` globals left by the CL build
# cell; `ontology_resource_dict` is overwritten by every section's
# benchmarking cell.
logging.info("Generating cell type ontology benchmarking resource from CL ontology...")
ontology_resource_dict = build_benchmarking_ontology_dictionary_resource(
    owl_graph=owl_graph,
    owl_names=owl_names,
    n_hops=n_hops
)

logging.info(f"Writing output file to {cl_benchmarking_resource_file_path}")

# `open` is smart_open.open (imported at the top of the notebook), not the builtin.
with open(cl_benchmarking_resource_file_path, "wb") as output_file:
    pickle.dump(ontology_resource_dict, output_file)

INFO:root:Generating cell type ontology benchmarking resource from CL ontology...
INFO:root:Writing output file to /home/mehrtash/data/data/cellariumgpt_artifacts/ontology/cl_benchmarking_resource.pkl


### Development stage resources

In [23]:
# Configuration for the human developmental stage (HsapDv) section.

# NOTE(review): unlike the CL / MONDO / UBERON URLs this one carries no release
# tag, so the ontology it resolves to may change between runs - consider pinning.
hsapdv_ontology_owl_file_url = "http://purl.obolibrary.org/obo/hsapdv.owl"
# CSV listing the development-stage term ids present in the datastore.
hsapdv_datastore_terms_csv = os.path.join(ONTOLOGY_ROOT_PATH, "datastore_hsapdv_terms.csv")

# OWL class names must start with this prefix to be kept.
hsapdv_prefix = "HsapDv_"
# Passed as `n_hops` to build_benchmarking_ontology_dictionary_resource.
n_hops = 4

# Output pickles: propagation resource and benchmarking resource.
hsapdv_propagation_resource_file_path = os.path.join(ONTOLOGY_ROOT_PATH, "hsapdv_propagation_resource.pkl")
hsapdv_benchmarking_resource_file_path = os.path.join(ONTOLOGY_ROOT_PATH, "hsapdv_benchmarking_resource.pkl")

In [24]:
# Read the datastore term table and keep the distinct values of its
# `development_stage_ontology_term_id` column. In the recorded run a couple of
# these ids carry an UBERON prefix; the build cell reports and skips them.
hsapdv_datastore_terms_df = pd.read_csv(hsapdv_datastore_terms_csv)
hsapdv_datastore_terms_set = set(hsapdv_datastore_terms_df['development_stage_ontology_term_id'].values)
print(f"Number of HsapDv terms in datastore: {len(hsapdv_datastore_terms_set)}")

Number of HsapDv terms in datastore: 175


In [32]:
logging.info("Loading ontology ...")

owl_ontology = owlready2.get_ontology(hsapdv_ontology_owl_file_url).load()

# Candidate classes: HsapDv-prefixed and carrying exactly one label.
owl_classes = [
    _class for _class in owl_ontology.classes()
    if _class.name.startswith(hsapdv_prefix)
    if len(_class.label) == 1
]

logging.info(f"All classes: {len(owl_classes)}")

# build the full graph
owl_graph = build_nx_graph_from_owl_ontology_legacy(
    owl_ontology=owl_ontology,
    owl_classes=owl_classes,
    prefix=hsapdv_prefix)

# get all ancestors for datastore nodes
filtered_owl_names_set = set()
for owl_name in hsapdv_datastore_terms_set:
    # Datastore terms that are not nodes of the graph (e.g. ids from another
    # ontology) are reported and skipped. An explicit membership test replaces
    # the former bare `except:`, which also hid unrelated errors and
    # KeyboardInterrupt behind the same "could not find" message.
    if owl_name not in owl_graph:
        print(f"Could not find {owl_name} in ontology, ignoring.")
        continue
    filtered_owl_names_set.update(get_all_ancestors(owl_graph, owl_name))

logging.info(f"Filtered classes: {len(filtered_owl_names_set)}")

# Restrict the classes to the collected names ...
owl_classes = [
    _class for _class in owl_classes
    if _class.name.replace("_", ":") in filtered_owl_names_set
]

# build the filtered graph
owl_graph = build_nx_graph_from_owl_ontology_legacy(
    owl_ontology=owl_ontology,
    owl_classes=owl_classes,
    prefix=hsapdv_prefix)

owl_names = [_class.name.replace("_", ":") for _class in owl_classes]
owl_labels = [_class.label[0] for _class in owl_classes]

# assert that names are unique, labels do not have to be unique
assert len(set(owl_names)) == len(owl_classes)

# Lookup tables consumed by the resource-building cells.
owl_names_to_labels_map = {name: label for name, label in zip(owl_names, owl_labels)}
owl_names_to_idx_map = {name: idx for idx, name in enumerate(owl_names)}

INFO:root:Loading ontology ...
INFO:root:All classes: 260
INFO:root:Filtered classes: 194


Could not find UBERON:0018241 in ontology, ignoring.
Could not find UBERON:0000113 in ontology, ignoring.


In [33]:
# Build and persist the HsapDv propagation resource.
# NOTE: `owl_graph`, `owl_names` and the two `owl_names_to_*` maps are globals
# that every ontology section rebinds; run this cell only right after the
# HsapDv build cell.
owl_ancestors_dictionary = build_ontology_ancestor_dictionary(
    owl_graph=owl_graph,
    owl_names=owl_names,
    owl_names_to_idx_map=owl_names_to_idx_map
)
# Resource layout: the ancestor dictionary plus a term-id -> label map.
hsapdv_ontology_resource = {
    "ancestors_dictionary": owl_ancestors_dictionary,
    "ontology_term_id_to_label": owl_names_to_labels_map,
}

logging.info(f"Writing output file to {hsapdv_propagation_resource_file_path}")
# `open` is smart_open.open (imported at the top of the notebook), not the builtin.
with open(hsapdv_propagation_resource_file_path, "wb") as output_file:
    pickle.dump(hsapdv_ontology_resource, output_file)

INFO:root:Building cell ontology ancestor dictionary...
INFO:root:Writing output file to /home/mehrtash/data/data/cellariumgpt_artifacts/ontology/hsapdv_propagation_resource.pkl


In [34]:
# Build and persist the HsapDv benchmarking resource.
# NOTE: relies on the `owl_graph` / `owl_names` globals left by the HsapDv
# build cell; `ontology_resource_dict` is overwritten by every section's
# benchmarking cell.
logging.info("Generating ontology benchmarking resource from HsapDv ontology...")
ontology_resource_dict = build_benchmarking_ontology_dictionary_resource(
    owl_graph=owl_graph,
    owl_names=owl_names,
    n_hops=n_hops
)

logging.info(f"Writing output file to {hsapdv_benchmarking_resource_file_path}")

# `open` is smart_open.open (imported at the top of the notebook), not the builtin.
with open(hsapdv_benchmarking_resource_file_path, "wb") as output_file:
    pickle.dump(ontology_resource_dict, output_file)

INFO:root:Generating ontology benchmarking resource from HsapDv ontology...
INFO:root:Writing output file to /home/mehrtash/data/data/cellariumgpt_artifacts/ontology/hsapdv_benchmarking_resource.pkl


In [37]:
n_print = 2000

# Sanity check: print each term's label followed by the tab-indented labels of
# its ancestors, stopping once n_print terms have been shown.
term_labels = hsapdv_ontology_resource["ontology_term_id_to_label"]
for i_print, (term_id, ancestor_ids) in enumerate(
    hsapdv_ontology_resource["ancestors_dictionary"].items(), start=1
):
    print(term_labels[term_id])
    for ancestor_id in ancestor_ids:
        print("\t", term_labels[ancestor_id])
    if i_print == n_print:
        break

life cycle stage
life cycle
	 life cycle stage
embryonic stage
	 life cycle stage
	 life cycle
	 prenatal stage
prenatal stage
	 life cycle stage
	 life cycle
organogenesis stage
	 life cycle stage
	 life cycle
	 embryonic stage
	 prenatal stage
Carnegie stage 12
	 life cycle stage
	 life cycle
	 embryonic stage
	 prenatal stage
	 organogenesis stage
Carnegie stage 13
	 life cycle stage
	 life cycle
	 embryonic stage
	 prenatal stage
	 organogenesis stage
Carnegie stage 14
	 life cycle stage
	 life cycle
	 embryonic stage
	 prenatal stage
	 organogenesis stage
Carnegie stage 16
	 life cycle stage
	 life cycle
	 embryonic stage
	 prenatal stage
	 organogenesis stage
Carnegie stage 17
	 life cycle stage
	 life cycle
	 embryonic stage
	 prenatal stage
	 organogenesis stage
Carnegie stage 18
	 life cycle stage
	 life cycle
	 embryonic stage
	 prenatal stage
	 organogenesis stage
Carnegie stage 19
	 life cycle stage
	 life cycle
	 embryonic stage
	 prenatal stage
	 organogenesis stage
Carne

### Disease resources

In [38]:
# Configuration for the disease (MONDO) section.

# Pinned MONDO release (v2024-01-03) that is loaded with owlready2.
mondo_ontology_owl_file_url = "https://github.com/monarch-initiative/mondo/releases/download/v2024-01-03/mondo.owl"
# CSV listing the disease term ids present in the datastore.
mondo_datastore_terms_csv = os.path.join(ONTOLOGY_ROOT_PATH, "datastore_mondo_terms.csv")

# OWL class names must start with this prefix to be kept.
mondo_prefix = "MONDO_"
# Passed as `n_hops` to build_benchmarking_ontology_dictionary_resource.
n_hops = 4

# Output pickles: propagation resource and benchmarking resource.
mondo_propagation_resource_file_path = os.path.join(ONTOLOGY_ROOT_PATH, "mondo_propagation_resource.pkl")
mondo_benchmarking_resource_file_path = os.path.join(ONTOLOGY_ROOT_PATH, "mondo_benchmarking_resource.pkl")

In [39]:
# Read the datastore term table and keep the distinct values of its
# `disease_ontology_term_id` column; these are the terms whose ancestors
# define the filtered MONDO graph.
mondo_datastore_terms_df = pd.read_csv(mondo_datastore_terms_csv)
mondo_datastore_terms_set = set(mondo_datastore_terms_df['disease_ontology_term_id'].values)
print(f"Number of MONDO terms in datastore: {len(mondo_datastore_terms_set)}")

Number of MONDO terms in datastore: 110


In [51]:
logging.info("Loading ontology ...")

owl_ontology = owlready2.get_ontology(mondo_ontology_owl_file_url).load()

# Candidate classes: MONDO-prefixed and carrying exactly one label.
owl_classes = [
    _class for _class in owl_ontology.classes()
    if _class.name.startswith(mondo_prefix)
    if len(_class.label) == 1
]

logging.info(f"All classes: {len(owl_classes)}")

# "normal" carries a PATO id, so it can never pass the MONDO prefix filter
# above; it is injected into the graph as an edge-less extra node and prepended
# to the name / label lists below. Named once here instead of repeating the
# literals.
normal_term_id = "PATO:0000461"
normal_term_label = "normal"

# build the full graph
owl_graph = build_nx_graph_from_owl_ontology_legacy(
    owl_ontology=owl_ontology,
    owl_classes=owl_classes,
    prefix=mondo_prefix,
    extra_nodes=[normal_term_id],
)

# get all ancestors for datastore nodes
filtered_owl_names_set = set()
for owl_name in mondo_datastore_terms_set:
    # Datastore terms that are not nodes of the graph are reported and skipped.
    # An explicit membership test replaces the former bare `except:`, which
    # also hid unrelated errors and KeyboardInterrupt behind the same message.
    if owl_name not in owl_graph:
        print(f"Could not find {owl_name} in ontology, ignoring.")
        continue
    filtered_owl_names_set.update(get_all_ancestors(owl_graph, owl_name))

logging.info(f"Filtered classes: {len(filtered_owl_names_set)}")

# Restrict the classes to the collected names ...
owl_classes = [
    _class for _class in owl_classes
    if _class.name.replace("_", ":") in filtered_owl_names_set
]

# build the filtered graph
owl_graph = build_nx_graph_from_owl_ontology_legacy(
    owl_ontology=owl_ontology,
    owl_classes=owl_classes,
    prefix=mondo_prefix,
    extra_nodes=[normal_term_id],
)

# "normal" is always first, i.e. it gets index 0 in owl_names_to_idx_map.
owl_names = [normal_term_id] + [_class.name.replace("_", ":") for _class in owl_classes]
owl_labels = [normal_term_label] + [_class.label[0] for _class in owl_classes]

# Lookup tables consumed by the resource-building cells.
owl_names_to_labels_map = {name: label for name, label in zip(owl_names, owl_labels)}
owl_names_to_idx_map = {name: idx for idx, name in enumerate(owl_names)}

INFO:root:Loading ontology ...
INFO:root:All classes: 27656
INFO:root:Filtered classes: 397


In [52]:
# Build and persist the MONDO propagation resource.
# NOTE: `owl_graph`, `owl_names` and the two `owl_names_to_*` maps are globals
# that every ontology section rebinds; run this cell only right after the
# MONDO build cell.
owl_ancestors_dictionary = build_ontology_ancestor_dictionary(
    owl_graph=owl_graph, owl_names=owl_names, owl_names_to_idx_map=owl_names_to_idx_map
)
# Resource layout: the ancestor dictionary plus a term-id -> label map.
mondo_ontology_resource = {
    "ancestors_dictionary": owl_ancestors_dictionary,
    "ontology_term_id_to_label": owl_names_to_labels_map,
}

logging.info(f"Writing output file to {mondo_propagation_resource_file_path}")
# `open` is smart_open.open (imported at the top of the notebook), not the builtin.
with open(mondo_propagation_resource_file_path, "wb") as output_file:
    pickle.dump(mondo_ontology_resource, output_file)

INFO:root:Building cell ontology ancestor dictionary...
INFO:root:Writing output file to /home/mehrtash/data/data/cellariumgpt_artifacts/ontology/mondo_propagation_resource.pkl


In [53]:
# Build and persist the MONDO benchmarking resource.
# NOTE: relies on the `owl_graph` / `owl_names` globals left by the MONDO build
# cell; `ontology_resource_dict` is overwritten by every section's
# benchmarking cell.
logging.info("Generating ontology benchmarking resource from MONDO ontology...")
ontology_resource_dict = build_benchmarking_ontology_dictionary_resource(
    owl_graph=owl_graph, owl_names=owl_names, n_hops=n_hops
)

logging.info(f"Writing output file to {mondo_benchmarking_resource_file_path}")

# `open` is smart_open.open (imported at the top of the notebook), not the builtin.
with open(mondo_benchmarking_resource_file_path, "wb") as output_file:
    pickle.dump(ontology_resource_dict, output_file)

INFO:root:Generating ontology benchmarking resource from MONDO ontology...
INFO:root:Writing output file to /home/mehrtash/data/data/cellariumgpt_artifacts/ontology/mondo_benchmarking_resource.pkl


In [54]:
n_print = 2000

# Sanity check: print each term's label followed by the tab-indented labels of
# its ancestors, stopping once n_print terms have been shown.
term_labels = mondo_ontology_resource["ontology_term_id_to_label"]
for i_print, (term_id, ancestor_ids) in enumerate(
    mondo_ontology_resource["ancestors_dictionary"].items(), start=1
):
    print(term_labels[term_id])
    for ancestor_id in ancestor_ids:
        print("\t", term_labels[ancestor_id])
    if i_print == n_print:
        break

normal
disease
injury
hereditary disease
	 disease
	 human disease
familial partial epilepsy
	 disease
	 hereditary disease
	 brain disorder
	 monogenic epilepsy
	 epilepsy
	 central nervous system disorder
	 nervous system disorder
	 human disease
	 focal epilepsy
	 epilepsy syndrome
	 childhood-onset epilepsy syndrome
	 adolescent-onset epilepsy syndrome
motor neuron disorder
	 disease
	 neurodegenerative disease
	 central nervous system disorder
	 neuromuscular disease
	 nervous system disorder
	 human disease
reproductive system disorder
	 disease
	 human disease
congenital nervous system disorder
	 disease
	 nervous system disorder
	 human disease
disorder of development or morphogenesis
	 disease
	 human disease
otorhinolaryngologic disease
	 disease
	 human disease
congenital heart disease
	 disease
	 disorder of development or morphogenesis
	 heart disorder
	 cardiovascular disorder
	 human disease
	 congenital anomaly of cardiovascular system
metabolic disease
	 disease
	 huma

### Tissue resources

In [4]:
# Configuration for the tissue (UBERON) section.

# Pinned UBERON release (v2024-01-18) that is loaded with owlready2.
uberon_ontology_owl_file_url = "https://github.com/obophenotype/uberon/releases/download/v2024-01-18/uberon.owl"
# CSV listing the tissue term ids present in the datastore.
uberon_datastore_terms_csv = os.path.join(ONTOLOGY_ROOT_PATH, "datastore_uberon_terms.csv")

# OWL class names must start with this prefix to be kept.
uberon_prefix = "UBERON_"
# Passed as `n_hops` to build_benchmarking_ontology_dictionary_resource.
n_hops = 4

# Output pickles: propagation resource and benchmarking resource.
uberon_propagation_resource_file_path = os.path.join(ONTOLOGY_ROOT_PATH, "uberon_propagation_resource.pkl")
uberon_benchmarking_resource_file_path = os.path.join(ONTOLOGY_ROOT_PATH, "uberon_benchmarking_resource.pkl")

In [5]:
# Read the datastore term table and keep the distinct values of its
# `tissue_ontology_term_id` column. In the recorded run one of these ids
# carries a CL prefix; the filtering cell reports and skips it.
uberon_datastore_terms_df = pd.read_csv(uberon_datastore_terms_csv)
uberon_datastore_terms_set = set(uberon_datastore_terms_df['tissue_ontology_term_id'].values)
print(f"Number of UBERON terms in datastore: {len(uberon_datastore_terms_set)}")

Number of UBERON terms in datastore: 312


In [7]:
logging.info("Loading ontology ...")

owl_ontology = owlready2.get_ontology(uberon_ontology_owl_file_url).load()

# Candidate classes: UBERON-prefixed and carrying exactly one label.
owl_classes = []
for candidate in owl_ontology.classes():
    if candidate.name.startswith(uberon_prefix) and len(candidate.label) == 1:
        owl_classes.append(candidate)

logging.info(f"All classes: {len(owl_classes)}")

# First pass: graph over every candidate class.
owl_graph = build_nx_graph_from_owl_ontology_legacy(
    owl_ontology=owl_ontology, owl_classes=owl_classes, prefix=uberon_prefix
)

INFO:root:Loading ontology ...
INFO:root:All classes: 15567


In [8]:
# get all ancestors for datastore nodes
filtered_owl_names_set = set()
for owl_name in uberon_datastore_terms_set:
    # Datastore terms that are not nodes of the graph (e.g. ids from another
    # ontology) are reported and skipped. An explicit membership test replaces
    # the former bare `except:`, which also hid unrelated errors and
    # KeyboardInterrupt behind the same "could not find" message.
    if owl_name not in owl_graph:
        logging.info(f"Could not find {owl_name} in ontology, ignoring.")
        continue
    filtered_owl_names_set.update(get_all_ancestors(owl_graph, owl_name))

logging.info(f"Filtered classes: {len(filtered_owl_names_set)}")

# Restrict the classes to the collected names ...
owl_classes = [
    _class for _class in owl_classes
    if _class.name.replace("_", ":") in filtered_owl_names_set
]

# build the filtered graph
owl_graph = build_nx_graph_from_owl_ontology_legacy(
    owl_ontology=owl_ontology,
    owl_classes=owl_classes,
    prefix=uberon_prefix)

owl_names = [_class.name.replace("_", ":") for _class in owl_classes]
owl_labels = [_class.label[0] for _class in owl_classes]

# assert that names are unique, labels do not have to be unique
assert len(set(owl_names)) == len(owl_classes)

# Lookup tables consumed by the resource-building cells.
owl_names_to_labels_map = {name: label for name, label in zip(owl_names, owl_labels)}
owl_names_to_idx_map = {name: idx for idx, name in enumerate(owl_names)}

INFO:root:Could not find CL:0002322 in ontology, ignoring.
INFO:root:Filtered classes: 891


In [9]:
# Build and persist the UBERON propagation resource.
# NOTE: `owl_graph`, `owl_names` and the two `owl_names_to_*` maps are globals
# that every ontology section rebinds; run this cell only right after the
# UBERON filtering cell.
owl_ancestors_dictionary = build_ontology_ancestor_dictionary(
    owl_graph=owl_graph,
    owl_names=owl_names,
    owl_names_to_idx_map=owl_names_to_idx_map
)
# Resource layout: the ancestor dictionary plus a term-id -> label map.
uberon_ontology_resource = {
    "ancestors_dictionary": owl_ancestors_dictionary,
    "ontology_term_id_to_label": owl_names_to_labels_map,
}

logging.info(f"Writing output file to {uberon_propagation_resource_file_path}")
# `open` is smart_open.open (imported at the top of the notebook), not the builtin.
with open(uberon_propagation_resource_file_path, "wb") as output_file:
    pickle.dump(uberon_ontology_resource, output_file)

INFO:root:Building cell ontology ancestor dictionary...
INFO:root:Writing output file to /home/mehrtash/data/data/cellariumgpt_artifacts/ontology/uberon_propagation_resource.pkl


In [10]:
logging.info("Generating ontology benchmarking resource from UBERON ontology...")
ontology_resource_dict = build_benchmarking_ontology_dictionary_resource(
    owl_graph=owl_graph,
    owl_names=owl_names,
    n_hops=n_hops
)

logging.info(f"Writing output file to {uberon_benchmarking_resource_file_path}")

with open(uberon_benchmarking_resource_file_path, "wb") as output_file:
    pickle.dump(ontology_resource_dict, output_file)

INFO:root:Generating ontology benchmarking resource from UBERON ontology...


INFO:root:Writing output file to /home/mehrtash/data/data/cellariumgpt_artifacts/ontology/uberon_benchmarking_resource.pkl


In [11]:
n_print = 2000

# Sanity check: print each term's label followed by the tab-indented labels of
# its ancestors, stopping once n_print terms have been shown.
term_labels = uberon_ontology_resource["ontology_term_id_to_label"]
for i_print, (term_id, ancestor_ids) in enumerate(
    uberon_ontology_resource["ancestors_dictionary"].items(), start=1
):
    print(term_labels[term_id])
    for ancestor_id in ancestor_ids:
        print("\t", term_labels[ancestor_id])
    if i_print == n_print:
        break

anatomical entity
rib
	 anatomical entity
	 anatomical structure
	 material anatomical entity
	 multicellular anatomical structure
	 bone element
	 skeletal system
	 endochondral bone
	 multicellular organism
	 organ
	 anatomical system
	 organ system subdivision
	 musculoskeletal system
	 subdivision of skeletal system
	 skeletal element
	 postcranial axial skeletal system
	 endochondral element
	 rib skeletal system
	 rib endochondral element
appendage
	 anatomical entity
	 anatomical structure
	 material anatomical entity
	 multicellular anatomical structure
	 organism subdivision
	 multicellular organism
subdivision of organism along appendicular axis
	 anatomical entity
	 appendage
	 anatomical structure
	 material anatomical entity
	 multicellular anatomical structure
	 organism subdivision
	 multicellular organism
anatomical structure
	 anatomical entity
	 material anatomical entity
material anatomical entity
	 anatomical entity
artery
	 anatomical entity
	 anatomical structure
