# Data Preparation
***
**Author**: [Chris Li](mailto:chris@biobox.io)
<br>

**Purpose**: This notebook serves as a script to download files and prepare data assets for loading into the BioBox knowledge graph. For more information about the data sources, please see [Data Sources](https://docs.biobox.io/biobox/data_sources)

## Environment
***

In [1]:
# Uncomment the following lines to install required modules
# import sys
# !{sys.executable} -m pip install -r requirements.txt

In [1]:
import pandas as pd
import numpy as np
import os
import json
import gzip
import sys
from tqdm import tqdm
import requests
from biobox_analytics.utils import ensure_primitive_or_array_of_primitives
import pronto
import rdflib

In [2]:
# Set up environment configurations
processed_data_directory = "../resources/processed_data"
tmp_directory = "../resources/tmp_data"
ontologies_directory = "../resources/ontologies"
os.makedirs(ontologies_directory, exist_ok=True)
os.makedirs(processed_data_directory, exist_ok=True)
os.makedirs(tmp_directory, exist_ok=True)

***
# Clinical Trials

**Purpose**: The purpose of this section is to scape NIH clinical trial information into knowledge graph nodes.

In [3]:

clinical_trial_data = []

page_size = 500
url = 'https://clinicaltrials.gov/api/v2'

has_more = True
next_page_token = None

all_intervention_terms = set()
all_condition_terms = set()

def get_mesh_descriptor_id(term):
    res = requests.get(
        url='https://id.nlm.nih.gov/mesh/lookup/descriptor',
        params={
            'label': term,
            'match': 'exact',
            'year':'current',
            'limit': 1
        }
    )
    res.raise_for_status()
    data = res.json()
    if len(data) == 0:
        return None
    else:
        return data[0]['resource']

def format_clinical_trial(o):
    id_module = o['protocolSection'].get('identificationModule', {})
    status = o['protocolSection'].get('statusModule', {})
    description_module = o['protocolSection'].get('descriptionModule', {})
    design_module = o['protocolSection'].get('designModule', {})
    derived_section = o.get('derivedSection', {})
    intervention_module = o['protocolSection'].get('armsInterventionsModule', {})
    outcomes_module = o['protocolSection'].get('outcomesModule', {})

    condition_terms = []
    condition_mesh_ids = []

    if 'conditionBrowseModule' in derived_section:
        for c in derived_section['conditionBrowseModule'].get('meshes', []):
            term = c.get('term', None)
            condition_mesh_ids.append(c.get('id', None))
            if term is not None:
                condition_terms.append(term)
            #
            #     descriptor_id = get_mesh_descriptor_id(term)
            #     if descriptor_id is not None:
            #         condition_mesh_ids.append(descriptor_id)



    intervention_terms = []
    intervention_term_ids = []

    if 'interventionBrowseModule' in derived_section:
        for c in derived_section['interventionBrowseModule'].get('meshes', []):
            intervention_term = c.get('term', None)
            intervention_term_ids.append(c.get('id'))
            if intervention_term is not None:
                intervention_terms.append(c.get('term', None))
                # descriptor_id = get_mesh_descriptor_id(intervention_term)
                # if descriptor_id is not None:
                #     intervention_term_ids.append(descriptor_id)

    temp_properties = {
        'uuid': id_module.get('nctId', None),
        'NCT_ID': id_module.get('nctId', None),
        'displayName': id_module.get('nctId', None),
        'brief_title': id_module.get('briefTitle', None),
        'official_title': id_module.get('officialTitle', None),
        'overall_status': status.get('overallStatus', None),
        'termination_reason': status.get('whyStopped', None),
        'last_known_status': status.get('lastKnownStatus', None),
        'description': description_module.get('briefSummary', None),
        'study_type': design_module.get('studyType', None),
        'phase': design_module.get('phases', None),
        'study_arms': intervention_module.get('armGroup', None),
        'allocation': design_module.get('allocation', None),
        'intervention_model': design_module.get('interventionModel', None),
        'masking': design_module.get('masking', None),
        'primary_purpose': design_module.get('primaryPurpose', None),
        'intervention_type': intervention_module.get('interventionType', None),
        'intervention_name': intervention_module.get('interventionName', None),
        'primary_outcome_measures': outcomes_module.get('primaryOutcomeMeasure', None),
        'secondary_outcome_measures': outcomes_module.get('secondaryOutcomeMeasure', None),
        'condition_terms': condition_terms if condition_terms else None,
        'condition_mesh_ids': condition_mesh_ids if condition_mesh_ids else None,
        'intervention_terms': intervention_terms if intervention_terms else None,
        'intervention_term_ids': intervention_term_ids if intervention_term_ids else None,
        'doc_json': json.dumps(o)
    }

    # Ensure all values are primitives or arrays of primitives
    def ensure_primitive_or_array_of_primitives(value):
        if isinstance(value, (str, int, float, bool)) or value is None:
            return value
        elif isinstance(value, list):
            return [ensure_primitive_or_array_of_primitives(v) for v in value]
        else:
            return json.dumps(value)

    properties = {k: ensure_primitive_or_array_of_primitives(v) for k, v in temp_properties.items() if v is not None}

    all_intervention_terms.update([term for term in intervention_terms if term is not None])
    all_condition_terms.update([term for term in condition_terms if term is not None])

    return properties


In [None]:
while has_more:
    params = {'pageSize': page_size}
    if next_page_token:
        params['pageToken'] = next_page_token
    res = requests.get(url=url + '/studies', params=params)
    res.raise_for_status()
    data = res.json()
    next_page_token = data.get('nextPageToken', None)
    studies = data.get('studies', [])
    for s in studies:
        formatted = format_clinical_trial(s)
        if formatted is not None:
            clinical_trial_data.append(formatted)
    print(len(clinical_trial_data))
    if not next_page_token:
        has_more = False


In [5]:
len(all_condition_terms)

4348

In [6]:
len(all_intervention_terms)

3810

In [8]:
all_condition_map = {}
for c in tqdm(all_condition_terms):
    id = get_mesh_descriptor_id(c)
    if id is not None:
        all_condition_map[c] = id
all_intervention_map = {}
for c in tqdm(all_intervention_terms):
    id = get_mesh_descriptor_id(c)
    if id is not None:
        all_intervention_map[c] = id

100%|██████████| 4348/4348 [08:47<00:00,  8.25it/s]
100%|██████████| 3810/3810 [07:56<00:00,  8.00it/s]


## Start re-organizing

In [13]:
with open(os.path.join(tmp_directory, 'all_condition_map.json'), 'w') as outfile:
    json.dump(all_condition_map, outfile)

In [14]:
with open(os.path.join(tmp_directory, 'all_intervention_map.json'), 'w') as outfile:
    json.dump(all_intervention_map, outfile)

In [15]:
# scratch

clinical_trial_df = pd.DataFrame(clinical_trial_data)

In [18]:
clinical_trial_df.replace({np.nan: None}, inplace=True)

In [19]:
clinical_trial_df.head()

Unnamed: 0,uuid,NCT_ID,displayName,brief_title,official_title,overall_status,description,study_type,phase,condition_terms,condition_mesh_ids,doc_json,intervention_terms,intervention_term_ids,last_known_status,termination_reason
0,NCT05013879,NCT05013879,NCT05013879,Kinesiotape for Edema After Bilateral Total Kn...,"Effect of Kinesiotaping on Edema Management, P...",COMPLETED,The purpose of this study is to determine if k...,INTERVENTIONAL,[NA],[Edema],[D000004487],"{""protocolSection"": {""identificationModule"": {...",,,,
1,NCT00517179,NCT00517179,NCT00517179,Effect of Vardenafil on Blood Pressure in Pati...,Effect of Vardenafil on Blood Pressure in Pati...,COMPLETED,The purpose of this study is to investigate th...,INTERVENTIONAL,[NA],"[Erectile Dysfunction, Prostatic Hyperplasia, ...","[D000007172, D000011470, D000006965]","{""protocolSection"": {""identificationModule"": {...",[Vardenafil Dihydrochloride],[D000069058],,
2,NCT00812279,NCT00812279,NCT00812279,Investigate the Exposure to Selected Smoke Con...,"A Controlled, Randomised, Open-label, 3-arm Pa...",COMPLETED,The overall purpose of this clinical study con...,INTERVENTIONAL,[NA],,,"{""protocolSection"": {""identificationModule"": {...",,,,
3,NCT03878979,NCT03878979,NCT03878979,Preoperative Immune Checkpoint Inhibitor for P...,Preoperative Immune Checkpoint Inhibitor Thera...,COMPLETED,Nivolumab (also known as BMS-936558) before su...,INTERVENTIONAL,[PHASE2],"[Carcinoma, Carcinoma, Squamous Cell, Head and...","[D000002277, D000002294, D000006258, D000077195]","{""protocolSection"": {""identificationModule"": {...",[Nivolumab],[D000077594],,
4,NCT04175379,NCT04175379,NCT04175379,The Effect of Permissive Hypercapnia on Oxygen...,The Effect of Permissive Hypercapnia on Oxygen...,UNKNOWN,Permissive hypercapnia increased the survival ...,INTERVENTIONAL,[NA],[Hypercapnia],[D000006935],"{""protocolSection"": {""identificationModule"": {...",,,ENROLLING_BY_INVITATION,


In [23]:
clinical_trial_df['study_type'].value_counts()

study_type
INTERVENTIONAL     379423
OBSERVATIONAL      113337
EXPANDED_ACCESS       938
Name: count, dtype: int64

In [197]:
if not os.path.exists(os.path.join(processed_data_directory, 'CLINICAL_TRIAL_NODES.jsonl.gz')):
    with gzip.open(os.path.join(processed_data_directory, 'CLINICAL_TRIAL_NODES.jsonl.gz'), 'wt', encoding='utf-8') as outfile:
        for node in tqdm(clinical_trial_data):
            if 'study_type' not in node:
                continue
            if node['study_type'] != 'INTERVENTIONAL':
                continue
            tmp_properties = {k: ensure_primitive_or_array_of_primitives(v) for k, v in node.items() if v is not None}
            if 'intervention_terms' in tmp_properties:

                _remapped_ids = []
                _uris = []
                for x in tmp_properties['intervention_terms']:
                    uri = all_intervention_map.get(x, None)
                    if uri is None:
                        continue
                    else:
                        _remapped_id = uri.replace('http://id.nlm.nih.gov/mesh/', '')
                        _remapped_ids.append(_remapped_id)
                        _uris.append(uri)
                tmp_properties['intervention_uri_list'] = _uris


            if 'condition_terms' in tmp_properties:
                _remapped_ids = []
                _uris = []
                for x in tmp_properties['condition_terms']:
                    uri = all_condition_map.get(x, None)
                    if uri is None:
                        continue
                    else:
                        _remapped_id = uri.replace('http://id.nlm.nih.gov/mesh/', '')
                        _remapped_ids.append(_remapped_id)
                        _uris.append(uri)
                tmp_properties['condition_uri_list'] = _uris


            line = {
                '_id': node.get('uuid'),
                'labels': ['ClinicalTrial'],
                'properties': tmp_properties
            }
            outfile.write(json.dumps(line) + '\n')

100%|██████████| 494578/494578 [06:23<00:00, 1290.53it/s]


Since MESH terms are not directly supported in BioBox ontologies, we need load in the Experimental Factor Ontology and look for equivalences to convert between them.

In [3]:
efo = rdflib.Graph().parse("../resources/ontologies/efo.owl")

There are 3 possible properties that can match to the MESH value of interest, but they require small modifications as each mapping varies in its final representation.

In [4]:
def xref_property_mapping(mesh_uri):
    # Expects MESH URI to be of the form: http://id.nlm.nih.gov/mesh/D004487
    mesh_id = mesh_uri.replace("http://id.nlm.nih.gov/mesh/", "")
    hasDbXref_value_alternate= f"MSH:{mesh_id}"
    hasDbXref_value = f"MESH:{mesh_id}"
    exactMatch_value = mesh_uri.replace("http://id.nlm.nih.gov/mesh/", "http://identifiers.org/mesh/")

    property_sets = [
        ('oboInOwl:hasDbXref', hasDbXref_value),
        ('skos:exactMatch', exactMatch_value),
        ('mondo:exactMatch', exactMatch_value),
        ('oboInOwl:hasDbXref', hasDbXref_value_alternate)
    ]

    all_hits = []

    for s in property_sets:

        xsd_query = f"""
            PREFIX owl: <http://www.w3.org/2002/07/owl#>
            PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            PREFIX mondo: <http://purl.obolibrary.org/obo/mondo#>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

            SELECT DISTINCT ?class
            WHERE {{
              ?class a owl:Class ;
                    {s[0]} "{s[1]}"^^xsd:string .
            }}
            """
        xsd_results = efo.query(xsd_query)
        if len(xsd_results):
            all_hits.extend([str(result['class']) for result in xsd_results])


            efo_hits = [ x for x in all_hits if x.startswith('http://www.ebi.ac.uk/efo/')]
            if len(efo_hits):
                return efo_hits

        query = f"""
            PREFIX owl: <http://www.w3.org/2002/07/owl#>
            PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            PREFIX mondo: <http://purl.obolibrary.org/obo/mondo#>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

            SELECT DISTINCT ?class
            WHERE {{
              ?class a owl:Class ;
                    {s[0]} "{s[1]}" .
            }}
            """

        results = efo.query(query)
        if len(results):
            all_hits.extend([str(result['class']) for result in results])

            efo_hits = [ x for x in all_hits if x.startswith('http://www.ebi.ac.uk/efo/')]
            if len(efo_hits):
                return efo_hits
    return all_hits



def transform_uri_to_curie(uri):
    obo_short = uri.split('/').pop()
    if obo_short is not None:
        return obo_short.replace('_', ':')

def check_efo_internal_xref(xref_value):
    curie = transform_uri_to_curie(xref_value)
    # SPARQL query to find classes with a specific dbXref within the EFO namespace
    query = f"""
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
    PREFIX efo: <http://www.ebi.ac.uk/efo/>

    SELECT DISTINCT ?class
    WHERE {{
        ?class a owl:Class ;
               oboInOwl:hasDbXref "{curie}" .
        FILTER(STRSTARTS(STR(?class), STR(efo:)))
    }}
    """
    results = efo.query(query)
    return [str(result['class']) for result in results]

def get_xref_efo(mesh_uri):
    first_match_list = xref_property_mapping(mesh_uri)
    efo_ns = rdflib.Namespace("http://www.ebi.ac.uk/efo/")
    if len(first_match_list) == 0:

        raise Exception(f"Cannot find a match for {mesh_uri}")
    first_match = first_match_list[0]

    if first_match.startswith(efo_ns):
        return transform_uri_to_curie(first_match)
    if first_match.startswith('http://purl.obolibrary.org/obo/MONDO'):
        return transform_uri_to_curie(first_match)
    else:
        internal_match = check_efo_internal_xref(first_match)

        if len(internal_match) == 0:

            raise Exception("A mapping was found, but not to any EFO term")
        else:
            return transform_uri_to_curie(internal_match[0])


In [5]:
get_xref_efo('http://id.nlm.nih.gov/mesh/D007172')

'EFO:0004234'

In [6]:
class Edge:
    def __init__(self, label, src=None, trg=None, properties=None):
        self.label = label
        self.src = src
        self.trg = trg
        self.properties = properties if properties else {}

    def serialize(self, format="data-pack"):
        if not self._can_serialize():
            raise Exception("Cannot serialize")
        if format == 'data-pack':
            return self._serialize_as_data_pack()


    def _can_serialize(self):
        if self.src is None:
            return False
        if self.trg is None:
            return False
        return True


    def _serialize_as_data_pack(self):
        obj = {
            'from': {
                'uuid': self.src.get('uuid')
            },
            'to': {
                'uuid': self.trg.get('uuid')
            },
            'label': self.label,
            'properties': self.properties
        }
        return obj

In [None]:
clinical_trial_edges = []
with gzip.open(os.path.join(processed_data_directory, 'CLINICAL_TRIAL_NODES.jsonl.gz'), 'rt') as file:
    for line in tqdm(file):
        obj = json.loads(line)
        properties = obj.get('properties')
        if 'condition_uri_list' in properties:
            condition_efo_xrefs = []
            for x in properties['condition_uri_list']:
               try:
                   condition_efo_xrefs.append(get_xref_efo(x))
               except:
                   continue
            for c_xref in condition_efo_xrefs:
                trg_uuid = transform_uri_to_curie(c_xref)
                clinical_trial_edges.append(
                    Edge(
                        label='study of condition',
                        src={'uuid': obj.get('_id')},
                        trg={'uuid': trg_uuid}
                    )
                )
        if 'intervention_uri_list' in properties:
            intervention_efo_xrefs = []
            for x in properties['intervention_uri_list']:
                try:
                    intervention_efo_xrefs.append(get_xref_efo(x))
                except:
                    continue
            for i_xref in intervention_efo_xrefs:
                trg_uuid = transform_uri_to_curie(i_xref)
                clinical_trial_edges.append(
                    Edge(
                        label='using intervention',
                        src={'uuid': obj.get('_id')},
                        trg={'uuid': trg_uuid}
                    )
                )



23294it [22:43,  8.04it/s]

In [205]:
clinical_trial_edges[0].serialize()

Exception: Cannot serialize

In [151]:
# Define a SPARQL query to fetch details about the class using f-string for clarity

def lookup_uri(uri):

    query = f"""
    SELECT ?property ?value
    WHERE {{
        <{uri}> ?property ?value .
    }}
    """

    # Execute the query
    results = efo.query(query)

    output = {}
    for row in results:
        if str(row.property) in output:
            output[str(row.property)].append(str(row.value))
        else:
            output[str(row.property)] = [str(row.value)]
    return output

In [152]:
lookup_uri('http://www.ebi.ac.uk/efo/EFO_0004234')

{'http://www.w3.org/1999/02/22-rdf-syntax-ns#type': ['http://www.w3.org/2002/07/owl#Class'],
 'http://www.w3.org/2002/07/owl#equivalentClass': ['N69b60bced67946a08831271c1e4b3a7d'],
 'http://www.w3.org/2000/01/rdf-schema#subClassOf': ['http://purl.obolibrary.org/obo/MONDO_0002036',
  'http://purl.obolibrary.org/obo/MONDO_0002134',
  'N39f28cbad3dc4512be170d51d428b8e2',
  'Nc0203f603f5a4031a875aa5f8c03d935'],
 'http://purl.obolibrary.org/obo/IAO_0000115': ['Persistent or recurrent inability to achieve or to maintain an erection during sexual activity.',
  'The inability in the male to have a PENILE ERECTION due to psychological or organ dysfunction.'],
 'http://purl.obolibrary.org/obo/IAO_0000117': ['Dani Welter'],
 'http://purl.obolibrary.org/obo/IAO_0000589': ['erectile dysfunction (disease)'],
 'http://purl.obolibrary.org/obo/mondo#exactMatch': ['http://identifiers.org/mesh/D007172',
  'http://identifiers.org/snomedct/397803000',
  'http://purl.obolibrary.org/obo/DOID_1875',
  'http:

In [188]:
def search_by_property_value(property, value):
    query = f"""
        PREFIX owl: <http://www.w3.org/2002/07/owl#>
        PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

        SELECT DISTINCT ?class
        WHERE {{
            ?class a owl:Class ;
               <{property}> "{value}"
        }}
        """

    results = efo.query(query)
    return [str(r['class']) for r in results]


In [189]:
search_by_property_value('http://www.geneontology.org/formats/oboInOwl#hasDbXref', 'MSH:D004487')

['http://purl.obolibrary.org/obo/HP_0000969']