# Data Preparation
***
**Author**: [Chris Li](mailto:chris@biobox.io)
<br>

**Purpose**: This notebook serves as a script to download files and prepare data assets for loading into the BioBox knowledge graph. For more information about the data sources, please see [Data Sources](https://docs.biobox.io/biobox/data_sources)

## Environment
***

In [1]:
# Uncomment the following lines to install required modules
# import sys
# !{sys.executable} -m pip install -r requirements.txt

In [1]:
import pandas as pd
import numpy as np
import os
import json
import gzip
import sys
from tqdm import tqdm
import requests
from biobox_analytics.utils import ensure_primitive_or_array_of_primitives
import pronto
import rdflib

In [2]:
# Set up environment configurations
processed_data_directory = "../resources/processed_data"
tmp_directory = "../resources/tmp_data"
ontologies_directory = "../resources/ontologies"
os.makedirs(ontologies_directory, exist_ok=True)
os.makedirs(processed_data_directory, exist_ok=True)
os.makedirs(tmp_directory, exist_ok=True)

***
# Clinical Trials

**Purpose**: The purpose of this section is to scape NIH clinical trial information into knowledge graph nodes.

In [3]:

clinical_trial_data = []

page_size = 500
url = 'https://clinicaltrials.gov/api/v2'

has_more = True
next_page_token = None

all_intervention_terms = set()
all_condition_terms = set()

def get_mesh_descriptor_id(term):
    res = requests.get(
        url='https://id.nlm.nih.gov/mesh/lookup/descriptor',
        params={
            'label': term,
            'match': 'exact',
            'year':'current',
            'limit': 1
        }
    )
    res.raise_for_status()
    data = res.json()
    if len(data) == 0:
        return None
    else:
        return data[0]['resource']

def format_clinical_trial(o):
    id_module = o['protocolSection'].get('identificationModule', {})
    status = o['protocolSection'].get('statusModule', {})
    description_module = o['protocolSection'].get('descriptionModule', {})
    design_module = o['protocolSection'].get('designModule', {})
    derived_section = o.get('derivedSection', {})
    intervention_module = o['protocolSection'].get('armsInterventionsModule', {})
    outcomes_module = o['protocolSection'].get('outcomesModule', {})

    results_section = o.get('resultsSection', {})
    more_info_module = results_section.get('moreInfoModule', {})

    condition_terms = []
    condition_mesh_ids = []

    if 'conditionBrowseModule' in derived_section:
        for c in derived_section['conditionBrowseModule'].get('meshes', []):
            term = c.get('term', None)
            condition_mesh_ids.append(c.get('id', None))
            if term is not None:
                condition_terms.append(term)
            #
            #     descriptor_id = get_mesh_descriptor_id(term)
            #     if descriptor_id is not None:
            #         condition_mesh_ids.append(descriptor_id)

    intervention_terms = []
    intervention_term_ids = []

    if 'interventionBrowseModule' in derived_section:
        for c in derived_section['interventionBrowseModule'].get('meshes', []):
            intervention_term = c.get('term', None)
            intervention_term_ids.append(c.get('id'))
            if intervention_term is not None:
                intervention_terms.append(c.get('term', None))
                # descriptor_id = get_mesh_descriptor_id(intervention_term)
                # if descriptor_id is not None:
                #     intervention_term_ids.append(descriptor_id)
    limitations_caveats = more_info_module.get('limitationsAndCaveats', {}).get('description', None)
    
    temp_properties = {
        'uuid': id_module.get('nctId', None),
        'NCT_ID': id_module.get('nctId', None),
        'displayName': id_module.get('nctId', None),
        'brief_title': id_module.get('briefTitle', None),
        'official_title': id_module.get('officialTitle', None),
        'overall_status': status.get('overallStatus', None),
        'termination_reason': status.get('whyStopped', None),
        'last_known_status': status.get('lastKnownStatus', None),
        'description': description_module.get('briefSummary', None),
        'study_type': design_module.get('studyType', None),
        'phase': design_module.get('phases', None),
        'study_arms': intervention_module.get('armGroup', None),
        'allocation': design_module.get('allocation', None),
        'intervention_model': design_module.get('interventionModel', None),
        'masking': design_module.get('masking', None),
        'primary_purpose': design_module.get('primaryPurpose', None),
        'intervention_type': intervention_module.get('interventionType', None),
        'intervention_name': intervention_module.get('interventionName', None),
        'primary_outcome_measures': outcomes_module.get('primaryOutcomeMeasure', None),
        'secondary_outcome_measures': outcomes_module.get('secondaryOutcomeMeasure', None),
        'condition_terms': condition_terms if condition_terms else None,
        'condition_mesh_ids': condition_mesh_ids if condition_mesh_ids else None,
        'intervention_terms': intervention_terms if intervention_terms else None,
        'intervention_term_ids': intervention_term_ids if intervention_term_ids else None,
        'limitations_caveats': limitations_caveats
        # 'doc_json': json.dumps(o)
    }

    # Ensure all values are primitives or arrays of primitives
    def ensure_primitive_or_array_of_primitives(value):
        if isinstance(value, (str, int, float, bool)) or value is None:
            return value
        elif isinstance(value, list):
            return [ensure_primitive_or_array_of_primitives(v) for v in value]
        else:
            return json.dumps(value)

    properties = {k: ensure_primitive_or_array_of_primitives(v) for k, v in temp_properties.items() if v is not None}

    all_intervention_terms.update([term for term in intervention_terms if term is not None])
    all_condition_terms.update([term for term in condition_terms if term is not None])

    return properties


In [4]:
while has_more:
    params = {'pageSize': page_size}
    if next_page_token:
        params['pageToken'] = next_page_token
    res = requests.get(url=url + '/studies', params=params)
    res.raise_for_status()
    data = res.json()
    next_page_token = data.get('nextPageToken', None)
    studies = data.get('studies', [])
    for s in studies:
        formatted = format_clinical_trial(s)
        if formatted is not None:
            clinical_trial_data.append(formatted)
    print(len(clinical_trial_data))
    if not next_page_token:
        has_more = False


500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
22000
22500
23000
23500
24000
24500
25000
25500
26000
26500
27000
27500
28000
28500
29000
29500
30000
30500
31000
31500
32000
32500
33000
33500
34000
34500
35000
35500
36000
36500
37000
37500
38000
38500
39000
39500
40000
40500
41000
41500
42000
42500
43000
43500
44000
44500
45000
45500
46000
46500
47000
47500
48000
48500
49000
49500
50000
50500
51000
51500
52000
52500
53000
53500
54000
54500
55000
55500
56000
56500
57000
57500
58000
58500
59000
59500
60000
60500
61000
61500
62000
62500
63000
63500
64000
64500
65000
65500
66000
66500
67000
67500
68000
68500
69000
69500
70000
70500
71000
71500
72000
72500
73000
73500
74000
74500
75000
75500
76000
76500
77000
77500
78000
78500
79000
79500
80000
80500
81000
81500
82000
82500
83000
83500
84000
84500
85000


In [6]:
len(all_condition_terms)

4348

In [7]:
len(all_intervention_terms)

3810

In [5]:
# UNCOMMENT BELOW TO REGENERATE
# all_condition_map = {}
# for c in tqdm(all_condition_terms):
#     id = get_mesh_descriptor_id(c)
#     if id is not None:
#         all_condition_map[c] = id
# all_intervention_map = {}
# for c in tqdm(all_intervention_terms):
#     id = get_mesh_descriptor_id(c)
#     if id is not None:
#         all_intervention_map[c] = id
        
# with open(os.path.join(tmp_directory, 'all_condition_map.json'), 'w') as outfile:
#     json.dump(all_condition_map, outfile)
# with open(os.path.join(tmp_directory, 'all_intervention_map.json'), 'w') as outfile:
#     json.dump(all_intervention_map, outfile)

all_condition_map = {}
with open(os.path.join(tmp_directory, 'all_condition_map.json'), 'r') as f:
    all_condition_map = json.load(f)

all_intervention_map = {}
with open(os.path.join(tmp_directory, 'all_intervention_map.json'), 'r') as f:
    all_intervention_map = json.load(f)

In [6]:
# scratch

clinical_trial_df = pd.DataFrame(clinical_trial_data)

In [7]:
clinical_trial_df.replace({np.nan: None}, inplace=True)

In [8]:
clinical_trial_df.head()

Unnamed: 0,uuid,NCT_ID,displayName,brief_title,official_title,overall_status,description,study_type,phase,condition_terms,condition_mesh_ids,intervention_terms,intervention_term_ids,last_known_status,termination_reason,limitations_caveats
0,NCT00072579,NCT00072579,NCT00072579,Sargramostim in Treating Patients With Chronic...,Phase II Study of GM-CSF in Patients With Chro...,COMPLETED,"RATIONALE: Colony-stimulating factors, such as...",INTERVENTIONAL,[PHASE2],"[Leukemia, Leukemia, Myelogenous, Chronic, BCR...","[D000007938, D000015464, D000015466]",[Sargramostim],[C000081222],,,
1,NCT00517179,NCT00517179,NCT00517179,Effect of Vardenafil on Blood Pressure in Pati...,Effect of Vardenafil on Blood Pressure in Pati...,COMPLETED,The purpose of this study is to investigate th...,INTERVENTIONAL,[NA],"[Erectile Dysfunction, Prostatic Hyperplasia, ...","[D000007172, D000011470, D000006965]",[Vardenafil Dihydrochloride],[D000069058],,,
2,NCT00812279,NCT00812279,NCT00812279,Investigate the Exposure to Selected Smoke Con...,"A Controlled, Randomised, Open-label, 3-arm Pa...",COMPLETED,The overall purpose of this clinical study con...,INTERVENTIONAL,[NA],,,,,,,
3,NCT03878979,NCT03878979,NCT03878979,Preoperative Immune Checkpoint Inhibitor for P...,Preoperative Immune Checkpoint Inhibitor Thera...,COMPLETED,Nivolumab (also known as BMS-936558) before su...,INTERVENTIONAL,[PHASE2],"[Carcinoma, Carcinoma, Squamous Cell, Head and...","[D000002277, D000002294, D000006258, D000077195]",[Nivolumab],[D000077594],,,
4,NCT04175379,NCT04175379,NCT04175379,The Effect of Permissive Hypercapnia on Oxygen...,The Effect of Permissive Hypercapnia on Oxygen...,UNKNOWN,Permissive hypercapnia increased the survival ...,INTERVENTIONAL,[NA],[Hypercapnia],[D000006935],,,ENROLLING_BY_INVITATION,,


In [9]:
clinical_trial_df['study_type'].value_counts()

study_type
INTERVENTIONAL     379778
OBSERVATIONAL      113458
EXPANDED_ACCESS       939
Name: count, dtype: int64

In [10]:
if not os.path.exists(os.path.join(processed_data_directory, 'CLINICAL_TRIAL_NODES.jsonl.gz')):
    with gzip.open(os.path.join(processed_data_directory, 'CLINICAL_TRIAL_NODES.jsonl.gz'), 'wt', encoding='utf-8') as outfile:
        for node in tqdm(clinical_trial_data):
            if 'study_type' not in node:
                continue
            if node['study_type'] != 'INTERVENTIONAL':
                continue
            tmp_properties = {k: ensure_primitive_or_array_of_primitives(v) for k, v in node.items() if v is not None}
            if 'intervention_terms' in tmp_properties:

                _remapped_ids = []
                _uris = []
                for x in tmp_properties['intervention_terms']:
                    uri = all_intervention_map.get(x, None)
                    if uri is None:
                        continue
                    else:
                        _remapped_id = uri.replace('http://id.nlm.nih.gov/mesh/', '')
                        _remapped_ids.append(_remapped_id)
                        _uris.append(uri)
                tmp_properties['intervention_uri_list'] = _uris


            if 'condition_terms' in tmp_properties:
                _remapped_ids = []
                _uris = []
                for x in tmp_properties['condition_terms']:
                    uri = all_condition_map.get(x, None)
                    if uri is None:
                        continue
                    else:
                        _remapped_id = uri.replace('http://id.nlm.nih.gov/mesh/', '')
                        _remapped_ids.append(_remapped_id)
                        _uris.append(uri)
                tmp_properties['condition_uri_list'] = _uris


            line = {
                '_id': node.get('uuid'),
                'labels': ['ClinicalTrial'],
                'properties': tmp_properties
            }
            outfile.write(json.dumps(line) + '\n')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 495056/495056 [00:30<00:00, 16441.68it/s]


In [11]:
conditions_mesh2efo = {}

condition_mesh_curies = []

for k,v in tqdm(all_condition_map.items()):
    curie = 'MESH:' + v.split('/').pop()
    condition_mesh_curies.append(curie)

    
has_more = True
url = 'https://www.ebi.ac.uk/spot/oxo/api/search'
while has_more:
    res = requests.post(
        url,
        json={
            'ids': condition_mesh_curies,
            'mappingTarget': ['EFO', 'MONDO'],
            'distance': 3
        },
        params={
            'size': 500
        }
    )
    data = res.json()
    next_page = data.get('_links', {}).get('next', {}).get('href')
    print(f"{data.get('page').get('number')} of {data.get('page').get('totalPages')}")
    if next_page is None:
        has_more = False
    else:
        url = next_page

    mappings = data.get('_embedded', {}).get('searchResults', [])

    for m in mappings:
        mappingResponseList = m.get('mappingResponseList', [])
        if len(mappingResponseList) == 0:
            continue
        mapping_prioritized = sorted(mappingResponseList, key=lambda x: (x['targetPrefix'] != 'EFO', x['distance']))
        conditions_mesh2efo[m.get('queryId')] = mapping_prioritized[0].get('curie')

if not os.path.exists(os.path.join(processed_data_directory, 'conditions_mesh2efo.json')):
    with open(os.path.join(processed_data_directory, 'conditions_mesh2efo.json'), 'w') as outfile:
        json.dump(conditions_mesh2efo, outfile)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4340/4340 [00:00<00:00, 1019277.64it/s]


0 of 9
1 of 9
2 of 9
3 of 9
4 of 9
5 of 9
6 of 9
7 of 9
8 of 9


In [12]:
edges = []
with gzip.open(os.path.join(processed_data_directory, 'CLINICAL_TRIAL_NODES.jsonl.gz'), 'rt', encoding='utf-8') as f:
    for line in tqdm(f):
        obj = json.loads(line)
        properties = obj.get('properties')
        if 'condition_uri_list' in properties:
            condition_curies = ['MESH:' + x.split('/').pop() for x in properties.get('condition_uri_list', [])]
            for c in condition_curies:
                target_efo_map = conditions_mesh2efo.get(c, None)
                if target_efo_map is None:
                    continue
                e = {
                    'from': {
                        'uuid': obj.get('_id')
                    },
                    'to': {
                        'uuid': target_efo_map
                    },
                    'label': 'studies condition',
                    'properties': {}
                }
                edges.append(e)
print(len(edges))
        

379778it [00:07, 47734.58it/s]

510951





In [13]:
if not os.path.exists(os.path.join(processed_data_directory, 'CLINICAL_TRIAL_edge.jsonl.gz')):
    with gzip.open(os.path.join(processed_data_directory, 'CLINICAL_TRIAL_edge.jsonl.gz'), 'wt', encoding='utf-8') as outfile:
        for e in edges:
            outfile.write(json.dumps(e) + '\n')

In [14]:
metadata = {
    "_meta": {
        "version": "0.2.1",
        "date_updated": "2024-05-17",
        "maintainer": "BioBox Analytics"
    },
    "key": "clinical_trials",
    "name": "Clinical Trials - Interventional Studies",
    "description": "A subset of the clinical trials database that captures all intervention studies.",
    "source": [
        {
            "uri": "https://clinicaltrials.gov/",
            "type": "doc"
        },
        {
            "uri": "https://clinicaltrials.gov/",
            "type": "data",
            "version": "2024-05-15"
        }
    ],
    "concepts": {
        "ClinicalTrial": {
            "label": "Clinical Trial",
            "dbLabel": "ClinicalTrial",
            "definition": "A research study in which one or more human subjects are prospectively assigned to one or more interventions (which may include placebo or other control) to evaluate the effects of those interventions on health-related biomedical or behavioral outcomes."
        }
    },
    "relationships": {
        "studies condition": {
            "from": "ClinicalTrial",
            "to": "Disease"
        }
    }
}

with open(os.path.join(processed_data_directory, 'CLINICAL_TRIALS_metadata.json'), 'w') as f:
    json.dump(metadata, f)

In [38]:
edges[0]

{'from': {'uuid': 'NCT05013879'},
 'to': {'uuid': 'EFO:0009373'},
 'label': 'studies condition',
 'properties': {}}