# Open Targets - Data Preparation

# Environment Setup

In [None]:
# Uncomment the following lines to install required modules
# import sys
# !{sys.executable} -m pip install -r requirements.txt

In [1]:
import os
import json
import requests
import gzip
import pandas as pd
import numpy as np
from tqdm import tqdm
from biobox_analytics.utils import read_jsonl_files
from pyspark import SparkConf
from pyspark.sql import SparkSession
import subprocess
import pyspark.sql.functions as F
import humanize

In [15]:
# Constants
# Open Targets release to work with. It must be assigned before the
# version-specific paths below, which are derived from it.
OT_VERSION = "24.03"

# Base directories, relative to the notebook's working directory
tmp_data = '../resources/tmp_data'
processed_data = '../resources/processed_data'

# Version-specific download locations, one per Open Targets data format
data_directory = os.path.abspath(os.path.join(tmp_data, OT_VERSION, 'parquet'))
json_data_directory = os.path.abspath(os.path.join(tmp_data, OT_VERSION, 'json'))

# make directories
os.makedirs(os.path.join(tmp_data, 'open_targets', OT_VERSION), exist_ok=True)
# Output files are written into processed_data later on, so make sure it exists too.
os.makedirs(processed_data, exist_ok=True)

Generic utility functions used later in the notebook are defined here:

In [10]:
def OT_download_files(version_code, directory_path, requested_folder="targets", base_url="ftp://ftp.ebi.ac.uk/pub/databases/opentargets/platform", data_source="parquet"):
    """
    Downloads one folder of an Open Targets (OT) release from FTP using wget.

    The remote folder `{base_url}/{version_code}/output/etl/{data_source}/{requested_folder}`
    is mirrored recursively into `{directory_path}/{requested_folder}`. A summary
    (file count and total size of the downloaded folder) is printed afterwards.

    Args:
        version_code (str): The version code of the OT data to download.
        directory_path (str): The local path where the downloaded files will be saved.
        requested_folder (str, optional): The specific folder within the OT data to download (defaults to "targets").
        base_url (str, optional): The base URL of the OT FTP server (defaults to the official EBI FTP location).
        data_source (str, optional): The format of the data to download ("parquet" or "json", defaults to "parquet").

    Returns:
        None. Download failures (wget exiting non-zero, or wget not being
        installed) are reported by printing an error message, not by raising.

    Raises:
        ValueError: If an invalid `data_source` is provided.
    """
    # Local import: only this function needs URL parsing.
    from urllib.parse import urlparse

    # Validate the data source format
    valid_sources = ["parquet", "json"]
    if data_source not in valid_sources:
        raise ValueError(f"Invalid data source: {data_source}. Choose from: {valid_sources}")

    # Construct the full URL; a trailing slash on base_url must not produce "//"
    parent_url = f"{base_url.rstrip('/')}/{version_code}/output/etl/{data_source}"
    url = f"{parent_url}/{requested_folder}"

    # Number of remote directory levels above the requested folder. wget strips
    # exactly these so that files land in <directory_path>/<requested_folder>.
    # This evaluates to 8 for the default base_url and adapts to custom ones.
    cut_dirs = len([part for part in urlparse(parent_url).path.split("/") if part])

    # Absolute download path (for user clarity)
    abs_directory_path = os.path.abspath(directory_path)
    os.makedirs(abs_directory_path, exist_ok=True)

    # Build the wget command (argument list, so no shell quoting issues)
    wget_command = [
        "wget",
        "--recursive",
        "--no-parent",
        "--no-host-directories",
        "--cut-dirs", str(cut_dirs),
        "-P", abs_directory_path,  # same directory that was created and is reported
        url
    ]

    print("\n[INFO] Downloading Open Targets data:")
    print(f"  - Version: {version_code}")
    print(f"  - Folder: {requested_folder}")
    print(f"  - Format: {data_source}")
    print(f"  - Saving to: {abs_directory_path}\n")

    try:
        subprocess.run(wget_command, capture_output=True, text=True, check=True)
    except FileNotFoundError:
        # wget itself is missing; report it the same way as a failed download
        print("[ERROR] Download failed:\nwget executable not found on PATH")
        return
    except subprocess.CalledProcessError as e:
        print(f"[ERROR] Download failed:\n{e.stderr}")
        return  # Exit early on failure

    # Post-download analysis: only look at the folder that was just fetched so
    # files from earlier downloads into the same directory are not counted.
    downloaded_root = os.path.join(
        abs_directory_path, *[part for part in requested_folder.split("/") if part]
    )
    if not os.path.isdir(downloaded_root):
        # Unexpected layout; fall back to scanning the whole destination.
        downloaded_root = abs_directory_path

    file_count = 0
    total_size = 0
    for dirpath, _, filenames in os.walk(downloaded_root):
        for filename in filenames:
            total_size += os.path.getsize(os.path.join(dirpath, filename))
            file_count += 1

    print("\n[SUCCESS] Download complete!")
    print(f"  - Files downloaded: {file_count}")
    print(f"  - Total size: {humanize.naturalsize(total_size)}")
    print(f"  - Directory: {abs_directory_path}")

# Open Targets Genetics

In [3]:
# Evidence records for the ot_genetics_portal source, one dict per JSONL line.
# NOTE(review): this path hard-codes the release ('v24.03') instead of deriving it from OT_VERSION.
otg_evidence_dir = os.path.join(tmp_data, 'open_targets/v24.03/evidence/sourceId=ot_genetics_portal')
otg = read_jsonl_files(otg_evidence_dir)

Loading .jsonl files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:13<00:00, 14.36it/s]


In [8]:
# Load the raw evidence records into a DataFrame for column-level exploration.
otg_df = pd.DataFrame(otg)

In [10]:
# List the fields present across the evidence records.
otg_df.columns

Index(['datasourceId', 'targetId', 'beta', 'betaConfidenceIntervalLower',
       'betaConfidenceIntervalUpper', 'datatypeId', 'diseaseFromSource',
       'diseaseFromSourceMappedId', 'literature', 'pValueExponent',
       'pValueMantissa', 'projectId', 'publicationFirstAuthor',
       'publicationYear', 'resourceScore', 'studyId', 'studySampleSize',
       'targetFromSourceId', 'variantFunctionalConsequenceFromQtlId',
       'variantFunctionalConsequenceId', 'variantId', 'variantRsId',
       'diseaseId', 'id', 'score', 'variantEffect', 'directionOnTrait',
       'oddsRatio', 'oddsRatioConfidenceIntervalLower',
       'oddsRatioConfidenceIntervalUpper'],
      dtype='object')

In [11]:
# Distinct values of variantEffect.
otg_df['variantEffect'].unique()

array(['GoF', 'LoF', nan], dtype=object)

In [12]:
# Distinct values of directionOnTrait.
otg_df['directionOnTrait'].unique()

array(['protect', 'risk', nan], dtype=object)

In [13]:
# Distinct Sequence Ontology ids used for the QTL-derived functional consequence.
otg_df['variantFunctionalConsequenceFromQtlId'].unique()

array(['SO_0002315', 'SO_0002316', nan, 'SO_0002314'], dtype=object)

In [14]:
# Distinct Sequence Ontology ids used for the variant functional consequence.
otg_df['variantFunctionalConsequenceId'].unique()

array(['SO_0001628', 'SO_0001627', 'SO_0001631', 'SO_0001583',
       'SO_0001819', 'SO_0001632', 'SO_0001624', 'SO_0001623',
       'SO_0001822', 'SO_0001630', 'SO_0001589', 'SO_0001821',
       'SO_0001587', 'SO_0001575', 'SO_0001792', 'SO_0001574',
       'SO_0002012', 'SO_0001580', 'SO_0001818'], dtype=object)

In [19]:
# Sequence Ontology id -> term name, for every id observed in
# variantFunctionalConsequenceId and variantFunctionalConsequenceFromQtlId.
so_term_map = {
    # variant functional consequences
    'SO_0001628': 'intergenic_variant',
    'SO_0001627': 'intron_variant',
    'SO_0001631': 'upstream_gene_variant',
    'SO_0001583': 'missense_variant',
    'SO_0001819': 'synonymous_variant',
    'SO_0001632': 'downstream_gene_variant',
    'SO_0001624': '3_prime_UTR_variant',
    'SO_0001623': '5_prime_UTR_variant',
    'SO_0001822': 'inframe_deletion',
    'SO_0001630': 'splice_region_variant',
    'SO_0001589': 'frameshift_variant',
    'SO_0001821': 'inframe_insertion',
    'SO_0001587': 'stop_gained',
    'SO_0001575': 'splice_donor_variant',
    'SO_0001792': 'non_coding_transcript_exon_variant',
    'SO_0001574': 'splice_acceptor_variant',
    'SO_0002012': 'start_lost',
    'SO_0001580': 'coding_sequence_variant',
    'SO_0001818': 'protein_altering_variant',
    # QTL-derived consequences (direction of effect on the gene product)
    'SO_0002315': 'increased_gene_product_level',
    'SO_0002316': 'decreased_gene_product_level',
    'SO_0002314': 'altered_gene_product_level',
}

In [20]:
# Display the mapping to check that each SO id is paired with the intended term name.
so_term_map

{'SO_0001628': 'intergenic_variant',
 'SO_0001627': 'intron_variant',
 'SO_0001631': 'upstream_gene_variant',
 'SO_0001583': 'missense_variant',
 'SO_0001819': 'synonymous_variant',
 'SO_0001632': 'downstream_gene_variant',
 'SO_0001624': '3_prime_UTR_variant',
 'SO_0001623': '5_prime_UTR_variant',
 'SO_0001822': 'inframe_deletion',
 'SO_0001630': 'splice_region_variant',
 'SO_0001589': 'frameshift_variant',
 'SO_0001821': 'inframe_insertion',
 'SO_0001587': 'stop_gained',
 'SO_0001575': 'splice_donor_variant',
 'SO_0001792': 'non_coding_transcript_exon_variant',
 'SO_0001574': 'splice_acceptor_variant',
 'SO_0002012': 'start_lost',
 'SO_0001580': 'coding_sequence_variant',
 'SO_0001818': 'protein_altering_variant',
 'SO_0002315': 'increased_gene_product_level',
 'SO_0002316': 'decreased_gene_product_level',
 'SO_0002314': 'altered_gene_product_level'}

In [26]:
# Distinct values of directionOnTrait; the loop below picks the
# association -> trait edge label from this field.
otg_df['directionOnTrait'].unique()

array(['protect', 'risk', nan], dtype=object)

In [27]:
# Peek at one raw evidence record to see the fields available for graph construction.
otg[0]

{'datasourceId': 'ot_genetics_portal',
 'targetId': 'ENSG00000002919',
 'beta': -0.0122706,
 'betaConfidenceIntervalLower': -0.0160973236,
 'betaConfidenceIntervalUpper': -0.0084438764,
 'datatypeId': 'genetic_association',
 'diseaseFromSource': 'Sex hormone-binding globulin levels adjusted for BMI',
 'diseaseFromSourceMappedId': 'EFO_0004696',
 'literature': ['32042192'],
 'pValueExponent': -10,
 'pValueMantissa': 4.2,
 'projectId': 'GCST',
 'publicationFirstAuthor': 'Ruth KS',
 'publicationYear': 2020,
 'resourceScore': 0.19180332124233246,
 'studyId': 'GCST90012106',
 'studySampleSize': 188908,
 'targetFromSourceId': 'ENSG00000002919',
 'variantFunctionalConsequenceFromQtlId': 'SO_0002315',
 'variantFunctionalConsequenceId': 'SO_0001628',
 'variantId': '17_47962177_G_T',
 'variantRsId': 'rs2525105',
 'diseaseId': 'EFO_0004696',
 'id': '6f6156109c5d406a5c622370350733614be17b59',
 'score': 0.19180332124233246,
 'variantEffect': 'GoF',
 'directionOnTrait': 'protect'}

In [41]:
# Accumulators filled by the graph-construction loop.
edges = []     # every emitted edge dict, in creation order
node_map = {}  # node _id -> node dict; writing the same _id again replaces the entry

In [42]:
# Ensure all values are primitives or arrays of primitives
def ensure_primitive_or_array_of_primitives(value):
    """Coerce `value` into a primitive or a (possibly nested) list of primitives.

    - `None`, `str`, `int`, `float` and `bool` values are returned unchanged.
    - Lists are processed element by element, recursively.
    - Anything else (e.g. a dict) is serialised to a JSON string.
    """
    if value is None:
        return None
    if isinstance(value, (str, int, float, bool)):
        return value
    if not isinstance(value, list):
        # Non-primitive, non-list: flatten to its JSON text form
        return json.dumps(value)
    return list(map(ensure_primitive_or_array_of_primitives, value))

    # properties = {k: ensure_primitive_or_array_of_primitives(v) for k, v in temp_properties.items() if v is not None}

In [43]:
# Association -> trait edge label, keyed by the record's `directionOnTrait`.
# Any other value (including a missing field) falls back to 'of trait'.
_TRAIT_EDGE_LABELS = {
    'protect': 'protective against',
    'risk': 'risk of trait',
}


def _make_edge(source_uuid, target_uuid, label):
    """Build one edge dict (with empty properties) between two node uuids."""
    return {
        'from': {
            'uuid': source_uuid
        },
        'to': {
            'uuid': target_uuid
        },
        'label': label,
        'properties': {}
    }


# Turn every evidence record into:
#   - one VariantAssociation node (keyed by the evidence id),
#   - one Variant node (keyed by the variant id),
#   - three edges: association -> trait, association -> gene, association -> variant.
# NOTE(review): the record's `literature` (PubMed ids) is not carried onto any
# node or edge; add it to `va_properties` if references should be exported.
for record in tqdm(otg):
    association_id = record.get('id')
    disease_id = record.get('diseaseId').replace('_', ':')
    gene_id = record.get('targetId')
    variant_id = record.get('variantId')

    # Translate SO ids to term names; ids that are missing or unmapped become None
    functional_consequence = so_term_map.get(record.get('variantFunctionalConsequenceId', None), None)
    functional_consequence_qtl = so_term_map.get(record.get('variantFunctionalConsequenceFromQtlId', None), None)

    # Association -> gene label: the QTL consequence when known, else a generic label
    if functional_consequence_qtl is not None:
        va2g_label = functional_consequence_qtl.replace('_', ' ') + ' of'
    else:
        va2g_label = 'has association'

    direction_on_trait = record.get('directionOnTrait')
    va2d_label = _TRAIT_EDGE_LABELS.get(direction_on_trait, 'of trait')

    va2v_label = 'when variant is'

    va2d = _make_edge(association_id, disease_id, va2d_label)
    va2g = _make_edge(association_id, gene_id, va2g_label)
    va2v = _make_edge(association_id, variant_id, va2v_label)

    display_name = f"{variant_id} {va2g_label} and is {va2d_label}"

    # Each key appears once. The former duplicate 'variantFunctionalConsequenceId'
    # entry and the 'variantFunctionalConsequenceQtlId' entry were dropped: the
    # latter read a field that does not exist on the records, so it was always
    # None and was always filtered out below.
    va_properties = {
        'uuid': association_id,
        'displayName': display_name,
        'score': record.get('score', None),
        'variantFunctionalConsequenceId': record.get('variantFunctionalConsequenceId', None),
        'variantFunctionalConsequenceFromQtlId': record.get('variantFunctionalConsequenceFromQtlId', None),
        'beta': record.get('beta', None),
        'betaConfidenceIntervalLower': record.get('betaConfidenceIntervalLower', None),
        'betaConfidenceIntervalUpper': record.get('betaConfidenceIntervalUpper', None),
        'pValueMantissa': record.get('pValueMantissa', None),
        'pValueExponent': record.get('pValueExponent', None),
        'oddsRatio': record.get('oddsRatio', None),
        'oddsRatioConfidenceIntervalLower': record.get('oddsRatioConfidenceIntervalLower', None),
        'oddsRatioConfidenceIntervalUpper': record.get('oddsRatioConfidenceIntervalUpper', None),
        'direction_on_trait': direction_on_trait,
        'functional_consequence': functional_consequence,
        'functional_consequence_qtl': functional_consequence_qtl,
        'variantEffect': record.get('variantEffect', None),
        'projectId': record.get('projectId', None),
        'publicationFirstAuthor': record.get('publicationFirstAuthor', None),
        'publicationYear': record.get('publicationYear', None),
    }

    # Association node: None-valued properties are omitted, the rest coerced to primitives
    va = {
        '_id': association_id,
        'labels': ['Association', 'VariantAssociation'],
        'properties': {k: ensure_primitive_or_array_of_primitives(v) for k, v in va_properties.items() if v is not None}
    }

    # Variant node: 'rsID' is kept even when None
    variant_node = {
        '_id': variant_id,
        'labels': ['Variant'],
        'properties': {
            'uuid': variant_id,
            'displayName': variant_id,
            'rsID': record.get('variantRsId', None),
        }
    }

    # Variants shared by several associations collapse onto one node_map entry
    node_map[association_id] = va
    node_map[variant_node['_id']] = variant_node
    edges.extend([va2d, va2g, va2v])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 781437/781437 [00:23<00:00, 33030.47it/s]


In [44]:
def _write_jsonl_gz(path, records, message):
    """Write `records` to `path` as gzip-compressed JSON Lines, one JSON document per line."""
    with gzip.open(path, 'wt', encoding='utf-8') as f:
        print(message)
        for record in tqdm(records):
            f.write(json.dumps(record) + '\n')


nodes = node_map.values()
print(f'Nodes: {len(nodes)}')
print(f'Edges: {len(edges)}')

# The output directory may not exist yet on a fresh checkout; opening the
# files below would otherwise fail.
os.makedirs(processed_data, exist_ok=True)

_write_jsonl_gz(os.path.join(processed_data, 'OTG_node.jsonl.gz'), nodes, 'Writing Nodes to File')
_write_jsonl_gz(os.path.join(processed_data, 'OTG_edge.jsonl.gz'), edges, 'Writing Edges to File')

# Data-package descriptor: provenance, the node concepts, and the allowed
# (from, to) concept pair for every edge label emitted by the loop above.
metadata = {
    "_meta": {
        "version": "0.2.0",
        "date_updated": "2024-05-17",
        "maintainer": "BioBox Analytics"
    },
    "key": "ot_genetics",
    "name": "Open Targets - Genetics",
    "description": "This data package contains information extracted from Open Targets Genetics (OTG) to enhance your graph with GWAS semantics and observations. Contents of this package provides an evidence-based connection between known GWAS traits and their relatedness to variants and genes. Assignment of variants to their lead and/or causal genes is achieved through OTG Locus2Gene machine learning model.",
    "source": [
        {
            "uri": "https://platform-docs.opentargets.org/evidence#open-targets-genetics",
            "type": "doc"
        },
        {
            "uri": "https://platform.opentargets.org/downloads",
            "type": "data",
            "version": "24.03"
        }
    ],
    "concepts": {
        "Trait": {
            "label": "Trait",
            "dbLabel": "Trait",
            "definition": "GWAS trait is an Experiment Factor Ontology (EFO) that can represent a biological process, a disease, or a phenotype."
        },
        "Variant": {
            "label": "Variant",
            "dbLabel": "Variant",
            "definition": "A genetic variation"
        },
        "VariantAssociation": {
            "label": "Variant Association",
            "dbLabel": "VariantAssociation",
            "definition": "A hyper edge that connects a study, a variant, and a trait that, together, represents an association event."
        }
    },
    "relationships": {
        # association -> trait labels
        "protective against": {
            "from": "VariantAssociation",
            "to": "Trait"
        },
        "risk of trait": {
            "from": "VariantAssociation",
            "to": "Trait"
        },
        "of trait": {
            "from": "VariantAssociation",
            "to": "Trait"
        },
        # association -> variant label
        "when variant is": {
            "from": "VariantAssociation",
            "to": "Variant"
        },
        # association -> gene labels
        "increased gene product level of": {
            "from": "VariantAssociation",
            "to": "Gene"
        },
        "decreased gene product level of": {
            "from": "VariantAssociation",
            "to": "Gene"
        },
        "altered gene product level of": {
            "from": "VariantAssociation",
            "to": "Gene"
        },
        "has association": {
            "from": "VariantAssociation",
            "to": "Gene"
        }
    }
}

with open(os.path.join(processed_data, 'OTG_metadata.json'), 'w', encoding='utf-8') as f:
    json.dump(metadata, f)

Nodes: 919566
Edges: 2344311
Writing Nodes to File


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 919566/919566 [00:28<00:00, 32446.65it/s]


Writing Edges to File


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2344311/2344311 [00:16<00:00, 144386.92it/s]


# Safety Liabilities

Safety liabilities are annotations attached to a target. First, we download the latest data release (24.03 as of this writing) to the local `tmp_data` directory. The data version is controlled by the top-level parameter `OT_VERSION`.

In [None]:
# Download the default "targets" folder in parquet format. OT_VERSION is passed
# so the fetched release always matches the version that data_directory is built from.
OT_download_files(OT_VERSION, data_directory)

In [17]:
# Download the "molecule" folder in parquet format. OT_VERSION is passed so the
# fetched release always matches the version that data_directory is built from.
OT_download_files(OT_VERSION, data_directory, data_source="parquet", requested_folder="molecule")


[INFO] Downloading Open Targets data:
  - Version: 24.03
  - Folder: molecule
  - Format: parquet
  - Saving to: /Users/cl/projects/biobox-analytics/biobox-analytics/notebooks/resources/tmp_data/24.03/parquet


[SUCCESS] Download complete!
  - Files downloaded: 201


NameError: name 'humanize' is not defined

In [27]:
# Start (or reuse) a Spark session with the 'local[*]' master
spark_builder = SparkSession.builder.master('local[*]')
spark = spark_builder.getOrCreate()

# Read the downloaded "targets" parquet folder and show its schema
targets_path = os.path.join(data_directory, 'targets')
evd = spark.read.parquet(targets_path)
evd.printSchema()

root
 |-- id: string (nullable = true)
 |-- approvedSymbol: string (nullable = true)
 |-- biotype: string (nullable = true)
 |-- transcriptIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- canonicalTranscript: struct (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- chromosome: string (nullable = true)
 |    |-- start: long (nullable = true)
 |    |-- end: long (nullable = true)
 |    |-- strand: string (nullable = true)
 |-- canonicalExons: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- genomicLocation: struct (nullable = true)
 |    |-- chromosome: string (nullable = true)
 |    |-- start: long (nullable = true)
 |    |-- end: long (nullable = true)
 |    |-- strand: integer (nullable = true)
 |-- alternativeGenes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- approvedName: string (nullable = true)
 |-- go: array (nullable = true)
 |    |-- element: struct (containsNull = tru

In [37]:
# One output row per (target id, safety liability): explode the array column
liability_element = F.explode("safetyLiabilities").alias("element")
evdSelect = evd.select("id", liability_element)

evdSelect.toPandas()

Unnamed: 0,id,element
0,ENSG00000037280,"(heart disease, EFO_0003777, None, [(None, Non..."
1,ENSG00000037280,"(regulation of catalytic activity, None, None,..."
2,ENSG00000073050,"(nausea, HP_0002018, None, None, True, PharmGK..."
3,ENSG00000073050,"(severe neutropenia, None, None, None, True, P..."
4,ENSG00000073050,"(survival, EFO_0000714, None, None, True, Phar..."
...,...,...
4120,ENSG00000111665,"(increase in pulse rate, None, None, None, Tru..."
4121,ENSG00000111665,"(diabetes mellitus, EFO_0000400, None, None, T..."
4122,ENSG00000111665,"(less of an increase in pulse rate, None, None..."
4123,ENSG00000120903,"(receptor binding, None, None, [(tissue-based ..."


# Target - Baseline Expression

In [16]:
# Download the "baselineExpression" folder in JSON format. OT_VERSION is passed so
# the fetched release always matches the version that json_data_directory is built from.
OT_download_files(OT_VERSION, json_data_directory, data_source="json", requested_folder="baselineExpression")


[INFO] Downloading Open Targets data:
  - Version: 24.03
  - Folder: baselineExpression
  - Format: json
  - Saving to: /Users/cl/projects/biobox-analytics/biobox-analytics/notebooks/resources/tmp_data/24.03/json


[SUCCESS] Download complete!
  - Files downloaded: 403
  - Total size: 2.5 GB
  - Directory: /Users/cl/projects/biobox-analytics/biobox-analytics/notebooks/resources/tmp_data/24.03/json


In [18]:
# Load every baseline-expression record from the downloaded JSON folder
baseline_expression_dir = os.path.join(json_data_directory, 'baselineExpression')
target_baseline_expression = read_jsonl_files(baseline_expression_dir)


Loading .jsonl files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:52<00:00,  3.82it/s]


In [25]:
# Peek at the first record to see its structure.
target_baseline_expression[0]

{'id': 'ENSG00000008294',
 'tissues': [{'efo_code': 'UBERON_0000947',
   'label': 'aorta',
   'organs': ['vasculature'],
   'anatomical_systems': ['circulatory system'],
   'rna': {'value': 7487.5, 'zscore': 0, 'level': 4, 'unit': 'TPM'},
   'protein': {'reliability': False, 'level': -1, 'cell_type': []}},
  {'efo_code': 'UBERON_0000945',
   'label': 'stomach',
   'organs': ['stomach'],
   'anatomical_systems': ['digestive system'],
   'rna': {'value': 3809.5, 'zscore': -1, 'level': 4, 'unit': 'TPM'},
   'protein': {'reliability': True,
    'level': 3,
    'cell_type': [{'name': 'glandular cells', 'reliability': True, 'level': 3},
     {'name': 'glandular cells', 'reliability': True, 'level': 2}]}},
  {'efo_code': 'CL_0002057',
   'label': 'CD14-positive, CD16-negative classical monocyte',
   'organs': ['immune organ', 'blood'],
   'anatomical_systems': ['immune system', 'hematopoietic system'],
   'rna': {'value': 22141.0, 'zscore': 4, 'level': 5, 'unit': 'TPM'},
   'protein': {'relia

In [24]:
# Number of baseline-expression records loaded.
len(target_baseline_expression)

43804