## Step 1: Import relevant libraries

In [1]:
import pandas as pd
import numpy as np
import requests as r
import json

## Step 2: Import list of targets and diseases of interest

In [20]:
# Targets of interest, written as (gene symbol, Ensembl gene ID) pairs and then
# expanded into the list-of-dicts shape that the later cells iterate over.
# NOTE(review): the final pair looks like a deliberately invalid placeholder,
# presumably there to exercise the "no data returned" paths - confirm.
_target_pairs = [
    ("PIK3CA", "ENSG00000121879"),
    ("AKT1", "ENSG00000142208"),
    ("APC", "ENSG00000134982"),
    ("ESR1", "ENSG00000091831"),
    ("AXIN1", "ENSG00000103126"),
    ("FAKE-GENE-1", "ENSG1234567890-FAKE"),
]

target_list = [
    {"target_symbol": symbol, "target_id": ensembl_id}
    for symbol, ensembl_id in _target_pairs
]

In [None]:
# Diseases of interest as (label, EFO ID) pairs. The label is later used as a
# prefix for the per-disease column names, the ID is sent to the API.
_disease_pairs = [
    ("melanoma", "EFO_0000756"),
    ("lymphoma", "EFO_0000574"),
    ("multiple_myeloma", "EFO_0001378"),
]

disease_list = [
    {"disease_label": label, "disease_id": efo_id}
    for label, efo_id in _disease_pairs
]

## Step 3: Generate dataframe with association scores and supporting publications

In [None]:
# For every target/disease pair, query the Open Targets Platform REST API (v3)
# for (1) the overall association score and (2) the Europe PMC text-mining
# evidence, and collect one wide row per target in `all_dataframe_rows`.
association_score_base_url = "https://platform-api.opentargets.io/v3/platform/public/association/filter?"

text_mining_evidence_base_url = "https://platform-api.opentargets.io/v3/platform/public/evidence/filter?"

# requests has no default timeout; without one a stalled connection would hang
# the cell indefinitely.
api_timeout_seconds = 30

all_dataframe_rows = []

for target in target_list:

    target_id = target["target_id"]
    dataframe_entry = {"target_id": target_id}

    for disease in disease_list:

        disease_id = disease["disease_id"]
        disease_label = disease["disease_label"]

        # --- Overall association score ---------------------------------------
        # Query parameters are passed as a dict so requests builds and encodes
        # the query string instead of it being concatenated by hand.
        association_params = {
            "target": target_id,
            "disease": disease_id,
            "facets": "false",
        }
        raw_api_response = r.get(
            association_score_base_url,
            params=association_params,
            timeout=api_timeout_seconds,
        ).json()
        # `.get` so that a payload without a "data" key is handled like an
        # empty result rather than raising KeyError.
        association_hits = raw_api_response.get("data") or []
        if association_hits:
            overall_association_score = round(association_hits[0]["association_score"]["overall"], 5)
        else:
            # No association returned for this pair: record a score of 0.
            overall_association_score = 0
        dataframe_entry[disease_label + "_association_score"] = overall_association_score

        # --- Text-mining (Europe PMC) evidence --------------------------------
        evidence_params = {
            "target": target_id,
            "disease": disease_id,
            "datasource": "europepmc",
            "size": 10000,
            "fields": "evidence.literature_ref.lit_id",
            "expandefo": "true",
        }
        raw_api_response_text_mining = r.get(
            text_mining_evidence_base_url,
            params=evidence_params,
            timeout=api_timeout_seconds,
        ).json()
        # NOTE(review): presumably `total` counts evidence strings rather than
        # distinct publications, and the PMID list below is capped by `size`,
        # so the two may disagree - confirm against the API documentation.
        dataframe_entry[disease_label + "_number_of_publications"] = raw_api_response_text_mining.get("total", 0)

        # `lit_id` is split on "/" and only the last segment is kept as the
        # publication identifier.
        all_pmids = [
            evidence_string["evidence"]["literature_ref"]["lit_id"].rsplit("/", 1)[-1]
            for evidence_string in raw_api_response_text_mining.get("data") or []
        ]
        dataframe_entry[disease_label + "_publications"] = all_pmids

    all_dataframe_rows.append(dataframe_entry)

In [None]:
# Turn the collected per-target dicts into a table (one row per dict) and
# preview the first five rows.
association_df = pd.DataFrame(data=all_dataframe_rows)
association_df.head(n=5)

## Step 4: Generate dataframe with known drug evidence

In [None]:
# For every target, query the Open Targets Platform REST API (v3) for ChEMBL
# drug evidence and record whether any drug was returned, plus the drug IDs.
drug_evidence_base_url = "https://platform-api.opentargets.io/v3/platform/public/evidence/filter?"

# requests has no default timeout; without one a stalled connection would hang
# the cell indefinitely.
api_timeout_seconds = 30

drug_evidence_df_rows = []

for target in target_list:

    target_id = target["target_id"]

    # Query parameters are passed as a dict so requests builds and encodes the
    # query string instead of it being concatenated by hand.
    drug_evidence_params = {
        "target": target_id,
        "size": 10000,
        "datasource": "chembl",
        "fields": "drug.id",
    }
    raw_api_response_drugs = r.get(
        drug_evidence_base_url,
        params=drug_evidence_params,
        timeout=api_timeout_seconds,
    ).json()

    # `drug.id` is split on "/" and only the last segment is kept. A set removes
    # duplicates; sorting makes the stored order reproducible between runs.
    # `.get` treats a payload without a "data" key like an empty result.
    drugs_list = sorted(
        {
            obj["drug"]["id"].rsplit("/", 1)[-1]
            for obj in raw_api_response_drugs.get("data") or []
        }
    )

    # Every row carries the same three keys (an empty list when no drug was
    # returned), so the resulting column always exists and always holds lists.
    drug_evidence_df_rows.append(
        {
            "target_id": target_id,
            "known_drug_in_clinic": bool(drugs_list),
            "drug_in_clinic_ids": drugs_list,
        }
    )

In [None]:
# Turn the collected per-target drug-evidence dicts into a table (one row per
# dict) and preview the first five rows.
drug_evidence_df = pd.DataFrame(data=drug_evidence_df_rows)
drug_evidence_df.head(n=5)

## Step 5: Generate dataframe with relevant tractability data

In [37]:
# Load the tab-separated tractability table; the path is relative to the
# notebook's working directory.
tract_data_path = "data/test_tract_data.txt"
all_tract_df = pd.read_csv(tract_data_path, sep="\t", encoding="utf-8")

In [39]:
# Preview the first ten rows of the tractability table.
all_tract_df.head(10)

Unnamed: 0,ensembl_gene_id,accession,symbol,Bucket_1,Bucket_2,Bucket_3,Bucket_4,Bucket_5,Bucket_6,Bucket_7,...,Bucket_sum_PROTAC,Top_bucket_PROTAC,Bcell_mean,NKcell_mean,Hepatocytes_mean,MouseNeuorons_mean,Max_halflife,number_of_ubiquitination_sites,full_id,PROTAC_location_Bucket
0,ENSG00000121879,P42336,PIK3CA,1,0,0,1,1,0,1,...,2,4,,,,44.374,44.374,,,1
1,ENSG00000142208,P31749,AKT1,0,1,0,1,1,0,1,...,3,4,,240.913,,43.3965,240.913,1.0,,1
2,ENSG00000134982,P25054,APC,0,0,0,1,0,0,1,...,2,5,,,,12.139,12.139,,,1
3,ENSG00000132646,P12004,PCNA,0,0,0,1,0,0,1,...,3,4,36.035,149.579,138.9365,64.5695,149.579,7.0,,1
4,ENSG00000133703,P01116,KRAS,0,0,0,1,0,0,1,...,3,4,50.542,,,59.582,59.582,2.0,,1
5,ENSG00000133895,O00255,MEN1,0,0,0,1,0,0,1,...,2,4,68.054,85.9365,,19.113,85.9365,,,1
6,ENSG00000091831,P03372,ESR1,1,0,0,1,1,0,1,...,1,9,,,,,-1.0,,,1
7,ENSG00000103126,O15169,AXIN1,0,0,0,1,0,0,0,...,1,9,,,,,-1.0,,,1


In [8]:
# Plain Python list of every column name in the tractability table.
tract_df_cols = all_tract_df.columns.tolist()

In [9]:
# Print the full list of column names, to pick from when subsetting the table.
print(tract_df_cols)

['ensembl_gene_id', 'accession', 'symbol', 'Bucket_1', 'Bucket_2', 'Bucket_3', 'Bucket_4', 'Bucket_5', 'Bucket_6', 'Bucket_7', 'Bucket_8', 'Bucket_sum', 'Top_bucket', 'Category', 'Clinical_Precedence', 'Discovery_Precedence', 'Predicted_Tractable', 'PDB_Known_Ligand', 'ensemble', 'High_Quality_ChEMBL_compounds', 'Small_Molecule_Druggable_Genome_Member', 'Bucket_1_ab', 'Bucket_2_ab', 'Bucket_3_ab', 'Bucket_4_ab', 'Bucket_5_ab', 'Bucket_6_ab', 'Bucket_7_ab', 'Bucket_8_ab', 'Bucket_9_ab', 'Bucket_sum_ab', 'Top_bucket_ab', 'Clinical_Precedence_ab', 'Predicted_Tractable__High_confidence', 'Predicted_Tractable__Medium_to_low_confidence', 'Category_ab', 'Uniprot_high_conf_loc', 'GO_high_conf_loc', 'Uniprot_med_conf_loc', 'GO_med_conf_loc', 'Transmembrane', 'Signal_peptide', 'HPA_main_location', 'Bucket_1_PROTAC', 'Bucket_2_PROTAC', 'Bucket_3_PROTAC', 'Bucket_4_PROTAC', 'Bucket_5_PROTAC', 'Bucket_6_PROTAC', 'Bucket_7_PROTAC', 'Bucket_8_PROTAC', 'Bucket_9_PROTAC', 'Bucket_sum_PROTAC', 'Top_buck

In [23]:
# Columns kept from the tractability table: the gene identifier followed by
# Bucket_4 through Bucket_8.
# Candidate columns currently left out: Top_bucket, PDB_Known_Ligand, ensemble,
# High_Quality_ChEMBL_compounds, Small_Molecule_Druggable_Genome_Member.
tract_req_cols = ["ensembl_gene_id"] + [
    "Bucket_" + str(bucket_number) for bucket_number in range(4, 9)
]

In [24]:
# Keep only the required columns. `filter(items=...)` silently skips any
# requested name that is not present in the table.
tract_subset_df = all_tract_df.filter(items=tract_req_cols, axis="columns")

In [25]:
# Preview the first five rows of the column-filtered tractability table.
tract_subset_df.head()

Unnamed: 0,ensembl_gene_id,Bucket_4,Bucket_5,Bucket_6,Bucket_7,Bucket_8
0,ENSG00000001626,1,1,0,1,1
1,ENSG00000002726,1,1,0,1,1
2,ENSG00000005844,1,1,0,1,1
3,ENSG00000010671,1,1,0,1,1
4,ENSG00000012504,1,1,0,1,1


In [36]:
# Keep only the tractability rows whose Ensembl gene ID belongs to one of the
# targets of interest. A single vectorised `isin` selection replaces a
# per-target lookup; targets that are absent from the tractability table simply
# contribute no row. Rows keep the table's own order and are re-indexed from 0.
target_ids_of_interest = [target["target_id"] for target in target_list]

new_df = tract_subset_df.loc[
    tract_subset_df["ensembl_gene_id"].isin(target_ids_of_interest)
].reset_index(drop=True)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [31]:
# Display the current contents of `new_df`.
new_df