In [1]:
import pandas as pd
import numpy as np
from functools import reduce
import requests as r

### Import and process list provided by Denise

In [2]:
# Load the DCS list (one Ensembl gene ID per line, no header row) and tag
# every row as belonging to this source.
dcs_targets = pd.read_csv(
    'data/dcs_target_file.txt',
    header=None,
    names=['ensembl_gene_id'],
).assign(dcs_target_list='Yes')

In [3]:
# Preview the first rows to confirm the file parsed into the expected columns.
dcs_targets.head()

Unnamed: 0,ensembl_gene_id,dcs_target_list
0,ENSG00000003400,Yes
1,ENSG00000004468,Yes
2,ENSG00000006071,Yes
3,ENSG00000006606,Yes
4,ENSG00000006638,Yes


In [4]:
# Report the row count on either side of exact-duplicate removal.
dcs_rows_before = len(dcs_targets)
print('Number of entries before drop_duplicates: %i' % dcs_rows_before)
dcs_targets = dcs_targets.drop_duplicates()
print('Number of entries after drop_duplicates: %i' % len(dcs_targets))

Number of entries before drop_duplicates: 445
Number of entries after drop_duplicates: 445


### Import and process list generated by Prudence

In [5]:
# Load only the gene ID column of the experimental toxicity table and tag
# every row as belonging to this source.
pm_targets = pd.read_csv(
    'data/experimental_toxicity.tsv',
    sep='\t',
    usecols=['ensembl_gene_id'],
).assign(pm_target_list='Yes')

In [6]:
# Preview the first rows; only the gene ID column was read, so the same ID can appear on several rows.
pm_targets.head()

Unnamed: 0,ensembl_gene_id,pm_target_list
0,ENSG00000091831,Yes
1,ENSG00000141510,Yes
2,ENSG00000120738,Yes
3,ENSG00000091831,Yes
4,ENSG00000169856,Yes


In [7]:
# Report the row count on either side of exact-duplicate removal.
pm_rows_before = len(pm_targets)
print('Number of entries before drop_duplicates: %i' % pm_rows_before)
pm_targets = pm_targets.drop_duplicates()
print('Number of entries after drop_duplicates: %i' % len(pm_targets))

Number of entries before drop_duplicates: 480
Number of entries after drop_duplicates: 305


### Generate data frame to map HGNC symbols to Ensembl IDs

In [8]:
# Tab-separated HGNC mapping file; find_ensembl_id matches symbols against its 'Approved symbol' column.
mapping_df = pd.read_csv('data/hgnc_ensembl_mapping_file.txt', sep='\t')

In [9]:
# Preview the mapping table; rows may lack an Ensembl gene ID (shown as NaN).
mapping_df.head()

Unnamed: 0,HGNC ID,Approved symbol,RefSeq IDs,Ensembl gene ID
0,HGNC:5,A1BG,NM_130786,ENSG00000121410
1,HGNC:37133,A1BG-AS1,NR_015380,ENSG00000268895
2,HGNC:24086,A1CF,NM_014576,ENSG00000148584
3,HGNC:6,A1S9T,,
4,HGNC:7,A2M,NM_000014,ENSG00000175899


In [10]:
def find_ensembl_id(hgnc_symbol, mapping=None):
    """Return the Ensembl gene ID recorded for an approved HGNC symbol.

    Parameters
    ----------
    hgnc_symbol : str
        Value to match exactly against the 'Approved symbol' column.
    mapping : pandas.DataFrame, optional
        Table with 'Approved symbol' and 'Ensembl gene ID' columns.
        Defaults to the notebook-level ``mapping_df``.

    Returns
    -------
    The 'Ensembl gene ID' value of the first matching row. This is a
    missing value (NaN/None) when the symbol exists but the table holds
    no Ensembl ID for it.

    Raises
    ------
    IndexError
        If no row has ``hgnc_symbol`` as its approved symbol.
    """
    if mapping is None:
        mapping = mapping_df

    # Select by column name rather than position so a reordered mapping
    # file cannot silently return the wrong column.
    matches = mapping.loc[mapping['Approved symbol'] == hgnc_symbol, 'Ensembl gene ID']
    if matches.empty:
        # Same exception type as before, but the message names the symbol.
        raise IndexError(
            'No entry with approved symbol %r in the HGNC mapping table' % (hgnc_symbol,)
        )
    return matches.iloc[0]

### Import and process list of targets with safety risk information

In [11]:
# Load the 'Target' column, rename it to a source-specific symbol column and
# tag every row as belonging to the safety risk list.
safety_risk_targets = (
    pd.read_csv('data/safety_risk_information.tsv', sep='\t', usecols=['Target'])
    .rename(columns={'Target': 'safety_risk_symbol'})
    .assign(safety_risk_target_list='Yes')
)

In [12]:
# Resolve each symbol to an Ensembl gene ID with find_ensembl_id (one mapping_df lookup per row).
safety_risk_targets['ensembl_gene_id'] = safety_risk_targets['safety_risk_symbol'].apply(find_ensembl_id)

In [13]:
# Preview the first rows to check the symbol-to-Ensembl mapping.
safety_risk_targets.head()

Unnamed: 0,safety_risk_symbol,safety_risk_target_list,ensembl_gene_id
0,ABL1,Yes,ENSG00000097007
1,AKT1,Yes,ENSG00000142208
2,AKT2,Yes,ENSG00000105221
3,AKT3,Yes,ENSG00000117020
4,AURKA,Yes,ENSG00000087586


In [14]:
# Report the row count on either side of exact-duplicate removal.
safety_risk_rows_before = len(safety_risk_targets)
print('Number of entries before drop_duplicates: %i' % safety_risk_rows_before)
safety_risk_targets = safety_risk_targets.drop_duplicates()
print('Number of entries after drop_duplicates: %i' % len(safety_risk_targets))

Number of entries before drop_duplicates: 77
Number of entries after drop_duplicates: 73


### Import and process list of targets with adverse effects

In [15]:
# Load the 'Target' column, rename it to a source-specific symbol column and
# tag every row as belonging to the adverse effects list.
adverse_effects_targets = (
    pd.read_csv('data/adverse_effects.tsv', sep='\t', usecols=['Target'])
    .rename(columns={'Target': 'adverse_effects_symbol'})
    .assign(adverse_effects_target_list='Yes')
)

In [16]:
# Resolve each symbol to an Ensembl gene ID with find_ensembl_id (one mapping_df lookup per row).
adverse_effects_targets['ensembl_gene_id'] = adverse_effects_targets['adverse_effects_symbol'].apply(find_ensembl_id)

In [17]:
# Preview the first rows; the same target can appear on several rows before de-duplication.
adverse_effects_targets.head()

Unnamed: 0,adverse_effects_symbol,adverse_effects_target_list,ensembl_gene_id
0,ABCC8,Yes,ENSG00000006071
1,ABCC9,Yes,ENSG00000069431
2,ACE,Yes,ENSG00000159640
3,ACHE,Yes,ENSG00000087085
4,ACHE,Yes,ENSG00000087085


In [18]:
# Report the row count on either side of exact-duplicate removal.
adverse_effects_rows_before = len(adverse_effects_targets)
print('Number of entries before drop_duplicates: %i' % adverse_effects_rows_before)
adverse_effects_targets = adverse_effects_targets.drop_duplicates()
print('Number of entries after drop_duplicates: %i' % len(adverse_effects_targets))

Number of entries before drop_duplicates: 186
Number of entries after drop_duplicates: 122


### Create unified DataFrame from all safety data sources

In [19]:
# Frames to combine, in merge order; each carries an 'ensembl_gene_id' column.
list_of_dfs = [dcs_targets, pm_targets, adverse_effects_targets, safety_risk_targets]

# Fold the frames left to right with outer joins so that a gene present in
# any of the lists is kept in the result.
merged_df = list_of_dfs[0]
for next_frame in list_of_dfs[1:]:
    merged_df = merged_df.merge(next_frame, on='ensembl_gene_id', how='outer')

In [20]:
# Preview the merged frame; NaN marks a gene absent from that source list.
merged_df.head()

Unnamed: 0,ensembl_gene_id,dcs_target_list,pm_target_list,adverse_effects_symbol,adverse_effects_target_list,safety_risk_symbol,safety_risk_target_list
0,ENSG00000003400,Yes,Yes,,,,
1,ENSG00000004468,Yes,Yes,,,,
2,ENSG00000006071,Yes,,ABCC8,Yes,,
3,ENSG00000006606,Yes,Yes,,,,
4,ENSG00000006638,Yes,Yes,TBXA2R,Yes,,


In [21]:
# Row count of the merged frame.
merged_row_count = len(merged_df)
print('Number of entries in unified dataframe: %i' % merged_row_count)

Number of entries in unified dataframe: 446


In [22]:
# Per-column check: True where the column contains at least one missing value.
merged_df.isna().any()

ensembl_gene_id                False
dcs_target_list                 True
pm_target_list                  True
adverse_effects_symbol          True
adverse_effects_target_list     True
safety_risk_symbol              True
safety_risk_target_list         True
dtype: bool

In [23]:
# Replace the gaps left by the outer merge: membership flags become 'No' and
# missing symbols become 'n/a'.
#
# A single DataFrame.fillna with a per-column mapping is used instead of
# `merged_df[col].fillna(..., inplace=True)`. The latter is chained assignment
# on a column selection: pandas 2.x warns about it, and with Copy-on-Write
# enabled it fills a temporary object and leaves merged_df unchanged.
merged_df = merged_df.fillna({
    'dcs_target_list': 'No',
    'pm_target_list': 'No',
    'adverse_effects_target_list': 'No',
    'safety_risk_target_list': 'No',
    'adverse_effects_symbol': 'n/a',
    'safety_risk_symbol': 'n/a',
})

In [24]:
# Re-run the missing-value check after the fill; every column is expected to report False.
merged_df.isna().any()

ensembl_gene_id                False
dcs_target_list                False
pm_target_list                 False
adverse_effects_symbol         False
adverse_effects_target_list    False
safety_risk_symbol             False
safety_risk_target_list        False
dtype: bool

### Flag whether each target has safety data in the QC and production API target index responses

In [25]:
# Seconds to wait on a single API request before giving up, so one stalled
# connection cannot hang the whole per-row `.apply`.
_PLATFORM_API_TIMEOUT = 30


def _target_has_safety_info(target_base_url, gene_id):
    """Return 'Yes' if the target record at target_base_url + gene_id contains "safety", else 'No'.

    The response body is decoded as JSON and tested with ``"safety" in ...``.
    The HTTP status code is not inspected, so an error response that still
    carries a JSON body without that key is reported as 'No'.

    Raises whatever ``requests`` raises for connection problems or timeouts,
    and the JSON decoding error if the body is not valid JSON.
    """
    response = r.get(target_base_url + gene_id, timeout=_PLATFORM_API_TIMEOUT)
    return 'Yes' if "safety" in response.json() else 'No'


def platform_safety_info_qc(gene_id):
    """Return 'Yes'/'No' for whether the QC API target record for gene_id contains "safety"."""
    return _target_has_safety_info(
        'http://platform-api-qc.opentargets.io/v3/platform/private/target/',
        gene_id,
    )


def platform_safety_info_production(gene_id):
    """Return 'Yes'/'No' for whether the production API target record for gene_id contains "safety"."""
    return _target_has_safety_info(
        'https://platform-api.opentargets.io/v3/platform/private/target/',
        gene_id,
    )

In [26]:
# platform_safety_info_qc is applied row by row, so this cell issues one HTTP request per gene ID.
merged_df['on_platform_qc'] = merged_df['ensembl_gene_id'].apply(platform_safety_info_qc)

In [27]:
# platform_safety_info_production is applied row by row, so this cell issues one HTTP request per gene ID.
merged_df['on_platform_production'] = merged_df['ensembl_gene_id'].apply(platform_safety_info_production)

### Generate CSV report

In [28]:
# Write the comparison table to a CSV file (path is relative to the working directory); index=False omits the row index.
merged_df.to_csv('target_safety_list_comparison_report.csv', index=False)