**Author:** Benoît BAILLIF

**Purpose:** Process target data extracted from PubChem and the Drug Repurposing Hub

**Input:**
- data/raw/
 - repurposing_drugs_20180907.txt : find the tubulin inhibitors / inactive compounds
- data/processed/
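 - pert_id_to_pubchem_cid.csv : mapping each used pert_id to one or more corresponding PubChem CIDs
 - pubchem_bioactivity_matrix_2018.csv : target x PubChem CID table (matrix-like, including some target information)
 - cmp_info_cmap.csv : compound information table (pert_id, pert_iname, used_compound flag), used to map tubulin inhibitor names to pert_ids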
 
**Output:**
- data/processed/
 - used_pert_id_target_matrix.csv : final activity matrix to be used downstream, for used compounds
 - target_information.csv : table recording target information

In [1]:
import pandas as pd
import numpy as np

# Input

In [2]:
# Root folders for the raw inputs and for the processed inputs/outputs.
# Both are relative paths ending in '/', so file names can be appended with '+'.
raw_data_directory = 'data/raw/'
processed_data_directory = 'data/processed/'

In [38]:
# Locations of the four input files read by this notebook.
pubchem_bioactivity_matrix_path = processed_data_directory + 'pubchem_bioactivity_matrix_2018.csv'  # target x PubChem CID table
pert_id_pubchem_cid_map_path = processed_data_directory + 'pert_id_to_pubchem_cid.csv'  # pert_id / PubChem CID mapping
brd_repurposing_path = raw_data_directory + 'repurposing_drugs_20180907.txt'  # Drug Repurposing Hub annotations
cmp_info_cmap_path = processed_data_directory + 'cmp_info_cmap.csv'  # compound information table

# Output

In [4]:
# Destination of the target metadata table (gene symbol, description, UniProt accession).
target_information_path = processed_data_directory + "target_information.csv"

In [5]:
# Destination of the final pert_id x target activity matrix.
used_pert_id_target_matrix_path = processed_data_directory + "used_pert_id_target_matrix.csv"

# Main

## Create a binary activity matrix with unique pert_id and target gene symbol

In [6]:
# Load the target x PubChem CID table: a few target metadata columns followed by
# one column per CID (see the preview in the next cell).
pubchem_bioactivity_matrix = pd.read_csv(pubchem_bioactivity_matrix_path)

In [7]:
# Sanity check: dimensions of the loaded table, then a preview of its first rows.
print(pubchem_bioactivity_matrix.shape)
pubchem_bioactivity_matrix.head()

(1625, 19657)


Unnamed: 0.1,Unnamed: 0,description,uniprotswissprot,external_gene_name,2853908,44825297,107867,2381,16746329,3182,...,568763,405012,3594,114750,3792,107926,36687767,121928,4006,5311128
0,47.0,"arachidonate 15-lipoxygenase, type B [Source:H...",O15296,ALOX15B,2,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
1,365.0,arachidonate 15-lipoxygenase [Source:HGNC Symb...,P16050,ALOX15,2,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
2,195.0,tumor protein p53 [Source:HGNC Symbol;Acc:HGNC...,P04637,TP53,2,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
3,388.0,"arachidonate 12-lipoxygenase, 12S type [Source...",P18054,ALOX12,2,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
4,801.0,"growth factor, augmenter of liver regeneration...",P55789,GFER,2,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Target metadata table: one row per distinct combination of gene symbol,
# description and UniProt accession found in the bioactivity table.
target_information = (
    pubchem_bioactivity_matrix
    .loc[:, ['external_gene_name', 'description', 'uniprotswissprot']]
    .drop_duplicates()
)
target_information.head()

Unnamed: 0,external_gene_name,description,uniprotswissprot
0,ALOX15B,"arachidonate 15-lipoxygenase, type B [Source:H...",O15296
1,ALOX15,arachidonate 15-lipoxygenase [Source:HGNC Symb...,P16050
2,TP53,tumor protein p53 [Source:HGNC Symbol;Acc:HGNC...,P04637
3,ALOX12,"arachidonate 12-lipoxygenase, 12S type [Source...",P18054
4,GFER,"growth factor, augmenter of liver regeneration...",P55789


In [9]:
# Persist the target metadata table; index=False leaves the row index out of the file.
target_information.to_csv(target_information_path, index=False)

In [10]:
# Load the pert_id / PubChem CID mapping, then show its dimensions and first rows.
# Its 'pubchem_cid' column is the merge key used further down.
pert_id_pubchem_cid_map = pd.read_csv(pert_id_pubchem_cid_map_path)
print(pert_id_pubchem_cid_map.shape)
pert_id_pubchem_cid_map.head()

(10015, 2)


Unnamed: 0,pert_id,pubchem_cid
0,BRD-A00100033,6842999
1,BRD-A00267231,4043357
2,BRD-A00420644,2853908
3,BRD-A00474148,44825297
4,BRD-A00520476,107867


In [11]:
# Strip the metadata columns so that only the gene symbol and the per-CID
# activity columns remain.
target_compound_matrix = pubchem_bioactivity_matrix.drop(
    columns=['description', 'uniprotswissprot', 'Unnamed: 0']
)

In [12]:
# Merge rows that share a gene symbol (different target ids can map to the same
# symbol) by keeping the column-wise maximum code for each CID.
# NOTE(review): given the recoding applied later (0 -> NaN, 1 -> 0, 2 -> 1), the
# codes presumably mean 0 = no data, 1 = inactive, 2 = active, so max() lets
# "active" win over "inactive" over "no data" — confirm against the upstream notebook.
target_compound_matrix = target_compound_matrix.groupby('external_gene_name').max()

In [86]:
# Flip the table to compounds x targets, expose the CID index as an integer
# 'pubchem_cid' column, and attach the matching pert_id(s) through an inner
# merge (CIDs without a pert_id are dropped).
compound_target_matrix = (
    target_compound_matrix
    .transpose()
    .reset_index(drop=False)
    .rename(columns={'index' : 'pubchem_cid'})
    .astype({'pubchem_cid' : int})
    .merge(pert_id_pubchem_cid_map, on='pubchem_cid')
)
# The CID has served its purpose as join key; pert_id identifies compounds from here on.
used_pert_id_target_matrix = compound_target_matrix.drop(columns='pubchem_cid')

In [87]:
# Collapse rows sharing a pert_id (the mapping may link one pert_id to several
# CIDs) into a single row, keeping the maximum code per target.
# pert_id becomes the index of the resulting frame.
used_pert_id_target_matrix = used_pert_id_target_matrix.groupby('pert_id').max()

In [88]:
# Recode the activity flags (0 -> missing, 1 -> 0, 2 -> 1), then discard every
# compound (row) and every target (column) left without a single value.
# Caution: the result is stored under the same name, so running this cell a
# second time applies the recoding again to already-recoded values.
used_pert_id_target_matrix = (
    used_pert_id_target_matrix
    .replace({0 : np.nan, 1 : 0, 2 : 1})
    .dropna(axis='index', how='all')
    .dropna(axis='columns', how='all')
)

In [89]:
# Preview the recoded matrix: NaN marks a missing value, 0.0 / 1.0 are the recoded flags.
used_pert_id_target_matrix.head()

Unnamed: 0_level_0,AAK1,ABAT,ABCA1,ABCB1,ABCB11,ABCB6,ABCC1,ABCC2,ABCC3,ABCC5,...,WEE2,WNT3A,WRN,XDH,XIAP,XPO1,YES1,YWHAB,YWHAG,ZAP70
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BRD-A00100033,,,,,,,,,,,...,,,,,,,,,,
BRD-A00218260,,,,,,,,,,,...,,,,,,,,,,
BRD-A00267231,,,,,,,,,,,...,,,,,,,,,,
BRD-A00420644,,,,0.0,,0.0,0.0,,,,...,,,0.0,,,,,,0.0,
BRD-A00474148,,,,,,0.0,,,,,...,,,,,,,,,,


In [91]:
# Every non-missing cell counts as one annotation; every row is one compound.
n_annotated_compounds = len(used_pert_id_target_matrix.index)
n_annotations = used_pert_id_target_matrix.notna().sum().sum()
print('Total number of annotations for ', n_annotated_compounds, ' (DMSO excluded) : ', n_annotations, sep='')

# Reference figure from the earlier run, for comparison.
print('Previous result (2018) : 512406 annotations if DMSO included')

Total number of annotations for 7803 (DMSO excluded) : 517779
Previous result (2018) : 512406 annotations if DMSO included


## Add tubulin inhibitor information

In [93]:
# Load the Drug Repurposing Hub table (tab-separated).
# default utf-8 encoding does not work
# NOTE(review): skiprows=9 skips the first nine lines of the file — presumably a
# comment/metadata preamble before the header row; confirm against the raw file.
brd_repurposing_data = pd.read_csv(brd_repurposing_path, sep='\t', skiprows=9, encoding='latin_1')

In [94]:
# Preview the table; the 'moa', 'target' and 'pert_iname' columns are used below.
brd_repurposing_data.head()

Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication
0,"[sar9,met(o2)11]-substance-p",Preclinical,tachykinin antagonist,TACR1,,
1,A-1070722,Preclinical,glycogen synthase kinase inhibitor,GSK3A|GSK3B,,
2,A-1120,Preclinical,retinoid receptor ligand,RBP4,,
3,A-317491,Preclinical,purinergic receptor antagonist,P2RX3,,
4,A-33903,Phase 2,,,,


In [95]:
# Flag Repurposing Hub entries related to tubulin: either the mechanism of action
# matches the regex 'tubul.*inhibitor' or the target annotation contains 'TUBB'.
# Entries with a missing annotation count as non-matching (na=False).
is_tubulin_inhibitor = brd_repurposing_data['moa'].str.contains('tubul.*inhibitor', na=False, regex=True)
is_tubb_active = brd_repurposing_data['target'].str.contains('TUBB', na=False, regex=True)

# Names of all flagged compounds, as a NumPy array.
tubb_active_pert_inames = brd_repurposing_data.loc[is_tubb_active | is_tubulin_inhibitor, 'pert_iname'].values

In [96]:
# Load the compound information table; the following cells read columns named
# 'pert_iname', 'used_compound' and 'pert_id'.
cmp_info_cmap = pd.read_csv(cmp_info_cmap_path)

In [97]:
# Keep the used compounds whose name was flagged as tubulin-related.
# The compound table is the frame loaded from cmp_info_cmap_path, i.e.
# `cmp_info_cmap`; no variable called `cmp_info` is defined in this notebook, so
# referring to it fails with a NameError on a fresh kernel.
# Note: `is_tubb_active` is re-bound here to a mask over the compound table
# (it previously indexed the Repurposing Hub table).
is_tubb_active = cmp_info_cmap["pert_iname"].isin(tubb_active_pert_inames)
is_used_compound = cmp_info_cmap['used_compound'] == 1
tubb_cmp_info = cmp_info_cmap[is_tubb_active & is_used_compound]

In [98]:
# pert_ids of the selected tubulin-related compounds, as a NumPy array.
tubb_active_pert_ids = tubb_cmp_info['pert_id'].values

In [99]:
# Mark every selected tubulin-related compound as active (1) on TUBB.
# Assigning through .loc with a pert_id that is not yet in the index appends a
# new row whose other targets are NaN — this is why the row count grows from
# 7803 to 7825 in the recorded outputs. For a pert_id already present, any
# existing TUBB value is overwritten with 1.
for pert_id in tubb_active_pert_ids :
    used_pert_id_target_matrix.loc[pert_id,'TUBB'] = 1

In [100]:
# Recount after the TUBB additions: non-missing cells are annotations, rows are compounds.
n_annotated_compounds = len(used_pert_id_target_matrix.index)
n_annotations = used_pert_id_target_matrix.notna().sum().sum()
print('Total number of annotations for ', n_annotated_compounds, ' (DMSO excluded, and adding tubulin inhibitors) : ', n_annotations, sep='')

# Reference figure from the earlier run, for comparison.
print('Previous result (2018) : 512406 annotations (DMSO included)')

Total number of annotations for 7825 (DMSO excluded, and adding tubulin inhibitors) : 517825
Previous result (2018) : 512406 annotations (DMSO included)


In [101]:
# Write the final matrix. The index is kept (to_csv default), so the pert_id
# labels end up as the first column of the file.
used_pert_id_target_matrix.to_csv(used_pert_id_target_matrix_path)