# PubChem

In this notebook we took the structures from the PDBbind core set and looked up the UniProt IDs of their corresponding proteins. Next, we mapped the UniProt IDs to Gene IDs using [UniProt ID mapping](http://www.uniprot.org/uploadlists/) and found all matching PubChem Assay IDs. After downloading the data from [PubChem](https://pubchem.ncbi.nlm.nih.gov/), we cleaned it and compared it to data from [ChEMBL](https://www.ebi.ac.uk/chembl/). Finally, we saved the data to CSV files.

In [1]:
import os
import re
import requests
import numpy as np
import pandas as pd
from lxml import etree
from tqdm import tqdm_notebook
from contextlib import redirect_stderr

import PubChem as pubchem

## Loading PDBbind core-set information

#### Cluster ID

In [2]:
# Whitespace-separated index file: column 0 is stored as `pdb_id`, column 5 as
# `cluster_id`; everything after a '#' is treated as a comment.
# The separator is a raw string so that `\s` reaches the regex engine verbatim
# instead of being parsed as an (invalid) string escape sequence.
f = './refined-set/index/INDEX_core_cluster.2016'
clusters = pd.read_csv(f, sep=r'\s+', usecols=[0, 5],
                       names=['pdb_id', 'cluster_id'], comment='#')
clusters.head()

Unnamed: 0,pdb_id,cluster_id
0,1ps3,3
1,3dx1,3
2,3d4z,3
3,3dx2,3
4,3ejr,3


#### UniProt ID

In [3]:
# Whitespace-separated index file: column 0 is stored as `pdb_id`, column 2 as
# `uniprot_id`; everything after a '#' is treated as a comment.
# The separator is a raw string so that `\s` reaches the regex engine verbatim
# instead of being parsed as an (invalid) string escape sequence.
f = './refined-set/index/INDEX_general_PL_name.2016'
uniprot = pd.read_csv(f, sep=r'\s+', usecols=[0, 2],
                      names=['pdb_id', 'uniprot_id'], comment='#')
uniprot.head()

Unnamed: 0,pdb_id,uniprot_id
0,3eql,Q9Z9H6
1,1zyr,Q5SHR6
2,3dxj,Q5SHR6
3,4zh4,P0A7Z4
4,4zh3,P0A7Z4


#### Merge

In [4]:
# Inner join on the PDB ID: keeps only structures present in both tables.
data_pdb = pd.merge(clusters, uniprot, on='pdb_id')
data_pdb.head()

Unnamed: 0,pdb_id,cluster_id,uniprot_id
0,1ps3,3,Q24451
1,3dx1,3,Q24451
2,3d4z,3,Q24451
3,3dx2,3,Q24451
4,3ejr,3,Q24451


## ID mapping

#### From UniProt ID to Gene ID

In [5]:
# Unique UniProt IDs occurring in the merged core-set table.
uniprot_ids = set(data_pdb['uniprot_id'])

In [6]:
# Write the UniProt IDs as one space-separated line, ready to be pasted or
# uploaded into the UniProt ID-mapping tool.
# * `with` guarantees the file is closed even if the write fails.
# * Joining the IDs directly replaces the previous regex clean-up of
#   `str(set)`, which wrote the literal text "set()" for an empty set.
# * Sorting makes the file content independent of set iteration order.
with open('UniProt_IDs.txt', 'w') as text_file:
    text_file.write(' '.join(sorted(map(str, uniprot_ids))))

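The IDs written to `UniProt_IDs.txt` were mapped to Gene IDs with the [UniProt ID mapping](http://www.uniprot.org/uploadlists/) tool; the resulting table is read from `GeneName.txt` in the next cell.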

In [7]:
# Tab-separated mapping table; the preview shows the columns `From` and `To`.
mapping_gid = pd.read_csv(
    './GeneName.txt',
    delimiter='\t',
)
mapping_gid.head()

Unnamed: 0,From,To
0,P00519,25
1,P15207,24208
2,P19491,29627
3,P11309,5292
4,Q9Y233,10846


In [8]:
# De-duplicated Gene IDs from the mapping table.
gene_ids = list(set(mapping_gid['To'].tolist()))

## Download data
#### For every Gene ID find matching Assay ID(s) and then for every Assay ID download csv from PubChem

In [9]:
# path to store information
directory = './aid_files/'
os.system('mkdir -p %s' % directory)

0

In [10]:
# Download the assay table of every AID linked to each gene.
for gene_id in gene_ids:

    aids = pubchem.get_AIDs(gene_id)

    # Skip genes without any assay: an empty result would otherwise raise
    # IndexError on `aids[0]`.  A leading 'Status:' entry is also skipped --
    # presumably it marks a failed lookup; confirm in PubChem.py.
    if len(aids) == 0 or aids[0] == 'Status:':
        continue

    for aid in aids:
        pubchem.download_aid_csv(aid, directory)
        

In [11]:
# Build a long-format table with one row per (assay ID, gene ID) pair.
aid_list = []
gen_list = []

for gene_id in gene_ids:

    aids = pubchem.get_AIDs(gene_id)

    # Skip genes without any assay: an empty result would otherwise raise
    # IndexError on `aids[0]`.  A leading 'Status:' entry is also skipped --
    # presumably it marks a failed lookup; confirm in PubChem.py.
    if len(aids) == 0 or aids[0] == 'Status:':
        continue

    aid_list.extend(aids)
    gen_list.extend([gene_id] * len(aids))

gen_aid = pd.DataFrame({'aid': aid_list, 'gen': gen_list})
gen_aid.head()

Unnamed: 0,aid,gen
0,1433,3716
1,1982,3716
2,256646,3716
3,277462,3716
4,339778,3716


### Example

In [12]:
# Peek at one raw assay table; the first CSV column is used as the index.
pd.read_csv(
    './aid_files/1141064.csv',
    index_col=0,
)

Unnamed: 0_level_0,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,IC50,SEI,BEI,LE,LLE,IC50 activity comment,IC50 standard flag,IC50 qualifier,IC50 published value,IC50 standard value,IC50 data validity
PUBCHEM_RESULT_TAG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
RESULT_TYPE,,,,,,,FLOAT,FLOAT,FLOAT,FLOAT,FLOAT,STRING,INTEGER,STRING,FLOAT,FLOAT,STRING
RESULT_DESCR,,,,,,,IC50 PubChem standard value,Surface Efficiency Index(nM),Binding Efficiency Index(nM),Ligand Efficiency,Lipophilic Ligand Efficiency,IC50 activity comment,IC50 standard flag,IC50 qualifier,IC50 published value,IC50 standard value,IC50 data validity
RESULT_UNIT,,,,,,,MICROMOLAR,,,,,,,,MICROMOLAR,NANOMOLAR,
RESULT_IS_ACTIVE_CONCENTRATION,,,,,,,TRUE,,,,,,,,,,
1,194146212.0,56684144.0,Active,,,,4.1,4.43,13.13,0.25,5.23,,,=,4.1,4100,
2,242634058.0,57381960.0,Unspecified,,,,117,,,,,,,=,117,117000,Outside typical range


## Create new csv

In [13]:
# choosen bioactivity types
bioactivity_types = ['Activity', 'EC50', 'IC50', 'Inhibition',
                     'Ka', 'Kd', 'Ki', 'Km']

In [14]:
# redirect stderr to file
with open('stderr.log', 'w') as stderr, redirect_stderr(stderr):
    for gene_id in gene_ids:

        aids = list(gen_aid.loc[gen_aid['gen'] == gene_id]['aid'])
        pubchem.create_data_frame(gene_id, aids, bioactivity_types, directory='./pubchem/', overwrite=False)


In [15]:
# First five rows of one generated per-gene table.
pd.read_csv('./pubchem/pubchem_1017.csv').head(5)

Unnamed: 0,gene_id,pubchem_aid,pubchem_cid,pubchem_sid,bioactivity,qualifier,value,unit
0,1017.0,53532,9999648.0,103332847.0,Activity,=,9.0,%
1,1017.0,53532,10366534.0,103332947.0,Activity,=,25.0,%
2,1017.0,53532,10073694.0,103333238.0,Activity,=,2.0,%
3,1017.0,53698,9999648.0,103332847.0,Activity,=,11.0,%
4,1017.0,53698,10366534.0,103332947.0,Activity,=,30.0,%


## Data cleaning and filtering

In [16]:
def table_type_unit(gene_id, directory, all_act_type, all_units):
    """Count occurrences of bioactivity-type / unit pairs for one gene.

    Reads ``<directory>/pubchem_<gene_id>.csv`` and counts, for every
    requested bioactivity type and unit, the rows whose ``bioactivity`` and
    ``unit`` columns match.

    Parameters
    ----------
    gene_id : int or str
        Gene ID; used to build the file name ``pubchem_<gene_id>.csv``.

    directory : str
        Path to the folder containing the target csv.

    all_act_type : list of str
        List of bioactivity types which interest us.

    all_units : list of str
        List of units which interest us.

    Returns
    -------
    DataFrame with all_act_type as columns and all_units as indices.
    Values are numbers of occurrences of bioactivity-unit pairs.
    If the csv does not exist, an all-zero table of the same shape is
    returned, so that the per-gene tables can always be added up.
    """

    f = os.path.join(directory, 'pubchem_%s.csv' % gene_id)

    # A missing file contributes nothing.  Returning a zero table (rather
    # than falling through and returning None) keeps `sum([...])` over many
    # genes working.
    if not os.path.isfile(f):
        return pd.DataFrame(0, index=all_units, columns=all_act_type)

    data = pd.read_csv(f)
    count = {}
    for act_type in all_act_type:
        of_type = data['bioactivity'] == act_type
        # vectorised count of rows matching both the type and the unit
        count[act_type] = [int((of_type & (data['unit'] == unit)).sum())
                           for unit in all_units]

    return pd.DataFrame(count, index=all_units)
    

In [17]:
# Add up the per-gene count tables of the raw data.
type_unit = sum(
    table_type_unit(gene_id, 'pubchem', bioactivity_types, ['%', 'nM', 'uM', 'pM'])
    for gene_id in gene_ids
)
type_unit

Unnamed: 0,Activity,EC50,IC50,Inhibition,Ka,Kd,Ki,Km
%,3413,3,0,18092,0,0,0,0
nM,70,5240,40978,0,1,3560,19089,70
uM,86,0,0,0,80,0,0,0
pM,14,0,0,0,0,0,0,0


In [18]:
# Create the output folder once, up front, instead of spawning a shell
# command on every loop iteration.
os.makedirs('pubchem_cleaned', exist_ok=True)

for gene_id in gene_ids:

    data = pd.read_csv('./pubchem/pubchem_%s.csv' % gene_id)

    # drop rows with any missing value
    data = data.dropna(axis=0, how='any')

    # 1 uM = 1e3 nM and 1 pM = 1e-3 nM.  The previous factors `10e3` and
    # `1/10e3` evaluate to 1e4 and 1e-4, i.e. they were ten times off.
    # NOTE(review): assumes convert_unit multiplies the values by the given
    # factor and updates `data` in place -- confirm in PubChem.py.
    pubchem.convert_unit(data, 'uM', 'nM', 1e3)  # convert uM to nM
    pubchem.convert_unit(data, 'pM', 'nM', 1e-3)  # convert pM to nM

    # remove EC50 values with % as unit and Activity with nM as unit
    idx = data.index[((data['unit'] == 'nM') & (data['bioactivity'] == 'Activity')) |
                     ((data['bioactivity'] == 'EC50') & (data['unit'] == '%'))]
    data = data.drop(idx)

    # drop duplicates
    data = data.drop_duplicates()

    f = './pubchem_cleaned/pubchem_%s.csv' % gene_id
    data.to_csv(f, index=False)
    

In [19]:
# Add up the per-gene count tables of the cleaned data.
type_unit = sum(
    table_type_unit(gene_id, 'pubchem_cleaned', bioactivity_types, ['%', 'nM'])
    for gene_id in gene_ids
)
type_unit

Unnamed: 0,Activity,EC50,IC50,Inhibition,Ka,Kd,Ki,Km
%,3141,0,0,17697,0,0,0,0
nM,0,4752,35045,0,81,3189,16115,45


## Create cluster files

In [20]:
# Output folder for the per-cluster tables.  os.makedirs is portable, does not
# spawn a shell, and raises on real failures instead of returning an exit code.
os.makedirs('pubchem_cluster', exist_ok=True)

0

In [21]:
# For every cluster, stack the cleaned tables of all genes mapped to the
# cluster's UniProt IDs and write them to one csv.
#
# Loop-local names are deliberately distinct from the notebook-level
# `uniprot_ids` and `gene_ids`, which this cell used to overwrite; cells above
# that are re-run afterwards would otherwise see the wrong values.
for cluster in set(data_pdb['cluster_id'].tolist()):

    in_cluster = data_pdb['cluster_id'] == cluster
    cluster_uniprot_ids = set(data_pdb.loc[in_cluster, 'uniprot_id'])

    cluster_frames = []

    for uniprot_id in cluster_uniprot_ids:

        # Gene IDs this UniProt ID maps to (may be none)
        mapped_gene_ids = set(mapping_gid.loc[mapping_gid['From'] == uniprot_id, 'To'])

        for gene_id in mapped_gene_ids:

            data = pd.read_csv('./pubchem_cleaned/pubchem_%s.csv' % gene_id)
            # tag every row with the UniProt ID it was reached through
            cluster_frames.append(data.assign(uniprot_id=uniprot_id))

    # clusters without any mapped gene produce no file
    if cluster_frames:
        cluster_data = pd.concat(cluster_frames)
        cluster_data.to_csv('./pubchem_cluster/cluster_%s.csv' % cluster, index=False)


In [22]:
# First five rows of one generated per-cluster table.
pd.read_csv('./pubchem_cluster/cluster_1148.csv').head(5)

Unnamed: 0,gene_id,pubchem_aid,pubchem_cid,pubchem_sid,bioactivity,qualifier,value,unit,uniprot_id
0,23476.0,696221,71456689.0,163327176.0,Activity,=,20.0,%,O60885
1,23476.0,620195,5325760.0,134437304.0,IC50,=,84200.0,nM,O60885
2,23476.0,620195,44243549.0,134437305.0,IC50,=,24600.0,nM,O60885
3,23476.0,620195,54757749.0,134437306.0,IC50,=,23200.0,nM,O60885
4,23476.0,620195,54757752.0,134437307.0,IC50,=,7500.0,nM,O60885
