In [21]:
import os
import pandas as pd
import sqlite3

from pdbbind_metadata_processor import PDBBindMetadataProcessor

In [22]:
dude_targets = os.listdir('../DUD-E/all/')

In [23]:
pdbbind_table = PDBBindMetadataProcessor().get_master_dataframe(remove_peptide_ligands=False)

In [24]:
pdbbind_table.head()

Unnamed: 0,PDB code,resolution,release year_x,-logKd/Ki,Kd/Ki,reference,ligand name,activity_list,sep,value,units,release year_y,Uniprot ID,protein name,active
0,3zzf,2.2,2012,0.4,Ki=400mM //,3zzf.pdf,(NLG),"[mM, =, 400]",=,400000000.0,nM,2012,Q01217,ACETYLGLUTAMATE KINASE,False
1,3gww,2.46,2009,0.45,IC50=355mM //,3gwu.pdf,(SFX),"[mM, =, 355]",=,355000000.0,nM,2009,O67854,TRANSPORTER,False
2,1w8l,1.8,2004,0.49,Ki=320mM //,1w8l.pdf,(1P3),"[mM, =, 320]",=,320000000.0,nM,2004,P62937,PEPTIDYL-PROLYL CIS-TRANS ISOMERASE A,False
3,3fqa,2.35,2009,0.49,IC50=320mM //,3fq7.pdf,(GAB&PMP),"[mM, =, 320]",=,320000000.0,nM,2009,P24630,"GLUTAMATE-1-SEMIALDEHYDE 2,1-AMINOMUTASE",False
4,1zsb,2.0,1996,0.6,Kd=250mM //,1zsb.pdf,(AZM),"[mM, =, 250]",=,250000000.0,nM,1996,P00918,CARBONIC ANHYDRASE 2,False


In [25]:
# Read sqlite query results into a pandas DataFrame
con = sqlite3.connect("../ChEMBL/chembl_29_sqlite/chembl_29.db")
df = pd.read_sql_query("""SELECT * 
FROM target_dictionary a 
JOIN target_components b ON a.tid = b.tid
JOIN component_sequences c ON b.component_id = c.component_id
JOIN component_class d ON c.component_id = d.component_id
JOIN protein_classification e ON d.protein_class_id = e.protein_class_id
JOIN component_synonyms f ON c.component_id = f.component_id""", con)

con.close()

In [26]:
from chembl_connector import ChEMBLConnector

In [27]:
cc = ChEMBLConnector()
chembl_target_df = cc.get_target_table()
chembl_target_df.head()

Unnamed: 0,accession,component_synonym,protein_class_desc,level2,gene_symbol_lowercase
0,O09028,Gabrp,ion channel lgic gabaa,ion channel lgic,gabrp
1,P02708,CHRNA1,ion channel lgic ach chrn alpha,ion channel lgic,chrna1
2,P04637,TP53,transcription factor,transcription factor,tp53
3,P04757,Chrna3,ion channel lgic ach chrn alpha,ion channel lgic,chrna3
4,P05106,ITGB3,membrane receptor,membrane receptor,itgb3


In [28]:
merged_table = pdbbind_table.merge(chembl_target_df, left_on='Uniprot ID', right_on='accession')

In [31]:
pdbbind_genes = merged_table['gene_symbol_lowercase'].unique()

In [44]:
for gene in pdbbind_genes :
    if gene.endswith('a4') :
        print(merged_table[merged_table['gene_symbol_lowercase'] == gene]['protein name'].head(1))

4294    CARBOXYPEPTIDASE A4
Name: protein name, dtype: object
10498    EPHRIN TYPE-A RECEPTOR 4
Name: protein name, dtype: object
10812    CARBONIC ANHYDRASE 4
Name: protein name, dtype: object
10870    SODIUM-DEPENDENT SEROTONIN TRANSPORTER
Name: protein name, dtype: object
12555    GLUTAMATE RECEPTOR 4
Name: protein name, dtype: object
12707    TRANSCRIPTION ACTIVATOR BRG1
Name: protein name, dtype: object
12863    INTEGRIN ALPHA-4
Name: protein name, dtype: object
12990    NEURONAL ACETYLCHOLINE RECEPTOR SUBUNIT ALPHA-4
Name: protein name, dtype: object


In [35]:
dude_only = []
for gene in dude_targets :
    if gene not in pdbbind_genes :
        dude_only.append(gene)

In [40]:
dude_only

['cp3a4',
 'tryb1',
 'hs90a',
 'tgfr1',
 'cah2',
 'andr',
 'pa2ga',
 'pnph',
 'nos1',
 'pgh1',
 'hivint',
 'tysy',
 'glcm',
 'aces',
 'fkb1a',
 'sahh',
 'kpcb',
 'kith',
 'nram',
 'cp2c9',
 'try1',
 'mk14',
 'dhi1',
 'aldr',
 'fak1',
 'aofb',
 'inha',
 'hivrt',
 'lkha4',
 'mcr',
 'prgr',
 'drd3',
 'adrb1',
 'mapk2',
 'urok',
 'mp2k1',
 'gcr',
 'pgh2',
 'fa7',
 'thb',
 'mk01',
 'mk10',
 'hivpr',
 'ptn1',
 'ada17',
 'vgfr2',
 'hxk4',
 'pur2',
 'aa2ar',
 'dyr',
 'fa10',
 'pyrd',
 'hmdh',
 'ital',
 'reni']

In [39]:
from rdkit import Chem
import gzip
for gene in dude_only :
    active_filepath = f'../DUD-E/all/{gene}/actives_final.sdf.gz'
    with Chem.ForwardSDMolSupplier(gzip.open(active_filepath)) as sdm :
        mols = [mol for mol in sdm]
        print(len(mols))

363
171
125
281
835
523
127
233
234
251
211
311
313
664
273
190
248
132
222
183
758
915
519
220
114
168
71
639
244
193
444
877
458
206
306
242
563
531
185
168
139
186
1395
225
959
620
127
201
844
566
792
134
299
233
387


In [8]:
pdbbind_table[pdbbind_table['Uniprot ID'].str.contains('--')].shape

(354, 15)

In [9]:
chembl_target_df['protein_class_desc'].value_counts()

unclassified                                               1271
enzyme  hydrolase                                           915
enzyme  transferase                                         865
enzyme  reductase                                           676
enzyme                                                      675
                                                           ... 
enzyme  kinase  protein kinase  other  other-unique           1
membrane receptor  7tm1  peptide  glycohormone receptor       1
enzyme  cytochrome p450  cyp_3  cyp_3a  cyp_3a12              1
enzyme  kinase  protein kinase  other  ttk                    1
enzyme  cytochrome p450  cyp_2  cyp_2c  cyp_2c5               1
Name: protein_class_desc, Length: 670, dtype: int64

In [11]:
chembl_target_df['level2'].value_counts()

unclassified                                   1271
enzyme  hydrolase                               915
enzyme  transferase                             865
enzyme  kinase                                  748
membrane receptor  7tm1                         732
enzyme  reductase                               676
enzyme                                          675
enzyme  protease                                533
cytosolic other                                 502
ion channel  lgic                               288
transporter  electrochemical                    278
ion channel  vgc                                216
secreted                                        215
enzyme  lyase                                   186
membrane receptor                               172
transcription factor                            171
enzyme  isomerase                               149
transcription factor  nuclear receptor          140
enzyme  ligase                                  135
epigenetic r

In [13]:
df['component_synonym'] = df['component_synonym'].str.lower()

In [14]:
pdbbind_chembl = pdbbind_table.merge(df, left_on='Uniprot ID', right_on='accession')

In [15]:
pdbbind_chembl

Unnamed: 0,PDB code,resolution,release year_x,-logKd/Ki,Kd/Ki,reference,ligand name,activity_list,sep,value,...,parent_id,pref_name,short_name,protein_class_desc,definition,class_level,compsyn_id,component_id,component_synonym,syn_type
0,1w8l,1.80,2004,0.49,Ki=320mM //,1w8l.pdf,(1P3),"[mM, =, 320]",=,3.200000e+08,...,1,Isomerase,Isomerase,enzyme isomerase,A group of enzymes that catalyze the structura...,2,1311294,272,5.2.1.8,EC_NUMBER
1,1w8l,1.80,2004,0.49,Ki=320mM //,1w8l.pdf,(1P3),"[mM, =, 320]",=,3.200000e+08,...,1,Isomerase,Isomerase,enzyme isomerase,A group of enzymes that catalyze the structura...,2,863490,272,cypa,GENE_SYMBOL_OTHER
2,1w8l,1.80,2004,0.49,Ki=320mM //,1w8l.pdf,(1P3),"[mM, =, 320]",=,3.200000e+08,...,1,Isomerase,Isomerase,enzyme isomerase,A group of enzymes that catalyze the structura...,2,1311290,272,cyclophilin a,UNIPROT
3,1w8l,1.80,2004,0.49,Ki=320mM //,1w8l.pdf,(1P3),"[mM, =, 320]",=,3.200000e+08,...,1,Isomerase,Isomerase,enzyme isomerase,A group of enzymes that catalyze the structura...,2,1311291,272,cyclosporin a-binding protein,UNIPROT
4,1w8l,1.80,2004,0.49,Ki=320mM //,1w8l.pdf,(1P3),"[mM, =, 320]",=,3.200000e+08,...,1,Isomerase,Isomerase,enzyme isomerase,A group of enzymes that catalyze the structura...,2,1311286,272,ppia,GENE_SYMBOL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312403,5c1m,2.10,2015 1,0.80,Ki=16pM //,5c1m.pdf,(4VO),"[pM, =, 16]",=,1.600000e-02,...,1253,Opioid receptor,Opioid receptor,membrane receptor 7tm1 peptide short peptid...,Cell membrane proteins that bind opioids and t...,5,1325056,1188,mor-1,UNIPROT
312404,5c1m,2.10,2015 1,0.80,Ki=16pM //,5c1m.pdf,(4VO),"[pM, =, 16]",=,1.600000e-02,...,1253,Opioid receptor,Opioid receptor,membrane receptor 7tm1 peptide short peptid...,Cell membrane proteins that bind opioids and t...,5,885412,1188,mor,GENE_SYMBOL_OTHER
312405,5c1m,2.10,2015 1,0.80,Ki=16pM //,5c1m.pdf,(4VO),"[pM, =, 16]",=,1.600000e-02,...,1253,Opioid receptor,Opioid receptor,membrane receptor 7tm1 peptide short peptid...,Cell membrane proteins that bind opioids and t...,5,1325054,1188,mu-type opioid receptor,UNIPROT
312406,5c1m,2.10,2015 1,0.80,Ki=16pM //,5c1m.pdf,(4VO),"[pM, =, 16]",=,1.600000e-02,...,1253,Opioid receptor,Opioid receptor,membrane receptor 7tm1 peptide short peptid...,Cell membrane proteins that bind opioids and t...,5,885413,1188,oprm,GENE_SYMBOL_OTHER


In [16]:
chembl_targets_in_dude = df[df['component_synonym'].isin(dude_targets)]

In [17]:
pdbbind_in_dude = pdbbind_table.merge(chembl_targets_in_dude, left_on='Uniprot ID', right_on='accession')

In [18]:
pdbbind_in_dude = pdbbind_table[pdbbind_table['Uniprot ID'].isin(chembl_targets_in_dude['accession'])]

In [19]:
pdbbind_in_dude[pdbbind_in_dude['protein name'] == 'BETA-SECRETASE 1']

Unnamed: 0,PDB code,resolution,release year_x,-logKd/Ki,Kd/Ki,reference,ligand name,activity_list,sep,value,units,release year_y,Uniprot ID,protein name,active
301,4zsm,1.96,2015,2.40,IC50~4mM //,4zsm.pdf,(4RW),"[mM, ~, 4]",~,4000000.000,nM,2015,P56817,BETA-SECRETASE 1,False
441,2ohk,2.20,2007,2.70,IC50~2000uM //,2ohk.pdf,(1SQ),"[uM, ~, 2000]",~,2000000.000,nM,2007,P56817,BETA-SECRETASE 1,False
444,3bra,2.30,2008,2.70,Kd=2000uM //,3bra.pdf,(AEF),"[uM, =, 2000]",=,2000000.000,nM,2008,P56817,BETA-SECRETASE 1,False
451,3hvg,2.26,2009,2.70,IC50=2mM //,3hvg.pdf,(EV0),"[mM, =, 2]",=,2000000.000,nM,2009,P56817,BETA-SECRETASE 1,False
552,3udh,1.70,2012,2.85,Kd=1.4mM //,3udh.pdf,(091),"[mM, =, 1.4]",=,1400000.000,nM,2012,P56817,BETA-SECRETASE 1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18743,1m4h,2.10,2002,9.52,Ki=0.3nM //,1m4h.pdf,(7-mer),"[nM, =, 0.3]",=,0.300,nM,2002,P56817,BETA-SECRETASE 1,True
18750,2g94,1.86,2006,9.52,Ki=0.3nM //,2g94.pdf,(ZPQ),"[nM, =, 0.3]",=,0.300,nM,2006,P56817,BETA-SECRETASE 1,True
18768,4wtu,1.85,2015,9.52,IC50=0.3nM //,4wtu.pdf,(3UT),"[nM, =, 0.3]",=,0.300,nM,2015,P56817,BETA-SECRETASE 1,True
19211,1xn2,1.90,2005 1,0.52,Ki=0.03nM //,1xn2.pdf,(11-mer),"[nM, =, 0.03]",=,0.030,nM,2005,P56817,BETA-SECRETASE 1,True


In [20]:
dict(pdbbind_in_dude['protein name'].value_counts())

{'BETA-SECRETASE 1': 343,
 'CELL DIVISION PROTEIN KINASE 2': 167,
 'GLYCOGEN PHOSPHORYLASE': 93,
 'CYCLIN-DEPENDENT KINASE 2': 88,
 'DIPEPTIDYL PEPTIDASE 4': 85,
 'EPIDERMAL GROWTH FACTOR RECEPTOR': 78,
 'BETA-LACTAMASE': 75,
 'GLUTAMATE RECEPTOR 2': 70,
 'PEROXISOME PROLIFERATOR-ACTIVATED RECEPTOR GAMMA': 67,
 'TYROSINE-PROTEIN KINASE JAK2': 66,
 'TYROSINE-PROTEIN KINASE SRC': 64,
 'ESTROGEN RECEPTOR': 61,
 'HEPATOCYTE GROWTH FACTOR RECEPTOR': 56,
 'SERINE/THREONINE-PROTEIN KINASE B-RAF': 39,
 'KINESIN-LIKE PROTEIN KIF11': 35,
 'SERINE/THREONINE-PROTEIN KINASE PLK1': 34,
 'TYROSINE-PROTEIN KINASE ABL1': 32,
 'COLLAGENASE 3': 31,
 'CATECHOL O-METHYLTRANSFERASE': 31,
 'POLY [ADP-RIBOSE] POLYMERASE 1': 29,
 "CGMP-SPECIFIC 3',5'-CYCLIC PHOSPHODIESTERASE": 26,
 'GLUTAMATE RECEPTOR, IONOTROPIC KAINATE 1': 26,
 'E3 UBIQUITIN-PROTEIN LIGASE XIAP': 24,
 'RETINOIC ACID RECEPTOR RXR-ALPHA': 23,
 'ANGIOTENSIN-CONVERTING ENZYME': 22,
 'FIBROBLAST GROWTH FACTOR RECEPTOR 1': 21,
 'HISTONE DEACETYLAS