In [1]:
pip install chembl_webresource_client


Note: you may need to restart the kernel to use updated packages.


In [88]:
import numpy as np
import pandas as pd
from time import time

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
%matplotlib inline

from chembl_webresource_client.new_client import new_client


Search for Target protein    (Scaffold analysis of ChEMBL data with pandas and RDKit )

In [89]:
target = new_client.target
target_query = target.search('aurora a')
targets = pd.DataFrame.from_dict(target_query)


In [90]:
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Xenopus laevis,Aurora kinase B-A,23.0,False,CHEMBL2176838,"[{'accession': 'Q6DE08', 'component_descriptio...",SINGLE PROTEIN,8355.0
1,[],Xenopus laevis,Aurora kinase A-A,23.0,False,CHEMBL5169182,"[{'accession': 'Q91820', 'component_descriptio...",SINGLE PROTEIN,8355.0
2,"[{'xref_id': 'O14965', 'xref_name': None, 'xre...",Homo sapiens,Serine/threonine-protein kinase Aurora-A,20.0,False,CHEMBL4722,"[{'accession': 'O14965', 'component_descriptio...",SINGLE PROTEIN,9606.0
3,"[{'xref_id': 'P97477', 'xref_name': None, 'xre...",Mus musculus,Serine/threonine-protein kinase Aurora-A,20.0,False,CHEMBL2211,"[{'accession': 'P97477', 'component_descriptio...",SINGLE PROTEIN,10090.0
4,[],Homo sapiens,Aurora kinase A/B,20.0,False,CHEMBL3883303,"[{'accession': 'Q96GD4', 'component_descriptio...",PROTEIN FAMILY,9606.0
...,...,...,...,...,...,...,...,...,...
6564,[],Homo sapiens,Histone-lysine N-methyltransferase PRDM7,0.0,False,CHEMBL5214861,"[{'accession': 'Q9NQW5', 'component_descriptio...",SINGLE PROTEIN,9606.0
6565,[],Homo sapiens,PR domain zinc finger protein 2,0.0,False,CHEMBL5214862,"[{'accession': 'Q13029', 'component_descriptio...",SINGLE PROTEIN,9606.0
6566,[],Homo sapiens,PR domain zinc finger protein 10,0.0,False,CHEMBL5214863,"[{'accession': 'Q9NQV6', 'component_descriptio...",SINGLE PROTEIN,9606.0
6567,[],Homo sapiens,PR domain zinc finger protein 8,0.0,False,CHEMBL5214864,"[{'accession': 'Q9NQV8', 'component_descriptio...",SINGLE PROTEIN,9606.0


In [91]:
##Select and retrieve bioactivity data for Human aurora a (third entry)

In [92]:
selected_target = targets.target_chembl_id[2]
selected_target

'CHEMBL4722'

In [93]:
#Here, we will retrieve only bioactivity data
# for Human aurora a (CHEMBL4722) that are reported as pChEMBL values.

In [94]:
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")
df = pd.DataFrame.from_dict(res)
#df.head(3)

In [95]:
df = df[df.standard_value.notna()]
df = df[df.canonical_smiles.notna()]
df.head(3)

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,1504915,[],CHEMBL827045,Inhibition of Aurora kinase A,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase Aurora-A,9606,,,IC50,uM,UO_0000065,,0.005
1,,,1504917,[],CHEMBL827045,Inhibition of Aurora kinase A,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase Aurora-A,9606,,,IC50,uM,UO_0000065,,0.041
2,,,1504919,[],CHEMBL827045,Inhibition of Aurora kinase A,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase Aurora-A,9606,,,IC50,uM,UO_0000065,,0.13


In [96]:
df.units.shape

(3656,)

In [97]:
df['units'].value_counts()

nM           1970
uM           1603
10'-6g/ml       6
mM              5
ug ml-1         4
10'-8M          3
10'-7g/ml       2
10'-5g/ml       2
µM              2
umol            1
M               1
10'-9M          1
10^-10M         1
Name: units, dtype: int64

In [98]:
df = df[(df['units'] == 'nM')]
len(df)

1970

In [99]:
#Store 'value' in a list 
fix = []

for i in df.value:
    fix.append(float(i))

In [100]:
# define active, inactive, intermediate compounds ------

STATUS = []

for i in fix:
    if i <=1000:
        STATUS.append("Active") #active
        
    elif i >=10000:
        STATUS.append("Inactive") #inactive
        
    else:
        STATUS.append("Intermediate") #intermediate

In [101]:
# replace new value from fix list
df = df.drop('value',1)
df['value'] = fix
df['STATUS'] = STATUS
df.head(3)

  df = df.drop('value',1)


Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value,STATUS
8,,,1504939,[],CHEMBL831065,Inhibitory activity against Aurora kinase A,B,,,BAO_0000190,...,Serine/threonine-protein kinase Aurora-A,9606,,,IC50,nM,UO_0000065,,27.0,Active
9,,,1507327,[],CHEMBL827052,Inhibition of Aurora kinase A (Aur1),B,,,BAO_0000190,...,Serine/threonine-protein kinase Aurora-A,9606,,,IC50,nM,UO_0000065,,3229.0,Intermediate
10,,,1507343,[],CHEMBL827052,Inhibition of Aurora kinase A (Aur1),B,,,BAO_0000190,...,Serine/threonine-protein kinase Aurora-A,9606,,,IC50,nM,UO_0000065,,10000.0,Inactive


In [102]:
df['STATUS'] = STATUS
df.head(3)

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value,STATUS
8,,,1504939,[],CHEMBL831065,Inhibitory activity against Aurora kinase A,B,,,BAO_0000190,...,Serine/threonine-protein kinase Aurora-A,9606,,,IC50,nM,UO_0000065,,27.0,Active
9,,,1507327,[],CHEMBL827052,Inhibition of Aurora kinase A (Aur1),B,,,BAO_0000190,...,Serine/threonine-protein kinase Aurora-A,9606,,,IC50,nM,UO_0000065,,3229.0,Intermediate
10,,,1507343,[],CHEMBL827052,Inhibition of Aurora kinase A (Aur1),B,,,BAO_0000190,...,Serine/threonine-protein kinase Aurora-A,9606,,,IC50,nM,UO_0000065,,10000.0,Inactive


In [103]:
df['assay_description'].value_counts()

Enzyme Assay: The Aurora assays described here are performed on two Caliper Life Sciences systems: the LC3000 and the Desktop Profiler. These provide data on enzyme activity via measurement of the relative amounts of phosphorylated or unphosphorylated fluorescently labelled substrate peptide at the end of an enzymatic reaction. These different states of peptide are resolved by applying a potential difference across the sample. The presence of the charged phosphate group on the product (as opposed to the substrate) causes a different peptide mobility between the two peptides. This is visualized by excitation of the fluorescent label on the substrate and product peptides and represented as peaks within the analysis software.    634
Inhibition of Aurora A                                                                                                                                                                                                                                              

In [104]:
#df = df.rename(columns={'molcule_chembl_id': 'chemblId'})

In [105]:
#df

In [106]:
len(df), len(df['molecule_chembl_id'].unique())

(1970, 1715)

In [107]:
df.tail(2)

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value,STATUS
3722,,,24958542,"[{'comments': None, 'relation': None, 'result_...",CHEMBL5212975,Selectivity interaction (Enzymatic assay) EUB0...,B,,,BAO_0000190,...,Serine/threonine-protein kinase Aurora-A,9606,,,IC50,nM,UO_0000065,,19.4,Active
3724,"{'action_type': 'INHIBITOR', 'description': 'N...",,24959160,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5215249,Inhibition of recombinant GST Aurora A kinase ...,B,,,BAO_0000190,...,Serine/threonine-protein kinase Aurora-A,9606,,,IC50,nM,UO_0000065,,22.0,Active


In [108]:
# clean smiles

from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem import MolFromSmiles,MolToSmiles

def clean_smiles (ListSMILEs):
    remover = SaltRemover()
    len(remover.salts)

    SMILES_desalt = []

    for i in ListSMILEs:
        mol = MolFromSmiles(i) 
        mol_desalt = remover.StripMol(mol)
        mol_SMILES = MolToSmiles(mol_desalt)
        SMILES_desalt.append(mol_SMILES)
    return SMILES_desalt

In [109]:
df['SMILES_desalt'] = clean_smiles(df.canonical_smiles)

In [110]:
df= df.drop_duplicates(\
                    subset='SMILES_desalt', keep='last')

print ("RAW data of " + str(len(df)) + \
          " SMILES has been reduced to " \
          + str(len(df)) + " SMILES.")

RAW data of 1707 SMILES has been reduced to 1707 SMILES.


In [111]:
selection = ['molecule_chembl_id','canonical_smiles','standard_value','SMILES_desalt','STATUS']
df = df[selection]
df

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,SMILES_desalt,STATUS
9,CHEMBL425904,CC(CN(C)C)Oc1ccc(Cl)cc1NC(=O)Nc1cnc(C#N)cn1,3229.0,CC(CN(C)C)Oc1ccc(Cl)cc1NC(=O)Nc1cnc(C#N)cn1,Intermediate
10,CHEMBL192161,CC1(COc2ccc(Cl)cc2NC(=O)Nc2cnc(C#N)cn2)COC1,10000.0,CC1(COc2ccc(Cl)cc2NC(=O)Nc2cnc(C#N)cn2)COC1,Inactive
68,CHEMBL204318,CCCCC(=O)Nc1ncc(Nc2ncnc3cc(OCCCN4CCOCC4)c(OC)c...,17.0,CCCCC(=O)Nc1ncc(Nc2ncnc3cc(OCCCN4CCOCC4)c(OC)c...,Active
69,CHEMBL203507,COc1cc2c(Nc3cnc(NC(=O)c4ccncc4)nc3)ncnc2cc1OCC...,690.0,COc1cc2c(Nc3cnc(NC(=O)c4ccncc4)nc3)ncnc2cc1OCC...,Active
70,CHEMBL206374,CCCN(CCC)S(=O)(=O)c1ccc(C(=O)Nc2ncc(Nc3ncnc4cc...,3900.0,CCCN(CCC)S(=O)(=O)c1ccc(C(=O)Nc2ncc(Nc3ncnc4cc...,Intermediate
...,...,...,...,...,...
3719,CHEMBL514499,COc1cc2ncn(-c3cc(OCc4ccccc4C(F)(F)F)c(C(N)=O)s...,4800.0,COc1cc2ncn(-c3cc(OCc4ccccc4C(F)(F)F)c(C(N)=O)s...,Intermediate
3720,CHEMBL3808844,CCOc1cn(-c2ccc(F)cc2C)nc1C(=O)Nc1ccc(Oc2ccnc3c...,36.0,CCOc1cn(-c2ccc(F)cc2C)nc1C(=O)Nc1ccc(Oc2ccnc3c...,Active
3721,CHEMBL3990456,Cn1cc(-c2[nH]c3cc(NC(=O)[C@H](N)C4CCCCC4)cc4c3...,23.0,Cn1cc(-c2[nH]c3cc(NC(=O)[C@H](N)C4CCCCC4)cc4c3...,Active
3722,CHEMBL4554938,CN1C(=O)c2sccc2N(C)c2nc(Nc3ccc(S(N)(=O)=O)cc3)...,19.4,CN1C(=O)c2sccc2N(C)c2nc(Nc3ccc(S(N)(=O)=O)cc3)...,Active


In [112]:
len(df['molecule_chembl_id'].unique())

1707

In [113]:
df.to_csv('~/Documents/DRUG-DISCOVERY/QSAR-modeling/DataModel/Aurora.csv', sep=',' ,index=False)

In [114]:
len(df)

1707

In [115]:
# save DataModel folder
df.to_csv   ('~/Documents/DRUG-DISCOVERY/QSAR-modeling/DataModel/Train_aurora.csv'   , sep=',' ,index=False)

In [116]:
Train_smiles = df[['SMILES_desalt','molecule_chembl_id']]
Train_smiles.to_csv('~/Documents/DRUG-DISCOVERY/QSAR-modeling/Smiles/Train_aurora.smi', sep='\t' ,header=False ,index=False)
