In [1]:
import sys
import math
from pathlib import Path
from zipfile import ZipFile
from tempfile import TemporaryDirectory
import numpy as np
import pandas as pd
from rdkit.Chem import PandasTools

In [2]:
from chembl_webresource_client.new_client import new_client

In [3]:
## display all results and not only last result
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"


In [4]:
HERE = Path(_dh[-1])
DATA = HERE / "data"

In [5]:
# checking variables
HERE
DATA

PosixPath('/home/mandar/git_repos/TeachopenCADD_Excercises/T001_Compound_data_acquisition_chEMBL')

PosixPath('/home/mandar/git_repos/TeachopenCADD_Excercises/T001_Compound_data_acquisition_chEMBL/data')

In [6]:
targets_api = new_client.target
compounds_api = new_client.molecule
bioactivities_api = new_client.activity

In [7]:
type(targets_api)

chembl_webresource_client.query_set.QuerySet

In [8]:
## import target protein based on UniProt ID , example :EGFR kinase P00533
uniprot_id = "P00533"

In [9]:
targets = targets_api.get(targets_component__accession=uniprot_id)
#targets has too much information. so we choose headers of interest.
targets

[{'cross_references': [{'xref_id': 'O43451', 'xref_name': None, 'xref_src': 'canSAR-Target'}, {'xref_id': 'Maltase-glucoamylase', 'xref_name': None, 'xref_src': 'Wikipedia'}], 'organism': 'Homo sapiens', 'pref_name': 'Maltase-glucoamylase', 'species_group_flag': False, 'target_chembl_id': 'CHEMBL2074', 'target_components': [{'accession': 'O43451', 'component_description': 'Maltase-glucoamylase, intestinal', 'component_id': 434, 'component_type': 'PROTEIN', 'relationship': 'SINGLE PROTEIN', 'target_component_synonyms': [{'component_synonym': '3.2.1.20', 'syn_type': 'EC_NUMBER'}, {'component_synonym': '3.2.1.3', 'syn_type': 'EC_NUMBER'}, {'component_synonym': 'Alpha-glucosidase', 'syn_type': 'UNIPROT'}, {'component_synonym': 'Glucan 1,4-alpha-glucosidase', 'syn_type': 'UNIPROT'}, {'component_synonym': 'Glucoamylase', 'syn_type': 'UNIPROT'}, {'component_synonym': 'Maltase', 'syn_type': 'UNIPROT'}, {'component_synonym': 'Maltase-glucoamylase, intestinal', 'syn_type': 'UNIPROT'}, {'componen

In [10]:
targets = targets_api.get(target_components__accession=uniprot_id).only(
    "target_chembl_id", "organism", "pref_name", "target_type"
)
print(f'The type of the targets is "{type(targets)}"')
targets

The type of the targets is "<class 'chembl_webresource_client.query_set.QuerySet'>"


[{'organism': 'Homo sapiens', 'pref_name': 'Epidermal growth factor receptor erbB1', 'target_chembl_id': 'CHEMBL203', 'target_type': 'SINGLE PROTEIN'}, {'organism': 'Homo sapiens', 'pref_name': 'Epidermal growth factor receptor and ErbB2 (HER1 and HER2)', 'target_chembl_id': 'CHEMBL2111431', 'target_type': 'PROTEIN FAMILY'}, {'organism': 'Homo sapiens', 'pref_name': 'Epidermal growth factor receptor', 'target_chembl_id': 'CHEMBL2363049', 'target_type': 'PROTEIN FAMILY'}, {'organism': 'Homo sapiens', 'pref_name': 'MER intracellular domain/EGFR extracellular domain chimera', 'target_chembl_id': 'CHEMBL3137284', 'target_type': 'CHIMERIC PROTEIN'}]

In [11]:
# converting to Pandas dataframe for better understanding.
targets = pd.DataFrame.from_records(targets)
targets

Unnamed: 0,organism,pref_name,target_chembl_id,target_type
0,Homo sapiens,Epidermal growth factor receptor erbB1,CHEMBL203,SINGLE PROTEIN
1,Homo sapiens,Epidermal growth factor receptor erbB1,CHEMBL203,SINGLE PROTEIN
2,Homo sapiens,Epidermal growth factor receptor and ErbB2 (HE...,CHEMBL2111431,PROTEIN FAMILY
3,Homo sapiens,Epidermal growth factor receptor,CHEMBL2363049,PROTEIN FAMILY
4,Homo sapiens,MER intracellular domain/EGFR extracellular do...,CHEMBL3137284,CHIMERIC PROTEIN


In [12]:
## chosing target id
target=targets.iloc[0]
target

organism                                      Homo sapiens
pref_name           Epidermal growth factor receptor erbB1
target_chembl_id                                 CHEMBL203
target_type                                 SINGLE PROTEIN
Name: 0, dtype: object

In [13]:
chembl_id = target.target_chembl_id
chembl_id
print(f"The target ChEMBL ID is {chembl_id}")

'CHEMBL203'

The target ChEMBL ID is CHEMBL203


In [14]:
bioactivities = bioactivities_api.filter(
    target_chembl_id=chembl_id, type="IC50", relation="=", assay_type="B"
).only(
    "activity_id",
    "assay_chembl_id",
    "assay_description",
    "assay_type",
    "molecule_chembl_id",
    "type",
    "standard_units",
    "relation",
    "standard_value",
    "target_chembl_id",
    "target_organism",
)

print(f"Length and type of bioactivities object: {len(bioactivities)}, {type(bioactivities)}")

Length and type of bioactivities object: 7177, <class 'chembl_webresource_client.query_set.QuerySet'>


In [15]:
print(f"Length and type of first element: {len(bioactivities[0])}, {type(bioactivities[0])}")
bioactivities[0]

Length and type of first element: 13, <class 'dict'>


{'activity_id': 32260,
 'assay_chembl_id': 'CHEMBL674637',
 'assay_description': 'Inhibitory activity towards tyrosine phosphorylation for the epidermal growth factor-receptor kinase',
 'assay_type': 'B',
 'molecule_chembl_id': 'CHEMBL68920',
 'relation': '=',
 'standard_units': 'nM',
 'standard_value': '41.0',
 'target_chembl_id': 'CHEMBL203',
 'target_organism': 'Homo sapiens',
 'type': 'IC50',
 'units': 'uM',
 'value': '0.041'}

## The step below takes longer time to fetch data and is expected. 
If you face error or there is no data in dataframe, run above two cells and then run cell below

In [16]:
bioactivities_df = pd.DataFrame.from_records(bioactivities)
print(f"DataFrame shape: {bioactivities_df.shape}")
bioactivities_df

DataFrame shape: (7178, 13)


Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,standard_units,standard_value,target_chembl_id,target_organism,type,units,value
0,32260,CHEMBL674637,Inhibitory activity towards tyrosine phosphory...,B,CHEMBL68920,=,nM,41.0,CHEMBL203,Homo sapiens,IC50,uM,0.041
1,32260,CHEMBL674637,Inhibitory activity towards tyrosine phosphory...,B,CHEMBL68920,=,nM,41.0,CHEMBL203,Homo sapiens,IC50,uM,0.041
2,32267,CHEMBL674637,Inhibitory activity towards tyrosine phosphory...,B,CHEMBL69960,=,nM,170.0,CHEMBL203,Homo sapiens,IC50,uM,0.17
3,32680,CHEMBL677833,In vitro inhibition of Epidermal growth factor...,B,CHEMBL137635,=,nM,9300.0,CHEMBL203,Homo sapiens,IC50,uM,9.3
4,32770,CHEMBL674643,Inhibitory concentration of EGF dependent auto...,B,CHEMBL306988,=,nM,500000.0,CHEMBL203,Homo sapiens,IC50,uM,500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7173,18798661,CHEMBL4274201,Inhibition of recombinant human EGFR L858R mut...,B,CHEMBL554,=,nM,37.55,CHEMBL203,Homo sapiens,IC50,nM,37.55
7174,18798662,CHEMBL4274201,Inhibition of recombinant human EGFR L858R mut...,B,CHEMBL939,=,nM,11.5,CHEMBL203,Homo sapiens,IC50,nM,11.5
7175,18798663,CHEMBL4274202,Inhibition of N-terminal GST-tagged recombinan...,B,CHEMBL4282506,=,nM,137.3,CHEMBL203,Homo sapiens,IC50,nM,137.3
7176,18798664,CHEMBL4274202,Inhibition of N-terminal GST-tagged recombinan...,B,CHEMBL554,=,nM,224.89,CHEMBL203,Homo sapiens,IC50,nM,224.89


In [17]:
bioactivities_df["units"].unique()

array(['uM', 'nM', 'pM', 'M', "10'3 uM", "10'1 ug/ml", 'ug ml-1',
       "10'-1microM", "10'1 uM", "10'-1 ug/ml", "10'-2 ug/ml", "10'2 uM",
       "10'-3 ug/ml", "10'-2microM", '/uM', "10'-6g/ml", 'mM', 'umol/L',
       'nmol/L', "10'-10M"], dtype=object)

In [18]:
bioactivities_df.drop(["units", "value"], axis=1, inplace=True)
bioactivities_df.head()

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,molecule_chembl_id,relation,standard_units,standard_value,target_chembl_id,target_organism,type
0,32260,CHEMBL674637,Inhibitory activity towards tyrosine phosphory...,B,CHEMBL68920,=,nM,41.0,CHEMBL203,Homo sapiens,IC50
1,32260,CHEMBL674637,Inhibitory activity towards tyrosine phosphory...,B,CHEMBL68920,=,nM,41.0,CHEMBL203,Homo sapiens,IC50
2,32267,CHEMBL674637,Inhibitory activity towards tyrosine phosphory...,B,CHEMBL69960,=,nM,170.0,CHEMBL203,Homo sapiens,IC50
3,32680,CHEMBL677833,In vitro inhibition of Epidermal growth factor...,B,CHEMBL137635,=,nM,9300.0,CHEMBL203,Homo sapiens,IC50
4,32770,CHEMBL674643,Inhibitory concentration of EGF dependent auto...,B,CHEMBL306988,=,nM,500000.0,CHEMBL203,Homo sapiens,IC50


In [19]:
with ZipFile(DATA / "chembl27_activities.npz.zip") as z, TemporaryDirectory() as tmpdir:
    z.extract("chembl27_activities.npz", tmpdir)
    with np.load(Path(tmpdir) / "chembl27_activities.npz") as f:
        bioactivity_ids_chembl_27 = set(f["activities"])

print(f"Number of bioactivity values in full ChEMBL 27 release: {len(bioactivity_ids_chembl_27)}")
# NBVAL_CHECK_OUTPUT


FileNotFoundError: [Errno 2] No such file or directory: '/home/mandar/git_repos/TeachopenCADD_Excercises/T001_Compound_data_acquisition_chEMBL/data/chembl27_activities.npz.zip'

In [None]:
DATA