## **ChEMBL Database**


## **Installing libraries**

Install the ChEMBL web service package

In [None]:
! pip install chembl_webresource_client

Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.9-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m788.6 kB/s[0m eta [36m0:00:00[0m
Collecting requests-cache~=1.2 (from chembl_webresource_client)
  Downloading requests_cache-1.2.0-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting cattrs>=22.2 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading cattrs-23.2.3-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.5/57.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting url-normalize>=1.4 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading url_normalize-1.4.3-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: url-normalize, cattrs, requests-cache, chembl_webresource_client
Successfully installed cattrs-

## **Importing libraries**

In [None]:
# Import necessary libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client

## **Search for Target protein**

### **Target search for PLK1**

In [None]:
# Search ChEMBL targets for the query 'PLK1' and show the matches as a table
target = new_client.target
target_query = target.search('PLK1')
# Load the search results into a DataFrame so they can be inspected and indexed by row
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'P70032', 'xref_name': None, 'xre...",Xenopus laevis,Serine/threonine-protein kinase PLK1,28.0,False,CHEMBL4519,"[{'accession': 'P70032', 'component_descriptio...",SINGLE PROTEIN,8355
1,"[{'xref_id': 'P53350', 'xref_name': None, 'xre...",Homo sapiens,Serine/threonine-protein kinase PLK1,18.0,False,CHEMBL3024,"[{'accession': 'P53350', 'component_descriptio...",SINGLE PROTEIN,9606
2,[],Homo sapiens,Cereblon/Serine/threonine-protein kinase PLK1,18.0,False,CHEMBL4742280,"[{'accession': 'P53350', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
3,[],Homo sapiens,Mitotic interactor and substrate of PLK1,15.0,False,CHEMBL4295893,"[{'accession': 'Q8IVT2', 'component_descriptio...",SINGLE PROTEIN,9606


Download the bioactivity data for human PLK1 (row 1 of the table above), target_chembl_id: CHEMBL3024

In [None]:

# Pick the target ID from row 1 of the search results (CHEMBL3024, Homo sapiens
# PLK1 in the table displayed above). NOTE(review): this depends on the order of
# the search results — confirm the row if the query is re-run.
t1= targets.target_chembl_id[1]

In [None]:
# Query the IC50 activity records reported against the selected target.
# `t1` is a single ChEMBL ID string, so an exact-match filter is used; the
# `__in` lookup is meant for a list of IDs, and wrapping a string in
# parentheses does not turn it into a tuple.
activity = new_client.activity
res = activity.filter(target_chembl_id=t1).filter(standard_type="IC50")

In [None]:
# Convert the activity query results into a DataFrame
df = pd.DataFrame.from_dict(res)

In [None]:
# Preview the first three activity records
df.head(3)

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,750655,[],CHEMBL763908,Inhibition of PLK1 kinase,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase PLK1,9606,,,IC50,nM,UO_0000065,,10000.0
1,,,1662506,[],CHEMBL864552,Inhibitory activity against Plk1,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase PLK1,9606,,,IC50,uM,UO_0000065,,100.0
2,,,1662531,[],CHEMBL864552,Inhibitory activity against Plk1,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase PLK1,9606,,,IC50,uM,UO_0000065,,20.0


In [None]:
# Save the unfiltered IC50 records to CSV, leaving out the DataFrame index
df.to_csv('bioactivity_data_raw_IC50.csv', index=False)

## **Handling missing data**
Remove any compounds that have null/missing values in the "standard_value" column from the dataset. This ensures data completeness for IC50 analysis.

In [None]:
# Keep only the activity records that report a standard_value;
# rows where that column is missing are dropped.
df2 = df.dropna(subset=['standard_value'])
df2

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,750655,[],CHEMBL763908,Inhibition of PLK1 kinase,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase PLK1,9606,,,IC50,nM,UO_0000065,,10000.0
1,,,1662506,[],CHEMBL864552,Inhibitory activity against Plk1,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase PLK1,9606,,,IC50,uM,UO_0000065,,100.0
2,,,1662531,[],CHEMBL864552,Inhibitory activity against Plk1,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase PLK1,9606,,,IC50,uM,UO_0000065,,20.0
3,,,1662532,[],CHEMBL864552,Inhibitory activity against Plk1,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase PLK1,9606,,,IC50,uM,UO_0000065,,100.0
4,,,1662533,[],CHEMBL864552,Inhibitory activity against Plk1,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase PLK1,9606,,,IC50,uM,UO_0000065,,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1569,,,25097347,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5260350,Inhibition of human N-terminal His6-tagged rec...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase PLK1,9606,,,IC50,nM,UO_0000065,,25000.0
1570,,,25097348,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5260350,Inhibition of human N-terminal His6-tagged rec...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase PLK1,9606,,,IC50,nM,UO_0000065,,25000.0
1571,"{'action_type': 'INHIBITOR', 'description': 'N...",,25097349,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5260350,Inhibition of human N-terminal His6-tagged rec...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase PLK1,9606,,,IC50,nM,UO_0000065,,16.0
1572,"{'action_type': 'INHIBITOR', 'description': 'N...",,25097350,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5260350,Inhibition of human N-terminal His6-tagged rec...,B,,,BAO_0000190,...,Homo sapiens,Serine/threonine-protein kinase PLK1,9606,,,IC50,nM,UO_0000065,,2.5


In [None]:
# Save the records that have a standard_value to CSV, leaving out the DataFrame index
df2.to_csv('bioactivity_data_Potency2.csv', index=False)

## **Data pre-processing of the bioactivity data**

# Labeling compounds as either being active, inactive or intermediate


### **Combine the 3 columns (molecule_chembl_id,canonical_smiles,standard_value) and bioactivity_class into a DataFrame**

In [None]:
# Keep only the compound ID, the SMILES structure and the standard_value columns
selection = ['molecule_chembl_id','canonical_smiles','standard_value']
df3 = df2[selection]
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL115220,O=C(Cc1ccc2ccccc2c1)Nc1cc(C2CC2)n[nH]1,10000.0
1,CHEMBL200586,COC(=O)c1cc2c(C)n[nH]c2s1,100000.0
2,CHEMBL199996,Cc1n[nH]c2sc(C(N)=O)c(NC(=O)Nc3ccccc3)c12,20000.0
3,CHEMBL199658,Cc1n[nH]c2sc(C(N)=O)c(NC(=O)c3ccc(Cl)cc3)c12,100000.0
4,CHEMBL199657,Cc1n[nH]c2sc(C(N)=O)c(NC(=O)c3cccc(Cl)c3)c12,100000.0
...,...,...,...
1569,CHEMBL5266595,CCn1c(=O)c(-c2c(Cl)cccc2Cl)cc2cnc(Nc3ccc(N4CCN...,25000.0
1570,CHEMBL5275152,C=CCn1c(=O)c2cnc(Nc3ccc4c(c3)CN(C)CC4)cc2n1-c1...,25000.0
1571,CHEMBL5272327,C=CCn1c(=O)c2cnc(Nc3ccc(N4CCN(C)CC4)c4ccn(C)c3...,16.0
1572,CHEMBL1233528,CC[C@@H]1C(=O)N(C)c2cnc(Nc3ccc(C(=O)N[C@H]4CC[...,2.5


In [None]:
# Save the three selected columns to CSV, leaving out the DataFrame index
df3.to_csv('bioactivity_data_raw-PLK1-ChemBL.csv', index=False)

In [None]:
# Reload the selected columns from the CSV written in the previous step.
# The file was saved without its index, so the reloaded frame gets a fresh
# 0-based index (the last row label goes from 1572 to 1561 in the displayed tables).
df3=pd.read_csv(r'bioactivity_data_raw-PLK1-ChemBL.csv')
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL115220,O=C(Cc1ccc2ccccc2c1)Nc1cc(C2CC2)n[nH]1,10000.0
1,CHEMBL200586,COC(=O)c1cc2c(C)n[nH]c2s1,100000.0
2,CHEMBL199996,Cc1n[nH]c2sc(C(N)=O)c(NC(=O)Nc3ccccc3)c12,20000.0
3,CHEMBL199658,Cc1n[nH]c2sc(C(N)=O)c(NC(=O)c3ccc(Cl)cc3)c12,100000.0
4,CHEMBL199657,Cc1n[nH]c2sc(C(N)=O)c(NC(=O)c3cccc(Cl)c3)c12,100000.0
...,...,...,...
1558,CHEMBL5266595,CCn1c(=O)c(-c2c(Cl)cccc2Cl)cc2cnc(Nc3ccc(N4CCN...,25000.0
1559,CHEMBL5275152,C=CCn1c(=O)c2cnc(Nc3ccc4c(c3)CN(C)CC4)cc2n1-c1...,25000.0
1560,CHEMBL5272327,C=CCn1c(=O)c2cnc(Nc3ccc(N4CCN(C)CC4)c4ccn(C)c3...,16.0
1561,CHEMBL1233528,CC[C@@H]1C(=O)N(C)c2cnc(Nc3ccc(C(=O)N[C@H]4CC[...,2.5


IC50 is the concentration needed to inhibit 50% of a biological target's activity, so lower IC50 values indicate more potent inhibition. Compounds with IC50 ≤ 1,000 nM are labeled active because they achieve significant inhibition at low concentrations. Compounds with IC50 ≥ 10,000 nM are labeled inactive because of their weak potency, and compounds between these two thresholds are labeled intermediate.

In [None]:
def _label_for(ic50):
    """Return the bioactivity label for one standard_value entry.

    Values of 10000 or more are "inactive", values of 1000 or less are
    "active", and everything else is "intermediate".
    """
    value = float(ic50)
    if value >= 10000:
        return "inactive"
    if value <= 1000:
        return "active"
    return "intermediate"


# One label per row of df3, in row order
bioactivity_class = [_label_for(entry) for entry in df3.standard_value]


In [None]:
# Wrap the list of labels in a single-column DataFrame named 'bioactivity_class'
bioactivity_class1 = pd.DataFrame({'bioactivity_class': bioactivity_class})
# Save the labels to CSV, leaving out the DataFrame index
bioactivity_class1.to_csv('bioactivityclass.csv', index=False)
bioactivity_class1

Unnamed: 0,bioactivity_class
0,inactive
1,inactive
2,inactive
3,inactive
4,inactive
...,...
1558,inactive
1559,inactive
1560,active
1561,active


In [None]:
# Attach the labels as a new column next to the compound data.
# Column-wise concat pairs rows by index label; both frames shown above are
# indexed 0..1561, so each label lands on the row it was computed from.
df4 = pd.concat([df3, bioactivity_class1], axis=1)
df4

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL115220,O=C(Cc1ccc2ccccc2c1)Nc1cc(C2CC2)n[nH]1,10000.0,inactive
1,CHEMBL200586,COC(=O)c1cc2c(C)n[nH]c2s1,100000.0,inactive
2,CHEMBL199996,Cc1n[nH]c2sc(C(N)=O)c(NC(=O)Nc3ccccc3)c12,20000.0,inactive
3,CHEMBL199658,Cc1n[nH]c2sc(C(N)=O)c(NC(=O)c3ccc(Cl)cc3)c12,100000.0,inactive
4,CHEMBL199657,Cc1n[nH]c2sc(C(N)=O)c(NC(=O)c3cccc(Cl)c3)c12,100000.0,inactive
...,...,...,...,...
1558,CHEMBL5266595,CCn1c(=O)c(-c2c(Cl)cccc2Cl)cc2cnc(Nc3ccc(N4CCN...,25000.0,inactive
1559,CHEMBL5275152,C=CCn1c(=O)c2cnc(Nc3ccc4c(c3)CN(C)CC4)cc2n1-c1...,25000.0,inactive
1560,CHEMBL5272327,C=CCn1c(=O)c2cnc(Nc3ccc(N4CCN(C)CC4)c4ccn(C)c3...,16.0,active
1561,CHEMBL1233528,CC[C@@H]1C(=O)N(C)c2cnc(Nc3ccc(C(=O)N[C@H]4CC[...,2.5,active


Save the processed data from ChEMBL

In [None]:
# Save the labeled dataset to CSV, leaving out the DataFrame index
df4.to_csv('bioactivity_data_preprocessed-PLK1.csv', index=False)

In [None]:
# List the files in the current working directory in long format
! ls -l

total 1400
-rw-r--r-- 1 root root   6735 May 13 01:34 bioactivityclass.csv
-rw-r--r-- 1 root root  87505 May 13 01:34 bioactivity_data_preprocessed.csv
-rw-r--r-- 1 root root 602685 May 13 01:34 bioactivity_data_raw2.csv
-rw-r--r-- 1 root root  80770 May 13 01:34 bioactivity_data_raw3.csv
-rw-r--r-- 1 root root 639992 May 13 01:31 bioactivity_data_raw.csv
drwxr-xr-x 1 root root   4096 May  9 13:24 sample_data


---