# **1. Installing Libraries**

Install the ChEMBL web service package so that we can retrieve bioactivity data from the ChEMBL Database.

In [None]:
! pip install chembl_webresource_client

Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.8-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting requests-cache~=0.7.0 (from chembl_webresource_client)
  Downloading requests_cache-0.7.5-py3-none-any.whl (39 kB)
Collecting attrs<22.0,>=21.2 (from requests-cache~=0.7.0->chembl_webresource_client)
  Downloading attrs-21.4.0-py2.py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting url-normalize<2.0,>=1.4 (from requests-cache~=0.7.0->chembl_webresource_client)
  Downloading url_normalize-1.4.3-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: url-normalize, attrs, requests-cache, chembl_webresource_client
  Attempting uninstall: attrs
    Found existing installation: attrs 23.1.0
    Uninstalling attrs-23.1.0:
      Successfully uninstal

In [None]:
# Import necessary libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client

# **2. Search for Target Protein (NS3)**

## **2.1 Target search for dengue virus type 2 NS3 protein**




In [None]:
# Target search for Dengue virus type 2 NS3 protein
target = new_client.target
target_query = target.search('dengue virus NS3')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'P29990', 'xref_name': None, 'xre...",Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,25.0,False,CHEMBL5980,"[{'accession': 'P29990', 'component_descriptio...",SINGLE PROTEIN,31634
1,[],Dengue virus,Dengue virus,24.0,False,CHEMBL613757,[],ORGANISM,12637
2,[],Dengue virus 1,Dengue virus 1,21.0,False,CHEMBL613360,[],ORGANISM,11053
3,[],Dengue virus 2,Dengue virus 2,21.0,False,CHEMBL613966,[],ORGANISM,11060
4,[],Dengue virus 3,Dengue virus 3,21.0,False,CHEMBL612717,[],ORGANISM,11069
...,...,...,...,...,...,...,...,...,...
219,"[{'xref_id': 'P10274', 'xref_name': None, 'xre...",Human T-cell leukemia virus 1 (strain Japan AT...,Human T-cell leukemia virus type I protease,4.0,False,CHEMBL3346,"[{'accession': 'P10274', 'component_descriptio...",SINGLE PROTEIN,11926
220,[],Human immunodeficiency virus type 1 (RF/HAT IS...,Human immunodeficiency virus type 1 (RF/HAT IS...,4.0,False,CHEMBL613498,[],ORGANISM,11701
221,[],Homo sapiens,VEGF-receptor 2 and tyrosine-protein kinase SRC,4.0,False,CHEMBL2111336,"[{'accession': 'P12931', 'component_descriptio...",SELECTIVITY GROUP,9606
222,[],Homo sapiens,Sharpin/RBCK1/RNF31,4.0,False,CHEMBL4296109,"[{'accession': 'Q9H0F6', 'component_descriptio...",PROTEIN COMPLEX,9606


## **2.2 Select and retrieve bioactivity data for dengue virus type 2 NS3 protein (first entry)**

In [None]:
selected_target = targets.target_chembl_id[0]
selected_target

'CHEMBL5980'

Here, we will retrieve only bioactivity data for *Dengue virus type 2 NS3 protein* (CHEMBL5980) that are reported as IC50 values.

In [None]:
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")
df = pd.DataFrame.from_dict(res)
df

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,active,7018430,[],CHEMBL1794550,PUBCHEM_BIOASSAY: Primary and Confirmatory Scr...,F,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,100.0
1,,active,7018431,[],CHEMBL1794550,PUBCHEM_BIOASSAY: Primary and Confirmatory Scr...,F,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,12.31
2,,active,7018432,[],CHEMBL1794550,PUBCHEM_BIOASSAY: Primary and Confirmatory Scr...,F,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,100.0
3,,active,7018433,[],CHEMBL1794550,PUBCHEM_BIOASSAY: Primary and Confirmatory Scr...,F,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,50.97
4,,active,7018434,[],CHEMBL1794550,PUBCHEM_BIOASSAY: Primary and Confirmatory Scr...,F,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1279,"{'action_type': 'INHIBITOR', 'description': 'N...",,24708309,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5109296,Inhibition of DENV2 NS2B-NS3 protease using Bz...,B,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,25.2
1280,"{'action_type': 'INHIBITOR', 'description': 'N...",,24708310,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5109296,Inhibition of DENV2 NS2B-NS3 protease using Bz...,B,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,23.9
1281,"{'action_type': 'INHIBITOR', 'description': 'N...",,24708311,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5109296,Inhibition of DENV2 NS2B-NS3 protease using Bz...,B,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,24.0
1282,"{'action_type': 'INHIBITOR', 'description': 'N...",,24866731,[],CHEMBL5151793,Inhibition of DENV-2 DENV2 NS2B-NS3,B,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,9.61


Finally we will save the resulting bioactivity data to a CSV file .

In [None]:
df.to_csv('NS3_protein_IC50_raw.csv', index=False)

## **2.3 Data processing**





If any compounds has missing value for the **standard_value** and **canonical_smiles** column then drop it.

In [None]:
df2 = df[df.standard_value.notna()]
df2 = df2[df.canonical_smiles.notna()]
df2

  df2 = df2[df.canonical_smiles.notna()]


Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,active,7018430,[],CHEMBL1794550,PUBCHEM_BIOASSAY: Primary and Confirmatory Scr...,F,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,100.0
1,,active,7018431,[],CHEMBL1794550,PUBCHEM_BIOASSAY: Primary and Confirmatory Scr...,F,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,12.31
2,,active,7018432,[],CHEMBL1794550,PUBCHEM_BIOASSAY: Primary and Confirmatory Scr...,F,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,100.0
3,,active,7018433,[],CHEMBL1794550,PUBCHEM_BIOASSAY: Primary and Confirmatory Scr...,F,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,50.97
4,,active,7018434,[],CHEMBL1794550,PUBCHEM_BIOASSAY: Primary and Confirmatory Scr...,F,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1279,"{'action_type': 'INHIBITOR', 'description': 'N...",,24708309,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5109296,Inhibition of DENV2 NS2B-NS3 protease using Bz...,B,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,25.2
1280,"{'action_type': 'INHIBITOR', 'description': 'N...",,24708310,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5109296,Inhibition of DENV2 NS2B-NS3 protease using Bz...,B,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,23.9
1281,"{'action_type': 'INHIBITOR', 'description': 'N...",,24708311,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5109296,Inhibition of DENV2 NS2B-NS3 protease using Bz...,B,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,24.0
1282,"{'action_type': 'INHIBITOR', 'description': 'N...",,24866731,[],CHEMBL5151793,Inhibition of DENV-2 DENV2 NS2B-NS3,B,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,9.61


Delete duplicate rows with the same normalized SMILES value, and then calculate the number of unique normalized SMILES values.

In [None]:
df2_nr = df2.drop_duplicates(['canonical_smiles'])
df2_nr

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,active,7018430,[],CHEMBL1794550,PUBCHEM_BIOASSAY: Primary and Confirmatory Scr...,F,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,100.0
1,,active,7018431,[],CHEMBL1794550,PUBCHEM_BIOASSAY: Primary and Confirmatory Scr...,F,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,12.31
2,,active,7018432,[],CHEMBL1794550,PUBCHEM_BIOASSAY: Primary and Confirmatory Scr...,F,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,100.0
3,,active,7018433,[],CHEMBL1794550,PUBCHEM_BIOASSAY: Primary and Confirmatory Scr...,F,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,50.97
4,,active,7018434,[],CHEMBL1794550,PUBCHEM_BIOASSAY: Primary and Confirmatory Scr...,F,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1279,"{'action_type': 'INHIBITOR', 'description': 'N...",,24708309,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5109296,Inhibition of DENV2 NS2B-NS3 protease using Bz...,B,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,25.2
1280,"{'action_type': 'INHIBITOR', 'description': 'N...",,24708310,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5109296,Inhibition of DENV2 NS2B-NS3 protease using Bz...,B,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,23.9
1281,"{'action_type': 'INHIBITOR', 'description': 'N...",,24708311,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5109296,Inhibition of DENV2 NS2B-NS3 protease using Bz...,B,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,24.0
1282,"{'action_type': 'INHIBITOR', 'description': 'N...",,24866731,[],CHEMBL5151793,Inhibition of DENV-2 DENV2 NS2B-NS3,B,,,BAO_0000190,...,Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,31634,,,IC50,uM,UO_0000065,,9.61


Combine the 3 columns (molecule_chembl_id,canonical_smiles,standard_value) into a DataFrame

In [None]:
selection = ['molecule_chembl_id','canonical_smiles','standard_value']
df3 = df2_nr[selection]
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL1401841,COc1ccc2nc3cccc(OC)c3nc2c1,100000.0
1,CHEMBL1608853,O=C(O)c1ccc2c(c1)C(=O)/C(=C\c1ccco1)C2=O,12310.0
2,CHEMBL1429799,O=C1NN(c2ccccc2)C(=O)/C1=C\c1ccccc1OCC(=O)N1CC...,100000.0
3,CHEMBL246446,O=C(O)c1ccc2nc(-c3ccco3)c(-c3ccco3)nc2c1,50970.0
4,CHEMBL1383455,CCn1nc([N+](=O)[O-])c(C(C#N)c2nc3ccccc3n2C)c(C...,100000.0
...,...,...,...
1279,CHEMBL5190612,Cc1ccc(S(=O)(=O)Nc2ccc(-c3nnc(SCc4ccc(C(F)(F)F...,25200.0
1280,CHEMBL5176952,Cc1ccc(S(=O)(=O)Nc2ccc(-c3nnc(SCc4cccc(C(F)(F)...,23900.0
1281,CHEMBL5184942,Cc1ccc(S(=O)(=O)Nc2ccc(-c3nnc(SCc4ccccc4C(F)(F...,24000.0
1282,CHEMBL4526128,O=C(N[C@@H](Cc1ccc(O)cc1)C(=O)O)c1cc(-c2ccccc2...,9610.0


## **2.4 Labeling compounds as either being active, inactive or intermediate**

---



The bioactivity data is in the IC50 unit. Compounds having values of less than 20,000 nM will be considered to be **active** while those greater than 70,000 nM will be considered to be **inactive**. As for those values in between 20,000 and 70,000 nM will be referred to as **intermediate**. This threshold comes from [Implementation of ensemble methods on QSAR Study of NS3 inhibitor activity as anti-dengue agent](https://www.tandfonline.com/doi/full/10.1080/1062936X.2020.1773534.)

In [None]:
bioactivity_threshold = []
for i in df3.standard_value:
  if float(i) >= 70000:
    bioactivity_threshold.append("inactive")
  elif float(i) <= 20000:
    bioactivity_threshold.append("active")
  else:
    bioactivity_threshold.append("intermediate")

df3 = df3.reset_index(drop=True)

bioactivity_class = pd.Series(bioactivity_threshold, name='class')
df4 = pd.concat([df3, bioactivity_class], axis=1)
df4

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class
0,CHEMBL1401841,COc1ccc2nc3cccc(OC)c3nc2c1,100000.0,inactive
1,CHEMBL1608853,O=C(O)c1ccc2c(c1)C(=O)/C(=C\c1ccco1)C2=O,12310.0,active
2,CHEMBL1429799,O=C1NN(c2ccccc2)C(=O)/C1=C\c1ccccc1OCC(=O)N1CC...,100000.0,inactive
3,CHEMBL246446,O=C(O)c1ccc2nc(-c3ccco3)c(-c3ccco3)nc2c1,50970.0,intermediate
4,CHEMBL1383455,CCn1nc([N+](=O)[O-])c(C(C#N)c2nc3ccccc3n2C)c(C...,100000.0,inactive
...,...,...,...,...
1190,CHEMBL5190612,Cc1ccc(S(=O)(=O)Nc2ccc(-c3nnc(SCc4ccc(C(F)(F)F...,25200.0,intermediate
1191,CHEMBL5176952,Cc1ccc(S(=O)(=O)Nc2ccc(-c3nnc(SCc4cccc(C(F)(F)...,23900.0,intermediate
1192,CHEMBL5184942,Cc1ccc(S(=O)(=O)Nc2ccc(-c3nnc(SCc4ccccc4C(F)(F...,24000.0,intermediate
1193,CHEMBL4526128,O=C(N[C@@H](Cc1ccc(O)cc1)C(=O)O)c1cc(-c2ccccc2...,9610.0,active


Delete rows containing intermediate

In [None]:
df5 = df4[df4['class'] != 'intermediate']
df5 = df5.sort_values('standard_value')
df5

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class
1180,CHEMBL5188858,Cc1cc(O)c(S(=O)(=O)O)cc1Cc1cc(S(=O)(=O)O)c(O)c...,0.48,active
1013,CHEMBL4483351,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(N)=O)NC(=O)[C@...,10000.0,active
1148,CHEMBL4849139,O=C(NC1CCNCC1)c1c[nH]c2cc(-c3ccncc3)ccc12,10000.0,active
0,CHEMBL1401841,COc1ccc2nc3cccc(OC)c3nc2c1,100000.0,inactive
372,CHEMBL1443706,Cl.O=C(NCC(c1cccs1)N1CCOCC1)c1ccccc1NS(=O)(=O)...,100000.0,inactive
...,...,...,...,...
510,CHEMBL1543611,c1coc(-c2nc3nc4ccccc4n3nc2-c2ccco2)c1,99300.0,inactive
1104,CHEMBL4871540,O=C(C[C@H](NC(=O)c1ccccc1)C(=O)Nc1nc2cc(O)c(O)...,9950.0,active
396,CHEMBL1598944,COc1ccc(/C=C2\C(=O)NC(=O)N(c3ccc(C)c(C)c3)C2=O...,99510.0,inactive
590,CHEMBL1534470,O=Cc1cc(N=Nc2cccc(C(=O)O)c2)ccc1O,99550.0,inactive


In [27]:
counts = df5['class'].value_counts()
active_count = counts['active']
inactive_count = counts['inactive']

print("Active count:", active_count)
print("Inactive count:", inactive_count)


Active count: 290
Inactive count: 301


Saves dataframe to CSV file

In [None]:
df5.to_csv('NS3_Data_processing.csv', index=False)
from google.colab import files
files.download('NS3_Data_processing.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>