# **Selecting "New Data"**
If working with existing molecules or multiple molecules in a dataframe
---

## **Importing libraries**

In [2]:
# Import necessary libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client

## **Search for Target protein**

### **Target search for Target (in this case Coronavirus)**

In [3]:
# Target search for coronavirus
target = new_client.target
target_query = target.search('coronavirus')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Coronavirus,Coronavirus,17.0,False,CHEMBL613732,[],ORGANISM,11119
1,[],SARS coronavirus,SARS coronavirus,15.0,False,CHEMBL612575,[],ORGANISM,227859
2,[],Feline coronavirus,Feline coronavirus,15.0,False,CHEMBL612744,[],ORGANISM,12663
3,[],Human coronavirus 229E,Human coronavirus 229E,13.0,False,CHEMBL613837,[],ORGANISM,11137
4,"[{'xref_id': 'P0C6U8', 'xref_name': None, 'xre...",SARS coronavirus,SARS coronavirus 3C-like proteinase,10.0,False,CHEMBL3927,"[{'accession': 'P0C6U8', 'component_descriptio...",SINGLE PROTEIN,227859
5,[],Middle East respiratory syndrome-related coron...,Middle East respiratory syndrome-related coron...,9.0,False,CHEMBL4296578,[],ORGANISM,1335626
6,"[{'xref_id': 'P0C6X7', 'xref_name': None, 'xre...",SARS coronavirus,Replicase polyprotein 1ab,4.0,False,CHEMBL5118,"[{'accession': 'P0C6X7', 'component_descriptio...",SINGLE PROTEIN,227859
7,[],Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,4.0,False,CHEMBL4523582,"[{'accession': 'P0DTD1', 'component_descriptio...",SINGLE PROTEIN,2697049


### **Select and retrieve bioactivity data you're interested in**

In [4]:
selected_target = targets.target_chembl_id[6]
selected_target

'CHEMBL5118'

In [5]:
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

In [6]:
df = pd.DataFrame.from_dict(res)

In [7]:
df

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,1988091,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,870.0
1,,1988092,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,200.0
2,,1988093,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,300.0
3,,1988094,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,15.0
4,,1988095,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,,18548176,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,4.3
211,,18548177,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,5.5
212,,18548178,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,4.1
213,,18548179,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,3.2


## **Handling missing data**
If any compounds has missing value for the **standard_value** and **canonical_smiles** column then drop it.

In [8]:
df2 = df[df.standard_value.notna()]
df2 = df2[df.canonical_smiles.notna()]
df2

  df2 = df2[df.canonical_smiles.notna()]


Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,1988091,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,870.0
1,,1988092,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,200.0
2,,1988093,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,300.0
3,,1988094,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,15.0
4,,1988095,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,,18548176,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,4.3
211,,18548177,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,5.5
212,,18548178,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,4.1
213,,18548179,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,3.2


In [9]:
len(df2.canonical_smiles.unique())

170

In [10]:
df2_nr = df2.drop_duplicates(['canonical_smiles'])
df2_nr

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,1988091,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,870.0
1,,1988092,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,200.0
2,,1988093,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,300.0
3,,1988094,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,15.0
4,,1988095,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,,18548176,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,4.3
211,,18548177,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,5.5
212,,18548178,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,4.1
213,,18548179,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,3.2


## **Data pre-processing of the bioactivity data**

In [11]:
selection = ['canonical_smiles', 'molecule_chembl_id']
df3 = df2_nr[selection]
df3

Unnamed: 0,canonical_smiles,molecule_chembl_id
0,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@@H...,CHEMBL194398
1,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...,CHEMBL393608
2,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...,CHEMBL238216
3,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...,CHEMBL235873
4,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...,CHEMBL397154
...,...,...
210,CC(C)C[C@H](NC(=O)OC1(Cc2ccccc2)CCN(S(C)(=O)=O...,CHEMBL4208764
211,CCC1(OC(=O)N[C@@H](CC(C)C)C(=O)N[C@H](C=O)C[C@...,CHEMBL4212620
212,CCC1(OC(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](C[C@@H]...,CHEMBL4216101
213,CCOC(=O)N1CCC(OC(=O)N[C@@H](CC(C)C)C(=O)N[C@H]...,CHEMBL4217568


In [12]:
df3.to_csv('molecule_new.smi', sep='\t', index=False, header=False)
df3.to_csv('list_to_predict_new.txt', sep=' ', index=False, header=False)

In [13]:
! cat molecule_new.smi | head -5

'cat' is not recognized as an internal or external command,
operable program or batch file.


In [14]:
import glob
xml_files = glob.glob('*.xml')
xml_files.sort()
xml_files

['AtomPairs2DFingerprintCount.xml',
 'AtomPairs2DFingerprinter.xml',
 'EStateFingerprinter.xml',
 'ExtendedFingerprinter.xml',
 'Fingerprinter.xml',
 'GraphOnlyFingerprinter.xml',
 'KlekotaRothFingerprintCount.xml',
 'KlekotaRothFingerprinter.xml',
 'MACCSFingerprinter.xml',
 'PubchemFingerprinter.xml',
 'SubstructureFingerprintCount.xml',
 'SubstructureFingerprinter.xml']

In [15]:
FP_list = ['AtomPairs2DCount',
 'AtomPairs2D',
 'EState',
 'CDKextended',
 'CDK',
 'CDKgraphonly',
 'KlekotaRothCount',
 'KlekotaRoth',
 'MACCS',
 'PubChem',
 'SubstructureCount',
 'Substructure']

In [16]:
fp = dict(zip(FP_list, xml_files))
fp

{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'PubChem': 'PubchemFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml',
 'Substructure': 'SubstructureFingerprinter.xml'}

In [17]:
#! pip install padelpy

In [18]:
from padelpy import padeldescriptor

fingerprint = 'PubChem' # Remember there are other methods such as PubChem

#fingerprint_output_file = ''.join([fingerprint,'.csv']) #Substructure.csv
fingerprint_descriptortypes = fp[fingerprint]

padeldescriptor(mol_dir='molecule_new.smi', 
                d_file='fingerprint_output_file_new.csv', #'Substructure.csv'
                descriptortypes= fingerprint_descriptortypes,
                detectaromaticity=True,
                standardizenitro=True,
                standardizetautomers=True,
                threads=2,
                removesalt=True,
                log=True,
                fingerprints=True)

In [19]:
df3_X = pd.read_csv('fingerprint_output_file_new.csv')
df3_X

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,CHEMBL194398,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL393608,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL238216,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL235873,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL397154,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,CHEMBL4208764,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
166,CHEMBL4212620,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
167,CHEMBL4216101,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
168,CHEMBL4217568,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
