# Overview: 
 
### Collect data from the ChEMBL database

In [35]:
! pip install chembl_webresource_client



In [36]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

In [37]:
# Search ChEMBL targets for the keyword 'alzheimer' and tabulate the hits
target = new_client.target
target_query = target.search('alzheimer')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Homo sapiens,Nucleosome-remodeling factor subunit BPTF,12.0,False,CHEMBL3085621,"[{'accession': 'Q12830', 'component_descriptio...",SINGLE PROTEIN,9606
1,[],Rattus norvegicus,Amyloid beta A4 protein,5.0,False,CHEMBL3638365,"[{'accession': 'P08592', 'component_descriptio...",SINGLE PROTEIN,10116
2,"[{'xref_id': 'P05067', 'xref_name': None, 'xre...",Homo sapiens,Beta amyloid A4 protein,4.0,False,CHEMBL2487,"[{'accession': 'P05067', 'component_descriptio...",SINGLE PROTEIN,9606


### Select data for Beta amyloid A4 protein

In [38]:
# Select the human "Beta amyloid A4 protein" target by name and organism
# instead of by its row position: the order (and number) of search hits is
# decided by the remote service and may change, which would silently select a
# different target. If nothing matches, .iloc[0] raises IndexError.
is_beta_amyloid = (
    (targets.pref_name == 'Beta amyloid A4 protein')
    & (targets.organism == 'Homo sapiens')
)
selected_target = targets.loc[is_beta_amyloid, 'target_chembl_id'].iloc[0]
selected_target

'CHEMBL2487'

##### Filter for compounds that report bioactivity in IC50 values only. 

In [39]:
# Query activity records for the selected target, keeping only those whose
# standard_type is IC50.
activity = new_client.activity
res = (
    activity
    .filter(target_chembl_id=selected_target)
    .filter(standard_type='IC50')
)

In [40]:
# Load the filtered activity records into a DataFrame.
df = pd.DataFrame.from_dict(res)
# Display every column of wide frames. The fully qualified option name is
# needed: the bare 'max_columns' pattern matches more than one option on
# recent pandas versions and raises OptionError.
pd.set_option('display.max_columns', None)

In [41]:
# Preview the first five activity records.
df.head(n=5)

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,357577,[],CHEMBL678443,Inhibition of A-beta-42 production by inhibiti...,B,BAO_0000190,BAO_0000219,cell-based format,CC12CCC(C1)C(C)(C)C2NS(=O)(=O)c1ccc(F)cc1,,,CHEMBL1133739,J. Med. Chem.,2000,"{'bei': '17.02', 'le': '0.34', 'lle': '1.98', ...",CHEMBL311039,,CHEMBL311039,5.3,False,http://www.openphacts.org/units/Nanomolar,132837,=,1,True,=,,IC50,nM,,5000.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,5.0
1,,357580,[],CHEMBL678443,Inhibition of A-beta-42 production by inhibiti...,B,BAO_0000190,BAO_0000219,cell-based format,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1cccs1,,,CHEMBL1133739,J. Med. Chem.,2000,"{'bei': '18.60', 'le': '0.40', 'lle': '2.33', ...",CHEMBL450926,,CHEMBL450926,5.57,False,http://www.openphacts.org/units/Nanomolar,132839,=,1,True,=,,IC50,nM,,2700.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,2.7
2,,358965,[],CHEMBL678443,Inhibition of A-beta-42 production by inhibiti...,B,BAO_0000190,BAO_0000219,cell-based format,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,,,CHEMBL1133739,J. Med. Chem.,2000,"{'bei': '18.45', 'le': '0.37', 'lle': '2.42', ...",CHEMBL310242,,CHEMBL310242,5.75,False,http://www.openphacts.org/units/Nanomolar,132841,=,1,True,=,,IC50,nM,,1800.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,1.8
3,,368887,[],CHEMBL678443,Inhibition of A-beta-42 production by inhibiti...,B,BAO_0000190,BAO_0000219,cell-based format,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,,,CHEMBL1133739,J. Med. Chem.,2000,"{'bei': '15.12', 'le': '0.32', 'lle': '1.13', ...",CHEMBL74874,,CHEMBL74874,4.96,False,http://www.openphacts.org/units/Nanomolar,132840,=,1,True,=,,IC50,nM,,11000.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,11.0
4,,375954,[],CHEMBL678443,Inhibition of A-beta-42 production by inhibiti...,B,BAO_0000190,BAO_0000219,cell-based format,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,,,CHEMBL1133739,J. Med. Chem.,2000,"{'bei': '13.43', 'le': '0.33', 'lle': '1.06', ...",CHEMBL75183,,CHEMBL75183,5.0,False,http://www.openphacts.org/units/Nanomolar,132838,=,1,True,=,,IC50,nM,,10000.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,10.0


In [42]:
# (number of rows, number of columns) of the raw IC50 activity table
df.shape

(1143, 43)

In [43]:
# Sanity check: list the distinct standard_type values present in the table.
df['standard_type'].unique()

array(['IC50'], dtype=object)

In [44]:
# Save the unprocessed activity table (row index omitted) for later reuse.
df.to_csv('bioactivity_data.csv', index=False)

## Missing Data

In [45]:
# Keep only rows that have an IC50 value ...
df2 = df[df.standard_value.notna()]
# ... and, among those, only rows that have a SMILES string. The mask is built
# from df2 itself: indexing df2 with a mask computed on the longer df makes
# pandas reindex the boolean Series and emit a UserWarning.
df3 = df2[df2.canonical_smiles.notna()]
df3

  


Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,357577,[],CHEMBL678443,Inhibition of A-beta-42 production by inhibiti...,B,BAO_0000190,BAO_0000219,cell-based format,CC12CCC(C1)C(C)(C)C2NS(=O)(=O)c1ccc(F)cc1,,,CHEMBL1133739,J. Med. Chem.,2000,"{'bei': '17.02', 'le': '0.34', 'lle': '1.98', ...",CHEMBL311039,,CHEMBL311039,5.30,False,http://www.openphacts.org/units/Nanomolar,132837,=,1,True,=,,IC50,nM,,5000.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,5.0
1,,357580,[],CHEMBL678443,Inhibition of A-beta-42 production by inhibiti...,B,BAO_0000190,BAO_0000219,cell-based format,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1cccs1,,,CHEMBL1133739,J. Med. Chem.,2000,"{'bei': '18.60', 'le': '0.40', 'lle': '2.33', ...",CHEMBL450926,,CHEMBL450926,5.57,False,http://www.openphacts.org/units/Nanomolar,132839,=,1,True,=,,IC50,nM,,2700.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,2.7
2,,358965,[],CHEMBL678443,Inhibition of A-beta-42 production by inhibiti...,B,BAO_0000190,BAO_0000219,cell-based format,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,,,CHEMBL1133739,J. Med. Chem.,2000,"{'bei': '18.45', 'le': '0.37', 'lle': '2.42', ...",CHEMBL310242,,CHEMBL310242,5.75,False,http://www.openphacts.org/units/Nanomolar,132841,=,1,True,=,,IC50,nM,,1800.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,1.8
3,,368887,[],CHEMBL678443,Inhibition of A-beta-42 production by inhibiti...,B,BAO_0000190,BAO_0000219,cell-based format,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,,,CHEMBL1133739,J. Med. Chem.,2000,"{'bei': '15.12', 'le': '0.32', 'lle': '1.13', ...",CHEMBL74874,,CHEMBL74874,4.96,False,http://www.openphacts.org/units/Nanomolar,132840,=,1,True,=,,IC50,nM,,11000.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,11.0
4,,375954,[],CHEMBL678443,Inhibition of A-beta-42 production by inhibiti...,B,BAO_0000190,BAO_0000219,cell-based format,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,,,CHEMBL1133739,J. Med. Chem.,2000,"{'bei': '13.43', 'le': '0.33', 'lle': '1.06', ...",CHEMBL75183,,CHEMBL75183,5.00,False,http://www.openphacts.org/units/Nanomolar,132838,=,1,True,=,,IC50,nM,,10000.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,,18754817,[],CHEMBL4265047,Inhibition of HFIP-pretreated amyloid beta (1 ...,B,BAO_0000190,BAO_0000357,single protein format,COc1cc(/C=C/C(=O)CC(=O)/C=C/c2ccc(O)c(OC)c2)ccc1O,,,CHEMBL4261624,Bioorg Med Chem,2018,"{'bei': '12.83', 'le': '0.24', 'lle': '1.36', ...",CHEMBL140,,CHEMBL140,4.73,False,http://www.openphacts.org/units/Nanomolar,3116211,=,1,True,=,,IC50,nM,,18730.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,18.73
1139,,18769559,[],CHEMBL4267965,Disaggregation of amyloid beta (1 to 42 residu...,B,BAO_0000190,BAO_0000357,single protein format,O=C(/C=C/c1ccc(O)c(O)c1)NCCCNC(=O)/C=C/c1ccc(O...,,,CHEMBL4265933,Eur J Med Chem,2018,"{'bei': '13.50', 'le': '0.25', 'lle': '3.52', ...",CHEMBL197478,,CHEMBL197478,5.38,False,http://www.openphacts.org/units/Nanomolar,3118599,=,1,True,=,,IC50,nM,,4200.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,4.2
1140,,18769571,[],CHEMBL4267959,Inhibition of amyloid beta (1 to 42) fibrilliz...,B,BAO_0000190,BAO_0000357,single protein format,O=C(O)/C=C/c1ccc(O)c(O)c1,,,CHEMBL4265933,Eur J Med Chem,2018,"{'bei': '26.68', 'le': '0.50', 'lle': '3.61', ...",CHEMBL145,CAFFEIC ACID,CHEMBL145,4.81,False,http://www.openphacts.org/units/Nanomolar,3118610,=,1,True,=,,IC50,nM,,15600.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,15.6
1141,,18769572,[],CHEMBL4267959,Inhibition of amyloid beta (1 to 42) fibrilliz...,B,BAO_0000190,BAO_0000357,single protein format,COc1cc(/C=C/C(=O)CC(=O)/C=C/c2ccc(O)c(OC)c2)ccc1O,,,CHEMBL4265933,Eur J Med Chem,2018,"{'bei': '14.84', 'le': '0.28', 'lle': '2.10', ...",CHEMBL140,,CHEMBL140,5.47,False,http://www.openphacts.org/units/Nanomolar,3118609,=,1,True,=,,IC50,nM,,3400.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,3.4


In [46]:
# Keep one record per unique structure: the first row seen for each
# canonical_smiles value is retained, later repeats are dropped.
df2_nr = df3.drop_duplicates(subset='canonical_smiles', keep='first')
df2_nr

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,357577,[],CHEMBL678443,Inhibition of A-beta-42 production by inhibiti...,B,BAO_0000190,BAO_0000219,cell-based format,CC12CCC(C1)C(C)(C)C2NS(=O)(=O)c1ccc(F)cc1,,,CHEMBL1133739,J. Med. Chem.,2000,"{'bei': '17.02', 'le': '0.34', 'lle': '1.98', ...",CHEMBL311039,,CHEMBL311039,5.30,False,http://www.openphacts.org/units/Nanomolar,132837,=,1,True,=,,IC50,nM,,5000.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,5.0
1,,357580,[],CHEMBL678443,Inhibition of A-beta-42 production by inhibiti...,B,BAO_0000190,BAO_0000219,cell-based format,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1cccs1,,,CHEMBL1133739,J. Med. Chem.,2000,"{'bei': '18.60', 'le': '0.40', 'lle': '2.33', ...",CHEMBL450926,,CHEMBL450926,5.57,False,http://www.openphacts.org/units/Nanomolar,132839,=,1,True,=,,IC50,nM,,2700.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,2.7
2,,358965,[],CHEMBL678443,Inhibition of A-beta-42 production by inhibiti...,B,BAO_0000190,BAO_0000219,cell-based format,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,,,CHEMBL1133739,J. Med. Chem.,2000,"{'bei': '18.45', 'le': '0.37', 'lle': '2.42', ...",CHEMBL310242,,CHEMBL310242,5.75,False,http://www.openphacts.org/units/Nanomolar,132841,=,1,True,=,,IC50,nM,,1800.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,1.8
3,,368887,[],CHEMBL678443,Inhibition of A-beta-42 production by inhibiti...,B,BAO_0000190,BAO_0000219,cell-based format,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,,,CHEMBL1133739,J. Med. Chem.,2000,"{'bei': '15.12', 'le': '0.32', 'lle': '1.13', ...",CHEMBL74874,,CHEMBL74874,4.96,False,http://www.openphacts.org/units/Nanomolar,132840,=,1,True,=,,IC50,nM,,11000.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,11.0
4,,375954,[],CHEMBL678443,Inhibition of A-beta-42 production by inhibiti...,B,BAO_0000190,BAO_0000219,cell-based format,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,,,CHEMBL1133739,J. Med. Chem.,2000,"{'bei': '13.43', 'le': '0.33', 'lle': '1.06', ...",CHEMBL75183,,CHEMBL75183,5.00,False,http://www.openphacts.org/units/Nanomolar,132838,=,1,True,=,,IC50,nM,,10000.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131,,18754810,[],CHEMBL4265047,Inhibition of HFIP-pretreated amyloid beta (1 ...,B,BAO_0000190,BAO_0000357,single protein format,COc1ccccc1/C=C/c1cc(=O)c(O)co1,,,CHEMBL4261624,Bioorg Med Chem,2018,"{'bei': '20.88', 'le': '0.39', 'lle': '2.58', ...",CHEMBL4286002,,CHEMBL4286002,5.10,False,http://www.openphacts.org/units/Nanomolar,3116204,=,1,True,=,,IC50,nM,,7930.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,7.93
1133,,18754812,[],CHEMBL4265047,Inhibition of HFIP-pretreated amyloid beta (1 ...,B,BAO_0000190,BAO_0000357,single protein format,O=c1cc(/C=C/c2ccc(O)cc2)occ1O,,,CHEMBL4261624,Bioorg Med Chem,2018,"{'bei': '22.34', 'le': '0.41', 'lle': '2.92', ...",CHEMBL4287929,,CHEMBL4287929,5.14,False,http://www.openphacts.org/units/Nanomolar,3116206,=,1,True,=,,IC50,nM,,7200.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,7.2
1134,,18754813,[],CHEMBL4265047,Inhibition of HFIP-pretreated amyloid beta (1 ...,B,BAO_0000190,BAO_0000357,single protein format,O=c1cc(/C=C/c2ccccc2O)occ1O,,,CHEMBL4261624,Bioorg Med Chem,2018,"{'bei': '22.07', 'le': '0.41', 'lle': '2.86', ...",CHEMBL4284208,,CHEMBL4284208,5.08,False,http://www.openphacts.org/units/Nanomolar,3116207,=,1,True,=,,IC50,nM,,8290.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,8.29
1135,,18754814,[],CHEMBL4265047,Inhibition of HFIP-pretreated amyloid beta (1 ...,B,BAO_0000190,BAO_0000357,single protein format,O=c1cc(/C=C/c2cc(O)cc(O)c2)occ1O,,,CHEMBL4261624,Bioorg Med Chem,2018,"{'bei': '19.28', 'le': '0.36', 'lle': '2.82', ...",CHEMBL4294830,,CHEMBL4294830,4.75,False,http://www.openphacts.org/units/Nanomolar,3116208,=,1,True,=,,IC50,nM,,17950.0,CHEMBL2487,Homo sapiens,Beta amyloid A4 protein,9606,,,IC50,uM,UO_0000065,,17.95


In [47]:
# One row was dropped due to missing standard value data

## Data Preprocessing of Bioactivity Data

#### The goal of the project is to identify which descriptors influence whether a compound is active, inactive, or intermediate against Beta amyloid A4 protein. 

IC50 values can be used to label the compounds as one of the three categories. Compounds with values ≤ 1,000 nM will be considered active, and those ≥ 10,000 nM will be labeled inactive. Values in between will be intermediate. 

### We will create a new dataframe by extracting bioactivity data and the compound identifiers from the original. First, label each compound as active, inactive, or intermediate. 

#### Rows without a Standard Value entry were already removed in the Missing Data section, so every remaining compound can be labeled. 

In [48]:
def _classify_ic50(raw_value):
    """Return the activity label for a single standard_value entry.

    The entry is converted to float and compared against two cut-offs:
    >= 10000 -> "inactive", <= 1000 -> "active", anything else ->
    "intermediate".
    """
    ic50 = float(raw_value)
    if ic50 >= 10000:
        return "inactive"
    if ic50 <= 1000:
        return "active"
    return "intermediate"


# One label per row of df2_nr, in row order.
bioactivity_class = [_classify_ic50(value) for value in df2_nr.standard_value]

In [49]:
# Number of labels produced (one per row of df2_nr)
print(len(bioactivity_class))

862


### Build a DataFrame of ChEMBL ID, canonical SMILES, and standard value. 

In [50]:
# Columns carried forward: compound identifier, structure, and IC50 value.
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
# reset_index() renumbers the rows from 0 and keeps the previous row labels
# as an extra 'index' column.
df4 = df2_nr[selection].reset_index()
df4

Unnamed: 0,index,molecule_chembl_id,canonical_smiles,standard_value
0,0,CHEMBL311039,CC12CCC(C1)C(C)(C)C2NS(=O)(=O)c1ccc(F)cc1,5000.0
1,1,CHEMBL450926,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1cccs1,2700.0
2,2,CHEMBL310242,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,1800.0
3,3,CHEMBL74874,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,11000.0
4,4,CHEMBL75183,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,10000.0
...,...,...,...,...
857,1131,CHEMBL4286002,COc1ccccc1/C=C/c1cc(=O)c(O)co1,7930.0
858,1133,CHEMBL4287929,O=c1cc(/C=C/c2ccc(O)cc2)occ1O,7200.0
859,1134,CHEMBL4284208,O=c1cc(/C=C/c2ccccc2O)occ1O,8290.0
860,1135,CHEMBL4294830,O=c1cc(/C=C/c2cc(O)cc(O)c2)occ1O,17950.0


In [51]:
# Rebind the list of labels as a pandas Series named 'class' so it can be
# joined to the compound table as a column.
bioactivity_class = pd.Series(bioactivity_class, name='class')
bioactivity_class

0      intermediate
1      intermediate
2      intermediate
3          inactive
4          inactive
           ...     
857    intermediate
858    intermediate
859    intermediate
860        inactive
861    intermediate
Name: class, Length: 862, dtype: object

In [56]:
# True if any label is missing.
bioactivity_class.isna().any()

False

In [52]:
# Append the 'class' labels to the compound table as a new column; rows are
# matched on their index labels.
df5 = pd.concat(
    [df4, bioactivity_class],
    axis='columns',
)
df5

Unnamed: 0,index,molecule_chembl_id,canonical_smiles,standard_value,class
0,0,CHEMBL311039,CC12CCC(C1)C(C)(C)C2NS(=O)(=O)c1ccc(F)cc1,5000.0,intermediate
1,1,CHEMBL450926,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1cccs1,2700.0,intermediate
2,2,CHEMBL310242,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,1800.0,intermediate
3,3,CHEMBL74874,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,11000.0,inactive
4,4,CHEMBL75183,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,10000.0,inactive
...,...,...,...,...,...
857,1131,CHEMBL4286002,COc1ccccc1/C=C/c1cc(=O)c(O)co1,7930.0,intermediate
858,1133,CHEMBL4287929,O=c1cc(/C=C/c2ccc(O)cc2)occ1O,7200.0,intermediate
859,1134,CHEMBL4284208,O=c1cc(/C=C/c2ccccc2O)occ1O,8290.0,intermediate
860,1135,CHEMBL4294830,O=c1cc(/C=C/c2cc(O)cc(O)c2)occ1O,17950.0,inactive


In [53]:
# Drop rows left without a label, then rows left without a SMILES string.
df6 = df5.dropna(subset=['class'])
df7 = df6.dropna(subset=['canonical_smiles'])
df7

Unnamed: 0,index,molecule_chembl_id,canonical_smiles,standard_value,class
0,0,CHEMBL311039,CC12CCC(C1)C(C)(C)C2NS(=O)(=O)c1ccc(F)cc1,5000.0,intermediate
1,1,CHEMBL450926,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1cccs1,2700.0,intermediate
2,2,CHEMBL310242,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,1800.0,intermediate
3,3,CHEMBL74874,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,11000.0,inactive
4,4,CHEMBL75183,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,10000.0,inactive
...,...,...,...,...,...
857,1131,CHEMBL4286002,COc1ccccc1/C=C/c1cc(=O)c(O)co1,7930.0,intermediate
858,1133,CHEMBL4287929,O=c1cc(/C=C/c2ccc(O)cc2)occ1O,7200.0,intermediate
859,1134,CHEMBL4284208,O=c1cc(/C=C/c2ccccc2O)occ1O,8290.0,intermediate
860,1135,CHEMBL4294830,O=c1cc(/C=C/c2cc(O)cc(O)c2)occ1O,17950.0,inactive


In [54]:
# Write the labeled compound table to disk. index=False omits the row index;
# the 'index' column that df7 carries as a regular column is still written.
df7.to_csv('bioactivity_preprocessed_data.csv', index=False)