<a href="https://colab.research.google.com/github/baker371/Drug-Discovery/blob/main/Malaria%20Drug%20Discovery/Malaria_Drug_Discovery_Part_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Machine learning model to predict malaria treatments using target data from the ChEMBL database

### Data Collection

In [70]:
# chembl_webresource_client is the package used below to retrieve bioactivity
# data from the ChEMBL Database. Uncomment the install line if the package is
# not already available in the current environment.

#! pip install chembl_webresource_client

In [71]:
import pandas as pd
import numpy as np

from chembl_webresource_client.new_client import new_client

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas warnings about assigning into a slice of another DataFrame.
warnings.filterwarnings('ignore')

In [72]:
# Target search for Malaria: query the ChEMBL target endpoint for the species
# name and tabulate the hits so the candidate target_chembl_id values can be
# inspected.
target = new_client.target
target_query = target.search('Plasmodium malariae')
targets = pd.DataFrame.from_dict(target_query)
# Bare expression so the notebook renders the table of hits.
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Plasmodium malariae,Plasmodium malariae,30.0,False,CHEMBL613257,[],ORGANISM,5858
1,[],Homo sapiens,Duffy antigen/chemokine receptor,14.0,False,CHEMBL2321626,"[{'accession': 'Q16570', 'component_descriptio...",SINGLE PROTEIN,9606
2,[],Plasmodium falciparum,Plasmodium falciparum,12.0,False,CHEMBL364,[],ORGANISM,5833
3,[],Plasmodium berghei,Plasmodium berghei,12.0,False,CHEMBL612653,[],ORGANISM,5821
4,[],Plasmodium yoelii,Plasmodium yoelii,12.0,False,CHEMBL612889,[],ORGANISM,5861
5,[],Plasmodium cynomolgi,Plasmodium cynomolgi,12.0,False,CHEMBL613883,[],ORGANISM,5827
6,[],Plasmodium chabaudi,Plasmodium chabaudi,12.0,False,CHEMBL613256,[],ORGANISM,5825
7,[],Plasmodium knowlesi,Plasmodium knowlesi,12.0,False,CHEMBL613011,[],ORGANISM,5850
8,[],Plasmodium vivax,Plasmodium vivax,12.0,False,CHEMBL613013,[],ORGANISM,5855
9,[],Plasmodium gallinaceum,Plasmodium gallinaceum,12.0,False,CHEMBL3301401,[],ORGANISM,5849


In [73]:
# Target picked by trial: different target IDs were tried here while
# experimenting.
# NOTE(review): this ID is not among the search results displayed above; the
# activity records retrieved for it report "Plasmodium falciparum 3D7" as the
# target name.

selected_target = 'CHEMBL2366922'

In [74]:
# Retrieve bioactivity data for selected target with reported IC50 values

activity = new_client.activity
# Two chained filters: restrict to the chosen target, then to IC50 measurements.
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")
# Materialise the query result as a DataFrame (one row per activity record).
dfn = pd.DataFrame.from_dict(res)
dfn.head(5)

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,12393147,[],CHEMBL3051455,Antimalarial activity against chloroquine-sens...,F,,,BAO_0000190,BAO_0000218,organism-based format,CCN(CC)CCCC(C)Nc1ccnc2cc(Cl)ccc12,,,CHEMBL3044612,Med Chem Res,2012,,CHEMBL76,CHLOROQUINE,CHEMBL76,,False,http://www.openphacts.org/units/MicrogramPerMi...,1799100,=,1,True,=,,IC50,ug.mL-1,,0.0223,CHEMBL2366922,Plasmodium falciparum 3D7,Plasmodium falciparum 3D7,36329,,,IC50,ng/ml,UO_0000274,,22.3
1,,12393154,[],CHEMBL3051455,Antimalarial activity against chloroquine-sens...,F,,,BAO_0000190,BAO_0000218,organism-based format,O=C(/C=C/c1ccco1)c1c(O)c2ccccc2oc1=O,,,CHEMBL3044612,Med Chem Res,2012,,CHEMBL2237985,,CHEMBL2237985,,False,http://www.openphacts.org/units/MicrogramPerMi...,1799099,>,1,True,>,,IC50,ug.mL-1,,50.0,CHEMBL2366922,Plasmodium falciparum 3D7,Plasmodium falciparum 3D7,36329,,,IC50,ug ml-1,UO_0000274,,50.0
2,,12393161,[],CHEMBL3051455,Antimalarial activity against chloroquine-sens...,F,,,BAO_0000190,BAO_0000218,organism-based format,COc1cc(/C=C/C(=O)c2c(O)c3ccccc3oc2=O)cc(OC)c1OC,,,CHEMBL3044612,Med Chem Res,2012,,CHEMBL2234257,,CHEMBL2234257,,False,http://www.openphacts.org/units/MicrogramPerMi...,1799098,>,1,True,>,,IC50,ug.mL-1,,50.0,CHEMBL2366922,Plasmodium falciparum 3D7,Plasmodium falciparum 3D7,36329,,,IC50,ug ml-1,UO_0000274,,50.0
3,,12393168,[],CHEMBL3051455,Antimalarial activity against chloroquine-sens...,F,,,BAO_0000190,BAO_0000218,organism-based format,COc1ccc(/C=C/C(=O)c2c(O)c3ccccc3oc2=O)c(OC)c1OC,,,CHEMBL3044612,Med Chem Res,2012,,CHEMBL2237984,,CHEMBL2237984,,False,http://www.openphacts.org/units/MicrogramPerMi...,1799097,=,1,True,=,,IC50,ug.mL-1,,8.5,CHEMBL2366922,Plasmodium falciparum 3D7,Plasmodium falciparum 3D7,36329,,,IC50,ug ml-1,UO_0000274,,8.5
4,,12393175,[],CHEMBL3051455,Antimalarial activity against chloroquine-sens...,F,,,BAO_0000190,BAO_0000218,organism-based format,COc1ccc(/C=C/C(=O)c2c(O)c3ccccc3oc2=O)cc1OC,,,CHEMBL3044612,Med Chem Res,2012,,CHEMBL2237983,,CHEMBL2237983,,False,http://www.openphacts.org/units/MicrogramPerMi...,1799096,=,1,True,=,,IC50,ug.mL-1,,6.2,CHEMBL2366922,Plasmodium falciparum 3D7,Plasmodium falciparum 3D7,36329,,,IC50,ug ml-1,UO_0000274,,6.2


In [75]:
# Keep only the columns used in the rest of the notebook: the compound
# identifier, its SMILES string and the reported standard_value.
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']

# .copy() makes dfm an independent frame. Without it dfm is a slice of dfn, and
# the column assignments made on dfm in later cells trigger pandas'
# SettingWithCopyWarning.
dfm = dfn[selection].copy()
dfm.head(5)

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL76,CCN(CC)CCCC(C)Nc1ccnc2cc(Cl)ccc12,0.0223
1,CHEMBL2237985,O=C(/C=C/c1ccco1)c1c(O)c2ccccc2oc1=O,50.0
2,CHEMBL2234257,COc1cc(/C=C/C(=O)c2c(O)c3ccccc3oc2=O)cc(OC)c1OC,50.0
3,CHEMBL2237984,COc1ccc(/C=C/C(=O)c2c(O)c3ccccc3oc2=O)c(OC)c1OC,8.5
4,CHEMBL2237983,COc1ccc(/C=C/C(=O)c2c(O)c3ccccc3oc2=O)cc1OC,6.2


In [76]:
# Check the column dtypes before converting standard_value.
dfm.dtypes

molecule_chembl_id    object
canonical_smiles      object
standard_value        object
dtype: object

In [77]:
# standard_value has dtype object at this point; convert it to a numeric dtype
# so it can be compared against numeric thresholds.
# assign() returns a new frame that is rebound to dfm, so nothing is written
# into a frame that may still be a slice of dfn (no SettingWithCopyWarning).
dfm = dfm.assign(standard_value=pd.to_numeric(dfm["standard_value"]))

In [78]:
# Re-check the dtypes to confirm standard_value is now numeric.
dfm.dtypes

molecule_chembl_id     object
canonical_smiles       object
standard_value        float64
dtype: object

In [79]:
# Label every record according to its standard_value.
# np.select assigns the label of the FIRST condition that matches, so the
# conditions are written as a descending cascade with no gap between them
# (a value such as 9999.5 is Intermediate instead of matching nothing).
# NOTE(review): the 1000 / 10000 cut-offs look like thresholds intended for
# nM values, while the records displayed above report standard_units of
# ug.mL-1 -- confirm the units before relying on these labels.
conditions = [
    (dfm['standard_value'] >= 10000),
    (dfm['standard_value'] > 1000),    # reached only when value < 10000
    (dfm['standard_value'] <= 1000)
    ]

# Labels, in the same order as the conditions above.
values = ['Inactive','Intermediate', 'Active']

# A missing standard_value satisfies none of the comparisons. Give such rows an
# explicit string label rather than np.select's integer default (0), which has
# no common dtype with the string labels.
# assign() returns a new frame, so no column is written into a possible slice.
dfm = dfm.assign(
    bioactivity_class=np.select(conditions, values, default='Unknown')
)

# display updated DataFrame
dfm.head(5)

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL76,CCN(CC)CCCC(C)Nc1ccnc2cc(Cl)ccc12,0.0223,Active
1,CHEMBL2237985,O=C(/C=C/c1ccco1)c1c(O)c2ccccc2oc1=O,50.0,Active
2,CHEMBL2234257,COc1cc(/C=C/C(=O)c2c(O)c3ccccc3oc2=O)cc(OC)c1OC,50.0,Active
3,CHEMBL2237984,COc1ccc(/C=C/C(=O)c2c(O)c3ccccc3oc2=O)c(OC)c1OC,8.5,Active
4,CHEMBL2237983,COc1ccc(/C=C/C(=O)c2c(O)c3ccccc3oc2=O)cc1OC,6.2,Active


In [80]:
# (rows, columns) before duplicate compounds are removed.
dfm.shape

(1688, 4)

In [81]:
# Drop every compound that occurs more than once: keep=False removes ALL rows
# sharing a molecule_chembl_id, not only the repeated occurrences.
# The result is reassigned instead of using inplace=True, so the frame is not
# mutated in place (it may be a slice of dfn).
# NOTE(review): this runs before rows with a missing standard_value are
# filtered out, so a compound with one valid and one missing record is removed
# entirely -- confirm this is intended.
dfm = dfm.drop_duplicates(subset="molecule_chembl_id", keep=False)

In [82]:
# Keep only the records that actually carry a standard_value (non-null rows).
df = dfm.loc[dfm["standard_value"].notna()]

In [83]:
# (rows, columns) after removing duplicates and rows without a standard_value.
df.shape

(1320, 4)

In [84]:
# Write the cleaned table to the current working directory; index=False leaves
# the DataFrame's row index out of the file.
df.to_csv('bioactive_data.csv', index=False)

In [85]:
# Colab-only step: mount Google Drive under /content/gdrive/ so the exported
# CSV can be copied there. This import is unavailable outside Google Colab.
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [86]:
# Copy the exported CSV into the mounted Google Drive.
# NOTE(review): assumes the "data" folder already exists in Drive; if it does
# not, cp would create a file named "data" instead -- confirm.
! cp bioactive_data.csv "/content/gdrive/My Drive/Colab Notebooks/data"