In [None]:
! pip install chembl_webresource_client

import pandas as pd
import numpy as np
from chembl_webresource_client.new_client import new_client

Endometriosis is associated with progesterone resistance: https://pmc.ncbi.nlm.nih.gov/articles/PMC9687824/. 
Therefore, in drug discoveries to relieve endometriosis, we're looking for substances that can activate Progesterone receptors and improve binding

In [None]:
target = new_client.target
target_query = target.search('progesterone') 
targets = pd.DataFrame.from_dict(target_query)
targets

Using id[0] to return the protein associated with homo sapien. 

In [None]:
selected = targets.target_chembl_id[0]
activity = new_client.activity
res = activity.filter(target_chembl_id = selected).filter(standard_type = "IC50") #IC50 means that the potency is measured by amount needed to halt biological processes by 50%: https://pubmed.ncbi.nlm.nih.gov/27365221/

df = pd.DataFrame.from_dict(res)[lambda d: d.standard_value.notna()] #need standard value to determine potency
df.head(3)

In [None]:
#adding activity label for ML

act_class = []
for i in df.standard_value:
    if float(i) >= 10000:
        act_class.append("inactive")
    elif float(i) <= 1000:
        act_class.append("active")
    else:
        act_class.append("intermediate")


#combining into df
finaldf = df[['molecule_chembl_id', 'canonical_smiles', 'standard_value']] 
#canonical smile is another way to depict molecular structure in text: https://luis-vollmers.medium.com/tutorial-to-smiles-and-canonical-smiles-explained-with-examples-fbc8a46ca29f
pd.concat([finaldf, pd.Series(act_class)], axis = 1)


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,0
0,CHEMBL1276308,CC#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CCC4=C3[...,0.028,active
1,CHEMBL1276308,CC#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CCC4=C3[...,0.025,active
2,CHEMBL146032,CC(=O)c1ccc([C@H]2C[C@@]3(C)[C@@H](CC[C@@]3(O)...,0.0036,active
3,CHEMBL146032,CC(=O)c1ccc([C@H]2C[C@@]3(C)[C@@H](CC[C@@]3(O)...,0.0025,active
4,CHEMBL286130,C=C1CC(C)(C)Nc2ccc3c(c21)C(c1ccc(Cl)cc1)Oc1ccc...,1755.0,intermediate
...,...,...,...,...
1525,,,,intermediate
1527,,,,active
1533,,,,intermediate
1534,,,,intermediate


Lipiski descriptors: the "druglikeness" of a compound (absorption, distribution, metabolism, excretion), or the pharmacokinetic profile.

In [64]:
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

def lipinski(smiles, verbose=False):

    moldata= []
    for elem in smiles:
        mol=Chem.MolFromSmiles(elem) 
        moldata.append(mol)
       
    baseData= np.arange(1,1)
    i=0  
    for mol in moldata:        
       
        desc_MolWt = Descriptors.MolWt(mol)
        desc_MolLogP = Descriptors.MolLogP(mol)
        desc_NumHDonors = Lipinski.NumHDonors(mol)
        desc_NumHAcceptors = Lipinski.NumHAcceptors(mol)
           
        row = np.array([desc_MolWt,
                        desc_MolLogP,
                        desc_NumHDonors,
                        desc_NumHAcceptors])   
    
        if(i==0):
            baseData=row
        else:
            baseData=np.vstack([baseData, row])
        i=i+1      
    
    columnNames=["MW","LogP","NumHDonors","NumHAcceptors"]   
    descriptors = pd.DataFrame(data=baseData,columns=columnNames)
    
    return descriptors

MW = molecular weight; LogP = Octanol-water partition coefficient (measure of hydrophobism); hydrogen donors and acceptors

In [None]:
lip_df = lipinski(finaldf.canonical_smiles)
df_combine = pd.concat([finaldf, lip_df], axis = 1)
df_combine = df_combine.dropna()
df_combine['standard_value'] = pd.to_numeric(df_combine['standard_value'])
df_combine

In [None]:
def norm_value(input):
    norm = []

    for i in input['standard_value']:
        if i > 100000000:
          i = 100000000
        norm.append(i)

    input['standard_value_norm'] = norm
    x = input.drop('standard_value', axis =1)
        
    return x

df_norm = norm_value(df_combine)


In [None]:
def pIC50(input):
    pIC50 = []

    for i in input['standard_value_norm']:
        molar = i*(10**-9) # Converts nM to M
        pIC50.append(-np.log10(molar))

    input['pIC50'] = pIC50
    x = input.drop('standard_value_norm', axis =1)
        
    return x

df_final = pIC50(df_norm)
df_final