In [53]:
#mount drive
def mount_drive():
  """Mount the user's Google Drive inside the Colab runtime.

  The drive is attached at ``/content/gdrive``. Nothing is returned.
  """
  from google.colab import drive as colab_drive
  mount_point = '/content/gdrive'
  colab_drive.mount(mount_point)

In [54]:
#install and import the required libraries
def install_download_lib():
  #chembl
  ! pip install chembl_webresource_client
  import pandas as pd
  import numpy as np
  from chembl_webresource_client.new_client import new_client

#install conda
  ! wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh
  ! chmod +x Miniconda3-py37_4.8.2-Linux-x86_64.sh
  ! bash ./Miniconda3-py37_4.8.2-Linux-x86_64.sh -b -f -p /usr/local
  import sys
  sys.path.append('/usr/local/lib/python3.7/site-packages/')

#install rdkit
  ! conda install -c rdkit rdkit -y

#import lib
  from rdkit import Chem
  from rdkit.Chem import Descriptors, Lipinski

#Download PaDEL-Descriptor

  ! wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.zip
  ! wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.sh

  ! unzip padel.zip

In [55]:
#functions to search for a target protein and fetch its IC50 bioactivity data
def search_target_protein(protein_name='coronavirus'):
  """Search ChEMBL targets by name and return the hits as a DataFrame.

  Args:
    protein_name: search term passed to the ChEMBL target search.

  Returns:
    A pandas DataFrame built from the search results.
  """
  hits = new_client.target.search(protein_name)
  return pd.DataFrame.from_dict(hits)


def select_target_protein(protein_index=4,protein_name='coronavirus'):
  """Fetch the IC50 activity records for one target from the search results.

  Args:
    protein_index: index label of the row to pick from the frame returned by
      search_target_protein.
    protein_name: search term forwarded to search_target_protein.

  Returns:
    A pandas DataFrame built from the activity records whose standard_type
    is "IC50" for the chosen target_chembl_id.
  """
  candidates = search_target_protein(protein_name)
  chembl_id = candidates.target_chembl_id[protein_index]
  by_target = new_client.activity.filter(target_chembl_id=chembl_id)
  ic50_records = by_target.filter(standard_type="IC50")
  return pd.DataFrame.from_dict(ic50_records)

#fn for cleaning the bioactivity data
def select_n_clean_target_protein(protein_index=4,protein_name='coronavirus'):
  """Return the target's bioactivity rows that have a standard_value.

  Args:
    protein_index: forwarded to select_target_protein.
    protein_name: forwarded to select_target_protein.

  Returns:
    The bioactivity DataFrame without the rows whose standard_value is
    missing; the surviving rows keep their original index labels.
  """
  raw = select_target_protein(protein_index,protein_name)
  missing_value = raw.standard_value.isna()
  return raw[~missing_value]

In [56]:
#pre-processing of the bioactivity data

def _bioactivity_label(standard_value):
  """Map one standard_value to 'inactive', 'active' or 'intermediate'."""
  ic50 = float(standard_value)
  if ic50 >= 10000:
    return "inactive"
  if ic50 <= 1000:
    return "active"
  return "intermediate"


def preprocess_bioactivity_of_target_protein(protein_index=4,protein_name='coronavirus'):
  """Build the labelled bioactivity table for a target.

  Args:
    protein_index: forwarded to select_n_clean_target_protein.
    protein_name: forwarded to select_n_clean_target_protein.

  Returns:
    A DataFrame with the columns molecule_chembl_id, canonical_smiles,
    standard_value and bioactivity_class. A row is "inactive" when its
    standard_value is >= 10000, "active" when it is <= 1000 and
    "intermediate" otherwise. Rows keep the index labels of the cleaned frame.

  Raises:
    ValueError: if a standard_value cannot be converted to float.
  """
  df = select_n_clean_target_protein(protein_index,protein_name)

  selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
  # Copy so that adding a column does not write into a slice of df.
  final = df[selection].copy()

  # Assign a plain list (positional) instead of a fresh pd.Series: a new
  # Series carries a 0..n-1 index, and pandas would align it against the
  # filtered frame's index labels, attaching labels to the wrong rows and
  # leaving NaN wherever the labels do not match.
  final['bioactivity_class'] = [_bioactivity_label(v) for v in final['standard_value']]
  return final

In [57]:
#Calculate Lipinski descriptors

def calculate_lipinski_desc(protein_index=4, protein_name='coronavirus'):
  """Compute Lipinski descriptors for every molecule of a target.

  Args:
    protein_index: forwarded to preprocess_bioactivity_of_target_protein.
    protein_name: forwarded to preprocess_bioactivity_of_target_protein.

  Returns:
    A float DataFrame with the columns MW, LogP, NumHDonors and
    NumHAcceptors, one row per row of the preprocessed table, in the same
    order, on a fresh 0..n-1 index. A row is all-NaN when the SMILES value is
    not a string or RDKit cannot parse it, so the row count always matches
    the preprocessed table.
  """
  sm = preprocess_bioactivity_of_target_protein(protein_index=protein_index,
                                                protein_name=protein_name)
  columnNames = ["MW", "LogP", "NumHDonors", "NumHAcceptors"]

  # Collect the rows in a list and build the frame once; stacking arrays
  # inside the loop is quadratic and yields a 1-D array for a single molecule
  # (or an empty one for none), which does not fit the four columns.
  rows = []
  for smi in sm.canonical_smiles:
    mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
    if mol is None:
      # Keep a placeholder so rows stay aligned with the input table.
      rows.append([np.nan] * len(columnNames))
      continue
    rows.append([
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        Lipinski.NumHDonors(mol),
        Lipinski.NumHAcceptors(mol),
    ])

  descriptors = pd.DataFrame(rows, columns=columnNames, dtype=float)
  return descriptors

In [63]:
#combine the preprocessed bioactivity table with the Lipinski descriptors
def combine_df(protein_index=4,protein_name='coronavirus'):
  """Join the preprocessed bioactivity table with its Lipinski descriptors.

  Args:
    protein_index: forwarded to preprocess_bioactivity_of_target_protein.
    protein_name: forwarded to preprocess_bioactivity_of_target_protein.

  Returns:
    A DataFrame with the preprocessed columns followed by the descriptor
    columns, on a fresh 0..n-1 index.

  Raises:
    ValueError: if the two tables do not have the same number of rows.
  """
  prep=preprocess_bioactivity_of_target_protein(protein_index,protein_name)
  # NOTE(review): calculate_lipinski_desc() is invoked without arguments, so
  # it uses its own target selection rather than the one passed here; the two
  # tables only describe the same molecules when both refer to the same target.
  lip=calculate_lipinski_desc()
  if len(prep) != len(lip):
    raise ValueError(
        'bioactivity table has %d rows but descriptor table has %d'
        % (len(prep), len(lip)))
  # Align by position: prep keeps the index labels of the filtered ChEMBL
  # frame, so concatenating on labels would shift rows against the
  # descriptors and introduce NaN.
  comb = pd.concat([prep.reset_index(drop=True), lip.reset_index(drop=True)], axis=1)
  return comb

#normalize comb
def norm_value(input):
    """Cap standard_value at 100000000 and expose it as standard_value_norm.

    With the nM -> M conversion used by IC50_to_pIC50, a value of 100000000
    corresponds to pIC50 = 1, so the cap keeps pIC50 from dropping below 1.

    Args:
        input: DataFrame with a 'standard_value' column convertible to numbers.

    Returns:
        A new DataFrame without 'standard_value' and with an appended
        'standard_value_norm' column. The frame passed in is not modified.

    Raises:
        ValueError: if a standard_value cannot be parsed as a number.
    """
    capped = pd.to_numeric(input['standard_value']).clip(upper=100000000)
    # `columns=` instead of a positional axis: pandas 2.x rejects drop(label, 1).
    x = input.drop(columns='standard_value').assign(standard_value_norm=capped)
    return x

#IC50 to pIC50
def IC50_to_pIC50(input):
    """Replace standard_value_norm with pIC50 = -log10(value * 1e-9).

    The factor 1e-9 converts the value from nM to M before taking the
    logarithm. A value of 0 yields an infinite pIC50.

    Args:
        input: DataFrame with a numeric 'standard_value_norm' column.

    Returns:
        A new DataFrame without 'standard_value_norm' and with an appended
        'pIC50' column. The frame passed in is not modified.
    """
    molar = np.asarray(input['standard_value_norm'], dtype=float) * (10**-9)  # nM -> M
    pIC50 = -np.log10(molar)
    # `columns=` instead of a positional axis: pandas 2.x rejects drop(label, 1).
    x = input.drop(columns='standard_value_norm').assign(pIC50=pIC50)
    return x

In [59]:
#remove intermediate 
def remove_intermediate():
  """Build the pIC50 table, drop the 'intermediate' class and save it.

  Runs combine_df -> norm_value -> IC50_to_pIC50, keeps the rows whose
  bioactivity_class is not 'intermediate', writes them to
  'bioactivity_final.csv' and returns them.
  """
  with_pic50 = IC50_to_pIC50(norm_value(combine_df()))
  not_intermediate = with_pic50.bioactivity_class != 'intermediate'
  kept = with_pic50[not_intermediate]
  kept.to_csv('bioactivity_final.csv')
  return kept

In [60]:
#generate molecule.smi and run PaDEL on it
def generate_molecule_smi():
  d=remove_intermediate()
  df=pd.read_csv('bioactivity_final.csv')
  selection = ['canonical_smiles','molecule_chembl_id']
  df_sel = df[selection]
  df_sel.to_csv('molecule.smi', sep='\t', index=False, header=False)
  #Calculate PaDEL descriptors
  ! cat padel.sh
  ! bash padel.sh
  ! ls -l

In [61]:
#final export of data
def final_data_expp():
  """Assemble the final table: descriptor columns plus the pIC50 column.

  Runs generate_molecule_smi, then reads 'descriptors_output.csv'
  (presumably produced by padel.sh; confirm) and drops its 'Name' column.
  pIC50 is read from 'bioactivity_final.csv', the file that
  generate_molecule_smi turns into molecule.smi.

  Returns:
    A DataFrame with the descriptor columns followed by 'pIC50'.
  """
  generate_molecule_smi()
  x = pd.read_csv('descriptors_output.csv').drop(columns=['Name'])
  # The original read `df['pIC50']`, but no `df` exists in this scope
  # (NameError); load the table the pipeline has just written instead.
  y = pd.read_csv('bioactivity_final.csv')['pIC50']
  dt = pd.concat([x,y], axis=1)
  return dt

In [64]:
#fn calls
# Environment setup: mount Google Drive, then install and import the
# libraries the pipeline functions rely on. Run this before any pipeline call.
mount_drive()
install_download_lib()


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
--2021-09-15 19:18:32--  https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh
Resolving repo.anaconda.com (repo.anaconda.com)... 104.16.131.3, 104.16.130.3, 2606:4700::6810:8203, ...
Connecting to repo.anaconda.com (repo.anaconda.com)|104.16.131.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 85055499 (81M) [application/x-sh]
Saving to: ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh.8’


2021-09-15 19:18:33 (120 MB/s) - ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh.8’ saved [85055499/85055499]

PREFIX=/usr/local
Unpacking payload ...


KeyboardInterrupt: ignored

In [1]:
# Runs every pipeline stage with its default arguments.
# These calls need the defining cells and install_download_lib() to have been
# run in the current kernel first; otherwise the names are undefined.
# NOTE(review): each later stage already calls the earlier ones internally, so
# final_data_expp() alone runs the whole pipeline; the preceding calls repeat
# the same work, and only the last call's value is shown as the cell result.
search_target_protein()
select_target_protein()
select_n_clean_target_protein()
preprocess_bioactivity_of_target_protein()
calculate_lipinski_desc()
remove_intermediate()
generate_molecule_smi()
final_data_expp()

NameError: ignored