In [None]:
# Notebook setup: report the environment, make the local code importable,
# pull in the project modules and configure display / warning behaviour.

# Report the installed Jupyter Notebook version via a shell escape.
print('Jupyter notebook version:')
!jupyter notebook --version
print('-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.')

import sys, os
import os.path

# Capture the output of `which python3` with the IPython `%system` magic
# so the interpreter location can be printed.
PYTHON = %system which python3
print('python dir:     ', PYTHON)

# Add the `codes` sub-folder of the working directory to the import path
# (only once) so the project modules below can be imported.
current_dir = os.getcwd()
full_path = os.path.join(current_dir, 'codes')
if full_path not in sys.path:
    sys.path.append(full_path)
    
# Star imports from the project modules.
# NOTE(review): names used below and in later cells that are not imported
# explicitly in this notebook (rdkit, pd, np, tqdm, mp, time, date, rdBase,
# the level functions, ...) presumably come from these two modules - confirm.
from OpencanSARchem_libraries import *
from OpencanSARchem import *


# Print the versions of the interpreter and of the main libraries.
print('')
print('Python: ', sys.version.split()[0])
print('RDKit:  ',rdkit.__version__)
print('Pandas: ',pd.__version__)
print('Numpy:  ',np.__version__)
print('tqdm:   ',tqdm.__version__)
print('-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.')

# Enable the IPython autoreload extension in mode 2.
%load_ext autoreload
%autoreload 2
print('')
    
# Remove the column and row limits when DataFrames are displayed.
pd.set_option('display.max_columns',None, 'display.max_rows',None)
# Silence every warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')
    
# Print a timestamp for this run.
from datetime import datetime
now = datetime.now()
print("Current date and time:", now.strftime("%Y-%m-%d %H:%M:%S"))


# <font color='green'>OpencanSARchem.ipynb</font>
### <font color='teal'>Read the files downloaded from ChEMBL, PubChem, BindingDB, or PDB (e.g. chembl_34_chemreps.txt.gz)</font>

    ChEMBL:    https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/chembl_<version>_chemreps.txt.gz
    PubChem:   https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Extras/CID-SMILES.gz
    BindingDB: https://www.bindingdb.org/rwd/bind/chemsearch/marvin/Download.jsp/BindingDB_All_<date>.tsv.zip
    PDB:       http://ligand-expo.rcsb.org/ld-download.html/Components-smiles-stereo-oe.smi 

    PubChem BioAssay (bioactivities): https://ftp.ncbi.nlm.nih.gov/pubchem/Bioassay/Extras/bioactivities.tsv.gz
    PubChem BioAssay (Sid2CidSMILES): https://ftp.ncbi.nlm.nih.gov/pubchem/Bioassay/Extras/Sid2CidSMILES.gz

In [None]:
# Folder holding the downloaded database dumps: a "database" directory
# directly under the directory the notebook is running in.
current_dir = os.getcwd()
database_path = os.path.join(
    current_dir,
    "database",
)

In [None]:
# --- Configuration -----------------------------------------------------------
# Source database to read. Must be one of the keys of `_SOURCES` below:
# 'ChEMBL', 'BindingDB', 'PubChem', 'PubChem_bio', 'canSAR' or 'PDB'.
# (Named `input_database` rather than `input` so the builtin `input()` is not
# shadowed for the rest of the kernel session.)
input_database = 'ChEMBL'
# Fraction of the input structures to process (1 = everything).
fraction = 1


def _prepend(prefix):
    """Return a function that prefixes every identifier (cast to str) with `prefix`."""
    return lambda ids: prefix + ids.astype(str)


# One entry per supported database:
#   file        - file name inside `database_path`
#   basename    - prefix used for every output file written later on
#   read_kwargs - keyword arguments passed to `pd.read_csv`
#   rename      - column renames that yield 'MOLREGNO' / 'Canonical SMILES'
#   tag_ids     - function applied to the 'MOLREGNO' column to add the source tag
_SOURCES = {
    'ChEMBL': {
        'file': 'chembl_34_chemreps_5000.txt.gz',
        'basename': 'chembl_34_chemreps',
        'read_kwargs': {'sep': '\t', 'usecols': ['chembl_id', 'canonical_smiles']},
        'rename': {'chembl_id': 'MOLREGNO', 'canonical_smiles': 'Canonical SMILES'},
        # Literal replacement on the identifier column only; the SMILES column
        # is no longer scanned by a whole-frame regex replace.
        'tag_ids': lambda ids: ids.str.replace('CHEMBL', 'ch:', regex=False),
    },
    'BindingDB': {
        'file': 'BindingDB_All_202406_tsv.zip',
        'basename': 'BindingDB_All_202406',
        'read_kwargs': {'sep': '\t', 'usecols': ['Ligand SMILES', 'BindingDB MonomerID']},
        'rename': {'Ligand SMILES': 'Canonical SMILES', 'BindingDB MonomerID': 'MOLREGNO'},
        'tag_ids': _prepend('bi:'),
    },
    'PubChem': {
        'file': 'CID-SMILES.gz',
        'basename': 'PubChem_CIDS_202307',
        # The file is read without a header row; column names are supplied here.
        'read_kwargs': {'sep': '\t', 'names': ['MOLREGNO', 'Canonical SMILES'], 'header': None},
        'rename': {},
        'tag_ids': _prepend('pu:'),
    },
    'PubChem_bio': {
        'file': 'pubchem_bioactivity.csv.gz',
        'basename': 'PubChem_bio_202407',
        'read_kwargs': {'usecols': ['CID', 'Isomeric SMILES']},
        'rename': {'CID': 'MOLREGNO', 'Isomeric SMILES': 'Canonical SMILES'},
        'tag_ids': _prepend('pu:'),
    },
    'canSAR': {
        'file': 'canSAR_unmatched_202407.csv',
        # NOTE(review): unlike the other entries this basename keeps the '.csv'
        # extension, so outputs are named 'canSAR_unmatched_202407.csv_*'.
        # Left unchanged to keep existing output names - confirm it is intended.
        'basename': 'canSAR_unmatched_202407.csv',
        'read_kwargs': {'usecols': ['compound_id', 'smiles']},
        'rename': {'compound_id': 'MOLREGNO', 'smiles': 'Canonical SMILES'},
        'tag_ids': _prepend('ca:'),
    },
    'PDB': {
        'file': 'Components-smiles-stereo-oe.smi',
        'basename': 'PDB_Ligands_202406',
        'read_kwargs': {'sep': '\t', 'names': ['Canonical SMILES', 'MOLREGNO', 'StructName']},
        'rename': {},
        'tag_ids': _prepend('pd:'),
    },
}

# Fail fast on an unknown name. Previously any unrecognised value (e.g. a
# typo such as 'Chembl') silently fell through to the PDB branch.
if input_database not in _SOURCES:
    raise ValueError(
        f'Unknown input database {input_database!r}; expected one of {sorted(_SOURCES)}'
    )
_source = _SOURCES[input_database]

print('Input Database is:', input_database)

smiles_file = os.path.join(database_path, _source['file'])
basename = _source['basename']
print('Output files basename:', basename)

# Read the raw file and normalise it to the two columns used downstream.
df = pd.read_csv(smiles_file, **_source['read_kwargs'])
df = df.rename(columns=_source['rename'])
df['MOLREGNO'] = _source['tag_ids'](df['MOLREGNO'])
df = df[['MOLREGNO', 'Canonical SMILES']]

print('no of input structures:', df.shape[0])
print('fraction is:', fraction)

no_of_struct_to_process = int(fraction * df.shape[0])
print('no_of_struct_to_process: ', no_of_struct_to_process)

# The canSAR input additionally shows a short preview, as before.
if input_database == 'canSAR':
    print(df.head(3))
    

In [None]:
# Standardisation pipeline: convert the input SMILES to molecules, then for
# each standardisation level apply the level transformation and write
# representations, PAINS flags, toxicophore flags, properties and (for the
# last two levels) Murcko scaffolds to compressed TSV files.
TIME1 = time.time()

# Optionally down-sample the input. A failure here (typically `df` or
# `fraction` missing because the loading cell was not run) is re-raised with
# the original error chained, instead of being swallowed by a bare `except`
# followed by `exit()`, which hid the cause and ended the session.
# NOTE(review): the sample is unseeded, so fraction < 1 selects a different
# subset on every run; pass `random_state=` if reproducibility is required.
try:
    if fraction < 1:
        df = df.sample(frac=fraction)
except Exception as err:
    raise RuntimeError('Loading the dataset failed.') from err

# collect metadata
df['RDKit_Version'] = rdBase.rdkitVersion
df['Run_Date'] = str(date.today())
df['Input_Version'] = basename

# Pool sizing. Both values are clamped to at least 1: on a single-core host
# `cpu_count() - 1` and `cpu_count() // 2` are 0, which the pool and its
# `imap` reject with a ValueError.
n_cpus = mp.cpu_count()
n_workers = max(1, n_cpus - 1)
chunksize = max(1, n_cpus // 2)
print(n_cpus)
print(chunksize)

FAILURE_COLUMNS = ['MOLREGNO', 'Canonical SMILES', 'Category']
failures_file = basename + '_Failures.tsv'


def _parallel_map(func, items):
    """Apply `func` to each element of `items` in a worker pool.

    Args:
        func: callable taking a single element of `items`.
        items: list of inputs.

    Returns:
        list with one result per input, in input order (`imap` keeps order).
    """
    with mp.Pool(n_workers) as pool:
        # `total` is the number of items actually submitted, so the progress
        # bar stays correct after rows have been dropped.
        return list(tqdm.tqdm(pool.imap(func, items, chunksize=chunksize), total=len(items)))


def _drop_failures(frame, mode, header=True):
    """Write rows whose `mol` is missing to the failures file and return the rest.

    Args:
        frame: DataFrame with a `mol` column plus `FAILURE_COLUMNS`.
        mode: file mode for the failures file ('w' to start it, 'a' to append).
        header: whether to write the column header.

    Returns:
        `frame` without the rows whose `mol` is missing.
    """
    failed = frame['mol'].isna().to_numpy()
    frame.loc[failed, FAILURE_COLUMNS].to_csv(failures_file, sep='\t', mode=mode, header=header)
    return frame.dropna(subset=['mol'])


# Intake: SMILES -> molecule objects.
mols = _parallel_map(mol_from_smiles, df['Canonical SMILES'].tolist())
df['Category'] = 'Intake'
df['mol'] = mols

# drop molecules that failed to convert from smiles
df = _drop_failures(df, mode='w')

MURCKO_COLUMNS = [
    'Canonical SMILES (Murcko Scaffold)',
    'InChI (Murcko Scaffold)',
    'InChIKey (Murcko Scaffold)',
    'Non Standard InChI (Murcko Scaffold)',
    'Non Standard InChIKey (Murcko Scaffold)',
]

for level in [FIuTS, uIuTS, uIuuS, uuuuu]:
    level_name = level.__name__
    key_column = f'{level_name}_Non Standard InChIKey'
    id_columns = ['MOLREGNO', 'Canonical SMILES', key_column, 'Category']

    print('Level: ', level)
    # perform transformations
    print(f'perform {level_name} transformations')
    df.loc[:, 'Category'] = level_name
    mols = _parallel_map(level, df.mol.tolist())
    df.loc[:, 'mol'] = mols
    print(df.head(5))
    # save and remove failures (appended to the file started at intake)
    df = _drop_failures(df, mode='a', header=False)

    # get representations
    print(f'generate {level_name} representations')
    reps = _parallel_map(representations, df.mol.tolist())
    df.loc[:, ['Canonical SMILES', 'InChI', 'InChIKey', 'Non Standard InChI', key_column]] = reps
    # Written with the SMILES column renamed; `df` itself keeps its column names.
    (
        df.drop(columns=['mol'])
        .rename(columns={"Canonical SMILES": "Molecule (RDKit)"})
        .to_csv(basename + f'_{level_name}.tsv.gz', sep='\t')
    )

    # get pains
    print(f'generate {level_name} pains')
    filtered = _parallel_map(pains, df.mol.tolist())
    # Explicit copy: new columns are added to this frame only, never to a
    # slice of `df`.
    df_pains = df[id_columns].copy()
    df_pains['HasPAINS'] = [item[0] for item in filtered]
    df_pains['PAINS_Descriptions'] = [item[1] for item in filtered]
    df_pains['PAINS_References'] = [item[2] for item in filtered]
    df_pains.to_csv(basename + f'_{level_name}_pains.tsv.gz', sep='\t')
    print(df_pains.head(5))
    del df_pains

    # get toxicophores
    print(f'generate {level_name} toxicophores')
    filtered = _parallel_map(toxicophore, df.mol.tolist())
    df_toxi = df[id_columns].copy()
    df_toxi['HasToxicophore'] = filtered
    df_toxi.to_csv(basename + f'_{level_name}_toxi.tsv.gz', sep='\t')
    print(df_toxi.head(5))
    del df_toxi

    # get properties (computed from the SMILES of the current level)
    print(f'generate {level_name} properties')
    props = _parallel_map(mol_props, df['Canonical SMILES'].tolist())
    df_props = df[id_columns + ['Run_Date', 'RDKit_Version']].copy()
    df_props.loc[:, prop_list_names] = props
    df_props.to_csv(basename + f'_{level_name}_properties.tsv.gz', sep='\t')
    print(df_props.head(5))
    del df_props

    if level_name in ('uIuuS', 'uuuuu'):
        # generate murcko scaffolds
        print(f'generate {level_name} murcko scaffolds')
        df_murcko = df.copy()
        df_murcko['murcko_mol'] = _parallel_map(get_murcko, df_murcko.mol.tolist())
        df_murcko = df_murcko.dropna(subset=['murcko_mol'])  # drop compounds without rings

        # get representations of murcko scaffolds
        print('generate murcko scaffold representations')
        reps = _parallel_map(representations, df_murcko.murcko_mol.tolist())
        df_murcko[MURCKO_COLUMNS] = reps

        # At level uIuuS the same scaffolds are written once per preceding
        # level, keyed by that level's InChIKey column; uuuuu writes its own.
        if level_name == 'uIuuS':
            scaffold_categories = ['FIuTS', 'uIuTS', 'uIuuS']
        else:
            scaffold_categories = ['uuuuu']
        for category in scaffold_categories:
            df_murcko.loc[:, 'Category'] = category
            df_murcko[[
                f'{category}_Non Standard InChIKey',
                'Category',
                'Run_Date',
                'Canonical SMILES (Murcko Scaffold)',
                'InChI (Murcko Scaffold)',
                'InChIKey (Murcko Scaffold)',
            ]].to_csv(basename + f'_{category}_Murcko_Scaffolds.tsv.gz', sep='\t')
        del df_murcko


print('*****************************************************************************************************************************')
print('*****************************************************************************************************************************')
print('*****************************************************************************************************************************')

TIME2 = time.time()
TOTAL_TIME = np.round((TIME2 - TIME1) / 60, 2)
print(f'Execution took {TOTAL_TIME} minutes.')