# Dataset Preparation

## Installing and Importing Libraries

In [1]:
!pip install padelpy



PaDELPy provides a Python wrapper for the PaDEL-Descriptor molecular descriptor calculation software. It was created to allow direct access to the PaDEL-Descriptor command-line interface via Python.

https://github.com/ecrl/padelpy

In [2]:
!pip install wget



wget is a Python utility for downloading files.

https://pypi.org/project/wget/

In [3]:
import wget
import zipfile as z
import glob

import pandas as pd
from padelpy import padeldescriptor

## Functions

In [4]:
def download_fingerprints(url):
    """Download the zip archive at ``url`` and extract all of its members.

    The archive is fetched with ``wget.download`` and then unpacked with
    ``ZipFile.extractall()`` called without arguments, i.e. into the current
    working directory.

    Parameters
    ----------
    url : str
        Address of the zip archive to download.

    Returns
    -------
    None
    """
    filename = wget.download(url)
    # The context manager closes the archive handle even if extraction fails.
    with z.ZipFile(filename) as zipf:
        zipf.extractall()

In [5]:
def get_xml():
    """Return the ``*.xml`` file names found in the current working directory, sorted ascending."""
    return sorted(glob.glob('*.xml'))

In [6]:
def xml_string_removal(list_):
    """Return a new list with the last four characters removed from every entry.

    Intended for file names ending in ``.xml``; the suffix is not checked,
    the final four characters are dropped unconditionally.
    """
    return [file_name[:-4] for file_name in list_]

In [7]:
def print_shape(list_):
    """Print the ``shape`` attribute of each element of ``list_``, one per line."""
    for table in list_:
        print(table.shape)

In [8]:
def descriptor_building(fingerprint_list, molecule_dir, descriptor_files=None):
    """Compute one PaDEL fingerprint table per entry of ``fingerprint_list``.

    For every name, ``padeldescriptor`` is run on ``molecule_dir`` with the
    XML descriptor file mapped to that name, the result is written to
    ``<name>.csv``, and that CSV is read back with ``pd.read_csv``.

    Parameters
    ----------
    fingerprint_list : iterable of str
        Fingerprint names; each must be a key of the descriptor mapping.
    molecule_dir : str
        Value passed to ``padeldescriptor`` as ``mol_dir``.
    descriptor_files : mapping, optional
        Maps a fingerprint name to its XML descriptor file. When omitted, the
        notebook-level ``fp`` dictionary is used, which was the only option
        before this parameter existed.

    Returns
    -------
    list of pandas.DataFrame
        One frame per fingerprint, in the order of ``fingerprint_list``.

    Raises
    ------
    KeyError
        If a name is missing from the descriptor mapping.
    """
    descriptor_list = []
    for name in fingerprint_list:
        fingerprint_output_file = f'{name}.csv'
        # The global ``fp`` is resolved lazily, only when no explicit mapping
        # was supplied, so the function does not require it otherwise.
        mapping = fp if descriptor_files is None else descriptor_files
        fingerprint_descriptortypes = mapping[name]
        padeldescriptor(mol_dir=molecule_dir,
                        d_file=fingerprint_output_file,
                        descriptortypes=fingerprint_descriptortypes,
                        detectaromaticity=True,
                        standardizenitro=True,
                        standardizetautomers=True,
                        threads=2,
                        removesalt=True,
                        log=True,
                        fingerprints=True)
        descriptor_list.append(pd.read_csv(fingerprint_output_file))
    return descriptor_list

In [9]:
def lipinski_building(lipinski_list, molecule_dir, descriptor_files=None):
    """Compute one PaDEL 2D-descriptor table per entry of ``lipinski_list``.

    For every name, ``padeldescriptor`` is run on ``molecule_dir`` with the
    XML descriptor file mapped to that name and ``d_2d=True``, the result is
    written to ``<name>.csv``, and that CSV is read back with ``pd.read_csv``.

    Parameters
    ----------
    lipinski_list : iterable of str
        Descriptor names; each must be a key of the descriptor mapping.
    molecule_dir : str
        Value passed to ``padeldescriptor`` as ``mol_dir``.
    descriptor_files : mapping, optional
        Maps a descriptor name to its XML descriptor file. When omitted, the
        notebook-level ``lp`` dictionary is used, which was the only option
        before this parameter existed.

    Returns
    -------
    list of pandas.DataFrame
        One frame per descriptor, in the order of ``lipinski_list``.

    Raises
    ------
    KeyError
        If a name is missing from the descriptor mapping.
    """
    descriptor_list = []
    for name in lipinski_list:
        lipinski_output_file = f'{name}.csv'
        # The global ``lp`` is resolved lazily, only when no explicit mapping
        # was supplied, so the function does not require it otherwise.
        mapping = lp if descriptor_files is None else descriptor_files
        lipinski_descriptortypes = mapping[name]
        padeldescriptor(mol_dir=molecule_dir,
                        d_file=lipinski_output_file,
                        descriptortypes=lipinski_descriptortypes,
                        detectaromaticity=True,
                        standardizenitro=True,
                        standardizetautomers=True,
                        threads=2,
                        removesalt=True,
                        log=True,
                        d_2d=True)
        descriptor_list.append(pd.read_csv(lipinski_output_file))
    return descriptor_list

In [10]:
def save_descriptors_as_csv(fingerprint_list, descriptor_list):
    """Write each table in ``descriptor_list`` to ``<name>.csv``.

    Names and tables are paired by position: the n-th name in
    ``fingerprint_list`` labels the n-th entry of ``descriptor_list``.
    """
    for position, name in enumerate(fingerprint_list):
        target_file = name + '.csv'
        descriptor_list[position].to_csv(target_file)

## Fingerprints

### Fingerprint Files Acquisition

In [11]:
url = 'https://github.com/carineribeirost/data/blob/main/fingerprints_xml.zip?raw=true'

In [12]:
download_fingerprints(url)

  0% [                                                                              ]     0 / 10871 75% [..........................................................                    ]  8192 / 10871100% [..............................................................................] 10871 / 10871

In [13]:
fingerprint_list = get_xml()

In [14]:
fingerprint_name = xml_string_removal(fingerprint_list)
fingerprint_name

['AtomPairs2DFingerprintCount',
 'AtomPairs2DFingerprinter',
 'EStateFingerprinter',
 'ExtendedFingerprinter',
 'Fingerprinter',
 'GraphOnlyFingerprinter',
 'KlekotaRothFingerprintCount',
 'KlekotaRothFingerprinter',
 'MACCSFingerprinter',
 'PubchemFingerprinter',
 'SubstructureFingerprintCount',
 'SubstructureFingerprinter']

A dictionary mapping each fingerprint name to its XML file name is used below.

In [15]:
# Map each fingerprint name to its XML descriptor file.
# descriptor_building() looks names up in this notebook-level `fp` dictionary,
# so this cell must run before that function is called.
fp = dict(zip(fingerprint_name, fingerprint_list))
fp

{'AtomPairs2DFingerprintCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2DFingerprinter': 'AtomPairs2DFingerprinter.xml',
 'EStateFingerprinter': 'EStateFingerprinter.xml',
 'ExtendedFingerprinter': 'ExtendedFingerprinter.xml',
 'Fingerprinter': 'Fingerprinter.xml',
 'GraphOnlyFingerprinter': 'GraphOnlyFingerprinter.xml',
 'KlekotaRothFingerprintCount': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRothFingerprinter': 'KlekotaRothFingerprinter.xml',
 'MACCSFingerprinter': 'MACCSFingerprinter.xml',
 'PubchemFingerprinter': 'PubchemFingerprinter.xml',
 'SubstructureFingerprintCount': 'SubstructureFingerprintCount.xml',
 'SubstructureFingerprinter': 'SubstructureFingerprinter.xml'}

## DataFrame loading and selection of the columns of interest

In [16]:
df_processed = pd.read_csv('df_interest_columns.csv', index_col = 0)
df_processed.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,pIC50
8,CHEMBL217524,O=C(Nc1ccc(F)c(Cl)c1)C1CC(=O)N(C2CCCCC2)C1,inactive,4.828859
9,CHEMBL384149,COc1ccc(Cl)cc1NC(=O)C1CC(=O)N(C2CCCCC2)C1,intermediate,5.79588
10,CHEMBL216339,O=C(Nc1cc(C(F)(F)F)cc(C(F)(F)F)c1)C1CC(=O)N(C2...,intermediate,5.435334
11,CHEMBL216704,COc1cc(NC(=O)C2CC(=O)N(C3CCCCC3)C2)cc(C(F)(F)F)c1,intermediate,5.886057
12,CHEMBL386324,O=C(Nc1cc(Br)cc(C(F)(F)F)c1)C1CC(=O)N(C2CCCCC2)C1,active,6.070581


In [17]:
# SMILES first, ChEMBL id second; written tab-separated, without header or index
smi_columns = ['canonical_smiles', 'molecule_chembl_id']
df_smi = df_processed[smi_columns].copy()
df_smi.to_csv('molecule.smi', sep='\t', header=False, index=False)
df_smi.head()

Unnamed: 0,canonical_smiles,molecule_chembl_id
8,O=C(Nc1ccc(F)c(Cl)c1)C1CC(=O)N(C2CCCCC2)C1,CHEMBL217524
9,COc1ccc(Cl)cc1NC(=O)C1CC(=O)N(C2CCCCC2)C1,CHEMBL384149
10,O=C(Nc1cc(C(F)(F)F)cc(C(F)(F)F)c1)C1CC(=O)N(C2...,CHEMBL216339
11,COc1cc(NC(=O)C2CC(=O)N(C3CCCCC3)C2)cc(C(F)(F)F)c1,CHEMBL216704
12,O=C(Nc1cc(Br)cc(C(F)(F)F)c1)C1CC(=O)N(C2CCCCC2)C1,CHEMBL386324


## Building fingerprints for the selected molecules

This step can take a few minutes to run, so it is a good moment to make a coffee or take a walk around the neighbourhood.

In [18]:
descriptor_list = descriptor_building(fingerprint_name, 'molecule.smi')

In [19]:
print_shape(descriptor_list)

(329, 781)
(329, 781)
(329, 80)
(329, 1025)
(329, 1025)
(329, 1025)
(329, 4861)
(329, 4861)
(329, 167)
(329, 882)
(329, 308)
(329, 308)


In [20]:
save_descriptors_as_csv(fingerprint_name, descriptor_list)

## Lipinski Descriptors

The exact same process is used to obtain the Lipinski descriptors for the molecules.

In [21]:
url_lip = 'https://github.com/carineribeirost/data/blob/main/Lipinski_xml.zip?raw=true'

In [22]:
download_fingerprints(url_lip)

  0% [                                                                                ]    0 / 3492100% [................................................................................] 3492 / 3492

In [23]:
# Keep only the XML files that are not already part of the fingerprint set
lipinski_new = [xml_file for xml_file in get_xml() if xml_file not in fingerprint_list]
lipinski_list = lipinski_new

In [24]:
lipinski_list

['ALOGP.xml', 'HBondAcceptorCount.xml', 'HBondDonorCount.xml', 'Weight.xml']

In [25]:
lipinski_name = xml_string_removal(lipinski_list)
lipinski_name

['ALOGP', 'HBondAcceptorCount', 'HBondDonorCount', 'Weight']

In [26]:
# Map each Lipinski descriptor name to its XML descriptor file.
# lipinski_building() looks names up in this notebook-level `lp` dictionary,
# so this cell must run before that function is called.
lp = dict(zip(lipinski_name, lipinski_list))
lp

{'ALOGP': 'ALOGP.xml',
 'HBondAcceptorCount': 'HBondAcceptorCount.xml',
 'HBondDonorCount': 'HBondDonorCount.xml',
 'Weight': 'Weight.xml'}

In [27]:
lipinski_descriptor = lipinski_building(lipinski_name, 'molecule.smi')

In [28]:
print_shape(lipinski_descriptor)

(329, 4)
(329, 5)
(329, 3)
(329, 3)


In [29]:
save_descriptors_as_csv(lipinski_name, lipinski_descriptor)

In [30]:
lipinski_name

['ALOGP', 'HBondAcceptorCount', 'HBondDonorCount', 'Weight']

### Removing the columns that are not needed for the analysis from the Lipinski CSV files

In [31]:
# ALOGP.csv was saved together with its row index (first, unnamed column).
# Reading it back with index_col=0 restores that index instead of turning it
# into a stray 'Unnamed: 0' data column in the rewritten file.
df_ALOGP = pd.read_csv("ALOGP.csv", index_col=0)
df_ALOGP = df_ALOGP.drop(columns=['Name', 'ALogp2', 'AMR'])
df_ALOGP.to_csv('ALOGP.csv')

In [32]:
# HBondAcceptorCount.csv was saved together with its row index (first, unnamed
# column). Reading it back with index_col=0 restores that index instead of
# turning it into a stray 'Unnamed: 0' data column in the rewritten file.
df_HBondAcceptorCount = pd.read_csv("HBondAcceptorCount.csv", index_col=0)
df_HBondAcceptorCount = df_HBondAcceptorCount.drop(columns=['Name', 'nHBAcc', 'nHBAcc2', 'nHBAcc3'])
df_HBondAcceptorCount.to_csv('HBondAcceptorCount.csv')

In [33]:
# HBondDonorCount.csv was saved together with its row index (first, unnamed
# column). Reading it back with index_col=0 restores that index instead of
# turning it into a stray 'Unnamed: 0' data column in the rewritten file.
df_HBondDonorCount = pd.read_csv("HBondDonorCount.csv", index_col=0)
df_HBondDonorCount = df_HBondDonorCount.drop(columns=['Name', 'nHBDon'])
df_HBondDonorCount.to_csv('HBondDonorCount.csv')

In [34]:
# Weight.csv was saved together with its row index (first, unnamed column).
# Reading it back with index_col=0 restores that index instead of turning it
# into a stray 'Unnamed: 0' data column in the rewritten file.
df_Weight = pd.read_csv("Weight.csv", index_col=0)
df_Weight = df_Weight.drop(columns=['Name', 'AMW'])
df_Weight.to_csv('Weight.csv')