# **1. Install Padelpy**

In [None]:
! pip install padelpy

Collecting padelpy
  Downloading padelpy-0.1.14-py2.py3-none-any.whl (20.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: padelpy
Successfully installed padelpy-0.1.14


In [None]:
# Download fingerprint XML files
! wget https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
! unzip fingerprints_xml.zip

--2023-12-06 07:20:28--  https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
Resolving github.com (github.com)... 20.27.177.113
Connecting to github.com (github.com)|20.27.177.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip [following]
--2023-12-06 07:20:29--  https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10871 (11K) [application/zip]
Saving to: ‘fingerprints_xml.zip’


2023-12-06 07:20:29 (12.5 MB/s) - ‘fingerprints_xml.zip’ saved [10871/10871]

Archive:  fingerprints_xml.zip
  inflating: AtomPairs2DFingerprintCount.xml  
  inflating: AtomPairs2DF

In [None]:
# Collect every XML file in the working directory, in alphabetical order
import glob
xml_files = sorted(glob.glob("*.xml"))
xml_files

['AtomPairs2DFingerprintCount.xml',
 'AtomPairs2DFingerprinter.xml',
 'EStateFingerprinter.xml',
 'ExtendedFingerprinter.xml',
 'Fingerprinter.xml',
 'GraphOnlyFingerprinter.xml',
 'KlekotaRothFingerprintCount.xml',
 'KlekotaRothFingerprinter.xml',
 'MACCSFingerprinter.xml',
 'PubchemFingerprinter.xml',
 'SubstructureFingerprintCount.xml',
 'SubstructureFingerprinter.xml']

In [None]:
# Short fingerprint names, listed in the order that pairs with the
# alphabetically sorted XML file names
FP_list = [
    'AtomPairs2DCount',
    'AtomPairs2D',
    'EState',
    'CDKextended',
    'CDK',
    'CDKgraphonly',
    'KlekotaRothCount',
    'KlekotaRoth',
    'MACCS',
    'PubChem',
    'SubstructureCount',
    'Substructure',
]

In [None]:
# Map each short fingerprint name to its descriptor XML file.
# The pairing is purely positional, so fail loudly if the two lists differ in
# length (e.g. an unrelated *.xml file in the working directory) instead of
# letting zip() silently drop the extra entries and mis-pair the rest.
if len(FP_list) != len(xml_files):
    raise ValueError(
        f'Expected {len(FP_list)} fingerprint XML files, found {len(xml_files)}'
    )
fp = dict(zip(FP_list, xml_files))
fp

{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'PubChem': 'PubchemFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml',
 'Substructure': 'SubstructureFingerprinter.xml'}

In [None]:
# Example lookup: the descriptor XML file registered for the PubChem fingerprint
fp["PubChem"]

'PubchemFingerprinter.xml'

# **2. Calculating Molecular Fingerprints using padelpy**

## **2.1 Load dataset**

In [3]:
# Read the NS3 data CSV directly from its raw GitHub URL
import pandas as pd
url = 'https://raw.githubusercontent.com/YuanColab/Dengue-NS3-active-small-molecules-classifier/main/Data/NS3_Data_processing.csv'
df = pd.read_csv(filepath_or_buffer=url)
df

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class
0,CHEMBL5188858,Cc1cc(O)c(S(=O)(=O)O)cc1Cc1cc(S(=O)(=O)O)c(O)c...,0.48,active
1,CHEMBL5075993,CCOc1ccc(S(=O)(=O)n2nc(OC(=O)c3ccc(-c4ccccc4)c...,6.00,active
2,CHEMBL5075008,Nc1ccc(C(=O)Oc2cc(N)n(S(=O)(=O)c3ccc(Oc4ccccc4...,9.00,active
3,CHEMBL5092085,Nc1cc(OC(=O)c2ccc(-c3ccccc3)cc2)nn1S(=O)(=O)c1...,12.00,active
4,CHEMBL5081752,Nc1ccc(C(=O)Oc2cc(N)n(S(=O)(=O)c3ccc(Br)cc3)n2...,74.00,active
...,...,...,...,...
586,CHEMBL4871225,COc1cccc(Cn2c3ccc(-c4cccc(-c5noc(=O)[nH]5)c4)c...,121000.00,inactive
587,CHEMBL4584771,O=C1c2ccccc2C(=O)N1c1ccc(S(=O)(=O)Nc2cccc3cccc...,121900.00,inactive
588,CHEMBL5199579,CCCCC1C(C(=O)OCC)C(c2ccccc2)CCN1C(=O)OC(C)(C)C,213700.00,inactive
589,CHEMBL4456898,CCCCC1C(C(=O)N(C)OC)C(c2ccccc2)CCN1C(=O)OC(C)(C)C,257400.00,inactive


## **2.2 Prepare data subset as input to PaDEL**

PaDEL reads its input molecules from a `.smi` file. The cell below writes the SMILES strings and their ChEMBL IDs to `molecule.smi` as two tab-separated columns, without a header or index, so that PaDEL can read it.

In [None]:
# PaDEL input: SMILES in the first column, molecule ID in the second,
# written tab-separated without header or index
df2 = df[['canonical_smiles', 'molecule_chembl_id']].copy()
df2.to_csv('molecule.smi', index=False, header=False, sep='\t')
df2

Unnamed: 0,canonical_smiles,molecule_chembl_id
0,Cc1cc(O)c(S(=O)(=O)O)cc1Cc1cc(S(=O)(=O)O)c(O)c...,CHEMBL5188858
1,CCOc1ccc(S(=O)(=O)n2nc(OC(=O)c3ccc(-c4ccccc4)c...,CHEMBL5075993
2,Nc1ccc(C(=O)Oc2cc(N)n(S(=O)(=O)c3ccc(Oc4ccccc4...,CHEMBL5075008
3,Nc1cc(OC(=O)c2ccc(-c3ccccc3)cc2)nn1S(=O)(=O)c1...,CHEMBL5092085
4,Nc1ccc(C(=O)Oc2cc(N)n(S(=O)(=O)c3ccc(Br)cc3)n2...,CHEMBL5081752
...,...,...
586,COc1cccc(Cn2c3ccc(-c4cccc(-c5noc(=O)[nH]5)c4)c...,CHEMBL4871225
587,O=C1c2ccccc2C(=O)N1c1ccc(S(=O)(=O)Nc2cccc3cccc...,CHEMBL4584771
588,CCCCC1C(C(=O)OCC)C(c2ccccc2)CCN1C(=O)OC(C)(C)C,CHEMBL5199579
589,CCCCC1C(C(=O)N(C)OC)C(c2ccccc2)CCN1C(=O)OC(C)(C)C,CHEMBL4456898


## **2.3 Calculate all molecular fingerprints**

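The nine fingerprints used in this study are binary: every bit position contains only 0 or 1. The count-based variants (AtomPairs2DCount, KlekotaRothCount, SubstructureCount) are not included.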

In [None]:
import pandas as pd
from padelpy import padeldescriptor

# Fingerprint types to compute: short name -> PaDEL descriptor XML file
fingerprint_types = {
    'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
    'EState': 'EStateFingerprinter.xml',
    'CDKextended': 'ExtendedFingerprinter.xml',
    'CDK': 'Fingerprinter.xml',
    'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
    'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
    'MACCS': 'MACCSFingerprinter.xml',
    'PubChem': 'PubchemFingerprinter.xml',
    'Substructure': 'SubstructureFingerprinter.xml'
}

# Merged fingerprint table; stays None until the first fingerprint file is read
all_descriptors = None

# Loop through each fingerprint type
for fingerprint, descriptor_file in fingerprint_types.items():
    # Output file name
    fingerprint_output_file = f'{fingerprint}.csv'

    # Generate fingerprints using padeldescriptor
    padeldescriptor(
      mol_dir='molecule.smi',  # Input molecule file written in section 2.2
      d_file=fingerprint_output_file,  # Destination file for fingerprints
      descriptortypes=descriptor_file,  # Type of descriptor to use
      detectaromaticity=True,  # Option to detect aromaticity
      standardizenitro=True,  # Standardize nitro groups
      standardizetautomers=True,  # Standardize tautomers
      threads=2,  # Number of threads to use
      removesalt=True,  # Remove salts from the molecule
      log=True,  # Enable logging
      fingerprints=True  # Generate fingerprints
    )

    # Read the generated fingerprint data
    descriptors = pd.read_csv(fingerprint_output_file)

    # Join on the molecule name; validate= raises if a name is duplicated,
    # which would otherwise silently multiply rows in the merged table
    if all_descriptors is None:
        all_descriptors = descriptors
    else:
        all_descriptors = all_descriptors.merge(
            descriptors, on='Name', validate='one_to_one'
        )

# Encode the class label as 1/0. The identity entries (1 -> 1, 0 -> 0) keep this
# cell re-runnable: without them a second execution would map the already
# encoded labels to NaN.
class_codes = {'active': 1, 'inactive': 0, 1: 1, 0: 0}
df['class'] = df['class'].map(class_codes)

# Attach labels by molecule ID rather than by row position, so the result does
# not depend on PaDEL writing its rows in the same order as the input file.
labels = df[['molecule_chembl_id', 'class']].rename(
    columns={'molecule_chembl_id': 'Name'}
)
combined_data = all_descriptors.merge(
    labels, on='Name', how='left', validate='one_to_one'
)

# Save the merged data to a new CSV file
combined_data.to_csv('combined_fingerprints.csv', index=False)

combined_data

Unnamed: 0,Name,AD2D1,AD2D2,AD2D3,AD2D4,AD2D5,AD2D6,AD2D7,AD2D8,AD2D9,...,SubFP299,SubFP300,SubFP301,SubFP302,SubFP303,SubFP304,SubFP305,SubFP306,SubFP307,class
0,CHEMBL5188858,1,0,1,1,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,1
1,CHEMBL5075993,1,1,1,1,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,1
2,CHEMBL5075008,1,1,1,1,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,1
3,CHEMBL5092085,1,1,1,1,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,1
4,CHEMBL5081752,1,1,1,1,0,0,0,1,0,...,0,1,1,1,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586,CHEMBL4871225,1,1,1,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,0
587,CHEMBL4584771,1,1,1,1,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,0
588,CHEMBL5199579,1,1,1,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,0
589,CHEMBL4456898,1,1,1,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,0
