# Dataset descriptors HDAC1

This notebook creates a dataset of physicochemical descriptors, based on SMILE, to be used in classification tasks with ML models.

In [None]:
# It was necessary to install these libraries.
#!pip install SDF
#!pip install rdkit

Collecting SDF
  Downloading sdf-0.3.7-py3-none-any.whl.metadata (2.3 kB)
Collecting attrs>=25.3.0 (from SDF)
  Downloading attrs-25.4.0-py3-none-any.whl.metadata (10 kB)
Collecting h5py>=3.13.0 (from SDF)
  Downloading h5py-3.15.1-cp312-cp312-win_amd64.whl.metadata (3.1 kB)
Collecting xlrd>=2.0.1 (from SDF)
  Downloading xlrd-2.0.2-py2.py3-none-any.whl.metadata (3.5 kB)
Downloading sdf-0.3.7-py3-none-any.whl (342 kB)
Downloading attrs-25.4.0-py3-none-any.whl (67 kB)
Downloading h5py-3.15.1-cp312-cp312-win_amd64.whl (2.9 MB)
   ---------------------------------------- 0.0/2.9 MB ? eta -:--:--
   ---------------------------------------- 2.9/2.9 MB 18.7 MB/s eta 0:00:00
Downloading xlrd-2.0.2-py2.py3-none-any.whl (96 kB)
Installing collected packages: xlrd, h5py, attrs, SDF

   ---------- ----------------------------- 1/4 [h5py]
   ---------- ----------------------------- 1/4 [h5py]
   ---------------------------------------- 4/4 [SDF]

Successfully installed SDF-0.3.7 attrs-25.4.0 h5py-

In [7]:
import numpy as np
import pandas as pd
import os
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import PandasTools
from createdescritores import addNewDescriptorsToDF

In [None]:
name_folder = 'MUBD_sdf/' 
dir_folder = "C:/Users/Alessandra/Documents/Repositorio_Mestrado/Projeto_mestrado/descritores/"
project_folder = os.path.join(dir_folder, name_folder)

In [19]:
print(project_folder)

C:/Users/Alessandra/Documents/Repositorio_Mestrado/Projeto_mestrado/descritores/MUBD_sdf/


In [22]:
path_ligand = os.path.join(project_folder, f"uploaded_new_HDAC01_ligands.sdf")
path_decoys = os.path.join(project_folder, f"upload_HDAC1_final_decoys.sdf")

### SDF

if the file does not open
 1. Inspect the SDF file for potential issues:
   - Open the SDF file in a text editor.
   - Look for any properties that have unusual values, especially those containing array-like structures.
   - If possible, try to simplify the properties in the SDF file to basic data types.
   - If issues are found, correct them in the SDF file and then try loading again.
 2. Load SDF with low-level RDKit function and inspect properties
   - Check if any properties have a NumPy array or an unexpected type as the value.
3. Modify the LoadSDF call:
  - If specific properties are problematic, consider removing them during loading using the 'removeprops' argument in PandasTools.LoadSDF.
  - Specify the column for molecule objects using 'molColName'.


In [None]:
#- Load the SDF with Chem.SDMolSupplier to inspect the properties of the molecules
""" suppl = Chem.SDMolSupplier(path_ligand)
for mol in suppl:
    if mol is not None:
        print(mol.GetPropNames())  # Print property names for each molecule
        for prop in mol.GetPropNames():
            print(f"{prop}: {mol.GetProp(prop)}, type: {type(mol.GetProp(prop))}") """


# Ligands

In [23]:
# Open the file
df_1 = PandasTools.LoadSDF(path_ligand, embedProps=True, molColName='RMol', smilesName='smiles')

[14:55:21] MRV_IMPLICIT_H SGroup on atom without aromatic bonds, 9, ignored.
[14:55:21] MRV_IMPLICIT_H SGroup on atom without aromatic bonds, 14, ignored.
[14:55:21] MRV_IMPLICIT_H SGroup on atom without aromatic bonds, 6, ignored.
[14:55:21] MRV_IMPLICIT_H SGroup on atom without aromatic bonds, 28, ignored.
[14:55:21] MRV_IMPLICIT_H SGroup on atom without aromatic bonds, 8, ignored.
[14:55:21] MRV_IMPLICIT_H SGroup on atom without aromatic bonds, 32, ignored.
[14:55:21] MRV_IMPLICIT_H SGroup on atom without aromatic bonds, 19, ignored.
[14:55:21] MRV_IMPLICIT_H SGroup on atom without aromatic bonds, 6, ignored.
[14:55:21] MRV_IMPLICIT_H SGroup on atom without aromatic bonds, 21, ignored.
[14:55:21] MRV_IMPLICIT_H SGroup on atom without aromatic bonds, 22, ignored.
[14:55:21] MRV_IMPLICIT_H SGroup on atom without aromatic bonds, 6, ignored.
[14:55:21] MRV_IMPLICIT_H SGroup on atom without aromatic bonds, 25, ignored.
[14:55:21] MRV_IMPLICIT_H SGroup on atom without aromatic bonds, 10, 

In [24]:
df_1.head()

Unnamed: 0,Name,ALogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,FormalCharge,ID,smiles,RMol
0,CHEMBL2000089,2.467,588.69389,6,3,11,0,CHEMBL2000089,O=C1N[C@@H](Cc2ccccc2)C(=O)N2CCC[C@@H]2C(=O)N[...,<rdkit.Chem.rdchem.Mol object at 0x00000246FA6...
1,CHEMBL99,2.772,302.36818,4,2,6,0,CHEMBL99,CC(/C=C/C(=O)NO)=C\[C@@H](C)C(=O)c1ccc(N(C)C)cc1,<rdkit.Chem.rdchem.Mol object at 0x00000246FA6...
2,CHEMBL2047701,1.999,411.08964,5,3,5,-1,CHEMBL2047701,O=C(NCCS)/C(Cc1cc(Br)c(O)c(Br)c1)=N/[O-],<rdkit.Chem.rdchem.Mol object at 0x00000246FA6...
3,CHEMBL343448,1.689,540.69583,8,4,2,0,CHEMBL343448,C/C=C1\NC(=O)[C@H]2CSSCC/C=C/[C@H](CC(=O)N[C@H...,<rdkit.Chem.rdchem.Mol object at 0x00000246FA6...
4,CHEMBL360194,0.817,515.60188,6,5,8,0,CHEMBL360194,CC1(C)NC(=O)[C@H](CCCCCC(=O)NO)NC(=O)[C@H]2CCC...,<rdkit.Chem.rdchem.Mol object at 0x00000246FA6...


In [25]:
df_1.columns

Index(['Name', 'ALogP', 'Molecular_Weight', 'Num_H_Acceptors', 'Num_H_Donors',
       'Num_RotatableBonds', 'FormalCharge', 'ID', 'smiles', 'RMol'],
      dtype='object')

In [26]:
# Select columns
new_df1 = df_1[['ALogP', 'Molecular_Weight', 'Num_H_Acceptors', 'Num_H_Donors',
       'Num_RotatableBonds', 'FormalCharge', 'ID', 'smiles']].copy()

In [27]:
new_df1['category'] = 'ligante'

In [28]:
new_df1.shape

(180, 9)

# Decoys

In [None]:
# Open the file
df_1d = PandasTools.LoadSDF(path_decoys, embedProps=True, molColName='RMol', smilesName='smiles')

In [30]:
df_1d.head()

Unnamed: 0,Name,simp,simsdiff,class,ALogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,FormalCharge,ID,smiles,RMol
0,ZINC20439165,0.909381037109605,0.0363164628442907,1,2.316,481.54084,6,3,11,0,ZINC20439165,CCn1cc(/C=C(\NC(=O)c2cc(OC)c(OC)c(OC)c2)C(=O)N...,<rdkit.Chem.rdchem.Mol object at 0x00000246FA5...
1,ZINC12654965,0.907067935746782,0.0436879984324576,1,0.978,529.62845,6,3,11,0,ZINC12654965,CC(C)C[C@@H](NC(=O)[C@@H]1CCCN1C(=O)C1CCN(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x00000246FA5...
2,ZINC08995062,0.910130168187163,0.043776168867969,1,1.957,490.5509,6,3,12,0,ZINC08995062,COCCNC(=O)[C@@H](c1ccc(O)cc1)N(Cc1ccccc1)C(=O)...,<rdkit.Chem.rdchem.Mol object at 0x00000246FA5...
3,ZINC12393277,0.93856853652448,0.0440672753080888,1,2.71,522.59275,6,3,12,0,ZINC12393277,CC(C)C[C@H](NC(=O)[C@H]1CCC(=O)N1C(=O)OCc1cccc...,<rdkit.Chem.rdchem.Mol object at 0x00000246FA5...
4,ZINC12393275,0.93856853652448,0.0440672753080888,1,2.71,522.59275,6,3,12,0,ZINC12393275,CC(C)C[C@@H](NC(=O)[C@H]1CCC(=O)N1C(=O)OCc1ccc...,<rdkit.Chem.rdchem.Mol object at 0x00000246FA5...


In [None]:
# Function to get the first row of each group.
def getFirstRow(group):
    return group.iloc[0]

In [None]:
# Group by class and take the first line.
df_1dclass = df_1d.groupby('class').apply(getFirstRow)
df_1dclass.shape

  df_1dclass = df_1d.groupby('class').apply(get_first_row)


(180, 13)

In [None]:
# Select coluns
new_df1dclass = df_1dclass[['ALogP', 'Molecular_Weight', 'Num_H_Acceptors', 'Num_H_Donors',
       'Num_RotatableBonds', 'FormalCharge', 'ID', 'smiles']].copy()

In [69]:
new_df1dclass['category'] = 'decoys'

In [None]:
# Concatenates ligands and decoys
df_H1 = pd.concat([new_df1dclass, new_df1]) 

In [72]:
df_H1.shape

(360, 9)

In [104]:
df_H1['category'].value_counts()

category
decoys     180
ligante    180
Name: count, dtype: int64

# Descriptores

Create a dataframe with new descriptors from the smiles.

In [105]:
# List of smiles
smiles_list = df_H1['smiles'].tolist()
len(smiles_list)

360

In [106]:
# Complete list of descriptors that can be calculated directly from SMILES
desc_names = [
    'HeavyAtomCount', 'MolLogP', 'TPSA', 'MolMR', 'FractionCSP3',
       'RingCount', 'NumAromaticRings', 'NumAliphaticRings',
       'NumSaturatedRings', 'Kappa1', 'Kappa2', 'Kappa3', 'BalabanJ'
]

In [None]:
# Create descriptor generator
desc_funcs = {name: getattr(Descriptors, name) for name in desc_names if hasattr(Descriptors, name)}

In [77]:
desc_funcs

{'HeavyAtomCount': <function rdkit.Chem.Lipinski.HeavyAtomCount(mol)>,
 'MolLogP': <function rdkit.Chem.Crippen.<lambda>(*x, **y)>,
 'TPSA': <function rdkit.Chem.MolSurf.<lambda>(*x, **y)>,
 'MolMR': <function rdkit.Chem.Crippen.<lambda>(*x, **y)>,
 'FractionCSP3': <function rdkit.Chem.Lipinski.<lambda>(x, y=<Boost.Python.function object at 0x00000246F8730DA0>)>,
 'RingCount': <function rdkit.Chem.Lipinski.<lambda>(x)>,
 'NumAromaticRings': <function rdkit.Chem.Lipinski.<lambda>(x, y=<Boost.Python.function object at 0x00000246F8732100>)>,
 'NumAliphaticRings': <function rdkit.Chem.Lipinski.<lambda>(x, y=<Boost.Python.function object at 0x00000246F8731570>)>,
 'NumSaturatedRings': <function rdkit.Chem.Lipinski.<lambda>(x, y=<Boost.Python.function object at 0x00000246F8731AC0>)>,
 'Kappa1': <function rdkit.Chem.GraphDescriptors.<lambda>(x)>,
 'Kappa2': <function rdkit.Chem.GraphDescriptors.<lambda>(x)>,
 'Kappa3': <function rdkit.Chem.GraphDescriptors.<lambda>(x)>,
 'BalabanJ': <function

In [107]:
# Calculate descriptors
data = []
for smiles in smiles_list:
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        desc_values = [func(mol) for func in desc_funcs.values()]
        data.append([smiles] + desc_values)

# Create DataFrame
columns = ["smiles"] + list(desc_funcs.keys())
df_descriptors1 = pd.DataFrame(data, columns=columns)

In [108]:
df_descriptors1.shape

(360, 14)

In [109]:
df_descriptors1.sample(7)

Unnamed: 0,smiles,HeavyAtomCount,MolLogP,TPSA,MolMR,FractionCSP3,RingCount,NumAromaticRings,NumAliphaticRings,NumSaturatedRings,Kappa1,Kappa2,Kappa3,BalabanJ
139,O=C1COc2cc(O)ccc2N1,12,0.7231,58.56,42.2315,0.125,2,1,1,0,6.96721,2.405928,1.070152,2.42724
20,CC[C@@H](CCNS(=O)(=O)c1ccc(NC(=O)OC)cc1)n1nc(C...,30,3.70832,102.32,103.5692,0.444444,2,2,0,0,23.676731,9.474137,5.953225,2.016488
226,CCC(=O)NCc1ccc(C(=O)Nc2cc(-c3ccccc3)ccc2N)cc1,28,4.2143,84.22,112.8233,0.130435,3,3,0,0,19.169628,8.861538,4.764744,1.712751
228,CN(C)c1ccc(-c2nnc(Nc3ncc(C(=O)NO)cn3)s2)cc1,25,1.9237,116.16,94.2394,0.133333,3,3,0,0,16.73749,7.195564,3.869646,1.717816
304,CCNC(=S)SCC(=O)c1ccc(NC(C)=O)cc1,19,2.4553,58.2,83.8089,0.307692,1,1,0,0,15.656818,7.909343,5.847141,2.529675
21,CCNC(=O)Nc1cccc(NC(=O)c2ccccc2C(=O)c2ccc(C)c(C...,31,4.92814,87.3,122.8811,0.16,3,3,0,0,21.771094,9.714049,5.362842,1.826295
125,C[C@]1(CO)[C@H]2Cc3sc(Nc4cccc(O)c4)nc3[C@@H](C...,36,4.3073,127.85,137.1508,0.481481,5,3,2,1,24.798503,9.53407,4.398126,1.477862


In [None]:
# Add new descriptors to df_descriptors1
df_descriptors2 = addNewDescriptorsToDF(df_descriptors1, smiles_column='smiles')

In [112]:
df_descriptors2.shape

(360, 28)

In [None]:
# Merge df_H1 and df_descriptors2 to create the new dataframe
new_dfH1 = pd.merge(df_descriptors2, df_H1, how='left', on='smiles') 

In [114]:
new_dfH1.shape

(360, 36)

In [115]:
new_dfH1.sample(7)

Unnamed: 0,smiles,HeavyAtomCount,MolLogP,TPSA,MolMR,FractionCSP3,RingCount,NumAromaticRings,NumAliphaticRings,NumSaturatedRings,...,MolComplexity,ShannonEntropy,ALogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,FormalCharge,ID,category
294,Nc1ccccc1NC(=O)c1ccc(CNc2nc3cc(Cl)ccc3o2)cc1,28,4.9278,93.18,111.5953,0.047619,4,4,0,0,...,55,1.155974,3.519,392.83828,4,3,5,0,CHEMBL254111,ligante
318,COc1ccccc1NC(=O)NCc1ccc(C(=O)NO)cc1,23,2.1359,99.69,84.5471,0.125,2,2,0,0,...,45,1.186393,1.528,315.32388,4,4,5,0,CHEMBL2179247,ligante
158,COC(=O)c1cccc(C(=O)NC(=S)NNC(=O)c2ccc(NC(=O)c3...,35,2.98322,125.63,134.0218,0.08,3,3,0,0,...,69,1.251967,3.799,490.53098,6,4,9,0,ZINC10294084,decoys
267,COc1ccc(-c2ccnc(NCc3ccc(C(=O)Nc4ccccc4N)cc3)n2...,34,4.6074,111.39,133.1483,0.115385,4,4,0,0,...,67,1.011699,3.444,455.5084,7,3,8,0,CHEMBL271862,ligante
297,CC(C)C[NH2+]Cc1ccc(C(=O)Nc2cc(-c3ccccc3)ccc2[O...,28,3.3989,68.77,111.0536,0.208333,3,3,0,0,...,55,0.73453,2.686,374.47544,2,2,7,0,CHEMBL472670,ligante
7,[H]/N=C(Nc1nc(C)c2cc(C)c(C)cc2n1)/[NH+]=C(\N)[...,33,1.40173,145.02,126.151,0.25,3,3,0,0,...,67,1.254999,3.835,447.50958,6,4,9,1,ZINC63328775,decoys
270,CCN1CCN(C(=O)NCc2ccc(C(=O)Nc3cc(-c4cccs4)ccc3N...,35,3.1499,124.84,134.7653,0.2,4,3,1,1,...,69,1.251967,2.18,491.5621,5,3,6,0,CHEMBL1957458,ligante


In [None]:
# Checks for null values ​​and in which column.
new_dfH1.isnull().value_counts()

smiles  HeavyAtomCount  MolLogP  TPSA   MolMR  FractionCSP3  RingCount  NumAromaticRings  NumAliphaticRings  NumSaturatedRings  Kappa1  Kappa2  Kappa3  BalabanJ  BenzeneRingRatio  NumSP2Atoms  NumSP3Atoms  NumTerminalN  NumTerminalO  NumTerminalS  MolecularRigidity  ChemicalFlexibilityIndex  NumHalogenBonds  BranchingIndex  NumConjugatedBonds  SaturationRatio  MolComplexity  ShannonEntropy  ALogP  Molecular_Weight  Num_H_Acceptors  Num_H_Donors  Num_RotatableBonds  FormalCharge  ID     category
False   False           False    False  False  False         False      False             False              False              False   False   False   False     False             False        False        False         False         False         False              False                     False            False           False               False            False          False           False  False             False            False         False               False         False  False    

In [117]:
new_dfH1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Data columns (total 36 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   smiles                    360 non-null    object 
 1   HeavyAtomCount            360 non-null    int64  
 2   MolLogP                   360 non-null    float64
 3   TPSA                      360 non-null    float64
 4   MolMR                     360 non-null    float64
 5   FractionCSP3              360 non-null    float64
 6   RingCount                 360 non-null    int64  
 7   NumAromaticRings          360 non-null    int64  
 8   NumAliphaticRings         360 non-null    int64  
 9   NumSaturatedRings         360 non-null    int64  
 10  Kappa1                    360 non-null    float64
 11  Kappa2                    360 non-null    float64
 12  Kappa3                    360 non-null    float64
 13  BalabanJ                  360 non-null    float64
 14  BenzeneRin

In [None]:
# Save the new dataframe
output_path = os.path.join(dir_folder, f"descriptores_HDAC1.csv")
new_df1.to_csv(output_path, index=False)