In [1]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.preprocessing import FunctionTransformer
from pandas import DataFrame

In [3]:
#preparing molecules form work
molecules = [mol for mol in Chem.SDMolSupplier("logBB.sdf") if mol is not None]

[11:10:31] Can't kekulize mol.  Unkekulized atoms: 13 14 15 16 17 18 19 20 21
[11:10:31] ERROR: Could not sanitize molecule ending on line 3192
[11:10:31] ERROR: Can't kekulize mol.  Unkekulized atoms: 13 14 15 16 17 18 19 20 21
[11:10:31] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 17 18
[11:10:31] ERROR: Could not sanitize molecule ending on line 4541
[11:10:31] ERROR: Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 17 18
[11:10:31] Can't kekulize mol.  Unkekulized atoms: 8 9 10 11 12 13 14 15 16
[11:10:31] ERROR: Could not sanitize molecule ending on line 5314
[11:10:31] ERROR: Can't kekulize mol.  Unkekulized atoms: 8 9 10 11 12 13 14 15 16
[11:10:31] Can't kekulize mol.  Unkekulized atoms: 9 10 11 12 14 15 16 17 18
[11:10:31] ERROR: Could not sanitize molecule ending on line 7252
[11:10:31] ERROR: Can't kekulize mol.  Unkekulized atoms: 9 10 11 12 14 15 16 17 18
[11:10:31] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 14 15
[11:10:31] ERROR

In [5]:
with open("logBB.sdf", "rb") as f:
    suppl = Chem.ForwardSDMolSupplier(f)
    for idx, mol in enumerate(suppl):
        if mol is not None:
            try:
                Chem.SanitizeMol(mol)
                Chem.Kekulize(mol)
            except Exception as e:
                print(f"Error processing molecule at line {idx + 1}: {e}")
                print(Chem.MolToMolBlock(mol))

[11:10:34] Can't kekulize mol.  Unkekulized atoms: 13 14 15 16 17 18 19 20 21
[11:10:34] ERROR: Could not sanitize molecule ending on line 3192
[11:10:34] ERROR: Can't kekulize mol.  Unkekulized atoms: 13 14 15 16 17 18 19 20 21
[11:10:34] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 17 18
[11:10:34] ERROR: Could not sanitize molecule ending on line 4541
[11:10:34] ERROR: Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 17 18
[11:10:34] Can't kekulize mol.  Unkekulized atoms: 8 9 10 11 12 13 14 15 16
[11:10:34] ERROR: Could not sanitize molecule ending on line 5314
[11:10:34] ERROR: Can't kekulize mol.  Unkekulized atoms: 8 9 10 11 12 13 14 15 16
[11:10:34] Can't kekulize mol.  Unkekulized atoms: 9 10 11 12 14 15 16 17 18
[11:10:34] ERROR: Could not sanitize molecule ending on line 7252
[11:10:34] ERROR: Can't kekulize mol.  Unkekulized atoms: 9 10 11 12 14 15 16 17 18
[11:10:34] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 14 15
[11:10:34] ERROR

In [22]:
#lets use Sklearn library for creating a descriptor
descriptors = {'HeavyAtomCount': Descriptors.HeavyAtomCount,
               'NHOHCount': Descriptors.NHOHCount,
               'NOCount': Descriptors.NOCount,
               'NumHAcceptors': Descriptors.NumHAcceptors,
               'NumHDonors': Descriptors.NumHDonors,
               'NumHeteratoms': Descriptors.NumHeteroatoms,
               'NumRotatableBonds': Descriptors.NumRotatableBonds,
               'NumValenceElectrons': Descriptors.NumValenceElectrons,
               'NumAromaticRings': Descriptors.NumAromaticRings,
               'NumAliphaticHeterocycles': Descriptors.NumAliphaticHeterocycles,
               'RingCount': Descriptors.RingCount,
               'MW': Descriptors.RingCount,
               'LogP': Descriptors.MolLogP,
               'MR': Descriptors.MolMR,
               'TPSA': Descriptors.TPSA}

def mol_dsc_calc(mols):
    return DataFrame({
        k: f(m) for k, f in descriptors.items()} for m in mols)

# function is wrapped into sklearn transformer for use in 'conveyor modeling'
descriptors_transformer = FunctionTransformer(mol_dsc_calc)

In [65]:
X = descriptors_transformer.transform(molecules)

In [67]:
X.head()

Unnamed: 0,HeavyAtomCount,NHOHCount,NOCount,NumHAcceptors,NumHDonors,NumHeteratoms,NumRotatableBonds,NumValenceElectrons,NumAromaticRings,NumAliphaticHeterocycles,RingCount,MW,LogP,MR,TPSA
0,5,0,0,0,0,3,0,32,0,0,0,0,2.3765,25.964,0.0
1,5,0,0,0,0,3,1,32,0,0,0,0,2.0289,26.214,0.0
2,5,0,0,0,0,3,0,30,0,0,0,0,1.9631,16.152,0.0
3,5,1,1,1,1,1,2,32,0,0,0,0,0.7788,21.9938,20.23
4,5,0,1,1,0,1,1,30,0,0,0,0,0.9854,20.972,17.07


In [69]:
X.to_csv('X_RDKitDescriptors.csv')

##### **Generation of molecular fingerprints** (like Morgan fingerprints)

In [None]:
#There is no point to use molecular fingerprints together with descriptors dicussed above.

In [33]:
import rdkit.Chem.AllChem as AllChem
from numpy import zeros
from rdkit import DataStructs

In [71]:
def calc_morgan(mols):
    for_df = []
    for m in mols:
        arr = zeros((1,), dtype=int)
        DataStructs.ConvertToNumpyArray(AllChem.GetMorganFingerprintAsBitVect(m,2), arr)
        for_df.append(arr)
    return DataFrame(for_df)

morgan_transformer = FunctionTransformer(calc_morgan)
X = morgan_transformer.transform(molecules)

In [72]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [75]:
X.to_csv('X_MorganFingerprint.csv')

##### **Generation of fragmentary descriptors like ISIDA Fragmentor**

In [13]:
#No need of wrapping these descriptors for their compatibility with Sklearn
from CGRtools.files import SDFRead
from CIMtools.preprocessing import Fragmentor

In [15]:
data = SDFRead("logBB.sdf").read()

In [21]:
fragmentor = Fragmentor(fragment_type=9, max_length=4)
X = fragmentor.fit_transform(data)

In [25]:
X.head()

Unnamed: 0,"(C-C),(C-H),(C-H),(C-H),xC","(C-C-Cl),(C-C-Cl),(C-C-Cl),xC","(C-C),(C-Cl),(C-Cl),(C-Cl),xC","(C-C-H),(C-C-H),(C-C-H),xC","(Cl-C),xCl","(Cl-C-C),(Cl-C-Cl),(Cl-C-Cl),xCl","(Cl-C-C-H),(Cl-C-C-H),(Cl-C-C-H),xCl","(H-C),xH","(H-C-C),(H-C-H),(H-C-H),xH","(H-C-C-Cl),(H-C-C-Cl),(H-C-C-Cl),xH",...,"(C-C*C*C),(C-C*C*N),(C-C*C-H),(C-C*C-H),(C-N-C-C),(C-N-C-C),(C-N-C-H),(C-N-C-H),(C-N-C-H),(C-N-C-H),xC","(C*C*C),(C*C*N),(C*C-H),(C*C-H),(C-C-H),(C-C-H),(C-C-N),xC","(C*C*C*C),(C*C*C-H),(C*C*N*C),(C-C-N-C),(C-C-N-C),xC","(C*C*C*N),(C*C*C*N),(C*C*C-H),(C*C*C-H),(C*C-C-H),(C*C-C-H),(C*C-C-N),xC","(C*C*C*C),(C*C*C-H),(C*C-C-H),(C*C-C-H),(C*C-C-N),(C*N*C*C),(C*N*C-H),xC","(H-C-C-H),(H-C-C-H),(H-C-C-N),(H-C-C-N),(H-C-C=O),(H-C-N-C),(H-C-N-C),xH","(H-C-C-C),(H-C-C-C),(H-C-C-H),(H-C-C-H),(H-C-C-H),(H-C-C-H),(H-C-C-N),(H-C-C=O),xH","(H-C-C*C),(H-C-C*C),(H-C-C-C),(H-C-C-H),(H-C-C-O),(H-C-N-C),(H-C-N-H),xH","(H-C-C*C),(H-C-C*C),(H-C-C-C),(H-C-C-H),(H-C-C-O),xH","(H-C-C-C),(H-C-C-C),(H-C-C-H),(H-C-C-H),(H-C-C-H),(H-C-C-N),(H-C-O-H),xH"
0,1.0,1.0,1.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,2.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [27]:
X.to_csv('X_ISIDAfragmentor.csv')