In [1]:
from rdkit import Chem
from rdkit.Chem import (
    Descriptors,
    Lipinski,
    Crippen,
    rdMolDescriptors,
    EState,
    QED,
    Fragments,
)


In [2]:
def compute_rdkit_descriptors(mol: Chem.Mol) -> dict:
    """
    Build a fixed panel of RDKit descriptors for a single molecule.

    Input:
        mol : rdkit.Chem.Mol
    Output:
        dict keyed by descriptor name; each value is whatever the matching
        RDKit function returns for ``mol``. Keys are inserted in a fixed
        order (EState, global, charges, fingerprint density, information
        indices, the VSA families, TPSA/LogP, ring counts, fr_* fragments).
    Raises:
        ValueError if ``mol`` is None.
    """
    # Reject a missing molecule before any RDKit call is attempted.
    if mol is None:
        raise ValueError("Mol is None")

    def evaluate(module, names):
        # Look each name up on ``module`` and apply it to the molecule, in order.
        return {name: getattr(module, name)(mol) for name in names}

    def numbered(prefix, indices):
        # Expand e.g. ("SMR_VSA", [1, 2]) into ["SMR_VSA1", "SMR_VSA2"].
        return [f"{prefix}{index}" for index in indices]

    result = {}

    # ----- EState indices -----
    result.update(
        evaluate(
            Descriptors,
            ("MaxAbsEStateIndex", "MinAbsEStateIndex", "MinEStateIndex"),
        )
    )

    # ----- Global properties -----
    result["qed"] = QED.qed(mol)

    # ----- Remaining global values, partial charges, fingerprint density
    #       and information indices: all plain Descriptors lookups -----
    result.update(
        evaluate(
            Descriptors,
            (
                "SPS",
                "MolWt",
                "NumRadicalElectrons",
                "MaxPartialCharge",
                "MinPartialCharge",
                "FpDensityMorgan1",
                "AvgIpc",
                "Ipc",
                "BalabanJ",
            ),
        )
    )

    # ----- PEOE_VSA 1..14: only this family skips names that the
    #       Descriptors module does not expose -----
    peoe_names = [
        name
        for name in numbered("PEOE_VSA", range(1, 15))
        if hasattr(Descriptors, name)
    ]
    result.update(evaluate(Descriptors, peoe_names))

    # ----- SMR_VSA / SlogP_VSA: selected bins only -----
    result.update(
        evaluate(Descriptors, numbered("SMR_VSA", (1, 2, 3, 4, 5, 6, 7, 9, 10)))
    )
    result.update(
        evaluate(Descriptors, numbered("SlogP_VSA", (1, 2, 3, 4, 7, 8, 10, 11, 12)))
    )

    # ----- TPSA / LogP -----
    result["TPSA"] = rdMolDescriptors.CalcTPSA(mol)
    result["MolLogP"] = Crippen.MolLogP(mol)

    # ----- EState_VSA / VSA_EState: selected bins only -----
    result.update(
        evaluate(Descriptors, numbered("EState_VSA", (1, 2, 3, 4, 5, 6, 7, 8, 9, 11)))
    )
    result.update(
        evaluate(Descriptors, numbered("VSA_EState", (2, 3, 4, 5, 7, 8, 9)))
    )

    # ----- Ring & topology -----
    result["FractionCSP3"] = rdMolDescriptors.CalcFractionCSP3(mol)
    result.update(
        evaluate(
            Lipinski,
            (
                "NHOHCount",
                "NumAliphaticCarbocycles",
                "NumAliphaticHeterocycles",
                "NumAromaticCarbocycles",
                "NumAromaticHeterocycles",
                "NumAromaticRings",
                "RingCount",
            ),
        )
    )

    # ----- Functional groups (fr_*): each name doubles as the dict key
    #       and the attribute looked up on the Fragments module -----
    fragment_names = (
        "fr_Al_COO", "fr_Al_OH", "fr_ArN", "fr_Ar_COO",
        "fr_Ar_NH", "fr_Ar_OH", "fr_C_O", "fr_C_S",
        "fr_HOCCN", "fr_Imine", "fr_NH1", "fr_NH2",
        "fr_N_O", "fr_Ndealkylation1", "fr_Ndealkylation2", "fr_SH",
        "fr_alkyl_carbamate", "fr_allylic_oxid", "fr_amidine", "fr_aniline",
        "fr_aryl_methyl", "fr_azo", "fr_barbitur", "fr_benzodiazepine",
        "fr_bicyclic", "fr_dihydropyridine", "fr_epoxide", "fr_ester",
        "fr_ether", "fr_furan", "fr_guanido", "fr_hdrzine",
        "fr_hdrzone", "fr_imidazole", "fr_imide", "fr_ketone",
        "fr_lactam", "fr_lactone", "fr_methoxy", "fr_morpholine",
        "fr_nitro", "fr_oxazole", "fr_oxime", "fr_para_hydroxylation",
        "fr_phos_acid", "fr_piperdine", "fr_piperzine", "fr_priamide",
        "fr_pyridine", "fr_quatN", "fr_sulfide", "fr_sulfonamd",
        "fr_sulfone", "fr_term_acetylene", "fr_tetrazole", "fr_thiazole",
        "fr_thiophene", "fr_unbrch_alkane", "fr_urea",
    )
    result.update(evaluate(Fragments, fragment_names))

    return result


In [3]:
# Sanity-check the descriptor function on one example molecule given as SMILES.
mol = Chem.MolFromSmiles("CC(=O)Oc1ccccc1C(=O)O")
features = compute_rdkit_descriptors(mol)

# First line: number of descriptors returned.
# Second line: the molecular weight and TPSA entries, space-separated.
print(f"{len(features)}")
print(*(features[name] for name in ("MolWt", "TPSA")))


131
180.15899999999996 63.60000000000001
