In [1]:
import pandas as pd
from pandas import DataFrame

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams

from sklearn.preprocessing import FunctionTransformer

from tqdm.auto import tqdm

from scopy.ScoTox import Toxfilter



In [2]:
# Read every record from the SD file and drop the entries the supplier
# yields as None.
molecules = list(
    filter(
        lambda entry: entry is not None,
        Chem.SDMolSupplier("Adenosine_20323.sdf"),
    )
)
print(f'Количество молекул = {len(molecules)}')

Количество молекул = 20323


In [3]:
# Collect the text line that immediately follows every line containing the
# "<Smile>" tag in the SD file.
#
# The file is scanned once: a flag records whether the previous line held the
# tag, so the current line is the value to keep. This avoids re-reading the
# file and testing every line number against a list of indices.
#
# NOTE(review): this list covers every tagged record in the file, whereas
# `molecules` discards records the supplier yields as None. Confirm both
# have the same length before pairing them row by row.

SMILES = []
index_list = []  # 0-based line numbers of the "<Smile>" tag lines

previous_was_tag = False
with open('Adenosine_20323.sdf', 'r', encoding='utf-8') as f:
    for index, line in enumerate(f):
        if previous_was_tag:
            # Trailing whitespace (including the newline) is stripped.
            SMILES.append(line.rstrip())
        previous_was_tag = "<Smile>" in line
        if previous_was_tag:
            index_list.append(index)

# 0-based line numbers of the value lines (one past each tag line).
SMILES_index = [x + 1 for x in index_list]

print(len(SMILES))

20323


In [4]:
# Structural descriptors: column name -> descriptor function.
# Each key is also the attribute name looked up on `Descriptors`.
ConstDescriptors = {
    name: getattr(Descriptors, name)
    for name in (
        "HeavyAtomCount",
        "NHOHCount",
        "NOCount",
        "NumHAcceptors",
        "NumHDonors",
        "NumHeteroatoms",
        "NumRotatableBonds",
        "NumValenceElectrons",
        "NumAromaticRings",
        "NumAliphaticHeterocycles",
        "RingCount",
    )
}

# Physico-chemical descriptors: short column label -> descriptor function.
PhisChemDescriptors = dict(
    MW=Descriptors.MolWt,
    LogP=Descriptors.MolLogP,
    MR=Descriptors.MolMR,
    TPSA=Descriptors.TPSA,
)

In [5]:
# Merge both descriptor groups into one lookup table. On a key clash the
# physico-chemical entry wins, exactly as with successive update() calls.
descriptors = {**ConstDescriptors, **PhisChemDescriptors}
print(f"Количество дескрипторов в словаре: {len(descriptors)}")

Количество дескрипторов в словаре: 15


In [6]:
# Descriptor generation for a collection of molecules.
def mol_dsc_calc(mols):
    """Build a descriptor table for an iterable of molecules.

    Every function in the module-level ``descriptors`` mapping is applied to
    each item of ``mols``.

    Args:
        mols: iterable of objects accepted by the descriptor functions.

    Returns:
        DataFrame with one row per item of ``mols`` and one column per key
        of ``descriptors``.
    """
    rows = []
    for mol in mols:
        rows.append({label: func(mol) for label, func in descriptors.items()})
    return DataFrame(rows)

# Wrap the descriptor function as a scikit-learn transformer so it can be
# used as a step in pipeline-style modelling (sklearn Pipeline).
descriptors_transformer = FunctionTransformer(
    func=mol_dsc_calc,
    validate=False,
)

In [7]:
# Apply the descriptor transformer to the loaded molecules; X holds the result.
X = descriptors_transformer.transform(molecules)

In [8]:
# Show the descriptor table as the cell output.
X

Unnamed: 0,HeavyAtomCount,NHOHCount,NOCount,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumValenceElectrons,NumAromaticRings,NumAliphaticHeterocycles,RingCount,MW,LogP,MR,TPSA
0,32,2,8,6,2,9,9,176,2,1,3,463.600,3.00734,122.4067,100.73
1,25,0,6,5,0,6,4,130,3,0,3,339.395,2.88572,95.4385,68.34
2,34,1,9,8,1,9,7,174,4,0,4,459.506,3.40454,128.2442,108.11
3,35,1,9,8,1,9,8,180,4,0,4,473.533,3.79464,132.8612,108.11
4,23,1,6,6,1,7,4,118,3,0,3,331.349,2.30060,86.8332,85.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20318,22,1,4,4,1,6,4,112,3,0,3,316.357,4.08640,85.1422,51.22
20319,24,2,6,6,2,8,3,120,4,0,4,357.416,3.09479,94.1129,79.51
20320,27,0,6,9,0,9,5,138,4,0,4,417.537,4.34999,110.1840,62.19
20321,29,1,5,6,1,7,7,152,3,0,4,428.579,5.11890,119.2662,60.33


In [9]:
# Attach the SMILES strings as a 'smiles' column (modifies X in place), then
# display the table.
# NOTE(review): assumes SMILES is in the same order as the rows of X — confirm.
X['smiles'] = SMILES
X

Unnamed: 0,HeavyAtomCount,NHOHCount,NOCount,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumValenceElectrons,NumAromaticRings,NumAliphaticHeterocycles,RingCount,MW,LogP,MR,TPSA,smiles
0,32,2,8,6,2,9,9,176,2,1,3,463.600,3.00734,122.4067,100.73,c1(c(c([nH]c1C)C)C(=O)OCC)S(NCC1CCN(Cc2c(OC)cc...
1,25,0,6,5,0,6,4,130,3,0,3,339.395,2.88572,95.4385,68.34,c12c(C(N(Cc3ccccc3)C(C)C)=O)c(oc1\N=C/N(C2=O)C)C
2,34,1,9,8,1,9,7,174,4,0,4,459.506,3.40454,128.2442,108.11,c12c(c(n(n1)c1ccccc1)C)\C(=N/N(C2=O)CCCC(Nc1c(...
3,35,1,9,8,1,9,8,180,4,0,4,473.533,3.79464,132.8612,108.11,c12c(c(n(n1)c1ccccc1)C)\C(=N/N(C2=O)CCCC(Nc1cc...
4,23,1,6,6,1,7,4,118,3,0,3,331.349,2.30060,86.8332,85.61,CCOC(CNC(C1SC2=C(C=1)C(Oc1c2cccc1)=O)=O)=O
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20318,22,1,4,4,1,6,4,112,3,0,3,316.357,4.08640,85.1422,51.22,c1cc(c(C(NC2Sc3c(N=2)ccc(c3)OCC)=O)cc1)F
20319,24,2,6,6,2,8,3,120,4,0,4,357.416,3.09479,94.1129,79.51,c1cc2N3C(SC(=C3NC(c2cc1)=O)C(NCC1OC=CC=1)=O)=S
20320,27,0,6,9,0,9,5,138,4,0,4,417.537,4.34999,110.1840,62.19,c1cc(c(N2c3[nH0]c([nH0](CC4OC=CC=4)c(c3SC2=S)=...
20321,29,1,5,6,1,7,7,152,3,0,4,428.579,5.11890,119.2662,60.33,CCOC(C1=C(NC(=O)CSc2c[nH0](CC)c3c2cccc3)SC2=C1...


In [10]:
# Keep only the rows that satisfy all six descriptor thresholds below.
# NOTE(review): the cut-offs look like drug-likeness limits; confirm that
# strict inequalities (rather than <=) are what is intended.
filtered_molecules = X[
    X["NumHDonors"].lt(5)
    & X["MW"].lt(500)
    & X["LogP"].lt(5)
    & X["NumHAcceptors"].lt(10)
    & X["NumRotatableBonds"].lt(10)
    & X["RingCount"].gt(0)
]

In [11]:
# Show the filtered table as the cell output.
filtered_molecules

Unnamed: 0,HeavyAtomCount,NHOHCount,NOCount,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumValenceElectrons,NumAromaticRings,NumAliphaticHeterocycles,RingCount,MW,LogP,MR,TPSA,smiles
0,32,2,8,6,2,9,9,176,2,1,3,463.600,3.00734,122.4067,100.73,c1(c(c([nH]c1C)C)C(=O)OCC)S(NCC1CCN(Cc2c(OC)cc...
1,25,0,6,5,0,6,4,130,3,0,3,339.395,2.88572,95.4385,68.34,c12c(C(N(Cc3ccccc3)C(C)C)=O)c(oc1\N=C/N(C2=O)C)C
2,34,1,9,8,1,9,7,174,4,0,4,459.506,3.40454,128.2442,108.11,c12c(c(n(n1)c1ccccc1)C)\C(=N/N(C2=O)CCCC(Nc1c(...
3,35,1,9,8,1,9,8,180,4,0,4,473.533,3.79464,132.8612,108.11,c12c(c(n(n1)c1ccccc1)C)\C(=N/N(C2=O)CCCC(Nc1cc...
4,23,1,6,6,1,7,4,118,3,0,3,331.349,2.30060,86.8332,85.61,CCOC(CNC(C1SC2=C(C=1)C(Oc1c2cccc1)=O)=O)=O
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20316,28,1,7,7,1,7,6,148,3,1,4,379.512,2.81582,113.7537,62.11,CCCN1CCN(c2[nH0]c3c(c([nH0]2)NCc2ccc(C)cc2)c[n...
20317,22,1,4,4,1,5,5,112,3,0,3,312.394,4.33740,89.8012,51.22,CCCOc1ccc(C(NC2Sc3c(N=2)cccc3)=O)cc1
20318,22,1,4,4,1,6,4,112,3,0,3,316.357,4.08640,85.1422,51.22,c1cc(c(C(NC2Sc3c(N=2)ccc(c3)OCC)=O)cc1)F
20319,24,2,6,6,2,8,3,120,4,0,4,357.416,3.09479,94.1129,79.51,c1cc2N3C(SC(=C3NC(c2cc1)=O)C(NCC1OC=CC=1)=O)=S
