# Импорты

События в блоке кода:

- Импорт модулей
- Загрузка датасета

In [1]:
from IPython.display import HTML

import pandas as pd
from pandas import array
from pandas import DataFrame

import numpy as np
from numpy import zeros, array

from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, Descriptors, AllChem
import rdkit.Chem.AllChem as AllChem

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
data = pd.read_csv('data.csv')

In [3]:
smiles = data['smiles'].to_list()

# Дескрипторы

События в блоке кода:

- Расчёт физико-химических дескрипторов
- Расчёт структурных дескрипторов
- Расчёт фингерпринтов
- Проверка молекул на корректность
- Удаление некорректных значений
- Стандартизация и нормализация данных

In [4]:
ConstDescriptors = {"NumHAcceptors": Descriptors.NumHAcceptors,
                    "NumHDonors": Descriptors.NumHDonors,
                    "NumHeteroatoms": Descriptors.NumHeteroatoms,
                    "NumAromaticRings": Descriptors.NumAromaticRings,
                    "NumAliphaticHeterocycles": Descriptors.NumAliphaticHeterocycles}
                     
PhisChemDescriptors = {"MW": Descriptors.MolWt,
                       "LogP": Descriptors.MolLogP,
                       "TPSA": Descriptors.TPSA}

descriptors = {}
descriptors.update(ConstDescriptors)
descriptors.update(PhisChemDescriptors)


def mol_dsc_calc(mols): 
    return DataFrame({k: f(m) for k, f in descriptors.items()} 
             for m in mols)

descriptors_transformer = FunctionTransformer(mol_dsc_calc, validate=False)

In [5]:
def mol_dsc_calc(mols): 
    return DataFrame({k: f(m) for k, f in descriptors.items()} 
             for m in mols)

In [6]:
descriptors_transformer = FunctionTransformer(mol_dsc_calc, validate=False)

In [7]:
mols = []
incorrect_smiles = []

for smi in smiles:
    mol = Chem.MolFromSmiles(smi)
    if mol is not None:
        mols.append(mol)
    else:
        incorrect_smiles.append(smiles.index(smi))

[12:06:32] Explicit valence for atom # 0 N, 4, is greater than permitted
[12:06:33] Explicit valence for atom # 0 S, 7, is greater than permitted
[12:06:33] Explicit valence for atom # 0 S, 7, is greater than permitted


In [8]:
incorrect_smiles

[2, 3282, 3500]

In [9]:
y = data['label'].to_list()

In [10]:
y.pop(2)
y.pop(3282)
y.pop(3500)

1

In [11]:
smiles.pop(2)
smiles.pop(3282)
smiles.pop(3500)

'FC1=CC(CNC(=O)C=2C=C3NC(=O)C=4C(NC3=CC2)=CC(C=5C=C(OC)C(O)=CC5)=CC4)=CC=C1'

In [12]:
X = descriptors_transformer.transform(mols)
X

Unnamed: 0,NumHAcceptors,NumHDonors,NumHeteroatoms,NumAromaticRings,NumAliphaticHeterocycles,MW,LogP,TPSA
0,12,6,12,2,1,543.525,0.00130,206.07
1,14,7,21,2,1,507.182,-1.62900,279.13
2,14,5,15,2,1,807.890,3.25960,224.45
3,0,0,2,0,0,42.013,-1.09300,31.50
4,4,3,11,3,0,464.831,5.54970,92.35
...,...,...,...,...,...,...,...,...
3754,11,6,19,3,1,871.835,5.87248,241.40
3755,17,5,25,4,2,993.782,2.02488,311.51
3756,16,5,24,4,2,949.729,2.00828,302.28
3757,15,5,23,4,2,905.676,1.99168,293.05


In [13]:
def calc_morgan(mols):
    for_df = []
    for m in mols:
        arr = zeros((1,), dtype='float32')
        DataStructs.ConvertToNumpyArray(AllChem.GetMorganFingerprintAsBitVect(m, 2, 2048), arr)
        for_df.append(arr)
    return DataFrame(for_df)

In [14]:
morgan_transformer = FunctionTransformer(calc_morgan, validate=False)

In [15]:
M = morgan_transformer.transform(mols)
M

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3754,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
df = pd.concat([X,M],axis=1)
df

Unnamed: 0,NumHAcceptors,NumHDonors,NumHeteroatoms,NumAromaticRings,NumAliphaticHeterocycles,MW,LogP,TPSA,0,1,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,12,6,12,2,1,543.525,0.00130,206.07,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,14,7,21,2,1,507.182,-1.62900,279.13,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,14,5,15,2,1,807.890,3.25960,224.45,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,2,0,0,42.013,-1.09300,31.50,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,3,11,3,0,464.831,5.54970,92.35,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3754,11,6,19,3,1,871.835,5.87248,241.40,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3755,17,5,25,4,2,993.782,2.02488,311.51,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3756,16,5,24,4,2,949.729,2.00828,302.28,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3757,15,5,23,4,2,905.676,1.99168,293.05,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
scaler = StandardScaler()
scaler.fit(df.values)
X = DataFrame(scaler.transform(df.values), index=df.index, columns=df.columns)
X    

Unnamed: 0,NumHAcceptors,NumHDonors,NumHeteroatoms,NumAromaticRings,NumAliphaticHeterocycles,MW,LogP,TPSA,0,1,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,3.246157,3.251687,1.416232,-1.484853,-0.003943,1.082791,-3.380094,4.384257,-0.028262,-0.582777,...,-0.101056,-0.073137,-0.028262,-0.076727,-0.09411,-0.103709,-0.105012,-0.043193,-0.099704,-0.083456
1,4.370793,4.182829,5.421961,-1.484853,-0.003943,0.665024,-4.795324,7.217683,-0.028262,-0.582777,...,-0.101056,-0.073137,-0.028262,-0.076727,-0.09411,-0.103709,-0.105012,-0.043193,-0.099704,-0.083456
2,4.370793,2.320546,2.751475,-1.484853,-0.003943,4.121697,-0.551631,5.097073,-0.028262,1.715921,...,-0.101056,-0.073137,-0.028262,-0.076727,-0.09411,-0.103709,-0.105012,-0.043193,-0.099704,-0.083456
3,-3.501661,-2.335161,-3.034579,-3.935595,-1.239092,-4.682147,-4.330034,-2.385949,-0.028262,-0.582777,...,-0.101056,-0.073137,-0.028262,-0.076727,-0.09411,-0.103709,-0.105012,-0.043193,-0.099704,-0.083456
4,-1.252388,0.458263,0.971151,-0.259483,-1.239092,0.178194,1.436357,-0.026053,-0.028262,-0.582777,...,-0.101056,-0.073137,-0.028262,-0.076727,-0.09411,9.642354,-0.105012,-0.043193,-0.099704,-0.083456
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3754,2.683839,3.251687,4.531799,-0.259483,-0.003943,4.856752,1.716556,5.754431,-0.028262,-0.582777,...,-0.101056,-0.073137,-0.028262,-0.076727,-0.09411,-0.103709,-0.105012,-0.043193,-0.099704,-0.083456
3755,6.057748,2.320546,7.202286,0.965888,1.231206,6.258547,-1.623466,8.473450,-0.028262,-0.582777,...,-0.101056,-0.073137,-0.028262,-0.076727,-0.09411,-0.103709,-0.105012,-0.043193,-0.099704,-0.083456
3756,5.495430,2.320546,6.757204,0.965888,1.231206,5.752152,-1.637876,8.115491,-0.028262,-0.582777,...,-0.101056,-0.073137,-0.028262,-0.076727,-0.09411,-0.103709,-0.105012,-0.043193,-0.099704,-0.083456
3757,4.933111,2.320546,6.312123,0.965888,1.231206,5.245758,-1.652287,7.757531,-0.028262,-0.582777,...,-0.101056,-0.073137,-0.028262,-0.076727,-0.09411,-0.103709,-0.105012,-0.043193,-0.099704,-0.083456


In [18]:
df = X.join(pd.DataFrame(
    {
        'smiles': smiles,
        'labels': y,
    }, index=X.index
))

In [19]:
df.to_csv('data_ML.csv')

In [20]:
df

Unnamed: 0,NumHAcceptors,NumHDonors,NumHeteroatoms,NumAromaticRings,NumAliphaticHeterocycles,MW,LogP,TPSA,0,1,...,2040,2041,2042,2043,2044,2045,2046,2047,smiles,labels
0,3.246157,3.251687,1.416232,-1.484853,-0.003943,1.082791,-3.380094,4.384257,-0.028262,-0.582777,...,-0.028262,-0.076727,-0.09411,-0.103709,-0.105012,-0.043193,-0.099704,-0.083456,COC1=C2C(=O)C3=C(C(O)=C4C[C@](O)(C[C@H](O[C@H]...,0
1,4.370793,4.182829,5.421961,-1.484853,-0.003943,0.665024,-4.795324,7.217683,-0.028262,-0.582777,...,-0.028262,-0.076727,-0.09411,-0.103709,-0.105012,-0.043193,-0.099704,-0.083456,NC1=NC=NC2=C1N=CN2[C@@H]1O[C@H](COP(O)(=O)OP(O...,0
2,4.370793,2.320546,2.751475,-1.484853,-0.003943,4.121697,-0.551631,5.097073,-0.028262,1.715921,...,-0.028262,-0.076727,-0.09411,-0.103709,-0.105012,-0.043193,-0.099704,-0.083456,[H][C@@]12C[C@H](O)[C@@]3(C)C(=O)[C@H](O)C4=C(...,0
3,-3.501661,-2.335161,-3.034579,-3.935595,-1.239092,-4.682147,-4.330034,-2.385949,-0.028262,-0.582777,...,-0.028262,-0.076727,-0.09411,-0.103709,-0.105012,-0.043193,-0.099704,-0.083456,[H].O.[Na],0
4,-1.252388,0.458263,0.971151,-0.259483,-1.239092,0.178194,1.436357,-0.026053,-0.028262,-0.582777,...,-0.028262,-0.076727,-0.09411,9.642354,-0.105012,-0.043193,-0.099704,-0.083456,CNC(=O)C1=NC=CC(OC2=CC=C(NC(=O)NC3=CC(=C(Cl)C=...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3754,2.683839,3.251687,4.531799,-0.259483,-0.003943,4.856752,1.716556,5.754431,-0.028262,-0.582777,...,-0.028262,-0.076727,-0.09411,-0.103709,-0.105012,-0.043193,-0.099704,-0.083456,C[C@@H]1NC(=O)N[C@@H]1CCCCCC(=O)NCCCCCCCCC(=O)...,0
3755,6.057748,2.320546,7.202286,0.965888,1.231206,6.258547,-1.623466,8.473450,-0.028262,-0.582777,...,-0.028262,-0.076727,-0.09411,-0.103709,-0.105012,-0.043193,-0.099704,-0.083456,N#CCN(C(=O)CCOCCOCCNC(=O)COc1cccc2c1C(=O)N(C1C...,0
3756,5.495430,2.320546,6.757204,0.965888,1.231206,5.752152,-1.637876,8.115491,-0.028262,-0.582777,...,-0.028262,-0.076727,-0.09411,-0.103709,-0.105012,-0.043193,-0.099704,-0.083456,N#CCN(C(=O)CCOCCNC(=O)COc1cccc2c1C(=O)N(C1CCC(...,0
3757,4.933111,2.320546,6.312123,0.965888,1.231206,5.245758,-1.652287,7.757531,-0.028262,-0.582777,...,-0.028262,-0.076727,-0.09411,-0.103709,-0.105012,-0.043193,-0.099704,-0.083456,N#CCN(C(=O)CCNC(=O)COc1cccc2c1C(=O)N(C1CCC(=O)...,0
