In [1]:
import pandas as pd
from pandas import array
from pandas import DataFrame

import numpy as np
from numpy import zeros, array

from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, Descriptors, AllChem
import rdkit.Chem.AllChem as AllChem

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline



In [2]:
# Load the dataset from a tab-separated file; the 'Smiles' and 'label'
# columns of this frame are the ones used by the cells below.
data = pd.read_csv("data.csv", sep='\t')

In [3]:
# Display the loaded table (long frames are truncated in the rendered output).
data

Unnamed: 0.1,Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Type,Standard Value,Standard Units,label
0,0,CHEMBL3585775,COc1ccccc1C(C)NS(=O)(=O)NC(=O)OCc1ccccc1,IC50,0.02685,nM,1
1,1,CHEMBL552832,Cl.O=C(CCC1CCN(Cc2ccccc2)CC1)c1ccccc1,IC50,305.00000,nM,1
2,2,CHEMBL2441674,COc1cc2c(cc1OC)C(=O)C(CCCCCc1cccc(OC)[n+]1C)C2...,IC50,120.00000,nM,1
3,3,CHEMBL2441672,COc1cc2c(cc1OC)C[N+](C)(CCCCCCc1cccc(OC)[n+]1C...,IC50,150.00000,nM,1
4,4,CHEMBL2441669,COc1cccc(CCCCCCCCc2cccc(OC)[n+]2C)[n+]1C.O=S(=...,IC50,180.00000,nM,1
...,...,...,...,...,...,...,...
5634,5634,CHEMBL4763528,CN1C[C@@H](c2cccs2)[C@@]2(CSc3ccccc3C2=O)C12C(...,IC50,21160.00000,nM,0
5635,5635,CHEMBL4169394,Cc1cc(=O)oc2cc(OCCSC(=S)N(C)C)ccc12,IC50,10590.00000,nM,0
5636,5636,CHEMBL4590539,CCN(CC)Cc1cc2cc(c1O)Oc1ccc(cc1)C[C@H]1c3cc(c(O...,IC50,68000.00000,nM,0
5637,5637,CHEMBL452895,COc1cc2c3cc1Oc1c(O)c(OC)cc4c1[C@@H](Cc1ccc(O)c...,IC50,97000.00000,nM,0


In [4]:
# Extract the SMILES strings and their labels as plain Python lists,
# both in the row order of `data`.
smiles, labels = (data[column].to_list() for column in ('Smiles', 'label'))

In [6]:
# Class balance check: number of rows for each value of the 'label' column.
print(data['label'].value_counts())

0    3646
1    1993
Name: label, dtype: int64


In [7]:
# Physico-chemical descriptors: column name -> RDKit function that is called
# with a molecule to compute the value.
PhisChemDescriptors = {
    "LogP": Descriptors.MolLogP,
    "TPSA": Descriptors.TPSA,
}

# Registry of all descriptors to compute; mol_dsc_calc reads it at call time.
# It is a copy, so PhisChemDescriptors itself is never modified through it.
descriptors = dict(PhisChemDescriptors)
# Prints how many descriptors are registered (the message text is in Russian).
print(f"Количество дескрипторов в словаре: {len(descriptors)}")

# Function that generates descriptors from molecules
def mol_dsc_calc(mols):
    """Compute every registered descriptor for each molecule.

    Parameters
    ----------
    mols : iterable
        Molecules; each one is passed to every function stored in the
        module-level ``descriptors`` dict.

    Returns
    -------
    DataFrame
        One row per molecule (in input order) and one column per key of
        ``descriptors``.
    """
    records = []
    for mol in mols:
        row = {}
        # `descriptors` is looked up when the function runs, so entries
        # added to the registry later are picked up automatically.
        for name, compute in descriptors.items():
            row[name] = compute(mol)
        records.append(row)
    return DataFrame(records)

# Wrap the function as an sklearn transformer so that it can be used as a
# step in pipeline-based modelling (sklearn Pipeline).
descriptors_transformer = FunctionTransformer(func=mol_dsc_calc, validate=False)

Количество дескрипторов в словаре: 2


In [8]:
# Parse every SMILES string. Entries for which Chem.MolFromSmiles returns
# None (the string could not be parsed) are left out of `molecules`.
parsed = [Chem.MolFromSmiles(smile) for smile in smiles]
molecules = [mol for mol in parsed if mol is not None]

# Rebuild `labels` from `data` with the same filter so that labels[i] always
# belongs to molecules[i]. Dropping molecules alone would shift every label
# that follows the first unparsable SMILES. Deriving the list from `data`
# (not from the previous `labels`) keeps this cell safe to re-run.
labels = [label
          for mol, label in zip(parsed, data['label'].to_list())
          if mol is not None]
assert len(labels) == len(molecules), "labels and molecules are out of sync"

print(len(molecules))

5639


In [9]:
# Compute the descriptor table (one row per molecule) for all parsed molecules.
X = descriptors_transformer.transform(molecules)

In [10]:
# Preview the descriptor table.
X

Unnamed: 0,LogP,TPSA
0,2.51700,93.73
1,4.98350,20.31
2,3.74640,105.84
3,3.94550,145.97
4,3.58140,140.62
...,...,...
5634,4.57010,54.45
5635,3.05992,42.68
5636,7.70120,76.10
5637,6.55640,83.86


In [11]:
# Function that takes molecules as input and returns a table
def calc_morgan(mols, radius=2, n_bits=2048):
    """Generate Morgan fingerprints as a table of bits.

    Parameters
    ----------
    mols : iterable
        RDKit molecules to fingerprint.
    radius : int, default 2
        Morgan radius passed to ``GetMorganFingerprintAsBitVect``.
    n_bits : int, default 2048
        Fingerprint length, passed as ``nBits``.

    Returns
    -------
    DataFrame
        One row per molecule (in input order), one integer column per bit.
        An empty input yields an empty DataFrame.

    Notes
    -----
    The defaults reproduce the previous hard-coded behaviour (radius 2,
    2048 bits). Other values can be supplied through
    ``FunctionTransformer(calc_morgan, kw_args={...})``.
    """
    rows = []
    for mol in mols:
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        # One-element placeholder: ConvertToNumpyArray resizes it in place
        # to the fingerprint length before filling it.
        bits = zeros((1,), dtype=int)
        DataStructs.ConvertToNumpyArray(fingerprint, bits)
        rows.append(bits)
    return DataFrame(rows)

In [12]:
# Wrap the fingerprint function as an sklearn transformer.
morgan_transformer = FunctionTransformer(func=calc_morgan, validate=False)
# NOTE: this rebinds X. From here on X holds the Morgan fingerprint table,
# not the descriptor table computed in the earlier cell.
X = morgan_transformer.transform(molecules)
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5634,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5635,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5636,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5637,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Standardize every column of X, then put the scaled values back into a
# DataFrame that keeps X's row index and column labels.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X.values)
X_norm_SS = DataFrame(scaled_values, index=X.index, columns=X.columns)
X_norm_SS

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,-0.063996,2.919818,4.139062,-0.06806,-0.084523,-0.091678,-0.075546,-0.066732,-0.215836,-0.128785,...,-0.063996,-0.049889,-0.094584,-0.191759,8.181672,-0.039982,-0.08868,-0.070641,-0.02979,-0.08127
1,-0.063996,-0.342487,-0.241601,-0.06806,-0.084523,-0.091678,-0.075546,-0.066732,-0.215836,-0.128785,...,-0.063996,-0.049889,-0.094584,-0.191759,-0.122224,-0.039982,-0.08868,-0.070641,-0.02979,-0.08127
2,-0.063996,-0.342487,-0.241601,-0.06806,-0.084523,-0.091678,-0.075546,-0.066732,-0.215836,-0.128785,...,-0.063996,-0.049889,-0.094584,-0.191759,-0.122224,-0.039982,-0.08868,-0.070641,-0.02979,-0.08127
3,-0.063996,-0.342487,-0.241601,-0.06806,-0.084523,-0.091678,-0.075546,-0.066732,-0.215836,-0.128785,...,-0.063996,-0.049889,-0.094584,-0.191759,-0.122224,-0.039982,-0.08868,-0.070641,-0.02979,-0.08127
4,-0.063996,-0.342487,-0.241601,-0.06806,-0.084523,-0.091678,-0.075546,-0.066732,-0.215836,-0.128785,...,-0.063996,-0.049889,-0.094584,-0.191759,-0.122224,-0.039982,-0.08868,-0.070641,-0.02979,-0.08127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5634,-0.063996,-0.342487,-0.241601,-0.06806,-0.084523,-0.091678,-0.075546,-0.066732,-0.215836,-0.128785,...,-0.063996,-0.049889,-0.094584,-0.191759,-0.122224,-0.039982,-0.08868,-0.070641,-0.02979,-0.08127
5635,-0.063996,-0.342487,-0.241601,-0.06806,-0.084523,-0.091678,-0.075546,-0.066732,-0.215836,-0.128785,...,-0.063996,-0.049889,-0.094584,-0.191759,-0.122224,-0.039982,-0.08868,-0.070641,-0.02979,-0.08127
5636,-0.063996,-0.342487,-0.241601,-0.06806,-0.084523,-0.091678,-0.075546,-0.066732,-0.215836,-0.128785,...,-0.063996,-0.049889,-0.094584,-0.191759,-0.122224,-0.039982,-0.08868,-0.070641,-0.02979,-0.08127
5637,-0.063996,-0.342487,-0.241601,-0.06806,-0.084523,-0.091678,-0.075546,-0.066732,-0.215836,-0.128785,...,-0.063996,-0.049889,-0.094584,-0.191759,-0.122224,-0.039982,-0.08868,-0.070641,-0.02979,-0.08127


In [14]:
# Descriptor pipeline: compute the descriptors, then standardize them.
# The pipeline gets its own StandardScaler() rather than the `scaler` object
# fitted on the fingerprint table: Pipeline fits its steps in place, so
# reusing `scaler` would silently refit it on the descriptor columns and
# discard the statistics it learned from the fingerprints.
normal_descriptors_transformer = Pipeline([('gen', descriptors_transformer), ('norm', StandardScaler())])

# NOTE: X_norm_SS is rebound here and now holds the NumPy array returned by
# the pipeline (standardized descriptors), not the earlier DataFrame.
X_norm_SS = normal_descriptors_transformer.fit_transform(molecules)
X_norm_SS

array([[-0.62220977,  1.02971085],
       [ 0.38475076, -1.33777075],
       [-0.12030129,  1.42020666],
       ...,
       [ 1.4942649 ,  0.46121861],
       [ 1.02689478,  0.71144548],
       [ 0.34551747, -0.0659991 ]])

In [15]:
# Show the pipeline's repr to confirm its two steps ('gen' then 'norm').
normal_descriptors_transformer

Pipeline(steps=[('gen',
                 FunctionTransformer(func=<function mol_dsc_calc at 0x7f0ce84d9160>)),
                ('norm', StandardScaler())])