In [1]:
import rdkit
rdkit.__version__

'2023.03.3'

In [2]:
import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Draw
from rdkit.Chem import DataStructs
from rdkit.Chem.Draw import IPythonConsole

from sklearn import datasets, metrics
from sklearn.preprocessing import StandardScaler


# Test dataset

In [3]:
test_df = pd.read_csv('test_DILIrank_raw.csv')
test_df = test_df.dropna().drop_duplicates().reset_index(drop=True)
test_df

Unnamed: 0,name,smiles,toxicity
0,raltegravir,CC1=NN=C(O1)C(=O)NC(C)(C)C2=N/C(=C(/NCC3=CC=C(...,1
1,riluzole,C1=CC2=C(C=C1OC(F)(F)F)SC(=N2)N,1
2,cyclosporine,CCC1C(=O)N(CC(=O)N(C(C(=O)NC(C(=O)N(C(C(=O)NC(...,1
3,fenoprofen,CC(C1=CC(=CC=C1)OC2=CC=CC=C2)C(=O)O,1
4,acetazolamide,CC(=O)NC1=NN=C(S1)S(=O)(=O)N,1
...,...,...,...
447,Levoleucovorin,C1C(N(C2=C(N1)NC(=NC2=O)N)C=O)CNC3=CC=C(C=C3)C...,0
448,levomefolate calcium,CN1[C@H](CNC2=C1C(=O)N=C(N2)N)CNC3=CC=C(C=C3)C...,0
449,daunorubicin,C[C@H]1[C@H]([C@H](C[C@@H](O1)O[C@H]2C[C@@](CC...,0
450,nystatin,C[C@@H]1[C@H]([C@@H]([C@@H](C(O1)OC\2CC(C(C(CC...,0


In [4]:
test_df['toxicity'].value_counts()

0    268
1    184
Name: toxicity, dtype: int64

## For vit

In [5]:
# transform smiles to mol
mols = [Chem.MolFromSmiles(smiles) for smiles in test_df["smiles"]]

# if smiles don't transform to mol, add to non_list
none_list = []
for i in range(len(mols)):
    if mols[i] is None :
        none_list.append(i)
        print('none_list에 추가됨')
    
reg_idx = 0
for i in none_list :
    del mols[i - reg_idx]
    reg_idx += 1
    
# modify index
if len(none_list) != 0 :
    test_df = test_df.drop(none_list, axis=0)
    test_df = test_df.reset_index(drop = True)

In [6]:
# create fingerprint
bit_info_list = [] # bit vector
bit_info = {} #bit vector
fps = []

b = 0

# mol to fingerprint Bit Vector
for a in mols :
    fps.append(AllChem.GetMorganFingerprintAsBitVect(a, 3, nBits = 1024, bitInfo = bit_info))
    bit_info_list.append(bit_info.copy()) 
    
# to array

arr_list = list()
for i in range(len(fps)):
    array = np.zeros((0,), dtype = np.int8)
    arr_list.append(array)
    
for i in range(len(fps)):
    bit = fps[i]
    DataStructs.ConvertToNumpyArray(bit, arr_list[i])
    
test_x = np.stack([i.tolist() for i in arr_list])
test_x = test_x.astype(np.float32)
test_finprt = pd.DataFrame(test_x)

In [7]:
# create physicochemical properties

from rdkit.Chem import QED

qe = [QED.properties(mol) for mol in mols]
qe = pd.DataFrame(qe)
qe

Unnamed: 0,MW,ALOGP,HBA,HBD,PSA,ROTB,AROM,ALERTS
0,444.423,0.98202,8,3,150.02,6,2,3
1,234.202,2.77710,3,1,48.14,1,2,0
2,1202.635,3.26900,12,5,278.80,15,0,1
3,242.274,3.66700,3,1,46.53,4,2,0
4,222.251,-0.85610,5,2,115.04,2,1,1
...,...,...,...,...,...,...,...,...
447,473.446,-0.73110,11,7,219.84,10,2,1
448,497.525,-3.30790,11,5,208.43,9,2,1
449,527.526,1.02890,11,5,185.84,4,3,1
450,926.107,0.77830,18,12,327.45,3,0,1


In [8]:
from pickle import load 

#QED datapreprocessing 
load_ss = load(open('./DILI_standard_scaler.pkl','rb'))

ss = StandardScaler()

ss.fit(qe)
qe_scaled = ss.transform(qe) 


qe_scaled = pd.DataFrame(qe_scaled)
qe_scaled.columns =['MW','ALOGP','HBA','HBD','PSA','ROTB','AROM','ALERTS']
qe_scaled

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Unnamed: 0,MW,ALOGP,HBA,HBD,PSA,ROTB,AROM,ALERTS
0,-0.031682,-0.078496,0.124693,-0.070660,0.065800,-0.124521,0.286838,1.806079
1,-0.372504,0.309422,-0.362862,-0.288668,-0.316390,-0.396949,0.286838,-0.882223
2,1.197573,0.415722,0.514738,0.147348,0.548903,0.365849,-1.212015,0.013878
3,-0.359417,0.501730,-0.362862,-0.288668,-0.322430,-0.233492,0.286838,-0.882223
4,-0.391879,-0.475714,-0.167840,-0.179664,-0.065423,-0.342464,-0.462589,0.013878
...,...,...,...,...,...,...,...,...
447,0.015372,-0.448702,0.417227,0.365357,0.327722,0.093421,0.286838,0.013878
448,0.054410,-1.005549,0.417227,0.147348,0.284919,0.038936,0.286838,0.013878
449,0.103049,-0.068365,0.417227,0.147348,0.200175,-0.233492,1.036265,0.013878
450,0.749251,-0.122520,1.099804,0.910377,0.731408,-0.287978,-1.212015,0.013878


In [9]:
input_df = pd.concat([test_finprt,qe_scaled,test_df['toxicity']],axis=1)
input_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1023,MW,ALOGP,HBA,HBD,PSA,ROTB,AROM,ALERTS,toxicity
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.031682,-0.078496,0.124693,-0.070660,0.065800,-0.124521,0.286838,1.806079,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.372504,0.309422,-0.362862,-0.288668,-0.316390,-0.396949,0.286838,-0.882223,1
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.197573,0.415722,0.514738,0.147348,0.548903,0.365849,-1.212015,0.013878,1
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.359417,0.501730,-0.362862,-0.288668,-0.322430,-0.233492,0.286838,-0.882223,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.391879,-0.475714,-0.167840,-0.179664,-0.065423,-0.342464,-0.462589,0.013878,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.015372,-0.448702,0.417227,0.365357,0.327722,0.093421,0.286838,0.013878,0
448,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.054410,-1.005549,0.417227,0.147348,0.284919,0.038936,0.286838,0.013878,0
449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.103049,-0.068365,0.417227,0.147348,0.200175,-0.233492,1.036265,0.013878,0
450,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.749251,-0.122520,1.099804,0.910377,0.731408,-0.287978,-1.212015,0.013878,0


In [10]:
x_test = input_df.iloc[:,:1032]
y_test = input_df['toxicity']