# Stability data

In [1]:
import pandas as pd
import numpy as np

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem

In [6]:
data_df=pd.read_csv('../../2023-2/ETC/molecule_stability.csv')

In [7]:
print(len(data_df))
data_df.head()

3498


Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea
0,TRAIN_0000,CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC,26.01,50.68,3.259,400.495,5,2,8,3.259,117.37
1,TRAIN_0001,Cc1nc(C)c(CN2CC(C)C(=O)Nc3ccccc32)s1,29.27,50.59,2.169,301.407,2,1,2,2.172,73.47
2,TRAIN_0002,CCCN1CCN(c2nn3nnnc3c3ccccc23)CC1,5.586,80.892,1.593,297.358,5,0,3,1.585,62.45
3,TRAIN_0003,Cc1ccc(-c2ccc(-n3nc(C)c(S(=O)(=O)N4CCN(C5CCCCC...,5.71,2.0,4.771,494.652,6,0,5,3.475,92.6
4,TRAIN_0004,Cc1ccc2c(c1)N(C(=O)c1ccncc1)CC(C)O2,93.27,99.99,2.335,268.31,3,0,1,2.337,42.43


In [9]:
smis = []
labels_MLM = []
labels_HLM = []

failed=0
for smi,label_m, label_h in zip(data_df.SMILES, data_df.MLM, data_df.HLM):
    mol=Chem.MolFromSmiles(smi)
    if mol:
        smis.append(smi)
        labels_MLM.append(label_m)
        labels_HLM.append(label_h)
    else:
        print(smi)
        failed+=1
failed

0

In [11]:
len(smis), len(labels_MLM), failed, len(bbbp_df)

(3498, 3498, 0, 3498)

# Chemical Fingerprint

In [12]:
def rdkit_fingerprint(smi,radius=2, nbits=1024):
    mol = Chem.MolFromSmiles(smi)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nbits)
    return fp.ToList()

In [13]:
fps = [rdkit_fingerprint(smi) for smi in smis]
fps = np.array(fps)
fps.shape

(3498, 1024)

In [14]:
fp_df = pd.DataFrame(fps)
fp_df = fp_df.assign(MLM=labels_MLM, HLM=labels_HLM)
fp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1016,1017,1018,1019,1020,1021,1022,1023,MLM,HLM
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,26.010,50.680
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,29.270,50.590
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5.586,80.892
3,0,1,1,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,5.710,2.000
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,93.270,99.990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3493,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.556,3.079
3494,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,35.560,47.630
3495,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,56.150,1.790
3496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0.030,2.770


In [15]:
fp_df.to_csv('../../2023-2/processed_data/MolStab_ECFP_R2B1024.csv',index=False)

### mordred descriptor (~1300 descriptors)
- requires `mordred` package: `pip install mordred`

In [16]:
from mordred import Calculator, descriptors

In [17]:
mols = [Chem.MolFromSmiles(smi) for smi in smis]

In [18]:
calc = Calculator(descriptors, ignore_3D=True)
mordred_df = calc.pandas(mols)

  8%|▊         | 271/3498 [00:33<06:11,  8.68it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 34%|███▍      | 1190/3498 [02:05<05:35,  6.88it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 3498/3498 [05:45<00:00, 10.13it/s]


In [19]:
mordred_df

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,35.689316,2.421199,4.745523,35.689316,1.274618,4.249154,...,10.081676,78.761075,400.156912,7.695325,2380,40,142.0,165.0,9.500000,6.361111
1,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,26.575899,2.426398,4.757199,26.575899,1.265519,3.984419,...,9.907828,69.149596,301.124883,7.528122,870,35,112.0,132.0,7.138889,4.527778
2,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,2,1,29.802128,2.510668,4.982923,29.802128,1.354642,4.049690,...,10.144510,70.158066,297.170194,7.248054,1028,36,120.0,145.0,5.277778,4.888889
3,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,1,45.884166,2.532483,4.973440,45.884166,1.310976,4.500758,...,10.613467,86.199585,494.246395,7.162991,4170,61,192.0,231.0,10.784722,7.500000
4,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,26.308663,2.452930,4.905860,26.308663,1.315433,3.935574,...,9.978363,53.872357,268.121178,7.447810,762,32,106.0,125.0,6.277778,4.361111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3493,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,30.902711,2.515832,4.792938,30.902711,1.236108,4.158311,...,10.208580,77.363487,395.052750,10.129558,1615,38,136.0,162.0,10.062500,5.250000
3494,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,35.887372,2.494190,4.985907,35.887372,1.329162,4.237338,...,10.291162,75.955433,359.138225,8.162232,1765,45,144.0,173.0,7.750000,6.027778
3495,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,23.546531,2.450594,4.723643,23.546531,1.239291,3.855798,...,9.677277,66.189153,261.147727,6.872309,795,26,94.0,108.0,6.916667,4.333333
3496,module 'numpy' has no attribute 'float'.\n`np....,module 'numpy' has no attribute 'float'.\n`np....,0,0,23.936088,2.314213,4.623104,23.936088,1.259794,3.855415,...,9.604475,65.335399,284.056385,8.876762,812,25,94.0,107.0,6.916667,4.250000


In [20]:
mordred_df.to_csv('../../2023-2/processed_data/MolStab_mordred.csv',index=False)