# BBBP data

In [1]:
import pandas as pd
import numpy as np

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem

In [2]:
# Read the raw BBBP table; later cells use its `smiles` and `p_np` columns.
bbbp_csv_path = '../../2023-2/MoleculeNet/BBBP.csv'
bbbp_df = pd.read_csv(bbbp_csv_path)

In [3]:
# Print the row count, then preview the leading rows of the table.
print(bbbp_df.shape[0])
bbbp_df.head()

2050


Unnamed: 0,num,name,p_np,smiles
0,1,Propanolol,1,[Cl].CC(C)NCC(O)COc1cccc2ccccc12
1,2,Terbutylchlorambucil,1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl
2,3,40730,1,c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...
3,4,24,1,C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C
4,5,cloxacillin,1,Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)...


In [4]:
# Keep only the rows whose SMILES RDKit can turn into a molecule.
# Unparsable SMILES are printed and counted; their labels are dropped with them
# so that `smis` and `labels` stay aligned.
smis, labels = [], []
failed = 0

for smi, label in zip(bbbp_df['smiles'], bbbp_df['p_np']):
    mol = Chem.MolFromSmiles(smi)
    if not mol:
        # Parsing failed: report the offending SMILES and move on.
        print(smi)
        failed += 1
        continue
    smis.append(smi)
    labels.append(label)

failed

[13:41:06] Explicit valence for atom # 1 N, 4, is greater than permitted
[13:41:06] Explicit valence for atom # 6 N, 4, is greater than permitted
[13:41:06] Explicit valence for atom # 6 N, 4, is greater than permitted
[13:41:06] Explicit valence for atom # 11 N, 4, is greater than permitted
[13:41:06] Explicit valence for atom # 12 N, 4, is greater than permitted
[13:41:06] Explicit valence for atom # 5 N, 4, is greater than permitted
[13:41:06] Explicit valence for atom # 5 N, 4, is greater than permitted
[13:41:06] Explicit valence for atom # 5 N, 4, is greater than permitted
[13:41:06] Explicit valence for atom # 5 N, 4, is greater than permitted
[13:41:06] Explicit valence for atom # 5 N, 4, is greater than permitted
[13:41:06] Explicit valence for atom # 5 N, 4, is greater than permitted


O=N([O-])C1=C(CN=C1NCCSCc2ncccc2)Cc3ccccc3
c1(nc(NC(N)=[NH2])sc1)CSCCNC(=[NH]C#N)NC
Cc1nc(sc1)\[NH]=C(\N)N
s1cc(CSCCN\C(NC)=[NH]\C#N)nc1\[NH]=C(\N)N
c1c(c(ncc1)CSCCN\C(=[NH]\C#N)NCC)Br
n1c(csc1\[NH]=C(\N)N)c1ccccc1
n1c(csc1\[NH]=C(\N)N)c1cccc(c1)N
n1c(csc1\[NH]=C(\N)N)c1cccc(c1)NC(C)=O
n1c(csc1\[NH]=C(\N)N)c1cccc(c1)N\C(NC)=[NH]\C#N
s1cc(nc1\[NH]=C(\N)N)C
c1(cc(N\C(=[NH]\c2cccc(c2)CC)C)ccc1)CC




11

In [5]:
# Sanity check: kept SMILES, kept labels, parse failures, original row count.
(len(smis), len(labels), failed, bbbp_df.shape[0])

(2039, 2039, 11, 2050)

# Chemical Fingerprint

In [6]:
def rdkit_fingerprint(smi, radius=2, nbits=1024):
    """Return the Morgan fingerprint of a SMILES string as a list of bits.

    Args:
        smi: SMILES string to parse with RDKit.
        radius: Radius passed to the Morgan fingerprint call (default 2).
        nbits: Size of the bit vector, passed as ``nBits`` (default 1024).

    Returns:
        The fingerprint converted with ``ToList()`` (one entry per bit).

    Raises:
        ValueError: If RDKit cannot parse ``smi`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here with
        # the offending SMILES instead of an opaque argument error further down.
        raise ValueError("RDKit could not parse SMILES: {!r}".format(smi))
    # NOTE(review): newer RDKit releases appear to deprecate this call in favour
    # of rdFingerprintGenerator -- confirm before upgrading RDKit.
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nbits)
    return fp.ToList()

In [7]:
# One fingerprint per retained SMILES, stacked into a single array.
fps = np.array([rdkit_fingerprint(smi) for smi in smis])
fps.shape



(2039, 1024)

In [8]:
# Wrap the fingerprint array in a DataFrame and append the labels as a `bbbp` column.
fp_df = pd.DataFrame(fps).assign(bbbp=labels)
fp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,bbbp
0,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
3,0,0,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2034,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2035,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
2036,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2037,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [9]:
# Write the fingerprint + label table to CSV without the row index.
ecfp_csv_path = '../../2023-2/processed_data/BBBP_ECFP_R2B1024.csv'
fp_df.to_csv(ecfp_csv_path, index=False)

### Mordred descriptors (~1600 descriptors)
- Requires the `mordred` package: `pip install mordred`

In [9]:
from mordred import Calculator, descriptors

In [10]:
# Parse the retained SMILES (`smis`) into RDKit molecules for the descriptor calculator.
mols = list(map(Chem.MolFromSmiles, smis))



In [11]:
# Build a mordred Calculator from the `descriptors` module with ignore_3D=True,
# then evaluate it on every molecule in `mols`.
calc = Calculator(descriptors, ignore_3D=True)
# The result is collected into a DataFrame. Some cells hold an error message
# instead of a number (e.g. "multiple fragments ..." in the displayed output).
mordred_df = calc.pandas(mols)



  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  9%|▉         | 188/2039 [00:17<03:15,  9.46it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 11%|█▏        | 233/2039 [00:19<01:36, 18.66it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 18%|█▊        | 365/2039 [00:28<01:55, 14.49it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 19%|█▉        | 390/2039 [00:33<02:59,  9.16it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 22%|██▏       | 450/2039 [00:33<00:50, 31.77it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 35%|███▍      | 708/2039 [00:47<02:31,  8.76it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 39%|███▉      | 800/2039 [01:01<06:44,  3.06it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 40%|███▉      | 815/2039 [01:03<04:26,  4.59it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 40%|███▉      | 815/2039 [01:05<04:26,  4.59it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)




  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 43%|████▎     | 871/2039 [01:07<00:54, 21.31it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 44%|████▎     | 892/2039 [01:12<03:29,  5.46it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 44%|████▍     | 906/2039 [01:16<04:21,  4.33it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 59%|█████▉    | 1200/2039 [01:38<00:19, 43.08it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 77%|███████▋  | 1572/2039 [01:58<00:14, 33.15it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 93%|█████████▎| 1890/2039 [02:14<00:04, 33.43it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 2039/2039 [02:19<00:00, 14.59it/s]


In [12]:
# Display the raw descriptor table (the rendered view is truncated by pandas).
mordred_df

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,14.389425,11.808563,0,1,multiple fragments (SpAbs_A/SpAbs),multiple fragments (SpMax_A/SpMax),multiple fragments (SpDiam_A/SpDiam),multiple fragments (SpAD_A/SpAD),multiple fragments (SpMAD_A/SpMAD),multiple fragments (LogEE_A/LogEE),...,9.604475,52.719904,294.126082,7.173807,1900000792,25,92.0,103.0,divide by zero encountered in power (mZagreb1),4.305556
1,16.809162,13.974216,0,0,27.070798,2.295408,4.590816,27.070798,1.176991,4.006826,...,9.620262,56.206491,359.141884,7.182838,1492,28,106.0,114.0,9.506944,5.319444
2,20.758034,16.164169,1,1,33.465822,2.578523,5.157047,33.465822,1.287147,4.212471,...,10.513498,61.857420,361.143784,7.850952,1484,51,146.0,180.0,9.333333,5.527778
3,15.775129,12.193243,0,1,26.569835,2.281042,4.562084,26.569835,1.26523,3.937026,...,9.441849,53.624898,290.199428,6.174456,1158,24,98.0,106.0,6.194444,4.833333
4,23.095142,19.875288,1,0,36.206475,2.63295,5.121946,36.206475,1.248499,4.326727,...,10.721173,82.182989,435.065569,9.256714,2212,50,164.0,203.0,11.395833,6.097222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2034,13.141620,11.708690,0,0,20.566611,2.480613,4.961227,20.566611,1.209801,3.762705,...,9.947839,50.301872,274.950061,13.747503,460,30,90.0,108.0,7.638889,3.666667
2035,20.948304,17.680284,1,2,34.049364,2.514721,4.847757,34.049364,1.261088,4.222748,...,10.212038,78.369416,398.137239,8.125250,1993,43,142.0,169.0,10.472222,5.888889
2036,18.563116,14.624477,1,1,31.11871,2.340051,4.576553,31.11871,1.296613,4.096809,...,9.762673,71.444395,322.142976,7.670071,1597,30,120.0,135.0,6.666667,5.361111
2037,21.068022,17.811784,0,0,36.251368,2.543817,5.087633,36.251368,1.294692,4.253973,...,10.444095,63.734766,382.189257,7.077579,1854,54,146.0,179.0,10.722222,6.583333


In [14]:
# Tally descriptor columns by dtype: non-object vs. object.
# Object columns are the mixed-type ones (e.g. numbers alongside error messages
# such as "multiple fragments ..."); these are the columns to be removed.
column_dtypes = mordred_df.dtypes
n_non_object = sum(column_dtypes != 'O')
n_object = sum(column_dtypes == 'O')
n_non_object, n_object

(878, 735)

In [15]:
# Keep only the columns whose dtype is not object, i.e. drop every descriptor
# column that ended up with mixed / non-numeric content.
non_object_mask = (mordred_df.dtypes != 'O').to_numpy()
mordred_df = mordred_df.loc[:, non_object_mask]
mordred_df

Unnamed: 0,ABC,ABCGG,nAcid,nBase,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,nBridgehead,...,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb2
0,14.389425,11.808563,0,1,10,11,41,20,0,0,...,0.000000,9.604475,52.719904,294.126082,7.173807,1900000792,25,92.0,103.0,4.305556
1,16.809162,13.974216,0,0,6,6,50,23,0,0,...,0.000000,9.620262,56.206491,359.141884,7.182838,1492,28,106.0,114.0,5.319444
2,20.758034,16.164169,1,1,10,11,46,26,0,0,...,0.000000,10.513498,61.857420,361.143784,7.850952,1484,51,146.0,180.0,5.527778
3,15.775129,12.193243,0,1,6,6,47,21,0,0,...,0.000000,9.441849,53.624898,290.199428,6.174456,1158,24,98.0,106.0,4.833333
4,23.095142,19.875288,1,0,11,11,47,29,0,0,...,7.853605,10.721173,82.182989,435.065569,9.256714,2212,50,164.0,203.0,6.097222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2034,13.141620,11.708690,0,0,10,11,20,17,0,0,...,0.000000,9.947839,50.301872,274.950061,13.747503,460,30,90.0,108.0,3.666667
2035,20.948304,17.680284,1,2,9,10,49,27,0,0,...,7.627057,10.212038,78.369416,398.137239,8.125250,1993,43,142.0,169.0,5.888889
2036,18.563116,14.624477,1,1,17,17,42,24,0,0,...,6.580639,9.762673,71.444395,322.142976,7.670071,1597,30,120.0,135.0,5.361111
2037,21.068022,17.811784,0,0,16,17,54,28,0,0,...,0.000000,10.444095,63.734766,382.189257,7.077579,1854,54,146.0,179.0,6.583333


In [16]:
# Write the filtered descriptor table to CSV without the row index.
mordred_csv_path = '../../2023-2/processed_data/BBBP_mordred.csv'
mordred_df.to_csv(mordred_csv_path, index=False)