# Starting the environment

In [1]:
!pip install rdkit
!pip install chardet

Collecting rdkit
  Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl (34.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.9.6


In [2]:
import pandas as pd
import zipfile
import os
import glob

from rdkit import Chem
from rdkit.Chem import Descriptors, MolToSmiles
from rdkit.Chem.PandasTools import LoadSDF
from rdkit.rdBase import BlockLogs
from standardize import standardize_smiles

# *S. aureus* ATCC 43300

## Descriptors

In [3]:
s_aureus_data = pd.read_csv("/content/s_aureus_strain_43300.csv")
s_aureus_data = s_aureus_data.drop(columns=["split"])
s_aureus_data

Unnamed: 0,id,label,smiles
0,CHEMBL439379,1,O=C(CN1C(=O)S/C(=C/c2sc(Cl)nc2Cl)C1=O)c1ccc(Cl...
1,CHEMBL5282842,0,O=C(NNC(=O)C1CC[C@@H]2CN1C(=O)N2OS(=O)(=O)O)c1...
2,CHEMBL4103936,0,Clc1ccc2c(Nc3cccc(C4CC(c5ccccc5)=NN4c4ccccc4)c...
3,CHEMBL467948,1,CC(=Cc1oc(O)c(C)c1O)CC/C=C(/C)CC/C=C(/C)CCCc1c...
4,CHEMBL4575403,0,COC(=O)c1ccc(-c2c3[nH]c4ccc(OC)cc4c3cc[n+]2Cc2...
...,...,...,...
2873,CHEMBL2206237,1,CCCCCC(=O)NCCCCC(N)C(=O)NC(CCCN=C(N)N)C(=O)NC(...
2874,CHEMBL2030125,0,CCCOSSOCCC
2875,CHEMBL4856036,0,CC[C@H]1OC(=O)C(C)[C@@H](OC(=O)NNC(=O)Cn2cc(-c...
2876,CHEMBL161,1,CON=C(C(=O)NC1C(=O)N2C(C(=O)O)=C(CSc3nc(=O)c(=...


In [4]:
# Wanted descriptors

# MolWt
# TPSA
# HBA
# HBD
# MolLogP
# NumRotatableBonds
# RingCount

mol = [Chem.MolFromSmiles(smi) for smi in s_aureus_data["smiles"]]

s_aureus_data["MolWt"] = [Descriptors.MolWt(mol) for mol in mol]
s_aureus_data["TPSA"] = [Descriptors.TPSA(mol) for mol in mol]
s_aureus_data["HBA"] = [Descriptors.NOCount(mol) for mol in mol]
s_aureus_data["HBD"] = [Descriptors.NHOHCount(mol) for mol in mol]
s_aureus_data["MolLogP"] = [Descriptors.MolLogP(mol) for mol in mol]
s_aureus_data["NumRotatableBonds"] = [Descriptors.NumRotatableBonds(mol) for mol in mol]
s_aureus_data["RingCount"] = [Descriptors.RingCount(mol) for mol in mol]

s_aureus_data.head()

Unnamed: 0,id,label,smiles,MolWt,TPSA,HBA,HBD,MolLogP,NumRotatableBonds,RingCount
0,CHEMBL439379,1,O=C(CN1C(=O)S/C(=C/c2sc(Cl)nc2Cl)C1=O)c1ccc(Cl...,468.17,67.34,5,0,5.6759,4,3
1,CHEMBL5282842,0,O=C(NNC(=O)C1CC[C@@H]2CN1C(=O)N2OS(=O)(=O)O)c1...,593.594,182.62,15,4,-0.3509,6,6
2,CHEMBL4103936,0,Clc1ccc2c(Nc3cccc(C4CC(c5ccccc5)=NN4c4ccccc4)c...,474.995,40.52,4,1,7.9875,5,6
3,CHEMBL467948,1,CC(=Cc1oc(O)c(C)c1O)CC/C=C(/C)CC/C=C(/C)CCCc1c...,398.543,66.74,4,2,7.47142,11,2
4,CHEMBL4575403,0,COC(=O)c1ccc(-c2c3[nH]c4ccc(OC)cc4c3cc[n+]2Cc2...,437.519,55.2,5,1,5.42752,5,5


In [5]:
s_aureus_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2878 entries, 0 to 2877
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 2878 non-null   object 
 1   label              2878 non-null   int64  
 2   smiles             2878 non-null   object 
 3   MolWt              2878 non-null   float64
 4   TPSA               2878 non-null   float64
 5   HBA                2878 non-null   int64  
 6   HBD                2878 non-null   int64  
 7   MolLogP            2878 non-null   float64
 8   NumRotatableBonds  2878 non-null   int64  
 9   RingCount          2878 non-null   int64  
dtypes: float64(3), int64(5), object(2)
memory usage: 225.0+ KB


## Lipinski’s “Rule of 5”

In [6]:
# Molecular Weight <= 500 Da
# No. Hydrogen Bond Donors <= 5
# No. Hydrogen Bond Acceptors <= 10
# LogP <= 5

lipinski = []

for row in s_aureus_data.itertuples():
  MW = row.MolWt
  HBA = row.HBA
  HBD = row.HBD
  LogP = row.MolLogP

  conditions = [MW <= 500, HBA <= 10, HBD <= 5, LogP <= 5]

  if conditions.count(True) >= 3:
    lipinski.append(1)
  else:
    lipinski.append(0)

s_aureus_data["Lipinski"] = lipinski
s_aureus_data.tail()

Unnamed: 0,id,label,smiles,MolWt,TPSA,HBA,HBD,MolLogP,NumRotatableBonds,RingCount,Lipinski
2873,CHEMBL2206237,1,CCCCCC(=O)NCCCCC(N)C(=O)NC(CCCN=C(N)N)C(=O)NC(...,1270.56,508.38,29,26,0.32674,40,6,0
2874,CHEMBL2030125,0,CCCOSSOCCC,182.31,18.46,2,0,3.051,7,0,1
2875,CHEMBL4856036,0,CC[C@H]1OC(=O)C(C)[C@@H](OC(=O)NNC(=O)Cn2cc(-c...,894.464,225.37,19,4,3.5349,8,5,0
2876,CHEMBL161,1,CON=C(C(=O)NC1C(=O)N2C(C(=O)O)=C(CSc3nc(=O)c(=...,554.592,214.96,15,5,-1.6113,8,4,0
2877,CHEMBL927,1,C=CC1=C(C(=O)O)N2C(=O)C(NC(=O)C(=NO)c3csc(N)n3...,395.422,158.21,10,5,-0.1718,5,3,1


In [7]:
s_aureus_data["Lipinski"].value_counts()

Unnamed: 0_level_0,count
Lipinski,Unnamed: 1_level_1
1,1861
0,1017


In [8]:
s_aureus_data.to_csv("s_aureus_43300_descriptors_and_lipinski.csv", index=False)

# BraCoLi

## Chemical data curation

### BraCoLi V1

In [17]:
bracoli_v1 = "/content/bracoli_v1.sdf"

with BlockLogs():
  bracoli_v1 = Chem.PandasTools.LoadSDF(bracoli_v1)

bracoli_v1 = bracoli_v1.drop(columns=["Name"])
bracoli_v1 = bracoli_v1.reset_index(drop=True)
bracoli_v1

Unnamed: 0,ID,ROMol
0,BR010001,<rdkit.Chem.rdchem.Mol object at 0x7fa7479a94d0>
1,BR010002,<rdkit.Chem.rdchem.Mol object at 0x7fa7479a8cf0>
2,BR010003,<rdkit.Chem.rdchem.Mol object at 0x7fa7479aa2d0>
3,BR010004,<rdkit.Chem.rdchem.Mol object at 0x7fa7479ab290>
4,BR010005,<rdkit.Chem.rdchem.Mol object at 0x7fa7479abb50>
...,...,...
1166,BR020454,<rdkit.Chem.rdchem.Mol object at 0x7fa7478fd000>
1167,BR020455,<rdkit.Chem.rdchem.Mol object at 0x7fa7478fd070>
1168,BR020456,<rdkit.Chem.rdchem.Mol object at 0x7fa7478fd0e0>
1169,BR020457,<rdkit.Chem.rdchem.Mol object at 0x7fa7478fd150>


In [18]:
bracoli_v1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1171 entries, 0 to 1170
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      1171 non-null   object
 1   ROMol   1171 non-null   object
dtypes: object(2)
memory usage: 18.4+ KB


### BraCoLi V1.5

In [27]:
import chardet

file_path = '/content/bracoli_v1_5.sdf'

with open(file_path, 'rb') as f:
    rawdata = f.read(100000)

result = chardet.detect(rawdata)
result

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}

In [28]:
encoding_detected = result['encoding']

with open(file_path, 'r', encoding=encoding_detected) as f:
    content = f.read()

utf8_path = '/content/bracoli1_5_utf8.sdf'
with open(utf8_path, 'w', encoding='utf-8') as f:
    f.write(content)

In [29]:
with BlockLogs():
  bracoli_v1_5 = LoadSDF(utf8_path)

bracoli_v1_5 = bracoli_v1_5["ROMol"]
bracoli_v1_5 = bracoli_v1_5.reset_index(drop=True)
bracoli_v1_5

Unnamed: 0,ROMol
0,<rdkit.Chem.rdchem.Mol object at 0x7fa7475dfd10>
1,<rdkit.Chem.rdchem.Mol object at 0x7fa7475df300>
2,<rdkit.Chem.rdchem.Mol object at 0x7fa7475dc820>
3,<rdkit.Chem.rdchem.Mol object at 0x7fa7475de650>
4,<rdkit.Chem.rdchem.Mol object at 0x7fa7475dd770>
...,...
1073,<rdkit.Chem.rdchem.Mol object at 0x7fa7475fe7a0>
1074,<rdkit.Chem.rdchem.Mol object at 0x7fa7475fe810>
1075,<rdkit.Chem.rdchem.Mol object at 0x7fa7475fe880>
1076,<rdkit.Chem.rdchem.Mol object at 0x7fa7475fe8f0>


In [30]:
ID = ["V1_5_" + str(i) for i in range(len(bracoli_v1_5))]

bracoli_v1_5 = pd.DataFrame(bracoli_v1_5, columns=["ROMol"])
bracoli_v1_5["ID"] = ID
bracoli_v1_5

Unnamed: 0,ROMol,ID
0,<rdkit.Chem.rdchem.Mol object at 0x7fa7475dfd10>,V1_5_0
1,<rdkit.Chem.rdchem.Mol object at 0x7fa7475df300>,V1_5_1
2,<rdkit.Chem.rdchem.Mol object at 0x7fa7475dc820>,V1_5_2
3,<rdkit.Chem.rdchem.Mol object at 0x7fa7475de650>,V1_5_3
4,<rdkit.Chem.rdchem.Mol object at 0x7fa7475dd770>,V1_5_4
...,...,...
1073,<rdkit.Chem.rdchem.Mol object at 0x7fa7475fe7a0>,V1_5_1073
1074,<rdkit.Chem.rdchem.Mol object at 0x7fa7475fe810>,V1_5_1074
1075,<rdkit.Chem.rdchem.Mol object at 0x7fa7475fe880>,V1_5_1075
1076,<rdkit.Chem.rdchem.Mol object at 0x7fa7475fe8f0>,V1_5_1076


In [31]:
bracoli_v1_5 = bracoli_v1_5.iloc[:, [1,0]]
bracoli_v1_5

Unnamed: 0,ID,ROMol
0,V1_5_0,<rdkit.Chem.rdchem.Mol object at 0x7fa7475dfd10>
1,V1_5_1,<rdkit.Chem.rdchem.Mol object at 0x7fa7475df300>
2,V1_5_2,<rdkit.Chem.rdchem.Mol object at 0x7fa7475dc820>
3,V1_5_3,<rdkit.Chem.rdchem.Mol object at 0x7fa7475de650>
4,V1_5_4,<rdkit.Chem.rdchem.Mol object at 0x7fa7475dd770>
...,...,...
1073,V1_5_1073,<rdkit.Chem.rdchem.Mol object at 0x7fa7475fe7a0>
1074,V1_5_1074,<rdkit.Chem.rdchem.Mol object at 0x7fa7475fe810>
1075,V1_5_1075,<rdkit.Chem.rdchem.Mol object at 0x7fa7475fe880>
1076,V1_5_1076,<rdkit.Chem.rdchem.Mol object at 0x7fa7475fe8f0>


### BraCoLi All Versions

In [32]:
bracoli_full = pd.concat([bracoli_v1, bracoli_v1_5], axis=0, ignore_index=True)
bracoli_full

Unnamed: 0,ID,ROMol
0,BR010001,<rdkit.Chem.rdchem.Mol object at 0x7fa7479a94d0>
1,BR010002,<rdkit.Chem.rdchem.Mol object at 0x7fa7479a8cf0>
2,BR010003,<rdkit.Chem.rdchem.Mol object at 0x7fa7479aa2d0>
3,BR010004,<rdkit.Chem.rdchem.Mol object at 0x7fa7479ab290>
4,BR010005,<rdkit.Chem.rdchem.Mol object at 0x7fa7479abb50>
...,...,...
2244,V1_5_1073,<rdkit.Chem.rdchem.Mol object at 0x7fa7475fe7a0>
2245,V1_5_1074,<rdkit.Chem.rdchem.Mol object at 0x7fa7475fe810>
2246,V1_5_1075,<rdkit.Chem.rdchem.Mol object at 0x7fa7475fe880>
2247,V1_5_1076,<rdkit.Chem.rdchem.Mol object at 0x7fa7475fe8f0>


In [33]:
bracoli_full["smiles"] = [MolToSmiles(mol) for mol in bracoli_full["ROMol"]]
bracoli_full

Unnamed: 0,ID,ROMol,smiles
0,BR010001,<rdkit.Chem.rdchem.Mol object at 0x7fa7479a94d0>,[NH3+][C@@H](CNC(=O)/C=C/c1ccccc1)C(=O)[O-]
1,BR010002,<rdkit.Chem.rdchem.Mol object at 0x7fa7479a8cf0>,COC(=O)c1ccc(/C=C/C(=O)NC[C@H]([NH3+])C(=O)[O-...
2,BR010003,<rdkit.Chem.rdchem.Mol object at 0x7fa7479aa2d0>,N#Cc1ccc(/C=C/C(=O)NC[C@H]([NH3+])C(=O)[O-])cc1
3,BR010004,<rdkit.Chem.rdchem.Mol object at 0x7fa7479ab290>,N#Cc1ccccc1/C=C/C(=O)NC[C@H]([NH3+])C(=O)[O-]
4,BR010005,<rdkit.Chem.rdchem.Mol object at 0x7fa7479abb50>,[NH3+][C@@H](CNC(=O)/C=C/c1ccc(Cl)cc1)C(=O)[O-]
...,...,...,...
2244,V1_5_1073,<rdkit.Chem.rdchem.Mol object at 0x7fa7475fe7a0>,O=C(N/N=C/c1cccc(O)c1)c1ccc(Cl)cc1
2245,V1_5_1074,<rdkit.Chem.rdchem.Mol object at 0x7fa7475fe810>,COc1cccc2cc(C(=O)O)c(=O)oc12
2246,V1_5_1075,<rdkit.Chem.rdchem.Mol object at 0x7fa7475fe880>,COc1cccc2cc(C(=O)c3ccc(N)cc3)c(=O)oc12
2247,V1_5_1076,<rdkit.Chem.rdchem.Mol object at 0x7fa7475fe8f0>,COc1cccc2cc(C(=O)c3ccccc3)c(=O)oc12


In [34]:
bracoli_full = standardize_smiles(bracoli_full)
bracoli_full.info()

SMILES standardization : 100%|██████████████████████████████████████████████████████| 2249/2249 [03:32<00:00, 10.58it/s]


<class 'pandas.core.frame.DataFrame'>
Index: 2221 entries, 0 to 2248
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   2221 non-null   object
 1   ROMol                2221 non-null   object
 2   smiles               2221 non-null   object
 3   smiles_standardized  2221 non-null   object
 4   inchi                2221 non-null   object
 5   mol                  2221 non-null   object
dtypes: object(6)
memory usage: 121.5+ KB


In [35]:
bracoli_full = bracoli_full[["ID", "smiles_standardized", "inchi", "mol"]]
bracoli_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2221 entries, 0 to 2248
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   2221 non-null   object
 1   smiles_standardized  2221 non-null   object
 2   inchi                2221 non-null   object
 3   mol                  2221 non-null   object
dtypes: object(4)
memory usage: 86.8+ KB


In [36]:
bracoli_full.nunique()

Unnamed: 0,0
ID,2221
smiles_standardized,1969
inchi,1969
mol,2221


In [37]:
df = bracoli_full
inchi_col = "inchi"

duplicated_data = pd.DataFrame()
unique_data = pd.DataFrame()
hash_list = list(set(df[inchi_col]))

for hashs in hash_list:
  selection = df[df[inchi_col] == hashs]

  if len(selection) > 1:  # Duplicated
    duplicated_data = pd.concat([duplicated_data, selection], axis=0)
  else: # Unique data
    unique_data = pd.concat([unique_data, selection], axis=0)

In [38]:
unique_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1779 entries, 2226 to 709
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   1779 non-null   object
 1   smiles_standardized  1779 non-null   object
 2   inchi                1779 non-null   object
 3   mol                  1779 non-null   object
dtypes: object(4)
memory usage: 69.5+ KB


In [39]:
unique_data.nunique()

Unnamed: 0,0
ID,1779
smiles_standardized,1779
inchi,1779
mol,1779


In [40]:
duplicated_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 442 entries, 1226 to 2012
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   442 non-null    object
 1   smiles_standardized  442 non-null    object
 2   inchi                442 non-null    object
 3   mol                  442 non-null    object
dtypes: object(4)
memory usage: 17.3+ KB


In [41]:
duplicated_data.nunique()

Unnamed: 0,0
ID,442
smiles_standardized,190
inchi,190
mol,442


In [42]:
inchi_list = list(set(duplicated_data["inchi"]))

for inchi in inchi_list:
  selection = duplicated_data[duplicated_data["inchi"] == inchi]

  index_to_drop = list(selection.index)[1:]
  duplicated_data.drop(index=index_to_drop, inplace=True)

In [43]:
duplicated_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 190 entries, 1226 to 1981
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   190 non-null    object
 1   smiles_standardized  190 non-null    object
 2   inchi                190 non-null    object
 3   mol                  190 non-null    object
dtypes: object(4)
memory usage: 7.4+ KB


In [44]:
duplicated_data.nunique()

Unnamed: 0,0
ID,190
smiles_standardized,190
inchi,190
mol,190


In [45]:
bracoli_full_unique = pd.concat([unique_data, duplicated_data], axis=0, ignore_index=True)
bracoli_full_unique.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   1969 non-null   object
 1   smiles_standardized  1969 non-null   object
 2   inchi                1969 non-null   object
 3   mol                  1969 non-null   object
dtypes: object(4)
memory usage: 61.7+ KB


In [46]:
bracoli_full_unique

Unnamed: 0,ID,smiles_standardized,inchi,mol
0,V1_5_1055,C=CCc1cc(C=NN=C(N)N)c(O)c(OC)c1,POEPNPUSZBOFAY-UHFFFAOYSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7474007b0>
1,BR010359,OC[C@H]1O[C@H](O)[C@H](O)[C@@H](OCCCCCCOCc2ccc...,GTJOSKVORPAFNG-QQXKLLMISA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7476b98c0>
2,BR020309,CC1(C)O[C@H]2O[C@@H]3CN(C(=O)/C=C/c4ccccc4)C(C...,GHDGFLXQUKJLOP-BKXKVBGLSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7476d3d10>
3,V1_5_866,C/C=C/c1cc(OC)c2c(c1)[C@H](C)[C@@H](c1ccc(O)c(...,ITDOFWOJEDZPCF-FNINDUDTSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7474bb3e0>
4,V1_5_334,CO[C@H]1O[C@H](COC/C=C/c2ccccc2)[C@H](NC(=O)c2...,OJZQKANRUHKNBQ-NHQSTMNZSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa747495230>
...,...,...,...,...
1964,V1_5_26,C[C@H]1O[C@H](OC[C@H]2O[C@@H](Oc3c(-c4ccc(O)c(...,IKGXIBQEEMLURG-DDRMSXOHSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa747640b30>
1965,BR010401,COc1cc(CCC(=O)NCCO)ccc1O[C@@H]1O[C@H](CO)[C@H]...,AZZRGFYEWXDSAT-PLLDYVMSSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7476b93f0>
1966,BR010436,CC(C)(C)OC(=O)N1[C@H](CO)COC1(C)C,DWFOEHLGMZJBAA-MRVPVSSYSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7476bb4c0>
1967,V1_5_811,CCCc1cc(OC)c(O)cc1NS(=O)(=O)c1ccc(N)cc1,ZICJORDMNCAZLK-UHFFFAOYSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7474b9bd0>


## Descriptors

In [47]:
# Wanted descriptors

# MolWt
# TPSA
# HBA
# HBD
# MolLogP
# NumRotatableBonds
# RingCount

mol = [mol for mol in bracoli_full_unique["mol"]]

bracoli_full_unique["MolWt"] = [Descriptors.MolWt(mol) for mol in mol]
bracoli_full_unique["TPSA"] = [Descriptors.TPSA(mol) for mol in mol]
bracoli_full_unique["HBA"] = [Descriptors.NOCount(mol) for mol in mol]
bracoli_full_unique["HBD"] = [Descriptors.NHOHCount(mol) for mol in mol]
bracoli_full_unique["MolLogP"] = [Descriptors.MolLogP(mol) for mol in mol]
bracoli_full_unique["NumRotatableBonds"] = [Descriptors.NumRotatableBonds(mol) for mol in mol]
bracoli_full_unique["RingCount"] = [Descriptors.RingCount(mol) for mol in mol]

bracoli_full_unique

Unnamed: 0,ID,smiles_standardized,inchi,mol,MolWt,TPSA,HBA,HBD,MolLogP,NumRotatableBonds,RingCount
0,V1_5_1055,C=CCc1cc(C=NN=C(N)N)c(O)c(OC)c1,POEPNPUSZBOFAY-UHFFFAOYSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7474007b0>,248.286,106.22,6,5,0.7366,5,1
1,BR010359,OC[C@H]1O[C@H](O)[C@H](O)[C@@H](OCCCCCCOCc2ccc...,GTJOSKVORPAFNG-QQXKLLMISA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7476b98c0>,370.442,108.61,7,4,0.5801,11,2
2,BR020309,CC1(C)O[C@H]2O[C@@H]3CN(C(=O)/C=C/c4ccccc4)C(C...,GHDGFLXQUKJLOP-BKXKVBGLSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7476d3d10>,359.422,57.23,6,0,2.5397,2,4
3,V1_5_866,C/C=C/c1cc(OC)c2c(c1)[C@H](C)[C@@H](c1ccc(O)c(...,ITDOFWOJEDZPCF-FNINDUDTSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7474bb3e0>,326.392,47.92,4,1,4.6797,4,3
4,V1_5_334,CO[C@H]1O[C@H](COC/C=C/c2ccccc2)[C@H](NC(=O)c2...,OJZQKANRUHKNBQ-NHQSTMNZSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa747495230>,593.720,75.25,7,1,6.0571,14,5
...,...,...,...,...,...,...,...,...,...,...,...
1964,V1_5_26,C[C@H]1O[C@H](OC[C@H]2O[C@@H](Oc3c(-c4ccc(O)c(...,IKGXIBQEEMLURG-DDRMSXOHSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa747640b30>,610.521,269.43,16,10,-1.6871,6,5
1965,BR010401,COc1cc(CCC(=O)NCCO)ccc1O[C@@H]1O[C@H](CO)[C@H]...,AZZRGFYEWXDSAT-PLLDYVMSSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7476b93f0>,401.412,157.94,10,6,-2.0850,9,2
1966,BR010436,CC(C)(C)OC(=O)N1[C@H](CO)COC1(C)C,DWFOEHLGMZJBAA-MRVPVSSYSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7476bb4c0>,231.292,59.00,5,1,1.3507,1,1
1967,V1_5_811,CCCc1cc(OC)c(O)cc1NS(=O)(=O)c1ccc(N)cc1,ZICJORDMNCAZLK-UHFFFAOYSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7474b9bd0>,336.413,101.65,6,4,2.7363,6,2


In [48]:
bracoli_full_unique = bracoli_full_unique.drop(columns=["inchi", "mol"])
bracoli_full_unique.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   1969 non-null   object 
 1   smiles_standardized  1969 non-null   object 
 2   MolWt                1969 non-null   float64
 3   TPSA                 1969 non-null   float64
 4   HBA                  1969 non-null   int64  
 5   HBD                  1969 non-null   int64  
 6   MolLogP              1969 non-null   float64
 7   NumRotatableBonds    1969 non-null   int64  
 8   RingCount            1969 non-null   int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 138.6+ KB


In [49]:
rename = {
    "smiles_standardized": "smiles",
    "ID": "id"
    }

bracoli_full_unique = bracoli_full_unique.rename(columns=rename)
bracoli_full_unique.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1969 non-null   object 
 1   smiles             1969 non-null   object 
 2   MolWt              1969 non-null   float64
 3   TPSA               1969 non-null   float64
 4   HBA                1969 non-null   int64  
 5   HBD                1969 non-null   int64  
 6   MolLogP            1969 non-null   float64
 7   NumRotatableBonds  1969 non-null   int64  
 8   RingCount          1969 non-null   int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 138.6+ KB


In [50]:
bracoli_full_unique.to_csv("bracoli_descriptors.csv", index=False)

# NuBBE

## Chemical data curation

In [63]:
zip_path = "/content/nubbe_files_mol2.zip"
extract_dir = "/content/mol2_files"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

In [64]:
mol2_files = glob.glob(os.path.join(extract_dir, "*.mol2"))

data = []

with BlockLogs():
  for file in mol2_files:
      with open(file, 'r') as f:
          mol_block = f.read()
          mol = Chem.MolFromMol2Block(mol_block, sanitize=True, removeHs=False, cleanupSubstructures=False)
          if mol is not None:
              id = os.path.basename(file)
              smiles = MolToSmiles(mol)
              data.append({
                  "id": id,
                  "smiles": smiles,
                  "mol_original": mol
              })

NuBBE = pd.DataFrame(data)
NuBBE # Sanitization removes broken mol structures

Unnamed: 0,id,smiles,mol_original
0,NuBBE_555_obabel_3D.mol2,[H]O[C@]([H])(C([H])([H])/C([H])=C(\[H])[C@]1(...,<rdkit.Chem.rdchem.Mol object at 0x7fa7468360a0>
1,NuBBE_1782_obabel_3D.mol2,[H]C1=C(OC([H])([H])[H])C([H])([H])[C@]([H])(C...,<rdkit.Chem.rdchem.Mol object at 0x7fa7468351c0>
2,NuBBE_2146_obabel_3D.mol2,[H]OC(=O)C([H])([H])C([H])([H])C([H])([H])C([H...,<rdkit.Chem.rdchem.Mol object at 0x7fa746ba2b90>
3,NuBBE_1993_obabel_3D.mol2,[H]C([H])=C([H])C([H])([H])Oc1c(OC([H])([H])[H...,<rdkit.Chem.rdchem.Mol object at 0x7fa746975f50>
4,NuBBE_1820_obabel_3D.mol2,[H]c1c(OC(=O)C([H])([H])[H])c([H])c2c([H])c(C(...,<rdkit.Chem.rdchem.Mol object at 0x7fa7468bf840>
...,...,...,...
1770,NuBBE_1464_obabel_3D.mol2,[H]O[C@@]([H])(C(=O)OC([H])([H])[H])[C@@]1([H]...,<rdkit.Chem.rdchem.Mol object at 0x7fa7466ac0b0>
1771,NuBBE_425_obabel_3D.mol2,[H]N=C(N(C([H])([H])[H])C([H])([H])/C([H])=C(/...,<rdkit.Chem.rdchem.Mol object at 0x7fa7466ac120>
1772,NuBBE_2432_obabel_3D.mol2,[H]c1c([H])c([H])c([C@@]2([H])Oc3c([H])c(OC([H...,<rdkit.Chem.rdchem.Mol object at 0x7fa7466ac190>
1773,NuBBE_2234_obabel_3D.mol2,[H]C([H])=C1C([H])([H])[C@@]23C([H])([H])C([H]...,<rdkit.Chem.rdchem.Mol object at 0x7fa7466ac200>


In [65]:
NuBBE = standardize_smiles(NuBBE)
NuBBE.info()

SMILES standardization : 100%|██████████████████████████████████████████████████████| 1775/1775 [01:18<00:00, 22.49it/s]


<class 'pandas.core.frame.DataFrame'>
Index: 1760 entries, 0 to 1774
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   1760 non-null   object
 1   smiles               1760 non-null   object
 2   mol_original         1760 non-null   object
 3   smiles_standardized  1760 non-null   object
 4   inchi                1760 non-null   object
 5   mol                  1760 non-null   object
dtypes: object(6)
memory usage: 96.2+ KB


In [66]:
NuBBE = NuBBE.drop(columns=["smiles", "mol_original"])
NuBBE.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1760 entries, 0 to 1774
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   1760 non-null   object
 1   smiles_standardized  1760 non-null   object
 2   inchi                1760 non-null   object
 3   mol                  1760 non-null   object
dtypes: object(4)
memory usage: 68.8+ KB


In [67]:
NuBBE.nunique()

Unnamed: 0,0
id,1760
smiles_standardized,1721
inchi,1721
mol,1760


In [68]:
df = NuBBE
inchi_col = "inchi"

duplicated_data = pd.DataFrame()
unique_data = pd.DataFrame()
hash_list = list(set(df[inchi_col]))

for hashs in hash_list:
  selection = df[df[inchi_col] == hashs]

  if len(selection) > 1:  # Duplicated
    duplicated_data = pd.concat([duplicated_data, selection], axis=0)
  else: # Unique data
    unique_data = pd.concat([unique_data, selection], axis=0)

In [69]:
unique_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1684 entries, 749 to 1278
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   1684 non-null   object
 1   smiles_standardized  1684 non-null   object
 2   inchi                1684 non-null   object
 3   mol                  1684 non-null   object
dtypes: object(4)
memory usage: 65.8+ KB


In [70]:
unique_data.nunique()

Unnamed: 0,0
id,1684
smiles_standardized,1684
inchi,1684
mol,1684


In [71]:
duplicated_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 76 entries, 926 to 1286
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   76 non-null     object
 1   smiles_standardized  76 non-null     object
 2   inchi                76 non-null     object
 3   mol                  76 non-null     object
dtypes: object(4)
memory usage: 3.0+ KB


In [72]:
duplicated_data.nunique()

Unnamed: 0,0
id,76
smiles_standardized,37
inchi,37
mol,76


In [73]:
inchi_list = list(set(duplicated_data["inchi"]))

for inchi in inchi_list:
  selection = duplicated_data[duplicated_data["inchi"] == inchi]

  index_to_drop = list(selection.index)[1:]
  duplicated_data.drop(index=index_to_drop, inplace=True)

In [74]:
duplicated_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37 entries, 926 to 843
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   37 non-null     object
 1   smiles_standardized  37 non-null     object
 2   inchi                37 non-null     object
 3   mol                  37 non-null     object
dtypes: object(4)
memory usage: 1.4+ KB


In [75]:
duplicated_data.nunique()

Unnamed: 0,0
id,37
smiles_standardized,37
inchi,37
mol,37


In [76]:
NuBBE_unique = pd.concat([unique_data, duplicated_data], axis=0, ignore_index=True)
NuBBE_unique.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1721 entries, 0 to 1720
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   1721 non-null   object
 1   smiles_standardized  1721 non-null   object
 2   inchi                1721 non-null   object
 3   mol                  1721 non-null   object
dtypes: object(4)
memory usage: 53.9+ KB


In [77]:
NuBBE_unique

Unnamed: 0,id,smiles_standardized,inchi,mol
0,NuBBE_1598_obabel_3D.mol2,CCC/C=C/c1ccc2ccccc2n1,ZAPKXDJGBCUUKL-XBXARRHUSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7469d8900>
1,NuBBE_366_obabel_3D.mol2,C[C@H]1[C@H](C)CC[C@]2(CO)CC[C@]3(C)C(=CC[C@@H...,XUARCIYIVXVTAE-FADOWZOZSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7469db5a0>
2,NuBBE_1989_obabel_3D.mol2,C=C(CC[C@@H](C)[C@H]1CC[C@@H]2[C@@H]3CC[C@@H]4...,BTBRBCIRTOKWDH-HQWQRDTBSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa74699f8b0>
3,NuBBE_1089_obabel_3D.mol2,COC(=O)c1ccc(/C(=C\c2ccc(OC)c(OC)c2)C(=O)O)cc1O,FIZFMQMREPLBIE-RIYZIHGNSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7466ecb30>
4,NuBBE_550_obabel_3D.mol2,COc1ccc([C@H](O)[C@@H]2NC=Cc3cc4c(cc32)OCO4)cc1,WGVFEYQMUJAWIJ-MSOLQXFVSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa746a04eb0>
...,...,...,...,...
1716,NuBBE_330_obabel_3D.mol2,COC(=O)C[C@@H]1C(C(=O)OC)=CO[C@@H](O[C@@H]2O[C...,GHGFDNVEGGUXDM-XDZSTEIDSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7469d93f0>
1717,NuBBE_829_obabel_3D.mol2,COC(=O)[C@]1(C)CCC[C@@H]2[C@H]1C[C@@H]1OC(=O)[...,ZGRXZVQQLDOVAT-OBLRMQJZSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa746836c70>
1718,NuBBE_2320_obabel_3D.mol2,C=CC[C@]12CC(OC)C(=O)C(OC)=C1O[C@@H](c1ccc3c(c...,SSPDVRMNHFFRCE-XNTNJFEASA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7466b7c30>
1719,NuBBE_188_obabel_3D.mol2,COc1cc(C2c3cc(O)c(OC)cc3CC(C)C2C)ccc1O,TZAAYUCUPIYQBR-UHFFFAOYSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7468bf8b0>


## Descriptors

In [78]:
# Wanted descriptors

# MolWt
# TPSA
# HBA
# HBD
# MolLogP
# NumRotatableBonds
# RingCount

mol = [mol for mol in NuBBE_unique["mol"]]

NuBBE_unique["MolWt"] = [Descriptors.MolWt(mol) for mol in mol]
NuBBE_unique["TPSA"] = [Descriptors.TPSA(mol) for mol in mol]
NuBBE_unique["HBA"] = [Descriptors.NOCount(mol) for mol in mol]
NuBBE_unique["HBD"] = [Descriptors.NHOHCount(mol) for mol in mol]
NuBBE_unique["MolLogP"] = [Descriptors.MolLogP(mol) for mol in mol]
NuBBE_unique["NumRotatableBonds"] = [Descriptors.NumRotatableBonds(mol) for mol in mol]
NuBBE_unique["RingCount"] = [Descriptors.RingCount(mol) for mol in mol]

NuBBE_unique

Unnamed: 0,id,smiles_standardized,inchi,mol,MolWt,TPSA,HBA,HBD,MolLogP,NumRotatableBonds,RingCount
0,NuBBE_1598_obabel_3D.mol2,CCC/C=C/c1ccc2ccccc2n1,ZAPKXDJGBCUUKL-XBXARRHUSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7469d8900>,197.281,12.89,1,0,4.0481,3,2
1,NuBBE_366_obabel_3D.mol2,C[C@H]1[C@H](C)CC[C@]2(CO)CC[C@]3(C)C(=CC[C@@H...,XUARCIYIVXVTAE-FADOWZOZSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7469db5a0>,442.728,40.46,2,2,6.9972,1,5
2,NuBBE_1989_obabel_3D.mol2,C=C(CC[C@@H](C)[C@H]1CC[C@@H]2[C@@H]3CC[C@@H]4...,BTBRBCIRTOKWDH-HQWQRDTBSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa74699f8b0>,414.718,20.23,1,1,7.8807,5,4
3,NuBBE_1089_obabel_3D.mol2,COC(=O)c1ccc(/C(=C\c2ccc(OC)c(OC)c2)C(=O)O)cc1O,FIZFMQMREPLBIE-RIYZIHGNSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7466ecb30>,358.346,102.29,7,2,2.8212,6,2
4,NuBBE_550_obabel_3D.mol2,COc1ccc([C@H](O)[C@@H]2NC=Cc3cc4c(cc32)OCO4)cc1,WGVFEYQMUJAWIJ-MSOLQXFVSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa746a04eb0>,311.337,59.95,5,2,2.7725,3,4
...,...,...,...,...,...,...,...,...,...,...,...
1716,NuBBE_330_obabel_3D.mol2,COC(=O)C[C@@H]1C(C(=O)OC)=CO[C@@H](O[C@@H]2O[C...,GHGFDNVEGGUXDM-XDZSTEIDSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7469d93f0>,478.447,187.51,13,4,-2.0368,8,2
1717,NuBBE_829_obabel_3D.mol2,COC(=O)[C@]1(C)CCC[C@@H]2[C@H]1C[C@@H]1OC(=O)[...,ZGRXZVQQLDOVAT-OBLRMQJZSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa746836c70>,388.416,92.04,7,0,2.7949,2,5
1718,NuBBE_2320_obabel_3D.mol2,C=CC[C@]12CC(OC)C(=O)C(OC)=C1O[C@@H](c1ccc3c(c...,SSPDVRMNHFFRCE-XNTNJFEASA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7466b7c30>,372.417,63.22,6,0,3.5310,5,4
1719,NuBBE_188_obabel_3D.mol2,COc1cc(C2c3cc(O)c(OC)cc3CC(C)C2C)ccc1O,TZAAYUCUPIYQBR-UHFFFAOYSA-N,<rdkit.Chem.rdchem.Mol object at 0x7fa7468bf8b0>,328.408,58.92,4,2,4.0752,3,3


In [79]:
NuBBE_unique = NuBBE_unique.drop(columns=["inchi", "mol"])
NuBBE_unique.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1721 entries, 0 to 1720
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   1721 non-null   object 
 1   smiles_standardized  1721 non-null   object 
 2   MolWt                1721 non-null   float64
 3   TPSA                 1721 non-null   float64
 4   HBA                  1721 non-null   int64  
 5   HBD                  1721 non-null   int64  
 6   MolLogP              1721 non-null   float64
 7   NumRotatableBonds    1721 non-null   int64  
 8   RingCount            1721 non-null   int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 121.1+ KB


In [80]:
NuBBE_unique = NuBBE_unique.rename(columns={"smiles_standardized": "smiles"})
NuBBE_unique.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1721 entries, 0 to 1720
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1721 non-null   object 
 1   smiles             1721 non-null   object 
 2   MolWt              1721 non-null   float64
 3   TPSA               1721 non-null   float64
 4   HBA                1721 non-null   int64  
 5   HBD                1721 non-null   int64  
 6   MolLogP            1721 non-null   float64
 7   NumRotatableBonds  1721 non-null   int64  
 8   RingCount          1721 non-null   int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 121.1+ KB


In [81]:
NuBBE_unique.to_csv("nubbe_descriptors.csv", index=False)