# Setting up the environment

In [1]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl (34.3 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 34.3/34.3 MB 15.5 MB/s eta 0:00:00
Installing collected packages: rdkit
Successfully installed rdkit-2024.9.6


In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

# *S. aureus* ATCC 43300

## Descriptors

In [3]:
# Read the split file and keep only the identifier and structure columns;
# "label" and "split" are dropped and not used below.
s_aureus_data = (
    pd.read_csv("/content/s_aureus_43300_MASSA_split_final.csv")
    .drop(columns=["label", "split"])
)
s_aureus_data

Unnamed: 0,id,smiles
0,CHEMBL439379,O=C(CN1C(=O)S/C(=C/c2sc(Cl)nc2Cl)C1=O)c1ccc(Cl...
1,CHEMBL5282842,O=C(NNC(=O)C1CC[C@@H]2CN1C(=O)N2OS(=O)(=O)O)c1...
2,CHEMBL4103936,Clc1ccc2c(Nc3cccc(C4CC(c5ccccc5)=NN4c4ccccc4)c...
3,CHEMBL467948,CC(=Cc1oc(O)c(C)c1O)CC/C=C(/C)CC/C=C(/C)CCCc1c...
4,CHEMBL4575403,COC(=O)c1ccc(-c2c3[nH]c4ccc(OC)cc4c3cc[n+]2Cc2...
...,...,...
2873,CHEMBL2206237,CCCCCC(=O)NCCCCC(N)C(=O)NC(CCCN=C(N)N)C(=O)NC(...
2874,CHEMBL2030125,CCCOSSOCCC
2875,CHEMBL4856036,CC[C@H]1OC(=O)C(C)[C@@H](OC(=O)NNC(=O)Cn2cc(-c...
2876,CHEMBL161,CON=C(C(=O)NC1C(=O)N2C(C(=O)O)=C(CSc3nc(=O)c(=...


In [4]:
# Descriptors added to the table, one column per entry (column name -> RDKit function).
# HBA is counted as the number of N and O atoms (Descriptors.NOCount) and
# HBD as the number of N-H and O-H groups (Descriptors.NHOHCount).
DESCRIPTOR_FUNCS = {
    "MolWt": Descriptors.MolWt,
    "TPSA": Descriptors.TPSA,
    "HBA": Descriptors.NOCount,
    "HBD": Descriptors.NHOHCount,
    "MolLogP": Descriptors.MolLogP,
    "NumRotatableBonds": Descriptors.NumRotatableBonds,
    "RingCount": Descriptors.RingCount,
}

# Parse every SMILES string once and reuse the molecule objects for all descriptors.
mols = [Chem.MolFromSmiles(smi) for smi in s_aureus_data["smiles"]]

# MolFromSmiles returns None (it does not raise) when a SMILES cannot be parsed.
# Stop here with the offending ids instead of failing inside a descriptor call.
unparsed_ids = [
    mol_id for mol_id, parsed in zip(s_aureus_data["id"], mols) if parsed is None
]
if unparsed_ids:
    raise ValueError(
        f"RDKit could not parse the SMILES of {len(unparsed_ids)} compound(s): "
        f"{unparsed_ids[:10]}"
    )

# One pass per descriptor; columns are appended in the order listed above.
for column, descriptor in DESCRIPTOR_FUNCS.items():
    s_aureus_data[column] = [descriptor(parsed) for parsed in mols]

s_aureus_data.head()

Unnamed: 0,id,smiles,MolWt,TPSA,HBA,HBD,MolLogP,NumRotatableBonds,RingCount
0,CHEMBL439379,O=C(CN1C(=O)S/C(=C/c2sc(Cl)nc2Cl)C1=O)c1ccc(Cl...,468.17,67.34,5,0,5.6759,4,3
1,CHEMBL5282842,O=C(NNC(=O)C1CC[C@@H]2CN1C(=O)N2OS(=O)(=O)O)c1...,593.594,182.62,15,4,-0.3509,6,6
2,CHEMBL4103936,Clc1ccc2c(Nc3cccc(C4CC(c5ccccc5)=NN4c4ccccc4)c...,474.995,40.52,4,1,7.9875,5,6
3,CHEMBL467948,CC(=Cc1oc(O)c(C)c1O)CC/C=C(/C)CC/C=C(/C)CCCc1c...,398.543,66.74,4,2,7.47142,11,2
4,CHEMBL4575403,COC(=O)c1ccc(-c2c3[nH]c4ccc(OC)cc4c3cc[n+]2Cc2...,437.519,55.2,5,1,5.42752,5,5


In [5]:
# Print the column dtypes and non-null counts of the table, including the descriptor columns.
s_aureus_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2878 entries, 0 to 2877
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 2878 non-null   object 
 1   smiles             2878 non-null   object 
 2   MolWt              2878 non-null   float64
 3   TPSA               2878 non-null   float64
 4   HBA                2878 non-null   int64  
 5   HBD                2878 non-null   int64  
 6   MolLogP            2878 non-null   float64
 7   NumRotatableBonds  2878 non-null   int64  
 8   RingCount          2878 non-null   int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 202.5+ KB


## Lipinski’s “Rule of 5”

In [6]:
# Lipinski's "Rule of 5" criteria (all bounds inclusive):
#   Molecular Weight             <= 500 Da
#   No. Hydrogen Bond Donors     <= 5
#   No. Hydrogen Bond Acceptors  <= 10
#   LogP                         <= 5
#
# A compound is flagged 1 only when it meets all four criteria at once,
# otherwise 0. The comparisons are vectorized over whole columns instead of
# looping over rows; a missing value compares as False and therefore yields 0.
passes_all_rules = (
    s_aureus_data["MolWt"].le(500)
    & s_aureus_data["HBA"].le(10)
    & s_aureus_data["HBD"].le(5)
    & s_aureus_data["MolLogP"].le(5)
)

# Store the flag as an integer (1/0) rather than a boolean.
s_aureus_data["Lipinski"] = passes_all_rules.astype("int64")
s_aureus_data.head()

Unnamed: 0,id,smiles,MolWt,TPSA,HBA,HBD,MolLogP,NumRotatableBonds,RingCount,Lipinski
0,CHEMBL439379,O=C(CN1C(=O)S/C(=C/c2sc(Cl)nc2Cl)C1=O)c1ccc(Cl...,468.17,67.34,5,0,5.6759,4,3,0
1,CHEMBL5282842,O=C(NNC(=O)C1CC[C@@H]2CN1C(=O)N2OS(=O)(=O)O)c1...,593.594,182.62,15,4,-0.3509,6,6,0
2,CHEMBL4103936,Clc1ccc2c(Nc3cccc(C4CC(c5ccccc5)=NN4c4ccccc4)c...,474.995,40.52,4,1,7.9875,5,6,0
3,CHEMBL467948,CC(=Cc1oc(O)c(C)c1O)CC/C=C(/C)CC/C=C(/C)CCCc1c...,398.543,66.74,4,2,7.47142,11,2,0
4,CHEMBL4575403,COC(=O)c1ccc(-c2c3[nH]c4ccc(OC)cc4c3cc[n+]2Cc2...,437.519,55.2,5,1,5.42752,5,5,0


In [8]:
# Count compounds per Lipinski flag (1 = meets all four criteria, 0 = fails at least one).
s_aureus_data["Lipinski"].value_counts()

Unnamed: 0_level_0,count
Lipinski,Unnamed: 1_level_1
0,1658
1,1220


In [9]:
# Write the table with descriptor and Lipinski columns to CSV; index=False leaves the row index out.
s_aureus_data.to_csv("s_aureus_43300_descriptors_and_lipinski.csv", index=False)