In [6]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem import Crippen
from rdkit.Chem import Draw
from rdkit.Chem.Descriptors import rdMolDescriptors
from rdkit.Chem.Draw import MolToImage
from rdkit.Chem import MolFromSmiles

In [7]:
# Load the compound table (compound id, SMILES, cluster label). The first CSV
# column is used as the row index.
data = pd.read_csv(
    'clusters_csvs/smiles.csv',
    index_col=0,
)
# Bare last expression so the notebook renders the frame.
data

Unnamed: 0,compound,smiles,cluster
0,INH01,CC(C)C[C@@H](C(=O)O)N[C@@H](CC1=CN=CN1CC2=CC(=...,0
1,INH02,C1=CC(=CC=C1C(=O)O)N=C(N)N,0
2,INH03,C1=CC2=C(C=CC(=C2N=C1)O)[N+](=O)[O-],0
3,INV01,CCCC[C@@H](C(=O)C(=O)NC1CC1)NC(=O)[C@@H]2[C@@H...,3
4,INV02,CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@@H]1[C@H]...,0
...,...,...,...
75,ALI23,C/C=C/S(=O)C[C@@H](C(=O)O)N,4
76,ALI22,C[S@](=O)C[C@@H](C(=O)O)N,4
77,ALI24,C[C@H]1C[S@](=O)C[C@H](N1)C(=O)O,3
78,ALI25,C=CCSCC(C(=O)O)NC(=O)CCC(C(=O)O)N,4


In [8]:
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
# The computed descriptors are stored in a pandas DataFrame.
import pandas as pd

# `df` is built directly by the join at the bottom of this cell, so no
# pre-allocated empty frame is needed.

def calculate_descriptors(mol):
    """Compute a set of RDKit descriptors for one molecule given as SMILES.

    Parameters
    ----------
    mol : str
        SMILES string of the molecule. The parameter name is kept for
        backward compatibility; the string is parsed into an RDKit Mol
        inside the function.

    Returns
    -------
    pandas.Series
        One entry per descriptor, keyed as follows:

        - ``formula``: ``rdMolDescriptors.CalcMolFormula``
        - ``weight``: ``rdMolDescriptors.CalcExactMolWt`` (the *exact*
          molecular weight, as opposed to ``Descriptors.MolWt``)
        - ``num_atoms``: ``rdMolDescriptors.CalcNumAtoms``
        - ``num_heavy_atoms``: ``rdMolDescriptors.CalcNumHeavyAtoms``
        - ``num_hbd`` / ``num_hba``: Lipinski H-bond donor / acceptor counts
        - ``num_rot_bonds``: ``rdMolDescriptors.CalcNumRotatableBonds``
        - ``psa``: ``rdMolDescriptors.CalcTPSA``
        - ``logp``: ``Crippen.MolLogP``
        - ``num_aromatic_rings``: ``rdMolDescriptors.CalcNumAromaticRings``

    Raises
    ------
    ValueError
        If RDKit cannot parse ``mol`` as a SMILES string.
    """
    smiles = mol
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        # MolFromSmiles reports a parse failure by returning None; without
        # this guard the first descriptor call fails with an opaque error
        # that does not say which input was bad.
        raise ValueError(f'Could not parse SMILES: {smiles!r}')

    return pd.Series({
        'formula': rdMolDescriptors.CalcMolFormula(parsed),
        'weight': rdMolDescriptors.CalcExactMolWt(parsed),
        'num_atoms': rdMolDescriptors.CalcNumAtoms(parsed),
        'num_heavy_atoms': rdMolDescriptors.CalcNumHeavyAtoms(parsed),
        'num_hbd': rdMolDescriptors.CalcNumLipinskiHBD(parsed),
        'num_hba': rdMolDescriptors.CalcNumLipinskiHBA(parsed),
        'num_rot_bonds': rdMolDescriptors.CalcNumRotatableBonds(parsed),
        'psa': rdMolDescriptors.CalcTPSA(parsed),
        # Use the `Crippen` module imported at the top of the notebook rather
        # than reaching it through the `Chem` package attribute.
        'logp': Crippen.MolLogP(parsed),
        'num_aromatic_rings': rdMolDescriptors.CalcNumAromaticRings(parsed),
    })


# Because the applied function returns a Series, `apply` yields a DataFrame
# with one column per descriptor; it is joined to `data` on the row index.
df = data.join(data['smiles'].apply(calculate_descriptors))


In [9]:
# import the Chem module from the RDKit library
from rdkit import Chem

# Functional groups to flag; one 0/1 indicator column is added to `df` per
# entry, in this order.
functionalgroups = ['COOH', 'NH2', 'OH', 'C=C', 'C=O', 'C=OC', 'S']

# SMARTS query for each functional group.
#
# A bare element symbol in SMARTS matches *any* aliphatic atom of that element
# regardless of its neighbours or hydrogens, so patterns such as 'N' or 'O'
# would also flag amides, ethers and carbonyl oxygens. The queries below
# constrain connectivity and hydrogen count so each column matches its label.
patterns = {
    # Carboxylic acid: trigonal carbon with =O and a protonated -OH.
    # (The former 'CC(=O)O' also hit esters and missed aryl acids.)
    'COOH': Chem.MolFromSmarts('[CX3](=O)[OX2H1]'),
    # -NH2: three-connected nitrogen carrying exactly two hydrogens.
    'NH2': Chem.MolFromSmarts('[NX3;H2]'),
    # Hydroxyl: two-connected oxygen carrying a hydrogen (includes acid -OH).
    'OH': Chem.MolFromSmarts('[OX2H]'),
    # Non-aromatic carbon-carbon double bond.
    'C=C': Chem.MolFromSmarts('C=C'),
    # Carbonyl.
    'C=O': Chem.MolFromSmarts('C=O'),
    # Carbonyl carbon that also bears a carbon substituent. The former
    # 'C=OC' asked for a carbonyl oxygen bonded to a second carbon, which no
    # neutral molecule contains, so the column was always 0.
    # NOTE(review): confirm the intended group; use '[CX3](=O)[OX2H0][#6]'
    # instead if an ester was meant.
    'C=OC': Chem.MolFromSmarts('C(=O)C'),
    # Any sulfur atom, aliphatic or aromatic (bare 'S' is aliphatic-only).
    'S': Chem.MolFromSmarts('[#16]'),
}

# Parse every SMILES once instead of once per DataFrame cell write.
mols = [Chem.MolFromSmiles(smiles) for smiles in df['smiles']]

# MolFromSmiles returns None on failure; report the offending inputs instead
# of failing later with an AttributeError on None.
unparsed = [smiles for smiles, mol in zip(df['smiles'], mols) if mol is None]
if unparsed:
    raise ValueError(f'Could not parse SMILES: {unparsed}')

# Build each indicator column in one pass: 1 if the group is present, else 0.
df = df.assign(**{
    group: [int(mol.HasSubstructMatch(patterns[group])) for mol in mols]
    for group in functionalgroups
})


In [10]:

# Write the full property table to the working directory, without the index.
df.to_csv('Molecular_properties(vf).csv', index=False)

# Number of cluster labels to export (labels 0 .. n_clusters - 1).
n_clusters = 5

# One sub-frame per cluster label, keyed by the label.
dfs = {
    label: df.loc[df['cluster'] == label]
    for label in range(n_clusters)
}

# Write the full table again, this time next to the per-cluster files and
# with the index included.
df.to_csv('clusters_csvs/Molecular_properties_vf.csv')

# Write each cluster's rows to its own CSV and report where it went.
for cluster_id, cluster_df in dfs.items():
    out_path = f'clusters_csvs/Mol_prop_by_clusters_{cluster_id}_vf.csv'
    cluster_df.to_csv(out_path)
    print('files are save it on: ' + out_path)

files are save it on: clusters_csvs/Mol_prop_by_clusters_0_vf.csv
files are save it on: clusters_csvs/Mol_prop_by_clusters_1_vf.csv
files are save it on: clusters_csvs/Mol_prop_by_clusters_2_vf.csv
files are save it on: clusters_csvs/Mol_prop_by_clusters_3_vf.csv
files are save it on: clusters_csvs/Mol_prop_by_clusters_4_vf.csv
