<a href="https://colab.research.google.com/github/aalonsca73/in_silico_toxicology/blob/main/in_silico_toxicology.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
pip install pubchempy rdkit mordred



In [21]:
# Excel workbook holding the list of compounds to process
filename = "llistes.xlsx"

# Libraries needed
import sys
import pubchempy as pcp
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from mordred import Calculator, descriptors

# Pipeline: read compound names from the workbook, resolve each name to
# canonical SMILES through PubChem, compute every Mordred descriptor for the
# resulting molecules and write names, SMILES and descriptors to a CSV file.

# Creation of a descriptor calculator with all descriptors
calc = Calculator(descriptors)

# Read the second sheet of the workbook (sheet_name is zero-based). The first
# row is used as the header and must provide a 'Name' column.
df = pd.read_excel(filename, sheet_name=1, header=0)

# Every SMILES returned by PubChem, in lookup order (may contain repeats)
SMILES_list = []

# Row indices of names for which no SMILES could be retrieved
indices_to_drop = []

# Maps each distinct SMILES to the first name that produced it. Keeping the
# pair together is what guarantees that a name is never written next to the
# SMILES of a different compound. Dicts preserve insertion order.
first_name_for_smiles = {}

# Resolve every identifier in the 'Name' column through PubChem
for idx, ids in enumerate(df['Name']):
    try:
        compounds = pcp.get_compounds(ids, 'name')
        # Collect locally first so a failure half-way leaves no partial result
        found_smiles = [compound.canonical_smiles for compound in compounds]
    except Exception as e:
        # Best effort: report the failure and carry on with the next name
        print(f"Error for molecule {ids}: {e}")
        indices_to_drop.append(idx)
        continue

    if not found_smiles:
        print(f"Molecule not found in PubChem: {ids}")
        indices_to_drop.append(idx)
        continue

    # A single name may match several PubChem compounds; keep all of them
    for smiles in found_smiles:
        SMILES_list.append(smiles)
        first_name_for_smiles.setdefault(smiles, ids)

# Drop the rows for which no SMILES was retrieved
df.drop(indices_to_drop, inplace=True)

# Distinct canonical SMILES, in order of first appearance
canonical_SMILES = list(first_name_for_smiles)

# RDKit molecules to describe, and the (name, SMILES) row belonging to each.
# Both lists are filled together so they always have the same length and order.
data = []
name_smiles_rows = []

for molecule in canonical_SMILES:
    try:
        mol = Chem.MolFromSmiles(molecule)
    except Exception:
        # Raised for values that are not strings (e.g. a missing SMILES)
        mol = None
    # MolFromSmiles reports an unparsable SMILES by returning None, not by
    # raising, so the result has to be checked explicitly.
    if mol is None:
        print(molecule)
        continue
    data.append(mol)
    name_smiles_rows.append((first_name_for_smiles[molecule], molecule))

# DataFrame with one row per successfully parsed molecule
smiles_df = pd.DataFrame(name_smiles_rows, columns=['Name', 'SMILES'])

# DataFrame with all descriptors, one row per molecule in `data`
props_df = calc.pandas(data)

# Both frames have the same length and row order, so a column-wise concat
# places each molecule's descriptors next to its own name and SMILES.
names_props = pd.concat([smiles_df, props_df], axis=1)

# We create a new document with all information
names_props.to_csv('molecules_with_properties.csv', index=False)

Molecule not found in PubChem: 1-(2,3-Dibromopropyl)-3,5-diallyl-1,3,5-triazine-2,4,6(1H, 3H, 5H)-trione
Molecule not found in PubChem: 1,3-Bis(2,3-dibromopropyl)-5-(2-propen-1-yl)-1,3,5-triazine-2,4,5(1H, 3H, 5H)-trione
Molecule not found in PubChem: 2,2-bis(chloromethyl-)trimethylen-ebis(bis(2-chloroethyl)phosphate
Molecule not found in PubChem: 2,2-Bis(chloromethyl)-1,3-propanediol bis[bis(2-chloroethyl)phosphate]
Molecule not found in PubChem: 2,2',2,4',6-Pentbromodiphenyl ether
Molecule not found in PubChem: 2,2',4,4',5',6-Hexabromodiphenil ether
Molecule not found in PubChem: 3-(Dimethylphosphono)propionic acid methylamide
Molecule not found in PubChem: 9,10-Dihydro-9-oxa-10-phospha-phenanthrene-10-oxide
Molecule not found in PubChem: Bis(2-ethylhexyl)tetrabromophthalate
Molecule not found in PubChem: Bis(4-carboxyphenyl) phenylphosphine oxide
Molecule not found in PubChem: Bis(5,5-dimethyl-2-thiono-1,3,2-doxa-phosphorin amyl) oxide
Molecule not found in PubChem: bisphenol A bis 

100%|██████████| 65/65 [00:18<00:00,  3.57it/s]
