<a href="https://colab.research.google.com/github/aalonsca73/in_silico_toxicology/blob/main/in_silico_toxicology.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install pubchempy rdkit mordred

Collecting pubchempy
  Downloading PubChemPy-1.0.4.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rdkit
  Downloading rdkit-2023.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mordred
  Downloading mordred-1.2.0.tar.gz (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.8/128.8 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting networkx==2.* (from mordred)
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pubchempy, mordred
  Building wheel for pubchempy (setup.py) ... [?25l[?25hdone
  Created wheel for pubchempy: filename=PubChemPy-1.

In [13]:
# Name of the Excel workbook that is read with pd.read_excel further down.
# It is a relative path, so it is resolved against the notebook's working directory.
filename = 'llistes.xlsx'

# Libraries needed
import sys
import pubchempy as pcp
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from mordred import Calculator, descriptors

# Build a Mordred descriptor calculator registered with every descriptor module
# exposed by mordred.descriptors.
# NOTE(review): Mordred appears to skip 3D descriptors unless ignore_3D=False is passed — confirm.
calc = Calculator(descriptors)

# Read the workbook defined previously with pandas. sheet_name=1 selects the SECOND
# sheet (sheet indices are zero-based) and header=0 uses its first row as column
# names, so that sheet must contain a header row with a 'Name' column.
df = pd.read_excel(filename,sheet_name=1,header=0)

# Query names and the SMILES they produced are collected in parallel, so each
# SMILES stays tied to the name that was actually used to retrieve it.
# (Pairing SMILES with df['Name'] by row position afterwards is wrong as soon as a
# lookup fails, returns no compound, returns several compounds, or repeats a SMILES.)
names_list = []
SMILES_list = []

# Iterate through each identifier in the 'Name' column and look it up in PubChem by name
for ids in df['Name']:
    try:
        compounds = pcp.get_compounds(ids, 'name')
        if not compounds:
            # An empty result is not an exception; report it instead of dropping it silently
            print(f"No PubChem match for molecule {ids}")
        # A single name may resolve to several compound records; keep the SMILES of each
        for compound in compounds:
            smiles = compound.canonical_smiles
            if smiles:
                names_list.append(ids)
                SMILES_list.append(smiles)
    except Exception as e:
        print(f"Error for molecule {ids}: {e}")

# Remove repeated canonical SMILES while maintaining order; every SMILES keeps the
# first name that produced it (dictionaries preserve insertion order)
first_name_by_smiles = {}
for name, smiles in zip(names_list, SMILES_list):
    first_name_by_smiles.setdefault(smiles, name)

canonical_SMILES = list(first_name_by_smiles)

# Build the DataFrame with one row per unique SMILES and its matching query name.
# Building from a dict of columns also works when no SMILES was retrieved at all.
smiles_df = pd.DataFrame({
    'Name': list(first_name_by_smiles.values()),
    'SMILES': canonical_SMILES,
})

# Parse every SMILES in smiles_df into an RDKit molecule.
# data holds the successfully parsed molecules; parsed_positions holds the row
# positions in smiles_df they came from, so properties can be matched to rows later.
data = []
parsed_positions = []

for position, molecule in enumerate(smiles_df['SMILES']):
    try:
        mol = Chem.MolFromSmiles(molecule)
    except Exception as e:
        # Raised for non-string input (e.g. a missing value), not for bad SMILES syntax
        print(f"Error parsing SMILES {molecule}: {e}")
        continue
    if mol is None:
        # RDKit signals an unparsable SMILES by returning None rather than raising
        print(f"Could not parse SMILES: {molecule}")
        continue
    data.append(mol)
    parsed_positions.append(position)

# We define a new DataFrame called props_df with the descriptors of every parsed molecule
props_df = calc.pandas(data)

# Keep only the rows whose SMILES could be parsed and renumber both frames from 0:
# pd.concat(axis=1) aligns on the index, so both sides must share the same row labels
parsed_df = smiles_df.iloc[parsed_positions].reset_index(drop=True)
names_props = pd.concat([parsed_df, props_df.reset_index(drop=True)], axis=1)

# We create a new document with all information
names_props.to_csv('molecules_with_properties.csv',index=False)

100%|██████████| 65/65 [00:17<00:00,  3.68it/s]
