<a href="https://colab.research.google.com/github/aalonsca73/in_silico_toxicology/blob/main/in_silico_toxicology.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pubchempy rdkit mordred

Collecting pubchempy
  Downloading PubChemPy-1.0.4.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rdkit
  Downloading rdkit-2023.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.5/30.5 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mordred
  Downloading mordred-1.2.0.tar.gz (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.8/128.8 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting networkx==2.* (from mordred)
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pubchempy, mordred
  Building wheel for pubchempy (setup.py) ... [?25l[?25hdone
  Created wheel for pubchempy: filename=PubChemPy-1.

In [8]:
# Define the filename for the Excel file
filename = 'llistes.xlsx'

# Define the column name to be used as the index.
# Set this variable to the name of the column containing the molecule names of interest.
index = 'Name'

# Import necessary libraries
import sys
import pubchempy as pcp
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from mordred import Calculator, descriptors

# Create a descriptor calculator with all available descriptors
calc = Calculator(descriptors)

# Read the Excel file using pandas
df = pd.read_excel(filename,sheet_name=1,header=0)
df.set_index(index,inplace=True)

# Create a list to store all canonical SMILES
SMILES_list = []

# Create a list to store molecules not found in PubChem
molecules_not_found = []

# Iterate through each molecule identifier in the index column to retrieve compound names from PubChem
for ids in df.index:
    try:
        # Fetch the compound name from PubChem
        compound_name = pcp.get_compounds(ids,'name')
        # Check if compound name is empty (no results)
        if not compound_name:
            # Use boolean indexing to drop rows with empty compound_name
            df.drop(ids,axis=0,inplace=True)
            # Add the molecule to the list of molecules_not_found
            molecules_not_found.append(ids)
        else:
            # Extract the first identifier for the compound
            first_identifier = compound_name[0].cid
            # Fetch the canonical SMILES for the first identifier
            first_smiles = pcp.get_compounds(first_identifier,'cid')[0].canonical_smiles
            # Add the canonical SMILES to a list called SMILES_list
            SMILES_list.append(first_smiles)
    except Exception as e:
        print(f"Error for molecule {ids}: {e}")

# Create a DataFrame with molecules not found in PubChem
missing_molecules_df = pd.DataFrame(data=molecules_not_found)

# Reset the index after dropping rows
df.reset_index(inplace=True)

# Remove duplicate canonical SMILES while preserving order
canonical_SMILES = list(dict.fromkeys(SMILES_list))

# We create a DataFrame with canonical SMILES
smiles_df = pd.DataFrame(data=canonical_SMILES)
# Rename the column
smiles_df.columns = ['SMILES']
# Insert the DataFrame with compound names into the DataFrame with SMILES
smiles_df.insert(0,'Name',df['Name'],True)

# Create a new list called data to store all properties
data = []

# Iterate through every SMILES in smiles_df to extract properties of every molecule
for molecule in smiles_df['SMILES']:
    try:
        mol = Chem.MolFromSmiles(molecule)
        data.append(mol)
    except:
        print(molecule)

# Create a new DataFrame called props_df with all obtained molecular properties
props_df = calc.pandas(data)

# Set column names equal to the values in the first row index position
props_df.columns = props_df.iloc[0]
col_names = props_df.iloc[0]
print(col_names)
# Remove the first row from the DataFrame
props_df = props_df[1:]

props_df = props_df.select_dtypes(include=[np.number])

# Merge the DataFrame containing SMILES with the DataFrame containing all properties
names_props = pd.concat([smiles_df,props_df],axis=1)

# Set column names equal to the values in the first row index position
#names_props.columns = names_props.iloc[0]

# Remove the first row from the DataFrame
#names_props = names_props[1:]

# Remove molecular descriptors that are not informative
# Delete columns where all values are zero
names_props = names_props.loc[:, ~np.all(np.equal(names_props.values, 0), axis=0)]

# Delete columns containing non-numeric data
#cols_to_remove = []
#for col in names_props.columns:
#    try:
#        _ = names_props[col].astype(float)
#    except ValueError:
#        print('Couldn\'t convert %s to float' % col)
#        cols_to_remove.append(col)
#        pass
# Keep only the columns in names_props that do not conain string
#names_props = names_props[[col for col in names_props.columns not in cols_to_remove]]


# Create a new document with all information
names_props.to_csv('molecules_with_properties.csv',index=False)

# Create another document with molecules not found in PubChem
missing_molecules_df.to_csv('molecules_not_found.csv',index=False)

Error for molecule 2,3,5,6-tetrabromo-p-xylene: 'PUGREST.ServerBusy'
Error for molecule Dimethyl propyl phosphonate: 'PUGREST.ServerBusy'
Error for molecule p-MethoxyphenylPhosphinic acid: 'PUGREST.ServerBusy'
Error for molecule Triisopropyl para-phosphate: 'PUGREST.ServerBusy'
Error for molecule Triisopropyl phosphate: 'PUGREST.ServerBusy'
Error for molecule Triphenyl Phosphate: 'PUGREST.ServerBusy'


100%|██████████| 62/62 [00:16<00:00,  3.84it/s]


0
16.758034     16.758034
13.726208     13.726208
0.000000              0
0.000000              0
27.096081     27.096081
                ...    
33.000000            33
110.000000        110.0
126.000000        126.0
8.888889       8.888889
4.861111       4.861111
Name: 0, Length: 1826, dtype: object
