<a href="https://colab.research.google.com/github/aalonsca73/in_silico_toxicology/blob/main/in_silico_toxicology.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
pip install pubchempy rdkit mordred

Collecting mordred
  Downloading mordred-1.2.0.tar.gz (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.8/128.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting networkx==2.* (from mordred)
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: mordred
  Building wheel for mordred (setup.py) ... [?25l[?25hdone
  Created wheel for mordred: filename=mordred-1.2.0-py3-none-any.whl size=176721 sha256=0681c1a6f0523429ec0d2c56936efed10b206facc67eb6d2d6981e88062cf06f
  Stored in directory: /root/.cache/pip/wheels/a7/4f/b8/d4c6591f6ac944aaced7865b349477695f662388ad958743c7
Successfully built mordred
Installing collected packages: networkx, mordred
  Attempting uninstall: networkx
    Found existing installation: networkx 3.1
    Uninstal

In [17]:
# Path of the input file that lists the compounds to process
filename = "aminoacids.txt"

# Libraries needed
import sys
import pubchempy as pcp
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from mordred import Calculator, descriptors

# Build a Mordred calculator from the whole `descriptors` module
calc = Calculator(descriptors)

# Load the compound list with pandas: the file has no header row, so the single
# column is labelled 'Compound' for clarity.
# A tab separator is used on purpose: chemical names frequently contain commas
# (e.g. "1,2-dichloroethane"), and the default comma separator would split such
# a name into several fields and corrupt the 'Compound' column.
df = pd.read_csv(filename, header=None, names=['Compound'], sep='\t')

# Look up every entry of df['Compound'] on PubChem by name and keep each
# (compound, SMILES) pair together. Pairing them at lookup time keeps names and
# structures aligned even when a lookup returns zero hits, several hits, or a
# SMILES that an earlier compound already produced.
records = []
for ids in df['Compound']:
    hits = pcp.get_compounds(ids, 'name')
    if not hits:
        # Report the miss instead of silently shifting every later row
        print(f'No PubChem match for: {ids}')
        continue
    for hit in hits:
        records.append((ids, hit.canonical_smiles))

# Every SMILES retrieved, in lookup order (duplicates included)
SMILES_list = [smiles for _, smiles in records]

# One row per distinct SMILES, in order of first appearance; the compound name
# kept for a repeated SMILES is the first one that produced it
smiles_df = (
    pd.DataFrame(records, columns=['Compound', 'SMILES'])
    .drop_duplicates(subset='SMILES', keep='first')
    .reset_index(drop=True)
)

# De-duplicated SMILES, order preserved
canonical_SMILES = smiles_df['SMILES'].tolist()

# Parse every SMILES in smiles_df into an RDKit molecule.
# Chem.MolFromSmiles reports an unparsable string by returning None rather than
# raising, so the result is checked explicitly; the row label of each molecule
# that parsed is remembered so descriptors can be matched back to their row.
data = []
valid_rows = []
for row, molecule in smiles_df['SMILES'].items():
    # Non-string entries (e.g. a missing SMILES) are treated as parse failures
    mol = Chem.MolFromSmiles(molecule) if isinstance(molecule, str) else None
    if mol is None:
        print(f'Could not parse SMILES: {molecule}')
        continue
    data.append(mol)
    valid_rows.append(row)

# Compute all descriptors for the molecules that parsed successfully
props_df = calc.pandas(data)

# Re-attach the original row labels, then expand to the full smiles_df index so
# rows whose SMILES failed to parse are kept with empty descriptor values
props_df.index = valid_rows
props_df = props_df.reindex(smiles_df.index)

# Merge compound names / SMILES with their descriptors, side by side
names_props = pd.concat([smiles_df, props_df], axis=1)

# Write the combined table to disk
names_props.to_csv('molecules_with_properties.csv', index=False)

100%|██████████| 20/20 [00:01<00:00, 10.51it/s]
