# EcoCyc *E. coli* Biochemical Synthesis Data Evaluation

# Installing Dependancies

In [2]:
# Install and import libraries
%%capture
!pip install rdkit
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem import rdChemReactions
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem import MolStandardize, inchi
from rdkit.Chem.MolStandardize import rdMolStandardize, tautomer
from rdkit import RDLogger

from concurrent.futures import ThreadPoolExecutor, TimeoutError

import numpy as np

from dataclasses import dataclass
from typing import List
from typing import Tuple

from IPython.display import display
from IPython.display import Image

# Downloading and Reading Drive Files with WGet

Using WGet to upload and read the two flat files sourced from BioCyc (**compounds.dat** and r**eactions.dat**) These two documents came from the EcoCyc flat file release that came to Dr. Anderson in an email on Oct. 10, 2023.

The custom lists of minimal metabolites and universal metabolites, to occupy Shell 0, are also uploaded using this method.

In [1]:
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1mB-xIewC1UY3mLzVi0ovRV6bQPS7RzYg' -O compounds.dat
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1SjVODv98B3KJxV0hFrNAPN0B-3nTfr1b' -O reactions.dat
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1PQazv8Q0cjn_ug76mIICyxNy7BxNJwds' -O minimal_metabolites_02-24.txt
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1jlLnO7oAmz4dqaynmPBegCSAPXemPB-R' -O universal_metabolites_03-24.txt

--2024-04-26 04:03:46--  https://docs.google.com/uc?export=download&id=1mB-xIewC1UY3mLzVi0ovRV6bQPS7RzYg
Resolving docs.google.com (docs.google.com)... 74.125.199.138, 74.125.199.102, 74.125.199.139, ...
Connecting to docs.google.com (docs.google.com)|74.125.199.138|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1mB-xIewC1UY3mLzVi0ovRV6bQPS7RzYg&export=download [following]
--2024-04-26 04:03:47--  https://drive.usercontent.google.com/download?id=1mB-xIewC1UY3mLzVi0ovRV6bQPS7RzYg&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 74.125.195.132, 2607:f8b0:400e:c09::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|74.125.195.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8344931 (8.0M) [application/octet-stream]
Saving to: ‘compounds.dat’


2024-04-26 04:03:52 (58.4 MB/s) - ‘compounds.dat’ saved [8344931/8344

#Defining EcoCyc Chemical and EcoCyc Reaction Classes

In [4]:
@dataclass(frozen=True)
class EcoCyc_Chemical:
    UNIQUE_ID: str = ""
    COMMON_NAME: str = ""
    SMILES: str = ""
    INCHI: str = ""
    NON_STANDARD_INCHI: str = ""

# Note: Since dataclasses automatically generate __repr__, you don't need to define it unless you want a custom representation.


@dataclass(frozen=True)
class EcoCyc_Reaction:
    UNIQUE_ID: str = ""
    LEFT: List[str] = None
    RIGHT: List[str] = None
    EC_NUMBER: str = ""
    REACTION_DIRECTION: str = ""

    def __post_init__(self): #used to ensure that SUBSTRATES and PRODUCTS are initialized as empty lists if they are not provided during object creation (safer for list operations)
        if self.LEFT is None:
            self.LEFT = []
        if self.RIGHT is None:
            self.RIGHT = []


# Parsing the EcoCyc files into EcoCyc objects

###EcoCyc_Chemicals from Compounds.dat

Using a parser of **compounds.dat** to construct the EcoCyc_Chemical ontology. The EcoCyc_Chemical class and cooresponding objects is meant to be an exact replica of what is found in the raw data. These objects are stored in a dictionary called **ecocyc_chemicals_dict**



In [5]:
def parse_chemicals_to_dict(file_contents):
    ecocyc_chemicals_dict = {}

    ecocyc_chemical_data = {
        'UNIQUE_ID': '',
        'COMMON_NAME': '',
        'SMILES': '',
        'INCHI': '',
        'NON_STANDARD_INCHI': '',
    }

    for line in file_contents:
        if '- ' in line:
            key, value = line.split('- ', 1)  # 1 is used as a max split to limit the number of splits
            key = key.strip().replace('-', '_').upper()  # Converting to dataclass format
            value = value.strip()

            if key in ecocyc_chemical_data:
                ecocyc_chemical_data[key] = value #Assigns values to the corresponding keys in the ecocyc_chemical_data dictionary
        elif line.strip() == '//':  # End of chemical data entry in the flat file
            chem_obj = EcoCyc_Chemical(**ecocyc_chemical_data) #Creates an instance of EcoCyc_Chemical using the data stored in ecocyc_chemical_data.
            #The ** operator is used to unpack the dictionary into keyword arguments
            ecocyc_chemicals_dict[ecocyc_chemical_data['UNIQUE_ID']] = chem_obj #The new EcoCyc_Chemical object is stored in ecocyc_chemicals_dict dictionary; UNIQUE_ID is the key.
            ecocyc_chemical_data = {k: '' for k in ecocyc_chemical_data}  # Reset for the next entry

    return ecocyc_chemicals_dict #In the dictionary: Key=UNIQUE_ID, Value= the entire EcoCyc_Chemical object

# Code to read file and create dictionary
with open("compounds.dat", "r", encoding="ISO-8859-1") as file:
    file_contents = file.readlines()
ecocyc_chemicals_dict = parse_chemicals_to_dict(file_contents)

# Test by printing the first 5 chemicals
for test_key in list(ecocyc_chemicals_dict.keys())[:5]: #Specifically creating list of the keys in ecocyc_chemicals_dict dictionary.
    print(ecocyc_chemicals_dict[test_key])


EcoCyc_Chemical(UNIQUE_ID='Ureidocarboxylates', COMMON_NAME='an ureidocarboxylate', SMILES='', INCHI='', NON_STANDARD_INCHI='')
EcoCyc_Chemical(UNIQUE_ID='Monogalactosyldiacylglycerols-38-3', COMMON_NAME='monogalactosyldiacylglycerol-38:3', SMILES='', INCHI='', NON_STANDARD_INCHI='')
EcoCyc_Chemical(UNIQUE_ID='6-H-5-I-2-methylhexanoates', COMMON_NAME='a 6-hydroxy-5-isopropenyl-2-methylhexanoate', SMILES='C=C(C)C(CCC(C)C(=O)[O-])CO', INCHI='InChI=1S/C10H18O3/c1-7(2)9(6-11)5-4-8(3)10(12)13/h8-9,11H,1,4-6H2,2-3H3,(H,12,13)/p-1', NON_STANDARD_INCHI='InChI=1S/C10H18O3/c1-7(2)9(6-11)5-4-8(3)10(12)13/h8-9,11H,1,4-6H2,2-3H3,(H,12,13)')
EcoCyc_Chemical(UNIQUE_ID='Monogalactosyldiacylglycerols-36-4', COMMON_NAME='monogalactosyldiacylglycerol-36:4', SMILES='', INCHI='', NON_STANDARD_INCHI='')
EcoCyc_Chemical(UNIQUE_ID='Short-Chain-Carboxylates', COMMON_NAME='a short-chain carboxylate', SMILES='C([O-])(=O)[R]', INCHI='', NON_STANDARD_INCHI='InChI=1S/CH2O2/c2-1-3/h1H,(H,2,3)/p-1')


###EcoCyc_Reactions from Reactions.dat

Using a parser of **reactions.dat** to construct the EcoCyc_Reaction ontology. The EcoCyc_Reaction class and cooresponding objects is meant to be an exact replica of what is found in the raw data. These objects are stored in a dictionary called **ecocyc_reactions_dict**




In [6]:
def parse_reactions_to_dict(file_contents):
    ecocyc_reactions_dict = {}
    ecocyc_reaction_data = {
        'UNIQUE_ID': '',
        'LEFT': [],
        'RIGHT': [],
        'EC_NUMBER': '',
        'REACTION_DIRECTION': ''
    }

    for line in file_contents:
        if '- ' in line:
            key, value = line.split('- ', 1)
            key = key.strip().replace('-', '_').upper()
            value = value.strip()

            if key in ['UNIQUE_ID', 'EC_NUMBER', 'REACTION_DIRECTION']:
                ecocyc_reaction_data[key] = value
            elif key in ['LEFT', 'RIGHT']:
                ecocyc_reaction_data[key].append(value) #Must append because we are making a list
        elif line.strip() == '//':  # End of reaction data entry
            reaction_obj = EcoCyc_Reaction(**ecocyc_reaction_data)
            ecocyc_reactions_dict[ecocyc_reaction_data['UNIQUE_ID']] = reaction_obj
            ecocyc_reaction_data = {
                'UNIQUE_ID': '',
                'LEFT': [],
                'RIGHT': [],
                'EC_NUMBER': '',
                'REACTION_DIRECTION': ''
            } # Reset for the next entry

    return ecocyc_reactions_dict

# Code to read file and create dictionary
with open("reactions.dat", "r", encoding="ISO-8859-1") as file:
    file_contents = file.readlines()
ecocyc_reactions_dict = parse_reactions_to_dict(file_contents)

# Test by printing the first 5 reactions
for key in list(ecocyc_reactions_dict.keys())[:5]:
    print(ecocyc_reactions_dict[key])


EcoCyc_Reaction(UNIQUE_ID='RXN-21279', LEFT=['a-5-deoxyribose-5-phosphate-DNA'], RIGHT=['5-Phospho-terminated-DNAs', 'CPD-22978', 'PROTON'], EC_NUMBER='', REACTION_DIRECTION='PHYSIOL-LEFT-TO-RIGHT')
EcoCyc_Reaction(UNIQUE_ID='RXN-19954', LEFT=['2Cys-Peroxiredoxins-With-HydroxyCys'], RIGHT=['Cys2-Peroxiredoxin-Disulfide', 'WATER'], EC_NUMBER='', REACTION_DIRECTION='PHYSIOL-LEFT-TO-RIGHT')
EcoCyc_Reaction(UNIQUE_ID='RXN-17919', LEFT=['A-5-prime-PP-5-prime-DNA', '3-Hydroxy-Terminated-DNAs'], RIGHT=['DNA-N', 'AMP', 'PROTON'], EC_NUMBER='', REACTION_DIRECTION='PHYSIOL-LEFT-TO-RIGHT')
EcoCyc_Reaction(UNIQUE_ID='RXN0-4701', LEFT=['mRNA-Holder', 'WATER'], RIGHT=['ssRNA-with-3phosphate', 'ssRNA-with-5OH'], EC_NUMBER='', REACTION_DIRECTION='LEFT-TO-RIGHT')
EcoCyc_Reaction(UNIQUE_ID='RXN-20692', LEFT=['Cys2-Peroxiredoxin-Disulfide', 'Red-Prx-Disulfide-Reductases'], RIGHT=['Reduced-Cys2-Peroxiredoxins', 'Ox-Prx-Disulfide-Reductases'], EC_NUMBER='', REACTION_DIRECTION='PHYSIOL-LEFT-TO-RIGHT')


# Creating Chemical and Reaction Objects for Synthesis

I'm making this a step because in the final product, there will be multiple alteration steps of the EcoCyc ontology compared with what we want in the end Chemical and Reaction ojects that are used directly in the synthesis algorithm. See "Transformation Validator Documentation" for more details: https://docs.google.com/document/d/14jfgooL9Do50gW2h1I39nQzDNsVGLdDSXtIxkN49jsc/edit?usp=sharing

In [7]:
@dataclass(frozen=True)
class Chemical:
    UNIQUE_ID: str = ""
    COMMON_NAME: str = ""
    SMILES: str = ""
    INCHI: str = ""
    NON_STANDARD_INCHI: str = ""

@dataclass(frozen=True)
class Reaction:
    UNIQUE_ID: str
    SUBSTRATES: Tuple[Chemical, ...]  # Assuming Chemical objects are hashable
    PRODUCTS: Tuple[Chemical, ...]
    EC_NUMBER: str
    REACTION_DIRECTION: str



#Data Counting and Filtering

In [8]:
# Step 1: Collect all unique substrate and product IDs from EcoCyc_Reactions
unique_chemical_ids = set()
for reaction_id, reaction in ecocyc_reactions_dict.items():
    unique_chemical_ids.update(reaction.LEFT)
    unique_chemical_ids.update(reaction.RIGHT)

# Step 2: Filter EcoCyc_Chemicals to keep only those referenced in EcoCyc_Reactions
# Now creating a list of EcoCyc_Chemical objects
ecocyc_chems_in_rxns = [ecocyc_chemicals_dict[chem_id] for chem_id in unique_chemical_ids if chem_id in ecocyc_chemicals_dict]

# Optional: Display some of the chemicals in the list to verify
for chemical in ecocyc_chems_in_rxns[:5]:
    print(chemical)


EcoCyc_Chemical(UNIQUE_ID='5-DEHYDROGLUCONATE', COMMON_NAME='5-dehydro-D-gluconate', SMILES='C(O)C(=O)[C@@H](O)[C@H](O)[C@@H](O)C(=O)[O-]', INCHI='InChI=1S/C6H10O7/c7-1-2(8)3(9)4(10)5(11)6(12)13/h3-5,7,9-11H,1H2,(H,12,13)/p-1/t3-,4+,5-/m1/s1', NON_STANDARD_INCHI='InChI=1S/C6H10O7/c7-1-2(8)3(9)4(10)5(11)6(12)13/h3-5,7,9-11H,1H2,(H,12,13)/p-1/t3-,4+,5-/m1/s1')
EcoCyc_Chemical(UNIQUE_ID='S-METHYLGLUTATHIONE', COMMON_NAME='<i>S</i>-methylglutathione', SMILES='CSC[C@@H](C(NCC([O-])=O)=O)NC(=O)CC[C@H]([NH3+])C([O-])=O', INCHI='InChI=1S/C11H19N3O6S/c1-21-5-7(10(18)13-4-9(16)17)14-8(15)3-2-6(12)11(19)20/h6-7H,2-5,12H2,1H3,(H,13,18)(H,14,15)(H,16,17)(H,19,20)/p-1/t6-,7-/m0/s1', NON_STANDARD_INCHI=':STANDARD-INCHI')
EcoCyc_Chemical(UNIQUE_ID='CPD-21685', COMMON_NAME='6-sulfo-&alpha;-D-quinovose', SMILES='C([C@H]1(O[C@H](O)[C@H](O)[C@@H](O)[C@H](O)1))S(=O)(=O)[O-]', INCHI='InChI=1S/C6H12O8S/c7-3-2(1-15(11,12)13)14-6(10)5(9)4(3)8/h2-10H,1H2,(H,11,12,13)/p-1/t2-,3-,4+,5-,6+/m1/s1', NON_STANDARD_INC

In [9]:
# Print the length of EcoCyc_chemicals_dict and ecocyc_chems_in_rxns
print(f"Length of EcoCyc_chemicals_dict: {len(ecocyc_chemicals_dict)}")
print(f"Length of ecocyc_chems_in_rxns: {len(ecocyc_chems_in_rxns)}")

Length of EcoCyc_chemicals_dict: 7585
Length of ecocyc_chems_in_rxns: 2083


###Functions to help with error handling when using RDKit

In [10]:
# Suppress RDKit warnings
logger = RDLogger.logger()
logger.setLevel(RDLogger.ERROR)  # Show only errors, ignore warnings


# Define a context manager to suppress RDKit warnings and errors temporarily
class SuppressRDKitLogs:
    def __enter__(self):
        RDLogger.DisableLog('rdApp.*')

    def __exit__(self, exc_type, exc_val, exc_tb):
        RDLogger.EnableLog('rdApp.*')

def safe_mol_from_inchi(inchi):
    with SuppressRDKitLogs():
        try:
            return Chem.MolFromInchi(inchi)
        except:
            return None

def safe_mol_from_smiles(smiles):
    with SuppressRDKitLogs():
        try:
            return Chem.MolFromSmiles(smiles)
        except:
            return None

def safe_mol_from_smarts(smarts):
    with SuppressRDKitLogs():
        try:
            return Chem.MolFromSmarts(smarts)
        except:
            return None

###None

In [11]:
# Filter to find chemicals with no InChI, no NSInChI, and no SMILES
chems_with_no_structural_info = [chem for chem in ecocyc_chems_in_rxns if not chem.INCHI and not chem.NON_STANDARD_INCHI and not chem.SMILES]
print(f"Number of chemicals with no InChI, no NSInChI, and no SMILES: {len(chems_with_no_structural_info)}")

Number of chemicals with no InChI, no NSInChI, and no SMILES: 69


###All 3

In [12]:
# Filter chemicals that have all three: InChI, NSInChI, and SMILES
chems_with_all_three = [
    chem for chem in ecocyc_chems_in_rxns
    if chem.INCHI and chem.NON_STANDARD_INCHI and chem.SMILES
]

# Of those, filter chemicals whose SMILES can be parsed as a SMILES
chems_with_all_three_parsable_smiles = [
    chem for chem in chems_with_all_three
    if safe_mol_from_smiles(chem.SMILES)
]

# Of those, filter chemicals whose SMILES can be parsed as a SMIRKS
chems_with_all_three_parsable_smirks = [
    chem for chem in chems_with_all_three
    if safe_mol_from_smarts(chem.SMILES)
]

# Of those, filter chemicals whose SMILES can be parsed as BOTH SMILES and SMIRKS
chems_with_all_three_parsable_as_both = [
    chem for chem in chems_with_all_three
    if safe_mol_from_smiles(chem.SMILES) and safe_mol_from_smarts(chem.SMILES)
]

# Of those, filter chemicals whose SMILES cannot be parsed as either SMILES or SMIRKS
chems_with_all_three_unparsable_smiles = [
    chem for chem in chems_with_all_three
    if not safe_mol_from_smiles(chem.SMILES) and not safe_mol_from_smarts(chem.SMILES)
]

# Print the counts
print(f"Number of chemicals with all three (InChI, NSInChI, SMILES): {len(chems_with_all_three)}")
print(f"Number of chemicals with all three where SMILES can be parsed as SMILES: {len(chems_with_all_three_parsable_smiles)}")
print(f"Number of chemicals with all three where SMILES can be parsed as SMIRKS: {len(chems_with_all_three_parsable_smirks)}")
print(f"Number of chemicals with all three where SMILES can be parsed as BOTH SMILES and SMIRKS: {len(chems_with_all_three_parsable_as_both)}")
print(f"Number of chemicals with all three where SMILES cannot be parsed as SMILES or SMIRKS: {len(chems_with_all_three_unparsable_smiles)}")


Number of chemicals with all three (InChI, NSInChI, SMILES): 1443
Number of chemicals with all three where SMILES can be parsed as SMILES: 1443
Number of chemicals with all three where SMILES can be parsed as SMIRKS: 1443
Number of chemicals with all three where SMILES can be parsed as BOTH SMILES and SMIRKS: 1443
Number of chemicals with all three where SMILES cannot be parsed as SMILES or SMIRKS: 0


### Inchi

In [13]:
# Suppress RDKit warnings
logger.setLevel(RDLogger.ERROR)  # Show only errors, ignore warnings

# Filter chemicals that have an InChI
inchi_chems_with_inchi = [chem for chem in ecocyc_chems_in_rxns if chem.INCHI]

# Of those, filter chemicals whose InChI cannot be parsed as a mol
inchi_chems_with_unparsable_inchi = [chem for chem in inchi_chems_with_inchi if not Chem.MolFromInchi(chem.INCHI)]

# Filter chemicals that have ONLY an InChI (No NSInChI or SMILES)
inchi_chems_with_inchi_only = [chem for chem in ecocyc_chems_in_rxns if chem.INCHI and not chem.NON_STANDARD_INCHI and not chem.SMILES]

# Filter chemicals that have BOTH an InChI and NSInChI ONLY
inchi_chems_with_inchi_and_ns_inchi_only = [chem for chem in ecocyc_chems_in_rxns if chem.INCHI and chem.NON_STANDARD_INCHI and not chem.SMILES]

# Filter chemicals that have BOTH an InChI and a SMILES ONLY
inchi_chems_with_inchi_and_smiles_only = [chem for chem in ecocyc_chems_in_rxns if chem.INCHI and chem.SMILES and not chem.NON_STANDARD_INCHI]

# Of those, filter chemicals whose SMILES cannot be parsed as a mol
inchi_chems_with_unparsable_smiles = [chem for chem in inchi_chems_with_inchi_and_smiles_only if not Chem.MolFromSmiles(chem.SMILES)]

# Print the counts
print(f"Number of chemicals with InChI: {len(inchi_chems_with_inchi)}")
print(f"Number of chemicals with InChI that cannot be parsed: {len(inchi_chems_with_unparsable_inchi)}")
print(f"Number of chemicals with ONLY InChI: {len(inchi_chems_with_inchi_only)}")
print(f"Number of chemicals with BOTH InChI and NSInChI ONLY: {len(inchi_chems_with_inchi_and_ns_inchi_only)}")
print(f"Number of chemicals with BOTH InChI and SMILES ONLY: {len(inchi_chems_with_inchi_and_smiles_only)}")
print(f"Number of chemicals with InChI and SMILES where SMILES cannot be parsed: {len(inchi_chems_with_unparsable_smiles)}")



Number of chemicals with InChI: 1454
Number of chemicals with InChI that cannot be parsed: 0
Number of chemicals with ONLY InChI: 0
Number of chemicals with BOTH InChI and NSInChI ONLY: 0
Number of chemicals with BOTH InChI and SMILES ONLY: 11
Number of chemicals with InChI and SMILES where SMILES cannot be parsed: 0


In [14]:
# Filter chemicals that have an InChI and NO SMILES
inchi_chems_with_inchi_no_smiles = [chem for chem in ecocyc_chems_in_rxns if chem.INCHI and not chem.SMILES]

# Print the count
print(f"Number of chemicals with InChI and NO SMILES: {len(inchi_chems_with_inchi_no_smiles)}")

Number of chemicals with InChI and NO SMILES: 0


### Non-Standard Inchi

In [15]:
# Filter chemicals that have a NSInChI
nsinchi_chems_with_ns_inchi = [chem for chem in ecocyc_chems_in_rxns if chem.NON_STANDARD_INCHI]

# Filter chemicals that have a NSInChI, excluding those with NON_STANDARD_INCHI set to ':STANDARD-INCHI'
# and check if the remaining NSInChIs cannot be parsed as a mol
nsinchi_chems_with_unparsable_ns_inchi = [
    chem for chem in nsinchi_chems_with_ns_inchi
    if chem.NON_STANDARD_INCHI != ':STANDARD-INCHI' and not safe_mol_from_inchi(chem.NON_STANDARD_INCHI)
]

# Filter chemicals that have ONLY a NSInChI (No InChI or SMILES)
nsinchi_chems_with_ns_inchi_only = [chem for chem in ecocyc_chems_in_rxns if chem.NON_STANDARD_INCHI and not chem.INCHI and not chem.SMILES]

# Filter chemicals that have BOTH a NSInChI and a SMILES ONLY
nsinchi_chems_with_ns_inchi_and_smiles_only = [chem for chem in ecocyc_chems_in_rxns if chem.NON_STANDARD_INCHI and chem.SMILES and not chem.INCHI]

# Of those, filter chemicals whose SMILES can be parsed as a SMILES
nsinchi_chems_with_parsable_smiles = [chem for chem in nsinchi_chems_with_ns_inchi_and_smiles_only if safe_mol_from_smiles(chem.SMILES)]

# Of those, filter chemicals whose SMILES can be parsed as a SMIRKS
nsinchi_chems_with_parsable_smirks = [chem for chem in nsinchi_chems_with_ns_inchi_and_smiles_only if safe_mol_from_smarts(chem.SMILES)]

# Filter chemicals that have BOTH a NSInChI and SMILES ONLY and whose SMILES can be parsed as both SMILES and SMIRKS
nsinchi_chems_with_smiles_parsable_as_both = [chem for chem in nsinchi_chems_with_ns_inchi_and_smiles_only if safe_mol_from_smiles(chem.SMILES) and safe_mol_from_smarts(chem.SMILES)]

# Of those, filter chemicals whose SMILES cannot be parsed as a SMILES or SMIRKS
nsinchi_chems_with_unparsable_smiles_smirks = [chem for chem in nsinchi_chems_with_ns_inchi_and_smiles_only if not safe_mol_from_smiles(chem.SMILES) and not safe_mol_from_smarts(chem.SMILES)]

# Print the counts
print(f"Number of chemicals with NSInChI: {len(nsinchi_chems_with_ns_inchi)}")
print(f"Number of chemicals with NSInChI (excluding ':STANDARD-INCHI') that cannot be parsed: {len(nsinchi_chems_with_unparsable_ns_inchi)}")
print(f"Number of chemicals with ONLY NSInChI: {len(nsinchi_chems_with_ns_inchi_only)}")
print(f"Number of chemicals with BOTH NSInChI and SMILES ONLY: {len(nsinchi_chems_with_ns_inchi_and_smiles_only)}")
print(f"Number of chemicals with NSInChI and SMILES where SMILES can be parsed as SMILES: {len(nsinchi_chems_with_parsable_smiles)}")
print(f"Number of chemicals with NSInChI and SMILES where SMILES can be parsed as SMIRKS: {len(nsinchi_chems_with_parsable_smirks)}")
print(f"Number of chemicals with BOTH NSInChI and SMILES ONLY where SMILES can be parsed as BOTH SMILES and SMIRKS: {len(nsinchi_chems_with_smiles_parsable_as_both)}")
print(f"Number of chemicals with NSInChI and SMILES where SMILES cannot be parsed as SMILES or SMIRKS: {len(nsinchi_chems_with_unparsable_smiles_smirks)}")



Number of chemicals with NSInChI: 1963
Number of chemicals with NSInChI (excluding ':STANDARD-INCHI') that cannot be parsed: 1
Number of chemicals with ONLY NSInChI: 1
Number of chemicals with BOTH NSInChI and SMILES ONLY: 519
Number of chemicals with NSInChI and SMILES where SMILES can be parsed as SMILES: 6
Number of chemicals with NSInChI and SMILES where SMILES can be parsed as SMIRKS: 180
Number of chemicals with BOTH NSInChI and SMILES ONLY where SMILES can be parsed as BOTH SMILES and SMIRKS: 6
Number of chemicals with NSInChI and SMILES where SMILES cannot be parsed as SMILES or SMIRKS: 339


### SMILES

In [16]:
# Filter chemicals that have a SMILES
smiles_chems_with_smiles = [chem for chem in ecocyc_chems_in_rxns if chem.SMILES]

# Of those, filter chemicals whose SMILES cannot be parsed as a SMILES or SMIRKS
smiles_chems_with_unparsable_smiles = [
    chem for chem in smiles_chems_with_smiles
    if not safe_mol_from_smiles(chem.SMILES) and not safe_mol_from_smarts(chem.SMILES)
]

# Filter chemicals that have ONLY a SMILES (No InChI or NSInChI)
smiles_chems_with_smiles_only = [
    chem for chem in ecocyc_chems_in_rxns
    if chem.SMILES and not chem.INCHI and not chem.NON_STANDARD_INCHI
]

# Of those, filter chemicals whose SMILES can be parsed as a SMILES
smiles_chems_with_parsable_smiles = [
    chem for chem in smiles_chems_with_smiles_only
    if safe_mol_from_smiles(chem.SMILES)
]

# Of those, filter chemicals whose SMILES can be parsed as a SMIRKS
smiles_chems_with_parsable_smirks = [
    chem for chem in smiles_chems_with_smiles_only
    if safe_mol_from_smarts(chem.SMILES)
]

# Of those, filter chemicals whose SMILES can be parsed BOTH as a SMILES or a SMIRKS
smiles_chems_parsable_as_both = [
    chem for chem in smiles_chems_with_smiles_only
    if safe_mol_from_smiles(chem.SMILES) and safe_mol_from_smarts(chem.SMILES)
]

# Filter chemicals that have ONLY a SMILES and whose SMILES cannot be parsed as either a SMILES or a SMIRKS
smiles_chems_with_smiles_only_unparsable = [
    chem for chem in smiles_chems_with_smiles_only
    if not safe_mol_from_smiles(chem.SMILES) and not safe_mol_from_smarts(chem.SMILES)
]

# Print the counts
print(f"Number of chemicals with SMILES: {len(smiles_chems_with_smiles)}")
print(f"Number of chemicals with SMILES that cannot be parsed as SMILES or SMIRKS: {len(smiles_chems_with_unparsable_smiles)}")
print(f"Number of chemicals with ONLY SMILES: {len(smiles_chems_with_smiles_only)}")
print(f"Number of chemicals with ONLY SMILES that can be parsed as SMILES: {len(smiles_chems_with_parsable_smiles)}")
print(f"Number of chemicals with ONLY SMILES that can be parsed as SMIRKS: {len(smiles_chems_with_parsable_smirks)}")
print(f"Number of chemicals with ONLY SMILES that can be parsed as BOTH SMILES and SMIRKS: {len(smiles_chems_parsable_as_both)}")
print(f"Number of chemicals with ONLY SMILES that cannot be parsed as SMILES or SMIRKS: {len(smiles_chems_with_smiles_only_unparsable)}")



Number of chemicals with SMILES: 2013
Number of chemicals with SMILES that cannot be parsed as SMILES or SMIRKS: 368
Number of chemicals with ONLY SMILES: 40
Number of chemicals with ONLY SMILES that can be parsed as SMILES: 1
Number of chemicals with ONLY SMILES that can be parsed as SMIRKS: 11
Number of chemicals with ONLY SMILES that can be parsed as BOTH SMILES and SMIRKS: 1
Number of chemicals with ONLY SMILES that cannot be parsed as SMILES or SMIRKS: 29


###Print List

In [17]:
def print_chem_list(chem_list):
    """
    Prints each EcoCyc_Chemical object in the provided list on a separate line.

    :param chem_list: List of EcoCyc_Chemical objects.
    """
    for chem in chem_list:
        print(chem)

# Call the function with chems_with_no_structural_info list
print_chem_list(chems_with_no_structural_info)


EcoCyc_Chemical(UNIQUE_ID='bacitracin', COMMON_NAME='bacitracin', SMILES='', INCHI='', NON_STANDARD_INCHI='')
EcoCyc_Chemical(UNIQUE_ID='Lysophosphatidylglycerols', COMMON_NAME='a lysophosphatidylglycerol', SMILES='', INCHI='', NON_STANDARD_INCHI='')
EcoCyc_Chemical(UNIQUE_ID='Lactones', COMMON_NAME='a lactone', SMILES='', INCHI='', NON_STANDARD_INCHI='')
EcoCyc_Chemical(UNIQUE_ID='IS30-Insertion-Sequences', COMMON_NAME='an insertion sequence element IS30', SMILES='', INCHI='', NON_STANDARD_INCHI='')
EcoCyc_Chemical(UNIQUE_ID='Resolution-of-Recombinational-Junction', COMMON_NAME='resolution of recombinational junction formation of two intact strands', SMILES='', INCHI='', NON_STANDARD_INCHI='')
EcoCyc_Chemical(UNIQUE_ID='CPD-18500', COMMON_NAME='calcium hydrogenphosphate', SMILES='', INCHI='', NON_STANDARD_INCHI='')
EcoCyc_Chemical(UNIQUE_ID='Aliphatic-N-Acetyl-Diamines', COMMON_NAME='', SMILES='', INCHI='', NON_STANDARD_INCHI='')
EcoCyc_Chemical(UNIQUE_ID='Ferric-Hydroxamate-Complexes

Export your print list to CSV file

In [18]:
import csv

def export_chem_list_to_csv(chem_list, file_name):
    """
    Exports each EcoCyc_Chemical object in the provided list to a CSV file.

    :param chem_list: List of EcoCyc_Chemical objects.
    :param file_name: The name of the file to which the data should be written.
    """
    with open(file_name, mode='w', newline='') as file:
        writer = csv.writer(file)
        # Write the header
        writer.writerow(['UNIQUE_ID', 'COMMON_NAME', 'SMILES', 'INCHI', 'NON_STANDARD_INCHI'])
        # Write the chemical data
        for chem in chem_list:
            writer.writerow([chem.UNIQUE_ID, chem.COMMON_NAME, chem.SMILES, chem.INCHI, chem.NON_STANDARD_INCHI])

# Call the function with chems_with_no_structural_info list
export_chem_list_to_csv(smiles_chems_with_smiles_only_unparsable, "smiles_chems_with_smiles_only_unparsable.csv")