# Downloading and reading Drive files with gdown
This is probably the best approach to take for your 134 projects.

You can use a Google Drive-specific library (gdown). This approach also saves the file (transiently) to the Colab runtime's local disk (under /content), so it will show up in the 'Files' pane.

In [None]:
!pip install -q gdown

import gdown

def download_file(file_id, output_filename):
    """
    Fetch one Google Drive file by its ID and write it to a local path.

    The transfer is delegated to ``gdown.download``, called with
    ``quiet=False``.

    :param file_id: str, Google Drive file ID.
    :param output_filename: str, Path the downloaded file is written to.
    """
    gdown.download(
        'https://drive.google.com/uc?id={}'.format(file_id),
        output_filename,
        quiet=False,
    )

def preview_file(output_filename, num_lines=5):
    """
    Print the first n lines of a text file.

    Each line is printed with leading/trailing whitespace stripped. If the
    file ends before ``num_lines`` lines have been read, a notice saying so is
    printed. Any other error (for example a missing file) is reported with a
    printed message instead of being raised.

    :param output_filename: str, Name of the file to preview.
    :param num_lines: int, Number of lines to preview. Default is 5.
    """
    try:
        with open(output_filename, 'r') as file:
            print(f"Previewing first {num_lines} lines of {output_filename}:")
            for _ in range(num_lines):
                print(next(file).strip())
            print("\n")
    except StopIteration:
        # This handler has to come before the generic one: StopIteration is a
        # subclass of Exception, so listing it second made it unreachable.
        print(f"{output_filename} may have less than {num_lines} lines to preview.\n")
    except Exception as e:
        # Best-effort preview: report the problem and carry on.
        print(f"An error occurred while previewing the file: {str(e)}")

# Local file name -> Google Drive file ID
files_info = {
    "metacyc_chemicals.txt": "1J_LC34OAlTCC3FS1NP3RKessqN3V_Lk_",
    "metacyc_reactions.txt": "1quJmoWI-aqjwCJ0YNAT5RRrqbY6G6-9i",
    "universal_metabolites.txt": "1UbDsaEOTr3uAUYTTcC06Z2Gfzgr5noHF",
    "minimal_metabolites.txt": "1YQciQaQ8mM68F0cP4g0o8ldwE4p52-aH",
}

# Download every listed file (nothing is previewed inside this loop)
for filename, file_id in files_info.items():
    download_file(file_id=file_id, output_filename=filename)

# Preview only the chemicals file
preview_file("metacyc_chemicals.txt")


Downloading...
From: https://drive.google.com/uc?id=1J_LC34OAlTCC3FS1NP3RKessqN3V_Lk_
To: /content/metacyc_chemicals.txt
100%|██████████| 125M/125M [00:01<00:00, 67.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1quJmoWI-aqjwCJ0YNAT5RRrqbY6G6-9i
To: /content/metacyc_reactions.txt
100%|██████████| 32.8M/32.8M [00:00<00:00, 139MB/s]
Downloading...
From: https://drive.google.com/uc?id=1UbDsaEOTr3uAUYTTcC06Z2Gfzgr5noHF
To: /content/universal_metabolites.txt
100%|██████████| 8.62k/8.62k [00:00<00:00, 11.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1YQciQaQ8mM68F0cP4g0o8ldwE4p52-aH
To: /content/minimal_metabolites.txt
100%|██████████| 1.73k/1.73k [00:00<00:00, 5.07MB/s]

Previewing first 5 lines of metacyc_chemicals.txt:
id	name	inchi
0	s-adenosyl-l-methionine	InChI=1S/C15H22N6O5S/c1-27(3-2-7(16)15(24)25)4-8-10(22)11(23)14(26-8)21-6-20-9-12(17)18-5-19-13(9)21/h5-8,10-11,14,22-23H,2-4,16H2,1H3,(H2-,17,18,19,24,25)/p+1/t7-,8+,10+,11+,14+,27?/m0/s1	null
1	undefined	InChI=/FAKE/METACYC/abau903895cyc/biopax-level3.owl/#ChemicalStructure57245	C2(N(C1(OC(COP(O[a tRNA])(=O)[O-])C(OP([O-])(=O)O[a tRNA])C(O)1))C(=O)N=C(N)C=2)
2	undefined	InChI=/FAKE/METACYC/abau903895cyc/biopax-level3.owl/#ChemicalStructure47307	C2(N(C1(OC(COP(O[tRNA<sup>Ile2</sup>])(=O)[O-])C(OP([O-])(=O)O[tRNA<sup>Ile2</sup>])C(O)1))C(=O)N=C(N)C=2)
3	undefined	InChI=/FAKE/METACYC/abau903895cyc/biopax-level3.owl/#ChemicalStructure57251	COC2(C(N1(C(=O)N=C(N)C=C1))OC(COP(O[a tRNA])(=O)[O-])C(OP([O-])(=O)O[a tRNA])2)







# Embedded files in a colab
You can also click the folder button on the left, upload files to the Colab runtime, and then access them by path as shown below.

In [None]:
# Location of the file to peek at
file_path = '/content/metacyc_chemicals.txt'

# Print at most the first 6 lines, each stripped of surrounding whitespace
try:
    with open(file_path, 'r') as file:
        line_count = 0  # number of lines printed so far

        for line in file:
            # Stop as soon as the quota of 6 printed lines is reached
            if line_count >= 6:
                break
            print(line.strip())
            line_count += 1
except FileNotFoundError:
    print(f"No file found at {file_path}")


id	name	inchi
0	s-adenosyl-l-methionine	InChI=1S/C15H22N6O5S/c1-27(3-2-7(16)15(24)25)4-8-10(22)11(23)14(26-8)21-6-20-9-12(17)18-5-19-13(9)21/h5-8,10-11,14,22-23H,2-4,16H2,1H3,(H2-,17,18,19,24,25)/p+1/t7-,8+,10+,11+,14+,27?/m0/s1	null
1	undefined	InChI=/FAKE/METACYC/abau903895cyc/biopax-level3.owl/#ChemicalStructure57245	C2(N(C1(OC(COP(O[a tRNA])(=O)[O-])C(OP([O-])(=O)O[a tRNA])C(O)1))C(=O)N=C(N)C=2)
2	undefined	InChI=/FAKE/METACYC/abau903895cyc/biopax-level3.owl/#ChemicalStructure47307	C2(N(C1(OC(COP(O[tRNA<sup>Ile2</sup>])(=O)[O-])C(OP([O-])(=O)O[tRNA<sup>Ile2</sup>])C(O)1))C(=O)N=C(N)C=2)
3	undefined	InChI=/FAKE/METACYC/abau903895cyc/biopax-level3.owl/#ChemicalStructure57251	COC2(C(N1(C(=O)N=C(N)C=C1))OC(COP(O[a tRNA])(=O)[O-])C(OP([O-])(=O)O[a tRNA])2)
4	h+	InChI=1S/p+1	null


# Defining Chemical and Reaction classes

In [None]:


class Chemical:
    """
    One chemical record: numeric ID, name, SMILES string and InChI string.

    No ``__eq__``/``__hash__`` is defined, so instances compare and hash by
    identity; two objects built from the same data are distinct dict keys and
    set members.
    """

    def __init__(self, id, name, smiles, inchi):
        """
        :param id: value convertible with ``int()``; stored as an int.
            (The name shadows the builtin but is kept for caller compatibility.)
        :param name: chemical name, stored as given.
        :param smiles: SMILES value, stored as given.
        :param inchi: InChI value, stored as given.
        """
        self.id = int(id)
        self.smiles = smiles
        self.inchi = inchi
        self.name = name

    def __repr__(self):
        # A readable form for notebook/debug output instead of the default
        # "<__main__.Chemical at 0x...>".
        return f"Chemical(id={self.id!r}, name={self.name!r})"

class Reaction:
    """
    One reaction: numeric ID, substrate and product collections, EC number.

    No ``__eq__``/``__hash__`` is defined, so instances compare and hash by
    identity.
    """

    def __init__(self, id, substrates, products, ec_num):
        """
        :param id: value convertible with ``int()``; stored as an int.
            (The name shadows the builtin but is kept for caller compatibility.)
        :param substrates: set of Chemical instances consumed by the reaction.
        :param products: set of Chemical instances produced by the reaction.
        :param ec_num: EC number, stored as given.
        """
        self.id = int(id)
        self.substrates = substrates  # Set of Chemical instances
        self.products = products  # Set of Chemical instances
        self.ec_num = ec_num

    def get_substrates(self):
        """Return the substrates collection itself (not a copy)."""
        return self.substrates

    def get_products(self):
        """Return the products collection itself (not a copy)."""
        return self.products

    def __repr__(self):
        # Only sizes are shown so the repr stays short for large reactions.
        return (
            f"Reaction(id={self.id!r}, ec_num={self.ec_num!r}, "
            f"substrates={len(self.substrates)}, products={len(self.products)})"
        )



class Metabolite:
    """Plain record holding a metabolite's name, InChI and descriptor."""

    def __init__(self, name, inchi, descriptor):
        # descriptor is a category label (amino acid, etc.)
        self.name, self.inchi, self.descriptor = name, inchi, descriptor

# Parsing the two metacyc files into a dict of Chemical objects and a list of Reaction objects

In [None]:
def load_chemicals(file_path):
    """
    Parse a tab-separated chemicals file into a dict keyed by chemical ID.

    The first line is treated as a header and skipped. Every other line is
    split on tabs and read as: ID, name, InChI, SMILES (fields 0-3). Lines
    with fewer than four fields, or whose ID field is not a decimal number,
    are skipped. If an ID occurs more than once, the last occurrence wins.

    :param file_path: str, path of the chemicals file.
    :return: dict mapping int chemical ID to a Chemical instance.
    """
    chemicals = {}
    with open(file_path, 'r') as file:
        # Stream the file instead of loading it whole; the default also
        # makes an empty file yield an empty dict.
        next(file, None)  # Skip the header line
        for line in file:
            parts = line.rstrip('\n').split('\t')
            # isdecimal() (not isdigit()) so that everything accepted here is
            # also accepted by int() below.
            if len(parts) < 4 or not parts[0].isdecimal():
                continue  # Skip lines with insufficient data or non-numeric ID
            chem_id = int(parts[0])
            # File column order is name, InChI, SMILES; Chemical takes
            # (id, name, smiles, inchi).
            chemicals[chem_id] = Chemical(chem_id, parts[1], parts[3], parts[2])
    return chemicals

# load_reactions below takes the dict returned by load_chemicals so it can resolve chemical IDs.


def load_reactions(file_path, chemicals):
    """
    Parse a tab-separated reactions file into a list of Reaction objects.

    The first line is treated as a header and skipped. Every other line is
    stripped, split on tabs and read as: reaction ID, EC number, substrates,
    products (fields 0-3). The substrate and product fields are
    whitespace-separated chemical IDs; tokens that are not all digits are
    ignored. Lines with fewer than four fields are skipped.

    :param file_path: str, path of the reactions file.
    :param chemicals: mapping of int chemical ID to Chemical, used to resolve
        the IDs listed in each reaction.
    :return: list of Reaction instances in file order.
    :raises KeyError: if a listed chemical ID is missing from ``chemicals``.
    """
    reactions = []
    with open(file_path, 'r') as file:
        next(file, None)  # Skip header line; the default tolerates an empty file
        for line in file:
            parts = line.strip().split('\t')
            # Four fields are unpacked below, so four are required. The old
            # threshold of 3 let three-field lines through to an IndexError.
            if len(parts) < 4:
                continue  # Skip lines with insufficient data
            rxn_id, ec_num, substrates_str, products_str = parts[:4]
            substrates = {chemicals[int(cid)] for cid in substrates_str.split() if cid.isdigit()}
            products = {chemicals[int(cid)] for cid in products_str.split() if cid.isdigit()}
            reactions.append(Reaction(rxn_id, substrates, products, ec_num))
    return reactions




# File paths
chemicals_file_path = 'metacyc_chemicals.txt'
reactions_file_path = 'metacyc_reactions.txt'


# Load the data into objects
chemicals = load_chemicals(chemicals_file_path)
reactions = load_reactions(reactions_file_path, chemicals)

# Print the 5 chemicals with the lowest IDs
print("First 5 chemicals:")
for chem_id in sorted(chemicals)[:5]:
    chemical = chemicals[chem_id]
    print(f"ID: {chemical.id}, Name: {chemical.name}, InChI: {chemical.inchi}, SMILES: {chemical.smiles}")

# Print the 5 reactions with the lowest IDs
print("\nFirst 5 reactions:")
for reaction in sorted(reactions, key=lambda r: r.id)[:5]:
    # Both lists hold chemical IDs (substrates used to hold names, which did
    # not match the variable name or the products list). The IDs are sorted
    # because set iteration order is arbitrary.
    substrates_ids = [str(cid) for cid in sorted(chem.id for chem in reaction.substrates)]
    products_ids = [str(cid) for cid in sorted(chem.id for chem in reaction.products)]
    print(f"ID: {reaction.id}, EC Number: {reaction.ec_num}, Substrates: {substrates_ids}, Products: {products_ids}")

First 5 chemicals:
ID: 0, Name: s-adenosyl-l-methionine, InChI: InChI=1S/C15H22N6O5S/c1-27(3-2-7(16)15(24)25)4-8-10(22)11(23)14(26-8)21-6-20-9-12(17)18-5-19-13(9)21/h5-8,10-11,14,22-23H,2-4,16H2,1H3,(H2-,17,18,19,24,25)/p+1/t7-,8+,10+,11+,14+,27?/m0/s1, SMILES: null
ID: 1, Name: undefined, InChI: InChI=/FAKE/METACYC/abau903895cyc/biopax-level3.owl/#ChemicalStructure57245, SMILES: C2(N(C1(OC(COP(O[a tRNA])(=O)[O-])C(OP([O-])(=O)O[a tRNA])C(O)1))C(=O)N=C(N)C=2)
ID: 2, Name: undefined, InChI: InChI=/FAKE/METACYC/abau903895cyc/biopax-level3.owl/#ChemicalStructure47307, SMILES: C2(N(C1(OC(COP(O[tRNA<sup>Ile2</sup>])(=O)[O-])C(OP([O-])(=O)O[tRNA<sup>Ile2</sup>])C(O)1))C(=O)N=C(N)C=2)
ID: 3, Name: undefined, InChI: InChI=/FAKE/METACYC/abau903895cyc/biopax-level3.owl/#ChemicalStructure57251, SMILES: COC2(C(N1(C(=O)N=C(N)C=C1))OC(COP(O[a tRNA])(=O)[O-])C(OP([O-])(=O)O[a tRNA])2)
ID: 4, Name: h+, InChI: InChI=1S/p+1, SMILES: null

First 5 reactions:
ID: 0, EC Number: 2.1.1.207, Substrates: ['und

In [None]:
def load_metabolites(file_path):
    """
    Read the InChI column of a tab-separated metabolites file.

    The first line is treated as a header and skipped. For every other line
    the second tab-separated field is taken, all double-quote characters are
    removed from it, and the result is collected. Lines that have no second
    field (blank lines, lines without a tab) are skipped.

    :param file_path: str, path of the metabolites file.
    :return: list of InChI strings in file order (duplicates are kept).
    """
    with open(file_path, 'r') as file:
        lines = file.read().splitlines()

    metabolites_inchis = []
    for line in lines[1:]:  # lines[0] is the header
        parts = line.split('\t')
        if len(parts) < 2:
            # Previously raised IndexError on blank or malformed lines.
            continue
        metabolites_inchis.append(parts[1].replace('"', ''))

    return metabolites_inchis

# InChI strings read from each of the two metabolite files
minimal_inchis = load_metabolites('minimal_metabolites.txt')
universal_inchis = load_metabolites('universal_metabolites.txt')

# Minimal entries first, then universal ones; duplicates are not removed
native_inchis = [*minimal_inchis, *universal_inchis]



In [None]:
# Build the list of native chemicals

# Index every loaded chemical by its InChI string; when several chemicals
# share an InChI, the one visited last is the one kept.
chem_dict = {chem.inchi: chem for chem in chemicals.values()}

# Resolve each native InChI, dropping those with no matching chemical.
# Order follows native_inchis, and a repeated InChI gives a repeated entry.
native_chemicals = [
    native_chem
    for native_chem in map(chem_dict.get, native_inchis)
    if native_chem
]

native_chemicals


[<__main__.Chemical at 0x790a10335870>,
 <__main__.Chemical at 0x790a26917a00>,
 <__main__.Chemical at 0x790a103cd570>,
 <__main__.Chemical at 0x790a0b9d1ed0>,
 <__main__.Chemical at 0x790a07c46200>,
 <__main__.Chemical at 0x790a1004a470>,
 <__main__.Chemical at 0x790a0fa67610>,
 <__main__.Chemical at 0x790a0fb1baf0>,
 <__main__.Chemical at 0x790a0fdb4250>,
 <__main__.Chemical at 0x790a0d7f45e0>,
 <__main__.Chemical at 0x790a0f4dd960>,
 <__main__.Chemical at 0x790a269ac580>,
 <__main__.Chemical at 0x790a0fd26bf0>,
 <__main__.Chemical at 0x790a0ddafe80>,
 <__main__.Chemical at 0x790a10336410>,
 <__main__.Chemical at 0x790a1025cbb0>,
 <__main__.Chemical at 0x790a100688b0>,
 <__main__.Chemical at 0x790a1035f970>,
 <__main__.Chemical at 0x790a269ac5e0>,
 <__main__.Chemical at 0x790a269affa0>,
 <__main__.Chemical at 0x790a42ed7580>,
 <__main__.Chemical at 0x790a269ac310>,
 <__main__.Chemical at 0x790a10334bb0>,
 <__main__.Chemical at 0x790a0f115780>,
 <__main__.Chemical at 0x790a10325ab0>,


# Detecting abstract reactions and removing them

In [None]:
# To do
# !pip install rdkit-pypi
# from rdkit import Chem

# def create_molecules(chemicals):
#     molecules = []
#     for chemical in chemicals.values():
#         try:
#             molecule = Chem.MolFromSmiles(chemical.smiles)
#             if molecule is not None:
#                 molecules.append(molecule)
#         except:
#             pass
#     return molecules

# # Load the data into objects
# chemicals = load_chemicals(chemicals_file_path)
# reactions = load_reactions(reactions_file_path, chemicals)

# # Create the molecules
# molecules = create_molecules(chemicals)

# # Print the number of molecules created
# print(f"Created {len(molecules)} molecules.")


# Perform Synthesis

In [None]:
# To do
class HyperGraph:
    """
    Bundle of lookup tables describing a synthesis result.

    All four tables start as separate empty dicts. In this file only
    ``reaction_to_shell`` and ``chemical_to_shell`` are ever assigned
    (by ``Synthesizer.run``); the cascade and pathway tables are not
    populated by any code shown here.
    """

    def __init__(self):
        self.reaction_to_shell, self.chemical_to_shell = {}, {}
        self.chemical_to_cascade, self.chemical_to_pathway = {}, {}

class Synthesizer:
    """
    Expands outward from a set of native chemicals, one "shell" at a time.

    A reaction is placed in a shell once every one of its substrates has a
    shell; its products then receive that shell unless they already have one.
    """

    def initiate(self):
        """Placeholder hook; currently does nothing and returns None."""
        return None

    def run(self, reactions, chemicals, native_chemicals):
        """
        Run the expansion until a full pass places no new reaction.

        The shell number is printed after every pass that placed at least one
        reaction. The final, fruitless pass still increments ``curr_shell``,
        so that attribute ends one above the last shell that was used.

        :param reactions: iterable of reaction objects exposing
            ``get_substrates()`` and ``get_products()``; it is iterated once
            per pass, so it must be re-iterable.
        :param chemicals: stored on ``self.all_chemicals``; not otherwise
            used by this class.
        :param native_chemicals: iterable of chemicals placed in shell 0.
        :return: HyperGraph whose ``chemical_to_shell`` and
            ``reaction_to_shell`` are the dicts built during the run.
        """
        self.curr_shell = 0
        self.all_reactions = reactions
        self.all_chemicals = chemicals
        # Every native chemical starts out in shell 0
        self.chemical_to_shell = dict.fromkeys(native_chemicals, 0)
        self.reaction_to_shell = {}

        while True:
            if not self._expand_once():
                break
            print(self.curr_shell)

        output = HyperGraph()
        output.chemical_to_shell = self.chemical_to_shell
        output.reaction_to_shell = self.reaction_to_shell
        return output

    def _expand_once(self):
        """
        Perform one pass over all reactions for the next shell.

        Products enabled during a pass are visible to reactions visited later
        in the same pass, so the shell a reaction or chemical receives can
        depend on the order of ``self.all_reactions``.

        :return: True if at least one new reaction was placed in this pass.
        """
        self.curr_shell += 1
        shell = self.curr_shell
        enabled = self.chemical_to_shell
        placed = self.reaction_to_shell
        grew = False

        for rxn in self.all_reactions:
            # Already part of the expansion
            if rxn in placed:
                continue

            # Blocked: at least one substrate has no shell yet
            if any(substrate not in enabled for substrate in rxn.get_substrates()):
                continue

            # New and enabled: record it at the current shell
            placed[rxn] = shell
            grew = True

            # Give each product the current shell unless it already has one
            for product in rxn.get_products():
                enabled.setdefault(product, shell)

        return grew



In [None]:
# Run the expansion starting from the native chemicals
synthesizer = Synthesizer()
hypergraph = synthesizer.run(
    reactions=reactions,
    chemicals=chemicals,
    native_chemicals=native_chemicals,
)

# Number of chemicals that received a shell (the shell-0 natives included)
size = len(hypergraph.chemical_to_shell)

size

1
2
3
4
5
6
7
8
9
10
11
12
13


81778