In [None]:
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
import os
import reframed

## Downloading XML Files and Storing All Model Data

In [None]:
import subprocess
#curl 'http://bigg.ucsd.edu/api/v2/models/iND750/download'
# Ask the BiGG API for the catalogue of available models (JSON response body).
# --fail makes curl exit non-zero on an HTTP error status, so check=True
# actually stops the notebook instead of handing an error page to json.loads;
# --silent/--show-error hide the progress meter but keep real error messages.
result = subprocess.run(
    ["curl", "--fail", "--silent", "--show-error", "http://bigg.ucsd.edu/api/v2/models"],
    stdout=subprocess.PIPE,
    check=True
)
output = result.stdout.decode("utf-8")
print(output)

In [None]:
import json

# Decode the API response and collect the BiGG identifier of every listed model.
data = json.loads(output)
bigg_ids = list(map(lambda entry: entry['bigg_id'], data['results']))

# Show the ids followed by how many there are (one value per line).
print(bigg_ids, len(bigg_ids), sep="\n")

In [None]:
# Fetch each model's SBML file from BiGG into the local "models" folder
# (the folder the later cells load from). Files that already exist are
# skipped, so re-running the notebook does not download the whole
# collection again.
current_dir = os.getcwd()
models_dir = os.path.join(current_dir, "models")
os.makedirs(models_dir, exist_ok=True)

for model_id in bigg_ids:
    file_path = os.path.join(models_dir, f"{model_id}.xml")
    if os.path.exists(file_path):
        continue  # already downloaded on an earlier run

    # --fail: a missing model (HTTP error) raises via check=True instead of
    # saving the server's error page as if it were a model file.
    result = subprocess.run(
        ["curl", "--fail", "--silent", "--show-error",
         f"http://bigg.ucsd.edu/static/models/{model_id}.xml"],
        stdout=subprocess.PIPE,
        check=True
    )
    xml_output = result.stdout.decode("utf-8")
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(xml_output)
    print(f"Saved output to {file_path}")

In [None]:
import reframed
import pandas as pd

In [None]:
# Get the names of all .xml files in the "models" folder of the repo.
# Entries that are not .xml files are left out, and the names are sorted
# because os.listdir returns them in arbitrary order.
xml_files = sorted(
    f for f in os.listdir(os.path.join(current_dir, 'models'))
    if f.endswith('.xml')
)

In [None]:
# Load every BiGG model from the local "models" folder into {model_id: model}.
models = {}
for model_id in bigg_ids:
    # `model` is deliberately bound at top level as well: a later cell inspects it.
    model = models[model_id] = reframed.load_cbmodel(f"{current_dir}/models/{model_id}.xml")

In [None]:
# Display the {model_id: model} dictionary using the notebook's default repr.
models

In [None]:
# models = {}
# for model_id in bigg_ids:
#     try:
#         model = reframed.load_cbmodel(f"{model_id}.xml")
#         models[model_id] = model
#     except:
#         print(f"{model_id} is not well formed")

In [None]:
# `model` is the variable left over from the model-loading loop, i.e. the
# model loaded for the last id in bigg_ids.
print(type(model))

In [None]:
# Pick the model stored under the 'e_coli_core' id out of the loaded models;
# raises KeyError if that id was not loaded.
ecoli_model = models['e_coli_core']

In [None]:
# define function to parse through model data and turn into dfs, then store dfs in dictionary

def _first_attr(obj, names, default=None):
    """Return the value of the first attribute in `names` that `obj` has, else `default`."""
    for attr_name in names:
        if hasattr(obj, attr_name):
            return getattr(obj, attr_name)
    return default


def get_model_data(model):
    '''
    Given a model, return a dictionary of DataFrames describing it.

    Parameters
    ----------
    model : object
        Must expose `reactions`, `metabolites` and `genes` as mappings whose
        values are the individual reaction / metabolite / gene objects, each
        with at least an `id` attribute.

    Returns
    -------
    dict
        {'reactions': DataFrame, 'metabolites': DataFrame, 'genes': DataFrame}.
        Every frame always carries its full set of columns, even when the
        model has no entries of that kind.

    Notes
    -----
    Optional attributes that an object lacks fall back to a default
    ('' for names, None / {} / [] / 0 otherwise) rather than raising.
    '''
    reaction_columns = ['id', 'name', 'lower_bound', 'upper_bound',
                        'stoichiometry', 'objective_coefficient']
    metabolite_columns = ['id', 'name', 'formula', 'charge']
    gene_columns = ['id', 'name', 'associated_reactions']

    # get reactions
    # Bounds and objective are read under the cobra-style names first and, when
    # those are absent, under `lb` / `ub` / `objective`.
    # NOTE(review): reframed's CBReaction is understood to use the short names -
    # confirm against the installed reframed version; also confirm the database
    # accepts infinite bounds if any model carries them.
    reactions_data = [
        {
            'id': rxn.id,
            'name': getattr(rxn, 'name', ''),
            'lower_bound': _first_attr(rxn, ('lower_bound', 'lb')),
            'upper_bound': _first_attr(rxn, ('upper_bound', 'ub')),
            # typically a dict mapping metabolite IDs to coefficients
            'stoichiometry': getattr(rxn, 'stoichiometry', {}),
            'objective_coefficient': _first_attr(
                rxn, ('objective_coefficient', 'objective'), 0
            ),
        }
        for rxn in model.reactions.values()
    ]

    # get metabolites
    # NOTE(review): formula / charge come out as None when the metabolite object
    # has no such attributes - check where reframed stores them (possibly in a
    # metadata mapping) if these columns turn out empty.
    metabolites_data = [
        {
            'id': met.id,
            'name': getattr(met, 'name', ''),
            'formula': getattr(met, 'formula', None),
            'charge': getattr(met, 'charge', None),
        }
        for met in model.metabolites.values()
    ]

    # get genes
    genes_data = [
        {
            'id': gene.id,
            'name': getattr(gene, 'name', ''),
            'associated_reactions': getattr(gene, 'reactions', []),
        }
        for gene in model.genes.values()
    ]

    # one DataFrame per category, keyed by the category name
    return {
        'reactions': pd.DataFrame(reactions_data, columns=reaction_columns),
        'metabolites': pd.DataFrame(metabolites_data, columns=metabolite_columns),
        'genes': pd.DataFrame(genes_data, columns=gene_columns),
    }

In [None]:
# Build the E. coli tables once, then bind each one to its own name.
ecoli_data = get_model_data(ecoli_model)
ecoli_rxn, ecoli_met, ecoli_genes = (
    ecoli_data[table] for table in ('reactions', 'metabolites', 'genes')
)

In [None]:
# Print a plain-text preview (first rows) of each of the three E. coli tables.
for heading, frame in (
    ("Reactions DataFrame:", ecoli_rxn),
    ("\nMetabolites DataFrame:", ecoli_met),
    ("\nGenes DataFrame:", ecoli_genes),
):
    print(heading)
    print(frame.head())

In [None]:
# First rows of the E. coli reactions table (rendered by the notebook).
ecoli_rxn.head()

In [None]:
# First rows of the E. coli metabolites table (rendered by the notebook).
ecoli_met.head()

In [None]:
# First rows of the E. coli genes table (rendered by the notebook).
ecoli_genes.head()

## Connect MySQL to Python Using SQLAlchemy and Add Data to Tables

In [None]:
from sqlalchemy import create_engine

In [None]:
# Set the MySQL credentials as environment variables (user and password):
    # user = root
    # password = your own root password (uncomment the two lines below to set them; after the pwd variable is set, comment them out again and delete your password from this cell)
    
# %env user = root
# %env password = {pwd}

In [None]:
# Make our connection to db.
# Credentials are read from the `user` / `password` environment variables so
# they are never written into the notebook; a missing variable raises KeyError.
from urllib.parse import quote

# set parameters first
host = "localhost"
db = "metabolic_pathways"
user = os.environ['user']
pw = os.environ['password']

# create connection using params
# The credentials are percent-encoded before going into the URL: characters
# such as "@", ":" or "/" in a password would otherwise be read as URL
# delimiters and produce a wrong host or login.
con = create_engine(
    f"mysql+mysqlconnector://{quote(user, safe='')}:{quote(pw, safe='')}@{host}/{db}",
    echo=False,
)

In [None]:
# create function to add reaction data from all models to reaction table

def add_reactions(bigg_ids):
    """
    Append the reactions of each given model to the `reaction` table in MySQL.

    Parameters
    ----------
    bigg_ids : dict or iterable
        Either a {model_id: model} dictionary (such as the notebook-level
        `models`), or an iterable whose entries are model ids or model
        objects. String ids are looked up in the notebook-level `models`
        dictionary.

    Returns
    -------
    None
        Rows are written through the notebook-level connection `con`.

    Raises
    ------
    KeyError
        If a string id is not present in `models`. All ids are resolved
        before anything is written, so a bad id writes no rows.
    """
    # Resolve the argument to model objects first: get_model_data needs the
    # model itself, not its id.
    if isinstance(bigg_ids, dict):
        loaded_models = list(bigg_ids.values())
    else:
        loaded_models = [
            models[entry] if isinstance(entry, str) else entry
            for entry in bigg_ids
        ]

    # stoichiometry is left out of the columns written to the reaction table
    reaction_columns = ['id', 'name', 'lower_bound', 'upper_bound', 'objective_coefficient']

    # for each model
    for model in loaded_models:
        model_rxn = get_model_data(model)['reactions']

        # add rxn data to reaction table
        short_rxn = model_rxn[reaction_columns]
        short_rxn.to_sql('reaction', con, if_exists='append', index=False)
    

In [None]:
# call add_reactions() to write reaction data for all models to reaction table
# (`models` is the {model_id: model} dictionary built when the models were loaded)
add_reactions(models)