In [1]:
import pandas as pd
import os
import json
import cobra
import seaborn as sbn

In [2]:
%matplotlib inline

In [3]:
data_dir = "/Users/krisj/Documents/Data/Genomewide_landscape/"

## Match ions to metabolites

In [4]:
with open("../../ETH/Data/kegg_to_bigg.json") as infile:
    old_kegg_to_bigg = json.load(infile)

with open("../../ETH/Data/kegg_to_mnx.json") as infile:
    old_kegg_to_mnx = json.load(infile)
    
with open("../../ETH/Data/mnx_to_bigg.json") as infile:
    old_mnx_to_bigg = json.load(infile)

In [5]:
chem_xref = pd.read_csv("../Data/chem_xref.tsv", sep="\t", skiprows=365)
bigg_xref = chem_xref[chem_xref["#XREF"].str.startswith("bigg:")].copy()
bigg_xref["bigg"] = bigg_xref["#XREF"].map(lambda x: x.split(":", 1)[1])
kegg_xref = chem_xref[chem_xref["#XREF"].str.startswith("kegg:")].copy()
kegg_xref["kegg"] = kegg_xref["#XREF"].map(lambda x: x.split(":", 1)[1])

In [6]:
ion_annotations = pd.read_excel(data_dir + "neg_kegg_all_3mD.xls")

In [7]:
kegg_to_mnx = {}
for idx, row in kegg_xref.iterrows():
    kegg_to_mnx.setdefault(row["kegg"], []).append(row["MNX_ID"])

mnx_to_bigg = {}
for idx, row in bigg_xref.iterrows():
    mnx_to_bigg.setdefault(row["MNX_ID"], []).append(row["bigg"])

In [8]:
kegg_to_bigg = {}
for kegg, mnx_list in kegg_to_mnx.items():
    bigg_list = []
    for mnx in mnx_list:
        if mnx in mnx_to_bigg:
            bigg_list.extend(mnx_to_bigg[mnx])
    if bigg_list:
        kegg_to_bigg[kegg] = bigg_list

In [9]:
model = cobra.io.read_sbml_model("../../ETH/Data/iJO1366.xml")
metabolite_ids = []
for met in model.metabolites:
    if met.id[-2:] in ("_c", "_p", "_e"):
        met_id = met.id[:-2]
        metabolite_ids.append(met_id)
    else:
        raise ValueError(met.id)
        
metabolite_ids = set(metabolite_ids)
print(len(metabolite_ids))

1136


In [10]:
new_model_kegg = []
for kegg, bigg in kegg_to_bigg.items():
    if set(bigg) & metabolite_ids:
        new_model_kegg.append(kegg)
        
old_model_kegg = []
for kegg, bigg in old_kegg_to_bigg.items():
    if set(bigg) & metabolite_ids:
        old_model_kegg.append(kegg)

model_kegg = set(new_model_kegg) | set(old_model_kegg)

In [11]:
kegg_descriptions = {}
for idx, row in kegg_xref.iterrows():
    kegg_descriptions[row["kegg"]] = row["Description"]

In [12]:
def get_metabolite_name(s):
    for suff in ("_c", "_e", "_p"):
        try:
            m = model.metabolites.get_by_id(s + suff)
        except KeyError:
            pass
        else:
            break
    else:
        raise RuntimeError("WTF is: " + s)
    return m.name

In [50]:
for kegg in new_model_kegg:
    print(
        kegg_descriptions[kegg]
    )
    print("   ",
        get_metabolite_name(kegg_to_bigg[kegg][0])
    )

(2E)-Hexenoyl-[acp]|trans-Hex-2-enoyl-[acp]|trans-Hex-2-enoyl-[acyl-carrier protein]
    Trans-Hex-2-enoyl-[acyl-carrier protein]
Magmitt (TN)|Magnesium oxide|Magnesium oxide (JP17/USP)
    Magnesium
5-O-(1-Carboxyvinyl)-3-phosphoshikimate|O5-(1-Carboxyvinyl)-3-phosphoshikimate
    5-O-(1-Carboxyvinyl)-3-phosphoshikimate
Lipid I|N-Acetyl-D-glucosaminyldiphospho-ditrans,octacis-undecaprenol|N-Acetyl-D-glucosaminyldiphosphoundecaprenol|N-Acetyl-alpha-D-glucosaminyl-diphospho-ditrans,octacis-undecaprenol
    Undecaprenyl diphospho N-acetyl-glucosamine
(4-Amino-2-methylpyrimidin-5-yl)methyl phosphate|4-Amino-2-methyl-5-(phosphooxymethyl)pyrimidine|4-Amino-5-hydroxymethyl-2-methylpyrimidine phosphate
    4-Amino-2-methyl-5-phosphomethylpyrimidine
ADP|Adenosine 5'-diphosphate|Adenosine 5'-phosphate
    ADP C10H12N5O10P2
Phosphite|Phosphonate|Phosphonic acid|Phosphorous acid
    Phosphonate
7,8-Dihydromonapterin 3'-triphosphate
    Dihydromonapterin-triphosphate
L-Alanyl-L-glutamate
    L-ala

RuntimeError: WTF is: chit6p

In [15]:
deprot_ion_annotations = ion_annotations[ion_annotations["mod"] == "-H(+)"]
print(len(deprot_ion_annotations))
#deprot_ion_annotations.sort_values("mz")

951


In [16]:
model_ion_annotations = deprot_ion_annotations[deprot_ion_annotations["id"].isin(new_model_kegg)]
non_model_ions = deprot_ion_annotations[~deprot_ion_annotations["ion"].isin(model_ion_annotations["ion"])]
print(len(model_ion_annotations))
print(len(non_model_ions))
print(len(model_ion_annotations[model_ion_annotations["mod"] == "-H(+)"]))

297
456
297


In [17]:
ion_to_bigg_ids = {}
mapped_metabolites = set()
for idx, row in model_ion_annotations.iterrows():
    bigg_ids = kegg_to_bigg[row["id"]]
    bigg_ids = [a for a in bigg_ids if a in metabolite_ids]
    ion_to_bigg_ids.setdefault(row["ion"], []).extend(bigg_ids)
    mapped_metabolites.update(bigg_ids)

In [18]:
len(mapped_metabolites)

292

In [19]:
len(metabolite_ids - mapped_metabolites)

844

In [20]:
model_metabolite_elements = {}
for met_id in metabolite_ids - mapped_metabolites:
    for comp in ("_c", "_p", "_e"):
        try:
            met = model.metabolites.get_by_id(met_id + comp)
        except KeyError:
            pass
        else:
            break
    else:
        raise RuntimeError("WTF!!")
    charge = met.charge
    elements = met.elements
    if charge:
        elements["H"] = elements.get("H", 0) - charge
    model_metabolite_elements[met_id] = elements

In [21]:
model_metabolite_elements["pep"]

{'C': 3, 'H': 5, 'O': 6, 'P': 1}

In [22]:
model.metabolites.pep_c.formula

'C3H2O6P'

In [23]:
ion_annotations[ion_annotations["formula"] == "C3H5O6P"]

Unnamed: 0,id,name,formula,mz,mod,score,rank,mzDelta,ion,TIC correl,average int
215,C00074,Phosphoenolpyruvate,C3H5O6P,184.984917,+OH(-),36,3,0.000653,452,0.548239,4650.199598
216,C00074,Phosphoenolpyruvate,C3H5O6P,302.920303,.H2PO4K-H(+),5,2,-0.012438,1111,0.105998,1756.700566
2904,C02798,3-Phosphonopyruvate,C3H5O6P,184.984917,+OH(-),36,3,0.000653,452,0.548239,4650.199598
2905,C02798,3-Phosphonopyruvate,C3H5O6P,302.920303,.H2PO4K-H(+),5,2,-0.012438,1111,0.105998,1756.700566


In [24]:
"5".isnumeric()

True

In [25]:
element_set = {"C", "H", "O", "N", "P", "S", "F"}
def formula_to_elements(formula):
    elements = {}
    while formula:
        if formula[0] in element_set:
            number = ""
            ele = formula[0]
            formula = formula[1:]
            while formula and formula[0] not in element_set:
                number = number + formula[0]
                formula = formula[1:]
            if number == "":
                number = "1"
            number = int(number)
            elements[ele] = number
        else:
            raise RuntimeError(formula)
    return elements

def formula_to_elements(formula):
    fields = []
    flag = 0
    while formula:
        substring = ""
        if flag: # Parse numbers
            while formula and formula[0].isnumeric():
                substring = substring + formula[0]
                formula = formula[1:]
            fields.append(substring)
        else: # Parse letters
            while formula and not formula[0].isnumeric():
                if formula[0] == formula[0].upper() and substring:
                    fields.append(substring)
                    substring = ""
                substring = substring + formula[0]
                formula = formula[1:]
            fields.append(substring)
        flag = 1 - flag
    elements = {}
    ele = ""
    for field in fields:
        if field.isnumeric():
            elements[ele] = int(field)
        else:
            ele = field
            elements[ele] = 1
    return elements

In [26]:
additional_ion_mapping = {
    44: "but",
    387: "dca",
    522: "ddca",
    3151: "cbi",
    1164: "fpram",
    674: "ttdca",
    340: None,
    815: "hdcea",
    445: "cechddd",
    614: "ggptrc",
    1054: "air",
    720: "kdo",
    286: "orn",
    546: "tyr__L",
    651: "ser__L",
    230: ["cys__D", "cys__L"],
    602: "glx",
    648: "ptrc",
    
}

In [27]:
print(len(metabolite_ids - mapped_metabolites))
print(len(metabolite_ids - mapped_metabolites - set(additional_ion_mapping.values())))

844


TypeError: unhashable type: 'list'

In [28]:
ion_to_bigg_ids

{2: ['ppal'],
 5: ['ac', 'gcald'],
 10: ['hco3'],
 13: ['hco3'],
 17: ['no3'],
 23: ['mthgxl'],
 24: ['btal'],
 26: ['aact'],
 28: ['lald__L', 'lald__D', 'acetol', 'lald__L'],
 29: ['gly'],
 35: ['so3'],
 43: ['pyr', 'msa'],
 45: ['ala__L', 'ala_B', 'ala__D', 'sarcs', 'ala__L'],
 47: ['dha', 'lac__L', 'lac__D', 'glyald', '3hpp'],
 52: ['glyc'],
 63: ['pi'],
 78: ['2obut', 'acac', 'sucsal'],
 81: ['4abut'],
 82: ['ghb'],
 88: ['glyc__R'],
 92: ['4crsol'],
 96: ['hqn'],
 98: ['csn'],
 101: ['ura'],
 112: ['1pyr5c'],
 117: ['op4en'],
 124: ['pro__L'],
 129: ['3mob'],
 133: ['aspsa', '2aobut'],
 135: ['indole'],
 136: ['val__L', 'glyb'],
 140: ['succ'],
 145: ['thr__L', 'hom__L', 'athr__L'],
 160: ['nac'],
 185: ['4mop', '3mop'],
 187: ['agm'],
 190: ['5aop', 'glu5sa', 'glu1sa'],
 191: ['leu__L', 'ile__L'],
 196: ['asn__L'],
 201: ['asp__L'],
 207: ['mal__L', 'mal__D'],
 208: ['23dhmb'],
 212: ['ade', '4hthr'],
 217: ['hxan'],
 218: ['4hoxpacd', 'pac'],
 226: ['4hbz'],
 231: ['4ahmmp'],
 2

In [29]:
print(len(non_model_ions))
print(len(non_model_ions[~non_model_ions["ion"].isin(additional_ion_mapping)]))

456
421


In [30]:
all_mapped_metabolites = []
for v in ion_to_bigg_ids.values():
    all_mapped_metabolites.extend(v)
for v in additional_ion_mapping.values():
    if v is not None:
        if isinstance(v, list):
            all_mapped_metabolites.extend(v)
        else:
            all_mapped_metabolites.append(v)
all_mapped_metabolites = set(all_mapped_metabolites)

In [31]:
len(all_mapped_metabolites)

310

In [32]:
len(mapped_metabolites)

292

In [33]:
for idx, row in ion_annotations.sort_values("ion").iterrows():
    if row["ion"] in ion_to_bigg_ids:
        continue
    formula = row["formula"]
    elements = formula_to_elements(formula)
    if elements in model_metabolite_elements.values():
        if not row["ion"] in additional_ion_mapping:
            model_met_ids = [a for a, ele in model_metabolite_elements.items() if ele == elements]
            if set(model_met_ids) & all_mapped_metabolites:
                continue
            if not "[+1]" in row["mod"]:
                print(row["id"], formula, row["ion"], row["name"], model_met_ids, "\t", row["mod"])

C00163 C3H6O2 59 Propanoate ['ppa'] 	 .H/Na-H(+)
C00424 C3H6O2 59 (S)-Lactaldehyde ['ppa'] 	 .H/Na-H(+)
C05999 C3H6O2 59 Lactaldehyde ['ppa'] 	 .H/Na-H(+)
C00969 C3H6O2 59 3-Hydroxypropanal ['ppa'] 	 .H/Na-H(+)
C05235 C3H6O2 59 Hydroxyacetone ['ppa'] 	 .H/Na-H(+)
C00937 C3H6O2 59 (R)-Lactaldehyde ['ppa'] 	 .H/Na-H(+)
C02457 C3H8O2 65 Propane-1,3-diol ['12ppd__R', '12ppd__S'] 	 .H/Na-H(+)
C00583 C3H8O2 65 Propane-1,2-diol ['12ppd__R', '12ppd__S'] 	 .H/Na-H(+)
C12313 C6H12O2 66 cis-1,2-Cyclohexanediol ['hxa'] 	 -H2O-H(+)
C03739 C6H12O2 66 trans-1,2-Cyclohexanediol ['hxa'] 	 -H2O-H(+)
C03219 C4H9NO 94 2-Methylpropanal oxime ['4abutn'] 	 .H/Na-H(+)
C00583 C3H8O2 116 Propane-1,2-diol ['12ppd__R', '12ppd__S'] 	 .H/K-H(+)
C02457 C3H8O2 116 Propane-1,3-diol ['12ppd__R', '12ppd__S'] 	 .H/K-H(+)
C12313 C6H12O2 130 cis-1,2-Cyclohexanediol ['hxa'] 	 -H(+)
C03739 C6H12O2 130 trans-1,2-Cyclohexanediol ['hxa'] 	 -H(+)
C00168 C3H4O4 168 Hydroxypyruvate ['hpyr', '2h3oppan'] 	 .H/Na-H(+)
C00383 C3H4O4 1

C05350 C9H8O4 616 2-Hydroxy-3-(4-hydroxyphenyl)propenoate ['dhcinnm', '34hpp'] 	 .H/K-H(+)
C12623 C9H8O4 616 trans-2,3-Dihydroxycinnamate ['dhcinnm', '34hpp'] 	 .H/K-H(+)
C01197 C9H8O4 616 Caffeate ['dhcinnm', '34hpp'] 	 .H/K-H(+)
C01179 C9H8O4 616 3-(4-Hydroxyphenyl)pyruvate ['dhcinnm', '34hpp'] 	 .H/K-H(+)
C02218 C3H5NO2 640 2-Aminoacrylate ['3amac'] 	 .H2PO4K-H(+)
C01146 C3H4O4 646 2-Hydroxy-3-oxopropanoate ['hpyr', '2h3oppan'] 	 .H2PO4Na-H(+)
C00383 C3H4O4 646 Malonate ['hpyr', '2h3oppan'] 	 .H2PO4Na-H(+)
C00168 C3H4O4 646 Hydroxypyruvate ['hpyr', '2h3oppan'] 	 .H2PO4Na-H(+)
C00879 C6H10O8 670 D-Galactarate ['galct__D', 'glcr'] 	 +OH(-)
C00818 C6H10O8 670 D-Glucarate ['galct__D', 'glcr'] 	 +OH(-)
C04575 C6H8O7 681 (4R,5S)-4,5,6-Trihydroxy-2,3-dioxohexanoate ['25dkglcn'] 	 .H/K-H(+)
C00679 C6H8O7 681 5-Dehydro-4-deoxy-D-glucarate ['25dkglcn'] 	 .H/K-H(+)
C03921 C6H8O7 681 2-Dehydro-3-deoxy-D-glucarate ['25dkglcn'] 	 .H/K-H(+)
C00158 C6H8O7 681 Citrate ['25dkglcn'] 	 .H/K-H(+)
C00311

C02457 C3H8O2 1048 Propane-1,3-diol ['12ppd__R', '12ppd__S'] 	 .(H2PO4)2NaH-H(+)
C01983 C8H8O3 1048 (R)-Mandelate ['34dhpac'] 	 .HPO4Na2-H(+)
C00642 C8H8O3 1048 4-Hydroxyphenylacetate ['34dhpac'] 	 .HPO4Na2-H(+)
C00536 P3H5O10 1059 Triphosphate ['pppi'] 	 .H/K-H(+)
C01283 C6H14NO8P 1068 1-Amino-1-deoxy-scyllo-inositol 4-phosphate ['g3ps', 'gam1p', 'gam6p'] 	 .H/K-H(+)
C12214 C6H14NO8P 1068 Aminofructose 6-phosphate ['g3ps', 'gam1p', 'gam6p'] 	 .H/K-H(+)
C00352 C6H14NO8P 1068 D-Glucosamine 6-phosphate ['g3ps', 'gam1p', 'gam6p'] 	 .H/K-H(+)
C12213 C6H14NO8P 1068 Kanosamine 6-phosphate ['g3ps', 'gam1p', 'gam6p'] 	 .H/K-H(+)
C06156 C6H14NO8P 1068 alpha-D-Glucosamine 1-phosphate ['g3ps', 'gam1p', 'gam6p'] 	 .H/K-H(+)
C06377 C6H14NO8P 1068 D-Galactosamine 6-phosphate ['g3ps', 'gam1p', 'gam6p'] 	 .H/K-H(+)
C06321 C7H8O4 1075 1,6-Dihydroxy-cis-2,4-cyclohexadiene-1-carboxylic acid ['23ddhb'] 	 .HPO4Na2-H(+)
C18311 C7H8O4 1075 4-Methyl-3-oxoadipate-enol-lactone ['23ddhb'] 	 .HPO4Na2-H(+)
C03979 

C00978 C12H14N2O2 1397 N-Acetylserotonin ['Nmtrp'] 	 .H2PO4K-H(+)
C00568 C7H7NO2 1403 4-Aminobenzoate ['anth', '4abz'] 	 .(H2PO4)2NaH-H(+)
C01004 C7H7NO2 1403 N-Methylnicotinate ['anth', '4abz'] 	 .(H2PO4)2NaH-H(+)
C00108 C7H7NO2 1403 Anthranilate ['anth', '4abz'] 	 .(H2PO4)2NaH-H(+)
C06054 C4H7O8P 1409 2-Oxo-3-hydroxy-4-phosphobutanoate ['ohpb'] 	 .HPO4Na2-H(+)
C00122 C4H4O4 1409 Fumarate ['fum'] 	 .(H2PO4Na)2-H(+)
C01384 C4H4O4 1409 Maleic acid ['fum'] 	 .(H2PO4Na)2-H(+)
C01100 C6H12N3O4P 1413 L-Histidinol phosphate ['hisp'] 	 .H2PO4K-H(+)
C14179 C2H4O5S 1417 Sulfoacetate ['sulfac'] 	 .(H2PO4)2NaH-H(+)
C15556 C4H9O6P 1417 3,4-Dihydroxy-2-butanone 4-phosphate ['db4p'] 	 .HPO4K2-H(+)
C02730 C11H10O5 1418 2-Succinylbenzoate ['sucbz'] 	 .H2PO4K-H(+)
C14106 C11H10O5 1418 2-Hydroxy-7-hydroxymethylchromene-2-carboxylate ['sucbz'] 	 .H2PO4K-H(+)
C14107 C11H10O5 1418 2-Hydroxy-4-hydroxymethylbenzalpyruvate ['sucbz'] 	 .H2PO4K-H(+)
C04666 C6H11N2O6P 1418 D-erythro-1-(Imidazol-4-yl)glycerol 3-p

C00095 C6H12O6 1620 D-Fructose ['gal_bD'] 	 .(H2PO4)2NaH-H(+)
C02336 C6H12O6 1620 beta-D-Fructose ['gal_bD'] 	 .(H2PO4)2NaH-H(+)
C00267 C6H12O6 1620 alpha-D-Glucose ['gal_bD'] 	 .(H2PO4)2NaH-H(+)
C00031 C6H12O6 1620 D-Glucose ['gal_bD'] 	 .(H2PO4)2NaH-H(+)
C15923 C6H12O6 1620 L-Gulose ['gal_bD'] 	 .(H2PO4)2NaH-H(+)
C01452 C6H12O6 1620 Sorbose ['gal_bD'] 	 .(H2PO4)2NaH-H(+)
C05775 C14H18N2O4 1621 alpha-Ribazole ['rdmbzi'] 	 .H2PO4Na-H(+)
C12477 C7H7NO4 1644 3-Methylpyrrole-2,4-dicarboxylic acid ['23dhdp'] 	 .(H2PO4)2KH-H(+)
C03340 C7H7NO4 1644 L-2,3-Dihydrodipicolinate ['23dhdp'] 	 .(H2PO4)2KH-H(+)
C18315 C7H7NO4 1644 4-Methyl-5-nitrocatechol ['23dhdp'] 	 .(H2PO4)2KH-H(+)
C00900 C5H8O4 1647 2-Acetolactate ['mdhdhf', 'dhptd', 'alac__S', '4h2opntn'] 	 .(H2PO4K)2-H(+)
C03737 C5H11O8P 1647 alpha-D-Xylose 1-phosphate ['r5p'] 	 .HPO4K2-H(+)
C03906 C5H11O8P 1647 beta-L-Arabinose 1-phosphate ['r5p'] 	 .HPO4K2-H(+)
C00117 C5H11O8P 1647 D-Ribose 5-phosphate ['r5p'] 	 .HPO4K2-H(+)
C04181 C5H8O4 16

C00052 C15H24N2O17P2 2336 UDP-D-galactose ['udpgalfur'] 	 .H/K-H(+)
C00029 C15H24N2O17P2 2336 UDP-glucose ['udpgalfur'] 	 .H/K-H(+)
C05925 C9H14N5O7P 2341 Dihydroneopterin phosphate ['dhpmp'] 	 .(H2PO4K)2-H(+)
C00119 C5H13O14P3 2345 5-Phospho-alpha-D-ribose 1-diphosphate ['prpp'] 	 .(H2PO4)2NaH-H(+)
C11472 C7H16O13P2 2356 D-glycero-D-manno-Heptose 1,7-bisphosphate ['gmhep17bp'] 	 .(H2PO4Na)2-H(+)
C00460 C9H15N2O14P3 2356 dUTP ['dutp'] 	 .HPO4Na2-H(+)
C04677 C9H15N4O8P 2356 1-(5'-Phosphoribosyl)-5-amino-4-imidazolecarboxamide ['aicar'] 	 .(H2PO4K)2-H(+)
C00447 C7H16O13P2 2356 Sedoheptulose 1,7-bisphosphate ['gmhep17bp'] 	 .(H2PO4Na)2-H(+)
C00459 C10H17N2O14P3 2380 dTTP ['dttp'] 	 .H2PO4K-H(+)
C00063 C9H16N3O14P3 2398 CTP ['ctp'] 	 .HPO4Na2-H(+)
C01268 C9H15N4O9P 2402 5-Amino-6-(5'-phosphoribosylamino)uracil ['5apru'] 	 .(H2PO4K)2-H(+)
C00705 C9H15N3O10P2 2405 dCDP ['dcdp'] 	 .(H2PO4Na)2-H(+)
C00131 C10H16N5O12P3 2405 dATP ['datp'] 	 .H2PO4K-H(+)
C00447 C7H16O13P2 2442 Sedoheptulose 1,7-

In [34]:
total_ion_to_metabolites = {}
for ion, mets in ion_to_bigg_ids.items():
    if ion in total_ion_to_metabolites:
        raise RuntimeError("WTF")
    total_ion_to_metabolites[ion] = set(mets)

for ion, mets in additional_ion_mapping.items():
    if mets is None:
        continue
    if ion in total_ion_to_metabolites:
        raise RuntimeError("WTF")
    if not isinstance(mets, list):
        mets = [mets]
    total_ion_to_metabolites[ion] = set(mets)
    
metabolite_ions = {}
for ion, mets in total_ion_to_metabolites.items():
    for met in mets:
        metabolite_ions.setdefault(met, set()).add(ion)

In [35]:
with open("../Data/Genome-wide/Ion_to_metabolites.json", "w") as outfile:
    json.dump({k: list(v) for k, v in total_ion_to_metabolites.items()}, outfile)

In [36]:
for k, v in metabolite_ions.items():
    if len(v) > 1:
        print(k, v)

hco3 {10, 13}


In [37]:
for idx, row in model_ion_annotations.iterrows():
    bigg_ids = kegg_to_bigg[row["id"]]
    bigg_names = []
    for bigg in bigg_ids:
        try:
            model_name = get_metabolite_name(bigg)
        except RuntimeError:
            pass
        else:
            bigg_names.append(model_name)
    if not bigg_names:
        raise RuntimeError("WTF!!!")
    print(row["id"], row["name"], *bigg_names, sep="\t")

C00002	ATP	ATP C10H12N5O13P3
C00008	ADP	ADP C10H12N5O10P2
C00009	Orthophosphate	Phosphate
C00013	Diphosphate	Diphosphate
C00015	UDP	UDP C9H11N2O12P2
C00018	Pyridoxal phosphate	Pyridoxal 5'-phosphate
C00020	AMP	AMP C10H12N5O7P
C00021	S-Adenosyl-L-homocysteine	S-Adenosyl-L-homocysteine
C00022	Pyruvate	Pyruvate
C00024	Acetyl-CoA	Acetyl-CoA
C00025	L-Glutamate	L-Glutamate
C00026	2-Oxoglutarate	2-Oxoglutarate
C00029	UDP-glucose	UDPglucose
C00031	D-Glucose	D-Glucose
C00033	Acetate	Acetate
C00035	GDP	GDP C10H12N5O11P2
C00037	Glycine	Glycine
C00041	L-Alanine	L-Alanine
C00042	Succinate	Succinate
C00043	UDP-N-acetyl-D-glucosamine	UDP-N-acetyl-D-glucosamine
C00047	L-Lysine	L-Lysine
C00049	L-Aspartate	L-Aspartate
C00051	Glutathione	Reduced glutathione
C00052	UDP-D-galactose	UDPgalactose
C00053	3'-Phosphoadenylyl sulfate	3'-Phosphoadenylyl sulfate
C00054	Adenosine 3',5'-bisphosphate	Adenosine 3',5'-bisphosphate
C00055	CMP	CMP C9H12N3O8P
C00062	L-Arginine	L-Arginine
C00073	L-Methionine	L-Methionine
C

## Parse MS data

In [38]:
ion_annotations[ion_annotations["ion"] == 2]

Unnamed: 0,id,name,formula,mz,mod,score,rank,mzDelta,ion,TIC correl,average int
298,C00109,2-Oxobutanoate,C4H6O3,57.035753,-CO2-H(+),9,3,-0.001173,2,-0.849435,35177.515548
483,C00164,Acetoacetate,C4H6O3,57.035753,-CO2-H(+),9,3,-0.001173,2,-0.849435,35177.515548
589,C00207,Acetone,C3H6O,57.035753,-H(+),85,1,-0.001171,2,-0.849435,35177.515548
667,C00232,Succinate semialdehyde,C4H6O3,57.035753,-CO2-H(+),9,3,-0.001173,2,-0.849435,35177.515548
1126,C00479,Propanal,C3H6O,57.035753,-H(+),85,1,-0.001171,2,-0.849435,35177.515548
4402,C06002,(S)-Methylmalonate semialdehyde,C4H6O3,57.035753,-CO2-H(+),9,3,-0.001173,2,-0.849435,35177.515548


In [39]:
os.listdir(data_dir)

['sample_id_all.xls',
 'sample_id_zscore.xls',
 'rawdata_pos_all.tsv',
 'neg_ionMz.xls',
 'zscore_neg.tsv',
 'rawdata_neg_all.tsv',
 'zscore_pos.tsv',
 'neg_kegg_all_3mD.xls',
 'pos_kegg_all_3mD.xls',
 'pos_ionMz.xls']

In [40]:
ion_mz = pd.read_excel(data_dir + "neg_ionMz.xls", header=None)
print(len(ion_mz))

sample_list = list(pd.read_excel(data_dir + "sample_id_zscore.xls", header=None)[0])
print(len(sample_list))

z_df = pd.read_csv(
    data_dir + "zscore_neg.tsv", sep="\t",
    header=None,
    names=sample_list,
).transpose()
z_df.columns = list(range(1, len(ion_mz)+1))
print(z_df.shape)

3169
3807
(3807, 3169)


In [41]:
z_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,3160,3161,3162,3163,3164,3165,3166,3167,3168,3169
aaeA,-0.4309,-0.1195,-0.6621,0.9284,-0.6906,0.0978,0.8878,-0.0141,0.0222,-1.1386,...,-0.3240,-0.3257,-0.1634,-0.9368,0.5201,0.0450,-2.3103,0.3768,1.9357,-0.0728
aaeB,-1.8696,1.1863,1.7171,0.3849,0.7550,0.7893,2.4243,-0.0048,0.2601,0.4018,...,-1.5066,-0.7330,-0.9127,0.3339,0.3806,0.8550,0.7273,-1.3189,-1.3404,0.6170
aaeR,1.3402,-0.8219,1.0256,2.0786,-0.5455,-0.1343,-0.9245,-0.2020,0.8363,-1.0240,...,0.3042,-0.0184,0.0590,-0.2135,2.0133,-1.0133,0.8610,-0.1052,1.0941,0.0899
aaeX,0.2608,-0.5211,0.0221,-0.8930,0.4659,0.0207,-0.2926,-0.4521,0.0442,1.9518,...,0.1026,0.0950,1.2542,2.4301,1.5994,0.8915,1.7399,0.6637,-0.5776,1.2127
aas,1.1950,0.2000,-0.4579,0.8433,0.5156,-0.7464,-0.4782,-0.9230,0.4672,-0.0610,...,0.3902,-1.3893,0.0475,-1.1109,-0.5719,-0.5863,-0.0396,0.0125,0.5395,-0.0724
aat,-1.7969,-1.5457,0.0185,1.0175,-0.4498,-0.6882,-0.7067,-0.9205,-0.5352,-0.5512,...,0.9819,0.7080,0.0991,-0.4292,-0.8299,2.3601,-0.2793,-0.2229,0.5789,1.4059
abgA,-0.3466,0.1708,0.4809,0.7412,0.5548,0.6172,0.1622,2.0636,0.4966,-0.1597,...,-0.1690,0.3312,0.2463,0.4549,-1.1172,-0.3887,-0.5818,0.2015,-0.0926,-0.2211
abgB,0.8234,0.8783,1.4925,2.3298,-0.8948,0.7481,0.6520,0.8316,1.0813,1.3584,...,-0.0704,0.6913,-2.3629,-0.7450,-1.2887,1.5130,-0.2847,0.9159,0.3998,-1.8781
abgR,1.2770,-0.2916,0.5046,-1.5026,2.7675,-2.4744,-1.1063,-0.2764,1.1327,-0.3280,...,-0.5154,-2.3108,1.1750,1.2460,-0.0367,-2.5084,-2.5649,1.9832,1.8112,0.2812
abgT,0.5425,0.1869,1.5039,0.5465,0.6735,-0.2168,0.1083,-0.2241,-0.4263,0.0979,...,0.8122,0.8033,0.1259,0.4965,0.4402,0.9486,-0.0976,-0.3681,-0.2024,0.4190


In [42]:
def metabolite_z(ion_df, met_id):
    ions = list(metabolite_ions[met_id])
    return ion_df[ions].mean(1)

In [43]:
met_df = pd.DataFrame({met_id: metabolite_z(z_df, met_id) for met_id in metabolite_ions})

In [44]:
# Save the DataFrame of z-values for each metabolite
met_df.to_csv("../Data/Genome-wide/Metabolite_z_values.tsv", sep="\t")

In [45]:
gene_name_dict = {}
for gene in model.genes:
    if gene.name not in gene_name_dict:
        gene_name_dict[gene.name] = gene
    else:
        print("oops", gene.name, gene.id)

def gene_to_reactions(gene_name):
    if not gene_name in gene_name_dict:
        return ""
    gene = gene_name_dict[gene_name]
    return ", ".join([r.id for r in gene.reactions])

In [46]:
met_df_reactions = met_df.copy()
met_df_reactions["REACTIONS"] = met_df_reactions.index.map(gene_to_reactions)
met_df_reactions.to_csv("../Data/Genome-wide/Metabolite_z_values_with_reactions.tsv", sep="\t")

In [49]:
def get_met_name(name):
    try:
        return model.metabolites.get_by_id(name + "_c").id
    except KeyError:
        return model.metabolites.get_by_id(name + "_p").id
    
measured_metabolites = [get_met_name(col) for col in met_df]

with open("../Data/Genome-wide/Measured_metabolites.json", "w") as outfile:
    json.dump(measured_metabolites, outfile)