In [None]:
import json
import requests
import re

# Load the MassBank index; the download loop further down reads the "name"
# and "@id" keys of every entry.
with open("MassBank/MassBank.json") as massbank_index:
    data = json.load(massbank_index)

def get_text_from_url(url, timeout=30):
    """Download `url` and return the response body as text.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET request.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (passed straight to
        ``requests.get``). Without a timeout a single stalled connection
        would block the caller forever.

    Returns
    -------
    str or None
        The response text when the status code is 200; ``None`` when the
        server answers with any other status or the request itself fails
        (connection error, timeout, ...). A message is printed in both
        failure cases.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    # Only a plain 200 counts as success.
    if response.status_code != 200:
        print(
            f"Failed to retrieve content from {url}. Status code: {response.status_code}"
        )
        return None
    return response.text


# Tags that surround field labels and line ends in the downloaded record pages.
_HTML_TAGS = ("<b>", "</b>", "<br>")
# Lines carrying one peak each start with this prefix.
_PEAK_LINE_PREFIX = "&nbsp;&nbsp;"


def _first_line_with(lines, marker):
    """Return the first line containing `marker`, or None when no line does."""
    return next((line for line in lines if marker in line), None)


def _field_text(lines, marker, label=None):
    """Return the cleaned text of the first line containing `marker`, or None.

    `label` (defaults to `marker`) and the <b>, </b> and <br> tags are removed
    from the line and surrounding whitespace is stripped.
    """
    line = _first_line_with(lines, marker)
    if line is None:
        return None
    text = line.replace(marker if label is None else label, "")
    for tag in _HTML_TAGS:
        text = text.replace(tag, "")
    return text.strip()


def _float_field(lines, marker):
    """Return the field behind `marker` as float, or None if absent or not numeric."""
    text = _field_text(lines, marker)
    if text is None:
        return None
    try:
        return float(text)
    except ValueError:
        return None


def _first_capture(lines, marker, pattern):
    """Return group 1 of the first `pattern` match in the `marker` line, else ""."""
    line = _first_line_with(lines, marker)
    if line is None:
        return ""
    found = re.search(pattern, line)
    return found.group(1) if found else ""


def _parse_peaks(lines):
    """Return [m/z, intensity] pairs sorted by descending intensity, or None.

    None is returned when the PK$NUM_PEAK header (or its peak count) is
    missing, or when a peak line does not hold at least two numeric values.
    """
    header = next(
        (
            idx
            for idx, line in enumerate(lines)
            if line.startswith("<b>PK$NUM_PEAK:</b>")
        ),
        None,
    )
    if header is None:
        return None
    count = re.search(r"\d+", lines[header])
    if count is None:
        return None

    # Peak rows start two lines below the header; the skipped line is
    # presumably the PK$PEAK column legend -- TODO confirm against a record.
    start = header + 2
    peaks = []
    for row in lines[start : start + int(count.group())]:
        if not row.startswith(_PEAK_LINE_PREFIX):
            continue
        # str.strip treats its argument as a character set, which is enough
        # to remove the leading "&nbsp;" run; the separators that remain
        # become ";" once "&nbsp" is removed.
        cells = (
            row.strip(_PEAK_LINE_PREFIX)
            .replace("&nbsp", "")
            .replace("<br>", "")
            .split(";")
        )
        try:
            values = [float(cell) for cell in cells]
        except ValueError:
            return None
        if len(values) < 2:
            return None
        peaks.append(values[:2])
    peaks.sort(key=lambda peak: peak[1], reverse=True)
    return peaks


def _parse_record(lines, desc, url):
    """Build the metadata dict for one record page, or None if it is unusable.

    A record is unusable when the exact mass or precursor m/z is missing or
    not numeric, when the peak list cannot be parsed or is empty, or when the
    highest intensity is zero (normalization would divide by zero).
    """
    exact_mass = _float_field(lines, "CH$EXACT_MASS:")
    if exact_mass is None:
        return None
    prec_mz = _float_field(lines, "MS$FOCUSED_ION:</b> PRECURSOR_M/Z")
    if prec_mz is None:
        return None

    peaks = _parse_peaks(lines)
    if not peaks:
        return None
    intys = [peak[1] for peak in peaks]
    top_intensity = max(intys)
    if top_intensity == 0:
        return None

    # Fall back to the index description when the page has no CH$NAME line.
    name = _field_text(lines, "CH$NAME:")
    if name is None:
        name = desc

    return {
        "name": name,
        "description": desc,
        "formula": _first_capture(
            lines, "CH$FORMULA:", r"Search\.aspx\?q=([A-Z0-9a-z]+)"
        ),
        "exact mass": exact_mass,
        "precursor mz": prec_mz,
        "InChI": _field_text(lines, "CH$IUPAC:</b> InChI=") or "",
        "SMILES": _field_text(lines, "CH$SMILES:") or "",
        "CAS": _first_capture(lines, "CH$LINK:</b> CAS", r"(\d+-\d+-\d+)"),
        "publication": _field_text(lines, "PUBLICATION", label="PUBLICATION:") or "",
        "url": url,
        "m/z": [peak[0] for peak in peaks],
        # normalize intensities to the range 0..1
        "normalized intensity": [inty / top_intensity for inty in intys],
    }


filtered_data = []
skipped_urls = []

for metabolite in data:
    desc = metabolite["name"]
    url = metabolite["@id"]
    page = get_text_from_url(url)
    if page is None:
        # Download failed (already reported by get_text_from_url); keep going.
        skipped_urls.append(url)
        continue
    record = _parse_record(page.split("\n"), desc, url)
    if record is None:
        skipped_urls.append(url)
        continue
    filtered_data.append(record)

if skipped_urls:
    print(
        f"Skipped {len(skipped_urls)} of {len(data)} records (download or parsing failed)."
    )

with open("MassBank/MassBank-MetaData.json", "w") as f:
    json.dump(filtered_data, f, indent=4)

In [8]:
import pandas as pd
import json

# Second-highest normalized intensity must exceed this value for a spectrum to be kept.
MIN_SECOND_PEAK_INTENSITY = 0.2

with open("MassBank/MassBank-ESI-QTOF.json", "r") as f:
    mb = json.load(f)


df = pd.read_csv("precursor-lists/Han.tsv", sep="\t")

# Take only entries whose second highest peak is above 20% of the highest.
# Index 1 is the second highest peak when the intensities are stored in
# descending order -- presumably guaranteed by the step that wrote this file.
# NOTE(review): the length check demands at least three peaks although only
# the second one is inspected -- confirm this is intended.
mb = [
    m
    for m in mb
    if len(m["normalized intensity"]) > 2
    and m["normalized intensity"][1] > MIN_SECOND_PEAK_INTENSITY
]

# Index the spectra by CAS number once instead of rescanning the whole list
# for every CAS of every input row; list order inside each bucket is kept.
entries_by_cas = {}
for m in mb:
    entries_by_cas.setdefault(m["CAS"], []).append(m)

matches = []
# filter based on CAS
for _, m_input in df.iterrows():
    for cas in str(m_input["CAS"]).split(";"):
        cas = cas.strip()
        if not cas:
            # An empty token (e.g. from a trailing ";") would otherwise match
            # every MassBank entry that has no CAS number.
            continue
        for m in entries_by_cas.get(cas, []):
            # Store a copy: the same spectrum may match several input rows and
            # each match must keep its own library name.
            matches.append({**m, "library name": m_input["name"]})

# save intermediate matches in json file
with open("MassBank/MassBank-ESI-QTOF-Han.json", "w") as f:
    json.dump(matches, f, indent=4)

In [9]:
# generate assay library from final MassBank json
import json
import pandas as pd
import numpy as np

# Read back the CAS-matched MassBank entries that the assay library is built from.
with open("MassBank/MassBank-ESI-QTOF-Han.json") as matches_file:
    mb = json.load(matches_file)

def build_transition_table(data, min_intensity=0.2, retention_time=60):
    """Yield one transition row per sufficiently intense peak of each entry.

    Parameters
    ----------
    data : list of dict
        Filtered MassBank entries. Each dict must provide the keys
        "library name", "precursor mz", "m/z", "normalized intensity",
        "formula", "SMILES" and "description". The intensities must be
        sorted in descending order, because iteration over an entry stops at
        the first peak that is not above `min_intensity`.
    min_intensity : float, optional
        Only peaks with a normalized intensity strictly greater than this
        value are emitted.
    retention_time : float, optional
        Placeholder written into every row as normalized retention time
        (todo: replace with real values).

    Yields
    ------
    tuple
        (library name, precursor m/z, product m/z, normalized intensity,
        retention_time, formula, SMILES, description)
    """
    for m in data:
        # include all transitions with higher than `min_intensity` normalized intensity
        for mz, inty in zip(m["m/z"], m["normalized intensity"]):
            if not inty > min_intensity:
                break  # the others are lower anyways, sorted before
            yield (
                m["library name"],
                m["precursor mz"],
                mz,
                inty,
                retention_time,
                m["formula"],
                m["SMILES"],
                m["description"],
            )

# Column names, in the order of the tuples yielded by build_transition_table.
TRANSITION_COLUMNS = [
    "CompoundName",
    "PrecursorMz",
    "ProductMz",
    "LibraryIntensity",
    "NormalizedRetentionTime",
    "SumFormula",
    "SMILES",
    "Annotation",
]

# Build the frame directly from the rows: going through a NumPy structured
# array with "f" (float32) and fixed-width "U100"/"U300"/"U200" fields rounds
# the m/z values to about 7 significant digits and silently truncates long
# names, SMILES and annotations.
df = pd.DataFrame(list(build_transition_table(mb)), columns=TRANSITION_COLUMNS)
# Keep all numeric columns floating point (the retention time placeholder is an int).
df = df.astype(
    {
        "PrecursorMz": float,
        "ProductMz": float,
        "LibraryIntensity": float,
        "NormalizedRetentionTime": float,
    }
)

# add unique TransitionGroupId
# NOTE(review): the id is unique per row, so every transition forms its own
# group -- confirm this is what the downstream tool expects.
df["TransitionGroupId"] = [f"{n}_{i}" for i, n in enumerate(df["CompoundName"])]

df.to_csv("assay-libraries/MassBankHanAssayLibrary.tsv", sep="\t", index=False)

# Last expression of the cell so the preview is actually displayed.
df.head()

In [10]:
# check which metabolites have not been found via CAS number

import pandas as pd

# Compound names that made it into the assay library, in file order without duplicates.
assay_library = pd.read_csv("assay-libraries/MassBankHanAssayLibrary.tsv", sep="\t")
in_lib = assay_library["CompoundName"].drop_duplicates().tolist()

# Names requested by the precursor list, in file order without duplicates.
df = pd.read_csv("precursor-lists/Han.tsv", sep="\t")
in_input = df["name"].drop_duplicates().tolist()

# Set membership keeps the check linear, and iterating in input-file order
# makes the result identical on every run (iteration order of a set of
# strings changes between kernel sessions).
in_lib_lookup = set(in_lib)
not_found_in_mass_bank = [x for x in in_input if x not in in_lib_lookup]
not_found_in_mass_bank

['3-HYDROXYBENZOATE',
 'METHYGLUTARATE',
 'PYRAZOLE',
 'MALTOSE',
 'DGTP',
 'N-ACETYLMETHIONINE',
 'D-GLUCURONOLACTONE',
 '2-HYDROXY-4-(METHYLTHIO)BUTANOATE',
 'CADAVERINE',
 'DOCOSAHEXAENOATE',
 'URACIL',
 'DESMOSTEROL',
 '3-HYDROXYBENZYL ALCOHOL',
 '4-ACETAMIDOBUTANOATE',
 'PALMITOYLCARNITINE',
 'GLYCINE',
 '3,4-DIHYDROXYBENZOATE',
 'DIPALMITOYL-PHOSPHATIDYLCHOLINE',
 'RIBOSE 1,5-BISPHOSPHATE',
 'O-PHOSPHOSERINE',
 'XYLOSE',
 '1-METHYLADENOSINE',
 'SUCCINATE',
 'ADENOSINE TRIPHOSPHATE',
 'AMINOISOBUTANOATE',
 '1,3-DIAMINOPROPANE',
 '3-HYDROXYBENZALDEHYDE',
 'BETA-GLYCEROPHOSPHATE',
 'N-ACETYLCYSTEINE',
 'CITRATE',
 'INOSINE-MONOPHOSPHATE',
 'PHENYLETHANOLAMINE',
 'GUAIACOL',
 'N-ETHYL-5-METHYL-2-(1-METHYLETHYL)-CYCLOHEXANECARBOXAMIDE',
 'URSODEOXYCHOLATE',
 'PALMITATE',
 'THREITOL',
 'MALATE',
 'ARABINOSE',
 'FORMAMIDE',
 'PSICOSE',
 '2-PHOSPHOGLYCERATE',
 'SORBATE',
 'MALEATE',
 '1-METHYL-L-HISTIDINE',
 '3-(2-HYDROXYPHENYL)PROPANOATE',
 'N-ACETYLGLUTAMATE',
 'GLUTATHIONE REDUCED',
 