In [None]:
import numpy as np
import pandas as pd
import glob
import shutil
import os
from pyteomics import mgf, auxiliary
from pyopenms import *
import pyteomics
from pyteomics import mztab
from src.export_feature_matrix import export

In [None]:
# Directory that receives the annotation outputs of this notebook.
path = os.path.join("results", "interim", "annotations")

# Start from an empty directory on every run so stale results never leak in.
if os.path.exists(path):
    shutil.rmtree(path)

# makedirs (not mkdir) also creates the missing parents "results/interim",
# so the cell works on a fresh checkout instead of raising FileNotFoundError.
os.makedirs(path, exist_ok=True)

First, import the feature table:

In [None]:
# Read the tab-separated feature matrix from results/interim and preview it.
feature_matrix_path = os.path.join("results", "interim", "FeatureMatrix.tsv")
DF_features = pd.read_csv(feature_matrix_path, sep="\t")
DF_features.head()

#### `2) Spectral matching`

Load the MGF file into an MSExperiment object:

In [None]:
# Location of the MGF file under results/GNPSexport.
mgf_file = os.path.join("results", "GNPSexport", "MSMS.mgf")

# Fill an empty MSExperiment with the contents of that file.
exp = MSExperiment()
mgf_loader = MascotGenericFile()
mgf_loader.load(mgf_file, exp)

Perform spectral matching with a library in MGF format that is located under "resources":
(here we are using the one freely available from GNPS (all libraries))

In [None]:
# Input library and output location.
database = os.path.join("resources", "GNPS-LIBRARY.mgf")
output_mztab = os.path.join("results", "interim", "annotations", "MSMS.mzTab")
# Fourth argument of run(); left empty here.
# NOTE(review): presumably the output path for merged spectra - confirm.
out_merged = ""

# Load the reference spectra of the library into their own MSExperiment.
speclib = MSExperiment()
library_loader = MascotGenericFile()
library_loader.load(database, speclib)

# Matcher with its default parameters, except that merge_spectra is set to 'false'.
MSMS_match = MetaboliteSpectralMatching()
MSMS_match_par = MSMS_match.getDefaults()
MSMS_match_par.setValue("merge_spectra", "false")
MSMS_match.setParameters(MSMS_match_par)

# NOTE: this rebinds the name `mztab`, which the imports cell bound to the
# pyteomics.mztab module; afterwards the module is only reachable as
# `pyteomics.mztab`.
mztab = MzTab()
MSMS_match.run(exp, speclib, mztab, String(out_merged))

# Write the matching result to disk as an mzTab file.
MzTabFile().store(output_mztab, mztab)

Load the mzTab into a dataframe and clean it up:

In [None]:
# Acceptance thresholds for a spectral match.
MAX_PPM_ERROR = 10     # keep matches with opt_ppm_error within +/- this value
MIN_MATCH_SCORE = 60   # keep matches with opt_match_score at or above this value

# Parse the mzTab written by the spectral matching step; tables come back as DataFrames.
spectralmatch = pyteomics.mztab.MzTab(output_mztab, encoding="UTF8", table_format="df")
df = spectralmatch.small_molecule_table

# Columns that are not needed for annotating the feature matrix.
unused_columns = [
    "identifier",
    "inchi_key",
    "modifications",
    "calc_mass_to_charge",
    "opt_adduct_ion",
    "taxid",
    "species",
    "database",
    "spectra_ref",
    "search_engine",
    "opt_sec_id",
    "smallmolecule_abundance_std_error_study_variable[1]",
    "smallmolecule_abundance_stdev_study_variable[1]",
    "smallmolecule_abundance_study_variable[1]",
    "chemical_formula",
]
spectralmatch_DF = df.drop(columns=unused_columns)

# Apply all three filters in one mask. The explicit .copy() makes the result
# an independent frame, so the column assignment below does not write into a
# slice of the dropped table (pandas' SettingWithCopyWarning).
ppm_error = spectralmatch_DF["opt_ppm_error"]
is_accepted = (
    (ppm_error <= MAX_PPM_ERROR)
    & (ppm_error >= -MAX_PPM_ERROR)
    & (spectralmatch_DF["opt_match_score"] >= MIN_MATCH_SCORE)
)
spectralmatch_DF = spectralmatch_DF.loc[is_accepted].copy()

# Strip the literal "index=" prefix so only the spectrum index remains.
# regex=False states the literal intent independently of the pandas version.
spectralmatch_DF["opt_spec_native_id"] = spectralmatch_DF["opt_spec_native_id"].str.replace(
    "index=", "", regex=False
)
spectralmatch_DF

Annotate the feature matrix with the spectral matches:

In [None]:
# NOTE: `path` and `mgf_file` are rebound here; from this point on `mgf_file`
# is a DataFrame of spectrum parameters, not a file path.
path = os.path.join("results", "GNPSexport", "MSMS.mgf")

# Collect the parameter dict of every spectrum in the MGF. The reader is used
# as a context manager so the underlying file handle is closed afterwards.
with mgf.MGF(source=path, use_header=True, convert_arrays=2, read_charges=True, read_ions=False, dtype=None, encoding=None) as file:
    parameters = [spectrum['params'] for spectrum in file]

# One row per spectrum, one column per parameter key.
mgf_file = pd.DataFrame(parameters)
# Remove the literal "e_" from the feature ids (regex=False: plain text, not a pattern).
mgf_file["feature_id"] = mgf_file["feature_id"].str.replace("e_", "", regex=False)

# Per-spectrum table of the MGF that was loaded into `exp`.
df = exp.get_df()

# Build each column exactly once. The previous outer `for spec in exp` loop
# rebuilt both full columns for every spectrum (quadratic work) and created
# neither column when `exp` was empty, which made the next statement fail.
native_ids = [spec.getNativeID() for spec in exp]
scan_ids = [spec.getMetaValue("Scan_ID") for spec in exp]

# object dtype keeps the .str accessor usable even for an empty experiment.
df["index"] = pd.Series(native_ids, index=df.index, dtype="object")
df["SCANS"] = scan_ids

# Strip the literal "index=" prefix so the values can be compared with
# spectralmatch_DF["opt_spec_native_id"].
df["index"] = df["index"].str.replace("index=", "", regex=False)

# Add Scan numbers to spectral match DF
# Work on an explicit copy so that adding a column never writes into a slice
# of another frame (pandas' SettingWithCopyWarning).
spectralmatch_DF = spectralmatch_DF.copy()

# Map each native spectrum index to its scan numbers in a single pass instead
# of rescanning the whole spectrum table for every spectral match. The inner
# dict is used as an insertion-ordered set: first-seen order, no duplicates.
scans_by_native_id = {}
for index, scan_number in zip(df["index"], df["SCANS"]):
    if pd.isna(index):
        continue  # a missing index can never equal a match id
    scans_by_native_id.setdefault(index, {})[f"{scan_number}"] = None

# Matches without any corresponding spectrum get an empty string.
spectralmatch_DF["SCANS"] = [
    " ## ".join(scans_by_native_id.get(idx, ()))
    for idx in spectralmatch_DF["opt_spec_native_id"]
]

# Add Scan numbers to feature DF
# Map each MGF feature id to its scan numbers in a single pass instead of
# rescanning the whole MGF table for every feature. The inner dict is used as
# an insertion-ordered set: first-seen order, no duplicates.
scans_by_feature_id = {}
for scan, mgf_id in zip(mgf_file["scans"], mgf_file["feature_id"]):
    if pd.isna(mgf_id):
        continue  # a missing feature id can never equal a consensus id
    scans_by_feature_id.setdefault(mgf_id, {})[f"{scan}"] = None

# Feature ids are compared as strings; features without a spectrum in the MGF
# get an empty string.
scans = [
    " ## ".join(scans_by_feature_id.get(consensus_id, ()))
    for consensus_id in DF_features["id"].astype(str)
]

DF_features["SCANS"] = scans

# Both SCANS columns hold scan numbers joined with " ## ". Comparing the
# joined strings directly (as before) fails for a feature with several scans
# and wrongly pairs features and matches whose scan string is empty, so the
# comparison is done per individual scan number instead.

# scan number -> [(name, smiles), ...] in spectral-match order
matches_by_scan = {}
for name, smiles, joined_scans in zip(spectralmatch_DF["description"], spectralmatch_DF["smiles"], spectralmatch_DF["SCANS"]):
    for scan_number in joined_scans.split(" ## "):
        if scan_number:  # ignore matches that have no scan number
            matches_by_scan.setdefault(scan_number, []).append((f"{name}", f"{smiles}"))

match_names = []
match_smiles = []
for joined_scans in DF_features["SCANS"]:
    # name -> smiles, insertion-ordered; a repeated name keeps its first smiles,
    # which mirrors the previous de-duplication on the name only.
    found = {}
    for scan_number in joined_scans.split(" ## "):
        if not scan_number:  # feature without any scan: nothing to annotate
            continue
        for name, smiles in matches_by_scan.get(scan_number, ()):
            found.setdefault(name, smiles)
    match_names.append(" ## ".join(found))
    match_smiles.append(" ## ".join(found.values()))

# Features without a match keep an empty string in both columns.
DF_features["SpectralMatch"] = match_names
DF_features["SpectralMatch_smiles"] = match_smiles

export(DF_features)