# `Pre-processing workflow`
#### `and export all files for GNPS FBMN`

Import libraries:

In [None]:
from pyopenms import *
import os
import glob
import pandas as pd
import plotly.express as px

In [None]:
# constant path for interim files
path = "results/interim"
# makedirs creates the parent "results" directory when needed and, with
# exist_ok=True, does nothing when the directory is already there. The earlier
# pair of mkdir calls raised FileExistsError whenever "results" existed but
# "results/interim" did not (e.g. after deleting only the interim folder).
os.makedirs(path, exist_ok=True)

In [None]:
# 1) Feature Detection
#
# Every mzML file found in Example_data is processed on its own in three
# stages (mass traces -> elution peaks -> features). The resulting FeatureMap
# is written to the interim directory as <sample>.featureXML.

input_mzml_files = glob.glob('Example_data/*.mzML')

for filename in input_mzml_files:

    # 1.1) Mass trace detection
    print("Mass Trace Detection: ", filename)
    exp = MSExperiment()
    MzMLFile().load(filename, exp)  # read the mzML file into an MSExperiment

    mtd = MassTraceDetection()
    mtd_par = mtd.getDefaults()  # start from the defaults, then override
    mtd_par.setValue("mass_error_ppm", 10.0)  # high-res instrument, orbitraps
    mtd_par.setValue("noise_threshold_int", 1.0e04)  # data-dependent (usually works for orbitraps)
    mtd.setParameters(mtd_par)
    mass_traces = []  # filled in place by run()
    # NOTE(review): the trailing 0 is presumably "no limit on the number of
    # traces" - confirm against the MassTraceDetection API.
    mtd.run(exp, mass_traces, 0)

    # 1.2) Elution peak detection
    print("Elution Peak Detection: ", filename)
    epd = ElutionPeakDetection()
    epd_par = epd.getDefaults()
    # "fixed" filters out mass traces outside the
    # [min_fwhm: 1.0, max_fwhm: 60.0] interval
    epd_par.setValue("width_filtering", "fixed")
    epd.setParameters(epd_par)
    mass_traces_deconvol = []  # filled in place by detectPeaks()
    epd.detectPeaks(mass_traces, mass_traces_deconvol)

    # 1.3) Feature detection
    print("Feature Detection: ", filename)
    ffm = FeatureFindingMetabo()
    ffm_par = ffm.getDefaults()
    # drop mass traces that have no satellite isotopic traces
    ffm_par.setValue("remove_single_traces", "true")
    ffm.setParameters(ffm_par)
    feature_map_FFM = FeatureMap()  # output features
    chrom_out = []  # output chromatograms
    ffm.run(mass_traces_deconvol, feature_map_FFM, chrom_out)

    feature_map_FFM.setUniqueIds()  # give every feature a new, valid unique id
    # remember which mzML file these features came from; later steps read it
    # back through the 'spectra_data' meta value
    feature_map_FFM.setPrimaryMSRunPath([filename.encode()])

    # strip the 5-character ".mzML" extension to build the output name
    sample_stem = os.path.basename(filename)[:-5]
    feature_file_out = os.path.join(path, sample_stem + ".featureXML")
    FeatureXMLFile().store(feature_file_out, feature_map_FFM)

print("Finished Feature Detection")

In [None]:
# load feature files
#
# Only the per-sample feature files written by the feature detection step are
# wanted here. The alignment and IDMapper steps store their "Aligned_*" and
# "IDMapper_*" outputs in the same interim directory, so a plain
# "*.featureXML" glob would also load those when this cell is re-run and every
# sample would then be present several times. They are excluded by file-name
# prefix. The list is sorted because glob returns its matches in arbitrary
# order, which would otherwise make the map order differ between runs.
derived_prefixes = ("Aligned_", "IDMapper_")
input_feature_files = sorted(
    candidate
    for candidate in glob.glob('results/interim/*.featureXML')
    if not os.path.basename(candidate).startswith(derived_prefixes)
)

feature_maps = []  # one FeatureMap (the OpenMS container for features) per file
for featurexml_file in input_feature_files:
    fmap = FeatureMap()
    FeatureXMLFile().load(featurexml_file, fmap)  # read the file into the map
    feature_maps.append(fmap)

In [None]:
# 2) Map alignment
#
# All feature maps are aligned, in place, to a single reference map.

# The reference is the map with the largest number of features (works well if
# you have a pooled QC, for example).
largest_map = sorted(feature_maps, key=lambda fm: fm.size())[-1]
ref_index = feature_maps.index(largest_map)

# parameter optimization
aligner = MapAlignmentAlgorithmPoseClustering()
aligner_par = aligner.getDefaults()
aligner_par.setValue("max_num_peaks_considered", -1)  # infinite
# never pair features whose m/z distance is larger than this
aligner_par.setValue("pairfinder:distance_MZ:max_difference", 10.0)
aligner_par.setValue("pairfinder:distance_MZ:unit", "ppm")
aligner.setParameters(aligner_par)
aligner.setReference(feature_maps[ref_index])

# Align every map except the reference itself.
for map_index, feature_map in enumerate(feature_maps):
    if map_index == ref_index:
        continue
    trafo = TransformationDescription()  # receives the computed transformation
    aligner.align(feature_map, trafo)
    transformer = MapAlignmentTransformer()
    # NOTE(review): the final True presumably keeps the original retention
    # times alongside the transformed ones - confirm against the API.
    transformer.transformRetentionTimes(feature_map, trafo, True)

# save the aligned feature maps (the untouched reference map included)
for feature_map in feature_maps:
    source_name = os.path.basename(feature_map.getMetaValue('spectra_data')[0].decode())
    # source_name[:-5] removes the 5-character ".mzML" extension
    feature_file = os.path.join(path, 'Aligned_' + source_name[:-5] + ".featureXML")
    FeatureXMLFile().store(feature_file, feature_map)

In [None]:
# 3) IDMapper
#
# IDMapper is used here to annotate the features that have MS2 information,
# using the spectra of the mzML file each feature map was built from. The
# peptide and protein identification lists it also takes are passed empty.
# This step is important for FBMN because only features that were fragmented
# should be carried forward.

use_centroid_rt = False
use_centroid_mz = True
protein_ids = []
peptide_ids = []

mapper = IDMapper()

input_mzml_files = glob.glob("Example_data/*.mzML")

for filename in input_mzml_files:
    exp = MSExperiment()
    MzMLFile().load(filename, exp)
    mzml_name = os.path.basename(filename)

    for fmap in feature_maps:
        # fresh, empty identification lists for every annotate() call
        peptide_ids = []
        protein_ids = []

        # only annotate the map that was derived from this mzML file
        map_source = os.path.basename(fmap.getMetaValue('spectra_data')[0].decode())
        if map_source != mzml_name:
            continue

        mapper.annotate(fmap, peptide_ids, protein_ids, use_centroid_rt, use_centroid_mz, exp)
        # filename[:-4] drops "mzML" but keeps the dot, so appending
        # "featureXML" yields "<sample>.featureXML"
        annotated_name = 'IDMapper_' + os.path.basename(filename[:-4]) + "featureXML"
        featureidx_file = os.path.join(path, annotated_name)
        FeatureXMLFile().store(featureidx_file, fmap)

In [None]:
# load annotated feature files
#
# glob returns its matches in arbitrary order. The position of a map in
# feature_maps is used as its index in the feature grouping step, so the list
# is sorted to give every sample the same index on every run and platform.
input_feature_files = sorted(glob.glob('results/interim/IDMapper_*.featureXML'))

feature_maps = []  # replaces the un-annotated maps loaded earlier
for featurexml_file in input_feature_files:
    fmap = FeatureMap()
    FeatureXMLFile().load(featurexml_file, fmap)
    feature_maps.append(fmap)

In [None]:
# 4) Feature grouping
#
# Group the features of all annotated maps into one ConsensusMap, store it in
# the interim directory and show it as a DataFrame.

consensus_map = ConsensusMap()
file_descriptions = consensus_map.getColumnHeaders()

# One column header per input map, keyed by the map's position in
# feature_maps; it records the source mzML file name and the feature count.
for i, feature_map in enumerate(feature_maps):
    header = file_descriptions.get(i, ColumnHeader())
    header.filename = os.path.basename(feature_map.getMetaValue('spectra_data')[0].decode())
    header.size = feature_map.size()
    file_descriptions[i] = header

feature_grouper = FeatureGroupingAlgorithmKD()
feature_grouper.group(feature_maps, consensus_map)
consensus_map.setUniqueIds()
consensus_map.setColumnHeaders(file_descriptions)

Consensus_file = os.path.join(path, "consensus.consensusXML")
ConsensusXMLFile().store(Consensus_file, consensus_map)

# tabular view of the consensus features, without the "sequence" column
df = consensus_map.get_df().drop(columns="sequence")
df

In [None]:
# Scatter plot (RT against m/z, coloured by quality) of the consensus features
# whose quality is above 0.01.
quality_mask = df["quality"] > 0.01
fig = px.scatter(df[quality_mask], x="RT", y="mz", color="quality")
fig.update_layout(title="Consensus features")
fig.show()

In [None]:
# 5) Filter out features that have not fragmented
#
# Reload the stored consensus map and keep only those consensus features that
# carry at least one peptide identification entry.

input_consensus = "results/interim/consensus.consensusXML"
cmap = ConsensusMap()
ConsensusXMLFile().load(input_consensus, cmap)

# Start from a copy of the full map and empty it before refilling.
# NOTE(review): clear(False) presumably removes the features while keeping the
# map-level meta data (e.g. column headers) - confirm against the API.
new_map = ConsensusMap(cmap)
new_map.clear(False)

for consensus_feature in cmap:
    if not consensus_feature.getPeptideIdentifications():
        continue  # nothing attached to this feature: leave it out
    new_map.push_back(consensus_feature)

Consensus_file = os.path.join(path, "filtered.consensusXML")
ConsensusXMLFile().store(Consensus_file, new_map)

In [None]:
# Export all MS2 information in a .MGF file
#
# The MGF is built from the filtered consensus map (only features that have
# fragmented) together with the original mzML files.

# makedirs with exist_ok=True replaces the check-then-mkdir pair: it does
# nothing when the directory already exists and also creates "results" if
# this cell is run before that directory was made.
os.makedirs("results/GNPSexport", exist_ok=True)

consensus = "results/interim/filtered.consensusXML"
input_mzml_files = glob.glob("Example_data/*.mzML")
out_file = "results/GNPSexport/MSMS.mgf"

spectra_clustering = GNPSMGFFile()

# the mzML paths are handed over as bytes, the other two paths as OpenMS Strings
spectra_clustering.run(String(consensus), [s.encode() for s in input_mzml_files], String(out_file))

In [None]:
# Export a .TXT table of features
#
# The MGF export is generated from the filtered consensus map (features that
# have fragmented). The quantification table is written from that same
# filtered map (new_map) rather than from the unfiltered cmap, so that both
# GNPS inputs describe the same set of features.
# NOTE(review): GNPS is expected to match table rows to MGF entries - confirm
# that the rows line up with the SCANS/FEATURE_ID values in MSMS.mgf.

output_file = "results/GNPSexport/FeatureQuantificationTable.txt"
IonIdentityMolecularNetworking.writeFeatureQuantificationTable(new_map, output_file)

In [None]:
# Create a metadata table from the list of mzML files compatible for GNPS
#
# Columns:
#   filename            - mzML file name as found in Example_data
#   ATTRIBUTE_MAPID     - "MAP<i>", numbered in sorted file-name order
#   ATTRIBUTE_compound  - file name without the ".mzML" extension

# os.listdir returns entries in arbitrary order; sort so that the numbering
# is the same on every run.
mzml_names = sorted(
    entry for entry in os.listdir("Example_data") if entry.endswith(".mzML")
)

metadata = pd.DataFrame()
metadata["filename"] = mzml_names
metadata["ATTRIBUTE_MAPID"] = ["MAP" + str(i) for i in range(len(metadata))]
# Strip only the trailing extension. The earlier regex replace of ".mzML"
# treated "." as "any character" and was not anchored, so it also removed
# matches such as "_mzML" from the middle of a file name.
metadata['ATTRIBUTE_compound'] = [os.path.splitext(name)[0] for name in mzml_names]
# index=False keeps the DataFrame row index out of the file, so the table
# starts with the "filename" column instead of an unnamed index column.
metadata.to_csv("results/GNPSexport/metadata.tsv", sep='\t', index=False)

metadata