# `Pre-processing workflow`
#### `and export all files for GNPS FBMN`

Import libraries:

In [None]:
from pyopenms import *
import os
import glob
import pandas as pd
import numpy as np

In [None]:
# constant path for interim files

path = "results/interim"
# makedirs also creates the missing parent "results/" and does nothing when
# the directory already exists, so this cell can be re-run safely.
os.makedirs(path, exist_ok=True)

In [None]:
# 1) Feature Detection
#
# For every mzML file in Example_data/: detect mass traces, split them into
# elution peaks, assemble those into features and store the resulting feature
# map as <mzML base name>.featureXML in the interim directory (`path`).

# sorted so the files are processed in the same order the later steps use
input_mzml_files = sorted(glob.glob('Example_data/*.mzML'))

for filename in input_mzml_files:
    # 1.1) Mass trace detection
    print("Mass Trace Detection: ", filename)
    exp = MSExperiment()
    MzMLFile().load(filename, exp)
    exp.sortSpectra(True)
    mass_traces = []
    mtd = MassTraceDetection()
    mtd_par = mtd.getDefaults()
    mtd_par.setValue("mass_error_ppm", 10.0)  # high-res instrument, orbitraps
    mtd_par.setValue("noise_threshold_int", 1.0e04)  # data-dependent (usually works for orbitraps)
    mtd.setParameters(mtd_par)
    mtd.run(exp, mass_traces, 0)

    # 1.2) Elution peak detection
    print("Elution Peak Detection: ", filename)
    mass_traces_split = []
    mass_traces_final = []
    epd = ElutionPeakDetection()
    epd_par = epd.getDefaults()
    epd_par.setValue("width_filtering", "fixed")
    epd.setParameters(epd_par)
    epd.detectPeaks(mass_traces, mass_traces_split)

    # the extra peak-width filter is only applied in "auto" mode; otherwise
    # the split traces are used as they are
    if epd.getParameters().getValue("width_filtering") == "auto":
        epd.filterByPeakWidth(mass_traces_split, mass_traces_final)
    else:
        mass_traces_final = mass_traces_split

    # 1.3) Feature detection
    print("Feature Detection: ", filename)
    feature_map_FFM = FeatureMap()
    feat_chrom = []
    ffm = FeatureFindingMetabo()
    ffm_par = ffm.getDefaults()
    ffm_par.setValue("isotope_filtering_model", "none")
    ffm_par.setValue("remove_single_traces", "true")
    ffm_par.setValue("mz_scoring_by_elements", "false")
    ffm_par.setValue("report_convex_hulls", "true")
    ffm.setParameters(ffm_par)
    ffm.run(mass_traces_final, feature_map_FFM, feat_chrom)
    feature_map_FFM.setUniqueIds()
    # remember which mzML file the features came from; later steps read it
    # back through the 'spectra_data' meta value
    feature_map_FFM.setPrimaryMSRunPath([filename.encode()])

    # report exactly the file that is written (derived from the base name,
    # not from a fixed character offset into the input path)
    base_name = os.path.splitext(os.path.basename(filename))[0]
    feature_file = os.path.join(path, base_name + ".featureXML")
    print(feature_file)
    FeatureXMLFile().store(feature_file, feature_map_FFM)

print("Finished Feature Detection")

In [None]:
# load feature files

# Only the raw feature-detection output belongs here. The aligned and
# ID-mapped copies that later steps store in the same directory (prefixed
# "Aligned_" / "IDMapper_") match the glob as well once the notebook has been
# run, so they are skipped explicitly.
derived_prefixes = ("Aligned_", "IDMapper_")
input_feature_files = sorted(
    f for f in glob.glob('results/interim/*.featureXML')
    if not os.path.basename(f).startswith(derived_prefixes)
)

feature_maps = []
for featurexml_file in input_feature_files:
    fmap = FeatureMap()
    FeatureXMLFile().load(featurexml_file, fmap)
    feature_maps.append(fmap)

In [None]:
# 2) Map alignment

# The reference for the alignment is the map holding the most features (works
# well if you have a pooled QC for example). Indices are ordered by feature
# count with a stable sort, so on a tie the map that comes last wins.
feature_counts = [fm.size() for fm in feature_maps]
ref_index = sorted(range(len(feature_counts)), key=feature_counts.__getitem__)[-1]

aligner = MapAlignmentAlgorithmPoseClustering()

# parameter optimization
aligner_par = aligner.getDefaults()
for par_name, par_value in (
    ("max_num_peaks_considered", -1),  # infinite
    ("superimposer:mz_pair_max_distance", 0.05),
    ("pairfinder:distance_MZ:max_difference", 10.0),  # never pair features with larger m/z distance
    ("pairfinder:distance_MZ:unit", "ppm"),
):
    aligner_par.setValue(par_name, par_value)
aligner.setParameters(aligner_par)
aligner.setReference(feature_maps[ref_index])

# align every map except the reference itself, in place
for map_index, feature_map in enumerate(feature_maps):
    if map_index == ref_index:
        continue
    trafo = TransformationDescription()  # receives the computed transformation
    aligner.align(feature_map, trafo)
    transformer = MapAlignmentTransformer()
    transformer.transformRetentionTimes(feature_map, trafo, True)

# save the aligned feature maps as Aligned_<source mzML base name>.featureXML
for feature_map in feature_maps:
    source_mzml = feature_map.getMetaValue('spectra_data')[0].decode()
    aligned_name = 'Aligned_' + os.path.basename(source_mzml)[:-5] + ".featureXML"
    feature_file = os.path.join(path, aligned_name)
    FeatureXMLFile().store(feature_file, feature_map)

In [None]:
# Write every aligned feature map to a CSV table and display one as an example

input_feature_files = sorted(glob.glob('results/interim/Aligned*.featureXML'))

for filename in input_feature_files:
    fmap = FeatureMap()
    FeatureXMLFile().load(filename, fmap)
    DF = fmap.get_df(export_peptide_identifications=False)
    # "Aligned_<name>.featureXML" -> "<name>.csv": cut the 8-character prefix
    # and the trailing 10 characters "featureXML"; the dot is kept
    aligned_name = os.path.basename(filename)
    feature_csv = os.path.join(path, aligned_name[8:-10] + "csv")
    DF.to_csv(feature_csv)

# after the loop `filename` and `DF` still refer to the last file processed
print("example:", os.path.basename(filename))
display(DF)

In [None]:
# 3) Annotate features that have MS2 information
#
# Each mzML file is loaded once and matched, by base name, against the
# 'spectra_data' entry of the in-memory feature maps. A matching map is
# annotated by IDMapper and stored as IDMapper_<mzML base name>.featureXML.

use_centroid_rt = False
use_centroid_mz = True
protein_ids = []
peptide_ids = []

mapper = IDMapper()

input_mzml_files = sorted(glob.glob("Example_data/*.mzML"))

for filename in input_mzml_files:
    exp = MSExperiment()
    MzMLFile().load(filename, exp)
    mzml_name = os.path.basename(filename)

    for fmap in feature_maps:
        # fresh, empty ID lists for every map
        peptide_ids = []
        protein_ids = []
        source_name = os.path.basename(fmap.getMetaValue('spectra_data')[0].decode())
        if source_name != mzml_name:
            continue
        mapper.annotate(fmap, peptide_ids, protein_ids, use_centroid_rt, use_centroid_mz, exp)
        # dropping the last 4 characters ("mzML") keeps the dot before the new extension
        featureidx_file = os.path.join(path, 'IDMapper_' + mzml_name[:-4] + "featureXML")
        FeatureXMLFile().store(featureidx_file, fmap)

In [None]:
# load annotated feature files

def read_feature_map(featurexml_file):
    """Load one featureXML file and return it as a FeatureMap."""
    loaded = FeatureMap()
    FeatureXMLFile().load(featurexml_file, loaded)
    return loaded


input_feature_files = sorted(glob.glob('results/interim/IDMapper_*.featureXML'))

# replaces the in-memory maps with the annotated ones read back from disk
feature_maps = [read_feature_map(f) for f in input_feature_files]

In [None]:
# 4) Feature grouping
#
# Group the features of all maps into one consensus map and store it as
# consensus.consensusXML in the interim directory.

feature_grouper = FeatureGroupingAlgorithmKD()

consensus_map = ConsensusMap()
file_descriptions = consensus_map.getColumnHeaders()

# one column header per input map, keyed by the map's position in feature_maps
for map_index, feature_map in enumerate(feature_maps):
    header = file_descriptions.get(map_index, ColumnHeader())
    source_mzml = feature_map.getMetaValue('spectra_data')[0].decode()
    header.filename = os.path.basename(source_mzml)
    header.size = feature_map.size()
    file_descriptions[map_index] = header

feature_grouper.group(feature_maps, consensus_map)
# the headers are attached after grouping, as in the original order of calls
consensus_map.setColumnHeaders(file_descriptions)

Consensus_file = os.path.join(path, 'consensus.consensusXML')
ConsensusXMLFile().store(Consensus_file, consensus_map)

In [None]:
# 5) Filter out features that have not fragmented
#
# Only consensus features carrying at least one peptide identification are
# copied into the new map, which is stored as filtered.consensusXML.

input_consensus = "results/interim/consensus.consensusXML"
cmap = ConsensusMap()
ConsensusXMLFile().load(input_consensus, cmap)

# start from a copy of the loaded map and clear it before refilling
# NOTE(review): clear(False) presumably removes the features but keeps the
# map-level metadata - confirm against the pyOpenMS documentation
new_map = ConsensusMap(cmap)
new_map.clear(False)

for f in cmap:
    if f.getPeptideIdentifications() == []:
        continue
    new_map.push_back(f)

Consensus_file = os.path.join(path, 'filtered.consensusXML')
ConsensusXMLFile().store(Consensus_file, new_map)

In [None]:
# Build the feature matrix for GNPS FBMN from the filtered consensus map and
# write it, tab-separated, to results/interim/FeatureMatrix.csv.

# get intensities as a DataFrame
intensities = new_map.get_intensity_df()

# get meta data as DataFrame
meta_data = new_map.get_metadata_df()

# concatenate the two side by side and replace the labeled index with a plain one
result = pd.concat([meta_data, intensities], axis=1)
result.reset_index(drop=True, inplace=True)

# change the format of the table to one that is compatible for GNPS FBMN:
# a constant "#CONSENSUS" column, "_cf"-suffixed feature columns, no sequence
result.insert(loc=0, column='#CONSENSUS', value="CONSENSUS")
result = result.rename(columns={"charge": "charge_cf", "RT": "rt_cf", "mz": "mz_cf", "quality": "quality_cf", "width": "width_cf"})
result = result.drop(["sequence"], axis=1)
result = result.sort_index(axis=1)

# one row per input map: its id in the consensus map plus its header fields
filemeta = new_map.getColumnHeaders()
mapIDs = list(filemeta.keys())
headers = list(filemeta.values())
DF = pd.DataFrame({
    'id': mapIDs,
    'filename': [header.filename for header in headers],
    'label': [header.label for header in headers],
    'size': [header.size for header in headers],
})
DF["id"] = "intensity_" + DF["id"].astype(str)

# Rename every column whose name equals a map's file name to that map's
# "intensity_<id>". setdefault keeps the first id if a file name occurs twice.
# (No loop variable named `path` here: that name holds the interim directory.)
intensity_names = {}
for file_name, intensity_id in zip(DF["filename"], DF["id"]):
    intensity_names.setdefault(file_name, intensity_id)
result = result.rename(columns=intensity_names)

# fixed leading columns first, everything else in its sorted order
preordered = ["#CONSENSUS", "charge_cf", "rt_cf", "mz_cf", "quality_cf", "width_cf"]
new_cols = preordered + [c for c in result.columns if c not in preordered]
new_df = result.reindex(columns=new_cols)
new_df.to_csv("results/interim/FeatureMatrix.csv", sep="\t", index=False)
new_df

6) Create a metadata table for GNPS FBMN:

In [None]:
# shell: list the names of all .mzML files in Example_data/ into Example_data/filelist.txt
!(cd Example_data/ && ls *.mzML > filelist.txt)

In [None]:
def build_metadata_table(mzml_names):
    """Return the GNPS FBMN metadata table for the given mzML file names.

    The table has one row per name, in the order given, with the columns
    ``filename``, ``ATTRIBUTE_MAPID`` ("MAP0", "MAP1", ... by row position)
    and ``ATTRIBUTE_compound`` (the file name without its ".mzML" extension).
    """
    table = pd.DataFrame({"filename": list(mzml_names)})
    table["ATTRIBUTE_MAPID"] = ["MAP" + str(i) for i in range(len(table))]
    # anchored and escaped, so only a literal trailing ".mzML" is removed
    table["ATTRIBUTE_compound"] = table["filename"].replace(r"\.mzML$", value="", regex=True)
    return table


# Take the file names from the same sorted glob the other steps use, so that
# the map numbers follow that order and do not depend on how `ls` sorts.
mzml_names = sorted(os.path.basename(f) for f in glob.glob("Example_data/*.mzML"))
metadata = build_metadata_table(mzml_names)

# make sure the export directory exists before writing into it
os.makedirs("results/GNPSexport", exist_ok=True)
metadata.to_csv("results/GNPSexport/metadata.tsv", sep='\t')
metadata

In [None]:
# shell: delete Example_data/filelist.txt again
!(cd Example_data/ && rm filelist.txt)

In [None]:
# merge the map ID information with the feature quantification table
#
# Reduce the metadata table to the columns "#MAP", "id" and "filename" and
# write it, tab-separated, to results/interim/mapIDs.csv.

metadata["#MAP"] = "MAP"
# \d+ keeps every digit of the map number (e.g. "MAP10" -> "10"); with
# expand=False the result is a Series rather than a one-column DataFrame
metadata["id"] = metadata["ATTRIBUTE_MAPID"].str.extract(r"(\d+)", expand=False)
# selecting the three columns also drops ATTRIBUTE_MAPID and all others
metadata = metadata[["#MAP", "id", "filename"]]
metadata.to_csv("results/interim/mapIDs.csv", sep="\t", index=False)
metadata

In [None]:
# Write the GNPS feature quantification table: the map ID table followed
# directly by the feature matrix, concatenated as plain text.

map_file = "results/interim/mapIDs.csv"
feature_file = "results/interim/FeatureMatrix.csv"
merged_file = "results/GNPSexport/FeatureQuantificationTable.txt"

# make sure the export directory exists before writing into it
os.makedirs(os.path.dirname(merged_file), exist_ok=True)

# the handles get their own names so the three path variables keep their paths
with open(map_file, "r") as map_handle:
    map_text = map_handle.read()

with open(feature_file, "r") as feature_handle:
    feature_text = feature_handle.read()

with open(merged_file, "w") as merged_handle:
    merged_handle.write(map_text + feature_text)

In [None]:
# Export all MS2 information in a .MGF file

consensus = "results/interim/filtered.consensusXML"
out_file = "results/GNPSexport/MSMS.mgf"
input_mzml_files = sorted(glob.glob("Example_data/*.mzML"))

# the mzML paths are handed over as bytes, the two single paths as String
encoded_mzml_files = list(map(str.encode, input_mzml_files))

spectra_clustering = GNPSMGFFile()
spectra_clustering.run(String(consensus), encoded_mzml_files, String(out_file))