# `Pre-processing workflow`
#### `and export all files for GNPS FBMN`

Import libraries:

In [5]:
from pyopenms import *
import os
import glob
import pandas as pd
import numpy as np

In [6]:
# constant path for interim files

path= "results/interim"
if not os.path.exists(path):
    os.mkdir(path)

In [7]:
# 1) Feature Detection

input_mzml_files = glob.glob('Example_data/*.mzML')

# 1.1) Mass trace detection

for filename in input_mzml_files:
    print("Mass Trace Detection: ", filename)
    exp = MSExperiment()
    MzMLFile().load(filename, exp)
    exp.sortSpectra(True)
    mass_traces = []
    mtd = MassTraceDetection()
    mtd_par = mtd.getDefaults()
    mtd_par.setValue("mass_error_ppm", 10.0) # high-res instrument, orbitraps
    mtd_par.setValue("noise_threshold_int", 1.0e04) # data-dependent (usually works for orbitraps)
    mtd.setParameters(mtd_par)
    mtd.run(exp, mass_traces, 0)

# 1.2) Elution peak detection
    print("Elution Peak Detection: ", filename)
    mass_traces_split = []
    mass_traces_final = []
    epd = ElutionPeakDetection()
    epd_par = epd.getDefaults()
    epd_par.setValue("width_filtering", "fixed") #The fixed setting filters out mass traces outside the [min_fwhm: 1.0, max_fwhm: 60.0] interval
    epd.setParameters(epd_par)
    epd.detectPeaks(mass_traces, mass_traces_split)
     
    if (epd.getParameters().getValue("width_filtering") == "auto"):
          epd.filterByPeakWidth(mass_traces_split, mass_traces_final)
    else:
          mass_traces_final = mass_traces_split

# 1.3) Feature detection
    print("Feature Detection: ", filename)
    feature_map_FFM = FeatureMap()
    feat_chrom = []
    ffm = FeatureFindingMetabo()
    ffm_par = ffm.getDefaults() 
    ffm_par.setValue("isotope_filtering_model", "none") #no need to use support vector machine models for isotope scoring if you have high res data
    ffm_par.setValue("remove_single_traces", "true") #remove mass traces without satellite isotopic traces
    ffm.setParameters(ffm_par)
    ffm.run(mass_traces_final, feature_map_FFM, feat_chrom)
    feature_map_FFM.setUniqueIds()
    feature_map_FFM.setPrimaryMSRunPath([filename.encode()])
    print(filename[7:-5] + ".featureXML")
    FeatureXMLFile().store(os.path.join(path, os.path.basename(filename)[:-5] + ".featureXML"), feature_map_FFM)
    
print("Finished Feature Detection")

Mass Trace Detection:  Example_data/Kirromycin.mzML
Progress of 'mass trace detection':
Elution Peak Detection:  Example_data/Kirromycin.mzML
-- done [took 0.18 s (CPU), 0.25 s (Wall)] -- 
Progress of 'elution peak detection':
-- done [took 1.11 s (CPU), 5.19 s (Wall)] -- 
 Example_data/Kirromycin.mzML
Progress of 'assembling mass traces to features':
-- done [took 1.85 s (CPU), 8.37 s (Wall)] -- 
_data/Kirromycin.featureXML
Mass Trace Detection:  Example_data/Epemicins.mzML
Progress of 'Elution Peak Detection: mass trace detection':
-- done [took 0.61 s (CPU), 0.70 s (Wall)] -- 
 Example_data/Epemicins.mzML
Progress of 'elution peak detection':
-- done [took 2.62 s (CPU), 10.54 s (Wall)] -- 
 Example_data/Epemicins.mzML
Progress of 'assembling mass traces to features':
_data/Epemicins.featureXML
-- done [took 3.72 s (CPU), 14.70 s (Wall)] -- 
Mass Trace Detection:  Example_data/Pentamycin_std.mzML
Progress of 'mass trace detection':
-- done [took 0.06 s (CPU), 0.09 s (Wall)] -- 
Eluti

In [8]:
# load feature files 

input_feature_files = sorted(glob.glob('results/interim/*.featureXML'))

feature_maps = []
for featurexml_file in input_feature_files:
    fmap = FeatureMap()
    FeatureXMLFile().load(featurexml_file, fmap)
    feature_maps.append(fmap)

In [9]:
# 2) Map alignment 

#use as reference for alignment, the file with the largest number of features (works well if you have a pooled QC for example)
ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in feature_maps]), key=lambda x:x[1])][-1]

aligner = MapAlignmentAlgorithmPoseClustering()

#parameter optimization
aligner_par= aligner.getDefaults()
aligner_par.setValue("max_num_peaks_considered", -1) #infinite
aligner_par.setValue("superimposer:mz_pair_max_distance", 0.05) 
aligner_par.setValue("pairfinder:distance_MZ:max_difference", 10.0) #Never pair features with larger m/z distance
aligner_par.setValue("pairfinder:distance_MZ:unit", "ppm")
aligner.setParameters(aligner_par)
aligner.setReference(feature_maps[ref_index])

for feature_map in feature_maps[:ref_index] + feature_maps[ref_index+1:]:
    trafo = TransformationDescription() #save the transformed data points
    aligner.align(feature_map, trafo)
    transformer = MapAlignmentTransformer()
    transformer.transformRetentionTimes(feature_map, trafo, True) 

#save the aligned feature maps
for feature_map in feature_maps:    
    feature_file = os.path.join(path, 'Aligned_' + os.path.basename(feature_map.getMetaValue('spectra_data')[0].decode())[:-5] +".featureXML")
    FeatureXMLFile().store(feature_file, feature_map)

In [10]:
# Display the features in a dataframe

input_feature_files = sorted(glob.glob('results/interim/Aligned*.featureXML'))

for filename in input_feature_files:
    fmap = FeatureMap()
    FeatureXMLFile().load(filename, fmap)
    DF= fmap.get_df(export_peptide_identifications=False)
    feature_csv= os.path.join(path, os.path.basename(filename)[8:-10] +"csv")
    DF.to_csv(feature_csv)
print("example:", os.path.basename(filename))
display(DF)

example: Aligned_Pentamycin_std.featureXML


Unnamed: 0_level_0,charge,RT,mz,RTstart,RTend,MZstart,MZend,quality,intensity
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
15174451414421252977,1,534.033696,183.080447,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.001487,1.128897e+06
2200374209011886479,1,223.516607,185.114835,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.002798,2.156296e+06
17996572427023070265,1,244.219624,186.221632,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.024248,1.818761e+07
10811216579247619652,1,-103.840382,194.117569,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.004314,3.285196e+06
9998814133344025621,1,-129.351567,194.117570,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.004472,3.427699e+06
...,...,...,...,...,...,...,...,...,...
13653775389494155901,1,221.983403,737.396463,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.001707,1.163941e+06
2219378455649523967,1,274.062724,741.404400,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.000901,6.235882e+05
12892846233140306495,1,214.559598,753.391255,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.001924,1.296664e+06
2603084014433537888,1,354.587372,755.353376,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.000651,4.157225e+05


In [11]:
# 3) Annotate features that have MS2 information

use_centroid_rt= False
use_centroid_mz= True
protein_ids = []
peptide_ids= []

mapper = IDMapper()

input_mzml_files= sorted(glob.glob("Example_data/*.mzML"))

for filename in input_mzml_files:
    exp = MSExperiment()
    MzMLFile().load(filename, exp)

    for fmap in feature_maps:
        peptide_ids = []
        protein_ids = []
        if os.path.basename(fmap.getMetaValue('spectra_data')[0].decode()) == os.path.basename(filename):
            mapper.annotate(fmap, peptide_ids, protein_ids, use_centroid_rt, use_centroid_mz, exp)
            featureidx_file = os.path.join(path, 'IDMapper_' + os.path.basename(filename[:-4]) +"featureXML")
            FeatureXMLFile().store(featureidx_file, fmap)

Unassigned peptides: 0
Peptides assigned to exactly one feature: 0
Peptides assigned to multiple features: 0
Unassigned and unidentified precursors: 3587
Unidentified precursor assigned to exactly one feature: 0
Unidentified precursor assigned to multiple features: 0
Feature annotation with identifications:
    no ID: 2491
    single ID: 0
    multiple IDs (identical): 0
    multiple IDs (divergent): 0


Unassigned peptides: 0
Peptides assigned to exactly one feature: 0
Peptides assigned to multiple features: 0
Unassigned and unidentified precursors: 3587
Unidentified precursor assigned to exactly one feature: 0
Unidentified precursor assigned to multiple features: 0
Feature annotation with identifications:
    no ID: 2491
    single ID: 0
    multiple IDs (identical): 0
    multiple IDs (divergent): 0


Unassigned peptides: 0
Peptides assigned to exactly one feature: 0
Peptides assigned to multiple features: 0
Unassigned and unidentified precursors: 3587
Unidentified precursor assigne

In [12]:
# load annotated feature files 

input_feature_files = sorted(glob.glob('results/interim/IDMapper_*.featureXML'))

feature_maps = []
for featurexml_file in input_feature_files:
    fmap = FeatureMap()
    FeatureXMLFile().load(featurexml_file, fmap)
    feature_maps.append(fmap)

In [13]:
# 4) Feature grouping

feature_grouper = FeatureGroupingAlgorithmKD()

consensus_map = ConsensusMap()
file_descriptions = consensus_map.getColumnHeaders()

for i, feature_map in enumerate(feature_maps):
    file_description = file_descriptions.get(i, ColumnHeader())
    file_description.filename = os.path.basename(feature_map.getMetaValue('spectra_data')[0].decode())
    file_description.size = feature_map.size()
    file_descriptions[i] = file_description

feature_grouper.group(feature_maps, consensus_map)
consensus_map.setColumnHeaders(file_descriptions)


Consensus_file= os.path.join(path, 'consensus' + ".consensusXML")
ConsensusXMLFile().store(Consensus_file, consensus_map)

Progress of 'computing RT transformations':
-- done [took 0.03 s (CPU), 0.03 s (Wall)] -- 
Progress of 'linking features':
-- done [took 0.08 s (CPU), 0.10 s (Wall)] -- 
ConsensusXMLFile::store():  found 3528 invalid unique ids


In [14]:
# 5) Filter out features that have not fragmented

input_consensus= "results/interim/consensus.consensusXML"
cmap = ConsensusMap()
ConsensusXMLFile().load(input_consensus, cmap)
new_map= ConsensusMap(cmap)
new_map.clear(False)
for f in cmap:
    if f.getPeptideIdentifications() !=[]:
        new_map.push_back(f)

Consensus_file= os.path.join(path,'filtered' + ".consensusXML")
ConsensusXMLFile().store(Consensus_file, new_map)

ConsensusXMLFile::store():  found 1119 invalid unique ids


In [15]:
# Export all MS2 information in a .MGF file

consensus= "results/interim/filtered.consensusXML"
input_mzml_files=sorted(glob.glob("Example_data/*.mzML"))
out_file= "results/GNPSexport/MSMS.mgf"

spectra_clustering= GNPSMGFFile()

spectra_clustering.run(String(consensus),[s.encode() for s in input_mzml_files], String(out_file))

In [25]:
# Export a .TXT table of features 

output_file= "results/GNPSexport/FeatureQuantificationTable.txt"
IonIdentityMolecularNetworking.writeFeatureQuantificationTable(cmap, output_file)

In [27]:
# Create a metadata table from the list of mzML files compatible for GNPS

!(cd Example_data/ && ls *.mzML > filelist.txt)

header_list = ["filename"]
metadata=pd.read_csv("Example_data/filelist.txt", names=header_list, index_col= None)
metadata['ATTRIBUTE_MAPID'] = np.arange(len(metadata))
metadata["ATTRIBUTE_MAPID"]= "MAP" + metadata["ATTRIBUTE_MAPID"].astype(str)
metadata['ATTRIBUTE_compound']=metadata['filename'].replace(".mzML", value="", regex=True)
metadata.to_csv("results/GNPSexport/metadata.tsv", sep='\t')

!(cd Example_data/ && rm filelist.txt)

metadata

Unnamed: 0,filename,ATTRIBUTE_MAPID,ATTRIBUTE_compound
0,Epemicins.mzML,MAP0,Epemicins
1,Kirromycin.mzML,MAP1,Kirromycin
2,Leupeptin_std.mzML,MAP2,Leupeptin_std
3,Pentamycin_std.mzML,MAP3,Pentamycin_std
