# `Pre-processing workflow`
#### `and export all files for GNPS FBMN`

Import libraries:

In [1]:
from pyopenms import *
import os
import glob
import pandas as pd
import numpy as np

In [2]:
# constant path for interim files
path = "results/interim"
if not os.path.exists(path): #if it doesn't exist
    os.mkdir(path)  #make a new one

In [3]:
# 1) Feature Detection

input_mzml_files = glob.glob('Example_data/*.mzML') # introduce a set of mzML files from the Example_data directory

# 1.1) Mass trace detection

for filename in input_mzml_files: #for each file in the set of files
    print("Mass Trace Detection: ", filename) #print the filename
    exp = MSExperiment()    
    MzMLFile().load(filename, exp) # load each mzML file to an OpenMS file format (MSExperiment)
    exp.sortSpectra(True) # Sorts spectra by RT
    mass_traces = [] #introduce an empty list where the mass traces are loaded
    mtd = MassTraceDetection()
    mtd_par = mtd.getDefaults() #get the default parameters in order to edit them
    mtd_par.setValue("mass_error_ppm", 10.0) # high-res instrument, orbitraps
    mtd_par.setValue("noise_threshold_int", 1.0e04) # data-dependent (usually works for orbitraps)
    mtd.setParameters(mtd_par) #set the new parameters
    mtd.run(exp, mass_traces, 0) #run mass trace detection

# 1.2) Elution peak detection
    print("Elution Peak Detection: ", filename)
    mass_traces_deconvol = []
    epd = ElutionPeakDetection()
    epd_par = epd.getDefaults()
    epd_par.setValue("width_filtering", "fixed") #The fixed setting filters out mass traces outside the [min_fwhm: 1.0, max_fwhm: 60.0] interval
    epd.setParameters(epd_par)
    epd.detectPeaks(mass_traces, mass_traces_deconvol)
     
# 1.3) Feature detection
    print("Feature Detection: ", filename)
    feature_map_FFM = FeatureMap() # output features 
    chrom_out = [] # output chromatograms 
    ffm = FeatureFindingMetabo()
    ffm_par = ffm.getDefaults() 
    ffm_par.setValue("isotope_filtering_model", "none") #no need to use support vector machine models for isotope scoring if you have high res data (make sure this is correct)
    ffm_par.setValue("remove_single_traces", "true") #remove mass traces without satellite isotopic traces
    ffm.setParameters(ffm_par)
    ffm.run(mass_traces_deconvol, feature_map_FFM, chrom_out)
    feature_map_FFM.setUniqueIds() #Assigns a new, valid unique id per feature
    feature_map_FFM.setPrimaryMSRunPath([filename.encode()]) #Sets the file path to the primary MS run (usually the mzML file)
    FeatureXMLFile().store(os.path.join(path, os.path.basename(filename)[:-5] + ".featureXML"), feature_map_FFM)
    
print("Finished Feature Detection")

Mass Trace Detection:  Example_data/Leupeptin_std.mzML
Progress of 'mass trace detection':
-- done [took 0.01 s (CPU), 0.02 s (Wall)] -- 
Elution Peak Detection:  Example_data/Leupeptin_std.mzML
Progress of 'elution peak detection':
-- done [took 0.21 s (CPU), 0.03 s (Wall)] -- 
Feature Detection:  Example_data/Leupeptin_std.mzML
Progress of 'assembling mass traces to features':
-- done [took 0.09 s (CPU), 0.01 s (Wall)] -- 
Mass Trace Detection:  Example_data/Pentamycin_std.mzML
Progress of 'mass trace detection':
Elution Peak Detection:  Example_data/Pentamycin_std.mzML
-- done [took 0.02 s (CPU), 0.02 s (Wall)] -- 
Progress of 'elution peak detection':
-- done [took 0.05 s (CPU), 0.01 s (Wall)] -- 
Feature Detection:  Example_data/Pentamycin_std.mzML
Mass Trace Detection:  Example_data/Kirromycin.mzML
Progress of 'assembling mass traces to features':
-- done [took 0.02 s (CPU), 0.00 s (Wall)] -- 
Elution Peak Detection:  Example_data/Kirromycin.mzML
Feature Detection:  Example_data/

In [4]:
# load feature files 

input_feature_files = glob.glob('results/interim/*.featureXML') # set of feature files

feature_maps = [] #empty list to fill with FeatureMaps: the OpenMS file format for feature files
for featurexml_file in input_feature_files:
    fmap = FeatureMap()
    FeatureXMLFile().load(featurexml_file, fmap) # load each file to a feature map
    feature_maps.append(fmap) #append all maps to the empty list 

In [5]:
# 2) Map alignment 

#use as reference for alignment, the file with the largest number of features (works well if you have a pooled QC for example)
ref_index = feature_maps.index(sorted(feature_maps, key=lambda x: x.size())[-1])

aligner = MapAlignmentAlgorithmPoseClustering()

#parameter optimization
aligner_par= aligner.getDefaults()
aligner_par.setValue("max_num_peaks_considered", -1) #infinite
aligner_par.setValue("superimposer:mz_pair_max_distance", 0.05) #...?
aligner_par.setValue("pairfinder:distance_MZ:max_difference", 10.0) # Never pair features with larger m/z distance
aligner_par.setValue("pairfinder:distance_MZ:unit", "ppm")
aligner.setParameters(aligner_par)
aligner.setReference(feature_maps[ref_index])

for feature_map in feature_maps[:ref_index] + feature_maps[ref_index+1:]:
    trafo = TransformationDescription() #save the transformed data points
    aligner.align(feature_map, trafo)
    transformer = MapAlignmentTransformer()
    transformer.transformRetentionTimes(feature_map, trafo, True) 

#save the aligned feature maps
for feature_map in feature_maps:    
    feature_file = os.path.join(path, 'Aligned_' + os.path.basename(feature_map.getMetaValue('spectra_data')[0].decode())[:-5] +".featureXML")
    FeatureXMLFile().store(feature_file, feature_map)

In [6]:
# Display the features in a dataframe

input_feature_files = glob.glob('results/interim/Aligned*.featureXML')

for filename in input_feature_files:
    fmap = FeatureMap()
    FeatureXMLFile().load(filename, fmap)
    df= fmap.get_df(export_peptide_identifications=False) # we are not working with targeted peptide analysis, so we do not need that info
    df.to_csv(os.path.join(path, os.path.basename(filename)[8:-10] +"csv"))
print("example:", os.path.basename(filename))
df

example: Aligned_Epemicins.featureXML


Unnamed: 0_level_0,charge,RT,mz,RTstart,RTend,MZstart,MZend,quality,intensity
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2217936491873357040,1,329.607460,208.133234,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.000180,3.212667e+07
13626119967329176655,1,310.614806,208.133256,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.000011,1.985221e+06
6551803616801075372,1,379.585751,260.185663,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.000258,4.589339e+07
15171821162113179229,1,228.511545,261.123425,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.000025,4.301382e+06
10501161551313202635,1,316.367570,271.060021,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.000009,1.677704e+06
...,...,...,...,...,...,...,...,...,...
2831295821287060323,1,350.433279,1603.895077,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.001191,1.190649e+08
6013553204980098844,1,342.791333,1604.396813,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.000371,4.679359e+07
3104192950916895803,2,47.941818,1639.049833,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.000003,2.294459e+05
5934341519989898118,1,173.429955,1646.474399,1.797693e+308,-1.797693e+308,1.797693e+308,-1.797693e+308,0.000006,5.192035e+05


In [7]:
# 3) IDMapper annotate features that have MS2 information with peptide identifications which is unrelated. This step is important for FBMN
# because we have to introduce only features that have fragmented. 

use_centroid_rt = False
use_centroid_mz = True
protein_ids = []
peptide_ids = []

mapper = IDMapper()

input_mzml_files = glob.glob("Example_data/*.mzML")

for filename in input_mzml_files:
    exp = MSExperiment()
    MzMLFile().load(filename, exp)

    for fmap in feature_maps:
        peptide_ids = []
        protein_ids = []
        if os.path.basename(fmap.getMetaValue('spectra_data')[0].decode()) == os.path.basename(filename):
            mapper.annotate(fmap, peptide_ids, protein_ids, use_centroid_rt, use_centroid_mz, exp)
            featureidx_file = os.path.join(path, 'IDMapper_' + os.path.basename(filename[:-4]) +"featureXML")
            FeatureXMLFile().store(featureidx_file, fmap)

Unassigned peptides: 0
Peptides assigned to exactly one feature: 0
Peptides assigned to multiple features: 0
Unassigned and unidentified precursors: 3031
Unidentified precursor assigned to exactly one feature: 0
Unidentified precursor assigned to multiple features: 0
Feature annotation with identifications:
    no ID: 107
    single ID: 0
    multiple IDs (identical): 0
    multiple IDs (divergent): 0


Unassigned peptides: 0
Peptides assigned to exactly one feature: 0
Peptides assigned to multiple features: 0
Unassigned and unidentified precursors: 2970
Unidentified precursor assigned to exactly one feature: 0
Unidentified precursor assigned to multiple features: 0
Feature annotation with identifications:
    no ID: 85
    single ID: 0
    multiple IDs (identical): 0
    multiple IDs (divergent): 0


Unassigned peptides: 0
Peptides assigned to exactly one feature: 0
Peptides assigned to multiple features: 0
Unassigned and unidentified precursors: 5090
Unidentified precursor assigned t

In [8]:
# load annotated feature files 

input_feature_files = glob.glob('results/interim/IDMapper_*.featureXML')

feature_maps = []
for featurexml_file in input_feature_files:
    fmap = FeatureMap()
    FeatureXMLFile().load(featurexml_file, fmap)
    feature_maps.append(fmap)

In [9]:
# 4) Feature grouping

feature_grouper = FeatureGroupingAlgorithmKD()

consensus_map = ConsensusMap()
file_descriptions = consensus_map.getColumnHeaders()

for i, feature_map in enumerate(feature_maps):
    file_description = file_descriptions.get(i, ColumnHeader())
    file_description.filename = os.path.basename(feature_map.getMetaValue('spectra_data')[0].decode())
    file_description.size = feature_map.size()
    file_descriptions[i] = file_description

feature_grouper.group(feature_maps, consensus_map)
consensus_map.setColumnHeaders(file_descriptions)


Consensus_file= os.path.join(path, 'consensus' + ".consensusXML")
ConsensusXMLFile().store(Consensus_file, consensus_map)

Progress of 'computing RT transformations':
-- done [took 0.00 s (CPU), 0.01 s (Wall)] -- 
Progress of 'linking features':
-- done [took 0.02 s (CPU), 0.01 s (Wall)] -- 
ConsensusXMLFile::store():  found 3528 invalid unique ids


In [10]:
# 5) Filter out features that have not fragmented

input_consensus = "results/interim/consensus.consensusXML"
cmap = ConsensusMap()
ConsensusXMLFile().load(input_consensus, cmap)
new_map = ConsensusMap(cmap)
new_map.clear(False)
for f in cmap:
    if f.getPeptideIdentifications():
        new_map.push_back(f)

Consensus_file = os.path.join(path,'filtered' + ".consensusXML")
ConsensusXMLFile().store(Consensus_file, new_map)

ConsensusXMLFile::store():  found 1124 invalid unique ids


In [11]:
# Export all MS2 information in a .MGF file

if not os.path.exists("results/GNPSexport"): #if it doesn't exist
    os.mkdir("results/GNPSexport")  #make a new one

consensus = "results/interim/filtered.consensusXML"
input_mzml_files = glob.glob("Example_data/*.mzML")
out_file = "results/GNPSexport/MSMS.mgf"

spectra_clustering = GNPSMGFFile()

spectra_clustering.run(String(consensus),[s.encode() for s in input_mzml_files], String(out_file))

In [12]:
# Export a .TXT table of features 

output_file = "results/GNPSexport/FeatureQuantificationTable.txt"
IonIdentityMolecularNetworking.writeFeatureQuantificationTable(cmap, output_file)

In [13]:
# Create a metadata table from the list of mzML files compatible for GNPS

!(cd Example_data/ && ls *.mzML > filelist.txt)

header_list = ["filename"]
metadata=pd.read_csv("Example_data/filelist.txt", names=header_list, index_col= None)
metadata['ATTRIBUTE_MAPID'] = np.arange(len(metadata))
metadata["ATTRIBUTE_MAPID"]= "MAP" + metadata["ATTRIBUTE_MAPID"].astype(str)
metadata['ATTRIBUTE_compound']= metadata['filename'].replace(".mzML", value="", regex=True)
metadata.to_csv("results/GNPSexport/metadata.tsv", sep='\t')

!(cd Example_data/ && rm filelist.txt)

metadata

Unnamed: 0,filename,ATTRIBUTE_MAPID,ATTRIBUTE_compound
0,Epemicins.mzML,MAP0,Epemicins
1,Kirromycin.mzML,MAP1,Kirromycin
2,Leupeptin_std.mzML,MAP2,Leupeptin_std
3,Pentamycin_std.mzML,MAP3,Pentamycin_std
