Import the feature files and load each of them into a FeatureMap() object

In [1]:
import os
import glob
from pyopenms import *
import pandas as pd

Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


In [None]:
# Make sure the output directory for the intermediate GNPS-export files exists.
# os.makedirs also creates missing parent directories and, with exist_ok=True,
# does nothing when the directory is already there, so the cell can be re-run.
path= "results/GNPSexport/interim/"
os.makedirs(path, exist_ok=True)

# Load every featureXML file into its own FeatureMap. The file list is sorted
# (as in the later cells) so the order of the maps is reproducible between runs.
input_feature_files = sorted(glob.glob('results/features/interim/*.featureXML'))
feature_maps = []
for featurexml_file in input_feature_files:
    fmap = FeatureMap()
    FeatureXMLFile().load(featurexml_file, fmap)
    feature_maps.append(fmap)

1. The MapAlignmentAlgorithmPoseClustering algorithm is used to correct the retention time shifts between files that are caused by chromatographic differences. 
The reference used for the map alignment is the feature map with the highest number of features.

In [None]:
if not feature_maps:
    raise ValueError("No feature maps were loaded; nothing to align.")

# The reference is the feature map with the most features.
# On ties the map with the highest index is chosen.
ref_index = max(range(len(feature_maps)), key=lambda i: (feature_maps[i].size(), i))

aligner = MapAlignmentAlgorithmPoseClustering()
aligner_par= aligner.getDefaults()

aligner_par.setValue("max_num_peaks_considered", -1)
aligner_par.setValue("superimposer:mz_pair_max_distance", 0.05)
aligner_par.setValue("pairfinder:distance_MZ:max_difference", 10.0)
aligner_par.setValue("pairfinder:distance_MZ:unit", "ppm")
aligner.setParameters(aligner_par)
aligner.setReference(feature_maps[ref_index])

# Keep one transformation per feature map (same order as feature_maps) so that
# each trafoXML written below holds the transformation of its own map and not
# whichever transformation happened to be computed last.
transformer = MapAlignmentTransformer()
trafos = []
for map_index, feature_map in enumerate(feature_maps):
    trafo = TransformationDescription()
    if map_index != ref_index:
        aligner.align(feature_map, trafo)
        transformer.transformRetentionTimes(feature_map, trafo, True) # store original RT as meta value
    # NOTE(review): the reference map is not aligned, so it keeps the
    # default-constructed TransformationDescription - presumably a no-op
    # transformation; confirm this is what downstream consumers expect.
    trafos.append(trafo)

for feature_map, trafo in zip(feature_maps, trafos):
    # [7:-5] strips the first 7 and last 5 characters of the source file name
    # (e.g. a "PCpeak_" prefix and a ".mzML" suffix).
    sample_name = os.path.basename(feature_map.getMetaValue('spectra_data')[0].decode())[7:-5]
    feature_file = os.path.join("results", "GNPSexport", "interim", 'MapAligned_' + sample_name + ".featureXML")
    trafo_file = os.path.join("results", "GNPSexport", "interim", 'MapAligned_' + sample_name + ".trafoXML")
    FeatureXMLFile().store(feature_file, feature_map)
    TransformationXMLFile().store(trafo_file, trafo)

Import the aligned feature files and load each of them into a FeatureMap() object

In [None]:
# Gather the aligned featureXML files written by the map-alignment step,
# in sorted order, and read each one into its own FeatureMap.
input_feature_files = glob.glob("results/GNPSexport/interim/MapAligned*.featureXML")
input_feature_files.sort()

feature_maps = []
for aligned_path in input_feature_files:
    aligned_map = FeatureMap()
    FeatureXMLFile().load(aligned_path, aligned_map)
    feature_maps += [aligned_map]

2. IDMapper annotates MS2 fragmentation spectra as peptide/protein identifications. This is currently the only way to annotate MS2 data for GNPS FBMN.

In [None]:
use_centroid_rt= False
use_centroid_mz= True
protein_ids = []
peptide_ids= []

mapper = IDMapper()

input_mzml_files= sorted(glob.glob("results/interim/PCpeak_*.mzML"))

# Names of the source files whose feature map has been annotated and stored.
annotated_sources = set()

for filename in input_mzml_files:
    exp = MSExperiment()
    MzMLFile().load(filename, exp)
    mzml_name = os.path.basename(filename)

    for fmap in feature_maps:
        # A feature map belongs to this mzML file when the base name of its
        # 'spectra_data' source matches the base name of the mzML file.
        source_name = os.path.basename(fmap.getMetaValue('spectra_data')[0].decode())
        if source_name != mzml_name:
            continue

        # Start from empty ID lists for every annotated map.
        peptide_ids = []
        protein_ids = []
        mapper.annotate(fmap, peptide_ids, protein_ids, use_centroid_rt, use_centroid_mz, exp)

        # Store only the map that was just annotated. Writing every map for
        # every mzML file would rewrite each output once per input file and
        # would also emit un-annotated maps under the 'IDMapper_' name.
        # [7:-5] strips the first 7 and last 5 characters of the source name.
        featureidx_file = os.path.join("results", "GNPSexport", "interim", 'IDMapper_' + source_name[7:-5] +".featureXML")
        FeatureXMLFile().store(featureidx_file, fmap)
        annotated_sources.add(source_name)

# Make missing inputs visible instead of silently producing fewer files.
for fmap in feature_maps:
    source_name = os.path.basename(fmap.getMetaValue('spectra_data')[0].decode())
    if source_name not in annotated_sources:
        print("Warning: no matching mzML file found for feature map of " + source_name + "; it was not annotated.")

Import the annotated feature files and load each of them into a FeatureMap() object

In [None]:
# Gather the IDMapper-annotated featureXML files, in sorted order,
# and read each one into its own FeatureMap.
input_feature_files = glob.glob('results/GNPSexport/interim/IDMapper*.featureXML')
input_feature_files.sort()

feature_maps = []
for annotated_path in input_feature_files:
    annotated_map = FeatureMap()
    FeatureXMLFile().load(annotated_path, annotated_map)
    feature_maps += [annotated_map]

3. The Feature Grouping Algorithm is used to aggregate the feature information (from single files) into ConsensusFeatures, linking features from different files that have a similar m/z and RT (MS1 level).

In [None]:
feature_grouper = FeatureGroupingAlgorithmKD()

consensus_map = ConsensusMap()
file_descriptions = consensus_map.getColumnHeaders()

# Register one column header per input feature map, recording the source
# file name and the number of features in that map.
for map_index, grouped_map in enumerate(feature_maps):
    header = file_descriptions.get(map_index, ColumnHeader())
    header.filename = grouped_map.getMetaValue('spectra_data')[0].decode()
    header.size = grouped_map.size()
    file_descriptions[map_index] = header

# Link the features of all maps into the consensus map, then attach the headers.
feature_grouper.group(feature_maps, consensus_map)
consensus_map.setColumnHeaders(file_descriptions)

Consensus_file = os.path.join(
    "results", "", "GNPSexport", "", "interim", "", 'consensus' + ".consensusXML"
)
ConsensusXMLFile().store(Consensus_file, consensus_map)

# Per-file intensities and the RT / mz / charge columns of the metadata,
# each as a DataFrame
intensities = consensus_map.get_intensity_df()
meta_data = consensus_map.get_metadata_df().loc[:, ['RT', 'mz', 'charge']]

# Put both side by side and replace the labeled index with a plain 0..n-1 index
result = pd.concat([meta_data, intensities], axis=1).reset_index(drop=True)

# Write the combined table as a tab-separated file (index column omitted)
result.to_csv('results/GNPSexport/interim/consensus.tsv', sep='\t', index=False)

4. File-filtering is used to remove all features that do not have an MS2 spectrum.

In [None]:
input_consensus = "results/GNPSexport/interim/consensus.consensusXML"
cmap = ConsensusMap()
ConsensusXMLFile().load(input_consensus, cmap)

# Build the output map as a copy of the loaded map and empty it with clear(False).
# NOTE(review): clear(False) presumably drops the features while keeping the
# map-level meta data (e.g. column headers) - confirm against the pyopenms docs.
new_map = ConsensusMap(cmap)
new_map.clear(False)

# Keep only the consensus features that carry at least one peptide identification.
for consensus_feature in cmap:
    if consensus_feature.getPeptideIdentifications() == []:
        continue
    new_map.push_back(consensus_feature)

Consensus_file = os.path.join(
    "results", "", "GNPSexport", "", "interim", "", 'filtered' + ".consensusXML"
)
ConsensusXMLFile().store(Consensus_file, new_map)

5. Convert the ConsensusXML file to a FeatureQuantificationTable.txt file compatible with GNPS

In [2]:
input_consensus= "results/GNPSexport/interim/filtered.consensusXML"
cmap = ConsensusMap()
ConsensusXMLFile().load(input_consensus, cmap)

# get intensities as a DataFrame
intensities = cmap.get_intensity_df()

# get meta data as DataFrame
meta_data = cmap.get_metadata_df()

# concatenate the two into one "result" DataFrame
result = pd.concat([meta_data, intensities], axis=1)

# replace the labeled index with a plain 0..n-1 index
result.reset_index(drop=True, inplace=True)

# Every row is tagged with the constant "CONSENSUS" in a leading '#CONSENSUS' column.
result.insert(loc=0, column='#CONSENSUS', value="CONSENSUS")
result= result.rename(columns= {"charge": "charge_cf", "RT": "rt_cf", "mz": "mz_cf", "quality": "quality_cf", "width": "width_cf"})
result= result.drop(columns=["sequence"])
result= result.sort_index(axis=1)

# Collect the per-file information stored in the consensus map's column headers.
filemeta= cmap.getColumnHeaders()
mapIDs = list(filemeta.keys())
filename = [header.filename for header in filemeta.values()]
size = [header.size for header in filemeta.values()]
label = [header.label for header in filemeta.values()]

# Table of the input files; the map id becomes the "intensity_<id>" column name.
# (Built from a dict literal directly so the builtin `dict` is not shadowed.)
DF= pd.DataFrame({'id': mapIDs, 'filename': filename,'label': label,'size': size})
DF["id"] = "intensity_"+ (DF["id"]).astype(str)

# Rename the columns that are named after an input file to "intensity_<id>".
# If the same file name occurs more than once, the first occurrence wins.
intensity_names = {}
for source_file, intensity_id in zip(filename, DF["id"]):
    intensity_names.setdefault(source_file, intensity_id)
result = result.rename(columns=intensity_names)

# Fixed leading columns first, then all remaining columns in their sorted order.
preordered = ["#CONSENSUS", "charge_cf", "rt_cf", "mz_cf", "quality_cf", "width_cf"]
new_cols = preordered + [c for c in result.columns if c not in preordered]
new_df = result.reindex(columns=new_cols)
new_df.to_csv('results/GNPSexport/FeatureQuantificationTable.txt', sep = '\t', index = False)
print(new_df)

     #CONSENSUS  charge_cf       rt_cf       mz_cf  quality_cf  width_cf  \
0     CONSENSUS          1  301.766906  393.223249    0.000445       NaN   
1     CONSENSUS          1  141.660936  379.207529    0.000307       NaN   
2     CONSENSUS          1   79.132486  254.161193    0.003703       NaN   
3     CONSENSUS          1  526.656072  460.269339    0.002016       NaN   
4     CONSENSUS          1  151.911141  328.139078    0.001595       NaN   
...         ...        ...         ...         ...         ...       ...   
7046  CONSENSUS          1  491.720961  858.608860    0.000010       NaN   
7047  CONSENSUS          1  210.843145  226.089659    0.000046       NaN   
7048  CONSENSUS          2  111.798745  301.168825    0.000030       NaN   
7049  CONSENSUS          1  144.365028  450.255977    0.000009       NaN   
7050  CONSENSUS          1  364.415148  380.218136    0.000024       NaN   

      intensity_0  intensity_1  intensity_2  intensity_3  ...  intensity_26  \
0       