# Set up metatlas shifter environment

In [1]:
# Execution-mode switch for this notebook.
#   False: import metatlas from a local checkout that is pushed onto sys.path.
#   True:  run under the "Metatlas Targeted" shifter kernel, installing its kernel
#          spec first when it is missing.
in_jupyterlab = False

# Imports needed by later cells are done once, before the mode branch, so the same
# names (os, np, pd, yaml, display, datetime, Path) exist whichever mode is chosen.
# Previously the JupyterLab branch never imported os or numpy.
import os
import sys
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import yaml
from IPython.display import display

if in_jupyterlab is False:
    # Make the local metatlas checkout importable, then load the annotation tools.
    sys.path.insert(1, '/global/homes/b/bkieft/metatlas')
    import notebooks.standards_library.standard_annotation as sta

    pd.options.display.max_colwidth = 300
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    
elif in_jupyterlab is True:

    class StopExecution(Exception):
        """Raised to halt the notebook after printing a CRITICAL message.

        `_render_traceback_` is a no-op; presumably this is the IPython hook that
        keeps a traceback from being rendered for this deliberate stop.
        """

        def _render_traceback_(self):
            pass

    # Kernel spec (JSON) that launches ipykernel inside the metatlas shifter image.
    kernel_def = """{"argv":["shifter","--entrypoint","--image=ghcr.io/biorack/metatlas/metatlas_shifter:latest","/usr/local/bin/python","-m",
                    "ipykernel_launcher","-f","{connection_file}"],"display_name": "Metatlas Targeted","language": "python",
                    "metadata": { "debugger": true }}"""
    kernel_file_name = Path.home() / ".local" / "share" / "jupyter" / "kernels" / "metatlas-targeted" / "kernel.json"
    try:
        has_root_kernel = Path("/root/.local/share/jupyter/kernels/papermill/kernel.json").is_file()
    except PermissionError:
        # /root is not readable for this user; treat the root kernel as absent.
        has_root_kernel = False
    if not has_root_kernel and not kernel_file_name.is_file():
        # First run for this user: write the kernel spec and ask for a re-run under it.
        kernel_file_name.parent.mkdir(parents=True, exist_ok=True)
        with kernel_file_name.open(mode="w", encoding="utf-8") as f:
            # kernel_def is a single string, so write() is the right call
            # (writelines() would iterate it character by character).
            f.write(kernel_def)
        print('CRITICAL: Notebook kernel has been installed. Set kernel to "Metatlas Targeted" and re-run notebook.')
        raise StopExecution
    try:
        from metatlas.tools import notebook  # noqa: E402
    except ImportError as err:
        print('CRITICAL: Set notebook kernel to "Metatlas Targeted" and re-run notebook.')
        raise StopExecution from err

    source_code_version_id = "1909f8f01e385e2a9636b6f17b4c094c5a61cbd5"
    notebook.setup("INFO", source_code_version_id)

    try:
        import notebooks.standards_library.standard_annotation as sta
    except ImportError as err:
        print('CRITICAL: Could not import standard annotation tools and notebook modules.')
        raise StopExecution from err

# Read config file and set notebook options

In [46]:
# YAML file that drives every later cell (cache flags, atlas and MSMS-ref options).
config_path = "/global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/scheller_annotation_input.yaml"
with open(config_path, "r") as config_file:
    config = yaml.safe_load(config_file.read())

# Wide columns so long file paths and InChI strings are not truncated in displays.
pd.set_option("display.max_colwidth", 300)

# Single run timestamp reused in the names of files saved by later cells.
timestamp = f"{datetime.now():%Y%m%d%H%M%S}"

# Extract EIC and Spectra information from files in the run table

In [3]:
if config["cache"]["full_data_from_cache"] is True:
    # Reuse a previously saved extraction (the "full" cache file).
    (
        eics_full,
        top_spectra_full,
        group_names_full,
        rt_peaks_full,
        atlas_full,
        mols_images,
    ) = sta.handle_data(mode="load", config=config, file_suffix="full")
elif config["cache"]["full_data_from_cache"] is False:
    # Build the run table, extract everything from the raw files, then cache it.
    lcmsruns_table_with_adducts = sta.build_standard_lcmsrun_table(config)
    (
        eics_full,
        top_spectra_full,
        group_names_full,
        rt_peaks_full,
        atlas_full,
        mols_images,
    ) = sta.extract_data(lcmsruns_table_with_adducts, config, method="find_peaks")
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="full",
        data=(eics_full, top_spectra_full, group_names_full, rt_peaks_full, atlas_full, mols_images),
    )

Loading most recent pkl file: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/cache/20250506190611_ref_stds_full.pkl


# Create interactive plot to choose adduct rt peaks for each standard compound

In [4]:
if config["cache"]["selected_data_from_cache"] is True:
    # Restore an earlier manual selection from the "selected" cache file.
    (
        selected_adducts_dict,
        ambiguous_adducts_dict,
        best_adducts_dict,
        processed_data,
        running_notes_dict,
    ) = sta.handle_data(mode="load", config=config, file_suffix="selected")

elif config["cache"]["selected_data_from_cache"] is False:
    processed_data = sta.process_data_for_plotting(
        eics_full, top_spectra_full, group_names_full, rt_peaks_full, config
    )
    # Four separate empty dicts handed to the GUI; the next cell saves them once the
    # manual selection is finished.
    selected_adducts_dict, ambiguous_adducts_dict, best_adducts_dict, running_notes_dict = {}, {}, {}, {}
    sta.create_interactive_plots(
        processed_data,
        mols_images,
        selected_adducts_dict,
        ambiguous_adducts_dict,
        best_adducts_dict,
        running_notes_dict,
    )
    # Run next cell after manual selection of adducts

Loading most recent pkl file: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/cache/20250507153815_ref_stds_selected.pkl


In [5]:
# Save the selections after GUI is completed (skipped when they came from the cache).
if config["cache"]["selected_data_from_cache"] is False:
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="selected",
        data=(
            selected_adducts_dict,
            ambiguous_adducts_dict,
            best_adducts_dict,
            processed_data,
            running_notes_dict,
        ),
    )

# Filter RT Peak, EICs, and Top Spectra by the selected compounds+adducts

In [6]:
if config["cache"]["filtered_data_from_cache"] is True:
    # Restore the filtered tables from the "filtered" cache file.
    (
        eics_all,
        rt_peaks_all,
        top_spectra_all,
        eics_best,
        rt_peaks_best,
        top_spectra_best,
        rt_peaks_all_formatted,
        rt_peaks_best_formatted,
    ) = sta.handle_data(mode="load", config=config, file_suffix="filtered")

elif config["cache"]["filtered_data_from_cache"] is False:
    # Produce "all" (every selected adduct) and "best" variants of each table.
    (
        eics_all,
        rt_peaks_all,
        top_spectra_all,
        eics_best,
        rt_peaks_best,
        top_spectra_best,
    ) = sta.filter_by_selected(
        eics_full, rt_peaks_full, top_spectra_full, selected_adducts_dict, best_adducts_dict
    )
    rt_peaks_all_formatted, rt_peaks_best_formatted = sta.format_rt_peaks(rt_peaks_all, rt_peaks_best)
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="filtered",
        data=(
            eics_all,
            rt_peaks_all,
            top_spectra_all,
            eics_best,
            rt_peaks_best,
            top_spectra_best,
            rt_peaks_all_formatted,
            rt_peaks_best_formatted,
        ),
    )

Loading most recent pkl file: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/cache/20250507160817_ref_stds_filtered.pkl


# Generate static summary reports for each compound and a combined summary document

In [7]:
# Per-compound summary plots, only when switched on in the config.
if config["analysis"]["generate_static_summary_pdfs"] is True:
    sta.generate_static_summary_plots(
        processed_data,
        selected_adducts_dict,
        best_adducts_dict,
        config,
    )

# Combined selection summary table built from the "best" RT peaks and running notes.
if config["analysis"]["generate_selection_summary_table"] is True:
    sta.generate_selection_summary_table(
        rt_peaks_best_formatted,
        running_notes_dict,
        config,
        timestamp,
    )

 Writing summary plots for selected compounds:   0%|          | 0/8 [00:00<?, ? compound group/s]

[09:07:46] non-ring atom 0 marked aromatic
[09:07:46] non-ring atom 0 marked aromatic


 Creating summary table:   0%|          | 0/6 [00:00<?, ? compound/s]

# Identify compounds not in the metatlas database Compounds table and store if necessary

In [8]:
if config["cache"]["metatlas_db_data_from_cache"] is True:
    # Restore the earlier DB comparison, then repeat the optional store and the check.
    in_db, notin_db, rt_peaks_all_formatted = sta.handle_data(
        mode="load", config=config, file_suffix="metatlas_db"
    )
    # Store selected compounds+adducts in metatlas db
    if len(notin_db) > 0 and config["compounds"]["direct_store_to_compounds_table"] is True:
        sta.store_in_metatlas_db(notin_db)
    sta.check_db_deposit(rt_peaks_all_formatted)

elif config["cache"]["metatlas_db_data_from_cache"] is False:
    # Check if selected compounds from ALL are in metatlas DB
    in_db, notin_db = sta.search_for_matches_in_metatlas_db(rt_peaks_all_formatted, check_by_flat=True)
    # Store selected compounds+adducts in metatlas db
    if len(notin_db) > 0 and config["compounds"]["direct_store_to_compounds_table"] is True:
        sta.store_in_metatlas_db(notin_db)
    sta.check_db_deposit(rt_peaks_all_formatted)
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="metatlas_db",
        data=(in_db, notin_db, rt_peaks_all_formatted),
    )

Loading most recent pkl file: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/cache/20250507160817_ref_stds_metatlas_db.pkl
Running double check for compounds in metatlas db Comopunds table...
	All new entries found in the database.



# Identify compounds+adducts not in atlases and set up new atlas creation

In [9]:
if config["cache"]["ema_atlas_data_from_cache"] is False:
    # Choose which formatted RT-peak table feeds the new EMA atlas.
    if config["atlases"]["new_ema_atlas_dtype"] == "all":
        rt_peaks_ema_input = rt_peaks_all_formatted
    elif config["atlases"]["new_ema_atlas_dtype"] == "best":
        rt_peaks_ema_input = rt_peaks_best_formatted
    else:
        # Fail here with a clear message instead of a NameError on rt_peaks_ema_input below.
        raise ValueError(
            "config['atlases']['new_ema_atlas_dtype'] must be 'all' or 'best', "
            f"got {config['atlases']['new_ema_atlas_dtype']!r}"
        )
    ema_atlases_data = sta.get_ema_atlas_data(config["atlases"]["current_ema_atlases"])
    rt_peaks_ema_input_formatted = sta.convert_rt_peaks_to_atlas_format(rt_peaks_ema_input)
    matches_to_atlases, nonmatches_to_atlases = sta.search_for_matches_in_atlases(rt_peaks_ema_input_formatted, ema_atlases_data)

    if config["cache"]["rt_correction_data_from_cache"] is False:
        print("Setting up RT correction for compounds not yet in atlases using baseline correction method:\n")
        baseline_qc, experimental_qc, baseline_to_experimental_qc = sta.get_qc_experimental_atlas(nonmatches_to_atlases, config["atlases"]["current_qc_atlases"], include_istds=True)
        baseline_correction_inputs = sta.create_baseline_correction_input(nonmatches_to_atlases, baseline_to_experimental_qc)
        baseline_correction_outputs = sta.rt_correction_from_baseline(baseline_correction_inputs, config)
        sta.handle_data(mode="save", config=config, timestamp=timestamp, file_suffix="rt_correction",
                        data=(baseline_to_experimental_qc, baseline_correction_outputs))

    elif config["cache"]["rt_correction_data_from_cache"] is True:
        baseline_to_experimental_qc, baseline_correction_outputs = sta.handle_data(mode="load", config=config, file_suffix="rt_correction")

    # Replace the RTs of the compounds missing from the atlases with their corrected
    # values, then cache everything the atlas-saving cell needs.
    nonmatches_to_atlases_rt_corrected = sta.substitute_corrected_rt_values(nonmatches_to_atlases, baseline_correction_outputs)
    sta.handle_data(mode="save", config=config, timestamp=timestamp, file_suffix="ema_atlases",
                    data=(nonmatches_to_atlases_rt_corrected, ema_atlases_data, baseline_to_experimental_qc, baseline_correction_outputs))

elif config["cache"]["ema_atlas_data_from_cache"] is True:
    nonmatches_to_atlases_rt_corrected, ema_atlases_data, baseline_to_experimental_qc, baseline_correction_outputs = sta.handle_data(mode="load", config=config, file_suffix="ema_atlases")
    # NOTE(review): the value printed is the number of distinct 'chromatography'
    # values, not a compound count - confirm the message says what is intended.
    print(f"Total compounds to add to EMA atlases per chromatography: {nonmatches_to_atlases_rt_corrected['chromatography'].nunique()}")

Loading most recent pkl file: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/cache/20250507162308_ref_stds_ema_atlases.pkl
Total compounds to add to EMA atlases per chromatography: 2


# Create new EMA atlas with top selected reference standards added

In [10]:
if config["atlases"]["save_new_ema_atlases"] is False:
    # Nothing is written; just show the would-be atlas rows.
    print("No new EMA atlases saved to disk, as 'save_new_ema_atlases' is set to False in the config file.")
    print("Here is the new atlas data in memory:")
    display(nonmatches_to_atlases_rt_corrected)

elif config["atlases"]["save_new_ema_atlases"] is True:
    ema_atlas_ids, ema_atlas_names = sta.update_and_save_ema_atlases(
        nonmatches_to_atlases_rt_corrected, ema_atlases_data, config, timestamp
    )
    # Cache the new atlas ids/names, then read them straight back from that cache.
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="new_atlas_ids",
        data=(ema_atlas_ids, ema_atlas_names),
    )
    ema_atlas_ids, ema_atlas_names = sta.handle_data(
        mode="load", config=config, file_suffix="new_atlas_ids"
    )

    if config["atlases"]["direct_deposit_new_emas"] is True:
        print("New EMA atlases have been saved to disk and deposited in the metatlas database:")
        display(pd.DataFrame.from_dict(ema_atlas_ids))
    print("\nNew EMA atlas locations:")
    display(pd.DataFrame.from_dict(ema_atlas_names))

No new EMA atlases saved to disk, as 'save_new_ema_atlases' is set to False in the config file.
Here is the new atlas data in memory:


Unnamed: 0,standard_lcmsrun,chromatography,compound_name,adduct,polarity,rt_peak,mz,smiles,peak_index,inchi,...,permanent_charge,formula,mono_isotopic_molecular_weight,collision_energy,label,rt_min,rt_max,mz_tolerance,mz_tolerance_units,in_metatlas
0,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_POS_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE102040norm-200uM-S1_Run375.h5,HILICZ,sorgoleone,[M+H]+,positive,0.645454,359.221685,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,0,C22H30O4,358.214409,102040norm,sorgoleone,0.145454,1.145454,5,ppm,True
1,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_POS_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE102040norm-200uM-S1_Run375.h5,HILICZ,sorgoleone,[M+NH4]+,positive,0.645454,376.248232,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,0,C22H30O4,358.214409,102040norm,sorgoleone,0.145454,1.145454,5,ppm,True
2,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_NEG_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE102040norm-200uM-S1_Run377.h5,HILICZ,sorgoleone,[M-H]-,negative,0.648787,357.207133,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,0,C22H30O4,358.214409,102040norm,sorgoleone,0.148787,1.148787,5,ppm,True
3,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE205060norm-200uM-S1_Run383.h5,C18,sorgoleone,[M+H]+,positive,7.439548,359.221685,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,0,C22H30O4,358.214409,205060norm,sorgoleone,6.939548,7.939548,5,ppm,True
4,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE205060norm-200uM-S1_Run383.h5,C18,sorgoleone,[M+Na]+,positive,7.439548,381.203627,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,0,C22H30O4,358.214409,205060norm,sorgoleone,6.939548,7.939548,5,ppm,True
5,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_NEG_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE102040norm-200uM-S1_Run384.h5,C18,sorgoleone,[M-H]-,negative,7.444448,357.207133,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,0,C22H30O4,358.214409,102040norm,sorgoleone,6.944448,7.944448,5,ppm,True


# Identify compounds not in MSMS refs and set up new MSMS refs creation

In [11]:
if config["cache"]["msms_refs_data_from_cache"] is False:
    # Choose matching RT-peak and top-spectra tables ("all" or "best" selections).
    if config["msms_refs"]["new_msms_refs_dtype"] == "all":
        rt_peaks_msms_input = rt_peaks_all_formatted
        top_spectra_msms_input = top_spectra_all
    elif config["msms_refs"]["new_msms_refs_dtype"] == "best":
        rt_peaks_msms_input = rt_peaks_best_formatted
        top_spectra_msms_input = top_spectra_best
    else:
        # Fail here with a clear message instead of a NameError on the inputs below.
        raise ValueError(
            "config['msms_refs']['new_msms_refs_dtype'] must be 'all' or 'best', "
            f"got {config['msms_refs']['new_msms_refs_dtype']!r}"
        )
    msms_refs = sta.get_msms_refs(msms_refs_path=config["msms_refs"]["current_msms_refs_path"])
    rt_peaks_msms_input_formatted = sta.format_for_msms_refs(rt_peaks_msms_input, top_spectra_msms_input, msms_refs, config)
    in_msms_refs, notin_msms_refs = sta.search_for_matches_in_msms_refs(rt_peaks_msms_input_formatted, msms_refs, check_by_flat=True)
    # Cache the current refs, the entries still missing from them, and the formatted input.
    sta.handle_data(mode="save", config=config, timestamp=timestamp, file_suffix="msms_refs",
                    data=(msms_refs, notin_msms_refs, rt_peaks_msms_input_formatted))
    
elif config["cache"]["msms_refs_data_from_cache"] is True:
    msms_refs, notin_msms_refs, rt_peaks_msms_input_formatted = sta.handle_data(mode="load", config=config, file_suffix="msms_refs")

Loading most recent pkl file: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/cache/20250507162308_ref_stds_msms_refs.pkl


# Create new MSMS refs table with all selected reference standards added

In [None]:
if config["msms_refs"]["save_new_msms_refs"] is False:
    # Nothing is written; just show the entries that would be added.
    print("No new MSMS refs saved to disk, as 'save_new_msms_refs' is set to False in the config file.")
    print("Here is the new MSMS refs data in memory:")
    display(notin_msms_refs)

elif config["msms_refs"]["save_new_msms_refs"] is True:
    sta.update_and_save_msms_refs(
        msms_refs,
        notin_msms_refs,
        config,
        timestamp,
    )

In [18]:
# Scratch check: list the columns of top_spectra_all (one of the inputs to the MGF writer defined below).
top_spectra_all.columns

Index(['label', 'spectrum', 'rt', 'precursor_mz', 'precursor_peak_height',
       'adduct', 'standard_lcmsrun', 'total_intensity',
       'total_intensity_fraction', 'peak_index', 'chromatography',
       'compound_name', 'polarity'],
      dtype='object')

In [19]:
# Scratch check: list the columns of rt_peaks_all (the table the spectra get merged onto).
rt_peaks_all.columns

Index(['standard_lcmsrun', 'chromatography', 'compound_name', 'adduct',
       'polarity', 'rt_peak', 'intensity', 'mz_observed', 'mz_theoretical',
       'ppm_error', 'smiles', 'peak_index', 'inchi', 'inchi_key',
       'neutralized_inchi', 'neutralized_inchi_key', 'permanent_charge',
       'formula', 'monoisotopic_mass', 'collision_energy'],
      dtype='object')

In [29]:
# Scratch check of the merge used by the MGF writer: attach each RT peak's spectrum
# fields by left-joining on compound, adduct, polarity, run file and peak index.
rt_peaks_and_top_spectra = pd.merge(
    rt_peaks_all,
    top_spectra_all[
        [
            'compound_name',
            'adduct',
            'polarity',
            'standard_lcmsrun',
            'peak_index',
            'rt',
            'precursor_mz',
            'precursor_peak_height',
            'spectrum',
        ]
    ],
    on=['compound_name', 'adduct', 'polarity', 'standard_lcmsrun', 'peak_index'],
    how='left',
)
rt_peaks_and_top_spectra.columns

Index(['standard_lcmsrun', 'chromatography', 'compound_name', 'adduct',
       'polarity', 'rt_peak', 'intensity', 'mz_observed', 'mz_theoretical',
       'ppm_error', 'smiles', 'peak_index', 'inchi', 'inchi_key',
       'neutralized_inchi', 'neutralized_inchi_key', 'permanent_charge',
       'formula', 'monoisotopic_mass', 'collision_energy', 'rt',
       'precursor_mz', 'precursor_peak_height', 'spectrum'],
      dtype='object')

In [32]:
# Scratch check: show the first 'standard_lcmsrun' value of the merged table (a full path to an .h5 run file in the output below).
rt_peaks_and_top_spectra['standard_lcmsrun'].head(1)

0    /global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_NEG_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE102040norm-200uM-S1_Run384.h5
Name: standard_lcmsrun, dtype: object

In [20]:
# Scratch check: show the MSMS-refs metadata from the config ('msms_refs_prefix' is read by the MGF writer in standalone mode).
config['msms_refs']['msms_refs_metadata']

{'ce_type': 'ramp',
 'msms_refs_prefix': 'metascischeller',
 'frag_method': 'HCD',
 'instrument_type': 'Orbitrap',
 'decimal': 4.0}

In [None]:
import os
import shutil
import subprocess
from typing import Any, Dict, Optional

import pandas as pd
from pyteomics import mgf

def get_mgf_last_index(mgf_refs_path: str) -> Optional[str]:
    """
    Return the SPECTRUMID of the last spectrum in an MGF file.

    The file is opened with pyteomics' IndexedMGF reader, so only the last
    entry is fetched through the index.

    Args:
        mgf_refs_path (str): Path to the MGF file.

    Returns:
        Optional[str]: The 'spectrumid' parameter of the last spectrum, or None
        when the file is missing, cannot be read, or the last spectrum has no
        SPECTRUMID. A message is printed in each of those cases, so callers
        must check for None before using the result.
    """
    try:
        # The context manager closes the underlying file once the lookup is done.
        with mgf.IndexedMGF(mgf_refs_path, index_by_scans=True) as indexed_mgf:
            last_spectrum = indexed_mgf[-1]

        # Named spectrum_id (not `id`) to avoid shadowing the builtin.
        if 'spectrumid' in last_spectrum['params']:
            spectrum_id = last_spectrum['params']['spectrumid']
            return spectrum_id
        print("Warning: SPECTRUMID parameter not found in the last spectrum. Check MGF file/path/format.")

    except FileNotFoundError:
        print(f"Error: File not found at {mgf_refs_path}")
    except Exception as e:
        # Best-effort helper: report any other read/parse problem and return None.
        print(f"An error occurred: {e}")

    return None

def write_mgf_from_top_spectra(top_spectra_all: pd.DataFrame, 
                               rt_peaks_all: pd.DataFrame, 
                               config: Dict[str, Any],
                               output_timestamp: Optional[str] = None
) -> None:
    """
    Write an MGF file from the spectra data in top_spectra_all.

    Each row of rt_peaks_all is left-joined to its spectrum in top_spectra_all
    (on compound name, adduct, polarity, run file and peak index) and turned
    into one MGF entry with a sequential SPECTRUMID:

    * standalone mode (config['msms_refs']['standalone_mgf'] is True): IDs start
      at 1 and use the upper-cased 'msms_refs_prefix' as prefix.
    * update mode (False): IDs continue after the last "CCMSLIB..." ID found in
      config['msms_refs']['current_mgf_path'], and the written file ends up as
      that existing file followed by the new entries.

    Rows that have no usable spectrum after the merge are skipped with a warning.

    Args:
        top_spectra_all (pd.DataFrame): DataFrame containing spectra data.
        rt_peaks_all (pd.DataFrame): DataFrame containing retention time peak data.
        config (Dict[str, Any]): Configuration dictionary containing metadata for the project, including output path.
        output_timestamp (Optional[str]): Timestamp used in the output file name.
            When omitted, the notebook-level `timestamp` variable is used, as before.

    Returns:
        None

    Raises:
        ValueError: If 'standalone_mgf' is not a bool, or if in update mode the
            existing MGF file does not yield a last SPECTRUMID containing "CCMSLIB".
    """
    merge_keys = ['compound_name', 'adduct', 'polarity', 'standard_lcmsrun', 'peak_index']
    spectra_columns = merge_keys + ['rt', 'precursor_mz', 'precursor_peak_height', 'spectrum']
    rt_peaks_and_top_spectra = rt_peaks_all.merge(top_spectra_all[spectra_columns], on=merge_keys, how='left')

    msms_config = config['msms_refs']
    standalone = msms_config['standalone_mgf']

    # Work out the ID prefix and the number the first new entry continues from.
    if standalone is True:
        id_prefix = msms_config['msms_refs_metadata']['msms_refs_prefix'].upper()
        current_id = 0
    elif standalone is False:
        existing_mgf_path = msms_config['current_mgf_path']
        starting_mgf_id = get_mgf_last_index(existing_mgf_path)
        print(f"Last ID from existing file to increment: {starting_mgf_id}")
        # get_mgf_last_index returns None on failure; without this check the
        # counter would be undefined (or a TypeError raised) further down.
        if not isinstance(starting_mgf_id, str) or "CCMSLIB" not in starting_mgf_id:
            raise ValueError(
                f"Cannot continue SPECTRUMID numbering from {existing_mgf_path}: "
                f"expected a last SPECTRUMID containing 'CCMSLIB', got {starting_mgf_id!r}"
            )
        id_prefix = "CCMSLIB"
        current_id = int(starting_mgf_id.split("CCMSLIB")[1])
    else:
        raise ValueError(
            f"config['msms_refs']['standalone_mgf'] must be True or False, got {standalone!r}"
        )

    mgf_entries = []
    skipped_rows = 0
    for _, row in rt_peaks_and_top_spectra.iterrows():
        # Extract m/z and intensity values; a peak with no matching spectrum
        # carries NaN here after the left merge and cannot be unpacked.
        try:
            mz_values, intensity_values = row['spectrum']
        except (TypeError, ValueError):
            skipped_rows += 1
            continue

        # Increment the SPECTRUMID for each written entry
        current_id += 1
        new_spectrum_id = f"{id_prefix}{current_id:011d}"

        # Accept both "POS"/"NEG" and "positive"/"negative" style polarity values.
        is_positive = str(row['polarity']).strip().lower().startswith("pos")

        # Create an MGF entry
        mgf_entry = {
            'm/z array': mz_values,
            'intensity array': intensity_values,
            'params': {
                'TITLE': f"MS/MS scan for {row['compound_name']} at {round(row['rt_peak'], 3)} min with intensity {round(row['intensity'], 3)}",
                'PEPMASS': round(row['precursor_mz'], 3),
                'CHARGE': 1,
                'MSLEVEL': 2,
                'SOURCE_INSTRUMENT': 'LC-ESI-Orbitrap',
                'FILENAME': row['standard_lcmsrun'],
                'SEQ': "*..*",
                'IONMODE': "Positive" if is_positive else "Negative",
                'ORGANISM': 'BERKELEY-LAB',
                'NAME': f"{row['compound_name']} {row['collision_energy']} {row['adduct']}",
                'PI': 'Trent Northen',
                'DATACOLLECTOR': 'JGI',
                'SMILES': row['smiles'],
                'INCHI': row['inchi'],
                'FORMULA': row['formula'],
                'INCHIAUX': "N/A",
                'PUBMED': "N/A",
                'SUBMITUSER': 'bkieft',
                'LIBRARYQUALITY': 3,
                'SPECTRUMID': new_spectrum_id
            }
        }

        mgf_entries.append(mgf_entry)

    if skipped_rows:
        print(f"Warning: skipped {skipped_rows} row(s) with no usable spectrum after merging.")

    # Write the MGF file
    file_timestamp = output_timestamp if output_timestamp is not None else timestamp
    updated_refs_dir = f"{config['project']['standards_output_path']}/updated_MSMS_refs"
    standalone_tag = "_standalone" if standalone else ""
    os.makedirs(updated_refs_dir, exist_ok=True)
    fname = f"{updated_refs_dir}/berkeley_lab_refs_{file_timestamp}{standalone_tag}.mgf"
    # Open in write mode explicitly so a re-run with the same timestamp replaces
    # the file instead of possibly appending duplicate entries to it.
    mgf.write(mgf_entries, fname, file_mode='w')

    if standalone:
        print(f"Standalone MGF file created: {fname}")
    else:
        # Existing refs + blank line + new entries, assembled in a temp file and then
        # swapped into place. Done in Python rather than through a shell so paths
        # containing spaces or shell metacharacters are handled safely.
        temp_fname = f"{fname}.tmp"
        with open(temp_fname, "wb") as combined:
            with open(existing_mgf_path, "rb") as existing_refs:
                shutil.copyfileobj(existing_refs, combined)
            combined.write(b"\n")
            with open(fname, "rb") as new_refs:
                shutil.copyfileobj(new_refs, combined)
        os.replace(temp_fname, fname)
        print(f"Updated MGF file created: {fname}")

In [48]:
# Build the MGF file from all selected spectra; standalone vs. update mode is read from config['msms_refs']['standalone_mgf'].
write_mgf_from_top_spectra(top_spectra_all, rt_peaks_all, config)

Starting MGF ID: CCMSLIB00010126999
Updated MGF file created: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/updated_MSMS_refs/berkeley_lab_refs_20250508101852.mgf
