# Set up metatlas shifter environment

In [None]:
in_jupyterlab = True

# Imports needed by both execution modes.
import sys
from datetime import datetime

import pandas as pd
import yaml
from IPython.display import display

if in_jupyterlab is True:
    from pathlib import Path

    class StopExecution(Exception):
        """Exception used to stop the cell; its `_render_traceback_` intentionally does nothing."""

        def _render_traceback_(self):
            pass

    # Kernel spec that launches ipykernel through shifter with the metatlas image.
    kernel_def = """{"argv":["shifter","--entrypoint","--image=ghcr.io/biorack/metatlas/metatlas_shifter:latest","/usr/local/bin/python","-m",
                    "ipykernel_launcher","-f","{connection_file}"],"display_name": "Metatlas Targeted","language": "python",
                    "metadata": { "debugger": true }}"""
    kernel_file_name = Path.home().joinpath(
        ".local", "share", "jupyter", "kernels", "metatlas-targeted", "kernel.json"
    )

    # A papermill kernel under /root counts as "already installed"; an unreadable
    # /root is treated the same as the file being absent.
    root_kernel_file = Path("/root/.local/share/jupyter/kernels/papermill/kernel.json")
    try:
        has_root_kernel = root_kernel_file.is_file()
    except PermissionError:
        has_root_kernel = False

    # Install the user-level kernel spec once, then stop so the user can switch kernels.
    if not (has_root_kernel or kernel_file_name.is_file()):
        kernel_file_name.parent.mkdir(parents=True, exist_ok=True)
        kernel_file_name.write_text(kernel_def, encoding="utf-8")
        print('CRITICAL: Notebook kernel has been installed. Set kernel to "Metatlas Targeted" and re-run notebook.')
        raise StopExecution

    # metatlas is only importable when the notebook runs on the expected kernel.
    try:
        from metatlas.tools import notebook  # noqa: E402
    except ImportError as err:
        print('CRITICAL: Set notebook kernel to "Metatlas Targeted" and re-run notebook.')
        raise StopExecution from err

    source_code_version_id = "1909f8f01e385e2a9636b6f17b4c094c5a61cbd5"
    notebook.setup("INFO", source_code_version_id)

    try:
        import notebooks.standards_library.standard_annotation as sta
    except ImportError as err:
        print('CRITICAL: Could not import standard annotation tools and notebook modules.')
        raise StopExecution from err

elif in_jupyterlab is False:
    import os

    import numpy as np

    # Make the local metatlas checkout importable before loading the annotation tools.
    sys.path.insert(1, '/global/homes/b/bkieft/metatlas')
    import notebooks.standards_library.standard_annotation as sta

    pd.options.display.max_colwidth = 300
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

# Read config file and set notebook options

In [2]:
# YAML file whose values are read by every later cell through `config`.
config_path = "/global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/scheller_annotation_input.yaml"
with open(config_path, mode="r") as config_file:
    config = yaml.safe_load(config_file)

# Widen displayed table cells so long values are not truncated at the default width.
pd.set_option("display.max_colwidth", 300)

# Run timestamp (YYYYmmddHHMMSS) passed to the save calls in later cells.
timestamp = f"{datetime.now():%Y%m%d%H%M%S}"

# Extract EIC and Spectra information from files in the run table

In [3]:
# Either reload the "full" dataset through handle_data, or build it from the
# run table and save it with the current timestamp.
if config["cache"]["full_data_from_cache"] is True:
    (
        eics_full,
        top_spectra_full,
        group_names_full,
        rt_peaks_full,
        atlas_full,
        mols_images,
    ) = sta.handle_data(mode="load", config=config, file_suffix="full")
elif config["cache"]["full_data_from_cache"] is False:
    lcmsruns_table_with_adducts = sta.build_standard_lcmsrun_table(config)
    (
        eics_full,
        top_spectra_full,
        group_names_full,
        rt_peaks_full,
        atlas_full,
        mols_images,
    ) = sta.extract_data(lcmsruns_table_with_adducts, config, method="find_peaks")
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="full",
        data=(eics_full, top_spectra_full, group_names_full, rt_peaks_full, atlas_full, mols_images),
    )

Loading most recent pkl file: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/cache/20250506190611_ref_stds_full.pkl


# Create interactive plot to choose adduct RT peaks for each standard compound

In [4]:
# Either reload earlier selections, or open the interactive plots with empty
# containers that are passed to the plotting call.
if config["cache"]["selected_data_from_cache"] is True:
    (
        selected_adducts_dict,
        ambiguous_adducts_dict,
        best_adducts_dict,
        processed_data,
        running_notes_dict,
    ) = sta.handle_data(mode="load", config=config, file_suffix="selected")

elif config["cache"]["selected_data_from_cache"] is False:
    processed_data = sta.process_data_for_plotting(
        eics_full, top_spectra_full, group_names_full, rt_peaks_full, config
    )
    # Four independent, initially empty dictionaries.
    selected_adducts_dict, ambiguous_adducts_dict, best_adducts_dict, running_notes_dict = {}, {}, {}, {}
    sta.create_interactive_plots(
        processed_data,
        mols_images,
        selected_adducts_dict,
        ambiguous_adducts_dict,
        best_adducts_dict,
        running_notes_dict,
    )
    # Run next cell after manual selection of adducts

Loading most recent pkl file: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/cache/20250508121721_ref_stds_selected.pkl


In [5]:
# Save the selections after GUI is completed; skipped when they were loaded from cache.
if config["cache"]["selected_data_from_cache"] is False:
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="selected",
        data=(
            selected_adducts_dict,
            ambiguous_adducts_dict,
            best_adducts_dict,
            processed_data,
            running_notes_dict,
        ),
    )

# Filter RT Peak, EICs, and Top Spectra by the selected compounds+adducts

In [6]:
# Either reload the filtered datasets, or filter the full data by the chosen
# adducts, format the RT peaks, and save all four results.
if config["cache"]["filtered_data_from_cache"] is True:
    (
        all_data,
        best_data,
        all_rt_peaks_formatted,
        best_rt_peaks_formatted,
    ) = sta.handle_data(mode="load", config=config, file_suffix="filtered")

elif config["cache"]["filtered_data_from_cache"] is False:
    all_data, best_data = sta.filter_by_selected(
        eics_full, rt_peaks_full, top_spectra_full, selected_adducts_dict, best_adducts_dict
    )
    all_rt_peaks_formatted, best_rt_peaks_formatted = sta.format_rt_peaks(
        all_data['rt_peaks'], best_data['rt_peaks']
    )
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="filtered",
        data=(all_data, best_data, all_rt_peaks_formatted, best_rt_peaks_formatted),
    )

[12:31:03] non-ring atom 0 marked aromatic
[12:31:03] non-ring atom 0 marked aromatic



All unique compounds selected: 1
All unique compound+adduct entries selected: 4
All unique compound+adduct+peak entries selected: 12
All unique compound+adduct+peak+spectra entries selected: 12

Best unique compounds selected: 1
Best unique compound+adduct entries selected: 2
Best unique compound+adduct+peak entries selected: 8
Best unique compound+adduct+peak+spectra entries selected: 8

Formatting all dataset
	Checking for differing RTs between CEs and polarities, which are unexpected...
		Group ('C18', 'sorgoleone'): All RT values for ['102040norm' '205060norm'] and ['NEG' 'POS'] are within 0.05 mins of each other (0.0057).
		Group ('HILICZ', 'sorgoleone'): All RT values for ['102040norm' '205060norm'] and ['NEG' 'POS'] are within 0.05 mins of each other (0.0107).

	Checking monoisotopic mass to identify isomers in the datasets...
		No isomers found in all data.


Formatting best dataset
	Checking for differing RTs between CEs and polarities, which are unexpected...
		Group ('C18',

# Generate static summary reports for each compound and a combined summary document

In [7]:
# Static summary plots, produced only when switched on in the config.
if config["analysis"]["generate_static_summary_pdfs"] is True:
    sta.generate_static_summary_plots(
        processed_data,
        selected_adducts_dict,
        best_adducts_dict,
        config,
    )

# Selection summary table built from the "best" RT peaks and the running notes.
if config["analysis"]["generate_selection_summary_table"] is True:
    sta.generate_selection_summary_table(
        best_rt_peaks_formatted,
        running_notes_dict,
        config,
        timestamp,
    )

 Writing summary plots for selected compounds:   0%|          | 0/8 [00:00<?, ? compound group/s]

 Creating summary table: 0 compound [00:00, ? compound/s]

# Identify compounds not in the metatlas database Compounds table and store if necessary

In [10]:
metatlas_db_from_cache = config["cache"]['metatlas_db_data_from_cache']

# Step 1: obtain the in-db / not-in-db split, from cache or from a fresh search.
if metatlas_db_from_cache is True:
    in_db, notin_db, all_rt_peaks_formatted = sta.handle_data(
        mode="load", config=config, file_suffix="metatlas_db"
    )
elif metatlas_db_from_cache is False:
    # Check if selected compounds from ALL are in metatlas DB
    in_db, notin_db = sta.search_for_matches_in_metatlas_db(all_rt_peaks_formatted, check_by_flat=True)

# Step 2: the store/check sequence is the same for both sources. Nothing runs
# when the flag is not a real boolean, matching the if/elif above.
if isinstance(metatlas_db_from_cache, bool):
    # Store selected compounds+adducts in metatlas db
    if len(notin_db) > 0 and config['compounds']['direct_store_to_compounds_table'] is True:
        sta.store_in_metatlas_db(notin_db)
    sta.check_db_deposit(all_rt_peaks_formatted)

    # Only freshly computed results are written back to the cache.
    if metatlas_db_from_cache is False:
        sta.handle_data(
            mode="save",
            config=config,
            timestamp=timestamp,
            file_suffix="metatlas_db",
            data=(in_db, notin_db, all_rt_peaks_formatted),
        )

 Searching for matches in metatlas db:   0%|          | 0/12 [00:00<?, ? compound/s]


Summary of compounds already in the metatlas database:



Unnamed: 0,query_label,query_matching_criterion,query_to_db,db_match
0,sorgoleone,inchi_key,FGWRUVXUQWGLOX-AFJQJTPPSA-N,[FGWRUVXUQWGLOX-AFJQJTPPSA-N]



All compounds are already in the metatlas database.

Running double check for compounds in metatlas db Comopunds table...
	All new entries found in the database.

Saving data to: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/cache/20250508123102_ref_stds_metatlas_db.pkl


# Identify compounds+adducts not in atlases and set up new atlas creation

In [11]:
# Find compounds+adducts missing from the current EMA atlases and RT-correct them,
# or reload the previously saved result when the cache flag is set.
if config["cache"]["ema_atlas_data_from_cache"] is False:
    # Choose which formatted RT peak table feeds the atlas search. An unexpected
    # value is rejected here instead of surfacing later as a NameError.
    new_ema_atlas_dtype = config["atlases"]["new_ema_atlas_dtype"]
    if new_ema_atlas_dtype == "all":
        rt_peaks_ema_input = all_rt_peaks_formatted
    elif new_ema_atlas_dtype == "best":
        rt_peaks_ema_input = best_rt_peaks_formatted
    else:
        raise ValueError(
            f"config['atlases']['new_ema_atlas_dtype'] must be 'all' or 'best', got {new_ema_atlas_dtype!r}."
        )
    ema_atlases_data = sta.get_ema_atlas_data(config["atlases"]["current_ema_atlases"])
    rt_peaks_ema_input_formatted = sta.convert_rt_peaks_to_atlas_format(rt_peaks_ema_input)
    matches_to_atlases, nonmatches_to_atlases = sta.search_for_matches_in_atlases(rt_peaks_ema_input_formatted, ema_atlases_data)

    # The RT correction has its own cache flag, nested inside the atlas branch.
    rt_correction_from_cache = config["cache"]["rt_correction_data_from_cache"]
    if rt_correction_from_cache is False:
        print("Setting up RT correction for compounds not yet in atlases using baseline correction method:\n")
        baseline_qc, experimental_qc, baseline_to_experimental_qc = sta.get_qc_experimental_atlas(nonmatches_to_atlases, config["atlases"]["current_qc_atlases"], include_istds=True)
        baseline_correction_inputs = sta.create_baseline_correction_input(nonmatches_to_atlases, baseline_to_experimental_qc)
        baseline_correction_outputs = sta.rt_correction_from_baseline(baseline_correction_inputs, config)
        sta.handle_data(mode="save", config=config, timestamp=timestamp, file_suffix="rt_correction",
                        data=(baseline_to_experimental_qc, baseline_correction_outputs))

    elif rt_correction_from_cache is True:
        baseline_to_experimental_qc, baseline_correction_outputs = sta.handle_data(mode="load", config=config, file_suffix="rt_correction")

    else:
        # Without this guard, baseline_correction_outputs would be unbound below.
        raise ValueError(
            f"config['cache']['rt_correction_data_from_cache'] must be true or false, got {rt_correction_from_cache!r}."
        )

    nonmatches_to_atlases_rt_corrected = sta.substitute_corrected_rt_values(nonmatches_to_atlases, baseline_correction_outputs)
    sta.handle_data(mode="save", config=config, timestamp=timestamp, file_suffix="ema_atlases",
                    data=(nonmatches_to_atlases_rt_corrected, ema_atlases_data, baseline_to_experimental_qc, baseline_correction_outputs))

elif config["cache"]["ema_atlas_data_from_cache"] is True:
    nonmatches_to_atlases_rt_corrected, ema_atlases_data, baseline_to_experimental_qc, baseline_correction_outputs = sta.handle_data(mode="load", config=config, file_suffix="ema_atlases")
    # NOTE(review): the value printed is the number of unique 'chromatography' values,
    # which may not match what the message wording suggests - confirm intent.
    print(f"Total compounds to add to EMA atlases per chromatography: {nonmatches_to_atlases_rt_corrected['chromatography'].nunique()}")

 Searching in HILICZ positive atlas:   0%|          | 0/4 [00:00<?, ? compound/s]

 Searching in HILICZ negative atlas:   0%|          | 0/2 [00:00<?, ? compound/s]

 Searching in C18 positive atlas:   0%|          | 0/4 [00:00<?, ? compound/s]

 Searching in C18 negative atlas:   0%|          | 0/2 [00:00<?, ? compound/s]


None of the compounds+adducts searched were found in the atlases.

There are 6 compounds+adducts are not yet in any atlases. View with 'nonmatches_to_atlases'.

Setting up RT correction for compounds not yet in atlases using baseline correction method:

	Getting all QC files for project /global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558

	Retrieving baseline HILICZ QC atlas: /global/homes/b/bkieft/metatlas-data/HILIC/HILIC_QCv7_positive.tsv

	Collecting QC MS1 data for HILICZ...



 Collecting MS1 data for QC compounds:   0%|          | 0/52 [00:00<?, ? file/s]

	Getting all QC files for project /global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782

	Retrieving baseline C18 QC atlas: /global/homes/b/bkieft/metatlas-data/C18/C18_QCv7_positive.tsv

	Collecting QC MS1 data for C18...



 Collecting MS1 data for QC compounds:   0%|          | 0/53 [00:00<?, ? file/s]

	Performing RT correction...



Calculating RT correction model:   0%|          | 0/2 [00:00<?, ? chromatography/s]

	HILICZ RT correction results:


Unnamed: 0,label,adduct,polarity,rt_peak_baseline,rt_peak_experimental,rt_peak_corrected,rt_min_corrected,rt_max_corrected,rt_diff_experimental_vs_corrected
0,sorgoleone,[M+H]+,positive,,0.801184,0.645454,0.145454,1.145454,0.15573
1,sorgoleone,[M+NH4]+,positive,,0.801184,0.645454,0.145454,1.145454,0.15573
2,sorgoleone,[M-H]-,negative,,0.804263,0.648787,0.148787,1.148787,0.155476
3,ABMBA (unlabeled),,QC,1.093806,1.21834,1.096205,0.596205,1.596205,0.122135
4,N-acetyl-glucosamine (U - 13C),,QC,6.707815,6.418233,6.580829,6.080829,7.080829,-0.162596
5,adenine (U - 15N),,QC,2.677602,2.834389,2.827321,2.327321,3.327321,0.007068
6,"alanine (U - 13C, 15N)",,QC,13.405091,13.240681,13.400469,12.900469,13.900469,-0.159788
7,"arginine (U - 13C, 15N)",,QC,16.939915,17.013107,16.987934,16.487934,17.487934,0.025173
8,"asparagine (U - 13C, 15N)",,QC,14.368089,14.252848,14.37583,13.87583,14.87583,-0.122982
9,"aspartic acid (U - 13C, 15N)",,QC,16.13036,16.043652,16.07848,15.57848,16.57848,-0.034828


	C18 RT correction results:


Unnamed: 0,label,adduct,polarity,rt_peak_baseline,rt_peak_experimental,rt_peak_corrected,rt_min_corrected,rt_max_corrected,rt_diff_experimental_vs_corrected
0,sorgoleone,[M+H]+,positive,,7.544277,7.439548,6.939548,7.939548,0.104729
1,sorgoleone,[M+Na]+,positive,,7.544277,7.439548,6.939548,7.939548,0.104729
2,sorgoleone,[M-H]-,negative,,7.549027,7.444448,6.944448,7.944448,0.104579
3,9-cis-retinoic acid,,QC,7.372778,7.322109,7.210608,6.710608,7.710608,0.111501
4,ABMBA (unlabeled),,QC,4.7,4.892107,4.73861,4.23861,5.23861,0.153496
5,caffeine,,QC,2.652537,2.833037,2.689957,2.189957,3.189957,0.14308
6,enoxolone,,QC,6.605928,6.792058,6.666384,6.166384,7.166384,0.125674
7,inosine (U - 15N),,QC,1.02,1.13626,1.03348,0.53348,1.53348,0.10278
8,nigericin,,QC,8.559813,8.749709,8.690287,8.190287,9.190287,0.059422
9,"phenylalanine (U - 13C, 15N)",,QC,1.452954,1.481643,1.368337,0.868337,1.868337,0.113306


Saving data to: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/cache/20250508123102_ref_stds_rt_correction.pkl
Formatted 6 RT-corrected compounds for insertion into HILICZ atlases.
Formatted 6 RT-corrected compounds for insertion into C18 atlases.
Saving data to: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/cache/20250508123102_ref_stds_ema_atlases.pkl


# Create new EMA atlas with top selected reference standards added

In [12]:
save_new_ema_atlases = config['atlases']['save_new_ema_atlases']

if save_new_ema_atlases is False:
    # Nothing is written; show the in-memory table instead.
    print("No new EMA atlases saved to disk, as 'save_new_ema_atlases' is set to False in the config file.")
    print("Here is the new atlas data in memory:")
    display(nonmatches_to_atlases_rt_corrected)

elif save_new_ema_atlases is True:
    ema_atlas_ids, ema_atlas_names = sta.update_and_save_ema_atlases(
        nonmatches_to_atlases_rt_corrected, ema_atlases_data, config, timestamp
    )
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="new_atlas_ids",
        data=(ema_atlas_ids, ema_atlas_names),
    )
    # Immediately reload the saved pair through handle_data's load mode.
    ema_atlas_ids, ema_atlas_names = sta.handle_data(
        mode="load", config=config, file_suffix="new_atlas_ids"
    )

    # The atlas IDs are shown only when direct deposit is enabled.
    if config['atlases']['direct_deposit_new_emas'] is True:
        print("New EMA atlases have been saved to disk and deposited in the metatlas database:")
        atlas_ids_table = pd.DataFrame.from_dict(ema_atlas_ids)
        display(atlas_ids_table)
    print(f"\nNew EMA atlas locations:")
    atlas_names_table = pd.DataFrame.from_dict(ema_atlas_names)
    display(atlas_names_table)

No new EMA atlases saved to disk, as 'save_new_ema_atlases' is set to False in the config file.
Here is the new atlas data in memory:


Unnamed: 0,standard_lcmsrun,chromatography,compound_name,adduct,polarity,rt_peak,mz,smiles,peak_index,inchi,...,formula,mono_isotopic_molecular_weight,collision_energy,label,best,rt_min,rt_max,mz_tolerance,mz_tolerance_units,in_metatlas
0,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_POS_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE102040norm-200uM-S1_Run375.h5,HILICZ,sorgoleone,[M+H]+,positive,0.645454,359.221685,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,C22H30O4,358.214409,102040norm,sorgoleone,Yes,0.145454,1.145454,5,ppm,True
1,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_POS_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE102040norm-200uM-S1_Run375.h5,HILICZ,sorgoleone,[M+NH4]+,positive,0.645454,376.248232,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,C22H30O4,358.214409,102040norm,sorgoleone,No,0.145454,1.145454,5,ppm,True
2,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_NEG_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE102040norm-200uM-S1_Run377.h5,HILICZ,sorgoleone,[M-H]-,negative,0.648787,357.207133,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,C22H30O4,358.214409,102040norm,sorgoleone,Yes,0.148787,1.148787,5,ppm,True
3,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE205060norm-200uM-S1_Run383.h5,C18,sorgoleone,[M+H]+,positive,7.439548,359.221685,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,C22H30O4,358.214409,205060norm,sorgoleone,No,6.939548,7.939548,5,ppm,True
4,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE205060norm-200uM-S1_Run383.h5,C18,sorgoleone,[M+Na]+,positive,7.439548,381.203627,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,C22H30O4,358.214409,205060norm,sorgoleone,No,6.939548,7.939548,5,ppm,True
5,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_NEG_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE102040norm-200uM-S1_Run384.h5,C18,sorgoleone,[M-H]-,negative,7.444448,357.207133,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,C22H30O4,358.214409,102040norm,sorgoleone,No,6.944448,7.944448,5,ppm,True


# Identify compounds not in MSMS refs and set up new MSMS refs creation

In [13]:
# Choose the RT peak table and matching top spectra for MSMS refs.
# This runs regardless of the cache flag because the MGF-writing cell reads
# rt_peaks_msms_input and top_spectra_msms_input; binding them only in the
# non-cache branch left them undefined on a fresh kernel whenever the MSMS refs
# data was loaded from cache.
new_msms_refs_dtype = config["msms_refs"]["new_msms_refs_dtype"]
if new_msms_refs_dtype == "all":
    rt_peaks_msms_input = all_rt_peaks_formatted
    top_spectra_msms_input = all_data['top_spectra']
elif new_msms_refs_dtype == "best":
    rt_peaks_msms_input = best_rt_peaks_formatted
    top_spectra_msms_input = best_data['top_spectra']
else:
    # Reject unknown values here instead of failing later with a NameError.
    raise ValueError(
        f"config['msms_refs']['new_msms_refs_dtype'] must be 'all' or 'best', got {new_msms_refs_dtype!r}."
    )

# Search the current MSMS refs for the selected entries and save the result,
# or reload the previously saved result when the cache flag is set.
if config["cache"]["msms_refs_data_from_cache"] is False:
    msms_refs = sta.get_msms_refs(msms_refs_path=config["msms_refs"]["current_msms_refs_path"])
    rt_peaks_msms_input_formatted = sta.format_for_msms_refs(rt_peaks_msms_input, top_spectra_msms_input, msms_refs, config)
    in_msms_refs, notin_msms_refs = sta.search_for_matches_in_msms_refs(rt_peaks_msms_input_formatted, msms_refs, check_by_flat=True)
    sta.handle_data(mode="save", config=config, timestamp=timestamp, file_suffix="msms_refs",
                    data=(msms_refs, notin_msms_refs, rt_peaks_msms_input_formatted))

elif config["cache"]["msms_refs_data_from_cache"] is True:
    msms_refs, notin_msms_refs, rt_peaks_msms_input_formatted = sta.handle_data(mode="load", config=config, file_suffix="msms_refs")

Loaded MSMS refs with 216409 rows and 17 columns.


 Searching for matches in MSMS refs:   0%|          | 0/12 [00:00<?, ? compound/s]


12 compounds+adducts are not yet in MSMS refs. Check notin_msms_refs to view.

Saving data to: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/cache/20250508123102_ref_stds_msms_refs.pkl


# Create new MSMS refs table from selected reference standards

In [None]:
save_new_msms_refs = config["msms_refs"]['save_new_msms_refs']

if save_new_msms_refs is False:
    print("No new MSMS refs saved to disk, as 'save_new_msms_refs' is set to False in the config file.")

elif save_new_msms_refs is True:
    # Pass the current refs and the entries not yet in them to the save helper.
    sta.update_and_save_msms_refs(
        msms_refs,
        notin_msms_refs,
        config,
        timestamp,
    )

# Create new MSMS refs MGF file from selected reference standards

In [None]:
save_new_mgf = config["msms_refs"]['save_new_mgf']

if save_new_mgf is False:
    print("No new MGF refs saved to disk, as 'save_new_mgf' is set to False in the config file.")

elif save_new_mgf is True:
    # Write an MGF file from the chosen top spectra and their RT peak table.
    sta.write_mgf_from_top_spectra(
        top_spectra_msms_input,
        rt_peaks_msms_input,
        config,
        timestamp,
    )

Last ID from existing file to increment: CCMSLIB00010126999
Updated MGF file created: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/updated_MSMS_refs/berkeley_lab_refs_20250508101852.mgf
