# Set up metatlas shifter environment

In [1]:
# Imports shared by both execution modes. They sit outside the branch so that
# every name used by later cells (for example `os` in the config cell) is
# defined regardless of the value of `in_jupyterlab`.
import os
import sys
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import yaml
from IPython.display import display

# False: import the standards tools from the metatlas checkout added to sys.path below.
# True:  run under JupyterLab, installing the "Metatlas Targeted" kernel spec if needed.
in_jupyterlab = False

if not in_jupyterlab:
    # Make the metatlas checkout importable so that `notebooks.*` resolves.
    sys.path.insert(1, '/global/homes/b/bkieft/metatlas')
    import notebooks.standards_library.standard_annotation as sta

    pd.options.display.max_colwidth = 300
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

else:
    class StopExecution(Exception):
        """Raised to halt the notebook after a CRITICAL message has been printed."""

        def _render_traceback_(self):
            # Intentionally empty so that no traceback is rendered for this exception.
            pass

    kernel_def = """{"argv":["shifter","--entrypoint","--image=ghcr.io/biorack/metatlas/metatlas_shifter:latest","/usr/local/bin/python","-m",
                    "ipykernel_launcher","-f","{connection_file}"],"display_name": "Metatlas Targeted","language": "python",
                    "metadata": { "debugger": true }}"""
    kernel_file_name = Path.home() / ".local" / "share" / "jupyter" / "kernels" / "metatlas-targeted" / "kernel.json"
    try:
        has_root_kernel = Path("/root/.local/share/jupyter/kernels/papermill/kernel.json").is_file()
    except PermissionError:
        # /root is not readable for this user; treat the root kernel as absent.
        has_root_kernel = False
    if not has_root_kernel and not kernel_file_name.is_file():
        # First run: write the kernel spec, then stop so the user can switch kernels.
        kernel_file_name.parent.mkdir(parents=True, exist_ok=True)
        with kernel_file_name.open(mode="w", encoding="utf-8") as f:
            f.write(kernel_def)
        print('CRITICAL: Notebook kernel has been installed. Set kernel to "Metatlas Targeted" and re-run notebook.')
        raise StopExecution
    try:
        from metatlas.tools import notebook  # noqa: E402
    except ImportError as err:
        print('CRITICAL: Set notebook kernel to "Metatlas Targeted" and re-run notebook.')
        raise StopExecution from err

    # Source code version handed to notebook.setup.
    source_code_version_id = "af68070fac3809e3011eebf6c99d3ed513c9342b"
    notebook.setup("INFO", source_code_version_id)

    try:
        import notebooks.standards_library.standard_annotation as sta
    except ImportError as err:
        print('CRITICAL: Could not import standard annotation tools and notebook modules.')
        raise StopExecution from err

# Read config file and set notebook options

In [2]:
# YAML file that drives every option used in the rest of this notebook.
config_path = "/global/homes/b/bkieft/metatlas_junkdrawer/example_data/greenham/RefStd_Greenham_20250516.yaml"
with open(config_path, "r") as config_file:
    config = yaml.safe_load(config_file)

# Widen text columns in displayed tables and stamp this run; the timestamp is
# passed to the save/report calls in later cells.
pd.options.display.max_colwidth = 300
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
print(
    f"Running with config: {os.path.basename(config_path)}",
    f"Running analysis with timestamp: {timestamp}",
    sep="\n",
)

Running with config: RefStd_Greenham_20250516.yaml
Running analysis with timestamp: 20250620144214


# Extract EIC and Spectra information from files in the run table

In [3]:
# Restore the full extraction results from cache, or rebuild them from the run
# table and save them, depending on config["cache"]["full_data_from_cache"].
if config["cache"]["full_data_from_cache"] is True:
    (
        eics_full,
        top_spectra_full,
        group_names_full,
        rt_peaks_full,
        atlas_full,
        mols_images,
        lcmsruns_table_with_adducts,
    ) = sta.handle_data(mode="load", config=config, file_suffix="full")
elif config["cache"]["full_data_from_cache"] is False:
    lcmsruns_table_with_adducts = sta.build_standard_lcmsrun_table(config)
    (
        eics_full,
        top_spectra_full,
        group_names_full,
        rt_peaks_full,
        atlas_full,
        mols_images,
    ) = sta.extract_data(lcmsruns_table_with_adducts, config, method="find_peaks")
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="full",
        data=(
            eics_full,
            top_spectra_full,
            group_names_full,
            rt_peaks_full,
            atlas_full,
            mols_images,
            lcmsruns_table_with_adducts,
        ),
    )

Loading most recent pkl file: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/greenham/cache/20250605104528_ref_stds_full.pkl


# Create interactive plot to choose adduct rt peaks for each standard compound

In [4]:
# Either restore a finished selection from cache, or open the selection GUI
# (optionally pre-populated with previously saved GUI state).
if config["cache"]["selected_data_from_cache"] is True:
    processed_data, selection_results_dict, running_notes_dict = sta.handle_data(
        mode="load", config=config, file_suffix="selected"
    )

elif config["cache"]["selected_data_from_cache"] is False:
    processed_data = sta.process_data_for_plotting(
        eics_full, top_spectra_full, group_names_full, rt_peaks_full, config
    )

    if config["cache"]["gui_data_from_cache"] is False:
        # Start with no selections; seed the notes from the run table, keyed by
        # "<compound_name>;;<standard_lcmsrun>".
        selection_results_dict = {}
        running_notes_dict = {
            f"{run_row['compound_name']};;{run_row['standard_lcmsrun']}": run_row['annotation_notes']
            for _, run_row in lcmsruns_table_with_adducts.iterrows()
        }

    elif config["cache"]["gui_data_from_cache"] is True:
        # Resume from saved GUI state; this replaces the freshly processed data.
        processed_data, selection_results_dict, running_notes_dict = sta.handle_data(
            mode="load", config=config, file_suffix="selected"
        )

    sta.create_interactive_plots(processed_data, selection_results_dict, mols_images, running_notes_dict)
    # Run next cell after manual selection of adducts

Loading most recent pkl file: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/greenham/cache/20250606111208_ref_stds_selected.pkl


In [5]:
# Warn about every entry whose selection is still the empty ([], [], []) tuple
for key, value in selection_results_dict.items():
    if not (isinstance(value, tuple) and value == ([], [], [])):
        continue
    print(
        f"WARNING: No selections made for {key}.",
        "Please return to GUI and select adducts for this compound.",
        sep="\n",
    )

In [6]:
# Persist the GUI selections once they are complete (skipped when they were loaded from cache)
if config["cache"]["selected_data_from_cache"] is False:
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="selected",
        data=(processed_data, selection_results_dict, running_notes_dict),
    )

# Filter RT Peak, EICs, and Top Spectra by the selected compounds+adducts

In [7]:
# Restore the filtered data from cache, or rebuild it from the current
# selections and save it.
if config["cache"]["filtered_data_from_cache"] is True:
    all_selected_adducts, all_rt_peaks_formatted = sta.handle_data(
        mode="load", config=config, file_suffix="filtered"
    )

elif config["cache"]["filtered_data_from_cache"] is False:
    all_selected_adducts = sta.filter_by_selected(
        eics_full, rt_peaks_full, top_spectra_full, selection_results_dict
    )
    all_rt_peaks_formatted = sta.format_rt_peaks(all_selected_adducts['rt_peaks'])
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="filtered",
        data=(all_selected_adducts, all_rt_peaks_formatted),
    )

Loading most recent pkl file: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/greenham/cache/20250606114757_ref_stds_filtered.pkl


# Generate static summary reports for each compound and a combined summary document

In [8]:
# Each output below is switched on independently under
# config["analysis"]["fresh_outputs"].
if config["analysis"]["fresh_outputs"]["generate_static_summary_pdfs"] is True:
    sta.generate_static_summary_plots(processed_data, selection_results_dict, config, timestamp)

if config["analysis"]["fresh_outputs"]["generate_selection_summary_table"] is True:
    sta.generate_selection_summary_table(all_rt_peaks_formatted, running_notes_dict, config, timestamp)

if config["analysis"]["fresh_outputs"]["upload_to_gdrive"] is True:
    # Use this run's timestamp so the upload refers to the outputs generated
    # above. This was previously pinned to the literal "20250606121942"; to
    # re-upload an earlier run, pass that run's timestamp string here instead.
    check_upload_status = sta.upload_to_google_drive(
        config["project"]["standards_output_path"],
        config["project"]["standards_input_file"],
        timestamp=timestamp,
        overwrite=True,
    )

# Stop here for compound review before depositing to the databases

# Identify compounds not in the metatlas database Compounds table and store if necessary

In [9]:
# Get the in-database / not-in-database split (from cache or by searching),
# optionally store the missing compounds, then run the deposit check.
if config["cache"]['metatlas_db_data_from_cache'] is True:
    in_db, notin_db, all_rt_peaks_formatted = sta.handle_data(
        mode="load", config=config, file_suffix="metatlas_db"
    )
    if len(notin_db) > 0:
        if config['compounds']['direct_store_to_compounds_table'] is True:
            # Store selected compounds+adducts in metatlas db
            sta.store_in_metatlas_db(notin_db)
    sta.check_db_deposit(all_rt_peaks_formatted)

elif config["cache"]['metatlas_db_data_from_cache'] is False:
    # Check if selected compounds from ALL are in metatlas DB
    in_db, notin_db = sta.search_for_matches_in_metatlas_db(all_rt_peaks_formatted, check_by_flat=False)
    if len(notin_db) > 0:
        if config['compounds']['direct_store_to_compounds_table'] is True:
            # Store selected compounds+adducts in metatlas db
            sta.store_in_metatlas_db(notin_db)
    sta.check_db_deposit(all_rt_peaks_formatted)
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="metatlas_db",
        data=(in_db, notin_db, all_rt_peaks_formatted),
    )

Loading most recent pkl file: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/greenham/cache/20250620143602_ref_stds_metatlas_db.pkl
Running double check for compounds in metatlas db Comopunds table...


	All new entries found in the database.



# Identify compounds+adducts not in atlases and set up new atlas creation

In [10]:
# Work out which selected compounds+adducts are missing from the current EMA
# atlases and apply RT correction to them, or restore that result from cache.
if config["cache"]["ema_atlas_data_from_cache"] is False:
    # Choose which rows feed the new atlases: every selected adduct ("all") or
    # only rows flagged as best_adduct ("best").
    new_ema_atlas_dtype = config["atlases"]["new_ema_atlas_dtype"]
    if new_ema_atlas_dtype == "all":
        rt_peaks_ema_input = all_rt_peaks_formatted
    elif new_ema_atlas_dtype == "best":
        rt_peaks_ema_input = all_rt_peaks_formatted[all_rt_peaks_formatted['best_adduct'] == True]  # noqa: E712
    else:
        # Fail here with a clear message instead of a NameError further down.
        raise ValueError(
            f"Unsupported config['atlases']['new_ema_atlas_dtype']: {new_ema_atlas_dtype!r} "
            "(expected 'all' or 'best')."
        )
    ema_atlases_data = sta.get_ema_atlas_data(config["atlases"]["current_ema_atlases"])
    rt_peaks_ema_input_formatted = sta.convert_rt_peaks_to_atlas_format(rt_peaks_ema_input)
    matches_to_atlases, nonmatches_to_atlases = sta.search_for_matches_in_atlases(rt_peaks_ema_input_formatted, ema_atlases_data)

    if config["cache"]["rt_correction_data_from_cache"] is False:
        print("Setting up RT correction for compounds not yet in atlases using baseline correction method:\n")
        baseline_to_experimental_qc, baseline_correction_outputs = sta.run_rt_correction(nonmatches_to_atlases, config)
        sta.handle_data(mode="save", config=config, timestamp=timestamp, file_suffix="rt_correction",
                        data=(baseline_to_experimental_qc, baseline_correction_outputs))

    elif config["cache"]["rt_correction_data_from_cache"] is True:
        baseline_to_experimental_qc, baseline_correction_outputs = sta.handle_data(mode="load", config=config, file_suffix="rt_correction")

    nonmatches_to_atlases_rt_corrected = sta.substitute_corrected_rt_values(nonmatches_to_atlases, baseline_correction_outputs)
    sta.handle_data(mode="save", config=config, timestamp=timestamp, file_suffix="ema_atlases",
                    data=(nonmatches_to_atlases_rt_corrected, ema_atlases_data, baseline_to_experimental_qc, baseline_correction_outputs))

elif config["cache"]["ema_atlas_data_from_cache"] is True:
    nonmatches_to_atlases_rt_corrected, ema_atlases_data, baseline_to_experimental_qc, baseline_correction_outputs = sta.handle_data(mode="load", config=config, file_suffix="ema_atlases")
    # Report the number of rows per chromatography. The previous nunique() call
    # printed the number of distinct chromatographies, not the number of compounds.
    compounds_per_chromatography = nonmatches_to_atlases_rt_corrected['chromatography'].value_counts().to_dict()
    print(f"Total compounds to add to EMA atlases per chromatography: {compounds_per_chromatography}")

 Searching in HILICZ positive atlas:   0%|          | 0/10 [00:00<?, ? compound/s]

 Searching in HILICZ negative atlas:   0%|          | 0/7 [00:00<?, ? compound/s]

 Searching in C18 positive atlas:   0%|          | 0/9 [00:00<?, ? compound/s]

 Searching in C18 negative atlas:   0%|          | 0/6 [00:00<?, ? compound/s]


Summary of compounds+adducts already in the atlases:



Unnamed: 0,query_unique_id,query_to_atlas,atlas_matches,atlas_source
0,Indole-3-pyruvate;;[M+H]+;;positive;;HILICZ,"InChI=1S/C11H9NO3/c13-10(11(14)15)5-7-6-12-9-4-2-1-3-8(7)9/h1-4,6,12H,5H2,(H,14,15)","[InChI=1S/C11H9NO3/c13-10(11(14)15)5-7-6-12-9-4-2-1-3-8(7)9/h1-4,6,12H,5H2,(H,14,15)]",[HILIC_EMA-standards_positive_20250620134440.tsv]
1,Phenylethylamine;;[M+H]+;;positive;;HILICZ,"InChI=1S/C8H11N/c9-7-6-8-4-2-1-3-5-8/h1-5H,6-7,9H2","[InChI=1S/C8H11N/c9-7-6-8-4-2-1-3-5-8/h1-5H,6-7,9H2]",[HILIC_EMA-standards_positive_20250620134440.tsv]
2,Tryptamine;;[M+H]+;;positive;;HILICZ,"InChI=1S/C10H12N2/c11-6-5-8-7-12-10-4-2-1-3-9(8)10/h1-4,7,12H,5-6,11H2","[InChI=1S/C10H12N2/c11-6-5-8-7-12-10-4-2-1-3-9(8)10/h1-4,7,12H,5-6,11H2]",[HILIC_EMA-standards_positive_20250620134440.tsv]
3,5-Hydroxy-L-Tryptophan;;[M+H]+;;positive;;HILICZ,5-hydroxy-tryptophan,[5-hydroxy-tryptophan],[HILIC_EMA-standards_positive_20250620134440.tsv]
4,Glutathione (reduced;GSH);;[M+H]+;;positive;;HILICZ,"InChI=1S/C10H17N3O6S/c11-5(10(18)19)1-2-7(14)13-6(4-20)9(17)12-3-8(15)16/h5-6,20H,1-4,11H2,(H,12,17)(H,13,14)(H,15,16)(H,18,19)/t5-,6-/m0/s1","[InChI=1S/C10H17N3O6S/c11-5(10(18)19)1-2-7(14)13-6(4-20)9(17)12-3-8(15)16/h5-6,20H,1-4,11H2,(H,12,17)(H,13,14)(H,15,16)(H,18,19)/t5-,6-/m0/s1]",[HILIC_EMA-standards_positive_20250620134440.tsv]
5,Indole-3-pyruvate;;[M-H]-;;negative;;HILICZ,"InChI=1S/C11H9NO3/c13-10(11(14)15)5-7-6-12-9-4-2-1-3-8(7)9/h1-4,6,12H,5H2,(H,14,15)","[InChI=1S/C11H9NO3/c13-10(11(14)15)5-7-6-12-9-4-2-1-3-8(7)9/h1-4,6,12H,5H2,(H,14,15)]",[HILIC_EMA-standards_negative_20250620134440.tsv]
6,Cinnamate;;[M-H]-;;negative;;HILICZ,"InChI=1S/C9H8O2/c10-9(11)7-6-8-4-2-1-3-5-8/h1-7H,(H,10,11)/b7-6+","[InChI=1S/C9H8O2/c10-9(11)7-6-8-4-2-1-3-5-8/h1-7H,(H,10,11)/b7-6+]",[HILIC_EMA-standards_negative_20250620134440.tsv]
7,5-Hydroxy-L-Tryptophan;;[M-H]-;;negative;;HILICZ,5-hydroxy-tryptophan,[5-hydroxy-tryptophan],[HILIC_EMA-standards_negative_20250620134440.tsv]
8,Glutathione (reduced;GSH);;[M-H]-;;negative;;HILICZ,"InChI=1S/C10H17N3O6S/c11-5(10(18)19)1-2-7(14)13-6(4-20)9(17)12-3-8(15)16/h5-6,20H,1-4,11H2,(H,12,17)(H,13,14)(H,15,16)(H,18,19)/t5-,6-/m0/s1","[InChI=1S/C10H17N3O6S/c11-5(10(18)19)1-2-7(14)13-6(4-20)9(17)12-3-8(15)16/h5-6,20H,1-4,11H2,(H,12,17)(H,13,14)(H,15,16)(H,18,19)/t5-,6-/m0/s1]",[HILIC_EMA-standards_negative_20250620134440.tsv]
9,5-Hydroxy-L-Tryptophan;;[M+H]+;;positive;;C18,5-hydroxy-l-tryptophan,[5-hydroxy-l-tryptophan],[C18_EMA-standards_positive.tsv]



There are 17 compounds+adducts are not yet in any atlases. View with 'nonmatches_to_atlases'.

Setting up RT correction for compounds not yet in atlases using baseline correction method:

	Getting all QC files for project /global/cfs/cdirs/metatlas/raw_data/jgi/20250321_JGI_KG_510479_Brassica_final-set2_EXP120B_HILICZ_USHXG01827

	Retrieving baseline HILICZ QC atlas: /global/homes/b/bkieft/metatlas-data/HILIC/HILIC_QCv7_positive.tsv

	Collecting QC MS1 data for HILICZ...



 Collecting MS1 data for QC compounds:   0%|          | 0/57 [00:00<?, ? file/s]

Skipping C18 chromatography as it has a user-defined RT correction model.
	Performing RT correction...



Calculating RT correction model:   0%|          | 0/1 [00:00<?, ? chromatography/s]

	HILICZ RT correction results:


Unnamed: 0,label,adduct,polarity,rt_peak_baseline,rt_peak_experimental,rt_peak_corrected,rt_min_corrected,rt_max_corrected,rt_diff_experimental_vs_corrected
0,UDP-Glucose,[M+H]+,positive,,15.953246,16.055008,15.555008,16.555008,-0.101762
1,UDP-Glucose,[M+NH4]+,positive,,15.953246,16.055008,15.555008,16.555008,-0.101762
2,UDP-Glucose,[M+Na]+,positive,,15.953246,16.055008,15.555008,16.555008,-0.101762
3,UDP-Glucose,[M+K]+,positive,,15.956995,16.058668,15.558668,16.558668,-0.101673
4,Glutathione (oxidized;GSSH),[M+H]+,positive,,17.145823,17.215853,16.715853,17.715853,-0.070031
5,Phenylacetaldehyde,[M-H]-,negative,,0.813836,0.684882,0.184882,1.184882,0.128954
6,UDP-Glucose,[M-H]-,negative,,15.946436,16.048358,15.548358,16.548358,-0.101922
7,Glutathione (oxidized;GSSH),[M-H]-,negative,,17.143135,17.213246,16.713246,17.713246,-0.07011
8,ABMBA (unlabeled),,QC,1.093806,1.238709,1.132249,0.632249,1.632249,0.10646
9,N-acetyl-glucosamine (U - 13C),,QC,6.707815,6.627501,6.726059,6.226059,7.226059,-0.098558


Saving data to: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/greenham/cache/20250620144214_ref_stds_rt_correction.pkl
Formatted 17 RT-corrected compounds for insertion into HILICZ atlases.
Saving data to: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/greenham/cache/20250620144214_ref_stds_ema_atlases.pkl


# Create new EMA atlas with top selected reference standards added

In [11]:
# Either only show the new atlas rows held in memory, or write the updated EMA
# atlases and report the resulting ids and file names.
if config['atlases']['save_new_ema_atlases'] is False:
    print("No new EMA atlases saved to disk, as 'save_new_ema_atlases' is set to False in the config file.")
    print("Here is the new atlas data in memory:")
    display(nonmatches_to_atlases_rt_corrected)

elif config['atlases']['save_new_ema_atlases'] is True:
    ema_atlas_ids, ema_atlas_names = sta.update_and_save_ema_atlases(
        nonmatches_to_atlases_rt_corrected, ema_atlases_data, config, timestamp
    )
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="new_atlas_ids",
        data=(ema_atlas_ids, ema_atlas_names),
    )
    # NOTE(review): reloading straight after saving looks redundant - confirm before removing.
    ema_atlas_ids, ema_atlas_names = sta.handle_data(
        mode="load", config=config, file_suffix="new_atlas_ids"
    )

    if config['atlases']['direct_deposit_new_emas'] is True:
        print("New EMA atlases have been saved to disk and deposited in the metatlas database:")
        display(pd.DataFrame.from_dict(ema_atlas_ids))
    print("\nNew EMA atlas locations:")
    display(pd.DataFrame.from_dict(ema_atlas_names))


Current HILICZ positive EMA atlas: HILIC_EMA-standards_positive.tsv
373 current compounds updated with 6 new compounds for a total of 379 compounds.
Updated HILICZ positive EMA atlas saved to: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/hatzenpichler/updated_EMA_atlases/HILIC_EMA-standards_positive_20250620134440.tsv


Current HILICZ negative EMA atlas: HILIC_EMA-standards_negative.tsv
418 current compounds updated with 5 new compounds for a total of 423 compounds.
Updated HILICZ negative EMA atlas saved to: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/hatzenpichler/updated_EMA_atlases/HILIC_EMA-standards_negative_20250620134440.tsv

No compounds to add to the C18 positive atlas. Skipping.
No compounds to add to the C18 negative atlas. Skipping.
Saving data to: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/hatzenpichler/cache/20250620134440_ref_stds_new_atlas_ids.pkl
Loading most recent pkl file: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/

Unnamed: 0,HILICZ,C18
positive,/global/homes/b/bkieft/metatlas_junkdrawer/example_data/hatzenpichler/updated_EMA_atlases/HILIC_EMA-standards_positive_20250620134440.tsv,
negative,/global/homes/b/bkieft/metatlas_junkdrawer/example_data/hatzenpichler/updated_EMA_atlases/HILIC_EMA-standards_negative_20250620134440.tsv,


# Identify compounds not in MSMS refs and set up new MSMS refs creation

In [12]:
# Work out which selected compounds+adducts are missing from the current MSMS
# refs and save that result, or restore it from cache.
if config["cache"]["msms_refs_data_from_cache"] is False:
    # Choose which rows feed the new MSMS refs: every selected adduct ("all") or
    # only rows flagged as best_adduct ("best").
    new_msms_refs_dtype = config["msms_refs"]["new_msms_refs_dtype"]
    if new_msms_refs_dtype == "all":
        rt_peaks_msms_input = all_rt_peaks_formatted
        top_spectra_msms_input = all_selected_adducts['top_spectra']
    elif new_msms_refs_dtype == "best":
        rt_peaks_msms_input = all_rt_peaks_formatted[all_rt_peaks_formatted['best_adduct'] == True]  # noqa: E712
        top_spectra_msms_input = all_selected_adducts['top_spectra'][all_selected_adducts['top_spectra']['best_adduct'] == True]  # noqa: E712
    else:
        # Fail here with a clear message instead of a NameError further down.
        raise ValueError(
            f"Unsupported config['msms_refs']['new_msms_refs_dtype']: {new_msms_refs_dtype!r} "
            "(expected 'all' or 'best')."
        )
    msms_refs = sta.get_msms_refs(msms_refs_path=config["msms_refs"]["current_msms_refs_path"])
    rt_peaks_msms_input_formatted = sta.format_for_msms_refs(rt_peaks_msms_input, top_spectra_msms_input, msms_refs, config)
    in_msms_refs, notin_msms_refs = sta.search_for_matches_in_msms_refs(rt_peaks_msms_input_formatted, msms_refs, check_by_flat=True)
    sta.handle_data(mode="save", config=config, timestamp=timestamp, file_suffix="msms_refs",
                    data=(msms_refs, notin_msms_refs, rt_peaks_msms_input, top_spectra_msms_input))

elif config["cache"]["msms_refs_data_from_cache"] is True:
    msms_refs, notin_msms_refs, rt_peaks_msms_input, top_spectra_msms_input = sta.handle_data(mode="load", config=config, file_suffix="msms_refs")

Loading most recent pkl file: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/hatzenpichler/cache/20250620110554_ref_stds_msms_refs.pkl


# Create new MSMS refs table from selected reference standards

In [13]:
# Write the updated MSMS refs table only when the config asks for it.
if config["msms_refs"]['save_new_msms_refs'] is False:
    print("No new MSMS refs saved to disk, as 'save_new_msms_refs' is set to False in the config file.")

elif config["msms_refs"]['save_new_msms_refs'] is True:
    sta.update_and_save_msms_refs(
        msms_refs,
        notin_msms_refs,
        config,
        timestamp,
    )

Existing MSMS refs went from 216409 to 216441 compounds.
	New MSMS refs: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/hatzenpichler/updated_MSMS_refs/msms_refs_20250620134440.tab


# Create new MSMS refs MGF file from selected reference standards

In [14]:
# Write the MGF file from the selected top spectra only when the config asks for it.
if config["msms_refs"]['save_new_mgf'] is False:
    print("No new MGF refs saved to disk, as 'save_new_mgf' is set to False in the config file.")

elif config["msms_refs"]['save_new_mgf'] is True:
    sta.write_mgf_from_top_spectra(
        top_spectra_msms_input,
        rt_peaks_msms_input,
        config,
        timestamp,
    )

No new MGF refs saved to disk, as 'save_new_mgf' is set to False in the config file.
