# Set up metatlas shifter environment

In [None]:
# Toggle: True when running inside JupyterLab (installs/uses the
# "Metatlas Targeted" kernel), False when running against a local metatlas
# checkout added to sys.path.
in_jupyterlab = True

import numpy as np
import pandas as pd
import sys
from datetime import datetime
import os
from IPython.display import display
import yaml
from pathlib import Path


class StopExecution(Exception):
    """Exception used to halt the current cell after a CRITICAL message.

    Defined before the environment branches so that BOTH branches can raise
    it; previously it only existed in the JupyterLab branch, so a failed
    import in the non-JupyterLab branch raised NameError instead.
    """

    def _render_traceback_(self):
        # IPython looks for this hook when rendering an exception; returning
        # nothing keeps the cell output limited to the printed message.
        pass


if in_jupyterlab is False:

    sys.path.insert(1, '/global/homes/b/bkieft/metatlas') # Enter your own metatlas repo path here
    try:
        from metatlas.tools.standards_library import standard_annotation as sta
    except ImportError as err:
        print('CRITICAL: Could not import standard annotation tools.')
        raise StopExecution from err

elif in_jupyterlab is True:

    # Kernel spec written to the user's Jupyter kernel directory when no
    # suitable kernel is found. The text is written verbatim.
    kernel_def = """{"argv":["shifter","--entrypoint","--image=ghcr.io/biorack/metatlas/metatlas_shifter:latest","/usr/local/bin/python","-m",
                    "ipykernel_launcher","-f","{connection_file}"],"display_name": "Metatlas Targeted","language": "python",
                    "metadata": { "debugger": true }}"""
    kernel_file_name = Path.home() / ".local" / "share" / "jupyter" / "kernels" / "metatlas-targeted" / "kernel.json"

    try:
        has_root_kernel = Path("/root/.local/share/jupyter/kernels/papermill/kernel.json").is_file()
    except PermissionError:
        # /root may be unreadable for the current user; treat as "not present".
        has_root_kernel = False
    if not has_root_kernel and not kernel_file_name.is_file():
        kernel_file_name.parent.mkdir(parents=True, exist_ok=True)
        with kernel_file_name.open(mode="w", encoding="utf-8") as f:
            # kernel_def is a single string, so write() is the right call
            # (writelines() would iterate it character by character).
            f.write(kernel_def)
        print('CRITICAL: Notebook kernel has been installed. Set kernel to "Metatlas Targeted" and re-run notebook.')
        raise StopExecution

    try:
        from metatlas.tools import notebook  # noqa: E402
    except ImportError as err:
        print('CRITICAL: Set notebook kernel to "Metatlas Targeted" and re-run notebook.')
        raise StopExecution from err

    source_code_version_id = None
    notebook.setup("INFO", source_code_version_id)

    try:
        from metatlas.tools.standards_library import standard_annotation as sta
    except ImportError as err:
        print('CRITICAL: Could not import standard annotation tools.')
        raise StopExecution from err

# Read config file and set notebook options

In [None]:
# Path to the YAML config that every later cell reads its options from.
config_path = "/path/to/config.yaml"
with open(config_path, mode="r") as config_file:
    config = yaml.safe_load(config_file)

# Let long text columns show in full when DataFrames are displayed.
pd.set_option("display.max_colwidth", 300)

# One timestamp per run; later cells pass it to the save/report helpers.
timestamp = f"{datetime.now():%Y%m%d%H%M%S}"

print(
    f"Running with config: {os.path.basename(config_path)}",
    f"Running analysis with timestamp: {timestamp}",
    sep="\n",
)

# Extract EIC and Spectra information from files in the run table

In [None]:
# Obtain the "full" data set: reload it via handle_data when the config asks
# for the cache, otherwise build the run table, extract, and save.
# The flag is compared by identity, so a non-boolean value matches neither branch.
if config["cache"]["full_data_from_cache"] is True:
    (
        eics_full,
        top_spectra_full,
        group_names_full,
        rt_peaks_full,
        atlas_full,
        mols_images,
        lcmsruns_table_with_adducts,
    ) = sta.handle_data(mode="load", config=config, file_suffix="full")
elif config["cache"]["full_data_from_cache"] is False:
    lcmsruns_table_with_adducts = sta.build_standard_lcmsrun_table(config)
    (
        eics_full,
        top_spectra_full,
        group_names_full,
        rt_peaks_full,
        atlas_full,
        mols_images,
    ) = sta.extract_data(lcmsruns_table_with_adducts, config, method="find_peaks")
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="full",
        data=(
            eics_full,
            top_spectra_full,
            group_names_full,
            rt_peaks_full,
            atlas_full,
            mols_images,
            lcmsruns_table_with_adducts,
        ),
    )

# Create interactive plot to choose adduct rt peaks for each standard compound

In [None]:
# Either reload previously saved selections, or prepare plot data and open
# the interactive plots so adducts can be chosen by hand.
if config["cache"]["selected_data_from_cache"] is True:
    processed_data, selection_results_dict, running_notes_dict = sta.handle_data(
        mode="load", config=config, file_suffix="selected"
    )

elif config["cache"]["selected_data_from_cache"] is False:
    processed_data = sta.process_data_for_plotting(
        eics_full, top_spectra_full, group_names_full, rt_peaks_full, config
    )

    if config["cache"]["gui_data_from_cache"] is False:
        # Fresh session: no selections yet; notes start from the run table,
        # keyed as "<compound_name>;;<standard_lcmsrun>".
        selection_results_dict = {}
        running_notes_dict = dict(
            (
                f"{run_row['compound_name']};;{run_row['standard_lcmsrun']}",
                run_row['annotation_notes'],
            )
            for _, run_row in lcmsruns_table_with_adducts.iterrows()
        )

    elif config["cache"]["gui_data_from_cache"] is True:
        # Resume an earlier session; this replaces the processed_data computed above.
        processed_data, selection_results_dict, running_notes_dict = sta.handle_data(
            mode="load", config=config, file_suffix="selected"
        )

    sta.create_interactive_plots(
        processed_data, selection_results_dict, mols_images, running_notes_dict
    )
    # Run next cell after manual selection of adducts

In [None]:
# Warn about every entry whose selection is still the empty triple ([], [], []).
for key, value in selection_results_dict.items():
    if not (isinstance(value, tuple) and value == ([], [], [])):
        continue
    print(
        f"WARNING: No selections made for {key}.",
        "Please return to GUI and select adducts for this compound.",
        sep="\n",
    )

In [None]:
# Persist the GUI selections, but only when they were not loaded from the cache.
if config["cache"]["selected_data_from_cache"] is False:
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="selected",
        data=(processed_data, selection_results_dict, running_notes_dict),
    )

# Filter RT Peaks, EICs, and Top Spectra by the selected compounds+adducts

In [None]:
# Reload the filtered data from the cache, or derive it from the selections
# and save it.
if config["cache"]["filtered_data_from_cache"] is True:
    all_selected_adducts, all_rt_peaks_formatted = sta.handle_data(
        mode="load", config=config, file_suffix="filtered"
    )

elif config["cache"]["filtered_data_from_cache"] is False:
    all_selected_adducts = sta.filter_by_selected(
        eics_full, rt_peaks_full, top_spectra_full, selection_results_dict
    )
    all_rt_peaks_formatted = sta.format_rt_peaks(all_selected_adducts['rt_peaks'])
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="filtered",
        data=(all_selected_adducts, all_rt_peaks_formatted),
    )

# Generate static summary reports for each compound and a combined summary document

In [None]:
# Optional outputs, each enabled by its own flag under
# config["analysis"]["new_outputs"]. Flags are compared by identity, so only a
# literal True enables a step.
if config["analysis"]["new_outputs"]["generate_static_summary_pdfs"] is True:
    sta.generate_static_summary_plots(processed_data, selection_results_dict, config, timestamp)

if config["analysis"]["new_outputs"]["generate_selection_summary_table"] is True:
    sta.generate_selection_summary_table(all_rt_peaks_formatted, running_notes_dict, config, timestamp)

if config["analysis"]["new_outputs"]["upload_to_gdrive"] is True:
    # Pass this run's timestamp (the same one given to the generators above)
    # instead of a hard-coded value left over from an earlier run.
    check_upload_status = sta.upload_to_google_drive(
        config["project"]["standards_output_path"],
        config["project"]["standards_input_file"],
        timestamp=timestamp,
        overwrite=True,
    )

# Stop here to review compounds before depositing to DBs

# Identify compounds not in the metatlas database Compounds table and store if necessary

In [None]:
# Split the selected compounds into those found / not found by the metatlas DB
# search (or reload that split from the cache), optionally store the missing
# ones, then run the deposit check.
if config["cache"]['metatlas_db_data_from_cache'] is True:
    in_db, notin_db, all_rt_peaks_formatted = sta.handle_data(
        mode="load", config=config, file_suffix="metatlas_db"
    )
    if len(notin_db) > 0:
        if config['compounds']['direct_store_to_compounds_table'] is True:
            # Store selected compounds+adducts in metatlas db
            sta.store_in_metatlas_db(notin_db)
    sta.check_db_deposit(all_rt_peaks_formatted)

elif config["cache"]['metatlas_db_data_from_cache'] is False:
    # Check if selected compounds from ALL are in metatlas DB
    in_db, notin_db = sta.search_for_matches_in_metatlas_db(
        all_rt_peaks_formatted, check_by_flat=False
    )
    if len(notin_db) > 0:
        if config['compounds']['direct_store_to_compounds_table'] is True:
            # Store selected compounds+adducts in metatlas db
            sta.store_in_metatlas_db(notin_db)
    sta.check_db_deposit(all_rt_peaks_formatted)
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="metatlas_db",
        data=(in_db, notin_db, all_rt_peaks_formatted),
    )

# Identify compounds+adducts not in atlases and set up new atlas creation

In [None]:
# Work out which selected compounds+adducts are missing from the current EMA
# atlases and apply RT correction to them, or reload that result from the cache.
if config["cache"]["ema_atlas_data_from_cache"] is False:
    # Choose the peaks that feed the new atlas: everything, or only rows
    # flagged as best_adduct.
    if config["atlases"]["new_ema_atlas_dtype"] == "all":
        rt_peaks_ema_input = all_rt_peaks_formatted
    elif config["atlases"]["new_ema_atlas_dtype"] == "best":
        # `== True` is an element-wise comparison used as a row mask
        # (presumably a pandas DataFrame -- confirm), not an identity check.
        rt_peaks_ema_input = all_rt_peaks_formatted[all_rt_peaks_formatted['best_adduct'] == True]
    else:
        # Fail loudly: otherwise rt_peaks_ema_input would be unbound, or
        # silently reused from an earlier run of this cell.
        raise ValueError(
            "config['atlases']['new_ema_atlas_dtype'] must be 'all' or 'best', "
            f"got {config['atlases']['new_ema_atlas_dtype']!r}"
        )
    ema_atlases_data = sta.get_ema_atlas_data(config["atlases"]["current_ema_atlases"])
    rt_peaks_ema_input_formatted = sta.convert_rt_peaks_to_atlas_format(rt_peaks_ema_input)
    matches_to_atlases, nonmatches_to_atlases = sta.search_for_matches_in_atlases(rt_peaks_ema_input_formatted, ema_atlases_data)

    # RT correction outputs are either computed and saved, or reloaded.
    if config["cache"]["rt_correction_data_from_cache"] is False:
        print("Setting up RT correction for compounds not yet in atlases using baseline correction method:\n")
        baseline_to_experimental_qc, baseline_correction_outputs = sta.run_rt_correction(nonmatches_to_atlases, config)
        sta.handle_data(mode="save", config=config, timestamp=timestamp, file_suffix="rt_correction",
                        data=(baseline_to_experimental_qc, baseline_correction_outputs))

    elif config["cache"]["rt_correction_data_from_cache"] is True:
        baseline_to_experimental_qc, baseline_correction_outputs = sta.handle_data(mode="load", config=config, file_suffix="rt_correction")

    nonmatches_to_atlases_rt_corrected = sta.substitute_corrected_rt_values(nonmatches_to_atlases, baseline_correction_outputs)
    sta.handle_data(mode="save", config=config, timestamp=timestamp, file_suffix="ema_atlases",
                    data=(nonmatches_to_atlases_rt_corrected, ema_atlases_data, baseline_to_experimental_qc, baseline_correction_outputs))

elif config["cache"]["ema_atlas_data_from_cache"] is True:
    nonmatches_to_atlases_rt_corrected, ema_atlases_data, baseline_to_experimental_qc, baseline_correction_outputs = sta.handle_data(mode="load", config=config, file_suffix="ema_atlases")
    # NOTE(review): this prints the number of distinct 'chromatography' values,
    # which does not obviously match the "Total compounds ..." label -- confirm intent.
    print(f"Total compounds to add to EMA atlases per chromatography: {nonmatches_to_atlases_rt_corrected['chromatography'].nunique()}")

# Create new EMA atlas with top selected reference standards added

In [None]:
# Save the updated EMA atlases when the config enables it; otherwise just show
# the new atlas rows held in memory.
if config['atlases']['save_new_ema_atlases'] is False:
    print("No new EMA atlases saved to disk, as 'save_new_ema_atlases' is set to False in the config file.")
    print("Here is the new atlas data in memory:")
    display(nonmatches_to_atlases_rt_corrected)

elif config['atlases']['save_new_ema_atlases'] is True:
    ema_atlas_ids, ema_atlas_names = sta.update_and_save_ema_atlases(
        nonmatches_to_atlases_rt_corrected, ema_atlases_data, config, timestamp
    )
    # Save the ids/names, then read them straight back from what was stored.
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="new_atlas_ids",
        data=(ema_atlas_ids, ema_atlas_names),
    )
    ema_atlas_ids, ema_atlas_names = sta.handle_data(
        mode="load", config=config, file_suffix="new_atlas_ids"
    )

    if config['atlases']['direct_deposit_new_emas'] is True:
        print("New EMA atlases have been saved to disk and deposited in the metatlas database:")
        display(pd.DataFrame.from_dict(ema_atlas_ids))
    print("\nNew EMA atlas locations:")
    display(pd.DataFrame.from_dict(ema_atlas_names))

# Identify compounds not in MSMS refs and set up new MSMS refs creation

In [None]:
# Work out which selected spectra are missing from the current MSMS refs, or
# reload that result from the cache.
if config["cache"]["msms_refs_data_from_cache"] is False:
    # Choose the peaks/spectra that feed the new refs: everything, or only
    # rows flagged as best_adduct.
    if config["msms_refs"]["new_msms_refs_dtype"] == "all":
        rt_peaks_msms_input = all_rt_peaks_formatted
        top_spectra_msms_input = all_selected_adducts['top_spectra']
    elif config["msms_refs"]["new_msms_refs_dtype"] == "best":
        # `== True` is an element-wise comparison used as a row mask
        # (presumably pandas DataFrames -- confirm), not an identity check.
        rt_peaks_msms_input = all_rt_peaks_formatted[all_rt_peaks_formatted['best_adduct'] == True]
        top_spectra_msms_input = all_selected_adducts['top_spectra'][all_selected_adducts['top_spectra']['best_adduct'] == True]
    else:
        # Fail loudly: otherwise the *_msms_input names would be unbound, or
        # silently reused from an earlier run of this cell.
        raise ValueError(
            "config['msms_refs']['new_msms_refs_dtype'] must be 'all' or 'best', "
            f"got {config['msms_refs']['new_msms_refs_dtype']!r}"
        )
    msms_refs = sta.get_msms_refs(msms_refs_path=config["msms_refs"]["current_msms_refs_path"])
    rt_peaks_msms_input_formatted = sta.format_for_msms_refs(rt_peaks_msms_input, top_spectra_msms_input, msms_refs, config)
    in_msms_refs, notin_msms_refs = sta.search_for_matches_in_msms_refs(rt_peaks_msms_input_formatted, msms_refs, check_by_flat=True)
    sta.handle_data(mode="save", config=config, timestamp=timestamp, file_suffix="msms_refs",
                    data=(msms_refs, notin_msms_refs, rt_peaks_msms_input, top_spectra_msms_input))

elif config["cache"]["msms_refs_data_from_cache"] is True:
    msms_refs, notin_msms_refs, rt_peaks_msms_input, top_spectra_msms_input = sta.handle_data(mode="load", config=config, file_suffix="msms_refs")

# Create new MSMS refs table from selected reference standards

In [None]:
# Write the updated MSMS refs table only when the config enables it.
if config["msms_refs"]['save_new_msms_refs'] is False:
    print("No new MSMS refs saved to disk, as 'save_new_msms_refs' is set to False in the config file.")

elif config["msms_refs"]['save_new_msms_refs'] is True:
    sta.update_and_save_msms_refs(
        msms_refs,
        notin_msms_refs,
        config,
        timestamp,
    )

# Create new MSMS refs MGF file from selected reference standards

In [None]:
# Write the MGF file from the chosen top spectra only when the config enables it.
if config["msms_refs"]['save_new_mgf'] is False:
    print("No new MGF refs saved to disk, as 'save_new_mgf' is set to False in the config file.")

elif config["msms_refs"]['save_new_mgf'] is True:
    sta.write_mgf_from_top_spectra(
        top_spectra_msms_input,
        rt_peaks_msms_input,
        config,
        timestamp,
    )