In [None]:
import numpy as np
import pandas as pd
import sys
from datetime import datetime
import os
from IPython.display import display
import yaml

sys.path.insert(1, '/global/homes/b/bkieft/metatlas')
import notebooks.standards_library.standard_annotation as sta

# Allow up to 300 characters per value when pandas renders a DataFrame column.
pd.set_option("display.max_colwidth", 300)

# One YYYYMMDDHHMMSS stamp (local clock) for this session; it is passed to the
# save calls made through sta.handle_data and the update_and_save_* helpers.
timestamp = f"{datetime.now():%Y%m%d%H%M%S}"

# Enter Reference Standard information

In [None]:
# YAML file holding the settings that the rest of the notebook reads from `config`.
config_path = "/global/homes/b/bkieft/metatlas/notebooks/standards_library/msms_set_config.yaml"

# Text mode for reading is open()'s default, so no explicit mode is passed.
with open(config_path) as config_file:
    config = yaml.safe_load(config_file)

# Get EIC and Spectra information from run table

In [None]:
# Obtain the "full" data set either from a previous save or by extracting it now.
# The flag is compared by identity, so only the booleans True/False select a branch.
if config["raw_data_from_cache"] is True:
    # Load the tuple that was saved under the "full" suffix.
    (
        eics_full,
        top_spectra_full,
        group_names_full,
        rt_peaks_full,
        atlas_full,
        mols_images,
    ) = sta.handle_data(
        mode="load",
        save_path=config['standards_output_path'],
        file_suffix="full",
    )

elif config["raw_data_from_cache"] is False:
    # Build the run table restricted to the configured polarities,
    # chromatographies and adducts.
    lcmsruns_table_with_adducts = sta.build_standard_lcmsrun_table(
        config['standards_input_file'],
        include_polarities=config["include_polarities"],
        include_chromatographies=config["include_chromatographies"],
        include_adducts=config["include_adducts"],
    )

    # Extract the six result objects from that run table.
    (
        eics_full,
        top_spectra_full,
        group_names_full,
        rt_peaks_full,
        atlas_full,
        mols_images,
    ) = sta.extract_data(
        lcmsruns_table=lcmsruns_table_with_adducts,
        ppm_tolerance=config["ppm_tolerance"],
        method="find_peaks",
    )

    # Save them under the "full" suffix so a later run can load instead of extract.
    sta.handle_data(
        mode="save",
        save_path=config['standards_output_path'],
        timestamp=timestamp,
        data=(
            eics_full,
            top_spectra_full,
            group_names_full,
            rt_peaks_full,
            atlas_full,
            mols_images,
        ),
        file_suffix="full",
    )

# Create interactive plot and choose adducts

In [None]:
# Interactive selection only runs when the flag is exactly False; when selections
# come from cache this cell does nothing.
if config["selected_data_from_cache"] is False:
    # sort_by: 'run' or 'specs' (options listed in the original inline note).
    # subset_by_compound (str) / subset_by_run (int) narrow what gets plotted.
    processed_data = sta.process_data_for_plotting(
        eics_full,
        top_spectra_full,
        group_names_full,
        rt_peaks_full,
        config["include_adducts"],
        sort_by='run',
        subset_by_compound="sorgoleone",
        subset_by_run=None,
    )

    # Three separate empty dicts handed to the plotting GUI.
    # NOTE(review): presumably the GUI fills these in as adducts are picked - confirm.
    selected_adducts_dict, ambiguous_adducts_dict, top_adducts_dict = {}, {}, {}

    # Run this to create the interactive plots.
    sta.create_interactive_plots(
        processed_data,
        mols_images,
        selected_adducts_dict,
        ambiguous_adducts_dict,
        top_adducts_dict,
    )
    # Run the next cell only after the manual selection of adducts is finished.

In [None]:
# Either reload earlier selections, or persist the ones just made in the GUI.
if config["selected_data_from_cache"] is True:
    # Load the tuple that was saved under the "selected" suffix.
    (
        selected_adducts_dict,
        ambiguous_adducts_dict,
        top_adducts_dict,
        processed_data,
    ) = sta.handle_data(
        mode="load",
        save_path=config['standards_output_path'],
        file_suffix="selected",
    )

elif config["selected_data_from_cache"] is False:
    sta.handle_data(
        mode="save",
        save_path=config['standards_output_path'],
        timestamp=timestamp,
        data=(
            selected_adducts_dict,
            ambiguous_adducts_dict,
            top_adducts_dict,
            processed_data,
        ),
        file_suffix="selected",
    )
    # Save summary plots of selected compounds+adducts after completing the GUI.
    sta.generate_static_summary_plots(
        processed_data,
        selected_adducts_dict,
        top_adducts_dict,
        config["standards_output_path"],
    )

# Filter RT peaks, EICs, and top spectra by selected compounds+adducts, then check the metatlas DB

In [None]:
# Reduce the full EICs / RT peaks / top spectra to the selected compounds+adducts,
# or reload a previously saved "filtered" set.
if config["filtered_data_from_cache"] is False:
    eics_filtered, rt_peaks_filtered, top_spectra_filtered = sta.filter_by_selected(
        eics_full,
        rt_peaks_full,
        top_spectra_full,
        selected_adducts_dict,
        top_adducts_dict,
    )
    # Saved in (eics, top_spectra, rt_peaks) order; the load branch unpacks in the same order.
    sta.handle_data(
        mode="save",
        save_path=config['standards_output_path'],
        timestamp=timestamp,
        data=(eics_filtered, top_spectra_filtered, rt_peaks_filtered),
        file_suffix="filtered"
    )

elif config["filtered_data_from_cache"] is True:
    eics_filtered, top_spectra_filtered, rt_peaks_filtered = sta.handle_data(mode="load",
                                                                             save_path=config['standards_output_path'],
                                                                             file_suffix="filtered")

# Everything below runs after EITHER branch. These steps used to sit inside the
# cache-load branch only, which left rt_peaks_filtered_all_selected and
# rt_peaks_filtered_top_selected undefined on a fresh computation even though the
# atlas and MSMS-refs cells read them.

# Format RT peaks data for adding to metatlas DB, atlases, and MSMS refs.
# Export "all" for MSMS refs and "top" for EMA atlases.
rt_peaks_filtered_all_selected, rt_peaks_filtered_top_selected = sta.format_and_select_top_adducts(rt_peaks_filtered, top_adducts_dict)

# Check if selected compounds are in metatlas DB.
in_db, notin_db = sta.search_for_matches_in_metatlas_db(rt_peaks_filtered_all_selected, check_by_flat=True)

# Store selected compounds+adducts in metatlas db, only when something is missing
# and the config explicitly opts in with the boolean True.
if len(notin_db) > 0 and config['direct_store_to_compounds_table'] is True:
    sta.store_in_metatlas_db(notin_db)
sta.check_db_deposit(rt_peaks_filtered_top_selected)

# Identify compounds not in atlases and set up new atlas creation

In [None]:
# Work out which selected standards are absent from the current EMA atlases and
# substitute corrected RT values for them, or reload the saved result of that work.
if config["ema_atlas_data_from_cache"] is True:
    # Load the tuple that was saved under the "ema_atlases" suffix.
    nonmatches_to_atlases_rt_corrected, ema_atlases = sta.handle_data(
        mode="load",
        save_path=config['standards_output_path'],
        file_suffix="ema_atlases",
    )

elif config["ema_atlas_data_from_cache"] is False:
    ema_atlases = sta.get_ema_atlas_data(config["current_ema_atlases"])
    rt_peaks_filtered_top_selected_formatted = sta.convert_rt_peaks_to_atlas_format(
        rt_peaks_filtered_top_selected
    )
    matches_to_atlases, nonmatches_to_atlases = sta.search_for_matches_in_atlases(
        rt_peaks_filtered_top_selected_formatted,
        ema_atlases,
    )

    # The RT-correction inputs/outputs have their own cache flag, independent of the
    # outer one.
    if config["rt_correction_data_from_cache"] is True:
        baseline_to_experimental_qc, baseline_correction_outputs = sta.handle_data(
            mode="load",
            save_path=config['standards_output_path'],
            file_suffix="rt_correction",
        )

    elif config["rt_correction_data_from_cache"] is False:
        baseline_qc, experimental_qc, baseline_to_experimental_qc = sta.get_qc_experimental_atlas(
            nonmatches_to_atlases,
            config["current_qc_atlases"],
            include_istds=True,
        )
        baseline_correction_inputs = sta.create_baseline_correction_input(
            nonmatches_to_atlases,
            baseline_to_experimental_qc,
        )
        baseline_correction_outputs = sta.rt_correction_from_baseline(
            baseline_correction_inputs,
            config["include_chromatographies"],
        )
        sta.handle_data(
            mode="save",
            save_path=config['standards_output_path'],
            timestamp=timestamp,
            data=(baseline_to_experimental_qc, baseline_correction_outputs),
            file_suffix="rt_correction",
        )

    # Apply the correction outputs to the non-matching entries, then save the result
    # together with the atlases under the "ema_atlases" suffix.
    nonmatches_to_atlases_rt_corrected = sta.substitute_corrected_rt_values(
        nonmatches_to_atlases,
        baseline_correction_outputs,
    )
    sta.handle_data(
        mode="save",
        save_path=config['standards_output_path'],
        timestamp=timestamp,
        data=(nonmatches_to_atlases_rt_corrected, ema_atlases),
        file_suffix="ema_atlases",
    )

# Create new EMA atlas with top selected reference standards added

In [None]:
# Runs unconditionally: hand the RT-corrected non-matches and the current atlases to
# the update helper, which returns the id and name assigned to ema_atlas_id/ema_atlas_name.
ema_atlas_id, ema_atlas_name = sta.update_and_save_ema_atlases(
    nonmatches_to_atlases_rt_corrected,
    ema_atlases,
    config,
    timestamp,
)

# Record the returned id/name pair under the "update_atlas_ids" suffix.
sta.handle_data(
    mode="save",
    save_path=config['standards_output_path'],
    timestamp=timestamp,
    data=(ema_atlas_id, ema_atlas_name),
    file_suffix="update_atlas_ids",
)

# Identify compounds not in MSMS refs and set up new MSMS refs creation

In [None]:
# Compare all selected standards against the current MSMS refs, or reload the saved
# comparison.
if config["msms_refs_data_from_cache"] is True:
    # Load the tuple that was saved under the "msms_refs" suffix.
    msms_refs, notin_msms_refs = sta.handle_data(
        mode="load",
        save_path=config['standards_output_path'],
        file_suffix="msms_refs",
    )

elif config["msms_refs_data_from_cache"] is False:
    msms_refs = sta.get_msms_refs(msms_refs_path=config["current_msms_refs_path"])
    rt_peaks_filtered_all_selected_formatted = sta.format_for_msms_refs(
        rt_peaks_filtered_all_selected,
        top_spectra_filtered,
        msms_refs,
        config["msms_refs_metadata"],
    )
    in_msms_refs, notin_msms_refs = sta.search_for_matches_in_msms_refs(
        rt_peaks_filtered_all_selected_formatted,
        msms_refs,
        check_by_flat=True,
    )
    # Only the refs table and the entries not found in it are saved.
    sta.handle_data(
        mode="save",
        save_path=config['standards_output_path'],
        timestamp=timestamp,
        data=(msms_refs, notin_msms_refs),
        file_suffix="msms_refs",
    )

# Create new MSMS refs table with all selected reference standards added

In [None]:
# Pass the existing refs and the entries not found in them to the update helper,
# with save_refs=True, using the configured output path and this session's timestamp.
sta.update_and_save_msms_refs(
    msms_refs,
    notin_msms_refs,
    config["standards_output_path"],
    timestamp,
    save_refs=True,
)