In [None]:
import numpy as np
import pandas as pd
import sys
from datetime import datetime
import os
from IPython.display import display
import yaml

sys.path.insert(1, '/global/homes/b/bkieft/metatlas')
import notebooks.standards_library.standard_annotation as sta

# Allow up to 300 characters per cell when pandas renders text columns.
pd.set_option("display.max_colwidth", 300)

# One timestamp (YYYYmmddHHMMSS) for the whole notebook session; it is passed to
# the sta.handle_data(mode="save", ...) calls in the cells below.
timestamp = f"{datetime.now():%Y%m%d%H%M%S}"

# Read config file

In [None]:
# Path of the YAML file whose keys drive every cell below.
config_path = "/global/homes/b/bkieft/metatlas_junkdrawer/example_data/metasciphenolics/metasci_phenolics_annotation_input.yaml"

# Decode explicitly as UTF-8 so parsing does not depend on the platform's default encoding.
with open(config_path, "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

# safe_load returns None for an empty document and a list/scalar for non-mapping YAML.
# Fail here with a clear message instead of at the first config[...] lookup in a later cell.
if not isinstance(config, dict):
    raise TypeError(
        f"Expected a YAML mapping in {config_path}, got {type(config).__name__}"
    )

# Extract EIC and Spectra information from files in the run table

In [None]:
# Produce the six "full" objects used by the later cells, either by extracting them
# from the run table (and saving them) or by loading them through sta.handle_data.
#
# The flag must be a real boolean. Previously any other value (e.g. an empty YAML
# entry -> None, or a quoted "false") skipped both branches silently, so later cells
# failed with NameError or quietly reused stale variables from an earlier kernel run.
full_from_cache = config["full_data_from_cache"]

if full_from_cache is True:
    (eics_full, top_spectra_full, group_names_full,
     rt_peaks_full, atlas_full, mols_images) = sta.handle_data(mode="load", config=config, file_suffix="full")

elif full_from_cache is False:
    lcmsruns_table_with_adducts = sta.build_standard_lcmsrun_table(config)
    (eics_full, top_spectra_full, group_names_full,
     rt_peaks_full, atlas_full, mols_images) = sta.extract_data(lcmsruns_table_with_adducts, config, method="find_peaks")
    # Persist the fresh extraction under the "full" suffix, tagged with this session's timestamp.
    sta.handle_data(
        mode="save", config=config, timestamp=timestamp, file_suffix="full",
        data=(eics_full, top_spectra_full, group_names_full, rt_peaks_full, atlas_full, mols_images),
    )

else:
    raise TypeError(
        f"config['full_data_from_cache'] must be a boolean (true/false in the YAML), got {full_from_cache!r}"
    )

# Create interactive plot to choose adduct rt peaks for each standard compound

In [None]:
# Either launch the interactive adduct-selection plots or skip them because the
# selections will be loaded in the next cell.
#
# The flag must be a real boolean; any other value used to skip both branches
# without any message, leaving processed_data and the selection dicts undefined.
selected_from_cache = config["selected_data_from_cache"]

if selected_from_cache is True:
    print("Not initiating GUI for adduct selection, loading selected adducts from cache below.")

elif selected_from_cache is False:
    processed_data = sta.process_data_for_plotting(eics_full, top_spectra_full, group_names_full, rt_peaks_full, config)
    # Fresh, empty containers handed to the GUI. Re-running this cell resets them.
    # NOTE(review): presumably the GUI fills these in place as selections are made — confirm.
    selected_adducts_dict = {}
    ambiguous_adducts_dict = {}
    top_adducts_dict = {}
    sta.create_interactive_plots_vscode(processed_data, mols_images, selected_adducts_dict, ambiguous_adducts_dict, top_adducts_dict)
    # Run next cell after manual selection of adducts

else:
    raise TypeError(
        f"config['selected_data_from_cache'] must be a boolean (true/false in the YAML), got {selected_from_cache!r}"
    )

# Generate static summary reports for each compound after selections are made

In [None]:
# After the GUI step: save the selections and write static summary plots, or load
# previously saved selections through sta.handle_data.
#
# The flag must be a real boolean; any other value used to skip both branches
# silently, so later cells saw undefined or stale selection dicts.
selected_from_cache = config["selected_data_from_cache"]

if selected_from_cache is True:
    (selected_adducts_dict, ambiguous_adducts_dict,
     top_adducts_dict, processed_data) = sta.handle_data(mode="load", config=config, file_suffix="selected")

elif selected_from_cache is False:
    # Save first, so the selections are on disk before plotting is attempted.
    sta.handle_data(
        mode="save", config=config, timestamp=timestamp, file_suffix="selected",
        data=(selected_adducts_dict, ambiguous_adducts_dict, top_adducts_dict, processed_data),
    )
    # Save summary plots of selected compounds+adducts after completing GUI
    sta.generate_static_summary_plots(processed_data, selected_adducts_dict, top_adducts_dict, config)

else:
    raise TypeError(
        f"config['selected_data_from_cache'] must be a boolean (true/false in the YAML), got {selected_from_cache!r}"
    )

# Filter RT Peak, EICs, and Top Spectra by the selected compounds+adducts

In [None]:
# Restrict EICs, RT peaks and top spectra to the selected compounds+adducts (and save
# the result), or load the previously saved filtered objects and print their sizes.
#
# The flag must be a real boolean; any other value used to skip both branches
# silently, leaving the *_filtered variables undefined or stale for later cells.
filtered_from_cache = config["filtered_data_from_cache"]

if filtered_from_cache is True:
    (eics_filtered, top_spectra_filtered, rt_peaks_filtered,
     rt_peaks_filtered_all_selected, rt_peaks_filtered_top_selected) = sta.handle_data(mode="load", config=config, file_suffix="filtered")
    print(f"\nTotal unique compounds selected: {eics_filtered['compound_name'].nunique()}")
    print(f"Total unique compound+adduct entries selected: {eics_filtered['label'].nunique()}\n")
    print(f"Total EICs selected: {eics_filtered.shape[0]}")
    print(f"Total RT peaks selected: {rt_peaks_filtered.shape[0]}")
    print(f"Total MS2 spectra selected: {top_spectra_filtered.shape[0]}")

elif filtered_from_cache is False:
    eics_filtered, rt_peaks_filtered, top_spectra_filtered = sta.filter_by_selected(
        eics_full, rt_peaks_full, top_spectra_full, selected_adducts_dict, top_adducts_dict
    )
    # Export "all" for MSMS refs and "top" for EMA atlases
    rt_peaks_filtered_all_selected, rt_peaks_filtered_top_selected = sta.format_and_select_top_adducts(rt_peaks_filtered, top_adducts_dict)
    sta.handle_data(
        mode="save", config=config, timestamp=timestamp, file_suffix="filtered",
        data=(eics_filtered, top_spectra_filtered, rt_peaks_filtered, rt_peaks_filtered_all_selected, rt_peaks_filtered_top_selected),
    )

else:
    raise TypeError(
        f"config['filtered_data_from_cache'] must be a boolean (true/false in the YAML), got {filtered_from_cache!r}"
    )

# Identify compounds not in the metatlas database Compounds table and store if necessary

In [None]:
# Check the selected compounds against the metatlas database (optionally storing the
# missing ones), or load the previously saved result of that check.
#
# The flag must be a real boolean; any other value used to skip both branches
# silently, leaving in_db / notin_db undefined or stale.
metatlas_db_from_cache = config['metatlas_db_data_from_cache']

if metatlas_db_from_cache is True:
    (in_db, notin_db,
     rt_peaks_filtered_all_selected, rt_peaks_filtered_top_selected) = sta.handle_data(mode="load", config=config, file_suffix="metatlas_db")
    print(f"\n'All' peaks dataset (for MSMS refs): {rt_peaks_filtered_all_selected.shape[0]} total compound peaks.")
    print(f"'Top' peaks dataset (for EMA atlases): {rt_peaks_filtered_top_selected.shape[0]} best compound peaks.\n")

elif metatlas_db_from_cache is False:
    # Check if selected compounds are in metatlas DB
    in_db, notin_db = sta.search_for_matches_in_metatlas_db(rt_peaks_filtered_all_selected, check_by_flat=True)
    # Store selected compounds+adducts in metatlas db, only when explicitly enabled
    # and there is something to store.
    if len(notin_db) > 0 and config['direct_store_to_compounds_table'] is True:
        sta.store_in_metatlas_db(notin_db)
    # Runs whether or not anything was stored above.
    sta.check_db_deposit(rt_peaks_filtered_top_selected)
    sta.handle_data(
        mode="save", config=config, timestamp=timestamp, file_suffix="metatlas_db",
        data=(in_db, notin_db, rt_peaks_filtered_all_selected, rt_peaks_filtered_top_selected),
    )

else:
    raise TypeError(
        f"config['metatlas_db_data_from_cache'] must be a boolean (true/false in the YAML), got {metatlas_db_from_cache!r}"
    )

# Identify compounds+adducts not in atlases and set up new atlas creation

In [None]:
# Find top-selected peaks that are not in the current EMA atlases and replace their
# RTs with baseline-corrected values, or load the previously saved result.
#
# Both flags read here must be real booleans. Previously a non-boolean value skipped
# both branches silently, so the code below ran with undefined or stale variables.
ema_atlas_from_cache = config["ema_atlas_data_from_cache"]

if ema_atlas_from_cache is True:
    (nonmatches_to_atlases_rt_corrected, ema_atlases_data,
     baseline_to_experimental_qc, baseline_correction_outputs) = sta.handle_data(mode="load", config=config, file_suffix="ema_atlases")
    # The label promises a per-chromatography breakdown, but the original printed
    # .nunique(), i.e. the number of distinct chromatographies. Print row counts
    # per chromatography instead.
    # NOTE(review): assumes one row per compound in this table — confirm.
    print(f"Total compounds to add to EMA atlases per chromatography: {nonmatches_to_atlases_rt_corrected['chromatography'].value_counts().to_dict()}")

elif ema_atlas_from_cache is False:
    ema_atlases_data = sta.get_ema_atlas_data(config["current_ema_atlases"])
    rt_peaks_filtered_top_selected_formatted = sta.convert_rt_peaks_to_atlas_format(rt_peaks_filtered_top_selected)
    matches_to_atlases, nonmatches_to_atlases = sta.search_for_matches_in_atlases(rt_peaks_filtered_top_selected_formatted, ema_atlases_data)

    rt_correction_from_cache = config["rt_correction_data_from_cache"]

    if rt_correction_from_cache is True:
        baseline_to_experimental_qc, baseline_correction_outputs = sta.handle_data(mode="load", config=config, file_suffix="rt_correction")

    elif rt_correction_from_cache is False:
        print("Setting up RT correction for compounds not yet in atlases using baseline correction method:\n")
        baseline_qc, experimental_qc, baseline_to_experimental_qc = sta.get_qc_experimental_atlas(
            nonmatches_to_atlases, config["current_qc_atlases"], include_istds=True
        )
        baseline_correction_inputs = sta.create_baseline_correction_input(nonmatches_to_atlases, baseline_to_experimental_qc)
        baseline_correction_outputs = sta.rt_correction_from_baseline(baseline_correction_inputs, config)
        sta.handle_data(
            mode="save", config=config, timestamp=timestamp, file_suffix="rt_correction",
            data=(baseline_to_experimental_qc, baseline_correction_outputs),
        )

    else:
        raise TypeError(
            f"config['rt_correction_data_from_cache'] must be a boolean (true/false in the YAML), got {rt_correction_from_cache!r}"
        )

    nonmatches_to_atlases_rt_corrected = sta.substitute_corrected_rt_values(nonmatches_to_atlases, baseline_correction_outputs)
    sta.handle_data(
        mode="save", config=config, timestamp=timestamp, file_suffix="ema_atlases",
        data=(nonmatches_to_atlases_rt_corrected, ema_atlases_data, baseline_to_experimental_qc, baseline_correction_outputs),
    )

else:
    raise TypeError(
        f"config['ema_atlas_data_from_cache'] must be a boolean (true/false in the YAML), got {ema_atlas_from_cache!r}"
    )

# Create new EMA atlas with top selected reference standards added

In [None]:
# Write the updated EMA atlases and show their ids/locations, or just display the
# in-memory table of compounds that would be added.
#
# The flag must be a real boolean; any other value used to make this cell do
# nothing at all, with no message explaining why no atlases were written.
save_new_ema_atlases = config['save_new_ema_atlases']

if save_new_ema_atlases is True:
    ema_atlas_ids, ema_atlas_names = sta.update_and_save_ema_atlases(nonmatches_to_atlases_rt_corrected, ema_atlases_data, config, timestamp)
    sta.handle_data(
        mode="save", config=config, timestamp=timestamp, file_suffix="new_atlas_ids",
        data=(ema_atlas_ids, ema_atlas_names),
    )
    # Immediately read the values back through handle_data, so what is displayed
    # below is what was loaded rather than the in-memory originals.
    ema_atlas_ids, ema_atlas_names = sta.handle_data(mode="load", config=config, file_suffix="new_atlas_ids")

    if config['direct_deposit_new_emas'] is True:
        print("New EMA atlases have been saved to disk and deposited in the metatlas database:")
        display(pd.DataFrame.from_dict(ema_atlas_ids))
    print("\nNew EMA atlas locations:")
    display(pd.DataFrame.from_dict(ema_atlas_names))

elif save_new_ema_atlases is False:
    print("No new EMA atlases saved to disk, as 'save_new_ema_atlases' is set to False in the config file.")
    print("Here is the new atlas data in memory:")
    display(nonmatches_to_atlases_rt_corrected)

else:
    raise TypeError(
        f"config['save_new_ema_atlases'] must be a boolean (true/false in the YAML), got {save_new_ema_atlases!r}"
    )

# Identify compounds not in MSMS refs and set up new MSMS refs creation

In [None]:
# Compare all selected peaks against the current MSMS refs table (and save the
# result), or load the previously saved comparison.
#
# The flag must be a real boolean; any other value used to skip both branches
# silently, leaving msms_refs / notin_msms_refs undefined or stale for the next cell.
msms_refs_from_cache = config["msms_refs_data_from_cache"]

if msms_refs_from_cache is True:
    (msms_refs, notin_msms_refs,
     rt_peaks_filtered_all_selected_formatted) = sta.handle_data(mode="load", config=config, file_suffix="msms_refs")

elif msms_refs_from_cache is False:
    msms_refs = sta.get_msms_refs(msms_refs_path=config["current_msms_refs_path"])
    rt_peaks_filtered_all_selected_formatted = sta.format_for_msms_refs(rt_peaks_filtered_all_selected, top_spectra_filtered, msms_refs, config)
    in_msms_refs, notin_msms_refs = sta.search_for_matches_in_msms_refs(rt_peaks_filtered_all_selected_formatted, msms_refs, check_by_flat=True)
    # in_msms_refs stays in memory only; the saved tuple holds the three objects
    # that the load branch above restores.
    sta.handle_data(
        mode="save", config=config, timestamp=timestamp, file_suffix="msms_refs",
        data=(msms_refs, notin_msms_refs, rt_peaks_filtered_all_selected_formatted),
    )

else:
    raise TypeError(
        f"config['msms_refs_data_from_cache'] must be a boolean (true/false in the YAML), got {msms_refs_from_cache!r}"
    )

# Create new MSMS refs table with all selected reference standards added

In [None]:
# Write the updated MSMS refs table, or just display the entries that would be added.
#
# The flag must be a real boolean; any other value used to make this cell do
# nothing at all, with no message explaining why no refs were written.
save_new_msms_refs = config['save_new_msms_refs']

if save_new_msms_refs is True:
    sta.update_and_save_msms_refs(msms_refs, notin_msms_refs, config, timestamp)

elif save_new_msms_refs is False:
    print("No new MSMS refs saved to disk, as 'save_new_msms_refs' is set to False in the config file.")
    print("Here is the new MSMS refs data in memory:")
    display(notin_msms_refs)

else:
    raise TypeError(
        f"config['save_new_msms_refs'] must be a boolean (true/false in the YAML), got {save_new_msms_refs!r}"
    )