In [10]:
import numpy as np
import pandas as pd
import sys
from datetime import datetime
import os
from IPython.display import display
import yaml

sys.path.insert(1, '/global/homes/b/bkieft/metatlas')
import notebooks.standards_library.standard_annotation as sta

# Let pandas render up to 300 characters per column before truncating.
pd.set_option("display.max_colwidth", 300)
# One stamp (YYYYmmddHHMMSS) taken at setup time; it is passed as `timestamp=`
# to every save call in the cells below.
timestamp = f"{datetime.now():%Y%m%d%H%M%S}"

# Read config file

In [11]:
# Path to the YAML file whose keys are read by the cells below
# (e.g. the *_from_cache and save_* flags).
config_path = "/global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/scheller_annotation_input.yaml"
# Decode explicitly as UTF-8 instead of relying on the platform/locale default,
# so non-ASCII characters in the config are read the same way on every machine.
# safe_load only builds plain Python objects from the YAML.
with open(config_path, "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

# Extract EIC and Spectra information from files in the run table

In [12]:
# Produce the "full" dataset (EICs, top spectra, group names, RT peaks, atlas,
# molecule images) either by extracting it from the runs or by reloading a
# previously saved copy, depending on config["full_data_from_cache"].
if config["full_data_from_cache"] is False:
    # Fresh extraction: build the run table, extract the data, then save it
    # under this run's timestamp.
    lcmsruns_table_with_adducts = sta.build_standard_lcmsrun_table(config)
    eics_full, top_spectra_full, group_names_full, rt_peaks_full, atlas_full, mols_images = sta.extract_data(lcmsruns_table_with_adducts,config,method="find_peaks")
    sta.handle_data(mode="save", config=config, timestamp=timestamp, file_suffix="full", data=(eics_full, top_spectra_full, group_names_full, rt_peaks_full, atlas_full, mols_images))
elif config["full_data_from_cache"] is True:
    # Reload the same six objects, in the same order they were saved.
    eics_full, top_spectra_full, group_names_full, rt_peaks_full, atlas_full, mols_images = sta.handle_data(mode="load", config=config, file_suffix="full")
else:
    # The checks above are identity checks against the two booleans; anything
    # else (None, "true", 0, ...) used to skip this cell silently, leaving the
    # *_full variables unbound or stale from an earlier kernel run.
    raise ValueError(
        "config['full_data_from_cache'] must be a boolean (true/false), "
        f"got {config['full_data_from_cache']!r}"
    )

Loading most recent pkl file: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/cache/20250506190611_ref_stds_full.pkl


# Create interactive plots to choose adduct RT peaks for each standard compound

In [8]:
# Launch the adduct-selection GUI, unless the selections are to be read from
# cache by the next cell.
if config["selected_data_from_cache"] is True:
    print("Not initiating GUI for adduct selection, loading selected adducts from cache below.")

elif config["selected_data_from_cache"] is False:
    processed_data = sta.process_data_for_plotting(
        eics_full,
        top_spectra_full,
        group_names_full,
        rt_peaks_full,
        config,
    )
    # Three separate, initially empty dicts handed to the GUI call; the next
    # cell saves them, so they are presumably filled in by the widget
    # callbacks (confirm in sta).
    selected_adducts_dict, ambiguous_adducts_dict, top_adducts_dict = {}, {}, {}
    sta.create_interactive_plots_vscode(
        processed_data,
        mols_images,
        selected_adducts_dict,
        ambiguous_adducts_dict,
        top_adducts_dict,
    )
    # Only run the next cell once the manual adduct selection is finished.

VBox(children=(VBox(children=(Label(value='', layout=Layout(margin='0 0 0 0')), HBox(children=(VBox(children=(…

Output()

# Generate static summary reports for each compound after selections are made

In [11]:
# Either restore an earlier selection from cache, or persist the selection just
# made in the GUI and write the static summary plots for it.
if config["selected_data_from_cache"] is True:
    (
        selected_adducts_dict,
        ambiguous_adducts_dict,
        top_adducts_dict,
        processed_data,
    ) = sta.handle_data(mode="load", config=config, file_suffix="selected")

elif config["selected_data_from_cache"] is False:
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="selected",
        data=(selected_adducts_dict, ambiguous_adducts_dict, top_adducts_dict, processed_data),
    )
    # Summary plots of the selected compounds+adducts, written after the GUI step is complete.
    sta.generate_static_summary_plots(processed_data, selected_adducts_dict, top_adducts_dict, config)

Saving data to: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/cache/20250507092440_ref_stds_selected.pkl


 Writing summary plots for selected compounds:   0%|          | 0/8 [00:00<?, ? compound group/s]

[09:28:09] non-ring atom 0 marked aromatic
[09:28:09] non-ring atom 0 marked aromatic


# Filter RT peaks, EICs, and top spectra by the selected compounds+adducts

In [12]:
# Narrow the full dataset to the selected compounds+adducts, or reload an
# earlier filtered result from cache.
if config["filtered_data_from_cache"] is True:
    (
        eics_filtered,
        top_spectra_filtered,
        rt_peaks_filtered,
        rt_peaks_filtered_all_selected,
        rt_peaks_filtered_top_selected,
    ) = sta.handle_data(mode="load", config=config, file_suffix="filtered")
    # Summary of what was loaded. These prints only run on the cache path;
    # NOTE(review): the compute path presumably reports the same figures from
    # inside sta - confirm.
    print(f"\nTotal unique compounds selected: {eics_filtered['compound_name'].nunique()}")
    print(f"Total unique compound+adduct entries selected: {eics_filtered['label'].nunique()}\n")
    print(f"Total EICs selected: {eics_filtered.shape[0]}")
    print(f"Total RT peaks selected: {rt_peaks_filtered.shape[0]}")
    print(f"Total MS2 spectra selected: {top_spectra_filtered.shape[0]}")

elif config["filtered_data_from_cache"] is False:
    eics_filtered, rt_peaks_filtered, top_spectra_filtered = sta.filter_by_selected(
        eics_full,
        rt_peaks_full,
        top_spectra_full,
        selected_adducts_dict,
        top_adducts_dict,
    )
    # "all" is exported for the MSMS refs, "top" for the EMA atlases.
    (
        rt_peaks_filtered_all_selected,
        rt_peaks_filtered_top_selected,
    ) = sta.format_and_select_top_adducts(rt_peaks_filtered, top_adducts_dict)
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="filtered",
        data=(eics_filtered, top_spectra_filtered, rt_peaks_filtered, rt_peaks_filtered_all_selected, rt_peaks_filtered_top_selected),
    )


Total unique compounds selected: 1
Total unique compound+adduct entries selected: 10

Total EICs selected: 31
Total RT peaks selected: 12
Total MS2 spectra selected: 12

Working on dataset: top
	Checking for differing RTs between CEs and polarities, which are unexpected...
		Group ('C18', 'sorgoleone'): All RT values for ['102040norm' '205060norm'] and ['NEG' 'POS'] are within 0.05 mins of each other (0.0057).
		Group ('HILICZ', 'sorgoleone'): All RT values for ['102040norm' '205060norm'] and ['NEG' 'POS'] are within 0.05 mins of each other (0.0107).

	Grouping by monoisotopic_mass and identify isomers in the datasets...
		No isomers found in top data.

	Selecting best collision energy row by intensity for the top adduct(s) per compound...
		Selected 1 row and removed 1 row(s) for ('C18', 'NEG', 'sorgoleone', '[M-H]-').
		Selected 1 row and removed 1 row(s) for ('C18', 'POS', 'sorgoleone', '[M+H]+').
		Selected 1 row and removed 1 row(s) for ('C18', 'POS', 'sorgoleone', '[M+Na]+').
		

# Identify compounds not in the metatlas database Compounds table and store if necessary

In [13]:
# Check the selected compounds against the metatlas database (optionally
# storing the missing ones), or reload the result of an earlier check.
if config['metatlas_db_data_from_cache'] is True:
    (
        in_db,
        notin_db,
        rt_peaks_filtered_all_selected,
        rt_peaks_filtered_top_selected,
    ) = sta.handle_data(mode="load", config=config, file_suffix="metatlas_db")
    print(f"\n'All' peaks dataset (for MSMS refs): {rt_peaks_filtered_all_selected.shape[0]} total compound peaks.")
    print(f"'Top' peaks dataset (for EMA atlases): {rt_peaks_filtered_top_selected.shape[0]} best compound peaks.\n")

elif config['metatlas_db_data_from_cache'] is False:
    # Split the selected compounds into those found / not found in the metatlas DB.
    in_db, notin_db = sta.search_for_matches_in_metatlas_db(
        rt_peaks_filtered_all_selected,
        check_by_flat=True,
    )
    # Store only when something is missing AND direct storing is enabled; the
    # config flag is not read at all when nothing is missing.
    if len(notin_db) > 0:
        if config['direct_store_to_compounds_table'] is True:
            sta.store_in_metatlas_db(notin_db)
    sta.check_db_deposit(rt_peaks_filtered_top_selected)
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="metatlas_db",
        data=(in_db, notin_db, rt_peaks_filtered_all_selected, rt_peaks_filtered_top_selected),
    )

 Searching for matches in metatlas db:   0%|          | 0/12 [00:00<?, ? compound/s]


Summary of compounds already in the metatlas database:



Unnamed: 0,query_label,query_matching_criterion,query_to_db,db_match
0,sorgoleone,inchi_key,FGWRUVXUQWGLOX-AFJQJTPPSA-N,[FGWRUVXUQWGLOX-AFJQJTPPSA-N]



All compounds are already in the metatlas database.

All new entries found in the database.

Saving data to: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/cache/20250507092440_ref_stds_metatlas_db.pkl


# Identify compounds+adducts not in atlases and set up new atlas creation

In [14]:
# Find the top selected compounds+adducts that are missing from the current EMA
# atlases and give them RT-corrected values, or reload a finished result from
# cache. Two independent flags are involved:
#   ema_atlas_data_from_cache     - reload the whole result of this cell
#   rt_correction_data_from_cache - reload only the RT-correction step
if config["ema_atlas_data_from_cache"] is False:
    ema_atlases_data = sta.get_ema_atlas_data(config["current_ema_atlases"])
    rt_peaks_filtered_top_selected_formatted = sta.convert_rt_peaks_to_atlas_format(rt_peaks_filtered_top_selected)
    matches_to_atlases, nonmatches_to_atlases = sta.search_for_matches_in_atlases(rt_peaks_filtered_top_selected_formatted, ema_atlases_data)

    if config["rt_correction_data_from_cache"] is False:
        print("Setting up RT correction for compounds not yet in atlases using baseline correction method:\n")
        baseline_qc, experimental_qc, baseline_to_experimental_qc = sta.get_qc_experimental_atlas(nonmatches_to_atlases, config["current_qc_atlases"], include_istds=True)
        baseline_correction_inputs = sta.create_baseline_correction_input(nonmatches_to_atlases, baseline_to_experimental_qc)
        baseline_correction_outputs = sta.rt_correction_from_baseline(baseline_correction_inputs, config)
        sta.handle_data(
            mode="save",
            config=config,
            timestamp=timestamp,
            file_suffix="rt_correction",
            data=(baseline_to_experimental_qc, baseline_correction_outputs),
        )

    elif config["rt_correction_data_from_cache"] is True:
        baseline_to_experimental_qc, baseline_correction_outputs = sta.handle_data(mode="load",config=config, file_suffix="rt_correction")

    else:
        # Without this, a non-boolean flag fell through and the substitution
        # below failed on an unbound (or stale) baseline_correction_outputs.
        raise ValueError(
            "config['rt_correction_data_from_cache'] must be a boolean (true/false), "
            f"got {config['rt_correction_data_from_cache']!r}"
        )

    nonmatches_to_atlases_rt_corrected = sta.substitute_corrected_rt_values(nonmatches_to_atlases, baseline_correction_outputs)
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="ema_atlases",
        data=(nonmatches_to_atlases_rt_corrected, ema_atlases_data, baseline_to_experimental_qc, baseline_correction_outputs),
    )

elif config["ema_atlas_data_from_cache"] is True:
    nonmatches_to_atlases_rt_corrected, ema_atlases_data, baseline_to_experimental_qc, baseline_correction_outputs = sta.handle_data(mode="load", config=config, file_suffix="ema_atlases")
    # Report the number of rows for each chromatography value. The earlier
    # .nunique() only gave the number of distinct chromatographies, which is
    # not what the message describes.
    # NOTE(review): assumes one row per compound+adduct to add - confirm.
    print(f"Total compounds to add to EMA atlases per chromatography: {nonmatches_to_atlases_rt_corrected['chromatography'].value_counts().to_dict()}")

else:
    # The next cell needs nonmatches_to_atlases_rt_corrected in both of its
    # branches, so silently skipping this cell is never a usable outcome.
    raise ValueError(
        "config['ema_atlas_data_from_cache'] must be a boolean (true/false), "
        f"got {config['ema_atlas_data_from_cache']!r}"
    )

 Searching in HILICZ positive atlas:   0%|          | 0/1 [00:00<?, ? compound/s]

 Searching in HILICZ negative atlas:   0%|          | 0/1 [00:00<?, ? compound/s]

 Searching in C18 positive atlas:   0%|          | 0/2 [00:00<?, ? compound/s]

 Searching in C18 negative atlas:   0%|          | 0/1 [00:00<?, ? compound/s]


None of the compounds+adducts searched were found in the atlases.

There are 5 compounds+adducts are not yet in any atlases. View with 'nonmatches_to_atlases'.

Setting up RT correction for compounds not yet in atlases using baseline correction method:

	Getting all QC files for project /global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558

	Retrieving baseline HILICZ QC atlas: /global/homes/b/bkieft/metatlas-data/HILIC/HILIC_QCv7_positive.tsv

	Collecting QC MS1 data for HILICZ...



 Collecting MS1 data for QC compounds:   0%|          | 0/52 [00:00<?, ? file/s]

	Getting all QC files for project /global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782

	Retrieving baseline C18 QC atlas: /global/homes/b/bkieft/metatlas-data/C18/C18_QCv7_positive.tsv

	Collecting QC MS1 data for C18...



 Collecting MS1 data for QC compounds:   0%|          | 0/53 [00:00<?, ? file/s]

	Performing RT correction...



Calculating RT correction model:   0%|          | 0/2 [00:00<?, ? chromatography/s]

	HILICZ RT correction results:


Unnamed: 0,label,adduct,polarity,rt_peak_baseline,rt_peak_experimental,rt_peak_corrected,rt_min_corrected,rt_max_corrected,rt_diff_experimental_vs_corrected
0,sorgoleone,[M+H]+,positive,,0.801184,0.645454,0.145454,1.145454,0.15573
1,sorgoleone,[M-H]-,negative,,0.804263,0.648787,0.148787,1.148787,0.155476
2,ABMBA (unlabeled),,QC,1.093806,1.21834,1.096205,0.596205,1.596205,0.122135
3,N-acetyl-glucosamine (U - 13C),,QC,6.707815,6.418233,6.580829,6.080829,7.080829,-0.162596
4,adenine (U - 15N),,QC,2.677602,2.834389,2.827321,2.327321,3.327321,0.007068
5,"alanine (U - 13C, 15N)",,QC,13.405091,13.240681,13.400469,12.900469,13.900469,-0.159788
6,"arginine (U - 13C, 15N)",,QC,16.939915,17.013107,16.987934,16.487934,17.487934,0.025173
7,"asparagine (U - 13C, 15N)",,QC,14.368089,14.252848,14.37583,13.87583,14.87583,-0.122982
8,"aspartic acid (U - 13C, 15N)",,QC,16.13036,16.043652,16.07848,15.57848,16.57848,-0.034828
9,"cystine (U - 13C, 15N)",,QC,16.904308,16.930342,16.910628,16.410628,17.410628,0.019714


	C18 RT correction results:


Unnamed: 0,label,adduct,polarity,rt_peak_baseline,rt_peak_experimental,rt_peak_corrected,rt_min_corrected,rt_max_corrected,rt_diff_experimental_vs_corrected
0,sorgoleone,[M+H]+,positive,,7.543334,7.438575,6.938575,7.938575,0.104759
1,sorgoleone,[M+Na]+,positive,,7.544277,7.439548,6.939548,7.939548,0.104729
2,sorgoleone,[M-H]-,negative,,7.547524,7.442898,6.942898,7.942898,0.104627
3,9-cis-retinoic acid,,QC,7.372778,7.322109,7.210608,6.710608,7.710608,0.111501
4,ABMBA (unlabeled),,QC,4.7,4.892107,4.73861,4.23861,5.23861,0.153496
5,caffeine,,QC,2.652537,2.833037,2.689957,2.189957,3.189957,0.14308
6,enoxolone,,QC,6.605928,6.792058,6.666384,6.166384,7.166384,0.125674
7,inosine (U - 15N),,QC,1.02,1.13626,1.03348,0.53348,1.53348,0.10278
8,nigericin,,QC,8.559813,8.749709,8.690287,8.190287,9.190287,0.059422
9,"phenylalanine (U - 13C, 15N)",,QC,1.452954,1.481643,1.368337,0.868337,1.868337,0.113306


Saving data to: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/cache/20250507092440_ref_stds_rt_correction.pkl
Formatted 5 RT-corrected compounds for insertion into HILICZ atlases.
Formatted 5 RT-corrected compounds for insertion into C18 atlases.
Saving data to: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/cache/20250507092440_ref_stds_ema_atlases.pkl


# Create new EMA atlas with top selected reference standards added

In [15]:
# Write updated EMA atlases that include the RT-corrected new entries, or just
# show the pending entries when saving is disabled in the config.
if config['save_new_ema_atlases'] is True:
    ema_atlas_ids, ema_atlas_names = sta.update_and_save_ema_atlases(nonmatches_to_atlases_rt_corrected, ema_atlases_data, config, timestamp)
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="new_atlas_ids",
        data=(ema_atlas_ids, ema_atlas_names),
    )
    # NOTE(review): this immediately reloads what was just saved and rebinds the
    # same two names; kept as-is in case it is meant as a round-trip check -
    # confirm, otherwise it can be dropped.
    ema_atlas_ids, ema_atlas_names = sta.handle_data(mode="load", config=config, file_suffix="new_atlas_ids")

    # The atlas IDs are only shown when direct deposit is enabled.
    if config['direct_deposit_new_emas'] is True:
        print("New EMA atlases have been saved to disk and deposited in the metatlas database:")
        display(pd.DataFrame.from_dict(ema_atlas_ids))
    # Plain string: the original f-prefix had no placeholders to fill.
    print("\nNew EMA atlas locations:")
    display(pd.DataFrame.from_dict(ema_atlas_names))

elif config['save_new_ema_atlases'] is False:
    print("No new EMA atlases saved to disk, as 'save_new_ema_atlases' is set to False in the config file.")
    print("Here is the new atlas data in memory:")
    display(nonmatches_to_atlases_rt_corrected)


Current HILICZ positive EMA atlas: HILIC_EMA-standards_positive.tsv
373 current compounds updated with 1 new compounds for a total of 374 compounds.
Updated HILICZ positive EMA atlas saved to: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/updated_EMA_atlases/HILIC_EMA-standards_positive_20250507092440.tsv


Current HILICZ negative EMA atlas: HILIC_EMA-standards_negative.tsv
418 current compounds updated with 1 new compounds for a total of 419 compounds.
Updated HILICZ negative EMA atlas saved to: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/updated_EMA_atlases/HILIC_EMA-standards_negative_20250507092440.tsv


Current C18 positive EMA atlas: C18_EMA-standards_positive.tsv
3791 current compounds updated with 2 new compounds for a total of 3793 compounds.
Updated C18 positive EMA atlas saved to: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/updated_EMA_atlases/C18_EMA-standards_positive_20250507092440.tsv


Cu

Unnamed: 0,HILICZ,C18
positive,/global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/updated_EMA_atlases/HILIC_EMA-standards_positive_20250507092440.tsv,/global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/updated_EMA_atlases/C18_EMA-standards_positive_20250507092440.tsv
negative,/global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/updated_EMA_atlases/HILIC_EMA-standards_negative_20250507092440.tsv,/global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/updated_EMA_atlases/C18_EMA-standards_negative_20250507092440.tsv


# Identify compounds not in MSMS refs and set up new MSMS refs creation

In [16]:
# Compare all selected compounds+adducts against the current MSMS refs table,
# or reload the result of an earlier comparison from cache.
if config["msms_refs_data_from_cache"] is False:
    msms_refs = sta.get_msms_refs(msms_refs_path=config["current_msms_refs_path"])
    rt_peaks_filtered_all_selected_formatted = sta.format_for_msms_refs(rt_peaks_filtered_all_selected, top_spectra_filtered, msms_refs, config)
    in_msms_refs, notin_msms_refs = sta.search_for_matches_in_msms_refs(rt_peaks_filtered_all_selected_formatted, msms_refs, check_by_flat=True)
    # Note: in_msms_refs is not part of the saved tuple, so it is only
    # available in the kernel after this (non-cached) branch has run.
    sta.handle_data(
        mode="save",
        config=config,
        timestamp=timestamp,
        file_suffix="msms_refs",
        data=(msms_refs, notin_msms_refs, rt_peaks_filtered_all_selected_formatted),
    )

elif config["msms_refs_data_from_cache"] is True:
    msms_refs, notin_msms_refs, rt_peaks_filtered_all_selected_formatted = sta.handle_data(mode="load", config=config, file_suffix="msms_refs")

else:
    # The next cell uses notin_msms_refs in both of its branches; a non-boolean
    # flag used to skip this cell silently and leave it unbound or stale.
    raise ValueError(
        "config['msms_refs_data_from_cache'] must be a boolean (true/false), "
        f"got {config['msms_refs_data_from_cache']!r}"
    )

Loaded MSMS refs with 216409 rows and 17 columns.


 Searching for matches in MSMS refs:   0%|          | 0/12 [00:00<?, ? compound/s]


12 compounds+adducts are not yet in MSMS refs. Check notin_msms_refs to view.

Saving data to: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/cache/20250507092440_ref_stds_msms_refs.pkl


# Create new MSMS refs table with all selected reference standards added

In [17]:
# Write the updated MSMS refs table, or only preview the pending entries when
# saving is disabled in the config.
if config['save_new_msms_refs'] is False:
    print("No new MSMS refs saved to disk, as 'save_new_msms_refs' is set to False in the config file.")
    print("Here is the new MSMS refs data in memory:")
    display(notin_msms_refs)

elif config['save_new_msms_refs'] is True:
    sta.update_and_save_msms_refs(
        msms_refs,
        notin_msms_refs,
        config,
        timestamp,
    )

Existing MSMS refs went from 216409 to 216421 compounds.
	New MSMS refs: /global/homes/b/bkieft/metatlas_junkdrawer/example_data/metascischeller/updated_MSMS_refs/msms_refs_20250507092440.tab
