In [1]:
import numpy as np
import pandas as pd
import sys
from datetime import datetime
import os
from IPython.display import display

# Make the local metatlas checkout importable; inserting at index 1 leaves sys.path[0] in first place.
sys.path.insert(1, '/global/homes/b/bkieft/metatlas')
import notebooks.standards_library.standard_annotation as sta
# Raise the pandas column display width so long cell values are shown up to 300 characters.
pd.options.display.max_colwidth = 300

# Timestamp (YYYYmmddHHMMSS) captured once here and passed to the save_*/update_* helpers below.
current_time = datetime.now().strftime("%Y%m%d%H%M%S")

# Enter Reference Standard information

In [2]:
# --- Search settings -------------------------------------------------------
ppm_tolerance = 5
include_polarities = ['POS', 'NEG']
include_chromatographies = ['C18', 'HILIC']  # 'C18' and/or 'HILIC'
include_adducts = (
    # positive-mode adducts
    ['[M+H]+', '[M+Na]+', '[M-H2O+H]+', '[M+K]+', '[M+NH4]+', '[M]+', '[M+2H]2+']
    # negative-mode adducts
    + ['[M-H]-', '[M+Cl]-', '[M]-', '[M-2H]2-']
)

# --- Reference standards input table ---------------------------------------
path_to_standards_file = '/global/homes/b/bkieft/metabolomics_standards_annotation/standard_library_annotation'
standards_file = 'scheller_test.csv'  # scheller_test.csv   metasci_flav_annotation_input.csv
standards_info_path = f'{path_to_standards_file}/{standards_file}'
input_compounds = pd.read_csv(standards_info_path)

# --- Current atlases and MSMS refs ------------------------------------------
# EMA atlases are keyed by chromatography, then polarity.
current_ema_atlases = {
    "hilicz": {
        "pos": '/global/homes/b/bkieft/metatlas-data/HILIC/HILIC_EMA-standards_positive.tsv',
        "neg": '/global/homes/b/bkieft/metatlas-data/HILIC/HILIC_EMA-standards_negative.tsv',
    },
    "c18": {
        "pos": '/global/homes/b/bkieft/metatlas-data/C18/C18_EMA-standards_positive.tsv',
        "neg": '/global/homes/b/bkieft/metatlas-data/C18/C18_EMA-standards_negative.tsv',
    },
}
# QC atlases are keyed by chromatography only (positive-mode files).
current_qc_atlases = {
    "hilicz": '/global/homes/b/bkieft/metatlas-data/HILIC/HILIC_QCv7_positive.tsv',  # 0e0a13a57c434b258bf8ab33ab357961
    "c18": '/global/homes/b/bkieft/metatlas-data/C18/C18_QCv7_positive.tsv',  # 20459fe5e4e14ab4a22168027b2bacda
}

current_msms_refs_path = '/global/cfs/cdirs/metatlas/projects/spectral_libraries/20240430_istdv7-addition_msms_refs.tab'

# --- Stage switches ----------------------------------------------------------
# Each flag is tested with `is True` / `is False` in the cells below:
# True takes the compute-and-save branch, False takes the load branch where one exists.
new_full_data = False
new_selected_data = True
new_filtered_data = True
new_rt_correction_data = True

# Get EIC and Spectra information from run table

In [3]:
# Re-extract everything from the run files only when a fresh full dataset is requested.
if new_full_data is True:
    # Run table for the standards, restricted to the configured polarities and chromatographies.
    standard_lcmsruns_table = sta.build_standard_lcmsrun_table(
        standards_info_path,
        include_polarities=include_polarities,
        include_chromatographies=include_chromatographies,
    )
    # Same table annotated with the configured adducts.
    standard_lcmsruns_table_with_adducts = sta.build_adduct_annotated_table(
        standard_lcmsruns_table,
        include_adducts=include_adducts,
    )
    # EICs, top spectra, group names, RT peaks and atlas for every run+adduct.
    eics_full, top_spectra_full, group_names_full, rt_peaks_full, atlas_full = sta.extract_data(
        lcmsruns_table=standard_lcmsruns_table_with_adducts,
        ppm_tolerance=ppm_tolerance,
        method="find_peaks",
    )
    # Structure images handed to the interactive plots further down.
    runnum_to_structure_image_grid = sta.generate_gridded_molecular_images(
        standard_lcmsruns_table_with_adducts
    )

# Save and/or read full data

In [4]:
# Save the freshly extracted full dataset, or load a previously saved one.
# Both checks are strict identity tests, so a non-bool flag value runs neither branch.
if new_full_data is True:
    sta.save_full_data(eics_full, top_spectra_full, group_names_full, rt_peaks_full, atlas_full, runnum_to_structure_image_grid, standards_info_path, current_time)
    
elif new_full_data is False:
    # Rebinds the same six names that the extraction cell would have produced.
    eics_full, top_spectra_full, group_names_full, rt_peaks_full, atlas_full, runnum_to_structure_image_grid = sta.load_full_data(standards_info_path)

Loading most recent pkl file: /global/homes/b/bkieft/metabolomics_standards_annotation/standard_library_annotation/scheller_test_20250422133530_ref_stds_data_full.pkl


# Create interactive plot and choose adducts

In [5]:
# Build the plotting inputs in both modes: the static summary-plot cell further down reads
# `processed_data` whether the selections are new or loaded from disk, so defining it only
# inside the `new_selected_data` branch leaves it undefined on a fresh kernel in load mode.
processed_data = sta.process_data_for_plotting(eics_full, top_spectra_full, group_names_full, rt_peaks_full, include_adducts)

# Compound to review in this session; set to None to keep every compound.
compound_name_filter = "sorgoleone"
if compound_name_filter is not None:
    processed_data = [
        entry for entry in processed_data
        if entry.get('compound_name') == compound_name_filter
    ]

if new_selected_data is True:
    # The GUI fills these three dicts in place as adducts are ticked.
    # NOTE(review): `top_adducts` is not passed to the save/load cell below, so it only exists
    # after an interactive session in this kernel — confirm whether it should be persisted too.
    selected_good_adducts = {}
    ambiguous_adducts = {}
    top_adducts = {}
    sta.create_interactive_plots(processed_data, runnum_to_structure_image_grid, selected_good_adducts, ambiguous_adducts, top_adducts)

VBox(children=(HBox(children=(VBox(children=(Label(value='Select all good adducts:'), Checkbox(value=False, de…

Output()

# Save and/or read selected data

In [9]:
# Save the selections made in the GUI, or load previously saved ones.
# Only `selected_good_adducts` and `ambiguous_adducts` go through save/load here;
# `top_adducts` is not part of either call.
if new_selected_data is True:
    sta.save_selected_data(selected_good_adducts, ambiguous_adducts, standards_info_path, current_time)
    
elif new_selected_data is False:
    selected_good_adducts, ambiguous_adducts = sta.load_selected_data(standards_info_path)

Saving data to: /global/homes/b/bkieft/metabolomics_standards_annotation/standard_library_annotation/scheller_test_20250425163322_ref_stds_data_selected.pkl


# Preview compounds+adducts selections

In [10]:
# Turn the two selection dicts into tables; `selected_compounds_table` is reused by the filter cell below.
selected_compounds_table = sta.extract_selected_compounds(selected_good_adducts)
ambiguous_compounds_table = sta.extract_ambiguous_compounds(ambiguous_adducts)

# Show both tables with rich display, each under a plain-text heading.
print("Selected compounds:\n")
display(selected_compounds_table)
print("\nAmbiguous compounds (to return to for scrutiny):\n")
display(ambiguous_compounds_table)

Selected compounds:



Unnamed: 0,compound_name,standard_lcmsrun,selected_adducts,selected_peak_indices
0,sorgoleone,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_POS_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE102040norm-200uM-S1_Run375.h5,"[[M+H]+, [M+NH4]+]","[peak1, peak1]"
1,sorgoleone,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_POS_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE205060norm-200uM-S1_Run376.h5,"[[M+H]+, [M+NH4]+]","[peak1, peak1]"
2,sorgoleone,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_NEG_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE102040norm-200uM-S1_Run377.h5,[[M-H]-],[peak1]
3,sorgoleone,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_NEG_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE205060norm-200uM-S1_Run378.h5,[[M-H]-],[peak1]
4,sorgoleone,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE102040norm-200uM-S1_Run382.h5,"[[M+Na]+, [M+H]+]","[peak1, peak1]"
5,sorgoleone,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE205060norm-200uM-S1_Run383.h5,"[[M+Na]+, [M+H]+]","[peak1, peak1]"



Ambiguous compounds (to return to for scrutiny):



Unnamed: 0,compound_name,standard_lcmsrun
0,sorgoleone,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_NEG_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE102040norm-200uM-S1_Run384.h5
1,sorgoleone,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_NEG_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE205060norm-200uM-S1_Run385.h5


# Filter RT Peak, EICs, and Top Spectra by selected compounds+adducts

In [11]:
# Narrow the full EIC / RT-peak / spectra tables down to the compounds+adducts picked in the GUI.
eics_filtered, rt_peaks_filtered, top_spectra_filtered = sta.filter_by_selected(
    eics_full,
    rt_peaks_full,
    top_spectra_full,
    selected_compounds_table,
)

# Report how much survived the filter.
print(f"\nTotal unique compounds retained: {eics_filtered['compound_name'].nunique()}")
print(f"Total unique compound+adduct entries retained: {eics_filtered['label'].nunique()}\n")
print(f"Total EICs selected: {len(eics_filtered)}")
print(f"Total RT peaks selected: {len(rt_peaks_filtered)}")
print(f"Total MS2 spectra selected: {len(top_spectra_filtered)}")

[16:34:27] non-ring atom 0 marked aromatic
[16:34:27] non-ring atom 0 marked aromatic



Total unique compounds retained: 1
Total unique compound+adduct entries retained: 8

Total EICs selected: 25
Total RT peaks selected: 10
Total MS2 spectra selected: 10


# Save and/or read filtered data

In [12]:
# Save the filtered tables, or load previously saved ones.
# Note the argument/return order here (eics, top_spectra, rt_peaks) differs from the
# order returned by `sta.filter_by_selected` (eics, rt_peaks, top_spectra).
if new_filtered_data is True:
    sta.save_filtered_data(eics_filtered, top_spectra_filtered, rt_peaks_filtered, standards_info_path, current_time)
    
elif new_filtered_data is False:
    eics_filtered, top_spectra_filtered, rt_peaks_filtered = sta.load_filtered_data(standards_info_path)

Saving data to: /global/homes/b/bkieft/metabolomics_standards_annotation/standard_library_annotation/scheller_test_20250425163322_ref_stds_data_filtered.pkl


# Save summary plots of selected compounds+adducts

In [13]:
# Generate static summary plots for the selected adducts, using the standards directory as export_dir.
# Needs `processed_data` from the interactive-plot cell.
sta.generate_static_summary_plots(processed_data, selected_good_adducts, export_dir=path_to_standards_file)

# Choose only the top adduct per compound to add to DB, Atlases, and MSMS Refs

In [None]:
def filter_by_selected_top_adduct(rt_peaks, top_adducts):
    """Keep only the RT peaks whose adduct was selected as the top adduct for their run.

    Parameters
    ----------
    rt_peaks : pandas.DataFrame
        One row per peak. Columns read here: 'standard_lcmsrun', 'compound_name', 'adduct',
        'peak_index', 'chromatography', 'polarity', 'collision_energy', 'monoisotopic_mass',
        'inchi' and 'inchi_key'. The input frame is copied, not modified.
    top_adducts : dict
        Maps '<compound_name>;;<standard_lcmsrun>' to the adduct string chosen for that
        compound in that run. Entries with an empty value are skipped.

    Returns
    -------
    pandas.DataFrame or None
        The retained peaks with an added 'label' column (the compound name, suffixed with
        ' (<peak_index>)' when several peaks share chromatography, polarity, compound,
        adduct and collision energy). None when nothing was selected/matched, or when
        isomers (same monoisotopic mass, polarity and chromatography) were given different
        adducts; a message explaining what to do is printed in both cases.

    Raises
    ------
    ValueError
        If a key of ``top_adducts`` does not contain the ';;' separator.
    """
    all_peaks = rt_peaks.copy()
    all_peaks['label'] = all_peaks['compound_name']

    # Unpack the '<compound>;;<run>' keys into one (compound, run, adduct) record per choice.
    selection_keys = ['label', 'standard_lcmsrun', 'adduct']
    selection_records = []
    for key, adduct in (top_adducts or {}).items():
        compound_name, separator, standard_lcmsrun = key.partition(';;')
        if not separator:
            raise ValueError(
                f"top_adducts key {key!r} is not of the form '<compound_name>;;<standard_lcmsrun>'"
            )
        if not adduct:
            continue  # no adduct chosen for this run
        selection_records.append((compound_name, standard_lcmsrun, adduct))

    if selection_records:
        selection = pd.DataFrame(selection_records, columns=selection_keys).drop_duplicates()
        # Inner merge keeps every peak of the chosen adduct in each run, in the original row order.
        top_adduct_peaks = all_peaks.merge(selection, on=selection_keys, how='inner')
    else:
        top_adduct_peaks = all_peaks.iloc[0:0]

    if top_adduct_peaks.empty:
        print("No peaks matched a selected top adduct.")
        print("\nPlease return to the GUI to select a top adduct for each compound.")
        return None

    # Several peaks for the same chrom+polarity+compound+adduct+collision_energy would share a
    # label, so disambiguate them with their peak index.
    peak_group_keys = ['chromatography', 'polarity', 'label', 'adduct', 'collision_energy']
    has_sibling_peak = top_adduct_peaks.duplicated(subset=peak_group_keys, keep=False)
    top_adduct_peaks.loc[has_sibling_peak, 'label'] = (
        top_adduct_peaks.loc[has_sibling_peak, 'label']
        + ' ('
        + top_adduct_peaks.loc[has_sibling_peak, 'peak_index'].astype(str)
        + ')'
    )

    # Isomers: more than one compound at the same monoisotopic mass within a polarity and
    # chromatography. They are required to carry the same selected adduct.
    isomer_keys = ['monoisotopic_mass', 'polarity', 'chromatography']
    compounds_per_mass = top_adduct_peaks.groupby(isomer_keys)['compound_name'].nunique()
    for mass, polarity, chromatography in compounds_per_mass[compounds_per_mass > 1].index:
        isomer_data = top_adduct_peaks[
            (top_adduct_peaks['monoisotopic_mass'] == mass) &
            (top_adduct_peaks['polarity'] == polarity) &
            (top_adduct_peaks['chromatography'] == chromatography)
        ]
        unique_adducts = isomer_data['adduct'].unique()
        if len(unique_adducts) == 1:
            # Isomers agree on the adduct: report and carry on.
            print(f"Note: Found isomers in {chromatography} {polarity} mode at {mass} ({list(isomer_data['label'])}) but they had matching selected adducts {unique_adducts[0]}.")
            continue
        # Isomers disagree: show the conflict and stop so the selection can be corrected.
        print(f"Warning! Adducts for isomers do not agree. See data for monoisotopic mass {mass}:\n")
        display(isomer_data[['label', 'adduct', 'inchi', 'monoisotopic_mass']])
        print("\nPlease return to the GUI to select a matching adduct for isomers.")
        return None

    print(f"\nFiltered {all_peaks.shape[0]} compound peaks to {top_adduct_peaks.shape[0]} peaks by best adduct. Here are the compounds+adducts retained:\n")
    display(top_adduct_peaks[['label', 'adduct', 'polarity', 'chromatography', 'inchi_key', 'monoisotopic_mass']].sort_values(by=['label', 'adduct']))

    return top_adduct_peaks

In [15]:
# Inspect the per-run top-adduct choices; keys have the form '<compound_name>;;<standard_lcmsrun>'.
top_adducts

{'sorgoleone;;/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_POS_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE102040norm-200uM-S1_Run375.h5': '[M+H]+',
 'sorgoleone;;/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_POS_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE205060norm-200uM-S1_Run376.h5': '[M+H]+',
 'sorgoleone;;/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_NEG_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE102040norm-200uM-S1_Run377.h5': '[M-H]-',
 'sorgoleone;;/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_NEG_MS2_R

In [18]:
# Inspect the filtered RT peaks table (the whole frame is displayed).
rt_peaks_filtered

Unnamed: 0,standard_lcmsrun,chromatography,compound_name,adduct,polarity,rt_peak,intensity,mz_observed,mz_theoretical,ppm_error,smiles,peak_index,inchi,inchi_key,neutralized_inchi,neutralized_inchi_key,permanent_charge,formula,monoisotopic_mass,collision_energy
0,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_POS_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE102040norm-200uM-S1_Run375.h5,HILICZ,sorgoleone,[M+H]+,POS,0.801184,238290800.0,359.221771,359.221685,-0.23885,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N,0,C22H30O4,358.214409,102040norm
1,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_POS_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE102040norm-200uM-S1_Run375.h5,HILICZ,sorgoleone,[M+NH4]+,POS,0.801184,76450020.0,376.24823,376.248232,0.006537,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N,0,C22H30O4,358.214409,102040norm
2,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_POS_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE205060norm-200uM-S1_Run376.h5,HILICZ,sorgoleone,[M+H]+,POS,0.793515,220015500.0,359.221771,359.221685,-0.23885,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N,0,C22H30O4,358.214409,205060norm
3,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_POS_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE205060norm-200uM-S1_Run376.h5,HILICZ,sorgoleone,[M+NH4]+,POS,0.793515,74330640.0,376.24823,376.248232,0.006537,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N,0,C22H30O4,358.214409,205060norm
4,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_NEG_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE102040norm-200uM-S1_Run377.h5,HILICZ,sorgoleone,[M-H]-,NEG,0.804263,1073812000.0,357.207184,357.207133,-0.141089,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N,0,C22H30O4,358.214409,102040norm
5,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_NEG_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE205060norm-200uM-S1_Run378.h5,HILICZ,sorgoleone,[M-H]-,NEG,0.803481,1008106000.0,357.207153,357.207133,-0.055655,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N,0,C22H30O4,358.214409,205060norm
6,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE102040norm-200uM-S1_Run382.h5,C18,sorgoleone,[M+H]+,POS,7.543334,85129120.0,359.222168,359.221685,-1.343262,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N,0,C22H30O4,358.214409,102040norm
7,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE102040norm-200uM-S1_Run382.h5,C18,sorgoleone,[M+Na]+,POS,7.543334,142042300.0,381.204132,381.203627,-1.323807,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N,0,C22H30O4,358.214409,102040norm
8,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE205060norm-200uM-S1_Run383.h5,C18,sorgoleone,[M+H]+,POS,7.544277,73171460.0,359.221954,359.221685,-0.748579,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N,0,C22H30O4,358.214409,205060norm
9,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE205060norm-200uM-S1_Run383.h5,C18,sorgoleone,[M+Na]+,POS,7.544277,155523200.0,381.203949,381.203627,-0.843472,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N,0,C22H30O4,358.214409,205060norm


In [None]:
# Reduce the filtered RT peaks using the notebook-local helper and the GUI's `top_adducts` choices.
# The helper has a bare `return` on its isomer-conflict path, so the result may be None — check before continuing.
rt_peaks_filtered_top_adduct = filter_by_selected_top_adduct(rt_peaks_filtered, top_adducts)
# Alternative kept for reference: the library helper, which takes no `top_adducts` argument.
# rt_peaks_filtered_top_adduct = sta.filter_by_top_adduct(rt_peaks_filtered)

# Check if selected compounds are in metatlas DB

In [None]:
# Look the top-adduct peaks up in the metatlas DB; judging by the result names, the helper returns
# the entries found and the entries not found (semantics of `check_by_flat` live in `sta` — confirm there).
in_db, notin_db = sta.search_for_matches_in_metatlas_db(rt_peaks_filtered_top_adduct, check_by_flat=True)

# Store selected compounds+adducts in metatlas db

In [None]:
#sta.store_in_metatlas_db(notin_db)

# Check if selected compounds/adducts are in the atlases

In [None]:
# Load the EMA atlas data from the paths configured in `current_ema_atlases`.
ema_atlases_data = sta.get_ema_atlas_data(current_ema_atlases)

In [None]:
# Convert the top-adduct peaks to atlas format; this formatted table feeds both the atlas and MSMS-refs searches.
rt_peaks_filtered_top_adduct_formatted = sta.convert_rt_peaks_to_atlas_format(rt_peaks_filtered_top_adduct)

In [None]:
# Compare the formatted peaks against the EMA atlases; `nonmatches_to_atlases` is what the
# RT-correction and atlas-update cells below operate on.
matches_to_atlases, nonmatches_to_atlases = sta.search_for_matches_in_atlases(rt_peaks_filtered_top_adduct_formatted, ema_atlases_data)

In [None]:
# Compute fresh RT-correction inputs only when requested; otherwise the next cell loads them from disk.
if new_rt_correction_data is True:

    # Uses the QC atlases configured in `current_qc_atlases`; include_istds=True is passed through to the helper.
    baseline_to_experimental_qc = sta.get_qc_experimental_atlas(nonmatches_to_atlases, current_qc_atlases, include_istds=True)

In [None]:
# Save the freshly computed RT-correction data, or load previously saved data.
# Both checks are strict identity tests, so a non-bool flag value runs neither branch.
if new_rt_correction_data is True:
    sta.save_rt_correction_data(baseline_to_experimental_qc, standards_info_path, current_time)
    
elif new_rt_correction_data is False:
    baseline_to_experimental_qc = sta.load_rt_correction_data(standards_info_path)

In [None]:
# Two-step RT correction: build the inputs from the atlas non-matches plus the QC data,
# then run the correction itself.
baseline_correction_inputs = sta.create_baseline_correction_input(nonmatches_to_atlases, baseline_to_experimental_qc)
baseline_correction_outputs = sta.rt_correction_from_baseline(baseline_correction_inputs)

In [None]:
# Apply the correction outputs to the atlas non-matches and report how many rows result.
# NOTE(review): the message says "database", but this table is passed to the atlas update cell below — confirm wording.
nonmatches_to_atlases_rt_corrected = sta.substitute_corrected_rt_values(nonmatches_to_atlases, baseline_correction_outputs)
print(f"Inserting {nonmatches_to_atlases_rt_corrected.shape[0]} RT-corrected compounds into the database")

# Store selected compound/adduct in metatlas atlases

In [None]:
# Add the RT-corrected non-matches to the EMA atlases.
# save_atlas=True presumably writes the updated atlases under `path_to_standards_file` — confirm in `sta`.
sta.update_and_save_atlases(ema_atlases_data, nonmatches_to_atlases_rt_corrected, current_time, path_to_standards_file, save_atlas=True)

# Check if selected compounds/adducts are in MSMS refs

In [None]:
# Load the current MSMS refs table and show its shape as a quick size check.
msms_refs = sta.get_msms_refs(msms_refs_path=current_msms_refs_path)
msms_refs.shape

In [None]:
# Look the formatted top-adduct peaks up in the MSMS refs; `notin_msms_refs` is carried forward below.
in_msms_refs, notin_msms_refs = sta.search_for_matches_in_msms_refs(rt_peaks_filtered_top_adduct_formatted, msms_refs, check_by_flat=True)

In [None]:
# Combine the peaks not found in the MSMS refs with the filtered top spectra.
rt_peaks_filtered_with_top_spectra = sta.merge_selected_peaks_with_top_spectra(notin_msms_refs, top_spectra_filtered)

In [None]:
# Format the merged peaks+spectra for the MSMS refs; the existing refs table is passed along to the formatter.
rt_peaks_filtered_with_top_spectra_formatted = sta.format_for_msms_refs(rt_peaks_filtered_with_top_spectra, msms_refs)

# Store selected compound/adduct in MSMS refs

In [None]:
# Add the formatted entries to the MSMS refs.
# save_refs=True presumably writes the updated refs under `path_to_standards_file` — confirm in `sta`.
sta.update_and_save_msms_refs(msms_refs, rt_peaks_filtered_with_top_spectra_formatted, path_to_standards_file, current_time, save_refs=True)

# Check to verify that compounds were added to database, atlases, and refs correctly

In [None]:
# TODO: Add a checker that verifies the compounds were added to the database, atlases, and MSMS refs correctly.