In [None]:
import numpy as np
import pandas as pd
import sys
from datetime import datetime
import os
from IPython.display import display, SVG
import json

sys.path.insert(1, '/global/homes/b/bkieft/metatlas')
import notebooks.standards_library.standard_annotation as sta
pd.options.display.max_colwidth = 300

current_time = datetime.now().strftime("%Y%m%d%H%M%S")

# Enter Reference Standard information

In [None]:
# Load configuration
config_path = "/global/homes/b/bkieft/metatlas/notebooks/standards_library/msms_set_config.json"
with open(config_path, "r") as config_file:
    config = json.load(config_file)

# Access values from the config
ppm_tolerance = config["ppm_tolerance"]
include_polarities = config["include_polarities"]
include_chromatographies = config["include_chromatographies"]
include_adducts = config["include_adducts"]
msms_refs_prefix = config["msms_refs_prefix"]
path_to_standards_file = config["path_to_standards_file"]
standards_file = config["standards_file"]
standards_info_path = f"{path_to_standards_file}/{standards_file}"
current_ema_atlases = config["current_ema_atlases"]
current_qc_atlases = config["current_qc_atlases"]
current_msms_refs_path = config["current_msms_refs_path"]
new_full_data = config["new_full_data"]
new_selected_data = config["new_selected_data"]
new_filtered_data = config["new_filtered_data"]
new_rt_correction_data = config["new_rt_correction_data"]
new_full_data = False
new_selected_data = False
new_filtered_data = False
new_rt_correction_data = False

# Get EIC and Spectra information from run table

In [3]:
if new_full_data is True:
    standard_lcmsruns_table = sta.build_standard_lcmsrun_table(standards_info_path, include_polarities=include_polarities, include_chromatographies=include_chromatographies)
    standard_lcmsruns_table_with_adducts = sta.build_adduct_annotated_table(standard_lcmsruns_table, include_adducts=include_adducts)
    eics_full, top_spectra_full, group_names_full, rt_peaks_full, atlas_full = sta.extract_data(lcmsruns_table=standard_lcmsruns_table_with_adducts,
                                                                                                            ppm_tolerance=ppm_tolerance,
                                                                                                            method="find_peaks")
    runnum_to_structure_image_grid = sta.generate_gridded_molecular_images(standard_lcmsruns_table_with_adducts)

# Save and/or read full data

In [4]:
if new_full_data is True:
    sta.save_full_data(eics_full, top_spectra_full, group_names_full, rt_peaks_full, atlas_full, runnum_to_structure_image_grid, standards_info_path, current_time)
    
elif new_full_data is False:
    eics_full, top_spectra_full, group_names_full, rt_peaks_full, atlas_full, runnum_to_structure_image_grid = sta.load_full_data(standards_info_path)

Loading most recent pkl file: /global/homes/b/bkieft/metabolomics_standards_annotation/standard_library_annotation/scheller_test_20250422133530_ref_stds_data_full.pkl


# Create interactive plot and choose adducts

In [5]:
processed_data = sta.process_data_for_plotting(eics_full, top_spectra_full, group_names_full, rt_peaks_full, include_adducts, sort_by='run') # or 'run'
processed_data = [
    entry for entry in processed_data
    if entry.get('compound_name') == "sorgoleone"
]

In [6]:
if new_selected_data is True:

    selected_good_adducts = {}
    ambiguous_adducts = {}
    top_adducts = {}
    sta.create_interactive_plots(processed_data, runnum_to_structure_image_grid, selected_good_adducts, ambiguous_adducts, top_adducts)

# Save and/or read selected data

In [7]:
if new_selected_data is True:
    sta.save_selected_data(selected_good_adducts, ambiguous_adducts, top_adducts, standards_info_path, current_time)
    
elif new_selected_data is False:
    selected_good_adducts, ambiguous_adducts, top_adducts = sta.load_selected_data(standards_info_path)

Loading most recent pkl file: /global/homes/b/bkieft/metabolomics_standards_annotation/standard_library_annotation/scheller_test_20250429085323_ref_stds_data_selected.pkl


# Preview compounds+adducts selections

In [8]:
selected_compounds_table = sta.extract_selected_compounds(selected_good_adducts, top_adducts)
ambiguous_compounds_table = sta.extract_ambiguous_compounds(ambiguous_adducts)

print("Selected compounds:\n")
display(selected_compounds_table)
print("\nAmbiguous compounds (to return to for scrutiny):\n")
display(ambiguous_compounds_table)

Selected compounds:



Unnamed: 0,compound_name,standard_lcmsrun,selected_adducts,selected_peak_indices,top_adduct
0,sorgoleone,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_POS_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE102040norm-200uM-S1_Run375.h5,"[[M+H]+, [M+NH4]+, [M-H2O+H]+]","[peak1, peak1, peak1]",[[M+H]+]
1,sorgoleone,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_POS_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE205060norm-200uM-S1_Run376.h5,"[[M+H]+, [M+NH4]+, [M-H2O+H]+]","[peak1, peak1, peak1]",[[M+H]+]
2,sorgoleone,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_NEG_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE102040norm-200uM-S1_Run377.h5,[[M-H]-],[peak1],[[M-H]-]
3,sorgoleone,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_NEG_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE205060norm-200uM-S1_Run378.h5,[[M-H]-],[peak1],[[M-H]-]
4,sorgoleone,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE102040norm-200uM-S1_Run382.h5,"[[M+Na]+, [M+H]+, [M-H2O+H]+]","[peak1, peak1, peak1]","[[M+Na]+, [M+H]+]"
5,sorgoleone,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE205060norm-200uM-S1_Run383.h5,"[[M+Na]+, [M+H]+]","[peak1, peak1]","[[M+Na]+, [M+H]+]"
6,sorgoleone,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_NEG_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE102040norm-200uM-S1_Run384.h5,[[M-H]-],[peak1],[[M-H]-]
7,sorgoleone,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_NEG_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE205060norm-200uM-S1_Run385.h5,[[M-H]-],[peak1],[[M-H]-]



Ambiguous compounds (to return to for scrutiny):



# Filter RT Peak, EICs, and Top Spectra by selected compounds+adducts

In [9]:
if new_filtered_data is True:
    eics_filtered, rt_peaks_filtered, top_spectra_filtered = sta.filter_by_selected(eics_full, rt_peaks_full, top_spectra_full, selected_compounds_table)

# Save and/or read filtered data

In [10]:
if new_filtered_data is True:
    sta.save_filtered_data(eics_filtered, top_spectra_filtered, rt_peaks_filtered, standards_info_path, current_time)
    
elif new_filtered_data is False:
    eics_filtered, top_spectra_filtered, rt_peaks_filtered = sta.load_filtered_data(standards_info_path)

Loading most recent pkl file: /global/homes/b/bkieft/metabolomics_standards_annotation/standard_library_annotation/scheller_test_20250429085323_ref_stds_data_filtered.pkl


# Choose only the top adduct per compound to add to DB, Atlases, and MSMS Refs

In [11]:
rt_peaks_filtered_top_adduct = sta.filter_by_selected_top_adduct(rt_peaks_filtered, top_adducts)

Group ('C18', 'sorgoleone'): All RT values for ['102040norm' '205060norm'] and ['POS' 'NEG'] are within 0.05 mins of each other (0.0057).

Group ('HILICZ', 'sorgoleone'): All RT values for ['102040norm' '205060norm'] and ['POS' 'NEG'] are within 0.05 mins of each other (0.0107).

No isomers found in selected top adducts.

Filtered 15 compound peaks to 5 peaks by best adduct. Here are the compounds+adducts retained:



Unnamed: 0,label,adduct,polarity,chromatography,inchi_key,monoisotopic_mass,collision_energy
4,sorgoleone,[M+H]+,POS,C18,FGWRUVXUQWGLOX-AFJQJTPPSA-N,358.214409,102040norm
0,sorgoleone,[M+H]+,POS,HILICZ,FGWRUVXUQWGLOX-AFJQJTPPSA-N,358.214409,102040norm
7,sorgoleone,[M+Na]+,POS,C18,FGWRUVXUQWGLOX-AFJQJTPPSA-N,358.214409,205060norm
9,sorgoleone,[M-H]-,NEG,C18,FGWRUVXUQWGLOX-AFJQJTPPSA-N,358.214409,205060norm
2,sorgoleone,[M-H]-,NEG,HILICZ,FGWRUVXUQWGLOX-AFJQJTPPSA-N,358.214409,102040norm


# Save summary plots of selected compounds+adducts after completing

In [12]:
sta.generate_static_summary_plots(processed_data, selected_good_adducts, top_adducts, path_to_standards_file)

[10:09:02] non-ring atom 0 marked aromatic
[10:09:02] non-ring atom 0 marked aromatic


# Check if selected compounds are in metatlas DB

In [13]:
in_db, notin_db = sta.search_for_matches_in_metatlas_db(rt_peaks_filtered_top_adduct, check_by_flat=True)

Searching for matches in MSMS refs:   0%|          | 0/5 [00:00<?, ?it/s]


Summary of compounds already in the metatlas database:



Unnamed: 0,query_label,query_matching_criterion,query_to_db,db_match
0,sorgoleone,inchi_key,FGWRUVXUQWGLOX-AFJQJTPPSA-N,[FGWRUVXUQWGLOX-AFJQJTPPSA-N]



All compounds are already in the metatlas database.



# Store selected compounds+adducts in metatlas db

In [14]:
# if len(notin_db) > 0:
#     sta.store_in_metatlas_db(notin_db)

# Check if selected compounds/adducts are in the atlases

In [15]:
ema_atlases_data = sta.get_ema_atlas_data(current_ema_atlases)

In [16]:
rt_peaks_filtered_top_adduct_formatted = sta.convert_rt_peaks_to_atlas_format(rt_peaks_filtered_top_adduct)

In [17]:
matches_to_atlases, nonmatches_to_atlases = sta.search_for_matches_in_atlases(rt_peaks_filtered_top_adduct_formatted, ema_atlases_data)

Searching for matches in existing atlases:   0%|          | 0/5 [00:00<?, ?it/s]


None of these compounds+adducts were found in the atlases.


These compounds+adducts are not yet in any atlases:



Unnamed: 0,standard_lcmsrun,chromatography,compound_name,adduct,polarity,rt_peak,mz,smiles,peak_index,inchi,...,permanent_charge,formula,mono_isotopic_molecular_weight,collision_energy,label,rt_min,rt_max,mz_tolerance,mz_tolerance_units,in_metatlas
0,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE102040norm-200uM-S1_Run382.h5,C18,sorgoleone,[M+H]+,positive,7.543334,359.221685,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,0,C22H30O4,358.214409,102040norm,sorgoleone,7.043334,8.043334,5,ppm,True
1,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE205060norm-200uM-S1_Run383.h5,C18,sorgoleone,[M+Na]+,positive,7.544277,381.203627,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,0,C22H30O4,358.214409,205060norm,sorgoleone,7.044277,8.044277,5,ppm,True
2,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_NEG_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE205060norm-200uM-S1_Run385.h5,C18,sorgoleone,[M-H]-,negative,7.547524,357.207133,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,0,C22H30O4,358.214409,205060norm,sorgoleone,7.047524,8.047524,5,ppm,True
3,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_POS_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE102040norm-200uM-S1_Run375.h5,HILICZ,sorgoleone,[M+H]+,positive,0.801184,359.221685,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,0,C22H30O4,358.214409,102040norm,sorgoleone,0.301184,1.301184,5,ppm,True
4,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_NEG_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE102040norm-200uM-S1_Run377.h5,HILICZ,sorgoleone,[M-H]-,negative,0.804263,357.207133,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,0,C22H30O4,358.214409,102040norm,sorgoleone,0.304263,1.304263,5,ppm,True


In [18]:
if new_rt_correction_data is True:

    baseline_to_experimental_qc = sta.get_qc_experimental_atlas(nonmatches_to_atlases, current_qc_atlases, include_istds=True)

In [19]:
if new_rt_correction_data is True:
    sta.save_rt_correction_data(baseline_to_experimental_qc, standards_info_path, current_time)
    
elif new_rt_correction_data is False:
    baseline_to_experimental_qc = sta.load_rt_correction_data(standards_info_path)

Loading most recent pkl file: /global/homes/b/bkieft/metabolomics_standards_annotation/standard_library_annotation/scheller_test_20250429095538_rt_correction_data.pkl


In [20]:
baseline_correction_inputs = sta.create_baseline_correction_input(nonmatches_to_atlases, baseline_to_experimental_qc)
baseline_correction_outputs = sta.rt_correction_from_baseline(baseline_correction_inputs, include_chromatographies)

Backward RT correction data for C18:


Unnamed: 0,label,polarity,rt_peak_baseline,rt_peak_experimental,rt_peak_corrected,rt_min_corrected,rt_max_corrected,rt_diff_experimental_vs_corrected
0,sorgoleone,positive,,7.543334,7.438575,6.938575,7.938575,0.104759
1,sorgoleone,positive,,7.544277,7.439548,6.939548,7.939548,0.104729
2,sorgoleone,negative,,7.547524,7.442898,6.942898,7.942898,0.104627
3,9-cis-retinoic acid,QC,7.372778,7.322109,7.210608,6.710608,7.710608,0.111501
4,ABMBA (unlabeled),QC,4.7,4.892107,4.73861,4.23861,5.23861,0.153496
5,caffeine,QC,2.652537,2.833037,2.689957,2.189957,3.189957,0.14308
6,enoxolone,QC,6.605928,6.792058,6.666384,6.166384,7.166384,0.125674
7,inosine (U - 15N),QC,1.02,1.13626,1.03348,0.53348,1.53348,0.10278
8,nigericin,QC,8.559813,8.749709,8.690287,8.190287,9.190287,0.059422
9,"phenylalanine (U - 13C, 15N)",QC,1.452954,1.481643,1.368337,0.868337,1.868337,0.113306


Backward RT correction data for HILICZ:


Unnamed: 0,label,polarity,rt_peak_baseline,rt_peak_experimental,rt_peak_corrected,rt_min_corrected,rt_max_corrected,rt_diff_experimental_vs_corrected
0,sorgoleone,positive,,0.801184,0.618179,0.118179,1.118179,0.183005
1,sorgoleone,negative,,0.804263,0.621525,0.121525,1.121525,0.182738
2,ABMBA (unlabeled),QC,1.093806,1.21834,1.070658,0.570658,1.570658,0.147682
3,N-acetyl-glucosamine (U - 13C),QC,6.707815,6.418233,6.57229,6.07229,7.07229,-0.154056
4,adenine (U - 15N),QC,2.557602,2.834223,2.807783,2.307783,3.307783,0.02644
5,"alanine (U - 13C, 15N)",QC,13.405091,13.240681,13.401497,12.901497,13.901497,-0.160816
6,"arginine (U - 13C, 15N)",QC,16.939915,17.013107,16.988042,16.488042,17.488042,0.025065
7,"asparagine (U - 13C, 15N)",QC,14.368089,14.252848,14.377046,13.877046,14.877046,-0.124198
8,"aspartic acid (U - 13C, 15N)",QC,16.13036,16.043652,16.079247,15.579247,16.579247,-0.035595
9,"cystine (U - 13C, 15N)",QC,16.904308,16.930342,16.910804,16.410804,17.410804,0.019538


In [21]:
nonmatches_to_atlases_rt_corrected = sta.substitute_corrected_rt_values(nonmatches_to_atlases, baseline_correction_outputs)
print(f"Formatted {nonmatches_to_atlases_rt_corrected.shape[0]} RT-corrected compounds to be inserted into atlases.")

Formatted 7 RT-corrected compounds to be inserted into atlases.


# Store selected compound/adduct in metatlas atlases

In [22]:
sta.update_and_save_atlases(ema_atlases_data, nonmatches_to_atlases_rt_corrected, current_time, path_to_standards_file, save_atlas=True)

For HILIC_EMA-standards_positive.tsv, new atlas has 374 rows and old atlas has 373 rows.
For HILIC_EMA-standards_negative.tsv, new atlas has 419 rows and old atlas has 418 rows.
For C18_EMA-standards_positive.tsv, new atlas has 3796 rows and old atlas has 3792 rows.
For C18_EMA-standards_negative.tsv, new atlas has 2698 rows and old atlas has 2697 rows.


# Check if selected compounds/adducts are in MSMS refs

In [23]:
msms_refs = sta.get_msms_refs(msms_refs_path=current_msms_refs_path)
msms_refs.shape

(216409, 17)

In [35]:
msms_refs.columns

Index(['database', 'id', 'name', 'spectrum', 'decimal', 'precursor_mz',
       'polarity', 'adduct', 'fragmentation_method', 'collision_energy',
       'instrument', 'instrument_type', 'formula', 'exact_mass', 'inchi_key',
       'inchi', 'smiles'],
      dtype='object')

In [37]:
display(msms_refs['decimal'].value_counts())

decimal
4.0    118440
1.0     40081
2.0     32571
3.0     16835
0.0      6408
6.0      2064
5.0        10
Name: count, dtype: int64

In [45]:
display(msms_refs['collision_energy'].value_counts().tail(10))

collision_energy
Ramp 19.8-29.6 eV    1
Ramp 22.0-33.0 eV    1
Ramp 26.8-40.2 eV    1
Ramp 20.3-30.5 eV    1
RAMP 15.6-23.4 eV    1
289                  1
Ramp 28.5-42.8 eV    1
28 eV                1
Ramp 20.8-31.3 eV    1
RAMP 20.9-31.4 eV    1
Name: count, dtype: int64

In [49]:
msms_refs[msms_refs['database'] == 'metatlas']['collision_energy'].unique()

array([nan, 'ramp-102040', 'ramp-205060', 'absolute-102040',
       'absolute-205060', 'normalized-102040', 'normalized-205060'],
      dtype=object)

In [50]:
sta.get_collision_energy('/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE102040norm-200uM-S1_Run382.h5')

'102040norm'

In [38]:
rt_peaks_filtered_top_adduct_formatted.head()

Unnamed: 0,standard_lcmsrun,chromatography,compound_name,adduct,polarity,rt_peak,mz,smiles,peak_index,inchi,...,permanent_charge,formula,mono_isotopic_molecular_weight,collision_energy,label,rt_min,rt_max,mz_tolerance,mz_tolerance_units,in_metatlas
0,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE102040norm-200uM-S1_Run382.h5,C18,sorgoleone,[M+H]+,positive,7.543334,359.221685,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,0,C22H30O4,358.214409,102040norm,sorgoleone,7.043334,8.043334,5,ppm,True
1,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE205060norm-200uM-S1_Run383.h5,C18,sorgoleone,[M+Na]+,positive,7.544277,381.203627,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,0,C22H30O4,358.214409,205060norm,sorgoleone,7.044277,8.044277,5,ppm,True
2,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_NEG_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE205060norm-200uM-S1_Run385.h5,C18,sorgoleone,[M-H]-,negative,7.547524,357.207133,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,0,C22H30O4,358.214409,205060norm,sorgoleone,7.047524,8.047524,5,ppm,True
3,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_POS_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE102040norm-200uM-S1_Run375.h5,HILICZ,sorgoleone,[M+H]+,positive,0.801184,359.221685,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,0,C22H30O4,358.214409,102040norm,sorgoleone,0.301184,1.301184,5,ppm,True
4,/global/cfs/cdirs/metatlas/raw_data/jgi/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558/20241002_JGI_HS_510060_SorghExu_final1_EXP120B_HILICZ_USHXG02558_NEG_MS2_RefStd-1_Sorgoleone_1_Rg70to1050-CE102040norm-200uM-S1_Run377.h5,HILICZ,sorgoleone,[M-H]-,negative,0.804263,357.207133,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",...,0,C22H30O4,358.214409,102040norm,sorgoleone,0.304263,1.304263,5,ppm,True


In [None]:
in_msms_refs, notin_msms_refs = sta.search_for_matches_in_msms_refs(rt_peaks_filtered_top_adduct_formatted, msms_refs, check_by_flat=True)

In [None]:
rt_peaks_filtered_with_top_spectra = sta.merge_selected_peaks_with_top_spectra(notin_msms_refs, top_spectra_filtered)

In [None]:
msms_refs_metadata = {
    "ce_type": 'ramp',
    "msms_refs_prefix": msms_refs_prefix,
    "frag_method": "HCD",
    "instrument_type": 'Orbitrap',
    "decimal": 4.0,
    
}
rt_peaks_filtered_with_top_spectra_formatted = sta.format_for_msms_refs(rt_peaks_filtered_with_top_spectra, msms_refs, msms_refs_metadata)

# Store selected compound/adduct in MSMS refs

In [None]:
sta.update_and_save_msms_refs(msms_refs, rt_peaks_filtered_with_top_spectra_formatted, path_to_standards_file, current_time, save_refs=True)

# Check to verify that compounds were added to database, atlases, and refs correctly

In [None]:
# TO DO: Add a checker