In [1]:
import numpy as np
import pandas as pd
import sys
import pickle
from datetime import datetime
import glob
import os
from IPython.display import display, HTML

# Put the metatlas checkout on the module search path before importing the annotation helpers.
# NOTE(review): absolute, user-specific path — adjust when running from another account.
sys.path.insert(1, '/global/homes/b/bkieft/metatlas')
import notebooks.standards_library.standard_annotation as sta
# Raise pandas' maximum displayed column width to 300 characters.
pd.options.display.max_colwidth = 300

# Enter Reference Standard information

In [2]:
# --- Run configuration -------------------------------------------------------
# Set to True to re-extract data from the raw runs; False reloads saved results.
new_analysis = False

# Mass tolerance (ppm) handed to the extraction step.
ppm_tolerance = 5

# Polarities and chromatographies to include when building the run table.
include_polarities = ['POS', 'NEG']
#include_chromatographies = ['C18', 'HILIC'] # 'C18' and/or 'HILIC'
include_chromatographies = ['C18']

# Adducts to annotate: positive-mode entries first, then negative-mode entries.
include_adducts = [
    '[M+H]+', '[M+Na]+', '[M-H2O+H]+', '[M+K]+', '[M+NH4]+', '[M]+', '[M+2H]2+',
    '[M-H]-', '[M+Cl]-', '[M]-', '[M-2H]2-',
]

# --- Input standards sheet ---------------------------------------------------
# Alternative directory: '/global/homes/b/bkieft/metabolomics_standards_annotation/standard_library_annotation/'
path_to_standards_file = '/global/homes/b/bkieft/metabolomics_standards_annotation/standard_library_annotation/metasci-flavanoids_output'
# Alternative input sheet: scheller_test.csv
standards_file = 'metasci_flav_annotation_input.csv'
standards_info_path = '/'.join((path_to_standards_file, standards_file))

# Read the standards sheet so its contents are available for inspection.
input_compounds = pd.read_csv(standards_info_path)

# Build run table with adducts

In [None]:
# Build the table of standard LCMS runs for the configured polarities and chromatographies.
standard_lcmsruns_table = sta.build_standard_lcmsrun_table(
    standards_info_path,
    include_polarities=include_polarities,
    include_chromatographies=include_chromatographies,
)

# Annotate that run table with the configured adducts.
standard_lcmsruns_table_with_adducts = sta.build_adduct_annotated_table(
    standard_lcmsruns_table,
    include_adducts=include_adducts,
)

# Get EIC and Spectra information from run table

In [4]:
# Extraction and structure-image generation only run when a fresh analysis is requested;
# otherwise these names are filled in by the load step in the next cell.
if new_analysis is True:
    (
        eics_full,
        top_spectra_full,
        group_names_full,
        rt_peaks_full,
    ) = sta.extract_data(
        lcmsruns_table=standard_lcmsruns_table_with_adducts,
        ppm_tolerance=ppm_tolerance,
        method="find_peaks",
    )
    runnum_to_structure_image_grid = sta.generate_gridded_molecular_images(
        standard_lcmsruns_table_with_adducts
    )

# Save and/or read data

In [5]:
# Persist freshly extracted data, or reload the most recent saved extraction.
# NOTE(review): the recorded output shows the loader reading a .pkl file; only load files you trust.
if new_analysis is True:
    sta.save_full_data(
        eics_full,
        top_spectra_full,
        group_names_full,
        rt_peaks_full,
        runnum_to_structure_image_grid,
        standards_info_path,
    )
elif new_analysis is False:
    (
        eics_full,
        top_spectra_full,
        group_names_full,
        rt_peaks_full,
        runnum_to_structure_image_grid,
    ) = sta.load_full_data(standards_info_path)
else:
    # Previously any non-bool flag (None, 1, "False", ...) skipped both branches silently,
    # leaving later cells to fail on missing or stale variables.
    raise TypeError(f"new_analysis must be True or False, got {new_analysis!r}")

Loading most recent pkl file: /global/homes/b/bkieft/metabolomics_standards_annotation/standard_library_annotation/metasci-flavanoids_output/metasci_flav_annotation_input_20250414160006_ref_stds_data_full.pkl


# Format EIC and Spectra information for plotting

In [6]:
# Reshape the extracted EICs, spectra, group names and RT peaks into the plotting input.
processed_data = sta.process_data_for_plotting(
    eics_full,
    top_spectra_full,
    group_names_full,
    rt_peaks_full,
    include_adducts,
)

# Create interactive plot and choose adducts

In [None]:
# Selection containers handed to the interactive plots.
# Start empty, or load existing dicts from a previous session (path to pkl file).
selected_good_adducts = {}
ambiguous_adducts = {}

sta.create_interactive_plots(
    processed_data,
    runnum_to_structure_image_grid,
    save_location=path_to_standards_file,
    selected_good_adducts=selected_good_adducts,
    ambiguous_adducts=ambiguous_adducts,
)

# Preview selections

In [None]:
# Build the preview of selected compounds from `selected_good_adducts`;
# the bare name on the last line renders it as the cell output.
selected_compounds_table = sta.extract_selected_compounds(selected_good_adducts)
selected_compounds_table

In [9]:
# Build the preview of ambiguous compounds from `ambiguous_adducts`;
# the bare name on the last line renders it as the cell output.
ambiguous_compounds_table = sta.extract_ambiguous_compounds(ambiguous_adducts)
ambiguous_compounds_table

No ambiguous adducts selected.


# Subset Run Table, RT Peak, EICs, and Top Spectra by selected adducts

In [13]:
# --- EICs --------------------------------------------------------------------
# Flatten the per-run EIC dicts into one long table, tagging every row with the
# dict key it came from (stored as 'standard_lcmsrun').
eics_selected = pd.concat(
    [df.assign(key=key) for d in eics_full for key, df in d.items()],
    ignore_index=True,
).rename(columns={'key': 'standard_lcmsrun'})
# The compound name is the part of the label before the first underscore.
eics_selected['compound_name'] = eics_selected['label'].str.split('_').str[0]
eics_selected = sta.select_compounds_from_gui(eics_selected, selected_compounds_table)

# --- RT peaks ----------------------------------------------------------------
# ignore_index=True matches the EIC and spectra concats; without it every
# per-run frame keeps its own index labels, leaving duplicated labels in the result.
rt_peaks_selected = pd.concat(rt_peaks_full, ignore_index=True).rename(
    columns={'lcmsrun': 'standard_lcmsrun'}
)
rt_peaks_selected = sta.select_compounds_from_gui(rt_peaks_selected, selected_compounds_table)

# --- Top MS2 spectra ---------------------------------------------------------
top_spectra_selected = pd.concat(top_spectra_full, ignore_index=True).rename(
    columns={'lcmsrun': 'standard_lcmsrun'}
)
top_spectra_selected['compound_name'] = top_spectra_selected['label'].str.split('_').str[0]
top_spectra_selected = sta.select_compounds_from_gui(top_spectra_selected, selected_compounds_table)

# --- Summary -----------------------------------------------------------------
print(f"\nTotal unique compounds retained: {eics_selected['compound_name'].nunique()}")
print(f"Total unique compound+adduct entries retained: {eics_selected['label'].nunique()}\n")
print(f"Total EICs selected: {eics_selected.shape[0]}")
print(f"Total RT peaks selected: {rt_peaks_selected.shape[0]}")
print(f"Total MS2 spectra selected: {top_spectra_selected.shape[0]}")


Total unique compounds retained: 9
Total unique compound+adduct entries retained: 16

Total EICs selected: 16
Total RT peaks selected: 18
Total MS2 spectra selected: 18


# Save and/or read filtered data

In [4]:
# NOTE: this reassigns the notebook-wide `new_analysis` flag, so re-running earlier
# cells afterwards will see the value set here.
new_analysis = False

if new_analysis is False:
    # Reuse the most recently saved selection instead of the tables built above.
    (
        eics_selected,
        top_spectra_selected,
        rt_peaks_selected,
    ) = sta.load_selected_data(standards_info_path)
elif new_analysis is True:
    # Persist the selection built in this session.
    sta.save_selected_data(
        eics_selected,
        top_spectra_selected,
        rt_peaks_selected,
        standards_info_path,
    )

Loading most recent pkl file: /global/homes/b/bkieft/metabolomics_standards_annotation/standard_library_annotation/metasci-flavanoids_output/metasci_flav_annotation_input_20250415134146_ref_stds_data_selected.pkl


# Choose only the top adduct per compound to add to DB, Atlases, and MSMS Refs

In [5]:
# Reduce the selected RT peaks with sta.filter_top_compounds (per the section heading,
# this keeps only the top adduct per compound).
rt_peaks_selected_top_adduct = sta.filter_top_compounds(rt_peaks_selected)
# Uncomment to inspect the result ordered by compound and adduct.
#display(rt_peaks_selected_top_adduct.sort_values(by=['compound_name', 'adduct']))

Found isomers in C18 POS mode at 286.04773803999996 (['kaempferol (peak1)', 'kaempferol (peak2)']) but they had matching selected adducts [M+Na]+.
Found isomers in C18 POS mode at 302.04265265999993 (['quercetin (peak1)', 'quercetin (peak2)']) but they had matching selected adducts [M+Na]+.


# Check if selected compounds/adducts are in metatlas DB

In [5]:
# Split the top-adduct entries into those already present in the metatlas database
# (`in_db`) and those that are not (`notin_db`).
in_db, notin_db = sta.search_for_matches_in_metatlas_db(rt_peaks_selected_top_adduct)

Searching for matches in MSMS refs:   0%|          | 0/12 [00:00<?, ?it/s]

(3, 8)
(3, 8)

Summary of compounds already in the metatlas database:



Unnamed: 0,compound,matching_criterion,db_entry
0,(+)-catechin,inchi_key,PFTAWBLQPZVEMU-DZGCQCFKSA-N
1,kaempferol (peak1),inchi_key,IYRMWMYZSQPJKC-UHFFFAOYSA-N
2,kaempferol (peak2),inchi_key,IYRMWMYZSQPJKC-UHFFFAOYSA-N
3,myricetin,inchi_key,IKMDFBPHZNJCSN-UHFFFAOYSA-N
4,naringin,inchi_key,DFPMSGMNTNDNHN-ZPHOTFPESA-N
5,phloretin,inchi_key,VGEREEWJJVICBM-UHFFFAOYSA-N
6,quercetin (peak1),inchi_key,REFJWTPEDVJJIY-UHFFFAOYSA-N
7,quercetin (peak2),inchi_key,REFJWTPEDVJJIY-UHFFFAOYSA-N



These compounds are not yet in the metatlas database:



Unnamed: 0,compound_name,inchi,inchi_key,neutralized_inchi,neutralized_inchi_key,permanent_charge,formula,monoisotopic_mass
0,baicalin,"InChI=1S/C21H18O11/c22-9-6-10(8-4-2-1-3-5-8)30-11-7-12(14(23)15(24)13(9)11)31-21-18(27)16(25)17(26)19(32-21)20(28)29/h1-7,16-19,21,23-27H,(H,28,29)/t16-,17-,18+,19-,21-/m0/s1",IKIIZLYTISPENI-UNJWAJPSSA-N,"InChI=1S/C21H18O11/c22-9-6-10(8-4-2-1-3-5-8)30-11-7-12(14(23)15(24)13(9)11)31-21-18(27)16(25)17(26)19(32-21)20(28)29/h1-7,16-19,21,23-27H,(H,28,29)/t16-,17-,18+,19-,21-/m0/s1",IKIIZLYTISPENI-UNJWAJPSSA-N,0,C21H18O11,446.084911
1,mangiferin,"InChI=1S/C19H18O11/c20-4-11-15(25)17(27)18(28)19(30-11)12-8(23)3-10-13(16(12)26)14(24)5-1-6(21)7(22)2-9(5)29-10/h1-3,11,15,17-23,25-28H,4H2/t11-,15-,17+,18-,19+/m1/s1",AEDDIBAIWPIIBD-ZJKJAXBQSA-N,"InChI=1S/C19H18O11/c20-4-11-15(25)17(27)18(28)19(30-11)12-8(23)3-10-13(16(12)26)14(24)5-1-6(21)7(22)2-9(5)29-10/h1-3,11,15,17-23,25-28H,4H2/t11-,15-,17+,18-,19+/m1/s1",AEDDIBAIWPIIBD-ZJKJAXBQSA-N,0,C19H18O11,422.084911
2,neohesperidin dihydrochalcone,"InChI=1S/C28H36O15/c1-11-21(34)23(36)25(38)27(40-11)43-26-24(37)22(35)19(10-29)42-28(26)41-13-8-16(32)20(17(33)9-13)14(30)5-3-12-4-6-18(39-2)15(31)7-12/h4,6-9,11,19,21-29,31-38H,3,5,10H2,1-2H3/t11-,19+,21-,22+,23+,24-,25+,26+,27-,28+/m0/s1",ITVGXXMINPYUHD-CUVHLRMHSA-N,"InChI=1S/C28H36O15/c1-11-21(34)23(36)25(38)27(40-11)43-26-24(37)22(35)19(10-29)42-28(26)41-13-8-16(32)20(17(33)9-13)14(30)5-3-12-4-6-18(39-2)15(31)7-12/h4,6-9,11,19,21-29,31-38H,3,5,10H2,1-2H3/t11-,19+,21-,22+,23+,24-,25+,26+,27-,28+/m0/s1",ITVGXXMINPYUHD-CUVHLRMHSA-N,0,C28H36O15,612.20542


# Store selected compounds in metatlas db

In [7]:
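# TODO: after reviewing `notin_db`, store the missing compounds in the metatlas database.
# NOTE(review): `metob` is not imported in the cells shown here; import it before enabling the call below.
#notin_db
# metob.store(notin_db)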

# Check if selected compounds/adducts are in the atlases

In [6]:
# Convert the top-adduct RT peaks to the atlas column layout used by the atlas and MSMS-ref searches below.
rt_peaks_selected_top_adduct_formatted = sta.convert_rt_peaks_to_atlas_format(rt_peaks_selected_top_adduct)

In [7]:
# Collect the existing atlases matched by the glob pattern below.
# NOTE(review): absolute, user-specific path — adjust when running from another account.
existing_atlases = sta.get_existing_atlases(
    existing_atlases_path='/global/homes/b/bkieft/metatlas-data/*/*EMA*.tsv',
)

In [8]:
# Split the formatted entries into those already found in an existing atlas and those that are not.
matches_to_atlases, nonmatches_to_atlases = sta.search_for_matches_in_atlases(rt_peaks_selected_top_adduct_formatted, existing_atlases)

Searching for matches in existing atlases:   0%|          | 0/12 [00:00<?, ?it/s]


Summary of compounds+adducts already in the atlases:



Unnamed: 0,query_compound,query_match,atlas_matches,atlas_source_files
0,(+)-catechin ; [M+H]+,"InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15+/m0/s1","[InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15+/m0/s1]",[C18_EMA-standards_positive.tsv]
1,naringin ; [M+H]+,"InChI=1S/C27H32O14/c1-10-20(32)22(34)24(36)26(37-10)41-25-23(35)21(33)18(9-28)40-27(25)38-13-6-14(30)19-15(31)8-16(39-17(19)7-13)11-2-4-12(29)5-3-11/h2-7,10,16,18,20-30,32-36H,8-9H2,1H3/t10-,16-,18+,20-,21+,22+,23-,24+,25+,26-,27+/m0/s1","[InChI=1S/C27H32O14/c1-10-20(32)22(34)24(36)26(37-10)41-25-23(35)21(33)18(9-28)40-27(25)38-13-6-14(30)19-15(31)8-16(39-17(19)7-13)11-2-4-12(29)5-3-11/h2-7,10,16,18,20-30,32-36H,8-9H2,1H3/t10-,16-,18+,20-,21+,22+,23-,24+,25+,26-,27+/m0/s1]",[C18_EMA-standards_positive.tsv]
2,myricetin ; [M+H]+,"InChI=1S/C15H10O8/c16-6-3-7(17)11-10(4-6)23-15(14(22)13(11)21)5-1-8(18)12(20)9(19)2-5/h1-4,16-20,22H","[InChI=1S/C15H10O8/c16-6-3-7(17)11-10(4-6)23-15(14(22)13(11)21)5-1-8(18)12(20)9(19)2-5/h1-4,16-20,22H]",[C18_EMA-standards_positive.tsv]
3,baicalin ; [M+H]+,baicalein,[baicalein],[C18_EMA-standards_positive.tsv]
4,phloretin ; [M+H]+,"InChI=1S/C15H14O5/c16-10-4-1-9(2-5-10)3-6-12(18)15-13(19)7-11(17)8-14(15)20/h1-2,4-5,7-8,16-17,19-20H,3,6H2","[InChI=1S/C15H14O5/c16-10-4-1-9(2-5-10)3-6-12(18)15-13(19)7-11(17)8-14(15)20/h1-2,4-5,7-8,16-17,19-20H,3,6H2]",[C18_EMA-standards_positive.tsv]
5,(+)-catechin ; [M-H]-,"InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15+/m0/s1","[InChI=1S/C15H14O6/c16-8-4-11(18)9-6-13(20)15(21-14(9)5-8)7-1-2-10(17)12(19)3-7/h1-5,13,15-20H,6H2/t13-,15+/m0/s1]",[C18_EMA-standards_negative.tsv]



These compounds+adducts are not yet in any atlases:



Unnamed: 0,chromatography,compound_name,adduct,polarity,rt_peak,mz_theoretical,smiles,inchi,inchi_key,neutralized_inchi,neutralized_inchi_key,permanent_charge,formula,monoisotopic_mass,collision_energy,label,rt_min,rt_max,mz_tolerance
0,C18,mangiferin,[M+H]+,positive,2.743738,423.092187,C1=C2C(=CC(=C1O)O)OC3=C(C2=O)C(=C(C(=C3)O)[C@H]4[C@@H]([C@H]([C@@H]([C@H](O4)CO)O)O)O)O,"InChI=1S/C19H18O11/c20-4-11-15(25)17(27)18(28)19(30-11)12-8(23)3-10-13(16(12)26)14(24)5-1-6(21)7(22)2-9(5)29-10/h1-3,11,15,17-23,25-28H,4H2/t11-,15-,17+,18-,19+/m1/s1",AEDDIBAIWPIIBD-ZJKJAXBQSA-N,"InChI=1S/C19H18O11/c20-4-11-15(25)17(27)18(28)19(30-11)12-8(23)3-10-13(16(12)26)14(24)5-1-6(21)7(22)2-9(5)29-10/h1-3,11,15,17-23,25-28H,4H2/t11-,15-,17+,18-,19+/m1/s1",AEDDIBAIWPIIBD-ZJKJAXBQSA-N,0,C19H18O11,422.084911,102040,mangiferin,2.243738,3.243738,5
1,C18,neohesperidin dihydrochalcone,[M+H]+,positive,3.703746,613.212696,C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)O[C@@H]2[C@H]([C@@H]([C@H](O[C@H]2OC3=CC(=C(C(=C3)O)C(=O)CCC4=CC(=C(C=C4)OC)O)O)CO)O)O)O)O)O,"InChI=1S/C28H36O15/c1-11-21(34)23(36)25(38)27(40-11)43-26-24(37)22(35)19(10-29)42-28(26)41-13-8-16(32)20(17(33)9-13)14(30)5-3-12-4-6-18(39-2)15(31)7-12/h4,6-9,11,19,21-29,31-38H,3,5,10H2,1-2H3/t11-,19+,21-,22+,23+,24-,25+,26+,27-,28+/m0/s1",ITVGXXMINPYUHD-CUVHLRMHSA-N,"InChI=1S/C28H36O15/c1-11-21(34)23(36)25(38)27(40-11)43-26-24(37)22(35)19(10-29)42-28(26)41-13-8-16(32)20(17(33)9-13)14(30)5-3-12-4-6-18(39-2)15(31)7-12/h4,6-9,11,19,21-29,31-38H,3,5,10H2,1-2H3/t11-,19+,21-,22+,23+,24-,25+,26+,27-,28+/m0/s1",ITVGXXMINPYUHD-CUVHLRMHSA-N,0,C28H36O15,612.20542,102040,neohesperidin dihydrochalcone,3.203746,4.203746,5
2,C18,quercetin,[M+Na]+,positive,3.824816,325.031871,C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O)O,"InChI=1S/C15H10O7/c16-7-4-10(19)12-11(5-7)22-15(14(21)13(12)20)6-1-2-8(17)9(18)3-6/h1-5,16-19,21H",REFJWTPEDVJJIY-UHFFFAOYSA-N,"InChI=1S/C15H10O7/c16-7-4-10(19)12-11(5-7)22-15(14(21)13(12)20)6-1-2-8(17)9(18)3-6/h1-5,16-19,21H",REFJWTPEDVJJIY-UHFFFAOYSA-N,0,C15H10O7,302.042653,102040,quercetin (peak1),3.324816,4.324816,5
3,C18,kaempferol,[M+Na]+,positive,4.171614,309.036956,OC1=CC=C(C=C1)C1=C(O)C(=O)C2=C(O)C=C(O)C=C2O1,"InChI=1S/C15H10O6/c16-8-3-1-7(2-4-8)15-14(20)13(19)12-10(18)5-9(17)6-11(12)21-15/h1-6,16-18,20H",IYRMWMYZSQPJKC-UHFFFAOYSA-N,"InChI=1S/C15H10O6/c16-8-3-1-7(2-4-8)15-14(20)13(19)12-10(18)5-9(17)6-11(12)21-15/h1-6,16-18,20H",IYRMWMYZSQPJKC-UHFFFAOYSA-N,0,C15H10O6,286.047738,102040,kaempferol (peak1),3.671614,4.671614,5
4,C18,quercetin,[M+Na]+,positive,4.171614,325.031871,C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O)O,"InChI=1S/C15H10O7/c16-7-4-10(19)12-11(5-7)22-15(14(21)13(12)20)6-1-2-8(17)9(18)3-6/h1-5,16-19,21H",REFJWTPEDVJJIY-UHFFFAOYSA-N,"InChI=1S/C15H10O7/c16-7-4-10(19)12-11(5-7)22-15(14(21)13(12)20)6-1-2-8(17)9(18)3-6/h1-5,16-19,21H",REFJWTPEDVJJIY-UHFFFAOYSA-N,0,C15H10O7,302.042653,102040,quercetin (peak2),3.671614,4.671614,5
5,C18,kaempferol,[M+Na]+,positive,4.396081,309.036956,OC1=CC=C(C=C1)C1=C(O)C(=O)C2=C(O)C=C(O)C=C2O1,"InChI=1S/C15H10O6/c16-8-3-1-7(2-4-8)15-14(20)13(19)12-10(18)5-9(17)6-11(12)21-15/h1-6,16-18,20H",IYRMWMYZSQPJKC-UHFFFAOYSA-N,"InChI=1S/C15H10O6/c16-8-3-1-7(2-4-8)15-14(20)13(19)12-10(18)5-9(17)6-11(12)21-15/h1-6,16-18,20H",IYRMWMYZSQPJKC-UHFFFAOYSA-N,0,C15H10O6,286.047738,102040,kaempferol (peak2),3.896081,4.896081,5


In [8]:
# TODO: Create an input sheet matching the format expected by Katherine's backward RT correction script.
# `existing_atlases` has a 'source_file' column that identifies which atlas each row came from.
# That column can be used to pick the QC atlas for each polarity/chromatography and build the top part of the input CSV.

# Store selected compound/adduct in metatlas atlases

In [9]:
# TODO: add the selected compound/adduct entries to the atlases (possibly by hand).
# RT correction needs to be run first.

# Check if selected compounds/adducts are in MSMS refs

In [9]:
# Load the current MSMS reference library, then show its (rows, columns) shape.
msms_refs = sta.get_msms_refs(
    msms_refs_path='/global/cfs/cdirs/metatlas/projects/spectral_libraries/20240430_istdv7-addition_msms_refs.tab',
)
msms_refs.shape

(32723, 4)

In [10]:
# Split the formatted entries into those already present in the MSMS refs and those that are not.
in_msms_refs, notin_msms_refs = sta.search_for_matches_in_msms_refs(rt_peaks_selected_top_adduct_formatted, msms_refs)

Searching for matches in MSMS refs:   0%|          | 0/12 [00:00<?, ?it/s]


Summary of compounds+adducts already in MSMS refs:



Unnamed: 0,compound,matching_criterion,adduct,msms_entry
0,(+)-catechin,inchi_key,[M-H]-,PFTAWBLQPZVEMU-DZGCQCFKSA-N
1,mangiferin,inchi_key,[M+H]+,AEDDIBAIWPIIBD-ZJKJAXBQSA-N
2,naringin,inchi_key,[M+H]+,DFPMSGMNTNDNHN-ZPHOTFPESA-N
3,myricetin,inchi_key,[M+H]+,IKMDFBPHZNJCSN-UHFFFAOYSA-N
4,neohesperidin dihydrochalcone,inchi_key,[M+H]+,ITVGXXMINPYUHD-CUVHLRMHSA-N
5,phloretin,inchi_key,[M+H]+,VGEREEWJJVICBM-UHFFFAOYSA-N



These compounds+adducts are not yet in MSMS refs:



Unnamed: 0,standard_lcmsrun,chromatography,compound_name,adduct,polarity,rt_peak,mz_theoretical,smiles,peak_index,inchi,...,neutralized_inchi,neutralized_inchi_key,permanent_charge,formula,monoisotopic_mass,collision_energy,label,rt_min,rt_max,mz_tolerance
0,/global/cfs/cdirs/metatlas/raw_data/jgi/20230321_JGI_TN_507992_LabUnlabStdMixes_final_QEHF_C18_USDAY63680/20230321_JGI_TN_507992_LabUnlabStdMixes_final_QEHF_C18_USDAY63680_POS_MS2_6_PolyphenolsMix1_1_Rg80to1200-CE102040-metasci-S1_Run26.h5,C18,baicalin,[M+H]+,positive,3.642457,447.092187,O[C@@H]1[C@@H](O)[C@@H](OC2=C(O)C(O)=C3C(=O)C=C(OC3=C2)C2=CC=CC=C2)O[C@@H]([C@H]1O)C(O)=O,peak1,"InChI=1S/C21H18O11/c22-9-6-10(8-4-2-1-3-5-8)30-11-7-12(14(23)15(24)13(9)11)31-21-18(27)16(25)17(26)19(32-21)20(28)29/h1-7,16-19,21,23-27H,(H,28,29)/t16-,17-,18+,19-,21-/m0/s1",...,"InChI=1S/C21H18O11/c22-9-6-10(8-4-2-1-3-5-8)30-11-7-12(14(23)15(24)13(9)11)31-21-18(27)16(25)17(26)19(32-21)20(28)29/h1-7,16-19,21,23-27H,(H,28,29)/t16-,17-,18+,19-,21-/m0/s1",IKIIZLYTISPENI-UNJWAJPSSA-N,0,C21H18O11,446.084911,102040,baicalin,3.142457,4.142457,5
1,/global/cfs/cdirs/metatlas/raw_data/jgi/20230321_JGI_TN_507992_LabUnlabStdMixes_final_QEHF_C18_USDAY63680/20230321_JGI_TN_507992_LabUnlabStdMixes_final_QEHF_C18_USDAY63680_POS_MS2_6_PolyphenolsMix1_1_Rg80to1200-CE102040-metasci-S1_Run26.h5,C18,quercetin,[M+Na]+,positive,3.824816,325.031871,C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O)O,peak1,"InChI=1S/C15H10O7/c16-7-4-10(19)12-11(5-7)22-15(14(21)13(12)20)6-1-2-8(17)9(18)3-6/h1-5,16-19,21H",...,"InChI=1S/C15H10O7/c16-7-4-10(19)12-11(5-7)22-15(14(21)13(12)20)6-1-2-8(17)9(18)3-6/h1-5,16-19,21H",REFJWTPEDVJJIY-UHFFFAOYSA-N,0,C15H10O7,302.042653,102040,quercetin (peak1),3.324816,4.324816,5
2,/global/cfs/cdirs/metatlas/raw_data/jgi/20230321_JGI_TN_507992_LabUnlabStdMixes_final_QEHF_C18_USDAY63680/20230321_JGI_TN_507992_LabUnlabStdMixes_final_QEHF_C18_USDAY63680_POS_MS2_6_PolyphenolsMix1_1_Rg80to1200-CE102040-metasci-S1_Run26.h5,C18,kaempferol,[M+Na]+,positive,4.171614,309.036956,OC1=CC=C(C=C1)C1=C(O)C(=O)C2=C(O)C=C(O)C=C2O1,peak1,"InChI=1S/C15H10O6/c16-8-3-1-7(2-4-8)15-14(20)13(19)12-10(18)5-9(17)6-11(12)21-15/h1-6,16-18,20H",...,"InChI=1S/C15H10O6/c16-8-3-1-7(2-4-8)15-14(20)13(19)12-10(18)5-9(17)6-11(12)21-15/h1-6,16-18,20H",IYRMWMYZSQPJKC-UHFFFAOYSA-N,0,C15H10O6,286.047738,102040,kaempferol (peak1),3.671614,4.671614,5
3,/global/cfs/cdirs/metatlas/raw_data/jgi/20230321_JGI_TN_507992_LabUnlabStdMixes_final_QEHF_C18_USDAY63680/20230321_JGI_TN_507992_LabUnlabStdMixes_final_QEHF_C18_USDAY63680_POS_MS2_6_PolyphenolsMix1_1_Rg80to1200-CE102040-metasci-S1_Run26.h5,C18,quercetin,[M+Na]+,positive,4.171614,325.031871,C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O)O,peak2,"InChI=1S/C15H10O7/c16-7-4-10(19)12-11(5-7)22-15(14(21)13(12)20)6-1-2-8(17)9(18)3-6/h1-5,16-19,21H",...,"InChI=1S/C15H10O7/c16-7-4-10(19)12-11(5-7)22-15(14(21)13(12)20)6-1-2-8(17)9(18)3-6/h1-5,16-19,21H",REFJWTPEDVJJIY-UHFFFAOYSA-N,0,C15H10O7,302.042653,102040,quercetin (peak2),3.671614,4.671614,5
4,/global/cfs/cdirs/metatlas/raw_data/jgi/20230321_JGI_TN_507992_LabUnlabStdMixes_final_QEHF_C18_USDAY63680/20230321_JGI_TN_507992_LabUnlabStdMixes_final_QEHF_C18_USDAY63680_POS_MS2_6_PolyphenolsMix1_1_Rg80to1200-CE102040-metasci-S1_Run26.h5,C18,kaempferol,[M+Na]+,positive,4.396081,309.036956,OC1=CC=C(C=C1)C1=C(O)C(=O)C2=C(O)C=C(O)C=C2O1,peak2,"InChI=1S/C15H10O6/c16-8-3-1-7(2-4-8)15-14(20)13(19)12-10(18)5-9(17)6-11(12)21-15/h1-6,16-18,20H",...,"InChI=1S/C15H10O6/c16-8-3-1-7(2-4-8)15-14(20)13(19)12-10(18)5-9(17)6-11(12)21-15/h1-6,16-18,20H",IYRMWMYZSQPJKC-UHFFFAOYSA-N,0,C15H10O6,286.047738,102040,kaempferol (peak2),3.896081,4.896081,5


In [11]:
# Display the entries that are not yet in the MSMS refs.
# NOTE(review): this repeats the table already shown by the search cell above.
notin_msms_refs

Unnamed: 0,standard_lcmsrun,chromatography,compound_name,adduct,polarity,rt_peak,mz_theoretical,smiles,peak_index,inchi,...,neutralized_inchi,neutralized_inchi_key,permanent_charge,formula,monoisotopic_mass,collision_energy,label,rt_min,rt_max,mz_tolerance
0,/global/cfs/cdirs/metatlas/raw_data/jgi/20230321_JGI_TN_507992_LabUnlabStdMixes_final_QEHF_C18_USDAY63680/20230321_JGI_TN_507992_LabUnlabStdMixes_final_QEHF_C18_USDAY63680_POS_MS2_6_PolyphenolsMix1_1_Rg80to1200-CE102040-metasci-S1_Run26.h5,C18,baicalin,[M+H]+,positive,3.642457,447.092187,O[C@@H]1[C@@H](O)[C@@H](OC2=C(O)C(O)=C3C(=O)C=C(OC3=C2)C2=CC=CC=C2)O[C@@H]([C@H]1O)C(O)=O,peak1,"InChI=1S/C21H18O11/c22-9-6-10(8-4-2-1-3-5-8)30-11-7-12(14(23)15(24)13(9)11)31-21-18(27)16(25)17(26)19(32-21)20(28)29/h1-7,16-19,21,23-27H,(H,28,29)/t16-,17-,18+,19-,21-/m0/s1",...,"InChI=1S/C21H18O11/c22-9-6-10(8-4-2-1-3-5-8)30-11-7-12(14(23)15(24)13(9)11)31-21-18(27)16(25)17(26)19(32-21)20(28)29/h1-7,16-19,21,23-27H,(H,28,29)/t16-,17-,18+,19-,21-/m0/s1",IKIIZLYTISPENI-UNJWAJPSSA-N,0,C21H18O11,446.084911,102040,baicalin,3.142457,4.142457,5
1,/global/cfs/cdirs/metatlas/raw_data/jgi/20230321_JGI_TN_507992_LabUnlabStdMixes_final_QEHF_C18_USDAY63680/20230321_JGI_TN_507992_LabUnlabStdMixes_final_QEHF_C18_USDAY63680_POS_MS2_6_PolyphenolsMix1_1_Rg80to1200-CE102040-metasci-S1_Run26.h5,C18,quercetin,[M+Na]+,positive,3.824816,325.031871,C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O)O,peak1,"InChI=1S/C15H10O7/c16-7-4-10(19)12-11(5-7)22-15(14(21)13(12)20)6-1-2-8(17)9(18)3-6/h1-5,16-19,21H",...,"InChI=1S/C15H10O7/c16-7-4-10(19)12-11(5-7)22-15(14(21)13(12)20)6-1-2-8(17)9(18)3-6/h1-5,16-19,21H",REFJWTPEDVJJIY-UHFFFAOYSA-N,0,C15H10O7,302.042653,102040,quercetin (peak1),3.324816,4.324816,5
2,/global/cfs/cdirs/metatlas/raw_data/jgi/20230321_JGI_TN_507992_LabUnlabStdMixes_final_QEHF_C18_USDAY63680/20230321_JGI_TN_507992_LabUnlabStdMixes_final_QEHF_C18_USDAY63680_POS_MS2_6_PolyphenolsMix1_1_Rg80to1200-CE102040-metasci-S1_Run26.h5,C18,kaempferol,[M+Na]+,positive,4.171614,309.036956,OC1=CC=C(C=C1)C1=C(O)C(=O)C2=C(O)C=C(O)C=C2O1,peak1,"InChI=1S/C15H10O6/c16-8-3-1-7(2-4-8)15-14(20)13(19)12-10(18)5-9(17)6-11(12)21-15/h1-6,16-18,20H",...,"InChI=1S/C15H10O6/c16-8-3-1-7(2-4-8)15-14(20)13(19)12-10(18)5-9(17)6-11(12)21-15/h1-6,16-18,20H",IYRMWMYZSQPJKC-UHFFFAOYSA-N,0,C15H10O6,286.047738,102040,kaempferol (peak1),3.671614,4.671614,5
3,/global/cfs/cdirs/metatlas/raw_data/jgi/20230321_JGI_TN_507992_LabUnlabStdMixes_final_QEHF_C18_USDAY63680/20230321_JGI_TN_507992_LabUnlabStdMixes_final_QEHF_C18_USDAY63680_POS_MS2_6_PolyphenolsMix1_1_Rg80to1200-CE102040-metasci-S1_Run26.h5,C18,quercetin,[M+Na]+,positive,4.171614,325.031871,C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O)O,peak2,"InChI=1S/C15H10O7/c16-7-4-10(19)12-11(5-7)22-15(14(21)13(12)20)6-1-2-8(17)9(18)3-6/h1-5,16-19,21H",...,"InChI=1S/C15H10O7/c16-7-4-10(19)12-11(5-7)22-15(14(21)13(12)20)6-1-2-8(17)9(18)3-6/h1-5,16-19,21H",REFJWTPEDVJJIY-UHFFFAOYSA-N,0,C15H10O7,302.042653,102040,quercetin (peak2),3.671614,4.671614,5
4,/global/cfs/cdirs/metatlas/raw_data/jgi/20230321_JGI_TN_507992_LabUnlabStdMixes_final_QEHF_C18_USDAY63680/20230321_JGI_TN_507992_LabUnlabStdMixes_final_QEHF_C18_USDAY63680_POS_MS2_6_PolyphenolsMix1_1_Rg80to1200-CE102040-metasci-S1_Run26.h5,C18,kaempferol,[M+Na]+,positive,4.396081,309.036956,OC1=CC=C(C=C1)C1=C(O)C(=O)C2=C(O)C=C(O)C=C2O1,peak2,"InChI=1S/C15H10O6/c16-8-3-1-7(2-4-8)15-14(20)13(19)12-10(18)5-9(17)6-11(12)21-15/h1-6,16-18,20H",...,"InChI=1S/C15H10O6/c16-8-3-1-7(2-4-8)15-14(20)13(19)12-10(18)5-9(17)6-11(12)21-15/h1-6,16-18,20H",IYRMWMYZSQPJKC-UHFFFAOYSA-N,0,C15H10O6,286.047738,102040,kaempferol (peak2),3.896081,4.896081,5


In [None]:
# NOTE(review): the return value is discarded and the argument is the existing `msms_refs`
# table rather than `notin_msms_refs` — confirm whether this call works in place or whether
# its result should be captured for the new entries.
sta.format_for_msms_refs(msms_refs)

In [13]:
# Sanity check: (rows, columns) of the selected MS2 spectra table.
top_spectra_selected.shape

(18, 13)

# Store selected compound/adduct in MSMS refs

In [None]:
# Combine existing and new MSMS refs
# NOTE(review): the recorded outputs show `msms_refs` with 4 columns while `notin_msms_refs`
# carries the much wider RT-peak layout. pd.concat keeps the union of columns by default, so
# columns present in only one table are filled with NaN — confirm the new rows are reformatted first.
# NOTE(review): without ignore_index=True the appended rows keep their own index labels — confirm that is intended.
new_msms_refs = pd.concat([msms_refs, notin_msms_refs])
# Report the shapes before and after appending.
print(f"Existing MSMS refs: {msms_refs.shape}")
print(f"New MSMS refs: {new_msms_refs.shape}")

In [None]:
# Export new MSMS refs file