In [1]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import pandas as pd
import sys
import ipywidgets as widgets
from IPython.display import display, clear_output
import pickle
from datetime import datetime
import math
import glob
import os
import base64

sys.path.insert(1, '/global/homes/b/bkieft/metatlas')
import notebooks.standards_library.standard_annotation as sta
pd.options.display.max_colwidth = 300

# Enter Reference Standard information

In [2]:
ppm_tolerance = 5
include_polarities = ['POS', 'NEG']
#include_chromatographies = ['C18', 'HILIC'] # 'C18' and/or 'HILIC'
include_chromatographies = ['C18']
include_adducts = ['[M+H]+', '[M+Na]+', '[M-H2O+H]+', '[M+K]+', '[M+NH4]+', '[M]+', '[M+2H]2+','[M-H]-', '[M+Cl]-', '[M]-', '[M-2H]2-']

# path_to_standards_files = '/global/homes/b/bkieft/metabolomics_standards_annotation/standard_library_annotation/metasci-flavanoids_output'
# csv_standard_info_path = f'{path_to_standards_files}/metasci_flav_annotation_input.csv'
path_to_standards_files = '/global/homes/b/bkieft/metabolomics_standards_annotation/standard_library_annotation/'
csv_standard_info_path = f'{path_to_standards_files}/scheller_test.csv'

input_compounds = pd.read_csv(csv_standard_info_path)
new_analysis = False

# Build run table with adducts

In [3]:
standard_lcmsruns_table = sta.build_standard_lcmsrun_table(csv_standard_info_path, include_polarities=include_polarities, include_chromatographies=include_chromatographies)
standard_lcmsruns_table_with_adducts = sta.build_adduct_annotated_table(standard_lcmsruns_table, include_adducts=include_adducts)

[16:33:24] ERROR: 

[16:33:24] ERROR: 

[16:33:24] ERROR: 

[16:33:24] ERROR: 

[16:33:24] ERROR: 

[16:33:24] ERROR: 

[16:33:24] ERROR: 

[16:33:24] ERROR: 

[16:33:24] ERROR: 

[16:33:24] ERROR: 

[16:33:24] ERROR: 

[16:33:24] ERROR: 

[16:33:24] ERROR: 

[16:33:24] ERROR: 

[16:33:24] ERROR: 



# Get EIC and Spectra information from run table

In [4]:
# test = standard_lcmsruns_table_with_adducts[standard_lcmsruns_table_with_adducts['standard_lcmsrun'].str.contains('Sorgoleone')]
# test = test[test['standard_lcmsrun'].str.contains('C18')]
# test = test[test['standard_lcmsrun'].str.contains('POS')]
# test = test[test['standard_lcmsrun'].str.contains('205060')]
# test.shape

In [5]:
if new_analysis is True:
    eics_list, top_spectra_list, group_name_list, rt_peak_list = sta.extract_data(lcmsruns_table=standard_lcmsruns_table_with_adducts, 
                                                                                    ppm_tolerance=ppm_tolerance,
                                                                                    method="find_peaks")
    runnum_to_structure_image_grid = sta.generate_gridded_molecular_images(standard_lcmsruns_table_with_adducts)

# Save and/or read data

In [6]:
if new_analysis is True:
    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    standards_data_filename = csv_standard_info_path.replace(".csv", f"_{current_time}_standards_data.pkl")
    print(f"Saving standards data to: {standards_data_filename}")
    with open(standards_data_filename, 'wb') as f:
        pickle.dump((eics_list, top_spectra_list, group_name_list, rt_peak_list, runnum_to_structure_image_grid), f)
elif new_analysis is False:
    pkl_files = glob.glob(f"{path_to_standards_files}/*_standards_data.pkl")
    most_recent_pkl = max(pkl_files, key=os.path.getmtime)
    print(f"Loading most recent pkl file: {most_recent_pkl}")
    with open(most_recent_pkl, 'rb') as f:
        eics_list, top_spectra_list, group_name_list, rt_peak_list, runnum_to_structure_image_grid = pickle.load(f)

Loading most recent pkl file: /global/homes/b/bkieft/metabolomics_standards_annotation/standard_library_annotation/scheller_test_20250411_154752_standards_data.pkl


# Format EIC and Spectra information for plotting

In [7]:
processed_data = []
adduct_color = sta.generate_adduct_colors(include_adducts)
obj_lengths = [len(eics_list), len(top_spectra_list), len(group_name_list), len(rt_peak_list)]

if len(set(obj_lengths)) != 1:
    print(f"Warning: Lists have inconsistent lengths: {obj_lengths}")
number_of_groups = obj_lengths[0]

for i in range(number_of_groups):
    eics = eics_list[i]
    top_spectra = top_spectra_list[i]
    group_name = group_name_list[i]
    rt_peaks = rt_peak_list[i]

    # Extract group-specific information
    compound_name = group_name[0]
    group_file = group_name[1]
    compound_smiles = group_name[2]
    group_run_number = sta.get_run_num(group_file)
    group_chrom = sta.get_chromatography(group_file)
    group_pol = sta.get_file_polarity(group_file)
    group_params = sta.get_lcmsrun_params(group_file)

    # Unique identifier for the group
    group_id = f"{compound_name}_{group_chrom}_{group_pol}_{group_params}_{group_run_number}"
    unique_id = f"{compound_name};;{group_file}"
    
    # Extract unique adducts from EICs
    eic_adducts = sta.extract_adducts(eics)

    # Append processed data for further use
    processed_data.append({
        "group_id": group_id,
        "eics": eics,
        "top_spectra": top_spectra,
        "rt_peaks": rt_peaks,
        "compound_name": compound_name,
        "compound_smiles": compound_smiles,
        "group_file": group_file,
        "unique_id": unique_id,
        "group_run_number": group_run_number,
        "group_chrom": group_chrom,
        "group_pol": group_pol,
        "group_params": group_params,
        "adduct_color": adduct_color
    })

processed_data.sort(key=lambda x: x['group_run_number'])

# Create interactive plot and choose adducts

In [8]:
selected_good_adducts = {}
ambiguous_adducts = {}

def create_interactive_plots(processed_data, runnum_to_structure_image_grid):
    global selected_good_adducts

    # Widget Creation
    image_toggle = widgets.ToggleButton(
        value=False,  # Default to hidden
        description='Show Structures',
        tooltip='Toggle to show/hide the compound structure image',
        layout=widgets.Layout(width='150px', margin='30px 0 0 0')
    )
    yaxis_toggle = widgets.ToggleButton(
        value=False,  # Default to unique y-axis
        description='Shared Y-Axis',  # Description when toggled to shared y-axis
        tooltip='Toggle between unique and shared y-axes for non-log EIC plots',
        layout=widgets.Layout(width='150px', margin='30px 0 0 0')
    )
    next_button = widgets.Button(
        description="Next Group"
        )
    previous_button = widgets.Button(
        description="Previous Group"
        )
    progress_label = widgets.Label(
        value=f"1/{len(processed_data)} Groups Completed"
        )
    navigate_textbox = widgets.Text(
        placeholder='Index...',
        description='Go to:',
        layout=widgets.Layout(width='50px')
    )
    navigate_button = widgets.Button(
        description="Go",
        layout=widgets.Layout(width='50px')
    )
    compound_image_widget = widgets.Image(
        format='png',
        layout=widgets.Layout(
            width='400px',
            height='400px',
            margin='0 0 0 50px',
        )
    )
    output_container = widgets.Output()

    # Event Handlers
    def on_image_toggle_change(change):
        if image_toggle.value:
            image_toggle.description = 'Hide Structures'
            compound_image_widget.layout.display = 'block'
        else:
            image_toggle.description = 'Show Structures'
            compound_image_widget.layout.display = 'none'

    def update_progress_text():
        progress_label.value = f"{current_index + 1}/{len(processed_data)} Groups Completed"

    def on_toggle_change(change):
        yaxis_toggle.description = 'Shared Y-Axis' if not yaxis_toggle.value else 'Unique Y-Axis'
        update_plot(current_index)

    def navigate_to_group(b):
        nonlocal current_index
        try:
            target_index = int(navigate_textbox.value) - 1
            if 0 <= target_index < len(processed_data):
                current_index = target_index
                update_plot(current_index)
            else:
                output_container.clear_output(wait=True)
                with output_container:
                    print(f"Invalid index. Please enter a number between 1 and {len(processed_data)}.")
        except ValueError:
            output_container.clear_output(wait=True)
            with output_container:
                print("Invalid input. Please enter a valid integer.")

    def previous_group(b):
        nonlocal current_index
        if current_index > 0:
            current_index -= 1
            update_plot(current_index)
            # Reset the image toggle state
            image_toggle.value = False
            image_toggle.description = 'Show Structures'
        else:
            output_container.clear_output(wait=True)

    def next_group(b):
        nonlocal current_index
        if current_index < len(processed_data) - 1:
            current_index += 1
            update_plot(current_index)
            # Reset the image toggle state
            image_toggle.value = False
            image_toggle.description = 'Show Structures'
        else:
            output_container.clear_output(wait=True)
            with output_container:
                print("Analysis completed!")

    # Attach Event Handlers
    image_toggle.observe(on_image_toggle_change, names='value')
    yaxis_toggle.observe(on_toggle_change, names='value')
    next_button.on_click(next_group)
    previous_button.on_click(previous_group)
    navigate_button.on_click(navigate_to_group)

    # Layout Definitions
    def create_layout(checkboxes):
        checkbox_layout = widgets.VBox(
            checkboxes,
            layout=widgets.Layout(
                border='1px solid black',
                padding='5px',
                margin='5px',
                width='250px',
                align_items='flex-start'
            )
        )
        # Layout of the "Go To:" widget
        go_to_label = widgets.Label(value="Go To:")
        go_to_layout = widgets.HBox(
            [go_to_label, navigate_textbox, navigate_button],
            layout=widgets.Layout(
                justify_content='flex-start',  # Align to the far left
                spacing='5px',
                margin='30px 0 0 0'  # Add space above the widget
            )
        )
        # Update the size of the search box
        navigate_textbox.description = ""
        navigate_textbox.layout = widgets.Layout(width='150px')  # Decrease the size of the search box

        compound_image_widget.layout.display = 'none'
        image_toggle.layout.margin = '0 0 0 50px'

        navigation_buttons_layout = widgets.HBox(
            [
                widgets.VBox([next_button, previous_button]),  # Stack Previous and Next buttons vertically
                image_toggle  # Place the Image Toggle button to the right
            ],
            layout=widgets.Layout(
                justify_content='flex-start',  # Align items to the left
                spacing='10px',  # Add spacing between elements
                margin='0 0 0 0'  # No margin for the navigation buttons
            )
        )
        button_layout = widgets.VBox(
            [navigation_buttons_layout, progress_label, go_to_layout, yaxis_toggle],
            layout=widgets.Layout(
                align_items='flex-start',
                spacing='5px'
            )
        )
        top_layout = widgets.HBox(
            [checkbox_layout, button_layout, compound_image_widget],
            layout=widgets.Layout(
                align_items='flex-start',
                justify_content='flex-start',
                spacing='10px'
            )
        )
        return top_layout

    # Plot Update Logic
    def update_plot(index):
        nonlocal current_index
        data = processed_data[index]
    
        eics = data['eics']
        top_spectra = data['top_spectra']
        rt_peaks = data['rt_peaks']
        adduct_color = data['adduct_color']
        group_id = data['group_id']
        unique_id = data['unique_id']
        group_run_number = data['group_run_number']

        # Extract adduct-peak combinations from rt_peaks and top_spectra
        adduct_peak_combinations = []
        if isinstance(rt_peaks, pd.DataFrame) and not rt_peaks.empty:
            # Create a mapping from adducts to peak indices and intensities
            adduct_to_peaks = {}
            for _, peak_row in rt_peaks.iterrows():
                adduct = peak_row['adduct'] if 'adduct' in peak_row else None
                if adduct:
                    if adduct not in adduct_to_peaks:
                        adduct_to_peaks[adduct] = []
                    adduct_to_peaks[adduct].append({
                        'peak_index': peak_row['peak_index'],
                        'intensity': peak_row['intensity']
                    })

            # Create unique identifiers for each adduct-peak combination
            for adduct, peaks in adduct_to_peaks.items():
                max_intensity = max(peak['intensity'] for peak in peaks)
                for peak in peaks:
                    adduct_peak_combinations.append({
                        'adduct': adduct,
                        'peak_index': peak['peak_index'],
                        'description': f"{adduct} ({peak['peak_index']})",
                        'max_intensity': max_intensity
                    })
        # Sort adduct_peak_combinations by max_intensity in descending order
        adduct_peak_combinations.sort(key=lambda x: x['max_intensity'], reverse=True)

        # Create the summary EIC plot data
        group_run_eics = [
            eic for pdata in processed_data if pdata['group_run_number'] == group_run_number
            for eic in pdata['eics'].values()
        ]
        summary_traces = []
        summary_xmin_list = []
        summary_xmax_list = []
        for eic in group_run_eics:
            # Loop through each row in the eic DataFrame
            for _, eic_row in eic.iterrows():
                # Filter data where intensity is above 1e5
                valid_indices = eic_row['i'] > 1e5
                filtered_rt = eic_row['rt'][valid_indices]
                filtered_i = eic_row['i'][valid_indices]

                if len(filtered_rt) > 0:  # Ensure there are valid points
                    # Sort retention times
                    rt_sort = np.argsort(filtered_rt)
                    adduct = sta.get_adduct(eic_row['label'])  # Extract adduct from the label
                    color = adduct_color.get(adduct, 'gray')  # Default to gray if adduct color is missing
                    label = eic_row['label']

                    # Update x_min and x_max based on filtered data
                    summary_xmin_list.append(filtered_rt.min())
                    summary_xmax_list.append(filtered_rt.max())

                    # Add a trace for the current adduct
                    summary_traces.append(
                        go.Scatter(
                            x=filtered_rt[rt_sort],
                            y=filtered_i[rt_sort],
                            mode='lines',
                            name=f"{label}",
                            line=dict(color=color),
                            showlegend=False
                        )
                    )
        x_min = min(summary_xmin_list) if summary_xmin_list else None
        x_max = max(summary_xmax_list) if summary_xmax_list else None

        eic_adducts = set()
        for eic in eics.values():
            eic['adduct'] = eic.label.apply(lambda x: x.split('_')[-1])
            eic_adducts.update(eic.adduct.tolist())

        num_spectra = len(top_spectra)
        num_columns = 4  # Increase columns to 3 to accommodate the summary plot
        num_spectra_rows = math.ceil(num_spectra / num_columns)

        # Create the figure with subplots
        fig = make_subplots(
            rows=2 + num_spectra_rows,  # Adjust rows dynamically
            cols=4,  # Number of columns
            shared_xaxes=False,
            shared_yaxes=yaxis_toggle.value,
            vertical_spacing=0.3 / (2 + num_spectra_rows),
            horizontal_spacing=0.1,
            subplot_titles=[
                "Sample",  # Row 1, Col 1
                "Blank",  # Row 1, Col 2
                "EIC Summary",  # Row 1-2, Col 3-4
                "Sample (Log)",  # Row 2, Col 1
                "Blank (Log)",  # Row 2, Col 2
                *(f"{row['adduct']} @ {round(row['rt'], 2)} mins" for _, row in top_spectra.iterrows())  # Spectra titles
            ],
            specs=[
                [{"type": "scatter"}, {"type": "scatter"}, {"type": "scatter", "rowspan": 2, "colspan": 2}, None],  # Row 1
                [{"type": "scatter"}, {"type": "scatter"}, None, None],  # Row 2
                *[[{"type": "scatter"} for _ in range(4)] for _ in range(num_spectra_rows)]  # Spectra rows
            ]
        )
        fig.update_xaxes(range=[x_min, x_max], row=1, col=3)  # Set x-axis bounds for the summary graph

        # Add the summary traces to the spanning subplot
        for trace in summary_traces:
            fig.add_trace(trace, row=1, col=3)  # Add to row 1, col 3

        # Add EIC traces for each adduct/peak
        for idx, (lcmsrun_path, eic) in enumerate(eics.items()):
            for i, eic_row in eic.iterrows():
                rt_sort = np.argsort(eic_row['rt'])
                adduct = sta.get_adduct(eic_row['label'])
                color = adduct_color[adduct]
                
                # Determine row and column for the current trace
                row = 1 if idx < 2 else 2
                col = (idx % 2) + 1
                
                # Dynamic facet_name determination 
                if row == 1 and col == 1:
                    facet_name = "Sample"
                elif row == 1 and col == 2:
                    facet_name = "Blank"
                elif row == 2 and col == 1:
                    facet_name = "Sample (Log)"
                elif row == 2 and col == 2:
                    facet_name = "Blank (Log)"

                # Add line traces for raw intensity
                trace_index = len(fig.data)
                fig.add_trace(
                    go.Scatter(
                        x=eic_row['rt'][rt_sort],
                        y=eic_row['i'][rt_sort],
                        mode='lines',
                        name=f"{adduct} {facet_name}",  # Include facet_name in legend
                        line=dict(color=color),
                        showlegend=True
                    ),
                    row=row,
                    col=col
                )

                # Recalculate facet_name for log-transformed traces
                if row + 1 == 2 and col == 1:
                    facet_name = "Sample (Log)"
                elif row + 1 == 2 and col == 2:
                    facet_name = "Blank (Log)"

                # Add line traces for log-transformed intensity
                trace_index = len(fig.data)
                fig.add_trace(
                    go.Scatter(
                        x=eic_row['rt'][rt_sort],
                        y=np.log10(eic_row['i'][rt_sort].astype(float)),
                        mode='lines',
                        name=f"{adduct} {facet_name}",  # Include updated facet_name in legend
                        line=dict(color=color),
                        showlegend=True
                    ),
                    row=row + 1,  # Log traces go to the next row
                    col=col
                )

                # Add peak markers for each peak associated with this adduct
                if not rt_peaks.empty:
                    if facet_name == "Sample" or facet_name == "Sample (Log)":
                        adduct_peaks = rt_peaks[rt_peaks['adduct'] == adduct]
                        for _, peak_info in adduct_peaks.iterrows():
                            peak_rt = peak_info['rt_peak']
                            peak_index = peak_info['peak_index']
                            peak_intensity = peak_info['intensity']

                            # Add marker for raw intensity
                            fig.add_trace(
                                go.Scatter(
                                    x=[peak_rt],
                                    y=[peak_intensity],
                                    mode='markers',
                                    marker=dict(color=color, size=10),
                                    name=f"{adduct} {peak_index}",
                                    showlegend=False
                                ),
                                row=row,
                                col=col
                            )

                            # Add marker for log-transformed intensity
                            fig.add_trace(
                                go.Scatter(
                                    x=[peak_rt],
                                    y=[np.log10(peak_intensity)],
                                    mode='markers',
                                    marker=dict(color=color, size=10),
                                    name=f"{adduct} {peak_index}",
                                    showlegend=False
                                ),
                                row=row + 1,  # Log traces go to the next row
                                col=col
                            )

                # Add MS2 spectra markers
                if not top_spectra.empty:
                    if facet_name == "Sample" or facet_name == "Sample (Log)":
                        adduct_spectra = top_spectra[top_spectra['adduct'] == adduct]
                        # Remove adduct filtering to show all MS2 spectra
                        for _, spectrum_row in adduct_spectra.iterrows():
                            spectrum_adduct = spectrum_row['adduct']
                            spectrum_peak_index = spectrum_row['peak_index']
                            rounded_rt = round(spectrum_row['rt'], 2)
                            marker_color = adduct_color.get(spectrum_adduct, 'gray')

                            # Find closest point in the current EIC
                            sorted_rt = eic_row['rt'][rt_sort]
                            sorted_intensity = eic_row['i'][rt_sort]
                            
                            # Skip if no intensity data available
                            if len(sorted_rt) == 0 or len(sorted_intensity) == 0:
                                continue
                                
                            # Find the closest RT point in the EIC
                            closest_idx = np.argmin(np.abs(sorted_rt - spectrum_row['rt']))
                            
                            if closest_idx >= len(sorted_rt):
                                # If the index is out of bounds, skip this spectrum
                                print(f"Warning: RT {spectrum_row['rt']} is out of bounds for EIC RT range.")
                                continue
                            
                            raw_intensity = sorted_intensity[closest_idx]
                            log_intensity = np.log10(raw_intensity)
                            
                            # Display marker for all spectra on the raw intensity plot
                            fig.add_trace(
                                go.Scatter(
                                    x=[spectrum_row['rt']],
                                    y=[raw_intensity],
                                    mode='markers',
                                    marker=dict(color=marker_color, symbol='x', size=10),
                                    name=f"MS2: {spectrum_adduct} ({spectrum_peak_index}) @ {rounded_rt}",
                                    showlegend=False
                                ),
                                row=row,
                                col=col
                            )
                            
                            # Display marker for all spectra on the log-transformed plot
                            fig.add_trace(
                                go.Scatter(
                                    x=[spectrum_row['rt']],
                                    y=[log_intensity],
                                    mode='markers',
                                    marker=dict(color=marker_color, symbol='x', size=10),
                                    name=f"MS2: {spectrum_adduct} ({spectrum_peak_index}) @ {rounded_rt}",
                                    showlegend=False
                                ),
                                row=row + 1,
                                col=col
                            )

        # Add traces for Spectra plots
        top_spectra_sorted = top_spectra.sort_values(['adduct', 'peak_index'])

        mz_list = [lst[0] for lst in top_spectra_sorted['spectrum'] if isinstance(lst, (list, np.ndarray)) and len(lst) > 0]
        mz_list_flattened = np.concatenate([np.ravel(arr) if isinstance(arr, np.ndarray) else np.array([arr]) for arr in mz_list])
        lowest_mz = np.min(mz_list_flattened)*0.9
        highest_mz = np.max(mz_list_flattened)*1.1

        for i, spectrum_row in enumerate(top_spectra_sorted.iterrows()):
            mz_values = spectrum_row[1]['spectrum'][0]
            i_values = spectrum_row[1]['spectrum'][1]
            adduct = spectrum_row[1]['adduct']
            color = adduct_color[adduct]
            precursor_mz = spectrum_row[1]['precursor_mz']
            peak_index = spectrum_row[1]['peak_index']
            spectrum_title = f"{adduct} ({peak_index}) @ {round(spectrum_row[1]['rt'], 2)} mins"

            # Determine the row and column for this spectrum
            spectrum_row_idx = 3 + (i // num_columns)  # Start after EIC rows
            spectrum_col = (i % num_columns) + 1

            # Update the x-axis range for the current subplot
            fig.update_xaxes(
                range=[lowest_mz, highest_mz],  # Set x-axis limits
                row=spectrum_row_idx,
                col=spectrum_col
            )

            # Add vertical lines for each point
            for mz, intensity in zip(mz_values, i_values):
                fig.add_trace(
                    go.Scatter(
                        x=[mz, mz],
                        y=[0, intensity],
                        mode='lines',
                        line=dict(color=color),
                        showlegend=False
                    ),
                    row=spectrum_row_idx,
                    col=spectrum_col
                )

            # Add markers for each point
            fig.add_trace(
                go.Scatter(
                    x=mz_values,
                    y=i_values,
                    mode='markers',
                    marker=dict(color=color, size=6),
                    name=f"Spectrum {i+1}: {adduct}",
                    showlegend=False
                ),
                row=spectrum_row_idx,
                col=spectrum_col
            )

            # Add a black circle at precursor_mz (y=0)
            fig.add_trace(
                go.Scatter(
                    x=[precursor_mz],
                    y=[0],
                    mode='markers',
                    marker=dict(color='black', symbol='circle', size=20),
                    name=f"Precursor MZ: {precursor_mz}",
                    showlegend=False
                ),
                row=spectrum_row_idx,
                col=spectrum_col
            )
            
            fig.layout.annotations[5 + i].text = spectrum_title

        # Update layout
        fig_title = (group_id.replace('_', '  |  '))
        fig.update_layout(
            hoverlabel=dict(
                font_size=11,  # Increase font size for better readability
                namelength=-1  # Show the full name without truncation
            ),
            title=dict(text=fig_title, font=dict(size=14), x=0.5, xanchor="center"),
            height=700 + 300 * num_spectra_rows,  # Adjust height to fit all plots
            width=1500,  # Increase width to fit the new column
            plot_bgcolor="white",  # Set the plotting area background to white
            paper_bgcolor="white",  # Set the figure background to white
            legend=dict(
                    orientation="h",  # Horizontal legend
                    xanchor="center",
                    yanchor="top",
                    x=0.5,  # Center the legend horizontally
                    y=-0.2,  # Position the legend below the EIC Summary
            )
        )

        # Add black borders and keep gridlines
        fig.update_xaxes(
            showline=True,  # Show axis line
            linewidth=1,  # Set line width
            linecolor="black",  # Set line color to black
            showgrid=True,  # Keep gridlines
            gridcolor="lightgray"  # Set gridline color
        )
        fig.update_yaxes(
            showline=True,  # Show axis line
            linewidth=1,  # Set line width
            linecolor="black",  # Set line color to black
            showgrid=True,  # Keep gridlines
            gridcolor="lightgray"  # Set gridline color
        )

        # Update the compound image based on group_run_number
        if group_run_number in runnum_to_structure_image_grid:
            compound_image_widget.value = base64.b64decode(runnum_to_structure_image_grid[group_run_number])
        else:
            compound_image_widget.value = b''  # Clear the image if not found

        # Create checkboxes for selecting good adducts with peak indices
        checkboxes = [
            widgets.Checkbox(
                value=False,
                description=combo['description'],
                disabled=False
            )
            for combo in adduct_peak_combinations
        ]
        ambiguous_checkbox = widgets.Checkbox(
            value=False,
            description="Ambiguous",
            disabled=False
        )
        checkboxes.append(ambiguous_checkbox)
        checkbox_dict = {checkbox.description: checkbox for checkbox in checkboxes}

        def on_checkbox_change(change):
            if ambiguous_checkbox.value:
                ambiguous_adducts[unique_id] = unique_id
                selected_good_adducts.pop(unique_id, None)
            else:
                selected_good_adducts[unique_id] = [
                    combo['adduct'] + "||" + str(combo['peak_index'])
                    for combo in adduct_peak_combinations 
                    if checkbox_dict[combo['description']].value
                ]
                ambiguous_adducts.pop(unique_id, None)

        for checkbox in checkboxes:
            checkbox.observe(on_checkbox_change, names='value')

        # Set previously selected values
        if unique_id in selected_good_adducts:
            for selected_combo in selected_good_adducts[unique_id]:
                adduct, peak_index = selected_combo.split("||")
                for combo in adduct_peak_combinations:
                    if combo['adduct'] == adduct and str(combo['peak_index']) == peak_index:
                        combo_description = combo['description']
                        if combo_description in checkbox_dict:
                            checkbox_dict[combo_description].value = True
        elif unique_id in ambiguous_adducts:
            ambiguous_checkbox.value = True

        # Create the layout with checkboxes
        top_layout = create_layout(checkboxes)
        plot_and_checkboxes = widgets.VBox(
            [top_layout, widgets.Output()],
            layout=widgets.Layout(align_items='flex-start')
        )

        clear_output(wait=True)
        update_progress_text()
        display(plot_and_checkboxes)
        with plot_and_checkboxes.children[1]:
            display(fig)
        display(output_container)

    # Initialize
    current_index = 0
    update_plot(current_index)

create_interactive_plots(processed_data, runnum_to_structure_image_grid)

VBox(children=(HBox(children=(VBox(children=(Checkbox(value=False, description='[M-H]- (peak1)'), Checkbox(val…

Output()

# Preview selections

In [9]:
if len(selected_good_adducts) == 0:
    print("No good adducts selected.")
    selected_compounds_table = pd.DataFrame()
else:
    selected_compounds_table = pd.DataFrame({
        'index': selected_good_adducts.keys(),
        'selected_adduct_peaks': selected_good_adducts.values()
    }).reset_index(drop=True)

    selected_compounds_table[['compound_name', 'standard_lcmsrun']] = selected_compounds_table['index'].str.split(';;', expand=True)

    selected_compounds_table['selected_adducts'] = selected_compounds_table['selected_adduct_peaks'].apply(
        lambda x: [item.split('||')[0] for item in x]
    )
    selected_compounds_table['selected_peak_indices'] = selected_compounds_table['selected_adduct_peaks'].apply(
        lambda x: [item.split('||')[1] for item in x]
    )

    selected_compounds_table = selected_compounds_table.drop(columns=['index', 'selected_adduct_peaks'])
    
selected_compounds_table

Unnamed: 0,compound_name,standard_lcmsrun,selected_adducts,selected_peak_indices
0,sorgoleone,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE102040norm-200uM-S1_Run382.h5,"[[M+Na]+, [M+H]+]","[peak1, peak1]"
1,sorgoleone,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE205060norm-200uM-S1_Run383.h5,"[[M+Na]+, [M+H]+]","[peak1, peak1]"
2,sorgoleone,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_NEG_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE102040norm-200uM-S1_Run384.h5,[[M-H]-],[peak1]
3,sorgoleone,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_NEG_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE205060norm-200uM-S1_Run385.h5,[[M-H]-],[peak1]
4,cis-aconitic acid,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-2_cis-AconiticAcid_1_Rg80to1200-CE102040norm-10ugmL-S1_Run387.h5,"[[M+Na]+, [M-H2O+H]+, [M-H2O+H]+, [M+H]+, [M+H]+]","[peak1, peak1, peak2, peak1, peak2]"
5,cis-aconitic acid,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-2_cis-AconiticAcid_1_Rg80to1200-CE205060norm-10ugmL-S1_Run388.h5,"[[M+Na]+, [M-H2O+H]+, [M-H2O+H]+, [M+H]+, [M+H]+]","[peak1, peak1, peak2, peak1, peak2]"
6,cis-aconitic acid,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_NEG_MS2_RefStd-2_cis-AconiticAcid_1_Rg80to1200-CE102040norm-10ugmL-S1_Run389.h5,"[[M-H]-, [M-H]-]","[peak1, peak2]"
7,cis-aconitic acid,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_NEG_MS2_RefStd-2_cis-AconiticAcid_1_Rg80to1200-CE205060norm-10ugmL-S1_Run390.h5,"[[M-H]-, [M-H]-]","[peak1, peak2]"
8,trans-aconitic acid,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-3_trans-AconiticAcid_1_Rg80to1200-CE102040norm-10ugmL-S1_Run394.h5,"[[M+Na]+, [M-H2O+H]+]","[peak1, peak1]"
9,trans-aconitic acid,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-3_trans-AconiticAcid_1_Rg80to1200-CE205060norm-10ugmL-S1_Run395.h5,"[[M+Na]+, [M-H2O+H]+, [M-H2O+H]+]","[peak1, peak1, peak2]"


In [10]:
if len(ambiguous_adducts) == 0:
    print("No ambiguous adducts selected.")
    ambiguous_adducts_table = pd.DataFrame()
else:
    ambiguous_adducts_table = pd.DataFrame({
        'unique_id': ambiguous_adducts.keys(),
        'combined': ambiguous_adducts.values()
    }).reset_index(drop=True)

    ambiguous_adducts_table[['compound_name', 'standard_lcmsrun']] = ambiguous_adducts_table['unique_id'].str.split(';;', expand=True)

    ambiguous_adducts_table = ambiguous_adducts_table.drop(columns=['unique_id', 'combined'])

ambiguous_adducts_table

No ambiguous adducts selected.


# Subset Run Table, RT Peak, EICs, and Top Spectra by selected adducts

In [11]:
standard_lcmsruns_table_with_adducts_subset = sta.select_compounds_from_gui(standard_lcmsruns_table_with_adducts, selected_compounds_table)
standard_lcmsruns_table_with_adducts_subset.shape

(19, 12)

In [12]:
rt_peak_annotations = pd.concat(rt_peak_list).rename(columns={'lcmsrun': 'standard_lcmsrun'})
rt_peak_annotations_subset = sta.select_compounds_from_gui(rt_peak_annotations, selected_compounds_table)
rt_peak_annotations_subset.shape

(26, 12)

In [13]:
top_spectra_annotations = pd.concat(top_spectra_list, ignore_index=True).rename(columns={'lcmsrun': 'standard_lcmsrun'})
top_spectra_annotations['compound_name'] = top_spectra_annotations['label'].apply(lambda x: x.split('_')[0])
top_spectra_annotations_subset = sta.select_compounds_from_gui(top_spectra_annotations, selected_compounds_table)
top_spectra_annotations_subset.shape

(22, 13)

In [14]:
eics_annotations = pd.concat([df.assign(key=key) for d in eics_list for key, df in d.items()],ignore_index=True).rename(columns={'key': 'standard_lcmsrun'})
eics_annotations['compound_name'] = eics_annotations['label'].apply(lambda x: x.split('_')[0])
eics_annotations_subset = sta.select_compounds_from_gui(eics_annotations, selected_compounds_table)
eics_annotations_subset.shape

(19, 8)

# Save and/or read filtered data

In [15]:
new_analysis = True

In [16]:
if new_analysis is True:
    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    subset_standards_data_filename = csv_standard_info_path.replace(".csv", f"_{current_time}_standards_data_subset.pkl")
    print(f"Saving standards data to: {subset_standards_data_filename}")
    with open(subset_standards_data_filename, 'wb') as f:
        pickle.dump((rt_peak_annotations_subset, top_spectra_annotations_subset, eics_annotations_subset, standard_lcmsruns_table_with_adducts_subset), f)
elif new_analysis is False:
    pkl_files = glob.glob(f"{path_to_standards_files}/*_standards_data_subset.pkl")
    most_recent_pkl = max(pkl_files, key=os.path.getmtime)
    print(f"Loading most recent pkl file: {most_recent_pkl}")
    with open(most_recent_pkl, 'rb') as f:
        rt_peak_annotations_subset, top_spectra_annotations_subset, eics_annotations_subset, standard_lcmsruns_table_with_adducts_subset = pickle.load(f)

Saving standards data to: /global/homes/b/bkieft/metabolomics_standards_annotation/standard_library_annotation//scheller_test_20250411_163437_standards_data_subset.pkl


# Sort RT Peak annotations by intensity and choose best adduct

In [21]:
test = rt_peak_annotations_subset[rt_peak_annotations_subset['compound_name'].str.contains("sorgoleone")]
idx_max_intensity = test.groupby(['chromatography', 'polarity', 'compound_name'])['intensity'].idxmax()
highest_intensity_row = test.loc[idx_max_intensity]
highest_intensity_row

Unnamed: 0,standard_lcmsrun,chromatography,compound_name,adduct,polarity,rt_peak,intensity,mz_observed,mz_theoretical,ppm_error,smiles,peak_index
5,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_NEG_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE205060norm-200uM-S1_Run385.h5,C18,sorgoleone,[M-H]-,NEG,7.547524,159076656.0,357.207306,357.207133,-0.482824,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1
3,/global/cfs/cdirs/metatlas/raw_data/jgi/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782/20241021_JGI_HS_510060_SorghExu_final1_IQX_C18_USDAY92782_POS_MS2_RefStd-1_Sorgoleone_1_Rg80to1200-CE205060norm-200uM-S1_Run383.h5,C18,sorgoleone,[M+Na]+,POS,7.544277,155523216.0,381.203949,381.203627,-0.843472,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,peak1


In [20]:
c18_pos, c18_neg, hilic_pos, hilic_neg = sta.save_rt_peaks_to_atlas_format(rt_peak_annotations_subset)
final_atlases = pd.concat([c18_pos, c18_neg, hilic_pos, hilic_neg], ignore_index=True)
final_atlases[final_atlases['compound_name'].str.contains("sorgoleone")]

Unnamed: 0,chromatography,compound_name,adduct,polarity,rt_peak,mz_theoretical,smiles,rt_min,rt_max,label,mz_tolerance,inchi,inchi_key
4,C18,sorgoleone peak1,[M+Na]+,positive,7.543334,381.203627,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,7.043334,8.043334,sorgoleone peak1,5,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N
5,C18,sorgoleone peak1,[M+Na]+,positive,7.544277,381.203627,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,7.044277,8.044277,sorgoleone peak1,5,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N
11,C18,sorgoleone peak1,[M-H]-,negative,7.547524,357.207133,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,7.047524,8.047524,sorgoleone peak1,5,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N
12,C18,sorgoleone peak1,[M-H]-,negative,7.549027,357.207133,COC1=CC(=O)C(=C(C1=O)CCCCCCC/C=C\C/C=C\CC=C)O,7.049027,8.049027,sorgoleone peak1,5,"InChI=1S/C22H30O4/c1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-18-21(24)19(23)17-20(26-2)22(18)25/h3,5-6,8-9,17,24H,1,4,7,10-16H2,2H3/b6-5-,9-8-",FGWRUVXUQWGLOX-AFJQJTPPSA-N


In [None]:
## TO DO: Add a check on final_atlases to make sure the adduct is consistent for a given compound across rows

# Check if selected compounds/adducts are in metatlas DB

In [27]:
final_atlases_compounds = final_atlases[['label', 'inchi', 'inchi_key']].drop_duplicates()
final_atlases_compounds = final_atlases_compounds[pd.notna(final_atlases_compounds['inchi_key'])]
in_db, notin_db, flat_in_db = sta.search_for_matches_in_metatlas_db(final_atlases_compounds)
print("\nThese compounds are not in the metatlas Compounds database table:")
pd.DataFrame(list(notin_db.items()), columns=['Label', 'InChIKey'])


0it [00:00, ?it/s]


Searching metatlas db for (+)-catechin (PFTAWBLQPZVEMU-DZGCQCFKSA-N)
Found 48 entries in metatlas db.

These compounds are not in the metatlas Compounds database table:


Unnamed: 0,Label,InChIKey


In [28]:
# Filter the DataFrame to include only rows missing from the database
final_atlases_compounds_not_in_db = final_atlases_compounds[
    final_atlases_compounds['inchi_key'].isin(list(notin_db.values()))
]
final_atlases_compounds_not_in_db

Unnamed: 0,label,inchi,inchi_key


# Store selected compounds in metatlas db

In [29]:
# Add attributes before storing
final_atlases_compounds_add_to_db = final_atlases_compounds_not_in_db.copy()
final_atlases_compounds_add_to_db['neutralized_inchi'] = final_atlases_compounds_add_to_db['inchi'].apply(sta.neutralize_inchi)
final_atlases_compounds_add_to_db['neutralized_inchi_key'] = final_atlases_compounds_add_to_db['neutralized_inchi'].apply(sta.inchi_to_inchikey)
final_atlases_compounds_add_to_db['permanent_charge'] = final_atlases_compounds_add_to_db['neutralized_inchi'].apply(sta.charge_from_inchi)
final_atlases_compounds_add_to_db['formula'] = final_atlases_compounds_add_to_db['neutralized_inchi'].apply(sta.formula_from_inchi)
final_atlases_compounds_add_to_db['monoisotopic_mass'] = final_atlases_compounds_add_to_db['neutralized_inchi'].apply(sta.monoisotopic_mass_from_inchi)
final_atlases_compounds_add_to_db.rename(columns={'label': 'compound_name'}, inplace=True)

In [30]:
print("\nNote! These compounds have different InchiKey and Nueutralized InchiKey:")
final_atlases_compounds_add_to_db[final_atlases_compounds_add_to_db['inchi_key'] != final_atlases_compounds_add_to_db['neutralized_inchi_key']]


Note! These compounds have different InchiKey and Nueutralized InchiKey:


Unnamed: 0,compound_name,inchi,inchi_key,neutralized_inchi,neutralized_inchi_key,permanent_charge,formula,monoisotopic_mass


In [25]:
metatlas_store_input = sta.format_for_atlas_store(final_atlases_compounds_add_to_db)
metatlas_store_input

[{'chebi_id': '',
  'chebi_url': '',
  'creation_time': '2025-04-08T15:14:04',
  'description': '',
  'formula': 'C16H20O9',
  'head_id': '7a58c33d81ff413f9b7668eaa4335127',
  'hmdb_id': '',
  'hmdb_url': '',
  'img_abc_id': '',
  'inchi': 'InChI=1S/C16H20O9/c1-23-10-6-8(2-4-9(10)18)3-5-12(19)25-16-15(22)14(21)13(20)11(7-17)24-16/h2-6,11,13-18,20-22H,7H2,1H3/b5-3+/t11-,13-,14+,15-,16+/m1/s1',
  'inchi_key': 'JWRQVQWBNRGGPK-PMQCXRHVSA-N',
  'iupac_name': '',
  'kegg_id': '',
  'kegg_url': '',
  'last_modified': '2025-04-08T15:14:04',
  'lipidmaps_id': '',
  'lipidmaps_url': '',
  'metacyc_id': '',
  'mono_isotopic_molecular_weight': 356.1107322199999,
  'name': '1-O-feruloyl-glucose',
  'neutralized_2d_inchi': '',
  'neutralized_2d_inchi_key': '',
  'neutralized_inchi': 'InChI=1S/C16H20O9/c1-23-10-6-8(2-4-9(10)18)3-5-12(19)25-16-15(22)14(21)13(20)11(7-17)24-16/h2-6,11,13-18,20-22H,7H2,1H3/b5-3+/t11-,13-,14+,15-,16+/m1/s1',
  'neutralized_inchi_key': 'JWRQVQWBNRGGPK-PMQCXRHVSA-N',
  'num

In [25]:
# metob.store(metatlas_store_input)

# Check if selected compounds/adducts are in the atlases

In [26]:
all_atlases = glob.glob('/global/homes/b/bkieft/metatlas-data/*/*.tsv')
atlas_dfs = []

for df_path in all_atlases:
    df = pd.read_csv(df_path, sep='\t')
    df['source_file'] = os.path.basename(df_path)  # Add the file name as a new column
    atlas_dfs.append(df)

atlas_df = pd.concat(atlas_dfs)

In [27]:
matches_to_atlases, nonmatches_to_atlases = sta.find_atlas_matches(atlas_df, final_atlases)

No match found for 1-O-feruloyl-glucose in any atlas field.
No match found for 1-O-feruloyl-glucose in any atlas field.
No match found for 1-O-feruloyl-glucose in any atlas field.


In [28]:
nonmatches_to_atlases

Unnamed: 0,new_label,attempted_matching_values,atlas_source_files
0,1-O-feruloyl-glucose,"[JWRQVQWBNRGGPK-PMQCXRHVSA-N, InChI=1S/C16H20O9/c1-23-10-6-8(2-4-9(10)18)3-5-12(19)25-16-15(22)14(21)13(20)11(7-17)24-16/h2-6,11,13-18,20-22H,7H2,1H3/b5-3+/t11-,13-,14+,15-,16+/m1/s1, 1-O-feruloyl-glucose, 1-O-feruloyl-glucose]",


# Store selected compound/adduct in metatlas atlases

In [None]:
# TO DO (maybe by hand?)
# RT correction here first

# Check if selected compounds/adducts are in MSMS refs

In [29]:
msms_refs_path = '/global/cfs/cdirs/metatlas/projects/spectral_libraries/20240430_istdv7-addition_msms_refs.tab'
msms_refs = pd.read_csv(msms_refs_path, sep='\t', index_col=0)
msms_refs_compounds = msms_refs[['name', 'inchi', 'inchi_key']].drop_duplicates()
msms_refs_compounds.shape


Columns (2,12) have mixed types. Specify dtype option on import or set low_memory=False.



(15661, 3)

In [30]:
in_msms_refs, notin_msms_refs, flat_in_msms_refs = sta.search_for_matches_in_msms_refs(final_atlases_compounds, msms_refs)
print("\nThese compounds are not in the MSMS refs table:")
pd.DataFrame(list(notin_msms_refs.items()), columns=['Label', 'InChIKey'])

0it [00:00, ?it/s]

Searching MSMS refs for 1-O-feruloyl-glucose (JWRQVQWBNRGGPK-PMQCXRHVSA-N)
1-O-feruloyl-glucose (JWRQVQWBNRGGPK-PMQCXRHVSA-N) not found in MSMS refs. Trying flat inchi key (JWRQVQWBNRGGPK-%-N)
1-O-feruloyl-glucose (JWRQVQWBNRGGPK-PMQCXRHVSA-N) not found in MSMS refs.

These compounds are not in the MSMS refs table:


Unnamed: 0,Label,InChIKey
0,1-O-feruloyl-glucose,JWRQVQWBNRGGPK-PMQCXRHVSA-N


# Store selected compound/adduct in MSMS refs

In [31]:
# Make sure RTs agree
final_atlases_slim = final_atlases[['label', 'adduct', 'rt_peak', 'mz_theoretical', 'inchi', 'inchi_key']].reset_index(drop=True)
final_atlases_slim['label_adduct'] = final_atlases_slim.apply(lambda row: f"{row.label}_{row.adduct}", axis=1)

combined_top_spectra = pd.merge(top_spectra_annotations_subset.rename(columns={'label': 'label_adduct'}), \
                                final_atlases_slim[['label_adduct', 'rt_peak', 'mz_theoretical', 'inchi', 'inchi_key']], on='label_adduct')
combined_top_spectra['label'] = combined_top_spectra['label_adduct'].apply(lambda x: x.split('_')[0])
combined_top_spectra['rt_diff'] = combined_top_spectra['rt'] - combined_top_spectra['rt_peak']
combined_top_spectra = combined_top_spectra[combined_top_spectra['rt_diff'] < 0.3]

In [32]:
# Subset by compounds that need to be added
combined_top_spectra_subset = combined_top_spectra[combined_top_spectra['inchi_key'].isin(list(notin_msms_refs.values()))]

In [33]:
# Add all required columns for MSMS refs
combined_top_spectra_add_to_msms_refs = combined_top_spectra_subset.copy()
combined_top_spectra_add_to_msms_refs['ce_type'] = 'ramped'
combined_top_spectra_add_to_msms_refs['ce'] = combined_top_spectra_add_to_msms_refs['lcmsrun'].apply(sta.get_collision_energy)
combined_top_spectra_add_to_msms_refs['file'] = combined_top_spectra_add_to_msms_refs['lcmsrun'].apply(os.path.basename)
combined_top_spectra_add_to_msms_refs.rename(columns={'mz_theoretical': 'mz'}, inplace=True)
combined_top_spectra_add_to_msms_refs = sta.enrich_metadata(combined_top_spectra_add_to_msms_refs)
combined_top_spectra_add_to_msms_refs['spectrum'] = combined_top_spectra_add_to_msms_refs['spectrum'].apply(sta.make_text_spectrum)
combined_top_spectra_add_to_msms_refs = combined_top_spectra_add_to_msms_refs[msms_refs.columns.intersection(combined_top_spectra_add_to_msms_refs.columns)]

In [34]:
# Combine existing and new MSMS refs
new_msms_refs = pd.concat([msms_refs, combined_top_spectra_add_to_msms_refs])

In [35]:
print(f"Existing MSMS refs: {msms_refs.shape}")
print(f"New MSMS refs: {new_msms_refs.shape}")

Existing MSMS refs: (216409, 17)
New MSMS refs: (216417, 17)


In [None]:
# Export new MSMS refs file