# Example Notebook For Associating Experimental Signals With The Carbon Network
Before running this notebook, make sure the CarbonNetwork GraphML file is present in the build folder.

In [1]:
import pandas as pd
import numpy as np
import glob as glob
import matplotlib.pyplot as plt
import networkx as nx
import os
import glob

from typing import List, Tuple

from tqdm.notebook import tqdm

# replace with submodules
import sys
sys.path.insert(0,'/global/homes/b/bpb/repos/metatlas')
from metatlas.io import feature_tools as ft
sys.path.insert(0,'/global/homes/b/bpb/repos/blink')
import blink

import analysis_tools as at

INFO:rdkit:Enabling RDKit 2023.09.1 jupyter extensions


## Parameters
The next code block sets parameters that are used throughout the remainder of the notebook.

In [2]:
# experiment directory
## note: must follow Northen Lab file naming conventions and be converted to hdf5 format
exp_dir = '/global/cfs/cdirs/metatlas/raw_data/egsb/20231113_EB_SMK_107002-011_CenturyExp_20230414_EXP120A_C18-EP_USDAY72349'

# tolerance in ppm between experimental signal and node mz
mz_ppm_tolerance = 4

# minimum MSMS score 
msms_score_min = 0.5

# minimum MSMS matching ion count
msms_matches_min = 3

# retention time range in minutes for feature finding
rt_range = [1, 7]

# tolerance in daltons used for calculating MS/MS similarity scores
frag_mz_tolerance = 0.05

In [3]:
from importlib import reload
at = reload(at)

In [4]:
# collect and merge required data and metadata
node_data = at.graph_to_df()
node_atlas = at.make_node_atlas(node_data, rt_range)
merged_node_data = at.merge_spectral_data(node_data)
files_data = at.get_files_df(exp_dir)

INFO:root:Processing original_spectra.mgf
INFO:root:Processing nl_spectra.mgf


In [7]:
# for testing purposes for faster runtime, remove later
files = files_data['filename'].tolist()

In [8]:
# get ms1 and ms2 data
ms1_data = at.get_sample_ms1_data(node_atlas, files, mz_ppm_tolerance)
ms2_data = at.get_sample_ms2_data(files)

  0%|          | 0/110 [00:00<?, ?file/s]

  0%|          | 0/110 [00:00<?, ?file/s]

In [9]:
ms2_data['nl_spectrum'] = ms2_data.apply(lambda x: np.array([x.mdm_mz_vals, x.mdm_i_vals]), axis=1)
ms2_data['original_spectrum'] = ms2_data.apply(lambda x: np.array([x.original_mz_vals, x.original_i_vals]), axis=1)

In [10]:
all_pmzs = np.array(ms2_data['precursor_mz'].tolist() + merged_node_data['precursor_mz'].tolist())
all_pmzs.sort()

all_pmz_groups = ft.group_consecutive(all_pmzs)

pmz_group_key = dict(zip(all_pmzs, all_pmz_groups))

In [11]:
ms2_data['group_idx'] = ms2_data['precursor_mz'].apply(lambda x: pmz_group_key[x])
merged_node_data['group_idx'] = merged_node_data['precursor_mz'].apply(lambda x: pmz_group_key[x])

In [12]:
ms2_and_node_data = pd.merge(ms2_data.rename(columns={'precursor_mz': 'data_calc_precursor_mz'}), 
                             merged_node_data[['group_idx', 'node_id', 'precursor_mz', 'spectrum_original_spectra', 'spectrum_nl_spectra']], on='group_idx')

In [13]:
nl_data_spectra = ms2_and_node_data['nl_spectrum'].tolist()
nl_ref_spectra = ms2_and_node_data['spectrum_nl_spectra'].tolist()

or_data_spectra = ms2_and_node_data['original_spectrum']
or_ref_spectra = ms2_and_node_data['spectrum_original_spectra'].tolist()

data_pmzs = ms2_and_node_data['data_calc_precursor_mz'].tolist()
ref_pmzs = ms2_and_node_data['precursor_mz'].tolist()

In [14]:
discretized_spectra = blink.discretize_spectra(nl_data_spectra, nl_ref_spectra, data_pmzs,  ref_pmzs,
                                         bin_width=0.001, tolerance=frag_mz_tolerance, intensity_power=0.5, trim_empty=False, remove_duplicates=False, network_score=False)

S12 = blink.score_sparse_spectra(discretized_spectra)
m = blink.reformat_score_matrix(S12)

In [15]:
ms2_and_node_data['nl_blink_score'] = np.diag(S12['mzi'].toarray())
ms2_and_node_data['nl_matches'] = np.diag(S12['mzc'].toarray())

In [16]:
discretized_spectra = blink.discretize_spectra(or_data_spectra, or_ref_spectra, data_pmzs,  ref_pmzs,
                                         bin_width=0.001, tolerance=frag_mz_tolerance, intensity_power=0.5, trim_empty=False, remove_duplicates=False, network_score=False)

S12 = blink.score_sparse_spectra(discretized_spectra)
m = blink.reformat_score_matrix(S12)

In [17]:
ms2_and_node_data['or_blink_score'] = np.diag(S12['mzi'].toarray())
ms2_and_node_data['or_matches'] = np.diag(S12['mzc'].toarray())

In [18]:
ms2_and_node_data

Unnamed: 0,data_calc_precursor_mz,isolated_precursor_mz,rt,filename,coisolated_precursor_count,mdm_mz_vals,mdm_i_vals,original_mz_vals,original_i_vals,nl_spectrum,original_spectrum,group_idx,node_id,precursor_mz,spectrum_original_spectra,spectrum_nl_spectra,nl_blink_score,nl_matches,or_blink_score,or_matches
0,113.021844,112.985382,6.632,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...,1,"[44.99545669555664, 68.9956283569336, 84.99076...","[1942.34521484375, 33053.69140625, 2305.519042...","[111.80378723144531, 112.98614501953125, 84.99...","[1368.4957275390625, 3050.147705078125, 2305.5...","[[44.99545669555664, 68.9956283569336, 84.9907...","[[111.80378723144531, 112.98614501953125, 84.9...",7,1517.0,113.022727,"[[157.06610107421875, 153.0558319091797, 141.1...","[[143.0503692626953, 157.06610107421875, 183.0...",0.0,0.0,0.023085,2.0
1,113.021863,112.985397,5.738,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...,1,"[44.99552536010742, 68.99565124511719, 84.9906...","[1931.0777587890625, 38245.7890625, 1924.64990...","[44.99552536010742, 44.99821472167969, 68.9956...","[1931.0777587890625, 176193.859375, 38245.7890...","[[44.99552536010742, 68.99565124511719, 84.990...","[[44.99552536010742, 44.99821472167969, 68.995...",7,1517.0,113.022727,"[[157.06610107421875, 153.0558319091797, 141.1...","[[143.0503692626953, 157.06610107421875, 183.0...",0.0,0.0,0.037695,4.0
2,113.021869,112.985428,5.455,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...,1,"[44.99555587768555, 68.99565124511719, 84.9908...","[1772.28173828125, 48465.1328125, 2092.3720703...","[73.93795776367188, 128.4359588623047, 112.986...","[1731.4483642578125, 1336.662353515625, 4704.3...","[[44.99555587768555, 68.99565124511719, 84.990...","[[73.93795776367188, 128.4359588623047, 112.98...",7,1517.0,113.022727,"[[157.06610107421875, 153.0558319091797, 141.1...","[[143.0503692626953, 157.06610107421875, 183.0...",0.0,0.0,0.026744,2.0
3,113.021877,112.985321,6.031,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...,1,"[44.99555587768555, 68.99565887451172, 84.9907...","[1860.5750732421875, 46281.18359375, 3151.8068...","[123.10609436035156, 119.09931945800781, 118.0...","[1295.0946044921875, 1282.032470703125, 1261.7...","[[44.99555587768555, 68.99565887451172, 84.990...","[[123.10609436035156, 119.09931945800781, 118....",7,1517.0,113.022727,"[[157.06610107421875, 153.0558319091797, 141.1...","[[143.0503692626953, 157.06610107421875, 183.0...",0.0,0.0,0.026116,2.0
4,113.021882,112.985382,5.419,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...,1,"[44.99552536010742, 68.99566650390625, 84.9906...","[1889.4718017578125, 49082.5703125, 2520.17529...","[84.99067687988281, 112.98587799072266, 90.070...","[2520.17529296875, 3798.049560546875, 1277.983...","[[44.99552536010742, 68.99566650390625, 84.990...","[[84.99067687988281, 112.98587799072266, 90.07...",7,1517.0,113.022727,"[[157.06610107421875, 153.0558319091797, 141.1...","[[143.0503692626953, 157.06610107421875, 183.0...",0.0,0.0,0.030867,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12972,283.060983,283.097168,5.215,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...,2,"[195.08056640625, 239.07131958007812, 253.0511...","[3801.342041015625, 87880.828125, 5422.9526367...","[238.06268310546875, 239.0405731201172, 239.07...","[21812.953125, 3764.56201171875, 87880.828125,...","[[195.08056640625, 239.07131958007812, 253.051...","[[238.06268310546875, 239.0405731201172, 239.0...",244,696.0,283.061211,"[[255.10015869140625, 254.98574829101562, 253....","[[253.19512939453125, 271.1741638183594, 271.2...",0.0,0.0,0.000000,0.0
12973,283.061009,283.097198,5.227,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...,2,"[195.0840301513672, 239.071044921875, 253.0515...","[1943.697509765625, 34182.5546875, 3643.992431...","[252.0426483154297, 283.1778564453125, 283.097...","[3250.552978515625, 2270.2880859375, 517480.81...","[[195.0840301513672, 239.071044921875, 253.051...","[[252.0426483154297, 283.1778564453125, 283.09...",244,696.0,283.061211,"[[258.0574645996094, 258.96197509765625, 277.2...","[[277.2528381347656, 295.2298583984375, 295.26...",0.0,0.0,0.057343,7.0
12974,283.061009,283.097198,5.227,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...,2,"[195.0840301513672, 239.071044921875, 253.0515...","[1943.697509765625, 34182.5546875, 3643.992431...","[252.0426483154297, 283.1778564453125, 283.097...","[3250.552978515625, 2270.2880859375, 517480.81...","[[195.0840301513672, 239.071044921875, 253.051...","[[252.0426483154297, 283.1778564453125, 283.09...",244,696.0,283.061211,"[[258.0574645996094, 258.96197509765625, 277.2...","[[253.19512939453125, 271.1741638183594, 271.2...",0.0,0.0,0.057343,7.0
12975,283.061009,283.097198,5.227,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...,2,"[195.0840301513672, 239.071044921875, 253.0515...","[1943.697509765625, 34182.5546875, 3643.992431...","[252.0426483154297, 283.1778564453125, 283.097...","[3250.552978515625, 2270.2880859375, 517480.81...","[[195.0840301513672, 239.071044921875, 253.051...","[[252.0426483154297, 283.1778564453125, 283.09...",244,696.0,283.061211,"[[255.10015869140625, 254.98574829101562, 253....","[[277.2528381347656, 295.2298583984375, 295.26...",0.0,0.0,0.000000,0.0


In [19]:
ms1_data

Unnamed: 0,label,num_datapoints,peak_area,peak_height,mz_centroid,rt_peak,lcmsrun_observed
0,0.0,29,1.509948e+06,188037.312500,200.128837,4.214752,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
1,1.0,1,5.524462e+03,5524.461914,0.688041,6.472521,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
2,10.0,933,1.544912e+07,104700.562500,1688.401652,2.692523,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
3,100.0,488,2.974037e+06,30031.361328,321.127814,2.995543,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
4,1000.0,7,3.975089e+04,12996.774414,10.246955,5.971230,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
...,...,...,...,...,...,...,...
4588,994.0,6,1.521624e+04,4296.339355,7.224095,2.663438,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
4589,995.0,170,1.580328e+06,32560.300781,1214.255339,5.190832,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
4590,996.0,297,1.209099e+06,23444.960938,573.922445,4.904231,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
4591,998.0,2,4.941802e+03,2841.666748,2.247607,2.451944,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...


In [20]:
ms1_data = ms1_data.astype({'label': 'string', 'lcmsrun_observed': 'string'})

In [21]:
max_ms1_data = ms1_data.sort_values('peak_height', ascending=False).drop_duplicates(subset='label').rename(columns={'label': 'node_id'})

In [22]:
# max_ms1_data = pd.merge(max_ms1_data.rename(columns={'label': 'node_id'}), node_data[['node_id', 'precursor_mz']], on='node_id')
# max_ms1_data['ppm_error'] = max_ms1_data.apply(lambda x: ((x.precursor_mz - x.mz_centroid) / x.precursor_mz) * 1000000, axis=1)

In [23]:
max_ms2_data = ms2_and_node_data.sort_values(['nl_blink_score', 'or_blink_score'], ascending=False
                                            ).drop_duplicates(subset='node_id'
                                            ).rename(columns={'filename': 'lcmsrun_observed'}
                                            )[['node_id', 'nl_blink_score', 'nl_matches', 'or_blink_score', 'nl_matches', 'or_matches', 'lcmsrun_observed']]

In [24]:
max_ms2_data.columns

Index(['node_id', 'nl_blink_score', 'nl_matches', 'or_blink_score',
       'nl_matches', 'or_matches', 'lcmsrun_observed'],
      dtype='object')

In [25]:
max_ms2_data['max_blink_score'] = max_ms2_data[['nl_blink_score', 'or_blink_score']].max(axis=1)
max_ms2_data['max_matches'] = max_ms2_data[['nl_matches', 'or_matches']].max(axis=1)

max_ms2_data = max_ms2_data[(max_ms2_data['max_blink_score']>=msms_score_min) & (max_ms2_data['max_matches']>=msms_matches_min)]

In [26]:
max_ms1_data

Unnamed: 0,node_id,num_datapoints,peak_area,peak_height,mz_centroid,rt_peak,lcmsrun_observed
227,1248.0,452,1.106914e+09,1.424765e+08,7.309474e+05,5.142134,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
4580,972.0,282,1.098036e+09,1.424765e+08,7.250847e+05,5.142134,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
2508,4547.0,642,1.747374e+09,1.155015e+08,3.417158e+06,3.459139,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
3020,5465.0,645,1.747380e+09,1.155015e+08,3.417170e+06,3.459139,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
2565,4638.0,641,1.747369e+09,1.155015e+08,3.417148e+06,3.459139,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
...,...,...,...,...,...,...,...
1613,3049.0,1,2.102819e+03,2.102819e+03,8.389580e-01,0.982235,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
3852,6849.0,1,1.826869e+03,1.826869e+03,4.314113e-01,1.750194,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
3114,5297.0,1,1.603164e+03,1.603164e+03,6.145049e-01,1.423981,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
1799,3547.0,1,1.548688e+03,1.548688e+03,2.230602e-01,1.451836,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...


In [27]:
max_ms2_data

Unnamed: 0,node_id,nl_blink_score,nl_matches,or_blink_score,nl_matches.1,or_matches,lcmsrun_observed,max_blink_score,max_matches
12876,2.0,0.871673,2.0,1.055182,2.0,23.0,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...,1.055182,23.0
12909,221.0,0.570196,3.0,0.109946,3.0,12.0,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...,0.570196,12.0
11695,32.0,0.444455,2.0,0.608405,2.0,11.0,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...,0.608405,11.0
11587,1265.0,0.0,0.0,0.504763,0.0,8.0,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...,0.504763,8.0


In [28]:
ms1_data

Unnamed: 0,label,num_datapoints,peak_area,peak_height,mz_centroid,rt_peak,lcmsrun_observed
0,0.0,29,1.509948e+06,188037.312500,200.128837,4.214752,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
1,1.0,1,5.524462e+03,5524.461914,0.688041,6.472521,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
2,10.0,933,1.544912e+07,104700.562500,1688.401652,2.692523,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
3,100.0,488,2.974037e+06,30031.361328,321.127814,2.995543,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
4,1000.0,7,3.975089e+04,12996.774414,10.246955,5.971230,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
...,...,...,...,...,...,...,...
4588,994.0,6,1.521624e+04,4296.339355,7.224095,2.663438,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
4589,995.0,170,1.580328e+06,32560.300781,1214.255339,5.190832,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
4590,996.0,297,1.209099e+06,23444.960938,573.922445,4.904231,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
4591,998.0,2,4.941802e+03,2841.666748,2.247607,2.451944,/global/cfs/cdirs/metatlas/raw_data/egsb/20231...
