# Example Notebook For Associating Experimental Signals With The Carbon Network
Before running this notebook, make sure the CarbonNetwork graphml file is present in the build folder.

In [1]:
import pandas as pd
import numpy as np
import glob as glob
import matplotlib.pyplot as plt
import networkx as nx
import os
import glob

from typing import List, Tuple

from tqdm.notebook import tqdm

# replace with submodules
# Location of the metatlas checkout that is put at the front of sys.path.
# It defaults to the original hard-coded location, but can be overridden by
# setting the METATLAS_REPO environment variable before starting the kernel,
# so the notebook is not tied to a single user's home directory.
import sys
metatlas_repo = os.environ.get('METATLAS_REPO', '/global/homes/b/bpb/repos/metatlas')
sys.path.insert(0, metatlas_repo)
from metatlas.io import feature_tools as ft


import analysis_tools as at

INFO:rdkit:Enabling RDKit 2023.09.1 jupyter extensions


## Parameters
The next code block sets parameters that are used throughout the remainder of the notebook.

In [2]:
# --- Input data ---------------------------------------------------------
# No experiment directory is assigned in this cell; the list of input files is
# assembled in a later cell. Original note: input files must follow Northen Lab
# file naming conventions and be converted to hdf5 format.

# --- MS1 feature finding ------------------------------------------------
# retention time window, in minutes, used for feature finding
rt_range = [1, 700]
# allowed difference, in ppm, between an experimental signal and a node m/z
mz_ppm_tolerance = 5
# presumably the lowest peak height and the fewest data points accepted for an
# MS1 feature -- TODO confirm against analysis_tools.get_sample_ms1_data
peak_height_min = 1e4
num_datapoints_min = 10

# --- MS/MS matching -----------------------------------------------------
# tolerance, in daltons, used when calculating MS/MS similarity scores
frag_mz_tolerance = 0.05
# lowest MS/MS score accepted
msms_score_min = 0.5
# lowest number of matching MS/MS ions accepted
msms_matches_min = 3

In [17]:
# Development helper: re-import the analysis_tools module (bound to `at`) so
# that edits made to it on disk take effect without restarting the kernel.
# `at` must already have been imported for this cell to run.
from importlib import reload
at = reload(at)
# Disabled scratch call left over from development.
# NOTE(review): it passes merged_node_data, whereas the live call to
# get_best_ms1_rawdata later in the notebook passes node_data -- confirm which
# argument the function expects.
# max_ms1_data = at.get_best_ms1_rawdata(ms1_data,merged_node_data)
# max_ms1_data

In [4]:
# collect and merge required data and metadata
# node_data: table returned by at.graph_to_df(); presumably one row per
# CarbonNetwork node -- TODO confirm in analysis_tools. It carries a
# 'no_extension_basename' column that is used below to choose input files.
node_data = at.graph_to_df()
# node_atlas: built from node_data and the rt_range parameter; it is the first
# argument handed to at.get_sample_ms1_data further down.
node_atlas = at.make_node_atlas(node_data, rt_range)
# merged_node_data: passed to at.get_sample_ms2_data further down; presumably
# node_data joined with the spectra from the .mgf files named in this cell's
# log output -- TODO confirm.
merged_node_data = at.merge_spectral_data(node_data)


INFO:root:Processing original_spectra.mgf
INFO:root:Processing nl_spectra.mgf


In [5]:
# Pick the three basenames that occur most often in node_data.
basename_counts = node_data['no_extension_basename'].value_counts()
top_basenames = basename_counts.head(3).index.tolist()

# Table of known files; each row provides (at least) the basename, a
# num_unique_spectra value and the path to the converted h5 file.
all_files = pd.read_csv('/global/cfs/cdirs/metatlas/projects/carbon_network/public_and_internal_files_with_massive_and_redu.tsv', sep='\t')

# For every selected basename keep only the row with the largest
# num_unique_spectra, then collect the h5 paths in that order.
is_top_basename = all_files['no_extension_basename'].isin(top_basenames)
best_row_per_basename = (
    all_files.loc[is_top_basename]
    .sort_values('num_unique_spectra', ascending=False)
    .drop_duplicates('no_extension_basename')
)
files = best_row_per_basename['h5'].tolist()

files

['/pscratch/sd/b/bpb/massive/v01/MSV000089061/ccms_peak/raw_neg/Soil_F5C_neg.h5',
 '/pscratch/sd/b/bpb/massive/z01/MSV000088008/ccms_peak/RAW/NEG_MSMS_raw/DOM_Interlab-LCMS_Lab024_M_NEG_MS2_rep2.h5',
 '/pscratch/sd/b/bpb/massive/z01/MSV000088008/ccms_peak/RAW/NEG_MSMS_raw/DOM_Interlab-LCMS_Lab024_A5M_NEG_MS2_rep3.h5']

In [8]:
# --- MS1 ---
# Extract MS1 signals for the atlas entries from each selected file, filtered
# by the m/z tolerance, peak-height and data-point thresholds set in the
# parameters cell, then reduce them with get_best_ms1_rawdata.
ms1_data = at.get_sample_ms1_data(
    node_atlas,
    files,
    mz_ppm_tolerance,
    peak_height_min,
    num_datapoints_min,
)
max_ms1_data = at.get_best_ms1_rawdata(ms1_data, node_data)

# --- MS2 ---
# Score MS/MS spectra from the same files against the merged node data, using
# the score, matching-ion and tolerance settings from the parameters cell,
# then reduce them with get_best_ms2_rawdata.
ms2_data = at.get_sample_ms2_data(
    files,
    merged_node_data,
    msms_score_min,
    msms_matches_min,
    mz_ppm_tolerance,
    frag_mz_tolerance,
)
max_ms2_data = at.get_best_ms2_rawdata(ms2_data)

# --- Combined ---
# Join the MS1 and MS2 results into a single table of hits.
best_hits = at.get_best_ms1_ms2_combined(max_ms1_data, max_ms2_data)



  0%|          | 0/3 [00:00<?, ?file/s]

In [19]:
# Display the combined hit table returned by at.get_best_ms1_ms2_combined.
best_hits


Unnamed: 0,node_id,num_datapoints,peak_area,peak_height,mz_centroid,rt_peak,lcmsrun_observed,precursor_mz,ppm_error,ms2_node_id,ms2_score,ms2_matches,ms2_lcmsrun_observed
0,1448.0,445.0,1.737178e+08,4.277070e+06,255.160508,17.243446,/pscratch/sd/b/bpb/massive/z01/MSV000088008/cc...,255.15982,-2.693957,1448.0,1.500329,120.0,/pscratch/sd/b/bpb/massive/v01/MSV000089061/cc...
1,64.0,817.0,9.289505e+07,3.542237e+06,177.055032,3.239237,/pscratch/sd/b/bpb/massive/v01/MSV000089061/cc...,177.055106,0.423123,64.0,1.453142,87.0,/pscratch/sd/b/bpb/massive/z01/MSV000088008/cc...
2,43.0,593.0,1.919725e+08,2.533967e+07,277.144841,20.541883,/pscratch/sd/b/bpb/massive/z01/MSV000088008/cc...,277.144094,-2.695288,43.0,1.419878,185.0,/pscratch/sd/b/bpb/massive/v01/MSV000089061/cc...
3,38.0,991.0,1.728318e+08,6.289111e+06,165.056041,14.989144,/pscratch/sd/b/bpb/massive/z01/MSV000088008/cc...,165.055723,-1.929570,38.0,1.222284,4.0,/pscratch/sd/b/bpb/massive/z01/MSV000088008/cc...
4,9.0,1981.0,7.480138e+08,1.689511e+07,165.019687,9.229259,/pscratch/sd/b/bpb/massive/z01/MSV000088008/cc...,165.019346,-2.068167,9.0,1.018468,5.0,/pscratch/sd/b/bpb/massive/z01/MSV000088008/cc...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5805,3872.0,152.0,8.280619e+05,1.235276e+04,300.030083,15.107903,/pscratch/sd/b/bpb/massive/v01/MSV000089061/cc...,300.029793,-0.966513,,,,
5806,4434.0,111.0,5.213050e+05,1.234622e+04,290.932206,15.186124,/pscratch/sd/b/bpb/massive/v01/MSV000089061/cc...,290.933515,4.499181,,,,
5807,4449.0,130.0,4.921063e+05,1.226270e+04,166.026488,1.384729,/pscratch/sd/b/bpb/massive/v01/MSV000089061/cc...,166.026219,-1.620694,,,,
5808,1878.0,41.0,8.807930e+04,1.106410e+04,210.004099,2.321516,/pscratch/sd/b/bpb/massive/v01/MSV000089061/cc...,210.003553,-2.600824,,,,


In [None]:
# Display the table returned by at.get_best_ms1_rawdata for inspection.
max_ms1_data