# Example Notebook For Associating Experimental Signals With The Carbon Network
Prior to running this notebook, the CarbonNetwork GraphML file must be present in the build folder.

In [1]:
import pandas as pd
import numpy as np
import glob as glob
import matplotlib.pyplot as plt
import networkx as nx
import os
import glob

from typing import List, Tuple

from tqdm.notebook import tqdm

# replace with submodules
import sys
sys.path.insert(0,'/global/homes/b/bpb/repos/metatlas')
from metatlas.io import feature_tools as ft
sys.path.insert(0,'/global/homes/b/bpb/repos/blink')
import blink

import analysis_tools as at

INFO:rdkit:Enabling RDKit 2023.09.1 jupyter extensions


## Parameters
The next code block sets parameters that are used throughout the remainder of the notebook.

In [12]:
# ---- Input data ----
# Experiment directories. Each one must follow Northen Lab file naming
# conventions and hold files that were already converted to hdf5 format.
# A single-directory alternative is kept here for reference:
# exp_dir  = '/global/cfs/cdirs/metatlas/raw_data/egsb/20240125_EB_MdR_101544-059_WAVESTAB3_20231222_EXP120A_C18-EP_USDAY72349'
exp_dir = [
    '/global/cfs/cdirs/metatlas/raw_data/egsb/20231113_EB_SMK_107002-011_CenturyExp_20230414_EXP120A_C18-EP_USDAY72349',
    '/global/cfs/cdirs/metatlas/raw_data/jgi/20240112_JGI_MdR_109570-002_OMTSoil50g_Pilot_QEHF_C18_USDAY86082_CORRECTED',
]

# ---- MS1 feature finding ----
# tolerance in ppm between an experimental signal and a node mz
mz_ppm_tolerance = 5

# retention time range in minutes for feature finding
rt_range = [1, 700]

# Both values below are handed to at.get_sample_ms1_data.
# NOTE(review): presumably the minimum peak height and the minimum number of
# data points a feature needs to be kept -- confirm in analysis_tools.
peak_height_min = 1e4
num_datapoints_min = 10

# ---- MS/MS matching ----
# minimum MSMS score
msms_score_min = 0.5

# minimum MSMS matching ion count
msms_matches_min = 3

# tolerance in daltons used for calculating MS/MS similarity scores
frag_mz_tolerance = 0.05

In [11]:
# Development convenience: reload the analysis_tools module (imported as `at`)
# so that edits made to it after the first import take effect without
# restarting the kernel. Not needed on a fresh Restart & Run All.
from importlib import reload
at = reload(at)

In [26]:
# Build the list of sample files to search.
# The metadata table has one row per run; its 'h5' column holds the file path.
df = pd.read_csv('/global/cfs/cdirs/metatlas/projects/carbon_network/public_and_internal_files_with_massive_and_redu.tsv', sep='\t')

# Keep plant samples only and drop QC and blank runs (case-insensitive match
# on the 'buddy' column). na=False makes a missing 'buddy' value count as
# "no match"; without it str.contains yields NaN for that row and the `~`
# negation raises instead of keeping the row.
is_plant = df['SampleType']=='plant'
is_qc = df['buddy'].str.contains('qc',case=False,na=False)
is_blank = df['buddy'].str.contains('blank',case=False,na=False)
df = df[is_plant & ~is_qc & ~is_blank]

out_dir = '/global/cfs/cdirs/metatlas/projects/carbon_network/raw_data'
temp_files = df['h5'].tolist()
files = []
for h5_path in temp_files:
    # Re-root the listed path under out_dir. os.path.join drops out_dir when
    # the listed directory is absolute, so absolute paths pass through as-is.
    rooted_dir = os.path.join(out_dir,os.path.dirname(h5_path))
    rooted_path = os.path.join(rooted_dir,os.path.basename(h5_path))
    # Missing files are reported but still kept in the list.
    files.append(rooted_path)
    if not os.path.isfile(rooted_path):
        print('File Not Found!')
        print(rooted_path)
        print(h5_path)

# One-column table of the same paths, alongside the plain list in `files`.
files_data = pd.DataFrame(files,columns=['filename'])


In [27]:
# collect and merge required data and metadata
node_data = at.graph_to_df()
node_atlas = at.make_node_atlas(node_data, rt_range)
merged_node_data = at.merge_spectral_data(node_data)
# files_data = at.get_files_df(exp_dir)
# files = files_data['filename'].tolist()

INFO:root:Processing original_spectra.mgf
INFO:root:Processing nl_spectra.mgf


In [28]:
# sanity check: number of sample files that will be searched
len(files)

595

In [29]:
# get ms1 and ms2 data
ms1_data = at.get_sample_ms1_data(node_atlas, files, mz_ppm_tolerance,peak_height_min,num_datapoints_min)
max_ms1_data = at.get_best_ms1_rawdata(ms1_data,node_data)
ms2_data = at.get_sample_ms2_data(files,merged_node_data,msms_score_min,msms_matches_min,mz_ppm_tolerance,frag_mz_tolerance)
max_ms2_data = at.get_best_ms2_rawdata(ms2_data)
best_hits = at.get_best_ms1_ms2_combined(max_ms1_data,max_ms2_data)



  0%|          | 0/595 [00:00<?, ?file/s]

  0%|          | 0/595 [00:00<?, ?file/s]

In [30]:
# Keep only the hits whose peak area is above 1e4 ...
best_hits = best_hits.loc[best_hits['peak_area'].gt(1e4)]
# ... and display the ones that also carry an MS/MS score.
best_hits.loc[best_hits['ms2_score'].notna()]

Unnamed: 0,node_id,num_datapoints,peak_area,peak_height,mz_centroid,rt_peak,lcmsrun_observed,precursor_mz,ppm_error,ms2_node_id,ms2_score,ms2_matches,ms2_lcmsrun_observed
0,2.0,774.0,2.330674e+08,12052493.0,181.013371,4.976950,/global/cfs/cdirs/metatlas/projects/carbon_net...,181.013732,1.997677,2.0,1.719608,68.0,/pscratch/sd/b/bpb/massive/v01/MSV000083611/cc...
1,52.0,4528.0,4.121406e+08,16565982.0,151.038965,1.452227,/global/cfs/cdirs/metatlas/projects/carbon_net...,151.039206,1.593085,52.0,1.695509,43.0,/pscratch/sd/b/bpb/massive/v01/MSV000083105/cc...
2,38.0,483.0,1.309855e+08,33093812.0,165.055191,6.553267,/pscratch/sd/b/bpb/massive/v01/MSV000087711/cc...,165.055723,3.220089,38.0,1.442964,4.0,/pscratch/sd/b/bpb/massive/v01/MSV000083611/cc...
3,24.0,57.0,4.009724e+08,29880608.0,164.070859,3.377556,/global/cfs/cdirs/metatlas/projects/carbon_net...,164.07164,4.759908,24.0,1.284736,29.0,/pscratch/sd/b/bpb/massive/v01/MSV000083611/cc...
4,14.0,332.0,4.350765e+09,29789866.0,131.034808,33.677227,/pscratch/sd/b/bpb/massive/v01/MSV000087123/cc...,131.034921,0.860027,14.0,1.234991,37.0,/pscratch/sd/b/bpb/massive/v01/MSV000083611/cc...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,5.0,261.0,1.579011e+09,233736640.0,179.055034,1.264681,/global/cfs/cdirs/metatlas/projects/carbon_net...,179.054997,-0.206631,5.0,0.097339,20.0,/pscratch/sd/b/bpb/massive/v01/MSV000083611/cc...
98,581.0,250.0,5.888474e+08,122628064.0,307.191846,11.521867,/pscratch/sd/b/bpb/massive/v01/MSV000087711/cc...,307.191838,-0.027910,581.0,0.096935,25.0,/global/cfs/cdirs/metatlas/projects/carbon_net...
99,379.0,908.0,4.888944e+08,124879600.0,161.044710,1.308183,/global/cfs/cdirs/metatlas/projects/carbon_net...,161.045328,3.831576,379.0,0.091787,20.0,/pscratch/sd/b/bpb/massive/v01/MSV000083611/cc...
100,171.0,788.0,6.951336e+09,421823520.0,295.227617,1.337902,/global/cfs/cdirs/metatlas/projects/carbon_net...,295.227717,0.339288,171.0,0.087629,27.0,/pscratch/sd/b/bpb/massive/v01/MSV000081804/cc...


In [31]:
def make_output_df(node_data,best_hits,filename='output.csv'):
    """Left-join the best hits onto the node table and save the result as CSV.

    Both frames are indexed by their 'node_id' column before joining, so every
    node is kept and nodes without a hit get missing values in the hit columns.
    Hit columns whose name also exists in the node table receive the suffix
    '_best_hit'. Neither input frame is modified.

    Parameters
    ----------
    node_data : pandas.DataFrame
        Node table; must contain a 'node_id' column.
    best_hits : pandas.DataFrame
        Hit table; must contain a 'node_id' column.
    filename : str
        Path of the CSV file to write.

    Returns
    -------
    pandas.DataFrame
        The joined table, indexed by 'node_id'.
    """
    hits_by_node = best_hits.set_index('node_id')
    nodes_by_node = node_data.set_index('node_id')
    joined = nodes_by_node.join(hits_by_node,how='left',rsuffix='_best_hit')
    joined.to_csv(filename)
    return joined
# Join the best hits onto the node table, write BestHits_PlantData.csv, and display the joined table.
make_output_df(node_data,best_hits,filename='BestHits_PlantData.csv')

Unnamed: 0_level_0,original_index,massive_id,no_extension_basename,title,description,precursor_mz,isolated_precursor_mz,rt,coisolated_precursor_count,predicted_formula,...,peak_height,mz_centroid,rt_peak,lcmsrun_observed,precursor_mz_best_hit,ppm_error,ms2_node_id,ms2_score,ms2_matches,ms2_lcmsrun_observed
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
847.0,847,MSV000090678,HBH0622_38_NEG,GNPS - HBH0622 Dissolved Organic Matter,DOM extraction in oceanic samples was carried ...,230.139362,230.139511,2.277,1.0,C11H21NO4,...,1.504576e+06,230.139703,5.897850,/global/cfs/cdirs/metatlas/projects/carbon_net...,230.139362,-1.484910,,,,
0.0,0,MSV000092487,S56_neg_2,GNPS - Neckar and Spree River Water DOM,Non-target metabolomics of River DOM samples i...,200.128534,200.128876,6.159,1.0,C10H19NO3,...,2.849381e+06,200.128481,2.045592,/pscratch/sd/b/bpb/massive/v01/MSV000081804/cc...,200.128534,0.268909,0.0,0.380986,27.0,/global/cfs/cdirs/metatlas/projects/carbon_net...
4406.0,4406,MSV000088823,DOM_Interlab-LCMS_Lab024_M_NEG_MS2_rep2,GNPS DOM LC-MS/MS Interlab Comparison 2020 COM...,Interlab Study of LC-MS/MS analyis of Marine D...,230.148272,230.148209,16.316,1.0,C6H17N9O,...,1.161113e+06,230.147497,10.781616,/global/cfs/cdirs/metatlas/projects/carbon_net...,230.148272,3.370386,,,,
141.0,141,MSV000088823,DOM_Interlab-LCMS_Lab16_A_Neg_MS2_rep2,GNPS DOM LC-MS/MS Interlab Comparison 2020 COM...,Interlab Study of LC-MS/MS analyis of Marine D...,188.056616,188.05661,1.704,1.0,C7H11NO5,...,8.389550e+07,188.055910,1.305850,/pscratch/sd/b/bpb/massive/v01/MSV000087711/cc...,188.056616,3.752591,,,,
1.0,1,MSV000087608,DOM_Interlab-LCMS_Lab8_A_Neg_MS2_rep1,GNPS InterLab DOM Comparison Lab 8 - Raw and m...,InterLab DOM Comparison Lab 8 - Raw and mzML f...,188.05545,188.055817,1.589,2.0,C7H11NO5,...,8.389550e+07,188.055908,1.305850,/pscratch/sd/b/bpb/massive/v01/MSV000087711/cc...,188.05545,-2.434662,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8761.0,8761,MSV000092114,Chabo_3_neg,GNPS - Chabo_Botswana_Waterholes_DissolvedOrga...,Dissolved organic matter samples from waterhol...,271.056998,271.046326,1.176,1.0,C10H12N2O7,...,8.087297e+05,271.057955,1.305850,/pscratch/sd/b/bpb/massive/v01/MSV000087711/cc...,271.056998,-3.529806,,,,
6785.0,6785,MSV000088543,20181029_JJ_KZ_Switchgrass_Greenhouse_Rhizo2_Q...,Exometabolomics of Switchgrass rhizosphere,Project studies an impact of abiotic stressors...,111.043375,111.018425,2.535,1.0,C2H4N6,...,3.680207e+06,111.043651,1.312683,/global/cfs/cdirs/metatlas/projects/carbon_net...,111.043375,-2.485710,,,,
8817.0,8817,MSV000092487,S71_neg_2,GNPS - Neckar and Spree River Water DOM,Non-target metabolomics of River DOM samples i...,361.077478,361.238983,10.025,2.0,C14H18O11,...,8.333679e+05,361.078035,1.002789,/pscratch/sd/b/bpb/massive/v01/MSV000090671/cc...,361.077478,-1.542578,,,,
8921.0,8921,MSV000088823,DOM_Interlab-LCMS_Lab6_A45M_Neg_MS2_rep2,GNPS DOM LC-MS/MS Interlab Comparison 2020 COM...,Interlab Study of LC-MS/MS analyis of Marine D...,262.910774,263.076965,1.027,2.0,C2H4N2O7S3,...,9.007341e+06,262.911610,3.088507,/pscratch/sd/b/bpb/massive/v01/MSV000087123/cc...,262.910774,-3.182112,,,,


In [32]:
# Node-by-run matrix of peak areas: one row per node_id, one column per
# lcmsrun_observed, each cell the mean peak_area for that pair.
# Node/run pairs with no value are filled with 300.
# NOTE(review): the reason for using 300 as the fill value is not stated -- confirm.
d_sample = pd.pivot_table(
    ms1_data,
    values='peak_area',
    index='node_id',
    columns=['lcmsrun_observed'],
    aggfunc='mean',
    fill_value=300,
)
d_sample.to_csv('PeakArea_PlantData.csv')


In [3]:
# Re-load the peak-area matrix written by the pivot cell.
# NOTE(review): the CSV was written with node_id as the index and is read here
# without index_col, so node_id comes back as an ordinary column.
import pandas as pd
plant = pd.read_csv('PeakArea_PlantData.csv')