# Example Notebook For Associating Experimental Signals With The Carbon Network
Prior to running this notebook, the CarbonNetwork GraphML file must be present in the build folder.

In [None]:
import pandas as pd
import numpy as np
import glob as glob
import matplotlib.pyplot as plt
import networkx as nx
import os
import glob

from typing import List, Tuple

from tqdm.notebook import tqdm

# replace with submodules
# import sys
# sys.path.insert(0,'/global/homes/b/bpb/repos/metatlas')
# from metatlas.io import feature_tools as ft
# sys.path.insert(0,'/global/homes/b/bpb/repos/blink')
# import blink

import analysis_tools as at

## Parameters
The next code block sets parameters that are used throughout the remainder of the notebook.

In [None]:
# experiment directories (a list; every directory in it is searched for sample files)
## note: must follow Northen Lab file naming conventions and be converted to hdf5 format
exp_dir  = ['/global/cfs/cdirs/metatlas/raw_data/egsb/20240409_EB_NB_107915-001_PRISM-RtExu_combined-rep1-5_EXP120A_C18-EP_USDAY72349']
# exp_dir = ['/global/cfs/cdirs/metatlas/raw_data/egsb/20231113_EB_SMK_107002-011_CenturyExp_20230414_EXP120A_C18-EP_USDAY72349',
        #    '/global/cfs/cdirs/metatlas/raw_data/jgi/20240112_JGI_MdR_109570-002_OMTSoil50g_Pilot_QEHF_C18_USDAY86082_CORRECTED']


def find_sample_files(directories, pattern='*NEG*.h5', exclude=('qc', 'blank')):
    """Return the sorted sample files found across all of ``directories``.

    Parameters
    ----------
    directories : iterable of str
        Directories to search. Every entry is searched, not only the first.
    pattern : str
        Glob pattern applied inside each directory.
    exclude : iterable of str
        Lower-case substrings; a path is dropped when its lower-cased form
        contains any of them.

    Returns
    -------
    list of str
        Matching paths in sorted order, so the result does not depend on the
        order in which the filesystem lists the files. An empty
        ``directories`` yields an empty list.
    """
    matches = []
    for directory in directories:
        matches.extend(glob.glob(os.path.join(directory, pattern)))
    return sorted(path for path in matches
                  if not any(tag in path.lower() for tag in exclude))


# sample files: '*NEG*.h5' files from the experiment directories, without qc and blank runs
files = find_sample_files(exp_dir)
print(len(files))
# tolerance in ppm between experimental signal and node mz
mz_ppm_tolerance = 5
# minimum peak height; passed to at.get_sample_ms1_data
peak_height_min = 1e4
# minimum number of datapoints; passed to at.get_sample_ms1_data
num_datapoints_min = 10
# minimum MSMS score 
msms_score_min = 0.5

# minimum MSMS matching ion count
msms_matches_min = 3

# retention time range in minutes for feature finding
rt_range = [1, 700]

# tolerance in daltons used for calculating MS/MS similarity scores
frag_mz_tolerance = 0.05

In [None]:
# Re-import the analysis_tools module (bound to `at` in the imports cell) so that
# edits made to it on disk are picked up without restarting the kernel.
from importlib import reload
at = reload(at)

In [None]:
# df = pd.read_csv('/global/cfs/cdirs/metatlas/projects/carbon_network/public_and_internal_files_with_massive_and_redu.tsv', sep='\t')
# df = df[df['SampleType']=='plant']
# df = df[~df['buddy'].str.contains('qc',case=False)]
# df = df[~df['buddy'].str.contains('blank',case=False)]
# out_dir = '/global/cfs/cdirs/metatlas/projects/carbon_network/raw_data'
# temp_files = df['h5'].tolist()
# files = []
# for f in temp_files:
#     base_dir = os.path.dirname(f)
#     base_name = os.path.basename(f)
#     new_dir = os.path.join(out_dir,base_dir)
#     new_name = os.path.join(new_dir,base_name)
#     files.append(new_name)
#     if not os.path.isfile(new_name):
#         print('File Not Found!')
#         print(new_name)
#         print(f)
# files_data = pd.DataFrame(files,columns=['filename'])
# files = files_data['filename'].tolist()


In [None]:
# collect and merge required data and metadata
node_data = at.graph_to_df()
node_atlas = at.make_node_atlas(node_data, rt_range)
merged_node_data = at.merge_spectral_data(node_data)
# files_data = at.get_files_df(exp_dir)
# files = files_data['filename'].tolist()

In [None]:
# sanity check: number of sample files selected in the parameters cell
len(files)

In [None]:
# get ms1 and ms2 data
ms1_data = at.get_sample_ms1_data(node_atlas, files, mz_ppm_tolerance,peak_height_min,num_datapoints_min)
max_ms1_data = at.get_best_ms1_rawdata(ms1_data,node_data)
ms2_data = at.get_sample_ms2_data(files,merged_node_data,msms_score_min,msms_matches_min,mz_ppm_tolerance,frag_mz_tolerance)
max_ms2_data = at.get_best_ms2_rawdata(ms2_data)
best_hits = at.get_best_ms1_ms2_combined(max_ms1_data,max_ms2_data)



In [None]:
# ms2_data = [pd.concat(m) for m in ms2_data if m is not None]
# Combine the collection of MS2 tables returned by at.get_sample_ms2_data into a
# single DataFrame. The isinstance guard keeps this cell safe to re-run: once
# ms2_data is already a DataFrame, pd.concat(ms2_data) would raise a TypeError.
if not isinstance(ms2_data, pd.DataFrame):
    ms2_data = pd.concat(ms2_data)
ms2_data

In [None]:
# Recompute the best MS2 entries and the combined MS1/MS2 best hits, now using the
# ms2_data produced by the pd.concat cell above.
max_ms2_data = at.get_best_ms2_rawdata(ms2_data)
best_hits = at.get_best_ms1_ms2_combined(max_ms1_data,max_ms2_data)

In [None]:
# best_hits = best_hits[best_hits['peak_area']>1e4]
# best_hits[pd.notna(best_hits['ms2_score'])]

In [None]:
def make_output_df(node_data,best_hits,filename='output.csv'):
    """Left-join the best hits onto the node table and save the result as CSV.

    Both frames are keyed on their 'node_id' column. Every row of ``node_data``
    is kept; nodes without a best hit get missing values in the hit columns.
    Columns of ``best_hits`` whose names collide with ``node_data`` columns
    receive the suffix '_best_hit'. Neither input frame is modified.

    Parameters
    ----------
    node_data : DataFrame with a 'node_id' column
    best_hits : DataFrame with a 'node_id' column
    filename : path of the CSV file to write (the 'node_id' index is included)

    Returns
    -------
    The joined DataFrame, indexed by 'node_id'.
    """
    nodes_by_id = node_data.set_index('node_id')
    hits_by_id = best_hits.set_index('node_id')
    output = nodes_by_id.join(hits_by_id, how='left', rsuffix='_best_hit')
    output.to_csv(filename)
    return output
# Write the node table joined with best_hits to 'BestHits_ExudateData.csv'; as the
# last expression of the cell, the returned DataFrame is also displayed.
make_output_df(node_data,best_hits,filename='BestHits_ExudateData.csv')

In [None]:
# Add a 'plant' column to the MS2 table: field 12 (zero-based) of the
# underscore-separated basename of each run file.
# NOTE(review): assumes every file name has at least 13 '_'-separated fields.
ms2_run_names = ms2_data['lcmsrun_observed'].apply(os.path.basename)
ms2_data['plant'] = ms2_run_names.apply(lambda name: name.split('_')[12])

In [None]:
# Highest MS2 score per node (rows) and plant (columns); missing combinations become 0.
best_score_by_plant = ms2_data.pivot_table(
    values='score',
    index='node_id',
    columns=['plant'],
    aggfunc='max',
    fill_value=0,
)
# True where the best score exceeds 0.6.
# NOTE(review): 0.6 differs from msms_score_min (0.5) in the parameters cell - confirm this is intended.
d_sample = best_score_by_plant > 0.6
# Number of nodes above the cutoff for each plant, 20 largest counts first.
d_sample.sum().sort_values(ascending=False).head(20)

In [None]:
# NOTE(review): this rebinds `files` (a list built in the parameters cell) to the
# result of .unique(); cells that use `files` behave differently if re-run after this one.
ms1_run_paths = ms1_data['lcmsrun_observed']
files = ms1_run_paths.unique()
# Add a 'plant' column to the MS1 table: field 12 (zero-based) of the
# underscore-separated basename of each run file.
ms1_data['plant'] = ms1_run_paths.apply(os.path.basename).apply(lambda name: name.split('_')[12])


In [None]:
# Mean peak area per node (rows) and plant (columns); combinations with no
# measurement are filled with 300.
mean_area_by_plant = ms1_data.pivot_table(
    values='peak_area',
    index='node_id',
    columns=['plant'],
    aggfunc='mean',
    fill_value=300,
)
mean_area_by_plant.to_csv('PeakArea_ExudateData-groupedbyplant.csv')
# True where the mean peak area exceeds 1e7.
d_sample = mean_area_by_plant > 1e7


In [None]:
# Column-wise sum of d_sample, sorted in descending order, top 20 shown. With the
# boolean table from the previous cell this is the number of nodes above the cutoff per plant.
d_sample.sum().sort_values(ascending=False).head(20)

In [None]:
# Mean peak area per node (rows) and run file (columns); combinations with no
# measurement are filled with 300. The table is saved as CSV.
d_sample = ms1_data.pivot_table(
    values='peak_area',
    index='node_id',
    columns=['lcmsrun_observed'],
    aggfunc='mean',
    fill_value=300,
)
d_sample.to_csv('PeakArea_ExudateData.csv')


In [None]:
import pandas as pd
# NOTE(review): none of the visible cells writes 'PeakArea_PlantData.csv' (they write
# 'BestHits_ExudateData.csv', 'PeakArea_ExudateData-groupedbyplant.csv' and
# 'PeakArea_ExudateData.csv'), so this file must already exist in the working
# directory - confirm where it comes from.
plant = pd.read_csv('PeakArea_PlantData.csv')