# Example Notebook For Associating Experimental Signals With The Carbon Network
Before running this notebook, make sure the CarbonNetwork graphml file is present in the build folder.

In [None]:
import pandas as pd
import numpy as np
import glob as glob
import matplotlib.pyplot as plt
import networkx as nx
import os
import glob

from typing import List, Tuple

from tqdm.notebook import tqdm

# TODO: replace these sys.path edits with submodules.
# metatlas and blink are imported from hard-coded repository checkouts; each
# path is prepended to sys.path right before the import that relies on it, so
# the order of these lines matters.
import sys
sys.path.insert(0,'/global/homes/t/tharwood/repos/metatlas')
from metatlas.io import feature_tools as ft  # supplies ft.group_consecutive used further down
sys.path.insert(0,'/global/homes/t/tharwood/repos/blink')
import blink  # supplies discretize_spectra / score_sparse_spectra / reformat_score_matrix

# helper module for this notebook; imported by bare name, so it has to be
# resolvable from sys.path / the working directory
import analysis_tools as at

## Parameters
The next code block sets parameters that are used throughout the remainder of the notebook.

In [None]:
# experiment directory; handed to at.get_files_df() to build the file table
## note: must follow Northen Lab file naming conventions and be converted to hdf5 format
exp_dir = '/global/cfs/cdirs/metatlas/raw_data/egsb/20231113_EB_SMK_107002-011_CenturyExp_20230414_EXP120A_C18-EP_USDAY72349'

# tolerance in ppm between experimental signal and node mz
# (passed to at.get_sample_ms1_data)
mz_ppm_tolerance = 4

# minimum MSMS score; compared with >= against 'max_blink_score', the larger
# of the 'nl_blink_score' and 'or_blink_score' columns
msms_score_min = 0.5

# minimum MSMS matching ion count; compared with >= against 'max_matches',
# the larger of the 'nl_matches' and 'or_matches' columns
msms_matches_min = 3

# retention time range in minutes for feature finding
# (passed to at.make_node_atlas)
rt_range = [1, 7]

# tolerance in daltons used for calculating MS/MS similarity scores
# (passed as `tolerance` to blink.discretize_spectra)
frag_mz_tolerance = 0.05

In [None]:
# collect and merge required data and metadata
# node table; presumably read from the CarbonNetwork graphml file mentioned at
# the top of the notebook -- confirm in analysis_tools
node_data = at.graph_to_df()
# atlas built from the node table and the retention-time window (rt_range)
node_atlas = at.make_node_atlas(node_data, rt_range)
# node table with spectral data attached; later cells read its 'node_id',
# 'precursor_mz', 'spectrum_original_spectra' and 'spectrum_nl_spectra' columns
merged_node_data = at.merge_spectral_data(node_data)
# one row per file found for exp_dir; later cells read its 'filename' column
files_data = at.get_files_df(exp_dir)

In [None]:
# TEMPORARY (testing only): keep just the first 10 files so the notebook runs
# quickly; remove the .head(10) later to process every file in the experiment.
files = files_data['filename'].head(10).tolist()

In [None]:
# get ms1 and ms2 data
# MS1 table for the node atlas across the selected files, matched with
# mz_ppm_tolerance; later cells read its 'label', 'lcmsrun_observed' and
# 'peak_height' columns
ms1_data = at.get_sample_ms1_data(node_atlas, files, mz_ppm_tolerance)
# MS2 table for the selected files; later cells read its 'precursor_mz',
# 'mdm_mz_vals'/'mdm_i_vals' and 'original_mz_vals'/'original_i_vals' columns
ms2_data = at.get_sample_ms2_data(files)

In [None]:
# Build one array per row of the form [mz values, intensity values] for each
# of the two spectrum flavours stored in ms2_data.
# (output column, m/z column, intensity column)
spectrum_column_specs = (
    ('nl_spectrum', 'mdm_mz_vals', 'mdm_i_vals'),
    ('original_spectrum', 'original_mz_vals', 'original_i_vals'),
)
for out_col, mz_col, i_col in spectrum_column_specs:
    # the lambda is consumed inside this iteration, so it sees the current columns
    ms2_data[out_col] = ms2_data.apply(
        lambda row: np.array([row[mz_col], row[i_col]]),
        axis=1,
    )

In [None]:
# Pool the precursor m/z values of the sample MS2 rows and of the reference
# nodes into a single ascending array.
sample_pmzs = ms2_data['precursor_mz'].tolist()
node_pmzs = merged_node_data['precursor_mz'].tolist()
all_pmzs = np.sort(np.array(sample_pmzs + node_pmzs))

# One group label per entry of all_pmzs.
# NOTE(review): group_consecutive runs with its default settings here;
# mz_ppm_tolerance is not passed in -- confirm the default grouping width is intended.
all_pmz_groups = ft.group_consecutive(all_pmzs)

# Lookup table: precursor m/z value -> group label.
pmz_group_key = {pmz: group for pmz, group in zip(all_pmzs, all_pmz_groups)}

In [None]:
# Label every sample MS2 row and every reference node with the precursor m/z
# group it belongs to, so the two tables can be joined on 'group_idx'.
ms2_data['group_idx'] = [pmz_group_key[pmz] for pmz in ms2_data['precursor_mz']]
merged_node_data['group_idx'] = [pmz_group_key[pmz] for pmz in merged_node_data['precursor_mz']]

In [None]:
# Pair each sample MS2 row with every reference node in the same precursor
# m/z group (default inner join on 'group_idx').
# The sample-side precursor column is renamed first so it stays distinct from
# the node-side 'precursor_mz' after the join.
node_columns = ['group_idx', 'node_id', 'precursor_mz', 'spectrum_original_spectra', 'spectrum_nl_spectra']
sample_side = ms2_data.rename(columns={'precursor_mz': 'data_calc_precursor_mz'})
ms2_and_node_data = sample_side.merge(merged_node_data[node_columns], on='group_idx')

In [None]:
# Pull the scoring inputs out of the merged table as plain Python lists.
# All six lists are row-aligned: element i of each list comes from row i of
# ms2_and_node_data, i.e. one sample-spectrum / reference-spectrum pair.

# 'nl' spectra: sample side and reference side
nl_data_spectra = ms2_and_node_data['nl_spectrum'].tolist()
nl_ref_spectra = ms2_and_node_data['spectrum_nl_spectra'].tolist()

# 'original' spectra: sample side and reference side.
# The sample side is converted with .tolist() like every other input, so
# blink receives a list rather than a pandas Series.
or_data_spectra = ms2_and_node_data['original_spectrum'].tolist()
or_ref_spectra = ms2_and_node_data['spectrum_original_spectra'].tolist()

# precursor m/z for the sample rows and for the reference nodes
data_pmzs = ms2_and_node_data['data_calc_precursor_mz'].tolist()
ref_pmzs = ms2_and_node_data['precursor_mz'].tolist()

In [None]:
# Score the 'nl' spectra: sample spectra (nl_data_spectra) against reference
# spectra (nl_ref_spectra), with frag_mz_tolerance as the fragment tolerance.
# Both lists come from the same merged table, so entry i of one is paired with
# entry i of the other.
discretized_spectra = blink.discretize_spectra(nl_data_spectra, nl_ref_spectra, data_pmzs,  ref_pmzs,
                                         bin_width=0.001, tolerance=frag_mz_tolerance, intensity_power=0.5, trim_empty=False, remove_duplicates=False, network_score=False)

# S12 is read through its 'mzi' and 'mzc' entries in the following cell
S12 = blink.score_sparse_spectra(discretized_spectra)
# NOTE(review): m is not read anywhere in the visible notebook -- confirm this call is still needed
m = blink.reformat_score_matrix(S12)

In [None]:
# Row i of ms2_and_node_data supplied both sample spectrum i and reference
# spectrum i, so only the diagonal of the all-vs-all result is needed.
# Read the diagonal straight from the sparse matrices rather than through
# np.diag(X.toarray()), which would first allocate the full dense N x N array.
ms2_and_node_data['nl_blink_score'] = S12['mzi'].diagonal()
ms2_and_node_data['nl_matches'] = S12['mzc'].diagonal()

In [None]:
# Score the 'original' spectra: sample spectra (or_data_spectra) against
# reference spectra (or_ref_spectra), using the same settings as the 'nl' run.
# This rebinds discretized_spectra, S12 and m, so the 'nl' results must already
# have been copied into ms2_and_node_data before this cell runs.
discretized_spectra = blink.discretize_spectra(or_data_spectra, or_ref_spectra, data_pmzs,  ref_pmzs,
                                         bin_width=0.001, tolerance=frag_mz_tolerance, intensity_power=0.5, trim_empty=False, remove_duplicates=False, network_score=False)

# S12 is read through its 'mzi' and 'mzc' entries in the following cell
S12 = blink.score_sparse_spectra(discretized_spectra)
# NOTE(review): m is not read anywhere in the visible notebook -- confirm this call is still needed
m = blink.reformat_score_matrix(S12)

In [None]:
# Row i of ms2_and_node_data supplied both sample spectrum i and reference
# spectrum i, so only the diagonal of the all-vs-all result is needed.
# Read the diagonal straight from the sparse matrices rather than through
# np.diag(X.toarray()), which would first allocate the full dense N x N array.
ms2_and_node_data['or_blink_score'] = S12['mzi'].diagonal()
ms2_and_node_data['or_matches'] = S12['mzc'].diagonal()

In [None]:
# display the merged sample/node table, including the nl_* and or_* score and match columns
ms2_and_node_data

In [None]:
# display the MS1 table as returned by at.get_sample_ms1_data
ms1_data

In [None]:
# Cast the two identifier columns of the MS1 table to the pandas 'string' dtype.
string_columns = ('label', 'lcmsrun_observed')
ms1_data = ms1_data.astype(dict.fromkeys(string_columns, 'string'))

In [None]:
# For every label keep only the row with the greatest peak height: sort the
# tallest peaks to the top, then drop_duplicates retains the first row it
# meets per label. Finally expose the label under the name 'node_id'.
ranked_by_height = ms1_data.sort_values('peak_height', ascending=False)
tallest_per_label = ranked_by_height.drop_duplicates(subset='label')
max_ms1_data = tallest_per_label.rename(columns={'label': 'node_id'})

In [None]:
# max_ms1_data = pd.merge(max_ms1_data.rename(columns={'label': 'node_id'}), node_data[['node_id', 'precursor_mz']], on='node_id')
# max_ms1_data['ppm_error'] = max_ms1_data.apply(lambda x: ((x.precursor_mz - x.mz_centroid) / x.precursor_mz) * 1000000, axis=1)

In [None]:
# Keep one MS2 match per node.
# Rows are ranked by nl_blink_score, ties broken by or_blink_score (both
# descending), and drop_duplicates keeps the top-ranked row for each node_id.
# NOTE(review): because nl_blink_score is the primary sort key, a row with a
# high or_blink_score but a lower nl_blink_score can be discarded here --
# confirm this ranking is intended.
# Each output column is listed exactly once: duplicate column labels would
# make max_ms2_data['nl_matches'] return a DataFrame instead of a Series.
best_match_columns = [
    'node_id',
    'nl_blink_score',
    'nl_matches',
    'or_blink_score',
    'or_matches',
    'lcmsrun_observed',
]
max_ms2_data = (
    ms2_and_node_data
    .sort_values(['nl_blink_score', 'or_blink_score'], ascending=False)
    .drop_duplicates(subset='node_id')
    # use the same column name as the MS1 table for the source file
    .rename(columns={'filename': 'lcmsrun_observed'})[best_match_columns]
)

In [None]:
# display the column labels of the per-node MS2 table
max_ms2_data.columns

In [None]:
# Per node, take the better of the 'nl' and 'or' results for the score and,
# separately, for the matching-ion count.
score_columns = ['nl_blink_score', 'or_blink_score']
match_columns = ['nl_matches', 'or_matches']
max_ms2_data['max_blink_score'] = max_ms2_data[score_columns].max(axis=1)
max_ms2_data['max_matches'] = max_ms2_data[match_columns].max(axis=1)

# Keep only nodes that reach both thresholds.
# NOTE(review): the two maxima are taken independently, so the score may come
# from the 'nl' comparison and the match count from the 'or' one -- confirm
# that mixing them is acceptable.
passes_score = max_ms2_data['max_blink_score'].ge(msms_score_min)
passes_matches = max_ms2_data['max_matches'].ge(msms_matches_min)
max_ms2_data = max_ms2_data.loc[passes_score & passes_matches]

In [None]:
# display the per-node MS1 table (highest peak_height row for each node_id)
max_ms1_data

In [None]:
# display the per-node MS2 table after the score / match-count filter
max_ms2_data

In [None]:
# display the full MS1 table again (after the string-dtype cast of its id columns)
ms1_data