# ChemEcho Tutorial Notebook

## Initial Setup

In [None]:
# (re)install chemecho from a local git checkout -- change the path below to point at your own clone
! pip uninstall -y chemecho
! pip install git+file:///global/homes/t/tharwood/repos/chemecho/

In [None]:
# imports
from chemecho.utils import load_processed_gnps_data, merge_in_nist
from chemecho.featurization import subformula_featurization, build_feature_matrix, feature_reduction, save_featurized_spectra, load_featurized_spectra, vectorize_spectrum

In [None]:
# Polarity of the data to process: either 'negative' or 'positive'.
polarity = "negative"
# Storage location for feature matrices, MS2 libraries, etc.
workdir = "/pscratch/sd/t/tharwood/chemecho_tutorial"

# Embedding parameters
# Subformula assignment method, either 'blur' or 'top': blur assigns all
# subformulae within tolerance, top takes only the best one.
vector_assignment_method = "blur"
# Maximum ppm error allowed for a subformula assignment.
max_ppm_error = 5
# Minimum number of feature occurrences required to keep a feature.
min_feature_occurence = 6

## Download & Process MS2 Training Data (Libraries)

In [None]:
# Load the processed GNPS data for the configured polarity; the cleaned table
# lives at <workdir>/gnps_cleaned.tsv.
gnps_cleaned = load_processed_gnps_data(
    gnps_cleaned_path=f'{workdir}/gnps_cleaned.tsv',
    convert_spectra=True,
    polarity=polarity,
)

# With access to the commercial NIST library, merge it into the GNPS data
# (cleaned table at <workdir>/nist_cleaned.tsv).
merged_lib = merge_in_nist(
    gnps_cleaned,
    nist_cleaned_path=f'{workdir}/nist_cleaned.tsv',
    convert_spectra=True,
    polarity=polarity,
)

# Without NIST access, skip the merge above and use the GNPS data on its own:
# merged_lib = gnps_cleaned

In [None]:
# Summarize the library: number of rows (one per spectrum) and number of
# distinct values in the inchikey_smiles column.
total_spectra_count = merged_lib.shape[0]
unique_inchikey_count = len(merged_lib.inchikey_smiles.unique())

print(f"Total spectra: {total_spectra_count}")
print(f"Unique InchiKeys: {unique_inchikey_count}")

## Build Training Data Feature Matrix

In [None]:
# sample merged_lib for faster processing time
# merged_lib = merged_lib.sample(1000).reset_index(drop=True)

In [None]:
# Run subformula featurization over the whole library using the embedding
# parameters configured above. Two per-spectrum collections come back: one
# for peaks and one prefixed "nl" (presumably neutral losses -- confirm
# against the chemecho docs).
peak_subformula_vectors, nl_subformula_vectors = subformula_featurization(
    merged_lib,
    vector_assignment=vector_assignment_method,
    max_ppm_error=max_ppm_error,
)

In [None]:
# Collect the positions of spectra with no subformula assigned, i.e. whose
# entry in peak_subformula_vectors is None. These indices are saved with the
# embeddings and later used to filter the failed spectra out.
# NOTE(review): assumes peak_subformula_vectors is a positional sequence
# (e.g. a list), so enumerate() yields the same indices as integer indexing.
failed_spectra_idxs = [
    idx
    for idx, peak_vector in enumerate(peak_subformula_vectors)
    if peak_vector is None
]

In [None]:
# Build the feature matrix from both sets of subformula vectors. The second
# return value is the feature index map that is later passed to
# vectorize_spectrum for new spectra.
featurized_spectral_data, feature_vector_index_map = build_feature_matrix(
    peak_subformula_vectors,
    nl_subformula_vectors,
)

In [None]:
# Reduce the feature set using the configured minimum feature occurrence
# count; both the matrix and its index map are replaced by the reduced
# versions.
featurized_spectral_data, feature_vector_index_map = feature_reduction(
    featurized_spectral_data,
    feature_vector_index_map,
    min_occurence=min_feature_occurence,
)

In [None]:
# Save the embeddings (feature matrix, index map and failed-spectrum indices)
# under workdir for the current polarity.
# NOTE(review): overwrite=False presumably protects existing files -- confirm
# against save_featurized_spectra.
save_featurized_spectra(
    featurized_spectral_data,
    feature_vector_index_map,
    failed_spectra_idxs,
    workdir,
    overwrite=False,
    polarity=polarity,
)

## Label Data and Train Model

In [None]:
from chemecho.train_predict import train_substructure_tree, filter_failed_idxs

In [None]:
# Load the saved embeddings back from workdir for the current polarity:
# feature matrix, feature index map and failed-spectrum indices.
(
    featurized_spectral_data,
    feature_vector_index_map,
    failed_spectra_idxs,
) = load_featurized_spectra(workdir, polarity=polarity)

In [None]:
# Filter the failed spectra (those with no subformula assigned) out of both
# the feature matrix and the library table.
filtered_spectral_data, filtered_merged_lib = filter_failed_idxs(
    featurized_spectral_data,
    merged_lib,
    failed_spectra_idxs,
)

In [None]:
# Simple example: train a model for predicting glycosylation, using a hexose
# pattern as the target substructure.
hexose_smarts = 'OCC1OC(O)C(O)C(O)C1O'

# frag_type='smarts' marks the pattern above as SMARTS; save_model=True asks
# for the trained model to be saved (location presumably under workdir --
# confirm). The call returns the model together with a report.
model, report = train_substructure_tree(
    hexose_smarts,
    filtered_merged_lib,
    filtered_spectral_data,
    workdir,
    polarity,
    frag_type='smarts',
    max_depth=16,
    min_frag_count=1,
    min_positive_unique=10,
    save_model=True,
)

In [None]:
# display the report returned by train_substructure_tree for the hexose model
print(report)

## Predict From New Spectra

In [None]:
import numpy as np

In [None]:
# Example query: a single MS2 spectrum plus the metadata needed to vectorize it.
molecule1 = "THYMIDINE-5'-DIPHOSPHO-ALPHA-D-GLUCOSE"
# Parent formula; in real cases predicted with SIRIUS/MSBuddy.
parent_form1 = 'C16H26N2O16P2'
# Adduct; in real cases either assumed (resulting in predictive penalty) or
# predicted/empirically determined.
adduct1 = '[M-H]-'
precursor_mz1 = 563.068

# First row of the spectrum array (the second row holds the paired values).
spectrum1_mz = [
    63.73970, 67.65610, 78.95770, 85.61220, 92.10690, 96.96830,
    120.65800, 125.03500, 158.92500, 176.99600, 181.29100, 195.00600,
    212.84900, 241.01200, 256.96300, 320.98100, 321.05000, 322.05300,
    345.62100, 383.00700, 491.87700, 536.28100, 563.07200,
]
spectrum1_intensity = [
    2425, 2601, 135164, 2375, 15208, 25244,
    2686, 23524, 11635, 17918, 3099, 75207,
    4317, 99260, 11655, 4182, 157385, 5274,
    3400, 13193, 3191, 3132, 38224,
]
# 2 x 23 array: row 0 = spectrum1_mz, row 1 = spectrum1_intensity.
spectrum1 = np.array([spectrum1_mz, spectrum1_intensity])

In [None]:
# Vectorize the query spectrum against the feature index map of the training
# matrix. The ppm tolerance and assignment method come from the embedding
# parameters configured at the top of the notebook rather than being
# hard-coded here, so the query vector is always built with the same settings
# as the training features.
ms2_vector = vectorize_spectrum(spectrum1,
                                precursor_mz1,
                                parent_form1,
                                adduct1,
                                feature_vector_index_map,
                                max_ppm_error=max_ppm_error,
                                vector_assignment=vector_assignment_method)