# ChemEcho Tutorial Notebook

## Initial Setup

In [None]:
# install chemecho
! pip uninstall -y chemecho
! pip install git+file:///global/homes/t/tharwood/repos/chemecho/

In [None]:
# imports
from chemecho.utils import load_processed_gnps_data, merge_in_nist
from chemecho.featurization import subformula_featurization, build_feature_matrix, feature_reduction, save_featurized_spectra, load_featurized_spectra, vectorize_spectrum

In [None]:
# Polarity of the spectra to work with: either 'negative' or 'positive'.
polarity = 'negative'
# Storage location for feature matrices, MS2 libraries, etc.
workdir = '/pscratch/sd/t/tharwood/chemecho_tutorial'

# Embedding parameters
# Either 'blur' or 'top': 'blur' assigns all subformulae within tolerance,
# 'top' takes only the best one.
vector_assignment_method = 'blur'
# Maximum ppm error of the subformula assignment.
max_ppm_error = 5
# Minimum number of occurrences a feature needs in order to be kept
# (variable spelling left as-is: the name is used later in the notebook).
min_feature_occurence = 6

## Download & Process MS2 Training Data (Libraries)

In [None]:
# Locations of the cleaned library tables inside the working directory.
gnps_table_path = f'{workdir}/gnps_cleaned.tsv'
nist_table_path = f'{workdir}/nist_cleaned.tsv'

# Load the processed GNPS library for the selected polarity.
gnps_cleaned = load_processed_gnps_data(
    gnps_cleaned_path=gnps_table_path,
    convert_spectra=True,
    polarity=polarity,
)

# If you have access to the commercial NIST library, merge it in:
merged_lib = merge_in_nist(
    gnps_cleaned,
    nist_cleaned_path=nist_table_path,
    convert_spectra=True,
    polarity=polarity,
)

# If not, comment out the merge above and use the GNPS data on its own:
# merged_lib = gnps_cleaned

In [None]:
# Summarize the size of the merged library.
n_spectra = merged_lib.shape[0]
# NOTE(review): the label says "InchiKeys" but the column counted is
# `inchikey_smiles` — confirm the column holds what the label promises.
n_unique_keys = len(merged_lib.inchikey_smiles.unique())

print(f"Total spectra: {n_spectra}")
print(f"Unique InchiKeys: {n_unique_keys}")

## Build Training Data Feature Matrix

In [None]:
# Assign subformulae to every library spectrum using the embedding parameters
# configured above. Two collections come back: one for peaks and one prefixed
# `nl_` (presumably neutral losses — confirm against chemecho).
peak_subformula_vectors, nl_subformula_vectors = subformula_featurization(
    merged_lib,
    vector_assignment=vector_assignment_method,
    max_ppm_error=max_ppm_error,
)

In [None]:
# Collect the positions of spectra with no subformula assigned (entries that
# are None). Assumes peak_subformula_vectors is an ordered sequence, so that
# enumeration order matches positional indexing.
failed_spectra_idxs = [
    idx
    for idx, vector in enumerate(peak_subformula_vectors)
    if vector is None
]

In [None]:
# Combine both sets of subformula vectors into a single feature matrix, along
# with the map that accompanies it.
featurized_spectral_data, feature_vector_index_map = build_feature_matrix(
    peak_subformula_vectors,
    nl_subformula_vectors,
)

In [None]:
# Reduce the feature set using the configured minimum occurrence count.
# The results overwrite the unreduced matrix and index map, so re-running
# this cell applies the reduction to already-reduced data.
featurized_spectral_data, feature_vector_index_map = feature_reduction(
    featurized_spectral_data,
    feature_vector_index_map,
    min_occurence=min_feature_occurence,
)

In [None]:
# Save the embeddings, together with the failed-spectra indices, under workdir.
# NOTE(review): overwrite=False presumably leaves previously saved files in
# place, so a re-run with new parameters may not refresh them — confirm.
save_featurized_spectra(
    featurized_spectral_data,
    feature_vector_index_map,
    failed_spectra_idxs,
    workdir,
    overwrite=False,
    polarity=polarity,
)

## Label Data and Train Model

In [None]:
from chemecho.train_predict import train_substructure_tree, filter_failed_idxs

In [None]:
# Load the saved embeddings for the selected polarity back from workdir.
(
    featurized_spectral_data,
    feature_vector_index_map,
    failed_spectra_idxs,
) = load_featurized_spectra(workdir, polarity=polarity)

In [None]:
# Filter the failed spectra out of both the feature matrix and the library
# table, using the recorded failed-spectra indices.
filtered_spectral_data, filtered_merged_lib = filter_failed_idxs(
    featurized_spectral_data,
    merged_lib,
    failed_spectra_idxs,
)

In [None]:
# Simplistic example: train a model for predicting glucose derivatives.
hexose_smarts = 'OCC1OC(O)C(O)C(O)C1O'

# Keyword options forwarded to the trainer.
tree_options = {
    'frag_type': 'smarts',
    'max_depth': 16,
    'min_frag_count': 1,
    'min_positive_unique': 10,
    'save_model': True,
}

model, report = train_substructure_tree(
    hexose_smarts,
    filtered_merged_lib,
    filtered_spectral_data,
    workdir,
    polarity,
    **tree_options,
)

In [None]:
# Show the report returned by train_substructure_tree.
print(report)

## Predict From New Spectra

In [None]:
import numpy as np

In [None]:
# Example query: metadata for the spectrum to be classified.
molecule1 = "Glucuronic acid"
# In real cases predicted with SIRIUS/MSBuddy.
parent_form1 = 'C6H10O7'
# In real cases either assumed (resulting in predictive penalty) or
# predicted/empirically determined.
adduct1 = '[M-H]-'
precursor_mz1 = 193.035

# First row of the spectrum. These look like m/z values (the last peak matches
# the precursor m/z) — confirm against vectorize_spectrum's expected layout.
mz_values1 = [
    51.709801, 57.034199, 59.013401, 71.013603, 71.408501, 72.992897,
    73.029297, 73.972702, 75.008499, 83.013702, 85.029404, 85.035004,
    87.008698, 89.0243, 95.013802, 99.009003, 101.024002, 103.003998,
    113.014999, 113.024002, 116.607002, 129.020004, 131.035004, 133.014008,
    157.014999, 163.024994, 193.035004,
]

# Second row of the spectrum: one value per entry of the first row
# (presumably the peak intensities).
intensities1 = [
    55178.199219, 229171.0, 2067150.0, 2229370.0, 59995.300781, 5109200.0,
    1023610.0, 94215.898438, 87368.898438, 106567.0, 2388860.0, 118491.0,
    91278.398438, 1230700.0, 467406.0, 246487.0, 1734100.0, 1689690.0,
    144557.0, 4421380.0, 65680.796875, 248136.0, 568488.0, 382782.0,
    295386.0, 107466.0, 836176.0,
]

# 2 x 27 array: row 0 and row 1 as defined above.
spectrum1 = np.array([mz_values1, intensities1])

In [None]:
# Vectorize the query spectrum against the training feature index map.
# The embedding parameters come from the notebook configuration rather than
# being hard-coded, so the query is always featurized with the same settings
# that were used for the training data.
ms2_vector = vectorize_spectrum(
    spectrum1,
    precursor_mz1,
    parent_form1,
    adduct1,
    feature_vector_index_map,
    max_ppm_error=max_ppm_error,
    vector_assignment=vector_assignment_method,
)

In [None]:
# Run the trained model on the vectorized query spectrum.
test_pred = model.predict(ms2_vector)

In [None]:
# Display the prediction (a bare expression at the end of a notebook cell is echoed as its output).
test_pred