# ChemEcho Tutorial Notebook

## Initial Setup

In [None]:
# Install chemecho: remove any previously installed copy first, then install
# from the local git checkout referenced by the file:// URL below.
# NOTE(review): the path looks specific to one user's home directory; point it
# at your own clone of the repository before running.
! pip uninstall -y chemecho
! pip install git+file:///global/homes/t/tharwood/repos/chemecho/

In [None]:
# imports
from chemecho.utils import load_processed_gnps_data, merge_in_nist
from chemecho.featurization import subformula_featurization, build_feature_matrix, feature_reduction, save_featurized_spectra, load_featurized_spectra

In [None]:
# Polarity of the data to work with: either 'negative' or 'positive'.
polarity = 'negative'

# Storage location for feature matrices, MS2 libraries, etc.
workdir = '/pscratch/sd/t/tharwood/chemecho_tutorial'

# --- embedding parameters ---

# Subformula assignment method, either 'blur' or 'top':
# 'blur' assigns all subformulae within tolerance, 'top' takes only the best one.
vector_assignment_method = 'blur'

# Max ppm error of the subformula assignment.
max_ppm_error = 5

# Minimum number of feature occurrences to keep.
min_feature_occurence = 6

## Download & Process MS2 Training Data (Libraries)

In [None]:
# Load the cleaned GNPS library for the selected polarity.
gnps_cleaned = load_processed_gnps_data(
    gnps_cleaned_path=f'{workdir}/gnps_cleaned.tsv',
    convert_spectra=True,
    polarity=polarity,
)

# If you have access to the commercial NIST library, merge it in:
merged_lib = merge_in_nist(
    gnps_cleaned,
    nist_cleaned_path=f'{workdir}/nist_cleaned.tsv',
    convert_spectra=True,
    polarity=polarity,
)

# If not, use the GNPS data on its own instead:
# merged_lib = gnps_cleaned

In [None]:
# Report the size of the merged library: row count, then the number of
# distinct values in the inchikey_smiles column.
print(f"Total spectra: {len(merged_lib)}")
print(f"Unique InchiKeys: {len(merged_lib['inchikey_smiles'].unique())}")

## Build Training Data Feature Matrix

In [None]:
# Sample merged_lib for faster processing time. The sample size is capped at
# the number of available rows: sampling without replacement raises a
# ValueError when asked for more rows than the library contains.
merged_lib = merged_lib.sample(n=min(1000, len(merged_lib))).reset_index(drop=True)

In [None]:
# Featurize every spectrum in merged_lib using the assignment method and ppm
# tolerance configured above; the call returns two collections of vectors.
peak_subformula_vectors, nl_subformula_vectors = subformula_featurization(
    merged_lib,
    vector_assignment=vector_assignment_method,
    max_ppm_error=max_ppm_error,
)

In [None]:
# Collect the positions of spectra with no subformula assigned
# (entries of peak_subformula_vectors that are None).
failed_spectra_idxs = [
    idx
    for idx, vector in enumerate(peak_subformula_vectors)
    if vector is None
]

In [None]:
# Build the feature matrix and its index map from the two sets of
# subformula vectors.
featurized_spectral_data, feature_vector_index_map = build_feature_matrix(
    peak_subformula_vectors,
    nl_subformula_vectors,
)

In [None]:
# Reduce the feature set, passing the configured minimum occurrence count;
# both the matrix and its index map are replaced by the reduced versions.
featurized_spectral_data, feature_vector_index_map = feature_reduction(
    featurized_spectral_data,
    feature_vector_index_map,
    min_occurence=min_feature_occurence,
)

In [None]:
# Save the embeddings (feature matrix, index map and failed-spectra indices)
# under workdir for the selected polarity.
save_featurized_spectra(
    featurized_spectral_data,
    feature_vector_index_map,
    failed_spectra_idxs,
    workdir,
    overwrite=False,
    polarity=polarity,
)

In [None]:
# Load the embeddings back from workdir for the selected polarity.
(
    featurized_spectral_data,
    feature_vector_index_map,
    failed_spectra_idxs,
) = load_featurized_spectra(workdir, polarity=polarity)

## Label Data and Train Model

In [None]:
from chemecho.train_predict import train_substructure_tree

In [None]:
# Simple example: train a model for predicting glycosylation, using a hexose
# substructure written as a SMARTS pattern.
hexose_smarts = 'OCC1OC(O)C(O)C(O)C1O'

# Pass the pattern defined above as the fragment argument. The original call
# referenced an undefined name `frag`, which raised a NameError.
train_substructure_tree(hexose_smarts, merged_lib, featurized_spectral_data, workdir, polarity,
                        frag_type='smarts',
                        max_depth=16,
                        min_frag_count=1,
                        min_positive_unique=10,
                        save_model=True)

## Predict From New Spectra