# Reference spectra

## Submodular selection overview

- Randomly select 1000 of the MassIVE-KB input files, which together contain 24,076,639 spectra
- Randomly subsample to 200,000 spectra
- Compute a pairwise similarity matrix for those 200,000 spectra using a peak-to-peak normalized dot product (the code below refers to this matrix as the "distance" matrix)
- Perform submodular selection using the facility location function to select the 500 best reference spectra
- UMAP plot of all subsampled spectra, using 1 − similarity as the precomputed pairwise distance, with the selected reference spectra highlighted

In [None]:
import os
import sys

# Location of the GLEAMS source tree. Exactly one of the two assignments
# below should be active, depending on where the notebook is run.
# Cluster.
# src_dir = os.path.abspath(f'{os.environ["HOME"]}/projects/gleams/src')
# Laptop.
src_dir = os.path.abspath(f'{os.environ["HOME"]}/Projects/gleams/src')
# Make the `gleams` package importable; the membership check avoids adding
# the same path again when this cell is re-executed.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
import copy
import logging

import joblib
import matplotlib.pyplot as plt
import numba as nb
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm
import umap
from apricot import FacilityLocationSelection
from pyteomics import mgf

from gleams import config
from gleams.embed import encoder
from gleams.embed import spectrum
from gleams.io import ms_io

In [None]:
# Rebind `tqdm.tqdm` to tqdm's notebook progress bar so that every later
# `tqdm.tqdm(...)` call in this notebook uses it.
tqdm.tqdm = tqdm.tqdm_notebook

In [None]:
# Log at INFO level, prefixing every message with a timestamp and its level.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s: %(message)s')

# Plot styling.
# NOTE(review): the 'seaborn-*' style names were renamed to 'seaborn-v0_8-*'
# in newer matplotlib releases — confirm against the installed version.
plt.style.use(['seaborn-white', 'seaborn-paper'])
plt.rc('font', family='serif')
sns.set_palette('Set1')
sns.set_context(font_scale=1.3)    # single-column figure

In [None]:
# Seed NumPy's global RNG so the `np.random.choice` calls below draw the same
# indices on every run.
np.random.seed(42)

In [None]:
# Number of spectrum files to pick at random from the data directory.
num_files = 1000
# Number of preprocessed spectra to keep after random subsampling.
num_spectra = 200000
# Number of reference spectra chosen by the submodular selection.
num_ref_spectra = 500

In [None]:
# Directory that is scanned for mzML/mzXML spectrum files below.
data_dir = '/net/noble/vol3/user/wout/gleams/data'

In [None]:
# Collect all mzML/mzXML files (optionally xz-compressed) in the data
# directory; the extension match is case-insensitive.
# The listing is sorted because `os.listdir` returns entries in arbitrary
# order, which would otherwise make the seeded random file selection below
# irreproducible across machines and runs.
spectrum_extensions = ('.mzml', '.mzxml', '.mzml.xz', '.mzxml.xz')
filenames = [os.path.join(data_dir, filename)
             for filename in sorted(os.listdir(data_dir))
             if filename.lower().endswith(spectrum_extensions)]

In [None]:
# Draw `num_files` distinct file indices uniformly at random (sampling
# without replacement) and pick the corresponding paths.
selected_file_idx = np.random.choice(len(filenames), size=num_files,
                                     replace=False)
filenames_selected = np.asarray(filenames)[selected_file_idx]

In [None]:
logging.info('Read spectrum files')
# `spectra` holds the preprocessed spectra and `spectra_raw` the untouched
# copies; both lists are kept index-aligned.
spectra, spectra_raw = [], []
for file_i, spec_file in enumerate(filenames_selected, 1):
    basename = os.path.basename(spec_file)
    logging.info('Processing file %s [%d/%d]', basename, file_i,
                 len(filenames_selected))
    spectrum_reader = tqdm.tqdm(ms_io.get_spectra(spec_file),
                                desc='Spectra read', leave=False,
                                unit='spectra')
    for spec in spectrum_reader:
        # Remember the source file; it is needed for the MGF title later on.
        spec.filename = basename
        # Copy before preprocessing so the original peaks remain available
        # (assumes `spectrum.preprocess` modifies `spec` in place).
        spec_raw = copy.deepcopy(spec)
        processed = spectrum.preprocess(spec, config.fragment_mz_min,
                                        config.fragment_mz_max)
        if not processed.is_valid:
            continue
        spectra.append(spec)
        spectra_raw.append(spec_raw)

In [None]:
# Subsample `num_spectra` spectra without replacement. The same index array
# is applied to both lists so processed and raw spectra stay aligned.
idx = np.random.choice(len(spectra), size=num_spectra, replace=False)
spectra_selected, spectra_raw_selected = (np.asarray(spectra)[idx],
                                          np.asarray(spectra_raw)[idx])
logging.info('%d spectra downsampled to %d spectra', len(spectra),
             num_spectra)

In [None]:
# Cache the subsampled spectra on disk; the commented-out `joblib.load` calls
# in the next cell restore them without re-reading the spectrum files.
joblib.dump(spectra_selected, 'spectra_selected.joblib')
joblib.dump(spectra_raw_selected, 'spectra_raw_selected.joblib')

In [None]:
# spectra_selected = joblib.load('spectra_selected.joblib')
# spectra_raw_selected = joblib.load('spectra_raw_selected.joblib')

In [None]:
def pairwise_distances(spectra_arr, out):
    """Fill `out` with all-against-all spectrum scores and return it.

    The scores are computed by `pairwise_distances_nb`, which writes the
    spectrum dot product of every pair into `out` and sets the diagonal to 1
    (so, despite the name, larger values mean more similar spectra).

    Parameters
    ----------
    spectra_arr : array indexed as ``[spectrum, 0 (m/z) or 1 (intensity)]``
        The padded spectra to compare.
    out : 2D array
        Pre-allocated square output matrix; modified in place.

    Returns
    -------
    The same `out` array, with all values limited to the range [0, 1].
    """
    pairwise_distances_nb(spectra_arr, out)
    # Floating-point round-off can push scores marginally outside [0, 1];
    # clamp them in place.
    np.clip(out, 0, 1, out=out)
    return out

@nb.njit(parallel=True)
def pairwise_distances_nb(spectra_arr, out):
    """Numba kernel that writes all pairwise `spectrum.dot` scores to `out`.

    `spectra_arr[k, 0]` and `spectra_arr[k, 1]` are passed to `spectrum.dot`
    as the first and second array of spectrum `k` (m/z and intensity, given
    how `spectra_arr` is built in this notebook), together with
    `config.fragment_mz_tol`. `out` must be a pre-allocated square matrix
    with one row and column per spectrum; it is modified in place and
    nothing is returned.
    """
    # The outer loop over spectra is distributed across threads by numba.
    for i in nb.prange(spectra_arr.shape[0]):
        # Only pairs with j > i are computed; each score is mirrored into
        # both triangles of the matrix.
        for j in range(i + 1, spectra_arr.shape[0]):
            out[i, j] = out[j, i] = spectrum.dot(
                spectra_arr[i, 0], spectra_arr[i, 1],
                spectra_arr[j, 0], spectra_arr[j, 1],
                config.fragment_mz_tol)
    # The self-comparisons skipped above are set to 1 directly.
    np.fill_diagonal(out, 1)

In [None]:
# Pack every spectrum into a fixed-size block of two rows (m/z values and
# intensities) with `max_len` columns. Spectra with fewer peaks are padded
# with zeros at the front, so the real peaks occupy the trailing columns.
max_len = config.max_peaks_used
spectra_arr = np.asarray([
    np.pad([spec.mz, spec.intensity],
           pad_width=((0, 0), (max_len - len(spec.mz), 0)),
           mode='constant')
    for spec in spectra_selected])

In [None]:
logging.info('Compute pairwise distances')
# Size the matrix from the spectra that are actually present rather than from
# `num_spectra`: the jitted kernel iterates over `spectra_arr.shape[0]`, so a
# mismatch would either write outside the matrix or leave all-zero rows.
num_selected = len(spectra_arr)
dist = np.zeros((num_selected, num_selected), np.float32)
# The matrix is filled in place; the returned array is the same object,
# clipped to [0, 1].
dist = pairwise_distances(spectra_arr, dist)

In [None]:
# Cache the pairwise matrix on disk so it can be reloaded (see the
# commented-out `joblib.load` in the next cell) instead of recomputed.
joblib.dump(dist, 'pairwise_distances.joblib')

In [None]:
# dist = joblib.load('pairwise_distances.joblib')

In [None]:
logging.info('Perform submodular selection')
# Select `num_ref_spectra` spectra with the facility location function.
# 'precomputed' indicates that `dist` already is a pairwise matrix rather
# than raw feature vectors.
# NOTE(review): `dist` holds dot-product scores (1 on the diagonal), i.e.
# similarities — presumably what apricot expects here; confirm against the
# apricot documentation.
selector = FacilityLocationSelection(num_ref_spectra, 'precomputed')
selector.fit(dist)

In [None]:
# Cache the fitted selector (including its `ranking`) on disk.
joblib.dump(selector, 'submodular_selector.joblib')

In [None]:
# selector = joblib.load('submodular_selector.joblib')

In [None]:
logging.info('Export selected reference spectra to MGF')
# Lookup table from spectrum file name to dataset: read only the two
# relevant columns, drop repeated rows, and index by file name.
metadata = (
    pd.read_csv(f'{os.environ["HOME"]}/projects/gleams/data/metadata.csv',
                usecols=['filename', 'dataset'])
    .drop_duplicates()
    .set_index('filename'))

In [None]:
def _run_name(filename):
    """Return `filename` without its spectrum-format extension.

    An optional trailing '.xz' compression suffix is removed first; after
    that exactly one more extension is stripped. Dots that are part of the
    run name itself are therefore preserved (e.g. 'a.b.mzML' -> 'a.b'),
    which an unconditional double `splitext` would not do.
    """
    stem, ext = os.path.splitext(filename)
    if ext.lower() == '.xz':
        stem = os.path.splitext(stem)[0]
    return stem


# Write the raw (unprocessed) version of the selected reference spectra to an
# MGF file, in submodular selection order. Each title has the form
# "mzspec:<dataset>:<run name>:scan:<identifier>".
f_out = mgf.write(
    [{'m/z array': spec.mz, 'intensity array': spec.intensity, 'params': {
        'title': f'mzspec:{metadata.loc[spec.filename, "dataset"]}:'
                 f'{_run_name(spec.filename)}:'
                 f'scan:{spec.identifier}',
        'rtinseconds': spec.retention_time,
        'pepmass': (spec.precursor_mz, None),
        'charge': spec.precursor_charge}}
     for spec in spectra_raw_selected[selector.ranking]],
    'gleams_reference_spectra.mgf', file_mode='w')
f_out.close()

In [None]:
logging.info('UMAP plot submodular selection')
# `dist` holds scores clipped to [0, 1] with 1 on the diagonal, so `1 - dist`
# turns them into distances (0 for identical spectra) for the 'precomputed'
# metric. Note that `1 - dist` allocates a second matrix of the same size.
embedding = (umap.UMAP(n_neighbors=5, metric='precomputed', random_state=42)
             .fit_transform(1 - dist))

In [None]:
# Cache the UMAP coordinates on disk so the plot can be redrawn without
# refitting (see the commented-out `joblib.load` in the next cell).
joblib.dump(embedding, 'umap_embedding.joblib')

In [None]:
# embedding = joblib.load('umap_embedding.joblib')

In [None]:
# Square figure.
width = 7
# height = width / 1.618
fig, ax = plt.subplots(figsize=(width, width))

# Background layer: every subsampled spectrum as a faint gray dot.
ax.scatter(embedding[:, 0], embedding[:, 1],
           c='lightgray', marker='.', alpha=0.01)
# Foreground layer: the reference spectra, colored by the order in which the
# submodular selection picked them.
ref_embedding = embedding[selector.ranking]
sc = ax.scatter(ref_embedding[:, 0], ref_embedding[:, 1],
                c=np.arange(num_ref_spectra), cmap='cividis', alpha=0.75)

# UMAP coordinates have no interpretable scale: label the axes but hide
# ticks and spines.
ax.set_xlabel('UMAP dimension 1', size='large')
ax.set_ylabel('UMAP dimension 2', size='large')
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticks([])
sns.despine(left=True, bottom=True)

# Dedicated axes to the right of the plot for the selection-order colorbar.
cbar_ax = fig.add_axes([0.95, 0.25, 0.025, 0.5])
colorbar = fig.colorbar(sc, cax=cbar_ax,
                        ticks=np.arange(0, num_ref_spectra + 1, 100),
                        boundaries=np.arange(0, num_ref_spectra + 1, 1))
colorbar.set_label('Submodular selection order', size='large', labelpad=15)

plt.savefig('umap_ref_spectra.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()