# Reference spectra selection using submodular selection

- Randomly select 1000 files from the training split of the MassIVE-KB data set.
- Randomly subsample to 200,000 spectra.
- Compute a pairwise dot product similarity matrix for those 200,000 spectra using the peak-to-peak normalized dot product.
- Perform submodular selection using the facility location function to select the 1000 best reference spectra (`num_ref_spectra` below).
- UMAP plot using the pairwise similarity matrix.

In [None]:
import os
import sys
# Point GLEAMS_HOME at the project checkout below the user's home directory.
gleams_home = os.path.join(os.environ['HOME'], 'Projects', 'gleams')
os.environ['GLEAMS_HOME'] = gleams_home
# Make the project's `src` directory importable; it is appended only once so
# that rerunning the cell does not add duplicate entries.
src_dir = os.path.normpath(os.path.join(gleams_home, 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
import copy

import apricot
import joblib
import matplotlib.pyplot as plt
import numba as nb
import numpy as np
import pandas as pd
import pyteomics
import seaborn as sns
import umap

In [None]:
import warnings
# Suppress every FutureWarning for the remainder of the session.
warnings.simplefilter('ignore', FutureWarning)

In [None]:
# Initialize logging.
from gleams import logger as glogger
glogger.init()
# Initialize all random seeds before importing the remaining gleams modules.
# NOTE(review): the third-party packages imported in the earlier cells are
# already loaded at this point; confirm that seeding here is early enough.
from gleams import rndm
rndm.set_seeds()

# Project modules used below: configuration values, spectrum preprocessing
# and dot product, and peak file reading.
from gleams import config
from gleams.feature import spectrum
from gleams.ms_io import ms_io

In [None]:
import logging
# Fetch the 'gleams' logger and lower its threshold to DEBUG so that the
# per-file debug messages logged while reading spectra are emitted.
logger = logging.getLogger('gleams')
logger.setLevel('DEBUG')

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-<name>' and later releases removed the old names, so use
# whichever spelling this installation provides.
style_prefix = ('seaborn-v0_8' if 'seaborn-v0_8-white' in plt.style.available
                else 'seaborn')
plt.style.use([f'{style_prefix}-white', f'{style_prefix}-paper'])
plt.rc('font', family='serif')
sns.set_palette('Set1')
sns.set_context(font_scale=1.3)    # Single-column figure.

In [None]:
# Number of peak files randomly drawn from the metadata of the chosen split.
num_files = 1_000
# Number of spectra kept after random downsampling.
num_spectra = 200_000
# Number of reference spectra requested from the submodular selection.
num_ref_spectra = 1_000

In [None]:
# Directory below GLEAMS_HOME that holds the peak files.
# NOTE(review): not referenced elsewhere in the visible code;
# `get_spectra_from_file` rebuilds the same path itself.
peak_dir = os.path.join(os.environ['GLEAMS_HOME'], 'data', 'peak')

In [None]:
split = 'train'
# The metadata of a split is read from the main metadata filename with a
# `_<split>` suffix inserted before the extension.
split_metadata_filename = config.metadata_filename.replace(
    '.parquet', f'_{split}.parquet')
# Only the (dataset, filename) columns are loaded; keep each pair once.
filenames = pd.read_parquet(split_metadata_filename,
                            columns=['dataset', 'filename'])
filenames = filenames.drop_duplicates()

In [None]:
# Randomly pick at most `num_files` files (all of them if fewer are listed).
num_files_selected = min(num_files, len(filenames))
filenames_selected = filenames.sample(n=num_files_selected)

In [None]:
def get_spectra_from_file(dataset: str, filename: str):
    """
    Read the valid spectra from a single peak file.

    Every spectrum is tagged with its `dataset` and `filename`, copied, and
    then passed to `spectrum.preprocess`. Spectra whose preprocessing result
    is not valid are dropped from both returned lists.

    Parameters
    ----------
    dataset : str
        Name of the dataset directory below `$GLEAMS_HOME/data/peak`.
    filename : str
        Name of the peak file inside the dataset directory.

    Returns
    -------
    Tuple[list, list]
        The spectra that were passed through preprocessing and, at matching
        positions, the deep copies taken before preprocessing. Both lists are
        empty if the peak file does not exist (a warning is logged).
    """
    logger.debug('Process file %s/%s', dataset, filename)
    peak_filename = os.path.join(os.environ['GLEAMS_HOME'], 'data', 'peak',
                                 dataset, filename)
    if not os.path.isfile(peak_filename):
        logger.warning('Missing peak file %s, no spectra read',
                       peak_filename)
        return [], []

    processed, unprocessed = [], []
    for spec in ms_io.get_spectra(peak_filename):
        spec.dataset, spec.filename = dataset, filename
        # Keep an untouched copy taken before preprocessing is applied.
        untouched = copy.deepcopy(spec)
        preprocessed = spectrum.preprocess(spec, config.fragment_mz_min,
                                           config.fragment_mz_max)
        if not preprocessed.is_valid:
            continue
        processed.append(spec)
        unprocessed.append(untouched)
    return processed, unprocessed

In [None]:
logger.info('Read spectrum files')
# One delayed task per selected (dataset, filename) pair.
read_tasks = [
    joblib.delayed(get_spectra_from_file)(dataset, filename)
    for dataset, filename in zip(filenames_selected['dataset'],
                                 filenames_selected['filename'])]
per_file_results = joblib.Parallel(
    n_jobs=-1, backend='multiprocessing')(read_tasks)
# Concatenate the per-file results; both lists stay aligned because they are
# extended in lockstep.
spectra, spectra_raw = [], []
for processed_spectra, raw_spectra in per_file_results:
    spectra.extend(processed_spectra)
    spectra_raw.extend(raw_spectra)

In [None]:
# Draw at most `num_spectra` positions without replacement. The same
# positions index both arrays, so processed and raw spectra stay paired.
idx = np.random.choice(len(spectra), min(num_spectra, len(spectra)),
                       replace=False)
spectra_selected = np.asarray(spectra)[idx]
spectra_raw_selected = np.asarray(spectra_raw)[idx]
# Report the number of spectra actually kept, which is lower than
# `num_spectra` when fewer spectra were read.
logger.info('%d spectra randomly downsampled to %d spectra', len(spectra),
            len(idx))

In [None]:
# Cache the downsampled spectra (preprocessed and raw) on disk; the
# commented-out `joblib.load` calls in the next cell restore them.
joblib.dump(spectra_selected, 'ref_spectra_selected.joblib')
joblib.dump(spectra_raw_selected, 'ref_spectra_selected_raw.joblib')

In [None]:
# spectra_selected = joblib.load('ref_spectra_selected.joblib')
# spectra_raw_selected = joblib.load('ref_spectra_selected_raw.joblib')

In [None]:
def pairwise_dot(spectra_arr, out):
    """
    Compute all pairwise spectrum dot products into `out`.

    The matrix is filled by `pairwise_dot_nb` and afterwards clipped to the
    interval [0, 1] in place.

    Parameters
    ----------
    spectra_arr
        Spectra array forwarded unchanged to `pairwise_dot_nb`.
    out
        Preallocated output matrix; it is overwritten in place.

    Returns
    -------
    The same `out` array, holding the clipped dot products.
    """
    pairwise_dot_nb(spectra_arr, out)
    # Guard against numerical instability.
    np.clip(out, 0, 1, out=out)
    return out

@nb.njit(parallel=True)
def pairwise_dot_nb(spectra_arr, out):
    """
    Fill `out` in place with the pairwise spectrum dot products.

    `spectrum.dot` is evaluated once for every pair of spectra `i < j` and
    the result is stored at both `out[i, j]` and `out[j, i]`. The diagonal of
    `out` is set to 1. Nothing is returned.

    Parameters
    ----------
    spectra_arr
        Array indexed as `spectra_arr[i, 0]` and `spectra_arr[i, 1]` for
        spectrum `i`. The caller in this notebook stores the m/z values at
        index 0 and the intensities at index 1, left-padded with zeros.
    out
        Preallocated output matrix, written at indices up to `[n - 1, n - 1]`
        for `n = spectra_arr.shape[0]` spectra.
    """
    # The outer loop is distributed with `nb.prange`; each (i, j) pair with
    # i < j is visited by exactly one outer iteration.
    for i in nb.prange(spectra_arr.shape[0]):
        for j in range(i + 1, spectra_arr.shape[0]):
            # NOTE(review): the zero padding is passed to `spectrum.dot`
            # unchanged; confirm that it does not affect the result.
            out[i, j] = out[j, i] = spectrum.dot(
                spectra_arr[i, 0], spectra_arr[i, 1],
                spectra_arr[j, 0], spectra_arr[j, 1],
                config.fragment_mz_tol)
    # The loops above skip i == j, so set the self-similarity explicitly.
    np.fill_diagonal(out, 1)

In [None]:
def _to_padded_array(spec):
    # Stack m/z (row 0) and intensity (row 1) and left-pad both rows with
    # zeros up to `config.max_peaks_used` columns.
    num_missing_peaks = config.max_peaks_used - len(spec.mz)
    return np.pad([spec.mz, spec.intensity],
                  ((0, 0), (num_missing_peaks, 0)), mode='constant')


# One fixed-width (m/z, intensity) array per selected spectrum.
spectra_arr = np.asarray([_to_padded_array(spec)
                          for spec in spectra_selected])

In [None]:
logger.info('Compute pairwise dot products between all spectra')
# Size the matrix by the number of spectra actually present. Fewer than
# `num_spectra` spectra exist when the downsampling had fewer spectra to
# draw from, and a matrix sized by `num_spectra` would then contain rows and
# columns that do not correspond to any spectrum.
num_spectra_selected = spectra_arr.shape[0]
dot_products = pairwise_dot(
    spectra_arr,
    np.zeros((num_spectra_selected, num_spectra_selected), np.float32))

In [None]:
# Cache the pairwise similarity matrix on disk; the commented-out
# `joblib.load` call in the next cell restores it.
joblib.dump(dot_products, 'ref_spectra_pairwise_dot.joblib')

In [None]:
# dot_products = joblib.load('ref_spectra_pairwise_dot.joblib')

In [None]:
logger.info('Select %d reference spectra from %d spectra using the facility '
            'location function', num_ref_spectra, dot_products.shape[0])
# 'precomputed' is passed because `dot_products` already holds the pairwise
# similarities between the spectra.
facility_location = apricot.FacilityLocationSelection(
    num_ref_spectra, 'precomputed', n_jobs=-1)
selector = facility_location.fit(dot_products)

In [None]:
# Cache the fitted selector on disk; the commented-out `joblib.load` call in
# the next cell restores it.
joblib.dump(selector, 'ref_spectra_submodular_selector.joblib')

In [None]:
# selector = joblib.load('ref_spectra_submodular_selector.joblib')

In [None]:
# `import pyteomics` alone does not load the `mgf` submodule; import it
# explicitly instead of relying on another module having done so already.
import pyteomics.mgf

logger.info('Export the selected reference spectra to an MGF file')
# The raw (not preprocessed) spectra are written in `selector.ranking` order.
# Each title is built from the dataset, the file name without its extension,
# and the spectrum identifier.
f_out = pyteomics.mgf.write(
    [{'m/z array': spec.mz, 'intensity array': spec.intensity, 'params': {
        'title': f'mzspec:{spec.dataset}:'
                 f'{os.path.splitext(spec.filename)[0]}:'
                 f'scan:{spec.identifier}',
        'rtinseconds': spec.retention_time,
        'pepmass': (spec.precursor_mz, None),
        'charge': spec.precursor_charge}}
     for spec in spectra_raw_selected[selector.ranking]],
    'gleams_reference_spectra.mgf', file_mode='w')
f_out.close()

In [None]:
logger.info('UMAP plot submodular selection')
# UMAP is given a precomputed distance matrix, so the similarities are
# converted to distances as `1 - dot_products`.
embedding = (umap.UMAP(n_neighbors=5, metric='precomputed')
             .fit_transform(1 - dot_products))

In [None]:
# Cache the UMAP embedding on disk; the commented-out `joblib.load` call in
# the next cell restores it.
joblib.dump(embedding, 'ref_spectra_umap.joblib')

In [None]:
# embedding = joblib.load('ref_spectra_umap.joblib')

In [None]:
# Square figure.
width = 7
# height = width / 1.618
fig, ax = plt.subplots(figsize=(width, width))

# Background layer: every embedded spectrum as a faint gray dot.
ax.scatter(embedding[:, 0], embedding[:, 1], s=1, c='lightgray', alpha=0.1,
           rasterized=True)
# Foreground layer: the selected reference spectra, colored by their position
# in the selection order.
selected_embedding = embedding[selector.ranking]
sc = ax.scatter(selected_embedding[:, 0], selected_embedding[:, 1],
                s=10, c=np.arange(num_ref_spectra), cmap='YlGnBu',
                alpha=0.75, rasterized=True)

# Label the axes but hide ticks and spines.
ax.set_xlabel('UMAP1', size='large')
ax.set_ylabel('UMAP2', size='large')
ax.set_xticks([])
ax.set_yticks([])
sns.despine(left=True, bottom=True)

# Colorbar in its own axes to the right of the plot: one color boundary per
# selection rank and a tick every 100 ranks.
cbar_ax = fig.add_axes([0.95, 0.25, 0.025, 0.5])
rank_boundaries = np.arange(0, num_ref_spectra + 1, 1)
colorbar = fig.colorbar(sc, ticks=rank_boundaries[::100],
                        boundaries=rank_boundaries, cax=cbar_ax)
colorbar.set_label('Submodular selection order', size='large', labelpad=15)

plt.savefig('ref_spectra_umap.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

In [None]:
# Flush and close all logging handlers at the end of the notebook.
logging.shutdown()