In [None]:
import os
import sys
# Point GLEAMS_HOME at the project checkout below the user's home directory.
# Any value already present in the environment is overwritten.
os.environ['GLEAMS_HOME'] = os.path.join(
    os.environ['HOME'], 'Projects', 'gleams')
# Expose the project's `src` directory on the module search path. The guard
# keeps the entry unique when the cell is executed more than once.
src_dir = os.path.normpath(
    os.path.join(os.environ['GLEAMS_HOME'], 'src'))
if sys.path.count(src_dir) == 0:
    sys.path.append(src_dir)

In [None]:
import collections
import itertools

import joblib
import matplotlib.pyplot as plt
import numba as nb
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm.notebook as tqdm
from sklearn.metrics import auc, roc_curve

In [None]:
import warnings
# Ignore every FutureWarning raised from here on so that these messages do
# not end up in the notebook output.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Initialize logging.
from gleams import logger as glogger
glogger.init()
# Initialize all random seeds before importing the remaining GLEAMS modules
# below; the order of these statements is deliberate and must be kept.
# NOTE(review): presumably the later imports pull in libraries that have to
# be seeded first -- confirm against `gleams.rndm`.
from gleams import rndm
rndm.set_seeds()

from gleams import config
from gleams.feature import spectrum
from gleams.ms_io import ms_io
from gleams.nn import embedder, data_generator, nn

In [None]:
import logging
# Notebook-wide handle on the 'gleams' logger, with messages of level DEBUG
# and above enabled on that logger.
logger = logging.getLogger('gleams')
logger.setLevel(logging.DEBUG)

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and Matplotlib 3.8 removed the old names. Use whichever spelling this
# installation provides so the notebook runs on both old and new versions.
plt.style.use([
    style if style in plt.style.available
    else style.replace('seaborn-', 'seaborn-v0_8-', 1)
    for style in ('seaborn-white', 'seaborn-paper')])
plt.rc('font', family='serif')
sns.set_palette('Set1')
sns.set_context('paper', font_scale=1.3)    # Single-column figure.

In [None]:
# Data split whose feature, pair, and metadata files are read by the cells
# below (the name is interpolated into the file names).
split = 'test'
# Passed to the `PairSequence` constructor below.
# NOTE(review): presumably the maximum number of pairs to evaluate -- confirm
# against `data_generator.PairSequence`.
num_pairs = 10_000_000

In [None]:
# The feature file, the positive pairs file, and the negative pairs file live
# side by side in `$GLEAMS_HOME/data/feature` and differ only in their
# suffix, so the three paths are generated from a single template.
# NOTE(review): the meaning of the trailing positional `False` is not visible
# here (possibly a shuffle flag) -- confirm against `PairSequence`.
pair_generator = data_generator.PairSequence(
    *(os.path.join(os.environ['GLEAMS_HOME'], 'data', 'feature',
                   f'feature_{config.massivekb_task_id}_{split}{suffix}')
      for suffix in ('.npz', '_pairs_pos.npy', '_pairs_neg.npy')),
    config.batch_size, nn._get_feature_split(), num_pairs,
    False)

In [None]:
# Per-spectrum metadata for the selected split. Later cells read its
# 'dataset', 'filename', 'scan', 'charge', and 'mz' columns and look rows up
# by the spectrum indices stored in the pairs.
pair_metadata = pd.read_parquet(
    os.path.join(os.environ['GLEAMS_HOME'], 'data', 'feature',
                 f'feature_{config.massivekb_task_id}_{split}.parquet'))

In [None]:
def _get_spectra_from_file(dataset, filename, scans):
    """
    Read and preprocess the requested spectra from a single peak file.

    The peak file is expected at
    `$GLEAMS_HOME/data/peak/<dataset>/<filename>`.

    Parameters
    ----------
    dataset : str
        Name of the dataset directory that contains the peak file.
    filename : str
        Name of the peak file inside the dataset directory.
    scans
        Scan selection that is forwarded unchanged to `ms_io.get_spectra`.

    Returns
    -------
    dict
        The preprocessed spectra keyed by
        `<dataset>/<filename>/<spectrum identifier>`. The dictionary is empty
        if the peak file does not exist, in which case a warning is logged.
    """
    peak_file = os.path.join(os.environ['GLEAMS_HOME'], 'data', 'peak',
                             dataset, filename)
    # Nothing can be read without the peak file; report it and bail out.
    if not os.path.isfile(peak_file):
        logger.warning('Missing peak file %s, no spectra read', filename)
        return {}
    processed = {}
    for raw_spec in ms_io.get_spectra(peak_file, scans):
        clean_spec = spectrum.preprocess(raw_spec, config.fragment_mz_min,
                                         config.fragment_mz_max)
        processed[f'{dataset}/{filename}/{raw_spec.identifier}'] = clean_spec
    return processed

In [None]:
# Read and preprocess all spectra referenced by the metadata, one dataset at
# a time, with the peak files of a dataset processed in parallel.
dataset_total = pair_metadata['dataset'].nunique()
spectra_per_file = []
for dataset_i, (dataset, md_dataset) in enumerate(
        pair_metadata.groupby('dataset', sort=False), 1):
    # Report progress through the notebook's 'gleams' logger (the same one
    # `_get_spectra_from_file` uses) rather than through the root logger.
    logger.info('Process dataset %s (%d files) [%3d/%3d]', dataset,
                md_dataset['filename'].nunique(), dataset_i, dataset_total)
    spectra_per_file.extend(
        joblib.Parallel(n_jobs=-1, backend='multiprocessing')(
            joblib.delayed(_get_spectra_from_file)(dataset, filename,
                                                   md_file['scan'])
            for filename, md_file in md_dataset.groupby(
                'filename', sort=False)))
# Merge the per-file dictionaries into one flat dictionary so that each
# spectrum lookup is a single hash access instead of a scan over one mapping
# per peak file. Iterating in reverse lets earlier files take precedence
# should a key ever occur twice.
spectra = {}
for file_spectra in reversed(spectra_per_file):
    spectra.update(file_spectra)

In [None]:
@nb.njit(parallel=True)
def dot(spectra_arr1, spectra_arr2, out, fragment_mz_tol):
    """
    Score corresponding spectra of two arrays with `spectrum.dot`.

    Pair `i` is formed by `spectra_arr1[i]` and `spectra_arr2[i]`. The cell
    that calls this function stores the m/z values at index 0 and the
    intensities at index 1 of the second axis.

    Parameters
    ----------
    spectra_arr1, spectra_arr2
        Arrays holding the first and second spectrum of every pair.
    out
        Preallocated array with one element per pair; it is filled in place.
    fragment_mz_tol
        Fragment m/z tolerance forwarded to `spectrum.dot`.

    Returns
    -------
    The `out` array with the score of every pair.
    """
    n_pairs = spectra_arr1.shape[0]
    # Every pair is independent, so the loop is distributed over threads.
    for pair_idx in nb.prange(n_pairs):
        mz1 = spectra_arr1[pair_idx, 0]
        intensity1 = spectra_arr1[pair_idx, 1]
        mz2 = spectra_arr2[pair_idx, 0]
        intensity2 = spectra_arr2[pair_idx, 1]
        out[pair_idx] = spectrum.dot(mz1, intensity1, mz2, intensity2,
                                     fragment_mz_tol)
    return out

In [None]:
# Fragment m/z tolerances used for the high and low resolution dot product.
fragment_mz_tol_high_res, fragment_mz_tol_low_res = 0.05, 0.8

# Collect, for all positive pairs followed by all negative pairs, the peaks
# of both spectra of the pair. `spectra_arr[0]` receives the first spectrum
# of each pair and `spectra_arr[1]` the second one.
spectra_arr, index_dot = ([], []), []
for idx1, idx2 in itertools.chain(pair_generator.pairs_pos,
                                  pair_generator.pairs_neg):
    index_dot.append([idx1, idx2])
    for arr_i, spec_idx in enumerate((idx1, idx2)):
        spec_key = (f"{pair_metadata.at[spec_idx, 'dataset']}/"
                    f"{pair_metadata.at[spec_idx, 'filename']}/"
                    f"{pair_metadata.at[spec_idx, 'scan']}")
        spec = spectra[spec_key]
        # Zero-pad on the left so that every spectrum occupies exactly
        # `config.max_peaks_used` columns and the list can be stacked.
        num_padding = config.max_peaks_used - len(spec.mz)
        spectra_arr[arr_i].append(
            np.pad([spec.mz, spec.intensity],
                   ((0, 0), (num_padding, 0)), 'constant'))
index_dot = np.asarray(index_dot)

# Ground truth in the same order as the pairs: positives first.
labels = np.hstack((np.ones(len(pair_generator.pairs_pos), np.uint8),
                    np.zeros(len(pair_generator.pairs_neg), np.uint8)))

spectra_arr1, spectra_arr2 = (np.asarray(spectra_arr[0]),
                              np.asarray(spectra_arr[1]))
# Score all pairs once per tolerance, each time into a fresh output buffer.
dot_high_res, dot_low_res = (
    dot(spectra_arr1, spectra_arr2,
        np.zeros(len(spectra_arr1), np.float32), fragment_mz_tol)
    for fragment_mz_tol in (fragment_mz_tol_high_res,
                            fragment_mz_tol_low_res))

In [None]:
# Set up the embedder with the feature dimensions, learning rate, and model
# file name taken from the GLEAMS configuration.
# NOTE(review): `load()` presumably restores the trained model from
# `config.model_filename` -- confirm against `embedder.Embedder`.
emb = embedder.Embedder(
    config.num_precursor_features, config.num_fragment_features,
    config.num_ref_spectra, config.lr, config.model_filename)
emb.load()

In [None]:
# Run every batch of the pair generator through the siamese model and keep
# the flattened predictions together with the labels the generator supplies.
labels_embed, scores_embed = [], []
num_batches = len(pair_generator)
for batch_i in tqdm.tqdm(range(num_batches), desc='Batches embedded'):
    batch_features, batch_labels = pair_generator[batch_i]
    labels_embed.extend(batch_labels)
    batch_scores = emb.siamese_model.predict(batch_features)
    scores_embed.extend(batch_scores.reshape(-1))
labels_embed, scores_embed = (np.asarray(labels_embed),
                              np.asarray(scores_embed))

In [None]:
# Rebuild which pair each embedding score belongs to. Each batch contributes
# the same slice of the positive pairs followed by that slice of the
# negative pairs, each slice covering half a batch.
# NOTE(review): this must mirror how `PairSequence` assembles its batches --
# confirm against its `__getitem__`.
index_embed = []
for batch_i in range(len(pair_generator)):
    batch_start = batch_i * pair_generator.batch_size // 2
    batch_stop = (batch_i + 1) * pair_generator.batch_size // 2
    for pairs in (pair_generator.pairs_pos, pair_generator.pairs_neg):
        index_embed.extend(pairs[batch_start:batch_stop])
index_embed = np.asarray(index_embed)

In [None]:
# Combine the dot product scores and the embedding scores of the pairs that
# occur in both result sets. Both frames are indexed by the two spectrum
# indices of a pair and joined on that index (inner join), after which the
# index levels become the `pair_0` and `pair_1` columns.
dot_embed = (
    pd.DataFrame({'dot_low_res': dot_low_res,
                  'dot_high_res': dot_high_res})
    .set_index([index_dot[:, 0], index_dot[:, 1]])
    .join(
        pd.DataFrame({'scores_embed': scores_embed, 'label': labels_embed})
        .set_index([index_embed[:, 0], index_embed[:, 1]]),
        how='inner')
    .reset_index()
    .rename(columns={'level_0': 'pair_0', 'level_1': 'pair_1'}))
# Annotate every pair with the precursor charge and m/z of its first
# spectrum. The index is reset so the values align by row position.
dot_embed[['charge', 'mz']] = (
    pair_metadata[['charge', 'mz']]
    .loc[dot_embed['pair_0']]
    .reset_index(drop=True))

In [None]:
# Store the merged scores (without the row index) in the working directory.
dot_embed.to_parquet('aucroc_dot.parquet', index=False)

In [None]:
# Uncomment to reload previously saved scores instead of recomputing them.
# dot_embed = pd.read_parquet('aucroc_dot.parquet')

### Dot product versus embedded Euclidean distance

In [None]:
width = 7

# Scatter plot of the high resolution dot product against the embedding
# score for every pair, coloured by pair label, with marginal distributions.
jg = sns.jointplot(
    x='dot_high_res', y='scores_embed', hue='label', data=dot_embed,
    height=width, palette='Set1', joint_kws={'alpha': 0.1},
    marker='.', s=1, rasterized=True)

# Replace the numeric label values in the legend by readable names.
legend_handles = jg.ax_joint.get_legend_handles_labels()[0]
jg.ax_joint.legend(legend_handles, ['Negative', 'Positive'],
                   title='Pair type')
jg.set_axis_labels('Spectrum dot product', 'Embedded euclidean distance')

plt.savefig('dot_vs_dist.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

In [None]:
height = 7

# Only pairs whose precursor charge is at most 4 are shown.
dot_embed_charge = dot_embed[dot_embed['charge'] <= 4]

jg = sns.JointGrid(height=height)
# Central panel: one point per pair, coloured by precursor charge.
sns.scatterplot(x='dot_high_res', y='scores_embed', hue='charge',
                data=dot_embed_charge, palette='Set1', marker='.', s=1,
                alpha=0.1, rasterized=True, ax=jg.ax_joint)
# Marginal panels: per-charge densities, each normalized independently.
for density_axis, margin_ax in (({'x': 'dot_high_res'}, jg.ax_marg_x),
                                ({'y': 'scores_embed'}, jg.ax_marg_y)):
    sns.kdeplot(data=dot_embed_charge, hue='charge', palette='Set1',
                legend=False, common_norm=False, fill=True, ax=margin_ax,
                **density_axis)

jg.ax_joint.legend(loc='upper right', title='Precursor charge')
jg.set_axis_labels('Spectrum dot product', 'Embedded euclidean distance')

plt.savefig('dot_vs_dist_charge.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

In [None]:
height = 7

jg = sns.JointGrid(height=height)
# Central panel: one point per pair, coloured by precursor m/z. The colormap
# is given by name because `plt.cm.get_cmap` was deprecated in Matplotlib 3.7
# and removed in a later release.
sns.scatterplot(data=dot_embed, x='dot_high_res', y='scores_embed',
                alpha=0.1, s=1, c=dot_embed['mz'], marker='.',
                cmap='YlGnBu', rasterized=True, ax=jg.ax_joint)
# Marginal panels: densities over all pairs.
sns.kdeplot(data=dot_embed, x='dot_high_res', color='black', legend=False,
            common_norm=False, fill=True, ax=jg.ax_marg_x)
sns.kdeplot(data=dot_embed, y='scores_embed', color='black', legend=False,
            common_norm=False, fill=True, ax=jg.ax_marg_y)

ax_joint_pos = jg.ax_joint.get_position()
# `add_axes` takes [left, bottom, width, height] in figure coordinates, so
# the bottom edge of the colorbar is derived from the joint axes' `y0`
# (the vertical origin), inset slightly at the top and bottom.
cbar_ax = jg.fig.add_axes([1.025, ax_joint_pos.y0 + 0.05,
                           0.025, ax_joint_pos.height - 0.1])
# The scatter points are the only collection on the joint axes and serve as
# the mappable for the colorbar.
colorbar = jg.fig.colorbar(jg.ax_joint.collections[0], cax=cbar_ax)
# Draw the colorbar opaque although the points themselves are transparent.
colorbar.solids.set(alpha=1)
colorbar.set_label('Precursor m/z', size='large', labelpad=15)

jg.set_axis_labels('Spectrum dot product', 'Embedded euclidean distance')

plt.savefig('dot_vs_dist_mz.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

### AU(C)ROC plots

In [None]:
def concentrate_fpr(fpr, alpha):
    """
    Apply the exponential magnification used for concentrated ROC curves.

    The false positive rate is transformed as
    `(1 - exp(-alpha * fpr)) / (1 - exp(-alpha))`, which maps 0 to 0 and 1
    to 1 while stretching the region of small false positive rates.

    Parameters
    ----------
    fpr : float or np.ndarray
        False positive rate(s) to transform.
    alpha : float
        Magnification factor; must be non-zero.

    Returns
    -------
    float or np.ndarray
        The concentrated false positive rate(s), same shape as `fpr`.
    """
    magnified = 1 - np.exp(-alpha * fpr)
    normalizer = 1 - np.exp(-alpha)
    return magnified / normalizer

In [None]:
# Magnification factor handed to `concentrate_fpr` for the CROC curves below;
# larger values stretch the region of small false positive rates further.
alpha = 14

In [None]:
width = 7
# height = width / 1.618
fig, ax = plt.subplots(figsize=(width, width))

# ROC curves of the three scores, each with its concentrated FPR axis. The
# plain `fpr_*`/`tpr_*` values are reused by the AUROC cell that follows.
fpr_high_res, tpr_high_res, _ = roc_curve(dot_embed['label'],
                                          dot_embed['dot_high_res'])
croc_fpr_high_res = concentrate_fpr(fpr_high_res, alpha)

fpr_low_res, tpr_low_res, _ = roc_curve(dot_embed['label'],
                                        dot_embed['dot_low_res'])
croc_fpr_low_res = concentrate_fpr(fpr_low_res, alpha)

# The embedding score is a distance (smaller means more similar), so it is
# rescaled by its maximum and flipped before it is used as a ranking score.
fpr_embed, tpr_embed, _ = roc_curve(
    dot_embed['label'],
    1 - dot_embed['scores_embed'] / dot_embed['scores_embed'].max())
croc_fpr_embed = concentrate_fpr(fpr_embed, alpha)

for curve_name, croc_fpr, tpr in (
        ('Dot product high res', croc_fpr_high_res, tpr_high_res),
        ('Dot product low res', croc_fpr_low_res, tpr_low_res),
        ('Embedding', croc_fpr_embed, tpr_embed)):
    ax.plot(croc_fpr, tpr,
            label=f'{curve_name} (AUCROC = {auc(croc_fpr, tpr):.2%})')

# Reference curve of a random classifier under the same FPR transformation.
fpr_random = np.arange(0, 1.01, 0.01)
ax.plot(concentrate_fpr(fpr_random, alpha), fpr_random,
        color='black', linestyle='--')

ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])

ax.set_xlabel('Concentrated false positive rate')
ax.set_ylabel('True positive rate')

ax.legend(loc='lower center', bbox_to_anchor=(0.5, -0.3))

sns.despine()

plt.savefig('aucroc_dot.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

In [None]:
width = 7
# height = width / 1.618
fig, ax = plt.subplots(figsize=(width, width))

# Plain ROC curves; the FPR/TPR values were computed in the CROC cell above.
for curve_name, fpr, tpr in (
        ('Dot product high res', fpr_high_res, tpr_high_res),
        ('Dot product low res', fpr_low_res, tpr_low_res),
        ('Embedding', fpr_embed, tpr_embed)):
    ax.plot(fpr, tpr,
            label=f'{curve_name} (AUROC = {auc(fpr, tpr):.2%})')

# Diagonal of a random classifier.
fpr_random = np.arange(0, 1.01, 0.01)
ax.plot(fpr_random, np.arange(0, 1.01, 0.01),
        color='black', linestyle='--')

ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])

ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')

ax.legend(loc='lower center', bbox_to_anchor=(0.5, -0.3))

sns.despine()

plt.savefig('auroc_dot.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

In [None]:
# Flush and close all logging handlers now that the notebook has finished.
logging.shutdown()