In [None]:
import os
import sys
# Root of the local GLEAMS checkout. HOME is preferred when it is defined (as
# before); otherwise fall back to the platform's notion of the user directory
# so the notebook does not fail with a KeyError where HOME is not set.
os.environ['GLEAMS_HOME'] = os.path.join(
    os.environ.get('HOME', os.path.expanduser('~')), 'Projects', 'gleams')
# Make sure all code is in the PATH.
gleams_src = os.path.normpath(os.path.join(os.environ['GLEAMS_HOME'], 'src'))
# Guard the append so that re-running this cell does not keep growing sys.path.
if gleams_src not in sys.path:
    sys.path.append(gleams_src)

In [None]:
import joblib
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import scipy.sparse as ss
import seaborn as sns

In [None]:
import warnings
# Suppress FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter('ignore', FutureWarning)

In [None]:
# Initialize logging.
# The imports in this cell are interleaved with initialization calls on
# purpose; keep the statement order as is.
from gleams import logger as glogger
glogger.init()
# Initialize all random seeds before importing any packages.
# NOTE(review): numpy, pandas, etc. were already imported in an earlier cell,
# so this ordering only holds for the gleams imports that follow.
from gleams import rndm
rndm.set_seeds()

# `config` provides `massivekb_task_id`, which is interpolated into the data
# file names used throughout this notebook.
from gleams import config
# `cluster` provides `compute_pairwise_distances`.
from gleams.cluster import cluster
# Private helper (leading underscore) applied to peptide sequences further
# down; presumably strips modification annotations -- confirm against
# gleams.metadata.metadata.
from gleams.metadata.metadata import _remove_mod

In [None]:
import logging
# Handle on the 'gleams' logger, switched to DEBUG verbosity for this notebook.
logger = logging.getLogger(name='gleams')
logger.setLevel(level=logging.DEBUG)

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases removed the old names. Use the legacy name when it is
# still available and fall back to the renamed style sheet otherwise, so the
# notebook runs on both old and current Matplotlib versions.
plt.style.use([
    style if style in plt.style.available
    else style.replace('seaborn-', 'seaborn-v0_8-', 1)
    for style in ('seaborn-white', 'seaborn-paper')])
plt.rc('font', family='serif')
# Fixed color cycle used for all lines in the figure below.
sns.set_palette(['#9e0059', '#6da7de', '#ee266d', '#dee000', '#eb861e'])
sns.set_context('paper', font_scale=1.3)    # Single-column figure.

In [None]:
# Name of the data split to analyze; it is interpolated into the embedding,
# metadata, and distance file names used in the cells below.
split = 'test'

In [None]:
# The embedding array and its accompanying parquet file live side by side in
# the embedding directory and share the same base name.
embed_dir = os.path.join(os.environ['GLEAMS_HOME'], 'data', 'embed')
embed_basename = f'embed_{config.massivekb_task_id}_{split}'
cluster.compute_pairwise_distances(
    os.path.join(embed_dir, f'{embed_basename}.npy'),
    os.path.join(embed_dir, f'{embed_basename}.parquet'))

In [None]:
# Columns that together identify a spectrum; used both to select from the
# embedding metadata and as the merge key.
spectrum_key = ['dataset', 'filename', 'scan']
data_dir = os.path.join(os.environ['GLEAMS_HOME'], 'data')
spectra_path = os.path.join(
    data_dir, 'cluster', f'embed_{config.massivekb_task_id}_{split}.parquet')
ids_path = os.path.join(
    data_dir, 'metadata', f'massivekb_ids_{config.massivekb_task_id}.parquet')
# Left-join the identifications onto the embedded spectra and keep only the
# rows for which a peptide sequence is available.
metadata = (
    pd.merge(pd.read_parquet(spectra_path)[spectrum_key],
             pd.read_parquet(ids_path),
             how='left', on=spectrum_key, copy=False)
    .dropna(subset=['sequence']))
# Don't disambiguate between I/L.
metadata['sequence'] = metadata['sequence'].str.replace('I', 'L')

In [None]:
# Cap the analysis at ten million identified spectra, drawn at random without
# replacement from the index labels that survived the filtering above.
max_samples = 10_000_000
num_samples = min(max_samples, len(metadata))
idx_sample = np.random.choice(metadata.index, size=num_samples, replace=False)
metadata = metadata.loc[idx_sample]

In [None]:
dist_path = os.path.join(os.environ['GLEAMS_HOME'], 'data', 'cluster',
                         f'dist_{config.massivekb_task_id}_{split}.npz')
# Restrict the sparse distance matrix to the sampled spectra, rows first and
# then columns, in the same order as `metadata`.
sampled = metadata.index
pairwise_distances = ss.load_npz(dist_path)[sampled][:, sampled]
logger.info('Using %d non-zero pairwise distances between %d randomly '
            'selected embeddings', pairwise_distances.count_nonzero(),
            len(metadata))

In [None]:
logger.info('Verify whether neighbors have the same peptide label')
# Coordinates and values of all stored (non-zero) pairwise distances; the
# coordinates are positions within the sampled sub-matrix.
rows, columns, dist = ss.find(pairwise_distances)
# Peptide label = sequence plus charge, re-indexed positionally so that it
# lines up with the sub-matrix coordinates.
sequences = (metadata['sequence'] + '/' + metadata['charge'].astype(str))
sequences = sequences.reset_index(drop=True)
same_label = (sequences.iloc[rows].to_numpy() ==
              sequences.iloc[columns].to_numpy())
# Sort the pairs by increasing distance and compute, for each distance, the
# running fraction of pairs so far that share the same label.
order = np.argsort(dist)
dist = np.asarray(dist)[order]
same_label = np.asarray(same_label)[order]
num_pairs_so_far = np.arange(1, len(same_label) + 1)
prop_same_label = np.cumsum(same_label) / num_pairs_so_far

In [None]:
# Same comparison as for the original labels, but on the sequences after
# `_remove_mod` has been applied (charge is still part of the label).
stripped_sequences = metadata['sequence'].apply(_remove_mod)
sequences_no_mod = ((stripped_sequences + '/' + metadata['charge'].astype(str))
                    .reset_index(drop=True))
same_label_no_mod = (sequences_no_mod.iloc[rows].to_numpy() ==
                     sequences_no_mod.iloc[columns].to_numpy())
# Reuse the distance ordering computed earlier so all curves share one x-axis.
same_label_no_mod = np.asarray(same_label_no_mod)[order]
num_pairs_no_mod = np.arange(1, len(same_label_no_mod) + 1)
prop_same_label_no_mod = np.cumsum(same_label_no_mod) / num_pairs_no_mod

In [None]:
# Alphabetically sorted character pairs that are tolerated as substitutions
# when comparing two labels position by position.
tolerated_pairs = ('LN', 'DN', 'KQ', 'EK')
# Two labels match when they have the same length and every differing
# position is one of the tolerated pairs. The labels include the '/charge'
# suffix, so a differing charge never matches. Identical labels have no
# differing positions and therefore match.
same_label_isobar = [
    len(label1) == len(label2) and all(
        ''.join(sorted((char1, char2))) in tolerated_pairs
        for char1, char2 in zip(label1, label2) if char1 != char2)
    for label1, label2 in zip(sequences_no_mod.loc[rows],
                              sequences_no_mod.loc[columns])]
# Reuse the distance ordering computed earlier so all curves share one x-axis.
same_label_isobar = np.asarray(same_label_isobar)[order]
num_pairs_isobar = np.arange(1, len(same_label_isobar) + 1)
prop_same_label_isobar = np.cumsum(same_label_isobar) / num_pairs_isobar

In [None]:
# Persist the sorted distances and the three cumulative proportion curves to
# the working directory so the figure can be redrawn without recomputation.
joblib.dump(
    value=[dist, prop_same_label, prop_same_label_no_mod,
           prop_same_label_isobar],
    filename='nn_dist.joblib')

In [None]:
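# Uncomment to reload the previously saved results from 'nn_dist.joblib'
# instead of recomputing them:
# dist, prop_same_label, prop_same_label_no_mod, prop_same_label_isobar = \
#    joblib.load('nn_dist.joblib')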

In [None]:
# Figure size: 7 inches wide with a golden-ratio aspect.
width = 7
height = width / 1.618    # golden ratio
fig, ax = plt.subplots(figsize=(width, height))

# Only the pairs closer than `max_dist` are drawn.
max_dist = 0.2
mask = dist < max_dist
for proportions, curve_label in (
        (prop_same_label, 'Original'),
        (prop_same_label_no_mod, 'Unmodified'),
        (prop_same_label_isobar, 'Near-isobaric')):
    ax.plot(dist[mask], proportions[mask], label=curve_label)

ax.set(xlim=(0, max_dist), ylim=(0.95, 1),
       xlabel='Embedded distance', ylabel='Proportion same peptide')

# Show the y-axis proportions as whole percentages.
ax.yaxis.set_major_formatter(mticker.PercentFormatter(xmax=1, decimals=0))

ax.legend(loc='lower left')

sns.despine()

fig.savefig('nn_dist.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close(fig)

In [None]:
# Flush and close all logging handlers now that the analysis is finished.
logging.shutdown()