In [None]:
import os
import sys
# Point GLEAMS_HOME at the project checkout inside the home directory.
gleams_home = os.path.join(os.environ['HOME'], 'Projects/gleams')
os.environ['GLEAMS_HOME'] = gleams_home
# Make sure all code is in the PATH.
gleams_src_dir = os.path.normpath(os.path.join(gleams_home, 'src'))
sys.path.append(gleams_src_dir)

In [None]:
import warnings
# Suppress all FutureWarning messages so they do not clutter the cell output.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
import shutil
import tempfile
from typing import List

import joblib
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import pyteomics
import pyteomics.mgf
import seaborn as sns
import tqdm.notebook as tqdm

from evaluate_clusters import evaluate_clusters

In [None]:
# Initialize logging.
from gleams import logger as glogger
glogger.init()
# Initialize all random seeds before importing any packages.
from gleams import rndm
rndm.set_seeds()

# The remaining GLEAMS modules are imported only after logging and the random
# seeds have been set up above; keep this order when editing the cell.
from gleams import config
from gleams.cluster import cluster
from gleams.metadata.metadata import _remove_mod
from gleams.ms_io import ms_io

In [None]:
import logging
# Use the 'gleams' logger in this notebook and emit messages from DEBUG level
# upwards.
logger = logging.getLogger('gleams')
logger.setLevel(logging.DEBUG)

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-<name>' and later releases dropped the old names, so use
# whichever spelling the installed Matplotlib version provides.
plt.style.use([style if style in plt.style.available
               else style.replace('seaborn-', 'seaborn-v0_8-', 1)
               for style in ('seaborn-white', 'seaborn-paper')])
plt.rc('font', family='serif')
sns.set_palette(['#9e0059', '#6da7de', '#ee266d', '#dee000', '#eb861e'])
sns.set_context('paper', font_scale=1.3)    # Single-column figure.

## Clustering

Cluster the spectra (using the previously determined optimal clustering hyperparameters).

In [None]:
# Directories with the spectrum embeddings and the clustering results.
embed_dir = os.path.join(os.environ['GLEAMS_HOME'], 'data', 'embed')
cluster_dir = os.path.join(os.environ['GLEAMS_HOME'], 'data', 'cluster')

# Delete the cluster assignments of an earlier run, if there are any
# (presumably so that the clustering below is recomputed — verify against
# `cluster.cluster`).
filename_clusters = os.path.join(
    cluster_dir, f'clusters_{config.massivekb_task_id}.npy')
try:
    os.remove(filename_clusters)
except FileNotFoundError:
    pass

# Compute the pairwise distances between the embeddings and cluster them.
cluster.compute_pairwise_distances(
    os.path.join(embed_dir, f'embed_{config.massivekb_task_id}.npy'),
    os.path.join(embed_dir, f'embed_{config.massivekb_task_id}.parquet'),
    config.charges)
cluster.cluster(
    os.path.join(cluster_dir, f'dist_{config.massivekb_task_id}.npz'),
    os.path.join(cluster_dir, f'embed_{config.massivekb_task_id}.parquet'))

In [None]:
# Columns that together identify a single spectrum.
spectrum_key = ['dataset', 'filename', 'scan']

# Metadata of the clustered spectra.
spectra_metadata = pd.read_parquet(
    os.path.join(os.environ['GLEAMS_HOME'], 'data', 'cluster',
                 f'embed_{config.massivekb_task_id}.parquet'))
spectra_metadata = spectra_metadata[spectrum_key + ['charge', 'mz']]
# MassIVE-KB peptide identifications.
massivekb_ids = pd.read_parquet(
    os.path.join(os.environ['GLEAMS_HOME'], 'data', 'metadata',
                 f'massivekb_ids_{config.massivekb_task_id}.parquet'))
massivekb_ids = massivekb_ids[spectrum_key + ['sequence']]

# Keep every clustered spectrum; spectra without an identification get a
# missing sequence.
clusters = pd.merge(spectra_metadata, massivekb_ids,
                    how='left', on=spectrum_key)
# Don't disambiguate between I/L.
clusters['sequence'] = clusters['sequence'].str.replace('I', 'L')
# Attach the cluster label of each spectrum.
filename_cluster_labels = os.path.join(
    os.environ['GLEAMS_HOME'], 'data', 'cluster',
    f'clusters_{config.massivekb_task_id}.npy')
clusters['cluster'] = np.load(filename_cluster_labels)

In [None]:
# Number of spectra that have a (non-missing) peptide sequence.
num_id_massive = clusters['sequence'].count()
frac_id_massive = num_id_massive / len(clusters)
print(f'Number of initial spectrum identifications: {num_id_massive:,} '
      f'({frac_id_massive:.2%})')

In [None]:
# Clustering performance (excluding modifications).
# Each spectrum is labeled as "<sequence without modifications>/<charge>";
# spectra without an identification keep a missing label.
# NOTE(review): `min_cluster_size` is not defined anywhere in the visible
# notebook, so on a fresh kernel this cell raises a NameError unless the name
# is provided elsewhere — confirm where its value should come from.
num_clustered, num_noise, \
    prop_clustered, prop_clustered_incorrect, \
    homogeneity, completeness = evaluate_clusters(
        pd.DataFrame(
            {'sequence': (clusters['sequence'].apply(
                              lambda seq: (_remove_mod(seq)
                                           if pd.notnull(seq)
                                           else np.nan))
                          + '/' + clusters['charge'].astype(str)),
                      'cluster': clusters['cluster']}),
        min_cluster_size)
# Replace the `num_clustered` value returned above by the number of spectra
# whose cluster label is not -1 (presumably the noise label — verify).
num_clustered = len(clusters[clusters["cluster"] != -1])
print(f'Number of clustered spectra: {num_clustered:,} / {len(clusters):,} '
      f'({num_clustered / len(clusters):.2%})')
print(f'Incorrectly clustered spectra: {prop_clustered_incorrect:.2%}')
print(f'Clustering homogeneity: {homogeneity:.3f}')
print(f'Clustering completeness: {completeness:.3f}')

## Spectrum export

Export the medoid spectra of clusters that consist entirely of unidentified spectra to MGF files used for reidentification. Spectra clustered as noise are not exported.

In [None]:
def _get_usis(df):
    return ('mzspec:' + df['dataset'] + ':' +
            df['filename'].apply(
                lambda fn: os.path.splitext(os.path.basename(fn))[0]) +
            ':scan:' + df['scan'].astype(str))

In [None]:
cluster_data_dir = os.path.join(os.environ['GLEAMS_HOME'], 'data', 'cluster')
filename_medoids = os.path.join(
    cluster_data_dir, f'clusters_{config.massivekb_task_id}_medoids.npy')
# Reuse the cached medoid indexes when available, compute and cache them
# otherwise.
if os.path.isfile(filename_medoids):
    cluster_i_medoid = np.load(filename_medoids)
else:
    cluster_i_medoid = cluster.get_cluster_medoids(
        os.path.join(cluster_data_dir,
                     f'clusters_{config.massivekb_task_id}.npy'),
        os.path.join(cluster_data_dir,
                     f'dist_{config.massivekb_task_id}.npz'))
    np.save(filename_medoids, cluster_i_medoid)

# Labels of the clusters in which not a single spectrum has a sequence.
cluster_is_non_ident = clusters.groupby('cluster')['sequence'].apply(
    lambda sequences: all(pd.isnull(sequences)))
cluster_i_non_ident = cluster_is_non_ident[cluster_is_non_ident].index

# Medoid spectra (selected by row position) of those unidentified clusters,
# annotated with the number of spectra in their cluster.
cluster_medoids = clusters.iloc[cluster_i_medoid]
medoid_is_non_ident = cluster_medoids['cluster'].isin(cluster_i_non_ident)
cluster_sizes = clusters['cluster'].value_counts().rename('cluster_size')
cluster_medoids_non_ident = pd.merge(
    cluster_medoids[medoid_is_non_ident], cluster_sizes,
    how='left', left_on='cluster', right_index=True)

# Sorted scan numbers to extract from each peak file.
scans_per_file = (cluster_medoids_non_ident
                  .groupby(['dataset', 'filename'])['scan'])
dataset_filename_scans = scans_per_file.apply(sorted).reset_index()

In [None]:
# Spectra that belong to one of the fully unidentified clusters.
in_non_ident_cluster = clusters['cluster'].isin(
    cluster_medoids_non_ident['cluster'])
clusters_non_ident = int(in_non_ident_cluster.sum())
print(f'Number of clustered, unidentified spectra: {clusters_non_ident:,} '
      f'({(clusters_non_ident / len(clusters)):.2%})')
# Total number of medoid spectra that will be exported.
num_export = dataset_filename_scans['scan'].apply(len).sum()
num_medoids_non_ident = len(cluster_medoids_non_ident)
print(f'Number of unidentified cluster medoids: '
      f'{num_medoids_non_ident:,} '
      f'({(num_medoids_non_ident / len(clusters)):.2%})')

In [None]:
# Output directory (relative to the working directory) for the exported MGF
# files; created if it does not exist yet.
reident_peak_dir = os.path.join('cluster_ident', 'peak')
os.makedirs(reident_peak_dir, exist_ok=True)

In [None]:
def get_spectra_from_file(dataset: str, filename: str, scans: List[int]):
    """
    Yield the spectra with the given scan numbers from a peak file.

    The peak file is first copied to a private temporary directory and read
    from there (presumably to read from fast local storage — confirm). The
    temporary copy is removed when the generator finishes, is closed, or
    raises.

    Parameters
    ----------
    dataset : str
        Name of the dataset directory below `$GLEAMS_HOME/data/peak`.
    filename : str
        Name of the peak file within the dataset directory.
    scans : List[int]
        Scan numbers of the spectra to extract.

    Yields
    ------
    Tuple[str, dict]
        The identifier 'mzspec:<dataset>:<file base name>:scan:<scan>' of the
        spectrum and a dictionary with the keys 'm/z array',
        'intensity array', and 'params' describing the spectrum.
    """
    scans_remaining = set(scans)
    if not scans_remaining:
        # Nothing requested: don't copy and parse the peak file for nothing.
        return
    filename_base = os.path.splitext(os.path.basename(filename))[0]
    filename_orig = os.path.join(os.environ['GLEAMS_HOME'], 'data', 'peak',
                                 dataset, filename)
    # A private temporary directory avoids clashes between identically named
    # files and guarantees cleanup; `shutil.copy` (unlike a shell `cp`) also
    # copes with spaces and other special characters in the path.
    with tempfile.TemporaryDirectory() as tmp_dir:
        filename_tmp = shutil.copy(filename_orig, tmp_dir)
        for spec in ms_io.get_spectra(filename_tmp):
            scan = int(spec.identifier)
            if scan not in scans_remaining:
                continue
            usi = f'mzspec:{dataset}:{filename_base}:scan:{spec.identifier}'
            yield (usi, {'m/z array': spec.mz,
                         'intensity array': spec.intensity,
                         'params': {'TITLE': usi,
                                    'RTINSECONDS': spec.retention_time,
                                    'PEPMASS': spec.precursor_mz,
                                    'CHARGE': f'{spec.precursor_charge}+'}})
            scans_remaining.remove(scan)
            # Stop reading as soon as all requested spectra have been found.
            if not scans_remaining:
                break

In [None]:
logger.info('Read and export cluster medoid spectra to MGF files in '
            'directory %s', reident_peak_dir)
# Read all medoid spectra, keyed by their USI.
spectra_dicts = {}
with tqdm.tqdm(total=sum(dataset_filename_scans['scan'].apply(len)),
               unit='spectra') as pbar:
    for dataset, filename, scans in zip(dataset_filename_scans['dataset'],
                                        dataset_filename_scans['filename'],
                                        dataset_filename_scans['scan']):
        for usi, spec in get_spectra_from_file(dataset, filename, scans):
            spectra_dicts[usi] = spec
            pbar.update(1)
# Export the medoids of clusters of size 2 and of clusters of size larger than
# 2 to separate MGF files.
medoid_cluster_sizes = cluster_medoids_non_ident['cluster_size']
for filename_mgf, medoids in (
        ('cluster_ident_2.mgf',
         cluster_medoids_non_ident[medoid_cluster_sizes == 2]),
        ('cluster_ident_n.mgf',
         cluster_medoids_non_ident[medoid_cluster_sizes > 2])):
    # `file_mode='w'` is passed explicitly so that re-running this cell
    # replaces a previously exported file instead of possibly appending to it,
    # whatever the default of the installed Pyteomics version is.
    pyteomics.mgf.write(
        [spectra_dicts[usi] for usi in _get_usis(medoids)],
        os.path.join(reident_peak_dir, filename_mgf),
        use_numpy=True, file_mode='w')

## Incorporate ANN-SoLo identifications

Include the identifications from ANN-SoLo processing of the previously exported unidentified spectra.

In [None]:
def read_mztab_psms(filename: str) -> pd.DataFrame:
    """
    Read PSMs from the given mzTab file.

    All lines before the PSM header line (the first line whose first
    tab-separated field is 'PSH') are skipped.

    Parameters
    ----------
    filename: str
        The mzTab file name from which to read the PSMs.

    Returns
    -------
    pd.DataFrame
        A data frame containing the PSM information from the mzTab file,
        indexed by 'PSM_ID' and without the leading 'PSH' line-prefix column.

    Raises
    ------
    ValueError
        If the file does not contain a PSM header ('PSH') line.
    """
    # Find the (physical) line number of the PSM header.
    header_line = None
    with open(filename) as f_in:
        for line_i, line in enumerate(f_in):
            if line.split('\t', 1)[0] == 'PSH':
                header_line = line_i
                break
    if header_line is None:
        raise ValueError(f'No PSM header (PSH) line found in mzTab file '
                         f'{filename}')
    # `skiprows` counts physical lines, whereas `header` ignores blank lines.
    # Skipping up to the header therefore stays correct when the preceding
    # sections contain empty lines.
    return pd.read_csv(filename, sep='\t', skiprows=header_line,
                       index_col='PSM_ID').drop(columns='PSH')

In [None]:
# Directory with the ANN-SoLo identification results (mzTab files).
reident_id_dir = os.path.join('cluster_ident', 'id')
mztab_filenames = [os.path.join(reident_id_dir, filename)
                   for filename in os.listdir(reident_id_dir)
                   if os.path.splitext(filename)[1] == '.mztab']
# Combine the PSMs from all result files into a single data frame.
psms = pd.concat([read_mztab_psms(mztab_filename)
                  for mztab_filename in mztab_filenames])
# Don't disambiguate between I/L.
psms['sequence'] = psms['sequence'].str.replace('I', 'L')

In [None]:
# Number of ANN-SoLo PSMs, relative to the number of exported medoid spectra.
num_id_ann_solo = len(psms)
frac_id_ann_solo = num_id_ann_solo / num_export
print(f'Number of ANN-SoLo spectrum identifications: {num_id_ann_solo:,} '
      f'({frac_id_ann_solo:.2%} of previously unidentified '
      f'cluster medoid spectra)')

In [None]:
# Index the clustered spectra by their USI so they can be matched to the PSMs.
spectrum_usis = _get_usis(clusters)
clusters = clusters.set_index(spectrum_usis)
# Assign the ANN-SoLo sequences to the corresponding spectra and flag them.
ann_solo_usis = psms.index
clusters.loc[ann_solo_usis, 'sequence'] = psms['sequence']
clusters['ann_solo'] = clusters.index.isin(ann_solo_usis)

### Mass difference histogram

In [None]:
def get_mass_groups(psms, tol_mass, tol_mode, min_group_size=None):
    """
    Greedily group PSMs by their precursor mass difference.

    The PSMs are processed in order of decreasing 'search_engine_score[1]'.
    The best remaining PSM seeds a new group, which collects all remaining
    PSMs whose mass difference lies within the tolerance of the seed's mass
    difference. This is repeated until no PSMs remain. PSMs for which no mass
    difference can be computed (missing values) are ignored.

    Parameters
    ----------
    psms : pd.DataFrame
        PSMs with columns 'search_engine_score[1]', 'exp_mass_to_charge',
        'calc_mass_to_charge', and 'charge'. The data frame is not modified.
    tol_mass : float or None
        Width of the mass difference window, in units of `tol_mode`.
    tol_mode : str
        'Da' or 'ppm'. In 'ppm' mode the deviation from the seed's mass
        difference is taken relative to each PSM's experimental m/z.
    min_group_size : int or None
        Only checked for being None: if `tol_mass` or `min_group_size` is None
        or `tol_mode` is not 'Da' or 'ppm', all PSMs form a single group. The
        value itself is not used otherwise.

    Returns
    -------
    pd.DataFrame
        One row per mass group, in order of creation, with columns
        'mass_diff_median', 'mass_diff_mean', and 'num_psms'.
    """
    psms_remaining = psms.sort_values('search_engine_score[1]',
                                      ascending=False)
    psms_remaining['mass_diff'] = ((psms_remaining['exp_mass_to_charge'] -
                                    psms_remaining['calc_mass_to_charge']) *
                                   psms_remaining['charge'])
    # A missing mass difference never matches any window (not even its own),
    # which would keep the loop below from making progress.
    psms_remaining = psms_remaining.dropna(subset=['mass_diff'])

    no_grouping = (tol_mass is None or tol_mode not in ('Da', 'ppm') or
                   min_group_size is None)
    mass_group_stats = []
    # Start with the highest ranked SSM.
    while len(psms_remaining) > 0:
        # Find all remaining PSMs within the mass difference window.
        mass_diff = psms_remaining['mass_diff'].iat[0]
        if no_grouping:
            mask = np.full(len(psms_remaining), True, dtype=bool)
        else:
            deviation = np.fabs(psms_remaining['mass_diff'] - mass_diff)
            if tol_mode == 'ppm':
                deviation = (deviation /
                             psms_remaining['exp_mass_to_charge'] * 10 ** 6)
            mask = np.array(deviation <= tol_mass, dtype=bool)
            # The seed PSM always belongs to its own group, which guarantees
            # that every iteration removes at least one PSM.
            mask[0] = True
        group_mass_diffs = psms_remaining.loc[mask, 'mass_diff']
        mass_group_stats.append((group_mass_diffs.median(),
                                 group_mass_diffs.mean(),
                                 len(group_mass_diffs)))
        # Exclude the selected PSMs from further selections.
        psms_remaining = psms_remaining[~mask]

    return pd.DataFrame.from_records(
        mass_group_stats, columns=['mass_diff_median', 'mass_diff_mean',
                                   'num_psms'])

In [None]:
# Group the ANN-SoLo PSMs by precursor mass difference, using a 0.1 Da window.
tol_mass = 0.1
tol_mode = 'Da'
min_group_size = 20
mass_groups = get_mass_groups(psms, tol_mass=tol_mass, tol_mode=tol_mode,
                              min_group_size=min_group_size)

In [None]:
# Show the 20 mass groups that contain the most PSMs.
mass_groups.sort_values('num_psms', ascending=False).head(20)

## Propagate identifications

We can propagate identifications within clusters by assigning unidentified spectra the same peptide sequence as the majority of identified spectra in the same cluster.

We check both identification propagations based on the initial identifications and additional identification propagations based on the new ANN-SoLo identifications.

In [None]:
def _get_cluster_num_id_propagation(sequences):
    num_ids = pd.notnull(sequences).sum()
    return len(sequences) - num_ids if num_ids > 0 else 0


def _get_num_id_propagation(clusters):
    """
    Count the identifications that can be propagated over all clusters.

    Parameters
    ----------
    clusters : pd.DataFrame
        Spectra with (at least) the columns 'cluster' and 'sequence'.

    Returns
    -------
    int
        The sum over all clusters of the per-cluster counts given by
        `_get_cluster_num_id_propagation`.
    """
    sequences_per_cluster = clusters.groupby('cluster')['sequence']
    num_propagated_per_cluster = sequences_per_cluster.apply(
        _get_cluster_num_id_propagation)
    return num_propagated_per_cluster.sum()


# Spectra with a cluster label other than -1.
is_clustered = clusters['cluster'] != -1
# Propagations possible when the spectra identified by ANN-SoLo are left out.
num_id_prop_initial = _get_num_id_propagation(
    clusters[is_clustered & ~clusters['ann_solo']])
# Additional propagations once the ANN-SoLo identifications are included.
num_id_prop_total = _get_num_id_propagation(clusters[is_clustered])
num_id_prop_ann_solo = num_id_prop_total - num_id_prop_initial

In [None]:
# Number of distinct clusters among the clustered spectra that were not
# identified by ANN-SoLo.
is_initial_clustered = (clusters['cluster'] != -1) & ~clusters['ann_solo']
num_id_prop_initial_clusters = clusters.loc[
    is_initial_clustered, 'cluster'].nunique()
num_spectra = len(clusters)
print(f'Number of initial clusters for propagation: '
      f'{num_id_prop_initial_clusters:,}')
print(f'Number of propagated initial spectrum identifications: '
      f'{num_id_prop_initial:,} '
      f'({(num_id_prop_initial / num_spectra):.2%})')
print(f'Number of propagated ANN-SoLo spectrum identifications: '
      f'{num_id_prop_ann_solo:,} '
      f'({(num_id_prop_ann_solo / num_spectra):.2%})')
# All identifications gained on top of the initial MassIVE-KB identifications.
num_new_id = num_id_prop_initial + num_id_ann_solo + num_id_prop_ann_solo
print(f'Total number of new PSMs: {num_new_id:,} '
      f'({num_new_id / num_spectra:.2%})')
print(f'PSM increase: {(num_new_id / num_id_massive):.2%}')

In [None]:
# Clustered spectra that stay unidentified even after propagation.
is_clustered = clusters['cluster'] != -1
num_clustered_without_sequence = (clusters.loc[is_clustered, 'sequence']
                                  .isna().sum())
num_clustered_unidentified = (num_clustered_without_sequence
                              - num_id_prop_initial - num_id_prop_ann_solo)
# Summary table with the number of spectra per identification source.
num_ids = pd.DataFrame(
    {'num_ids': [num_id_massive,
                 num_id_prop_initial,
                 num_id_ann_solo,
                 num_id_prop_ann_solo,
                 num_clustered_unidentified]},
    index=pd.Index(['MassIVE-KB',
                    'MassIVE-KB\npropagation',
                    'ANN-SoLo',
                    'ANN-SoLo\npropagation',
                    'clustered\nunidentified'], name='search_mode'))
num_ids.to_csv('cluster_ident.csv')

In [None]:
# Two-panel figure: the number of spectra per identification source (left) and
# the histogram of precursor mass differences of the ANN-SoLo PSMs (right).
width = 7
height = width / 1.618
fig, axes = plt.subplots(1, 2, figsize=(width * 2, height),
                         gridspec_kw={'width_ratios': [1, 3]})

# Identification contributions.
ax = axes[0]

num_ids.plot.bar(ax=ax, color='#9e0059', legend=False)
# Overlay the running total over the categories on top of the bars.
line = ax.plot(range(len(num_ids)), np.cumsum(num_ids['num_ids']),
               marker='o', markersize=8, markeredgecolor='white',
               markeredgewidth=1)

ax.set_xlabel('')
ax.set_ylabel('Number of identified spectra')
ax.ticklabel_format(axis='y', scilimits=(6, 6))

ax.legend(line, ['Cumulative'], loc='upper center',
          bbox_to_anchor=(0.5, 1.1))

# Mass difference histogram.
ax = axes[1]
# Exclude unmodified PSMs.
# Also restrict the groups to the plotted mass difference range (-50, 100).
mask = ((mass_groups['mass_diff_median'].abs() > tol_mass) &
        (mass_groups['mass_diff_median'] > -50) &
        (mass_groups['mass_diff_median'] < 100))
ax.bar(mass_groups[mask]['mass_diff_median'], mass_groups[mask]['num_psms'],
       width=0.4, color='black')

# Annotate the most frequent modifications.
# Each entry is (label, horizontal offset of the label relative to the group's
# median mass difference, vertical position of the label). The entries are
# matched by position to the mass groups ranked by decreasing number of PSMs,
# skipping the top-ranked group (presumably the unmodified PSMs — verify);
# `(None, None, None)` leaves the group at that rank without a label.
# NOTE(review): the labels and positions are hard-coded for one specific
# result set and need to be revisited whenever the identifications change.
modifications = [('Glu→His', 0, 310000),          #   8.016319
                 ('His→Phe', 10, 260000),         #  10.009502
                 ('oxidation', 8, 210000),        #  15.994915
                 (None, None, None),
                 ('', 0, 210000),                 #   2.015650
                 ('Phospho', 0, 210000),          #  79.966331
                 (None, None, None),
                 ('Carbamidomethyl', 0, 110000),  #  57.021464
                 ('Lys→Arg', 0, 110000),          #  28.006148
                 (None, None, None),
                 (None, None, None),
                 (None, None, None),
                 (None, None, None),
                 (None, None, None),
                 ('water loss', 0, 110000)]       # -18.010565
for (annot, x, y), mass_group in zip(modifications, mass_groups.sort_values(
        'num_psms', ascending=False)[1:].itertuples()):
    if annot is not None:
        # The arrow connects the label to a point just above the group's bar.
        ax.annotate(annot,
                    (mass_group.mass_diff_median, mass_group.num_psms + 50),
                    (mass_group.mass_diff_median + x, y),
                    arrowprops={'arrowstyle': '<-', 'linewidth': 1},
                    ha='center')
# Label placed manually at a fixed position.
ax.text(-0.5, 210000, 'Pro→Val', ha='center')

ax.set_xlim((-50, 100))

# Show the PSM counts with thousands separators.
ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))

ax.set_xlabel('Precursor mass difference (Da)')
ax.set_ylabel(f'Number of PSMs')

for ax in axes:
    sns.despine(ax=ax)
    
fig.subplots_adjust(wspace=0.25)

# Save the figure next to the notebook before displaying it.
plt.savefig('cluster_ident.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

In [None]:
# Flush and close all logging handlers at the end of the notebook.
logging.shutdown()