In [None]:
import os
import sys
os.environ['GLEAMS_HOME'] = os.path.join(os.environ['HOME'],
                                         'Projects/gleams')
# Make sure all code is in the PATH.
sys.path.append(
    os.path.normpath(os.path.join(os.environ['GLEAMS_HOME'], 'src')))

In [None]:
import warnings
from sklearn.exceptions import EfficiencyWarning
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=EfficiencyWarning)

In [None]:
import collections
import copy
import itertools
import shutil
import time

import joblib
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import pyteomics
import pyteomics.mgf
import seaborn as sns
import tqdm.notebook as tqdm

In [None]:
# Initialize logging.
from gleams import logger as glogger
glogger.init()
# Initialize all random seeds before importing any packages.
from gleams import rndm
rndm.set_seeds()

from gleams import config
from gleams.cluster import cluster
from gleams.feature import feature, spectrum
from gleams.ms_io import ms_io
from gleams.nn import nn

In [None]:
import logging
logger = logging.getLogger('gleams')
logger.setLevel(logging.DEBUG)

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-*' and later releases no longer ship the old names. Try the
# new names first and fall back to the old ones on older Matplotlib
# versions, where requesting an unknown style raises OSError.
try:
    plt.style.use(['seaborn-v0_8-white', 'seaborn-v0_8-paper'])
except OSError:
    plt.style.use(['seaborn-white', 'seaborn-paper'])
plt.rc('font', family='serif')
sns.set_palette('Set1')
sns.set_context('paper', font_scale=1.3)    # Single-column figure.

## Export spectra

In [None]:
! mkdir -p $GLEAMS_HOME/notebooks/cluster_comparison

In [None]:
cluster_dir = os.path.join(
    os.environ['GLEAMS_HOME'], 'notebooks', 'cluster_comparison')

In [None]:
def get_spectra_from_file(dataset, filename, scans):
    """
    Read the requested scans from a single peak file and keep the usable ones.

    The peak file is looked up at
    `$GLEAMS_HOME/data/peak/<dataset>/<filename>`.

    Parameters
    ----------
    dataset : str
        Name of the dataset directory that contains the peak file.
    filename : str
        Name of the peak file inside the dataset directory.
    scans
        Scans to read, passed through unchanged to `ms_io.get_spectra`.

    Returns
    -------
    list or None
        The spectra, as read from the file, whose preprocessed copy is
        flagged as valid; `None` if the peak file does not exist.
    """
    logger.debug('Process file %s/%s', dataset, filename)
    peak_filename = os.path.join(os.environ['GLEAMS_HOME'], 'data', 'peak',
                                 dataset, filename)
    if not os.path.isfile(peak_filename):
        return None
    valid_spectra = []
    for spec in ms_io.get_spectra(peak_filename, scans):
        # Preprocess a deep copy: validity is judged on the processed
        # spectrum, but the spectrum as read from the file is returned.
        processed = spectrum.preprocess(copy.deepcopy(spec),
                                        config.fragment_mz_min,
                                        config.fragment_mz_max)
        if processed.is_valid:
            valid_spectra.append(spec)
    return valid_spectra

In [None]:
filename_mgf = os.path.join(cluster_dir, 'cluster_comparison.mgf')
filename_metadata = os.path.join(cluster_dir, 'cluster_comparison.parquet')
split = 'test'
# The metadata file is written last, so its presence is used as the marker
# that a previous export ran to completion.
if not os.path.isfile(filename_metadata):
    logger.info('Export spectra to be clustered to MGF file(s)')
    # Spectra are appended to the MGF file one peak file at a time. Remove
    # any MGF file left behind by an interrupted export first: appending to
    # it would duplicate spectra and the positions in the MGF file would no
    # longer match the indexes recorded in `spectrum_idx`.
    if os.path.isfile(filename_mgf):
        os.remove(filename_mgf)
    datasets = pd.read_parquet(
        os.path.join(os.environ['GLEAMS_HOME'], 'data', 'embed',
                     f'embed_{config.massivekb_task_id}_{split}.parquet'))
    # One row per (dataset, filename) with the list of scans to extract.
    dataset_filename_scans = (datasets.groupby(['dataset', 'filename'])
                              ['scan'].apply(list).reset_index())
    # (dataset, filename, scan) of every exported spectrum, in export order.
    spectrum_idx = []
    for dataset, filename_scans in tqdm.tqdm(
            dataset_filename_scans.groupby('dataset'),
            desc='Datasets processed',
            total=dataset_filename_scans['dataset'].nunique()):
        # Read all peak files of the current dataset in parallel.
        spectra_per_file = joblib.Parallel(
            n_jobs=-1, backend='multiprocessing')(
                joblib.delayed(get_spectra_from_file)
                (dataset, filename, scans)
                for filename, scans in zip(filename_scans['filename'],
                                           filename_scans['scan']))
        for filename, spectra in zip(filename_scans['filename'],
                                     spectra_per_file):
            # `None` signals that the peak file does not exist.
            if spectra is None:
                continue
            spectra_dicts = []
            for spec in spectra:
                # The title is the zero-based position of the spectrum
                # among all exported spectra (read before the append below).
                spectra_dicts.append(
                    {'m/z array': spec.mz,
                     'intensity array': spec.intensity,
                     'params': {
                         'TITLE': len(spectrum_idx),
                         'RTINSECONDS': spec.retention_time,
                         'PEPMASS': spec.precursor_mz,
                         'CHARGE': f'{spec.precursor_charge}+'}})
                spectrum_idx.append((dataset, filename,
                                     int(spec.identifier)))
            with open(filename_mgf, 'a') as f:
                pyteomics.mgf.write(spectra_dicts, f)
    psms = pd.read_parquet(
        os.path.join(os.environ['GLEAMS_HOME'], 'data', 'metadata',
                     f'massivekb_ids_{config.massivekb_task_id}.parquet'))
    # Left join: exported spectra without an identification are kept.
    metadata = pd.merge(
        pd.DataFrame(spectrum_idx, columns=['dataset', 'filename', 'scan']),
        psms, 'left', ['dataset', 'filename', 'scan'])
    # Replace every I by L in the peptide sequences (literal replacement).
    metadata['sequence'] = metadata['sequence'].str.replace(
        'I', 'L', regex=False)
    metadata.to_parquet(filename_metadata)
else:
    metadata = pd.read_parquet(filename_metadata)

## Cluster

In [None]:
min_cluster_size = 2
min_peptide_size = None

In [None]:
def evaluate_clusters(clusters, min_cluster_size=None,
                      min_peptide_size=None):
    """
    Compute the proportion of clustered and incorrectly clustered spectra.

    Within each cluster the most frequent sequence is taken as the cluster
    label; every other spectrum in that cluster counts as incorrectly
    clustered.

    Parameters
    ----------
    clusters : pd.DataFrame
        One row per spectrum with a 'sequence' column and a 'cluster'
        column. Cluster label -1 denotes noise. The DataFrame is not
        modified.
    min_cluster_size : int, optional
        Clusters with fewer members are relabeled as noise (-1).
    min_peptide_size : int, optional
        Only spectra whose sequence occurs at least this many times are
        evaluated. The cluster sizes are determined before this filter.

    Returns
    -------
    Tuple[float, float]
        The proportion of evaluated spectra with a non-noise cluster label
        and the proportion of evaluated spectra that are incorrectly
        clustered. Both are 0.0 if no spectra remain to be evaluated.
    """
    # Work on a copy so the relabeling below never leaks into the caller's
    # DataFrame.
    clusters = clusters.copy()
    # Ignore small clusters.
    if min_cluster_size is not None:
        cluster_counts = clusters['cluster'].value_counts()
        small_clusters = cluster_counts[
            cluster_counts < min_cluster_size].index
        clusters.loc[clusters['cluster'].isin(small_clusters),
                     'cluster'] = -1
    # Only expect frequently occurring peptides to be clustered.
    if min_peptide_size is not None:
        peptide_counts = clusters['sequence'].value_counts()
        clusters = clusters[clusters['sequence'].isin(
            peptide_counts[peptide_counts >= min_peptide_size].index)]
    # Nothing to evaluate: avoid a division by zero.
    if len(clusters) == 0:
        return 0.0, 0.0
    clusters_non_noise = clusters[clusters['cluster'] != -1]
    if len(clusters_non_noise) == 0:
        return 0.0, 0.0
    prop_clustered = len(clusters_non_noise) / len(clusters)
    # Per cluster: members that do not carry the most frequent sequence.
    num_incorrect = (
        clusters_non_noise.groupby('cluster')['sequence']
        .apply(lambda labels: len(labels) - labels.value_counts().iat[0])
        .sum())
    prop_clustered_incorrect = num_incorrect / len(clusters)

    return prop_clustered, prop_clustered_incorrect

In [None]:
performance = collections.defaultdict(list)
timing = collections.defaultdict(list)

### MS-Cluster

In [None]:
dir_mscluster = os.path.join(cluster_dir, 'mscluster')

In [None]:
%%bash

mkdir -p $GLEAMS_HOME/notebooks/cluster_comparison/mscluster
ls $GLEAMS_HOME/notebooks/cluster_comparison/cluster_comparison.mgf \
    > $GLEAMS_HOME/notebooks/cluster_comparison/mscluster/mscluster_spec_list.txt

In [None]:
%%bash

time $GLEAMS_HOME/bin/MsCluster/MsCluster \
    --model LTQ_TRYP \
    --list $GLEAMS_HOME/notebooks/cluster_comparison/mscluster/mscluster_spec_list.txt \
    --output-name mscluster \
    --tmp-dir $GLEAMS_HOME/notebooks/cluster_comparison/mscluster/dat \
    --out-dir $GLEAMS_HOME/notebooks/cluster_comparison/mscluster \
    --dat-only \
    --model-dir $GLEAMS_HOME/bin/MsCluster/Models \
    --keep-dat \
    --assign-charges

MS-Cluster hyperparameters that influence the clustering quality are:

- `--mixture-prob <X>`: the probability of wrongfully adding a spectrum to a cluster (default X=0.05).
- `--num-rounds <X>`: determines how many rounds are used for the hierarchical clustering (default X=3).

In [None]:
hp_mscluster = list(itertools.product(
    [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1], [3, 5, 10]))

In [None]:
for i, (mixture_prob, num_rounds) in enumerate(hp_mscluster):
    logger.info('MS-Cluster run %d (mixture-prob=%.3f ; num-rounds=%d)',
                i + 1, mixture_prob, num_rounds)
    # Execute clustering.
    cmd = f"""$GLEAMS_HOME/bin/MsCluster/MsCluster \
        --model LTQ_TRYP \
        --dat-list {dir_mscluster}/dat/mscluster_dat_list.txt \
        --output-name mscluster \
        --output-file-size 100000000 \
        --out-dir {dir_mscluster}/cluster_{i} \
        --model-dir $GLEAMS_HOME/bin/MsCluster/Models \
        --memory-gb 20 \
        --fragment-tolerance 0.05 \
        --precursor-ppm 10 \
        --assign-charges \
        --mixture-prob {mixture_prob} \
        --num-rounds {num_rounds} \
        --keep-dataset-idx"""
    start_time = time.time()
    if not os.path.isfile(os.path.join(dir_mscluster, f'cluster_{i}',
                                       'mscluster_0_0_mgf_list.txt')):
        ! eval {cmd}
        # Account for failed MS-Cluster runs.
        if not os.path.isfile(os.path.join(dir_mscluster, f'cluster_{i}',
                                           'mscluster_0_0_mgf_list.txt')):
            continue
        timing['MS-Cluster'].append(time.time() - start_time)
    else:
        timing['MS-Cluster'].append(np.nan)
    # Evaluate clustering performance.
    cluster_labels, cluster_i = np.full(len(metadata), -1), -1
    for filename in os.listdir(os.path.join(dir_mscluster, f'cluster_{i}',
                                            'clust')):
        if filename.endswith('.clust'):
            with open(os.path.join(dir_mscluster, f'cluster_{i}', 'clust',
                                   filename)) as f_in:
                for line in f_in:
                    if line.startswith('mscluster'):
                        cluster_i += 1
                    elif not line.isspace():
                        cluster_labels[int(line.split('\t')[2])] = cluster_i
    performance['MS-Cluster'].append(
        evaluate_clusters(pd.DataFrame({'sequence': metadata['sequence'],
                                        'cluster': cluster_labels}).dropna(),
                          min_cluster_size, min_peptide_size))

### spectra-cluster

In [None]:
dir_spectracluster = os.path.join(cluster_dir, 'spectra-cluster')

In [None]:
! mkdir -p $GLEAMS_HOME/notebooks/cluster_comparison/spectra-cluster/tmp

spectra-cluster hyperparameters that influence the clustering quality are:

- `-rounds <arg>`: number of clustering rounds to use.
- `-threshold_end <arg>`: (lowest) final clustering threshold
- `-threshold_start <arg>`: (highest) starting threshold

In [None]:
hp_spectracluster = list(itertools.product(
    [0.9999, 0.999, 0.99, 0.95, 0.9, 0.8, 0.7], [3, 5, 10]))

In [None]:
for i, (threshold_end, rounds) in enumerate(hp_spectracluster):
    logger.info('spectra-cluster run %d (threshold_end=%.2f ; rounds=%d)',
                i + 1, threshold_end, rounds)
    # Execute clustering.
    cmd = f"""java -jar $GLEAMS_HOME/bin/spectra-cluster/spectra-cluster-cli-1.1.2.jar \
        {cluster_dir}/cluster_comparison.mgf \
        -binary_directory {dir_spectracluster}/tmp \
        -fragment_tolerance 0.05 \
        -keep_binary_files \
        -major_peak_jobs $(nproc --all) \
        -output_path {dir_spectracluster}/clusters_{i}.txt \
        -precursor_tolerance 10 \
        -precursor_tolerance_unit ppm \
        -reuse_binary_files \
        -rounds {rounds} \
        -threshold_end {threshold_end} \
        -threshold_start 1.0 \
        -x_disable_mgf_comments"""
    start_time = time.time()
    if not os.path.isfile(os.path.join(dir_spectracluster,
                                       f'clusters_{i}.txt')):
        ! eval {cmd}
        timing['spectra-cluster'].append(time.time() - start_time)
    else:
        timing['spectra-cluster'].append(np.nan)
    # Evaluate clustering performance.
    cluster_labels, cluster_i = np.full(len(metadata), -1), -1
    with open(os.path.join(dir_spectracluster, f'clusters_{i}.txt')) as f_in:
        for line in f_in:
            if line.startswith('=Cluster='):
                cluster_i += 1
            elif line.startswith('SPEC'):
                cluster_labels[
                    int(line[line.find('#id=index=') + len('#id=index='):
                             line.find('#title')]) - 1] = cluster_i
    performance['spectra-cluster'].append(
        evaluate_clusters(pd.DataFrame({'sequence': metadata['sequence'],
                                        'cluster': cluster_labels}).dropna(),
                          min_cluster_size, min_peptide_size))

### GLEAMS

In [None]:
dir_gleams = os.path.join(cluster_dir, 'gleams')

In [None]:
! mkdir -p $GLEAMS_HOME/notebooks/cluster_comparison/gleams

In [None]:
# Extract the relevant entries from all (previously computed) embeddings.
embed_idx = (
    pd.merge(metadata, (pd.read_parquet(
        os.path.join(os.environ['GLEAMS_HOME'], 'data', 'embed',
                     f'embed_{config.massivekb_task_id}_{split}.parquet'))
                            .reset_index()),
             'right', ['dataset', 'filename', 'scan'])
    ['index'].astype(np.int64))
metadata_gleams = metadata.loc[embed_idx.index]
embeddings = np.load(
    os.path.join(os.environ['GLEAMS_HOME'], 'data', 'embed',
                 f'embed_{config.massivekb_task_id}_{split}.npy'),
    mmap_mode='r')[embed_idx.values]
np.save(os.path.join(dir_gleams, 'embed_cluster_comparison.npy'), embeddings)

In [None]:
# Compute pairwise distances.
if (not os.path.isfile(os.path.join(
        dir_gleams, 'dist_cluster_comparison.npz'))):
    cluster.compute_pairwise_distances(
        os.path.join(dir_gleams, 'embed_cluster_comparison.npy'),
        filename_metadata)
    os.rename(os.path.join(os.environ['GLEAMS_HOME'], 'data', 'cluster',
                           'dist_cluster_comparison.npz'),
              os.path.join(dir_gleams, 'dist_cluster_comparison.npz'))

In [None]:
def get_clusters_mz_diff(clusters, min_cluster_size, min_peptide_size):
    """
    Get the m/z differences of incorrectly clustered spectra.

    Within each cluster the most frequent sequence is taken as the cluster
    label. For every spectrum with a different sequence, the difference
    between its m/z and the mean m/z of the spectra carrying the cluster
    label is reported.

    Parameters
    ----------
    clusters : pd.DataFrame
        One row per spectrum with 'sequence', 'cluster', and 'mz' columns.
        Cluster label -1 denotes noise. The DataFrame is not modified.
    min_cluster_size : int or None
        Clusters with fewer members are relabeled as noise (-1).
    min_peptide_size : int or None
        Only spectra whose sequence occurs at least this many times are
        considered. The cluster sizes are determined before this filter.

    Returns
    -------
    np.ndarray
        One-dimensional float array with the non-NaN m/z differences
        (spectrum m/z minus cluster m/z) of the incorrectly clustered
        spectra; empty if there are none.
    """
    # Work on a copy so the relabeling below never leaks into the caller's
    # DataFrame.
    clusters = clusters.copy()
    # Ignore small clusters.
    if min_cluster_size is not None:
        cluster_counts = clusters['cluster'].value_counts()
        small_clusters = cluster_counts[
            cluster_counts < min_cluster_size].index
        clusters.loc[clusters['cluster'].isin(small_clusters),
                     'cluster'] = -1
    # Only expect frequently occurring peptides to be clustered.
    if min_peptide_size is not None:
        peptide_counts = clusters['sequence'].value_counts()
        clusters = clusters[clusters['sequence'].isin(
            peptide_counts[peptide_counts >= min_peptide_size].index)]
    clusters_non_noise = clusters[clusters['cluster'] != -1]

    mz_diffs = []
    for _, cluster in clusters_non_noise.groupby('cluster'):
        sequence_modes = cluster['sequence'].mode()
        # `mode` ignores missing values, so it is empty for a cluster
        # without any sequence: there is no label to compare against.
        if len(sequence_modes) == 0:
            continue
        cluster_label = sequence_modes.iat[0]
        is_correct = (cluster['sequence'] == cluster_label).values
        if is_correct.all():
            continue
        cluster_mz = cluster.loc[is_correct, 'mz'].mean()
        mz_diffs.append(np.asarray(
            cluster.loc[~is_correct, 'mz'] - cluster_mz, dtype=float))
    # Always return a float array, also if nothing is incorrectly clustered.
    if len(mz_diffs) == 0:
        return np.array([], dtype=float)
    mz_diff = np.concatenate(mz_diffs)
    return mz_diff[~np.isnan(mz_diff)]

In [None]:
hp_gleams = list(itertools.product(np.arange(0.25, 0.5, 0.01), [2]))

In [None]:
cluster_filename = os.path.join(
    dir_gleams, 'clusters_cluster_comparison.npy')
gleams_dist_filename = os.path.join(dir_gleams,
                                    'dist_cluster_comparison.npz')
for i, (eps, min_samples) in enumerate(hp_gleams):
    logger.info('GLEAMS run %d (eps=%.4f ; min_samples=%d)',
                i + 1, eps, min_samples)
    # Remove the cluster labels of the previous run (presumably so that
    # `cluster.cluster` recomputes them instead of reusing the file --
    # confirm against its implementation).
    if os.path.isfile(cluster_filename):
        os.remove(cluster_filename)
    # The hyperparameters are passed via the global GLEAMS configuration.
    config.eps = eps
    config.min_samples = min_samples
    # Execute clustering.
    start_time = time.time()
    cluster.cluster(gleams_dist_filename)
    timing['GLEAMS'].append(time.time() - start_time)
    # Evaluate clustering performance.
    gleams_labels = np.load(cluster_filename)
    gleams_eval = pd.DataFrame({'sequence': metadata_gleams['sequence'],
                                'cluster': gleams_labels.copy()}).dropna()
    performance['GLEAMS'].append(
        evaluate_clusters(gleams_eval, min_cluster_size, min_peptide_size))
    # m/z differences of the incorrectly clustered spectra. Only the result
    # of the last run is retained.
    gleams_mz = pd.DataFrame({'sequence': metadata_gleams['sequence'],
                              'cluster': gleams_labels.copy(),
                              'mz': metadata_gleams['mz']})
    clusters_mz_diff = get_clusters_mz_diff(
        gleams_mz, min_cluster_size, min_peptide_size)

## Compare clustering results

In [None]:
def get_pareto_frontier(arr, threshold0=0, threshold1=0):
    """
    Select a monotonically increasing frontier from two-dimensional points.

    The points are sorted by their first column. Starting from the first
    sorted point, a point is added to the frontier if both of its
    coordinates exceed those of the most recently added point by more than
    the corresponding threshold.

    Parameters
    ----------
    arr : array-like
        Two-dimensional array with one point per row; only the first two
        columns are compared.
    threshold0 : float
        Minimum required increase in the first column.
    threshold1 : float
        Minimum required increase in the second column.

    Returns
    -------
    np.ndarray
        The rows of `arr` that make up the frontier, ordered by the first
        column. An empty input is returned as is.
    """
    arr = np.asarray(arr)
    # Without any points there is no first point to start the frontier from.
    if arr.shape[0] == 0:
        return arr
    # Sort by the first column. The stable sort keeps points with an equal
    # first coordinate in their input order, so the result is deterministic.
    arr_sorted = arr[arr[:, 0].argsort(kind='stable')]
    # Iteratively add points to the Pareto frontier.
    pareto_idx = [0]
    for i in range(1, arr_sorted.shape[0]):
        last = arr_sorted[pareto_idx[-1]]
        if (arr_sorted[i, 0] > last[0] + threshold0 and
                arr_sorted[i, 1] > last[1] + threshold1):
            pareto_idx.append(i)
    return arr_sorted[pareto_idx]

In [None]:
joblib.dump((performance, timing), 'cluster_comparison.joblib')

In [None]:
performance, timing = joblib.load('cluster_comparison.joblib')

In [None]:
# Figure with golden-ratio proportions.
width = 7
height = width / 1.618
fig, ax = plt.subplots(figsize=(width, height))

# One curve per tool: the frontier of (incorrectly clustered, clustered)
# proportions over all hyperparameter settings. The performance tuples are
# stored as (clustered, incorrect), hence the column swap.
for tool, values in performance.items():
    points = np.asarray(values)[:, [1, 0]]
    pareto = get_pareto_frontier(points, 0.001, 0.01)
    ax.plot(pareto[:, 0], pareto[:, 1], marker='o', label=tool)

ax.set(xlim=(0, 0.05), ylim=(0.45, 1),
       xlabel='Incorrectly clustered spectra',
       ylabel='Clustered spectra')

ax.legend(loc='lower right')

# Both axes hold proportions in [0, 1]; show them as percentages.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_major_formatter(mticker.PercentFormatter(1))

sns.despine()

plt.savefig('cluster_comparison.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()