In [None]:
import os
import sys
# Location of the GLEAMS checkout. A GLEAMS_HOME that is already exported in
# the environment takes precedence, so the notebook is not tied to
# `~/Projects/gleams`.
os.environ.setdefault(
    'GLEAMS_HOME',
    os.path.join(os.path.expanduser('~'), 'Projects', 'gleams'))
# Make sure all code is in the PATH (added only once, even if this cell is
# executed repeatedly).
gleams_src_dir = os.path.normpath(
    os.path.join(os.environ['GLEAMS_HOME'], 'src'))
if gleams_src_dir not in sys.path:
    sys.path.append(gleams_src_dir)

In [None]:
import warnings
from sklearn.exceptions import EfficiencyWarning
# Silence the warning categories that would otherwise flood the cell outputs.
for ignored_category in (FutureWarning, EfficiencyWarning):
    warnings.simplefilter(action='ignore', category=ignored_category)

In [None]:
import collections
import copy
import math
import shutil

import joblib
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import pyteomics
import scipy.stats
import seaborn as sns
import tqdm.notebook as tqdm

from evaluate_clusters import evaluate_clusters

In [None]:
# Initialize logging.
from gleams import logger as glogger
glogger.init()
# Initialize all random seeds before importing the remaining GLEAMS modules
# below (the statement order in this cell is presumably deliberate; keep it).
from gleams import rndm
rndm.set_seeds()

# GLEAMS modules used by the cells below: `config` (settings such as the
# charge range and the MassIVE-KB task id), `cluster` (embedding clustering),
# `spectrum` (spectrum preprocessing), and `ms_io` (peak file reading).
from gleams import config
from gleams.cluster import cluster
from gleams.feature import spectrum
from gleams.ms_io import ms_io

In [None]:
import logging
# Notebook-wide handle on the 'gleams' logger; DEBUG level so that the
# per-file progress messages emitted by the cells below are shown.
logger = logging.getLogger('gleams')
logger.setLevel(logging.DEBUG)

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# newer releases no longer ship the old names, so use whichever variant the
# installed Matplotlib provides.
plt.style.use([
    style if style in plt.style.available
    else style.replace('seaborn-', 'seaborn-v0_8-', 1)
    for style in ('seaborn-white', 'seaborn-paper')])
plt.rc('font', family='serif')
sns.set_palette(['#9e0059', '#6da7de', '#ee266d', '#dee000', '#eb861e'])
sns.set_context('paper', font_scale=1.3)    # Single-column figure.

## Export spectra

In [None]:
# Create the working directory of the comparison (no-op if it already
# exists). Pure Python instead of a shell call, so that paths containing
# spaces are handled correctly and no POSIX shell is required.
os.makedirs(
    os.path.join(os.environ['GLEAMS_HOME'], 'notebooks', 'cluster_comparison'),
    exist_ok=True)

In [None]:
# Root directory of the comparison; it holds one sub-directory per partition
# of spectra, each with the exported spectra and the results of every tool.
cluster_dir = os.path.join(
    os.environ['GLEAMS_HOME'], 'notebooks', 'cluster_comparison')

In [None]:
def read_export_spectra_to_mgf(peak_filename, out_filename, scans):
    """
    Export selected spectra from a peak file to an MGF file.

    Only spectra whose precursor charge lies within `config.charges`
    (inclusive) and that are still valid after preprocessing are exported.
    The exported peaks are the original, unprocessed ones.

    Parameters
    ----------
    peak_filename : str
        The peak file to read the spectra from.
    out_filename : str
        The MGF file to write. It is only written if at least one spectrum
        is exported.
    scans
        The scans to read, passed on to `ms_io.get_spectra`.

    Returns
    -------
    Optional[List[Tuple[int, int, float]]]
        `None` if the peak file does not exist, otherwise a
        `(scan, precursor charge, precursor m/z)` tuple for every exported
        spectrum, in the order in which the spectra were written.
    """
    # `import pyteomics` by itself does not load the `mgf` submodule; import
    # it explicitly instead of relying on another module having done so.
    import pyteomics.mgf

    logger.debug('Process file %s', peak_filename)
    if not os.path.isfile(peak_filename):
        return None
    min_charge, max_charge = config.charges[0], config.charges[1]
    title_prefix = os.path.basename(peak_filename)
    # Read all spectra from the original peak file.
    spectrum_idx, spectra_dicts = [], []
    for spec in ms_io.get_spectra(peak_filename, scans):
        if not min_charge <= spec.precursor_charge <= max_charge:
            continue
        # Preprocess a copy, so that validity can be checked while the
        # exported peaks remain untouched.
        preprocessed = spectrum.preprocess(
            copy.deepcopy(spec), config.fragment_mz_min,
            config.fragment_mz_max)
        if not preprocessed.is_valid:
            continue
        spectra_dicts.append({
            'm/z array': spec.mz,
            'intensity array': spec.intensity,
            'params': {
                'TITLE': f'{title_prefix}:scan:{spec.identifier}',
                'RTINSECONDS': spec.retention_time,
                'PEPMASS': spec.precursor_mz,
                'CHARGE': f'{spec.precursor_charge}+'}})
        spectrum_idx.append((int(spec.identifier),
                             spec.precursor_charge,
                             spec.precursor_mz))
    # Export the spectra to a temporary MGF file.
    if len(spectra_dicts) > 0:
        pyteomics.mgf.write(spectra_dicts, out_filename, use_numpy=True)
    return spectrum_idx

In [None]:
# Data split whose embedded spectra are used for the comparison.
split = 'test'
# Upper bound on the number of spectra per partition; the export cell
# divides the spectra over enough partitions to stay below it.
max_spectra_per_split = 30_000_000

In [None]:
# Spectra (dataset, filename, scan) of the selected data split.
datasets = pd.read_parquet(
    os.path.join(os.environ['GLEAMS_HOME'], 'data', 'embed',
                 f'embed_{config.massivekb_task_id}_{split}.parquet'))
# Shuffle the spectra and divide them over at least three partitions of at
# most `max_spectra_per_split` spectra each.
datasets_splits = np.array_split(
    datasets.sample(frac=1),
    max(3, math.ceil(len(datasets) / max_spectra_per_split)))

# Peptide identifications used to label the exported spectra.
psms = pd.read_parquet(
    os.path.join(os.environ['GLEAMS_HOME'], 'data', 'metadata',
                 f'massivekb_ids_{config.massivekb_task_id}.parquet'),
    columns=['dataset', 'filename', 'scan', 'sequence'])

for split_i, datasets_split in enumerate(datasets_splits):
    cluster_dir_split = os.path.join(cluster_dir, str(split_i))
    if not os.path.isdir(cluster_dir_split):
        os.makedirs(cluster_dir_split)
    filename_md = os.path.join(cluster_dir_split, 'cluster_comparison.parquet')
    filename_mgf = os.path.join(cluster_dir_split, 'cluster_comparison.mgf')
    # The metadata file is written last, so its presence marks a partition
    # that was exported completely. Check this before creating the scratch
    # directory to avoid leaving an empty `tmp` directory behind.
    if os.path.isfile(filename_md):
        continue
    cluster_dir_split_tmp = os.path.join(cluster_dir_split, 'tmp')
    if os.path.isdir(cluster_dir_split_tmp):
        shutil.rmtree(cluster_dir_split_tmp)
    os.makedirs(cluster_dir_split_tmp)
    # Remove the remains of an earlier, incomplete export.
    if os.path.isfile(filename_mgf):
        os.remove(filename_mgf)
    logger.info('Partition %d/%d: Export spectra to be clustered to MGF file',
                split_i + 1, len(datasets_splits))
    spectrum_idx, tmp_filenames = [], []
    dataset_filename_scans = (datasets_split.groupby(['dataset', 'filename'])
                              ['scan'].apply(list).reset_index())
    # FIXME: Replace by `/tmp` if using local copies of the peak files.
    for file_i, (dataset, filename, scans) in enumerate(
            zip(dataset_filename_scans['dataset'],
                dataset_filename_scans['filename'],
                dataset_filename_scans['scan'])):
        # Prefix the temporary file with a running number: peak files with
        # the same name in different datasets must not share a temporary
        # MGF file.
        tmp_filename = os.path.join(
            cluster_dir_split_tmp,
            f'{file_i}_{os.path.splitext(filename)[0]}.mgf')
        file_spectrum_idx = read_export_spectra_to_mgf(
            os.path.join(os.environ['GLEAMS_HOME'], 'data', 'peak', dataset,
                         filename),
            tmp_filename, scans)
        # No temporary MGF file exists if the peak file is missing (`None`)
        # or if none of its spectra were exported (empty list); queueing
        # such a file would make the concatenation below fail.
        if file_spectrum_idx:
            spectrum_idx.extend((dataset, filename, *idx)
                                for idx in file_spectrum_idx)
            tmp_filenames.append(tmp_filename)
    logger.debug('Combine %d temporary files into a single MGF file %s',
                 len(tmp_filenames), filename_mgf)
    # Concatenate in export order, so that the spectra in the MGF file are in
    # the same order as the rows of the metadata.
    with open(filename_mgf, 'wb') as f_out:
        for fn in tmp_filenames:
            with open(fn, 'rb') as f_in:
                shutil.copyfileobj(f_in, f_out)
    shutil.rmtree(cluster_dir_split_tmp)
    logger.debug('Export metadata for %d spectra to Parquet file %s',
                 len(spectrum_idx), filename_md)
    metadata = pd.merge(
        pd.DataFrame(spectrum_idx, columns=['dataset', 'filename', 'scan',
                                            'charge', 'mz']),
        psms, how='left', on=['dataset', 'filename', 'scan'])
    # Treat isoleucine and leucine as the same residue.
    metadata['sequence'] = metadata['sequence'].str.replace(
        'I', 'L', regex=False)
    metadata.to_parquet(filename_md)

## Cluster

In [None]:
# (minimum, maximum) cluster size pairs for which every clustering result is
# evaluated; each pair is passed on to `evaluate_clusters`. `None` is used
# where no bound is set.
min_cluster_sizes = [(5, None)] #, (2, None), (10, None), (50, None)]

In [None]:
def get_clusters_falcon(filename, ids=None):
    """
    Read the cluster assignments produced by falcon.

    Parameters
    ----------
    filename : str
        The falcon CSV output (lines starting with '#' are skipped); it must
        contain the columns 'identifier' and 'cluster'.
    ids : Optional[pd.DataFrame]
        Spectrum metadata with at least the columns 'sequence' and 'charge'.
        It is joined on its index with the integer that follows the last ':'
        of each falcon identifier (presumably the position of the spectrum
        in the clustered MGF file; verify against the falcon version used).

    Returns
    -------
    pd.DataFrame
        The unmodified falcon table if `ids` is `None`. Otherwise the
        cluster label of each spectrum combined with its metadata, with
        'sequence' extended to '<sequence>/<charge>'.
    """
    cluster_labels = pd.read_csv(filename, comment='#')
    if ids is None:
        return cluster_labels
    # `n` has to be passed by keyword: pandas >= 2.0 rejects it as a
    # positional argument.
    spectrum_i = (cluster_labels['identifier'].str.rsplit(':', n=1)
                  .str[1].astype(int))
    cluster_labels = cluster_labels.set_index(spectrum_i)
    cluster_labels = pd.merge(cluster_labels['cluster'], ids, how='left',
                              left_index=True, right_index=True)
    cluster_labels['sequence'] = (
        cluster_labels['sequence'] + '/' +
        cluster_labels['charge'].astype(str))
    return cluster_labels


def get_clusters_maracluster(filename, ids=None):
    """
    Read the cluster assignments produced by MaRaCluster.

    Parameters
    ----------
    filename : str
        Tab-separated MaRaCluster output without header, whose columns are
        read as 'filename', 'scan', and 'cluster'.
    ids : Optional[pd.DataFrame]
        Spectrum metadata with at least the columns 'sequence' and 'charge',
        joined on its index with the 'scan' values of the clustering.

    Returns
    -------
    pd.DataFrame
        The cluster labels indexed by 'scan' if `ids` is `None`. Otherwise
        the metadata of all clustered spectra that have a charge, with the
        'cluster' column added and 'sequence' extended to
        '<sequence>/<charge>'.
    """
    assignments = pd.read_csv(filename, sep='\t',
                              names=['filename', 'scan', 'cluster'],
                              usecols=['scan', 'cluster'])
    assignments = assignments.dropna(how='all').set_index('scan')
    if ids is None:
        return assignments
    labelled = pd.merge(ids, assignments, how='right',
                        left_index=True, right_index=True)
    # Clustered spectra without metadata have no charge after the join.
    labelled = labelled.dropna(subset=['charge'])
    labelled['charge'] = labelled['charge'].astype(int)
    labelled['sequence'] = (labelled['sequence'] + '/'
                            + labelled['charge'].astype(str))
    return labelled


def get_clusters_mscluster(dir_name, ids=None):
    """
    Read the cluster assignments produced by MS-Cluster.

    All '.clust' files in the directory are parsed: every line starting with
    'mscluster' opens a new cluster, and every other non-blank line is a
    member spectrum whose third tab-separated field is its integer index.

    Parameters
    ----------
    dir_name : str
        Directory that contains the '.clust' files.
    ids : Optional[pd.DataFrame]
        Spectrum metadata with at least the columns 'sequence' and 'charge',
        joined on its index with the spectrum indexes of the clustering.

    Returns
    -------
    pd.DataFrame
        The cluster labels indexed (and sorted) by spectrum index if `ids`
        is `None`. Otherwise the metadata of all clustered spectra that have
        a charge, with the 'cluster' column added and 'sequence' extended to
        '<sequence>/<charge>'.
    """
    members, cluster_i = [], -1
    clust_filenames = (fn for fn in os.listdir(dir_name)
                       if fn.endswith('.clust'))
    for clust_filename in clust_filenames:
        with open(os.path.join(dir_name, clust_filename)) as f_in:
            for line in f_in:
                if line.startswith('mscluster'):
                    # Cluster header: subsequent lines are its members.
                    cluster_i += 1
                    continue
                if line.isspace():
                    continue
                members.append((int(line.split('\t')[2]), cluster_i))
    assignments = pd.DataFrame(members, columns=['index', 'cluster'])
    assignments = assignments.set_index('index').sort_index()
    if ids is None:
        return assignments
    labelled = pd.merge(ids, assignments, how='right',
                        left_index=True, right_index=True)
    # Clustered spectra without metadata have no charge after the join.
    labelled = labelled.dropna(subset=['charge'])
    labelled['charge'] = labelled['charge'].astype(int)
    labelled['sequence'] = (labelled['sequence'] + '/'
                            + labelled['charge'].astype(str))
    return labelled
    
    
def get_clusters_mscrush(dir_name, ids=None):
    """
    Read the cluster assignments produced by msCRUSH.

    All '.txt' files in the directory are read as tab-separated tables with
    the columns 'ID' (cluster) and 'Titles' ('|'-separated spectrum titles of
    the form '<filename>:...:<scan>'). Because every file has its own cluster
    IDs, the IDs of each file are shifted past the highest label assigned so
    far to keep clusters from different files apart.

    Parameters
    ----------
    dir_name : str
        Directory that contains the msCRUSH '.txt' files.
    ids : Optional[pd.DataFrame]
        Spectrum metadata with at least the columns 'filename', 'scan',
        'sequence', and 'charge'.

    Returns
    -------
    pd.DataFrame
        One row per clustered spectrum with the columns 'filename' (with
        '.gz' appended), 'scan', and 'cluster' if `ids` is `None`. Otherwise
        these rows joined with the metadata on filename and scan, restricted
        to spectra that have a charge, with 'sequence' extended to
        '<sequence>/<charge>'.

    Raises
    ------
    ValueError
        If the directory does not contain any '.txt' file with clusters.
    """
    cluster_labels, next_cluster_i = [], 0
    for filename in os.listdir(dir_name):
        if not filename.endswith('.txt'):
            continue
        clusters_file = pd.read_csv(os.path.join(dir_name, filename),
                                    sep='\t')
        # One row per spectrum instead of one row per cluster.
        clusters_file['Titles'] = clusters_file['Titles'].str.split('|')
        clusters_file = clusters_file.explode('Titles')
        filenames_scans = clusters_file['Titles'].str.split(':')
        clusters_file['filename'] = filenames_scans.str[0]
        clusters_file['scan'] = filenames_scans.str[-1].astype(int)
        clusters_file = (clusters_file.rename(columns={'ID': 'cluster'})
                         [['filename', 'scan', 'cluster']].copy())
        if len(clusters_file) > 0:
            # Shift by the highest label used so far rather than by the label
            # of the last row, which is only the highest one if the IDs in
            # the file happen to be sorted (assumes non-negative IDs;
            # TODO confirm against the msCRUSH output format).
            clusters_file['cluster'] += next_cluster_i
            next_cluster_i = clusters_file['cluster'].max() + 1
            cluster_labels.append(clusters_file)
    cluster_labels = pd.concat(cluster_labels, ignore_index=True)
    cluster_labels['filename'] += '.gz'
    if ids is None:
        return cluster_labels
    cluster_labels = (pd.merge(cluster_labels, ids, how='left',
                               on=['filename', 'scan'])
                      .dropna(subset=['charge']))
    cluster_labels['charge'] = cluster_labels['charge'].astype(int)
    cluster_labels['sequence'] = (
        cluster_labels['sequence'] + '/' +
        cluster_labels['charge'].astype(str))
    return cluster_labels


def get_clusters_spectracluster(filename, ids=None):
    """
    Read the cluster assignments produced by spectra-cluster.

    Every line starting with '=Cluster=' opens a new cluster and every line
    starting with 'SPEC' is a member spectrum. The member's one-based index
    is taken from between '#id=index=' and '#title' and converted to a
    zero-based index.

    Parameters
    ----------
    filename : str
        The spectra-cluster output file.
    ids : Optional[pd.DataFrame]
        Spectrum metadata with at least the columns 'sequence' and 'charge',
        joined on its index with the zero-based spectrum indexes.

    Returns
    -------
    pd.DataFrame
        The cluster labels indexed (and sorted) by spectrum index if `ids`
        is `None`. Otherwise the metadata of all clustered spectra that have
        a charge, with the 'cluster' column added and 'sequence' extended to
        '<sequence>/<charge>'.
    """
    index_tag, title_tag = '#id=index=', '#title'
    members, cluster_i = [], -1
    with open(filename) as f_in:
        for line in f_in:
            if line.startswith('=Cluster='):
                cluster_i += 1
                continue
            if not line.startswith('SPEC'):
                continue
            start_i = line.find(index_tag) + len(index_tag)
            stop_i = line.find(title_tag, start_i)
            members.append((int(line[start_i:stop_i]) - 1, cluster_i))
    assignments = pd.DataFrame(members, columns=['index', 'cluster'])
    assignments = assignments.set_index('index').sort_index()
    if ids is None:
        return assignments
    labelled = pd.merge(ids, assignments, how='right',
                        left_index=True, right_index=True)
    # Clustered spectra without metadata have no charge after the join.
    labelled = labelled.dropna(subset=['charge'])
    labelled['charge'] = labelled['charge'].astype(int)
    labelled['sequence'] = (labelled['sequence'] + '/'
                            + labelled['charge'].astype(str))
    return labelled


def get_clusters_gleams(filename_clusters, ids=None):
    """
    Read the cluster assignments produced by GLEAMS.

    Parameters
    ----------
    filename_clusters : str
        NumPy file with one cluster label per spectrum.
    ids : Optional[pd.DataFrame]
        Spectrum metadata with at least the columns 'sequence' and 'charge',
        joined on its index with the positions in the label array.

    Returns
    -------
    pd.DataFrame
        A single 'cluster' column if `ids` is `None`. Otherwise the metadata
        of all spectra that have a charge, with the 'cluster' column added
        and 'sequence' extended to '<sequence>/<charge>'.
    """
    assignments = pd.DataFrame({'cluster': np.load(filename_clusters)})
    if ids is None:
        return assignments
    labelled = pd.merge(ids, assignments, how='right',
                        left_index=True, right_index=True)
    # Labels without matching metadata have no charge after the join.
    labelled = labelled.dropna(subset=['charge'])
    labelled['charge'] = labelled['charge'].astype(int)
    labelled['sequence'] = (labelled['sequence'] + '/'
                            + labelled['charge'].astype(str))
    return labelled

In [None]:
# Collects one tuple per (partition, tool, hyperparameter setting, cluster
# size filter); filled by the tool cells below and converted to a DataFrame
# afterwards. Re-running this cell discards all collected results.
performance = []

### falcon

falcon hyperparameters that influence the clustering quality are:

- `eps`: maximum cosine distance between two spectra for them to be considered as neighbors of each other.

In [None]:
# Run index -> `eps` value; the run index is part of the result file name.
hp_falcon = {0: 0.01, 1: 0.05, 2: 0.10, 3: 0.15, 4: 0.20, 5: 0.25, 6: 0.30}

In [None]:
for split_i in os.listdir(cluster_dir):
    metadata = pd.read_parquet(os.path.join(cluster_dir, split_i,
                                            'cluster_comparison.parquet'))
    dir_falcon = os.path.join(cluster_dir, split_i, 'falcon')
    if not os.path.exists(dir_falcon):
        os.makedirs(dir_falcon)  
    # falcon clustering.
    for i, eps in hp_falcon.items():
        logging.info('falcon run %d (eps=%.2f)', i + 1, eps)
        filename = os.path.join(dir_falcon, f'clusters_{i}.csv.xz')
        # Execute clustering.
        cmd = f"""falcon \
            {os.path.join(cluster_dir, split_i)}/cluster_comparison.mgf \
            {os.path.join(cluster_dir, split_i, "falcon")}/clusters_{i} \
            --precursor_tol 10 ppm \
            --fragment_tol 0.05 \
            --eps {eps} \
            --min_intensity 0.1 \
            --scaling root"""
        if not os.path.isfile(filename):
            ! eval {cmd}
        # Evaluate clustering performance.
        cluster_labels = get_clusters_falcon(filename, metadata)
        for min_cluster_size, max_cluster_size in min_cluster_sizes:
            num_clustered, num_noise, \
                prop_clustered, prop_clustered_incorrect, \
                homogeneity, completeness = \
                    evaluate_clusters(cluster_labels, min_cluster_size,
                                      max_cluster_size)
            performance.append((split_i, 'falcon', (eps,),
                                min_cluster_size, max_cluster_size,
                                num_clustered, num_noise,
                                prop_clustered, prop_clustered_incorrect,
                                homogeneity, completeness))

### MaRaCluster

MaRaCluster hyperparameters that influence the clustering quality are:

- `--pvalThreshold <X>` / `--clusterThresholds <X>`: p-value threshold to merge spectra into clusters.

In [None]:
# Run index -> p-value threshold, used for both `--pvalThreshold` and
# `--clusterThresholds`; the run index is part of the result file name.
hp_maracluster = {0: -3.0, 1: -5.0, 2: -10.0, 3: -15.0, 4: -20.0, 5: -25.0,
                  6: -30.0, 7: -50.0}

In [None]:
for split_i in os.listdir(cluster_dir):
    metadata = pd.read_parquet(os.path.join(cluster_dir, split_i,
                                            'cluster_comparison.parquet'))
    dir_maracluster = os.path.join(cluster_dir, split_i, 'maracluster')
    if not os.path.exists(dir_maracluster):
        os.makedirs(dir_maracluster)    
    # MaRaCluster preprocessing.
    cmd = f"""realpath {os.path.join(cluster_dir, split_i)}/cluster_comparison.mgf \
        > {dir_maracluster}/files.txt"""
    ! eval {cmd}
    # MaRaCluster clustering.
    for i, pval_threshold in hp_maracluster.items():
        logging.info('MaRaCluster run %d (p-value threshold=%.1f)',
                     i + 1, pval_threshold)
        filename_orig = os.path.join(
            dir_maracluster,
            f'clusters_{i}.clusters_p{abs(int(pval_threshold))}.tsv')
        filename = os.path.join(dir_maracluster, f'clusters_{i}.tsv')
        # Execute clustering.
        cmd = f"""$GLEAMS_HOME/bin/maracluster-v1-01-linux-amd64/bin/maracluster batch \
            --batch {dir_maracluster}/files.txt \
            --output-folder {dir_maracluster} \
            --precursorTolerance 10ppm \
            --pvalThreshold {pval_threshold} \
            --clusterThresholds {pval_threshold} \
            --prefix clusters_{i}"""
        if not os.path.isfile(filename):
            ! eval {cmd} && \
                mv {filename_orig} {filename} && \
                rm {dir_maracluster}/*.dat && \
                rm {dir_maracluster}/*.dat.pvalue_tree.tsv && \
                rm {dir_maracluster}/*.dat_file_list.txt && \
                rm {dir_maracluster}/overlap.pvalue_tree.tsv
        # Evaluate clustering performance.
        cluster_labels = get_clusters_maracluster(filename, metadata)
        for min_cluster_size, max_cluster_size in min_cluster_sizes:
            num_clustered, num_noise, \
                prop_clustered, prop_clustered_incorrect, \
                homogeneity, completeness = \
                    evaluate_clusters(cluster_labels, min_cluster_size,
                                      max_cluster_size)
            performance.append((split_i, 'MaRaCluster', (pval_threshold,),
                                min_cluster_size, max_cluster_size,
                                num_clustered, num_noise,
                                prop_clustered, prop_clustered_incorrect,
                                homogeneity, completeness))

### MS-Cluster

MS-Cluster hyperparameters that influence the clustering quality are:

- `--mixture-prob <X>`: the probability of wrongfully adding a spectrum to a cluster (default X=0.05).
- `--num-rounds <X>`: determines how many rounds are used for the hierarchical clustering (default X=3).

In [None]:
# Run index -> `--mixture-prob` value; the run index is part of the name of
# the output directory.
hp_mscluster = {0: 0.00001, 1: 0.0001, 2: 0.001, 3: 0.005, 4: 0.01, 5: 0.05,
                6: 0.1}
# Value for `--num-rounds`.
# NOTE(review): the spectra-cluster hyperparameter cell assigns the same
# global name `rounds` (also 3); keep both in sync or rename one of them.
rounds = 3

In [None]:
# Cluster every partition with MS-Cluster for all mixture probabilities and
# evaluate the resulting clusterings.
for split_i in os.listdir(cluster_dir):
    metadata = pd.read_parquet(os.path.join(cluster_dir, split_i,
                                            'cluster_comparison.parquet'))
    dir_mscluster = os.path.join(cluster_dir, split_i, 'mscluster')
    if not os.path.exists(dir_mscluster):
        os.makedirs(dir_mscluster)
    # MS-Cluster preprocessing: the spectrum list contains the absolute path
    # of the MGF file to cluster.
    cmd = f"""realpath {os.path.join(cluster_dir, split_i)}/cluster_comparison.mgf \
        > {dir_mscluster}/mscluster_spec_list.txt"""
    ! eval {cmd}
    # MS-Cluster clustering.
    for i, mixture_prob in hp_mscluster.items():
        logger.info('MS-Cluster run %d (mixture-prob=%.5f ; num-rounds=%d)',
                    i + 1, mixture_prob, rounds)
        # Every run writes to its own output directory.
        dir_mscluster_i = os.path.join(dir_mscluster, f'clusters_{i}')
        # Execute clustering.
        cmd = f"""$GLEAMS_HOME/bin/MsCluster/MsCluster \
            --model LTQ_TRYP \
            --list {dir_mscluster}/mscluster_spec_list.txt \
            --output-name mscluster \
            --output-file-size 100000000 \
            --tmp-dir {os.path.join(dir_mscluster, "tmp")} \
            --out-dir {dir_mscluster_i} \
            --model-dir $GLEAMS_HOME/bin/MsCluster/Models \
            --memory-gb 300 \
            --fragment-tolerance 0.05 \
            --precursor-ppm 10 \
            --assign-charges \
            --mixture-prob {mixture_prob} \
            --num-rounds {rounds} \
            --keep-dataset-idx"""
        # The run is skipped if its output directory exists.
        # NOTE(review): an interrupted run also leaves that directory behind
        # and is then not repeated; remove the directory to force a rerun.
        if not os.path.exists(dir_mscluster_i):
            ! eval {cmd}
        # Evaluate clustering performance (the cluster files are read from
        # the `clust` sub-directory of the output directory).
        cluster_labels = get_clusters_mscluster(
            os.path.join(dir_mscluster_i, 'clust'), metadata)
        for min_cluster_size, max_cluster_size in min_cluster_sizes:
            num_clustered, num_noise, \
                prop_clustered, prop_clustered_incorrect, \
                homogeneity, completeness = \
                    evaluate_clusters(cluster_labels, min_cluster_size,
                                      max_cluster_size)
            performance.append((split_i, 'MS-Cluster', (mixture_prob, rounds),
                                min_cluster_size, max_cluster_size,
                                num_clustered, num_noise,
                                prop_clustered, prop_clustered_incorrect,
                                homogeneity, completeness))

### spectra-cluster

spectra-cluster hyperparameters that influence the clustering quality are:

- `-rounds <arg>`: number of clustering rounds to use.
- `-threshold_end <arg>`: (lowest) final clustering threshold
- `-threshold_start <arg>`: (highest) starting threshold

In [None]:
# Run index -> `-threshold_end` value; the run index is part of the result
# file name.
hp_spectracluster = {0: 0.99999, 1: 0.9999, 2: 0.999, 3: 0.99, 4: 0.95,
                     5: 0.9, 6: 0.8}
# Value for `-rounds`.
# NOTE(review): the MS-Cluster hyperparameter cell assigns the same global
# name `rounds` (also 3); keep both in sync or rename one of them.
rounds = 3

In [None]:
# Cluster every partition with spectra-cluster for all final thresholds and
# evaluate the resulting clusterings.
for split_i in os.listdir(cluster_dir):
    metadata = pd.read_parquet(os.path.join(cluster_dir, split_i,
                                            'cluster_comparison.parquet'))
    dir_spectracluster = os.path.join(cluster_dir, split_i, 'spectra-cluster')
    # Creating the `tmp` sub-directory (passed as `-binary_directory`) also
    # creates the tool directory itself.
    if not os.path.exists(os.path.join(dir_spectracluster, 'tmp')):
        os.makedirs(os.path.join(dir_spectracluster, 'tmp'))
    # spectra-cluster clustering.
    for i, threshold_end in hp_spectracluster.items():
        logger.info('spectra-cluster run %d (threshold_end=%.5f ; rounds=%d)',
                    i + 1, threshold_end, rounds)
        filename = os.path.join(dir_spectracluster, f'clusters_{i}.txt')
        # Execute clustering. `$(nproc --all)` is expanded by the shell.
        cmd = f"""java -jar $GLEAMS_HOME/bin/spectra-cluster/spectra-cluster-cli-1.1.2.jar \
            {os.path.join(cluster_dir, split_i)}/cluster_comparison.mgf \
            -binary_directory {dir_spectracluster}/tmp \
            -fast_mode \
            -fragment_tolerance 0.05 \
            -keep_binary_files \
            -major_peak_jobs $(nproc --all) \
            -output_path {filename} \
            -precursor_tolerance 10 \
            -precursor_tolerance_unit ppm \
            -reuse_binary_files \
            -rounds {rounds} \
            -threshold_end {threshold_end} \
            -threshold_start 1.0 \
            -x_disable_mgf_comments"""
        # Existing results are reused instead of clustering again.
        if not os.path.isfile(filename):
            ! eval {cmd}
        # Evaluate clustering performance.
        cluster_labels = get_clusters_spectracluster(filename, metadata)
        for min_cluster_size, max_cluster_size in min_cluster_sizes:
            num_clustered, num_noise, \
                prop_clustered, prop_clustered_incorrect, \
                homogeneity, completeness = \
                    evaluate_clusters(cluster_labels, min_cluster_size,
                                      max_cluster_size)
            performance.append((split_i, 'spectra-cluster',
                                (threshold_end, rounds),
                                min_cluster_size, max_cluster_size,
                                num_clustered, num_noise,
                                prop_clustered, prop_clustered_incorrect,
                                homogeneity, completeness))

### GLEAMS

In [None]:
# Run index -> (linkage, distance threshold); the run index is part of the
# result file name. The three linkage types use consecutive index ranges.
hp_gleams = {
    0: ('complete', 0.1), 1: ('complete', 0.2), 2: ('complete', 0.3),
    3: ('complete', 0.4), 4: ('complete', 0.5), 5: ('complete', 0.6),
    6: ('complete', 0.7), 7: ('complete', 0.8),
    8: ('single', 0.05), 9: ('single', 0.10), 10: ('single', 0.15),
    11: ('single', 0.20), 12: ('single', 0.25),
    13: ('average', 0.1), 14: ('average', 0.2), 15: ('average', 0.3),
    16: ('average', 0.4), 17: ('average', 0.5), 18: ('average', 0.6)}

In [None]:
# Cluster the embeddings of every partition with GLEAMS for all
# hyperparameter settings and evaluate the resulting clusterings.
for split_i in os.listdir(cluster_dir):
    dir_gleams = os.path.join(cluster_dir, split_i, 'gleams')
    if not os.path.exists(dir_gleams):
        os.makedirs(dir_gleams)
    embed_filename = os.path.join(dir_gleams, 'embed_cluster_comparison.npy')
    metadata_filename = os.path.join(cluster_dir, split_i,
                                     'cluster_comparison.parquet')
    metadata = pd.read_parquet(metadata_filename)
    # Extract the relevant entries from all (previously computed) embeddings.
    if not os.path.isfile(embed_filename):
        # Common path prefix of the embeddings (`.npy`) and of the table
        # that describes their rows (`.parquet`).
        all_embed_prefix = os.path.join(
            os.environ['GLEAMS_HOME'], 'data', 'embed',
            f'embed_{config.massivekb_task_id}_{split}')
        all_embed_rows = pd.read_parquet(
            f'{all_embed_prefix}.parquet',
            columns=['dataset', 'filename', 'scan']).reset_index()
        # Row of every spectrum of this partition in the embedding matrix.
        embed_idx = pd.merge(
            metadata, all_embed_rows, how='left',
            on=['dataset', 'filename', 'scan'])['index'].values
        embeddings = np.load(f'{all_embed_prefix}.npy', mmap_mode='r')
        np.save(embed_filename, embeddings[embed_idx])
    # GLEAMS clustering.
    for i, (linkage, distance_threshold) in hp_gleams.items():
        logger.info('GLEAMS run %d (%s linkage, distance_threshold=%.2f)',
                    i + 1, linkage, distance_threshold)
        cluster_filename = os.path.join(
            dir_gleams, f'clusters_cluster_comparison_{i}.npy')
        # Execute clustering, unless a result of an earlier run exists.
        if not os.path.isfile(cluster_filename):
            cluster.cluster(
                embed_filename, metadata_filename, cluster_filename,
                config.precursor_tol_mass, config.precursor_tol_mode,
                linkage, distance_threshold, config.charges)
        # Evaluate clustering performance.
        cluster_labels = get_clusters_gleams(cluster_filename, metadata)
        for min_cluster_size, max_cluster_size in min_cluster_sizes:
            (num_clustered, num_noise,
             prop_clustered, prop_clustered_incorrect,
             homogeneity, completeness) = evaluate_clusters(
                cluster_labels, min_cluster_size, max_cluster_size)
            performance.append((
                split_i, f'GLEAMS {linkage} linkage', (distance_threshold,),
                min_cluster_size, max_cluster_size,
                num_clustered, num_noise,
                prop_clustered, prop_clustered_incorrect,
                homogeneity, completeness))

## Compare clustering results

In [None]:
# Column names, in the order of the elements of the collected tuples.
performance_columns = [
    'split_i', 'tool', 'hyperparameters',
    'min_cluster_size', 'max_cluster_size',
    'num_clustered', 'num_noise',
    'prop_clustered', 'prop_clustered_incorrect',
    'homogeneity', 'completeness']
# Turn the collected results into a table and store it, so that the plots
# can be recreated without running the clustering tools again.
performance = pd.DataFrame(performance, columns=performance_columns)
performance.to_csv('cluster_comparison.csv', index=False)

In [None]:
# Reload the stored results. After this round trip through CSV the
# 'hyperparameters' tuples are plain strings.
performance = pd.read_csv('cluster_comparison.csv')

In [None]:
# Compare the tools: proportion of clustered spectra (left) and completeness
# (right) as a function of the proportion of incorrectly clustered spectra.
width = 7
height = width / 1.618
fig, axes = plt.subplots(1, 2, figsize=(width * 2, height))

for tool in ('GLEAMS average linkage', 'falcon', 'MaRaCluster',
             'MS-Cluster', 'spectra-cluster'):
    tool_performance = (performance[performance['tool'] == tool]
                        .sort_values('prop_clustered_incorrect'))
    if tool.startswith('GLEAMS'):
        tool = 'GLEAMS'
    # Average over the partitions: one point per hyperparameter setting.
    mean_performance = (
        tool_performance.groupby('hyperparameters', sort=False)
        [['prop_clustered_incorrect', 'prop_clustered', 'completeness']]
        .mean())
    for ax, y_column in zip(axes, ('prop_clustered', 'completeness')):
        ax.plot(mean_performance['prop_clustered_incorrect'],
                mean_performance[y_column],
                marker='o', label=tool)

for ax, y_min, y_label in zip(axes, (0, 0.7),
                              ('Clustered spectra', 'Completeness')):
    ax.set_xlim(0, 0.05)
    ax.set_ylim(y_min, 1)
    ax.xaxis.set_major_formatter(mticker.PercentFormatter(1, 0))
    ax.set_xlabel('Incorrectly clustered spectra')
    ax.set_ylabel(y_label)
    ax.legend(loc='upper left', frameon=False)
# Only the left y-axis is shown as a percentage.
axes[0].yaxis.set_major_formatter(mticker.PercentFormatter(1, 0))

fig.tight_layout()

for ax in axes:
    sns.despine(ax=ax)

plt.savefig('cluster_comparison.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

In [None]:
# Compare the GLEAMS clustering variants: proportion of clustered spectra
# (left) and completeness (right) as a function of the proportion of
# incorrectly clustered spectra.
width = 7
height = width / 1.618
fig, axes = plt.subplots(1, 2, figsize=(width * 2, height))

for tool in ('GLEAMS complete linkage', 'GLEAMS single linkage',
             'GLEAMS average linkage', 'GLEAMS DBSCAN'):
    tool_performance = (performance[performance['tool'] == tool]
                        .sort_values('prop_clustered_incorrect'))
    # Drop the 'GLEAMS ' prefix for the legend.
    tool = tool[7:]
    # Average over the partitions: one point per hyperparameter setting.
    mean_performance = (
        tool_performance.groupby('hyperparameters', sort=False)
        [['prop_clustered_incorrect', 'prop_clustered', 'completeness']]
        .mean())
    for ax, y_column in zip(axes, ('prop_clustered', 'completeness')):
        ax.plot(mean_performance['prop_clustered_incorrect'],
                mean_performance[y_column],
                marker='o', label=tool, zorder=0.9)

for ax, y_min, y_label in zip(axes, (0, 0.7),
                              ('Clustered spectra', 'Completeness')):
    ax.set_xlim(0, 0.05)
    ax.set_ylim(y_min, 1)
    ax.xaxis.set_major_formatter(mticker.PercentFormatter(1, 0))
    ax.set_xlabel('Incorrectly clustered spectra')
    ax.set_ylabel(y_label)
    ax.legend(loc='upper left', frameon=False)
# Only the left y-axis is shown as a percentage.
axes[0].yaxis.set_major_formatter(mticker.PercentFormatter(1, 0))

fig.tight_layout()

for ax in axes:
    sns.despine(ax=ax)

plt.savefig('cluster_comparison_gleams.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

## Cluster size and number of datasets

In [None]:
# Only use the first split for simplicity.
# The clustering results are very similar across the splits.
split_i = 0
performance_split = (performance[performance['split_i'] == split_i]
                     .reset_index().copy())
is_gleams = performance_split['tool'].str.startswith('GLEAMS')
is_average_linkage = performance_split['tool'] == 'GLEAMS average linkage'
# Exclude the other GLEAMS variants from the selection of the best run by
# giving them an infinitely high error, then merge all GLEAMS runs into one
# tool.
performance_split.loc[is_gleams & ~is_average_linkage,
                      'prop_clustered_incorrect'] = np.inf
performance_split.loc[is_gleams, 'tool'] = 'GLEAMS'

In [None]:
# Get the "best" performing runs (low number of incorrectly clustered spectra).
target_incorrect, tool_index = 0.01, {}
for tool, tool_performance in \
        ((performance_split.set_index('tool')['prop_clustered_incorrect']
          - target_incorrect).abs().reset_index().groupby('tool')):
    if tool in ('GLEAMS complete linkage', 'GLEAMS single linkage',
                'GLEAMS DBSCAN', 'msCRUSH'):
        continue
    idxmin = (tool_performance['prop_clustered_incorrect'] ==
              tool_performance['prop_clustered_incorrect'].min())
    tool_index[tool] = (np.where(idxmin)[0][0],
                        tool_performance[idxmin].index[0])

In [None]:
# Show the selected run of every tool (the second element of each
# `tool_index` value is the row label in `performance_split`).
performance_split.loc[[idx[1] for idx in tool_index.values()]].sort_index()

In [None]:
# Read clustering results from the different tools.
metadata = pd.read_parquet(os.path.join(
    cluster_dir, str(split_i), 'cluster_comparison.parquet'))

# The MaRaCluster cell stores its results as `clusters_<i>.tsv`, whereas this
# cell used to require an xz-compressed copy. Use the compressed file if it
# exists and fall back to the uncompressed result otherwise.
maracluster_filename = os.path.join(
    cluster_dir, str(split_i), 'maracluster',
    f'clusters_{tool_index["MaRaCluster"][0]}.tsv.xz')
if not os.path.isfile(maracluster_filename):
    maracluster_filename = maracluster_filename[:-len('.xz')]

# Cluster assignments of the selected run of every tool; the first element
# of each `tool_index` value is the run index used in the result file names.
tool_clusters = {
    'GLEAMS': get_clusters_gleams(
        os.path.join(cluster_dir, str(split_i), 'gleams',
                     f'clusters_cluster_comparison_'
                     f'{tool_index["GLEAMS"][0]}.npy'),
        metadata),
    'falcon': get_clusters_falcon(
        os.path.join(cluster_dir, str(split_i), 'falcon',
                     f'clusters_{tool_index["falcon"][0]}.csv.xz'),
        metadata),
    'MaRaCluster': get_clusters_maracluster(maracluster_filename, metadata),
    'MS-Cluster': get_clusters_mscluster(
        os.path.join(cluster_dir, str(split_i), 'mscluster',
                     f'clusters_{tool_index["MS-Cluster"][0]}', 'clust'),
        metadata),
    'spectra-cluster': get_clusters_spectracluster(
        os.path.join(cluster_dir, str(split_i), 'spectra-cluster',
                     f'clusters_{tool_index["spectra-cluster"][0]}.txt'),
        metadata)
}

In [None]:
# Remove singleton and noise clusters.
min_cluster_size, max_cluster_size = min_cluster_sizes[0]
for tool, clusters in tool_clusters.items():
    # Use consecutive cluster labels, skipping the noise points.
    label_counts = clusters['cluster'].value_counts(dropna=False)
    if -1 in label_counts.index:
        label_counts = label_counts.drop(index=-1)
    # Build the old -> new label mapping directly from the index of the
    # counts (new labels follow the order returned by `value_counts`). This
    # avoids relying on the column names that
    # `value_counts().to_frame().reset_index()` produces, which changed in
    # pandas 2.0. Labels that are absent from the mapping (i.e. the noise
    # label) fall back to -1.
    cluster_map = collections.defaultdict(
        lambda: -1,
        ((old, new) for new, old in enumerate(label_counts.index)))
    clusters['cluster'] = clusters['cluster'].map(cluster_map)

    # Only consider clusters with specific minimum (inclusive) and/or
    # maximum (exclusive) size; all other clusters are marked as noise (-1).
    cluster_counts = clusters['cluster'].value_counts(dropna=False)
    if min_cluster_size is not None:
        too_small = cluster_counts[cluster_counts < min_cluster_size].index
        clusters.loc[clusters['cluster'].isin(too_small), 'cluster'] = -1
    if max_cluster_size is not None:
        too_large = cluster_counts[cluster_counts >= max_cluster_size].index
        clusters.loc[clusters['cluster'].isin(too_large), 'cluster'] = -1

    tool_clusters[tool] = clusters

In [None]:
# Add cluster sizes.
for tool, clusters in tool_clusters.items():
    # Name the index and the counts explicitly instead of renaming the columns
    # that `reset_index` happens to produce: those column names differ between
    # pandas versions (`value_counts` names its result 'count' as of 2.0).
    cluster_counts = (clusters['cluster']
                      .value_counts(dropna=False)
                      .rename_axis('cluster')
                      .rename('size')
                      .reset_index())
    # Drop a 'size' column left behind by a previous run of this cell, so that
    # re-running it does not produce 'size_x'/'size_y' columns.
    tool_clusters[tool] = pd.merge(
        clusters.drop(columns='size', errors='ignore'), cluster_counts,
        on='cluster')
# Per tool: the size of the cluster of every non-noise spectrum (one value per
# spectrum, not per cluster).
cluster_sizes = {tool: clusters.loc[clusters['cluster'] != -1, 'size'].values
                 for tool, clusters in tool_clusters.items()}
_ = joblib.dump(cluster_sizes, 'cluster_comparison_size.joblib')
# Count number of datasets per cluster.
n_datasets = {tool: (clusters[clusters['cluster'] != -1].groupby('cluster')
                     ['dataset'].nunique().values)
              for tool, clusters in tool_clusters.items()}
_ = joblib.dump(n_datasets, 'cluster_comparison_n_datasets.joblib')

In [None]:
# Summarize the selected run of every tool: the number of non-noise clusters
# plus the number of clustered spectra and the proportion of incorrectly
# clustered spectra as recorded in `performance_split`.
print('Number of clusters / clustered spectra / incorrectly clustered spectra:')
for tool, clusters in tool_clusters.items():
    num_clusters = clusters.loc[clusters["cluster"] != -1, "cluster"].nunique()
    run_label = tool_index[tool][1]
    num_clustered = performance_split.loc[run_label, "num_clustered"]
    prop_incorrect = performance_split.loc[run_label,
                                           "prop_clustered_incorrect"]
    print(f'- {tool}: {num_clusters:,d} / {num_clustered:,} / '
          f'{prop_incorrect:.2%}')
# Median and interquartile range of the cluster sizes, noise excluded.
print('Cluster sizes (median ± IQR):')
for tool, clusters in tool_clusters.items():
    non_noise_sizes = clusters[clusters["cluster"] != -1]["size"]
    size_median = np.median(non_noise_sizes)
    size_iqr = scipy.stats.iqr(non_noise_sizes)
    print(f'- {tool}: {size_median:.0f} ± {size_iqr:.0f}')

In [None]:
# Complementary ECDF of the cluster sizes in `cluster_sizes`, one curve per
# tool, on a logarithmic size axis.
fig_width = 7
fig, ax = plt.subplots(figsize=(fig_width, fig_width / 1.618))

for tool, tool_cluster_sizes in cluster_sizes.items():
    sns.ecdfplot(tool_cluster_sizes, complementary=True, stat='proportion',
                 label=tool, ax=ax)

ax.set_xscale('log')
ax.set_ylim(bottom=0., top=1.01)

# Proportions in [0, 1] are shown as percentages.
ax.yaxis.set_major_formatter(mticker.PercentFormatter(xmax=1))

ax.set_xlabel('Minimum cluster size')
ax.set_ylabel('Proportion of clustered spectra')

ax.legend(frameon=False, loc='upper right')

sns.despine(fig=fig)

fig.savefig('cluster_comparison_size.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close(fig)

In [None]:
# Number of clusters (log scale) as a function of the number of different
# datasets per cluster, one line per tool.
fig_width = 7
fig, ax = plt.subplots(figsize=(fig_width, fig_width / 1.618))

# Unit-width bins starting at 1 and covering the largest value of any tool.
max_n_datasets = max(tool_n_datasets.max()
                     for tool_n_datasets in n_datasets.values())
bins = np.arange(1, max_n_datasets + 2, 1)
for tool, tool_n_datasets in n_datasets.items():
    cluster_counts_per_bin = np.histogram(tool_n_datasets, bins)[0]
    # Skip empty bins; they cannot be drawn on a logarithmic axis.
    non_empty = cluster_counts_per_bin > 0
    ax.plot(bins[:-1][non_empty], cluster_counts_per_bin[non_empty],
            marker='o', label=tool)

ax.xaxis.set_major_locator(mticker.FixedLocator([4, 8, 12, 16, 20]))
ax.set_yscale('log')

ax.set_xlabel('Number of different datasets')
ax.set_ylabel('Number of clusters')

ax.legend(frameon=False, loc='upper right')

sns.despine(fig=fig)

fig.savefig('cluster_comparison_n_datasets.png', dpi=300,
            bbox_inches='tight')
plt.show()
plt.close(fig)

In [None]:
# Flush and close all logging handlers now that the notebook is done.
logging.shutdown()