In [None]:
import os
import sys
# Put the project's source directory on the import path so that the local
# modules imported further down can be resolved.
sys.path.append(os.path.normpath('../src'))

In [None]:
import collections
import itertools
import logging

import joblib
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import natsort
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.patches import Patch
from sklearn.metrics import homogeneity_score, completeness_score

import config, falcon
from ms_io import ms_io

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and newer releases no longer accept the old names, so use whichever spelling
# the installed Matplotlib provides.
style_names = ['seaborn-white', 'seaborn-paper']
plt.style.use([name if name in plt.style.available
               else name.replace('seaborn-', 'seaborn-v0_8-', 1)
               for name in style_names])
plt.rc('font', family='serif')
sns.set_palette(['#6da7de', '#9e0059', '#dee000', '#d82222', '#5ea15d',
                 '#943fa6', '#63c5b5', '#ff38ba', '#eb861e', '#ee266d'])
sns.set_context('paper', font_scale=1.3)    # Single-column figure.

## File download

In [None]:
# Fetch all Thermo raw files of PRIDE project PXD000561 into ../data/external.
# --timestamping lets wget skip files whose local copy is already up to date,
# so the cell can be re-run without downloading everything again.
! wget --timestamping \
    --retry-connrefused \
    --directory-prefix=../data/external \
    --passive-ftp \
    ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2014/04/PXD000561/*.raw

## File conversion

In [None]:
%%bash

# Convert every downloaded raw file to MGF (ThermoRawFileParser output format
# 0), skipping files for which the MGF file already exists.
mkdir -p ../data/interim

for raw_file in ../data/external/*.raw; do
    # If the glob matches nothing, bash passes the pattern through literally;
    # skip that placeholder instead of handing it to the converter.
    [ -e "$raw_file" ] || continue
    # Quote all expansions so file names with spaces are handled correctly.
    mgf_file="../data/interim/$(basename "$raw_file" .raw).mgf"
    if [ ! -f "$mgf_file" ]; then
        ThermoRawFileParser -i "$raw_file" -o ../data/interim -f 0
    fi
done

In [None]:
# Read each MGF file with ms_io and write it straight back to the same path;
# the round trip rewrites the spectrum titles into the form that msCRUSH can
# work with.
mgf_dir = '../data/interim/'
for filename in (os.path.join(mgf_dir, name)
                 for name in os.listdir(mgf_dir) if name.endswith('.mgf')):
    # Materialize all spectra first because the file is overwritten in place.
    spectra = list(ms_io.get_spectra(filename))
    ms_io.write_spectra(filename, spectra)

## Cluster comparison

### Setup

In [None]:
# Root directory below which every clustering tool gets its own output
# subdirectory.
work_dir = '../data/processed'

In [None]:
# (min_cluster_size, max_cluster_size) pairs that are passed to
# evaluate_clusters for every clustering result; None disables a bound.
min_cluster_sizes = [(2, None)]    #, (5, None), (10, None), (50, None)]

In [None]:
def _count_majority_label_mismatch(labels):
    labels_assigned = labels.dropna()
    if len(labels_assigned) <= 1:
        return 0
    else:
        return len(labels_assigned) - labels_assigned.value_counts().iat[0]


def evaluate_clusters(clusters, min_cluster_size=None, max_cluster_size=None):
    """
    Compute summary statistics for a clustering result.

    Parameters
    ----------
    clusters : pd.DataFrame
        One row per spectrum with at least the columns 'cluster' (cluster
        label, -1 denotes noise) and 'sequence' (peptide label, missing for
        unidentified spectra). The input is not modified.
    min_cluster_size : int, optional
        Clusters with fewer members are treated as noise (inclusive bound).
    max_cluster_size : int, optional
        Clusters with at least this many members are treated as noise
        (exclusive bound).

    Returns
    -------
    tuple
        (number of clustered spectra, number of noise spectra, proportion of
        clustered spectra, proportion of incorrectly clustered spectra,
        homogeneity, completeness). The proportion of incorrectly clustered
        spectra is NaN if no identified spectrum is part of a cluster.
    """
    clusters = clusters.copy()
    # Only consider clusters with specific minimum (inclusive) and/or
    # maximum (exclusive) size.
    cluster_counts = clusters['cluster'].value_counts(dropna=False)
    if min_cluster_size is not None:
        too_small = cluster_counts[cluster_counts < min_cluster_size].index
        clusters.loc[clusters['cluster'].isin(too_small), 'cluster'] = -1
    if max_cluster_size is not None:
        too_large = cluster_counts[cluster_counts >= max_cluster_size].index
        clusters.loc[clusters['cluster'].isin(too_large), 'cluster'] = -1

    # Use consecutive cluster labels (largest cluster first), skipping the
    # noise points. The mapping is built directly from the value_counts index
    # so that it does not depend on the column names produced by
    # `reset_index`, which differ between pandas 1.x and 2.x. A missing noise
    # label is not an error.
    retained = (clusters['cluster'].value_counts(dropna=False)
                .drop(index=-1, errors='ignore').index)
    # Labels that are absent from the mapping (i.e. noise) fall back to -1.
    cluster_map = collections.defaultdict(
        lambda: -1, {old: new for new, old in enumerate(retained)})
    clusters['cluster'] = clusters['cluster'].map(cluster_map)
    num_clusters = clusters['cluster'].max() + 1

    # Reassign noise points to singleton clusters.
    noise_mask = clusters['cluster'] == -1
    num_noise = noise_mask.sum()
    clusters.loc[noise_mask, 'cluster'] = np.arange(
        num_clusters, num_clusters + num_noise)

    # Compute cluster evaluation measures.
    prop_clustered = (len(clusters) - num_noise) / len(clusters)

    clusters_ident = clusters.dropna(subset=['sequence'])
    clusters_ident_non_noise = (clusters[~noise_mask]
                                .dropna(subset=['sequence']))

    # The number of incorrectly clustered spectra is the number of PSMs that
    # differ from the majority PSM. Unidentified spectra are not considered.
    num_clustered_incorrect = sum(joblib.Parallel(n_jobs=-1)(
        joblib.delayed(_count_majority_label_mismatch)(clust['sequence'])
        for _, clust in clusters[~noise_mask].groupby('cluster')))
    if len(clusters_ident_non_noise) > 0:
        prop_clustered_incorrect = (num_clustered_incorrect
                                    / len(clusters_ident_non_noise))
    else:
        # Nothing identified was clustered (e.g. everything is noise), so the
        # proportion is undefined rather than a division-by-zero error.
        prop_clustered_incorrect = np.nan

    # Homogeneity measures whether clusters contain only identical PSMs.
    # This is only evaluated on non-noise points, because the noise cluster
    # is highly non-homogeneous by definition.
    homogeneity = homogeneity_score(clusters_ident_non_noise['sequence'],
                                    clusters_ident_non_noise['cluster'])
    # Completeness measures whether identical PSMs are assigned to the same
    # cluster.
    # This is evaluated on all PSMs, including those clustered as noise.
    completeness = completeness_score(clusters_ident['sequence'],
                                      clusters_ident['cluster'])

    return (len(clusters) - num_noise, num_noise,
            prop_clustered, prop_clustered_incorrect,
            homogeneity, completeness)

In [None]:
def get_clusters_mscluster(dir_name, ids=None):
    """
    Read the MS-Cluster cluster assignments from all '.clust' files in a
    directory.

    Parameters
    ----------
    dir_name : str
        Directory containing the MS-Cluster '.clust' output files.
    ids : pd.DataFrame, optional
        Spectrum annotations with the columns 'file_i', 'spectrum_i',
        'identifier', 'precursor_charge', 'precursor_mz' and 'sequence'.

    Returns
    -------
    pd.DataFrame
        Without `ids`: the columns 'file_i', 'spectrum_i' and 'cluster'.
        With `ids`: the columns 'identifier', 'cluster', 'precursor_charge',
        'precursor_mz' and 'sequence' (suffixed with '/<charge>') for the
        spectra that have a precursor charge in `ids`.
    """
    records, current_cluster = [], -1
    clust_files = (name for name in os.listdir(dir_name)
                   if name.endswith('.clust'))
    for clust_file in clust_files:
        with open(os.path.join(dir_name, clust_file)) as f_in:
            for line in f_in:
                # A line starting with 'mscluster' opens a new cluster.
                if line.startswith('mscluster'):
                    current_cluster += 1
                    continue
                if line.isspace():
                    continue
                # Member line: the second and third tab-separated fields are
                # the file index and the spectrum index.
                fields = line.split('\t')
                records.append((int(fields[1]), int(fields[2]),
                                current_cluster))
    cluster_labels = pd.DataFrame(
        records, columns=['file_i', 'spectrum_i', 'cluster'])
    if ids is None:
        return cluster_labels

    annotated = pd.merge(cluster_labels, ids, how='left',
                         on=['file_i', 'spectrum_i'])
    # Spectra without a precursor charge in `ids` are discarded.
    annotated = annotated.dropna(subset=['precursor_charge'])
    annotated = annotated[['identifier', 'cluster', 'precursor_charge',
                           'precursor_mz', 'sequence']]
    annotated['precursor_charge'] = annotated['precursor_charge'].astype(int)
    annotated['sequence'] = (annotated['sequence'] + '/'
                             + annotated['precursor_charge'].astype(str))
    return annotated


def get_clusters_spectracluster(filename, ids=None):
    """
    Read the cluster assignments from a spectra-cluster result file.

    Parameters
    ----------
    filename : str
        Path of the spectra-cluster output file.
    ids : pd.DataFrame, optional
        Spectrum annotations with at least the columns 'identifier',
        'precursor_charge', 'precursor_mz' and 'sequence'.

    Returns
    -------
    pd.DataFrame
        Without `ids`: the columns 'identifier' and 'cluster'.
        With `ids`: additionally 'precursor_charge', 'precursor_mz' and
        'sequence' (suffixed with '/<charge>') for the spectra that have a
        precursor charge in `ids`.
    """
    def _between(text, start_token, stop_token):
        # Text between the first `start_token` and the next `stop_token`.
        start = text.find(start_token) + len(start_token)
        return text[start:text.find(stop_token, start)]

    rows, current_cluster = [], -1
    with open(filename) as f_in:
        for line in f_in:
            if line.startswith('=Cluster='):
                # Every '=Cluster=' header opens a new cluster.
                current_cluster += 1
            elif line.startswith('SPEC'):
                # The run name is taken from the MGF path below 'interim/'
                # and the scan number from the 'scan=' field.
                run_name = _between(line, 'interim/', '.mgf')
                scan = _between(line, 'scan=', '\t')
                rows.append((f'mzspec:PXD000561:{run_name}:scan:{scan}',
                             current_cluster))
    cluster_labels = pd.DataFrame(rows, columns=['identifier', 'cluster'])
    if ids is None:
        return cluster_labels

    id_columns = ['identifier', 'precursor_charge', 'precursor_mz',
                  'sequence']
    annotated = pd.merge(cluster_labels, ids[id_columns], how='left',
                         on='identifier')
    # Spectra without a precursor charge in `ids` are discarded.
    annotated = annotated.dropna(subset=['precursor_charge'])
    annotated['precursor_charge'] = annotated['precursor_charge'].astype(int)
    annotated['sequence'] = (annotated['sequence'] + '/'
                             + annotated['precursor_charge'].astype(str))
    return annotated


def get_clusters_falcon(filename, ids=None):
    """
    Read the cluster assignments from a falcon CSV result file.

    Parameters
    ----------
    filename : str
        Path of the falcon CSV file. It must contain an 'identifier' column
        and, when `ids` is given, a 'precursor_charge' column.
    ids : pd.DataFrame, optional
        Spectrum annotations with at least the columns 'identifier' and
        'sequence'.

    Returns
    -------
    pd.DataFrame
        The content of the CSV file; with `ids` a 'sequence' column
        (suffixed with '/<charge>') is added.
    """
    cluster_labels = pd.read_csv(filename)
    if ids is None:
        return cluster_labels

    annotated = pd.merge(cluster_labels, ids[['identifier', 'sequence']],
                         how='left', on='identifier')
    # The charge is taken from the falcon output itself, not from `ids`.
    charge_suffix = '/' + annotated['precursor_charge'].astype(str)
    annotated['sequence'] = annotated['sequence'] + charge_suffix
    return annotated
    

def get_clusters_maracluster(filename, ids=None):
    """
    Read the cluster assignments from a MaRaCluster TSV result file.

    Parameters
    ----------
    filename : str
        Path of the header-less, tab-separated MaRaCluster output with the
        columns peak file name, scan number and cluster label.
    ids : pd.DataFrame, optional
        Spectrum annotations with at least the columns 'identifier',
        'precursor_charge', 'precursor_mz' and 'sequence'.

    Returns
    -------
    pd.DataFrame
        Without `ids`: the columns 'identifier' and 'cluster'.
        With `ids`: additionally 'precursor_charge', 'precursor_mz' and
        'sequence' (suffixed with '/<charge>').
    """
    assignments = pd.read_csv(filename, sep='\t',
                              names=['filename', 'scan', 'cluster'])
    assignments = assignments.dropna(how='all')
    # The run name is the peak file name without directory and extension.
    run_names = assignments['filename'].apply(
        lambda path: os.path.splitext(os.path.basename(path))[0])
    assignments['identifier'] = ('mzspec:PXD000561:' + run_names + ':scan:'
                                 + assignments['scan'].astype(str))
    cluster_labels = assignments[['identifier', 'cluster']]
    if ids is None:
        return cluster_labels

    id_columns = ['identifier', 'precursor_charge', 'precursor_mz',
                  'sequence']
    annotated = pd.merge(cluster_labels, ids[id_columns], how='left',
                         on='identifier')
    # NOTE(review): unlike the MS-Cluster and spectra-cluster readers, spectra
    # without a match in `ids` are kept here -- confirm this is intended.
    charge_suffix = '/' + annotated['precursor_charge'].astype(str)
    annotated['sequence'] = annotated['sequence'] + charge_suffix
    return annotated
    
    
def get_clusters_mscrush(filename, ids=None):
    """
    Read the msCRUSH cluster assignments for all charges in `config.charges`.

    Parameters
    ----------
    filename : str
        Format template for the per-charge msCRUSH result files; it is
        expanded with `filename.format(charge)` for every charge.
    ids : pd.DataFrame, optional
        Spectrum annotations with at least the columns 'identifier',
        'precursor_charge', 'precursor_mz' and 'sequence'.

    Returns
    -------
    pd.DataFrame
        Without `ids`: the columns 'identifier' and 'cluster'.
        With `ids`: additionally 'precursor_charge', 'precursor_mz' and
        'sequence' (suffixed with '/<charge>').
    """
    per_charge = []
    for charge in config.charges:
        table = pd.read_csv(filename.format(charge), sep='\t')
        # Each row lists all member titles of a cluster separated by '|';
        # expand to one row per spectrum.
        table['Titles'] = table['Titles'].str.split('|')
        table = table.explode('Titles')
        table['identifier'] = 'mzspec:PXD000561:' + table['Titles']
        table = (table.rename(columns={'ID': 'cluster'})
                 [['identifier', 'cluster']])
        if len(per_charge) > 0:
            # Continue numbering after the last cluster label of the previous
            # charge so that labels stay distinct between charges.
            label_offset = per_charge[-1].iat[-1, 1] + 1
            table['cluster'] += label_offset
        per_charge.append(table)
    cluster_labels = pd.concat(per_charge, ignore_index=True)
    if ids is None:
        return cluster_labels

    id_columns = ['identifier', 'precursor_charge', 'precursor_mz',
                  'sequence']
    annotated = pd.merge(cluster_labels, ids[id_columns], how='left',
                         on='identifier')
    charge_suffix = '/' + annotated['precursor_charge'].astype(str)
    annotated['sequence'] = annotated['sequence'] + charge_suffix
    return annotated

In [None]:
# Load the spectrum annotations, write every 'L' in the peptide sequences as
# 'I' (presumably because leucine and isoleucine are indistinguishable by
# mass -- confirm), and keep only the precursor charges that are configured.
ids = pd.read_parquet('kim2014_ids.parquet')
ids = ids.assign(sequence=ids['sequence'].str.replace('L', 'I'))
ids = ids.loc[ids['precursor_charge'].isin(config.charges)]

In [None]:
# Collects one result tuple per tool, hyperparameter setting and cluster size
# filter; converted into a DataFrame once all tools have been evaluated.
performance = []

### MS-Cluster

In [None]:
# Output directory for all MS-Cluster files.
dir_mscluster = os.path.join(work_dir, 'mscluster')

In [None]:
%%bash

# Create the MS-Cluster output directory and write the absolute paths of all
# MGF files to the spectrum list that MS-Cluster reads.
# (The directory path must not start with a stray '$': bash would treat it
# as a literal character and create a directory named '$..' instead.)
mkdir -p ../data/processed/mscluster
realpath ../data/interim/*.mgf > ../data/processed/mscluster/mscluster_spec_list.txt

In [None]:
%%bash

# Preprocessing pass of MS-Cluster over the spectrum list. With --dat-only and
# --keep-dat this presumably only converts the spectra to .dat files under
# ../data/processed/mscluster/dat without clustering them (confirm against
# the MS-Cluster manual); the clustering runs further down read
# dat/mscluster_dat_list.txt from that directory.
# `time` reports how long this step takes.
time ../bin/MsCluster/MsCluster \
    --model LTQ_TRYP \
    --list ../data/processed/mscluster/mscluster_spec_list.txt \
    --output-name mscluster \
    --tmp-dir ../data/processed/mscluster/dat \
    --out-dir ../data/processed/mscluster \
    --dat-only \
    --model-dir ../bin/MsCluster/Models \
    --keep-dat \
    --assign-charges

In [None]:
# MS-Cluster hyperparameters: run index -> mixture probability; every run uses
# the same number of clustering rounds.
hp_mscluster = dict(enumerate(
    [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2]))
rounds = 3

In [None]:
for i, mixture_prob in hp_mscluster.items():
    logging.info('MS-Cluster run %d (mixture-prob=%.3f ; num-rounds=%d)',
                 i + 1, mixture_prob, rounds)
    # Execute clustering.
    cmd = f"""../bin/MsCluster/MsCluster \
        --model LTQ_TRYP \
        --dat-list {dir_mscluster}/dat/mscluster_dat_list.txt \
        --output-name mscluster \
        --output-file-size 100000000 \
        --out-dir {dir_mscluster}/cluster_{i} \
        --model-dir ../bin/MsCluster/Models \
        --memory-gb 50 \
        --fragment-tolerance 0.05 \
        --precursor-ppm 20 \
        --assign-charges \
        --mixture-prob {mixture_prob} \
        --num-rounds {rounds} \
        --keep-dataset-idx"""
    if not os.path.exists(os.path.join(dir_mscluster, f'cluster_{i}')):
        ! eval {cmd}
    # Evaluate clustering performance.
    cluster_labels = get_clusters_mscluster(
        os.path.join(dir_mscluster, f'cluster_{i}'), ids)
    for min_cluster_size, max_cluster_size in min_cluster_sizes:
        num_clustered, num_noise, \
            prop_clustered, prop_clustered_incorrect, \
            homogeneity, completeness = \
                evaluate_clusters(cluster_labels, min_cluster_size,
                                  max_cluster_size)
        performance.append(('MS-Cluster', (mixture_prob, rounds),
                            min_cluster_size, max_cluster_size,
                            num_clustered, num_noise,
                            prop_clustered, prop_clustered_incorrect,
                            homogeneity, completeness))

### spectra-cluster

In [None]:
# Output directory for all spectra-cluster files.
dir_spectracluster = os.path.join(work_dir, 'spectra-cluster')

In [None]:
# Create the spectra-cluster output directory including the 'tmp'
# subdirectory that is passed as -binary_directory to the clustering runs.
# (The path must start with '../'; '.../' would create a directory that is
# literally named '...' in the notebook directory.)
! mkdir -p ../data/processed/spectra-cluster/tmp

In [None]:
# spectra-cluster hyperparameters: run index -> final similarity threshold;
# every run uses the same number of clustering rounds.
hp_spectracluster = dict(enumerate(
    [0.99999, 0.9999, 0.999, 0.99, 0.95, 0.9, 0.8, 0.7]))
rounds = 3

In [None]:
for i, threshold_end in hp_spectracluster.items():
    logging.info('spectra-cluster run %d (threshold_end=%.4f ; rounds=%d)',
                 i + 1, threshold_end, rounds)
    filename = os.path.join(dir_spectracluster, f'clusters_{i}.txt')
    # Execute clustering.
    cmd = f"""java -jar ../bin/spectra-cluster/spectra-cluster-cli-1.1.2.jar \
        ../data/interim/*.mgf \
        -binary_directory {dir_spectracluster}/tmp \
        -fast_mode \
        -fragment_tolerance 0.05 \
        -keep_binary_files \
        -major_peak_jobs $(nproc --all) \
        -output_path {filename} \
        -precursor_tolerance 20 \
        -precursor_tolerance_unit ppm \
        -reuse_binary_files \
        -rounds {rounds} \
        -threshold_end {threshold_end} \
        -threshold_start 1.0 \
        -x_disable_mgf_comments"""
    if not os.path.isfile(filename):
        ! eval {cmd}
    # Evaluate clustering performance.
    cluster_labels = get_clusters_spectracluster(filename, ids)
    for min_cluster_size, max_cluster_size in min_cluster_sizes:
        num_clustered, num_noise, \
            prop_clustered, prop_clustered_incorrect, \
            homogeneity, completeness = \
                evaluate_clusters(cluster_labels, min_cluster_size,
                                  max_cluster_size)
        performance.append(('spectra-cluster', (threshold_end, rounds),
                            min_cluster_size, max_cluster_size,
                            num_clustered, num_noise,
                            prop_clustered, prop_clustered_incorrect,
                            homogeneity, completeness))

### falcon

In [None]:
# Output directory for all falcon files.
dir_falcon = os.path.join(work_dir, 'falcon')

In [None]:
# Create the falcon output directory (no-op if it already exists).
! mkdir -p ../data/processed/falcon

In [None]:
# falcon hyperparameters: run index -> eps value.
hp_falcon = dict(enumerate([0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35]))

In [None]:
# Run falcon once per eps value (skipped if the result file of a run already
# exists) and evaluate every result. falcon is configured through attributes
# of the `config` module rather than through function arguments.
config.work_dir = dir_falcon
for i, eps in hp_falcon.items():
    logging.info('hp_falcon run %d (eps=%.2f)', i + 1, eps)
    config.eps = eps
    filename = os.path.join(dir_falcon, f'clusters_{i}.csv')
    # Execute clustering.
    if not os.path.isfile(filename):
        falcon.main()
        # falcon writes 'clusters.csv'; give each run its own file name.
        os.rename(os.path.join(dir_falcon, 'clusters.csv'), filename)
    # Evaluate clustering performance.
    cluster_labels = get_clusters_falcon(filename, ids)
    for min_cluster_size, max_cluster_size in min_cluster_sizes:
        # evaluate_clusters returns (num_clustered, num_noise, prop_clustered,
        # prop_clustered_incorrect, homogeneity, completeness).
        metrics = evaluate_clusters(cluster_labels, min_cluster_size,
                                    max_cluster_size)
        performance.append(('falcon', eps,
                            min_cluster_size, max_cluster_size,
                            *metrics))

### MaRaCluster

In [None]:
# Output directory for all MaRaCluster files.
dir_maracluster = os.path.join(work_dir, 'maracluster')

In [None]:
# Create the MaRaCluster output directory and write the list of MGF files
# (one path per line) that is passed to MaRaCluster via --batch.
! mkdir -p ../data/processed/maracluster
! ls -1 ../data/interim/*.mgf > ../data/processed/maracluster/files.txt

In [None]:
# MaRaCluster hyperparameters: run index -> p-value threshold (used for both
# --pvalThreshold and --clusterThresholds).
hp_maracluster = dict(enumerate(
    [-3.0, -5.0, -10.0, -15.0, -20.0, -25.0, -30.0]))

In [None]:
for i, pval_threshold in hp_maracluster.items():
    logging.info('MaRaCluster run %d (p-value threshold=%.1f)',
                 i + 1, pval_threshold)
    filename = os.path.join(
        dir_maracluster,
        f'MaRaCluster{i}.clusters_p{abs(int(pval_threshold))}.tsv')
    # Execute clustering.
    cmd = f"""../bin/maracluster-v1-01-linux-amd64/bin/maracluster batch \
        --batch ../data/processed/maracluster/files.txt \
        --output-folder ../data/processed/maracluster \
        --precursorTolerance 20ppm \
        --pvalThreshold {pval_threshold} \
        --clusterThresholds {pval_threshold} \
        --prefix MaRaCluster{i}"""
    if not os.path.isfile(filename):
        ! eval {cmd}
    # Evaluate clustering performance.
    cluster_labels = get_clusters_maracluster(filename, ids)
    for min_cluster_size, max_cluster_size in min_cluster_sizes:
        num_clustered, num_noise, \
            prop_clustered, prop_clustered_incorrect, \
            homogeneity, completeness = \
                evaluate_clusters(cluster_labels, min_cluster_size,
                                  max_cluster_size)
        performance.append(('MaRaCluster', (pval_threshold,),
                            min_cluster_size, max_cluster_size,
                            num_clustered, num_noise,
                            prop_clustered, prop_clustered_incorrect,
                            homogeneity, completeness))
    os.rename(filename, os.path.join(
        dir_maracluster, f'PXD000561_maracluster_{i}.tsv'))

### msCRUSH

In [None]:
# Output directory for all msCRUSH files.
dir_mscrush = os.path.join(work_dir, 'mscrush')

In [None]:
# Create the msCRUSH output directory (no-op if it already exists).
! mkdir -p ../data/processed/mscrush

In [None]:
# msCRUSH hyperparameters: run index -> (iteration, hash, similarity), covering
# the full grid of the three value lists.
hp_mscrush = dict(enumerate(itertools.product(
    [50, 100, 200], [10, 15], [0.55, 0.65, 0.75])))

In [None]:
for i, (it, h, sim) in hp_mscrush.items():
    logging.info('msCRUSH run %d (iteration=%d, hash=%d, similarity=%.2f)',
                 i + 1, it, h, sim)
    filename = os.path.join(dir_mscrush, f'PXD000561_mscrush_{i}',
                            f'mscrush-c{config.charges[0]}.txt')
    if not os.path.exists(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))
    # Execute clustering.
    cmd = f"""../bin/mscrush/mscrush_on_general_charge \
        --files ../data/interim/*.mgf \
        --iteration {it} \
        --hash {h} \
        --thread $(nproc --all) \
        --similarity {sim} \
        --clustering_prefix {os.path.dirname(filename)}/mscrush"""
    if not os.path.isfile(filename.format(config.charges[0])):
        ! eval {cmd}
    # Evaluate clustering performance.
    cluster_labels = get_clusters_mscrush(filename, ids)
    for min_cluster_size, max_cluster_size in min_cluster_sizes:
        num_clustered, num_noise, \
            prop_clustered, prop_clustered_incorrect, \
            homogeneity, completeness = \
                evaluate_clusters(cluster_labels, min_cluster_size,
                                  max_cluster_size)
        performance.append(('msCRUSH', (it, h, sim),
                            min_cluster_size, max_cluster_size,
                            num_clustered, num_noise,
                            prop_clustered, prop_clustered_incorrect,
                            homogeneity, completeness))

## Compare clustering results

### Cluster hyperparameters

In [None]:
# Turn the collected result tuples into a table (one row per tool,
# hyperparameter setting and cluster size filter) and store it as CSV.
performance_columns = [
    'tool', 'hyperparameters',
    'min_cluster_size', 'max_cluster_size',
    'num_clustered', 'num_noise',
    'prop_clustered', 'prop_clustered_incorrect',
    'homogeneity', 'completeness']
performance = pd.DataFrame(performance, columns=performance_columns)
performance.to_csv('cluster_comparison.csv', index=False)

In [None]:
# Display the performance table.
performance

### More detailed analysis of cluster size

In [None]:
tool_clusters = {}

# Read clustering results from the different tools.
tool_clusters['falcon'] = get_clusters_falcon(
    '../data/processed/falcon/clusters_1.csv', ids)
tool_clusters['MS-Cluster'] = get_clusters_mscluster(
    '../data/processed/mscluster/cluster_0', ids)
tool_clusters['spectra-cluster'] = get_clusters_spectracluster(
    '../data/processed/spectra-cluster/clusters_0.txt', ids)

# Remove singleton and noise clusters.
min_cluster_size = 2
for tool, clusters in tool_clusters.items():
    # Only consider clusters with specific minimum (inclusive) size.
    cluster_counts = clusters['cluster'].value_counts(dropna=False)
    if min_cluster_size is not None:
        too_small = cluster_counts[cluster_counts < min_cluster_size].index
        clusters.loc[clusters['cluster'].isin(too_small), 'cluster'] = -1

    # Use consecutive cluster labels (largest cluster first), skipping the
    # noise points. The mapping is built directly from the value_counts index
    # so that it does not depend on the column names produced by
    # `reset_index`, which differ between pandas 1.x and 2.x. A missing noise
    # label is not an error.
    retained = (clusters['cluster'].value_counts(dropna=False)
                .drop(index=-1, errors='ignore').index)
    # Labels that are absent from the mapping (i.e. noise) fall back to -1.
    cluster_map = collections.defaultdict(
        lambda: -1, {old: new for new, old in enumerate(retained)})
    clusters['cluster'] = clusters['cluster'].map(cluster_map)
    num_clusters = clusters['cluster'].max() + 1

    # Reassign noise points to singleton clusters.
    noise_mask = clusters['cluster'] == -1
    num_noise = noise_mask.sum()
    clusters.loc[noise_mask, 'cluster'] = np.arange(
        num_clusters, num_clusters + num_noise)

    tool_clusters[tool] = clusters

# Add cluster sizes.
for tool, clusters in tool_clusters.items():
    # Name the index and the values explicitly so that the resulting columns
    # are 'cluster' and 'size' regardless of the pandas version.
    cluster_sizes = (clusters['cluster']
                     .value_counts(dropna=False)
                     .rename_axis('cluster')
                     .rename('size')
                     .reset_index())
    tool_clusters[tool] = pd.merge(clusters, cluster_sizes, on='cluster')

In [None]:
# Report how many clusters with more than one member each tool produced.
print('Number of clusters per tool:')
for tool, clusters in tool_clusters.items():
    non_singleton_labels = clusters.loc[clusters['size'] > 1, 'cluster']
    print(f'- {tool}: {non_singleton_labels.nunique():,d}')

In [None]:
# Determine the peptide (sequence/charge label) that occurs most often in the
# falcon result; value_counts sorts by descending frequency.
peptide_counts = tool_clusters['falcon']['sequence'].value_counts()
max_peptide = peptide_counts.index.values[0]
num_max_peptide = peptide_counts.values[0]
print(f'Most frequent peptide: {max_peptide} is observed '
      f'{num_max_peptide:,d} times')

In [None]:
# For every tool, collect all clusters in which the most frequent peptide is
# the majority identification and bin them by cluster size.
# Size bins as (min inclusive, max exclusive); None means open-ended.
cluster_size_intervals = [(2, 5), (5, 20), (20, 100), (100, 500),
                          (500, 5000), (5000, None)]
max_peptide_clusters = []
for tool, clusters in tool_clusters.items():
    for cluster in (clusters[clusters['sequence'] == max_peptide]
                    ['cluster'].unique()):
        cluster_sequences = clusters.loc[clusters['cluster'] == cluster,
                                         'sequence']
        cluster_size = len(cluster_sequences)
        if cluster_size < cluster_size_intervals[0][0]:
            continue
        num_correct = (cluster_sequences == max_peptide).sum()
        # Count the missing sequences directly instead of looking up a NaN
        # key, which avoids the `np.NaN` alias that NumPy 2.0 removed.
        num_unidentified = cluster_sequences.isna().sum()
        num_incorrect = cluster_size - num_correct - num_unidentified
        # Only consider clusters where this is the majority peptide.
        if num_correct > num_incorrect:
            for min_interval_size, max_interval_size in cluster_size_intervals:
                if (max_interval_size is None
                        and min_interval_size <= cluster_size):
                    interval = f'{min_interval_size}+'
                    break
                elif min_interval_size <= cluster_size < max_interval_size:
                    interval = f'{min_interval_size}–{max_interval_size}'
                    break
            max_peptide_clusters.append((tool, interval, num_correct,
                                         num_unidentified, num_incorrect))
# Natural sorting orders the interval labels numerically ('5–20' before
# '20–100').
max_peptide_clusters = (
    pd.DataFrame(max_peptide_clusters, columns=[
        'tool', 'interval', 'num_correct', 'num_unidentified', 'num_incorrect'])
    .sort_values(['tool', 'interval'], key=natsort.natsort_keygen()))
max_peptide_clusters['num_total'] = (max_peptide_clusters['num_correct'] +
                                     max_peptide_clusters['num_unidentified'] +
                                     max_peptide_clusters['num_incorrect'])

In [None]:
# Per tool, list the total sizes of the (at most) five largest clusters that
# were collected for the most frequent peptide. `head()` defaults to five
# rows; the inner index level created by `apply` is dropped so that only the
# tool name remains before it is turned back into a column.
print(f'Size of the top 5 largest clusters for peptide {max_peptide} per tool:')
(max_peptide_clusters.groupby('tool')
 ['num_total'].apply(lambda x: x.sort_values(ascending=False).head())
 .to_frame().droplevel(1).reset_index()
 .rename(columns={'num_total': 'largest clusters'}))

In [None]:
# Report in how many clusters per tool the most frequent peptide is the
# majority identification (one row of `max_peptide_clusters` per cluster).
print(f'Number of unique clusters for peptide {max_peptide} per tool:')
# `Series.items()` works in every pandas version; `iteritems()` was removed
# in pandas 2.0.
for tool, num_clusters in max_peptide_clusters['tool'].value_counts().items():
    print(f'- {tool}: {num_clusters}')

In [None]:
# Four-panel summary figure, saved as 'cluster_comparison.png':
#   A: proportion of clustered spectra vs. incorrectly clustered spectra,
#   B: completeness vs. incorrectly clustered spectra,
#   C: cumulative distribution of the cluster sizes,
#   D: composition of the clusters of the most frequent peptide per size bin.
width = 7
# Golden-ratio aspect for each panel.
height = width / 1.618
fig, axes = plt.subplots(2, 2, figsize=(width * 2, height * 2))
# Flatten the 2x2 grid so that the panels can be addressed as axes[0..3].
axes = np.ravel(axes)

# Number of clustered spectra and completeness.
for tool in ('falcon', 'MS-Cluster', 'spectra-cluster'):
    tool_performance = performance[(performance['tool'] == tool) &
                                   (performance['min_cluster_size'] == 2)]
    axes[0].plot(tool_performance['prop_clustered_incorrect'],
                 tool_performance['prop_clustered'], marker='o', label=tool)
    axes[1].plot(tool_performance['prop_clustered_incorrect'],
                 tool_performance['completeness'], marker='o', label=tool)

axes[0].set_xlim(0, 0.05)
axes[0].set_ylim(0, 1)
axes[1].set_xlim(0, 0.05)
axes[1].set_ylim(0.75, 1)

# The plotted values are fractions, so 1 corresponds to 100%; show the ticks
# as percentages without decimals.
axes[0].xaxis.set_major_formatter(mticker.PercentFormatter(1, 0))
axes[0].yaxis.set_major_formatter(mticker.PercentFormatter(1, 0))
axes[1].xaxis.set_major_formatter(mticker.PercentFormatter(1, 0))

axes[0].set_xlabel('Incorrectly clustered spectra')
axes[0].set_ylabel('Clustered spectra')
axes[1].set_xlabel('Incorrectly clustered spectra')
axes[1].set_ylabel('Completeness')

axes[0].legend(loc='lower right', frameon=False)
axes[1].legend(loc='lower right', frameon=False)

# Cluster sizes.
max_size = max([clusters['size'].max()
                for clusters in tool_clusters.values()])
for i, (tool, clusters) in enumerate(tool_clusters.items()):
    # The overall maximum size is inserted into every tool's sizes, presumably
    # so that all ECDF curves extend to the same right-hand x value.
    # NOTE(review): this adds one artificial data point per tool.
    sizes = np.insert(clusters['size'].values, -1, max_size)
    # Tools that come first get the highest zorder and are drawn on top.
    sns.ecdfplot(sizes, stat='proportion', ax=axes[2], label=tool,
                 zorder=len(tool_clusters) - i)
    
axes[2].set_xscale('log')
axes[2].set_ylim(0., 1.01)

axes[2].yaxis.set_major_formatter(mticker.PercentFormatter(1))

axes[2].set_xlabel('Cluster size')
axes[2].set_ylabel('Cumulative clustered spectra')

axes[2].legend(loc='lower right', frameon=False)

# Frequent peptide.
# Append an all-zero row for every tool/interval combination so that each
# combination is present after grouping, even if a tool has no cluster in
# that size bin.
max_peptide_clusters_grouped = pd.concat(
    [max_peptide_clusters,
     pd.DataFrame([(tool, interval, 0, 0, 0)
                   for tool in max_peptide_clusters['tool'].unique()
                   for interval in max_peptide_clusters['interval'].unique()],
                  columns=['tool', 'interval', 'num_correct',
                           'num_unidentified', 'num_incorrect'])],
    ignore_index=True)    
# Sum the spectrum counts per tool and size bin, keeping the natural
# (numeric) order of the interval labels.
max_peptide_clusters_grouped = (max_peptide_clusters_grouped
                                .sort_values(['tool', 'interval'],
                                             key=natsort.natsort_keygen())
                                .groupby(['tool', 'interval'], sort=False)
                                [['num_correct', 'num_unidentified',
                                  'num_incorrect']].apply(sum)
                                .reset_index())

# One stacked bar per tool and size bin; `position` shifts the bars of each
# tool relative to the tick so that the tools end up next to each other, and
# the hatch pattern identifies the tool.
for position, tool, hatch in zip([1.5, 0.5, -0.5],
                                 ['falcon', 'MS-Cluster', 'spectra-cluster'],
                                 ['x', '', '.']):
    (max_peptide_clusters_grouped[max_peptide_clusters_grouped['tool'] == tool]
     .plot.bar(x='interval', position=position, rot=0, stacked=True,
               width=0.2, edgecolor='black', hatch=3 * hatch, ax=axes[3]))

# Hand-built legend: coloured patches explain the stacked segments (the first
# three palette colours, which the stacked columns are assumed to use) and
# white hatched patches explain the tools.
legend_elements = [
    Patch(facecolor='#6da7de', edgecolor='black',label=max_peptide),
    Patch(facecolor='white', edgecolor='black', hatch=3 * 'x',
          label='falcon'),
    Patch(facecolor='#9e0059', edgecolor='black', label='Unidentified'),
    Patch(facecolor='white', edgecolor='black', label='MS-Cluster'),
    Patch(facecolor='#dee000', edgecolor='black', label='Incorrect peptide'),
    Patch(facecolor='white', edgecolor='black', hatch=3 * '.',
          label='spectra-cluster')]
axes[3].legend(handles=legend_elements, loc='center',
               bbox_to_anchor=(0.5, 1.1), ncol=3, frameon=False)

axes[3].set_xlim(-0.5, axes[3].get_xlim()[1])

# Thousands separators on the spectrum counts.
axes[3].yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))

axes[3].set_xlabel('Cluster size')
axes[3].set_ylabel('Clustered spectra')

# Panel letters in the top-left corner of every subplot.
for i, (ax, c) in enumerate(zip(axes, 'ABCD')):
    ax.annotate(c, xy=(-0.15, 1.1), xycoords='axes fraction',
                fontsize='xx-large', weight='bold')

for ax in axes:
    sns.despine(ax=ax)
    
fig.tight_layout()

plt.savefig('cluster_comparison.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

In [None]:
# Flush and close all logging handlers now that the analysis is finished.
logging.shutdown()