In [None]:
import collections
import os

import joblib
import natsort
import numpy as np
import pandas as pd

## Setup

In [None]:
def get_clusters_falcon(filename, ids=None):
    """
    Read falcon cluster assignments from a CSV file.

    Lines starting with '#' in the file are treated as comments and skipped.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    ids : pd.DataFrame, optional
        Table with at least the columns 'identifier' and 'sequence'. When
        given, it is left-joined onto the cluster assignments by
        'identifier'.

    Returns
    -------
    pd.DataFrame
        The table as read from the file when `ids` is None. Otherwise the
        same table with a 'sequence' column holding
        '<sequence>/<precursor_charge>'; the 'precursor_charge' column is
        taken from the file itself.
    """
    assignments = pd.read_csv(filename, comment='#')
    if ids is None:
        return assignments

    annotated = pd.merge(assignments, ids[['identifier', 'sequence']],
                         how='left', on='identifier')
    # Spectra without a matching identification keep a missing sequence.
    charge_text = annotated['precursor_charge'].astype(str)
    annotated['sequence'] = annotated['sequence'] + '/' + charge_text
    return annotated
    

def get_clusters_maracluster(filename, ids=None):
    """
    Read MaRaCluster cluster assignments from a tab-separated file.

    The file is read without a header as the three columns
    (filename, scan, cluster). Each spectrum gets the identifier
    'mzspec:PXD000561:<file name without directory and extension>:scan:<scan>'.

    Parameters
    ----------
    filename : str
        Path of the tab-separated file to read.
    ids : pd.DataFrame, optional
        Table with at least the columns 'identifier', 'precursor_charge',
        'precursor_mz', and 'sequence'. When given, it is left-joined onto the
        cluster assignments by 'identifier'.

    Returns
    -------
    pd.DataFrame
        The columns 'identifier' and 'cluster' when `ids` is None. Otherwise
        additionally 'precursor_charge' (as int), 'precursor_mz', and
        'sequence' (as '<sequence>/<precursor_charge>'); rows without a
        precursor charge after the join are dropped.
    """
    assignments = pd.read_csv(filename, sep='\t',
                              names=['filename', 'scan', 'cluster'])
    assignments = assignments.dropna(how='all')
    run_names = assignments['filename'].apply(
        lambda path: os.path.splitext(os.path.basename(path))[0])
    assignments['identifier'] = ('mzspec:PXD000561:' + run_names + ':scan:'
                                 + assignments['scan'].astype(str))
    assignments = assignments[['identifier', 'cluster']]
    if ids is None:
        return assignments

    id_columns = ['identifier', 'precursor_charge', 'precursor_mz', 'sequence']
    annotated = pd.merge(assignments, ids[id_columns],
                         how='left', on='identifier')
    # Spectra absent from `ids` have no precursor charge after the left join.
    annotated = annotated.dropna(subset=['precursor_charge'])
    annotated['precursor_charge'] = annotated['precursor_charge'].astype(int)
    charge_text = annotated['precursor_charge'].astype(str)
    annotated['sequence'] = annotated['sequence'] + '/' + charge_text
    return annotated
    
    
def get_clusters_mscluster(dir_name, ids=None):
    """
    Read MS-Cluster cluster assignments from all '.clust' files in a directory.

    Within a file, every line starting with 'mscluster' opens a new cluster.
    Every other non-blank line is split on tabs and its second and third
    fields are read as the file index and spectrum index of a cluster member.
    Clusters are numbered consecutively across all files, starting at 0.

    Parameters
    ----------
    dir_name : str
        Directory containing the '.clust' files.
    ids : pd.DataFrame, optional
        Table with at least the columns 'file_i', 'spectrum_i', 'identifier',
        'precursor_charge', 'precursor_mz', and 'sequence'. When given, it is
        left-joined onto the cluster assignments by ('file_i', 'spectrum_i').

    Returns
    -------
    pd.DataFrame
        The columns 'file_i', 'spectrum_i', and 'cluster' when `ids` is None.
        Otherwise the columns 'identifier', 'cluster', 'precursor_charge'
        (as int), 'precursor_mz', and 'sequence' (as
        '<sequence>/<precursor_charge>'); rows without a precursor charge
        after the join are dropped.
    """
    clusters, cluster_i = [], -1
    # os.listdir returns entries in arbitrary order; sort them so that the
    # cluster numbering is reproducible between runs and machines.
    for filename in sorted(os.listdir(dir_name)):
        if not filename.endswith('.clust'):
            continue
        with open(os.path.join(dir_name, filename)) as f_in:
            for line in f_in:
                if line.startswith('mscluster'):
                    cluster_i += 1
                elif not line.isspace():
                    splits = line.split('\t')
                    file_i = int(splits[1])
                    spectrum_i = int(splits[2])
                    clusters.append((file_i, spectrum_i, cluster_i))
    cluster_labels = pd.DataFrame(clusters, columns=['file_i', 'spectrum_i',
                                                     'cluster'])
    if ids is None:
        return cluster_labels

    # Take an explicit copy of the column selection so that the assignments
    # below operate on an independent frame.
    cluster_labels = (pd.merge(cluster_labels, ids,
                               how='left', on=['file_i', 'spectrum_i'])
                      .dropna(subset=['precursor_charge'])
                      [['identifier', 'cluster', 'precursor_charge',
                        'precursor_mz', 'sequence']]
                      .copy())
    cluster_labels['precursor_charge'] = \
        cluster_labels['precursor_charge'].astype(int)
    cluster_labels['sequence'] = (
        cluster_labels['sequence'] + '/' +
        cluster_labels['precursor_charge'].astype(str))
    return cluster_labels


def get_clusters_mscrush(dir_name, ids=None):
    """
    Read msCRUSH cluster assignments from all '.txt' files in a directory.

    Each file is read as a tab-separated table with the columns 'ID' (the
    cluster label within that file) and 'Titles' (the '|'-separated titles of
    the cluster members). Every title becomes one row with the identifier
    'mzspec:PXD000561:<title>'. Cluster labels of successive files are shifted
    so that they do not overlap with the labels of the files read before.

    Parameters
    ----------
    dir_name : str
        Directory containing the '.txt' files.
    ids : pd.DataFrame, optional
        Table with at least the columns 'identifier', 'precursor_charge',
        'precursor_mz', and 'sequence'. When given, it is left-joined onto the
        cluster assignments by 'identifier'.

    Returns
    -------
    pd.DataFrame
        The columns 'identifier' and 'cluster' when `ids` is None. Otherwise
        additionally 'precursor_charge' (as int), 'precursor_mz', and
        'sequence' (as '<sequence>/<precursor_charge>'); rows without a
        precursor charge after the join are dropped.

    Raises
    ------
    ValueError
        If the directory contains no '.txt' files.
    """
    cluster_labels, cluster_offset = [], 0
    # os.listdir returns entries in arbitrary order; sort them so that the
    # cluster numbering is reproducible between runs and machines.
    for filename in sorted(os.listdir(dir_name)):
        if not filename.endswith('.txt'):
            continue
        clusters_file = (
            pd.read_csv(os.path.join(dir_name, filename), sep='\t')
            .assign(Titles=lambda df: df['Titles'].str.split('|'))
            .explode('Titles')
            .assign(identifier=lambda df: 'mzspec:PXD000561:' + df['Titles'],
                    cluster=lambda df: df['ID'] + cluster_offset)
            [['identifier', 'cluster']])
        # Shift the next file past the largest label seen so far. Using the
        # maximum (rather than the last row) keeps labels unique even when
        # the IDs in a file are not in ascending order, and an empty file
        # simply leaves the offset unchanged.
        if len(clusters_file) > 0:
            cluster_offset = clusters_file['cluster'].max() + 1
        cluster_labels.append(clusters_file)
    if len(cluster_labels) == 0:
        raise ValueError(f'No msCRUSH cluster files (*.txt) found in '
                         f'{dir_name}')
    cluster_labels = pd.concat(cluster_labels, ignore_index=True)
    if ids is None:
        return cluster_labels

    cluster_labels = (pd.merge(cluster_labels,
                               ids[['identifier', 'precursor_charge',
                                    'precursor_mz', 'sequence']],
                               how='left', on='identifier')
                      .dropna(subset=['precursor_charge']))
    cluster_labels['precursor_charge'] = \
        cluster_labels['precursor_charge'].astype(int)
    cluster_labels['sequence'] = (
        cluster_labels['sequence'] + '/' +
        cluster_labels['precursor_charge'].astype(str))
    return cluster_labels


def get_clusters_spectracluster(filename, ids=None):
    """
    Read spectra-cluster cluster assignments from a text file.

    Every line starting with '=Cluster=' opens a new cluster (numbered from
    0). Every line starting with 'SPEC' is a member of the current cluster;
    its run name is the text between 'interim/' and the following '.mgf',
    and its scan number is the text between 'scan=' and the following tab.
    The member gets the identifier 'mzspec:PXD000561:<run>:scan:<scan>'.

    Parameters
    ----------
    filename : str
        Path of the clustering file to read.
    ids : pd.DataFrame, optional
        Table with at least the columns 'identifier', 'precursor_charge',
        'precursor_mz', and 'sequence'. When given, it is left-joined onto the
        cluster assignments by 'identifier'.

    Returns
    -------
    pd.DataFrame
        The columns 'identifier' and 'cluster' when `ids` is None. Otherwise
        additionally 'precursor_charge' (as int), 'precursor_mz', and
        'sequence' (as '<sequence>/<precursor_charge>'); rows without a
        precursor charge after the join are dropped.
    """
    identifiers, clusters = [], []
    current_cluster = -1
    with open(filename) as f_in:
        for line in f_in:
            if line.startswith('=Cluster='):
                current_cluster += 1
                continue
            if not line.startswith('SPEC'):
                continue
            # NOTE: str.find returns -1 when a marker is missing, so lines
            # that do not follow the expected layout yield garbled names
            # rather than an error.
            run_begin = line.find('interim/') + len('interim/')
            run_end = line.find('.mgf', run_begin)
            scan_begin = line.find('scan=') + len('scan=')
            scan_end = line.find('\t', scan_begin)
            run_name = line[run_begin:run_end]
            scan = line[scan_begin:scan_end]
            identifiers.append(f'mzspec:PXD000561:{run_name}:scan:{scan}')
            clusters.append(current_cluster)
    assignments = pd.DataFrame({'identifier': identifiers,
                                'cluster': clusters})
    if ids is None:
        return assignments

    id_columns = ['identifier', 'precursor_charge', 'precursor_mz', 'sequence']
    annotated = pd.merge(assignments, ids[id_columns],
                         how='left', on='identifier')
    # Spectra absent from `ids` have no precursor charge after the left join.
    annotated = annotated.dropna(subset=['precursor_charge'])
    annotated['precursor_charge'] = annotated['precursor_charge'].astype(int)
    charge_text = annotated['precursor_charge'].astype(str)
    annotated['sequence'] = annotated['sequence'] + '/' + charge_text
    return annotated

In [None]:
# Load the spectrum identifications.
ids = pd.read_parquet('kim2014_ids.parquet')
# Replace every 'L' by 'I' in the peptide sequences so that both letters
# compare as equal downstream (presumably because leucine and isoleucine
# cannot be told apart by mass -- confirm).
ids = ids.assign(sequence=ids['sequence'].str.replace('L', 'I'))

## Analyze cluster sizes

In [None]:
# Load the per-run performance table. Later cells read its 'tool' and
# 'prop_clustered_incorrect' columns.
performance = pd.read_csv('cluster_comparison_hyperparam.csv')

In [None]:
# Get the "best" performing runs: per tool, the run whose proportion of
# incorrectly clustered spectra is closest to the target.
target_incorrect, tool_index = 0.01, {}
deviation_from_target = (
    (performance.set_index('tool')['prop_clustered_incorrect']
     - target_incorrect)
    .abs()
    .reset_index())
for tool, tool_performance in deviation_from_target.groupby('tool'):
    tool_deviation = tool_performance['prop_clustered_incorrect']
    is_closest = tool_deviation == tool_deviation.min()
    # Store (position of the run among this tool's rows,
    #        row label of the run after the reset_index above).
    # Ties are resolved in favor of the first run.
    position_in_tool = np.where(is_closest)[0][0]
    row_label = tool_performance[is_closest].index[0]
    tool_index[tool] = (position_in_tool, row_label)

In [None]:
# Show the performance rows of the selected run of every tool.
selected_rows = [row_label for _, row_label in tool_index.values()]
performance.loc[selected_rows].sort_index()

In [None]:
# Read clustering results from the different tools. The number embedded in
# each path is the first element of the tool's `tool_index` entry.
falcon_path = (f'../data/processed/falcon/PXD000561_falcon_'
               f'{tool_index["falcon"][0]}.csv')
maracluster_path = (f'../data/processed/maracluster/PXD000561_maracluster_'
                    f'{tool_index["MaRaCluster"][0]}.tsv')
mscluster_dir = (f'../data/processed/mscluster/PXD000561_mscluster_'
                 f'{tool_index["MS-Cluster"][0]}')
mscrush_dir = (f'../data/processed/mscrush/PXD000561_mscrush_'
               f'{tool_index["msCRUSH"][0]}')
spectracluster_path = (
    f'../data/processed/spectra-cluster/PXD000561_spectra-cluster_'
    f'{tool_index["spectra-cluster"][0]}.txt')

tool_clusters = {
    'falcon': get_clusters_falcon(falcon_path, ids),
    'MaRaCluster': get_clusters_maracluster(maracluster_path, ids),
    'MS-Cluster': get_clusters_mscluster(mscluster_dir, ids),
    'msCRUSH': get_clusters_mscrush(mscrush_dir, ids),
    'spectra-cluster': get_clusters_spectracluster(spectracluster_path, ids),
}

In [None]:
# Relabel the clusters of every tool, turn noise points (label -1) into
# singleton clusters, and record the size of each spectrum's cluster.
min_cluster_size, max_cluster_size = 2, None
for tool, clusters in tool_clusters.items():
    # Use consecutive cluster labels, skipping the noise points.
    # value_counts orders the labels by decreasing cluster size, so the new
    # label of a cluster is its rank by size.
    label_counts = clusters['cluster'].value_counts(dropna=False)
    if -1 in label_counts.index:
        label_counts = label_counts.drop(index=-1)
    # Build the old -> new mapping directly from the index. The column names
    # that value_counts/reset_index produce differ between pandas versions,
    # so they are not relied upon here. Unmapped labels (the noise label)
    # default to -1.
    cluster_map = collections.defaultdict(lambda: -1)
    cluster_map.update(zip(label_counts.index, range(len(label_counts))))
    clusters['cluster'] = clusters['cluster'].map(cluster_map)

    # Reassign noise points to singleton clusters.
    noise_mask = clusters['cluster'] == -1
    num_clusters = clusters['cluster'].max() + 1
    clusters.loc[noise_mask, 'cluster'] = np.arange(
        num_clusters, num_clusters + noise_mask.sum())

    tool_clusters[tool] = clusters

# Add cluster sizes.
for tool, clusters in tool_clusters.items():
    # Name the index and the values explicitly so that the resulting columns
    # are 'cluster' and 'size' regardless of the pandas version.
    cluster_counts = (clusters['cluster']
                      .value_counts(dropna=False)
                      .rename_axis('cluster')
                      .rename('size')
                      .reset_index())
    # Drop a 'size' column left over from a previous execution of this cell;
    # otherwise the merge would produce 'size_x'/'size_y' instead of 'size'.
    tool_clusters[tool] = pd.merge(
        clusters.drop(columns='size', errors='ignore'),
        cluster_counts, on='cluster')

max_size = max([clusters['size'].max() for clusters in tool_clusters.values()])
# Add the overall maximum size to every tool's array. Note that np.insert
# with index -1 places the value before the last element, not at the end.
cluster_sizes = {tool: np.insert(clusters['size'].values, -1, max_size)
                 for tool, clusters in tool_clusters.items()}
_ = joblib.dump(cluster_sizes, 'cluster_comparison_size.joblib')

In [None]:
# Report how many clusters with more than one spectrum each tool produced.
print('Number of clusters per tool:')
for tool, clusters in tool_clusters.items():
    non_singleton_labels = clusters.loc[clusters['size'] > 1, 'cluster']
    print(f'- {tool}: {non_singleton_labels.nunique():,d}')

In [None]:
# Find the most frequently observed peptide (sequence with charge suffix)
# among the falcon results; value_counts sorts by decreasing frequency and
# leaves out missing sequences.
peptide_counts = tool_clusters['falcon']['sequence'].value_counts()
max_peptide = peptide_counts.index.values[0]
num_max_peptide = peptide_counts.values[0]
print(f'Most frequent peptide: {max_peptide} is observed '
      f'{num_max_peptide:,d} times')

In [None]:
# For every tool, characterize the clusters that contain the most frequent
# peptide: count the spectra identified as that peptide, the unidentified
# spectra, and the spectra identified as a different peptide, and bin the
# clusters by size. Intervals are (inclusive minimum, exclusive maximum);
# a maximum of None means the interval is open-ended.
cluster_size_intervals = [(2, 5), (5, 20), (20, 100), (100, 500),
                          (500, 5000), (5000, None)]
max_peptide_clusters = []
for tool, clusters in tool_clusters.items():
    for cluster in (clusters[clusters['sequence'] == max_peptide]
                    ['cluster'].unique()):
        cluster_sequences = clusters.loc[clusters['cluster'] == cluster,
                                         'sequence']
        cluster_size = len(cluster_sequences)
        # Skip clusters smaller than the smallest interval.
        if cluster_size < cluster_size_intervals[0][0]:
            continue
        # Count directly on the sequences instead of looking up a NaN key in
        # value_counts; this avoids the `np.NaN` alias, which no longer
        # exists in NumPy 2.0.
        num_correct = int((cluster_sequences == max_peptide).sum())
        num_unidentified = int(cluster_sequences.isna().sum())
        num_incorrect = cluster_size - num_correct - num_unidentified
        # Only consider clusters where this is the majority peptide.
        if num_correct > num_incorrect:
            for min_interval_size, max_interval_size in cluster_size_intervals:
                if (max_interval_size is None
                        and min_interval_size <= cluster_size):
                    interval = f'{min_interval_size}+'
                    break
                elif min_interval_size <= cluster_size < max_interval_size:
                    interval = f'{min_interval_size}–{max_interval_size}'
                    break
            max_peptide_clusters.append((tool, interval, num_correct,
                                         num_unidentified, num_incorrect))
max_peptide_clusters = (
    pd.DataFrame(max_peptide_clusters, columns=[
        'tool', 'interval', 'num_correct', 'num_unidentified', 'num_incorrect'])
    .sort_values(['tool', 'interval'], key=natsort.natsort_keygen()))
max_peptide_clusters['num_total'] = (max_peptide_clusters['num_correct'] +
                                     max_peptide_clusters['num_unidentified'] +
                                     max_peptide_clusters['num_incorrect'])

# Pad with an all-zero row for every (tool, interval) combination so that
# each tool has an entry for every observed interval after grouping.
max_peptide_clusters_grouped = pd.concat(
    [max_peptide_clusters,
     pd.DataFrame([(tool, interval, 0, 0, 0)
                   for tool in max_peptide_clusters['tool'].unique()
                   for interval in max_peptide_clusters['interval'].unique()],
                  columns=['tool', 'interval', 'num_correct',
                           'num_unidentified', 'num_incorrect'])],
    ignore_index=True)
# Sum per (tool, interval) with the groupby's own `sum`; passing the builtin
# `sum` to `apply` relies on a deprecated pandas substitution.
max_peptide_clusters_grouped = (max_peptide_clusters_grouped
                                .sort_values(['tool', 'interval'],
                                             key=natsort.natsort_keygen())
                                .groupby(['tool', 'interval'], sort=False)
                                [['num_correct', 'num_unidentified',
                                  'num_incorrect']].sum()
                                .reset_index())
max_peptide_clusters_grouped.to_csv('cluster_comparison_size.csv', index=False)

In [None]:
def _five_largest(sizes):
    """Return the five largest values of `sizes`, largest first."""
    return sizes.sort_values(ascending=False).head()


print(f'Size of the top 5 largest clusters for peptide {max_peptide} per tool:')
largest_per_tool = (max_peptide_clusters.groupby('tool')['num_total']
                    .apply(_five_largest))
# Drop the original row labels (second index level) and keep only the tool.
largest_per_tool = largest_per_tool.to_frame().droplevel(1).reset_index()
largest_per_tool.rename(columns={'num_total': 'largest clusters'})

In [None]:
# Report, per tool, how many clusters have the most frequent peptide as their
# majority peptide (one row of `max_peptide_clusters` per such cluster).
print(f'Number of unique clusters for peptide {max_peptide} per tool:')
# Series.iteritems() was removed in pandas 2.0; Series.items() is the
# equivalent that is available in all pandas versions.
for tool, num_clusters in max_peptide_clusters['tool'].value_counts().items():
    print(f'- {tool}: {num_clusters}')