In [None]:
import joblib
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.patches import Patch

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-<name>' and later releases removed the old names, so fall back
# to the renamed sheet when the legacy name is not available.
plt.style.use([
    name if name in plt.style.available
    else name.replace('seaborn-', 'seaborn-v0_8-', 1)
    for name in ('seaborn-white', 'seaborn-paper')])
plt.rc('font', family='serif')
sns.set_palette(['#9e0059', '#6da7de', '#ee266d', '#dee000', '#eb861e'])
sns.set_context('paper', font_scale=1.3)    # Single-column figure.

In [None]:
# Inputs for the comparison figure; all paths are relative to the working
# directory.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code, so
# only load files that come from a trusted source.
cluster_sizes = joblib.load('cluster_comparison_size.joblib')

# Tabular inputs.
hyperparam = pd.read_csv('cluster_comparison_hyperparam.csv')
max_peptide_clusters_grouped = pd.read_csv('cluster_comparison_size.csv')

In [None]:
# Tool names, in the order used for plotting and for the bar plot hue order.
tools = (
    'falcon',
    'MaRaCluster',
    'MS-Cluster',
    'msCRUSH',
    'spectra-cluster',
)

In [None]:
# Four-panel comparison figure, saved to 'cluster_comparison.pdf':
#   A: clustered spectra vs. incorrectly clustered spectra,
#   B: completeness vs. incorrectly clustered spectra,
#   C: complementary ECDF of cluster sizes,
#   D: clustered spectra per cluster-size interval.
width = 7
height = width / 1.618    # 1.618 is approximately the golden ratio.
fig, axes = plt.subplots(2, 2, figsize=(width * 2, height * 2))
axes = np.ravel(axes)    # Flatten the 2x2 grid so panels are axes[0]..axes[3].

# Number of clustered spectra and completeness.
for tool in tools:
    # One curve per tool, restricted to rows with min_cluster_size == 2 and
    # sorted along the x-axis so the line is drawn from left to right.
    tool_performance = (hyperparam[(hyperparam['tool'] == tool) &
                                   (hyperparam['min_cluster_size'] == 2)]
                        .sort_values('prop_clustered_incorrect'))
    for ax, y_column in ((axes[0], 'prop_clustered'),
                         (axes[1], 'completeness')):
        ax.plot(tool_performance['prop_clustered_incorrect'],
                tool_performance[y_column], marker='o', label=tool)

# Panels A and B share the x-axis setup and differ only in y-range and label.
for ax, ylim, ylabel in ((axes[0], (0, 1), 'Clustered spectra'),
                         (axes[1], (0.7, 1), 'Completeness')):
    ax.set_xlim(0, 0.05)
    ax.set_ylim(*ylim)
    # Values are fractions of 1; show them as whole-number percentages.
    ax.xaxis.set_major_formatter(mticker.PercentFormatter(1, 0))
    ax.set_xlabel('Incorrectly clustered spectra')
    ax.set_ylabel(ylabel)
    ax.legend(loc='lower right', frameon=False)

# Only panel A formats its y-axis as a percentage.
axes[0].yaxis.set_major_formatter(mticker.PercentFormatter(1, 0))

# Cluster sizes.
for i, tool in enumerate(tools):
    sizes = cluster_sizes[tool]
    # Only sizes greater than 1 are plotted. Earlier tools get a higher zorder
    # so their curves are drawn on top of later ones.
    sns.ecdfplot(sizes[sizes > 1],
                 stat='proportion', complementary=True, ax=axes[2],
                 label=tool, zorder=len(tools) - i)

axes[2].set_xscale('log')
axes[2].set_ylim(0., 1.01)

axes[2].yaxis.set_major_formatter(mticker.PercentFormatter(1))

axes[2].set_xlabel('Cluster size')
axes[2].set_ylabel('Proportion of clustered spectra')

axes[2].legend(loc='upper right', frameon=False)

# Frequent peptide.
# Total clustered spectra per row = correct + unidentified + incorrect.
# The column is added to the loaded frame in place; re-running the cell
# recomputes the same values.
max_peptide_clusters_grouped['num_clustered'] = (
    max_peptide_clusters_grouped['num_correct'] +
    max_peptide_clusters_grouped['num_unidentified'] +
    max_peptide_clusters_grouped['num_incorrect'])
sns.barplot(x='interval', y='num_clustered', hue='tool',
            data=max_peptide_clusters_grouped, hue_order=tools,
            edgecolor='black', ax=axes[3])
axes[3].legend(loc='upper left', ncol=2, frameon=False)

# Pin the left x-limit at -0.5 and keep the automatically chosen right limit.
axes[3].set_xlim(-0.5, axes[3].get_xlim()[1])

# Thousands separators, no decimals, on the count axis.
axes[3].yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))

axes[3].set_xlabel('Cluster size')
axes[3].set_ylabel('Clustered spectra')

# Panel letters just outside the top-left corner of each axes, and removal of
# the top/right spines.
for ax, panel_label in zip(axes, 'ABCD'):
    ax.annotate(panel_label, xy=(-0.15, 1.1), xycoords='axes fraction',
                fontsize='xx-large', weight='bold')
    sns.despine(ax=ax)

fig.tight_layout()

# Save and close through the explicit figure handle rather than pyplot's
# implicit "current figure".
fig.savefig('cluster_comparison.pdf', dpi=300, bbox_inches='tight')
plt.show()
plt.close(fig)