In [None]:
import itertools

import joblib
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.lines import Line2D
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names. Use the legacy name when the
# installed Matplotlib still ships it and the renamed one otherwise, so this
# cell runs on both old and new versions.
style_names = [name if name in plt.style.available
               else name.replace('seaborn-', 'seaborn-v0_8-', 1)
               for name in ('seaborn-white', 'seaborn-paper')]
plt.style.use(style_names)
plt.rc('font', family='sans-serif')
sns.set_palette(['#9e0059', '#6da7de', '#ee266d', '#dee000', '#eb861e'])
sns.set_context('paper', font_scale=1.3)    # Single-column figure.

## Suspects

In [None]:
# Load the three suspect tables (unfiltered, grouped, and unique) from the
# interim data directory, in that order.
suspects_unfiltered, suspects_grouped, suspects_unique = map(
    pd.read_parquet,
    ['../../data/interim/suspects_unfiltered.parquet',
     '../../data/interim/suspects_grouped.parquet',
     '../../data/interim/suspects_unique.parquet'])

In [None]:
# Report the size of each suspect table with thousands separators.
n_grouped = len(suspects_grouped)
n_unfiltered = len(suspects_unfiltered)
n_unique = len(suspects_unique)
print(f'{n_grouped:,} suspects with non-zero mass differences '
      f'collected ({n_unfiltered:,} total)')
print(f'{n_unique:,} unique suspects after duplicate removal and '
      'filtering')

In [None]:
# The suspect USI is colon-delimited; its second and third fields are stored
# as the dataset and the file name of each suspect.
usi_fields = suspects_grouped['SuspectUsi'].str.split(':')
suspects_grouped['Dataset'] = usi_fields.str[1]
suspects_grouped['Filename'] = usi_fields.str[2]

# Number of rows per (compound, adduct, rationale, mass offset group), most
# frequent first.
# NOTE(review): groupby drops rows that have a missing value in any of the key
# columns by default; confirm that leaving those suspects out is intended.
suspect_keys = ['CompoundName', 'Adduct', 'Rationale', 'GroupDeltaMass']
occurrences = suspects_grouped.groupby(suspect_keys)['Dataset'].count()
suspect_count = (occurrences.sort_values(ascending=False)
                 .rename('Count')
                 .reset_index())

In [None]:
# First ten rows of the count table, rounded to three decimals for display.
suspect_count.iloc[:10].round(3)

In [None]:
# Per (compound, adduct, rationale, mass offset group): the number of distinct
# datasets and distinct files in which the suspect occurs, ordered by compound
# name and mass offset (both descending).
suspect_dataset_count = (
    suspects_grouped.groupby(['CompoundName', 'Adduct', 'Rationale',
                              'GroupDeltaMass'])[['Dataset', 'Filename']]
    .nunique().sort_values(['CompoundName', 'GroupDeltaMass'], ascending=False)
    .reset_index())
# After reset_index() the index is only a row counter, so keep it out of the
# exported table (consistent with the other supplementary table export).
suspect_dataset_count.to_csv('supp_table_1_suspect_dataset_count.csv',
                             index=False)

In [None]:
# Ten suspects with the highest number of distinct datasets.
by_dataset = suspect_dataset_count.sort_values('Dataset', ascending=False)
by_dataset.iloc[:10].round(3)

In [None]:
# Ten suspects with the highest number of distinct files.
by_filename = suspect_dataset_count.sort_values('Filename', ascending=False)
by_filename.iloc[:10].round(3)

In [None]:
# Two side-by-side panels with a shared, logarithmic y-axis: how many suspects
# were seen in a given number of datasets (left) and files (right).
width = 7
height = width / 1.618
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(width, height))

# Frequency of each "observed in N datasets/files" value, ordered by N.
dataset_count = suspect_dataset_count['Dataset'].value_counts().sort_index()
filename_count = suspect_dataset_count['Filename'].value_counts().sort_index()

panels = [
    (ax1, dataset_count, '#9e0059', 'Number of datasets observed'),
    (ax2, filename_count, '#6da7de', 'Number of files observed'),
]
for panel_ax, observed, color, xlabel in panels:
    panel_ax.plot(observed.index, observed.values, '.', c=color)
    panel_ax.set_yscale('log')
    panel_ax.set_xlabel(xlabel)
    sns.despine(ax=panel_ax)

# The y-axis is shared, so only the left panel carries the label.
ax1.set_ylabel('Number of suspects')

plt.tight_layout()

plt.savefig('suspect_count.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

## Mass shifts

In [None]:
# Number of suspects per (mass offset group, rationale, atomic difference),
# most frequent first.
# The counts are named *before* reset_index(): renaming column 0 afterwards
# only works on pandas < 2.0, because newer versions already name the
# value_counts() result, which left no 'Count' column to sort on.
delta_mzs = (suspects_grouped[['GroupDeltaMass', 'Rationale', 'AtomicDifference']]
             .value_counts().rename('Count').reset_index()
             .sort_values('Count', ascending=False))

# Suspects whose mass difference lies in [-80.5, -79.5] (bounds inclusive).
# A boolean mask selects each matching row exactly once, even if the index
# contains duplicate labels.
suspects_interval = suspects_grouped.loc[
    suspects_grouped['DeltaMass'].between(-80.5, -79.5)]

# NOTE(review): despite the '.parquet' extension this file is a joblib dump of
# a tuple, not a Parquet file. The name is kept unchanged because other code
# may load it under this name; confirm before renaming.
_ = joblib.dump((delta_mzs, suspects_interval), 'fig1_delta_mzs.parquet')
delta_mzs.sort_values('GroupDeltaMass').to_csv('supp_table_2_delta_mz.csv',
                                               index=False)

In [None]:
# Number of rows in the delta m/z table, with thousands separators.
n_delta_mzs = len(delta_mzs)
print(f"{n_delta_mzs:,} unique (non-zero) delta m/z's")

In [None]:
# Build a table with one row per suspect holding the size of the mass offset
# group it belongs to ('count') and whether that group has an interpretation
# ('label': 'interpreted' or 'unknown').

# Not used in this cell; retained in case another cell refers to them.
group_sizes_known, group_sizes_unknown = [], []
groups = suspects_grouped.copy()
# Suspects without a group mass fall back to their own mass difference;
# missing interpretations get an explicit placeholder so that value_counts()
# does not drop those rows.
groups['GroupDeltaMass'] = groups['GroupDeltaMass'].fillna(groups['DeltaMass'])
groups['Rationale'] = groups['Rationale'].fillna('unspecified')
groups['AtomicDifference'] = groups['AtomicDifference'].fillna('unspecified')
groups = groups[['GroupDeltaMass', 'Rationale', 'AtomicDifference']].value_counts()
counts, labels = [], []
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for (_, rationale, atomic_difference), count in groups.items():
    is_unknown = (rationale == 'unspecified'
                  and atomic_difference == 'unspecified')
    label = 'unknown' if is_unknown else 'interpreted'
    # Repeat the group size once per member so each suspect gets its own row.
    counts.extend(itertools.repeat(count, count))
    labels.extend(itertools.repeat(label, count))
groups = pd.DataFrame({'count': counts, 'label': labels})

In [None]:
# Histogram (log-scaled x-axis) of the mass offset group size per suspect,
# split by whether the group has an interpretation.
width = 7
height = width / 1.618
fig, ax = plt.subplots(figsize=(width, height))

# The legend entries are relabelled by position below, so pin the hue order
# instead of relying on which label happens to come first in the data.
sns.histplot(data=groups, x='count', hue='label',
             hue_order=['interpreted', 'unknown'], stat='count',
             log_scale=True, ax=ax)

ax.set_xlabel('Mass offset group size')
ax.set_ylabel('Number of suspects')

# Replace the automatic legend with one that has readable labels and a title.
legend = ax.get_legend()
# Matplotlib 3.7 renamed Legend.legendHandles to Legend.legend_handles;
# support both spellings.
handles = getattr(legend, 'legend_handles', None)
if handles is None:
    handles = legend.legendHandles
legend.remove()
ax.legend(handles, ['Matched', 'Unknown'], title='Mass offset interpretation')

ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))

sns.despine(ax=ax)

plt.savefig('delta_mass_group_size.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

In [None]:
# First twenty rows of the delta m/z table, rounded to three decimals.
delta_mzs.iloc[:20].round(3)

In [None]:
# Overview of delta m/z frequencies with a zoomed inset around -80.
width = 7
height = width / 1.618
fig, ax = plt.subplots(figsize=(width, height))

# Thin black bars: every delta m/z entry with at least 100 suspects.
frequent = delta_mzs[delta_mzs['Count'] >= 100]
ax.bar(frequent['GroupDeltaMass'], frequent['Count'], width=0.4,
       color='black')

# Wider coloured bars for selected offsets, labelled by atomic difference.
# NOTE(review): positions and heights are hard-coded rather than read from
# delta_mzs; verify them against the current data. The '2H,1O' pair uses
# -18.011 and 18.010, which looks asymmetric; confirm.
highlighted = [
    ('2H', '#6da7de', [-2.016, 2.016], [30124, 21984]),
    ('1C,2H', '#9e0059', [-14.016, 14.016], [15396, 17202]),
    ('1O', '#dee000', [-15.995, 15.995], [8318, 12948]),
    ('2H,1O', '#eb861e', [-18.011, 18.010], [12257, 13653]),
    ('2C,4H', '#943fa6', [-28.031, 28.031], [22503, 23295]),
]
for label, color, positions, heights in highlighted:
    ax.bar(positions, heights, width=1., color=color, label=label)

# Unlabelled bar at -80, drawn in the same colour as the inset frame below.
ax.bar([-80], [625], width=1., color='#ee266d')

ax.set_xlim(-200, 300)

ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))

ax.set_xlabel('Delta $m$/$z$', math_fontfamily='dejavuserif')
ax.set_ylabel('Number of suspects')

ax.legend(loc='upper left', handlelength=1, handleheight=0.01)

sns.despine(ax=ax)

# Inset plot.
axins = inset_axes(ax, width=2, height=1.5)

in_window = suspects_grouped['DeltaMass'].between(-80.5, -79.5)
suspects_interval = suspects_grouped.loc[suspects_grouped.index[in_window]]
# Bin edges span -80.2 to -79.8 in steps of bin_width, shifted by half a bin.
bin_width = 0.002
n_edges = int(0.4 / bin_width) + 1
bins = np.linspace(-80.2, -79.8, n_edges) + bin_width / 2

# Colour suspects by group mass; everything else is drawn in light gray.
# NOTE(review): this relies on exact float equality with the group mass;
# presumably GroupDeltaMass is rounded to three decimals. Confirm.
group_delta = suspects_interval['GroupDeltaMass']
hue = np.full_like(group_delta, 'none', object)
for group_mass, modification in ((-79.967, 'phosphorylation'),
                                 (-79.957, 'sulfation')):
    hue[group_delta == group_mass] = modification
palette = {'phosphorylation': '#63c5b5', 'sulfation': '#ff38ba',
           'none': 'lightgray'}

sns.histplot(suspects_interval, x='DeltaMass', hue=hue, bins=bins,
             multiple='stack', palette=palette, legend=False, ax=axins,
             linewidth=0, alpha=1)

axins.set_xlabel('Delta $m$/$z$', fontsize='x-small',
                 math_fontfamily='dejavuserif')
axins.set_ylabel('Number of suspects', fontsize='x-small')
axins.tick_params(axis='both', which='major', labelsize='x-small')

# Hand-built legend entries, since the histogram's own legend is disabled.
handles = [Line2D([0], [0], color=palette[modification], lw=2,
                  solid_capstyle='butt')
           for modification in ('phosphorylation', 'sulfation')]
axins.legend(handles, ['Phospho', 'Sulfo'], loc='upper right',
             bbox_to_anchor=(1.06, 1), fontsize='xx-small', handlelength=1.2)

sns.despine(ax=axins)

# Frame drawn in axes coordinates of the main plot; not clipped to the axes.
inset_frame = plt.Rectangle((.52, .38), .49, .61, ls='-', lw=1, ec='#ee266d',
                            fc='None', transform=ax.transAxes, clip_on=False,
                            zorder=10)
ax.add_patch(inset_frame)

plt.savefig('delta_mass.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

In [None]:
# Write one stacked histogram per integer delta m/z in [-100, 100), each
# covering the suspects within +/- 0.5 of that integer.
width = 7
height = width / 1.618
bin_width = 0.002

for mz in range(-100, 100):
    # Get the suspects within the current interval; skip empty intervals.
    in_window = suspects_grouped['DeltaMass'].between(mz - 0.5, mz + 0.5)
    suspects_interval = suspects_grouped.loc[
        suspects_grouped.index[in_window]]
    if not len(suspects_interval):
        continue
    # Bin edges across the interval, shifted by half a bin width.
    n_edges = int(1 / bin_width) + 1
    bins = np.linspace(mz - 0.5, mz + 0.5, n_edges) + bin_width / 2

    fig, ax = plt.subplots(figsize=(width, height))

    # Color the groups by frequency: palette colours are assigned from the
    # most to the least frequent group; ungrouped suspects are light gray.
    hue = suspects_interval['GroupDeltaMass'].fillna('ungrouped')
    group_masses = hue[hue != 'ungrouped'].value_counts().index
    group_colors = sns.color_palette(n_colors=hue.nunique())
    palette = dict(zip(group_masses, group_colors))
    palette['ungrouped'] = 'lightgray'

    sns.histplot(suspects_interval, x='DeltaMass', hue=hue, bins=bins,
                 multiple='stack', palette=palette, legend=False, ax=ax,
                 linewidth=0, alpha=1)

    ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))

    ax.set_xlabel('Delta $m$/$z$')
    ax.set_ylabel('Number of suspects')

    sns.despine(ax=ax)

    # Figures are only written to disk, not displayed inline.
    plt.savefig(f'density_{mz}.png', dpi=300, bbox_inches='tight')
    plt.close()