In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed the bundled seaborn style sheets to
# 'seaborn-v0_8-<name>' and later releases removed the old names, so fall
# back to the renamed sheet whenever the original name is not available.
plt.style.use([
    style if style in plt.style.available
    else style.replace('seaborn-', 'seaborn-v0_8-', 1)
    for style in ('seaborn-white', 'seaborn-paper')])
plt.rc('font', family='serif')
sns.set_palette('Set1')
sns.set_context('paper', font_scale=1.3)

## Suspects

In [None]:
# Load the three interim suspect tables (unfiltered, grouped and unique).
unfiltered_path = '../../data/interim/suspects_unfiltered.parquet'
grouped_path = '../../data/interim/suspects_grouped.parquet'
unique_path = '../../data/interim/suspects_unique.parquet'

suspects_unfiltered = pd.read_parquet(unfiltered_path)
suspects_grouped = pd.read_parquet(grouped_path)
suspects_unique = pd.read_parquet(unique_path)

In [None]:
# Report the row counts of the grouped, unfiltered and unique suspect tables.
collected_summary = (
    f'{len(suspects_grouped):,} suspects with non-zero mass differences '
    f'collected ({len(suspects_unfiltered):,} total)')
unique_summary = (
    f'{len(suspects_unique):,} unique suspects after duplicate removal and '
    'filtering')
print(collected_summary)
print(unique_summary)

In [None]:
# The second colon-separated field of the suspect USI is stored as the
# dataset identifier.
usi_fields = suspects_grouped['SuspectUsi'].str.split(':')
suspects_grouped['Dataset'] = usi_fields.str[1]

# Number of rows per (compound, adduct, rationale, group delta mass)
# combination, most frequent first.
suspect_key_columns = ['CompoundName', 'Adduct', 'Rationale',
                       'GroupDeltaMass']
rows_per_suspect = (suspects_grouped.groupby(suspect_key_columns)['Dataset']
                    .count())
suspect_count = (rows_per_suspect
                 .sort_values(ascending=False)
                 .rename('Count')
                 .reset_index())

In [None]:
# Show the ten most frequent suspects, rounded to three decimals.
suspect_count.head(n=10).round(decimals=3)

In [None]:
# For every (compound, adduct, rationale, group delta mass) combination,
# count the distinct datasets it occurs in, largest count first.
dataset_key_columns = ['CompoundName', 'Adduct', 'Rationale',
                       'GroupDeltaMass']
datasets_per_suspect = (
    suspects_grouped.groupby(dataset_key_columns)['Dataset'].nunique())
suspect_dataset_count = (datasets_per_suspect
                         .sort_values(ascending=False)
                         .rename('Unique datasets observed')
                         .reset_index())

In [None]:
# Show the ten suspects observed in the most datasets, rounded to three
# decimals.
suspect_dataset_count.head(n=10).round(decimals=3)

In [None]:
# How many suspects were observed in exactly k datasets, ordered by k.
dataset_count = (suspect_dataset_count['Unique datasets observed']
                 .value_counts()
                 .sort_index())

# Figure size with a width:height ratio of 1.618 (approximately golden).
width = 7
height = width / 1.618
fig, ax = plt.subplots(figsize=(width, height))

ax.plot(dataset_count.index, dataset_count.values, '.')
ax.set(yscale='log',
       xlabel='Number of datasets observed',
       ylabel='Number of suspects')
sns.despine(ax=ax)

fig.savefig('suspect_dataset_count.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close(fig)

## Mass shifts

In [None]:
# Count how often each (group delta mass, rationale, atomic difference)
# combination occurs, most frequent first.
# The counts Series is named *before* `reset_index()` because the name of
# the `value_counts()` result differs between pandas versions (unnamed in
# 1.x, 'count' in 2.x); renaming column `0` afterwards silently does nothing
# on pandas >= 2.0 and makes the sort on 'Count' fail.
delta_mzs = (suspects_grouped[['GroupDeltaMass', 'Rationale', 'AtomicDifference']]
             .value_counts().rename('Count').reset_index()
             .sort_values('Count', ascending=False))

In [None]:
# Report the number of distinct rows in the delta m/z table.
delta_mz_summary = f"{len(delta_mzs):,} unique (non-zero) delta m/z's"
print(delta_mz_summary)

In [None]:
# Show the twenty most frequent delta m/z entries, rounded to three decimals.
delta_mzs.head(n=20).round(decimals=3)

In [None]:
# Bar chart of the most frequent delta masses.
width = 7
height = width / 1.618
# Wider and flatter than the other figures.
fig, ax = plt.subplots(figsize=(width * 1.5, height / 1.5))

# Only plot delta masses that occur at least 100 times; filter once and
# reuse the result for both the x positions and the bar heights.
frequent_delta_mzs = delta_mzs[delta_mzs['Count'] >= 100]
ax.bar(frequent_delta_mzs['GroupDeltaMass'], frequent_delta_mzs['Count'],
       width=0.4, color='black')

ax.set_xlim(-300, 300)

# Thousands separators on the count axis.
ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))

ax.set_xlabel('Delta $m$/$z$')
ax.set_ylabel('Number of suspects')

sns.despine(ax=ax)

fig.savefig('delta_mass.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close(fig)

In [None]:
# Save one stacked histogram of delta masses per unit-wide interval centred
# on each integer value, colored by the delta mass group of each suspect.
width = 7
height = width / 1.618
# Histogram bin width; loop-invariant, so defined once.
bin_width = 0.002

# NOTE(review): range(-100, 100) stops at 99, so no figure is produced for
# the interval around +100 — confirm this asymmetry is intended.
for mz in range(-100, 100):
    # Get the suspects within the current interval. Plain boolean indexing
    # keeps exactly one row per match, even if the index labels are not
    # unique.
    in_interval = suspects_grouped['DeltaMass'].between(mz - 0.5, mz + 0.5)
    suspects_interval = suspects_grouped[in_interval]
    if len(suspects_interval) == 0:
        continue
    # Bin edges spaced `bin_width` apart and shifted up by half a bin.
    # NOTE(review): because of the shift, values in
    # [mz - 0.5, mz - 0.5 + bin_width / 2) are selected above but lie below
    # the first bin edge — confirm this is acceptable.
    bins = (np.linspace(mz - 0.5, mz + 0.5, int(1 / bin_width) + 1)
            + bin_width / 2)

    fig, ax = plt.subplots(figsize=(width, height))

    # Color the groups by frequency: the most frequent group gets the first
    # palette color; suspects without a group are drawn in light gray.
    hue = suspects_interval['GroupDeltaMass'].fillna('ungrouped')
    groups_by_frequency = hue[hue != 'ungrouped'].value_counts().index
    palette = dict(zip(groups_by_frequency,
                       sns.color_palette(n_colors=hue.nunique())))
    palette['ungrouped'] = 'lightgray'

    sns.histplot(suspects_interval, x='DeltaMass', hue=hue, bins=bins,
                 multiple='stack', palette=palette, legend=False, ax=ax,
                 linewidth=0, alpha=1)

    # Thousands separators on the count axis.
    ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))

    ax.set_xlabel('Delta $m$/$z$')
    ax.set_ylabel('Number of suspects')

    sns.despine(ax=ax)

    fig.savefig(f'density_{mz}.png', dpi=300, bbox_inches='tight')
    # Figures are only written to disk; close each one so they do not
    # accumulate in memory over the loop.
    plt.close(fig)