In [None]:
import os
import sys
# Location of the local GLEAMS checkout. Later cells build their data paths
# from this environment variable.
os.environ['GLEAMS_HOME'] = os.path.join(os.environ['HOME'],
                                         'Projects/gleams')
# Make sure all code is in the PATH. The membership check keeps this cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
gleams_src_dir = os.path.normpath(
    os.path.join(os.environ['GLEAMS_HOME'], 'src'))
if gleams_src_dir not in sys.path:
    sys.path.append(gleams_src_dir)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import umap
from scipy.stats import pearsonr

In [None]:
import logging
# Show every message (DEBUG and above) emitted through the 'gleams' logger.
logger = logging.getLogger(name='gleams')
logger.setLevel(level=logging.DEBUG)

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
from gleams.dag import dag

from gleams import config
from gleams.ms_io import ms_io
from gleams.nn import data_generator, embedder, nn

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-<name>' and later releases dropped the old names, so use
# whichever spelling this installation provides.
plt.style.use([
    versioned if versioned in plt.style.available else legacy
    for legacy, versioned in (('seaborn-white', 'seaborn-v0_8-white'),
                              ('seaborn-paper', 'seaborn-v0_8-paper'))])
plt.rc('font', family='sans-serif')
sns.set_palette('Set1')
sns.set_context('paper', font_scale=1.3)    # Single-column figure.

In [None]:
# Name of the data split to analyze; it selects which embedding files
# (embed_<task id>_<split>.parquet / .npy) are loaded further down.
split = 'test'

In [None]:
# Metadata of the embedded spectra, annotated (left join on filename and
# scan) with the peptide identifications. Duplicate (filename, scan) entries
# are removed from the identifications first so that the join cannot
# multiply rows.
data_dir = os.path.join(os.environ['GLEAMS_HOME'], 'data')
metadata = pd.read_parquet(
    os.path.join(data_dir, 'embed',
                 f'embed_{config.massivekb_task_id}_{split}.parquet')
).merge(
    pd.read_parquet(
        os.path.join(data_dir, 'metadata',
                     f'massivekb_ids_{config.massivekb_task_id}.parquet')
    ).drop_duplicates(['filename', 'scan']),
    how='left', on=['filename', 'scan'])
# Keep precursor charges up to 4. Rows whose charge is missing (NaN) fail
# the comparison and are dropped as well.
metadata = metadata.loc[metadata['charge'] <= 4]
# Only include frequently occurring peptides.
min_peptide_count = 5
peptide_counts = metadata['sequence'].value_counts()
frequent_sequences = peptide_counts[
    peptide_counts >= min_peptide_count].index
metadata = metadata.loc[metadata['sequence'].isin(frequent_sequences)]

In [None]:
# Work with a random subset of at most 5 million spectra.
num_embeddings = min(len(metadata), 5000000)
# Draw the subset from a dedicated, seeded generator so that the same
# spectra are selected after every "Restart & Run All". The global NumPy
# random state is neither read nor modified.
subsample_seed = 42
selected_idx = np.random.RandomState(subsample_seed).choice(
    len(metadata), num_embeddings, replace=False)
metadata = metadata.iloc[selected_idx]
# The index labels of `metadata` are used as row numbers into the embedding
# matrix. NOTE(review): this assumes the labels are still the row positions
# of the embedding metadata file, i.e. that the merge which built `metadata`
# kept one row per embedding in file order -- confirm.
# The file is memory-mapped, so only the selected rows are read and copied.
gleams_embeddings = np.load(
    os.path.join(os.environ['GLEAMS_HOME'], 'data', 'embed',
                 f'embed_{config.massivekb_task_id}_{split}.npy'),
    mmap_mode='r')[metadata.index]

In [None]:
# Report every embedding dimension whose Pearson correlation with a precursor
# property exceeds this absolute value.
min_r = 0.5

for header, column in (('Dimensions correlated with charge:', 'charge'),
                       ('Dimensions correlated with m/z:', 'mz')):
    print(header)
    precursor_values = metadata[column]
    for dim in range(gleams_embeddings.shape[1]):
        # The p-value is not used; only the correlation strength matters.
        r, pval = pearsonr(gleams_embeddings[:, dim], precursor_values)
        if abs(r) > min_r:
            print(dim, '\t', f'{r:6.3f}')

In [None]:
# Project the GLEAMS embeddings with UMAP for plotting. The first positional
# argument of `umap.UMAP` is the neighborhood size, which is spelled out here;
# it reuses the minimum peptide count as its value.
umap_embeddings = umap.UMAP(n_neighbors=min_peptide_count).fit_transform(
    gleams_embeddings)

In [None]:
width = 7
height = width / 1.618    # Aspect ratio close to the golden ratio.
fig, ax = plt.subplots(figsize=(width, height))

# Color every spectrum by its precursor m/z. The colormap is passed by name:
# `plt.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9,
# whereas a registered colormap name is accepted by all releases.
sc = ax.scatter(umap_embeddings[:, 0], umap_embeddings[:, 1], s=1,
                c=metadata['mz'], marker='.', cmap='YlGnBu',
                alpha=0.1, rasterized=True)
# Dedicated colorbar axes ([left, bottom, width, height] in figure
# fractions), placed to the right of the scatter plot.
cbar_ax = fig.add_axes([0.95, 0.25, 0.025, 0.5])
colorbar = fig.colorbar(sc, cax=cbar_ax)
# The points are drawn almost transparent; keep the colorbar itself opaque.
colorbar.solids.set(alpha=1)
colorbar.set_label('Precursor m/z', size='large', labelpad=15)

ax.axis('off')

plt.savefig('embed_umap_mz.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

In [None]:
width = 7
height = width / 1.618    # Aspect ratio close to the golden ratio.
fig, ax = plt.subplots(figsize=(width, height))

# Color every spectrum by its precursor charge, using one discrete color per
# distinct charge value. `plt.get_cmap(name, lut)` replaces
# `plt.cm.get_cmap`, which was deprecated in Matplotlib 3.7 and removed in
# 3.9; the pyplot function takes the same arguments.
charges = np.sort(metadata['charge'].unique())
sc = ax.scatter(umap_embeddings[:, 0], umap_embeddings[:, 1], s=1,
                c=metadata['charge'], marker='.',
                cmap=plt.get_cmap('YlGnBu', len(charges)), alpha=0.1,
                rasterized=True)
# Dedicated colorbar axes ([left, bottom, width, height] in figure
# fractions), placed to the right of the scatter plot.
cbar_ax = fig.add_axes([0.95, 0.25, 0.025, 0.5])
colorbar = fig.colorbar(sc, cax=cbar_ax, ticks=charges, values=charges)
# The points are drawn almost transparent; keep the colorbar itself opaque.
colorbar.solids.set(alpha=1)
colorbar.set_label('Precursor charge', size='large', labelpad=15)

ax.axis('off')

plt.savefig('embed_umap_charge.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

In [None]:
# For every (sequence, charge) combination, collect the positional row
# numbers of its spectra in `metadata` (the frame is renumbered 0..n-1
# first, and that numbering becomes the 'index' column).
peptide_rows = (metadata.reset_index(drop=True)
                .reset_index()
                .groupby(['sequence', 'charge'])['index']
                .apply(list)
                .reset_index())
# Add the number of spectra per combination and list the largest first.
frequent_peptides = (
    peptide_rows
    .assign(count=peptide_rows['index'].apply(len))
    .sort_values('count', ascending=False))

In [None]:
# Preview the first ten rows of the (sequence, charge) table.
frequent_peptides.head(10)

In [None]:
width = 7
height = width / 1.618    # Aspect ratio close to the golden ratio.
fig, ax = plt.subplots(figsize=(width, height))

# All spectra as a faint gray background.
sc = ax.scatter(umap_embeddings[:, 0], umap_embeddings[:, 1], s=1,
                c='lightgray', marker='.', alpha=0.1, rasterized=True)
# Highlight the first ten rows of `frequent_peptides`, ordered by sequence
# and charge so that the legend entries are sorted.
selected_peptides = frequent_peptides.head(10).sort_values(['sequence',
                                                            'charge'])
for indexes, peptide, charge in zip(selected_peptides['index'],
                                    selected_peptides['sequence'],
                                    selected_peptides['charge']):
    # Slice the points of this peptide once instead of once per axis.
    peptide_points = umap_embeddings[indexes]
    # Format the charge without decimals. NOTE(review): a left merge can
    # upcast an integer charge column to float when some rows have no match,
    # which would otherwise render labels such as 'PEPTIDE/2.0' -- confirm
    # against the metadata; integer charges are printed unchanged.
    ax.scatter(peptide_points[:, 0], peptide_points[:, 1],
               marker='o', label=f'{peptide}/{charge:.0f}')

ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)

ax.axis('off')

plt.savefig('embed_umap_frequent_peptides.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()