In [None]:
import os
import sys
# Point GLEAMS_HOME at the project checkout below the user's home directory.
# `os.path.expanduser('~')` is used rather than `os.environ['HOME']` so this
# cell does not raise a KeyError where HOME is not defined (e.g. on Windows);
# on POSIX it resolves to $HOME whenever that variable is set.
os.environ['GLEAMS_HOME'] = os.path.join(
    os.path.expanduser('~'), 'Projects', 'gleams')
# Make the project's `src` directory importable. The membership check keeps
# `sys.path` free of duplicates when the cell is re-run.
src_dir = os.path.normpath(os.path.join(os.environ['GLEAMS_HOME'], 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
import functools
import re
import shutil

import joblib
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import ppx
import pyteomics.mgf
import pyteomics.mzid
import scipy.spatial.distance as ssd
import seaborn as sns
from sklearn.metrics import roc_auc_score, roc_curve

In [None]:
# Initialize logging.
from gleams import logger as glogger
glogger.init()
# Initialize all random seeds before importing any packages.
# NOTE: the statement order in this cell is deliberate: the seeding call has
# to stay ahead of the remaining gleams imports below.
# NOTE(review): which libraries `rndm.set_seeds()` seeds is not visible here;
# the pair sampling further down relies on `np.random`, so presumably NumPy is
# covered — confirm.
from gleams import rndm
rndm.set_seeds()

# Project modules used by the analysis cells below.
from gleams import config
from gleams.feature import feature
from gleams.metadata import metadata as md
from gleams.ms_io import ms_io
from gleams.nn import nn

In [None]:
import logging
# Grab the logger named 'gleams' and lower its threshold to DEBUG so that the
# `logger.info(...)` progress messages emitted in this notebook pass the
# logger's own level check.
logger = logging.getLogger('gleams')
logger.setLevel(logging.DEBUG)

In [None]:
# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets from 'seaborn-*' to
# 'seaborn-v0_8-*' and later releases dropped the old names, so asking for
# 'seaborn-white' directly fails on current Matplotlib. Use whichever spelling
# the installed version actually provides.
style_names = ['seaborn-white', 'seaborn-paper']
plt.style.use([
    name if name in plt.style.available
    else name.replace('seaborn-', 'seaborn-v0_8-', 1)
    for name in style_names])
plt.rc('font', family='serif')
sns.set_palette(['#9e0059', '#6da7de', '#ee266d', '#dee000', '#eb861e'])
sns.set_context('paper', font_scale=1.3)    # Single-column figure.

## Download data

In [None]:
# Locations used throughout the notebook, all below GLEAMS_HOME:
#   peak_dir   - downloaded/converted peak files of dataset PXD015943
#   result_dir - files written by this notebook
gleams_home = os.environ['GLEAMS_HOME']
peak_dir = os.path.join(gleams_home, 'data', 'peak', 'PXD015943')
result_dir = os.path.join(gleams_home, 'notebooks', 'phospho')
# Create both directories (including parents) unless they already exist.
for directory in (peak_dir, result_dir):
    os.makedirs(directory, exist_ok=True)

# Metadata file that is exported by the pre-processing cell further down.
filename_metadata = os.path.join(result_dir, 'metadata_PXD015943.parquet')

In [None]:
# Download all RAW and mzIdentML files of project PXD015943; the project is
# looked up with `local=peak_dir`.
proj = ppx.find_project('PXD015943', local=peak_dir)
for pattern in ('*.raw', '*.mzid'):
    proj.download(proj.remote_files(pattern))

In [None]:
%%bash

# Convert every RAW file in the peak directory with ThermoRawFileParser,
# skipping files for which the expected <name>.mzML.gz output already exists.
# All expansions are quoted so paths containing whitespace survive word
# splitting.
peak_dir="$GLEAMS_HOME/data/peak/PXD015943"
for raw_file in "$peak_dir"/*.raw; do
    # Without any match the glob stays a literal '*.raw' string; skip it
    # instead of handing a non-existent file to the converter.
    [ -e "$raw_file" ] || continue
    if [ ! -f "$peak_dir/$(basename "$raw_file" .raw).mzML.gz" ]; then
        # NOTE(review): '-f 2 -g' presumably selects gzipped (indexed) mzML
        # output, matching the .mzML.gz check above — confirm against the
        # ThermoRawFileParser CLI help.
        ThermoRawFileParser -i "$raw_file" -o "$peak_dir" -f 2 -g
    fi
done

## Data pre-processing

In [None]:
def get_modified_sequence(series):
    """
    Build a peptide string with signed mass deltas inserted after the
    modified positions.

    Modification locations are interpreted as: 0 for the N-terminus (delta
    placed before the first residue), 1..len(sequence) for the residues
    (delta placed right after the residue), and len(sequence) + 1 for the
    C-terminus (delta placed at the very end). Modifications with any other
    location are ignored.

    Each mass delta is rounded to 6 decimals. Non-negative deltas are written
    with a leading '+'; negative deltas keep only their own '-' sign (rather
    than the malformed '+-...'). Several modifications on the same location
    are all written, in the order in which they are listed (rather than only
    the last one).

    Parameters
    ----------
    series : mapping
        Must provide 'PeptideSequence' (the unmodified sequence) and
        'Modification' (an iterable of mappings with the keys 'location' and
        'monoisotopicMassDelta').

    Returns
    -------
    str
        The sequence with mass deltas inserted, e.g. 'PEPT+79.966331IDE'.
    """
    sequence, mods = series['PeptideSequence'], series['Modification']
    # Collect the formatted mass deltas per location so that multiple
    # modifications on one position are all retained.
    mods_by_location = {}
    for mod in mods:
        mass = str(round(mod['monoisotopicMassDelta'], 6))
        # Negative values already carry their sign.
        signed_mass = mass if mass.startswith('-') else f'+{mass}'
        mods_by_location.setdefault(mod['location'], []).append(signed_mass)
    sequence_mod = []
    # Pad with empty "residues" for the N-terminus (location 0) and the
    # C-terminus (location len(sequence) + 1) so one loop covers all cases.
    for location, aa in enumerate(['', *sequence, '']):
        sequence_mod.append(aa)
        sequence_mod.extend(mods_by_location.get(location, ()))
    return ''.join(sequence_mod)

In [None]:
# Read all PSMs from the mzIdentML files.
# Reader options are fixed once so every file is parsed identically.
mzid_options = dict(recursive=True, retrieve_refs=True, iterative=False,
                    read_schema=False, build_id_cache=False)
read_mzid = functools.partial(pyteomics.mzid.DataFrame, **mzid_options)

filenames_mzid = [os.path.join(peak_dir, name)
                  for name in os.listdir(peak_dir)
                  if name.endswith('.mzid')]

# Parse the files in parallel (one task per file, all available cores) and
# stack the per-file tables into a single frame with a fresh 0..n-1 index.
psms_per_file = joblib.Parallel(n_jobs=-1)(
    joblib.delayed(read_mzid)(filename) for filename in filenames_mzid)
psms = pd.concat(psms_per_file, ignore_index=True)

In [None]:
# Create a suitable metadata file.
# The spectrum title is expected to look like
#   '<n>: Scan <scan> (rt=<float>) [<windows path>\<name>.raw]'
# from which the scan number and the RAW file's base name are captured.
title_pattern = r'^\d+: Scan (\d+) \(rt=\d+\.\d+\) \[.*\\([^\\]+)\.raw\]$'
metadata = psms['spectrum title'].str.extract(title_pattern)
metadata.columns = ['scan', 'filename']
# Modified sequences are only derived for PSMs that have both a peptide
# sequence and a modification entry; other rows end up without a sequence and
# are removed by the `dropna` below.
sequences = (psms[['PeptideSequence', 'Modification']].dropna()
             .apply(get_modified_sequence, axis='columns'))
metadata = metadata.assign(
    dataset='PXD015943',
    # Peak files were converted to gzipped mzML, named after the RAW file.
    filename=lambda df: df['filename'] + '.mzML.gz',
    # Titles that did not match get a placeholder scan; those rows have no
    # filename either and are therefore dropped below.
    scan=lambda df: df['scan'].fillna(-1).astype(int),
    charge=psms['chargeState'],
    mz=psms['experimentalMassToCharge'],
    sequence=sequences)
metadata = (metadata[['dataset', 'filename', 'scan', 'charge', 'mz', 'sequence']]
            .dropna())
# Restrict to only existing peak files.
filenames_peak = [name for name in os.listdir(peak_dir)
                  if name.endswith('.mzML.gz')]
has_peak_file = metadata['filename'].isin(filenames_peak)
metadata = metadata[has_peak_file]
# Export metadata file.
metadata.to_parquet(filename_metadata, index=False)

## Embed spectra

In [None]:
# Encode and embed the spectra.
# Both steps are driven by the metadata file exported above
# (`filename_metadata`); the embedder additionally receives the model file and
# the charge range from the gleams config.
# NOTE(review): the outputs are not returned here. The cells below read
# data/feature/dataset/PXD015943.parquet and
# data/embed/dataset/PXD015943.{parquet,npy} under GLEAMS_HOME, which are
# presumably produced by these two calls — confirm.
feature.convert_peaks_to_features(filename_metadata)
nn.embed(filename_metadata, config.model_filename, config.charges)

## Evaluate pair distances

In [None]:
# Generate positive and negative pairs from the dataset's feature metadata.
feature_dir = os.path.join(
    os.environ['GLEAMS_HOME'], 'data', 'feature', 'dataset')
filename_feature = os.path.join(feature_dir, 'PXD015943.parquet')
md.generate_pairs_positive(filename_feature, config.charges)
md.generate_pairs_negative(
    filename_feature, config.charges, config.pair_mz_tolerance,
    config.negative_pair_fragment_tolerance,
    config.negative_pair_matching_fragments_threshold)
# Move the per-charge pair files from the feature directory into this
# notebook's result directory, prefixing the file names with 'metadata_'.
for mode in ('pos', 'neg'):
    for charge in np.arange(config.charges[0], config.charges[1] + 1):
        pairs_basename = f'PXD015943_pairs_{mode}_{charge}.npy'
        shutil.move(
            os.path.join(feature_dir, pairs_basename),
            os.path.join(result_dir, f'metadata_{pairs_basename}'))

In [None]:
# Load the metadata that accompanies the encoded features and the embeddings
# of the dataset; for the embeddings only the spectrum key columns are read.
dataset_dirs = {
    kind: os.path.join(os.environ['GLEAMS_HOME'], 'data', kind, 'dataset')
    for kind in ('feature', 'embed')}
metadata_feature = pd.read_parquet(
    os.path.join(dataset_dirs['feature'], 'PXD015943.parquet'))
metadata_embed = pd.read_parquet(
    os.path.join(dataset_dirs['embed'], 'PXD015943.parquet'),
    columns=['dataset', 'filename', 'scan'])

In [None]:
# Attach to every embedded spectrum its feature metadata. `reset_index()`
# exposes the index of the embedding metadata as an 'index' column, which is
# used further down to look up rows of the embedding matrix.
embed_rows = metadata_embed.reset_index()
metadata = (
    metadata_feature
    .merge(embed_rows, how='right', on=['dataset', 'filename', 'scan'])
    # Right join: embedded spectra without matching feature metadata carry
    # missing values and are discarded.
    .dropna()
    .astype({'index': int}))

In [None]:
# Load the embedding matrix of the dataset (indexed by row further down).
filename_embeddings = os.path.join(
    os.environ['GLEAMS_HOME'], 'data', 'embed', 'dataset', 'PXD015943.npy')
embeddings = np.load(filename_embeddings)

In [None]:
# Sample an equal number of positive and negative pairs for every charge.
# At most 10 million pairs are used in total: the budget is split evenly over
# the charge states and, within a charge, evenly over positives and negatives.
pairs_pos, pairs_neg = [], []
num_charges = config.charges[1] + 1 - config.charges[0]
max_num_pairs = 10_000_000 // num_charges
for charge in np.arange(config.charges[0], config.charges[1] + 1):
    candidates_pos = np.load(
        os.path.join(result_dir, f'metadata_PXD015943_pairs_pos_{charge}.npy'))
    candidates_neg = np.load(
        os.path.join(result_dir, f'metadata_PXD015943_pairs_neg_{charge}.npy'))
    # Balanced classes: limited by the scarcer pair type and by the budget.
    num_pairs = min(len(candidates_pos), len(candidates_neg),
                    max_num_pairs // 2)
    logger.info('Using %d positive and negative feature pairs each for '
                'charge %d', num_pairs, charge)
    # Draw without replacement. Positives are drawn before negatives; keep
    # this order so the global NumPy random stream is consumed identically.
    selected_pos = np.random.choice(
        len(candidates_pos), size=num_pairs, replace=False)
    selected_neg = np.random.choice(
        len(candidates_neg), size=num_pairs, replace=False)
    pairs_pos.append(candidates_pos[selected_pos])
    pairs_neg.append(candidates_neg[selected_neg])
# Stack the per-charge selections into one array per pair type.
pairs_pos = np.vstack(pairs_pos)
pairs_neg = np.vstack(pairs_neg)

In [None]:
def _embedded_pair_distances(pairs, batch_size=1_000_000):
    """
    Compute the Euclidean distance between the embeddings of each pair.

    This replaces dispatching one joblib task per pair (which, for millions
    of pairs, is dominated by task overhead) with batched, vectorized NumPy
    arithmetic.

    Parameters
    ----------
    pairs : np.ndarray
        Two-column array; each row holds two labels of the `metadata` index.
        The labels are translated to rows of `embeddings` through the
        'index' column of `metadata`.
    batch_size : int
        Number of pairs processed per vectorized step; bounds the size of
        the temporary arrays.

    Returns
    -------
    np.ndarray
        One distance per row of `pairs`, in the same order. (Previously a
        list; `len`, `np.hstack`, `sns.kdeplot` and `joblib.dump` accept
        either.)

    Raises
    ------
    KeyError
        If a pair refers to a label that is not present in `metadata`
        (the same lookup as before is used).
    """
    embed_rows = metadata['index']
    rows1 = embed_rows.loc[pairs[:, 0]].to_numpy()
    rows2 = embed_rows.loc[pairs[:, 1]].to_numpy()
    dist = np.empty(len(rows1), dtype=np.float64)
    for start in range(0, len(rows1), batch_size):
        stop = start + batch_size
        diff = embeddings[rows1[start:stop]] - embeddings[rows2[start:stop]]
        # Row-wise L2 norm of the difference == Euclidean distance per pair.
        dist[start:stop] = np.linalg.norm(diff, axis=1)
    return dist


dist_pos = _embedded_pair_distances(pairs_pos)
dist_neg = _embedded_pair_distances(pairs_neg)

In [None]:
# Cache both distance collections (positive first, negative second) in a file
# relative to the current working directory, so that they can be reloaded
# without recomputation (see the commented-out `joblib.load` in the next cell).
joblib.dump([dist_pos, dist_neg], 'phospho_pairs_dist.joblib')

In [None]:
# dist_pos, dist_neg = joblib.load('phospho_pairs_dist.joblib')

In [None]:
# Density of the embedded distances for positive versus negative pairs.
width = 7
height = width / 1.618    # golden ratio
fig, ax = plt.subplots(figsize=(width, height))

for distances, label in ((dist_pos, 'Positive pairs'),
                         (dist_neg, 'Negative pairs')):
    sns.kdeplot(distances, shade=True, label=label, ax=ax)

# Distances cannot be negative: start the axis at zero and keep the
# automatically chosen upper limit.
ax.set_xlim(0, ax.get_xlim()[1])

ax.set_xlabel('Embedded distance')
ax.set_ylabel('Density')

ax.legend()

sns.despine()

# The figure is written relative to the current working directory.
plt.savefig('phospho_pairs_dist.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

In [None]:
# ROC analysis with the embedded distance as score: positive pairs are
# labelled 0 and negative pairs 1, so a larger distance votes for "negative".
labels_pos = [0] * len(dist_pos)
labels_neg = [1] * len(dist_neg)
labels = np.concatenate([labels_pos, labels_neg])
dist = np.concatenate([dist_pos, dist_neg])
fpr, tpr, _ = roc_curve(labels, dist)
auc = roc_auc_score(labels, dist)

In [None]:
# Square ROC plot of the pair classification by embedded distance.
width = height = 7 / 1.618
fig, ax = plt.subplots(figsize=(width, height))

# Not used by the plotting code in this cell.
interval = np.linspace(0, 1, 101)
# Pin the end points of the curve (modifies `tpr` in place).
tpr[0] = 0
tpr[-1] = 1
ax.plot(fpr, tpr, label=f'AUC = {auc:.3f}')

# Diagonal reference line.
ax.plot([0, 1], [0, 1], color='black', linestyle='--')

ax.set(xlim=(0, 1), ylim=(0, 1),
       xlabel='False Positive Rate', ylabel='True Positive Rate')

ax.legend(loc='lower right', frameon=False)

sns.despine()

# The figure is written relative to the current working directory.
plt.savefig('phospho_pairs_roc.pdf', dpi=300, bbox_inches='tight')
plt.show()
plt.close()