In [None]:
import os
import sys
# Point GLEAMS_HOME at the project checkout in the user's home directory.
# `expanduser` is used instead of `os.environ['HOME']` so that the cell also
# works where HOME is not set (e.g. on Windows); when HOME is set the
# resulting path is the same.
os.environ['GLEAMS_HOME'] = os.path.join(
    os.path.expanduser('~'), 'Projects', 'gleams')
# Make sure the GLEAMS source directory is on the module search path
# (sys.path). The membership check keeps re-runs of this cell from adding
# duplicate entries.
src_dir = os.path.normpath(os.path.join(os.environ['GLEAMS_HOME'], 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
import pyteomics.mzid

In [None]:
# Initialize logging.
from gleams import logger as glogger
glogger.init()
# Initialize all random seeds before importing any packages.
# The statement order in this cell is deliberate: keep `rndm.set_seeds()`
# above the remaining gleams imports instead of hoisting those imports.
# NOTE(review): `pyteomics.mzid` is imported in an earlier cell, i.e. before
# the seeds are set -- presumably harmless, but confirm.
from gleams import rndm
rndm.set_seeds()

# GLEAMS modules used further down: `config` (model file name and charges),
# `feature` (peak-to-feature conversion) and `nn` (embedding).
from gleams import config
from gleams.feature import feature
from gleams.nn import nn

## Download data from PRIDE

In [None]:
%%bash

# Download all files of PXD015575 from the PRIDE FTP archive into
# $GLEAMS_HOME/data/peak/PXD015575.
# The directory prefix is quoted so that a GLEAMS_HOME containing spaces is
# passed as a single argument, and the URL is quoted so that the shell never
# tries to expand the '*' itself; wget performs the FTP globbing.
wget --timestamping --retry-connrefused \
    --directory-prefix="$GLEAMS_HOME/data/peak/PXD015575" --passive-ftp \
    'ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2019/11/PXD015575/*'

In [None]:
%%bash

# Convert every .raw file in the download directory with ThermoRawFileParser,
# skipping files for which a matching .mzML.gz already exists so that the
# cell can be re-run.
# NOTE(review): `-f 2 -g` presumably selects gzipped mzML output, matching
# the .mzML.gz check below -- confirm against the ThermoRawFileParser docs.
peak_dir="$GLEAMS_HOME/data/peak/PXD015575"
for raw_file in "$peak_dir"/*.raw; do
    # If nothing matches, the glob pattern is left as a literal string; skip
    # it instead of handing a non-existent file to the converter.
    [ -e "$raw_file" ] || continue
    # All expansions are quoted so that paths containing spaces stay intact.
    if [ ! -f "$peak_dir/$(basename "$raw_file" .raw).mzML.gz" ]; then
        ThermoRawFileParser -i "$raw_file" -o "$peak_dir" -f 2 -g
    fi
done

## Embed with GLEAMS

In [None]:
def _get_modified_sequence(series):
    sequence, mods = series['PeptideSequence'], series['Modification']
    mods = {mod['location']: str(round(mod['monoisotopicMassDelta'], 3))
            for mod in mods}
    sequence_mod = []
    if 0 in mods:
        sequence_mod.append(f'+{mods[0]}')
    for i, aa in enumerate(sequence, 1):
        sequence_mod.append(aa)
        if i in mods:
            sequence_mod.append(f'+{mods[i]}')
    if len(sequence) + 1 in mods:
        sequence_mod.append(f'+{mods[len(sequence) + 1]}')
    return ''.join(sequence_mod)

In [None]:
# Read all PSMs from the mzIdentML files.
data_dir = os.path.join(os.environ['GLEAMS_HOME'], 'data', 'peak',
                        'PXD015575')
# The directory listing is sorted because `os.listdir` returns entries in
# arbitrary order; a fixed file order keeps the PSM table reproducible.
filenames = [os.path.join(data_dir, filename)
             for filename in sorted(os.listdir(data_dir))
             if filename.endswith('.mzIdentML')]
# Fail early with a clear message instead of an obscure error further down.
if not filenames:
    raise FileNotFoundError(
        f'No .mzIdentML files found in {data_dir}; run the download cell '
        f'first.')
psms = pyteomics.mzid.DataFrame(
    *filenames, recursive=True, retrieve_refs=True, iterative=False,
    read_schema=False, build_id_cache=False)
# Combine peptide sequence and modifications.
psms['sequence'] = psms[['PeptideSequence', 'Modification']].apply(
    _get_modified_sequence, 'columns')
# Apply FDR filtering: keep only non-decoy PSMs that are flagged as passing
# the threshold.
psms = psms[psms['passThreshold'] & ~psms['isDecoy']]
# Convert to metadata table.
metadata = psms[['name', 'scan number(s)', 'chargeState',
                 'experimentalMassToCharge', 'sequence']].rename(
    columns={'name': 'filename', 'scan number(s)': 'scan',
             'chargeState': 'charge', 'experimentalMassToCharge': 'mz'})
metadata['dataset'] = 'PXD015575'
# Cut the name at the '_wPhosSequest.<digits>.<digits>' suffix and append
# '.mzML.gz' to obtain the file name of the converted peak file.
metadata['filename'] = metadata['filename'].str.split(
    r'_wPhosSequest\.\d+\.\d+').str[0] + '.mzML.gz'
metadata['scan'] = metadata['scan'].astype(int)
metadata = metadata[['dataset', 'filename', 'scan', 'sequence', 'charge',
                     'mz']]
# Export metadata file.
filename_metadata = os.path.join(os.environ['GLEAMS_HOME'], 'data',
                                 'metadata', 'metadata_PXD015575.parquet')
# Create the output directory if needed so that the export also works on a
# fresh checkout.
os.makedirs(os.path.dirname(filename_metadata), exist_ok=True)
metadata.to_parquet(filename_metadata, index=False)

In [None]:
# Encode and embed the spectra.
# Both calls receive the path of the metadata parquet file written in the
# previous cell; the embedding step additionally receives the model file
# name and the charges from the gleams config module.
# NOTE(review): presumably the peak files are located via the 'dataset' and
# 'filename' columns of the metadata -- confirm against the gleams sources.
feature.convert_peaks_to_features(filename_metadata)
nn.embed(filename_metadata, config.model_filename, config.charges)