In [None]:
import os
import sys
# Root directory of the local GLEAMS checkout. `expanduser` is used instead of
# reading the HOME variable directly so that the home directory is also
# resolved on systems where HOME is not set.
os.environ['GLEAMS_HOME'] = os.path.join(os.path.expanduser('~'),
                                         'Projects/gleams')
# Make sure all code is on the Python module search path. The membership test
# keeps the path free of duplicates when this cell is executed more than once.
gleams_src = os.path.normpath(os.path.join(os.environ['GLEAMS_HOME'], 'src'))
if gleams_src not in sys.path:
    sys.path.append(gleams_src)

In [None]:
import functools
import io
import json
import operator
import re
import urllib
import urllib.error
import urllib.request
import zipfile

import joblib
import numpy as np
import pandas as pd
import tqdm.notebook as tqdm

from gleams import config

## Download MassIVE-KB PSM information

In [None]:
# Directory into which the downloaded MassIVE-KB result files are extracted.
dir_name = 'massivekb_ids'
# `exist_ok=True` makes the cell safe to re-run and avoids the race between a
# separate existence check and the directory creation.
os.makedirs(dir_name, exist_ok=True)

In [None]:
# Get metadata for all search tasks:
# MassIVE-KB
#     -> Human HCD Spectral Library
#     -> View All Search Tasks
#     -> Download
# Reset the result first so that a stale path from an earlier execution of this
# cell can never be reused when the download does not contain the file.
filename_task_ids = None
request_search_tasks = urllib.request.Request(
    'https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResult'
    '?task=82c0124b6053407fa41ba98f53fd8d89'
    '&view=view_all_search_tasks', method='POST')
with urllib.request.urlopen(request_search_tasks) as url_search_tasks:
    # The response body is read fully into memory and opened as a ZIP archive.
    with zipfile.ZipFile(io.BytesIO(url_search_tasks.read())) \
            as zip_search_tasks:
        # Only the first archive member that lists the search tasks is needed.
        for filename in zip_search_tasks.namelist():
            if 'all_search_tasks' in filename:
                zip_search_tasks.extract(filename, dir_name)
                filename_task_ids = os.path.join(dir_name, filename)
                break
# Fail here with a clear message instead of with a NameError in a later cell.
if filename_task_ids is None:
    raise FileNotFoundError(
        'The downloaded archive does not contain an "all_search_tasks" file')

In [None]:
# Download the PSMs for all individual tasks.
# Result view to request for each supported workflow type.
workflow_views = {
    # Example: 001812f23bbd4db99f1d4f526b60dbbb
    'MULTIPASS_MSGF_PLUS_DB_SEARCH': 'view_rescored_psms',
    # Example: 0027dc60e863437494475781cd32898e
    'MSGF-PLUS-SYNTHETIC':
        'group_by_spectrum_merged_result_with_kl_with_ambiguity',
    # Example: 002919d2b7a94058a0d2ae21d3eb1608
    'MSGF-PLUS-AMBIGUITY': 'group_by_spectrum'}
# Substrings that identify the PSM file inside a downloaded archive.
result_file_markers = (
    'view_rescored_psms',
    'group_by_spectrum_merged_result_with_kl_with_ambiguity',
    '.mzTab')
# Select the column explicitly instead of using `squeeze=True`, which is not
# available in recent pandas versions.
task_ids = pd.read_csv(filename_task_ids, sep='\t',
                       usecols=['search_task_id'])['search_task_id']
# (task id, reason) for every task whose results could not be retrieved.
tasks_skipped = []
for task_id in tqdm.tqdm(task_ids, desc='Search results downloaded'):
    try:
        with urllib.request.urlopen(
                f'https://proteomics2.ucsd.edu/ProteoSAFe/status_json.jsp'
                f'?task={task_id}') as url_task:
            task_type = json.loads(url_task.read().decode())['workflow']
        # Look the view up per task so that an unknown workflow can never
        # reuse the view of the previously processed task.
        view = workflow_views.get(task_type)
        if view is None:
            tasks_skipped.append(
                (task_id, f'unsupported workflow {task_type!r}'))
            continue
        with urllib.request.urlopen(urllib.request.Request(
                    f'https://proteomics2.ucsd.edu/ProteoSAFe/DownloadResult'
                    f'?task={task_id}&view={view}', method='POST')) \
                as url_download:
            with zipfile.ZipFile(io.BytesIO(url_download.read())) \
                    as zip_download:
                # Extract only the first member that holds the PSMs.
                for filename in zip_download.namelist():
                    if any(marker in filename
                           for marker in result_file_markers):
                        zip_download.extract(filename, dir_name)
                        break
    except (urllib.error.URLError, zipfile.BadZipFile) as error:
        # Best effort: keep downloading the remaining tasks, but remember the
        # failure instead of dropping it silently.
        tasks_skipped.append((task_id, repr(error)))
if tasks_skipped:
    print(f'{len(tasks_skipped)} search task(s) could not be downloaded; '
          f'see `tasks_skipped` for details')

## Combine PSMs

In [None]:
# Substrings that identify the tab-separated PSM exports among the downloads.
tsv_markers = ('view_rescored_psms',
               'group_by_spectrum_merged_result_with_kl_with_ambiguity')
psms_tsv = pd.concat(
    [pd.read_csv(os.path.join(dir_name, filename), sep='\t',
                 usecols=['filename', 'scan', 'sequence', 'decoy'])
     for filename in tqdm.tqdm(os.listdir(dir_name), desc='TSV files read')
     # Zero-byte files are skipped because they have no header to parse.
     if (any(marker in filename for marker in tsv_markers)
         and os.path.getsize(os.path.join(dir_name, filename)) > 0)],
    ignore_index=True, sort=False, copy=False)
# Remove decoy PSMs. The explicit copy makes the result an independent frame,
# so the column assignment below does not raise a SettingWithCopyWarning.
psms_tsv = psms_tsv.loc[psms_tsv['decoy'] == 0,
                        ['filename', 'scan', 'sequence']].copy()
# Remove charge from peptide sequences (a trailing ".<digits>" suffix).
# A raw string keeps the backslashes from being read as string escapes.
regex_no_charge = re.compile(r'\.\d+$')
psms_tsv['sequence'] = psms_tsv['sequence'].apply(
    functools.partial(regex_no_charge.sub, ''))

In [None]:
# Raw strings are used for the patterns so that the backslashes reach the
# regex engine unchanged and do not trigger invalid-escape warnings.
# Matches the "ms_run[<index>]-location" metadata key; captures the run index.
regex_run = re.compile(r'ms_run\[(\d+)\]-location')
# Matches one "<position>-UNIMOD:<accession>" modification entry; captures the
# position and the accession number.
regex_mod = re.compile(r'(\d+)-UNIMOD:(\d+)')
# Mass delta string that is inserted into the peptide sequence for each
# supported UNIMOD accession number.
mod_masses = {1: '+42.011', 4: '+57.021', 5: '+43.006', 7: '+0.984',
              21: '+79.966', 28: '-17.027', 34: '+14.016', 35: '+15.995'}


def _sequence_add_mod(psm):
    """
    Insert modification mass deltas into the peptide sequence of a PSM.

    Parameters
    ----------
    psm : mapping
        Record (for example a DataFrame row) with a 'sequence' string and a
        'modifications' entry. The latter is either missing (NaN/None) or a
        string containing "<position>-UNIMOD:<accession>" items.

    Returns
    -------
    str
        The sequence with the mass string of every modification inserted at
        its position; the unchanged sequence if there are no modifications.

    Raises
    ------
    KeyError
        If a modification accession is not present in `mod_masses`.
    """
    mod_string = psm['modifications']
    if pd.isna(mod_string):
        return psm['sequence']
    # Process the insertions from the highest position to the lowest, so that
    # earlier insertions do not shift the positions of the remaining ones.
    insertions = sorted(
        ((int(position), mod_masses[int(accession)])
         for position, accession in regex_mod.findall(mod_string)),
        key=operator.itemgetter(0), reverse=True)
    peptide = psm['sequence']
    for position, mass in insertions:
        peptide = f'{peptide[:position]}{mass}{peptide[position:]}'
    return peptide


def read_mztab_psms(filename):
    """
    Read the target PSMs from the PSM section of an mzTab file.

    Parameters
    ----------
    filename : str
        Path of the mzTab file.

    Returns
    -------
    pd.DataFrame or None
        A frame with the columns 'filename' (base name of the run location
        declared in the metadata), 'scan' (as string, taken from the
        `spectra_ref` column) and 'sequence' (with modification masses
        inserted). None if the file has no 'PSH' header line or contains no
        non-decoy PSMs.
    """
    run_filenames = {}
    # Number of non-blank lines before the 'PSH' line. `read_csv` skips blank
    # lines by default, so this is the index of the header row.
    skiplines = 0
    try:
        with open(filename) as f_in:
            line = next(f_in).strip()
            while line.split('\t', 1)[0] != 'PSH':
                line = next(f_in).strip()
                if line:
                    skiplines += 1
                    # Only "ms_run[..]-location" lines declare a file name;
                    # other lines mentioning ms_run must be ignored rather
                    # than dereferencing a failed match.
                    match_run = regex_run.search(line)
                    if match_run is not None:
                        run_filenames[match_run.group(1)] = (
                            line.rsplit('\t', 1)[-1].rsplit('/', 1)[-1])
    except StopIteration:
        # The file ended before a 'PSH' line was found.
        return None

    psms = pd.read_csv(filename, sep='\t', header=skiplines,
                       usecols=['sequence', 'modifications',
                                'spectra_ref', 'opt_global_decoy'])
    psms = psms[psms['opt_global_decoy'] == 0].drop_duplicates()
    if psms.empty:
        # Row-wise `apply` on an empty frame does not produce a Series that
        # can be assigned to a column, so report "no PSMs" like above.
        return None
    psms['sequence'] = psms.apply(_sequence_add_mod, axis='columns')
    file_scan = psms['spectra_ref'].str.extract(
        r'ms_run\[(?P<file>\d+)\]:scan=(?P<scan>\d+)')
    # Translate the run index into the file name declared in the metadata.
    psms['filename'] = file_scan['file'].replace(run_filenames)
    psms['scan'] = file_scan['scan']
    return psms[['filename', 'scan', 'sequence']]

In [None]:
# Parse all mzTab files in parallel (`n_jobs=-1`) and stack the PSM tables
# returned by `read_mztab_psms` into a single frame.
mztab_dir = os.path.join(dir_name, 'mzTab')
mztab_jobs = (
    joblib.delayed(read_mztab_psms)(os.path.join(mztab_dir, mztab_name))
    for mztab_name in os.listdir(mztab_dir)
    if mztab_name.lower().endswith('.mztab'))
psms_mztab = pd.concat(joblib.Parallel(n_jobs=-1)(mztab_jobs),
                       ignore_index=True, sort=False, copy=False)

In [None]:
# Combine the PSMs from the TSV exports and the mzTab files.
psms = pd.concat([psms_tsv, psms_mztab], ignore_index=True, copy=False)
# The scan numbers extracted from the mzTab files are strings, so all scans
# are converted to integers *before* deduplicating; otherwise the same scan
# read from both sources would not compare equal and would be kept twice.
# NOTE(review): assumes the TSV 'scan' column holds plain integers — confirm.
psms['scan'] = psms['scan'].astype(np.int64)
psms = psms.drop_duplicates(['filename', 'scan'])

In [None]:
# Merge with spectrum metadata to add precursor m/z and charge information.
# Only spectra that have both metadata and a PSM are kept. An inner join gives
# the same rows as an outer join followed by `dropna()`, without first
# building all unmatched rows. `dropna()` still removes rows in which any
# remaining column is missing.
psms = pd.merge(
    pd.read_parquet(
        os.path.join(os.environ['GLEAMS_HOME'], 'data', 'embed',
                     f'embed_{config.massivekb_task_id}.parquet')),
    psms, how='inner', on=['filename', 'scan'], copy=False).dropna()
psms['scan'] = psms['scan'].astype(np.int64)
psms['charge'] = psms['charge'].astype(np.int64)
psms = (psms[['dataset', 'filename', 'scan', 'sequence', 'charge', 'mz']]
        .sort_values(['dataset', 'filename', 'scan']))

In [None]:
# Write the PSM table (without its index) to the GLEAMS metadata directory;
# the file name carries the MassIVE-KB task id from the configuration.
psms_path = os.path.join(
    os.environ['GLEAMS_HOME'], 'data', 'metadata',
    f'massivekb_ids_{config.massivekb_task_id}.parquet')
psms.to_parquet(psms_path, index=False)