In [None]:
import os

import natsort
import pandas as pd
import pyteomics.mztab
import tqdm

from ms_io import ms_io

In [None]:
# Get spectrum IDs from MassIVE mzTab files.
#
# For every PSM a USI-style identifier is built from the run name and the scan
# number, and the peptide sequence is annotated with its modifications in
# square brackets. The result is the `psms` data frame with one row per unique
# identifier.
ids, sequences, charges = [], [], []
mztab_dir = '../data/external/RMSV000000091.3'
# Sort the directory listing: `os.listdir` returns entries in arbitrary order
# and `drop_duplicates` below keeps the first PSM per identifier, so a fixed
# file order makes the output reproducible.
for filename in tqdm.tqdm(sorted(os.listdir(mztab_dir))):
    if not filename.lower().endswith('.mztab'):
        continue
    mztab_contents = pyteomics.mztab.MzTab(os.path.join(mztab_dir, filename))
    # Resolve run references (run index -> file name without extension).
    runs = {}
    for key, value in mztab_contents.metadata.items():
        # Only `ms_run[N]-location` entries are relevant. Other keys that
        # contain "ms_run" (e.g. `ms_run[N]-format`) would not yield a valid
        # integer when sliced and must be skipped.
        if key.startswith('ms_run[') and key.endswith(']-location'):
            runs[int(key[7:-10])] = os.path.splitext(
                os.path.basename(value))[0]
    # Get all PSMs. The table is fetched once instead of once per column.
    psm_table = mztab_contents.spectrum_match_table
    for sequence, modifications, spectra_ref, charge in zip(
            psm_table['sequence'], psm_table['modifications'],
            psm_table['spectra_ref'], psm_table['charge']):
        # `spectra_ref` is sliced as `ms_run[N]:...=SCAN`.
        run_id = int(spectra_ref[7:spectra_ref.find(']')])
        scan = int(spectra_ref[spectra_ref.rfind('=') + 1:])
        usi = f'mzspec:PXD000561:{runs[run_id]}:scan:{scan}'
        if modifications is None:
            sequence_new = sequence
        else:
            mods = []
            for modification in modifications.split(','):
                # Split on the first hyphen only: the modification identifier
                # itself may contain hyphens (e.g. a negative mass delta).
                pos, mod = modification.split('-', 1)
                mods.append((int(pos), mod))
            sequence_new = list(sequence)
            # Inserting at list index `pos` places the tag directly after
            # residue `pos` (or before the first residue for position 0).
            # Processing in ascending position order, each earlier insertion
            # shifts later targets by one, hence the `+ i` offset.
            for i, (pos, mod) in enumerate(sorted(
                    mods, key=lambda pm: pm[0])):
                sequence_new.insert(pos + i, f'[{mod}]')
            sequence_new = ''.join(sequence_new)
        ids.append(usi)
        sequences.append(sequence_new)
        charges.append(charge)
psms = (pd.DataFrame({'identifier': ids, 'sequence': sequences,
                      'charge': charges})
        .drop_duplicates('identifier'))

In [None]:
# Get file and spectrum indexes (needed to parse MS-Cluster output).
#
# Each line of the MS-Cluster spectrum list names one peak file; the zero-based
# line number is used as the file index. For every spectrum in a file we record
# the file index, the number after the last ':' of its identifier, the
# prefixed identifier, and the precursor charge and m/z.
spec_list_path = '../data/processed/mscluster/mscluster_0_spec_list.txt'
usi_prefix = 'mzspec:PXD000561:'
index_columns = ['file_i', 'spectrum_i', 'identifier',
                 'precursor_charge', 'precursor_mz']

indexes = []
with open(spec_list_path, 'r') as f_in:
    for file_i, line in tqdm.tqdm(enumerate(f_in)):
        peak_file = line.strip()
        indexes.extend(
            (file_i,
             int(spec.identifier.rpartition(':')[2]),
             usi_prefix + spec.identifier,
             spec.precursor_charge,
             spec.precursor_mz)
            for spec in ms_io.get_spectra(peak_file))
spectra = pd.DataFrame(indexes, columns=index_columns)

In [None]:
# Attach the PSM sequences to the spectrum index and export the result.
# The left join keeps every spectrum, including those without a matching PSM.
output_columns = ['identifier', 'precursor_charge', 'precursor_mz',
                  'sequence', 'file_i', 'spectrum_i']
annotated = spectra.merge(psms, how='left', on='identifier')
# Natural sort so that numeric parts of the identifiers order numerically.
annotated = annotated.sort_values('identifier', key=natsort.natsort_keygen())
# Selecting the output columns drops the `charge` column contributed by `psms`.
spectra = annotated.reset_index(drop=True)[output_columns]
spectra.to_parquet('kim2014_ids.parquet')