Download the raw file and the zipped search results generated by MaxQuant

In [2]:
# Optional one-off download of the two input files (a .raw file and a .zip archive).
# Left commented out; the cells below open both files by bare file name, so they are
# expected to sit next to this notebook. Uncommenting requires the `wget` package.
# import wget
# url1 = "https://ftp.pride.ebi.ac.uk/pride/data/archive/2017/02/PXD004732/01625b_GA1-TUM_first_pool_1_01_01-DDA-1h-R2.raw"
# url2 = "https://ftp.pride.ebi.ac.uk/pride/data/archive/2017/02/PXD004732/TUM_first_pool_1_01_01_DDA-1h-R2-tryptic.zip"

# wget.download(url1)
# wget.download(url2)

Read the `msms.txt` search result and build a dataframe that contains only the unmodified peptides

In [3]:
from zipfile import ZipFile
import pandas as pd
# Read msms.txt straight out of the search-result archive. The archive member is opened
# in its own `with` block so that both the member handle and the archive are closed as
# soon as the table has been parsed.
with ZipFile('TUM_first_pool_1_01_01_DDA-1h-R2-tryptic.zip', 'r') as zip_file:
  with zip_file.open('msms.txt') as msms_file:
    msms = pd.read_csv(msms_file, sep='\t')
# Current PROSIT pipeline does not accommodate modified peptides, so only the rows whose
# 'Modifications' column equals 'Unmodified' are kept (this removes the oxidized peptides).
msms = msms[msms['Modifications'] == 'Unmodified']

Get the scan numbers from the search result and, for each scan, extract the m/z values, intensities, mass analyzer and collision energy from the raw file using fisher_py

In [4]:
from fisher_py import RawFile
from fisher_py.data.business import Scan
import numpy as np


def _leading_collision_energy(scan_type):
  """Return the number parsed from the first '@' segment of ``scan_type`` as a float.

  Every '@'-separated segment after the first piece is processed: it is cut at its
  first space, and the text from its first numeric character onwards is converted with
  ``float``. All segments are parsed, but only the first value is returned.

  Raises IndexError when ``scan_type`` has no '@' or a segment contains no numeric
  character, and ValueError when the trailing text is not a valid float.
  """
  tokens = [segment.split(' ')[0] for segment in scan_type.split('@')[1:]]
  # Locate the first numeric character of every token before converting anything.
  starts = []
  for token in tokens:
    numeric_positions = [pos for pos, char in enumerate(token) if char.isnumeric()]
    starts.append(numeric_positions[0])
  energies = [float(token[start:]) for token, start in zip(tokens, starts)]
  return energies[0]


raw = RawFile('01625b_GA1-TUM_first_pool_1_01_01-DDA-1h-R2.raw')

# Parallel lists with one entry per row of msms, filled in row order.
scan_mzs, scan_ints = [], []
scan_mass_analyzers, scan_collison_energy = [], []

for scan_number in msms['Scan number']:
  # Look the identified scan up in the raw file.
  spectrum = Scan.from_file(raw._raw_file_access, scan_number)
  scan_mzs.append(np.array(spectrum.preferred_masses))
  scan_ints.append(np.array(spectrum.preferred_intensities))
  scan_type = spectrum.scan_type
  # The text in front of the first ' + ' is what gets stored as the mass analyzer.
  scan_mass_analyzers.append(scan_type.split(' + ')[0])
  scan_collison_energy.append(_leading_collision_energy(scan_type))

Generate a dataframe that merges the search results with the information extracted from the raw file

In [5]:

# msms column name -> column name used in annotation_df.
column_names = {
  'Modified sequence': 'MODIFIED_SEQUENCE',
  'Charge': 'PRECURSOR_CHARGE',
  'Scan number': 'SCAN_NUMBER',
  'Length': 'PEPTIDE_LENGTH',
}
# Building the frame from the bare values drops the (filtered) msms index, so the new
# frame gets a fresh default index and the per-scan lists attach to it by position.
annotation_df = pd.DataFrame(
  msms[list(column_names)].values,
  columns=list(column_names.values()),
).assign(
  MZ=scan_mzs,
  INTENSITIES=scan_ints,
  MASS_ANALYZER=scan_mass_analyzers,
  COLLISION_ENERGY=scan_collison_energy,
)

In [None]:
from spectrum_fundamentals.mod_string import maxquant_to_internal
from spectrum_fundamentals.annotation.annotation import annotate_spectra

# Overwrite the sequence column with whatever maxquant_to_internal returns for it.
# NOTE(review): the column is replaced in place, so re-running this cell passes the
# already-converted sequences through the converter again — confirm that is harmless.
maxquant_sequences = annotation_df['MODIFIED_SEQUENCE'].values
annotation_df['MODIFIED_SEQUENCE'] = maxquant_to_internal(maxquant_sequences)

# Annotate the spectra; the bare name on the last line displays the result in the notebook.
annotation = annotate_spectra(annotation_df)
annotation

In [7]:
# Integer code per residue symbol: the 20 one-letter symbols below are numbered 1..20 in
# the order written, and the extra "M(ox)" symbol is appended with code 21.
PROSIT_ALHABET = {
    residue: code for code, residue in enumerate("ACDEFGHIKLMNPQRSTVWY", start=1)
}
PROSIT_ALHABET["M(ox)"] = 21

# Encode every peptide one character at a time. Because the lookup is per character,
# the multi-character "M(ox)" key is never hit here.
sequence_integer = [
    [PROSIT_ALHABET[residue] for residue in peptide]
    for peptide in msms['Sequence']
]

# One indicator column per distinct value found in msms['Charge'].
# NOTE(review): the width and column meaning therefore depend on which charges occur in
# this file — confirm that this matches what the downstream model expects.
precursor_charge_onehot = pd.get_dummies(msms['Charge']).values

# NOTE(review): despite the "aligned_normed" name, the collision energy is taken over
# exactly as stored in annotation_df, with no scaling applied here — confirm intent.
collision_energy_aligned_normed = annotation_df['COLLISION_ENERGY']

# Intensity column of the annotation result.
intensities_raw = annotation['INTENSITIES']

In [8]:
# Output column name -> per-spectrum values. zip pairs the four inputs by position, so
# each row of df holds the four inputs of one spectrum.
feature_columns = {
    'sequence_integer': sequence_integer,
    'precursor_charge_onehot': precursor_charge_onehot,
    'collision_energy': collision_energy_aligned_normed,
    'intensities_raw': intensities_raw,
}
df = pd.DataFrame(list(zip(*feature_columns.values())), columns=list(feature_columns))

In [10]:
# Plain-text preview of the first five rows of the assembled frame.
print(df.head(5))

                         sequence_integer precursor_charge_onehot  \
0                [1, 1, 1, 5, 20, 18, 15]               [0, 1, 0]   
1                [1, 1, 1, 5, 20, 18, 15]               [0, 1, 0]   
2                [1, 1, 1, 5, 20, 18, 15]               [0, 1, 0]   
3                [1, 1, 1, 5, 20, 18, 15]               [0, 1, 0]   
4  [1, 1, 5, 17, 4, 2, 2, 14, 1, 1, 3, 9]               [0, 1, 0]   

   collision_energy                                    intensities_raw  
0              28.0  [0.36918813165578857, 0.0, -1.0, 0.0, 0.0, -1....  
1              35.0  [0.028514689782729, 0.0, -1.0, 0.0, 0.0, -1.0,...  
2              28.0  [0.3452339640378655, 0.0, -1.0, 0.0, 0.0, -1.0...  
3              35.0  [0.030064791591335877, 0.0, -1.0, 0.0, 0.0, -1...  
4              35.0  [0.0, 0.0, -1.0, 0.0, 0.0, -1.0, 0.07584115481...  
