In [None]:
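# Optional one-time download of the two input files from the PRIDE archive (PXD004732).
# Uncomment the lines below to fetch them into the working directory.
# import wget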
# url1 = "https://ftp.pride.ebi.ac.uk/pride/data/archive/2017/02/PXD004732/01625b_GA1-TUM_first_pool_1_01_01-DDA-1h-R2.raw"
# url2 = "https://ftp.pride.ebi.ac.uk/pride/data/archive/2017/02/PXD004732/TUM_first_pool_1_01_01_DDA-1h-R2-tryptic.zip"

# wget.download(url1)
# wget.download(url2)

In [3]:
from zipfile import ZipFile
import pandas as pd

# Read the tab-separated msms.txt table straight out of the results archive.
# The member handle gets its own `with` so that it is closed explicitly
# instead of lingering until garbage collection.
with ZipFile('TUM_first_pool_1_01_01_DDA-1h-R2-tryptic.zip', 'r') as zip_file:
    with zip_file.open('msms.txt') as msms_file:
        msms = pd.read_csv(msms_file, sep='\t')

# The current PROSIT pipeline does not accommodate modified peptides, so keep
# only the rows whose 'Modifications' value is exactly 'Unmodified'.
msms = msms[msms['Modifications'] == 'Unmodified']

In [4]:
from fisher_py import RawFile
from fisher_py.data.business import Scan
import numpy as np

raw = RawFile('01625b_GA1-TUM_first_pool_1_01_01-DDA-1h-R2.raw')

# Per-scan accumulators; each receives one entry per value of
# msms['Scan number'], in iteration order.
scan_mzs, scan_ints = [], []
scan_mass_analyzers, scan_collison_energy = [], []

for scan in msms['Scan number']:
    raw_scan = Scan.from_file(raw._raw_file_access, scan)

    # Peak lists are stored as numpy arrays.
    scan_mzs.append(np.array(raw_scan.preferred_masses))
    scan_ints.append(np.array(raw_scan.preferred_intensities))

    # Whatever precedes the first ' + ' in the scan-type string is recorded
    # as the mass analyzer.
    scan_mass_analyzers.append(raw_scan.scan_type.partition(' + ')[0])

    # For every '@' in the scan-type string, take the text that follows it up
    # to the next space.
    activation_tokens = [
        piece.partition(' ')[0] for piece in raw_scan.scan_type.split('@')[1:]
    ]
    # Index of the first numeric character in each token (IndexError if a
    # token contains none).
    digit_starts = [
        [pos for pos, char in enumerate(token) if char.isnumeric()][0]
        for token in activation_tokens
    ]
    # The numeric tail of each token is parsed as a float.
    energies = [
        float(token[start:])
        for start, token in zip(digit_starts, activation_tokens)
    ]
    # Only the first energy is kept; a scan type without any '@' raises
    # IndexError here.
    scan_collison_energy.append(energies[0])

In [5]:
from spectrum_fundamentals.annotation.annotation import annotate_spectra
from spectrum_fundamentals.mod_string import maxquant_to_internal

# msms column -> column name used in the annotation frame.
column_names = {
    'Modified sequence': 'MODIFIED_SEQUENCE',
    'Charge': 'PRECURSOR_CHARGE',
    'Scan number': 'SCAN_NUMBER',
    'Length': 'PEPTIDE_LENGTH',
}

# Building the frame from `.values` drops the index of `msms`, so the new
# frame gets a fresh positional index that matches the per-scan lists.
annotation_df = pd.DataFrame(
    msms[list(column_names)].values,
    columns=list(column_names.values()),
).assign(
    MZ=scan_mzs,
    INTENSITIES=scan_ints,
    MASS_ANALYZER=scan_mass_analyzers,
    COLLISION_ENERGY=scan_collison_energy,
)

# Convert the modified-sequence strings with maxquant_to_internal before
# annotating.
internal_sequences = maxquant_to_internal(annotation_df['MODIFIED_SEQUENCE'].values)
annotation_df['MODIFIED_SEQUENCE'] = internal_sequences

annotation = annotate_spectra(annotation_df)
annotation

Unnamed: 0,INTENSITIES,MZ,CALCULATED_MASS,removed_peaks
0,"[0.36918813165578857, 0.0, -1.0, 0.0, 0.0, -1....","[175.11929321289062, 0.0, -1.0, 0.0, 0.0, -1.0...",796.423175,0
1,"[0.028514689782729, 0.0, -1.0, 0.0, 0.0, -1.0,...","[175.25360107421875, 0.0, -1.0, 0.0, 0.0, -1.0...",796.423175,0
2,"[0.3452339640378655, 0.0, -1.0, 0.0, 0.0, -1.0...","[175.11927795410156, 0.0, -1.0, 0.0, 0.0, -1.0...",796.423175,0
3,"[0.030064791591335877, 0.0, -1.0, 0.0, 0.0, -1...","[175.16168212890625, 0.0, -1.0, 0.0, 0.0, -1.0...",796.423175,0
4,"[0.0, 0.0, -1.0, 0.0, 0.0, -1.0, 0.07584115481...","[0.0, 0.0, -1.0, 0.0, 0.0, -1.0, 262.248901367...",1370.559489,0
...,...,...,...,...
11965,"[0.009784486409648692, 0.0, -1.0, 0.0, 0.0, -1...","[147.1424102783203, 0.0, -1.0, 0.0, 0.0, -1.0,...",914.474935,0
11966,"[0.23857646569260368, 0.0, -1.0, 0.0, 0.0, -1....","[147.11309814453125, 0.0, -1.0, 0.0, 0.0, -1.0...",914.474935,0
11967,"[0.012048242613237779, 0.0, -1.0, 0.0, 0.0, -1...","[147.1204376220703, 0.0, -1.0, 0.0, 0.0, -1.0,...",914.474935,0
11968,"[0.39071905153057307, 0.0, -1.0, 0.0, 0.0, -1....","[147.11328125, 0.0, -1.0, 0.0, 0.0, -1.0, 276....",914.474935,0


In [None]:
# Integer code assigned to each residue token.
PROSIT_ALPHABET = {
    "A": 1,
    "C": 2,
    "D": 3,
    "E": 4,
    "F": 5,
    "G": 6,
    "H": 7,
    "I": 8,
    "K": 9,
    "L": 10,
    "M": 11,
    "N": 12,
    "P": 13,
    "Q": 14,
    "R": 15,
    "S": 16,
    "T": 17,
    "V": 18,
    "W": 19,
    "Y": 20,
    "M(ox)": 21,
}
# Backward-compatible alias for the original, misspelled constant name.
PROSIT_ALHABET = PROSIT_ALPHABET

# Encode each peptide residue by residue. Iterating a sequence string yields
# single characters, so the multi-character "M(ox)" key is never looked up
# here; any character missing from the alphabet raises KeyError.
sequence_integer = [
    [PROSIT_ALPHABET[residue] for residue in sequence]
    for sequence in msms['Sequence']
]

# One-hot encode the precursor charge. The dtype is pinned because pandas >= 2.0
# returns booleans by default, whereas older versions returned uint8 0/1 values.
# NOTE(review): the number of columns equals the number of distinct charges
# present in `msms` -- confirm whether downstream code expects a fixed width.
precursor_charge_onehot = pd.get_dummies(msms['Charge'], dtype='uint8').values

# NOTE: despite the variable name, these are the collision energies exactly as
# stored in annotation_df; no alignment or normalisation is applied here.
collision_energy_aligned_normed = annotation_df['COLLISION_ENERGY']
intensities_raw = annotation['INTENSITIES']

# zip() pairs the four inputs by position, so pandas index labels play no role.
df = pd.DataFrame(list(zip(sequence_integer, precursor_charge_onehot, collision_energy_aligned_normed, intensities_raw)),
                  columns=['sequence_integer', 'precursor_charge_onehot', 'collision_energy', 'intensities_raw'])

print(df.head())

In [11]:
# Add an 'NCE' column holding the collision energy scaled by a constant factor.
# NOTE(review): 0.0108 is an unexplained scale factor -- confirm its origin and
# consider giving it a name.
df['NCE'] = 0.0108 * df['collision_energy']

print(df.head())


                         sequence_integer precursor_charge_onehot  \
0                [1, 1, 1, 5, 20, 18, 15]               [0, 1, 0]   
1                [1, 1, 1, 5, 20, 18, 15]               [0, 1, 0]   
2                [1, 1, 1, 5, 20, 18, 15]               [0, 1, 0]   
3                [1, 1, 1, 5, 20, 18, 15]               [0, 1, 0]   
4  [1, 1, 5, 17, 4, 2, 2, 14, 1, 1, 3, 9]               [0, 1, 0]   

   collision_energy                                    intensities_raw     NCE  
0              28.0  [0.36918813165578857, 0.0, -1.0, 0.0, 0.0, -1....  0.3024  
1              35.0  [0.028514689782729, 0.0, -1.0, 0.0, 0.0, -1.0,...  0.3780  
2              28.0  [0.3452339640378655, 0.0, -1.0, 0.0, 0.0, -1.0...  0.3024  
3              35.0  [0.030064791591335877, 0.0, -1.0, 0.0, 0.0, -1...  0.3780  
4              35.0  [0.0, 0.0, -1.0, 0.0, 0.0, -1.0, 0.07584115481...  0.3780  
