In [1]:
from math import inf
import pandas as pd
import timeit

In [2]:
def msp_to_df(input_file,
              max_seq_len=30,
              min_ce=36, max_ce=40,
              mz_min=135, mz_max=1400,
              keep_sequence_indices=False):
    """
    Read spectrum data from an .msp file and convert it to a DataFrame.

    Each spectrum starts at a ``Name: <sequence>/<...>_<ce>eV`` line and ends
    at the next blank line, the next ``Name:`` line, or the end of the file.
    Peak lines are tab-separated ``m/z<TAB>intensity[...]``. Every m/z is
    rounded to the nearest integer and used as the column label; if several
    peaks of one spectrum fall on the same integer, the largest intensity is
    kept. Spectra without any peak of positive intensity inside
    [mz_min, mz_max] are dropped.

    Args:
        input_file (str): path to .msp file
        max_seq_len (int): maximum acceptable sequence length
        min_ce (int): minimum collision energy of spectra to be included in df
        max_ce (int): maximum collision energy of spectra to be included in df
        mz_min (int): lower boundary for m/z to be included in df
        mz_max (int): upper boundary for m/z to be included in df
        keep_sequence_indices (boolean): set to True to return one DataFrame
                                         indexed by sequence instead of a
                                         (DataFrame, Series) pair

    Returns:
        If keep_sequence_indices is False (default), a tuple ``(df, seqs)``:
            df (pd.DataFrame): intensities, one row per spectrum and one
                               column per integer m/z in [mz_min, mz_max];
                               missing peaks are 0.0
            seqs (pd.Series):  sequence of each row of df
        If keep_sequence_indices is True, a single pd.DataFrame with the same
        intensity columns and the sequences as its index.

    Raises:
        ValueError, IndexError: if a ``Name:`` line does not follow the
            ``<sequence>/<...>_<ce>eV`` layout or a peak line is not numeric.
    """
    def generate_rows(file):
        # `row` is the spectrum currently being collected; None means
        # "no spectrum open" (before the first Name line, after a spectrum
        # was closed, or while a filtered-out spectrum is being skipped).
        row = None
        contains_value = False
        for line in file:
            if line.startswith('Na'):
                # A new record closes the previous one even when the
                # separating blank line is missing.
                if row is not None and contains_value:
                    yield row
                row, contains_value = None, False
                sequence, rest = line[6:].split('/', maxsplit=1)
                # Collision energy is the last '_'-separated token, e.g.
                # '38eV'; strip whitespace first so the slice does not rely
                # on the exact line ending.
                ce = float(rest.rsplit('_', maxsplit=1)[1].rstrip()[:-2])
                if len(sequence) <= max_seq_len and min_ce <= ce <= max_ce:
                    row = {'sequence': sequence}
            elif not line.strip():
                # Blank line: end of the current spectrum. Resetting the
                # state prevents repeated blank lines from emitting the same
                # row more than once.
                if row is not None and contains_value:
                    yield row
                row, contains_value = None, False
            elif row is not None and line[0].isdigit():
                mz, intensity = map(float, line.split('\t')[:2])
                key = round(mz)
                if mz_min <= key <= mz_max:
                    contains_value |= intensity > 0
                    row[key] = max(intensity, row.get(key, -inf))
        # The file may end without a trailing blank line; do not lose the
        # last spectrum in that case.
        if row is not None and contains_value:
            yield row

    # The generator reads lazily, so the frame must be built while the file
    # is still open.
    with open(input_file, 'r') as file:
        df = pd.DataFrame.from_records(generate_rows(file)) \
            .reindex(['sequence'] + list(range(mz_min, mz_max+1)), axis=1) \
            .fillna(0.0)

    if keep_sequence_indices:
        return df.set_index('sequence')
    else:
        return df.drop(columns='sequence'), df.sequence

In [3]:
# Time msp_to_df with default filters on the cptac2_mouse_hcd_selected.msp file
# (-r 10: ten timing repeats).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
input_file = '/Users/smantz/PycharmProjects/fufezan-lab-advanced_python_2020-21_HD_fork/notebooks/cptac2_mouse_hcd_selected.msp'
%timeit -r 10 msp_to_df(input_file)

1.73 s ± 25.8 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [4]:
# Default call (keep_sequence_indices=False): the intensity matrix and the
# sequences come back as two separate objects; display the intensity matrix.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
input_file = '/Users/smantz/PycharmProjects/fufezan-lab-advanced_python_2020-21_HD_fork/notebooks/cptac2_mouse_hcd_selected.msp'
df, sequence= msp_to_df(input_file)
df

Unnamed: 0,135,136,137,138,139,140,141,142,143,144,...,1391,1392,1393,1394,1395,1396,1397,1398,1399,1400
0,0.0,2851.7,0.0,0.0,0.0,0.0,1404.0,0.0,26365.8,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2797.6,0.0
1,0.0,2626.0,0.0,0.0,1553.4,0.0,0.0,0.0,11172.9,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,59857.3,8295.8,0.0,3220.5,0.0,24351.6,2054.0,179213.0,20880.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1946.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2129,0.0,51469.1,2589.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42959.4,8361.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3538.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# keep_sequence_indices=True: a single DataFrame whose index is the sequence
# of each spectrum (sequences can repeat, so the index is not unique).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
input_file = '/Users/smantz/PycharmProjects/fufezan-lab-advanced_python_2020-21_HD_fork/notebooks/cptac2_mouse_hcd_selected.msp'
df_sequence= msp_to_df(input_file, keep_sequence_indices=True)
df_sequence

Unnamed: 0_level_0,135,136,137,138,139,140,141,142,143,144,...,1391,1392,1393,1394,1395,1396,1397,1398,1399,1400
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAAFEDQENETVVVK,0.0,2851.7,0.0,0.0,0.0,0.0,1404.0,0.0,26365.8,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2797.6,0.0
AAGTIQTSVQEVNSK,0.0,2626.0,0.0,0.0,1553.4,0.0,0.0,0.0,11172.9,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAGVSVEPFWPGLFAK,0.0,59857.3,8295.8,0.0,3220.5,0.0,24351.6,2054.0,179213.0,20880.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAHSQCAYSNPEGTVLLACEESR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAHSQCAYSNPEGTVLLACEESR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1946.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YVSHFETDGPHVLLYFDSVPTTR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YVTLIYTNYENGK,0.0,51469.1,2589.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YYTYLVMNK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42959.4,8361.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YYTYLVMNK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3538.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
