In [1]:
import os
import pandas as pd
import numpy as np

from scipy.stats import pearsonr

In [2]:
def parse_prosit_result(df):
    """Pivot a long-format Prosit prediction table into one row per precursor.

    Only rows whose ``FragmentType`` equals ``'y'`` are used. Each distinct
    (``LabeledPeptide``, ``PrecursorCharge``) pair becomes one output row, and
    each fragment becomes a column named ``y<number>`` for fragment charge 1
    or ``y<number>^<charge>`` otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``LabeledPeptide``, ``PrecursorCharge``,
        ``FragmentType``, ``FragmentNumber``, ``FragmentCharge`` and
        ``RelativeIntensity``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Peptide``, ``Charge`` followed by the 42 fragment columns
        ``y1, y1^2, y1^3, ..., y14, y14^2, y14^3``. Rows are ordered by first
        appearance of the peptide, then by first appearance of the precursor
        charge within that peptide. Fragment cells hold:

        * ``NaN`` when the ion is treated as impossible: fragment charge 2 or
          3 for precursor charge 1, fragment charge 3 for precursor charge 2,
          or fragment number >= ``len(Peptide)``;
        * ``NaN`` when the stored intensity is exactly ``-1``;
        * ``0`` when the ion is possible but absent from ``df``;
        * the ``RelativeIntensity`` otherwise (the last row wins if an ion is
          listed more than once for the same precursor).

    Notes
    -----
    The progress counter is printed every 1000 peptides.
    NOTE(review): the length check uses the raw character length of
    ``LabeledPeptide``; if that string can carry modification annotations the
    length would be over-counted -- confirm against the Prosit export used.
    """
    frag_list = []
    for i in range(1, 15):
        frag_list.append('y{}'.format(i))
        for j in [2, 3]:
            frag_list.append('y{}^{}'.format(i, j))

    y_rows = df[df['FragmentType'] == 'y']

    # One dict per (peptide, precursor charge). The nested groupby with
    # sort=False keeps "peptide by first appearance, then charge by first
    # appearance" ordering, and both paths now use the same 'Peptide' key.
    records = []
    for i, (pep, pep_rows) in enumerate(y_rows.groupby('LabeledPeptide', sort=False)):
        for charge, rows in pep_rows.groupby('PrecursorCharge', sort=False):
            record = {'Peptide': pep, 'Charge': charge}
            for frag_type, frag_num, frag_charge, intensity in zip(
                    rows['FragmentType'], rows['FragmentNumber'],
                    rows['FragmentCharge'], rows['RelativeIntensity']):
                if frag_charge == 1:
                    ion = "{}{}".format(frag_type, frag_num)
                else:
                    ion = "{}{}^{}".format(frag_type, frag_num, frag_charge)
                record[ion] = intensity
            records.append(record)
        if i % 1000 == 0: print(i)

    # reindex (rather than column selection) tolerates ions that were never
    # predicted and drops ions outside the fixed y1..y14 schema.
    prosit_df = pd.DataFrame(records).reindex(columns=['Peptide', 'Charge'] + frag_list)

    # Per-column fragment number / charge, used to flag impossible ions.
    frag_number = np.array([int(f.split('^')[0].replace('y', '')) for f in frag_list])
    frag_charge = np.array([int(f.split('^')[1]) if '^' in f else 1 for f in frag_list])

    is_charge_1 = (prosit_df['Charge'] == 1).to_numpy()
    is_charge_2 = (prosit_df['Charge'] == 2).to_numpy()
    pep_len = np.array([len(p) for p in prosit_df['Peptide']], dtype=int)

    impossible = (
        (is_charge_1[:, None] & (frag_charge >= 2)[None, :])
        | (is_charge_2[:, None] & (frag_charge == 3)[None, :])
        | (frag_number[None, :] >= pep_len[:, None])
    )

    # Missing -> 0, stored -1 -> NaN, impossible -> NaN (same outcome as the
    # former "-1 sentinel" passes, without looping over rows).
    intensities = prosit_df[frag_list].fillna(0).replace(-1, np.nan).mask(impossible)

    return pd.concat([prosit_df[['Peptide', 'Charge']], intensities], axis=1)

In [3]:
def parse_ms2pip_result(df):
    """Pivot a long-format MS2PIP prediction table into one row per spectrum.

    Rows whose ``ion`` value starts with ``'Y'`` are used. A one-character
    ion type (``'Y'``) maps to column ``y<ionnumber>``; a longer one (e.g.
    ``'Y2'``) maps to ``y<ionnumber>^<last character>``. Previously the input
    was filtered with ``ion == 'Y'``, which made the multi-character branch
    unreachable and silently discarded those predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``spec_id``, ``charge``, ``ion``,
        ``ionnumber`` and ``prediction``. The text before the first ``'_'``
        in ``spec_id`` is taken as the peptide.

    Returns
    -------
    pandas.DataFrame
        Columns ``Peptide``, ``Charge`` followed by the 42 fragment columns
        ``y1, y1^2, y1^3, ..., y14, y14^2, y14^3`` (ions outside this schema
        are not included, matching ``parse_prosit_result``). One row per
        distinct ``spec_id`` in order of first appearance; ``Charge`` is the
        first ``charge`` value seen for that ``spec_id``. Fragment cells hold:

        * ``NaN`` when the ion is treated as impossible: fragment charge 2 or
          3 for precursor charge 1, fragment charge 3 for precursor charge 2,
          or fragment number >= ``len(Peptide)``;
        * ``NaN`` when the stored prediction is exactly ``-1``;
        * ``0`` when the ion is possible but absent from ``df``;
        * the ``prediction`` value otherwise.

    Notes
    -----
    The progress counter is printed every 1000 spectra.
    NOTE(review): a genuine prediction of exactly -1 is indistinguishable
    from the "impossible" marker and becomes NaN -- confirm this cannot occur
    for the prediction scale in use.
    """
    frag_list = []
    for i in range(1, 15):
        frag_list.append('y{}'.format(i))
        for j in [2, 3]:
            frag_list.append('y{}^{}'.format(i, j))

    # Keep 'Y' and charge-suffixed variants such as 'Y2'.
    y_rows = df[df['ion'].astype(str).str.startswith('Y')]

    # Build plain dict records and create the frame once; growing a frame
    # cell by cell with .loc is quadratic and yields object-dtype columns.
    records = []
    for i, (spec_id, rows) in enumerate(y_rows.groupby('spec_id', sort=False)):
        record = {
            'Peptide': spec_id.split('_')[0],
            'Charge': rows['charge'].iloc[0],
        }
        for ion_type, ion_number, prediction in zip(
                rows['ion'], rows['ionnumber'], rows['prediction']):
            ion = ion_type.replace('Y', 'y')
            if len(ion) > 1:
                # Last character of the ion type is the fragment charge.
                ion = "{}{}^{}".format(ion[0], ion_number, ion[-1])
            else:
                ion = "{}{}".format(ion, ion_number)
            record[ion] = prediction
        records.append(record)
        if i % 1000 == 0: print(i)

    mspip_df = pd.DataFrame(records).reindex(columns=['Peptide', 'Charge'] + frag_list)

    # Per-column fragment number / charge, used to flag impossible ions.
    frag_number = np.array([int(f.split('^')[0].replace('y', '')) for f in frag_list])
    frag_charge = np.array([int(f.split('^')[1]) if '^' in f else 1 for f in frag_list])

    is_charge_1 = (mspip_df['Charge'] == 1).to_numpy()
    is_charge_2 = (mspip_df['Charge'] == 2).to_numpy()
    pep_len = np.array([len(p) for p in mspip_df['Peptide']], dtype=int)

    impossible = (
        (is_charge_1[:, None] & (frag_charge >= 2)[None, :])
        | (is_charge_2[:, None] & (frag_charge == 3)[None, :])
        | (frag_number[None, :] >= pep_len[:, None])
    )

    # Missing -> 0, stored -1 -> NaN, impossible -> NaN.
    predictions = mspip_df[frag_list].fillna(0).replace(-1, np.nan).mask(impossible)

    return pd.concat([mspip_df[['Peptide', 'Charge']], predictions], axis=1)

In [4]:
def get_pcc(target, pred):
    """Compute the per-row Pearson correlation between target and prediction.

    Rows are paired by position: the n-th row of ``target`` is compared with
    the n-th row of ``pred``, regardless of either frame's index labels.
    For each row, only the fragment columns whose target value is >= 0 are
    used (NaN targets are excluded); rows with fewer than two such columns
    are skipped.

    Parameters
    ----------
    target : pandas.DataFrame
        Must contain ``Peptide``, ``Charge`` and the 42 fragment columns
        ``y1, y1^2, y1^3, ..., y14, y14^2, y14^3``.
    pred : pandas.DataFrame
        Must contain, for every row, the fragment columns selected from the
        corresponding ``target`` row, and have at least as many rows as
        ``target``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Peptide``, ``Charge`` (both taken from ``target``) and
        ``Pcc`` (first element of ``scipy.stats.pearsonr``), one row per
        non-skipped target row.
    """
    frag_list = []
    for i in range(1, 15):
        frag_list.append('y{}'.format(i))
        for j in [2, 3]:
            frag_list.append('y{}^{}'.format(i, j))

    # Cast once so object-dtype frames compare and correlate numerically.
    target_frags = target[frag_list].astype(float)

    data = []

    # Positional access: index labels are not assumed to be 0..n-1.
    for pos in range(len(target)):
        targ = target_frags.iloc[pos]
        ions = list(targ[targ >= 0].index)
        if len(ions) < 2:
            continue

        t = targ[ions].to_numpy()
        p = pred.iloc[pos][ions].astype(float).to_numpy()
        pcc = pearsonr(t, p)[0]

        data.append([target['Peptide'].iloc[pos], target['Charge'].iloc[pos], pcc])

    df = pd.DataFrame(data, columns=['Peptide', 'Charge', 'Pcc'])
    return df

In [None]:
'''
Parse the Prosit & MS2PIP result files, if needed
'''
### Prosit result
# prosit_result = pd.read_csv("")
# parsed_prosit_result = parse_prosit_result(prosit_result)

### MS2PIP
# ms2pip_result = pd.read_csv("")
# parsed_ms2pip_result = parse_ms2pip_result(ms2pip_result)

In [None]:
''' 
Read target data & prediction data
    - The rows of the target data and the prediction data
      must be in the same order.
'''
# target = pd.read_csv("")
# pred = pd.read_csv("")

In [None]:
'''
Calculate PCC and create table
'''
# get_pcc(target, pred)