# Perform spectral counting (NSAF) on detected peptides
    Author: Anima Sutradhar
    Project: Peptide detectability prediction to improve protein identification in mass spectrometry using machine learning.

In [1]:
# import libraries
import numpy as np
import pandas as pd
import re
import csv

# set display options
#pd.set_option("display.max_rows", None, "display.max_columns", None)

In [2]:
# load the two tab-separated input tables
fasta_peptides = pd.read_csv('../data/fasta_peptides.tsv', sep='\t')
detected_peptides = pd.read_csv('../data/detected_peptides.tsv', sep='\t')

In [3]:
# (rows, columns) of fasta_peptides as loaded from ../data/fasta_peptides.tsv
fasta_peptides.shape

(556262, 3)

In [4]:
# (rows, columns) of detected_peptides as loaded from ../data/detected_peptides.tsv
detected_peptides.shape

(83382, 3)

In [5]:
# order rows by ascending PEP and renumber the index 0..n-1
# (the later drop_duplicates(keep="first") therefore keeps the lowest-PEP row)
detected_peptides = (
    detected_peptides
    .sort_values(by='PEP', ascending=True)
    .reset_index(drop=True)
)

In [6]:
# keep only peptides attributed to exactly one distinct protein:
# for every row, count the distinct proteins seen for that row's peptide and
# retain the rows where that count is 1 (row order is left untouched)
detected_peptides = detected_peptides[
    detected_peptides.groupby('Peptide')['Protein'].transform('nunique') == 1
].copy()  # explicit copy so later column assignments act on an independent frame
detected_peptides.shape
# the row count is unchanged, i.e. no peptide maps to more than one protein,
# as expected (ambiguous protein groups were already removed)

(83382, 3)

In [7]:
# summary of every column; include="all" adds count/unique/top/freq for the text columns
detected_peptides.describe(include="all")

Unnamed: 0,Protein,Peptide,PEP
count,83382,83382,83382.0
unique,5586,37847,
top,Q09666,WGDAGAEYVVESTGVFTTMEK,
freq,752,101,
mean,,,0.007283494
std,,,0.01826888
min,,,1.2363999999999999e-275
25%,,,3.016025e-08
50%,,,0.00016333
75%,,,0.003856275


In [8]:
# summary of every column; include="all" adds count/unique/top/freq for the text columns
fasta_peptides.describe(include="all")

Unnamed: 0,Protein,Peptide,Length
count,556262,556262,556262.0
unique,20354,526936,
top,Q8WZ42,IHTGEKPYK,
freq,1839,362,
mean,,,1202.077699
std,,,2229.114067
min,,,12.0
25%,,,451.0
50%,,,736.0
75%,,,1278.0


## Perform spectral counting on detected_peptides

In [9]:
# lookup table: protein accession -> sequence length
# (a protein appears on many rows; the value from its last row is the one kept)
fasta_protein_len_dict = fasta_peptides.set_index("Protein")["Length"].to_dict()

In [10]:
# add a Protein_length column by looking each protein up in fasta_protein_len_dict
# (proteins missing from the dict would come out as NaN)
detected_peptides = detected_peptides.assign(
    Protein_length=detected_peptides["Protein"].map(fasta_protein_len_dict)
)
detected_peptides

Unnamed: 0,Protein,Peptide,PEP,Protein_length
0,P35579,DFSALESQLQDTQELLQEENR,1.236400e-275,1960
1,P63167,NADMSEEMQQDSVECATQALEK,1.264600e-275,89
2,P02545,ASASGSGAQVGGPISSGSSASSVTVTR,3.009900e-251,664
3,Q14257,LSEEEILENPDLFLTSEATDYGR,8.585900e-249,317
4,P10809,IMQSSSEVGYDAMAGDFVNMVEK,3.154100e-248,573
...,...,...,...,...
83377,Q8NF91,DTPGTCHVTLK,1.112600e-01,8797
83378,Q9UPT9,CDDAIITK,1.113500e-01,525
83379,P13639,SDPVVSYR,1.114800e-01,858
83380,Q8TCT9,NASDMPETITSR,1.114800e-01,377


In [11]:
# count NaN values in every column; a non-zero count for "Protein_length" would mean
# some detected protein had no entry in fasta_protein_len_dict
detected_peptides.isnull().sum()

Protein           0
Peptide           0
PEP               0
Protein_length    0
dtype: int64

In [12]:
# every row is one PSM (peptide spectrum match), so give each row a count of 1
detected_peptides = detected_peptides.assign(PSM=1)
detected_peptides

Unnamed: 0,Protein,Peptide,PEP,Protein_length,PSM
0,P35579,DFSALESQLQDTQELLQEENR,1.236400e-275,1960,1
1,P63167,NADMSEEMQQDSVECATQALEK,1.264600e-275,89,1
2,P02545,ASASGSGAQVGGPISSGSSASSVTVTR,3.009900e-251,664,1
3,Q14257,LSEEEILENPDLFLTSEATDYGR,8.585900e-249,317,1
4,P10809,IMQSSSEVGYDAMAGDFVNMVEK,3.154100e-248,573,1
...,...,...,...,...,...
83377,Q8NF91,DTPGTCHVTLK,1.112600e-01,8797,1
83378,Q9UPT9,CDDAIITK,1.113500e-01,525,1
83379,P13639,SDPVVSYR,1.114800e-01,858,1
83380,Q8TCT9,NASDMPETITSR,1.114800e-01,377,1


In [13]:
# total number of PSMs per protein: sum the per-row PSM counts within each protein
# (result is a Series named "PSM", indexed by "Protein")
detected_peptides_abundance = (
    detected_peptides["PSM"]
    .groupby(detected_peptides["Protein"])
    .sum()
)
detected_peptides_abundance

Protein
A0A0U1RRL7     2
A0AV96         3
A0AVF1         1
A0AVT1        75
A0FGR8        27
              ..
Q9Y6X4         2
Q9Y6X5         1
Q9Y6X9         5
Q9Y6Y0         5
Q9Y6Y8        21
Name: PSM, Length: 5586, dtype: int64

In [14]:
# map protein counts to detected_peptides
# On entry, detected_peptides_abundance is the per-protein PSM Series computed above;
# it is replaced here by the merged per-row DataFrame.
# - how='left' keeps the row order of detected_peptides
# - suffixes=(None, '_per_protein') leaves the per-row 'PSM' column as-is and names
#   the incoming per-protein total 'PSM_per_protein'
# - indicator/query keep only rows whose protein was matched in the counts
# - columns are dropped with the `columns=` keyword: the positional-axis form
#   `.drop(label, 1)` was deprecated in pandas 1.3 and raises TypeError in pandas 2.0
detected_peptides_abundance = (
    pd.merge(detected_peptides, detected_peptides_abundance, on='Protein', how='left',
             suffixes=(None, '_per_protein'), indicator=True)
    .query("_merge == 'both'")
    .drop(columns=['_merge', 'PSM'])
)
detected_peptides_abundance

Unnamed: 0,Protein,Peptide,PEP,Protein_length,PSM_per_protein
0,P35579,DFSALESQLQDTQELLQEENR,1.236400e-275,1960,400
1,P63167,NADMSEEMQQDSVECATQALEK,1.264600e-275,89,13
2,P02545,ASASGSGAQVGGPISSGSSASSVTVTR,3.009900e-251,664,130
3,Q14257,LSEEEILENPDLFLTSEATDYGR,8.585900e-249,317,22
4,P10809,IMQSSSEVGYDAMAGDFVNMVEK,3.154100e-248,573,192
...,...,...,...,...,...
83377,Q8NF91,DTPGTCHVTLK,1.112600e-01,8797,6
83378,Q9UPT9,CDDAIITK,1.113500e-01,525,2
83379,P13639,SDPVVSYR,1.114800e-01,858,263
83380,Q8TCT9,NASDMPETITSR,1.114800e-01,377,19


In [15]:
# length-normalised spectral count per protein: PSM_per_protein / Protein_length
# NOTE(review): this is the spectral abundance factor (SAF). NSAF as usually defined
# additionally divides each SAF by the sum of SAF over all proteins - confirm whether
# that normalisation step is intended before relying on the "NSAF" label.
detected_peptides_abundance["Quantification"] = (
    detected_peptides_abundance["PSM_per_protein"]
    .div(detected_peptides_abundance["Protein_length"])
)
detected_peptides_abundance

Unnamed: 0,Protein,Peptide,PEP,Protein_length,PSM_per_protein,Quantification
0,P35579,DFSALESQLQDTQELLQEENR,1.236400e-275,1960,400,0.204082
1,P63167,NADMSEEMQQDSVECATQALEK,1.264600e-275,89,13,0.146067
2,P02545,ASASGSGAQVGGPISSGSSASSVTVTR,3.009900e-251,664,130,0.195783
3,Q14257,LSEEEILENPDLFLTSEATDYGR,8.585900e-249,317,22,0.069401
4,P10809,IMQSSSEVGYDAMAGDFVNMVEK,3.154100e-248,573,192,0.335079
...,...,...,...,...,...,...
83377,Q8NF91,DTPGTCHVTLK,1.112600e-01,8797,6,0.000682
83378,Q9UPT9,CDDAIITK,1.113500e-01,525,2,0.003810
83379,P13639,SDPVVSYR,1.114800e-01,858,263,0.306527
83380,Q8TCT9,NASDMPETITSR,1.114800e-01,377,19,0.050398


In [16]:
# number of rows whose peptide does not occur in fasta_peptides (expected to be 0;
# this is informational only - nothing fails if the count is non-zero)
int((~detected_peptides_abundance["Peptide"].isin(fasta_peptides["Peptide"])).sum())

1337

In [17]:
# rows whose peptide sequence is not found verbatim in fasta_peptides
detected_peptides_abundance.loc[
    ~detected_peptides_abundance["Peptide"].isin(fasta_peptides["Peptide"])
]
# a random spot check suggests these peptides do occur in fasta_peptides, but there
# preceded by a methionine (M) residue - not verified for all rows

Unnamed: 0,Protein,Peptide,PEP,Protein_length,PSM_per_protein,Quantification
166,P14618,SKPHSEAGTAFIQTQQLHAAMADTFLEHMCR,6.628300e-135,531,282,0.531073
521,O15260,GQNDLMGTAEDFADQFLR,1.809200e-95,269,25,0.092937
654,Q9NVH1,ATALSEEELDNEDYYSLLNVR,6.217000e-85,559,17,0.030411
670,Q15102,SGEENPASKPTPVQDVQGDGR,1.178300e-84,231,12,0.051948
675,Q9Y5S9,ADVLDLHEAGGEDFAMDEDGDESIHK,1.335500e-84,174,7,0.040230
...,...,...,...,...,...,...
82145,P40429,AEVQVLVLDGR,8.722100e-02,203,12,0.059113
82331,P49591,VLDLDLFR,9.023000e-02,514,35,0.068093
82402,Q99717,TSMASLFSFTSPAVK,9.127200e-02,465,2,0.004301
82740,A8MWX3,SGVMCLK,9.695400e-02,477,1,0.002096


In [18]:
# summary of every column, including the newly added PSM_per_protein and Quantification
detected_peptides_abundance.describe(include="all")

Unnamed: 0,Protein,Peptide,PEP,Protein_length,PSM_per_protein,Quantification
count,83382,83382,83382.0,83382.0,83382.0,83382.0
unique,5586,37847,,,,
top,Q09666,WGDAGAEYVVESTGVFTTMEK,,,,
freq,752,101,,,,
mean,,,0.007283494,906.052074,70.37255,0.091596
std,,,0.01826888,1085.613851,107.714099,0.101508
min,,,1.2363999999999999e-275,44.0,1.0,0.000197
25%,,,3.016025e-08,358.0,15.0,0.026506
50%,,,0.00016333,583.0,34.0,0.065041
75%,,,0.003856275,972.0,75.0,0.122727


In [19]:
# collapse repeated (Peptide, Protein) pairs to a single row, keeping the first
# occurrence of each pair, then renumber the index 0..n-1
detected_peptides_abundance = detected_peptides_abundance.loc[
    ~detected_peptides_abundance.duplicated(subset=['Peptide', 'Protein'], keep="first")
].reset_index(drop=True)
detected_peptides_abundance.describe(include="all")

Unnamed: 0,Protein,Peptide,PEP,Protein_length,PSM_per_protein,Quantification
count,37847,37847,37847.0,37847.0,37847.0,37847.0
unique,5586,37847,,,,
top,Q09666,QIWTVNEALIQK,,,,
freq,218,1,,,,
mean,,,0.006334102,949.842999,50.19061,0.05856
std,,,0.01691212,1100.798112,89.729124,0.064425
min,,,1.2363999999999999e-275,44.0,1.0,0.000197
25%,,,2.22495e-09,389.0,10.0,0.016499
50%,,,0.00011868,631.0,23.0,0.038095
75%,,,0.0032456,1054.0,52.0,0.080217


In [20]:
# (rows, columns) after removing duplicate (Peptide, Protein) rows
detected_peptides_abundance.shape

(37847, 6)

In [21]:
# write the de-duplicated table as a tab-separated file, without the index column
detected_peptides_abundance.to_csv(
    "../data/detected_peptides_NSAF.tsv",
    index=False,
    sep='\t',
)