# Get undetected peptides (and calculate NSAF)
    Author: Anima Sutradhar
    Project: Peptide detectability prediction to improve protein identification in mass spectrometry using machine learning.

## Notebook summary:
1. Import fasta_peptides and detected_peptides datasets.
2. Dataset cleaning.
    - Sort detected peptides by PEP score.
    - Remove duplicates in both datasets.
    - Only keep proteins in fasta_peptides that are also present in detected_peptides - let this be expected_peptides.
3. Get undetected peptides.
    - Remove all rows in expected_peptides that are also present in detected_peptides.
4. Export new data frame with undetected peptides as TSV.
    - Check dimensions.

In [1]:
# import libraries
import numpy as np
import pandas as pd
import csv

# set display options
#pd.set_option("display.max_rows", None, "display.max_columns", None)

# 1. Import fasta_peptides and detected_peptides datasets

In [2]:
# import datasets
fasta_peptides = pd.read_table('../data/fasta_peptides.tsv')
detected_peptides = pd.read_table('../data/detected_peptides_NSAF.tsv')

In [3]:
fasta_peptides.shape

(556262, 3)

In [4]:
detected_peptides.shape

(37847, 6)

# 2. Dataset cleaning

### get expected peptides before removing duplicates

In [5]:
# initial check on how many total proteins in detected_peptides are not in fasta_peptides (should be 0)
len(detected_peptides[~detected_peptides["Protein"].isin(fasta_peptides["Protein"])])

0

In [6]:
# initial check on how many total peptides (i.e. PSMs) in detected_peptides are not in fasta_peptides (should be 0)
len(detected_peptides[~detected_peptides["Peptide"].isin(fasta_peptides["Peptide"])])
# NB: These sequences are actually there, they appear to be start of the sequences
# i.e. all these sequences start with Methionine in the fasta peptides.

591

In [7]:
# initial check on how many total proteins and peptides in fasta_peptides are not in detected_peptides
len(fasta_peptides[~fasta_peptides["Protein"].isin(detected_peptides["Protein"])])

376072

In [8]:
# check how many protein+peptide in detected_peptides are present in fasta_peptides (should be 37,847)
len(detected_peptides.set_index(['Protein', 'Peptide']).index.isin(fasta_peptides.set_index(['Protein', 'Peptide']).index))

37847

In [12]:
len(fasta_peptides['Protein'])

556262

### Only keep proteins in fasta_peptides that are present in detected_peptides - let this be expected_peptides

In [18]:
detected_peptides.describe(include='all')

Unnamed: 0,Protein,Peptide,PEP,Protein_length,PSM_per_protein,Quantification
count,37847,37847,37847.0,37847.0,37847.0,37847.0
unique,5586,37847,,,,
top,Q09666,EDLMGLAIGTHGSNIQQAR,,,,
freq,218,1,,,,
mean,,,0.006334102,949.842999,50.19061,0.05856
std,,,0.01691212,1100.798112,89.729124,0.064425
min,,,1.2363999999999999e-275,44.0,1.0,0.000197
25%,,,2.22495e-09,389.0,10.0,0.016499
50%,,,0.00011868,631.0,23.0,0.038095
75%,,,0.0032456,1054.0,52.0,0.080217


In [17]:
# initial check on how many total proteins and peptides in detected_peptides are in fasta_peptides 
# (should be 37,847 if after removing duplicate peptides)
len(detected_peptides[detected_peptides["Protein"].isin(fasta_peptides["Protein"])])

37847

In [23]:
# initial check on how many total proteins in fasta_peptides are in detected_peptides
len(fasta_peptides[fasta_peptides["Protein"].isin(detected_peptides["Protein"])])

180190

In [24]:
# remove proteins in fasta_peptides not present in detected_peptides - let this be expected_peptides
expected_peptides = fasta_peptides[fasta_peptides["Protein"].isin(detected_peptides["Protein"])]
expected_peptides.describe(include="all")

Unnamed: 0,Protein,Peptide,Length
count,180190,180190,180190.0
unique,5586,177030,
top,Q8WZ42,FSMPGFK,
freq,1839,18,
mean,,,1543.838176
std,,,3543.990225
min,,,44.0
25%,,,491.0
50%,,,824.0
75%,,,1445.0


# 2. Get undetected peptides

In [25]:
# initial check on undetected peptides
undetected_peptides = pd.merge(expected_peptides, detected_peptides, 
                               on=["Peptide"], 
                               how='left', indicator=True)
undetected_peptides

Unnamed: 0,Protein_x,Peptide,Length,Protein_y,PEP,Protein_length,PSM_per_protein,Quantification,_merge
0,Q66K14,MWLSPEEVLVANALWVTER,1250,,,,,,left_only
1,Q66K14,ANPFFVLQR,1250,,,,,,left_only
2,Q66K14,GGGLTGLLVGTLDVVLDSSAR,1250,,,,,,left_only
3,Q66K14,ILHQTQDSQVYWTVACGSSR,1250,,,,,,left_only
4,Q66K14,HWEWLENNLLQTLSIFDSEEDITTFVK,1250,,,,,,left_only
...,...,...,...,...,...,...,...,...,...
180185,Q8WTT2,HYHPIVQR,800,,,,,,left_only
180186,Q8WTT2,FAAHLIAGAPSEGSGALKPELSR,800,,,,,,left_only
180187,Q8WTT2,SATELFEAYSMAEMTFNPPVESSNPK,800,,,,,,left_only
180188,Q8WTT2,FLQGDSFLNEDLNQLIK,800,Q8WTT2,7.947000e-43,800.0,22.0,0.0275,both


In [26]:
# find peptides/PSMs present only in undetected_peptides
len(undetected_peptides[undetected_peptides['_merge'].str.contains("left_only", na=False)])

142713

In [27]:
# find number of undetected using isin
len(expected_peptides[~expected_peptides["Peptide"].isin(detected_peptides["Peptide"])])

142713

In [28]:
undetected_peptides = expected_peptides[~expected_peptides["Peptide"].isin(detected_peptides["Peptide"])]
undetected_peptides.shape

(142713, 3)

In [29]:
undetected_peptides.describe(include="all")

Unnamed: 0,Protein,Peptide,Length
count,142713,142713,142713.0
unique,5559,139774,
top,Q8WZ42,LQIWDTAGQER,
freq,1834,12,
mean,,,1691.574405
std,,,3924.710088
min,,,44.0
25%,,,524.0
50%,,,881.0
75%,,,1573.0


In [30]:
# remove any peptides containing 'U' (selenocysteine) amino acids
# this is so that we can map peptide amino acids correctly when calculating AAIndex1 properties
len(undetected_peptides[undetected_peptides['Peptide'].str.contains("U", na=False)])

5

In [31]:
undetected_peptides[undetected_peptides['Peptide'].str.contains("U", na=False)]

Unnamed: 0,Protein,Peptide,Length
39764,Q9NNW7,SGLDPTVTGCUG,524
209476,Q16881,SGASILQAGCUG,649
260100,O60613,LYAGAILEVCGUK,165
331630,Q9C0D9,KPNSDULGMEEK,397
348199,P36969,GFVCIVTNVASQUGK,197


In [32]:
undetected_peptides.shape

(142713, 3)

In [33]:
len(undetected_peptides[~undetected_peptides['Peptide'].str.contains("U", na=False)])

142708

In [34]:
undetected_peptides = undetected_peptides[~undetected_peptides['Peptide'].str.contains("U", na=False)]
undetected_peptides.shape

(142708, 3)

# Calculate spectral counting (NSAF) for undetected_peptides

In [36]:
# check for any intersection between detected and undetected (should be 0)
print(len(set(detected_peptides["Peptide"]).intersection(set(undetected_peptides["Peptide"]))))

0


In [38]:
# create dictionary from detected_peptides, with protein keys and associated NSAF values
detected_peptides_nsaf_dict = dict(zip(detected_peptides.Protein, detected_peptides.Quantification))

In [39]:
# map protein NSAF quantitation values to each protein in undetected_peptides
undetected_peptides['Quantification'] = undetected_peptides['Protein'].map(detected_peptides_nsaf_dict)
undetected_peptides

Unnamed: 0,Protein,Peptide,Length,Quantification
0,Q66K14,MWLSPEEVLVANALWVTER,1250,0.0016
1,Q66K14,ANPFFVLQR,1250,0.0016
2,Q66K14,GGGLTGLLVGTLDVVLDSSAR,1250,0.0016
3,Q66K14,ILHQTQDSQVYWTVACGSSR,1250,0.0016
4,Q66K14,HWEWLENNLLQTLSIFDSEEDITTFVK,1250,0.0016
...,...,...,...,...
556024,Q8WTT2,ILMHTFPK,800,0.0275
556025,Q8WTT2,TDLLLDSESQGSGVFLPELDEPEYCNAQNTALWELHALR,800,0.0275
556026,Q8WTT2,HYHPIVQR,800,0.0275
556027,Q8WTT2,FAAHLIAGAPSEGSGALKPELSR,800,0.0275


In [40]:
# check for any NaN values
undetected_peptides.isnull().sum()

Protein           0
Peptide           0
Length            0
Quantification    0
dtype: int64

In [43]:
print(undetected_peptides.shape)
print(detected_peptides.shape)

(142708, 4)
(37847, 6)


In [57]:
undetected_peptides.describe(include='all')

Unnamed: 0,Protein,Peptide,Length,Quantification
count,142708,142708,142708.0,142708.0
unique,5559,139769,,
top,Q8WZ42,LQIWDTAGQER,,
freq,1834,12,,
mean,,,1691.620133,0.015974
std,,,3924.771084,0.027578
min,,,44.0,0.000197
25%,,,524.0,0.00236
50%,,,881.5,0.006579
75%,,,1573.0,0.017437


In [44]:
# remove any peptides in undetected_peptides that map to more than one different protein
undetected_peptides_clean = undetected_peptides.groupby('Peptide').filter(lambda x: x['Protein'].nunique() == 1)
undetected_peptides_clean.shape

(137658, 4)

In [48]:
undetected_peptides_clean.describe(include='all')

Unnamed: 0,Protein,Peptide,Length,Quantification
count,137431,137431,137431.0,137431.0
unique,5535,137431,,
top,Q8WZ42,LRPGEER,,
freq,1811,1,,
mean,,,1708.801915,0.015396
std,,,3968.937471,0.026096
min,,,44.0,0.000197
25%,,,530.0,0.002345
50%,,,890.0,0.00644
75%,,,1577.0,0.017153


In [46]:
# drop duplicate peptides (i.e. if a peptide sequence that matches to same protein occurs twice or more), keep first occurrence
undetected_peptides_clean = undetected_peptides_clean.drop_duplicates(subset = ['Peptide', 'Protein'], 
                                                                      keep = "first").reset_index(drop = True)
undetected_peptides_clean.describe(include="all")

Unnamed: 0,Protein,Peptide,Length,Quantification
count,137431,137431,137431.0,137431.0
unique,5535,137431,,
top,Q8WZ42,LRPGEER,,
freq,1811,1,,
mean,,,1708.801915,0.015396
std,,,3968.937471,0.026096
min,,,44.0,0.000197
25%,,,530.0,0.002345
50%,,,890.0,0.00644
75%,,,1577.0,0.017153


In [50]:
# check how many detected proteins are not in undetected
len(detected_peptides[~detected_peptides["Protein"].isin(undetected_peptides_clean["Protein"])])

243

In [65]:
detected_peptides[~detected_peptides["Protein"].isin(undetected_peptides_clean["Protein"])]

Unnamed: 0,Protein,Peptide,PEP,Protein_length,PSM_per_protein,Quantification
1,P63167,NADMSEEMQQDSVECATQALEK,1.264600e-275,89,13,0.146067
57,P31946,QTTVSNSQQAYQEAFEISK,5.570500e-170,246,38,0.154472
135,P08708,DNYVPEVSALDQEIIEVDPDTK,3.984200e-132,135,20,0.148148
156,P63104,GIVDQSQQAYQEAFEISK,9.758100e-131,245,86,0.351020
162,P31946,TAFDEAIAELDTLNEESYK,1.913200e-129,246,38,0.154472
...,...,...,...,...,...,...
36593,P31946,IEAELQDICNDVLELLDK,5.935900e-02,246,38,0.154472
36640,P62875,LLNYAPLEK,6.082500e-02,67,5,0.074627
36706,P63220,ADGIVSK,6.305900e-02,83,17,0.204819
36744,O75347,MMIPDCQR,6.408300e-02,108,12,0.111111


In [51]:
# initial check on how many total proteins and PSMs in detected_peptides are in fasta_peptides
len(detected_peptides[detected_peptides["Protein"].isin(fasta_peptides["Protein"])])

37847

In [67]:
undetected_peptides_clean.shape

(137431, 4)

# 3. Export new data frame with undetected peptides as TSV

In [66]:
# export undetected peptides to TSV
undetected_peptides_clean.to_csv("../data/undetected_peptides_NSAF.tsv", sep='\t', index=False)