# Dataset preparation for machine learning
    Author: Anima Sutradhar
    Project: Peptide detectability prediction to improve protein identification in mass spectrometry using machine learning.

In [1]:
# import libraries
import numpy as np
import pandas as pd
import csv

# set display options
#pd.set_option("display.max_rows", None, "display.max_columns", None)

In [2]:
# Read the two tab-separated NSAF peptide tables (detected / undetected) into DataFrames.
detected_peptides = pd.read_csv('../data/detected_peptides_NSAF.tsv', sep='\t')
undetected_peptides = pd.read_csv('../data/undetected_peptides_NSAF.tsv', sep='\t')

In [3]:
# Preview the first five rows of the detected-peptide table as loaded.
detected_peptides.head(n=5)

Unnamed: 0,Protein,Peptide,PEP,Protein_length,PSM_per_protein,Quantification
0,P35579,DFSALESQLQDTQELLQEENR,1.2363999999999999e-275,1960,400,0.204082
1,P63167,NADMSEEMQQDSVECATQALEK,1.2646e-275,89,13,0.146067
2,P02545,ASASGSGAQVGGPISSGSSASSVTVTR,3.0099e-251,664,130,0.195783
3,Q14257,LSEEEILENPDLFLTSEATDYGR,8.5859e-249,317,22,0.069401
4,P10809,IMQSSSEVGYDAMAGDFVNMVEK,3.1541000000000002e-248,573,192,0.335079


In [4]:
# Preview the first five rows of the undetected-peptide table as loaded.
undetected_peptides.head(n=5)

Unnamed: 0,Protein,Peptide,Length,Quantification
0,Q66K14,MWLSPEEVLVANALWVTER,1250,0.0016
1,Q66K14,ANPFFVLQR,1250,0.0016
2,Q66K14,GGGLTGLLVGTLDVVLDSSAR,1250,0.0016
3,Q66K14,ILHQTQDSQVYWTVACGSSR,1250,0.0016
4,Q66K14,HWEWLENNLLQTLSIFDSEEDITTFVK,1250,0.0016


In [5]:
# The undetected table is loaded without a PEP column, so give every undetected
# peptide the mean PEP of the detected peptides as a placeholder value.
# Assigning the scalar directly lets pandas broadcast it down the whole column;
# there is no need to build a Python list with one entry per row first.
undetected_peptides["PEP"] = detected_peptides["PEP"].mean()
undetected_peptides

Unnamed: 0,Protein,Peptide,Length,Quantification,PEP
0,Q66K14,MWLSPEEVLVANALWVTER,1250,0.0016,0.006334
1,Q66K14,ANPFFVLQR,1250,0.0016,0.006334
2,Q66K14,GGGLTGLLVGTLDVVLDSSAR,1250,0.0016,0.006334
3,Q66K14,ILHQTQDSQVYWTVACGSSR,1250,0.0016,0.006334
4,Q66K14,HWEWLENNLLQTLSIFDSEEDITTFVK,1250,0.0016,0.006334
...,...,...,...,...,...
137426,Q8WTT2,ILMHTFPK,800,0.0275,0.006334
137427,Q8WTT2,TDLLLDSESQGSGVFLPELDEPEYCNAQNTALWELHALR,800,0.0275,0.006334
137428,Q8WTT2,HYHPIVQR,800,0.0275,0.006334
137429,Q8WTT2,FAAHLIAGAPSEGSGALKPELSR,800,0.0275,0.006334


In [6]:
# Remove the "PSM_per_protein" column from the detected table.
# NOTE: this rebinds detected_peptides, so running the cell a second time
# raises KeyError because the column is already gone.
detected_peptides = detected_peptides.drop(columns=['PSM_per_protein'])
detected_peptides.head(n=5)

Unnamed: 0,Protein,Peptide,PEP,Protein_length,Quantification
0,P35579,DFSALESQLQDTQELLQEENR,1.2363999999999999e-275,1960,0.204082
1,P63167,NADMSEEMQQDSVECATQALEK,1.2646e-275,89,0.146067
2,P02545,ASASGSGAQVGGPISSGSSASSVTVTR,3.0099e-251,664,0.195783
3,Q14257,LSEEEILENPDLFLTSEATDYGR,8.5859e-249,317,0.069401
4,P10809,IMQSSSEVGYDAMAGDFVNMVEK,3.1541000000000002e-248,573,0.335079


In [8]:
# Align column names across the two tables: the undetected table calls the
# column "Length" where the detected table uses "Protein_length".
undetected_peptides = undetected_peptides.rename({"Length": "Protein_length"}, axis="columns")
undetected_peptides.head(n=5)

Unnamed: 0,Protein,Peptide,Protein_length,Quantification,PEP
0,Q66K14,MWLSPEEVLVANALWVTER,1250,0.0016,0.006334
1,Q66K14,ANPFFVLQR,1250,0.0016,0.006334
2,Q66K14,GGGLTGLLVGTLDVVLDSSAR,1250,0.0016,0.006334
3,Q66K14,ILHQTQDSQVYWTVACGSSR,1250,0.0016,0.006334
4,Q66K14,HWEWLENNLLQTLSIFDSEEDITTFVK,1250,0.0016,0.006334


In [9]:
# Show the detected table again to compare its columns with the undetected table.
detected_peptides.head(n=5)

Unnamed: 0,Protein,Peptide,PEP,Protein_length,Quantification
0,P35579,DFSALESQLQDTQELLQEENR,1.2363999999999999e-275,1960,0.204082
1,P63167,NADMSEEMQQDSVECATQALEK,1.2646e-275,89,0.146067
2,P02545,ASASGSGAQVGGPISSGSSASSVTVTR,3.0099e-251,664,0.195783
3,Q14257,LSEEEILENPDLFLTSEATDYGR,8.5859e-249,317,0.069401
4,P10809,IMQSSSEVGYDAMAGDFVNMVEK,3.1541000000000002e-248,573,0.335079


In [10]:
# Put the undetected table's columns in the same order as the detected table's.
undetected_peptides = undetected_peptides.loc[
    :, ['Protein', 'Peptide', 'PEP', 'Protein_length', 'Quantification']
]
undetected_peptides.head(n=5)

Unnamed: 0,Protein,Peptide,PEP,Protein_length,Quantification
0,Q66K14,MWLSPEEVLVANALWVTER,0.006334,1250,0.0016
1,Q66K14,ANPFFVLQR,0.006334,1250,0.0016
2,Q66K14,GGGLTGLLVGTLDVVLDSSAR,0.006334,1250,0.0016
3,Q66K14,ILHQTQDSQVYWTVACGSSR,0.006334,1250,0.0016
4,Q66K14,HWEWLENNLLQTLSIFDSEEDITTFVK,0.006334,1250,0.0016


In [11]:
# Write both harmonised tables to tab-separated files, without the row index.
detected_peptides.to_csv("../data/detected_peptides_all.tsv", index=False, sep="\t")
undetected_peptides.to_csv("../data/undetected_peptides_all.tsv", index=False, sep="\t")

In [12]:
# Report (rows, columns) for the detected and then the undetected table, one per line.
print(detected_peptides.shape, undetected_peptides.shape, sep="\n")

(37847, 5)
(137431, 5)


In [25]:
# Load the AAIndex1 table and the AAIndex1-annotated peptide tables (all tab-separated).
# NOTE(review): the two *_all_aaindex1.tsv files are not written by any cell visible
# in this notebook; presumably a separate step produces them from the *_all.tsv
# exports. Confirm they exist before running this cell.
aaindex1_df = pd.read_csv('../data/aaindex1_df', sep='\t')
detected_peptides_all = pd.read_csv('../data/detected_peptides_all_aaindex1.tsv', sep='\t')
undetected_peptides_all = pd.read_csv('../data/undetected_peptides_all_aaindex1.tsv', sep='\t')

In [26]:
# Report (rows, columns) for the AAIndex1 table and both annotated peptide tables,
# one per line, in that order.
print(
    aaindex1_df.shape,
    detected_peptides_all.shape,
    undetected_peptides_all.shape,
    sep="\n",
)

(566, 22)
(37847, 571)
(137431, 571)


In [29]:
# Preview the first five rows of the AAIndex1-annotated undetected table.
undetected_peptides_all.head(n=5)

Unnamed: 0,Protein,Peptide,PEP,Protein_length,Quantification,ANDN920101,ARGP820101,ARGP820102,ARGP820103,BEGF750101,...,KARS160113,KARS160114,KARS160115,KARS160116,KARS160117,KARS160118,KARS160119,KARS160120,KARS160121,KARS160122
0,Q66K14,MWLSPEEVLVANALWVTER,0.006334,1250.0,0.0016,82.27,20.37,22.83,24.34,13.83,...,223.0,321.174,224.0,390.0,633.028,117.674,467.026,-3.805,189.235,70.749
1,Q66K14,ANPFFVLQR,0.006334,1250.0,0.0016,39.73,10.11,11.32,11.68,5.39,...,115.0,159.711,121.0,178.0,303.014,54.1,217.889,0.0,91.767,34.214
2,Q66K14,GGGLTGLLVGTLDVVLDSSAR,0.006334,1250.0,0.0016,88.5,14.29,26.94,26.9,13.19,...,156.0,248.704,165.0,308.0,480.021,115.56,412.354,-8.454,165.301,67.703
3,Q66K14,ILHQTQDSQVYWTVACGSSR,0.006334,1250.0,0.0016,87.37,14.59,18.87,18.71,10.76,...,203.0,332.885,228.0,417.0,642.014,123.507,470.243,-12.578,189.973,72.45
4,Q66K14,HWEWLENNLLQTLSIFDSEEDITTFVK,0.006334,1250.0,0.0016,118.84,26.15,28.91,29.46,17.1,...,327.0,488.027,349.0,592.0,978.021,167.044,701.186,-17.454,274.969,87.39
