# Get experimentally detected peptides
    Author: Anima Sutradhar
    Project: Peptide detectability prediction to improve protein identification in mass spectrometry using machine learning.

## Notebook summary:
1. Import MaxQuant output (evidence.txt) file.
2. Clean dataset.
    - Check for and remove potential contaminants and 'reverse' sequences flagged by MaxQuant, as well as rows with empty protein IDs.
    - Keep only peptides with 0 missed cleavages.
    - Keep only peptides that map to a single protein.
3. Reformat dataset columns for ML training.
4. Export reformatted peptide dataset as TSV.

In [1]:
# import libraries
import numpy as np
import pandas as pd
import re
import csv

# set display options
#pd.set_option("display.max_rows", None, "display.max_columns", None)

### 1. Import MaxQuant output (evidence.txt) file

In [2]:
# read the tab-separated MaxQuant evidence table into a DataFrame and preview the first rows
evidence = pd.read_csv('../data/maxquant_output/txt/evidence.txt', sep='\t')
evidence.head()

Unnamed: 0,Sequence,Length,Modifications,Modified sequence,Oxidation (M) Probabilities,Oxidation (M) Score Diffs,Acetyl (Protein N-term),Oxidation (M),Missed cleavages,Proteins,...,Reverse,Potential contaminant,id,Protein group IDs,Peptide ID,Mod. peptide ID,MS/MS IDs,Best MS/MS,Oxidation (M) site IDs,Taxonomy IDs
0,AAAAAAAAAVSR,12,Unmodified,_AAAAAAAAAVSR_,,,0,0,0,Q96JP5,...,,,0,4209,0,0,0,0,,9606.0
1,AAAAAAAATMALAAPSSPTPESPTMLTK,28,Unmodified,_AAAAAAAATMALAAPSSPTPESPTMLTK_,,,0,0,0,Q9NQS7,...,,,1,4961,1,1,1,1,,9606.0
2,AAAAAAAGDSDSWDADAFSVEDPVR,25,Acetyl (Protein N-term),_(Acetyl (Protein N-term))AAAAAAAGDSDSWDADAFSV...,,,1,0,0,O75822,...,,,2,655,2,2,2;3,2,,9606.0
3,AAAAAAAGDSDSWDADAFSVEDPVR,25,Acetyl (Protein N-term),_(Acetyl (Protein N-term))AAAAAAAGDSDSWDADAFSV...,,,1,0,0,O75822,...,,,3,655,2,2,4,4,,9606.0
4,AAAAAAAGDSDSWDADAFSVEDPVRK,26,Acetyl (Protein N-term),_(Acetyl (Protein N-term))AAAAAAAGDSDSWDADAFSV...,,,1,0,1,O75822,...,,,4,655,3,3,5,5,,9606.0


#### Explore dataset

In [3]:
# dimensions of the raw evidence table as (rows, columns)
evidence.shape

(103479, 92)

In [4]:
# list the column names available in the evidence table
evidence.columns

Index(['Sequence', 'Length', 'Modifications', 'Modified sequence',
       'Oxidation (M) Probabilities', 'Oxidation (M) Score Diffs',
       'Acetyl (Protein N-term)', 'Oxidation (M)', 'Missed cleavages',
       'Proteins', 'Leading proteins', 'Leading razor protein', 'Gene names',
       'Protein names', 'Type', 'Raw file', 'Fraction', 'Experiment',
       'MS/MS m/z', 'Charge', 'm/z', 'Mass',
       'Uncalibrated - Calibrated m/z [ppm]',
       'Uncalibrated - Calibrated m/z [Da]', 'Mass error [ppm]',
       'Mass error [Da]', 'Uncalibrated mass error [ppm]',
       'Uncalibrated mass error [Da]', 'Max intensity m/z 0', 'Retention time',
       'Retention length', 'Calibrated retention time',
       'Calibrated retention time start', 'Calibrated retention time finish',
       'Retention time calibration', 'Match time difference',
       'Match m/z difference', 'Match q-value', 'Match score',
       'Number of data points', 'Number of scans', 'Number of isotopic peaks',
       'PIF', 

In [5]:
# get summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
evidence.describe()

Unnamed: 0,Length,Acetyl (Protein N-term),Oxidation (M),Missed cleavages,Fraction,Experiment,MS/MS m/z,Charge,m/z,Mass,...,Reporter intensity count 6,Reporter intensity count 7,Reporter intensity count 8,Reporter intensity count 9,Reporter intensity count 10,id,Peptide ID,Mod. peptide ID,Best MS/MS,Taxonomy IDs
count,103479.0,103479.0,103479.0,103479.0,103479.0,103479.0,103479.0,103479.0,103479.0,103479.0,...,103479.0,103479.0,103479.0,103479.0,103479.0,103479.0,103479.0,103479.0,103479.0,102868.0
mean,13.528242,0.019202,0.0572,0.11066,4.121116,1.0,716.593043,2.739474,562.947075,1517.353103,...,1.021202,1.021975,1.02243,1.016293,1.019444,51739.0,23169.097082,24086.685627,54374.214749,9606.0
std,5.629131,0.137235,0.238467,0.332321,1.849471,0.0,191.897488,0.651233,192.6405,603.112929,...,0.406388,0.405871,0.405667,0.407848,0.406166,29871.95859,13471.188256,14063.205794,31400.61187,0.0
min,7.0,0.0,0.0,0.0,1.0,1.0,375.20517,1.0,202.118618,573.287074,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9606.0
25%,9.0,0.0,0.0,0.0,3.0,1.0,575.967377,2.0,421.226354,1060.50454,...,1.0,1.0,1.0,1.0,1.0,25869.5,11527.5,11927.5,27160.5,9606.0
50%,12.0,0.0,0.0,0.0,4.0,1.0,689.926086,3.0,530.785296,1364.68236,...,1.0,1.0,1.0,1.0,1.0,51739.0,23164.0,23996.0,54267.0,9606.0
75%,16.0,0.0,0.0,0.0,6.0,1.0,827.067291,3.0,672.938369,1829.996345,...,1.0,1.0,1.0,1.0,1.0,77608.5,34913.0,36397.0,81569.5,9606.0
max,51.0,1.0,3.0,2.0,7.0,1.0,1495.714722,7.0,1446.72435,4456.18903,...,17.0,17.0,17.0,17.0,17.0,103478.0,46462.0,48432.0,108930.0,9606.0


#### Verification: check number of proteins (unique rows) and peptides

In [6]:
# summarise the 'Proteins' column: non-null count, number of unique values and most frequent value
evidence['Proteins'].describe()

count     103335
unique      6906
top       Q09666
freq         827
Name: Proteins, dtype: object

In [7]:
# summarise the 'Sequence' column: non-null count, number of unique values and most frequent value
evidence['Sequence'].describe()

count                      103479
unique                      46421
top       LCYVALDFEQEMATAASSSSLEK
freq                          124
Name: Sequence, dtype: object

In [8]:
# 'Sequence' has more non-null entries than 'Proteins', so some rows must lack a protein ID;
# count them (after a manual check these seem to be all reverse sequences)
n_missing_proteins = evidence['Proteins'].isna().sum()
print(n_missing_proteins)

144


In [9]:
# check missed cleavages: summary statistics of the per-row missed-cleavage count
# (no `include` argument: it only affects DataFrame.describe and is ignored for a Series)
evidence['Missed cleavages'].describe()

count    103479.000000
mean          0.110660
std           0.332321
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           2.000000
Name: Missed cleavages, dtype: float64

### 2. Clean dataset

#### Remove contaminants, 'reverse' sequences and empty protein IDs

In [10]:
# count the rows flagged with '+' in the 'Potential contaminant' column
is_contaminant = evidence['Potential contaminant'] == '+'
print(len(evidence[is_contaminant]))

2731


In [11]:
# drop every row flagged as a potential contaminant and report the new dimensions
not_contaminant = evidence['Potential contaminant'] != '+'
evidence_cleaned = evidence.loc[not_contaminant]
evidence_cleaned.shape

(100748, 92)

In [12]:
# sanity check: no contaminant-flagged rows should remain after filtering
remaining_contaminants = evidence_cleaned[evidence_cleaned['Potential contaminant'] == '+']
print(len(remaining_contaminants))

0


In [13]:
# count 'reverse' rows in the raw table, i.e. before contaminants were removed
reverse_raw = evidence[evidence['Reverse'] == '+']
print(len(reverse_raw))

144


In [14]:
# count 'reverse' rows that are still present once contaminants have been removed
reverse_left = evidence_cleaned[evidence_cleaned['Reverse'] == '+']
print(len(reverse_left))

138


In [15]:
# drop every row flagged as a 'reverse' sequence and report the new dimensions
not_reverse = evidence_cleaned['Reverse'] != '+'
evidence_cleaned = evidence_cleaned.loc[not_reverse]
evidence_cleaned.shape

(100610, 92)

In [16]:
# sanity check: no 'reverse' rows should remain after filtering
remaining_reverse = evidence_cleaned[evidence_cleaned['Reverse'] == '+']
print(len(remaining_reverse))

0


In [17]:
# check for number of proteins: summarise 'Leading razor protein' (count, unique values, most frequent)
# after contaminant and reverse rows have been removed
evidence_cleaned['Leading razor protein'].describe()

count     100610
unique      5709
top       Q09666
freq         837
Name: Leading razor protein, dtype: object

In [18]:
# check for number of peptides: summarise 'Sequence' (count, unique values, most frequent)
# after contaminant and reverse rows have been removed
evidence_cleaned['Sequence'].describe()

count                      100610
unique                      45362
top       LCYVALDFEQEMATAASSSSLEK
freq                          124
Name: Sequence, dtype: object

#### Keep only peptides with 0 missed cleavages.

In [19]:
# count rows with exactly 0 missed cleavages
zero_missed = evidence_cleaned['Missed cleavages'].eq(0)
print(len(evidence_cleaned[zero_missed]))

90134


In [20]:
# count rows with exactly 1 missed cleavage
one_missed = evidence_cleaned['Missed cleavages'].eq(1)
print(len(evidence_cleaned[one_missed]))

9873


In [21]:
# count rows with exactly 2 missed cleavages
two_missed = evidence_cleaned['Missed cleavages'].eq(2)
print(len(evidence_cleaned[two_missed]))

603


In [22]:
# check number of 3 missed cleavages (should be 0)
# queried on evidence_cleaned, the same table used for the 0, 1 and 2 missed-cleavage counts
print(len(evidence_cleaned.loc[evidence_cleaned['Missed cleavages'] == 3]))

0


In [23]:
# retain only the rows whose peptide has 0 missed cleavages
fully_cleaved = evidence_cleaned['Missed cleavages'].eq(0)
evidence_cleaned = evidence_cleaned[fully_cleaved]

In [24]:
# check for any missed cleavages: after filtering, every statistic (mean, min, max, ...) should be 0
evidence_cleaned['Missed cleavages'].describe()

count    90134.0
mean         0.0
std          0.0
min          0.0
25%          0.0
50%          0.0
75%          0.0
max          0.0
Name: Missed cleavages, dtype: float64

In [25]:
# sanity check: count rows whose missed-cleavage value is anything other than 0
still_missed = evidence_cleaned['Missed cleavages'].ne(0)
print(len(evidence_cleaned[still_missed]))

0


In [26]:
# dimensions of the table after keeping only rows with 0 missed cleavages
evidence_cleaned.shape

(90134, 92)

#### Verification: check total number of missing values for each feature to determine any further rows for removal

In [27]:
# number of missing values in each column of the cleaned table
missing_per_column = evidence_cleaned.isna().sum()
print(missing_per_column)
# No proteins or sequences are missing, so no further rows need removing.
# The features that do contain missing data are expected to.

Sequence                           0
Length                             0
Modifications                      0
Modified sequence                  0
Oxidation (M) Probabilities    84999
                               ...  
Mod. peptide ID                    0
MS/MS IDs                          0
Best MS/MS                         0
Oxidation (M) site IDs         85029
Taxonomy IDs                       0
Length: 92, dtype: int64


#### Verification: check for any potential co-eluting contaminants and reverse sequences in the 'Proteins' and 'Leading razor protein' columns

In [28]:
# count rows whose 'Proteins' entry contains a "CON__" identifier (missing values count as no match)
has_con = evidence_cleaned['Proteins'].str.contains("CON__", na=False)
len(evidence_cleaned[has_con])

87

In [29]:
# drop rows whose 'Proteins' entry contains a "CON__" identifier and report the new dimensions
has_con = evidence_cleaned['Proteins'].str.contains("CON__", na=False)
evidence_cleaned = evidence_cleaned[~has_con]
evidence_cleaned.shape

(90047, 92)

In [30]:
# verify all contaminants have been removed in 'Proteins' column
# na=False treats a missing protein ID as "no match", so the boolean mask never contains NaN
# (a NaN in the mask would make the row selection raise)
len(evidence_cleaned[evidence_cleaned["Proteins"].str.contains("CON__", na=False)])

0

In [31]:
# count rows whose 'Leading razor protein' entry contains a "CON__" identifier
razor_has_con = evidence_cleaned['Leading razor protein'].str.contains("CON__", na=False)
len(evidence_cleaned[razor_has_con])

0

In [32]:
# count rows whose 'Proteins' entry contains a "REV__" identifier
has_rev = evidence_cleaned['Proteins'].str.contains("REV__", na=False)
len(evidence_cleaned[has_rev])

0

In [33]:
# count rows whose 'Leading razor protein' entry contains a "REV__" identifier
razor_has_rev = evidence_cleaned['Leading razor protein'].str.contains("REV__", na=False)
len(evidence_cleaned[razor_has_rev])

0

In [34]:
# dimensions of the table after the "CON__" / "REV__" checks
evidence_cleaned.shape

(90047, 92)

In [35]:
# check how many identified peptides map to more than one protein (i.e. ambiguous protein groups)
# multiple proteins are ';'-separated; na=False treats a missing protein ID as "no match"
# so the boolean mask never contains NaN
len(evidence_cleaned[evidence_cleaned['Proteins'].str.contains(";", na=False)])

6665

In [36]:
# drop rows whose 'Proteins' entry lists several ';'-separated proteins and report the new dimensions
lists_several = evidence_cleaned['Proteins'].str.contains(";", na=False)
evidence_cleaned = evidence_cleaned[~lists_several]
evidence_cleaned.shape

(83382, 92)

In [37]:
# verify these protein groups were removed
# na=False treats a missing protein ID as "no match" so the boolean mask never contains NaN
len(evidence_cleaned[evidence_cleaned['Proteins'].str.contains(";", na=False)])

0

In [38]:
# export cleaned dataset as TSV (tab-separated, without the DataFrame index column)
evidence_cleaned.to_csv("../data/evidence_cleaned.tsv", sep='\t', index=False)

### 3. Reformat dataset columns for ML training

In [39]:
# extract the protein ID, peptide sequence and PEP columns from evidence_cleaned
# the 'Proteins' column is used as the protein column; rows listing several proteins were removed earlier
# NOTE(review): this comment previously named the "leading razor" column, but the code selects 'Proteins' - confirm which is intended
detected_peptides = evidence_cleaned[['Proteins', 'Sequence', 'PEP']]
detected_peptides.head()

Unnamed: 0,Proteins,Sequence,PEP
0,Q96JP5,AAAAAAAAAVSR,0.0010184
1,Q9NQS7,AAAAAAAATMALAAPSSPTPESPTMLTK,1.0957e-06
2,O75822,AAAAAAAGDSDSWDADAFSVEDPVR,1.0159e-11
3,O75822,AAAAAAAGDSDSWDADAFSVEDPVR,3.5763e-46
7,P36578,AAAAAAALQAK,0.0031654


#### Verification: final check on dimensions after formatting, and whether the expected number of proteins and peptides is present

In [40]:
# check dimensions: the row count should match evidence_cleaned, with 3 selected columns
detected_peptides.shape

(83382, 3)

In [41]:
# rename 'Proteins' -> 'Protein' and 'Sequence' -> 'Peptide'; 'PEP' keeps its name
column_names = {"Proteins": "Protein", "Sequence": "Peptide"}
detected_peptides = detected_peptides.rename(columns=column_names)
detected_peptides.head()

Unnamed: 0,Protein,Peptide,PEP
0,Q96JP5,AAAAAAAAAVSR,0.0010184
1,Q9NQS7,AAAAAAAATMALAAPSSPTPESPTMLTK,1.0957e-06
2,O75822,AAAAAAAGDSDSWDADAFSVEDPVR,1.0159e-11
3,O75822,AAAAAAAGDSDSWDADAFSVEDPVR,3.5763e-46
7,P36578,AAAAAAALQAK,0.0031654


### 4. Export reformatted peptide dataset as TSV

In [42]:
# write the reformatted peptide table as TSV (tab-separated, without the DataFrame index column)
detected_peptides.to_csv("../data/detected_peptides.tsv", sep='\t', index=False)