In [1]:
import os
from pathlib import Path
import re

from pysam import FastaFile
import pandas as pd

In [2]:
# Root folder of the local data tree. Set the CPTAC_DATA_ROOT environment
# variable to run the notebook on another machine; when it is unset the
# original hard-coded location is used.
DATA_ROOT = Path(os.environ.get(
    'CPTAC_DATA_ROOT',
    '/Users/liang/Box/MyCPTAC/CPTAC_proteome_v2.1/',
))

# RefSeq protein FASTA databases (2016 and 2018 releases)
refseq_2016_fasta_pth = DATA_ROOT / 'DCC/RefSeq_20160914/RefSeq.20160914_Human_ucsc_hg19_customProDBnr_mito_150contams.fasta.gz'
refseq_2018_fasta_pth = DATA_ROOT / 'DCC/RefSeq_20180629/RefSeq.20180629_Human_ucsc_hg38_cpdbnr_mito_264contams.fasta.gz'

# Phosphosite-level TMT10 table that the rest of the notebook annotates
phospho_pth = DATA_ROOT / 'CPTAC2/CRC_prospective/CPTAC2_Colon_Prospective_Collection_PNNL_Phosphoproteome.phosphosite.tmt10.tsv.gz'

In [3]:
# Open the RefSeq 2018 protein FASTA; this handle is the reference used by
# calc_peptide_start_end and by the protein-ID membership checks below.
protein_fa = FastaFile(str(refseq_2018_fasta_pth))

In [4]:
def calc_peptide_start_end(peptide, protein_id, sites):
    """Locate a peptide on its protein, anchored on its first phosphosite.

    Parameters
    ----------
    peptide : str
        Peptide sequence in which the modified residues are lower case
        (e.g. ``'VSKNSKYNAEStER'``).
    protein_id : str
        Sequence name looked up with ``protein_fa.fetch``.
    sites : list of str
        Site labels made of a residue letter followed by its 1-based position
        on the protein (e.g. ``['T320']``). Only the first entry is used.

    Returns
    -------
    list
        ``[peptide_start, peptide_end, peptide_loc_valid, invalid_reason]``.
        Start and end are 1-based and inclusive; both are ``None`` and
        ``invalid_reason`` is a short message when the peptide cannot be
        placed, otherwise ``invalid_reason`` is ``None``.
    """
    # Make sure the peptide is part of the protein sequence
    protein_seq = protein_fa.fetch(protein_id)
    peptide_seq = peptide.upper()
    if peptide_seq not in protein_seq:
        return [None, None, False, 'Peptide not found in protein']

    # An empty list (or a missing value such as NaN/None) gives no anchor
    try:
        anchor_site = sites[0]
    except (IndexError, TypeError):
        return [None, None, False, 'No phosphosite given']

    anchor_res = anchor_site[0].lower()
    anchor_loc = int(anchor_site[1:])
    # The modified residue is written in lower case inside the peptide
    anchor_ix = peptide.find(anchor_res)
    if anchor_ix == -1:
        # Without the marked residue the offset below would be meaningless
        return [None, None, False, 'Phosphosite residue not found in peptide']

    peptide_start = anchor_loc - anchor_ix
    peptide_end = peptide_start + len(peptide) - 1
    # A start before residue 1 would make the slice wrap around the sequence
    if peptide_start < 1:
        return [None, None, False, 'Invalid location']
    if protein_seq[peptide_start - 1:peptide_end] != peptide_seq:
        return [None, None, False, 'Invalid location']
    return [peptide_start, peptide_end, True, None]

In [5]:
# Read the tab-separated phosphosite table and list its columns
raw_phospho_df = pd.read_csv(phospho_pth, sep='\t')
raw_phospho_df.columns

Index(['Phosphosite', '853ecf90-71e6-4156-a56f-a34b65_D2 Log Ratio',
       '8cc7e656-0152-4359-8566-0581c3 Log Ratio',
       'b3696374-c6c0-49dd-833e-596e26_D2 Log Ratio',
       'f635496c-0046-4ecd-89bc-7a4f33_D2 Log Ratio',
       'e1cd3d70-132b-452f-ba10-026721_D2 Log Ratio',
       'bd67de01-ad7d-431a-9ad6-4dd5a1 Log Ratio',
       '21cab01c-e968-42cc-9651-1e53c0_D3 Log Ratio',
       'ec9afcc4-ccf7-4226-b0e8-d6897d_D3 Log Ratio',
       '573048dd-2502-40e0-8e8c-c41bb8_D3 Log Ratio',
       ...
       '7bdfa5c9-d11f-4600-9c44-a87996_D2 Log Ratio',
       '515040a4-bbfb-4a15-9e53-e7b6e6_D2 Log Ratio',
       '6e0d04fc-e6a8-41ed-b6f3-45537d_D2 Log Ratio',
       '0d5554d1-1653-4589-a44f-43113e_D2 Log Ratio',
       '1bf00d93-240f-47e8-8055-f546b0_D2 Log Ratio',
       '1f79fed9-f0d4-4c45-acd2-ea1441_D2 Log Ratio',
       'fc9b09af-23b8-4ffe-b6c6-41236d_D2 Log Ratio', 'Peptide', 'Gene',
       'Organism'],
      dtype='object', length=202)

In [6]:
# Preview the composite "<peptide>:<phosphosite>" identifier of every row
raw_phospho_df['Peptide'].add(':').add(raw_phospho_df['Phosphosite'])

0                       sLGHPEPLSNGRPQGNSR:NP_000012.1:s43
1                          VSKNSKYNAEStER:NP_000012.1:t320
2          YNAESTERESQDtVAENDDGGFSEEWEAQR:NP_000012.1:t327
3        YNAESTERESQDtVAENDDGGFsEEWEAQR:NP_000012.1:t32...
4                          DSHLGPHRStPESR:NP_000012.1:t354
                               ...                        
44944               sATAsPQQPQAQQR:XP_016871004.1:s872s876
44945                   SAtASPQQPQAQQR:XP_016871004.1:t874
44946               SAtAsPQQPQAQQR:XP_016871004.1:t874s876
44947                   SATAsPQQPQAQQR:XP_016871004.1:s876
44948                       IFEMGPVFtL:YP_003024029.1:t226
Length: 44949, dtype: object

In [7]:
# Split each 'Phosphosite' entry into the protein ID and the run of site
# labels, then attach the composite ID, the parsed site list and the peptide.
peptide_df = (
    raw_phospho_df['Phosphosite']
    .str.extract(
        r'^(?P<refseq_prot_id>[^:]+):'
        r'(?P<phosphosites>(?:[a-z]\d+)*)$',
        expand=True,
    )
    .assign(
        original_id=raw_phospho_df['Peptide'] + ':' + raw_phospho_df['Phosphosite'],
        # 's365s366' -> ['S365', 'S366']
        phosphosites=lambda parts: parts['phosphosites'].str.upper().str.findall(r'[A-Z]\d+'),
        # A row may list several ';'-separated peptides; keep the first one
        peptide=raw_phospho_df['Peptide'].str.split(';').apply(lambda candidates: candidates[0]),
    )
)
peptide_df.head(10)

Unnamed: 0,refseq_prot_id,phosphosites,original_id,peptide
0,NP_000012.1,[S43],sLGHPEPLSNGRPQGNSR:NP_000012.1:s43,sLGHPEPLSNGRPQGNSR
1,NP_000012.1,[T320],VSKNSKYNAEStER:NP_000012.1:t320,VSKNSKYNAEStER
2,NP_000012.1,[T327],YNAESTERESQDtVAENDDGGFSEEWEAQR:NP_000012.1:t327,YNAESTERESQDtVAENDDGGFSEEWEAQR
3,NP_000012.1,"[T327, S337]",YNAESTERESQDtVAENDDGGFsEEWEAQR:NP_000012.1:t32...,YNAESTERESQDtVAENDDGGFsEEWEAQR
4,NP_000012.1,[T354],DSHLGPHRStPESR:NP_000012.1:t354,DSHLGPHRStPESR
5,NP_000012.1,[S365],AAVQELsSSILAGEDPEER:NP_000012.1:s365,AAVQELsSSILAGEDPEER
6,NP_000012.1,"[S365, S366]",AAVQELssSILAGEDPEER:NP_000012.1:s365s366,AAVQELssSILAGEDPEER
7,NP_000012.1,"[S365, S367]",AAVQELsSsILAGEDPEER:NP_000012.1:s365s367,AAVQELsSsILAGEDPEER
8,NP_000012.1,[S366],AAVQELSsSILAGEDPEER:NP_000012.1:s366,AAVQELSsSILAGEDPEER
9,NP_000012.1,"[S366, S367]",AAVQELSssILAGEDPEER:NP_000012.1:s366s367,AAVQELSssILAGEDPEER


The protein IDs in the phosphosite table match RefSeq 2018 more closely than RefSeq 2016: fewer protein IDs (and fewer peptides) are missing from the 2018 FASTA.

In [8]:
# Protein IDs used by the phosphosite table but absent from the RefSeq 2018 FASTA
missing_prot_ids_in_refseq_2018 = set(peptide_df['refseq_prot_id']).difference(
    protein_fa.references
)
len(missing_prot_ids_in_refseq_2018)

51

In [9]:
# Count the protein IDs of the phosphosite table that are absent from the
# RefSeq 2016 FASTA. The FASTA is opened in a `with` block so its handle is
# closed as soon as the sequence names have been collected.
with FastaFile(str(refseq_2016_fasta_pth)) as refseq_2016_fa:
    refseq_2016_prot_ids = set(refseq_2016_fa.references)
len(set(peptide_df['refseq_prot_id']) - refseq_2016_prot_ids)

413

Number of peptides that would be excluded because their protein ID is missing from each reference

In [10]:
# Peptides whose protein ID is not a sequence name in the RefSeq 2018 FASTA
in_refseq_2018 = peptide_df['refseq_prot_id'].isin(protein_fa.references)
(~in_refseq_2018).sum()

1005

In [11]:
# Peptides whose protein ID is not a sequence name in the RefSeq 2016 FASTA.
# The FASTA is opened in a `with` block so its handle is closed as soon as
# the sequence names have been collected.
with FastaFile(str(refseq_2016_fasta_pth)) as refseq_2016_fa:
    refseq_2016_prot_ids = set(refseq_2016_fa.references)
(~peptide_df['refseq_prot_id'].isin(refseq_2016_prot_ids)).sum()

4029

Only keep the entries found in RefSeq 2018

In [12]:
# Keep the rows whose protein ID exists in the RefSeq 2018 FASTA. The explicit
# copy makes the result an independent frame, so the later column assignment
# to `peptide_df['phosphosites']` does not write into a slice of the original.
peptide_df = peptide_df.loc[
    peptide_df['refseq_prot_id'].isin(protein_fa.references)
].copy()

In [13]:
# Write the RefSeq IDs missing from the 2018 database next to the input table
missing_ids_pth = phospho_pth.with_name('phospho_missing_ids_in_refseq_2018_database.tsv')
missing_ids_df = pd.DataFrame({'refseq_prot_id': sorted(missing_prot_ids_in_refseq_2018)})
missing_ids_df.to_csv(missing_ids_pth, sep='\t', index=False)

Check which residues are modified

In [14]:
# Tally the residue letter of every phosphosite. `.str[0]` keeps the NaN that
# `explode` produces for an empty site list as NaN (which `value_counts`
# drops) instead of raising the way indexing a float with `x[0]` would.
peptide_df['phosphosites'].explode().str[0].value_counts()

S    42680
T     9698
Y     1863
Name: phosphosites, dtype: int64

In [15]:
# Drop every site whose residue letter is not S, T or Y
peptide_df['phosphosites'] = peptide_df['phosphosites'].map(
    lambda sites: [site for site in sites if site[0] in 'STY']
)

In [16]:
# Locate every peptide on its protein; one [start, end, valid, reason] record per row
peptide_loc_records = [
    calc_peptide_start_end(peptide, prot_id, sites)
    for peptide, prot_id, sites in zip(
        peptide_df['peptide'], peptide_df['refseq_prot_id'], peptide_df['phosphosites']
    )
]
peptide_loc_range_df = pd.DataFrame(
    peptide_loc_records,
    index=peptide_df.index,
    columns=['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
).astype({
    # Nullable integers, so unplaced peptides can carry a missing start/end
    'peptide_start': 'Int64',
    'peptide_end': 'Int64',
})

In [17]:
# Put the location columns side by side with the peptide information
peptide_with_loc_df = pd.concat([peptide_df, peptide_loc_range_df], axis='columns')
# Show the peptides that could not be placed
peptide_with_loc_df.loc[~peptide_loc_range_df['peptide_loc_valid']]

Unnamed: 0,refseq_prot_id,phosphosites,original_id,peptide,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason


In [18]:
# Select and order the exported columns; the site list is flattened to 'S365;S366'
out_columns = [
    'original_id', 'peptide', 'refseq_prot_id',
    'phosphosites', 'peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'
]
out_df = peptide_with_loc_df[out_columns].assign(
    phosphosites=lambda selected: selected['phosphosites'].str.join(';')
)
# NOTE(review): `with_suffix` replaces only the final '.gz' of the input name,
# so the output file ends in '.tsv.peptide_location.tsv.gz' - confirm intended.
out_pth = phospho_pth.with_suffix('.peptide_location.tsv.gz')
out_df.to_csv(out_pth, sep='\t', index=False)

In [19]:
# Preview the first rows of the exported table
out_df.head(n=5)

Unnamed: 0,original_id,peptide,refseq_prot_id,phosphosites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
0,sLGHPEPLSNGRPQGNSR:NP_000012.1:s43,sLGHPEPLSNGRPQGNSR,NP_000012.1,S43,43,60,True,
1,VSKNSKYNAEStER:NP_000012.1:t320,VSKNSKYNAEStER,NP_000012.1,T320,309,322,True,
2,YNAESTERESQDtVAENDDGGFSEEWEAQR:NP_000012.1:t327,YNAESTERESQDtVAENDDGGFSEEWEAQR,NP_000012.1,T327,315,344,True,
3,YNAESTERESQDtVAENDDGGFsEEWEAQR:NP_000012.1:t32...,YNAESTERESQDtVAENDDGGFsEEWEAQR,NP_000012.1,T327;S337,315,344,True,
4,DSHLGPHRStPESR:NP_000012.1:t354,DSHLGPHRStPESR,NP_000012.1,T354,345,358,True,
