In [1]:
from pathlib import Path
import re

from pysam import FastaFile
import pandas as pd

In [2]:
# Root folder of the local copy of the CPTAC proteome data
DATA_ROOT = Path('/Users/liang/Box/MyCPTAC/CPTAC_proteome_v2.1/')

# RefSeq protein FASTA (opened with pysam further down)
refseq_fasta_pth = DATA_ROOT.joinpath(
    'DCC/RefSeq_20160914/RefSeq.20160914_Human_ucsc_hg19_customProDBnr_mito_150contams.fasta.gz'
)

# Phosphopeptide annotation table (read with pandas further down)
phospho_pth = DATA_ROOT.joinpath(
    'CPTAC2/OV_prospective/mmc3_phospho_peptide_annotation.tsv.gz'
)

In [3]:
# Open the RefSeq protein FASTA with pysam. calc_peptide_start_end() calls
# protein_fa.fetch(protein_id) on this handle to obtain each protein sequence.
protein_fa = FastaFile(str(refseq_fasta_pth))

In [4]:
def calc_peptide_start_end(peptide, protein_id, sites, fasta=None):
    """Locate a phosphopeptide on its protein using one annotated phosphosite.

    Parameters
    ----------
    peptide : str
        Peptide sequence in which modified residues are lowercase
        (e.g. ``'GLHSELGEsSLILK'``).
    protein_id : str
        Sequence name passed to ``fasta.fetch``.
    sites : list of str
        Phosphosites as residue letter + 1-based protein position
        (e.g. ``['S160']``). Only the first entry is used as the anchor.
    fasta : object with a ``fetch(name) -> str`` method, optional
        Sequence source. Defaults to the notebook-level ``protein_fa``.

    Returns
    -------
    list
        ``[peptide_start, peptide_end, is_valid, invalid_reason]``.
        Coordinates are 1-based and inclusive. When the location cannot be
        confirmed, the coordinates are ``None``, ``is_valid`` is ``False``
        and ``invalid_reason`` is a short message.
    """
    if fasta is None:
        fasta = protein_fa

    # Make sure the peptide is part of the protein sequence
    protein_seq = fasta.fetch(protein_id)
    bare_peptide = peptide.upper()
    if bare_peptide not in protein_seq:
        return [None, None, False, 'Peptide not found in protein']

    # A peptide may have lost all of its sites (e.g. after filtering to STY);
    # without an anchor site there is nothing to derive coordinates from.
    if not sites:
        return [None, None, False, 'No phosphosite given']

    anchor_res = sites[0][0].lower()
    anchor_loc = int(sites[0][1:])

    # Every lowercase occurrence of the anchor residue is a candidate position
    # for the anchor site: the first listed site is not guaranteed to be the
    # leftmost modified residue of its type in the peptide.
    candidate_offsets = [ix for ix, res in enumerate(peptide) if res == anchor_res]
    if not candidate_offsets:
        return [None, None, False, 'Phosphosite residue not found in peptide']

    for offset in candidate_offsets:
        peptide_start = anchor_loc - offset
        peptide_end = peptide_start + len(peptide) - 1
        # A start before residue 1 would turn into a negative slice index
        # and silently wrap around to the end of the protein.
        if peptide_start < 1:
            continue
        if protein_seq[peptide_start - 1:peptide_end] == bare_peptide:
            return [peptide_start, peptide_end, True, None]
    return [None, None, False, 'Invalid location']

In [5]:
# Read the tab-separated phosphopeptide annotation table and list its columns
peptide_df = pd.read_table(phospho_pth, sep='\t')
peptide_df.columns

Index(['refseq_prot_id', 'symbol', 'site', 'Peptide'], dtype='object')

In [6]:
peptide_df = peptide_df.rename(columns={'Peptide': 'original_peptide'})

# 'site' looks like '<symbol>-S160s': take everything after the first '-' and
# collect each '<RESIDUE><position><lowercase letter>' token.
# `n` is passed by keyword because pandas >= 2.0 no longer accepts it
# positionally.
peptide_df['phosphosites'] = (
    peptide_df['site']
    .str.split('-', n=1, expand=True)
    .iloc[:, 1]
    .str.findall(r'[A-Z]\d+[a-z]')
)

# 'original_peptide' looks like 'R.GLHSELGES*SLILK.A': drop the two flanking
# characters on each side, then turn every 'S*'/'T*'/'Y*' into a lowercase
# residue. `regex=True` is explicit because a callable replacement only works
# in regex mode and pandas >= 2.0 defaults to regex=False.
peptide_df['peptide'] = (
    peptide_df['original_peptide']
    .str.slice(start=2, stop=-2)
    .str.replace(r'([STY])\*', lambda m: m.group(1).lower(), regex=True)
)
peptide_df.head(10)

Unnamed: 0,refseq_prot_id,symbol,site,original_peptide,phosphosites,peptide
0,NP_000011.2,ACVRL1,ACVRL1-S160s,R.GLHSELGES*SLILK.A,[S160s],GLHSELGEsSLILK
1,NP_000012.1,PSEN1,PSEN1-S43s,R.S*LGHPEPLSNGRPQGNSR.Q,[S43s],sLGHPEPLSNGRPQGNSR
2,NP_000012.1,PSEN1,PSEN1-S43s,R.S*LGHPEPL.S,[S43s],sLGHPEPL
3,NP_000012.1,PSEN1,PSEN1-S313s,K.NS*KYNAESTER.E,[S313s],NsKYNAESTER
4,NP_000012.1,PSEN1,PSEN1-S43s,R.S*LGHPEPLSNGRPQGN.S,[S43s],sLGHPEPLSNGRPQGN
5,NP_000012.1,PSEN1,PSEN1-S324s,K.YNAESTERES*QDTVAENDDGGFSEEWEAQR.D,[S324s],YNAESTEREsQDTVAENDDGGFSEEWEAQR
6,NP_000012.1,PSEN1,PSEN1-S319s,K.YNAES*TER.E,[S319s],YNAEsTER
7,NP_000012.1,PSEN1,PSEN1-S367s,R.AAVQELSSS*ILAGEDPEER.G,[S367s],AAVQELSSsILAGEDPEER
8,NP_000012.1,PSEN1,PSEN1-S43s,R.S*LGHPEPLSNGR.P,[S43s],sLGHPEPLSNGR
9,NP_000012.1,PSEN1,PSEN1-S365s,R.AAVQELS*SSILAGEDPEER.G,[S365s],AAVQELsSSILAGEDPEER


Check which residues are modified

In [7]:
# Tally the residue letter (first character) of every annotated site
(
    peptide_df['phosphosites']
    .explode()
    .apply(lambda site: site[0])
    .value_counts()
)

S    96400
T    19345
Y     2812
C        7
Name: phosphosites, dtype: int64

In [8]:
# Keep only S/T/Y sites and drop the trailing lowercase residue marker
# ('S160s' -> 'S160'). The marker is stripped with rstrip rather than by
# slicing off the last character so that this cell is idempotent: re-running
# it leaves 'S160' untouched instead of truncating it to 'S16'.
peptide_df['phosphosites'] = peptide_df['phosphosites'].apply(
    lambda sites: [
        site.rstrip('abcdefghijklmnopqrstuvwxyz')
        for site in sites
        if site[0] in 'STY'
    ]
)

In [9]:
# Locate every peptide on its protein: one result row per peptide row,
# in the same order and sharing the peptide table's index
peptide_loc_range_df = pd.DataFrame(
    [
        calc_peptide_start_end(pep, prot_id, site_list)
        for pep, prot_id, site_list in zip(
            peptide_df['peptide'],
            peptide_df['refseq_prot_id'],
            peptide_df['phosphosites'],
        )
    ],
    index=peptide_df.index,
    columns=['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
).astype({'peptide_start': 'Int64', 'peptide_end': 'Int64'})  # nullable ints: unresolved rows hold missing values

In [10]:
# Attach the computed coordinates to the peptide table, column-wise
peptide_with_loc_df = pd.concat([peptide_df, peptide_loc_range_df], axis='columns')
# Show the peptides whose location could not be confirmed
peptide_with_loc_df.loc[~peptide_with_loc_df['peptide_loc_valid']]

Unnamed: 0,refseq_prot_id,symbol,site,original_peptide,phosphosites,peptide,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
10,NP_000022.3,ALAD,ALAD-S244s,K.SS*PAFGDR.R,[S244],SsPAFGDR,,,False,Invalid location
11,NP_000022.3,ALAD,ALAD-S244s,R.DAAKSS*PAFGDR.R,[S244],DAAKSsPAFGDR,,,False,Invalid location
12,NP_000022.3,ALAD,ALAD-S244s,K.SS*PAFGDRR.C,[S244],SsPAFGDRR,,,False,Invalid location
13,NP_000025.1,ALDOA,ALDOA-S154s,K.S*KGGVVGIK.V,[S154],sKGGVVGIK,,,False,Invalid location
14,NP_000025.1,ALDOA,ALDOA-S100s,R.LQS*IGTENTEENRR.F,[S100],LQsIGTENTEENRR,,,False,Invalid location
...,...,...,...,...,...,...,...,...,...,...
110936,NP_998820.3,WNK1,WNK1-S2292s,R.DVDDGSGSPHS*PHQLSSK.S,[S2292],DVDDGSGSPHsPHQLSSK,,,False,Invalid location
110937,NP_998820.3,WNK1,WNK1-S2297s,R.DVDDGSGSPHSPHQLS*SK.S,[S2297],DVDDGSGSPHSPHQLsSK,,,False,Invalid location
110938,NP_998820.3,WNK1,WNK1-S1521s,R.FIVS*PVPESR.L,[S1521],FIVsPVPESR,,,False,Invalid location
110940,NP_998820.3,WNK1,WNK1-S2546s,R.KFS*APGQLC.I,[S2546],KFsAPGQLC,,,False,Invalid location


In [11]:
# Select the export columns and flatten each list of phosphosites into a
# ';'-separated string. `.assign` builds a new frame, which avoids the
# SettingWithCopyWarning raised by assigning a column on a `.loc` slice.
out_df = peptide_with_loc_df.loc[
    :,
    ['refseq_prot_id', 'original_peptide',
     'peptide', 'phosphosites',
     'peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason']
].assign(
    phosphosites=lambda df: df['phosphosites'].str.join(';')
)

# NOTE(review): with_suffix() only replaces the final '.gz' of the input name,
# so the output file ends in '.tsv.peptide_location.tsv.gz'. The path is kept
# as-is here; confirm whether the doubled '.tsv' is intended.
out_df.to_csv(
    phospho_pth.with_suffix('.peptide_location.tsv.gz'),
    sep='\t',
    index=False
)

In [12]:
# Preview the first five exported rows
out_df.head(n=5)

Unnamed: 0,refseq_prot_id,original_peptide,peptide,phosphosites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
0,NP_000011.2,R.GLHSELGES*SLILK.A,GLHSELGEsSLILK,S160,152,165,True,
1,NP_000012.1,R.S*LGHPEPLSNGRPQGNSR.Q,sLGHPEPLSNGRPQGNSR,S43,43,60,True,
2,NP_000012.1,R.S*LGHPEPL.S,sLGHPEPL,S43,43,50,True,
3,NP_000012.1,K.NS*KYNAESTER.E,NsKYNAESTER,S313,312,322,True,
4,NP_000012.1,R.S*LGHPEPLSNGRPQGN.S,sLGHPEPLSNGRPQGN,S43,43,58,True,
