In [1]:
import os
from pathlib import Path
import re

import pandas as pd
from pysam import FastaFile

In [2]:
# Root of the local CPTAC proteome data tree. Set the CPTAC_DATA_ROOT
# environment variable to run the notebook on another machine; the fallback
# is the location the notebook was originally written against.
DATA_ROOT = Path(os.environ.get(
    'CPTAC_DATA_ROOT',
    '/Users/liang/Box/MyCPTAC/CPTAC_proteome_v3.0/',
))

# Protein FASTA that peptide coordinates are resolved against.
refseq_fasta_pth = DATA_ROOT / 'DCC/RefSeq_20130727/RefSeq.20130727-Human.contams.fasta.gz'

# Peptide annotation table that is loaded and annotated below.
phospho_pth = DATA_ROOT / 'CPTAC2/TCGA_BRCA_retrospective/phosphoproteome_P3_peptide_annotation.tsv.gz'

In [3]:
# Handle on the RefSeq protein FASTA; calc_peptide_start_end fetches protein
# sequences from it by accession.
# NOTE(review): pysam presumably needs the .gz to be bgzip-compressed and
# faidx-indexed -- confirm before pointing this at a different file.
protein_fa = FastaFile(str(refseq_fasta_pth))

In [4]:
def calc_peptide_start_end(peptide, protein_id, sites, fasta=None):
    """Locate a modified peptide on its parent protein (1-based, inclusive).

    The first entry of ``sites`` is used as an anchor: its protein position
    minus its offset inside the peptide gives the peptide start. The result
    is then verified against the protein sequence.

    Parameters
    ----------
    peptide : str
        Peptide sequence in which modified residues are written in lower
        case, e.g. ``'RPAERsSPGQ'``.
    protein_id : str
        Identifier passed to ``fasta.fetch`` to obtain the protein sequence.
    sites : list of str
        Sites formatted as ``<residue letter><protein position>``, e.g.
        ``['S15']``. Only ``sites[0]`` is used. It is assumed to describe
        the first lower-case occurrence of that residue in ``peptide``; if
        it does not, the final sequence check reports 'Invalid location'.
    fasta : object with a ``fetch(protein_id)`` method, optional
        Sequence source. Defaults to the notebook-level ``protein_fa``.

    Returns
    -------
    list
        ``[peptide_start, peptide_end, True, None]`` on success, otherwise
        ``[None, None, False, reason]`` where ``reason`` is a short string.
    """
    if fasta is None:
        fasta = protein_fa

    # Make sure the peptide is part of the protein sequence
    protein_seq = fasta.fetch(protein_id)
    peptide_upper = peptide.upper()
    if peptide_upper not in protein_seq:
        return [None, None, False, 'Peptide not found in protein']

    # Without a site there is nothing to anchor on (previously an IndexError).
    if not sites:
        return [None, None, False, 'No phosphosite to anchor the peptide']

    anchor_res = sites[0][0].lower()
    anchor_loc = int(sites[0][1:])
    anchor_ix = peptide.find(anchor_res)
    # str.find returns -1 when the residue is not lower-cased in the peptide;
    # using that as an offset would yield coordinates shifted by one that can
    # coincidentally pass the sequence check below.
    if anchor_ix < 0:
        return [None, None, False, 'Modified residue not found in peptide']

    peptide_start = anchor_loc - anchor_ix
    peptide_end = peptide_start + len(peptide) - 1
    # A start below 1 would turn into a negative slice index and wrap around.
    if peptide_start < 1 or protein_seq[peptide_start - 1:peptide_end] != peptide_upper:
        return [None, None, False, 'Invalid location']
    return [peptide_start, peptide_end, True, None]

In [5]:
# Load the tab-separated peptide annotation table and list its columns.
peptide_df = pd.read_csv(phospho_pth, sep='\t')
peptide_df.columns

Index(['original_site_id', 'geneName', 'accession_number', 'variableSites',
       'sequence'],
      dtype='object')

In [6]:
# Collect every variable-site token from `variableSites`. A token is an
# upper-case residue letter, a number, and one trailing lower-case letter
# (e.g. 'S15s').
peptide_df['phosphosites'] = peptide_df['variableSites'].str.findall(r'[A-Z]\d+[a-z]')
peptide_df.head(10)

Unnamed: 0,original_site_id,geneName,accession_number,variableSites,sequence,phosphosites
0,NP_055955_S15s _1_0_15_16,TTLL12,NP_055955,S15s,RPAERsSPGQTPEEGAQALAEFAALHGPALR,[S15s]
1,NP_680780_T31t _1_0_31_32,ATXN2L,NP_680780,T31t,RPPGGtSPPNGGLPGPLATSAAPPGPPAAASPCLGPVAAAGSGLRR,[T31t]
2,NP_002511_S125s _1_1_125_125,NPM1P21,NP_002511,S125s,CGSGPVHISGQHLVAVEEDAEsEDEEEEDVK,[S125s]
3,NP_001116539_S84s _1_1_84_84,MAPT,NP_001116539,S84s,STPTAEAEEAGIGDTPsLEDEAAGHVTQAR,[S84s]
4,NP_001135452_S299s _1_1_299_299,TP53BP1,NP_001135452,S299s,EQLSAQELMESGLQIQKsPEPEVLSTQEDLFDQSNK,[S299s]
5,NP_004454_S48s _1_0_48_50,FGD1,NP_004454,S48s,RGsGSALGGPLDPQFVGPSDTSLGAAPGHR,[S48s]
6,NP_056425_S413s _1_0_412_415,WIPI2,NP_056425,S413s,GTYVPSsPTRLAYTDDLGAVGGACLEDEASALRLDEDSEHPPMILR,[S413s]
7,NP_002826_S673s _1_1_673_673,PTPN12,NP_002826,S673s,DVDVSEDsPPPLPERTPESFVLASEHNTPVR,[S673s]
8,NP_001348_S87s _1_1_87_87,DHX9,NP_001348,S87s,SEEVPAFGVAsPPPLTDTPDTTANAEGDLPTTMGGPLPPHLALK,[S87s]
9,NP_004517_S40s _1_0_39_41,MCM2,NP_004517,S40s,RTDALTsSPGRDLPPFEDESEGLLGTEGPLEEEEDGEELIGDGMERDYR,[S40s]


Check which residues are modified (number of variable sites per residue letter)

In [7]:
# Count sites per residue letter. `.str[0]` is NaN-safe: rows with an empty
# site list explode to NaN, which `apply(lambda x: x[0])` cannot subscript,
# and value_counts() then drops those NaN entries.
peptide_df['phosphosites'].explode().str[0].value_counts()

S    32056
T     5052
M      353
Y      235
Q      217
C       98
Name: phosphosites, dtype: int64

In [8]:
# Keep only STY that are phosphosites and drop the trailing lower-case
# marker ('S15s' -> 'S15'). The marker is removed only when present, so
# re-running this cell leaves already-cleaned sites untouched instead of
# chopping a digit off the position ('S15' -> 'S1').
peptide_df['phosphosites'] = peptide_df['phosphosites'].apply(
    lambda sites: [
        site[:-1] if site[-1].islower() else site
        for site in sites
        if site[0] in 'STY'
    ]
)

In [9]:
# Locate every peptide on its protein; one result row per peptide row,
# sharing the index of peptide_df.
peptide_loc_range_df = pd.DataFrame(
    [
        calc_peptide_start_end(peptide, accession, sites)
        for peptide, accession, sites in zip(
            peptide_df.sequence,
            peptide_df.accession_number,
            peptide_df.phosphosites,
        )
    ],
    columns=['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
    index=peptide_df.index,
)
# Nullable integers so that unresolved peptides can hold a missing value.
peptide_loc_range_df = peptide_loc_range_df.astype(
    {column: pd.Int64Dtype() for column in ('peptide_start', 'peptide_end')}
)

In [10]:
# Put the peptide annotations and their computed locations side by side
peptide_with_loc_df = pd.concat([peptide_df, peptide_loc_range_df], axis='columns')
# Show the peptides whose location could not be validated
peptide_with_loc_df.loc[~peptide_loc_range_df['peptide_loc_valid'], :]

Unnamed: 0,original_site_id,geneName,accession_number,variableSites,sequence,phosphosites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason


In [11]:
def compression_aware_with_suffix(p: Path, suffix: str):
    """Replace the suffix of ``p``, looking through a compression extension.

    If ``p`` ends in ``.gz``, ``.bz2``, ``.xz`` or ``.zst``, that extension is
    dropped first, so the suffix that gets replaced is the one underneath
    (``x.tsv.gz`` with ``.csv`` gives ``x.csv``).
    """
    compression_suffixes = ('.gz', '.bz2', '.xz', '.zst')
    uncompressed = p.with_suffix('') if p.suffix in compression_suffixes else p
    return uncompressed.with_suffix(suffix)

In [12]:
# Keep the site id plus the location columns, and flatten each list of
# phosphosites into a ';'-separated string for the TSV.
out_df = peptide_with_loc_df[[
    'original_site_id',
    'phosphosites',
    'peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason',
]].assign(
    phosphosites=lambda df: df['phosphosites'].str.join(';')
)
# Written next to the input table, with the suffix swapped.
out_df.to_csv(
    compression_aware_with_suffix(phospho_pth, '.peptide_location.tsv.gz'),
    index=False,
    sep='\t',
)

In [13]:
# Preview the first rows of the exported table.
out_df.head(5)

Unnamed: 0,original_site_id,phosphosites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
0,NP_055955_S15s _1_0_15_16,S15,10,40,True,
1,NP_680780_T31t _1_0_31_32,T31,26,71,True,
2,NP_002511_S125s _1_1_125_125,S125,104,134,True,
3,NP_001116539_S84s _1_1_84_84,S84,68,97,True,
4,NP_001135452_S299s _1_1_299_299,S299,282,317,True,
