In [1]:
import os
import re
from pathlib import Path

import pandas as pd
from pysam import FastaFile

In [2]:
# Root of the CPTAC proteome data freeze. The default is the original author's
# local Box folder; set the CPTAC_DATA_ROOT environment variable to point the
# notebook at a copy stored elsewhere without editing this cell.
DATA_ROOT = Path(
    os.environ.get('CPTAC_DATA_ROOT', '/Users/liang/Box/MyCPTAC/CPTAC_proteome_v3.1/')
)

# RefSeq protein FASTA whose sequences are used to locate each peptide.
refseq_fasta_pth = DATA_ROOT / 'DCC/RefSeq_20180629/RefSeq.20180629_Human_ucsc_hg38_cpdbnr_mito_264contams.fasta.gz'

# Phosphosite-level table (one row per site/peptide, one column per sample).
phospho_pth = DATA_ROOT / 'CPTAC3/HOPEAYA_discovery/cptac_hope_aya_mssm_phospho_d6_clean.tsv.gz'

In [3]:
# Open the RefSeq protein FASTA with pysam; the Path is converted to str before
# being handed to FastaFile. The resulting handle is read as a global by
# calc_peptide_start_end, which fetches sequences from it by protein id.
protein_fa = FastaFile(str(refseq_fasta_pth))

In [4]:
def calc_peptide_start_end(peptide, protein_id, sites):
    """Locate a phosphopeptide on its protein using one phosphosite as anchor.

    The protein sequence is fetched from the module-level ``protein_fa`` handle.
    The first entry of ``sites`` supplies a residue letter and a 1-based protein
    position; every lowercase occurrence of that residue in ``peptide`` is tried
    as the anchor, and the first placement whose protein slice equals the
    upper-cased peptide is returned.

    Parameters
    ----------
    peptide : str
        Peptide sequence in which modified residues are written in lowercase.
    protein_id : str
        Sequence identifier passed to ``protein_fa.fetch``.
    sites : list of str
        Site labels made of a residue letter followed by a 1-based position
        (for example ``'S541'``). Only ``sites[0]`` is used.

    Returns
    -------
    list
        ``[peptide_start, peptide_end, valid, invalid_reason]``. Coordinates are
        1-based and inclusive. On failure both coordinates are ``None``,
        ``valid`` is ``False`` and ``invalid_reason`` explains why; on success
        ``invalid_reason`` is ``None``.
    """
    # Make sure the peptide is part of the protein sequence
    protein_seq = protein_fa.fetch(protein_id)
    peptide_seq = peptide.upper()
    if peptide_seq not in protein_seq:
        return [None, None, False, 'Peptide not found in protein']

    # Without a site there is nothing to anchor on (previously an IndexError).
    if not sites:
        return [None, None, False, 'No phosphosite to anchor peptide']

    anchor_res = sites[0][0].lower()
    anchor_loc = int(sites[0][1:])

    # A peptide can carry several modified copies of the same residue, and
    # sites[0] is not guaranteed to be the first of them, so try each lowercase
    # occurrence in turn rather than only the first one.
    anchor_ix = peptide.find(anchor_res)
    while anchor_ix != -1:
        peptide_start = anchor_loc - anchor_ix
        peptide_end = peptide_start + len(peptide) - 1
        # peptide_start < 1 would make the slice start negative and wrap around.
        if peptide_start >= 1 and protein_seq[peptide_start - 1:peptide_end] == peptide_seq:
            return [peptide_start, peptide_end, True, None]
        anchor_ix = peptide.find(anchor_res, anchor_ix + 1)

    return [None, None, False, 'Invalid location']

## Phospho

In [5]:
# Read the tab-separated phosphosite table and show its column labels.
raw_phospho_df = pd.read_csv(phospho_pth, sep='\t')
raw_phospho_df.columns

Index(['site', 'gene', 'refseq_id', 'peptide', '7316-288-TISS [894482]',
       '7316-2660-TISS [894444]...2', '7316-89-TISS [894435]',
       '7316-2536-TISS [894447]', '7316-2176-TISS [894450]',
       '7316UP-904-TISS [900638]',
       ...
       '7316-3935-TISS [894427]', '7316-204-TISS [894456]',
       '7316-2146-TISS [894407]', '7316-2594-TISS [862600]',
       '7316-3303-TISS [894406]', '7316-1106-TISS [894468]',
       '7316-2751-TISS [894442]', '7316-287-TISS [894483]',
       '7316-1763-TISS [894464]', '7316-1455-TISS [894467]'],
      dtype='object', length=104)

In [6]:
# Preview the first five rows of the raw table.
raw_phospho_df.head(n=5)

Unnamed: 0,site,gene,refseq_id,peptide,7316-288-TISS [894482],7316-2660-TISS [894444]...2,7316-89-TISS [894435],7316-2536-TISS [894447],7316-2176-TISS [894450],7316UP-904-TISS [900638],...,7316-3935-TISS [894427],7316-204-TISS [894456],7316-2146-TISS [894407],7316-2594-TISS [862600],7316-3303-TISS [894406],7316-1106-TISS [894468],7316-2751-TISS [894442],7316-287-TISS [894483],7316-1763-TISS [894464],7316-1455-TISS [894467]
0,AAAS-S541s,AAAS,NP_056480.1,AQEPPAGGGGSIHDLPLFTETSPTSAPWDPLPGPPPVLPHS*PHSHL,0.678,0.926,1.31,,2.0,,...,,,,,,,,,,
1,AAAS-S495s,AAAS,NP_056480.1,FS*PVLGR,-0.276214,-0.557244,-1.48755,1.30046,-0.770439,,...,0.617376,-0.527861,-0.725937,-0.675518,0.65339,-1.273346,-0.239751,-0.673117,0.821454,-0.084892
2,AAAS-S495s,AAAS,NP_056480.1,IAHIPLYFVNAQFPRFS*PVLGR,,,,,,,...,0.562537,-0.150395,-0.612625,-0.601853,1.270572,0.099327,0.539033,1.133469,0.005314,-0.466709
3,AAED1-S12s,AAED1,NP_714542.1,QVS*GAAALVPAPSGPDSGQPLAAAVAELPVLDAR,,,,,,,...,-0.067878,-0.645812,-0.529422,-0.249285,0.036471,0.785377,-0.221191,0.60076,-0.297446,-0.282195
4,AAGAB-S311s,AAGAB,NP_078942.3,AFWMAIGGDRDEIEGLSS*DEEH,,,,,,,...,-0.336099,-0.310628,0.45349,0.030441,-0.134703,-0.160766,0.526348,-0.150697,-0.642931,0.439274


In [7]:
# Work on an explicit copy of the annotation columns so the assignments below
# never write into (or warn about) a view of raw_phospho_df.
peptide_df = raw_phospho_df.loc[:, ['gene', 'refseq_id', 'peptide', 'site']].copy()

# Keep the peptide exactly as written in the input as an identifier.
peptide_df['original_id'] = peptide_df['peptide']

# Rewrite "S*", "T*", "Y*" as lowercase "s", "t", "y".
# regex=True is required: a callable replacement is rejected when the pattern
# is treated literally, which is the default in recent pandas releases.
peptide_df['peptide'] = peptide_df['peptide'].str.replace(
    r'([STY])\*', lambda m: m.group(1).lower(), regex=True
)

# Take everything after the first '-' of the site label and collect every
# "<uppercase letter><digits><lowercase letter>" token from it.
# n is passed by keyword because it is keyword-only in recent pandas releases.
peptide_df['phosphosites'] = (
    peptide_df['site']
    .str.split('-', n=1, expand=True)
    .iloc[:, 1]
    .str.findall(r'[A-Z]\d+[a-z]')
)
peptide_df.head(10)

Unnamed: 0,gene,refseq_id,peptide,site,original_id,phosphosites
0,AAAS,NP_056480.1,AQEPPAGGGGSIHDLPLFTETSPTSAPWDPLPGPPPVLPHsPHSHL,AAAS-S541s,AQEPPAGGGGSIHDLPLFTETSPTSAPWDPLPGPPPVLPHS*PHSHL,[S541s]
1,AAAS,NP_056480.1,FsPVLGR,AAAS-S495s,FS*PVLGR,[S495s]
2,AAAS,NP_056480.1,IAHIPLYFVNAQFPRFsPVLGR,AAAS-S495s,IAHIPLYFVNAQFPRFS*PVLGR,[S495s]
3,AAED1,NP_714542.1,QVsGAAALVPAPSGPDSGQPLAAAVAELPVLDAR,AAED1-S12s,QVS*GAAALVPAPSGPDSGQPLAAAVAELPVLDAR,[S12s]
4,AAGAB,NP_078942.3,AFWMAIGGDRDEIEGLSsDEEH,AAGAB-S311s,AFWMAIGGDRDEIEGLSS*DEEH,[S311s]
5,AAGAB,NP_078942.3,NDRNQGFSLLNSLTGTNHSIGSADPCHPEQPHLPAADsTESLSDHR,AAGAB-S215s,NDRNQGFSLLNSLTGTNHSIGSADPCHPEQPHLPAADS*TESLSDHR,[S215s]
6,AAGAB,NP_078942.3,NQGFSLLNSLTGTNHSIGsADPCHPEQPHLPAADSTESLSDHR,AAGAB-S199s,NQGFSLLNSLTGTNHSIGS*ADPCHPEQPHLPAADSTESLSDHR,[S199s]
7,AAGAB,NP_078942.3,NQGFSLLNSLTGTNHSIGSADPCHPEQPHLPAADsTESLSDHR,AAGAB-S215s,NQGFSLLNSLTGTNHSIGSADPCHPEQPHLPAADS*TESLSDHR,[S215s]
8,AAGAB,NP_078942.3,NQGFSLLNSLTGTNHSIGSADPCHPEQPHLPAADStESLSDHR,AAGAB-T216t,NQGFSLLNSLTGTNHSIGSADPCHPEQPHLPAADST*ESLSDHR,[T216t]
9,AAK1,NP_055726.3,ADVAVESLIPGLEPPVPQRLPSQtESVTSNR,AAK1-T848t,ADVAVESLIPGLEPPVPQRLPSQT*ESVTSNR,[T848t]


In [8]:
# Tally the residue letter (first character) of every extracted site token
(
    peptide_df['phosphosites']
    .explode()
    .map(lambda site: site[0])
    .value_counts()
)

S    85708
T    20574
Y     3886
C        1
Name: phosphosites, dtype: int64

In [9]:
# Keep only STY that are phosphosites
# The trailing lowercase letter of each token is dropped ('S541s' -> 'S541').
# The strip is conditional so that re-running this cell leaves already-cleaned
# tokens untouched instead of cutting the last digit off the position.
peptide_df['phosphosites'] = peptide_df['phosphosites'].apply(
    lambda sites: [
        site[:-1] if site[-1].islower() else site
        for site in sites
        if site[0] in 'STY'
    ]
)

In [10]:
# Locate every peptide on its protein, one result row per peptide_df row.
# Start/end use the nullable integer dtype so missing locations stay integer-typed.
peptide_loc_range_df = pd.DataFrame(
    [
        calc_peptide_start_end(pep, refseq_id, site_list)
        for pep, refseq_id, site_list in zip(
            peptide_df['peptide'], peptide_df['refseq_id'], peptide_df['phosphosites']
        )
    ],
    index=peptide_df.index,
    columns=['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
).astype(
    dict.fromkeys(['peptide_start', 'peptide_end'], pd.Int64Dtype())
)

In [11]:
# Put the peptide annotations and their computed locations side by side
peptide_with_loc_df = pd.concat([peptide_df, peptide_loc_range_df], axis=1)
# Show the peptides whose location could not be validated
invalid_report_cols = ['refseq_id', 'phosphosites', 'peptide', 'peptide_loc_invalid_reason']
peptide_with_loc_df.loc[~peptide_loc_range_df['peptide_loc_valid'], invalid_report_cols]

Unnamed: 0,refseq_id,phosphosites,peptide,peptide_loc_invalid_reason


In [12]:
# Select the columns to export and flatten each site list into a ';'-joined string.
out_df = (
    peptide_with_loc_df
    .loc[
        :,
        ['original_id', 'peptide', 'phosphosites', 'peptide_start', 'peptide_end',
         'peptide_loc_valid', 'peptide_loc_invalid_reason'],
    ]
    .assign(phosphosites=lambda df: df['phosphosites'].str.join(';'))
)
# Write the table next to the input file as tab-separated text.
# NOTE(review): with_suffix only swaps the final '.gz', so the output name keeps
# the input's '.tsv' in front of '.peptide_location.tsv.gz' — confirm intended.
out_df.to_csv(
    phospho_pth.with_suffix('.peptide_location.tsv.gz'),
    index=False,
    sep='\t',
)

In [13]:
# Preview the first five rows of the exported table.
out_df.head(n=5)

Unnamed: 0,original_id,peptide,phosphosites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
0,AQEPPAGGGGSIHDLPLFTETSPTSAPWDPLPGPPPVLPHS*PHSHL,AQEPPAGGGGSIHDLPLFTETSPTSAPWDPLPGPPPVLPHsPHSHL,S541,501,546,True,
1,FS*PVLGR,FsPVLGR,S495,494,500,True,
2,IAHIPLYFVNAQFPRFS*PVLGR,IAHIPLYFVNAQFPRFsPVLGR,S495,479,500,True,
3,QVS*GAAALVPAPSGPDSGQPLAAAVAELPVLDAR,QVsGAAALVPAPSGPDSGQPLAAAVAELPVLDAR,S12,10,43,True,
4,AFWMAIGGDRDEIEGLSS*DEEH,AFWMAIGGDRDEIEGLSsDEEH,S311,294,315,True,
