In [1]:
import os
import re
from pathlib import Path

import pandas as pd
from pysam import FastaFile

In [2]:
# Root of the local CPTAC data folder. Set the CPTAC_DATA_ROOT environment
# variable to run the notebook on another machine; the original location is
# kept as the fallback so existing runs behave exactly as before.
DATA_ROOT = Path(
    os.environ.get('CPTAC_DATA_ROOT')
    or '/Users/liang/Box/MyCPTAC/CPTAC_proteome_v3.0/'
)

# RefSeq protein FASTA used to look up protein sequences by accession
refseq_fasta_pth = DATA_ROOT / 'DCC/RefSeq_20180629/RefSeq.20180629_Human_ucsc_hg38_cpdbnr_mito_264contams.fasta.gz'

# Phosphoproteomics abundance table (one row per phosphopeptide entry)
phospho_pth = DATA_ROOT / 'CPTAC3/PDAC_discovery/phosphoproteomics_MultiSite_level_MD_abundance_tumor.cct.gz'

In [3]:
# Open the RefSeq protein FASTA with pysam; calc_peptide_start_end fetches
# protein sequences from this handle by protein ID.
# NOTE(review): pysam presumably needs the file to be bgzip-compressed and
# indexed (.fai/.gzi) for random access — confirm for this .gz file.
protein_fa = FastaFile(str(refseq_fasta_pth))

In [4]:
def calc_peptide_start_end(peptide, protein_id, sites, protein_fasta=None):
    """Locate a phosphopeptide within its protein sequence.

    The span is anchored on the first entry of ``sites``: the first lower-case
    occurrence of that residue in ``peptide`` is taken to sit at the 1-based
    protein position given by the site. The peptide start/end derived from
    that anchor are then verified against the protein sequence.

    Parameters
    ----------
    peptide : str
        Peptide sequence with the modified residues in lower case,
        e.g. ``'ELSGLGsALK'``.
    protein_id : str
        Name passed to ``fetch`` to retrieve the protein sequence.
    sites : list of str
        Site labels, each a residue letter followed by its 1-based position
        in the protein, e.g. ``['S489']``.
    protein_fasta : object with a ``fetch(name) -> str`` method, optional
        Sequence source. Defaults to the notebook-level ``protein_fa``.

    Returns
    -------
    list
        ``[peptide_start, peptide_end, is_valid, invalid_reason]``.
        Start and end are 1-based and inclusive. When the peptide cannot be
        placed they are ``None``, ``is_valid`` is ``False`` and
        ``invalid_reason`` is a short message; otherwise ``invalid_reason``
        is ``None``.
    """
    if protein_fasta is None:
        protein_fasta = protein_fa

    # Make sure the peptide is part of the protein sequence
    protein_seq = protein_fasta.fetch(protein_id)
    bare_peptide = peptide.upper()
    if bare_peptide not in protein_seq:
        return [None, None, False, 'Peptide not found in protein']

    # Without a site there is nothing to anchor the peptide on
    if len(sites) == 0:
        return [None, None, False, 'No phosphosite given']

    anchor_site = sites[0]
    anchor_res = anchor_site[0].lower()
    anchor_loc = int(anchor_site[1:])
    anchor_ix = peptide.find(anchor_res)
    if anchor_ix < 0:
        # A -1 offset would shift the span one residue past the site and could
        # validate a peptide that does not contain the site at all.
        return [None, None, False, 'Phosphosite residue not found in peptide']

    peptide_start = anchor_loc - anchor_ix
    peptide_end = peptide_start + len(peptide) - 1
    # Reject non-positive starts explicitly instead of relying on how negative
    # slice indices happen to behave.
    if peptide_start < 1 or protein_seq[peptide_start - 1:peptide_end] != bare_peptide:
        return [None, None, False, 'Invalid location']
    return [peptide_start, peptide_end, True, None]

## Phospho: locate each phosphopeptide in its RefSeq protein

In [5]:
# Load the tab-separated phosphoproteomics table straight from the .gz path
raw_phospho_df = pd.read_csv(phospho_pth, sep='\t')
# List the columns of the loaded table
raw_phospho_df.columns

Index(['Index', 'Gene', 'Peptide', 'C3N-03884', 'C3L-03123', 'C3L-01687',
       'C3L-00589', 'C3L-00599', 'C3L-01054', 'C3L-03356',
       ...
       'C3N-01375', 'C3N-01381', 'C3L-00401', 'C3L-02118', 'C3N-00511',
       'C3L-02613', 'C3N-00512', 'C3L-02899', 'C3N-03006', 'C3N-03069'],
      dtype='object', length=143)

In [6]:
# The Index string is '_'-delimited: its first two fields form the RefSeq
# protein accession (e.g. 'NP_000005.2') and the last column of the split
# holds the concatenated site labels (e.g. 'S366S367').
parsed_index_df = raw_phospho_df['Index'].str.split('_', expand=True)
accession_prefix = parsed_index_df[0]
accession_number = parsed_index_df[1]
site_field = parsed_index_df.iloc[:, -1]

raw_phospho_df['refseq_prot_id'] = accession_prefix + '_' + accession_number
# Rows whose Index has fewer fields have no value in the last column, so
# their phosphosites entry stays missing rather than becoming a list.
raw_phospho_df['phosphosites'] = site_field.str.findall(r'[A-Z]\d+')

In [7]:
# Preview the first rows; refseq_prot_id and phosphosites are the last two columns
raw_phospho_df.head()

Unnamed: 0,Index,Gene,Peptide,C3N-03884,C3L-03123,C3L-01687,C3L-00589,C3L-00599,C3L-01054,C3L-03356,...,C3L-00401,C3L-02118,C3N-00511,C3L-02613,C3N-00512,C3L-02899,C3N-03006,C3N-03069,refseq_prot_id,phosphosites
0,NP_000005.2_240_256_2_0,A2M,IITILEEEMNVSVCGLYTYGK,,,,,,,,...,,,,,,,,,NP_000005.2,
1,NP_000005.2_914_932_1_1_S928,A2M,ETTFNSLLCPSGGEVsEELSLK,,,,,,,,...,,,,,,,,,NP_000005.2,[S928]
2,NP_000009.1_485_489_1_1_S489,ACADVL,ELSGLGsALK,13.842216,13.643395,14.259207,13.972216,13.8341,,,...,,,,,,,,,NP_000009.1,[S489]
3,NP_000010.1_167_170_1_1_Y170,ACAT1,GSTPyGGVK,,,,,,,,...,,,,,,,,,NP_000010.1,[Y170]
4,NP_000011.2_155_161_1_1_S160,ACVRL1,GLHSELGEsSLILK,,,,,,,,...,,,18.011339,18.828547,18.324068,18.500011,18.10603,18.594276,NP_000011.2,[S160]


In [8]:
# Drop the peptides without any phosphosite and keep only the annotation
# columns (the per-sample abundance columns are not needed here).
has_sites = raw_phospho_df['phosphosites'].notnull()
peptide_df = (
    raw_phospho_df
    .loc[has_sites, ['Index', 'Gene', 'refseq_prot_id', 'Peptide', 'phosphosites']]
    .rename(columns={'Index': 'original_id', 'Peptide': 'peptide'})
)
peptide_df.head(10)

Unnamed: 0,original_id,Gene,refseq_prot_id,peptide,phosphosites
1,NP_000005.2_914_932_1_1_S928,A2M,NP_000005.2,ETTFNSLLCPSGGEVsEELSLK,[S928]
2,NP_000009.1_485_489_1_1_S489,ACADVL,NP_000009.1,ELSGLGsALK,[S489]
3,NP_000010.1_167_170_1_1_Y170,ACAT1,NP_000010.1,GSTPyGGVK,[Y170]
4,NP_000011.2_155_161_1_1_S160,ACVRL1,NP_000011.2,GLHSELGEsSLILK,[S160]
5,NP_000011.2_155_161_1_1_S161,ACVRL1,NP_000011.2,GLHSELGESsLILK,[S161]
6,NP_000011.2_493_495_1_1_S495,ACVRL1,NP_000011.2,ISNsPEKPK,[S495]
8,NP_000012.1_365_367_1_1_S366,PSEN1,NP_000012.1,AAVQELSsSILAGEDPEER,[S366]
9,NP_000012.1_365_367_1_1_S367,PSEN1,NP_000012.1,AAVQELSSsILAGEDPEER,[S367]
10,NP_000012.1_365_367_2_2_S366S367,PSEN1,NP_000012.1,AAVQELSssILAGEDPEER,"[S366, S367]"
12,NP_000012.1_43_59_1_1_S43,PSEN1,NP_000012.1,sLGHPEPLSNGRPQGNSR,[S43]


In [9]:
# Check what residues are modified: one row per site, then count the leading
# residue letter. The .str accessor is vectorised and leaves the NaN produced
# by exploding an empty site list as NaN (value_counts ignores it) instead of
# raising as a plain Python lambda would.
peptide_df['phosphosites'].explode().str[0].value_counts()

S    68096
T    10581
Y     1170
Name: phosphosites, dtype: int64

In [10]:
# Keep only STY that are phosphosites
def _only_sty(site_list):
    """Return the site labels whose residue letter is S, T or Y."""
    return [site for site in site_list if site[0] in 'STY']


peptide_df['phosphosites'] = peptide_df['phosphosites'].apply(_only_sty)

In [11]:
# Calculate the peptide location: one result row per peptide, aligned on the
# index of peptide_df.
loc_columns = ['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason']
loc_records = [
    calc_peptide_start_end(pep, prot_id, site_list)
    for pep, prot_id, site_list in zip(
        peptide_df.peptide, peptide_df.refseq_prot_id, peptide_df.phosphosites
    )
]
peptide_loc_range_df = pd.DataFrame(loc_records, index=peptide_df.index, columns=loc_columns)
# Nullable integers keep unplaced peptides as missing values
peptide_loc_range_df = peptide_loc_range_df.astype({
    'peptide_start': 'Int64',
    'peptide_end': 'Int64',
})

In [12]:
# Put the peptide annotations and their computed locations side by side
peptide_with_loc_df = pd.concat([peptide_df, peptide_loc_range_df], axis=1)

# Show the peptides whose location could not be confirmed, with the reason
invalid_rows = ~peptide_loc_range_df['peptide_loc_valid']
report_columns = ['refseq_prot_id', 'phosphosites', 'peptide', 'peptide_loc_invalid_reason']
peptide_with_loc_df.loc[invalid_rows, report_columns]

Unnamed: 0,refseq_prot_id,phosphosites,peptide,peptide_loc_invalid_reason
18507,NP_001159477.1,[S1],sQEELKK,Peptide not found in protein
28180,NP_001269877.1,[S1],sPPPETLK,Peptide not found in protein
30761,NP_001278928.1,[T2],DtLGLGK,Peptide not found in protein
33548,NP_001299602.1,[S6],MLQALsPK,Peptide not found in protein
33773,NP_001304028.1,[S3],GAsDEDLKK,Peptide not found in protein
34757,NP_001307374.1,[S1],sSLLNAK,Peptide not found in protein
34758,NP_001307374.1,[S2],SsLLNAK,Peptide not found in protein
35282,NP_001308037.1,[S6],IPIVRsFADIGK,Peptide not found in protein
35434,NP_001308400.1,"[S1, S3, S5]",sRsRsISR,Peptide not found in protein
35760,NP_001308849.1,[S3],APsLQAK,Peptide not found in protein


In [13]:
# Select the output columns and flatten each site list into a ';'-joined string
output_columns = [
    'original_id', 'refseq_prot_id', 'peptide', 'phosphosites',
    'peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason',
]
out_df = peptide_with_loc_df[output_columns].assign(
    phosphosites=lambda frame: frame['phosphosites'].str.join(';')
)

# Write next to the input table; with_suffix swaps only the trailing '.gz',
# so the output name keeps the '.cct' part of the input name.
output_pth = phospho_pth.with_suffix('.peptide_location.tsv.gz')
out_df.to_csv(output_pth, sep='\t', index=False)

In [14]:
# Preview the table that was written, with phosphosites as ';'-joined strings
out_df.head()

Unnamed: 0,original_id,refseq_prot_id,peptide,phosphosites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
1,NP_000005.2_914_932_1_1_S928,NP_000005.2,ETTFNSLLCPSGGEVsEELSLK,S928,913,934,True,
2,NP_000009.1_485_489_1_1_S489,NP_000009.1,ELSGLGsALK,S489,483,492,True,
3,NP_000010.1_167_170_1_1_Y170,NP_000010.1,GSTPyGGVK,Y170,166,174,True,
4,NP_000011.2_155_161_1_1_S160,NP_000011.2,GLHSELGEsSLILK,S160,152,165,True,
5,NP_000011.2_155_161_1_1_S161,NP_000011.2,GLHSELGESsLILK,S161,152,165,True,
