In [1]:
import os
import re
from pathlib import Path

import pandas as pd
from pysam import FastaFile

In [2]:
# Root folder of the local data. Set the CPTAC_DATA_ROOT environment variable to
# run the notebook on another machine; the original location stays the default.
DATA_ROOT = Path(os.environ.get('CPTAC_DATA_ROOT', '/Users/liang/Box/MyCPTAC/CPTAC_proteome_v1/'))

# Protein FASTA in which the peptides' protein IDs are looked up
refseq_fasta_pth = DATA_ROOT / 'DCC/RefSeq_20180629/RefSeq.20180629_Human_ucsc_hg38_cpdbnr_mito_264contams.fasta.gz'

# Phospho abundance table whose peptides are located on their proteins
phospho_pth = DATA_ROOT / 'CPTAC3/CCRCC_discovery/6_CPTAC3_CCRCC_Phospho_abundance_phosphosite_protNorm=2_CB.tsv.gz'

In [3]:
# Open the protein FASTA; the path is handed over as a plain string
refseq_fasta_str = str(refseq_fasta_pth)
protein_fa = FastaFile(refseq_fasta_str)

In [4]:
def calc_peptide_start_end(peptide, protein_id, sites, fasta=None):
    """Locate a phosphopeptide on its protein and validate the location.

    The first entry of ``sites`` anchors the peptide: its residue letter is
    searched (in lowercase) in ``peptide`` and its position is used to derive
    where the peptide starts and ends on the protein.

    Parameters
    ----------
    peptide : str
        Peptide sequence in which modified residues are written in lowercase.
    protein_id : str
        Identifier passed to ``fasta.fetch`` to get the protein sequence.
    sites : list of str
        Site labels made of a residue letter followed by its position on the
        protein (e.g. ``'S710'``). Only the first one is used.
    fasta : object with a ``fetch(protein_id)`` method, optional
        Source of protein sequences. Defaults to the notebook-level
        ``protein_fa`` handle.

    Returns
    -------
    list
        ``[peptide_start, peptide_end, is_valid, invalid_reason]``. Start and
        end are 1-based and inclusive; they are ``None`` (with ``is_valid``
        False and a reason string) when the peptide cannot be located.
    """
    if fasta is None:
        # Fall back to the FASTA handle opened at the top of the notebook
        fasta = protein_fa

    # Make sure the peptide is part of the protein sequence
    protein_seq = fasta.fetch(protein_id)
    peptide_seq = peptide.upper()
    if peptide_seq not in protein_seq:
        return [None, None, False, 'Peptide not found in protein']

    # Without any site there is nothing to anchor the peptide on
    if not sites:
        return [None, None, False, 'No phosphosite given']

    first_res = sites[0][0].lower()
    first_res_loc = int(sites[0][1:])
    first_res_ix = peptide.find(first_res)
    if first_res_ix < 0:
        # The peptide has no lowercase residue of that kind; using -1 as an
        # offset would shift the peptide and could validate a wrong location
        return [None, None, False, 'Invalid location']

    peptide_start = first_res_loc - first_res_ix
    peptide_end = peptide_start + len(peptide) - 1
    if peptide_start < 1:
        # A non-positive start would turn into a negative slice index below
        return [None, None, False, 'Invalid location']
    if protein_seq[peptide_start - 1:peptide_end] != peptide_seq:
        return [None, None, False, 'Invalid location']
    return [peptide_start, peptide_end, True, None]

## Phospho

In [5]:
# Load the tab-separated phospho table and list its columns
raw_phospho_df = pd.read_csv(phospho_pth, sep='\t')
raw_phospho_df.columns

Index(['Index', 'Gene', 'Peptide', 'ReferenceIntensity', 'CPT0079430001',
       'CPT0023360001', 'CPT0023350003', 'CPT0079410003', 'CPT0087040003',
       'CPT0077310003',
       ...
       'CPT0012080003', 'CPT0021240003', 'CPT0009020003', 'CPT0017450001',
       'CPT0009060003', 'CPT0012900004', 'CPT0017410003', 'CPT0009080003',
       'CPT0012920003', 'CPT0009000003'],
      dtype='object', length=211)

In [6]:
# Break the '_'-separated 'Index' value into one column per field
parsed_index_df = raw_phospho_df['Index'].str.split('_', expand=True)

# The first two fields, glued back with '_', form the protein ID (e.g. NP_000005.2)
protein_id_prefix = parsed_index_df[0]
protein_id_suffix = parsed_index_df[1]
raw_phospho_df['refseq_prot_id'] = protein_id_prefix + '_' + protein_id_suffix

# The right-most field holds the concatenated sites (e.g. S879T885S889);
# split it into one residue+position token per site
last_field = parsed_index_df[parsed_index_df.columns[-1]]
raw_phospho_df['phosphosites'] = last_field.str.findall(r'[A-Z]\d+')

In [7]:
# Preview the table with the two parsed columns appended
raw_phospho_df.head(n=5)

Unnamed: 0,Index,Gene,Peptide,ReferenceIntensity,CPT0079430001,CPT0023360001,CPT0023350003,CPT0079410003,CPT0087040003,CPT0077310003,...,CPT0009020003,CPT0017450001,CPT0009060003,CPT0012900004,CPT0017410003,CPT0009080003,CPT0012920003,CPT0009000003,refseq_prot_id,phosphosites
0,NP_000005.2_708_710_1_1_S710,A2M,VGFYEsDVMGR,17.256343,,,,,,,...,,,,,,,,,NP_000005.2,[S710]
1,NP_000005.2_864_889_3_3_S879T885S889,A2M,SLGNVNFTVSAEALEsQELCGtEVPsVPEHGR,17.412773,,,,,,,...,,,,,,,,,NP_000005.2,"[S879, T885, S889]"
2,NP_000005.2_914_932_1_1_S928,A2M,ETTFNSLLCPSGGEVsEELSLK,15.544481,,,,,,,...,,,,,,,,,NP_000005.2,[S928]
3,NP_000007.1_145_167_1_0,ACADM,YLGRMTEEPLMCAYCVTEPGAGSDVAGIK,14.825694,,,,,,,...,15.231622,15.312459,14.805629,15.146247,14.801663,15.203983,15.225042,14.651588,NP_000007.1,
4,NP_000007.1_220_228_1_1_T228,ACADM,AFTGFIVEADtPGIQIGR,19.798011,,,,,,,...,,,,,,,,,NP_000007.1,[T228]


In [8]:
# Rows for which no phosphosite could be parsed are left out
has_phosphosites = raw_phospho_df['phosphosites'].notnull()
peptide_df = (
    raw_phospho_df
    .loc[has_phosphosites, ['Index', 'Gene', 'refseq_prot_id', 'Peptide', 'phosphosites']]
    .rename(columns={'Index': 'original_id', 'Peptide': 'peptide'})
)
peptide_df.head(10)

Unnamed: 0,original_id,Gene,refseq_prot_id,peptide,phosphosites
0,NP_000005.2_708_710_1_1_S710,A2M,NP_000005.2,VGFYEsDVMGR,[S710]
1,NP_000005.2_864_889_3_3_S879T885S889,A2M,NP_000005.2,SLGNVNFTVSAEALEsQELCGtEVPsVPEHGR,"[S879, T885, S889]"
2,NP_000005.2_914_932_1_1_S928,A2M,NP_000005.2,ETTFNSLLCPSGGEVsEELSLK,[S928]
4,NP_000007.1_220_228_1_1_T228,ACADM,NP_000007.1,AFTGFIVEADtPGIQIGR,[T228]
5,NP_000007.1_351_355_1_1_T351,ACADM,NP_000007.1,RNtYYASIAK,[T351]
6,NP_000009.1_485_489_1_1_S485,ACADVL,NP_000009.1,ELsGLGSALK,[S485]
7,NP_000009.1_485_489_1_1_S489,ACADVL,NP_000009.1,ELSGLGsALK,[S489]
8,NP_000009.1_517_530_1_1_S522,ACADVL,NP_000009.1,AGLGSGLSLsGLVHPELSR,[S522]
9,NP_000009.1_52_61_1_1_S54,ACADVL,NP_000009.1,SDsHPSDALTR,[S54]
10,NP_000009.1_52_61_1_1_S57,ACADVL,NP_000009.1,SDSHPsDALTR,[S57]


In [9]:
# Check what residues are modified
# `.str[0]` takes the residue letter of every site and passes through the NaN
# that `explode` yields for an empty site list, where `x[0]` in a lambda would raise
peptide_df['phosphosites'].explode().str[0].value_counts()

S    79960
T    20064
Y     3866
Name: phosphosites, dtype: int64

In [10]:
def _sty_sites_only(site_list):
    """Return the sites of ``site_list`` whose residue letter is S, T or Y."""
    kept = []
    for site in site_list:
        if site[0] in 'STY':
            kept.append(site)
    return kept


# Restrict every peptide's site list to S/T/Y residues
peptide_df['phosphosites'] = peptide_df['phosphosites'].apply(_sty_sites_only)

In [11]:
# Locate every peptide on its protein: one [start, end, valid, reason] row per peptide
peptide_loc_rows = [
    calc_peptide_start_end(peptide_seq, prot_id, site_list)
    for peptide_seq, prot_id, site_list in zip(
        peptide_df.peptide, peptide_df.refseq_prot_id, peptide_df.phosphosites
    )
]
peptide_loc_range_df = pd.DataFrame(
    peptide_loc_rows,
    index=peptide_df.index,
    columns=['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
)
# Nullable integers keep the coordinates integral even where they are missing
nullable_int_columns = {
    'peptide_start': pd.Int64Dtype(),
    'peptide_end': pd.Int64Dtype(),
}
peptide_loc_range_df = peptide_loc_range_df.astype(nullable_int_columns)

In [12]:
# Put the peptide information and its computed location side by side
peptide_with_loc_df = pd.concat((peptide_df, peptide_loc_range_df), axis=1)

# Show the peptides whose location could not be validated
invalid_loc_mask = ~peptide_loc_range_df['peptide_loc_valid']
invalid_report_columns = ['refseq_prot_id', 'phosphosites', 'peptide', 'peptide_loc_invalid_reason']
peptide_with_loc_df.loc[invalid_loc_mask, invalid_report_columns]

Unnamed: 0,refseq_prot_id,phosphosites,peptide,peptide_loc_invalid_reason
524,NP_000145.1,[S4],VIVGsPR,Peptide not found in protein
4255,NP_001005360.1,[S6],LQSQLLsIEK,Peptide not found in protein
10795,NP_001073998.2,[S5],LLPPAsP,Peptide not found in protein
18984,NP_001137357.1,[S3],TPTsPLK,Peptide not found in protein
22438,NP_001157973.1,[S6],EENLISs,Peptide not found in protein
30655,NP_001240764.1,[S5],ALTQVsK,Peptide not found in protein
36739,NP_001274522.1,[S1],IsNAYAR,Peptide not found in protein
41694,NP_001299602.1,[S5],MLQALsPK,Peptide not found in protein
43969,NP_001308037.1,[S5],IPIVRsFADIGK,Peptide not found in protein
51933,NP_001673.2,[S12],SATSSSPGSPIHsLETSL,Peptide not found in protein


In [13]:
# Columns to export; each site list is flattened to a ';'-separated string
output_columns = [
    'original_id', 'refseq_prot_id', 'peptide', 'phosphosites',
    'peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason',
]
out_df = (
    peptide_with_loc_df
    .loc[:, output_columns]
    .assign(phosphosites=lambda df: df['phosphosites'].str.join(';'))
)

# NOTE(review): with_suffix only swaps the final '.gz', so the file name ends in
# '.tsv.peptide_location.tsv.gz' - confirm the doubled '.tsv' is intended
output_pth = phospho_pth.with_suffix('.peptide_location.tsv.gz')
out_df.to_csv(output_pth, sep='\t', index=False)

In [14]:
# Preview the exported table
out_df.head(n=5)

Unnamed: 0,original_id,refseq_prot_id,peptide,phosphosites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
0,NP_000005.2_708_710_1_1_S710,NP_000005.2,VGFYEsDVMGR,S710,705,715,True,
1,NP_000005.2_864_889_3_3_S879T885S889,NP_000005.2,SLGNVNFTVSAEALEsQELCGtEVPsVPEHGR,S879;T885;S889,864,895,True,
2,NP_000005.2_914_932_1_1_S928,NP_000005.2,ETTFNSLLCPSGGEVsEELSLK,S928,913,934,True,
4,NP_000007.1_220_228_1_1_T228,NP_000007.1,AFTGFIVEADtPGIQIGR,T228,218,235,True,
5,NP_000007.1_351_355_1_1_T351,NP_000007.1,RNtYYASIAK,T351,349,358,True,
