In [1]:
import os
import re
from pathlib import Path

import pandas as pd
from pysam import FastaFile

In [2]:
# Root of the CPTAC proteome data tree. The default is the original local Box
# folder; set the CPTAC_DATA_ROOT environment variable to point the notebook at
# another copy of the data without editing this cell.
DATA_ROOT = Path(os.environ.get('CPTAC_DATA_ROOT', '/Users/liang/Box/MyCPTAC/CPTAC_proteome_v3.0/'))

# RefSeq protein FASTA (.fasta.gz) used to look up protein sequences by ID
refseq_fasta_pth = DATA_ROOT / 'DCC/RefSeq_20180629/RefSeq.20180629_Human_ucsc_hg38_cpdbnr_mito_264contams.fasta.gz'

# Phospho table (.tsv) whose peptides are located on the protein sequences
phospho_pth = DATA_ROOT / 'CPTAC3/PBTA_discovery/ProcessedPhosphoData/phospho_tumorall_nofilter_imputedA_03162020.tsv'

In [3]:
# Open the RefSeq protein FASTA. The Path is converted to a plain string before
# being handed to pysam's FastaFile; sequences are later read from this handle
# with `protein_fa.fetch(protein_id)`.
protein_fa = FastaFile(str(refseq_fasta_pth))

In [4]:
def calc_peptide_start_end(peptide, protein_id, sites):
    """Locate a phosphopeptide on its protein sequence.

    The peptide is anchored on the protein through its first listed
    phosphosite: the site's residue letter is searched for (in lowercase) in
    the peptide, and the site's protein position minus that offset gives the
    peptide start. The resulting range is then verified against the protein
    sequence fetched from the module-level ``protein_fa`` handle.

    Parameters
    ----------
    peptide : str
        Peptide sequence in which modified residues are written in lowercase
        (e.g. ``'SsPAFGDRR'``).
    protein_id : str
        Sequence ID passed to ``protein_fa.fetch``. Any exception raised by
        that lookup propagates to the caller.
    sites : list of str
        Phosphosites as residue letter + 1-based protein position
        (e.g. ``['S215']``). Only the first entry is used as the anchor.

    Returns
    -------
    list
        ``[peptide_start, peptide_end, valid, invalid_reason]`` where start and
        end are 1-based inclusive protein coordinates. On failure start and end
        are ``None``, ``valid`` is ``False`` and ``invalid_reason`` is one of
        ``'Peptide not found in protein'``, ``'No phosphosites'`` or
        ``'Invalid location'``.
    """
    protein_seq = protein_fa.fetch(protein_id)
    peptide_upper = peptide.upper()

    # Make sure the peptide is part of the protein sequence
    if peptide_upper not in protein_seq:
        return [None, None, False, 'Peptide not found in protein']

    # Without a site there is nothing to anchor the peptide on
    if not sites:
        return [None, None, False, 'No phosphosites']

    first_res = sites[0][0].lower()
    first_res_loc = int(sites[0][1:])

    # Try every lowercase occurrence of the anchor residue, not only the first
    # one: the anchor site is not guaranteed to be the left-most modified
    # residue of its kind. If the residue is absent the loop body never runs,
    # so a "not found" index can no longer be misused as an offset.
    for first_res_ix, residue in enumerate(peptide):
        if residue != first_res:
            continue
        peptide_start = first_res_loc - first_res_ix
        if peptide_start < 1:
            # Would start before the protein; a negative slice index would
            # silently wrap around to the end of the sequence.
            continue
        peptide_end = peptide_start + len(peptide) - 1
        if protein_seq[peptide_start - 1:peptide_end] == peptide_upper:
            return [peptide_start, peptide_end, True, None]

    return [None, None, False, 'Invalid location']

## Phospho

In [5]:
# Load the tab-separated phospho table and list its column labels
raw_phospho_df = pd.read_csv(phospho_pth, sep='\t')
raw_phospho_df.columns

Index(['Index', 'gene_Symbol', 'Prot_Desc', 'X7316.1781', 'X7316.1790',
       'X7316.878', 'X7316.2181', 'X7316.2141', 'X7316.918', 'X7316.479',
       ...
       'X7316.407', 'X7316.3025', 'X7316.347', 'X7316.178', 'X7316.302',
       'X7316.2986', 'X7316.109', 'X7316.897', 'X7316.153', 'X7316.38'],
      dtype='object', length=220)

In [6]:
# 'Index' is a '_'-separated identifier such as 'NP_000022.3_214_215_1_1_S215'.
# Its first two fields re-joined give the RefSeq protein ID and its last field
# holds the phosphosite(s); the fields in between are not used here.
parsed_index_df = raw_phospho_df['Index'].str.split('_', expand=True)
raw_phospho_df['refseq_prot_id'] = parsed_index_df[0] + '_' + parsed_index_df[1]
# Take the last field of each row individually: the last column of the expanded
# split is None for any row that has fewer fields than the longest row.
raw_phospho_df['phosphosites'] = (
    raw_phospho_df['Index']
    .str.rsplit('_', n=1)
    .str[-1]
    .str.findall(r'[A-Z]\d+')
)

In [7]:
# Preview the first rows, including the two newly parsed columns
raw_phospho_df.head(n=5)

Unnamed: 0,Index,gene_Symbol,Prot_Desc,X7316.1781,X7316.1790,X7316.878,X7316.2181,X7316.2141,X7316.918,X7316.479,...,X7316.347,X7316.178,X7316.302,X7316.2986,X7316.109,X7316.897,X7316.153,X7316.38,refseq_prot_id,phosphosites
0,NP_000022.3_214_215_1_1_S215,ALAD,SsPAFGDRR,-0.696372,-0.890851,-1.115034,-0.819816,-1.103827,-0.860584,0.0337,...,-1.003605,-1.187927,-0.557025,-0.514046,-0.709355,-0.612248,-0.710886,-0.675199,NP_000022.3,[S215]
1,NP_000025.1_36_39_1_1_S36,ALDOA,GILAADEsTGSIAK,-1.138552,-1.253123,-1.199987,-1.167665,-1.304464,-0.275617,0.639204,...,-1.348174,-1.736277,-0.727368,-0.677711,-0.810839,-0.803293,-1.143444,-1.116918,NP_000025.1,[S36]
2,NP_000025.1_36_39_1_1_S39,ALDOA,GILAADESTGsIAK,-0.980943,-1.138313,-1.242436,-1.348138,-1.046978,-0.373884,0.535637,...,-1.066309,-1.968361,-1.239626,-1.033501,-1.015763,-0.647571,-0.959836,-1.206872,NP_000025.1,[S39]
3,NP_000025.1_46_52_1_1_S46,ALDOA,LQsIGTENTEENR,-0.207991,-0.330642,-0.256058,-0.416227,-0.310663,0.647371,-0.129796,...,-0.95363,-0.94355,-0.802885,-0.639685,-0.72169,-0.904796,-0.619397,-1.025678,NP_000025.1,[S46]
4,NP_000028.3_1684_1693_1_1_S1686,ANK1,ITHsPTVSQVTER,-0.236494,0.727648,0.227237,0.172909,-0.338637,0.823786,0.792843,...,-0.594974,-0.979635,-0.426282,-0.527333,0.02156,-0.707242,0.368233,-0.160473,NP_000028.3,[S1686]


In [8]:
# Keep only peptides with at least one parsed phosphosite. `str.findall` yields
# an empty list (not NaN) when nothing matches, so test the list length rather
# than null-ness; a missing value has a NaN length and is dropped as well.
peptide_df = raw_phospho_df.loc[
    raw_phospho_df['phosphosites'].str.len().gt(0),
    ['Index', 'gene_Symbol', 'refseq_prot_id', 'Prot_Desc', 'phosphosites']
].rename(
    # 'Prot_Desc' holds the peptide sequence in this table
    columns={'Index': 'original_id', 'Prot_Desc': 'peptide', 'gene_Symbol': 'Gene'}
)
peptide_df.head(10)

Unnamed: 0,original_id,Gene,refseq_prot_id,peptide,phosphosites
0,NP_000022.3_214_215_1_1_S215,ALAD,NP_000022.3,SsPAFGDRR,[S215]
1,NP_000025.1_36_39_1_1_S36,ALDOA,NP_000025.1,GILAADEsTGSIAK,[S36]
2,NP_000025.1_36_39_1_1_S39,ALDOA,NP_000025.1,GILAADESTGsIAK,[S39]
3,NP_000025.1_46_52_1_1_S46,ALDOA,NP_000025.1,LQsIGTENTEENR,[S46]
4,NP_000028.3_1684_1693_1_1_S1686,ANK1,NP_000028.3,ITHsPTVSQVTER,[S1686]
5,NP_000028.3_960_973_1_1_T961,ANK1,NP_000028.3,LStPPPLAEEEGLASR,[T961]
6,NP_000090.1_43_43_1_1_S43,CST3,NP_000090.1,LVGGPMDAsVEEEGVRR,[S43]
7,NP_000114.2_382_384_1_1_S384,ERCC5,NP_000114.2,NAPAAVDEGSIsPR,[S384]
8,NP_000114.2_523_534_1_1_S526,ERCC5,NP_000114.2,ELTPAsPTCTNSVSK,[S526]
9,NP_000156.1_244_255_1_1_S251,GJA1,NP_000156.1,SDPYHATsGALSPAK,[S251]


In [9]:
# Check what residues are modified.
# `.str[0]` takes the residue letter of every site and, unlike a Python lambda,
# passes through the NaN that `explode` emits for an empty site list
# (`value_counts` then ignores it).
peptide_df['phosphosites'].explode().str[0].value_counts()

S    4388
T     611
Y      37
Name: phosphosites, dtype: int64

In [10]:
# Restrict every peptide's site list to sites on S, T or Y residues
peptide_df['phosphosites'] = peptide_df['phosphosites'].map(
    lambda site_list: list(filter(lambda site: site[0] in 'STY', site_list))
)

In [11]:
# Locate every peptide on its protein: one result row per peptide, kept on the
# same row index as `peptide_df` so the two tables can be combined later.
peptide_loc_range_df = pd.DataFrame(
    [
        calc_peptide_start_end(peptide, protein_id, sites)
        for peptide, protein_id, sites in zip(
            peptide_df['peptide'],
            peptide_df['refseq_prot_id'],
            peptide_df['phosphosites'],
        )
    ],
    index=peptide_df.index,
    columns=['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
).astype({
    # Nullable integers: start/end are missing for peptides that failed
    'peptide_start': 'Int64',
    'peptide_end': 'Int64',
})

In [12]:
# Attach the location columns to the peptide table, side by side
peptide_with_loc_df = pd.concat([peptide_df, peptide_loc_range_df], axis='columns')
# Show every peptide whose location could not be validated, with the reason
peptide_with_loc_df[~peptide_with_loc_df['peptide_loc_valid']][
    ['refseq_prot_id', 'phosphosites', 'peptide', 'peptide_loc_invalid_reason']
]

Unnamed: 0,refseq_prot_id,phosphosites,peptide,peptide_loc_invalid_reason
306,NP_001073998.2,[S5],LLPPAsP,Peptide not found in protein
1324,NP_001299602.1,[S5],MLQALsPK,Peptide not found in protein
4479,NP_001673.2,[S8],SATSSSPGsPIHSLETSL,Peptide not found in protein


In [13]:
# Build the export table: pick the output columns and flatten each site list
# into a ';'-separated string.
out_df = peptide_with_loc_df[
    ['original_id', 'refseq_prot_id', 'peptide', 'phosphosites', 'peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason']
].assign(
    phosphosites=lambda df: df['phosphosites'].str.join(';')
)
# Write next to the input table, swapping its '.tsv' suffix for
# '.peptide_location.tsv.gz'
out_df.to_csv(
    phospho_pth.with_suffix('.peptide_location.tsv.gz'),
    index=False,
    sep='\t',
)

In [14]:
# Preview the exported table
out_df.head(n=5)

Unnamed: 0,original_id,refseq_prot_id,peptide,phosphosites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
0,NP_000022.3_214_215_1_1_S215,NP_000022.3,SsPAFGDRR,S215,214,222,True,
1,NP_000025.1_36_39_1_1_S36,NP_000025.1,GILAADEsTGSIAK,S36,29,42,True,
2,NP_000025.1_36_39_1_1_S39,NP_000025.1,GILAADESTGsIAK,S39,29,42,True,
3,NP_000025.1_46_52_1_1_S46,NP_000025.1,LQsIGTENTEENR,S46,44,56,True,
4,NP_000028.3_1684_1693_1_1_S1686,NP_000028.3,ITHsPTVSQVTER,S1686,1683,1695,True,
