In [1]:
from pathlib import Path
import re

from pysam import FastaFile
import pandas as pd

In [2]:
# Open the RefSeq protein FASTA so individual protein sequences can be
# fetched by accession later in the notebook.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
protein_fa = FastaFile(
    '/Users/liang/Box/MyCPTAC/CPTAC_proteome_v3.1/DCC/RefSeq_20180629/RefSeq.20180629_Human_ucsc_hg38_cpdbnr_mito_264contams.fasta.gz'
)

In [3]:
# Load the site-level phospho abundance table (tab-separated, gzip-compressed).
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
raw_phospho_df = pd.read_csv(
    '/Users/liang/Box/MyCPTAC/CPTAC_proteome_v3.1/CPTAC3/HNSCC_discovery/Site-level-abundance-MD.tsv.gz',
    sep='\t',
)

In [4]:
# Break each 'Index' value (e.g. 'NP_000005.2_708_710_1_1_S710') into its
# underscore-separated fields, one column per field.
parsed_index_df = raw_phospho_df['Index'].str.split('_', expand=True)

# The accession itself contains an underscore, so the first two fields are
# glued back together to recover it (e.g. 'NP' + '_' + '000005.2').
raw_phospho_df['refseq_prot_id'] = parsed_index_df[0].str.cat(parsed_index_df[1], sep='_')

# The last field carries the site tokens; pull out every
# <residue letter S/T/Y><digits> token as a list.
raw_phospho_df['phosphosites'] = (
    parsed_index_df[parsed_index_df.columns[-1]].str.findall(r'[STY]\d+')
)

In [5]:
# Keep only the identifying columns, restricted to rows whose phosphosite
# list is not missing.
peptide_df = raw_phospho_df[
    ['Index', 'refseq_prot_id', 'Peptide', 'phosphosites']
].loc[raw_phospho_df['phosphosites'].notna()]

In [6]:
# Preview the first ten peptides selected for location lookup.
peptide_df.head(n=10)

Unnamed: 0,Index,refseq_prot_id,Peptide,phosphosites
0,NP_000005.2_708_710_1_1_S710,NP_000005.2,VGFYEsDVMGR,[S710]
2,NP_000005.2_914_932_1_1_S928,NP_000005.2,ETTFNSLLCPSGGEVsEELSLK,[S928]
3,NP_000009.1_485_489_1_1_S485,NP_000009.1,ELsGLGSALK,[S485]
4,NP_000009.1_485_489_1_1_S489,NP_000009.1,ELSGLGsALK,[S489]
5,NP_000009.1_517_530_1_1_S522,NP_000009.1,AGLGSGLSLsGLVHPELSR,[S522]
6,NP_000010.1_167_170_1_1_S167,NP_000010.1,GsTPYGGVK,[S167]
8,NP_000011.2_155_161_1_1_S160,NP_000011.2,GLHSELGEsSLILK,[S160]
9,NP_000011.2_155_161_1_1_S161,NP_000011.2,GLHSELGESsLILK,[S161]
11,NP_000012.1_315_337_1_1_S324,NP_000012.1,YNAESTEREsQDTVAENDDGGFSEEWEAQR,[S324]
12,NP_000012.1_315_337_1_1_T327,NP_000012.1,YNAESTERESQDtVAENDDGGFSEEWEAQR,[T327]


In [7]:
def calc_peptide_start_end(t, fasta=None):
    """Locate a phosphopeptide inside its parent protein sequence.

    The first entry of ``t.phosphosites`` (residue letter followed by its
    protein position, e.g. ``'S710'``) is used as an anchor: the lower-case
    copy of that residue letter in ``t.Peptide`` is assumed to sit at that
    protein position, which fixes where the peptide starts and ends. The
    derived range is accepted only if the protein sequence at that range
    equals the upper-cased peptide.

    Parameters
    ----------
    t : object
        Row-like object exposing ``refseq_prot_id`` (str), ``Peptide`` (str)
        and ``phosphosites`` (list of str) attributes, e.g. a namedtuple from
        ``DataFrame.itertuples``.
    fasta : object, optional
        Object with a ``fetch(name)`` method returning the protein sequence
        as a string. Defaults to the notebook-level ``protein_fa``.

    Returns
    -------
    list
        ``[peptide_start, peptide_end, peptide_loc_valid, reason]``.
        ``peptide_start``/``peptide_end`` are 1-based inclusive protein
        coordinates, or ``None`` when the location could not be validated,
        in which case ``peptide_loc_valid`` is ``False`` and ``reason`` is a
        short explanation. On success ``reason`` is ``None``.
    """
    if fasta is None:
        fasta = protein_fa

    peptide = t.Peptide
    peptide_upper = peptide.upper()

    # Make sure the peptide is part of the protein sequence
    protein_seq = fasta.fetch(t.refseq_prot_id)
    if peptide_upper not in protein_seq:
        return [None, None, False, 'Peptide not found in protein']

    # An empty site list gives nothing to anchor on; report it instead of
    # failing with an IndexError.
    if not t.phosphosites:
        return [None, None, False, 'No phosphosite']

    anchor_site = t.phosphosites[0]
    anchor_res = anchor_site[0].lower()
    anchor_loc = int(anchor_site[1:])

    # The peptide may contain more than one lower-case copy of the anchor
    # residue, so try each occurrence in order and keep the first one whose
    # implied range matches the protein sequence. When the lower-case residue
    # is absent the loop body never runs, so the -1 returned by str.find is
    # never used as an offset.
    anchor_ix = peptide.find(anchor_res)
    while anchor_ix != -1:
        peptide_start = anchor_loc - anchor_ix
        peptide_end = peptide_start + len(peptide) - 1
        # Reject non-positive starts explicitly: a negative slice index would
        # silently wrap around to the end of the sequence.
        if (
            peptide_start >= 1
            and protein_seq[peptide_start - 1:peptide_end] == peptide_upper
        ):
            return [peptide_start, peptide_end, True, None]
        anchor_ix = peptide.find(anchor_res, anchor_ix + 1)

    return [None, None, False, 'Invalid location']
    

In [8]:
# Compute the location of every peptide row by row and collect the results
# in a frame sharing peptide_df's index. The coordinate columns use the
# nullable integer dtype so unresolved peptides can hold a missing value.
peptide_loc_range_df = pd.DataFrame(
    [calc_peptide_start_end(row) for row in peptide_df.itertuples(index=False)],
    index=peptide_df.index,
    columns=['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
).astype({'peptide_start': 'Int64', 'peptide_end': 'Int64'})

In [9]:
# Attach the computed location columns to the peptide table, side by side.
peptide_with_loc_df = pd.concat(
    objs=[peptide_df, peptide_loc_range_df],
    axis='columns',
)
# Show the first few peptides whose location could not be validated.
peptide_with_loc_df.loc[~peptide_loc_range_df['peptide_loc_valid']].head(n=5)

Unnamed: 0,Index,refseq_prot_id,Peptide,phosphosites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
630,NP_000145.1_4_4_1_1_S4,NP_000145.1,VIVGsPR,[S4],,,False,Peptide not found in protein
9257,NP_001035924.1_1_3_1_1_T3,NP_001035924.1,ASLtPVK,[T3],,,False,Peptide not found in protein
11144,NP_001073998.2_5_5_1_1_S5,NP_001073998.2,LLPPAsP,[S5],,,False,Peptide not found in protein
11917,NP_001091738.1_1_3_1_1_T1,NP_001091738.1,VtDSLDK,[T1],,,False,Peptide not found in protein
18760,NP_001138230.1_0_4_1_1_S4,NP_001138230.1,SLPGsPK,[S4],,,False,Peptide not found in protein


In [10]:
# Pull the full sequence of one protein from the FASTA for manual inspection.
protein_seq = protein_fa.fetch(reference='NP_001240837.1')
protein_seq

'MDSKHQCLKLNDGHFMPVLGFGTYAPPEVPRSKALEVTKLAIEAGFRHIDSAHLYNNEEQVGLAIRSKIADGSVKREDIFYTSKLWSTFHRPELVRPALENSLKKAQLDYVDLYLIHSPMSLKPGEELSPTDENGKVIFDIVDLCTTWEAMEKCKDAGLAKSIGVSNFNRRQLEMILNKPGLKYKPVCNQVECHPYFNRSKLLDFCKSKDIVLVAYSALGSQRDKRWVDPNSPVLLEDPVLCALAKKHKRTPALIALRYQLQRGVVVLAKSYNEQRIRQNVQVFEFQLTAEDMKAIDGLDRNLHYFNSDSFASHPNYPYSDEY'

In [11]:
# Fraction of peptides whose location failed validation.
(
    peptide_with_loc_df.loc[~peptide_loc_range_df['peptide_loc_valid']].shape[0]
    / peptide_with_loc_df.shape[0]
)

0.000435396210551602

In [12]:
# Keep the site identifier plus the computed location columns and write them
# out as a tab-separated file without the DataFrame index.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
out_df = peptide_with_loc_df.loc[
    :,
    ['Index', 'peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
]
out_df.to_csv(
    '/Users/liang/Box/MyCPTAC/CPTAC_proteome_v3.1/CPTAC3/HNSCC_discovery/Site-level-abundance-MD.peptide_location.tsv.gz',
    index=False,
    sep='\t',
)