In [1]:
import os
from pathlib import Path
import re

from pysam import FastaFile
import pandas as pd

In [2]:
# Root of the CPTAC proteome v3.1 data tree. Set the CPTAC_DATA_ROOT environment
# variable to point the notebook at another copy of the data; when it is unset
# the original hard-coded location is used.
DATA_ROOT = Path(
    os.environ.get('CPTAC_DATA_ROOT', '/Users/liang/Box/MyCPTAC/CPTAC_proteome_v3.1/')
)

# Protein FASTA that the peptides are located against
refseq_fasta_pth = DATA_ROOT / 'DCC/RefSeq_20180629/RefSeq.20180629_Human_ucsc_hg38_cpdbnr_mito_264contams.fasta.gz'

# Peptide-level PTM tables (tab-separated, gzip-compressed)
phospho_pth = DATA_ROOT / 'CPTAC3/GBM_confirmatory/pnnl_phosphoproteome.peptide.tsv.gz'
acetyl_pth = DATA_ROOT / 'CPTAC3/GBM_confirmatory/pnnl_acetylproteome.peptide.tsv.gz'

In [3]:
# Open the RefSeq protein FASTA; calc_peptide_start_end() fetches protein
# sequences from this handle. The Path is converted to str before being passed in.
protein_fa = FastaFile(str(refseq_fasta_pth))

In [4]:
def calc_peptide_start_end(peptide, protein_id, sites, fasta=None):
    """Locate a modified peptide on its protein, anchored at its first site.

    Parameters
    ----------
    peptide : str
        Peptide sequence in which the modified residues are lowercase and
        every other residue is uppercase.
    protein_id : str
        Sequence name handed to ``fasta.fetch`` to get the protein sequence.
    sites : sequence of str
        Modification sites written as ``<residue><1-based protein position>``
        (e.g. ``'S495'``). Only the first entry is used as the anchor.
    fasta : object with a ``fetch(name)`` method returning a str, optional
        Source of protein sequences. Defaults to the notebook-level
        ``protein_fa``.

    Returns
    -------
    list
        ``[peptide_start, peptide_end, is_valid, invalid_reason]``.
        Start and end are 1-based, inclusive protein coordinates. When the
        peptide cannot be placed, both are ``None``, ``is_valid`` is ``False``
        and ``invalid_reason`` is a short message; otherwise the reason is
        ``None``.
    """
    if fasta is None:
        fasta = protein_fa

    # Make sure the peptide is part of the protein sequence
    protein_seq = fasta.fetch(protein_id)
    peptide_upper = peptide.upper()
    if peptide_upper not in protein_seq:
        return [None, None, False, 'Peptide not found in protein']

    # Without a site there is nothing to anchor the peptide to
    if len(sites) == 0:
        return [None, None, False, 'No modification site given']

    first_res = sites[0][0].lower()
    first_res_loc = int(sites[0][1:])
    # Case-sensitive search: relies on the modified residue being lowercase
    first_res_ix = peptide.find(first_res)
    if first_res_ix < 0:
        # find() returned -1; using it in the arithmetic below would yield
        # coordinates that have nothing to do with the site
        return [None, None, False, 'Modified residue not found in peptide']

    peptide_start = first_res_loc - first_res_ix
    peptide_end = peptide_start + len(peptide) - 1
    # Confirm that the protein really carries the peptide at these coordinates
    # (it may occur elsewhere in the protein, or the site number may be off)
    if peptide_start < 1 or protein_seq[peptide_start - 1:peptide_end] != peptide_upper:
        return [None, None, False, 'Invalid location']
    return [peptide_start, peptide_end, True, None]

## Phospho

In [5]:
# Load the tab-separated phospho peptide table and list its columns
raw_phospho_df = pd.read_csv(phospho_pth, sep='\t')
raw_phospho_df.columns

Index(['original_id', 'symbol', 'refseq_id', 'site', 'peptide'], dtype='object')

In [6]:
# Preview the first rows of the raw phospho peptide table
raw_phospho_df.head()

Unnamed: 0,original_id,symbol,refseq_id,site,peptide
0,AAAS-S495s,AAAS,NP_056480.1,AAAS-S495s,FS*PVLGR
1,AAAS-S495s.1,AAAS,NP_056480.1,AAAS-S495s,IAHIPLYFVNAQFPRFS*PVLGR
2,AAAS-S541s,AAAS,NP_056480.1,AAAS-S541s,AQEPPAGGGGSIHDLPLFTETSPTSAPWDPLPGPPPVLPHS*PHSHL
3,AAGAB-S215s,AAGAB,NP_078942.3,AAGAB-S215s,NQGFSLLNSLTGTNHSIGSADPCHPEQPHLPAADS*TESLSDHR
4,AAGAB-S310sS311s,AAGAB,NP_078942.3,AAGAB-S310sS311s,AFWMAIGGDRDEIEGLS*S*DEEH


In [7]:
# Rows whose site ends with '-NULL' have no site assigned, although the
# peptide string itself can still carry '*' modification marks
raw_phospho_df.loc[raw_phospho_df['site'].str.endswith('-NULL')]

Unnamed: 0,original_id,symbol,refseq_id,site,peptide
244,ABI2-NULL,ABI2,NP_001269854.1,ABI2-NULL,ENSGS*GSVGVPIAVPTPSPPSVFPGHPVQFYSMNRPASR
245,ABI2-NULL.1,ABI2,NP_001269854.1,ABI2-NULL,ENSGSGS*VGVPIAVPTPSPPSVFPGHPVQFYSMNRPASR
246,ABI2-NULL.2,ABI2,NP_001269854.1,ABI2-NULL,ENSGSGSVGVPIAVPTPS*PPSVFPGHPVQFYSMNRPASR
247,ABI2-NULL.3,ABI2,NP_001269854.1,ABI2-NULL,ENSGSGSVGVPIAVPTPSPPSVFPGHPVQFY*SMNRPASR
248,ABI2-NULL.4,ABI2,NP_001269854.1,ABI2-NULL,ENSGSGSVGVPIAVPTPSPPSVFPGHPVQFYS*MNRPASR
...,...,...,...,...,...
102009,ZRANB2-NULL.6,ZRANB2,NP_976225.1,ZRANB2-NULL,SRS*PESQVIGENTK
102010,ZRANB2-NULL.7,ZRANB2,NP_976225.1,ZRANB2-NULL,T*RS*RS*PESQVIGENTK
102011,ZRANB2-NULL.8,ZRANB2,NP_976225.1,ZRANB2-NULL,T*RS*RSPESQVIGENTK
102012,ZRANB2-NULL.9,ZRANB2,NP_976225.1,ZRANB2-NULL,TRS*RS*PESQVIGENTK


In [8]:
# Keep only peptides with an assigned site. The .copy() makes this an
# independent frame so the column assignments below do not act on a slice
# of raw_phospho_df.
peptide_df = raw_phospho_df.loc[
    ~raw_phospho_df['site'].str.endswith('-NULL'),
    ['original_id', 'symbol', 'refseq_id', 'peptide', 'site']
].copy()
# Lowercase every residue that is followed by '*' and drop the '*'.
# regex=True is spelled out: a callable replacement is only accepted for
# regex replacement, and newer pandas no longer assumes it by default.
peptide_df['peptide'] = peptide_df['peptide'].str.replace(
    r'([A-Z])\*', lambda m: m.group(1).lower(), regex=True
)
# Parse the part after the first '-' into individual sites,
# e.g. 'AAGAB-S310sS311s' -> ['S310s', 'S311s'].
# n is passed by keyword because newer pandas rejects it positionally.
peptide_df['phosphosites'] = (
    peptide_df['site']
    .str.split('-', n=1, expand=True)
    .iloc[:, 1]
    .str.findall(r'[A-Z]\d+[a-z]')
)
peptide_df.head(10)

Unnamed: 0,original_id,symbol,refseq_id,peptide,site,phosphosites
0,AAAS-S495s,AAAS,NP_056480.1,FsPVLGR,AAAS-S495s,[S495s]
1,AAAS-S495s.1,AAAS,NP_056480.1,IAHIPLYFVNAQFPRFsPVLGR,AAAS-S495s,[S495s]
2,AAAS-S541s,AAAS,NP_056480.1,AQEPPAGGGGSIHDLPLFTETSPTSAPWDPLPGPPPVLPHsPHSHL,AAAS-S541s,[S541s]
3,AAGAB-S215s,AAGAB,NP_078942.3,NQGFSLLNSLTGTNHSIGSADPCHPEQPHLPAADsTESLSDHR,AAGAB-S215s,[S215s]
4,AAGAB-S310sS311s,AAGAB,NP_078942.3,AFWMAIGGDRDEIEGLssDEEH,AAGAB-S310sS311s,"[S310s, S311s]"
5,AAGAB-T216t,AAGAB,NP_078942.3,NQGFSLLNSLTGTNHSIGSADPCHPEQPHLPAADStESLSDHR,AAGAB-T216t,[T216t]
6,AAGAB-T216tS218s,AAGAB,NP_078942.3,NQGFSLLNSLTGTNHSIGSADPCHPEQPHLPAADStEsLSDHR,AAGAB-T216tS218s,"[T216t, S218s]"
7,AAK1-S14s,AAK1,NP_055726.3,EQGGsGLGSGSSGGGGSTSGLGSGYIGR,AAK1-S14s,[S14s]
8,AAK1-S14s.1,AAK1,NP_055726.3,REQGGsGLGSGSSGGGGSTSGLGSGYIGR,AAK1-S14s,[S14s]
9,AAK1-S18s,AAK1,NP_055726.3,FFDSRREQGGSGLGsGSSGGGGSTSGLGSGYIGR,AAK1-S18s,[S18s]


In [9]:
# Tally the annotated sites by residue letter (first character of each site)
(
    peptide_df['phosphosites']
    .explode()
    .apply(lambda site: site[0])
    .value_counts()
)

S    104451
T     23644
Y      4473
Name: phosphosites, dtype: int64

In [10]:
# Keep only sites on S/T/Y and drop the trailing lowercase residue letter
# ('S495s' -> 'S495'). Only a lowercase suffix is removed, so re-running this
# cell leaves already-cleaned sites alone instead of cutting off a digit.
peptide_df['phosphosites'] = peptide_df['phosphosites'].apply(
    lambda l: [x[:-1] if x[-1].islower() else x for x in l if x[0] in 'STY']
)

In [11]:
# Locate every peptide on its protein; one result row per peptide, aligned
# to peptide_df through the shared index
peptide_loc_range_df = pd.DataFrame(
    [
        calc_peptide_start_end(pep_seq, prot_id, site_list)
        for pep_seq, prot_id, site_list in zip(
            peptide_df['peptide'], peptide_df['refseq_id'], peptide_df['phosphosites']
        )
    ],
    index=peptide_df.index,
    columns=['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
)
# Nullable integers so peptides without a location keep a missing value
peptide_loc_range_df = peptide_loc_range_df.astype({
    'peptide_start': 'Int64',
    'peptide_end': 'Int64',
})

In [12]:
# Put the peptide information and the computed coordinates side by side
peptide_with_loc_df = pd.concat([peptide_df, peptide_loc_range_df], axis='columns')
# Show every peptide whose location could not be validated, with the reason
peptide_with_loc_df.loc[
    ~peptide_with_loc_df['peptide_loc_valid'],
    ['refseq_id', 'phosphosites', 'peptide', 'peptide_loc_invalid_reason']
]

Unnamed: 0,refseq_id,phosphosites,peptide,peptide_loc_invalid_reason
68898,NP_689584.5,[S128],RYNsLSILPAALGK,Invalid location
68899,NP_689584.5,[Y126],RyNSLSILPAALGK,Invalid location
70332,NP_001291768.1,[S186],YQSILENHLTEsIK,Invalid location


In [13]:
# Pick the export columns and flatten each site list into a ';'-separated string
out_df = peptide_with_loc_df[
    ['original_id', 'peptide', 'phosphosites', 'peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason']
].assign(phosphosites=lambda frame: frame['phosphosites'].str.join(';'))
# NOTE(review): with_suffix() replaces only the last suffix of phospho_pth, so
# the new suffix is appended after whatever precedes that last suffix --
# confirm the resulting file name is the intended one.
out_df.to_csv(phospho_pth.with_suffix('.peptide_location.tsv.gz'), sep='\t', index=False)

In [14]:
# Preview the first rows of the exported phospho peptide locations
out_df.head()

Unnamed: 0,original_id,peptide,phosphosites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
0,AAAS-S495s,FsPVLGR,S495,494,500,True,
1,AAAS-S495s.1,IAHIPLYFVNAQFPRFsPVLGR,S495,479,500,True,
2,AAAS-S541s,AQEPPAGGGGSIHDLPLFTETSPTSAPWDPLPGPPPVLPHsPHSHL,S541,501,546,True,
3,AAGAB-S215s,NQGFSLLNSLTGTNHSIGSADPCHPEQPHLPAADsTESLSDHR,S215,181,223,True,
4,AAGAB-S310sS311s,AFWMAIGGDRDEIEGLssDEEH,S310;S311,294,315,True,


## Acetylation

In [15]:
# Load the tab-separated acetyl peptide table and list its columns
raw_acetyl_df = pd.read_csv(acetyl_pth, sep='\t')
raw_acetyl_df.columns

Index(['original_id', 'symbol', 'refseq_id', 'site', 'peptide'], dtype='object')

In [16]:
# Rows whose site ends with '-NULL' have no site assigned, although the
# peptide string itself can still carry '#' modification marks
raw_acetyl_df.loc[raw_acetyl_df['site'].str.endswith('-NULL')]

Unnamed: 0,original_id,symbol,refseq_id,site,peptide
422,ACP1-NULL,ACP1,NP_004291.1,ACP1-NULL,NHGIHTAHK#AR
656,ACTN4-NULL,ACTN4,NP_004915.2,ACTN4-NULL,M*LDAEDIVGTLRPDEK#
657,ACTN4-NULL.1,ACTN4,NP_004915.2,ACTN4-NULL,MLDAEDIVGTLRPDEK#
1798,ANXA6-NULL,ANXA6,NP_001146.2,ANXA6-NULL,EDAQEIADTPSGDK#
1928,API5-NULL,API5,NP_001136402.1,API5-NULL,QIYNPPSGK#YSSNLGNFNYER
...,...,...,...,...,...
20406,UQCRB-NULL,UQCRB,NP_001241681.1,UQCRB-NULL,EQWTK#YEEENFYLEPYLK
20407,UQCRB-NULL.1,UQCRB,NP_001241681.1,UQCRB-NULL,YEEENFYLEPYLK#
20408,UQCRB-NULL.2,UQCRB,NP_001241681.1,UQCRB-NULL,YEEENFYLEPYLK#EVIR
20569,VCAN-NULL,VCAN,NP_004376.2,VCAN-NULL,FDAYCFK#R


In [17]:
# Keep only peptides with an assigned site. The .copy() makes this an
# independent frame so the column assignments below do not act on a slice
# of raw_acetyl_df.
peptide_df = raw_acetyl_df.loc[
    ~raw_acetyl_df['site'].str.endswith('-NULL'),
    ['original_id', 'symbol', 'refseq_id', 'peptide', 'site']
].copy()
# regex=True is spelled out on every call: newer pandas no longer assumes
# regex replacement by default, and a callable replacement requires it.
peptide_df['peptide'] = (
    peptide_df['peptide']
        # Lowercase every residue that is followed by '#' and drop the '#'
        .str.replace(r'([A-Z])\#', lambda m: m.group(1).lower(), regex=True)
        # Drop additional symbols, keeping the residue before them unchanged.
        # Three separate passes, as before, so the result does not change.
        .str.replace(r'([A-Z])\$', lambda m: m.group(1), regex=True)
        .str.replace(r'([A-Z])\*', lambda m: m.group(1), regex=True)
        .str.replace(r'([A-Z])\@', lambda m: m.group(1), regex=True)
)
# Parse the part after the first '-' into individual sites,
# e.g. 'A2M-K1019k' -> ['K1019k'].
# n is passed by keyword because newer pandas rejects it positionally.
peptide_df['acetylsites'] = (
    peptide_df['site']
    .str.split('-', n=1, expand=True)
    .iloc[:, 1]
    .str.findall(r'[A-Z]\d+[a-z]')
)
peptide_df.head(10)

Unnamed: 0,original_id,symbol,refseq_id,peptide,site,acetylsites
0,A2M-K1019k,A2M,NP_000005.2,QLNYkHYDGSYSTFGER,A2M-K1019k,[K1019k]
1,A2M-K1092k,A2M,NP_000005.2,SSGSLLNNAIk,A2M-K1092k,[K1092k]
2,A2M-K1133k,A2M,NP_000005.2,NALFCLESAWk,A2M-K1133k,[K1133k]
3,A2M-K115k,A2M,NP_000005.2,GPTQEFk,A2M-K115k,[K115k]
4,A2M-K115k.1,A2M,NP_000005.2,GPTQEFkK,A2M-K115k,[K115k]
5,A2M-K1162k,A2M,NP_000005.2,ALLAYAFALAGNQDk,A2M-K1162k,[K1162k]
6,A2M-K1168k,A2M,NP_000005.2,EVLkSLNEEAVK,A2M-K1168k,[K1168k]
7,A2M-K1176k,A2M,NP_000005.2,SLNEEAVk,A2M-K1176k,[K1176k]
8,A2M-K1176k.1,A2M,NP_000005.2,SLNEEAVkK,A2M-K1176k,[K1176k]
9,A2M-K1177k,A2M,NP_000005.2,SLNEEAVKk,A2M-K1177k,[K1177k]


In [18]:
# Tally the annotated sites by residue letter (first character of each site)
(
    peptide_df['acetylsites']
    .explode()
    .apply(lambda site: site[0])
    .value_counts()
)

K    21647
Name: acetylsites, dtype: int64

In [19]:
# Keep only sites on K and drop the trailing lowercase residue letter
# ('K1019k' -> 'K1019'). Only a lowercase suffix is removed, so re-running this
# cell leaves already-cleaned sites alone instead of cutting off a digit.
peptide_df['acetylsites'] = peptide_df['acetylsites'].apply(
    lambda l: [x[:-1] if x[-1].islower() else x for x in l if x[0] in 'K']
)

In [20]:
# Locate every peptide on its protein; one result row per peptide, aligned
# to peptide_df through the shared index
peptide_loc_range_df = pd.DataFrame(
    [
        calc_peptide_start_end(pep_seq, prot_id, site_list)
        for pep_seq, prot_id, site_list in zip(
            peptide_df['peptide'], peptide_df['refseq_id'], peptide_df['acetylsites']
        )
    ],
    index=peptide_df.index,
    columns=['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
)
# Nullable integers so peptides without a location keep a missing value
peptide_loc_range_df = peptide_loc_range_df.astype({
    'peptide_start': 'Int64',
    'peptide_end': 'Int64',
})

In [21]:
# Put the peptide information and the computed coordinates side by side
peptide_with_loc_df = pd.concat([peptide_df, peptide_loc_range_df], axis='columns')
# Show every peptide whose location could not be validated, with the reason
peptide_with_loc_df.loc[
    ~peptide_with_loc_df['peptide_loc_valid'],
    ['refseq_id', 'acetylsites', 'peptide', 'original_id', 'peptide_loc_invalid_reason']
]

Unnamed: 0,refseq_id,acetylsites,peptide,original_id,peptide_loc_invalid_reason
3283,NP_001186601.1,[K102],DGFVTVDELk,CALU-K102k,Peptide not found in protein
3284,NP_001186601.1,[K106],DWIkFAQK,CALU-K106k,Peptide not found in protein
3285,NP_001186601.1,[K138],GHDLNEDGLVSWEEYk,CALU-K138k,Peptide not found in protein
14261,NP_444252.1,[K126],EGVHGGTLNkK,PFN2-K126k,Peptide not found in protein
14262,NP_444252.1,[K127],EGVHGGTLNKk,PFN2-K127k,Peptide not found in protein


In [22]:
# Pick the export columns and flatten each site list into a ';'-separated string
out_df = peptide_with_loc_df[
    ['original_id', 'peptide', 'acetylsites', 'peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason']
].assign(acetylsites=lambda frame: frame['acetylsites'].str.join(';'))
# NOTE(review): with_suffix() replaces only the last suffix of acetyl_pth, so
# the new suffix is appended after whatever precedes that last suffix --
# confirm the resulting file name is the intended one.
out_df.to_csv(acetyl_pth.with_suffix('.peptide_location.tsv.gz'), sep='\t', index=False)

In [23]:
# Preview the first rows of the exported acetyl peptide locations
out_df.head()

Unnamed: 0,original_id,peptide,acetylsites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
0,A2M-K1019k,QLNYkHYDGSYSTFGER,K1019,1015,1031,True,
1,A2M-K1092k,SSGSLLNNAIk,K1092,1082,1092,True,
2,A2M-K1133k,NALFCLESAWk,K1133,1123,1133,True,
3,A2M-K115k,GPTQEFk,K115,109,115,True,
4,A2M-K115k.1,GPTQEFkK,K115,109,116,True,
