In [1]:
from pathlib import Path
import re

from pysam import FastaFile
import pandas as pd

In [2]:
# Root directory of the CPTAC proteome data; every input path below is built from it.
DATA_ROOT = Path('/Users/liang/Box/MyCPTAC/CPTAC_proteome_v1/')

# RefSeq FASTA that protein sequences are fetched from
refseq_fasta_pth = DATA_ROOT.joinpath(
    'DCC/RefSeq_20180629/RefSeq.20180629_Human_ucsc_hg38_cpdbnr_mito_264contams.fasta.gz'
)

# GBM discovery per-gene tables: phosphoproteome and acetylome
phospho_pth = DATA_ROOT.joinpath(
    'CPTAC3/GBM_discovery/phosphoproteome_mssm_per_gene_clean.v4.0.20200430.tsv.gz'
)
acetyl_pth = DATA_ROOT.joinpath(
    'CPTAC3/GBM_discovery/acetylome_mssm_per_gene_clean.v4.0.20200430.tsv.gz'
)

In [3]:
# Open the RefSeq FASTA with pysam's FastaFile (the Path is converted to str first).
# calc_peptide_start_end() fetches protein sequences from this handle by protein ID.
protein_fa = FastaFile(str(refseq_fasta_pth))

In [4]:
def calc_peptide_start_end(peptide, protein_id, sites):
    """Locate a modified peptide on its protein sequence.

    The protein sequence is fetched from the module-level ``protein_fa`` handle.
    The first entry of ``sites`` is used as the anchor: its residue letter,
    lowercased, is searched for in ``peptide`` and its 1-based protein position
    fixes where the peptide must start. The implied placement is then verified
    against the protein sequence.

    Parameters
    ----------
    peptide : str
        Peptide sequence in which modified residues are lowercase.
    protein_id : str
        Sequence name passed to ``protein_fa.fetch``.
    sites : list of str
        Modification sites written as residue letter + 1-based protein
        position, e.g. ``['S525', 'S541']``.

    Returns
    -------
    list
        ``[peptide_start, peptide_end, is_valid, invalid_reason]``.
        Start and end are 1-based and inclusive. On failure they are ``None``,
        ``is_valid`` is ``False`` and ``invalid_reason`` is a short message.
    """
    protein_seq = protein_fa.fetch(protein_id)
    plain_peptide = peptide.upper()

    # Make sure the peptide is part of the protein sequence
    if plain_peptide not in protein_seq:
        return [None, None, False, 'Peptide not found in protein']

    # Without any site there is nothing to anchor the peptide with
    if not sites:
        return [None, None, False, 'No modification site']

    first_res = sites[0][0].lower()
    first_res_loc = int(sites[0][1:])

    first_res_ix = peptide.find(first_res)
    if first_res_ix == -1:
        # The anchor residue is not marked (lowercase) in the peptide, so an
        # offset of -1 would silently shift the computed coordinates.
        return [None, None, False, 'Modified residue not found in peptide']

    # Try each lowercase occurrence of the anchor residue in turn. The first
    # occurrence is the right one when sites are listed in positional order;
    # later occurrences cover site lists that are not.
    while first_res_ix != -1:
        peptide_start = first_res_loc - first_res_ix
        peptide_end = peptide_start + len(peptide) - 1
        # peptide_start < 1 would make the slice below wrap around the end
        if peptide_start >= 1 and protein_seq[peptide_start - 1:peptide_end] == plain_peptide:
            return [peptide_start, peptide_end, True, None]
        first_res_ix = peptide.find(first_res, first_res_ix + 1)

    return [None, None, False, 'Invalid location']

## Phospho

In [5]:
# Load the tab-separated phosphoproteome table and list its columns
raw_phospho_df = pd.read_csv(phospho_pth, sep='\t')
raw_phospho_df.columns

Index(['gene', 'refseq_id', 'peptide', 'site', 'C3L-00104', 'C3L-00365',
       'C3L-00674', 'C3L-00677', 'C3L-01040', 'C3L-01043',
       ...
       'PT-NPJ7', 'PT-P44H', 'PT-Q2AG', 'PT-QVJO', 'PT-R55F', 'PT-RN5K',
       'PT-RU72', 'PT-UTHO', 'PT-WVLH', 'PT-Y8DK'],
      dtype='object', length=113)

In [6]:
# Preview the first rows of the raw phospho table
raw_phospho_df.head()

Unnamed: 0,gene,refseq_id,peptide,site,C3L-00104,C3L-00365,C3L-00674,C3L-00677,C3L-01040,C3L-01043,...,PT-NPJ7,PT-P44H,PT-Q2AG,PT-QVJO,PT-R55F,PT-RN5K,PT-RU72,PT-UTHO,PT-WVLH,PT-Y8DK
0,A2M,NP_000005.2,KYS*DASDCHGEDSQAFCEK,A2M-S273s,,,,,,,...,,,,,0.28458,,,,,
1,AAAS,NP_056480.1,AQEPPAGGGGSIHDLPLFTETSPTS*APWDPLPGPPPVLPHS*PHSHL,AAAS-S525sS541s,,,,,,,...,,,,,,0.16353,,,,
2,AAAS,NP_056480.1,AQEPPAGGGGSIHDLPLFTETSPTS*APWDPLPGPPPVLPHSPHSHL,AAAS-S525s,0.118623,0.069499,,0.336969,-0.082962,,...,-0.022726,,,,,,,0.326843,,
3,AAAS,NP_056480.1,AQEPPAGGGGSIHDLPLFTETSPTSAPWDPLPGPPPVLPHS*PHSHL,AAAS-S541s,,,1.36863,0.411596,,0.477905,...,,0.27189,0.433268,-0.103002,,0.311555,-0.379264,,-0.438673,
4,AAAS,NP_056480.1,FS*PVLGR,AAAS-S495s,0.053418,-0.093105,-1.084975,-0.260149,-0.344549,-0.03986,...,-0.718516,-0.831106,-0.828229,-0.564761,-0.548979,-0.657678,-0.651907,-0.105979,-0.54232,-0.142143


In [7]:
# Keep only the peptide annotation columns. Copy the selection so the column
# assignments below operate on an independent frame.
peptide_df = raw_phospho_df.loc[:, ['gene', 'refseq_id', 'peptide', 'site']].copy()
# Keep the original peptide string (with its '*' markers) as an identifier
peptide_df['original_id'] = peptide_df['peptide']
# Lowercase every S/T/Y that is followed by '*' and drop the '*'.
# regex=True is spelled out because pandas >= 2.0 defaults to literal matching
# and rejects a callable replacement in that mode.
peptide_df['peptide'] = peptide_df['peptide'].str.replace(
    r'([STY])\*', lambda m: m.group(1).lower(), regex=True
)
# `site` looks like '<gene>-S525sS541s': take the part after the first '-' and
# collect every '<residue><position><lowercase residue>' token from it.
# `n` is passed by keyword; pandas >= 2.0 no longer accepts it positionally.
peptide_df['phosphosites'] = (
    peptide_df['site']
    .str.split('-', n=1, expand=True)
    .iloc[:, 1]
    .str.findall(r'[A-Z]\d+[a-z]')
)
peptide_df.head(10)

Unnamed: 0,gene,refseq_id,peptide,site,original_id,phosphosites
0,A2M,NP_000005.2,KYsDASDCHGEDSQAFCEK,A2M-S273s,KYS*DASDCHGEDSQAFCEK,[S273s]
1,AAAS,NP_056480.1,AQEPPAGGGGSIHDLPLFTETSPTsAPWDPLPGPPPVLPHsPHSHL,AAAS-S525sS541s,AQEPPAGGGGSIHDLPLFTETSPTS*APWDPLPGPPPVLPHS*PHSHL,"[S525s, S541s]"
2,AAAS,NP_056480.1,AQEPPAGGGGSIHDLPLFTETSPTsAPWDPLPGPPPVLPHSPHSHL,AAAS-S525s,AQEPPAGGGGSIHDLPLFTETSPTS*APWDPLPGPPPVLPHSPHSHL,[S525s]
3,AAAS,NP_056480.1,AQEPPAGGGGSIHDLPLFTETSPTSAPWDPLPGPPPVLPHsPHSHL,AAAS-S541s,AQEPPAGGGGSIHDLPLFTETSPTSAPWDPLPGPPPVLPHS*PHSHL,[S541s]
4,AAAS,NP_056480.1,FsPVLGR,AAAS-S495s,FS*PVLGR,[S495s]
5,AAAS,NP_056480.1,IAHIPLYFVNAQFPRFsPVLGR,AAAS-S495s,IAHIPLYFVNAQFPRFS*PVLGR,[S495s]
6,AAED1,NP_714542.1,QVsGAAALVPAPSGPDSGQPLAAAVAELPVLDAR,AAED1-S12s,QVS*GAAALVPAPSGPDSGQPLAAAVAELPVLDAR,[S12s]
7,AAGAB,NP_078942.3,AFWMAIGGDRDEIEGLsSDEEH,AAGAB-S310s,AFWMAIGGDRDEIEGLS*SDEEH,[S310s]
8,AAGAB,NP_078942.3,AFWMAIGGDRDEIEGLSsDEEH,AAGAB-S311s,AFWMAIGGDRDEIEGLSS*DEEH,[S311s]
9,AAGAB,NP_078942.3,NQGFSLLNsLTGTNHSIGSADPCHPEQPHLPAADSTESLSDHR,AAGAB-S189s,NQGFSLLNS*LTGTNHSIGSADPCHPEQPHLPAADSTESLSDHR,[S189s]


In [8]:
# Check what residues are modified.
# .str[0] is used instead of apply(lambda x: x[0]) so that the NaN produced by
# explode() for an empty site list does not raise; value_counts() drops it.
peptide_df['phosphosites'].explode().str[0].value_counts()

S    108827
T     23650
Y      4455
Name: phosphosites, dtype: int64

In [9]:
# Keep only STY that are phosphosites and drop the trailing lowercase residue
# marker ('S273s' -> 'S273'). The marker is removed only when it is present, so
# re-running this cell does not cut into the position digits.
peptide_df['phosphosites'] = peptide_df['phosphosites'].apply(
    lambda l: [
        x[:-1] if x[-1].islower() else x
        for x in l
        if x[0] in 'STY'
    ]
)

In [10]:
# Calculate the peptide location
peptide_loc_range_df = pd.DataFrame(
    map(calc_peptide_start_end, 
        peptide_df.peptide, peptide_df.refseq_id, peptide_df.phosphosites),
    index=peptide_df.index,
    columns=['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
).astype({
    'peptide_start': pd.Int64Dtype(),
    'peptide_end': pd.Int64Dtype(),
})

In [11]:
# Put the peptide annotations and their computed locations side by side
peptide_with_loc_df = pd.concat([peptide_df, peptide_loc_range_df], axis='columns')
# Show every peptide whose location failed validation (with the reason)
peptide_with_loc_df[
    ['refseq_id', 'phosphosites', 'peptide', 'peptide_loc_invalid_reason']
].loc[~peptide_loc_range_df['peptide_loc_valid']]

Unnamed: 0,refseq_id,phosphosites,peptide,peptide_loc_invalid_reason


In [12]:
# Select the output columns and flatten each site list into a ';'-separated string
out_df = peptide_with_loc_df[
    ['original_id', 'peptide', 'phosphosites', 'peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason']
].assign(
    phosphosites=lambda df: df['phosphosites'].str.join(';')
)
# Write the table next to the input file.
# NOTE(review): with_suffix() replaces only the final '.gz', so the output name
# keeps the input's '.tsv' ahead of '.peptide_location.tsv.gz' -- confirm intended.
out_df.to_csv(phospho_pth.with_suffix('.peptide_location.tsv.gz'), sep='\t', index=False)

In [13]:
# Preview the phospho peptide-location table that was just written
out_df.head()

Unnamed: 0,original_id,peptide,phosphosites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
0,KYS*DASDCHGEDSQAFCEK,KYsDASDCHGEDSQAFCEK,S273,271,289,True,
1,AQEPPAGGGGSIHDLPLFTETSPTS*APWDPLPGPPPVLPHS*PHSHL,AQEPPAGGGGSIHDLPLFTETSPTsAPWDPLPGPPPVLPHsPHSHL,S525;S541,501,546,True,
2,AQEPPAGGGGSIHDLPLFTETSPTS*APWDPLPGPPPVLPHSPHSHL,AQEPPAGGGGSIHDLPLFTETSPTsAPWDPLPGPPPVLPHSPHSHL,S525,501,546,True,
3,AQEPPAGGGGSIHDLPLFTETSPTSAPWDPLPGPPPVLPHS*PHSHL,AQEPPAGGGGSIHDLPLFTETSPTSAPWDPLPGPPPVLPHsPHSHL,S541,501,546,True,
4,FS*PVLGR,FsPVLGR,S495,494,500,True,


## Acetylation

In [14]:
# Load the tab-separated acetylome table and list its columns
raw_acetyl_df = pd.read_csv(acetyl_pth, sep='\t')
raw_acetyl_df.columns

Index(['gene', 'refseq_id', 'peptide', 'site', 'C3L-00104', 'C3L-00365',
       'C3L-00674', 'C3L-00677', 'C3L-01040', 'C3L-01043',
       ...
       'PT-NPJ7', 'PT-P44H', 'PT-Q2AG', 'PT-QVJO', 'PT-R55F', 'PT-RN5K',
       'PT-RU72', 'PT-UTHO', 'PT-WVLH', 'PT-Y8DK'],
      dtype='object', length=113)

In [15]:
# Keep only the peptide annotation columns. Copy the selection so the column
# assignments below operate on an independent frame.
peptide_df = raw_acetyl_df.loc[:, ['gene', 'refseq_id', 'peptide', 'site']].copy()
# Keep the original peptide string (with its modification symbols) as an identifier
peptide_df['original_id'] = peptide_df['peptide'].copy()
# regex=True is spelled out on every replace because pandas >= 2.0 defaults to
# literal matching and rejects a callable replacement in that mode.
peptide_df['peptide'] = (
    peptide_df['peptide']
        # Lowercase each residue tagged with '#' and drop the '#'
        .str.replace(r'([A-Z])\#', lambda m: m.group(1).lower(), regex=True)
        # Drop additional symbols
        .str.replace(r'([A-Z])\$', lambda m: m.group(1), regex=True)
        .str.replace(r'([A-Z])\*', lambda m: m.group(1), regex=True)
        .str.replace(r'([A-Z])\@', lambda m: m.group(1), regex=True)
)
# `site` looks like '<gene>-K134k': take the part after the first '-' and
# collect every '<residue><position><lowercase residue>' token from it.
# `n` is passed by keyword; pandas >= 2.0 no longer accepts it positionally.
peptide_df['acetylsites'] = (
    peptide_df['site']
    .str.split('-', n=1, expand=True)
    .iloc[:, 1]
    .str.findall(r'[A-Z]\d+[a-z]')
)
peptide_df.head(10)

Unnamed: 0,gene,refseq_id,peptide,site,original_id,acetylsites
0,A1BG,NP_570602.2,SLPAPWLSMAPVSWITPGLk,A1BG-K134k,SLPAPWLSMAPVSWITPGLK#,[K134k]
1,A2M,NP_000005.2,AFTNSkIR,A2M-K682k,AFTNSK#IR,[K682k]
2,A2M,NP_000005.2,ALLAYAFALAGNQDk,A2M-K1162k,ALLAYAFALAGNQDK#,[K1162k]
3,A2M,NP_000005.2,DMYSFLEDMGLk,A2M-K676k,DMYSFLEDMGLK#,[K676k]
4,A2M,NP_000005.2,DNSVHWERPQkPK,A2M-K1188k,DNSVHWERPQK#PK,[K1188k]
5,A2M,NP_000005.2,EVLkSLNEEAVK,A2M-K1168k,EVLK#SLNEEAVK,[K1168k]
6,A2M,NP_000005.2,FSGQLNSHGCFYQQVk,A2M-K305k,FSGQLNSHGCFYQQVK#,[K305k]
7,A2M,NP_000005.2,GPTQEFk,A2M-K115k,GPTQEFK#,[K115k]
8,A2M,NP_000005.2,GPTQEFkK,A2M-K115k,GPTQEFK#K,[K115k]
9,A2M,NP_000005.2,NEDSLVFVQTDk,A2M-K135k,NEDSLVFVQTDK#,[K135k]


In [16]:
# Check what residues are modified.
# .str[0] is used instead of apply(lambda x: x[0]) so that the NaN produced by
# explode() for an empty site list does not raise; value_counts() drops it.
peptide_df['acetylsites'].explode().str[0].value_counts()

K    19428
Name: acetylsites, dtype: int64

In [17]:
# Keep only K that are acetylsites and drop the trailing lowercase residue
# marker ('K134k' -> 'K134'). The marker is removed only when it is present, so
# re-running this cell does not cut into the position digits.
peptide_df['acetylsites'] = peptide_df['acetylsites'].apply(
    lambda l: [
        x[:-1] if x[-1].islower() else x
        for x in l
        if x[0] in 'K'
    ]
)

In [18]:
# Calculate the peptide location: one [start, end, valid, reason] row per peptide
peptide_loc_range_df = pd.DataFrame(
    [
        calc_peptide_start_end(pep, prot_id, site_list)
        for pep, prot_id, site_list in zip(
            peptide_df['peptide'], peptide_df['refseq_id'], peptide_df['acetylsites']
        )
    ],
    index=peptide_df.index,
    columns=['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
)
# Nullable integer dtype, since start/end are None when a peptide cannot be placed
peptide_loc_range_df = peptide_loc_range_df.astype(
    {'peptide_start': 'Int64', 'peptide_end': 'Int64'}
)

In [19]:
# Put the peptide annotations and their computed locations side by side
peptide_with_loc_df = pd.concat([peptide_df, peptide_loc_range_df], axis='columns')
# Show every peptide whose location failed validation (with the reason)
peptide_with_loc_df[
    ['refseq_id', 'acetylsites', 'peptide', 'original_id', 'peptide_loc_invalid_reason']
].loc[~peptide_loc_range_df['peptide_loc_valid']]

Unnamed: 0,refseq_id,acetylsites,peptide,original_id,peptide_loc_invalid_reason


In [20]:
# Select the output columns and flatten each site list into a ';'-separated string
out_df = peptide_with_loc_df[
    ['original_id', 'peptide', 'acetylsites', 'peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason']
].assign(
    acetylsites=lambda df: df['acetylsites'].str.join(';')
)
# Write the table next to the input file.
# NOTE(review): with_suffix() replaces only the final '.gz', so the output name
# keeps the input's '.tsv' ahead of '.peptide_location.tsv.gz' -- confirm intended.
out_df.to_csv(acetyl_pth.with_suffix('.peptide_location.tsv.gz'), sep='\t', index=False)

In [21]:
# Preview the acetyl peptide-location table that was just written
out_df.head()

Unnamed: 0,original_id,peptide,acetylsites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
0,SLPAPWLSMAPVSWITPGLK#,SLPAPWLSMAPVSWITPGLk,K134,115,134,True,
1,AFTNSK#IR,AFTNSkIR,K682,677,684,True,
2,ALLAYAFALAGNQDK#,ALLAYAFALAGNQDk,K1162,1148,1162,True,
3,DMYSFLEDMGLK#,DMYSFLEDMGLk,K676,665,676,True,
4,DNSVHWERPQK#PK,DNSVHWERPQkPK,K1188,1178,1190,True,
