In [1]:
import os
import re
from pathlib import Path

import pandas as pd
from cmapPy.pandasGEXpress.parse import parse
from pysam import FastaFile

In [2]:
# Root folder of the CPTAC proteome data. Set the CPTAC_DATA_ROOT environment
# variable to run the notebook against another copy of the data; the fallback
# is the machine-specific path this notebook was originally written with.
DATA_ROOT = Path(
    os.environ.get('CPTAC_DATA_ROOT', '/Users/liang/Box/MyCPTAC/CPTAC_proteome_v2.0/')
)

# Protein FASTA used to look up the sequence of each accession
refseq_fasta_pth = DATA_ROOT / 'DCC/RefSeq_20160914/RefSeq.20160914_Human_ucsc_hg19_customProDBnr_mito_150contams.fasta.gz'

# GCT tables of the prospective BRCA phosphoproteome and acetylome
phospho_pth = DATA_ROOT / 'CPTAC2/BRCA_prospective/prosp-brca-v5.3-phosphoproteome-ratio-norm-unfiltered.gct'
acetyl_pth = DATA_ROOT / 'CPTAC2/BRCA_prospective/prosp-brca-v5.3-acetylome-ratio-norm-unfiltered.gct'

In [3]:
# Open the protein FASTA through pysam; calc_peptide_start_end() reads
# sequences from this handle by record name via protein_fa.fetch().
# NOTE(review): pysam presumably requires the .gz to be bgzip-compressed with a
# faidx index next to it -- confirm before pointing this at a new file.
protein_fa = FastaFile(str(refseq_fasta_pth))

In [4]:
def calc_peptide_start_end(peptide, protein_id, sites):
    """Locate a modified peptide on its protein, anchored on its first site.

    The protein sequence is read from the module-level ``protein_fa`` handle.

    Parameters
    ----------
    peptide : str
        Peptide sequence in which modified residues are written in lowercase
        (e.g. ``'AHEsVVK'``).
    protein_id : str
        Record name passed to ``protein_fa.fetch``.
    sites : list of str
        Modification sites written as an uppercase residue letter followed by
        its 1-based position on the protein (e.g. ``['S26']``). Only the first
        entry is used as the anchor.

    Returns
    -------
    list
        ``[peptide_start, peptide_end, is_valid, invalid_reason]``.
        ``peptide_start`` and ``peptide_end`` are 1-based inclusive protein
        coordinates. When the peptide cannot be placed they are ``None``,
        ``is_valid`` is ``False`` and ``invalid_reason`` is one of
        ``'Protein not found'``, ``'Peptide not found in protein'``,
        ``'No modification site'`` or ``'Invalid location'``.
    """
    # A record missing from the FASTA is reported per peptide instead of
    # aborting the whole batch with a KeyError.
    try:
        protein_seq = protein_fa.fetch(protein_id)
    except KeyError:
        return [None, None, False, 'Protein not found']

    # Make sure the peptide is part of the protein sequence
    peptide_upper = peptide.upper()
    if peptide_upper not in protein_seq:
        return [None, None, False, 'Peptide not found in protein']

    # Without a site there is nothing to anchor the peptide on
    if not sites:
        return [None, None, False, 'No modification site']

    anchor_res = sites[0][0].lower()
    anchor_loc = int(sites[0][1:])

    # Try every lowercase occurrence of the anchor residue, not only the first
    # one: the anchor site is not guaranteed to be the left-most modified
    # residue of its kind in the peptide.
    for anchor_ix, residue in enumerate(peptide):
        if residue != anchor_res:
            continue
        peptide_start = anchor_loc - anchor_ix
        if peptide_start < 1:
            # A non-positive start would make the slice below wrap around
            continue
        peptide_end = peptide_start + len(peptide) - 1
        if protein_seq[peptide_start - 1:peptide_end] == peptide_upper:
            return [peptide_start, peptide_end, True, None]

    return [None, None, False, 'Invalid location']

In [5]:
# Load the phosphoproteome GCT and list the row-metadata columns it provides
raw_phospho_gct = parse(str(phospho_pth))
raw_phospho_gct.row_metadata_df.columns

Index(['id.description', 'geneSymbol', 'numColumnsVMsiteObserved', 'bestScore',
       'bestDeltaForwardReverseScore', 'Best_scoreVML',
       'Best_numActualVMSites_sty', 'Best_numLocalizedVMsites_sty',
       'variableSites', 'sequence', 'sequenceVML',
       'accessionNumber_VMsites_numVMsitesPresent_numVMsitesLocalizedBest_earliestVMsiteAA_latestVMsiteAA',
       'protein_mw', 'species', 'speciesMulti', 'orfCategory',
       'accession_number', 'accession_numbers', 'protein_group_num',
       'entry_name'],
      dtype='object', name='rhd')

In [6]:
# Select the rows and columns needed to place each peptide on its protein.
# `.copy()` makes peptide_df an independent frame, so the column assignment
# below cannot trigger SettingWithCopyWarning or write through to the GCT's
# row metadata.
peptide_df = raw_phospho_gct.row_metadata_df.loc[
    # Only keep RefSeq accessions (drop the smORF entries)
    ~raw_phospho_gct.row_metadata_df['accession_number'].str.startswith('smORF'),
    ['variableSites', 'sequence', 'accession_number', 'protein_group_num']
].copy()
# Split the site string into tokens such as 'S26s':
# uppercase residue, position, lowercase letter
peptide_df['variableSites'] = peptide_df['variableSites'].str.findall(r'[A-Z]\d+[a-z]')
peptide_df.head(10)

rhd,variableSites,sequence,accession_number,protein_group_num
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NP_001305715.1_S26s _1_1_26_26,[S26s],AHEsVVK,NP_001305715.1,1.2
NP_001305715.1_S58s _1_1_58_58,[S58s],HLSAsQK,NP_001305715.1,1.2
NP_001611.1_S18s _1_1_18_18,[S18s],ELLLPNWQGsGSHGLTIAQR,NP_001611.1,1.1
NP_001611.1_S41s _1_1_41_41,[S41s],DDGVFVQEVTQNsPAAR,NP_001611.1,1.1
NP_001611.1_S93s _1_1_93_93,[S93s],KGDRsPEPGQTWTR,NP_001611.1,1.1
NP_001611.1_T99t _1_1_99_99,[T99t],SPEPGQtWTR,NP_001611.1,1.1
NP_001611.1_S110s _1_1_110_110,[S110s],EVFSSCSsEVVLSGDDEEYQR,NP_001611.1,1.1
NP_001611.1_S115s _1_1_115_115,[S115s],EVFSSCSSEVVLsGDDEEYQR,NP_001611.1,1.1
NP_001611.1_Y121y _1_0_121_127,[Y121y],EVFSSCSSEVVLSGDDEEyQRIYTTK,NP_001611.1,1.1
NP_001611.1_S135s _1_1_135_135,[S135s],LKsEDGVEGDLGETQSR,NP_001611.1,1.1


Check what residues are modified

In [7]:
# Count modified residues by their one-letter code. `.str[0]` is NaN-safe:
# a peptide with an empty site list explodes to NaN, which value_counts()
# then ignores instead of the lambda failing on it.
peptide_df['variableSites'].explode().str[0].value_counts()

S    61747
T    12421
M     1817
Y     1172
N     1143
Q      225
C      100
Name: variableSites, dtype: int64

In [8]:
# Phosphosites are the S/T/Y entries; each token loses its trailing
# lowercase letter ('S26s' -> 'S26')
peptide_df['phosphosites'] = peptide_df['variableSites'].apply(
    lambda tokens: [
        token[:-1]
        for token in tokens
        if token[0] in {'S', 'T', 'Y'}
    ]
)

In [9]:
# Place every peptide on its protein: one [start, end, valid, reason] row per
# peptide, kept on the same index as peptide_df
peptide_loc_range_df = pd.DataFrame(
    [
        calc_peptide_start_end(seq, accession, sites)
        for seq, accession, sites in zip(
            peptide_df['sequence'],
            peptide_df['accession_number'],
            peptide_df['phosphosites'],
        )
    ],
    index=peptide_df.index,
    columns=['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
).astype({
    # Nullable integers so unplaced peptides can hold a missing value
    'peptide_start': 'Int64',
    'peptide_end': 'Int64',
})

In [10]:
# Put the peptide annotations and their computed coordinates side by side
peptide_with_loc_df = pd.concat(
    [peptide_df, peptide_loc_range_df],
    axis='columns',
)
# Preview the peptides whose location failed validation
peptide_with_loc_df.loc[~peptide_loc_range_df['peptide_loc_valid']].head()

Unnamed: 0_level_0,variableSites,sequence,accession_number,protein_group_num,phosphosites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [11]:
# Export table: row id plus sites and coordinates, with the site list
# flattened to a ';'-separated string
out_df = (
    peptide_with_loc_df[
        ['phosphosites', 'peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason']
    ]
    .reset_index()
    .assign(phosphosites=lambda df: df['phosphosites'].str.join(';'))
)
# Written next to the input GCT, with the '.gct' suffix replaced
out_df.to_csv(
    phospho_pth.with_suffix('.peptide_location.tsv.gz'),
    sep='\t',
    index=False,
)

In [12]:
# Preview the first rows of the exported phosphosite table
out_df.head()

Unnamed: 0,rid,phosphosites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
0,NP_001305715.1_S26s _1_1_26_26,S26,23,29,True,
1,NP_001305715.1_S58s _1_1_58_58,S58,54,60,True,
2,NP_001611.1_S18s _1_1_18_18,S18,9,28,True,
3,NP_001611.1_S41s _1_1_41_41,S41,29,45,True,
4,NP_001611.1_S93s _1_1_93_93,S93,89,102,True,


## Acetylation

In [13]:
# Load the acetylome GCT and list the row-metadata columns it provides
raw_acetyl_gct = parse(str(acetyl_pth))
raw_acetyl_gct.row_metadata_df.columns

Index(['id.description', 'geneSymbol', 'numColumnsVMsiteObserved', 'bestScore',
       'bestDeltaForwardReverseScore', 'Best_scoreVML', 'variableSites',
       'sequence', 'sequenceVML',
       'accessionNumber_VMsites_numVMsitesPresent_numVMsitesLocalizedBest_earliestVMsiteAA_latestVMsiteAA',
       'protein_mw', 'species', 'speciesMulti', 'orfCategory',
       'accession_number', 'accession_numbers', 'protein_group_num',
       'entry_name'],
      dtype='object', name='rhd')

In [14]:
# Select the rows and columns needed to place each peptide on its protein.
# `.copy()` makes peptide_df an independent frame, so the column assignment
# below cannot trigger SettingWithCopyWarning or write through to the GCT's
# row metadata.
peptide_df = raw_acetyl_gct.row_metadata_df.loc[
    # Only keep RefSeq accessions (drop the smORF entries)
    ~raw_acetyl_gct.row_metadata_df['accession_number'].str.startswith('smORF'),
    ['variableSites', 'sequence', 'accession_number', 'protein_group_num']
].copy()
# Split the site string into tokens such as 'K28k':
# uppercase residue, position, lowercase letter
peptide_df['variableSites'] = peptide_df['variableSites'].str.findall(r'[A-Z]\d+[a-z]')
peptide_df.head(10)

rhd,variableSites,sequence,accession_number,protein_group_num
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NP_000468.1_K28k _1_1_28_28,[K28k],DAHkSEVAHR,NP_000468.1,1.1
NP_000468.1_K36k _1_1_36_36,[K36k],FkDLGEENFK,NP_000468.1,1.1
NP_000468.1_K44k _1_1_44_44,[K44k],FKDLGEENFk,NP_000468.1,1.1
NP_000468.1_K65k _1_1_65_65,[K65k],ALVLIAFAQYLQQCPFEDHVk,NP_000468.1,1.1
NP_000468.1_K75k _1_1_75_75,[K75k],LVNEVTEFAkTCVADESAENCDK,NP_000468.1,1.1
NP_000468.1_K88k _1_1_88_88,[K88k],TCVADESAENCDkSLHTLFGDK,NP_000468.1,1.1
NP_000468.1_K97k _1_1_97_97,[K97k],SLHTLFGDkLCTVATLR,NP_000468.1,1.1
NP_000468.1_K117k _1_1_117_117,[K117k],ETYGEMADCCAkQEPERNECFLQHK,NP_000468.1,1.1
NP_000468.1_K130k_1_1_130_130,"[Q118q, K130k]",qEPERNECFLQHkDDNPNLPR,NP_000468.1,1.1
NP_000468.1_K160k _1_1_160_160,[K160k],LVRPEVDVMCTAFHDNEETFLkK,NP_000468.1,1.1


In [15]:
# Count modified residues by their one-letter code. `.str[0]` is NaN-safe:
# a peptide with an empty site list explodes to NaN, which value_counts()
# then ignores instead of the lambda failing on it.
peptide_df['variableSites'].explode().str[0].value_counts()

K    18848
M      855
N      149
Q       34
Name: variableSites, dtype: int64

In [16]:
# Acetylsites are the K entries; each token loses its trailing
# lowercase letter ('K28k' -> 'K28')
peptide_df['acetylsites'] = peptide_df['variableSites'].apply(
    lambda tokens: [
        token[:-1]
        for token in tokens
        if token[0] == 'K'
    ]
)

In [17]:
# Place every peptide on its protein: one [start, end, valid, reason] row per
# peptide, kept on the same index as peptide_df
peptide_loc_range_df = pd.DataFrame(
    [
        calc_peptide_start_end(seq, accession, sites)
        for seq, accession, sites in zip(
            peptide_df['sequence'],
            peptide_df['accession_number'],
            peptide_df['acetylsites'],
        )
    ],
    index=peptide_df.index,
    columns=['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
).astype({
    # Nullable integers so unplaced peptides can hold a missing value
    'peptide_start': 'Int64',
    'peptide_end': 'Int64',
})

In [18]:
# Put the peptide annotations and their computed coordinates side by side
peptide_with_loc_df = pd.concat(
    [peptide_df, peptide_loc_range_df],
    axis='columns',
)
# Preview the peptides whose location failed validation
peptide_with_loc_df.loc[~peptide_loc_range_df['peptide_loc_valid']].head()

Unnamed: 0_level_0,variableSites,sequence,accession_number,protein_group_num,acetylsites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [19]:
# Export table: row id plus sites and coordinates, with the site list
# flattened to a ';'-separated string
out_df = (
    peptide_with_loc_df[
        ['acetylsites', 'peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason']
    ]
    .reset_index()
    .assign(acetylsites=lambda df: df['acetylsites'].str.join(';'))
)
# Written next to the input GCT, with the '.gct' suffix replaced
out_df.to_csv(
    acetyl_pth.with_suffix('.peptide_location.tsv.gz'),
    sep='\t',
    index=False,
)

In [20]:
# Preview the first rows of the exported acetylsite table
out_df.head()

Unnamed: 0,rid,acetylsites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
0,NP_000468.1_K28k _1_1_28_28,K28,25,34,True,
1,NP_000468.1_K36k _1_1_36_36,K36,35,44,True,
2,NP_000468.1_K44k _1_1_44_44,K44,35,44,True,
3,NP_000468.1_K65k _1_1_65_65,K65,45,65,True,
4,NP_000468.1_K75k _1_1_75_75,K75,66,88,True,
