In [1]:
import os
import re
from pathlib import Path

import pandas as pd
from cmapPy.pandasGEXpress.parse import parse
from pysam import FastaFile

In [2]:
# Root directory of the CPTAC proteome data. The default is the original
# author's local folder; set the CPTAC_DATA_ROOT environment variable to run
# the notebook against a copy of the data stored elsewhere.
DATA_ROOT = Path(os.environ.get('CPTAC_DATA_ROOT', '/Users/liang/Box/MyCPTAC/CPTAC_proteome_v2.0/'))

# Protein sequence FASTA used to look up the protein each peptide belongs to
refseq_fasta_pth = DATA_ROOT / 'DCC/RefSeq_20180629/RefSeq.20180629_Human_ucsc_hg38_cpdbnr_mito_264contams.fasta.gz'

# One GCT table per modification type (LSCC discovery cohort, v3.2)
phospho_pth = DATA_ROOT / 'CPTAC3/LSCC_discovery/lscc-v3.2-phosphoproteome-ratio-norm-unfiltered.gct'
acetyl_pth = DATA_ROOT / 'CPTAC3/LSCC_discovery/lscc-v3.2-acetylome-ratio-norm-unfiltered.gct'
ubiquitin_pth = DATA_ROOT / 'CPTAC3/LSCC_discovery/lscc-v3.2-ubiquitylome-ratio-norm-unfiltered.gct'

In [3]:
# Open the protein FASTA with pysam's FastaFile. calc_peptide_start_end reads
# this module-level handle (protein_fa.fetch) to get each protein's sequence.
protein_fa = FastaFile(str(refseq_fasta_pth))

In [4]:
def calc_peptide_start_end(peptide, protein_id, sites):
    """Locate a modified peptide on its protein, anchored on its first site.

    The position of the first modification site within the peptide (found as
    the first lowercase occurrence of that residue) and its position on the
    protein together give the peptide's start and end, which are then checked
    against the protein sequence.

    Parameters
    ----------
    peptide : str
        Peptide sequence with modified residues written in lowercase,
        e.g. ``'SPEPGQtWTR'``.
    protein_id : str
        Record name passed to the module-level ``protein_fa.fetch``.
    sites : sequence of str
        Sites written as residue letter followed by the 1-based protein
        position, e.g. ``'T99'``. Only ``sites[0]`` is used as the anchor.

    Returns
    -------
    list
        ``[peptide_start, peptide_end, valid, invalid_reason]``. Start and end
        are 1-based inclusive protein coordinates and ``invalid_reason`` is
        ``None`` when ``valid`` is ``True``; otherwise start and end are
        ``None`` and ``invalid_reason`` explains the failure.

    Raises
    ------
    ValueError
        If the position part of ``sites[0]`` is not an integer.

    Errors raised by ``protein_fa.fetch`` propagate unchanged.
    """
    # Make sure the peptide is part of the protein sequence
    protein_seq = protein_fa.fetch(protein_id)
    peptide_upper = peptide.upper()
    if peptide_upper not in protein_seq:
        return [None, None, False, 'Peptide not found in protein']

    # Without an anchor site the location cannot be derived; report the row
    # as invalid instead of raising IndexError and aborting the whole run.
    if len(sites) == 0:
        return [None, None, False, 'No modification site given']

    first_res = sites[0][0].lower()
    first_res_loc = int(sites[0][1:])
    first_res_ix = peptide.find(first_res)
    # find() returns -1 when the residue is not marked in the peptide; using
    # that offset would place the peptide one residue after the site.
    if first_res_ix == -1:
        return [None, None, False, 'Modified residue not found in peptide']

    peptide_start = first_res_loc - first_res_ix
    peptide_end = peptide_start + len(peptide) - 1
    # A start below 1 would turn into a negative slice index and wrap around
    # to the end of the protein, so reject it before slicing.
    if peptide_start < 1 or protein_seq[peptide_start - 1:peptide_end] != peptide_upper:
        return [None, None, False, 'Invalid location']
    return [peptide_start, peptide_end, True, None]

## Phospho

In [5]:
# Parse the phosphoproteome GCT file and list the row-metadata columns it provides
raw_phospho_gct = parse(str(phospho_pth))
raw_phospho_gct.row_metadata_df.columns

Index(['id.description', 'geneSymbol', 'numColumnsVMsiteObserved', 'bestScore',
       'bestDeltaForwardReverseScore', 'Best_scoreVML',
       'Best_numActualVMSites_sty', 'Best_numLocalizedVMsites_sty',
       'Best_numAmbiguousVMsites_sty', 'StartAA', 'VMsiteFlanks',
       'variableSites', 'sequence', 'sequenceVML',
       'accessionNumber_VMsites_numVMsitesPresent_numVMsitesLocalizedBest_earliestVMsiteAA_latestVMsiteAA',
       'protein_mw', 'species', 'speciesMulti', 'orfCategory',
       'accession_number', 'accession_numbers', 'protein_group_num',
       'entry_name'],
      dtype='object', name='rhd')

In [6]:
# Select the per-site annotation columns, dropping rows whose accession starts
# with 'smORF'. The explicit .copy() makes peptide_df an independent frame, so
# the column assignments that follow neither raise SettingWithCopyWarning nor
# risk writing into the parsed GCT metadata.
peptide_df = raw_phospho_gct.row_metadata_df.loc[
    # Only Keep RefSeq accessions
    ~raw_phospho_gct.row_metadata_df['accession_number'].str.startswith('smORF'),
    ['variableSites', 'sequence', 'accession_number', 'protein_group_num']
].copy()
# Split the site string into a list of entries shaped like 'S18s'
# (uppercase residue, position digits, lowercase residue)
peptide_df['variableSites'] = peptide_df['variableSites'].str.findall(r'[A-Z]\d+[a-z]')
peptide_df.head(10)

rhd,variableSites,sequence,accession_number,protein_group_num
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NP_001333374.1_S18s _1_1_18_18,[S18s],ELLLPNWQGsGSHGLTIAQR,NP_001333374.1,1.1
NP_001333374.1_S41s _1_1_41_41,[S41s],DDGVFVQEVTQNsPAAR,NP_001333374.1,1.1
NP_001333374.1_S93s _1_1_93_93,[S93s],KGDRsPEPGQTWTR,NP_001333374.1,1.1
NP_001333374.1_T99t _1_1_99_99,[T99t],SPEPGQtWTR,NP_001333374.1,1.1
NP_001333374.1_S110s _1_0_109_110,[S110s],EVFSSCSsEVVLSGDDEEYQR,NP_001333374.1,1.1
NP_001333374.1_S110s _1_1_110_110,[S110s],EVFSSCSsEVVLSGDDEEYQR,NP_001333374.1,1.1
NP_001333374.1_S115s _1_1_115_115,[S115s],EVFSSCSSEVVLsGDDEEYQR,NP_001333374.1,1.1
NP_001333374.1_S135s _1_1_135_135,[S135s],sEDGVEGDLGETQSR,NP_001333374.1,1.1
NP_001333374.1_T146t _1_1_146_146,[T146t],SEDGVEGDLGEtQSR,NP_001333374.1,1.1
NP_001333374.1_T158t _1_1_158_158,[T158t],RVtAYTVDVTGR,NP_001333374.1,1.1


In [7]:
# Check what residues are modified.
# .str[0] takes the residue letter of each site and passes through the NaN
# that explode() produces for an empty site list, where the previous
# .apply(lambda x: x[0]) raised TypeError.
peptide_df['variableSites'].explode().str[0].value_counts()

S    67442
T    12674
M     2425
Y     1185
N     1132
Q      227
C      104
Name: variableSites, dtype: int64

In [8]:
# Retain the S/T/Y entries only (the phosphosites) and strip the trailing
# lowercase letter, e.g. 'S18s' -> 'S18'
peptide_df['phosphosites'] = peptide_df['variableSites'].map(
    lambda site_list: [site[:-1] for site in site_list if site[0] in 'STY']
)

In [9]:
# Locate every peptide on its protein: one [start, end, valid, reason] record
# per row of peptide_df. The nullable Int64 dtype keeps the coordinates
# integral even when an invalid row carries None.
peptide_loc_range_df = pd.DataFrame(
    [
        calc_peptide_start_end(seq, accession, site_list)
        for seq, accession, site_list in zip(
            peptide_df['sequence'],
            peptide_df['accession_number'],
            peptide_df['phosphosites'],
        )
    ],
    index=peptide_df.index,
    columns=['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
).astype({'peptide_start': 'Int64', 'peptide_end': 'Int64'})

In [10]:
# Put the peptide annotations and their computed locations side by side
peptide_with_loc_df = pd.concat((peptide_df, peptide_loc_range_df), axis='columns')
# Show the peptides whose location could not be validated
peptide_with_loc_df.loc[
    lambda df: ~df['peptide_loc_valid'],
    ['accession_number', 'variableSites', 'sequence', 'peptide_loc_invalid_reason']
]

Unnamed: 0_level_0,accession_number,variableSites,sequence,peptide_loc_invalid_reason
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [11]:
# Build the export table: row id plus sites and location columns, with the
# site list flattened into a ';'-separated string
out_df = (
    peptide_with_loc_df[
        ['phosphosites', 'peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason']
    ]
    .reset_index()
    .assign(phosphosites=lambda df: df['phosphosites'].str.join(';'))
)
# Write the table next to the input GCT, swapping its extension
out_df.to_csv(phospho_pth.with_suffix('.peptide_location.tsv'), sep='\t', index=False)

In [12]:
# Preview the first rows of the phosphosite location table written above
out_df.head()

Unnamed: 0,rid,phosphosites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
0,NP_001333374.1_S18s _1_1_18_18,S18,9,28,True,
1,NP_001333374.1_S41s _1_1_41_41,S41,29,45,True,
2,NP_001333374.1_S93s _1_1_93_93,S93,89,102,True,
3,NP_001333374.1_T99t _1_1_99_99,T99,93,102,True,
4,NP_001333374.1_S110s _1_0_109_110,S110,103,123,True,


## Acetyl

In [13]:
# Parse the acetylome GCT file and list the row-metadata columns it provides
raw_acetyl_gct = parse(str(acetyl_pth))
raw_acetyl_gct.row_metadata_df.columns

Index(['id.description', 'geneSymbol', 'numColumnsVMsiteObserved', 'bestScore',
       'bestDeltaForwardReverseScore', 'Best_scoreVML', 'StartAA',
       'VMsiteFlanks', 'Best_numActualVMSites_k', 'Best_numLocalizedVMsites_k',
       'Best_numAmbiguousVMsites_k', 'variableSites', 'sequence',
       'sequenceVML',
       'accessionNumber_VMsites_numVMsitesPresent_numVMsitesLocalizedBest_earliestVMsiteAA_latestVMsiteAA',
       'protein_mw', 'species', 'speciesMulti', 'orfCategory',
       'accession_number', 'accession_numbers', 'protein_group_num',
       'entry_name'],
      dtype='object', name='rhd')

In [14]:
# Select the per-site annotation columns, dropping rows whose accession starts
# with 'smORF'. The explicit .copy() makes peptide_df an independent frame, so
# the column assignments that follow neither raise SettingWithCopyWarning nor
# risk writing into the parsed GCT metadata.
peptide_df = raw_acetyl_gct.row_metadata_df.loc[
    # Only Keep RefSeq accessions
    ~raw_acetyl_gct.row_metadata_df['accession_number'].str.startswith('smORF'),
    ['variableSites', 'sequence', 'accession_number', 'protein_group_num']
].copy()
# Split the site string into a list of entries shaped like 'K28k'
# (uppercase residue, position digits, lowercase residue)
peptide_df['variableSites'] = peptide_df['variableSites'].str.findall(r'[A-Z]\d+[a-z]')
peptide_df.head(10)

rhd,variableSites,sequence,accession_number,protein_group_num
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NP_000468.1_K28k _1_1_28_28,[K28k],DAHkSEVAHR,NP_000468.1,1.1
NP_000468.1_K36k _1_1_36_36,[K36k],FkDLGEENFK,NP_000468.1,1.1
NP_000468.1_K44k _1_1_44_44,[K44k],DLGEENFk,NP_000468.1,1.1
NP_000468.1_K75k _1_1_75_75,[K75k],LVNEVTEFAkTCVADESAENCDK,NP_000468.1,1.1
NP_000468.1_K88k _1_1_88_88,[K88k],TCVADESAENCDkSLHTLFGDK,NP_000468.1,1.1
NP_000468.1_K97k _1_1_97_97,[K97k],SLHTLFGDkLCTVATLR,NP_000468.1,1.1
NP_000468.1_K117k _1_1_117_117,[K117k],ETYGEMADCCAkQEPERNECFLQHK,NP_000468.1,1.1
NP_000468.1_K130k _1_1_130_130,[K130k],QEPERNECFLQHkDDNPNLPR,NP_000468.1,1.1
NP_000468.1_K160k _1_1_160_160,[K160k],LVRPEVDVMCTAFHDNEETFLkK,NP_000468.1,1.1
NP_000468.1_K161k _1_1_161_161,[K161k],kYLYEIAR,NP_000468.1,1.1


In [15]:
# Check what residues are modified.
# .str[0] takes the residue letter of each site and passes through the NaN
# that explode() produces for an empty site list, where the previous
# .apply(lambda x: x[0]) raised TypeError.
peptide_df['variableSites'].explode().str[0].value_counts()

K    15696
M      432
N      175
Q       38
Name: variableSites, dtype: int64

In [16]:
# Retain the K entries only (the acetyl sites) and strip the trailing
# lowercase letter, e.g. 'K28k' -> 'K28'
peptide_df['acetylsites'] = peptide_df['variableSites'].map(
    lambda site_list: [site[:-1] for site in site_list if site[0] in 'K']
)

In [17]:
# Locate every peptide on its protein: one [start, end, valid, reason] record
# per row of peptide_df. The nullable Int64 dtype keeps the coordinates
# integral even when an invalid row carries None.
peptide_loc_range_df = pd.DataFrame(
    [
        calc_peptide_start_end(seq, accession, site_list)
        for seq, accession, site_list in zip(
            peptide_df['sequence'],
            peptide_df['accession_number'],
            peptide_df['acetylsites'],
        )
    ],
    index=peptide_df.index,
    columns=['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
).astype({'peptide_start': 'Int64', 'peptide_end': 'Int64'})

In [18]:
# Put the peptide annotations and their computed locations side by side
peptide_with_loc_df = pd.concat((peptide_df, peptide_loc_range_df), axis='columns')
# Show the first peptides whose location could not be validated
peptide_with_loc_df.loc[lambda df: ~df['peptide_loc_valid']].head()

Unnamed: 0_level_0,variableSites,sequence,accession_number,protein_group_num,acetylsites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [19]:
# Build the export table: row id plus sites and location columns, with the
# site list flattened into a ';'-separated string
out_df = (
    peptide_with_loc_df[
        ['acetylsites', 'peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason']
    ]
    .reset_index()
    .assign(acetylsites=lambda df: df['acetylsites'].str.join(';'))
)
# Write the table next to the input GCT, swapping its extension
out_df.to_csv(acetyl_pth.with_suffix('.peptide_location.tsv'), sep='\t', index=False)

In [20]:
# Preview the first rows of the acetyl site location table written above
out_df.head()

Unnamed: 0,rid,acetylsites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
0,NP_000468.1_K28k _1_1_28_28,K28,25,34,True,
1,NP_000468.1_K36k _1_1_36_36,K36,35,44,True,
2,NP_000468.1_K44k _1_1_44_44,K44,37,44,True,
3,NP_000468.1_K75k _1_1_75_75,K75,66,88,True,
4,NP_000468.1_K88k _1_1_88_88,K88,76,97,True,


## Ubiquitin

In [21]:
# Parse the ubiquitylome GCT file and list the row-metadata columns it provides
raw_ubiquitin_gct = parse(str(ubiquitin_pth))
raw_ubiquitin_gct.row_metadata_df.columns

Index(['id.description', 'geneSymbol', 'numColumnsVMsiteObserved', 'bestScore',
       'bestDeltaForwardReverseScore', 'Best_scoreVML', 'StartAA',
       'VMsiteFlanks', 'Best_numActualVMSites_k', 'Best_numLocalizedVMsites_k',
       'Best_numAmbiguousVMsites_k', 'variableSites', 'sequence',
       'sequenceVML',
       'accessionNumber_VMsites_numVMsitesPresent_numVMsitesLocalizedBest_earliestVMsiteAA_latestVMsiteAA',
       'protein_mw', 'species', 'speciesMulti', 'orfCategory',
       'accession_number', 'accession_numbers', 'protein_group_num',
       'entry_name'],
      dtype='object', name='rhd')

In [22]:
# Select the per-site annotation columns, dropping rows whose accession starts
# with 'smORF'. The explicit .copy() makes peptide_df an independent frame, so
# the column assignments that follow neither raise SettingWithCopyWarning nor
# risk writing into the parsed GCT metadata.
peptide_df = raw_ubiquitin_gct.row_metadata_df.loc[
    # Only Keep RefSeq accessions
    ~raw_ubiquitin_gct.row_metadata_df['accession_number'].str.startswith('smORF'),
    ['variableSites', 'sequence', 'accession_number', 'protein_group_num']
].copy()
# Split the site string into a list of entries shaped like 'K3k'
# (uppercase residue, position digits, lowercase residue)
peptide_df['variableSites'] = peptide_df['variableSites'].str.findall(r'[A-Z]\d+[a-z]')
peptide_df.head(10)

rhd,variableSites,sequence,accession_number,protein_group_num
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NP_001333374.1_K3k _1_1_3_3,[K3k],MEkEETTR,NP_001333374.1,1.1
NP_001333374.1_K128k _1_1_128_128,[K128k],IYTTkIK,NP_001333374.1,1.1
NP_001333374.1_K134k _1_1_134_134,[K134k],LkSEDGVEGDLGETQSR,NP_001333374.1,1.1
NP_001333374.1_K171k _1_1_171_171,[K171k],EGAkDIDISSPEFK,NP_001333374.1,1.1
NP_001333374.1_K181k _1_1_181_181,[K181k],DIDISSPEFkIK,NP_001333374.1,1.1
NP_001333374.1_K203k _1_1_203_203,[K203k],HELTEISNVDVETQSGkTVIR,NP_001333374.1,1.1
NP_001333374.1_K243k _1_1_243_243,[K243k],AGAISASGPELQGAGHSkLQVTMPGIK,NP_001333374.1,1.1
NP_001333374.1_K252k _1_1_252_252,[K252k],LQVTMPGIkVGGSGVNVNAK,NP_001333374.1,1.1
NP_001333374.1_K263k _1_1_263_263,[K263k],VGGSGVNVNAkGLDLGGR,NP_001333374.1,1.1
NP_001333374.1_K326k _1_1_326_326,[K326k],EGQTPkAGLR,NP_001333374.1,1.1


In [23]:
# Check what residues are modified.
# .str[0] takes the residue letter of each site and passes through the NaN
# that explode() produces for an empty site list, where the previous
# .apply(lambda x: x[0]) raised TypeError.
peptide_df['variableSites'].explode().str[0].value_counts()

K    25809
M      124
N      120
Q       42
Name: variableSites, dtype: int64

In [24]:
# Retain the K entries only (the ubiquitin sites) and strip the trailing
# lowercase letter, e.g. 'K3k' -> 'K3'
peptide_df['ubiquitinsites'] = peptide_df['variableSites'].map(
    lambda site_list: [site[:-1] for site in site_list if site[0] in 'K']
)

In [25]:
# Locate every peptide on its protein: one [start, end, valid, reason] record
# per row of peptide_df. The nullable Int64 dtype keeps the coordinates
# integral even when an invalid row carries None.
peptide_loc_range_df = pd.DataFrame(
    [
        calc_peptide_start_end(seq, accession, site_list)
        for seq, accession, site_list in zip(
            peptide_df['sequence'],
            peptide_df['accession_number'],
            peptide_df['ubiquitinsites'],
        )
    ],
    index=peptide_df.index,
    columns=['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
).astype({'peptide_start': 'Int64', 'peptide_end': 'Int64'})

In [26]:
# Put the peptide annotations and their computed locations side by side
peptide_with_loc_df = pd.concat((peptide_df, peptide_loc_range_df), axis='columns')
# Show the peptides whose location could not be validated
peptide_with_loc_df.loc[
    lambda df: ~df['peptide_loc_valid'],
    ['accession_number', 'variableSites', 'sequence', 'peptide_loc_invalid_reason']
]

Unnamed: 0_level_0,accession_number,variableSites,sequence,peptide_loc_invalid_reason
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [27]:
# Build the export table: row id plus sites and location columns, with the
# site list flattened into a ';'-separated string
out_df = (
    peptide_with_loc_df[
        ['ubiquitinsites', 'peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason']
    ]
    .reset_index()
    .assign(ubiquitinsites=lambda df: df['ubiquitinsites'].str.join(';'))
)
# Write the table next to the input GCT, swapping its extension
out_df.to_csv(ubiquitin_pth.with_suffix('.peptide_location.tsv'), sep='\t', index=False)

In [28]:
# Preview the first rows of the ubiquitin site location table written above
out_df.head()

Unnamed: 0,rid,ubiquitinsites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
0,NP_001333374.1_K3k _1_1_3_3,K3,1,8,True,
1,NP_001333374.1_K128k _1_1_128_128,K128,124,130,True,
2,NP_001333374.1_K134k _1_1_134_134,K134,133,149,True,
3,NP_001333374.1_K171k _1_1_171_171,K171,168,181,True,
4,NP_001333374.1_K181k _1_1_181_181,K181,172,183,True,
