In [23]:
import os
import re
from pathlib import Path

import pandas as pd
from cmapPy.pandasGEXpress.parse import parse
from pysam import FastaFile

In [24]:
# Root directory of the local CPTAC proteome data. Every input and output path
# in this notebook is built relative to it. Set the CPTAC_DATA_ROOT environment
# variable to run the notebook on another machine; the original location is
# kept as the default so existing runs are unaffected.
DATA_ROOT = Path(
    os.environ.get('CPTAC_DATA_ROOT', '/Users/liang/Box/MyCPTAC/CPTAC_proteome_v3.0/')
)

In [25]:
# RefSeq protein FASTA; protein sequences are later fetched from it by accession.
refseq_fasta_path = DATA_ROOT / 'DCC/RefSeq_20180629/RefSeq.20180629_Human_ucsc_hg38_cpdbnr_mito_264contams.fasta.gz'
protein_fa = FastaFile(str(refseq_fasta_path))

In [26]:
# Load the LUAD phosphoproteome GCT file.
phospho_gct_path = DATA_ROOT / 'CPTAC3/LUAD_discovery/luad-v3.1-phosphoproteome-ratio-norm-unfiltered.gct'
raw_phospho_gct = parse(str(phospho_gct_path))

In [27]:
# Inspect which row-metadata (per-row annotation) fields the phospho GCT provides.
raw_phospho_gct.row_metadata_df.columns

Index(['id.description', 'geneSymbol', 'numColumnsVMsiteObserved', 'bestScore',
       'bestDeltaForwardReverseScore', 'Best_scoreVML',
       'Best_numActualVMSites_sty', 'Best_numLocalizedVMsites_sty',
       'variableSites', 'sequence', 'sequenceVML',
       'accessionNumber_VMsites_numVMsitesPresent_numVMsitesLocalizedBest_earliestVMsiteAA_latestVMsiteAA',
       'protein_mw', 'species', 'speciesMulti', 'orfCategory',
       'accession_number', 'accession_numbers', 'protein_group_num',
       'entry_name'],
      dtype='object', name='rhd')

In [28]:
# Build the per-peptide annotation table for the phosphoproteome.
# The whole table is produced in one chain: .assign returns a new frame, so the
# parsed `variableSites` column is never written into a selection of the raw
# metadata (which can raise SettingWithCopyWarning), and re-running the cell
# always yields the same result.
peptide_df = (
    raw_phospho_gct.row_metadata_df
    .loc[
        # Only Keep RefSeq accessions
        ~raw_phospho_gct.row_metadata_df['accession_number'].str.startswith('smORF'),
        ['variableSites', 'sequence', 'accession_number', 'protein_group_num', 'geneSymbol'],
    ]
    # Split the site string into individual tokens such as 'S18s'
    # (upper-case residue, position, lower-case modified-residue letter).
    .assign(
        variableSites=lambda kept: kept['variableSites'].str.findall(r'[A-Z]\d+[a-z]')
    )
)
peptide_df.head(10)

rhd,variableSites,sequence,accession_number,protein_group_num,geneSymbol
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NP_001333374.1_S18s _1_1_18_18,[S18s],ELLLPNWQGsGSHGLTIAQR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_S41s _1_1_41_41,[S41s],DDGVFVQEVTQNsPAAR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_S93s _1_1_93_93,[S93s],KGDRsPEPGQTWTR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_T99t _1_1_99_99,[T99t],GDRSPEPGQtWTR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_S110s _1_1_110_110,[S110s],EVFSSCSsEVVLSGDDEEYQR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_S115s _1_1_115_115,[S115s],EVFSSCSSEVVLsGDDEEYQR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_S135s _1_1_135_135,[S135s],sEDGVEGDLGETQSR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_T146t _1_1_146_146,[T146t],SEDGVEGDLGEtQSR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_T158t _1_1_158_158,[T158t],VtAYTVDVTGR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_S177s _1_1_177_177,[S177s],DIDISsPEFK,NP_001333374.1,1.1,AHNAK


Check which residues carry a variable modification

In [29]:
# Count the sites by residue type: the first character of every site token is
# the residue letter (e.g. 'S18s' -> 'S').
# `.str[0]` is vectorised and passes through the NaN that explode() produces
# for a row with an empty site list; `apply(lambda x: x[0])` raised TypeError there.
peptide_df['variableSites'].explode().str[0].value_counts()

S    63188
T    12339
M     1905
Y     1189
N     1101
Q      219
C      105
Name: variableSites, dtype: int64

In [30]:
# Keep only STY that are phosphosites
peptide_df['phosphosites'] = peptide_df['variableSites'].apply(
    lambda l: [x[:-1] for x in l if x[0] in 'STY']
)

In [31]:
def calc_peptide_start_end(peptide, protein_id, sites, fasta=None):
    """Locate a modified peptide inside its parent protein sequence.

    The location is anchored on the first entry of ``sites``: its residue
    letter is searched for (in lower case) in ``peptide`` and combined with
    the site's protein position to derive the peptide's start and end. The
    derived range is then verified against the protein sequence.

    Parameters
    ----------
    peptide : str
        Peptide sequence with modified residues written in lower case
        (e.g. ``'DIDISsPEFK'``).
    protein_id : str
        Name of the protein record to fetch from the FASTA.
    sites : sequence of str
        Sites formatted as ``<residue letter><1-based protein position>``
        (e.g. ``['S177']``). Only the first entry is used as the anchor.
    fasta : object with a ``fetch(name) -> str`` method, optional
        Sequence source. Defaults to the notebook-level ``protein_fa``.

    Returns
    -------
    list
        ``[peptide_start, peptide_end, is_valid, invalid_reason]``.
        ``peptide_start``/``peptide_end`` are 1-based, inclusive protein
        coordinates. When the peptide cannot be placed they are ``None``,
        ``is_valid`` is ``False`` and ``invalid_reason`` says why; otherwise
        ``invalid_reason`` is ``None``.
    """
    if fasta is None:
        fasta = protein_fa

    try:
        protein_seq = fasta.fetch(protein_id)
    except KeyError:
        # Unknown record name: report it per row instead of aborting the whole run.
        return [None, None, False, 'Protein not found in FASTA']

    # Make sure the peptide is part of the protein sequence
    bare_peptide = peptide.upper()
    if bare_peptide not in protein_seq:
        return [None, None, False, 'Peptide not found in protein']

    # Without a site there is no anchor to position the peptide with.
    if not sites:
        return [None, None, False, 'No modification site']

    anchor_site = sites[0]
    anchor_loc = int(anchor_site[1:])
    # Uses the first lower-case occurrence of the anchor residue in the peptide;
    # this relies on sites[0] being that occurrence.
    anchor_ix = peptide.find(anchor_site[0].lower())
    if anchor_ix < 0:
        # str.find returns -1 when absent; using it as an offset would shift
        # the peptide by one residue instead of flagging the problem.
        return [None, None, False, 'Modified residue not found in peptide']

    peptide_start = anchor_loc - anchor_ix
    peptide_end = peptide_start + len(peptide) - 1
    # A start before residue 1 can never be right, and a negative slice index
    # would silently wrap around to the end of the protein.
    if peptide_start < 1 or protein_seq[peptide_start - 1:peptide_end] != bare_peptide:
        return [None, None, False, 'Invalid location']
    return [peptide_start, peptide_end, True, None]

In [32]:
# Calculate the peptide location
peptide_loc_range_df = pd.DataFrame(
    map(calc_peptide_start_end, 
        peptide_df.sequence, peptide_df.accession_number, peptide_df.phosphosites),
    index=peptide_df.index,
    columns=['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
).astype({
    'peptide_start': pd.Int64Dtype(),
    'peptide_end': pd.Int64Dtype(),
})

In [33]:
# Combine the location and peptide information
peptide_with_loc_df = pd.concat([peptide_df, peptide_loc_range_df], axis=1)
# List all the invalid peptides
peptide_with_loc_df[~peptide_loc_range_df['peptide_loc_valid']].head()

Unnamed: 0_level_0,variableSites,sequence,accession_number,protein_group_num,geneSymbol,phosphosites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [34]:
# Export the phosphosite list and peptide location of every row as a TSV
phospho_export_columns = [
    'phosphosites', 'peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'
]
out_df = (
    peptide_with_loc_df[phospho_export_columns]
    .reset_index()
    # Flatten each site list into a single ';'-separated string
    .assign(phosphosites=lambda frame: frame['phosphosites'].str.join(';'))
)
phospho_out_path = DATA_ROOT / 'CPTAC3/LUAD_discovery/luad-v3.1-phosphoproteome-ratio-norm-unfiltered.peptide_location.tsv'
out_df.to_csv(phospho_out_path, sep='\t', index=False)

In [35]:
# Preview the first rows of the exported phospho table
out_df.head(n=5)

Unnamed: 0,rid,phosphosites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
0,NP_001333374.1_S18s _1_1_18_18,S18,9,28,True,
1,NP_001333374.1_S41s _1_1_41_41,S41,29,45,True,
2,NP_001333374.1_S93s _1_1_93_93,S93,89,102,True,
3,NP_001333374.1_T99t _1_1_99_99,T99,90,102,True,
4,NP_001333374.1_S110s _1_1_110_110,S110,103,123,True,


## Acetylation

In [36]:
# Load the LUAD acetylome GCT file.
acetyl_gct_path = DATA_ROOT / 'CPTAC3/LUAD_discovery/luad-v3.1-acetylome-ratio-norm-unfiltered.gct'
raw_acetyl_gct = parse(str(acetyl_gct_path))

In [37]:
# Inspect which row-metadata (per-row annotation) fields the acetyl GCT provides.
raw_acetyl_gct.row_metadata_df.columns

Index(['id.description', 'geneSymbol', 'numColumnsVMsiteObserved', 'bestScore',
       'bestDeltaForwardReverseScore', 'Best_scoreVML', 'variableSites',
       'sequence', 'sequenceVML',
       'accessionNumber_VMsites_numVMsitesPresent_numVMsitesLocalizedBest_earliestVMsiteAA_latestVMsiteAA',
       'protein_mw', 'species', 'speciesMulti', 'orfCategory',
       'accession_number', 'accession_numbers', 'protein_group_num',
       'entry_name'],
      dtype='object', name='rhd')

In [38]:
# Build the per-peptide annotation table for the acetylome.
# The whole table is produced in one chain: .assign returns a new frame, so the
# parsed `variableSites` column is never written into a selection of the raw
# metadata (which can raise SettingWithCopyWarning), and re-running the cell
# always yields the same result.
peptide_df = (
    raw_acetyl_gct.row_metadata_df
    .loc[
        # Only Keep RefSeq accessions
        ~raw_acetyl_gct.row_metadata_df['accession_number'].str.startswith('smORF'),
        ['variableSites', 'sequence', 'accession_number', 'protein_group_num'],
    ]
    # Split the site string into individual tokens such as 'K28k'
    # (upper-case residue, position, lower-case modified-residue letter).
    .assign(
        variableSites=lambda kept: kept['variableSites'].str.findall(r'[A-Z]\d+[a-z]')
    )
)
peptide_df.head(10)

rhd,variableSites,sequence,accession_number,protein_group_num
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NP_000468.1_K28k _1_1_28_28,[K28k],DAHkSEVAHR,NP_000468.1,1.1
NP_000468.1_K36k _1_1_36_36,[K36k],FkDLGEENFK,NP_000468.1,1.1
NP_000468.1_K44k _1_1_44_44,[K44k],DLGEENFk,NP_000468.1,1.1
NP_000468.1_K65k _1_1_65_65,[K65k],ALVLIAFAQYLQQCPFEDHVk,NP_000468.1,1.1
NP_000468.1_K75k _1_1_75_75,[K75k],LVNEVTEFAkTCVADESAENCDK,NP_000468.1,1.1
NP_000468.1_K88k _1_1_88_88,[K88k],TCVADESAENCDkSLHTLFGDK,NP_000468.1,1.1
NP_000468.1_K97k _1_1_97_97,[K97k],SLHTLFGDkLCTVATLR,NP_000468.1,1.1
NP_000468.1_K117k _1_1_117_117,[K117k],ETYGEMADCCAkQEPERNECFLQHK,NP_000468.1,1.1
NP_000468.1_K130k _1_1_130_130,[K130k],QEPERNECFLQHkDDNPNLPR,NP_000468.1,1.1
NP_000468.1_K160k _1_1_160_160,[K160k],LVRPEVDVMCTAFHDNEETFLkK,NP_000468.1,1.1


In [39]:
# Count the sites by residue type: the first character of every site token is
# the residue letter (e.g. 'K28k' -> 'K').
# `.str[0]` is vectorised and passes through the NaN that explode() produces
# for a row with an empty site list; `apply(lambda x: x[0])` raised TypeError there.
peptide_df['variableSites'].explode().str[0].value_counts()

K    13959
M      478
N      174
Q       45
Name: variableSites, dtype: int64

In [40]:
# Keep only K that are acetylsites
peptide_df['acetylsites'] = peptide_df['variableSites'].apply(
    lambda l: [x[:-1] for x in l if x[0] in 'K']
)

In [41]:
# Calculate the peptide location
peptide_loc_range_df = pd.DataFrame(
    map(calc_peptide_start_end, 
        peptide_df.sequence, peptide_df.accession_number, peptide_df.acetylsites),
    index=peptide_df.index,
    columns=['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
).astype({
    'peptide_start': pd.Int64Dtype(),
    'peptide_end': pd.Int64Dtype(),
})

In [42]:
# Combine the location and peptide information
peptide_with_loc_df = pd.concat([peptide_df, peptide_loc_range_df], axis=1)
# List all the invalid peptides
peptide_with_loc_df[~peptide_loc_range_df['peptide_loc_valid']].head()

Unnamed: 0_level_0,variableSites,sequence,accession_number,protein_group_num,acetylsites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [43]:
# Export the acetylsite list and peptide location of every row as a TSV
acetyl_export_columns = [
    'acetylsites', 'peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'
]
out_df = (
    peptide_with_loc_df[acetyl_export_columns]
    .reset_index()
    # Flatten each site list into a single ';'-separated string
    .assign(acetylsites=lambda frame: frame['acetylsites'].str.join(';'))
)
acetyl_out_path = DATA_ROOT / 'CPTAC3/LUAD_discovery/luad-v3.1-acetylome-ratio-norm-unfiltered.peptide_location.tsv'
out_df.to_csv(acetyl_out_path, sep='\t', index=False)

In [44]:
# Preview the first rows of the exported acetyl table
out_df.head(n=5)

Unnamed: 0,rid,acetylsites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
0,NP_000468.1_K28k _1_1_28_28,K28,25,34,True,
1,NP_000468.1_K36k _1_1_36_36,K36,35,44,True,
2,NP_000468.1_K44k _1_1_44_44,K44,37,44,True,
3,NP_000468.1_K65k _1_1_65_65,K65,45,65,True,
4,NP_000468.1_K75k _1_1_75_75,K75,66,88,True,
