In [1]:
import os
import re
from pathlib import Path

import pandas as pd
from cmapPy.pandasGEXpress.parse import parse
from pysam import FastaFile

In [2]:
# Root of the CPTAC data collection. Set the CPTAC_DATA_ROOT environment
# variable to run this notebook on another machine; the original local path
# is kept as the fallback so existing runs behave the same.
DATA_ROOT = Path(os.environ.get(
    'CPTAC_DATA_ROOT',
    '/Users/liang/Box/MyCPTAC/CPTAC_data_collection_v1/',
))

# Protein FASTA used to look up sequences by accession
refseq_fasta_pth = DATA_ROOT / 'DCC/RefSeq_20180629/RefSeq.20180629_Human_ucsc_hg38_cpdbnr_mito_264contams.fasta.gz'

# Site-level GCT tables; the peptide location TSVs are written next to them
phospho_pth = DATA_ROOT / 'CPTAC3/LSCC_discovery/lscc-v3.2-phosphoproteome-ratio-norm-NArm.gct'
ubiquitin_pth = DATA_ROOT / 'CPTAC3/LSCC_discovery/lscc-v3.2-ubiquitylome-ratio-norm-NArm.gct'

In [3]:
# Open the protein FASTA; calc_peptide_start_end() fetches sequences from
# this handle by accession.
# NOTE(review): pysam presumably requires the file to be bgzip-compressed and
# indexed for random access — confirm for this .fasta.gz.
protein_fa = FastaFile(str(refseq_fasta_pth))

In [4]:
def calc_peptide_start_end(peptide, protein_id, sites, fasta=None):
    """Locate a modified peptide on its protein using its first modified site.

    The peptide is anchored on the protein by the first entry of ``sites``:
    the first occurrence of that residue's lowercase letter in ``peptide`` is
    assumed to sit at the protein position given by the site. The resulting
    range is then verified against the protein sequence.

    Parameters
    ----------
    peptide : str
        Peptide sequence in which modified residues are lowercase,
        e.g. ``'SPEPGQtWTR'``.
    protein_id : str
        Name of the protein sequence to fetch from ``fasta``.
    sites : list of str
        Modified sites as residue letter followed by the 1-based protein
        position, e.g. ``['T99']``. Only the first entry is used.
    fasta : object with a ``fetch(name)`` method, optional
        Source of protein sequences. Defaults to the notebook-level
        ``protein_fa`` handle.

    Returns
    -------
    list
        ``[peptide_start, peptide_end, is_valid, invalid_reason]``.
        Coordinates are 1-based and inclusive. When the location cannot be
        established the result is ``[None, None, False, <reason>]``.
    """
    if fasta is None:
        fasta = protein_fa

    try:
        protein_seq = fasta.fetch(protein_id)
    except KeyError:
        # NOTE(review): pysam appears to raise KeyError for a sequence name
        # missing from the FASTA — confirm. Reported per row so that one
        # unknown accession does not abort the whole table.
        return [None, None, False, 'Protein not found in FASTA']

    # Make sure the peptide is part of the protein sequence
    peptide_seq = peptide.upper()
    if peptide_seq not in protein_seq:
        return [None, None, False, 'Peptide not found in protein']

    # No site to anchor on (previously an IndexError on sites[0])
    if not sites:
        return [None, None, False, 'No modification site given']

    first_res = sites[0][0].lower()
    first_res_loc = int(sites[0][1:])
    first_res_ix = peptide.find(first_res)
    if first_res_ix < 0:
        # Without this guard the -1 from find() was used as an offset, so a
        # peptide starting right after the site position passed validation.
        return [None, None, False, 'Modified residue not found in peptide']

    peptide_start = first_res_loc - first_res_ix
    peptide_end = peptide_start + len(peptide) - 1
    # A start below 1 would turn into a negative (wrap-around) slice index
    if peptide_start < 1 or protein_seq[peptide_start - 1:peptide_end] != peptide_seq:
        return [None, None, False, 'Invalid location']
    return [peptide_start, peptide_end, True, None]

## Phospho

In [5]:
# Load the phosphoproteome GCT and list the row metadata columns it provides
raw_phospho_gct = parse(str(phospho_pth))
raw_phospho_gct.row_metadata_df.columns

Index(['id.description', 'geneSymbol', 'numColumnsVMsiteObserved', 'bestScore',
       'bestDeltaForwardReverseScore', 'Best_scoreVML',
       'Best_numActualVMSites_sty', 'Best_numLocalizedVMsites_sty',
       'Best_numAmbiguousVMsites_sty', 'StartAA', 'VMsiteFlanks',
       'variableSites', 'sequence', 'sequenceVML',
       'accessionNumber_VMsites_numVMsitesPresent_numVMsitesLocalizedBest_earliestVMsiteAA_latestVMsiteAA',
       'protein_mw', 'species', 'speciesMulti', 'orfCategory',
       'accession_number', 'accession_numbers', 'protein_group_num',
       'entry_name', 'GeneSymbol'],
      dtype='object', name='rhd')

In [6]:
# Build the peptide annotation table from the GCT row metadata.
# The whole table is produced in one chain so that the parsed column is set
# on a new frame via .assign() rather than written onto a .loc slice of
# row_metadata_df (which triggers pandas' SettingWithCopyWarning).
peptide_df = (
    raw_phospho_gct.row_metadata_df
    .loc[
        # Only keep RefSeq accessions
        lambda df: ~df['accession_number'].str.startswith('smORF'),
        ['variableSites', 'sequence', 'accession_number', 'protein_group_num', 'GeneSymbol'],
    ]
    # Split 'variableSites' into a list of site tokens such as 'S18s'
    # (uppercase residue, position, lowercase residue)
    .assign(variableSites=lambda df: df['variableSites'].str.findall(r'[A-Z]\d+[a-z]'))
)
peptide_df.head(10)

rhd,variableSites,sequence,accession_number,protein_group_num,GeneSymbol
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NP_001333374.1_S18s _1_1_18_18,[S18s],ELLLPNWQGsGSHGLTIAQR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_S41s _1_1_41_41,[S41s],DDGVFVQEVTQNsPAAR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_S93s _1_1_93_93,[S93s],KGDRsPEPGQTWTR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_T99t _1_1_99_99,[T99t],SPEPGQtWTR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_S110s _1_0_109_110,[S110s],EVFSSCSsEVVLSGDDEEYQR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_S110s _1_1_110_110,[S110s],EVFSSCSsEVVLSGDDEEYQR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_S115s _1_1_115_115,[S115s],EVFSSCSSEVVLsGDDEEYQR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_S135s _1_1_135_135,[S135s],sEDGVEGDLGETQSR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_T146t _1_1_146_146,[T146t],SEDGVEGDLGEtQSR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_T158t _1_1_158_158,[T158t],RVtAYTVDVTGR,NP_001333374.1,1.1,AHNAK


In [7]:
# Check what residues are modified.
# .str[0] takes the leading residue letter of every site token. Unlike
# apply(lambda x: x[0]) it does not raise TypeError on the NaN that explode()
# emits for a row whose site list is empty; value_counts() then drops the NaN.
peptide_df['variableSites'].explode().str[0].value_counts()

S    43340
T     7022
M     1078
N      626
Y      449
Q      156
C       69
Name: variableSites, dtype: int64

In [8]:
# Phosphosites are the S/T/Y entries of each site list; the trailing
# lowercase residue letter is dropped (e.g. 'S18s' -> 'S18').
peptide_df['phosphosites'] = peptide_df['variableSites'].map(
    lambda site_tokens: [token[:-1] for token in site_tokens if token[0] in 'STY']
)

In [9]:
# Locate every peptide on its protein: one result row per peptide, kept on
# the same index as peptide_df. Start/end use the nullable integer dtype.
peptide_loc_range_df = pd.DataFrame(
    [
        calc_peptide_start_end(seq, accession, sites)
        for seq, accession, sites in zip(
            peptide_df['sequence'],
            peptide_df['accession_number'],
            peptide_df['phosphosites'],
        )
    ],
    index=peptide_df.index,
    columns=['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
).astype({'peptide_start': 'Int64', 'peptide_end': 'Int64'})

In [10]:
# Put the peptide annotations and their computed locations side by side
peptide_with_loc_df = pd.concat([peptide_df, peptide_loc_range_df], axis='columns')
# Show every peptide whose location failed validation, with the reason
peptide_with_loc_df.loc[
    lambda df: ~df['peptide_loc_valid'],
    ['accession_number', 'variableSites', 'sequence', 'peptide_loc_invalid_reason'],
]

Unnamed: 0_level_0,accession_number,variableSites,sequence,peptide_loc_invalid_reason
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [11]:
# Export table: row id, ';'-joined phosphosites and the peptide location columns
out_df = (
    peptide_with_loc_df[
        ['phosphosites', 'peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason']
    ]
    .reset_index()
    .assign(phosphosites=lambda df: df['phosphosites'].str.join(';'))
)
# Written next to the input GCT, with '.gct' replaced by '.peptide_location.tsv'
out_df.to_csv(phospho_pth.with_suffix('.peptide_location.tsv'), sep='\t', index=False)

In [12]:
# Preview the first rows of the exported phospho peptide location table
out_df.head()

Unnamed: 0,rid,phosphosites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
0,NP_001333374.1_S18s _1_1_18_18,S18,9,28,True,
1,NP_001333374.1_S41s _1_1_41_41,S41,29,45,True,
2,NP_001333374.1_S93s _1_1_93_93,S93,89,102,True,
3,NP_001333374.1_T99t _1_1_99_99,T99,93,102,True,
4,NP_001333374.1_S110s _1_0_109_110,S110,103,123,True,


## Ubiquitin

In [13]:
# Load the ubiquitylome GCT and list the row metadata columns it provides
raw_ubiquitin_gct = parse(str(ubiquitin_pth))
raw_ubiquitin_gct.row_metadata_df.columns

Index(['id.description', 'geneSymbol', 'numColumnsVMsiteObserved', 'bestScore',
       'bestDeltaForwardReverseScore', 'Best_scoreVML', 'StartAA',
       'VMsiteFlanks', 'Best_numActualVMSites_k', 'Best_numLocalizedVMsites_k',
       'Best_numAmbiguousVMsites_k', 'variableSites', 'sequence',
       'sequenceVML',
       'accessionNumber_VMsites_numVMsitesPresent_numVMsitesLocalizedBest_earliestVMsiteAA_latestVMsiteAA',
       'protein_mw', 'species', 'speciesMulti', 'orfCategory',
       'accession_number', 'accession_numbers', 'protein_group_num',
       'entry_name', 'GeneSymbol'],
      dtype='object', name='rhd')

In [14]:
# Build the peptide annotation table from the GCT row metadata.
# The whole table is produced in one chain so that the parsed column is set
# on a new frame via .assign() rather than written onto a .loc slice of
# row_metadata_df (which triggers pandas' SettingWithCopyWarning).
peptide_df = (
    raw_ubiquitin_gct.row_metadata_df
    .loc[
        # Only keep RefSeq accessions
        lambda df: ~df['accession_number'].str.startswith('smORF'),
        ['variableSites', 'sequence', 'accession_number', 'protein_group_num', 'GeneSymbol'],
    ]
    # Split 'variableSites' into a list of site tokens such as 'K134k'
    # (uppercase residue, position, lowercase residue)
    .assign(variableSites=lambda df: df['variableSites'].str.findall(r'[A-Z]\d+[a-z]'))
)
peptide_df.head(10)

rhd,variableSites,sequence,accession_number,protein_group_num,GeneSymbol
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NP_001333374.1_K134k _1_1_134_134,[K134k],LkSEDGVEGDLGETQSR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_K181k _1_1_181_181,[K181k],DIDISSPEFkIK,NP_001333374.1,1.1,AHNAK
NP_001333374.1_K203k _1_1_203_203,[K203k],HELTEISNVDVETQSGkTVIR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_K243k _1_1_243_243,[K243k],AGAISASGPELQGAGHSkLQVTMPGIK,NP_001333374.1,1.1,AHNAK
NP_001333374.1_K252k _1_1_252_252,[K252k],LQVTMPGIkVGGSGVNVNAK,NP_001333374.1,1.1,AHNAK
NP_001333374.1_K263k _1_1_263_263,[K263k],VGGSGVNVNAkGLDLGGR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_K326k _1_1_326_326,[K326k],EGQTPkAGLR,NP_001333374.1,1.1,AHNAK
NP_001333374.1_K387k _1_1_387_387,[K387k],GPQITGPSLEGDLGLkGAK,NP_001333374.1,1.1,AHNAK
NP_001333374.1_K519k _1_1_519_519,[K519k],GDIkVSAPGVQGDVK,NP_001333374.1,1.1,AHNAK
NP_001333374.1_K530k _1_1_530_530,[K530k],VSAPGVQGDVkGPQVALK,NP_001333374.1,1.1,AHNAK


In [15]:
# Check what residues are modified.
# .str[0] takes the leading residue letter of every site token. Unlike
# apply(lambda x: x[0]) it does not raise TypeError on the NaN that explode()
# emits for a row whose site list is empty; value_counts() then drops the NaN.
peptide_df['variableSites'].explode().str[0].value_counts()

K    14606
N       48
M       44
Q       28
Name: variableSites, dtype: int64

In [16]:
# Ubiquitin sites are the K entries of each site list; the trailing
# lowercase residue letter is dropped (e.g. 'K134k' -> 'K134').
peptide_df['ubiquitinsites'] = peptide_df['variableSites'].map(
    lambda site_tokens: [token[:-1] for token in site_tokens if token[0] in 'K']
)

In [17]:
# Locate every peptide on its protein: one result row per peptide, kept on
# the same index as peptide_df. Start/end use the nullable integer dtype.
peptide_loc_range_df = pd.DataFrame(
    [
        calc_peptide_start_end(seq, accession, sites)
        for seq, accession, sites in zip(
            peptide_df['sequence'],
            peptide_df['accession_number'],
            peptide_df['ubiquitinsites'],
        )
    ],
    index=peptide_df.index,
    columns=['peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason'],
).astype({'peptide_start': 'Int64', 'peptide_end': 'Int64'})

In [18]:
# Put the peptide annotations and their computed locations side by side
peptide_with_loc_df = pd.concat([peptide_df, peptide_loc_range_df], axis='columns')
# Show every peptide whose location failed validation, with the reason
peptide_with_loc_df.loc[
    lambda df: ~df['peptide_loc_valid'],
    ['accession_number', 'variableSites', 'sequence', 'peptide_loc_invalid_reason'],
]

Unnamed: 0_level_0,accession_number,variableSites,sequence,peptide_loc_invalid_reason
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [19]:
# Export table: row id, ';'-joined ubiquitin sites and the peptide location columns
out_df = (
    peptide_with_loc_df[
        ['ubiquitinsites', 'peptide_start', 'peptide_end', 'peptide_loc_valid', 'peptide_loc_invalid_reason']
    ]
    .reset_index()
    .assign(ubiquitinsites=lambda df: df['ubiquitinsites'].str.join(';'))
)
# Written next to the input GCT, with '.gct' replaced by '.peptide_location.tsv'
out_df.to_csv(ubiquitin_pth.with_suffix('.peptide_location.tsv'), sep='\t', index=False)

In [20]:
# Preview the first rows of the exported ubiquitin peptide location table
out_df.head()

Unnamed: 0,rid,ubiquitinsites,peptide_start,peptide_end,peptide_loc_valid,peptide_loc_invalid_reason
0,NP_001333374.1_K134k _1_1_134_134,K134,133,149,True,
1,NP_001333374.1_K181k _1_1_181_181,K181,172,183,True,
2,NP_001333374.1_K203k _1_1_203_203,K203,187,207,True,
3,NP_001333374.1_K243k _1_1_243_243,K243,226,252,True,
4,NP_001333374.1_K252k _1_1_252_252,K252,244,263,True,
