In [1]:
import sys
import os
import session_info

# Add the '0_functions' folder to sys.path
sys.path.append(os.path.join(os.getcwd(), '..', '0_functions'))

In [2]:
import pandas as pd
import dataframe_image as dfi
from functions import preprocess
from functions import clean_agg
from functions import read_fastafile
from functions import pep_Cterm

ModuleNotFoundError: No module named 'dataframe_image'

In [None]:
# Display session information
session_info.show()

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', None)
pd.set_option('display.max_rows', None)

In [4]:
datafolder = 'data'
figures = 'data/figures'

In [5]:
# Load data
# Know prenylated proteins uniprot download, based on SPARQL query 1, up_query_for_known_G_F_23.02.23.txt
uniprot = pd.read_csv(os.path.join(datafolder, 'UniProt_SPARQL_queries', 'up_output_known_G_F_23.02.23.csv'), sep=';', encoding='UTF-8')

# 10 additional know prenylated proteins uniprot download, based on SPARQL query 1, but this time directly giving the protein name as input
# up_query_for_known_G_F_extension_09.03.23.txt
add_uniprot = pd.read_csv(os.path.join(datafolder, 'UniProt_SPARQL_queries', 'up_output_known_G_F_extension28.12.24.csv'), sep=';')

publications = pd.read_csv(os.path.join(datafolder, 'publications', 'citavi_publications.csv'), sep=';')

# Collection of publicly available information and data pre-processing

In [6]:
# Preprocess raw UniProt output
# e.g. rename columns, clean strings from unnecessary additions, transform all locations containing "membrane" to just membrane,
# remove double entries, change type to integer where needed etc.
uniprot = preprocess(uniprot)

NameError: name 'preprocess' is not defined

In [7]:
len(uniprot)

803

In [8]:
## Additional proteins list known to be prenylated from literature
# CAAX farnesylation of 'ULK3', 'DCAF8', 'CEP85', 'LRRF1', 'NP1L4', 'RHBT3', 'DPCD', 'GNAI1', 'GNAI2'
# Geranylgeranylation of 'YKT6'

# Preprocess raw uniprot output
add_uniprot = preprocess(add_uniprot)

# Fill cells with information from the literature in the same form as the other dataframe
Storck2019 = ['ULK3', 'DCAF8', 'CEP85', 'LRRF1', 'NP1L4', 'RHBT3', 'DPCD']
Palsuledesai2014 = ['GNAI1', 'GNAI2']
Shirakawa2020 = ['YKT6']

for i, name in enumerate(add_uniprot['name']):
    if name in Shirakawa2020:
        add_uniprot.at[i, 'moiety'] = 'S-geranylgeranyl cysteine'
        add_uniprot.at[i, 'AminoAcid'] = 'C'    
        add_uniprot.at[i, 'position'] = -4
        add_uniprot.at[i, 'evidenceCode'] = 'extended_search'
        add_uniprot.at[i, 'publication'] = 'doi.org/10.15252/embj.2019104120'
    else:
        add_uniprot.at[i, 'moiety'] = 'S-farnesyl cysteine'
        add_uniprot.at[i, 'AminoAcid'] = 'C'    
        add_uniprot.at[i, 'position'] = -4
        add_uniprot.at[i, 'evidenceCode'] = 'extended_search'
        if name in Storck2019:
            add_uniprot.at[i, 'publication'] = 'doi:10.1038/s41557-019-0237-6'
        if name in Palsuledesai2014:
            add_uniprot.at[i, 'publication'] = 'doi:10.1039/c3mb70593e'

In [9]:
uniprot_ext = pd.concat([uniprot, add_uniprot], ignore_index=True)

In [10]:
# Publications of known prenylated proteins with Title, Author etc.

# Generate df per publication with additional info about title, autor etc. from the doi number in 'uniprot_ext'
# and annotations from the citation program citavi regarding the dois

uniprotEvidence = uniprot_ext[['ID', 'name', 'substrate', 'location', 'moiety', 'evidenceCode', 'publication']]
uniprotEvidence = uniprotEvidence[uniprotEvidence['publication'] != 'NaN']

# aggregate df by publicaton and clean (remove brackets etc.)
uniprotEvidence = uniprotEvidence.groupby('publication').agg(set).reset_index()
uniprotEvidence = clean_agg(uniprotEvidence)

# Results in 57 publications about the known prenylated proteins in UniProt
publications['publication'] = publications['publication'].str.lower()
up_evidence_ext = pd.merge(uniprotEvidence, publications, on='publication', how='outer')

In [11]:
#save
up_evidence_ext.to_csv(os.path.join(datafolder, 'publications', 'up_publications_ext.csv'), sep=';', index=False)

In [12]:
# non aggregated df of all known prenylated proteins
no_agg_known = uniprot_ext

In [13]:
# Aggregate uniprot table by protein ID, using set to avoid double entries
uniprot_ext = uniprot_ext.groupby('ID').agg(set).reset_index()

# Clean aggregated df (remove brackets etc.)
uniprot_ext = clean_agg(uniprot_ext)

# Characterization of known prenylated proteins

In [14]:
# Load pre-processed data

# Known prenylated proteins
known = uniprot_ext

# Evidence Codes with a corresponding publication
up_evidence_ext = up_evidence_ext.copy()

## What proteins have been previously known to be farnesylated or geranylgeranylated?

In [15]:
# Quick check for non reviewed protein entries
known[known['ProteinEntryReviewed'] == 'false'].reset_index(drop=True)

Unnamed: 0,ID,name,fullName,substrate,location,moiety,AminoAcid,position,evidenceCode,publication,ProteinEntryReviewed
0,A6NIT2,A6NIT2,Phosphorylase b kinase regulatory subunit,phosphorylase b kinase regulatory chain,Membrane,S-farnesyl cysteine,C,-4,ECO_0000256,,False
1,A6NMN0,A6NMN0,Phosphorylase b kinase regulatory subunit,phosphorylase b kinase regulatory chain,Membrane,S-farnesyl cysteine,C,-4,ECO_0000256,,False


## Which and how many proteins are known to go through both farnelysation and geranylgeranylisation?

In [16]:
# Divide known prenylated proteins by moiety using condition, aggregation and cleaning
known_F = known.loc[known['moiety'].isin(["S-farnesyl cysteine"])].reset_index(drop=True)
known_GG = known.loc[known['moiety'].isin(["S-geranylgeranyl cysteine"])].reset_index(drop=True)
known_F_GG = known.loc[~known['moiety'].isin(["S-farnesyl cysteine", "S-geranylgeranyl cysteine"])].reset_index(drop=True)

print('Number of known fanesylated proteins in UniProt:', len(known_F))
print('Number of known geranylgeranylated proteins in UniProt:', len(known_GG))
print('Number of known fanesylated & geranylgeranylated proteins in UniProt:', len(known_F_GG))

Number of known fanesylated proteins in UniProt: 70
Number of known geranylgeranylated proteins in UniProt: 111
Number of known fanesylated & geranylgeranylated proteins in UniProt: 2


In [17]:
# shorten moiety description
known_F_GG['moiety'] = '[S-farnesyl cysteine, S-geranylgeranyl cysteine]'

for index, row in known.iterrows():
    # Check if 'name' is either 'YKT6' or 'RHOB'
    if row['name'] == 'YKT6' or row['name'] == 'RHOB':
        # Update the 'moiety' column with the desired string
        known.at[index, 'moiety'] = '[S-farnesyl cysteine, S-geranylgeranyl cysteine]'

## What are the possible modification positions and the amino acid?

In [18]:
# The modification always occurs on the amino acid Cysteine (C)
known['AminoAcid'].unique()

array(['C'], dtype=object)

In [19]:
# The possible modification positions (C positions, as annotated in UniProt, for all known prenylated proteins)
known['position'].value_counts()

position
-4          130
[-3, -1]     20
[-2, -1]     16
[-4, -3]      6
[-5, -4]      6
[-3, -2]      3
-1            1
-3            1
Name: count, dtype: int64

In [20]:
# according to the publication EFC4B (also CRACR2A) isoform a (CRACR2A-a) has a predicted prenylation site in its C-terminus
# which is KKKSCCG! So the position should have been annotated in UniProt as [-3, -2]
# [-3, -2] (CCX) is a known motif for geranylgeranylation, opposed to CXX...(Marchwicka et al. 2022)

known[known['position'] == -3]

Unnamed: 0,ID,name,fullName,substrate,location,moiety,AminoAcid,position,evidenceCode,publication,ProteinEntryReviewed
159,Q9BSW2,EFC4B,EF-hand calcium-binding domain-containing protein 4B,EFCAB4,Membrane,S-geranylgeranyl cysteine,C,-3,ECO_0000269,doi:10.1126/scisignal.aac9171,True


In [21]:
# correct mistake in ID Q9BSW2
known.at[159, 'position'] = [-3, -2]

In [22]:
# look up index number for Q9BSW2 in GG df
known_GG[known_GG['ID'] == 'Q9BSW2']

Unnamed: 0,ID,name,fullName,substrate,location,moiety,AminoAcid,position,evidenceCode,publication,ProteinEntryReviewed
93,Q9BSW2,EFC4B,EF-hand calcium-binding domain-containing protein 4B,EFCAB4,Membrane,S-geranylgeranyl cysteine,C,-3,ECO_0000269,doi:10.1126/scisignal.aac9171,True


In [23]:
# correction
known_GG.at[93, 'position'] = [-3, -2]

In [24]:
# The possible modification positions (C positions, as annotated in UniProt, for all known prenylated proteins)
known['position'].value_counts()

position
-4          130
[-3, -1]     20
[-2, -1]     16
[-4, -3]      6
[-5, -4]      6
[-3, -2]      4
-1            1
Name: count, dtype: int64

In [25]:
# The postion annotation is not trustworthy enough. We are going to use the same method for motif detection as for the 
# experimentally identified proteins!

In [26]:
# Get fasta sequences from UniProt for motif detection

#' '.join([s for s in uniprot_ext['ID']])

In [27]:
fasta = read_fastafile(os.path.join(datafolder, 'fasta_files', 'idmapping_2024_09_29_canon.fasta'))

In [28]:
known[~known['ID'].isin(fasta['ID'])]

Unnamed: 0,ID,name,fullName,substrate,location,moiety,AminoAcid,position,evidenceCode,publication,ProteinEntryReviewed
127,Q8IXS6,PALM2,Paralemmin-2,paralemmin,Membrane,S-farnesyl cysteine,C,-4,ECO_0000255,,True


In [29]:
fasta[~fasta['ID'].isin(known['ID'])]

Unnamed: 0,ID,seqID,seq,len
127,Q9Y2D5,sp|Q9Y2D5|PLAK2_HUMAN,MAEAELHKERLQAIAEKRKRQTEIEGKRQQLDEQILLLQHSKSKVLREKWLLQGIPAGTAEEEEARRRQSEEDEFRVKQLEDNIQRLEQEIQTLESEESQISAKEQIILEKLKETEKSFKDFQKGFSSTDGDAVNYISSQLPDLPILCSRTAEPSPGQDGTSRAAGVGWENVLLKEGESASNATETSGPDMTIKKPPQLSEDDIWLKSEGDNYSATLLEPAASSLSPDHKNMEIEVSVAECKSVPGITSTPHPMDHPSAFYSPPHNGLLTDHHESLDNDVAREIRYLDEVLEANCCDSAVDGTYNGTSSPEPGAVVLVGGLSPPVHEATQPEPTERTASRQAPPHIELSNSSPDPMAEAERTNGHSPSQPRDALGDSLQVPVSPSSTTSSRCSSRDGEFTLTTLKKEAKFELRAFHEDKKPSKLFEDDEHEKEQYCIRKVRPSEEMLELEKERRELIRSQAVKKNPGIAAKWWNPPQEKTIEEQLDEEHLESHKKYKERKERRAQQEQLLLQKQLQQQQQQPPSQLCTAPASSHERASMIDKAKEDIVTEQIDFSAARKQFQLMENSRQAVAKGQSTPRLFSIKPFYRPLGSVNSDKPLTNPRPPSVGGPPEDSGASAAKGQKSPGALETPSAAGSQGNTASQGKEGPYSEPSKRGPLSKLWAEDGEFTSARAVLTVVKDDDHGILDQFSRSVNVSLTQEELDSGLDELSVRSQDTTVLETLSNDFSMDNISDSGASNETTNALQENSLADFSLPQTPQTDNPSEGRGEGVSKSFSDHGFYSPSSTLGDSPLVDDPLEYQAGLLVQNAIQQAIAEQVDKAVSKTSRDGAEQQGPEATVEEAEAAAFGSEKPQSMFEPPQVSSPVQEKRDVLPKILPAEDRALRERGPPQPLPAVQPSGPINMEETRPEGSYFSKYSEAAELRSTASLLATQESDVMVGPFKLRSRKQRTLSMIEEEIRAAQEREEELKRQRQVLQSTQSPRTKNAPSLPSRTCYKTAPGKIEKVKPPPSPTTEGPSLQPDLAPEEAAGTQRPKNLMQTLMEDYETHKSKRRERMDDSSYTSKLLSCKVTSEVLEATRVNRRKSALALRWEAGIYANQEEEDNE,1103


In [30]:
known = known.replace('Q8IXS6','Q9Y2D5')
known_F = known_F.replace('Q8IXS6','Q9Y2D5')

In [31]:
# add pep column for C-terminal peptides
fasta = pep_Cterm(fasta)

In [32]:
no_C = fasta[~fasta['pep'].str.contains('C')]

print('Proteins without cysteine in canonical sequence:', len(no_C))
no_C.reset_index(drop=True)

Proteins without cysteine in canonical sequence: 2


Unnamed: 0,ID,seqID,seq,len,pep
0,Q9Y2D5,sp|Q9Y2D5|PLAK2_HUMAN,MAEAELHKERLQAIAEKRKRQTEIEGKRQQLDEQILLLQHSKSKVLREKWLLQGIPAGTAEEEEARRRQSEEDEFRVKQLEDNIQRLEQEIQTLESEESQISAKEQIILEKLKETEKSFKDFQKGFSSTDGDAVNYISSQLPDLPILCSRTAEPSPGQDGTSRAAGVGWENVLLKEGESASNATETSGPDMTIKKPPQLSEDDIWLKSEGDNYSATLLEPAASSLSPDHKNMEIEVSVAECKSVPGITSTPHPMDHPSAFYSPPHNGLLTDHHESLDNDVAREIRYLDEVLEANCCDSAVDGTYNGTSSPEPGAVVLVGGLSPPVHEATQPEPTERTASRQAPPHIELSNSSPDPMAEAERTNGHSPSQPRDALGDSLQVPVSPSSTTSSRCSSRDGEFTLTTLKKEAKFELRAFHEDKKPSKLFEDDEHEKEQYCIRKVRPSEEMLELEKERRELIRSQAVKKNPGIAAKWWNPPQEKTIEEQLDEEHLESHKKYKERKERRAQQEQLLLQKQLQQQQQQPPSQLCTAPASSHERASMIDKAKEDIVTEQIDFSAARKQFQLMENSRQAVAKGQSTPRLFSIKPFYRPLGSVNSDKPLTNPRPPSVGGPPEDSGASAAKGQKSPGALETPSAAGSQGNTASQGKEGPYSEPSKRGPLSKLWAEDGEFTSARAVLTVVKDDDHGILDQFSRSVNVSLTQEELDSGLDELSVRSQDTTVLETLSNDFSMDNISDSGASNETTNALQENSLADFSLPQTPQTDNPSEGRGEGVSKSFSDHGFYSPSSTLGDSPLVDDPLEYQAGLLVQNAIQQAIAEQVDKAVSKTSRDGAEQQGPEATVEEAEAAAFGSEKPQSMFEPPQVSSPVQEKRDVLPKILPAEDRALRERGPPQPLPAVQPSGPINMEETRPEGSYFSKYSEAAELRSTASLLATQESDVMVGPFKLRSRKQRTLSMIEEEIRAAQEREEELKRQRQVLQSTQSPRTKNAPSLPSRTCYKTAPGKIEKVKPPPSPTTEGPSLQPDLAPEEAAGTQRPKNLMQTLMEDYETHKSKRRERMDDSSYTSKLLSCKVTSEVLEATRVNRRKSALALRWEAGIYANQEEEDNE,1103,YANQEEEDNE
1,Q99733,sp|Q99733|NP1L4_HUMAN,MADHSFSDGVPSDSVEAAKNASNTEKLTDQVMQNPRVLAALQERLDNVPHTPSSYIETLPKAVKRRINALKQLQVRCAHIEAKFYEEVHDLERKYAALYQPLFDKRREFITGDVEPTDAESEWHSENEEEEKLAGDMKSKVVVTEKAAATAEEPDPKGIPEFWFTIFRNVDMLSELVQEYDEPILKHLQDIKVKFSDPGQPMSFVLEFHFEPNDYFTNSVLTKTYKMKSEPDKADPFSFEGPEIVDCDGCTIDWKKGKNVTVKTIKKKQKHKGRGTVRTITKQVPNESFFNFFNPLKASGDGESLDEDSEFTLASDFEIGHFFRERIVPRAVLYFTGEAIEDDDNFEEGEEGEEEELEGDEEGEDEDDAEINPKV,375,EDDAEINPKV


In [33]:
# Look for C-terminal cysteine in the isoforms of these proteins

#' '.join([s for s in no_C['ID']])

In [34]:
iso = read_fastafile(os.path.join(datafolder, 'fasta_files', 'idmapping_2024_09_29_iso.fasta'))

In [35]:
# add pep column for C-terminal peptides
iso = pep_Cterm(iso)

In [36]:
# checks for C occurence, so we get the proteins that do contain a C in one or more isoforms
with_C = iso[iso['pep'].str.contains('C')]

print('Found isoforms that do contain a cysteine:', len(with_C))
with_C

Found isoforms that do contain a cysteine: 3


Unnamed: 0,ID,seqID,seq,len,pep
1,Q99733-2,sp|Q99733-2|NP1L4_HUMAN,MADHSFSDGVPSDSVEAAKNASNTEKLTDQVMQNPRVLAALQERLDNVPHTPSSYIETLPKAVKRRINALKQLQVRCAHIEAKFYEEVHDLERKYAALYQPLFDKRREFITGDVEPTDAESEWHSENEEEEKLAGDMKSKVVVTEKAAATAEEPDPKGIPEFWFTIFRNVDMLSELVQEYDEPILKHLQDIKVKFSDPGQPMSFVLEFHFEPNDYFTNSVLTKTYKMKSEPDKADPFSFEGPEIVDCDGCTIDWKKGKNVTVKTIKKKQKHKGRGTVRTITKQVPNESFFNFFNPLKASGDGESLDEDSEFTLASDFEIGHFFRERIVPRAVLYFTGEAIEDDDNFEEGEEGEEEELEGDEEGEDEDDAEINPKKEPSQPAECKQQ,386,PSQPAECKQQ
7,Q9Y2D5-8,sp|Q9Y2D5-8|PLAK2_HUMAN,MAEAELHKERLQAIAEKRKRQTEIEGKRQQLDEQILLLQHSKSKVLREKWLLQGIPAGTAEEEEARRRQSEEDEFRVKQLEDNIQRLEQEIQTLESEESQISAKEQIILEKLKETEKSFKDFQKGFSSTDGDAVNYISSQLPDLPILCSRTAEPSPGQDGTSRAAAVYAMEINVEKDKQTGETKILSTSTIGPEGVHQKGVKVYDDGTKVVYEVRSGGTVVENGVHKLSTKDVEELIQKAGQSSLGGGHVSERTVIADGSLSHPKEHMLCKEAKLEMVHKSRKDHSSGNPGQQAQAPSAAGPEANLDQPVTMIFMGYQNIEDEEETKKVLGYDETIKAELVLIDEDDEKSLREKTVTDVSTIDGNAAELVSGRPVSDTTEPSSPEGKEESLATEPAPGTQKKKRCQCCVVM,411,KKRCQCCVVM
8,Q9Y2D5-9,sp|Q9Y2D5-9|PLAK2_HUMAN,MEMAEAELHKERLQAIAEKRKRQTEIEGKRQQLDEQILLLQHSKSKVLREKWLLQGIPAGTAEEEEARRRQSEEDEFRVKQLEDNIQRLEQEIQTLESEESQISAKEQIILEKLKETEKSFKDFQKGFSSTDGAVYAMEINVEKDKQTGETKILSTSTIGPEGVHQKGVKVYDDGTKVVYEVRSGGTVVENGVHKLSTKDVEELIQKAGQSSLGGGHVSERTVIADGSLSHPKEHMLCKEAKLEMVHKSRKDHSSGNPGQQAQAPSAAGPEANLDQPVTMIFMGYQNIEDEEETKKVLGYDETIKAELVLIDEDDEKSLREKTVTDVSTIDGNAAELVSGRPVSDTTEPSSPEGKEESLATEPAPGTQKKKRCQCCVVM,379,KKRCQCCVVM


In [37]:
fasta[fasta['ID'] == 'Q99733']

Unnamed: 0,ID,seqID,seq,len,pep
157,Q99733,sp|Q99733|NP1L4_HUMAN,MADHSFSDGVPSDSVEAAKNASNTEKLTDQVMQNPRVLAALQERLDNVPHTPSSYIETLPKAVKRRINALKQLQVRCAHIEAKFYEEVHDLERKYAALYQPLFDKRREFITGDVEPTDAESEWHSENEEEEKLAGDMKSKVVVTEKAAATAEEPDPKGIPEFWFTIFRNVDMLSELVQEYDEPILKHLQDIKVKFSDPGQPMSFVLEFHFEPNDYFTNSVLTKTYKMKSEPDKADPFSFEGPEIVDCDGCTIDWKKGKNVTVKTIKKKQKHKGRGTVRTITKQVPNESFFNFFNPLKASGDGESLDEDSEFTLASDFEIGHFFRERIVPRAVLYFTGEAIEDDDNFEEGEEGEEEELEGDEEGEDEDDAEINPKV,375,EDDAEINPKV


In [38]:
fasta[fasta['ID'] == 'Q9Y2D5']

Unnamed: 0,ID,seqID,seq,len,pep
127,Q9Y2D5,sp|Q9Y2D5|PLAK2_HUMAN,MAEAELHKERLQAIAEKRKRQTEIEGKRQQLDEQILLLQHSKSKVLREKWLLQGIPAGTAEEEEARRRQSEEDEFRVKQLEDNIQRLEQEIQTLESEESQISAKEQIILEKLKETEKSFKDFQKGFSSTDGDAVNYISSQLPDLPILCSRTAEPSPGQDGTSRAAGVGWENVLLKEGESASNATETSGPDMTIKKPPQLSEDDIWLKSEGDNYSATLLEPAASSLSPDHKNMEIEVSVAECKSVPGITSTPHPMDHPSAFYSPPHNGLLTDHHESLDNDVAREIRYLDEVLEANCCDSAVDGTYNGTSSPEPGAVVLVGGLSPPVHEATQPEPTERTASRQAPPHIELSNSSPDPMAEAERTNGHSPSQPRDALGDSLQVPVSPSSTTSSRCSSRDGEFTLTTLKKEAKFELRAFHEDKKPSKLFEDDEHEKEQYCIRKVRPSEEMLELEKERRELIRSQAVKKNPGIAAKWWNPPQEKTIEEQLDEEHLESHKKYKERKERRAQQEQLLLQKQLQQQQQQPPSQLCTAPASSHERASMIDKAKEDIVTEQIDFSAARKQFQLMENSRQAVAKGQSTPRLFSIKPFYRPLGSVNSDKPLTNPRPPSVGGPPEDSGASAAKGQKSPGALETPSAAGSQGNTASQGKEGPYSEPSKRGPLSKLWAEDGEFTSARAVLTVVKDDDHGILDQFSRSVNVSLTQEELDSGLDELSVRSQDTTVLETLSNDFSMDNISDSGASNETTNALQENSLADFSLPQTPQTDNPSEGRGEGVSKSFSDHGFYSPSSTLGDSPLVDDPLEYQAGLLVQNAIQQAIAEQVDKAVSKTSRDGAEQQGPEATVEEAEAAAFGSEKPQSMFEPPQVSSPVQEKRDVLPKILPAEDRALRERGPPQPLPAVQPSGPINMEETRPEGSYFSKYSEAAELRSTASLLATQESDVMVGPFKLRSRKQRTLSMIEEEIRAAQEREEELKRQRQVLQSTQSPRTKNAPSLPSRTCYKTAPGKIEKVKPPPSPTTEGPSLQPDLAPEEAAGTQRPKNLMQTLMEDYETHKSKRRERMDDSSYTSKLLSCKVTSEVLEATRVNRRKSALALRWEAGIYANQEEEDNE,1103,YANQEEEDNE


In [39]:
# Replace the row in fasta at index 157 with values from iso at index 1, excluding the first column
fasta.loc[157, fasta.columns[1:]] = iso.loc[1, iso.columns[1:]]
fasta.loc[127, fasta.columns[1:]] = iso.loc[7, iso.columns[1:]]

In [40]:
# Pipeline for detecting canonical motifs

aliph_AA = ['A', 'G', 'V', 'I', 'L']

motifs = {
    "CAAX": lambda seq: seq[-4] == 'C' and seq[-3] in aliph_AA and seq[-2] in aliph_AA,
    "CXXX": lambda seq: seq[-4] == 'C',
    "CCC": lambda seq: seq.endswith('CCC'),
    "CXC": lambda seq: seq[-3] == 'C' and seq[-1] == 'C',
    "CCX": lambda seq: seq[-3] == 'C' and seq[-2] == 'C',
    "CC": lambda seq: seq[-2] == 'C' and seq[-1] == 'C',
    "C": lambda seq: seq[-1] == 'C'
}

In [41]:
print('All identified:', len(fasta), '\n')

# Iteratively filter for each motif
for motif_name, condition in motifs.items():
    motif_df = fasta[fasta['seq'].apply(condition)].reset_index(drop=True)

    # annotate motif in df
    motif_df['motif'] = f'{motif_name}'
    
    globals()[f'fasta_{motif_name}'] = motif_df # Create a new variable with the name based on the motif
    
    fasta = fasta[~fasta['ID'].isin(motif_df['ID'])]  # Remove identified sequences
    print(f'{motif_name}:  \t', len(motif_df), '\t(', len(fasta), ' remaining)', sep='')

fasta_canon = pd.concat([fasta_CAAX, fasta_CXXX, fasta_CXC, fasta_CCX, fasta_CC, fasta_C], ignore_index=True)

All identified: 183 

CAAX:  	53	(130 remaining)
CXXX:  	89	(41 remaining)
CCC:  	0	(41 remaining)
CXC:  	20	(21 remaining)
CCX:  	4	(17 remaining)
CC:  	16	(1 remaining)
C:  	1	(0 remaining)


In [42]:
# Update the motif, according to our motif detection pipeline
# 'position' comes from the UniProt annotation, might not correspond to motif

known = pd.merge(known, fasta_canon[['ID', 'motif']], on="ID") 

known_F = pd.merge(known_F, fasta_canon[['ID', 'motif']], on="ID") 
known_GG = pd.merge(known_GG, fasta_canon[['ID', 'motif']], on="ID") 
known_F_GG = pd.merge(known_F_GG, fasta_canon[['ID', 'motif']], on="ID") 

In [43]:
# Continue with division by F and GG, no more F_GG

known_F = pd.concat([known_F, known_F_GG]).reset_index(drop=True)
known_GG = pd.concat([known_GG, known_F_GG]).reset_index(drop=True)

In [44]:
# Preferred modification positions of prenylated proteins
known_motif_counts = known['motif'].value_counts()
normalized_frequencies = (known_motif_counts / len(known) * 100).round(1).astype(str) + '%'

motif_counts_table = pd.DataFrame({'count': known_motif_counts, 'frequency': normalized_frequencies}).reset_index().rename(columns={'index': 'motif'})

known_canon = known.groupby('motif')['name'].agg(set).reset_index()
known_canon['name'] = [list(elem) if len(elem) > 1 else list(elem)[0] for elem in known_canon['name']]

combined_df = pd.merge(motif_counts_table, known_canon, on='motif', how='inner')

dfi.export(combined_df, (os.path.join(datafolder, 'figures', 'known.png')))


display(combined_df)

C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe


Unnamed: 0,motif,count,frequency,name
0,CXXX,89,48.6%,"[RB40A, RHES, DNJA1, PI2R, RAB18, CENPE, RAB30, RAB5A, RAC2, INP5E, CENPF, RHOC, DPCD, KPB1, CLN3, LMNB2, TP4A1, RHEB, GBP2, GBG5, RALA, AL3B1, LRRF1, NP1L4, ULK3, PA24C, RHOB, RAB23, PALM, RAB8B, LMNA, GBG4, GBG5B, RB40B, RND2, TP4A3, RAB26, I5P2, BROX, RAP2A, RAB21, RAB17, PDE6B, CPLX3, DNJA4, RAB25, RALB, DNJA2, PDE6C, RHOH, RND3, A6NIT2, STK11, RASE, RP1BL, RHOJ, ZFN2B, RAB5B, KPB2, CN37, CEP85, RAB5C, PDE6A, GBG13, RPGR, RAP1B, RB11A, TP4A2, AL3B2, RB11B, RAB13, RB40C, RND1, DCAF8, RB40L, RSLAA, RAB37, A6NMN0, RAB38, CPLX4, GBP1, REBL1, NP1L1, DIRA1, RAC3, PP16B, RAB24, OAS1, UCHL1]"
1,CAAX,53,29.0%,"[GNAI1, RASM, GBG11, RASD1, GBG1, RRAS, RAB8A, GBG3, GBG12, DNJB2, I5P1, PALM2, GBG8, UBL3, RHOQ, PRIC1, CDC42, PP16A, GBG10, RHOF, GBGT2, PEX19, RSLAB, DIRA2, PALM3, YKT6, RAB28, PRIC2, RHOG, UBP32, RASK, CXX1, GNAI2, RHOD, RHBT3, RAP2C, GBG7, MIEN1, KPBB, GRK1, DIRA3, GBP5, RRAS2, RAP1A, GBG2, RASN, RAP2B, FBXL2, GRK7, RHOA, RASH, RAC1, LMNB1]"
2,CXC,20,10.9%,"[RAB7A, RAB4A, RAB15, RB33A, RAB19, RAB43, RB39A, RAB3C, RB39B, RB27B, RAB3A, RAB14, RB27A, RAB3B, RAB4B, RAB6B, RAB42, RAB3D, RAB6A, RB33B]"
3,CC,16,8.7%,"[RAB9A, RAB1C, RAB12, RAB2A, RAB1A, RAB9B, RAB7B, RB22A, RAB10, RAB31, RAB2B, RAB35, RAB32, RAB1B, RAB36, RAB7L]"
4,CCX,4,2.2%,"[EFC4B, RAB44, RAB34, RAB20]"
5,C,1,0.5%,RAB41


In [45]:
# Preferred modification positions of farnesylated known proteins
known_F_motif_counts = known_F['motif'].value_counts()
normalized_frequencies = (known_F_motif_counts / len(known) * 100).round(1).astype(str) + '%'

motif_counts_table = pd.DataFrame({'count': known_F_motif_counts, 'frequency': normalized_frequencies}).reset_index().rename(columns={'index': 'motif'})

known_canon = known_F.groupby('motif')['name'].agg(set).reset_index()
known_canon['name'] = [list(elem) if len(elem) > 1 else list(elem)[0] for elem in known_canon['name']]

combined_df = pd.merge(motif_counts_table, known_canon, on='motif', how='inner')

dfi.export(combined_df, (os.path.join(datafolder, 'figures', 'known_F.png')))

print('Known farnesylated motifs of the', len(known), 'known prenylated proteins:')
display(combined_df)

C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe
Known farnesylated motifs of the 183 known prenylated proteins:


Unnamed: 0,motif,count,frequency,name
0,CXXX,46,25.1%,"[TP4A2, ULK3, RHES, PA24C, DNJA2, DNJA1, RHOB, PI2R, PALM, RND3, DCAF8, LMNA, RSLAA, CENPE, A6NIT2, STK11, A6NMN0, NP1L4, CPLX4, GBP1, RASE, REBL1, NP1L1, RHOJ, TP4A3, INP5E, PP16B, I5P2, CENPF, BROX, DPCD, KPB1, CLN3, RAP2A, CN37, KPB2, CEP85, PDE6A, LMNB2, CPLX3, GBG13, TP4A1, RHEB, UCHL1, LRRF1, DNJA4]"
1,CAAX,26,14.2%,"[GNAI1, GBG11, RASD1, GBG1, I5P1, PALM2, RHOQ, PRIC1, PP16A, GBGT2, PEX19, PALM3, RAB28, UBP32, PRIC2, YKT6, RASK, CXX1, GNAI2, RHBT3, KPBB, GRK1, RRAS2, RASN, RASH, LMNB1]"


In [46]:
# Preferred modification positions of geranylgeranylated known proteins
known_GG_motif_counts = known_GG['motif'].value_counts()
normalized_frequencies = (known_GG_motif_counts / len(known) * 100).round(1).astype(str) + '%'

motif_counts_table = pd.DataFrame({'count': known_GG_motif_counts, 'frequency': normalized_frequencies}).reset_index().rename(columns={'index': 'motif'})

known_canon = known_GG.groupby('motif')['name'].agg(set).reset_index()
known_canon['name'] = [list(elem) if len(elem) > 1 else list(elem)[0] for elem in known_canon['name']]

combined_df = pd.merge(motif_counts_table, known_canon, on='motif', how='inner')

dfi.export(combined_df, (os.path.join(datafolder, 'figures', 'known_GG.png')))

print('Known geranylgeranylated motifs of the', len(known), 'known prenylated proteins:')
display(combined_df)

C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe
Known geranylgeranylated motifs of the 183 known prenylated proteins:


Unnamed: 0,motif,count,frequency,name
0,CXXX,44,24.0%,"[RB11A, AL3B2, RB40A, RAB25, RALB, RAB13, PDE6C, RB11B, RB40C, RHOB, RAB23, RND1, RAB8B, RHOH, RB40L, RAB18, RAB30, GBG4, GBG5B, RAB37, RAB5A, RAB38, RB40B, RP1BL, DIRA1, RND2, RAP1B, RAC3, RAC2, RAB26, ZFN2B, RHOC, RAB5B, RAB24, OAS1, RAB21, RAB5C, RAB17, PDE6B, GBP2, RPGR, GBG5, RALA, AL3B1]"
1,CAAX,28,15.3%,"[RASM, RRAS, RAB8A, GBG3, GBG12, DNJB2, GBG8, UBL3, CDC42, GBG10, RHOF, RSLAB, DIRA2, YKT6, RHOG, RHOD, RAP2C, GBG7, MIEN1, DIRA3, GBP5, RAP1A, GBG2, RAP2B, FBXL2, GRK7, RHOA, RAC1]"
2,CXC,20,10.9%,"[RAB7A, RAB4A, RAB15, RB33A, RAB19, RAB43, RB39A, RAB3C, RB39B, RB27B, RAB3A, RAB14, RB27A, RAB3B, RAB4B, RAB6B, RAB42, RAB3D, RAB6A, RB33B]"
3,CC,16,8.7%,"[RAB9A, RAB1C, RAB12, RAB2A, RAB1A, RAB9B, RAB7B, RB22A, RAB10, RAB31, RAB2B, RAB35, RAB32, RAB1B, RAB36, RAB7L]"
4,CCX,4,2.2%,"[EFC4B, RAB44, RAB34, RAB20]"
5,C,1,0.5%,RAB41


## How many known proteins have a publication regarding prenylation? 

In [47]:
# number of proteins with publications
publications = known[~known['publication'].isna()].reset_index(drop=True)

print('N proteins with at least one publication:', len(publications))

N proteins with at least one publication: 72


In [48]:
publications

Unnamed: 0,ID,name,fullName,substrate,location,moiety,AminoAcid,position,evidenceCode,publication,ProteinEntryReviewed,motif
0,O15498,YKT6,Synaptobrevin homolog YKT6,synaptobrevin,Membrane,"[S-farnesyl cysteine, S-geranylgeranyl cysteine]",C,-4,"[ECO_0000269, extended_search]","[doi.org/10.15252/embj.2019104120, doi:10.1073/pnas.0401183101]",True,CAAX
1,O60884,DNJA2,DnaJ homolog subfamily A member 2,DNJA2,Membrane,S-farnesyl cysteine,C,-4,ECO_0000269,doi:10.1073/pnas.0403413101,True,CXXX
2,O94955,RHBT3,Rho-related BTB domain-containing protein 3,RHBT3,Golgi apparatus,S-farnesyl cysteine,C,-4,extended_search,doi:10.1038/s41557-019-0237-6,True,CAAX
3,O95164,UBL3,Ubiquitin-like protein 3,UBL3,Membrane,S-geranylgeranyl cysteine,C,-4,ECO_0000305,doi:10.1074/jbc.m602283200,True,CAAX
4,P00973,OAS1,2'-5'-oligoadenylate synthase 1,2-5A synthase,"[Mitochondrion, Microsome, Cytoplasm, Nucleus, Secreted, Endoplasmic reticulum]",S-geranylgeranyl cysteine,C,-4,ECO_0000305,doi:10.1126/science.abj3624,True,CXXX
5,P01111,RASN,GTPase NRas,Ras,Membrane,S-farnesyl cysteine,C,-4,ECO_0000269,doi:10.1016/0092-8674(89)90054-8,True,CAAX
6,P01112,RASH,GTPase HRas,Ras,Membrane,S-farnesyl cysteine,C,-4,ECO_0000269,doi:10.1074/jbc.271.19.11541,True,CAAX
7,P01116,RASK,GTPase KRas,Ras,Membrane,S-farnesyl cysteine,C,-4,"[ECO_0000269, ECO_0000305, ECO_0007744]","[doi:10.1074/jbc.m113.527192, doi:10.1073/pnas.1615316113]",True,CAAX
8,P02545,LMNA,Prelamin-A/C,intermediate filament,"[Nucleus lamina, Nucleus matrix, Nucleus, Nucleoplasm, Nucleus speckle, Nucleus envelope]",S-farnesyl cysteine,C,-4,ECO_0000269,"[doi:10.1242/jcs.107.1.61, doi:10.1074/jbc.272.8.5298]",True,CXXX
9,P04899,GNAI2,Guanine nucleotide-binding protein G(i) subunit alpha-2,G-alpha,Membrane,S-farnesyl cysteine,C,-4,extended_search,doi:10.1039/c3mb70593e,True,CAAX


In [49]:
print('N unique publications:', len(up_evidence_ext))

N unique publications: 58


## What is the most common evidence code?

In [50]:
# use the non aggregated df of all known prenylated proteins

In [51]:
# split the uniprot df before dropping duplicates so the count is correct
evidence = no_agg_known[['ID', 'evidenceCode', 'publication']]
evidence = evidence.drop_duplicates() #gives all proteins that are unique in ID and evidenceCode and publication

evidence['evidenceCode'].value_counts()

#'ECO_0000250':  98   Sequence similarity evidence (manual assertion) 
#'ECO_0000269':  69   Experimental evidence (experimental + manual assertion) -> 48 publications
#'ECO_0000255':  11   Sequence Model evidence (manual assertion)
# extended_search 9   manually added to uniprot list based on individual literature research -> 2 publications
#'ECO_0000305':   7   Curator inference evidence (manual assertion)           ->  7 publications
#'ECO_0007744':   3   Combinatorial evidence (manual assertion)
#'ECO_0000256':   2   Sequence Model evidence (automatic assertion)

#Evidence types used for UniProtKB annotations
#from https://www.uniprot.org/help/evidences and https://www.uniprot.org/help/evidence_table
#evidence-types-used-for-uniprotkb-annotations, only 

evidenceCode
ECO_0000250        98
ECO_0000269        69
ECO_0000255        11
extended_search    10
ECO_0000305         7
ECO_0007744         3
ECO_0000256         2
Name: count, dtype: int64

In [52]:
# experimental evidence is the most reliable, be careful about Sequence Model evidence (automatic assertion)
# the two proteins with Sequence Model evidence (automatic assertion) are the same as the two not reviewed protein entries: A6NMN0, A6NIT2

up_evidence_ext['evidenceCode'].value_counts()

# Out of 57 unique publications about known prenylated proteins 48 are based on Experimental evidence (experimental + manual assertion) 
# and 7 are based on Curator inference evidence (manual assertion)

evidenceCode
ECO_0000269        48
ECO_0000305         7
extended_search     3
Name: count, dtype: int64

## Positions/motif of the proteins with experimental evidence

In [53]:
secure_pos = no_agg_known[['ID', 'name', 'moiety', 'evidenceCode', 'publication', 'position']]
secure_pos = secure_pos[secure_pos['evidenceCode'] == 'ECO_0000269']
secure_pos = secure_pos.groupby('ID').agg(set).reset_index()
secure_pos = clean_agg(secure_pos)

# Add motif from our motif detection pipeline, not deducted from UniProt position annotation
secure_pos = pd.merge(secure_pos, fasta_canon[['ID', 'motif']], on="ID") 

In [54]:
# look up index number for Q9BSW2 in non aggregated df
secure_pos[secure_pos['ID'] == 'Q9BSW2']

Unnamed: 0,ID,name,moiety,evidenceCode,publication,position,motif
54,Q9BSW2,EFC4B,S-geranylgeranyl cysteine,ECO_0000269,doi:10.1126/scisignal.aac9171,-3,CCX


In [55]:
# correct mistake in ID Q9BSW2
secure_pos.at[54, 'position'] = '[-3, -2]'

In [56]:
# look up index number for RHOB
secure_pos[secure_pos['ID'] == 'P62745']

Unnamed: 0,ID,name,moiety,evidenceCode,publication,position,motif
36,P62745,RHOB,"[S-farnesyl cysteine in plasma membrane form, S-geranylgeranyl cysteine in endosomal form]",ECO_0000269,"[doi:10.1074/jbc.270.14.7864, doi:10.1016/s0021-9258(19)88661-1]",-4,CXXX


In [57]:
# correct, shorten moiety description
secure_pos.at[36, 'moiety'] = ['S-farnesyl cysteine', 'S-geranylgeranyl cysteine']

In [58]:
secure_pos['motif'].value_counts()

motif
CXXX    31
CAAX    22
CC       3
CXC      1
CCX      1
Name: count, dtype: int64

In [59]:
secure_pos

# ATTENTION! THESE MOTIFS WERE PRODUCED BY OUR MOTIF DETECTION PIPELINE; NOT DEDUCTED BY THE UNIPROT POSITION ANNOTATION

Unnamed: 0,ID,name,moiety,evidenceCode,publication,position,motif
0,O15498,YKT6,S-farnesyl cysteine,ECO_0000269,doi:10.1073/pnas.0401183101,-4,CAAX
1,O60884,DNJA2,S-farnesyl cysteine,ECO_0000269,doi:10.1073/pnas.0403413101,-4,CXXX
2,P01111,RASN,S-farnesyl cysteine,ECO_0000269,doi:10.1016/0092-8674(89)90054-8,-4,CAAX
3,P01112,RASH,S-farnesyl cysteine,ECO_0000269,doi:10.1074/jbc.271.19.11541,-4,CAAX
4,P01116,RASK,S-farnesyl cysteine,ECO_0000269,doi:10.1073/pnas.1615316113,-4,CAAX
5,P02545,LMNA,S-farnesyl cysteine,ECO_0000269,"[doi:10.1242/jcs.107.1.61, doi:10.1074/jbc.272.8.5298]",-4,CXXX
6,P09936,UCHL1,S-farnesyl cysteine,ECO_0000269,doi:10.1073/pnas.0806474106,-4,CXXX
7,P10114,RAP2A,S-farnesyl cysteine,ECO_0000269,doi:10.1042/bj2890349,-4,CXXX
8,P11233,RALA,S-geranylgeranyl cysteine,ECO_0000269,"[doi:10.1016/s0021-9258(18)92889-9, doi:10.1128/mcb.00057-07]",-4,CXXX
9,P11234,RALB,S-geranylgeranyl cysteine,ECO_0000269,doi:10.1128/mcb.00057-07,-4,CXXX


# Known protein substrates

In [60]:
# FTase
FTase = known_F
# exclude YKT6
FTase = FTase.loc[(FTase['name'] != 'YKT6')].reset_index(drop=True)

# GGTaseI
# exclude any Rab from GGTaseI & motifs other than CAAX
GGTaseI = known_GG.loc[(known_GG['substrate'] != 'Rab') & (known_GG['motif'] == 'CXXX')].reset_index(drop=True)
# exclude FBXL2
GGTaseI = GGTaseI.loc[(GGTaseI['name'] != 'FBXL2')].reset_index(drop=True)

# GGTaseII
# Rab with single cysteine motif
single_G_Rab = known_GG.loc[(known_GG['substrate'] == 'Rab') & (known_GG['motif'].isin(['CXXX', 'C']))].reset_index(drop=True)
# double prenylated G
double_GG = known_GG.loc[~known_GG['motif'].isin(['CXXX', 'C'])].reset_index(drop=True)
GGTaseII = pd.concat([double_GG, single_G_Rab], ignore_index=True)
GGTaseII = GGTaseII.loc[(GGTaseII['substrate'] == 'Rab')]

# GGTaseIII
# only FBXL2 and YKT6
GGTaseIII = known_GG.loc[known_GG['name'] == 'FBXL2'].reset_index(drop=True)
GGTaseIII = pd.concat([GGTaseIII, (known_GG.loc[known_GG['name'] == 'YKT6'].reset_index(drop=True))], ignore_index=True)


print('FTase:', len(FTase), ', GGTaseI:', len(GGTaseI), ', GGTaseII:', len(GGTaseII), ', GGTaseIII:', len(GGTaseIII))

FTase: 71 , GGTaseI: 23 , GGTaseII: 62 , GGTaseIII: 2


In [61]:
# save
FTase.to_csv(os.path.join(datafolder, 'known_proteins', 'FTase.csv'), sep=',', index=False)
GGTaseI.to_csv(os.path.join(datafolder, 'known_proteins', 'GGTaseI.csv'), sep=',', index=False)
GGTaseII.to_csv(os.path.join(datafolder, 'known_proteins', 'GGTaseII.csv'), sep=',', index=False)
GGTaseIII.to_csv(os.path.join(datafolder, 'known_proteins', 'GGTaseIII.csv'), sep=',', index=False)

In [62]:
prenyltransferases = ['FTase', 'GGTaseI', 'GGTaseII', 'GGTaseIII']

# Create an empty DataFrame with the desired index
index_columns = ['number', 'motif', 'substrate', 'location', 'evidenceCode', 'evidenceCode with publication']
table_known = pd.DataFrame(index=prenyltransferases, columns=index_columns)

for transferase in prenyltransferases:
    if transferase == 'FTase':
        transf = pd.DataFrame(FTase)
    elif transferase == 'GGTaseI':
        transf = pd.DataFrame(GGTaseI)
    elif transferase == 'GGTaseII':
        transf = pd.DataFrame(GGTaseII)
    elif transferase == 'GGTaseIII':
        transf = pd.DataFrame(GGTaseIII)
        
    X_name = len(transf['name'])
    X_motif = transf['motif'].value_counts().reset_index().values
    X_motif = '\n'.join([f"{item[0]}, {item[1]}" for item in X_motif])
    transf['substrate'] = transf['substrate'].fillna(transf['name'])
    X_substr = transf['substrate'].value_counts().reset_index().values
    X_substr = '\n'.join([f"{item[0]}, {item[1]}" for item in X_substr])
    X_location = transf['location'].explode(ignore_index=True).value_counts().reset_index().values
    X_location = '\n'.join([f"{item[0]}, {item[1]}" for item in X_location])
    X_eC = transf['evidenceCode'].explode(ignore_index=True).value_counts().reset_index().values
    X_eC = '\n'.join([f"{item[0]}, {item[1]}" for item in X_eC])
    X_eC_pub = transf.explode('publication', ignore_index=True)
    X_eC_pub = X_eC_pub[X_eC_pub['publication'].notna()]
    X_eC_pub = X_eC_pub['evidenceCode'].explode(ignore_index=True).value_counts().reset_index().values
    X_eC_pub = '\n'.join([f"{item[0]}, {item[1]}" for item in X_eC_pub])
    
    d = pd.DataFrame({
        'number': [X_name], 'motif': [X_motif], 'substrate': [X_substr],
        'location': [X_location], 'evidenceCode': [X_eC], 'evidenceCode with publication': [X_eC_pub]
    }, index=[transferase])
    
    table_known.loc[transferase] = d.values

table_transposed = table_known.transpose()

# Format the DataFrame to display newline characters as line breaks
df_styled = table_transposed.style.set_properties(**{'white-space': 'pre-wrap'})

# Display the DataFrame
df_styled

Unnamed: 0,FTase,GGTaseI,GGTaseII,GGTaseIII
number,71,23,62,2
motif,"CXXX, 46 CAAX, 25","CXXX, 23","CXXX, 21 CXC, 20 CC, 16 CCX, 3 CAAX, 1 C, 1","CAAX, 2"
substrate,"Ras, 7 phosphorylase b kinase regulatory chain, 5 Rho, 4 G protein gamma, 4 paralemmin, 3 protein-tyrosine phosphatase, 3 intermediate filament, 3 RasD, 2 complexin/synaphin, 2 prickle / espinas / testin, 2 Rheb, 2 nucleosome assembly protein (NAP), 2 G-alpha, 2 cyclic nucleotide phosphodiesterase, 1 LRRFIP, 1 inositol 1,4,5-trisphosphate 5-phosphatase type IV, 1 DPCD, 1 PP16B, 1 PP16A, 1 CXX1, 1 DNJA4, 1 peptidase C19, 1 DNJA2, 1 RHBT3, 1 Ser/Thr protein kinase, 1 CEP85, 1 BROX, 1 WD repeat DCAF8, 1 AGC Ser/Thr protein kinase, 1 peptidase C12, 1 CAMK Ser/Thr protein kinase, 1 inositol 1,4,5-trisphosphate 5-phosphatase type I, 1 battenin, 1 Kinesin, 1 CNPase, 1 Rab, 1 centromere protein F, 1 G-protein coupled receptor 1, 1 peroxin-19, 1 GB1/RHD3 GTPase, 1 inositol 1,4,5-trisphosphate 5-phosphatase type II, 1 DNJA1, 1 PA24C, 1","Rho, 7 Ras, 4 G protein gamma, 3 cyclic nucleotide phosphodiesterase, 2 aldehyde dehydrogenase, 2 Di-Ras, 1 2-5A synthase, 1 GB1/RHD3 GTPase, 1 ZFN2B, 1 RPGR, 1","Rab, 62","FBXL2, 1 synaptobrevin, 1"
location,"Membrane, 58 Cytoplasm, 5 Nucleus, 5 Nucleus lamina, 3 Nucleus matrix, 2 Kinetochore, 2 Spindle, 2 Centrosome, 1 Centriole, 1 Cell cortex, 1 Spindle pole, 1 Centromere, 1 Perinuclear region, 1 Melanosome, 1 Golgi apparatus, 1 Nucleus envelope, 1 Nucleus speckle, 1 Nucleoplasm, 1 Nucleolus, 1","Membrane, 19 Cytoplasm, 2 Mitochondrion, 1 Microsome, 1 Nucleus, 1 Secreted, 1 Endoplasmic reticulum, 1 Lipid droplet, 1 Golgi apparatus, 1 Flagellum axoneme, 1 Cilium, 1 Centrosome, 1 Cilium axoneme, 1 Cilium basal body, 1","Membrane, 60 Cytoplasm, 1 Cytoplasmic vesicle, 1","Membrane, 2"
evidenceCode,"ECO_0000269, 31 ECO_0000250, 23 extended_search, 9 ECO_0000255, 5 ECO_0000305, 3 ECO_0000256, 2 ECO_0007744, 2","ECO_0000250, 13 ECO_0000269, 8 ECO_0000305, 1 ECO_0000255, 1","ECO_0000250, 50 ECO_0000269, 9 ECO_0000255, 2 ECO_0000305, 1","ECO_0000305, 1 ECO_0000269, 1 extended_search, 1"
evidenceCode with publication,"ECO_0000269, 41 extended_search, 9 ECO_0000305, 5 ECO_0007744, 4","ECO_0000269, 10 ECO_0000305, 1","ECO_0000269, 10 ECO_0000305, 1","ECO_0000269, 2 extended_search, 2 ECO_0000305, 1"


In [63]:
dfi.export(df_styled, os.path.join(datafolder, 'figures', 'Known_protein_substrates.png'))

C:\Program Files\Google\Chrome\Application\chrome.exe


# Sequence logos

In [64]:
# Update the C-terminal peptide according to fasta sequence
known = pd.merge(known, fasta_canon[['ID', 'pep']], on="ID") 

known_F = pd.merge(known_F, fasta_canon[['ID', 'pep']], on="ID") 
known_GG = pd.merge(known_GG, fasta_canon[['ID', 'pep']], on="ID") 

In [65]:
# save
known.to_csv(os.path.join(datafolder, 'known_proteins', 'known_all.csv'), sep=';', index=False)
known_F.to_csv(os.path.join(datafolder, 'known_proteins', 'known_F.csv'), sep=';', index=False)
known_GG.to_csv(os.path.join(datafolder, 'known_proteins', 'known_GG.csv'), sep=';', index=False)

In [66]:
known_F.motif.unique()

array(['CAAX', 'CXXX'], dtype=object)

In [67]:
known_GG.motif.unique()

array(['CXXX', 'CXC', 'CAAX', 'CC', 'C', 'CCX'], dtype=object)

In [68]:
# aggregate on different C motifs
F_CXXX = known_F['pep'][known_F['motif'] == 'CXXX']
F_CAAX = known_F['pep'][known_F['motif'] == 'CAAX']

GG_CXXX = known_GG['pep'][known_GG['motif'] == 'CXXX']
GG_CAAX = known_GG['pep'][known_GG['motif'] == 'CAAX']
GG_CXC = known_GG['pep'][known_GG['motif'] == 'CXC']
GG_CC = known_GG['pep'][known_GG['motif'] == 'CC']
GG_CCX = known_GG['pep'][known_GG['motif'] == 'CCX']
GG_C = known_GG['pep'][known_GG['motif'] == 'C']

In [69]:
# save
F_CXXX.to_csv(os.path.join(datafolder, 'PSSMSearch', 'F_CXXX.txt'), header=None, index=None, sep=' ')
F_CAAX.to_csv(os.path.join(datafolder, 'PSSMSearch', 'F_CAAX.txt'), header=None, index=None, sep=' ')

GG_CXXX.to_csv(os.path.join(datafolder, 'PSSMSearch', 'GG_CXXX.txt'), header=None, index=None, sep=' ')
GG_CAAX.to_csv(os.path.join(datafolder, 'PSSMSearch', 'GG_CAAX.txt'), header=None, index=None, sep=' ')
GG_CXC.to_csv(os.path.join(datafolder, 'PSSMSearch', 'GG_CXC.txt'), header=None, index=None, sep=' ')
GG_CC.to_csv(os.path.join(datafolder, 'PSSMSearch', 'GG_CC.txt'), header=None, index=None, sep=' ')
GG_CCX.to_csv(os.path.join(datafolder, 'PSSMSearch', 'GG_CCX.txt'), header=None, index=None, sep=' ')
GG_C.to_csv(os.path.join(datafolder, 'PSSMSearch', 'GG_C.txt'), header=None, index=None, sep=' ')