<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Proteins-with-true-single-internal-cysteine" data-toc-modified-id="Proteins-with-true-single-internal-cysteine-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Proteins with true single internal cysteine</a></span></li><li><span><a href="#Remove-cysteines-in-disulfide-bonds" data-toc-modified-id="Remove-cysteines-in-disulfide-bonds-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Remove cysteines in disulfide bonds</a></span></li><li><span><a href="#Annotate-CCC/CXC/CC/C-motifs-and-peptides" data-toc-modified-id="Annotate-CCC/CXC/CC/C-motifs-and-peptides-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Annotate CCC/CXC/CC/C motifs and peptides</a></span></li><li><span><a href="#Hypergeometric-distribution-and-Fisher's-exact-test-for-internal-C/CC/CXC/CCC-motifs" data-toc-modified-id="Hypergeometric-distribution-and-Fisher's-exact-test-for-internal-C/CC/CXC/CCC-motifs-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Hypergeometric distribution and Fisher's exact test for internal C/CC/CXC/CCC motifs</a></span></li></ul></div>

In [1]:
import sys
import os
import session_info

# Add the '0_functions' folder to sys.path
sys.path.append(os.path.join(os.getcwd(), '..', '0_functions'))

In [2]:
import numpy as np
import pandas as pd
from scipy.stats import fisher_exact
from functions import Urn
from functions import add_Cpos
from functions import get_Ccount
from functions import pep_intern
from functions import annotate_pep_internal
import dataframe_image as dfi

In [3]:
# Display session information
session_info.show()

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [5]:
datafolder = 'data'
known_proteins_datafolder = '../1_Known_prenylated_proteins/data'
figures = 'data/figures'
PSSMSearch = 'data/PssmSearch'

In [6]:
# Load data

# All identified prenylated proteins that contain a cysteine
F_found = pd.read_csv(os.path.join(datafolder, 'identified_proteins', 'identified_F.csv'), sep=',')
GG_found = pd.read_csv(os.path.join(datafolder, 'identified_proteins', 'identified_GG.csv'), sep=',')
F_GG_found = pd.read_csv(os.path.join(datafolder, 'identified_proteins', 'identified_F_GG.csv'), sep=',')

# clean FASTA file of all identified proteins
fasta = pd.read_csv(os.path.join(datafolder, 'identified_proteins', 'final_fasta_seqs_identified_prots.csv'), sep=',')

# Cysteines in disulfide bonds
bonded = pd.read_csv(os.path.join(datafolder, 'UniProt_SPARQL_queries', 'up_output_multi_disulfid_04.10.23.csv'), sep=';')

# All known prenylated proteins
all_UP = pd.read_csv(os.path.join(known_proteins_datafolder, 'known_proteins', 'known_all.csv'), sep=';')

# Master file
master_df = pd.read_csv(os.path.join('..', 'master_df.csv'), sep=',')

In [7]:
novel_FAZ_only_Exp2F = ['O43900',
 'O95236',
 'Q16644',
 'Q5SZL2',
 'Q8IW50',
 'Q8N3D4',
 'Q8N5D0',
 'Q96EA4',
 'Q9Y3P9']

In [8]:
F_found[F_found['ID'].isin(novel_FAZ_only_Exp2F)].reset_index(drop=True)

Unnamed: 0,ID,name,fullName,substrate,location,moiety,AminoAcid,position,evidenceCode,publication,ProteinEntryReviewed,Exp_moiety,motif,pep
0,O43900,PRIC3,Prickle planar cell polarity protein 3,prickle / espinas / testin,Membrane,,,0,,,True,S-farnesyl cysteine,CAAX,
1,O95236,APOL3,Apolipoprotein L3,apolipoprotein L,Cytoplasm,,,0,,,True,S-farnesyl cysteine,CXXX,
2,Q16644,MAPK3,MAP kinase-activated protein kinase 3,CAMK Ser/Thr protein kinase,"['Nucleus', 'Cytoplasm']",,,0,,,True,S-farnesyl cysteine,CXXX,
3,Q5SZL2,CE85L,Centrosomal protein of 85 kDa-like,CEP85,Centrosome,,,0,,,True,S-farnesyl cysteine,CXXX,
4,Q8IW50,F219A,Protein FAM219A,FAM219,,,,0,,,True,S-farnesyl cysteine,CXXX,
5,Q8N3D4,EH1L1,EH domain-binding protein 1-like protein 1,EH1L1,Endosome,,,0,,,True,S-farnesyl cysteine,CAAX,
6,Q8N5D0,WDTC1,WD and tetratricopeptide repeats protein 1,WDTC1,,,,0,,,True,S-farnesyl cysteine,CXXX,
7,Q96EA4,SPDLY,Protein Spindly,Spindly,"['Nucleus', 'Centrosome', 'Spindle pole', 'Kin...",,,0,,,True,S-farnesyl cysteine,CXXX,
8,Q9Y3P9,RBGP1,Rab GTPase-activating protein 1,RBGP1,"['Centrosome', 'Cytosol']",,,0,,,True,S-farnesyl cysteine,C,


# Proteins with true single internal cysteine 

In [9]:
fasta_F = fasta[fasta['ID'].isin(F_found['ID'])]
fasta_GG = fasta[fasta['ID'].isin(GG_found['ID'])]
fasta_F_GG = fasta[fasta['ID'].isin(F_GG_found['ID'])]

print('N of identified only farnesylated proteins:', len(fasta_F))
print('N of identified only geranylgeranylated proteins:', len(fasta_GG))
print('N of identified farnesylated and geranylgeranylated proteins:', len(fasta_F_GG))

N of identified only farnesylated proteins: 274
N of identified only geranylgeranylated proteins: 159
N of identified farnesylated and geranylgeranylated proteins: 203


In [10]:
# Internally prenylated proteins

In [11]:
int_F = fasta_F[fasta_F['motif'].isna()]
int_GG = fasta_GG[fasta_GG['motif'].isna()]
int_F_GG = fasta_F_GG[fasta_F_GG['motif'].isna()]

print('Internal farnesylated only:', len(int_F))
print('Internal geranylgeranylated only:', len(int_GG))
print('Internal farnesylated & geranylgeranylated:', len(int_F_GG))

Internal farnesylated only: 184
Internal geranylgeranylated only: 133
Internal farnesylated & geranylgeranylated: 158


In [12]:
# check for proteins with only 1 possibly prenylated C that is internal
F_int_True_C1 = int_F[int_F['Ccount'] == 1]
GG_int_True_C1 = int_GG[int_GG['Ccount'] == 1]
F_GG_int_True_C1 = int_F_GG[int_F_GG['Ccount'] == 1]

# Get Uniprot annotations for proteins with 1 cysteine
F_int_True_C1_UP = F_found[F_found['ID'].isin(F_int_True_C1['ID'])].reset_index(drop=True)
GG_int_True_C1_UP = GG_found[GG_found['ID'].isin(GG_int_True_C1['ID'])].reset_index(drop=True)
F_GG_int_True_C1_UP = F_GG_found[F_GG_found['ID'].isin(F_GG_int_True_C1['ID'])].reset_index(drop=True)

print('N of F proteins with exactly 1 cysteine:', len(F_int_True_C1_UP))
print('N of GG proteins with exactly 1 cysteine:', len(GG_int_True_C1_UP))
print('N of F_GG proteins with exactly 1 cysteine:', len(F_GG_int_True_C1_UP))

N of F proteins with exactly 1 cysteine: 10
N of GG proteins with exactly 1 cysteine: 4
N of F_GG proteins with exactly 1 cysteine: 8


In [13]:
F_int_True_C1_UP

Unnamed: 0,ID,name,fullName,substrate,location,moiety,AminoAcid,position,evidenceCode,publication,ProteinEntryReviewed,Exp_moiety,motif,pep
0,O75438,NDUB1,NADH dehydrogenase [ubiquinone] 1 beta subcomp...,complex I NDUFB1 subunit,Membrane,,,0,,,True,S-farnesyl cysteine,internal,
1,P00395,COX1,Cytochrome c oxidase subunit 1,heme-copper respiratory oxidase,Membrane,,,0,,,True,S-farnesyl cysteine,internal,
2,P36542,ATPG,"ATP synthase subunit gamma, mitochondrial",ATPase gamma chain,Membrane,,,0,,,True,S-farnesyl cysteine,internal,
3,P60059,SC61G,Protein transport protein Sec61 subunit gamma,SecE/SEC61-gamma,Membrane,,,0,,,True,S-farnesyl cysteine,internal,
4,Q16836,HCDH,"Hydroxyacyl-coenzyme A dehydrogenase, mitochon...",3-hydroxyacyl-CoA dehydrogenase,Mitochondrion matrix,,,0,,,True,S-farnesyl cysteine,internal,
5,Q5XKP0,MIC13,MICOS complex subunit MIC13,MICOS complex subunit Mic13,Membrane,,,0,,,True,S-farnesyl cysteine,internal,
6,Q96M27,PRRC1,Protein PRRC1,PRRC1,Golgi apparatus,,,0,,,True,S-farnesyl cysteine,internal,
7,Q9BQE5,APOL2,Apolipoprotein L2,apolipoprotein L,Cytoplasm,,,0,,,True,S-farnesyl cysteine,internal,
8,Q9UDW1,QCR9,Cytochrome b-c1 complex subunit 9,UQCR10/QCR9,Membrane,,,0,,,True,S-farnesyl cysteine,internal,
9,Q9UMX0,UBQL1,Ubiquilin-1,UBQL1,Membrane,,,0,,,True,S-farnesyl cysteine,internal,


In [14]:
GG_int_True_C1_UP

Unnamed: 0,ID,name,fullName,substrate,location,moiety,AminoAcid,position,evidenceCode,publication,ProteinEntryReviewed,Exp_moiety,motif,pep
0,P53367,ARFP1,Arfaptin-1,ARFP1,Membrane,,,0,,,True,S-geranylgeranyl cysteine,internal,
1,P63027,VAMP2,Vesicle-associated membrane protein 2,synaptobrevin,Membrane,,,0,,,True,S-geranylgeranyl cysteine,internal,
2,Q8N4V1,EMC5,ER membrane protein complex subunit 5,membrane magnesium transporter (TC 1.A.67),Membrane,,,0,,,True,S-geranylgeranyl cysteine,internal,
3,Q92968,PEX13,Peroxisomal membrane protein PEX13,peroxin-13,Membrane,,,0,,,True,S-geranylgeranyl cysteine,internal,


In [15]:
F_GG_int_True_C1_UP

Unnamed: 0,ID,name,fullName,substrate,location,moiety,AminoAcid,position,evidenceCode,publication,ProteinEntryReviewed,Exp_moiety,motif,pep
0,O00483,NDUA4,Cytochrome c oxidase subunit NDUFA4,complex IV NDUFA4 subunit,Membrane,,,0,,,True,"[S-farnesyl cysteine, S-geranylgeranyl cysteine]",internal,
1,P00414,COX3,Cytochrome c oxidase subunit 3,cytochrome c oxidase subunit 3,Membrane,,,0,,,True,"[S-farnesyl cysteine, S-geranylgeranyl cysteine]",internal,
2,P0DJ93,SIM13,Small integral membrane protein 13,SMIM13,Membrane,,,0,,,True,"[S-farnesyl cysteine, S-geranylgeranyl cysteine]",internal,
3,P60468,SC61B,Protein transport protein Sec61 subunit beta,SEC61-beta,Membrane,S-palmitoyl cysteine,C,-5,ECO_0000305,doi:10.1194/jlr.d011106,True,"[S-farnesyl cysteine, S-geranylgeranyl cysteine]",internal,
4,Q8N131,PORIM,Porimin,CD164,Membrane,,,0,,,True,"[S-farnesyl cysteine, S-geranylgeranyl cysteine]",internal,
5,Q99720,SGMR1,Sigma non-opioid intracellular receptor 1,ERG2,Membrane,,,0,,,True,"[S-farnesyl cysteine, S-geranylgeranyl cysteine]",internal,
6,Q9BTV4,TMM43,Transmembrane protein 43,TMEM43,Membrane,,,0,,,True,"[S-farnesyl cysteine, S-geranylgeranyl cysteine]",internal,
7,Q9BV40,VAMP8,Vesicle-associated membrane protein 8,synaptobrevin,Membrane,(Microbial infection) N6-stearoyl lysine,K,-3,ECO_0000269,doi:10.1038/s41564-018-0215-6,True,"[S-farnesyl cysteine, S-geranylgeranyl cysteine]",internal,


In [16]:
# save
dfi.export(F_int_True_C1_UP, (os.path.join(datafolder, 'figures', 'F_True_C1_UP.png')))
dfi.export(GG_int_True_C1_UP, (os.path.join(datafolder, 'figures', 'GG_True_C1_UP.png')))
dfi.export(F_GG_int_True_C1_UP, (os.path.join(datafolder, 'figures', 'F_GG_True_C1_UP.png')))

C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe


In [17]:
#Update master file
Internal_C_1 = fasta[fasta['Ccount'] == 1 & fasta['motif'].isna()]
Internal_C_1 = add_Cpos(Internal_C_1[['ID', 'seq', 'len']])

master_df['Internal_C_1'] = master_df['Protein_ID'].map(Internal_C_1.set_index('ID')['Cpos'])

# Remove cysteines in disulfide bonds

In [18]:
# remove Cpos column
dataframes = [int_F, int_GG, int_F_GG] 

for df in dataframes:
    if 'Cpos' in df.columns:
        df.drop('Cpos', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop('Cpos', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop('Cpos', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop('Cpos', axis=1, inplace=True)


In [19]:
# Add C positions counting from N-terminal and counting from C-terminal
# ATTENTION: these positions start from 1, not 0, as would be necessary for calling the Cs on index
fasta_F = add_Cpos(int_F)
fasta_GG = add_Cpos(int_GG)
fasta_F_GG = add_Cpos(int_F_GG)

In [20]:
# add C count 
bonded = bonded.rename(columns={'bond_Cpos': 'N_Cpos'})
agg_bonded_Ccount = get_Ccount(bonded)

# proteins that have cysteines that can be removed
print(len(agg_bonded_Ccount), 'proteins have', agg_bonded_Ccount['Ccount'].sum(), 'cysteines that can be removed.')
print('')

cols = ['ID', 'seqID', 'seq', 'len', 'Cpos', 'Ccount', 'motif', 'N_Cpos']

discardF = bonded.merge(fasta_F, on=['ID', 'N_Cpos'])
discardF = discardF[cols]

discardGG = bonded.merge(fasta_GG, on=['ID', 'N_Cpos'])
discardGG = discardGG[cols]

discardFGG = bonded.merge(fasta_F_GG, on=['ID', 'N_Cpos'])
discardFGG = discardFGG[cols]

print('Before removing cysteines in disulfide bonds:')
print('Number of Cs in F proteins:', len(fasta_F)) # check
print('Number of Cs in GG proteins:', len(fasta_GG)) # check
print('Number of Cs in F or GG proteins:', len(fasta_F_GG)) # check

fasta_F = fasta_F[cols]
fasta_GG = fasta_GG[cols]
fasta_F_GG = fasta_F_GG[cols]

fasta_F = pd.concat([fasta_F, discardF]).drop_duplicates(keep=False).reset_index(drop=True)
fasta_GG = pd.concat([fasta_GG, discardGG]).drop_duplicates(keep=False).reset_index(drop=True)
fasta_F_GG = pd.concat([fasta_F_GG, discardFGG]).drop_duplicates(keep=False).reset_index(drop=True)

print('\nAfter removing cysteines in disulfide bonds:')
print('Number of Cs in F proteins:', len(fasta_F)) # check
print('Number of Cs in GG proteins:', len(fasta_GG)) # check
print('Number of Cs in F or GG proteins:', len(fasta_F_GG)) # check

57 proteins have 482 cysteines that can be removed.

Before removing cysteines in disulfide bonds:
Number of Cs in F proteins: 2141
Number of Cs in GG proteins: 1425
Number of Cs in F or GG proteins: 1633

After removing cysteines in disulfide bonds:
Number of Cs in F proteins: 2053
Number of Cs in GG proteins: 1315
Number of Cs in F or GG proteins: 1436


In [21]:
# One F_GG protein seems to only have cysteines in disulfide bonds!

F_GG_found[F_GG_found['ID'] == 'P62072'].reset_index(drop=True)
#int_F_GG[int_F_GG['ID'] == 'P62072'].reset_index(drop=True)

Unnamed: 0,ID,name,fullName,substrate,location,moiety,AminoAcid,position,evidenceCode,publication,ProteinEntryReviewed,Exp_moiety,motif,pep
0,P62072,TIM10,Mitochondrial import inner membrane translocas...,small Tim,Membrane,,,0,,,True,"[S-farnesyl cysteine, S-geranylgeranyl cysteine]",internal,


In [22]:
# check if and how much the possibly prenylated Cs have been reduced to 1 possibly prenylated C with this method

# F
agg_F_bonded_filt_Ccount = get_Ccount(fasta_F)
F_bonded_C1 = agg_F_bonded_filt_Ccount[agg_F_bonded_filt_Ccount['Ccount'] == 1]
F_bonded_C1 = F_bonded_C1[~F_bonded_C1['ID'].isin(F_int_True_C1['ID'])].reset_index(drop=True)
F_bonded_C1

Unnamed: 0,ID,seqID,seq,len,Cpos,Ccount,motif,N_Cpos,Count_all
0,Q96HE7,sp|Q96HE7|ERO1A_HUMAN,MGRGWGFLFGLLGAVWLLSSGHGEEQPPETAAQRCFCQVSGYLDDC...,468,-303,1,,166,15
1,P11279,sp|P11279|LAMP1_HUMAN,MAAPGSARRPLLLLLLLLLLGLMHCASAAMFMVKNGNGTACIMANF...,417,-393,1,,25,9


In [23]:
# GG
agg_GG_bonded_filt_Ccount = get_Ccount(fasta_GG)
GG_bonded_C1 = agg_GG_bonded_filt_Ccount[agg_GG_bonded_filt_Ccount['Ccount'] == 1]
GG_bonded_C1 = GG_bonded_C1[~GG_bonded_C1['ID'].isin(GG_int_True_C1['ID'])].reset_index(drop=True)
GG_bonded_C1

Unnamed: 0,ID,seqID,seq,len,Cpos,Ccount,motif,N_Cpos,Count_all
0,O95297,sp|O95297|MPZL1_HUMAN,MAASAGAGAVIAAPDSRRWLWSVLAAALGLLTAGVSALEVYTPKEI...,269,-67,1,,203,3
1,P41217,sp|P41217|OX2G_HUMAN,MERLVIRMPFSHLSTYSLVWVMAAVVLCTAQVQVVTQDEREQLYTP...,278,-251,1,,28,7


In [24]:
# F or GG
agg_F_GG_bonded_filt_Ccount = get_Ccount(fasta_F_GG)
F_GG_bonded_C1 = agg_F_GG_bonded_filt_Ccount[agg_F_GG_bonded_filt_Ccount['Ccount'] == 1]
F_GG_bonded_C1 = F_GG_bonded_C1[~F_GG_bonded_C1['ID'].isin(F_GG_int_True_C1['ID'])].reset_index(drop=True)
F_GG_bonded_C1

Unnamed: 0,ID,seqID,seq,len,Cpos,Ccount,motif,N_Cpos,Count_all
0,P05107,sp|P05107|ITB2_HUMAN,MLGLRPPLLALVGLLSLGCVLSQECTKFKVSSCRECIESGPGCTWC...,769,-751,1,,19,57


In [25]:
# Actualize Ccount after removing cysteines in disulfide bonds

# Replace 'Ccount' values in 'fasta_F_GG' with values from 'agg_F_GG_bonded_filt_Ccount' based on matching 'ID'
fasta_F_GG['Ccount'] = fasta_F_GG['ID'].map(agg_F_GG_bonded_filt_Ccount.set_index('ID')['Ccount'])
fasta_F['Ccount'] = fasta_F['ID'].map(agg_F_bonded_filt_Ccount.set_index('ID')['Ccount'])
fasta_GG['Ccount'] = fasta_GG['ID'].map(agg_GG_bonded_filt_Ccount.set_index('ID')['Ccount'])

In [26]:
print('N of F proteins with exactly 1 cysteine, after removing cysteines in disulfide bonds:', 
      len(fasta_F['ID'][fasta_F['Ccount'] == 1].unique()))

print('N of GG proteins with exactly 1 cysteine, after removing cysteines in disulfide bonds:', 
      len(fasta_GG['ID'][fasta_GG['Ccount'] == 1].unique()))

print('N of F_GG proteins with exactly 1 cysteine, after removing cysteines in disulfide bonds:', 
      len(fasta_F_GG['ID'][fasta_F_GG['Ccount'] == 1].unique()))

N of F proteins with exactly 1 cysteine, after removing cysteines in disulfide bonds: 12
N of GG proteins with exactly 1 cysteine, after removing cysteines in disulfide bonds: 6
N of F_GG proteins with exactly 1 cysteine, after removing cysteines in disulfide bonds: 9


In [27]:
# Add count of all cysteines (Count_all), including those in disulfide bonds, to fasta dfs from aggregated dfs

fasta_F = fasta_F.merge(agg_F_bonded_filt_Ccount[['ID', 'Count_all']], on='ID', how='left')
fasta_GG = fasta_GG.merge(agg_GG_bonded_filt_Ccount[['ID', 'Count_all']], on='ID', how='left')
fasta_F_GG = fasta_F_GG.merge(agg_F_GG_bonded_filt_Ccount[['ID', 'Count_all']], on='ID', how='left')

In [28]:
#Update master file
Internal_C_1_wo_disulfide_bond = pd.concat([agg_F_bonded_filt_Ccount, agg_GG_bonded_filt_Ccount, agg_F_GG_bonded_filt_Ccount])

Internal_C_1_wo_disulfide_bond = Internal_C_1_wo_disulfide_bond[(Internal_C_1_wo_disulfide_bond['Ccount'] == 1) & (Internal_C_1_wo_disulfide_bond['Count_all'] != 1)]

master_df['Internal_C_1_wo_disulfide_bond'] = master_df['Protein_ID'].map(Internal_C_1_wo_disulfide_bond.set_index('ID')['Cpos'])

In [29]:
# Save
master_df.to_csv(os.path.join('..', 'master_df.csv'), sep=',', index=False)

# Annotate CCC/CXC/CC/C motifs and peptides

In [30]:
# add peptides for all internal C positions, 11 AA long
F_motifs = pep_intern(fasta_F)
GG_motifs = pep_intern(fasta_GG)
F_GG_motifs = pep_intern(fasta_F_GG)

In [31]:
# annotate peptides according to motif CCC/CXC/CC/C
F_motifs = annotate_pep_internal(F_motifs)
GG_motifs = annotate_pep_internal(GG_motifs)
F_GG_motifs = annotate_pep_internal(F_GG_motifs)

In [32]:
# Special cases
special_F_motifs = F_motifs[F_motifs.pepC.isna() & F_motifs.pepCC.isna() & F_motifs.pepCCC.isna() & F_motifs.pepCXC.isna()]

# CXCC and CCXC are covered in pepCC
special_F_motifs[special_F_motifs.N_Cpos > 5].reset_index(drop=True)

Unnamed: 0,ID,seqID,seq,len,Cpos,Ccount,motif,N_Cpos,Count_all,pep,pepCCC,pepCC,pepCXC,pepC
0,O15162,sp|O15162|PLS1_HUMAN,MDKQNSQMNASHPETNLPVGYPPQYPPTAFQGPPGYSGYPGPQVSY...,318,-82,18,,237,18,GPCVVCSCCGD,,,,
1,O15427,sp|O15427|MOT4_HUMAN,MGGAVVDEGPTGVKAPDGGWGWAVLFGCFVITGFSYAFPKAVSVFF...,465,-273,11,,193,11,LNCCVCAALMR,,,,
2,O95573,sp|O95573|ACSL3_HUMAN,MNNHVSSKPSTMKLKHTINPILLYFIHFLISLYTILTYIPFYFFSE...,720,-255,17,,466,17,RFMNICFCCPV,,,,
3,Q15027,sp|Q15027|ACAP1_HUMAN,MTVKLDFEECLKDSPRFRASIELVEAEVSELETRLEKLLKLGTGLL...,740,-318,16,,423,16,AQCCDCREPAP,,,,
4,Q5VZE5,sp|Q5VZE5|NAA35_HUMAN,MVMKASVDDDDSGWELSMPEKMEKSNTNWVDITQDFEEACRELKLG...,725,-614,17,,112,17,GIMDTCFCCLI,,,,
5,Q9BVK2,sp|Q9BVK2|ALG8_HUMAN,MAALTIATGTGNWFSALALGVTLLKCLLIPTYHSTDFEVHRNWLAI...,526,-400,13,,127,13,RECCKCIDGKK,,,,
6,Q9NRX5,sp|Q9NRX5|SERC1_HUMAN,MGSVLGLCSMASWIPCLCGSAPCLLCRCCPSGNNSTVTRLIYALFL...,453,-428,18,,26,18,APCLLCRCCPS,,,,
7,Q9NRY6,sp|Q9NRY6|PLS3_HUMAN,MAGYLPPKGYAPSPPPPYPVTPGYPEPALHPGPGQAPVPAQVPAPA...,295,-131,13,,165,13,CSCCPCGLQEM,,,,
8,Q9Y4A5,sp|Q9Y4A5|TRRAP_HUMAN,MAFVATQGATVVDQTTLMKKYLQFVAALTDVNTPDETKLKMMQEVS...,3859,-2714,78,,1146,78,IVERLCACCYE,,,,
9,Q9Y5L0,sp|Q9Y5L0|TNPO3_HUMAN,MEGAKPTLQLVYQAVQALYHDPDPSGKERASFWLGELQRSVHAWEI...,923,-259,32,,665,32,ERCCRCLRFAV,,,,


In [33]:
# Very N-terminal cysteine positions, too short peptides for sequence motiv


# Note that Q6IAA8|LTOR1 has it's only cysteines in the N-terminus i a CC motif (MGCCYSSEN)

print("F - N-terminal cysteines that can't be aligned for sequence logo plots:", len(F_motifs[F_motifs['N_Cpos'] < 6].reset_index(drop=True)))
F_motifs[F_motifs['N_Cpos'] < 6].reset_index(drop=True)

F - N-terminal cysteines that can't be aligned for sequence logo plots: 18


Unnamed: 0,ID,seqID,seq,len,Cpos,Ccount,motif,N_Cpos,Count_all,pep,pepCCC,pepCC,pepCXC,pepC
0,A6NNY8,sp|A6NNY8|UBP27_HUMAN,MCKDYVYDKDIEQIAKEEQGEALKLQASTSTEVSHQQCSVPGLGEK...,438,-437,19,,2,19,MCKDYVY,,,,
1,O75832,sp|O75832|PSD10_HUMAN,MEGCVSNLMVCNLAYSGKLEELKESILADKSLATRTDQDSRTALHW...,226,-223,5,,4,5,MEGCVSNLM,,,,
2,P10515,sp|P10515|ODP2_HUMAN,MWRVCARRAQNVAPWAGLEARWTALQEVPGTPRVTSRSGPAPARRN...,647,-643,9,,5,9,MWRVCARRAQ,,,,
3,P13473,sp|P13473|LAMP2_HUMAN,MVCFRLFPVPGSGLVLVCLVLGAVRSYALELNLTDSENATCLYAKW...,410,-408,2,,3,10,MVCFRLFP,,,,
4,P20701,sp|P20701|ITAL_HUMAN,MKDSCITVMAMALLSGFFFFAPASSYNLDVRGARSFSPPRAGRHFG...,1170,-1166,22,,5,22,MKDSCITVMA,,,,
5,P43489,sp|P43489|TNR4_HUMAN,MCVGARRLGRGPCAALLLLGLGLSTVTGLHCVGDTYPSNDRCCHEC...,277,-276,2,,2,20,MCVGARR,,,,
6,Q13362,sp|Q13362|2A5G_HUMAN,MLTCNKAGSRMVVDAANSNGPFQPVVLLHIRDVPPADQEKLFIQKL...,524,-521,7,,4,7,MLTCNKAGS,,,,
7,Q14318,sp|Q14318|FKBP8_HUMAN,MASCAEPSEPSAPLPAGVPPLEDFEVLDGVEDAEGEEEEEEEEEEE...,412,-409,8,,4,8,MASCAEPSE,,,,
8,Q6IAA8,sp|Q6IAA8|LTOR1_HUMAN,MGCCYSSENEDSDQDREERKLLLDPSSPPTKALNGAEPNYHSLPSA...,161,-159,2,,3,2,MGCCYSSE,,,,
9,Q6IAA8,sp|Q6IAA8|LTOR1_HUMAN,MGCCYSSENEDSDQDREERKLLLDPSSPPTKALNGAEPNYHSLPSA...,161,-158,2,,4,2,MGCCYSSEN,,,,


In [34]:
print("GG - N-terminal cysteines that can't be aligned for sequence logo plots:", len(GG_motifs[GG_motifs['N_Cpos'] < 6].reset_index(drop=True)))
GG_motifs[GG_motifs['N_Cpos'] < 6].reset_index(drop=True)

GG - N-terminal cysteines that can't be aligned for sequence logo plots: 7


Unnamed: 0,ID,seqID,seq,len,Cpos,Ccount,motif,N_Cpos,Count_all,pep,pepCCC,pepCC,pepCXC,pepC
0,O60725,sp|O60725|ICMT_HUMAN,MAGCAARAPPGSEARLSLATFLLGASVLALPLLTRAGLQGRTGLAL...,284,-281,7,,4,7,MAGCAARAP,,,,
1,P98196,sp|P98196|AT11A_HUMAN,MDCSLVRTLVHRYCAGEENWVDSRTIYVGHREPPPGAEAYIPQRYP...,1134,-1132,26,,3,26,MDCSLVRT,,,,
2,Q12999,sp|Q12999|TSN31_HUMAN,MVCGGFACSKNALCALNVVYMLVSLLLIGVAAWGKGLGLVSSIHII...,210,-208,11,,3,11,MVCGGFAC,,,,
3,Q14761,sp|Q14761|PTCA_HUMAN,MALPCTLGLGMLLALPGALGSGGSAEDSVGSSSVTVVLLLLLLLLL...,206,-202,2,,5,2,MALPCTLGLG,,,,
4,Q9H6R6,sp|Q9H6R6|ZDHC6_HUMAN,MGTFCSVIKFENLQELKRLCHWGPIIALGVIAICSTMAMIDSVLWY...,413,-409,19,,5,19,MGTFCSVIKF,,,,
5,Q9NVA4,sp|Q9NVA4|T184C_HUMAN,MPCTCTWRNWRQWIRPLVAVIYLVSIVVAVPLCVWELQKLEVGIHT...,438,-436,14,,3,14,MPCTCTWR,,,,
6,Q9NVA4,sp|Q9NVA4|T184C_HUMAN,MPCTCTWRNWRQWIRPLVAVIYLVSIVVAVPLCVWELQKLEVGIHT...,438,-434,14,,5,14,MPCTCTWRNW,,,,


In [35]:
print("F_GG - N-terminal cysteines that can't be aligned for sequence logo plots:", len(F_GG_motifs[F_GG_motifs['N_Cpos'] < 6].reset_index(drop=True)))
F_GG_motifs[F_GG_motifs['N_Cpos'] < 6].reset_index(drop=True)

F_GG - N-terminal cysteines that can't be aligned for sequence logo plots: 12


Unnamed: 0,ID,seqID,seq,len,Cpos,Ccount,motif,N_Cpos,Count_all,pep,pepCCC,pepCC,pepCXC,pepC
0,O75955,sp|O75955|FLOT1_HUMAN,MFFTCGPNEAMVVSGFCRSPPVMVAGGRVFVLPCIQQIQRISLNTL...,427,-423,4,,5,4,MFFTCGPNEA,,,,
1,P06239,sp|P06239|LCK_HUMAN,MGCGCSSHPEDDWMENIDVCENCHYPIVPLDGKGTLLIRNGSEVRD...,509,-507,9,,3,9,MGCGCSSH,,,,
2,P06239,sp|P06239|LCK_HUMAN,MGCGCSSHPEDDWMENIDVCENCHYPIVPLDGKGTLLIRNGSEVRD...,509,-505,9,,5,9,MGCGCSSHPE,,,,
3,P07686,sp|P07686|HEXB_HUMAN,MELCGLGLPRPPMLLALLLATLLAAMLALLTQVALVVQVAEAARAP...,556,-553,8,,4,8,MELCGLGLP,,,,
4,P27701,sp|P27701|CD82_HUMAN,MGSACIKVTKYFLFLFNLIFFILGAVILGFGVWILADKSSFISVLQ...,267,-263,11,,5,11,MGSACIKVTK,,,,
5,Q6PIU2,sp|Q6PIU2|NCEH1_HUMAN,MRSSCVLLTALVALAAYYVYIPLPGSVSDPWKLMLLDATFRGAQQV...,408,-404,5,,5,5,MRSSCVLLTA,,,,
6,Q8TCD1,sp|Q8TCD1|CR032_HUMAN,MVCIPCIVIPVLLWIYKKFLEPYIYPLVSPFVSRIWPKKAIQESND...,76,-74,3,,3,3,MVCIPCIV,,,,
7,Q8WWI5,sp|Q8WWI5|CTL1_HUMAN,MGCCSSASSAAQSSKREWKPLEDRSCTDIPWLLLFILFCIGMGFIC...,657,-655,27,,3,27,MGCCSSAS,,,,
8,Q8WWI5,sp|Q8WWI5|CTL1_HUMAN,MGCCSSASSAAQSSKREWKPLEDRSCTDIPWLLLFILFCIGMGFIC...,657,-654,27,,4,27,MGCCSSASS,,,,
9,Q96BY9,sp|Q96BY9|SARAF_HUMAN,MAAACGPGAAGYCLLLGLHLFLLTAGPALGWNDPDRMLLRDVKALT...,339,-335,10,,5,10,MAAACGPGAA,,,,


In [36]:
# aggregate on different C motifs

F_motifs_C = F_motifs[~F_motifs['pepC'].isna()].reset_index(drop=True)
F_motifs_C1 = F_motifs[F_motifs['Ccount'] == 1].reset_index(drop=True)
F_motifs_Cplus = F_motifs[F_motifs['Ccount'] != 1].reset_index(drop=True)
F_motifs_Cplus = F_motifs_Cplus[~F_motifs_Cplus['pepC'].isna()].reset_index(drop=True)

F_motifs_CC = F_motifs[~F_motifs['pepCC'].isna()].reset_index(drop=True)
F_motifs_CC = F_motifs_CC[['ID', 'pepCC']].drop_duplicates()

F_motifs_CXC = F_motifs[~F_motifs['pepCXC'].isna()].reset_index(drop=True)
F_motifs_CXC = F_motifs_CXC[['ID', 'pepCXC']].drop_duplicates()

F_motifs_CCC = F_motifs[~F_motifs['pepCCC'].isna()].reset_index(drop=True)
F_motifs_CCC = F_motifs_CCC[['ID', 'pepCCC']].drop_duplicates()

GG_motifs_C = GG_motifs[~GG_motifs['pepC'].isna()].reset_index(drop=True)
GG_motifs_C1 = GG_motifs[GG_motifs['Ccount'] == 1].reset_index(drop=True)
GG_motifs_Cplus = GG_motifs[GG_motifs['Ccount'] != 1].reset_index(drop=True)
GG_motifs_Cplus = GG_motifs_Cplus[~GG_motifs_Cplus['pepC'].isna()].reset_index(drop=True)

GG_motifs_CC = GG_motifs[~GG_motifs['pepCC'].isna()].reset_index(drop=True)
GG_motifs_CC = GG_motifs_CC[['ID', 'pepCC']].drop_duplicates()

GG_motifs_CXC = GG_motifs[~GG_motifs['pepCXC'].isna()].reset_index(drop=True)
GG_motifs_CXC = GG_motifs_CXC[['ID', 'pepCXC']].drop_duplicates()

GG_motifs_CCC = GG_motifs[~GG_motifs['pepCCC'].isna()].reset_index(drop=True)
GG_motifs_CCC = GG_motifs_CCC[['ID', 'pepCCC']].drop_duplicates()

F_GG_motifs_C = F_GG_motifs[~F_GG_motifs['pepC'].isna()].reset_index(drop=True)
F_GG_motifs_C1 = F_GG_motifs[F_GG_motifs['Ccount'] == 1].reset_index(drop=True)
F_GG_motifs_Cplus = F_GG_motifs[F_GG_motifs['Ccount'] != 1].reset_index(drop=True)
F_GG_motifs_Cplus = F_GG_motifs_Cplus[~F_GG_motifs_Cplus['pepC'].isna()].reset_index(drop=True)

F_GG_motifs_CC = F_GG_motifs[~F_GG_motifs['pepCC'].isna()].reset_index(drop=True)
F_GG_motifs_CC = F_GG_motifs_CC[['ID', 'pepCC']].drop_duplicates()

F_GG_motifs_CXC = F_GG_motifs[~F_GG_motifs['pepCXC'].isna()].reset_index(drop=True)
F_GG_motifs_CXC = F_GG_motifs_CXC[['ID', 'pepCXC']].drop_duplicates()

F_GG_motifs_CCC = F_GG_motifs[~F_GG_motifs['pepCCC'].isna()].reset_index(drop=True)
F_GG_motifs_CCC = F_GG_motifs_CCC[['ID', 'pepCCC']].drop_duplicates()

In [37]:
print('Farnesylated: number of peptides with internal C')
print('C all: ', len(F_motifs_C), ', C1: ',  len(F_motifs_C1), ', Cplus: ',  len(F_motifs_Cplus), ', CC: ',  
      len(F_motifs_CC), ', CXC: ',  len(F_motifs_CXC),  ', CCC: ', len(F_motifs_CCC), sep='')

print('\nGeranylgeranylated: number of peptides with internal C')
print('C all: ', len(GG_motifs_C), ', C1: ',  len(GG_motifs_C1), ', Cplus: ',  len(GG_motifs_Cplus), ', CC: ',  
      len(GG_motifs_CC), ', CXC: ',  len(GG_motifs_CXC),  ', CCC: ', len(GG_motifs_CCC), sep='')

print('\nFarnesylated & Geranylgeranylated: number of peptides with internal C')
print('C all: ', len(F_GG_motifs_C), ', C1: ',  len(F_GG_motifs_C1), ', Cplus: ',  len(F_GG_motifs_Cplus), ', CC: ',  
      len(F_GG_motifs_CC), ', CXC: ',  len(F_GG_motifs_CXC),  ', CCC: ', len(F_GG_motifs_CCC), sep='')

Farnesylated: number of peptides with internal C
C all: 1817, C1: 12, Cplus: 1805, CC: 61, CXC: 39, CCC: 3

Geranylgeranylated: number of peptides with internal C
C all: 1155, C1: 6, Cplus: 1149, CC: 41, CXC: 27, CCC: 6

Farnesylated & Geranylgeranylated: number of peptides with internal C
C all: 1245, C1: 9, Cplus: 1236, CC: 53, CXC: 26, CCC: 7


In [38]:
print('Farnesylated: number of proteins with internal C')
print('C all: ', len(F_motifs_C['ID'].unique()), ', C1: ',  len(F_motifs_C1['ID'].unique()), ', Cplus: ',  len(F_motifs_Cplus['ID'].unique()), ', CC: ',  
      len(F_motifs_CC['ID'].unique()), ', CXC: ',  len(F_motifs_CXC['ID'].unique()),  ', CCC: ', len(F_motifs_CCC['ID'].unique()), sep='')

print('\nGeranylgeranylated: number of proteins with internal C')
print('C all: ', len(GG_motifs_C['ID'].unique()), ', C1: ',  len(GG_motifs_C1['ID'].unique()), ', Cplus: ',  len(GG_motifs_Cplus['ID'].unique()), ', CC: ',  
      len(GG_motifs_CC['ID'].unique()), ', CXC: ',  len(GG_motifs_CXC['ID'].unique()),  ', CCC: ', len(GG_motifs_CCC['ID'].unique()), sep='')

print('\nFarnesylated & Geranylgeranylated: number of proteins with internal C')
print('C all: ', len(F_GG_motifs_C['ID'].unique()), ', C1: ',  len(F_GG_motifs_C1['ID'].unique()), ', Cplus: ',  len(F_GG_motifs_Cplus['ID'].unique()), ', CC: ',  
      len(F_GG_motifs_CC['ID'].unique()), ', CXC: ',  len(F_GG_motifs_CXC['ID'].unique()),  ', CCC: ', len(F_GG_motifs_CCC['ID'].unique()), sep='')

Farnesylated: number of proteins with internal C
C all: 183, C1: 12, Cplus: 171, CC: 45, CXC: 32, CCC: 3

Geranylgeranylated: number of proteins with internal C
C all: 132, C1: 6, Cplus: 126, CC: 31, CXC: 20, CCC: 4

Farnesylated & Geranylgeranylated: number of proteins with internal C
C all: 156, C1: 9, Cplus: 147, CC: 36, CXC: 20, CCC: 4


In [39]:
# save peptides in txt format for pssmsearch
F_motifs_C.pepC.to_csv(os.path.join(PSSMSearch, 'internal', 'F_motifs_C.txt'), header=None, index=None, sep=' ')
F_motifs_C1.pepC.to_csv(os.path.join(PSSMSearch, 'internal', 'F_motifs_C1.txt'), header=None, index=None, sep=' ')
F_motifs_Cplus.pepC.to_csv(os.path.join(PSSMSearch, 'internal', 'F_motifs_Cplus.txt'), header=None, index=None, sep=' ')
F_motifs_CC.pepCC.to_csv(os.path.join(PSSMSearch, 'internal', 'F_motifs_CC.txt'), header=None, index=None, sep=' ')
F_motifs_CXC.pepCXC.to_csv(os.path.join(PSSMSearch, 'internal', 'F_motifs_CXC.txt'), header=None, index=None, sep=' ')
F_motifs_CCC.pepCCC.to_csv(os.path.join(PSSMSearch, 'internal', 'F_motifs_CCC.txt'), header=None, index=None, sep=' ')

GG_motifs_C.pepC.to_csv(os.path.join(PSSMSearch, 'internal', 'GG_motifs_C.txt'), header=None, index=None, sep=' ')
GG_motifs_C1.pepC.to_csv(os.path.join(PSSMSearch, 'internal', 'GG_motifs_C1.txt'), header=None, index=None, sep=' ')
GG_motifs_Cplus.pepC.to_csv(os.path.join(PSSMSearch, 'internal', 'GG_motifs_Cplus.txt'), header=None, index=None, sep=' ')
GG_motifs_CC.pepCC.to_csv(os.path.join(PSSMSearch, 'internal', 'GG_motifs_CC.txt'), header=None, index=None, sep=' ')
GG_motifs_CXC.pepCXC.to_csv(os.path.join(PSSMSearch, 'internal', 'GG_motifs_CXC.txt'), header=None, index=None, sep=' ')
GG_motifs_CCC.pepCCC.to_csv(os.path.join(PSSMSearch, 'internal', 'GG_motifs_CCC.txt'), header=None, index=None, sep=' ')

F_GG_motifs_C.pepC.to_csv(os.path.join(PSSMSearch, 'internal', 'F_GG_motifs_C.txt'), header=None, index=None, sep=' ')
F_GG_motifs_C1.pepC.to_csv(os.path.join(PSSMSearch, 'internal', 'F_GG_motifs_C1.txt'), header=None, index=None, sep=' ')
F_GG_motifs_Cplus.pepC.to_csv(os.path.join(PSSMSearch, 'internal', 'F_GG_motifs_Cplus.txt'), header=None, index=None, sep=' ')
F_GG_motifs_CC.pepCC.to_csv(os.path.join(PSSMSearch, 'internal', 'F_GG_motifs_CC.txt'), header=None, index=None, sep=' ')
F_GG_motifs_CXC.pepCXC.to_csv(os.path.join(PSSMSearch, 'internal', 'F_GG_motifs_CXC.txt'), header=None, index=None, sep=' ')
F_GG_motifs_CCC.pepCCC.to_csv(os.path.join(PSSMSearch, 'internal', 'F_GG_motifs_CCC.txt'), header=None, index=None, sep=' ')

In [40]:
# save all identified prenylated proteins with exactly one internal cysteine

intern_Ccount_1 = pd.concat([F_motifs_C1, GG_motifs_C1, F_GG_motifs_C1], ignore_index=True)
intern_Ccount_1.to_csv(os.path.join(datafolder, 'identified_proteins', 'intern_Ccount_1.csv'), sep=',', index=False)

In [41]:
# save dfs for AlphaFold in RQ6

F_motifs.to_csv(os.path.join(datafolder, 'identified_proteins', 'F_internal.csv'), sep=',', index=False)
GG_motifs.to_csv(os.path.join(datafolder, 'identified_proteins', 'GG_internal.csv'), sep=',', index=False)
F_GG_motifs.to_csv(os.path.join(datafolder, 'identified_proteins', 'F_GG_internal.csv'), sep=',', index=False)

# Expected distribution and Fisher's exact test for internal C/CC/CXC/CCC motifs

In [42]:
# Attention! Keep in mind that we don't take into account 57 
# N-terminal peptides for the prenylated proteins and 66 for the background proteins!

In [43]:
F_int_motifs = np.array([len(F_motifs_C['ID']), len(F_motifs_CC['ID']), len(F_motifs_CXC['ID']), 
                len(F_motifs_CCC['ID'])])

GG_int_motifs = np.array([len(GG_motifs_C['ID']), len(GG_motifs_CC['ID']), len(GG_motifs_CXC['ID']), 
                 len(GG_motifs_CCC['ID'])])

F_GG_int_motifs = np.array([len(F_GG_motifs_C['ID']), len(F_GG_motifs_CC['ID']), len(F_GG_motifs_CXC['ID']), 
                 len(F_GG_motifs_CCC['ID'])])

all_int_motifs = F_int_motifs + GG_int_motifs + F_GG_int_motifs

back_int_motifs = np.array([5660, 139, 108, 5])

In [44]:
# Hypergeometric distribution

# Background: number of peptides with internal C
# C:5660, CC:139, CXC:108, CCC:5

# Total N of peptides with C: 5912

K_arr = [5660, 139, 108, 5]
urn = Urn(K_arr)

In [45]:
# Category labels
category_labels = ['C', 'CC', 'CXC', 'CCC']  # Category names

In [46]:
# List of DataFrames and their names
dataframes = [
    (F_int_motifs, 'F_int_motifs'),
    (GG_int_motifs, 'GG_int_motifs'),
    (F_GG_int_motifs, 'F_GG_int_motifs'),
    (all_int_motifs, 'all_int_motifs')
]

In [47]:
# Loop through each DataFrame and perform analysis
for df, df_name in dataframes:
    # Calculate total draws and expected counts
    n = sum(df)
    m, _ = urn.moments(n)

    # Prepare DataFrame results
    results = {
        'Category': category_labels,
        'Expected Distribution': [round(value) for value in m],
        'Actual Distribution': df,
        'P-Value': []
    }

    # Calculate p-values and add significance markers
    for i in range(len(df)):
        contingency_table = np.array([[df[i], m[i]],
                                       [sum(df) - df[i], sum(m) - m[i]]])
        _, p_value = fisher_exact(contingency_table)

        # Add significance markers
        if p_value < 0.001:
            p_value_str = "0.0****"
        elif p_value < 0.01:
            p_value_str = "0.0****"
        elif p_value < 0.05:
            p_value_str = f"{p_value:.2f}*"
        else:
            p_value_str = f"{p_value:.2f}"

        results['P-Value'].append(p_value_str)

    # Create DataFrame from results
    results_df = pd.DataFrame(results)

    # Transpose the DataFrame
    results_df = results_df.set_index('Category').T  # Set 'Category' as index and transpose

    # Display the results as a formatted table
    print(f"\nResults for {df_name}:")
    print(results_df)



Results for F_int_motifs:
Category                  C    CC   CXC   CCC
Expected Distribution  1838    45    35     2
Actual Distribution    1817    61    39     3
P-Value                0.11  0.14  0.72  0.62

Results for GG_int_motifs:
Category                  C    CC   CXC   CCC
Expected Distribution  1177    29    22     1
Actual Distribution    1155    41    27     6
P-Value                0.05  0.14  0.56  0.12

Results for F_GG_int_motifs:
Category                   C     CC   CXC   CCC
Expected Distribution   1274     31    24     1
Actual Distribution     1245     53    26     7
P-Value                0.01*  0.02*  0.89  0.07

Results for all_int_motifs:
Category                     C       CC   CXC      CCC
Expected Distribution     4289      105    82        4
Actual Distribution       4217      155    92       16
P-Value                0.0****  0.0****  0.44  0.0****
