<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Proteins-with-true-single-internal-cysteine" data-toc-modified-id="Proteins-with-true-single-internal-cysteine-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Proteins with true single internal cysteine</a></span></li><li><span><a href="#Remove-cysteines-in-disulfide-bonds" data-toc-modified-id="Remove-cysteines-in-disulfide-bonds-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Remove cysteines in disulfide bonds</a></span></li><li><span><a href="#Annotate-CCC/CXC/CC/C-motifs-and-peptides" data-toc-modified-id="Annotate-CCC/CXC/CC/C-motifs-and-peptides-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Annotate CCC/CXC/CC/C motifs and peptides</a></span></li><li><span><a href="#Hypergeometric-distribution-and-Fisher's-exact-test-for-internal-C/CC/CXC/CCC-motifs" data-toc-modified-id="Hypergeometric-distribution-and-Fisher's-exact-test-for-internal-C/CC/CXC/CCC-motifs-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Hypergeometric distribution and Fisher's exact test for internal C/CC/CXC/CCC motifs</a></span></li></ul></div>

In [2]:
import sys
import os
import session_info

# Make the helper modules in '../0_functions' (relative to the current working directory) importable
functions_dir = os.path.join(os.getcwd(), '..', '0_functions')
sys.path.append(functions_dir)

In [3]:
import pandas as pd
from functions import add_Cpos
from functions import get_Ccount
from functions import pep_intern
from functions import annotate_pep_internal

In [4]:
# Display session information (output of the session_info package) so the run environment
# is recorded together with the notebook
session_info.show()

In [3]:
# Lift pandas' display limits: show every column and every row of displayed frames
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [4]:
# Data locations (relative to the notebook's working directory)
datafolder = 'data'
PSSMSearch = 'data/PssmSearch'

# Background proteins: one CSV with the fasta sequence, most C-terminal cysteine positions etc.
background_csv = os.path.join(datafolder, 'total_extracts_GG_F', 'clean_background.csv')
background = pd.read_csv(background_csv, sep=',')

# Proteins with true single internal cysteine 

In [5]:
# Number of rows in the freshly loaded background table
n_background = len(background)
print('N of background proteins:', n_background)

N of background proteins: 804


In [6]:
# Show which columns the loaded background table provides
background.columns

Index(['ID', 'seqID', 'seq', 'len', 'Ccount'], dtype='object')

In [7]:
# Keep the rows whose 'Ccount' equals 1, i.e. proteins with a single counted cysteine.
# Only the count is tested here; the position of that cysteine is not checked in this cell.
has_single_cys = background['Ccount'].eq(1)
background_True_C1 = background.loc[has_single_cys]

print('N of background proteins with exactly 1 cysteine:', len(background_True_C1))

N of background proteins with exactly 1 cysteine: 66


In [8]:
# Add C positions counting from N-terminal and counting from C-terminal
# ATTENTION: these positions start from 1, not 0, as would be necessary for calling the Cs on index
# NOTE(review): judging from the row counts printed later in this notebook, add_Cpos appears to
# return one row per cysteine instead of one row per protein -- confirm against functions.py
background = add_Cpos(background)

# Remove cysteines in disulfide bonds

In [9]:
# Space-separated 'uniprotkb:<ID>' terms for every distinct background ID;
# this string is the input for the SPARQL query that produces the disulfide-bond CSV
up_input = ' '.join('uniprotkb:' + str(accession) for accession in background['ID'].unique())

In [10]:
# clean SPARQL output
# NOTE: this whole cell is disabled (commented out). As written it reads the SPARQL export,
# cleans it and writes the result back to the SAME file path, so it is not meant to be re-run
# on the already cleaned file.
# NOTE(review): if it is ever re-enabled it needs `import re` (not among the imports visible in
# this notebook) and should assign through .loc instead of the chained bonded_back['bond_Cpos'][i].
#bonded_back = pd.read_csv(os.path.join(datafolder, 'UniProt_SPARQL_queries', 'up_output_background_multi_disulfid_17.04.24.csv'), sep=';')
#bonded_back = bonded_back.dropna().reset_index(drop=True).rename(columns={'primaryAccession': 'ID', 'index': 'bond_Cpos'})

# Strip everything from the first '^' onwards in each bond position value
#for i, m in enumerate(bonded_back['bond_Cpos']):
    #bonded_back['bond_Cpos'][i] = re.sub(r'\^.*', '', str(m))

# Remove disulfide bonds annotated by similarity, because they are much less reliable
#bonded_back = bonded_back.loc[bonded_back['evidenceCode'] != 'ECO_0000250']
#bonded_back = bonded_back.drop(['evidenceCode'], axis=1).reset_index(drop=True)
    
#bonded_back.to_csv(os.path.join(datafolder, 'UniProt_SPARQL_queries', 'up_output_background_multi_disulfid_17.04.24.csv'), sep=';', index=False)

In [11]:
# Load the (already cleaned) disulfide-bond annotations; the file is semicolon-separated
bonded_csv = os.path.join(datafolder, 'UniProt_SPARQL_queries',
                          'up_output_background_multi_disulfid_17.04.24.csv')
bonded = pd.read_csv(bonded_csv, sep=';')

In [12]:
# Count the disulfide-bonded cysteines per protein.
# The bonded positions are renamed to 'N_Cpos' so they can be matched against 'background'.
bonded = bonded.rename(columns={'bond_Cpos': 'N_Cpos'})
agg_bonded_Ccount = get_Ccount(bonded)

# proteins that have cysteines that can be removed
print(len(agg_bonded_Ccount), 'proteins have', agg_bonded_Ccount['Ccount'].sum(), 'cysteines that can be removed.')
print('')

cols = ['ID', 'seqID', 'seq', 'len', 'Cpos', 'Ccount', 'N_Cpos']
bond_keys = ['ID', 'N_Cpos']

# Background rows whose (ID, N_Cpos) pair is annotated as part of a disulfide bond
discard = bonded.merge(background, on=bond_keys)
discard = discard[cols]

print('Before removing cysteines in disulfide bonds:')
print('Number of Cs in background proteins:', len(background)) # check

# Explicit anti-join on the (ID, N_Cpos) keys: keep only the background rows that have no match
# in 'bonded'. This replaces the concat + drop_duplicates(keep=False) trick, which compares ALL
# columns and therefore (a) silently drops genuine duplicate rows of 'background' and (b) fails to
# drop a row when any column's dtype/value differs after the merge.
# De-duplicating the keys first keeps the left merge from multiplying background rows.
bonded_keys = bonded[bond_keys].drop_duplicates()
background = background[cols].merge(bonded_keys, on=bond_keys, how='left', indicator=True)
background = background.loc[background['_merge'] == 'left_only', cols].reset_index(drop=True)

print('\nAfter removing cysteines in disulfide bonds:')
print('Number of Cs in background proteins:', len(background)) # check

81 proteins have 369 cysteines that can be removed.

Before removing cysteines in disulfide bonds:
Number of Cs in background proteins: 6639

After removing cysteines in disulfide bonds:
Number of Cs in background proteins: 6270


In [13]:
# Check how many proteins are left with a single cysteine only because their other cysteines
# were removed as disulfide-bonded: Ccount == 1 after filtering, but not already in background_True_C1
agg_bonded_filt_Ccount = get_Ccount(background)
bonded_C1 = agg_bonded_filt_Ccount.loc[
    lambda agg: agg['Ccount'].eq(1) & ~agg['ID'].isin(background_True_C1['ID'])
].reset_index(drop=True)
bonded_C1

Unnamed: 0,ID,seqID,seq,len,Cpos,Ccount,N_Cpos,Count_all
0,Q8WTT0,sp|Q8WTT0|CLC4C_HUMAN,MVPEEEPQDREKGLWWFQLKVWSMAVVSILLLSVCFTVSSVVPHNF...,213,-179,1,35,9
1,Q8N4Q1,sp|Q8N4Q1|MIA40_HUMAN,MSYCRQEGKDRIIFVTKEDHETPSSAELVADDPNDPYEEHGLILPN...,142,-139,1,4,7
2,Q15904,sp|Q15904|VAS1_HUMAN,MMAAMATARVRMGPRCAQALWRMPWLPVFLSLAAAAAAAAAEQQVP...,470,-455,1,16,3
3,Q9NX63,sp|Q9NX63|MIC19_HUMAN,MGGTTSTRRVTFEADENENITVVKGIRLSENVIDRMKESSPSGSKS...,227,-116,1,112,5
4,O75326,sp|O75326|SEM7A_HUMAN,MTPPPPGRAAPSAPRARVPGPPARLGLPLRLRLLLLLWAAAASAQG...,666,-22,1,645,19
5,O43819,sp|O43819|SCO2_HUMAN,MLLLTRSPTAWHRLSQLKPRVLPGTLGGQALHLRSWLLSRQGPAET...,266,-152,1,115,3
6,O75022,sp|O75022|LIRB3_HUMAN,MTPALTALLCLGLSLGPRTRVQAGPFPKPTLWAEPGSVISWGSPVT...,631,-622,1,10,9
7,P13987,sp|P13987|CD59_HUMAN,MGIQGGSVLFGLLLVLAVFCHSGHSLQCYNCPNPTADCKTAVNCSS...,128,-109,1,20,11
8,P40199,sp|P40199|CEAM6_HUMAN,MGPPSAPPCRLHVPWKEVLLTASLLTFWNPPTTAKLTIESTPFNVA...,344,-336,1,9,5
9,P35613,sp|P35613|BASI_HUMAN,MAAALFVLLGFALLGTHGASGAAGFVQAPLSQQRWVGGSVELHCEA...,385,-319,1,67,7


In [14]:
# Refresh 'Ccount' after removing cysteines in disulfide bonds:
# look up each row's 'ID' in the per-ID counts of 'agg_bonded_filt_Ccount'
ccount_by_id = agg_bonded_filt_Ccount.set_index('ID')['Ccount']
background['Ccount'] = background['ID'].map(ccount_by_id)

In [15]:
# Distinct protein IDs whose refreshed 'Ccount' is exactly 1
single_cys_ids = background.loc[background['Ccount'] == 1, 'ID'].unique()
print('N of background proteins with exactly 1 cysteine, after removing cysteines in disulfide bonds:',
      len(single_cys_ids))

N of background proteins with exactly 1 cysteine, after removing cysteines in disulfide bonds: 76


In [16]:
# Attach 'Count_all' (count of all cysteines, including those in disulfide bonds) from the
# aggregated table to every background row of the same 'ID'
count_all_by_id = agg_bonded_filt_Ccount[['ID', 'Count_all']]
background = background.merge(count_all_by_id, how='left', on='ID')

# Annotate CCC/CXC/CC/C motifs and peptides

In [17]:
# add peptides for all internal C positions, 11 AA long
# NOTE(review): pep_intern is defined in functions.py; the tables displayed later in this notebook
# show a 'pep' column after this step -- confirm the exact windowing there
background_motifs = pep_intern(background)

In [18]:
# annotate peptides according to motif CCC/CXC/CC/C
# The cells below read the resulting 'pepCCC', 'pepCXC', 'pepCC' and 'pepC' columns;
# a NaN in one of them is treated there as "peptide not assigned to that motif".
background_motifs = annotate_pep_internal(background_motifs)

In [19]:
# Special cases: rows whose peptide was not assigned to any of the C / CC / CCC / CXC motif columns
motif_cols = ['pepC', 'pepCC', 'pepCCC', 'pepCXC']
special_motifs = background_motifs[background_motifs[motif_cols].isna().all(axis=1)]

# CXCC and CCXC are covered in pepCC
# The position filter is built from special_motifs itself. Indexing special_motifs with a mask taken
# from the (longer) background_motifs frame makes pandas reindex the mask and emit
# "UserWarning: Boolean Series key will be reindexed to match DataFrame index".
special_motifs[special_motifs['N_Cpos'] > 5].reset_index(drop=True)

  special_motifs[background_motifs.N_Cpos > 5].reset_index(drop=True)


Unnamed: 0,ID,seqID,seq,len,Cpos,Ccount,N_Cpos,Count_all,pep,pepCCC,pepCC,pepCXC,pepC
0,O00624,sp|O00624|NPT3_HUMAN,MDGKPATRKGPDFCSLRYGLALIMHFSNFTMITQRVSLSIAIIAMV...,439,-219,11,221,11,FGSTGCVCCLL,,,,
1,O14493,sp|O14493|CLD4_HUMAN,MASMGLQVMGIALAVLGWLAVMLCCALPMWRVTAFIGSNIVTSQTI...,209,-25,7,185,9,LLCCNCPPRTD,,,,
2,O15551,sp|O15551|CLD3_HUMAN,MSMGLEITGTALAVLGWLGTIVCCALPMWRVSAFIGSNIITSQNIW...,220,-37,9,184,9,LLCCSCPPREK,,,,
3,O60488,sp|O60488|ACSL4_HUMAN,MKLKLNVLTIILLPVHLLITIYSALIFIPWYFLTNAKKKNAMAKRI...,711,-255,19,457,19,RFMNVCFCCPI,,,,
4,P21912,sp|P21912|SDHB_HUMAN,MAAVVALSLRRRLPATTLGGACLQASRGAQTAAATAPRIKKFAIYR...,280,-92,14,189,14,YECILCACCST,,,,
5,Q15057,sp|Q15057|ACAP2_HUMAN,MKMTVDFEECLKDSPRFRAALEEVEGDVAELELKLDKLVKLCIAMI...,778,-362,17,417,17,ASCCDCGLADP,,,,
6,Q5JSH3,sp|Q5JSH3|WDR44_HUMAN,MASESDTEEFYDAPEDVHLGGGYPVGSPGKVGLSTFKETENTAYKV...,913,-274,10,640,10,ISRRECLCCFQ,,,,
7,Q71RC9,sp|Q71RC9|SMIM5_HUMAN,MAATDFVQEMRAVGERLLLKLQRLPQAEPVEIVAFSVIILFTATVL...,77,-25,7,53,7,LLLIACSCCCT,,,,
8,Q8TCS8,sp|Q8TCS8|PNPT1_HUMAN,MAACRYCCSCLRLRPLSDGPFLLPRRDRALTQLQVRALWSSAGSRA...,783,-774,11,10,11,RYCCSCLRLRP,,,,
9,Q92599,sp|Q92599|SEPT8_HUMAN,MAATDLERFSNAEPEPRSLSLGGHVGFDSLPDQLVSKSVTQGFSFN...,483,-13,9,471,9,IQCCSCLVRDA,,,,


In [20]:
# Very N-terminal cysteine positions (N_Cpos < 6): peptides too short for the sequence motif
n_terminal_cys = background_motifs[background_motifs['N_Cpos'] < 6].reset_index(drop=True)
print("N-terminal cysteines that can't be aligned for sequence logo plots:",
      len(n_terminal_cys))
n_terminal_cys

N-terminal cysteines that can't be aligned for sequence logo plots: 41


Unnamed: 0,ID,seqID,seq,len,Cpos,Ccount,N_Cpos,Count_all,pep,pepCCC,pepCC,pepCXC,pepC
0,O00217,sp|O00217|NDUS8_HUMAN,MRCLTTPMLLRALAQAARAGPPGGRSLHSSAVAATYKYVNMQDPEM...,210,-208,9,3,9,MRCLTTPM,,,,
1,O75069,sp|O75069|TMCC2_HUMAN,MKRCRSDELQQQQGEEDGAGLEDAASHLPGADLRPGETTGANSAGG...,709,-706,5,4,5,MKRCRSDEL,,,,
2,O75131,sp|O75131|CPNE3_HUMAN,MAAQCVTKVALNVSCANLLDKDIGSKSDPLCVLFLNTSGQQWYEVE...,537,-533,15,5,15,MAAQCVTKVA,,,,
3,O75608,sp|O75608|LYPA1_HUMAN,MCGNNMSTPLPAIVPAARKATAAVIFLHGLGDTGHGWAEAFAGIRS...,230,-229,6,2,6,MCGNNMS,,,,
4,O95159,sp|O95159|ZFPL1_HUMAN,MGLCKCPKRKVTNLFCFEHRVNVCEHCLVANHAKCIVQSYLQWLQD...,310,-307,13,4,13,MGLCKCPKR,,,MGLCKCPKR,
5,O95831,sp|O95831|AIFM1_HUMAN,MFRCGGLAAGALKQKLVPLVRTVCVRSPRQRNRLPGNLFQRWHVPL...,613,-610,5,4,5,MFRCGGLAA,,,,
6,P0DTU3,sp|P0DTU3|TRAR2_HUMAN,MACPGFLWALVISTCLEFSMAQTVTQSQPEMSVQEAETVTLSCTYD...,275,-273,5,3,7,MACPGFLW,,,,
7,P06241,sp|P06241|FYN_HUMAN,MGCVQCKDKEATKLTEERDGSLNQSSGYRYGTDPTPQHYPSFGVTS...,537,-535,9,3,9,MGCVQCKD,,,,
8,P06729,sp|P06729|CD2_HUMAN,MSFPCKFVASFLLIFNVSSKGAVSKEITNALETWGALGQDINLDIP...,351,-347,2,5,6,MSFPCKFVAS,,,,
9,P09326,sp|P09326|CD48_HUMAN,MCSRGWDSCLALELLLLPLSLLVTSIQGHLVHMTVVSGSNVTLNIS...,243,-242,6,2,8,MCSRGWD,,,,


In [21]:
# Split the annotated peptides by cysteine motif

# Single-C peptides: all of them, then split by whether the protein's Ccount is exactly 1
B_motifs_C = background_motifs[background_motifs['pepC'].notna()].reset_index(drop=True)
B_motifs_C1 = B_motifs_C[B_motifs_C['Ccount'] == 1].reset_index(drop=True)
# every row of B_motifs_C already has a pepC value, so no additional NaN filter is needed here
B_motifs_Cplus = B_motifs_C[B_motifs_C['Ccount'] != 1].reset_index(drop=True)

# CC / CXC / CCC peptides: one row per distinct (ID, peptide) pair
B_motifs_CC = (
    background_motifs[background_motifs['pepCC'].notna()]
    .reset_index(drop=True)[['ID', 'pepCC']]
    .drop_duplicates()
)

B_motifs_CXC = (
    background_motifs[background_motifs['pepCXC'].notna()]
    .reset_index(drop=True)[['ID', 'pepCXC']]
    .drop_duplicates()
)

B_motifs_CCC = (
    background_motifs[background_motifs['pepCCC'].notna()]
    .reset_index(drop=True)[['ID', 'pepCCC']]
    .drop_duplicates()
)

In [22]:
# Peptide level: number of rows in each motif table
print('Background: number of peptides with internal C')
peptide_counts = [
    'C all: ', len(B_motifs_C),
    ', C1: ', len(B_motifs_C1),
    ', Cplus: ', len(B_motifs_Cplus),
    ', CC: ', len(B_motifs_CC),
    ', CXC: ', len(B_motifs_CXC),
    ', CCC: ', len(B_motifs_CCC),
]
print(*peptide_counts, sep='')

# Protein level: number of distinct IDs in each motif table
print('\nBackground: number of proteins with internal C')
protein_counts = [
    'C all: ', len(B_motifs_C['ID'].unique()),
    ', C1: ', len(B_motifs_C1['ID'].unique()),
    ', C plus: ', len(B_motifs_Cplus['ID'].unique()),
    ', CC: ', len(B_motifs_CC['ID'].unique()),
    ', CXC: ', len(B_motifs_CXC['ID'].unique()),
    ', CCC: ', len(B_motifs_CCC['ID'].unique()),
]
print(*protein_counts, sep='')


Background: number of peptides with internal C
C all: 5708, C1: 75, Cplus: 5633, CC: 141, CXC: 110, CCC: 5

Background: number of proteins with internal C
C all: 781, C1: 75, C plus: 706, CC: 121, CXC: 92, CCC: 4


In [23]:
# Write each peptide column to its own plain-text file (one peptide per line, no header, no index)
# for pssmsearch
pssm_exports = [
    (B_motifs_C.pepC, 'back_pep_Call.txt'),
    (B_motifs_C1.pepC, 'back_pep_C1.txt'),
    (B_motifs_Cplus.pepC, 'back_pep_Cplus.txt'),
    (B_motifs_CC.pepCC, 'back_pep_CC.txt'),
    (B_motifs_CXC.pepCXC, 'back_pep_CXC.txt'),
    (B_motifs_CCC.pepCCC, 'back_pep_CCC.txt'),
]
for peptides, filename in pssm_exports:
    peptides.to_csv(os.path.join(PSSMSearch, 'internal', filename), header=None, index=None, sep=' ')

In [24]:
# save all background proteins with exactly one internal cysteine
# NOTE(review): 'background_C1' is built here but is NOT the frame that gets written -- the CSV is
# saved from 'B_motifs_C1' (rows that have a 'pepC' peptide and Ccount == 1). Confirm which of the
# two tables is meant to end up in 'background_C1.csv'.
background_C1 = background[background['Ccount'] == 1].reset_index(drop=True)
B_motifs_C1.to_csv(os.path.join(datafolder, 'total_extracts_GG_F', 'background_C1.csv'), sep=',', index=False)

In [25]:
# save dfs for AlphaFold in RQ6
# NOTE(review): this writes to the same 'clean_background.csv' path that this notebook loads
# 'background' from in its load-data cell, so the input file is replaced by the processed table.
# A later Restart & Run All would then start from this output instead of the original input --
# confirm that overwriting is intended, or save under a different file name.
background.to_csv(os.path.join(datafolder, 'total_extracts_GG_F', 'clean_background.csv'), sep=',', index=False)