In [7]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import os
# Make the MUTarget_data folder on the mounted Drive the working directory;
# the CSV reads and writes in later cells use bare filenames that resolve
# against it.
os.chdir('/content/drive/MyDrive/MUTarget_data/')
current_path = os.getcwd()
# Last expression of the cell: the folder listing becomes the cell output.
os.listdir(current_path)

['PLANTS_uniprot.csv',
 'FUNGI_uniprot.csv',
 'ANIMALS_uniprot.csv',
 'combined_columns.csv']

In [9]:
import pandas as pd
import re

def process_text(text):
    """Normalise a raw MOTIF annotation string.

    Colons are deleted, '|' separators become ';', and afterwards every
    '<digits>-<digits>' span is removed.

    Args:
        text: The raw annotation string.

    Returns:
        The cleaned string.
    """
    # Single pass over the characters: delete ':' and map '|' to ';'.
    swapped = text.translate({ord(':'): None, ord('|'): ';'})
    # Digit ranges are stripped only after the colons are gone, so digits that
    # a colon used to separate are already adjacent when the pattern runs.
    return re.sub(r'\d+-\d+', '', swapped)

# Load the three source tables; the bare filenames resolve against the current
# working directory.
df_fungi = pd.read_csv('FUNGI_uniprot.csv')
df_animal = pd.read_csv('ANIMALS_uniprot.csv')
df_plant = pd.read_csv('PLANTS_uniprot.csv')

# Stack the tables (each keeps its own row labels), drop the LOCATION column,
# and clean every MOTIF string with process_text.
combined_df = (
    pd.concat([df_fungi, df_animal, df_plant])
    .drop(columns='LOCATION')
    .assign(MOTIF=lambda frame: frame['MOTIF'].apply(process_text))
)
display(combined_df)

# Print the value frequencies for MOTIF, then for Entry; value_counts ends up
# holding the Entry counts.
for counted_column in ('MOTIF', 'Entry'):
    value_counts = combined_df[counted_column].value_counts()
    print(value_counts)


Unnamed: 0,Entry,MOTIF,Sequence
0,P08679,Peroxisome,MTVPYLNSNRNVASYLQSNSSQEKTLKERFSEIYPIHAQDVRQFVK...
1,P21826,Peroxisome,MVKISLDNTALYADIDTTPQFEPSKTTVADILTKDALEFIVLLHRT...
2,P27796,Peroxisome,MSQRLQSIKDHLVESAMGKGESKRKNSLLEKRPEDVVIVAANRSAI...
3,P30952,Peroxisome,MVKVSLDNVKLLVDVDKEPFFKPSSTTVGDILTKDALEFIVLLHRT...
4,P38137,Peroxisome,MTSAATVTASFNDTFSVSDNVAVIVPETDTQVTYRDLSHMVGHFQT...
...,...,...,...
1155,P31110,SIGNAL,MASPATSSAVLVVVLVATLAAGGANAATFTITNRCSFTVWPAATPV...
1156,P43174,SIGNAL,MNNEKNVSFEFIGSTDEVDEIKLLPCAWAGNVCGEKRAYCCSDPGR...
1157,Q2A783,SIGNAL,MKVFILALLALTATTAIAQLETTCSQGFGQYQQQQQPGQRQLLEQM...
1158,Q39962,SIGNAL,MKMKRSPYCFCCSFALLLLVSFLKDRHFCSADPTDGFTEVPLTEDN...


SIGNAL                                                  3196
Mitochondrion                                            791
Nucleus                                                  428
chloroplast                                              342
ER                                                       123
Nucleus;Nucleus                                          108
Peroxisome                                                62
Nucleus_export                                            52
Thylakoid                                                 44
Nucleus_export;Nucleus                                    34
ER;SIGNAL                                                 23
dual                                                      17
Nucleus;Nucleus;Nucleus                                   17
Nucleus_export;Nucleus;Nucleus                            12
Nucleus_export;Nucleus_export;Nucleus                      9
Nucleus_export;Nucleus_export                              8
Nucleus_export;Nucleus_e

In [10]:
import pandas as pd
import re

def remove_duplicate_motifs(text):
    """Drop repeated labels from a ';'-separated string, keeping first-seen order.

    Args:
        text: String of labels joined by ';' (e.g. 'Nucleus;Nucleus;ER').

    Returns:
        The labels joined by ';' again, with each distinct label kept once at
        the position of its first occurrence.
    """
    # dict keys are unique and preserve insertion order, so this de-duplicates
    # in a single pass instead of calling list.index once per label.
    return ';'.join(dict.fromkeys(text.split(';')))

# Rewrite each MOTIF string with its repeated labels collapsed.
combined_df['MOTIF'] = combined_df['MOTIF'].map(remove_duplicate_motifs)
display(combined_df)

# Frequencies of the de-duplicated MOTIF strings.
value_counts = combined_df['MOTIF'].value_counts()
print(value_counts)


Unnamed: 0,Entry,MOTIF,Sequence
0,P08679,Peroxisome,MTVPYLNSNRNVASYLQSNSSQEKTLKERFSEIYPIHAQDVRQFVK...
1,P21826,Peroxisome,MVKISLDNTALYADIDTTPQFEPSKTTVADILTKDALEFIVLLHRT...
2,P27796,Peroxisome,MSQRLQSIKDHLVESAMGKGESKRKNSLLEKRPEDVVIVAANRSAI...
3,P30952,Peroxisome,MVKVSLDNVKLLVDVDKEPFFKPSSTTVGDILTKDALEFIVLLHRT...
4,P38137,Peroxisome,MTSAATVTASFNDTFSVSDNVAVIVPETDTQVTYRDLSHMVGHFQT...
...,...,...,...
1155,P31110,SIGNAL,MASPATSSAVLVVVLVATLAAGGANAATFTITNRCSFTVWPAATPV...
1156,P43174,SIGNAL,MNNEKNVSFEFIGSTDEVDEIKLLPCAWAGNVCGEKRAYCCSDPGR...
1157,Q2A783,SIGNAL,MKVFILALLALTATTAIAQLETTCSQGFGQYQQQQQPGQRQLLEQM...
1158,Q39962,SIGNAL,MKMKRSPYCFCCSFALLLLVSFLKDRHFCSADPTDGFTEVPLTEDN...


SIGNAL                          3196
Mitochondrion                    791
Nucleus                          563
chloroplast                      345
ER                               123
Nucleus_export;Nucleus            63
Peroxisome                        62
Nucleus_export                    61
Thylakoid                         44
ER;SIGNAL                         23
dual                              17
Nucleus;SIGNAL                     7
Thylakoid;chloroplast              2
Nucleus_export;Mitochondrion       1
Mitochondrion;Nucleus              1
chloroplast;Nucleus                1
Name: MOTIF, dtype: int64


In [11]:
# Collapse the frame to one text column whose header and rows are tab-joined.
# The middle field of the header reads "EC number" while the values placed
# there come from the MOTIF column.
combined_df = (
    combined_df['Entry'].astype(str)
    + '\t'
    + combined_df['MOTIF'].astype(str)
    + '\t'
    + combined_df['Sequence'].astype(str)
).to_frame(name='Entry\tEC number\tSequence')
display(combined_df)

# Write the single column without the row index.
combined_df.to_csv('combined_columns.csv', index=False)

Unnamed: 0,Entry\tEC number\tSequence
0,P08679\tPeroxisome\tMTVPYLNSNRNVASYLQSNSSQEKTL...
1,P21826\tPeroxisome\tMVKISLDNTALYADIDTTPQFEPSKT...
2,P27796\tPeroxisome\tMSQRLQSIKDHLVESAMGKGESKRKN...
3,P30952\tPeroxisome\tMVKVSLDNVKLLVDVDKEPFFKPSST...
4,P38137\tPeroxisome\tMTSAATVTASFNDTFSVSDNVAVIVP...
...,...
1155,P31110\tSIGNAL\tMASPATSSAVLVVVLVATLAAGGANAATFT...
1156,P43174\tSIGNAL\tMNNEKNVSFEFIGSTDEVDEIKLLPCAWAG...
1157,Q2A783\tSIGNAL\tMKVFILALLALTATTAIAQLETTCSQGFGQ...
1158,Q39962\tSIGNAL\tMKMKRSPYCFCCSFALLLLVSFLKDRHFCS...
