In [44]:
# Colab-only: mount Google Drive under /content/drive so the dataset folder
# used by the following cells is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [45]:
import os
# Work from the dataset folder on the mounted Drive; the relative file names
# used by later cells (e.g. 'FUNGI_uniprot.csv', 'partition.csv') resolve
# against this directory.
os.chdir('/content/drive/MyDrive/MUTarget_data/')
current_path = os.getcwd()
# Bare last expression: the notebook shows the directory listing as the cell output.
os.listdir(current_path)

['PLANTS_uniprot.csv',
 'FUNGI_uniprot.csv',
 'ANIMALS_uniprot.csv',
 'combined_columns.csv',
 'distance_map-20240204T204819Z-001.zip',
 'esm_data-20240204T205453Z-001.zip',
 'new.csv',
 'newinfer.py',
 'new-train-triplet.py',
 'new-train-supconH.py',
 'partition.csv',
 'validation_0.csv',
 'partition',
 '.ipynb_checkpoints']

In [46]:
import pandas as pd
import re

def process_text(text):
    """Normalise a raw motif annotation string.

    The clean-up happens in this order:
      1. every ':' is deleted and every '|' becomes ';';
      2. every "<digits>-<digits>" span is deleted.

    Args:
        text: the annotation string to clean.

    Returns:
        The cleaned string.
    """
    # One pass over the characters: ':' maps to nothing, '|' maps to ';'.
    separators = str.maketrans({':': None, '|': ';'})
    without_separators = text.translate(separators)
    # The range pattern is applied after the ':' removal, exactly as before.
    range_pattern = re.compile(r'\d+-\d+')
    return range_pattern.sub('', without_separators)

# Read the three per-kingdom UniProt tables.
df_fungi = pd.read_csv('FUNGI_uniprot.csv')
df_animal = pd.read_csv('ANIMALS_uniprot.csv')
df_plant = pd.read_csv('PLANTS_uniprot.csv')

# Stack the tables and discard the LOCATION column in one chain.
combined_df = pd.concat(
    [df_plant, df_animal, df_fungi]  # DO NOT CHANGE THIS ORDER
).drop(columns='LOCATION')

# Clean the MOTIF annotations, then show the combined table.
combined_df['MOTIF'] = combined_df['MOTIF'].apply(process_text)
display(combined_df)

# Frequency of each cleaned MOTIF label, then of each Entry accession.
value_counts = combined_df['MOTIF'].value_counts()
print(value_counts)
value_counts = combined_df['Entry'].value_counts()
print(value_counts)


Unnamed: 0,Entry,MOTIF,Sequence
0,O49203,dual,MSSQICRSASKAAKSLLSSAKNARFFSEGRAIGAAAAVSASGKIPL...
1,P92947,dual,MSAVRRVMALASTTLPTKSGLSLWCPSSPSLARRFPARFSPIGSRI...
2,Q42592,dual,MAERVSLTLNGTLLSPPPTTTTTTMSSSLRSTTAASLLLRSSSSSS...
3,Q66GI4,dual,MLRLTCFTPSFSRACCPLFAMMLKVPSVHLHHPRFSPFRFYHTSLL...
4,Q8L785,dual,MAILHFSLPLIVSFLRPHASPRFFLLPRSLSQSPFLSRRRFHRTSA...
...,...,...,...
362,Q8J0D2,SIGNAL,MVAVTSLGKALTALSILASLAVAKEHYEFKNVAIGGGGYITGIVAH...
363,Q96X54,SIGNAL,MVAFSALSGVSALSLLLCLVQHAHGVSLKVSTQGGNSSSPILYGFM...
364,Q9HGE1,SIGNAL,MVSIKSVLAAATAVSSALAAPFDFVPRDNSTALQARQVTPNAEGWH...
365,B3STN5,SIGNAL,MNILLATASFVLSLGFVKAEPTRHHDRYAYIERVVCDVNFPDLLCR...


SIGNAL                                                  3196
Mitochondrion                                            791
Nucleus                                                  428
chloroplast                                              342
ER                                                       123
Nucleus;Nucleus                                          108
Peroxisome                                                62
Nucleus_export                                            52
Thylakoid                                                 44
Nucleus_export;Nucleus                                    34
ER;SIGNAL                                                 23
dual                                                      17
Nucleus;Nucleus;Nucleus                                   17
Nucleus_export;Nucleus;Nucleus                            12
Nucleus_export;Nucleus_export;Nucleus                      9
Nucleus_export;Nucleus_export                              8
Nucleus;Nucleus;Nucleus;

In [47]:
import pandas as pd
import re

def remove_duplicate_motifs(text):
    """Drop repeated labels from a ';'-separated motif string.

    Each label is kept once, at the position of its first occurrence.

    Args:
        text: labels joined by ';'.

    Returns:
        The ';'-joined labels with later repeats removed.
    """
    # dict keys are unique and remember insertion order, so this keeps the
    # first occurrence of every label.
    first_seen = dict.fromkeys(text.split(';'))
    return ';'.join(first_seen)

# Collapse repeated labels inside each MOTIF string (first occurrence wins),
# overwriting the MOTIF column of combined_df.
combined_df['MOTIF'] = combined_df['MOTIF'].apply(remove_duplicate_motifs)

display(combined_df)

# Label frequencies after de-duplication.
value_counts = combined_df['MOTIF'].value_counts()
print(value_counts)


Unnamed: 0,Entry,MOTIF,Sequence
0,O49203,dual,MSSQICRSASKAAKSLLSSAKNARFFSEGRAIGAAAAVSASGKIPL...
1,P92947,dual,MSAVRRVMALASTTLPTKSGLSLWCPSSPSLARRFPARFSPIGSRI...
2,Q42592,dual,MAERVSLTLNGTLLSPPPTTTTTTMSSSLRSTTAASLLLRSSSSSS...
3,Q66GI4,dual,MLRLTCFTPSFSRACCPLFAMMLKVPSVHLHHPRFSPFRFYHTSLL...
4,Q8L785,dual,MAILHFSLPLIVSFLRPHASPRFFLLPRSLSQSPFLSRRRFHRTSA...
...,...,...,...
362,Q8J0D2,SIGNAL,MVAVTSLGKALTALSILASLAVAKEHYEFKNVAIGGGGYITGIVAH...
363,Q96X54,SIGNAL,MVAFSALSGVSALSLLLCLVQHAHGVSLKVSTQGGNSSSPILYGFM...
364,Q9HGE1,SIGNAL,MVSIKSVLAAATAVSSALAAPFDFVPRDNSTALQARQVTPNAEGWH...
365,B3STN5,SIGNAL,MNILLATASFVLSLGFVKAEPTRHHDRYAYIERVVCDVNFPDLLCR...


SIGNAL                          3196
Mitochondrion                    791
Nucleus                          563
chloroplast                      345
ER                               123
Nucleus_export;Nucleus            63
Peroxisome                        62
Nucleus_export                    61
Thylakoid                         44
ER;SIGNAL                         23
dual                              17
Nucleus;SIGNAL                     7
Thylakoid;chloroplast              2
chloroplast;Nucleus                1
Nucleus_export;Mitochondrion       1
Mitochondrion;Nucleus              1
Name: MOTIF, dtype: int64


In [48]:
def split_set(validation_partition=0, test_partition=1):
    """Split the global ``combined_df`` into validation, test and train frames.

    'partition.csv' is read from the working directory and left-joined onto
    ``combined_df`` (``Entry`` matched against ``entry``). The helper columns
    'entry', 'type' and 'group' are dropped, and the 'partition' column is
    dropped from each returned frame.

    Because the join is a left join, an entry with no row in partition.csv
    has a missing partition value; it matches neither selected partition and
    therefore ends up in the train frame.

    Args:
        validation_partition: partition value selected for validation.
        test_partition: partition value selected for test.

    Returns:
        Tuple ``(validation_set, test_set, train_set)`` of DataFrames.
    """
    motif_table = combined_df
    partition_table = pd.read_csv('partition.csv')
    labelled = pd.merge(
        motif_table,
        partition_table,
        how='left',
        left_on='Entry',
        right_on='entry',
    ).drop(columns=['entry', 'type', 'group'])

    fold = labelled['partition']
    in_validation = fold.eq(validation_partition)
    in_test = fold.eq(test_partition)
    in_train = fold.ne(test_partition) & fold.ne(validation_partition)

    def _rows(mask):
        # Rows selected by the mask, without the partition bookkeeping column.
        return labelled.loc[mask].drop(columns=['partition'])

    return _rows(in_validation), _rows(in_test), _rows(in_train)

def _write_split(split_df, path):
    """Write one split to ``path`` as a single tab-joined column.

    The joined text is built in a new Series, so ``split_df`` is not modified.
    """
    joined = (
        split_df['Entry'].astype(str)
        + '\t' + split_df['MOTIF'].astype(str)
        + '\t' + split_df['Sequence'].astype(str)
    )
    # One column whose header and values already contain the tabs, written
    # with the default comma separator: the file is effectively tab-separated.
    joined.to_frame(name='Entry\tEC number\tSequence').to_csv(path, index=False)


def save_split(validation_set, test_set, train_set, validation_partition=0, test_partition=1):
    """Write the three splits to the 'partition/' folder.

    Each file has the header 'Entry<TAB>EC number<TAB>Sequence' and one line
    per row holding the Entry, MOTIF and Sequence values joined by tabs.

    Files written (relative to the working directory):
      * partition/validation_<validation_partition>.csv
      * partition/test_<test_partition>.csv
      * partition/train_<validation_partition><test_partition>.csv

    Unlike the previous version, the DataFrames passed in are left untouched
    (no helper column is added to them), and the 'partition' folder is
    created when it does not exist yet.

    Args:
        validation_set: DataFrame with 'Entry', 'MOTIF' and 'Sequence' columns.
        test_set: DataFrame with the same columns.
        train_set: DataFrame with the same columns.
        validation_partition: value used in the validation/train file names.
        test_partition: value used in the test/train file names.
    """
    os.makedirs('partition', exist_ok=True)
    _write_split(validation_set, 'partition/validation_' + str(validation_partition) + '.csv')
    _write_split(test_set, 'partition/test_' + str(test_partition) + '.csv')
    _write_split(train_set, 'partition/train_' + str(validation_partition) + str(test_partition) + '.csv')

In [49]:
# Build and save one split: partition 0 for validation, partition 1 for test.
# a, b, c are the validation, test and train frames, in that order.
# The five-fold loop in the following cell writes this same (0, 1)
# combination again on its first iteration.
a, b, c = split_set(0, 1)
# print(a)
save_split(a, b, c, 0, 1)

In [51]:
# Rotate through partitions 0-4: partition i is used for validation and the
# next one (wrapping from 4 back to 0) for test; each split is written to disk.
for i in range(5):
    valid_num = i
    test_num = (valid_num + 1) % 5
    validation_set, test_set, train_set = split_set(valid_num, test_num)
    save_split(validation_set, test_set, train_set, valid_num, test_num)