In [None]:
import pandas as pd

# Read the predictor dataset. The cells in this notebook use the columns
# "Amino Acid Sequence", "UniProt IDs", "PDB ID" and "Class".
df = pd.read_csv("../DATASETS/predictor_dataset.csv")

# Array of the distinct amino-acid sequences found in the dataset.
sequences = df["Amino Acid Sequence"].unique()

# Row count next to the number of distinct values per identifier column.
# dropna=False counts a missing value as its own entry, which matches what
# len(column.unique()) reports.
n_rows = len(df)
n_unique_uniprot = df["UniProt IDs"].nunique(dropna=False)
n_unique_pdb = df["PDB ID"].nunique(dropna=False)

print(f"Total Sequences: {n_rows} Total Sequences unique: {len(sequences)}")
print(f"Unique UniProt IDs {n_unique_uniprot}")
print(f"Unique PDB IDs {n_unique_pdb}")

Total Sequences: 1261 Total Sequences unique: 1245
Unique UniProt IDs 1249
Unique PDB IDs 1079


In [None]:
# Occurrence count per sequence, restricted to sequences seen more than once.
duplicate_sequences = (
    df["Amino Acid Sequence"]
    .value_counts()
    .loc[lambda counts: counts > 1]
)

# Report how many times each repeated sequence occurs.
print("Finding Duplicated Sequences")
for sequence, count in duplicate_sequences.items():
    print(f"Repetitions: {count}, Sequence: {sequence}")

# Every row whose sequence is shared with at least one other row
# (keep=False flags all copies, not only the later ones).
is_repeated = df["Amino Acid Sequence"].duplicated(keep=False)
duplicates = df.loc[is_repeated]

# Show the label and identifiers of those rows, with identical sequences
# placed next to each other so differing IDs/classes are easy to spot.
report_columns = ["Class", "UniProt IDs", "PDB ID", "Amino Acid Sequence"]
print(duplicates.sort_values("Amino Acid Sequence")[report_columns])

Finding Duplicated Sequences
Repetitions: 2, Sequence: MGDVEKGKKIFVQKCAQCHTVEKGGKHKTGPNLHGLFGRKTGQAPGFTYTDANKNKGITWKEETLMEYLENPKKYIPGTKMIFAGIKKKTEREDLIAYLKKATNE
Repetitions: 2, Sequence: MRPGTGAERGGLMVSEMESHPPSQGPGDGERRLSGSSLCSGSWVSADGFLRRRPSMGHPGMHYAPMGMHPMGQRANMPPVPHGMMPQMMPPMGGPPMGQMPGMMSSVMPGMMMSHMSQASMQPALPPGVNSMDVAAGTASGAKSMWTEHKSPDGRTYYYNTETKQSTWEKPDDLKTPAEQLLSKCPWKEYKSDSGKPYYYNSQTKESRWAKPKELEDLEGYQNTIVAGSLITKSNLHAMIKAEESSKQEECTTTSTAPVPTTEIPTTMSTMAAAEAAAAVVAAAAAAAAAAAAANANASTSASNTVSGTVPVVPEPEVTSIVATVVDNENTVTISTEEQAQLTSTPAIQDQSVEVSSNTGEETSKQETVADFTPKKEEEESQPAKKTYTWNTKEEAKQAFKELLKEKRVPSNASWEQAMKMIINDPRYSALAKLSEKKQAFNAYKVQTEKEEKEEARSKYKEAKESFQRFLENHEKMTSTTRYKKAEQMFGEMEVWNAISERDRLEIYEDVLFFLSKKEKEQAKQLRKRNWEALKNILDNMANVTYSTTWSEAQQYLMDNPTFAEDEELQNMDKEDALICFEEHIRALEKEEEEEKQKSLLRERRRQRKNRESFQIFLDELHEHGQLHSMSSWMELYPTISSDIRFTNMLGQPGSTALDLFKFYVEDLKARYHDEKKIIKDILKDKGFVVEVNTTFEDFVAIISSTKRSTTLDAGNIKLAFNSLLEKAEAREREREKEEARKMKRKESAFKSMLKQAAPPIELDAVWEDIRERFVKEPAFEDITLESERKRIFKDFMHVLEHECQHHHSKNKK

In [None]:
# Number of rows carrying each distinct sequence.
sequence_counts = df["Amino Acid Sequence"].value_counts()

# Sequences that occur in exactly one row.
unique_sequences = sequence_counts[sequence_counts == 1].index

# Drop every copy of a repeated sequence (not just the surplus ones), keeping
# only the rows whose sequence occurs once in the dataset.
df_unique = df[df["Amino Acid Sequence"].isin(unique_sequences)]

# Persist the filtered dataset (relative path: current working directory).
df_unique.to_csv("filtered_dataset.csv", index=False)

# Percentage of remaining rows labelled Class == True. The element-wise
# comparison is summed directly instead of building a filtered frame, and the
# division is guarded so an empty result reports NaN rather than raising
# ZeroDivisionError after the file has already been written.
n_filtered = len(df_unique)
n_moonlight = int((df_unique["Class"] == True).sum())
moonlight_rate = n_moonlight * 100 / n_filtered if n_filtered else float("nan")

print("Remove all copies of sequences that appear multiple times")
print(f"Original dataset size: {len(df)}")
print(f"Filtered dataset size: {n_filtered}")
print(f"Filtered dataset #Uniprot IDs: {len(df_unique['UniProt IDs'].unique())}")
print(f"Moonlight proteins rate in the unique dataset: {moonlight_rate} %")

Remove all copies of sequences that appear multiple times
Original dataset size: 1261
Filtered dataset size: 1229
Filtered dataset #Uniprot IDs: 1229
Moonlight proteins rate in the unique dataset: 44.182262001627336 %
