In [16]:
import os
import pandas as pd

# Single location for every dataset this notebook reads or writes.
SAVE_PATH = "../../datasets/"

# Ensure the save path exists
os.makedirs(SAVE_PATH, exist_ok=True)

# Build the input paths from SAVE_PATH (as the predictor dataset load does)
# so that changing the directory in one place moves reads and writes together.
df_moondb_dataset = pd.read_csv(os.path.join(SAVE_PATH, "moondb_dataset.csv"))
df_moonprot_dataset = pd.read_csv(os.path.join(SAVE_PATH, "moonprot_dataset.csv"))

In [3]:
# Show the column index of both source tables in a single print call;
# the MoonDB listing is framed by a 40-dash rule above and below.
rule = "-" * 40
print(
    "\n\nMoonDB dataset:",
    rule,
    df_moondb_dataset.columns,
    rule,
    "\nMoonProt dataset:",
    df_moonprot_dataset.columns,
    sep="\n",
)



MoonDB dataset:
----------------------------------------
Index(['UniProt IDs', 'PDB ID', 'Gene Name', 'Protein Name',
       'Amino Acid Sequence', 'Organism Name', 'Status', 'Protein Existence',
       'GO BP Terms', 'GO CC Terms', 'GO MF Terms'],
      dtype='object')
----------------------------------------

MoonProt dataset:
Index(['UniProt IDs', 'PDB ID', 'Gene Name', 'Protein Name',
       'Amino Acid Sequence', 'Organism Name', 'Status', 'Protein Existence',
       'GO BP Terms', 'GO CC Terms', 'GO MF Terms'],
      dtype='object')


In [4]:
# Uniprot IDs
# Kept as sets: they are only used for membership tests (`isin`) further down.
uniprot_ids_moondb = set(df_moondb_dataset["UniProt IDs"])
uniprot_ids_moonprot = set(df_moonprot_dataset["UniProt IDs"])

# Build the ID tables from the source columns rather than from the sets.
# Iterating a set of strings yields a different order in every Python session
# (hash randomization), which made the saved CSVs change between runs.
# drop_duplicates() keeps the first occurrence, so the row order is the
# order of first appearance in the source table and is reproducible.
df_uniprot_ids_moondb = (
    df_moondb_dataset[["UniProt IDs"]].drop_duplicates().reset_index(drop=True)
)
df_uniprot_ids_moonprot = (
    df_moonprot_dataset[["UniProt IDs"]].drop_duplicates().reset_index(drop=True)
)

# Save the Uniprot IDs
df_uniprot_ids_moondb.to_csv(os.path.join(SAVE_PATH, "uniprot_ids_moondb.csv"), index=False)
df_uniprot_ids_moonprot.to_csv(os.path.join(SAVE_PATH, "uniprot_ids_moonprot.csv"), index=False)

In [5]:
# Read the predictor table that every subset below is carved out of.
predictor_dataset_path = f"{SAVE_PATH}predictor_dataset.csv"
df = pd.read_csv(predictor_dataset_path)

In [14]:
def _print_class_summary(header, subset):
    """Print ``header``, the row count of ``subset`` and its True/False tallies in 'Class'."""
    print(header)
    print(f"Total proteins: {subset.shape[0]}")
    class_counts = subset['Class'].value_counts()
    print(f"Number of True values: {class_counts.get(True, 0)}")
    print(f"Number of False values: {class_counts.get(False, 0)}")


# Membership of every row of the main dataset in each source database.
in_moondb = df["UniProt IDs"].isin(uniprot_ids_moondb)
in_moonprot = df["UniProt IDs"].isin(uniprot_ids_moonprot)

# Rows whose UniProt ID is listed in MoonDB
df_moondb = df[in_moondb]
_print_class_summary("\nMoonDB proteins:", df_moondb)

# Rows whose UniProt ID is listed in MoonProt
df_moonprot = df[in_moonprot]
_print_class_summary("\nMoonProt proteins:", df_moonprot)

# Rows whose UniProt ID is listed in neither database
df_non_moondb_moonprot = df[~(in_moondb | in_moonprot)]
_print_class_summary("\nNon-MoonDB Non-MoonProt proteins:", df_non_moondb_moonprot)


MoonDB proteins:
Total proteins: 315
Number of True values: 310
Number of False values: 5

MoonProt proteins:
Total proteins: 288
Number of True values: 280
Number of False values: 8

Non-MoonDB Non-MoonProt proteins:
Total proteins: 688
Number of True values: 0
Number of False values: 688


In [9]:
# Tally of each value in the 'Class' column across the whole dataset.
class_distribution = df["Class"].value_counts()
print("\nNumber of moonlight proteins:", class_distribution, sep="\n")


Number of moonlight proteins:
Class
False    700
True     561
Name: count, dtype: int64


In [None]:
# Strategy 1: Fully Balanced
def strategy_fully_balanced(df_moondb, df_moonprot, df_non_moondb, save_path,
                            n_train=310, n_test=280, random_state=42):
    """Write a class-balanced train/test split as ``strategy1_*.csv``.

    Training positives are sampled from ``df_moondb`` and test positives from
    ``df_moonprot``; each split gets the same number of negatives, taken from
    ``df_non_moondb``. Negatives are drawn once and then divided, so no
    negative row is written to both files.

    Args:
        df_moondb: Frame with a 'Class' column; source of training positives.
        df_moonprot: Frame with a 'Class' column; source of test positives.
        df_non_moondb: Frame with a 'Class' column; source of all negatives.
        save_path: Directory the two CSV files are written into.
        n_train: Rows per class in the training file (default 310, the value
            previously hard-coded).
        n_test: Rows per class in the test file (default 280, the value
            previously hard-coded).
        random_state: Seed passed to every ``sample`` call.

    Raises:
        ValueError: If a frame holds fewer rows of the needed class than
            requested. For negatives, ``n_train + n_test`` rows are required.

    NOTE(review): nothing here removes rows that appear in both ``df_moondb``
    and ``df_moonprot``, so a positive could still land in both files —
    confirm whether the caller de-duplicates.
    """
    train_true = df_moondb[df_moondb['Class'] == True].sample(n=n_train, random_state=random_state)
    test_true = df_moonprot[df_moonprot['Class'] == True].sample(n=n_test, random_state=random_state)

    negatives = df_non_moondb[df_non_moondb['Class'] == False]
    n_negatives = n_train + n_test
    if len(negatives) < n_negatives:
        raise ValueError(
            f"Need {n_negatives} negative rows for disjoint train/test sets, "
            f"but only {len(negatives)} are available."
        )

    # Two separate sample() calls with the same seed on the same frame pick
    # overlapping rows; one draw split by position keeps the sets disjoint.
    drawn_negatives = negatives.sample(n=n_negatives, random_state=random_state)
    train_false = drawn_negatives.iloc[:n_train]
    test_false = drawn_negatives.iloc[n_train:]

    train = pd.concat([train_true, train_false])
    test = pd.concat([test_true, test_false])

    train.to_csv(os.path.join(save_path, "strategy1_train.csv"), index=False)
    test.to_csv(os.path.join(save_path, "strategy1_test.csv"), index=False)

In [None]:
# Strategy 2: Realistic Proportions
def strategy_realistic_proportions(df_moondb, df_moonprot, df_non_moondb, save_path,
                                   n_extra_negatives=305, random_state=42):
    """Write the "realistic proportions" split as ``strategy2_*.csv``.

    The training file holds every True and every False row of ``df_moondb``
    plus ``n_extra_negatives`` False rows sampled from ``df_non_moondb``.
    The test file holds the True rows of ``df_moonprot`` followed by its
    False rows. Rows whose 'Class' is neither True nor False are left out.

    Args:
        df_moondb: Frame with a 'Class' column; used in full for training.
        df_moonprot: Frame with a 'Class' column; used in full for testing.
        df_non_moondb: Frame with a 'Class' column; source of the extra
            training negatives.
        save_path: Directory the two CSV files are written into.
        n_extra_negatives: How many negatives to sample from
            ``df_non_moondb`` (default 305, the value previously hard-coded).
        random_state: Seed for the negative sample.

    Raises:
        ValueError: Raised by pandas when ``df_non_moondb`` has fewer than
            ``n_extra_negatives`` False rows.

    NOTE(review): rows present in both ``df_moondb`` and ``df_moonprot`` are
    not filtered here and would appear in both files — confirm intended.
    """
    sampled_negatives = df_non_moondb[df_non_moondb['Class'] == False].sample(
        n=n_extra_negatives, random_state=random_state
    )

    # Order in the file: MoonDB positives, MoonDB negatives, sampled negatives.
    train = pd.concat([
        df_moondb[df_moondb['Class'] == True],
        df_moondb[df_moondb['Class'] == False],
        sampled_negatives,
    ])

    test = pd.concat([
        df_moonprot[df_moonprot['Class'] == True],
        df_moonprot[df_moonprot['Class'] == False],
    ])

    train.to_csv(os.path.join(save_path, "strategy2_train.csv"), index=False)
    test.to_csv(os.path.join(save_path, "strategy2_test.csv"), index=False)


In [None]:
# Strategy 3: Balanced Training, Imbalanced Testing
def strategy_balanced_training_imbalanced_testing(df_moondb, df_moonprot, df_non_moondb, save_path,
                                                  n_per_class=280, random_state=42):
    """Write a balanced training file and an unsampled test file as ``strategy3_*.csv``.

    The training file holds ``n_per_class`` True rows sampled from
    ``df_moondb`` followed by ``n_per_class`` False rows sampled from
    ``df_non_moondb``. The test file holds the True rows of ``df_moonprot``
    followed by its False rows, with no sampling.

    Args:
        df_moondb: Frame with a 'Class' column; source of training positives.
        df_moonprot: Frame with a 'Class' column; used in full for testing.
        df_non_moondb: Frame with a 'Class' column; source of training
            negatives.
        save_path: Directory the two CSV files are written into.
        n_per_class: Rows sampled for each training class (default 280, the
            value previously hard-coded).
        random_state: Seed passed to both ``sample`` calls.

    Raises:
        ValueError: Raised by pandas when either source has fewer than
            ``n_per_class`` rows of the requested class.

    NOTE(review): rows present in both ``df_moondb`` and ``df_moonprot`` are
    not filtered here and could appear in both files — confirm intended.
    """
    train_true = df_moondb[df_moondb['Class'] == True].sample(
        n=n_per_class, random_state=random_state
    )
    train_false = df_non_moondb[df_non_moondb['Class'] == False].sample(
        n=n_per_class, random_state=random_state
    )
    train = pd.concat([train_true, train_false])

    test = pd.concat([
        df_moonprot[df_moonprot['Class'] == True],
        df_moonprot[df_moonprot['Class'] == False],
    ])

    train.to_csv(os.path.join(save_path, "strategy3_train.csv"), index=False)
    test.to_csv(os.path.join(save_path, "strategy3_test.csv"), index=False)

In [17]:
# Strategy 4: Cross-Domain Generalization
def strategy_cross_domain_generalization(df_moondb, df_moonprot, save_path):
    """Write ``df_moondb`` as the training file and ``df_moonprot`` as the test file.

    Both frames are saved unchanged (no sampling or filtering) under
    ``save_path`` as ``strategy4_train.csv`` and ``strategy4_test.csv``,
    without the index column.

    NOTE(review): a row present in both inputs ends up in both files —
    confirm that is intended.
    """
    outputs = (
        ("strategy4_train.csv", df_moondb),
        ("strategy4_test.csv", df_moonprot),
    )
    for filename, frame in outputs:
        frame.to_csv(os.path.join(save_path, filename), index=False)

In [18]:
# Strategy 5: Use All Positives, Vary Negatives
def strategy_use_all_positives_vary_negatives(df_moondb, df_moonprot, df_non_moondb, save_path,
                                              random_state=42):
    """Write an all-positives training file and a held-out negatives test file as ``strategy5_*.csv``.

    The training file holds every distinct True row from ``df_moondb`` and
    ``df_moonprot`` plus an equal number of False rows from
    ``df_non_moondb``. The test file holds the False rows of
    ``df_non_moondb`` that were NOT used for training, in shuffled order.

    Two defects of the earlier version are addressed:
    * a row present in both positive sources was written twice, which also
      inflated the number of negatives sampled;
    * the test file was every negative row, so it contained all of the
      training negatives.

    Args:
        df_moondb: Frame with a 'Class' column; first source of positives.
        df_moonprot: Frame with a 'Class' column; second source of positives.
        df_non_moondb: Frame with a 'Class' column; source of all negatives.
        save_path: Directory the two CSV files are written into.
        random_state: Seed for shuffling the negatives.

    Raises:
        ValueError: If there are fewer negative rows than distinct positives.
    """
    # drop_duplicates() compares whole rows, so only exact repeats are removed.
    train_true = pd.concat([
        df_moondb[df_moondb['Class'] == True],
        df_moonprot[df_moonprot['Class'] == True]
    ]).drop_duplicates()

    negatives = df_non_moondb[df_non_moondb['Class'] == False]
    n_train_negatives = len(train_true)
    if n_train_negatives > len(negatives):
        raise ValueError(
            f"Need {n_train_negatives} negative rows to match the positives, "
            f"but only {len(negatives)} are available."
        )

    # Shuffle once and split by position so the training negatives and the
    # held-out test negatives cannot share a row.
    shuffled_negatives = negatives.sample(frac=1, random_state=random_state)
    train_false = shuffled_negatives.iloc[:n_train_negatives]
    test = shuffled_negatives.iloc[n_train_negatives:]

    train = pd.concat([train_true, train_false])

    train.to_csv(os.path.join(save_path, "strategy5_train.csv"), index=False)
    test.to_csv(os.path.join(save_path, "strategy5_test.csv"), index=False)

In [None]:
# TODO: make a backup!