In [16]:
import os
import pandas as pd

SAVE_PATH = "../../datasets/"

# Ensure the save path exists
os.makedirs(SAVE_PATH, exist_ok=True)

df_moondb_dataset = pd.read_csv("../../datasets/moondb_dataset.csv")
df_moonprot_dataset = pd.read_csv("../../datasets/moonprot_dataset.csv")

In [3]:
print("\n\nMoonDB dataset:")
print('--'*20)
print(df_moondb_dataset.columns)
print('--'*20)
print("\nMoonProt dataset:")
print(df_moonprot_dataset.columns)



MoonDB dataset:
----------------------------------------
Index(['UniProt IDs', 'PDB ID', 'Gene Name', 'Protein Name',
       'Amino Acid Sequence', 'Organism Name', 'Status', 'Protein Existence',
       'GO BP Terms', 'GO CC Terms', 'GO MF Terms'],
      dtype='object')
----------------------------------------

MoonProt dataset:
Index(['UniProt IDs', 'PDB ID', 'Gene Name', 'Protein Name',
       'Amino Acid Sequence', 'Organism Name', 'Status', 'Protein Existence',
       'GO BP Terms', 'GO CC Terms', 'GO MF Terms'],
      dtype='object')


In [4]:
# Uniprot IDs
uniprot_ids_moondb = set(df_moondb_dataset["UniProt IDs"])
uniprot_ids_moonprot = set(df_moonprot_dataset["UniProt IDs"])

# Save the Uniprot IDs as csv files
df_uniprot_ids_moondb = pd.DataFrame(uniprot_ids_moondb, columns=["UniProt IDs"])
df_uniprot_ids_moonprot = pd.DataFrame(uniprot_ids_moonprot, columns=["UniProt IDs"])

# Save the Uniprot IDs
df_uniprot_ids_moondb.to_csv(SAVE_PATH + "uniprot_ids_moondb.csv", index=False)
df_uniprot_ids_moonprot.to_csv(SAVE_PATH + "uniprot_ids_moonprot.csv", index=False)

In [5]:
# Load main dataset
df = pd.read_csv(SAVE_PATH + "predictor_dataset.csv")

In [14]:
# MoonDB proteins
df_moondb = df[df["UniProt IDs"].isin(uniprot_ids_moondb)]
print("\nMoonDB proteins:")
print(f"Total proteins: {df_moondb.shape[0]}")
print(f"Number of True values: {df_moondb['Class'].value_counts().get(True, 0)}")
print(f"Number of False values: {df_moondb['Class'].value_counts().get(False, 0)}")

# MoonProt proteins
df_moonprot = df[df["UniProt IDs"].isin(uniprot_ids_moonprot)]
print("\nMoonProt proteins:")
print(f"Total proteins: {df_moonprot.shape[0]}")
print(f"Number of True values: {df_moonprot['Class'].value_counts().get(True, 0)}")
print(f"Number of False values: {df_moonprot['Class'].value_counts().get(False, 0)}")

# Non-MoonDB Non-MoonProt proteins
df_non_moondb_moonprot = df[~df["UniProt IDs"].isin(uniprot_ids_moondb)]
df_non_moondb_moonprot = df_non_moondb_moonprot[~df_non_moondb_moonprot["UniProt IDs"].isin(uniprot_ids_moonprot)]
print("\nNon-MoonDB Non-MoonProt proteins:")
print(f"Total proteins: {df_non_moondb_moonprot.shape[0]}")
print(f"Number of True values: {df_non_moondb_moonprot['Class'].value_counts().get(True, 0)}")
print(f"Number of False values: {df_non_moondb_moonprot['Class'].value_counts().get(False, 0)}")


MoonDB proteins:
Total proteins: 315
Number of True values: 310
Number of False values: 5

MoonProt proteins:
Total proteins: 288
Number of True values: 280
Number of False values: 8

Non-MoonDB Non-MoonProt proteins:
Total proteins: 688
Number of True values: 0
Number of False values: 688


In [9]:
# Number of moonlight proteins
print("\nNumber of moonlight proteins:")
print(df["Class"].value_counts())


Number of moonlight proteins:
Class
False    700
True     561
Name: count, dtype: int64


In [None]:
# Strategy 1: Fully Balanced
def strategy_fully_balanced(df_moondb, df_moonprot, df_non_moondb, save_path):
    train_true = df_moondb[df_moondb['Class'] == True].sample(n=310, random_state=42)
    train_false = df_non_moondb[df_non_moondb['Class'] == False].sample(n=310, random_state=42)
    train = pd.concat([train_true, train_false])

    test_true = df_moonprot[df_moonprot['Class'] == True].sample(n=280, random_state=42)
    test_false = df_non_moondb[df_non_moondb['Class'] == False].sample(n=280, random_state=42)
    test = pd.concat([test_true, test_false])

    train.to_csv(os.path.join(save_path, "strategy1_train.csv"), index=False)
    test.to_csv(os.path.join(save_path, "strategy1_test.csv"), index=False)

In [None]:
# Strategy 2: Realistic Proportions
def strategy_realistic_proportions(df_moondb, df_moonprot, df_non_moondb, save_path):
    train_true = df_moondb[df_moondb['Class'] == True]
    train_false = pd.concat([
        df_moondb[df_moondb['Class'] == False],
        df_non_moondb[df_non_moondb['Class'] == False].sample(n=305, random_state=42)
    ])
    train = pd.concat([train_true, train_false])

    test_true = df_moonprot[df_moonprot['Class'] == True]
    test_false = df_moonprot[df_moonprot['Class'] == False]
    test = pd.concat([test_true, test_false])

    train.to_csv(os.path.join(save_path, "strategy2_train.csv"), index=False)
    test.to_csv(os.path.join(save_path, "strategy2_test.csv"), index=False)


In [None]:
# Strategy 3: Balanced Training, Imbalanced Testing
def strategy_balanced_training_imbalanced_testing(df_moondb, df_moonprot, df_non_moondb, save_path):
    train_true = df_moondb[df_moondb['Class'] == True].sample(n=280, random_state=42)
    train_false = df_non_moondb[df_non_moondb['Class'] == False].sample(n=280, random_state=42)
    train = pd.concat([train_true, train_false])

    test_true = df_moonprot[df_moonprot['Class'] == True]
    test_false = df_moonprot[df_moonprot['Class'] == False]
    test = pd.concat([test_true, test_false])

    train.to_csv(os.path.join(save_path, "strategy3_train.csv"), index=False)
    test.to_csv(os.path.join(save_path, "strategy3_test.csv"), index=False)

In [17]:
# Strategy 4: Cross-Domain Generalization
def strategy_cross_domain_generalization(df_moondb, df_moonprot, save_path):
    train = df_moondb
    test = df_moonprot

    train.to_csv(os.path.join(save_path, "strategy4_train.csv"), index=False)
    test.to_csv(os.path.join(save_path, "strategy4_test.csv"), index=False)

In [18]:
# Strategy 5: Use All Positives, Vary Negatives
def strategy_use_all_positives_vary_negatives(df_moondb, df_moonprot, df_non_moondb, save_path):
    train_true = pd.concat([
        df_moondb[df_moondb['Class'] == True],
        df_moonprot[df_moonprot['Class'] == True]
    ])
    train_false = df_non_moondb[df_non_moondb['Class'] == False].sample(n=len(train_true), random_state=42)
    train = pd.concat([train_true, train_false])

    test = df_non_moondb[df_non_moondb['Class'] == False]

    train.to_csv(os.path.join(save_path, "strategy5_train.csv"), index=False)
    test.to_csv(os.path.join(save_path, "strategy5_test.csv"), index=False)

In [None]:
# FER BUCK UPP!!!!!!!!!!!!!!!!

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def split_unified_dataset(unified_dataset, test_size=0.4, random_state=42, balance_by_composition=False):
    """
    Combines the three datasets into one unified dataset and splits it into balanced train and test sets.

    Args:
        df_moondb (pd.DataFrame): MoonDB dataset.
        df_moonprot (pd.DataFrame): MoonProt dataset.
        df_non_moondb (pd.DataFrame): Non-MoonDB dataset.
        test_size (float): Proportion of the dataset to include in the test split.
        random_state (int): Random seed for reproducibility.
        balance_by_composition (bool): Whether to balance the dataset by amino acid composition.

    Returns:
        tuple: Train and test datasets as pandas DataFrames.
    """

    # Add a column for sequence length
    unified_dataset['Sequence Length'] = unified_dataset['Amino Acid Sequence'].apply(len)

    def balance_by_class_and_sequence_length(df):
        """Balances the dataset by 'Class' and sequence length."""
        # Split by class
        class_1 = df[df['Class'] == 1]
        class_0 = df[df['Class'] == 0]
    
        # Add a small check to ensure there are enough samples for stratification
        def safe_train_test_split(data, test_size, random_state, stratify_column):
            if len(data[stratify_column].unique()) < 2 or data[stratify_column].value_counts().min() < 2:
                # If stratification is not possible, perform a random split
                return train_test_split(data, test_size=test_size, random_state=random_state)
            else:
                # Perform stratified split
                return train_test_split(data, test_size=test_size, random_state=random_state, stratify=data[stratify_column])
    
        # Add sequence length column
        class_1['Sequence Length'] = class_1['Amino Acid Sequence'].apply(len)
        class_0['Sequence Length'] = class_0['Amino Acid Sequence'].apply(len)
    
        # Perform safe train-test split
        train_1, test_1 = safe_train_test_split(class_1, test_size=test_size, random_state=random_state, stratify_column='Sequence Length')
        train_0, test_0 = safe_train_test_split(class_0, test_size=test_size, random_state=random_state, stratify_column='Sequence Length')
    
        # Combine balanced train and test sets
        train = pd.concat([train_1, train_0]).sample(frac=1, random_state=random_state).reset_index(drop=True)
        test = pd.concat([test_1, test_0]).sample(frac=1, random_state=random_state).reset_index(drop=True)
    
        return train, test

    # Balance by amino acid composition
    def balance_by_amino_acid_composition(df):
        """Balances the dataset by amino acid composition."""
        # Calculate amino acid composition for each sequence
        def amino_acid_composition(sequence):
            amino_acids = "ACDEFGHIKLMNPQRSTVWY"
            return {aa: sequence.count(aa) for aa in amino_acids}

        df['Amino Acid Composition'] = df['Amino Acid Sequence'].apply(amino_acid_composition)
        # Further balancing logic can be added here if needed
        return df

    # Apply balancing by class and sequence length
    train, test = balance_by_class_and_sequence_length(unified_dataset)

    # Optionally balance by amino acid composition
    if balance_by_composition:
        train = balance_by_amino_acid_composition(train)
        test = balance_by_amino_acid_composition(test)

    return train, test


In [10]:
# Summary of sequence lengths
def summarize_lengths(dataset, name):
    lengths = dataset['Amino Acid Sequence'].apply(len)
    print(f"\n{name} Dataset - Sequence Lengths:")
    print(f"Min Length: {lengths.min()}")
    print(f"Max Length: {lengths.max()}")
    print(f"Mean Length: {lengths.mean():.2f}")

# Summary of amino acid composition
def summarize_amino_acids(dataset, name):
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    print(f"\n{name} Dataset - Amino Acid Composition:")
    composition = dataset['Amino Acid Sequence'].apply(lambda seq: {aa: seq.count(aa) for aa in amino_acids})
    composition_df = pd.DataFrame(composition.tolist())
    print(f"Mean Composition:\n{composition_df.mean()}")

In [2]:
import pandas as pd

# Load the datasets
predictor_dataset = pd.read_csv("../../datasets/predictor_dataset.csv")

In [14]:
# Split the unified dataset
train_dataset, test_dataset = split_unified_dataset(predictor_dataset, balance_by_composition=True)

# Length of the train and test datasets
print(f"Train Dataset Length: {len(train_dataset)}")
print(f"Test Dataset Length: {len(test_dataset)}")

# Target distribution
print("\nTrain Dataset - Target Distribution:")
print(train_dataset['Class'].value_counts(normalize=True))
print("\nTest Dataset - Target Distribution:")
print(test_dataset['Class'].value_counts(normalize=True))

# Summarize sequence lengths
summarize_lengths(train_dataset, "Train")
summarize_lengths(test_dataset, "Test")

# Summarize amino acid composition
summarize_amino_acids(train_dataset, "Train")
summarize_amino_acids(test_dataset, "Test")

Train Dataset Length: 756
Test Dataset Length: 505

Train Dataset - Target Distribution:
Class
False    0.555556
True     0.444444
Name: proportion, dtype: float64

Test Dataset - Target Distribution:
Class
False    0.554455
True     0.445545
Name: proportion, dtype: float64

Train Dataset - Sequence Lengths:
Min Length: 44
Max Length: 34350
Mean Length: 553.73

Test Dataset - Sequence Lengths:
Min Length: 49
Max Length: 4966
Mean Length: 514.47

Train Dataset - Amino Acid Composition:
Mean Composition:
A    40.441799
C     9.633598
D    29.285714
E    40.399471
F    19.104497
G    37.236772
H    12.390212
I    26.582011
K    35.880952
L    49.875661
M    12.050265
N    22.280423
P    33.456349
Q    24.768519
R    29.201058
S    43.044974
T    30.907407
V    35.411376
W     6.226190
Y    15.550265
dtype: float64

Test Dataset - Amino Acid Composition:
Mean Composition:
A    38.247525
C     8.776238
D    27.578218
E    36.017822
F    18.746535
G    34.651485
H    11.637624
I    25.09901

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  class_1['Sequence Length'] = class_1['Amino Acid Sequence'].apply(len)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  class_0['Sequence Length'] = class_0['Amino Acid Sequence'].apply(len)
