#### 20% blinding

In [3]:
import pandas as pd
import random
import os

# Seed Python's global RNG so the protein shuffle below is repeatable
random.seed(42)

# Read the two original splits and stack them into one frame with a fresh index
train_df = pd.read_csv("data/davis_b3_train.csv")
test_df = pd.read_csv("data/davis_b3_test.csv")
combined_df = pd.concat([train_df, test_df], ignore_index=True)

# Distinct protein sequences, placed in a random (seeded) order
all_proteins = list(combined_df["target_sequence"].unique())
random.shuffle(all_proteins)

# Fold sizing: every fold gets `proteins_per_fold` proteins and the first
# `leftover` folds take one extra so that every protein lands in exactly one fold
num_folds = 5
proteins_per_fold, leftover = divmod(len(all_proteins), num_folds)

# Cut the shuffled list into consecutive, non-overlapping slices
folds = []
start_idx = 0
for fold in range(num_folds):
    fold_size = proteins_per_fold + (1 if fold < leftover else 0)
    end_idx = start_idx + fold_size
    folds.append(all_proteins[start_idx:end_idx])
    start_idx = end_idx

# Make sure the destination directory is present
output_dir = "data"
os.makedirs(output_dir, exist_ok=True)

# For each fold: rows whose protein is in the fold become the test set,
# all remaining rows become the train set; both are written as CSV
for fold_idx, fold_proteins in enumerate(folds):
    proteins_to_blind = set(fold_proteins)

    # Boolean row mask, computed once and used for both halves of the split
    is_blinded = combined_df["target_sequence"].isin(proteins_to_blind)
    test_df_blinded = combined_df[is_blinded]
    train_df_blinded = combined_df[~is_blinded]

    # File suffixes are one-based
    fold_number = fold_idx + 1
    train_file = os.path.join(output_dir, f"davis_b3_train_{fold_number}.csv")
    test_file = os.path.join(output_dir, f"davis_b3_test_{fold_number}.csv")
    train_df_blinded.to_csv(train_file, index=False)
    test_df_blinded.to_csv(test_file, index=False)

    # NOTE(review): the label printed here is zero-based (fold_idx) while the
    # file names above are one-based, so "Fold 0" corresponds to the "_1" files.
    print(f"Fold {fold_idx}: {len(proteins_to_blind)} proteins blinded.")
    print(f"  Train set -> {train_file}")
    print(f"  Test set  -> {test_file}\n")


Fold 0: 76 proteins blinded.
  Train set -> data/davis_b3_train_1.csv
  Test set  -> data/davis_b3_test_1.csv

Fold 1: 76 proteins blinded.
  Train set -> data/davis_b3_train_2.csv
  Test set  -> data/davis_b3_test_2.csv

Fold 2: 76 proteins blinded.
  Train set -> data/davis_b3_train_3.csv
  Test set  -> data/davis_b3_test_3.csv

Fold 3: 76 proteins blinded.
  Train set -> data/davis_b3_train_4.csv
  Test set  -> data/davis_b3_test_4.csv

Fold 4: 75 proteins blinded.
  Train set -> data/davis_b3_train_5.csv
  Test set  -> data/davis_b3_test_5.csv



Run `create_data` on each train/test pair.

In [8]:
import torch
import numpy as np
from torch_geometric.loader import DataLoader
from models.ginconv import GINConvNet
from utils import TestbedDataset  # Adjust the import path as needed

# Use CUDA when torch reports it as available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Validation dataset: TestbedDataset rooted at 'data' for the 'pharos_test' set
validation_data = TestbedDataset(root='data', dataset='pharos_test')

# Wrap it in a loader that yields batches of 32 in dataset order (no shuffling)
validation_loader = DataLoader(validation_data, batch_size=32, shuffle=False)
print(type(validation_loader))

# First individual sample, to see the per-item fields
print(validation_data[0])

# Pull just the first batch from the loader and show its structure (debugging aid)
batch = next(iter(validation_loader), None)
if batch is not None:
    print(batch)



Pre-processed data found: data/processed/pharos_test.pt, loading ...
<class 'torch_geometric.loader.dataloader.DataLoader'>
Data(x=[40, 78], edge_index=[2, 86], y=[1], target=[1, 1000], c_size=[1], drug_id='5280', protein_id='Q86TW2')
DataBatch(x=[914, 78], edge_index=[2, 1984], y=[32], target=[32, 1000], c_size=[32], drug_id=[32], protein_id=[32], batch=[914], ptr=[33])
