In [1]:
import numpy as np
import pandas as pd
from project_root.dataset.protein_dataset import ProteinDataset

# -------------------------------
# Create dummy input data
# -------------------------------

# Seed NumPy's global generator so the random labels/attributes below (and
# any later np.random draws in the same kernel run) come out identical on
# every Restart & Run All.
SEED = 42
np.random.seed(SEED)

# Simulated UniProt IDs: "P00000" .. "P00009"
ids = [f"P{i:05d}" for i in range(10)]

# Dummy DataFrame with one row per ID; column lengths are tied to len(ids)
# so they cannot drift apart if the number of IDs is changed.
df = pd.DataFrame({
    "UniProt IDs": ids,
    "Class": np.random.randint(0, 2, size=len(ids)),  # binary label: 0 or 1
    "Max_MBL_CC": np.random.rand(len(ids))  # An extra attribute
})

In [2]:
# Dummy per-protein features keyed by the same IDs used in the DataFrame:
# one random vector (embedding) and one random 2-D array (attention) per ID.
embedding_dim = 128
attention_shape = (12, 128)

# All embeddings are drawn first, then all attention arrays.
embeddings = dict((uid, np.random.rand(embedding_dim)) for uid in ids)
attention_weights = dict((uid, np.random.rand(*attention_shape)) for uid in ids)

# -------------------------------
# Initialize ProteinDataset
# -------------------------------

# Construction prints a consistency check followed by a dataset report.
# NOTE(review): solve_inconsistencies=True presumably reconciles IDs that are
# missing from one of the three inputs - confirm against ProteinDataset.
protein_dataset = ProteinDataset(df, embeddings, attention_weights, solve_inconsistencies=True)


Checking consistency...
 - DataFrame IDs: 10
 - Embeddings IDs: 10
 - Attention Weights IDs: 10
Consistency checked.

ProteinDataset Report:
 - Number of samples: 10
 - Number of embeddings: 10
 - Number of attention weights: 10
 - Target column: Class
 - ID column: UniProt IDs
 - Save path: ./OUTPUTS/



In [3]:
# -------------------------------
# Test accessors
# -------------------------------

# Dataset size, then the first item, which unpacks as
# ((embedding, attention), label).
print("‚úÖ Length of dataset:", len(protein_dataset))
print("‚úÖ First sample (shapes):")
first_features, label = protein_dataset[0]
sample_emb, sample_attn = first_features
print("  - Embedding shape:", sample_emb.shape)
print("  - Attention shape:", sample_attn.shape)
print("  - Label:", label.item())

# Test getters: each one is looked up and called right before its line is
# printed, in the same order as before.
getter_checks = (
    ("\n‚úÖ Number of embeddings returned:", "get_embeddings"),
    ("‚úÖ Number of attention weights returned:", "get_attention_weights"),
    ("‚úÖ Number of labels returned:", "get_labels"),
    ("‚úÖ Number of IDs returned:", "get_ids"),
)
for caption, getter_name in getter_checks:
    print(caption, len(getattr(protein_dataset, getter_name)()))

# Test attribute access: show only the first three values of the extra column.
max_mbl_cc_head = protein_dataset.get_attribute("Max_MBL_CC")[:3]
print("\n‚úÖ Attribute Max_MBL_CC (first 3 values):", max_mbl_cc_head)

‚úÖ Length of dataset: 10
‚úÖ First sample (shapes):
  - Embedding shape: torch.Size([128])
  - Attention shape: torch.Size([12, 128])
  - Label: 0.0

‚úÖ Number of embeddings returned: 10
‚úÖ Number of attention weights returned: 10
‚úÖ Number of labels returned: 10
‚úÖ Number of IDs returned: 10

‚úÖ Attribute Max_MBL_CC (first 3 values): [0.44961639965113553, 0.6675381100406788, 0.9472766640067233]
