In [1]:
import os

# Limit the number of threads for various libraries
os.environ["MKL_NUM_THREADS"] = "12"  # For MKL (used by NumPy)
os.environ["NUMEXPR_NUM_THREADS"] = "12"  # For NumExpr
os.environ["VECLIB_MAXIMUM_THREADS"] = "12"  # For macOS Accelerate
os.environ["OPENBLAS_NUM_THREADS"] = "12"  # For OpenBLAS
os.environ["BLIS_NUM_THREADS"] = "12"  # For BLIS

# Print environment variables to verify
print("OMP_NUM_THREADS:", os.environ.get("OMP_NUM_THREADS"))
print("MKL_NUM_THREADS:", os.environ.get("MKL_NUM_THREADS"))
print("NUMEXPR_NUM_THREADS:", os.environ.get("NUMEXPR_NUM_THREADS"))
print("VECLIB_MAXIMUM_THREADS:", os.environ.get("VECLIB_MAXIMUM_THREADS"))
print("OPENBLAS_NUM_THREADS:", os.environ.get("OPENBLAS_NUM_THREADS"))
print("BLIS_NUM_THREADS:", os.environ.get("BLIS_NUM_THREADS"))

# Set PyTorch threads
import torch
torch.set_num_threads(18)
torch.set_num_interop_threads(18)

OMP_NUM_THREADS: None
MKL_NUM_THREADS: 12
NUMEXPR_NUM_THREADS: 12
VECLIB_MAXIMUM_THREADS: 12
OPENBLAS_NUM_THREADS: 12
BLIS_NUM_THREADS: 12


In [2]:
# WrappedProteinDataset & PCAExplainability - Usage Demo
# ======================================================
# This notebook demonstrates how to use WrappedProteinDataset and PCAExplainability
# to perform dimensionality reduction and interpret protein embedding data.

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

from project_root.dataset.dataset_loader import DatasetLoader
from project_root.dataset.protein_dataset import ProteinDataset
from project_root.dataset.wrapped_protein_dataset import WrappedProteinDataset
from project_root.explainability.pca_explainability import PCAExplainability

# ------------------------------------------------------
# 📥 1. Load Dataset and Embeddings
# ------------------------------------------------------

# Define data path
path_dataset_folder = "../DATASETS/"

# Load data
loader = DatasetLoader(path_dataset_folder)
df = loader.load_dataframe()
embeddings, attention_weights = loader.load_embeddings_and_attention()

# Create ProteinDataset instance
protein_dataset = ProteinDataset(df, embeddings, attention_weights, solve_inconsistencies=True)

Checking consistency...
Removing duplicate entries...
 - DataFrame IDs: 1249
 - Embeddings IDs: 1219
 - Attention Weights IDs: 1219
Resolving inconsistencies. Keeping 1219 common samples.
Consistency checked.

ProteinDataset Report:
 - Number of samples: 1219
 - Number of embeddings: 1219
 - Number of attention weights: 1219
 - Target column: Class
 - ID column: UniProt IDs
 - Save path: ./OUTPUTS/



In [3]:
print("ProteinDataset:")
print(f"Print ids length: {len(protein_dataset.ids)}")
print(f"Print levels length: {len(protein_dataset.labels)}")
print(f"Print embeddings shape: {len(protein_dataset.embeddings)}")
print(f"Print attention weights shape: {len(protein_dataset.attention_weights)}")

ProteinDataset:
Print ids length: 1219
Print levels length: 1219
Print embeddings shape: 1219
Print attention weights shape: 1219


In [4]:
print(protein_dataset.dataframe.head())

  UniProt IDs                                             PDB ID  \
1      Q9Y2X8                                                NaN   
2      Q05086  1C4Z; 1D5F; 1EQX; 2KR1; 4GIZ; 4XR8; 6SJV; 6SLM...   
3      Q9Y6X0                                                NaN   
4      Q8BH75                                               2OGB   
5      Q04120                                   5DVB; 5EPT; 6UTL   

                                           Gene Name  \
1         {'Name': 'UBE2D4', 'Synonyms': ['UBCH5D']}   
2  {'Name': 'UBE3A {ECO:0000312|HGNC:HGNC:12496}'...   
3       {'Name': 'SETBP1', 'Synonyms': ['KIAA0437']}   
4   {'Name': 'Rnf41', 'Synonyms': ['Flrf', 'Nrdp1']}   
5  {'Name': 'TSA2 {ECO:0000303|PubMed:11741925}',...   

                         Protein Name  \
1  Ubiquitin-conjugating enzyme E2 D4   
2        Ubiquitin-protein ligase E3A   
3                 SET-binding protein   
4   E3 ubiquitin-protein ligase NRDP1   
5    Peroxiredoxin TSA2 {ECO:0000305}   

       

In [None]:
# ------------------------------------------------------
# 🧪 2. Create WrappedProteinDataset with Configurations
# ------------------------------------------------------

# ➤ Option A: Use PCA for both embeddings and attention weights
dataset_pca = WrappedProteinDataset(
    dataset=protein_dataset,
    reduce_method='pca',              # or 'tsne'
    pca_method='threshold',           # 'threshold', 'derivative', 'custom'
    threshold=0.95,
    random_projection_dim=800        # Random projection dim before PCA on attention weights
)

# ➤ Option B: No dimensionality reduction, just flatten + concat
dataset_raw = WrappedProteinDataset(
    dataset=protein_dataset,
    reduce_method=None               # No reduction
)

# ➤ Option C: Use t-SNE for visual exploration
dataset_tsne = WrappedProteinDataset(
    dataset=protein_dataset,
    reduce_method='tsne'            # Use only for 2D plotting, not for training
)

Converting embeddings and attention weights to NumPy arrays...
Applying random projection to reduce attention weights from 6255001 to 800 dimensions...


In [None]:
# ------------------------------------------------------
# 📊 3. Apply PCAExplainability on any representation
# ------------------------------------------------------

# A. On PCA-reduced embeddings
pca_embeddings = PCA().fit(dataset_pca.embeddings)
PCAExplainability.plot_variance_explained(pca_embeddings, title="Embeddings PCA", threshold=0.95)
PCAExplainability.plot_scree(pca_embeddings)
PCAExplainability.plot_variance_contribution(pca_embeddings)

# B. On PCA-reduced attention weights
pca_attn = PCA().fit(dataset_pca.attention_weights)
PCAExplainability.plot_variance_explained(pca_attn, title="Attention PCA", threshold=0.95)

# C. Feature Importance (optional)
# PCAExplainability.plot_feature_importance(pca_embeddings, feature_names=[f"dim_{i}" for i in range(dataset_pca.embeddings.shape[1])])

In [None]:
# ------------------------------------------------------
# 🌀 4. Cluster Visualization Example
# ------------------------------------------------------

# Visualize clustering using KMeans on combined PCA-reduced features
dataset_pca.plot_kmeans(n_clusters=2, attribute='Class', embedding=True, attention_weights=True)

# Visualize clustering on t-SNE-reduced embeddings
dataset_tsne.plot_kmeans(n_clusters=2, attribute='Class', embedding=True, attention_weights=False)

# ------------------------------------------------------
# 📌 Notes:
# ------------------------------------------------------
# - reduce_method: Choose 'pca', 'tsne', or None
# - pca_method: Use 'threshold' for 95% variance, 'derivative' for elbow
# - random_projection_dim: Optional projection to speed up PCA on high-dim attention
# - get PCA object using: PCA().fit(data)
# - Use PCAExplainability for detailed analysis of components and features
# - Use WrappedProteinDataset as input for training or further analysis

print("✅ Demo complete.")

ValueError: array length 1219 does not match index length 1249

<Figure size 1600x1000 with 0 Axes>