# Example code for loading saved data

In [None]:
import numpy as np
from pathlib import Path

# Load embeddings from .npz file
data = np.load('./embeddings.npz', allow_pickle=True)

# To see the array names stored in the npz file:
print("Keys in .npz file:", data.files)

# Access the embeddings and ids arrays:
embeddings = data['embeddings']
ids = data['ids']

print("\nOriginal embeddings shape:", embeddings.shape)
print("IDs shape:", ids.shape)
print("First sequence ID:", ids[0])

# Handle both padded (3D) and variable-length (object array) embeddings
if embeddings.dtype == object:
    print("Variable-length embeddings detected")
    print("First sequence embedding shape:", embeddings[0].shape)
else:
    print("Padded embeddings detected")
    print("Embedding dimensions: [batch_size, seq_len, hidden_dim]")

['ids', 'embeddings']
embeddings shape: (3,)
ids shape: (3,)
First sequence ID: seq1
First sequence embedding shape: (85, 1280)


## Reduce Embedding Dimensionality with PCA

After extracting embeddings, you can reduce their dimensionality using GPU-accelerated PCA. This is useful for:
- Reducing memory usage
- Speeding up downstream analysis
- Removing noise while preserving most variance


In [4]:
from pca import reduce_embeddings_pca
from pathlib import Path

# Reduce embeddings to 512 dimensions using PCA
reduce_embeddings_pca(
    input_file=Path("./embeddings.npz"),
    output_file=Path("./embeddings_pca512.npz"),
    n_components=2,
    random_state=42,
    use_mean_pooling=True
)

print("PCA reduction completed!")


INFO:pca.pca:Loading embeddings from embeddings.npz...
INFO:pca.pca:Loaded 3 sequences
INFO:pca.pca:Original embeddings shape: (3,)
INFO:pca.pca:Embeddings dtype: object
INFO:pca.pca:Processing variable-length embeddings...
INFO:pca.pca:Mean pooled variable-length embeddings to shape (3, 1280)
INFO:pca.pca:Reducing dimensionality from 1280 to 2 using cuML PCA...
INFO:pca.pca:Total explained variance: 1.0000 (100.00%)
INFO:pca.pca:Reduced embeddings shape: (3, 2)
INFO:pca.pca:Saving reduced embeddings to embeddings_pca512.npz...
INFO:pca.pca:Successfully saved 3 reduced embeddings with shape (3, 2) to embeddings_pca512.npz
INFO:pca.pca:PCA dimensionality reduction completed successfully!


PCA reduction completed!


### Alternative: Reduce to Different Dimensions

You can reduce to any number of components (must be less than the number of features):


## Load Reduced Embeddings

After PCA reduction, load and inspect the reduced embeddings:


In [5]:
# Reduce to 256 dimensions
reduce_embeddings_pca(
    input_file=Path("./embeddings.npz"),
    output_file=Path("./embeddings_pca256.npz"),
    n_components=256,
    random_state=42
)

print("Reduced to 256 dimensions!")


NameError: name 'reduce_embeddings_pca' is not defined