### Residual Vector Quantization (RVQ) with learned codebooks using K-Means clustering

In [1]:
! pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp313-cp313-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.15.2-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp313-cp313-win_amd64.whl (11.1 MB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached scipy-1.15.2-cp313-cp313-win_amd64.whl (41.0 MB)
Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.6.1 scipy-1.15.2 threadpoolctl-3.5.0



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import numpy as np
from sklearn.cluster import KMeans

In [6]:
class ResidualVectorQuantizer:
    """
    Multi-stage Residual Vector Quantizer (RVQ) with K-Means-learned codebooks.

    Each stage owns one codebook. Stage 0 quantizes the input itself; every
    later stage quantizes what is left over (the residual) after subtracting
    the codewords picked by the earlier stages. A vector is therefore encoded
    as one index per stage and decoded as the sum of the selected codewords.
    """

    def __init__(self, num_stages=3, codebook_size=8, vector_dim=2):
        """
        :param num_stages: Number of residual stages (one codebook per stage).
        :param codebook_size: Number of codewords (K-Means clusters) per stage.
        :param vector_dim: Dimensionality of the vectors. Stored for reference
            only; it is not checked against the data passed to the methods.
        """
        self.num_stages = num_stages
        self.codebook_size = codebook_size
        self.vector_dim = vector_dim
        self.codebooks = [None] * num_stages  # Placeholder for learned codebooks

    @staticmethod
    def _as_float_copy(values):
        """Return a fresh array copy of `values` that is safe for in-place float math."""
        arr = np.array(values)  # np.array copies by default, so the caller's data is untouched
        # In-place subtraction of float centroids from an int/bool array raises a
        # casting error, so promote those dtypes; existing float dtypes are kept.
        if not np.issubdtype(arr.dtype, np.inexact):
            arr = arr.astype(float)
        return arr

    def train(self, X):
        """
        Train the RVQ model using K-Means clustering on residuals.

        The input is not modified; training works on a floating-point copy.

        :param X: A (num_samples, vector_dim) array-like of input vectors.
        """
        residuals = self._as_float_copy(X)
        for stage in range(self.num_stages):
            # Train K-Means on residuals (fixed random_state for repeatable codebooks)
            kmeans = KMeans(n_clusters=self.codebook_size, n_init=10, random_state=42)
            kmeans.fit(residuals)
            self.codebooks[stage] = kmeans.cluster_centers_

            # Find the closest codebook vector for each input
            indices = kmeans.predict(residuals)

            # Update residuals: what remains becomes the next stage's training data
            residuals -= self.codebooks[stage][indices]

    def quantize(self, x):
        """
        Quantizes a single input vector using the trained codebooks.

        The input is not modified; quantization works on a floating-point copy,
        so integer arrays and plain sequences are accepted as well.

        :param x: A vector (array-like) of shape (vector_dim,)
        :return: Tuple ``(quantized_vectors, indices)``: the codeword chosen at
            each stage (as independent copies) and the index of each codeword.
        """
        residual = self._as_float_copy(x)
        quantized_vectors = []
        indices = []

        for stage in range(self.num_stages):
            # Compute distances and find the nearest codebook vector
            distances = np.linalg.norm(self.codebooks[stage] - residual, axis=1)
            best_match_idx = np.argmin(distances)
            # Copy the row: plain integer indexing yields a view, and handing a
            # view to the caller would let them mutate the codebook by accident.
            quantized_vector = self.codebooks[stage][best_match_idx].copy()

            # Update residual
            residual -= quantized_vector

            # Store quantized vector and its index
            quantized_vectors.append(quantized_vector)
            indices.append(best_match_idx)

        return quantized_vectors, indices

    def reconstruct(self, indices):
        """
        Reconstructs a vector using stored indices.

        :param indices: Sequence of codeword indices, one per stage; only the
            first ``num_stages`` entries are used.
        :return: Reconstructed vector (sum of the selected codewords)
        """
        return sum(self.codebooks[stage][indices[stage]] for stage in range(self.num_stages))

In [14]:
# ----------------Testing the RVQ model----------------
# Generate synthetic dataset (10000 samples, 2 dimensions)
np.random.seed(42)  # Seed the global NumPy RNG so the generated data is repeatable
X_train = np.random.rand(10000, 2) # Sample input data, drawn uniformly from [0, 1)

# Train the RVQ model
rvq = ResidualVectorQuantizer(num_stages=5, codebook_size=256, vector_dim=2)
rvq.train(X_train)

# Test the RVQ model
# NOTE(review): this test vector lies outside the [0, 1) range the training data
# was drawn from, which probably explains the large residual error printed below;
# use an in-range vector (e.g. values between 0 and 1) to gauge quantization quality.
x = np.array([2.5, -1.3]) # Test input vector
quantized_vectors, indices = rvq.quantize(x)
reconstructed_x = rvq.reconstruct(indices)

# Print results
print(f"Original Vector: {x}")
print(f"Reconstructed Vector: {reconstructed_x}")
# Euclidean distance between the input and its reconstruction
print(f"Residual Error: {np.linalg.norm(x - reconstructed_x)}")

Original Vector: [ 2.5 -1.3]
Reconstructed Vector: [1.00895996 0.01655394]
Residual Error: 1.989098959345546
