In [1]:
import numpy as np
import pandas as pd
np.random.seed(0)

def uniform_random_sample(X, num_landmarks, replace=False):
    """
        Draws row indices of a 2-D data matrix uniformly at random.

        Parameters
        ----------
        X : array with a 2-element ``shape``; only its first dimension is used.
        num_landmarks : number of indices to draw.
        replace : whether the same index may be drawn more than once.

        Returns
        -------
        Array of ``num_landmarks`` integers in ``[0, X.shape[0])`` drawn
        through the global ``np.random`` state.
    """
    # Unpacking (rather than X.shape[0]) keeps the requirement that X is 2-D.
    n_rows, _ = X.shape
    return np.random.choice(n_rows, num_landmarks, replace=replace)

def column_norm_sample(X, num_landmarks, replace=False):
    """Samples row indices with probability proportional to squared row norms.

    Despite the function name, the norms are taken along ``axis=1``: one
    Euclidean norm per row of ``X``, so the returned indices address rows.

    Parameters
    ----------
    X : 2-D array of shape (p, n).
    num_landmarks : number of indices to draw.
    replace : whether the same index may be drawn more than once.

    Returns
    -------
    Array of ``num_landmarks`` integers in ``[0, p)`` drawn through the
    global ``np.random`` state.

    Notes
    -----
    If every row has zero norm the weights cannot be normalised (0 / 0), so
    sampling falls back to the uniform distribution instead of failing on
    NaN probabilities.
    """
    p, _ = X.shape

    # Step 1: squared Euclidean norm of every row (one weight per row).
    weights = np.linalg.norm(X, axis=1) ** 2

    # Step 2: normalise the weights into a probability vector.
    total = np.sum(weights)
    if total == 0:
        # All-zero data: p=None makes np.random.choice sample uniformly.
        probabilities = None
    else:
        probabilities = weights / total

    # Step 3: draw the indices.
    return np.random.choice(p, num_landmarks, replace=replace, p=probabilities)


def generate_low_rank_data(n_features, n_samples, rank):
    """
    Builds an (n_features, n_samples) matrix as a product of two Gaussian factors.

    The requested rank is clamped to ``min(rank, n_features, n_samples)``.
    The left factor (n_features x rank) is drawn before the right factor
    (rank x n_samples), both from the global ``np.random`` state.
    """
    effective_rank = min(rank, n_features, n_samples)
    left_factor = np.random.randn(n_features, effective_rank)
    right_factor = np.random.randn(effective_rank, n_samples)
    return left_factor @ right_factor

def compare_eigenvalues(eigvals_sample, eigvals_nystrom, k):
    """
    Relative Euclidean error between the first ``k`` entries of two spectra.

    Returns ``||a[:k] - b[:k]|| / ||a[:k]||`` where ``a`` is
    ``eigvals_sample`` (the reference) and ``b`` is ``eigvals_nystrom``.
    """
    reference = eigvals_sample[:k]
    approximation = eigvals_nystrom[:k]
    gap = np.linalg.norm(reference - approximation)
    scale = np.linalg.norm(reference)
    return gap / scale

def compare_eigenvectors(eigvecs_sample, eigvecs_nystrom, k):
    """
    Relative Frobenius error between the first ``k`` columns of two bases.

    Eigenvectors are only defined up to sign, so each approximate column is
    flipped when its inner product with the matching reference column is
    negative before the difference is taken.

    Parameters
    ----------
    eigvecs_sample : 2-D array whose columns are the reference vectors.
    eigvecs_nystrom : 2-D array whose columns are the approximate vectors.
    k : number of leading columns to compare.

    Returns
    -------
    ``||A - B||_F / ||A||_F`` for the sign-aligned leading columns.

    The inputs are never modified: the sign flips are applied to a private
    copy of the approximate columns.
    """
    reference = eigvecs_sample[:, :k]
    # Basic slicing returns a view; copy so that the in-place sign flips
    # below cannot leak into the caller's array.
    aligned = eigvecs_nystrom[:, :k].copy()

    # Adjust signs column by column.
    for i in range(reference.shape[1]):
        if np.dot(reference[:, i], aligned[:, i]) < 0:
            aligned[:, i] *= -1

    frobenius_norm = np.linalg.norm(reference - aligned, 'fro') / np.linalg.norm(reference, 'fro')
    return frobenius_norm

def nystrom_pca(X, indices, num_landmarks):
    """
        Estimates the principal components of a data matrix using the Nyström method.

        Parameters
        ----------
        X : 2-D array of shape (p, n).
        indices : integer row indices of ``X`` used as landmarks.
        num_landmarks : kept for backward compatibility. The width of the
            intermediate factor is taken from the thin SVD of the landmark
            rows, so this value no longer has to match ``len(indices)`` or
            stay below ``n``.

        Returns
        -------
        eigenvalues : squared singular values of the factor ``W``.
        eigenvectors : left singular vectors of ``W`` (p rows).
    """
    p, n = X.shape
    Y = X[indices, :]  # landmark rows, shape (len(indices), n)

    # Rows that were not selected as landmarks.
    indices_J = np.setdiff1d(np.arange(p), indices)
    Z = X[indices_J, :]  # shape (len(indices_J), n)

    # Thin SVD of the landmark rows: all factors have
    # r = min(len(indices), n) components.
    U_Y, D_Y, V_Y_T = np.linalg.svd(Y, full_matrices=False)

    # Landmark and non-landmark parts of the factor; scaling each column of
    # U_Y by its singular value equals U_Y @ diag(D_Y).
    W_Y = (1 / np.sqrt(n)) * (U_Y * D_Y)
    W_Z = (1 / np.sqrt(n)) * Z @ V_Y_T.T

    # Size W from the SVD factors rather than from num_landmarks: the thin
    # SVD yields at most n columns, so a (p, num_landmarks) buffer cannot
    # receive W_Y when num_landmarks > n or len(indices) != num_landmarks.
    W = np.zeros((p, W_Y.shape[1]), dtype=W_Y.dtype)
    W[indices, :] = W_Y
    W[indices_J, :] = W_Z

    # Thin SVD of W gives the spectral estimates.
    U, Lambda, V_T = np.linalg.svd(W, full_matrices=False)

    eigenvalues = Lambda**2
    eigenvectors = U

    return eigenvalues, eigenvectors

def sample_svd(X):
    """
        Returns the eigenpairs of ``X @ X.T / n`` via a thin SVD of ``X``.

        ``n`` is the number of columns of ``X``. The eigenvalues are the
        squared singular values divided by ``n``; the eigenvectors are the
        left singular vectors. ``X`` is used as given (no centering here).
    """
    left_vectors, singular_values, _ = np.linalg.svd(X, full_matrices=False)
    n_columns = X.shape[1]
    return singular_values**2 / n_columns, left_vectors


In [3]:
import numpy as np
from scipy import linalg
from scipy.sparse.linalg import eigsh

def recursive_rls_sample(X, num_landmarks=None, replace=False, accelerated_flag=False):
    """
    Selects landmark row indices of ``X`` with a recursive, multi-level
    score-based sampling scheme (presumably recursive ridge leverage score
    sampling, judging by the name — TODO confirm against the reference used).

    The similarity between rows is the linear kernel ``X[i] @ X[j]`` (see
    ``kernel_func`` below). Rows are randomly permuted, the permutation is
    halved ``n_levels`` times, and the sample is refined from the smallest
    level back up to the full set of ``p`` rows.

    Parameters
    ----------
    X : 2-D array of shape (p, r); rows are the items being sampled.
    num_landmarks : number of indices drawn at the final level. Defaults to
        ``ceil(sqrt(p))`` when None.
    replace : accepted for signature parity with the other samplers but not
        referenced anywhere in the body; all draws use ``replace=False``.
    accelerated_flag : when True, intermediate levels use the smaller sample
        size ``ceil(sqrt((p * s + s**3) / (4 * p)))`` instead of ``s``.

    Returns
    -------
    Array of row indices into ``X`` (values taken from the permutation).

    NOTE(review): when ``n_levels <= 0`` (i.e. ``p <= s_level``) the loop
    body never runs and the whole permutation (all ``p`` indices) is
    returned, regardless of ``num_landmarks``.
    NOTE(review): ``np.log(s_level)`` is 0 for ``s_level == 1``, which makes
    the computation of ``k`` divide by zero.
    """
    # r (the number of columns) is not used below.
    p, r = X.shape
    
    # s: size of the final sample.
    if num_landmarks is None:
        s = int(np.ceil(np.sqrt(p)))
    else:
        s = num_landmarks
    
    # Linear kernel. With a non-empty col_ind it returns the block
    # X[row_ind] @ X[col_ind].T; with an empty col_ind it returns the squared
    # row norms of X[row_ind] (the kernel diagonal).
    kernel_func = lambda X, row_ind, col_ind: (X[row_ind, :] @ X[col_ind, :].T if len(col_ind) > 0 else np.sum(X[row_ind, :]**2, axis=1))
    
    # s_level: sample size used at every level except the last one.
    if not accelerated_flag:
        s_level = s
    else:
        s_level = int(np.ceil(np.sqrt((p * s + s**3) / (4 * p))))
    
    # oversamp scales the scores at intermediate levels; k is the number of
    # top eigenvalues used in the lambda_ estimate below.
    oversamp = np.log(s_level)
    k = int(np.ceil(s_level / (4 * oversamp)))
    # Number of halvings needed to go from p rows down to about s_level.
    n_levels = int(np.ceil(np.log(p / s_level) / np.log(2)))
    
    # Random ordering of the rows; level l works on the first l_size[l]
    # entries of this permutation.
    perm = np.random.permutation(p)
    
    # l_size[0] == p, and each following level is half (rounded up) of the previous.
    l_size = [p]
    for _ in range(n_levels):
        l_size.append(int(np.ceil(l_size[-1] / 2)))
    
    # Start from the smallest level: every row of that level is in the
    # sample, with unit weight.
    samp = np.arange(l_size[-1])
    r_ind = perm[samp]
    weights = np.ones(len(r_ind))
    
    # Kernel diagonal (squared row norms) for all p rows.
    k_diag = kernel_func(X, np.arange(p), [])
    
    # Walk from the smallest level up to the full set (l == 1 handles level 0).
    for l in range(n_levels, 0, -1):
        # Rows of the next larger level, and their kernel against the current sample.
        r_ind_curr = perm[:l_size[l-1]]
        KS = kernel_func(X, r_ind_curr, r_ind)
        # samp holds positions within perm, so KS[samp] is the kernel of the
        # current sample against itself.
        SKS = KS[samp, :]
        SKSn = SKS.shape[0]
        
        # Regularisation parameter lambda_.
        if k >= SKSn:
            # Not enough sampled rows to request k eigenvalues: use a small constant.
            lambda_ = 1e-6
        else:
            # (weighted trace - sum of the k largest eigenvalue magnitudes) / k.
            # NOTE(review): diag_sum is the trace of diag(w) @ SKS @ diag(w),
            # while the eigenvalues come from (diag(w) @ SKS) @ (diag(w) @ SKS).T,
            # which is a different matrix — confirm this is intended.
            diag_sum = np.sum((SKS.diagonal() * weights**2))
            SKS_weighted = SKS * weights[:, np.newaxis]
            eigvals = eigsh(SKS_weighted @ SKS_weighted.T, k=k, which='LM', return_eigenvectors=False)
            eig_sum = np.sum(np.abs(eigvals))
            lambda_ = (diag_sum - eig_sum) / k
        
        if l != 1:
            # Intermediate level: score every row of the level, clip the
            # scores to [0, 1] and draw s_level positions.
            R = linalg.inv(SKS + np.diag(lambda_ * weights**(-2)))
            # Per-row score: (k_diag_i - KS_i @ R @ KS_i^T) / lambda_, scaled by oversamp.
            levs = np.maximum(0, np.minimum(1, oversamp * (1/lambda_) * np.maximum(0, k_diag[r_ind_curr] - np.sum((KS @ R) * KS, axis=1))))
            levs_sum = np.sum(levs)
            # Fall back to uniform sampling when the scores cannot support a
            # draw of s_level distinct positions without replacement.
            if levs_sum == 0 or np.count_nonzero(levs) < s_level:
                samp = np.random.choice(l_size[l-1], size=s_level, replace=False)
            else:
                samp = np.random.choice(l_size[l-1], size=s_level, replace=False, p=levs / levs_sum)
            
            # Weights for the next level: inverse square root of the sampled scores.
            weights = np.sqrt(1 / np.maximum(levs[samp], 1e-12))  # Avoid division by zero
        else:
            # Final level: same score without the oversamp factor, drawing s
            # positions out of all p rows (l_size[0] == p). No weights are
            # computed because no further level follows.
            R = linalg.inv(SKS + np.diag(lambda_ * weights**(-2)))
            levs = np.maximum(0, np.minimum(1, (1/lambda_) * np.maximum(0, k_diag[r_ind_curr] - np.sum((KS @ R) * KS, axis=1))))
            levs_sum = np.sum(levs)
            if levs_sum == 0 or np.count_nonzero(levs) < s:
                samp = np.random.choice(p, size=s, replace=False)
            else:
                samp = np.random.choice(p, size=s, replace=False, p=levs / levs_sum)
        
        # Translate positions within perm back to row indices of X.
        r_ind = perm[samp]
    return r_ind

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

def test1(X, rank_start=2, rank_end=180):
    """
    Compares Nyström PCA against the full SVD for several landmark samplers.

    For every landmark count in ``rank_start..rank_end`` (inclusive) and for
    every sampler, landmark rows are drawn, ``nystrom_pca`` is run, and the
    relative eigenvalue and eigenvector errors against ``sample_svd(X)`` are
    recorded, comparing as many leading components as there are landmarks.

    Returns
    -------
    ranks : array of the landmark counts that were evaluated.
    eigenvalue_diffs : dict mapping sampler label -> list of eigenvalue errors.
    eigenvector_diffs : dict mapping sampler label -> list of eigenvector errors.
    """
    samplers = {
        'Recursive RLS Sampling': recursive_rls_sample,
        'Uniform Sampling': uniform_random_sample,
        'Column Norm Sampling': column_norm_sample
    }
    ranks = np.arange(rank_start, rank_end + 1)

    # Reference spectrum computed once from the full matrix.
    reference_vals, reference_vecs = sample_svd(X)

    eigenvalue_diffs = {label: [] for label in samplers}
    eigenvector_diffs = {label: [] for label in samplers}

    for n_landmarks in ranks:
        for label, sampler in samplers.items():
            landmark_rows = sampler(X, num_landmarks=n_landmarks)
            approx_vals, approx_vecs = nystrom_pca(X, landmark_rows, n_landmarks)

            value_error = compare_eigenvalues(reference_vals, approx_vals, n_landmarks)
            vector_error = compare_eigenvectors(reference_vecs, approx_vecs, n_landmarks)

            eigenvalue_diffs[label].append(value_error)
            eigenvector_diffs[label].append(vector_error)

    return ranks, eigenvalue_diffs, eigenvector_diffs

def plot_differences(ranks, diffs, ylabel, title):
    """
    Draws one line per sampling method on a single 10x6 figure and shows it.

    ranks  : array of landmark counts; plotted on the x axis as a percentage
             of the largest value.
    diffs  : mapping of method label -> sequence of y values (one per rank).
    ylabel : y-axis label.
    title  : figure title.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    for label, series in diffs.items():
        # Rank as a percentage of max rank
        percent_of_max = ranks / np.max(ranks) * 100
        ax.plot(percent_of_max, series, label=label)

    ax.set_xlabel('Rank Percentage (%)')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    ax.grid(True)
    plt.show()

# Load the data matrix from a local pickle.
# NOTE: unpickling executes arbitrary code; only load files you trust.
X = pd.read_pickle('russell_minute_data.pkl').values

# Run the sampler comparison over the default landmark range.
ranks, eigenvalue_diffs, eigenvector_diffs = test1(X)

# The recorded values are relative errors (fractions, not percentages), so the
# y-axis labels say so.
# Plotting the differences in eigenvalues
plot_differences(ranks, eigenvalue_diffs, ylabel='Relative Error in Eigenvalues', title='Difference in Eigenvalues against SCM')

# Plotting the differences in eigenvectors
plot_differences(ranks, eigenvector_diffs, ylabel='Relative Error in Eigenvectors', title='Difference in Eigenvectors against SCM')

KeyboardInterrupt: 

In [70]:
# Turn the per-method error lists into tables indexed by landmark count.
eigenvalue_diffs = pd.DataFrame(eigenvalue_diffs, index=ranks)
eigenvector_diffs = pd.DataFrame(eigenvector_diffs, index=ranks)

# Write actual CSV so the file contents match the .csv extension (the files
# were previously pickle bytes under a .csv name). The index (landmark count)
# is written as the first column; read back with pd.read_csv(path, index_col=0).
eigenvalue_diffs.to_csv('eigenvalue_diffs.csv')
eigenvector_diffs.to_csv('eigenvector_diffs.csv')

In [68]:
# Reload the data matrix from the local pickle (`.values` extracts the
# underlying array) and display its dimensions as the cell output.
X = pd.read_pickle('russell_minute_data.pkl').values
X.shape

(1014, 180)

In [66]:
# One-off spot check: Nyström PCA with 182 uniformly sampled landmark rows,
# compared against the full SVD on the leading 182 components.
# NOTE(review): 182 is hard-coded in three places and X is whatever the kernel
# held when this cell ran — confirm it matches the data loaded above.
eigenvalue, eigenvector = sample_svd(X)
indices = uniform_random_sample(X, 182)
eigenvalues_hat, eigenvectors_hat = nystrom_pca(X, indices, 182)
# Relative eigenvalue error, then relative eigenvector error.
print(compare_eigenvalues(eigenvalue, eigenvalues_hat, 182))
print(compare_eigenvectors(eigenvector, eigenvectors_hat, 182))
# Number of eigenvalues returned by the full SVD (shown as the cell output).
eigenvalue.size

0.019763304569389968
1.1874191671590806


252