In [80]:
from hicpap import paptools
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# Compact array display for the whole notebook: fixed-point notation instead of
# scientific, 5 decimal places, and "..." summarisation once an array has more
# than 10 elements.
np.set_printoptions(suppress=True, precision=5, threshold=10)
# np.random.seed(3)  # left disabled: every run draws a fresh random matrix

In [81]:
def generate_matrix(variance_threshold_L = 0, variance_threshold_U = 1, matrix_n = 2000, max_attempts = None):
    """
        Generates a random nxn symmetric matrix with non-negative values whose
        first PC (after row-centering) explains a fraction of the variance in
        the half-open interval [variance_threshold_L, variance_threshold_U).

        Candidates are drawn from the global NumPy random state
        (np.random.randn) and rejected until one falls inside the interval.

        Parameters:
            variance_threshold_L: inclusive lower bound, as a fraction (0..1).
            variance_threshold_U: exclusive upper bound, as a fraction (0..1).
            matrix_n: side length n of the generated matrix (default 2000).
            max_attempts: maximum number of candidates to draw; None (default)
                keeps sampling until a candidate is accepted.

        Returns:
            The accepted (uncentered) symmetric matrix of shape (n, n).

        Raises:
            ValueError: if matrix_n < 2 or the interval is empty (L >= U),
                because no candidate could ever be accepted.
            RuntimeError: if max_attempts candidates were all rejected.
    """
    if matrix_n < 2:
        # A 1x1 matrix is all zeros once centered, so the explained variance is 0/0.
        raise ValueError("matrix_n must be at least 2")
    if variance_threshold_L >= variance_threshold_U:
        raise ValueError("variance_threshold_L must be smaller than variance_threshold_U")

    attempts = 0
    while max_attempts is None or attempts < max_attempts:
        attempts += 1

        # Create a nxn array with random non-negative values, then symmetrize it
        matrix = np.abs(np.random.randn(matrix_n, matrix_n) * 10)
        matrix = matrix + matrix.T

        # Subtract the mean of each row to center the data (builds a new array,
        # so `matrix` itself stays untouched and can be returned as-is)
        centered = matrix - matrix.mean(axis=1, keepdims=True)

        # Only the singular values are needed, so skip computing U and Vh
        singular_values = np.linalg.svd(centered.T / np.sqrt(matrix_n), compute_uv=False)
        eigenvalues = singular_values * singular_values
        EV1_explained_variance = eigenvalues[0] / np.sum(eigenvalues)

        # Accept the candidate if the first PC falls inside [L, U)
        if variance_threshold_L <= EV1_explained_variance < variance_threshold_U:
            return matrix

    raise RuntimeError(
        f"no matrix with PC1 explained variance in "
        f"[{variance_threshold_L}, {variance_threshold_U}) found after {max_attempts} attempts"
    )

# Draw one random symmetric matrix using generate_matrix's default thresholds
matrix = generate_matrix()

# Correlate the rows of the matrix with each other, then shift every row of the
# resulting correlation matrix so that its mean is zero
pearson_np = np.corrcoef(matrix)
pearson_np = pearson_np - pearson_np.mean(axis=1, keepdims=True)

## PC1 from my implementation (hicpap `paptools.pca_on_pearson`)

In [82]:
# Run hicpap's PCA on the row-centered Pearson matrix; the call returns four
# values, which are unpacked here.
Vh, explained_variances, total_entry_num, valid_entry_num = paptools.pca_on_pearson(pearson_np=pearson_np)
# The first entry of Vh is taken as PC1 (presumably Vh holds one principal axis
# per row, as in np.linalg.svd -- confirm against hicpap).
pc1_np = Vh[0]
print(pc1_np)

[-0.00638  0.02046  0.03247 ...  0.0136   0.01198 -0.01022]


In [83]:
# Show the per-component explained variances followed by their total, one per line
print(explained_variances, np.sum(explained_variances), sep="\n")

[0.00392 0.0039  0.00384 ... 0.      0.      0.     ]
0.9999999999999999


## Cross-check: explained variance ratios from scikit-learn's PCA

In [84]:
from sklearn.decomposition import PCA

# Reference PCA on the same centered Pearson matrix, keeping one component per
# column so the ratios can be compared entry by entry with explained_variances.
pca = PCA(n_components=pearson_np.shape[1])
pca.fit(pearson_np)
print(pca.explained_variance_ratio_)
# np.sum rather than the builtin sum: the total is then accumulated the same way
# as for explained_variances, so the two totals are directly comparable.
print(np.sum(pca.explained_variance_ratio_))
# print(pca.components_[0])  # uncomment to inspect sklearn's first component


[0.00392 0.0039  0.00384 ... 0.      0.      0.     ]
0.999999999999997


In [85]:
# Build hicpap's approximation vector from the same centered Pearson matrix
# (presumably a cheaper estimate of PC1 -- confirm against hicpap's documentation).
approx_np = paptools.create_approx(pearson_np=pearson_np)
print(approx_np)

[ 0.00004  0.00006 -0.00001 ... -0.00007  0.00008 -0.00003]


In [86]:
# Compare the PCA-derived PC1 with the approximation. The displayed dict reports
# total_entry_num, valid_entry_num, correct_num and correct_rate; presumably an
# entry counts as "correct" when both vectors agree in sign -- TODO confirm.
correctness_info = paptools.calc_correctness(pc1_np=pc1_np, approx_np=approx_np)
display(correctness_info)

{'total_entry_num': 2000,
 'valid_entry_num': 2000,
 'correct_num': 1057,
 'correct_rate': 0.5285}