自己的算法參考: A Tutorial on Principal Component Analysis <br>
要注意, 在算 covariance matrix 的部份:
* Pearson 一定要做 zero mean
* Y = Xᵀ / √(n-1):分母要代 n-1(而不是 n),算出來的結果才會跟 numpy.cov() 一樣。

In [1]:
!pwd

/home/jordan990301/Projects/HiC-PC1_Approximation/test


In [2]:
import numpy as np
import pandas as pd
from copy import deepcopy
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
np.set_printoptions(suppress=True)

In [3]:
def generate_oe_matrix(variance_threshold_L=0.85, variance_threshold_U=1, size=5):
    """
    Generate a random symmetric matrix whose first PC dominates.

    Candidates are drawn by rejection sampling: a ``size`` x ``size`` matrix
    of non-negative entries is symmetrised, its rows are zero-centred, and
    the share of total variance carried by the first principal component is
    computed from the singular values of the centred matrix. The first
    candidate whose share lies in
    ``[variance_threshold_L, variance_threshold_U)`` is returned.

    Parameters
    ----------
    variance_threshold_L : float
        Inclusive lower bound on the explained-variance ratio of PC1,
        expressed as a fraction (0.85 means 85 %), not a percentage.
    variance_threshold_U : float
        Exclusive upper bound on the same ratio.
    size : int
        Number of rows/columns of the generated matrix (default 5).

    Returns
    -------
    numpy.ndarray
        The accepted ``size`` x ``size`` symmetric, non-negative matrix,
        as it was *before* row-centring.

    Raises
    ------
    ValueError
        If ``size`` is smaller than 3, or if the threshold interval is empty
        or lies entirely outside ``[0, 1]``. Either situation would
        otherwise make the sampling loop run forever.
    """
    # Row-centring removes one degree of freedom, so for size < 3 the centred
    # matrix has rank <= 1 and the PC1 ratio cannot vary.
    if size < 3:
        raise ValueError(f"size must be at least 3, got {size}")

    # An explained-variance ratio always lies in [0, 1]; reject intervals
    # that can never contain one (written so that NaN is rejected as well).
    satisfiable = (
        variance_threshold_L < variance_threshold_U
        and variance_threshold_L <= 1
        and variance_threshold_U > 0
    )
    if not satisfiable:
        raise ValueError(
            "thresholds must satisfy variance_threshold_L < variance_threshold_U "
            "and overlap [0, 1]; got "
            f"L={variance_threshold_L}, U={variance_threshold_U}"
        )

    while True:
        # Absolute values keep every entry non-negative; adding the transpose
        # makes the candidate symmetric.
        noise = np.abs(np.random.randn(size, size) * 10)
        candidate = noise + noise.T

        # Zero-mean every row. This creates a new array, so ``candidate``
        # itself stays untouched and can be returned as-is.
        centered = candidate - candidate.mean(axis=1, keepdims=True)

        # Squared singular values are proportional to the covariance
        # eigenvalues. The constant factor (1/n or 1/(n-1)) and the transpose
        # cancel in the ratio, so no rescaling is needed here.
        eigenvalues = np.linalg.svd(centered, compute_uv=False) ** 2
        pc1_ratio = eigenvalues[0] / eigenvalues.sum()

        if variance_threshold_L <= pc1_ratio < variance_threshold_U:
            return candidate

# Draw a matrix that satisfies the PC1 constraint, then zero-mean each row
# in place so it is ready for the covariance computation.
oe_np = generate_oe_matrix()
oe_np -= oe_np.mean(axis=1, keepdims=True)
display(oe_np)

array([[ -1.4621364 ,  -7.25704572,   8.16935584,   0.65961515,
         -0.10978887],
       [ -4.17757594, -10.04924043,  19.31800034,  -5.95877697,
          0.86759299],
       [  0.28623764,   8.35541237, -11.73759267,   4.9925866 ,
         -1.89664394],
       [  2.65484696,  -7.04301494,  14.87093661,  -6.68144259,
         -3.80132603],
       [  1.43671551,  -0.6653724 ,   7.53297864,  -4.25005345,
         -4.05426831]])

In [4]:
# Number of observations per variable. np.cov treats each row as a variable
# and each column as an observation, so the count comes from the column axis.
n = oe_np.shape[1]

# Dividing by sqrt(n - 1) rather than sqrt(n) is what makes the product
# below agree with np.cov; with sqrt(n) the two results differ.
y_np = oe_np.T / np.sqrt(n - 1)

# Y^T Y: covariance matrix of the (already row-centred) oe_np.
cov_oe_np = y_np.T @ y_np

print(cov_oe_np)


[[ 30.49701899  58.2064626  -38.36032077  41.1814671   15.47731521]
 [ 58.2064626  131.97106296 -85.82584287  95.86952666  42.00356224]
 [-38.36032077 -85.82584287  59.04727715 -64.69607306 -26.77412752]
 [ 41.1814671   95.86952666 -64.69607306  84.22219553  41.08275481]
 [ 15.47731521  42.00356224 -26.77412752  41.08275481  23.43817125]]


In [5]:
# Reference result from NumPy, with its defaults spelled out:
# rows are variables and the divisor is N - 1.
print(np.cov(oe_np, rowvar=True, ddof=1))

[[ 30.49701899  58.2064626  -38.36032077  41.1814671   15.47731521]
 [ 58.2064626  131.97106296 -85.82584287  95.86952666  42.00356224]
 [-38.36032077 -85.82584287  59.04727715 -64.69607306 -26.77412752]
 [ 41.1814671   95.86952666 -64.69607306  84.22219553  41.08275481]
 [ 15.47731521  42.00356224 -26.77412752  41.08275481  23.43817125]]


In [6]:
# Element-wise ratio of NumPy's estimate to the manual one; a matrix of
# ones means the two agree.
np.divide(np.cov(oe_np), cov_oe_np)

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])