HOMER performs PCA on the correlation matrix <br>
http://homer.ucsd.edu/homer/interactions2/HiCpca.html

In [55]:
import numpy as np
import sympy
from sympy import lambdify
# Display arrays in fixed-point notation (no scientific notation) with two decimals.
np.set_printoptions(suppress=True, precision=2)

In [56]:
def generate_oe_matrix(variance_threshold_L=0.5, variance_threshold_U=0.6,
                       size=6, rng=None, max_attempts=100000):
    """Generate a random symmetric non-negative matrix with a constrained first PC.

    Candidate ``size`` x ``size`` matrices are drawn repeatedly until the share
    of variance explained by the first principal component (computed on the
    row-centred matrix) falls in the half-open interval
    ``[variance_threshold_L, variance_threshold_U)``.

    Parameters
    ----------
    variance_threshold_L : float
        Inclusive lower bound on the first PC's explained-variance ratio,
        given as a fraction (0.5 means 50%).
    variance_threshold_U : float
        Exclusive upper bound on the same ratio, given as a fraction.
    size : int
        Number of rows/columns of the generated matrix (default 6).
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When ``None`` the global ``np.random`` state is
        used, so ``np.random.seed`` still controls the result.
    max_attempts : int
        Maximum number of candidates to draw before giving up.

    Returns
    -------
    numpy.ndarray
        The accepted (un-centred) symmetric matrix. Its explained-variance
        ratio and the matrix itself are printed before returning.

    Raises
    ------
    ValueError
        If ``size < 2``, ``max_attempts < 1`` or the threshold interval is empty.
    RuntimeError
        If no candidate satisfies the thresholds within ``max_attempts`` draws.
    """
    if size < 2:
        raise ValueError("size must be at least 2, got {}".format(size))
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1, got {}".format(max_attempts))
    # `not (L < U)` also rejects NaN thresholds, which could never be satisfied.
    if not variance_threshold_L < variance_threshold_U:
        raise ValueError(
            "variance_threshold_L ({}) must be smaller than variance_threshold_U ({})".format(
                variance_threshold_L, variance_threshold_U))

    # The np.random module, Generator and RandomState all expose standard_normal.
    sampler = np.random if rng is None else rng

    for _ in range(max_attempts):
        # Random non-negative entries, then symmetrise by adding the transpose.
        half = np.abs(sampler.standard_normal((size, size)) * 10)
        origin_matrix = half + half.T

        # Subtract the mean of each row to center the data.
        centered = origin_matrix - origin_matrix.mean(axis=1, keepdims=True)

        # Only the singular values are needed; their squares are the PCA eigenvalues.
        y = centered.T / np.sqrt(size)
        S = np.linalg.svd(y, compute_uv=False)
        eigenvalues = S * S
        EV1_explained_variance = eigenvalues[0] / np.sum(eigenvalues)

        # Accept only when the first PC's share lies inside [L, U).
        if variance_threshold_L <= EV1_explained_variance < variance_threshold_U:
            print(EV1_explained_variance, '\n')
            print(origin_matrix, '\n')
            return origin_matrix

    raise RuntimeError(
        "no {0}x{0} matrix with first-PC explained variance in [{1}, {2}) "
        "found after {3} attempts".format(
            size, variance_threshold_L, variance_threshold_U, max_attempts))

# Build the input matrix `oe`: candidates are drawn until the first PC's
# explained-variance ratio falls in [0.5, 0.6).
oe = generate_oe_matrix(variance_threshold_L=0.5, variance_threshold_U=0.6)

0.5491557078917478 

[[20.27  5.76  3.5   4.75  9.73  9.75]
 [ 5.76 32.88 20.71 21.33 14.39  6.47]
 [ 3.5  20.71  4.58 24.81  1.09 14.33]
 [ 4.75 21.33 24.81  1.67  6.62  7.34]
 [ 9.73 14.39  1.09  6.62 24.99 18.04]
 [ 9.75  6.47 14.33  7.34 18.04 54.57]] 



In [57]:
# Pearson correlation between the rows of `oe` (np.corrcoef treats each row as a variable).
corr_oe = np.corrcoef(oe)
# Center every row of the correlation matrix by removing that row's mean.
corr_oe_zero = np.subtract(corr_oe, np.mean(corr_oe, axis=1, keepdims=True))

### PCA

In [58]:
# Scale the transposed, row-centered correlation matrix by 1/sqrt(n); the squared
# singular values of `y` are then the eigenvalues of `cov_x` computed below.
n = corr_oe_zero.shape[1]
x_t = corr_oe_zero.T
y = x_t / np.sqrt(n)

# Rows of Vh are the right singular vectors of y; Vh[0] pairs with the largest singular value.
U, S, Vh = np.linalg.svd(y, full_matrices=True)
eigenvalues = np.square(S)
sum_eigenvalues = eigenvalues.sum()
explained_variances = eigenvalues / sum_eigenvalues
print('explained_variances', explained_variances, sep='\n')

cov_x = y.T @ y
# NOTE(review): V_Cx is not read anywhere in the visible cells - confirm it is used later.
V_Cx = Vh @ cov_x

print('Vh[0]', Vh[0], sep='\n')
print('corr_oe', corr_oe, sep='\n')
print('cov_x', cov_x, sep='\n')

# Sum of absolute values of each row of cov_x.
cov_x_sum = [np.abs(row).sum() for row in cov_x]
print('cov_x_sum', cov_x_sum, sep='\n')

explained_variances
[0.74 0.12 0.09 0.05 0.   0.  ]
Vh[0]
[-0.47  0.56  0.31  0.4  -0.33 -0.33]
corr_oe
[[ 1.   -0.72 -0.48 -0.5   0.24  0.07]
 [-0.72  1.    0.52  0.6  -0.22 -0.57]
 [-0.48  0.52  1.   -0.08 -0.18 -0.06]
 [-0.5   0.6  -0.08  1.   -0.35 -0.18]
 [ 0.24 -0.22 -0.18 -0.35  1.    0.4 ]
 [ 0.07 -0.57 -0.06 -0.18  0.4   1.  ]]
cov_x
[[ 0.34 -0.34 -0.21 -0.24  0.17  0.13]
 [-0.34  0.41  0.22  0.27 -0.21 -0.25]
 [-0.21  0.22  0.24  0.07 -0.11 -0.1 ]
 [-0.24  0.27  0.07  0.29 -0.18 -0.15]
 [ 0.17 -0.21 -0.11 -0.18  0.21  0.15]
 [ 0.13 -0.25 -0.1  -0.15  0.15  0.24]]
cov_x_sum
[1.4367773344467003, 1.7025150186470275, 0.9568457517923594, 1.2041277423978558, 1.032045223266003, 1.023582455353972]


Test-V4: Confirm the relationship between EV1's sign (positive/negative) pattern and the corresponding row of the covariance matrix.

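If the variance explained by EV1 is high enough, EV1's sign pattern will match the sign pattern of the covariance-matrix row with the largest sum of absolute values (compared across every row of the covariance matrix). In the output above, the second row of `cov_x` (index 1) has the largest sum (1.70), and its signs (−, +, +, +, −, −) match those of `Vh[0]`.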