MOSAIC: perform PCA on the O/E matrix <br>

In [1]:
import numpy as np
import sympy
from sympy import lambdify
# Display floats in fixed-point notation, rounded to two decimals.
np.set_printoptions(suppress=True, precision=2)

In [2]:
def generate_oe_matrix(variance_threshold_L = 0.5, variance_threshold_U = 0.6,
                       size = 6, seed = None):
    """Generate a random symmetric, non-negative square matrix whose first
       principal component explains a chosen share of the variance.

    Candidates are drawn by rejection sampling: a matrix is built, its rows
    are mean-centered, and the share of variance carried by the first
    principal component is computed from the singular values. The first
    candidate whose share falls in the half-open interval
    [variance_threshold_L, variance_threshold_U) is printed and returned.

    Args:
        variance_threshold_L: inclusive lower bound on the explained-variance
            ratio of the first PC, as a fraction (0.5 means 50%).
        variance_threshold_U: exclusive upper bound on that ratio, as a
            fraction.
        size: number of rows/columns of the generated matrix (default 6).
        seed: optional seed. When given, a dedicated
            np.random.default_rng(seed) is used so the result is
            reproducible. When None, NumPy's global random state is used.

    Returns:
        The accepted (size x size) matrix, before centering.

    Raises:
        ValueError: if size < 2 or if the thresholds describe an interval
            that no explained-variance ratio can fall into (the sampling
            loop would otherwise never terminate).
    """
    if size < 2:
        raise ValueError("size must be at least 2")
    if not variance_threshold_L < variance_threshold_U:
        raise ValueError("variance_threshold_L must be smaller than variance_threshold_U")
    if variance_threshold_L > 1 or variance_threshold_U <= 0:
        raise ValueError("thresholds must overlap the range (0, 1] of an explained-variance ratio")

    # Pick the source of standard-normal samples once, outside the loop.
    if seed is None:
        draw = np.random.standard_normal
    else:
        draw = np.random.default_rng(seed).standard_normal

    while True:
        # Random non-negative entries (absolute value of scaled normal samples)
        matrix = np.abs(draw((size, size)) * 10)
        # Adding the transpose makes the matrix symmetric
        origin_matrix = matrix + matrix.T

        # Subtract the mean of each row to center the data
        centered = origin_matrix - origin_matrix.mean(axis=1, keepdims=True)

        # The squared singular values of centered.T / sqrt(size) are the
        # eigenvalues used for the explained-variance ratio; the singular
        # vectors are not needed here.
        y = centered.T / np.sqrt(size)
        singular_values = np.linalg.svd(y, compute_uv=False)
        eigenvalues = singular_values * singular_values
        EV1_explained_variance = eigenvalues[0] / np.sum(eigenvalues)

        # Accept only when the first PC's share lies in [L, U)
        if variance_threshold_L <= EV1_explained_variance < variance_threshold_U:
            print(EV1_explained_variance, '\n')
            print(origin_matrix, '\n')
            return origin_matrix

# Generate a matrix that meets the conditions.
# NumPy's global random state is seeded first so that a fresh
# "Restart Kernel -> Run All" always yields the same matrix. Outputs saved
# from an earlier, unseeded run will differ until the notebook is re-executed.
np.random.seed(0)
oe = generate_oe_matrix()

0.5104936182219424 

[[53.46 24.14 16.97  4.81 33.26  2.66]
 [24.14 36.14 23.56 31.42 28.87  7.32]
 [16.97 23.56  5.71 17.36  1.82  4.44]
 [ 4.81 31.42 17.36 18.27 10.85 26.69]
 [33.26 28.87  1.82 10.85  2.3  24.1 ]
 [ 2.66  7.32  4.44 26.69 24.1  14.54]] 



In [3]:
# Center every row on zero: the per-row mean is kept as a column
# (keepdims=True) so that it broadcasts across that row.
oe_zero = oe - np.mean(oe, axis=1, keepdims=True)

### PCA

In [5]:
# PCA via SVD of the scaled, transposed, row-centered matrix.
# With y = oe_zero.T / sqrt(n), the product y.T @ y equals
# oe_zero @ oe_zero.T / n, and the squared singular values of y are the
# eigenvalues of that product.
n = oe_zero.shape[1]
x_t = oe_zero.T
y = x_t / np.sqrt(n)

U, S, Vh = np.linalg.svd(y, full_matrices=True)
eigenvalues = np.square(S)
sum_eigenvalues = eigenvalues.sum()
# Share of the total variance carried by each component
explained_variances = eigenvalues / sum_eigenvalues
print('explained_variances', explained_variances, sep='\n')

cov_x = y.T @ y
V_Cx = Vh @ cov_x

# First row of Vh, shown next to the row-wise correlation and covariance
print('Vh[0]', Vh[0], sep='\n')
print('corr_oe', np.corrcoef(oe), sep='\n')
print('cov_x', cov_x, sep='\n')

# Sum of absolute covariances for each row of cov_x
cov_x_sum = [np.abs(cov_row).sum() for cov_row in cov_x]
print('cov_x_sum', cov_x_sum, sep='\n')

explained_variances
[0.51 0.25 0.15 0.07 0.02 0.  ]
Vh[0]
[-0.85 -0.13 -0.15  0.26 -0.32  0.27]
corr_oe
[[ 1.    0.29  0.18 -0.68  0.32 -0.45]
 [ 0.29  1.    0.6  -0.04 -0.12  0.12]
 [ 0.18  0.6   1.    0.24  0.62 -0.25]
 [-0.68 -0.04  0.24  1.    0.18  0.01]
 [ 0.32 -0.12  0.62  0.18  1.   -0.49]
 [-0.45  0.12 -0.25  0.01 -0.49  1.  ]]
cov_x
[[ 302.33   46.44   25.57 -106.16   69.45  -72.58]
 [  46.44   82.55   43.79   -3.04  -13.58   10.18]
 [  25.57   43.79   64.42   17.08   62.69  -18.98]
 [-106.16   -3.04   17.08   80.14   19.96    0.5 ]
 [  69.45  -13.58   62.69   19.96  156.67  -56.96]
 [ -72.58   10.18  -18.98    0.5   -56.96   87.49]]
cov_x_sum
[622.5162015342019, 199.57693184733165, 232.52698355099554, 226.8852712561125, 379.3026471575441, 246.6822137711036]
