## Making sense of Guassian Correlation Inequality in N dimensions

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Image
%matplotlib inline

1) Generate correlated random variable with given mean and fixed standard deviation.  <br>
2) Calculate covariance matrix using cholesky decomposition. <br>
3) Correlated Data = mean + covariace * data <br>
4) Find how many points lie within 1 STD from mean for height weight and combination and compare them. 


In [2]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [3]:
ndim = 4
corr_values = np.random.random(((ndim ** 2 - ndim) // 2) - 1)
corr_matrix = np.identity(ndim)

<IPython.core.display.Javascript object>

In [4]:
corr_matrix

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

<IPython.core.display.Javascript object>

In [5]:
for row in range(0, ndim):
    for col in range(0, ndim):
        if row != col:
            corr_matrix[row][col] = corr_values[row + col - 1]

<IPython.core.display.Javascript object>

In [6]:
corr_matrix

array([[1.        , 0.05934805, 0.30635826, 0.62957662],
       [0.05934805, 1.        , 0.62957662, 0.38556287],
       [0.30635826, 0.62957662, 1.        , 0.86897783],
       [0.62957662, 0.38556287, 0.86897783, 1.        ]])

<IPython.core.display.Javascript object>

In [7]:
corr_values

array([0.05934805, 0.30635826, 0.62957662, 0.38556287, 0.86897783])

<IPython.core.display.Javascript object>

In [8]:
samples = 10000

<IPython.core.display.Javascript object>

In [9]:
# Get covariance from correlation using cholesky decomposition.
# Covariance is square root of correlation
chol = np.linalg.cholesky(corr_matrix)

<IPython.core.display.Javascript object>

In [10]:
assert (
    ~np.isclose(chol @ np.transpose(chol), corr_matrix)
).sum() == 0, "Cholesky not working"

<IPython.core.display.Javascript object>

In [11]:
mean = 0
std = 2.5

<IPython.core.display.Javascript object>

In [12]:
# generate correlated random variables with mean 0 and std 2.5
data = np.reshape(
    np.random.normal(mean, std, size=ndim * samples), (ndim, samples)
)
print(f"Correlation of randomly generated data - \n{np.corrcoef(data)}")
corr_data = (
    chol @ data
)  # will correlate the random generated data using covariance
print(f"Correlation of correlated data - \n{np.corrcoef(corr_data)}")

Correlation of randomly generated data - 
[[ 1.          0.00749563 -0.00659092 -0.004798  ]
 [ 0.00749563  1.          0.02046933  0.01087735]
 [-0.00659092  0.02046933  1.          0.00886228]
 [-0.004798    0.01087735  0.00886228  1.        ]]
Correlation of correlated data - 
[[1.         0.06679859 0.3023296  0.62088824]
 [0.06679859 1.         0.63858065 0.40237576]
 [0.3023296  0.63858065 1.         0.87191994]
 [0.62088824 0.40237576 0.87191994 1.        ]]


<IPython.core.display.Javascript object>

Weight and Height has different mean but same standard deviation. Loop through the data set and find points within 1 STD from mean.

In [13]:
check_std = 1
bool_array = np.zeros(shape=(ndim, samples))
for i in range(ndim):
    # check if every dimension is within 1 standard deviation
    bool_array[i] = (corr_data[i] > (mean - check_std)) & (
        corr_data[i] < (mean + check_std)
    )

<IPython.core.display.Javascript object>

In [164]:
all_normal = np.all(bool_array, axis=0)

<IPython.core.display.Javascript object>

In [175]:
np.prod(np.sum(bool_array, axis=1) / samples), np.sum(all_normal) / samples

(0.009264087793459198, 0.0398)

<IPython.core.display.Javascript object>

In [None]:

normal_height_weight = normal_height & normal_weight

prob_normal_height = normal_height.sum()/samples
prob_normal_weight = normal_weight.sum()/samples
prob_normal_height_weight = normal_height_weight.sum()/samples

print(prob_normal_height, prob_normal_weight, prob_normal_height_weight)

In [0]:
"Guassin correlation inequality is true" \
    if prob_normal_height_weight > (prob_normal_height * prob_normal_weight)\
    else "Gaussian correlation inequality is false"

'Guassin correlation inequality is true'

0.66 * 0.66 is always less than 0.5. (0.66 being 1 STD from mean and 0.5 as a result of correlation)

In [174]:
np.multiply?