In [533]:
import numpy as np
import matplotlib.pyplot as plt
import random
import time

In [534]:
numpts = 8000
numfeats = 2100

# Build the synthetic data set one feature at a time, drawing all `numpts`
# samples of a feature in one vectorised call instead of one Python-level
# random draw per element (numpts * numfeats = 16.8 million draws).
# Sample i of feature p is  N(p, 0.1*(p+1)) * (i - p + U[0, 1)*2*p),
# i.e. the same per-element formula as the element-wise version; only the
# order in which the (unseeded) global NumPy RNG is consumed differs.
sample_idx = np.arange(numpts)
feature_rows = np.empty((numfeats, numpts))
for p in range(numfeats):
    gauss = np.random.normal(p, (p + 1) * 0.1, size=numpts)
    jitter = np.random.rand(numpts) * 2 * p
    feature_rows[p] = gauss * (sample_idx + -p + jitter)

# Each row of X is a data point and each column a feature (numpts x numfeats).
X = feature_rows.T

# Time the reference computation: the correlation matrix over the full data.
# perf_counter() is monotonic and high-resolution, which is what an interval
# measurement needs; time.time() can jump if the wall clock is adjusted.
corrcalctime = time.perf_counter()
X_truecorr = np.corrcoef(X, rowvar=False)  # columns are the variables

# NOTE(review): the clock is in seconds, so the 0.001 factor does not yield
# milliseconds. It is kept because mycalctime (subset-sampling cell) applies
# the same factor and the two values are subtracted later — confirm the
# intended unit and fix both together.
corrcalctime = (time.perf_counter() - corrcalctime)*0.001

In [535]:
def get_subset(X, n=100):
    """Return `n` distinct, randomly chosen rows of `X` in their original order.

    Row indices are drawn without replacement from NumPy's global random
    state and sorted ascending before indexing, so the selected rows keep
    the relative order they have in `X`. All columns are kept.
    """
    candidates = np.arange(X.shape[0])
    picked = np.sort(np.random.choice(candidates, size=n, replace=False))
    return X[picked, :]

In [536]:
# Estimate the correlation matrix from many small random row subsets of X
# (30 rows each) and keep a running average of those estimates.
numiters = 50

mycalctime = time.time()
# One correlation matrix per random subset (columns are the variables).
corrmats = [np.corrcoef(get_subset(X, n=30), rowvar=0) for _ in range(numiters)]

# X_means[k] is the element-wise mean of corrmats[0..k], updated incrementally
# from the previous running mean rather than re-averaged from scratch.
X_means = [corrmats[0]]
for i in range(1, numiters):
    X_means.append((X_means[-1]*i + corrmats[i])/(i + 1))

# The elapsed time covers both the subset correlations and the running means.
# NOTE(review): time.time() is in seconds, so the 0.001 factor does not give
# milliseconds — confirm the intended unit.
mycalctime = (time.time() - mycalctime)*0.001

In [537]:
# Absolute error of the running-mean correlation against the true correlation.
# X_diffs[i] is a (numfeats) x (numfeats) matrix for iteration i; there are
# numiters of them.
X_diffs = [abs(X_means[i] - X_truecorr) for i in range(numiters)]

# Per-feature error curve: feat_diffs[j][i] is the mean (not the max) of
# feature j's row of absolute errors after iteration i.
feat_diffs = [
    [np.mean(X_diffs[i][j, :]) for i in range(numiters)]
    for j in range(numfeats)
]

In [538]:
# Error curve of every feature across iterations.
# The legend is only drawn for a small number of features: with thousands of
# labelled lines it would cover the whole figure and make rendering very slow.
max_legend_entries = 20
iterations = np.arange(numiters)
for i in range(numfeats):
    plt.plot(iterations, feat_diffs[i], label=str(i+1))
plt.xlabel("Iteration")
plt.ylabel("Mean absolute correlation error")
plt.title("Error of the running-mean correlation, one line per feature")
if numfeats <= max_legend_entries:
    plt.legend()
plt.show()

# Smallest error each feature reached at any iteration (features numbered from 1).
plt.bar(np.arange(numfeats)+1, [min(feat_diffs[i]) for i in range(numfeats)])
plt.xlabel("Feature")
plt.ylabel("Minimum mean absolute correlation error")
plt.title("Best error reached per feature")
plt.show()

### Conclusion ###
It seems possible that averaging the correlation matrices of many repeated random subsamples converges on the true correlation, although the estimate may still be slightly off without enough iterations.

However, this approach (at least in its current state) shows no improvement over the typical approach of computing the correlation matrix directly on the full data set, so it isn't worth it.

### Time difference ###

In [540]:
# Compare the two timings and report the worst per-feature error after the
# final iteration (last entry of each feature's error curve).
time_delta = corrcalctime - mycalctime
final_errors = [curve[-1] for curve in feat_diffs]
print("corrcalctime - mycalctime = ", time_delta, "Max error =", max(final_errors))

corrcalctime - mycalctime =  -0.007444018125534057 Max error = 0.0193556482123
