# andrewdyates/fast_correlation

Working Pearson's correlation using matrices.

1 parent 097e0c4 commit 75d5a07b668d32658b15713122cf13b9595d2a00 committed May 29, 2012
Showing 4 changed files with 95 additions and 0 deletions.
1. +3 −0 .gitmodules
2. +27 −0 __init__.py
3. +1 −0 py_symmetric_matrix
4. +64 −0 test.py
 @@ -0,0 +1,3 @@ +[submodule "py_symmetric_matrix"] + path = py_symmetric_matrix + url = git@github.com:andrewdyates/py_symmetric_matrix.git
 @@ -0,0 +1,27 @@ +#!/usr/bin/python +"""Efficiently compute all pairs correlation and save residuals.""" +import numpy as np +from py_symmetric_matrix import * +from scipy.spatial.distance import squareform + +def load_file(filename, n=None, delimiter="\t"): + """Load labeled row matrix from file.""" + pass + + +def correlate_all(M, ranks=False): + m = np.size(M, 0) # number of rows (variables) + n = np.size(M, 1) # number of columns (power) + + sums = np.sum(M,1).reshape(m,1) + stds = np.std(M,1).reshape(m,1) # divided by n + + Dot = squareform(np.dot(M, M.T), checks=False) + SumProd = squareform(np.dot(sums, sums.T), checks=False) + StdProd = squareform(np.dot(stds, stds.T), checks=False) + + CorrMatrix = (Dot - (SumProd/n)) / (StdProd*n) + + # Correlation Matrix + return CorrMatrix +
+64 −0 test.py
 @@ -0,0 +1,64 @@ +import unittest +from __init__ import * +from numpy import * +from scipy.spatial.distance import squareform +# get "sym_idx" function +from py_symmetric_matrix import * + +# Squareform overhead +#%timeit squareform(X, checks=False) +#1 loops, best of 3: 676 ms per loop + + +# use numpy.ma.MaskedArray for missing values. 1=MASK, 0=TRANSPARENT +# use numpy.ma.MaskedArray for missing values. 1=MASK, 0=TRANSPARENT + +# In [12]: Q = arange(10000,dtype=float).reshape(10000,1) +# In [13]: %timeit dot(Q,Q.T) +# 1 loops, best of 3: 1.7 s per loop + +# %timeit test(10000) +# 1 loops, best of 3: 3.2 s per loop + + +class TestMatrix(unittest.TestCase): + + def test_small(self): + M = arange(8*10, dtype=float).reshape(8,10) + C = correlate_all(M) + self.assertTrue(all(C >= 0.999)) + self.assertTrue(all(C < 1.001)) + + def test_random(self): + M = random.rand(100,10000) + C = correlate_all(M) + self.assertTrue(all(C**2 >= 0)) + # These may fail... but it's very unlikely for n=10000. + self.assertTrue(mean(C) <= 0.001) + self.assertTrue(std(C) <= 0.1) + self.assertTrue(max(C) <= 1.0) + self.assertTrue(min(C) >= -1.0) + + def test_big(self): + M = arange(10000*200, dtype=float).reshape(10000,200) + C = correlate_all(M) + self.assertTrue(all(C >= 0.999)) + self.assertTrue(all(C < 1.001)) + +if __name__ == "__main__": + unittest.main() + +# # sum of x squared, square of sum y (compress these to save memory) +# D_xsq_sqy = np.dot(D_sq_sums, (D_sums.T)**2) +# D_ysq_sqx = np.dot(D_sums**2, D_sq_sums.T) +# D_midterm = (D_xsq_sqy + D_ysq_sqx) / n + +# numerator = D_dot - D_sums_prod_n +# denominator = D_sq_sums_prod - D_midterm + D_sums_prod_n**2 + +# pearsonsr = numerator/denominator**0.5 + + + + +