Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

working pearson's correlation using matrices

  • Loading branch information...
commit 75d5a07b668d32658b15713122cf13b9595d2a00 1 parent 097e0c4
@andrewdyates authored
Showing with 95 additions and 0 deletions.
  1. +3 −0  .gitmodules
  2. +27 −0 __init__.py
  3. +1 −0  py_symmetric_matrix
  4. +64 −0 test.py
View
3  .gitmodules
@@ -0,0 +1,3 @@
+[submodule "py_symmetric_matrix"]
+ path = py_symmetric_matrix
+ url = git@github.com:andrewdyates/py_symmetric_matrix.git
View
27 __init__.py
@@ -0,0 +1,27 @@
+#!/usr/bin/python
+"""Efficiently compute all pairs correlation and save residuals."""
+import numpy as np
+from py_symmetric_matrix import *
+from scipy.spatial.distance import squareform
+
+def load_file(filename, n=None, delimiter="\t"):
+ """Load labeled row matrix from file."""
+ pass
+
+
+def correlate_all(M, ranks=False):
+ m = np.size(M, 0) # number of rows (variables)
+ n = np.size(M, 1) # number of columns (power)
+
+ sums = np.sum(M,1).reshape(m,1)
+ stds = np.std(M,1).reshape(m,1) # divided by n
+
+ Dot = squareform(np.dot(M, M.T), checks=False)
+ SumProd = squareform(np.dot(sums, sums.T), checks=False)
+ StdProd = squareform(np.dot(stds, stds.T), checks=False)
+
+ CorrMatrix = (Dot - (SumProd/n)) / (StdProd*n)
+
+ # Correlation Matrix
+ return CorrMatrix
+
1  py_symmetric_matrix
@@ -0,0 +1 @@
+Subproject commit 9551d732206e961887d1aeb14c4ef2b660489073
View
64 test.py
@@ -0,0 +1,64 @@
+import unittest
+from __init__ import *
+from numpy import *
+from scipy.spatial.distance import squareform
+# get "sym_idx" function
+from py_symmetric_matrix import *
+
+# Squareform overhead
+#%timeit squareform(X, checks=False)
+#1 loops, best of 3: 676 ms per loop
+
+
+# use numpy.ma.MaskedArray for missing values. 1=MASK, 0=TRANSPARENT
+# use numpy.ma.MaskedArray for missing values. 1=MASK, 0=TRANSPARENT
+
+# In [12]: Q = arange(10000,dtype=float).reshape(10000,1)
+# In [13]: %timeit dot(Q,Q.T)
+# 1 loops, best of 3: 1.7 s per loop
+
+# %timeit test(10000)
+# 1 loops, best of 3: 3.2 s per loop
+
+
+class TestMatrix(unittest.TestCase):
+
+ def test_small(self):
+ M = arange(8*10, dtype=float).reshape(8,10)
+ C = correlate_all(M)
+ self.assertTrue(all(C >= 0.999))
+ self.assertTrue(all(C < 1.001))
+
+ def test_random(self):
+ M = random.rand(100,10000)
+ C = correlate_all(M)
+ self.assertTrue(all(C**2 >= 0))
+ # These may fail... but it's very unlikely for n=10000.
+ self.assertTrue(mean(C) <= 0.001)
+ self.assertTrue(std(C) <= 0.1)
+ self.assertTrue(max(C) <= 1.0)
+ self.assertTrue(min(C) >= -1.0)
+
+ def test_big(self):
+ M = arange(10000*200, dtype=float).reshape(10000,200)
+ C = correlate_all(M)
+ self.assertTrue(all(C >= 0.999))
+ self.assertTrue(all(C < 1.001))
+
+if __name__ == "__main__":
+ unittest.main()
+
+# # sum of x squared, square of sum y (compress these to save memory)
+# D_xsq_sqy = np.dot(D_sq_sums, (D_sums.T)**2)
+# D_ysq_sqx = np.dot(D_sums**2, D_sq_sums.T)
+# D_midterm = (D_xsq_sqy + D_ysq_sqx) / n
+
+# numerator = D_dot - D_sums_prod_n
+# denominator = D_sq_sums_prod - D_midterm + D_sums_prod_n**2
+
+# pearsonsr = numerator/denominator**0.5
+
+
+
+
+
Please sign in to comment.
Something went wrong with that request. Please try again.