From d1ab9b6ef020bc0983a72da8a9add9cfc2356a4c Mon Sep 17 00:00:00 2001 From: Evan Sparks Date: Wed, 7 May 2014 18:18:36 -0700 Subject: [PATCH 1/2] Use numpy directly for matrix multiply. Using matrix multiply to compute XtX and XtY yields a 5-20x speedup depending on problem size. For example - the following takes 19s locally after this change vs. 5m21s before the change. (16x speedup). bin/pyspark examples/src/main/python/als.py local[8] 1000 1000 50 10 10 --- examples/src/main/python/als.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/examples/src/main/python/als.py b/examples/src/main/python/als.py index a77dfb2577835..fcf28e0fd097f 100755 --- a/examples/src/main/python/als.py +++ b/examples/src/main/python/als.py @@ -36,13 +36,10 @@ def rmse(R, ms, us): def update(i, vec, mat, ratings): uu = mat.shape[0] ff = mat.shape[1] - XtX = matrix(np.zeros((ff, ff))) - Xty = np.zeros((ff, 1)) - - for j in range(uu): - v = mat[j, :] - XtX += v.T * v - Xty += v.T * ratings[i, j] + + XtX = mat.T * mat + XtY = mat.T * ratings[i, :].T + XtX += np.eye(ff, ff) * LAMBDA * uu return np.linalg.solve(XtX, Xty) From e094dbcce3ac2cae858399af80c1d4fcb6bf9fcb Mon Sep 17 00:00:00 2001 From: Evan Sparks Date: Wed, 7 May 2014 21:16:27 -0700 Subject: [PATCH 2/2] Touching only diagonals on update. This probably won't make a huge difference when K is small, but it's better style. --- examples/src/main/python/als.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/src/main/python/als.py b/examples/src/main/python/als.py index fcf28e0fd097f..33700ab4f8c53 100755 --- a/examples/src/main/python/als.py +++ b/examples/src/main/python/als.py @@ -40,7 +40,9 @@ def update(i, vec, mat, ratings): XtX = mat.T * mat XtY = mat.T * ratings[i, :].T - XtX += np.eye(ff, ff) * LAMBDA * uu + for j in range(ff): + XtX[j,j] += LAMBDA * uu + return np.linalg.solve(XtX, Xty) if __name__ == "__main__":