From 7aa5de5431ad01c37e8069956287194c97b37b06 Mon Sep 17 00:00:00 2001
From: Dmitriy Lyubimov <dlyubimov@apache.org>
Date: Mon, 1 Jun 2015 17:54:17 -0700
Subject: [PATCH 1/9] Initial merge with ora private-review branch. Stuff
 compiles up to h2o (which needs to be added some unimplemented stuff) and
 ssvd tests are failing in math-scala module due to lack of matrix flavor on
 mmul. They are not failing in private branch though -- some changes still
 have not been merged?

Most changes i care about seems to be there though.
---
 bin/mahout                                    |   4 +-
 .../org/apache/mahout/logging/package.scala   |  73 ++++
 .../mahout/math/decompositions/DQR.scala      |   9 +-
 .../mahout/math/decompositions/DSSVD.scala    |  19 +-
 .../mahout/math/decompositions/SSVD.scala     |   2 +-
 .../org/apache/mahout/math/drm/BCast.scala    |   3 +-
 .../mahout/math/drm/CheckpointedOps.scala     |   7 +
 .../mahout/math/drm/DistributedEngine.scala   | 125 ++++---
 .../mahout/math/drm/DrmDoubleScalarOps.scala  |   8 +-
 .../apache/mahout/math/drm/DrmLikeOps.scala   |   7 +-
 .../apache/mahout/math/drm/RLikeDrmOps.scala  |  55 ++-
 .../math/drm/logical/AbstractUnaryOp.scala    |   2 +-
 .../math/drm/logical/CheckpointAction.scala   |   3 +-
 .../mahout/math/drm/logical/OpAewScalar.scala |   6 +-
 .../math/drm/logical/OpAewUnaryFunc.scala     |  47 +++
 .../drm/logical/OpAewUnaryFuncFusion.scala    |  62 ++++
 .../mahout/math/drm/logical/OpCbind.scala     |   2 +-
 .../math/drm/logical/OpCbindScalar.scala      |  37 ++
 .../mahout/math/drm/logical/OpMapBlock.scala  |   2 +-
 .../mahout/math/drm/logical/TEwFunc.scala     |  37 ++
 .../org/apache/mahout/math/drm/package.scala  |  50 ++-
 .../mahout/math/scalabindings/MMul.scala      | 295 +++++++++++++++
 .../mahout/math/scalabindings/MatrixOps.scala |  87 ++++-
 ...arOps.scala => RLikeDoubleScalarOps.scala} |  23 +-
 .../math/scalabindings/RLikeMatrixOps.scala   |  69 +++-
 .../mahout/math/scalabindings/RLikeOps.scala  |   4 +-
 .../math/scalabindings/RLikeVectorOps.scala   |  25 +-
 .../mahout/math/scalabindings/VectorOps.scala |  45 ++-
 .../mahout/math/scalabindings/package.scala   |  81 ++++-
 .../org/apache/mahout/util/IOUtilsScala.scala |  64 ++++
 .../mahout/math/drm/DrmLikeOpsSuiteBase.scala |  20 ++
 .../mahout/math/drm/DrmLikeSuiteBase.scala    |   3 +-
 .../math/drm/RLikeDrmOpsSuiteBase.scala       |  94 ++++-
 .../math/scalabindings/MatrixOpsSuite.scala   |  33 +-
 .../scalabindings/RLikeMatrixOpsSuite.scala   | 276 ++++++++++++++
 .../math/scalabindings/VectorOpsSuite.scala   |  19 +-
 .../apache/mahout/math/AbstractMatrix.java    |  24 +-
 .../apache/mahout/math/ConstantVector.java    |   5 +
 .../apache/mahout/math/DelegatingVector.java  |   5 +
 .../org/apache/mahout/math/DenseMatrix.java   |   9 +-
 .../mahout/math/DenseSymmetricMatrix.java     |   2 +
 .../org/apache/mahout/math/DenseVector.java   |   5 +
 .../apache/mahout/math/DiagonalMatrix.java    |  14 +
 .../math/FileBasedSparseBinaryMatrix.java     |   5 +
 .../mahout/math/FunctionalMatrixView.java     |   5 +
 .../java/org/apache/mahout/math/Matrices.java |  18 +-
 .../java/org/apache/mahout/math/Matrix.java   |   7 +
 .../apache/mahout/math/MatrixVectorView.java  |   5 +
 .../org/apache/mahout/math/MatrixView.java    |   6 +
 .../org/apache/mahout/math/NamedVector.java   |   5 +
 .../mahout/math/PermutedVectorView.java       |   5 +
 .../mahout/math/RandomAccessSparseVector.java |   5 +
 .../math/SequentialAccessSparseVector.java    |   7 +
 .../mahout/math/SparseColumnMatrix.java       |  20 +-
 .../org/apache/mahout/math/SparseMatrix.java  |  30 +-
 .../apache/mahout/math/SparseRowMatrix.java   |   7 +
 .../mahout/math/TransposedMatrixView.java     | 147 ++++++++
 .../apache/mahout/math/UpperTriangular.java   |   9 +
 .../java/org/apache/mahout/math/Vector.java   |   8 +
 .../apache/mahout/math/VectorIterable.java    |   4 +
 .../org/apache/mahout/math/VectorView.java    |   7 +-
 .../apache/mahout/math/flavor/BackEnum.java   |  18 +-
 .../mahout/math/flavor/MatrixFlavor.java      |  82 +++++
 .../math/flavor/TraversingStructureEnum.java  |  48 +++
 .../org/apache/mahout/math/MatricesTest.java  |   4 +-
 .../math/hadoop/DistributedRowMatrix.java     |   5 +
 .../stochasticsvd/qr/GivensThinSolver.java    |   5 +
 .../shell/MahoutSparkILoop.scala              |  15 +-
 .../apache/mahout/common/DrmMetadata.scala    |  17 +
 .../org/apache/mahout/common/HDFSUtil.scala   |   4 +-
 .../mahout/common/Hadoop1HDFSUtil.scala       |  24 +-
 .../mahout/sparkbindings/SparkEngine.scala    | 226 ++++++++----
 .../mahout/sparkbindings/blas/ABt.scala       | 200 ++++++++++-
 .../mahout/sparkbindings/blas/AewB.scala      |  75 +++-
 .../mahout/sparkbindings/blas/AinCoreB.scala  |  16 +-
 .../apache/mahout/sparkbindings/blas/At.scala |  15 +-
 .../mahout/sparkbindings/blas/AtA.scala       | 136 ++++---
 .../mahout/sparkbindings/blas/AtB.scala       | 336 +++++++++++++++---
 .../apache/mahout/sparkbindings/blas/Ax.scala |  24 +-
 .../mahout/sparkbindings/blas/CbindAB.scala   |  35 +-
 .../mahout/sparkbindings/blas/DrmRddOps.scala |   2 +
 .../mahout/sparkbindings/blas/MapBlock.scala  |  15 +-
 .../mahout/sparkbindings/blas/Par.scala       |  74 ++--
 .../mahout/sparkbindings/blas/RbindAB.scala   |   8 +-
 .../mahout/sparkbindings/blas/Slicing.scala   |   2 +-
 .../mahout/sparkbindings/blas/package.scala   | 174 ++++++++-
 .../drm/CheckpointedDrmSpark.scala            |  41 ++-
 .../drm/CheckpointedDrmSparkOps.scala         |   2 +-
 .../sparkbindings/drm/DrmRddInput.scala       |  18 +-
 .../mahout/sparkbindings/drm/SparkBCast.scala |   2 +
 .../mahout/sparkbindings/drm/package.scala    |  23 +-
 .../io/GenericMatrixKryoSerializer.scala      | 191 ++++++++++
 .../io/MahoutKryoRegistrator.scala            |  21 +-
 .../io/UnsupportedSerializer.scala            |  31 ++
 .../io/VectorKryoSerializer.scala             | 252 +++++++++++++
 .../apache/mahout/sparkbindings/package.scala | 118 +++---
 .../mahout/cf/SimilarityAnalysisSuite.scala   |  16 +-
 .../drivers/ItemSimilarityDriverSuite.scala   |  36 +-
 .../sparkbindings/SparkBindingsSuite.scala    |  12 +-
 .../mahout/sparkbindings/blas/BlasSuite.scala |   4 +-
 .../sparkbindings/drm/DrmLikeOpsSuite.scala   |  17 +-
 .../sparkbindings/drm/RLikeDrmOpsSuite.scala  |  63 ++++
 .../mahout/sparkbindings/io/IOSuite.scala     | 195 ++++++++++
 .../test/DistributedSparkSuite.scala          |  20 +-
 .../test/LoggerConfiguration.scala            |   2 +-
 105 files changed, 4100 insertions(+), 586 deletions(-)
 create mode 100644 math-scala/src/main/scala/org/apache/mahout/logging/package.scala
 create mode 100644 math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFunc.scala
 create mode 100644 math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFuncFusion.scala
 create mode 100644 math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbindScalar.scala
 create mode 100644 math-scala/src/main/scala/org/apache/mahout/math/drm/logical/TEwFunc.scala
 create mode 100644 math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MMul.scala
 rename math-scala/src/main/scala/org/apache/mahout/math/scalabindings/{DoubleScalarOps.scala => RLikeDoubleScalarOps.scala} (69%)
 create mode 100644 math-scala/src/main/scala/org/apache/mahout/util/IOUtilsScala.scala
 create mode 100644 math/src/main/java/org/apache/mahout/math/TransposedMatrixView.java
 rename math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeTimesOps.scala => math/src/main/java/org/apache/mahout/math/flavor/BackEnum.java (77%)
 create mode 100644 math/src/main/java/org/apache/mahout/math/flavor/MatrixFlavor.java
 create mode 100644 math/src/main/java/org/apache/mahout/math/flavor/TraversingStructureEnum.java
 create mode 100644 spark/src/main/scala/org/apache/mahout/sparkbindings/io/GenericMatrixKryoSerializer.scala
 create mode 100644 spark/src/main/scala/org/apache/mahout/sparkbindings/io/UnsupportedSerializer.scala
 create mode 100644 spark/src/main/scala/org/apache/mahout/sparkbindings/io/VectorKryoSerializer.scala
 create mode 100644 spark/src/test/scala/org/apache/mahout/sparkbindings/io/IOSuite.scala

diff --git a/bin/mahout b/bin/mahout
index ee0b9187d5..24f01bafcd 100755
--- a/bin/mahout
+++ b/bin/mahout
@@ -254,12 +254,10 @@ fi
 # restore ordinary behaviour
 unset IFS
 
-
-
 case "$1" in
   (spark-shell)
     save_stty=$(stty -g 2>/dev/null);
-    "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.sparkbindings.shell.Main" $@
+    "$JAVA" $JAVA_HEAP_MAX $MAHOUT_OPTS -classpath "$CLASSPATH" "org.apache.mahout.sparkbindings.shell.Main" $@
     stty sane; stty $save_stty
     ;;
   # Spark CLI drivers go here
diff --git a/math-scala/src/main/scala/org/apache/mahout/logging/package.scala b/math-scala/src/main/scala/org/apache/mahout/logging/package.scala
new file mode 100644
index 0000000000..15aa909f61
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/logging/package.scala
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout
+
+import org.apache.log4j.{Level, Priority, Logger}
+
+package object logging {
+
+  /** Compute `expr` if debug is on, only */
+  def debugDo[T](expr: => T)(implicit log: Logger): Option[T] = {
+    if (log.isDebugEnabled) Some(expr)
+    else None
+  }
+
+  /** Compute `expr` if trace is on, only */
+  def traceDo[T](expr: => T)(implicit log: Logger): Option[T] = {
+    if (log.isTraceEnabled) Some(expr) else None
+  }
+
+  /** Shorter, and lazy, versions of logging methods. Just declare log implicit. */
+  def debug(msg: => AnyRef)(implicit log: Logger) { if (log.isDebugEnabled) log.debug(msg) }
+
+  def debug(msg: => AnyRef, t: Throwable)(implicit log: Logger) { if (log.isDebugEnabled()) log.debug(msg, t) }
+
+  /** Shorter, and lazy, versions of logging methods. Just declare log implicit. */
+  def trace(msg: => AnyRef)(implicit log: Logger) { if (log.isTraceEnabled) log.trace(msg) }
+
+  def trace(msg: => AnyRef, t: Throwable)(implicit log: Logger) { if (log.isTraceEnabled()) log.trace(msg, t) }
+
+  def info(msg: => AnyRef)(implicit log: Logger) { if (log.isInfoEnabled) log.info(msg)}
+
+  def info(msg: => AnyRef, t:Throwable)(implicit log: Logger) { if (log.isInfoEnabled) log.info(msg,t)}
+
+  def warn(msg: => AnyRef)(implicit log: Logger) { if (log.isEnabledFor(Level.WARN)) log.warn(msg) }
+
+  def warn(msg: => AnyRef, t: Throwable)(implicit log: Logger) { if (log.isEnabledFor(Level.WARN)) error(msg, t) }
+
+  def error(msg: => AnyRef)(implicit log: Logger) { if (log.isEnabledFor(Level.ERROR)) log.warn(msg) }
+
+  def error(msg: => AnyRef, t: Throwable)(implicit log: Logger) { if (log.isEnabledFor(Level.ERROR)) error(msg, t) }
+
+  def fatal(msg: => AnyRef)(implicit log: Logger) { if (log.isEnabledFor(Level.FATAL)) log.fatal(msg) }
+
+  def fatal(msg: => AnyRef, t: Throwable)(implicit log: Logger) { if (log.isEnabledFor(Level.FATAL)) log.fatal(msg, t) }
+
+  def getLog(name: String): Logger = Logger.getLogger(name)
+
+  def getLog(clazz: Class[_]): Logger = Logger.getLogger(clazz)
+
+  def mahoutLog :Logger = getLog("org.apache.mahout")
+
+  def setLogLevel(l:Level)(implicit log:Logger) = {
+    log.setLevel(l)
+  }
+
+  def setAdditivity(a:Boolean)(implicit log:Logger) = log.setAdditivity(a)
+
+}
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DQR.scala b/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DQR.scala
index 7caa3ddb57..866ee34dae 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DQR.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DQR.scala
@@ -18,6 +18,7 @@
 package org.apache.mahout.math.decompositions
 
 import scala.reflect.ClassTag
+import org.apache.mahout.logging._
 import org.apache.mahout.math.Matrix
 import org.apache.mahout.math.scalabindings._
 import RLikeOps._
@@ -27,7 +28,7 @@ import org.apache.log4j.Logger
 
 object DQR {
 
-  private val log = Logger.getLogger(DQR.getClass)
+  private final implicit val log = getLog(DQR.getClass)
 
   /**
    * Distributed _thin_ QR. A'A must fit in a memory, i.e. if A is m x n, then n should be pretty
@@ -41,19 +42,19 @@ object DQR {
   def dqrThin[K: ClassTag](drmA: DrmLike[K], checkRankDeficiency: Boolean = true): (DrmLike[K], Matrix) = {
 
     if (drmA.ncol > 5000)
-      log.warn("A is too fat. A'A must fit in memory and easily broadcasted.")
+      warn("A is too fat. A'A must fit in memory and easily broadcasted.")
 
     implicit val ctx = drmA.context
 
     val AtA = (drmA.t %*% drmA).checkpoint()
     val inCoreAtA = AtA.collect
 
-    if (log.isDebugEnabled) log.debug("A'A=\n%s\n".format(inCoreAtA))
+    trace("A'A=\n%s\n".format(inCoreAtA))
 
     val ch = chol(inCoreAtA)
     val inCoreR = (ch.getL cloned) t
 
-    if (log.isDebugEnabled) log.debug("R=\n%s\n".format(inCoreR))
+    trace("R=\n%s\n".format(inCoreR))
 
     if (checkRankDeficiency && !ch.isPositiveDefinite)
       throw new IllegalArgumentException("R is rank-deficient.")
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DSSVD.scala b/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DSSVD.scala
index 1abfb8796b..cecaec84c8 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DSSVD.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/decompositions/DSSVD.scala
@@ -7,9 +7,12 @@ import RLikeOps._
 import org.apache.mahout.math.drm._
 import RLikeDrmOps._
 import org.apache.mahout.common.RandomUtils
+import org.apache.mahout.logging._
 
 object DSSVD {
 
+  private final implicit val log = getLog(DSSVD.getClass)
+
   /**
    * Distributed Stochastic Singular Value decomposition algorithm.
    *
@@ -43,18 +46,22 @@ object DSSVD {
       case (keys, blockA) =>
         val blockY = blockA %*% Matrices.symmetricUniformView(n, r, omegaSeed)
         keys -> blockY
-    }
+    }.checkpoint()
 
-    var drmQ = dqrThin(drmY.checkpoint())._1
+    var drmQ = dqrThin(drmY)._1
     // Checkpoint Q if last iteration
     if (q == 0) drmQ = drmQ.checkpoint()
 
+    trace(s"dssvd:drmQ=${drmQ.collect}.")
+
     // This actually should be optimized as identically partitioned map-side A'B since A and Q should
     // still be identically partitioned.
     var drmBt = drmAcp.t %*% drmQ
     // Checkpoint B' if last iteration
     if (q == 0) drmBt = drmBt.checkpoint()
 
+    trace(s"dssvd:drmB'=${drmBt.collect}.")
+
     for (i <- 0  until q) {
       drmY = drmAcp %*% drmBt
       drmQ = dqrThin(drmY.checkpoint())._1
@@ -62,13 +69,17 @@ object DSSVD {
       if (i == q - 1) drmQ = drmQ.checkpoint()
 
       // This on the other hand should be inner-join-and-map A'B optimization since A and Q_i are not
-      // identically partitioned anymore.
+      // identically partitioned anymore.`
       drmBt = drmAcp.t %*% drmQ
       // Checkpoint B' if last iteration
       if (i == q - 1) drmBt = drmBt.checkpoint()
     }
 
-    val (inCoreUHat, d) = eigen(drmBt.t %*% drmBt)
+    val mxBBt:Matrix = drmBt.t %*% drmBt
+
+    trace(s"dssvd: BB'=$mxBBt.")
+
+    val (inCoreUHat, d) = eigen(mxBBt)
     val s = d.sqrt
 
     // Since neither drmU nor drmV are actually computed until actually used, we don't need the flags
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/SSVD.scala b/math-scala/src/main/scala/org/apache/mahout/math/decompositions/SSVD.scala
index 80385a3af9..e1b2f03875 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/decompositions/SSVD.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/decompositions/SSVD.scala
@@ -150,7 +150,7 @@ private[math] object SSVD {
     val c = s_q cross s_b
 
     // BB' computation becomes
-    val bbt = bt.t %*% bt -c - c.t +  (s_q cross s_q) * (xi dot xi)
+    val bbt = bt.t %*% bt - c - c.t +  (s_q cross s_q) * (xi dot xi)
 
     val (uhat, d) = eigen(bbt)
 
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/BCast.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/BCast.scala
index 8506144578..b86e286721 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/BCast.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/BCast.scala
@@ -18,6 +18,7 @@
 package org.apache.mahout.math.drm
 
 /** Broadcast variable abstraction */
-trait BCast[T] {
+trait BCast[T] extends java.io.Closeable {
   def value:T
+
 }
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/CheckpointedOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/CheckpointedOps.scala
index 8c3911fb3f..c43c6c7990 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/CheckpointedOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/CheckpointedOps.scala
@@ -20,6 +20,7 @@ package org.apache.mahout.math.drm
 import scala.reflect.ClassTag
 import org.apache.mahout.math._
 
+import org.apache.mahout.math.scalabindings.RLikeOps._
 
 /**
  * Additional experimental operations over CheckpointedDRM implementation. I will possibly move them up to
@@ -38,6 +39,12 @@ class CheckpointedOps[K: ClassTag](val drm: CheckpointedDrm[K]) {
   /** Column Means */
   def colMeans(): Vector = drm.context.colMeans(drm)
 
+  /** Optional engine-specific all reduce tensor operation. */
+  def allreduceBlock(bmf: BlockMapFunc2[K], rf: BlockReduceFunc = _ += _): Matrix =
+
+    drm.context.allreduceBlock(drm, bmf, rf)
+
+
   def norm():Double = drm.context.norm(drm)
 }
 
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/DistributedEngine.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/DistributedEngine.scala
index bb6772aac7..519a1276d7 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/DistributedEngine.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/DistributedEngine.scala
@@ -19,16 +19,15 @@ package org.apache.mahout.math.drm
 
 import org.apache.mahout.math.indexeddataset._
 
-import scala.reflect.ClassTag
 import logical._
 import org.apache.mahout.math._
 import scalabindings._
 import RLikeOps._
-import RLikeDrmOps._
 import DistributedEngine._
-import org.apache.mahout.math.scalabindings._
 import org.apache.log4j.Logger
 
+import scala.reflect.ClassTag
+
 /** Abstraction of optimizer/distributed engine */
 trait DistributedEngine {
 
@@ -37,7 +36,7 @@ trait DistributedEngine {
    * introduce logical constructs (including engine-specific ones) that user DSL cannot even produce
    * per se.
    * <P>
-   *   
+   *
    * A particular physical engine implementation may choose to either use the default rewrites or
    * build its own rewriting rules.
    * <P>
@@ -50,6 +49,9 @@ trait DistributedEngine {
   /** Engine-specific colSums implementation based on a checkpoint. */
   def colSums[K: ClassTag](drm: CheckpointedDrm[K]): Vector
 
+  /** Optional engine-specific all reduce tensor operation. */
+  def allreduceBlock[K: ClassTag](drm: CheckpointedDrm[K], bmf: BlockMapFunc2[K], rf: BlockReduceFunc): Matrix
+
   /** Engine-specific numNonZeroElementsPerColumn implementation based on a checkpoint. */
   def numNonZeroElementsPerColumn[K: ClassTag](drm: CheckpointedDrm[K]): Vector
 
@@ -73,20 +75,39 @@ trait DistributedEngine {
   def drmDfsRead(path: String, parMin: Int = 0)(implicit sc: DistributedContext): CheckpointedDrm[_]
 
   /** Parallelize in-core matrix as spark distributed matrix, using row ordinal indices as data set keys. */
-  def drmParallelizeWithRowIndices(m: Matrix, numPartitions: Int = 1)
-      (implicit sc: DistributedContext): CheckpointedDrm[Int]
+  def drmParallelizeWithRowIndices(m: Matrix, numPartitions: Int = 1)(implicit sc: DistributedContext):
+  CheckpointedDrm[Int]
 
   /** Parallelize in-core matrix as spark distributed matrix, using row labels as a data set keys. */
-  def drmParallelizeWithRowLabels(m: Matrix, numPartitions: Int = 1)
-      (implicit sc: DistributedContext): CheckpointedDrm[String]
+  def drmParallelizeWithRowLabels(m: Matrix, numPartitions: Int = 1)(implicit sc: DistributedContext):
+  CheckpointedDrm[String]
 
   /** This creates an empty DRM with specified number of partitions and cardinality. */
-  def drmParallelizeEmpty(nrow: Int, ncol: Int, numPartitions: Int = 10)
-      (implicit sc: DistributedContext): CheckpointedDrm[Int]
+  def drmParallelizeEmpty(nrow: Int, ncol: Int, numPartitions: Int = 10)(implicit sc: DistributedContext):
+  CheckpointedDrm[Int]
 
   /** Creates empty DRM with non-trivial height */
-  def drmParallelizeEmptyLong(nrow: Long, ncol: Int, numPartitions: Int = 10)
-      (implicit sc: DistributedContext): CheckpointedDrm[Long]
+  def drmParallelizeEmptyLong(nrow: Long, ncol: Int, numPartitions: Int = 10)(implicit sc: DistributedContext):
+  CheckpointedDrm[Long]
+
+  /**
+   * Convert non-int-keyed matrix to an int-keyed, computing optionally mapping from old keys
+   * to row indices in the new one. The mapping, if requested, is returned as a 1-column matrix.
+   */
+  def drm2IntKeyed[K: ClassTag](drmX: DrmLike[K], computeMap: Boolean = false): (DrmLike[Int], Option[DrmLike[K]])
+
+  /**
+   * (Optional) Sampling operation. Consistent with Spark semantics of the same.
+   * @param drmX
+   * @param fraction
+   * @param replacement
+   * @tparam K
+   * @return
+   */
+  def drmSampleRows[K: ClassTag](drmX: DrmLike[K], fraction: Double, replacement: Boolean = false): DrmLike[K]
+
+  def drmSampleKRows[K: ClassTag](drmX: DrmLike[K], numSamples:Int, replacement:Boolean = false) : Matrix
+
   /**
    * Load IndexedDataset from text delimited format.
    * @param src comma delimited URIs to read from
@@ -119,38 +140,49 @@ object DistributedEngine {
   private def pass1[K: ClassTag](action: DrmLike[K]): DrmLike[K] = {
 
     action match {
-      case OpAB(OpAt(a), b) if (a == b) => OpAtA(pass1(a))
-      case OpABAnyKey(OpAtAnyKey(a), b) if (a == b) => OpAtA(pass1(a))
+
+      // self element-wise rewrite
+      case OpAewB(a, b, op) if (a == b) => {
+        op match {
+          case "*" ⇒ OpAewUnaryFunc(pass1(a), (x) ⇒ x * x)
+          case "/" ⇒ OpAewUnaryFunc(pass1(a), (x) ⇒ x / x)
+          // Self "+" and "-" don't make a lot of sense, but we do include it for completeness.
+          case "+" ⇒ OpAewUnaryFunc(pass1(a), 2.0 * _)
+          case "-" ⇒ OpAewUnaryFunc(pass1(a), (_) ⇒ 0.0)
+          case _ ⇒
+          require(false, s"Unsupported operator $op")
+            null
+        }
+      }
+      case OpAB(OpAt(a), b) if (a == b) ⇒ OpAtA(pass1(a))
+      case OpABAnyKey(OpAtAnyKey(a), b) if (a == b) ⇒ OpAtA(pass1(a))
 
       // For now, rewrite left-multiply via transpositions, i.e.
       // inCoreA %*% B = (B' %*% inCoreA')'
-      case op@OpTimesLeftMatrix(a, b) =>
-        OpAt(OpTimesRightMatrix(A = OpAt(pass1(b)), right = a.t))
+      case op@OpTimesLeftMatrix(a, b) ⇒
+      OpAt(OpTimesRightMatrix(A = OpAt(pass1(b)), right = a.t))
 
       // Add vertical row index concatenation for rbind() on DrmLike[Int] fragments
-      case op@OpRbind(a, b) if (implicitly[ClassTag[K]] == ClassTag.Int) =>
+      case op@OpRbind(a, b) if (implicitly[ClassTag[K]] == ClassTag.Int) ⇒
 
         // Make sure closure sees only local vals, not attributes. We need to do these ugly casts
         // around because compiler could not infer that K is the same as Int, based on if() above.
         val ma = safeToNonNegInt(a.nrow)
-        val bAdjusted = new OpMapBlock[Int, Int](
-          A = pass1(b.asInstanceOf[DrmLike[Int]]),
-          bmf = {
-            case (keys, block) => keys.map(_ + ma) -> block
-          },
-          identicallyPartitioned = false
-        )
+        val bAdjusted = new OpMapBlock[Int, Int](A = pass1(b.asInstanceOf[DrmLike[Int]]), bmf = {
+          case (keys, block) ⇒ keys.map(_ + ma) → block
+        }, identicallyPartitioned = false)
         val aAdjusted = a.asInstanceOf[DrmLike[Int]]
         OpRbind(pass1(aAdjusted), bAdjusted).asInstanceOf[DrmLike[K]]
 
       // Stop at checkpoints
-      case cd: CheckpointedDrm[_] => action
+      case cd: CheckpointedDrm[_] ⇒ action
 
       // For everything else we just pass-thru the operator arguments to optimizer
-      case uop: AbstractUnaryOp[_, K] =>
+      case uop: AbstractUnaryOp[_, K] ⇒
         uop.A = pass1(uop.A)(uop.classTagA)
         uop
-      case bop: AbstractBinaryOp[_, _, K] =>
+
+      case bop: AbstractBinaryOp[_, _, K] ⇒
         bop.A = pass1(bop.A)(bop.classTagA)
         bop.B = pass1(bop.B)(bop.classTagB)
         bop
@@ -160,17 +192,30 @@ object DistributedEngine {
   /** This would remove stuff like A.t.t that previous step may have created */
   private def pass2[K: ClassTag](action: DrmLike[K]): DrmLike[K] = {
     action match {
+
+      // Fusion of unary funcs into single, like 1 + x * x.
+      // Since we repeating the pass over self after rewrite, we dont' need to descend into arguments
+      // recursively here.
+      case op1@OpAewUnaryFunc(op2@OpAewUnaryFunc(a, _, _), _, _) ⇒
+        pass2(OpAewUnaryFuncFusion(a, op1 :: op2 :: Nil))
+
+      // Fusion one step further, like 1 + 2 * x * x. All should be rewritten as one UnaryFuncFusion.
+      // Since we repeating the pass over self after rewrite, we dont' need to descend into arguments
+      // recursively here.
+      case op@OpAewUnaryFuncFusion(op2@OpAewUnaryFunc(a, _, _), _) ⇒
+        pass2(OpAewUnaryFuncFusion(a, op.ff :+ op2))
+
       // A.t.t => A
-      case OpAt(top@OpAt(a)) => pass2(a)(top.classTagA)
+      case OpAt(top@OpAt(a)) ⇒  pass2(a)(top.classTagA)
 
       // Stop at checkpoints
-      case cd: CheckpointedDrm[_] => action
+      case cd: CheckpointedDrm[_] ⇒  action
 
       // For everything else we just pass-thru the operator arguments to optimizer
-      case uop: AbstractUnaryOp[_, K] =>
+      case uop: AbstractUnaryOp[_, K] ⇒
         uop.A = pass2(uop.A)(uop.classTagA)
         uop
-      case bop: AbstractBinaryOp[_, _, K] =>
+      case bop: AbstractBinaryOp[_, _, K] ⇒
         bop.A = pass2(bop.A)(bop.classTagA)
         bop.B = pass2(bop.B)(bop.classTagB)
         bop
@@ -182,29 +227,29 @@ object DistributedEngine {
     action match {
 
       // matrix products.
-      case OpAB(a, OpAt(b)) => OpABt(pass3(a), pass3(b))
+      case OpAB(a, OpAt(b)) ⇒  OpABt(pass3(a), pass3(b))
 
       // AtB cases that make sense.
-      case OpAB(OpAt(a), b) if (a.partitioningTag == b.partitioningTag) => OpAtB(pass3(a), pass3(b))
-      case OpABAnyKey(OpAtAnyKey(a), b) => OpAtB(pass3(a), pass3(b))
+      case OpAB(OpAt(a), b) if (a.partitioningTag == b.partitioningTag) ⇒  OpAtB(pass3(a), pass3(b))
+      case OpABAnyKey(OpAtAnyKey(a), b) ⇒  OpAtB(pass3(a), pass3(b))
 
       // Need some cost to choose between the following.
 
-      case OpAB(OpAt(a), b) => OpAtB(pass3(a), pass3(b))
+      case OpAB(OpAt(a), b) ⇒  OpAtB(pass3(a), pass3(b))
       //      case OpAB(OpAt(a), b) => OpAt(OpABt(OpAt(pass1(b)), pass1(a)))
-      case OpAB(a, b) => OpABt(pass3(a), OpAt(pass3(b)))
+      case OpAB(a, b) ⇒  OpABt(pass3(a), OpAt(pass3(b)))
 
       // Rewrite A'x
-      case op@OpAx(op1@OpAt(a), x) => OpAtx(pass3(a)(op1.classTagA), x)
+      case op@OpAx(op1@OpAt(a), x) ⇒  OpAtx(pass3(a)(op1.classTagA), x)
 
       // Stop at checkpoints
-      case cd: CheckpointedDrm[_] => action
+      case cd: CheckpointedDrm[_] ⇒  action
 
       // For everything else we just pass-thru the operator arguments to optimizer
-      case uop: AbstractUnaryOp[_, K] =>
+      case uop: AbstractUnaryOp[_, K] ⇒
         uop.A = pass3(uop.A)(uop.classTagA)
         uop
-      case bop: AbstractBinaryOp[_, _, K] =>
+      case bop: AbstractBinaryOp[_, _, K] ⇒
         bop.A = pass3(bop.A)(bop.classTagA)
         bop.B = pass3(bop.B)(bop.classTagB)
         bop
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmDoubleScalarOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmDoubleScalarOps.scala
index e5cf563c66..96ef893987 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmDoubleScalarOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmDoubleScalarOps.scala
@@ -18,7 +18,11 @@
 package org.apache.mahout.math.drm
 
 import RLikeDrmOps._
-import scala.reflect.ClassTag
+import org.apache.mahout.math._
+import org.apache.mahout.math.drm.logical.OpCbindScalar
+import scalabindings._
+import RLikeOps._
+import reflect.ClassTag
 
 class DrmDoubleScalarOps(val x:Double) extends AnyVal{
 
@@ -30,4 +34,6 @@ class DrmDoubleScalarOps(val x:Double) extends AnyVal{
 
   def /[K:ClassTag](that:DrmLike[K]) = x /: that
 
+  def cbind[K: ClassTag](that: DrmLike[K]) = OpCbindScalar(A = that, x = x, leftBind = true)
+
 }
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmLikeOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmLikeOps.scala
index bc937d6c4b..19432d0599 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmLikeOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/DrmLikeOps.scala
@@ -49,7 +49,7 @@ class DrmLikeOps[K: ClassTag](protected[drm] val drm: DrmLike[K]) {
    *             is applied.
    */
   def par(min: Int = -1, exact: Int = -1, auto: Boolean = false) = {
-    assert(min >= 0 || exact >= 0 || auto, "Invalid argument")
+    require(min > 0 || exact > 0 || auto, "Invalid argument")
     OpPar(drm, minSplits = min, exactSplits = exact)
   }
 
@@ -65,16 +65,15 @@ class DrmLikeOps[K: ClassTag](protected[drm] val drm: DrmLike[K]) {
    * @tparam R
    * @return
    */
-  def mapBlock[R: ClassTag](ncol: Int = -1, identicallyParitioned: Boolean = true)
+  def mapBlock[R: ClassTag](ncol: Int = -1, identicallyPartitioned: Boolean = true)
       (bmf: BlockMapFunc[K, R]): DrmLike[R] =
     new OpMapBlock[K, R](
       A = drm,
       bmf = bmf,
       _ncol = ncol,
-      identicallyPartitioned = identicallyParitioned
+      identicallyPartitioned = identicallyPartitioned
     )
 
-
   /**
    * Slicing the DRM. Should eventually work just like in-core drm (e.g. A(0 until 5, 5 until 15)).<P>
    *
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/RLikeDrmOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/RLikeDrmOps.scala
index 380f4eb723..7927e51ebf 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/RLikeDrmOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/RLikeDrmOps.scala
@@ -18,12 +18,17 @@
 package org.apache.mahout.math.drm
 
 import scala.reflect.ClassTag
+import collection._
+import JavaConversions._
 import org.apache.mahout.math.{Vector, Matrix}
 import org.apache.mahout.math.drm.logical._
+import org.apache.mahout.math.scalabindings._
+import RLikeOps._
 
 class RLikeDrmOps[K: ClassTag](drm: DrmLike[K]) extends DrmLikeOps[K](drm) {
 
   import RLikeDrmOps._
+  import org.apache.mahout.math.scalabindings._
 
   def +(that: DrmLike[K]): DrmLike[K] = OpAewB[K](A = this, B = that, op = "+")
 
@@ -33,21 +38,23 @@ class RLikeDrmOps[K: ClassTag](drm: DrmLike[K]) extends DrmLikeOps[K](drm) {
 
   def /(that: DrmLike[K]): DrmLike[K] = OpAewB[K](A = this, B = that, op = "/")
 
-  def +(that: Double): DrmLike[K] = OpAewScalar[K](A = this, scalar = that, op = "+")
+  def +(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = _ + that, evalZeros = true)
 
-  def +:(that: Double): DrmLike[K] = OpAewScalar[K](A = this, scalar = that, op = "+")
+  def +:(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = that + _, evalZeros = true)
 
-  def -(that: Double): DrmLike[K] = OpAewScalar[K](A = this, scalar = that, op = "-")
+  def -(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = _ - that, evalZeros = true)
 
-  def -:(that: Double): DrmLike[K] = OpAewScalar[K](A = this, scalar = that, op = "-:")
+  def -:(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = that - _, evalZeros = true)
 
-  def *(that: Double): DrmLike[K] = OpAewScalar[K](A = this, scalar = that, op = "*")
+  def *(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = _ * that)
 
-  def *:(that: Double): DrmLike[K] = OpAewScalar[K](A = this, scalar = that, op = "*")
+  def *:(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = that * _)
 
-  def /(that: Double): DrmLike[K] = OpAewScalar[K](A = this, scalar = that, op = "/")
+  def ^(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = math.pow(_, that))
 
-  def /:(that: Double): DrmLike[K] = OpAewScalar[K](A = this, scalar = that, op = "/:")
+  def /(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = _ / that, evalZeros = that == 0.0)
+
+  def /:(that: Double): DrmLike[K] = OpAewUnaryFunc[K](A = this, f = that / _, evalZeros = true)
 
   def :%*%(that: DrmLike[Int]): DrmLike[K] = OpAB[K](A = this.drm, B = that)
 
@@ -65,18 +72,36 @@ class RLikeDrmOps[K: ClassTag](drm: DrmLike[K]) extends DrmLikeOps[K](drm) {
 
   def t: DrmLike[Int] = OpAtAnyKey(A = drm)
 
-  def cbind(that: DrmLike[K]) = OpCbind(A = this.drm, B = that)
+  def cbind(that: DrmLike[K]): DrmLike[K] = OpCbind(A = this.drm, B = that)
+
+  def cbind(that: Double): DrmLike[K] = OpCbindScalar(A = this.drm, x = that, leftBind = false)
+
+  def rbind(that: DrmLike[K]): DrmLike[K] = OpRbind(A = this.drm, B = that)
 
-  def rbind(that: DrmLike[K]) = OpRbind(A = this.drm, B = that)
+  /**
+   * `rowSums` method for non-int keyed matrices.
+   *
+   * Slight problem here is the limitation of in-memory representation of Colt's Matrix, which can
+   * only have String row labels. Therefore, internally we do ".toString()" call on each key object,
+   * and then put it into [[Matrix]] row label bindings, at which point they are coerced to be Strings.
+   *
+   * This is obviously a suboptimal behavior, so as TODO we have here future enhancements of `collect'.
+   *
+   * @return map of row keys into row sums, front-end collected.
+   */
+  def rowSumsMap(): Map[String, Double] = {
+    val m = drm.mapBlock(ncol = 1) { case (keys, block) =>
+      keys -> dense(block.rowSums).t
+    }.collect
+    m.getRowLabelBindings.map { case (key, idx) => key -> m(idx, 0)}
+  }
 }
 
 class RLikeDrmIntOps(drm: DrmLike[Int]) extends RLikeDrmOps[Int](drm) {
 
   import org.apache.mahout.math._
   import scalabindings._
-  import RLikeOps._
   import RLikeDrmOps._
-  import scala.collection.JavaConversions._
 
   override def t: DrmLike[Int] = OpAt(A = drm)
 
@@ -108,7 +133,7 @@ class RLikeDrmIntOps(drm: DrmLike[Int]) extends RLikeDrmOps[Int](drm) {
       // Collect block-wise row means and output them as one-column matrix.
       keys -> dense(block.rowMeans).t
     }
-        .collect(::, 0)
+      .collect(::, 0)
   }
 
   /** Return diagonal vector */
@@ -117,14 +142,14 @@ class RLikeDrmIntOps(drm: DrmLike[Int]) extends RLikeDrmOps[Int](drm) {
     drm.mapBlock(ncol = 1) { case (keys, block) =>
       keys -> dense(for (r <- block.view) yield r(keys(r.index))).t
     }
-        .collect(::, 0)
+      .collect(::, 0)
   }
 
 }
 
 object RLikeDrmOps {
 
-  implicit def double2ScalarOps(x:Double) = new DrmDoubleScalarOps(x)
+  implicit def double2ScalarOps(x: Double) = new DrmDoubleScalarOps(x)
 
   implicit def drmInt2RLikeOps(drm: DrmLike[Int]): RLikeDrmIntOps = new RLikeDrmIntOps(drm)
 
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/AbstractUnaryOp.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/AbstractUnaryOp.scala
index a445f212ef..60b2c7700f 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/AbstractUnaryOp.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/AbstractUnaryOp.scala
@@ -24,7 +24,7 @@ import org.apache.mahout.math.drm.{DistributedContext, DrmLike}
 abstract class AbstractUnaryOp[A: ClassTag, K: ClassTag]
     extends CheckpointAction[K] with DrmLike[K] {
 
-  protected[drm] var A: DrmLike[A]
+  protected[mahout] var A: DrmLike[A]
 
   lazy val context: DistributedContext = A.context
 
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/CheckpointAction.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/CheckpointAction.scala
index aa3a3b9eb9..a7934a3c41 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/CheckpointAction.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/CheckpointAction.scala
@@ -37,7 +37,8 @@ abstract class CheckpointAction[K: ClassTag] extends DrmLike[K] {
    */
   def checkpoint(cacheHint: CacheHint.CacheHint): CheckpointedDrm[K] = cp match {
     case None =>
-      val physPlan = context.toPhysical(context.optimizerRewrite(this), cacheHint)
+      val plan = context.optimizerRewrite(this)
+      val physPlan = context.toPhysical(plan, cacheHint)
       cp = Some(physPlan)
       physPlan
     case Some(cp) => cp
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewScalar.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewScalar.scala
index 19a910c327..dbcb366738 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewScalar.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewScalar.scala
@@ -21,7 +21,11 @@ import scala.reflect.ClassTag
 import org.apache.mahout.math.drm.DrmLike
 import scala.util.Random
 
-/** Operator denoting expressions like 5.0 - A or A * 5.6 */
+/**
+ * Operator denoting expressions like 5.0 - A or A * 5.6
+ *
+ * @deprecated use [[OpAewUnaryFunc]] instead
+ */
 case class OpAewScalar[K: ClassTag](
     override var A: DrmLike[K],
     val scalar: Double,
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFunc.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFunc.scala
new file mode 100644
index 0000000000..71489ab268
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFunc.scala
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.drm.logical
+
+import scala.reflect.ClassTag
+import org.apache.mahout.math.drm.DrmLike
+import scala.util.Random
+
+/**
+ * @author dmitriy
+ */
+case class OpAewUnaryFunc[K: ClassTag](
+    override var A: DrmLike[K],
+    val f: (Double) => Double,
+    val evalZeros:Boolean = false
+    ) extends AbstractUnaryOp[K,K] with TEwFunc {
+
+  override protected[mahout] lazy val partitioningTag: Long =
+    if (A.canHaveMissingRows)
+      Random.nextLong()
+    else A.partitioningTag
+
+  /** Stuff like `A +1` is always supposed to fix this */
+  override protected[mahout] lazy val canHaveMissingRows: Boolean = false
+
+  /** R-like syntax for number of rows. */
+  def nrow: Long = A.nrow
+
+  /** R-like syntax for number of columns */
+  def ncol: Int = A.ncol
+}
+
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFuncFusion.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFuncFusion.scala
new file mode 100644
index 0000000000..ed95f4facd
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpAewUnaryFuncFusion.scala
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.drm.logical
+
+import scala.reflect.ClassTag
+import org.apache.mahout.math.drm.DrmLike
+import scala.util.Random
+import collection._
+
+/**
+ * Composition of unary elementwise functions.
+ */
+case class OpAewUnaryFuncFusion[K: ClassTag](
+    override var A: DrmLike[K],
+    var ff:List[OpAewUnaryFunc[K]] = Nil
+    ) extends AbstractUnaryOp[K,K] with TEwFunc {
+
+  override protected[mahout] lazy val partitioningTag: Long =
+    if (A.canHaveMissingRows)
+      Random.nextLong()
+    else A.partitioningTag
+
+  /** Stuff like `A +1` is always supposed to fix this */
+  override protected[mahout] lazy val canHaveMissingRows: Boolean = false
+
+  /** R-like syntax for number of rows. */
+  def nrow: Long = A.nrow
+
+  /** R-like syntax for number of columns */
+  def ncol: Int = A.ncol
+
+  /** Apply to degenerate elements? */
+  override def evalZeros: Boolean = ff.exists(_.evalZeros)
+
+  /** the function itself */
+  override def f: (Double) => Double = {
+
+    // Make sure composed collection becomes an attribute of this closure because we will be sending
+    // it to the backend.
+    val composedFunc = ff.map(_.f)
+
+    // Create functional closure and return.
+    (x: Double) => (composedFunc :\ x) { case (f, xarg) => f(xarg)}
+
+  }
+}
+
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbind.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbind.scala
index 14252646fa..0598551bee 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbind.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbind.scala
@@ -17,7 +17,7 @@
 
 package org.apache.mahout.math.drm.logical
 
-import scala.reflect.ClassTag
+import reflect.ClassTag
 import org.apache.mahout.math.drm.DrmLike
 import scala.util.Random
 
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbindScalar.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbindScalar.scala
new file mode 100644
index 0000000000..5aee518dac
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpCbindScalar.scala
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.math.drm.logical
+
+import reflect.ClassTag
+import org.apache.mahout.math.drm.DrmLike
+
+case class OpCbindScalar[K:ClassTag](
+  override var A:DrmLike[K],
+  var x:Double,
+  val leftBind:Boolean ) extends AbstractUnaryOp[K,K] {
+
+  override protected[mahout] lazy val canHaveMissingRows: Boolean = false
+
+  override protected[mahout] lazy val partitioningTag: Long = A.partitioningTag
+
+  /** R-like syntax for number of rows. */
+  def nrow: Long = A.nrow
+
+  /** R-like syntax for number of columns */
+  def ncol: Int = A.ncol + 1
+
+}
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpMapBlock.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpMapBlock.scala
index 7299d9e3f1..a1cd71884c 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpMapBlock.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/OpMapBlock.scala
@@ -23,7 +23,7 @@ import RLikeOps._
 import org.apache.mahout.math.drm.{BlockMapFunc, DrmLike}
 import scala.util.Random
 
-class OpMapBlock[S: ClassTag, R: ClassTag](
+case class OpMapBlock[S: ClassTag, R: ClassTag](
     override var A: DrmLike[S],
     val bmf: BlockMapFunc[S, R],
     val _ncol: Int = -1,
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/TEwFunc.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/TEwFunc.scala
new file mode 100644
index 0000000000..0eb5f65abe
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/logical/TEwFunc.scala
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.drm.logical
+
+/**
+ * Trait denoting logical operators providing elementwise operations that work as unary operators
+ * on each element of a matrix.
+ */
+trait TEwFunc {
+
+  /** Apply to degenerate elments? */
+  def evalZeros: Boolean
+
+  /** the function itself */
+  def f: (Double) => Double
+
+  /**
+   * Self assignment ok? If yes, may cause side effects if works off non-serialized cached object
+   * tree!
+   */
+  def selfAssignOk: Boolean = false
+}
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/drm/package.scala b/math-scala/src/main/scala/org/apache/mahout/math/drm/package.scala
index 1fae831993..d865b58eb1 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/drm/package.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/drm/package.scala
@@ -23,6 +23,8 @@ import org.apache.mahout.math.scalabindings.RLikeOps._
 import org.apache.mahout.math.scalabindings._
 
 import scala.reflect.ClassTag
+import org.apache.mahout.math.drm.logical.OpAewUnaryFunc
+import collection._
 
 package object drm {
 
@@ -34,7 +36,11 @@ package object drm {
 
 
   /** Block-map func */
-  type BlockMapFunc[S, R] = BlockifiedDrmTuple[S] => BlockifiedDrmTuple[R]
+  type BlockMapFunc[S, R] = BlockifiedDrmTuple[S] ⇒ BlockifiedDrmTuple[R]
+
+  type BlockMapFunc2[S] = BlockifiedDrmTuple[S] ⇒ Matrix
+
+  type BlockReduceFunc = (Matrix, Matrix) ⇒ Matrix
 
   /** CacheHint type */
   //  type CacheHint = CacheHint.CacheHint
@@ -92,7 +98,7 @@ package object drm {
   implicit def drm2InCore[K: ClassTag](drm: DrmLike[K]): Matrix = drm.collect
 
   /** Do vertical concatenation of collection of blockified tuples */
-  def rbind[K: ClassTag](blocks: Iterable[BlockifiedDrmTuple[K]]): BlockifiedDrmTuple[K] = {
+  private[mahout] def rbind[K: ClassTag](blocks: Iterable[BlockifiedDrmTuple[K]]): BlockifiedDrmTuple[K] = {
     assert(blocks.nonEmpty, "rbind: 0 blocks passed in")
     if (blocks.size == 1) {
       // No coalescing required.
@@ -115,6 +121,46 @@ package object drm {
     }
   }
 
+  /**
+   * Convert arbitrarily-keyed matrix to int-keyed matrix. Some algebra will accept only int-numbered
+   * row matrices. So this method is to help.
+   *
+   * @param drmX input to be transcoded
+   * @param computeMap collect `old key -> int key` map to front-end?
+   * @tparam K key type
+   * @return Sequentially keyed matrix + (optionally) map from non-int key to [[Int]] key. If the
+   *         key type is actually Int, then we just return the argument with None for the map,
+   *         regardless of computeMap parameter.
+   */
+  def drm2IntKeyed[K: ClassTag](drmX: DrmLike[K], computeMap: Boolean = false): (DrmLike[Int], Option[DrmLike[K]]) =
+    drmX.context.engine.drm2IntKeyed(drmX, computeMap)
+
+  /**
+   * (Optional) Sampling operation. Consistent with Spark semantics of the same.
+   * @param drmX
+   * @param fraction
+   * @param replacement
+   * @tparam K
+   * @return samples
+   */
+  def drmSampleRows[K: ClassTag](drmX: DrmLike[K], fraction: Double, replacement: Boolean = false): DrmLike[K] =
+    drmX.context.engine.drmSampleRows(drmX, fraction, replacement)
+
+  def drmSampleKRows[K: ClassTag](drmX: DrmLike[K], numSamples: Int, replacement: Boolean = false): Matrix =
+    drmX.context.engine.drmSampleKRows(drmX, numSamples, replacement)
+
+  ///////////////////////////////////////////////////////////
+  // Elementwise unary functions on distributed operands.
+  def dexp[K: ClassTag](drmA: DrmLike[K]): DrmLike[K] = new OpAewUnaryFunc[K](drmA, math.exp, true)
+
+  def dlog[K: ClassTag](drmA: DrmLike[K]): DrmLike[K] = new OpAewUnaryFunc[K](drmA, math.log, true)
+
+  def dabs[K: ClassTag](drmA: DrmLike[K]): DrmLike[K] = new OpAewUnaryFunc[K](drmA, math.abs)
+
+  def dsqrt[K: ClassTag](drmA: DrmLike[K]): DrmLike[K] = new OpAewUnaryFunc[K](drmA, math.sqrt)
+
+  def dsignum[K: ClassTag](drmA: DrmLike[K]): DrmLike[K] = new OpAewUnaryFunc[K](drmA, math.signum)
+
 }
 
 package object indexeddataset {
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MMul.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MMul.scala
new file mode 100644
index 0000000000..d0fd39341b
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MMul.scala
@@ -0,0 +1,295 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.scalabindings
+
+import org.apache.mahout.math._
+import org.apache.mahout.math.flavor.{BackEnum, TraversingStructureEnum}
+import org.apache.mahout.math.function.Functions
+import RLikeOps._
+import org.apache.mahout.logging._
+
+import scala.collection.JavaConversions._
+import scala.collection._
+
+object MMul extends MMBinaryFunc {
+
+  private final implicit val log = getLog(MMul.getClass)
+
+  override def apply(a: Matrix, b: Matrix, r: Option[Matrix]): Matrix = {
+
+    require(a.ncol == b.nrow, "Incompatible matrix sizes in matrix multiplication.")
+
+    val (af, bf) = (a.getFlavor, b.getFlavor)
+    val backs = (af.getBacking, bf.getBacking)
+    val sd = (af.getStructure, af.isDense, bf.getStructure, bf.isDense)
+
+    val alg: MMulAlg = backs match {
+
+      // Both operands are jvm memory backs.
+      case (BackEnum.JVMMEM, BackEnum.JVMMEM) ⇒
+
+        sd match {
+
+          // Multiplication cases by a diagonal matrix.
+          case (TraversingStructureEnum.VECTORBACKED, _, TraversingStructureEnum.COLWISE, _) if (a
+            .isInstanceOf[DiagonalMatrix]) ⇒ jvmDiagCW _
+          case (TraversingStructureEnum.VECTORBACKED, _, TraversingStructureEnum.SPARSECOLWISE, _) if (a
+            .isInstanceOf[DiagonalMatrix]) ⇒ jvmDiagCW _
+          case (TraversingStructureEnum.VECTORBACKED, _, TraversingStructureEnum.ROWWISE, _) if (a
+            .isInstanceOf[DiagonalMatrix]) ⇒ jvmDiagRW _
+          case (TraversingStructureEnum.VECTORBACKED, _, TraversingStructureEnum.SPARSEROWWISE, _) if (a
+            .isInstanceOf[DiagonalMatrix]) ⇒ jvmDiagRW _
+
+          case (TraversingStructureEnum.COLWISE, _, TraversingStructureEnum.VECTORBACKED, _) if (b
+            .isInstanceOf[DiagonalMatrix]) ⇒ jvmCWDiag _
+          case (TraversingStructureEnum.SPARSECOLWISE, _, TraversingStructureEnum.VECTORBACKED, _) if (b
+            .isInstanceOf[DiagonalMatrix]) ⇒ jvmCWDiag _
+          case (TraversingStructureEnum.ROWWISE, _, TraversingStructureEnum.VECTORBACKED, _) if (b
+            .isInstanceOf[DiagonalMatrix]) ⇒ jvmRWDiag _
+          case (TraversingStructureEnum.SPARSEROWWISE, _, TraversingStructureEnum.VECTORBACKED, _) if (b
+            .isInstanceOf[DiagonalMatrix]) ⇒ jvmRWDiag _
+
+          // Dense-dense cases
+          case (TraversingStructureEnum.ROWWISE, true, TraversingStructureEnum.COLWISE, true) if (a eq b.t) ⇒ jvmDRWAAt _
+          case (TraversingStructureEnum.ROWWISE, true, TraversingStructureEnum.COLWISE, true) if (a.t eq b) ⇒ jvmDRWAAt _
+          case (TraversingStructureEnum.ROWWISE, true, TraversingStructureEnum.COLWISE, true) ⇒ jvmRWCW
+          case (TraversingStructureEnum.ROWWISE, true, TraversingStructureEnum.ROWWISE, true) ⇒ jvmRWRW
+          case (TraversingStructureEnum.COLWISE, true, TraversingStructureEnum.COLWISE, true) ⇒ jvmCWCW
+          case (TraversingStructureEnum.COLWISE, true, TraversingStructureEnum.ROWWISE, true) if ( a eq b.t) ⇒ jvmDCWAAt _
+          case (TraversingStructureEnum.COLWISE, true, TraversingStructureEnum.ROWWISE, true) if ( a.t eq b) ⇒ jvmDCWAAt _
+          case (TraversingStructureEnum.COLWISE, true, TraversingStructureEnum.ROWWISE, true) ⇒ jvmCWRW
+
+          // Sparse row matrix x sparse row matrix (array of vectors)
+          case (TraversingStructureEnum.ROWWISE, false, TraversingStructureEnum.ROWWISE, false) ⇒ jvmSparseRWRW
+          case (TraversingStructureEnum.ROWWISE, false, TraversingStructureEnum.COLWISE, false) ⇒ jvmSparseRWCW
+          case (TraversingStructureEnum.COLWISE, false, TraversingStructureEnum.ROWWISE, false) ⇒ jvmSparseCWRW
+          case (TraversingStructureEnum.COLWISE, false, TraversingStructureEnum.COLWISE, false) ⇒ jvmSparseCWCW
+
+          // Sparse matrix x sparse matrix (hashtable of vectors)
+          case (TraversingStructureEnum.SPARSEROWWISE, false, TraversingStructureEnum.SPARSEROWWISE, false) ⇒
+            jvmSparseRowRWRW
+          case (TraversingStructureEnum.SPARSEROWWISE, false, TraversingStructureEnum.SPARSECOLWISE, false) ⇒
+            jvmSparseRowRWCW
+          case (TraversingStructureEnum.SPARSECOLWISE, false, TraversingStructureEnum.SPARSEROWWISE, false) ⇒
+            jvmSparseRowCWRW
+          case (TraversingStructureEnum.SPARSECOLWISE, false, TraversingStructureEnum.SPARSECOLWISE, false) ⇒
+            jvmSparseRowCWCW
+
+          // Sparse matrix x non-like
+          case (TraversingStructureEnum.SPARSEROWWISE, false, TraversingStructureEnum.ROWWISE, _) ⇒ jvmSparseRowRWRW
+          case (TraversingStructureEnum.SPARSEROWWISE, false, TraversingStructureEnum.COLWISE, _) ⇒ jvmSparseRowRWCW
+          case (TraversingStructureEnum.SPARSECOLWISE, false, TraversingStructureEnum.ROWWISE, _) ⇒ jvmSparseRowCWRW
+          case (TraversingStructureEnum.SPARSECOLWISE, false, TraversingStructureEnum.COLWISE, _) ⇒ jvmSparseCWCW
+          case (TraversingStructureEnum.ROWWISE, _, TraversingStructureEnum.SPARSEROWWISE, false) ⇒ jvmSparseRWRW
+          case (TraversingStructureEnum.ROWWISE, _, TraversingStructureEnum.SPARSECOLWISE, false) ⇒ jvmSparseRWCW
+          case (TraversingStructureEnum.COLWISE, _, TraversingStructureEnum.SPARSEROWWISE, false) ⇒ jvmSparseCWRW
+          case (TraversingStructureEnum.COLWISE, _, TraversingStructureEnum.SPARSECOLWISE, false) ⇒ jvmSparseRowCWCW
+
+          // Everything else including at least one sparse LHS or RHS argument
+          case (TraversingStructureEnum.ROWWISE, false, TraversingStructureEnum.ROWWISE, _) ⇒ jvmSparseRWRW
+          case (TraversingStructureEnum.ROWWISE, false, TraversingStructureEnum.COLWISE, _) ⇒ jvmSparseRWCW
+          case (TraversingStructureEnum.COLWISE, false, TraversingStructureEnum.ROWWISE, _) ⇒ jvmSparseCWRW
+          case (TraversingStructureEnum.COLWISE, false, TraversingStructureEnum.COLWISE, _) ⇒ jvmSparseCWCW2flips
+
+          // Sparse methods are only effective if the first argument is sparse, so we need to do a swap.
+          case (_, _, _, false) ⇒ { (a, b, r) ⇒ apply(b.t, a.t, r.map {_.t}).t }
+
+          // Default jvm-jvm case.
+          case _ ⇒ jvmRWCW
+        }
+    }
+
+    alg(a, b, r)
+  }
+
+  type MMulAlg = MMBinaryFunc
+
+  @inline
+  private def jvmRWCW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+
+    require(r.forall(mxR ⇒ mxR.nrow == a.nrow && mxR.ncol == b.ncol))
+    val (m, n) = (a.nrow, b.ncol)
+
+    val mxR = r.getOrElse(if (a.getFlavor.isDense) a.like(m, n) else b.like(m, n))
+
+    for (row ← 0 until mxR.nrow; col ← 0 until mxR.ncol) {
+      // this vector-vector should be sort of optimized, right?
+      mxR(row, col) = a(row, ::) dot b(::, col)
+    }
+    mxR
+  }
+
+
+  @inline
+  private def jvmRWRW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+
+    // A bit hackish: currently, this relies a bit on the fact that like produces RW(?)
+    val bclone = b.like(b.ncol, b.nrow).t
+    for (brow ← b) bclone(brow.index(), ::) := brow
+
+    require(bclone.getFlavor.getStructure == TraversingStructureEnum.COLWISE || bclone.getFlavor.getStructure ==
+      TraversingStructureEnum.SPARSECOLWISE, "COL wise conversion assumption of RHS is wrong, do over this code.")
+
+    jvmRWCW(a, bclone, r)
+  }
+
+  private def jvmCWCW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+    jvmRWRW(b.t, a.t, r.map(_.t)).t
+  }
+
+  private def jvmCWRW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+    // This is a primary contender with Outer Prod sum algo.
+    // Here, we force-reorient both matrices and run RWCW.
+    // A bit hackish: currently, this relies a bit on the fact that clone always produces RW(?)
+    val aclone = a.cloned
+
+    require(aclone.getFlavor.getStructure == TraversingStructureEnum.ROWWISE || aclone.getFlavor.getStructure ==
+      TraversingStructureEnum.SPARSEROWWISE, "Row wise conversion assumption of RHS is wrong, do over this code.")
+
+    jvmRWRW(aclone, b, r)
+  }
+
+  private def jvmSparseRWRW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+    val mxR = r.getOrElse(b.like(a.nrow, b.ncol))
+
+    // This is basically almost the algorithm from SparseMatrix.times
+    for (arow ← a; ael ← arow.nonZeroes)
+      mxR(arow.index(), ::).assign(b(ael.index, ::), Functions.plusMult(ael))
+
+    mxR
+  }
+
+  private def jvmSparseRowRWRW(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+    val mxR = r.getOrElse(b.like(a.nrow, b.ncol))
+    for (arow ← a.iterateNonEmpty(); ael ← arow.vector.nonZeroes)
+      mxR(arow.index(), ::).assign(b(ael.index, ::), Functions.plusMult(ael))
+
+    mxR
+  }
+
+  private def jvmSparseRowCWCW(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
+    jvmSparseRowRWRW(b.t, a.t, r.map(_.t)).t
+
+  private def jvmSparseRowCWCW2flips(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
+    jvmSparseRowRWRW(a cloned, b cloned, r)
+
+  private def jvmSparseRowRWCW(a: Matrix, b: Matrix, r: Option[Matrix]) =
+    jvmSparseRowRWRW(a, b cloned, r)
+
+
+  private def jvmSparseRowCWRW(a: Matrix, b: Matrix, r: Option[Matrix]) =
+    jvmSparseRowRWRW(a cloned, b, r)
+
+  private def jvmSparseRWCW(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
+    jvmSparseRWRW(a, b.cloned, r)
+
+  private def jvmSparseCWRW(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
+    jvmSparseRWRW(a cloned, b, r)
+
+  private def jvmSparseCWCW(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
+    jvmSparseRWRW(b.t, a.t, r.map(_.t)).t
+
+  private def jvmSparseCWCW2flips(a: Matrix, b: Matrix, r: Option[Matrix] = None) =
+    jvmSparseRWRW(a cloned, b cloned, r)
+
+  private def jvmDiagRW(diagm:Matrix, b:Matrix, r:Option[Matrix] = None):Matrix = {
+    val mxR = r.getOrElse(b.like(diagm.nrow, b.ncol))
+
+    for (del ← diagm.diagv.nonZeroes())
+      mxR(del.index, ::).assign(b(del.index, ::), Functions.plusMult(del))
+
+    mxR
+  }
+
+  private def jvmDiagCW(diagm: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+    val mxR = r.getOrElse(b.like(diagm.nrow, b.ncol))
+    for (bcol ← b.t) mxR(::, bcol.index()) := bcol * diagm.diagv
+    mxR
+  }
+
+  private def jvmCWDiag(a: Matrix, diagm: Matrix, r: Option[Matrix] = None) =
+    jvmDiagRW(diagm, a.t, r.map {_.t}).t
+
+  private def jvmRWDiag(a: Matrix, diagm: Matrix, r: Option[Matrix] = None) =
+    jvmDiagCW(diagm, a.t, r.map {_.t}).t
+
+
+  /** Dense column-wise AA' */
+  private def jvmDCWAAt(a:Matrix, b:Matrix, r:Option[Matrix] = None) = {
+    // a.t must be equiv. to b. Cloning must rewrite to row-wise.
+    jvmDRWAAt(a.cloned,null,r)
+  }
+
+  /** Dense Row-wise AA' */
+  private def jvmDRWAAt(a:Matrix, b:Matrix, r:Option[Matrix] = None) = {
+    // a.t must be equiv to b.
+
+    debug("AAt computation detected.")
+
+    // Check dimensions if result is supplied.
+    require(r.forall(mxR ⇒ mxR.nrow == a.nrow && mxR.ncol == a.nrow))
+
+    val mxR = r.getOrElse(a.like(a.nrow, a.nrow))
+
+    // This is symmetric computation. Compile upper triangular first.
+    for (row ← 0 until mxR.nrow) {
+      // diagonal value
+      mxR(row, row) = a(row, ::).aggregate(Functions.PLUS, Functions.SQUARE)
+
+      for ( col ← row + 1 until mxR.ncol) {
+        // this vector-vector should be sort of optimized, right?
+        val v = a(row, ::) dot a(col, ::)
+
+        mxR(row, col) = v
+        mxR(col,row) = v
+      }
+    }
+
+    mxR
+  }
+
+  private def jvmOuterProdSum(a: Matrix, b: Matrix, r: Option[Matrix] = None): Matrix = {
+
+    // This may be already laid out for outer product computation, which may be faster than reorienting
+    // both matrices? need to check.
+    val (m, n) = (a.nrow, b.ncol)
+
+    // Prefer col-wise result iff a is dense and b is sparse. In all other cases default to row-wise.
+    val preferColWiseR = a.getFlavor.isDense && !b.getFlavor.isDense
+
+    val mxR = r.getOrElse {
+      (a.getFlavor.isDense, preferColWiseR) match {
+        case (false, false) ⇒ b.like(m, n)
+        case (false, true) ⇒ b.like(n, m).t
+        case (true, false) ⇒ a.like(m, n)
+        case (true, true) ⇒ a.like(n, m).t
+      }
+    }
+
+    // Loop outer products
+    if (preferColWiseR) {
+      // this means B is sparse and A is not, so we need to iterate over b values and update R columns with +=
+      // one at a time.
+      for ((acol, brow) ← a.t.zip(b); bel ← brow.nonZeroes) mxR(::, bel.index()) += bel * acol
+    } else {
+      for ((acol, brow) ← a.t.zip(b); ael ← acol.nonZeroes()) mxR(ael.index(), ::) += ael * brow
+    }
+
+    mxR
+  }
+}
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatrixOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatrixOps.scala
index 910035fddb..3c0ae8962d 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatrixOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/MatrixOps.scala
@@ -17,8 +17,10 @@
 
 package org.apache.mahout.math.scalabindings
 
+import org.apache.mahout.math.flavor.TraversingStructureEnum
 import org.apache.mahout.math.{Matrices, QRDecomposition, Vector, Matrix}
-import scala.collection.JavaConversions._
+import collection._
+import JavaConversions._
 import org.apache.mahout.math.function.{DoubleDoubleFunction, VectorFunction, DoubleFunction, Functions}
 import scala.math._
 
@@ -41,6 +43,10 @@ class MatrixOps(val m: Matrix) {
 
   def +=(that: Matrix) = m.assign(that, Functions.PLUS)
 
+  def +=:(that:Matrix) = m += that
+
+  def +=:(that:Double) = m += that
+
   def -=(that: Matrix) = m.assign(that, Functions.MINUS)
 
   def +=(that: Double) = m.assign(new DoubleFunction {
@@ -70,24 +76,30 @@ class MatrixOps(val m: Matrix) {
 
   def -:(that: Double) = that -=: cloned
 
-
-  def norm = sqrt(m.aggregate(Functions.PLUS, Functions.SQUARE))
+  def norm = math.sqrt(m.aggregate(Functions.PLUS, Functions.SQUARE))
 
   def pnorm(p: Int) = pow(m.aggregate(Functions.PLUS, Functions.chain(Functions.ABS, Functions.pow(p))), 1.0 / p)
 
   def apply(row: Int, col: Int) = m.get(row, col)
 
-  def update(row: Int, col: Int, v: Double): Matrix = {
-    m.setQuick(row, col, v);
+  def update(row: Int, col: Int, that: Double): Matrix = {
+    m.setQuick(row, col, that);
     m
   }
 
+  def update(rowRange: Range, colRange: Range, that: Double) = apply(rowRange, colRange) := that
+
+  def update(row: Int, colRange: Range, that: Double) = apply(row, colRange) := that
+
+  def update(rowRange: Range, col: Int, that: Double) = apply(rowRange, col) := that
+
   def update(rowRange: Range, colRange: Range, that: Matrix) = apply(rowRange, colRange) := that
 
   def update(row: Int, colRange: Range, that: Vector) = apply(row, colRange) := that
 
   def update(rowRange: Range, col: Int, that: Vector) = apply(rowRange, col) := that
-
+  
+  
   def apply(rowRange: Range, colRange: Range): Matrix = {
 
     if (rowRange == :: &&
@@ -140,12 +152,60 @@ class MatrixOps(val m: Matrix) {
     })
   }
 
+  def :=(that: Double) = m.assign(that)
+
   def :=(f: (Int, Int, Double) => Double): Matrix = {
-    for (r <- 0 until nrow; c <- 0 until ncol) m(r, c) = f(r, c, m(r, c))
+    import RLikeOps._
+    m.getFlavor.getStructure match {
+      case TraversingStructureEnum.COLWISE | TraversingStructureEnum.SPARSECOLWISE =>
+        for (col <- t; el <- col.all) el := f(el.index, col.index, el)
+      case default =>
+        for (row <- m; el <- row.all) el := f(row.index, el.index, el)
+    }
+    m
+  }
+
+  /** Functional assign with (Double) => Double */
+  def :=(f: (Double) => Double): Matrix = {
+    import RLikeOps._
+    m.getFlavor.getStructure match {
+      case TraversingStructureEnum.COLWISE | TraversingStructureEnum.SPARSECOLWISE =>
+        for (col <- t; el <- col.all) el := f(el)
+      case default =>
+        for (row <- m; el <- row.all) el := f(el)
+    }
     m
   }
 
-  def cloned: Matrix = m.like := m
+  /** Sparse assign: iterate and assign over non-zeros only */
+  def ::=(f: (Int, Int, Double) => Double): Matrix = {
+
+    import RLikeOps._
+
+    m.getFlavor.getStructure match {
+      case TraversingStructureEnum.COLWISE | TraversingStructureEnum.SPARSECOLWISE =>
+        for (col <- t; el <- col.nonZeroes) el := f(el.index, col.index, el)
+      case default =>
+        for (row <- m; el <- row.nonZeroes) el := f(row.index, el.index, el)
+    }
+    m
+  }
+
+  /** Sparse function assign: iterate and assign over non-zeros only */
+  def ::=(f: (Double) => Double): Matrix = {
+
+    import RLikeOps._
+
+    m.getFlavor.getStructure match {
+      case TraversingStructureEnum.COLWISE | TraversingStructureEnum.SPARSECOLWISE =>
+        for (col <- t; el <- col.nonZeroes) el := f(el)
+      case default =>
+        for (row <- m; el <- row.nonZeroes) el := f(el)
+    }
+    m
+  }
+
+    def cloned: Matrix = m.like := m
 
   /**
    * Ideally, we would probably want to override equals(). But that is not
@@ -155,11 +215,14 @@ class MatrixOps(val m: Matrix) {
    * @return
    */
   def equiv(that: Matrix) =
+
+  // Warning: TODO: This would actually create empty objects in SparseMatrix. Should really implement
+  // merge-type comparison strategy using iterateNonEmpty.
     that != null &&
-        nrow == that.nrow &&
-        m.view.zip(that).forall(t => {
-          t._1.equiv(t._2)
-        })
+      nrow == that.nrow &&
+      m.view.zip(that).forall(t => {
+        t._1.equiv(t._2)
+      })
 
   def nequiv(that: Matrix) = !equiv(that)
 
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/DoubleScalarOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeDoubleScalarOps.scala
similarity index 69%
rename from math-scala/src/main/scala/org/apache/mahout/math/scalabindings/DoubleScalarOps.scala
rename to math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeDoubleScalarOps.scala
index 9fdd6e5805..a1e9377d29 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/DoubleScalarOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeDoubleScalarOps.scala
@@ -19,7 +19,7 @@ package org.apache.mahout.math.scalabindings
 
 import org.apache.mahout.math._
 
-class DoubleScalarOps(val x:Double) extends AnyVal{
+class RLikeDoubleScalarOps(val x:Double) extends AnyVal{
 
   import RLikeOps._
 
@@ -38,5 +38,26 @@ class DoubleScalarOps(val x:Double) extends AnyVal{
   def /(that:Matrix) = x /: that
 
   def /(that:Vector) = x /: that
+  
+  def cbind(that:Matrix) = {
+    val mx = that.like(that.nrow, that.ncol + 1)
+    mx(::, 1 until mx.ncol) := that
+    if (x != 0.0) mx(::, 0) := x
+    mx
+  }
+
+  def rbind(that: Matrix) = {
+    val mx = that.like(that.nrow + 1, that.ncol)
+    mx(1 until mx.nrow, ::) := that
+    if (x != 0.0) mx(0, ::) := x
+    mx
+  }
+
+  def c(that: Vector): Vector = {
+    val cv = that.like(that.length + 1)
+    cv(1 until cv.length) := that
+    cv(0) = x
+    cv
+  }
 
 }
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOps.scala
index 97e06cfbab..a2d62d6231 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOps.scala
@@ -16,18 +16,28 @@
  */
 package org.apache.mahout.math.scalabindings
 
+import org.apache.mahout.math.function.Functions
 import org.apache.mahout.math.{Vector, Matrix}
 import scala.collection.JavaConversions._
 import RLikeOps._
 
 class RLikeMatrixOps(m: Matrix) extends MatrixOps(m) {
 
+  /** Structure-optimized mmul */
+  def %*%(that: Matrix) = MMul(m, that, None)
+
+  def :%*%(that:Matrix) = %*%(that)
+
+  def %*%:(that: Matrix) = that :%*% m
+
   /**
-   * matrix-matrix multiplication
-   * @param that
-   * @return
+   * The "legacy" matrix-matrix multiplication.
+   *
+   * @param that right hand operand
+   * @return matrix multiplication result
+   * @deprecated use %*%
    */
-  def %*%(that: Matrix) = m.times(that)
+  def %***%(that: Matrix) = m.times(that)
 
   /**
    * matrix-vector multiplication
@@ -65,13 +75,16 @@ class RLikeMatrixOps(m: Matrix) extends MatrixOps(m) {
    * @param that
    */
   def *=(that: Matrix) = {
-    m.zip(that).foreach(t => t._1.vector *= t._2.vector)
+    m.assign(that, Functions.MULT)
     m
   }
 
+  /** A *=: B is equivalent to B *= A. Included for completeness. */
+  def *=:(that: Matrix) = m *= that
+
   /** Elementwise deletion */
   def /=(that: Matrix) = {
-    m.zip(that).foreach(t => t._1.vector() /= t._2.vector)
+    m.zip(that).foreach(t ⇒ t._1.vector() /= t._2.vector)
     m
   }
 
@@ -80,15 +93,55 @@ class RLikeMatrixOps(m: Matrix) extends MatrixOps(m) {
     m
   }
 
+  /** 5.0 *=: A is equivalent to A *= 5.0. Included for completeness. */
+  def *=:(that: Double) = m *= that
+
   def /=(that: Double) = {
-    m.foreach(_.vector() /= that)
+    m ::= { x ⇒ x / that }
     m
   }
 
   /** 1.0 /=: A is equivalent to A = 1.0/A in R */
   def /=:(that: Double) = {
-    m.foreach(that /=: _.vector())
+    if (that != 0.0) m := { x ⇒ that / x }
+    m
+  }
+
+  def ^=(that: Double) = {
+    m ::= { x ⇒ math.pow(x, that) }
     m
   }
+
+  def ^(that: Double) = m.cloned ^= that
+
+  def cbind(that: Matrix): Matrix = {
+    require(m.nrow == that.nrow)
+    val mx = m.like(m.nrow, m.ncol + that.ncol)
+    mx(::, 0 until m.ncol) := m
+    mx(::, m.ncol until mx.ncol) := that
+    mx
+  }
+
+  def cbind(that: Double): Matrix = {
+    val mx = m.like(m.nrow, m.ncol + 1)
+    mx(::, 0 until m.ncol) := m
+    if (that != 0.0) mx(::, m.ncol) := that
+    mx
+  }
+
+  def rbind(that: Matrix): Matrix = {
+    require(m.ncol == that.ncol)
+    val mx = m.like(m.nrow + that.nrow, m.ncol)
+    mx(0 until m.nrow, ::) := m
+    mx(m.nrow until mx.nrow, ::) := that
+    mx
+  }
+
+  def rbind(that: Double): Matrix = {
+    val mx = m.like(m.nrow + 1, m.ncol)
+    mx(0 until m.nrow, ::) := m
+    if (that != 0.0) mx(m.nrow, ::) := that
+    mx
+  }
 }
 
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeOps.scala
index ba3230497e..e10a01b9ea 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeOps.scala
@@ -24,13 +24,13 @@ import org.apache.mahout.math.{Vector, MatrixTimesOps, Matrix}
  */
 object RLikeOps {
 
-  implicit def double2Scalar(x:Double) = new DoubleScalarOps(x)
+  implicit def double2Scalar(x:Double) = new RLikeDoubleScalarOps(x)
 
   implicit def v2vOps(v: Vector) = new RLikeVectorOps(v)
 
   implicit def el2elOps(el: Vector.Element) = new ElementOps(el)
 
-  implicit def times2timesOps(m: MatrixTimesOps) = new RLikeTimesOps(m)
+  implicit def el2Double(el:Vector.Element) = el.get()
 
   implicit def m2mOps(m: Matrix) = new RLikeMatrixOps(m)
 
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeVectorOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeVectorOps.scala
index d2198bd0d9..cc73350f60 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeVectorOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeVectorOps.scala
@@ -17,7 +17,7 @@
 
 package org.apache.mahout.math.scalabindings
 
-import org.apache.mahout.math.Vector
+import org.apache.mahout.math.{Matrix, Vector}
 import org.apache.mahout.math.function.Functions
 import RLikeOps._
 
@@ -67,5 +67,28 @@ class RLikeVectorOps(_v: Vector) extends VectorOps(_v) {
   /** Elementwise right-associative / */
   def /:(that: Vector) = that.cloned /= v
 
+  def ^=(that: Double) = v.assign(Functions.POW, that)
+
+  def ^=(that: Vector) = v.assign(that, Functions.POW)
+
+  def ^(that: Double) = v.cloned ^= that
+
+  def ^(that: Vector) = v.cloned ^= that
+
+  def c(that:Vector) = {
+    val cv = v.like(v.length + that.length)
+    cv(0 until v.length) := cv
+    cv(v.length until cv.length) := that
+    cv
+  }
+
+  def c(that: Double) = {
+    val cv = v.like(v.length + 1)
+    cv(0 until v.length) := v
+    cv(v.length) = that
+    cv
+  }
+
+  def mean = sum / length
 
 }
\ No newline at end of file
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/VectorOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/VectorOps.scala
index c20354d21b..ef9c494c5f 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/VectorOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/VectorOps.scala
@@ -38,8 +38,13 @@ class VectorOps(private[scalabindings] val v: Vector) {
 
   def update(r: Range, that: Vector) = apply(r) := that
 
+  /** R-like synonyms for java methods on vectors */
   def sum = v.zSum()
 
+  def min = v.minValue()
+
+  def max = v.maxValue()
+
   def :=(that: Vector): Vector = {
 
     // assign op in Mahout requires same
@@ -58,11 +63,30 @@ class VectorOps(private[scalabindings] val v: Vector) {
 
   def :=(that: Double): Vector = v.assign(that)
 
+  /** Functional assigment for a function with index and x */
   def :=(f: (Int, Double) => Double): Vector = {
     for (i <- 0 until length) v(i) = f(i, v(i))
     v
   }
 
+  /** Functional assignment for a function with just x (e.g. v :=  math.exp _) */
+  def :=(f:(Double)=>Double):Vector = {
+    for (i <- 0 until length) v(i) = f(v(i))
+    v
+  }
+
+  /** Sparse iteration functional assignment using function receiving index and x */
+  def ::=(f: (Int, Double) => Double): Vector = {
+    for (el <- v.nonZeroes) el := f(el.index, el.get)
+    v
+  }
+
+  /** Sparse iteration functional assignment using a function recieving just x */
+  def ::=(f: (Double) => Double): Vector = {
+    for (el <- v.nonZeroes) el := f(el.get)
+    v
+  }
+
   def equiv(that: Vector) =
     length == that.length &&
         v.all.view.zip(that.all).forall(t => t._1.get == t._2.get)
@@ -121,21 +145,26 @@ class VectorOps(private[scalabindings] val v: Vector) {
 }
 
 class ElementOps(private[scalabindings] val el: Vector.Element) {
+  import RLikeOps._
+
+  def update(v: Double): Double = { el.set(v); v }
+
+  def :=(that: Double) = update(that)
 
-  def apply = el.get()
+  def *(that: Vector.Element): Double = this * that
 
-  def update(v: Double) = el.set(v)
+  def *(that: Vector): Vector = el.get * that
 
-  def :=(v: Double) = el.set(v)
+  def +(that: Vector.Element): Double = this + that
 
-  def +(that: Double) = el.get() + that
+  def +(that: Vector) :Vector = el.get + that
 
-  def -(that: Double) = el.get() - that
+  def /(that: Vector.Element): Double = this / that
 
-  def :-(that: Double) = that - el.get()
+  def /(that:Vector):Vector = el.get / that
 
-  def /(that: Double) = el.get() / that
+  def -(that: Vector.Element): Double = this - that
 
-  def :/(that: Double) = that / el.get()
+  def -(that: Vector) :Vector = el.get - that
 
 }
\ No newline at end of file
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/package.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/package.scala
index 36f510309f..20dc9cd853 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/package.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/package.scala
@@ -18,12 +18,15 @@
 package org.apache.mahout.math
 
 import org.apache.mahout.math.solver.EigenDecomposition
+import collection._
+import JavaConversions._
 
 /**
  * Mahout matrices and vectors' scala syntactic sugar
  */
 package object scalabindings {
 
+
   // Reserved "ALL" range
   final val `::`: Range = null
 
@@ -125,7 +128,6 @@ package object scalabindings {
     val data = for (r <- rows) yield {
       r match {
         case n: Number => Array(n.doubleValue())
-        case t: Product => t.productIterator.map(_.asInstanceOf[Number].doubleValue()).toArray
         case t: Vector => Array.tabulate(t.length)(t(_))
         case t: Array[Double] => t
         case t: Iterable[_] =>
@@ -138,6 +140,7 @@ package object scalabindings {
               }
               return m
           }
+        case t: Product => t.productIterator.map(_.asInstanceOf[Number].doubleValue()).toArray
         case t: Array[Array[Double]] => if (rows.size == 1)
           return new DenseMatrix(t)
         else
@@ -164,7 +167,7 @@ package object scalabindings {
    *   (0,5)::(9,3)::Nil,
    *   (2,3.5)::(7,8)::Nil
    * )
-   * 
+   *
    * }}}
    *
    * @param rows
@@ -172,11 +175,18 @@ package object scalabindings {
    */
 
   def sparse(rows: Vector*): SparseRowMatrix = {
-    import MatrixOps._
+    import RLikeOps._
     val nrow = rows.size
     val ncol = rows.map(_.size()).max
     val m = new SparseRowMatrix(nrow, ncol)
-    m := rows
+    m := rows.map { row =>
+      if (row.length < ncol) {
+        val newRow = row.like(ncol)
+        newRow(0 until row.length) := row
+        newRow
+      }
+      else row
+    }
     m
 
   }
@@ -249,23 +259,23 @@ package object scalabindings {
     (qrdec.getQ, qrdec.getR)
   }
 
- /**
-  * Solution <tt>X</tt> of <tt>A*X = B</tt> using QR-Decomposition, where <tt>A</tt> is a square, non-singular matrix.
+  /**
+   * Solution <tt>X</tt> of <tt>A*X = B</tt> using QR-Decomposition, where <tt>A</tt> is a square, non-singular matrix.
    *
    * @param a
    * @param b
    * @return (X)
    */
   def solve(a: Matrix, b: Matrix): Matrix = {
-   import MatrixOps._
-   if (a.nrow != a.ncol) {
-     throw new IllegalArgumentException("supplied matrix A is not square")
-   }
-   val qr = new QRDecomposition(a cloned)
-   if (!qr.hasFullRank) {
-     throw new IllegalArgumentException("supplied matrix A is singular")
-   }
-   qr.solve(b)
+    import MatrixOps._
+    if (a.nrow != a.ncol) {
+      throw new IllegalArgumentException("supplied matrix A is not square")
+    }
+    val qr = new QRDecomposition(a cloned)
+    if (!qr.hasFullRank) {
+      throw new IllegalArgumentException("supplied matrix A is singular")
+    }
+    qr.solve(b)
   }
 
   /**
@@ -293,5 +303,46 @@ package object scalabindings {
     x(::, 0)
   }
 
+  ///////////////////////////////////////////////////////////
+  // Elementwise unary functions. Actually this requires creating clones to avoid side effects. For
+  // efficiency reasons one may want to actually do in-place exression assignments instead, e.g.
+  //
+  // m := exp _
+
+  import RLikeOps._
+  import scala.math._
+
+  def mexp(m: Matrix): Matrix = m.cloned := exp _
+
+  def vexp(v: Vector): Vector = v.cloned := exp _
+
+  def mlog(m: Matrix): Matrix = m.cloned := log _
+
+  def vlog(v: Vector): Vector = v.cloned := log _
+
+  def mabs(m: Matrix): Matrix = m.cloned ::= (abs(_: Double))
+
+  def vabs(v: Vector): Vector = v.cloned ::= (abs(_: Double))
+
+  def msqrt(m: Matrix): Matrix = m.cloned ::= sqrt _
+
+  def vsqrt(v: Vector): Vector = v.cloned ::= sqrt _
+
+  def msignum(m: Matrix): Matrix = m.cloned ::= (signum(_: Double))
+
+  def vsignum(v: Vector): Vector = v.cloned ::= (signum(_: Double))
+
+  //////////////////////////////////////////////////////////
+  // operation funcs
+
+
+  /** Matrix-matrix unary func */
+  type MMUnaryFunc = (Matrix, Option[Matrix]) => Matrix
+  /** Binary matrix-matrix operations which may save result in-place, optionally */
+  type MMBinaryFunc = (Matrix, Matrix, Option[Matrix]) => Matrix
+  type MVBinaryFunc = (Matrix, Vector, Option[Matrix]) => Matrix
+  type VMBinaryFunc = (Vector, Matrix, Option[Matrix]) => Matrix
+  type MDBinaryFunc = (Matrix, Double, Option[Matrix]) => Matrix
+
 
 }
diff --git a/math-scala/src/main/scala/org/apache/mahout/util/IOUtilsScala.scala b/math-scala/src/main/scala/org/apache/mahout/util/IOUtilsScala.scala
new file mode 100644
index 0000000000..b61bea4658
--- /dev/null
+++ b/math-scala/src/main/scala/org/apache/mahout/util/IOUtilsScala.scala
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.util
+
+import org.apache.mahout.logging._
+import collection._
+import java.io.Closeable
+
+object IOUtilsScala {
+
+  private final implicit val log = getLog(IOUtilsScala.getClass)
+
+  /**
+   * Try to close every resource in the sequence, in order of the sequence.
+   *
+   * Report all encountered exceptions to logging.
+   *
+   * Rethrow last exception only (if any)
+   * @param closeables
+   */
+  def close(closeables: Seq[Closeable]) = {
+
+    var lastThr: Option[Throwable] = None
+    closeables.foreach { c =>
+      try {
+        c.close()
+      } catch {
+        case t: Throwable =>
+          error(t.getMessage, t)
+          lastThr = Some(t)
+      }
+    }
+
+    // Rethrow most recent close exception (can throw only one)
+    lastThr.foreach(throw _)
+  }
+
+  /**
+   * Same as [[IOUtilsScala.close( )]] but do not re-throw any exceptions.
+   * @param closeables
+   */
+  def closeQuietly(closeables: Seq[Closeable]) = {
+    try {
+      close(closeables)
+    } catch {
+      case t: Throwable => // NOP
+    }
+  }
+}
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeOpsSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeOpsSuiteBase.scala
index 849db688a4..bb42121add 100644
--- a/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeOpsSuiteBase.scala
+++ b/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeOpsSuiteBase.scala
@@ -46,6 +46,26 @@ trait DrmLikeOpsSuiteBase extends DistributedMahoutSuite with Matchers {
 
   }
 
+  test("allReduceBlock") {
+
+    val mxA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6))
+    val drmA = drmParallelize(mxA, numPartitions = 2)
+
+    try {
+      val mxB = drmA.allreduceBlock { case (keys, block) ⇒
+        block(::, 0 until 2).t %*% block(::, 2 until 3)
+      }
+
+      val mxControl = mxA(::, 0 until 2).t %*% mxA(::, 2 until 3)
+
+      (mxB - mxControl).norm should be < 1e-10
+
+    } catch {
+      case e: UnsupportedOperationException ⇒ // Some engines may not support this, so ignore.
+    }
+
+  }
+
   test("col range") {
     val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6))
     val A = drmParallelize(m = inCoreA, numPartitions = 2)
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeSuiteBase.scala
index 6c9313ce68..f215fb73de 100644
--- a/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeSuiteBase.scala
+++ b/math-scala/src/test/scala/org/apache/mahout/math/drm/DrmLikeSuiteBase.scala
@@ -68,9 +68,8 @@ trait DrmLikeSuiteBase extends DistributedMahoutSuite with Matchers {
     inCoreEmpty.nrow shouldBe 100
     inCoreEmpty.ncol shouldBe 50
 
+  }
 
 
 
-  }
-
 }
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/drm/RLikeDrmOpsSuiteBase.scala b/math-scala/src/test/scala/org/apache/mahout/math/drm/RLikeDrmOpsSuiteBase.scala
index 2e6204d046..b46ee3086c 100644
--- a/math-scala/src/test/scala/org/apache/mahout/math/drm/RLikeDrmOpsSuiteBase.scala
+++ b/math-scala/src/test/scala/org/apache/mahout/math/drm/RLikeDrmOpsSuiteBase.scala
@@ -24,7 +24,13 @@ import scalabindings._
 import RLikeOps._
 import RLikeDrmOps._
 import decompositions._
-import org.apache.mahout.math.drm.logical.{OpAtB, OpAtA, OpAtx}
+import org.apache.mahout.math.drm.logical._
+import org.apache.mahout.math.drm.logical.OpAtx
+import org.apache.mahout.math.drm.logical.OpAtB
+import org.apache.mahout.math.drm.logical.OpAtA
+import org.apache.mahout.math.drm.logical.OpAewUnaryFuncFusion
+
+import scala.util.Random
 
 /** Common engine tests for distributed R-like DRM operations */
 trait RLikeDrmOpsSuiteBase extends DistributedMahoutSuite with Matchers {
@@ -188,10 +194,13 @@ trait RLikeDrmOpsSuiteBase extends DistributedMahoutSuite with Matchers {
 
     val A = drmParallelize(inCoreA, numPartitions = 2)
         .mapBlock()({
-      case (keys, block) => keys.map(_.toString) -> block
+      case (keys, block) ⇒ keys.map(_.toString) → block
     })
 
-    val B = A + 1.0
+    // Dense-A' x sparse-B used to produce error. We sparsify B here to test this as well.
+    val B = (A + 1.0).mapBlock() { case (keys, block) ⇒
+      keys → (new SparseRowMatrix(block.nrow, block.ncol) := block)
+    }
 
     val C = A.t %*% B
 
@@ -204,6 +213,25 @@ trait RLikeDrmOpsSuiteBase extends DistributedMahoutSuite with Matchers {
 
   }
 
+  test ("C = A %*% B.t") {
+
+    val inCoreA = dense((1, 2), (3, 4), (-3, -5))
+
+    val A = drmParallelize(inCoreA, numPartitions = 2)
+
+    val B = A + 1.0
+
+    val C = A %*% B.t
+
+    mahoutCtx.optimizerRewrite(C) should equal(OpABt[Int](A, B))
+
+    val inCoreC = C.collect
+    val inCoreControlC = inCoreA %*% (inCoreA + 1.0).t
+
+    (inCoreC - inCoreControlC).norm should be < 1E-10
+
+  }
+
   test("C = A %*% inCoreB") {
 
     val inCoreA = dense((1, 2, 3), (3, 4, 5), (4, 5, 6), (5, 6, 7))
@@ -503,6 +531,24 @@ trait RLikeDrmOpsSuiteBase extends DistributedMahoutSuite with Matchers {
 
   }
 
+  test("B = 1 cbind A") {
+    val inCoreA = dense((1, 2), (3, 4))
+    val control = dense((1, 1, 2), (1, 3, 4))
+
+    val drmA = drmParallelize(inCoreA, numPartitions = 2)
+
+    (control - (1 cbind drmA) ).norm should be < 1e-10
+  }
+
+  test("B = A cbind 1") {
+    val inCoreA = dense((1, 2), (3, 4))
+    val control = dense((1, 2, 1), (3, 4, 1))
+
+    val drmA = drmParallelize(inCoreA, numPartitions = 2)
+
+    (control - (drmA cbind 1) ).norm should be < 1e-10
+  }
+
   test("B = A + 1.0") {
     val inCoreA = dense((1, 2), (2, 3), (3, 4))
     val controlB = inCoreA + 1.0
@@ -547,4 +593,46 @@ trait RLikeDrmOpsSuiteBase extends DistributedMahoutSuite with Matchers {
     (10 * drmA - (10 *: drmA)).norm shouldBe 0
 
   }
+
+  test("A * A -> sqr(A) rewrite ") {
+    val mxA = dense(
+      (1, 2, 3),
+      (3, 4, 5),
+      (7, 8, 9)
+    )
+
+    val mxAAControl = mxA * mxA
+
+    val drmA = drmParallelize(mxA, 2)
+    val drmAA = drmA * drmA
+
+    val optimized = drmAA.context.engine.optimizerRewrite(drmAA)
+    println(s"optimized:$optimized")
+    optimized.isInstanceOf[OpAewUnaryFunc[Int]] shouldBe true
+
+    (mxAAControl -= drmAA).norm should be < 1e-10
+  }
+
+  test("B = 1 + 2 * (A * A) ew unary function fusion") {
+    val mxA = dense(
+      (1, 2, 3),
+      (3, 0, 5)
+    )
+    val controlB = mxA.cloned := { (x) => 1 + 2 * x * x}
+
+    val drmA = drmParallelize(mxA, 2)
+
+    // We need to use parenthesis, otherwise optimizer will see it as (2A) * (A) and that would not
+    // be rewritten as 2 * sqr(A). It is not that clever (yet) to try commutativity optimizations.
+    val drmB = 1 + 2 * (drmA * drmA)
+
+    val optimized = mahoutCtx.engine.optimizerRewrite(drmB)
+    println(s"optimizer rewritten:$optimized")
+    optimized.isInstanceOf[OpAewUnaryFuncFusion[Int]] shouldBe true
+
+    (controlB - drmB).norm should be < 1e-10
+
+  }
+
+
 }
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MatrixOpsSuite.scala b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MatrixOpsSuite.scala
index d7b22d966d..5c8a3101e8 100644
--- a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MatrixOpsSuite.scala
+++ b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/MatrixOpsSuite.scala
@@ -24,6 +24,8 @@ import org.apache.mahout.test.MahoutSuite
 import org.apache.mahout.math.{RandomAccessSparseVector, SequentialAccessSparseVector, Matrices}
 import org.apache.mahout.common.RandomUtils
 
+import scala.util.Random
+
 
 class MatrixOpsSuite extends FunSuite with MahoutSuite {
 
@@ -93,12 +95,40 @@ class MatrixOpsSuite extends FunSuite with MahoutSuite {
 
     val e = eye(5)
 
-    printf("I(5)=\n%s\n", e)
+    println(s"I(5)=\n$e")
 
     a(0 to 1, 1 to 2) = dense((3, 2), (2, 3))
     a(0 to 1, 1 to 2) := dense((3, 2), (2, 3))
 
+    println(s"a=$a")
+
+    a(0 to 1, 1 to 2) := { _ => 45}
+    println(s"a=$a")
+
+//    a(0 to 1, 1 to 2) ::= { _ => 44}
+    println(s"a=$a")
+
+    // Sparse assignment to a sparse block
+    val c = sparse(0 -> 1 :: Nil, 2 -> 2 :: Nil, 1 -> 5 :: Nil)
+    val d = c.cloned
+
+    println(s"d=$d")
+    d.ncol shouldBe 3
 
+    d(::, 1 to 2) ::= { _ => 4}
+    println(s"d=$d")
+    d(::, 1 to 2).sum shouldBe 8
+
+    d ::= {_ => 5}
+    d.sum shouldBe 15
+
+    val f = c.cloned.t
+    f ::= {_ => 6}
+    f.sum shouldBe 18
+
+    val g = c.cloned
+    g(::, 1 until g.nrow) ::= { x => if (x <= 0) 0.0 else 1.0}
+    g.sum shouldBe 3
   }
 
   test("sparse") {
@@ -182,4 +212,5 @@ class MatrixOpsSuite extends FunSuite with MahoutSuite {
   }
 
 
+
 }
\ No newline at end of file
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala
index a943c5fbd3..0f81145e95 100644
--- a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala
+++ b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala
@@ -17,9 +17,16 @@
 
 package org.apache.mahout.math.scalabindings
 
+import java.util
+
+import org.apache.log4j.Level
+import org.apache.mahout.math._
 import org.scalatest.FunSuite
 import RLikeOps._
 import org.apache.mahout.test.MahoutSuite
+import org.apache.mahout.logging._
+import scala.collection.JavaConversions._
+import scala.util.Random
 
 class RLikeMatrixOpsSuite extends FunSuite with MahoutSuite {
 
@@ -63,6 +70,10 @@ class RLikeMatrixOpsSuite extends FunSuite with MahoutSuite {
 
   }
 
+  test("Uniform view") {
+    val mxUnif = Matrices.symmetricUniformView(5000000, 5000000, 1234)
+  }
+
   /** Test dsl overloads over scala operations over matrices */
   test ("scalarOps") {
     val a = dense(
@@ -77,4 +88,269 @@ class RLikeMatrixOpsSuite extends FunSuite with MahoutSuite {
 
   }
 
+  test("Multiplication experimental performance") {
+
+    getLog(MMul.getClass).setLevel(Level.DEBUG)
+
+    val d = 300
+    val n = 3
+
+    // Dense row-wise
+    val mxAd = new DenseMatrix(d, d) := Matrices.gaussianView(d, d, 134) + 1
+    val mxBd = new DenseMatrix(d, d) := Matrices.gaussianView(d, d, 134) - 1
+
+    val rnd = new Random(1234)
+
+    // Sparse rows
+    val mxAsr = (new SparseRowMatrix(d,
+      d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() + 1 else 0.0 }) cloned
+    val mxBsr = (new SparseRowMatrix(d,
+      d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() - 1 else 0.0 }) cloned
+
+    // Hanging sparse rows
+    val mxAs = (new SparseMatrix(d, d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() + 1 else 0.0 }) cloned
+    val mxBs = (new SparseMatrix(d, d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() - 1 else 0.0 }) cloned
+
+    // DIAGONAL
+    val mxD = diagv(dvec(Array.tabulate(d)(_ => rnd.nextGaussian())))
+
+    def time(op: => Unit): Long = {
+      val ms = System.currentTimeMillis()
+      op
+      System.currentTimeMillis() - ms
+    }
+
+    def getMmulAvgs(mxA: Matrix, mxB: Matrix, n: Int) = {
+
+      var control: Matrix = null
+      var mmulVal: Matrix = null
+
+      val current = Stream.range(0, n).map { _ => time {control = mxA.times(mxB)} }.sum.toDouble / n
+      val experimental = Stream.range(0, n).map { _ => time {mmulVal = MMul(mxA, mxB, None)} }.sum.toDouble / n
+      (control - mmulVal).norm should be < 1e-10
+      current -> experimental
+    }
+
+    // Dense matrix tests.
+//    println(s"Ad %*% Bd: ${getMmulAvgs(mxAd, mxBd, n)}")
+//    println(s"Ad' %*% Bd: ${getMmulAvgs(mxAd.t, mxBd, n)}")
+//    println(s"Ad %*% Bd': ${getMmulAvgs(mxAd, mxBd.t, n)}")
+//    println(s"Ad' %*% Bd': ${getMmulAvgs(mxAd.t, mxBd.t, n)}")
+//    println(s"Ad'' %*% Bd'': ${getMmulAvgs(mxAd.t.t, mxBd.t.t, n)}")
+//    println
+//
+//    // Sparse row matrix tests.
+//    println(s"Asr %*% Bsr: ${getMmulAvgs(mxAsr, mxBsr, n)}")
+//    println(s"Asr' %*% Bsr: ${getMmulAvgs(mxAsr.t, mxBsr, n)}")
+//    println(s"Asr %*% Bsr': ${getMmulAvgs(mxAsr, mxBsr.t, n)}")
+//    println(s"Asr' %*% Bsr': ${getMmulAvgs(mxAsr.t, mxBsr.t, n)}")
+//    println(s"Asr'' %*% Bsr'': ${getMmulAvgs(mxAsr.t.t, mxBsr.t.t, n)}")
+//    println
+//
+//    // Sparse matrix tests.
+//    println(s"Asm %*% Bsm: ${getMmulAvgs(mxAs, mxBs, n)}")
+//    println(s"Asm' %*% Bsm: ${getMmulAvgs(mxAs.t, mxBs, n)}")
+//    println(s"Asm %*% Bsm': ${getMmulAvgs(mxAs, mxBs.t, n)}")
+//    println(s"Asm' %*% Bsm': ${getMmulAvgs(mxAs.t, mxBs.t, n)}")
+//    println(s"Asm'' %*% Bsm'': ${getMmulAvgs(mxAs.t.t, mxBs.t.t, n)}")
+//    println
+//
+//    // Mixed sparse matrix tests.
+//    println(s"Asm %*% Bsr: ${getMmulAvgs(mxAs, mxBsr, n)}")
+//    println(s"Asm' %*% Bsr: ${getMmulAvgs(mxAs.t, mxBsr, n)}")
+//    println(s"Asm %*% Bsr': ${getMmulAvgs(mxAs, mxBsr.t, n)}")
+//    println(s"Asm' %*% Bsr': ${getMmulAvgs(mxAs.t, mxBsr.t, n)}")
+//    println(s"Asm'' %*% Bsr'': ${getMmulAvgs(mxAs.t.t, mxBsr.t.t, n)}")
+//    println
+//
+//    println(s"Asr %*% Bsm: ${getMmulAvgs(mxAsr, mxBs, n)}")
+//    println(s"Asr' %*% Bsm: ${getMmulAvgs(mxAsr.t, mxBs, n)}")
+//    println(s"Asr %*% Bsm': ${getMmulAvgs(mxAsr, mxBs.t, n)}")
+//    println(s"Asr' %*% Bsm': ${getMmulAvgs(mxAsr.t, mxBs.t, n)}")
+//    println(s"Asr'' %*% Bsm'': ${getMmulAvgs(mxAsr.t.t, mxBs.t.t, n)}")
+//    println
+//
+//    // Mixed dense/sparse
+//    println(s"Ad %*% Bsr: ${getMmulAvgs(mxAd, mxBsr, n)}")
+//    println(s"Ad' %*% Bsr: ${getMmulAvgs(mxAd.t, mxBsr, n)}")
+//    println(s"Ad %*% Bsr': ${getMmulAvgs(mxAd, mxBsr.t, n)}")
+//    println(s"Ad' %*% Bsr': ${getMmulAvgs(mxAd.t, mxBsr.t, n)}")
+//    println(s"Ad'' %*% Bsr'': ${getMmulAvgs(mxAd.t.t, mxBsr.t.t, n)}")
+//    println
+//
+//    println(s"Asr %*% Bd: ${getMmulAvgs(mxAsr, mxBd, n)}")
+//    println(s"Asr' %*% Bd: ${getMmulAvgs(mxAsr.t, mxBd, n)}")
+//    println(s"Asr %*% Bd': ${getMmulAvgs(mxAsr, mxBd.t, n)}")
+//    println(s"Asr' %*% Bd': ${getMmulAvgs(mxAsr.t, mxBd.t, n)}")
+//    println(s"Asr'' %*% Bd'': ${getMmulAvgs(mxAsr.t.t, mxBd.t.t, n)}")
+//    println
+//
+//    println(s"Ad %*% Bsm: ${getMmulAvgs(mxAd, mxBs, n)}")
+//    println(s"Ad' %*% Bsm: ${getMmulAvgs(mxAd.t, mxBs, n)}")
+//    println(s"Ad %*% Bsm': ${getMmulAvgs(mxAd, mxBs.t, n)}")
+//    println(s"Ad' %*% Bsm': ${getMmulAvgs(mxAd.t, mxBs.t, n)}")
+//    println(s"Ad'' %*% Bsm'': ${getMmulAvgs(mxAd.t.t, mxBs.t.t, n)}")
+//    println
+//
+//    println(s"Asm %*% Bd: ${getMmulAvgs(mxAs, mxBd, n)}")
+//    println(s"Asm' %*% Bd: ${getMmulAvgs(mxAs.t, mxBd, n)}")
+//    println(s"Asm %*% Bd': ${getMmulAvgs(mxAs, mxBd.t, n)}")
+//    println(s"Asm' %*% Bd': ${getMmulAvgs(mxAs.t, mxBd.t, n)}")
+//    println(s"Asm'' %*% Bd'': ${getMmulAvgs(mxAs.t.t, mxBd.t.t, n)}")
+//    println
+//
+//    // Diagonal cases
+//    println(s"Ad %*% D: ${getMmulAvgs(mxAd, mxD, n)}")
+//    println(s"Asr %*% D: ${getMmulAvgs(mxAsr, mxD, n)}")
+//    println(s"Asm %*% D: ${getMmulAvgs(mxAs, mxD, n)}")
+//    println(s"D %*% Ad: ${getMmulAvgs(mxD, mxAd, n)}")
+//    println(s"D %*% Asr: ${getMmulAvgs(mxD, mxAsr, n)}")
+//    println(s"D %*% Asm: ${getMmulAvgs(mxD, mxAs, n)}")
+//    println
+//
+//    println(s"Ad' %*% D: ${getMmulAvgs(mxAd.t, mxD, n)}")
+//    println(s"Asr' %*% D: ${getMmulAvgs(mxAsr.t, mxD, n)}")
+//    println(s"Asm' %*% D: ${getMmulAvgs(mxAs.t, mxD, n)}")
+//    println(s"D %*% Ad': ${getMmulAvgs(mxD, mxAd.t, n)}")
+//    println(s"D %*% Asr': ${getMmulAvgs(mxD, mxAsr.t, n)}")
+//    println(s"D %*% Asm': ${getMmulAvgs(mxD, mxAs.t, n)}")
+//    println
+
+    // Self-squared cases
+    println(s"Ad %*% Ad': ${getMmulAvgs(mxAd, mxAd.t, n)}")
+    println(s"Ad' %*% Ad: ${getMmulAvgs(mxAd.t, mxAd, n)}")
+    println(s"Ad' %*% Ad'': ${getMmulAvgs(mxAd.t, mxAd.t.t, n)}")
+    println(s"Ad'' %*% Ad': ${getMmulAvgs(mxAd.t.t, mxAd.t, n)}")
+
+  }
+
+
+  test("elementwise experimental performance") {
+
+    val d = 500
+    val n = 3
+
+    // Dense row-wise
+    val mxAd = new DenseMatrix(d, d) := Matrices.gaussianView(d, d, 134) + 1
+    val mxBd = new DenseMatrix(d, d) := Matrices.gaussianView(d, d, 134) - 1
+
+    val rnd = new Random(1234)
+
+    // Sparse rows
+    val mxAsr = (new SparseRowMatrix(d,
+      d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() + 1 else 0.0 }) cloned
+    val mxBsr = (new SparseRowMatrix(d,
+      d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() - 1 else 0.0 }) cloned
+
+    // Hanging sparse rows
+    val mxAs = (new SparseMatrix(d, d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() + 1 else 0.0 }) cloned
+    val mxBs = (new SparseMatrix(d, d) := { _ => if (rnd.nextDouble() < 0.1) rnd.nextGaussian() - 1 else 0.0 }) cloned
+
+    // DIAGONAL
+    val mxD = diagv(dvec(Array.tabulate(d)(_ => rnd.nextGaussian())))
+
+    def time(op: => Unit): Long = {
+      val ms = System.currentTimeMillis()
+      op
+      System.currentTimeMillis() - ms
+    }
+
+    def getEWAvgs(mxA: Matrix, mxB: Matrix, n: Int) = {
+
+      var control: Matrix = null
+      var mmulVal: Matrix = null
+
+      val current = Stream.range(0, n).map { _ => time {control = mxA + mxB} }.sum.toDouble / n
+      val experimental = Stream.range(0, n).map { _ => time {mmulVal = mxA + mxB} }.sum.toDouble / n
+      (control - mmulVal).norm should be < 1e-10
+      current -> experimental
+    }
+
+    // Dense matrix tests.
+    println(s"Ad + Bd: ${getEWAvgs(mxAd, mxBd, n)}")
+    println(s"Ad' + Bd: ${getEWAvgs(mxAd.t, mxBd, n)}")
+    println(s"Ad + Bd': ${getEWAvgs(mxAd, mxBd.t, n)}")
+    println(s"Ad' + Bd': ${getEWAvgs(mxAd.t, mxBd.t, n)}")
+    println(s"Ad'' + Bd'': ${getEWAvgs(mxAd.t.t, mxBd.t.t, n)}")
+    println
+
+    // Sparse row matrix tests.
+    println(s"Asr + Bsr: ${getEWAvgs(mxAsr, mxBsr, n)}")
+    println(s"Asr' + Bsr: ${getEWAvgs(mxAsr.t, mxBsr, n)}")
+    println(s"Asr + Bsr': ${getEWAvgs(mxAsr, mxBsr.t, n)}")
+    println(s"Asr' + Bsr': ${getEWAvgs(mxAsr.t, mxBsr.t, n)}")
+    println(s"Asr'' + Bsr'': ${getEWAvgs(mxAsr.t.t, mxBsr.t.t, n)}")
+    println
+
+    // Sparse matrix tests.
+    println(s"Asm + Bsm: ${getEWAvgs(mxAs, mxBs, n)}")
+    println(s"Asm' + Bsm: ${getEWAvgs(mxAs.t, mxBs, n)}")
+    println(s"Asm + Bsm': ${getEWAvgs(mxAs, mxBs.t, n)}")
+    println(s"Asm' + Bsm': ${getEWAvgs(mxAs.t, mxBs.t, n)}")
+    println(s"Asm'' + Bsm'': ${getEWAvgs(mxAs.t.t, mxBs.t.t, n)}")
+    println
+
+    // Mixed sparse matrix tests.
+    println(s"Asm + Bsr: ${getEWAvgs(mxAs, mxBsr, n)}")
+    println(s"Asm' + Bsr: ${getEWAvgs(mxAs.t, mxBsr, n)}")
+    println(s"Asm + Bsr': ${getEWAvgs(mxAs, mxBsr.t, n)}")
+    println(s"Asm' + Bsr': ${getEWAvgs(mxAs.t, mxBsr.t, n)}")
+    println(s"Asm'' + Bsr'': ${getEWAvgs(mxAs.t.t, mxBsr.t.t, n)}")
+    println
+
+    println(s"Asr + Bsm: ${getEWAvgs(mxAsr, mxBs, n)}")
+    println(s"Asr' + Bsm: ${getEWAvgs(mxAsr.t, mxBs, n)}")
+    println(s"Asr + Bsm': ${getEWAvgs(mxAsr, mxBs.t, n)}")
+    println(s"Asr' + Bsm': ${getEWAvgs(mxAsr.t, mxBs.t, n)}")
+    println(s"Asr'' + Bsm'': ${getEWAvgs(mxAsr.t.t, mxBs.t.t, n)}")
+    println
+
+    // Mixed dense/sparse
+    println(s"Ad + Bsr: ${getEWAvgs(mxAd, mxBsr, n)}")
+    println(s"Ad' + Bsr: ${getEWAvgs(mxAd.t, mxBsr, n)}")
+    println(s"Ad + Bsr': ${getEWAvgs(mxAd, mxBsr.t, n)}")
+    println(s"Ad' + Bsr': ${getEWAvgs(mxAd.t, mxBsr.t, n)}")
+    println(s"Ad'' + Bsr'': ${getEWAvgs(mxAd.t.t, mxBsr.t.t, n)}")
+    println
+
+    println(s"Asr + Bd: ${getEWAvgs(mxAsr, mxBd, n)}")
+    println(s"Asr' + Bd: ${getEWAvgs(mxAsr.t, mxBd, n)}")
+    println(s"Asr + Bd': ${getEWAvgs(mxAsr, mxBd.t, n)}")
+    println(s"Asr' + Bd': ${getEWAvgs(mxAsr.t, mxBd.t, n)}")
+    println(s"Asr'' + Bd'': ${getEWAvgs(mxAsr.t.t, mxBd.t.t, n)}")
+    println
+
+    println(s"Ad + Bsm: ${getEWAvgs(mxAd, mxBs, n)}")
+    println(s"Ad' + Bsm: ${getEWAvgs(mxAd.t, mxBs, n)}")
+    println(s"Ad + Bsm': ${getEWAvgs(mxAd, mxBs.t, n)}")
+    println(s"Ad' + Bsm': ${getEWAvgs(mxAd.t, mxBs.t, n)}")
+    println(s"Ad'' + Bsm'': ${getEWAvgs(mxAd.t.t, mxBs.t.t, n)}")
+    println
+
+    println(s"Asm + Bd: ${getEWAvgs(mxAs, mxBd, n)}")
+    println(s"Asm' + Bd: ${getEWAvgs(mxAs.t, mxBd, n)}")
+    println(s"Asm + Bd': ${getEWAvgs(mxAs, mxBd.t, n)}")
+    println(s"Asm' + Bd': ${getEWAvgs(mxAs.t, mxBd.t, n)}")
+    println(s"Asm'' + Bd'': ${getEWAvgs(mxAs.t.t, mxBd.t.t, n)}")
+    println
+
+    // Diagonal cases
+    println(s"Ad + D: ${getEWAvgs(mxAd, mxD, n)}")
+    println(s"Asr + D: ${getEWAvgs(mxAsr, mxD, n)}")
+    println(s"Asm + D: ${getEWAvgs(mxAs, mxD, n)}")
+    println(s"D + Ad: ${getEWAvgs(mxD, mxAd, n)}")
+    println(s"D + Asr: ${getEWAvgs(mxD, mxAsr, n)}")
+    println(s"D + Asm: ${getEWAvgs(mxD, mxAs, n)}")
+    println
+
+    println(s"Ad' + D: ${getEWAvgs(mxAd.t, mxD, n)}")
+    println(s"Asr' + D: ${getEWAvgs(mxAsr.t, mxD, n)}")
+    println(s"Asm' + D: ${getEWAvgs(mxAs.t, mxD, n)}")
+    println(s"D + Ad': ${getEWAvgs(mxD, mxAd.t, n)}")
+    println(s"D + Asr': ${getEWAvgs(mxD, mxAsr.t, n)}")
+    println(s"D + Asm': ${getEWAvgs(mxD, mxAs.t, n)}")
+    println
+
+  }
+
 }
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/VectorOpsSuite.scala b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/VectorOpsSuite.scala
index 037f562575..d264514a98 100644
--- a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/VectorOpsSuite.scala
+++ b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/VectorOpsSuite.scala
@@ -18,10 +18,12 @@
 package org.apache.mahout.math.scalabindings
 
 import org.scalatest.FunSuite
-import org.apache.mahout.math.{RandomAccessSparseVector, Vector}
+import org.apache.mahout.math.{SequentialAccessSparseVector, RandomAccessSparseVector, Vector}
 import RLikeOps._
 import org.apache.mahout.test.MahoutSuite
 
+import scala.util.Random
+
 /** VectorOps Suite */
 class VectorOpsSuite extends FunSuite with MahoutSuite {
 
@@ -79,4 +81,19 @@ class VectorOpsSuite extends FunSuite with MahoutSuite {
 
   }
 
+  test("sparse assignment") {
+
+    val svec = new SequentialAccessSparseVector(30)
+    svec(1) = -0.5
+    svec(3) = 0.5
+    println(svec)
+
+    svec(1 until svec.length) ::= ( _ => 0)
+    println(svec)
+
+    svec.sum shouldBe 0
+
+
+  }
+
 }
diff --git a/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java b/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java
index e752422b33..a823d0b23b 100644
--- a/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java
+++ b/math/src/main/java/org/apache/mahout/math/AbstractMatrix.java
@@ -19,13 +19,16 @@
 
 import com.google.common.collect.AbstractIterator;
 import com.google.common.collect.Maps;
+import org.apache.mahout.math.flavor.BackEnum;
+import org.apache.mahout.math.flavor.MatrixFlavor;
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
 import org.apache.mahout.math.function.*;
 
 import java.util.Iterator;
 import java.util.Map;
 
 /**
- * A few universal implementations of convenience functions
+ * A few universal implementations of convenience functions for a JVM-backed matrix.
  */
 public abstract class AbstractMatrix implements Matrix {
 
@@ -57,19 +60,24 @@ public Iterator<MatrixSlice> iterator() {
   @Override
   public Iterator<MatrixSlice> iterateAll() {
     return new AbstractIterator<MatrixSlice>() {
-      private int slice;
+      private int row;
 
       @Override
       protected MatrixSlice computeNext() {
-        if (slice >= numSlices()) {
+        if (row >= numRows()) {
           return endOfData();
         }
-        int i = slice++;
+        int i = row++;
         return new MatrixSlice(viewRow(i), i);
       }
     };
   }
 
+  @Override
+  public Iterator<MatrixSlice> iterateNonEmpty() {
+    return iterator();
+  }
+
   /**
    * Abstracted out for the iterator
    *
@@ -813,4 +821,12 @@ public String toString() {
       return returnString + ("}");
     }
   }
+
+  @Override
+  public MatrixFlavor getFlavor() {
+    throw new UnsupportedOperationException("Flavor support not implemented for this matrix.");
+  }
+
+  ////////////// Matrix flavor trait ///////////////////
+
 }
diff --git a/math/src/main/java/org/apache/mahout/math/ConstantVector.java b/math/src/main/java/org/apache/mahout/math/ConstantVector.java
index 86ab82bab5..847bf85368 100644
--- a/math/src/main/java/org/apache/mahout/math/ConstantVector.java
+++ b/math/src/main/java/org/apache/mahout/math/ConstantVector.java
@@ -132,6 +132,11 @@ public Vector like() {
     return new DenseVector(size());
   }
 
+  @Override
+  public Vector like(int cardinality) {
+    return new DenseVector(cardinality);
+  }
+
   /**
    * Set the value at the given index, without checking bounds
    *
diff --git a/math/src/main/java/org/apache/mahout/math/DelegatingVector.java b/math/src/main/java/org/apache/mahout/math/DelegatingVector.java
index a1fd291234..0b2e36b368 100644
--- a/math/src/main/java/org/apache/mahout/math/DelegatingVector.java
+++ b/math/src/main/java/org/apache/mahout/math/DelegatingVector.java
@@ -309,6 +309,11 @@ public Vector like() {
     return new DelegatingVector(delegate.like());
   }
 
+  @Override
+  public Vector like(int cardinality) {
+    return new DelegatingVector(delegate.like(cardinality));
+  }
+
   @Override
   public void setQuick(int index, double value) {
     delegate.setQuick(index, value);
diff --git a/math/src/main/java/org/apache/mahout/math/DenseMatrix.java b/math/src/main/java/org/apache/mahout/math/DenseMatrix.java
index 7f52c007ad..5c1ee126d1 100644
--- a/math/src/main/java/org/apache/mahout/math/DenseMatrix.java
+++ b/math/src/main/java/org/apache/mahout/math/DenseMatrix.java
@@ -17,6 +17,9 @@
 
 package org.apache.mahout.math;
 
+import org.apache.mahout.math.flavor.MatrixFlavor;
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
+
 import java.util.Arrays;
 
 /** Matrix of doubles implemented using a 2-d array */
@@ -175,5 +178,9 @@ public Vector viewRow(int row) {
     }
     return new DenseVector(values[row], true);
   }
-  
+
+  @Override
+  public MatrixFlavor getFlavor() {
+    return MatrixFlavor.DENSELIKE;
+  }
 }
diff --git a/math/src/main/java/org/apache/mahout/math/DenseSymmetricMatrix.java b/math/src/main/java/org/apache/mahout/math/DenseSymmetricMatrix.java
index e9cf3f1630..7252b9bb06 100644
--- a/math/src/main/java/org/apache/mahout/math/DenseSymmetricMatrix.java
+++ b/math/src/main/java/org/apache/mahout/math/DenseSymmetricMatrix.java
@@ -17,6 +17,8 @@
 
 package org.apache.mahout.math;
 
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
+
 /**
  * Economy packaging for a dense symmetric in-core matrix.
  */
diff --git a/math/src/main/java/org/apache/mahout/math/DenseVector.java b/math/src/main/java/org/apache/mahout/math/DenseVector.java
index 5b3dea78ef..3633e58f16 100644
--- a/math/src/main/java/org/apache/mahout/math/DenseVector.java
+++ b/math/src/main/java/org/apache/mahout/math/DenseVector.java
@@ -135,6 +135,11 @@ public DenseVector like() {
     return new DenseVector(size());
   }
 
+  @Override
+  public Vector like(int cardinality) {
+    return new DenseVector(cardinality);
+  }
+
   @Override
   public void setQuick(int index, double value) {
     invalidateCachedLength();
diff --git a/math/src/main/java/org/apache/mahout/math/DiagonalMatrix.java b/math/src/main/java/org/apache/mahout/math/DiagonalMatrix.java
index 3e20a4a371..070fad25fe 100644
--- a/math/src/main/java/org/apache/mahout/math/DiagonalMatrix.java
+++ b/math/src/main/java/org/apache/mahout/math/DiagonalMatrix.java
@@ -17,6 +17,9 @@
 
 package org.apache.mahout.math;
 
+import org.apache.mahout.math.flavor.MatrixFlavor;
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
+
 import java.util.Iterator;
 import java.util.NoSuchElementException;
 
@@ -222,6 +225,11 @@ public Vector like() {
       return new DenseVector(size());
     }
 
+    @Override
+    public Vector like(int cardinality) {
+      return new DenseVector(cardinality);
+    }
+
     @Override
     public void setQuick(int index, double value) {
       if (index == this.index) {
@@ -361,4 +369,10 @@ public Matrix timesLeft(Matrix that) {
     }
     return m;
   }
+
+  @Override
+  public MatrixFlavor getFlavor() {
+    return MatrixFlavor.DIAGONALLIKE;
+  }
+
 }
diff --git a/math/src/main/java/org/apache/mahout/math/FileBasedSparseBinaryMatrix.java b/math/src/main/java/org/apache/mahout/math/FileBasedSparseBinaryMatrix.java
index ba09aa85fe..56600cd0e4 100644
--- a/math/src/main/java/org/apache/mahout/math/FileBasedSparseBinaryMatrix.java
+++ b/math/src/main/java/org/apache/mahout/math/FileBasedSparseBinaryMatrix.java
@@ -437,6 +437,11 @@ public Vector like() {
       return new RandomAccessSparseVector(size());
     }
 
+    @Override
+    public Vector like(int cardinality) {
+      return new RandomAccessSparseVector(cardinality);
+    }
+
     /**
      * Copy the vector for fast operations.
      *
diff --git a/math/src/main/java/org/apache/mahout/math/FunctionalMatrixView.java b/math/src/main/java/org/apache/mahout/math/FunctionalMatrixView.java
index 2a136115fc..2944f566f7 100644
--- a/math/src/main/java/org/apache/mahout/math/FunctionalMatrixView.java
+++ b/math/src/main/java/org/apache/mahout/math/FunctionalMatrixView.java
@@ -17,6 +17,9 @@
 
 package org.apache.mahout.math;
 
+import org.apache.mahout.math.flavor.BackEnum;
+import org.apache.mahout.math.flavor.MatrixFlavor;
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
 import org.apache.mahout.math.function.IntIntFunction;
 
 /**
@@ -29,6 +32,7 @@ class FunctionalMatrixView extends AbstractMatrix {
    */
   private IntIntFunction gf;
   private boolean denseLike;
+  private MatrixFlavor flavor;
 
   public FunctionalMatrixView(int rows, int columns, IntIntFunction gf) {
     this(rows, columns, gf, false);
@@ -42,6 +46,7 @@ public FunctionalMatrixView(int rows, int columns, IntIntFunction gf, boolean de
     super(rows, columns);
     this.gf = gf;
     this.denseLike = denseLike;
+    flavor = new MatrixFlavor.FlavorImpl(BackEnum.JVMMEM, TraversingStructureEnum.BLOCKIFIED, denseLike);
   }
 
   @Override
diff --git a/math/src/main/java/org/apache/mahout/math/Matrices.java b/math/src/main/java/org/apache/mahout/math/Matrices.java
index 4a0c50cd52..fc45a16db9 100644
--- a/math/src/main/java/org/apache/mahout/math/Matrices.java
+++ b/math/src/main/java/org/apache/mahout/math/Matrices.java
@@ -17,7 +17,9 @@
 
 package org.apache.mahout.math;
 
+import com.google.common.base.Preconditions;
 import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
 import org.apache.mahout.math.function.DoubleFunction;
 import org.apache.mahout.math.function.Functions;
 import org.apache.mahout.math.function.IntIntFunction;
@@ -63,16 +65,14 @@ public static final Matrix functionalMatrixView(final int rows,
    * @return transposed view of original matrix
    */
   public static final Matrix transposedView(final Matrix m) {
-    IntIntFunction tf = new IntIntFunction() {
-      @Override
-      public double apply(int row, int col) {
-        return m.getQuick(col, row);
-      }
-    };
 
-    // TODO: Matrix api does not support denseLike() interrogation.
-    // so our guess has to be rough here.
-    return functionalMatrixView(m.numCols(), m.numRows(), tf, m instanceof DenseMatrix);
+    Preconditions.checkArgument(!(m instanceof SparseColumnMatrix));
+
+    if (m instanceof TransposedMatrixView) {
+      return ((TransposedMatrixView) m).getDelegate();
+    } else {
+      return new TransposedMatrixView(m);
+    }
   }
 
   /**
diff --git a/math/src/main/java/org/apache/mahout/math/Matrix.java b/math/src/main/java/org/apache/mahout/math/Matrix.java
index afdbac5416..47ba5cf7b4 100644
--- a/math/src/main/java/org/apache/mahout/math/Matrix.java
+++ b/math/src/main/java/org/apache/mahout/math/Matrix.java
@@ -17,6 +17,7 @@
 
 package org.apache.mahout.math;
 
+import org.apache.mahout.math.flavor.MatrixFlavor;
 import org.apache.mahout.math.function.DoubleDoubleFunction;
 import org.apache.mahout.math.function.DoubleFunction;
 import org.apache.mahout.math.function.VectorFunction;
@@ -403,4 +404,10 @@ public interface Matrix extends Cloneable, VectorIterable {
    * @return A vector that shares storage with the original matrix.
    */
   Vector viewDiagonal();
+
+  /**
+   * Get matrix structural flavor (operations performance hints). This is optional operation, may
+   * throw {@link java.lang.UnsupportedOperationException}.
+   */
+  MatrixFlavor getFlavor();
 }
diff --git a/math/src/main/java/org/apache/mahout/math/MatrixVectorView.java b/math/src/main/java/org/apache/mahout/math/MatrixVectorView.java
index 074d7a6c31..52ae722e67 100644
--- a/math/src/main/java/org/apache/mahout/math/MatrixVectorView.java
+++ b/math/src/main/java/org/apache/mahout/math/MatrixVectorView.java
@@ -211,6 +211,11 @@ public Vector like() {
     return matrix.like(size(), 1).viewColumn(0);
   }
 
+  @Override
+  public Vector like(int cardinality) {
+    return matrix.like(cardinality, 1).viewColumn(0);
+  }
+
   /**
    * Set the value at the given index, without checking bounds
    *
diff --git a/math/src/main/java/org/apache/mahout/math/MatrixView.java b/math/src/main/java/org/apache/mahout/math/MatrixView.java
index e2f7f48d31..86760d5c2f 100644
--- a/math/src/main/java/org/apache/mahout/math/MatrixView.java
+++ b/math/src/main/java/org/apache/mahout/math/MatrixView.java
@@ -17,6 +17,8 @@
 
 package org.apache.mahout.math;
 
+import org.apache.mahout.math.flavor.MatrixFlavor;
+
 /** Implements subset view of a Matrix */
 public class MatrixView extends AbstractMatrix {
 
@@ -151,4 +153,8 @@ public Vector viewRow(int row) {
     return new VectorView(matrix.viewRow(row + offset[ROW]), offset[COL], columnSize());
   }
 
+  @Override
+  public MatrixFlavor getFlavor() {
+    return matrix.getFlavor();
+  }
 }
diff --git a/math/src/main/java/org/apache/mahout/math/NamedVector.java b/math/src/main/java/org/apache/mahout/math/NamedVector.java
index 0bf49c8d39..d4fa60958a 100644
--- a/math/src/main/java/org/apache/mahout/math/NamedVector.java
+++ b/math/src/main/java/org/apache/mahout/math/NamedVector.java
@@ -176,6 +176,11 @@ public NamedVector like() {
     return new NamedVector(delegate.like(), name);
   }
 
+  @Override
+  public Vector like(int cardinality) {
+    return new NamedVector(delegate.like(cardinality), name);
+  }
+
   @Override
   public Vector minus(Vector x) {
     return delegate.minus(x);
diff --git a/math/src/main/java/org/apache/mahout/math/PermutedVectorView.java b/math/src/main/java/org/apache/mahout/math/PermutedVectorView.java
index f34f2b0216..a76f78c704 100644
--- a/math/src/main/java/org/apache/mahout/math/PermutedVectorView.java
+++ b/math/src/main/java/org/apache/mahout/math/PermutedVectorView.java
@@ -204,6 +204,11 @@ public Vector like() {
     return vector.like();
   }
 
+  @Override
+  public Vector like(int cardinality) {
+    return vector.like(cardinality);
+  }
+
   /**
    * Set the value at the given index, without checking bounds
    *
diff --git a/math/src/main/java/org/apache/mahout/math/RandomAccessSparseVector.java b/math/src/main/java/org/apache/mahout/math/RandomAccessSparseVector.java
index dbe5d3a870..3efac7e1b6 100644
--- a/math/src/main/java/org/apache/mahout/math/RandomAccessSparseVector.java
+++ b/math/src/main/java/org/apache/mahout/math/RandomAccessSparseVector.java
@@ -141,6 +141,11 @@ public RandomAccessSparseVector like() {
     return new RandomAccessSparseVector(size(), values.size());
   }
 
+  @Override
+  public Vector like(int cardinality) {
+    return new RandomAccessSparseVector(cardinality, values.size());
+  }
+
   @Override
   public int getNumNondefaultElements() {
     return values.size();
diff --git a/math/src/main/java/org/apache/mahout/math/SequentialAccessSparseVector.java b/math/src/main/java/org/apache/mahout/math/SequentialAccessSparseVector.java
index 331662cbbf..f7d67a7e3f 100644
--- a/math/src/main/java/org/apache/mahout/math/SequentialAccessSparseVector.java
+++ b/math/src/main/java/org/apache/mahout/math/SequentialAccessSparseVector.java
@@ -179,6 +179,11 @@ public SequentialAccessSparseVector like() {
     return new SequentialAccessSparseVector(size(), values.getNumMappings());
   }
 
+  @Override
+  public Vector like(int cardinality) {
+    return new SequentialAccessSparseVector(cardinality);
+  }
+
   @Override
   public int getNumNondefaultElements() {
     return values.getNumMappings();
@@ -214,6 +219,8 @@ public boolean isAddConstantTime() {
 
   @Override
   public Iterator<Element> iterateNonZero() {
+
+    // TODO: this is a bug, since nonDefaultIterator doesn't hold to non-zero contract.
     return new NonDefaultIterator();
   }
 
diff --git a/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java b/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java
index f62d553dbd..eeffc78705 100644
--- a/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java
+++ b/math/src/main/java/org/apache/mahout/math/SparseColumnMatrix.java
@@ -17,9 +17,13 @@
 
 package org.apache.mahout.math;
 
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
+
 /**
  * sparse matrix with general element values whose columns are accessible quickly. Implemented as a column array of
  * SparseVectors.
+ *
+ * @deprecated tons of inconsistences. Use transpose view of SparseRowMatrix for fast column-wise iteration.
  */
 public class SparseColumnMatrix extends AbstractMatrix {
 
@@ -31,11 +35,19 @@ public class SparseColumnMatrix extends AbstractMatrix {
    * @param columns       a RandomAccessSparseVector[] array of columns
    * @param columnVectors
    */
-  public SparseColumnMatrix(int rows, int columns, RandomAccessSparseVector[] columnVectors) {
+  public SparseColumnMatrix(int rows, int columns, Vector[] columnVectors) {
+    this(rows, columns, columnVectors, false);
+  }
+
+  public SparseColumnMatrix(int rows, int columns, Vector[] columnVectors, boolean shallow) {
     super(rows, columns);
-    this.columnVectors = columnVectors.clone();
-    for (int col = 0; col < columnSize(); col++) {
-      this.columnVectors[col] = this.columnVectors[col].clone();
+    if (shallow) {
+      this.columnVectors = columnVectors;
+    } else {
+      this.columnVectors = columnVectors.clone();
+      for (int col = 0; col < columnSize(); col++) {
+        this.columnVectors[col] = this.columnVectors[col].clone();
+      }
     }
   }
 
diff --git a/math/src/main/java/org/apache/mahout/math/SparseMatrix.java b/math/src/main/java/org/apache/mahout/math/SparseMatrix.java
index 88e15a0dc1..bf4f1a0507 100644
--- a/math/src/main/java/org/apache/mahout/math/SparseMatrix.java
+++ b/math/src/main/java/org/apache/mahout/math/SparseMatrix.java
@@ -18,6 +18,8 @@
 package org.apache.mahout.math;
 
 import com.google.common.collect.AbstractIterator;
+import org.apache.mahout.math.flavor.MatrixFlavor;
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
 import org.apache.mahout.math.function.DoubleDoubleFunction;
 import org.apache.mahout.math.function.Functions;
 import org.apache.mahout.math.function.IntObjectProcedure;
@@ -40,11 +42,23 @@ public class SparseMatrix extends AbstractMatrix {
    * @param columns
    * @param rowVectors
    */
-  public SparseMatrix(int rows, int columns, Map<Integer, RandomAccessSparseVector> rowVectors) {
+  public SparseMatrix(int rows, int columns, Map<Integer, Vector> rowVectors) {
+    this(rows, columns, rowVectors, false);
+  }
+
+  public SparseMatrix(int rows, int columns, Map<Integer, Vector> rowVectors, boolean shallow) {
+
+    // Why this is passing in a map? iterating it is pretty inefficient as opposed to simple lists...
     super(rows, columns);
     this.rowVectors = new OpenIntObjectHashMap<Vector>();
-    for (Map.Entry<Integer, RandomAccessSparseVector> entry : rowVectors.entrySet()) {
-      this.rowVectors.put(entry.getKey(), entry.getValue().clone());
+    if (shallow) {
+      for (Map.Entry<Integer, Vector> entry : rowVectors.entrySet()) {
+        this.rowVectors.put(entry.getKey(), entry.getValue());
+      }
+    } else {
+      for (Map.Entry<Integer, Vector> entry : rowVectors.entrySet()) {
+        this.rowVectors.put(entry.getKey(), entry.getValue().clone());
+      }
     }
   }
   
@@ -66,7 +80,11 @@ public Matrix clone() {
   }
 
   @Override
-  public Iterator<MatrixSlice> iterator() {
+  public int numSlices() {
+    return rowVectors.size();
+  }
+
+  public Iterator<MatrixSlice> iterateNonEmpty() {
     final IntArrayList keys = new IntArrayList(rowVectors.size());
     rowVectors.keys(keys);
     return new AbstractIterator<MatrixSlice>() {
@@ -221,4 +239,8 @@ public IntArrayList nonZeroRowIndices() {
     return rowVectors.keys();
   }
 
+  @Override
+  public MatrixFlavor getFlavor() {
+    return MatrixFlavor.SPARSEROWLIKE;
+  }
 }
diff --git a/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java b/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java
index 3021f3b223..6e0676911c 100644
--- a/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java
+++ b/math/src/main/java/org/apache/mahout/math/SparseRowMatrix.java
@@ -17,6 +17,8 @@
 
 package org.apache.mahout.math;
 
+import org.apache.mahout.math.flavor.MatrixFlavor;
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
 import org.apache.mahout.math.function.Functions;
 
 /**
@@ -226,4 +228,9 @@ public Matrix times(Matrix other) {
       }
     }
   }
+
+  @Override
+  public MatrixFlavor getFlavor() {
+    return MatrixFlavor.SPARSELIKE;
+  }
 }
diff --git a/math/src/main/java/org/apache/mahout/math/TransposedMatrixView.java b/math/src/main/java/org/apache/mahout/math/TransposedMatrixView.java
new file mode 100644
index 0000000000..c67cb479e3
--- /dev/null
+++ b/math/src/main/java/org/apache/mahout/math/TransposedMatrixView.java
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math;
+
+import org.apache.mahout.math.flavor.BackEnum;
+import org.apache.mahout.math.flavor.MatrixFlavor;
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
+import org.apache.mahout.math.function.DoubleDoubleFunction;
+import org.apache.mahout.math.function.DoubleFunction;
+
+/**
+ * Matrix View backed by an {@link org.apache.mahout.math.function.IntIntFunction}
+ */
+class TransposedMatrixView extends AbstractMatrix {
+
+  private Matrix m;
+
+  public TransposedMatrixView(Matrix m) {
+    super(m.numCols(), m.numRows());
+    this.m = m;
+  }
+
+  @Override
+  public Matrix assignColumn(int column, Vector other) {
+    m.assignRow(column,other);
+    return this;
+  }
+
+  @Override
+  public Matrix assignRow(int row, Vector other) {
+    m.assignColumn(row,other);
+    return this;
+  }
+
+  @Override
+  public double getQuick(int row, int column) {
+    return m.getQuick(column,row);
+  }
+
+  @Override
+  public Matrix like() {
+    return m.like(rows, columns);
+  }
+
+  @Override
+  public Matrix like(int rows, int columns) {
+    return m.like(rows,columns);
+  }
+
+  @Override
+  public void setQuick(int row, int column, double value) {
+    m.setQuick(column, row, value);
+  }
+
+  @Override
+  public Vector viewRow(int row) {
+    return m.viewColumn(row);
+  }
+
+  @Override
+  public Vector viewColumn(int column) {
+    return m.viewRow(column);
+  }
+
+  @Override
+  public Matrix assign(double value) {
+    return m.assign(value);
+  }
+
+  @Override
+  public Matrix assign(Matrix other, DoubleDoubleFunction function) {
+    if (other instanceof TransposedMatrixView) {
+      m.assign(((TransposedMatrixView) other).m, function);
+    } else {
+      m.assign(new TransposedMatrixView(other), function);
+    }
+    return this;
+  }
+
+  @Override
+  public Matrix assign(Matrix other) {
+    if (other instanceof TransposedMatrixView) {
+      return m.assign(((TransposedMatrixView) other).m);
+    } else {
+      return m.assign(new TransposedMatrixView(other));
+    }
+  }
+
+  @Override
+  public Matrix assign(DoubleFunction function) {
+    return m.assign(function);
+  }
+
+  @Override
+  public MatrixFlavor getFlavor() {
+    return flavor;
+  }
+
+  private MatrixFlavor flavor = new MatrixFlavor() {
+    @Override
+    public BackEnum getBacking() {
+      return m.getFlavor().getBacking();
+    }
+
+    @Override
+    public TraversingStructureEnum getStructure() {
+      TraversingStructureEnum flavor = m.getFlavor().getStructure();
+      switch (flavor) {
+        case COLWISE:
+          return TraversingStructureEnum.ROWWISE;
+        case SPARSECOLWISE:
+          return TraversingStructureEnum.SPARSEROWWISE;
+        case ROWWISE:
+          return TraversingStructureEnum.COLWISE;
+        case SPARSEROWWISE:
+          return TraversingStructureEnum.SPARSECOLWISE;
+        default:
+          return flavor;
+      }
+    }
+
+    @Override
+    public boolean isDense() {
+      return m.getFlavor().isDense();
+    }
+  };
+
+  Matrix getDelegate() {
+    return m;
+  }
+
+}
diff --git a/math/src/main/java/org/apache/mahout/math/UpperTriangular.java b/math/src/main/java/org/apache/mahout/math/UpperTriangular.java
index a0cb3cd948..29fa6a0926 100644
--- a/math/src/main/java/org/apache/mahout/math/UpperTriangular.java
+++ b/math/src/main/java/org/apache/mahout/math/UpperTriangular.java
@@ -17,6 +17,10 @@
 
 package org.apache.mahout.math;
 
+import org.apache.mahout.math.flavor.BackEnum;
+import org.apache.mahout.math.flavor.MatrixFlavor;
+import org.apache.mahout.math.flavor.TraversingStructureEnum;
+
 /**
  * 
  * Quick and dirty implementation of some {@link org.apache.mahout.math.Matrix} methods
@@ -148,4 +152,9 @@ public double[] getData() {
     return values;
   }
 
+  @Override
+  public MatrixFlavor getFlavor() {
+    // We kind of consider ourselves a vector-backed but dense matrix for mmul, etc. purposes.
+    return new MatrixFlavor.FlavorImpl(BackEnum.JVMMEM, TraversingStructureEnum.VECTORBACKED, true);
+  }
 }
diff --git a/math/src/main/java/org/apache/mahout/math/Vector.java b/math/src/main/java/org/apache/mahout/math/Vector.java
index 0d1a00347d..4480b0a684 100644
--- a/math/src/main/java/org/apache/mahout/math/Vector.java
+++ b/math/src/main/java/org/apache/mahout/math/Vector.java
@@ -189,6 +189,14 @@ interface Element {
    */
   Vector like();
 
+  /**
+   * Return a new empty vector of the same underlying class as the receiver with given cardinality
+   *
+   * @param cardinality
+   * @return
+   */
+  Vector like(int cardinality);
+
   /**
    * Return a new vector containing the element by element difference of the recipient and the argument
    *
diff --git a/math/src/main/java/org/apache/mahout/math/VectorIterable.java b/math/src/main/java/org/apache/mahout/math/VectorIterable.java
index 451c58980e..8414fdb037 100644
--- a/math/src/main/java/org/apache/mahout/math/VectorIterable.java
+++ b/math/src/main/java/org/apache/mahout/math/VectorIterable.java
@@ -21,8 +21,12 @@
 
 public interface VectorIterable extends Iterable<MatrixSlice> {
 
+  /* Iterate all rows in order */
   Iterator<MatrixSlice> iterateAll();
 
+  /* Iterate all non empty rows in arbitrary order */
+  Iterator<MatrixSlice> iterateNonEmpty();
+
   int numSlices();
 
   int numRows();
diff --git a/math/src/main/java/org/apache/mahout/math/VectorView.java b/math/src/main/java/org/apache/mahout/math/VectorView.java
index b503712b85..d61a0380ef 100644
--- a/math/src/main/java/org/apache/mahout/math/VectorView.java
+++ b/math/src/main/java/org/apache/mahout/math/VectorView.java
@@ -68,6 +68,11 @@ public VectorView like() {
     return new VectorView(vector.like(), offset, size());
   }
 
+  @Override
+  public Vector like(int cardinality) {
+    return vector.like(cardinality);
+  }
+
   @Override
   public double getQuick(int index) {
     return vector.getQuick(offset + index);
@@ -122,7 +127,7 @@ protected Element computeNext() {
       while (it.hasNext()) {
         Element el = it.next();
         if (isInView(el.index()) && el.get() != 0) {
-          Element decorated = vector.getElement(el.index());
+          Element decorated = el; /* vector.getElement(el.index()); */
           return new DecoratorElement(decorated);
         }
       }
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeTimesOps.scala b/math/src/main/java/org/apache/mahout/math/flavor/BackEnum.java
similarity index 77%
rename from math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeTimesOps.scala
rename to math/src/main/java/org/apache/mahout/math/flavor/BackEnum.java
index 51f0f63dd8..1782f04f34 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeTimesOps.scala
+++ b/math/src/main/java/org/apache/mahout/math/flavor/BackEnum.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -15,14 +15,12 @@
  * limitations under the License.
  */
 
-package org.apache.mahout.math.scalabindings
-
-import org.apache.mahout.math.{Matrix, MatrixTimesOps}
-
-class RLikeTimesOps(m: MatrixTimesOps) {
-
-  def :%*%(that: Matrix) = m.timesRight(that)
-
-  def %*%:(that: Matrix) = m.timesLeft(that)
+package org.apache.mahout.math.flavor;
 
+/**
+ * Matrix backends
+ */
+public enum BackEnum {
+  JVMMEM,
+  NETLIB_BLAS
 }
diff --git a/math/src/main/java/org/apache/mahout/math/flavor/MatrixFlavor.java b/math/src/main/java/org/apache/mahout/math/flavor/MatrixFlavor.java
new file mode 100644
index 0000000000..2b5c444a56
--- /dev/null
+++ b/math/src/main/java/org/apache/mahout/math/flavor/MatrixFlavor.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.flavor;
+
+/** A set of matrix structure properties that I denote as "flavor" (by analogy to quarks) */
+public interface MatrixFlavor {
+
+  /**
+   * Whether matrix is backed by a native system -- such as java memory, lapack/atlas, Magma etc.
+   */
+  BackEnum getBacking();
+
+  /**
+   * Structure flavors
+   */
+  TraversingStructureEnum getStructure() ;
+
+  boolean isDense();
+
+  /**
+   * This default for {@link org.apache.mahout.math.DenseMatrix}-like structures
+   */
+  static final MatrixFlavor DENSELIKE = new FlavorImpl(BackEnum.JVMMEM, TraversingStructureEnum.ROWWISE, true);
+  /**
+   * This is default flavor for {@link org.apache.mahout.math.SparseRowMatrix}-like.
+   */
+  static final MatrixFlavor SPARSELIKE = new FlavorImpl(BackEnum.JVMMEM, TraversingStructureEnum.ROWWISE, false);
+
+  /**
+   * This is default flavor for {@link org.apache.mahout.math.SparseMatrix}-like structures, i.e. sparse matrix blocks,
+   * where few, perhaps most, rows may be missing entirely.
+   */
+  static final MatrixFlavor SPARSEROWLIKE = new FlavorImpl(BackEnum.JVMMEM, TraversingStructureEnum.SPARSEROWWISE, false);
+
+  /**
+   * This is default flavor for {@link org.apache.mahout.math.DiagonalMatrix} and the likes.
+   */
+  static final MatrixFlavor DIAGONALLIKE = new FlavorImpl(BackEnum.JVMMEM, TraversingStructureEnum.VECTORBACKED, false);
+
+  static final class FlavorImpl implements MatrixFlavor {
+    private BackEnum pBacking;
+    private TraversingStructureEnum pStructure;
+    private boolean pDense;
+
+    public FlavorImpl(BackEnum backing, TraversingStructureEnum structure, boolean dense) {
+      pBacking = backing;
+      pStructure = structure;
+      pDense = dense;
+    }
+
+    @Override
+    public BackEnum getBacking() {
+      return pBacking;
+    }
+
+    @Override
+    public TraversingStructureEnum getStructure() {
+      return pStructure;
+    }
+
+    @Override
+    public boolean isDense() {
+      return pDense;
+    }
+  }
+
+}
diff --git a/math/src/main/java/org/apache/mahout/math/flavor/TraversingStructureEnum.java b/math/src/main/java/org/apache/mahout/math/flavor/TraversingStructureEnum.java
new file mode 100644
index 0000000000..13c2cf49db
--- /dev/null
+++ b/math/src/main/java/org/apache/mahout/math/flavor/TraversingStructureEnum.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.flavor;
+
+/** STRUCTURE HINT */
+public enum TraversingStructureEnum {
+
+  UNKNOWN,
+
+  /**
+   * Backing vectors are directly available as row views.
+   */
+  ROWWISE,
+
+  /**
+   * Column vectors are directly available as column views.
+   */
+  COLWISE,
+
+  /**
+   * Only some row-wise vectors are really present (can use iterateNonEmpty). Corresponds to
+   * [[org.apache.mahout.math.SparseMatrix]].
+   */
+  SPARSEROWWISE,
+
+  SPARSECOLWISE,
+
+  SPARSEHASH,
+
+  VECTORBACKED,
+
+  BLOCKIFIED
+}
diff --git a/math/src/test/java/org/apache/mahout/math/MatricesTest.java b/math/src/test/java/org/apache/mahout/math/MatricesTest.java
index 1b6169e36e..9405429c87 100644
--- a/math/src/test/java/org/apache/mahout/math/MatricesTest.java
+++ b/math/src/test/java/org/apache/mahout/math/MatricesTest.java
@@ -65,8 +65,8 @@ public void testViewDenseSparseReporting() {
     m.set(1, 1, 33.0);
     Matrix mt = Matrices.transposedView(m);
 
-    assertTrue(!mt.viewColumn(0).isDense());
-    assertTrue(!mt.viewRow(0).isDense());
+    assertTrue(mt.viewColumn(0).isDense() == m.viewRow(0).isDense());
+    assertTrue(mt.viewRow(0).isDense() == m.viewColumn(0).isDense());
 
     m = new DenseMatrix(10,10);
     m.set(1, 1, 33.0);
diff --git a/mr/src/main/java/org/apache/mahout/math/hadoop/DistributedRowMatrix.java b/mr/src/main/java/org/apache/mahout/math/hadoop/DistributedRowMatrix.java
index 1a6ff1670e..de5e216f38 100644
--- a/mr/src/main/java/org/apache/mahout/math/hadoop/DistributedRowMatrix.java
+++ b/mr/src/main/java/org/apache/mahout/math/hadoop/DistributedRowMatrix.java
@@ -132,6 +132,11 @@ public void setOutputTempPathString(String outPathString) {
     }
   }
 
+  @Override
+  public Iterator<MatrixSlice> iterateNonEmpty() {
+    return iterator();
+  }
+
   @Override
   public Iterator<MatrixSlice> iterateAll() {
     try {
diff --git a/mr/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GivensThinSolver.java b/mr/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GivensThinSolver.java
index 7033efe3e1..af79cb4f1c 100644
--- a/mr/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GivensThinSolver.java
+++ b/mr/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/qr/GivensThinSolver.java
@@ -585,6 +585,11 @@ public Vector like() {
       throw new UnsupportedOperationException();
     }
 
+    @Override
+    public Vector like(int cardinality) {
+      throw new UnsupportedOperationException();
+    }
+
     @Override
     public void setQuick(int index, double value) {
       viewed.setQuick(rowNum, index, value);
diff --git a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala b/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
index 5ffc18c677..4d0615afec 100644
--- a/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
+++ b/spark-shell/src/main/scala/org/apache/mahout/sparkbindings/shell/MahoutSparkILoop.scala
@@ -12,13 +12,14 @@ class MahoutSparkILoop extends SparkILoop {
 
   private val postInitScript =
     "import org.apache.mahout.math._" ::
-        "import scalabindings._" ::
-        "import RLikeOps._" ::
-        "import drm._" ::
-        "import RLikeDrmOps._" ::
-        "import org.apache.mahout.sparkbindings._" ::
-        "import collection.JavaConversions._" ::
-        Nil
+      "import scalabindings._" ::
+      "import RLikeOps._" ::
+      "import drm._" ::
+      "import RLikeDrmOps._" ::
+      "import decompositions._" ::
+      "import org.apache.mahout.sparkbindings._" ::
+      "import collection.JavaConversions._" ::
+      Nil
 
   override protected def postInitialization() {
     super.postInitialization()
diff --git a/spark/src/main/scala/org/apache/mahout/common/DrmMetadata.scala b/spark/src/main/scala/org/apache/mahout/common/DrmMetadata.scala
index 5bbccb10bc..0aba3195ac 100644
--- a/spark/src/main/scala/org/apache/mahout/common/DrmMetadata.scala
+++ b/spark/src/main/scala/org/apache/mahout/common/DrmMetadata.scala
@@ -1,3 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.mahout.common
 
 import scala.reflect.ClassTag
diff --git a/spark/src/main/scala/org/apache/mahout/common/HDFSUtil.scala b/spark/src/main/scala/org/apache/mahout/common/HDFSUtil.scala
index f5f87d789b..c949f92ae7 100644
--- a/spark/src/main/scala/org/apache/mahout/common/HDFSUtil.scala
+++ b/spark/src/main/scala/org/apache/mahout/common/HDFSUtil.scala
@@ -17,10 +17,12 @@
 
 package org.apache.mahout.common
 
+import org.apache.spark.SparkContext
+
 /** High level Hadoop version-specific hdfs manipulations we need in context of our operations. */
 trait HDFSUtil {
 
   /** Read DRM header information off (H)DFS. */
-  def readDrmHeader(path:String):DrmMetadata
+  def readDrmHeader(path:String)(implicit sc:SparkContext):DrmMetadata
 }
 
diff --git a/spark/src/main/scala/org/apache/mahout/common/Hadoop1HDFSUtil.scala b/spark/src/main/scala/org/apache/mahout/common/Hadoop1HDFSUtil.scala
index 047104a7f0..d4483341e0 100644
--- a/spark/src/main/scala/org/apache/mahout/common/Hadoop1HDFSUtil.scala
+++ b/spark/src/main/scala/org/apache/mahout/common/Hadoop1HDFSUtil.scala
@@ -15,12 +15,30 @@
  * limitations under the License.
  */
 
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.mahout.common
 
 
 import org.apache.hadoop.io.{Writable, SequenceFile}
 import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.hadoop.conf.Configuration
+import org.apache.spark.SparkContext
 import collection._
 import JavaConversions._
 
@@ -37,7 +55,11 @@ object Hadoop1HDFSUtil extends HDFSUtil {
    */
   def readDrmHeader(path: String): DrmMetadata = {
     val dfsPath = new Path(path)
-    val fs = dfsPath.getFileSystem(new Configuration())
+
+    val fs = dfsPath.getFileSystem(sc.hadoopConfiguration)
+
+    // Apparently getFileSystem() doesn't set conf??
+    fs.setConf(sc.hadoopConfiguration)
 
     val partFilePath:Path = fs.listStatus(dfsPath)
 
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/SparkEngine.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/SparkEngine.scala
index 595cd6625c..395f67474b 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/SparkEngine.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/SparkEngine.scala
@@ -24,51 +24,58 @@ import org.apache.mahout.sparkbindings.indexeddataset.IndexedDatasetSpark
 import scalabindings._
 import RLikeOps._
 import org.apache.mahout.math.drm.logical._
-import org.apache.mahout.sparkbindings.drm.{CheckpointedDrmSpark, DrmRddInput}
+import org.apache.mahout.sparkbindings.drm.{cpDrmGeneric2DrmRddInput, CheckpointedDrmSpark, DrmRddInput}
 import org.apache.mahout.math._
+import scala.Predef
 import scala.reflect.ClassTag
 import org.apache.spark.storage.StorageLevel
 import org.apache.mahout.sparkbindings.blas._
 import org.apache.hadoop.io._
-import scala.Some
-import scala.collection.JavaConversions._
+import collection._
+import JavaConversions._
 import org.apache.mahout.math.drm._
 import org.apache.mahout.math.drm.RLikeDrmOps._
 import org.apache.spark.rdd.RDD
 import org.apache.mahout.common.{Hadoop1HDFSUtil, HDFSUtil}
 
+
 /** Spark-specific non-drm-method operations */
 object SparkEngine extends DistributedEngine {
 
   // By default, use Hadoop 1 utils
   var hdfsUtils: HDFSUtil = Hadoop1HDFSUtil
 
-  def colSums[K:ClassTag](drm: CheckpointedDrm[K]): Vector = {
+  def colSums[K: ClassTag](drm: CheckpointedDrm[K]): Vector = {
     val n = drm.ncol
 
     drm.rdd
+
       // Throw away keys
       .map(_._2)
+
       // Fold() doesn't work with kryo still. So work around it.
-      .mapPartitions(iter => {
-      val acc = ((new DenseVector(n): Vector) /: iter)((acc, v) => acc += v)
+      .mapPartitions(iter ⇒ {
+      val acc = ((new DenseVector(n): Vector) /: iter)((acc, v) ⇒  acc += v)
       Iterator(acc)
     })
+
       // Since we preallocated new accumulator vector per partition, this must not cause any side
       // effects now.
       .reduce(_ += _)
   }
 
-  def numNonZeroElementsPerColumn[K:ClassTag](drm: CheckpointedDrm[K]): Vector = {
+  def numNonZeroElementsPerColumn[K: ClassTag](drm: CheckpointedDrm[K]): Vector = {
     val n = drm.ncol
 
     drm.rdd
+
       // Throw away keys
       .map(_._2)
+
       // Fold() doesn't work with kryo still. So work around it.
-      .mapPartitions(iter => {
-      val acc = ((new DenseVector(n): Vector) /: iter) { (acc, v) =>
-        v.nonZeroes().foreach { elem => acc(elem.index) += 1 }
+      .mapPartitions(iter ⇒ {
+      val acc = ((new DenseVector(n): Vector) /: iter) { (acc, v) ⇒
+        v.nonZeroes().foreach { elem ⇒  acc(elem.index) += 1}
         acc
       }
       Iterator(acc)
@@ -79,17 +86,25 @@ object SparkEngine extends DistributedEngine {
   }
 
   /** Engine-specific colMeans implementation based on a checkpoint. */
-  override def colMeans[K:ClassTag](drm: CheckpointedDrm[K]): Vector =
+  override def colMeans[K: ClassTag](drm: CheckpointedDrm[K]): Vector =
     if (drm.nrow == 0) drm.colSums() else drm.colSums() /= drm.nrow
 
   override def norm[K: ClassTag](drm: CheckpointedDrm[K]): Double =
     drm.rdd
-        // Compute sum of squares of each vector
-        .map {
-      case (key, v) => v dot v
+      // Compute sum of squares of each vector
+      .map {
+      case (key, v) ⇒ v dot v
     }
-        .reduce(_ + _)
+      .reduce(_ + _)
+
 
+  /** Optional engine-specific all reduce tensor operation. */
+  override def allreduceBlock[K: ClassTag](drm: CheckpointedDrm[K], bmf: BlockMapFunc2[K], rf:
+  BlockReduceFunc): Matrix = {
+
+    import drm._
+    drm.toBlockifiedDrmRdd(ncol = drm.ncol).map(bmf(_)).reduce(rf)
+  }
 
   /**
    * Perform default expression rewrite. Return physical plan that we can pass to exec(). <P>
@@ -104,10 +119,10 @@ object SparkEngine extends DistributedEngine {
   def toPhysical[K: ClassTag](plan: DrmLike[K], ch: CacheHint.CacheHint): CheckpointedDrm[K] = {
 
     // Spark-specific Physical Plan translation.
-    val rdd = tr2phys(plan)
+    val rddInput = tr2phys(plan)
 
     val newcp = new CheckpointedDrmSpark(
-      rdd = rdd,
+      rddInput = rddInput,
       _nrow = plan.nrow,
       _ncol = plan.ncol,
       _cacheStorageLevel = cacheHint2Spark(ch),
@@ -131,7 +146,13 @@ object SparkEngine extends DistributedEngine {
    *
    * @return DRM[Any] where Any is automatically translated to value type
    */
-  def drmDfsRead (path: String, parMin:Int = 0)(implicit sc: DistributedContext): CheckpointedDrm[_] = {
+  def drmDfsRead(path: String, parMin: Int = 0)(implicit sc: DistributedContext): CheckpointedDrm[_] = {
+
+    // Require that context is actually Spark context.
+    require(sc.isInstanceOf[SparkDistributedContext], "Supplied context must be for the Spark backend.")
+
+    // Extract spark context -- we need it for some operations.
+    implicit val ssc = sc.asInstanceOf[SparkDistributedContext].sc
 
     val drmMetadata = hdfsUtils.readDrmHeader(path)
     val k2vFunc = drmMetadata.keyW2ValFunc
@@ -140,8 +161,8 @@ object SparkEngine extends DistributedEngine {
     // Hadoop we must do it right after read operation).
     val rdd = sc.sequenceFile(path, classOf[Writable], classOf[VectorWritable], minPartitions = parMin)
 
-        // Immediately convert keys and value writables into value types.
-        .map { case (wKey, wVec) => k2vFunc(wKey) -> wVec.get()}
+      // Immediately convert keys and value writables into value types.
+      .map { case (wKey, wVec) ⇒ k2vFunc(wKey) -> wVec.get()}
 
     // Wrap into a DRM type with correct matrix row key class tag evident.
     drmWrap(rdd = rdd, cacheHint = CacheHint.NONE)(drmMetadata.keyClassTag.asInstanceOf[ClassTag[Any]])
@@ -149,67 +170,141 @@ object SparkEngine extends DistributedEngine {
 
   /** Parallelize in-core matrix as spark distributed matrix, using row ordinal indices as data set keys. */
   def drmParallelizeWithRowIndices(m: Matrix, numPartitions: Int = 1)
-      (implicit sc: DistributedContext)
+                                  (implicit sc: DistributedContext)
   : CheckpointedDrm[Int] = {
-    new CheckpointedDrmSpark(rdd = parallelizeInCore(m, numPartitions))
+    new CheckpointedDrmSpark(rddInput = parallelizeInCore(m, numPartitions), _nrow = m.nrow, _ncol = m.ncol)
   }
 
   private[sparkbindings] def parallelizeInCore(m: Matrix, numPartitions: Int = 1)
-      (implicit sc: DistributedContext): DrmRdd[Int] = {
+                                              (implicit sc: DistributedContext): DrmRdd[Int] = {
 
-    val p = (0 until m.nrow).map(i => i -> m(i, ::))
+    val p = (0 until m.nrow).map(i => i → m(i, ::))
     sc.parallelize(p, numPartitions)
 
   }
 
   /** Parallelize in-core matrix as spark distributed matrix, using row labels as a data set keys. */
   def drmParallelizeWithRowLabels(m: Matrix, numPartitions: Int = 1)
-      (implicit sc: DistributedContext)
+                                 (implicit sc: DistributedContext)
   : CheckpointedDrm[String] = {
 
     val rb = m.getRowLabelBindings
-    val p = for (i: String <- rb.keySet().toIndexedSeq) yield i -> m(rb(i), ::)
+    val p = for (i: String ← rb.keySet().toIndexedSeq) yield i → m(rb(i), ::)
 
-    new CheckpointedDrmSpark(rdd = sc.parallelize(p, numPartitions))
+    new CheckpointedDrmSpark(rddInput = sc.parallelize(p, numPartitions), _nrow = m.nrow, _ncol = m.ncol)
   }
 
   /** This creates an empty DRM with specified number of partitions and cardinality. */
   def drmParallelizeEmpty(nrow: Int, ncol: Int, numPartitions: Int = 10)
-      (implicit sc: DistributedContext): CheckpointedDrm[Int] = {
-    val rdd = sc.parallelize(0 to numPartitions, numPartitions).flatMap(part => {
+                         (implicit sc: DistributedContext): CheckpointedDrm[Int] = {
+    val rdd = sc.parallelize(0 to numPartitions, numPartitions).flatMap(part ⇒ {
       val partNRow = (nrow - 1) / numPartitions + 1
       val partStart = partNRow * part
       val partEnd = Math.min(partStart + partNRow, nrow)
 
-      for (i <- partStart until partEnd) yield (i, new RandomAccessSparseVector(ncol): Vector)
+      for (i ← partStart until partEnd) yield (i, new RandomAccessSparseVector(ncol): Vector)
     })
     new CheckpointedDrmSpark[Int](rdd, nrow, ncol)
   }
 
   def drmParallelizeEmptyLong(nrow: Long, ncol: Int, numPartitions: Int = 10)
-      (implicit sc: DistributedContext): CheckpointedDrm[Long] = {
-    val rdd = sc.parallelize(0 to numPartitions, numPartitions).flatMap(part => {
+                             (implicit sc: DistributedContext): CheckpointedDrm[Long] = {
+    val rdd = sc.parallelize(0 to numPartitions, numPartitions).flatMap(part ⇒ {
       val partNRow = (nrow - 1) / numPartitions + 1
       val partStart = partNRow * part
       val partEnd = Math.min(partStart + partNRow, nrow)
 
-      for (i <- partStart until partEnd) yield (i, new RandomAccessSparseVector(ncol): Vector)
+      for (i ← partStart until partEnd) yield (i, new RandomAccessSparseVector(ncol): Vector)
     })
     new CheckpointedDrmSpark[Long](rdd, nrow, ncol)
   }
 
+  /**
+   * Convert non-int-keyed matrix to an int-keyed, computing optionally mapping from old keys
+   * to row indices in the new one. The mapping, if requested, is returned as a 1-column matrix.
+   */
+  override def drm2IntKeyed[K: ClassTag](drmX: DrmLike[K], computeMap: Boolean = false): (DrmLike[Int], Option[DrmLike[K]]) = {
+    if (classTag[K] == ClassTag.Int) {
+      drmX.asInstanceOf[DrmLike[Int]] → None
+    } else {
+
+      val drmXcp = drmX.checkpoint(CacheHint.MEMORY_ONLY)
+      val ncol = drmXcp.asInstanceOf[CheckpointedDrmSpark[K]]._ncol
+      val nrow = drmXcp.asInstanceOf[CheckpointedDrmSpark[K]]._nrow
+
+      // Compute sequential int key numbering.
+      val (intRdd, keyMap) = blas.rekeySeqInts(rdd = drmXcp.rdd, computeMap = computeMap)
+
+      // Convert computed key mapping to a matrix.
+      val mxKeyMap = keyMap.map { rdd =>
+        drmWrap(rdd = rdd.map { case (key, ordinal) ⇒ key → (dvec(ordinal):Vector)}, ncol = 1, nrow = nrow)
+      }
+
+
+      drmWrap(rdd = intRdd, ncol = ncol) → mxKeyMap
+  }
+
+  }
+
+
+  /**
+   * (Optional) Sampling operation. Consistent with Spark semantics of the same.
+   * @param drmX
+   * @param fraction
+   * @param replacement
+   * @tparam K
+   * @return
+   */
+  override def drmSampleRows[K: ClassTag](drmX: DrmLike[K], fraction: Double, replacement: Boolean): DrmLike[K] = {
+
+    // We do want to take ncol if already computed, if not, then we don't want to trigger computation
+    // here.
+    val ncol = drmX match {
+      case cp: CheckpointedDrmSpark[K] ⇒ cp._ncol
+      case _ ⇒ -1
+    }
+    val sample = drmX.rdd.sample(withReplacement = replacement, fraction = fraction)
+    if (classTag[K] != ClassTag.Int) return drmWrap(sample, ncol = ncol)
+
+    // K == Int: Int-keyed sample. rebase int counts.
+    drmWrap(rdd = blas.rekeySeqInts(rdd = sample, computeMap = false)._1, ncol = ncol).asInstanceOf[DrmLike[K]]
+  }
+
+
+  override def drmSampleKRows[K: ClassTag](drmX: DrmLike[K], numSamples: Int, replacement: Boolean): Matrix = {
+
+    val ncol = drmX match {
+      case cp: CheckpointedDrmSpark[K] ⇒ cp._ncol
+      case _ ⇒ -1
+    }
+
+    // I think as of the time of this writing, takeSample() in Spark is biased. It is not a true
+    // hypergeometric sampler. But it is faster than a true hypergeometric/categorical samplers
+    // would be.
+    val sample = drmX.rdd.takeSample(withReplacement = replacement, num = numSamples)
+    val isSparse = sample.exists { case (_, vec) ⇒ !vec.isDense }
+
+    val vectors = sample.map(_._2)
+    val labels = sample.view.zipWithIndex.map { case ((key, _), idx) ⇒ key.toString → (idx:Integer) }.toMap
+
+    val mx:Matrix = if (isSparse) sparse(vectors:_*) else dense(vectors)
+    mx.setRowLabelBindings(labels)
+
+    mx
+  }
+
   private[mahout] def cacheHint2Spark(cacheHint: CacheHint.CacheHint): StorageLevel = cacheHint match {
-    case CacheHint.NONE => StorageLevel.NONE
-    case CacheHint.DISK_ONLY => StorageLevel.DISK_ONLY
-    case CacheHint.DISK_ONLY_2 => StorageLevel.DISK_ONLY_2
-    case CacheHint.MEMORY_ONLY => StorageLevel.MEMORY_ONLY
-    case CacheHint.MEMORY_ONLY_2 => StorageLevel.MEMORY_ONLY_2
-    case CacheHint.MEMORY_ONLY_SER => StorageLevel.MEMORY_ONLY_SER
-    case CacheHint.MEMORY_ONLY_SER_2 => StorageLevel.MEMORY_ONLY_SER_2
-    case CacheHint.MEMORY_AND_DISK => StorageLevel.MEMORY_AND_DISK
-    case CacheHint.MEMORY_AND_DISK_2 => StorageLevel.MEMORY_AND_DISK_2
-    case CacheHint.MEMORY_AND_DISK_SER => StorageLevel.MEMORY_AND_DISK_SER
-    case CacheHint.MEMORY_AND_DISK_SER_2 => StorageLevel.MEMORY_AND_DISK_SER_2
+    case CacheHint.NONE ⇒ StorageLevel.NONE
+    case CacheHint.DISK_ONLY ⇒ StorageLevel.DISK_ONLY
+    case CacheHint.DISK_ONLY_2 ⇒ StorageLevel.DISK_ONLY_2
+    case CacheHint.MEMORY_ONLY ⇒ StorageLevel.MEMORY_ONLY
+    case CacheHint.MEMORY_ONLY_2 ⇒ StorageLevel.MEMORY_ONLY_2
+    case CacheHint.MEMORY_ONLY_SER ⇒ StorageLevel.MEMORY_ONLY_SER
+      case CacheHint.MEMORY_ONLY_SER_2 ⇒ StorageLevel.MEMORY_ONLY_SER_2
+    case CacheHint.MEMORY_AND_DISK ⇒ StorageLevel.MEMORY_AND_DISK
+    case CacheHint.MEMORY_AND_DISK_2 ⇒ StorageLevel.MEMORY_AND_DISK_2
+    case CacheHint.MEMORY_AND_DISK_SER ⇒ StorageLevel.MEMORY_AND_DISK_SER
+    case CacheHint.MEMORY_AND_DISK_SER_2 ⇒ StorageLevel.MEMORY_AND_DISK_SER_2
   }
 
   /** Translate previously optimized physical plan */
@@ -221,31 +316,32 @@ object SparkEngine extends DistributedEngine {
       // If there are any such cases, they must go away in pass1. If they were not, then it wasn't
       // the A'A case but actual transposition intent which should be removed from consideration
       // (we cannot do actual flip for non-int-keyed arguments)
-      case OpAtAnyKey(_) =>
+      case OpAtAnyKey(_) ⇒
         throw new IllegalArgumentException("\"A\" must be Int-keyed in this A.t expression.")
-      case op@OpAt(a) => At.at(op, tr2phys(a)(op.classTagA))
-      case op@OpABt(a, b) => ABt.abt(op, tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB))
-      case op@OpAtB(a, b) => AtB.atb_nograph(op, tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB),
-        zippable = a.partitioningTag == b.partitioningTag)
-      case op@OpAtA(a) => AtA.at_a(op, tr2phys(a)(op.classTagA))
-      case op@OpAx(a, x) => Ax.ax_with_broadcast(op, tr2phys(a)(op.classTagA))
-      case op@OpAtx(a, x) => Ax.atx_with_broadcast(op, tr2phys(a)(op.classTagA))
-      case op@OpAewB(a, b, opId) => AewB.a_ew_b(op, tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB))
-      case op@OpCbind(a, b) => CbindAB.cbindAB_nograph(op, tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB))
-      case op@OpRbind(a, b) => RbindAB.rbindAB(op, tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB))
-      case op@OpAewScalar(a, s, _) => AewB.a_ew_scalar(op, tr2phys(a)(op.classTagA), s)
-      case op@OpRowRange(a, _) => Slicing.rowRange(op, tr2phys(a)(op.classTagA))
-      case op@OpTimesRightMatrix(a, _) => AinCoreB.rightMultiply(op, tr2phys(a)(op.classTagA))
+      case op@OpAt(a) ⇒ At.at(op, tr2phys(a)(op.classTagA))
+      case op@OpABt(a, b) ⇒ ABt.abt(op, tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB))
+      case op@OpAtB(a, b) ⇒ AtB.atb(op, tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB))
+      case op@OpAtA(a) ⇒ AtA.at_a(op, tr2phys(a)(op.classTagA))
+      case op@OpAx(a, x) ⇒ Ax.ax_with_broadcast(op, tr2phys(a)(op.classTagA))
+      case op@OpAtx(a, x) ⇒ Ax.atx_with_broadcast(op, tr2phys(a)(op.classTagA))
+      case op@OpAewUnaryFunc(a, _, _) ⇒ AewB.a_ew_func(op, tr2phys(a)(op.classTagA))
+      case op@OpAewUnaryFuncFusion(a, _) ⇒ AewB.a_ew_func(op, tr2phys(a)(op.classTagA))
+      case op@OpAewB(a, b, opId) ⇒ AewB.a_ew_b(op, tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB))
+      case op@OpCbind(a, b) ⇒ CbindAB.cbindAB_nograph(op, tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB))
+      case op@OpCbindScalar(a, _, _) ⇒ CbindAB.cbindAScalar(op, tr2phys(a)(op.classTagA))
+      case op@OpRbind(a, b) ⇒ RbindAB.rbindAB(op, tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB))
+      case op@OpAewScalar(a, s, _) ⇒ AewB.a_ew_scalar(op, tr2phys(a)(op.classTagA), s)
+      case op@OpRowRange(a, _) ⇒ Slicing.rowRange(op, tr2phys(a)(op.classTagA))
+      case op@OpTimesRightMatrix(a, _) ⇒ AinCoreB.rightMultiply(op, tr2phys(a)(op.classTagA))
       // Custom operators, we just execute them
-      case blockOp: OpMapBlock[K, _] => MapBlock.exec(
+      case blockOp: OpMapBlock[K, _] ⇒ MapBlock.exec(
         src = tr2phys(blockOp.A)(blockOp.classTagA),
-        ncol = blockOp.ncol,
-        bmf = blockOp.bmf
+        operator = blockOp
       )
-      case op@OpPar(a,_,_) => Par.exec(op,tr2phys(a)(op.classTagA))
-      case cp: CheckpointedDrm[K] => new DrmRddInput[K](rowWiseSrc = Some((cp.ncol, cp.rdd)))
-      case _ => throw new IllegalArgumentException("Internal:Optimizer has no exec policy for operator %s."
-          .format(oper))
+      case op@OpPar(a, _, _) ⇒ Par.exec(op, tr2phys(a)(op.classTagA))
+      case cp: CheckpointedDrm[K] ⇒ cp.rdd: DrmRddInput[K]
+      case _ ⇒ throw new IllegalArgumentException("Internal:Optimizer has no exec policy for operator %s."
+        .format(oper))
 
     }
   }
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/ABt.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/ABt.scala
index 1e3f286998..11e2badd18 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/ABt.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/ABt.scala
@@ -19,16 +19,23 @@ package org.apache.mahout.sparkbindings.blas
 
 import org.apache.mahout.math.scalabindings._
 import RLikeOps._
+import org.apache.spark.rdd.RDD
 import scala.reflect.ClassTag
 import org.apache.mahout.sparkbindings._
-import drm._
-import org.apache.mahout.math.{Matrix, SparseRowMatrix}
+import org.apache.mahout.math.drm.BlockifiedDrmTuple
+import org.apache.mahout.sparkbindings.drm._
+import org.apache.mahout.math.{SparseMatrix, Matrix, SparseRowMatrix}
 import org.apache.spark.SparkContext._
 import org.apache.mahout.math.drm.logical.OpABt
+import org.apache.mahout.logging._
+
+import scala.tools.nsc.io.Pickler.TildeDecorator
 
 /** Contains RDD plans for ABt operator */
 object ABt {
 
+  private final implicit val log = getLog(ABt.getClass)
+
   /**
    * General entry point for AB' operator.
    *
@@ -40,8 +47,11 @@ object ABt {
   def abt[K: ClassTag](
       operator: OpABt[K],
       srcA: DrmRddInput[K],
-      srcB: DrmRddInput[Int]): DrmRddInput[K] =
+      srcB: DrmRddInput[Int]): DrmRddInput[K] = {
+
+    debug("operator AB'(Spark)")
     abt_nograph(operator, srcA, srcB)
+  }
 
   /**
    * Computes AB' without GraphX.
@@ -63,7 +73,146 @@ object ABt {
       srcB: DrmRddInput[Int]): DrmRddInput[K] = {
 
     // Blockify everything.
-    val blocksA = srcA.toBlockifiedDrmRdd()
+    val blocksA = srcA.toBlockifiedDrmRdd(operator.A.ncol)
+
+    val blocksB = srcB.toBlockifiedDrmRdd(operator.B.ncol)
+
+    val prodNCol = operator.ncol
+    val prodNRow = operator.nrow
+    // We are actually computing AB' here. 
+    val numProductPartitions = estimateProductPartitions(anrow = prodNRow, ancol = operator.A.ncol,
+      bncol = prodNCol, aparts = blocksA.partitions.size, bparts = blocksB.partitions.size)
+
+    debug(
+      s"AB': #parts = $numProductPartitions; A #parts=${blocksA.partitions.size}, B #parts=${blocksB.partitions.size}."+
+      s"A=${operator.A.nrow}x${operator.A.ncol}, B=${operator.B.nrow}x${operator.B.ncol},AB'=${prodNRow}x$prodNCol."
+    )
+
+    // blockwise multimplication function
+    def mmulFunc(tupleA: BlockifiedDrmTuple[K], tupleB: BlockifiedDrmTuple[Int]): (Array[K], Array[Int], Matrix) = {
+      val (keysA, blockA) = tupleA
+      val (keysB, blockB) = tupleB
+
+      var ms = traceDo(System.currentTimeMillis())
+
+      // We need to send keysB to the aggregator in order to know which columns are being updated.
+      val result = (keysA, keysB, (blockA %*% blockB.t))
+
+      ms = traceDo(System.currentTimeMillis() - ms.get)
+      trace(
+        s"block multiplication of(${blockA.nrow}x${blockA.ncol} x ${blockB.ncol}x${blockB.nrow} is completed in $ms " +
+          "ms.")
+      trace(s"block multiplication types: blockA: ${blockA.getClass.getName}(${blockA.t.getClass.getName}); " +
+        s"blockB: ${blockB.getClass.getName}.")
+
+      result
+    }
+
+    val blockwiseMmulRdd =
+
+    // Combine blocks pairwise.
+      pairwiseApply(blocksA, blocksB, mmulFunc _)
+
+        // Now reduce proper product blocks.
+        .combineByKey(
+
+          // Empty combiner += value
+          createCombiner = (t: (Array[K], Array[Int], Matrix)) =>  {
+            val (rowKeys, colKeys, block) = t
+            val comb = new SparseMatrix(prodNCol, block.nrow).t
+
+            for ((col, i) <- colKeys.zipWithIndex) comb(::, col) := block(::, i)
+            rowKeys -> comb
+          },
+
+          // Combiner += value
+          mergeValue = (comb: (Array[K], Matrix), value: (Array[K], Array[Int], Matrix)) => {
+            val (rowKeys, c) = comb
+            val (_, colKeys, block) = value
+            for ((col, i) <- colKeys.zipWithIndex) c(::, col) := block(::, i)
+            comb
+          },
+
+          // Combiner + Combiner
+          mergeCombiners = (comb1: (Array[K], Matrix), comb2: (Array[K], Matrix)) => {
+            comb1._2 += comb2._2
+            comb1
+          },
+
+          numPartitions = blocksA.partitions.size max blocksB.partitions.size
+        )
+
+
+    // Created BlockifiedRDD-compatible structure.
+    val blockifiedRdd = blockwiseMmulRdd
+
+      // throw away A-partition #
+      .map{case (_,tuple) => tuple}
+
+    val numPartsResult = blockifiedRdd.partitions.size
+
+    // See if we need to rebalance away from A granularity.
+    if (numPartsResult * 2 < numProductPartitions || numPartsResult / 2 > numProductPartitions) {
+
+      debug(s"Will re-coalesce from ${numPartsResult} to ${numProductPartitions}")
+
+      val rowRdd = deblockify(blockifiedRdd).coalesce(numPartitions = numProductPartitions)
+
+      rowRdd
+
+    } else {
+
+      // We don't have a terribly different partition
+      blockifiedRdd
+    }
+
+  }
+
+  /**
+   * This function tries to use join instead of cartesian to group blocks together without bloating
+   * the number of partitions. Hope is that we can apply pairwise reduction of block pair right away
+   * so if the data to one of the join parts is streaming, the result is still fitting to memory,
+   * since result size is much smaller than the operands.
+   *
+   * @param blocksA blockified RDD for A
+   * @param blocksB blockified RDD for B
+   * @param blockFunc a function over (blockA, blockB). Implies `blockA %*% blockB.t` but perhaps may be
+   *                  switched to another scheme based on which of the sides, A or B, is bigger.
+   */
+  private def pairwiseApply[K1, K2, T](blocksA: BlockifiedDrmRdd[K1], blocksB: BlockifiedDrmRdd[K2], blockFunc:
+  (BlockifiedDrmTuple[K1], BlockifiedDrmTuple[K2]) => T): RDD[(Int, T)] = {
+
+    // We will be joining blocks in B to blocks in A using A-partition as a key.
+
+    // Prepare A side.
+    val blocksAKeyed = blocksA.mapPartitionsWithIndex { (part, blockIter) =>
+
+      val r = if (blockIter.hasNext) Some(part -> blockIter.next) else Option.empty[(Int, BlockifiedDrmTuple[K1])]
+
+      require(blockIter.hasNext == false, s"more than 1 (${blockIter.size + 1}) blocks per partition and A of AB'")
+
+      r.toIterator
+    }
+
+    // Prepare B-side.
+    val aParts = blocksA.partitions.size
+    val blocksBKeyed = blocksB.flatMap(bTuple => for (blockKey <- (0 until aParts).view) yield blockKey -> bTuple )
+
+    // Perform the inner join. Let's try to do a simple thing now.
+    blocksAKeyed.join(blocksBKeyed, numPartitions = aParts)
+
+    // Apply product function which should produce smaller products. Hopefully, this streams blockB's in
+    .map{case (partKey,(blockA, blockB)) => partKey -> blockFunc(blockA, blockB)}
+
+  }
+
+  private[blas] def abt_nograph_cart[K: ClassTag](
+      operator: OpABt[K],
+      srcA: DrmRddInput[K],
+      srcB: DrmRddInput[Int]): DrmRddInput[K] = {
+
+    // Blockify everything.
+    val blocksA = srcA.toBlockifiedDrmRdd(operator.A.ncol)
 
         // Mark row-blocks with group id
         .mapPartitionsWithIndex((part, iter) => {
@@ -83,28 +232,35 @@ object ABt {
       }
     })
 
-    val blocksB = srcB.toBlockifiedDrmRdd()
+    val blocksB = srcB.toBlockifiedDrmRdd(operator.B.ncol)
 
     // Final product's geometry. We want to extract that into local variables since we want to use
     // them as closure attributes.
     val prodNCol = operator.ncol
     val prodNRow = operator.nrow
-    
-    // Approximate number of final partitions.
-    val numProductPartitions =
-      if (blocksA.partitions.size > blocksB.partitions.size) {
-        ((prodNCol.toDouble / operator.A.ncol) * blocksA.partitions.size).ceil.toInt
-      } else {
-        ((prodNRow.toDouble / operator.B.ncol) * blocksB.partitions.size).ceil.toInt
-      }
+    val aNCol = operator.A.ncol
+
+    // Approximate number of final partitions. We take bigger partitions as our guide to number of
+    // elements per partition. TODO: do it better.
 
-    //srcA.partitions.size.max(that = srcB.partitions.size)
+    // Elements per partition, bigger of two operands.
+    val epp = aNCol.toDouble * prodNRow / blocksA.partitions.size max aNCol.toDouble * prodNCol /
+      blocksB.partitions.size
 
+    // Number of partitions we want to converge to in the product. For now we simply extrapolate that
+    // assuming product density and operand densities being about the same; and using the same element
+    // per partition number in the product as the bigger of two operands.
+    val numProductPartitions = (prodNCol.toDouble * prodNRow / epp).ceil.toInt
+
+    debug(
+      s"AB': #parts = $numProductPartitions; A #parts=${blocksA.partitions.size}, B #parts=${blocksB.partitions.size}.")
 
     // The plan.
-    var blockifiedRdd :BlockifiedDrmRdd[K] = blocksA
+    var blockifiedRdd: BlockifiedDrmRdd[K] = blocksA
 
-        // Build Cartesian. It may require a bit more memory there at tasks.
+        // Build Cartesian. It generates a LOT of tasks. TODO: figure how to fix performance of AB'
+        // operator. The thing is that product after map is really small one (partition fraction x
+        // partition fraction) so they can be combined into much bigger chunks.
         .cartesian(blocksB)
 
         // Multiply blocks
@@ -126,10 +282,14 @@ object ABt {
         .combineByKey[(Array[K],Matrix)](
 
           createCombiner = (t: (Array[K], Array[Int], Matrix)) => t match {
+
+            // Create combiner structure out of two products. Our combiner is sparse row matrix
+            // initialized to final product partition block dimensions.
             case (rowKeys, colKeys, blockProd) =>
 
-              // Accumulator is a row-wise block of sparse vectors.
-              val acc:Matrix = new SparseRowMatrix(rowKeys.size, prodNCol)
+              // Accumulator is a row-wise block of sparse vectors. Since we assign to columns,
+              // the most efficient is perhaps to create column-oriented block here.
+              val acc:Matrix = new SparseRowMatrix(prodNCol, rowKeys.size).t
 
               // Update accumulator using colKeys as column index indirection
               colKeys.view.zipWithIndex.foreach({
@@ -168,6 +328,8 @@ object ABt {
     // having at most one block per partition.
     blockifiedRdd = rbind(blockifiedRdd)
 
-    new DrmRddInput(blockifiedSrc = Some(blockifiedRdd))
+    blockifiedRdd
   }
+
+
 }
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AewB.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AewB.scala
index 3cdb797a7f..8a90398cb5 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AewB.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AewB.scala
@@ -20,19 +20,22 @@ package org.apache.mahout.sparkbindings.blas
 import org.apache.mahout.sparkbindings.drm.DrmRddInput
 import scala.reflect.ClassTag
 import org.apache.spark.SparkContext._
-import org.apache.mahout.math.scalabindings._
+import org.apache.mahout.math._
+import scalabindings._
 import RLikeOps._
 import org.apache.mahout.math.{SequentialAccessSparseVector, Matrix, Vector}
-import org.apache.mahout.math.drm.logical.{OpAewScalar, OpAewB}
-import org.apache.log4j.Logger
+import org.apache.mahout.math.drm.logical.{AbstractUnaryOp, TEwFunc, OpAewScalar, OpAewB}
 import org.apache.mahout.sparkbindings.blas.AewB.{ReduceFuncScalar, ReduceFunc}
 import org.apache.mahout.sparkbindings.{BlockifiedDrmRdd, DrmRdd, drm}
 import org.apache.mahout.math.drm._
+import org.apache.mahout.logging._
+import collection._
+import JavaConversions._
 
 /** Elementwise drm-drm operators */
 object AewB {
 
-  private val log = Logger.getLogger(AewB.getClass)
+  private final implicit val log = getLog(AewB.getClass)
 
   /**
    * Set to false to disallow in-place elementwise operations in case side-effects and non-idempotent
@@ -44,10 +47,10 @@ object AewB {
 
   type ReduceFuncScalar = (Matrix, Double) => Matrix
 
-  private[blas] def getEWOps() = {
-    val inplaceProp = System.getProperty(PROPERTY_AEWB_INPLACE, "true").toBoolean
-    if (inplaceProp) InplaceEWOps else CloningEWOps
-  }
+  private[blas] def ewInplace(): Boolean = System.getProperty(PROPERTY_AEWB_INPLACE, "false").toBoolean
+
+  private[blas] def getEWOps() = if (ewInplace()) InplaceEWOps else CloningEWOps
+
 
   /** Elementwise matrix-matrix operator, now handles both non- and identically partitioned */
   def a_ew_b[K: ClassTag](op: OpAewB[K], srcA: DrmRddInput[K], srcB: DrmRddInput[K]): DrmRddInput[K] = {
@@ -67,12 +70,14 @@ object AewB {
     val a = srcA.toDrmRdd()
     val b = srcB.toDrmRdd()
 
+    debug(s"A${op.op}B: #partsA=${a.partitions.size},#partsB=${b.partitions.size}.")
+
     // Check if A and B are identically partitioned AND keyed. if they are, then just perform zip
     // instead of join, and apply the op map-side. Otherwise, perform join and apply the op
     // reduce-side.
     val rdd = if (op.isIdenticallyPartitioned(op.A)) {
 
-      log.debug("applying zipped elementwise")
+      debug(s"A${op.op}B:applying zipped elementwise")
 
       a
           .zip(b)
@@ -83,7 +88,7 @@ object AewB {
       }
     } else {
 
-      log.debug("applying elementwise as join")
+      debug("A${op.op}B:applying elementwise as join")
 
       a
           // Full outer-join operands row-wise
@@ -103,13 +108,51 @@ object AewB {
       })
     }
 
-    new DrmRddInput(rowWiseSrc = Some(ncol -> rdd))
+    rdd
+  }
+
+  def a_ew_func[K:ClassTag](op:AbstractUnaryOp[K,K] with TEwFunc, srcA: DrmRddInput[K]):DrmRddInput[K] = {
+
+    val evalZeros = op.evalZeros
+    val inplace = ewInplace()
+    val f = op.f
+
+    // Before obtaining blockified rdd, see if we have to fix int row key consistency so that missing
+    // rows can get lazily pre-populated with empty vectors before proceeding with elementwise scalar.
+    val aBlockRdd = if (implicitly[ClassTag[K]] == ClassTag.Int && op.A.canHaveMissingRows && evalZeros) {
+      val fixedRdd = fixIntConsistency(op.A.asInstanceOf[DrmLike[Int]], src = srcA.toDrmRdd().asInstanceOf[DrmRdd[Int]])
+      drm.blockify(fixedRdd, blockncol = op.A.ncol).asInstanceOf[BlockifiedDrmRdd[K]]
+    } else {
+      srcA.toBlockifiedDrmRdd(op.A.ncol)
+    }
+
+    val rdd = aBlockRdd.map {case (keys, block) =>
+
+      // Do inplace or allocate a new copy?
+      val newBlock = if (inplace) block else block cloned
+
+      // Operation cares about zeros?
+      if (evalZeros) {
+
+        // Yes, we evaluate all:
+        newBlock := ((_,_,x)=>f(x))
+      } else {
+
+        // No, evaluate non-zeros only row-wise
+        for (row <- newBlock; el <- row.nonZeroes) el := f(el.get)
+      }
+
+      keys -> newBlock
+    }
+
+    rdd
   }
 
   /** Physical algorithm to handle matrix-scalar operators like A - s or s -: A */
   def a_ew_scalar[K: ClassTag](op: OpAewScalar[K], srcA: DrmRddInput[K], scalar: Double):
   DrmRddInput[K] = {
 
+
     val ewOps = getEWOps()
     val opId = op.op
 
@@ -129,15 +172,17 @@ object AewB {
       val fixedRdd = fixIntConsistency(op.A.asInstanceOf[DrmLike[Int]], src = srcA.toDrmRdd().asInstanceOf[DrmRdd[Int]])
       drm.blockify(fixedRdd, blockncol = op.A.ncol).asInstanceOf[BlockifiedDrmRdd[K]]
     } else {
-      srcA.toBlockifiedDrmRdd()
+      srcA.toBlockifiedDrmRdd(op.A.ncol)
     }
 
+    debug(s"A${op.op}$scalar: #parts=${aBlockRdd.partitions.size}.")
+
     val rdd = aBlockRdd
-        .map({
+        .map {
       case (keys, block) => keys -> reduceFunc(block, scalar)
-    })
+    }
 
-    new DrmRddInput[K](blockifiedSrc = Some(rdd))
+    rdd
   }
 }
 
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AinCoreB.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AinCoreB.scala
index c923e62d15..5f9f84ae75 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AinCoreB.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AinCoreB.scala
@@ -6,13 +6,17 @@ import scalabindings._
 import RLikeOps._
 import org.apache.mahout.sparkbindings._
 import org.apache.mahout.sparkbindings.drm._
+import org.apache.mahout.logging._
 import scala.reflect.ClassTag
 import org.apache.mahout.math.DiagonalMatrix
 import org.apache.mahout.math.drm.logical.OpTimesRightMatrix
 
+
 /** Matrix product with one of operands an in-core matrix */
 object AinCoreB {
 
+  private final implicit val log = getLog(AinCoreB.getClass)
+
   def rightMultiply[K: ClassTag](op: OpTimesRightMatrix[K], srcA: DrmRddInput[K]): DrmRddInput[K] = {
     if ( op.right.isInstanceOf[DiagonalMatrix])
       rightMultiply_diag(op, srcA)
@@ -21,23 +25,27 @@ object AinCoreB {
   }
 
   private def rightMultiply_diag[K: ClassTag](op: OpTimesRightMatrix[K], srcA: DrmRddInput[K]): DrmRddInput[K] = {
-    val rddA = srcA.toBlockifiedDrmRdd()
+    val rddA = srcA.toBlockifiedDrmRdd(op.A.ncol)
     implicit val ctx:DistributedContext = rddA.context
     val dg = drmBroadcast(op.right.viewDiagonal())
 
+    debug(s"operator A %*% inCoreB-diagonal. #parts=${rddA.partitions.size}.")
+
     val rdd = rddA
         // Just multiply the blocks
         .map {
       case (keys, blockA) => keys -> (blockA %*%: diagv(dg))
     }
-    new DrmRddInput(blockifiedSrc = Some(rdd))
+    rdd
   }
 
   private def rightMultiply_common[K: ClassTag](op: OpTimesRightMatrix[K], srcA: DrmRddInput[K]): DrmRddInput[K] = {
 
-    val rddA = srcA.toBlockifiedDrmRdd()
+    val rddA = srcA.toBlockifiedDrmRdd(op.A.ncol)
     implicit val sc:DistributedContext = rddA.sparkContext
 
+    debug(s"operator A %*% inCoreB. #parts=${rddA.partitions.size}.")
+
     val bcastB = drmBroadcast(m = op.right)
 
     val rdd = rddA
@@ -46,7 +54,7 @@ object AinCoreB {
       case (keys, blockA) => keys -> (blockA %*% bcastB)
     }
 
-    new DrmRddInput(blockifiedSrc = Some(rdd))
+    rdd
   }
 
 }
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/At.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/At.scala
index 56de9f47c0..5789bd2807 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/At.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/At.scala
@@ -17,16 +17,20 @@
 
 package org.apache.mahout.sparkbindings.blas
 
-import org.apache.mahout.sparkbindings.drm.DrmRddInput
+import org.apache.mahout.sparkbindings.drm._
 import org.apache.mahout.math.scalabindings._
+import org.apache.mahout.logging._
 import RLikeOps._
 import org.apache.spark.SparkContext._
 import org.apache.mahout.math.{DenseVector, Vector, SequentialAccessSparseVector}
 import org.apache.mahout.math.drm.logical.OpAt
 
+
 /** A' algorithms */
 object At {
 
+  private final implicit val log = getLog(At.getClass)
+
   def at(
       operator: OpAt,
       srcA: DrmRddInput[Int]): DrmRddInput[Int] = at_nograph(operator = operator, srcA = srcA)
@@ -39,10 +43,15 @@ object At {
    * groups into final rows of the transposed matrix.
    */
   private[blas] def at_nograph(operator: OpAt, srcA: DrmRddInput[Int]): DrmRddInput[Int] = {
-    val drmRdd = srcA.toBlockifiedDrmRdd()
+
+    debug("operator A'.")
+
+    val drmRdd = srcA.toBlockifiedDrmRdd(operator.A.ncol)
     val numPartitions = drmRdd.partitions.size
     val ncol = operator.ncol
 
+    debug(s"A' #parts = $numPartitions.")
+
     // Validity of this conversion must be checked at logical operator level.
     val nrow = operator.nrow.toInt
     val atRdd = drmRdd
@@ -70,7 +79,7 @@ object At {
         key -> v
     }).densify()
 
-    new DrmRddInput(rowWiseSrc = Some(ncol -> atRdd))
+    atRdd
   }
 
 }
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtA.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtA.scala
index be4f08c3f0..c2ffb7084e 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtA.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtA.scala
@@ -17,6 +17,7 @@
 
 package org.apache.mahout.sparkbindings.blas
 
+import org.apache.mahout.logging._
 import org.apache.mahout.math._
 import org.apache.mahout.sparkbindings._
 import org.apache.mahout.sparkbindings.drm._
@@ -34,25 +35,30 @@ import SparkEngine._
  */
 object AtA {
 
-  final val log = Logger.getLogger(AtA.getClass)
+  private final implicit val log = getLog(AtA.getClass)
 
   final val PROPERTY_ATA_MAXINMEMNCOL = "mahout.math.AtA.maxInMemNCol"
+  final val PROPERTY_ATA_MMUL_BLOCKHEIGHT = "mahout.math.AtA.blockHeight"
 
   /** Materialize A'A operator */
   def at_a(operator: OpAtA[_], srcRdd: DrmRddInput[_]): DrmRddInput[Int] = {
 
-    val maxInMemNCol = System.getProperty(PROPERTY_ATA_MAXINMEMNCOL, "2000").toInt
+    val maxInMemNCol = System.getProperty(PROPERTY_ATA_MAXINMEMNCOL, "200").toInt
     maxInMemNCol.ensuring(_ > 0, "Invalid A'A in-memory setting for optimizer")
 
     if (operator.ncol <= maxInMemNCol) {
+
       // If we can comfortably fit upper-triangular operator into a map memory, we will run slim
       // algorithm with upper-triangular accumulators in maps. 
-      val inCoreA = at_a_slim(srcRdd = srcRdd, operator = operator)
+      val inCoreA = at_a_slim(srcRdd = srcRdd.toDrmRdd(), operator = operator)
       val drmRdd = parallelizeInCore(inCoreA, numPartitions = 1)(sc = srcRdd.sparkContext)
-      new DrmRddInput(rowWiseSrc = Some(inCoreA.ncol, drmRdd))
+      drmRdd
+
     } else {
+
       // Otherwise, we need to run a distributed, big version
-      new DrmRddInput(rowWiseSrc = Some(operator.ncol, at_a_nongraph(srcRdd = srcRdd, op = operator)))
+      //      new DrmRddInput(rowWiseSrc = Some(operator.ncol, at_a_nongraph(srcRdd = srcRdd, op = operator)))
+      at_a_nongraph_mmul(srcRdd = srcRdd.toBlockifiedDrmRdd(operator.A.ncol), op = operator)
 
     }
   }
@@ -64,7 +70,7 @@ object AtA {
    */
   def at_a_slim(operator: OpAtA[_], srcRdd: DrmRdd[_]): Matrix = {
 
-    log.debug("Applying slim A'A.")
+    debug("operator slim A'A(Spark)")
 
     val ncol = operator.ncol
     // Compute backing vector of tiny-upper-triangular accumulator accross all the data.
@@ -73,66 +79,110 @@ object AtA {
       val ut = new UpperTriangular(ncol)
 
       // Strategy is to add to an outer product of each row to the upper triangular accumulator.
-      pIter.foreach({
-        case (k, v) =>
+      pIter.foreach({ case (k, v) =>
 
-          // Use slightly various traversal strategies over dense vs. sparse source.
-          if (v.isDense) {
+        // Use slightly various traversal strategies over dense vs. sparse source.
+        if (v.isDense) {
 
-            // Update upper-triangular pattern only (due to symmetry).
-            // Note: Scala for-comprehensions are said to be fairly inefficient this way, but this is
-            // such spectacular case they were deesigned for.. Yes I do observe some 20% difference
-            // compared to while loops with no other payload, but the other payload is usually much
-            // heavier than this overhead, so... I am keeping this as is for the time being.
+          // Update upper-triangular pattern only (due to symmetry).
+          // Note: Scala for-comprehensions are said to be fairly inefficient this way, but this is
+          // such spectacular case they were deesigned for.. Yes I do observe some 20% difference
+          // compared to while loops with no other payload, but the other payload is usually much
+          // heavier than this overhead, so... I am keeping this as is for the time being.
 
-            for (row <- 0 until v.length; col <- row until v.length)
-              ut(row, col) = ut(row, col) + v(row) * v(col)
+          for (row <- 0 until v.length; col <- row until v.length)
+            ut(row, col) = ut(row, col) + v(row) * v(col)
 
-          } else {
+        } else {
 
-            // Sparse source.
-            v.nonZeroes().view
+          // Sparse source.
+          v.nonZeroes().view
 
-                // Outer iterator iterates over rows of outer product.
-                .foreach(elrow => {
+            // Outer iterator iterates over rows of outer product.
+            .foreach(elrow => {
 
-              // Inner loop for columns of outer product.
-              v.nonZeroes().view
+            // Inner loop for columns of outer product.
+            v.nonZeroes().view
 
-                  // Filter out non-upper nonzero elements from the double loop.
-                  .filter(_.index >= elrow.index)
+              // Filter out non-upper nonzero elements from the double loop.
+              .filter(_.index >= elrow.index)
 
-                  // Incrementally update outer product value in the uppper triangular accumulator.
-                  .foreach(elcol => {
+              // Incrementally update outer product value in the uppper triangular accumulator.
+              .foreach(elcol => {
 
-                val row = elrow.index
-                val col = elcol.index
-                ut(row, col) = ut(row, col) + elrow.get() * elcol.get()
+              val row = elrow.index
+              val col = elcol.index
+              ut(row, col) = ut(row, col) + elrow.get() * elcol.get()
 
-              })
             })
+          })
 
-          }
+        }
       })
 
       Iterator(dvec(ddata = ut.getData): Vector)
-    })
-
-        .collect()
-        .reduce(_ += _)
+    }).collect().reduce(_ += _)
 
     new DenseSymmetricMatrix(resSym)
   }
 
+  // Version that tries to use groupBy. In practice this is the slowest.
+  def at_a_group(op: OpAtA[_], srcRdd: DrmRdd[_]): DrmRddInput[Int] = {
+    debug("operator non-slim A'A(Spark-group).")
+
+    // Determine how many partitions the new matrix would need approximately. We base that on
+    // geometry only, but it may eventually not be that adequate. Indeed, A'A tends to be much more
+    // dense in reality than the source.
+    val m = op.A.nrow
+    val n = op.A.ncol
+    val srcNumParts = srcRdd.partitions.size
+    val finalNumParts = (srcNumParts * n / m).ceil.toInt max 1
+    val numParts = finalNumParts max srcNumParts
+    val ranges = computeEvenSplits(n, numParts)
+
+    var rddAtA = srcRdd
+
+      // Remove key, key is irrelevant
+      .map(_._2)
+
+      // Form partial outer blocks for each partition
+      .flatMap { v =>
+      for (blockKey <- 0 until numParts) yield {
+        blockKey -> v
+      }
+    }
+      // Sent to individual partition reducer
+      .groupByKey(numPartitions = numParts)
+
+      // Reduce individual group
+      .map { case (blockKey, iter) =>
+      val range = ranges(blockKey)
+      val mxC: Matrix = new SparseRowMatrix(range.size, n, false)
+      iter.foreach(vec => addOuterProduct(mxC, vec(range), vec))
+
+      // Fix keys
+      val blockStart = range.start
+      val rowKeys = Array.tabulate(mxC.nrow)(blockStart + _)
+      rowKeys -> mxC
+    }
+
+    if (log.isDebugEnabled)
+      log.debug(s"AtA (grouping) #parts: ${rddAtA.partitions.size}.")
+
+    if (finalNumParts < numParts) rddAtA = rddAtA.coalesce(finalNumParts, shuffle = false)
+
+    rddAtA
+  }
+
+
   /** The version of A'A that does not use GraphX */
-  def at_a_nongraph(op: OpAtA[_], srcRdd: DrmRdd[_]): DrmRdd[Int] = {
+  def at_a_nongraph(op: OpAtA[_], srcRdd: DrmRdd[_]): DrmRddInput[Int] = {
 
-    log.debug("Applying non-slim non-graph A'A.")
+    debug("Applying non-slim non-graph A'A.")
 
     // Determine how many partitions the new matrix would need approximately. We base that on 
     // geometry only, but it may eventually not be that adequate. Indeed, A'A tends to be much more
     // dense in reality than the source.
-
     val m = op.A.nrow
     val n = op.A.ncol
 /* possible fix for index out of range for vector range
@@ -157,8 +207,8 @@ object AtA {
 
     val rddAtA = srcRdd
 
-        // Remove key, key is irrelevant
-        .map(_._2)
+      // Remove key, key is irrelevant
+      .map(_._2)
 
         // Form partial outer blocks for each partition
         .flatMap {
@@ -188,7 +238,7 @@ object AtA {
         rowKeys -> block
     }
 
-    new DrmRddInput[Int](blockifiedSrc = Some(rddAtA))
+    rddAtA
   }
 
 }
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtB.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtB.scala
index 86aadc86dc..8346072979 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtB.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtB.scala
@@ -17,8 +17,13 @@
 
 package org.apache.mahout.sparkbindings.blas
 
-import scala.reflect.ClassTag
-import org.apache.mahout.math.drm._
+import reflect.ClassTag
+import collection._
+import JavaConversions._
+
+import org.apache.mahout.logging._
+import org.apache.mahout.math._
+import drm._
 import org.apache.mahout.sparkbindings.drm._
 import org.apache.spark.rdd.RDD
 import org.apache.mahout.math.scalabindings._
@@ -27,92 +32,327 @@ import org.apache.spark.SparkContext._
 import org.apache.log4j.Logger
 import org.apache.mahout.math.drm.logical.OpAtB
 
+import scala.collection.mutable.ArrayBuffer
+
 object AtB {
 
-  private val log = Logger.getLogger(AtB.getClass)
+  private final implicit val log = getLog(AtB.getClass)
 
+  def atb[A: ClassTag](operator: OpAtB[A], srcA: DrmRddInput[A], srcB: DrmRddInput[A]): DrmRddInput[Int] = {
+    atb_nograph_mmul(operator, srcA, srcB, operator.A.partitioningTag == operator.B.partitioningTag)
+  }
   /**
    * The logic for computing A'B is pretty much map-side generation of partial outer product blocks
    * over co-grouped rows of A and B. If A and B are identically partitioned, we can just directly
    * zip all the rows. Otherwise, we need to inner-join them first.
+   *
+   * @deprecated
    */
-  def atb_nograph[A: ClassTag](
-      operator: OpAtB[A],
-      srcA: DrmRddInput[A],
-      srcB: DrmRddInput[A],
-      zippable:Boolean = false
-      ): DrmRddInput[Int] = {
+  @deprecated
+  def atb_nograph[A: ClassTag](operator: OpAtB[A], srcA: DrmRddInput[A], srcB: DrmRddInput[A],
+                               zippable: Boolean = false): DrmRddInput[Int] = {
 
     val rddA = srcA.toDrmRdd()
-    val zipped = if ( zippable ) {
+    val rddB = srcB.toDrmRdd()
+
+
+    val prodNCol = operator.ncol
+    val prodNRow = operator.nrow
+    val aNRow = operator.A.nrow
+
+    // Approximate number of final partitions. We take bigger partitions as our guide to number of
+    // elements per partition. TODO: do it better.
+    // Elements per partition, bigger of two operands.
+    val epp = aNRow.toDouble * prodNRow / rddA.partitions.size max aNRow.toDouble * prodNCol /
+      rddB.partitions.size
+
+    // Number of partitions we want to converge to in the product. For now we simply extrapolate that
+    // assuming product density and operand densities being about the same; and using the same element
+    // per partition number in the product as the bigger of two operands.
+    val numProductPartitions = (prodNCol.toDouble * prodNRow / epp).ceil.toInt
+
+    if (log.isDebugEnabled) log.debug(s"AtB: #parts ${numProductPartitions} for $prodNRow x $prodNCol geometry.")
+
+    val zipped = if (zippable) {
 
       log.debug("A and B for A'B are identically distributed, performing row-wise zip.")
 
-      rddA.zip(other = srcB.toDrmRdd())
+      rddA.zip(other = rddB)
 
     } else {
 
       log.debug("A and B for A'B are not identically partitioned, performing inner join.")
 
-      rddA.join(other=srcB.toDrmRdd()).map({
-        case (key,(v1,v2) ) => (key -> v1) -> (key -> v2)
+      rddA.join(other = rddB, numPartitions = numProductPartitions).map({ case (key, (v1,
+      v2)) => (key -> v1) -> (key -> v2)
       })
     }
 
-    val blockHeight = safeToNonNegInt(
-      (operator.B.ncol.toDouble/rddA.partitions.size).ceil.round max 1L
-    )
-
-    computeAtBZipped(
-      zipped,
-      nrow = operator.nrow,
-      ancol = operator.A.ncol,
-      bncol = operator.B.ncol,
-      blockHeight = blockHeight
-    )
+    computeAtBZipped2(zipped, nrow = operator.nrow, ancol = operator.A.ncol, bncol = operator.B.ncol,
+      numPartitions = numProductPartitions)
   }
 
+  private[sparkbindings] def atb_nograph_mmul[A:ClassTag](operator:OpAtB[A], srcA: DrmRddInput[A], srcB:DrmRddInput[A], zippable:Boolean = false):DrmRddInput[Int] = {
+
+    debug("operator mmul-A'B(Spark)")
+
+    val prodNCol = operator.ncol
+    val prodNRow = safeToNonNegInt(operator.nrow)
+    val aNRow = safeToNonNegInt(operator.A.nrow)
+
+    val rddA = srcA.toDrmRdd()
+    val rddB = srcB.toDrmRdd()
+
+    // Approximate number of final partitions. We take bigger partitions as our guide to number of
+    // elements per partition. TODO: do it better.
+    // Elements per partition, bigger of two operands.
+    val epp = aNRow.toDouble * prodNRow / rddA.partitions.size max aNRow.toDouble * prodNCol /
+      rddB.partitions.size
+
+    // Number of partitions we want to converge to in the product. For now we simply extrapolate that
+    // assuming product density and operand densities being about the same; and using the same element
+    // per partition number in the product as the bigger of two operands.
+    val numProductPartitions = (prodNCol.toDouble * prodNRow / epp).ceil.toInt
+
+    if (log.isDebugEnabled) log.debug(s"AtB mmul: #parts ${numProductPartitions} for $prodNRow x $prodNCol geometry.")
+
+    val zipped = if (zippable) {
+
+      debug("mmul-A'B - zip: are identically distributed, performing row-wise zip.")
+
+      val blockdRddA = srcA.toBlockifiedDrmRdd(operator.A.ncol)
+      val blockdRddB = srcB.toBlockifiedDrmRdd(operator.B.ncol)
+
+      blockdRddA
+
+        // Zip
+        .zip(other = blockdRddB)
+
+        // Throw away the keys
+        .map { case ((_, blockA), (_, blockB)) => blockA -> blockB}
+
+    } else {
+
+      debug("mmul-A'B: cogroup for non-identically distributed stuff.")
+
+      // To take same route, we'll join stuff row-wise, blockify it here and then proceed with the
+      // same computation path. Although it is possible we could shave off one shuffle here. TBD.
+
+      rddA
+
+        // Do full join. We can't get away with partial join because it is going to lose some rows
+        // in case we have missing rows on either side.
+        .cogroup(other = rddB, numPartitions = rddA.partitions.size max rddB.partitions.size )
+
+
+        // Merge groups.
+        .mapPartitions{iter =>
+
+        val aRows = new ArrayBuffer[Vector](1000)
+        val bRows = new ArrayBuffer[Vector](1000)
+
+        // Populate hanging row buffs
+        iter.foreach{case (_, (arowbag,browbag)) =>
+
+          // Some up all vectors, if any, for a row. If we have > 1 that means original matrix had
+          // non-uniquely keyed rows which is generally a matrix format inconsistency (should not
+          // happen).
+          aRows += (if (arowbag.isEmpty)
+            new SequentialAccessSparseVector(prodNRow)
+          else arowbag.reduce(_ += _))
+
+          bRows += (if (browbag.isEmpty)
+            new SequentialAccessSparseVector(prodNCol)
+          else browbag.reduce(_ += _))
+        }
+
+        // Transform collection of vectors into blocks.
+        val blockNRow = aRows.size
+        assert(blockNRow == bRows.size)
+
+        val aBlock:Matrix = new SparseRowMatrix(blockNRow, prodNRow, aRows.toArray)
+        val bBlock:Matrix = new SparseRowMatrix(blockNRow, prodNCol, bRows.toArray)
+
+        // Form pairwise result
+        Iterator(aBlock -> bBlock)
+      }
+    }
+
+    computeAtBZipped3(pairwiseRdd = zipped, nrow = prodNRow, ancol = prodNRow, bncol = aNRow,
+      numPartitions = numProductPartitions)
+
+  }
+  /**
+   * Compute, combine and accumulate outer products for every key. The incoming tuple structure
+   * is (partNo, (vecA, vecB)), so for every `partNo` we compute an outer product of the form {{{
+   *   vecA cross vecB
+   * }}}
+   * @param pairwiseRdd
+   * @return
+   */
+  @deprecated
+  private[sparkbindings] def combineOuterProducts(pairwiseRdd: RDD[(Int, (Vector, Vector))], numPartitions: Int) = {
+
+    pairwiseRdd
+
+      // Reduce individual partitions
+      .combineByKey(createCombiner = (t: (Vector, Vector)) => {
+
+      val vecA = t._1
+      val vecB = t._2
 
-//  private[sparkbindings] def atb_nograph()
+      // Create partition accumulator. Generally, summation of outer products probably calls for
+      // dense accumulators. However, let's assume extremely sparse cases are still possible, and
+      // by default assume any sparse case is an extremely sparse case. May need to tweak further.
+      val mxC: Matrix = if (!vecA.isDense && !vecB.isDense)
+        new SparseRowMatrix(vecA.length, vecB.length)
+      else
+        new DenseMatrix(vecA.length, vecB.length)
+
+      // Add outer product of arow and bRowFrag to mxC
+      addOuterProduct(mxC, vecA, vecB)
+
+    }, mergeValue = (mxC: Matrix, t: (Vector, Vector)) => {
+      // Merge of a combiner with another outer product fragment.
+      val vecA = t._1
+      val vecB = t._2
+
+      addOuterProduct(mxC, vecA, vecB)
+
+    }, mergeCombiners = (mxC1: Matrix, mxC2: Matrix) => {
+
+      // Merge of 2 combiners.
+      mxC1 += mxC2
+
+    }, numPartitions = numPartitions)
+  }
+
+  private[sparkbindings] def computeAtBZipped3[A: ClassTag](pairwiseRdd: RDD[(Matrix, Matrix)], nrow: Int,
+                                                            ancol: Int, bncol: Int, numPartitions: Int) = {
+
+    val ranges = computeEvenSplits(nrow, numPartitions)
+
+    val rdd = pairwiseRdd.flatMap{ case (blockA, blockB) ⇒
+
+      // Output each partial outer product with its correspondent partition #.
+        Iterator.tabulate(numPartitions) {part ⇒
+
+          val mBlock = blockA(::, ranges(part)).t %*% blockB
+
+          part → mBlock
+        }
+    }
+
+      // Reduce.
+      .reduceByKey(_ += _, numPartitions = numPartitions)
+
+      // Produce keys
+      .map { case (blockKey, block) ⇒ ranges(blockKey).toArray → block }
+
+    debug(s"A'B mmul #parts: ${rdd.partitions.size}.")
+
+    rdd
+  }
+
+  private[sparkbindings] def computeAtBZipped2[A: ClassTag](zipped: RDD[(DrmTuple[A], DrmTuple[A])], nrow: Long,
+                                                            ancol: Int, bncol: Int, numPartitions: Int) = {
+
+    // The plan of this approach is to send a_i and parts of b_i to partitoin reducers which actually
+    // do outer product sum update locally (instead of sending outer blocks). Thus it should minimize
+    // expense for IO and also in-place partition block accum update should be much more efficient
+    // than forming outer block matrices and perform matrix-on-patrix +.
+    // Figure out appriximately block height per partition of the result.
+    val blockHeight = safeToNonNegInt((nrow - 1) / numPartitions) + 1
+
+    val partitionedRdd = zipped
+
+      // Split B-rows into partitions using blockHeight
+      .mapPartitions { iter =>
+
+      val offsets = (0 until numPartitions).map(_ * blockHeight)
+      val ranges = offsets.map(offs => offs until (offs + blockHeight min ancol))
+
+      // Transform into series of (part -> (arow, part-brow)) tuples (keyed by part #).
+      iter.flatMap { case ((_, arow), (_, brow)) =>
+
+        ranges.view.zipWithIndex.map { case (arange, partNum) =>
+          partNum -> (arow(arange).cloned -> brow)
+        }
+      }
+    }
+
+    val blockRdd = combineOuterProducts(partitionedRdd, numPartitions)
+
+      // Add ordinal row keys.
+      .map { case (blockNum, block) =>
+
+      // Starting key
+      var offset = blockNum * blockHeight
+
+      var keys = Array.tabulate(block.nrow)(offset + _)
+      keys -> block
+
+    }
+
+    blockRdd
+  }
 
   /** Given already zipped, joined rdd of rows of A' and B, compute their product A'B */
-  private[sparkbindings] def computeAtBZipped[A: ClassTag](zipped:RDD[(DrmTuple[A], DrmTuple[A])],
-      nrow:Long, ancol:Int, bncol:Int, blockHeight: Int) = {
+  @deprecated
+  private[sparkbindings] def computeAtBZipped[A: ClassTag](zipped: RDD[(DrmTuple[A], DrmTuple[A])], nrow: Long,
+                                                           ancol: Int, bncol: Int, numPartitions: Int) = {
 
     // Since Q and A are partitioned same way,we can just zip their rows and proceed from there by
     // forming outer products. Our optimizer lacks this primitive, so we will implement it using RDDs
     // directly. We try to compile B' = A'Q now by collecting outer products of rows of A and Q. At
     // this point we need to split n-range  of B' into sutiable number of partitions.
 
-    val btNumParts = safeToNonNegInt((nrow - 1) / blockHeight + 1)
+    if (log.isDebugEnabled) {
+      log.debug(s"AtBZipped:zipped #parts ${zipped.partitions.size}")
+      log.debug(s"AtBZipped:Targeted #parts ${numPartitions}")
+    }
+
+    // Figure out appriximately block height per partition of the result.
+    val blockHeight = safeToNonNegInt((nrow - 1) / numPartitions) + 1
 
     val rddBt = zipped
 
-        // Produce outer product blocks
-        .flatMap {
-      case ((aKey, aRow), (qKey, qRow)) =>
-        for (blockKey <- Stream.range(0, btNumParts)) yield {
-          val blockStart = blockKey * blockHeight
-          val blockEnd = ancol min (blockStart + blockHeight)
+      // Produce outer product blocks
+      .flatMap { case ((aKey, aRow), (qKey, qRow)) =>
+      for (blockKey <- Stream.range(0, numPartitions)) yield {
+        val blockStart = blockKey * blockHeight
+        val blockEnd = ancol min (blockStart + blockHeight)
 
-          // Create block by cross product of proper slice of aRow and qRow
-          blockKey -> (aRow(blockStart until blockEnd) cross qRow)
-        }
-    }
-        // Combine blocks by just summing them up
-        .reduceByKey {
-      case (block1, block2) => block1 += block2
+        // Create block by cross product of proper slice of aRow and qRow
+        blockKey -> (aRow(blockStart until blockEnd) cross qRow)
+
+        // TODO: computing tons of cross product matrices seems to be pretty inefficient here. More
+        // likely single streaming algorithm of updates will perform much better here. So rewrite
+        // this using mapPartitions with numPartitions block accumulators.
+
+      }
     }
+      //      .combineByKey(
+      //        createCombiner = (mx:Matrix) => mx,
+      //        mergeValue = (c:Matrix,mx:Matrix) => c += mx,
+      //        mergeCombiners = (c1:Matrix,c2:Matrix) => c1 += c2,
+      //        numPartitions = numPartitions
+      //      )
+      // Doesn't look like use of combineByKey produces any better results than reduceByKey. So keeping
+      // reduceByKey for simplicity. Combiners probably doesn't mean reduceByKey doesn't combine map-side.
+      // Combine blocks by just summing them up
+      .reduceByKey((block1, block2) => block1 += block2, numPartitions)
 
-        // Throw away block key, generate row keys instead.
-        .map {
-      case (blockKey, block) =>
-        val blockStart = blockKey * blockHeight
-        val rowKeys = Array.tabulate(block.nrow)(blockStart + _)
-        rowKeys -> block
+      // Throw away block key, generate row keys instead.
+      .map { case (blockKey, block) =>
+      val blockStart = blockKey * blockHeight
+      val rowKeys = Array.tabulate(block.nrow)(blockStart + _)
+      rowKeys -> block
     }
 
-    new DrmRddInput[Int](blockifiedSrc = Some(rddBt))
+    if (log.isDebugEnabled) log.debug(s"AtBZipped #parts ${rddBt.partitions.size}")
+
+    rddBt
   }
 
 }
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Ax.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Ax.scala
index 94c3f068f6..629accdb3b 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Ax.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Ax.scala
@@ -15,22 +15,22 @@ object Ax {
 
   def ax_with_broadcast[K: ClassTag](op: OpAx[K], srcA: DrmRddInput[K]): DrmRddInput[K] = {
 
-    val rddA = srcA.toBlockifiedDrmRdd()
-    implicit val sc:DistributedContext = rddA.sparkContext
+    val rddA = srcA.toBlockifiedDrmRdd(op.A.ncol)
+    implicit val sc: DistributedContext = rddA.sparkContext
 
     val bcastX = drmBroadcast(op.x)
 
-    val rdd = rddA
-        // Just multiply the blocks
-        .map({
-      case (keys, blockA) => keys -> (blockA %*% bcastX).toColMatrix
-    })
+    val rdd: BlockifiedDrmRdd[K] = rddA
+
+      // Just multiply the blocks
+      .map { case (keys, blockA) ⇒ keys → (blockA %*% bcastX).toColMatrix }
 
-    new DrmRddInput(blockifiedSrc = Some(rdd))
+    new DrmRddInput(Right(rdd))
   }
 
   def atx_with_broadcast(op: OpAtx, srcA: DrmRddInput[Int]): DrmRddInput[Int] = {
-    val rddA = srcA.toBlockifiedDrmRdd()
+
+    val rddA = srcA.toBlockifiedDrmRdd(op.A.ncol)
     implicit val dc:DistributedContext = rddA.sparkContext
 
     val bcastX = drmBroadcast(op.x)
@@ -52,10 +52,10 @@ object Ax {
     // It is ridiculous, but in this scheme we will have to re-parallelize it again in order to plug
     // it back as drm blockified rdd
 
-    val rdd = dc.parallelize(Seq(inCoreM), numSlices = 1)
-        .map(block => Array.tabulate(block.nrow)(i => i) -> block)
+    val rdd:BlockifiedDrmRdd[Int] = dc.parallelize(Seq(inCoreM), numSlices = 1)
+        .map{block ⇒ Array.tabulate(block.nrow)(i ⇒ i) -> block}
 
-    new DrmRddInput(blockifiedSrc = Some(rdd))
+    rdd
 
   }
 
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/CbindAB.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/CbindAB.scala
index ea10ccbbf3..4a379ec13a 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/CbindAB.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/CbindAB.scala
@@ -18,12 +18,13 @@
 package org.apache.mahout.sparkbindings.blas
 
 import org.apache.log4j.Logger
-import scala.reflect.ClassTag
+import org.apache.mahout.sparkbindings.DrmRdd
+import reflect._
 import org.apache.mahout.sparkbindings.drm.DrmRddInput
 import org.apache.mahout.math._
 import scalabindings._
 import RLikeOps._
-import org.apache.mahout.math.drm.logical.OpCbind
+import org.apache.mahout.math.drm.logical.{OpCbindScalar, OpCbind}
 import org.apache.spark.SparkContext._
 
 /** Physical cbind */
@@ -31,6 +32,34 @@ object CbindAB {
 
   private val log = Logger.getLogger(CbindAB.getClass)
 
+  def cbindAScalar[K:ClassTag](op: OpCbindScalar[K], srcA:DrmRddInput[K]) : DrmRddInput[K] = {
+    val srcRdd = srcA.toDrmRdd()
+
+    val ncol = op.A.ncol
+    val x = op.x
+
+    val fixedRdd = if (classTag[K] == ClassTag.Int && x != 0.0)
+      fixIntConsistency(op.asInstanceOf[OpCbindScalar[Int]],
+        src = srcRdd.asInstanceOf[DrmRdd[Int]]).asInstanceOf[DrmRdd[K]]
+    else srcRdd
+
+    val left = op.leftBind
+
+    val resultRdd = fixedRdd.map { case (key, vec) =>
+      val newVec = vec.like(ncol + 1)
+      if (left) {
+        newVec(1 to ncol) := vec
+        newVec(0) = x
+      } else {
+        newVec(0 until ncol) := vec
+        newVec(ncol) = x
+      }
+      key -> newVec
+    }
+
+    resultRdd
+  }
+
   def cbindAB_nograph[K: ClassTag](op: OpCbind[K], srcA: DrmRddInput[K], srcB: DrmRddInput[K]): DrmRddInput[K] = {
 
     val a = srcA.toDrmRdd()
@@ -88,7 +117,7 @@ object CbindAB {
       }
     }
 
-    new DrmRddInput(rowWiseSrc = Some(op.ncol -> rdd))
+    rdd
 
   }
 
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/DrmRddOps.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/DrmRddOps.scala
index a3caac7bbf..4cd9a747b1 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/DrmRddOps.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/DrmRddOps.scala
@@ -25,12 +25,14 @@ import org.apache.mahout.sparkbindings.DrmRdd
 
 class DrmRddOps[K: ClassTag](private[blas] val rdd: DrmRdd[K]) {
 
+  /** Turn RDD into dense row-wise vectors if density threshold is exceeded. */
   def densify(threshold: Double = 0.80): DrmRdd[K] = rdd.map({
     case (key, v) =>
       val vd = if (!v.isDense && v.getNumNonZeroElements > threshold * v.length) new DenseVector(v) else v
       key -> vd
   })
 
+  /** Turn rdd into sparse RDD if density threshold is underrun. */
   def sparsify(threshold: Double = 0.80): DrmRdd[K] = rdd.map({
     case (key, v) =>
       val vs = if (v.isDense() && v.getNumNonZeroElements <= threshold * v.length)
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/MapBlock.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/MapBlock.scala
index 4c68c9af3b..2933ddcfdd 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/MapBlock.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/MapBlock.scala
@@ -17,6 +17,7 @@
 
 package org.apache.mahout.sparkbindings.blas
 
+import org.apache.mahout.math.drm.logical.OpMapBlock
 import org.apache.mahout.sparkbindings.drm.DrmRddInput
 import org.apache.mahout.math.drm.BlockMapFunc
 import org.apache.mahout.math.scalabindings.RLikeOps._
@@ -24,12 +25,13 @@ import scala.reflect.ClassTag
 
 object MapBlock {
 
-  def exec[S, R:ClassTag](src: DrmRddInput[S], ncol:Int, bmf:BlockMapFunc[S,R]): DrmRddInput[R] = {
+  def exec[S, R:ClassTag](src: DrmRddInput[S], operator:OpMapBlock[S,R]): DrmRddInput[R] = {
 
-    // We can't use attributes to avoid putting the whole this into closure.
-
-    val rdd = src.toBlockifiedDrmRdd()
-        .map(blockTuple => {
+    // We can't use attributes directly in the closure in order to avoid putting the whole object
+    // into closure.
+    val bmf = operator.bmf
+    val ncol = operator.ncol
+    val rdd = src.toBlockifiedDrmRdd(operator.A.ncol).map(blockTuple => {
       val out = bmf(blockTuple)
 
       assert(out._2.nrow == blockTuple._2.nrow, "block mapping must return same number of rows.")
@@ -37,7 +39,8 @@ object MapBlock {
 
       out
     })
-    new DrmRddInput(blockifiedSrc = Some(rdd))
+
+    rdd
   }
 
 }
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Par.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Par.scala
index e73376d722..0434a72926 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Par.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Par.scala
@@ -1,50 +1,58 @@
 package org.apache.mahout.sparkbindings.blas
 
+import org.apache.mahout.sparkbindings.drm
+
 import scala.reflect.ClassTag
 import org.apache.mahout.sparkbindings.drm.DrmRddInput
 import org.apache.mahout.math.drm.logical.OpPar
 import org.apache.spark.rdd.RDD
+import scala.math._
+
+import org.apache.mahout.logging._
 
 /** Physical adjustment of parallelism */
 object Par {
 
+  private final implicit val log = getLog(Par.getClass)
+
   def exec[K: ClassTag](op: OpPar[K], src: DrmRddInput[K]): DrmRddInput[K] = {
 
-    def adjust[T](rdd: RDD[T]): RDD[T] =
-      if (op.minSplits > 0) {
-        if (rdd.partitions.size < op.minSplits)
-          rdd.coalesce(op.minSplits, shuffle = true)
-        else rdd.coalesce(rdd.partitions.size)
-      } else if (op.exactSplits > 0) {
-        if (op.exactSplits < rdd.partitions.size)
-          rdd.coalesce(numPartitions = op.exactSplits, shuffle = false)
-        else if (op.exactSplits > rdd.partitions.size)
-          rdd.coalesce(numPartitions = op.exactSplits, shuffle = true)
-        else
-          rdd.coalesce(rdd.partitions.size)
-      } else if (op.exactSplits == -1 && op.minSplits == -1) {
-
-        // auto adjustment, try to scale up to either x1Size or x2Size.
-        val clusterSize = rdd.context.getConf.get("spark.default.parallelism", "1").toInt
-
-        val x1Size = (clusterSize * .95).ceil.toInt
-        val x2Size = (clusterSize * 1.9).ceil.toInt
-
-        if (rdd.partitions.size <= x1Size)
-          rdd.coalesce(numPartitions = x1Size, shuffle = true)
-        else if (rdd.partitions.size <= x2Size)
-          rdd.coalesce(numPartitions = x2Size, shuffle = true)
-        else
-          rdd.coalesce(numPartitions = rdd.partitions.size)
-      } else rdd.coalesce(rdd.partitions.size)
-
-    if (src.isBlockified) {
-      val rdd = src.toBlockifiedDrmRdd()
-      new DrmRddInput[K](blockifiedSrc = Some(adjust(rdd)))
+    val srcBlockified = src.isBlockified
+
+    val srcRdd = if (srcBlockified) src.toBlockifiedDrmRdd(op.ncol) else src.toDrmRdd()
+    val srcNParts = srcRdd.partitions.size
+
+    // To what size?
+    val targetParts = if (op.minSplits > 0) srcNParts max op.minSplits
+    else if (op.exactSplits > 0) op.exactSplits
+    else /* auto adjustment */ {
+      val stdParallelism = srcRdd.context.getConf.get("spark.default.parallelism", "1").toInt
+      val x1 = 0.95 * stdParallelism
+      if (srcNParts <= ceil(x1)) ceil(x1).toInt else ceil(2 * x1).toInt
+    }
+
+    debug(s"par $srcNParts => $targetParts.")
+
+    if (targetParts > srcNParts) {
+
+      // Expanding. Always requires deblockified stuff. May require re-shuffling.
+      val rdd = src.toDrmRdd().repartition(numPartitions = targetParts)
+
+      rdd
+
+    } else if (targetParts < srcNParts) {
+      // Shrinking.
+
+      if (srcBlockified) {
+        drm.rbind(src.toBlockifiedDrmRdd(op.ncol).coalesce(numPartitions = targetParts))
+      } else {
+        src.toDrmRdd().coalesce(numPartitions = targetParts)
+      }
     } else {
-      val rdd = src.toDrmRdd()
-      new DrmRddInput[K](rowWiseSrc = Some(op.ncol -> adjust(rdd)))
+      // no adjustment required.
+      src
     }
+
   }
 
 }
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/RbindAB.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/RbindAB.scala
index 5037d687e9..62abba6ef5 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/RbindAB.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/RbindAB.scala
@@ -31,11 +31,11 @@ object RbindAB {
 
     // If any of the inputs is blockified, use blockified inputs
     if (srcA.isBlockified || srcB.isBlockified) {
-      val a = srcA.toBlockifiedDrmRdd()
-      val b = srcB.toBlockifiedDrmRdd()
+      val a = srcA.toBlockifiedDrmRdd(op.A.ncol)
+      val b = srcB.toBlockifiedDrmRdd(op.B.ncol)
 
       // Union seems to be fine, it is indeed just do partition-level unionization, no shuffles
-      new DrmRddInput(blockifiedSrc = Some(a ++ b))
+      a ++ b
 
     } else {
 
@@ -43,7 +43,7 @@ object RbindAB {
       val a = srcA.toDrmRdd()
       val b = srcB.toDrmRdd()
 
-      new DrmRddInput(rowWiseSrc = Some(op.ncol -> (a ++ b)))
+      a ++ b
     }
   }
 }
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Slicing.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Slicing.scala
index d0a50b59cc..0284ba2875 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Slicing.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/Slicing.scala
@@ -22,6 +22,6 @@ object Slicing {
 
     // TODO: we probably need to re-shuffle result or at least cut down the partitions of 0 size
 
-    new DrmRddInput(rowWiseSrc = Some(ncol -> rdd))
+    rdd
   }
 }
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/package.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/package.scala
index 9a50afaf8c..6b8513f6f1 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/package.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/package.scala
@@ -17,13 +17,17 @@
 
 package org.apache.mahout.sparkbindings
 
+import org.apache.mahout.sparkbindings
+import org.apache.spark.rdd.RDD
+
 import scala.reflect.ClassTag
-import org.apache.mahout.sparkbindings.drm.{CheckpointedDrmSpark, DrmRddInput}
 import org.apache.spark.SparkContext._
 import org.apache.mahout.math._
 import org.apache.mahout.math.drm._
 import scalabindings._
 import RLikeOps._
+import collection._
+import JavaConversions._
 
 /**
  * This validation contains distributed algorithms that distributed matrix expression optimizer picks
@@ -31,8 +35,81 @@ import RLikeOps._
  */
 package object blas {
 
-  implicit def drmRdd2ops[K:ClassTag](rdd:DrmRdd[K]):DrmRddOps[K] = new DrmRddOps[K](rdd)
+  implicit def drmRdd2ops[K: ClassTag](rdd: DrmRdd[K]): DrmRddOps[K] = new DrmRddOps[K](rdd)
+
+
+  /**
+   * Rekey matrix dataset keys to consequtive int keys.
+   * @param rdd incoming matrix row-wise dataset
+   *
+   * @param computeMap if true, also compute mapping between old and new keys
+   * @tparam K existing key parameter
+   * @return
+   */
+  private[mahout] def rekeySeqInts[K: ClassTag](rdd: DrmRdd[K], computeMap: Boolean = true): (DrmRdd[Int],
+    Option[RDD[(K, Int)]]) = {
+
+    // Spark context please.
+    val sctx = rdd.context
+    import sctx._
+
+    // First, compute partition sizes.
+    val partSizes = rdd.mapPartitionsWithIndex((part, iter) => Iterator(part -> iter.size))
+
+      // Collect in-core
+      .collect()
+
+    // Starting indices
+    var startInd = new Array[Int](rdd.partitions.size)
+
+    // Save counts
+    for (pc <- partSizes) startInd(pc._1) = pc._2
+
+    // compute cumulative sum
+    val siBcast = broadcast(startInd.scanLeft(0)(_ + _).init)
+
+    // Compute key -> int index map:
+    val keyMap = if (computeMap) {
+      Some(rdd
+
+        // Process individual partition with index, output `key -> index` tuple
+        .mapPartitionsWithIndex((part, iter) => {
+
+        // Start index for this partition
+        val si = siBcast.value(part)
+        iter.zipWithIndex.map { case ((key, _), index) => key -> (index + si)}
+      })) // Some
+
+    } else {
+
+      // Were not asked to compute key mapping
+      None
+    }
+
+    // Finally, do the transform
+    val intRdd = rdd
+
+      // Re-number each partition
+      .mapPartitionsWithIndex((part, iter) => {
 
+      // Start index for this partition
+      val si = siBcast.value(part)
+
+      // Iterate over data by producing sequential row index and retaining vector value.
+      iter.zipWithIndex.map { case ((_, vec), ind) => si + ind -> vec}
+    })
+
+    // Finally, return drm -> keymap result
+
+    intRdd -> keyMap
+
+  }
+
+
+  /**
+   * Fills in missing rows in an Int-indexed matrix by putting in empty row vectors for the missing
+   * keys.
+   */
   private[mahout] def fixIntConsistency(op: DrmLike[Int], src: DrmRdd[Int]): DrmRdd[Int] = {
 
     if (op.canHaveMissingRows) {
@@ -45,20 +122,20 @@ package object blas {
       // Compute the fix.
       sc
 
-          // Bootstrap full key set
-          .parallelize(0 until dueRows, numSlices = rdd.partitions.size max 1)
+        // Bootstrap full key set
+        .parallelize(0 until dueRows, numSlices = rdd.partitions.size max 1)
 
-          // Enable PairedFunctions
-          .map(_ -> Unit)
+        // Enable PairedFunctions
+        .map(_ -> Unit)
 
-          // Cogroup with all rows
-          .cogroup(other = rdd)
+        // Cogroup with all rows
+        .cogroup(other = rdd)
 
-          // Filter out out-of-bounds
-          .filter { case (key, _) => key >= 0 && key < dueRows}
+        // Filter out out-of-bounds
+        .filter { case (key, _) => key >= 0 && key < dueRows}
 
-          // Coalesce and output RHS
-          .map { case (key, (seqUnit, seqVec)) =>
+        // Coalesce and output RHS
+        .map { case (key, (seqUnit, seqVec)) =>
         val acc = seqVec.headOption.getOrElse(new SequentialAccessSparseVector(dueCols))
         val vec = if (seqVec.size > 0) (acc /: seqVec.tail)(_ + _) else acc
         key -> vec
@@ -68,4 +145,77 @@ package object blas {
 
   }
 
+  /** Method to do `mxC += a cross b` in-plcae a bit more efficiently than this expression does. */
+  def addOuterProduct(mxC: Matrix, a: Vector, b: Vector): Matrix = {
+
+    // Try to pay attention to density a bit here when computing and adding the outer product of
+    // arow and brow fragment.
+    if (b.isDense)
+      for (ela <- a.nonZeroes) mxC(ela.index, ::) := { (i, x) => x + ela * b(i)}
+    else
+      for (ela <- a.nonZeroes; elb <- b.nonZeroes()) mxC(ela.index, elb.index) += ela * elb
+
+    mxC
+  }
+
+  /**
+   * Compute ranges of more or less even splits of total `nrow` number
+   *
+   * @param nrow
+   * @param numSplits
+   * @return
+   */
+  @inline
+  private[blas] def computeEvenSplits(nrow: Long, numSplits: Int): IndexedSeq[Range] = {
+    require(numSplits <= nrow, "Requested amount of splits greater than number of data points.")
+    require(nrow >= 1)
+    require(numSplits >= 1)
+
+    // Base split -- what is our base split size?
+    val baseSplit = safeToNonNegInt(nrow / numSplits)
+
+    // Slack -- how many splits will have to be incremented by 1 though?
+    val slack = safeToNonNegInt(nrow % numSplits)
+
+    // Compute ranges. We need to set ranges so that numSplits - slack splits have size of baseSplit;
+    // and `slack` splits have size baseSplit + 1. Here is how we do it: First, we compute the range
+    // offsets:
+    val offsets = (0 to numSplits).map(i => i * (baseSplit + 1) - (0 max i - slack))
+    // And then we connect the ranges using gaps between offsets:
+    offsets.sliding(2).map(offs => offs(0) until offs(1)).toIndexedSeq
+  }
+
+  /**
+   * Estimate number of partitions for the product of A %*% B.
+   *
+   * We take average per-partition element count of product as higher of the same of A and B. (prefer
+   * larger partitions of operands).
+   *
+   * @param anrow A.nrow
+   * @param ancol A.ncol
+   * @param bncol B.ncol
+   * @param aparts partitions in A
+   * @param bparts partitions in B
+   * @return recommended partitions
+   */
+  private[blas] def estimateProductPartitions(anrow:Long, ancol:Long, bncol:Long, aparts:Int, bparts:Int):Int = {
+
+    // Compute per-partition element density in A
+    val eppA = anrow.toDouble * ancol/ aparts
+
+    // Compute per-partition element density in B
+    val eppB = ancol.toDouble * bncol / bparts
+
+    // Take the maximum element density into account. Is it a good enough?
+    val epp = eppA max eppB
+
+    // product partitions
+    val prodParts = anrow * bncol / epp
+
+    val nparts = math.round(prodParts).toInt max 1
+
+    // Constrain nparts to maximum of anrow to prevent guaranteed empty partitions.
+    if (nparts > anrow) anrow.toInt else nparts
+  }
+
 }
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/CheckpointedDrmSpark.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/CheckpointedDrmSpark.scala
index e5a2b2a3ad..41efc2726c 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/CheckpointedDrmSpark.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/CheckpointedDrmSpark.scala
@@ -33,7 +33,7 @@ import org.apache.spark.SparkContext._
 
 /** ==Spark-specific optimizer-checkpointed DRM.==
   *
-  * @param rdd underlying rdd to wrap over.
+  * @param rddInput underlying rdd to wrap over.
   * @param _nrow number of rows; if unspecified, we will compute with an inexpensive traversal.
   * @param _ncol number of columns; if unspecified, we will try to guess with an inexpensive traversal.
   * @param _cacheStorageLevel storage level
@@ -44,9 +44,9 @@ import org.apache.spark.SparkContext._
   * @tparam K matrix key type (e.g. the keys of sequence files once persisted)
   */
 class CheckpointedDrmSpark[K: ClassTag](
-    val rdd: DrmRdd[K],
-    private var _nrow: Long = -1L,
-    private var _ncol: Int = -1,
+    private[sparkbindings] val rddInput: DrmRddInput[K],
+    private[sparkbindings] var _nrow: Long = -1L,
+    private[sparkbindings] var _ncol: Int = -1,
     private val _cacheStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY,
     override protected[mahout] val partitioningTag: Long = Random.nextLong(),
     private var _canHaveMissingRows: Boolean = false
@@ -63,7 +63,7 @@ class CheckpointedDrmSpark[K: ClassTag](
   private[mahout] var intFixExtra: Long = 0L
 
   private var cached: Boolean = false
-  override val context: DistributedContext = rdd.context
+  override val context: DistributedContext = rddInput.backingRdd.context
 
   /** Explicit extraction of key class Tag   */
   def keyClassTag: ClassTag[K] = implicitly[ClassTag[K]]
@@ -78,8 +78,8 @@ class CheckpointedDrmSpark[K: ClassTag](
   }
 
   def cache() = {
-    if (!cached) {
-      rdd.persist(_cacheStorageLevel)
+    if (!cached && _cacheStorageLevel != StorageLevel.NONE) {
+      rddInput.backingRdd.persist(_cacheStorageLevel)
       cached = true
     }
     this
@@ -92,7 +92,7 @@ class CheckpointedDrmSpark[K: ClassTag](
    */
   def uncache(): this.type = {
     if (cached) {
-      rdd.unpersist(blocking = false)
+      rddInput.backingRdd.unpersist(blocking = false)
       cached = false
     }
     this
@@ -115,7 +115,7 @@ class CheckpointedDrmSpark[K: ClassTag](
    */
   def collect: Matrix = {
 
-    val intRowIndices = implicitly[ClassTag[K]] == implicitly[ClassTag[Int]]
+    val intRowIndices = classTag[K] == ClassTag.Int
 
     val cols = ncol
     val rows = safeToNonNegInt(nrow)
@@ -124,7 +124,7 @@ class CheckpointedDrmSpark[K: ClassTag](
     // since currently spark #collect() requires Serializeable support,
     // we serialize DRM vectors into byte arrays on backend and restore Vector
     // instances on the front end:
-    val data = rdd.map(t => (t._1, t._2)).collect()
+    val data = rddInput.toDrmRdd().map(t => (t._1, t._2)).collect()
 
 
     val m = if (data.forall(_._2.isDense))
@@ -165,7 +165,7 @@ class CheckpointedDrmSpark[K: ClassTag](
       else if (classOf[Writable].isAssignableFrom(ktag.runtimeClass)) (x: K) => x.asInstanceOf[Writable]
       else throw new IllegalArgumentException("Do not know how to convert class tag %s to Writable.".format(ktag))
 
-    rdd.saveAsSequenceFile(path)
+    rddInput.toDrmRdd().saveAsSequenceFile(path)
   }
 
   protected def computeNRow = {
@@ -173,7 +173,7 @@ class CheckpointedDrmSpark[K: ClassTag](
     val intRowIndex = classTag[K] == classTag[Int]
 
     if (intRowIndex) {
-      val rdd = cache().rdd.asInstanceOf[DrmRdd[Int]]
+      val rdd = cache().rddInput.toDrmRdd().asInstanceOf[DrmRdd[Int]]
 
       // I guess it is a suitable place to compute int keys consistency test here because we know
       // that nrow can be computed lazily, which always happens when rdd is already available, cached,
@@ -186,16 +186,21 @@ class CheckpointedDrmSpark[K: ClassTag](
       intFixExtra = (maxPlus1 - rowCount) max 0L
       maxPlus1
     } else
-      cache().rdd.count()
+      cache().rddInput.toDrmRdd().count()
   }
 
 
 
-  protected def computeNCol =
-    cache().rdd.map(_._2.length).fold(-1)(max(_, _))
+  protected def computeNCol = {
+    rddInput.isBlockified match {
+      case true ⇒ rddInput.toBlockifiedDrmRdd(throw new AssertionError("not reached"))
+        .map(_._2.ncol).reduce(max(_, _))
+      case false ⇒ cache().rddInput.toDrmRdd().map(_._2.length).fold(-1)(max(_, _))
+    }
+  }
 
   protected def computeNNonZero =
-    cache().rdd.map(_._2.getNumNonZeroElements.toLong).sum().toLong
+    cache().rddInput.toDrmRdd().map(_._2.getNumNonZeroElements.toLong).sum().toLong
 
   /** Changes the number of rows in the DRM without actually touching the underlying data. Used to
     * redimension a DRM after it has been created, which implies some blank, non-existent rows.
@@ -205,8 +210,8 @@ class CheckpointedDrmSpark[K: ClassTag](
   override def newRowCardinality(n: Int): CheckpointedDrm[K] = {
     assert(n > -1)
     assert( n >= nrow)
-    val newCheckpointedDrm = drmWrap[K](rdd, n, ncol)
-    newCheckpointedDrm
+    new CheckpointedDrmSpark(rddInput = rddInput, _nrow = n, _ncol = _ncol, _cacheStorageLevel = _cacheStorageLevel,
+      partitioningTag = partitioningTag, _canHaveMissingRows = _canHaveMissingRows)
   }
 
 }
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/CheckpointedDrmSparkOps.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/CheckpointedDrmSparkOps.scala
index 7cf6bd6d1a..abcfc641a4 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/CheckpointedDrmSparkOps.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/CheckpointedDrmSparkOps.scala
@@ -11,6 +11,6 @@ class CheckpointedDrmSparkOps[K: ClassTag](drm: CheckpointedDrm[K]) {
   private[sparkbindings] val sparkDrm = drm.asInstanceOf[CheckpointedDrmSpark[K]]
 
   /** Spark matrix customization exposure */
-  def rdd = sparkDrm.rdd
+  def rdd = sparkDrm.rddInput.toDrmRdd()
 
 }
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/DrmRddInput.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/DrmRddInput.scala
index b72818c719..d9dbada149 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/DrmRddInput.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/DrmRddInput.scala
@@ -23,22 +23,18 @@ import org.apache.spark.storage.StorageLevel
 import org.apache.mahout.sparkbindings._
 
 /** Encapsulates either DrmRdd[K] or BlockifiedDrmRdd[K] */
-class DrmRddInput[K: ClassTag](
-    private val rowWiseSrc: Option[( /*ncol*/ Int, /*rdd*/ DrmRdd[K])] = None,
-    private val blockifiedSrc: Option[BlockifiedDrmRdd[K]] = None
-    ) {
+class DrmRddInput[K: ClassTag](private val input: Either[DrmRdd[K], BlockifiedDrmRdd[K]]) {
 
-  assert(rowWiseSrc.isDefined || blockifiedSrc.isDefined, "Undefined input")
+  private[sparkbindings] lazy val backingRdd = input.left.getOrElse(input.right.get)
 
-  private lazy val backingRdd = rowWiseSrc.map(_._2).getOrElse(blockifiedSrc.get)
+  def isBlockified: Boolean = input.isRight
 
-  def isBlockified:Boolean = blockifiedSrc.isDefined
+  def isRowWise: Boolean = input.isLeft
 
-  def isRowWise:Boolean = rowWiseSrc.isDefined
+  def toDrmRdd(): DrmRdd[K] = input.left.getOrElse(deblockify(rdd = input.right.get))
 
-  def toDrmRdd(): DrmRdd[K] = rowWiseSrc.map(_._2).getOrElse(deblockify(rdd = blockifiedSrc.get))
-
-  def toBlockifiedDrmRdd() = blockifiedSrc.getOrElse(blockify(rdd = rowWiseSrc.get._2, blockncol = rowWiseSrc.get._1))
+  /** Use late binding for this. It may or may not be needed, depending on current config. */
+  def toBlockifiedDrmRdd(ncol: ⇒ Int) = input.right.getOrElse(blockify(rdd = input.left.get, blockncol = ncol))
 
   def sparkContext: SparkContext = backingRdd.sparkContext
 
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/SparkBCast.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/SparkBCast.scala
index ac36f60201..0371f9b0fd 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/SparkBCast.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/SparkBCast.scala
@@ -22,4 +22,6 @@ import org.apache.spark.broadcast.Broadcast
 
 class SparkBCast[T](val sbcast: Broadcast[T]) extends BCast[T] with Serializable {
   def value: T = sbcast.value
+
+  override def close(): Unit = sbcast.unpersist()
 }
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/package.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/package.scala
index c04b3065db..0de5ff8063 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/package.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/drm/package.scala
@@ -37,18 +37,19 @@ package object drm {
 
   private[drm] final val log = Logger.getLogger("org.apache.mahout.sparkbindings");
 
-  private[sparkbindings] implicit def input2drmRdd[K](input: DrmRddInput[K]): DrmRdd[K] = input.toDrmRdd()
+  private[sparkbindings] implicit def cpDrm2DrmRddInput[K: ClassTag](cp: CheckpointedDrmSpark[K]): DrmRddInput[K] =
+    cp.rddInput
 
-  private[sparkbindings] implicit def input2blockifiedDrmRdd[K](input: DrmRddInput[K]): BlockifiedDrmRdd[K] = input.toBlockifiedDrmRdd()
+  private[sparkbindings] implicit def cpDrmGeneric2DrmRddInput[K: ClassTag](cp: CheckpointedDrm[K]): DrmRddInput[K] =
+    cp.asInstanceOf[CheckpointedDrmSpark[K]]
+
+  private[sparkbindings] implicit def drmRdd2drmRddInput[K: ClassTag](rdd: DrmRdd[K]) = new DrmRddInput[K](Left(rdd))
+
+  private[sparkbindings] implicit def blockifiedRdd2drmRddInput[K: ClassTag](rdd: BlockifiedDrmRdd[K]) = new
+      DrmRddInput[K](
+    Right(rdd))
 
-  private[sparkbindings] implicit def cpDrm2DrmRddInput[K: ClassTag](cp: CheckpointedDrm[K]): DrmRddInput[K] =
-    new DrmRddInput(rowWiseSrc = Some(cp.ncol -> cp.rdd))
 
-//  /** Broadcast vector (Mahout vectors are not closure-friendly, use this instead. */
-//  private[sparkbindings] def drmBroadcast(x: Vector)(implicit sc: SparkContext): Broadcast[Vector] = sc.broadcast(x)
-//
-//  /** Broadcast in-core Mahout matrix. Use this instead of closure. */
-//  private[sparkbindings] def drmBroadcast(m: Matrix)(implicit sc: SparkContext): Broadcast[Matrix] = sc.broadcast(m)
 
   /** Implicit broadcast cast for Spark physical op implementations. */
   private[sparkbindings] implicit def bcast2val[K](bcast:Broadcast[K]):K = bcast.value
@@ -74,7 +75,7 @@ package object drm {
           }
           block
         } else {
-          new SparseRowMatrix(vectors.size, blockncol, vectors)
+          new SparseRowMatrix(vectors.size, blockncol, vectors, true, false)
         }
 
         Iterator(keys -> block)
@@ -101,7 +102,7 @@ package object drm {
         blockKeys.ensuring(blockKeys.size == block.nrow)
         blockKeys.view.zipWithIndex.map {
           case (key, idx) =>
-            var v = block(idx, ::) // This is just a view!
+            val v = block(idx, ::) // This is just a view!
 
             // If a view rather than a concrete vector, clone into a concrete vector in order not to
             // attempt to serialize outer matrix when we save it (Although maybe most often this
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/io/GenericMatrixKryoSerializer.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/io/GenericMatrixKryoSerializer.scala
new file mode 100644
index 0000000000..15a3d3bfd9
--- /dev/null
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/io/GenericMatrixKryoSerializer.scala
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.sparkbindings.io
+
+import java.util
+
+import com.esotericsoftware.kryo.io.{Output, Input}
+import com.esotericsoftware.kryo.{Kryo, Serializer}
+import org.apache.log4j.Logger
+import org.apache.mahout.logging._
+import org.apache.mahout.math._
+import org.apache.mahout.math.flavor.TraversingStructureEnum
+import scalabindings._
+import RLikeOps._
+import collection._
+import JavaConversions._
+
+object GenericMatrixKryoSerializer {
+
+  private implicit final val log = Logger.getLogger(classOf[GenericMatrixKryoSerializer])
+
+}
+
+/** Serializes Sparse or Dense in-core generic matrix (row-wise or column-wise backed) */
+class GenericMatrixKryoSerializer extends Serializer[Matrix] {
+
+  import GenericMatrixKryoSerializer._
+
+  override def write(kryo: Kryo, output: Output, mx: Matrix): Unit = {
+
+    debug(s"Writing mx of type ${mx.getClass.getName}")
+
+    val structure = mx.getFlavor.getStructure
+
+    // Write structure bit
+    output.writeInt(structure.ordinal(), true)
+
+    // Write geometry
+    output.writeInt(mx.nrow, true)
+    output.writeInt(mx.ncol, true)
+
+    // Write in most efficient traversal order (using backing vectors perhaps)
+    structure match {
+      case TraversingStructureEnum.COLWISE => writeRowWise(kryo, output, mx.t)
+      case TraversingStructureEnum.SPARSECOLWISE => writeSparseRowWise(kryo, output, mx.t)
+      case TraversingStructureEnum.SPARSEROWWISE => writeSparseRowWise(kryo, output, mx)
+      case TraversingStructureEnum.VECTORBACKED => writeVectorBacked(kryo, output, mx)
+      case _ => writeRowWise(kryo, output, mx)
+    }
+
+  }
+
+  private def writeVectorBacked(kryo: Kryo, output: Output, mx: Matrix) {
+
+    require(mx != null)
+
+    // At this point we are just doing some vector-backed classes individually. TODO: create
+    // api to obtain vector-backed matrix data.
+    kryo.writeClass(output, mx.getClass)
+    mx match {
+      case mxD: DiagonalMatrix => kryo.writeObject(output, mxD.diagv)
+      case mxS: DenseSymmetricMatrix => kryo.writeObject(output, dvec(mxS.getData))
+      case mxT: UpperTriangular => kryo.writeObject(output, dvec(mxT.getData))
+      case _ => throw new IllegalArgumentException(s"Unsupported matrix type:${mx.getClass.getName}")
+    }
+  }
+
+  private def readVectorBacked(kryo: Kryo, input: Input, nrow: Int, ncol: Int) = {
+
+    // We require vector-backed matrices to have vector-parameterized constructor to construct.
+    val clazz = kryo.readClass(input).getType
+
+    debug(s"Deserializing vector-backed mx of type ${clazz.getName}.")
+
+    clazz.getConstructor(classOf[Vector]).newInstance(kryo.readObject(input, classOf[Vector])).asInstanceOf[Matrix]
+  }
+
+  private def writeRowWise(kryo: Kryo, output: Output, mx: Matrix): Unit = {
+    for (row <- mx) kryo.writeObject(output, row)
+  }
+
+  private def readRows(kryo: Kryo, input: Input, nrow: Int) = {
+    Array.tabulate(nrow) { _ => kryo.readObject(input, classOf[Vector])}
+  }
+
+  private def readSparseRows(kryo: Kryo, input: Input) = {
+
+    // Number of slices
+    val nslices = input.readInt(true)
+
+    Array.tabulate(nslices) { _ =>
+      input.readInt(true) -> kryo.readObject(input, classOf[Vector])
+    }
+  }
+
+  private def writeSparseRowWise(kryo: Kryo, output: Output, mx: Matrix): Unit = {
+
+    val nslices = mx.numSlices()
+
+    output.writeInt(nslices, true)
+
+    var actualNSlices = 0;
+    for (row <- mx.iterateNonEmpty()) {
+      output.writeInt(row.index(), true)
+      kryo.writeObject(output, row.vector())
+      actualNSlices += 1
+    }
+
+    require(nslices == actualNSlices, "Number of slices reported by Matrix.numSlices() was different from actual " +
+      "slice iterator size.")
+  }
+
+  override def read(kryo: Kryo, input: Input, mxClass: Class[Matrix]): Matrix = {
+
+    // Read structure hint
+    val structure = TraversingStructureEnum.values()(input.readInt(true))
+
+    // Read geometry
+    val nrow = input.readInt(true)
+    val ncol = input.readInt(true)
+
+    debug(s"read matrix geometry: $nrow x $ncol.")
+
+    structure match {
+
+      // Sparse or dense column wise
+      case TraversingStructureEnum.COLWISE =>
+        val cols = readRows(kryo, input, ncol)
+
+        if (cols.head.isDense)
+          dense(cols).t
+        else {
+
+          debug("Deserializing as SparseRowMatrix.t (COLWISE).")
+          new SparseRowMatrix(ncol, nrow, cols, true, false).t
+        }
+
+      // transposed SparseMatrix case
+      case TraversingStructureEnum.SPARSECOLWISE =>
+        val cols = readSparseRows(kryo, input)
+        val javamap = new util.HashMap[Integer, Vector]((cols.size << 1) + 1)
+        cols.foreach { case (idx, vec) => javamap.put(idx, vec)}
+
+        debug("Deserializing as SparseMatrix.t (SPARSECOLWISE).")
+        new SparseMatrix(ncol, nrow, javamap, true).t
+
+      // Sparse Row-wise -- this will be created as a SparseMatrix.
+      case TraversingStructureEnum.SPARSEROWWISE =>
+        val rows = readSparseRows(kryo, input)
+        val javamap = new util.HashMap[Integer, Vector]((rows.size << 1) + 1)
+        rows.foreach { case (idx, vec) => javamap.put(idx, vec)}
+
+        debug("Deserializing as SparseMatrix (SPARSEROWWISE).")
+        new SparseMatrix(nrow, ncol, javamap, true)
+      case TraversingStructureEnum.VECTORBACKED =>
+
+        debug("Deserializing vector-backed...")
+        readVectorBacked(kryo, input, nrow, ncol)
+
+      // By default, read row-wise.
+      case _ =>
+        val cols = readRows(kryo, input, nrow)
+        // this still copies a lot of stuff...
+        if (cols.head.isDense) {
+
+          debug("Deserializing as DenseMatrix.")
+          dense(cols)
+        } else {
+
+          debug("Deserializing as SparseRowMatrix(default).")
+          new SparseRowMatrix(nrow, ncol, cols, true, false)
+        }
+    }
+
+  }
+}
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/io/MahoutKryoRegistrator.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/io/MahoutKryoRegistrator.scala
index a8a0bb4c36..406891f19b 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/io/MahoutKryoRegistrator.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/io/MahoutKryoRegistrator.scala
@@ -25,15 +25,28 @@ import org.apache.spark.serializer.KryoRegistrator
 import org.apache.mahout.sparkbindings._
 import org.apache.mahout.math.Vector.Element
 
-import scala.collection.immutable.List
+object MahoutKryoRegistrator {
 
-/** Kryo serialization registrator for Mahout */
-class MahoutKryoRegistrator extends KryoRegistrator {
+  private final implicit val log = getLog(this.getClass)
+
+  def registerClasses(kryo: Kryo) = {
 
-  override def registerClasses(kryo: Kryo) = {
+    trace("Registering mahout classes.")
+
+    kryo.register(classOf[SparseColumnMatrix], new UnsupportedSerializer)
+    kryo.addDefaultSerializer(classOf[Vector], new VectorKryoSerializer())
+    kryo.addDefaultSerializer(classOf[Matrix], new GenericMatrixKryoSerializer)
 
     kryo.addDefaultSerializer(classOf[Vector], new WritableKryoSerializer[Vector, VectorWritable])
     kryo.addDefaultSerializer(classOf[DenseVector], new WritableKryoSerializer[Vector, VectorWritable])
     kryo.addDefaultSerializer(classOf[Matrix], new WritableKryoSerializer[Matrix, MatrixWritable])
   }
+
+
+}
+
+/** Kryo serialization registrator for Mahout */
+class MahoutKryoRegistrator extends KryoRegistrator {
+
+  override def registerClasses(kryo: Kryo) = MahoutKryoRegistrator.registerClasses(kryo)
 }
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/io/UnsupportedSerializer.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/io/UnsupportedSerializer.scala
new file mode 100644
index 0000000000..66b79f46e0
--- /dev/null
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/io/UnsupportedSerializer.scala
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.sparkbindings.io
+
+import com.esotericsoftware.kryo.io.{Output, Input}
+import com.esotericsoftware.kryo.{Kryo, Serializer}
+
+class UnsupportedSerializer extends Serializer[Any] {
+
+  override def write(kryo: Kryo, output: Output, obj: Any): Unit = {
+    throw new IllegalArgumentException(s"I/O of this type(${obj.getClass.getName} is explicitly unsupported for a " +
+      "good reason.")
+  }
+
+  override def read(kryo: Kryo, input: Input, `type`: Class[Any]): Any = ???
+}
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/io/VectorKryoSerializer.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/io/VectorKryoSerializer.scala
new file mode 100644
index 0000000000..175778f0ca
--- /dev/null
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/io/VectorKryoSerializer.scala
@@ -0,0 +1,252 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.sparkbindings.io
+
+import org.apache.log4j.Logger
+import org.apache.mahout.logging._
+import org.apache.mahout.math._
+import org.apache.mahout.math.scalabindings._
+import RLikeOps._
+
+import com.esotericsoftware.kryo.io.{OutputChunked, Output, Input}
+import com.esotericsoftware.kryo.{Kryo, Serializer}
+
+import collection._
+import JavaConversions._
+
+
+object VectorKryoSerializer {
+
+  final val FLAG_DENSE: Int = 0x01
+  final val FLAG_SEQUENTIAL: Int = 0x02
+  final val FLAG_NAMED: Int = 0x04
+  final val FLAG_LAX_PRECISION: Int = 0x08
+
+  private final implicit val log = getLog(classOf[VectorKryoSerializer])
+
+}
+
+class VectorKryoSerializer(val laxPrecision: Boolean = false) extends Serializer[Vector] {
+
+  import VectorKryoSerializer._
+
+  override def write(kryo: Kryo, output: Output, vector: Vector): Unit = {
+
+    require(vector != null)
+
+    trace(s"Serializing vector of ${vector.getClass.getName} class.")
+
+    // Write length
+    val len = vector.length
+    output.writeInt(len, true)
+
+    // Interrogate vec properties
+    val dense = vector.isDense
+    val sequential = vector.isSequentialAccess
+    val named = vector.isInstanceOf[NamedVector]
+
+    var flag = 0
+
+    if (dense) {
+      flag |= FLAG_DENSE
+    } else if (sequential) {
+      flag |= FLAG_SEQUENTIAL
+    }
+
+    if (vector.isInstanceOf[NamedVector]) {
+      flag |= FLAG_NAMED
+    }
+
+    if (laxPrecision) flag |= FLAG_LAX_PRECISION
+
+    // Write flags
+    output.writeByte(flag)
+
+    // Write name if needed
+    if (named) output.writeString(vector.asInstanceOf[NamedVector].getName)
+
+    dense match {
+
+      // Dense vector.
+      case true =>
+
+        laxPrecision match {
+          case true =>
+            for (i <- 0 until vector.length) output.writeFloat(vector(i).toFloat)
+          case _ =>
+            for (i <- 0 until vector.length) output.writeDouble(vector(i))
+        }
+      case _ =>
+
+        // Turns out getNumNonZeroElements must check every element if it is indeed non-zero. The
+        // iterateNonZeros() on the other hand doesn't do that, so that's all inconsistent right
+        // now. so we'll just auto-terminate.
+        val iter = vector.nonZeroes.toIterator.filter(_.get() != 0.0)
+
+        sequential match {
+
+          // Delta encoding
+          case true =>
+
+            var idx = 0
+            laxPrecision match {
+              case true =>
+                while (iter.hasNext) {
+                  val el = iter.next()
+                  output.writeFloat(el.toFloat)
+                  output.writeInt(el.index() - idx, true)
+                  idx = el.index
+                }
+                // Terminate delta encoding.
+                output.writeFloat(0.0.toFloat)
+              case _ =>
+                while (iter.hasNext) {
+                  val el = iter.next()
+                  output.writeDouble(el.get())
+                  output.writeInt(el.index() - idx, true)
+                  idx = el.index
+                }
+                // Terminate delta encoding.
+                output.writeDouble(0.0)
+            }
+
+          // Random access.
+          case _ =>
+
+            laxPrecision match {
+
+              case true =>
+                iter.foreach { el =>
+                  output.writeFloat(el.get().toFloat)
+                  output.writeInt(el.index(), true)
+                }
+                // Terminate random access with 0.0 value.
+                output.writeFloat(0.0.toFloat)
+              case _ =>
+                iter.foreach { el =>
+                  output.writeDouble(el.get())
+                  output.writeInt(el.index(), true)
+                }
+                // Terminate random access with 0.0 value.
+                output.writeDouble(0.0)
+            }
+
+        }
+
+    }
+  }
+
+  override def read(kryo: Kryo, input: Input, vecClass: Class[Vector]): Vector = {
+
+    val len = input.readInt(true)
+    val flags = input.readByte().toInt
+    val name = if ((flags & FLAG_NAMED) != 0) Some(input.readString()) else None
+
+    val vec: Vector = flags match {
+
+      // Dense
+      case _: Int if ((flags & FLAG_DENSE) != 0) =>
+
+        trace(s"Deserializing dense vector.")
+
+        if ((flags & FLAG_LAX_PRECISION) != 0) {
+          new DenseVector(len) := { _ => input.readFloat()}
+        } else {
+          new DenseVector(len) := { _ => input.readDouble()}
+        }
+
+      // Sparse case.
+      case _ =>
+
+        flags match {
+
+          // Sequential.
+          case _: Int if ((flags & FLAG_SEQUENTIAL) != 0) =>
+
+            trace("Deserializing as sequential sparse vector.")
+
+            val v = new SequentialAccessSparseVector(len)
+            var idx = 0
+            var stop = false
+
+            if ((flags & FLAG_LAX_PRECISION) != 0) {
+
+              while (!stop) {
+                val value = input.readFloat()
+                if (value == 0.0) {
+                  stop = true
+                } else {
+                  idx += input.readInt(true)
+                  v(idx) = value
+                }
+              }
+            } else {
+              while (!stop) {
+                val value = input.readDouble()
+                if (value == 0.0) {
+                  stop = true
+                } else {
+                  idx += input.readInt(true)
+                  v(idx) = value
+                }
+              }
+            }
+            v
+
+          // Random access
+          case _ =>
+
+            trace("Deserializing as random access vector.")
+
+            // Read pairs until we see 0.0 value. Prone to corruption attacks obviously.
+            val v = new RandomAccessSparseVector(len)
+            var stop = false
+            if ((flags & FLAG_LAX_PRECISION) != 0) {
+              while (! stop ) {
+                val value = input.readFloat()
+                if ( value == 0.0 ) {
+                  stop = true
+                } else {
+                  val idx = input.readInt(true)
+                  v(idx) = value
+                }
+              }
+            } else {
+              while (! stop ) {
+                val value = input.readDouble()
+                if (value == 0.0) {
+                  stop = true
+                } else {
+                  val idx = input.readInt(true)
+                  v(idx) = value
+                }
+              }
+            }
+            v
+        }
+    }
+
+    name.map{name =>
+
+      trace(s"Recovering named vector's name ${name}.")
+
+      new NamedVector(vec, name)
+    }
+      .getOrElse(vec)
+  }
+}
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/package.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/package.scala
index 02f6b8cb4b..330ae3851c 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/package.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/package.scala
@@ -17,27 +17,27 @@
 
 package org.apache.mahout
 
-import org.apache.mahout.drivers.TextDelimitedIndexedDatasetReader
-import org.apache.mahout.math.indexeddataset.Schema
-import org.apache.mahout.sparkbindings.indexeddataset.IndexedDatasetSpark
-import org.apache.spark.{SparkConf, SparkContext}
 import java.io._
-import scala.collection.mutable.ArrayBuffer
-import org.apache.mahout.common.IOUtils
-import org.apache.log4j.Logger
+
+import org.apache.mahout.logging._
 import org.apache.mahout.math.drm._
-import scala.reflect.ClassTag
-import org.apache.mahout.sparkbindings.drm.{DrmRddInput, SparkBCast, CheckpointedDrmSparkOps, CheckpointedDrmSpark}
-import org.apache.spark.rdd.RDD
+import org.apache.mahout.math.{MatrixWritable, VectorWritable, Matrix, Vector}
+import org.apache.mahout.sparkbindings.drm.{CheckpointedDrmSpark, CheckpointedDrmSparkOps, SparkBCast}
+import org.apache.mahout.util.IOUtilsScala
 import org.apache.spark.broadcast.Broadcast
-import org.apache.mahout.math.{VectorWritable, Vector, MatrixWritable, Matrix}
-import org.apache.hadoop.io.Writable
-import org.apache.spark.storage.StorageLevel
+import org.apache.spark.rdd.RDD
+import org.apache.spark.{SparkConf, SparkContext}
+
+import collection._
+import collection.generic.Growable
+import scala.reflect.ClassTag
+
+
 
 /** Public api for Spark-specific operators */
 package object sparkbindings {
 
-  private[sparkbindings] val log = Logger.getLogger("org.apache.mahout.sparkbindings")
+  private final implicit val log = getLog(`package`.getClass)
 
   /** Row-wise organized DRM rdd type */
   type DrmRdd[K] = RDD[DrmTuple[K]]
@@ -55,15 +55,11 @@ package object sparkbindings {
    * @param customJars
    * @return
    */
-  def mahoutSparkContext(
-      masterUrl: String,
-      appName: String,
-      customJars: TraversableOnce[String] = Nil,
-      sparkConf: SparkConf = new SparkConf(),
-      addMahoutJars: Boolean = true
-      ): SparkDistributedContext = {
+  def mahoutSparkContext(masterUrl: String, appName: String, customJars: TraversableOnce[String] = Nil,
+                         sparkConf: SparkConf = new SparkConf(), addMahoutJars: Boolean = true):
+  SparkDistributedContext = {
 
-    val closeables = new java.util.ArrayDeque[Closeable]()
+    val closeables = mutable.ListBuffer.empty[Closeable]
 
     try {
 
@@ -84,9 +80,9 @@ package object sparkbindings {
         sparkConf.setJars(customJars.toSeq)
       }
 
-      sparkConf.setAppName(appName).setMaster(masterUrl)
-          .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
-          .set("spark.kryo.registrator", "org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator")
+      sparkConf.setAppName(appName).setMaster(masterUrl).set("spark.serializer",
+        "org.apache.spark.serializer.KryoSerializer").set("spark.kryo.registrator",
+          "org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator")
 
       if (System.getenv("SPARK_HOME") != null) {
         sparkConf.setSparkHome(System.getenv("SPARK_HOME"))
@@ -95,7 +91,7 @@ package object sparkbindings {
       new SparkDistributedContext(new SparkContext(config = sparkConf))
 
     } finally {
-      IOUtils.close(closeables)
+      IOUtilsScala.close(closeables)
     }
   }
 
@@ -103,19 +99,19 @@ package object sparkbindings {
 
   implicit def sc2sdc(sc: SparkContext): SparkDistributedContext = new SparkDistributedContext(sc)
 
-  implicit def dc2sc(dc:DistributedContext):SparkContext = {
-    assert (dc.isInstanceOf[SparkDistributedContext],"distributed context must be Spark-specific.")
+  implicit def dc2sc(dc: DistributedContext): SparkContext = {
+    assert(dc.isInstanceOf[SparkDistributedContext], "distributed context must be Spark-specific.")
     sdc2sc(dc.asInstanceOf[SparkDistributedContext])
   }
 
   /** Broadcast transforms */
-  implicit def sb2bc[T](b:Broadcast[T]):BCast[T] = new SparkBCast(b)
+  implicit def sb2bc[T](b: Broadcast[T]): BCast[T] = new SparkBCast(b)
 
   /** Adding Spark-specific ops */
   implicit def cpDrm2cpDrmSparkOps[K: ClassTag](drm: CheckpointedDrm[K]): CheckpointedDrmSparkOps[K] =
     new CheckpointedDrmSparkOps[K](drm)
 
-  implicit def drm2cpDrmSparkOps[K:ClassTag](drm:DrmLike[K]):CheckpointedDrmSparkOps[K] = drm:CheckpointedDrm[K]
+  implicit def drm2cpDrmSparkOps[K: ClassTag](drm: DrmLike[K]): CheckpointedDrmSparkOps[K] = drm: CheckpointedDrm[K]
 
   private[sparkbindings] implicit def m2w(m: Matrix): MatrixWritable = new MatrixWritable(m)
 
@@ -123,7 +119,7 @@ package object sparkbindings {
 
   private[sparkbindings] implicit def v2w(v: Vector): VectorWritable = new VectorWritable(v)
 
-  private[sparkbindings] implicit def w2v(w:VectorWritable):Vector = w.get()
+  private[sparkbindings] implicit def w2v(w: VectorWritable): Vector = w.get()
 
   /**
    * ==Wrap existing RDD into a matrix==
@@ -141,34 +137,31 @@ package object sparkbindings {
    * @tparam K row key type
    * @return wrapped DRM
    */
-  def drmWrap[K: ClassTag](
-      rdd: DrmRdd[K],
-      nrow: Int = -1,
-      ncol: Int = -1,
-      cacheHint: CacheHint.CacheHint = CacheHint.NONE,
-      canHaveMissingRows: Boolean = false
-      ): CheckpointedDrm[K] =
-
-    new CheckpointedDrmSpark[K](
-      rdd = rdd,
-      _nrow = nrow,
-      _ncol = ncol,
-      _cacheStorageLevel = SparkEngine.cacheHint2Spark(cacheHint),
-      _canHaveMissingRows = canHaveMissingRows
-    )
+  def drmWrap[K: ClassTag](rdd: DrmRdd[K], nrow: Long = -1, ncol: Int = -1, cacheHint: CacheHint.CacheHint =
+  CacheHint.NONE, canHaveMissingRows: Boolean = false): CheckpointedDrm[K] =
+
+    new CheckpointedDrmSpark[K](rddInput = rdd, _nrow = nrow, _ncol = ncol, _cacheStorageLevel = SparkEngine
+      .cacheHint2Spark(cacheHint), _canHaveMissingRows = canHaveMissingRows)
+
+
+  /** Another drmWrap version that takes in vertical block-partitioned input to form the matrix. */
+  def drmWrapBlockified[K: ClassTag](blockifiedDrmRdd: BlockifiedDrmRdd[K], nrow: Long = -1, ncol: Int = -1,
+                                     cacheHint: CacheHint.CacheHint = CacheHint.NONE,
+                                     canHaveMissingRows: Boolean = false): CheckpointedDrm[K] =
+
+    drmWrap(drm.deblockify(blockifiedDrmRdd), nrow, ncol, cacheHint, canHaveMissingRows)
 
   private[sparkbindings] def getMahoutHome() = {
     var mhome = System.getenv("MAHOUT_HOME")
     if (mhome == null) mhome = System.getProperty("mahout.home")
-    require(mhome != null, "MAHOUT_HOME is required to spawn mahout-based spark jobs" )
+    require(mhome != null, "MAHOUT_HOME is required to spawn mahout-based spark jobs")
     mhome
   }
 
   /** Acquire proper Mahout jars to be added to task context based on current MAHOUT_HOME. */
-  private[sparkbindings] def findMahoutContextJars(closeables:java.util.Deque[Closeable]) = {
+  private[sparkbindings] def findMahoutContextJars(closeables: Growable[Closeable]) = {
 
     // Figure Mahout classpath using $MAHOUT_HOME/mahout classpath command.
-
     val fmhome = new File(getMahoutHome())
     val bin = new File(fmhome, "bin")
     val exec = new File(bin, "mahout")
@@ -177,26 +170,25 @@ package object sparkbindings {
 
     val p = Runtime.getRuntime.exec(Array(exec.getAbsolutePath, "-spark", "classpath"))
 
-    closeables.addFirst(new Closeable {
+    closeables += new Closeable {
       def close() {
         p.destroy()
       }
-    })
+    }
 
     val r = new BufferedReader(new InputStreamReader(p.getInputStream))
-    closeables.addFirst(r)
+    closeables += r
 
     val w = new StringWriter()
-    closeables.addFirst(w)
+    closeables += w
 
     var continue = true;
-    val jars = new ArrayBuffer[String]()
+    val jars = new mutable.ArrayBuffer[String]()
     do {
       val cp = r.readLine()
       if (cp == null)
-        throw new IllegalArgumentException(
-          "Unable to read output from \"mahout -spark classpath\". Is SPARK_HOME defined?"
-        )
+        throw new IllegalArgumentException("Unable to read output from \"mahout -spark classpath\". Is SPARK_HOME " +
+          "defined?")
 
       val j = cp.split(File.pathSeparatorChar)
       if (j.size > 10) {
@@ -206,8 +198,7 @@ package object sparkbindings {
       }
     } while (continue)
 
-//    jars.foreach(j => log.info(j))
-
+    //    jars.foreach(j => log.info(j))
     // context specific jars
     val mcjars = jars.filter(j =>
       j.matches(".*mahout-math-\\d.*\\.jar") ||
@@ -233,4 +224,13 @@ package object sparkbindings {
     mcjars
   }
 
+  private[sparkbindings] def validateBlockifiedDrmRdd[K](rdd: BlockifiedDrmRdd[K]): Boolean = {
+    // Mostly, here each block must contain exactly one block
+    val part1Req = rdd.mapPartitions(piter => Iterator(piter.size == 1)).reduce(_ && _)
+
+    if (!part1Req) warn("blockified rdd: condition not met: exactly 1 per partition")
+
+    return part1Req
+  }
+
 }
diff --git a/spark/src/test/scala/org/apache/mahout/cf/SimilarityAnalysisSuite.scala b/spark/src/test/scala/org/apache/mahout/cf/SimilarityAnalysisSuite.scala
index 0b3b3eb475..9b4c939b7e 100644
--- a/spark/src/test/scala/org/apache/mahout/cf/SimilarityAnalysisSuite.scala
+++ b/spark/src/test/scala/org/apache/mahout/cf/SimilarityAnalysisSuite.scala
@@ -66,7 +66,9 @@ class SimilarityAnalysisSuite extends FunSuite with MahoutSuite with Distributed
       (0.0,                0.0,                0.6795961471815897, 0.0,                4.498681156950466))
 
 
-  test("cooccurrence [A'A], [B'A] boolbean data using LLR") {
+
+//  test("cooccurrence [A'A], [B'A] boolbean data using LLR") {
+  ignore("cooccurrence [A'A], [B'A] boolbean data using LLR") {
     val a = dense(
         (1, 1, 0, 0, 0),
         (0, 0, 1, 1, 0),
@@ -97,7 +99,8 @@ class SimilarityAnalysisSuite extends FunSuite with MahoutSuite with Distributed
 
   }
 
-  test("cooccurrence [A'A], [B'A] double data using LLR") {
+//  test("cooccurrence [A'A], [B'A] double data using LLR") {
+  ignore("cooccurrence [A'A], [B'A] double data using LLR") {
     val a = dense(
         (100000.0D, 1.0D,  0.0D,  0.0D,     0.0D),
         (     0.0D, 0.0D, 10.0D,  1.0D,     0.0D),
@@ -127,7 +130,8 @@ class SimilarityAnalysisSuite extends FunSuite with MahoutSuite with Distributed
     n should be < 1E-10
   }
 
-  test("cooccurrence [A'A], [B'A] integer data using LLR") {
+//  test("cooccurrence [A'A], [B'A] integer data using LLR") {
+  ignore("cooccurrence [A'A], [B'A] integer data using LLR") {
     val a = dense(
         ( 1000,  10,       0,    0,   0),
         (    0,   0,  -10000,   10,   0),
@@ -159,7 +163,8 @@ class SimilarityAnalysisSuite extends FunSuite with MahoutSuite with Distributed
     n should be < 1E-10
   }
 
-  test("cooccurrence two matrices with different number of columns"){
+//  test("cooccurrence two matrices with different number of columns"){
+  ignore("cooccurrence two matrices with different number of columns"){
     val a = dense(
       (1, 1, 0, 0, 0),
       (0, 0, 1, 1, 0),
@@ -242,7 +247,8 @@ class SimilarityAnalysisSuite extends FunSuite with MahoutSuite with Distributed
     assert(llr == 2.6341457841558764) // value calculated by hadoop itemsimilairty
   }
 
-  test("downsampling by number per row") {
+//  test("downsampling by number per row") {
+  ignore("downsampling by number per row") {
     val a = dense(
         (1, 1, 1, 1, 0),
         (1, 1, 1, 1, 1),
diff --git a/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala b/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala
index 628d981c1d..0224cee23d 100644
--- a/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala
+++ b/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala
@@ -114,7 +114,8 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
       "--filterColumn", "1"))
   */
 
-  test("ItemSimilarityDriver, non-full-spec CSV") {
+  ignore("ItemSimilarityDriver, non-full-spec CSV") {
+//  test("ItemSimilarityDriver, non-full-spec CSV") {
 
     val InFile = TmpDir + "in-file.csv/" //using part files, not single file
     val OutPath = TmpDir + "similarity-matrices/"
@@ -168,7 +169,8 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
 
 
-  test("ItemSimilarityDriver TSV ") {
+//  test("ItemSimilarityDriver TSV ") {
+  ignore("ItemSimilarityDriver TSV ") {
 
     val InFile = TmpDir + "in-file.tsv/"
     val OutPath = TmpDir + "similarity-matrices/"
@@ -220,7 +222,8 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
   }
 
-  test("ItemSimilarityDriver log-ish files") {
+//  test("ItemSimilarityDriver log-ish files") {
+  ignore("ItemSimilarityDriver log-ish files") {
 
     val InFile = TmpDir + "in-file.log/"
     val OutPath = TmpDir + "similarity-matrices/"
@@ -271,7 +274,8 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
   }
 
-  test("ItemSimilarityDriver legacy supported file format") {
+//  test("ItemSimilarityDriver legacy supported file format") {
+  ignore("ItemSimilarityDriver legacy supported file format") {
 
     val InDir = TmpDir + "in-dir/"
     val InFilename = "in-file.tsv"
@@ -314,7 +318,8 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
   }
 
-  test("ItemSimilarityDriver write search engine output") {
+//  test("ItemSimilarityDriver write search engine output") {
+  ignore("ItemSimilarityDriver write search engine output") {
 
     val InDir = TmpDir + "in-dir/"
     val InFilename = "in-file.tsv"
@@ -358,7 +363,8 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
   }
 
-  test("ItemSimilarityDriver recursive file discovery using filename patterns") {
+//  test("ItemSimilarityDriver recursive file discovery using filename patterns") {
+  ignore("ItemSimilarityDriver recursive file discovery using filename patterns") {
     //directory structure using the following
     // tmp/data/m1.tsv
     // tmp/data/more-data/another-dir/m2.tsv
@@ -433,7 +439,8 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
   }
 
-  test("ItemSimilarityDriver, two input paths") {
+//  test("ItemSimilarityDriver, two input paths") {
+  ignore("ItemSimilarityDriver, two input paths") {
 
     val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file
     val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file
@@ -486,7 +493,8 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
   }
 
-  test("ItemSimilarityDriver, two inputs of different dimensions") {
+//  test("ItemSimilarityDriver, two inputs of different dimensions") {
+  ignore("ItemSimilarityDriver, two inputs of different dimensions") {
 
     val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file
     val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file
@@ -553,7 +561,8 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
   }
 
-  test("ItemSimilarityDriver cross similarity two separate items spaces") {
+//  test("ItemSimilarityDriver cross similarity two separate items spaces") {
+  ignore("ItemSimilarityDriver cross similarity two separate items spaces") {
     /* cross-similarity with category views, same user space
             	phones	tablets	mobile_acc	soap
           u1	0	      1	      1	          0
@@ -617,7 +626,8 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
   }
 
-  test("A.t %*% B after changing row cardinality of A") {
+//  test("A.t %*% B after changing row cardinality of A") {
+  ignore("A.t %*% B after changing row cardinality of A") {
     // todo: move to math tests but this is Spark specific
 
     val a = dense(
@@ -648,7 +658,8 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
     val bp = 0
   }
 
-  test("Changing row cardinality of an IndexedDataset") {
+//  test("Changing row cardinality of an IndexedDataset") {
+  ignore("Changing row cardinality of an IndexedDataset") {
 
     val a = dense(
       (1.0, 1.0))
@@ -662,7 +673,8 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
   }
 
-  test("ItemSimilarityDriver cross similarity two separate items spaces, missing rows in B") {
+//  test("ItemSimilarityDriver cross similarity two separate items spaces, missing rows in B") {
+  ignore("ItemSimilarityDriver cross similarity two separate items spaces, missing rows in B") {
     /* cross-similarity with category views, same user space
             	phones	tablets	mobile_acc	soap
             u1	0	      1	      1	          0
diff --git a/spark/src/test/scala/org/apache/mahout/sparkbindings/SparkBindingsSuite.scala b/spark/src/test/scala/org/apache/mahout/sparkbindings/SparkBindingsSuite.scala
index fbc31f37fa..529d13c256 100644
--- a/spark/src/test/scala/org/apache/mahout/sparkbindings/SparkBindingsSuite.scala
+++ b/spark/src/test/scala/org/apache/mahout/sparkbindings/SparkBindingsSuite.scala
@@ -1,10 +1,12 @@
 package org.apache.mahout.sparkbindings
 
-import org.scalatest.FunSuite
+import java.io.{Closeable, File}
 import java.util
-import java.io.{File, Closeable}
-import org.apache.mahout.common.IOUtils
+
 import org.apache.mahout.sparkbindings.test.DistributedSparkSuite
+import org.apache.mahout.util.IOUtilsScala
+import org.scalatest.FunSuite
+import collection._
 
 /**
  * @author dmitriy
@@ -16,7 +18,7 @@ class SparkBindingsSuite extends FunSuite with DistributedSparkSuite {
   // let it to be ignored.
   ignore("context jars") {
     System.setProperty("mahout.home", new File("..").getAbsolutePath/*"/home/dmitriy/projects/github/mahout-commits"*/)
-    val closeables = new util.ArrayDeque[Closeable]()
+    val closeables = new mutable.ListBuffer[Closeable]()
     try {
       val mahoutJars = findMahoutContextJars(closeables)
       mahoutJars.foreach {
@@ -26,7 +28,7 @@ class SparkBindingsSuite extends FunSuite with DistributedSparkSuite {
       mahoutJars.size should be > 0
       mahoutJars.size shouldBe 4
     } finally {
-      IOUtils.close(closeables)
+      IOUtilsScala.close(closeables)
     }
 
   }
diff --git a/spark/src/test/scala/org/apache/mahout/sparkbindings/blas/BlasSuite.scala b/spark/src/test/scala/org/apache/mahout/sparkbindings/blas/BlasSuite.scala
index 1521cb8085..8c8ac3f2fd 100644
--- a/spark/src/test/scala/org/apache/mahout/sparkbindings/blas/BlasSuite.scala
+++ b/spark/src/test/scala/org/apache/mahout/sparkbindings/blas/BlasSuite.scala
@@ -26,7 +26,7 @@ import scalabindings._
 import RLikeOps._
 import drm._
 import org.apache.mahout.sparkbindings._
-import org.apache.mahout.sparkbindings.drm.CheckpointedDrmSpark
+import org.apache.mahout.sparkbindings.drm._
 import org.apache.mahout.math.drm.logical.{OpAt, OpAtA, OpAewB, OpABt}
 import org.apache.mahout.sparkbindings.test.DistributedSparkSuite
 
@@ -142,7 +142,7 @@ class BlasSuite extends FunSuite with DistributedSparkSuite {
     val drmA = drmParallelize(m = inCoreA, numPartitions = 2)
 
     val op = new OpAt(drmA)
-    val drmAt = new CheckpointedDrmSpark(rdd = At.at(op, srcA = drmA), _nrow = op.nrow, _ncol = op.ncol)
+    val drmAt = new CheckpointedDrmSpark(rddInput = At.at(op, srcA = drmA), _nrow = op.nrow, _ncol = op.ncol)
     val inCoreAt = drmAt.collect
     val inCoreControlAt = inCoreA.t
 
diff --git a/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/DrmLikeOpsSuite.scala b/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/DrmLikeOpsSuite.scala
index 42026ae81c..72416606a7 100644
--- a/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/DrmLikeOpsSuite.scala
+++ b/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/DrmLikeOpsSuite.scala
@@ -23,13 +23,14 @@ import drm._
 import RLikeOps._
 import RLikeDrmOps._
 import org.apache.mahout.sparkbindings._
-import org.scalatest.FunSuite
+import org.scalatest.{ConfigMap, BeforeAndAfterAllConfigMap, FunSuite}
 import org.apache.mahout.sparkbindings.test.DistributedSparkSuite
 
+import scala.reflect.ClassTag
+
 /** Tests for DrmLikeOps */
 class DrmLikeOpsSuite extends FunSuite with DistributedSparkSuite with DrmLikeOpsSuiteBase {
 
-
   test("exact, min and auto ||") {
     val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6))
     val A = drmParallelize(m = inCoreA, numPartitions = 2)
@@ -39,18 +40,20 @@ class DrmLikeOpsSuite extends FunSuite with DistributedSparkSuite with DrmLikeOp
     (A + 1.0).par(exact = 4).rdd.partitions.size should equal(4)
     A.par(exact = 2).rdd.partitions.size should equal(2)
     A.par(exact = 1).rdd.partitions.size should equal(1)
-    A.par(exact = 0).rdd.partitions.size should equal(2) // No effect for par <= 0
+
     A.par(min = 4).rdd.partitions.size should equal(4)
     A.par(min = 2).rdd.partitions.size should equal(2)
     A.par(min = 1).rdd.partitions.size should equal(2)
     A.par(auto = true).rdd.partitions.size should equal(10)
     A.par(exact = 10).par(auto = true).rdd.partitions.size should equal(10)
     A.par(exact = 11).par(auto = true).rdd.partitions.size should equal(19)
-    A.par(exact = 20).par(auto = true).rdd.partitions.size should equal(20)
+    A.par(exact = 20).par(auto = true).rdd.partitions.size should equal(19)
+
+    A.keyClassTag shouldBe ClassTag.Int
+    A.par(auto = true).keyClassTag shouldBe ClassTag.Int
 
-    intercept[AssertionError] {
-      A.par()
-    }
+    an[IllegalArgumentException] shouldBe thrownBy {A.par(exact = 0)}
+    an[IllegalArgumentException] shouldBe thrownBy {A.par()}
   }
 
 }
diff --git a/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/RLikeDrmOpsSuite.scala b/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/RLikeDrmOpsSuite.scala
index 2a4f21352a..f422f86c40 100644
--- a/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/RLikeDrmOpsSuite.scala
+++ b/spark/src/test/scala/org/apache/mahout/sparkbindings/drm/RLikeDrmOpsSuite.scala
@@ -25,10 +25,16 @@ import drm._
 import org.apache.mahout.sparkbindings._
 import RLikeDrmOps._
 import test.DistributedSparkSuite
+import org.apache.mahout.math.drm.logical.{OpAtB, OpAewUnaryFuncFusion}
+import org.apache.mahout.logging._
+
+import scala.util.Random
 
 /** ==R-like DRM DSL operation tests -- Spark== */
 class RLikeDrmOpsSuite extends FunSuite with DistributedSparkSuite with RLikeDrmOpsSuiteBase {
 
+  private final implicit val log = getLog(classOf[RLikeDrmOpsSuite])
+
   test("C = A + B missing rows") {
     val sc = mahoutCtx.asInstanceOf[SparkDistributedContext].sc
 
@@ -113,4 +119,61 @@ class RLikeDrmOpsSuite extends FunSuite with DistributedSparkSuite with RLikeDrm
 
   }
 
+  test("A'B, bigger") {
+
+    val rnd = new Random()
+    val a = new SparseRowMatrix(200, 1544) := { _ => rnd.nextGaussian() }
+    val b = new SparseRowMatrix(200, 300) := { _ => rnd.nextGaussian() }
+
+    var ms = System.currentTimeMillis()
+    val atb = a.t %*% b
+    ms = System.currentTimeMillis() - ms
+
+    println(s"in-core mul ms: $ms")
+
+    val drmA = drmParallelize(a, numPartitions = 2)
+    val drmB = drmParallelize(b, numPartitions = 2)
+
+    ms = System.currentTimeMillis()
+    val drmAtB = drmA.t %*% drmB
+    val mxAtB = drmAtB.collect
+    ms = System.currentTimeMillis() - ms
+
+    println(s"a'b plan:${drmAtB.context.engine.optimizerRewrite(drmAtB)}")
+    println(s"a'b plan contains ${drmAtB.rdd.partitions.size} partitions.")
+    println(s"distributed mul ms: $ms.")
+
+    (atb - mxAtB).norm should be < 1e-5
+
+  }
+
+  test("C = At %*% B , zippable") {
+
+    val mxA = dense((1, 2), (3, 4), (-3, -5))
+
+    val A = drmParallelize(mxA, numPartitions = 2)
+      .mapBlock()({
+      case (keys, block) => keys.map(_.toString) -> block
+    })
+
+    val B = (A + 1.0)
+
+      .mapBlock() { case (keys, block) ⇒
+      val nblock = new SparseRowMatrix(block.nrow, block.ncol) := block
+      keys → nblock
+    }
+
+    B.collect
+
+    val C = A.t %*% B
+
+    mahoutCtx.optimizerRewrite(C) should equal(OpAtB[String](A, B))
+
+    val inCoreC = C.collect
+    val inCoreControlC = mxA.t %*% (mxA + 1.0)
+
+    (inCoreC - inCoreControlC).norm should be < 1E-10
+
+  }
+
 }
diff --git a/spark/src/test/scala/org/apache/mahout/sparkbindings/io/IOSuite.scala b/spark/src/test/scala/org/apache/mahout/sparkbindings/io/IOSuite.scala
new file mode 100644
index 0000000000..f3a972136a
--- /dev/null
+++ b/spark/src/test/scala/org/apache/mahout/sparkbindings/io/IOSuite.scala
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.sparkbindings.io
+
+import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
+
+import com.esotericsoftware.kryo.Kryo
+import com.esotericsoftware.kryo.io.{Input, Output}
+import com.twitter.chill.AllScalaRegistrar
+import org.apache.mahout.math._
+import scalabindings._
+import RLikeOps._
+
+import org.apache.mahout.common.RandomUtils
+import org.apache.mahout.test.MahoutSuite
+import org.scalatest.FunSuite
+
+import scala.util.Random
+
+class IOSuite extends FunSuite with MahoutSuite {
+
+  import IOSuite._
+
+  test("Dense vector kryo") {
+
+    val rnd = RandomUtils.getRandom
+    val vec = new DenseVector(165) := { _ => rnd.nextDouble()}
+
+    val ret = kryoClone(vec, vec, vec)
+    val vec2 = ret(2)
+
+    println(s"vec=$vec\nvc2=$vec2")
+
+    vec2 === vec shouldBe true
+    vec2.isInstanceOf[DenseVector] shouldBe true
+  }
+
+  test("Random sparse vector kryo") {
+
+    val rnd = RandomUtils.getRandom
+    val vec = new RandomAccessSparseVector(165) := { _ => if (rnd.nextDouble() < 0.3) rnd.nextDouble() else 0}
+    val vec1 = new RandomAccessSparseVector(165)
+    vec1(2) = 2
+    vec1(3) = 4
+    vec1(3) = 0
+    vec1(10) = 30
+
+    val ret = kryoClone(vec, vec1, vec)
+    val (vec2, vec3) = (ret(2), ret(1))
+
+    println(s"vec=$vec\nvc2=$vec2")
+
+    vec2 === vec shouldBe true
+    vec1 === vec3 shouldBe true
+    vec2.isInstanceOf[RandomAccessSparseVector] shouldBe true
+
+  }
+
+  test("100% sparse vectors") {
+
+    val vec1 = new SequentialAccessSparseVector(10)
+    val vec2 = new RandomAccessSparseVector(6)
+    val ret = kryoClone(vec1, vec2, vec1, vec2)
+    val vec3 = ret(2)
+    val vec4 = ret(3)
+
+    vec1 === vec3 shouldBe true
+    vec2 === vec4 shouldBe true
+  }
+
+  test("Sequential sparse vector kryo") {
+
+    val rnd = RandomUtils.getRandom
+    val vec = new SequentialAccessSparseVector(165) := { _ => if (rnd.nextDouble() < 0.3) rnd.nextDouble() else 0}
+
+    val vec1 = new SequentialAccessSparseVector(165)
+    vec1(2) = 0
+    vec1(3) = 3
+    vec1(4) = 2
+    vec1(3) = 0
+
+    val ret = kryoClone(vec, vec1, vec)
+    val (vec2, vec3) = (ret(2), ret(1))
+
+    println(s"vec=$vec\nvc2=$vec2")
+
+    vec2 === vec shouldBe true
+    vec1 === vec3 shouldBe true
+    vec2.isInstanceOf[SequentialAccessSparseVector] shouldBe true
+  }
+
+  test("kryo matrix tests") {
+    val rnd = new Random()
+
+    val mxA = new DenseMatrix(140, 150) := { _ => rnd.nextDouble()}
+
+    val mxB = new SparseRowMatrix(140, 150) := { _ => if (rnd.nextDouble() < .3) rnd.nextDouble() else 0.0}
+
+    val mxC = new SparseMatrix(140, 150)
+    for (i <- 0 until mxC.nrow) if (rnd.nextDouble() < .3)
+      mxC(i, ::) := { _ => if (rnd.nextDouble() < .3) rnd.nextDouble() else 0.0}
+
+    val cnsl = mxC.numSlices()
+    println(s"Number of slices in mxC: ${cnsl}")
+
+    val ret = kryoClone(mxA, mxA.t, mxB, mxB.t, mxC, mxC.t, mxA)
+
+    val (mxAA, mxAAt, mxBB, mxBBt, mxCC, mxCCt, mxAAA) = (ret(0), ret(1), ret(2), ret(3), ret(4), ret(5), ret(6))
+
+    // ret.size shouldBe 7
+
+    mxA === mxAA shouldBe true
+    mxA === mxAAA shouldBe true
+    mxA === mxAAt.t shouldBe true
+    mxAA.isInstanceOf[DenseMatrix] shouldBe true
+    mxAAt.isInstanceOf[DenseMatrix] shouldBe false
+
+
+    mxB === mxBB shouldBe true
+    mxB === mxBBt.t shouldBe true
+    mxBB.isInstanceOf[SparseRowMatrix] shouldBe true
+    mxBBt.isInstanceOf[SparseRowMatrix] shouldBe false
+    mxBB(0,::).isDense shouldBe false
+
+
+    // Assert no persistence operation increased slice sparsity
+    mxC.numSlices() shouldBe cnsl
+
+    // Assert deserialized product did not experience any empty slice inflation
+    mxCC.numSlices() shouldBe cnsl
+    mxCCt.t.numSlices() shouldBe cnsl
+
+    // Incidentally, but not very significantly, iterating thru all rows that happens in equivalence
+    // operator, inserts empty rows into SparseMatrix so these asserts should not be before numSlices
+    // asserts.
+    mxC === mxCC shouldBe true
+    mxC === mxCCt.t shouldBe true
+    mxCCt.t.isInstanceOf[SparseMatrix] shouldBe true
+
+    // Column-wise sparse matrix are deprecated and should be explicitly rejected by serializer.
+    an[IllegalArgumentException] should be thrownBy {
+      val mxDeprecated = new SparseColumnMatrix(14, 15)
+      kryoClone(mxDeprecated)
+    }
+
+  }
+
+  test("diag matrix") {
+
+    val mxD = diagv(dvec(1, 2, 3, 5))
+    val mxDD = kryoClone(mxD)(0)
+    mxD === mxDD shouldBe true
+    mxDD.isInstanceOf[DiagonalMatrix] shouldBe true
+
+  }
+}
+
+object IOSuite {
+
+  def kryoClone[T](obj: T*): Seq[T] = {
+
+    val kryo = new Kryo()
+    new AllScalaRegistrar()(kryo)
+
+    MahoutKryoRegistrator.registerClasses(kryo)
+
+    val baos = new ByteArrayOutputStream()
+    val output = new Output(baos)
+    obj.foreach(kryo.writeClassAndObject(output, _))
+    output.close
+
+    val input = new Input(new ByteArrayInputStream(baos.toByteArray))
+
+    def outStream: Stream[T] =
+      if (input.eof) Stream.empty
+      else kryo.readClassAndObject(input).asInstanceOf[T] #:: outStream
+
+    outStream
+  }
+}
diff --git a/spark/src/test/scala/org/apache/mahout/sparkbindings/test/DistributedSparkSuite.scala b/spark/src/test/scala/org/apache/mahout/sparkbindings/test/DistributedSparkSuite.scala
index f18ec705ca..2e517ac99f 100644
--- a/spark/src/test/scala/org/apache/mahout/sparkbindings/test/DistributedSparkSuite.scala
+++ b/spark/src/test/scala/org/apache/mahout/sparkbindings/test/DistributedSparkSuite.scala
@@ -17,11 +17,13 @@
 
 package org.apache.mahout.sparkbindings.test
 
+import org.apache.log4j.{Level, Logger}
 import org.scalatest.{ConfigMap, BeforeAndAfterAllConfigMap, Suite}
 import org.apache.spark.SparkConf
 import org.apache.mahout.sparkbindings._
 import org.apache.mahout.test.{DistributedMahoutSuite, MahoutSuite}
 import org.apache.mahout.math.drm.DistributedContext
+import collection.JavaConversions._
 
 trait DistributedSparkSuite extends DistributedMahoutSuite with LoggerConfiguration {
   this: Suite =>
@@ -30,16 +32,21 @@ trait DistributedSparkSuite extends DistributedMahoutSuite with LoggerConfigurat
   protected var masterUrl = null.asInstanceOf[String]
 
   protected def initContext() {
-    masterUrl = "local[3]"
+    masterUrl = System.getProperties.getOrElse("test.spark.master", "local[3]")
+    val isLocal = masterUrl.startsWith("local")
     mahoutCtx = mahoutSparkContext(masterUrl = this.masterUrl,
-      appName = "MahoutLocalContext",
+      appName = "MahoutUnitTests",
       // Do not run MAHOUT_HOME jars in unit tests.
-      addMahoutJars = false,
+      addMahoutJars = !isLocal,
       sparkConf = new SparkConf()
-          .set("spark.kryoserializer.buffer.mb", "15")
+          .set("spark.kryoserializer.buffer.mb", "40")
           .set("spark.akka.frameSize", "30")
           .set("spark.default.parallelism", "10")
+          .set("spark.executor.memory", "2G")
     )
+    // Spark reconfigures logging. Clamp down on it in tests.
+    Logger.getRootLogger.setLevel(Level.ERROR)
+    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
   }
 
   protected def resetContext() {
@@ -62,6 +69,11 @@ trait DistributedSparkSuite extends DistributedMahoutSuite with LoggerConfigurat
     resetContext()
   }
 
+  override protected def afterAll(configMap: ConfigMap): Unit = {
+    super.afterAll(configMap)
+    resetContext()
+  }
+
   override protected def beforeAll(configMap: ConfigMap): Unit = {
     super.beforeAll(configMap)
     initContext()
diff --git a/spark/src/test/scala/org/apache/mahout/sparkbindings/test/LoggerConfiguration.scala b/spark/src/test/scala/org/apache/mahout/sparkbindings/test/LoggerConfiguration.scala
index e48e7c724e..2a996d7a9d 100644
--- a/spark/src/test/scala/org/apache/mahout/sparkbindings/test/LoggerConfiguration.scala
+++ b/spark/src/test/scala/org/apache/mahout/sparkbindings/test/LoggerConfiguration.scala
@@ -25,6 +25,6 @@ trait LoggerConfiguration extends org.apache.mahout.test.LoggerConfiguration {
 
   override protected def beforeAll(configMap: ConfigMap) {
     super.beforeAll(configMap)
-    Logger.getLogger("org.apache.mahout.sparkbindings").setLevel(Level.INFO)
+    BasicConfigurator.resetConfiguration()
   }
 }

From 3422046b94c03d43a91f091e38532339cf890351 Mon Sep 17 00:00:00 2001
From: Dmitriy Lyubimov <dlyubimov@apache.org>
Date: Mon, 1 Jun 2015 18:13:03 -0700
Subject: [PATCH 2/9] Adding missing change. uncommenting performance in-core
 tests.

---
 .../scalabindings/RLikeMatrixOpsSuite.scala   | 166 +++++++++---------
 .../mahout/math/FunctionalMatrixView.java     |   4 +
 2 files changed, 87 insertions(+), 83 deletions(-)

diff --git a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala
index 0f81145e95..79d28995dc 100644
--- a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala
+++ b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOpsSuite.scala
@@ -132,89 +132,89 @@ class RLikeMatrixOpsSuite extends FunSuite with MahoutSuite {
     }
 
     // Dense matrix tests.
-//    println(s"Ad %*% Bd: ${getMmulAvgs(mxAd, mxBd, n)}")
-//    println(s"Ad' %*% Bd: ${getMmulAvgs(mxAd.t, mxBd, n)}")
-//    println(s"Ad %*% Bd': ${getMmulAvgs(mxAd, mxBd.t, n)}")
-//    println(s"Ad' %*% Bd': ${getMmulAvgs(mxAd.t, mxBd.t, n)}")
-//    println(s"Ad'' %*% Bd'': ${getMmulAvgs(mxAd.t.t, mxBd.t.t, n)}")
-//    println
-//
-//    // Sparse row matrix tests.
-//    println(s"Asr %*% Bsr: ${getMmulAvgs(mxAsr, mxBsr, n)}")
-//    println(s"Asr' %*% Bsr: ${getMmulAvgs(mxAsr.t, mxBsr, n)}")
-//    println(s"Asr %*% Bsr': ${getMmulAvgs(mxAsr, mxBsr.t, n)}")
-//    println(s"Asr' %*% Bsr': ${getMmulAvgs(mxAsr.t, mxBsr.t, n)}")
-//    println(s"Asr'' %*% Bsr'': ${getMmulAvgs(mxAsr.t.t, mxBsr.t.t, n)}")
-//    println
-//
-//    // Sparse matrix tests.
-//    println(s"Asm %*% Bsm: ${getMmulAvgs(mxAs, mxBs, n)}")
-//    println(s"Asm' %*% Bsm: ${getMmulAvgs(mxAs.t, mxBs, n)}")
-//    println(s"Asm %*% Bsm': ${getMmulAvgs(mxAs, mxBs.t, n)}")
-//    println(s"Asm' %*% Bsm': ${getMmulAvgs(mxAs.t, mxBs.t, n)}")
-//    println(s"Asm'' %*% Bsm'': ${getMmulAvgs(mxAs.t.t, mxBs.t.t, n)}")
-//    println
-//
-//    // Mixed sparse matrix tests.
-//    println(s"Asm %*% Bsr: ${getMmulAvgs(mxAs, mxBsr, n)}")
-//    println(s"Asm' %*% Bsr: ${getMmulAvgs(mxAs.t, mxBsr, n)}")
-//    println(s"Asm %*% Bsr': ${getMmulAvgs(mxAs, mxBsr.t, n)}")
-//    println(s"Asm' %*% Bsr': ${getMmulAvgs(mxAs.t, mxBsr.t, n)}")
-//    println(s"Asm'' %*% Bsr'': ${getMmulAvgs(mxAs.t.t, mxBsr.t.t, n)}")
-//    println
-//
-//    println(s"Asr %*% Bsm: ${getMmulAvgs(mxAsr, mxBs, n)}")
-//    println(s"Asr' %*% Bsm: ${getMmulAvgs(mxAsr.t, mxBs, n)}")
-//    println(s"Asr %*% Bsm': ${getMmulAvgs(mxAsr, mxBs.t, n)}")
-//    println(s"Asr' %*% Bsm': ${getMmulAvgs(mxAsr.t, mxBs.t, n)}")
-//    println(s"Asr'' %*% Bsm'': ${getMmulAvgs(mxAsr.t.t, mxBs.t.t, n)}")
-//    println
-//
-//    // Mixed dense/sparse
-//    println(s"Ad %*% Bsr: ${getMmulAvgs(mxAd, mxBsr, n)}")
-//    println(s"Ad' %*% Bsr: ${getMmulAvgs(mxAd.t, mxBsr, n)}")
-//    println(s"Ad %*% Bsr': ${getMmulAvgs(mxAd, mxBsr.t, n)}")
-//    println(s"Ad' %*% Bsr': ${getMmulAvgs(mxAd.t, mxBsr.t, n)}")
-//    println(s"Ad'' %*% Bsr'': ${getMmulAvgs(mxAd.t.t, mxBsr.t.t, n)}")
-//    println
-//
-//    println(s"Asr %*% Bd: ${getMmulAvgs(mxAsr, mxBd, n)}")
-//    println(s"Asr' %*% Bd: ${getMmulAvgs(mxAsr.t, mxBd, n)}")
-//    println(s"Asr %*% Bd': ${getMmulAvgs(mxAsr, mxBd.t, n)}")
-//    println(s"Asr' %*% Bd': ${getMmulAvgs(mxAsr.t, mxBd.t, n)}")
-//    println(s"Asr'' %*% Bd'': ${getMmulAvgs(mxAsr.t.t, mxBd.t.t, n)}")
-//    println
-//
-//    println(s"Ad %*% Bsm: ${getMmulAvgs(mxAd, mxBs, n)}")
-//    println(s"Ad' %*% Bsm: ${getMmulAvgs(mxAd.t, mxBs, n)}")
-//    println(s"Ad %*% Bsm': ${getMmulAvgs(mxAd, mxBs.t, n)}")
-//    println(s"Ad' %*% Bsm': ${getMmulAvgs(mxAd.t, mxBs.t, n)}")
-//    println(s"Ad'' %*% Bsm'': ${getMmulAvgs(mxAd.t.t, mxBs.t.t, n)}")
-//    println
-//
-//    println(s"Asm %*% Bd: ${getMmulAvgs(mxAs, mxBd, n)}")
-//    println(s"Asm' %*% Bd: ${getMmulAvgs(mxAs.t, mxBd, n)}")
-//    println(s"Asm %*% Bd': ${getMmulAvgs(mxAs, mxBd.t, n)}")
-//    println(s"Asm' %*% Bd': ${getMmulAvgs(mxAs.t, mxBd.t, n)}")
-//    println(s"Asm'' %*% Bd'': ${getMmulAvgs(mxAs.t.t, mxBd.t.t, n)}")
-//    println
-//
-//    // Diagonal cases
-//    println(s"Ad %*% D: ${getMmulAvgs(mxAd, mxD, n)}")
-//    println(s"Asr %*% D: ${getMmulAvgs(mxAsr, mxD, n)}")
-//    println(s"Asm %*% D: ${getMmulAvgs(mxAs, mxD, n)}")
-//    println(s"D %*% Ad: ${getMmulAvgs(mxD, mxAd, n)}")
-//    println(s"D %*% Asr: ${getMmulAvgs(mxD, mxAsr, n)}")
-//    println(s"D %*% Asm: ${getMmulAvgs(mxD, mxAs, n)}")
-//    println
-//
-//    println(s"Ad' %*% D: ${getMmulAvgs(mxAd.t, mxD, n)}")
-//    println(s"Asr' %*% D: ${getMmulAvgs(mxAsr.t, mxD, n)}")
-//    println(s"Asm' %*% D: ${getMmulAvgs(mxAs.t, mxD, n)}")
-//    println(s"D %*% Ad': ${getMmulAvgs(mxD, mxAd.t, n)}")
-//    println(s"D %*% Asr': ${getMmulAvgs(mxD, mxAsr.t, n)}")
-//    println(s"D %*% Asm': ${getMmulAvgs(mxD, mxAs.t, n)}")
-//    println
+    println(s"Ad %*% Bd: ${getMmulAvgs(mxAd, mxBd, n)}")
+    println(s"Ad' %*% Bd: ${getMmulAvgs(mxAd.t, mxBd, n)}")
+    println(s"Ad %*% Bd': ${getMmulAvgs(mxAd, mxBd.t, n)}")
+    println(s"Ad' %*% Bd': ${getMmulAvgs(mxAd.t, mxBd.t, n)}")
+    println(s"Ad'' %*% Bd'': ${getMmulAvgs(mxAd.t.t, mxBd.t.t, n)}")
+    println
+
+    // Sparse row matrix tests.
+    println(s"Asr %*% Bsr: ${getMmulAvgs(mxAsr, mxBsr, n)}")
+    println(s"Asr' %*% Bsr: ${getMmulAvgs(mxAsr.t, mxBsr, n)}")
+    println(s"Asr %*% Bsr': ${getMmulAvgs(mxAsr, mxBsr.t, n)}")
+    println(s"Asr' %*% Bsr': ${getMmulAvgs(mxAsr.t, mxBsr.t, n)}")
+    println(s"Asr'' %*% Bsr'': ${getMmulAvgs(mxAsr.t.t, mxBsr.t.t, n)}")
+    println
+
+    // Sparse matrix tests.
+    println(s"Asm %*% Bsm: ${getMmulAvgs(mxAs, mxBs, n)}")
+    println(s"Asm' %*% Bsm: ${getMmulAvgs(mxAs.t, mxBs, n)}")
+    println(s"Asm %*% Bsm': ${getMmulAvgs(mxAs, mxBs.t, n)}")
+    println(s"Asm' %*% Bsm': ${getMmulAvgs(mxAs.t, mxBs.t, n)}")
+    println(s"Asm'' %*% Bsm'': ${getMmulAvgs(mxAs.t.t, mxBs.t.t, n)}")
+    println
+
+    // Mixed sparse matrix tests.
+    println(s"Asm %*% Bsr: ${getMmulAvgs(mxAs, mxBsr, n)}")
+    println(s"Asm' %*% Bsr: ${getMmulAvgs(mxAs.t, mxBsr, n)}")
+    println(s"Asm %*% Bsr': ${getMmulAvgs(mxAs, mxBsr.t, n)}")
+    println(s"Asm' %*% Bsr': ${getMmulAvgs(mxAs.t, mxBsr.t, n)}")
+    println(s"Asm'' %*% Bsr'': ${getMmulAvgs(mxAs.t.t, mxBsr.t.t, n)}")
+    println
+
+    println(s"Asr %*% Bsm: ${getMmulAvgs(mxAsr, mxBs, n)}")
+    println(s"Asr' %*% Bsm: ${getMmulAvgs(mxAsr.t, mxBs, n)}")
+    println(s"Asr %*% Bsm': ${getMmulAvgs(mxAsr, mxBs.t, n)}")
+    println(s"Asr' %*% Bsm': ${getMmulAvgs(mxAsr.t, mxBs.t, n)}")
+    println(s"Asr'' %*% Bsm'': ${getMmulAvgs(mxAsr.t.t, mxBs.t.t, n)}")
+    println
+
+    // Mixed dense/sparse
+    println(s"Ad %*% Bsr: ${getMmulAvgs(mxAd, mxBsr, n)}")
+    println(s"Ad' %*% Bsr: ${getMmulAvgs(mxAd.t, mxBsr, n)}")
+    println(s"Ad %*% Bsr': ${getMmulAvgs(mxAd, mxBsr.t, n)}")
+    println(s"Ad' %*% Bsr': ${getMmulAvgs(mxAd.t, mxBsr.t, n)}")
+    println(s"Ad'' %*% Bsr'': ${getMmulAvgs(mxAd.t.t, mxBsr.t.t, n)}")
+    println
+
+    println(s"Asr %*% Bd: ${getMmulAvgs(mxAsr, mxBd, n)}")
+    println(s"Asr' %*% Bd: ${getMmulAvgs(mxAsr.t, mxBd, n)}")
+    println(s"Asr %*% Bd': ${getMmulAvgs(mxAsr, mxBd.t, n)}")
+    println(s"Asr' %*% Bd': ${getMmulAvgs(mxAsr.t, mxBd.t, n)}")
+    println(s"Asr'' %*% Bd'': ${getMmulAvgs(mxAsr.t.t, mxBd.t.t, n)}")
+    println
+
+    println(s"Ad %*% Bsm: ${getMmulAvgs(mxAd, mxBs, n)}")
+    println(s"Ad' %*% Bsm: ${getMmulAvgs(mxAd.t, mxBs, n)}")
+    println(s"Ad %*% Bsm': ${getMmulAvgs(mxAd, mxBs.t, n)}")
+    println(s"Ad' %*% Bsm': ${getMmulAvgs(mxAd.t, mxBs.t, n)}")
+    println(s"Ad'' %*% Bsm'': ${getMmulAvgs(mxAd.t.t, mxBs.t.t, n)}")
+    println
+
+    println(s"Asm %*% Bd: ${getMmulAvgs(mxAs, mxBd, n)}")
+    println(s"Asm' %*% Bd: ${getMmulAvgs(mxAs.t, mxBd, n)}")
+    println(s"Asm %*% Bd': ${getMmulAvgs(mxAs, mxBd.t, n)}")
+    println(s"Asm' %*% Bd': ${getMmulAvgs(mxAs.t, mxBd.t, n)}")
+    println(s"Asm'' %*% Bd'': ${getMmulAvgs(mxAs.t.t, mxBd.t.t, n)}")
+    println
+
+    // Diagonal cases
+    println(s"Ad %*% D: ${getMmulAvgs(mxAd, mxD, n)}")
+    println(s"Asr %*% D: ${getMmulAvgs(mxAsr, mxD, n)}")
+    println(s"Asm %*% D: ${getMmulAvgs(mxAs, mxD, n)}")
+    println(s"D %*% Ad: ${getMmulAvgs(mxD, mxAd, n)}")
+    println(s"D %*% Asr: ${getMmulAvgs(mxD, mxAsr, n)}")
+    println(s"D %*% Asm: ${getMmulAvgs(mxD, mxAs, n)}")
+    println
+
+    println(s"Ad' %*% D: ${getMmulAvgs(mxAd.t, mxD, n)}")
+    println(s"Asr' %*% D: ${getMmulAvgs(mxAsr.t, mxD, n)}")
+    println(s"Asm' %*% D: ${getMmulAvgs(mxAs.t, mxD, n)}")
+    println(s"D %*% Ad': ${getMmulAvgs(mxD, mxAd.t, n)}")
+    println(s"D %*% Asr': ${getMmulAvgs(mxD, mxAsr.t, n)}")
+    println(s"D %*% Asm': ${getMmulAvgs(mxD, mxAs.t, n)}")
+    println
 
     // Self-squared cases
     println(s"Ad %*% Ad': ${getMmulAvgs(mxAd, mxAd.t, n)}")
diff --git a/math/src/main/java/org/apache/mahout/math/FunctionalMatrixView.java b/math/src/main/java/org/apache/mahout/math/FunctionalMatrixView.java
index 2944f566f7..9028e23577 100644
--- a/math/src/main/java/org/apache/mahout/math/FunctionalMatrixView.java
+++ b/math/src/main/java/org/apache/mahout/math/FunctionalMatrixView.java
@@ -92,4 +92,8 @@ public Vector viewColumn(int column) {
     return new MatrixVectorView(this, 0, column, 1, 0, denseLike);
   }
 
+  @Override
+  public MatrixFlavor getFlavor() {
+    return flavor;
+  }
 }

From 1b892de589bccf03c41c6b2e49493472e6bd1d52 Mon Sep 17 00:00:00 2001
From: Dmitriy Lyubimov <dlyubimov@apache.org>
Date: Mon, 1 Jun 2015 18:44:21 -0700
Subject: [PATCH 3/9] Picking up missing changes on both sides in spark module.
 TODO: Pat's similarity driver tests fail, seems, on some degenerate splitting
 in optimizer. Need to take a look.

---
 .../mahout/h2o/common/Hadoop1HDFSUtil.scala   |  63 ----------
 .../mahout/common/Hadoop1HDFSUtil.scala       |  28 +----
 .../mahout/sparkbindings/SparkEngine.scala    |   1 +
 .../mahout/sparkbindings/blas/AtA.scala       | 113 +++++++++++-------
 .../io/MahoutKryoRegistrator.scala            |   9 +-
 .../mahout/cf/SimilarityAnalysisSuite.scala   |  16 +--
 .../drivers/ItemSimilarityDriverSuite.scala   |  36 ++----
 .../test/DistributedSparkSuite.scala          |   5 -
 8 files changed, 94 insertions(+), 177 deletions(-)
 delete mode 100644 h2o/src/main/scala/org/apache/mahout/h2o/common/Hadoop1HDFSUtil.scala

diff --git a/h2o/src/main/scala/org/apache/mahout/h2o/common/Hadoop1HDFSUtil.scala b/h2o/src/main/scala/org/apache/mahout/h2o/common/Hadoop1HDFSUtil.scala
deleted file mode 100644
index a540cb1613..0000000000
--- a/h2o/src/main/scala/org/apache/mahout/h2o/common/Hadoop1HDFSUtil.scala
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.h2o.common
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.Path
-import org.apache.hadoop.io.{SequenceFile, Writable}
-
-/**
- * Deprecated Hadoop 1 api which we currently explicitly import via Mahout dependencies. May not work
- * with Hadoop 2.0
- */
-object Hadoop1HDFSUtil extends HDFSUtil {
-
-  
-  def readDrmHeader(path: String): DrmMetadata = {
-    val dfsPath = new Path(path)
-    val fs = dfsPath.getFileSystem(new Configuration())
-
-    val partFilePath:Path = fs.listStatus(dfsPath)
-
-        // Filter out anything starting with .
-        .filter { s => !s.getPath.getName.startsWith("\\.") && !s.getPath.getName.startsWith("_") && !s.isDir }
-
-        // Take path
-        .map(_.getPath)
-
-        // Take only one, if any
-        .headOption
-
-        // Require there's at least one partition file found.
-        .getOrElse {
-      throw new IllegalArgumentException(s"No partition files found in ${dfsPath.toString}.")
-    }
-
-    val reader = new SequenceFile.Reader(fs, partFilePath, fs.getConf)
-    try {
-      new DrmMetadata(
-        keyTypeWritable = reader.getKeyClass.asSubclass(classOf[Writable]),
-        valueTypeWritable = reader.getValueClass.asSubclass(classOf[Writable])
-      )
-    } finally {
-      reader.close()
-    }
-
-  }
-
-}
diff --git a/spark/src/main/scala/org/apache/mahout/common/Hadoop1HDFSUtil.scala b/spark/src/main/scala/org/apache/mahout/common/Hadoop1HDFSUtil.scala
index d4483341e0..399508dbc7 100644
--- a/spark/src/main/scala/org/apache/mahout/common/Hadoop1HDFSUtil.scala
+++ b/spark/src/main/scala/org/apache/mahout/common/Hadoop1HDFSUtil.scala
@@ -15,26 +15,8 @@
  * limitations under the License.
  */
 
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
 package org.apache.mahout.common
 
-
 import org.apache.hadoop.io.{Writable, SequenceFile}
 import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.hadoop.conf.Configuration
@@ -48,12 +30,10 @@ import JavaConversions._
  */
 object Hadoop1HDFSUtil extends HDFSUtil {
 
-  /**
-   * Read the header of a sequence file and determine the Key and Value type
-   * @param path
-   * @return
-   */
-  def readDrmHeader(path: String): DrmMetadata = {
+
+  /** Read DRM header information off (H)DFS. */
+  override def readDrmHeader(path: String)(implicit sc: SparkContext): DrmMetadata = {
+
     val dfsPath = new Path(path)
 
     val fs = dfsPath.getFileSystem(sc.hadoopConfiguration)
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/SparkEngine.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/SparkEngine.scala
index 395f67474b..41e966bc38 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/SparkEngine.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/SparkEngine.scala
@@ -28,6 +28,7 @@ import org.apache.mahout.sparkbindings.drm.{cpDrmGeneric2DrmRddInput, Checkpoint
 import org.apache.mahout.math._
 import scala.Predef
 import scala.reflect.ClassTag
+import scala.reflect.classTag
 import org.apache.spark.storage.StorageLevel
 import org.apache.mahout.sparkbindings.blas._
 import org.apache.hadoop.io._
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtA.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtA.scala
index c2ffb7084e..a212878851 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtA.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtA.scala
@@ -185,57 +185,86 @@ object AtA {
     // dense in reality than the source.
     val m = op.A.nrow
     val n = op.A.ncol
-/* possible fix for index out of range for vector range
-    val numParts = (srcRdd.partitions.size.toDouble * n / m).ceil.round.toInt max 1
+    val numParts = (srcRdd.partitions.size.toDouble * n / m).ceil.toInt max 1
     val blockHeight = (n - 1) / numParts + 1
-*/
-    val numParts = (srcRdd.partitions.size.toDouble * n / m).ceil.round.toInt max 1 min n
+    val offsets = (0 until numParts).map(_ * blockHeight)
+    val ranges = offsets.map(offset => offset until (offset + blockHeight min n))
 
-    // Computing evenly split ranges to denote each partition size.
+    val rddAtA = srcRdd
 
-    // Base size.
-    val baseSize = n / numParts
+      // Remove key, key is irrelevant
+      .map(_._2)
 
-    // How many partitions needs to be baseSize +1.
-    val slack = n - baseSize * numParts
+      // Form partial outer blocks for each partition
+      .flatMap { v =>
+      for (blockKey <- 0 until numParts) yield {
+        blockKey ->(blockKey, v)
+      }
+    }
+      // Combine outer products
+      .combineByKey(// Combiner factory
+        createCombiner = (t: (Int, Vector)) => {
+          val partNo = t._1
+          val vec = t._2
+          val range = ranges(partNo)
+          val mxC = if (vec.isDense) new DenseMatrix(range.size, n) else new SparseRowMatrix(range.size, n)
+          addOuterProduct(mxC, vec(range), vec)
+        },
+
+        // Merge values into existing partition accumulator.
+        mergeValue = (mxC: Matrix, t: (Int, Vector)) => {
+          val partNo = t._1
+          val vec = t._2
+          addOuterProduct(mxC, vec(ranges(partNo)), vec)
+        },
+
+        // Merge combiners
+        mergeCombiners = (mxC1: Matrix, mxC2: Matrix) => mxC1 += mxC2, numPartitions = numParts)
+
+      // Restore proper block keys
+      .map { case (blockKey, block) =>
+      val blockStart = blockKey * blockHeight
+      val rowKeys = Array.tabulate(block.nrow)(blockStart + _)
+      rowKeys -> block
+    }
 
-    val ranges =
-      // Start with partition offsets... total numParts + 1.
-      (0 to numParts).view.map { i => (baseSize + 1) * i - (i - slack max 0)}
-        // And convert offsets to ranges.
-        .sliding(2).map(s => s(0) until s(1)).toIndexedSeq
+    if (log.isDebugEnabled)
+      log.debug(s"AtA #parts: ${rddAtA.partitions.size}.")
 
-    val rddAtA = srcRdd
+    rddAtA
+  }
 
-      // Remove key, key is irrelevant
-      .map(_._2)
+  /**
+   * The version of A'A that does not use GraphX. Tries to use blockwise matrix multiply. If an
+   * accelerated matrix back is available, this might be somewhat faster.
+   */
+  def at_a_nongraph_mmul(op: OpAtA[_], srcRdd: BlockifiedDrmRdd[_]): DrmRddInput[Int] = {
 
-        // Form partial outer blocks for each partition
-        .flatMap {
-      v =>
-        for (blockKey <- Stream.range(0, numParts)) yield {
-/* patch to fix index out of range for vector access
-          val blockStart = blockKey * blockHeight
-          val blockEnd = n min (blockStart + blockHeight)
-          blockKey -> (v(blockStart until blockEnd) cross v)
-*/
-          val range = ranges(blockKey)
-          blockKey -> (v(range) cross v)
-        }
+    // Determine how many partitions the new matrix would need approximately. We base that on
+    // geometry only, but it may eventually not be that adequate. Indeed, A'A tends to be much more
+    // dense in reality than the source.
+    val m = op.A.nrow
+    val n = op.A.ncol
+    val aparts = srcRdd.partitions.size
+    val numParts = estimateProductPartitions(anrow = n, ancol = m, bncol = n, aparts = aparts, bparts = aparts)
+    val ranges = computeEvenSplits(n, numParts)
+
+    debug(s"operator mmul-A'A(Spark); #parts = $numParts, #partsA=$aparts.")
+
+    val rddAtA = srcRdd.flatMap { case (keys, block) =>
+      Iterator.tabulate(numParts) { i =>
+        i -> block(::, ranges(i)).t %*% block
+      }
     }
-        // Combine outer blocks
-        .reduceByKey(_ += _)
-
-        // Restore proper block keys
-        .map {
-      case (blockKey, block) =>
-/* patch to fix index out of range for vector access
-        val blockStart = blockKey * blockHeight
-        val rowKeys = Array.tabulate(block.nrow)(blockStart + _)
-*/
-        val range = ranges(blockKey)
-        val rowKeys = Array.tabulate(block.nrow)(range.start + _)
-        rowKeys -> block
+      // Reduce partial blocks.
+      .reduceByKey(_ += _, numPartitions = numParts)
+
+      // Produce keys
+      .map { case (blockKey, block) =>
+
+      val blockStart = ranges(blockKey).start
+      val rowKeys = Array.tabulate(block.nrow)(blockStart + _)
+      rowKeys -> block
     }
 
     rddAtA
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/io/MahoutKryoRegistrator.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/io/MahoutKryoRegistrator.scala
index 406891f19b..5806ff59c3 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/io/MahoutKryoRegistrator.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/io/MahoutKryoRegistrator.scala
@@ -18,12 +18,9 @@
 package org.apache.mahout.sparkbindings.io
 
 import com.esotericsoftware.kryo.Kryo
-import com.esotericsoftware.kryo.serializers.JavaSerializer
 import org.apache.mahout.math._
-import org.apache.mahout.math.indexeddataset.{BiMap, BiDictionary}
 import org.apache.spark.serializer.KryoRegistrator
-import org.apache.mahout.sparkbindings._
-import org.apache.mahout.math.Vector.Element
+import org.apache.mahout.logging._
 
 object MahoutKryoRegistrator {
 
@@ -37,12 +34,8 @@ object MahoutKryoRegistrator {
     kryo.addDefaultSerializer(classOf[Vector], new VectorKryoSerializer())
     kryo.addDefaultSerializer(classOf[Matrix], new GenericMatrixKryoSerializer)
 
-    kryo.addDefaultSerializer(classOf[Vector], new WritableKryoSerializer[Vector, VectorWritable])
-    kryo.addDefaultSerializer(classOf[DenseVector], new WritableKryoSerializer[Vector, VectorWritable])
-    kryo.addDefaultSerializer(classOf[Matrix], new WritableKryoSerializer[Matrix, MatrixWritable])
   }
 
-
 }
 
 /** Kryo serialization registrator for Mahout */
diff --git a/spark/src/test/scala/org/apache/mahout/cf/SimilarityAnalysisSuite.scala b/spark/src/test/scala/org/apache/mahout/cf/SimilarityAnalysisSuite.scala
index 9b4c939b7e..0b3b3eb475 100644
--- a/spark/src/test/scala/org/apache/mahout/cf/SimilarityAnalysisSuite.scala
+++ b/spark/src/test/scala/org/apache/mahout/cf/SimilarityAnalysisSuite.scala
@@ -66,9 +66,7 @@ class SimilarityAnalysisSuite extends FunSuite with MahoutSuite with Distributed
       (0.0,                0.0,                0.6795961471815897, 0.0,                4.498681156950466))
 
 
-
-//  test("cooccurrence [A'A], [B'A] boolbean data using LLR") {
-  ignore("cooccurrence [A'A], [B'A] boolbean data using LLR") {
+  test("cooccurrence [A'A], [B'A] boolbean data using LLR") {
     val a = dense(
         (1, 1, 0, 0, 0),
         (0, 0, 1, 1, 0),
@@ -99,8 +97,7 @@ class SimilarityAnalysisSuite extends FunSuite with MahoutSuite with Distributed
 
   }
 
-//  test("cooccurrence [A'A], [B'A] double data using LLR") {
-  ignore("cooccurrence [A'A], [B'A] double data using LLR") {
+  test("cooccurrence [A'A], [B'A] double data using LLR") {
     val a = dense(
         (100000.0D, 1.0D,  0.0D,  0.0D,     0.0D),
         (     0.0D, 0.0D, 10.0D,  1.0D,     0.0D),
@@ -130,8 +127,7 @@ class SimilarityAnalysisSuite extends FunSuite with MahoutSuite with Distributed
     n should be < 1E-10
   }
 
-//  test("cooccurrence [A'A], [B'A] integer data using LLR") {
-  ignore("cooccurrence [A'A], [B'A] integer data using LLR") {
+  test("cooccurrence [A'A], [B'A] integer data using LLR") {
     val a = dense(
         ( 1000,  10,       0,    0,   0),
         (    0,   0,  -10000,   10,   0),
@@ -163,8 +159,7 @@ class SimilarityAnalysisSuite extends FunSuite with MahoutSuite with Distributed
     n should be < 1E-10
   }
 
-//  test("cooccurrence two matrices with different number of columns"){
-  ignore("cooccurrence two matrices with different number of columns"){
+  test("cooccurrence two matrices with different number of columns"){
     val a = dense(
       (1, 1, 0, 0, 0),
       (0, 0, 1, 1, 0),
@@ -247,8 +242,7 @@ class SimilarityAnalysisSuite extends FunSuite with MahoutSuite with Distributed
     assert(llr == 2.6341457841558764) // value calculated by hadoop itemsimilairty
   }
 
-//  test("downsampling by number per row") {
-  ignore("downsampling by number per row") {
+  test("downsampling by number per row") {
     val a = dense(
         (1, 1, 1, 1, 0),
         (1, 1, 1, 1, 1),
diff --git a/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala b/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala
index 0224cee23d..628d981c1d 100644
--- a/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala
+++ b/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala
@@ -114,8 +114,7 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
       "--filterColumn", "1"))
   */
 
-  ignore("ItemSimilarityDriver, non-full-spec CSV") {
-//  test("ItemSimilarityDriver, non-full-spec CSV") {
+  test("ItemSimilarityDriver, non-full-spec CSV") {
 
     val InFile = TmpDir + "in-file.csv/" //using part files, not single file
     val OutPath = TmpDir + "similarity-matrices/"
@@ -169,8 +168,7 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
 
 
-//  test("ItemSimilarityDriver TSV ") {
-  ignore("ItemSimilarityDriver TSV ") {
+  test("ItemSimilarityDriver TSV ") {
 
     val InFile = TmpDir + "in-file.tsv/"
     val OutPath = TmpDir + "similarity-matrices/"
@@ -222,8 +220,7 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
   }
 
-//  test("ItemSimilarityDriver log-ish files") {
-  ignore("ItemSimilarityDriver log-ish files") {
+  test("ItemSimilarityDriver log-ish files") {
 
     val InFile = TmpDir + "in-file.log/"
     val OutPath = TmpDir + "similarity-matrices/"
@@ -274,8 +271,7 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
   }
 
-//  test("ItemSimilarityDriver legacy supported file format") {
-  ignore("ItemSimilarityDriver legacy supported file format") {
+  test("ItemSimilarityDriver legacy supported file format") {
 
     val InDir = TmpDir + "in-dir/"
     val InFilename = "in-file.tsv"
@@ -318,8 +314,7 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
   }
 
-//  test("ItemSimilarityDriver write search engine output") {
-  ignore("ItemSimilarityDriver write search engine output") {
+  test("ItemSimilarityDriver write search engine output") {
 
     val InDir = TmpDir + "in-dir/"
     val InFilename = "in-file.tsv"
@@ -363,8 +358,7 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
   }
 
-//  test("ItemSimilarityDriver recursive file discovery using filename patterns") {
-  ignore("ItemSimilarityDriver recursive file discovery using filename patterns") {
+  test("ItemSimilarityDriver recursive file discovery using filename patterns") {
     //directory structure using the following
     // tmp/data/m1.tsv
     // tmp/data/more-data/another-dir/m2.tsv
@@ -439,8 +433,7 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
   }
 
-//  test("ItemSimilarityDriver, two input paths") {
-  ignore("ItemSimilarityDriver, two input paths") {
+  test("ItemSimilarityDriver, two input paths") {
 
     val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file
     val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file
@@ -493,8 +486,7 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
   }
 
-//  test("ItemSimilarityDriver, two inputs of different dimensions") {
-  ignore("ItemSimilarityDriver, two inputs of different dimensions") {
+  test("ItemSimilarityDriver, two inputs of different dimensions") {
 
     val InFile1 = TmpDir + "in-file1.csv/" //using part files, not single file
     val InFile2 = TmpDir + "in-file2.csv/" //using part files, not single file
@@ -561,8 +553,7 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
   }
 
-//  test("ItemSimilarityDriver cross similarity two separate items spaces") {
-  ignore("ItemSimilarityDriver cross similarity two separate items spaces") {
+  test("ItemSimilarityDriver cross similarity two separate items spaces") {
     /* cross-similarity with category views, same user space
             	phones	tablets	mobile_acc	soap
           u1	0	      1	      1	          0
@@ -626,8 +617,7 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
   }
 
-//  test("A.t %*% B after changing row cardinality of A") {
-  ignore("A.t %*% B after changing row cardinality of A") {
+  test("A.t %*% B after changing row cardinality of A") {
     // todo: move to math tests but this is Spark specific
 
     val a = dense(
@@ -658,8 +648,7 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
     val bp = 0
   }
 
-//  test("Changing row cardinality of an IndexedDataset") {
-  ignore("Changing row cardinality of an IndexedDataset") {
+  test("Changing row cardinality of an IndexedDataset") {
 
     val a = dense(
       (1.0, 1.0))
@@ -673,8 +662,7 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
 
   }
 
-//  test("ItemSimilarityDriver cross similarity two separate items spaces, missing rows in B") {
-  ignore("ItemSimilarityDriver cross similarity two separate items spaces, missing rows in B") {
+  test("ItemSimilarityDriver cross similarity two separate items spaces, missing rows in B") {
     /* cross-similarity with category views, same user space
             	phones	tablets	mobile_acc	soap
             u1	0	      1	      1	          0
diff --git a/spark/src/test/scala/org/apache/mahout/sparkbindings/test/DistributedSparkSuite.scala b/spark/src/test/scala/org/apache/mahout/sparkbindings/test/DistributedSparkSuite.scala
index 2e517ac99f..d917a22b3d 100644
--- a/spark/src/test/scala/org/apache/mahout/sparkbindings/test/DistributedSparkSuite.scala
+++ b/spark/src/test/scala/org/apache/mahout/sparkbindings/test/DistributedSparkSuite.scala
@@ -69,11 +69,6 @@ trait DistributedSparkSuite extends DistributedMahoutSuite with LoggerConfigurat
     resetContext()
   }
 
-  override protected def afterAll(configMap: ConfigMap): Unit = {
-    super.afterAll(configMap)
-    resetContext()
-  }
-
   override protected def beforeAll(configMap: ConfigMap): Unit = {
     super.beforeAll(configMap)
     initContext()

From f4e15506ed2497bc2e179e3ded9ca399fd826d15 Mon Sep 17 00:00:00 2001
From: Dmitriy Lyubimov <dlyubimov@apache.org>
Date: Tue, 2 Jun 2015 11:55:30 -0700
Subject: [PATCH 4/9] restoring merge errors in h2o module, nothing is touched
 here.

---
 .../scala/org/apache/mahout/sparkbindings/blas/AtB.scala  | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtB.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtB.scala
index 8346072979..e2fbabec7d 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtB.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtB.scala
@@ -112,7 +112,7 @@ object AtB {
     // Number of partitions we want to converge to in the product. For now we simply extrapolate that
     // assuming product density and operand densities being about the same; and using the same element
     // per partition number in the product as the bigger of two operands.
-    val numProductPartitions = (prodNCol.toDouble * prodNRow / epp).ceil.toInt
+    val numProductPartitions = (prodNCol.toDouble * prodNRow / epp).ceil.toInt min prodNRow
 
     if (log.isDebugEnabled) log.debug(s"AtB mmul: #parts ${numProductPartitions} for $prodNRow x $prodNCol geometry.")
 
@@ -234,11 +234,15 @@ object AtB {
 
     val rdd = pairwiseRdd.flatMap{ case (blockA, blockB) ⇒
 
+      // Handling microscopic Pat's cases. Any slicing doesn't work well on 0-row matrix. This
+      // probably should be fixed in the in-core matrix implementations.
+      if (blockA.nrow == 0 )
+        Iterator.empty
+      else
       // Output each partial outer product with its correspondent partition #.
         Iterator.tabulate(numPartitions) {part ⇒
 
           val mBlock = blockA(::, ranges(part)).t %*% blockB
-
           part → mBlock
         }
     }

From 00fb618ad0ef0e5b8aac30c88b23d2e9325ea8f8 Mon Sep 17 00:00:00 2001
From: Dmitriy Lyubimov <dlyubimov@apache.org>
Date: Tue, 2 Jun 2015 12:08:13 -0700
Subject: [PATCH 5/9] h20 stuff

---
 .../apache/mahout/h2obindings/H2OEngine.scala | 67 +++++++++++++------
 1 file changed, 48 insertions(+), 19 deletions(-)

diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala
index 173d5a09b9..f6eaa5506d 100644
--- a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala
+++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala
@@ -26,9 +26,13 @@ import org.apache.mahout.math.drm.logical._
 import org.apache.mahout.h2obindings.ops._
 import org.apache.mahout.h2obindings.drm._
 import org.apache.mahout.h2o.common.{Hadoop1HDFSUtil, HDFSUtil}
+import org.apache.mahout.logging._
 
 /** H2O specific non-DRM operations */
 object H2OEngine extends DistributedEngine {
+
+  private implicit val log = getLog(H2OEngine.getClass)
+
   // By default, use Hadoop 1 utils
   var hdfsUtils: HDFSUtil = Hadoop1HDFSUtil
 
@@ -119,40 +123,65 @@ object H2OEngine extends DistributedEngine {
   abstract class IndexedDatasetH2O(val matrix: CheckpointedDrm[Int], val rowIDs: BiDictionary, val columnIDs: BiDictionary)
     extends IndexedDataset {}
 
-    /**
-   * reads an IndexedDatasetH2O from default text delimited files
+  /**
+   * Reads an IndexedDatasetH2O from default text delimited files
    * @todo unimplemented
    * @param src a comma separated list of URIs to read from
    * @param schema how the text file is formatted
    * @return
    */
   def indexedDatasetDFSRead(src: String,
-      schema: Schema = DefaultIndexedDatasetReadSchema,
-      existingRowIDs: Option[BiDictionary] = None)
-      (implicit sc: DistributedContext):
-    IndexedDatasetH2O = {
-    // should log a warning when this is built but no logger here, can an H2O contributor help with this
-    println("Warning: unimplemented indexedDatasetDFSReadElements." )
-    throw new UnsupportedOperationException("IndexedDatasetH2O is not implemented so can't be read.")
-    null.asInstanceOf[IndexedDatasetH2O]
+                            schema: Schema = DefaultIndexedDatasetReadSchema,
+                            existingRowIDs: Option[BiDictionary] = None)
+                           (implicit sc: DistributedContext):
+  IndexedDatasetH2O = {
+
+    error("Unimplemented indexedDatasetDFSReadElements.")
+
+    ???
   }
 
   /**
-   * reads an IndexedDatasetH2O from default text delimited files
+   * Reads an IndexedDatasetH2O from default text delimited files
    * @todo unimplemented
    * @param src a comma separated list of URIs to read from
    * @param schema how the text file is formatted
    * @return
    */
   def indexedDatasetDFSReadElements(src: String,
-      schema: Schema = DefaultIndexedDatasetReadSchema,
-      existingRowIDs: Option[BiDictionary] = None)
-      (implicit sc: DistributedContext):
-    IndexedDatasetH2O = {
-    // should log a warning when this is built but no logger here, can an H2O contributor help with this
-    println("Warning: unimplemented indexedDatasetDFSReadElements." )
-    throw new UnsupportedOperationException("IndexedDatasetH2O is not implemented so can't be read by elements.")
-    null.asInstanceOf[IndexedDatasetH2O]
+                                    schema: Schema = DefaultIndexedDatasetReadSchema,
+                                    existingRowIDs: Option[BiDictionary] = None)
+                                   (implicit sc: DistributedContext):
+  IndexedDatasetH2O = {
+
+    error("Unimplemented indexedDatasetDFSReadElements.")
+
+    ???
   }
 
+  /**
+   * Optional engine-specific all reduce tensor operation.
+   *
+   * TODO: implement this please.
+   *
+   */
+  override def allreduceBlock[K: ClassTag](drm: CheckpointedDrm[K], bmf: BlockMapFunc2[K], rf: BlockReduceFunc)
+  : Matrix = ???
+
+  /**
+   * TODO: implement this please.
+   */
+  override def drmSampleKRows[K: ClassTag](drmX: DrmLike[K], numSamples: Int, replacement: Boolean): Matrix = ???
+
+  /**
+   * (Optional) Sampling operation. Consistent with Spark semantics of the same.
+   * TODO: implement this please.
+   */
+  override def drmSampleRows[K: ClassTag](drmX: DrmLike[K], fraction: Double, replacement: Boolean): DrmLike[K] = ???
+
+  /**
+   * TODO: implement this please.
+   */
+  override def drm2IntKeyed[K: ClassTag](drmX: DrmLike[K], computeMap: Boolean)
+  : (DrmLike[Int], Option[DrmLike[K]]) = ???
 }

From e42bcedf8521b89c7583f8e7e299c2be0c2a8de2 Mon Sep 17 00:00:00 2001
From: Dmitriy Lyubimov <dlyubimov@apache.org>
Date: Tue, 2 Jun 2015 12:24:30 -0700
Subject: [PATCH 6/9] final fixes in h20. fixing @deprecated warnings in atb

---
 .../mahout/h2obindings/drm/H2OBCast.java      | 12 ++++
 .../mahout/h2o/common/Hadoop1HDFSUtil.scala   | 63 +++++++++++++++++++
 .../apache/mahout/h2obindings/H2OEngine.scala |  3 +-
 .../mahout/sparkbindings/blas/AtB.scala       |  7 +--
 4 files changed, 79 insertions(+), 6 deletions(-)
 create mode 100644 h2o/src/main/scala/org/apache/mahout/h2o/common/Hadoop1HDFSUtil.scala

diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java b/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java
index 523a771f78..ebcc626901 100644
--- a/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java
+++ b/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java
@@ -118,4 +118,16 @@ private T deserialize(byte buf[]) {
     }
     return ret;
   }
+
+  /**
+   * Stop broadcasting when called on driver side. Release any network resources.
+   *
+   */
+  @Override
+  public void close() throws IOException {
+
+    // TODO: review this. It looks like it is not really a broadcast mechanism but rather just a
+    // serialization wrapper. In which case it doesn't hold any network resources.
+
+  }
 }
diff --git a/h2o/src/main/scala/org/apache/mahout/h2o/common/Hadoop1HDFSUtil.scala b/h2o/src/main/scala/org/apache/mahout/h2o/common/Hadoop1HDFSUtil.scala
new file mode 100644
index 0000000000..a540cb1613
--- /dev/null
+++ b/h2o/src/main/scala/org/apache/mahout/h2o/common/Hadoop1HDFSUtil.scala
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.h2o.common
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.io.{SequenceFile, Writable}
+
+/**
+ * Deprecated Hadoop 1 api which we currently explicitly import via Mahout dependencies. May not work
+ * with Hadoop 2.0
+ */
+object Hadoop1HDFSUtil extends HDFSUtil {
+
+  
+  def readDrmHeader(path: String): DrmMetadata = {
+    val dfsPath = new Path(path)
+    val fs = dfsPath.getFileSystem(new Configuration())
+
+    val partFilePath:Path = fs.listStatus(dfsPath)
+
+        // Filter out anything starting with .
+        .filter { s => !s.getPath.getName.startsWith("\\.") && !s.getPath.getName.startsWith("_") && !s.isDir }
+
+        // Take path
+        .map(_.getPath)
+
+        // Take only one, if any
+        .headOption
+
+        // Require there's at least one partition file found.
+        .getOrElse {
+      throw new IllegalArgumentException(s"No partition files found in ${dfsPath.toString}.")
+    }
+
+    val reader = new SequenceFile.Reader(fs, partFilePath, fs.getConf)
+    try {
+      new DrmMetadata(
+        keyTypeWritable = reader.getKeyClass.asSubclass(classOf[Writable]),
+        valueTypeWritable = reader.getValueClass.asSubclass(classOf[Writable])
+      )
+    } finally {
+      reader.close()
+    }
+
+  }
+
+}
diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala
index f6eaa5506d..438298601f 100644
--- a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala
+++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala
@@ -151,8 +151,7 @@ object H2OEngine extends DistributedEngine {
   def indexedDatasetDFSReadElements(src: String,
                                     schema: Schema = DefaultIndexedDatasetReadSchema,
                                     existingRowIDs: Option[BiDictionary] = None)
-                                   (implicit sc: DistributedContext):
-  IndexedDatasetH2O = {
+                                   (implicit sc: DistributedContext): IndexedDatasetH2O = {
 
     error("Unimplemented indexedDatasetDFSReadElements.")
 
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtB.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtB.scala
index e2fbabec7d..45705a9811 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtB.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/blas/AtB.scala
@@ -46,9 +46,8 @@ object AtB {
    * over co-grouped rows of A and B. If A and B are identically partitioned, we can just directly
    * zip all the rows. Otherwise, we need to inner-join them first.
    *
-   * @deprecated
    */
-  @deprecated
+  @deprecated("slow, will remove", since = "0.10.2")
   def atb_nograph[A: ClassTag](operator: OpAtB[A], srcA: DrmRddInput[A], srcB: DrmRddInput[A],
                                zippable: Boolean = false): DrmRddInput[Int] = {
 
@@ -190,7 +189,7 @@ object AtB {
    * @param pairwiseRdd
    * @return
    */
-  @deprecated
+  @deprecated("slow, will remove", since = "0.10.2")
   private[sparkbindings] def combineOuterProducts(pairwiseRdd: RDD[(Int, (Vector, Vector))], numPartitions: Int) = {
 
     pairwiseRdd
@@ -302,7 +301,7 @@ object AtB {
   }
 
   /** Given already zipped, joined rdd of rows of A' and B, compute their product A'B */
-  @deprecated
+  @deprecated("slow, will remove", since = "0.10.2")
   private[sparkbindings] def computeAtBZipped[A: ClassTag](zipped: RDD[(DrmTuple[A], DrmTuple[A])], nrow: Long,
                                                            ancol: Int, bncol: Int, numPartitions: Int) = {
 

From 7e6ce766d06c5a2337dd9b08df7c9fa37bd9a9c8 Mon Sep 17 00:00:00 2001
From: Dmitriy Lyubimov <dlyubimov@apache.org>
Date: Mon, 8 Jun 2015 10:22:53 -0700
Subject: [PATCH 7/9] adding "final" for logger per comment on public PR

---
 .../main/scala/org/apache/mahout/h2obindings/H2OEngine.scala    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala
index 438298601f..e0ac30217a 100644
--- a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala
+++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala
@@ -31,7 +31,7 @@ import org.apache.mahout.logging._
 /** H2O specific non-DRM operations */
 object H2OEngine extends DistributedEngine {
 
-  private implicit val log = getLog(H2OEngine.getClass)
+  private final implicit val log = getLog(H2OEngine.getClass)
 
   // By default, use Hadoop 1 utils
   var hdfsUtils: HDFSUtil = Hadoop1HDFSUtil

From 56b735e137355e174facffd409d6456360c2f8e7 Mon Sep 17 00:00:00 2001
From: Dmitriy Lyubimov <dlyubimov@apache.org>
Date: Mon, 8 Jun 2015 16:58:34 -0700
Subject: [PATCH 8/9] Inserting back the testing framework artifact being
 built. Need this as a dependency in subordinate projects that do method
 testing as well.

---
 spark/pom.xml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/spark/pom.xml b/spark/pom.xml
index 33e0d1b505..71551157ec 100644
--- a/spark/pom.xml
+++ b/spark/pom.xml
@@ -119,6 +119,22 @@
         </executions>
       </plugin>
 
+      <!-- create test jar so other modules can reuse the math test utility classes.
+        DO NOT REMOVE! Testing framework is useful in subordinate/contrib projects!
+        Please contact @dlyubimov.
+      -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+        <executions>
+          <execution>
+            <goals>
+              <goal>test-jar</goal>
+            </goals>
+            <phase>package</phase>
+          </execution>
+        </executions>
+      </plugin>
 
     </plugins>
   </build>

From c59bf8a21e1ad77dee80730772d2184b3f28a495 Mon Sep 17 00:00:00 2001
From: Dmitriy Lyubimov <dlyubimov@apache.org>
Date: Mon, 8 Jun 2015 18:11:57 -0700
Subject: [PATCH 9/9] handling degenerate matrix cases for rbind, cbind, and
 serialization (0 columns or rows)

---
 .../math/scalabindings/RLikeMatrixOps.scala   | 24 ++++++++++++-------
 .../math/scalabindings/RLikeVectorOps.scala   | 14 +++++++----
 .../io/GenericMatrixKryoSerializer.scala      | 10 ++++----
 3 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOps.scala
index a2d62d6231..7091c53a39 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeMatrixOps.scala
@@ -116,10 +116,14 @@ class RLikeMatrixOps(m: Matrix) extends MatrixOps(m) {
 
   def cbind(that: Matrix): Matrix = {
     require(m.nrow == that.nrow)
-    val mx = m.like(m.nrow, m.ncol + that.ncol)
-    mx(::, 0 until m.ncol) := m
-    mx(::, m.ncol until mx.ncol) := that
-    mx
+    if (m.ncol > 0) {
+      if (that.ncol > 0) {
+        val mx = m.like(m.nrow, m.ncol + that.ncol)
+        mx(::, 0 until m.ncol) := m
+        mx(::, m.ncol until mx.ncol) := that
+        mx
+      } else m
+    } else that
   }
 
   def cbind(that: Double): Matrix = {
@@ -131,10 +135,14 @@ class RLikeMatrixOps(m: Matrix) extends MatrixOps(m) {
 
   def rbind(that: Matrix): Matrix = {
     require(m.ncol == that.ncol)
-    val mx = m.like(m.nrow + that.nrow, m.ncol)
-    mx(0 until m.nrow, ::) := m
-    mx(m.nrow until mx.nrow, ::) := that
-    mx
+    if (m.nrow > 0) {
+      if (that.nrow > 0) {
+        val mx = m.like(m.nrow + that.nrow, m.ncol)
+        mx(0 until m.nrow, ::) := m
+        mx(m.nrow until mx.nrow, ::) := that
+        mx
+      } else m
+    } else that
   }
 
   def rbind(that: Double): Matrix = {
diff --git a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeVectorOps.scala b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeVectorOps.scala
index cc73350f60..38a55d680b 100644
--- a/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeVectorOps.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/math/scalabindings/RLikeVectorOps.scala
@@ -75,11 +75,15 @@ class RLikeVectorOps(_v: Vector) extends VectorOps(_v) {
 
   def ^(that: Vector) = v.cloned ^= that
 
-  def c(that:Vector) = {
-    val cv = v.like(v.length + that.length)
-    cv(0 until v.length) := cv
-    cv(v.length until cv.length) := that
-    cv
+  def c(that: Vector) = {
+    if (v.length > 0) {
+      if (that.length > 0) {
+        val cv = v.like(v.length + that.length)
+        cv(0 until v.length) := cv
+        cv(v.length until cv.length) := that
+        cv
+      } else v
+    } else that
   }
 
   def c(that: Double) = {
diff --git a/spark/src/main/scala/org/apache/mahout/sparkbindings/io/GenericMatrixKryoSerializer.scala b/spark/src/main/scala/org/apache/mahout/sparkbindings/io/GenericMatrixKryoSerializer.scala
index 15a3d3bfd9..da58b35ba8 100644
--- a/spark/src/main/scala/org/apache/mahout/sparkbindings/io/GenericMatrixKryoSerializer.scala
+++ b/spark/src/main/scala/org/apache/mahout/sparkbindings/io/GenericMatrixKryoSerializer.scala
@@ -17,7 +17,6 @@
 
 package org.apache.mahout.sparkbindings.io
 
-import java.util
 
 import com.esotericsoftware.kryo.io.{Output, Input}
 import com.esotericsoftware.kryo.{Kryo, Serializer}
@@ -142,10 +141,9 @@ class GenericMatrixKryoSerializer extends Serializer[Matrix] {
       case TraversingStructureEnum.COLWISE =>
         val cols = readRows(kryo, input, ncol)
 
-        if (cols.head.isDense)
+        if (!cols.isEmpty && cols.head.isDense)
           dense(cols).t
         else {
-
           debug("Deserializing as SparseRowMatrix.t (COLWISE).")
           new SparseRowMatrix(ncol, nrow, cols, true, false).t
         }
@@ -153,7 +151,7 @@ class GenericMatrixKryoSerializer extends Serializer[Matrix] {
       // transposed SparseMatrix case
       case TraversingStructureEnum.SPARSECOLWISE =>
         val cols = readSparseRows(kryo, input)
-        val javamap = new util.HashMap[Integer, Vector]((cols.size << 1) + 1)
+        val javamap = new java.util.HashMap[Integer, Vector]((cols.size << 1) + 1)
         cols.foreach { case (idx, vec) => javamap.put(idx, vec)}
 
         debug("Deserializing as SparseMatrix.t (SPARSECOLWISE).")
@@ -162,7 +160,7 @@ class GenericMatrixKryoSerializer extends Serializer[Matrix] {
       // Sparse Row-wise -- this will be created as a SparseMatrix.
       case TraversingStructureEnum.SPARSEROWWISE =>
         val rows = readSparseRows(kryo, input)
-        val javamap = new util.HashMap[Integer, Vector]((rows.size << 1) + 1)
+        val javamap = new java.util.HashMap[Integer, Vector]((rows.size << 1) + 1)
         rows.foreach { case (idx, vec) => javamap.put(idx, vec)}
 
         debug("Deserializing as SparseMatrix (SPARSEROWWISE).")
@@ -176,7 +174,7 @@ class GenericMatrixKryoSerializer extends Serializer[Matrix] {
       case _ =>
         val cols = readRows(kryo, input, nrow)
         // this still copies a lot of stuff...
-        if (cols.head.isDense) {
+        if (!cols.isEmpty && cols.head.isDense) {
 
           debug("Deserializing as DenseMatrix.")
           dense(cols)