From fa4389e9e123164619ae77622453baa54ba3a3e3 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 15 Mar 2016 23:44:21 -0700 Subject: [PATCH 1/2] add rowIter/colIter to Matrix --- .../apache/spark/mllib/linalg/Matrices.scala | 64 ++++++++++++++++++- .../spark/mllib/linalg/MatricesSuite.scala | 13 ++++ 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 0fdb402fd6447..fdede2ad39109 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -22,10 +22,11 @@ import java.util.{Arrays, Random} import scala.collection.mutable.{ArrayBuffer, ArrayBuilder => MArrayBuilder, HashSet => MHashSet} import breeze.linalg.{CSCMatrix => BSM, DenseMatrix => BDM, Matrix => BM} +import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.annotation.{DeveloperApi, Since} -import org.apache.spark.sql.catalyst.expressions.GenericMutableRow import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.GenericMutableRow import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.types._ @@ -58,6 +59,20 @@ sealed trait Matrix extends Serializable { newArray } + /** + * Returns an iterator of column vectors. + * This operation could be expensive, depending on the underlying storage. + */ + @Since("2.0.0") + def colIter: Iterator[Vector] + + /** + * Returns an iterator of row vectors. + * This operation could be expensive, depending on the underlying storage. + */ + @Since("2.0.0") + def rowIter: Iterator[Vector] = this.transpose.colIter + /** Converts to a breeze matrix. */ private[mllib] def toBreeze: BM[Double] @@ -386,6 +401,21 @@ class DenseMatrix @Since("1.3.0") ( } new SparseMatrix(numRows, numCols, colPtrs, rowIndices.result(), spVals.result()) } + + @Since("2.0.0") + override def colIter: Iterator[Vector] = { + if (isTransposed) { + Iterator.tabulate(numCols) { j => + val col = new Array[Double](numRows) + blas.dcopy(numRows, values, j, numCols, col, 0, 1) + new DenseVector(col) + } + } else { + Iterator.tabulate(numCols) { j => + new DenseVector(values.slice(j * numRows, (j + 1) * numRows)) + } + } + } } /** @@ -656,6 +686,38 @@ class SparseMatrix @Since("1.3.0") ( @Since("1.5.0") override def numActives: Int = values.length + @Since("2.0.0") + override def colIter: Iterator[Vector] = { + if (isTransposed) { + val indicesArray = Array.fill(numCols)(MArrayBuilder.make[Int]) + val valuesArray = Array.fill(numCols)(MArrayBuilder.make[Double]) + var i = 0 + while (i < numRows) { + var k = colPtrs(i) + val rowEnd = colPtrs(i + 1) + while (k < rowEnd) { + val j = rowIndices(k) + indicesArray(j) += i + valuesArray(j) += values(k) + k += 1 + } + i += 1 + } + Iterator.tabulate(numCols) { j => + val ii = indicesArray(j).result() + val vv = valuesArray(j).result() + new SparseVector(numRows, ii, vv) + } + } else { + Iterator.tabulate(numCols) { j => + val colStart = colPtrs(j) + val colEnd = colPtrs(j + 1) + val ii = rowIndices.slice(colStart, colEnd) + val vv = values.slice(colStart, colEnd) + new SparseVector(numRows, ii, vv) + } + } + } } /** diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala index 1833cf3833671..a02b8c9635ba6 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala @@ -494,4 +494,17 @@ class MatricesSuite extends SparkFunSuite { assert(sm1.numNonzeros === 1) assert(sm1.numActives === 3) } + + test("row/col iterator") { + val dm = new DenseMatrix(3, 2, Array(0, 1, 2, 3, 4, 0)) + val sm = dm.toSparse + val rows = Seq(Vectors.dense(0, 3), Vectors.dense(1, 4), Vectors.dense(2, 0)) + val cols = Seq(Vectors.dense(0, 1, 2), Vectors.dense(3, 4, 0)) + for (m <- Seq(dm, sm)) { + assert(m.rowIter.toSeq === rows) + assert(m.colIter.toSeq === cols) + assert(m.transpose.rowIter.toSeq === cols) + assert(m.transpose.colIter.toSeq === rows) + } + } } From 3d76781f0c530cb31fc16e2b32dc8dc0c38aa481 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 15 Mar 2016 23:56:41 -0700 Subject: [PATCH 2/2] add mima excludes --- project/MimaExcludes.scala | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 45776fbb9f336..4f703f841c7b0 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -296,6 +296,10 @@ object MimaExcludes { // SPARK-12073: backpressure rate controller consumes events preferentially from lagging partitions ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.streaming.kafka.KafkaTestUtils.createTopic"), ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.streaming.kafka.DirectKafkaInputDStream.maxMessagesPerPartition") + ) ++ Seq( + // SPARK-13927: add row/column iterator to local matrices + ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.mllib.linalg.Matrix.rowIter"), + ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.mllib.linalg.Matrix.colIter") ) case v if v.startsWith("1.6") => Seq(