From 330c6849bfe77535211fef198e215e558e2c25d1 Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 8 Sep 2016 14:52:42 -0700 Subject: [PATCH 01/21] first commit --- .../org/apache/spark/ml/linalg/Matrices.scala | 55 +++++++++++++------ 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index d9ffdeb797fb8..821a082cc64ce 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -150,6 +150,10 @@ sealed trait Matrix extends Serializable { */ private[spark] def foreachActive(f: (Int, Int, Double) => Unit) + def toSparse: SparseMatrix + + def toDense: DenseMatrix = new DenseMatrix(numRows, numCols, this.toArray) + /** * Find the number of non-zero active values. */ @@ -295,27 +299,32 @@ class DenseMatrix @Since("2.0.0") ( * set to false. */ @Since("2.0.0") - def toSparse: SparseMatrix = { - val spVals: MArrayBuilder[Double] = new MArrayBuilder.ofDouble - val colPtrs: Array[Int] = new Array[Int](numCols + 1) - val rowIndices: MArrayBuilder[Int] = new MArrayBuilder.ofInt - var nnz = 0 - var j = 0 - while (j < numCols) { - var i = 0 - while (i < numRows) { - val v = values(index(i, j)) - if (v != 0.0) { - rowIndices += i - spVals += v - nnz += 1 + def toSparse: SparseMatrix = toSparse(columnMajor = true) + + private[ml] def toSparse(columnMajor: Boolean): SparseMatrix = { + if (!columnMajor) this.transpose.toSparse.transpose + else { + val spVals: MArrayBuilder[Double] = new MArrayBuilder.ofDouble + val colPtrs: Array[Int] = new Array[Int](numCols + 1) + val rowIndices: MArrayBuilder[Int] = new MArrayBuilder.ofInt + var nnz = 0 + var j = 0 + while (j < numCols) { + var i = 0 + while (i < numRows) { + val v = values(index(i, j)) + if (v != 0.0) { + rowIndices += i + spVals += v + nnz += 1 + } + i += 1 } - i += 1 + j += 1 + colPtrs(j) = nnz } - j += 1 - colPtrs(j) = nnz + new SparseMatrix(numRows, numCols, colPtrs, rowIndices.result(), spVals.result()) } - new SparseMatrix(numRows, numCols, colPtrs, rowIndices.result(), spVals.result()) } override def colIter: Iterator[Vector] = { @@ -515,6 +524,16 @@ class SparseMatrix @Since("2.0.0") ( } } + def toSparse: SparseMatrix = { + val nnz = numNonzeros + if (nnz == numNonzeros) { + this + } else { + // TODO + this + } + } + override def apply(i: Int, j: Int): Double = { val ind = index(i, j) if (ind < 0) 0.0 else values(ind) From 59bff68fc35d8a5366a2435d8230024b2abf370c Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 22 Sep 2016 07:48:02 -0700 Subject: [PATCH 02/21] start to add tests --- .../org/apache/spark/ml/linalg/Matrices.scala | 78 ++++++++++++++++++- .../spark/ml/linalg/MatricesSuite.scala | 17 ++++ 2 files changed, 91 insertions(+), 4 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index 821a082cc64ce..425c6cde1aa01 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -150,9 +150,49 @@ sealed trait Matrix extends Serializable { */ private[spark] def foreachActive(f: (Int, Int, Double) => Unit) - def toSparse: SparseMatrix + def toSparse(columnMajor: Boolean): SparseMatrix - def toDense: DenseMatrix = new DenseMatrix(numRows, numCols, this.toArray) + // always creates a new array in column major format + // this is a problem and it is not the behavior in the vector class + // we need to NOT create a new array if it's already dense.... + // this should be abstract. + def toDense: DenseMatrix //= new DenseMatrix(numRows, numCols, this.toArray) + + def compressed(columnMajor: Boolean): Matrix = { + if (getDenseSize < getSparseSize(columnMajor)) { + toDense + } else { + toSparse(columnMajor) + } + } + + def compressed: Matrix = { + val cscSize = getSparseSize(true) + val csrSize = getSparseSize(false) + val minSparseSize = cscSize.min(csrSize) + if (getDenseSize < minSparseSize) { + toDense + } else { + if (cscSize == minSparseSize) toSparse(true) else toSparse(false) + } + } + + def getDenseSize: Int = { + val nnz = numNonzeros + 8 * (numRows + numCols) + 8 + } + + def getSparseSize(columnMajor: Boolean): Int = { + // TODO: use long? + val nnz = numNonzeros + if (columnMajor) { + 8 * nnz + 4 * nnz + 4 * (numCols + 1) + } else { + 8 * nnz + 4 * nnz + 4 * (numRows + 1) + } + } + +// def getSparseSize: Int = getSparseSize(true).min(getSparseSize(false)) /** * Find the number of non-zero active values. @@ -301,7 +341,8 @@ class DenseMatrix @Since("2.0.0") ( @Since("2.0.0") def toSparse: SparseMatrix = toSparse(columnMajor = true) - private[ml] def toSparse(columnMajor: Boolean): SparseMatrix = { + // before, this always returned column major + def toSparse(columnMajor: Boolean): SparseMatrix = { if (!columnMajor) this.transpose.toSparse.transpose else { val spVals: MArrayBuilder[Double] = new MArrayBuilder.ofDouble @@ -327,6 +368,30 @@ class DenseMatrix @Since("2.0.0") ( } } + def toDense: DenseMatrix = toDense(columnMajor = true) + + def toDense(columnMajor: Boolean): DenseMatrix = { + if (isTransposed ^ columnMajor) { + if (isTransposed) { + // it is row major and we want column major + val newValues = Array.fill[Double](values.length)(0.0) + values.indices.foreach { i => + newValues(i / numCols + (i % numCols) * numRows) = values(i) + } + new DenseMatrix(numRows, numCols, newValues, isTransposed = false) + } else { + // it is col major and we want row major + val newValues = Array.fill[Double](values.length)(0.0) + values.indices.foreach { i => + newValues(i / numRows + (i % numRows) * numCols) = values(i) + } + new DenseMatrix(numRows, numCols, newValues, isTransposed = true) + } + } else { + this + } + } + override def colIter: Iterator[Vector] = { if (isTransposed) { Iterator.tabulate(numCols) { j => @@ -534,6 +599,11 @@ class SparseMatrix @Since("2.0.0") ( } } + def toSparse(columnMajor: Boolean): SparseMatrix = { + // TODO + this + } + override def apply(i: Int, j: Int): Double = { val ind = index(i, j) if (ind < 0) 0.0 else values(ind) @@ -611,7 +681,7 @@ class SparseMatrix @Since("2.0.0") ( * set to false. */ @Since("2.0.0") - def toDense: DenseMatrix = { + override def toDense: DenseMatrix = { new DenseMatrix(numRows, numCols, toArray) } diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala index 9c0aa73938478..2c4fcbd31d8f6 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala @@ -28,6 +28,14 @@ import org.apache.spark.ml.SparkMLFunSuite import org.apache.spark.ml.util.TestingUtils._ class MatricesSuite extends SparkMLFunSuite { + def computeSpace(matrix: Matrix): Int = { + matrix match { + case dm: DenseMatrix => + 12 * dm.values.length + 8 + case sm: SparseMatrix => + 12 * sm.numNonzeros + 4 * (if (!sm.isTransposed) sm.numCols + 1 else sm.numRows + 1) + } + } test("dense matrix construction") { val m = 3 val n = 2 @@ -44,6 +52,15 @@ class MatricesSuite extends SparkMLFunSuite { } } + test("compressed dense") { + val dm1 = new DenseMatrix(3, 4, List(List.fill(4)(1.0), List.fill(8)(0.0)).flatten.toArray) + println(computeSpace(dm1)) + println(computeSpace(dm1.toSparse(true))) + println(computeSpace(dm1.toSparse(false))) + val cm = dm1.compressed + println(cm.isInstanceOf[SparseMatrix]) + } + test("sparse matrix construction") { val m = 3 val n = 4 From c900457572793c5c261ad211ec20304600a5508f Mon Sep 17 00:00:00 2001 From: sethah Date: Fri, 23 Sep 2016 15:55:20 -0700 Subject: [PATCH 03/21] sparse to sparse stuff --- .../org/apache/spark/ml/linalg/Matrices.scala | 76 ++++++++++---- .../spark/ml/linalg/MatricesSuite.scala | 98 +++++++++++++++---- 2 files changed, 138 insertions(+), 36 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index 425c6cde1aa01..29d19638b309e 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -156,11 +156,13 @@ sealed trait Matrix extends Serializable { // this is a problem and it is not the behavior in the vector class // we need to NOT create a new array if it's already dense.... // this should be abstract. - def toDense: DenseMatrix //= new DenseMatrix(numRows, numCols, this.toArray) + def toDense(columnMajor: Boolean): DenseMatrix //= new DenseMatrix(numRows, numCols, this.toArray) + +// def toDense: DenseMatrix = toDense(true) def compressed(columnMajor: Boolean): Matrix = { if (getDenseSize < getSparseSize(columnMajor)) { - toDense + toDense(columnMajor) } else { toSparse(columnMajor) } @@ -171,7 +173,8 @@ sealed trait Matrix extends Serializable { val csrSize = getSparseSize(false) val minSparseSize = cscSize.min(csrSize) if (getDenseSize < minSparseSize) { - toDense + // the size is the same either way, so default to column major + toDense(true) } else { if (cscSize == minSparseSize) toSparse(true) else toSparse(false) } @@ -368,10 +371,12 @@ class DenseMatrix @Since("2.0.0") ( } } - def toDense: DenseMatrix = toDense(columnMajor = true) + def toDense: DenseMatrix = toDense(true) def toDense(columnMajor: Boolean): DenseMatrix = { - if (isTransposed ^ columnMajor) { + // TODO: is this method needed anywhere, really? + // TODO: use while + if (!(isTransposed ^ columnMajor)) { if (isTransposed) { // it is row major and we want column major val newValues = Array.fill[Double](values.length)(0.0) @@ -589,19 +594,51 @@ class SparseMatrix @Since("2.0.0") ( } } - def toSparse: SparseMatrix = { - val nnz = numNonzeros - if (nnz == numNonzeros) { - this - } else { - // TODO - this - } - } + def toSparse: SparseMatrix = toSparse(true) def toSparse(columnMajor: Boolean): SparseMatrix = { - // TODO - this + if (!(columnMajor ^ isTransposed)) { + if (!isTransposed) { + // it is row major and we want col major + val breezeTransposed = asBreeze.asInstanceOf[BSM[Double]].t + Matrices.fromBreeze(breezeTransposed).transpose.asInstanceOf[SparseMatrix] + } else { + // it is col major and we want row major + val breezeTransposed = asBreeze.asInstanceOf[BSM[Double]] + Matrices.fromBreeze(breezeTransposed).asInstanceOf[SparseMatrix] + } + } else { + // TODO: this is technically incorrect since it ignores columnn major + val nnz = numNonzeros + if (nnz != numActives) { + // convert to sparse + val rr = new Array[Int](nnz) + val vv = new Array[Double](nnz) + val cc = new Array[Int](numCols + 1) + var k = 0 + var colIdx = 0 + var idx = 0 + var numRemoved = 0 + foreachActive { (i, j, value) => + if (value != 0.0) { + rr(k) = i + vv(k) = value + k += 1 + } else { + numRemoved += 1 + } + if (idx == colPtrs(colIdx + 1)) { + colIdx += 1 + cc(colIdx) = colPtrs(colIdx) - numRemoved + 1 + } + idx += 1 + } + cc(numCols) = nnz + new SparseMatrix(numRows, numCols, cc, rr, vv, isTransposed = isTransposed) + } else { + this + } + } } override def apply(i: Int, j: Int): Double = { @@ -676,13 +713,16 @@ class SparseMatrix @Since("2.0.0") ( } } + def toDense: DenseMatrix = toDense(true) + /** * Generate a `DenseMatrix` from the given `SparseMatrix`. The new matrix will have isTransposed * set to false. */ @Since("2.0.0") - override def toDense: DenseMatrix = { - new DenseMatrix(numRows, numCols, toArray) + override def toDense(columnMajor: Boolean): DenseMatrix = { + if (columnMajor) new DenseMatrix(numRows, numCols, toArray) + else new DenseMatrix(numRows, numCols, this.transpose.toArray, isTransposed = true) } override def numNonzeros: Int = values.count(_ != 0) diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala index 2c4fcbd31d8f6..c14493f23820e 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala @@ -28,14 +28,14 @@ import org.apache.spark.ml.SparkMLFunSuite import org.apache.spark.ml.util.TestingUtils._ class MatricesSuite extends SparkMLFunSuite { - def computeSpace(matrix: Matrix): Int = { - matrix match { - case dm: DenseMatrix => - 12 * dm.values.length + 8 - case sm: SparseMatrix => - 12 * sm.numNonzeros + 4 * (if (!sm.isTransposed) sm.numCols + 1 else sm.numRows + 1) - } + def computeDenseSpace(length: Int): Int = { + 12 * length + 8 + } + def computeSparseSpace(numNonzeros: Int, numRows: Int, + numCols: Int, columnMajor: Boolean): Int = { + 12 * numNonzeros + 4 * (if (columnMajor) numCols + 1 else numRows + 1) } + test("dense matrix construction") { val m = 3 val n = 2 @@ -53,12 +53,31 @@ class MatricesSuite extends SparkMLFunSuite { } test("compressed dense") { - val dm1 = new DenseMatrix(3, 4, List(List.fill(4)(1.0), List.fill(8)(0.0)).flatten.toArray) - println(computeSpace(dm1)) - println(computeSpace(dm1.toSparse(true))) - println(computeSpace(dm1.toSparse(false))) - val cm = dm1.compressed - println(cm.isInstanceOf[SparseMatrix]) + val dm1 = new DenseMatrix(3, 4, Array.fill(2)(1.0) ++ Array.fill(10)(0.0)) + val cm1 = dm1.compressed.asInstanceOf[SparseMatrix] + assert(cm1 === dm1) + assert(cm1.isTransposed) + + val cm1CSC = dm1.compressed(true).asInstanceOf[SparseMatrix] + assert(cm1CSC === dm1) + assert(!cm1CSC.isTransposed) + + val dm2 = dm1.transpose + val cm2 = dm2.compressed.asInstanceOf[SparseMatrix] + assert(cm2 === dm2) + assert(!cm2.isTransposed) + + + val dm3 = new DenseMatrix(3, 4, Array.fill(6)(1.0) ++ Array.fill(6)(0.0)) + val cm3 = dm3.compressed.asInstanceOf[DenseMatrix] + assert(cm3 === dm3) + + val sm1CSC = dm3.toSparse(true) + val cm4 = sm1CSC.compressed.asInstanceOf[DenseMatrix] + assert(cm4 === sm1CSC) + val sm1CSR = dm3.toSparse(false) + val cm5 = sm1CSR.compressed.asInstanceOf[DenseMatrix] + assert(cm5 === sm1CSR) } test("sparse matrix construction") { @@ -177,6 +196,18 @@ class MatricesSuite extends SparkMLFunSuite { assert(sparseMat.values(2) === 10.0) } + + test("tosparsesparse") { + val sm1 = new SparseMatrix(2, 3, Array(0, 1, 2, 3), Array(1, 0, 0), Array(3.0, 1, 2)) + val sm3 = new SparseMatrix(2, 3, Array(0, 2, 3), Array(1, 2, 0), Array(1.0, 2.0, 3.0), true) + val sm2 = sm1.toSparse(false) + assert(sm2 === sm1) + assert(sm2.isTransposed) + val sm4 = sm3.toSparse(true) + assert(sm4 === sm3) + assert(!sm4.isTransposed) + } + test("toSparse, toDense") { val m = 3 val n = 2 @@ -188,11 +219,42 @@ class MatricesSuite extends SparkMLFunSuite { val spMat1 = new SparseMatrix(m, n, colPtrs, rowIndices, values) val deMat1 = new DenseMatrix(m, n, allValues) - val spMat2 = deMat1.toSparse - val deMat2 = spMat1.toDense - - assert(spMat1.asBreeze === spMat2.asBreeze) - assert(deMat1.asBreeze === deMat2.asBreeze) + // sparse to dense + val deMat2 = spMat1.toDense(true) + val deMat3 = spMat1.toDense(false) + val deMat4 = spMat1.toDense + assert(deMat1 === deMat2) + assert(!deMat2.isTransposed) + assert(deMat1 === deMat3) + assert(deMat3.isTransposed) + assert(deMat1 === deMat4) + assert(!deMat4.isTransposed) + + // sparse to sparse + // TODO: check sparse with explicit zeros changes + val sparseToSparse1 = spMat1.toSparse(true) + assert(sparseToSparse1 === spMat1) + assert(!sparseToSparse1.isTransposed) + val sparseToSparse2 = spMat1.toSparse(false) + + // dense to sparse + val spMat2 = deMat1.toSparse(true) + val spMat3 = deMat1.toSparse(false) + val spMat4 = deMat1.toSparse + assert(deMat1 === spMat2) + assert(!spMat2.isTransposed) + assert(deMat1 === spMat3) + assert(spMat3.isTransposed) + assert(deMat1 === spMat4) + assert(!spMat4.isTransposed) + + // dense to dense + val denseToDenseMat1 = deMat1.toDense(true) + assert(denseToDenseMat1 === deMat1) + assert(!denseToDenseMat1.isTransposed) + val denseToDenseMat2 = deMat1.toDense(false) + assert(denseToDenseMat2 === deMat1) + assert(denseToDenseMat2.isTransposed) } test("map, update") { From 65e2361a3e2370320cab082bfc92462c156317a3 Mon Sep 17 00:00:00 2001 From: sethah Date: Tue, 25 Oct 2016 10:25:47 -0700 Subject: [PATCH 04/21] improve test cases and cleanup --- .../org/apache/spark/ml/linalg/Matrices.scala | 153 ++++++--- .../spark/ml/linalg/MatricesSuite.scala | 299 +++++++++++++++--- 2 files changed, 349 insertions(+), 103 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index 29d19638b309e..5618db800c2b9 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -150,29 +150,50 @@ sealed trait Matrix extends Serializable { */ private[spark] def foreachActive(f: (Int, Int, Double) => Unit) + /** + * Converts this matrix to a sparse matrix. + * + * @param columnMajor Whether the values of the resulting sparse matrix should be in column major + * or row major order. If `false`, resulting matrix will be row major. + */ + @Since("2.1.0") def toSparse(columnMajor: Boolean): SparseMatrix - // always creates a new array in column major format - // this is a problem and it is not the behavior in the vector class - // we need to NOT create a new array if it's already dense.... - // this should be abstract. - def toDense(columnMajor: Boolean): DenseMatrix //= new DenseMatrix(numRows, numCols, this.toArray) + /** + * Converts this matrix to a dense matrix. + * + * @param columnMajor Whether the values of the resulting dense matrix should be in column major + * or row major order. If `false`, resulting matrix will be row major. + */ + @Since("2.1.0") + def toDense(columnMajor: Boolean): DenseMatrix -// def toDense: DenseMatrix = toDense(true) + /** + * Returns a matrix in either dense or sparse format, whichever uses less storage. + * + * @param columnMajor Whether the values of the resulting matrix should be in column major + * or row major order. If `false`, resulting matrix will be row major. + */ + @Since("2.1.0") def compressed(columnMajor: Boolean): Matrix = { - if (getDenseSize < getSparseSize(columnMajor)) { + if (getDenseSizeInBytes < getSparseSizeInBytes(columnMajor)) { toDense(columnMajor) } else { toSparse(columnMajor) } } + /** + * Returns a matrix in dense column major, sparse row major, or sparse column major format, + * whichever uses less storage. + */ + @Since("2.1.0") def compressed: Matrix = { - val cscSize = getSparseSize(true) - val csrSize = getSparseSize(false) + val cscSize = getSparseSizeInBytes(true) + val csrSize = getSparseSizeInBytes(false) val minSparseSize = cscSize.min(csrSize) - if (getDenseSize < minSparseSize) { + if (getDenseSizeInBytes < minSparseSize) { // the size is the same either way, so default to column major toDense(true) } else { @@ -180,23 +201,18 @@ sealed trait Matrix extends Serializable { } } - def getDenseSize: Int = { - val nnz = numNonzeros - 8 * (numRows + numCols) + 8 + /** Gets the size of the dense representation of this `Matrix`. */ + private[ml] def getDenseSizeInBytes: Long = { + 8L * numRows * numCols + 16L } - def getSparseSize(columnMajor: Boolean): Int = { - // TODO: use long? + /** Gets the size of the minimal sparse representation of this `Matrix`. */ + private[ml] def getSparseSizeInBytes(columnMajor: Boolean): Long = { val nnz = numNonzeros - if (columnMajor) { - 8 * nnz + 4 * nnz + 4 * (numCols + 1) - } else { - 8 * nnz + 4 * nnz + 4 * (numRows + 1) - } + val numPtrs = if (columnMajor) numCols + 1L else numRows + 1L + 8L * nnz + 4L * nnz + 4L * numPtrs + 32L } -// def getSparseSize: Int = getSparseSize(true).min(getSparseSize(false)) - /** * Find the number of non-zero active values. */ @@ -344,7 +360,12 @@ class DenseMatrix @Since("2.0.0") ( @Since("2.0.0") def toSparse: SparseMatrix = toSparse(columnMajor = true) - // before, this always returned column major + /** + * Generate a `SparseMatrix` from the given `DenseMatrix`. + * + * @param columnMajor Whether the resulting `SparseMatrix` values will be in column major order. + */ + @Since("2.1.0") def toSparse(columnMajor: Boolean): SparseMatrix = { if (!columnMajor) this.transpose.toSparse.transpose else { @@ -371,24 +392,36 @@ class DenseMatrix @Since("2.0.0") ( } } + /** + * Generate a `DenseMatrix` from this `DenseMatrix`. + */ + @Since("2.1.0") def toDense: DenseMatrix = toDense(true) + /** + * Generate a `DenseMatrix` from this `DenseMatrix`. + * + * @param columnMajor Whether the resulting `DenseMatrix` values will be in column major order. + */ + @Since("2.1.0") def toDense(columnMajor: Boolean): DenseMatrix = { - // TODO: is this method needed anywhere, really? - // TODO: use while if (!(isTransposed ^ columnMajor)) { if (isTransposed) { // it is row major and we want column major - val newValues = Array.fill[Double](values.length)(0.0) - values.indices.foreach { i => - newValues(i / numCols + (i % numCols) * numRows) = values(i) + val newValues = Array.fill[Double](numCols * numRows)(0.0) + var j = 0 + while (j < numCols * numRows) { + newValues(j / numCols + (j % numCols) * numRows) = values(j) + j += 1 } new DenseMatrix(numRows, numCols, newValues, isTransposed = false) } else { // it is col major and we want row major val newValues = Array.fill[Double](values.length)(0.0) - values.indices.foreach { i => - newValues(i / numRows + (i % numRows) * numCols) = values(i) + var j = 0 + while (j < numCols * numRows) { + newValues(j / numRows + (j % numRows) * numCols) = values(j) + j += 1 } new DenseMatrix(numRows, numCols, newValues, isTransposed = true) } @@ -594,8 +627,21 @@ class SparseMatrix @Since("2.0.0") ( } } + /** + * Generate a `SparseMatrix` from this `SparseMatrix`, removing explicit zero values if they + * exist. The resulting `SparseMatrix` will have `isTransposed` set to false. + */ + @Since("2.1.0") def toSparse: SparseMatrix = toSparse(true) + /** + * Generate a `SparseMatrix` from this `SparseMatrix`, removing explicit zero values if they + * exist. + * + * @param columnMajor Whether or not the resulting `SparseMatrix` values are in column major + * order. + */ + @Since("2.1.0") def toSparse(columnMajor: Boolean): SparseMatrix = { if (!(columnMajor ^ isTransposed)) { if (!isTransposed) { @@ -608,32 +654,29 @@ class SparseMatrix @Since("2.0.0") ( Matrices.fromBreeze(breezeTransposed).asInstanceOf[SparseMatrix] } } else { - // TODO: this is technically incorrect since it ignores columnn major val nnz = numNonzeros if (nnz != numActives) { - // convert to sparse val rr = new Array[Int](nnz) val vv = new Array[Double](nnz) - val cc = new Array[Int](numCols + 1) - var k = 0 - var colIdx = 0 - var idx = 0 - var numRemoved = 0 - foreachActive { (i, j, value) => - if (value != 0.0) { - rr(k) = i - vv(k) = value - k += 1 - } else { - numRemoved += 1 - } - if (idx == colPtrs(colIdx + 1)) { - colIdx += 1 - cc(colIdx) = colPtrs(colIdx) - numRemoved + 1 + val numPtrs = if (isTransposed) numRows else numCols + val cc = new Array[Int](numPtrs + 1) + var vidx = 0 + var j = 0 + while (j < numPtrs) { + var idx = colPtrs(j) + val idxEnd = colPtrs(j + 1) + cc(j) = vidx + while (idx < idxEnd) { + if (values(idx) != 0.0) { + vv(vidx) = values(idx) + rr(vidx) = rowIndices(idx) + vidx += 1 + } + idx += 1 } - idx += 1 + j += 1 } - cc(numCols) = nnz + cc(j) = nnz new SparseMatrix(numRows, numCols, cc, rr, vv, isTransposed = isTransposed) } else { this @@ -713,13 +756,19 @@ class SparseMatrix @Since("2.0.0") ( } } - def toDense: DenseMatrix = toDense(true) - /** * Generate a `DenseMatrix` from the given `SparseMatrix`. The new matrix will have isTransposed * set to false. */ @Since("2.0.0") + def toDense: DenseMatrix = toDense(true) + + /** + * Generate a `DenseMatrix` from the given `SparseMatrix`. + * + * @param columnMajor Whether the resulting [[DenseMatrix]] values are in column major order. + */ + @Since("2.1.0") override def toDense(columnMajor: Boolean): DenseMatrix = { if (columnMajor) new DenseMatrix(numRows, numCols, toArray) else new DenseMatrix(numRows, numCols, this.transpose.toArray, isTransposed = true) diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala index c14493f23820e..dad55c7b15a7e 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala @@ -53,31 +53,98 @@ class MatricesSuite extends SparkMLFunSuite { } test("compressed dense") { + // TODO: compare before/after size in bytes? val dm1 = new DenseMatrix(3, 4, Array.fill(2)(1.0) ++ Array.fill(10)(0.0)) + + // optimal compression layout is row major since numRows < numCols val cm1 = dm1.compressed.asInstanceOf[SparseMatrix] assert(cm1 === dm1) assert(cm1.isTransposed) + // force compressed column major val cm1CSC = dm1.compressed(true).asInstanceOf[SparseMatrix] assert(cm1CSC === dm1) assert(!cm1CSC.isTransposed) + // optimal compression layout for transpose is column major val dm2 = dm1.transpose val cm2 = dm2.compressed.asInstanceOf[SparseMatrix] assert(cm2 === dm2) assert(!cm2.isTransposed) - val dm3 = new DenseMatrix(3, 4, Array.fill(6)(1.0) ++ Array.fill(6)(0.0)) + + // dense is optimal val cm3 = dm3.compressed.asInstanceOf[DenseMatrix] assert(cm3 === dm3) + assert(!cm3.isTransposed) - val sm1CSC = dm3.toSparse(true) - val cm4 = sm1CSC.compressed.asInstanceOf[DenseMatrix] - assert(cm4 === sm1CSC) - val sm1CSR = dm3.toSparse(false) - val cm5 = sm1CSR.compressed.asInstanceOf[DenseMatrix] - assert(cm5 === sm1CSR) + // force compressed row major + val cm4 = dm3.compressed(false).asInstanceOf[DenseMatrix] + assert(cm4 === dm3) + assert(cm4.isTransposed) + } + + test("sparse compressed") { + /* + sm1 = 0.0 -1.0 + 0.0 0.0 + -4.0 0.0 + 0.0 0.0 + + sm2 = 0.0 0.0 -4.0 0.0 + -1.0 0.0 0.0 0.0 + */ + val sm1 = new SparseMatrix(4, 2, Array(0, 1, 2), Array(2, 0), Array(-4.0, -1.0)) + val sm2 = sm1.transpose + + val cm1 = sm1.compressed.asInstanceOf[SparseMatrix] + // optimal is column major + assert(cm1 === sm1) + assert(!cm1.isTransposed) + assert(cm1.values.equals(sm1.values)) + + val cm2 = sm1.compressed(false).asInstanceOf[SparseMatrix] + assert(cm2 === sm1) + assert(cm2.isTransposed) + + val cm3 = sm2.compressed.asInstanceOf[SparseMatrix] + assert(cm3 === sm2) + assert(cm3.isTransposed) + assert(cm3.values.equals(sm2.values)) + + /* + sm3 = 0.0 -1.0 + 2.0 3.0 + -4.0 9.0 + */ + val sm3 = new SparseMatrix(3, 2, Array(0, 2, 5), Array(1, 2, 0, 1, 2), + Array(2.0, -4.0, -1.0, 3.0, 9.0)) + + // dense is optimal, and defaults to column major + val cm4 = sm3.compressed.asInstanceOf[DenseMatrix] + assert(cm4 === sm3) + assert(!cm4.isTransposed) + + val cm5 = sm3.compressed(false).asInstanceOf[DenseMatrix] + assert(cm5 === sm3) + assert(cm5.isTransposed) + + /* + sm4 = 1.0 0.0 + 0.0 0.0 + ... + */ + val sm4 = new SparseMatrix(Int.MaxValue, 1, Array(0, 1), Array(0), Array(4.5)) + val cm6 = sm4.compressed.asInstanceOf[SparseMatrix] + assert(cm6 === sm4) + assert(!cm6.isTransposed) + + val sm5 = new SparseMatrix(1, Int.MaxValue, Array(0, 1), Array(0), Array(4.5), + isTransposed = true) + val cm7 = sm5.compressed.asInstanceOf[SparseMatrix] + assert(cm7 === sm5) + assert(cm7.isTransposed) } test("sparse matrix construction") { @@ -208,53 +275,183 @@ class MatricesSuite extends SparkMLFunSuite { assert(!sm4.isTransposed) } - test("toSparse, toDense") { - val m = 3 - val n = 2 - val values = Array(1.0, 2.0, 4.0, 5.0) - val allValues = Array(1.0, 2.0, 0.0, 0.0, 4.0, 5.0) - val colPtrs = Array(0, 2, 4) - val rowIndices = Array(0, 1, 1, 2) + test("dense to dense") { + // dense to dense + /* + dm1 = 4.0 2.0 -8.0 + -1.0 7.0 4.0 + + dm2 = 5.0 -9.0 4.0 + 1.0 -3.0 -8.0 + */ + val dm1 = new DenseMatrix(2, 3, Array(4.0, -1.0, 2.0, 7.0, -8.0, 4.0)) + val dm2 = new DenseMatrix(2, 3, Array(5.0, -9.0, 4.0, 1.0, -3.0, -8.0), isTransposed = true) + + val dm3 = dm1.toDense + assert(dm3 === dm1) + assert(dm3.isTransposed === false) + assert(dm3.values.equals(dm1.values)) + + val dm4 = dm1.toDense(false) + assert(dm4 === dm1) + assert(dm4.isTransposed === true) + assert(dm4.values === Array(4.0, 2.0, -8.0, -1.0, 7.0, 4.0)) + + val dm5 = dm2.toDense(true) + assert(dm5 === dm2) + assert(dm5.isTransposed === false) + assert(dm5.values === Array(5.0, 1.0, -9.0, -3.0, 4.0, -8.0)) + + val dm6 = dm2.toDense(false) + assert(dm6 === dm2) + assert(dm6.isTransposed === true) + assert(dm6.values.equals(dm2.values)) + } - val spMat1 = new SparseMatrix(m, n, colPtrs, rowIndices, values) - val deMat1 = new DenseMatrix(m, n, allValues) + test("dense to sparse") { + /* + dm1 = 0.0 4.0 5.0 + 0.0 2.0 0.0 - // sparse to dense - val deMat2 = spMat1.toDense(true) - val deMat3 = spMat1.toDense(false) - val deMat4 = spMat1.toDense - assert(deMat1 === deMat2) - assert(!deMat2.isTransposed) - assert(deMat1 === deMat3) - assert(deMat3.isTransposed) - assert(deMat1 === deMat4) - assert(!deMat4.isTransposed) - - // sparse to sparse - // TODO: check sparse with explicit zeros changes - val sparseToSparse1 = spMat1.toSparse(true) - assert(sparseToSparse1 === spMat1) - assert(!sparseToSparse1.isTransposed) - val sparseToSparse2 = spMat1.toSparse(false) - - // dense to sparse - val spMat2 = deMat1.toSparse(true) - val spMat3 = deMat1.toSparse(false) - val spMat4 = deMat1.toSparse - assert(deMat1 === spMat2) - assert(!spMat2.isTransposed) - assert(deMat1 === spMat3) - assert(spMat3.isTransposed) - assert(deMat1 === spMat4) - assert(!spMat4.isTransposed) + dm2 = 0.0 4.0 5.0 + 0.0 2.0 0.0 - // dense to dense - val denseToDenseMat1 = deMat1.toDense(true) - assert(denseToDenseMat1 === deMat1) - assert(!denseToDenseMat1.isTransposed) - val denseToDenseMat2 = deMat1.toDense(false) - assert(denseToDenseMat2 === deMat1) - assert(denseToDenseMat2.isTransposed) + dm1 = 0.0 0.0 0.0 + 0.0 0.0 0.0 + */ + // dense to sparse should convert to sparse ignoring all zero entries + val dm1 = new DenseMatrix(2, 3, Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0)) + val dm2 = new DenseMatrix(2, 3, Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0), isTransposed = true) + val dm3 = new DenseMatrix(2, 3, Array(0.0, 0.0, 0.0, 0.0, 0.0, 0.0)) + + val sm1 = dm1.toSparse(true) + assert(sm1 === dm1) + assert(sm1.isTransposed === false) + assert(sm1.values === Array(4.0, 2.0, 5.0)) + + val sm2 = dm1.toSparse(false) + assert(sm2 === dm1) + assert(sm2.isTransposed === true) + assert(sm2.values === Array(4.0, 5.0, 2.0)) + + val sm3 = dm2.toSparse(true) + assert(sm3 === dm2) + assert(sm3.isTransposed === false) + assert(sm3.values === Array(4.0, 2.0, 5.0)) + + val sm4 = dm2.toSparse(false) + assert(sm4 === dm2) + assert(sm4.isTransposed === true) + assert(sm4.values === Array(4.0, 5.0, 2.0)) + + val sm5 = dm3.toSparse(true) + assert(sm5 === dm3) + assert(sm5.values === Array.empty[Double]) + + val sm6 = dm3.toSparse(false) + assert(sm6 === dm3) + assert(sm6.values === Array.empty[Double]) + } + + test("sparse to sparse") { + /* + sm1 = sm2 = sm3 = sm4 = 0.0 4.0 5.0 + 0.0 2.0 0.0 + smZeros = 0.0 0.0 0.0 + 0.0 0.0 0.0 + */ + // TODO: renaming + val sm1 = new SparseMatrix(2, 3, Array(0, 0, 2, 3), Array(0, 1, 0), Array(4.0, 2.0, 5.0)) + val sm2 = new SparseMatrix(2, 3, Array(0, 2, 3), Array(1, 2, 1), Array(4.0, 5.0, 2.0), + isTransposed = true) + val sm3 = new SparseMatrix(2, 3, Array(0, 0, 2, 4), Array(0, 1, 0, 1), + Array(4.0, 2.0, 5.0, 0.0)) + val sm12 = new SparseMatrix(2, 3, Array(0, 2, 4), Array(1, 2, 1, 2), + Array(4.0, 5.0, 2.0, 0.0), isTransposed = true) + val smZeros = new SparseMatrix(2, 3, Array(0, 2, 4, 6), Array(0, 1, 0, 1, 0, 1), + Array(0.0, 0.0, 0.0, 0.0, 0.0, 0.0)) + + val sm4 = sm1.toSparse(false) + assert(sm4 === sm1) + assert(sm4.isTransposed === true) + assert(sm4.values === Array(4.0, 5.0, 2.0)) + + val sm5 = sm1.toSparse(true) + assert(sm5 === sm1) + assert(sm5.isTransposed === false) + sm5.values(0) = 6.0 + assert(sm1.values(0) === 6.0) + + val sm6 = sm2.toSparse(true) + assert(sm6 === sm2) + assert(sm6.isTransposed === false) + assert(sm6.values === Array(4.0, 2.0, 5.0)) + + val sm7 = sm2.toSparse(false) + assert(sm7 === sm2) + assert(sm7.isTransposed === true) + assert(sm7.values.equals(sm2.values)) + + val sm8 = sm3.toSparse + assert(sm8 === sm3) + assert(sm8.values === Array(4.0, 2.0, 5.0)) + assert(sm8.isTransposed === false) + + val sm9 = sm3.toSparse(false) + assert(sm9 === sm3) + assert(sm9.values === Array(4.0, 5.0, 2.0)) + assert(sm9.isTransposed === true) + + val sm10 = sm12.toSparse(false) + assert(sm10 === sm12) + assert(sm10.values === Array(4.0, 5.0, 2.0)) + assert(sm10.isTransposed === true) + + val sm11 = sm12.toSparse + assert(sm11 === sm12) + assert(sm11.values === Array(4.0, 2.0, 5.0)) + assert(sm11.isTransposed === false) + + val sm13 = smZeros.toSparse + assert(sm13 === smZeros) + assert(sm13.values === Array.empty[Double]) + assert(!sm13.isTransposed) + } + + test("sparse to dense") { + /* + 0.0 4.0 5.0 + 0.0 2.0 0.0 + */ + val sm1 = new SparseMatrix(2, 3, Array(0, 0, 2, 3), Array(0, 1, 0), Array(4.0, 2.0, 5.0)) + val sm2 = new SparseMatrix(2, 3, Array(0, 2, 3), Array(1, 2, 1), Array(4.0, 5.0, 2.0), + isTransposed = true) + val sm3 = new SparseMatrix(2, 3, Array(0, 0, 0, 0), Array.empty[Int], Array.empty[Double]) + + val dm1 = sm1.toDense + assert(dm1 === sm1) + assert(dm1.isTransposed === false) + assert(dm1.values === Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0)) + + val dm2 = sm1.toDense(false) + assert(dm2 === sm1) + assert(dm2.isTransposed === true) + assert(dm2.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0)) + + val dm3 = sm2.toDense + assert(dm3 === sm2) + assert(dm3.isTransposed === false) + assert(dm3.values === Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0)) + + val dm4 = sm2.toDense(false) + assert(dm4 === sm2) + assert(dm4.isTransposed === true) + assert(dm4.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0)) + + val dm5 = sm3.toDense + assert(dm5 === sm3) + assert(!dm5.isTransposed) + assert(dm5.values === Array.fill(6)(0.0)) } test("map, update") { From ffdf3613c66b2606c773f6bcb80db85e525c4e4b Mon Sep 17 00:00:00 2001 From: sethah Date: Tue, 25 Oct 2016 11:22:23 -0700 Subject: [PATCH 05/21] adding some helper methods and shoring up test cases --- .../org/apache/spark/ml/linalg/Matrices.scala | 22 +- .../spark/ml/linalg/MatricesSuite.scala | 299 +++++++++--------- 2 files changed, 178 insertions(+), 143 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index 5618db800c2b9..558906b616b15 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -203,16 +203,19 @@ sealed trait Matrix extends Serializable { /** Gets the size of the dense representation of this `Matrix`. */ private[ml] def getDenseSizeInBytes: Long = { - 8L * numRows * numCols + 16L + Matrices.getDenseSize(numCols, numRows) } /** Gets the size of the minimal sparse representation of this `Matrix`. */ private[ml] def getSparseSizeInBytes(columnMajor: Boolean): Long = { val nnz = numNonzeros val numPtrs = if (columnMajor) numCols + 1L else numRows + 1L - 8L * nnz + 4L * nnz + 4L * numPtrs + 32L + Matrices.getSparseSize(nnz, numPtrs) } + /** Get the current size in bytes of this `Matrix`. Useful for testing */ + private[ml] def getSizeInBytes: Long + /** * Find the number of non-zero active values. */ @@ -443,6 +446,8 @@ class DenseMatrix @Since("2.0.0") ( } } } + + private[ml] def getSizeInBytes: Long = Matrices.getDenseSize(numCols, numRows) } /** @@ -809,6 +814,8 @@ class SparseMatrix @Since("2.0.0") ( } } } + + private[ml] def getSizeInBytes: Long = Matrices.getSparseSize(numActives, colPtrs.length) } /** @@ -1257,4 +1264,15 @@ object Matrices { SparseMatrix.fromCOO(numRows, numCols, entries) } } + + private[ml] def getSparseSize(numActives: Long, numPtrs: Long): Long = { + // 8 * values.length + 4 * rowIndices.length + 4 * colPtrs.length + 8 + 8 + 1 + 12L * numActives + 4L * numPtrs + 17L + } + + private[ml] def getDenseSize(numCols: Long, numRows: Long): Long = { + // 8 * values.length + 8 + 1 + 8L * numCols * numRows + 9L + } + } diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala index dad55c7b15a7e..82080e03fc0df 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala @@ -52,101 +52,6 @@ class MatricesSuite extends SparkMLFunSuite { } } - test("compressed dense") { - // TODO: compare before/after size in bytes? - val dm1 = new DenseMatrix(3, 4, Array.fill(2)(1.0) ++ Array.fill(10)(0.0)) - - // optimal compression layout is row major since numRows < numCols - val cm1 = dm1.compressed.asInstanceOf[SparseMatrix] - assert(cm1 === dm1) - assert(cm1.isTransposed) - - // force compressed column major - val cm1CSC = dm1.compressed(true).asInstanceOf[SparseMatrix] - assert(cm1CSC === dm1) - assert(!cm1CSC.isTransposed) - - // optimal compression layout for transpose is column major - val dm2 = dm1.transpose - val cm2 = dm2.compressed.asInstanceOf[SparseMatrix] - assert(cm2 === dm2) - assert(!cm2.isTransposed) - - val dm3 = new DenseMatrix(3, 4, Array.fill(6)(1.0) ++ Array.fill(6)(0.0)) - - // dense is optimal - val cm3 = dm3.compressed.asInstanceOf[DenseMatrix] - assert(cm3 === dm3) - assert(!cm3.isTransposed) - - // force compressed row major - val cm4 = dm3.compressed(false).asInstanceOf[DenseMatrix] - assert(cm4 === dm3) - assert(cm4.isTransposed) - } - - test("sparse compressed") { - /* - sm1 = 0.0 -1.0 - 0.0 0.0 - -4.0 0.0 - 0.0 0.0 - - sm2 = 0.0 0.0 -4.0 0.0 - -1.0 0.0 0.0 0.0 - */ - val sm1 = new SparseMatrix(4, 2, Array(0, 1, 2), Array(2, 0), Array(-4.0, -1.0)) - val sm2 = sm1.transpose - - val cm1 = sm1.compressed.asInstanceOf[SparseMatrix] - // optimal is column major - assert(cm1 === sm1) - assert(!cm1.isTransposed) - assert(cm1.values.equals(sm1.values)) - - val cm2 = sm1.compressed(false).asInstanceOf[SparseMatrix] - assert(cm2 === sm1) - assert(cm2.isTransposed) - - val cm3 = sm2.compressed.asInstanceOf[SparseMatrix] - assert(cm3 === sm2) - assert(cm3.isTransposed) - assert(cm3.values.equals(sm2.values)) - - /* - sm3 = 0.0 -1.0 - 2.0 3.0 - -4.0 9.0 - */ - val sm3 = new SparseMatrix(3, 2, Array(0, 2, 5), Array(1, 2, 0, 1, 2), - Array(2.0, -4.0, -1.0, 3.0, 9.0)) - - // dense is optimal, and defaults to column major - val cm4 = sm3.compressed.asInstanceOf[DenseMatrix] - assert(cm4 === sm3) - assert(!cm4.isTransposed) - - val cm5 = sm3.compressed(false).asInstanceOf[DenseMatrix] - assert(cm5 === sm3) - assert(cm5.isTransposed) - - /* - sm4 = 1.0 0.0 - 0.0 0.0 - ... - */ - val sm4 = new SparseMatrix(Int.MaxValue, 1, Array(0, 1), Array(0), Array(4.5)) - val cm6 = sm4.compressed.asInstanceOf[SparseMatrix] - assert(cm6 === sm4) - assert(!cm6.isTransposed) - - val sm5 = new SparseMatrix(1, Int.MaxValue, Array(0, 1), Array(0), Array(4.5), - isTransposed = true) - val cm7 = sm5.compressed.asInstanceOf[SparseMatrix] - assert(cm7 === sm5) - assert(cm7.isTransposed) - } - test("sparse matrix construction") { val m = 3 val n = 4 @@ -263,20 +168,7 @@ class MatricesSuite extends SparkMLFunSuite { assert(sparseMat.values(2) === 10.0) } - - test("tosparsesparse") { - val sm1 = new SparseMatrix(2, 3, Array(0, 1, 2, 3), Array(1, 0, 0), Array(3.0, 1, 2)) - val sm3 = new SparseMatrix(2, 3, Array(0, 2, 3), Array(1, 2, 0), Array(1.0, 2.0, 3.0), true) - val sm2 = sm1.toSparse(false) - assert(sm2 === sm1) - assert(sm2.isTransposed) - val sm4 = sm3.toSparse(true) - assert(sm4 === sm3) - assert(!sm4.isTransposed) - } - test("dense to dense") { - // dense to dense /* dm1 = 4.0 2.0 -8.0 -1.0 7.0 4.0 @@ -319,7 +211,6 @@ class MatricesSuite extends SparkMLFunSuite { dm1 = 0.0 0.0 0.0 0.0 0.0 0.0 */ - // dense to sparse should convert to sparse ignoring all zero entries val dm1 = new DenseMatrix(2, 3, Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0)) val dm2 = new DenseMatrix(2, 3, Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0), isTransposed = true) val dm3 = new DenseMatrix(2, 3, Array(0.0, 0.0, 0.0, 0.0, 0.0, 0.0)) @@ -360,57 +251,56 @@ class MatricesSuite extends SparkMLFunSuite { smZeros = 0.0 0.0 0.0 0.0 0.0 0.0 */ - // TODO: renaming val sm1 = new SparseMatrix(2, 3, Array(0, 0, 2, 3), Array(0, 1, 0), Array(4.0, 2.0, 5.0)) val sm2 = new SparseMatrix(2, 3, Array(0, 2, 3), Array(1, 2, 1), Array(4.0, 5.0, 2.0), isTransposed = true) val sm3 = new SparseMatrix(2, 3, Array(0, 0, 2, 4), Array(0, 1, 0, 1), Array(4.0, 2.0, 5.0, 0.0)) - val sm12 = new SparseMatrix(2, 3, Array(0, 2, 4), Array(1, 2, 1, 2), + val sm4 = new SparseMatrix(2, 3, Array(0, 2, 4), Array(1, 2, 1, 2), Array(4.0, 5.0, 2.0, 0.0), isTransposed = true) val smZeros = new SparseMatrix(2, 3, Array(0, 2, 4, 6), Array(0, 1, 0, 1, 0, 1), Array(0.0, 0.0, 0.0, 0.0, 0.0, 0.0)) - val sm4 = sm1.toSparse(false) - assert(sm4 === sm1) - assert(sm4.isTransposed === true) - assert(sm4.values === Array(4.0, 5.0, 2.0)) - - val sm5 = sm1.toSparse(true) + val sm5 = sm1.toSparse(false) assert(sm5 === sm1) - assert(sm5.isTransposed === false) - sm5.values(0) = 6.0 - assert(sm1.values(0) === 6.0) + assert(sm5.isTransposed === true) + assert(sm5.values === Array(4.0, 5.0, 2.0)) - val sm6 = sm2.toSparse(true) - assert(sm6 === sm2) + val sm6 = sm1.toSparse(true) + assert(sm6 === sm1) assert(sm6.isTransposed === false) - assert(sm6.values === Array(4.0, 2.0, 5.0)) + sm6.values(0) = 6.0 + assert(sm1.values(0) === 6.0) - val sm7 = sm2.toSparse(false) + val sm7 = sm2.toSparse(true) assert(sm7 === sm2) - assert(sm7.isTransposed === true) - assert(sm7.values.equals(sm2.values)) + assert(sm7.isTransposed === false) + assert(sm7.values === Array(4.0, 2.0, 5.0)) - val sm8 = sm3.toSparse - assert(sm8 === sm3) - assert(sm8.values === Array(4.0, 2.0, 5.0)) - assert(sm8.isTransposed === false) + val sm8 = sm2.toSparse(false) + assert(sm8 === sm2) + assert(sm8.isTransposed === true) + assert(sm8.values.equals(sm2.values)) - val sm9 = sm3.toSparse(false) + val sm9 = sm3.toSparse assert(sm9 === sm3) - assert(sm9.values === Array(4.0, 5.0, 2.0)) - assert(sm9.isTransposed === true) + assert(sm9.values === Array(4.0, 2.0, 5.0)) + assert(sm9.isTransposed === false) - val sm10 = sm12.toSparse(false) - assert(sm10 === sm12) + val sm10 = sm3.toSparse(false) + assert(sm10 === sm3) assert(sm10.values === Array(4.0, 5.0, 2.0)) assert(sm10.isTransposed === true) - val sm11 = sm12.toSparse - assert(sm11 === sm12) - assert(sm11.values === Array(4.0, 2.0, 5.0)) - assert(sm11.isTransposed === false) + val sm11 = sm4.toSparse(false) + assert(sm11 === sm4) + assert(sm11.values === Array(4.0, 5.0, 2.0)) + assert(sm11.isTransposed === true) + + val sm12 = sm4.toSparse + assert(sm12 === sm4) + assert(sm12.values === Array(4.0, 2.0, 5.0)) + assert(sm12.isTransposed === false) val sm13 = smZeros.toSparse assert(sm13 === smZeros) @@ -420,8 +310,11 @@ class MatricesSuite extends SparkMLFunSuite { test("sparse to dense") { /* - 0.0 4.0 5.0 - 0.0 2.0 0.0 + sm1 = sm2 = 0.0 4.0 5.0 + 0.0 2.0 0.0 + + sm3 = 0.0 0.0 0.0 + 0.0 0.0 0.0 */ val sm1 = new SparseMatrix(2, 3, Array(0, 0, 2, 3), Array(0, 1, 0), Array(4.0, 2.0, 5.0)) val sm2 = new SparseMatrix(2, 3, Array(0, 2, 3), Array(1, 2, 1), Array(4.0, 5.0, 2.0), @@ -454,6 +347,130 @@ class MatricesSuite extends SparkMLFunSuite { assert(dm5.values === Array.fill(6)(0.0)) } + test("compressed dense") { + /* + dm1 = 1.0 0.0 0.0 0.0 + 1.0 0.0 0.0 0.0 + 0.0 0.0 0.0 0.0 + */ + // this should compress to a sparse matrix + val dm1 = new DenseMatrix(3, 4, Array.fill(2)(1.0) ++ Array.fill(10)(0.0)) + + // optimal compression layout is row major since numRows < numCols + val cm1 = dm1.compressed.asInstanceOf[SparseMatrix] + assert(cm1 === dm1) + assert(cm1.isTransposed) + assert(cm1.getSizeInBytes <= dm1.getSizeInBytes) + + // force compressed column major + val cm2 = dm1.compressed(true).asInstanceOf[SparseMatrix] + assert(cm2 === dm1) + assert(!cm2.isTransposed) + assert(cm2.getSizeInBytes <= dm1.getSizeInBytes) + + // optimal compression layout for transpose is column major + val dm2 = dm1.transpose + val cm3 = dm2.compressed.asInstanceOf[SparseMatrix] + assert(cm3 === dm2) + assert(!cm3.isTransposed) + assert(cm3.getSizeInBytes <= dm2.getSizeInBytes) + + /* + dm3 = 1.0 1.0 1.0 0.0 + 1.0 1.0 0.0 0.0 + 1.0 1.0 0.0 0.0 + */ + // this should compress to a dense matrix + val dm3 = new DenseMatrix(3, 4, Array.fill(7)(1.0) ++ Array.fill(5)(0.0)) + + val cm4 = dm3.compressed.asInstanceOf[DenseMatrix] + assert(cm4 === dm3) + assert(!cm4.isTransposed) + assert(cm4.getSizeInBytes <= dm3.getSizeInBytes) + + // force compressed row major + val cm5 = dm3.compressed(false).asInstanceOf[DenseMatrix] + assert(cm5 === dm3) + assert(cm5.isTransposed) + assert(cm5.getSizeInBytes <= dm3.getSizeInBytes) + } + + test("compressed sparse") { + /* + sm1 = 0.0 -1.0 + 0.0 0.0 + -4.0 0.0 + 0.0 0.0 + + sm2 = 0.0 0.0 -4.0 0.0 + -1.0 0.0 0.0 0.0 + */ + // these should compress to sparse matrices + val sm1 = new SparseMatrix(4, 2, Array(0, 1, 2), Array(2, 0), Array(-4.0, -1.0)) + val sm2 = sm1.transpose + + val cm1 = sm1.compressed.asInstanceOf[SparseMatrix] + // optimal is column major + assert(cm1 === sm1) + assert(!cm1.isTransposed) + assert(cm1.values.equals(sm1.values)) + assert(cm1.getSizeInBytes <= sm1.getSizeInBytes) + + val cm2 = sm1.compressed(false).asInstanceOf[SparseMatrix] + assert(cm2 === sm1) + assert(cm2.isTransposed) + // forced to be row major, so we have increased the size + assert(cm2.getSizeInBytes > sm1.getSizeInBytes) + assert(cm2.getSizeInBytes <= sm1.toDense.getSizeInBytes) + + val cm3 = sm2.compressed.asInstanceOf[SparseMatrix] + assert(cm3 === sm2) + assert(cm3.isTransposed) + assert(cm3.values.equals(sm2.values)) + assert(cm3.getSizeInBytes <= sm2.getSizeInBytes) + + /* + sm3 = 0.0 -1.0 + 2.0 3.0 + -4.0 9.0 + */ + // this should compress to a dense matrix + val sm3 = new SparseMatrix(3, 2, Array(0, 2, 5), Array(1, 2, 0, 1, 2), + Array(2.0, -4.0, -1.0, 3.0, 9.0)) + + // dense is optimal, and defaults to column major + val cm4 = sm3.compressed.asInstanceOf[DenseMatrix] + assert(cm4 === sm3) + assert(!cm4.isTransposed) + assert(cm4.getSizeInBytes <= sm3.getSizeInBytes) + + val cm5 = sm3.compressed(false).asInstanceOf[DenseMatrix] + assert(cm5 === sm3) + assert(cm5.isTransposed) + assert(cm5.getSizeInBytes <= sm3.getSizeInBytes) + + /* + sm4 = 1.0 0.0 0.0 ... + + sm5 = 1.0 + 0.0 + 0.0 + ... + */ + val sm4 = new SparseMatrix(Int.MaxValue, 1, Array(0, 1), Array(0), Array(4.5)) + val cm6 = sm4.compressed.asInstanceOf[SparseMatrix] + assert(cm6 === sm4) + assert(!cm6.isTransposed) + assert(cm6.getSizeInBytes <= sm4.getSizeInBytes) + + val sm5 = new SparseMatrix(1, Int.MaxValue, Array(0, 1), Array(0), Array(4.5), + isTransposed = true) + val cm7 = sm5.compressed.asInstanceOf[SparseMatrix] + assert(cm7 === sm5) + assert(cm7.isTransposed) + assert(cm7.getSizeInBytes <= sm5.getSizeInBytes) + } + test("map, update") { val m = 3 val n = 2 From 3a14f2d77d95f17ab72662ff6e73edb8ffa9bfdf Mon Sep 17 00:00:00 2001 From: sethah Date: Tue, 25 Oct 2016 12:34:01 -0700 Subject: [PATCH 06/21] cleanup --- .../org/apache/spark/ml/linalg/Matrices.scala | 26 +++---- .../spark/ml/linalg/MatricesSuite.scala | 69 ++++++++++--------- .../apache/spark/ml/linalg/VectorsSuite.scala | 5 ++ 3 files changed, 55 insertions(+), 45 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index 558906b616b15..4f5295c2e8628 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -185,8 +185,9 @@ sealed trait Matrix extends Serializable { } /** - * Returns a matrix in dense column major, sparse row major, or sparse column major format, - * whichever uses less storage. + * Returns a matrix in dense column major, dense row major, sparse row major, or sparse column + * major format, whichever uses less storage. When dense representation is optimal, it maintains + * the current layout order. */ @Since("2.1.0") def compressed: Matrix = { @@ -194,8 +195,8 @@ sealed trait Matrix extends Serializable { val csrSize = getSparseSizeInBytes(false) val minSparseSize = cscSize.min(csrSize) if (getDenseSizeInBytes < minSparseSize) { - // the size is the same either way, so default to column major - toDense(true) + // size is the same either way, so maintain current layout + if (isTransposed) toDense(false) else toDense(true) } else { if (cscSize == minSparseSize) toSparse(true) else toSparse(false) } @@ -370,7 +371,7 @@ class DenseMatrix @Since("2.0.0") ( */ @Since("2.1.0") def toSparse(columnMajor: Boolean): SparseMatrix = { - if (!columnMajor) this.transpose.toSparse.transpose + if (!columnMajor) this.transpose.toSparse(columnMajor = true).transpose else { val spVals: MArrayBuilder[Double] = new MArrayBuilder.ofDouble val colPtrs: Array[Int] = new Array[Int](numCols + 1) @@ -399,7 +400,7 @@ class DenseMatrix @Since("2.0.0") ( * Generate a `DenseMatrix` from this `DenseMatrix`. */ @Since("2.1.0") - def toDense: DenseMatrix = toDense(true) + def toDense: DenseMatrix = toDense(columnMajor = true) /** * Generate a `DenseMatrix` from this `DenseMatrix`. @@ -649,6 +650,7 @@ class SparseMatrix @Since("2.0.0") ( @Since("2.1.0") def toSparse(columnMajor: Boolean): SparseMatrix = { if (!(columnMajor ^ isTransposed)) { + // breeze transpose rearranges values in column major and removes explicit zeros if (!isTransposed) { // it is row major and we want col major val breezeTransposed = asBreeze.asInstanceOf[BSM[Double]].t @@ -665,17 +667,17 @@ class SparseMatrix @Since("2.0.0") ( val vv = new Array[Double](nnz) val numPtrs = if (isTransposed) numRows else numCols val cc = new Array[Int](numPtrs + 1) - var vidx = 0 + var nzIdx = 0 var j = 0 while (j < numPtrs) { var idx = colPtrs(j) val idxEnd = colPtrs(j + 1) - cc(j) = vidx + cc(j) = nzIdx while (idx < idxEnd) { if (values(idx) != 0.0) { - vv(vidx) = values(idx) - rr(vidx) = rowIndices(idx) - vidx += 1 + vv(nzIdx) = values(idx) + rr(nzIdx) = rowIndices(idx) + nzIdx += 1 } idx += 1 } @@ -771,7 +773,7 @@ class SparseMatrix @Since("2.0.0") ( /** * Generate a `DenseMatrix` from the given `SparseMatrix`. * - * @param columnMajor Whether the resulting [[DenseMatrix]] values are in column major order. + * @param columnMajor Whether the resulting `DenseMatrix` values are in column major order. */ @Since("2.1.0") override def toDense(columnMajor: Boolean): DenseMatrix = { diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala index 82080e03fc0df..83613e4eab7ae 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala @@ -28,14 +28,6 @@ import org.apache.spark.ml.SparkMLFunSuite import org.apache.spark.ml.util.TestingUtils._ class MatricesSuite extends SparkMLFunSuite { - def computeDenseSpace(length: Int): Int = { - 12 * length + 8 - } - def computeSparseSpace(numNonzeros: Int, numRows: Int, - numCols: Int, columnMajor: Boolean): Int = { - 12 * numNonzeros + 4 * (if (columnMajor) numCols + 1 else numRows + 1) - } - test("dense matrix construction") { val m = 3 val n = 2 @@ -181,22 +173,22 @@ class MatricesSuite extends SparkMLFunSuite { val dm3 = dm1.toDense assert(dm3 === dm1) - assert(dm3.isTransposed === false) + assert(!dm3.isTransposed) assert(dm3.values.equals(dm1.values)) val dm4 = dm1.toDense(false) assert(dm4 === dm1) - assert(dm4.isTransposed === true) + assert(dm4.isTransposed) assert(dm4.values === Array(4.0, 2.0, -8.0, -1.0, 7.0, 4.0)) val dm5 = dm2.toDense(true) assert(dm5 === dm2) - assert(dm5.isTransposed === false) + assert(!dm5.isTransposed) assert(dm5.values === Array(5.0, 1.0, -9.0, -3.0, 4.0, -8.0)) val dm6 = dm2.toDense(false) assert(dm6 === dm2) - assert(dm6.isTransposed === true) + assert(dm6.isTransposed) assert(dm6.values.equals(dm2.values)) } @@ -217,22 +209,22 @@ class MatricesSuite extends SparkMLFunSuite { val sm1 = dm1.toSparse(true) assert(sm1 === dm1) - assert(sm1.isTransposed === false) + assert(!sm1.isTransposed) assert(sm1.values === Array(4.0, 2.0, 5.0)) val sm2 = dm1.toSparse(false) assert(sm2 === dm1) - assert(sm2.isTransposed === true) + assert(sm2.isTransposed) assert(sm2.values === Array(4.0, 5.0, 2.0)) val sm3 = dm2.toSparse(true) assert(sm3 === dm2) - assert(sm3.isTransposed === false) + assert(!sm3.isTransposed) assert(sm3.values === Array(4.0, 2.0, 5.0)) val sm4 = dm2.toSparse(false) assert(sm4 === dm2) - assert(sm4.isTransposed === true) + assert(sm4.isTransposed) assert(sm4.values === Array(4.0, 5.0, 2.0)) val sm5 = dm3.toSparse(true) @@ -263,44 +255,43 @@ class MatricesSuite extends SparkMLFunSuite { val sm5 = sm1.toSparse(false) assert(sm5 === sm1) - assert(sm5.isTransposed === true) + assert(sm5.isTransposed) assert(sm5.values === Array(4.0, 5.0, 2.0)) val sm6 = sm1.toSparse(true) assert(sm6 === sm1) - assert(sm6.isTransposed === false) - sm6.values(0) = 6.0 - assert(sm1.values(0) === 6.0) + assert(!sm6.isTransposed) + assert(sm6.values.equals(sm1.values)) val sm7 = sm2.toSparse(true) assert(sm7 === sm2) - assert(sm7.isTransposed === false) + assert(!sm7.isTransposed) assert(sm7.values === Array(4.0, 2.0, 5.0)) val sm8 = sm2.toSparse(false) assert(sm8 === sm2) - assert(sm8.isTransposed === true) + assert(sm8.isTransposed) assert(sm8.values.equals(sm2.values)) val sm9 = sm3.toSparse assert(sm9 === sm3) assert(sm9.values === Array(4.0, 2.0, 5.0)) - assert(sm9.isTransposed === false) + assert(!sm9.isTransposed) val sm10 = sm3.toSparse(false) assert(sm10 === sm3) assert(sm10.values === Array(4.0, 5.0, 2.0)) - assert(sm10.isTransposed === true) + assert(sm10.isTransposed) val sm11 = sm4.toSparse(false) assert(sm11 === sm4) assert(sm11.values === Array(4.0, 5.0, 2.0)) - assert(sm11.isTransposed === true) + assert(sm11.isTransposed) val sm12 = sm4.toSparse assert(sm12 === sm4) assert(sm12.values === Array(4.0, 2.0, 5.0)) - assert(sm12.isTransposed === false) + assert(!sm12.isTransposed) val sm13 = smZeros.toSparse assert(sm13 === smZeros) @@ -323,22 +314,22 @@ class MatricesSuite extends SparkMLFunSuite { val dm1 = sm1.toDense assert(dm1 === sm1) - assert(dm1.isTransposed === false) + assert(!dm1.isTransposed) assert(dm1.values === Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0)) val dm2 = sm1.toDense(false) assert(dm2 === sm1) - assert(dm2.isTransposed === true) + assert(dm2.isTransposed) assert(dm2.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0)) val dm3 = sm2.toDense assert(dm3 === sm2) - assert(dm3.isTransposed === false) + assert(!dm3.isTransposed) assert(dm3.values === Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0)) val dm4 = sm2.toDense(false) assert(dm4 === sm2) - assert(dm4.isTransposed === true) + assert(dm4.isTransposed) assert(dm4.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0)) val dm5 = sm3.toDense @@ -379,13 +370,19 @@ class MatricesSuite extends SparkMLFunSuite { dm3 = 1.0 1.0 1.0 0.0 1.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 + + dm4 = 1.0 1.0 1.0 1.0 + 1.0 1.0 1.0 0.0 + 0.0 0.0 0.0 0.0 */ // this should compress to a dense matrix val dm3 = new DenseMatrix(3, 4, Array.fill(7)(1.0) ++ Array.fill(5)(0.0)) + val dm4 = new DenseMatrix(3, 4, Array.fill(7)(1.0) ++ Array.fill(5)(0.0), isTransposed = true) val cm4 = dm3.compressed.asInstanceOf[DenseMatrix] assert(cm4 === dm3) assert(!cm4.isTransposed) + assert(cm4.values.equals(dm3.values)) assert(cm4.getSizeInBytes <= dm3.getSizeInBytes) // force compressed row major @@ -393,6 +390,12 @@ class MatricesSuite extends SparkMLFunSuite { assert(cm5 === dm3) assert(cm5.isTransposed) assert(cm5.getSizeInBytes <= dm3.getSizeInBytes) + + val cm6 = dm4.compressed.asInstanceOf[DenseMatrix] + assert(cm6 === dm4) + assert(cm6.isTransposed) + assert(cm6.values.equals(dm4.values)) + assert(cm6.getSizeInBytes <= dm4.getSizeInBytes) } test("compressed sparse") { @@ -438,7 +441,7 @@ class MatricesSuite extends SparkMLFunSuite { val sm3 = new SparseMatrix(3, 2, Array(0, 2, 5), Array(1, 2, 0, 1, 2), Array(2.0, -4.0, -1.0, 3.0, 9.0)) - // dense is optimal, and defaults to column major + // dense is optimal, and maintains column major val cm4 = sm3.compressed.asInstanceOf[DenseMatrix] assert(cm4 === sm3) assert(!cm4.isTransposed) @@ -457,13 +460,13 @@ class MatricesSuite extends SparkMLFunSuite { 0.0 ... */ - val sm4 = new SparseMatrix(Int.MaxValue, 1, Array(0, 1), Array(0), Array(4.5)) + val sm4 = new SparseMatrix(Int.MaxValue, 1, Array(0, 1), Array(0), Array(1.0)) val cm6 = sm4.compressed.asInstanceOf[SparseMatrix] assert(cm6 === sm4) assert(!cm6.isTransposed) assert(cm6.getSizeInBytes <= sm4.getSizeInBytes) - val sm5 = new SparseMatrix(1, Int.MaxValue, Array(0, 1), Array(0), Array(4.5), + val sm5 = new SparseMatrix(1, Int.MaxValue, Array(0, 1), Array(0), Array(1.0), isTransposed = true) val cm7 = sm5.compressed.asInstanceOf[SparseMatrix] assert(cm7 === sm5) diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/VectorsSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/VectorsSuite.scala index ea22c2787fb3c..dfbdaf19d374b 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/VectorsSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/VectorsSuite.scala @@ -336,6 +336,11 @@ class VectorsSuite extends SparkMLFunSuite { val sv1 = Vectors.sparse(4, Array(0, 1, 2), Array(1.0, 2.0, 3.0)) val sv1c = sv1.compressed.asInstanceOf[DenseVector] assert(sv1 === sv1c) + + val sv2 = Vectors.sparse(Int.MaxValue, Array(0), Array(3.4)) + val sv2c = sv2.compressed.asInstanceOf[SparseVector] + assert(sv2c === sv2) + assert(sv2c.numActives === 1) } test("SparseVector.slice") { From d226ccbf7feba6425150719cc6e9b6ddba1af7ea Mon Sep 17 00:00:00 2001 From: sethah Date: Tue, 25 Oct 2016 12:51:07 -0700 Subject: [PATCH 07/21] minor cleanup --- .../scala/org/apache/spark/ml/linalg/MatricesSuite.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala index 83613e4eab7ae..d2e6d993c9cbc 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala @@ -200,7 +200,7 @@ class MatricesSuite extends SparkMLFunSuite { dm2 = 0.0 4.0 5.0 0.0 2.0 0.0 - dm1 = 0.0 0.0 0.0 + dm3 = 0.0 0.0 0.0 0.0 0.0 0.0 */ val dm1 = new DenseMatrix(2, 3, Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0)) @@ -230,10 +230,12 @@ class MatricesSuite extends SparkMLFunSuite { val sm5 = dm3.toSparse(true) assert(sm5 === dm3) assert(sm5.values === Array.empty[Double]) + assert(!sm5.isTransposed) val sm6 = dm3.toSparse(false) assert(sm6 === dm3) assert(sm6.values === Array.empty[Double]) + assert(sm6.isTransposed) } test("sparse to sparse") { @@ -343,6 +345,10 @@ class MatricesSuite extends SparkMLFunSuite { dm1 = 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 + + dm2 = 1.0 1.0 0.0 0.0 + 0.0 0.0 0.0 0.0 + 0.0 0.0 0.0 0.0 */ // this should compress to a sparse matrix val dm1 = new DenseMatrix(3, 4, Array.fill(2)(1.0) ++ Array.fill(10)(0.0)) From e034eb944a8cc943d60e3003c5d72d29645319c7 Mon Sep 17 00:00:00 2001 From: sethah Date: Tue, 25 Oct 2016 13:19:00 -0700 Subject: [PATCH 08/21] minor syntactical changes --- .../main/scala/org/apache/spark/ml/linalg/Matrices.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index 4f5295c2e8628..56f0d3049ce8e 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -191,14 +191,14 @@ sealed trait Matrix extends Serializable { */ @Since("2.1.0") def compressed: Matrix = { - val cscSize = getSparseSizeInBytes(true) - val csrSize = getSparseSizeInBytes(false) + val cscSize = getSparseSizeInBytes(columnMajor = true) + val csrSize = getSparseSizeInBytes(columnMajor = false) val minSparseSize = cscSize.min(csrSize) if (getDenseSizeInBytes < minSparseSize) { // size is the same either way, so maintain current layout - if (isTransposed) toDense(false) else toDense(true) + toDense(!isTransposed) } else { - if (cscSize == minSparseSize) toSparse(true) else toSparse(false) + if (cscSize == minSparseSize) toSparse(columnMajor = true) else toSparse(columnMajor = false) } } From 0dcada7191b2035f824f81ecd6c9d864da316762 Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 31 Oct 2016 11:00:50 -0700 Subject: [PATCH 09/21] add overrides, rearrange code, minor cleanups --- .../org/apache/spark/ml/linalg/Matrices.scala | 194 +++++++++--------- 1 file changed, 93 insertions(+), 101 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index 56f0d3049ce8e..eb11ae889946f 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -150,6 +150,18 @@ sealed trait Matrix extends Serializable { */ private[spark] def foreachActive(f: (Int, Int, Double) => Unit) + /** + * Find the number of non-zero active values. + */ + @Since("2.0.0") + def numNonzeros: Int + + /** + * Find the number of values stored explicitly. These values can be zero as well. + */ + @Since("2.0.0") + def numActives: Int + /** * Converts this matrix to a sparse matrix. * @@ -168,7 +180,6 @@ sealed trait Matrix extends Serializable { @Since("2.1.0") def toDense(columnMajor: Boolean): DenseMatrix - /** * Returns a matrix in either dense or sparse format, whichever uses less storage. * @@ -214,20 +225,8 @@ sealed trait Matrix extends Serializable { Matrices.getSparseSize(nnz, numPtrs) } - /** Get the current size in bytes of this `Matrix`. Useful for testing */ + /** Gets the current size in bytes of this `Matrix`. Useful for testing */ private[ml] def getSizeInBytes: Long - - /** - * Find the number of non-zero active values. - */ - @Since("2.0.0") - def numNonzeros: Int - - /** - * Find the number of values stored explicitly. These values can be zero as well. - */ - @Since("2.0.0") - def numActives: Int } /** @@ -358,8 +357,7 @@ class DenseMatrix @Since("2.0.0") ( override def numActives: Int = values.length /** - * Generate a `SparseMatrix` from the given `DenseMatrix`. The new matrix will have isTransposed - * set to false. + * Generate a `SparseMatrix` from the given `DenseMatrix` in column major order. */ @Since("2.0.0") def toSparse: SparseMatrix = toSparse(columnMajor = true) @@ -370,7 +368,7 @@ class DenseMatrix @Since("2.0.0") ( * @param columnMajor Whether the resulting `SparseMatrix` values will be in column major order. */ @Since("2.1.0") - def toSparse(columnMajor: Boolean): SparseMatrix = { + override def toSparse(columnMajor: Boolean): SparseMatrix = { if (!columnMajor) this.transpose.toSparse(columnMajor = true).transpose else { val spVals: MArrayBuilder[Double] = new MArrayBuilder.ofDouble @@ -397,7 +395,7 @@ class DenseMatrix @Since("2.0.0") ( } /** - * Generate a `DenseMatrix` from this `DenseMatrix`. + * Generate a `DenseMatrix` from this `DenseMatrix` in column major order. */ @Since("2.1.0") def toDense: DenseMatrix = toDense(columnMajor = true) @@ -408,27 +406,22 @@ class DenseMatrix @Since("2.0.0") ( * @param columnMajor Whether the resulting `DenseMatrix` values will be in column major order. */ @Since("2.1.0") - def toDense(columnMajor: Boolean): DenseMatrix = { + override def toDense(columnMajor: Boolean): DenseMatrix = { if (!(isTransposed ^ columnMajor)) { - if (isTransposed) { - // it is row major and we want column major - val newValues = Array.fill[Double](numCols * numRows)(0.0) - var j = 0 - while (j < numCols * numRows) { - newValues(j / numCols + (j % numCols) * numRows) = values(j) - j += 1 - } - new DenseMatrix(numRows, numCols, newValues, isTransposed = false) - } else { - // it is col major and we want row major - val newValues = Array.fill[Double](values.length)(0.0) - var j = 0 - while (j < numCols * numRows) { - newValues(j / numRows + (j % numRows) * numCols) = values(j) - j += 1 + val newValues = new Array[Double](numCols * numRows) + var j = 0 + while (j < numCols * numRows) { + val newIndex = if (isTransposed) { + // it is row major and we want column major + j / numCols + (j % numCols) * numRows + } else { + // it is column major and we want row major + j / numRows + (j % numRows) * numCols } - new DenseMatrix(numRows, numCols, newValues, isTransposed = true) + newValues(newIndex) = values(j) + j += 1 } + new DenseMatrix(numRows, numCols, newValues, isTransposed = !isTransposed) } else { this } @@ -633,64 +626,6 @@ class SparseMatrix @Since("2.0.0") ( } } - /** - * Generate a `SparseMatrix` from this `SparseMatrix`, removing explicit zero values if they - * exist. The resulting `SparseMatrix` will have `isTransposed` set to false. - */ - @Since("2.1.0") - def toSparse: SparseMatrix = toSparse(true) - - /** - * Generate a `SparseMatrix` from this `SparseMatrix`, removing explicit zero values if they - * exist. - * - * @param columnMajor Whether or not the resulting `SparseMatrix` values are in column major - * order. - */ - @Since("2.1.0") - def toSparse(columnMajor: Boolean): SparseMatrix = { - if (!(columnMajor ^ isTransposed)) { - // breeze transpose rearranges values in column major and removes explicit zeros - if (!isTransposed) { - // it is row major and we want col major - val breezeTransposed = asBreeze.asInstanceOf[BSM[Double]].t - Matrices.fromBreeze(breezeTransposed).transpose.asInstanceOf[SparseMatrix] - } else { - // it is col major and we want row major - val breezeTransposed = asBreeze.asInstanceOf[BSM[Double]] - Matrices.fromBreeze(breezeTransposed).asInstanceOf[SparseMatrix] - } - } else { - val nnz = numNonzeros - if (nnz != numActives) { - val rr = new Array[Int](nnz) - val vv = new Array[Double](nnz) - val numPtrs = if (isTransposed) numRows else numCols - val cc = new Array[Int](numPtrs + 1) - var nzIdx = 0 - var j = 0 - while (j < numPtrs) { - var idx = colPtrs(j) - val idxEnd = colPtrs(j + 1) - cc(j) = nzIdx - while (idx < idxEnd) { - if (values(idx) != 0.0) { - vv(nzIdx) = values(idx) - rr(nzIdx) = rowIndices(idx) - nzIdx += 1 - } - idx += 1 - } - j += 1 - } - cc(j) = nnz - new SparseMatrix(numRows, numCols, cc, rr, vv, isTransposed = isTransposed) - } else { - this - } - } - } - override def apply(i: Int, j: Int): Double = { val ind = index(i, j) if (ind < 0) 0.0 else values(ind) @@ -763,12 +698,73 @@ class SparseMatrix @Since("2.0.0") ( } } + override def numNonzeros: Int = values.count(_ != 0) + + override def numActives: Int = values.length + + /** + * Generate a `SparseMatrix` from this `SparseMatrix` in column major, removing explicit zero + * values if they exist. + */ + @Since("2.1.0") + def toSparse: SparseMatrix = toSparse(columnMajor = true) + + /** + * Generate a `SparseMatrix` from this `SparseMatrix`, removing explicit zero values if they + * exist. + * + * @param columnMajor Whether or not the resulting `SparseMatrix` values are in column major + * order. + */ + @Since("2.1.0") + override def toSparse(columnMajor: Boolean): SparseMatrix = { + if (!(columnMajor ^ isTransposed)) { + // breeze transpose rearranges values in column major and removes explicit zeros + if (!isTransposed) { + // it is row major and we want col major + val breezeTransposed = asBreeze.asInstanceOf[BSM[Double]].t + Matrices.fromBreeze(breezeTransposed).transpose.asInstanceOf[SparseMatrix] + } else { + // it is col major and we want row major + val breezeTransposed = asBreeze.asInstanceOf[BSM[Double]] + Matrices.fromBreeze(breezeTransposed).asInstanceOf[SparseMatrix] + } + } else { + val nnz = numNonzeros + if (nnz != numActives) { + val rr = new Array[Int](nnz) + val vv = new Array[Double](nnz) + val numPtrs = if (isTransposed) numRows else numCols + val cc = new Array[Int](numPtrs + 1) + var nzIdx = 0 + var j = 0 + while (j < numPtrs) { + var idx = colPtrs(j) + val idxEnd = colPtrs(j + 1) + cc(j) = nzIdx + while (idx < idxEnd) { + if (values(idx) != 0.0) { + vv(nzIdx) = values(idx) + rr(nzIdx) = rowIndices(idx) + nzIdx += 1 + } + idx += 1 + } + j += 1 + } + cc(j) = nnz + new SparseMatrix(numRows, numCols, cc, rr, vv, isTransposed = isTransposed) + } else { + this + } + } + } + /** - * Generate a `DenseMatrix` from the given `SparseMatrix`. The new matrix will have isTransposed - * set to false. + * Generate a `DenseMatrix` from the given `SparseMatrix` in column major order. */ @Since("2.0.0") - def toDense: DenseMatrix = toDense(true) + def toDense: DenseMatrix = toDense(columnMajor = true) /** * Generate a `DenseMatrix` from the given `SparseMatrix`. @@ -781,10 +777,6 @@ class SparseMatrix @Since("2.0.0") ( else new DenseMatrix(numRows, numCols, this.transpose.toArray, isTransposed = true) } - override def numNonzeros: Int = values.count(_ != 0) - - override def numActives: Int = values.length - override def colIter: Iterator[Vector] = { if (isTransposed) { val indicesArray = Array.fill(numCols)(MArrayBuilder.make[Int]) From 2d3c4df8db25da343ea806fc04213fb2cff64801 Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 31 Oct 2016 11:10:21 -0700 Subject: [PATCH 10/21] move toDense and toSparse to trait --- .../org/apache/spark/ml/linalg/Matrices.scala | 65 +++++++------------ .../spark/ml/linalg/MatricesSuite.scala | 34 +++++----- 2 files changed, 42 insertions(+), 57 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index eb11ae889946f..9efa5bc08d8a0 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -168,8 +168,13 @@ sealed trait Matrix extends Serializable { * @param columnMajor Whether the values of the resulting sparse matrix should be in column major * or row major order. If `false`, resulting matrix will be row major. */ + private[ml] def toSparseMatrix(columnMajor: Boolean): SparseMatrix + + /** + * Converts this matrix to a sparse matrix in column major order. + */ @Since("2.1.0") - def toSparse(columnMajor: Boolean): SparseMatrix + def toSparse: SparseMatrix = toSparseMatrix(columnMajor = true) /** * Converts this matrix to a dense matrix. @@ -177,8 +182,13 @@ sealed trait Matrix extends Serializable { * @param columnMajor Whether the values of the resulting dense matrix should be in column major * or row major order. If `false`, resulting matrix will be row major. */ + private [ml] def toDenseMatrix(columnMajor: Boolean): DenseMatrix + + /** + * Converts this matrix to a dense matrix in column major order. + */ @Since("2.1.0") - def toDense(columnMajor: Boolean): DenseMatrix + def toDense: DenseMatrix = toDenseMatrix(columnMajor = true) /** * Returns a matrix in either dense or sparse format, whichever uses less storage. @@ -189,9 +199,9 @@ sealed trait Matrix extends Serializable { @Since("2.1.0") def compressed(columnMajor: Boolean): Matrix = { if (getDenseSizeInBytes < getSparseSizeInBytes(columnMajor)) { - toDense(columnMajor) + toDenseMatrix(columnMajor) } else { - toSparse(columnMajor) + toSparseMatrix(columnMajor) } } @@ -207,9 +217,13 @@ sealed trait Matrix extends Serializable { val minSparseSize = cscSize.min(csrSize) if (getDenseSizeInBytes < minSparseSize) { // size is the same either way, so maintain current layout - toDense(!isTransposed) + toDenseMatrix(!isTransposed) } else { - if (cscSize == minSparseSize) toSparse(columnMajor = true) else toSparse(columnMajor = false) + if (cscSize == minSparseSize) { + toSparseMatrix(columnMajor = true) + } else { + toSparseMatrix(columnMajor = false) + } } } @@ -356,20 +370,13 @@ class DenseMatrix @Since("2.0.0") ( override def numActives: Int = values.length - /** - * Generate a `SparseMatrix` from the given `DenseMatrix` in column major order. - */ - @Since("2.0.0") - def toSparse: SparseMatrix = toSparse(columnMajor = true) - /** * Generate a `SparseMatrix` from the given `DenseMatrix`. * * @param columnMajor Whether the resulting `SparseMatrix` values will be in column major order. */ - @Since("2.1.0") - override def toSparse(columnMajor: Boolean): SparseMatrix = { - if (!columnMajor) this.transpose.toSparse(columnMajor = true).transpose + private[ml] override def toSparseMatrix(columnMajor: Boolean): SparseMatrix = { + if (!columnMajor) this.transpose.toSparseMatrix(columnMajor = true).transpose else { val spVals: MArrayBuilder[Double] = new MArrayBuilder.ofDouble val colPtrs: Array[Int] = new Array[Int](numCols + 1) @@ -394,19 +401,12 @@ class DenseMatrix @Since("2.0.0") ( } } - /** - * Generate a `DenseMatrix` from this `DenseMatrix` in column major order. - */ - @Since("2.1.0") - def toDense: DenseMatrix = toDense(columnMajor = true) - /** * Generate a `DenseMatrix` from this `DenseMatrix`. * * @param columnMajor Whether the resulting `DenseMatrix` values will be in column major order. */ - @Since("2.1.0") - override def toDense(columnMajor: Boolean): DenseMatrix = { + private[ml] override def toDenseMatrix(columnMajor: Boolean): DenseMatrix = { if (!(isTransposed ^ columnMajor)) { val newValues = new Array[Double](numCols * numRows) var j = 0 @@ -702,13 +702,6 @@ class SparseMatrix @Since("2.0.0") ( override def numActives: Int = values.length - /** - * Generate a `SparseMatrix` from this `SparseMatrix` in column major, removing explicit zero - * values if they exist. - */ - @Since("2.1.0") - def toSparse: SparseMatrix = toSparse(columnMajor = true) - /** * Generate a `SparseMatrix` from this `SparseMatrix`, removing explicit zero values if they * exist. @@ -716,8 +709,7 @@ class SparseMatrix @Since("2.0.0") ( * @param columnMajor Whether or not the resulting `SparseMatrix` values are in column major * order. */ - @Since("2.1.0") - override def toSparse(columnMajor: Boolean): SparseMatrix = { + private[ml] override def toSparseMatrix(columnMajor: Boolean): SparseMatrix = { if (!(columnMajor ^ isTransposed)) { // breeze transpose rearranges values in column major and removes explicit zeros if (!isTransposed) { @@ -760,19 +752,12 @@ class SparseMatrix @Since("2.0.0") ( } } - /** - * Generate a `DenseMatrix` from the given `SparseMatrix` in column major order. - */ - @Since("2.0.0") - def toDense: DenseMatrix = toDense(columnMajor = true) - /** * Generate a `DenseMatrix` from the given `SparseMatrix`. * * @param columnMajor Whether the resulting `DenseMatrix` values are in column major order. */ - @Since("2.1.0") - override def toDense(columnMajor: Boolean): DenseMatrix = { + private[ml] override def toDenseMatrix(columnMajor: Boolean): DenseMatrix = { if (columnMajor) new DenseMatrix(numRows, numCols, toArray) else new DenseMatrix(numRows, numCols, this.transpose.toArray, isTransposed = true) } diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala index d2e6d993c9cbc..480130e55f561 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala @@ -176,17 +176,17 @@ class MatricesSuite extends SparkMLFunSuite { assert(!dm3.isTransposed) assert(dm3.values.equals(dm1.values)) - val dm4 = dm1.toDense(false) + val dm4 = dm1.toDenseMatrix(false) assert(dm4 === dm1) assert(dm4.isTransposed) assert(dm4.values === Array(4.0, 2.0, -8.0, -1.0, 7.0, 4.0)) - val dm5 = dm2.toDense(true) + val dm5 = dm2.toDenseMatrix(true) assert(dm5 === dm2) assert(!dm5.isTransposed) assert(dm5.values === Array(5.0, 1.0, -9.0, -3.0, 4.0, -8.0)) - val dm6 = dm2.toDense(false) + val dm6 = dm2.toDenseMatrix(false) assert(dm6 === dm2) assert(dm6.isTransposed) assert(dm6.values.equals(dm2.values)) @@ -207,32 +207,32 @@ class MatricesSuite extends SparkMLFunSuite { val dm2 = new DenseMatrix(2, 3, Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0), isTransposed = true) val dm3 = new DenseMatrix(2, 3, Array(0.0, 0.0, 0.0, 0.0, 0.0, 0.0)) - val sm1 = dm1.toSparse(true) + val sm1 = dm1.toSparseMatrix(true) assert(sm1 === dm1) assert(!sm1.isTransposed) assert(sm1.values === Array(4.0, 2.0, 5.0)) - val sm2 = dm1.toSparse(false) + val sm2 = dm1.toSparseMatrix(false) assert(sm2 === dm1) assert(sm2.isTransposed) assert(sm2.values === Array(4.0, 5.0, 2.0)) - val sm3 = dm2.toSparse(true) + val sm3 = dm2.toSparseMatrix(true) assert(sm3 === dm2) assert(!sm3.isTransposed) assert(sm3.values === Array(4.0, 2.0, 5.0)) - val sm4 = dm2.toSparse(false) + val sm4 = dm2.toSparseMatrix(false) assert(sm4 === dm2) assert(sm4.isTransposed) assert(sm4.values === Array(4.0, 5.0, 2.0)) - val sm5 = dm3.toSparse(true) + val sm5 = dm3.toSparseMatrix(true) assert(sm5 === dm3) assert(sm5.values === Array.empty[Double]) assert(!sm5.isTransposed) - val sm6 = dm3.toSparse(false) + val sm6 = dm3.toSparseMatrix(false) assert(sm6 === dm3) assert(sm6.values === Array.empty[Double]) assert(sm6.isTransposed) @@ -255,22 +255,22 @@ class MatricesSuite extends SparkMLFunSuite { val smZeros = new SparseMatrix(2, 3, Array(0, 2, 4, 6), Array(0, 1, 0, 1, 0, 1), Array(0.0, 0.0, 0.0, 0.0, 0.0, 0.0)) - val sm5 = sm1.toSparse(false) + val sm5 = sm1.toSparseMatrix(false) assert(sm5 === sm1) assert(sm5.isTransposed) assert(sm5.values === Array(4.0, 5.0, 2.0)) - val sm6 = sm1.toSparse(true) + val sm6 = sm1.toSparseMatrix(true) assert(sm6 === sm1) assert(!sm6.isTransposed) assert(sm6.values.equals(sm1.values)) - val sm7 = sm2.toSparse(true) + val sm7 = sm2.toSparseMatrix(true) assert(sm7 === sm2) assert(!sm7.isTransposed) assert(sm7.values === Array(4.0, 2.0, 5.0)) - val sm8 = sm2.toSparse(false) + val sm8 = sm2.toSparseMatrix(false) assert(sm8 === sm2) assert(sm8.isTransposed) assert(sm8.values.equals(sm2.values)) @@ -280,12 +280,12 @@ class MatricesSuite extends SparkMLFunSuite { assert(sm9.values === Array(4.0, 2.0, 5.0)) assert(!sm9.isTransposed) - val sm10 = sm3.toSparse(false) + val sm10 = sm3.toSparseMatrix(false) assert(sm10 === sm3) assert(sm10.values === Array(4.0, 5.0, 2.0)) assert(sm10.isTransposed) - val sm11 = sm4.toSparse(false) + val sm11 = sm4.toSparseMatrix(false) assert(sm11 === sm4) assert(sm11.values === Array(4.0, 5.0, 2.0)) assert(sm11.isTransposed) @@ -319,7 +319,7 @@ class MatricesSuite extends SparkMLFunSuite { assert(!dm1.isTransposed) assert(dm1.values === Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0)) - val dm2 = sm1.toDense(false) + val dm2 = sm1.toDenseMatrix(false) assert(dm2 === sm1) assert(dm2.isTransposed) assert(dm2.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0)) @@ -329,7 +329,7 @@ class MatricesSuite extends SparkMLFunSuite { assert(!dm3.isTransposed) assert(dm3.values === Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0)) - val dm4 = sm2.toDense(false) + val dm4 = sm2.toDenseMatrix(false) assert(dm4 === sm2) assert(dm4.isTransposed) assert(dm4.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0)) From c8f85d29f0d6107c171273e50d4f0647afe11ff1 Mon Sep 17 00:00:00 2001 From: sethah Date: Wed, 14 Dec 2016 08:17:48 -0800 Subject: [PATCH 11/21] update since --- .../scala/org/apache/spark/ml/linalg/Matrices.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index 9efa5bc08d8a0..a52b89be1b1c2 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -173,7 +173,7 @@ sealed trait Matrix extends Serializable { /** * Converts this matrix to a sparse matrix in column major order. */ - @Since("2.1.0") + @Since("2.2.0") def toSparse: SparseMatrix = toSparseMatrix(columnMajor = true) /** @@ -182,12 +182,12 @@ sealed trait Matrix extends Serializable { * @param columnMajor Whether the values of the resulting dense matrix should be in column major * or row major order. If `false`, resulting matrix will be row major. */ - private [ml] def toDenseMatrix(columnMajor: Boolean): DenseMatrix + private[ml] def toDenseMatrix(columnMajor: Boolean): DenseMatrix /** * Converts this matrix to a dense matrix in column major order. */ - @Since("2.1.0") + @Since("2.2.0") def toDense: DenseMatrix = toDenseMatrix(columnMajor = true) /** @@ -196,7 +196,7 @@ sealed trait Matrix extends Serializable { * @param columnMajor Whether the values of the resulting matrix should be in column major * or row major order. If `false`, resulting matrix will be row major. */ - @Since("2.1.0") + @Since("2.2.0") def compressed(columnMajor: Boolean): Matrix = { if (getDenseSizeInBytes < getSparseSizeInBytes(columnMajor)) { toDenseMatrix(columnMajor) @@ -210,7 +210,7 @@ sealed trait Matrix extends Serializable { * major format, whichever uses less storage. When dense representation is optimal, it maintains * the current layout order. */ - @Since("2.1.0") + @Since("2.2.0") def compressed: Matrix = { val cscSize = getSparseSizeInBytes(columnMajor = true) val csrSize = getSparseSizeInBytes(columnMajor = false) From 44d76ccd499e3e56d7be2320fc4e32912a45b486 Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 9 Mar 2017 12:54:53 -0800 Subject: [PATCH 12/21] address review --- .../org/apache/spark/ml/linalg/Matrices.scala | 84 +++++++++++-------- .../spark/ml/linalg/MatricesSuite.scala | 35 ++++++++ 2 files changed, 83 insertions(+), 36 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index a52b89be1b1c2..2d875902eef3d 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -148,7 +148,8 @@ sealed trait Matrix extends Serializable { * and column indices respectively with the type `Int`, and the final parameter is the * corresponding value in the matrix with type `Double`. */ - private[spark] def foreachActive(f: (Int, Int, Double) => Unit) + @Since("2.2.0") + def foreachActive(f: (Int, Int, Double) => Unit) /** * Find the number of non-zero active values. @@ -165,43 +166,55 @@ sealed trait Matrix extends Serializable { /** * Converts this matrix to a sparse matrix. * - * @param columnMajor Whether the values of the resulting sparse matrix should be in column major + * @param colMajor Whether the values of the resulting sparse matrix should be in column major * or row major order. If `false`, resulting matrix will be row major. */ - private[ml] def toSparseMatrix(columnMajor: Boolean): SparseMatrix + private[ml] def toSparseMatrix(colMajor: Boolean): SparseMatrix + + /** + * Converts this matrix to a sparse matrix in column major order. + */ + @Since("2.2.0") + def toCSCMatrix: SparseMatrix = toSparseMatrix(colMajor = true) + + /** + * Converts this matrix to a sparse matrix in row major order. + */ + @Since("2.2.0") + def toCSRMatrix: SparseMatrix = toSparseMatrix(colMajor = false) /** * Converts this matrix to a sparse matrix in column major order. */ @Since("2.2.0") - def toSparse: SparseMatrix = toSparseMatrix(columnMajor = true) + def toSparse: SparseMatrix = toSparseMatrix(colMajor = true) /** * Converts this matrix to a dense matrix. * - * @param columnMajor Whether the values of the resulting dense matrix should be in column major + * @param colMajor Whether the values of the resulting dense matrix should be in column major * or row major order. If `false`, resulting matrix will be row major. */ - private[ml] def toDenseMatrix(columnMajor: Boolean): DenseMatrix + private[ml] def toDenseMatrix(colMajor: Boolean): DenseMatrix /** * Converts this matrix to a dense matrix in column major order. */ @Since("2.2.0") - def toDense: DenseMatrix = toDenseMatrix(columnMajor = true) + def toDense: DenseMatrix = toDenseMatrix(colMajor = true) /** * Returns a matrix in either dense or sparse format, whichever uses less storage. * - * @param columnMajor Whether the values of the resulting matrix should be in column major + * @param colMajor Whether the values of the resulting matrix should be in column major * or row major order. If `false`, resulting matrix will be row major. */ @Since("2.2.0") - def compressed(columnMajor: Boolean): Matrix = { - if (getDenseSizeInBytes < getSparseSizeInBytes(columnMajor)) { - toDenseMatrix(columnMajor) + def compressed(colMajor: Boolean): Matrix = { + if (getDenseSizeInBytes < getSparseSizeInBytes(colMajor)) { + toDenseMatrix(colMajor) } else { - toSparseMatrix(columnMajor) + toSparseMatrix(colMajor) } } @@ -212,17 +225,16 @@ sealed trait Matrix extends Serializable { */ @Since("2.2.0") def compressed: Matrix = { - val cscSize = getSparseSizeInBytes(columnMajor = true) - val csrSize = getSparseSizeInBytes(columnMajor = false) - val minSparseSize = cscSize.min(csrSize) - if (getDenseSizeInBytes < minSparseSize) { - // size is the same either way, so maintain current layout + val cscSize = getSparseSizeInBytes(colMajor = true) + val csrSize = getSparseSizeInBytes(colMajor = false) + if (getDenseSizeInBytes < math.min(cscSize, csrSize)) { + // dense matrix size is the same for column major and row major, so maintain current layout toDenseMatrix(!isTransposed) } else { - if (cscSize == minSparseSize) { - toSparseMatrix(columnMajor = true) + if (cscSize <= csrSize) { + toSparseMatrix(colMajor = true) } else { - toSparseMatrix(columnMajor = false) + toSparseMatrix(colMajor = false) } } } @@ -233,9 +245,9 @@ sealed trait Matrix extends Serializable { } /** Gets the size of the minimal sparse representation of this `Matrix`. */ - private[ml] def getSparseSizeInBytes(columnMajor: Boolean): Long = { + private[ml] def getSparseSizeInBytes(colMajor: Boolean): Long = { val nnz = numNonzeros - val numPtrs = if (columnMajor) numCols + 1L else numRows + 1L + val numPtrs = if (colMajor) numCols + 1L else numRows + 1L Matrices.getSparseSize(nnz, numPtrs) } @@ -338,7 +350,7 @@ class DenseMatrix @Since("2.0.0") ( override def transpose: DenseMatrix = new DenseMatrix(numCols, numRows, values, !isTransposed) - private[spark] override def foreachActive(f: (Int, Int, Double) => Unit): Unit = { + override def foreachActive(f: (Int, Int, Double) => Unit): Unit = { if (!isTransposed) { // outer loop over columns var j = 0 @@ -373,10 +385,10 @@ class DenseMatrix @Since("2.0.0") ( /** * Generate a `SparseMatrix` from the given `DenseMatrix`. * - * @param columnMajor Whether the resulting `SparseMatrix` values will be in column major order. + * @param colMajor Whether the resulting `SparseMatrix` values will be in column major order. */ - private[ml] override def toSparseMatrix(columnMajor: Boolean): SparseMatrix = { - if (!columnMajor) this.transpose.toSparseMatrix(columnMajor = true).transpose + private[ml] override def toSparseMatrix(colMajor: Boolean): SparseMatrix = { + if (!colMajor) this.transpose.toSparseMatrix(colMajor = true).transpose else { val spVals: MArrayBuilder[Double] = new MArrayBuilder.ofDouble val colPtrs: Array[Int] = new Array[Int](numCols + 1) @@ -404,10 +416,10 @@ class DenseMatrix @Since("2.0.0") ( /** * Generate a `DenseMatrix` from this `DenseMatrix`. * - * @param columnMajor Whether the resulting `DenseMatrix` values will be in column major order. + * @param colMajor Whether the resulting `DenseMatrix` values will be in column major order. */ - private[ml] override def toDenseMatrix(columnMajor: Boolean): DenseMatrix = { - if (!(isTransposed ^ columnMajor)) { + private[ml] override def toDenseMatrix(colMajor: Boolean): DenseMatrix = { + if (!(isTransposed ^ colMajor)) { val newValues = new Array[Double](numCols * numRows) var j = 0 while (j < numCols * numRows) { @@ -671,7 +683,7 @@ class SparseMatrix @Since("2.0.0") ( override def transpose: SparseMatrix = new SparseMatrix(numCols, numRows, colPtrs, rowIndices, values, !isTransposed) - private[spark] override def foreachActive(f: (Int, Int, Double) => Unit): Unit = { + override def foreachActive(f: (Int, Int, Double) => Unit): Unit = { if (!isTransposed) { var j = 0 while (j < numCols) { @@ -706,11 +718,11 @@ class SparseMatrix @Since("2.0.0") ( * Generate a `SparseMatrix` from this `SparseMatrix`, removing explicit zero values if they * exist. * - * @param columnMajor Whether or not the resulting `SparseMatrix` values are in column major + * @param colMajor Whether or not the resulting `SparseMatrix` values are in column major * order. */ - private[ml] override def toSparseMatrix(columnMajor: Boolean): SparseMatrix = { - if (!(columnMajor ^ isTransposed)) { + private[ml] override def toSparseMatrix(colMajor: Boolean): SparseMatrix = { + if (!(colMajor ^ isTransposed)) { // breeze transpose rearranges values in column major and removes explicit zeros if (!isTransposed) { // it is row major and we want col major @@ -755,10 +767,10 @@ class SparseMatrix @Since("2.0.0") ( /** * Generate a `DenseMatrix` from the given `SparseMatrix`. * - * @param columnMajor Whether the resulting `DenseMatrix` values are in column major order. + * @param colMajor Whether the resulting `DenseMatrix` values are in column major order. */ - private[ml] override def toDenseMatrix(columnMajor: Boolean): DenseMatrix = { - if (columnMajor) new DenseMatrix(numRows, numCols, toArray) + private[ml] override def toDenseMatrix(colMajor: Boolean): DenseMatrix = { + if (colMajor) new DenseMatrix(numRows, numCols, toArray) else new DenseMatrix(numRows, numCols, this.transpose.toArray, isTransposed = true) } diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala index 480130e55f561..78bddf14e566d 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala @@ -236,6 +236,21 @@ class MatricesSuite extends SparkMLFunSuite { assert(sm6 === dm3) assert(sm6.values === Array.empty[Double]) assert(sm6.isTransposed) + + val sm7 = dm1.toSparse + assert(sm7 === dm1) + assert(sm7.values === Array(4.0, 2.0, 5.0)) + assert(!sm7.isTransposed) + + val sm8 = dm1.toCSCMatrix + assert(sm8 === dm1) + assert(sm8.values === Array(4.0, 2.0, 5.0)) + assert(!sm8.isTransposed) + + val sm9 = dm2.toCSRMatrix + assert(sm9 === dm2) + assert(sm9.values === Array(4.0, 5.0, 2.0)) + assert(sm9.isTransposed) } test("sparse to sparse") { @@ -299,6 +314,26 @@ class MatricesSuite extends SparkMLFunSuite { assert(sm13 === smZeros) assert(sm13.values === Array.empty[Double]) assert(!sm13.isTransposed) + + val sm14 = sm4.toCSCMatrix + assert(sm14 === sm4) + assert(sm14.values === Array(4.0, 2.0, 5.0)) + assert(!sm14.isTransposed) + + val sm15 = smZeros.toCSCMatrix + assert(sm15 === smZeros) + assert(sm15.values === Array.empty[Double]) + assert(!sm15.isTransposed) + + val sm16 = sm3.toCSRMatrix + assert(sm16 === sm4) + assert(sm16.values === Array(4.0, 5.0, 2.0)) + assert(sm16.isTransposed) + + val sm17 = smZeros.toCSRMatrix + assert(sm17 === smZeros) + assert(sm17.values === Array.empty[Double]) + assert(sm17.isTransposed) } test("sparse to dense") { From f5d63ea98c6dd8a640543b92831141141d4c73ee Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 13 Mar 2017 18:24:48 -0700 Subject: [PATCH 13/21] toDenseRowMajor, toDenseColMajor --- .../org/apache/spark/ml/linalg/Matrices.scala | 16 ++++++++-- .../spark/ml/linalg/MatricesSuite.scala | 32 +++++++++++++++---- 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index 2d875902eef3d..c3dbfa1070f93 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -175,13 +175,13 @@ sealed trait Matrix extends Serializable { * Converts this matrix to a sparse matrix in column major order. */ @Since("2.2.0") - def toCSCMatrix: SparseMatrix = toSparseMatrix(colMajor = true) + def toCSC: SparseMatrix = toSparseMatrix(colMajor = true) /** * Converts this matrix to a sparse matrix in row major order. */ @Since("2.2.0") - def toCSRMatrix: SparseMatrix = toSparseMatrix(colMajor = false) + def toCSR: SparseMatrix = toSparseMatrix(colMajor = false) /** * Converts this matrix to a sparse matrix in column major order. @@ -203,6 +203,18 @@ sealed trait Matrix extends Serializable { @Since("2.2.0") def toDense: DenseMatrix = toDenseMatrix(colMajor = true) + /** + * Converts this matrix to a dense matrix in row major order. + */ + @Since("2.2.0") + def toDenseRowMajor: DenseMatrix = toDenseMatrix(colMajor = false) + + /** + * Converts this matrix to a dense matrix in column major order. + */ + @Since("2.2.0") + def toDenseColMajor: DenseMatrix = toDenseMatrix(colMajor = true) + /** * Returns a matrix in either dense or sparse format, whichever uses less storage. * diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala index 78bddf14e566d..3e85b8081fe44 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala @@ -190,6 +190,16 @@ class MatricesSuite extends SparkMLFunSuite { assert(dm6 === dm2) assert(dm6.isTransposed) assert(dm6.values.equals(dm2.values)) + + val dm7 = dm1.toDenseRowMajor + assert(dm7 === dm1) + assert(dm7.isTransposed) + assert(dm7.values === Array(4.0, 2.0, -8.0, -1.0, 7.0, 4.0)) + + val dm8 = dm1.toDenseColMajor + assert(dm8 === dm1) + assert(!dm8.isTransposed) + assert(dm8.values.equals(dm1.values)) } test("dense to sparse") { @@ -242,12 +252,12 @@ class MatricesSuite extends SparkMLFunSuite { assert(sm7.values === Array(4.0, 2.0, 5.0)) assert(!sm7.isTransposed) - val sm8 = dm1.toCSCMatrix + val sm8 = dm1.toCSC assert(sm8 === dm1) assert(sm8.values === Array(4.0, 2.0, 5.0)) assert(!sm8.isTransposed) - val sm9 = dm2.toCSRMatrix + val sm9 = dm2.toCSR assert(sm9 === dm2) assert(sm9.values === Array(4.0, 5.0, 2.0)) assert(sm9.isTransposed) @@ -315,22 +325,22 @@ class MatricesSuite extends SparkMLFunSuite { assert(sm13.values === Array.empty[Double]) assert(!sm13.isTransposed) - val sm14 = sm4.toCSCMatrix + val sm14 = sm4.toCSC assert(sm14 === sm4) assert(sm14.values === Array(4.0, 2.0, 5.0)) assert(!sm14.isTransposed) - val sm15 = smZeros.toCSCMatrix + val sm15 = smZeros.toCSC assert(sm15 === smZeros) assert(sm15.values === Array.empty[Double]) assert(!sm15.isTransposed) - val sm16 = sm3.toCSRMatrix + val sm16 = sm3.toCSR assert(sm16 === sm4) assert(sm16.values === Array(4.0, 5.0, 2.0)) assert(sm16.isTransposed) - val sm17 = smZeros.toCSRMatrix + val sm17 = smZeros.toCSR assert(sm17 === smZeros) assert(sm17.values === Array.empty[Double]) assert(sm17.isTransposed) @@ -373,6 +383,16 @@ class MatricesSuite extends SparkMLFunSuite { assert(dm5 === sm3) assert(!dm5.isTransposed) assert(dm5.values === Array.fill(6)(0.0)) + + val dm6 = sm2.toDenseColMajor + assert(dm6 === sm2) + assert(!dm6.isTransposed) + assert(dm6.values === Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0)) + + val dm7 = sm2.toDenseRowMajor + assert(dm7 === sm2) + assert(dm7.isTransposed) + assert(dm7.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0)) } test("compressed dense") { From f35682846221eec759e090a2c3d7e62a1861e9c8 Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 13 Mar 2017 20:09:08 -0700 Subject: [PATCH 14/21] mima --- project/MimaExcludes.scala | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 9925a8ba72662..d221f6939bdda 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -68,7 +68,22 @@ object MimaExcludes { // [SPARK-19876] Add one time trigger, and improve Trigger APIs ProblemFilters.exclude[IncompatibleTemplateDefProblem]("org.apache.spark.sql.streaming.Trigger"), - ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.sql.streaming.ProcessingTime") + ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.sql.streaming.ProcessingTime"), + + // [SPARK-17471][ML] Add compressed method to ML matrices + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.compressed"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.compressed"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.getSparseSizeInBytes"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toDense"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toSparse"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toDenseRowMajor"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toCSR"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toCSC"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.getDenseSizeInBytes"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toDenseColMajor"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toDenseMatrix"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toSparseMatrix"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.getSizeInBytes") ) // Exclude rules for 2.1.x From 5dbdc64d28031fd70f0c63b278e601bb842d4b6c Mon Sep 17 00:00:00 2001 From: sethah Date: Tue, 21 Mar 2017 17:44:55 -0700 Subject: [PATCH 15/21] add compressedColRowMajor --- .../org/apache/spark/ml/linalg/Matrices.scala | 21 ++++++++++--------- .../spark/ml/linalg/MatricesSuite.scala | 14 +++++++++---- project/MimaExcludes.scala | 3 ++- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index c3dbfa1070f93..1805786887e49 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -216,18 +216,19 @@ sealed trait Matrix extends Serializable { def toDenseColMajor: DenseMatrix = toDenseMatrix(colMajor = true) /** - * Returns a matrix in either dense or sparse format, whichever uses less storage. - * - * @param colMajor Whether the values of the resulting matrix should be in column major - * or row major order. If `false`, resulting matrix will be row major. + * Returns a matrix in dense or sparse column major format, whichever uses less storage. */ @Since("2.2.0") - def compressed(colMajor: Boolean): Matrix = { - if (getDenseSizeInBytes < getSparseSizeInBytes(colMajor)) { - toDenseMatrix(colMajor) - } else { - toSparseMatrix(colMajor) - } + def compressedColMajor: Matrix = { + if (getDenseSizeInBytes < getSparseSizeInBytes(colMajor = true)) toDenseColMajor else toCSC + } + + /** + * Returns a matrix in dense or sparse row major format, whichever uses less storage. + */ + @Since("2.2.0") + def compressedRowMajor: Matrix = { + if (getDenseSizeInBytes < getSparseSizeInBytes(colMajor = false)) toDenseRowMajor else toCSR } /** diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala index 3e85b8081fe44..e3cf6580ea039 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala @@ -415,7 +415,7 @@ class MatricesSuite extends SparkMLFunSuite { assert(cm1.getSizeInBytes <= dm1.getSizeInBytes) // force compressed column major - val cm2 = dm1.compressed(true).asInstanceOf[SparseMatrix] + val cm2 = dm1.compressedColMajor.asInstanceOf[SparseMatrix] assert(cm2 === dm1) assert(!cm2.isTransposed) assert(cm2.getSizeInBytes <= dm1.getSizeInBytes) @@ -447,7 +447,7 @@ class MatricesSuite extends SparkMLFunSuite { assert(cm4.getSizeInBytes <= dm3.getSizeInBytes) // force compressed row major - val cm5 = dm3.compressed(false).asInstanceOf[DenseMatrix] + val cm5 = dm3.compressedRowMajor.asInstanceOf[DenseMatrix] assert(cm5 === dm3) assert(cm5.isTransposed) assert(cm5.getSizeInBytes <= dm3.getSizeInBytes) @@ -480,7 +480,7 @@ class MatricesSuite extends SparkMLFunSuite { assert(cm1.values.equals(sm1.values)) assert(cm1.getSizeInBytes <= sm1.getSizeInBytes) - val cm2 = sm1.compressed(false).asInstanceOf[SparseMatrix] + val cm2 = sm1.compressedRowMajor.asInstanceOf[SparseMatrix] assert(cm2 === sm1) assert(cm2.isTransposed) // forced to be row major, so we have increased the size @@ -493,6 +493,12 @@ class MatricesSuite extends SparkMLFunSuite { assert(cm3.values.equals(sm2.values)) assert(cm3.getSizeInBytes <= sm2.getSizeInBytes) + val cm8 = sm2.compressedColMajor.asInstanceOf[SparseMatrix] + assert(cm8 === sm2) + assert(!cm8.isTransposed) + assert(cm8.getSizeInBytes > sm2.getSizeInBytes) + assert(cm8.getSizeInBytes <= sm2.toDense.getSizeInBytes) + /* sm3 = 0.0 -1.0 2.0 3.0 @@ -508,7 +514,7 @@ class MatricesSuite extends SparkMLFunSuite { assert(!cm4.isTransposed) assert(cm4.getSizeInBytes <= sm3.getSizeInBytes) - val cm5 = sm3.compressed(false).asInstanceOf[DenseMatrix] + val cm5 = sm3.compressedRowMajor.asInstanceOf[DenseMatrix] assert(cm5 === sm3) assert(cm5.isTransposed) assert(cm5.getSizeInBytes <= sm3.getSizeInBytes) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index d221f6939bdda..2131bbb1101da 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -72,7 +72,8 @@ object MimaExcludes { // [SPARK-17471][ML] Add compressed method to ML matrices ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.compressed"), - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.compressed"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.compressedColMajor"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.compressedRowMajor"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.getSparseSizeInBytes"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toDense"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toSparse"), From 8bfbd4eea28f2a6767c1e8885429fb9268fb1424 Mon Sep 17 00:00:00 2001 From: sethah Date: Wed, 22 Mar 2017 22:53:29 -0700 Subject: [PATCH 16/21] toSparseColMajor --- .../org/apache/spark/ml/linalg/Matrices.scala | 57 +++++++++---------- .../spark/ml/linalg/MatricesSuite.scala | 46 +++++++-------- project/MimaExcludes.scala | 4 +- 3 files changed, 51 insertions(+), 56 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index 1805786887e49..b807d8ad8d1c3 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -149,7 +149,7 @@ sealed trait Matrix extends Serializable { * corresponding value in the matrix with type `Double`. */ @Since("2.2.0") - def foreachActive(f: (Int, Int, Double) => Unit) + def foreachActive(f: (Int, Int, Double) => Unit): Unit /** * Find the number of non-zero active values. @@ -175,13 +175,13 @@ sealed trait Matrix extends Serializable { * Converts this matrix to a sparse matrix in column major order. */ @Since("2.2.0") - def toCSC: SparseMatrix = toSparseMatrix(colMajor = true) + def toSparseColMajor: SparseMatrix = toSparseMatrix(colMajor = true) /** * Converts this matrix to a sparse matrix in row major order. */ @Since("2.2.0") - def toCSR: SparseMatrix = toSparseMatrix(colMajor = false) + def toSparseRowMajor: SparseMatrix = toSparseMatrix(colMajor = false) /** * Converts this matrix to a sparse matrix in column major order. @@ -220,7 +220,11 @@ sealed trait Matrix extends Serializable { */ @Since("2.2.0") def compressedColMajor: Matrix = { - if (getDenseSizeInBytes < getSparseSizeInBytes(colMajor = true)) toDenseColMajor else toCSC + if (getDenseSizeInBytes < getSparseSizeInBytes(colMajor = true)) { + toDenseColMajor + } else { + toSparseColMajor + } } /** @@ -228,7 +232,11 @@ sealed trait Matrix extends Serializable { */ @Since("2.2.0") def compressedRowMajor: Matrix = { - if (getDenseSizeInBytes < getSparseSizeInBytes(colMajor = false)) toDenseRowMajor else toCSR + if (getDenseSizeInBytes < getSparseSizeInBytes(colMajor = false)) { + toDenseRowMajor + } else { + toSparseRowMajor + } } /** @@ -432,21 +440,10 @@ class DenseMatrix @Since("2.0.0") ( * @param colMajor Whether the resulting `DenseMatrix` values will be in column major order. */ private[ml] override def toDenseMatrix(colMajor: Boolean): DenseMatrix = { - if (!(isTransposed ^ colMajor)) { - val newValues = new Array[Double](numCols * numRows) - var j = 0 - while (j < numCols * numRows) { - val newIndex = if (isTransposed) { - // it is row major and we want column major - j / numCols + (j % numCols) * numRows - } else { - // it is column major and we want row major - j / numRows + (j % numRows) * numCols - } - newValues(newIndex) = values(j) - j += 1 - } - new DenseMatrix(numRows, numCols, newValues, isTransposed = !isTransposed) + if (isTransposed && colMajor) { + new DenseMatrix(numRows, numCols, toArray, isTransposed = false) + } else if (!isTransposed && !colMajor) { + new DenseMatrix(numRows, numCols, transpose.toArray, isTransposed = true) } else { this } @@ -735,20 +732,18 @@ class SparseMatrix @Since("2.0.0") ( * order. */ private[ml] override def toSparseMatrix(colMajor: Boolean): SparseMatrix = { - if (!(colMajor ^ isTransposed)) { - // breeze transpose rearranges values in column major and removes explicit zeros - if (!isTransposed) { - // it is row major and we want col major - val breezeTransposed = asBreeze.asInstanceOf[BSM[Double]].t - Matrices.fromBreeze(breezeTransposed).transpose.asInstanceOf[SparseMatrix] - } else { - // it is col major and we want row major - val breezeTransposed = asBreeze.asInstanceOf[BSM[Double]] - Matrices.fromBreeze(breezeTransposed).asInstanceOf[SparseMatrix] - } + if (!isTransposed && !colMajor) { + // it is row major and we want col major, use breeze to remove explicit zeros + val breezeTransposed = asBreeze.asInstanceOf[BSM[Double]].t + Matrices.fromBreeze(breezeTransposed).transpose.asInstanceOf[SparseMatrix] + } else if (isTransposed && colMajor) { + // it is col major and we want row major, use breeze to remove explicit zeros + val breezeTransposed = asBreeze.asInstanceOf[BSM[Double]] + Matrices.fromBreeze(breezeTransposed).asInstanceOf[SparseMatrix] } else { val nnz = numNonzeros if (nnz != numActives) { + // remove explicit zeros val rr = new Array[Int](nnz) val vv = new Array[Double](nnz) val numPtrs = if (isTransposed) numRows else numCols diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala index e3cf6580ea039..b463c64a79576 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala @@ -176,17 +176,17 @@ class MatricesSuite extends SparkMLFunSuite { assert(!dm3.isTransposed) assert(dm3.values.equals(dm1.values)) - val dm4 = dm1.toDenseMatrix(false) + val dm4 = dm1.toDenseRowMajor assert(dm4 === dm1) assert(dm4.isTransposed) assert(dm4.values === Array(4.0, 2.0, -8.0, -1.0, 7.0, 4.0)) - val dm5 = dm2.toDenseMatrix(true) + val dm5 = dm2.toDenseColMajor assert(dm5 === dm2) assert(!dm5.isTransposed) assert(dm5.values === Array(5.0, 1.0, -9.0, -3.0, 4.0, -8.0)) - val dm6 = dm2.toDenseMatrix(false) + val dm6 = dm2.toDenseRowMajor assert(dm6 === dm2) assert(dm6.isTransposed) assert(dm6.values.equals(dm2.values)) @@ -217,32 +217,32 @@ class MatricesSuite extends SparkMLFunSuite { val dm2 = new DenseMatrix(2, 3, Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0), isTransposed = true) val dm3 = new DenseMatrix(2, 3, Array(0.0, 0.0, 0.0, 0.0, 0.0, 0.0)) - val sm1 = dm1.toSparseMatrix(true) + val sm1 = dm1.toSparseColMajor assert(sm1 === dm1) assert(!sm1.isTransposed) assert(sm1.values === Array(4.0, 2.0, 5.0)) - val sm2 = dm1.toSparseMatrix(false) + val sm2 = dm1.toSparseRowMajor assert(sm2 === dm1) assert(sm2.isTransposed) assert(sm2.values === Array(4.0, 5.0, 2.0)) - val sm3 = dm2.toSparseMatrix(true) + val sm3 = dm2.toSparseColMajor assert(sm3 === dm2) assert(!sm3.isTransposed) assert(sm3.values === Array(4.0, 2.0, 5.0)) - val sm4 = dm2.toSparseMatrix(false) + val sm4 = dm2.toSparseRowMajor assert(sm4 === dm2) assert(sm4.isTransposed) assert(sm4.values === Array(4.0, 5.0, 2.0)) - val sm5 = dm3.toSparseMatrix(true) + val sm5 = dm3.toSparseColMajor assert(sm5 === dm3) assert(sm5.values === Array.empty[Double]) assert(!sm5.isTransposed) - val sm6 = dm3.toSparseMatrix(false) + val sm6 = dm3.toSparseRowMajor assert(sm6 === dm3) assert(sm6.values === Array.empty[Double]) assert(sm6.isTransposed) @@ -252,12 +252,12 @@ class MatricesSuite extends SparkMLFunSuite { assert(sm7.values === Array(4.0, 2.0, 5.0)) assert(!sm7.isTransposed) - val sm8 = dm1.toCSC + val sm8 = dm1.toSparseColMajor assert(sm8 === dm1) assert(sm8.values === Array(4.0, 2.0, 5.0)) assert(!sm8.isTransposed) - val sm9 = dm2.toCSR + val sm9 = dm2.toSparseRowMajor assert(sm9 === dm2) assert(sm9.values === Array(4.0, 5.0, 2.0)) assert(sm9.isTransposed) @@ -280,22 +280,22 @@ class MatricesSuite extends SparkMLFunSuite { val smZeros = new SparseMatrix(2, 3, Array(0, 2, 4, 6), Array(0, 1, 0, 1, 0, 1), Array(0.0, 0.0, 0.0, 0.0, 0.0, 0.0)) - val sm5 = sm1.toSparseMatrix(false) + val sm5 = sm1.toSparseRowMajor assert(sm5 === sm1) assert(sm5.isTransposed) assert(sm5.values === Array(4.0, 5.0, 2.0)) - val sm6 = sm1.toSparseMatrix(true) + val sm6 = sm1.toSparseColMajor assert(sm6 === sm1) assert(!sm6.isTransposed) assert(sm6.values.equals(sm1.values)) - val sm7 = sm2.toSparseMatrix(true) + val sm7 = sm2.toSparseColMajor assert(sm7 === sm2) assert(!sm7.isTransposed) assert(sm7.values === Array(4.0, 2.0, 5.0)) - val sm8 = sm2.toSparseMatrix(false) + val sm8 = sm2.toSparseRowMajor assert(sm8 === sm2) assert(sm8.isTransposed) assert(sm8.values.equals(sm2.values)) @@ -305,12 +305,12 @@ class MatricesSuite extends SparkMLFunSuite { assert(sm9.values === Array(4.0, 2.0, 5.0)) assert(!sm9.isTransposed) - val sm10 = sm3.toSparseMatrix(false) + val sm10 = sm3.toSparseRowMajor assert(sm10 === sm3) assert(sm10.values === Array(4.0, 5.0, 2.0)) assert(sm10.isTransposed) - val sm11 = sm4.toSparseMatrix(false) + val sm11 = sm4.toSparseRowMajor assert(sm11 === sm4) assert(sm11.values === Array(4.0, 5.0, 2.0)) assert(sm11.isTransposed) @@ -325,22 +325,22 @@ class MatricesSuite extends SparkMLFunSuite { assert(sm13.values === Array.empty[Double]) assert(!sm13.isTransposed) - val sm14 = sm4.toCSC + val sm14 = sm4.toSparseColMajor assert(sm14 === sm4) assert(sm14.values === Array(4.0, 2.0, 5.0)) assert(!sm14.isTransposed) - val sm15 = smZeros.toCSC + val sm15 = smZeros.toSparseColMajor assert(sm15 === smZeros) assert(sm15.values === Array.empty[Double]) assert(!sm15.isTransposed) - val sm16 = sm3.toCSR + val sm16 = sm3.toSparseRowMajor assert(sm16 === sm4) assert(sm16.values === Array(4.0, 5.0, 2.0)) assert(sm16.isTransposed) - val sm17 = smZeros.toCSR + val sm17 = smZeros.toSparseRowMajor assert(sm17 === smZeros) assert(sm17.values === Array.empty[Double]) assert(sm17.isTransposed) @@ -364,7 +364,7 @@ class MatricesSuite extends SparkMLFunSuite { assert(!dm1.isTransposed) assert(dm1.values === Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0)) - val dm2 = sm1.toDenseMatrix(false) + val dm2 = sm1.toDenseRowMajor assert(dm2 === sm1) assert(dm2.isTransposed) assert(dm2.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0)) @@ -374,7 +374,7 @@ class MatricesSuite extends SparkMLFunSuite { assert(!dm3.isTransposed) assert(dm3.values === Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0)) - val dm4 = sm2.toDenseMatrix(false) + val dm4 = sm2.toDenseRowMajor assert(dm4 === sm2) assert(dm4.isTransposed) assert(dm4.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0)) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 2131bbb1101da..b00abbba10fbc 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -78,8 +78,8 @@ object MimaExcludes { ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toDense"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toSparse"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toDenseRowMajor"), - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toCSR"), - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toCSC"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toSparseRowMajor"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toSparseColMajor"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.getDenseSizeInBytes"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toDenseColMajor"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toDenseMatrix"), From 4026e892c60a0c1866fd61fc5137c4b5d1bcb13b Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 23 Mar 2017 14:46:19 -0700 Subject: [PATCH 17/21] toSparse, toDense maintain current layout --- .../org/apache/spark/ml/linalg/Matrices.scala | 4 ++-- .../apache/spark/ml/linalg/MatricesSuite.scala | 18 ++++++++++++++---- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index b807d8ad8d1c3..ba4a46aeeab7c 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -187,7 +187,7 @@ sealed trait Matrix extends Serializable { * Converts this matrix to a sparse matrix in column major order. */ @Since("2.2.0") - def toSparse: SparseMatrix = toSparseMatrix(colMajor = true) + def toSparse: SparseMatrix = toSparseMatrix(colMajor = !isTransposed) /** * Converts this matrix to a dense matrix. @@ -201,7 +201,7 @@ sealed trait Matrix extends Serializable { * Converts this matrix to a dense matrix in column major order. */ @Since("2.2.0") - def toDense: DenseMatrix = toDenseMatrix(colMajor = true) + def toDense: DenseMatrix = toDenseMatrix(colMajor = !isTransposed) /** * Converts this matrix to a dense matrix in row major order. diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala index b463c64a79576..b9240220a7a9c 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala @@ -200,6 +200,11 @@ class MatricesSuite extends SparkMLFunSuite { assert(dm8 === dm1) assert(!dm8.isTransposed) assert(dm8.values.equals(dm1.values)) + + val dm9 = dm2.toDense + assert(dm9 === dm2) + assert(dm9.isTransposed) + assert(dm9.values.equals(dm2.values)) } test("dense to sparse") { @@ -261,6 +266,11 @@ class MatricesSuite extends SparkMLFunSuite { assert(sm9 === dm2) assert(sm9.values === Array(4.0, 5.0, 2.0)) assert(sm9.isTransposed) + + val sm10 = dm2.toSparse + assert(sm10 === dm2) + assert(sm10.values === Array(4.0, 5.0, 2.0)) + assert(sm10.isTransposed) } test("sparse to sparse") { @@ -317,8 +327,8 @@ class MatricesSuite extends SparkMLFunSuite { val sm12 = sm4.toSparse assert(sm12 === sm4) - assert(sm12.values === Array(4.0, 2.0, 5.0)) - assert(!sm12.isTransposed) + assert(sm12.values === Array(4.0, 5.0, 2.0)) + assert(sm12.isTransposed) val sm13 = smZeros.toSparse assert(sm13 === smZeros) @@ -371,8 +381,8 @@ class MatricesSuite extends SparkMLFunSuite { val dm3 = sm2.toDense assert(dm3 === sm2) - assert(!dm3.isTransposed) - assert(dm3.values === Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0)) + assert(dm3.isTransposed) + assert(dm3.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0)) val dm4 = sm2.toDenseRowMajor assert(dm4 === sm2) From 93ec250b2b0268f57e2c4d191f27da22a74d3f0c Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 23 Mar 2017 21:27:12 -0700 Subject: [PATCH 18/21] add isRowMajor, isColMajor --- .../org/apache/spark/ml/linalg/Matrices.scala | 36 ++++--- .../spark/ml/linalg/MatricesSuite.scala | 102 +++++++++--------- project/MimaExcludes.scala | 2 + 3 files changed, 73 insertions(+), 67 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index ba4a46aeeab7c..974235f7d5b65 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -44,6 +44,12 @@ sealed trait Matrix extends Serializable { @Since("2.0.0") val isTransposed: Boolean = false + /** Indicates whether the values backing this matrix are arranged in column major order. */ + private[ml] def isColMajor: Boolean = !isTransposed + + /** Indicates whether the values backing this matrix are arranged in row major order. */ + private[ml] def isRowMajor: Boolean = isTransposed + /** Converts to a dense array in column major. */ @Since("2.0.0") def toArray: Array[Double] = { @@ -184,7 +190,7 @@ sealed trait Matrix extends Serializable { def toSparseRowMajor: SparseMatrix = toSparseMatrix(colMajor = false) /** - * Converts this matrix to a sparse matrix in column major order. + * Converts this matrix to a sparse matrix while maintaining the layout of the current matrix. */ @Since("2.2.0") def toSparse: SparseMatrix = toSparseMatrix(colMajor = !isTransposed) @@ -198,7 +204,7 @@ sealed trait Matrix extends Serializable { private[ml] def toDenseMatrix(colMajor: Boolean): DenseMatrix /** - * Converts this matrix to a dense matrix in column major order. + * Converts this matrix to a dense matrix while maintaining the layout of the current matrix. */ @Since("2.2.0") def toDense: DenseMatrix = toDenseMatrix(colMajor = !isTransposed) @@ -250,13 +256,11 @@ sealed trait Matrix extends Serializable { val csrSize = getSparseSizeInBytes(colMajor = false) if (getDenseSizeInBytes < math.min(cscSize, csrSize)) { // dense matrix size is the same for column major and row major, so maintain current layout - toDenseMatrix(!isTransposed) + toDense + } else if (cscSize <= csrSize) { + toSparseColMajor } else { - if (cscSize <= csrSize) { - toSparseMatrix(colMajor = true) - } else { - toSparseMatrix(colMajor = false) - } + toSparseRowMajor } } @@ -409,7 +413,7 @@ class DenseMatrix @Since("2.0.0") ( * @param colMajor Whether the resulting `SparseMatrix` values will be in column major order. */ private[ml] override def toSparseMatrix(colMajor: Boolean): SparseMatrix = { - if (!colMajor) this.transpose.toSparseMatrix(colMajor = true).transpose + if (!colMajor) this.transpose.toSparseColMajor.transpose else { val spVals: MArrayBuilder[Double] = new MArrayBuilder.ofDouble val colPtrs: Array[Int] = new Array[Int](numCols + 1) @@ -440,9 +444,9 @@ class DenseMatrix @Since("2.0.0") ( * @param colMajor Whether the resulting `DenseMatrix` values will be in column major order. */ private[ml] override def toDenseMatrix(colMajor: Boolean): DenseMatrix = { - if (isTransposed && colMajor) { + if (isRowMajor && colMajor) { new DenseMatrix(numRows, numCols, toArray, isTransposed = false) - } else if (!isTransposed && !colMajor) { + } else if (isColMajor && !colMajor) { new DenseMatrix(numRows, numCols, transpose.toArray, isTransposed = true) } else { this @@ -732,12 +736,12 @@ class SparseMatrix @Since("2.0.0") ( * order. */ private[ml] override def toSparseMatrix(colMajor: Boolean): SparseMatrix = { - if (!isTransposed && !colMajor) { - // it is row major and we want col major, use breeze to remove explicit zeros + if (isColMajor && !colMajor) { + // it is col major and we want row major, use breeze to remove explicit zeros val breezeTransposed = asBreeze.asInstanceOf[BSM[Double]].t Matrices.fromBreeze(breezeTransposed).transpose.asInstanceOf[SparseMatrix] - } else if (isTransposed && colMajor) { - // it is col major and we want row major, use breeze to remove explicit zeros + } else if (isRowMajor && colMajor) { + // it is row major and we want col major, use breeze to remove explicit zeros val breezeTransposed = asBreeze.asInstanceOf[BSM[Double]] Matrices.fromBreeze(breezeTransposed).asInstanceOf[SparseMatrix] } else { @@ -746,7 +750,7 @@ class SparseMatrix @Since("2.0.0") ( // remove explicit zeros val rr = new Array[Int](nnz) val vv = new Array[Double](nnz) - val numPtrs = if (isTransposed) numRows else numCols + val numPtrs = if (isRowMajor) numRows else numCols val cc = new Array[Int](numPtrs + 1) var nzIdx = 0 var j = 0 diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala index b9240220a7a9c..f5749db37b6cf 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala @@ -173,37 +173,37 @@ class MatricesSuite extends SparkMLFunSuite { val dm3 = dm1.toDense assert(dm3 === dm1) - assert(!dm3.isTransposed) + assert(dm3.isColMajor) assert(dm3.values.equals(dm1.values)) val dm4 = dm1.toDenseRowMajor assert(dm4 === dm1) - assert(dm4.isTransposed) + assert(dm4.isRowMajor) assert(dm4.values === Array(4.0, 2.0, -8.0, -1.0, 7.0, 4.0)) val dm5 = dm2.toDenseColMajor assert(dm5 === dm2) - assert(!dm5.isTransposed) + assert(dm5.isColMajor) assert(dm5.values === Array(5.0, 1.0, -9.0, -3.0, 4.0, -8.0)) val dm6 = dm2.toDenseRowMajor assert(dm6 === dm2) - assert(dm6.isTransposed) + assert(dm6.isRowMajor) assert(dm6.values.equals(dm2.values)) val dm7 = dm1.toDenseRowMajor assert(dm7 === dm1) - assert(dm7.isTransposed) + assert(dm7.isRowMajor) assert(dm7.values === Array(4.0, 2.0, -8.0, -1.0, 7.0, 4.0)) val dm8 = dm1.toDenseColMajor assert(dm8 === dm1) - assert(!dm8.isTransposed) + assert(dm8.isColMajor) assert(dm8.values.equals(dm1.values)) val dm9 = dm2.toDense assert(dm9 === dm2) - assert(dm9.isTransposed) + assert(dm9.isRowMajor) assert(dm9.values.equals(dm2.values)) } @@ -224,53 +224,53 @@ class MatricesSuite extends SparkMLFunSuite { val sm1 = dm1.toSparseColMajor assert(sm1 === dm1) - assert(!sm1.isTransposed) + assert(sm1.isColMajor) assert(sm1.values === Array(4.0, 2.0, 5.0)) val sm2 = dm1.toSparseRowMajor assert(sm2 === dm1) - assert(sm2.isTransposed) + assert(sm2.isRowMajor) assert(sm2.values === Array(4.0, 5.0, 2.0)) val sm3 = dm2.toSparseColMajor assert(sm3 === dm2) - assert(!sm3.isTransposed) + assert(sm3.isColMajor) assert(sm3.values === Array(4.0, 2.0, 5.0)) val sm4 = dm2.toSparseRowMajor assert(sm4 === dm2) - assert(sm4.isTransposed) + assert(sm4.isRowMajor) assert(sm4.values === Array(4.0, 5.0, 2.0)) val sm5 = dm3.toSparseColMajor assert(sm5 === dm3) assert(sm5.values === Array.empty[Double]) - assert(!sm5.isTransposed) + assert(sm5.isColMajor) val sm6 = dm3.toSparseRowMajor assert(sm6 === dm3) assert(sm6.values === Array.empty[Double]) - assert(sm6.isTransposed) + assert(sm6.isRowMajor) val sm7 = dm1.toSparse assert(sm7 === dm1) assert(sm7.values === Array(4.0, 2.0, 5.0)) - assert(!sm7.isTransposed) + assert(sm7.isColMajor) val sm8 = dm1.toSparseColMajor assert(sm8 === dm1) assert(sm8.values === Array(4.0, 2.0, 5.0)) - assert(!sm8.isTransposed) + assert(sm8.isColMajor) val sm9 = dm2.toSparseRowMajor assert(sm9 === dm2) assert(sm9.values === Array(4.0, 5.0, 2.0)) - assert(sm9.isTransposed) + assert(sm9.isRowMajor) val sm10 = dm2.toSparse assert(sm10 === dm2) assert(sm10.values === Array(4.0, 5.0, 2.0)) - assert(sm10.isTransposed) + assert(sm10.isRowMajor) } test("sparse to sparse") { @@ -292,68 +292,68 @@ class MatricesSuite extends SparkMLFunSuite { val sm5 = sm1.toSparseRowMajor assert(sm5 === sm1) - assert(sm5.isTransposed) + assert(sm5.isRowMajor) assert(sm5.values === Array(4.0, 5.0, 2.0)) val sm6 = sm1.toSparseColMajor assert(sm6 === sm1) - assert(!sm6.isTransposed) + assert(sm6.isColMajor) assert(sm6.values.equals(sm1.values)) val sm7 = sm2.toSparseColMajor assert(sm7 === sm2) - assert(!sm7.isTransposed) + assert(sm7.isColMajor) assert(sm7.values === Array(4.0, 2.0, 5.0)) val sm8 = sm2.toSparseRowMajor assert(sm8 === sm2) - assert(sm8.isTransposed) + assert(sm8.isRowMajor) assert(sm8.values.equals(sm2.values)) val sm9 = sm3.toSparse assert(sm9 === sm3) assert(sm9.values === Array(4.0, 2.0, 5.0)) - assert(!sm9.isTransposed) + assert(sm9.isColMajor) val sm10 = sm3.toSparseRowMajor assert(sm10 === sm3) assert(sm10.values === Array(4.0, 5.0, 2.0)) - assert(sm10.isTransposed) + assert(sm10.isRowMajor) val sm11 = sm4.toSparseRowMajor assert(sm11 === sm4) assert(sm11.values === Array(4.0, 5.0, 2.0)) - assert(sm11.isTransposed) + assert(sm11.isRowMajor) val sm12 = sm4.toSparse assert(sm12 === sm4) assert(sm12.values === Array(4.0, 5.0, 2.0)) - assert(sm12.isTransposed) + assert(sm12.isRowMajor) val sm13 = smZeros.toSparse assert(sm13 === smZeros) assert(sm13.values === Array.empty[Double]) - assert(!sm13.isTransposed) + assert(sm13.isColMajor) val sm14 = sm4.toSparseColMajor assert(sm14 === sm4) assert(sm14.values === Array(4.0, 2.0, 5.0)) - assert(!sm14.isTransposed) + assert(sm14.isColMajor) val sm15 = smZeros.toSparseColMajor assert(sm15 === smZeros) assert(sm15.values === Array.empty[Double]) - assert(!sm15.isTransposed) + assert(sm15.isColMajor) val sm16 = sm3.toSparseRowMajor assert(sm16 === sm4) assert(sm16.values === Array(4.0, 5.0, 2.0)) - assert(sm16.isTransposed) + assert(sm16.isRowMajor) val sm17 = smZeros.toSparseRowMajor assert(sm17 === smZeros) assert(sm17.values === Array.empty[Double]) - assert(sm17.isTransposed) + assert(sm17.isRowMajor) } test("sparse to dense") { @@ -371,37 +371,37 @@ class MatricesSuite extends SparkMLFunSuite { val dm1 = sm1.toDense assert(dm1 === sm1) - assert(!dm1.isTransposed) + assert(dm1.isColMajor) assert(dm1.values === Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0)) val dm2 = sm1.toDenseRowMajor assert(dm2 === sm1) - assert(dm2.isTransposed) + assert(dm2.isRowMajor) assert(dm2.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0)) val dm3 = sm2.toDense assert(dm3 === sm2) - assert(dm3.isTransposed) + assert(dm3.isRowMajor) assert(dm3.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0)) val dm4 = sm2.toDenseRowMajor assert(dm4 === sm2) - assert(dm4.isTransposed) + assert(dm4.isRowMajor) assert(dm4.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0)) val dm5 = sm3.toDense assert(dm5 === sm3) - assert(!dm5.isTransposed) + assert(dm5.isColMajor) assert(dm5.values === Array.fill(6)(0.0)) val dm6 = sm2.toDenseColMajor assert(dm6 === sm2) - assert(!dm6.isTransposed) + assert(dm6.isColMajor) assert(dm6.values === Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0)) val dm7 = sm2.toDenseRowMajor assert(dm7 === sm2) - assert(dm7.isTransposed) + assert(dm7.isRowMajor) assert(dm7.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0)) } @@ -421,20 +421,20 @@ class MatricesSuite extends SparkMLFunSuite { // optimal compression layout is row major since numRows < numCols val cm1 = dm1.compressed.asInstanceOf[SparseMatrix] assert(cm1 === dm1) - assert(cm1.isTransposed) + assert(cm1.isRowMajor) assert(cm1.getSizeInBytes <= dm1.getSizeInBytes) // force compressed column major val cm2 = dm1.compressedColMajor.asInstanceOf[SparseMatrix] assert(cm2 === dm1) - assert(!cm2.isTransposed) + assert(cm2.isColMajor) assert(cm2.getSizeInBytes <= dm1.getSizeInBytes) // optimal compression layout for transpose is column major val dm2 = dm1.transpose val cm3 = dm2.compressed.asInstanceOf[SparseMatrix] assert(cm3 === dm2) - assert(!cm3.isTransposed) + assert(cm3.isColMajor) assert(cm3.getSizeInBytes <= dm2.getSizeInBytes) /* @@ -452,19 +452,19 @@ class MatricesSuite extends SparkMLFunSuite { val cm4 = dm3.compressed.asInstanceOf[DenseMatrix] assert(cm4 === dm3) - assert(!cm4.isTransposed) + assert(cm4.isColMajor) assert(cm4.values.equals(dm3.values)) assert(cm4.getSizeInBytes <= dm3.getSizeInBytes) // force compressed row major val cm5 = dm3.compressedRowMajor.asInstanceOf[DenseMatrix] assert(cm5 === dm3) - assert(cm5.isTransposed) + assert(cm5.isRowMajor) assert(cm5.getSizeInBytes <= dm3.getSizeInBytes) val cm6 = dm4.compressed.asInstanceOf[DenseMatrix] assert(cm6 === dm4) - assert(cm6.isTransposed) + assert(cm6.isRowMajor) assert(cm6.values.equals(dm4.values)) assert(cm6.getSizeInBytes <= dm4.getSizeInBytes) } @@ -486,26 +486,26 @@ class MatricesSuite extends SparkMLFunSuite { val cm1 = sm1.compressed.asInstanceOf[SparseMatrix] // optimal is column major assert(cm1 === sm1) - assert(!cm1.isTransposed) + assert(cm1.isColMajor) assert(cm1.values.equals(sm1.values)) assert(cm1.getSizeInBytes <= sm1.getSizeInBytes) val cm2 = sm1.compressedRowMajor.asInstanceOf[SparseMatrix] assert(cm2 === sm1) - assert(cm2.isTransposed) + assert(cm2.isRowMajor) // forced to be row major, so we have increased the size assert(cm2.getSizeInBytes > sm1.getSizeInBytes) assert(cm2.getSizeInBytes <= sm1.toDense.getSizeInBytes) val cm3 = sm2.compressed.asInstanceOf[SparseMatrix] assert(cm3 === sm2) - assert(cm3.isTransposed) + assert(cm3.isRowMajor) assert(cm3.values.equals(sm2.values)) assert(cm3.getSizeInBytes <= sm2.getSizeInBytes) val cm8 = sm2.compressedColMajor.asInstanceOf[SparseMatrix] assert(cm8 === sm2) - assert(!cm8.isTransposed) + assert(cm8.isColMajor) assert(cm8.getSizeInBytes > sm2.getSizeInBytes) assert(cm8.getSizeInBytes <= sm2.toDense.getSizeInBytes) @@ -521,12 +521,12 @@ class MatricesSuite extends SparkMLFunSuite { // dense is optimal, and maintains column major val cm4 = sm3.compressed.asInstanceOf[DenseMatrix] assert(cm4 === sm3) - assert(!cm4.isTransposed) + assert(cm4.isColMajor) assert(cm4.getSizeInBytes <= sm3.getSizeInBytes) val cm5 = sm3.compressedRowMajor.asInstanceOf[DenseMatrix] assert(cm5 === sm3) - assert(cm5.isTransposed) + assert(cm5.isRowMajor) assert(cm5.getSizeInBytes <= sm3.getSizeInBytes) /* @@ -540,14 +540,14 @@ class MatricesSuite extends SparkMLFunSuite { val sm4 = new SparseMatrix(Int.MaxValue, 1, Array(0, 1), Array(0), Array(1.0)) val cm6 = sm4.compressed.asInstanceOf[SparseMatrix] assert(cm6 === sm4) - assert(!cm6.isTransposed) + assert(cm6.isColMajor) assert(cm6.getSizeInBytes <= sm4.getSizeInBytes) val sm5 = new SparseMatrix(1, Int.MaxValue, Array(0, 1), Array(0), Array(1.0), isTransposed = true) val cm7 = sm5.compressed.asInstanceOf[SparseMatrix] assert(cm7 === sm5) - assert(cm7.isTransposed) + assert(cm7.isRowMajor) assert(cm7.getSizeInBytes <= sm5.getSizeInBytes) } diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index b00abbba10fbc..1aa7696ac1448 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -74,6 +74,8 @@ object MimaExcludes { ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.compressed"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.compressedColMajor"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.compressedRowMajor"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.isRowMajor"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.isColMajor"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.getSparseSizeInBytes"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toDense"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toSparse"), From 5411d4629da1dfa7cc4db83a2741c4f80cbd7bf3 Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 23 Mar 2017 23:40:38 -0700 Subject: [PATCH 19/21] organize test suites --- .../org/apache/spark/ml/linalg/Matrices.scala | 20 +- .../spark/ml/linalg/MatricesSuite.scala | 202 +++++++++--------- 2 files changed, 115 insertions(+), 107 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index 974235f7d5b65..8712c394b9f95 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -227,9 +227,9 @@ sealed trait Matrix extends Serializable { @Since("2.2.0") def compressedColMajor: Matrix = { if (getDenseSizeInBytes < getSparseSizeInBytes(colMajor = true)) { - toDenseColMajor + this.toDenseColMajor } else { - toSparseColMajor + this.toSparseColMajor } } @@ -239,9 +239,9 @@ sealed trait Matrix extends Serializable { @Since("2.2.0") def compressedRowMajor: Matrix = { if (getDenseSizeInBytes < getSparseSizeInBytes(colMajor = false)) { - toDenseRowMajor + this.toDenseRowMajor } else { - toSparseRowMajor + this.toSparseRowMajor } } @@ -256,11 +256,11 @@ sealed trait Matrix extends Serializable { val csrSize = getSparseSizeInBytes(colMajor = false) if (getDenseSizeInBytes < math.min(cscSize, csrSize)) { // dense matrix size is the same for column major and row major, so maintain current layout - toDense + this.toDense } else if (cscSize <= csrSize) { - toSparseColMajor + this.toSparseColMajor } else { - toSparseRowMajor + this.toSparseRowMajor } } @@ -445,9 +445,9 @@ class DenseMatrix @Since("2.0.0") ( */ private[ml] override def toDenseMatrix(colMajor: Boolean): DenseMatrix = { if (isRowMajor && colMajor) { - new DenseMatrix(numRows, numCols, toArray, isTransposed = false) + new DenseMatrix(numRows, numCols, this.toArray, isTransposed = false) } else if (isColMajor && !colMajor) { - new DenseMatrix(numRows, numCols, transpose.toArray, isTransposed = true) + new DenseMatrix(numRows, numCols, this.transpose.toArray, isTransposed = true) } else { this } @@ -782,7 +782,7 @@ class SparseMatrix @Since("2.0.0") ( * @param colMajor Whether the resulting `DenseMatrix` values are in column major order. */ private[ml] override def toDenseMatrix(colMajor: Boolean): DenseMatrix = { - if (colMajor) new DenseMatrix(numRows, numCols, toArray) + if (colMajor) new DenseMatrix(numRows, numCols, this.toArray) else new DenseMatrix(numRows, numCols, this.transpose.toArray, isTransposed = true) } diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala index f5749db37b6cf..b9927c5da6e96 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala @@ -171,35 +171,30 @@ class MatricesSuite extends SparkMLFunSuite { val dm1 = new DenseMatrix(2, 3, Array(4.0, -1.0, 2.0, 7.0, -8.0, 4.0)) val dm2 = new DenseMatrix(2, 3, Array(5.0, -9.0, 4.0, 1.0, -3.0, -8.0), isTransposed = true) - val dm3 = dm1.toDense - assert(dm3 === dm1) - assert(dm3.isColMajor) - assert(dm3.values.equals(dm1.values)) - - val dm4 = dm1.toDenseRowMajor - assert(dm4 === dm1) - assert(dm4.isRowMajor) - assert(dm4.values === Array(4.0, 2.0, -8.0, -1.0, 7.0, 4.0)) + val dm8 = dm1.toDenseColMajor + assert(dm8 === dm1) + assert(dm8.isColMajor) + assert(dm8.values.equals(dm1.values)) val dm5 = dm2.toDenseColMajor assert(dm5 === dm2) assert(dm5.isColMajor) assert(dm5.values === Array(5.0, 1.0, -9.0, -3.0, 4.0, -8.0)) + val dm4 = dm1.toDenseRowMajor + assert(dm4 === dm1) + assert(dm4.isRowMajor) + assert(dm4.values === Array(4.0, 2.0, -8.0, -1.0, 7.0, 4.0)) + val dm6 = dm2.toDenseRowMajor assert(dm6 === dm2) assert(dm6.isRowMajor) assert(dm6.values.equals(dm2.values)) - val dm7 = dm1.toDenseRowMajor - assert(dm7 === dm1) - assert(dm7.isRowMajor) - assert(dm7.values === Array(4.0, 2.0, -8.0, -1.0, 7.0, 4.0)) - - val dm8 = dm1.toDenseColMajor - assert(dm8 === dm1) - assert(dm8.isColMajor) - assert(dm8.values.equals(dm1.values)) + val dm3 = dm1.toDense + assert(dm3 === dm1) + assert(dm3.isColMajor) + assert(dm3.values.equals(dm1.values)) val dm9 = dm2.toDense assert(dm9 === dm2) @@ -227,26 +222,26 @@ class MatricesSuite extends SparkMLFunSuite { assert(sm1.isColMajor) assert(sm1.values === Array(4.0, 2.0, 5.0)) - val sm2 = dm1.toSparseRowMajor - assert(sm2 === dm1) - assert(sm2.isRowMajor) - assert(sm2.values === Array(4.0, 5.0, 2.0)) - val sm3 = dm2.toSparseColMajor assert(sm3 === dm2) assert(sm3.isColMajor) assert(sm3.values === Array(4.0, 2.0, 5.0)) - val sm4 = dm2.toSparseRowMajor - assert(sm4 === dm2) - assert(sm4.isRowMajor) - assert(sm4.values === Array(4.0, 5.0, 2.0)) - val sm5 = dm3.toSparseColMajor assert(sm5 === dm3) assert(sm5.values === Array.empty[Double]) assert(sm5.isColMajor) + val sm2 = dm1.toSparseRowMajor + assert(sm2 === dm1) + assert(sm2.isRowMajor) + assert(sm2.values === Array(4.0, 5.0, 2.0)) + + val sm4 = dm2.toSparseRowMajor + assert(sm4 === dm2) + assert(sm4.isRowMajor) + assert(sm4.values === Array(4.0, 5.0, 2.0)) + val sm6 = dm3.toSparseRowMajor assert(sm6 === dm3) assert(sm6.values === Array.empty[Double]) @@ -257,16 +252,6 @@ class MatricesSuite extends SparkMLFunSuite { assert(sm7.values === Array(4.0, 2.0, 5.0)) assert(sm7.isColMajor) - val sm8 = dm1.toSparseColMajor - assert(sm8 === dm1) - assert(sm8.values === Array(4.0, 2.0, 5.0)) - assert(sm8.isColMajor) - - val sm9 = dm2.toSparseRowMajor - assert(sm9 === dm2) - assert(sm9.values === Array(4.0, 5.0, 2.0)) - assert(sm9.isRowMajor) - val sm10 = dm2.toSparse assert(sm10 === dm2) assert(sm10.values === Array(4.0, 5.0, 2.0)) @@ -290,11 +275,6 @@ class MatricesSuite extends SparkMLFunSuite { val smZeros = new SparseMatrix(2, 3, Array(0, 2, 4, 6), Array(0, 1, 0, 1, 0, 1), Array(0.0, 0.0, 0.0, 0.0, 0.0, 0.0)) - val sm5 = sm1.toSparseRowMajor - assert(sm5 === sm1) - assert(sm5.isRowMajor) - assert(sm5.values === Array(4.0, 5.0, 2.0)) - val sm6 = sm1.toSparseColMajor assert(sm6 === sm1) assert(sm6.isColMajor) @@ -305,16 +285,31 @@ class MatricesSuite extends SparkMLFunSuite { assert(sm7.isColMajor) assert(sm7.values === Array(4.0, 2.0, 5.0)) + val sm16 = sm3.toSparseColMajor + assert(sm16 === sm3) + assert(sm16.isColMajor) + assert(sm16.values === Array(4.0, 2.0, 5.0)) + + val sm14 = sm4.toSparseColMajor + assert(sm14 === sm4) + assert(sm14.values === Array(4.0, 2.0, 5.0)) + assert(sm14.isColMajor) + + val sm15 = smZeros.toSparseColMajor + assert(sm15 === smZeros) + assert(sm15.values === Array.empty[Double]) + assert(sm15.isColMajor) + + val sm5 = sm1.toSparseRowMajor + assert(sm5 === sm1) + assert(sm5.isRowMajor) + assert(sm5.values === Array(4.0, 5.0, 2.0)) + val sm8 = sm2.toSparseRowMajor assert(sm8 === sm2) assert(sm8.isRowMajor) assert(sm8.values.equals(sm2.values)) - val sm9 = sm3.toSparse - assert(sm9 === sm3) - assert(sm9.values === Array(4.0, 2.0, 5.0)) - assert(sm9.isColMajor) - val sm10 = sm3.toSparseRowMajor assert(sm10 === sm3) assert(sm10.values === Array(4.0, 5.0, 2.0)) @@ -325,6 +320,16 @@ class MatricesSuite extends SparkMLFunSuite { assert(sm11.values === Array(4.0, 5.0, 2.0)) assert(sm11.isRowMajor) + val sm17 = smZeros.toSparseRowMajor + assert(sm17 === smZeros) + assert(sm17.values === Array.empty[Double]) + assert(sm17.isRowMajor) + + val sm9 = sm3.toSparse + assert(sm9 === sm3) + assert(sm9.values === Array(4.0, 2.0, 5.0)) + assert(sm9.isColMajor) + val sm12 = sm4.toSparse assert(sm12 === sm4) assert(sm12.values === Array(4.0, 5.0, 2.0)) @@ -334,26 +339,6 @@ class MatricesSuite extends SparkMLFunSuite { assert(sm13 === smZeros) assert(sm13.values === Array.empty[Double]) assert(sm13.isColMajor) - - val sm14 = sm4.toSparseColMajor - assert(sm14 === sm4) - assert(sm14.values === Array(4.0, 2.0, 5.0)) - assert(sm14.isColMajor) - - val sm15 = smZeros.toSparseColMajor - assert(sm15 === smZeros) - assert(sm15.values === Array.empty[Double]) - assert(sm15.isColMajor) - - val sm16 = sm3.toSparseRowMajor - assert(sm16 === sm4) - assert(sm16.values === Array(4.0, 5.0, 2.0)) - assert(sm16.isRowMajor) - - val sm17 = smZeros.toSparseRowMajor - assert(sm17 === smZeros) - assert(sm17.values === Array.empty[Double]) - assert(sm17.isRowMajor) } test("sparse to dense") { @@ -369,40 +354,40 @@ class MatricesSuite extends SparkMLFunSuite { isTransposed = true) val sm3 = new SparseMatrix(2, 3, Array(0, 0, 0, 0), Array.empty[Int], Array.empty[Double]) - val dm1 = sm1.toDense - assert(dm1 === sm1) - assert(dm1.isColMajor) - assert(dm1.values === Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0)) + val dm6 = sm1.toDenseColMajor + assert(dm6 === sm1) + assert(dm6.isColMajor) + assert(dm6.values === Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0)) + + val dm7 = sm2.toDenseColMajor + assert(dm7 === sm2) + assert(dm7.isColMajor) + assert(dm7.values === Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0)) val dm2 = sm1.toDenseRowMajor assert(dm2 === sm1) assert(dm2.isRowMajor) assert(dm2.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0)) - val dm3 = sm2.toDense - assert(dm3 === sm2) - assert(dm3.isRowMajor) - assert(dm3.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0)) - val dm4 = sm2.toDenseRowMajor assert(dm4 === sm2) assert(dm4.isRowMajor) assert(dm4.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0)) + val dm1 = sm1.toDense + assert(dm1 === sm1) + assert(dm1.isColMajor) + assert(dm1.values === Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0)) + + val dm3 = sm2.toDense + assert(dm3 === sm2) + assert(dm3.isRowMajor) + assert(dm3.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0)) + val dm5 = sm3.toDense assert(dm5 === sm3) assert(dm5.isColMajor) assert(dm5.values === Array.fill(6)(0.0)) - - val dm6 = sm2.toDenseColMajor - assert(dm6 === sm2) - assert(dm6.isColMajor) - assert(dm6.values === Array(0.0, 0.0, 4.0, 2.0, 5.0, 0.0)) - - val dm7 = sm2.toDenseRowMajor - assert(dm7 === sm2) - assert(dm7.isRowMajor) - assert(dm7.values === Array(0.0, 4.0, 5.0, 0.0, 2.0, 0.0)) } test("compressed dense") { @@ -422,20 +407,20 @@ class MatricesSuite extends SparkMLFunSuite { val cm1 = dm1.compressed.asInstanceOf[SparseMatrix] assert(cm1 === dm1) assert(cm1.isRowMajor) - assert(cm1.getSizeInBytes <= dm1.getSizeInBytes) + assert(cm1.getSizeInBytes < dm1.getSizeInBytes) // force compressed column major val cm2 = dm1.compressedColMajor.asInstanceOf[SparseMatrix] assert(cm2 === dm1) assert(cm2.isColMajor) - assert(cm2.getSizeInBytes <= dm1.getSizeInBytes) + assert(cm2.getSizeInBytes < dm1.getSizeInBytes) // optimal compression layout for transpose is column major val dm2 = dm1.transpose val cm3 = dm2.compressed.asInstanceOf[SparseMatrix] assert(cm3 === dm2) assert(cm3.isColMajor) - assert(cm3.getSizeInBytes <= dm2.getSizeInBytes) + assert(cm3.getSizeInBytes < dm2.getSizeInBytes) /* dm3 = 1.0 1.0 1.0 0.0 @@ -454,19 +439,24 @@ class MatricesSuite extends SparkMLFunSuite { assert(cm4 === dm3) assert(cm4.isColMajor) assert(cm4.values.equals(dm3.values)) - assert(cm4.getSizeInBytes <= dm3.getSizeInBytes) + assert(cm4.getSizeInBytes === dm3.getSizeInBytes) // force compressed row major val cm5 = dm3.compressedRowMajor.asInstanceOf[DenseMatrix] assert(cm5 === dm3) assert(cm5.isRowMajor) - assert(cm5.getSizeInBytes <= dm3.getSizeInBytes) + assert(cm5.getSizeInBytes === dm3.getSizeInBytes) val cm6 = dm4.compressed.asInstanceOf[DenseMatrix] assert(cm6 === dm4) assert(cm6.isRowMajor) assert(cm6.values.equals(dm4.values)) - assert(cm6.getSizeInBytes <= dm4.getSizeInBytes) + assert(cm6.getSizeInBytes === dm4.getSizeInBytes) + + val cm7 = dm4.compressedColMajor.asInstanceOf[DenseMatrix] + assert(cm7 === dm4) + assert(cm7.isColMajor) + assert(cm7.getSizeInBytes === dm4.getSizeInBytes) } test("compressed sparse") { @@ -488,7 +478,7 @@ class MatricesSuite extends SparkMLFunSuite { assert(cm1 === sm1) assert(cm1.isColMajor) assert(cm1.values.equals(sm1.values)) - assert(cm1.getSizeInBytes <= sm1.getSizeInBytes) + assert(cm1.getSizeInBytes === sm1.getSizeInBytes) val cm2 = sm1.compressedRowMajor.asInstanceOf[SparseMatrix] assert(cm2 === sm1) @@ -497,18 +487,31 @@ class MatricesSuite extends SparkMLFunSuite { assert(cm2.getSizeInBytes > sm1.getSizeInBytes) assert(cm2.getSizeInBytes <= sm1.toDense.getSizeInBytes) + val cm9 = sm1.compressedColMajor.asInstanceOf[SparseMatrix] + assert(cm9 === sm1) + assert(cm9.values.equals(sm1.values)) + assert(cm9.getSizeInBytes === sm1.getSizeInBytes) + val cm3 = sm2.compressed.asInstanceOf[SparseMatrix] assert(cm3 === sm2) assert(cm3.isRowMajor) assert(cm3.values.equals(sm2.values)) - assert(cm3.getSizeInBytes <= sm2.getSizeInBytes) + assert(cm3.getSizeInBytes === sm2.getSizeInBytes) val cm8 = sm2.compressedColMajor.asInstanceOf[SparseMatrix] assert(cm8 === sm2) assert(cm8.isColMajor) + // forced to be col major, so we have increased the size assert(cm8.getSizeInBytes > sm2.getSizeInBytes) assert(cm8.getSizeInBytes <= sm2.toDense.getSizeInBytes) + val cm10 = sm2.compressedRowMajor.asInstanceOf[SparseMatrix] + assert(cm10 === sm2) + assert(cm10.isRowMajor) + assert(cm10.values.equals(sm2.values)) + assert(cm10.getSizeInBytes === sm2.getSizeInBytes) + + /* sm3 = 0.0 -1.0 2.0 3.0 @@ -522,12 +525,17 @@ class MatricesSuite extends SparkMLFunSuite { val cm4 = sm3.compressed.asInstanceOf[DenseMatrix] assert(cm4 === sm3) assert(cm4.isColMajor) - assert(cm4.getSizeInBytes <= sm3.getSizeInBytes) + assert(cm4.getSizeInBytes < sm3.getSizeInBytes) val cm5 = sm3.compressedRowMajor.asInstanceOf[DenseMatrix] assert(cm5 === sm3) assert(cm5.isRowMajor) - assert(cm5.getSizeInBytes <= sm3.getSizeInBytes) + assert(cm5.getSizeInBytes < sm3.getSizeInBytes) + + val cm11 = sm3.compressedColMajor.asInstanceOf[DenseMatrix] + assert(cm11 === sm3) + assert(cm11.isColMajor) + assert(cm11.getSizeInBytes < sm3.getSizeInBytes) /* sm4 = 1.0 0.0 0.0 ... From 95ac0e0e6ad43646477ef8a359b6dd977764c455 Mon Sep 17 00:00:00 2001 From: sethah Date: Fri, 24 Mar 2017 10:14:47 -0700 Subject: [PATCH 20/21] break ties with dense --- .../org/apache/spark/ml/linalg/Matrices.scala | 14 +++++------ .../spark/ml/linalg/MatricesSuite.scala | 25 ++++++++++++++----- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index 8712c394b9f95..1c6ebb8fda737 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -226,7 +226,7 @@ sealed trait Matrix extends Serializable { */ @Since("2.2.0") def compressedColMajor: Matrix = { - if (getDenseSizeInBytes < getSparseSizeInBytes(colMajor = true)) { + if (getDenseSizeInBytes <= getSparseSizeInBytes(colMajor = true)) { this.toDenseColMajor } else { this.toSparseColMajor @@ -238,7 +238,7 @@ sealed trait Matrix extends Serializable { */ @Since("2.2.0") def compressedRowMajor: Matrix = { - if (getDenseSizeInBytes < getSparseSizeInBytes(colMajor = false)) { + if (getDenseSizeInBytes <= getSparseSizeInBytes(colMajor = false)) { this.toDenseRowMajor } else { this.toSparseRowMajor @@ -254,7 +254,7 @@ sealed trait Matrix extends Serializable { def compressed: Matrix = { val cscSize = getSparseSizeInBytes(colMajor = true) val csrSize = getSparseSizeInBytes(colMajor = false) - if (getDenseSizeInBytes < math.min(cscSize, csrSize)) { + if (getDenseSizeInBytes <= math.min(cscSize, csrSize)) { // dense matrix size is the same for column major and row major, so maintain current layout this.toDense } else if (cscSize <= csrSize) { @@ -1269,13 +1269,13 @@ object Matrices { } private[ml] def getSparseSize(numActives: Long, numPtrs: Long): Long = { - // 8 * values.length + 4 * rowIndices.length + 4 * colPtrs.length + 8 + 8 + 1 - 12L * numActives + 4L * numPtrs + 17L + // 8 * values.length + 4 * rowIndices.length + 4 * colPtrs.length + 12 + 12 + 12 + 1 + 12L * numActives + 4L * numPtrs + 37L } private[ml] def getDenseSize(numCols: Long, numRows: Long): Long = { - // 8 * values.length + 8 + 1 - 8L * numCols * numRows + 9L + // 8 * values.length + 12 + 1 + 8L * numCols * numRows + 13L } } diff --git a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala index b9927c5da6e96..9f8202086817d 100644 --- a/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala +++ b/mllib-local/src/test/scala/org/apache/spark/ml/linalg/MatricesSuite.scala @@ -457,20 +457,26 @@ class MatricesSuite extends SparkMLFunSuite { assert(cm7 === dm4) assert(cm7.isColMajor) assert(cm7.getSizeInBytes === dm4.getSizeInBytes) + + // this has the same size sparse or dense + val dm5 = new DenseMatrix(4, 4, Array.fill(7)(1.0) ++ Array.fill(9)(0.0)) + // should choose dense to break ties + val cm8 = dm5.compressed.asInstanceOf[DenseMatrix] + assert(cm8.getSizeInBytes === dm5.toSparseColMajor.getSizeInBytes) } test("compressed sparse") { /* sm1 = 0.0 -1.0 0.0 0.0 - -4.0 0.0 + 0.0 0.0 0.0 0.0 - sm2 = 0.0 0.0 -4.0 0.0 - -1.0 0.0 0.0 0.0 + sm2 = 0.0 0.0 0.0 0.0 + -1.0 0.0 0.0 0.0 */ // these should compress to sparse matrices - val sm1 = new SparseMatrix(4, 2, Array(0, 1, 2), Array(2, 0), Array(-4.0, -1.0)) + val sm1 = new SparseMatrix(4, 2, Array(0, 0, 1), Array(0), Array(-1.0)) val sm2 = sm1.transpose val cm1 = sm1.compressed.asInstanceOf[SparseMatrix] @@ -485,7 +491,7 @@ class MatricesSuite extends SparkMLFunSuite { assert(cm2.isRowMajor) // forced to be row major, so we have increased the size assert(cm2.getSizeInBytes > sm1.getSizeInBytes) - assert(cm2.getSizeInBytes <= sm1.toDense.getSizeInBytes) + assert(cm2.getSizeInBytes < sm1.toDense.getSizeInBytes) val cm9 = sm1.compressedColMajor.asInstanceOf[SparseMatrix] assert(cm9 === sm1) @@ -503,7 +509,7 @@ class MatricesSuite extends SparkMLFunSuite { assert(cm8.isColMajor) // forced to be col major, so we have increased the size assert(cm8.getSizeInBytes > sm2.getSizeInBytes) - assert(cm8.getSizeInBytes <= sm2.toDense.getSizeInBytes) + assert(cm8.getSizeInBytes < sm2.toDense.getSizeInBytes) val cm10 = sm2.compressedRowMajor.asInstanceOf[SparseMatrix] assert(cm10 === sm2) @@ -557,6 +563,13 @@ class MatricesSuite extends SparkMLFunSuite { assert(cm7 === sm5) assert(cm7.isRowMajor) assert(cm7.getSizeInBytes <= sm5.getSizeInBytes) + + // this has the same size sparse or dense + val sm6 = new SparseMatrix(4, 4, Array(0, 4, 7, 7, 7), Array(0, 1, 2, 3, 0, 1, 2), + Array.fill(7)(1.0)) + // should choose dense to break ties + val cm12 = sm6.compressed.asInstanceOf[DenseMatrix] + assert(cm12.getSizeInBytes === sm6.getSizeInBytes) } test("map, update") { From 87dfaa06b2861ccc4bd5ca62915dea867985b391 Mon Sep 17 00:00:00 2001 From: sethah Date: Fri, 24 Mar 2017 11:50:53 -0700 Subject: [PATCH 21/21] clarify get size functions --- .../org/apache/spark/ml/linalg/Matrices.scala | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index 1c6ebb8fda737..07f3bc27280bd 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -193,7 +193,7 @@ sealed trait Matrix extends Serializable { * Converts this matrix to a sparse matrix while maintaining the layout of the current matrix. */ @Since("2.2.0") - def toSparse: SparseMatrix = toSparseMatrix(colMajor = !isTransposed) + def toSparse: SparseMatrix = toSparseMatrix(colMajor = isColMajor) /** * Converts this matrix to a dense matrix. @@ -207,7 +207,7 @@ sealed trait Matrix extends Serializable { * Converts this matrix to a dense matrix while maintaining the layout of the current matrix. */ @Since("2.2.0") - def toDense: DenseMatrix = toDenseMatrix(colMajor = !isTransposed) + def toDense: DenseMatrix = toDenseMatrix(colMajor = isColMajor) /** * Converts this matrix to a dense matrix in row major order. @@ -1269,13 +1269,24 @@ object Matrices { } private[ml] def getSparseSize(numActives: Long, numPtrs: Long): Long = { - // 8 * values.length + 4 * rowIndices.length + 4 * colPtrs.length + 12 + 12 + 12 + 1 - 12L * numActives + 4L * numPtrs + 37L + /* + Sparse matrices store two int arrays, one double array, two ints, and one boolean: + 8 * values.length + 4 * rowIndices.length + 4 * colPtrs.length + arrayHeader * 3 + 2 * 4 + 1 + */ + val doubleBytes = java.lang.Double.BYTES + val intBytes = java.lang.Integer.BYTES + val arrayHeader = 12L + doubleBytes * numActives + intBytes * numActives + intBytes * numPtrs + arrayHeader * 3L + 9L } private[ml] def getDenseSize(numCols: Long, numRows: Long): Long = { - // 8 * values.length + 12 + 1 - 8L * numCols * numRows + 13L + /* + Dense matrices store one double array, two ints, and one boolean: + 8 * values.length + arrayHeader + 2 * 4 + 1 + */ + val doubleBytes = java.lang.Double.BYTES + val arrayHeader = 12L + doubleBytes * numCols * numRows + arrayHeader + 9L } }