From 412b77e4a9288396030ecbe795fd04c0f646640f Mon Sep 17 00:00:00 2001 From: Nakul Jindal Date: Thu, 29 Oct 2015 14:08:11 -0700 Subject: [PATCH 1/7] foreach added to mllib.linalg.Vector. --- .../apache/spark/mllib/linalg/Vectors.scala | 13 ++++++++++++ .../spark/mllib/linalg/VectorsSuite.scala | 20 +++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index dcdc614455d34..051d9f874bb37 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -125,6 +125,13 @@ sealed trait Vector extends Serializable { */ private[spark] def foreachActive(f: (Int, Double) => Unit) + /** + * Applies a function 'f' to all elements of the vector. + * + * @param f the function that is applied for its side-effect to every element + */ + private[spark] def foreach(f: (Double) => Unit) + /** * Number of active entries. An "active entry" is an element which is explicitly stored, * regardless of its value. Note that inactive entries have value 0. @@ -581,6 +588,9 @@ class DenseVector @Since("1.0.0") ( } } + @Since("1.6.0") + private[spark] override def foreach(f: (Double) => Unit) = { values.foreach(f) } + override def hashCode(): Int = { var result: Int = 31 + size var i = 0 @@ -712,6 +722,9 @@ class SparseVector @Since("1.0.0") ( } } + @Since("1.6.0") + private[spark] def foreach(f: (Double) => Unit) = { indices.foreach { i: Int => f(values(i)) } } + override def hashCode(): Int = { var result: Int = 31 + size val end = values.length diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index 6508ddeba4206..3561023eaea75 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -301,6 +301,26 @@ class VectorsSuite extends SparkFunSuite with Logging { assert(svMap.get(3) === Some(0.0)) } + test("foreach") { + val dv = Vectors.dense(0.0, 1.2, 3.1, 0.0) + val sv = Vectors.sparse(4, Seq((1, 1.2), (2, 3.1), (3, 0.0))) + + val dvArray = scala.collection.mutable.ArrayBuffer[Double]() + dv.foreach { (value) => dvArray += value } + assert(dvArray.size === 4) + assert(dvArray(0) === 0.0) + assert(dvArray(1) === 1.2) + assert(dvArray(2) === 3.1) + assert(dvArray(3) === 0.0) + + val svMap = scala.collection.mutable.ArrayBuffer[Double]() + sv.foreach { (value) => svMap += value } + assert(svMap.size === 3) + assert(svMap(1) === 1.2) + assert(svMap(2) === 3.1) + assert(svMap(3) === 0.0) + } + test("vector p-norm") { val dv = Vectors.dense(0.0, -1.2, 3.1, 0.0, -4.5, 1.9) val sv = Vectors.sparse(6, Seq((1, -1.2), (2, 3.1), (3, 0.0), (4, -4.5), (5, 1.9))) From 3aa75b7b3bfb0cbb4a12f63ba2c4e48e164dc21c Mon Sep 17 00:00:00 2001 From: Nakul Jindal Date: Thu, 29 Oct 2015 16:33:58 -0700 Subject: [PATCH 2/7] Updated implementation with better understanding of requirements --- .../org/apache/spark/mllib/linalg/Vectors.scala | 15 ++++++++++++++- .../spark/mllib/linalg/VectorsSuite.scala | 17 ++++++++++------- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 051d9f874bb37..43e5114e3cf6b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -688,6 +688,8 @@ class SparseVector @Since("1.0.0") ( require(indices.length <= size, s"You provided ${indices.length} indices and values, " + s"which exceeds the specified vector size ${size}.") + @Since("1.6.0") val NonSparseVal = 0.0 + override def toString: String = s"($size,${indices.mkString("[", ",", "]")},${values.mkString("[", ",", "]")})" @@ -723,7 +725,18 @@ class SparseVector @Since("1.0.0") ( } @Since("1.6.0") - private[spark] def foreach(f: (Double) => Unit) = { indices.foreach { i: Int => f(values(i)) } } + private[spark] def foreach(f: (Double) => Unit) = { + var i, j = 0 // i counts 0 -> size, j counts 0 -> indices.length + while (i < size) { + if (i == indices(j)){ + f(values(j)) + j += 1 + } else { + f(NonSparseVal) + } + i += 1 + } + } override def hashCode(): Int = { var result: Int = 31 + size diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index 3561023eaea75..0a66b02c8cc7f 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -303,7 +303,7 @@ class VectorsSuite extends SparkFunSuite with Logging { test("foreach") { val dv = Vectors.dense(0.0, 1.2, 3.1, 0.0) - val sv = Vectors.sparse(4, Seq((1, 1.2), (2, 3.1), (3, 0.0))) + val sv = Vectors.sparse(6, Seq((5, 1.2), (4, 3.1), (1, 6.0))) val dvArray = scala.collection.mutable.ArrayBuffer[Double]() dv.foreach { (value) => dvArray += value } @@ -313,12 +313,15 @@ class VectorsSuite extends SparkFunSuite with Logging { assert(dvArray(2) === 3.1) assert(dvArray(3) === 0.0) - val svMap = scala.collection.mutable.ArrayBuffer[Double]() - sv.foreach { (value) => svMap += value } - assert(svMap.size === 3) - assert(svMap(1) === 1.2) - assert(svMap(2) === 3.1) - assert(svMap(3) === 0.0) + val svArray = scala.collection.mutable.ArrayBuffer[Double]() + sv.foreach { (value) => svArray += value } + assert(svArray.size === 6) + assert(svArray(0) === 0.0) + assert(svArray(1) === 6.0) + assert(svArray(2) === 0.0) + assert(svArray(3) === 0.0) + assert(svArray(4) === 3.1) + assert(svArray(5) === 1.2) } test("vector p-norm") { From 7528c3490595113fcf677a1f2b73b7cb7c3cc445 Mon Sep 17 00:00:00 2001 From: Nakul Jindal Date: Fri, 30 Oct 2015 08:53:49 -0700 Subject: [PATCH 3/7] Removed foreach, foreachActive made public. --- .../apache/spark/mllib/linalg/Vectors.scala | 30 ++----------------- .../spark/mllib/linalg/VectorsSuite.scala | 23 -------------- 2 files changed, 3 insertions(+), 50 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 43e5114e3cf6b..0a428fd405a32 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -123,14 +123,7 @@ sealed trait Vector extends Serializable { * the vector with type `Int`, and the second parameter is the corresponding value * with type `Double`. */ - private[spark] def foreachActive(f: (Int, Double) => Unit) - - /** - * Applies a function 'f' to all elements of the vector. - * - * @param f the function that is applied for its side-effect to every element - */ - private[spark] def foreach(f: (Double) => Unit) + def foreachActive(f: (Int, Double) => Unit) /** * Number of active entries. An "active entry" is an element which is explicitly stored, @@ -577,7 +570,7 @@ class DenseVector @Since("1.0.0") ( new DenseVector(values.clone()) } - private[spark] override def foreachActive(f: (Int, Double) => Unit) = { + override def foreachActive(f: (Int, Double) => Unit) = { var i = 0 val localValuesSize = values.length val localValues = values @@ -588,9 +581,6 @@ class DenseVector @Since("1.0.0") ( } } - @Since("1.6.0") - private[spark] override def foreach(f: (Double) => Unit) = { values.foreach(f) } - override def hashCode(): Int = { var result: Int = 31 + size var i = 0 @@ -712,7 +702,7 @@ class SparseVector @Since("1.0.0") ( private[spark] override def toBreeze: BV[Double] = new BSV[Double](indices, values, size) - private[spark] override def foreachActive(f: (Int, Double) => Unit) = { + override def foreachActive(f: (Int, Double) => Unit) = { var i = 0 val localValuesSize = values.length val localIndices = indices @@ -724,20 +714,6 @@ class SparseVector @Since("1.0.0") ( } } - @Since("1.6.0") - private[spark] def foreach(f: (Double) => Unit) = { - var i, j = 0 // i counts 0 -> size, j counts 0 -> indices.length - while (i < size) { - if (i == indices(j)){ - f(values(j)) - j += 1 - } else { - f(NonSparseVal) - } - i += 1 - } - } - override def hashCode(): Int = { var result: Int = 31 + size val end = values.length diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala index 0a66b02c8cc7f..6508ddeba4206 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/VectorsSuite.scala @@ -301,29 +301,6 @@ class VectorsSuite extends SparkFunSuite with Logging { assert(svMap.get(3) === Some(0.0)) } - test("foreach") { - val dv = Vectors.dense(0.0, 1.2, 3.1, 0.0) - val sv = Vectors.sparse(6, Seq((5, 1.2), (4, 3.1), (1, 6.0))) - - val dvArray = scala.collection.mutable.ArrayBuffer[Double]() - dv.foreach { (value) => dvArray += value } - assert(dvArray.size === 4) - assert(dvArray(0) === 0.0) - assert(dvArray(1) === 1.2) - assert(dvArray(2) === 3.1) - assert(dvArray(3) === 0.0) - - val svArray = scala.collection.mutable.ArrayBuffer[Double]() - sv.foreach { (value) => svArray += value } - assert(svArray.size === 6) - assert(svArray(0) === 0.0) - assert(svArray(1) === 6.0) - assert(svArray(2) === 0.0) - assert(svArray(3) === 0.0) - assert(svArray(4) === 3.1) - assert(svArray(5) === 1.2) - } - test("vector p-norm") { val dv = Vectors.dense(0.0, -1.2, 3.1, 0.0, -4.5, 1.9) val sv = Vectors.sparse(6, Seq((1, -1.2), (2, 3.1), (3, 0.0), (4, -4.5), (5, 1.9))) From ab7b6cbdb2f28a484c5cc7c1d7e3ed9a2a2a7424 Mon Sep 17 00:00:00 2001 From: Nakul Jindal Date: Fri, 30 Oct 2015 08:55:16 -0700 Subject: [PATCH 4/7] Removed spurios NonSparseVal field. --- .../main/scala/org/apache/spark/mllib/linalg/Vectors.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 0a428fd405a32..3aac706276bf6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -677,9 +677,7 @@ class SparseVector @Since("1.0.0") ( s" ${values.length} values.") require(indices.length <= size, s"You provided ${indices.length} indices and values, " + s"which exceeds the specified vector size ${size}.") - - @Since("1.6.0") val NonSparseVal = 0.0 - + override def toString: String = s"($size,${indices.mkString("[", ",", "]")},${values.mkString("[", ",", "]")})" From 71b4813c60f8427cd6d352951dfad3dce7252749 Mon Sep 17 00:00:00 2001 From: Nakul Jindal Date: Fri, 30 Oct 2015 10:37:11 -0700 Subject: [PATCH 5/7] Removed spurious whitespace. --- .../src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 3aac706276bf6..cf712242392f3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -677,7 +677,7 @@ class SparseVector @Since("1.0.0") ( s" ${values.length} values.") require(indices.length <= size, s"You provided ${indices.length} indices and values, " + s"which exceeds the specified vector size ${size}.") - + override def toString: String = s"($size,${indices.mkString("[", ",", "]")},${values.mkString("[", ",", "]")})" From 57ce49add2917323ea91d3dbd1acc1ce4f095f0a Mon Sep 17 00:00:00 2001 From: Nakul Jindal Date: Fri, 30 Oct 2015 16:13:16 -0700 Subject: [PATCH 6/7] Added Since annotations, conformed to scala style. --- .../scala/org/apache/spark/mllib/linalg/Vectors.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index cf712242392f3..7761d0ccb2700 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -123,7 +123,7 @@ sealed trait Vector extends Serializable { * the vector with type `Int`, and the second parameter is the corresponding value * with type `Double`. */ - def foreachActive(f: (Int, Double) => Unit) + def foreachActive(f: (Int, Double) => Unit): Unit /** * Number of active entries. An "active entry" is an element which is explicitly stored, @@ -570,7 +570,8 @@ class DenseVector @Since("1.0.0") ( new DenseVector(values.clone()) } - override def foreachActive(f: (Int, Double) => Unit) = { + @Since("1.6.0") + override def foreachActive(f: (Int, Double) => Unit): Unit = { var i = 0 val localValuesSize = values.length val localValues = values @@ -700,7 +701,8 @@ class SparseVector @Since("1.0.0") ( private[spark] override def toBreeze: BV[Double] = new BSV[Double](indices, values, size) - override def foreachActive(f: (Int, Double) => Unit) = { + @Since("1.6.0") + override def foreachActive(f: (Int, Double) => Unit): Unit = { var i = 0 val localValuesSize = values.length val localIndices = indices From 7d1fa9aecdb31876811a0668eeba37e8ce908e63 Mon Sep 17 00:00:00 2001 From: Nakul Jindal Date: Fri, 30 Oct 2015 16:14:28 -0700 Subject: [PATCH 7/7] Added since annotation to Vectors. --- mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 7761d0ccb2700..bd9badc03c345 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -123,6 +123,7 @@ sealed trait Vector extends Serializable { * the vector with type `Int`, and the second parameter is the corresponding value * with type `Double`. */ + @Since("1.6.0") def foreachActive(f: (Int, Double) => Unit): Unit /**