Revert AND-amplification for a future PR
Yunni committed Nov 14, 2016
1 parent adbbefe commit c115ed3
Showing 5 changed files with 14 additions and 59 deletions.
13 changes: 3 additions & 10 deletions mllib/src/main/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSH.scala
@@ -79,7 +79,8 @@ class BucketedRandomProjectionModel private[ml](
       val hashValues: Array[Double] = randUnitVectors.map({
         randUnitVector => Math.floor(BLAS.dot(key, randUnitVector) / $(bucketLength))
       })
-      hashValues.grouped($(numHashFunctions)).map(Vectors.dense).toArray
+      // TODO: For AND-amplification, output vectors of dimension numHashFunctions
+      hashValues.grouped(1).map(Vectors.dense).toArray
     }
   }

@@ -94,11 +95,6 @@ class BucketedRandomProjectionModel private[ml](
     x.zip(y).map(vectorPair => Vectors.sqdist(vectorPair._1, vectorPair._2)).min
   }
 
-  @Since("2.1.0")
-  override protected[ml] def validateDimension(): Unit = {
-    require(randUnitVectors.length == ($(numHashFunctions) * $(numHashTables)))
-  }
-
   @Since("2.1.0")
   override def copy(extra: ParamMap): this.type = defaultCopy(extra)

@@ -137,9 +133,6 @@ class BucketedRandomProjectionLSH(override val uid: String)
   @Since("2.1.0")
   override def setOutputCol(value: String): this.type = super.setOutputCol(value)
 
-  @Since("2.1.0")
-  override def setNumHashFunctions(value: Int): this.type = super.setNumHashFunctions(value)
-
   @Since("2.1.0")
   override def setNumHashTables(value: Int): this.type = super.setNumHashTables(value)

@@ -160,7 +153,7 @@ class BucketedRandomProjectionLSH(override val uid: String)
   override protected[this] def createRawLSHModel(inputDim: Int): BucketedRandomProjectionModel = {
     val rand = new Random($(seed))
     val randUnitVectors: Array[Vector] = {
-      Array.fill($(numHashTables) * $(numHashFunctions)) {
+      Array.fill($(numHashTables)) {
         val randArray = Array.fill(inputDim)(rand.nextGaussian())
         Vectors.fromBreeze(normalize(breeze.linalg.Vector(randArray)))
       }
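Context for the change above (not part of the diff): after the revert, hashFunction emits one single-element hash vector per random unit vector, namely floor(v · x / bucketLength), and the TODO marks where grouped(numHashFunctions) would reintroduce AND-amplification. A minimal standalone sketch of the post-revert scheme, using plain arrays in place of MLlib's Vector type; the object and parameter names are illustrative, not Spark API:

    object BrpHashSketch {
      // One hash value per random unit vector: floor(v . key / bucketLength).
      def hashFunction(
          key: Array[Double],
          randUnitVectors: Seq[Array[Double]],
          bucketLength: Double): Array[Array[Double]] = {
        val hashValues = randUnitVectors.map { v =>
          math.floor(v.zip(key).map { case (a, b) => a * b }.sum / bucketLength)
        }
        // grouped(1): each hash table contributes a single-element vector.
        // The reverted AND-amplification would use grouped(numHashFunctions) here.
        hashValues.grouped(1).map(_.toArray).toArray
      }
    }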
22 changes: 1 addition & 21 deletions mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala
@@ -43,24 +43,10 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol {
     "tables, where increasing number of hash tables lowers the false negative rate, and " +
     "decreasing it improves the running performance", ParamValidators.gt(0))
 
-  /**
-   * Param for the dimension of LSH AND-amplification.
-   *
-   * LSH AND-amplification can be used to reduce the false positive rate. The higher the dimension
-   * is, the lower the false positive rate.
-   * @group param
-   */
-  final val numHashFunctions: IntParam = new IntParam(this, "numHashFunctions", "number of hash " +
-    "functions in each hash table, where increasing the number improves the running performance, " +
-    "and decreasing it raises the false negative rate", ParamValidators.gt(0))
-
-  /** @group getParam */
-  final def getNumHashFunctions: Int = $(numHashFunctions)
-
   /** @group getParam */
   final def getNumHashTables: Int = $(numHashTables)
 
-  setDefault(numHashTables -> 1, numHashFunctions -> 1)
+  setDefault(numHashTables -> 1)
 
   /**
    * Transform the Schema for LSH
@@ -103,10 +89,7 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
    */
   protected[ml] def hashDistance(x: Seq[Vector], y: Seq[Vector]): Double
 
-  protected[ml] def validateDimension(): Unit
-
   override def transform(dataset: Dataset[_]): DataFrame = {
-    validateDimension()
     transformSchema(dataset.schema, logging = true)
     val transformUDF = udf(hashFunction, DataTypes.createArrayType(new VectorUDT))
     dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol))))
@@ -338,9 +321,6 @@ private[ml] abstract class LSH[T <: LSHModel[T]]
   /** @group setParam */
   def setOutputCol(value: String): this.type = set(outputCol, value)
 
-  /** @group setParam */
-  def setNumHashFunctions(value: Int): this.type = set(numHashFunctions, value)
-
   /** @group setParam */
   def setNumHashTables(value: Int): this.type = set(numHashTables, value)

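Context for LSH.scala (not part of the diff): numHashTables plays the role of L in the classic (k, L) amplification scheme (OR-amplification across tables, which lowers the false negative rate), while the removed numHashFunctions played the role of k (AND-amplification within a table, which lowers the false positive rate). A sketch of the textbook collision probability under that reading, where p is assumed to be the collision probability of a single hash function for a given pair of keys:

    // Textbook (k, l)-amplified LSH: a pair of keys becomes a candidate match
    // if, in at least one of the l tables, all k hash functions collide.
    // Illustrative arithmetic only, not Spark API.
    def amplifiedCollisionProb(p: Double, k: Int, l: Int): Double =
      1.0 - math.pow(1.0 - math.pow(p, k), l)

    // After this commit k is effectively fixed at 1, so only OR-amplification
    // remains: amplifiedCollisionProb(0.5, 1, 20) is roughly 0.999999.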
14 changes: 3 additions & 11 deletions mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala
@@ -56,7 +56,8 @@ class MinHashModel private[ml] (
         (1 + elem) * randCoefficient.toLong % MinHashLSH.prime % numEntries
       }).min.toDouble
     })
-    hashValues.grouped($(numHashFunctions)).map(Vectors.dense).toArray
+    // TODO: For AND-amplification, output vectors of dimension numHashFunctions
+    hashValues.grouped(1).map(Vectors.dense).toArray
   }
 }

@@ -79,11 +80,6 @@ class MinHashModel private[ml] (
     ).min
   }
 
-  @Since("2.1.0")
-  override protected[ml] def validateDimension(): Unit = {
-    require(randCoefficients.length == ($(numHashFunctions) * $(numHashTables)))
-  }
-
   @Since("2.1.0")
   override def copy(extra: ParamMap): this.type = defaultCopy(extra)

@@ -115,9 +111,6 @@ class MinHashLSH(override val uid: String) extends LSH[MinHashModel] with HasSeed
   @Since("2.1.0")
   override def setOutputCol(value: String): this.type = super.setOutputCol(value)
 
-  @Since("2.1.0")
-  override def setNumHashFunctions(value: Int): this.type = super.setNumHashFunctions(value)
-
   @Since("2.1.0")
   override def setNumHashTables(value: Int): this.type = super.setNumHashTables(value)

@@ -136,8 +129,7 @@ class MinHashLSH(override val uid: String) extends LSH[MinHashModel] with HasSeed
       s"The input vector dimension $inputDim exceeds the threshold ${MinHashLSH.prime / 2}.")
     val rand = new Random($(seed))
     val numEntry = inputDim * 2
-    val randCoofs: Array[Int] = Array
-      .fill($(numHashTables) * $(numHashFunctions))(1 + rand.nextInt(MinHashLSH.prime - 1))
+    val randCoofs: Array[Int] = Array.fill($(numHashTables))(1 + rand.nextInt(MinHashLSH.prime - 1))
     new MinHashModel(uid, numEntry, randCoofs)
   }

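Context for the MinHash change (not part of the diff): per the hunk above, each coefficient c hashes the set of nonzero indices to min over elem of ((1 + elem) * c mod prime mod numEntries), and after the revert there is one random coefficient per hash table. A standalone sketch under those assumptions, with prime standing in for MinHashLSH.prime; names are illustrative, not Spark API:

    // activeIndices: the nonzero indices of the sparse input vector.
    def minHash(
        activeIndices: Seq[Int],
        randCoefficients: Seq[Int],
        prime: Long,
        numEntries: Int): Array[Array[Double]] = {
      val hashValues = randCoefficients.map { coef =>
        activeIndices.map(elem => (1 + elem) * coef.toLong % prime % numEntries).min.toDouble
      }
      // One single-element vector per coefficient, mirroring grouped(1) above.
      hashValues.grouped(1).map(_.toArray).toArray
    }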
13 changes: 3 additions & 10 deletions mllib/src/test/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSHSuite.scala
@@ -51,7 +51,6 @@ class BucketedRandomProjectionLSHSuite
 
   test("BucketedRandomProjectionLSH: default params") {
     val brp = new BucketedRandomProjectionLSH
-    assert(brp.getNumHashFunctions === 1.0)
     assert(brp.getNumHashTables === 1.0)
   }

@@ -69,15 +68,12 @@ class BucketedRandomProjectionLSHSuite
   }
 
   test("hashFunction") {
-    val randUnitVectors = Array(Vectors.dense(0.0, 1.0),
-      Vectors.dense(1.0, 0.0), Vectors.dense(0.25, 0.75), Vectors.dense(0.75, 0.25))
+    val randUnitVectors = Array(Vectors.dense(0.0, 1.0), Vectors.dense(1.0, 0.0))
     val model = new BucketedRandomProjectionModel("brp", randUnitVectors)
     model.set(model.bucketLength, 0.5)
-    model.set(model.numHashTables, 2)
-    model.set(model.numHashFunctions, 2)
     val res = model.hashFunction(Vectors.dense(1.23, 4.56))
-    assert(res(0).equals(Vectors.dense(9.0, 2.0)))
-    assert(res(1).equals(Vectors.dense(7.0, 4.0)))
+    assert(res(0).equals(Vectors.dense(9.0)))
+    assert(res(1).equals(Vectors.dense(2.0)))
   }
 
   test("keyDistance") {
@@ -88,7 +84,6 @@
 
   test("BucketedRandomProjectionLSH: randUnitVectors") {
     val brp = new BucketedRandomProjectionLSH()
-      .setNumHashFunctions(10)
       .setNumHashTables(20)
       .setInputCol("keys")
       .setOutputCol("values")
@@ -123,7 +118,6 @@
 
     // Project from 100 dimensional Euclidean Space to 10 dimensions
     val brp = new BucketedRandomProjectionLSH()
-      .setNumHashFunctions(5)
       .setNumHashTables(10)
       .setInputCol("keys")
       .setOutputCol("values")
@@ -155,7 +149,6 @@
     val key = Vectors.dense(1.2, 3.4)
 
     val brp = new BucketedRandomProjectionLSH()
-      .setNumHashFunctions(10)
       .setNumHashTables(20)
       .setInputCol("keys")
       .setOutputCol("values")
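A quick check of the updated expectations above (not part of the diff): with bucketLength = 0.5 and key (1.23, 4.56), the unit vectors (0.0, 1.0) and (1.0, 0.0) give floor(4.56 / 0.5) = floor(9.12) = 9.0 and floor(1.23 / 0.5) = floor(2.46) = 2.0, so grouped(1) yields exactly the asserted Vectors.dense(9.0) and Vectors.dense(2.0).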
11 changes: 4 additions & 7 deletions mllib/src/test/scala/org/apache/spark/ml/feature/MinHashLSHSuite.scala
@@ -46,7 +46,6 @@ class MinHashLSHSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest
   test("MinHashLSH: default params") {
     val rp = new MinHashLSH
     assert(rp.getNumHashTables === 1.0)
-    assert(rp.getNumHashFunctions === 1.0)
   }
 
   test("read/write") {
@@ -60,12 +59,11 @@ class MinHashLSHSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest
   }
 
   test("hashFunction") {
-    val model = new MinHashModel("mh", numEntries = 20, randCoefficients = Array(0, 1, 2, 3))
-    model.set(model.numHashTables, 2)
-    model.set(model.numHashFunctions, 2)
+    val model = new MinHashModel("mh", numEntries = 20, randCoefficients = Array(0, 1, 3))
     val res = model.hashFunction(Vectors.sparse(10, Seq((2, 1.0), (3, 1.0), (5, 1.0), (7, 1.0))))
-    assert(res(0).equals(Vectors.dense(0.0, 3.0)))
-    assert(res(1).equals(Vectors.dense(6.0, 4.0)))
+    assert(res(0).equals(Vectors.dense(0.0)))
+    assert(res(1).equals(Vectors.dense(3.0)))
+    assert(res(2).equals(Vectors.dense(4.0)))
   }
 
   test("keyDistance") {
@@ -115,7 +113,6 @@ class MinHashLSHSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest
     val df2 = spark.createDataFrame(data2.map(Tuple1.apply)).toDF("keys")
 
     val mh = new MinHashLSH()
-      .setNumHashFunctions(2)
      .setNumHashTables(20)
      .setInputCol("keys")
      .setOutputCol("values")
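Likewise for the updated MinHash expectations (not part of the diff): the nonzero indices are 2, 3, 5, 7, so (1 + elem) * coef % prime % 20 gives min(0, 0, 0, 0) = 0 for coef = 0, min(3, 4, 6, 8) = 3 for coef = 1, and min(9, 12, 18, 24 % 20) = min(9, 12, 18, 4) = 4 for coef = 3, matching the asserted Vectors.dense(0.0), Vectors.dense(3.0), and Vectors.dense(4.0).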
