Revert AND-amplification for a future PR
Yunni committed Nov 14, 2016
1 parent adbbefe commit c115ed3
Showing 5 changed files with 14 additions and 59 deletions.
13 changes: 3 additions & 10 deletions mllib/src/main/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSH.scala
@@ -79,7 +79,8 @@ class BucketedRandomProjectionModel private[ml](
       val hashValues: Array[Double] = randUnitVectors.map({
         randUnitVector => Math.floor(BLAS.dot(key, randUnitVector) / $(bucketLength))
       })
-      hashValues.grouped($(numHashFunctions)).map(Vectors.dense).toArray
+      // TODO: For AND-amplification, output vectors of dimension numHashFunctions
+      hashValues.grouped(1).map(Vectors.dense).toArray
     }
   }

@@ -94,11 +95,6 @@ class BucketedRandomProjectionModel private[ml](
     x.zip(y).map(vectorPair => Vectors.sqdist(vectorPair._1, vectorPair._2)).min
   }
 
-  @Since("2.1.0")
-  override protected[ml] def validateDimension(): Unit = {
-    require(randUnitVectors.length == ($(numHashFunctions) * $(numHashTables)))
-  }
-
   @Since("2.1.0")
   override def copy(extra: ParamMap): this.type = defaultCopy(extra)

@@ -137,9 +133,6 @@ class BucketedRandomProjectionLSH(override val uid: String)
   @Since("2.1.0")
   override def setOutputCol(value: String): this.type = super.setOutputCol(value)
 
-  @Since("2.1.0")
-  override def setNumHashFunctions(value: Int): this.type = super.setNumHashFunctions(value)
-
   @Since("2.1.0")
   override def setNumHashTables(value: Int): this.type = super.setNumHashTables(value)

@@ -160,7 +153,7 @@ class BucketedRandomProjectionLSH(override val uid: String)
   override protected[this] def createRawLSHModel(inputDim: Int): BucketedRandomProjectionModel = {
     val rand = new Random($(seed))
     val randUnitVectors: Array[Vector] = {
-      Array.fill($(numHashTables) * $(numHashFunctions)) {
+      Array.fill($(numHashTables)) {
         val randArray = Array.fill(inputDim)(rand.nextGaussian())
         Vectors.fromBreeze(normalize(breeze.linalg.Vector(randArray)))
       }
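Context for the change above (not part of the diff): after the revert, hashFunction emits one single-element hash vector per random unit vector, namely floor(v · x / bucketLength), and the TODO marks where grouped(numHashFunctions) would reintroduce AND-amplification. A minimal standalone sketch of the post-revert scheme, using plain arrays in place of MLlib's Vector type; the object and parameter names are illustrative, not Spark API:

    object BrpHashSketch {
      // One hash value per random unit vector: floor(v . key / bucketLength).
      def hashFunction(
          key: Array[Double],
          randUnitVectors: Seq[Array[Double]],
          bucketLength: Double): Array[Array[Double]] = {
        val hashValues = randUnitVectors.map { v =>
          math.floor(v.zip(key).map { case (a, b) => a * b }.sum / bucketLength)
        }
        // grouped(1): each hash table contributes a single-element vector.
        // The reverted AND-amplification would use grouped(numHashFunctions) here.
        hashValues.grouped(1).map(_.toArray).toArray
      }
    }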
22 changes: 1 addition & 21 deletions mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala
@@ -43,24 +43,10 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol {
     "tables, where increasing number of hash tables lowers the false negative rate, and " +
     "decreasing it improves the running performance", ParamValidators.gt(0))
 
-  /**
-   * Param for the dimension of LSH AND-amplification.
-   *
-   * LSH AND-amplification can be used to reduce the false positive rate. The higher the dimension
-   * is, the lower the false positive rate.
-   * @group param
-   */
-  final val numHashFunctions: IntParam = new IntParam(this, "numHashFunctions", "number of hash " +
-    "functions in each hash table, where increasing the number improves the running performance, " +
-    "and decreasing it raises the false negative rate", ParamValidators.gt(0))
-
-  /** @group getParam */
-  final def getNumHashFunctions: Int = $(numHashFunctions)
-
   /** @group getParam */
   final def getNumHashTables: Int = $(numHashTables)
 
-  setDefault(numHashTables -> 1, numHashFunctions -> 1)
+  setDefault(numHashTables -> 1)
 
   /**
    * Transform the Schema for LSH
@@ -103,10 +89,7 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
    */
   protected[ml] def hashDistance(x: Seq[Vector], y: Seq[Vector]): Double
 
-  protected[ml] def validateDimension(): Unit
-
   override def transform(dataset: Dataset[_]): DataFrame = {
-    validateDimension()
     transformSchema(dataset.schema, logging = true)
     val transformUDF = udf(hashFunction, DataTypes.createArrayType(new VectorUDT))
     dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol))))
@@ -338,9 +321,6 @@ private[ml] abstract class LSH[T <: LSHModel[T]]
   /** @group setParam */
   def setOutputCol(value: String): this.type = set(outputCol, value)
 
-  /** @group setParam */
-  def setNumHashFunctions(value: Int): this.type = set(numHashFunctions, value)
-
   /** @group setParam */
   def setNumHashTables(value: Int): this.type = set(numHashTables, value)

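Context for LSH.scala (not part of the diff): numHashTables plays the role of L in the classic (k, L) amplification scheme (OR-amplification across tables, which lowers the false negative rate), while the removed numHashFunctions played the role of k (AND-amplification within a table, which lowers the false positive rate). A sketch of the textbook collision probability under that reading, where p is assumed to be the collision probability of a single hash function for a given pair of keys:

    // Textbook (k, l)-amplified LSH: a pair of keys becomes a candidate match
    // if, in at least one of the l tables, all k hash functions collide.
    // Illustrative arithmetic only, not Spark API.
    def amplifiedCollisionProb(p: Double, k: Int, l: Int): Double =
      1.0 - math.pow(1.0 - math.pow(p, k), l)

    // After this commit k is effectively fixed at 1, so only OR-amplification
    // remains: amplifiedCollisionProb(0.5, 1, 20) is roughly 0.999999.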
14 changes: 3 additions & 11 deletions mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala
@@ -56,7 +56,8 @@ class MinHashModel private[ml] (
         (1 + elem) * randCoefficient.toLong % MinHashLSH.prime % numEntries
       }).min.toDouble
     })
-    hashValues.grouped($(numHashFunctions)).map(Vectors.dense).toArray
+    // TODO: For AND-amplification, output vectors of dimension numHashFunctions
+    hashValues.grouped(1).map(Vectors.dense).toArray
   }
 }

@@ -79,11 +80,6 @@ class MinHashModel private[ml] (
     ).min
   }
 
-  @Since("2.1.0")
-  override protected[ml] def validateDimension(): Unit = {
-    require(randCoefficients.length == ($(numHashFunctions) * $(numHashTables)))
-  }
-
   @Since("2.1.0")
   override def copy(extra: ParamMap): this.type = defaultCopy(extra)

@@ -115,9 +111,6 @@ class MinHashLSH(override val uid: String) extends LSH[MinHashModel] with HasSeed
   @Since("2.1.0")
   override def setOutputCol(value: String): this.type = super.setOutputCol(value)
 
-  @Since("2.1.0")
-  override def setNumHashFunctions(value: Int): this.type = super.setNumHashFunctions(value)
-
   @Since("2.1.0")
   override def setNumHashTables(value: Int): this.type = super.setNumHashTables(value)

@@ -136,8 +129,7 @@ class MinHashLSH(override val uid: String) extends LSH[MinHashModel] with HasSeed
       s"The input vector dimension $inputDim exceeds the threshold ${MinHashLSH.prime / 2}.")
     val rand = new Random($(seed))
     val numEntry = inputDim * 2
-    val randCoofs: Array[Int] = Array
-      .fill($(numHashTables) * $(numHashFunctions))(1 + rand.nextInt(MinHashLSH.prime - 1))
+    val randCoofs: Array[Int] = Array.fill($(numHashTables))(1 + rand.nextInt(MinHashLSH.prime - 1))
     new MinHashModel(uid, numEntry, randCoofs)
   }

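Context for the MinHash change (not part of the diff): per the hunk above, each coefficient c hashes the set of nonzero indices to min over elem of ((1 + elem) * c mod prime mod numEntries), and after the revert there is one random coefficient per hash table. A standalone sketch under those assumptions, with prime standing in for MinHashLSH.prime; names are illustrative, not Spark API:

    // activeIndices: the nonzero indices of the sparse input vector.
    def minHash(
        activeIndices: Seq[Int],
        randCoefficients: Seq[Int],
        prime: Long,
        numEntries: Int): Array[Array[Double]] = {
      val hashValues = randCoefficients.map { coef =>
        activeIndices.map(elem => (1 + elem) * coef.toLong % prime % numEntries).min.toDouble
      }
      // One single-element vector per coefficient, mirroring grouped(1) above.
      hashValues.grouped(1).map(_.toArray).toArray
    }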
13 changes: 3 additions & 10 deletions mllib/src/test/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSHSuite.scala
@@ -51,7 +51,6 @@ class BucketedRandomProjectionLSHSuite
 
   test("BucketedRandomProjectionLSH: default params") {
     val brp = new BucketedRandomProjectionLSH
-    assert(brp.getNumHashFunctions === 1.0)
     assert(brp.getNumHashTables === 1.0)
   }

@@ -69,15 +68,12 @@ class BucketedRandomProjectionLSHSuite
   }
 
   test("hashFunction") {
-    val randUnitVectors = Array(Vectors.dense(0.0, 1.0),
-      Vectors.dense(1.0, 0.0), Vectors.dense(0.25, 0.75), Vectors.dense(0.75, 0.25))
+    val randUnitVectors = Array(Vectors.dense(0.0, 1.0), Vectors.dense(1.0, 0.0))
     val model = new BucketedRandomProjectionModel("brp", randUnitVectors)
     model.set(model.bucketLength, 0.5)
-    model.set(model.numHashTables, 2)
-    model.set(model.numHashFunctions, 2)
     val res = model.hashFunction(Vectors.dense(1.23, 4.56))
-    assert(res(0).equals(Vectors.dense(9.0, 2.0)))
-    assert(res(1).equals(Vectors.dense(7.0, 4.0)))
+    assert(res(0).equals(Vectors.dense(9.0)))
+    assert(res(1).equals(Vectors.dense(2.0)))
   }
 
   test("keyDistance") {
@@ -88,7 +84,6 @@
 
   test("BucketedRandomProjectionLSH: randUnitVectors") {
     val brp = new BucketedRandomProjectionLSH()
-      .setNumHashFunctions(10)
       .setNumHashTables(20)
       .setInputCol("keys")
       .setOutputCol("values")
@@ -123,7 +118,6 @@
 
     // Project from 100 dimensional Euclidean Space to 10 dimensions
     val brp = new BucketedRandomProjectionLSH()
-      .setNumHashFunctions(5)
       .setNumHashTables(10)
       .setInputCol("keys")
       .setOutputCol("values")
@@ -155,7 +149,6 @@
     val key = Vectors.dense(1.2, 3.4)
 
     val brp = new BucketedRandomProjectionLSH()
-      .setNumHashFunctions(10)
       .setNumHashTables(20)
       .setInputCol("keys")
       .setOutputCol("values")
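A quick check of the updated expectations above (not part of the diff): with bucketLength = 0.5 and key (1.23, 4.56), the unit vectors (0.0, 1.0) and (1.0, 0.0) give floor(4.56 / 0.5) = floor(9.12) = 9.0 and floor(1.23 / 0.5) = floor(2.46) = 2.0, so grouped(1) yields exactly the asserted Vectors.dense(9.0) and Vectors.dense(2.0).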
11 changes: 4 additions & 7 deletions mllib/src/test/scala/org/apache/spark/ml/feature/MinHashLSHSuite.scala
@@ -46,7 +46,6 @@ class MinHashLSHSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest
   test("MinHashLSH: default params") {
     val rp = new MinHashLSH
     assert(rp.getNumHashTables === 1.0)
-    assert(rp.getNumHashFunctions === 1.0)
   }
 
   test("read/write") {
@@ -60,12 +59,11 @@ class MinHashLSHSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest
   }
 
   test("hashFunction") {
-    val model = new MinHashModel("mh", numEntries = 20, randCoefficients = Array(0, 1, 2, 3))
-    model.set(model.numHashTables, 2)
-    model.set(model.numHashFunctions, 2)
+    val model = new MinHashModel("mh", numEntries = 20, randCoefficients = Array(0, 1, 3))
     val res = model.hashFunction(Vectors.sparse(10, Seq((2, 1.0), (3, 1.0), (5, 1.0), (7, 1.0))))
-    assert(res(0).equals(Vectors.dense(0.0, 3.0)))
-    assert(res(1).equals(Vectors.dense(6.0, 4.0)))
+    assert(res(0).equals(Vectors.dense(0.0)))
+    assert(res(1).equals(Vectors.dense(3.0)))
+    assert(res(2).equals(Vectors.dense(4.0)))
   }
 
   test("keyDistance") {
@@ -115,7 +113,6 @@ class MinHashLSHSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest
     val df2 = spark.createDataFrame(data2.map(Tuple1.apply)).toDF("keys")
 
     val mh = new MinHashLSH()
-      .setNumHashFunctions(2)
      .setNumHashTables(20)
      .setInputCol("keys")
      .setOutputCol("values")
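Likewise for the updated MinHash expectations (not part of the diff): the nonzero indices are 2, 3, 5, 7, so (1 + elem) * coef % prime % 20 gives min(0, 0, 0, 0) = 0 for coef = 0, min(3, 4, 6, 8) = 3 for coef = 1, and min(9, 12, 18, 24 % 20) = min(9, 12, 18, 4) = 4 for coef = 3, matching the asserted Vectors.dense(0.0), Vectors.dense(3.0), and Vectors.dense(4.0).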
