
Code Review Comments
Yunni committed Nov 22, 2016
1 parent 4508393 commit 939e9d5
Showing 3 changed files with 35 additions and 35 deletions.
mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala — 50 changes: 25 additions & 25 deletions
@@ -50,8 +50,8 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol {
 
   /**
    * Transform the Schema for LSH
-   * @param schema The schema of the input dataset without [[outputCol]]
-   * @return A derived schema with [[outputCol]] added
+   * @param schema The schema of the input dataset without [[outputCol]].
+   * @return A derived schema with [[outputCol]] added.
    */
   protected[this] final def validateAndTransformSchema(schema: StructType): StructType = {
     SchemaUtils.appendColumn(schema, $(outputCol), DataTypes.createArrayType(new VectorUDT))
@@ -73,19 +73,19 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
 
   /**
    * Calculate the distance between two different keys using the distance metric corresponding
-   * to the hashFunction
-   * @param x One input vector in the metric space
-   * @param y One input vector in the metric space
-   * @return The distance between x and y
+   * to the hashFunction.
+   * @param x One input vector in the metric space.
+   * @param y One input vector in the metric space.
+   * @return The distance between x and y.
    */
   protected[ml] def keyDistance(x: Vector, y: Vector): Double
 
   /**
    * Calculate the distance between two different hash Vectors.
    *
-   * @param x One of the hash vector
-   * @param y Another hash vector
-   * @return The distance between hash vectors x and y
+   * @param x One of the hash vector.
+   * @param y Another hash vector.
+   * @return The distance between hash vectors x and y.
    */
   protected[ml] def hashDistance(x: Seq[Vector], y: Seq[Vector]): Double
 
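For intuition, a concrete subclass pairs keyDistance with a real metric. Below is a minimal sketch of a Jaccard-style distance over nonzero indices, the kind of metric a MinHash-based model could plug in; it is illustrative only and not the committed implementation:

    import org.apache.spark.ml.linalg.Vector

    // Sketch only: Jaccard distance between the sets of nonzero indices of two
    // vectors. `jaccardDistance` is a hypothetical helper name.
    def jaccardDistance(x: Vector, y: Vector): Double = {
      val xSet = x.toSparse.indices.toSet
      val ySet = y.toSparse.indices.toSet
      require(xSet.nonEmpty && ySet.nonEmpty, "Vectors must have at least one nonzero entry.")
      val intersection = xSet.intersect(ySet).size.toDouble
      // distance = 1 - Jaccard similarity
      1.0 - intersection / (xSet.size + ySet.size - intersection)
    }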
@@ -153,14 +153,14 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
    * the [[outputCol]] exists, it will use the [[outputCol]]. This allows caching of the
    * transformed data when necessary.
    *
-   * NOTE: This method is experimental and will likely change behavior in the next release.
+   * @note This method is experimental and will likely change behavior in the next release.
    *
-   * @param dataset the dataset to search for nearest neighbors of the key
-   * @param key Feature vector representing the item to search for
-   * @param numNearestNeighbors The maximum number of nearest neighbors
-   * @param distCol Output column for storing the distance between each result row and the key
-   * @return A dataset containing at most k items closest to the key. A distCol is added to show
-   *         the distance between each row and the key.
+   * @param dataset The dataset to search for nearest neighbors of the key.
+   * @param key Feature vector representing the item to search for.
+   * @param numNearestNeighbors The maximum number of nearest neighbors.
+   * @param distCol Output column for storing the distance between each result row and the key.
+   * @return A dataset containing at most k items closest to the key. A column "distCol" is added
+   *         to show the distance between each row and the key.
    */
   def approxNearestNeighbors(
       dataset: Dataset[_],
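A usage sketch of this method. Names are illustrative: `model` is assumed to be an already fitted LSH model and `df` a DataFrame containing the model's input column:

    import org.apache.spark.ml.linalg.Vectors

    // Hypothetical query point in the same feature space as df's input column.
    val key = Vectors.sparse(6, Seq((1, 1.0), (3, 1.0)))
    // Returns at most 3 rows of df closest to key; distances appear in "distCol".
    val neighbors = model.approxNearestNeighbors(df, key, 3, "distCol")
    neighbors.show()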
@@ -187,7 +187,7 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
    *
    * @param dataset The dataset to transform and explode.
    * @param explodeCols The alias for the exploded columns, must be a seq of two strings.
-   * @return A dataset containing idCol, inputCol and explodeCols
+   * @return A dataset containing idCol, inputCol and explodeCols.
    */
   private[this] def processDataset(
       dataset: Dataset[_],
@@ -206,9 +206,9 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
   /**
    * Recreate a column using the same column name but different attribute id. Used in approximate
    * similarity join.
-   * @param dataset The dataset where a column need to recreate
-   * @param colName The name of the column to recreate
-   * @param tmpColName A temporary column name which does not conflict with existing columns
+   * @param dataset The dataset where a column need to recreate.
+   * @param colName The name of the column to recreate.
+   * @param tmpColName A temporary column name which does not conflict with existing columns.
    * @return
    */
   private[this] def recreateCol(
@@ -227,12 +227,12 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
    * [[outputCol]] exists, it will use the [[outputCol]]. This allows caching of the transformed
    * data when necessary.
    *
-   * @param datasetA One of the datasets to join
-   * @param datasetB Another dataset to join
-   * @param threshold The threshold for the distance of row pairs
-   * @param distCol Output column for storing the distance between each result row and the key
+   * @param datasetA One of the datasets to join.
+   * @param datasetB Another dataset to join.
+   * @param threshold The threshold for the distance of row pairs.
+   * @param distCol Output column for storing the distance between each result row and the key.
    * @return A joined dataset containing pairs of rows. The original rows are in columns
-   *         "datasetA" and "datasetB", and a distCol is added to show the distance of each pair
+   *         "datasetA" and "datasetB", and a distCol is added to show the distance of each pair.
    */
   def approxSimilarityJoin(
       datasetA: Dataset[_],
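A join sketch under the same assumptions as above (`model` fitted; `dfA` and `dfB` hypothetical DataFrames with the model's input column; 0.6 is an arbitrary distance threshold):

    // Pairs of rows whose estimated distance is below the threshold.
    val joined = model.approxSimilarityJoin(dfA, dfB, 0.6, "distCol")
    // Original rows are nested in the "datasetA" and "datasetB" struct columns.
    joined.select("datasetA", "datasetB", "distCol").show()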
mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala
@@ -31,15 +31,16 @@ import org.apache.spark.sql.types.StructType
 /**
  * :: Experimental ::
  *
- * Model produced by [[MinHashLSH]], where multiple hash functions are stored. Each hash function is
- * picked from a hash family for a specific set `S` with cardinality equal to `numEntries`:
- * `h_i(x) = ((x \cdot a_i + b_i) \mod prime) \mod numEntries`
+ * Model produced by [[MinHashLSH]], where multiple hash functions are stored. Each hash function
+ * is picked from the following family of hash functions, where a_i and b_i are randomly chosen
+ * integers less than prime:
+ * `h_i(x) = ((x \cdot a_i + b_i) \mod prime)`
  *
  * This hash family is approximately min-wise independent according to the reference.
  *
  * Reference:
- * [[http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.121.8215&rep=rep1&type=pdf Min-wise
- * independent permutations]]
+ * Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear permutations."
+ * Electronic Journal of Combinatorics 7 (2000): R26.
  *
  * @param randCoefficients Pairs of random coefficients. Each pair is used by one hash function.
  */
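A tiny worked example of one hash function from this family, using the 1-based index shift from the implementation below and made-up coefficients:

    // Illustrative only: one MinHash function over the index set {0, 2},
    // with invented coefficients a = 3, b = 5 and prime = 13.
    val (a, b, prime) = (3, 5, 13)
    val elems = List(0, 2)
    // ((1 + 0) * 3 + 5) % 13 = 8 and ((1 + 2) * 3 + 5) % 13 = 1, so the min-hash is 1.
    val minHash = elems.map(e => ((1 + e) * a + b) % prime).min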
@@ -55,11 +56,11 @@ class MinHashLSHModel private[ml](
     elems: Vector => {
       require(elems.numNonzeros > 0, "Must have at least 1 non zero entry.")
       val elemsList = elems.toSparse.indices.toList
-      val hashValues = randCoefficients.map({ case (a: Int, b: Int) =>
+      val hashValues = randCoefficients.map { case (a, b) =>
         elemsList.map { elem: Int =>
           ((1 + elem) * a + b) % MinHashLSH.HASH_PRIME
         }.min.toDouble
-      })
+      }
       // TODO: Output vectors of dimension numHashFunctions in SPARK-18450
       hashValues.grouped(1).map(Vectors.dense).toArray
     }
@@ -132,7 +133,7 @@ class MinHashLSH(override val uid: String) extends LSH[MinHashLSHModel] with Has
     require(inputDim <= MinHashLSH.HASH_PRIME,
       s"The input vector dimension $inputDim exceeds the threshold ${MinHashLSH.HASH_PRIME}.")
     val rand = new Random($(seed))
-    val randCoefs: Array[(Int, Int)] = Array.fill(2 * $(numHashTables)) {
+    val randCoefs: Array[(Int, Int)] = Array.fill($(numHashTables)) {
       (1 + rand.nextInt(MinHashLSH.HASH_PRIME - 1), rand.nextInt(MinHashLSH.HASH_PRIME - 1))
     }
     new MinHashLSHModel(uid, randCoefs)
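After this change, each hash table draws exactly one (a, b) pair rather than two. A fitting sketch, where column names and parameter values are illustrative and the standard Param setters are assumed:

    import org.apache.spark.ml.feature.MinHashLSH

    // Hypothetical DataFrame `df` with a Vector column "features".
    val mh = new MinHashLSH()
      .setNumHashTables(5)       // samples 5 (a, b) coefficient pairs
      .setInputCol("features")
      .setOutputCol("hashes")
    val model = mh.fit(df)       // a MinHashLSHModel holding randCoefficients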
mllib/src/test/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSHSuite.scala
@@ -57,8 +57,7 @@ class BucketedRandomProjectionLSHSuite
   test("read/write") {
     def checkModelData(
         model: BucketedRandomProjectionLSHModel,
-        model2: BucketedRandomProjectionLSHModel
-      ): Unit = {
+        model2: BucketedRandomProjectionLSHModel): Unit = {
       model.randUnitVectors.zip(model2.randUnitVectors)
         .foreach(pair => assert(pair._1 === pair._2))
     }
