
Code Review Comments
Yunni committed Nov 22, 2016
1 parent 4508393 commit 939e9d5
Showing 3 changed files with 35 additions and 35 deletions.
mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala — 50 changes: 25 additions & 25 deletions
@@ -50,8 +50,8 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol {
 
   /**
    * Transform the Schema for LSH
-   * @param schema The schema of the input dataset without [[outputCol]]
-   * @return A derived schema with [[outputCol]] added
+   * @param schema The schema of the input dataset without [[outputCol]].
+   * @return A derived schema with [[outputCol]] added.
    */
   protected[this] final def validateAndTransformSchema(schema: StructType): StructType = {
     SchemaUtils.appendColumn(schema, $(outputCol), DataTypes.createArrayType(new VectorUDT))
@@ -73,19 +73,19 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
 
   /**
    * Calculate the distance between two different keys using the distance metric corresponding
-   * to the hashFunction
-   * @param x One input vector in the metric space
-   * @param y One input vector in the metric space
-   * @return The distance between x and y
+   * to the hashFunction.
+   * @param x One input vector in the metric space.
+   * @param y One input vector in the metric space.
+   * @return The distance between x and y.
    */
   protected[ml] def keyDistance(x: Vector, y: Vector): Double
 
   /**
    * Calculate the distance between two different hash Vectors.
    *
-   * @param x One of the hash vector
-   * @param y Another hash vector
-   * @return The distance between hash vectors x and y
+   * @param x One of the hash vector.
+   * @param y Another hash vector.
+   * @return The distance between hash vectors x and y.
    */
   protected[ml] def hashDistance(x: Seq[Vector], y: Seq[Vector]): Double
 
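For intuition, a concrete subclass pairs keyDistance with a real metric. Below is a minimal sketch of a Jaccard-style distance over nonzero indices, the kind of metric a MinHash-based model could plug in; it is illustrative only and not the committed implementation:

    import org.apache.spark.ml.linalg.Vector

    // Sketch only: Jaccard distance between the sets of nonzero indices of two
    // vectors. `jaccardDistance` is a hypothetical helper name.
    def jaccardDistance(x: Vector, y: Vector): Double = {
      val xSet = x.toSparse.indices.toSet
      val ySet = y.toSparse.indices.toSet
      require(xSet.nonEmpty && ySet.nonEmpty, "Vectors must have at least one nonzero entry.")
      val intersection = xSet.intersect(ySet).size.toDouble
      // distance = 1 - Jaccard similarity
      1.0 - intersection / (xSet.size + ySet.size - intersection)
    }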
@@ -153,14 +153,14 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
    * the [[outputCol]] exists, it will use the [[outputCol]]. This allows caching of the
    * transformed data when necessary.
    *
-   * NOTE: This method is experimental and will likely change behavior in the next release.
+   * @note This method is experimental and will likely change behavior in the next release.
    *
-   * @param dataset the dataset to search for nearest neighbors of the key
-   * @param key Feature vector representing the item to search for
-   * @param numNearestNeighbors The maximum number of nearest neighbors
-   * @param distCol Output column for storing the distance between each result row and the key
-   * @return A dataset containing at most k items closest to the key. A distCol is added to show
-   *         the distance between each row and the key.
+   * @param dataset The dataset to search for nearest neighbors of the key.
+   * @param key Feature vector representing the item to search for.
+   * @param numNearestNeighbors The maximum number of nearest neighbors.
+   * @param distCol Output column for storing the distance between each result row and the key.
+   * @return A dataset containing at most k items closest to the key. A column "distCol" is added
+   *         to show the distance between each row and the key.
    */
   def approxNearestNeighbors(
       dataset: Dataset[_],
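A usage sketch of this method. Names are illustrative: `model` is assumed to be an already fitted LSH model and `df` a DataFrame containing the model's input column:

    import org.apache.spark.ml.linalg.Vectors

    // Hypothetical query point in the same feature space as df's input column.
    val key = Vectors.sparse(6, Seq((1, 1.0), (3, 1.0)))
    // Returns at most 3 rows of df closest to key; distances appear in "distCol".
    val neighbors = model.approxNearestNeighbors(df, key, 3, "distCol")
    neighbors.show()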
@@ -187,7 +187,7 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
    *
    * @param dataset The dataset to transform and explode.
    * @param explodeCols The alias for the exploded columns, must be a seq of two strings.
-   * @return A dataset containing idCol, inputCol and explodeCols
+   * @return A dataset containing idCol, inputCol and explodeCols.
    */
   private[this] def processDataset(
       dataset: Dataset[_],
@@ -206,9 +206,9 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
   /**
    * Recreate a column using the same column name but different attribute id. Used in approximate
    * similarity join.
-   * @param dataset The dataset where a column need to recreate
-   * @param colName The name of the column to recreate
-   * @param tmpColName A temporary column name which does not conflict with existing columns
+   * @param dataset The dataset where a column need to recreate.
+   * @param colName The name of the column to recreate.
+   * @param tmpColName A temporary column name which does not conflict with existing columns.
    * @return
    */
   private[this] def recreateCol(
@@ -227,12 +227,12 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
    * [[outputCol]] exists, it will use the [[outputCol]]. This allows caching of the transformed
    * data when necessary.
    *
-   * @param datasetA One of the datasets to join
-   * @param datasetB Another dataset to join
-   * @param threshold The threshold for the distance of row pairs
-   * @param distCol Output column for storing the distance between each result row and the key
+   * @param datasetA One of the datasets to join.
+   * @param datasetB Another dataset to join.
+   * @param threshold The threshold for the distance of row pairs.
+   * @param distCol Output column for storing the distance between each result row and the key.
    * @return A joined dataset containing pairs of rows. The original rows are in columns
-   *         "datasetA" and "datasetB", and a distCol is added to show the distance of each pair
+   *         "datasetA" and "datasetB", and a distCol is added to show the distance of each pair.
    */
   def approxSimilarityJoin(
       datasetA: Dataset[_],
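A join sketch under the same assumptions as above (`model` fitted; `dfA` and `dfB` hypothetical DataFrames with the model's input column; 0.6 is an arbitrary distance threshold):

    // Pairs of rows whose estimated distance is below the threshold.
    val joined = model.approxSimilarityJoin(dfA, dfB, 0.6, "distCol")
    // Original rows are nested in the "datasetA" and "datasetB" struct columns.
    joined.select("datasetA", "datasetB", "distCol").show()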
mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala
@@ -31,15 +31,16 @@ import org.apache.spark.sql.types.StructType
 /**
  * :: Experimental ::
  *
- * Model produced by [[MinHashLSH]], where multiple hash functions are stored. Each hash function is
- * picked from a hash family for a specific set `S` with cardinality equal to `numEntries`:
- * `h_i(x) = ((x \cdot a_i + b_i) \mod prime) \mod numEntries`
+ * Model produced by [[MinHashLSH]], where multiple hash functions are stored. Each hash function
+ * is picked from the following family of hash functions, where a_i and b_i are randomly chosen
+ * integers less than prime:
+ * `h_i(x) = ((x \cdot a_i + b_i) \mod prime)`
  *
  * This hash family is approximately min-wise independent according to the reference.
  *
  * Reference:
- * [[http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.121.8215&rep=rep1&type=pdf Min-wise
- * independent permutations]]
+ * Tom Bohman, Colin Cooper, and Alan Frieze. "Min-wise independent linear permutations."
+ * Electronic Journal of Combinatorics 7 (2000): R26.
  *
  * @param randCoefficients Pairs of random coefficients. Each pair is used by one hash function.
  */
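A tiny worked example of one hash function from this family, using the 1-based index shift from the implementation below and made-up coefficients:

    // Illustrative only: one MinHash function over the index set {0, 2},
    // with invented coefficients a = 3, b = 5 and prime = 13.
    val (a, b, prime) = (3, 5, 13)
    val elems = List(0, 2)
    // ((1 + 0) * 3 + 5) % 13 = 8 and ((1 + 2) * 3 + 5) % 13 = 1, so the min-hash is 1.
    val minHash = elems.map(e => ((1 + e) * a + b) % prime).min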
@@ -55,11 +56,11 @@ class MinHashLSHModel private[ml](
     elems: Vector => {
       require(elems.numNonzeros > 0, "Must have at least 1 non zero entry.")
       val elemsList = elems.toSparse.indices.toList
-      val hashValues = randCoefficients.map({ case (a: Int, b: Int) =>
+      val hashValues = randCoefficients.map { case (a, b) =>
         elemsList.map { elem: Int =>
           ((1 + elem) * a + b) % MinHashLSH.HASH_PRIME
         }.min.toDouble
-      })
+      }
       // TODO: Output vectors of dimension numHashFunctions in SPARK-18450
       hashValues.grouped(1).map(Vectors.dense).toArray
     }
@@ -132,7 +133,7 @@ class MinHashLSH(override val uid: String) extends LSH[MinHashLSHModel] with Has
     require(inputDim <= MinHashLSH.HASH_PRIME,
       s"The input vector dimension $inputDim exceeds the threshold ${MinHashLSH.HASH_PRIME}.")
     val rand = new Random($(seed))
-    val randCoefs: Array[(Int, Int)] = Array.fill(2 * $(numHashTables)) {
+    val randCoefs: Array[(Int, Int)] = Array.fill($(numHashTables)) {
       (1 + rand.nextInt(MinHashLSH.HASH_PRIME - 1), rand.nextInt(MinHashLSH.HASH_PRIME - 1))
     }
     new MinHashLSHModel(uid, randCoefs)
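After this change, each hash table draws exactly one (a, b) pair rather than two. A fitting sketch, where column names and parameter values are illustrative and the standard Param setters are assumed:

    import org.apache.spark.ml.feature.MinHashLSH

    // Hypothetical DataFrame `df` with a Vector column "features".
    val mh = new MinHashLSH()
      .setNumHashTables(5)       // samples 5 (a, b) coefficient pairs
      .setInputCol("features")
      .setOutputCol("hashes")
    val model = mh.fit(df)       // a MinHashLSHModel holding randCoefficients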
mllib/src/test/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSHSuite.scala
@@ -57,8 +57,7 @@ class BucketedRandomProjectionLSHSuite
   test("read/write") {
     def checkModelData(
         model: BucketedRandomProjectionLSHModel,
-        model2: BucketedRandomProjectionLSHModel
-      ): Unit = {
+        model2: BucketedRandomProjectionLSHModel): Unit = {
       model.randUnitVectors.zip(model2.randUnitVectors)
         .foreach(pair => assert(pair._1 === pair._2))
     }
