From 64e17b4db825f85ee19d30ca38cba887633c6900 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Fri, 30 Jun 2017 17:05:17 +0200 Subject: [PATCH 01/18] [SPARK-14516] Adding ClusteringEvaluator with the implementation of Cosine silhouette and squared Euclidean silhouette. --- .../ml/evaluation/ClusteringEvaluator.scala | 171 +++++++++++++ .../ml/evaluation/CosineSilhouette.scala | 119 +++++++++ .../SquaredEuclideanSilhouette.scala | 117 +++++++++ .../ml/linalg/VectorElementWiseSum.scala | 77 ++++++ .../evaluation/ClusteringEvaluatorSuite.scala | 235 ++++++++++++++++++ .../ml/linalg/VectorElementWiseSumSuite.scala | 71 ++++++ 6 files changed, 790 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala create mode 100644 mllib/src/main/scala/org/apache/spark/ml/evaluation/CosineSilhouette.scala create mode 100644 mllib/src/main/scala/org/apache/spark/ml/evaluation/SquaredEuclideanSilhouette.scala create mode 100644 mllib/src/main/scala/org/apache/spark/ml/linalg/VectorElementWiseSum.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/linalg/VectorElementWiseSumSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala new file mode 100644 index 0000000000000..251a6f36410e5 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.annotation.Experimental +import org.apache.spark.ml.linalg.{Vector, VectorUDT} +import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.functions.{avg, col} +import org.apache.spark.sql.types.IntegerType + +/** + * Evaluator for clustering results. + * At the moment, the supported metrics are: + * squaredSilhouette: silhouette measure using the squared Euclidean distance; + * cosineSilhouette: silhouette measure using the cosine distance. + * The implementation follows the proposal explained + * + * in this document. 
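+ *
+ * As a rough sketch of why the squared Euclidean silhouette can be computed without a
+ * pairwise pass over the data (the full derivation is in the referenced proposal): for a
+ * point `x` and a cluster `C`, the average squared Euclidean distance from `x` to the
+ * points of `C` can be rewritten as
+ *
+ *   avg_{j in C} ||x - x_j||^2 =
+ *     ||x||^2 + (sum_{j in C} ||x_j||^2) / |C| - 2 * (x . sum_{j in C} x_j) / |C|
+ *
+ * so only a per-cluster count, the sum of its points and the sum of their squared norms
+ * need to be precomputed; these small aggregates are then broadcast to compute every
+ * silhouette coefficient.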
+ */ +@Experimental +class ClusteringEvaluator (val uid: String) + extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("SquaredEuclideanSilhouette")) + + override def copy(pMap: ParamMap): Evaluator = this.defaultCopy(pMap) + + override def isLargerBetter: Boolean = true + + /** @group setParam */ + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** + * param for metric name in evaluation + * (supports `"squaredSilhouette"` (default), `"cosineSilhouette"`) + * @group param + */ + val metricName: Param[String] = { + val allowedParams = ParamValidators.inArray(Array("squaredSilhouette", "cosineSilhouette")) + new Param( + this, + "metricName", + "metric name in evaluation (squaredSilhouette|cosineSilhouette)", + allowedParams + ) + } + + /** @group getParam */ + def getMetricName: String = $(metricName) + + /** @group setParam */ + def setMetricName(value: String): this.type = set(metricName, value) + + setDefault(metricName -> "squaredSilhouette") + + override def evaluate(dataset: Dataset[_]): Double = { + SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) + SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + + val metric: Double = $(metricName) match { + case "squaredSilhouette" => + computeSquaredSilhouette(dataset) + case "cosineSilhouette" => + computeCosineSilhouette(dataset) + } + metric + } + + private[this] def computeCosineSilhouette(dataset: Dataset[_]): Double = { + CosineSilhouette.registerKryoClasses(dataset.sparkSession.sparkContext) + + val computeCsi = dataset.sparkSession.udf.register("computeCsi", + CosineSilhouette.computeCsi _ ) + val dfWithCsi = dataset.withColumn("csi", computeCsi(col($(featuresCol)))) + + // compute aggregate values for clusters + // needed by the algorithm + val clustersAggregateValues = CosineSilhouette + .computeOmegaAndCount(dfWithCsi, $(predictionCol), $(featuresCol)) + + val clustersMap = clustersAggregateValues.collect().map(row => { + row.getAs[Int]($(predictionCol)) -> + CosineSilhouette.ClusterStats( + row.getAs[Vector]("omega"), + row.getAs[Long]("count") + ) + }).toMap + + val broadcastedClustersMap = dataset.sparkSession.sparkContext.broadcast(clustersMap) + + val computeSilhouette = dataset.sparkSession.udf.register("computeSilhouette", + CosineSilhouette + .computeCosineSilhouetteCoefficient(broadcastedClustersMap, _: Vector, _: Int, _: Vector) + ) + + val cosineSilhouetteDF = dfWithCsi + .withColumn("silhouetteCoefficient", + computeSilhouette(col($(featuresCol)), col($(predictionCol)), col("csi")) + ) + .agg(avg(col("silhouetteCoefficient"))) + + cosineSilhouetteDF.collect()(0).getDouble(0) + } + + private[this] def computeSquaredSilhouette(dataset: Dataset[_]): Double = { + SquaredEuclideanSilhouette.registerKryoClasses(dataset.sparkSession.sparkContext) + + val computeCsi = dataset.sparkSession.udf.register("computeCsi", + SquaredEuclideanSilhouette.computeCsi _ ) + val dfWithCsi = dataset.withColumn("csi", computeCsi(col($(featuresCol)))) + + // compute aggregate values for clusters + // needed by the algorithm + val clustersAggregateValues = SquaredEuclideanSilhouette + .computeYVectorPsiAndCount(dfWithCsi, $(predictionCol), $(featuresCol)) + + val clustersMap = clustersAggregateValues.collect().map(row => { + row.getAs[Int]($(predictionCol)) -> + 
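          // per-cluster aggregates needed by the silhouette formula: y = sum of the feature
          // vectors, psi = sum of their squared norms, count = number of points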
SquaredEuclideanSilhouette.ClusterStats( + row.getAs[Vector]("y"), + row.getAs[Double]("psi"), + row.getAs[Long]("count") + ) + }).toMap + + val broadcastedClustersMap = dataset.sparkSession.sparkContext.broadcast(clustersMap) + + val computeSilhouette = dataset.sparkSession.udf.register("computeSilhouette", + SquaredEuclideanSilhouette + .computeSquaredSilhouetteCoefficient(broadcastedClustersMap, _: Vector, _: Int, _: Double) + ) + + val squaredSilhouetteDF = dfWithCsi + .withColumn("silhouetteCoefficient", + computeSilhouette(col($(featuresCol)), col($(predictionCol)), col("csi")) + ) + .agg(avg(col("silhouetteCoefficient"))) + + squaredSilhouetteDF.collect()(0).getDouble(0) + } + +} + + +object ClusteringEvaluator + extends DefaultParamsReadable[ClusteringEvaluator] { + + override def load(path: String): ClusteringEvaluator = super.load(path) + +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/CosineSilhouette.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/CosineSilhouette.scala new file mode 100644 index 0000000000000..3a01305f85084 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/CosineSilhouette.scala @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{DenseVector, Vector, VectorElementWiseSum} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.{col, count} + +private[evaluation] object CosineSilhouette { + + case class ClusterStats(omega: Vector, count: Long) + + private[this] var kryoRegistrationPerformed: Boolean = false + + /** + * This method registers the class + * [[org.apache.spark.ml.evaluation.CosineSilhouette.ClusterStats]] + * for kryo serialization. + * + * @param sc `SparkContext` to be used + */ + def registerKryoClasses(sc: SparkContext): Unit = { + if (! 
kryoRegistrationPerformed) { + sc.getConf.registerKryoClasses( + Array( + classOf[CosineSilhouette.ClusterStats] + ) + ) + kryoRegistrationPerformed = true + } + } + + def computeOmegaAndCount( + df: DataFrame, + predictionCol: String, + featuresCol: String): DataFrame = { + val omegaUdaf = new VectorElementWiseSum() + df.groupBy(predictionCol) + .agg( + count("*").alias("count"), + omegaUdaf(col("csi")).alias("omega") + ) + } + + def computeCsi(vector: Vector): Vector = { + var sum: Double = 0.0 + vector.foreachActive( (_, i) => { + sum += i * i + }) + val norm = math.sqrt(sum) + new DenseVector(vector.toArray.map( _ / norm )) + } + + def computeCosineSilhouetteCoefficient( + broadcastedClustersMap: Broadcast[Map[Int, ClusterStats]], + vector: Vector, + clusterId: Int, + csi: Vector): Double = { + + def compute(point: Vector, csi: Vector, clusterStats: ClusterStats): Double = { + var omegaMultiplyCsiSum: Double = 0.0 + csi.foreachActive( (i, iCsi) => { + omegaMultiplyCsiSum += clusterStats.omega(i) * iCsi + }) + + 1 - omegaMultiplyCsiSum / clusterStats.count + } + + var minOther = Double.MaxValue + for(c <- broadcastedClustersMap.value.keySet) { + if (c != clusterId) { + val sil = compute(vector, csi, broadcastedClustersMap.value(c)) + if(sil < minOther) { + minOther = sil + } + } + } + val clusterCurrentPoint = broadcastedClustersMap.value(clusterId) + // adjustment for excluding the node itself from + // the computation of the average dissimilarity + val clusterSil = if (clusterCurrentPoint.count == 1) { + 0 + } else { + compute(vector, csi, clusterCurrentPoint) * clusterCurrentPoint.count / + (clusterCurrentPoint.count - 1) + } + + + var silhouetteCoeff = 0.0 + if (clusterSil < minOther) { + silhouetteCoeff = 1 - (clusterSil / minOther) + } else { + if (clusterSil > minOther) { + silhouetteCoeff = (minOther / clusterSil) - 1 + } + } + + silhouetteCoeff + + } + +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/SquaredEuclideanSilhouette.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/SquaredEuclideanSilhouette.scala new file mode 100644 index 0000000000000..dfd9ea6d8d394 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/SquaredEuclideanSilhouette.scala @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkContext +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{Vector, VectorElementWiseSum} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.{col, count, sum} + +private[evaluation] object SquaredEuclideanSilhouette { + + private[this] var kryoRegistrationPerformed: Boolean = false + + /** + * This method registers the class + * [[org.apache.spark.ml.evaluation.SquaredEuclideanSilhouette.ClusterStats]] + * for kryo serialization. + * + * @param sc `SparkContext` to be used + */ + def registerKryoClasses(sc: SparkContext): Unit = { + if (! kryoRegistrationPerformed) { + sc.getConf.registerKryoClasses( + Array( + classOf[SquaredEuclideanSilhouette.ClusterStats] + ) + ) + kryoRegistrationPerformed = true + } + } + + case class ClusterStats(Y: Vector, psi: Double, count: Long) + + def computeCsi(vector: Vector): Double = { + var sumOfSquares = 0.0 + vector.foreachActive((_, v) => { + sumOfSquares += v * v + }) + sumOfSquares + } + + def computeYVectorPsiAndCount( + df: DataFrame, + predictionCol: String, + featuresCol: String): DataFrame = { + val Yudaf = new VectorElementWiseSum() + df.groupBy(predictionCol) + .agg( + count("*").alias("count"), + sum("csi").alias("psi"), + Yudaf(col(featuresCol)).alias("y") + ) + } + + def computeSquaredSilhouetteCoefficient( + broadcastedClustersMap: Broadcast[Map[Int, ClusterStats]], + vector: Vector, + clusterId: Int, + csi: Double): Double = { + + def compute(csi: Double, point: Vector, clusterStats: ClusterStats): Double = { + var YmultiplyPoint = 0.0 + point.foreachActive((idx, v) => { + YmultiplyPoint += clusterStats.Y(idx) * v + }) + + csi + clusterStats.psi / clusterStats.count - 2 * YmultiplyPoint / clusterStats.count + } + + var minOther = Double.MaxValue + for(c <- broadcastedClustersMap.value.keySet) { + if (c != clusterId) { + val sil = compute(csi, vector, broadcastedClustersMap.value(c)) + if(sil < minOther) { + minOther = sil + } + } + } + val clusterCurrentPoint = broadcastedClustersMap.value(clusterId) + // adjustment for excluding the node itself from + // the computation of the average dissimilarity + val clusterSil = if (clusterCurrentPoint.count == 1) { + 0 + } else { + compute(csi, vector, clusterCurrentPoint) * clusterCurrentPoint.count / + (clusterCurrentPoint.count - 1) + } + + var silhouetteCoeff = 0.0 + if (clusterSil < minOther) { + silhouetteCoeff = 1 - (clusterSil / minOther) + } else { + if (clusterSil > minOther) { + silhouetteCoeff = (minOther / clusterSil) - 1 + } + } + silhouetteCoeff + + } + +} \ No newline at end of file diff --git a/mllib/src/main/scala/org/apache/spark/ml/linalg/VectorElementWiseSum.scala b/mllib/src/main/scala/org/apache/spark/ml/linalg/VectorElementWiseSum.scala new file mode 100644 index 0000000000000..dbdebbde6e816 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/linalg/VectorElementWiseSum.scala @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.linalg + +import scala.collection.mutable + +import org.apache.spark.sql.Row +import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} +import org.apache.spark.sql.types.{ArrayType, DataType, DoubleType, StructType} + +/** + * A UDAF implementing a element-wise sum on [[org.apache.spark.mllib.linalg.Vector]] + * objects. + * + */ +private[spark] class VectorElementWiseSum extends UserDefinedAggregateFunction { + + override def inputSchema: StructType = new StructType().add("v", new VectorUDT()) + override def bufferSchema: StructType = new StructType().add("buff", ArrayType(DoubleType)) + // Returned Data Type + override def dataType: DataType = new VectorUDT() + override def deterministic: Boolean = true + + // This function is called whenever key changes + override def initialize(buffer: MutableAggregationBuffer): Unit = { + buffer.update(0, Array.emptyDoubleArray) + } + + // Iterate over each entry of a group + override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { + val aggr = buffer.getAs[mutable.WrappedArray[Double]](0) + val curr = input.getAs[Vector](0) + if (aggr.isEmpty) { + buffer.update(0, curr.toArray) + } else { + curr.foreachActive((idx, v) => { + aggr(idx) += v + }) + buffer.update(0, aggr) + } + } + + // Merge two partial aggregates + override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { + val buff1 = buffer1.getAs[mutable.WrappedArray[Double]](0) + val buff2 = buffer2.getAs[mutable.WrappedArray[Double]](0) + if (buff1.isEmpty) { + buffer1.update(0, buff2) + } else if (buff2.isEmpty) { + buffer1.update(0, buff1) + } else { + for ((x, i) <- buff2.zipWithIndex) { + buff1(i) += x + } + buffer1.update(0, buff1) + } + } + + override def evaluate(buffer: Row): Vector = + Vectors.dense(buffer.getAs[mutable.WrappedArray[Double]](0).toArray) + +} \ No newline at end of file diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala new file mode 100644 index 0000000000000..8cdd85881e809 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.linalg.{Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.ml.util.DefaultReadWriteTest +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{IntegerType, StructField, StructType} + + +class ClusteringEvaluatorSuite + extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + + import testImplicits._ + + val dataset = Seq(Row(Vectors.dense(5.1, 3.5, 1.4, 0.2), 0), + Row(Vectors.dense(4.9, 3.0, 1.4, 0.2), 0), + Row(Vectors.dense(4.7, 3.2, 1.3, 0.2), 0), + Row(Vectors.dense(4.6, 3.1, 1.5, 0.2), 0), + Row(Vectors.dense(5.0, 3.6, 1.4, 0.2), 0), + Row(Vectors.dense(5.4, 3.9, 1.7, 0.4), 0), + Row(Vectors.dense(4.6, 3.4, 1.4, 0.3), 0), + Row(Vectors.dense(5.0, 3.4, 1.5, 0.2), 0), + Row(Vectors.dense(4.4, 2.9, 1.4, 0.2), 0), + Row(Vectors.dense(4.9, 3.1, 1.5, 0.1), 0), + Row(Vectors.dense(5.4, 3.7, 1.5, 0.2), 0), + Row(Vectors.dense(4.8, 3.4, 1.6, 0.2), 0), + Row(Vectors.dense(4.8, 3.0, 1.4, 0.1), 0), + Row(Vectors.dense(4.3, 3.0, 1.1, 0.1), 0), + Row(Vectors.dense(5.8, 4.0, 1.2, 0.2), 0), + Row(Vectors.dense(5.7, 4.4, 1.5, 0.4), 0), + Row(Vectors.dense(5.4, 3.9, 1.3, 0.4), 0), + Row(Vectors.dense(5.1, 3.5, 1.4, 0.3), 0), + Row(Vectors.dense(5.7, 3.8, 1.7, 0.3), 0), + Row(Vectors.dense(5.1, 3.8, 1.5, 0.3), 0), + Row(Vectors.dense(5.4, 3.4, 1.7, 0.2), 0), + Row(Vectors.dense(5.1, 3.7, 1.5, 0.4), 0), + Row(Vectors.dense(4.6, 3.6, 1.0, 0.2), 0), + Row(Vectors.dense(5.1, 3.3, 1.7, 0.5), 0), + Row(Vectors.dense(4.8, 3.4, 1.9, 0.2), 0), + Row(Vectors.dense(5.0, 3.0, 1.6, 0.2), 0), + Row(Vectors.dense(5.0, 3.4, 1.6, 0.4), 0), + Row(Vectors.dense(5.2, 3.5, 1.5, 0.2), 0), + Row(Vectors.dense(5.2, 3.4, 1.4, 0.2), 0), + Row(Vectors.dense(4.7, 3.2, 1.6, 0.2), 0), + Row(Vectors.dense(4.8, 3.1, 1.6, 0.2), 0), + Row(Vectors.dense(5.4, 3.4, 1.5, 0.4), 0), + Row(Vectors.dense(5.2, 4.1, 1.5, 0.1), 0), + Row(Vectors.dense(5.5, 4.2, 1.4, 0.2), 0), + Row(Vectors.dense(4.9, 3.1, 1.5, 0.1), 0), + Row(Vectors.dense(5.0, 3.2, 1.2, 0.2), 0), + Row(Vectors.dense(5.5, 3.5, 1.3, 0.2), 0), + Row(Vectors.dense(4.9, 3.1, 1.5, 0.1), 0), + Row(Vectors.dense(4.4, 3.0, 1.3, 0.2), 0), + Row(Vectors.dense(5.1, 3.4, 1.5, 0.2), 0), + Row(Vectors.dense(5.0, 3.5, 1.3, 0.3), 0), + Row(Vectors.dense(4.5, 2.3, 1.3, 0.3), 0), + Row(Vectors.dense(4.4, 3.2, 1.3, 0.2), 0), + Row(Vectors.dense(5.0, 3.5, 1.6, 0.6), 0), + Row(Vectors.dense(5.1, 3.8, 1.9, 0.4), 0), + Row(Vectors.dense(4.8, 3.0, 1.4, 0.3), 0), + Row(Vectors.dense(5.1, 3.8, 1.6, 0.2), 0), + Row(Vectors.dense(4.6, 3.2, 1.4, 0.2), 0), + Row(Vectors.dense(5.3, 3.7, 1.5, 0.2), 0), + Row(Vectors.dense(5.0, 3.3, 1.4, 0.2), 0), + Row(Vectors.dense(7.0, 3.2, 4.7, 1.4), 1), + Row(Vectors.dense(6.4, 3.2, 4.5, 1.5), 1), + Row(Vectors.dense(6.9, 3.1, 4.9, 1.5), 1), + Row(Vectors.dense(5.5, 2.3, 4.0, 1.3), 1), + Row(Vectors.dense(6.5, 2.8, 4.6, 1.5), 1), + Row(Vectors.dense(5.7, 2.8, 4.5, 1.3), 1), + Row(Vectors.dense(6.3, 3.3, 4.7, 1.6), 1), + Row(Vectors.dense(4.9, 2.4, 3.3, 1.0), 1), + Row(Vectors.dense(6.6, 2.9, 4.6, 1.3), 1), + Row(Vectors.dense(5.2, 2.7, 3.9, 1.4), 1), + Row(Vectors.dense(5.0, 2.0, 3.5, 1.0), 1), + Row(Vectors.dense(5.9, 3.0, 4.2, 1.5), 1), + Row(Vectors.dense(6.0, 2.2, 4.0, 1.0), 1), + Row(Vectors.dense(6.1, 2.9, 4.7, 1.4), 1), + Row(Vectors.dense(5.6, 2.9, 3.6, 1.3), 1), + Row(Vectors.dense(6.7, 3.1, 4.4, 1.4), 1), + 
Row(Vectors.dense(5.6, 3.0, 4.5, 1.5), 1), + Row(Vectors.dense(5.8, 2.7, 4.1, 1.0), 1), + Row(Vectors.dense(6.2, 2.2, 4.5, 1.5), 1), + Row(Vectors.dense(5.6, 2.5, 3.9, 1.1), 1), + Row(Vectors.dense(5.9, 3.2, 4.8, 1.8), 1), + Row(Vectors.dense(6.1, 2.8, 4.0, 1.3), 1), + Row(Vectors.dense(6.3, 2.5, 4.9, 1.5), 1), + Row(Vectors.dense(6.1, 2.8, 4.7, 1.2), 1), + Row(Vectors.dense(6.4, 2.9, 4.3, 1.3), 1), + Row(Vectors.dense(6.6, 3.0, 4.4, 1.4), 1), + Row(Vectors.dense(6.8, 2.8, 4.8, 1.4), 1), + Row(Vectors.dense(6.7, 3.0, 5.0, 1.7), 1), + Row(Vectors.dense(6.0, 2.9, 4.5, 1.5), 1), + Row(Vectors.dense(5.7, 2.6, 3.5, 1.0), 1), + Row(Vectors.dense(5.5, 2.4, 3.8, 1.1), 1), + Row(Vectors.dense(5.5, 2.4, 3.7, 1.0), 1), + Row(Vectors.dense(5.8, 2.7, 3.9, 1.2), 1), + Row(Vectors.dense(6.0, 2.7, 5.1, 1.6), 1), + Row(Vectors.dense(5.4, 3.0, 4.5, 1.5), 1), + Row(Vectors.dense(6.0, 3.4, 4.5, 1.6), 1), + Row(Vectors.dense(6.7, 3.1, 4.7, 1.5), 1), + Row(Vectors.dense(6.3, 2.3, 4.4, 1.3), 1), + Row(Vectors.dense(5.6, 3.0, 4.1, 1.3), 1), + Row(Vectors.dense(5.5, 2.5, 4.0, 1.3), 1), + Row(Vectors.dense(5.5, 2.6, 4.4, 1.2), 1), + Row(Vectors.dense(6.1, 3.0, 4.6, 1.4), 1), + Row(Vectors.dense(5.8, 2.6, 4.0, 1.2), 1), + Row(Vectors.dense(5.0, 2.3, 3.3, 1.0), 1), + Row(Vectors.dense(5.6, 2.7, 4.2, 1.3), 1), + Row(Vectors.dense(5.7, 3.0, 4.2, 1.2), 1), + Row(Vectors.dense(5.7, 2.9, 4.2, 1.3), 1), + Row(Vectors.dense(6.2, 2.9, 4.3, 1.3), 1), + Row(Vectors.dense(5.1, 2.5, 3.0, 1.1), 1), + Row(Vectors.dense(5.7, 2.8, 4.1, 1.3), 1), + Row(Vectors.dense(6.3, 3.3, 6.0, 2.5), 2), + Row(Vectors.dense(5.8, 2.7, 5.1, 1.9), 2), + Row(Vectors.dense(7.1, 3.0, 5.9, 2.1), 2), + Row(Vectors.dense(6.3, 2.9, 5.6, 1.8), 2), + Row(Vectors.dense(6.5, 3.0, 5.8, 2.2), 2), + Row(Vectors.dense(7.6, 3.0, 6.6, 2.1), 2), + Row(Vectors.dense(4.9, 2.5, 4.5, 1.7), 2), + Row(Vectors.dense(7.3, 2.9, 6.3, 1.8), 2), + Row(Vectors.dense(6.7, 2.5, 5.8, 1.8), 2), + Row(Vectors.dense(7.2, 3.6, 6.1, 2.5), 2), + Row(Vectors.dense(6.5, 3.2, 5.1, 2.0), 2), + Row(Vectors.dense(6.4, 2.7, 5.3, 1.9), 2), + Row(Vectors.dense(6.8, 3.0, 5.5, 2.1), 2), + Row(Vectors.dense(5.7, 2.5, 5.0, 2.0), 2), + Row(Vectors.dense(5.8, 2.8, 5.1, 2.4), 2), + Row(Vectors.dense(6.4, 3.2, 5.3, 2.3), 2), + Row(Vectors.dense(6.5, 3.0, 5.5, 1.8), 2), + Row(Vectors.dense(7.7, 3.8, 6.7, 2.2), 2), + Row(Vectors.dense(7.7, 2.6, 6.9, 2.3), 2), + Row(Vectors.dense(6.0, 2.2, 5.0, 1.5), 2), + Row(Vectors.dense(6.9, 3.2, 5.7, 2.3), 2), + Row(Vectors.dense(5.6, 2.8, 4.9, 2.0), 2), + Row(Vectors.dense(7.7, 2.8, 6.7, 2.0), 2), + Row(Vectors.dense(6.3, 2.7, 4.9, 1.8), 2), + Row(Vectors.dense(6.7, 3.3, 5.7, 2.1), 2), + Row(Vectors.dense(7.2, 3.2, 6.0, 1.8), 2), + Row(Vectors.dense(6.2, 2.8, 4.8, 1.8), 2), + Row(Vectors.dense(6.1, 3.0, 4.9, 1.8), 2), + Row(Vectors.dense(6.4, 2.8, 5.6, 2.1), 2), + Row(Vectors.dense(7.2, 3.0, 5.8, 1.6), 2), + Row(Vectors.dense(7.4, 2.8, 6.1, 1.9), 2), + Row(Vectors.dense(7.9, 3.8, 6.4, 2.0), 2), + Row(Vectors.dense(6.4, 2.8, 5.6, 2.2), 2), + Row(Vectors.dense(6.3, 2.8, 5.1, 1.5), 2), + Row(Vectors.dense(6.1, 2.6, 5.6, 1.4), 2), + Row(Vectors.dense(7.7, 3.0, 6.1, 2.3), 2), + Row(Vectors.dense(6.3, 3.4, 5.6, 2.4), 2), + Row(Vectors.dense(6.4, 3.1, 5.5, 1.8), 2), + Row(Vectors.dense(6.0, 3.0, 4.8, 1.8), 2), + Row(Vectors.dense(6.9, 3.1, 5.4, 2.1), 2), + Row(Vectors.dense(6.7, 3.1, 5.6, 2.4), 2), + Row(Vectors.dense(6.9, 3.1, 5.1, 2.3), 2), + Row(Vectors.dense(5.8, 2.7, 5.1, 1.9), 2), + Row(Vectors.dense(6.8, 3.2, 5.9, 2.3), 2), + Row(Vectors.dense(6.7, 3.3, 5.7, 2.5), 2), + 
Row(Vectors.dense(6.7, 3.0, 5.2, 2.3), 2), + Row(Vectors.dense(6.3, 2.5, 5.0, 1.9), 2), + Row(Vectors.dense(6.5, 3.0, 5.2, 2.0), 2), + Row(Vectors.dense(6.2, 3.4, 5.4, 2.3), 2), + Row(Vectors.dense(5.9, 3.0, 5.1, 1.8), 2)) + + val dsStruct = StructType( Seq( + StructField("point", new VectorUDT, nullable = false), + StructField("label", IntegerType, nullable = false) + )) + + test("params") { + ParamsSuite.checkParams(new RegressionEvaluator) + } + + test("read/write") { + val evaluator = new ClusteringEvaluator() + .setPredictionCol("myPrediction") + .setFeaturesCol("myLabel") + .setMetricName("cosineSilhouette") + testDefaultReadWrite(evaluator) + } + + test("squared euclidean Silhouette") { + val result = BigDecimal(0.6564679231) + val dsRDD = spark.sparkContext.parallelize(dataset) + val df = spark.createDataFrame(dsRDD, dsStruct) + + val evaluator = new ClusteringEvaluator() + .setFeaturesCol("point") + .setPredictionCol("label") + .setMetricName("squaredSilhouette") + val actual = BigDecimal(evaluator.evaluate(df)) + .setScale(10, BigDecimal.RoundingMode.HALF_UP) + assertResult(result)(actual) + + } + + test("cosine Silhouette") { + val result = BigDecimal(0.7222369298) + val dsRDD = spark.sparkContext.parallelize(dataset) + val df = spark.createDataFrame(dsRDD, dsStruct) + + val evaluator = new ClusteringEvaluator() + .setFeaturesCol("point") + .setPredictionCol("label") + .setMetricName("cosineSilhouette") + val actual = BigDecimal(evaluator.evaluate(df)) + .setScale(10, BigDecimal.RoundingMode.HALF_UP) + assertResult(result)(actual) + + } + + + + +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/linalg/VectorElementWiseSumSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/linalg/VectorElementWiseSumSuite.scala new file mode 100644 index 0000000000000..75f93964cdd05 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/linalg/VectorElementWiseSumSuite.scala @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.linalg + +import org.apache.spark.SparkFunSuite +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.{StructField, StructType} + + +class VectorElementWiseSumSuite extends SparkFunSuite with MLlibTestSparkContext { + + val vStruct = StructType(Seq(StructField("v", new VectorUDT, false))) + + test("Sum with DenseVectors") { + val dv1 = Vectors.dense(0.5, 1.2) + val dv2 = Vectors.dense(1.0, 2.0) + val result = Vectors.dense(1.5, 3.2) + val vectorsRDD = spark.sparkContext.parallelize(Seq(Row(dv1), Row(dv2))) + + val df = spark.createDataFrame(vectorsRDD, vStruct) + val vSum = new VectorElementWiseSum() + val actualRes = df.agg(vSum(df("v"))).first.getAs[Vector](0) + + assert(result.equals(actualRes)) + + } + + test("Sum with SparseVectors") { + val sv1 = Vectors.sparse(3, Array(1, 2), Array(3.6, 1.0)) + val sv2 = Vectors.sparse(3, Array(1), Array(2.0)) + val result = Vectors.sparse(3, Array(1, 2), Array(5.6, 1.0)) + val vectorsRDD = spark.sparkContext.parallelize(Seq(Row(sv1), Row(sv2))) + + val df = spark.createDataFrame(vectorsRDD, vStruct) + val vSum = new VectorElementWiseSum() + val actualRes = df.agg(vSum(df("v"))).first.getAs[Vector](0) + + assert(result.equals(actualRes)) + + } + + test("Sum with negative numbers") { + val dv1 = Vectors.dense(-0.5, 1.2) + val dv2 = Vectors.dense(1.0, -2.0) + val result = Vectors.dense(0.5, -0.8) + val vectorsRDD = spark.sparkContext.parallelize(Seq(Row(dv1), Row(dv2))) + + val df = spark.createDataFrame(vectorsRDD, vStruct) + val vSum = new VectorElementWiseSum() + val actualRes = df.agg(vSum(df("v"))).first.getAs[Vector](0) + + assert(result.equals(actualRes)) + } + +} From cfcb106788e5ea2b905767ff23825c4e5a9bc1e9 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Thu, 3 Aug 2017 13:57:46 +0200 Subject: [PATCH 02/18] Added comments with Python code to reproduce the results of the tests in scikit-learn --- .../ml/evaluation/ClusteringEvaluatorSuite.scala | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala index 8cdd85881e809..c9e70554d24e1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala @@ -199,6 +199,14 @@ class ClusteringEvaluatorSuite testDefaultReadWrite(evaluator) } + /* + Use the following python code to load the data and evaluate it using scikit-learn package. + + from sklearn import datasets + from sklearn.metrics import silhouette_score + iris = datasets.load_iris() + round(silhouette_score(iris.data, iris.target, metric='sqeuclidean'), 10) + */ test("squared euclidean Silhouette") { val result = BigDecimal(0.6564679231) val dsRDD = spark.sparkContext.parallelize(dataset) @@ -214,6 +222,14 @@ class ClusteringEvaluatorSuite } + /* + Use the following python code to load the data and evaluate it using scikit-learn package. 
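+    (scikit-learn is assumed to be installed; iris.target plays the role of the predicted
+    cluster labels, matching the label column of the dataset defined above)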
+ + from sklearn import datasets + from sklearn.metrics import silhouette_score + iris = datasets.load_iris() + round(silhouette_score(iris.data, iris.target, metric='cosine'), 10) + */ test("cosine Silhouette") { val result = BigDecimal(0.7222369298) val dsRDD = spark.sparkContext.parallelize(dataset) From 923418a7139e9cd038882499e7ac0aa544a14858 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Sat, 5 Aug 2017 09:23:45 +0200 Subject: [PATCH 03/18] Fix scalastyle --- .../ml/evaluation/ClusteringEvaluator.scala | 8 ++++---- .../ml/evaluation/CosineSilhouette.scala | 12 +++++------ .../SquaredEuclideanSilhouette.scala | 20 +++++++++---------- .../ml/linalg/VectorElementWiseSum.scala | 2 +- 4 files changed, 20 insertions(+), 22 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala index 251a6f36410e5..0634c79d3bd6b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala @@ -52,10 +52,10 @@ class ClusteringEvaluator (val uid: String) def setFeaturesCol(value: String): this.type = set(featuresCol, value) /** - * param for metric name in evaluation - * (supports `"squaredSilhouette"` (default), `"cosineSilhouette"`) - * @group param - */ + * param for metric name in evaluation + * (supports `"squaredSilhouette"` (default), `"cosineSilhouette"`) + * @group param + */ val metricName: Param[String] = { val allowedParams = ParamValidators.inArray(Array("squaredSilhouette", "cosineSilhouette")) new Param( diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/CosineSilhouette.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/CosineSilhouette.scala index 3a01305f85084..b60880fe337fc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/CosineSilhouette.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/CosineSilhouette.scala @@ -30,12 +30,12 @@ private[evaluation] object CosineSilhouette { private[this] var kryoRegistrationPerformed: Boolean = false /** - * This method registers the class - * [[org.apache.spark.ml.evaluation.CosineSilhouette.ClusterStats]] - * for kryo serialization. - * - * @param sc `SparkContext` to be used - */ + * This method registers the class + * [[org.apache.spark.ml.evaluation.CosineSilhouette.ClusterStats]] + * for kryo serialization. + * + * @param sc `SparkContext` to be used + */ def registerKryoClasses(sc: SparkContext): Unit = { if (! kryoRegistrationPerformed) { sc.getConf.registerKryoClasses( diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/SquaredEuclideanSilhouette.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/SquaredEuclideanSilhouette.scala index dfd9ea6d8d394..9e43dceda5e80 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/SquaredEuclideanSilhouette.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/SquaredEuclideanSilhouette.scala @@ -28,12 +28,12 @@ private[evaluation] object SquaredEuclideanSilhouette { private[this] var kryoRegistrationPerformed: Boolean = false /** - * This method registers the class - * [[org.apache.spark.ml.evaluation.SquaredEuclideanSilhouette.ClusterStats]] - * for kryo serialization. - * - * @param sc `SparkContext` to be used - */ + * This method registers the class + * [[org.apache.spark.ml.evaluation.SquaredEuclideanSilhouette.ClusterStats]] + * for kryo serialization. 
+ * + * @param sc `SparkContext` to be used + */ def registerKryoClasses(sc: SparkContext): Unit = { if (! kryoRegistrationPerformed) { sc.getConf.registerKryoClasses( @@ -46,7 +46,7 @@ private[evaluation] object SquaredEuclideanSilhouette { } case class ClusterStats(Y: Vector, psi: Double, count: Long) - + def computeCsi(vector: Vector): Double = { var sumOfSquares = 0.0 vector.foreachActive((_, v) => { @@ -54,7 +54,7 @@ private[evaluation] object SquaredEuclideanSilhouette { }) sumOfSquares } - + def computeYVectorPsiAndCount( df: DataFrame, predictionCol: String, @@ -111,7 +111,5 @@ private[evaluation] object SquaredEuclideanSilhouette { } } silhouetteCoeff - } - -} \ No newline at end of file +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/linalg/VectorElementWiseSum.scala b/mllib/src/main/scala/org/apache/spark/ml/linalg/VectorElementWiseSum.scala index dbdebbde6e816..d130787299087 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/linalg/VectorElementWiseSum.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/linalg/VectorElementWiseSum.scala @@ -74,4 +74,4 @@ private[spark] class VectorElementWiseSum extends UserDefinedAggregateFunction override def evaluate(buffer: Row): Vector = Vectors.dense(buffer.getAs[mutable.WrappedArray[Double]](0).toArray) -} \ No newline at end of file +} From a1701eff4b9a1c6b217c8cca0b6e9c431834985d Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Tue, 8 Aug 2017 18:03:59 +0200 Subject: [PATCH 04/18] Remove cosineSilhouette implementation and move squaredSilhouette to ClusteringEvaluator --- .../ml/evaluation/ClusteringEvaluator.scala | 145 ++++++++++++------ .../ml/evaluation/CosineSilhouette.scala | 119 -------------- .../SquaredEuclideanSilhouette.scala | 115 -------------- .../evaluation/ClusteringEvaluatorSuite.scala | 28 +--- 4 files changed, 102 insertions(+), 305 deletions(-) delete mode 100644 mllib/src/main/scala/org/apache/spark/ml/evaluation/CosineSilhouette.scala delete mode 100644 mllib/src/main/scala/org/apache/spark/ml/evaluation/SquaredEuclideanSilhouette.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala index 0634c79d3bd6b..4c507dd2ab4c4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala @@ -17,13 +17,15 @@ package org.apache.spark.ml.evaluation +import org.apache.spark.SparkContext import org.apache.spark.annotation.Experimental -import org.apache.spark.ml.linalg.{Vector, VectorUDT} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.linalg.{Vector, VectorElementWiseSum, VectorUDT} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} -import org.apache.spark.sql.Dataset -import org.apache.spark.sql.functions.{avg, col} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.{avg, col, count, sum} import org.apache.spark.sql.types.IntegerType /** @@ -53,15 +55,15 @@ class ClusteringEvaluator (val uid: String) /** * param for metric name in evaluation - * (supports `"squaredSilhouette"` (default), `"cosineSilhouette"`) + * (supports `"squaredSilhouette"` (default)) * @group param */ val metricName: Param[String] = { - val 
allowedParams = ParamValidators.inArray(Array("squaredSilhouette", "cosineSilhouette")) + val allowedParams = ParamValidators.inArray(Array("squaredSilhouette")) new Param( this, "metricName", - "metric name in evaluation (squaredSilhouette|cosineSilhouette)", + "metric name in evaluation (squaredSilhouette)", allowedParams ) } @@ -81,48 +83,10 @@ class ClusteringEvaluator (val uid: String) val metric: Double = $(metricName) match { case "squaredSilhouette" => computeSquaredSilhouette(dataset) - case "cosineSilhouette" => - computeCosineSilhouette(dataset) } metric } - private[this] def computeCosineSilhouette(dataset: Dataset[_]): Double = { - CosineSilhouette.registerKryoClasses(dataset.sparkSession.sparkContext) - - val computeCsi = dataset.sparkSession.udf.register("computeCsi", - CosineSilhouette.computeCsi _ ) - val dfWithCsi = dataset.withColumn("csi", computeCsi(col($(featuresCol)))) - - // compute aggregate values for clusters - // needed by the algorithm - val clustersAggregateValues = CosineSilhouette - .computeOmegaAndCount(dfWithCsi, $(predictionCol), $(featuresCol)) - - val clustersMap = clustersAggregateValues.collect().map(row => { - row.getAs[Int]($(predictionCol)) -> - CosineSilhouette.ClusterStats( - row.getAs[Vector]("omega"), - row.getAs[Long]("count") - ) - }).toMap - - val broadcastedClustersMap = dataset.sparkSession.sparkContext.broadcast(clustersMap) - - val computeSilhouette = dataset.sparkSession.udf.register("computeSilhouette", - CosineSilhouette - .computeCosineSilhouetteCoefficient(broadcastedClustersMap, _: Vector, _: Int, _: Vector) - ) - - val cosineSilhouetteDF = dfWithCsi - .withColumn("silhouetteCoefficient", - computeSilhouette(col($(featuresCol)), col($(predictionCol)), col("csi")) - ) - .agg(avg(col("silhouetteCoefficient"))) - - cosineSilhouetteDF.collect()(0).getDouble(0) - } - private[this] def computeSquaredSilhouette(dataset: Dataset[_]): Double = { SquaredEuclideanSilhouette.registerKryoClasses(dataset.sparkSession.sparkContext) @@ -169,3 +133,96 @@ object ClusteringEvaluator override def load(path: String): ClusteringEvaluator = super.load(path) } + +private[evaluation] object SquaredEuclideanSilhouette { + + private[this] var kryoRegistrationPerformed: Boolean = false + + /** + * This method registers the class + * [[org.apache.spark.ml.evaluation.SquaredEuclideanSilhouette.ClusterStats]] + * for kryo serialization. + * + * @param sc `SparkContext` to be used + */ + def registerKryoClasses(sc: SparkContext): Unit = { + if (! 
kryoRegistrationPerformed) { + sc.getConf.registerKryoClasses( + Array( + classOf[SquaredEuclideanSilhouette.ClusterStats] + ) + ) + kryoRegistrationPerformed = true + } + } + + case class ClusterStats(Y: Vector, psi: Double, count: Long) + + def computeCsi(vector: Vector): Double = { + var sumOfSquares = 0.0 + vector.foreachActive((_, v) => { + sumOfSquares += v * v + }) + sumOfSquares + } + + def computeYVectorPsiAndCount( + df: DataFrame, + predictionCol: String, + featuresCol: String): DataFrame = { + val Yudaf = new VectorElementWiseSum() + df.groupBy(predictionCol) + .agg( + count("*").alias("count"), + sum("csi").alias("psi"), + Yudaf(col(featuresCol)).alias("y") + ) + } + + def computeSquaredSilhouetteCoefficient( + broadcastedClustersMap: Broadcast[Map[Int, ClusterStats]], + vector: Vector, + clusterId: Int, + csi: Double): Double = { + + def compute(csi: Double, point: Vector, clusterStats: ClusterStats): Double = { + var YmultiplyPoint = 0.0 + point.foreachActive((idx, v) => { + YmultiplyPoint += clusterStats.Y(idx) * v + }) + + csi + clusterStats.psi / clusterStats.count - 2 * YmultiplyPoint / clusterStats.count + } + + var minOther = Double.MaxValue + for(c <- broadcastedClustersMap.value.keySet) { + if (c != clusterId) { + val sil = compute(csi, vector, broadcastedClustersMap.value(c)) + if(sil < minOther) { + minOther = sil + } + } + } + val clusterCurrentPoint = broadcastedClustersMap.value(clusterId) + // adjustment for excluding the node itself from + // the computation of the average dissimilarity + val clusterSil = if (clusterCurrentPoint.count == 1) { + 0 + } else { + compute(csi, vector, clusterCurrentPoint) * clusterCurrentPoint.count / + (clusterCurrentPoint.count - 1) + } + + var silhouetteCoeff = 0.0 + if (clusterSil < minOther) { + silhouetteCoeff = 1 - (clusterSil / minOther) + } else { + if (clusterSil > minOther) { + silhouetteCoeff = (minOther / clusterSil) - 1 + } + } + silhouetteCoeff + + } + +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/CosineSilhouette.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/CosineSilhouette.scala deleted file mode 100644 index b60880fe337fc..0000000000000 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/CosineSilhouette.scala +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.ml.evaluation - -import org.apache.spark.SparkContext -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.ml.linalg.{DenseVector, Vector, VectorElementWiseSum} -import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.functions.{col, count} - -private[evaluation] object CosineSilhouette { - - case class ClusterStats(omega: Vector, count: Long) - - private[this] var kryoRegistrationPerformed: Boolean = false - - /** - * This method registers the class - * [[org.apache.spark.ml.evaluation.CosineSilhouette.ClusterStats]] - * for kryo serialization. - * - * @param sc `SparkContext` to be used - */ - def registerKryoClasses(sc: SparkContext): Unit = { - if (! kryoRegistrationPerformed) { - sc.getConf.registerKryoClasses( - Array( - classOf[CosineSilhouette.ClusterStats] - ) - ) - kryoRegistrationPerformed = true - } - } - - def computeOmegaAndCount( - df: DataFrame, - predictionCol: String, - featuresCol: String): DataFrame = { - val omegaUdaf = new VectorElementWiseSum() - df.groupBy(predictionCol) - .agg( - count("*").alias("count"), - omegaUdaf(col("csi")).alias("omega") - ) - } - - def computeCsi(vector: Vector): Vector = { - var sum: Double = 0.0 - vector.foreachActive( (_, i) => { - sum += i * i - }) - val norm = math.sqrt(sum) - new DenseVector(vector.toArray.map( _ / norm )) - } - - def computeCosineSilhouetteCoefficient( - broadcastedClustersMap: Broadcast[Map[Int, ClusterStats]], - vector: Vector, - clusterId: Int, - csi: Vector): Double = { - - def compute(point: Vector, csi: Vector, clusterStats: ClusterStats): Double = { - var omegaMultiplyCsiSum: Double = 0.0 - csi.foreachActive( (i, iCsi) => { - omegaMultiplyCsiSum += clusterStats.omega(i) * iCsi - }) - - 1 - omegaMultiplyCsiSum / clusterStats.count - } - - var minOther = Double.MaxValue - for(c <- broadcastedClustersMap.value.keySet) { - if (c != clusterId) { - val sil = compute(vector, csi, broadcastedClustersMap.value(c)) - if(sil < minOther) { - minOther = sil - } - } - } - val clusterCurrentPoint = broadcastedClustersMap.value(clusterId) - // adjustment for excluding the node itself from - // the computation of the average dissimilarity - val clusterSil = if (clusterCurrentPoint.count == 1) { - 0 - } else { - compute(vector, csi, clusterCurrentPoint) * clusterCurrentPoint.count / - (clusterCurrentPoint.count - 1) - } - - - var silhouetteCoeff = 0.0 - if (clusterSil < minOther) { - silhouetteCoeff = 1 - (clusterSil / minOther) - } else { - if (clusterSil > minOther) { - silhouetteCoeff = (minOther / clusterSil) - 1 - } - } - - silhouetteCoeff - - } - -} diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/SquaredEuclideanSilhouette.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/SquaredEuclideanSilhouette.scala deleted file mode 100644 index 9e43dceda5e80..0000000000000 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/SquaredEuclideanSilhouette.scala +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.ml.evaluation - -import org.apache.spark.SparkContext -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.ml.linalg.{Vector, VectorElementWiseSum} -import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.functions.{col, count, sum} - -private[evaluation] object SquaredEuclideanSilhouette { - - private[this] var kryoRegistrationPerformed: Boolean = false - - /** - * This method registers the class - * [[org.apache.spark.ml.evaluation.SquaredEuclideanSilhouette.ClusterStats]] - * for kryo serialization. - * - * @param sc `SparkContext` to be used - */ - def registerKryoClasses(sc: SparkContext): Unit = { - if (! kryoRegistrationPerformed) { - sc.getConf.registerKryoClasses( - Array( - classOf[SquaredEuclideanSilhouette.ClusterStats] - ) - ) - kryoRegistrationPerformed = true - } - } - - case class ClusterStats(Y: Vector, psi: Double, count: Long) - - def computeCsi(vector: Vector): Double = { - var sumOfSquares = 0.0 - vector.foreachActive((_, v) => { - sumOfSquares += v * v - }) - sumOfSquares - } - - def computeYVectorPsiAndCount( - df: DataFrame, - predictionCol: String, - featuresCol: String): DataFrame = { - val Yudaf = new VectorElementWiseSum() - df.groupBy(predictionCol) - .agg( - count("*").alias("count"), - sum("csi").alias("psi"), - Yudaf(col(featuresCol)).alias("y") - ) - } - - def computeSquaredSilhouetteCoefficient( - broadcastedClustersMap: Broadcast[Map[Int, ClusterStats]], - vector: Vector, - clusterId: Int, - csi: Double): Double = { - - def compute(csi: Double, point: Vector, clusterStats: ClusterStats): Double = { - var YmultiplyPoint = 0.0 - point.foreachActive((idx, v) => { - YmultiplyPoint += clusterStats.Y(idx) * v - }) - - csi + clusterStats.psi / clusterStats.count - 2 * YmultiplyPoint / clusterStats.count - } - - var minOther = Double.MaxValue - for(c <- broadcastedClustersMap.value.keySet) { - if (c != clusterId) { - val sil = compute(csi, vector, broadcastedClustersMap.value(c)) - if(sil < minOther) { - minOther = sil - } - } - } - val clusterCurrentPoint = broadcastedClustersMap.value(clusterId) - // adjustment for excluding the node itself from - // the computation of the average dissimilarity - val clusterSil = if (clusterCurrentPoint.count == 1) { - 0 - } else { - compute(csi, vector, clusterCurrentPoint) * clusterCurrentPoint.count / - (clusterCurrentPoint.count - 1) - } - - var silhouetteCoeff = 0.0 - if (clusterSil < minOther) { - silhouetteCoeff = 1 - (clusterSil / minOther) - } else { - if (clusterSil > minOther) { - silhouetteCoeff = (minOther / clusterSil) - 1 - } - } - silhouetteCoeff - } -} diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala index c9e70554d24e1..7360dace96412 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala @@ -195,7 +195,7 @@ class ClusteringEvaluatorSuite val evaluator = new ClusteringEvaluator() 
.setPredictionCol("myPrediction") .setFeaturesCol("myLabel") - .setMetricName("cosineSilhouette") + .setMetricName("squaredSilhouette") testDefaultReadWrite(evaluator) } @@ -222,30 +222,4 @@ class ClusteringEvaluatorSuite } - /* - Use the following python code to load the data and evaluate it using scikit-learn package. - - from sklearn import datasets - from sklearn.metrics import silhouette_score - iris = datasets.load_iris() - round(silhouette_score(iris.data, iris.target, metric='cosine'), 10) - */ - test("cosine Silhouette") { - val result = BigDecimal(0.7222369298) - val dsRDD = spark.sparkContext.parallelize(dataset) - val df = spark.createDataFrame(dsRDD, dsStruct) - - val evaluator = new ClusteringEvaluator() - .setFeaturesCol("point") - .setPredictionCol("label") - .setMetricName("cosineSilhouette") - val actual = BigDecimal(evaluator.evaluate(df)) - .setScale(10, BigDecimal.RoundingMode.HALF_UP) - assertResult(result)(actual) - - } - - - - } From c01ca6e2115d2f912351907993760279879a46e4 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Tue, 8 Aug 2017 18:04:50 +0200 Subject: [PATCH 05/18] fix typo --- .../org/apache/spark/ml/evaluation/ClusteringEvaluator.scala | 2 +- .../apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala index 4c507dd2ab4c4..0d52ba28ae9e1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala @@ -43,7 +43,7 @@ class ClusteringEvaluator (val uid: String) def this() = this(Identifiable.randomUID("SquaredEuclideanSilhouette")) - override def copy(pMap: ParamMap): Evaluator = this.defaultCopy(pMap) + override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) override def isLargerBetter: Boolean = true diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala index 7360dace96412..33cec14a63aa6 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala @@ -188,7 +188,7 @@ class ClusteringEvaluatorSuite )) test("params") { - ParamsSuite.checkParams(new RegressionEvaluator) + ParamsSuite.checkParams(new ClusteringEvaluator) } test("read/write") { From ffc17f929dd86d1e7e73931eac5663bc08b6ba7a Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Tue, 8 Aug 2017 19:09:56 +0200 Subject: [PATCH 06/18] Refactor to use RDDs instead of DataFrame for complex objects aggregation --- .../ml/evaluation/ClusteringEvaluator.scala | 154 ++++++++++-------- .../ml/linalg/VectorElementWiseSum.scala | 77 --------- .../ml/linalg/VectorElementWiseSumSuite.scala | 71 -------- 3 files changed, 83 insertions(+), 219 deletions(-) delete mode 100644 mllib/src/main/scala/org/apache/spark/ml/linalg/VectorElementWiseSum.scala delete mode 100644 mllib/src/test/scala/org/apache/spark/ml/linalg/VectorElementWiseSumSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala index 0d52ba28ae9e1..6dd85587494d9 100644 --- 
a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala @@ -20,12 +20,12 @@ package org.apache.spark.ml.evaluation import org.apache.spark.SparkContext import org.apache.spark.annotation.Experimental import org.apache.spark.broadcast.Broadcast -import org.apache.spark.ml.linalg.{Vector, VectorElementWiseSum, VectorUDT} +import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.sql.{DataFrame, Dataset} -import org.apache.spark.sql.functions.{avg, col, count, sum} +import org.apache.spark.sql.functions.{avg, col, udf} import org.apache.spark.sql.types.IntegerType /** @@ -82,48 +82,15 @@ class ClusteringEvaluator (val uid: String) val metric: Double = $(metricName) match { case "squaredSilhouette" => - computeSquaredSilhouette(dataset) + SquaredEuclideanSilhouette.computeSquaredSilhouette( + dataset, + $(predictionCol), + $(featuresCol) + ) } metric } - private[this] def computeSquaredSilhouette(dataset: Dataset[_]): Double = { - SquaredEuclideanSilhouette.registerKryoClasses(dataset.sparkSession.sparkContext) - - val computeCsi = dataset.sparkSession.udf.register("computeCsi", - SquaredEuclideanSilhouette.computeCsi _ ) - val dfWithCsi = dataset.withColumn("csi", computeCsi(col($(featuresCol)))) - - // compute aggregate values for clusters - // needed by the algorithm - val clustersAggregateValues = SquaredEuclideanSilhouette - .computeYVectorPsiAndCount(dfWithCsi, $(predictionCol), $(featuresCol)) - - val clustersMap = clustersAggregateValues.collect().map(row => { - row.getAs[Int]($(predictionCol)) -> - SquaredEuclideanSilhouette.ClusterStats( - row.getAs[Vector]("y"), - row.getAs[Double]("psi"), - row.getAs[Long]("count") - ) - }).toMap - - val broadcastedClustersMap = dataset.sparkSession.sparkContext.broadcast(clustersMap) - - val computeSilhouette = dataset.sparkSession.udf.register("computeSilhouette", - SquaredEuclideanSilhouette - .computeSquaredSilhouetteCoefficient(broadcastedClustersMap, _: Vector, _: Int, _: Double) - ) - - val squaredSilhouetteDF = dfWithCsi - .withColumn("silhouetteCoefficient", - computeSilhouette(col($(featuresCol)), col($(predictionCol)), col("csi")) - ) - .agg(avg(col("silhouetteCoefficient"))) - - squaredSilhouetteDF.collect()(0).getDouble(0) - } - } @@ -156,48 +123,62 @@ private[evaluation] object SquaredEuclideanSilhouette { } } - case class ClusterStats(Y: Vector, psi: Double, count: Long) - - def computeCsi(vector: Vector): Double = { - var sumOfSquares = 0.0 - vector.foreachActive((_, v) => { - sumOfSquares += v * v - }) - sumOfSquares - } - - def computeYVectorPsiAndCount( - df: DataFrame, - predictionCol: String, - featuresCol: String): DataFrame = { - val Yudaf = new VectorElementWiseSum() - df.groupBy(predictionCol) - .agg( - count("*").alias("count"), - sum("csi").alias("psi"), - Yudaf(col(featuresCol)).alias("y") + case class ClusterStats(featureSum: Vector, squaredNormSum: Double, numOfPoints: Long) + + def computeClusterStats( + df: DataFrame, + predictionCol: String, + featuresCol: String): Map[Int, ClusterStats] = { + val numFeatures = df.select(col(featuresCol)).first().getAs[Vector](0).size + val clustersStatsRDD = 
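      // aggregate, for each cluster id, a running triple of (sum of the feature vectors,
      // sum of the squared norms, number of points): seqOp folds a partition's points into
      // the triple, combOp merges the partial triples across partitions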
df.select(col(predictionCol), col(featuresCol), col("squaredNorm")) + .rdd + .map { row => (row.getInt(0), (row.getAs[Vector](1), row.getDouble(2))) } + .aggregateByKey[(DenseVector, Double, Long)]((Vectors.zeros(numFeatures).toDense, 0.0, 0L))( + seqOp = { + case ( + (featureSum: DenseVector, squaredNormSum: Double, numOfPoints: Long), + (features, squaredNorm) + ) => + BLAS.axpy(1.0, features, featureSum) + (featureSum, squaredNormSum + squaredNorm, numOfPoints + 1) + }, + combOp = { + case ( + (featureSum1, squaredNormSum1, numOfPoints1), + (featureSum2, squaredNormSum2, numOfPoints2) + ) => + BLAS.axpy(1.0, featureSum2, featureSum1) + (featureSum1, squaredNormSum1 + squaredNormSum2, numOfPoints1 + numOfPoints2) + } ) + + clustersStatsRDD + .collectAsMap() + .mapValues { + case (featureSum: DenseVector, squaredNormSum: Double, numOfPoints: Long) => + SquaredEuclideanSilhouette.ClusterStats(featureSum, squaredNormSum, numOfPoints) + } + .toMap } def computeSquaredSilhouetteCoefficient( broadcastedClustersMap: Broadcast[Map[Int, ClusterStats]], vector: Vector, clusterId: Int, - csi: Double): Double = { + squaredNorm: Double): Double = { - def compute(csi: Double, point: Vector, clusterStats: ClusterStats): Double = { - var YmultiplyPoint = 0.0 - point.foreachActive((idx, v) => { - YmultiplyPoint += clusterStats.Y(idx) * v - }) + def compute(squaredNorm: Double, point: Vector, clusterStats: ClusterStats): Double = { + val pointDotClusterFeaturesSum = BLAS.dot(point, clusterStats.featureSum) - csi + clusterStats.psi / clusterStats.count - 2 * YmultiplyPoint / clusterStats.count + squaredNorm + + clusterStats.squaredNormSum / clusterStats.numOfPoints - + 2 * pointDotClusterFeaturesSum / clusterStats.numOfPoints } var minOther = Double.MaxValue for(c <- broadcastedClustersMap.value.keySet) { if (c != clusterId) { - val sil = compute(csi, vector, broadcastedClustersMap.value(c)) + val sil = compute(squaredNorm, vector, broadcastedClustersMap.value(c)) if(sil < minOther) { minOther = sil } @@ -206,11 +187,11 @@ private[evaluation] object SquaredEuclideanSilhouette { val clusterCurrentPoint = broadcastedClustersMap.value(clusterId) // adjustment for excluding the node itself from // the computation of the average dissimilarity - val clusterSil = if (clusterCurrentPoint.count == 1) { + val clusterSil = if (clusterCurrentPoint.numOfPoints == 1) { 0 } else { - compute(csi, vector, clusterCurrentPoint) * clusterCurrentPoint.count / - (clusterCurrentPoint.count - 1) + compute(squaredNorm, vector, clusterCurrentPoint) * clusterCurrentPoint.numOfPoints / + (clusterCurrentPoint.numOfPoints - 1) } var silhouetteCoeff = 0.0 @@ -225,4 +206,35 @@ private[evaluation] object SquaredEuclideanSilhouette { } + def computeSquaredSilhouette(dataset: Dataset[_], + predictionCol: String, + featuresCol: String): Double = { + SquaredEuclideanSilhouette.registerKryoClasses(dataset.sparkSession.sparkContext) + + val squaredNorm = udf { + features: Vector => + math.pow(Vectors.norm(features, 2.0), 2.0) + } + val dfWithSquaredNorm = dataset.withColumn("squaredNorm", squaredNorm(col(featuresCol))) + + // compute aggregate values for clusters + // needed by the algorithm + val clustersStatsMap = SquaredEuclideanSilhouette + .computeClusterStats(dfWithSquaredNorm, predictionCol, featuresCol) + + val bClustersStatsMap = dataset.sparkSession.sparkContext.broadcast(clustersStatsMap) + + val computeSilhouette = dataset.sparkSession.udf.register("computeSilhouette", + computeSquaredSilhouetteCoefficient(bClustersStatsMap, _: 
Vector, _: Int, _: Double) + ) + + val squaredSilhouetteDF = dfWithSquaredNorm + .withColumn("silhouetteCoefficient", + computeSilhouette(col(featuresCol), col(predictionCol), col("squaredNorm")) + ) + .agg(avg(col("silhouetteCoefficient"))) + + squaredSilhouetteDF.collect()(0).getDouble(0) + } + } diff --git a/mllib/src/main/scala/org/apache/spark/ml/linalg/VectorElementWiseSum.scala b/mllib/src/main/scala/org/apache/spark/ml/linalg/VectorElementWiseSum.scala deleted file mode 100644 index d130787299087..0000000000000 --- a/mllib/src/main/scala/org/apache/spark/ml/linalg/VectorElementWiseSum.scala +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.ml.linalg - -import scala.collection.mutable - -import org.apache.spark.sql.Row -import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} -import org.apache.spark.sql.types.{ArrayType, DataType, DoubleType, StructType} - -/** - * A UDAF implementing a element-wise sum on [[org.apache.spark.mllib.linalg.Vector]] - * objects. 
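// The UDAF removed below performed exactly this kind of element-wise sum of
// feature vectors per group; this commit moves that aggregation to an RDD with
// aggregateByKey and BLAS.axpy. A rough stand-alone illustration of the
// element-wise sum itself, reusing the dense vectors from the deleted test suite:
val vectors = Seq(Array(0.5, 1.2), Array(1.0, 2.0))
val elementWiseSum = vectors.reduce((a, b) => a.zip(b).map { case (v1, v2) => v1 + v2 })
// elementWiseSum contains (1.5, 3.2), the expected result in that suite.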
- * - */ -private[spark] class VectorElementWiseSum extends UserDefinedAggregateFunction { - - override def inputSchema: StructType = new StructType().add("v", new VectorUDT()) - override def bufferSchema: StructType = new StructType().add("buff", ArrayType(DoubleType)) - // Returned Data Type - override def dataType: DataType = new VectorUDT() - override def deterministic: Boolean = true - - // This function is called whenever key changes - override def initialize(buffer: MutableAggregationBuffer): Unit = { - buffer.update(0, Array.emptyDoubleArray) - } - - // Iterate over each entry of a group - override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { - val aggr = buffer.getAs[mutable.WrappedArray[Double]](0) - val curr = input.getAs[Vector](0) - if (aggr.isEmpty) { - buffer.update(0, curr.toArray) - } else { - curr.foreachActive((idx, v) => { - aggr(idx) += v - }) - buffer.update(0, aggr) - } - } - - // Merge two partial aggregates - override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { - val buff1 = buffer1.getAs[mutable.WrappedArray[Double]](0) - val buff2 = buffer2.getAs[mutable.WrappedArray[Double]](0) - if (buff1.isEmpty) { - buffer1.update(0, buff2) - } else if (buff2.isEmpty) { - buffer1.update(0, buff1) - } else { - for ((x, i) <- buff2.zipWithIndex) { - buff1(i) += x - } - buffer1.update(0, buff1) - } - } - - override def evaluate(buffer: Row): Vector = - Vectors.dense(buffer.getAs[mutable.WrappedArray[Double]](0).toArray) - -} diff --git a/mllib/src/test/scala/org/apache/spark/ml/linalg/VectorElementWiseSumSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/linalg/VectorElementWiseSumSuite.scala deleted file mode 100644 index 75f93964cdd05..0000000000000 --- a/mllib/src/test/scala/org/apache/spark/ml/linalg/VectorElementWiseSumSuite.scala +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.ml.linalg - -import org.apache.spark.SparkFunSuite -import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.Row -import org.apache.spark.sql.types.{StructField, StructType} - - -class VectorElementWiseSumSuite extends SparkFunSuite with MLlibTestSparkContext { - - val vStruct = StructType(Seq(StructField("v", new VectorUDT, false))) - - test("Sum with DenseVectors") { - val dv1 = Vectors.dense(0.5, 1.2) - val dv2 = Vectors.dense(1.0, 2.0) - val result = Vectors.dense(1.5, 3.2) - val vectorsRDD = spark.sparkContext.parallelize(Seq(Row(dv1), Row(dv2))) - - val df = spark.createDataFrame(vectorsRDD, vStruct) - val vSum = new VectorElementWiseSum() - val actualRes = df.agg(vSum(df("v"))).first.getAs[Vector](0) - - assert(result.equals(actualRes)) - - } - - test("Sum with SparseVectors") { - val sv1 = Vectors.sparse(3, Array(1, 2), Array(3.6, 1.0)) - val sv2 = Vectors.sparse(3, Array(1), Array(2.0)) - val result = Vectors.sparse(3, Array(1, 2), Array(5.6, 1.0)) - val vectorsRDD = spark.sparkContext.parallelize(Seq(Row(sv1), Row(sv2))) - - val df = spark.createDataFrame(vectorsRDD, vStruct) - val vSum = new VectorElementWiseSum() - val actualRes = df.agg(vSum(df("v"))).first.getAs[Vector](0) - - assert(result.equals(actualRes)) - - } - - test("Sum with negative numbers") { - val dv1 = Vectors.dense(-0.5, 1.2) - val dv2 = Vectors.dense(1.0, -2.0) - val result = Vectors.dense(0.5, -0.8) - val vectorsRDD = spark.sparkContext.parallelize(Seq(Row(dv1), Row(dv2))) - - val df = spark.createDataFrame(vectorsRDD, vStruct) - val vSum = new VectorElementWiseSum() - val actualRes = df.agg(vSum(df("v"))).first.getAs[Vector](0) - - assert(result.equals(actualRes)) - } - -} From 53c65f1b06f6b86550862c9810a1d5b44c8916ed Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Thu, 17 Aug 2017 17:35:16 +0200 Subject: [PATCH 07/18] Remove unused params, refactor code, some renaming and move iris dataset to resources --- .../ml/evaluation/ClusteringEvaluator.scala | 139 +++++------- mllib/src/test/resources/test-data/iris.csv | 150 +++++++++++++ .../evaluation/ClusteringEvaluatorSuite.scala | 210 ++++-------------- 3 files changed, 246 insertions(+), 253 deletions(-) create mode 100644 mllib/src/test/resources/test-data/iris.csv diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala index 6dd85587494d9..ac96091119846 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala @@ -18,10 +18,10 @@ package org.apache.spark.ml.evaluation import org.apache.spark.SparkContext -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} -import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} +import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.sql.{DataFrame, Dataset} @@ -30,67 +30,42 @@ import org.apache.spark.sql.types.IntegerType /** * Evaluator for clustering results. 
- * At the moment, the supported metrics are: - * squaredSilhouette: silhouette measure using the squared Euclidean distance; - * cosineSilhouette: silhouette measure using the cosine distance. - * The implementation follows the proposal explained + * The metric computes the silhouette measure + * using the squared Euclidean distance. + * The implementation follows the proposal explained * - * in this document. + * in this document. */ @Experimental +@Since("2.3.0") class ClusteringEvaluator (val uid: String) extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { - def this() = this(Identifiable.randomUID("SquaredEuclideanSilhouette")) + def this() = this(Identifiable.randomUID("cluEval")) override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) override def isLargerBetter: Boolean = true /** @group setParam */ + @Since("2.3.0") def setPredictionCol(value: String): this.type = set(predictionCol, value) /** @group setParam */ + @Since("2.3.0") def setFeaturesCol(value: String): this.type = set(featuresCol, value) - /** - * param for metric name in evaluation - * (supports `"squaredSilhouette"` (default)) - * @group param - */ - val metricName: Param[String] = { - val allowedParams = ParamValidators.inArray(Array("squaredSilhouette")) - new Param( - this, - "metricName", - "metric name in evaluation (squaredSilhouette)", - allowedParams - ) - } - - /** @group getParam */ - def getMetricName: String = $(metricName) - - /** @group setParam */ - def setMetricName(value: String): this.type = set(metricName, value) - - setDefault(metricName -> "squaredSilhouette") - + @Since("2.3.0") override def evaluate(dataset: Dataset[_]): Double = { SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) - val metric: Double = $(metricName) match { - case "squaredSilhouette" => - SquaredEuclideanSilhouette.computeSquaredSilhouette( - dataset, - $(predictionCol), - $(featuresCol) - ) - } - metric + SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) + ) } - } @@ -161,9 +136,9 @@ private[evaluation] object SquaredEuclideanSilhouette { .toMap } - def computeSquaredSilhouetteCoefficient( + def computeSilhouetteCoefficient( broadcastedClustersMap: Broadcast[Map[Int, ClusterStats]], - vector: Vector, + features: Vector, clusterId: Int, squaredNorm: Double): Double = { @@ -175,47 +150,51 @@ private[evaluation] object SquaredEuclideanSilhouette { 2 * pointDotClusterFeaturesSum / clusterStats.numOfPoints } - var minOther = Double.MaxValue - for(c <- broadcastedClustersMap.value.keySet) { - if (c != clusterId) { - val sil = compute(squaredNorm, vector, broadcastedClustersMap.value(c)) - if(sil < minOther) { - minOther = sil + // Here we compute the average dissimilarity of the + // current point to any cluster of which the point + // is not a member. + // The cluster with the lowest average dissimilarity + // - i.e. the nearest cluster to the current point - + // is said to be the "neighboring cluster". 
+ var neighboringClusterDissimilarity = Double.MaxValue + broadcastedClustersMap.value.keySet.foreach { + c => + if (c != clusterId) { + val dissimilarity = compute(squaredNorm, features, broadcastedClustersMap.value(c)) + if(dissimilarity < neighboringClusterDissimilarity) { + neighboringClusterDissimilarity = dissimilarity + } } - } } - val clusterCurrentPoint = broadcastedClustersMap.value(clusterId) + val currentCluster = broadcastedClustersMap.value(clusterId) // adjustment for excluding the node itself from // the computation of the average dissimilarity - val clusterSil = if (clusterCurrentPoint.numOfPoints == 1) { + val currentClusterDissimilarity = if (currentCluster.numOfPoints == 1) { 0 } else { - compute(squaredNorm, vector, clusterCurrentPoint) * clusterCurrentPoint.numOfPoints / - (clusterCurrentPoint.numOfPoints - 1) + compute(squaredNorm, features, currentCluster) * currentCluster.numOfPoints / + (currentCluster.numOfPoints - 1) } - var silhouetteCoeff = 0.0 - if (clusterSil < minOther) { - silhouetteCoeff = 1 - (clusterSil / minOther) - } else { - if (clusterSil > minOther) { - silhouetteCoeff = (minOther / clusterSil) - 1 - } + (currentClusterDissimilarity compare neighboringClusterDissimilarity).signum match { + case -1 => 1 - (currentClusterDissimilarity / neighboringClusterDissimilarity) + case 1 => (neighboringClusterDissimilarity / currentClusterDissimilarity) - 1 + case 0 => 0.0 } - silhouetteCoeff - } - def computeSquaredSilhouette(dataset: Dataset[_], - predictionCol: String, - featuresCol: String): Double = { + /** + * Compute the mean Silhouette Coefficient of all samples. + */ + def computeSilhouetteScore(dataset: Dataset[_], + predictionCol: String, + featuresCol: String): Double = { SquaredEuclideanSilhouette.registerKryoClasses(dataset.sparkSession.sparkContext) - val squaredNorm = udf { - features: Vector => - math.pow(Vectors.norm(features, 2.0), 2.0) + val squaredNormUDF = udf { + features: Vector => math.pow(Vectors.norm(features, 2.0), 2.0) } - val dfWithSquaredNorm = dataset.withColumn("squaredNorm", squaredNorm(col(featuresCol))) + val dfWithSquaredNorm = dataset.withColumn("squaredNorm", squaredNormUDF(col(featuresCol))) // compute aggregate values for clusters // needed by the algorithm @@ -224,17 +203,15 @@ private[evaluation] object SquaredEuclideanSilhouette { val bClustersStatsMap = dataset.sparkSession.sparkContext.broadcast(clustersStatsMap) - val computeSilhouette = dataset.sparkSession.udf.register("computeSilhouette", - computeSquaredSilhouetteCoefficient(bClustersStatsMap, _: Vector, _: Int, _: Double) - ) - - val squaredSilhouetteDF = dfWithSquaredNorm - .withColumn("silhouetteCoefficient", - computeSilhouette(col(featuresCol), col(predictionCol), col("squaredNorm")) - ) - .agg(avg(col("silhouetteCoefficient"))) + val computeSilhouetteCoefficientUDF = udf { + computeSilhouetteCoefficient(bClustersStatsMap, _: Vector, _: Int, _: Double) + } - squaredSilhouetteDF.collect()(0).getDouble(0) + dfWithSquaredNorm + .select(avg( + computeSilhouetteCoefficientUDF(col(featuresCol), col(predictionCol), col("squaredNorm")) + )) + .collect()(0) + .getDouble(0) } - } diff --git a/mllib/src/test/resources/test-data/iris.csv b/mllib/src/test/resources/test-data/iris.csv new file mode 100644 index 0000000000000..1de4bbac2d427 --- /dev/null +++ b/mllib/src/test/resources/test-data/iris.csv @@ -0,0 +1,150 @@ +5.1,3.5,1.4,0.2,0 +4.9,3.0,1.4,0.2,0 +4.7,3.2,1.3,0.2,0 +4.6,3.1,1.5,0.2,0 +5.0,3.6,1.4,0.2,0 +5.4,3.9,1.7,0.4,0 +4.6,3.4,1.4,0.3,0 +5.0,3.4,1.5,0.2,0 
+4.4,2.9,1.4,0.2,0 +4.9,3.1,1.5,0.1,0 +5.4,3.7,1.5,0.2,0 +4.8,3.4,1.6,0.2,0 +4.8,3.0,1.4,0.1,0 +4.3,3.0,1.1,0.1,0 +5.8,4.0,1.2,0.2,0 +5.7,4.4,1.5,0.4,0 +5.4,3.9,1.3,0.4,0 +5.1,3.5,1.4,0.3,0 +5.7,3.8,1.7,0.3,0 +5.1,3.8,1.5,0.3,0 +5.4,3.4,1.7,0.2,0 +5.1,3.7,1.5,0.4,0 +4.6,3.6,1.0,0.2,0 +5.1,3.3,1.7,0.5,0 +4.8,3.4,1.9,0.2,0 +5.0,3.0,1.6,0.2,0 +5.0,3.4,1.6,0.4,0 +5.2,3.5,1.5,0.2,0 +5.2,3.4,1.4,0.2,0 +4.7,3.2,1.6,0.2,0 +4.8,3.1,1.6,0.2,0 +5.4,3.4,1.5,0.4,0 +5.2,4.1,1.5,0.1,0 +5.5,4.2,1.4,0.2,0 +4.9,3.1,1.5,0.1,0 +5.0,3.2,1.2,0.2,0 +5.5,3.5,1.3,0.2,0 +4.9,3.1,1.5,0.1,0 +4.4,3.0,1.3,0.2,0 +5.1,3.4,1.5,0.2,0 +5.0,3.5,1.3,0.3,0 +4.5,2.3,1.3,0.3,0 +4.4,3.2,1.3,0.2,0 +5.0,3.5,1.6,0.6,0 +5.1,3.8,1.9,0.4,0 +4.8,3.0,1.4,0.3,0 +5.1,3.8,1.6,0.2,0 +4.6,3.2,1.4,0.2,0 +5.3,3.7,1.5,0.2,0 +5.0,3.3,1.4,0.2,0 +7.0,3.2,4.7,1.4,1 +6.4,3.2,4.5,1.5,1 +6.9,3.1,4.9,1.5,1 +5.5,2.3,4.0,1.3,1 +6.5,2.8,4.6,1.5,1 +5.7,2.8,4.5,1.3,1 +6.3,3.3,4.7,1.6,1 +4.9,2.4,3.3,1.0,1 +6.6,2.9,4.6,1.3,1 +5.2,2.7,3.9,1.4,1 +5.0,2.0,3.5,1.0,1 +5.9,3.0,4.2,1.5,1 +6.0,2.2,4.0,1.0,1 +6.1,2.9,4.7,1.4,1 +5.6,2.9,3.6,1.3,1 +6.7,3.1,4.4,1.4,1 +5.6,3.0,4.5,1.5,1 +5.8,2.7,4.1,1.0,1 +6.2,2.2,4.5,1.5,1 +5.6,2.5,3.9,1.1,1 +5.9,3.2,4.8,1.8,1 +6.1,2.8,4.0,1.3,1 +6.3,2.5,4.9,1.5,1 +6.1,2.8,4.7,1.2,1 +6.4,2.9,4.3,1.3,1 +6.6,3.0,4.4,1.4,1 +6.8,2.8,4.8,1.4,1 +6.7,3.0,5.0,1.7,1 +6.0,2.9,4.5,1.5,1 +5.7,2.6,3.5,1.0,1 +5.5,2.4,3.8,1.1,1 +5.5,2.4,3.7,1.0,1 +5.8,2.7,3.9,1.2,1 +6.0,2.7,5.1,1.6,1 +5.4,3.0,4.5,1.5,1 +6.0,3.4,4.5,1.6,1 +6.7,3.1,4.7,1.5,1 +6.3,2.3,4.4,1.3,1 +5.6,3.0,4.1,1.3,1 +5.5,2.5,4.0,1.3,1 +5.5,2.6,4.4,1.2,1 +6.1,3.0,4.6,1.4,1 +5.8,2.6,4.0,1.2,1 +5.0,2.3,3.3,1.0,1 +5.6,2.7,4.2,1.3,1 +5.7,3.0,4.2,1.2,1 +5.7,2.9,4.2,1.3,1 +6.2,2.9,4.3,1.3,1 +5.1,2.5,3.0,1.1,1 +5.7,2.8,4.1,1.3,1 +6.3,3.3,6.0,2.5,2 +5.8,2.7,5.1,1.9,2 +7.1,3.0,5.9,2.1,2 +6.3,2.9,5.6,1.8,2 +6.5,3.0,5.8,2.2,2 +7.6,3.0,6.6,2.1,2 +4.9,2.5,4.5,1.7,2 +7.3,2.9,6.3,1.8,2 +6.7,2.5,5.8,1.8,2 +7.2,3.6,6.1,2.5,2 +6.5,3.2,5.1,2.0,2 +6.4,2.7,5.3,1.9,2 +6.8,3.0,5.5,2.1,2 +5.7,2.5,5.0,2.0,2 +5.8,2.8,5.1,2.4,2 +6.4,3.2,5.3,2.3,2 +6.5,3.0,5.5,1.8,2 +7.7,3.8,6.7,2.2,2 +7.7,2.6,6.9,2.3,2 +6.0,2.2,5.0,1.5,2 +6.9,3.2,5.7,2.3,2 +5.6,2.8,4.9,2.0,2 +7.7,2.8,6.7,2.0,2 +6.3,2.7,4.9,1.8,2 +6.7,3.3,5.7,2.1,2 +7.2,3.2,6.0,1.8,2 +6.2,2.8,4.8,1.8,2 +6.1,3.0,4.9,1.8,2 +6.4,2.8,5.6,2.1,2 +7.2,3.0,5.8,1.6,2 +7.4,2.8,6.1,1.9,2 +7.9,3.8,6.4,2.0,2 +6.4,2.8,5.6,2.2,2 +6.3,2.8,5.1,1.5,2 +6.1,2.6,5.6,1.4,2 +7.7,3.0,6.1,2.3,2 +6.3,3.4,5.6,2.4,2 +6.4,3.1,5.5,1.8,2 +6.0,3.0,4.8,1.8,2 +6.9,3.1,5.4,2.1,2 +6.7,3.1,5.6,2.4,2 +6.9,3.1,5.1,2.3,2 +5.8,2.7,5.1,1.9,2 +6.8,3.2,5.9,2.3,2 +6.7,3.3,5.7,2.5,2 +6.7,3.0,5.2,2.3,2 +6.3,2.5,5.0,1.9,2 +6.5,3.0,5.2,2.0,2 +6.2,3.4,5.4,2.3,2 +5.9,3.0,5.1,1.8,2 \ No newline at end of file diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala index 33cec14a63aa6..c11cd19748756 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala @@ -18,175 +18,20 @@ package org.apache.spark.ml.evaluation import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.linalg.{Vectors, VectorUDT} +import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.Row -import 
org.apache.spark.sql.types.{IntegerType, StructField, StructType} +import org.apache.spark.sql.{DataFrame, SparkSession} +private[ml] case class ClusteringEvaluationTestData(features: Vector, label: Int) + class ClusteringEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ - val dataset = Seq(Row(Vectors.dense(5.1, 3.5, 1.4, 0.2), 0), - Row(Vectors.dense(4.9, 3.0, 1.4, 0.2), 0), - Row(Vectors.dense(4.7, 3.2, 1.3, 0.2), 0), - Row(Vectors.dense(4.6, 3.1, 1.5, 0.2), 0), - Row(Vectors.dense(5.0, 3.6, 1.4, 0.2), 0), - Row(Vectors.dense(5.4, 3.9, 1.7, 0.4), 0), - Row(Vectors.dense(4.6, 3.4, 1.4, 0.3), 0), - Row(Vectors.dense(5.0, 3.4, 1.5, 0.2), 0), - Row(Vectors.dense(4.4, 2.9, 1.4, 0.2), 0), - Row(Vectors.dense(4.9, 3.1, 1.5, 0.1), 0), - Row(Vectors.dense(5.4, 3.7, 1.5, 0.2), 0), - Row(Vectors.dense(4.8, 3.4, 1.6, 0.2), 0), - Row(Vectors.dense(4.8, 3.0, 1.4, 0.1), 0), - Row(Vectors.dense(4.3, 3.0, 1.1, 0.1), 0), - Row(Vectors.dense(5.8, 4.0, 1.2, 0.2), 0), - Row(Vectors.dense(5.7, 4.4, 1.5, 0.4), 0), - Row(Vectors.dense(5.4, 3.9, 1.3, 0.4), 0), - Row(Vectors.dense(5.1, 3.5, 1.4, 0.3), 0), - Row(Vectors.dense(5.7, 3.8, 1.7, 0.3), 0), - Row(Vectors.dense(5.1, 3.8, 1.5, 0.3), 0), - Row(Vectors.dense(5.4, 3.4, 1.7, 0.2), 0), - Row(Vectors.dense(5.1, 3.7, 1.5, 0.4), 0), - Row(Vectors.dense(4.6, 3.6, 1.0, 0.2), 0), - Row(Vectors.dense(5.1, 3.3, 1.7, 0.5), 0), - Row(Vectors.dense(4.8, 3.4, 1.9, 0.2), 0), - Row(Vectors.dense(5.0, 3.0, 1.6, 0.2), 0), - Row(Vectors.dense(5.0, 3.4, 1.6, 0.4), 0), - Row(Vectors.dense(5.2, 3.5, 1.5, 0.2), 0), - Row(Vectors.dense(5.2, 3.4, 1.4, 0.2), 0), - Row(Vectors.dense(4.7, 3.2, 1.6, 0.2), 0), - Row(Vectors.dense(4.8, 3.1, 1.6, 0.2), 0), - Row(Vectors.dense(5.4, 3.4, 1.5, 0.4), 0), - Row(Vectors.dense(5.2, 4.1, 1.5, 0.1), 0), - Row(Vectors.dense(5.5, 4.2, 1.4, 0.2), 0), - Row(Vectors.dense(4.9, 3.1, 1.5, 0.1), 0), - Row(Vectors.dense(5.0, 3.2, 1.2, 0.2), 0), - Row(Vectors.dense(5.5, 3.5, 1.3, 0.2), 0), - Row(Vectors.dense(4.9, 3.1, 1.5, 0.1), 0), - Row(Vectors.dense(4.4, 3.0, 1.3, 0.2), 0), - Row(Vectors.dense(5.1, 3.4, 1.5, 0.2), 0), - Row(Vectors.dense(5.0, 3.5, 1.3, 0.3), 0), - Row(Vectors.dense(4.5, 2.3, 1.3, 0.3), 0), - Row(Vectors.dense(4.4, 3.2, 1.3, 0.2), 0), - Row(Vectors.dense(5.0, 3.5, 1.6, 0.6), 0), - Row(Vectors.dense(5.1, 3.8, 1.9, 0.4), 0), - Row(Vectors.dense(4.8, 3.0, 1.4, 0.3), 0), - Row(Vectors.dense(5.1, 3.8, 1.6, 0.2), 0), - Row(Vectors.dense(4.6, 3.2, 1.4, 0.2), 0), - Row(Vectors.dense(5.3, 3.7, 1.5, 0.2), 0), - Row(Vectors.dense(5.0, 3.3, 1.4, 0.2), 0), - Row(Vectors.dense(7.0, 3.2, 4.7, 1.4), 1), - Row(Vectors.dense(6.4, 3.2, 4.5, 1.5), 1), - Row(Vectors.dense(6.9, 3.1, 4.9, 1.5), 1), - Row(Vectors.dense(5.5, 2.3, 4.0, 1.3), 1), - Row(Vectors.dense(6.5, 2.8, 4.6, 1.5), 1), - Row(Vectors.dense(5.7, 2.8, 4.5, 1.3), 1), - Row(Vectors.dense(6.3, 3.3, 4.7, 1.6), 1), - Row(Vectors.dense(4.9, 2.4, 3.3, 1.0), 1), - Row(Vectors.dense(6.6, 2.9, 4.6, 1.3), 1), - Row(Vectors.dense(5.2, 2.7, 3.9, 1.4), 1), - Row(Vectors.dense(5.0, 2.0, 3.5, 1.0), 1), - Row(Vectors.dense(5.9, 3.0, 4.2, 1.5), 1), - Row(Vectors.dense(6.0, 2.2, 4.0, 1.0), 1), - Row(Vectors.dense(6.1, 2.9, 4.7, 1.4), 1), - Row(Vectors.dense(5.6, 2.9, 3.6, 1.3), 1), - Row(Vectors.dense(6.7, 3.1, 4.4, 1.4), 1), - Row(Vectors.dense(5.6, 3.0, 4.5, 1.5), 1), - Row(Vectors.dense(5.8, 2.7, 4.1, 1.0), 1), - Row(Vectors.dense(6.2, 2.2, 4.5, 1.5), 1), - Row(Vectors.dense(5.6, 2.5, 3.9, 1.1), 1), - Row(Vectors.dense(5.9, 3.2, 4.8, 1.8), 1), - 
Row(Vectors.dense(6.1, 2.8, 4.0, 1.3), 1), - Row(Vectors.dense(6.3, 2.5, 4.9, 1.5), 1), - Row(Vectors.dense(6.1, 2.8, 4.7, 1.2), 1), - Row(Vectors.dense(6.4, 2.9, 4.3, 1.3), 1), - Row(Vectors.dense(6.6, 3.0, 4.4, 1.4), 1), - Row(Vectors.dense(6.8, 2.8, 4.8, 1.4), 1), - Row(Vectors.dense(6.7, 3.0, 5.0, 1.7), 1), - Row(Vectors.dense(6.0, 2.9, 4.5, 1.5), 1), - Row(Vectors.dense(5.7, 2.6, 3.5, 1.0), 1), - Row(Vectors.dense(5.5, 2.4, 3.8, 1.1), 1), - Row(Vectors.dense(5.5, 2.4, 3.7, 1.0), 1), - Row(Vectors.dense(5.8, 2.7, 3.9, 1.2), 1), - Row(Vectors.dense(6.0, 2.7, 5.1, 1.6), 1), - Row(Vectors.dense(5.4, 3.0, 4.5, 1.5), 1), - Row(Vectors.dense(6.0, 3.4, 4.5, 1.6), 1), - Row(Vectors.dense(6.7, 3.1, 4.7, 1.5), 1), - Row(Vectors.dense(6.3, 2.3, 4.4, 1.3), 1), - Row(Vectors.dense(5.6, 3.0, 4.1, 1.3), 1), - Row(Vectors.dense(5.5, 2.5, 4.0, 1.3), 1), - Row(Vectors.dense(5.5, 2.6, 4.4, 1.2), 1), - Row(Vectors.dense(6.1, 3.0, 4.6, 1.4), 1), - Row(Vectors.dense(5.8, 2.6, 4.0, 1.2), 1), - Row(Vectors.dense(5.0, 2.3, 3.3, 1.0), 1), - Row(Vectors.dense(5.6, 2.7, 4.2, 1.3), 1), - Row(Vectors.dense(5.7, 3.0, 4.2, 1.2), 1), - Row(Vectors.dense(5.7, 2.9, 4.2, 1.3), 1), - Row(Vectors.dense(6.2, 2.9, 4.3, 1.3), 1), - Row(Vectors.dense(5.1, 2.5, 3.0, 1.1), 1), - Row(Vectors.dense(5.7, 2.8, 4.1, 1.3), 1), - Row(Vectors.dense(6.3, 3.3, 6.0, 2.5), 2), - Row(Vectors.dense(5.8, 2.7, 5.1, 1.9), 2), - Row(Vectors.dense(7.1, 3.0, 5.9, 2.1), 2), - Row(Vectors.dense(6.3, 2.9, 5.6, 1.8), 2), - Row(Vectors.dense(6.5, 3.0, 5.8, 2.2), 2), - Row(Vectors.dense(7.6, 3.0, 6.6, 2.1), 2), - Row(Vectors.dense(4.9, 2.5, 4.5, 1.7), 2), - Row(Vectors.dense(7.3, 2.9, 6.3, 1.8), 2), - Row(Vectors.dense(6.7, 2.5, 5.8, 1.8), 2), - Row(Vectors.dense(7.2, 3.6, 6.1, 2.5), 2), - Row(Vectors.dense(6.5, 3.2, 5.1, 2.0), 2), - Row(Vectors.dense(6.4, 2.7, 5.3, 1.9), 2), - Row(Vectors.dense(6.8, 3.0, 5.5, 2.1), 2), - Row(Vectors.dense(5.7, 2.5, 5.0, 2.0), 2), - Row(Vectors.dense(5.8, 2.8, 5.1, 2.4), 2), - Row(Vectors.dense(6.4, 3.2, 5.3, 2.3), 2), - Row(Vectors.dense(6.5, 3.0, 5.5, 1.8), 2), - Row(Vectors.dense(7.7, 3.8, 6.7, 2.2), 2), - Row(Vectors.dense(7.7, 2.6, 6.9, 2.3), 2), - Row(Vectors.dense(6.0, 2.2, 5.0, 1.5), 2), - Row(Vectors.dense(6.9, 3.2, 5.7, 2.3), 2), - Row(Vectors.dense(5.6, 2.8, 4.9, 2.0), 2), - Row(Vectors.dense(7.7, 2.8, 6.7, 2.0), 2), - Row(Vectors.dense(6.3, 2.7, 4.9, 1.8), 2), - Row(Vectors.dense(6.7, 3.3, 5.7, 2.1), 2), - Row(Vectors.dense(7.2, 3.2, 6.0, 1.8), 2), - Row(Vectors.dense(6.2, 2.8, 4.8, 1.8), 2), - Row(Vectors.dense(6.1, 3.0, 4.9, 1.8), 2), - Row(Vectors.dense(6.4, 2.8, 5.6, 2.1), 2), - Row(Vectors.dense(7.2, 3.0, 5.8, 1.6), 2), - Row(Vectors.dense(7.4, 2.8, 6.1, 1.9), 2), - Row(Vectors.dense(7.9, 3.8, 6.4, 2.0), 2), - Row(Vectors.dense(6.4, 2.8, 5.6, 2.2), 2), - Row(Vectors.dense(6.3, 2.8, 5.1, 1.5), 2), - Row(Vectors.dense(6.1, 2.6, 5.6, 1.4), 2), - Row(Vectors.dense(7.7, 3.0, 6.1, 2.3), 2), - Row(Vectors.dense(6.3, 3.4, 5.6, 2.4), 2), - Row(Vectors.dense(6.4, 3.1, 5.5, 1.8), 2), - Row(Vectors.dense(6.0, 3.0, 4.8, 1.8), 2), - Row(Vectors.dense(6.9, 3.1, 5.4, 2.1), 2), - Row(Vectors.dense(6.7, 3.1, 5.6, 2.4), 2), - Row(Vectors.dense(6.9, 3.1, 5.1, 2.3), 2), - Row(Vectors.dense(5.8, 2.7, 5.1, 1.9), 2), - Row(Vectors.dense(6.8, 3.2, 5.9, 2.3), 2), - Row(Vectors.dense(6.7, 3.3, 5.7, 2.5), 2), - Row(Vectors.dense(6.7, 3.0, 5.2, 2.3), 2), - Row(Vectors.dense(6.3, 2.5, 5.0, 1.9), 2), - Row(Vectors.dense(6.5, 3.0, 5.2, 2.0), 2), - Row(Vectors.dense(6.2, 3.4, 5.4, 2.3), 2), - Row(Vectors.dense(5.9, 3.0, 5.1, 1.8), 2)) - 
- val dsStruct = StructType( Seq( - StructField("point", new VectorUDT, nullable = false), - StructField("label", IntegerType, nullable = false) - )) - test("params") { ParamsSuite.checkParams(new ClusteringEvaluator) } @@ -195,31 +40,52 @@ class ClusteringEvaluatorSuite val evaluator = new ClusteringEvaluator() .setPredictionCol("myPrediction") .setFeaturesCol("myLabel") - .setMetricName("squaredSilhouette") testDefaultReadWrite(evaluator) } /* - Use the following python code to load the data and evaluate it using scikit-learn package. + Use the following python code to load the data and evaluate it using scikit-learn package. + + from sklearn import datasets + from sklearn.metrics import silhouette_score + iris = datasets.load_iris() + round(silhouette_score(iris.data, iris.target, metric='sqeuclidean'), 10) - from sklearn import datasets - from sklearn.metrics import silhouette_score - iris = datasets.load_iris() - round(silhouette_score(iris.data, iris.target, metric='sqeuclidean'), 10) + 0.6564679231 */ test("squared euclidean Silhouette") { val result = BigDecimal(0.6564679231) - val dsRDD = spark.sparkContext.parallelize(dataset) - val df = spark.createDataFrame(dsRDD, dsStruct) - + val iris = ClusteringEvaluatorSuite.irisDataset(spark) val evaluator = new ClusteringEvaluator() - .setFeaturesCol("point") + .setFeaturesCol("features") .setPredictionCol("label") - .setMetricName("squaredSilhouette") - val actual = BigDecimal(evaluator.evaluate(df)) + val actual = BigDecimal(evaluator.evaluate(iris)) .setScale(10, BigDecimal.RoundingMode.HALF_UP) - assertResult(result)(actual) + assertResult(result)(actual) } } + +object ClusteringEvaluatorSuite { + def irisDataset(spark: SparkSession): DataFrame = { + import spark.implicits._ + + val irisCsvPath = Thread.currentThread() + .getContextClassLoader + .getResource("test-data/iris.csv") + .toString + + spark.sparkContext + .textFile(irisCsvPath) + .map { + line => + val splits = line.split(",") + ClusteringEvaluationTestData( + Vectors.dense(splits.take(splits.length-1).map(_.toDouble)), + splits(splits.length-1).toInt + ) + } + .toDF() + } +} From a4ca3cd18852abc8076905a586c6b0f4b622cff6 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Fri, 18 Aug 2017 23:33:12 +0200 Subject: [PATCH 08/18] Added documentation --- .../ml/evaluation/ClusteringEvaluator.scala | 165 +++++++++++++++++- 1 file changed, 163 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala index ac96091119846..5e7d0badc6458 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala @@ -30,8 +30,16 @@ import org.apache.spark.sql.types.IntegerType /** * Evaluator for clustering results. - * The metric computes the silhouette measure + * The metric computes the Silhouette measure * using the squared Euclidean distance. + * + * The Silhouette is a measure for the validation + * of the consistency within clusters. It ranges + * between 1 and -1, where a value close to 1 + * means that the points in a cluster are close + * to the other points in the same cluster and + * far from the points of the other clusters. + * * The implementation follows the proposal explained * * in this document. 
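For a concrete sense of how the evaluator documented above is meant to be used, the following is a minimal usage sketch rather than code from this patch; the input DataFrame `dataset` and its "features" column are assumptions for illustration, and any clustering algorithm that produces an integer prediction column would do:

    import org.apache.spark.ml.clustering.KMeans
    import org.apache.spark.ml.evaluation.ClusteringEvaluator

    // `dataset` is assumed to be a DataFrame with a Vector column named "features".
    val predictions = new KMeans()
      .setK(3)
      .setSeed(1L)
      .setFeaturesCol("features")
      .fit(dataset)
      .transform(dataset)          // adds an integer "prediction" column

    val evaluator = new ClusteringEvaluator()
      .setFeaturesCol("features")
      .setPredictionCol("prediction")

    // Values near 1 indicate tight, well separated clusters; values near -1 the opposite.
    val silhouette = evaluator.evaluate(predictions)

Since the metric is larger-is-better (`isLargerBetter` returns true), a higher silhouette on the same data indicates a better clustering.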
@@ -76,6 +84,135 @@ object ClusteringEvaluator } + +/** + * SquaredEuclideanSilhouette computes the average of the + * Silhouette over all the data of the dataset, which is + * a measure of how appropriately the data have been clustered. + * + * The Silhouette for each point `i` is defined as: + * + *
+ * s_{i} = \frac{b_{i}-a_{i}}{max\{a_{i},b_{i}\}} + *
+ * + * which can be rewritten as + * + *
+ * s_{i}=\left\{ \begin{tabular}{cc} + * $1-\frac{a_{i}}{b_{i}}$ & se $a_{i} \leq b_{i}$ \\ + * $\frac{b_{i}}{a_{i}}-1$ & se $a_{i} > b_{i}$ + *
+ * + * where `a(i)` is the average dissimilarity of `i` with all other data + * within the same cluster, `b(i)` is the lowest average dissimilarity + * of to any other cluster, of which `i` is not a member. + * `a(i)` can be interpreted as as how well `i` is assigned to its cluster + * (the smaller the value, the better the assignment), while `b(i)` is + * a measure of how well `i` has not been assigned to its "neighboring cluster", + * ie. the nearest cluster to `i`. + * + * Unfortunately, the naive implementation of the algorithm requires to compute + * the distance of each couple of points in the dataset. Since the computation of + * the distance measure takes `D` operations - if `D` is the number of dimensions + * of each point, the computational complexity of the algorithm is `O(N^2*D)`, where + * `N` is the cardinality of the dataset. Of course this is not scalable in `N`, + * which is the critical number in a Big Data context. + * + * The algorithm which is implemented in this object, instead, is an efficient + * and parallel implementation of the Silhouette using the squared Euclidean + * distance measure. + * + * With this assumption, the average of the distance of the point `X` + * to the points `C_{i}` belonging to the cluster `\Gamma` is: + * + *
+ * \sum\limits_{i=1}^N d(X, C_{i} )^2 = \\ + * \sum\limits_{i=1}^N \Big( \sum\limits_{j=1}^D (x_{j}-c_{ij})^2 \Big) + * = \sum\limits_{i=1}^N \Big( \sum\limits_{j=1}^D x_{j}^2 + \\ + * \sum\limits_{j=1}^D c_{ij}^2 -2\sum\limits_{j=1}^D x_{i}c_{ij} \Big) + * = \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{j}^2 + \\ + * \sum\limits_{i=1}^N \sum\limits_{j=1}^D c_{ij}^2 \\ + * -2 \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{i}c_{ij} + *
+ * + * where `x_{j}` is the `j`-th dimension of the point `X` and + * `c_{ij} is the `j`-th dimension of the `i`-th point in cluster `\Gamma`. + * + * Then, the first term of the equation can be rewritten as: + * + *
+ * \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{j}^2 = N \xi_{X} , \\ + * with \xi_{X} = \sum\limits_{j=1}^D x_{j}^2 + *
+ * + * where `\xi_{X}` is fixed for each point and it can be precomputed. + * + * Moreover, the second term is fixed for each cluster too, + * thus we can name it `\Psi_{\Gamma}` + * + *
+ * sum\limits_{i=1}^N \sum\limits_{j=1}^D c_{ij}^2 = \\ + * \sum\limits_{i=1}^N \xi_{C_{i}} = \Psi_{\Gamma} + *
+ * + * Last, the third element becomes + * + *
+ * \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{i}c_{ij} = \\ + * \sum\limits_{j=1}^D \Big(\sum\limits_{i=1}^N c_{ij} \Big) x_{i} + *
+ * + * thus defining the vector + * + *
+ * Y_{\Gamma}:Y_{\Gamma j} = \sum\limits_{i=1}^N c_{ij} , j=0, ..., D + *
+ * + * which is fixed for each cluster `\Gamma`, we have + * + *
+ * \sum\limits_{j=1}^D \Big(\sum\limits_{i=1}^N c_{ij} \Big) x_{i} = \\ + * \sum\limits_{j=1}^D Y_{\Gamma j} x_{i} + *
+ * + * In this way, the previous equation becomes + * + *
+ * N\xi_{X} + \Psi_{\Gamma} - 2 \sum\limits_{j=1}^D Y_{\Gamma j} x_{i} + *
+ * + * and the distance of a point to a cluster can be computed as + * + *
+ * \frac{\sum\limits_{i=1}^N d(X, C_{i} )^2}{N} = \\ + * \frac{N\xi_{X} + \Psi_{\Gamma} - 2 \sum\limits_{j=1}^D Y_{\Gamma j} x_{i}}{N} = \\ + * \xi_{X} + \frac{\Psi_{\Gamma} }{N} - 2 \frac{\sum\limits_{j=1}^D Y_{\Gamma j} x_{i}}{N} + *
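The identity above is easy to sanity check numerically. Below is a small stand-alone sketch with made-up values (plain Scala, not code from this patch): the naive average of the squared distances from a point to a cluster matches the precomputed form built from `\xi_{X}`, `\Psi_{\Gamma}`, `Y_{\Gamma}` and `N`, and the case analysis for the coefficient stated earlier is included for completeness.

    // Numeric check of: (1/N) * sum_i d(X, C_i)^2 == \xi_X + \Psi/N - 2 * (Y dot X) / N
    val x = Array(1.0, 2.0)                                           // the point X
    val cluster = Seq(Array(0.0, 1.0), Array(2.0, 3.0), Array(4.0, 0.0))
    val n = cluster.size

    val naive = cluster.map { c =>
      x.zip(c).map { case (xi, ci) => (xi - ci) * (xi - ci) }.sum
    }.sum / n

    val xiX = x.map(v => v * v).sum                    // squared norm of X
    val psi = cluster.map(_.map(v => v * v).sum).sum   // sum of squared norms in the cluster
    val y = cluster.transpose.map(_.sum)               // element-wise sum vector Y
    val viaStats = xiX + psi / n - 2.0 * x.zip(y).map { case (xi, yi) => xi * yi }.sum / n

    assert(math.abs(naive - viaStats) < 1e-9)          // both equal 17/3 for these values

    // Given the average dissimilarities a (own cluster) and b (nearest other cluster),
    // the coefficient then follows the case analysis above:
    def silhouetteCoefficient(a: Double, b: Double): Double =
      if (a < b) 1 - a / b else if (a > b) b / a - 1 else 0.0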
+ * + * Thus, it is enough to precompute the constant `\xi_{X}` for each point `X` + * and the constants `\Psi_{\Gamma}` and `N` and the vector `Y_{\Gamma}` for + * each cluster `\Gamma`. + * + * In the implementation, the precomputed values for the clusters + * are distributed among the worker nodes via broadcasted variables, + * because we can assume that the clusters are limited in number and + * anyway they are much fewer than the points. + * + * The main strengths of this algorithm are the low computational complexity + * and the intrinsic parallelism. The precomputed information for each point + * and for each cluster can be computed with a computational complexity + * which is `O(N/W)`, where `N` is the number of points in the dataset and + * `W` is the number of worker nodes. After that, every point can be + * analyzed independently of the others. + * + * For every point we need to compute the average distance to all the clusters. + * Since the formula above requires `O(D)` operations, this phase has a + * computational complexity which is `O(C*D*N/W)` where `C` is the number of + * clusters (which we assume quite low), `D` is the number of dimensions, + * `N` is the number of points in the dataset and `W` is the number + * of worker nodes. + */ private[evaluation] object SquaredEuclideanSilhouette { private[this] var kryoRegistrationPerformed: Boolean = false @@ -100,6 +237,16 @@ private[evaluation] object SquaredEuclideanSilhouette { case class ClusterStats(featureSum: Vector, squaredNormSum: Double, numOfPoints: Long) + /** + * The method takes the input dataset and computes the aggregated values + * about a cluster which are needed by the algorithm. + * + * @param df The DataFrame which contains the input data + * @param predictionCol The name of the column which contains the cluster id for the point. + * @param featuresCol The name of the column which contains the feature vector of the point. + * @return A [[Map]] which associates each cluster id to a [[ClusterStats]] object (which + * contains the precomputed values `N`, `\Psi_{\Gamma}` and `Y_{\Gamma}` for a cluster). + */ def computeClusterStats( df: DataFrame, predictionCol: String, @@ -136,6 +283,15 @@ private[evaluation] object SquaredEuclideanSilhouette { .toMap } + /** + * It computes the Silhouette coefficient for a point. + * + * @param broadcastedClustersMap A map of the precomputed values for each cluster. + * @param features The [[org.apache.spark.ml.linalg.Vector]] representing the current point. + * @param clusterId The id of the cluster the current point belongs to. + * @param squaredNorm The `\Xi_{X}` (which is the squared norm) precomputed for the point. + * @return The Silhouette for the point. + */ def computeSilhouetteCoefficient( broadcastedClustersMap: Broadcast[Map[Int, ClusterStats]], features: Vector, @@ -184,7 +340,12 @@ private[evaluation] object SquaredEuclideanSilhouette { } /** - * Compute the mean Silhouette Coefficient of all samples. + * Compute the mean Silhouette values of all samples. + * + * @param dataset The input dataset (previously clustered) on which compute the Silhouette. + * @param predictionCol The name of the column which contains the cluster id for the point. + * @param featuresCol The name of the column which contains the feature vector of the point. + * @return The average of the Silhouette values of the clustered data. 
*/ def computeSilhouetteScore(dataset: Dataset[_], predictionCol: String, From a7db8962745bd000da0737018eef4b1680425c90 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Sat, 19 Aug 2017 00:45:07 +0200 Subject: [PATCH 09/18] Fix javadoc errors --- .../ml/evaluation/ClusteringEvaluator.scala | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala index 5e7d0badc6458..05ad612f62961 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala @@ -100,8 +100,8 @@ object ClusteringEvaluator * *
* s_{i}=\left\{ \begin{tabular}{cc} - * $1-\frac{a_{i}}{b_{i}}$ & se $a_{i} \leq b_{i}$ \\ - * $\frac{b_{i}}{a_{i}}-1$ & se $a_{i} > b_{i}$ + * $1-\frac{a_{i}}{b_{i}}$ & if $a_{i} \leq b_{i}$ \\ + * $\frac{b_{i}}{a_{i}}-1$ & if $a_{i} \gt b_{i}$ *
* * where `a(i)` is the average dissimilarity of `i` with all other data @@ -127,12 +127,12 @@ object ClusteringEvaluator * to the points `C_{i}` belonging to the cluster `\Gamma` is: * *
- * \sum\limits_{i=1}^N d(X, C_{i} )^2 = \\ + * \sum\limits_{i=1}^N d(X, C_{i} )^2 = * \sum\limits_{i=1}^N \Big( \sum\limits_{j=1}^D (x_{j}-c_{ij})^2 \Big) - * = \sum\limits_{i=1}^N \Big( \sum\limits_{j=1}^D x_{j}^2 + \\ + * = \sum\limits_{i=1}^N \Big( \sum\limits_{j=1}^D x_{j}^2 + * \sum\limits_{j=1}^D c_{ij}^2 -2\sum\limits_{j=1}^D x_{i}c_{ij} \Big) - * = \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{j}^2 + \\ - * \sum\limits_{i=1}^N \sum\limits_{j=1}^D c_{ij}^2 \\ + * = \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{j}^2 + + * \sum\limits_{i=1}^N \sum\limits_{j=1}^D c_{ij}^2 * -2 \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{i}c_{ij} *
* @@ -142,7 +142,7 @@ object ClusteringEvaluator * Then, the first term of the equation can be rewritten as: * *
- * \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{j}^2 = N \xi_{X} , \\ + * \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{j}^2 = N \xi_{X} , * with \xi_{X} = \sum\limits_{j=1}^D x_{j}^2 *
* @@ -152,14 +152,14 @@ object ClusteringEvaluator * thus we can name it `\Psi_{\Gamma}` * *
- * sum\limits_{i=1}^N \sum\limits_{j=1}^D c_{ij}^2 = \\ + * sum\limits_{i=1}^N \sum\limits_{j=1}^D c_{ij}^2 = * \sum\limits_{i=1}^N \xi_{C_{i}} = \Psi_{\Gamma} *
* * Last, the third element becomes * *
- * \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{i}c_{ij} = \\ + * \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{i}c_{ij} = * \sum\limits_{j=1}^D \Big(\sum\limits_{i=1}^N c_{ij} \Big) x_{i} *
* @@ -172,7 +172,7 @@ object ClusteringEvaluator * which is fixed for each cluster `\Gamma`, we have * *
- * \sum\limits_{j=1}^D \Big(\sum\limits_{i=1}^N c_{ij} \Big) x_{i} = \\ + * \sum\limits_{j=1}^D \Big(\sum\limits_{i=1}^N c_{ij} \Big) x_{i} = * \sum\limits_{j=1}^D Y_{\Gamma j} x_{i} *
* @@ -185,8 +185,8 @@ object ClusteringEvaluator * and the distance of a point to a cluster can be computed as * *
- * \frac{\sum\limits_{i=1}^N d(X, C_{i} )^2}{N} = \\ - * \frac{N\xi_{X} + \Psi_{\Gamma} - 2 \sum\limits_{j=1}^D Y_{\Gamma j} x_{i}}{N} = \\ + * \frac{\sum\limits_{i=1}^N d(X, C_{i} )^2}{N} = + * \frac{N\xi_{X} + \Psi_{\Gamma} - 2 \sum\limits_{j=1}^D Y_{\Gamma j} x_{i}}{N} = * \xi_{X} + \frac{\Psi_{\Gamma} }{N} - 2 \frac{\sum\limits_{j=1}^D Y_{\Gamma j} x_{i}}{N} *
* @@ -244,8 +244,9 @@ private[evaluation] object SquaredEuclideanSilhouette { * @param df The DataFrame which contains the input data * @param predictionCol The name of the column which contains the cluster id for the point. * @param featuresCol The name of the column which contains the feature vector of the point. - * @return A [[Map]] which associates each cluster id to a [[ClusterStats]] object (which - * contains the precomputed values `N`, `\Psi_{\Gamma}` and `Y_{\Gamma}` for a cluster). + * @return A [[scala.collection.immutable.Map]] which associates each cluster id + * to a [[ClusterStats]] object (which contains the precomputed values `N`, + * `\Psi_{\Gamma}` and `Y_{\Gamma}` for a cluster). */ def computeClusterStats( df: DataFrame, From 45d1380574ece58ff63c34ff31af6243aff16c3c Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Thu, 31 Aug 2017 15:41:24 +0200 Subject: [PATCH 10/18] Fix documentation --- .../ml/evaluation/ClusteringEvaluator.scala | 64 ++++++++++++------- 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala index 05ad612f62961..d6be184c97a2a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala @@ -39,10 +39,6 @@ import org.apache.spark.sql.types.IntegerType * means that the points in a cluster are close * to the other points in the same cluster and * far from the points of the other clusters. - * - * The implementation follows the proposal explained - * - * in this document. */ @Experimental @Since("2.3.0") @@ -93,29 +89,33 @@ object ClusteringEvaluator * The Silhouette for each point `i` is defined as: * *
+ * $$ * s_{i} = \frac{b_{i}-a_{i}}{max\{a_{i},b_{i}\}} + * $$ *
* * which can be rewritten as * *
- * s_{i}=\left\{ \begin{tabular}{cc} - * $1-\frac{a_{i}}{b_{i}}$ & if $a_{i} \leq b_{i}$ \\ - * $\frac{b_{i}}{a_{i}}-1$ & if $a_{i} \gt b_{i}$ + * $$ + * s_{i}= \begin{cases} + * 1-\frac{a_{i}}{b_{i}} & \text{if } a_{i} \leq b_{i} \\ + * \frac{b_{i}}{a_{i}}-1 & \text{if } a_{i} \gt b_{i} \end{cases} + * $$ *
* - * where `a(i)` is the average dissimilarity of `i` with all other data - * within the same cluster, `b(i)` is the lowest average dissimilarity + * where `$a_{i}$` is the average dissimilarity of `i` with all other data + * within the same cluster, `$b_{i}$` is the lowest average dissimilarity * of to any other cluster, of which `i` is not a member. - * `a(i)` can be interpreted as as how well `i` is assigned to its cluster - * (the smaller the value, the better the assignment), while `b(i)` is + * `$a_{i}$` can be interpreted as as how well `i` is assigned to its cluster + * (the smaller the value, the better the assignment), while `$b_{i}$` is * a measure of how well `i` has not been assigned to its "neighboring cluster", * ie. the nearest cluster to `i`. * * Unfortunately, the naive implementation of the algorithm requires to compute * the distance of each couple of points in the dataset. Since the computation of * the distance measure takes `D` operations - if `D` is the number of dimensions - * of each point, the computational complexity of the algorithm is `O(N^2*D)`, where + * of each point, the computational complexity of the algorithm is `O(N^2^*D)`, where * `N` is the cardinality of the dataset. Of course this is not scalable in `N`, * which is the critical number in a Big Data context. * @@ -124,9 +124,10 @@ object ClusteringEvaluator * distance measure. * * With this assumption, the average of the distance of the point `X` - * to the points `C_{i}` belonging to the cluster `\Gamma` is: + * to the points `$C_{i}$` belonging to the cluster `$\Gamma$` is: * *
+ * $$ * \sum\limits_{i=1}^N d(X, C_{i} )^2 = * \sum\limits_{i=1}^N \Big( \sum\limits_{j=1}^D (x_{j}-c_{ij})^2 \Big) * = \sum\limits_{i=1}^N \Big( \sum\limits_{j=1}^D x_{j}^2 + @@ -134,65 +135,80 @@ object ClusteringEvaluator * = \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{j}^2 + * \sum\limits_{i=1}^N \sum\limits_{j=1}^D c_{ij}^2 * -2 \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{i}c_{ij} + * $$ *
* - * where `x_{j}` is the `j`-th dimension of the point `X` and - * `c_{ij} is the `j`-th dimension of the `i`-th point in cluster `\Gamma`. + * where `$x_{j}$` is the `j`-th dimension of the point `X` and + * `$c_{ij}$` is the `j`-th dimension of the `i`-th point in cluster `$\Gamma$`. * * Then, the first term of the equation can be rewritten as: * *
- * \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{j}^2 = N \xi_{X} , - * with \xi_{X} = \sum\limits_{j=1}^D x_{j}^2 + * $$ + * \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{j}^2 = N \xi_{X} \text{ , + * with } \xi_{X} = \sum\limits_{j=1}^D x_{j}^2 + * $$ *
* - * where `\xi_{X}` is fixed for each point and it can be precomputed. + * where `$\xi_{X}$` is fixed for each point and it can be precomputed. * * Moreover, the second term is fixed for each cluster too, - * thus we can name it `\Psi_{\Gamma}` + * thus we can name it `$\Psi_{\Gamma}$` * *
- * sum\limits_{i=1}^N \sum\limits_{j=1}^D c_{ij}^2 = + * $$ + * \sum\limits_{i=1}^N \sum\limits_{j=1}^D c_{ij}^2 = * \sum\limits_{i=1}^N \xi_{C_{i}} = \Psi_{\Gamma} + * $$ *
* * Last, the third element becomes * *
+ * $$ * \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{i}c_{ij} = * \sum\limits_{j=1}^D \Big(\sum\limits_{i=1}^N c_{ij} \Big) x_{i} + * $$ *
* * thus defining the vector * *
+ * $$ * Y_{\Gamma}:Y_{\Gamma j} = \sum\limits_{i=1}^N c_{ij} , j=0, ..., D + * $$ *
* - * which is fixed for each cluster `\Gamma`, we have + * which is fixed for each cluster `$\Gamma$`, we have * *
+ * $$ * \sum\limits_{j=1}^D \Big(\sum\limits_{i=1}^N c_{ij} \Big) x_{i} = * \sum\limits_{j=1}^D Y_{\Gamma j} x_{i} + * $$ *
* * In this way, the previous equation becomes * *
+ * $$ * N\xi_{X} + \Psi_{\Gamma} - 2 \sum\limits_{j=1}^D Y_{\Gamma j} x_{i} + * $$ *
* * and the distance of a point to a cluster can be computed as * *
+ * $$ * \frac{\sum\limits_{i=1}^N d(X, C_{i} )^2}{N} = * \frac{N\xi_{X} + \Psi_{\Gamma} - 2 \sum\limits_{j=1}^D Y_{\Gamma j} x_{i}}{N} = * \xi_{X} + \frac{\Psi_{\Gamma} }{N} - 2 \frac{\sum\limits_{j=1}^D Y_{\Gamma j} x_{i}}{N} + * $$ *
* - * Thus, it is enough to precompute the constant `\xi_{X}` for each point `X` - * and the constants `\Psi_{\Gamma}` and `N` and the vector `Y_{\Gamma}` for - * each cluster `\Gamma`. + * Thus, it is enough to precompute the constant `$\xi_{X}$` for each point `X` + * and the constants `$\Psi_{\Gamma}$` and `N` and the vector `$Y_{\Gamma}$` for + * each cluster `$\Gamma$`. * * In the implementation, the precomputed values for the clusters * are distributed among the worker nodes via broadcasted variables, From 8bc5664eb67ca259f09cab7aeaf52de67c9462b0 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Fri, 1 Sep 2017 15:15:53 +0200 Subject: [PATCH 11/18] Add Experimental label --- .../org/apache/spark/ml/evaluation/ClusteringEvaluator.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala index d6be184c97a2a..790a87aca54d1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.functions.{avg, col, udf} import org.apache.spark.sql.types.IntegerType /** + * :: Experimental :: * Evaluator for clustering results. * The metric computes the Silhouette measure * using the squared Euclidean distance. From 9abe9e560ae12405a480eab325f7a707e8cb1f14 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Fri, 1 Sep 2017 15:16:18 +0200 Subject: [PATCH 12/18] Avoid BigDecimal --- .../spark/ml/evaluation/ClusteringEvaluatorSuite.scala | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala index c11cd19748756..9c54a90a73610 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala @@ -21,6 +21,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest +import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, SparkSession} @@ -54,15 +55,12 @@ class ClusteringEvaluatorSuite 0.6564679231 */ test("squared euclidean Silhouette") { - val result = BigDecimal(0.6564679231) val iris = ClusteringEvaluatorSuite.irisDataset(spark) val evaluator = new ClusteringEvaluator() .setFeaturesCol("features") .setPredictionCol("label") - val actual = BigDecimal(evaluator.evaluate(iris)) - .setScale(10, BigDecimal.RoundingMode.HALF_UP) - assertResult(result)(actual) + assert(evaluator.evaluate(iris) ~== 0.6564679231 relTol 1e-10) } } From 4f3c1dbd0ac94600e079f68fce5a032caa8fe36b Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Wed, 6 Sep 2017 15:27:10 +0200 Subject: [PATCH 13/18] Fix typos and destroy broadcast variable --- .../ml/evaluation/ClusteringEvaluator.scala | 55 +++++++++++-------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala index 790a87aca54d1..52baf713057d0 100644 --- 
a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala @@ -43,13 +43,16 @@ import org.apache.spark.sql.types.IntegerType */ @Experimental @Since("2.3.0") -class ClusteringEvaluator (val uid: String) +class ClusteringEvaluator @Since("2.3.0") (@Since("2.3.0") override val uid: String) extends Evaluator with HasPredictionCol with HasFeaturesCol with DefaultParamsWritable { + @Since("2.3.0") def this() = this(Identifiable.randomUID("cluEval")) + @Since("2.3.0") override def copy(pMap: ParamMap): ClusteringEvaluator = this.defaultCopy(pMap) + @Since("2.3.0") override def isLargerBetter: Boolean = true /** @group setParam */ @@ -74,9 +77,11 @@ class ClusteringEvaluator (val uid: String) } +@Since("2.3.0") object ClusteringEvaluator extends DefaultParamsReadable[ClusteringEvaluator] { + @Since("2.3.0") override def load(path: String): ClusteringEvaluator = super.load(path) } @@ -107,7 +112,7 @@ object ClusteringEvaluator * * where `$a_{i}$` is the average dissimilarity of `i` with all other data * within the same cluster, `$b_{i}$` is the lowest average dissimilarity - * of to any other cluster, of which `i` is not a member. + * of `i` to any other cluster, of which `i` is not a member. * `$a_{i}$` can be interpreted as as how well `i` is assigned to its cluster * (the smaller the value, the better the assignment), while `$b_{i}$` is * a measure of how well `i` has not been assigned to its "neighboring cluster", @@ -124,18 +129,18 @@ object ClusteringEvaluator * and parallel implementation of the Silhouette using the squared Euclidean * distance measure. * - * With this assumption, the average of the distance of the point `X` + * With this assumption, the total distance of the point `X` * to the points `$C_{i}$` belonging to the cluster `$\Gamma$` is: * *
* $$ - * \sum\limits_{i=1}^N d(X, C_{i} )^2 = + * \sum\limits_{i=1}^N d(X, C_{i} ) = * \sum\limits_{i=1}^N \Big( \sum\limits_{j=1}^D (x_{j}-c_{ij})^2 \Big) * = \sum\limits_{i=1}^N \Big( \sum\limits_{j=1}^D x_{j}^2 + - * \sum\limits_{j=1}^D c_{ij}^2 -2\sum\limits_{j=1}^D x_{i}c_{ij} \Big) + * \sum\limits_{j=1}^D c_{ij}^2 -2\sum\limits_{j=1}^D x_{j}c_{ij} \Big) * = \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{j}^2 + * \sum\limits_{i=1}^N \sum\limits_{j=1}^D c_{ij}^2 - * -2 \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{i}c_{ij} + * -2 \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{j}c_{ij} * $$ *
* @@ -167,8 +172,8 @@ object ClusteringEvaluator * *
* $$ - * \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{i}c_{ij} = - * \sum\limits_{j=1}^D \Big(\sum\limits_{i=1}^N c_{ij} \Big) x_{i} + * \sum\limits_{i=1}^N \sum\limits_{j=1}^D x_{j}c_{ij} = + * \sum\limits_{j=1}^D \Big(\sum\limits_{i=1}^N c_{ij} \Big) x_{j} * $$ *
* @@ -184,8 +189,8 @@ object ClusteringEvaluator * *
* $$ - * \sum\limits_{j=1}^D \Big(\sum\limits_{i=1}^N c_{ij} \Big) x_{i} = - * \sum\limits_{j=1}^D Y_{\Gamma j} x_{i} + * \sum\limits_{j=1}^D \Big(\sum\limits_{i=1}^N c_{ij} \Big) x_{j} = + * \sum\limits_{j=1}^D Y_{\Gamma j} x_{j} * $$ *
* @@ -193,22 +198,22 @@ object ClusteringEvaluator * *
* $$ - * N\xi_{X} + \Psi_{\Gamma} - 2 \sum\limits_{j=1}^D Y_{\Gamma j} x_{i} + * N\xi_{X} + \Psi_{\Gamma} - 2 \sum\limits_{j=1}^D Y_{\Gamma j} x_{j} * $$ *
* - * and the distance of a point to a cluster can be computed as + * and the average distance of a point to a cluster can be computed as * *
* $$ * \frac{\sum\limits_{i=1}^N d(X, C_{i} )^2}{N} = - * \frac{N\xi_{X} + \Psi_{\Gamma} - 2 \sum\limits_{j=1}^D Y_{\Gamma j} x_{i}}{N} = - * \xi_{X} + \frac{\Psi_{\Gamma} }{N} - 2 \frac{\sum\limits_{j=1}^D Y_{\Gamma j} x_{i}}{N} + * \frac{N\xi_{X} + \Psi_{\Gamma} - 2 \sum\limits_{j=1}^D Y_{\Gamma j} x_{j}}{N} = + * \xi_{X} + \frac{\Psi_{\Gamma} }{N} - 2 \frac{\sum\limits_{j=1}^D Y_{\Gamma j} x_{j}}{N} * $$ *
* - * Thus, it is enough to precompute the constant `$\xi_{X}$` for each point `X` - * and the constants `$\Psi_{\Gamma}$` and `N` and the vector `$Y_{\Gamma}$` for + * Thus, it is enough to precompute: the constant `$\xi_{X}$` for each point `X`; the + * constants `$\Psi_{\Gamma}$`, `N` and the vector `$Y_{\Gamma}$` for * each cluster `$\Gamma$`. * * In the implementation, the precomputed values for the clusters @@ -242,7 +247,7 @@ private[evaluation] object SquaredEuclideanSilhouette { * @param sc `SparkContext` to be used */ def registerKryoClasses(sc: SparkContext): Unit = { - if (! kryoRegistrationPerformed) { + if (!kryoRegistrationPerformed) { sc.getConf.registerKryoClasses( Array( classOf[SquaredEuclideanSilhouette.ClusterStats] @@ -263,7 +268,7 @@ private[evaluation] object SquaredEuclideanSilhouette { * @param featuresCol The name of the column which contains the feature vector of the point. * @return A [[scala.collection.immutable.Map]] which associates each cluster id * to a [[ClusterStats]] object (which contains the precomputed values `N`, - * `\Psi_{\Gamma}` and `Y_{\Gamma}` for a cluster). + * `$\Psi_{\Gamma}$` and `$Y_{\Gamma}$` for a cluster). */ def computeClusterStats( df: DataFrame, @@ -307,7 +312,7 @@ private[evaluation] object SquaredEuclideanSilhouette { * @param broadcastedClustersMap A map of the precomputed values for each cluster. * @param features The [[org.apache.spark.ml.linalg.Vector]] representing the current point. * @param clusterId The id of the cluster the current point belongs to. - * @param squaredNorm The `\Xi_{X}` (which is the squared norm) precomputed for the point. + * @param squaredNorm The `$\Xi_{X}$` (which is the squared norm) precomputed for the point. * @return The Silhouette for the point. */ def computeSilhouetteCoefficient( @@ -365,7 +370,8 @@ private[evaluation] object SquaredEuclideanSilhouette { * @param featuresCol The name of the column which contains the feature vector of the point. * @return The average of the Silhouette values of the clustered data. 
*/ - def computeSilhouetteScore(dataset: Dataset[_], + def computeSilhouetteScore( + dataset: Dataset[_], predictionCol: String, featuresCol: String): Double = { SquaredEuclideanSilhouette.registerKryoClasses(dataset.sparkSession.sparkContext) @@ -375,8 +381,7 @@ private[evaluation] object SquaredEuclideanSilhouette { } val dfWithSquaredNorm = dataset.withColumn("squaredNorm", squaredNormUDF(col(featuresCol))) - // compute aggregate values for clusters - // needed by the algorithm + // compute aggregate values for clusters needed by the algorithm val clustersStatsMap = SquaredEuclideanSilhouette .computeClusterStats(dfWithSquaredNorm, predictionCol, featuresCol) @@ -386,11 +391,15 @@ private[evaluation] object SquaredEuclideanSilhouette { computeSilhouetteCoefficient(bClustersStatsMap, _: Vector, _: Int, _: Double) } - dfWithSquaredNorm + val silhouetteScore = dfWithSquaredNorm .select(avg( computeSilhouetteCoefficientUDF(col(featuresCol), col(predictionCol), col("squaredNorm")) )) .collect()(0) .getDouble(0) + + bClustersStatsMap.destroy() + + silhouetteScore } } From a99c4299a628bbf35ea19a90c1edb28bea8f895d Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Wed, 6 Sep 2017 15:28:15 +0200 Subject: [PATCH 14/18] Use libsvm format instead of csv --- mllib/src/test/resources/test-data/iris.csv | 150 ------------------ .../src/test/resources/test-data/iris.libsvm | 150 ++++++++++++++++++ .../evaluation/ClusteringEvaluatorSuite.scala | 24 +-- 3 files changed, 156 insertions(+), 168 deletions(-) delete mode 100644 mllib/src/test/resources/test-data/iris.csv create mode 100644 mllib/src/test/resources/test-data/iris.libsvm diff --git a/mllib/src/test/resources/test-data/iris.csv b/mllib/src/test/resources/test-data/iris.csv deleted file mode 100644 index 1de4bbac2d427..0000000000000 --- a/mllib/src/test/resources/test-data/iris.csv +++ /dev/null @@ -1,150 +0,0 @@ -5.1,3.5,1.4,0.2,0 -4.9,3.0,1.4,0.2,0 -4.7,3.2,1.3,0.2,0 -4.6,3.1,1.5,0.2,0 -5.0,3.6,1.4,0.2,0 -5.4,3.9,1.7,0.4,0 -4.6,3.4,1.4,0.3,0 -5.0,3.4,1.5,0.2,0 -4.4,2.9,1.4,0.2,0 -4.9,3.1,1.5,0.1,0 -5.4,3.7,1.5,0.2,0 -4.8,3.4,1.6,0.2,0 -4.8,3.0,1.4,0.1,0 -4.3,3.0,1.1,0.1,0 -5.8,4.0,1.2,0.2,0 -5.7,4.4,1.5,0.4,0 -5.4,3.9,1.3,0.4,0 -5.1,3.5,1.4,0.3,0 -5.7,3.8,1.7,0.3,0 -5.1,3.8,1.5,0.3,0 -5.4,3.4,1.7,0.2,0 -5.1,3.7,1.5,0.4,0 -4.6,3.6,1.0,0.2,0 -5.1,3.3,1.7,0.5,0 -4.8,3.4,1.9,0.2,0 -5.0,3.0,1.6,0.2,0 -5.0,3.4,1.6,0.4,0 -5.2,3.5,1.5,0.2,0 -5.2,3.4,1.4,0.2,0 -4.7,3.2,1.6,0.2,0 -4.8,3.1,1.6,0.2,0 -5.4,3.4,1.5,0.4,0 -5.2,4.1,1.5,0.1,0 -5.5,4.2,1.4,0.2,0 -4.9,3.1,1.5,0.1,0 -5.0,3.2,1.2,0.2,0 -5.5,3.5,1.3,0.2,0 -4.9,3.1,1.5,0.1,0 -4.4,3.0,1.3,0.2,0 -5.1,3.4,1.5,0.2,0 -5.0,3.5,1.3,0.3,0 -4.5,2.3,1.3,0.3,0 -4.4,3.2,1.3,0.2,0 -5.0,3.5,1.6,0.6,0 -5.1,3.8,1.9,0.4,0 -4.8,3.0,1.4,0.3,0 -5.1,3.8,1.6,0.2,0 -4.6,3.2,1.4,0.2,0 -5.3,3.7,1.5,0.2,0 -5.0,3.3,1.4,0.2,0 -7.0,3.2,4.7,1.4,1 -6.4,3.2,4.5,1.5,1 -6.9,3.1,4.9,1.5,1 -5.5,2.3,4.0,1.3,1 -6.5,2.8,4.6,1.5,1 -5.7,2.8,4.5,1.3,1 -6.3,3.3,4.7,1.6,1 -4.9,2.4,3.3,1.0,1 -6.6,2.9,4.6,1.3,1 -5.2,2.7,3.9,1.4,1 -5.0,2.0,3.5,1.0,1 -5.9,3.0,4.2,1.5,1 -6.0,2.2,4.0,1.0,1 -6.1,2.9,4.7,1.4,1 -5.6,2.9,3.6,1.3,1 -6.7,3.1,4.4,1.4,1 -5.6,3.0,4.5,1.5,1 -5.8,2.7,4.1,1.0,1 -6.2,2.2,4.5,1.5,1 -5.6,2.5,3.9,1.1,1 -5.9,3.2,4.8,1.8,1 -6.1,2.8,4.0,1.3,1 -6.3,2.5,4.9,1.5,1 -6.1,2.8,4.7,1.2,1 -6.4,2.9,4.3,1.3,1 -6.6,3.0,4.4,1.4,1 -6.8,2.8,4.8,1.4,1 -6.7,3.0,5.0,1.7,1 -6.0,2.9,4.5,1.5,1 -5.7,2.6,3.5,1.0,1 -5.5,2.4,3.8,1.1,1 -5.5,2.4,3.7,1.0,1 -5.8,2.7,3.9,1.2,1 -6.0,2.7,5.1,1.6,1 -5.4,3.0,4.5,1.5,1 -6.0,3.4,4.5,1.6,1 -6.7,3.1,4.7,1.5,1 
-6.3,2.3,4.4,1.3,1 -5.6,3.0,4.1,1.3,1 -5.5,2.5,4.0,1.3,1 -5.5,2.6,4.4,1.2,1 -6.1,3.0,4.6,1.4,1 -5.8,2.6,4.0,1.2,1 -5.0,2.3,3.3,1.0,1 -5.6,2.7,4.2,1.3,1 -5.7,3.0,4.2,1.2,1 -5.7,2.9,4.2,1.3,1 -6.2,2.9,4.3,1.3,1 -5.1,2.5,3.0,1.1,1 -5.7,2.8,4.1,1.3,1 -6.3,3.3,6.0,2.5,2 -5.8,2.7,5.1,1.9,2 -7.1,3.0,5.9,2.1,2 -6.3,2.9,5.6,1.8,2 -6.5,3.0,5.8,2.2,2 -7.6,3.0,6.6,2.1,2 -4.9,2.5,4.5,1.7,2 -7.3,2.9,6.3,1.8,2 -6.7,2.5,5.8,1.8,2 -7.2,3.6,6.1,2.5,2 -6.5,3.2,5.1,2.0,2 -6.4,2.7,5.3,1.9,2 -6.8,3.0,5.5,2.1,2 -5.7,2.5,5.0,2.0,2 -5.8,2.8,5.1,2.4,2 -6.4,3.2,5.3,2.3,2 -6.5,3.0,5.5,1.8,2 -7.7,3.8,6.7,2.2,2 -7.7,2.6,6.9,2.3,2 -6.0,2.2,5.0,1.5,2 -6.9,3.2,5.7,2.3,2 -5.6,2.8,4.9,2.0,2 -7.7,2.8,6.7,2.0,2 -6.3,2.7,4.9,1.8,2 -6.7,3.3,5.7,2.1,2 -7.2,3.2,6.0,1.8,2 -6.2,2.8,4.8,1.8,2 -6.1,3.0,4.9,1.8,2 -6.4,2.8,5.6,2.1,2 -7.2,3.0,5.8,1.6,2 -7.4,2.8,6.1,1.9,2 -7.9,3.8,6.4,2.0,2 -6.4,2.8,5.6,2.2,2 -6.3,2.8,5.1,1.5,2 -6.1,2.6,5.6,1.4,2 -7.7,3.0,6.1,2.3,2 -6.3,3.4,5.6,2.4,2 -6.4,3.1,5.5,1.8,2 -6.0,3.0,4.8,1.8,2 -6.9,3.1,5.4,2.1,2 -6.7,3.1,5.6,2.4,2 -6.9,3.1,5.1,2.3,2 -5.8,2.7,5.1,1.9,2 -6.8,3.2,5.9,2.3,2 -6.7,3.3,5.7,2.5,2 -6.7,3.0,5.2,2.3,2 -6.3,2.5,5.0,1.9,2 -6.5,3.0,5.2,2.0,2 -6.2,3.4,5.4,2.3,2 -5.9,3.0,5.1,1.8,2 \ No newline at end of file diff --git a/mllib/src/test/resources/test-data/iris.libsvm b/mllib/src/test/resources/test-data/iris.libsvm new file mode 100644 index 0000000000000..db959010255d0 --- /dev/null +++ b/mllib/src/test/resources/test-data/iris.libsvm @@ -0,0 +1,150 @@ +0.0 1:5.1 2:3.5 3:1.4 4:0.2 +0.0 1:4.9 2:3.0 3:1.4 4:0.2 +0.0 1:4.7 2:3.2 3:1.3 4:0.2 +0.0 1:4.6 2:3.1 3:1.5 4:0.2 +0.0 1:5.0 2:3.6 3:1.4 4:0.2 +0.0 1:5.4 2:3.9 3:1.7 4:0.4 +0.0 1:4.6 2:3.4 3:1.4 4:0.3 +0.0 1:5.0 2:3.4 3:1.5 4:0.2 +0.0 1:4.4 2:2.9 3:1.4 4:0.2 +0.0 1:4.9 2:3.1 3:1.5 4:0.1 +0.0 1:5.4 2:3.7 3:1.5 4:0.2 +0.0 1:4.8 2:3.4 3:1.6 4:0.2 +0.0 1:4.8 2:3.0 3:1.4 4:0.1 +0.0 1:4.3 2:3.0 3:1.1 4:0.1 +0.0 1:5.8 2:4.0 3:1.2 4:0.2 +0.0 1:5.7 2:4.4 3:1.5 4:0.4 +0.0 1:5.4 2:3.9 3:1.3 4:0.4 +0.0 1:5.1 2:3.5 3:1.4 4:0.3 +0.0 1:5.7 2:3.8 3:1.7 4:0.3 +0.0 1:5.1 2:3.8 3:1.5 4:0.3 +0.0 1:5.4 2:3.4 3:1.7 4:0.2 +0.0 1:5.1 2:3.7 3:1.5 4:0.4 +0.0 1:4.6 2:3.6 3:1.0 4:0.2 +0.0 1:5.1 2:3.3 3:1.7 4:0.5 +0.0 1:4.8 2:3.4 3:1.9 4:0.2 +0.0 1:5.0 2:3.0 3:1.6 4:0.2 +0.0 1:5.0 2:3.4 3:1.6 4:0.4 +0.0 1:5.2 2:3.5 3:1.5 4:0.2 +0.0 1:5.2 2:3.4 3:1.4 4:0.2 +0.0 1:4.7 2:3.2 3:1.6 4:0.2 +0.0 1:4.8 2:3.1 3:1.6 4:0.2 +0.0 1:5.4 2:3.4 3:1.5 4:0.4 +0.0 1:5.2 2:4.1 3:1.5 4:0.1 +0.0 1:5.5 2:4.2 3:1.4 4:0.2 +0.0 1:4.9 2:3.1 3:1.5 4:0.1 +0.0 1:5.0 2:3.2 3:1.2 4:0.2 +0.0 1:5.5 2:3.5 3:1.3 4:0.2 +0.0 1:4.9 2:3.1 3:1.5 4:0.1 +0.0 1:4.4 2:3.0 3:1.3 4:0.2 +0.0 1:5.1 2:3.4 3:1.5 4:0.2 +0.0 1:5.0 2:3.5 3:1.3 4:0.3 +0.0 1:4.5 2:2.3 3:1.3 4:0.3 +0.0 1:4.4 2:3.2 3:1.3 4:0.2 +0.0 1:5.0 2:3.5 3:1.6 4:0.6 +0.0 1:5.1 2:3.8 3:1.9 4:0.4 +0.0 1:4.8 2:3.0 3:1.4 4:0.3 +0.0 1:5.1 2:3.8 3:1.6 4:0.2 +0.0 1:4.6 2:3.2 3:1.4 4:0.2 +0.0 1:5.3 2:3.7 3:1.5 4:0.2 +0.0 1:5.0 2:3.3 3:1.4 4:0.2 +1.0 1:7.0 2:3.2 3:4.7 4:1.4 +1.0 1:6.4 2:3.2 3:4.5 4:1.5 +1.0 1:6.9 2:3.1 3:4.9 4:1.5 +1.0 1:5.5 2:2.3 3:4.0 4:1.3 +1.0 1:6.5 2:2.8 3:4.6 4:1.5 +1.0 1:5.7 2:2.8 3:4.5 4:1.3 +1.0 1:6.3 2:3.3 3:4.7 4:1.6 +1.0 1:4.9 2:2.4 3:3.3 4:1.0 +1.0 1:6.6 2:2.9 3:4.6 4:1.3 +1.0 1:5.2 2:2.7 3:3.9 4:1.4 +1.0 1:5.0 2:2.0 3:3.5 4:1.0 +1.0 1:5.9 2:3.0 3:4.2 4:1.5 +1.0 1:6.0 2:2.2 3:4.0 4:1.0 +1.0 1:6.1 2:2.9 3:4.7 4:1.4 +1.0 1:5.6 2:2.9 3:3.6 4:1.3 +1.0 1:6.7 2:3.1 3:4.4 4:1.4 +1.0 1:5.6 2:3.0 3:4.5 4:1.5 +1.0 1:5.8 2:2.7 3:4.1 4:1.0 +1.0 1:6.2 2:2.2 3:4.5 4:1.5 +1.0 1:5.6 2:2.5 3:3.9 4:1.1 +1.0 1:5.9 2:3.2 3:4.8 4:1.8 +1.0 1:6.1 2:2.8 
3:4.0 4:1.3 +1.0 1:6.3 2:2.5 3:4.9 4:1.5 +1.0 1:6.1 2:2.8 3:4.7 4:1.2 +1.0 1:6.4 2:2.9 3:4.3 4:1.3 +1.0 1:6.6 2:3.0 3:4.4 4:1.4 +1.0 1:6.8 2:2.8 3:4.8 4:1.4 +1.0 1:6.7 2:3.0 3:5.0 4:1.7 +1.0 1:6.0 2:2.9 3:4.5 4:1.5 +1.0 1:5.7 2:2.6 3:3.5 4:1.0 +1.0 1:5.5 2:2.4 3:3.8 4:1.1 +1.0 1:5.5 2:2.4 3:3.7 4:1.0 +1.0 1:5.8 2:2.7 3:3.9 4:1.2 +1.0 1:6.0 2:2.7 3:5.1 4:1.6 +1.0 1:5.4 2:3.0 3:4.5 4:1.5 +1.0 1:6.0 2:3.4 3:4.5 4:1.6 +1.0 1:6.7 2:3.1 3:4.7 4:1.5 +1.0 1:6.3 2:2.3 3:4.4 4:1.3 +1.0 1:5.6 2:3.0 3:4.1 4:1.3 +1.0 1:5.5 2:2.5 3:4.0 4:1.3 +1.0 1:5.5 2:2.6 3:4.4 4:1.2 +1.0 1:6.1 2:3.0 3:4.6 4:1.4 +1.0 1:5.8 2:2.6 3:4.0 4:1.2 +1.0 1:5.0 2:2.3 3:3.3 4:1.0 +1.0 1:5.6 2:2.7 3:4.2 4:1.3 +1.0 1:5.7 2:3.0 3:4.2 4:1.2 +1.0 1:5.7 2:2.9 3:4.2 4:1.3 +1.0 1:6.2 2:2.9 3:4.3 4:1.3 +1.0 1:5.1 2:2.5 3:3.0 4:1.1 +1.0 1:5.7 2:2.8 3:4.1 4:1.3 +2.0 1:6.3 2:3.3 3:6.0 4:2.5 +2.0 1:5.8 2:2.7 3:5.1 4:1.9 +2.0 1:7.1 2:3.0 3:5.9 4:2.1 +2.0 1:6.3 2:2.9 3:5.6 4:1.8 +2.0 1:6.5 2:3.0 3:5.8 4:2.2 +2.0 1:7.6 2:3.0 3:6.6 4:2.1 +2.0 1:4.9 2:2.5 3:4.5 4:1.7 +2.0 1:7.3 2:2.9 3:6.3 4:1.8 +2.0 1:6.7 2:2.5 3:5.8 4:1.8 +2.0 1:7.2 2:3.6 3:6.1 4:2.5 +2.0 1:6.5 2:3.2 3:5.1 4:2.0 +2.0 1:6.4 2:2.7 3:5.3 4:1.9 +2.0 1:6.8 2:3.0 3:5.5 4:2.1 +2.0 1:5.7 2:2.5 3:5.0 4:2.0 +2.0 1:5.8 2:2.8 3:5.1 4:2.4 +2.0 1:6.4 2:3.2 3:5.3 4:2.3 +2.0 1:6.5 2:3.0 3:5.5 4:1.8 +2.0 1:7.7 2:3.8 3:6.7 4:2.2 +2.0 1:7.7 2:2.6 3:6.9 4:2.3 +2.0 1:6.0 2:2.2 3:5.0 4:1.5 +2.0 1:6.9 2:3.2 3:5.7 4:2.3 +2.0 1:5.6 2:2.8 3:4.9 4:2.0 +2.0 1:7.7 2:2.8 3:6.7 4:2.0 +2.0 1:6.3 2:2.7 3:4.9 4:1.8 +2.0 1:6.7 2:3.3 3:5.7 4:2.1 +2.0 1:7.2 2:3.2 3:6.0 4:1.8 +2.0 1:6.2 2:2.8 3:4.8 4:1.8 +2.0 1:6.1 2:3.0 3:4.9 4:1.8 +2.0 1:6.4 2:2.8 3:5.6 4:2.1 +2.0 1:7.2 2:3.0 3:5.8 4:1.6 +2.0 1:7.4 2:2.8 3:6.1 4:1.9 +2.0 1:7.9 2:3.8 3:6.4 4:2.0 +2.0 1:6.4 2:2.8 3:5.6 4:2.2 +2.0 1:6.3 2:2.8 3:5.1 4:1.5 +2.0 1:6.1 2:2.6 3:5.6 4:1.4 +2.0 1:7.7 2:3.0 3:6.1 4:2.3 +2.0 1:6.3 2:3.4 3:5.6 4:2.4 +2.0 1:6.4 2:3.1 3:5.5 4:1.8 +2.0 1:6.0 2:3.0 3:4.8 4:1.8 +2.0 1:6.9 2:3.1 3:5.4 4:2.1 +2.0 1:6.7 2:3.1 3:5.6 4:2.4 +2.0 1:6.9 2:3.1 3:5.1 4:2.3 +2.0 1:5.8 2:2.7 3:5.1 4:1.9 +2.0 1:6.8 2:3.2 3:5.9 4:2.3 +2.0 1:6.7 2:3.3 3:5.7 4:2.5 +2.0 1:6.7 2:3.0 3:5.2 4:2.3 +2.0 1:6.3 2:2.5 3:5.0 4:1.9 +2.0 1:6.5 2:3.0 3:5.2 4:2.0 +2.0 1:6.2 2:3.4 3:5.4 4:2.3 +2.0 1:5.9 2:3.0 3:5.1 4:1.8 diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala index 9c54a90a73610..92d96becbbd4a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala @@ -18,16 +18,14 @@ package org.apache.spark.ml.evaluation import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.types.IntegerType -private[ml] case class ClusteringEvaluationTestData(features: Vector, label: Int) - class ClusteringEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { @@ -56,34 +54,24 @@ class ClusteringEvaluatorSuite */ test("squared euclidean Silhouette") { val iris = ClusteringEvaluatorSuite.irisDataset(spark) + .withColumn("label", $"label".cast(IntegerType)) val evaluator = new 
ClusteringEvaluator() .setFeaturesCol("features") .setPredictionCol("label") - assert(evaluator.evaluate(iris) ~== 0.6564679231 relTol 1e-10) + assert(evaluator.evaluate(iris) ~== 0.6564679231 relTol 1e-5) } } object ClusteringEvaluatorSuite { def irisDataset(spark: SparkSession): DataFrame = { - import spark.implicits._ - val irisCsvPath = Thread.currentThread() + val irisPath = Thread.currentThread() .getContextClassLoader - .getResource("test-data/iris.csv") + .getResource("test-data/iris.libsvm") .toString - spark.sparkContext - .textFile(irisCsvPath) - .map { - line => - val splits = line.split(",") - ClusteringEvaluationTestData( - Vectors.dense(splits.take(splits.length-1).map(_.toDouble)), - splits(splits.length-1).toInt - ) - } - .toDF() + spark.read.format("libsvm").load(irisPath) } } From a9006526fcfa79dcaad0a2b91c250a23fc14eaf0 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Wed, 6 Sep 2017 16:59:21 +0200 Subject: [PATCH 15/18] Added metricName param --- .../ml/evaluation/ClusteringEvaluator.scala | 40 ++++++++++++++++--- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala index 52baf713057d0..203aac7b7ec14 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala @@ -21,7 +21,7 @@ import org.apache.spark.SparkContext import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors, VectorUDT} -import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.sql.{DataFrame, Dataset} @@ -63,16 +63,44 @@ class ClusteringEvaluator @Since("2.3.0") (@Since("2.3.0") override val uid: Str @Since("2.3.0") def setFeaturesCol(value: String): this.type = set(featuresCol, value) + /** + * param for metric name in evaluation + * (supports `"squaredSilhouette"` (default)) + * @group param + */ + @Since("2.3.0") + val metricName: Param[String] = { + val allowedParams = ParamValidators.inArray(Array("squaredSilhouette")) + new Param( + this, + "metricName", + "metric name in evaluation (squaredSilhouette)", + allowedParams + ) + } + + /** @group getParam */ + @Since("2.3.0") + def getMetricName: String = $(metricName) + + /** @group setParam */ + @Since("2.3.0") + def setMetricName(value: String): this.type = set(metricName, value) + + setDefault(metricName -> "squaredSilhouette") + @Since("2.3.0") override def evaluate(dataset: Dataset[_]): Double = { SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) - SquaredEuclideanSilhouette.computeSilhouetteScore( - dataset, - $(predictionCol), - $(featuresCol) - ) + $(metricName) match { + case "squaredSilhouette" => SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, + $(predictionCol), + $(featuresCol) + ) + } } } From 7b8149a3f5fab0f5667b342d76fe3ea1bfc6ce81 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Wed, 6 Sep 2017 17:00:50 +0200 Subject: [PATCH 16/18] Assert number of clusters is greater than one --- 
.../spark/ml/evaluation/ClusteringEvaluator.scala | 4 ++++ .../ml/evaluation/ClusteringEvaluatorSuite.scala | 14 ++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala index 203aac7b7ec14..724534265b03f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala @@ -94,6 +94,10 @@ class ClusteringEvaluator @Since("2.3.0") (@Since("2.3.0") override val uid: Str SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + // Silhouette is reasonable only when the number of clusters is grater then 1 + assert(dataset.select($(predictionCol)).distinct().count() > 1, + "Number of clusters must be greater than one.") + $(metricName) match { case "squaredSilhouette" => SquaredEuclideanSilhouette.computeSilhouetteScore( dataset, diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala index 92d96becbbd4a..f42c6f0235a10 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala @@ -62,6 +62,20 @@ class ClusteringEvaluatorSuite assert(evaluator.evaluate(iris) ~== 0.6564679231 relTol 1e-5) } + test("number of clusters must be greater than one") { + val iris = ClusteringEvaluatorSuite.irisDataset(spark) + .withColumn("label", $"label".cast(IntegerType)) + .where($"label" === 0) + val evaluator = new ClusteringEvaluator() + .setFeaturesCol("features") + .setPredictionCol("label") + + val e = intercept[AssertionError]{ + evaluator.evaluate(iris) + } + assert(e.getMessage.contains("Number of clusters must be greater than one")) + } + } object ClusteringEvaluatorSuite { From b0b7853d68c1c79bd49d6e290d3c96fe9e3af6ea Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Mon, 11 Sep 2017 14:16:48 +0200 Subject: [PATCH 17/18] Fix comments --- .../ml/evaluation/ClusteringEvaluator.scala | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala index 724534265b03f..1e6639555cce7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala @@ -65,16 +65,16 @@ class ClusteringEvaluator @Since("2.3.0") (@Since("2.3.0") override val uid: Str /** * param for metric name in evaluation - * (supports `"squaredSilhouette"` (default)) + * (supports `"silhouette"` (default)) * @group param */ @Since("2.3.0") val metricName: Param[String] = { - val allowedParams = ParamValidators.inArray(Array("squaredSilhouette")) + val allowedParams = ParamValidators.inArray(Array("silhouette")) new Param( this, "metricName", - "metric name in evaluation (squaredSilhouette)", + "metric name in evaluation (silhouette)", allowedParams ) } @@ -87,19 +87,15 @@ class ClusteringEvaluator @Since("2.3.0") (@Since("2.3.0") override val uid: Str @Since("2.3.0") def setMetricName(value: String): this.type = set(metricName, value) - 
setDefault(metricName -> "squaredSilhouette") + setDefault(metricName -> "silhouette") @Since("2.3.0") override def evaluate(dataset: Dataset[_]): Double = { SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) - // Silhouette is reasonable only when the number of clusters is grater then 1 - assert(dataset.select($(predictionCol)).distinct().count() > 1, - "Number of clusters must be greater than one.") - $(metricName) match { - case "squaredSilhouette" => SquaredEuclideanSilhouette.computeSilhouetteScore( + case "silhouette" => SquaredEuclideanSilhouette.computeSilhouetteScore( dataset, $(predictionCol), $(featuresCol) @@ -145,7 +141,7 @@ object ClusteringEvaluator * where `$a_{i}$` is the average dissimilarity of `i` with all other data * within the same cluster, `$b_{i}$` is the lowest average dissimilarity * of `i` to any other cluster, of which `i` is not a member. - * `$a_{i}$` can be interpreted as as how well `i` is assigned to its cluster + * `$a_{i}$` can be interpreted as how well `i` is assigned to its cluster * (the smaller the value, the better the assignment), while `$b_{i}$` is * a measure of how well `i` has not been assigned to its "neighboring cluster", * ie. the nearest cluster to `i`. @@ -238,7 +234,7 @@ object ClusteringEvaluator * *
* $$ - * \frac{\sum\limits_{i=1}^N d(X, C_{i} )^2}{N} = + * \frac{\sum\limits_{i=1}^N d(X, C_{i} )}{N} = * \frac{N\xi_{X} + \Psi_{\Gamma} - 2 \sum\limits_{j=1}^D Y_{\Gamma j} x_{j}}{N} = * \xi_{X} + \frac{\Psi_{\Gamma} }{N} - 2 \frac{\sum\limits_{j=1}^D Y_{\Gamma j} x_{j}}{N} * $$ @@ -296,7 +292,8 @@ private[evaluation] object SquaredEuclideanSilhouette { * about a cluster which are needed by the algorithm. * * @param df The DataFrame which contains the input data - * @param predictionCol The name of the column which contains the cluster id for the point. + * @param predictionCol The name of the column which contains the predicted cluster id + * for the point. * @param featuresCol The name of the column which contains the feature vector of the point. * @return A [[scala.collection.immutable.Map]] which associates each cluster id * to a [[ClusterStats]] object (which contains the precomputed values `N`, @@ -398,7 +395,8 @@ private[evaluation] object SquaredEuclideanSilhouette { * Compute the mean Silhouette values of all samples. * * @param dataset The input dataset (previously clustered) on which compute the Silhouette. - * @param predictionCol The name of the column which contains the cluster id for the point. + * @param predictionCol The name of the column which contains the predicted cluster id + * for the point. * @param featuresCol The name of the column which contains the feature vector of the point. * @return The average of the Silhouette values of the clustered data. */ @@ -417,6 +415,9 @@ private[evaluation] object SquaredEuclideanSilhouette { val clustersStatsMap = SquaredEuclideanSilhouette .computeClusterStats(dfWithSquaredNorm, predictionCol, featuresCol) + // Silhouette is reasonable only when the number of clusters is grater then 1 + assert(clustersStatsMap.size > 1, "Number of clusters must be greater than one.") + val bClustersStatsMap = dataset.sparkSession.sparkContext.broadcast(clustersStatsMap) val computeSilhouetteCoefficientUDF = udf { From a7c14818283467276a8f7eaa30b074a0f25237dc Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Tue, 12 Sep 2017 10:05:38 +0200 Subject: [PATCH 18/18] Support all numeric types for prediction and minor style fix --- .../ml/evaluation/ClusteringEvaluator.scala | 34 +++++++++---------- .../evaluation/ClusteringEvaluatorSuite.scala | 4 +-- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala index 1e6639555cce7..d6ec5223237bb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringEvaluator.scala @@ -26,10 +26,11 @@ import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.{avg, col, udf} -import org.apache.spark.sql.types.IntegerType +import org.apache.spark.sql.types.DoubleType /** * :: Experimental :: + * * Evaluator for clustering results. * The metric computes the Silhouette measure * using the squared Euclidean distance. 
@@ -72,11 +73,7 @@ class ClusteringEvaluator @Since("2.3.0") (@Since("2.3.0") override val uid: Str val metricName: Param[String] = { val allowedParams = ParamValidators.inArray(Array("silhouette")) new Param( - this, - "metricName", - "metric name in evaluation (silhouette)", - allowedParams - ) + this, "metricName", "metric name in evaluation (silhouette)", allowedParams) } /** @group getParam */ @@ -92,13 +89,12 @@ class ClusteringEvaluator @Since("2.3.0") (@Since("2.3.0") override val uid: Str @Since("2.3.0") override def evaluate(dataset: Dataset[_]): Double = { SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) - SchemaUtils.checkColumnType(dataset.schema, $(predictionCol), IntegerType) + SchemaUtils.checkNumericType(dataset.schema, $(predictionCol)) $(metricName) match { - case "silhouette" => SquaredEuclideanSilhouette.computeSilhouetteScore( - dataset, - $(predictionCol), - $(featuresCol) + case "silhouette" => + SquaredEuclideanSilhouette.computeSilhouetteScore( + dataset, $(predictionCol), $(featuresCol) ) } } @@ -302,11 +298,12 @@ private[evaluation] object SquaredEuclideanSilhouette { def computeClusterStats( df: DataFrame, predictionCol: String, - featuresCol: String): Map[Int, ClusterStats] = { + featuresCol: String): Map[Double, ClusterStats] = { val numFeatures = df.select(col(featuresCol)).first().getAs[Vector](0).size - val clustersStatsRDD = df.select(col(predictionCol), col(featuresCol), col("squaredNorm")) + val clustersStatsRDD = df.select( + col(predictionCol).cast(DoubleType), col(featuresCol), col("squaredNorm")) .rdd - .map { row => (row.getInt(0), (row.getAs[Vector](1), row.getDouble(2))) } + .map { row => (row.getDouble(0), (row.getAs[Vector](1), row.getDouble(2))) } .aggregateByKey[(DenseVector, Double, Long)]((Vectors.zeros(numFeatures).toDense, 0.0, 0L))( seqOp = { case ( @@ -345,9 +342,9 @@ private[evaluation] object SquaredEuclideanSilhouette { * @return The Silhouette for the point. 
*/ def computeSilhouetteCoefficient( - broadcastedClustersMap: Broadcast[Map[Int, ClusterStats]], + broadcastedClustersMap: Broadcast[Map[Double, ClusterStats]], features: Vector, - clusterId: Int, + clusterId: Double, squaredNorm: Double): Double = { def compute(squaredNorm: Double, point: Vector, clusterStats: ClusterStats): Double = { @@ -421,12 +418,13 @@ private[evaluation] object SquaredEuclideanSilhouette { val bClustersStatsMap = dataset.sparkSession.sparkContext.broadcast(clustersStatsMap) val computeSilhouetteCoefficientUDF = udf { - computeSilhouetteCoefficient(bClustersStatsMap, _: Vector, _: Int, _: Double) + computeSilhouetteCoefficient(bClustersStatsMap, _: Vector, _: Double, _: Double) } val silhouetteScore = dfWithSquaredNorm .select(avg( - computeSilhouetteCoefficientUDF(col(featuresCol), col(predictionCol), col("squaredNorm")) + computeSilhouetteCoefficientUDF( + col(featuresCol), col(predictionCol).cast(DoubleType), col("squaredNorm")) )) .collect()(0) .getDouble(0) diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala index f42c6f0235a10..e60ebbd7c852d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/ClusteringEvaluatorSuite.scala @@ -54,7 +54,6 @@ class ClusteringEvaluatorSuite */ test("squared euclidean Silhouette") { val iris = ClusteringEvaluatorSuite.irisDataset(spark) - .withColumn("label", $"label".cast(IntegerType)) val evaluator = new ClusteringEvaluator() .setFeaturesCol("features") .setPredictionCol("label") @@ -64,8 +63,7 @@ class ClusteringEvaluatorSuite test("number of clusters must be greater than one") { val iris = ClusteringEvaluatorSuite.irisDataset(spark) - .withColumn("label", $"label".cast(IntegerType)) - .where($"label" === 0) + .where($"label" === 0.0) val evaluator = new ClusteringEvaluator() .setFeaturesCol("features") .setPredictionCol("label")
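For reference, a minimal usage sketch of the evaluator as it stands after PATCH 18/18. The KMeans step and the input path are assumptions made for illustration; only the ClusteringEvaluator calls (setFeaturesCol, setPredictionCol, setMetricName, evaluate) are taken from this series:

    // End-to-end usage sketch of the evaluator after PATCH 18/18. The KMeans step and
    // the input path are illustrative assumptions, not part of the patches.
    import org.apache.spark.ml.clustering.KMeans
    import org.apache.spark.ml.evaluation.ClusteringEvaluator
    import org.apache.spark.sql.SparkSession

    object ClusteringEvaluatorExample {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().appName("ClusteringEvaluatorExample").getOrCreate()
        // any DataFrame with a Vector features column works; the suite above uses iris.libsvm
        val data = spark.read.format("libsvm").load("data/iris.libsvm")

        // produce a numeric prediction column; KMeans writes "prediction" by default
        val predictions = new KMeans().setK(3).setSeed(1L).fit(data).transform(data)

        val evaluator = new ClusteringEvaluator()
          .setFeaturesCol("features")
          .setPredictionCol("prediction")
          .setMetricName("silhouette") // the default and only supported metric

        println(s"Silhouette with squared Euclidean distance = ${evaluator.evaluate(predictions)}")
        spark.stop()
      }
    }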