From 4c4f83e97d9bd2d8771452498581bf9ce43bd28d Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Mon, 6 Nov 2017 16:49:17 +0100 Subject: [PATCH 1/2] [SPARK-14516][FOLLOWUP] Adding ClusteringEvaluator to examples --- .../spark/examples/ml/JavaKMeansExample.java | 15 ++++++++++++--- examples/src/main/python/ml/kmeans_example.py | 13 ++++++++++--- .../apache/spark/examples/ml/KMeansExample.scala | 15 ++++++++++++--- 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java index d8f948ae38cb3..ea276ce2869c2 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java @@ -20,6 +20,7 @@ // $example on$ import org.apache.spark.ml.clustering.KMeansModel; import org.apache.spark.ml.clustering.KMeans; +import org.apache.spark.ml.evaluation.ClusteringEvaluator; import org.apache.spark.ml.linalg.Vector; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; @@ -51,9 +52,17 @@ public static void main(String[] args) { KMeans kmeans = new KMeans().setK(2).setSeed(1L); KMeansModel model = kmeans.fit(dataset); - // Evaluate clustering by computing Within Set Sum of Squared Errors. - double WSSSE = model.computeCost(dataset); - System.out.println("Within Set Sum of Squared Errors = " + WSSSE); + // Make predictions + Dataset<Row> predictions = model.transform(dataset); + + // Evaluate clustering by computing Silhouette score + ClusteringEvaluator evaluator = new ClusteringEvaluator() + .setFeaturesCol("features") + .setPredictionCol("prediction") + .setMetricName("silhouette"); + + double silhouette = evaluator.evaluate(predictions); + System.out.println("Silhouette with squared euclidean distance = " + silhouette); // Shows the result. 
Vector[] centers = model.clusterCenters(); diff --git a/examples/src/main/python/ml/kmeans_example.py b/examples/src/main/python/ml/kmeans_example.py index 6846ec4599714..d6f5e5b0e2b07 100644 --- a/examples/src/main/python/ml/kmeans_example.py +++ b/examples/src/main/python/ml/kmeans_example.py @@ -19,6 +19,7 @@ # $example on$ from pyspark.ml.clustering import KMeans +from pyspark.ml.evaluation import ClusteringEvaluator # $example off$ from pyspark.sql import SparkSession @@ -45,9 +46,15 @@ kmeans = KMeans().setK(2).setSeed(1) model = kmeans.fit(dataset) - # Evaluate clustering by computing Within Set Sum of Squared Errors. - wssse = model.computeCost(dataset) - print("Within Set Sum of Squared Errors = " + str(wssse)) + # Make predictions + predictions = model.transform(dataset) + + # Evaluate clustering by computing Silhouette score + evaluator = ClusteringEvaluator( + featuresCol="features", predictionCol="prediction", metricName="silhouette") + + silhouette = evaluator.evaluate(predictions) + print("Silhouette with squared euclidean distance = " + str(silhouette)) # Shows the result. centers = model.clusterCenters() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala index a1d19e138dedb..bf19d26efbf52 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala @@ -21,6 +21,7 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.clustering.KMeans +import org.apache.spark.ml.evaluation.ClusteringEvaluator // $example off$ import org.apache.spark.sql.SparkSession @@ -47,9 +48,17 @@ object KMeansExample { val kmeans = new KMeans().setK(2).setSeed(1L) val model = kmeans.fit(dataset) - // Evaluate clustering by computing Within Set Sum of Squared Errors. 
- val WSSSE = model.computeCost(dataset) - println(s"Within Set Sum of Squared Errors = $WSSSE") + // Make predictions + val predictions = model.transform(dataset) + + // Evaluate clustering by computing Silhouette score + val evaluator = new ClusteringEvaluator() + .setFeaturesCol("features") + .setPredictionCol("prediction") + .setMetricName("silhouette") + + val silhouette = evaluator.evaluate(predictions) + println(s"Silhouette with squared euclidean distance = $silhouette") // Shows the result. println("Cluster Centers: ") From feb619d657f6ff66dec240ee4619e6f53208ac18 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Sat, 9 Dec 2017 10:06:15 +0100 Subject: [PATCH 2/2] remove useless parameters --- .../java/org/apache/spark/examples/ml/JavaKMeansExample.java | 5 +---- examples/src/main/python/ml/kmeans_example.py | 3 +-- .../scala/org/apache/spark/examples/ml/KMeansExample.scala | 3 --- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java index ea276ce2869c2..dc4b0bcb59657 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java @@ -56,10 +56,7 @@ public static void main(String[] args) { Dataset<Row> predictions = model.transform(dataset); // Evaluate clustering by computing Silhouette score - ClusteringEvaluator evaluator = new ClusteringEvaluator() - .setFeaturesCol("features") - .setPredictionCol("prediction") - .setMetricName("silhouette"); + ClusteringEvaluator evaluator = new ClusteringEvaluator(); double silhouette = evaluator.evaluate(predictions); System.out.println("Silhouette with squared euclidean distance = " + silhouette); diff --git a/examples/src/main/python/ml/kmeans_example.py b/examples/src/main/python/ml/kmeans_example.py index d6f5e5b0e2b07..5f77843e3743a 100644 --- 
a/examples/src/main/python/ml/kmeans_example.py +++ b/examples/src/main/python/ml/kmeans_example.py @@ -50,8 +50,7 @@ predictions = model.transform(dataset) # Evaluate clustering by computing Silhouette score - evaluator = ClusteringEvaluator( - featuresCol="features", predictionCol="prediction", metricName="silhouette") + evaluator = ClusteringEvaluator() silhouette = evaluator.evaluate(predictions) print("Silhouette with squared euclidean distance = " + str(silhouette)) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala index bf19d26efbf52..2bc8184e623ff 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala @@ -53,9 +53,6 @@ object KMeansExample { // Evaluate clustering by computing Silhouette score val evaluator = new ClusteringEvaluator() - .setFeaturesCol("features") - .setPredictionCol("prediction") - .setMetricName("silhouette") val silhouette = evaluator.evaluate(predictions) println(s"Silhouette with squared euclidean distance = $silhouette")