Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-17389][Follow-up][ML] Change KMeans k-means|| default init steps from 5 to 2. #15050

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe

/**
* Param for the number of steps for the k-means|| initialization mode. This is an advanced
- * setting -- the default of 5 is almost always enough. Must be > 0. Default: 5.
+ * setting -- the default of 2 is almost always enough. Must be > 0. Default: 2.
* @group expertParam
*/
@Since("1.5.0")
Expand Down Expand Up @@ -262,7 +262,7 @@ class KMeans @Since("1.5.0") (
k -> 2,
maxIter -> 20,
initMode -> MLlibKMeans.K_MEANS_PARALLEL,
initSteps -> 5,
initSteps -> 2,
tol -> 1e-4)

@Since("1.5.0")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultR
assert(kmeans.getPredictionCol === "prediction")
assert(kmeans.getMaxIter === 20)
assert(kmeans.getInitMode === MLlibKMeans.K_MEANS_PARALLEL)
assert(kmeans.getInitSteps === 5)
assert(kmeans.getInitSteps === 2)
assert(kmeans.getTol === 1e-4)
}

Expand Down
10 changes: 5 additions & 5 deletions python/pyspark/ml/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,14 +254,14 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol

@keyword_only
def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20, seed=None):
initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None):
"""
__init__(self, featuresCol="features", predictionCol="prediction", k=2, \
- initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20, seed=None)
+ initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None)
"""
super(KMeans, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.KMeans", self.uid)
self._setDefault(k=2, initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20)
self._setDefault(k=2, initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)

Expand All @@ -271,10 +271,10 @@ def _create_model(self, java_model):
@keyword_only
@since("1.5.0")
def setParams(self, featuresCol="features", predictionCol="prediction", k=2,
initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20, seed=None):
initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None):
"""
setParams(self, featuresCol="features", predictionCol="prediction", k=2, \
- initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20, seed=None)
+ initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, seed=None)

Sets params for KMeans.
"""
Expand Down
6 changes: 3 additions & 3 deletions python/pyspark/mllib/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ class KMeans(object):
@classmethod
@since('0.9.0')
def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||",
seed=None, initializationSteps=5, epsilon=1e-4, initialModel=None):
seed=None, initializationSteps=2, epsilon=1e-4, initialModel=None):
"""
Train a k-means clustering model.

Expand All @@ -330,9 +330,9 @@ def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||"
(default: None)
:param initializationSteps:
Number of steps for the k-means|| initialization mode.
- This is an advanced setting -- the default of 5 is almost
+ This is an advanced setting -- the default of 2 is almost
always enough.
- (default: 5)
+ (default: 2)
:param epsilon:
Distance threshold within which a center will be considered to
have converged. If all centers move less than this Euclidean
Expand Down