From 582cd11e5331a8e2704a5603080eec41c9002cf4 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 4 Jul 2014 00:27:22 +0800 Subject: [PATCH] Simply add a checker for the number of clusters. --- .../scala/org/apache/spark/mllib/clustering/KMeans.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index de22fbb6ffc10..1398fd24f30fc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -117,6 +117,11 @@ class KMeans private ( * performance, because this is an iterative algorithm. */ def run(data: RDD[Vector]): KMeansModel = { + + if (data.count() < k) { + throw new IllegalArgumentException("Number of clusters must not be greater than data number") + } + // Compute squared norms and cache them. val norms = data.map(v => breezeNorm(v.toBreeze, 2.0)) norms.persist()