Skip to content
Browse files

MAHOUT-981, Added outlier removal option in method and CLI for KMeansDriver.

git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1301886 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information...
1 parent 9399f27 commit 161d55eb60f1728f6929a904eca5601eb5daa515 Paritosh Ranjan committed Mar 17, 2012
View
60 core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
@@ -82,6 +82,7 @@ public int run(String[] args) throws Exception {
addOption(DefaultOptionCreator.overwriteOption().create());
addOption(DefaultOptionCreator.clusteringOption().create());
addOption(DefaultOptionCreator.methodOption().create());
+ addOption(DefaultOptionCreator.outlierThresholdOption().create());
if (parseArguments(args) == null) {
return -1;
@@ -111,28 +112,39 @@ public int run(String[] args) throws Exception {
if (getConf() == null) {
setConf(new Configuration());
}
- run(getConf(), input, clusters, output, measure, convergenceDelta, maxIterations, runClustering, runSequential);
+ double clusterClassificationThreshold = 0.0;
+ if (hasOption(DefaultOptionCreator.OUTLIER_THRESHOLD)) {
+ clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD));
+ }
+ run(getConf(), input, clusters, output, measure, convergenceDelta, maxIterations, runClustering,
+ clusterClassificationThreshold, runSequential);
return 0;
}
- /**
- * Iterate over the input vectors to produce clusters and, if requested, use the
- * results of the final iteration to cluster the input vectors.
+ /**
+ * Iterate over the input vectors to produce clusters and, if requested, use
+ * the results of the final iteration to cluster the input vectors.
+ *
* @param input
* the directory pathname for input points
* @param clustersIn
* the directory pathname for initial & computed clusters
* @param output
* the directory pathname for output points
- * @param measure
+ * @param measure
* the DistanceMeasure to use
* @param convergenceDelta
* the convergence delta value
* @param maxIterations
* the maximum number of iterations
- * @param runClustering
+ * @param runClustering
* true if points are to be clustered after iterations are completed
- * @param runSequential if true execute sequential algorithm
+ * @param clusterClassificationThreshold
+ * Is a clustering strictness / outlier removal parameter. Its value
+ * should be between 0 and 1. Vectors having pdf below this value
+ * will not be clustered.
+ * @param runSequential
+ * if true execute sequential algorithm
*/
public static void run(Configuration conf,
Path input,
@@ -142,6 +154,7 @@ public static void run(Configuration conf,
double convergenceDelta,
int maxIterations,
boolean runClustering,
+ double clusterClassificationThreshold,
boolean runSequential)
throws IOException, InterruptedException, ClassNotFoundException {
@@ -161,30 +174,35 @@ public static void run(Configuration conf,
clustersOut,
output,
measure,
- delta,
+ clusterClassificationThreshold,
runSequential);
}
}
/**
- * Iterate over the input vectors to produce clusters and, if requested, use the
- * results of the final iteration to cluster the input vectors.
+ * Iterate over the input vectors to produce clusters and, if requested, use
+ * the results of the final iteration to cluster the input vectors.
*
* @param input
* the directory pathname for input points
* @param clustersIn
* the directory pathname for initial & computed clusters
* @param output
* the directory pathname for output points
- * @param measure
+ * @param measure
* the DistanceMeasure to use
* @param convergenceDelta
* the convergence delta value
* @param maxIterations
* the maximum number of iterations
- * @param runClustering
+ * @param runClustering
* true if points are to be clustered after iterations are completed
- * @param runSequential if true execute sequential algorithm
+ * @param clusterClassificationThreshold
+ * Is a clustering strictness / outlier removal parameter. Its value
+ * should be between 0 and 1. Vectors having pdf below this value
+ * will not be clustered.
+ * @param runSequential
+ * if true execute sequential algorithm
*/
public static void run(Path input,
Path clustersIn,
@@ -193,6 +211,7 @@ public static void run(Path input,
double convergenceDelta,
int maxIterations,
boolean runClustering,
+ double clusterClassificationThreshold,
boolean runSequential)
throws IOException, InterruptedException, ClassNotFoundException {
run(new Configuration(),
@@ -203,6 +222,7 @@ public static void run(Path input,
convergenceDelta,
maxIterations,
runClustering,
+ clusterClassificationThreshold,
runSequential);
}
@@ -404,6 +424,7 @@ private static boolean isConverged(Path filePath, Configuration conf, FileSystem
/**
* Run the job using supplied arguments
+ *
* @param input
* the directory pathname for input points
* @param clustersIn
@@ -412,25 +433,26 @@ private static boolean isConverged(Path filePath, Configuration conf, FileSystem
* the directory pathname for output points
* @param measure
* the classname of the DistanceMeasure
- * @param convergenceDelta
- * the convergence delta value
- * @param runSequential if true execute sequential algorithm
+ * @param clusterClassificationThreshold
+ * Is a clustering strictness / outlier removal parameter. Its value
+ * should be between 0 and 1. Vectors having pdf below this value
+ * will not be clustered.
+ * @param runSequential
+ * if true execute sequential algorithm
*/
public static void clusterData(Configuration conf,
Path input,
Path clustersIn,
Path output,
DistanceMeasure measure,
- String convergenceDelta,
+ double clusterClassificationThreshold,
boolean runSequential)
throws IOException, InterruptedException, ClassNotFoundException {
if (log.isInfoEnabled()) {
log.info("Running Clustering");
log.info("Input: {} Clusters In: {} Out: {} Distance: {}", new Object[] {input, clustersIn, output, measure});
- log.info("convergence: {} Input Vectors: {}", convergenceDelta, VectorWritable.class.getName());
}
- Double clusterClassificationThreshold = 0.0;
ClusterClassifier.writePolicy(new KMeansClusteringPolicy(), clustersIn);
ClusterClassificationDriver.run(input, output, new Path(output, CLUSTERED_POINTS_DIRECTORY),
clusterClassificationThreshold, true, runSequential);
View
1 core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java
@@ -187,6 +187,7 @@ public static void run(Configuration conf,
convergenceDelta,
maxIterations,
true,
+ 0.0,
false);
}
}
View
2 core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
@@ -527,7 +527,7 @@ public void testKMeansWithCanopyClusterInput() throws Exception {
// now run the KMeans job
Path kmeansOutput = new Path(outputPath, "kmeans");
KMeansDriver.run(pointsPath, new Path(outputPath, "clusters-0-final"), kmeansOutput, new EuclideanDistanceMeasure(),
- 0.001, 10, true, false);
+ 0.001, 10, true, 0.0, false);
// now compare the expected clusters with actual
Path clusteredPointsPath = new Path(kmeansOutput, "clusteredPoints");
View
2 .../test/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReaderTest.java
@@ -101,7 +101,7 @@ private void topLevelClustering(Path pointsPath, Configuration conf) throws IOEx
CanopyDriver.run(conf, pointsPath, outputPathForCanopy, measure, 4.0, 3.0, true, 0.0, true);
Path clustersIn = new Path(outputPathForCanopy, new Path(Cluster.CLUSTERS_DIR + '0'
+ Cluster.FINAL_ITERATION_SUFFIX));
- KMeansDriver.run(conf, pointsPath, clustersIn, outputPathForKMeans, measure, 1, 1, true, true);
+ KMeansDriver.run(conf, pointsPath, clustersIn, outputPathForKMeans, measure, 1, 1, true, 0.0, true);
}
private static void verifyThatNumberOfClustersIsCorrect(Configuration conf, Path clusteredPointsPath) {
View
2 examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
@@ -93,7 +93,7 @@ private static void runSequentialKMeansClusterer(Configuration conf, Path sample
DistanceMeasure measure, int maxIterations, double convergenceDelta) throws IOException, InterruptedException,
ClassNotFoundException {
Path clusters = RandomSeedGenerator.buildRandom(conf, samples, new Path(output, "clusters-0"), 3, measure);
- KMeansDriver.run(samples, clusters, output, measure, convergenceDelta, maxIterations, true, true);
+ KMeansDriver.run(samples, clusters, output, measure, convergenceDelta, maxIterations, true, 0.0, true);
loadClusters(output);
}
View
4 examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
@@ -145,7 +145,7 @@ public static void run(Configuration conf, Path input, Path output,
directoryContainingConvertedInput, clusters, k, measure);
log.info("Running KMeans");
KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output,
- measure, convergenceDelta, maxIterations, true, false);
+ measure, convergenceDelta, maxIterations, true, 0.0, false);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
output, maxIterations), new Path(output, "clusteredPoints"));
@@ -195,7 +195,7 @@ public static void run(Configuration conf, Path input, Path output,
log.info("Running KMeans");
KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(output,
Cluster.INITIAL_CLUSTERS_DIR), output, measure, convergenceDelta,
- maxIterations, true, false);
+ maxIterations, true, 0.0, false);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
output, maxIterations), new Path(output, "clusteredPoints"));
View
8 integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
@@ -207,7 +207,7 @@ public void testKmeans() throws Exception {
4, false, 0.0, true);
// now run the KMeans job
KMeansDriver.run(conf, getTestTempDirPath("testdata"), new Path(output,
- "clusters-0-final"), output, measure, 0.001, 10, true, false);
+ "clusters-0-final"), output, measure, 0.001, 10, true, 0.0, false);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
output, 10), new Path(output, "clusteredPoints"));
@@ -338,7 +338,7 @@ public void testKmeansSVD() throws Exception {
// now run the KMeans job
Path kmeansOutput = new Path(output, "kmeans");
KMeansDriver.run(svdData, new Path(output, "clusters-0"), kmeansOutput, measure,
- 0.001, 10, true, true);
+ 0.001, 10, true, 0.0, true);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
kmeansOutput, 10), new Path(kmeansOutput, "clusteredPoints"));
@@ -380,7 +380,7 @@ public void testKmeansDSVD() throws Exception {
// now run the KMeans job
Path kmeansOutput = new Path(output, "kmeans");
KMeansDriver.run(sData.getRowPath(), new Path(output, "clusters-0"),
- kmeansOutput, measure, 0.001, 10, true, true);
+ kmeansOutput, measure, 0.001, 10, true, 0.0, true);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
kmeansOutput, 10), new Path(kmeansOutput, "clusteredPoints"));
@@ -425,7 +425,7 @@ public void testKmeansDSVD2() throws Exception {
// now run the KMeans job
Path kmeansOutput = new Path(output, "kmeans");
KMeansDriver.run(sData.getRowPath(), new Path(output, "clusters-0"),
- kmeansOutput, measure, 0.001, 10, true, true);
+ kmeansOutput, measure, 0.001, 10, true, 0.0, true);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
kmeansOutput, 10), new Path(kmeansOutput, "clusteredPoints"));
View
2 integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
@@ -344,7 +344,7 @@ public void testKmeans() throws Exception {
// now run the KMeans job
Path kmeansOutput = new Path(output, "kmeans");
KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, measure,
- 0.001, 10, true, true);
+ 0.001, 10, true, 0.0, true);
int numIterations = 10;
Path clustersIn = new Path(output, "clusters-2");
RepresentativePointsDriver.run(conf, clustersIn, new Path(output,
View
2 integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
@@ -354,7 +354,7 @@ public void testKmeans() throws Exception {
// now run the KMeans job
Path kmeansOutput = new Path(output, "kmeans");
KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, measure,
- 0.001, 10, true, true);
+ 0.001, 10, true, 0.0, true);
int numIterations = 10;
Path clustersIn = new Path(output, "clusters-2");
RepresentativePointsDriver.run(conf, clustersIn, new Path(output,

0 comments on commit 161d55e

Please sign in to comment.
Something went wrong with that request. Please try again.