From 1fec5e4d0adb7fd4a5c1f36a967a02dcdb1cd6e5 Mon Sep 17 00:00:00 2001
From: Yong Tang
Date: Sun, 3 Apr 2016 03:35:31 +0000
Subject: [PATCH 1/3] [SPARK-14301][Examples] Java examples code merge and clean up.

This fix removes duplicate Java code in examples/mllib and examples/ml.
The following changes have been made:

deleted: ml/JavaCrossValidatorExample.java (-> JavaModelSelectionViaCrossValidationExample.java)
deleted: ml/JavaTrainValidationSplitExample.java (-> JavaModelSelectionViaTrainValidationSplitExample.java)
deleted: ml/JavaSimpleTextClassificationPipeline.java (-> JavaModelSelectionViaCrossValidationExample.java)
deleted: ml/JavaDeveloperApiExample.java (conforms to changes in scala/DeveloperApiExample.scala)
deleted: mllib/JavaFPGrowthExample.java (-> JavaSimpleFPGrowth.java)
deleted: mllib/JavaLDAExample.java (-> JavaLatentDirichletAllocationExample.java)
deleted: mllib/JavaKMeans.java (merged with JavaKMeansExample.java)
deleted: mllib/JavaLR.java (-> JavaLogisticRegressionWithSGDExample.java)
updated: mllib/JavaKMeansExample.java (merged with mllib/JavaKMeans.java)
---
 .../ml/JavaCrossValidatorExample.java | 127 ---------
 .../examples/ml/JavaDeveloperApiExample.java | 242 ------------------
 .../JavaSimpleTextClassificationPipeline.java | 94 -------
 .../ml/JavaTrainValidationSplitExample.java | 87 -------
 .../examples/mllib/JavaFPGrowthExample.java | 78 ------
 .../spark/examples/mllib/JavaKMeans.java | 82 ------
 .../examples/mllib/JavaKMeansExample.java | 7 +
 .../spark/examples/mllib/JavaLDAExample.java | 77 ------
 .../apache/spark/examples/mllib/JavaLR.java | 82 ------
 9 files changed, 7 insertions(+), 869 deletions(-)
 delete mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java
 delete mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
 delete mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
 delete mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java
 delete mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java
 delete mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaKMeans.java
 delete mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaLDAExample.java
 delete mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaLR.java

diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java
deleted file mode 100644
index 07edeb3e521c3..0000000000000
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -import java.util.List; - -import com.google.common.collect.Lists; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.ml.Pipeline; -import org.apache.spark.ml.PipelineStage; -import org.apache.spark.ml.classification.LogisticRegression; -import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator; -import org.apache.spark.ml.feature.HashingTF; -import org.apache.spark.ml.feature.Tokenizer; -import org.apache.spark.ml.param.ParamMap; -import org.apache.spark.ml.tuning.CrossValidator; -import org.apache.spark.ml.tuning.CrossValidatorModel; -import org.apache.spark.ml.tuning.ParamGridBuilder; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; - -/** - * A simple example demonstrating model selection using CrossValidator. - * This example also demonstrates how Pipelines are Estimators. - * - * This example uses the Java bean classes {@link org.apache.spark.examples.ml.LabeledDocument} and - * {@link org.apache.spark.examples.ml.Document} defined in the Scala example - * {@link org.apache.spark.examples.ml.SimpleTextClassificationPipeline}. - * - * Run with - *
- * bin/run-example ml.JavaCrossValidatorExample
- *
- */ -public class JavaCrossValidatorExample { - - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaCrossValidatorExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); - - // Prepare training documents, which are labeled. - List localTraining = Lists.newArrayList( - new LabeledDocument(0L, "a b c d e spark", 1.0), - new LabeledDocument(1L, "b d", 0.0), - new LabeledDocument(2L, "spark f g h", 1.0), - new LabeledDocument(3L, "hadoop mapreduce", 0.0), - new LabeledDocument(4L, "b spark who", 1.0), - new LabeledDocument(5L, "g d a y", 0.0), - new LabeledDocument(6L, "spark fly", 1.0), - new LabeledDocument(7L, "was mapreduce", 0.0), - new LabeledDocument(8L, "e spark program", 1.0), - new LabeledDocument(9L, "a e c l", 0.0), - new LabeledDocument(10L, "spark compile", 1.0), - new LabeledDocument(11L, "hadoop software", 0.0)); - Dataset training = jsql.createDataFrame( - jsc.parallelize(localTraining), LabeledDocument.class); - - // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. - Tokenizer tokenizer = new Tokenizer() - .setInputCol("text") - .setOutputCol("words"); - HashingTF hashingTF = new HashingTF() - .setNumFeatures(1000) - .setInputCol(tokenizer.getOutputCol()) - .setOutputCol("features"); - LogisticRegression lr = new LogisticRegression() - .setMaxIter(10) - .setRegParam(0.01); - Pipeline pipeline = new Pipeline() - .setStages(new PipelineStage[] {tokenizer, hashingTF, lr}); - - // We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance. - // This will allow us to jointly choose parameters for all Pipeline stages. - // A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. - CrossValidator crossval = new CrossValidator() - .setEstimator(pipeline) - .setEvaluator(new BinaryClassificationEvaluator()); - // We use a ParamGridBuilder to construct a grid of parameters to search over. - // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam, - // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from. - ParamMap[] paramGrid = new ParamGridBuilder() - .addGrid(hashingTF.numFeatures(), new int[]{10, 100, 1000}) - .addGrid(lr.regParam(), new double[]{0.1, 0.01}) - .build(); - crossval.setEstimatorParamMaps(paramGrid); - crossval.setNumFolds(2); // Use 3+ in practice - - // Run cross-validation, and choose the best set of parameters. - CrossValidatorModel cvModel = crossval.fit(training); - - // Prepare test documents, which are unlabeled. - List localTest = Lists.newArrayList( - new Document(4L, "spark i j k"), - new Document(5L, "l m n"), - new Document(6L, "mapreduce spark"), - new Document(7L, "apache hadoop")); - Dataset test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class); - - // Make predictions on test documents. cvModel uses the best model found (lrModel). 
- Dataset predictions = cvModel.transform(test); - for (Row r: predictions.select("id", "text", "probability", "prediction").collectAsList()) { - System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) - + ", prediction=" + r.get(3)); - } - - jsc.stop(); - } -} diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java deleted file mode 100644 index fbd881766983f..0000000000000 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java +++ /dev/null @@ -1,242 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -import java.util.List; - -import com.google.common.collect.Lists; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.ml.classification.Classifier; -import org.apache.spark.ml.classification.ClassificationModel; -import org.apache.spark.ml.param.IntParam; -import org.apache.spark.ml.param.ParamMap; -import org.apache.spark.ml.util.Identifiable$; -import org.apache.spark.mllib.linalg.BLAS; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; - - -/** - * A simple example demonstrating how to write your own learning algorithm using Estimator, - * Transformer, and other abstractions. - * This mimics {@link org.apache.spark.ml.classification.LogisticRegression}. - * - * Run with - *
- * bin/run-example ml.JavaDeveloperApiExample
- *
- */ -public class JavaDeveloperApiExample { - - public static void main(String[] args) throws Exception { - SparkConf conf = new SparkConf().setAppName("JavaDeveloperApiExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); - - // Prepare training data. - List localTraining = Lists.newArrayList( - new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), - new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), - new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), - new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))); - Dataset training = jsql.createDataFrame( - jsc.parallelize(localTraining), LabeledPoint.class); - - // Create a LogisticRegression instance. This instance is an Estimator. - MyJavaLogisticRegression lr = new MyJavaLogisticRegression(); - // Print out the parameters, documentation, and any default values. - System.out.println("MyJavaLogisticRegression parameters:\n" + lr.explainParams() + "\n"); - - // We may set parameters using setter methods. - lr.setMaxIter(10); - - // Learn a LogisticRegression model. This uses the parameters stored in lr. - MyJavaLogisticRegressionModel model = lr.fit(training); - - // Prepare test data. - List localTest = Lists.newArrayList( - new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), - new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), - new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))); - Dataset test = jsql.createDataFrame(jsc.parallelize(localTest), LabeledPoint.class); - - // Make predictions on test documents. cvModel uses the best model found (lrModel). - Dataset results = model.transform(test); - double sumPredictions = 0; - for (Row r : results.select("features", "label", "prediction").collectAsList()) { - sumPredictions += r.getDouble(2); - } - if (sumPredictions != 0.0) { - throw new Exception("MyJavaLogisticRegression predicted something other than 0," + - " even though all coefficients are 0!"); - } - - jsc.stop(); - } -} - -/** - * Example of defining a type of {@link Classifier}. - * - * Note: Some IDEs (e.g., IntelliJ) will complain that this will not compile due to - * {@link org.apache.spark.ml.param.Params#set} using incompatible return types. - * However, this should still compile and run successfully. - */ -class MyJavaLogisticRegression - extends Classifier { - - MyJavaLogisticRegression() { - init(); - } - - MyJavaLogisticRegression(String uid) { - this.uid_ = uid; - init(); - } - - private String uid_ = Identifiable$.MODULE$.randomUID("myJavaLogReg"); - - @Override - public String uid() { - return uid_; - } - - /** - * Param for max number of iterations - *

- * NOTE: The usual way to add a parameter to a model or algorithm is to include: - * - val myParamName: ParamType - * - def getMyParamName - * - def setMyParamName - */ - IntParam maxIter = new IntParam(this, "maxIter", "max number of iterations"); - - int getMaxIter() { return (Integer) getOrDefault(maxIter); } - - private void init() { - setMaxIter(100); - } - - // The parameter setter is in this class since it should return type MyJavaLogisticRegression. - MyJavaLogisticRegression setMaxIter(int value) { - return (MyJavaLogisticRegression) set(maxIter, value); - } - - // This method is used by fit(). - // In Java, we have to make it public since Java does not understand Scala's protected modifier. - public MyJavaLogisticRegressionModel train(Dataset dataset) { - // Extract columns from data using helper method. - JavaRDD oldDataset = extractLabeledPoints(dataset).toJavaRDD(); - - // Do learning to estimate the coefficients vector. - int numFeatures = oldDataset.take(1).get(0).features().size(); - Vector coefficients = Vectors.zeros(numFeatures); // Learning would happen here. - - // Create a model, and return it. - return new MyJavaLogisticRegressionModel(uid(), coefficients).setParent(this); - } - - @Override - public MyJavaLogisticRegression copy(ParamMap extra) { - return defaultCopy(extra); - } -} - -/** - * Example of defining a type of {@link ClassificationModel}. - * - * Note: Some IDEs (e.g., IntelliJ) will complain that this will not compile due to - * {@link org.apache.spark.ml.param.Params#set} using incompatible return types. - * However, this should still compile and run successfully. - */ -class MyJavaLogisticRegressionModel - extends ClassificationModel { - - private Vector coefficients_; - public Vector coefficients() { return coefficients_; } - - MyJavaLogisticRegressionModel(String uid, Vector coefficients) { - this.uid_ = uid; - this.coefficients_ = coefficients; - } - - private String uid_ = Identifiable$.MODULE$.randomUID("myJavaLogReg"); - - @Override - public String uid() { - return uid_; - } - - // This uses the default implementation of transform(), which reads column "features" and outputs - // columns "prediction" and "rawPrediction." - - // This uses the default implementation of predict(), which chooses the label corresponding to - // the maximum value returned by [[predictRaw()]]. - - /** - * Raw prediction for each possible label. - * The meaning of a "raw" prediction may vary between algorithms, but it intuitively gives - * a measure of confidence in each possible label (where larger = more confident). - * This internal method is used to implement [[transform()]] and output [[rawPredictionCol]]. - * - * @return vector where element i is the raw prediction for label i. - * This raw prediction may be any real number, where a larger value indicates greater - * confidence for that label. - * - * In Java, we have to make this method public since Java does not understand Scala's protected - * modifier. - */ - public Vector predictRaw(Vector features) { - double margin = BLAS.dot(features, coefficients_); - // There are 2 classes (binary classification), so we return a length-2 vector, - // where index i corresponds to class i (i = 0, 1). - return Vectors.dense(-margin, margin); - } - - /** - * Number of classes the label can take. 2 indicates binary classification. - */ - public int numClasses() { return 2; } - - /** - * Number of features the model was trained on. 
-   */
-  public int numFeatures() { return coefficients_.size(); }
-
-  /**
-   * Create a copy of the model.
-   * The copy is shallow, except for the embedded paramMap, which gets a deep copy.
-   *

- * This is used for the default implementation of [[transform()]]. - * - * In Java, we have to make this method public since Java does not understand Scala's protected - * modifier. - */ - @Override - public MyJavaLogisticRegressionModel copy(ParamMap extra) { - return copyValues(new MyJavaLogisticRegressionModel(uid(), coefficients_), extra) - .setParent(parent()); - } -} diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java deleted file mode 100644 index a18a60f448166..0000000000000 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.ml; - -import java.util.List; - -import com.google.common.collect.Lists; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.ml.Pipeline; -import org.apache.spark.ml.PipelineModel; -import org.apache.spark.ml.PipelineStage; -import org.apache.spark.ml.classification.LogisticRegression; -import org.apache.spark.ml.feature.HashingTF; -import org.apache.spark.ml.feature.Tokenizer; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; - -/** - * A simple text classification pipeline that recognizes "spark" from input text. It uses the Java - * bean classes {@link LabeledDocument} and {@link Document} defined in the Scala counterpart of - * this example {@link SimpleTextClassificationPipeline}. Run with - *

- * bin/run-example ml.JavaSimpleTextClassificationPipeline
- *
- */ -public class JavaSimpleTextClassificationPipeline { - - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaSimpleTextClassificationPipeline"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); - - // Prepare training documents, which are labeled. - List localTraining = Lists.newArrayList( - new LabeledDocument(0L, "a b c d e spark", 1.0), - new LabeledDocument(1L, "b d", 0.0), - new LabeledDocument(2L, "spark f g h", 1.0), - new LabeledDocument(3L, "hadoop mapreduce", 0.0)); - Dataset training = - jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class); - - // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. - Tokenizer tokenizer = new Tokenizer() - .setInputCol("text") - .setOutputCol("words"); - HashingTF hashingTF = new HashingTF() - .setNumFeatures(1000) - .setInputCol(tokenizer.getOutputCol()) - .setOutputCol("features"); - LogisticRegression lr = new LogisticRegression() - .setMaxIter(10) - .setRegParam(0.001); - Pipeline pipeline = new Pipeline() - .setStages(new PipelineStage[] {tokenizer, hashingTF, lr}); - - // Fit the pipeline to training documents. - PipelineModel model = pipeline.fit(training); - - // Prepare test documents, which are unlabeled. - List localTest = Lists.newArrayList( - new Document(4L, "spark i j k"), - new Document(5L, "l m n"), - new Document(6L, "spark hadoop spark"), - new Document(7L, "apache hadoop")); - Dataset test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class); - - // Make predictions on test documents. - Dataset predictions = model.transform(test); - for (Row r: predictions.select("id", "text", "probability", "prediction").collectAsList()) { - System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) - + ", prediction=" + r.get(3)); - } - - jsc.stop(); - } -} diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java deleted file mode 100644 index 09bbc39c01fe0..0000000000000 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.ml; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.ml.evaluation.RegressionEvaluator; -import org.apache.spark.ml.param.ParamMap; -import org.apache.spark.ml.regression.LinearRegression; -import org.apache.spark.ml.tuning.*; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; - -/** - * A simple example demonstrating model selection using TrainValidationSplit. - * - * The example is based on {@link org.apache.spark.examples.ml.JavaSimpleParamsExample} - * using linear regression. - * - * Run with - * {{{ - * bin/run-example ml.JavaTrainValidationSplitExample - * }}} - */ -public class JavaTrainValidationSplitExample { - - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaTrainValidationSplitExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext jsql = new SQLContext(jsc); - - Dataset data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); - - // Prepare training and test data. - Dataset[] splits = data.randomSplit(new double [] {0.9, 0.1}, 12345); - Dataset training = splits[0]; - Dataset test = splits[1]; - - LinearRegression lr = new LinearRegression(); - - // We use a ParamGridBuilder to construct a grid of parameters to search over. - // TrainValidationSplit will try all combinations of values and determine best model using - // the evaluator. - ParamMap[] paramGrid = new ParamGridBuilder() - .addGrid(lr.regParam(), new double[] {0.1, 0.01}) - .addGrid(lr.fitIntercept()) - .addGrid(lr.elasticNetParam(), new double[] {0.0, 0.5, 1.0}) - .build(); - - // In this case the estimator is simply the linear regression. - // A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. - TrainValidationSplit trainValidationSplit = new TrainValidationSplit() - .setEstimator(lr) - .setEvaluator(new RegressionEvaluator()) - .setEstimatorParamMaps(paramGrid); - - // 80% of the data will be used for training and the remaining 20% for validation. - trainValidationSplit.setTrainRatio(0.8); - - // Run train validation split, and choose the best set of parameters. - TrainValidationSplitModel model = trainValidationSplit.fit(training); - - // Make predictions on test data. model is the model with combination of parameters - // that performed best. - model.transform(test) - .select("features", "label", "prediction") - .show(); - - jsc.stop(); - } -} diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java deleted file mode 100644 index 36baf5868736c..0000000000000 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -import java.util.ArrayList; - -import com.google.common.base.Joiner; -import com.google.common.collect.Lists; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.mllib.fpm.FPGrowth; -import org.apache.spark.mllib.fpm.FPGrowthModel; - -/** - * Java example for mining frequent itemsets using FP-growth. - * Example usage: ./bin/run-example mllib.JavaFPGrowthExample ./data/mllib/sample_fpgrowth.txt - */ -public class JavaFPGrowthExample { - - public static void main(String[] args) { - String inputFile; - double minSupport = 0.3; - int numPartition = -1; - if (args.length < 1) { - System.err.println( - "Usage: JavaFPGrowth [minSupport] [numPartition]"); - System.exit(1); - } - inputFile = args[0]; - if (args.length >= 2) { - minSupport = Double.parseDouble(args[1]); - } - if (args.length >= 3) { - numPartition = Integer.parseInt(args[2]); - } - - SparkConf sparkConf = new SparkConf().setAppName("JavaFPGrowthExample"); - JavaSparkContext sc = new JavaSparkContext(sparkConf); - - JavaRDD> transactions = sc.textFile(inputFile).map( - new Function>() { - @Override - public ArrayList call(String s) { - return Lists.newArrayList(s.split(" ")); - } - } - ); - - FPGrowthModel model = new FPGrowth() - .setMinSupport(minSupport) - .setNumPartitions(numPartition) - .run(transactions); - - for (FPGrowth.FreqItemset s: model.freqItemsets().toJavaRDD().collect()) { - System.out.println("[" + Joiner.on(",").join(s.javaItems()) + "], " + s.freq()); - } - - sc.stop(); - } -} diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKMeans.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKMeans.java deleted file mode 100644 index e575eedeb465c..0000000000000 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKMeans.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -import java.util.regex.Pattern; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; - -import org.apache.spark.mllib.clustering.KMeans; -import org.apache.spark.mllib.clustering.KMeansModel; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; - -/** - * Example using MLlib KMeans from Java. - */ -public final class JavaKMeans { - - private static class ParsePoint implements Function { - private static final Pattern SPACE = Pattern.compile(" "); - - @Override - public Vector call(String line) { - String[] tok = SPACE.split(line); - double[] point = new double[tok.length]; - for (int i = 0; i < tok.length; ++i) { - point[i] = Double.parseDouble(tok[i]); - } - return Vectors.dense(point); - } - } - - public static void main(String[] args) { - if (args.length < 3) { - System.err.println( - "Usage: JavaKMeans []"); - System.exit(1); - } - String inputFile = args[0]; - int k = Integer.parseInt(args[1]); - int iterations = Integer.parseInt(args[2]); - int runs = 1; - - if (args.length >= 4) { - runs = Integer.parseInt(args[3]); - } - SparkConf sparkConf = new SparkConf().setAppName("JavaKMeans"); - JavaSparkContext sc = new JavaSparkContext(sparkConf); - JavaRDD lines = sc.textFile(inputFile); - - JavaRDD points = lines.map(new ParsePoint()); - - KMeansModel model = KMeans.train(points.rdd(), k, iterations, runs, KMeans.K_MEANS_PARALLEL()); - - System.out.println("Cluster centers:"); - for (Vector center : model.clusterCenters()) { - System.out.println(" " + center); - } - double cost = model.computeCost(points.rdd()); - System.out.println("Cost: " + cost); - - sc.stop(); - } -} diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKMeansExample.java index 006d96d11196c..2d89c768fcfca 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKMeansExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKMeansExample.java @@ -58,6 +58,13 @@ public Vector call(String s) { int numIterations = 20; KMeansModel clusters = KMeans.train(parsedData.rdd(), numClusters, numIterations); + System.out.println("Cluster centers:"); + for (Vector center: clusters.clusterCenters()) { + System.out.println(" " + center); + } + double cost = clusters.computeCost(parsedData.rdd()); + System.out.println("Cost: " + cost); + // Evaluate clustering by computing Within Set Sum of Squared Errors double WSSSE = clusters.computeCost(parsedData.rdd()); System.out.println("Within Set Sum of Squared Errors = " + WSSSE); diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLDAExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLDAExample.java deleted file mode 100644 index de8e739ac9256..0000000000000 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLDAExample.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.examples.mllib; - -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.mllib.clustering.DistributedLDAModel; -import org.apache.spark.mllib.clustering.LDA; -import org.apache.spark.mllib.linalg.Matrix; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.SparkConf; - -public class JavaLDAExample { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("LDA Example"); - JavaSparkContext sc = new JavaSparkContext(conf); - - // Load and parse the data - String path = "data/mllib/sample_lda_data.txt"; - JavaRDD data = sc.textFile(path); - JavaRDD parsedData = data.map( - new Function() { - public Vector call(String s) { - String[] sarray = s.trim().split(" "); - double[] values = new double[sarray.length]; - for (int i = 0; i < sarray.length; i++) { - values[i] = Double.parseDouble(sarray[i]); - } - return Vectors.dense(values); - } - } - ); - // Index documents with unique IDs - JavaPairRDD corpus = JavaPairRDD.fromJavaRDD(parsedData.zipWithIndex().map( - new Function, Tuple2>() { - public Tuple2 call(Tuple2 doc_id) { - return doc_id.swap(); - } - } - )); - corpus.cache(); - - // Cluster the documents into three topics using LDA - DistributedLDAModel ldaModel = (DistributedLDAModel)new LDA().setK(3).run(corpus); - - // Output topics. Each is a distribution over words (matching word count vectors) - System.out.println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize() - + " words):"); - Matrix topics = ldaModel.topicsMatrix(); - for (int topic = 0; topic < 3; topic++) { - System.out.print("Topic " + topic + ":"); - for (int word = 0; word < ldaModel.vocabSize(); word++) { - System.out.print(" " + topics.apply(word, topic)); - } - System.out.println(); - } - sc.stop(); - } -} diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLR.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLR.java deleted file mode 100644 index eceb6927d5551..0000000000000 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLR.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -import java.util.regex.Pattern; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; - -import org.apache.spark.mllib.classification.LogisticRegressionWithSGD; -import org.apache.spark.mllib.classification.LogisticRegressionModel; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.regression.LabeledPoint; - -/** - * Logistic regression based classification using ML Lib. - */ -public final class JavaLR { - - static class ParsePoint implements Function { - private static final Pattern COMMA = Pattern.compile(","); - private static final Pattern SPACE = Pattern.compile(" "); - - @Override - public LabeledPoint call(String line) { - String[] parts = COMMA.split(line); - double y = Double.parseDouble(parts[0]); - String[] tok = SPACE.split(parts[1]); - double[] x = new double[tok.length]; - for (int i = 0; i < tok.length; ++i) { - x[i] = Double.parseDouble(tok[i]); - } - return new LabeledPoint(y, Vectors.dense(x)); - } - } - - public static void main(String[] args) { - if (args.length != 3) { - System.err.println("Usage: JavaLR "); - System.exit(1); - } - SparkConf sparkConf = new SparkConf().setAppName("JavaLR"); - JavaSparkContext sc = new JavaSparkContext(sparkConf); - JavaRDD lines = sc.textFile(args[0]); - JavaRDD points = lines.map(new ParsePoint()).cache(); - double stepSize = Double.parseDouble(args[1]); - int iterations = Integer.parseInt(args[2]); - - // Another way to configure LogisticRegression - // - // LogisticRegressionWithSGD lr = new LogisticRegressionWithSGD(); - // lr.optimizer().setNumIterations(iterations) - // .setStepSize(stepSize) - // .setMiniBatchFraction(1.0); - // lr.setIntercept(true); - // LogisticRegressionModel model = lr.train(points.rdd()); - - LogisticRegressionModel model = LogisticRegressionWithSGD.train(points.rdd(), - iterations, stepSize); - - System.out.print("Final w: " + model.weights()); - - sc.stop(); - } -} From 0b387a5e1c7dfc1e3cc24c38690586c307dd534c Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 6 Apr 2016 01:46:26 +0000 Subject: [PATCH 2/3] [SPARK-14301][Examples] Java examples code merge and clean up. The following files have been restored: ml/JavaDeveloperApiExample.java ml/JavaSimpleTextClassificationPipeline.java --- .../examples/ml/JavaDeveloperApiExample.java | 242 ++++++++++++++++++ ...lectionViaTrainValidationSplitExample.java | 10 + .../JavaSimpleTextClassificationPipeline.java | 94 +++++++ 3 files changed, 346 insertions(+) create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java new file mode 100644 index 0000000000000..fbd881766983f --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java @@ -0,0 +1,242 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.ml; + +import java.util.List; + +import com.google.common.collect.Lists; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.ml.classification.Classifier; +import org.apache.spark.ml.classification.ClassificationModel; +import org.apache.spark.ml.param.IntParam; +import org.apache.spark.ml.param.ParamMap; +import org.apache.spark.ml.util.Identifiable$; +import org.apache.spark.mllib.linalg.BLAS; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; + + +/** + * A simple example demonstrating how to write your own learning algorithm using Estimator, + * Transformer, and other abstractions. + * This mimics {@link org.apache.spark.ml.classification.LogisticRegression}. + * + * Run with + *
+ * bin/run-example ml.JavaDeveloperApiExample
+ *
+ */ +public class JavaDeveloperApiExample { + + public static void main(String[] args) throws Exception { + SparkConf conf = new SparkConf().setAppName("JavaDeveloperApiExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); + SQLContext jsql = new SQLContext(jsc); + + // Prepare training data. + List localTraining = Lists.newArrayList( + new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), + new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), + new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), + new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))); + Dataset training = jsql.createDataFrame( + jsc.parallelize(localTraining), LabeledPoint.class); + + // Create a LogisticRegression instance. This instance is an Estimator. + MyJavaLogisticRegression lr = new MyJavaLogisticRegression(); + // Print out the parameters, documentation, and any default values. + System.out.println("MyJavaLogisticRegression parameters:\n" + lr.explainParams() + "\n"); + + // We may set parameters using setter methods. + lr.setMaxIter(10); + + // Learn a LogisticRegression model. This uses the parameters stored in lr. + MyJavaLogisticRegressionModel model = lr.fit(training); + + // Prepare test data. + List localTest = Lists.newArrayList( + new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), + new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), + new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))); + Dataset test = jsql.createDataFrame(jsc.parallelize(localTest), LabeledPoint.class); + + // Make predictions on test documents. cvModel uses the best model found (lrModel). + Dataset results = model.transform(test); + double sumPredictions = 0; + for (Row r : results.select("features", "label", "prediction").collectAsList()) { + sumPredictions += r.getDouble(2); + } + if (sumPredictions != 0.0) { + throw new Exception("MyJavaLogisticRegression predicted something other than 0," + + " even though all coefficients are 0!"); + } + + jsc.stop(); + } +} + +/** + * Example of defining a type of {@link Classifier}. + * + * Note: Some IDEs (e.g., IntelliJ) will complain that this will not compile due to + * {@link org.apache.spark.ml.param.Params#set} using incompatible return types. + * However, this should still compile and run successfully. + */ +class MyJavaLogisticRegression + extends Classifier { + + MyJavaLogisticRegression() { + init(); + } + + MyJavaLogisticRegression(String uid) { + this.uid_ = uid; + init(); + } + + private String uid_ = Identifiable$.MODULE$.randomUID("myJavaLogReg"); + + @Override + public String uid() { + return uid_; + } + + /** + * Param for max number of iterations + *

+ * NOTE: The usual way to add a parameter to a model or algorithm is to include: + * - val myParamName: ParamType + * - def getMyParamName + * - def setMyParamName + */ + IntParam maxIter = new IntParam(this, "maxIter", "max number of iterations"); + + int getMaxIter() { return (Integer) getOrDefault(maxIter); } + + private void init() { + setMaxIter(100); + } + + // The parameter setter is in this class since it should return type MyJavaLogisticRegression. + MyJavaLogisticRegression setMaxIter(int value) { + return (MyJavaLogisticRegression) set(maxIter, value); + } + + // This method is used by fit(). + // In Java, we have to make it public since Java does not understand Scala's protected modifier. + public MyJavaLogisticRegressionModel train(Dataset dataset) { + // Extract columns from data using helper method. + JavaRDD oldDataset = extractLabeledPoints(dataset).toJavaRDD(); + + // Do learning to estimate the coefficients vector. + int numFeatures = oldDataset.take(1).get(0).features().size(); + Vector coefficients = Vectors.zeros(numFeatures); // Learning would happen here. + + // Create a model, and return it. + return new MyJavaLogisticRegressionModel(uid(), coefficients).setParent(this); + } + + @Override + public MyJavaLogisticRegression copy(ParamMap extra) { + return defaultCopy(extra); + } +} + +/** + * Example of defining a type of {@link ClassificationModel}. + * + * Note: Some IDEs (e.g., IntelliJ) will complain that this will not compile due to + * {@link org.apache.spark.ml.param.Params#set} using incompatible return types. + * However, this should still compile and run successfully. + */ +class MyJavaLogisticRegressionModel + extends ClassificationModel { + + private Vector coefficients_; + public Vector coefficients() { return coefficients_; } + + MyJavaLogisticRegressionModel(String uid, Vector coefficients) { + this.uid_ = uid; + this.coefficients_ = coefficients; + } + + private String uid_ = Identifiable$.MODULE$.randomUID("myJavaLogReg"); + + @Override + public String uid() { + return uid_; + } + + // This uses the default implementation of transform(), which reads column "features" and outputs + // columns "prediction" and "rawPrediction." + + // This uses the default implementation of predict(), which chooses the label corresponding to + // the maximum value returned by [[predictRaw()]]. + + /** + * Raw prediction for each possible label. + * The meaning of a "raw" prediction may vary between algorithms, but it intuitively gives + * a measure of confidence in each possible label (where larger = more confident). + * This internal method is used to implement [[transform()]] and output [[rawPredictionCol]]. + * + * @return vector where element i is the raw prediction for label i. + * This raw prediction may be any real number, where a larger value indicates greater + * confidence for that label. + * + * In Java, we have to make this method public since Java does not understand Scala's protected + * modifier. + */ + public Vector predictRaw(Vector features) { + double margin = BLAS.dot(features, coefficients_); + // There are 2 classes (binary classification), so we return a length-2 vector, + // where index i corresponds to class i (i = 0, 1). + return Vectors.dense(-margin, margin); + } + + /** + * Number of classes the label can take. 2 indicates binary classification. + */ + public int numClasses() { return 2; } + + /** + * Number of features the model was trained on. 
+   */
+  public int numFeatures() { return coefficients_.size(); }
+
+  /**
+   * Create a copy of the model.
+   * The copy is shallow, except for the embedded paramMap, which gets a deep copy.
+   *

+ * This is used for the default implementation of [[transform()]]. + * + * In Java, we have to make this method public since Java does not understand Scala's protected + * modifier. + */ + @Override + public MyJavaLogisticRegressionModel copy(ParamMap extra) { + return copyValues(new MyJavaLogisticRegressionModel(uid(), coefficients_), extra) + .setParent(parent()); + } +} diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java index 6ac4aea3c483c..c3baec26301d1 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java @@ -33,6 +33,16 @@ /** * Java example for Model Selection via Train Validation Split. + * + * A simple example demonstrating model selection using TrainValidationSplit. + * + * The example is based on {@link org.apache.spark.examples.ml.JavaSimpleParamsExample} + * using linear regression. + * + * Run with + * {{{ + * bin/run-example ml.JavaModelSelectionViaTrainValidationSplitExample + * }}} */ public class JavaModelSelectionViaTrainValidationSplitExample { public static void main(String[] args) { diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java new file mode 100644 index 0000000000000..a18a60f448166 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.ml; + +import java.util.List; + +import com.google.common.collect.Lists; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.ml.Pipeline; +import org.apache.spark.ml.PipelineModel; +import org.apache.spark.ml.PipelineStage; +import org.apache.spark.ml.classification.LogisticRegression; +import org.apache.spark.ml.feature.HashingTF; +import org.apache.spark.ml.feature.Tokenizer; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; + +/** + * A simple text classification pipeline that recognizes "spark" from input text. It uses the Java + * bean classes {@link LabeledDocument} and {@link Document} defined in the Scala counterpart of + * this example {@link SimpleTextClassificationPipeline}. Run with + *

+ * bin/run-example ml.JavaSimpleTextClassificationPipeline
+ *
+ */ +public class JavaSimpleTextClassificationPipeline { + + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("JavaSimpleTextClassificationPipeline"); + JavaSparkContext jsc = new JavaSparkContext(conf); + SQLContext jsql = new SQLContext(jsc); + + // Prepare training documents, which are labeled. + List localTraining = Lists.newArrayList( + new LabeledDocument(0L, "a b c d e spark", 1.0), + new LabeledDocument(1L, "b d", 0.0), + new LabeledDocument(2L, "spark f g h", 1.0), + new LabeledDocument(3L, "hadoop mapreduce", 0.0)); + Dataset training = + jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class); + + // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. + Tokenizer tokenizer = new Tokenizer() + .setInputCol("text") + .setOutputCol("words"); + HashingTF hashingTF = new HashingTF() + .setNumFeatures(1000) + .setInputCol(tokenizer.getOutputCol()) + .setOutputCol("features"); + LogisticRegression lr = new LogisticRegression() + .setMaxIter(10) + .setRegParam(0.001); + Pipeline pipeline = new Pipeline() + .setStages(new PipelineStage[] {tokenizer, hashingTF, lr}); + + // Fit the pipeline to training documents. + PipelineModel model = pipeline.fit(training); + + // Prepare test documents, which are unlabeled. + List localTest = Lists.newArrayList( + new Document(4L, "spark i j k"), + new Document(5L, "l m n"), + new Document(6L, "spark hadoop spark"), + new Document(7L, "apache hadoop")); + Dataset test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class); + + // Make predictions on test documents. + Dataset predictions = model.transform(test); + for (Row r: predictions.select("id", "text", "probability", "prediction").collectAsList()) { + System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) + + ", prediction=" + r.get(3)); + } + + jsc.stop(); + } +} From dff87e6f70fe1a7036eb4d7d2eb5469fa00b923e Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Thu, 7 Apr 2016 14:07:33 +0000 Subject: [PATCH 3/3] [SPARK-14301][Examples] Java examples code merge and clean up. Better description of comment in JavaModelSelectionViaTrainValidationSplitExample.java. --- .../ml/JavaModelSelectionViaTrainValidationSplitExample.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java index c3baec26301d1..4994f8f9fa857 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java @@ -32,9 +32,7 @@ import org.apache.spark.sql.SQLContext; /** - * Java example for Model Selection via Train Validation Split. - * - * A simple example demonstrating model selection using TrainValidationSplit. + * Java example demonstrating model selection using TrainValidationSplit. * * The example is based on {@link org.apache.spark.examples.ml.JavaSimpleParamsExample} * using linear regression.