From 49b7012e7b67d3fc7db031d0604d76d4150034eb Mon Sep 17 00:00:00 2001
From: Xin Ren
Date: Mon, 1 Feb 2016 17:51:25 -0800
Subject: [PATCH 01/26] [SPARK-13019] replace example code for summary
 statistics, scala code

---
 docs/mllib-statistics.md                      | 14 +----
 .../ml/JavaSummaryStatisticsExample.java      |  7 +++
 .../ml/SummaryStatisticsExample.scala         | 51 +++++++++++++++++++
 3 files changed, 59 insertions(+), 13 deletions(-)
 create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java
 create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala

diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md
index 652d215fa8653..93c5204e63304 100644
--- a/docs/mllib-statistics.md
+++ b/docs/mllib-statistics.md
@@ -40,19 +40,7 @@ total count.
 
 Refer to the [`MultivariateStatisticalSummary` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary) for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.mllib.linalg.Vector
-import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
-
-val observations: RDD[Vector] = ... // an RDD of Vectors
-
-// Compute column summary statistics.
-val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
-println(summary.mean) // a dense vector containing the mean value for each column
-println(summary.variance) // column-wise variance
-println(summary.numNonzeros) // number of nonzeros in each column
-
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala %}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java new file mode 100644 index 0000000000000..8c8e3ab0ef143 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java @@ -0,0 +1,7 @@ +package org.apache.spark.examples.ml; + +/** + * Created by quickmobile on 16-02-01. + */ +public class JavaSummaryStatisticsExample { +} diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala new file mode 100644 index 0000000000000..10781257ee4d6 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkConf, SparkContext} + +object SummaryStatisticsExample { + + def main(args: Array[String]) { + + val conf = new SparkConf().setAppName("MultivariateStatisticalSummaryExample") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + + // $example on$ + val v1 = Vectors.dense(1.0, 10.0, 100.0) + val v2 = Vectors.dense(2.0, 20.0, 200.0) + val v3 = Vectors.dense(3.0, 30.0, 300.0) + + val observations = sc.parallelize(Seq(v1, v2, v3)) + + // Compute column summary statistics. 
+ val summary: MultivariateStatisticalSummary = Statistics.colStats(observations) + println(summary.mean) // a dense vector containing the mean value for each column + println(summary.variance) // column-wise variance + println(summary.numNonzeros) // number of nonzeros in each column + // $example off$ + + sc.stop() + } +} +// scalastyle:on println From 83592bcafa553bf9439da7db72552868b3ed967a Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Mon, 1 Feb 2016 21:01:20 -0800 Subject: [PATCH 02/26] [SPARK-13019] test out on/off, for import part --- .../org/apache/spark/examples/ml/SummaryStatisticsExample.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala index 10781257ee4d6..68b9c19914897 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala @@ -18,8 +18,10 @@ // scalastyle:off println package org.apache.spark.examples.ml +// $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} +// $example off$ import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} From 069341b3d22c147fcf84e338e9b1b7d1f9fdae8e Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Tue, 2 Feb 2016 17:33:33 -0800 Subject: [PATCH 03/26] [SPARK-13019] create separate example files, but cannot compile yet --- docs/mllib-statistics.md | 390 +----------------- .../examples/ml/JavaCorrelationsExample.java | 64 +++ .../ml/JavaHypothesisTestingExample.java | 79 ++++ ...isTestingKolmogorovSmirnovTestExample.java | 59 +++ .../JavaKernelDensityEstimationExample.java | 59 +++ .../ml/JavaRandomDataGenerationExample.java | 61 +++ .../ml/JavaStratifiedSamplingExample.java | 52 +++ .../ml/JavaSummaryStatisticsExample.java | 56 ++- .../main/python/ml/correlations_example.py | 50 +++ .../python/ml/hypothesis_testing_example.py | 66 +++ ...testing_kolmogorov_smirnov_test_example.py | 48 +++ .../ml/kernel_density_estimation_example.py | 48 +++ .../ml/random_data_generation_example.py | 43 ++ .../python/ml/stratified_sampling_example.py | 39 ++ .../python/ml/summary_statistics_example.py | 45 ++ .../examples/ml/CorrelationsExample.scala | 58 +++ .../ml/HypothesisTestingExample.scala | 74 ++++ ...sTestingKolmogorovSmirnovTestExample.scala | 61 +++ .../ml/KernelDensityEstimationExample.scala | 56 +++ .../ml/RandomDataGenerationExample.scala | 52 +++ .../ml/StratifiedSamplingExample.scala | 51 +++ .../ml/SummaryStatisticsExample.scala | 2 +- 22 files changed, 1139 insertions(+), 374 deletions(-) create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaCorrelationsExample.java create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingExample.java create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingKolmogorovSmirnovTestExample.java create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaKernelDensityEstimationExample.java create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaRandomDataGenerationExample.java create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaStratifiedSamplingExample.java create mode 100644 examples/src/main/python/ml/correlations_example.py create mode 100644 
examples/src/main/python/ml/hypothesis_testing_example.py create mode 100644 examples/src/main/python/ml/hypothesis_testing_kolmogorov_smirnov_test_example.py create mode 100644 examples/src/main/python/ml/kernel_density_estimation_example.py create mode 100644 examples/src/main/python/ml/random_data_generation_example.py create mode 100644 examples/src/main/python/ml/stratified_sampling_example.py create mode 100644 examples/src/main/python/ml/summary_statistics_example.py create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/CorrelationsExample.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingExample.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingKolmogorovSmirnovTestExample.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/KernelDensityEstimationExample.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/RandomDataGenerationExample.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/StratifiedSamplingExample.scala diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md index 93c5204e63304..487ae12f3b6de 100644 --- a/docs/mllib-statistics.md +++ b/docs/mllib-statistics.md @@ -52,24 +52,7 @@ total count. Refer to the [`MultivariateStatisticalSummary` Java docs](api/java/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.html) for details on the API. -{% highlight java %} -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.stat.MultivariateStatisticalSummary; -import org.apache.spark.mllib.stat.Statistics; - -JavaSparkContext jsc = ... - -JavaRDD mat = ... // an RDD of Vectors - -// Compute column summary statistics. -MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd()); -System.out.println(summary.mean()); // a dense vector containing the mean value for each column -System.out.println(summary.variance()); // column-wise variance -System.out.println(summary.numNonzeros()); // number of nonzeros in each column - -{% endhighlight %} +{% include_example java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java %}
@@ -80,20 +63,7 @@ total count. Refer to the [`MultivariateStatisticalSummary` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.MultivariateStatisticalSummary) for more details on the API. -{% highlight python %} -from pyspark.mllib.stat import Statistics - -sc = ... # SparkContext - -mat = ... # an RDD of Vectors - -# Compute column summary statistics. -summary = Statistics.colStats(mat) -print(summary.mean()) -print(summary.variance()) -print(summary.numNonzeros()) - -{% endhighlight %} +{% include_example python/ml/summary_statistics_example.py %}
@@ -112,27 +82,7 @@ an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` resp Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API. -{% highlight scala %} -import org.apache.spark.SparkContext -import org.apache.spark.mllib.linalg._ -import org.apache.spark.mllib.stat.Statistics - -val sc: SparkContext = ... - -val seriesX: RDD[Double] = ... // a series -val seriesY: RDD[Double] = ... // must have the same number of partitions and cardinality as seriesX - -// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a -// method is not specified, Pearson's method will be used by default. -val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson") - -val data: RDD[Vector] = ... // note that each Vector is a row and not a column - -// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. -// If a method is not specified, Pearson's method will be used by default. -val correlMatrix: Matrix = Statistics.corr(data, "pearson") - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/ml/CorrelationsExample.scala %}
@@ -142,28 +92,7 @@ a `JavaRDD`, the output will be a `Double` or the correlation `Matrix` r Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API. -{% highlight java %} -import org.apache.spark.api.java.JavaDoubleRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.linalg.*; -import org.apache.spark.mllib.stat.Statistics; - -JavaSparkContext jsc = ... - -JavaDoubleRDD seriesX = ... // a series -JavaDoubleRDD seriesY = ... // must have the same number of partitions and cardinality as seriesX - -// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a -// method is not specified, Pearson's method will be used by default. -Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson"); - -JavaRDD data = ... // note that each Vector is a row and not a column - -// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. -// If a method is not specified, Pearson's method will be used by default. -Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson"); - -{% endhighlight %} +{% include_example java/org/apache/spark/examples/ml/JavaCorrelationsExample.java %}
@@ -173,24 +102,7 @@ an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` resp Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API. -{% highlight python %} -from pyspark.mllib.stat import Statistics - -sc = ... # SparkContext - -seriesX = ... # a series -seriesY = ... # must have the same number of partitions and cardinality as seriesX - -# Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a -# method is not specified, Pearson's method will be used by default. -print(Statistics.corr(seriesX, seriesY, method="pearson")) - -data = ... # an RDD of Vectors -# calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. -# If a method is not specified, Pearson's method will be used by default. -print(Statistics.corr(data, method="pearson")) - -{% endhighlight %} +{% include_example python/ml/correlations_example.py %}
@@ -216,21 +128,7 @@ fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample size, whereas sampling with replacement requires two additional passes. -{% highlight scala %} -import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ -import org.apache.spark.rdd.PairRDDFunctions - -val sc: SparkContext = ... - -val data = ... // an RDD[(K, V)] of any key value pairs -val fractions: Map[K, Double] = ... // specify the exact fraction desired from each key - -// Get an exact sample from each stratum -val approxSample = data.sampleByKey(withReplacement = false, fractions) -val exactSample = data.sampleByKeyExact(withReplacement = false, fractions) - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/ml/StratifiedSamplingExample.scala %}
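The new `StratifiedSamplingExample.scala` is created by this patch but its body does not appear in these hunks. A minimal sketch of what it could contain, following the removed inline snippet — the keys, values, and per-key fractions below are illustrative — is:

{% highlight scala %}
import org.apache.spark.{SparkConf, SparkContext}

object StratifiedSamplingExample {

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("StratifiedSamplingExample"))

    // an RDD[(K, V)] of any key value pairs (illustrative data)
    val data = sc.parallelize(Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')))
    // specify the exact fraction desired from each key
    val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3)

    // Get an approximate sample from each stratum (one pass over the RDD)...
    val approxSample = data.sampleByKey(withReplacement = false, fractions)
    // ...and an exact sample (additional passes to guarantee the stratum sizes)
    val exactSample = data.sampleByKeyExact(withReplacement = false, fractions)

    println(s"approxSample: ${approxSample.collect().mkString(" ")}")
    println(s"exactSample: ${exactSample.collect().mkString(" ")}")

    sc.stop()
  }
}
{% endhighlight %}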
@@ -240,22 +138,7 @@ fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample size, whereas sampling with replacement requires two additional passes. -{% highlight java %} -import java.util.Map; - -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaSparkContext; - -JavaSparkContext jsc = ... - -JavaPairRDD data = ... // an RDD of any key value pairs -Map fractions = ... // specify the exact fraction desired from each key - -// Get an exact sample from each stratum -JavaPairRDD approxSample = data.sampleByKey(false, fractions); -JavaPairRDD exactSample = data.sampleByKeyExact(false, fractions); - -{% endhighlight %} +{% include_example java/org/apache/spark/examples/ml/JavaStratifiedSamplingExample.java %}
[`sampleByKey()`](api/python/pyspark.html#pyspark.RDD.sampleByKey) allows users to @@ -265,16 +148,7 @@ set of keys. *Note:* `sampleByKeyExact()` is currently not supported in Python. -{% highlight python %} - -sc = ... # SparkContext - -data = ... # an RDD of any key value pairs -fractions = ... # specify the exact fraction desired from each key as a dictionary - -approxSample = data.sampleByKey(False, fractions); - -{% endhighlight %} +{% include_example python/ml/stratified_sampling_example.py %}
@@ -296,41 +170,7 @@ independence tests. run Pearson's chi-squared tests. The following example demonstrates how to run and interpret hypothesis tests. -{% highlight scala %} -import org.apache.spark.SparkContext -import org.apache.spark.mllib.linalg._ -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.stat.Statistics._ - -val sc: SparkContext = ... - -val vec: Vector = ... // a vector composed of the frequencies of events - -// compute the goodness of fit. If a second vector to test against is not supplied as a parameter, -// the test runs against a uniform distribution. -val goodnessOfFitTestResult = Statistics.chiSqTest(vec) -println(goodnessOfFitTestResult) // summary of the test including the p-value, degrees of freedom, - // test statistic, the method used, and the null hypothesis. - -val mat: Matrix = ... // a contingency matrix - -// conduct Pearson's independence test on the input contingency matrix -val independenceTestResult = Statistics.chiSqTest(mat) -println(independenceTestResult) // summary of the test including the p-value, degrees of freedom... - -val obs: RDD[LabeledPoint] = ... // (feature, label) pairs. - -// The contingency table is constructed from the raw (feature, label) pairs and used to conduct -// the independence test. Returns an array containing the ChiSquaredTestResult for every feature -// against the label. -val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs) -var i = 1 -featureTestResults.foreach { result => - println(s"Column $i:\n$result") - i += 1 -} // summary of the test - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/ml/HypothesisTestingExample.scala %}
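`HypothesisTestingExample.scala` is likewise not shown in these hunks; a runnable sketch along the lines of the removed snippet — the event frequencies, contingency matrix, and labeled points below are illustrative — might look like:

{% highlight scala %}
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.{SparkConf, SparkContext}

object HypothesisTestingExample {

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("HypothesisTestingExample"))

    // a vector composed of the frequencies of events (illustrative values)
    val vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)

    // compute the goodness of fit; if a second vector to test against is not supplied,
    // the test runs against a uniform distribution
    val goodnessOfFitTestResult = Statistics.chiSqTest(vec)
    println(goodnessOfFitTestResult) // p-value, degrees of freedom, test statistic, ...

    // a contingency matrix in column-major order (illustrative values)
    val mat = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))

    // conduct Pearson's independence test on the input contingency matrix
    val independenceTestResult = Statistics.chiSqTest(mat)
    println(independenceTestResult)

    // (feature, label) pairs (illustrative values)
    val obs = sc.parallelize(Seq(
      LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
      LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
      LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))))

    // one ChiSqTestResult per feature, each tested against the label
    val featureTestResults = Statistics.chiSqTest(obs)
    featureTestResults.zipWithIndex.foreach { case (result, i) =>
      println(s"Column ${i + 1}:\n$result")
    }

    sc.stop()
  }
}
{% endhighlight %}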
@@ -340,44 +180,7 @@ hypothesis tests.
 
 Refer to the [`ChiSqTestResult` Java docs](api/java/org/apache/spark/mllib/stat/test/ChiSqTestResult.html) for details on the API.
 
-{% highlight java %}
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.*;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.stat.Statistics;
-import org.apache.spark.mllib.stat.test.ChiSqTestResult;
-
-JavaSparkContext jsc = ...
-
-Vector vec = ... // a vector composed of the frequencies of events
-
-// compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
-// the test runs against a uniform distribution.
-ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
-// summary of the test including the p-value, degrees of freedom, test statistic, the method used,
-// and the null hypothesis.
-System.out.println(goodnessOfFitTestResult);
-
-Matrix mat = ... // a contingency matrix
-
-// conduct Pearson's independence test on the input contingency matrix
-ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
-// summary of the test including the p-value, degrees of freedom...
-System.out.println(independenceTestResult);
-
-JavaRDD obs = ... // an RDD of labeled points
-
-// The contingency table is constructed from the raw (feature, label) pairs and used to conduct
-// the independence test. Returns an array containing the ChiSquaredTestResult for every feature
-// against the label.
-ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());
-int i = 1;
-for (ChiSqTestResult result : featureTestResults) {
-  System.out.println("Column " + i + ":");
-  System.out.println(result); // summary of the test
-  i++;
-}
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaHypothesisTestingExample.java %}
@@ -389,39 +192,7 @@ hypothesis tests. Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API. -{% highlight python %} -from pyspark import SparkContext -from pyspark.mllib.linalg import Vectors, Matrices -from pyspark.mllib.regresssion import LabeledPoint -from pyspark.mllib.stat import Statistics - -sc = SparkContext() - -vec = Vectors.dense(...) # a vector composed of the frequencies of events - -# compute the goodness of fit. If a second vector to test against is not supplied as a parameter, -# the test runs against a uniform distribution. -goodnessOfFitTestResult = Statistics.chiSqTest(vec) -print(goodnessOfFitTestResult) # summary of the test including the p-value, degrees of freedom, - # test statistic, the method used, and the null hypothesis. - -mat = Matrices.dense(...) # a contingency matrix - -# conduct Pearson's independence test on the input contingency matrix -independenceTestResult = Statistics.chiSqTest(mat) -print(independenceTestResult) # summary of the test including the p-value, degrees of freedom... - -obs = sc.parallelize(...) # LabeledPoint(feature, label) . - -# The contingency table is constructed from an RDD of LabeledPoint and used to conduct -# the independence test. Returns an array containing the ChiSquaredTestResult for every feature -# against the label. -featureTestResults = Statistics.chiSqTest(obs) - -for i, result in enumerate(featureTestResults): - print("Column $d:" % (i + 1)) - print(result) -{% endhighlight %} +{% include_example python/ml/hypothesis_testing_example.py %} @@ -443,21 +214,7 @@ and interpret the hypothesis tests. Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.stat.Statistics - -val data: RDD[Double] = ... // an RDD of sample data - -// run a KS test for the sample versus a standard normal distribution -val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1) -println(testResult) // summary of the test including the p-value, test statistic, - // and null hypothesis - // if our p-value indicates significance, we can reject the null hypothesis - -// perform a KS test using a cumulative distribution function of our making -val myCDF: Double => Double = ... -val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF) -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/ml/HypothesisTestingKolmogorovSmirnovTestExample.scala %}
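`HypothesisTestingKolmogorovSmirnovTestExample.scala` is not shown in these hunks either; a sketch completing the removed snippet, with illustrative sample data and a simple user-defined CDF (here the CDF of the uniform distribution on [0, 1]), could be:

{% highlight scala %}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.{SparkConf, SparkContext}

object HypothesisTestingKolmogorovSmirnovTestExample {

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("KolmogorovSmirnovTestExample"))

    // an RDD of sample data (illustrative values)
    val data = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25))

    // run a KS test for the sample versus a standard normal distribution
    val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
    println(testResult) // p-value, test statistic, and null hypothesis;
                        // if the p-value indicates significance, we can reject the null hypothesis

    // perform a KS test using a cumulative distribution function of our making
    val myCDF: Double => Double = x => math.min(math.max(x, 0.0), 1.0)
    val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
    println(testResult2)

    sc.stop()
  }
}
{% endhighlight %}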
@@ -467,23 +224,7 @@ and interpret the hypothesis tests. Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API. -{% highlight java %} -import java.util.Arrays; - -import org.apache.spark.api.java.JavaDoubleRDD; -import org.apache.spark.api.java.JavaSparkContext; - -import org.apache.spark.mllib.stat.Statistics; -import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult; - -JavaSparkContext jsc = ... -JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.2, 1.0, ...)); -KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); -// summary of the test including the p-value, test statistic, -// and null hypothesis -// if our p-value indicates significance, we can reject the null hypothesis -System.out.println(testResult); -{% endhighlight %} +{% include_example java/org/apache/spark/examples/ml/JavaHypothesisTestingKolmogorovSmirnovTestExample.java %}
@@ -493,19 +234,7 @@ and interpret the hypothesis tests. Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API. -{% highlight python %} -from pyspark.mllib.stat import Statistics - -parallelData = sc.parallelize([1.0, 2.0, ... ]) - -# run a KS test for the sample versus a standard normal distribution -testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1) -print(testResult) # summary of the test including the p-value, test statistic, - # and null hypothesis - # if our p-value indicates significance, we can reject the null hypothesis -# Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with -# a lambda to calculate the CDF is not made available in the Python API -{% endhighlight %} +{% include_example python/ml/hypothesis_testing_kolmogorov_smirnov_test_example.py %}
@@ -550,18 +279,7 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`. Refer to the [`RandomRDDs` Scala docs](api/scala/index.html#org.apache.spark.mllib.random.RandomRDDs) for details on the API. -{% highlight scala %} -import org.apache.spark.SparkContext -import org.apache.spark.mllib.random.RandomRDDs._ - -val sc: SparkContext = ... - -// Generate a random double RDD that contains 1 million i.i.d. values drawn from the -// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. -val u = normalRDD(sc, 1000000L, 10) -// Apply a transform to get a random double RDD following `N(1, 4)`. -val v = u.map(x => 1.0 + 2.0 * x) -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/ml/RandomDataGenerationExample.scala %}
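`RandomDataGenerationExample.scala` is not shown in these hunks; the removed snippet already runs once wrapped in an object, so a sketch is essentially (the mean/variance printout is added for illustration):

{% highlight scala %}
import org.apache.spark.mllib.random.RandomRDDs._
import org.apache.spark.{SparkConf, SparkContext}

object RandomDataGenerationExample {

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("RandomDataGenerationExample"))

    // Generate a random double RDD that contains 1 million i.i.d. values drawn from the
    // standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
    val u = normalRDD(sc, 1000000L, 10)
    // Apply a transform to get a random double RDD following `N(1, 4)`.
    val v = u.map(x => 1.0 + 2.0 * x)

    // should be close to 1.0 and 4.0 respectively
    println(s"mean: ${v.mean()}, variance: ${v.variance()}")

    sc.stop()
  }
}
{% endhighlight %}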
@@ -572,24 +290,7 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`. Refer to the [`RandomRDDs` Java docs](api/java/org/apache/spark/mllib/random/RandomRDDs) for details on the API. -{% highlight java %} -import org.apache.spark.SparkContext; -import org.apache.spark.api.JavaDoubleRDD; -import static org.apache.spark.mllib.random.RandomRDDs.*; - -JavaSparkContext jsc = ... - -// Generate a random double RDD that contains 1 million i.i.d. values drawn from the -// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. -JavaDoubleRDD u = normalJavaRDD(jsc, 1000000L, 10); -// Apply a transform to get a random double RDD following `N(1, 4)`. -JavaDoubleRDD v = u.map( - new Function() { - public Double call(Double x) { - return 1.0 + 2.0 * x; - } - }); -{% endhighlight %} +{% include_example java/org/apache/spark/examples/ml/JavaRandomDataGenerationExample.java %}
@@ -600,17 +301,7 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`. Refer to the [`RandomRDDs` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.random.RandomRDDs) for more details on the API. -{% highlight python %} -from pyspark.mllib.random import RandomRDDs - -sc = ... # SparkContext - -# Generate a random double RDD that contains 1 million i.i.d. values drawn from the -# standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. -u = RandomRDDs.normalRDD(sc, 1000000L, 10) -# Apply a transform to get a random double RDD following `N(1, 4)`. -v = u.map(lambda x: 1.0 + 2.0 * x) -{% endhighlight %} +{% include_example python/ml/random_data_generation_example.py %}
@@ -632,21 +323,7 @@ to do so. Refer to the [`KernelDensity` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.KernelDensity) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.stat.KernelDensity -import org.apache.spark.rdd.RDD - -val data: RDD[Double] = ... // an RDD of sample data - -// Construct the density estimator with the sample data and a standard deviation for the Gaussian -// kernels -val kd = new KernelDensity() - .setSample(data) - .setBandwidth(3.0) - -// Find density estimates for the given values -val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/ml/KernelDensityEstimationExample.scala %}
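`KernelDensityEstimationExample.scala` is not shown in these hunks; a sketch completing the removed snippet with illustrative sample data:

{% highlight scala %}
import org.apache.spark.mllib.stat.KernelDensity
import org.apache.spark.{SparkConf, SparkContext}

object KernelDensityEstimationExample {

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("KernelDensityEstimationExample"))

    // an RDD of sample data (illustrative values)
    val data = sc.parallelize(Seq(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0))

    // Construct the density estimator with the sample data and a standard deviation
    // for the Gaussian kernels
    val kd = new KernelDensity()
      .setSample(data)
      .setBandwidth(3.0)

    // Find density estimates for the given values
    val densities = kd.estimate(Array(-1.0, 2.0, 5.0))
    println(densities.mkString(", "))

    sc.stop()
  }
}
{% endhighlight %}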
@@ -656,21 +333,7 @@ to do so. Refer to the [`KernelDensity` Java docs](api/java/org/apache/spark/mllib/stat/KernelDensity.html) for details on the API. -{% highlight java %} -import org.apache.spark.mllib.stat.KernelDensity; -import org.apache.spark.rdd.RDD; - -RDD data = ... // an RDD of sample data - -// Construct the density estimator with the sample data and a standard deviation for the Gaussian -// kernels -KernelDensity kd = new KernelDensity() - .setSample(data) - .setBandwidth(3.0); - -// Find density estimates for the given values -double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0}); -{% endhighlight %} +{% include_example java/org/apache/spark/examples/ml/JavaKernelDensityEstimationExample.java %}
@@ -680,20 +343,7 @@ to do so. Refer to the [`KernelDensity` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.KernelDensity) for more details on the API. -{% highlight python %} -from pyspark.mllib.stat import KernelDensity - -data = ... # an RDD of sample data - -# Construct the density estimator with the sample data and a standard deviation for the Gaussian -# kernels -kd = KernelDensity() -kd.setSample(data) -kd.setBandwidth(3.0) - -# Find density estimates for the given values -densities = kd.estimate([-1.0, 2.0, 5.0]) -{% endhighlight %} +{% include_example python/ml/kernel_density_estimation_example.py %}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCorrelationsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCorrelationsExample.java
new file mode 100644
index 0000000000000..16eded92832aa
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaCorrelationsExample.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaDoubleRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.*;
+import org.apache.spark.mllib.stat.Statistics;
+// $example off$
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.mllib.linalg.Vectors;
+import java.util.Arrays;
+
+
+public class JavaCorrelationsExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample").setMaster("local[*]");
+
+    // $example on$
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+
+    JavaDoubleRDD seriesX = jsc.parallelizeDoubles(Arrays.asList(new Double[]{1.0, 2.0, 3.0, 3.0, 5.0})); // a series
+    // seriesY must have the same number of partitions and cardinality as seriesX
+    JavaDoubleRDD seriesY = jsc.parallelizeDoubles(Arrays.asList(new Double[]{11.0, 22.0, 33.0, 33.0, 555.0}));
+
+    // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
+    // method is not specified, Pearson's method will be used by default.
+    Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
+    System.out.println("correlation is: " + correlation);
+
+    Vector v1 = Vectors.dense(1.0, 10.0, 100.0);
+    Vector v2 = Vectors.dense(2.0, 20.0, 200.0);
+    Vector v3 = Vectors.dense(5.0, 33.0, 366.0);
+    // note that each Vector is a row and not a column
+    JavaRDD<Vector> data = jsc.parallelize(Arrays.asList(v1, v2, v3));
+
+    // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
+    // If a method is not specified, Pearson's method will be used by default.
+    Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
+    System.out.println(correlMatrix.toString());
+    // $example off$
+
+    jsc.stop();
+  }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingExample.java
new file mode 100644
index 0000000000000..d87366c2bd66c
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingExample.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.*;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.stat.Statistics;
+import org.apache.spark.mllib.stat.test.ChiSqTestResult;
+// $example off$
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.mllib.linalg.Vectors;
+import java.util.Arrays;
+
+
+public class JavaHypothesisTestingExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    // a vector composed of the frequencies of events (values are illustrative)
+    Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25);
+
+    // compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
+    // the test runs against a uniform distribution.
+    ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
+    // summary of the test including the p-value, degrees of freedom, test statistic, the method used,
+    // and the null hypothesis.
+    System.out.println(goodnessOfFitTestResult);
+
+    // a contingency matrix, stored in column-major order (values are illustrative)
+    Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0});
+
+    // conduct Pearson's independence test on the input contingency matrix
+    ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
+    // summary of the test including the p-value, degrees of freedom...
+    System.out.println(independenceTestResult);
+
+    // an RDD of labeled points (values are illustrative)
+    JavaRDD<LabeledPoint> obs = jsc.parallelize(Arrays.asList(
+      new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
+      new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
+      new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))));
+
+    // The contingency table is constructed from the raw (feature, label) pairs and used to conduct
+    // the independence test. Returns an array containing the ChiSquaredTestResult for every feature
+    // against the label.
+    ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());
+    int i = 1;
+    for (ChiSqTestResult result : featureTestResults) {
+      System.out.println("Column " + i + ":");
+      System.out.println(result); // summary of the test
+      i++;
+    }
+    // $example off$
+
+    jsc.stop();
+  }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingKolmogorovSmirnovTestExample.java
new file mode 100644
index 0000000000000..9ac0d1091e1f0
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingKolmogorovSmirnovTestExample.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaDoubleRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+
+import org.apache.spark.mllib.stat.Statistics;
+import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult;
+// $example off$
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.mllib.linalg.Vectors;
+
+
+public class JavaHypothesisTestingKolmogorovSmirnovTestExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    // sample data (values are illustrative)
+    JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.2, 1.0, 0.5, 0.3, 0.7));
+    KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0);
+    // summary of the test including the p-value, test statistic,
+    // and null hypothesis
+    // if our p-value indicates significance, we can reject the null hypothesis
+    System.out.println(testResult);
+    // $example off$
+
+    jsc.stop();
+  }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaKernelDensityEstimationExample.java
new file mode 100644
index 0000000000000..c3fc9a804f3c2
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaKernelDensityEstimationExample.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.stat.KernelDensity;
+// $example off$
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.mllib.linalg.Vectors;
+
+
+public class JavaKernelDensityEstimationExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    // an RDD of sample data (values are illustrative)
+    JavaRDD<Double> data = jsc.parallelize(
+      Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0));
+
+    // Construct the density estimator with the sample data and a standard deviation for the Gaussian
+    // kernels
+    KernelDensity kd = new KernelDensity()
+      .setSample(data)
+      .setBandwidth(3.0);
+
+    // Find density estimates for the given values
+    double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0});
+    // $example off$
+
+    jsc.stop();
+  }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomDataGenerationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomDataGenerationExample.java
new file mode 100644
index 0000000000000..c9e7e999462a5
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomDataGenerationExample.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import org.apache.spark.api.java.JavaDoubleRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.function.Function;
+import static org.apache.spark.mllib.random.RandomRDDs.*;
+// $example off$
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.mllib.linalg.Vectors;
+
+
+public class JavaRandomDataGenerationExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaRandomDataGenerationExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    // Generate a random double RDD that contains 1 million i.i.d. values drawn from the
+    // standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
+    JavaDoubleRDD u = normalJavaRDD(jsc, 1000000L, 10);
+    // Apply a transform to get a random double RDD following `N(1, 4)`.
+    JavaRDD<Double> v = u.map(
+      new Function<Double, Double>() {
+        public Double call(Double x) {
+          return 1.0 + 2.0 * x;
+        }
+      });
+    // $example off$
+
+    jsc.stop();
+  }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStratifiedSamplingExample.java
new file mode 100644
index 0000000000000..7df6afafe05f3
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStratifiedSamplingExample.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+import scala.Tuple2;
+
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+// $example off$
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.mllib.linalg.Vectors;
+
+
+public class JavaStratifiedSamplingExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    // an RDD of any key value pairs (keys and values are illustrative)
+    JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(Arrays.asList(
+      new Tuple2<Integer, Character>(1, 'a'), new Tuple2<Integer, Character>(1, 'b'),
+      new Tuple2<Integer, Character>(2, 'c'), new Tuple2<Integer, Character>(2, 'd'),
+      new Tuple2<Integer, Character>(2, 'e'), new Tuple2<Integer, Character>(3, 'f')));
+    // specify the exact fraction desired from each key
+    Map<Integer, Double> fractions = new HashMap<Integer, Double>();
+    fractions.put(1, 0.1);
+    fractions.put(2, 0.6);
+    fractions.put(3, 0.3);
+
+    // Get an exact sample from each stratum
+    JavaPairRDD<Integer, Character> approxSample = data.sampleByKey(false, fractions);
+    JavaPairRDD<Integer, Character> exactSample = data.sampleByKeyExact(false, fractions);
+    // $example off$
+
+    jsc.stop();
+  }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java
index 8c8e3ab0ef143..56822c7e96801 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java
@@ -1,7 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.spark.examples.ml;
 
-/**
- * Created by quickmobile on 16-02-01.
- */
+// $example on$
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
+import org.apache.spark.mllib.stat.Statistics;
+// $example off$
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.mllib.linalg.Vectors;
+import java.util.Arrays;
+
+
 public class JavaSummaryStatisticsExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaSummaryStatisticsExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    Vector v1 = Vectors.dense(1.0, 10.0, 100.0);
+    Vector v2 = Vectors.dense(2.0, 20.0, 200.0);
+    Vector v3 = Vectors.dense(3.0, 30.0, 300.0);
+
+    JavaRDD<Vector> mat = jsc.parallelize(Arrays.asList(v1, v2, v3)); // an RDD of Vectors
+
+    // Compute column summary statistics.
+    MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
+    System.out.println(summary.mean()); // a dense vector containing the mean value for each column
+    System.out.println(summary.variance()); // column-wise variance
+    System.out.println(summary.numNonzeros()); // number of nonzeros in each column
+    // $example off$
+
+    jsc.stop();
+  }
 }
diff --git a/examples/src/main/python/ml/correlations_example.py b/examples/src/main/python/ml/correlations_example.py
new file mode 100644
index 0000000000000..35e089c6a0e7d
--- /dev/null
+++ b/examples/src/main/python/ml/correlations_example.py
@@ -0,0 +1,50 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+import numpy as np
+from pyspark.mllib.linalg import Vectors
+# $example on$
+from pyspark.mllib.stat import Statistics
+# $example off$
+
+if __name__ == "__main__":
+    # $example on$
+    sc = SparkContext(appName="CorrelationsExample")  # SparkContext
+
+    seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])  # a series
+    # seriesY must have the same number of partitions and cardinality as seriesX
+    seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0])
+
+    # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
+    # method is not specified, Pearson's method will be used by default.
+    print(Statistics.corr(seriesX, seriesY, method="pearson"))
+
+    v1 = np.array([1.0, 10.0, 100.0])
+    v2 = np.array([2.0, 20.0, 200.0])
+    v3 = np.array([5.0, 33.0, 366.0])
+    data = sc.parallelize([v1, v2, v3])  # an RDD of Vectors
+
+    # calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
+    # If a method is not specified, Pearson's method will be used by default.
+    print(Statistics.corr(data, method="pearson"))
+
+    # $example off$
+
+    sc.stop()
\ No newline at end of file
diff --git a/examples/src/main/python/ml/hypothesis_testing_example.py b/examples/src/main/python/ml/hypothesis_testing_example.py
new file mode 100644
index 0000000000000..afbf7bc4309f8
--- /dev/null
+++ b/examples/src/main/python/ml/hypothesis_testing_example.py
@@ -0,0 +1,66 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+import numpy as np
+from pyspark.mllib.linalg import Vectors
+# $example on$
+from pyspark import SparkContext
+from pyspark.mllib.linalg import Vectors, Matrices
+from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.stat import Statistics
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="HypothesisTestingExample")  # SparkContext
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    # a vector composed of the frequencies of events (values are illustrative)
+    vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)
+
+    # compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
+    # the test runs against a uniform distribution.
+    goodnessOfFitTestResult = Statistics.chiSqTest(vec)
+    print(goodnessOfFitTestResult)  # summary of the test including the p-value, degrees of freedom,
+                                    # test statistic, the method used, and the null hypothesis.
+
+    # a contingency matrix, stored in column-major order (values are illustrative)
+    mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])
+
+    # conduct Pearson's independence test on the input contingency matrix
+    independenceTestResult = Statistics.chiSqTest(mat)
+    print(independenceTestResult)  # summary of the test including the p-value, degrees of freedom...
+
+    # an RDD of LabeledPoint(label, features) pairs (values are illustrative)
+    obs = sc.parallelize([
+        LabeledPoint(1.0, [1.0, 0.0, 3.0]),
+        LabeledPoint(1.0, [1.0, 2.0, 0.0]),
+        LabeledPoint(-1.0, [-1.0, 0.0, -0.5])])
+
+    # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
+    # the independence test. Returns an array containing the ChiSquaredTestResult for every feature
+    # against the label.
+    featureTestResults = Statistics.chiSqTest(obs)
+
+    for i, result in enumerate(featureTestResults):
+        print("Column %d:" % (i + 1))
+        print(result)
+
+    # $example off$
+
+    sc.stop()
\ No newline at end of file
diff --git a/examples/src/main/python/ml/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/ml/hypothesis_testing_kolmogorov_smirnov_test_example.py
new file mode 100644
index 0000000000000..c4ee776e32fe8
--- /dev/null
+++ b/examples/src/main/python/ml/hypothesis_testing_kolmogorov_smirnov_test_example.py
@@ -0,0 +1,48 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+import numpy as np
+from pyspark.mllib.linalg import Vectors
+# $example on$
+from pyspark.mllib.stat import Statistics
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="HypothesisTestingKolmogorovSmirnovTestExample")  # SparkContext
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    # sample data (values are illustrative)
+    parallelData = sc.parallelize([0.1, 0.15, 0.2, 0.3, 0.25])
+
+    # run a KS test for the sample versus a standard normal distribution
+    testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1)
+    print(testResult)  # summary of the test including the p-value, test statistic,
+                       # and null hypothesis
+                       # if our p-value indicates significance, we can reject the null hypothesis
+    # Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with
+    # a lambda to calculate the CDF is not made available in the Python API
+
+    # $example off$
+
+    sc.stop()
\ No newline at end of file
diff --git a/examples/src/main/python/ml/kernel_density_estimation_example.py b/examples/src/main/python/ml/kernel_density_estimation_example.py
new file mode 100644
index 0000000000000..c71b96b30a771
--- /dev/null
+++ b/examples/src/main/python/ml/kernel_density_estimation_example.py
@@ -0,0 +1,48 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+import numpy as np
+from pyspark.mllib.linalg import Vectors
+# $example on$
+from pyspark.mllib.stat import KernelDensity
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="KernelDensityEstimationExample")  # SparkContext
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    # an RDD of sample data (values are illustrative)
+    data = sc.parallelize([1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0])
+
+    # Construct the density estimator with the sample data and a standard deviation for the Gaussian
+    # kernels
+    kd = KernelDensity()
+    kd.setSample(data)
+    kd.setBandwidth(3.0)
+
+    # Find density estimates for the given values
+    densities = kd.estimate([-1.0, 2.0, 5.0])
+    # $example off$
+
+    sc.stop()
\ No newline at end of file
diff --git a/examples/src/main/python/ml/random_data_generation_example.py b/examples/src/main/python/ml/random_data_generation_example.py
new file mode 100644
index 0000000000000..d42e33d464aba
--- /dev/null
+++ b/examples/src/main/python/ml/random_data_generation_example.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +from __future__ import print_function + +from pyspark import SparkContext +from pyspark.sql import SQLContext +import numpy as np +from pyspark.mllib.linalg import Vectors +# $example on$ +from pyspark.mllib.random import RandomRDDs +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="RandomDataGenerationExample") # SparkContext + sqlContext = SQLContext(sc) + + # $example on$ + + # @note: todo + + # Generate a random double RDD that contains 1 million i.i.d. values drawn from the + # standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. + u = RandomRDDs.normalRDD(sc, 1000000L, 10) + # Apply a transform to get a random double RDD following `N(1, 4)`. + v = u.map(lambda x: 1.0 + 2.0 * x) + # $example off$ + + sc.stop() \ No newline at end of file diff --git a/examples/src/main/python/ml/stratified_sampling_example.py b/examples/src/main/python/ml/stratified_sampling_example.py new file mode 100644 index 0000000000000..0f6ede7335a85 --- /dev/null +++ b/examples/src/main/python/ml/stratified_sampling_example.py @@ -0,0 +1,39 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +from pyspark import SparkContext +from pyspark.sql import SQLContext +import numpy as np +from pyspark.mllib.linalg import Vectors +# $example on$ +from pyspark.mllib.stat import Statistics +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="StratifiedSamplingExample") # SparkContext + sqlContext = SQLContext(sc) + + # $example on$ + # data = ... # an RDD of any key value pairs + # fractions = ... # specify the exact fraction desired from each key as a dictionary + # + # approxSample = data.sampleByKey(False, fractions); + # $example off$ + + sc.stop() \ No newline at end of file diff --git a/examples/src/main/python/ml/summary_statistics_example.py b/examples/src/main/python/ml/summary_statistics_example.py new file mode 100644 index 0000000000000..fef018127451d --- /dev/null +++ b/examples/src/main/python/ml/summary_statistics_example.py @@ -0,0 +1,45 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+import numpy as np
+from pyspark.mllib.linalg import Vectors
+# $example on$
+from pyspark.mllib.stat import Statistics
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="SummaryStatisticsExample")  # SparkContext
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    v1 = np.array([1.0, 2.0, 3.0])
+    v2 = np.array([10.0, 20.0, 30.0])
+    v3 = np.array([100.0, 200.0, 300.0])
+    mat = sc.parallelize([v1, v2, v3])  # an RDD of Vectors
+
+    # Compute column summary statistics.
+    summary = Statistics.colStats(mat)
+    print(summary.mean())  # a dense vector containing the mean value for each column
+    print(summary.variance())  # column-wise variance
+    print(summary.numNonzeros())  # number of nonzeros in each column
+    # $example off$
+
+    sc.stop()
\ No newline at end of file
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationsExample.scala
new file mode 100644
index 0000000000000..179b87e21f9c9
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationsExample.scala
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.mllib.linalg._
+import org.apache.spark.mllib.stat.Statistics
+// $example off$
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object CorrelationsExample {
+
+  def main(args: Array[String]) {
+
+    val conf = new SparkConf().setAppName("CorrelationsExample").setMaster("local[*]")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    val seriesX: RDD[Double] = sc.parallelize(Array(1.0, 2.0, 3.0, 3.0, 5.0))  // a series
+    val seriesY: RDD[Double] = sc.parallelize(Array(11.0, 22.0, 33.0, 33.0, 555.0))  // must have the same number of partitions and cardinality as seriesX
+
+    // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
+    // method is not specified, Pearson's method will be used by default.
+    val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")
+    println(correlation)
+
+    val data: RDD[Vector] = sc.parallelize(Seq(Vectors.dense(1.0, 10.0, 100.0), Vectors.dense(2.0, 20.0, 200.0), Vectors.dense(5.0, 33.0, 366.0)))  // note that each Vector is a row and not a column
+
+    // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
+ // If a method is not specified, Pearson's method will be used by default. + val correlMatrix: Matrix = Statistics.corr(data, "pearson") + println(correlMatrix.toString) + // $example off$ + + sc.stop() + } +} +// scalastyle:on println + diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingExample.scala new file mode 100644 index 0000000000000..f67e291dcd2f7 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingExample.scala @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.SparkContext +import org.apache.spark.mllib.linalg._ +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.stat.Statistics._ +import org.apache.spark.rdd.RDD + +// $example off$ +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkConf, SparkContext} + +object HypothesisTestingExample { + + def main(args: Array[String]) { + + val conf = new SparkConf().setAppName("HypothesisTestingExample").setMaster("local[*]") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + + // $example on$ + // @note: todo + val vec: Vector = ... // a vector composed of the frequencies of events + + // compute the goodness of fit. If a second vector to test against is not supplied as a parameter, + // the test runs against a uniform distribution. + val goodnessOfFitTestResult = Statistics.chiSqTest(vec) + println(goodnessOfFitTestResult) // summary of the test including the p-value, degrees of freedom, + // test statistic, the method used, and the null hypothesis. + + val mat: Matrix = ... // a contingency matrix + + // conduct Pearson's independence test on the input contingency matrix + val independenceTestResult = Statistics.chiSqTest(mat) + println(independenceTestResult) // summary of the test including the p-value, degrees of freedom... + + val obs: RDD[LabeledPoint] = ... // (feature, label) pairs. + + // The contingency table is constructed from the raw (feature, label) pairs and used to conduct + // the independence test. Returns an array containing the ChiSquaredTestResult for every feature + // against the label. 
+ val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs) + var i = 1 + featureTestResults.foreach { result => + println(s"Column $i:\n$result") + i += 1 + } // summary of the test + + // $example off$ + + sc.stop() + } +} +// scalastyle:on println + diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingKolmogorovSmirnovTestExample.scala new file mode 100644 index 0000000000000..9c6e07f2242eb --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingKolmogorovSmirnovTestExample.scala @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.SparkContext +import org.apache.spark.mllib.linalg._ +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.rdd.RDD + +// $example off$ +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkConf, SparkContext} + +object HypothesisTestingKolmogorovSmirnovTestExample { + + def main(args: Array[String]) { + + val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample").setMaster("local[*]") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + + // $example on$ + // @note: todo + + val data: RDD[Double] = ... // an RDD of sample data + + // run a KS test for the sample versus a standard normal distribution + val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1) + println(testResult) // summary of the test including the p-value, test statistic, + // and null hypothesis + // if our p-value indicates significance, we can reject the null hypothesis + + // perform a KS test using a cumulative distribution function of our making + val myCDF: Double => Double = ... + val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF) + + // $example off$ + + sc.stop() + } +} +// scalastyle:on println + diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/KernelDensityEstimationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/KernelDensityEstimationExample.scala new file mode 100644 index 0000000000000..ae9ecc9d4183a --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/KernelDensityEstimationExample.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.mllib.stat.KernelDensity +import org.apache.spark.rdd.RDD +// $example off$ +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkConf, SparkContext} + +object KernelDensityEstimationExample { + + def main(args: Array[String]) { + + val conf = new SparkConf().setAppName("KernelDensityEstimationExample").setMaster("local[*]") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + + // $example on$ + + // @note: todo + + val data: RDD[Double] = ... // an RDD of sample data + + // Construct the density estimator with the sample data and a standard deviation for the Gaussian + // kernels + val kd = new KernelDensity() + .setSample(data) + .setBandwidth(3.0) + + // Find density estimates for the given values + val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) + // $example off$ + + sc.stop() + } +} +// scalastyle:on println + diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomDataGenerationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomDataGenerationExample.scala new file mode 100644 index 0000000000000..34f79a4f0d4dc --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomDataGenerationExample.scala @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.mllib.random.RandomRDDs._ + +// $example off$ +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkConf, SparkContext} + +object RandomDataGenerationExample { + + def main(args: Array[String]) { + + val conf = new SparkConf().setAppName("RandomDataGenerationExample").setMaster("local[*]") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + + // $example on$ + + // @note: todo + + // Generate a random double RDD that contains 1 million i.i.d. values drawn from the + // standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. 
+    val u = normalRDD(sc, 1000000L, 10)
+    // Apply a transform to get a random double RDD following `N(1, 4)`.
+    val v = u.map(x => 1.0 + 2.0 * x)
+
+    // $example off$
+
+    sc.stop()
+  }
+}
+// scalastyle:on println
+
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StratifiedSamplingExample.scala
new file mode 100644
index 0000000000000..7d5cf341f9d54
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/StratifiedSamplingExample.scala
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.{SparkConf, SparkContext}
+// $example off$
+import org.apache.spark.sql.SQLContext
+
+
+object StratifiedSamplingExample {
+
+  def main(args: Array[String]) {
+
+    val conf = new SparkConf().setAppName("StratifiedSamplingExample").setMaster("local[*]")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    // @note: sampleByKey and sampleByKeyExact come from org.apache.spark.rdd.PairRDDFunctions, available implicitly on RDDs of pairs
+    val data = sc.parallelize(Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')))  // an RDD[(K, V)] of any key-value pairs
+    val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3)  // specify the exact fraction desired from each key; fractions must lie in [0, 1]
+
+    // Get an exact sample from each stratum
+    val approxSample = data.sampleByKey(withReplacement = false, fractions)
+    val exactSample = data.sampleByKeyExact(withReplacement = false, fractions)
+
+    println(approxSample.toString)
+    println(exactSample.toString)
+    // $example off$
+
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala
index 68b9c19914897..eb81db6b58321 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala
@@ -29,7 +29,7 @@ object SummaryStatisticsExample {

   def main(args: Array[String]) {

-    val conf = new SparkConf().setAppName("MultivariateStatisticalSummaryExample")
+    val conf = new SparkConf().setAppName("SummaryStatisticsExample").setMaster("local[*]")
     val sc = new SparkContext(conf)
     val sqlContext = new SQLContext(sc)

From 2058b16bc3c097b204bb6c94226e1a8f85017679 Mon Sep 17 00:00:00 2001
From: Xin Ren
Date: Wed, 3 Feb 2016 15:25:59 -0800
Subject: [PATCH 04/26] [SPARK-13019] move new files into mllib folder

---
 docs/mllib-statistics.md                      | 42 +++++++++----------
 examples/src/__init__.py                      |  1 +
 examples/src/main/__init__.py                 |  1 +
.../JavaCorrelationsExample.java | 2 +- .../JavaHypothesisTestingExample.java | 2 +- ...isTestingKolmogorovSmirnovTestExample.java | 2 +- .../JavaKernelDensityEstimationExample.java | 2 +- .../JavaRandomDataGenerationExample.java | 2 +- .../JavaStratifiedSamplingExample.java | 2 +- .../JavaSummaryStatisticsExample.java | 2 +- examples/src/main/python/__init__.py | 1 + examples/src/main/python/mllib/__init__.py | 1 + .../{ml => mllib}/correlations_example.py | 0 .../hypothesis_testing_example.py | 0 ...testing_kolmogorov_smirnov_test_example.py | 0 .../kernel_density_estimation_example.py | 0 .../random_data_generation_example.py | 0 .../stratified_sampling_example.py | 0 .../summary_statistics_example.py | 0 .../{ml => mllib}/CorrelationsExample.scala | 2 +- .../HypothesisTestingExample.scala | 4 +- ...sTestingKolmogorovSmirnovTestExample.scala | 5 +-- .../KernelDensityEstimationExample.scala | 2 +- .../RandomDataGenerationExample.scala | 2 +- .../StratifiedSamplingExample.scala | 2 +- .../SummaryStatisticsExample.scala | 2 +- 26 files changed, 39 insertions(+), 40 deletions(-) create mode 100644 examples/src/__init__.py create mode 100644 examples/src/main/__init__.py rename examples/src/main/java/org/apache/spark/examples/{ml => mllib}/JavaCorrelationsExample.java (98%) rename examples/src/main/java/org/apache/spark/examples/{ml => mllib}/JavaHypothesisTestingExample.java (98%) rename examples/src/main/java/org/apache/spark/examples/{ml => mllib}/JavaHypothesisTestingKolmogorovSmirnovTestExample.java (98%) rename examples/src/main/java/org/apache/spark/examples/{ml => mllib}/JavaKernelDensityEstimationExample.java (97%) rename examples/src/main/java/org/apache/spark/examples/{ml => mllib}/JavaRandomDataGenerationExample.java (98%) rename examples/src/main/java/org/apache/spark/examples/{ml => mllib}/JavaStratifiedSamplingExample.java (97%) rename examples/src/main/java/org/apache/spark/examples/{ml => mllib}/JavaSummaryStatisticsExample.java (98%) create mode 100644 examples/src/main/python/__init__.py create mode 100644 examples/src/main/python/mllib/__init__.py rename examples/src/main/python/{ml => mllib}/correlations_example.py (100%) rename examples/src/main/python/{ml => mllib}/hypothesis_testing_example.py (100%) rename examples/src/main/python/{ml => mllib}/hypothesis_testing_kolmogorov_smirnov_test_example.py (100%) rename examples/src/main/python/{ml => mllib}/kernel_density_estimation_example.py (100%) rename examples/src/main/python/{ml => mllib}/random_data_generation_example.py (100%) rename examples/src/main/python/{ml => mllib}/stratified_sampling_example.py (100%) rename examples/src/main/python/{ml => mllib}/summary_statistics_example.py (100%) rename examples/src/main/scala/org/apache/spark/examples/{ml => mllib}/CorrelationsExample.scala (98%) rename examples/src/main/scala/org/apache/spark/examples/{ml => mllib}/HypothesisTestingExample.scala (95%) rename examples/src/main/scala/org/apache/spark/examples/{ml => mllib}/HypothesisTestingKolmogorovSmirnovTestExample.scala (92%) rename examples/src/main/scala/org/apache/spark/examples/{ml => mllib}/KernelDensityEstimationExample.scala (97%) rename examples/src/main/scala/org/apache/spark/examples/{ml => mllib}/RandomDataGenerationExample.scala (97%) rename examples/src/main/scala/org/apache/spark/examples/{ml => mllib}/StratifiedSamplingExample.scala (97%) rename examples/src/main/scala/org/apache/spark/examples/{ml => mllib}/SummaryStatisticsExample.scala (97%) diff --git a/docs/mllib-statistics.md 
b/docs/mllib-statistics.md index 487ae12f3b6de..bbbbd87a35610 100644 --- a/docs/mllib-statistics.md +++ b/docs/mllib-statistics.md @@ -40,7 +40,7 @@ total count. Refer to the [`MultivariateStatisticalSummary` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary) for details on the API. -{% include_example scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala %} +{% include_example scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala %}
@@ -52,7 +52,7 @@ total count. Refer to the [`MultivariateStatisticalSummary` Java docs](api/java/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.html) for details on the API. -{% include_example java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java %}
@@ -63,7 +63,7 @@ total count. Refer to the [`MultivariateStatisticalSummary` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.MultivariateStatisticalSummary) for more details on the API. -{% include_example python/ml/summary_statistics_example.py %} +{% include_example python/mllib/summary_statistics_example.py %}
@@ -82,7 +82,7 @@ an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` resp Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API. -{% include_example scala/org/apache/spark/examples/ml/CorrelationsExample.scala %} +{% include_example scala/org/apache/spark/examples/mllib/CorrelationsExample.scala %}
@@ -92,7 +92,7 @@ a `JavaRDD`, the output will be a `Double` or the correlation `Matrix` r Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API. -{% include_example java/org/apache/spark/examples/ml/JavaCorrelationsExample.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java %}
@@ -102,7 +102,7 @@ an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` resp Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API. -{% include_example python/ml/correlations_example.py %} +{% include_example python/mllib/correlations_example.py %}
@@ -128,7 +128,7 @@ fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample size, whereas sampling with replacement requires two additional passes. -{% include_example scala/org/apache/spark/examples/ml/StratifiedSamplingExample.scala %} +{% include_example scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala %}
@@ -138,7 +138,7 @@ fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample size, whereas sampling with replacement requires two additional passes. -{% include_example java/org/apache/spark/examples/ml/JavaStratifiedSamplingExample.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java %}
[`sampleByKey()`](api/python/pyspark.html#pyspark.RDD.sampleByKey) allows users to @@ -148,7 +148,7 @@ set of keys. *Note:* `sampleByKeyExact()` is currently not supported in Python. -{% include_example python/ml/stratified_sampling_example.py %} +{% include_example python/mllib/stratified_sampling_example.py %}
@@ -170,7 +170,7 @@ independence tests. run Pearson's chi-squared tests. The following example demonstrates how to run and interpret hypothesis tests. -{% include_example scala/org/apache/spark/examples/ml/HypothesisTestingExample.scala %} +{% include_example scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala %}
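At this stage of the series the Scala file that include points at still carries `...` placeholders (PATCH 06 below comments it out wholesale to keep the build green). For reference, a minimal runnable sketch of the chi-squared flow it is meant to show — the frequencies, contingency matrix, and labeled points are illustrative values only, not taken from the patch, and a live SparkContext `sc` is assumed:

{% highlight scala %}
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics

// a vector of event frequencies (illustrative values)
val vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)
// goodness of fit; with no second vector supplied, the test runs against the uniform distribution
val goodnessOfFitTestResult = Statistics.chiSqTest(vec)
println(goodnessOfFitTestResult)

// a 3x2 contingency matrix in column-major order (illustrative values)
val mat = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
// conduct Pearson's independence test on the contingency matrix
val independenceTestResult = Statistics.chiSqTest(mat)
println(independenceTestResult)

// labeled points (illustrative); chiSqTest returns one ChiSqTestResult per feature
val obs = sc.parallelize(Seq(
  LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
  LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
  LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))))
val featureTestResults = Statistics.chiSqTest(obs)
featureTestResults.zipWithIndex.foreach { case (result, i) =>
  println(s"Column ${i + 1}:\n$result")
}
{% endhighlight %}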
@@ -180,7 +180,7 @@ hypothesis tests. Refer to the [`ChiSqTestResult` Java docs](api/java/org/apache/spark/mllib/stat/test/ChiSqTestResult.html) for details on the API. -{% include_example java/org/apache/spark/examples/ml/JavaHypothesisTestingExample.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java %} {% endhighlight %}
@@ -192,7 +192,7 @@ hypothesis tests. Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API. -{% include_example python/ml/hypothesis_testing_example.py %} +{% include_example python/mllib/hypothesis_testing_example.py %} @@ -214,7 +214,7 @@ and interpret the hypothesis tests. Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API. -{% include_example scala/org/apache/spark/examples/ml/HypothesisTestingKolmogorovSmirnovTestExample.scala %} +{% include_example scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala %}
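The Scala Kolmogorov-Smirnov example file is likewise still a placeholder here, so a minimal sketch of what the include is expected to resolve to, assuming a live SparkContext `sc`; the sample values and the CDF lookup table are illustrative only (a `Map[Double, Double]` can serve as the user-defined CDF because `Map` is itself a `Double => Double` function):

{% highlight scala %}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25))  // illustrative sample

// run a KS test for the sample versus a standard normal distribution
val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
println(testResult)  // p-value, test statistic, and null hypothesis

// run a KS test against a user-supplied CDF, here an illustrative lookup table
// covering exactly the sample values above
val myCDF: Double => Double = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1)
val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
println(testResult2)
{% endhighlight %}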
@@ -224,7 +224,7 @@ and interpret the hypothesis tests. Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API. -{% include_example java/org/apache/spark/examples/ml/JavaHypothesisTestingKolmogorovSmirnovTestExample.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java %}
@@ -234,7 +234,7 @@ and interpret the hypothesis tests. Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API. -{% include_example python/ml/hypothesis_testing_kolmogorov_smirnov_test_example.py %} +{% include_example python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py %}
@@ -279,7 +279,7 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`. Refer to the [`RandomRDDs` Scala docs](api/scala/index.html#org.apache.spark.mllib.random.RandomRDDs) for details on the API. -{% include_example scala/org/apache/spark/examples/ml/RandomDataGenerationExample.scala %} +{% include_example scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala %}
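Since PATCH 06 below comments this snippet out of RandomDataGenerationExample.scala, a compilable sketch for reference, assuming `sc` is in scope (the mean/variance printout is an added sanity check, not part of the patch):

{% highlight scala %}
import org.apache.spark.mllib.random.RandomRDDs.normalRDD

// Generate a random double RDD of 1 million i.i.d. values drawn from N(0, 1),
// evenly distributed across 10 partitions.
val u = normalRDD(sc, 1000000L, 10)
// Apply a shift-and-scale transform to get a random double RDD following N(1, 4).
val v = u.map(x => 1.0 + 2.0 * x)
println(s"mean ~ ${v.mean()}, variance ~ ${v.variance()}")
{% endhighlight %}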
@@ -290,7 +290,7 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`. Refer to the [`RandomRDDs` Java docs](api/java/org/apache/spark/mllib/random/RandomRDDs) for details on the API. -{% include_example java/org/apache/spark/examples/ml/JavaRandomDataGenerationExample.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java %}
@@ -301,7 +301,7 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`. Refer to the [`RandomRDDs` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.random.RandomRDDs) for more details on the API. -{% include_example python/ml/random_data_generation_example.py %} +{% include_example python/mllib/random_data_generation_example.py %}
@@ -323,7 +323,7 @@ to do so. Refer to the [`KernelDensity` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.KernelDensity) for details on the API. -{% include_example scala/org/apache/spark/examples/ml/KernelDensityEstimationExample.scala %} +{% include_example scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala %}
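The KDE example file also still reads `data = ...` at this point in the series; a minimal sketch under the same assumptions (a live `sc`; the sample values are illustrative):

{% highlight scala %}
import org.apache.spark.mllib.stat.KernelDensity
import org.apache.spark.rdd.RDD

val data: RDD[Double] = sc.parallelize(Seq(1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0))  // illustrative sample

// Construct the density estimator with the sample data and a standard deviation
// of 3.0 for the Gaussian kernels
val kd = new KernelDensity()
  .setSample(data)
  .setBandwidth(3.0)

// Find density estimates for the given evaluation points
val densities = kd.estimate(Array(-1.0, 2.0, 5.0))
densities.foreach(println)
{% endhighlight %}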
@@ -333,7 +333,7 @@ to do so. Refer to the [`KernelDensity` Java docs](api/java/org/apache/spark/mllib/stat/KernelDensity.html) for details on the API. -{% include_example java/org/apache/spark/examples/ml/JavaKernelDensityEstimationExample.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java %}
@@ -343,7 +343,7 @@ to do so. Refer to the [`KernelDensity` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.KernelDensity) for more details on the API. -{% include_example python/ml/kernel_density_estimation_example.py %} +{% include_example python/mllib/kernel_density_estimation_example.py %}
diff --git a/examples/src/__init__.py b/examples/src/__init__.py new file mode 100644 index 0000000000000..31a6ebb6ea4e4 --- /dev/null +++ b/examples/src/__init__.py @@ -0,0 +1 @@ +__author__ = 'quickmobile' diff --git a/examples/src/main/__init__.py b/examples/src/main/__init__.py new file mode 100644 index 0000000000000..31a6ebb6ea4e4 --- /dev/null +++ b/examples/src/main/__init__.py @@ -0,0 +1 @@ +__author__ = 'quickmobile' diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCorrelationsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java similarity index 98% rename from examples/src/main/java/org/apache/spark/examples/ml/JavaCorrelationsExample.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java index 16eded92832aa..6fb1ee6365a27 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaCorrelationsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.examples.ml; +package org.apache.spark.examples.mllib; // $example on$ import org.apache.spark.api.java.JavaRDD; diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java similarity index 98% rename from examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingExample.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java index d87366c2bd66c..8faf7f48a525d 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.examples.ml; +package org.apache.spark.examples.mllib; // $example on$ import org.apache.spark.api.java.JavaRDD; diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java similarity index 98% rename from examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingKolmogorovSmirnovTestExample.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java index 9ac0d1091e1f0..02f91848884a8 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingKolmogorovSmirnovTestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.examples.ml; +package org.apache.spark.examples.mllib; // $example on$ import java.util.Arrays; diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java similarity index 97% rename from examples/src/main/java/org/apache/spark/examples/ml/JavaKernelDensityEstimationExample.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java index c3fc9a804f3c2..338a3fdf5ebc8 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaKernelDensityEstimationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.examples.ml; +package org.apache.spark.examples.mllib; // $example on$ import org.apache.spark.api.java.JavaRDD; diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomDataGenerationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java similarity index 98% rename from examples/src/main/java/org/apache/spark/examples/ml/JavaRandomDataGenerationExample.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java index c9e7e999462a5..f84966c076770 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomDataGenerationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.examples.ml; +package org.apache.spark.examples.mllib; // $example on$ import org.apache.spark.SparkContext; diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java similarity index 97% rename from examples/src/main/java/org/apache/spark/examples/ml/JavaStratifiedSamplingExample.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java index 7df6afafe05f3..6d11e97690413 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStratifiedSamplingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.examples.ml; +package org.apache.spark.examples.mllib; // $example on$ import java.util.Map; diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java similarity index 98% rename from examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java index 56822c7e96801..ed7f9637e7627 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.examples.ml; +package org.apache.spark.examples.mllib; // $example on$ import org.apache.spark.api.java.JavaRDD; diff --git a/examples/src/main/python/__init__.py b/examples/src/main/python/__init__.py new file mode 100644 index 0000000000000..31a6ebb6ea4e4 --- /dev/null +++ b/examples/src/main/python/__init__.py @@ -0,0 +1 @@ +__author__ = 'quickmobile' diff --git a/examples/src/main/python/mllib/__init__.py b/examples/src/main/python/mllib/__init__.py new file mode 100644 index 0000000000000..31a6ebb6ea4e4 --- /dev/null +++ b/examples/src/main/python/mllib/__init__.py @@ -0,0 +1 @@ +__author__ = 'quickmobile' diff --git a/examples/src/main/python/ml/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py similarity index 100% rename from examples/src/main/python/ml/correlations_example.py rename to examples/src/main/python/mllib/correlations_example.py diff --git a/examples/src/main/python/ml/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py similarity index 100% rename from examples/src/main/python/ml/hypothesis_testing_example.py rename to examples/src/main/python/mllib/hypothesis_testing_example.py diff --git a/examples/src/main/python/ml/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py similarity index 100% rename from examples/src/main/python/ml/hypothesis_testing_kolmogorov_smirnov_test_example.py rename to examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py diff --git a/examples/src/main/python/ml/kernel_density_estimation_example.py b/examples/src/main/python/mllib/kernel_density_estimation_example.py similarity index 100% rename from examples/src/main/python/ml/kernel_density_estimation_example.py rename to examples/src/main/python/mllib/kernel_density_estimation_example.py diff --git a/examples/src/main/python/ml/random_data_generation_example.py b/examples/src/main/python/mllib/random_data_generation_example.py similarity index 100% rename from examples/src/main/python/ml/random_data_generation_example.py rename to examples/src/main/python/mllib/random_data_generation_example.py diff --git a/examples/src/main/python/ml/stratified_sampling_example.py b/examples/src/main/python/mllib/stratified_sampling_example.py similarity index 100% rename from examples/src/main/python/ml/stratified_sampling_example.py rename to examples/src/main/python/mllib/stratified_sampling_example.py diff --git a/examples/src/main/python/ml/summary_statistics_example.py b/examples/src/main/python/mllib/summary_statistics_example.py similarity index 100% rename from examples/src/main/python/ml/summary_statistics_example.py rename to examples/src/main/python/mllib/summary_statistics_example.py diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala similarity index 98% rename from examples/src/main/scala/org/apache/spark/examples/ml/CorrelationsExample.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala index 179b87e21f9c9..aeb5f7f802e00 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala @@ -16,7 +16,7 @@ */ // scalastyle:off println -package org.apache.spark.examples.ml +package 
org.apache.spark.examples.mllib // $example on$ import org.apache.spark.mllib.linalg._ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala similarity index 95% rename from examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingExample.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala index f67e291dcd2f7..be7f09c32a0e8 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala @@ -16,13 +16,11 @@ */ // scalastyle:off println -package org.apache.spark.examples.ml +package org.apache.spark.examples.mllib // $example on$ -import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.stat.Statistics._ import org.apache.spark.rdd.RDD // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala similarity index 92% rename from examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingKolmogorovSmirnovTestExample.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala index 9c6e07f2242eb..37528e44b7cc6 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingKolmogorovSmirnovTestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala @@ -16,12 +16,9 @@ */ // scalastyle:off println -package org.apache.spark.examples.ml +package org.apache.spark.examples.mllib // $example on$ -import org.apache.spark.SparkContext -import org.apache.spark.mllib.linalg._ -import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/KernelDensityEstimationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala similarity index 97% rename from examples/src/main/scala/org/apache/spark/examples/ml/KernelDensityEstimationExample.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala index ae9ecc9d4183a..1326e187ba771 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/KernelDensityEstimationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala @@ -16,7 +16,7 @@ */ // scalastyle:off println -package org.apache.spark.examples.ml +package org.apache.spark.examples.mllib // $example on$ import org.apache.spark.mllib.stat.KernelDensity diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomDataGenerationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala similarity index 97% rename from examples/src/main/scala/org/apache/spark/examples/ml/RandomDataGenerationExample.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala index 34f79a4f0d4dc..5de6162ade9d0 100644 --- 
a/examples/src/main/scala/org/apache/spark/examples/ml/RandomDataGenerationExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala
@@ -16,7 +16,7 @@
  */

 // scalastyle:off println
-package org.apache.spark.examples.ml
+package org.apache.spark.examples.mllib

 // $example on$
 import org.apache.spark.mllib.random.RandomRDDs._
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala
similarity index 97%
rename from examples/src/main/scala/org/apache/spark/examples/ml/StratifiedSamplingExample.scala
rename to examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala
index 7d5cf341f9d54..c01047e784357 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/StratifiedSamplingExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala
@@ -16,7 +16,7 @@
  */

 // scalastyle:off println
-package org.apache.spark.examples.ml
+package org.apache.spark.examples.mllib

 // $example on$
 import org.apache.spark.{SparkConf, SparkContext}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala
similarity index 97%
rename from examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala
rename to examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala
index eb81db6b58321..8876dbfcdb863 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala
@@ -16,7 +16,7 @@
  */

 // scalastyle:off println
-package org.apache.spark.examples.ml
+package org.apache.spark.examples.mllib

 // $example on$
 import org.apache.spark.mllib.linalg.Vectors

From b3285423a625812f64968767b8063ca83ab58f3c Mon Sep 17 00:00:00 2001
From: Xin Ren
Date: Wed, 3 Feb 2016 15:29:04 -0800
Subject: [PATCH 05/26] [SPARK-13019] remove python init files

---
 examples/src/__init__.py                   | 1 -
 examples/src/main/__init__.py              | 1 -
 examples/src/main/python/__init__.py       | 1 -
 examples/src/main/python/mllib/__init__.py | 1 -
 4 files changed, 4 deletions(-)
 delete mode 100644 examples/src/__init__.py
 delete mode 100644 examples/src/main/__init__.py
 delete mode 100644 examples/src/main/python/__init__.py
 delete mode 100644 examples/src/main/python/mllib/__init__.py

diff --git a/examples/src/__init__.py b/examples/src/__init__.py
deleted file mode 100644
index 31a6ebb6ea4e4..0000000000000
--- a/examples/src/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-__author__ = 'quickmobile'
diff --git a/examples/src/main/__init__.py b/examples/src/main/__init__.py
deleted file mode 100644
index 31a6ebb6ea4e4..0000000000000
--- a/examples/src/main/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-__author__ = 'quickmobile'
diff --git a/examples/src/main/python/__init__.py b/examples/src/main/python/__init__.py
deleted file mode 100644
index 31a6ebb6ea4e4..0000000000000
--- a/examples/src/main/python/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-__author__ = 'quickmobile'
diff --git a/examples/src/main/python/mllib/__init__.py b/examples/src/main/python/mllib/__init__.py
deleted file mode 100644
index 31a6ebb6ea4e4..0000000000000
--- a/examples/src/main/python/mllib/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-__author__ = 'quickmobile'
From 12fda2be3033fb0477951c276366215ed2ed2736 Mon Sep 17 00:00:00 2001
From: Xin Ren
Date: Wed, 3 Feb 2016 15:39:48 -0800
Subject: [PATCH 06/26] [SPARK-13019] comment broken code to pass compile process

---
 .../mllib/JavaHypothesisTestingExample.java   |  4 +-
 ...isTestingKolmogorovSmirnovTestExample.java |  4 +-
 .../JavaKernelDensityEstimationExample.java   |  4 +-
 .../JavaRandomDataGenerationExample.java      |  4 +-
 .../mllib/hypothesis_testing_example.py       | 48 +++++++++----------
 ...testing_kolmogorov_smirnov_test_example.py | 18 +++----
 .../kernel_density_estimation_example.py      | 30 ++++++------
 .../mllib/random_data_generation_example.py   | 12 ++---
 .../mllib/HypothesisTestingExample.scala      |  7 +--
 ...sTestingKolmogorovSmirnovTestExample.scala |  6 +--
 .../KernelDensityEstimationExample.scala      |  6 +--
 .../mllib/RandomDataGenerationExample.scala   |  5 +-
 12 files changed, 70 insertions(+), 78 deletions(-)

diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java
index 8faf7f48a525d..023480252d833 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java
@@ -42,7 +42,7 @@ public static void main(String[] args) {

     // $example on$
     // @note: todo
-
+/*
     Vector vec = ... // a vector composed of the frequencies of events

     // compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
@@ -73,7 +73,7 @@ public static void main(String[] args) {
     }

     // $example off$
-
+*/
     jsc.stop();
   }
 }
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java
index 02f91848884a8..c34e66541eb7a 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java
@@ -41,7 +41,7 @@ public static void main(String[] args) {
     SQLContext sqlContext = new SQLContext(jsc);

     // $example on$
-
+/*
     // @note: todo

     JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.2, 1.0, ...));
@@ -52,7 +52,7 @@ public static void main(String[] args) {
     System.out.println(testResult);

     // $example off$
-
+*/
     jsc.stop();
   }
 }
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java
index 338a3fdf5ebc8..457ee0c5537c9 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java
@@ -36,7 +36,7 @@ public static void main(String[] args) {
     SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample");
     JavaSparkContext jsc = new JavaSparkContext(conf);
     SQLContext sqlContext = new SQLContext(jsc);
-
+/*
     // $example on$

     // @note: todo
@@ -52,7 +52,7 @@ public static void main(String[] args) {
     // Find density estimates for the given values
     double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0});
     // $example off$
-
+*/
     jsc.stop();
   }
 }
diff --git
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java
index f84966c076770..f27476ffa6179 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java
@@ -39,7 +39,7 @@ public static void main(String[] args) {
     SQLContext sqlContext = new SQLContext(jsc);

     // $example on$
-
+/*
     // @note: todo

     // Generate a random double RDD that contains 1 million i.i.d. values drawn from the
@@ -54,7 +54,7 @@ public Double call(Double x) {
     });

     // $example off$
-
+*/
     jsc.stop();
   }
 }
diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py
index afbf7bc4309f8..cee1ab6168475 100644
--- a/examples/src/main/python/mllib/hypothesis_testing_example.py
+++ b/examples/src/main/python/mllib/hypothesis_testing_example.py
@@ -36,30 +36,30 @@

     # @note: todo

-    vec = Vectors.dense(...)  # a vector composed of the frequencies of events
-
-    # compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
-    # the test runs against a uniform distribution.
-    goodnessOfFitTestResult = Statistics.chiSqTest(vec)
-    print(goodnessOfFitTestResult)  # summary of the test including the p-value, degrees of freedom,
-    # test statistic, the method used, and the null hypothesis.
-
-    mat = Matrices.dense(...)  # a contingency matrix
-
-    # conduct Pearson's independence test on the input contingency matrix
-    independenceTestResult = Statistics.chiSqTest(mat)
-    print(independenceTestResult)  # summary of the test including the p-value, degrees of freedom...
-
-    obs = sc.parallelize(...)  # LabeledPoint(feature, label)
-
-    # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
-    # the independence test. Returns an array containing the ChiSquaredTestResult for every feature
-    # against the label.
-    featureTestResults = Statistics.chiSqTest(obs)
-
-    for i, result in enumerate(featureTestResults):
-        print("Column %d:" % (i + 1))
-        print(result)
+    # vec = Vectors.dense(...)  # a vector composed of the frequencies of events
+    #
+    # # compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
+    # # the test runs against a uniform distribution.
+    # goodnessOfFitTestResult = Statistics.chiSqTest(vec)
+    # print(goodnessOfFitTestResult)  # summary of the test including the p-value, degrees of freedom,
+    # # test statistic, the method used, and the null hypothesis.
+    #
+    # mat = Matrices.dense(...)  # a contingency matrix
+    #
+    # # conduct Pearson's independence test on the input contingency matrix
+    # independenceTestResult = Statistics.chiSqTest(mat)
+    # print(independenceTestResult)  # summary of the test including the p-value, degrees of freedom...
+    #
+    # obs = sc.parallelize(...)  # LabeledPoint(feature, label)
+    #
+    # # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
+    # # the independence test. Returns an array containing the ChiSquaredTestResult for every feature
+    # # against the label.
+    # featureTestResults = Statistics.chiSqTest(obs)
+    #
+    # for i, result in enumerate(featureTestResults):
+    #     print("Column %d:" % (i + 1))
+    #     print(result)

     # $example off$
diff --git a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
index c4ee776e32fe8..5541250dd76ee 100644
--- a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
+++ b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
@@ -33,15 +33,15 @@

     # @note: todo

-    parallelData = sc.parallelize([1.0, 2.0, ... ])
-
-    # run a KS test for the sample versus a standard normal distribution
-    testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1)
-    print(testResult)  # summary of the test including the p-value, test statistic,
-    # and null hypothesis
-    # if our p-value indicates significance, we can reject the null hypothesis
-    # Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with
-    # a lambda to calculate the CDF is not made available in the Python API
+    # parallelData = sc.parallelize([1.0, 2.0, ... ])
+    #
+    # # run a KS test for the sample versus a standard normal distribution
+    # testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1)
+    # print(testResult)  # summary of the test including the p-value, test statistic,
+    # # and null hypothesis
+    # # if our p-value indicates significance, we can reject the null hypothesis
+    # # Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with
+    # # a lambda to calculate the CDF is not made available in the Python API

     # $example off$
diff --git a/examples/src/main/python/mllib/kernel_density_estimation_example.py b/examples/src/main/python/mllib/kernel_density_estimation_example.py
index c71b96b30a771..383d9f37a9276 100644
--- a/examples/src/main/python/mllib/kernel_density_estimation_example.py
+++ b/examples/src/main/python/mllib/kernel_density_estimation_example.py
@@ -29,20 +29,20 @@
     sc = SparkContext(appName="KernelDensityEstimationExample")  # SparkContext
     sqlContext = SQLContext(sc)

-    # $example on$
-
-    # @note: todo
-
-    data = ...  # an RDD of sample data
-
-    # Construct the density estimator with the sample data and a standard deviation for the Gaussian
-    # kernels
-    kd = KernelDensity()
-    kd.setSample(data)
-    kd.setBandwidth(3.0)
-
-    # Find density estimates for the given values
-    densities = kd.estimate([-1.0, 2.0, 5.0])
-    # $example off$
+    # # $example on$
+    #
+    # # @note: todo
+    #
+    # data = ...  # an RDD of sample data
+    #
+    # # Construct the density estimator with the sample data and a standard deviation for the Gaussian
+    # # kernels
+    # kd = KernelDensity()
+    # kd.setSample(data)
+    # kd.setBandwidth(3.0)
+    #
+    # # Find density estimates for the given values
+    # densities = kd.estimate([-1.0, 2.0, 5.0])
+    # # $example off$

     sc.stop()
\ No newline at end of file
diff --git a/examples/src/main/python/mllib/random_data_generation_example.py b/examples/src/main/python/mllib/random_data_generation_example.py
index d42e33d464aba..db2a5e97e87d2 100644
--- a/examples/src/main/python/mllib/random_data_generation_example.py
+++ b/examples/src/main/python/mllib/random_data_generation_example.py
@@ -33,11 +33,11 @@

     # @note: todo

-    # Generate a random double RDD that contains 1 million i.i.d. values drawn from the
-    # standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
- u = RandomRDDs.normalRDD(sc, 1000000L, 10) - # Apply a transform to get a random double RDD following `N(1, 4)`. - v = u.map(lambda x: 1.0 + 2.0 * x) - # $example off$ + # # Generate a random double RDD that contains 1 million i.i.d. values drawn from the + # # standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. + # u = RandomRDDs.normalRDD(sc, 1000000L, 10) + # # Apply a transform to get a random double RDD following `N(1, 4)`. + # v = u.map(lambda x: 1.0 + 2.0 * x) + # # $example off$ sc.stop() \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala index be7f09c32a0e8..d5883c7d89604 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala @@ -19,9 +19,6 @@ package org.apache.spark.examples.mllib // $example on$ -import org.apache.spark.mllib.linalg._ -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.rdd.RDD // $example off$ import org.apache.spark.sql.SQLContext @@ -34,7 +31,7 @@ object HypothesisTestingExample { val conf = new SparkConf().setAppName("HypothesisTestingExample").setMaster("local[*]") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) - +/* // $example on$ // @note: todo val vec: Vector = ... // a vector composed of the frequencies of events @@ -64,7 +61,7 @@ object HypothesisTestingExample { } // summary of the test // $example off$ - +*/ sc.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala index 37528e44b7cc6..1948069954748 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala @@ -19,8 +19,6 @@ package org.apache.spark.examples.mllib // $example on$ -import org.apache.spark.mllib.stat.Statistics -import org.apache.spark.rdd.RDD // $example off$ import org.apache.spark.sql.SQLContext @@ -36,7 +34,7 @@ object HypothesisTestingKolmogorovSmirnovTestExample { // $example on$ // @note: todo - +/* val data: RDD[Double] = ... 
// an RDD of sample data // run a KS test for the sample versus a standard normal distribution @@ -50,7 +48,7 @@ object HypothesisTestingKolmogorovSmirnovTestExample { val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF) // $example off$ - +*/ sc.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala index 1326e187ba771..f061318b2fd1d 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala @@ -19,8 +19,6 @@ package org.apache.spark.examples.mllib // $example on$ -import org.apache.spark.mllib.stat.KernelDensity -import org.apache.spark.rdd.RDD // $example off$ import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} @@ -32,7 +30,7 @@ object KernelDensityEstimationExample { val conf = new SparkConf().setAppName("KernelDensityEstimationExample").setMaster("local[*]") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) - +/* // $example on$ // @note: todo @@ -48,7 +46,7 @@ object KernelDensityEstimationExample { // Find density estimates for the given values val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) // $example off$ - +*/ sc.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala index 5de6162ade9d0..fe33f88d4f144 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala @@ -19,7 +19,6 @@ package org.apache.spark.examples.mllib // $example on$ -import org.apache.spark.mllib.random.RandomRDDs._ // $example off$ import org.apache.spark.sql.SQLContext @@ -32,7 +31,7 @@ object RandomDataGenerationExample { val conf = new SparkConf().setAppName("RandomDataGenerationExample").setMaster("local[*]") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) - +/* // $example on$ // @note: todo @@ -44,7 +43,7 @@ object RandomDataGenerationExample { val v = u.map(x => 1.0 + 2.0 * x) // $example off$ - +*/ sc.stop() } } From 2abfaa93b9dee4a86d304a54f59143a5fa0ee401 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Wed, 3 Feb 2016 17:18:33 -0800 Subject: [PATCH 07/26] [SPARK-13019] remove code block tag --- docs/mllib-statistics.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md index bbbbd87a35610..a4b707453a2e3 100644 --- a/docs/mllib-statistics.md +++ b/docs/mllib-statistics.md @@ -182,7 +182,6 @@ Refer to the [`ChiSqTestResult` Java docs](api/java/org/apache/spark/mllib/stat/ {% include_example java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java %} -{% endhighlight %}
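A note on the mechanism these patches rely on: the `{% include_example %}` Jekyll tag added to docs/mllib-statistics.md renders only the code between the `// $example on$` and `// $example off$` markers of the referenced example file, so SparkContext setup and teardown stay out of the published page. A minimal Scala sketch of the convention (the object name here is illustrative, not part of any patch):

import org.apache.spark.{SparkConf, SparkContext}

object IncludeExampleSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("IncludeExampleSketch"))
    // $example on$
    // Only this region is spliced into the generated docs page.
    val data = sc.parallelize(Seq(1.0, 2.0, 3.0))
    println(data.sum())
    // $example off$
    sc.stop()
  }
}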
From 157da53ca1c6ceb13eb720e968d2e09fed44571d Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Wed, 3 Feb 2016 17:26:36 -0800 Subject: [PATCH 08/26] [SPARK-13019] make commented code explicit in html content --- .../examples/mllib/JavaHypothesisTestingExample.java | 4 ++-- ...JavaHypothesisTestingKolmogorovSmirnovTestExample.java | 4 ++-- .../mllib/JavaKernelDensityEstimationExample.java | 7 ++++--- .../examples/mllib/JavaRandomDataGenerationExample.java | 4 ++-- .../spark/examples/mllib/HypothesisTestingExample.scala | 7 ++++--- .../HypothesisTestingKolmogorovSmirnovTestExample.scala | 4 ++-- .../examples/mllib/KernelDensityEstimationExample.scala | 7 ++++--- .../examples/mllib/RandomDataGenerationExample.scala | 8 ++++---- 8 files changed, 24 insertions(+), 21 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java index 023480252d833..813c8ac936aca 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java @@ -71,9 +71,9 @@ public static void main(String[] args) { System.out.println(result); // summary of the test i++; } - - // $example off$ */ + // $example off$ + jsc.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java index c34e66541eb7a..90fa8830b1ae6 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java @@ -50,9 +50,9 @@ public static void main(String[] args) { // and null hypothesis // if our p-value indicates significance, we can reject the null hypothesis System.out.println(testResult); - - // $example off$ */ + // $example off$ + jsc.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java index 457ee0c5537c9..4229f59a64211 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java @@ -36,9 +36,9 @@ public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(jsc); -/* - // $example on$ + // $example on$ +/* // @note: todo RDD data = ... 
// an RDD of sample data @@ -51,8 +51,9 @@ public static void main(String[] args) { // Find density estimates for the given values double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0}); - // $example off$ */ + // $example off$ + jsc.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java index f27476ffa6179..bad68ed5ba507 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java @@ -52,9 +52,9 @@ public Double call(Double x) { return 1.0 + 2.0 * x; } }); - - // $example off$ */ + // $example off$ + jsc.stop(); } } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala index d5883c7d89604..85cf226d3b5ac 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala @@ -31,8 +31,9 @@ object HypothesisTestingExample { val conf = new SparkConf().setAppName("HypothesisTestingExample").setMaster("local[*]") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) -/* + // $example on$ +/* // @note: todo val vec: Vector = ... // a vector composed of the frequencies of events @@ -59,9 +60,9 @@ object HypothesisTestingExample { println(s"Column $i:\n$result") i += 1 } // summary of the test - - // $example off$ */ + // $example off$ + sc.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala index 1948069954748..3e47287f06685 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala @@ -46,9 +46,9 @@ object HypothesisTestingKolmogorovSmirnovTestExample { // perform a KS test using a cumulative distribution function of our making val myCDF: Double => Double = ... val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF) - - // $example off$ */ + // $example off$ + sc.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala index f061318b2fd1d..cc761fdff026a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala @@ -30,9 +30,9 @@ object KernelDensityEstimationExample { val conf = new SparkConf().setAppName("KernelDensityEstimationExample").setMaster("local[*]") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) -/* - // $example on$ + // $example on$ +/* // @note: todo val data: RDD[Double] = ... 
// an RDD of sample data @@ -45,8 +45,9 @@ object KernelDensityEstimationExample { // Find density estimates for the given values val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) - // $example off$ */ + // $example off$ + sc.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala index fe33f88d4f144..7420efa4c2992 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala @@ -31,9 +31,9 @@ object RandomDataGenerationExample { val conf = new SparkConf().setAppName("RandomDataGenerationExample").setMaster("local[*]") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) -/* - // $example on$ + // $example on$ +/* // @note: todo // Generate a random double RDD that contains 1 million i.i.d. values drawn from the @@ -41,9 +41,9 @@ object RandomDataGenerationExample { val u = normalRDD(sc, 1000000L, 10) // Apply a transform to get a random double RDD following `N(1, 4)`. val v = u.map(x => 1.0 + 2.0 * x) - - // $example off$ */ + // $example off$ + sc.stop() } } From 323304fe2aa3033a88429cf3a0e5adef345d2c24 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Thu, 4 Feb 2016 14:55:51 -0800 Subject: [PATCH 09/26] [SPARK-13019] Stratified Sampling working --- .../mllib/JavaStratifiedSamplingExample.java | 44 +++++++++++++++---- .../mllib/stratified_sampling_example.py | 13 ++++-- .../mllib/StratifiedSamplingExample.scala | 6 +-- 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java index 6d11e97690413..768332558469a 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java @@ -18,7 +18,8 @@ package org.apache.spark.examples.mllib; // $example on$ -import java.util.Map; +import java.util.*; + import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaPairRDD; @@ -26,27 +27,52 @@ // $example off$ import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.VoidFunction; import org.apache.spark.sql.SQLContext; import org.apache.spark.mllib.linalg.Vectors; -import java.util.Arrays; +import scala.Tuple2; public class JavaStratifiedSamplingExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample"); + SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample").setMaster("local[*]"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(jsc); // $example on$ -// JavaPairRDD data = ... // an RDD of any key value pairs -// Map fractions = ... 
// specify the exact fraction desired from each key -// -// // Get an exact sample from each stratum -// JavaPairRDD approxSample = data.sampleByKey(false, fractions); -// JavaPairRDD exactSample = data.sampleByKeyExact(false, fractions); + List<Tuple2<Integer, Character>> list = new ArrayList<>(); + list.add(new Tuple2<>(1, 'a')); + list.add(new Tuple2<>(1, 'b')); + list.add(new Tuple2<>(2, 'c')); + list.add(new Tuple2<>(2, 'd')); + list.add(new Tuple2<>(2, 'e')); + list.add(new Tuple2<>(3, 'f')); + + JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(list); // an RDD of any key value pairs JavaPairRDD<Integer, Character> + Map<Integer, Double> fractions = new HashMap<>(); // specify the exact fraction desired from each key Map<Integer, Double> + fractions.put(1, 0.1); + fractions.put(2, 0.6); + fractions.put(3, 0.3); + + // Get an exact sample from each stratum + JavaPairRDD<Integer, Character> approxSample = data.sampleByKey(false, fractions); // JavaPairRDD<Integer, Character> + JavaPairRDD<Integer, Character> exactSample = data.sampleByKeyExact(false, fractions); // JavaPairRDD<Integer, Character> + // $example off$ + approxSample.foreach(new VoidFunction<Tuple2<Integer, Character>>() { + public void call(Tuple2<Integer, Character> t) throws Exception { + System.out.println(t._1() + " " + t._2()); + } + }); + + exactSample.foreach(new VoidFunction<Tuple2<Integer, Character>>() { + public void call(Tuple2<Integer, Character> t) throws Exception { + System.out.println(t._1() + " " + t._2()); + } + }); + jsc.stop(); } } diff --git a/examples/src/main/python/mllib/stratified_sampling_example.py b/examples/src/main/python/mllib/stratified_sampling_example.py index 0f6ede7335a85..d44309d0f4ae5 100644 --- a/examples/src/main/python/mllib/stratified_sampling_example.py +++ b/examples/src/main/python/mllib/stratified_sampling_example.py @@ -30,10 +30,15 @@ sqlContext = SQLContext(sc) # $example on$ - # data = ... # an RDD of any key value pairs - # fractions = ... # specify the exact fraction desired from each key as a dictionary - # - # approxSample = data.sampleByKey(False, fractions); + + data = sc.parallelize([(1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')]) # an RDD of any key value pairs + fractions = {1:0.1, 2:0.6, 3:0.3} # specify the exact fraction desired from each key as a dictionary + + approxSample = data.sampleByKey(False, fractions); # $example off$ + for each in approxSample.collect(): + print(each) + sc.stop() \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala index c01047e784357..0502e01235fdb 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala @@ -35,14 +35,14 @@ object StratifiedSamplingExample { // $example on$ // @note: I don't know how to use class "import org.apache.spark.rdd.PairRDDFunctions" val data = sc.parallelize(Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f'))) // an RDD[(K, V)] of any key value pairs - val fractions = Map(1 -> 1.0, 2 -> 2.0, 3 -> 3.0)// specify the exact fraction desired from each key + val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3)// specify the exact fraction desired from each key // Get an exact sample from each stratum val approxSample = data.sampleByKey(withReplacement = false, fractions) val exactSample = data.sampleByKeyExact(withReplacement = false, fractions) - println(approxSample.toString) - println(exactSample.toString) + approxSample.foreach(println) + exactSample.foreach(println) // $example off$ sc.stop() From 3692d30bfec83e53883a017339d08cfaf3223266 Mon Sep 17 00:00:00
2001 From: Xin Ren Date: Thu, 4 Feb 2016 17:14:01 -0800 Subject: [PATCH 10/26] [SPARK-13019] hypothesis testing working --- .../mllib/JavaHypothesisTestingExample.java | 16 +++--- .../mllib/hypothesis_testing_example.py | 53 ++++++++++--------- .../mllib/HypothesisTestingExample.scala | 23 +++++--- .../mllib/StratifiedSamplingExample.scala | 5 +- 4 files changed, 56 insertions(+), 41 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java index 813c8ac936aca..dbdfa6ec8df51 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java @@ -35,15 +35,13 @@ public class JavaHypothesisTestingExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample"); + SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample").setMaster("local[*]"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(jsc); // $example on$ - // @note: todo -/* - Vector vec = ... // a vector composed of the frequencies of events + Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25); // a vector composed of the frequencies of events // compute the goodness of fit. If a second vector to test against is not supplied as a parameter, // the test runs against a uniform distribution. @@ -52,14 +50,18 @@ public static void main(String[] args) { // and the null hypothesis. System.out.println(goodnessOfFitTestResult); - Matrix mat = ... // a contingency matrix + // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) + Matrix mat = Matrices.dense(3, 2, new double[] {1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); // a contingency matrix // conduct Pearson's independence test on the input contingency matrix ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat); // summary of the test including the p-value, degrees of freedom... System.out.println(independenceTestResult); - JavaRDD obs = ... // an RDD of labeled points + LabeledPoint p1 = new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)); + LabeledPoint p2 = new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)); + LabeledPoint p3 = new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5)); + JavaRDD obs = jsc.parallelize(Arrays.asList(p1, p2, p3)); // an RDD of labeled points // The contingency table is constructed from the raw (feature, label) pairs and used to conduct // the independence test. 
Returns an array containing the ChiSquaredTestResult for every feature @@ -71,7 +73,7 @@ public static void main(String[] args) { System.out.println(result); // summary of the test i++; } -*/ + // $example off$ jsc.stop(); diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py index cee1ab6168475..3d7fe646489dc 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_example.py @@ -24,7 +24,7 @@ # $example on$ from pyspark import SparkContext from pyspark.mllib.linalg import Vectors, Matrices -from pyspark.mllib.regresssion import LabeledPoint +from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.stat import Statistics # $example off$ @@ -34,32 +34,33 @@ # $example on$ - # @note: todo + vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) # a vector composed of the frequencies of events - # vec = Vectors.dense(...) # a vector composed of the frequencies of events - # - # # compute the goodness of fit. If a second vector to test against is not supplied as a parameter, - # # the test runs against a uniform distribution. - # goodnessOfFitTestResult = Statistics.chiSqTest(vec) - # print(goodnessOfFitTestResult) # summary of the test including the p-value, degrees of freedom, - # # test statistic, the method used, and the null hypothesis. - # - # mat = Matrices.dense(...) # a contingency matrix - # - # # conduct Pearson's independence test on the input contingency matrix - # independenceTestResult = Statistics.chiSqTest(mat) - # print(independenceTestResult) # summary of the test including the p-value, degrees of freedom... - # - # obs = sc.parallelize(...) # LabeledPoint(feature, label) . - # - # # The contingency table is constructed from an RDD of LabeledPoint and used to conduct - # # the independence test. Returns an array containing the ChiSquaredTestResult for every feature - # # against the label. - # featureTestResults = Statistics.chiSqTest(obs) - # - # for i, result in enumerate(featureTestResults): - # print("Column $d:" % (i + 1)) - # print(result) + # compute the goodness of fit. If a second vector to test against is not supplied as a parameter, + # the test runs against a uniform distribution. + goodnessOfFitTestResult = Statistics.chiSqTest(vec) + print(goodnessOfFitTestResult) # summary of the test including the p-value, degrees of freedom, + # test statistic, the method used, and the null hypothesis. + + mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0]) # a contingency matrix + + # conduct Pearson's independence test on the input contingency matrix + independenceTestResult = Statistics.chiSqTest(mat) + print(independenceTestResult) # summary of the test including the p-value, degrees of freedom... + + p1 = LabeledPoint(1.0, [1.0, 0.0, 3.0]) + p2 = LabeledPoint(1.0, [1.0, 2.0, 0.0]) + p3 = LabeledPoint(1.0, [-1.0, 0.0, -0.5]) + obs = sc.parallelize([p1, p2, p3]) # LabeledPoint(feature, label) . + + # The contingency table is constructed from an RDD of LabeledPoint and used to conduct + # the independence test. Returns an array containing the ChiSquaredTestResult for every feature + # against the label. 
+ featureTestResults = Statistics.chiSqTest(obs) + + for i, result in enumerate(featureTestResults): + print("Column: " + str(i + 1)) + print(result) # $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala index 85cf226d3b5ac..9b414af60e705 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala @@ -19,6 +19,11 @@ package org.apache.spark.examples.mllib // $example on$ +import org.apache.spark.mllib.linalg._ +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.mllib.stat.test.ChiSqTestResult +import org.apache.spark.rdd.RDD // $example off$ import org.apache.spark.sql.SQLContext @@ -33,9 +38,8 @@ object HypothesisTestingExample { val sqlContext = new SQLContext(sc) // $example on$ -/* - // @note: todo - val vec: Vector = ... // a vector composed of the frequencies of events + + val vec: Vector = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) // a vector composed of the frequencies of events // compute the goodness of fit. If a second vector to test against is not supplied as a parameter, // the test runs against a uniform distribution. @@ -43,13 +47,20 @@ object HypothesisTestingExample { println(goodnessOfFitTestResult) // summary of the test including the p-value, degrees of freedom, // test statistic, the method used, and the null hypothesis. - val mat: Matrix = ... // a contingency matrix + + + // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) + val mat: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0)) // a contingency matrix // conduct Pearson's independence test on the input contingency matrix val independenceTestResult = Statistics.chiSqTest(mat) println(independenceTestResult) // summary of the test including the p-value, degrees of freedom... - val obs: RDD[LabeledPoint] = ... // (feature, label) pairs. + + val p1 = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)) + val p2 = LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)) + val p3 = LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5)) + val obs: RDD[LabeledPoint] = sc.parallelize(Seq(p1, p2, p3)) // (feature, label) pairs. // The contingency table is constructed from the raw (feature, label) pairs and used to conduct // the independence test. 
Returns an array containing the ChiSquaredTestResult for every feature @@ -60,7 +71,7 @@ object HypothesisTestingExample { println(s"Column $i:\n$result") i += 1 } // summary of the test -*/ + // $example off$ sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala index 0502e01235fdb..9144b2f0c5813 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala @@ -41,10 +41,11 @@ object StratifiedSamplingExample { val approxSample = data.sampleByKey(withReplacement = false, fractions) val exactSample = data.sampleByKeyExact(withReplacement = false, fractions) - approxSample.foreach(println) - exactSample.foreach(println) // $example off$ + approxSample.foreach(println) + exactSample.foreach(println) + sc.stop() } } From 89c3d2ed7d0134d27ecd5c2077e45c12e1702552 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Thu, 4 Feb 2016 17:33:25 -0800 Subject: [PATCH 11/26] [SPARK-13019] Hypothesis Testing Kolmogorov Smirnov Test Example is working --- ...isTestingKolmogorovSmirnovTestExample.java | 8 +++----- ...testing_kolmogorov_smirnov_test_example.py | 20 +++++++++---------- ...sTestingKolmogorovSmirnovTestExample.scala | 12 ++++++----- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java index 90fa8830b1ae6..d78e246d0799f 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java @@ -36,21 +36,19 @@ public class JavaHypothesisTestingKolmogorovSmirnovTestExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample"); + SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample").setMaster("local[*]"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(jsc); // $example on$ -/* - // @note: todo - JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.2, 1.0, ...)); + JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25)); KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); // summary of the test including the p-value, test statistic, // and null hypothesis // if our p-value indicates significance, we can reject the null hypothesis System.out.println(testResult); -*/ + // $example off$ jsc.stop(); diff --git a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py index 5541250dd76ee..5189992ade9b5 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py @@ -31,17 +31,15 @@ # $example on$ - # @note: todo - - # parallelData = sc.parallelize([1.0, 2.0, ... 
]) - # - # # run a KS test for the sample versus a standard normal distribution - # testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1) - # print(testResult) # summary of the test including the p-value, test statistic, - # # and null hypothesis - # # if our p-value indicates significance, we can reject the null hypothesis - # # Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with - # # a lambda to calculate the CDF is not made available in the Python API + parallelData = sc.parallelize([0.1, 0.15, 0.2, 0.3, 0.25]) + + # run a KS test for the sample versus a standard normal distribution + testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1) + print(testResult) # summary of the test including the p-value, test statistic, + # and null hypothesis + # if our p-value indicates significance, we can reject the null hypothesis + # Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with + # a lambda to calculate the CDF is not made available in the Python API # $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala index 3e47287f06685..29cd68ee9fc3a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala @@ -19,6 +19,8 @@ package org.apache.spark.examples.mllib // $example on$ +import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.rdd.RDD // $example off$ import org.apache.spark.sql.SQLContext @@ -33,9 +35,8 @@ object HypothesisTestingKolmogorovSmirnovTestExample { val sqlContext = new SQLContext(sc) // $example on$ - // @note: todo -/* - val data: RDD[Double] = ... // an RDD of sample data + + val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25)) // an RDD of sample data // run a KS test for the sample versus a standard normal distribution val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1) @@ -44,9 +45,10 @@ object HypothesisTestingKolmogorovSmirnovTestExample { // if our p-value indicates significance, we can reject the null hypothesis // perform a KS test using a cumulative distribution function of our making - val myCDF: Double => Double = ... 
+ val myCDF: Double => Double = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1) val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF) -*/ + println(testResult2) + // $example off$ sc.stop() From 4dbbc6d32ed3c045e5b947df77f11195adb1255f Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Thu, 4 Feb 2016 17:37:43 -0800 Subject: [PATCH 12/26] [SPARK-13019] remove empty lines --- .../spark/examples/mllib/JavaHypothesisTestingExample.java | 1 - .../JavaHypothesisTestingKolmogorovSmirnovTestExample.java | 5 ----- examples/src/main/python/mllib/hypothesis_testing_example.py | 1 - .../hypothesis_testing_kolmogorov_smirnov_test_example.py | 1 - .../spark/examples/mllib/HypothesisTestingExample.scala | 5 ----- .../HypothesisTestingKolmogorovSmirnovTestExample.scala | 2 -- 6 files changed, 15 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java index dbdfa6ec8df51..de6330667915f 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java @@ -40,7 +40,6 @@ public static void main(String[] args) { SQLContext sqlContext = new SQLContext(jsc); // $example on$ - Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25); // a vector composed of the frequencies of events // compute the goodness of fit. If a second vector to test against is not supplied as a parameter, diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java index d78e246d0799f..0c40e2bd2f9b6 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java @@ -18,11 +18,8 @@ package org.apache.spark.examples.mllib; // $example on$ -import java.util.Arrays; - import org.apache.spark.api.java.JavaDoubleRDD; import org.apache.spark.api.java.JavaSparkContext; - import org.apache.spark.mllib.stat.Statistics; import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult; // $example off$ @@ -32,7 +29,6 @@ import org.apache.spark.mllib.linalg.Vectors; import java.util.Arrays; - public class JavaHypothesisTestingKolmogorovSmirnovTestExample { public static void main(String[] args) { @@ -41,7 +37,6 @@ public static void main(String[] args) { SQLContext sqlContext = new SQLContext(jsc); // $example on$ - JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25)); KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); // summary of the test including the p-value, test statistic, diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py index 3d7fe646489dc..91ed400210cdc 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_example.py @@ -33,7 +33,6 @@ sqlContext = SQLContext(sc) # $example on$ - vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) # a vector composed of the frequencies of events # compute the goodness of fit. 
If a second vector to test against is not supplied as a parameter, diff --git a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py index 5189992ade9b5..91499d4f1fdc6 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py @@ -30,7 +30,6 @@ sqlContext = SQLContext(sc) # $example on$ - parallelData = sc.parallelize([0.1, 0.15, 0.2, 0.3, 0.25]) # run a KS test for the sample versus a standard normal distribution diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala index 9b414af60e705..1b75535adba80 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala @@ -24,7 +24,6 @@ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics import org.apache.spark.mllib.stat.test.ChiSqTestResult import org.apache.spark.rdd.RDD - // $example off$ import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} @@ -38,7 +37,6 @@ object HypothesisTestingExample { val sqlContext = new SQLContext(sc) // $example on$ - val vec: Vector = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) // a vector composed of the frequencies of events // compute the goodness of fit. If a second vector to test against is not supplied as a parameter, @@ -47,8 +45,6 @@ object HypothesisTestingExample { println(goodnessOfFitTestResult) // summary of the test including the p-value, degrees of freedom, // test statistic, the method used, and the null hypothesis. - - // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) val mat: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0)) // a contingency matrix @@ -56,7 +52,6 @@ object HypothesisTestingExample { val independenceTestResult = Statistics.chiSqTest(mat) println(independenceTestResult) // summary of the test including the p-value, degrees of freedom... 
- val p1 = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)) val p2 = LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)) val p3 = LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5)) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala index 29cd68ee9fc3a..656c684ff1d21 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala @@ -21,7 +21,6 @@ package org.apache.spark.examples.mllib // $example on$ import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD - // $example off$ import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} @@ -35,7 +34,6 @@ object HypothesisTestingKolmogorovSmirnovTestExample { val sqlContext = new SQLContext(sc) // $example on$ - val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25)) // an RDD of sample data // run a KS test for the sample versus a standard normal distribution From f024fc3c5a6021f2d320c72f9ded78adc3347fbb Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Thu, 4 Feb 2016 18:03:26 -0800 Subject: [PATCH 13/26] [SPARK-13019] random data generation example working --- .../JavaRandomDataGenerationExample.java | 26 ++++++++++++++----- .../mllib/random_data_generation_example.py | 17 +++++++----- .../mllib/RandomDataGenerationExample.scala | 11 ++++---- 3 files changed, 34 insertions(+), 20 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java index bad68ed5ba507..46bd1889bb803 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java @@ -26,35 +26,47 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.VoidFunction; import org.apache.spark.sql.SQLContext; import org.apache.spark.mllib.linalg.Vectors; import java.util.Arrays; + public class JavaRandomDataGenerationExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaRandomDataGenerationExample"); + SparkConf conf = new SparkConf().setAppName("JavaRandomDataGenerationExample").setMaster("local[*]"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(jsc); // $example on$ -/* - // @note: todo - // Generate a random double RDD that contains 1 million i.i.d. values drawn from the // standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. - JavaDoubleRDD u = normalJavaRDD(jsc, 1000000L, 10); + JavaDoubleRDD u = normalJavaRDD(jsc, 1000L, 10); // Apply a transform to get a random double RDD following `N(1, 4)`. 
- JavaDoubleRDD v = u.map( + JavaRDD<Double> v = u.map( new Function<Double, Double>() { public Double call(Double x) { return 1.0 + 2.0 * x; } }); -*/ + // $example off$ + u.foreach(new VoidFunction<Double>() { + public void call(Double d) throws Exception { + System.out.println(d); + } + }); + + v.foreach(new VoidFunction<Double>() { + public void call(Double d) throws Exception { + System.out.println(d); + } + }); + jsc.stop(); } } diff --git a/examples/src/main/python/mllib/random_data_generation_example.py b/examples/src/main/python/mllib/random_data_generation_example.py index db2a5e97e87d2..7bec4bddeef81 100644 --- a/examples/src/main/python/mllib/random_data_generation_example.py +++ b/examples/src/main/python/mllib/random_data_generation_example.py @@ -30,14 +30,17 @@ sqlContext = SQLContext(sc) # $example on$ + # Generate a random double RDD that contains 1 million i.i.d. values drawn from the + # standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. + u = RandomRDDs.normalRDD(sc, 1000L, 10) + # Apply a transform to get a random double RDD following `N(1, 4)`. + v = u.map(lambda x: 1.0 + 2.0 * x) + # $example off$ - # @note: todo + for each in u.collect(): + print(each) - # # Generate a random double RDD that contains 1 million i.i.d. values drawn from the - # # standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. - # u = RandomRDDs.normalRDD(sc, 1000000L, 10) - # # Apply a transform to get a random double RDD following `N(1, 4)`. - # v = u.map(lambda x: 1.0 + 2.0 * x) - # # $example off$ + for each in v.collect(): + print(each) sc.stop() \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala index 7420efa4c2992..baa36bd7b7a1f 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala @@ -19,7 +19,7 @@ package org.apache.spark.examples.mllib // $example on$ - +import org.apache.spark.mllib.random.RandomRDDs._ // $example off$ import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} @@ -33,16 +33,15 @@ object RandomDataGenerationExample { val sqlContext = new SQLContext(sc) // $example on$ -/* - // @note: todo - // Generate a random double RDD that contains 1 million i.i.d. values drawn from the // standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. - val u = normalRDD(sc, 1000000L, 10) + val u = normalRDD(sc, 1000L, 10) // Apply a transform to get a random double RDD following `N(1, 4)`.
val v = u.map(x => 1.0 + 2.0 * x) -*/ + // $example off$ + u.foreach(print) + v.foreach(print) sc.stop() } From 6f949cde4f18829b2a1c7f946373fc008f4a5bb1 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Sun, 7 Feb 2016 11:19:47 +0800 Subject: [PATCH 14/26] [SPARK-13019] Kernel Density Estimation Example is working --- .../JavaKernelDensityEstimationExample.java | 12 ++++---- .../kernel_density_estimation_example.py | 29 +++++++++---------- .../KernelDensityEstimationExample.scala | 11 +++---- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java index 4229f59a64211..c62410dc7d770 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java @@ -18,6 +18,7 @@ package org.apache.spark.examples.mllib; // $example on$ +import org.apache.spark.api.java.JavaDoubleRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.stat.KernelDensity; @@ -33,15 +34,12 @@ public class JavaKernelDensityEstimationExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample"); + SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample").setMaster("local[*]"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(jsc); // $example on$ -/* - // @note: todo - - RDD data = ... // an RDD of sample data + JavaRDD data = jsc.parallelize(Arrays.asList(1.0,1.0,1.0,2.0,3.0,4.0,5.0,5.0,6.0,7.0,8.0,9.0,9.0)); // an RDD of sample data // Construct the density estimator with the sample data and a standard deviation for the Gaussian // kernels @@ -51,9 +49,11 @@ public static void main(String[] args) { // Find density estimates for the given values double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0}); -*/ + // $example off$ + System.out.println(Arrays.toString(densities)); + jsc.stop(); } } diff --git a/examples/src/main/python/mllib/kernel_density_estimation_example.py b/examples/src/main/python/mllib/kernel_density_estimation_example.py index 383d9f37a9276..a3055783db23d 100644 --- a/examples/src/main/python/mllib/kernel_density_estimation_example.py +++ b/examples/src/main/python/mllib/kernel_density_estimation_example.py @@ -29,20 +29,19 @@ sc = SparkContext(appName="KernelDensityEstimationExample") # SparkContext sqlContext = SQLContext(sc) - # # $example on$ - # - # # @note: todo - # - # data = ... 
# an RDD of sample data - # - # # Construct the density estimator with the sample data and a standard deviation for the Gaussian - # # kernels - # kd = KernelDensity() - # kd.setSample(data) - # kd.setBandwidth(3.0) - # - # # Find density estimates for the given values - # densities = kd.estimate([-1.0, 2.0, 5.0]) - # # $example off$ + # $example on$ + data = sc.parallelize([1.0,1.0,1.0,2.0,3.0,4.0,5.0,5.0,6.0,7.0,8.0,9.0,9.0]) # an RDD of sample data + + # Construct the density estimator with the sample data and a standard deviation for the Gaussian + # kernels + kd = KernelDensity() + kd.setSample(data) + kd.setBandwidth(3.0) + + # Find density estimates for the given values + densities = kd.estimate([-1.0, 2.0, 5.0]) + # $example off$ + + print(densities) sc.stop() \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala index cc761fdff026a..636457e3fa0f2 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala @@ -19,6 +19,8 @@ package org.apache.spark.examples.mllib // $example on$ +import org.apache.spark.mllib.stat.KernelDensity +import org.apache.spark.rdd.RDD // $example off$ import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} @@ -32,10 +34,7 @@ object KernelDensityEstimationExample { val sqlContext = new SQLContext(sc) // $example on$ -/* - // @note: todo - - val data: RDD[Double] = ... // an RDD of sample data + val data: RDD[Double] = sc.parallelize(Seq(1,1,1,2,3,4,5,5,6,7,8,9,9)) // an RDD of sample data // Construct the density estimator with the sample data and a standard deviation for the Gaussian // kernels @@ -45,9 +44,11 @@ object KernelDensityEstimationExample { // Find density estimates for the given values val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) -*/ + // $example off$ + densities.foreach(print) + sc.stop() } } From a4dd0fb5781dbdc162fd8652001b8de09b225297 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Sun, 7 Feb 2016 15:16:33 +0800 Subject: [PATCH 15/26] [SPARK-13019] code style check --- docs/mllib-statistics.md | 1 - .../mllib/JavaCorrelationsExample.java | 13 ++++++++---- ...isTestingKolmogorovSmirnovTestExample.java | 3 ++- .../JavaKernelDensityEstimationExample.java | 3 ++- .../mllib/JavaStratifiedSamplingExample.java | 8 +++++-- .../main/python/mllib/correlations_example.py | 7 ++++--- .../examples/mllib/CorrelationsExample.scala | 16 ++++++++------ .../mllib/HypothesisTestingExample.scala | 21 +++++++++---------- ...sTestingKolmogorovSmirnovTestExample.scala | 9 ++++---- .../KernelDensityEstimationExample.scala | 11 +++++----- .../mllib/RandomDataGenerationExample.scala | 4 +--- .../mllib/StratifiedSamplingExample.scala | 11 ++++++---- .../mllib/SummaryStatisticsExample.scala | 4 +--- 13 files changed, 61 insertions(+), 50 deletions(-) diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md index a4b707453a2e3..b06829f0247dd 100644 --- a/docs/mllib-statistics.md +++ b/docs/mllib-statistics.md @@ -181,7 +181,6 @@ hypothesis tests. Refer to the [`ChiSqTestResult` Java docs](api/java/org/apache/spark/mllib/stat/test/ChiSqTestResult.html) for details on the API. {% include_example java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java %} -
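For reference, the kernel density example that the preceding patches make runnable computes a Gaussian kernel density estimate: the density at a point x is the average, over the sample points x_i, of a normal density centered at x_i with standard deviation equal to the bandwidth. A plain-Scala sketch of that computation on a local sample (illustrative only; Spark's KernelDensity distributes the same sum over an RDD):

import scala.math.{exp, sqrt, Pi}

// fHat(x) = (1/n) * sum_i N(x; mean = x_i, sd = bandwidth)
def gaussianKde(sample: Seq[Double], bandwidth: Double)(x: Double): Double = {
  val norm = 1.0 / (bandwidth * sqrt(2.0 * Pi))
  sample.map { xi =>
    val u = (x - xi) / bandwidth
    norm * exp(-0.5 * u * u)
  }.sum / sample.size
}

// e.g. gaussianKde(Seq(1.0, 1.0, 2.0, 3.0), 3.0)(2.0) should be close to what
// kd.estimate(Array(2.0)) returns for the same sample and bandwidth.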
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java index 6fb1ee6365a27..e12481fab10c0 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java @@ -35,12 +35,15 @@ public class JavaCorrelationsExample { public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample").setMaster("local[*]"); + JavaSparkContext jsc = new JavaSparkContext(conf); // $example on$ - JavaSparkContext jsc = new JavaSparkContext(conf); + JavaDoubleRDD seriesX = jsc.parallelizeDoubles( + Arrays.asList(new Double[]{1.0, 2.0, 3.0, 3.0, 5.0})); // a series - JavaDoubleRDD seriesX = jsc.parallelizeDoubles(Arrays.asList(new Double[]{1.0, 2.0, 3.0, 3.0, 5.0})); // a series - JavaDoubleRDD seriesY = jsc.parallelizeDoubles(Arrays.asList(new Double[]{11.0, 22.0, 33.0, 33.0, 555.0})); // must have the same number of partitions and cardinality as seriesX + // must have the same number of partitions and cardinality as seriesX + JavaDoubleRDD seriesY = jsc.parallelizeDoubles( + Arrays.asList(new Double[]{11.0, 22.0, 33.0, 33.0, 555.0})); // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a // method is not specified, Pearson's method will be used by default. @@ -50,7 +53,9 @@ public static void main(String[] args) { Vector v1 = Vectors.dense(1.0, 10.0, 100.0); Vector v2 = Vectors.dense(2.0, 20.0, 200.0); Vector v3 = Vectors.dense(5.0, 33.0, 366.0); - JavaRDD<Vector> data = jsc.parallelize(Arrays.asList(v1, v2, v3)); // note that each Vector is a row and not a column + + // note that each Vector is a row and not a column + JavaRDD<Vector> data = jsc.parallelize(Arrays.asList(v1, v2, v3)); // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. // If a method is not specified, Pearson's method will be used by default.
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java index 0c40e2bd2f9b6..875c2c8777c3b 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java @@ -32,7 +32,8 @@ public class JavaHypothesisTestingKolmogorovSmirnovTestExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample").setMaster("local[*]"); + SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample") + .setMaster("local[*]"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(jsc); diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java index c62410dc7d770..72781eedfe635 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java @@ -39,7 +39,8 @@ public static void main(String[] args) { SQLContext sqlContext = new SQLContext(jsc); // $example on$ - JavaRDD<Double> data = jsc.parallelize(Arrays.asList(1.0,1.0,1.0,2.0,3.0,4.0,5.0,5.0,6.0,7.0,8.0,9.0,9.0)); // an RDD of sample data + JavaRDD<Double> data = jsc.parallelize( + Arrays.asList(1.0,1.0,1.0,2.0,3.0,4.0,5.0,5.0,6.0,7.0,8.0,9.0,9.0)); // an RDD of sample data // Construct the density estimator with the sample data and a standard deviation for the Gaussian // kernels diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java index 768332558469a..7371e274dcbb1 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java @@ -49,8 +49,12 @@ public static void main(String[] args) { list.add(new Tuple2<>(2, 'e')); list.add(new Tuple2<>(3, 'f')); - JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(list); // an RDD of any key value pairs JavaPairRDD<Integer, Character> - Map<Integer, Double> fractions = new HashMap<>(); // specify the exact fraction desired from each key Map<Integer, Double> + // an RDD of any key value pairs JavaPairRDD<Integer, Character> + JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(list); + + // specify the exact fraction desired from each key Map<Integer, Double> + Map<Integer, Double> fractions = new HashMap<>(); + fractions.put(1, 0.1); fractions.put(2, 0.6); fractions.put(3, 0.3); diff --git a/examples/src/main/python/mllib/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py index 35e089c6a0e7d..f20c8b6c03413 100644 --- a/examples/src/main/python/mllib/correlations_example.py +++ b/examples/src/main/python/mllib/correlations_example.py @@ -29,8 +29,9 @@ # $example on$ sc = SparkContext(appName="CorrelationsExample") # SparkContext - seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0]) # a series - seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0]) # must have the same number of partitions and cardinality as seriesX + seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0]) # a series +
seriesY = sc.parallelize( + [11.0, 22.0, 33.0, 33.0, 555.0]) # must have the same number of partitions and cardinality as seriesX # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a # method is not specified, Pearson's method will be used by default. @@ -39,7 +40,7 @@ v1 = np.array([1.0, 10.0, 100.0]) v2 = np.array([2.0, 20.0, 200.0]) v3 = np.array([5.0, 33.0, 366.0]) - data = sc.parallelize([v1, v2, v3]) # an RDD of Vectors + data = sc.parallelize([v1, v2, v3]) # an RDD of Vectors # calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. # If a method is not specified, Pearson's method will be used by default. diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala index aeb5f7f802e00..9b3c0321f067d 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala @@ -18,13 +18,12 @@ // scalastyle:off println package org.apache.spark.examples.mllib +import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.stat.Statistics // $example off$ import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SQLContext -import org.apache.spark.{SparkConf, SparkContext} object CorrelationsExample { @@ -32,20 +31,25 @@ object CorrelationsExample { val conf = new SparkConf().setAppName("CorrelationsExample").setMaster("local[*]") val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) // $example on$ val seriesX: RDD[Double] = sc.parallelize(Array(1, 2, 3, 3, 5)) // a series - val seriesY: RDD[Double] = sc.parallelize(Array(11, 22, 33, 33, 555)) // must have the same number of partitions and cardinality as seriesX + val seriesY: RDD[Double] = sc.parallelize(Array(11, 22, 33, 33, 555)) + // must have the same number of partitions and cardinality as seriesX // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a // method is not specified, Pearson's method will be used by default. val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson") println(correlation) - val data: RDD[Vector] = sc.parallelize(Seq(Vectors.dense(1.0, 10.0, 100.0), Vectors.dense(2.0, 20.0, 200.0), Vectors.dense(5.0, 33.0, 366.0))) // note that each Vector is a row and not a column + val data: RDD[Vector] = sc.parallelize( + Seq( + Vectors.dense(1.0, 10.0, 100.0), + Vectors.dense(2.0, 20.0, 200.0), + Vectors.dense(5.0, 33.0, 366.0)) + ) // note that each Vector is a row and not a column - // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. + // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method // If a method is not specified, Pearson's method will be used by default. 
val correlMatrix: Matrix = Statistics.corr(data, "pearson") println(correlMatrix.toString) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala index 1b75535adba80..fe3c280cf0082 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala @@ -18,6 +18,7 @@ // scalastyle:off println package org.apache.spark.examples.mllib +import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.regression.LabeledPoint @@ -25,8 +26,6 @@ import org.apache.spark.mllib.stat.Statistics import org.apache.spark.mllib.stat.test.ChiSqTestResult import org.apache.spark.rdd.RDD // $example off$ -import org.apache.spark.sql.SQLContext -import org.apache.spark.{SparkConf, SparkContext} object HypothesisTestingExample { @@ -34,23 +33,23 @@ object HypothesisTestingExample { val conf = new SparkConf().setAppName("HypothesisTestingExample").setMaster("local[*]") val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) // $example on$ - val vec: Vector = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) // a vector composed of the frequencies of events + // a vector composed of the frequencies of events + val vec: Vector = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) - // compute the goodness of fit. If a second vector to test against is not supplied as a parameter, - // the test runs against a uniform distribution. + // compute the goodness of fit. If a second vector to test against is not supplied + // as a parameter, the test runs against a uniform distribution. val goodnessOfFitTestResult = Statistics.chiSqTest(vec) - println(goodnessOfFitTestResult) // summary of the test including the p-value, degrees of freedom, - // test statistic, the method used, and the null hypothesis. + println(goodnessOfFitTestResult) // summary of the test including the p-value, + // degrees of freedom, test statistic, the method used, and the null hypothesis. - // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) - val mat: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0)) // a contingency matrix + // a contingency matrix. Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) + val mat: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0)) // conduct Pearson's independence test on the input contingency matrix val independenceTestResult = Statistics.chiSqTest(mat) - println(independenceTestResult) // summary of the test including the p-value, degrees of freedom... 
+ println(independenceTestResult) // summary of the test including the p-value, degrees of freedom val p1 = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)) val p2 = LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala index 656c684ff1d21..7ed96766fcd11 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala @@ -18,20 +18,19 @@ // scalastyle:off println package org.apache.spark.examples.mllib +import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD // $example off$ -import org.apache.spark.sql.SQLContext -import org.apache.spark.{SparkConf, SparkContext} object HypothesisTestingKolmogorovSmirnovTestExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample").setMaster("local[*]") + val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample") + .setMaster("local[*]") val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) // $example on$ val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25)) // an RDD of sample data @@ -43,7 +42,7 @@ object HypothesisTestingKolmogorovSmirnovTestExample { // if our p-value indicates significance, we can reject the null hypothesis // perform a KS test using a cumulative distribution function of our making - val myCDF: Double => Double = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1) + val myCDF = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1) val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF) println(testResult2) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala index 636457e3fa0f2..31b5a5e1ad05c 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala @@ -18,12 +18,11 @@ // scalastyle:off println package org.apache.spark.examples.mllib +import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.stat.KernelDensity import org.apache.spark.rdd.RDD // $example off$ -import org.apache.spark.sql.SQLContext -import org.apache.spark.{SparkConf, SparkContext} object KernelDensityEstimationExample { @@ -31,13 +30,13 @@ object KernelDensityEstimationExample { val conf = new SparkConf().setAppName("KernelDensityEstimationExample").setMaster("local[*]") val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) // $example on$ - val data: RDD[Double] = sc.parallelize(Seq(1,1,1,2,3,4,5,5,6,7,8,9,9)) // an RDD of sample data + // an RDD of sample data + val data: RDD[Double] = sc.parallelize(Seq(1, 1, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9)) - // Construct the density estimator with the sample data and a standard deviation for the Gaussian - // kernels + // Construct the density estimator with the sample data and a standard deviation + // for the Gaussian kernels val kd = new 
KernelDensity() .setSample(data) .setBandwidth(3.0) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala index baa36bd7b7a1f..91019a2ac9de5 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala @@ -18,11 +18,10 @@ // scalastyle:off println package org.apache.spark.examples.mllib +import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.random.RandomRDDs._ // $example off$ -import org.apache.spark.sql.SQLContext -import org.apache.spark.{SparkConf, SparkContext} object RandomDataGenerationExample { @@ -30,7 +29,6 @@ object RandomDataGenerationExample { val conf = new SparkConf().setAppName("RandomDataGenerationExample").setMaster("local[*]") val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) // $example on$ // Generate a random double RDD that contains 1 million i.i.d. values drawn from the diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala index 9144b2f0c5813..24a5407426894 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala @@ -33,9 +33,12 @@ object StratifiedSamplingExample { val sqlContext = new SQLContext(sc) // $example on$ - // @note: I don't know how to use class "import org.apache.spark.rdd.PairRDDFunctions" - val data = sc.parallelize(Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f'))) // an RDD[(K, V)] of any key value pairs - val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3)// specify the exact fraction desired from each key + // an RDD[(K, V)] of any key value pairs + val data = sc.parallelize( + Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f'))) + + // specify the exact fraction desired from each key + val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3) // Get an exact sample from each stratum val approxSample = data.sampleByKey(withReplacement = false, fractions) @@ -45,7 +48,7 @@ object StratifiedSamplingExample { approxSample.foreach(println) exactSample.foreach(println) - + sc.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala index 8876dbfcdb863..c2fe7976b4609 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala @@ -18,12 +18,11 @@ // scalastyle:off println package org.apache.spark.examples.mllib +import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} // $example off$ -import org.apache.spark.sql.SQLContext -import org.apache.spark.{SparkConf, SparkContext} object SummaryStatisticsExample { @@ -31,7 +30,6 @@ object SummaryStatisticsExample { val conf = new SparkConf().setAppName("SummaryStatisticsExample").setMaster("local[*]") val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) // $example on$ 
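[editor note] For the StratifiedSamplingExample changes above, a hedged, runnable sketch contrasting the two sampling paths (driver names are illustrative, data is the example's toy RDD): `sampleByKey` makes a single pass and only approximates each stratum's size, while `sampleByKeyExact` may take additional passes but returns exactly ceil(fraction_k * n_k) items for each key k.

{% highlight scala %}
import org.apache.spark.{SparkConf, SparkContext}

// Illustrative driver, not part of the patch.
object StratifiedSamplingSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("StratifiedSamplingSketch"))

    // an RDD[(K, V)] of key-value pairs
    val data = sc.parallelize(
      Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')))
    // the exact fraction desired from each key
    val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3)

    // one pass, per-stratum sample sizes only approximate
    val approxSample = data.sampleByKey(withReplacement = false, fractions)
    // potentially more passes, per-stratum sample sizes exact
    val exactSample = data.sampleByKeyExact(withReplacement = false, fractions)

    approxSample.collect().foreach(println)
    exactSample.collect().foreach(println)

    sc.stop()
  }
}
{% endhighlight %}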
val v1 = Vectors.dense(1.0, 10.0, 100.0) From 3a11802513a0d9a1c9f1dc5de8c37c09f5e97062 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Fri, 12 Feb 2016 08:29:37 +0800 Subject: [PATCH 16/26] [SPARK-13019] fix python style --- .../main/python/mllib/correlations_example.py | 8 ++++---- .../python/mllib/hypothesis_testing_example.py | 16 ++++++++-------- ...is_testing_kolmogorov_smirnov_test_example.py | 6 +++--- .../mllib/kernel_density_estimation_example.py | 7 ++++--- .../mllib/random_data_generation_example.py | 4 ++-- .../python/mllib/stratified_sampling_example.py | 12 +++++++----- .../python/mllib/summary_statistics_example.py | 12 ++++++------ 7 files changed, 34 insertions(+), 31 deletions(-) diff --git a/examples/src/main/python/mllib/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py index f20c8b6c03413..e9ccca0dd5593 100644 --- a/examples/src/main/python/mllib/correlations_example.py +++ b/examples/src/main/python/mllib/correlations_example.py @@ -27,11 +27,11 @@ if __name__ == "__main__": # $example on$ - sc = SparkContext(appName="CorrelationsExample") # SparkContext + sc = SparkContext(appName="CorrelationsExample") # SparkContext seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0]) # a series - seriesY = sc.parallelize( - [11.0, 22.0, 33.0, 33.0, 555.0]) # must have the same number of partitions and cardinality as seriesX + # seriesY must have the same number of partitions and cardinality as seriesX + seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0]) # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a # method is not specified, Pearson's method will be used by default. @@ -48,4 +48,4 @@ # $example off$ - sc.stop() \ No newline at end of file + sc.stop() diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py index 91ed400210cdc..4d37e394af3b7 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_example.py @@ -29,23 +29,23 @@ # $example off$ if __name__ == "__main__": - sc = SparkContext(appName="HypothesisTestingExample") # SparkContext + sc = SparkContext(appName="HypothesisTestingExample") # SparkContext sqlContext = SQLContext(sc) # $example on$ - vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) # a vector composed of the frequencies of events + vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) # a vector composed of the frequencies of events - # compute the goodness of fit. If a second vector to test against is not supplied as a parameter, - # the test runs against a uniform distribution. + # compute the goodness of fit. If a second vector to test against + # is not supplied as a parameter, the test runs against a uniform distribution. goodnessOfFitTestResult = Statistics.chiSqTest(vec) - print(goodnessOfFitTestResult) # summary of the test including the p-value, degrees of freedom, + print(goodnessOfFitTestResult) # summary of the test including the p-value, degrees of freedom, # test statistic, the method used, and the null hypothesis. - mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0]) # a contingency matrix + mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0]) # a contingency matrix # conduct Pearson's independence test on the input contingency matrix independenceTestResult = Statistics.chiSqTest(mat) - print(independenceTestResult) # summary of the test including the p-value, degrees of freedom... 
+ print(independenceTestResult) # summary of the test including the p-value, degrees of freedom p1 = LabeledPoint(1.0, [1.0, 0.0, 3.0]) p2 = LabeledPoint(1.0, [1.0, 2.0, 0.0]) @@ -63,4 +63,4 @@ # $example off$ - sc.stop() \ No newline at end of file + sc.stop() diff --git a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py index 91499d4f1fdc6..3e3c6ba0b96a8 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py @@ -26,7 +26,7 @@ # $example off$ if __name__ == "__main__": - sc = SparkContext(appName="HypothesisTestingKolmogorovSmirnovTestExample") # SparkContext + sc = SparkContext(appName="HypothesisTestingKolmogorovSmirnovTestExample") # SparkContext sqlContext = SQLContext(sc) # $example on$ @@ -34,7 +34,7 @@ # run a KS test for the sample versus a standard normal distribution testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1) - print(testResult) # summary of the test including the p-value, test statistic, + print(testResult) # summary of the test including the p-value, test statistic, # and null hypothesis # if our p-value indicates significance, we can reject the null hypothesis # Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with @@ -42,4 +42,4 @@ # $example off$ - sc.stop() \ No newline at end of file + sc.stop() diff --git a/examples/src/main/python/mllib/kernel_density_estimation_example.py b/examples/src/main/python/mllib/kernel_density_estimation_example.py index a3055783db23d..746027e6d599f 100644 --- a/examples/src/main/python/mllib/kernel_density_estimation_example.py +++ b/examples/src/main/python/mllib/kernel_density_estimation_example.py @@ -26,11 +26,12 @@ # $example off$ if __name__ == "__main__": - sc = SparkContext(appName="KernelDensityEstimationExample") # SparkContext + sc = SparkContext(appName="KernelDensityEstimationExample") # SparkContext sqlContext = SQLContext(sc) # $example on$ - data = sc.parallelize([1.0,1.0,1.0,2.0,3.0,4.0,5.0,5.0,6.0,7.0,8.0,9.0,9.0]) # an RDD of sample data + # an RDD of sample data + data = sc.parallelize([1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0]) # Construct the density estimator with the sample data and a standard deviation for the Gaussian # kernels @@ -44,4 +45,4 @@ print(densities) - sc.stop() \ No newline at end of file + sc.stop() diff --git a/examples/src/main/python/mllib/random_data_generation_example.py b/examples/src/main/python/mllib/random_data_generation_example.py index 7bec4bddeef81..7eb2ea81a8038 100644 --- a/examples/src/main/python/mllib/random_data_generation_example.py +++ b/examples/src/main/python/mllib/random_data_generation_example.py @@ -26,7 +26,7 @@ # $example off$ if __name__ == "__main__": - sc = SparkContext(appName="RandomDataGenerationExample") # SparkContext + sc = SparkContext(appName="RandomDataGenerationExample") # SparkContext sqlContext = SQLContext(sc) # $example on$ @@ -43,4 +43,4 @@ for each in v.collect(): print(each) - sc.stop() \ No newline at end of file + sc.stop() diff --git a/examples/src/main/python/mllib/stratified_sampling_example.py b/examples/src/main/python/mllib/stratified_sampling_example.py index d44309d0f4ae5..63bf2dddede01 100644 --- a/examples/src/main/python/mllib/stratified_sampling_example.py +++ 
b/examples/src/main/python/mllib/stratified_sampling_example.py @@ -26,19 +26,21 @@ # $example off$ if __name__ == "__main__": - sc = SparkContext(appName="StratifiedSamplingExample") # SparkContext + sc = SparkContext(appName="StratifiedSamplingExample") # SparkContext sqlContext = SQLContext(sc) # $example on$ + # an RDD of any key value pairs + data = sc.parallelize([(1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')]) - data = sc.parallelize([(1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')]) # an RDD of any key value pairs - fractions = {1:0.1, 2:0.6, 3:0.3} # specify the exact fraction desired from each key as a dictionary + # specify the exact fraction desired from each key as a dictionary + fractions = {1: 0.1, 2: 0.6, 3: 0.3} - approxSample = data.sampleByKey(False, fractions); + approxSample = data.sampleByKey(False, fractions) # $example off$ for each in approxSample.collect(): print(each) - sc.stop() \ No newline at end of file + sc.stop() diff --git a/examples/src/main/python/mllib/summary_statistics_example.py b/examples/src/main/python/mllib/summary_statistics_example.py index fef018127451d..2048bc432aa77 100644 --- a/examples/src/main/python/mllib/summary_statistics_example.py +++ b/examples/src/main/python/mllib/summary_statistics_example.py @@ -26,20 +26,20 @@ # $example off$ if __name__ == "__main__": - sc = SparkContext(appName="SummaryStatisticsExample") # SparkContext + sc = SparkContext(appName="SummaryStatisticsExample") # SparkContext sqlContext = SQLContext(sc) # $example on$ v1 = np.array([1.0, 2.0, 3.0]) v2 = np.array([10.0, 20.0, 30.0]) v3 = np.array([100.0, 200.0, 300.0]) - mat = sc.parallelize([v1, v2, v3]) # an RDD of Vectors + mat = sc.parallelize([v1, v2, v3]) # an RDD of Vectors # Compute column summary statistics. 
summary = Statistics.colStats(mat) - print(summary.mean()) # a dense vector containing the mean value for each column - print(summary.variance()) # column-wise variance - print(summary.numNonzeros()) # number of nonzeros in each column + print(summary.mean()) # a dense vector containing the mean value for each column + print(summary.variance()) # column-wise variance + print(summary.numNonzeros()) # number of nonzeros in each column # $example off$ - sc.stop() \ No newline at end of file + sc.stop() From 0df3e65b7ea64165ec4e9301ddc8e91c1abcd082 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Tue, 16 Feb 2016 16:05:24 +0800 Subject: [PATCH 17/26] [SPARK-13019] remove setMaster, change java to 2-indent --- .../mllib/JavaHypothesisTestingExample.java | 72 +++++++++---------- ...isTestingKolmogorovSmirnovTestExample.java | 37 +++++----- .../JavaKernelDensityEstimationExample.java | 40 +++++------ .../JavaRandomDataGenerationExample.java | 62 +++++++--------- .../mllib/HypothesisTestingExample.scala | 3 +- ...sTestingKolmogorovSmirnovTestExample.scala | 2 - .../KernelDensityEstimationExample.scala | 3 +- .../mllib/RandomDataGenerationExample.scala | 3 +- 8 files changed, 97 insertions(+), 125 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java index de6330667915f..f1532ddee7bf5 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java @@ -25,56 +25,52 @@ import org.apache.spark.mllib.stat.Statistics; import org.apache.spark.mllib.stat.test.ChiSqTestResult; // $example off$ - import org.apache.spark.SparkConf; -import org.apache.spark.sql.SQLContext; import org.apache.spark.mllib.linalg.Vectors; import java.util.Arrays; public class JavaHypothesisTestingExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample").setMaster("local[*]"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + public static void main(String[] args) { - // $example on$ - Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25); // a vector composed of the frequencies of events + SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); - // compute the goodness of fit. If a second vector to test against is not supplied as a parameter, - // the test runs against a uniform distribution. - ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec); - // summary of the test including the p-value, degrees of freedom, test statistic, the method used, - // and the null hypothesis. - System.out.println(goodnessOfFitTestResult); + // $example on$ + Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25); // a vector composed of the frequencies of events - // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) - Matrix mat = Matrices.dense(3, 2, new double[] {1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); // a contingency matrix + // compute the goodness of fit. If a second vector to test against is not supplied as a parameter, + // the test runs against a uniform distribution. 
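[editor note] As the comment above says, `chiSqTest(vec)` with no second argument runs a goodness-of-fit test against the uniform distribution. A short sketch of both forms follows; it is local-only (no SparkContext needed), and the expected vector is an illustrative value, not taken from the patch.

{% highlight scala %}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics

// observed event frequencies
val observed = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)

// no expected vector supplied: the test runs against a uniform distribution
val vsUniform = Statistics.chiSqTest(observed)
println(vsUniform.pValue)

// expected vector supplied: the test runs against it instead
val expected = Vectors.dense(0.3, 0.25, 0.2, 0.15, 0.1)
val vsExpected = Statistics.chiSqTest(observed, expected)
println(vsExpected.pValue)
{% endhighlight %}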
+ ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec); + // summary of the test including the p-value, degrees of freedom, test statistic, the method used, + // and the null hypothesis. + System.out.println(goodnessOfFitTestResult); - // conduct Pearson's independence test on the input contingency matrix - ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat); - // summary of the test including the p-value, degrees of freedom... - System.out.println(independenceTestResult); + // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) + Matrix mat = Matrices.dense(3, 2, new double[] {1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); // a contingency matrix - LabeledPoint p1 = new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)); - LabeledPoint p2 = new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)); - LabeledPoint p3 = new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5)); - JavaRDD obs = jsc.parallelize(Arrays.asList(p1, p2, p3)); // an RDD of labeled points + // conduct Pearson's independence test on the input contingency matrix + ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat); + // summary of the test including the p-value, degrees of freedom... + System.out.println(independenceTestResult); - // The contingency table is constructed from the raw (feature, label) pairs and used to conduct - // the independence test. Returns an array containing the ChiSquaredTestResult for every feature - // against the label. - ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd()); - int i = 1; - for (ChiSqTestResult result : featureTestResults) { - System.out.println("Column " + i + ":"); - System.out.println(result); // summary of the test - i++; - } + LabeledPoint p1 = new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)); + LabeledPoint p2 = new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)); + LabeledPoint p3 = new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5)); + JavaRDD obs = jsc.parallelize(Arrays.asList(p1, p2, p3)); // an RDD of labeled points - // $example off$ - - jsc.stop(); + // The contingency table is constructed from the raw (feature, label) pairs and used to conduct + // the independence test. Returns an array containing the ChiSquaredTestResult for every feature + // against the label. 
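[editor note] The contingency-table comment above describes the per-feature form: `Statistics.chiSqTest` on an `RDD[LabeledPoint]` returns one `ChiSqTestResult` per feature column, each tested against the label. A minimal sketch under that reading (object and app names are assumptions for illustration):

{% highlight scala %}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics

// Illustrative driver, not part of the patch.
object FeatureChiSqSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("FeatureChiSqSketch"))

    // an RDD of labeled points; features are treated as categorical
    val obs = sc.parallelize(Seq(
      LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
      LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
      LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))))

    // one ChiSqTestResult per feature, each against the label
    val featureTestResults = Statistics.chiSqTest(obs)
    featureTestResults.zipWithIndex.foreach { case (result, i) =>
      println(s"Column ${i + 1}: p-value = ${result.pValue}")
    }

    sc.stop()
  }
}
{% endhighlight %}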
+ ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd()); + int i = 1; + for (ChiSqTestResult result : featureTestResults) { + System.out.println("Column " + i + ":"); + System.out.println(result); // summary of the test + i++; } + // $example off$ + + jsc.stop(); + } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java index 875c2c8777c3b..2e1e9af224257 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java @@ -25,29 +25,24 @@ // $example off$ import org.apache.spark.SparkConf; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.mllib.linalg.Vectors; import java.util.Arrays; public class JavaHypothesisTestingKolmogorovSmirnovTestExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample") - .setMaster("local[*]"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); - - // $example on$ - JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25)); - KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); - // summary of the test including the p-value, test statistic, - // and null hypothesis - // if our p-value indicates significance, we can reject the null hypothesis - System.out.println(testResult); - - // $example off$ - - jsc.stop(); - } + public static void main(String[] args) { + + SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); + + // $example on$ + JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25)); + KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); + // summary of the test including the p-value, test statistic, + // and null hypothesis + // if our p-value indicates significance, we can reject the null hypothesis + System.out.println(testResult); + // $example off$ + + jsc.stop(); + } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java index 72781eedfe635..f637d78574fc4 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java @@ -24,38 +24,32 @@ import org.apache.spark.mllib.stat.KernelDensity; import org.apache.spark.rdd.RDD; // $example off$ - import org.apache.spark.SparkConf; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.mllib.linalg.Vectors; import java.util.Arrays; - public class JavaKernelDensityEstimationExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample").setMaster("local[*]"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + public static void main(String[] args) { - // $example on$ - JavaRDD data = jsc.parallelize( - 
Arrays.asList(1.0,1.0,1.0,2.0,3.0,4.0,5.0,5.0,6.0,7.0,8.0,9.0,9.0)); // an RDD of sample data + SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); - // Construct the density estimator with the sample data and a standard deviation for the Gaussian - // kernels - KernelDensity kd = new KernelDensity() - .setSample(data) - .setBandwidth(3.0); + // $example on$ + JavaRDD data = jsc.parallelize( + Arrays.asList(1.0,1.0,1.0,2.0,3.0,4.0,5.0,5.0,6.0,7.0,8.0,9.0,9.0)); // an RDD of sample data - // Find density estimates for the given values - double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0}); + // Construct the density estimator with the sample data and a standard deviation for the Gaussian + // kernels + KernelDensity kd = new KernelDensity() + .setSample(data) + .setBandwidth(3.0); - // $example off$ + // Find density estimates for the given values + double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0}); + // $example off$ - System.out.println(Arrays.toString(densities)); + System.out.println(Arrays.toString(densities)); - jsc.stop(); - } + jsc.stop(); + } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java index 46bd1889bb803..0b95cc6868512 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java @@ -18,7 +18,6 @@ package org.apache.spark.examples.mllib; // $example on$ -import org.apache.spark.SparkContext; import org.apache.spark.api.java.JavaDoubleRDD; import org.apache.spark.api.java.JavaRDD; import static org.apache.spark.mllib.random.RandomRDDs.*; @@ -28,46 +27,39 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.VoidFunction; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.mllib.linalg.Vectors; -import java.util.Arrays; - - public class JavaRandomDataGenerationExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaRandomDataGenerationExample").setMaster("local[*]"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + public static void main(String[] args) { - // $example on$ - // Generate a random double RDD that contains 1 million i.i.d. values drawn from the - // standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. - JavaDoubleRDD u = normalJavaRDD(jsc, 1000L, 10); - // Apply a transform to get a random double RDD following `N(1, 4)`. - JavaRDD v = u.map( - new Function() { - public Double call(Double x) { - return 1.0 + 2.0 * x; - } - }); + SparkConf conf = new SparkConf().setAppName("JavaRandomDataGenerationExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); - // $example off$ + // $example on$ + // Generate a random double RDD that contains 1 million i.i.d. values drawn from the + // standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. + JavaDoubleRDD u = normalJavaRDD(jsc, 1000L, 10); + // Apply a transform to get a random double RDD following `N(1, 4)`. 
+ JavaRDD v = u.map( + new Function() { + public Double call(Double x) { + return 1.0 + 2.0 * x; + } + }); + // $example off$ - u.foreach(new VoidFunction() { - public void call(Double d) throws Exception { - System.out.println(d); - } - }); + u.foreach(new VoidFunction() { + public void call(Double d) throws Exception { + System.out.println(d); + } + }); - v.foreach(new VoidFunction() { - public void call(Double d) throws Exception { - System.out.println(d); - } - }); + v.foreach(new VoidFunction() { + public void call(Double d) throws Exception { + System.out.println(d); + } + }); - jsc.stop(); - } + jsc.stop(); + } } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala index fe3c280cf0082..c09e99011e579 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala @@ -31,7 +31,7 @@ object HypothesisTestingExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("HypothesisTestingExample").setMaster("local[*]") + val conf = new SparkConf().setAppName("HypothesisTestingExample") val sc = new SparkContext(conf) // $example on$ @@ -65,7 +65,6 @@ object HypothesisTestingExample { println(s"Column $i:\n$result") i += 1 } // summary of the test - // $example off$ sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala index 7ed96766fcd11..78660f45a43a0 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala @@ -29,7 +29,6 @@ object HypothesisTestingKolmogorovSmirnovTestExample { def main(args: Array[String]) { val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample") - .setMaster("local[*]") val sc = new SparkContext(conf) // $example on$ @@ -45,7 +44,6 @@ object HypothesisTestingKolmogorovSmirnovTestExample { val myCDF = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1) val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF) println(testResult2) - // $example off$ sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala index 31b5a5e1ad05c..402276ff086d8 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala @@ -28,7 +28,7 @@ object KernelDensityEstimationExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("KernelDensityEstimationExample").setMaster("local[*]") + val conf = new SparkConf().setAppName("KernelDensityEstimationExample") val sc = new SparkContext(conf) // $example on$ @@ -43,7 +43,6 @@ object KernelDensityEstimationExample { // Find density estimates for the given values val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) - // $example off$ densities.foreach(print) diff --git 
a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala index 91019a2ac9de5..4ba8badbaa867 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala @@ -27,7 +27,7 @@ object RandomDataGenerationExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("RandomDataGenerationExample").setMaster("local[*]") + val conf = new SparkConf().setAppName("RandomDataGenerationExample") val sc = new SparkContext(conf) // $example on$ @@ -36,7 +36,6 @@ object RandomDataGenerationExample { val u = normalRDD(sc, 1000L, 10) // Apply a transform to get a random double RDD following `N(1, 4)`. val v = u.map(x => 1.0 + 2.0 * x) - // $example off$ u.foreach(print) v.foreach(print) From d817d0bbbea4913688bd6c3c66cecf95b9dbe198 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Sat, 20 Feb 2016 11:03:23 -0800 Subject: [PATCH 18/26] [SPARK-13019] more java style fix --- .../mllib/JavaCorrelationsExample.java | 59 ++++++----- .../mllib/JavaHypothesisTestingExample.java | 17 ++-- ...isTestingKolmogorovSmirnovTestExample.java | 3 +- .../JavaKernelDensityEstimationExample.java | 11 +-- .../JavaRandomDataGenerationExample.java | 32 +++--- .../mllib/JavaStratifiedSamplingExample.java | 99 +++++++++---------- .../mllib/JavaSummaryStatisticsExample.java | 41 ++++---- 7 files changed, 126 insertions(+), 136 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java index e12481fab10c0..f54da71d35040 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java @@ -17,53 +17,52 @@ package org.apache.spark.examples.mllib; +import org.apache.spark.SparkConf; // $example on$ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaDoubleRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.linalg.*; +import org.apache.spark.mllib.linalg.Matrix; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.stat.Statistics; // $example off$ - -import org.apache.spark.SparkConf; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.mllib.linalg.Vectors; import java.util.Arrays; public class JavaCorrelationsExample { - public static void main(String[] args) { + public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample").setMaster("local[*]"); - JavaSparkContext jsc = new JavaSparkContext(conf); + SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); - // $example on$ - JavaDoubleRDD seriesX = jsc.parallelizeDoubles( - Arrays.asList(new Double[]{1.0, 2.0, 3.0, 3.0, 5.0})); // a series + // $example on$ + JavaDoubleRDD seriesX = jsc.parallelizeDoubles( + Arrays.asList(new Double[]{1.0, 2.0, 3.0, 3.0, 5.0})); // a series - // must have the same number of partitions and cardinality as seriesX - JavaDoubleRDD seriesY = jsc.parallelizeDoubles( - Arrays.asList(new Double[]{11.0, 22.0, 33.0, 33.0, 555.0})); + // must have the same number of partitions 
and cardinality as seriesX + JavaDoubleRDD seriesY = jsc.parallelizeDoubles( + Arrays.asList(new Double[]{11.0, 22.0, 33.0, 33.0, 555.0})); - // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a - // method is not specified, Pearson's method will be used by default. - Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson"); - System.out.println("correlation is: " + correlation); + // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a + // method is not specified, Pearson's method will be used by default. + Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson"); + System.out.println("correlation is: " + correlation); - Vector v1 = Vectors.dense(1.0, 10.0, 100.0); - Vector v2 = Vectors.dense(2.0, 20.0, 200.0); - Vector v3 = Vectors.dense(5.0, 33.0, 366.0); + Vector v1 = Vectors.dense(1.0, 10.0, 100.0); + Vector v2 = Vectors.dense(2.0, 20.0, 200.0); + Vector v3 = Vectors.dense(5.0, 33.0, 366.0); - // note that each Vector is a row and not a column - JavaRDD data = jsc.parallelize(Arrays.asList(v1, v2, v3)); + // note that each Vector is a row and not a column + JavaRDD data = jsc.parallelize(Arrays.asList(v1, v2, v3)); - // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. - // If a method is not specified, Pearson's method will be used by default. - Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson"); - System.out.println(correlMatrix.toString()); - // $example off$ + // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. + // If a method is not specified, Pearson's method will be used by default. + Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson"); + System.out.println(correlMatrix.toString()); + // $example off$ - jsc.stop(); - } + jsc.stop(); + } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java index f1532ddee7bf5..6c7ef401483f9 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java @@ -17,19 +17,20 @@ package org.apache.spark.examples.mllib; +import org.apache.spark.SparkConf; // $example on$ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.linalg.*; +import org.apache.spark.mllib.linalg.Matrices; +import org.apache.spark.mllib.linalg.Matrix; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.stat.Statistics; import org.apache.spark.mllib.stat.test.ChiSqTestResult; // $example off$ -import org.apache.spark.SparkConf; -import org.apache.spark.mllib.linalg.Vectors; import java.util.Arrays; - public class JavaHypothesisTestingExample { public static void main(String[] args) { @@ -47,7 +48,7 @@ public static void main(String[] args) { System.out.println(goodnessOfFitTestResult); // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) - Matrix mat = Matrices.dense(3, 2, new double[] {1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); // a contingency matrix + Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); // a contingency matrix // conduct Pearson's 
independence test on the input contingency matrix ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat); @@ -65,9 +66,9 @@ public static void main(String[] args) { ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd()); int i = 1; for (ChiSqTestResult result : featureTestResults) { - System.out.println("Column " + i + ":"); - System.out.println(result); // summary of the test - i++; + System.out.println("Column " + i + ":"); + System.out.println(result); // summary of the test + i++; } // $example off$ diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java index 2e1e9af224257..238785019c814 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java @@ -17,14 +17,13 @@ package org.apache.spark.examples.mllib; +import org.apache.spark.SparkConf; // $example on$ import org.apache.spark.api.java.JavaDoubleRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.stat.Statistics; import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult; // $example off$ - -import org.apache.spark.SparkConf; import java.util.Arrays; public class JavaHypothesisTestingKolmogorovSmirnovTestExample { diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java index f637d78574fc4..ec3241af2c7ea 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java @@ -17,14 +17,13 @@ package org.apache.spark.examples.mllib; +import org.apache.spark.SparkConf; // $example on$ -import org.apache.spark.api.java.JavaDoubleRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.stat.KernelDensity; import org.apache.spark.rdd.RDD; // $example off$ -import org.apache.spark.SparkConf; import java.util.Arrays; public class JavaKernelDensityEstimationExample { @@ -35,16 +34,16 @@ public static void main(String[] args) { // $example on$ JavaRDD data = jsc.parallelize( - Arrays.asList(1.0,1.0,1.0,2.0,3.0,4.0,5.0,5.0,6.0,7.0,8.0,9.0,9.0)); // an RDD of sample data + Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0)); // an RDD of sample data // Construct the density estimator with the sample data and a standard deviation for the Gaussian // kernels KernelDensity kd = new KernelDensity() - .setSample(data) - .setBandwidth(3.0); + .setSample(data) + .setBandwidth(3.0); // Find density estimates for the given values - double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0}); + double[] densities = kd.estimate(new double[]{-1.0, 2.0, 5.0}); // $example off$ System.out.println(Arrays.toString(densities)); diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java index 0b95cc6868512..341b47acf8543 100644 --- 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java @@ -17,17 +17,17 @@ package org.apache.spark.examples.mllib; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.VoidFunction; // $example on$ import org.apache.spark.api.java.JavaDoubleRDD; import org.apache.spark.api.java.JavaRDD; + import static org.apache.spark.mllib.random.RandomRDDs.*; // $example off$ -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.api.java.function.VoidFunction; - public class JavaRandomDataGenerationExample { public static void main(String[] args) { @@ -40,23 +40,23 @@ public static void main(String[] args) { JavaDoubleRDD u = normalJavaRDD(jsc, 1000L, 10); // Apply a transform to get a random double RDD following `N(1, 4)`. JavaRDD v = u.map( - new Function() { - public Double call(Double x) { - return 1.0 + 2.0 * x; - } - }); + new Function() { + public Double call(Double x) { + return 1.0 + 2.0 * x; + } + }); // $example off$ u.foreach(new VoidFunction() { - public void call(Double d) throws Exception { - System.out.println(d); - } + public void call(Double d) throws Exception { + System.out.println(d); + } }); v.foreach(new VoidFunction() { - public void call(Double d) throws Exception { - System.out.println(d); - } + public void call(Double d) throws Exception { + System.out.println(d); + } }); jsc.stop(); diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java index 7371e274dcbb1..41bc0aa92525a 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java @@ -17,66 +17,59 @@ package org.apache.spark.examples.mllib; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.VoidFunction; +import scala.Tuple2; // $example on$ import java.util.*; - import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaSparkContext; // $example off$ -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.VoidFunction; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.mllib.linalg.Vectors; -import scala.Tuple2; - - public class JavaStratifiedSamplingExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample").setMaster("local[*]"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); - - // $example on$ - List> list = new ArrayList<>(); - list.add(new Tuple2(1,'a')); - list.add(new Tuple2(1, 'b')); - list.add(new Tuple2(2,'c')); - list.add(new Tuple2(2,'d')); - list.add(new Tuple2(2, 'e')); - list.add(new Tuple2(3,'f')); - - // an RDD of any key value pairs JavaPairRDD - JavaPairRDD data = jsc.parallelizePairs(list); - - // specify the exact fraction desired from each key Map - Map fractions = new HashMap<>(); - - fractions.put(1, 0.1); - fractions.put(2, 0.6); - fractions.put(3, 
0.3); - - // Get an exact sample from each stratum - JavaPairRDD approxSample = data.sampleByKey(false, fractions); // JavaPairRDD - JavaPairRDD exactSample = data.sampleByKeyExact(false, fractions); // JavaPairRDD - - // $example off$ - - approxSample.foreach(new VoidFunction>() { - public void call(Tuple2 t) throws Exception { - System.out.println(t._1() + " " + t._2()); - } - }); - - exactSample.foreach(new VoidFunction>() { - public void call(Tuple2 t) throws Exception { - System.out.println(t._1() + " " + t._2()); - } - }); - - jsc.stop(); - } + public static void main(String[] args) { + + SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); + + // $example on$ + List> list = new ArrayList<>(); + list.add(new Tuple2(1, 'a')); + list.add(new Tuple2(1, 'b')); + list.add(new Tuple2(2, 'c')); + list.add(new Tuple2(2, 'd')); + list.add(new Tuple2(2, 'e')); + list.add(new Tuple2(3, 'f')); + + // an RDD of any key value pairs JavaPairRDD + JavaPairRDD data = jsc.parallelizePairs(list); + + // specify the exact fraction desired from each key Map + Map fractions = new HashMap<>(); + + fractions.put(1, 0.1); + fractions.put(2, 0.6); + fractions.put(3, 0.3); + + // Get an exact sample from each stratum + JavaPairRDD approxSample = data.sampleByKey(false, fractions); // JavaPairRDD + JavaPairRDD exactSample = data.sampleByKeyExact(false, fractions); // JavaPairRDD + // $example off$ + + approxSample.foreach(new VoidFunction>() { + public void call(Tuple2 t) throws Exception { + System.out.println(t._1() + " " + t._2()); + } + }); + + exactSample.foreach(new VoidFunction>() { + public void call(Tuple2 t) throws Exception { + System.out.println(t._1() + " " + t._2()); + } + }); + + jsc.stop(); + } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java index ed7f9637e7627..eab0a1d9f2844 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java @@ -17,6 +17,9 @@ package org.apache.spark.examples.mllib; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.mllib.linalg.Vectors; // $example on$ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -24,34 +27,30 @@ import org.apache.spark.mllib.stat.MultivariateStatisticalSummary; import org.apache.spark.mllib.stat.Statistics; // $example off$ - -import org.apache.spark.SparkConf; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.mllib.linalg.Vectors; import java.util.Arrays; public class JavaSummaryStatisticsExample { - public static void main(String[] args) { + public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaSummaryStatisticsExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkConf conf = new SparkConf().setAppName("JavaSummaryStatisticsExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); + SQLContext sqlContext = new SQLContext(jsc); - // $example on$ - Vector v1 = Vectors.dense(1.0, 10.0, 100.0); - Vector v2 = Vectors.dense(2.0, 20.0, 200.0); - Vector v3 = Vectors.dense(3.0, 30.0, 300.0); + // $example on$ + Vector v1 = Vectors.dense(1.0, 10.0, 100.0); + Vector v2 = 
Vectors.dense(2.0, 20.0, 200.0); + Vector v3 = Vectors.dense(3.0, 30.0, 300.0); - JavaRDD mat = jsc.parallelize(Arrays.asList(v1, v2, v3)); // an RDD of Vectors + JavaRDD mat = jsc.parallelize(Arrays.asList(v1, v2, v3)); // an RDD of Vectors - // Compute column summary statistics. - MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd()); - System.out.println(summary.mean()); // a dense vector containing the mean value for each column - System.out.println(summary.variance()); // column-wise variance - System.out.println(summary.numNonzeros()); // number of nonzeros in each column - // $example off$ + // Compute column summary statistics. + MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd()); + System.out.println(summary.mean()); // a dense vector containing the mean value for each column + System.out.println(summary.variance()); // column-wise variance + System.out.println(summary.numNonzeros()); // number of nonzeros in each column + // $example off$ - jsc.stop(); - } + jsc.stop(); + } } From f945222ad1cfd1f4756258a49b372aa7bd32d9fc Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Sun, 21 Feb 2016 14:09:58 -0800 Subject: [PATCH 19/26] [SPARK-13019] mainly re-organize java import --- .../examples/mllib/JavaCorrelationsExample.java | 8 ++++---- .../mllib/JavaHypothesisTestingExample.java | 3 ++- ...hesisTestingKolmogorovSmirnovTestExample.java | 6 +++--- .../JavaKernelDensityEstimationExample.java | 11 ++++------- .../mllib/JavaStratifiedSamplingExample.java | 16 +++++++++++----- .../mllib/JavaSummaryStatisticsExample.java | 10 ++++------ .../main/python/mllib/correlations_example.py | 9 +++------ .../python/mllib/hypothesis_testing_example.py | 4 ---- ...is_testing_kolmogorov_smirnov_test_example.py | 9 ++------- .../mllib/kernel_density_estimation_example.py | 4 ---- .../mllib/random_data_generation_example.py | 4 ---- .../python/mllib/stratified_sampling_example.py | 8 -------- .../python/mllib/summary_statistics_example.py | 3 --- .../examples/mllib/CorrelationsExample.scala | 2 +- .../mllib/RandomDataGenerationExample.scala | 1 + .../mllib/StratifiedSamplingExample.scala | 5 +---- .../mllib/SummaryStatisticsExample.scala | 2 +- 17 files changed, 37 insertions(+), 68 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java index f54da71d35040..c3e3a789e755b 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java @@ -17,6 +17,8 @@ package org.apache.spark.examples.mllib; +import java.util.Arrays; + import org.apache.spark.SparkConf; // $example on$ import org.apache.spark.api.java.JavaRDD; @@ -27,8 +29,6 @@ import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.stat.Statistics; // $example off$ -import java.util.Arrays; - public class JavaCorrelationsExample { public static void main(String[] args) { @@ -44,8 +44,8 @@ public static void main(String[] args) { JavaDoubleRDD seriesY = jsc.parallelizeDoubles( Arrays.asList(new Double[]{11.0, 22.0, 33.0, 33.0, 555.0})); - // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a - // method is not specified, Pearson's method will be used by default. + // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. 
+ // If a method is not specified, Pearson's method will be used by default. Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson"); System.out.println("correlation is: " + correlation); diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java index 6c7ef401483f9..8be28a11aff57 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java @@ -17,6 +17,8 @@ package org.apache.spark.examples.mllib; +import java.util.Arrays; + import org.apache.spark.SparkConf; // $example on$ import org.apache.spark.api.java.JavaRDD; @@ -29,7 +31,6 @@ import org.apache.spark.mllib.stat.Statistics; import org.apache.spark.mllib.stat.test.ChiSqTestResult; // $example off$ -import java.util.Arrays; public class JavaHypothesisTestingExample { public static void main(String[] args) { diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java index 238785019c814..9ae2907a38084 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java @@ -17,6 +17,8 @@ package org.apache.spark.examples.mllib; +import java.util.Arrays; + import org.apache.spark.SparkConf; // $example on$ import org.apache.spark.api.java.JavaDoubleRDD; @@ -24,7 +26,6 @@ import org.apache.spark.mllib.stat.Statistics; import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult; // $example off$ -import java.util.Arrays; public class JavaHypothesisTestingKolmogorovSmirnovTestExample { public static void main(String[] args) { @@ -35,8 +36,7 @@ public static void main(String[] args) { // $example on$ JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25)); KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); - // summary of the test including the p-value, test statistic, - // and null hypothesis + // summary of the test including the p-value, test statistic, and null hypothesis // if our p-value indicates significance, we can reject the null hypothesis System.out.println(testResult); // $example off$ diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java index ec3241af2c7ea..18ccd0a951c7b 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java @@ -17,14 +17,14 @@ package org.apache.spark.examples.mllib; +import java.util.Arrays; + import org.apache.spark.SparkConf; // $example on$ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.stat.KernelDensity; -import org.apache.spark.rdd.RDD; // $example off$ -import java.util.Arrays; public class JavaKernelDensityEstimationExample { public static void main(String[] args) { @@ -36,11 +36,8 @@ public static void 
main(String[] args) { JavaRDD data = jsc.parallelize( Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0)); // an RDD of sample data - // Construct the density estimator with the sample data and a standard deviation for the Gaussian - // kernels - KernelDensity kd = new KernelDensity() - .setSample(data) - .setBandwidth(3.0); + // Construct the density estimator with the sample data and a standard deviation for the Gaussian kernels + KernelDensity kd = new KernelDensity().setSample(data).setBandwidth(3.0); // Find density estimates for the given values double[] densities = kd.estimate(new double[]{-1.0, 2.0, 5.0}); diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java index 41bc0aa92525a..d80592182ed92 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java @@ -17,16 +17,22 @@ package org.apache.spark.examples.mllib; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.VoidFunction; +import java.util.ArrayList; +import java.util.List; +import java.util.HashMap; +// $example on$ +import java.util.Map; +// $example off$ + import scala.Tuple2; + +import org.apache.spark.api.java.function.VoidFunction; // $example on$ -import java.util.*; -import org.apache.spark.api.java.JavaRDD; + import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaSparkContext; // $example off$ +import org.apache.spark.SparkConf; public class JavaStratifiedSamplingExample { public static void main(String[] args) { diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java index eab0a1d9f2844..755e6e5a2982f 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java @@ -17,25 +17,23 @@ package org.apache.spark.examples.mllib; -import org.apache.spark.SparkConf; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.mllib.linalg.Vectors; +import java.util.Arrays; + // $example on$ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.stat.MultivariateStatisticalSummary; import org.apache.spark.mllib.stat.Statistics; // $example off$ -import java.util.Arrays; - +import org.apache.spark.SparkConf; public class JavaSummaryStatisticsExample { public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaSummaryStatisticsExample"); JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); // $example on$ Vector v1 = Vectors.dense(1.0, 10.0, 100.0); diff --git a/examples/src/main/python/mllib/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py index e9ccca0dd5593..46bd5ede53baf 100644 --- a/examples/src/main/python/mllib/correlations_example.py +++ b/examples/src/main/python/mllib/correlations_example.py @@ -18,23 +18,21 @@ from __future__ import print_function from pyspark import 
SparkContext -from pyspark.sql import SQLContext import numpy as np -from pyspark.mllib.linalg import Vectors # $example on$ from pyspark.mllib.stat import Statistics # $example off$ if __name__ == "__main__": - # $example on$ sc = SparkContext(appName="CorrelationsExample") # SparkContext + # $example on$ seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0]) # a series # seriesY must have the same number of partitions and cardinality as seriesX seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0]) - # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a - # method is not specified, Pearson's method will be used by default. + # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. + # If a method is not specified, Pearson's method will be used by default. print(Statistics.corr(seriesX, seriesY, method="pearson")) v1 = np.array([1.0, 10.0, 100.0]) @@ -45,7 +43,6 @@ # calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. # If a method is not specified, Pearson's method will be used by default. print(Statistics.corr(data, method="pearson")) - # $example off$ sc.stop() diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py index 4d37e394af3b7..f548d6566ba2a 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_example.py @@ -18,8 +18,6 @@ from __future__ import print_function from pyspark import SparkContext -from pyspark.sql import SQLContext -import numpy as np from pyspark.mllib.linalg import Vectors # $example on$ from pyspark import SparkContext @@ -30,7 +28,6 @@ if __name__ == "__main__": sc = SparkContext(appName="HypothesisTestingExample") # SparkContext - sqlContext = SQLContext(sc) # $example on$ vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) # a vector composed of the frequencies of events @@ -60,7 +57,6 @@ for i, result in enumerate(featureTestResults): print("Column: " + str(i + 1)) print(result) - # $example off$ sc.stop() diff --git a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py index 3e3c6ba0b96a8..15d63ef86b2e7 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py @@ -18,28 +18,23 @@ from __future__ import print_function from pyspark import SparkContext -from pyspark.sql import SQLContext -import numpy as np -from pyspark.mllib.linalg import Vectors # $example on$ from pyspark.mllib.stat import Statistics # $example off$ if __name__ == "__main__": sc = SparkContext(appName="HypothesisTestingKolmogorovSmirnovTestExample") # SparkContext - sqlContext = SQLContext(sc) # $example on$ parallelData = sc.parallelize([0.1, 0.15, 0.2, 0.3, 0.25]) # run a KS test for the sample versus a standard normal distribution testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1) - print(testResult) # summary of the test including the p-value, test statistic, - # and null hypothesis + # summary of the test including the p-value, test statistic, and null hypothesis # if our p-value indicates significance, we can reject the null hypothesis # Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with # a lambda to calculate the CDF 
is not made available in the Python API - + print(testResult) # $example off$ sc.stop() diff --git a/examples/src/main/python/mllib/kernel_density_estimation_example.py b/examples/src/main/python/mllib/kernel_density_estimation_example.py index 746027e6d599f..3e8f7241a4a1e 100644 --- a/examples/src/main/python/mllib/kernel_density_estimation_example.py +++ b/examples/src/main/python/mllib/kernel_density_estimation_example.py @@ -18,16 +18,12 @@ from __future__ import print_function from pyspark import SparkContext -from pyspark.sql import SQLContext -import numpy as np -from pyspark.mllib.linalg import Vectors # $example on$ from pyspark.mllib.stat import KernelDensity # $example off$ if __name__ == "__main__": sc = SparkContext(appName="KernelDensityEstimationExample") # SparkContext - sqlContext = SQLContext(sc) # $example on$ # an RDD of sample data diff --git a/examples/src/main/python/mllib/random_data_generation_example.py b/examples/src/main/python/mllib/random_data_generation_example.py index 7eb2ea81a8038..c08d631c5db7d 100644 --- a/examples/src/main/python/mllib/random_data_generation_example.py +++ b/examples/src/main/python/mllib/random_data_generation_example.py @@ -18,16 +18,12 @@ from __future__ import print_function from pyspark import SparkContext -from pyspark.sql import SQLContext -import numpy as np -from pyspark.mllib.linalg import Vectors # $example on$ from pyspark.mllib.random import RandomRDDs # $example off$ if __name__ == "__main__": sc = SparkContext(appName="RandomDataGenerationExample") # SparkContext - sqlContext = SQLContext(sc) # $example on$ # Generate a random double RDD that contains 1 million i.i.d. values drawn from the diff --git a/examples/src/main/python/mllib/stratified_sampling_example.py b/examples/src/main/python/mllib/stratified_sampling_example.py index 63bf2dddede01..a13f8f08dd68b 100644 --- a/examples/src/main/python/mllib/stratified_sampling_example.py +++ b/examples/src/main/python/mllib/stratified_sampling_example.py @@ -18,16 +18,9 @@ from __future__ import print_function from pyspark import SparkContext -from pyspark.sql import SQLContext -import numpy as np -from pyspark.mllib.linalg import Vectors -# $example on$ -from pyspark.mllib.stat import Statistics -# $example off$ if __name__ == "__main__": sc = SparkContext(appName="StratifiedSamplingExample") # SparkContext - sqlContext = SQLContext(sc) # $example on$ # an RDD of any key value pairs @@ -37,7 +30,6 @@ fractions = {1: 0.1, 2: 0.6, 3: 0.3} approxSample = data.sampleByKey(False, fractions) - # $example off$ for each in approxSample.collect(): diff --git a/examples/src/main/python/mllib/summary_statistics_example.py b/examples/src/main/python/mllib/summary_statistics_example.py index 2048bc432aa77..eb6ef272a4f66 100644 --- a/examples/src/main/python/mllib/summary_statistics_example.py +++ b/examples/src/main/python/mllib/summary_statistics_example.py @@ -18,16 +18,13 @@ from __future__ import print_function from pyspark import SparkContext -from pyspark.sql import SQLContext import numpy as np -from pyspark.mllib.linalg import Vectors # $example on$ from pyspark.mllib.stat import Statistics # $example off$ if __name__ == "__main__": sc = SparkContext(appName="SummaryStatisticsExample") # SparkContext - sqlContext = SQLContext(sc) # $example on$ v1 = np.array([1.0, 2.0, 3.0]) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala index 
9b3c0321f067d..e395a25dc6a2e 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala @@ -29,7 +29,7 @@ object CorrelationsExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("CorrelationsExample").setMaster("local[*]") + val conf = new SparkConf().setAppName("CorrelationsExample") val sc = new SparkContext(conf) // $example on$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala index 4ba8badbaa867..de48ae58ce0e1 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala @@ -37,6 +37,7 @@ object RandomDataGenerationExample { // Apply a transform to get a random double RDD following `N(1, 4)`. val v = u.map(x => 1.0 + 2.0 * x) // $example off$ + u.foreach(print) v.foreach(print) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala index 24a5407426894..453e4a2f9d283 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala @@ -21,16 +21,13 @@ package org.apache.spark.examples.mllib // $example on$ import org.apache.spark.{SparkConf, SparkContext} // $example off$ -import org.apache.spark.sql.SQLContext - object StratifiedSamplingExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("StratifiedSamplingExample").setMaster("local[*]") + val conf = new SparkConf().setAppName("StratifiedSamplingExample") val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) // $example on$ // an RDD[(K, V)] of any key value pairs diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala index c2fe7976b4609..675c07aeab954 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala @@ -28,7 +28,7 @@ object SummaryStatisticsExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("SummaryStatisticsExample").setMaster("local[*]") + val conf = new SparkConf().setAppName("SummaryStatisticsExample") val sc = new SparkContext(conf) // $example on$ From aec10cac879f29a3edb03e0855e60a772ca30c18 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Tue, 23 Feb 2016 13:18:09 -0800 Subject: [PATCH 20/26] [SPARK-13019] re-organize python import --- examples/src/main/python/mllib/correlations_example.py | 3 ++- examples/src/main/python/mllib/hypothesis_testing_example.py | 2 -- examples/src/main/python/mllib/summary_statistics_example.py | 3 ++- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/src/main/python/mllib/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py index 46bd5ede53baf..a0d2e7d8be7eb 100644 --- a/examples/src/main/python/mllib/correlations_example.py +++ b/examples/src/main/python/mllib/correlations_example.py @@ -17,8 +17,9 @@ from 
__future__ import print_function -from pyspark import SparkContext import numpy as np + +from pyspark import SparkContext # $example on$ from pyspark.mllib.stat import Statistics # $example off$ diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py index f548d6566ba2a..ca3d12bc153fd 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_example.py @@ -18,9 +18,7 @@ from __future__ import print_function from pyspark import SparkContext -from pyspark.mllib.linalg import Vectors # $example on$ -from pyspark import SparkContext from pyspark.mllib.linalg import Vectors, Matrices from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.stat import Statistics diff --git a/examples/src/main/python/mllib/summary_statistics_example.py b/examples/src/main/python/mllib/summary_statistics_example.py index eb6ef272a4f66..088f9f8807498 100644 --- a/examples/src/main/python/mllib/summary_statistics_example.py +++ b/examples/src/main/python/mllib/summary_statistics_example.py @@ -17,8 +17,9 @@ from __future__ import print_function -from pyspark import SparkContext import numpy as np + +from pyspark import SparkContext # $example on$ from pyspark.mllib.stat import Statistics # $example off$ From e2737eedd6c45c82f25045442b1d811ab2c395ec Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Sun, 6 Mar 2016 14:45:28 -0800 Subject: [PATCH 21/26] [SPARK-13019] code review improvement --- docs/mllib-statistics.md | 44 ++++++++++++- .../mllib/JavaCorrelationsExample.java | 14 ++-- .../mllib/JavaHypothesisTestingExample.java | 30 +++++---- ...isTestingKolmogorovSmirnovTestExample.java | 12 ++-- .../JavaKernelDensityEstimationExample.java | 12 ++-- .../JavaRandomDataGenerationExample.java | 65 ------------------- .../mllib/JavaStratifiedSamplingExample.java | 22 +++---- .../mllib/JavaSummaryStatisticsExample.java | 12 ++-- .../main/python/mllib/correlations_example.py | 2 +- .../mllib/hypothesis_testing_example.py | 7 +- .../mllib/summary_statistics_example.py | 10 +-- .../examples/mllib/CorrelationsExample.scala | 8 +-- .../mllib/HypothesisTestingExample.scala | 22 ++++--- ...sTestingKolmogorovSmirnovTestExample.scala | 11 ++-- .../KernelDensityEstimationExample.scala | 4 +- .../mllib/RandomDataGenerationExample.scala | 48 -------------- .../mllib/StratifiedSamplingExample.scala | 6 +- .../mllib/SummaryStatisticsExample.scala | 8 +-- 18 files changed, 139 insertions(+), 198 deletions(-) delete mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java delete mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md index b06829f0247dd..62faa1bfa45ff 100644 --- a/docs/mllib-statistics.md +++ b/docs/mllib-statistics.md @@ -277,7 +277,18 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`. Refer to the [`RandomRDDs` Scala docs](api/scala/index.html#org.apache.spark.mllib.random.RandomRDDs) for details on the API. -{% include_example scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala %} +-{% highlight scala %} +-import org.apache.spark.SparkContext +-import org.apache.spark.mllib.random.RandomRDDs._ +- +-val sc: SparkContext = ... +- +-// Generate a random double RDD that contains 1 million i.i.d. 
values drawn from the +-// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. +-val u = normalRDD(sc, 1000000L, 10) +-// Apply a transform to get a random double RDD following `N(1, 4)`. +-val v = u.map(x => 1.0 + 2.0 * x) +-{% endhighlight %}
@@ -288,7 +299,24 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`. Refer to the [`RandomRDDs` Java docs](api/java/org/apache/spark/mllib/random/RandomRDDs) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java %} +-{% highlight java %} + -import org.apache.spark.SparkContext; + -import org.apache.spark.api.JavaDoubleRDD; + -import static org.apache.spark.mllib.random.RandomRDDs.*; + - + -JavaSparkContext jsc = ... + - + -// Generate a random double RDD that contains 1 million i.i.d. values drawn from the + -// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. + -JavaDoubleRDD u = normalJavaRDD(jsc, 1000000L, 10); + -// Apply a transform to get a random double RDD following `N(1, 4)`. + -JavaDoubleRDD v = u.map( + - new Function() { + - public Double call(Double x) { + - return 1.0 + 2.0 * x; + - } + - }); + -{% endhighlight %}
@@ -299,7 +327,17 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`. Refer to the [`RandomRDDs` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.random.RandomRDDs) for more details on the API. -{% include_example python/mllib/random_data_generation_example.py %} +-{% highlight python %} + -from pyspark.mllib.random import RandomRDDs + - + -sc = ... # SparkContext + - + -# Generate a random double RDD that contains 1 million i.i.d. values drawn from the + -# standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. + -u = RandomRDDs.normalRDD(sc, 1000000L, 10) + -# Apply a transform to get a random double RDD following `N(1, 4)`. + -v = u.map(lambda x: 1.0 + 2.0 * x) + -{% endhighlight %}
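The three `{% highlight %}` blocks restored above carry stray leading `-` characters inside the fences; a later commit in this series ("removing '-'s") appears to clean these up. For reference, a minimal sketch of what the Scala block describes, assuming a live `SparkContext` named `sc`:

{% highlight scala %}
import org.apache.spark.mllib.random.RandomRDDs._

// Generate a random double RDD that contains 1 million i.i.d. values drawn from the
// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
val u = normalRDD(sc, 1000000L, 10)
// Apply a transform to get a random double RDD following `N(1, 4)`:
// if X ~ N(0, 1), then 1.0 + 2.0 * X has mean 1 and variance 2^2 = 4.
val v = u.map(x => 1.0 + 2.0 * x)
{% endhighlight %}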
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java index c3e3a789e755b..c27c1d01bab58 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java @@ -17,13 +17,13 @@ package org.apache.spark.examples.mllib; -import java.util.Arrays; - import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; // $example on$ -import org.apache.spark.api.java.JavaRDD; +import java.util.Arrays; + import org.apache.spark.api.java.JavaDoubleRDD; -import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.JavaRDD; import org.apache.spark.mllib.linalg.Matrix; import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.linalg.Vectors; @@ -38,16 +38,16 @@ public static void main(String[] args) { // $example on$ JavaDoubleRDD seriesX = jsc.parallelizeDoubles( - Arrays.asList(new Double[]{1.0, 2.0, 3.0, 3.0, 5.0})); // a series + Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0)); // a series // must have the same number of partitions and cardinality as seriesX JavaDoubleRDD seriesY = jsc.parallelizeDoubles( - Arrays.asList(new Double[]{11.0, 22.0, 33.0, 33.0, 555.0})); + Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0)); // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. // If a method is not specified, Pearson's method will be used by default. Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson"); - System.out.println("correlation is: " + correlation); + System.out.println("Correlation is: " + correlation); Vector v1 = Vectors.dense(1.0, 10.0, 100.0); Vector v2 = Vectors.dense(2.0, 20.0, 200.0); diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java index 8be28a11aff57..0960b07a98557 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java @@ -17,12 +17,13 @@ package org.apache.spark.examples.mllib; -import java.util.Arrays; - import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; + // $example on$ +import java.util.Arrays; + import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.linalg.Matrices; import org.apache.spark.mllib.linalg.Matrix; import org.apache.spark.mllib.linalg.Vector; @@ -39,27 +40,31 @@ public static void main(String[] args) { JavaSparkContext jsc = new JavaSparkContext(conf); // $example on$ - Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25); // a vector composed of the frequencies of events + // a vector composed of the frequencies of events + Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25); - // compute the goodness of fit. If a second vector to test against is not supplied as a parameter, - // the test runs against a uniform distribution. + // compute the goodness of fit. If a second vector to test against is not supplied + // as a parameter, the test runs against a uniform distribution. 
ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec); - // summary of the test including the p-value, degrees of freedom, test statistic, the method used, - // and the null hypothesis. + // summary of the test including the p-value, degrees of freedom, test statistic, + // the method used, and the null hypothesis. System.out.println(goodnessOfFitTestResult); + System.out.println(); - // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) - Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); // a contingency matrix + // Create a contingency matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) + Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); // conduct Pearson's independence test on the input contingency matrix ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat); // summary of the test including the p-value, degrees of freedom... System.out.println(independenceTestResult); + System.out.println(); LabeledPoint p1 = new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)); LabeledPoint p2 = new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)); LabeledPoint p3 = new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5)); - JavaRDD obs = jsc.parallelize(Arrays.asList(p1, p2, p3)); // an RDD of labeled points + // an RDD of labeled points + JavaRDD obs = jsc.parallelize(Arrays.asList(p1, p2, p3)); // The contingency table is constructed from the raw (feature, label) pairs and used to conduct // the independence test. Returns an array containing the ChiSquaredTestResult for every feature @@ -68,7 +73,8 @@ public static void main(String[] args) { int i = 1; for (ChiSqTestResult result : featureTestResults) { System.out.println("Column " + i + ":"); - System.out.println(result); // summary of the test + System.out.println(result); // summary of the test + System.out.println(); i++; } // $example off$ diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java index 9ae2907a38084..fe611c9ae67c9 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java @@ -17,12 +17,12 @@ package org.apache.spark.examples.mllib; -import java.util.Arrays; - import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; // $example on$ +import java.util.Arrays; + import org.apache.spark.api.java.JavaDoubleRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.stat.Statistics; import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult; // $example off$ @@ -30,12 +30,14 @@ public class JavaHypothesisTestingKolmogorovSmirnovTestExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample"); + SparkConf conf = + new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample"); JavaSparkContext jsc = new JavaSparkContext(conf); // $example on$ JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25)); - KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); + KolmogorovSmirnovTestResult testResult = + Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); // 
summary of the test including the p-value, test statistic, and null hypothesis // if our p-value indicates significance, we can reject the null hypothesis System.out.println(testResult); diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java index 18ccd0a951c7b..2f25b7534164a 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java @@ -17,12 +17,12 @@ package org.apache.spark.examples.mllib; -import java.util.Arrays; - import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; // $example on$ +import java.util.Arrays; + import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.stat.KernelDensity; // $example off$ @@ -33,10 +33,12 @@ public static void main(String[] args) { JavaSparkContext jsc = new JavaSparkContext(conf); // $example on$ + // an RDD of sample data JavaRDD data = jsc.parallelize( - Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0)); // an RDD of sample data + Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0)); - // Construct the density estimator with the sample data and a standard deviation for the Gaussian kernels + // Construct the density estimator with the sample data + // and a standard deviation for the Gaussian kernels KernelDensity kd = new KernelDensity().setSample(data).setBandwidth(3.0); // Find density estimates for the given values diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java deleted file mode 100644 index 341b47acf8543..0000000000000 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.api.java.function.VoidFunction; -// $example on$ -import org.apache.spark.api.java.JavaDoubleRDD; -import org.apache.spark.api.java.JavaRDD; - -import static org.apache.spark.mllib.random.RandomRDDs.*; -// $example off$ - -public class JavaRandomDataGenerationExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaRandomDataGenerationExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - - // $example on$ - // Generate a random double RDD that contains 1 million i.i.d. values drawn from the - // standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. - JavaDoubleRDD u = normalJavaRDD(jsc, 1000L, 10); - // Apply a transform to get a random double RDD following `N(1, 4)`. - JavaRDD v = u.map( - new Function() { - public Double call(Double x) { - return 1.0 + 2.0 * x; - } - }); - // $example off$ - - u.foreach(new VoidFunction() { - public void call(Double d) throws Exception { - System.out.println(d); - } - }); - - v.foreach(new VoidFunction() { - public void call(Double d) throws Exception { - System.out.println(d); - } - }); - - jsc.stop(); - } -} - diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java index d80592182ed92..0cec6e2e51214 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java @@ -17,22 +17,20 @@ package org.apache.spark.examples.mllib; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; + +// $example on$ import java.util.ArrayList; -import java.util.List; import java.util.HashMap; -// $example on$ +import java.util.List; import java.util.Map; -// $example off$ import scala.Tuple2; -import org.apache.spark.api.java.function.VoidFunction; -// $example on$ - -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.function.VoidFunction; // $example off$ -import org.apache.spark.SparkConf; public class JavaStratifiedSamplingExample { public static void main(String[] args) { @@ -60,8 +58,10 @@ public static void main(String[] args) { fractions.put(3, 0.3); // Get an exact sample from each stratum - JavaPairRDD approxSample = data.sampleByKey(false, fractions); // JavaPairRDD - JavaPairRDD exactSample = data.sampleByKeyExact(false, fractions); // JavaPairRDD + JavaPairRDD approxSample = + data.sampleByKey(false, fractions); // JavaPairRDD + JavaPairRDD exactSample = + data.sampleByKeyExact(false, fractions); // JavaPairRDD // $example off$ approxSample.foreach(new VoidFunction>() { @@ -69,7 +69,7 @@ public void call(Tuple2 t) throws Exception { System.out.println(t._1() + " " + t._2()); } }); - + System.out.println(); exactSample.foreach(new VoidFunction>() { public void call(Tuple2 t) throws Exception { System.out.println(t._1() + " " + t._2()); diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java index 755e6e5a2982f..aae06679d3e2d 100644 --- 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java @@ -17,17 +17,17 @@ package org.apache.spark.examples.mllib; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +// $example on$ import java.util.Arrays; -// $example on$ import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.stat.MultivariateStatisticalSummary; import org.apache.spark.mllib.stat.Statistics; // $example off$ -import org.apache.spark.SparkConf; public class JavaSummaryStatisticsExample { public static void main(String[] args) { @@ -44,9 +44,9 @@ public static void main(String[] args) { // Compute column summary statistics. MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd()); - System.out.println(summary.mean()); // a dense vector containing the mean value for each column - System.out.println(summary.variance()); // column-wise variance - System.out.println(summary.numNonzeros()); // number of nonzeros in each column + System.out.println(summary.mean()); // a dense vector containing the mean value for each column + System.out.println(summary.variance()); // column-wise variance + System.out.println(summary.numNonzeros()); // number of nonzeros in each column // $example off$ jsc.stop(); diff --git a/examples/src/main/python/mllib/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py index a0d2e7d8be7eb..2163d08934d5c 100644 --- a/examples/src/main/python/mllib/correlations_example.py +++ b/examples/src/main/python/mllib/correlations_example.py @@ -34,7 +34,7 @@ # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. # If a method is not specified, Pearson's method will be used by default. - print(Statistics.corr(seriesX, seriesY, method="pearson")) + print("Correlation is: " + str(Statistics.corr(seriesX, seriesY, method="pearson"))) v1 = np.array([1.0, 10.0, 100.0]) v2 = np.array([2.0, 20.0, 200.0]) diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py index ca3d12bc153fd..5e2521ae93e28 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_example.py @@ -19,7 +19,7 @@ from pyspark import SparkContext # $example on$ -from pyspark.mllib.linalg import Vectors, Matrices +from pyspark.mllib.linalg import Matrices, Vectors from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.stat import Statistics # $example off$ @@ -35,17 +35,19 @@ goodnessOfFitTestResult = Statistics.chiSqTest(vec) print(goodnessOfFitTestResult) # summary of the test including the p-value, degrees of freedom, # test statistic, the method used, and the null hypothesis. + print() mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0]) # a contingency matrix # conduct Pearson's independence test on the input contingency matrix independenceTestResult = Statistics.chiSqTest(mat) print(independenceTestResult) # summary of the test including the p-value, degrees of freedom + print() p1 = LabeledPoint(1.0, [1.0, 0.0, 3.0]) p2 = LabeledPoint(1.0, [1.0, 2.0, 0.0]) p3 = LabeledPoint(1.0, [-1.0, 0.0, -0.5]) - obs = sc.parallelize([p1, p2, p3]) # LabeledPoint(feature, label) . 
+ obs = sc.parallelize([p1, p2, p3]) # LabeledPoint(feature, label) # The contingency table is constructed from an RDD of LabeledPoint and used to conduct # the independence test. Returns an array containing the ChiSquaredTestResult for every feature @@ -55,6 +57,7 @@ for i, result in enumerate(featureTestResults): print("Column: " + str(i + 1)) print(result) + print() # $example off$ sc.stop() diff --git a/examples/src/main/python/mllib/summary_statistics_example.py b/examples/src/main/python/mllib/summary_statistics_example.py index 088f9f8807498..0b9d9ddbc9122 100644 --- a/examples/src/main/python/mllib/summary_statistics_example.py +++ b/examples/src/main/python/mllib/summary_statistics_example.py @@ -17,10 +17,10 @@ from __future__ import print_function -import numpy as np - from pyspark import SparkContext # $example on$ +import numpy as np + from pyspark.mllib.stat import Statistics # $example off$ @@ -28,9 +28,9 @@ sc = SparkContext(appName="SummaryStatisticsExample") # SparkContext # $example on$ - v1 = np.array([1.0, 2.0, 3.0]) - v2 = np.array([10.0, 20.0, 30.0]) - v3 = np.array([100.0, 200.0, 300.0]) + v1 = np.array([1.0, 10.0, 100.0]) + v2 = np.array([2.0, 20.0, 200.0]) + v3 = np.array([3.0, 30.0, 300.0]) mat = sc.parallelize([v1, v2, v3]) # an RDD of Vectors # Compute column summary statistics. diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala index e395a25dc6a2e..69b6cef551f45 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala @@ -22,25 +22,25 @@ import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.stat.Statistics -// $example off$ import org.apache.spark.rdd.RDD +// $example off$ object CorrelationsExample { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("CorrelationsExample") val sc = new SparkContext(conf) // $example on$ - val seriesX: RDD[Double] = sc.parallelize(Array(1, 2, 3, 3, 5)) // a series + val seriesX: RDD[Double] = sc.parallelize(Array(1, 2, 3, 3, 5)) // a series val seriesY: RDD[Double] = sc.parallelize(Array(11, 22, 33, 33, 555)) // must have the same number of partitions and cardinality as seriesX // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a // method is not specified, Pearson's method will be used by default. val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson") - println(correlation) + println(s"Correlation is: $correlation") val data: RDD[Vector] = sc.parallelize( Seq( diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala index c09e99011e579..1b548eedaaf74 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala @@ -41,30 +41,34 @@ object HypothesisTestingExample { // compute the goodness of fit. If a second vector to test against is not supplied // as a parameter, the test runs against a uniform distribution. 
val goodnessOfFitTestResult = Statistics.chiSqTest(vec)
-    println(goodnessOfFitTestResult) // summary of the test including the p-value,
-    // degrees of freedom, test statistic, the method used, and the null hypothesis.
+    // summary of the test including the p-value, degrees of freedom, test statistic, the method
+    // used, and the null hypothesis.
+    println(goodnessOfFitTestResult)
+    println()
 
     // a contingency matrix. Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
     val mat: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
 
     // conduct Pearson's independence test on the input contingency matrix
     val independenceTestResult = Statistics.chiSqTest(mat)
-    println(independenceTestResult) // summary of the test including the p-value, degrees of freedom
+    // summary of the test including the p-value, degrees of freedom
+    println(independenceTestResult)
+    println()
 
     val p1 = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0))
     val p2 = LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0))
     val p3 = LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
-    val obs: RDD[LabeledPoint] = sc.parallelize(Seq(p1, p2, p3)) // (feature, label) pairs.
+    val obs: RDD[LabeledPoint] = sc.parallelize(Seq(p1, p2, p3))  // (feature, label) pairs.
 
     // The contingency table is constructed from the raw (feature, label) pairs and used to conduct
     // the independence test. Returns an array containing the ChiSquaredTestResult for every feature
     // against the label.
     val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs)
-    var i = 1
-    featureTestResults.foreach { result =>
-      println(s"Column $i:\n$result")
-      i += 1
-    } // summary of the test
+    featureTestResults.zipWithIndex.foreach { result =>
+      println(s"Column ${result._2 + 1}:")
+      println(result._1)
+      println()
+    } // summary of the test
     // $example off$
 
     sc.stop()

diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala
index 78660f45a43a0..840874cf3c2fe 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala
@@ -26,19 +26,20 @@ import org.apache.spark.rdd.RDD
 
 object HypothesisTestingKolmogorovSmirnovTestExample {
 
-  def main(args: Array[String]) {
+  def main(args: Array[String]): Unit = {
 
     val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample")
     val sc = new SparkContext(conf)
 
     // $example on$
-    val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25)) // an RDD of sample data
+    val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25))  // an RDD of sample data
 
     // run a KS test for the sample versus a standard normal distribution
     val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
-    println(testResult) // summary of the test including the p-value, test statistic,
-    // and null hypothesis
-    // if our p-value indicates significance, we can reject the null hypothesis
+    // summary of the test including the p-value, test statistic, and null hypothesis.
+    // If our p-value indicates significance, we can reject the null hypothesis.
+ println(testResult) + println() // perform a KS test using a cumulative distribution function of our making val myCDF = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala index 402276ff086d8..cc5d159b36cc9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala @@ -26,7 +26,7 @@ import org.apache.spark.rdd.RDD object KernelDensityEstimationExample { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("KernelDensityEstimationExample") val sc = new SparkContext(conf) @@ -45,7 +45,7 @@ object KernelDensityEstimationExample { val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) // $example off$ - densities.foreach(print) + densities.foreach(println) sc.stop() } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala deleted file mode 100644 index de48ae58ce0e1..0000000000000 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package org.apache.spark.examples.mllib - -import org.apache.spark.{SparkConf, SparkContext} -// $example on$ -import org.apache.spark.mllib.random.RandomRDDs._ -// $example off$ - -object RandomDataGenerationExample { - - def main(args: Array[String]) { - - val conf = new SparkConf().setAppName("RandomDataGenerationExample") - val sc = new SparkContext(conf) - - // $example on$ - // Generate a random double RDD that contains 1 million i.i.d. values drawn from the - // standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. - val u = normalRDD(sc, 1000L, 10) - // Apply a transform to get a random double RDD following `N(1, 4)`. 
- val v = u.map(x => 1.0 + 2.0 * x) - // $example off$ - - u.foreach(print) - v.foreach(print) - - sc.stop() - } -} -// scalastyle:on println - diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala index 453e4a2f9d283..f0084dada2240 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala @@ -18,13 +18,11 @@ // scalastyle:off println package org.apache.spark.examples.mllib -// $example on$ import org.apache.spark.{SparkConf, SparkContext} -// $example off$ object StratifiedSamplingExample { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("StratifiedSamplingExample") val sc = new SparkContext(conf) @@ -40,10 +38,10 @@ object StratifiedSamplingExample { // Get an exact sample from each stratum val approxSample = data.sampleByKey(withReplacement = false, fractions) val exactSample = data.sampleByKeyExact(withReplacement = false, fractions) - // $example off$ approxSample.foreach(println) + println() exactSample.foreach(println) sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala index 675c07aeab954..473b6789fd375 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala @@ -26,7 +26,7 @@ import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} object SummaryStatisticsExample { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SummaryStatisticsExample") val sc = new SparkContext(conf) @@ -40,9 +40,9 @@ object SummaryStatisticsExample { // Compute column summary statistics. val summary: MultivariateStatisticalSummary = Statistics.colStats(observations) - println(summary.mean) // a dense vector containing the mean value for each column - println(summary.variance) // column-wise variance - println(summary.numNonzeros) // number of nonzeros in each column + println(summary.mean) // a dense vector containing the mean value for each column + println(summary.variance) // column-wise variance + println(summary.numNonzeros) // number of nonzeros in each column // $example off$ sc.stop() From 33293947bde90fd29014587cd42533df121bd783 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Sun, 6 Mar 2016 14:49:29 -0800 Subject: [PATCH 22/26] [SPARK-13019] sorry, forget to delete python file --- .../mllib/random_data_generation_example.py | 42 ------------------- 1 file changed, 42 deletions(-) delete mode 100644 examples/src/main/python/mllib/random_data_generation_example.py diff --git a/examples/src/main/python/mllib/random_data_generation_example.py b/examples/src/main/python/mllib/random_data_generation_example.py deleted file mode 100644 index c08d631c5db7d..0000000000000 --- a/examples/src/main/python/mllib/random_data_generation_example.py +++ /dev/null @@ -1,42 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -from pyspark import SparkContext -# $example on$ -from pyspark.mllib.random import RandomRDDs -# $example off$ - -if __name__ == "__main__": - sc = SparkContext(appName="RandomDataGenerationExample") # SparkContext - - # $example on$ - # Generate a random double RDD that contains 1 million i.i.d. values drawn from the - # standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. - u = RandomRDDs.normalRDD(sc, 1000L, 10) - # Apply a transform to get a random double RDD following `N(1, 4)`. - v = u.map(lambda x: 1.0 + 2.0 * x) - # $example off$ - - for each in u.collect(): - print(each) - - for each in v.collect(): - print(each) - - sc.stop() From acf7096e750b9150fbe309fc6c90aecb27b1102d Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Mon, 7 Mar 2016 13:30:49 -0800 Subject: [PATCH 23/26] [SPARK-13019] removing '-'s --- docs/mllib-statistics.md | 158 +++++++++--------- .../mllib/JavaStratifiedSamplingExample.java | 4 +- .../examples/mllib/CorrelationsExample.scala | 4 +- 3 files changed, 83 insertions(+), 83 deletions(-) diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md index 62faa1bfa45ff..3f6e25ed7c04d 100644 --- a/docs/mllib-statistics.md +++ b/docs/mllib-statistics.md @@ -10,24 +10,24 @@ displayTitle: Basic Statistics - spark.mllib `\[ \newcommand{\R}{\mathbb{R}} -\newcommand{\E}{\mathbb{E}} +\newcommand{\E}{\mathbb{E}} \newcommand{\x}{\mathbf{x}} \newcommand{\y}{\mathbf{y}} \newcommand{\wv}{\mathbf{w}} \newcommand{\av}{\mathbf{\alpha}} \newcommand{\bv}{\mathbf{b}} \newcommand{\N}{\mathbb{N}} -\newcommand{\id}{\mathbf{I}} -\newcommand{\ind}{\mathbf{1}} -\newcommand{\0}{\mathbf{0}} -\newcommand{\unit}{\mathbf{e}} -\newcommand{\one}{\mathbf{1}} +\newcommand{\id}{\mathbf{I}} +\newcommand{\ind}{\mathbf{1}} +\newcommand{\0}{\mathbf{0}} +\newcommand{\unit}{\mathbf{e}} +\newcommand{\one}{\mathbf{1}} \newcommand{\zero}{\mathbf{0}} \]` -## Summary statistics +## Summary statistics -We provide column summary statistics for `RDD[Vector]` through the function `colStats` +We provide column summary statistics for `RDD[Vector]` through the function `colStats` available in `Statistics`.
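As a reference point for the doc hunks that follow, a minimal sketch of the `colStats` call this section documents; illustrative only, assuming a live `SparkContext` named `sc`:

{% highlight scala %}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.rdd.RDD

val observations: RDD[Vector] = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 20.0, 200.0),
  Vectors.dense(3.0, 30.0, 300.0)))

// Compute column summary statistics.
val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
println(summary.count)        // number of rows (3 here)
println(summary.mean)         // a dense vector containing the mean value for each column
println(summary.variance)     // column-wise variance
println(summary.numNonzeros)  // number of nonzeros in each column
println(summary.max)          // column-wise maximum
{% endhighlight %}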
@@ -71,13 +71,13 @@ Refer to the [`MultivariateStatisticalSummary` Python docs](api/python/pyspark.m ## Correlations Calculating the correlation between two series of data is a common operation in Statistics. In `spark.mllib` -we provide the flexibility to calculate pairwise correlations among many series. The supported +we provide the flexibility to calculate pairwise correlations among many series. The supported correlation methods are currently Pearson's and Spearman's correlation. - +
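The two call shapes this section describes, condensed into a minimal Scala sketch; illustrative only, assuming a live `SparkContext` named `sc`:

{% highlight scala %}
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

val seriesX: RDD[Double] = sc.parallelize(Array(1.0, 2.0, 3.0, 3.0, 5.0))
// must have the same number of partitions and cardinality as seriesX
val seriesY: RDD[Double] = sc.parallelize(Array(11.0, 22.0, 33.0, 33.0, 555.0))

// two RDD[Double]s in, a single Double out
val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")

// one RDD[Vector] in, a pairwise correlation Matrix out; each Vector is a row
val data: RDD[Vector] = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 20.0, 200.0),
  Vectors.dense(5.0, 33.0, 366.0)))
val correlMatrix: Matrix = Statistics.corr(data, "pearson")
{% endhighlight %}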
-[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to -calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or +[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to +calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` respectively. Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API. @@ -86,8 +86,8 @@ Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mll
-[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to -calculate correlations between series. Depending on the type of input, two `JavaDoubleRDD`s or +[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to +calculate correlations between series. Depending on the type of input, two `JavaDoubleRDD`s or a `JavaRDD`, the output will be a `Double` or the correlation `Matrix` respectively. Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API. @@ -96,8 +96,8 @@ Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Stati
-[`Statistics`](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) provides methods to -calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or +[`Statistics`](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) provides methods to +calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` respectively. Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API. @@ -111,21 +111,21 @@ Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.ml Unlike the other statistics functions, which reside in `spark.mllib`, stratified sampling methods, `sampleByKey` and `sampleByKeyExact`, can be performed on RDD's of key-value pairs. For stratified -sampling, the keys can be thought of as a label and the value as a specific attribute. For example -the key can be man or woman, or document ids, and the respective values can be the list of ages -of the people in the population or the list of words in the documents. The `sampleByKey` method -will flip a coin to decide whether an observation will be sampled or not, therefore requires one -pass over the data, and provides an *expected* sample size. `sampleByKeyExact` requires significant +sampling, the keys can be thought of as a label and the value as a specific attribute. For example +the key can be man or woman, or document ids, and the respective values can be the list of ages +of the people in the population or the list of words in the documents. The `sampleByKey` method +will flip a coin to decide whether an observation will be sampled or not, therefore requires one +pass over the data, and provides an *expected* sample size. `sampleByKeyExact` requires significant more resources than the per-stratum simple random sampling used in `sampleByKey`, but will provide -the exact sampling size with 99.99% confidence. `sampleByKeyExact` is currently not supported in +the exact sampling size with 99.99% confidence. `sampleByKeyExact` is currently not supported in python.
[`sampleByKeyExact()`](api/scala/index.html#org.apache.spark.rdd.PairRDDFunctions) allows users to -sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired +sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K$ is the set of -keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample +keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample size, whereas sampling with replacement requires two additional passes. {% include_example scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala %} @@ -133,17 +133,17 @@ size, whereas sampling with replacement requires two additional passes.
[`sampleByKeyExact()`](api/java/org/apache/spark/api/java/JavaPairRDD.html) allows users to -sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired +sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K$ is the set of -keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample +keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample size, whereas sampling with replacement requires two additional passes. {% include_example java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java %}
[`sampleByKey()`](api/python/pyspark.html#pyspark.RDD.sampleByKey) allows users to
-sample approximately $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the
-desired fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K$ is the
+sample approximately $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the
+desired fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K$ is the
set of keys.

*Note:* `sampleByKeyExact()` is currently not supported in Python.

@@ -155,27 +155,27 @@ set of keys.

## Hypothesis testing

-Hypothesis testing is a powerful tool in statistics to determine whether a result is statistically
-significant, whether this result occurred by chance or not. `spark.mllib` currently supports Pearson's
+Hypothesis testing is a powerful tool in statistics to determine whether a result is statistically
+significant, i.e., whether this result occurred by chance or not. `spark.mllib` currently supports Pearson's
chi-squared ($\chi^2$) tests for goodness of fit and independence. The input data types determine
-whether the goodness of fit or the independence test is conducted. The goodness of fit test requires
+whether the goodness of fit or the independence test is conducted. The goodness of fit test requires
an input type of `Vector`, whereas the independence test requires a `Matrix` as input.

-`spark.mllib` also supports the input type `RDD[LabeledPoint]` to enable feature selection via chi-squared
+`spark.mllib` also supports the input type `RDD[LabeledPoint]` to enable feature selection via chi-squared
independence tests.
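Since the dispatch between the two chi-squared tests is purely type-driven, a minimal sketch may help while reading the example diffs below. The input values mirror those used in the example files in this series:

{% highlight scala %}
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.stat.Statistics

// A Vector input selects the goodness-of-fit test; with no expected-frequency
// vector supplied, the observed frequencies are tested against a uniform distribution.
val vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)
println(Statistics.chiSqTest(vec))

// A Matrix input selects the independence test on the contingency matrix
// ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)), stored column-major.
val mat = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
println(Statistics.chiSqTest(mat))
{% endhighlight %}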
-[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to -run Pearson's chi-squared tests. The following example demonstrates how to run and interpret +[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to +run Pearson's chi-squared tests. The following example demonstrates how to run and interpret hypothesis tests. {% include_example scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala %}
-[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to
-run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
+[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to
+run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
hypothesis tests.

Refer to the [`ChiSqTestResult` Java docs](api/java/org/apache/spark/mllib/stat/test/ChiSqTestResult.html) for details on the API.

@@ -197,11 +197,11 @@ Refer to the [`Statistics` Python docs](api/python/pyspark.ml

Additionally, `spark.mllib` provides a 1-sample, 2-sided implementation of the Kolmogorov-Smirnov
(KS) test for equality of probability distributions. By providing the name of a theoretical distribution
-(currently solely supported for the normal distribution) and its parameters, or a function to
+(currently only the normal distribution is supported) and its parameters, or a function to
calculate the cumulative distribution according to a given theoretical distribution, the user can
test the null hypothesis that their sample is drawn from that distribution. If the
user tests against the normal distribution (`distName="norm"`) but does not provide distribution
-parameters, the test initializes to the standard normal distribution and logs an appropriate
+parameters, the test initializes to the standard normal distribution and logs an appropriate
message.
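A minimal sketch of the KS call itself, using the same sample values as the Python example file in this series and assuming an active `SparkContext` named `sc`:

{% highlight scala %}
import org.apache.spark.mllib.stat.Statistics

val data = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25)) // an RDD of sample data

// Test the null hypothesis that the sample is drawn from N(0, 1); the result
// carries the p-value, the test statistic, and the null hypothesis.
val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
println(testResult)
{% endhighlight %}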
@@ -277,18 +277,18 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`.

Refer to the [`RandomRDDs` Scala docs](api/scala/index.html#org.apache.spark.mllib.random.RandomRDDs) for details on the API.

-{% highlight scala %}
-import org.apache.spark.SparkContext
-import org.apache.spark.mllib.random.RandomRDDs._
-
-val sc: SparkContext = ...
-
-// Generate a random double RDD that contains 1 million i.i.d. values drawn from the
-// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
-val u = normalRDD(sc, 1000000L, 10)
-// Apply a transform to get a random double RDD following `N(1, 4)`.
-val v = u.map(x => 1.0 + 2.0 * x)
-{% endhighlight %}
+{% highlight scala %}
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.random.RandomRDDs._
+
+val sc: SparkContext = ...
+
+// Generate a random double RDD that contains 1 million i.i.d. values drawn from the
+// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
+val u = normalRDD(sc, 1000000L, 10)
+// Apply a transform to get a random double RDD following `N(1, 4)`.
+val v = u.map(x => 1.0 + 2.0 * x)
+{% endhighlight %}
@@ -299,24 +299,24 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`.

Refer to the [`RandomRDDs` Java docs](api/java/org/apache/spark/mllib/random/RandomRDDs) for details on the API.

-{% highlight java %}
-import org.apache.spark.SparkContext;
-import org.apache.spark.api.JavaDoubleRDD;
-import static org.apache.spark.mllib.random.RandomRDDs.*;
-
-JavaSparkContext jsc = ...
-
-// Generate a random double RDD that contains 1 million i.i.d. values drawn from the
-// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
-JavaDoubleRDD u = normalJavaRDD(jsc, 1000000L, 10);
-// Apply a transform to get a random double RDD following `N(1, 4)`.
-JavaDoubleRDD v = u.map(
-  new Function<Double, Double>() {
-    public Double call(Double x) {
-      return 1.0 + 2.0 * x;
-    }
-  });
-{% endhighlight %}
+{% highlight java %}
+import org.apache.spark.api.java.JavaDoubleRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.Function;
+import static org.apache.spark.mllib.random.RandomRDDs.*;
+
+JavaSparkContext jsc = ...
+
+// Generate a random double RDD that contains 1 million i.i.d. values drawn from the
+// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
+JavaDoubleRDD u = normalJavaRDD(jsc, 1000000L, 10);
+// Apply a transform to get a random double RDD following `N(1, 4)`.
+JavaRDD<Double> v = u.map(
+  new Function<Double, Double>() {
+    public Double call(Double x) {
+      return 1.0 + 2.0 * x;
+    }
+  });
+{% endhighlight %}
@@ -327,17 +327,17 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`.

Refer to the [`RandomRDDs` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.random.RandomRDDs) for more details on the API.

-{% highlight python %}
-from pyspark.mllib.random import RandomRDDs
-
-sc = ...  # SparkContext
-
-# Generate a random double RDD that contains 1 million i.i.d. values drawn from the
-# standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
-u = RandomRDDs.normalRDD(sc, 1000000L, 10)
-# Apply a transform to get a random double RDD following `N(1, 4)`.
-v = u.map(lambda x: 1.0 + 2.0 * x)
-{% endhighlight %}
+{% highlight python %}
+from pyspark.mllib.random import RandomRDDs
+
+sc = ...  # SparkContext
+
+# Generate a random double RDD that contains 1 million i.i.d. values drawn from the
+# standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
+u = RandomRDDs.normalRDD(sc, 1000000L, 10)
+# Apply a transform to get a random double RDD following `N(1, 4)`.
+v = u.map(lambda x: 1.0 + 2.0 * x)
+{% endhighlight %}
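The same `RandomRDDs` factory pattern extends beyond the normal distribution. A short Scala sketch of two sibling generators from the same API, for illustration only (not part of this patch; `sc` is an active `SparkContext`):

{% highlight scala %}
import org.apache.spark.mllib.random.RandomRDDs._

// 1 million i.i.d. draws from U(0, 1) in 10 partitions.
val w = uniformRDD(sc, 1000000L, 10)
// 1 million i.i.d. draws from a Poisson distribution with mean 2.0.
val p = poissonRDD(sc, 2.0, 1000000L, 10)
{% endhighlight %}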
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java
index 0cec6e2e51214..0fa051783a014 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java
@@ -59,9 +59,9 @@ public static void main(String[] args) {

// Get an exact sample from each stratum
JavaPairRDD<Integer, Character> approxSample =
-  data.sampleByKey(false, fractions); // JavaPairRDD<Integer, Character>
+  data.sampleByKey(false, fractions); // JavaPairRDD<Integer, Character>
JavaPairRDD<Integer, Character> exactSample =
-  data.sampleByKeyExact(false, fractions); // JavaPairRDD<Integer, Character>
+  data.sampleByKeyExact(false, fractions); // JavaPairRDD<Integer, Character>
// $example off$

approxSample.foreach(new VoidFunction<Tuple2<Integer, Character>>() {
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala
index 69b6cef551f45..1202caf534e95 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala
@@ -34,8 +34,8 @@ object CorrelationsExample {

// $example on$
val seriesX: RDD[Double] = sc.parallelize(Array(1, 2, 3, 3, 5)) // a series
-val seriesY: RDD[Double] = sc.parallelize(Array(11, 22, 33, 33, 555)) // must have the same number of partitions and cardinality as seriesX
+// must have the same number of partitions and cardinality as seriesX
+val seriesY: RDD[Double] = sc.parallelize(Array(11, 22, 33, 33, 555))

// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
// method is not specified, Pearson's method will be used by default.
@@ -47,7 +47,7 @@ object CorrelationsExample {
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 20.0, 200.0),
  Vectors.dense(5.0, 33.0, 366.0))
-) // note that each Vector is a row and not a column
+) // note that each Vector is a row and not a column

// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method
// If a method is not specified, Pearson's method will be used by default.
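The hunks above touch both `Statistics.corr` overloads; as a compact reminder of their input and output types, a sketch with values taken from the example itself (`sc` assumed in scope):

{% highlight scala %}
import org.apache.spark.mllib.linalg.{Matrix, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

val seriesX: RDD[Double] = sc.parallelize(Array(1.0, 2.0, 3.0, 3.0, 5.0))
val seriesY: RDD[Double] = sc.parallelize(Array(11.0, 22.0, 33.0, 33.0, 555.0))
// Two RDD[Double] inputs produce a single Double.
val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")

// An RDD[Vector] input (each Vector a row) produces the full correlation Matrix.
val data = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 20.0, 200.0),
  Vectors.dense(5.0, 33.0, 366.0)))
val correlMatrix: Matrix = Statistics.corr(data, "pearson")
{% endhighlight %}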
From a4eb28d07a99e559132160f8ae9ac993d47d8fa3 Mon Sep 17 00:00:00 2001
From: Xin Ren
Date: Thu, 17 Mar 2016 12:21:18 -0700
Subject: [PATCH 24/26] [SPARK-13019] use asList() for concise code

---
 .../mllib/JavaCorrelationsExample.java        | 12 ++--
 .../mllib/JavaHypothesisTestingExample.java   | 20 +++----
 .../JavaKernelDensityEstimationExample.java   |  2 +-
 .../mllib/JavaStratifiedSamplingExample.java  | 58 +++++++++----------
 .../mllib/JavaSummaryStatisticsExample.java   | 12 ++--
 .../main/python/mllib/correlations_example.py |  7 +--
 .../mllib/hypothesis_testing_example.py       | 26 +++++----
 ...testing_kolmogorov_smirnov_test_example.py |  2 +-
 .../mllib/summary_statistics_example.py       |  7 +--
 .../mllib/HypothesisTestingExample.scala      | 26 +++++----
 .../mllib/StratifiedSamplingExample.scala     | 11 ++--
 .../mllib/SummaryStatisticsExample.scala      | 12 ++--

diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java
index c27c1d01bab58..fd19b43504ac1 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java
@@ -49,12 +49,14 @@ public static void main(String[] args) {
Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
System.out.println("Correlation is: " + correlation);

-Vector v1 = Vectors.dense(1.0, 10.0, 100.0);
-Vector v2 = Vectors.dense(2.0, 20.0, 200.0);
-Vector v3 = Vectors.dense(5.0, 33.0, 366.0);
// note that each Vector is a row and not a column
-JavaRDD<Vector> data = jsc.parallelize(Arrays.asList(v1, v2, v3));
+JavaRDD<Vector> data = jsc.parallelize(
+  Arrays.asList(
+    Vectors.dense(1.0, 10.0, 100.0),
+    Vectors.dense(2.0, 20.0, 200.0),
+    Vectors.dense(5.0, 33.0, 366.0)
+  )
+);

// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
// If a method is not specified, Pearson's method will be used by default.
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java
index 0960b07a98557..b48b95ff1d2a3 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java
@@ -48,8 +48,7 @@ public static void main(String[] args) {
ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
// summary of the test including the p-value, degrees of freedom, test statistic,
// the method used, and the null hypothesis.
-System.out.println(goodnessOfFitTestResult);
-System.out.println();
+System.out.println(goodnessOfFitTestResult + "\n");

// Create a contingency matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0});

// conduct Pearson's independence test on the input contingency matrix
ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
// summary of the test including the p-value, degrees of freedom...
-System.out.println(independenceTestResult);
-System.out.println();
+System.out.println(independenceTestResult + "\n");

-LabeledPoint p1 = new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0));
-LabeledPoint p2 = new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0));
-LabeledPoint p3 = new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5));
// an RDD of labeled points
-JavaRDD<LabeledPoint> obs = jsc.parallelize(Arrays.asList(p1, p2, p3));
+JavaRDD<LabeledPoint> obs = jsc.parallelize(
+  Arrays.asList(
+    new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
+    new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
+    new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
+  )
+);

// The contingency table is constructed from the raw (feature, label) pairs and used to conduct
// the independence test. Returns an array containing the ChiSquaredTestResult for every feature
@@ -73,8 +74,7 @@ public static void main(String[] args) {
int i = 1;
for (ChiSqTestResult result : featureTestResults) {
  System.out.println("Column " + i + ":");
-  System.out.println(result); // summary of the test
-  System.out.println();
+  System.out.println(result + "\n"); // summary of the test
  i++;
}
// $example off$
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java
index 2f25b7534164a..41de0d90eccd7 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java
@@ -43,9 +43,9 @@ public static void main(String[] args) {
// Find density estimates for the given values
double[] densities = kd.estimate(new double[]{-1.0, 2.0, 5.0});
-// $example off$

System.out.println(Arrays.toString(densities));
+// $example off$

jsc.stop();
}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java
index 0fa051783a014..f5a451019bd21 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java
@@ -17,14 +17,12 @@

package org.apache.spark.examples.mllib;

+import com.google.common.collect.ImmutableMap;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;

import scala.Tuple2;

@@ -39,42 +37,38 @@ public static void main(String[] args) {
JavaSparkContext jsc = new JavaSparkContext(conf);

// $example on$
-List<Tuple2<Integer, Character>> list = new ArrayList<>();
-list.add(new Tuple2<Integer, Character>(1, 'a'));
-list.add(new Tuple2<Integer, Character>(1, 'b'));
-list.add(new Tuple2<Integer, Character>(2, 'c'));
-list.add(new Tuple2<Integer, Character>(2, 'd'));
-list.add(new Tuple2<Integer, Character>(2, 'e'));
-list.add(new Tuple2<Integer, Character>(3, 'f'));
+List<Tuple2<Integer, Character>> list = new ArrayList<Tuple2<Integer, Character>>(
+  Arrays.<Tuple2<Integer, Character>>asList(
+    new Tuple2<Integer, Character>(1, 'a'),
+    new Tuple2<Integer, Character>(1, 'b'),
+    new Tuple2<Integer, Character>(2, 'c'),
+    new Tuple2<Integer, Character>(2, 'd'),
+    new Tuple2<Integer, Character>(2, 'e'),
+    new Tuple2<Integer, Character>(3, 'f')
+  )
+);

-// an RDD of any key value pairs JavaPairRDD<Integer, Character>
JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(list);

// specify the exact fraction desired from each key Map<K, Object>
-Map<Integer, Object> fractions = new HashMap<>();
-
-fractions.put(1, 0.1);
-fractions.put(2, 0.6);
-fractions.put(3, 0.3);
+ImmutableMap<Integer, Object> fractions =
+  ImmutableMap.of(1, (Object) 0.1, 2, (Object) 0.6, 3, (Object) 0.3);

+// Get an approximate sample from each stratum
+JavaPairRDD<Integer, Character> approxSample = data.sampleByKey(false, fractions);
// Get an exact sample from each stratum
-JavaPairRDD<Integer, Character> approxSample =
-  data.sampleByKey(false, fractions); // JavaPairRDD<Integer, Character>
-JavaPairRDD<Integer, Character> exactSample =
-  data.sampleByKeyExact(false, fractions); // JavaPairRDD<Integer, Character>
+JavaPairRDD<Integer, Character> exactSample = data.sampleByKeyExact(false, fractions);
// $example off$

-approxSample.foreach(new VoidFunction<Tuple2<Integer, Character>>() {
-  public void call(Tuple2<Integer, Character> t) throws Exception {
-    System.out.println(t._1() + " " + t._2());
-  }
-});
-System.out.println();
-exactSample.foreach(new VoidFunction<Tuple2<Integer, Character>>() {
-  public void call(Tuple2<Integer, Character> t) throws Exception {
-    System.out.println(t._1() + " " + t._2());
-  }
-});
+System.out.println("approxSample size is " + approxSample.collect().size());
+for (Tuple2<Integer, Character> t : approxSample.collect()) {
+  System.out.println(t._1() + " " + t._2());
+}
+
+System.out.println("exactSample size is " + exactSample.collect().size());
+for (Tuple2<Integer, Character> t : exactSample.collect()) {
+  System.out.println(t._1() + " " + t._2());
+}

jsc.stop();
}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java
index aae06679d3e2d..278706bc8f6ed 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java
@@ -36,11 +36,13 @@ public static void main(String[] args) {
JavaSparkContext jsc = new JavaSparkContext(conf);

// $example on$
-Vector v1 = Vectors.dense(1.0, 10.0, 100.0);
-Vector v2 = Vectors.dense(2.0, 20.0, 200.0);
-Vector v3 = Vectors.dense(3.0, 30.0, 300.0);
-
-JavaRDD<Vector> mat = jsc.parallelize(Arrays.asList(v1, v2, v3)); // an RDD of Vectors
+JavaRDD<Vector> mat = jsc.parallelize(
+  Arrays.asList(
+    Vectors.dense(1.0, 10.0, 100.0),
+    Vectors.dense(2.0, 20.0, 200.0),
+    Vectors.dense(3.0, 30.0, 300.0)
+  )
+); // an RDD of Vectors

// Compute column summary statistics.
MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
diff --git a/examples/src/main/python/mllib/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py
index 2163d08934d5c..66d18f6e5df17 100644
--- a/examples/src/main/python/mllib/correlations_example.py
+++ b/examples/src/main/python/mllib/correlations_example.py
@@ -36,10 +36,9 @@
# If a method is not specified, Pearson's method will be used by default.
print("Correlation is: " + str(Statistics.corr(seriesX, seriesY, method="pearson")))

-v1 = np.array([1.0, 10.0, 100.0])
-v2 = np.array([2.0, 20.0, 200.0])
-v3 = np.array([5.0, 33.0, 366.0])
-data = sc.parallelize([v1, v2, v3])  # an RDD of Vectors
+data = sc.parallelize(
+    [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([5.0, 33.0, 366.0])]
+)  # an RDD of Vectors

# calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
# If a method is not specified, Pearson's method will be used by default.
diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py
index 5e2521ae93e28..e566ead0d318d 100644
--- a/examples/src/main/python/mllib/hypothesis_testing_example.py
+++ b/examples/src/main/python/mllib/hypothesis_testing_example.py
@@ -25,7 +25,7 @@
# $example off$

if __name__ == "__main__":
-    sc = SparkContext(appName="HypothesisTestingExample")  # SparkContext
+    sc = SparkContext(appName="HypothesisTestingExample")

    # $example on$
    vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)  # a vector composed of the frequencies of events

@@ -33,21 +33,25 @@
    # compute the goodness of fit. If a second vector to test against
    # is not supplied as a parameter, the test runs against a uniform distribution.
    goodnessOfFitTestResult = Statistics.chiSqTest(vec)
-    print(goodnessOfFitTestResult)  # summary of the test including the p-value, degrees of freedom,
+
+    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
-    print()
+    print("%s\n" % goodnessOfFitTestResult)

    mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])  # a contingency matrix

    # conduct Pearson's independence test on the input contingency matrix
    independenceTestResult = Statistics.chiSqTest(mat)
-    print(independenceTestResult)  # summary of the test including the p-value, degrees of freedom
-    print()

-    p1 = LabeledPoint(1.0, [1.0, 0.0, 3.0])
-    p2 = LabeledPoint(1.0, [1.0, 2.0, 0.0])
-    p3 = LabeledPoint(1.0, [-1.0, 0.0, -0.5])
-    obs = sc.parallelize([p1, p2, p3])  # LabeledPoint(feature, label)
+    # summary of the test including the p-value, degrees of freedom,
+    # test statistic, the method used, and the null hypothesis.
+    print("%s\n" % independenceTestResult)
+
+    obs = sc.parallelize(
+        [LabeledPoint(1.0, [1.0, 0.0, 3.0]),
+         LabeledPoint(1.0, [1.0, 2.0, 0.0]),
+         LabeledPoint(1.0, [-1.0, 0.0, -0.5])]
+    )  # LabeledPoint(label, feature)

    # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
    # the independence test. Returns an array containing the ChiSquaredTestResult for every feature
@@ -55,9 +59,7 @@
    featureTestResults = Statistics.chiSqTest(obs)

    for i, result in enumerate(featureTestResults):
-        print("Column: " + str(i + 1))
-        print(result)
-        print()
+        print("Column %d:\n%s" % (i + 1, result))
    # $example off$

    sc.stop()
diff --git a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
index 15d63ef86b2e7..ef380dee79d3d 100644
--- a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
+++ b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
@@ -23,7 +23,7 @@
# $example off$

if __name__ == "__main__":
-    sc = SparkContext(appName="HypothesisTestingKolmogorovSmirnovTestExample")  # SparkContext
+    sc = SparkContext(appName="HypothesisTestingKolmogorovSmirnovTestExample")

    # $example on$
    parallelData = sc.parallelize([0.1, 0.15, 0.2, 0.3, 0.25])
diff --git a/examples/src/main/python/mllib/summary_statistics_example.py b/examples/src/main/python/mllib/summary_statistics_example.py
index 0b9d9ddbc9122..d55d1a2c2d0e1 100644
--- a/examples/src/main/python/mllib/summary_statistics_example.py
+++ b/examples/src/main/python/mllib/summary_statistics_example.py
@@ -28,10 +28,9 @@
    sc = SparkContext(appName="SummaryStatisticsExample")  # SparkContext

    # $example on$
-    v1 = np.array([1.0, 10.0, 100.0])
-    v2 = np.array([2.0, 20.0, 200.0])
-    v3 = np.array([3.0, 30.0, 300.0])
-    mat = sc.parallelize([v1, v2, v3])  # an RDD of Vectors
+    mat = sc.parallelize(
+        [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([3.0, 30.0, 300.0])]
+    )  # an RDD of Vectors

    # Compute column summary statistics.
    summary = Statistics.colStats(mat)
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala
index 1b548eedaaf74..0d391a3637c07 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala
@@ -43,8 +43,7 @@ object HypothesisTestingExample {
val goodnessOfFitTestResult = Statistics.chiSqTest(vec)
// summary of the test including the p-value, degrees of freedom, test statistic, the method
// used, and the null hypothesis.
-println(goodnessOfFitTestResult)
-println()
+println(s"$goodnessOfFitTestResult\n")

// a contingency matrix. Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
val mat: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))

@@ -52,22 +51,25 @@ object HypothesisTestingExample {
// conduct Pearson's independence test on the input contingency matrix
val independenceTestResult = Statistics.chiSqTest(mat)
// summary of the test including the p-value, degrees of freedom
-println(independenceTestResult)
-println()
+println(s"$independenceTestResult\n")

-val p1 = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0))
-val p2 = LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0))
-val p3 = LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
-val obs: RDD[LabeledPoint] = sc.parallelize(Seq(p1, p2, p3))  // (feature, label) pairs.
+val obs: RDD[LabeledPoint] =
+  sc.parallelize(
+    Seq(
+      LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
+      LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
+      LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
+    )
+  ) // (feature, label) pairs.

// The contingency table is constructed from the raw (feature, label) pairs and used to conduct
// the independence test. Returns an array containing the ChiSquaredTestResult for every feature
// against the label.
val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs)

-featureTestResults.zipWithIndex.foreach { result =>
-  println(s"Column " + (result._2 + 1).toString + ":")
-  println(result._1)
-  println()
+featureTestResults.zipWithIndex.foreach { case (result, i) =>
+  println("Column " + (i + 1).toString + ":")
+  println(result)
} // summary of the test
// $example off$
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala
index f0084dada2240..169467926ce46 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala
@@ -35,14 +35,17 @@ object StratifiedSamplingExample {
// specify the exact fraction desired from each key
val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3)

-// Get an exact sample from each stratum
+// Get an approximate sample from each stratum
val approxSample = data.sampleByKey(withReplacement = false, fractions)
+// Get an exact sample from each stratum
val exactSample = data.sampleByKeyExact(withReplacement = false, fractions)
// $example off$

-approxSample.foreach(println)
-println()
-exactSample.foreach(println)
+println("approxSample size is " + approxSample.collect().size.toString)
+approxSample.collect().foreach(println)
+
+println("exactSample size is " + exactSample.collect().size.toString)
+exactSample.collect().foreach(println)

sc.stop()
}
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala
index 473b6789fd375..948b443c0a754 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala
@@ -32,11 +32,13 @@ object SummaryStatisticsExample {
val sc = new SparkContext(conf)

// $example on$
-val v1 = Vectors.dense(1.0, 10.0, 100.0)
-val v2 = Vectors.dense(2.0, 20.0, 200.0)
-val v3 = Vectors.dense(3.0, 30.0, 300.0)
-
-val observations = sc.parallelize(Seq(v1, v2, v3))
+val observations = sc.parallelize(
+  Seq(
+    Vectors.dense(1.0, 10.0, 100.0),
+    Vectors.dense(2.0, 20.0, 200.0),
+    Vectors.dense(3.0, 30.0, 300.0)
+  )
+)

// Compute column summary statistics.
val summary: MultivariateStatisticalSummary = Statistics.colStats(observations) From 892fe600e48b49b26a29120c99d171db02c659ab Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Mon, 21 Mar 2016 18:19:29 -0700 Subject: [PATCH 25/26] [SPARK-13019] fix arguments passing for 2.10 --- .../spark/examples/mllib/StratifiedSamplingExample.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala index 169467926ce46..e100cf09c6391 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala @@ -19,6 +19,7 @@ package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.util.Utils object StratifiedSamplingExample { @@ -34,11 +35,12 @@ object StratifiedSamplingExample { // specify the exact fraction desired from each key val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3) + val seed = Utils.random.nextLong // Get an approximate sample from each stratum - val approxSample = data.sampleByKey(withReplacement = false, fractions) + val approxSample = data.sampleByKey(withReplacement = false, fractions = fractions, seed) // Get an exact sample from each stratum - val exactSample = data.sampleByKeyExact(withReplacement = false, fractions) + val exactSample = data.sampleByKeyExact(withReplacement = false, fractions = fractions, seed) // $example off$ println("approxSample size is " + approxSample.collect().size.toString) From ceebd3600efc7d6b509c4dae9c08e13890574fec Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Tue, 22 Mar 2016 14:49:40 -0700 Subject: [PATCH 26/26] [SPARK-13019] remove variable 'seed' --- .../spark/examples/mllib/StratifiedSamplingExample.scala | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala index e100cf09c6391..16b074ef60699 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala @@ -19,7 +19,6 @@ package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.util.Utils object StratifiedSamplingExample { @@ -35,12 +34,11 @@ object StratifiedSamplingExample { // specify the exact fraction desired from each key val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3) - val seed = Utils.random.nextLong // Get an approximate sample from each stratum - val approxSample = data.sampleByKey(withReplacement = false, fractions = fractions, seed) + val approxSample = data.sampleByKey(withReplacement = false, fractions = fractions) // Get an exact sample from each stratum - val exactSample = data.sampleByKeyExact(withReplacement = false, fractions = fractions, seed) + val exactSample = data.sampleByKeyExact(withReplacement = false, fractions = fractions) // $example off$ println("approxSample size is " + approxSample.collect().size.toString)