From 49b7012e7b67d3fc7db031d0604d76d4150034eb Mon Sep 17 00:00:00 2001
From: Xin Ren
Date: Mon, 1 Feb 2016 17:51:25 -0800
Subject: [PATCH 01/26] [SPARK-13019] replace example code for summary
 statistics, scala code

---
 docs/mllib-statistics.md                      | 14 +----
 .../ml/JavaSummaryStatisticsExample.java      |  7 +++
 .../ml/SummaryStatisticsExample.scala         | 51 +++++++++++++++++++
 3 files changed, 59 insertions(+), 13 deletions(-)
 create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java
 create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala

diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md
index 652d215fa8653..93c5204e63304 100644
--- a/docs/mllib-statistics.md
+++ b/docs/mllib-statistics.md
@@ -40,19 +40,7 @@ total count.
 
 Refer to the [`MultivariateStatisticalSummary` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary) for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.mllib.linalg.Vector
-import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
-
-val observations: RDD[Vector] = ... // an RDD of Vectors
-
-// Compute column summary statistics.
-val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
-println(summary.mean) // a dense vector containing the mean value for each column
-println(summary.variance) // column-wise variance
-println(summary.numNonzeros) // number of nonzeros in each column
-
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala %}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java new file mode 100644 index 0000000000000..8c8e3ab0ef143 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java @@ -0,0 +1,7 @@ +package org.apache.spark.examples.ml; + +/** + * Created by quickmobile on 16-02-01. + */ +public class JavaSummaryStatisticsExample { +} diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala new file mode 100644 index 0000000000000..10781257ee4d6 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkConf, SparkContext} + +object SummaryStatisticsExample { + + def main(args: Array[String]) { + + val conf = new SparkConf().setAppName("MultivariateStatisticalSummaryExample") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + + // $example on$ + val v1 = Vectors.dense(1.0, 10.0, 100.0) + val v2 = Vectors.dense(2.0, 20.0, 200.0) + val v3 = Vectors.dense(3.0, 30.0, 300.0) + + val observations = sc.parallelize(Seq(v1, v2, v3)) + + // Compute column summary statistics. 
+ val summary: MultivariateStatisticalSummary = Statistics.colStats(observations) + println(summary.mean) // a dense vector containing the mean value for each column + println(summary.variance) // column-wise variance + println(summary.numNonzeros) // number of nonzeros in each column + // $example off$ + + sc.stop() + } +} +// scalastyle:on println From 83592bcafa553bf9439da7db72552868b3ed967a Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Mon, 1 Feb 2016 21:01:20 -0800 Subject: [PATCH 02/26] [SPARK-13019] test out on/off, for import part --- .../org/apache/spark/examples/ml/SummaryStatisticsExample.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala index 10781257ee4d6..68b9c19914897 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala @@ -18,8 +18,10 @@ // scalastyle:off println package org.apache.spark.examples.ml +// $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} +// $example off$ import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} From 069341b3d22c147fcf84e338e9b1b7d1f9fdae8e Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Tue, 2 Feb 2016 17:33:33 -0800 Subject: [PATCH 03/26] [SPARK-13019] create separate example files, but cannot compile yet --- docs/mllib-statistics.md | 390 +----------------- .../examples/ml/JavaCorrelationsExample.java | 64 +++ .../ml/JavaHypothesisTestingExample.java | 79 ++++ ...isTestingKolmogorovSmirnovTestExample.java | 59 +++ .../JavaKernelDensityEstimationExample.java | 59 +++ .../ml/JavaRandomDataGenerationExample.java | 61 +++ .../ml/JavaStratifiedSamplingExample.java | 52 +++ .../ml/JavaSummaryStatisticsExample.java | 56 ++- .../main/python/ml/correlations_example.py | 50 +++ .../python/ml/hypothesis_testing_example.py | 66 +++ ...testing_kolmogorov_smirnov_test_example.py | 48 +++ .../ml/kernel_density_estimation_example.py | 48 +++ .../ml/random_data_generation_example.py | 43 ++ .../python/ml/stratified_sampling_example.py | 39 ++ .../python/ml/summary_statistics_example.py | 45 ++ .../examples/ml/CorrelationsExample.scala | 58 +++ .../ml/HypothesisTestingExample.scala | 74 ++++ ...sTestingKolmogorovSmirnovTestExample.scala | 61 +++ .../ml/KernelDensityEstimationExample.scala | 56 +++ .../ml/RandomDataGenerationExample.scala | 52 +++ .../ml/StratifiedSamplingExample.scala | 51 +++ .../ml/SummaryStatisticsExample.scala | 2 +- 22 files changed, 1139 insertions(+), 374 deletions(-) create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaCorrelationsExample.java create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingExample.java create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingKolmogorovSmirnovTestExample.java create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaKernelDensityEstimationExample.java create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaRandomDataGenerationExample.java create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaStratifiedSamplingExample.java create mode 100644 examples/src/main/python/ml/correlations_example.py create mode 100644 
examples/src/main/python/ml/hypothesis_testing_example.py create mode 100644 examples/src/main/python/ml/hypothesis_testing_kolmogorov_smirnov_test_example.py create mode 100644 examples/src/main/python/ml/kernel_density_estimation_example.py create mode 100644 examples/src/main/python/ml/random_data_generation_example.py create mode 100644 examples/src/main/python/ml/stratified_sampling_example.py create mode 100644 examples/src/main/python/ml/summary_statistics_example.py create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/CorrelationsExample.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingExample.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingKolmogorovSmirnovTestExample.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/KernelDensityEstimationExample.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/RandomDataGenerationExample.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/StratifiedSamplingExample.scala diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md index 93c5204e63304..487ae12f3b6de 100644 --- a/docs/mllib-statistics.md +++ b/docs/mllib-statistics.md @@ -52,24 +52,7 @@ total count. Refer to the [`MultivariateStatisticalSummary` Java docs](api/java/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.html) for details on the API. -{% highlight java %} -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.stat.MultivariateStatisticalSummary; -import org.apache.spark.mllib.stat.Statistics; - -JavaSparkContext jsc = ... - -JavaRDD mat = ... // an RDD of Vectors - -// Compute column summary statistics. -MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd()); -System.out.println(summary.mean()); // a dense vector containing the mean value for each column -System.out.println(summary.variance()); // column-wise variance -System.out.println(summary.numNonzeros()); // number of nonzeros in each column - -{% endhighlight %} +{% include_example java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java %}
@@ -80,20 +63,7 @@ total count. Refer to the [`MultivariateStatisticalSummary` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.MultivariateStatisticalSummary) for more details on the API. -{% highlight python %} -from pyspark.mllib.stat import Statistics - -sc = ... # SparkContext - -mat = ... # an RDD of Vectors - -# Compute column summary statistics. -summary = Statistics.colStats(mat) -print(summary.mean()) -print(summary.variance()) -print(summary.numNonzeros()) - -{% endhighlight %} +{% include_example python/ml/summary_statistics_example.py %}
@@ -112,27 +82,7 @@ an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` resp Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API. -{% highlight scala %} -import org.apache.spark.SparkContext -import org.apache.spark.mllib.linalg._ -import org.apache.spark.mllib.stat.Statistics - -val sc: SparkContext = ... - -val seriesX: RDD[Double] = ... // a series -val seriesY: RDD[Double] = ... // must have the same number of partitions and cardinality as seriesX - -// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a -// method is not specified, Pearson's method will be used by default. -val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson") - -val data: RDD[Vector] = ... // note that each Vector is a row and not a column - -// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. -// If a method is not specified, Pearson's method will be used by default. -val correlMatrix: Matrix = Statistics.corr(data, "pearson") - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/ml/CorrelationsExample.scala %}
@@ -142,28 +92,7 @@ a `JavaRDD`, the output will be a `Double` or the correlation `Matrix` r Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API. -{% highlight java %} -import org.apache.spark.api.java.JavaDoubleRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.linalg.*; -import org.apache.spark.mllib.stat.Statistics; - -JavaSparkContext jsc = ... - -JavaDoubleRDD seriesX = ... // a series -JavaDoubleRDD seriesY = ... // must have the same number of partitions and cardinality as seriesX - -// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a -// method is not specified, Pearson's method will be used by default. -Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson"); - -JavaRDD data = ... // note that each Vector is a row and not a column - -// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. -// If a method is not specified, Pearson's method will be used by default. -Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson"); - -{% endhighlight %} +{% include_example java/org/apache/spark/examples/ml/JavaCorrelationsExample.java %}
@@ -173,24 +102,7 @@ an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` resp Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API. -{% highlight python %} -from pyspark.mllib.stat import Statistics - -sc = ... # SparkContext - -seriesX = ... # a series -seriesY = ... # must have the same number of partitions and cardinality as seriesX - -# Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a -# method is not specified, Pearson's method will be used by default. -print(Statistics.corr(seriesX, seriesY, method="pearson")) - -data = ... # an RDD of Vectors -# calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. -# If a method is not specified, Pearson's method will be used by default. -print(Statistics.corr(data, method="pearson")) - -{% endhighlight %} +{% include_example python/ml/correlations_example.py %}
@@ -216,21 +128,7 @@ fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample size, whereas sampling with replacement requires two additional passes. -{% highlight scala %} -import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ -import org.apache.spark.rdd.PairRDDFunctions - -val sc: SparkContext = ... - -val data = ... // an RDD[(K, V)] of any key value pairs -val fractions: Map[K, Double] = ... // specify the exact fraction desired from each key - -// Get an exact sample from each stratum -val approxSample = data.sampleByKey(withReplacement = false, fractions) -val exactSample = data.sampleByKeyExact(withReplacement = false, fractions) - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/ml/StratifiedSamplingExample.scala %}
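The new `StratifiedSamplingExample.scala` is created by this patch but its body does not appear in these hunks. A minimal sketch of what it could contain, following the removed inline snippet — the keys, values, and per-key fractions below are illustrative — is:

{% highlight scala %}
import org.apache.spark.{SparkConf, SparkContext}

object StratifiedSamplingExample {

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("StratifiedSamplingExample"))

    // an RDD[(K, V)] of any key value pairs (illustrative data)
    val data = sc.parallelize(Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')))
    // specify the exact fraction desired from each key
    val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3)

    // Get an approximate sample from each stratum (one pass over the RDD)...
    val approxSample = data.sampleByKey(withReplacement = false, fractions)
    // ...and an exact sample (additional passes to guarantee the stratum sizes)
    val exactSample = data.sampleByKeyExact(withReplacement = false, fractions)

    println(s"approxSample: ${approxSample.collect().mkString(" ")}")
    println(s"exactSample: ${exactSample.collect().mkString(" ")}")

    sc.stop()
  }
}
{% endhighlight %}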
@@ -240,22 +138,7 @@ fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample size, whereas sampling with replacement requires two additional passes. -{% highlight java %} -import java.util.Map; - -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaSparkContext; - -JavaSparkContext jsc = ... - -JavaPairRDD data = ... // an RDD of any key value pairs -Map fractions = ... // specify the exact fraction desired from each key - -// Get an exact sample from each stratum -JavaPairRDD approxSample = data.sampleByKey(false, fractions); -JavaPairRDD exactSample = data.sampleByKeyExact(false, fractions); - -{% endhighlight %} +{% include_example java/org/apache/spark/examples/ml/JavaStratifiedSamplingExample.java %}
[`sampleByKey()`](api/python/pyspark.html#pyspark.RDD.sampleByKey) allows users to @@ -265,16 +148,7 @@ set of keys. *Note:* `sampleByKeyExact()` is currently not supported in Python. -{% highlight python %} - -sc = ... # SparkContext - -data = ... # an RDD of any key value pairs -fractions = ... # specify the exact fraction desired from each key as a dictionary - -approxSample = data.sampleByKey(False, fractions); - -{% endhighlight %} +{% include_example python/ml/stratified_sampling_example.py %}
@@ -296,41 +170,7 @@ independence tests. run Pearson's chi-squared tests. The following example demonstrates how to run and interpret hypothesis tests. -{% highlight scala %} -import org.apache.spark.SparkContext -import org.apache.spark.mllib.linalg._ -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.stat.Statistics._ - -val sc: SparkContext = ... - -val vec: Vector = ... // a vector composed of the frequencies of events - -// compute the goodness of fit. If a second vector to test against is not supplied as a parameter, -// the test runs against a uniform distribution. -val goodnessOfFitTestResult = Statistics.chiSqTest(vec) -println(goodnessOfFitTestResult) // summary of the test including the p-value, degrees of freedom, - // test statistic, the method used, and the null hypothesis. - -val mat: Matrix = ... // a contingency matrix - -// conduct Pearson's independence test on the input contingency matrix -val independenceTestResult = Statistics.chiSqTest(mat) -println(independenceTestResult) // summary of the test including the p-value, degrees of freedom... - -val obs: RDD[LabeledPoint] = ... // (feature, label) pairs. - -// The contingency table is constructed from the raw (feature, label) pairs and used to conduct -// the independence test. Returns an array containing the ChiSquaredTestResult for every feature -// against the label. -val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs) -var i = 1 -featureTestResults.foreach { result => - println(s"Column $i:\n$result") - i += 1 -} // summary of the test - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/ml/HypothesisTestingExample.scala %}
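`HypothesisTestingExample.scala` is likewise not shown in these hunks; a runnable sketch along the lines of the removed snippet — the event frequencies, contingency matrix, and labeled points below are illustrative — might look like:

{% highlight scala %}
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.{SparkConf, SparkContext}

object HypothesisTestingExample {

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("HypothesisTestingExample"))

    // a vector composed of the frequencies of events (illustrative values)
    val vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)

    // compute the goodness of fit; if a second vector to test against is not supplied,
    // the test runs against a uniform distribution
    val goodnessOfFitTestResult = Statistics.chiSqTest(vec)
    println(goodnessOfFitTestResult) // p-value, degrees of freedom, test statistic, ...

    // a contingency matrix in column-major order (illustrative values)
    val mat = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))

    // conduct Pearson's independence test on the input contingency matrix
    val independenceTestResult = Statistics.chiSqTest(mat)
    println(independenceTestResult)

    // (feature, label) pairs (illustrative values)
    val obs = sc.parallelize(Seq(
      LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
      LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
      LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))))

    // one ChiSqTestResult per feature, each tested against the label
    val featureTestResults = Statistics.chiSqTest(obs)
    featureTestResults.zipWithIndex.foreach { case (result, i) =>
      println(s"Column ${i + 1}:\n$result")
    }

    sc.stop()
  }
}
{% endhighlight %}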
@@ -340,44 +180,7 @@ hypothesis tests.
 
 Refer to the [`ChiSqTestResult` Java docs](api/java/org/apache/spark/mllib/stat/test/ChiSqTestResult.html) for details on the API.
 
-{% highlight java %}
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.*;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.stat.Statistics;
-import org.apache.spark.mllib.stat.test.ChiSqTestResult;
-
-JavaSparkContext jsc = ...
-
-Vector vec = ... // a vector composed of the frequencies of events
-
-// compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
-// the test runs against a uniform distribution.
-ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
-// summary of the test including the p-value, degrees of freedom, test statistic, the method used,
-// and the null hypothesis.
-System.out.println(goodnessOfFitTestResult);
-
-Matrix mat = ... // a contingency matrix
-
-// conduct Pearson's independence test on the input contingency matrix
-ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
-// summary of the test including the p-value, degrees of freedom...
-System.out.println(independenceTestResult);
-
-JavaRDD obs = ... // an RDD of labeled points
-
-// The contingency table is constructed from the raw (feature, label) pairs and used to conduct
-// the independence test. Returns an array containing the ChiSquaredTestResult for every feature
-// against the label.
-ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());
-int i = 1;
-for (ChiSqTestResult result : featureTestResults) {
-  System.out.println("Column " + i + ":");
-  System.out.println(result); // summary of the test
-  i++;
-}
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaHypothesisTestingExample.java %}
@@ -389,39 +192,7 @@ hypothesis tests. Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API. -{% highlight python %} -from pyspark import SparkContext -from pyspark.mllib.linalg import Vectors, Matrices -from pyspark.mllib.regresssion import LabeledPoint -from pyspark.mllib.stat import Statistics - -sc = SparkContext() - -vec = Vectors.dense(...) # a vector composed of the frequencies of events - -# compute the goodness of fit. If a second vector to test against is not supplied as a parameter, -# the test runs against a uniform distribution. -goodnessOfFitTestResult = Statistics.chiSqTest(vec) -print(goodnessOfFitTestResult) # summary of the test including the p-value, degrees of freedom, - # test statistic, the method used, and the null hypothesis. - -mat = Matrices.dense(...) # a contingency matrix - -# conduct Pearson's independence test on the input contingency matrix -independenceTestResult = Statistics.chiSqTest(mat) -print(independenceTestResult) # summary of the test including the p-value, degrees of freedom... - -obs = sc.parallelize(...) # LabeledPoint(feature, label) . - -# The contingency table is constructed from an RDD of LabeledPoint and used to conduct -# the independence test. Returns an array containing the ChiSquaredTestResult for every feature -# against the label. -featureTestResults = Statistics.chiSqTest(obs) - -for i, result in enumerate(featureTestResults): - print("Column $d:" % (i + 1)) - print(result) -{% endhighlight %} +{% include_example python/ml/hypothesis_testing_example.py %} @@ -443,21 +214,7 @@ and interpret the hypothesis tests. Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.stat.Statistics - -val data: RDD[Double] = ... // an RDD of sample data - -// run a KS test for the sample versus a standard normal distribution -val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1) -println(testResult) // summary of the test including the p-value, test statistic, - // and null hypothesis - // if our p-value indicates significance, we can reject the null hypothesis - -// perform a KS test using a cumulative distribution function of our making -val myCDF: Double => Double = ... -val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF) -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/ml/HypothesisTestingKolmogorovSmirnovTestExample.scala %}
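`HypothesisTestingKolmogorovSmirnovTestExample.scala` is not shown in these hunks either; a sketch completing the removed snippet, with illustrative sample data and a simple user-defined CDF (here the CDF of the uniform distribution on [0, 1]), could be:

{% highlight scala %}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.{SparkConf, SparkContext}

object HypothesisTestingKolmogorovSmirnovTestExample {

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("KolmogorovSmirnovTestExample"))

    // an RDD of sample data (illustrative values)
    val data = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25))

    // run a KS test for the sample versus a standard normal distribution
    val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
    println(testResult) // p-value, test statistic, and null hypothesis;
                        // if the p-value indicates significance, we can reject the null hypothesis

    // perform a KS test using a cumulative distribution function of our making
    val myCDF: Double => Double = x => math.min(math.max(x, 0.0), 1.0)
    val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
    println(testResult2)

    sc.stop()
  }
}
{% endhighlight %}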
@@ -467,23 +224,7 @@ and interpret the hypothesis tests. Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API. -{% highlight java %} -import java.util.Arrays; - -import org.apache.spark.api.java.JavaDoubleRDD; -import org.apache.spark.api.java.JavaSparkContext; - -import org.apache.spark.mllib.stat.Statistics; -import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult; - -JavaSparkContext jsc = ... -JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.2, 1.0, ...)); -KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); -// summary of the test including the p-value, test statistic, -// and null hypothesis -// if our p-value indicates significance, we can reject the null hypothesis -System.out.println(testResult); -{% endhighlight %} +{% include_example java/org/apache/spark/examples/ml/JavaHypothesisTestingKolmogorovSmirnovTestExample.java %}
@@ -493,19 +234,7 @@ and interpret the hypothesis tests. Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API. -{% highlight python %} -from pyspark.mllib.stat import Statistics - -parallelData = sc.parallelize([1.0, 2.0, ... ]) - -# run a KS test for the sample versus a standard normal distribution -testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1) -print(testResult) # summary of the test including the p-value, test statistic, - # and null hypothesis - # if our p-value indicates significance, we can reject the null hypothesis -# Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with -# a lambda to calculate the CDF is not made available in the Python API -{% endhighlight %} +{% include_example python/ml/hypothesis_testing_kolmogorov_smirnov_test_example.py %}
@@ -550,18 +279,7 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`. Refer to the [`RandomRDDs` Scala docs](api/scala/index.html#org.apache.spark.mllib.random.RandomRDDs) for details on the API. -{% highlight scala %} -import org.apache.spark.SparkContext -import org.apache.spark.mllib.random.RandomRDDs._ - -val sc: SparkContext = ... - -// Generate a random double RDD that contains 1 million i.i.d. values drawn from the -// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. -val u = normalRDD(sc, 1000000L, 10) -// Apply a transform to get a random double RDD following `N(1, 4)`. -val v = u.map(x => 1.0 + 2.0 * x) -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/ml/RandomDataGenerationExample.scala %}
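`RandomDataGenerationExample.scala` is not shown in these hunks; the removed snippet already runs once wrapped in an object, so a sketch is essentially (the mean/variance printout is added for illustration):

{% highlight scala %}
import org.apache.spark.mllib.random.RandomRDDs._
import org.apache.spark.{SparkConf, SparkContext}

object RandomDataGenerationExample {

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("RandomDataGenerationExample"))

    // Generate a random double RDD that contains 1 million i.i.d. values drawn from the
    // standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
    val u = normalRDD(sc, 1000000L, 10)
    // Apply a transform to get a random double RDD following `N(1, 4)`.
    val v = u.map(x => 1.0 + 2.0 * x)

    // should be close to 1.0 and 4.0 respectively
    println(s"mean: ${v.mean()}, variance: ${v.variance()}")

    sc.stop()
  }
}
{% endhighlight %}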
@@ -572,24 +290,7 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`. Refer to the [`RandomRDDs` Java docs](api/java/org/apache/spark/mllib/random/RandomRDDs) for details on the API. -{% highlight java %} -import org.apache.spark.SparkContext; -import org.apache.spark.api.JavaDoubleRDD; -import static org.apache.spark.mllib.random.RandomRDDs.*; - -JavaSparkContext jsc = ... - -// Generate a random double RDD that contains 1 million i.i.d. values drawn from the -// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. -JavaDoubleRDD u = normalJavaRDD(jsc, 1000000L, 10); -// Apply a transform to get a random double RDD following `N(1, 4)`. -JavaDoubleRDD v = u.map( - new Function() { - public Double call(Double x) { - return 1.0 + 2.0 * x; - } - }); -{% endhighlight %} +{% include_example java/org/apache/spark/examples/ml/JavaRandomDataGenerationExample.java %}
@@ -600,17 +301,7 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`. Refer to the [`RandomRDDs` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.random.RandomRDDs) for more details on the API. -{% highlight python %} -from pyspark.mllib.random import RandomRDDs - -sc = ... # SparkContext - -# Generate a random double RDD that contains 1 million i.i.d. values drawn from the -# standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. -u = RandomRDDs.normalRDD(sc, 1000000L, 10) -# Apply a transform to get a random double RDD following `N(1, 4)`. -v = u.map(lambda x: 1.0 + 2.0 * x) -{% endhighlight %} +{% include_example python/ml/random_data_generation_example.py %}
@@ -632,21 +323,7 @@ to do so. Refer to the [`KernelDensity` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.KernelDensity) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.stat.KernelDensity -import org.apache.spark.rdd.RDD - -val data: RDD[Double] = ... // an RDD of sample data - -// Construct the density estimator with the sample data and a standard deviation for the Gaussian -// kernels -val kd = new KernelDensity() - .setSample(data) - .setBandwidth(3.0) - -// Find density estimates for the given values -val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/ml/KernelDensityEstimationExample.scala %}
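`KernelDensityEstimationExample.scala` is not shown in these hunks; a sketch completing the removed snippet with illustrative sample data:

{% highlight scala %}
import org.apache.spark.mllib.stat.KernelDensity
import org.apache.spark.{SparkConf, SparkContext}

object KernelDensityEstimationExample {

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("KernelDensityEstimationExample"))

    // an RDD of sample data (illustrative values)
    val data = sc.parallelize(Seq(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0))

    // Construct the density estimator with the sample data and a standard deviation
    // for the Gaussian kernels
    val kd = new KernelDensity()
      .setSample(data)
      .setBandwidth(3.0)

    // Find density estimates for the given values
    val densities = kd.estimate(Array(-1.0, 2.0, 5.0))
    println(densities.mkString(", "))

    sc.stop()
  }
}
{% endhighlight %}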
@@ -656,21 +333,7 @@ to do so. Refer to the [`KernelDensity` Java docs](api/java/org/apache/spark/mllib/stat/KernelDensity.html) for details on the API. -{% highlight java %} -import org.apache.spark.mllib.stat.KernelDensity; -import org.apache.spark.rdd.RDD; - -RDD data = ... // an RDD of sample data - -// Construct the density estimator with the sample data and a standard deviation for the Gaussian -// kernels -KernelDensity kd = new KernelDensity() - .setSample(data) - .setBandwidth(3.0); - -// Find density estimates for the given values -double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0}); -{% endhighlight %} +{% include_example java/org/apache/spark/examples/ml/JavaKernelDensityEstimationExample.java %}
@@ -680,20 +343,7 @@ to do so. Refer to the [`KernelDensity` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.KernelDensity) for more details on the API. -{% highlight python %} -from pyspark.mllib.stat import KernelDensity - -data = ... # an RDD of sample data - -# Construct the density estimator with the sample data and a standard deviation for the Gaussian -# kernels -kd = KernelDensity() -kd.setSample(data) -kd.setBandwidth(3.0) - -# Find density estimates for the given values -densities = kd.estimate([-1.0, 2.0, 5.0]) -{% endhighlight %} +{% include_example python/ml/kernel_density_estimation_example.py %}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCorrelationsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCorrelationsExample.java
new file mode 100644
index 0000000000000..16eded92832aa
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaCorrelationsExample.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaDoubleRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.*;
+import org.apache.spark.mllib.stat.Statistics;
+// $example off$
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.mllib.linalg.Vectors;
+import java.util.Arrays;
+
+
+public class JavaCorrelationsExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample").setMaster("local[*]");
+
+    // $example on$
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+
+    JavaDoubleRDD seriesX = jsc.parallelizeDoubles(Arrays.asList(new Double[]{1.0, 2.0, 3.0, 3.0, 5.0})); // a series
+    // seriesY must have the same number of partitions and cardinality as seriesX
+    JavaDoubleRDD seriesY = jsc.parallelizeDoubles(Arrays.asList(new Double[]{11.0, 22.0, 33.0, 33.0, 555.0}));
+
+    // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
+    // method is not specified, Pearson's method will be used by default.
+    Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
+    System.out.println("correlation is: " + correlation);
+
+    Vector v1 = Vectors.dense(1.0, 10.0, 100.0);
+    Vector v2 = Vectors.dense(2.0, 20.0, 200.0);
+    Vector v3 = Vectors.dense(5.0, 33.0, 366.0);
+    // note that each Vector is a row and not a column
+    JavaRDD<Vector> data = jsc.parallelize(Arrays.asList(v1, v2, v3));
+
+    // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
+    // If a method is not specified, Pearson's method will be used by default.
+    Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
+    System.out.println(correlMatrix.toString());
+    // $example off$
+
+    jsc.stop();
+  }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingExample.java
new file mode 100644
index 0000000000000..d87366c2bd66c
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingExample.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.*;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.stat.Statistics;
+import org.apache.spark.mllib.stat.test.ChiSqTestResult;
+// $example off$
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.mllib.linalg.Vectors;
+import java.util.Arrays;
+
+
+public class JavaHypothesisTestingExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    // a vector composed of the frequencies of events (values are illustrative)
+    Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25);
+
+    // compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
+    // the test runs against a uniform distribution.
+    ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
+    // summary of the test including the p-value, degrees of freedom, test statistic, the method used,
+    // and the null hypothesis.
+    System.out.println(goodnessOfFitTestResult);
+
+    // a contingency matrix, stored in column-major order (values are illustrative)
+    Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0});
+
+    // conduct Pearson's independence test on the input contingency matrix
+    ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
+    // summary of the test including the p-value, degrees of freedom...
+    System.out.println(independenceTestResult);
+
+    // an RDD of labeled points (values are illustrative)
+    JavaRDD<LabeledPoint> obs = jsc.parallelize(Arrays.asList(
+      new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
+      new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
+      new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))));
+
+    // The contingency table is constructed from the raw (feature, label) pairs and used to conduct
+    // the independence test. Returns an array containing the ChiSquaredTestResult for every feature
+    // against the label.
+    ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd());
+    int i = 1;
+    for (ChiSqTestResult result : featureTestResults) {
+      System.out.println("Column " + i + ":");
+      System.out.println(result); // summary of the test
+      i++;
+    }
+    // $example off$
+
+    jsc.stop();
+  }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingKolmogorovSmirnovTestExample.java
new file mode 100644
index 0000000000000..9ac0d1091e1f0
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingKolmogorovSmirnovTestExample.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaDoubleRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+
+import org.apache.spark.mllib.stat.Statistics;
+import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult;
+// $example off$
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.mllib.linalg.Vectors;
+
+
+public class JavaHypothesisTestingKolmogorovSmirnovTestExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    // sample data (values are illustrative)
+    JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.2, 1.0, 0.5, 0.3, 0.7));
+    KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0);
+    // summary of the test including the p-value, test statistic,
+    // and null hypothesis
+    // if our p-value indicates significance, we can reject the null hypothesis
+    System.out.println(testResult);
+    // $example off$
+
+    jsc.stop();
+  }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaKernelDensityEstimationExample.java
new file mode 100644
index 0000000000000..c3fc9a804f3c2
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaKernelDensityEstimationExample.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.stat.KernelDensity;
+// $example off$
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.mllib.linalg.Vectors;
+
+
+public class JavaKernelDensityEstimationExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    // an RDD of sample data (values are illustrative)
+    JavaRDD<Double> data = jsc.parallelize(
+      Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0));
+
+    // Construct the density estimator with the sample data and a standard deviation for the Gaussian
+    // kernels
+    KernelDensity kd = new KernelDensity()
+      .setSample(data)
+      .setBandwidth(3.0);
+
+    // Find density estimates for the given values
+    double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0});
+    // $example off$
+
+    jsc.stop();
+  }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomDataGenerationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomDataGenerationExample.java
new file mode 100644
index 0000000000000..c9e7e999462a5
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomDataGenerationExample.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import org.apache.spark.api.java.JavaDoubleRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.function.Function;
+import static org.apache.spark.mllib.random.RandomRDDs.*;
+// $example off$
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.mllib.linalg.Vectors;
+
+
+public class JavaRandomDataGenerationExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaRandomDataGenerationExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    // Generate a random double RDD that contains 1 million i.i.d. values drawn from the
+    // standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
+    JavaDoubleRDD u = normalJavaRDD(jsc, 1000000L, 10);
+    // Apply a transform to get a random double RDD following `N(1, 4)`.
+    JavaRDD<Double> v = u.map(
+      new Function<Double, Double>() {
+        public Double call(Double x) {
+          return 1.0 + 2.0 * x;
+        }
+      });
+    // $example off$
+
+    jsc.stop();
+  }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStratifiedSamplingExample.java
new file mode 100644
index 0000000000000..7df6afafe05f3
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStratifiedSamplingExample.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+import scala.Tuple2;
+
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+// $example off$
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.mllib.linalg.Vectors;
+
+
+public class JavaStratifiedSamplingExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    // an RDD of any key value pairs (keys and values are illustrative)
+    JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(Arrays.asList(
+      new Tuple2<Integer, Character>(1, 'a'), new Tuple2<Integer, Character>(1, 'b'),
+      new Tuple2<Integer, Character>(2, 'c'), new Tuple2<Integer, Character>(2, 'd'),
+      new Tuple2<Integer, Character>(2, 'e'), new Tuple2<Integer, Character>(3, 'f')));
+    // specify the exact fraction desired from each key
+    Map<Integer, Double> fractions = new HashMap<Integer, Double>();
+    fractions.put(1, 0.1);
+    fractions.put(2, 0.6);
+    fractions.put(3, 0.3);
+
+    // Get an exact sample from each stratum
+    JavaPairRDD<Integer, Character> approxSample = data.sampleByKey(false, fractions);
+    JavaPairRDD<Integer, Character> exactSample = data.sampleByKeyExact(false, fractions);
+    // $example off$
+
+    jsc.stop();
+  }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java
index 8c8e3ab0ef143..56822c7e96801 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java
@@ -1,7 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.spark.examples.ml;
 
-/**
- * Created by quickmobile on 16-02-01.
- */
+// $example on$
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
+import org.apache.spark.mllib.stat.Statistics;
+// $example off$
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.mllib.linalg.Vectors;
+import java.util.Arrays;
+
+
 public class JavaSummaryStatisticsExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaSummaryStatisticsExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    Vector v1 = Vectors.dense(1.0, 10.0, 100.0);
+    Vector v2 = Vectors.dense(2.0, 20.0, 200.0);
+    Vector v3 = Vectors.dense(3.0, 30.0, 300.0);
+
+    JavaRDD<Vector> mat = jsc.parallelize(Arrays.asList(v1, v2, v3)); // an RDD of Vectors
+
+    // Compute column summary statistics.
+    MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
+    System.out.println(summary.mean()); // a dense vector containing the mean value for each column
+    System.out.println(summary.variance()); // column-wise variance
+    System.out.println(summary.numNonzeros()); // number of nonzeros in each column
+    // $example off$
+
+    jsc.stop();
+  }
 }
diff --git a/examples/src/main/python/ml/correlations_example.py b/examples/src/main/python/ml/correlations_example.py
new file mode 100644
index 0000000000000..35e089c6a0e7d
--- /dev/null
+++ b/examples/src/main/python/ml/correlations_example.py
@@ -0,0 +1,50 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+import numpy as np
+from pyspark.mllib.linalg import Vectors
+# $example on$
+from pyspark.mllib.stat import Statistics
+# $example off$
+
+if __name__ == "__main__":
+    # $example on$
+    sc = SparkContext(appName="CorrelationsExample")  # SparkContext
+
+    seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])  # a series
+    # seriesY must have the same number of partitions and cardinality as seriesX
+    seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0])
+
+    # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
+    # method is not specified, Pearson's method will be used by default.
+    print(Statistics.corr(seriesX, seriesY, method="pearson"))
+
+    v1 = np.array([1.0, 10.0, 100.0])
+    v2 = np.array([2.0, 20.0, 200.0])
+    v3 = np.array([5.0, 33.0, 366.0])
+    data = sc.parallelize([v1, v2, v3])  # an RDD of Vectors
+
+    # calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
+    # If a method is not specified, Pearson's method will be used by default.
+    print(Statistics.corr(data, method="pearson"))
+
+    # $example off$
+
+    sc.stop()
\ No newline at end of file
diff --git a/examples/src/main/python/ml/hypothesis_testing_example.py b/examples/src/main/python/ml/hypothesis_testing_example.py
new file mode 100644
index 0000000000000..afbf7bc4309f8
--- /dev/null
+++ b/examples/src/main/python/ml/hypothesis_testing_example.py
@@ -0,0 +1,66 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+import numpy as np
+from pyspark.mllib.linalg import Vectors
+# $example on$
+from pyspark import SparkContext
+from pyspark.mllib.linalg import Vectors, Matrices
+from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.stat import Statistics
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="HypothesisTestingExample")  # SparkContext
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    # a vector composed of the frequencies of events (values are illustrative)
+    vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)
+
+    # compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
+    # the test runs against a uniform distribution.
+    goodnessOfFitTestResult = Statistics.chiSqTest(vec)
+    print(goodnessOfFitTestResult)  # summary of the test including the p-value, degrees of freedom,
+                                    # test statistic, the method used, and the null hypothesis.
+
+    # a contingency matrix, stored in column-major order (values are illustrative)
+    mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])
+
+    # conduct Pearson's independence test on the input contingency matrix
+    independenceTestResult = Statistics.chiSqTest(mat)
+    print(independenceTestResult)  # summary of the test including the p-value, degrees of freedom...
+
+    # an RDD of LabeledPoint(label, features) pairs (values are illustrative)
+    obs = sc.parallelize([
+        LabeledPoint(1.0, [1.0, 0.0, 3.0]),
+        LabeledPoint(1.0, [1.0, 2.0, 0.0]),
+        LabeledPoint(-1.0, [-1.0, 0.0, -0.5])])
+
+    # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
+    # the independence test. Returns an array containing the ChiSquaredTestResult for every feature
+    # against the label.
+    featureTestResults = Statistics.chiSqTest(obs)
+
+    for i, result in enumerate(featureTestResults):
+        print("Column %d:" % (i + 1))
+        print(result)
+
+    # $example off$
+
+    sc.stop()
\ No newline at end of file
diff --git a/examples/src/main/python/ml/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/ml/hypothesis_testing_kolmogorov_smirnov_test_example.py
new file mode 100644
index 0000000000000..c4ee776e32fe8
--- /dev/null
+++ b/examples/src/main/python/ml/hypothesis_testing_kolmogorov_smirnov_test_example.py
@@ -0,0 +1,48 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+import numpy as np
+from pyspark.mllib.linalg import Vectors
+# $example on$
+from pyspark.mllib.stat import Statistics
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="HypothesisTestingKolmogorovSmirnovTestExample")  # SparkContext
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    # sample data (values are illustrative)
+    parallelData = sc.parallelize([0.1, 0.15, 0.2, 0.3, 0.25])
+
+    # run a KS test for the sample versus a standard normal distribution
+    testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1)
+    print(testResult)  # summary of the test including the p-value, test statistic,
+                       # and null hypothesis
+                       # if our p-value indicates significance, we can reject the null hypothesis
+    # Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with
+    # a lambda to calculate the CDF is not made available in the Python API
+
+    # $example off$
+
+    sc.stop()
\ No newline at end of file
diff --git a/examples/src/main/python/ml/kernel_density_estimation_example.py b/examples/src/main/python/ml/kernel_density_estimation_example.py
new file mode 100644
index 0000000000000..c71b96b30a771
--- /dev/null
+++ b/examples/src/main/python/ml/kernel_density_estimation_example.py
@@ -0,0 +1,48 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+import numpy as np
+from pyspark.mllib.linalg import Vectors
+# $example on$
+from pyspark.mllib.stat import KernelDensity
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="KernelDensityEstimationExample")  # SparkContext
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    # an RDD of sample data (values are illustrative)
+    data = sc.parallelize([1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0])
+
+    # Construct the density estimator with the sample data and a standard deviation for the Gaussian
+    # kernels
+    kd = KernelDensity()
+    kd.setSample(data)
+    kd.setBandwidth(3.0)
+
+    # Find density estimates for the given values
+    densities = kd.estimate([-1.0, 2.0, 5.0])
+    # $example off$
+
+    sc.stop()
\ No newline at end of file
diff --git a/examples/src/main/python/ml/random_data_generation_example.py b/examples/src/main/python/ml/random_data_generation_example.py
new file mode 100644
index 0000000000000..d42e33d464aba
--- /dev/null
+++ b/examples/src/main/python/ml/random_data_generation_example.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +from __future__ import print_function + +from pyspark import SparkContext +from pyspark.sql import SQLContext +import numpy as np +from pyspark.mllib.linalg import Vectors +# $example on$ +from pyspark.mllib.random import RandomRDDs +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="RandomDataGenerationExample") # SparkContext + sqlContext = SQLContext(sc) + + # $example on$ + + # @note: todo + + # Generate a random double RDD that contains 1 million i.i.d. values drawn from the + # standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. + u = RandomRDDs.normalRDD(sc, 1000000L, 10) + # Apply a transform to get a random double RDD following `N(1, 4)`. + v = u.map(lambda x: 1.0 + 2.0 * x) + # $example off$ + + sc.stop() \ No newline at end of file diff --git a/examples/src/main/python/ml/stratified_sampling_example.py b/examples/src/main/python/ml/stratified_sampling_example.py new file mode 100644 index 0000000000000..0f6ede7335a85 --- /dev/null +++ b/examples/src/main/python/ml/stratified_sampling_example.py @@ -0,0 +1,39 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +from pyspark import SparkContext +from pyspark.sql import SQLContext +import numpy as np +from pyspark.mllib.linalg import Vectors +# $example on$ +from pyspark.mllib.stat import Statistics +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="StratifiedSamplingExample") # SparkContext + sqlContext = SQLContext(sc) + + # $example on$ + # data = ... # an RDD of any key value pairs + # fractions = ... # specify the exact fraction desired from each key as a dictionary + # + # approxSample = data.sampleByKey(False, fractions); + # $example off$ + + sc.stop() \ No newline at end of file diff --git a/examples/src/main/python/ml/summary_statistics_example.py b/examples/src/main/python/ml/summary_statistics_example.py new file mode 100644 index 0000000000000..fef018127451d --- /dev/null +++ b/examples/src/main/python/ml/summary_statistics_example.py @@ -0,0 +1,45 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+import numpy as np
+from pyspark.mllib.linalg import Vectors
+# $example on$
+from pyspark.mllib.stat import Statistics
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="SummaryStatisticsExample")  # SparkContext
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    v1 = np.array([1.0, 2.0, 3.0])
+    v2 = np.array([10.0, 20.0, 30.0])
+    v3 = np.array([100.0, 200.0, 300.0])
+    mat = sc.parallelize([v1, v2, v3])  # an RDD of Vectors
+
+    # Compute column summary statistics.
+    summary = Statistics.colStats(mat)
+    print(summary.mean())  # a dense vector containing the mean value for each column
+    print(summary.variance())  # column-wise variance
+    print(summary.numNonzeros())  # number of nonzeros in each column
+    # $example off$
+
+    sc.stop()
\ No newline at end of file
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationsExample.scala
new file mode 100644
index 0000000000000..179b87e21f9c9
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationsExample.scala
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.mllib.linalg._
+import org.apache.spark.mllib.stat.Statistics
+// $example off$
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object CorrelationsExample {
+
+  def main(args: Array[String]) {
+
+    val conf = new SparkConf().setAppName("CorrelationsExample").setMaster("local[*]")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    val seriesX: RDD[Double] = sc.parallelize(Array(1.0, 2.0, 3.0, 3.0, 5.0))  // a series
+    val seriesY: RDD[Double] = sc.parallelize(Array(11.0, 22.0, 33.0, 33.0, 555.0))  // must have the same number of partitions and cardinality as seriesX
+
+    // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
+    // method is not specified, Pearson's method will be used by default.
+    val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")
+    println(correlation)
+
+    val data: RDD[Vector] = sc.parallelize(Seq(Vectors.dense(1.0, 10.0, 100.0), Vectors.dense(2.0, 20.0, 200.0), Vectors.dense(5.0, 33.0, 366.0)))  // note that each Vector is a row and not a column
+
+    // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
+ // If a method is not specified, Pearson's method will be used by default. + val correlMatrix: Matrix = Statistics.corr(data, "pearson") + println(correlMatrix.toString) + // $example off$ + + sc.stop() + } +} +// scalastyle:on println + diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingExample.scala new file mode 100644 index 0000000000000..f67e291dcd2f7 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingExample.scala @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.SparkContext +import org.apache.spark.mllib.linalg._ +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.stat.Statistics._ +import org.apache.spark.rdd.RDD + +// $example off$ +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkConf, SparkContext} + +object HypothesisTestingExample { + + def main(args: Array[String]) { + + val conf = new SparkConf().setAppName("HypothesisTestingExample").setMaster("local[*]") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + + // $example on$ + // @note: todo + val vec: Vector = ... // a vector composed of the frequencies of events + + // compute the goodness of fit. If a second vector to test against is not supplied as a parameter, + // the test runs against a uniform distribution. + val goodnessOfFitTestResult = Statistics.chiSqTest(vec) + println(goodnessOfFitTestResult) // summary of the test including the p-value, degrees of freedom, + // test statistic, the method used, and the null hypothesis. + + val mat: Matrix = ... // a contingency matrix + + // conduct Pearson's independence test on the input contingency matrix + val independenceTestResult = Statistics.chiSqTest(mat) + println(independenceTestResult) // summary of the test including the p-value, degrees of freedom... + + val obs: RDD[LabeledPoint] = ... // (feature, label) pairs. + + // The contingency table is constructed from the raw (feature, label) pairs and used to conduct + // the independence test. Returns an array containing the ChiSquaredTestResult for every feature + // against the label. 
+ val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs) + var i = 1 + featureTestResults.foreach { result => + println(s"Column $i:\n$result") + i += 1 + } // summary of the test + + // $example off$ + + sc.stop() + } +} +// scalastyle:on println + diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingKolmogorovSmirnovTestExample.scala new file mode 100644 index 0000000000000..9c6e07f2242eb --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingKolmogorovSmirnovTestExample.scala @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.SparkContext +import org.apache.spark.mllib.linalg._ +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.rdd.RDD + +// $example off$ +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkConf, SparkContext} + +object HypothesisTestingKolmogorovSmirnovTestExample { + + def main(args: Array[String]) { + + val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample").setMaster("local[*]") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + + // $example on$ + // @note: todo + + val data: RDD[Double] = ... // an RDD of sample data + + // run a KS test for the sample versus a standard normal distribution + val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1) + println(testResult) // summary of the test including the p-value, test statistic, + // and null hypothesis + // if our p-value indicates significance, we can reject the null hypothesis + + // perform a KS test using a cumulative distribution function of our making + val myCDF: Double => Double = ... + val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF) + + // $example off$ + + sc.stop() + } +} +// scalastyle:on println + diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/KernelDensityEstimationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/KernelDensityEstimationExample.scala new file mode 100644 index 0000000000000..ae9ecc9d4183a --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/KernelDensityEstimationExample.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.mllib.stat.KernelDensity +import org.apache.spark.rdd.RDD +// $example off$ +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkConf, SparkContext} + +object KernelDensityEstimationExample { + + def main(args: Array[String]) { + + val conf = new SparkConf().setAppName("KernelDensityEstimationExample").setMaster("local[*]") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + + // $example on$ + + // @note: todo + + val data: RDD[Double] = ... // an RDD of sample data + + // Construct the density estimator with the sample data and a standard deviation for the Gaussian + // kernels + val kd = new KernelDensity() + .setSample(data) + .setBandwidth(3.0) + + // Find density estimates for the given values + val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) + // $example off$ + + sc.stop() + } +} +// scalastyle:on println + diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomDataGenerationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomDataGenerationExample.scala new file mode 100644 index 0000000000000..34f79a4f0d4dc --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomDataGenerationExample.scala @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.mllib.random.RandomRDDs._ + +// $example off$ +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkConf, SparkContext} + +object RandomDataGenerationExample { + + def main(args: Array[String]) { + + val conf = new SparkConf().setAppName("RandomDataGenerationExample").setMaster("local[*]") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + + // $example on$ + + // @note: todo + + // Generate a random double RDD that contains 1 million i.i.d. values drawn from the + // standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. 
+    val u = normalRDD(sc, 1000000L, 10)
+    // Apply a transform to get a random double RDD following `N(1, 4)`.
+    val v = u.map(x => 1.0 + 2.0 * x)
+
+    // $example off$
+
+    sc.stop()
+  }
+}
+// scalastyle:on println
+
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StratifiedSamplingExample.scala
new file mode 100644
index 0000000000000..7d5cf341f9d54
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/StratifiedSamplingExample.scala
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.{SparkConf, SparkContext}
+// $example off$
+import org.apache.spark.sql.SQLContext
+
+
+object StratifiedSamplingExample {
+
+  def main(args: Array[String]) {
+
+    val conf = new SparkConf().setAppName("StratifiedSamplingExample").setMaster("local[*]")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    // @note: sampleByKey and sampleByKeyExact come from org.apache.spark.rdd.PairRDDFunctions, available implicitly on RDDs of pairs
+    val data = sc.parallelize(Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')))  // an RDD[(K, V)] of any key-value pairs
+    val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3)  // specify the exact fraction desired from each key; fractions must lie in [0, 1]
+
+    // Get an exact sample from each stratum
+    val approxSample = data.sampleByKey(withReplacement = false, fractions)
+    val exactSample = data.sampleByKeyExact(withReplacement = false, fractions)
+
+    println(approxSample.toString)
+    println(exactSample.toString)
+    // $example off$
+
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala
index 68b9c19914897..eb81db6b58321 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala
@@ -29,7 +29,7 @@ object SummaryStatisticsExample {

   def main(args: Array[String]) {

-    val conf = new SparkConf().setAppName("MultivariateStatisticalSummaryExample")
+    val conf = new SparkConf().setAppName("SummaryStatisticsExample").setMaster("local[*]")
     val sc = new SparkContext(conf)
     val sqlContext = new SQLContext(sc)

From 2058b16bc3c097b204bb6c94226e1a8f85017679 Mon Sep 17 00:00:00 2001
From: Xin Ren
Date: Wed, 3 Feb 2016 15:25:59 -0800
Subject: [PATCH 04/26] [SPARK-13019] move new files into mllib folder

---
 docs/mllib-statistics.md                      | 42 +++++++++----------
 examples/src/__init__.py                      |  1 +
 examples/src/main/__init__.py                 |  1 +
.../JavaCorrelationsExample.java | 2 +- .../JavaHypothesisTestingExample.java | 2 +- ...isTestingKolmogorovSmirnovTestExample.java | 2 +- .../JavaKernelDensityEstimationExample.java | 2 +- .../JavaRandomDataGenerationExample.java | 2 +- .../JavaStratifiedSamplingExample.java | 2 +- .../JavaSummaryStatisticsExample.java | 2 +- examples/src/main/python/__init__.py | 1 + examples/src/main/python/mllib/__init__.py | 1 + .../{ml => mllib}/correlations_example.py | 0 .../hypothesis_testing_example.py | 0 ...testing_kolmogorov_smirnov_test_example.py | 0 .../kernel_density_estimation_example.py | 0 .../random_data_generation_example.py | 0 .../stratified_sampling_example.py | 0 .../summary_statistics_example.py | 0 .../{ml => mllib}/CorrelationsExample.scala | 2 +- .../HypothesisTestingExample.scala | 4 +- ...sTestingKolmogorovSmirnovTestExample.scala | 5 +-- .../KernelDensityEstimationExample.scala | 2 +- .../RandomDataGenerationExample.scala | 2 +- .../StratifiedSamplingExample.scala | 2 +- .../SummaryStatisticsExample.scala | 2 +- 26 files changed, 39 insertions(+), 40 deletions(-) create mode 100644 examples/src/__init__.py create mode 100644 examples/src/main/__init__.py rename examples/src/main/java/org/apache/spark/examples/{ml => mllib}/JavaCorrelationsExample.java (98%) rename examples/src/main/java/org/apache/spark/examples/{ml => mllib}/JavaHypothesisTestingExample.java (98%) rename examples/src/main/java/org/apache/spark/examples/{ml => mllib}/JavaHypothesisTestingKolmogorovSmirnovTestExample.java (98%) rename examples/src/main/java/org/apache/spark/examples/{ml => mllib}/JavaKernelDensityEstimationExample.java (97%) rename examples/src/main/java/org/apache/spark/examples/{ml => mllib}/JavaRandomDataGenerationExample.java (98%) rename examples/src/main/java/org/apache/spark/examples/{ml => mllib}/JavaStratifiedSamplingExample.java (97%) rename examples/src/main/java/org/apache/spark/examples/{ml => mllib}/JavaSummaryStatisticsExample.java (98%) create mode 100644 examples/src/main/python/__init__.py create mode 100644 examples/src/main/python/mllib/__init__.py rename examples/src/main/python/{ml => mllib}/correlations_example.py (100%) rename examples/src/main/python/{ml => mllib}/hypothesis_testing_example.py (100%) rename examples/src/main/python/{ml => mllib}/hypothesis_testing_kolmogorov_smirnov_test_example.py (100%) rename examples/src/main/python/{ml => mllib}/kernel_density_estimation_example.py (100%) rename examples/src/main/python/{ml => mllib}/random_data_generation_example.py (100%) rename examples/src/main/python/{ml => mllib}/stratified_sampling_example.py (100%) rename examples/src/main/python/{ml => mllib}/summary_statistics_example.py (100%) rename examples/src/main/scala/org/apache/spark/examples/{ml => mllib}/CorrelationsExample.scala (98%) rename examples/src/main/scala/org/apache/spark/examples/{ml => mllib}/HypothesisTestingExample.scala (95%) rename examples/src/main/scala/org/apache/spark/examples/{ml => mllib}/HypothesisTestingKolmogorovSmirnovTestExample.scala (92%) rename examples/src/main/scala/org/apache/spark/examples/{ml => mllib}/KernelDensityEstimationExample.scala (97%) rename examples/src/main/scala/org/apache/spark/examples/{ml => mllib}/RandomDataGenerationExample.scala (97%) rename examples/src/main/scala/org/apache/spark/examples/{ml => mllib}/StratifiedSamplingExample.scala (97%) rename examples/src/main/scala/org/apache/spark/examples/{ml => mllib}/SummaryStatisticsExample.scala (97%) diff --git a/docs/mllib-statistics.md 
b/docs/mllib-statistics.md index 487ae12f3b6de..bbbbd87a35610 100644 --- a/docs/mllib-statistics.md +++ b/docs/mllib-statistics.md @@ -40,7 +40,7 @@ total count. Refer to the [`MultivariateStatisticalSummary` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary) for details on the API. -{% include_example scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala %} +{% include_example scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala %}
@@ -52,7 +52,7 @@ total count. Refer to the [`MultivariateStatisticalSummary` Java docs](api/java/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.html) for details on the API. -{% include_example java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java %}
@@ -63,7 +63,7 @@ total count. Refer to the [`MultivariateStatisticalSummary` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.MultivariateStatisticalSummary) for more details on the API. -{% include_example python/ml/summary_statistics_example.py %} +{% include_example python/mllib/summary_statistics_example.py %}
@@ -82,7 +82,7 @@ an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` resp Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API. -{% include_example scala/org/apache/spark/examples/ml/CorrelationsExample.scala %} +{% include_example scala/org/apache/spark/examples/mllib/CorrelationsExample.scala %}
@@ -92,7 +92,7 @@ a `JavaRDD`, the output will be a `Double` or the correlation `Matrix` r Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API. -{% include_example java/org/apache/spark/examples/ml/JavaCorrelationsExample.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java %}
@@ -102,7 +102,7 @@ an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` resp Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API. -{% include_example python/ml/correlations_example.py %} +{% include_example python/mllib/correlations_example.py %}
@@ -128,7 +128,7 @@ fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample size, whereas sampling with replacement requires two additional passes. -{% include_example scala/org/apache/spark/examples/ml/StratifiedSamplingExample.scala %} +{% include_example scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala %}
@@ -138,7 +138,7 @@ fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample size, whereas sampling with replacement requires two additional passes. -{% include_example java/org/apache/spark/examples/ml/JavaStratifiedSamplingExample.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java %}
[`sampleByKey()`](api/python/pyspark.html#pyspark.RDD.sampleByKey) allows users to @@ -148,7 +148,7 @@ set of keys. *Note:* `sampleByKeyExact()` is currently not supported in Python. -{% include_example python/ml/stratified_sampling_example.py %} +{% include_example python/mllib/stratified_sampling_example.py %}
@@ -170,7 +170,7 @@ independence tests. run Pearson's chi-squared tests. The following example demonstrates how to run and interpret hypothesis tests. -{% include_example scala/org/apache/spark/examples/ml/HypothesisTestingExample.scala %} +{% include_example scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala %}
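At this stage of the series the Scala file that include points at still carries `...` placeholders (PATCH 06 below comments it out wholesale to keep the build green). For reference, a minimal runnable sketch of the chi-squared flow it is meant to show — the frequencies, contingency matrix, and labeled points are illustrative values only, not taken from the patch, and a live SparkContext `sc` is assumed:

{% highlight scala %}
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics

// a vector of event frequencies (illustrative values)
val vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)
// goodness of fit; with no second vector supplied, the test runs against the uniform distribution
val goodnessOfFitTestResult = Statistics.chiSqTest(vec)
println(goodnessOfFitTestResult)

// a 3x2 contingency matrix in column-major order (illustrative values)
val mat = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
// conduct Pearson's independence test on the contingency matrix
val independenceTestResult = Statistics.chiSqTest(mat)
println(independenceTestResult)

// labeled points (illustrative); chiSqTest returns one ChiSqTestResult per feature
val obs = sc.parallelize(Seq(
  LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
  LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
  LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))))
val featureTestResults = Statistics.chiSqTest(obs)
featureTestResults.zipWithIndex.foreach { case (result, i) =>
  println(s"Column ${i + 1}:\n$result")
}
{% endhighlight %}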
@@ -180,7 +180,7 @@ hypothesis tests. Refer to the [`ChiSqTestResult` Java docs](api/java/org/apache/spark/mllib/stat/test/ChiSqTestResult.html) for details on the API. -{% include_example java/org/apache/spark/examples/ml/JavaHypothesisTestingExample.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java %} {% endhighlight %}
@@ -192,7 +192,7 @@ hypothesis tests. Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API. -{% include_example python/ml/hypothesis_testing_example.py %} +{% include_example python/mllib/hypothesis_testing_example.py %} @@ -214,7 +214,7 @@ and interpret the hypothesis tests. Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API. -{% include_example scala/org/apache/spark/examples/ml/HypothesisTestingKolmogorovSmirnovTestExample.scala %} +{% include_example scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala %}
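The Scala Kolmogorov-Smirnov example file is likewise still a placeholder here, so a minimal sketch of what the include is expected to resolve to, assuming a live SparkContext `sc`; the sample values and the CDF lookup table are illustrative only (a `Map[Double, Double]` can serve as the user-defined CDF because `Map` is itself a `Double => Double` function):

{% highlight scala %}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25))  // illustrative sample

// run a KS test for the sample versus a standard normal distribution
val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
println(testResult)  // p-value, test statistic, and null hypothesis

// run a KS test against a user-supplied CDF, here an illustrative lookup table
// covering exactly the sample values above
val myCDF: Double => Double = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1)
val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
println(testResult2)
{% endhighlight %}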
@@ -224,7 +224,7 @@ and interpret the hypothesis tests. Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API. -{% include_example java/org/apache/spark/examples/ml/JavaHypothesisTestingKolmogorovSmirnovTestExample.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java %}
@@ -234,7 +234,7 @@ and interpret the hypothesis tests. Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API. -{% include_example python/ml/hypothesis_testing_kolmogorov_smirnov_test_example.py %} +{% include_example python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py %}
@@ -279,7 +279,7 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`. Refer to the [`RandomRDDs` Scala docs](api/scala/index.html#org.apache.spark.mllib.random.RandomRDDs) for details on the API. -{% include_example scala/org/apache/spark/examples/ml/RandomDataGenerationExample.scala %} +{% include_example scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala %}
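Since PATCH 06 below comments this snippet out of RandomDataGenerationExample.scala, a compilable sketch for reference, assuming `sc` is in scope (the mean/variance printout is an added sanity check, not part of the patch):

{% highlight scala %}
import org.apache.spark.mllib.random.RandomRDDs.normalRDD

// Generate a random double RDD of 1 million i.i.d. values drawn from N(0, 1),
// evenly distributed across 10 partitions.
val u = normalRDD(sc, 1000000L, 10)
// Apply a shift-and-scale transform to get a random double RDD following N(1, 4).
val v = u.map(x => 1.0 + 2.0 * x)
println(s"mean ~ ${v.mean()}, variance ~ ${v.variance()}")
{% endhighlight %}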
@@ -290,7 +290,7 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`. Refer to the [`RandomRDDs` Java docs](api/java/org/apache/spark/mllib/random/RandomRDDs) for details on the API. -{% include_example java/org/apache/spark/examples/ml/JavaRandomDataGenerationExample.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java %}
@@ -301,7 +301,7 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`. Refer to the [`RandomRDDs` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.random.RandomRDDs) for more details on the API. -{% include_example python/ml/random_data_generation_example.py %} +{% include_example python/mllib/random_data_generation_example.py %}
@@ -323,7 +323,7 @@ to do so. Refer to the [`KernelDensity` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.KernelDensity) for details on the API. -{% include_example scala/org/apache/spark/examples/ml/KernelDensityEstimationExample.scala %} +{% include_example scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala %}
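The KDE example file also still reads `data = ...` at this point in the series; a minimal sketch under the same assumptions (a live `sc`; the sample values are illustrative):

{% highlight scala %}
import org.apache.spark.mllib.stat.KernelDensity
import org.apache.spark.rdd.RDD

val data: RDD[Double] = sc.parallelize(Seq(1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0))  // illustrative sample

// Construct the density estimator with the sample data and a standard deviation
// of 3.0 for the Gaussian kernels
val kd = new KernelDensity()
  .setSample(data)
  .setBandwidth(3.0)

// Find density estimates for the given evaluation points
val densities = kd.estimate(Array(-1.0, 2.0, 5.0))
densities.foreach(println)
{% endhighlight %}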
@@ -333,7 +333,7 @@ to do so. Refer to the [`KernelDensity` Java docs](api/java/org/apache/spark/mllib/stat/KernelDensity.html) for details on the API. -{% include_example java/org/apache/spark/examples/ml/JavaKernelDensityEstimationExample.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java %}
@@ -343,7 +343,7 @@ to do so. Refer to the [`KernelDensity` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.KernelDensity) for more details on the API. -{% include_example python/ml/kernel_density_estimation_example.py %} +{% include_example python/mllib/kernel_density_estimation_example.py %}
diff --git a/examples/src/__init__.py b/examples/src/__init__.py new file mode 100644 index 0000000000000..31a6ebb6ea4e4 --- /dev/null +++ b/examples/src/__init__.py @@ -0,0 +1 @@ +__author__ = 'quickmobile' diff --git a/examples/src/main/__init__.py b/examples/src/main/__init__.py new file mode 100644 index 0000000000000..31a6ebb6ea4e4 --- /dev/null +++ b/examples/src/main/__init__.py @@ -0,0 +1 @@ +__author__ = 'quickmobile' diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCorrelationsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java similarity index 98% rename from examples/src/main/java/org/apache/spark/examples/ml/JavaCorrelationsExample.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java index 16eded92832aa..6fb1ee6365a27 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaCorrelationsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.examples.ml; +package org.apache.spark.examples.mllib; // $example on$ import org.apache.spark.api.java.JavaRDD; diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java similarity index 98% rename from examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingExample.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java index d87366c2bd66c..8faf7f48a525d 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.examples.ml; +package org.apache.spark.examples.mllib; // $example on$ import org.apache.spark.api.java.JavaRDD; diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java similarity index 98% rename from examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingKolmogorovSmirnovTestExample.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java index 9ac0d1091e1f0..02f91848884a8 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaHypothesisTestingKolmogorovSmirnovTestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.examples.ml; +package org.apache.spark.examples.mllib; // $example on$ import java.util.Arrays; diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java similarity index 97% rename from examples/src/main/java/org/apache/spark/examples/ml/JavaKernelDensityEstimationExample.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java index c3fc9a804f3c2..338a3fdf5ebc8 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaKernelDensityEstimationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.examples.ml; +package org.apache.spark.examples.mllib; // $example on$ import org.apache.spark.api.java.JavaRDD; diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomDataGenerationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java similarity index 98% rename from examples/src/main/java/org/apache/spark/examples/ml/JavaRandomDataGenerationExample.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java index c9e7e999462a5..f84966c076770 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomDataGenerationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.examples.ml; +package org.apache.spark.examples.mllib; // $example on$ import org.apache.spark.SparkContext; diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java similarity index 97% rename from examples/src/main/java/org/apache/spark/examples/ml/JavaStratifiedSamplingExample.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java index 7df6afafe05f3..6d11e97690413 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStratifiedSamplingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.examples.ml; +package org.apache.spark.examples.mllib; // $example on$ import java.util.Map; diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java similarity index 98% rename from examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java index 56822c7e96801..ed7f9637e7627 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSummaryStatisticsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.examples.ml; +package org.apache.spark.examples.mllib; // $example on$ import org.apache.spark.api.java.JavaRDD; diff --git a/examples/src/main/python/__init__.py b/examples/src/main/python/__init__.py new file mode 100644 index 0000000000000..31a6ebb6ea4e4 --- /dev/null +++ b/examples/src/main/python/__init__.py @@ -0,0 +1 @@ +__author__ = 'quickmobile' diff --git a/examples/src/main/python/mllib/__init__.py b/examples/src/main/python/mllib/__init__.py new file mode 100644 index 0000000000000..31a6ebb6ea4e4 --- /dev/null +++ b/examples/src/main/python/mllib/__init__.py @@ -0,0 +1 @@ +__author__ = 'quickmobile' diff --git a/examples/src/main/python/ml/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py similarity index 100% rename from examples/src/main/python/ml/correlations_example.py rename to examples/src/main/python/mllib/correlations_example.py diff --git a/examples/src/main/python/ml/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py similarity index 100% rename from examples/src/main/python/ml/hypothesis_testing_example.py rename to examples/src/main/python/mllib/hypothesis_testing_example.py diff --git a/examples/src/main/python/ml/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py similarity index 100% rename from examples/src/main/python/ml/hypothesis_testing_kolmogorov_smirnov_test_example.py rename to examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py diff --git a/examples/src/main/python/ml/kernel_density_estimation_example.py b/examples/src/main/python/mllib/kernel_density_estimation_example.py similarity index 100% rename from examples/src/main/python/ml/kernel_density_estimation_example.py rename to examples/src/main/python/mllib/kernel_density_estimation_example.py diff --git a/examples/src/main/python/ml/random_data_generation_example.py b/examples/src/main/python/mllib/random_data_generation_example.py similarity index 100% rename from examples/src/main/python/ml/random_data_generation_example.py rename to examples/src/main/python/mllib/random_data_generation_example.py diff --git a/examples/src/main/python/ml/stratified_sampling_example.py b/examples/src/main/python/mllib/stratified_sampling_example.py similarity index 100% rename from examples/src/main/python/ml/stratified_sampling_example.py rename to examples/src/main/python/mllib/stratified_sampling_example.py diff --git a/examples/src/main/python/ml/summary_statistics_example.py b/examples/src/main/python/mllib/summary_statistics_example.py similarity index 100% rename from examples/src/main/python/ml/summary_statistics_example.py rename to examples/src/main/python/mllib/summary_statistics_example.py diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala similarity index 98% rename from examples/src/main/scala/org/apache/spark/examples/ml/CorrelationsExample.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala index 179b87e21f9c9..aeb5f7f802e00 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala @@ -16,7 +16,7 @@ */ // scalastyle:off println -package org.apache.spark.examples.ml +package 
org.apache.spark.examples.mllib // $example on$ import org.apache.spark.mllib.linalg._ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala similarity index 95% rename from examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingExample.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala index f67e291dcd2f7..be7f09c32a0e8 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala @@ -16,13 +16,11 @@ */ // scalastyle:off println -package org.apache.spark.examples.ml +package org.apache.spark.examples.mllib // $example on$ -import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.stat.Statistics._ import org.apache.spark.rdd.RDD // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala similarity index 92% rename from examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingKolmogorovSmirnovTestExample.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala index 9c6e07f2242eb..37528e44b7cc6 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/HypothesisTestingKolmogorovSmirnovTestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala @@ -16,12 +16,9 @@ */ // scalastyle:off println -package org.apache.spark.examples.ml +package org.apache.spark.examples.mllib // $example on$ -import org.apache.spark.SparkContext -import org.apache.spark.mllib.linalg._ -import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/KernelDensityEstimationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala similarity index 97% rename from examples/src/main/scala/org/apache/spark/examples/ml/KernelDensityEstimationExample.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala index ae9ecc9d4183a..1326e187ba771 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/KernelDensityEstimationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala @@ -16,7 +16,7 @@ */ // scalastyle:off println -package org.apache.spark.examples.ml +package org.apache.spark.examples.mllib // $example on$ import org.apache.spark.mllib.stat.KernelDensity diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomDataGenerationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala similarity index 97% rename from examples/src/main/scala/org/apache/spark/examples/ml/RandomDataGenerationExample.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala index 34f79a4f0d4dc..5de6162ade9d0 100644 --- 
a/examples/src/main/scala/org/apache/spark/examples/ml/RandomDataGenerationExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala
@@ -16,7 +16,7 @@
  */

 // scalastyle:off println
-package org.apache.spark.examples.ml
+package org.apache.spark.examples.mllib

 // $example on$
 import org.apache.spark.mllib.random.RandomRDDs._
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala
similarity index 97%
rename from examples/src/main/scala/org/apache/spark/examples/ml/StratifiedSamplingExample.scala
rename to examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala
index 7d5cf341f9d54..c01047e784357 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/StratifiedSamplingExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala
@@ -16,7 +16,7 @@
  */

 // scalastyle:off println
-package org.apache.spark.examples.ml
+package org.apache.spark.examples.mllib

 // $example on$
 import org.apache.spark.{SparkConf, SparkContext}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala
similarity index 97%
rename from examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala
rename to examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala
index eb81db6b58321..8876dbfcdb863 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SummaryStatisticsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala
@@ -16,7 +16,7 @@
  */

 // scalastyle:off println
-package org.apache.spark.examples.ml
+package org.apache.spark.examples.mllib

 // $example on$
 import org.apache.spark.mllib.linalg.Vectors

From b3285423a625812f64968767b8063ca83ab58f3c Mon Sep 17 00:00:00 2001
From: Xin Ren
Date: Wed, 3 Feb 2016 15:29:04 -0800
Subject: [PATCH 05/26] [SPARK-13019] remove python init files

---
 examples/src/__init__.py                   | 1 -
 examples/src/main/__init__.py              | 1 -
 examples/src/main/python/__init__.py       | 1 -
 examples/src/main/python/mllib/__init__.py | 1 -
 4 files changed, 4 deletions(-)
 delete mode 100644 examples/src/__init__.py
 delete mode 100644 examples/src/main/__init__.py
 delete mode 100644 examples/src/main/python/__init__.py
 delete mode 100644 examples/src/main/python/mllib/__init__.py

diff --git a/examples/src/__init__.py b/examples/src/__init__.py
deleted file mode 100644
index 31a6ebb6ea4e4..0000000000000
--- a/examples/src/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-__author__ = 'quickmobile'
diff --git a/examples/src/main/__init__.py b/examples/src/main/__init__.py
deleted file mode 100644
index 31a6ebb6ea4e4..0000000000000
--- a/examples/src/main/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-__author__ = 'quickmobile'
diff --git a/examples/src/main/python/__init__.py b/examples/src/main/python/__init__.py
deleted file mode 100644
index 31a6ebb6ea4e4..0000000000000
--- a/examples/src/main/python/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-__author__ = 'quickmobile'
diff --git a/examples/src/main/python/mllib/__init__.py b/examples/src/main/python/mllib/__init__.py
deleted file mode 100644
index 31a6ebb6ea4e4..0000000000000
--- a/examples/src/main/python/mllib/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-__author__ = 'quickmobile'
From 12fda2be3033fb0477951c276366215ed2ed2736 Mon Sep 17 00:00:00 2001
From: Xin Ren
Date: Wed, 3 Feb 2016 15:39:48 -0800
Subject: [PATCH 06/26] [SPARK-13019] comment broken code to pass compile process

---
 .../mllib/JavaHypothesisTestingExample.java   |  4 +-
 ...isTestingKolmogorovSmirnovTestExample.java |  4 +-
 .../JavaKernelDensityEstimationExample.java   |  4 +-
 .../JavaRandomDataGenerationExample.java      |  4 +-
 .../mllib/hypothesis_testing_example.py       | 48 +++++++++----------
 ...testing_kolmogorov_smirnov_test_example.py | 18 +++----
 .../kernel_density_estimation_example.py      | 30 ++++++------
 .../mllib/random_data_generation_example.py   | 12 ++---
 .../mllib/HypothesisTestingExample.scala      |  7 +--
 ...sTestingKolmogorovSmirnovTestExample.scala |  6 +--
 .../KernelDensityEstimationExample.scala      |  6 +--
 .../mllib/RandomDataGenerationExample.scala   |  5 +-
 12 files changed, 70 insertions(+), 78 deletions(-)

diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java
index 8faf7f48a525d..023480252d833 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java
@@ -42,7 +42,7 @@ public static void main(String[] args) {

     // $example on$
     // @note: todo
-
+/*
     Vector vec = ... // a vector composed of the frequencies of events

     // compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
@@ -73,7 +73,7 @@ public static void main(String[] args) {
     }

     // $example off$
-
+*/
     jsc.stop();
   }
 }
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java
index 02f91848884a8..c34e66541eb7a 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java
@@ -41,7 +41,7 @@ public static void main(String[] args) {
     SQLContext sqlContext = new SQLContext(jsc);

     // $example on$
-
+/*
     // @note: todo

     JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.2, 1.0, ...));
@@ -52,7 +52,7 @@ public static void main(String[] args) {
     System.out.println(testResult);

     // $example off$
-
+*/
     jsc.stop();
   }
 }
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java
index 338a3fdf5ebc8..457ee0c5537c9 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java
@@ -36,7 +36,7 @@ public static void main(String[] args) {
     SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample");
     JavaSparkContext jsc = new JavaSparkContext(conf);
     SQLContext sqlContext = new SQLContext(jsc);
-
+/*
     // $example on$

     // @note: todo
@@ -52,7 +52,7 @@ public static void main(String[] args) {
     // Find density estimates for the given values
     double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0});
     // $example off$
-
+*/
     jsc.stop();
   }
 }
diff --git
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java
index f84966c076770..f27476ffa6179 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java
@@ -39,7 +39,7 @@ public static void main(String[] args) {
     SQLContext sqlContext = new SQLContext(jsc);

     // $example on$
-
+/*
     // @note: todo

     // Generate a random double RDD that contains 1 million i.i.d. values drawn from the
@@ -54,7 +54,7 @@ public Double call(Double x) {
     });

     // $example off$
-
+*/
     jsc.stop();
   }
 }
diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py
index afbf7bc4309f8..cee1ab6168475 100644
--- a/examples/src/main/python/mllib/hypothesis_testing_example.py
+++ b/examples/src/main/python/mllib/hypothesis_testing_example.py
@@ -36,30 +36,30 @@

     # @note: todo

-    vec = Vectors.dense(...)  # a vector composed of the frequencies of events
-
-    # compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
-    # the test runs against a uniform distribution.
-    goodnessOfFitTestResult = Statistics.chiSqTest(vec)
-    print(goodnessOfFitTestResult)  # summary of the test including the p-value, degrees of freedom,
-    # test statistic, the method used, and the null hypothesis.
-
-    mat = Matrices.dense(...)  # a contingency matrix
-
-    # conduct Pearson's independence test on the input contingency matrix
-    independenceTestResult = Statistics.chiSqTest(mat)
-    print(independenceTestResult)  # summary of the test including the p-value, degrees of freedom...
-
-    obs = sc.parallelize(...)  # LabeledPoint(feature, label)
-
-    # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
-    # the independence test. Returns an array containing the ChiSquaredTestResult for every feature
-    # against the label.
-    featureTestResults = Statistics.chiSqTest(obs)
-
-    for i, result in enumerate(featureTestResults):
-        print("Column %d:" % (i + 1))
-        print(result)
+    # vec = Vectors.dense(...)  # a vector composed of the frequencies of events
+    #
+    # # compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
+    # # the test runs against a uniform distribution.
+    # goodnessOfFitTestResult = Statistics.chiSqTest(vec)
+    # print(goodnessOfFitTestResult)  # summary of the test including the p-value, degrees of freedom,
+    # # test statistic, the method used, and the null hypothesis.
+    #
+    # mat = Matrices.dense(...)  # a contingency matrix
+    #
+    # # conduct Pearson's independence test on the input contingency matrix
+    # independenceTestResult = Statistics.chiSqTest(mat)
+    # print(independenceTestResult)  # summary of the test including the p-value, degrees of freedom...
+    #
+    # obs = sc.parallelize(...)  # LabeledPoint(feature, label)
+    #
+    # # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
+    # # the independence test. Returns an array containing the ChiSquaredTestResult for every feature
+    # # against the label.
+    # featureTestResults = Statistics.chiSqTest(obs)
+    #
+    # for i, result in enumerate(featureTestResults):
+    #     print("Column %d:" % (i + 1))
+    #     print(result)

     # $example off$
diff --git a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
index c4ee776e32fe8..5541250dd76ee 100644
--- a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
+++ b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
@@ -33,15 +33,15 @@

     # @note: todo

-    parallelData = sc.parallelize([1.0, 2.0, ... ])
-
-    # run a KS test for the sample versus a standard normal distribution
-    testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1)
-    print(testResult)  # summary of the test including the p-value, test statistic,
-    # and null hypothesis
-    # if our p-value indicates significance, we can reject the null hypothesis
-    # Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with
-    # a lambda to calculate the CDF is not made available in the Python API
+    # parallelData = sc.parallelize([1.0, 2.0, ... ])
+    #
+    # # run a KS test for the sample versus a standard normal distribution
+    # testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1)
+    # print(testResult)  # summary of the test including the p-value, test statistic,
+    # # and null hypothesis
+    # # if our p-value indicates significance, we can reject the null hypothesis
+    # # Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with
+    # # a lambda to calculate the CDF is not made available in the Python API

     # $example off$
diff --git a/examples/src/main/python/mllib/kernel_density_estimation_example.py b/examples/src/main/python/mllib/kernel_density_estimation_example.py
index c71b96b30a771..383d9f37a9276 100644
--- a/examples/src/main/python/mllib/kernel_density_estimation_example.py
+++ b/examples/src/main/python/mllib/kernel_density_estimation_example.py
@@ -29,20 +29,20 @@
     sc = SparkContext(appName="KernelDensityEstimationExample")  # SparkContext
     sqlContext = SQLContext(sc)

-    # $example on$
-
-    # @note: todo
-
-    data = ...  # an RDD of sample data
-
-    # Construct the density estimator with the sample data and a standard deviation for the Gaussian
-    # kernels
-    kd = KernelDensity()
-    kd.setSample(data)
-    kd.setBandwidth(3.0)
-
-    # Find density estimates for the given values
-    densities = kd.estimate([-1.0, 2.0, 5.0])
-    # $example off$
+    # # $example on$
+    #
+    # # @note: todo
+    #
+    # data = ...  # an RDD of sample data
+    #
+    # # Construct the density estimator with the sample data and a standard deviation for the Gaussian
+    # # kernels
+    # kd = KernelDensity()
+    # kd.setSample(data)
+    # kd.setBandwidth(3.0)
+    #
+    # # Find density estimates for the given values
+    # densities = kd.estimate([-1.0, 2.0, 5.0])
+    # # $example off$

     sc.stop()
\ No newline at end of file
diff --git a/examples/src/main/python/mllib/random_data_generation_example.py b/examples/src/main/python/mllib/random_data_generation_example.py
index d42e33d464aba..db2a5e97e87d2 100644
--- a/examples/src/main/python/mllib/random_data_generation_example.py
+++ b/examples/src/main/python/mllib/random_data_generation_example.py
@@ -33,11 +33,11 @@

     # @note: todo

-    # Generate a random double RDD that contains 1 million i.i.d. values drawn from the
-    # standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
- u = RandomRDDs.normalRDD(sc, 1000000L, 10) - # Apply a transform to get a random double RDD following `N(1, 4)`. - v = u.map(lambda x: 1.0 + 2.0 * x) - # $example off$ + # # Generate a random double RDD that contains 1 million i.i.d. values drawn from the + # # standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. + # u = RandomRDDs.normalRDD(sc, 1000000L, 10) + # # Apply a transform to get a random double RDD following `N(1, 4)`. + # v = u.map(lambda x: 1.0 + 2.0 * x) + # # $example off$ sc.stop() \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala index be7f09c32a0e8..d5883c7d89604 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala @@ -19,9 +19,6 @@ package org.apache.spark.examples.mllib // $example on$ -import org.apache.spark.mllib.linalg._ -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.rdd.RDD // $example off$ import org.apache.spark.sql.SQLContext @@ -34,7 +31,7 @@ object HypothesisTestingExample { val conf = new SparkConf().setAppName("HypothesisTestingExample").setMaster("local[*]") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) - +/* // $example on$ // @note: todo val vec: Vector = ... // a vector composed of the frequencies of events @@ -64,7 +61,7 @@ object HypothesisTestingExample { } // summary of the test // $example off$ - +*/ sc.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala index 37528e44b7cc6..1948069954748 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala @@ -19,8 +19,6 @@ package org.apache.spark.examples.mllib // $example on$ -import org.apache.spark.mllib.stat.Statistics -import org.apache.spark.rdd.RDD // $example off$ import org.apache.spark.sql.SQLContext @@ -36,7 +34,7 @@ object HypothesisTestingKolmogorovSmirnovTestExample { // $example on$ // @note: todo - +/* val data: RDD[Double] = ... 
// an RDD of sample data // run a KS test for the sample versus a standard normal distribution @@ -50,7 +48,7 @@ object HypothesisTestingKolmogorovSmirnovTestExample { val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF) // $example off$ - +*/ sc.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala index 1326e187ba771..f061318b2fd1d 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala @@ -19,8 +19,6 @@ package org.apache.spark.examples.mllib // $example on$ -import org.apache.spark.mllib.stat.KernelDensity -import org.apache.spark.rdd.RDD // $example off$ import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} @@ -32,7 +30,7 @@ object KernelDensityEstimationExample { val conf = new SparkConf().setAppName("KernelDensityEstimationExample").setMaster("local[*]") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) - +/* // $example on$ // @note: todo @@ -48,7 +46,7 @@ object KernelDensityEstimationExample { // Find density estimates for the given values val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) // $example off$ - +*/ sc.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala index 5de6162ade9d0..fe33f88d4f144 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala @@ -19,7 +19,6 @@ package org.apache.spark.examples.mllib // $example on$ -import org.apache.spark.mllib.random.RandomRDDs._ // $example off$ import org.apache.spark.sql.SQLContext @@ -32,7 +31,7 @@ object RandomDataGenerationExample { val conf = new SparkConf().setAppName("RandomDataGenerationExample").setMaster("local[*]") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) - +/* // $example on$ // @note: todo @@ -44,7 +43,7 @@ object RandomDataGenerationExample { val v = u.map(x => 1.0 + 2.0 * x) // $example off$ - +*/ sc.stop() } } From 2abfaa93b9dee4a86d304a54f59143a5fa0ee401 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Wed, 3 Feb 2016 17:18:33 -0800 Subject: [PATCH 07/26] [SPARK-13019] remove code block tag --- docs/mllib-statistics.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md index bbbbd87a35610..a4b707453a2e3 100644 --- a/docs/mllib-statistics.md +++ b/docs/mllib-statistics.md @@ -182,7 +182,6 @@ Refer to the [`ChiSqTestResult` Java docs](api/java/org/apache/spark/mllib/stat/ {% include_example java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java %} -{% endhighlight %}
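A note on the mechanism these patches rely on: the `{% include_example %}` Jekyll tag added to docs/mllib-statistics.md renders only the code between the `// $example on$` and `// $example off$` markers of the referenced example file, so SparkContext setup and teardown stay out of the published page. A minimal Scala sketch of the convention (the object name here is illustrative, not part of any patch):

import org.apache.spark.{SparkConf, SparkContext}

object IncludeExampleSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("IncludeExampleSketch"))
    // $example on$
    // Only this region is spliced into the generated docs page.
    val data = sc.parallelize(Seq(1.0, 2.0, 3.0))
    println(data.sum())
    // $example off$
    sc.stop()
  }
}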
From 157da53ca1c6ceb13eb720e968d2e09fed44571d Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Wed, 3 Feb 2016 17:26:36 -0800 Subject: [PATCH 08/26] [SPARK-13019] make commented code explicit in html content --- .../examples/mllib/JavaHypothesisTestingExample.java | 4 ++-- ...JavaHypothesisTestingKolmogorovSmirnovTestExample.java | 4 ++-- .../mllib/JavaKernelDensityEstimationExample.java | 7 ++++--- .../examples/mllib/JavaRandomDataGenerationExample.java | 4 ++-- .../spark/examples/mllib/HypothesisTestingExample.scala | 7 ++++--- .../HypothesisTestingKolmogorovSmirnovTestExample.scala | 4 ++-- .../examples/mllib/KernelDensityEstimationExample.scala | 7 ++++--- .../examples/mllib/RandomDataGenerationExample.scala | 8 ++++---- 8 files changed, 24 insertions(+), 21 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java index 023480252d833..813c8ac936aca 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java @@ -71,9 +71,9 @@ public static void main(String[] args) { System.out.println(result); // summary of the test i++; } - - // $example off$ */ + // $example off$ + jsc.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java index c34e66541eb7a..90fa8830b1ae6 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java @@ -50,9 +50,9 @@ public static void main(String[] args) { // and null hypothesis // if our p-value indicates significance, we can reject the null hypothesis System.out.println(testResult); - - // $example off$ */ + // $example off$ + jsc.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java index 457ee0c5537c9..4229f59a64211 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java @@ -36,9 +36,9 @@ public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(jsc); -/* - // $example on$ + // $example on$ +/* // @note: todo RDD data = ... 
// an RDD of sample data @@ -51,8 +51,9 @@ public static void main(String[] args) { // Find density estimates for the given values double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0}); - // $example off$ */ + // $example off$ + jsc.stop(); } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java index f27476ffa6179..bad68ed5ba507 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java @@ -52,9 +52,9 @@ public Double call(Double x) { return 1.0 + 2.0 * x; } }); - - // $example off$ */ + // $example off$ + jsc.stop(); } } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala index d5883c7d89604..85cf226d3b5ac 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala @@ -31,8 +31,9 @@ object HypothesisTestingExample { val conf = new SparkConf().setAppName("HypothesisTestingExample").setMaster("local[*]") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) -/* + // $example on$ +/* // @note: todo val vec: Vector = ... // a vector composed of the frequencies of events @@ -59,9 +60,9 @@ object HypothesisTestingExample { println(s"Column $i:\n$result") i += 1 } // summary of the test - - // $example off$ */ + // $example off$ + sc.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala index 1948069954748..3e47287f06685 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala @@ -46,9 +46,9 @@ object HypothesisTestingKolmogorovSmirnovTestExample { // perform a KS test using a cumulative distribution function of our making val myCDF: Double => Double = ... val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF) - - // $example off$ */ + // $example off$ + sc.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala index f061318b2fd1d..cc761fdff026a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala @@ -30,9 +30,9 @@ object KernelDensityEstimationExample { val conf = new SparkConf().setAppName("KernelDensityEstimationExample").setMaster("local[*]") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) -/* - // $example on$ + // $example on$ +/* // @note: todo val data: RDD[Double] = ... 
// an RDD of sample data @@ -45,8 +45,9 @@ object KernelDensityEstimationExample { // Find density estimates for the given values val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) - // $example off$ */ + // $example off$ + sc.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala index fe33f88d4f144..7420efa4c2992 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala @@ -31,9 +31,9 @@ object RandomDataGenerationExample { val conf = new SparkConf().setAppName("RandomDataGenerationExample").setMaster("local[*]") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) -/* - // $example on$ + // $example on$ +/* // @note: todo // Generate a random double RDD that contains 1 million i.i.d. values drawn from the @@ -41,9 +41,9 @@ object RandomDataGenerationExample { val u = normalRDD(sc, 1000000L, 10) // Apply a transform to get a random double RDD following `N(1, 4)`. val v = u.map(x => 1.0 + 2.0 * x) - - // $example off$ */ + // $example off$ + sc.stop() } } From 323304fe2aa3033a88429cf3a0e5adef345d2c24 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Thu, 4 Feb 2016 14:55:51 -0800 Subject: [PATCH 09/26] [SPARK-13019] Stratified Sampling working --- .../mllib/JavaStratifiedSamplingExample.java | 44 +++++++++++++++---- .../mllib/stratified_sampling_example.py | 13 ++++-- .../mllib/StratifiedSamplingExample.scala | 6 +-- 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java index 6d11e97690413..768332558469a 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java @@ -18,7 +18,8 @@ package org.apache.spark.examples.mllib; // $example on$ -import java.util.Map; +import java.util.*; + import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaPairRDD; @@ -26,27 +27,52 @@ // $example off$ import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.VoidFunction; import org.apache.spark.sql.SQLContext; import org.apache.spark.mllib.linalg.Vectors; -import java.util.Arrays; +import scala.Tuple2; public class JavaStratifiedSamplingExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample"); + SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample").setMaster("local[*]"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(jsc); // $example on$ -// JavaPairRDD data = ... // an RDD of any key value pairs -// Map fractions = ... 
// specify the exact fraction desired from each key -// -// // Get an exact sample from each stratum -// JavaPairRDD approxSample = data.sampleByKey(false, fractions); -// JavaPairRDD exactSample = data.sampleByKeyExact(false, fractions); + List<Tuple2<Integer, Character>> list = new ArrayList<>(); + list.add(new Tuple2<>(1, 'a')); + list.add(new Tuple2<>(1, 'b')); + list.add(new Tuple2<>(2, 'c')); + list.add(new Tuple2<>(2, 'd')); + list.add(new Tuple2<>(2, 'e')); + list.add(new Tuple2<>(3, 'f')); + + JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(list); // an RDD of any key value pairs JavaPairRDD<Integer, Character> + Map<Integer, Double> fractions = new HashMap<>(); // specify the exact fraction desired from each key Map<Integer, Double> + fractions.put(1, 0.1); + fractions.put(2, 0.6); + fractions.put(3, 0.3); + + // Get an exact sample from each stratum + JavaPairRDD<Integer, Character> approxSample = data.sampleByKey(false, fractions); // JavaPairRDD<Integer, Character> + JavaPairRDD<Integer, Character> exactSample = data.sampleByKeyExact(false, fractions); // JavaPairRDD<Integer, Character> + // $example off$ + approxSample.foreach(new VoidFunction<Tuple2<Integer, Character>>() { + public void call(Tuple2<Integer, Character> t) throws Exception { + System.out.println(t._1() + " " + t._2()); + } + }); + + exactSample.foreach(new VoidFunction<Tuple2<Integer, Character>>() { + public void call(Tuple2<Integer, Character> t) throws Exception { + System.out.println(t._1() + " " + t._2()); + } + }); + jsc.stop(); } } diff --git a/examples/src/main/python/mllib/stratified_sampling_example.py b/examples/src/main/python/mllib/stratified_sampling_example.py index 0f6ede7335a85..d44309d0f4ae5 100644 --- a/examples/src/main/python/mllib/stratified_sampling_example.py +++ b/examples/src/main/python/mllib/stratified_sampling_example.py @@ -30,10 +30,15 @@ sqlContext = SQLContext(sc) # $example on$ - # data = ... # an RDD of any key value pairs - # fractions = ... # specify the exact fraction desired from each key as a dictionary - # - # approxSample = data.sampleByKey(False, fractions); + + data = sc.parallelize([(1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')]) # an RDD of any key value pairs + fractions = {1:0.1, 2:0.6, 3:0.3} # specify the exact fraction desired from each key as a dictionary + + approxSample = data.sampleByKey(False, fractions); # $example off$ + for each in approxSample.collect(): + print(each) + sc.stop() \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala index c01047e784357..0502e01235fdb 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala @@ -35,14 +35,14 @@ object StratifiedSamplingExample { // $example on$ // @note: I don't know how to use class "import org.apache.spark.rdd.PairRDDFunctions" val data = sc.parallelize(Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f'))) // an RDD[(K, V)] of any key value pairs - val fractions = Map(1 -> 1.0, 2 -> 2.0, 3 -> 3.0)// specify the exact fraction desired from each key + val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3)// specify the exact fraction desired from each key // Get an exact sample from each stratum val approxSample = data.sampleByKey(withReplacement = false, fractions) val exactSample = data.sampleByKeyExact(withReplacement = false, fractions) - println(approxSample.toString) - println(exactSample.toString) + approxSample.foreach(println) + exactSample.foreach(println) // $example off$ sc.stop() From 3692d30bfec83e53883a017339d08cfaf3223266 Mon Sep 17 00:00:00
2001 From: Xin Ren Date: Thu, 4 Feb 2016 17:14:01 -0800 Subject: [PATCH 10/26] [SPARK-13019] hypothesis testing working --- .../mllib/JavaHypothesisTestingExample.java | 16 +++--- .../mllib/hypothesis_testing_example.py | 53 ++++++++++--------- .../mllib/HypothesisTestingExample.scala | 23 +++++--- .../mllib/StratifiedSamplingExample.scala | 5 +- 4 files changed, 56 insertions(+), 41 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java index 813c8ac936aca..dbdfa6ec8df51 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java @@ -35,15 +35,13 @@ public class JavaHypothesisTestingExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample"); + SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample").setMaster("local[*]"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(jsc); // $example on$ - // @note: todo -/* - Vector vec = ... // a vector composed of the frequencies of events + Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25); // a vector composed of the frequencies of events // compute the goodness of fit. If a second vector to test against is not supplied as a parameter, // the test runs against a uniform distribution. @@ -52,14 +50,18 @@ public static void main(String[] args) { // and the null hypothesis. System.out.println(goodnessOfFitTestResult); - Matrix mat = ... // a contingency matrix + // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) + Matrix mat = Matrices.dense(3, 2, new double[] {1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); // a contingency matrix // conduct Pearson's independence test on the input contingency matrix ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat); // summary of the test including the p-value, degrees of freedom... System.out.println(independenceTestResult); - JavaRDD obs = ... // an RDD of labeled points + LabeledPoint p1 = new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)); + LabeledPoint p2 = new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)); + LabeledPoint p3 = new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5)); + JavaRDD obs = jsc.parallelize(Arrays.asList(p1, p2, p3)); // an RDD of labeled points // The contingency table is constructed from the raw (feature, label) pairs and used to conduct // the independence test. 
Returns an array containing the ChiSquaredTestResult for every feature @@ -71,7 +73,7 @@ public static void main(String[] args) { System.out.println(result); // summary of the test i++; } -*/ + // $example off$ jsc.stop(); diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py index cee1ab6168475..3d7fe646489dc 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_example.py @@ -24,7 +24,7 @@ # $example on$ from pyspark import SparkContext from pyspark.mllib.linalg import Vectors, Matrices -from pyspark.mllib.regresssion import LabeledPoint +from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.stat import Statistics # $example off$ @@ -34,32 +34,33 @@ # $example on$ - # @note: todo + vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) # a vector composed of the frequencies of events - # vec = Vectors.dense(...) # a vector composed of the frequencies of events - # - # # compute the goodness of fit. If a second vector to test against is not supplied as a parameter, - # # the test runs against a uniform distribution. - # goodnessOfFitTestResult = Statistics.chiSqTest(vec) - # print(goodnessOfFitTestResult) # summary of the test including the p-value, degrees of freedom, - # # test statistic, the method used, and the null hypothesis. - # - # mat = Matrices.dense(...) # a contingency matrix - # - # # conduct Pearson's independence test on the input contingency matrix - # independenceTestResult = Statistics.chiSqTest(mat) - # print(independenceTestResult) # summary of the test including the p-value, degrees of freedom... - # - # obs = sc.parallelize(...) # LabeledPoint(feature, label) . - # - # # The contingency table is constructed from an RDD of LabeledPoint and used to conduct - # # the independence test. Returns an array containing the ChiSquaredTestResult for every feature - # # against the label. - # featureTestResults = Statistics.chiSqTest(obs) - # - # for i, result in enumerate(featureTestResults): - # print("Column $d:" % (i + 1)) - # print(result) + # compute the goodness of fit. If a second vector to test against is not supplied as a parameter, + # the test runs against a uniform distribution. + goodnessOfFitTestResult = Statistics.chiSqTest(vec) + print(goodnessOfFitTestResult) # summary of the test including the p-value, degrees of freedom, + # test statistic, the method used, and the null hypothesis. + + mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0]) # a contingency matrix + + # conduct Pearson's independence test on the input contingency matrix + independenceTestResult = Statistics.chiSqTest(mat) + print(independenceTestResult) # summary of the test including the p-value, degrees of freedom... + + p1 = LabeledPoint(1.0, [1.0, 0.0, 3.0]) + p2 = LabeledPoint(1.0, [1.0, 2.0, 0.0]) + p3 = LabeledPoint(1.0, [-1.0, 0.0, -0.5]) + obs = sc.parallelize([p1, p2, p3]) # LabeledPoint(feature, label) . + + # The contingency table is constructed from an RDD of LabeledPoint and used to conduct + # the independence test. Returns an array containing the ChiSquaredTestResult for every feature + # against the label. 
+ featureTestResults = Statistics.chiSqTest(obs) + + for i, result in enumerate(featureTestResults): + print("Column: " + str(i + 1)) + print(result) # $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala index 85cf226d3b5ac..9b414af60e705 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala @@ -19,6 +19,11 @@ package org.apache.spark.examples.mllib // $example on$ +import org.apache.spark.mllib.linalg._ +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.mllib.stat.test.ChiSqTestResult +import org.apache.spark.rdd.RDD // $example off$ import org.apache.spark.sql.SQLContext @@ -33,9 +38,8 @@ object HypothesisTestingExample { val sqlContext = new SQLContext(sc) // $example on$ -/* - // @note: todo - val vec: Vector = ... // a vector composed of the frequencies of events + + val vec: Vector = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) // a vector composed of the frequencies of events // compute the goodness of fit. If a second vector to test against is not supplied as a parameter, // the test runs against a uniform distribution. @@ -43,13 +47,20 @@ object HypothesisTestingExample { println(goodnessOfFitTestResult) // summary of the test including the p-value, degrees of freedom, // test statistic, the method used, and the null hypothesis. - val mat: Matrix = ... // a contingency matrix + + + // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) + val mat: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0)) // a contingency matrix // conduct Pearson's independence test on the input contingency matrix val independenceTestResult = Statistics.chiSqTest(mat) println(independenceTestResult) // summary of the test including the p-value, degrees of freedom... - val obs: RDD[LabeledPoint] = ... // (feature, label) pairs. + + val p1 = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)) + val p2 = LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)) + val p3 = LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5)) + val obs: RDD[LabeledPoint] = sc.parallelize(Seq(p1, p2, p3)) // (feature, label) pairs. // The contingency table is constructed from the raw (feature, label) pairs and used to conduct // the independence test. 
Returns an array containing the ChiSquaredTestResult for every feature @@ -60,7 +71,7 @@ object HypothesisTestingExample { println(s"Column $i:\n$result") i += 1 } // summary of the test -*/ + // $example off$ sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala index 0502e01235fdb..9144b2f0c5813 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala @@ -41,10 +41,11 @@ object StratifiedSamplingExample { val approxSample = data.sampleByKey(withReplacement = false, fractions) val exactSample = data.sampleByKeyExact(withReplacement = false, fractions) - approxSample.foreach(println) - exactSample.foreach(println) // $example off$ + approxSample.foreach(println) + exactSample.foreach(println) + sc.stop() } } From 89c3d2ed7d0134d27ecd5c2077e45c12e1702552 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Thu, 4 Feb 2016 17:33:25 -0800 Subject: [PATCH 11/26] [SPARK-13019] Hypothesis Testing Kolmogorov Smirnov Test Example is working --- ...isTestingKolmogorovSmirnovTestExample.java | 8 +++----- ...testing_kolmogorov_smirnov_test_example.py | 20 +++++++++---------- ...sTestingKolmogorovSmirnovTestExample.scala | 12 ++++++----- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java index 90fa8830b1ae6..d78e246d0799f 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java @@ -36,21 +36,19 @@ public class JavaHypothesisTestingKolmogorovSmirnovTestExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample"); + SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample").setMaster("local[*]"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(jsc); // $example on$ -/* - // @note: todo - JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.2, 1.0, ...)); + JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25)); KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); // summary of the test including the p-value, test statistic, // and null hypothesis // if our p-value indicates significance, we can reject the null hypothesis System.out.println(testResult); -*/ + // $example off$ jsc.stop(); diff --git a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py index 5541250dd76ee..5189992ade9b5 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py @@ -31,17 +31,15 @@ # $example on$ - # @note: todo - - # parallelData = sc.parallelize([1.0, 2.0, ... 
]) - # - # # run a KS test for the sample versus a standard normal distribution - # testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1) - # print(testResult) # summary of the test including the p-value, test statistic, - # # and null hypothesis - # # if our p-value indicates significance, we can reject the null hypothesis - # # Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with - # # a lambda to calculate the CDF is not made available in the Python API + parallelData = sc.parallelize([0.1, 0.15, 0.2, 0.3, 0.25]) + + # run a KS test for the sample versus a standard normal distribution + testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1) + print(testResult) # summary of the test including the p-value, test statistic, + # and null hypothesis + # if our p-value indicates significance, we can reject the null hypothesis + # Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with + # a lambda to calculate the CDF is not made available in the Python API # $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala index 3e47287f06685..29cd68ee9fc3a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala @@ -19,6 +19,8 @@ package org.apache.spark.examples.mllib // $example on$ +import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.rdd.RDD // $example off$ import org.apache.spark.sql.SQLContext @@ -33,9 +35,8 @@ object HypothesisTestingKolmogorovSmirnovTestExample { val sqlContext = new SQLContext(sc) // $example on$ - // @note: todo -/* - val data: RDD[Double] = ... // an RDD of sample data + + val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25)) // an RDD of sample data // run a KS test for the sample versus a standard normal distribution val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1) @@ -44,9 +45,10 @@ object HypothesisTestingKolmogorovSmirnovTestExample { // if our p-value indicates significance, we can reject the null hypothesis // perform a KS test using a cumulative distribution function of our making - val myCDF: Double => Double = ... 
+ val myCDF: Double => Double = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1) val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF) -*/ + println(testResult2) + // $example off$ sc.stop() From 4dbbc6d32ed3c045e5b947df77f11195adb1255f Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Thu, 4 Feb 2016 17:37:43 -0800 Subject: [PATCH 12/26] [SPARK-13019] remove empty lines --- .../spark/examples/mllib/JavaHypothesisTestingExample.java | 1 - .../JavaHypothesisTestingKolmogorovSmirnovTestExample.java | 5 ----- examples/src/main/python/mllib/hypothesis_testing_example.py | 1 - .../hypothesis_testing_kolmogorov_smirnov_test_example.py | 1 - .../spark/examples/mllib/HypothesisTestingExample.scala | 5 ----- .../HypothesisTestingKolmogorovSmirnovTestExample.scala | 2 -- 6 files changed, 15 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java index dbdfa6ec8df51..de6330667915f 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java @@ -40,7 +40,6 @@ public static void main(String[] args) { SQLContext sqlContext = new SQLContext(jsc); // $example on$ - Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25); // a vector composed of the frequencies of events // compute the goodness of fit. If a second vector to test against is not supplied as a parameter, diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java index d78e246d0799f..0c40e2bd2f9b6 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java @@ -18,11 +18,8 @@ package org.apache.spark.examples.mllib; // $example on$ -import java.util.Arrays; - import org.apache.spark.api.java.JavaDoubleRDD; import org.apache.spark.api.java.JavaSparkContext; - import org.apache.spark.mllib.stat.Statistics; import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult; // $example off$ @@ -32,7 +29,6 @@ import org.apache.spark.mllib.linalg.Vectors; import java.util.Arrays; - public class JavaHypothesisTestingKolmogorovSmirnovTestExample { public static void main(String[] args) { @@ -41,7 +37,6 @@ public static void main(String[] args) { SQLContext sqlContext = new SQLContext(jsc); // $example on$ - JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25)); KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); // summary of the test including the p-value, test statistic, diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py index 3d7fe646489dc..91ed400210cdc 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_example.py @@ -33,7 +33,6 @@ sqlContext = SQLContext(sc) # $example on$ - vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) # a vector composed of the frequencies of events # compute the goodness of fit. 
If a second vector to test against is not supplied as a parameter, diff --git a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py index 5189992ade9b5..91499d4f1fdc6 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py @@ -30,7 +30,6 @@ sqlContext = SQLContext(sc) # $example on$ - parallelData = sc.parallelize([0.1, 0.15, 0.2, 0.3, 0.25]) # run a KS test for the sample versus a standard normal distribution diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala index 9b414af60e705..1b75535adba80 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala @@ -24,7 +24,6 @@ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics import org.apache.spark.mllib.stat.test.ChiSqTestResult import org.apache.spark.rdd.RDD - // $example off$ import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} @@ -38,7 +37,6 @@ object HypothesisTestingExample { val sqlContext = new SQLContext(sc) // $example on$ - val vec: Vector = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) // a vector composed of the frequencies of events // compute the goodness of fit. If a second vector to test against is not supplied as a parameter, @@ -47,8 +45,6 @@ object HypothesisTestingExample { println(goodnessOfFitTestResult) // summary of the test including the p-value, degrees of freedom, // test statistic, the method used, and the null hypothesis. - - // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) val mat: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0)) // a contingency matrix @@ -56,7 +52,6 @@ object HypothesisTestingExample { val independenceTestResult = Statistics.chiSqTest(mat) println(independenceTestResult) // summary of the test including the p-value, degrees of freedom... 
- val p1 = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)) val p2 = LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)) val p3 = LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5)) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala index 29cd68ee9fc3a..656c684ff1d21 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala @@ -21,7 +21,6 @@ package org.apache.spark.examples.mllib // $example on$ import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD - // $example off$ import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} @@ -35,7 +34,6 @@ object HypothesisTestingKolmogorovSmirnovTestExample { val sqlContext = new SQLContext(sc) // $example on$ - val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25)) // an RDD of sample data // run a KS test for the sample versus a standard normal distribution From f024fc3c5a6021f2d320c72f9ded78adc3347fbb Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Thu, 4 Feb 2016 18:03:26 -0800 Subject: [PATCH 13/26] [SPARK-13019] random data generation example working --- .../JavaRandomDataGenerationExample.java | 26 ++++++++++++++----- .../mllib/random_data_generation_example.py | 17 +++++++----- .../mllib/RandomDataGenerationExample.scala | 11 ++++---- 3 files changed, 34 insertions(+), 20 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java index bad68ed5ba507..46bd1889bb803 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java @@ -26,35 +26,47 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.VoidFunction; import org.apache.spark.sql.SQLContext; import org.apache.spark.mllib.linalg.Vectors; import java.util.Arrays; + public class JavaRandomDataGenerationExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaRandomDataGenerationExample"); + SparkConf conf = new SparkConf().setAppName("JavaRandomDataGenerationExample").setMaster("local[*]"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(jsc); // $example on$ -/* - // @note: todo - // Generate a random double RDD that contains 1 million i.i.d. values drawn from the // standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. - JavaDoubleRDD u = normalJavaRDD(jsc, 1000000L, 10); + JavaDoubleRDD u = normalJavaRDD(jsc, 1000L, 10); // Apply a transform to get a random double RDD following `N(1, 4)`. 
- JavaDoubleRDD v = u.map( + JavaRDD<Double> v = u.map( new Function<Double, Double>() { public Double call(Double x) { return 1.0 + 2.0 * x; } }); -*/ + // $example off$ + u.foreach(new VoidFunction<Double>() { + public void call(Double d) throws Exception { + System.out.println(d); + } + }); + + v.foreach(new VoidFunction<Double>() { + public void call(Double d) throws Exception { + System.out.println(d); + } + }); + jsc.stop(); } } diff --git a/examples/src/main/python/mllib/random_data_generation_example.py b/examples/src/main/python/mllib/random_data_generation_example.py index db2a5e97e87d2..7bec4bddeef81 100644 --- a/examples/src/main/python/mllib/random_data_generation_example.py +++ b/examples/src/main/python/mllib/random_data_generation_example.py @@ -30,14 +30,17 @@ sqlContext = SQLContext(sc) # $example on$ + # Generate a random double RDD that contains 1 million i.i.d. values drawn from the + # standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. + u = RandomRDDs.normalRDD(sc, 1000L, 10) + # Apply a transform to get a random double RDD following `N(1, 4)`. + v = u.map(lambda x: 1.0 + 2.0 * x) + # $example off$ - # @note: todo + for each in u.collect(): + print(each) - # # Generate a random double RDD that contains 1 million i.i.d. values drawn from the - # # standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. - # u = RandomRDDs.normalRDD(sc, 1000000L, 10) - # # Apply a transform to get a random double RDD following `N(1, 4)`. - # v = u.map(lambda x: 1.0 + 2.0 * x) - # # $example off$ + for each in v.collect(): + print(each) sc.stop() \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala index 7420efa4c2992..baa36bd7b7a1f 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala @@ -19,7 +19,7 @@ package org.apache.spark.examples.mllib // $example on$ - +import org.apache.spark.mllib.random.RandomRDDs._ // $example off$ import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} @@ -33,16 +33,15 @@ object RandomDataGenerationExample { val sqlContext = new SQLContext(sc) // $example on$ -/* - // @note: todo - // Generate a random double RDD that contains 1 million i.i.d. values drawn from the // standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. - val u = normalRDD(sc, 1000000L, 10) + val u = normalRDD(sc, 1000L, 10) // Apply a transform to get a random double RDD following `N(1, 4)`.
val v = u.map(x => 1.0 + 2.0 * x) -*/ + // $example off$ + u.foreach(print) + v.foreach(print) sc.stop() } From 6f949cde4f18829b2a1c7f946373fc008f4a5bb1 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Sun, 7 Feb 2016 11:19:47 +0800 Subject: [PATCH 14/26] [SPARK-13019] Kernel Density Estimation Example is working --- .../JavaKernelDensityEstimationExample.java | 12 ++++---- .../kernel_density_estimation_example.py | 29 +++++++++---------- .../KernelDensityEstimationExample.scala | 11 +++---- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java index 4229f59a64211..c62410dc7d770 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java @@ -18,6 +18,7 @@ package org.apache.spark.examples.mllib; // $example on$ +import org.apache.spark.api.java.JavaDoubleRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.stat.KernelDensity; @@ -33,15 +34,12 @@ public class JavaKernelDensityEstimationExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample"); + SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample").setMaster("local[*]"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(jsc); // $example on$ -/* - // @note: todo - - RDD data = ... // an RDD of sample data + JavaRDD data = jsc.parallelize(Arrays.asList(1.0,1.0,1.0,2.0,3.0,4.0,5.0,5.0,6.0,7.0,8.0,9.0,9.0)); // an RDD of sample data // Construct the density estimator with the sample data and a standard deviation for the Gaussian // kernels @@ -51,9 +49,11 @@ public static void main(String[] args) { // Find density estimates for the given values double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0}); -*/ + // $example off$ + System.out.println(Arrays.toString(densities)); + jsc.stop(); } } diff --git a/examples/src/main/python/mllib/kernel_density_estimation_example.py b/examples/src/main/python/mllib/kernel_density_estimation_example.py index 383d9f37a9276..a3055783db23d 100644 --- a/examples/src/main/python/mllib/kernel_density_estimation_example.py +++ b/examples/src/main/python/mllib/kernel_density_estimation_example.py @@ -29,20 +29,19 @@ sc = SparkContext(appName="KernelDensityEstimationExample") # SparkContext sqlContext = SQLContext(sc) - # # $example on$ - # - # # @note: todo - # - # data = ... 
# an RDD of sample data - # - # # Construct the density estimator with the sample data and a standard deviation for the Gaussian - # # kernels - # kd = KernelDensity() - # kd.setSample(data) - # kd.setBandwidth(3.0) - # - # # Find density estimates for the given values - # densities = kd.estimate([-1.0, 2.0, 5.0]) - # # $example off$ + # $example on$ + data = sc.parallelize([1.0,1.0,1.0,2.0,3.0,4.0,5.0,5.0,6.0,7.0,8.0,9.0,9.0]) # an RDD of sample data + + # Construct the density estimator with the sample data and a standard deviation for the Gaussian + # kernels + kd = KernelDensity() + kd.setSample(data) + kd.setBandwidth(3.0) + + # Find density estimates for the given values + densities = kd.estimate([-1.0, 2.0, 5.0]) + # $example off$ + + print(densities) sc.stop() \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala index cc761fdff026a..636457e3fa0f2 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala @@ -19,6 +19,8 @@ package org.apache.spark.examples.mllib // $example on$ +import org.apache.spark.mllib.stat.KernelDensity +import org.apache.spark.rdd.RDD // $example off$ import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} @@ -32,10 +34,7 @@ object KernelDensityEstimationExample { val sqlContext = new SQLContext(sc) // $example on$ -/* - // @note: todo - - val data: RDD[Double] = ... // an RDD of sample data + val data: RDD[Double] = sc.parallelize(Seq(1,1,1,2,3,4,5,5,6,7,8,9,9)) // an RDD of sample data // Construct the density estimator with the sample data and a standard deviation for the Gaussian // kernels @@ -45,9 +44,11 @@ object KernelDensityEstimationExample { // Find density estimates for the given values val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) -*/ + // $example off$ + densities.foreach(print) + sc.stop() } } From a4dd0fb5781dbdc162fd8652001b8de09b225297 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Sun, 7 Feb 2016 15:16:33 +0800 Subject: [PATCH 15/26] [SPARK-13019] code style check --- docs/mllib-statistics.md | 1 - .../mllib/JavaCorrelationsExample.java | 13 ++++++++---- ...isTestingKolmogorovSmirnovTestExample.java | 3 ++- .../JavaKernelDensityEstimationExample.java | 3 ++- .../mllib/JavaStratifiedSamplingExample.java | 8 +++++-- .../main/python/mllib/correlations_example.py | 7 ++++--- .../examples/mllib/CorrelationsExample.scala | 16 ++++++++------ .../mllib/HypothesisTestingExample.scala | 21 +++++++++---------- ...sTestingKolmogorovSmirnovTestExample.scala | 9 ++++---- .../KernelDensityEstimationExample.scala | 11 +++++----- .../mllib/RandomDataGenerationExample.scala | 4 +--- .../mllib/StratifiedSamplingExample.scala | 11 ++++++---- .../mllib/SummaryStatisticsExample.scala | 4 +--- 13 files changed, 61 insertions(+), 50 deletions(-) diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md index a4b707453a2e3..b06829f0247dd 100644 --- a/docs/mllib-statistics.md +++ b/docs/mllib-statistics.md @@ -181,7 +181,6 @@ hypothesis tests. Refer to the [`ChiSqTestResult` Java docs](api/java/org/apache/spark/mllib/stat/test/ChiSqTestResult.html) for details on the API. {% include_example java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java %} -
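For reference, the kernel density example that the preceding patches make runnable computes a Gaussian kernel density estimate: the density at a point x is the average, over the sample points x_i, of a normal density centered at x_i with standard deviation equal to the bandwidth. A plain-Scala sketch of that computation on a local sample (illustrative only; Spark's KernelDensity distributes the same sum over an RDD):

import scala.math.{exp, sqrt, Pi}

// fHat(x) = (1/n) * sum_i N(x; mean = x_i, sd = bandwidth)
def gaussianKde(sample: Seq[Double], bandwidth: Double)(x: Double): Double = {
  val norm = 1.0 / (bandwidth * sqrt(2.0 * Pi))
  sample.map { xi =>
    val u = (x - xi) / bandwidth
    norm * exp(-0.5 * u * u)
  }.sum / sample.size
}

// e.g. gaussianKde(Seq(1.0, 1.0, 2.0, 3.0), 3.0)(2.0) should be close to what
// kd.estimate(Array(2.0)) returns for the same sample and bandwidth.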
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java index 6fb1ee6365a27..e12481fab10c0 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java @@ -35,12 +35,15 @@ public class JavaCorrelationsExample { public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample").setMaster("local[*]"); + JavaSparkContext jsc = new JavaSparkContext(conf); // $example on$ - JavaSparkContext jsc = new JavaSparkContext(conf); + JavaDoubleRDD seriesX = jsc.parallelizeDoubles( + Arrays.asList(new Double[]{1.0, 2.0, 3.0, 3.0, 5.0})); // a series - JavaDoubleRDD seriesX = jsc.parallelizeDoubles(Arrays.asList(new Double[]{1.0, 2.0, 3.0, 3.0, 5.0})); // a series - JavaDoubleRDD seriesY = jsc.parallelizeDoubles(Arrays.asList(new Double[]{11.0, 22.0, 33.0, 33.0, 555.0})); // must have the same number of partitions and cardinality as seriesX + // must have the same number of partitions and cardinality as seriesX + JavaDoubleRDD seriesY = jsc.parallelizeDoubles( + Arrays.asList(new Double[]{11.0, 22.0, 33.0, 33.0, 555.0})); // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a // method is not specified, Pearson's method will be used by default. @@ -50,7 +53,9 @@ public static void main(String[] args) { Vector v1 = Vectors.dense(1.0, 10.0, 100.0); Vector v2 = Vectors.dense(2.0, 20.0, 200.0); Vector v3 = Vectors.dense(5.0, 33.0, 366.0); - JavaRDD<Vector> data = jsc.parallelize(Arrays.asList(v1, v2, v3)); // note that each Vector is a row and not a column + + // note that each Vector is a row and not a column + JavaRDD<Vector> data = jsc.parallelize(Arrays.asList(v1, v2, v3)); // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. // If a method is not specified, Pearson's method will be used by default.
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java index 0c40e2bd2f9b6..875c2c8777c3b 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java @@ -32,7 +32,8 @@ public class JavaHypothesisTestingKolmogorovSmirnovTestExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample").setMaster("local[*]"); + SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample") + .setMaster("local[*]"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(jsc); diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java index c62410dc7d770..72781eedfe635 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java @@ -39,7 +39,8 @@ public static void main(String[] args) { SQLContext sqlContext = new SQLContext(jsc); // $example on$ - JavaRDD<Double> data = jsc.parallelize(Arrays.asList(1.0,1.0,1.0,2.0,3.0,4.0,5.0,5.0,6.0,7.0,8.0,9.0,9.0)); // an RDD of sample data + JavaRDD<Double> data = jsc.parallelize( + Arrays.asList(1.0,1.0,1.0,2.0,3.0,4.0,5.0,5.0,6.0,7.0,8.0,9.0,9.0)); // an RDD of sample data // Construct the density estimator with the sample data and a standard deviation for the Gaussian // kernels diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java index 768332558469a..7371e274dcbb1 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java @@ -49,8 +49,12 @@ public static void main(String[] args) { list.add(new Tuple2<>(2, 'e')); list.add(new Tuple2<>(3, 'f')); - JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(list); // an RDD of any key value pairs JavaPairRDD<Integer, Character> - Map<Integer, Double> fractions = new HashMap<>(); // specify the exact fraction desired from each key Map<Integer, Double> + // an RDD of any key value pairs JavaPairRDD<Integer, Character> + JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(list); + + // specify the exact fraction desired from each key Map<Integer, Double> + Map<Integer, Double> fractions = new HashMap<>(); + fractions.put(1, 0.1); fractions.put(2, 0.6); fractions.put(3, 0.3); diff --git a/examples/src/main/python/mllib/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py index 35e089c6a0e7d..f20c8b6c03413 100644 --- a/examples/src/main/python/mllib/correlations_example.py +++ b/examples/src/main/python/mllib/correlations_example.py @@ -29,8 +29,9 @@ # $example on$ sc = SparkContext(appName="CorrelationsExample") # SparkContext - seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0]) # a series - seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0]) # must have the same number of partitions and cardinality as seriesX + seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0]) # a series +
seriesY = sc.parallelize( + [11.0, 22.0, 33.0, 33.0, 555.0]) # must have the same number of partitions and cardinality as seriesX # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a # method is not specified, Pearson's method will be used by default. @@ -39,7 +40,7 @@ v1 = np.array([1.0, 10.0, 100.0]) v2 = np.array([2.0, 20.0, 200.0]) v3 = np.array([5.0, 33.0, 366.0]) - data = sc.parallelize([v1, v2, v3]) # an RDD of Vectors + data = sc.parallelize([v1, v2, v3]) # an RDD of Vectors # calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. # If a method is not specified, Pearson's method will be used by default. diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala index aeb5f7f802e00..9b3c0321f067d 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala @@ -18,13 +18,12 @@ // scalastyle:off println package org.apache.spark.examples.mllib +import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.stat.Statistics // $example off$ import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SQLContext -import org.apache.spark.{SparkConf, SparkContext} object CorrelationsExample { @@ -32,20 +31,25 @@ object CorrelationsExample { val conf = new SparkConf().setAppName("CorrelationsExample").setMaster("local[*]") val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) // $example on$ val seriesX: RDD[Double] = sc.parallelize(Array(1, 2, 3, 3, 5)) // a series - val seriesY: RDD[Double] = sc.parallelize(Array(11, 22, 33, 33, 555)) // must have the same number of partitions and cardinality as seriesX + val seriesY: RDD[Double] = sc.parallelize(Array(11, 22, 33, 33, 555)) + // must have the same number of partitions and cardinality as seriesX // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a // method is not specified, Pearson's method will be used by default. val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson") println(correlation) - val data: RDD[Vector] = sc.parallelize(Seq(Vectors.dense(1.0, 10.0, 100.0), Vectors.dense(2.0, 20.0, 200.0), Vectors.dense(5.0, 33.0, 366.0))) // note that each Vector is a row and not a column + val data: RDD[Vector] = sc.parallelize( + Seq( + Vectors.dense(1.0, 10.0, 100.0), + Vectors.dense(2.0, 20.0, 200.0), + Vectors.dense(5.0, 33.0, 366.0)) + ) // note that each Vector is a row and not a column - // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. + // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method // If a method is not specified, Pearson's method will be used by default. 
val correlMatrix: Matrix = Statistics.corr(data, "pearson") println(correlMatrix.toString) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala index 1b75535adba80..fe3c280cf0082 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala @@ -18,6 +18,7 @@ // scalastyle:off println package org.apache.spark.examples.mllib +import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.regression.LabeledPoint @@ -25,8 +26,6 @@ import org.apache.spark.mllib.stat.Statistics import org.apache.spark.mllib.stat.test.ChiSqTestResult import org.apache.spark.rdd.RDD // $example off$ -import org.apache.spark.sql.SQLContext -import org.apache.spark.{SparkConf, SparkContext} object HypothesisTestingExample { @@ -34,23 +33,23 @@ object HypothesisTestingExample { val conf = new SparkConf().setAppName("HypothesisTestingExample").setMaster("local[*]") val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) // $example on$ - val vec: Vector = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) // a vector composed of the frequencies of events + // a vector composed of the frequencies of events + val vec: Vector = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) - // compute the goodness of fit. If a second vector to test against is not supplied as a parameter, - // the test runs against a uniform distribution. + // compute the goodness of fit. If a second vector to test against is not supplied + // as a parameter, the test runs against a uniform distribution. val goodnessOfFitTestResult = Statistics.chiSqTest(vec) - println(goodnessOfFitTestResult) // summary of the test including the p-value, degrees of freedom, - // test statistic, the method used, and the null hypothesis. + println(goodnessOfFitTestResult) // summary of the test including the p-value, + // degrees of freedom, test statistic, the method used, and the null hypothesis. - // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) - val mat: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0)) // a contingency matrix + // a contingency matrix. Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) + val mat: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0)) // conduct Pearson's independence test on the input contingency matrix val independenceTestResult = Statistics.chiSqTest(mat) - println(independenceTestResult) // summary of the test including the p-value, degrees of freedom... 
+ println(independenceTestResult) // summary of the test including the p-value, degrees of freedom val p1 = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)) val p2 = LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala index 656c684ff1d21..7ed96766fcd11 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala @@ -18,20 +18,19 @@ // scalastyle:off println package org.apache.spark.examples.mllib +import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD // $example off$ -import org.apache.spark.sql.SQLContext -import org.apache.spark.{SparkConf, SparkContext} object HypothesisTestingKolmogorovSmirnovTestExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample").setMaster("local[*]") + val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample") + .setMaster("local[*]") val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) // $example on$ val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25)) // an RDD of sample data @@ -43,7 +42,7 @@ object HypothesisTestingKolmogorovSmirnovTestExample { // if our p-value indicates significance, we can reject the null hypothesis // perform a KS test using a cumulative distribution function of our making - val myCDF: Double => Double = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1) + val myCDF = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1) val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF) println(testResult2) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala index 636457e3fa0f2..31b5a5e1ad05c 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala @@ -18,12 +18,11 @@ // scalastyle:off println package org.apache.spark.examples.mllib +import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.stat.KernelDensity import org.apache.spark.rdd.RDD // $example off$ -import org.apache.spark.sql.SQLContext -import org.apache.spark.{SparkConf, SparkContext} object KernelDensityEstimationExample { @@ -31,13 +30,13 @@ object KernelDensityEstimationExample { val conf = new SparkConf().setAppName("KernelDensityEstimationExample").setMaster("local[*]") val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) // $example on$ - val data: RDD[Double] = sc.parallelize(Seq(1,1,1,2,3,4,5,5,6,7,8,9,9)) // an RDD of sample data + // an RDD of sample data + val data: RDD[Double] = sc.parallelize(Seq(1, 1, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9)) - // Construct the density estimator with the sample data and a standard deviation for the Gaussian - // kernels + // Construct the density estimator with the sample data and a standard deviation + // for the Gaussian kernels val kd = new 
KernelDensity() .setSample(data) .setBandwidth(3.0) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala index baa36bd7b7a1f..91019a2ac9de5 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala @@ -18,11 +18,10 @@ // scalastyle:off println package org.apache.spark.examples.mllib +import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.random.RandomRDDs._ // $example off$ -import org.apache.spark.sql.SQLContext -import org.apache.spark.{SparkConf, SparkContext} object RandomDataGenerationExample { @@ -30,7 +29,6 @@ object RandomDataGenerationExample { val conf = new SparkConf().setAppName("RandomDataGenerationExample").setMaster("local[*]") val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) // $example on$ // Generate a random double RDD that contains 1 million i.i.d. values drawn from the diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala index 9144b2f0c5813..24a5407426894 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala @@ -33,9 +33,12 @@ object StratifiedSamplingExample { val sqlContext = new SQLContext(sc) // $example on$ - // @note: I don't know how to use class "import org.apache.spark.rdd.PairRDDFunctions" - val data = sc.parallelize(Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f'))) // an RDD[(K, V)] of any key value pairs - val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3)// specify the exact fraction desired from each key + // an RDD[(K, V)] of any key value pairs + val data = sc.parallelize( + Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f'))) + + // specify the exact fraction desired from each key + val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3) // Get an exact sample from each stratum val approxSample = data.sampleByKey(withReplacement = false, fractions) @@ -45,7 +48,7 @@ object StratifiedSamplingExample { approxSample.foreach(println) exactSample.foreach(println) - + sc.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala index 8876dbfcdb863..c2fe7976b4609 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala @@ -18,12 +18,11 @@ // scalastyle:off println package org.apache.spark.examples.mllib +import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} // $example off$ -import org.apache.spark.sql.SQLContext -import org.apache.spark.{SparkConf, SparkContext} object SummaryStatisticsExample { @@ -31,7 +30,6 @@ object SummaryStatisticsExample { val conf = new SparkConf().setAppName("SummaryStatisticsExample").setMaster("local[*]") val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) // $example on$ 
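[editor note] For the StratifiedSamplingExample changes above, a hedged, runnable sketch contrasting the two sampling paths (driver names are illustrative, data is the example's toy RDD): `sampleByKey` makes a single pass and only approximates each stratum's size, while `sampleByKeyExact` may take additional passes but returns exactly ceil(fraction_k * n_k) items for each key k.

{% highlight scala %}
import org.apache.spark.{SparkConf, SparkContext}

// Illustrative driver, not part of the patch.
object StratifiedSamplingSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("StratifiedSamplingSketch"))

    // an RDD[(K, V)] of key-value pairs
    val data = sc.parallelize(
      Seq((1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')))
    // the exact fraction desired from each key
    val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3)

    // one pass, per-stratum sample sizes only approximate
    val approxSample = data.sampleByKey(withReplacement = false, fractions)
    // potentially more passes, per-stratum sample sizes exact
    val exactSample = data.sampleByKeyExact(withReplacement = false, fractions)

    approxSample.collect().foreach(println)
    exactSample.collect().foreach(println)

    sc.stop()
  }
}
{% endhighlight %}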
val v1 = Vectors.dense(1.0, 10.0, 100.0) From 3a11802513a0d9a1c9f1dc5de8c37c09f5e97062 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Fri, 12 Feb 2016 08:29:37 +0800 Subject: [PATCH 16/26] [SPARK-13019] fix python style --- .../main/python/mllib/correlations_example.py | 8 ++++---- .../python/mllib/hypothesis_testing_example.py | 16 ++++++++-------- ...is_testing_kolmogorov_smirnov_test_example.py | 6 +++--- .../mllib/kernel_density_estimation_example.py | 7 ++++--- .../mllib/random_data_generation_example.py | 4 ++-- .../python/mllib/stratified_sampling_example.py | 12 +++++++----- .../python/mllib/summary_statistics_example.py | 12 ++++++------ 7 files changed, 34 insertions(+), 31 deletions(-) diff --git a/examples/src/main/python/mllib/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py index f20c8b6c03413..e9ccca0dd5593 100644 --- a/examples/src/main/python/mllib/correlations_example.py +++ b/examples/src/main/python/mllib/correlations_example.py @@ -27,11 +27,11 @@ if __name__ == "__main__": # $example on$ - sc = SparkContext(appName="CorrelationsExample") # SparkContext + sc = SparkContext(appName="CorrelationsExample") # SparkContext seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0]) # a series - seriesY = sc.parallelize( - [11.0, 22.0, 33.0, 33.0, 555.0]) # must have the same number of partitions and cardinality as seriesX + # seriesY must have the same number of partitions and cardinality as seriesX + seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0]) # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a # method is not specified, Pearson's method will be used by default. @@ -48,4 +48,4 @@ # $example off$ - sc.stop() \ No newline at end of file + sc.stop() diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py index 91ed400210cdc..4d37e394af3b7 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_example.py @@ -29,23 +29,23 @@ # $example off$ if __name__ == "__main__": - sc = SparkContext(appName="HypothesisTestingExample") # SparkContext + sc = SparkContext(appName="HypothesisTestingExample") # SparkContext sqlContext = SQLContext(sc) # $example on$ - vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) # a vector composed of the frequencies of events + vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) # a vector composed of the frequencies of events - # compute the goodness of fit. If a second vector to test against is not supplied as a parameter, - # the test runs against a uniform distribution. + # compute the goodness of fit. If a second vector to test against + # is not supplied as a parameter, the test runs against a uniform distribution. goodnessOfFitTestResult = Statistics.chiSqTest(vec) - print(goodnessOfFitTestResult) # summary of the test including the p-value, degrees of freedom, + print(goodnessOfFitTestResult) # summary of the test including the p-value, degrees of freedom, # test statistic, the method used, and the null hypothesis. - mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0]) # a contingency matrix + mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0]) # a contingency matrix # conduct Pearson's independence test on the input contingency matrix independenceTestResult = Statistics.chiSqTest(mat) - print(independenceTestResult) # summary of the test including the p-value, degrees of freedom... 
+ print(independenceTestResult) # summary of the test including the p-value, degrees of freedom p1 = LabeledPoint(1.0, [1.0, 0.0, 3.0]) p2 = LabeledPoint(1.0, [1.0, 2.0, 0.0]) @@ -63,4 +63,4 @@ # $example off$ - sc.stop() \ No newline at end of file + sc.stop() diff --git a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py index 91499d4f1fdc6..3e3c6ba0b96a8 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py @@ -26,7 +26,7 @@ # $example off$ if __name__ == "__main__": - sc = SparkContext(appName="HypothesisTestingKolmogorovSmirnovTestExample") # SparkContext + sc = SparkContext(appName="HypothesisTestingKolmogorovSmirnovTestExample") # SparkContext sqlContext = SQLContext(sc) # $example on$ @@ -34,7 +34,7 @@ # run a KS test for the sample versus a standard normal distribution testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1) - print(testResult) # summary of the test including the p-value, test statistic, + print(testResult) # summary of the test including the p-value, test statistic, # and null hypothesis # if our p-value indicates significance, we can reject the null hypothesis # Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with @@ -42,4 +42,4 @@ # $example off$ - sc.stop() \ No newline at end of file + sc.stop() diff --git a/examples/src/main/python/mllib/kernel_density_estimation_example.py b/examples/src/main/python/mllib/kernel_density_estimation_example.py index a3055783db23d..746027e6d599f 100644 --- a/examples/src/main/python/mllib/kernel_density_estimation_example.py +++ b/examples/src/main/python/mllib/kernel_density_estimation_example.py @@ -26,11 +26,12 @@ # $example off$ if __name__ == "__main__": - sc = SparkContext(appName="KernelDensityEstimationExample") # SparkContext + sc = SparkContext(appName="KernelDensityEstimationExample") # SparkContext sqlContext = SQLContext(sc) # $example on$ - data = sc.parallelize([1.0,1.0,1.0,2.0,3.0,4.0,5.0,5.0,6.0,7.0,8.0,9.0,9.0]) # an RDD of sample data + # an RDD of sample data + data = sc.parallelize([1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0]) # Construct the density estimator with the sample data and a standard deviation for the Gaussian # kernels @@ -44,4 +45,4 @@ print(densities) - sc.stop() \ No newline at end of file + sc.stop() diff --git a/examples/src/main/python/mllib/random_data_generation_example.py b/examples/src/main/python/mllib/random_data_generation_example.py index 7bec4bddeef81..7eb2ea81a8038 100644 --- a/examples/src/main/python/mllib/random_data_generation_example.py +++ b/examples/src/main/python/mllib/random_data_generation_example.py @@ -26,7 +26,7 @@ # $example off$ if __name__ == "__main__": - sc = SparkContext(appName="RandomDataGenerationExample") # SparkContext + sc = SparkContext(appName="RandomDataGenerationExample") # SparkContext sqlContext = SQLContext(sc) # $example on$ @@ -43,4 +43,4 @@ for each in v.collect(): print(each) - sc.stop() \ No newline at end of file + sc.stop() diff --git a/examples/src/main/python/mllib/stratified_sampling_example.py b/examples/src/main/python/mllib/stratified_sampling_example.py index d44309d0f4ae5..63bf2dddede01 100644 --- a/examples/src/main/python/mllib/stratified_sampling_example.py +++ 
b/examples/src/main/python/mllib/stratified_sampling_example.py @@ -26,19 +26,21 @@ # $example off$ if __name__ == "__main__": - sc = SparkContext(appName="StratifiedSamplingExample") # SparkContext + sc = SparkContext(appName="StratifiedSamplingExample") # SparkContext sqlContext = SQLContext(sc) # $example on$ + # an RDD of any key value pairs + data = sc.parallelize([(1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')]) - data = sc.parallelize([(1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'), (3, 'f')]) # an RDD of any key value pairs - fractions = {1:0.1, 2:0.6, 3:0.3} # specify the exact fraction desired from each key as a dictionary + # specify the exact fraction desired from each key as a dictionary + fractions = {1: 0.1, 2: 0.6, 3: 0.3} - approxSample = data.sampleByKey(False, fractions); + approxSample = data.sampleByKey(False, fractions) # $example off$ for each in approxSample.collect(): print(each) - sc.stop() \ No newline at end of file + sc.stop() diff --git a/examples/src/main/python/mllib/summary_statistics_example.py b/examples/src/main/python/mllib/summary_statistics_example.py index fef018127451d..2048bc432aa77 100644 --- a/examples/src/main/python/mllib/summary_statistics_example.py +++ b/examples/src/main/python/mllib/summary_statistics_example.py @@ -26,20 +26,20 @@ # $example off$ if __name__ == "__main__": - sc = SparkContext(appName="SummaryStatisticsExample") # SparkContext + sc = SparkContext(appName="SummaryStatisticsExample") # SparkContext sqlContext = SQLContext(sc) # $example on$ v1 = np.array([1.0, 2.0, 3.0]) v2 = np.array([10.0, 20.0, 30.0]) v3 = np.array([100.0, 200.0, 300.0]) - mat = sc.parallelize([v1, v2, v3]) # an RDD of Vectors + mat = sc.parallelize([v1, v2, v3]) # an RDD of Vectors # Compute column summary statistics. 
summary = Statistics.colStats(mat) - print(summary.mean()) # a dense vector containing the mean value for each column - print(summary.variance()) # column-wise variance - print(summary.numNonzeros()) # number of nonzeros in each column + print(summary.mean()) # a dense vector containing the mean value for each column + print(summary.variance()) # column-wise variance + print(summary.numNonzeros()) # number of nonzeros in each column # $example off$ - sc.stop() \ No newline at end of file + sc.stop() From 0df3e65b7ea64165ec4e9301ddc8e91c1abcd082 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Tue, 16 Feb 2016 16:05:24 +0800 Subject: [PATCH 17/26] [SPARK-13019] remove setMaster, change java to 2-indent --- .../mllib/JavaHypothesisTestingExample.java | 72 +++++++++---------- ...isTestingKolmogorovSmirnovTestExample.java | 37 +++++----- .../JavaKernelDensityEstimationExample.java | 40 +++++------ .../JavaRandomDataGenerationExample.java | 62 +++++++--------- .../mllib/HypothesisTestingExample.scala | 3 +- ...sTestingKolmogorovSmirnovTestExample.scala | 2 - .../KernelDensityEstimationExample.scala | 3 +- .../mllib/RandomDataGenerationExample.scala | 3 +- 8 files changed, 97 insertions(+), 125 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java index de6330667915f..f1532ddee7bf5 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java @@ -25,56 +25,52 @@ import org.apache.spark.mllib.stat.Statistics; import org.apache.spark.mllib.stat.test.ChiSqTestResult; // $example off$ - import org.apache.spark.SparkConf; -import org.apache.spark.sql.SQLContext; import org.apache.spark.mllib.linalg.Vectors; import java.util.Arrays; public class JavaHypothesisTestingExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample").setMaster("local[*]"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + public static void main(String[] args) { - // $example on$ - Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25); // a vector composed of the frequencies of events + SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); - // compute the goodness of fit. If a second vector to test against is not supplied as a parameter, - // the test runs against a uniform distribution. - ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec); - // summary of the test including the p-value, degrees of freedom, test statistic, the method used, - // and the null hypothesis. - System.out.println(goodnessOfFitTestResult); + // $example on$ + Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25); // a vector composed of the frequencies of events - // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) - Matrix mat = Matrices.dense(3, 2, new double[] {1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); // a contingency matrix + // compute the goodness of fit. If a second vector to test against is not supplied as a parameter, + // the test runs against a uniform distribution. 
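[editor note] As the comment above says, `chiSqTest(vec)` with no second argument runs a goodness-of-fit test against the uniform distribution. A short sketch of both forms follows; it is local-only (no SparkContext needed), and the expected vector is an illustrative value, not taken from the patch.

{% highlight scala %}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics

// observed event frequencies
val observed = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)

// no expected vector supplied: the test runs against a uniform distribution
val vsUniform = Statistics.chiSqTest(observed)
println(vsUniform.pValue)

// expected vector supplied: the test runs against it instead
val expected = Vectors.dense(0.3, 0.25, 0.2, 0.15, 0.1)
val vsExpected = Statistics.chiSqTest(observed, expected)
println(vsExpected.pValue)
{% endhighlight %}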
+ ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec); + // summary of the test including the p-value, degrees of freedom, test statistic, the method used, + // and the null hypothesis. + System.out.println(goodnessOfFitTestResult); - // conduct Pearson's independence test on the input contingency matrix - ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat); - // summary of the test including the p-value, degrees of freedom... - System.out.println(independenceTestResult); + // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) + Matrix mat = Matrices.dense(3, 2, new double[] {1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); // a contingency matrix - LabeledPoint p1 = new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)); - LabeledPoint p2 = new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)); - LabeledPoint p3 = new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5)); - JavaRDD obs = jsc.parallelize(Arrays.asList(p1, p2, p3)); // an RDD of labeled points + // conduct Pearson's independence test on the input contingency matrix + ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat); + // summary of the test including the p-value, degrees of freedom... + System.out.println(independenceTestResult); - // The contingency table is constructed from the raw (feature, label) pairs and used to conduct - // the independence test. Returns an array containing the ChiSquaredTestResult for every feature - // against the label. - ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd()); - int i = 1; - for (ChiSqTestResult result : featureTestResults) { - System.out.println("Column " + i + ":"); - System.out.println(result); // summary of the test - i++; - } + LabeledPoint p1 = new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)); + LabeledPoint p2 = new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)); + LabeledPoint p3 = new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5)); + JavaRDD obs = jsc.parallelize(Arrays.asList(p1, p2, p3)); // an RDD of labeled points - // $example off$ - - jsc.stop(); + // The contingency table is constructed from the raw (feature, label) pairs and used to conduct + // the independence test. Returns an array containing the ChiSquaredTestResult for every feature + // against the label. 
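[editor note] The contingency-table comment above describes the per-feature form: `Statistics.chiSqTest` on an `RDD[LabeledPoint]` returns one `ChiSqTestResult` per feature column, each tested against the label. A minimal sketch under that reading (object and app names are assumptions for illustration):

{% highlight scala %}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics

// Illustrative driver, not part of the patch.
object FeatureChiSqSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("FeatureChiSqSketch"))

    // an RDD of labeled points; features are treated as categorical
    val obs = sc.parallelize(Seq(
      LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
      LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
      LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))))

    // one ChiSqTestResult per feature, each against the label
    val featureTestResults = Statistics.chiSqTest(obs)
    featureTestResults.zipWithIndex.foreach { case (result, i) =>
      println(s"Column ${i + 1}: p-value = ${result.pValue}")
    }

    sc.stop()
  }
}
{% endhighlight %}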
+ ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd()); + int i = 1; + for (ChiSqTestResult result : featureTestResults) { + System.out.println("Column " + i + ":"); + System.out.println(result); // summary of the test + i++; } + // $example off$ + + jsc.stop(); + } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java index 875c2c8777c3b..2e1e9af224257 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java @@ -25,29 +25,24 @@ // $example off$ import org.apache.spark.SparkConf; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.mllib.linalg.Vectors; import java.util.Arrays; public class JavaHypothesisTestingKolmogorovSmirnovTestExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample") - .setMaster("local[*]"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); - - // $example on$ - JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25)); - KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); - // summary of the test including the p-value, test statistic, - // and null hypothesis - // if our p-value indicates significance, we can reject the null hypothesis - System.out.println(testResult); - - // $example off$ - - jsc.stop(); - } + public static void main(String[] args) { + + SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); + + // $example on$ + JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25)); + KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); + // summary of the test including the p-value, test statistic, + // and null hypothesis + // if our p-value indicates significance, we can reject the null hypothesis + System.out.println(testResult); + // $example off$ + + jsc.stop(); + } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java index 72781eedfe635..f637d78574fc4 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java @@ -24,38 +24,32 @@ import org.apache.spark.mllib.stat.KernelDensity; import org.apache.spark.rdd.RDD; // $example off$ - import org.apache.spark.SparkConf; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.mllib.linalg.Vectors; import java.util.Arrays; - public class JavaKernelDensityEstimationExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample").setMaster("local[*]"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + public static void main(String[] args) { - // $example on$ - JavaRDD data = jsc.parallelize( - 
Arrays.asList(1.0,1.0,1.0,2.0,3.0,4.0,5.0,5.0,6.0,7.0,8.0,9.0,9.0)); // an RDD of sample data + SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); - // Construct the density estimator with the sample data and a standard deviation for the Gaussian - // kernels - KernelDensity kd = new KernelDensity() - .setSample(data) - .setBandwidth(3.0); + // $example on$ + JavaRDD data = jsc.parallelize( + Arrays.asList(1.0,1.0,1.0,2.0,3.0,4.0,5.0,5.0,6.0,7.0,8.0,9.0,9.0)); // an RDD of sample data - // Find density estimates for the given values - double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0}); + // Construct the density estimator with the sample data and a standard deviation for the Gaussian + // kernels + KernelDensity kd = new KernelDensity() + .setSample(data) + .setBandwidth(3.0); - // $example off$ + // Find density estimates for the given values + double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0}); + // $example off$ - System.out.println(Arrays.toString(densities)); + System.out.println(Arrays.toString(densities)); - jsc.stop(); - } + jsc.stop(); + } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java index 46bd1889bb803..0b95cc6868512 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java @@ -18,7 +18,6 @@ package org.apache.spark.examples.mllib; // $example on$ -import org.apache.spark.SparkContext; import org.apache.spark.api.java.JavaDoubleRDD; import org.apache.spark.api.java.JavaRDD; import static org.apache.spark.mllib.random.RandomRDDs.*; @@ -28,46 +27,39 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.VoidFunction; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.mllib.linalg.Vectors; -import java.util.Arrays; - - public class JavaRandomDataGenerationExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaRandomDataGenerationExample").setMaster("local[*]"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + public static void main(String[] args) { - // $example on$ - // Generate a random double RDD that contains 1 million i.i.d. values drawn from the - // standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. - JavaDoubleRDD u = normalJavaRDD(jsc, 1000L, 10); - // Apply a transform to get a random double RDD following `N(1, 4)`. - JavaRDD v = u.map( - new Function() { - public Double call(Double x) { - return 1.0 + 2.0 * x; - } - }); + SparkConf conf = new SparkConf().setAppName("JavaRandomDataGenerationExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); - // $example off$ + // $example on$ + // Generate a random double RDD that contains 1 million i.i.d. values drawn from the + // standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. + JavaDoubleRDD u = normalJavaRDD(jsc, 1000L, 10); + // Apply a transform to get a random double RDD following `N(1, 4)`. 
+ JavaRDD v = u.map( + new Function() { + public Double call(Double x) { + return 1.0 + 2.0 * x; + } + }); + // $example off$ - u.foreach(new VoidFunction() { - public void call(Double d) throws Exception { - System.out.println(d); - } - }); + u.foreach(new VoidFunction() { + public void call(Double d) throws Exception { + System.out.println(d); + } + }); - v.foreach(new VoidFunction() { - public void call(Double d) throws Exception { - System.out.println(d); - } - }); + v.foreach(new VoidFunction() { + public void call(Double d) throws Exception { + System.out.println(d); + } + }); - jsc.stop(); - } + jsc.stop(); + } } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala index fe3c280cf0082..c09e99011e579 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala @@ -31,7 +31,7 @@ object HypothesisTestingExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("HypothesisTestingExample").setMaster("local[*]") + val conf = new SparkConf().setAppName("HypothesisTestingExample") val sc = new SparkContext(conf) // $example on$ @@ -65,7 +65,6 @@ object HypothesisTestingExample { println(s"Column $i:\n$result") i += 1 } // summary of the test - // $example off$ sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala index 7ed96766fcd11..78660f45a43a0 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala @@ -29,7 +29,6 @@ object HypothesisTestingKolmogorovSmirnovTestExample { def main(args: Array[String]) { val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample") - .setMaster("local[*]") val sc = new SparkContext(conf) // $example on$ @@ -45,7 +44,6 @@ object HypothesisTestingKolmogorovSmirnovTestExample { val myCDF = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1) val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF) println(testResult2) - // $example off$ sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala index 31b5a5e1ad05c..402276ff086d8 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala @@ -28,7 +28,7 @@ object KernelDensityEstimationExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("KernelDensityEstimationExample").setMaster("local[*]") + val conf = new SparkConf().setAppName("KernelDensityEstimationExample") val sc = new SparkContext(conf) // $example on$ @@ -43,7 +43,6 @@ object KernelDensityEstimationExample { // Find density estimates for the given values val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) - // $example off$ densities.foreach(print) diff --git 
a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala index 91019a2ac9de5..4ba8badbaa867 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala @@ -27,7 +27,7 @@ object RandomDataGenerationExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("RandomDataGenerationExample").setMaster("local[*]") + val conf = new SparkConf().setAppName("RandomDataGenerationExample") val sc = new SparkContext(conf) // $example on$ @@ -36,7 +36,6 @@ object RandomDataGenerationExample { val u = normalRDD(sc, 1000L, 10) // Apply a transform to get a random double RDD following `N(1, 4)`. val v = u.map(x => 1.0 + 2.0 * x) - // $example off$ u.foreach(print) v.foreach(print) From d817d0bbbea4913688bd6c3c66cecf95b9dbe198 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Sat, 20 Feb 2016 11:03:23 -0800 Subject: [PATCH 18/26] [SPARK-13019] more java style fix --- .../mllib/JavaCorrelationsExample.java | 59 ++++++----- .../mllib/JavaHypothesisTestingExample.java | 17 ++-- ...isTestingKolmogorovSmirnovTestExample.java | 3 +- .../JavaKernelDensityEstimationExample.java | 11 +-- .../JavaRandomDataGenerationExample.java | 32 +++--- .../mllib/JavaStratifiedSamplingExample.java | 99 +++++++++---------- .../mllib/JavaSummaryStatisticsExample.java | 41 ++++---- 7 files changed, 126 insertions(+), 136 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java index e12481fab10c0..f54da71d35040 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java @@ -17,53 +17,52 @@ package org.apache.spark.examples.mllib; +import org.apache.spark.SparkConf; // $example on$ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaDoubleRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.linalg.*; +import org.apache.spark.mllib.linalg.Matrix; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.stat.Statistics; // $example off$ - -import org.apache.spark.SparkConf; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.mllib.linalg.Vectors; import java.util.Arrays; public class JavaCorrelationsExample { - public static void main(String[] args) { + public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample").setMaster("local[*]"); - JavaSparkContext jsc = new JavaSparkContext(conf); + SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); - // $example on$ - JavaDoubleRDD seriesX = jsc.parallelizeDoubles( - Arrays.asList(new Double[]{1.0, 2.0, 3.0, 3.0, 5.0})); // a series + // $example on$ + JavaDoubleRDD seriesX = jsc.parallelizeDoubles( + Arrays.asList(new Double[]{1.0, 2.0, 3.0, 3.0, 5.0})); // a series - // must have the same number of partitions and cardinality as seriesX - JavaDoubleRDD seriesY = jsc.parallelizeDoubles( - Arrays.asList(new Double[]{11.0, 22.0, 33.0, 33.0, 555.0})); + // must have the same number of partitions 
and cardinality as seriesX + JavaDoubleRDD seriesY = jsc.parallelizeDoubles( + Arrays.asList(new Double[]{11.0, 22.0, 33.0, 33.0, 555.0})); - // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a - // method is not specified, Pearson's method will be used by default. - Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson"); - System.out.println("correlation is: " + correlation); + // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a + // method is not specified, Pearson's method will be used by default. + Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson"); + System.out.println("correlation is: " + correlation); - Vector v1 = Vectors.dense(1.0, 10.0, 100.0); - Vector v2 = Vectors.dense(2.0, 20.0, 200.0); - Vector v3 = Vectors.dense(5.0, 33.0, 366.0); + Vector v1 = Vectors.dense(1.0, 10.0, 100.0); + Vector v2 = Vectors.dense(2.0, 20.0, 200.0); + Vector v3 = Vectors.dense(5.0, 33.0, 366.0); - // note that each Vector is a row and not a column - JavaRDD data = jsc.parallelize(Arrays.asList(v1, v2, v3)); + // note that each Vector is a row and not a column + JavaRDD data = jsc.parallelize(Arrays.asList(v1, v2, v3)); - // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. - // If a method is not specified, Pearson's method will be used by default. - Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson"); - System.out.println(correlMatrix.toString()); - // $example off$ + // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. + // If a method is not specified, Pearson's method will be used by default. + Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson"); + System.out.println(correlMatrix.toString()); + // $example off$ - jsc.stop(); - } + jsc.stop(); + } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java index f1532ddee7bf5..6c7ef401483f9 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java @@ -17,19 +17,20 @@ package org.apache.spark.examples.mllib; +import org.apache.spark.SparkConf; // $example on$ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.linalg.*; +import org.apache.spark.mllib.linalg.Matrices; +import org.apache.spark.mllib.linalg.Matrix; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.stat.Statistics; import org.apache.spark.mllib.stat.test.ChiSqTestResult; // $example off$ -import org.apache.spark.SparkConf; -import org.apache.spark.mllib.linalg.Vectors; import java.util.Arrays; - public class JavaHypothesisTestingExample { public static void main(String[] args) { @@ -47,7 +48,7 @@ public static void main(String[] args) { System.out.println(goodnessOfFitTestResult); // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) - Matrix mat = Matrices.dense(3, 2, new double[] {1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); // a contingency matrix + Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); // a contingency matrix // conduct Pearson's 
independence test on the input contingency matrix ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat); @@ -65,9 +66,9 @@ public static void main(String[] args) { ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd()); int i = 1; for (ChiSqTestResult result : featureTestResults) { - System.out.println("Column " + i + ":"); - System.out.println(result); // summary of the test - i++; + System.out.println("Column " + i + ":"); + System.out.println(result); // summary of the test + i++; } // $example off$ diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java index 2e1e9af224257..238785019c814 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java @@ -17,14 +17,13 @@ package org.apache.spark.examples.mllib; +import org.apache.spark.SparkConf; // $example on$ import org.apache.spark.api.java.JavaDoubleRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.stat.Statistics; import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult; // $example off$ - -import org.apache.spark.SparkConf; import java.util.Arrays; public class JavaHypothesisTestingKolmogorovSmirnovTestExample { diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java index f637d78574fc4..ec3241af2c7ea 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java @@ -17,14 +17,13 @@ package org.apache.spark.examples.mllib; +import org.apache.spark.SparkConf; // $example on$ -import org.apache.spark.api.java.JavaDoubleRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.stat.KernelDensity; import org.apache.spark.rdd.RDD; // $example off$ -import org.apache.spark.SparkConf; import java.util.Arrays; public class JavaKernelDensityEstimationExample { @@ -35,16 +34,16 @@ public static void main(String[] args) { // $example on$ JavaRDD data = jsc.parallelize( - Arrays.asList(1.0,1.0,1.0,2.0,3.0,4.0,5.0,5.0,6.0,7.0,8.0,9.0,9.0)); // an RDD of sample data + Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0)); // an RDD of sample data // Construct the density estimator with the sample data and a standard deviation for the Gaussian // kernels KernelDensity kd = new KernelDensity() - .setSample(data) - .setBandwidth(3.0); + .setSample(data) + .setBandwidth(3.0); // Find density estimates for the given values - double[] densities = kd.estimate(new double[] {-1.0, 2.0, 5.0}); + double[] densities = kd.estimate(new double[]{-1.0, 2.0, 5.0}); // $example off$ System.out.println(Arrays.toString(densities)); diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java index 0b95cc6868512..341b47acf8543 100644 --- 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java @@ -17,17 +17,17 @@ package org.apache.spark.examples.mllib; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.VoidFunction; // $example on$ import org.apache.spark.api.java.JavaDoubleRDD; import org.apache.spark.api.java.JavaRDD; + import static org.apache.spark.mllib.random.RandomRDDs.*; // $example off$ -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.api.java.function.VoidFunction; - public class JavaRandomDataGenerationExample { public static void main(String[] args) { @@ -40,23 +40,23 @@ public static void main(String[] args) { JavaDoubleRDD u = normalJavaRDD(jsc, 1000L, 10); // Apply a transform to get a random double RDD following `N(1, 4)`. JavaRDD v = u.map( - new Function() { - public Double call(Double x) { - return 1.0 + 2.0 * x; - } - }); + new Function() { + public Double call(Double x) { + return 1.0 + 2.0 * x; + } + }); // $example off$ u.foreach(new VoidFunction() { - public void call(Double d) throws Exception { - System.out.println(d); - } + public void call(Double d) throws Exception { + System.out.println(d); + } }); v.foreach(new VoidFunction() { - public void call(Double d) throws Exception { - System.out.println(d); - } + public void call(Double d) throws Exception { + System.out.println(d); + } }); jsc.stop(); diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java index 7371e274dcbb1..41bc0aa92525a 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java @@ -17,66 +17,59 @@ package org.apache.spark.examples.mllib; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.VoidFunction; +import scala.Tuple2; // $example on$ import java.util.*; - import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaSparkContext; // $example off$ -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.VoidFunction; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.mllib.linalg.Vectors; -import scala.Tuple2; - - public class JavaStratifiedSamplingExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample").setMaster("local[*]"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); - - // $example on$ - List> list = new ArrayList<>(); - list.add(new Tuple2(1,'a')); - list.add(new Tuple2(1, 'b')); - list.add(new Tuple2(2,'c')); - list.add(new Tuple2(2,'d')); - list.add(new Tuple2(2, 'e')); - list.add(new Tuple2(3,'f')); - - // an RDD of any key value pairs JavaPairRDD - JavaPairRDD data = jsc.parallelizePairs(list); - - // specify the exact fraction desired from each key Map - Map fractions = new HashMap<>(); - - fractions.put(1, 0.1); - fractions.put(2, 0.6); - fractions.put(3, 
0.3); - - // Get an exact sample from each stratum - JavaPairRDD approxSample = data.sampleByKey(false, fractions); // JavaPairRDD - JavaPairRDD exactSample = data.sampleByKeyExact(false, fractions); // JavaPairRDD - - // $example off$ - - approxSample.foreach(new VoidFunction>() { - public void call(Tuple2 t) throws Exception { - System.out.println(t._1() + " " + t._2()); - } - }); - - exactSample.foreach(new VoidFunction>() { - public void call(Tuple2 t) throws Exception { - System.out.println(t._1() + " " + t._2()); - } - }); - - jsc.stop(); - } + public static void main(String[] args) { + + SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); + + // $example on$ + List> list = new ArrayList<>(); + list.add(new Tuple2(1, 'a')); + list.add(new Tuple2(1, 'b')); + list.add(new Tuple2(2, 'c')); + list.add(new Tuple2(2, 'd')); + list.add(new Tuple2(2, 'e')); + list.add(new Tuple2(3, 'f')); + + // an RDD of any key value pairs JavaPairRDD + JavaPairRDD data = jsc.parallelizePairs(list); + + // specify the exact fraction desired from each key Map + Map fractions = new HashMap<>(); + + fractions.put(1, 0.1); + fractions.put(2, 0.6); + fractions.put(3, 0.3); + + // Get an exact sample from each stratum + JavaPairRDD approxSample = data.sampleByKey(false, fractions); // JavaPairRDD + JavaPairRDD exactSample = data.sampleByKeyExact(false, fractions); // JavaPairRDD + // $example off$ + + approxSample.foreach(new VoidFunction>() { + public void call(Tuple2 t) throws Exception { + System.out.println(t._1() + " " + t._2()); + } + }); + + exactSample.foreach(new VoidFunction>() { + public void call(Tuple2 t) throws Exception { + System.out.println(t._1() + " " + t._2()); + } + }); + + jsc.stop(); + } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java index ed7f9637e7627..eab0a1d9f2844 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java @@ -17,6 +17,9 @@ package org.apache.spark.examples.mllib; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.mllib.linalg.Vectors; // $example on$ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -24,34 +27,30 @@ import org.apache.spark.mllib.stat.MultivariateStatisticalSummary; import org.apache.spark.mllib.stat.Statistics; // $example off$ - -import org.apache.spark.SparkConf; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.mllib.linalg.Vectors; import java.util.Arrays; public class JavaSummaryStatisticsExample { - public static void main(String[] args) { + public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaSummaryStatisticsExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); + SparkConf conf = new SparkConf().setAppName("JavaSummaryStatisticsExample"); + JavaSparkContext jsc = new JavaSparkContext(conf); + SQLContext sqlContext = new SQLContext(jsc); - // $example on$ - Vector v1 = Vectors.dense(1.0, 10.0, 100.0); - Vector v2 = Vectors.dense(2.0, 20.0, 200.0); - Vector v3 = Vectors.dense(3.0, 30.0, 300.0); + // $example on$ + Vector v1 = Vectors.dense(1.0, 10.0, 100.0); + Vector v2 = 
Vectors.dense(2.0, 20.0, 200.0); + Vector v3 = Vectors.dense(3.0, 30.0, 300.0); - JavaRDD mat = jsc.parallelize(Arrays.asList(v1, v2, v3)); // an RDD of Vectors + JavaRDD mat = jsc.parallelize(Arrays.asList(v1, v2, v3)); // an RDD of Vectors - // Compute column summary statistics. - MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd()); - System.out.println(summary.mean()); // a dense vector containing the mean value for each column - System.out.println(summary.variance()); // column-wise variance - System.out.println(summary.numNonzeros()); // number of nonzeros in each column - // $example off$ + // Compute column summary statistics. + MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd()); + System.out.println(summary.mean()); // a dense vector containing the mean value for each column + System.out.println(summary.variance()); // column-wise variance + System.out.println(summary.numNonzeros()); // number of nonzeros in each column + // $example off$ - jsc.stop(); - } + jsc.stop(); + } } From f945222ad1cfd1f4756258a49b372aa7bd32d9fc Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Sun, 21 Feb 2016 14:09:58 -0800 Subject: [PATCH 19/26] [SPARK-13019] mainly re-organize java import --- .../examples/mllib/JavaCorrelationsExample.java | 8 ++++---- .../mllib/JavaHypothesisTestingExample.java | 3 ++- ...hesisTestingKolmogorovSmirnovTestExample.java | 6 +++--- .../JavaKernelDensityEstimationExample.java | 11 ++++------- .../mllib/JavaStratifiedSamplingExample.java | 16 +++++++++++----- .../mllib/JavaSummaryStatisticsExample.java | 10 ++++------ .../main/python/mllib/correlations_example.py | 9 +++------ .../python/mllib/hypothesis_testing_example.py | 4 ---- ...is_testing_kolmogorov_smirnov_test_example.py | 9 ++------- .../mllib/kernel_density_estimation_example.py | 4 ---- .../mllib/random_data_generation_example.py | 4 ---- .../python/mllib/stratified_sampling_example.py | 8 -------- .../python/mllib/summary_statistics_example.py | 3 --- .../examples/mllib/CorrelationsExample.scala | 2 +- .../mllib/RandomDataGenerationExample.scala | 1 + .../mllib/StratifiedSamplingExample.scala | 5 +---- .../mllib/SummaryStatisticsExample.scala | 2 +- 17 files changed, 37 insertions(+), 68 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java index f54da71d35040..c3e3a789e755b 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java @@ -17,6 +17,8 @@ package org.apache.spark.examples.mllib; +import java.util.Arrays; + import org.apache.spark.SparkConf; // $example on$ import org.apache.spark.api.java.JavaRDD; @@ -27,8 +29,6 @@ import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.stat.Statistics; // $example off$ -import java.util.Arrays; - public class JavaCorrelationsExample { public static void main(String[] args) { @@ -44,8 +44,8 @@ public static void main(String[] args) { JavaDoubleRDD seriesY = jsc.parallelizeDoubles( Arrays.asList(new Double[]{11.0, 22.0, 33.0, 33.0, 555.0})); - // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a - // method is not specified, Pearson's method will be used by default. + // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. 
+ // If a method is not specified, Pearson's method will be used by default. Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson"); System.out.println("correlation is: " + correlation); diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java index 6c7ef401483f9..8be28a11aff57 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java @@ -17,6 +17,8 @@ package org.apache.spark.examples.mllib; +import java.util.Arrays; + import org.apache.spark.SparkConf; // $example on$ import org.apache.spark.api.java.JavaRDD; @@ -29,7 +31,6 @@ import org.apache.spark.mllib.stat.Statistics; import org.apache.spark.mllib.stat.test.ChiSqTestResult; // $example off$ -import java.util.Arrays; public class JavaHypothesisTestingExample { public static void main(String[] args) { diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java index 238785019c814..9ae2907a38084 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java @@ -17,6 +17,8 @@ package org.apache.spark.examples.mllib; +import java.util.Arrays; + import org.apache.spark.SparkConf; // $example on$ import org.apache.spark.api.java.JavaDoubleRDD; @@ -24,7 +26,6 @@ import org.apache.spark.mllib.stat.Statistics; import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult; // $example off$ -import java.util.Arrays; public class JavaHypothesisTestingKolmogorovSmirnovTestExample { public static void main(String[] args) { @@ -35,8 +36,7 @@ public static void main(String[] args) { // $example on$ JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25)); KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); - // summary of the test including the p-value, test statistic, - // and null hypothesis + // summary of the test including the p-value, test statistic, and null hypothesis // if our p-value indicates significance, we can reject the null hypothesis System.out.println(testResult); // $example off$ diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java index ec3241af2c7ea..18ccd0a951c7b 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java @@ -17,14 +17,14 @@ package org.apache.spark.examples.mllib; +import java.util.Arrays; + import org.apache.spark.SparkConf; // $example on$ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.stat.KernelDensity; -import org.apache.spark.rdd.RDD; // $example off$ -import java.util.Arrays; public class JavaKernelDensityEstimationExample { public static void main(String[] args) { @@ -36,11 +36,8 @@ public static void 
main(String[] args) { JavaRDD data = jsc.parallelize( Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0)); // an RDD of sample data - // Construct the density estimator with the sample data and a standard deviation for the Gaussian - // kernels - KernelDensity kd = new KernelDensity() - .setSample(data) - .setBandwidth(3.0); + // Construct the density estimator with the sample data and a standard deviation for the Gaussian kernels + KernelDensity kd = new KernelDensity().setSample(data).setBandwidth(3.0); // Find density estimates for the given values double[] densities = kd.estimate(new double[]{-1.0, 2.0, 5.0}); diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java index 41bc0aa92525a..d80592182ed92 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java @@ -17,16 +17,22 @@ package org.apache.spark.examples.mllib; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.VoidFunction; +import java.util.ArrayList; +import java.util.List; +import java.util.HashMap; +// $example on$ +import java.util.Map; +// $example off$ + import scala.Tuple2; + +import org.apache.spark.api.java.function.VoidFunction; // $example on$ -import java.util.*; -import org.apache.spark.api.java.JavaRDD; + import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaSparkContext; // $example off$ +import org.apache.spark.SparkConf; public class JavaStratifiedSamplingExample { public static void main(String[] args) { diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java index eab0a1d9f2844..755e6e5a2982f 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java @@ -17,25 +17,23 @@ package org.apache.spark.examples.mllib; -import org.apache.spark.SparkConf; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.mllib.linalg.Vectors; +import java.util.Arrays; + // $example on$ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.stat.MultivariateStatisticalSummary; import org.apache.spark.mllib.stat.Statistics; // $example off$ -import java.util.Arrays; - +import org.apache.spark.SparkConf; public class JavaSummaryStatisticsExample { public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaSummaryStatisticsExample"); JavaSparkContext jsc = new JavaSparkContext(conf); - SQLContext sqlContext = new SQLContext(jsc); // $example on$ Vector v1 = Vectors.dense(1.0, 10.0, 100.0); diff --git a/examples/src/main/python/mllib/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py index e9ccca0dd5593..46bd5ede53baf 100644 --- a/examples/src/main/python/mllib/correlations_example.py +++ b/examples/src/main/python/mllib/correlations_example.py @@ -18,23 +18,21 @@ from __future__ import print_function from pyspark import 
SparkContext -from pyspark.sql import SQLContext import numpy as np -from pyspark.mllib.linalg import Vectors # $example on$ from pyspark.mllib.stat import Statistics # $example off$ if __name__ == "__main__": - # $example on$ sc = SparkContext(appName="CorrelationsExample") # SparkContext + # $example on$ seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0]) # a series # seriesY must have the same number of partitions and cardinality as seriesX seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0]) - # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a - # method is not specified, Pearson's method will be used by default. + # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. + # If a method is not specified, Pearson's method will be used by default. print(Statistics.corr(seriesX, seriesY, method="pearson")) v1 = np.array([1.0, 10.0, 100.0]) @@ -45,7 +43,6 @@ # calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. # If a method is not specified, Pearson's method will be used by default. print(Statistics.corr(data, method="pearson")) - # $example off$ sc.stop() diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py index 4d37e394af3b7..f548d6566ba2a 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_example.py @@ -18,8 +18,6 @@ from __future__ import print_function from pyspark import SparkContext -from pyspark.sql import SQLContext -import numpy as np from pyspark.mllib.linalg import Vectors # $example on$ from pyspark import SparkContext @@ -30,7 +28,6 @@ if __name__ == "__main__": sc = SparkContext(appName="HypothesisTestingExample") # SparkContext - sqlContext = SQLContext(sc) # $example on$ vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25) # a vector composed of the frequencies of events @@ -60,7 +57,6 @@ for i, result in enumerate(featureTestResults): print("Column: " + str(i + 1)) print(result) - # $example off$ sc.stop() diff --git a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py index 3e3c6ba0b96a8..15d63ef86b2e7 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py @@ -18,28 +18,23 @@ from __future__ import print_function from pyspark import SparkContext -from pyspark.sql import SQLContext -import numpy as np -from pyspark.mllib.linalg import Vectors # $example on$ from pyspark.mllib.stat import Statistics # $example off$ if __name__ == "__main__": sc = SparkContext(appName="HypothesisTestingKolmogorovSmirnovTestExample") # SparkContext - sqlContext = SQLContext(sc) # $example on$ parallelData = sc.parallelize([0.1, 0.15, 0.2, 0.3, 0.25]) # run a KS test for the sample versus a standard normal distribution testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1) - print(testResult) # summary of the test including the p-value, test statistic, - # and null hypothesis + # summary of the test including the p-value, test statistic, and null hypothesis # if our p-value indicates significance, we can reject the null hypothesis # Note that the Scala functionality of calling Statistics.kolmogorovSmirnovTest with # a lambda to calculate the CDF 
is not made available in the Python API - + print(testResult) # $example off$ sc.stop() diff --git a/examples/src/main/python/mllib/kernel_density_estimation_example.py b/examples/src/main/python/mllib/kernel_density_estimation_example.py index 746027e6d599f..3e8f7241a4a1e 100644 --- a/examples/src/main/python/mllib/kernel_density_estimation_example.py +++ b/examples/src/main/python/mllib/kernel_density_estimation_example.py @@ -18,16 +18,12 @@ from __future__ import print_function from pyspark import SparkContext -from pyspark.sql import SQLContext -import numpy as np -from pyspark.mllib.linalg import Vectors # $example on$ from pyspark.mllib.stat import KernelDensity # $example off$ if __name__ == "__main__": sc = SparkContext(appName="KernelDensityEstimationExample") # SparkContext - sqlContext = SQLContext(sc) # $example on$ # an RDD of sample data diff --git a/examples/src/main/python/mllib/random_data_generation_example.py b/examples/src/main/python/mllib/random_data_generation_example.py index 7eb2ea81a8038..c08d631c5db7d 100644 --- a/examples/src/main/python/mllib/random_data_generation_example.py +++ b/examples/src/main/python/mllib/random_data_generation_example.py @@ -18,16 +18,12 @@ from __future__ import print_function from pyspark import SparkContext -from pyspark.sql import SQLContext -import numpy as np -from pyspark.mllib.linalg import Vectors # $example on$ from pyspark.mllib.random import RandomRDDs # $example off$ if __name__ == "__main__": sc = SparkContext(appName="RandomDataGenerationExample") # SparkContext - sqlContext = SQLContext(sc) # $example on$ # Generate a random double RDD that contains 1 million i.i.d. values drawn from the diff --git a/examples/src/main/python/mllib/stratified_sampling_example.py b/examples/src/main/python/mllib/stratified_sampling_example.py index 63bf2dddede01..a13f8f08dd68b 100644 --- a/examples/src/main/python/mllib/stratified_sampling_example.py +++ b/examples/src/main/python/mllib/stratified_sampling_example.py @@ -18,16 +18,9 @@ from __future__ import print_function from pyspark import SparkContext -from pyspark.sql import SQLContext -import numpy as np -from pyspark.mllib.linalg import Vectors -# $example on$ -from pyspark.mllib.stat import Statistics -# $example off$ if __name__ == "__main__": sc = SparkContext(appName="StratifiedSamplingExample") # SparkContext - sqlContext = SQLContext(sc) # $example on$ # an RDD of any key value pairs @@ -37,7 +30,6 @@ fractions = {1: 0.1, 2: 0.6, 3: 0.3} approxSample = data.sampleByKey(False, fractions) - # $example off$ for each in approxSample.collect(): diff --git a/examples/src/main/python/mllib/summary_statistics_example.py b/examples/src/main/python/mllib/summary_statistics_example.py index 2048bc432aa77..eb6ef272a4f66 100644 --- a/examples/src/main/python/mllib/summary_statistics_example.py +++ b/examples/src/main/python/mllib/summary_statistics_example.py @@ -18,16 +18,13 @@ from __future__ import print_function from pyspark import SparkContext -from pyspark.sql import SQLContext import numpy as np -from pyspark.mllib.linalg import Vectors # $example on$ from pyspark.mllib.stat import Statistics # $example off$ if __name__ == "__main__": sc = SparkContext(appName="SummaryStatisticsExample") # SparkContext - sqlContext = SQLContext(sc) # $example on$ v1 = np.array([1.0, 2.0, 3.0]) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala index 
9b3c0321f067d..e395a25dc6a2e 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala @@ -29,7 +29,7 @@ object CorrelationsExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("CorrelationsExample").setMaster("local[*]") + val conf = new SparkConf().setAppName("CorrelationsExample") val sc = new SparkContext(conf) // $example on$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala index 4ba8badbaa867..de48ae58ce0e1 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala @@ -37,6 +37,7 @@ object RandomDataGenerationExample { // Apply a transform to get a random double RDD following `N(1, 4)`. val v = u.map(x => 1.0 + 2.0 * x) // $example off$ + u.foreach(print) v.foreach(print) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala index 24a5407426894..453e4a2f9d283 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala @@ -21,16 +21,13 @@ package org.apache.spark.examples.mllib // $example on$ import org.apache.spark.{SparkConf, SparkContext} // $example off$ -import org.apache.spark.sql.SQLContext - object StratifiedSamplingExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("StratifiedSamplingExample").setMaster("local[*]") + val conf = new SparkConf().setAppName("StratifiedSamplingExample") val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) // $example on$ // an RDD[(K, V)] of any key value pairs diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala index c2fe7976b4609..675c07aeab954 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala @@ -28,7 +28,7 @@ object SummaryStatisticsExample { def main(args: Array[String]) { - val conf = new SparkConf().setAppName("SummaryStatisticsExample").setMaster("local[*]") + val conf = new SparkConf().setAppName("SummaryStatisticsExample") val sc = new SparkContext(conf) // $example on$ From aec10cac879f29a3edb03e0855e60a772ca30c18 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Tue, 23 Feb 2016 13:18:09 -0800 Subject: [PATCH 20/26] [SPARK-13019] re-organize python import --- examples/src/main/python/mllib/correlations_example.py | 3 ++- examples/src/main/python/mllib/hypothesis_testing_example.py | 2 -- examples/src/main/python/mllib/summary_statistics_example.py | 3 ++- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/src/main/python/mllib/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py index 46bd5ede53baf..a0d2e7d8be7eb 100644 --- a/examples/src/main/python/mllib/correlations_example.py +++ b/examples/src/main/python/mllib/correlations_example.py @@ -17,8 +17,9 @@ from 
__future__ import print_function -from pyspark import SparkContext import numpy as np + +from pyspark import SparkContext # $example on$ from pyspark.mllib.stat import Statistics # $example off$ diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py index f548d6566ba2a..ca3d12bc153fd 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_example.py @@ -18,9 +18,7 @@ from __future__ import print_function from pyspark import SparkContext -from pyspark.mllib.linalg import Vectors # $example on$ -from pyspark import SparkContext from pyspark.mllib.linalg import Vectors, Matrices from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.stat import Statistics diff --git a/examples/src/main/python/mllib/summary_statistics_example.py b/examples/src/main/python/mllib/summary_statistics_example.py index eb6ef272a4f66..088f9f8807498 100644 --- a/examples/src/main/python/mllib/summary_statistics_example.py +++ b/examples/src/main/python/mllib/summary_statistics_example.py @@ -17,8 +17,9 @@ from __future__ import print_function -from pyspark import SparkContext import numpy as np + +from pyspark import SparkContext # $example on$ from pyspark.mllib.stat import Statistics # $example off$ From e2737eedd6c45c82f25045442b1d811ab2c395ec Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Sun, 6 Mar 2016 14:45:28 -0800 Subject: [PATCH 21/26] [SPARK-13019] code review improvement --- docs/mllib-statistics.md | 44 ++++++++++++- .../mllib/JavaCorrelationsExample.java | 14 ++-- .../mllib/JavaHypothesisTestingExample.java | 30 +++++---- ...isTestingKolmogorovSmirnovTestExample.java | 12 ++-- .../JavaKernelDensityEstimationExample.java | 12 ++-- .../JavaRandomDataGenerationExample.java | 65 ------------------- .../mllib/JavaStratifiedSamplingExample.java | 22 +++---- .../mllib/JavaSummaryStatisticsExample.java | 12 ++-- .../main/python/mllib/correlations_example.py | 2 +- .../mllib/hypothesis_testing_example.py | 7 +- .../mllib/summary_statistics_example.py | 10 +-- .../examples/mllib/CorrelationsExample.scala | 8 +-- .../mllib/HypothesisTestingExample.scala | 22 ++++--- ...sTestingKolmogorovSmirnovTestExample.scala | 11 ++-- .../KernelDensityEstimationExample.scala | 4 +- .../mllib/RandomDataGenerationExample.scala | 48 -------------- .../mllib/StratifiedSamplingExample.scala | 6 +- .../mllib/SummaryStatisticsExample.scala | 8 +-- 18 files changed, 139 insertions(+), 198 deletions(-) delete mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java delete mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md index b06829f0247dd..62faa1bfa45ff 100644 --- a/docs/mllib-statistics.md +++ b/docs/mllib-statistics.md @@ -277,7 +277,18 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`. Refer to the [`RandomRDDs` Scala docs](api/scala/index.html#org.apache.spark.mllib.random.RandomRDDs) for details on the API. -{% include_example scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala %} +-{% highlight scala %} +-import org.apache.spark.SparkContext +-import org.apache.spark.mllib.random.RandomRDDs._ +- +-val sc: SparkContext = ... +- +-// Generate a random double RDD that contains 1 million i.i.d. 
values drawn from the +-// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. +-val u = normalRDD(sc, 1000000L, 10) +-// Apply a transform to get a random double RDD following `N(1, 4)`. +-val v = u.map(x => 1.0 + 2.0 * x) +-{% endhighlight %}
@@ -288,7 +299,24 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`. Refer to the [`RandomRDDs` Java docs](api/java/org/apache/spark/mllib/random/RandomRDDs) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java %} +-{% highlight java %} + -import org.apache.spark.SparkContext; + -import org.apache.spark.api.JavaDoubleRDD; + -import static org.apache.spark.mllib.random.RandomRDDs.*; + - + -JavaSparkContext jsc = ... + - + -// Generate a random double RDD that contains 1 million i.i.d. values drawn from the + -// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. + -JavaDoubleRDD u = normalJavaRDD(jsc, 1000000L, 10); + -// Apply a transform to get a random double RDD following `N(1, 4)`. + -JavaDoubleRDD v = u.map( + - new Function() { + - public Double call(Double x) { + - return 1.0 + 2.0 * x; + - } + - }); + -{% endhighlight %}
@@ -299,7 +327,17 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`. Refer to the [`RandomRDDs` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.random.RandomRDDs) for more details on the API. -{% include_example python/mllib/random_data_generation_example.py %} +-{% highlight python %} + -from pyspark.mllib.random import RandomRDDs + - + -sc = ... # SparkContext + - + -# Generate a random double RDD that contains 1 million i.i.d. values drawn from the + -# standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. + -u = RandomRDDs.normalRDD(sc, 1000000L, 10) + -# Apply a transform to get a random double RDD following `N(1, 4)`. + -v = u.map(lambda x: 1.0 + 2.0 * x) + -{% endhighlight %}
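The three `{% highlight %}` blocks restored above carry stray leading `-` characters inside the fences; a later commit in this series ("removing '-'s") appears to clean these up. For reference, a minimal sketch of what the Scala block describes, assuming a live `SparkContext` named `sc`:

{% highlight scala %}
import org.apache.spark.mllib.random.RandomRDDs._

// Generate a random double RDD that contains 1 million i.i.d. values drawn from the
// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
val u = normalRDD(sc, 1000000L, 10)
// Apply a transform to get a random double RDD following `N(1, 4)`:
// if X ~ N(0, 1), then 1.0 + 2.0 * X has mean 1 and variance 2^2 = 4.
val v = u.map(x => 1.0 + 2.0 * x)
{% endhighlight %}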
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java index c3e3a789e755b..c27c1d01bab58 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java @@ -17,13 +17,13 @@ package org.apache.spark.examples.mllib; -import java.util.Arrays; - import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; // $example on$ -import org.apache.spark.api.java.JavaRDD; +import java.util.Arrays; + import org.apache.spark.api.java.JavaDoubleRDD; -import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.JavaRDD; import org.apache.spark.mllib.linalg.Matrix; import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.linalg.Vectors; @@ -38,16 +38,16 @@ public static void main(String[] args) { // $example on$ JavaDoubleRDD seriesX = jsc.parallelizeDoubles( - Arrays.asList(new Double[]{1.0, 2.0, 3.0, 3.0, 5.0})); // a series + Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0)); // a series // must have the same number of partitions and cardinality as seriesX JavaDoubleRDD seriesY = jsc.parallelizeDoubles( - Arrays.asList(new Double[]{11.0, 22.0, 33.0, 33.0, 555.0})); + Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0)); // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. // If a method is not specified, Pearson's method will be used by default. Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson"); - System.out.println("correlation is: " + correlation); + System.out.println("Correlation is: " + correlation); Vector v1 = Vectors.dense(1.0, 10.0, 100.0); Vector v2 = Vectors.dense(2.0, 20.0, 200.0); diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java index 8be28a11aff57..0960b07a98557 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java @@ -17,12 +17,13 @@ package org.apache.spark.examples.mllib; -import java.util.Arrays; - import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; + // $example on$ +import java.util.Arrays; + import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.linalg.Matrices; import org.apache.spark.mllib.linalg.Matrix; import org.apache.spark.mllib.linalg.Vector; @@ -39,27 +40,31 @@ public static void main(String[] args) { JavaSparkContext jsc = new JavaSparkContext(conf); // $example on$ - Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25); // a vector composed of the frequencies of events + // a vector composed of the frequencies of events + Vector vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25); - // compute the goodness of fit. If a second vector to test against is not supplied as a parameter, - // the test runs against a uniform distribution. + // compute the goodness of fit. If a second vector to test against is not supplied + // as a parameter, the test runs against a uniform distribution. 
ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec); - // summary of the test including the p-value, degrees of freedom, test statistic, the method used, - // and the null hypothesis. + // summary of the test including the p-value, degrees of freedom, test statistic, + // the method used, and the null hypothesis. System.out.println(goodnessOfFitTestResult); + System.out.println(); - // Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) - Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); // a contingency matrix + // Create a contingency matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)) + Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); // conduct Pearson's independence test on the input contingency matrix ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat); // summary of the test including the p-value, degrees of freedom... System.out.println(independenceTestResult); + System.out.println(); LabeledPoint p1 = new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)); LabeledPoint p2 = new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)); LabeledPoint p3 = new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5)); - JavaRDD obs = jsc.parallelize(Arrays.asList(p1, p2, p3)); // an RDD of labeled points + // an RDD of labeled points + JavaRDD obs = jsc.parallelize(Arrays.asList(p1, p2, p3)); // The contingency table is constructed from the raw (feature, label) pairs and used to conduct // the independence test. Returns an array containing the ChiSquaredTestResult for every feature @@ -68,7 +73,8 @@ public static void main(String[] args) { int i = 1; for (ChiSqTestResult result : featureTestResults) { System.out.println("Column " + i + ":"); - System.out.println(result); // summary of the test + System.out.println(result); // summary of the test + System.out.println(); i++; } // $example off$ diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java index 9ae2907a38084..fe611c9ae67c9 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingKolmogorovSmirnovTestExample.java @@ -17,12 +17,12 @@ package org.apache.spark.examples.mllib; -import java.util.Arrays; - import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; // $example on$ +import java.util.Arrays; + import org.apache.spark.api.java.JavaDoubleRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.stat.Statistics; import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult; // $example off$ @@ -30,12 +30,14 @@ public class JavaHypothesisTestingKolmogorovSmirnovTestExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample"); + SparkConf conf = + new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample"); JavaSparkContext jsc = new JavaSparkContext(conf); // $example on$ JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25)); - KolmogorovSmirnovTestResult testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); + KolmogorovSmirnovTestResult testResult = + Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0); // 
summary of the test including the p-value, test statistic, and null hypothesis // if our p-value indicates significance, we can reject the null hypothesis System.out.println(testResult); diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java index 18ccd0a951c7b..2f25b7534164a 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java @@ -17,12 +17,12 @@ package org.apache.spark.examples.mllib; -import java.util.Arrays; - import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; // $example on$ +import java.util.Arrays; + import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.stat.KernelDensity; // $example off$ @@ -33,10 +33,12 @@ public static void main(String[] args) { JavaSparkContext jsc = new JavaSparkContext(conf); // $example on$ + // an RDD of sample data JavaRDD data = jsc.parallelize( - Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0)); // an RDD of sample data + Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0)); - // Construct the density estimator with the sample data and a standard deviation for the Gaussian kernels + // Construct the density estimator with the sample data + // and a standard deviation for the Gaussian kernels KernelDensity kd = new KernelDensity().setSample(data).setBandwidth(3.0); // Find density estimates for the given values diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java deleted file mode 100644 index 341b47acf8543..0000000000000 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomDataGenerationExample.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.examples.mllib; - -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.api.java.function.VoidFunction; -// $example on$ -import org.apache.spark.api.java.JavaDoubleRDD; -import org.apache.spark.api.java.JavaRDD; - -import static org.apache.spark.mllib.random.RandomRDDs.*; -// $example off$ - -public class JavaRandomDataGenerationExample { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("JavaRandomDataGenerationExample"); - JavaSparkContext jsc = new JavaSparkContext(conf); - - // $example on$ - // Generate a random double RDD that contains 1 million i.i.d. values drawn from the - // standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. - JavaDoubleRDD u = normalJavaRDD(jsc, 1000L, 10); - // Apply a transform to get a random double RDD following `N(1, 4)`. - JavaRDD v = u.map( - new Function() { - public Double call(Double x) { - return 1.0 + 2.0 * x; - } - }); - // $example off$ - - u.foreach(new VoidFunction() { - public void call(Double d) throws Exception { - System.out.println(d); - } - }); - - v.foreach(new VoidFunction() { - public void call(Double d) throws Exception { - System.out.println(d); - } - }); - - jsc.stop(); - } -} - diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java index d80592182ed92..0cec6e2e51214 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java @@ -17,22 +17,20 @@ package org.apache.spark.examples.mllib; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; + +// $example on$ import java.util.ArrayList; -import java.util.List; import java.util.HashMap; -// $example on$ +import java.util.List; import java.util.Map; -// $example off$ import scala.Tuple2; -import org.apache.spark.api.java.function.VoidFunction; -// $example on$ - -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.function.VoidFunction; // $example off$ -import org.apache.spark.SparkConf; public class JavaStratifiedSamplingExample { public static void main(String[] args) { @@ -60,8 +58,10 @@ public static void main(String[] args) { fractions.put(3, 0.3); // Get an exact sample from each stratum - JavaPairRDD approxSample = data.sampleByKey(false, fractions); // JavaPairRDD - JavaPairRDD exactSample = data.sampleByKeyExact(false, fractions); // JavaPairRDD + JavaPairRDD approxSample = + data.sampleByKey(false, fractions); // JavaPairRDD + JavaPairRDD exactSample = + data.sampleByKeyExact(false, fractions); // JavaPairRDD // $example off$ approxSample.foreach(new VoidFunction>() { @@ -69,7 +69,7 @@ public void call(Tuple2 t) throws Exception { System.out.println(t._1() + " " + t._2()); } }); - + System.out.println(); exactSample.foreach(new VoidFunction>() { public void call(Tuple2 t) throws Exception { System.out.println(t._1() + " " + t._2()); diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java index 755e6e5a2982f..aae06679d3e2d 100644 --- 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java @@ -17,17 +17,17 @@ package org.apache.spark.examples.mllib; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +// $example on$ import java.util.Arrays; -// $example on$ import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.stat.MultivariateStatisticalSummary; import org.apache.spark.mllib.stat.Statistics; // $example off$ -import org.apache.spark.SparkConf; public class JavaSummaryStatisticsExample { public static void main(String[] args) { @@ -44,9 +44,9 @@ public static void main(String[] args) { // Compute column summary statistics. MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd()); - System.out.println(summary.mean()); // a dense vector containing the mean value for each column - System.out.println(summary.variance()); // column-wise variance - System.out.println(summary.numNonzeros()); // number of nonzeros in each column + System.out.println(summary.mean()); // a dense vector containing the mean value for each column + System.out.println(summary.variance()); // column-wise variance + System.out.println(summary.numNonzeros()); // number of nonzeros in each column // $example off$ jsc.stop(); diff --git a/examples/src/main/python/mllib/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py index a0d2e7d8be7eb..2163d08934d5c 100644 --- a/examples/src/main/python/mllib/correlations_example.py +++ b/examples/src/main/python/mllib/correlations_example.py @@ -34,7 +34,7 @@ # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. # If a method is not specified, Pearson's method will be used by default. - print(Statistics.corr(seriesX, seriesY, method="pearson")) + print("Correlation is: " + str(Statistics.corr(seriesX, seriesY, method="pearson"))) v1 = np.array([1.0, 10.0, 100.0]) v2 = np.array([2.0, 20.0, 200.0]) diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py index ca3d12bc153fd..5e2521ae93e28 100644 --- a/examples/src/main/python/mllib/hypothesis_testing_example.py +++ b/examples/src/main/python/mllib/hypothesis_testing_example.py @@ -19,7 +19,7 @@ from pyspark import SparkContext # $example on$ -from pyspark.mllib.linalg import Vectors, Matrices +from pyspark.mllib.linalg import Matrices, Vectors from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.stat import Statistics # $example off$ @@ -35,17 +35,19 @@ goodnessOfFitTestResult = Statistics.chiSqTest(vec) print(goodnessOfFitTestResult) # summary of the test including the p-value, degrees of freedom, # test statistic, the method used, and the null hypothesis. + print() mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0]) # a contingency matrix # conduct Pearson's independence test on the input contingency matrix independenceTestResult = Statistics.chiSqTest(mat) print(independenceTestResult) # summary of the test including the p-value, degrees of freedom + print() p1 = LabeledPoint(1.0, [1.0, 0.0, 3.0]) p2 = LabeledPoint(1.0, [1.0, 2.0, 0.0]) p3 = LabeledPoint(1.0, [-1.0, 0.0, -0.5]) - obs = sc.parallelize([p1, p2, p3]) # LabeledPoint(feature, label) . 
+ obs = sc.parallelize([p1, p2, p3]) # LabeledPoint(feature, label) # The contingency table is constructed from an RDD of LabeledPoint and used to conduct # the independence test. Returns an array containing the ChiSquaredTestResult for every feature @@ -55,6 +57,7 @@ for i, result in enumerate(featureTestResults): print("Column: " + str(i + 1)) print(result) + print() # $example off$ sc.stop() diff --git a/examples/src/main/python/mllib/summary_statistics_example.py b/examples/src/main/python/mllib/summary_statistics_example.py index 088f9f8807498..0b9d9ddbc9122 100644 --- a/examples/src/main/python/mllib/summary_statistics_example.py +++ b/examples/src/main/python/mllib/summary_statistics_example.py @@ -17,10 +17,10 @@ from __future__ import print_function -import numpy as np - from pyspark import SparkContext # $example on$ +import numpy as np + from pyspark.mllib.stat import Statistics # $example off$ @@ -28,9 +28,9 @@ sc = SparkContext(appName="SummaryStatisticsExample") # SparkContext # $example on$ - v1 = np.array([1.0, 2.0, 3.0]) - v2 = np.array([10.0, 20.0, 30.0]) - v3 = np.array([100.0, 200.0, 300.0]) + v1 = np.array([1.0, 10.0, 100.0]) + v2 = np.array([2.0, 20.0, 200.0]) + v3 = np.array([3.0, 30.0, 300.0]) mat = sc.parallelize([v1, v2, v3]) # an RDD of Vectors # Compute column summary statistics. diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala index e395a25dc6a2e..69b6cef551f45 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala @@ -22,25 +22,25 @@ import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.stat.Statistics -// $example off$ import org.apache.spark.rdd.RDD +// $example off$ object CorrelationsExample { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("CorrelationsExample") val sc = new SparkContext(conf) // $example on$ - val seriesX: RDD[Double] = sc.parallelize(Array(1, 2, 3, 3, 5)) // a series + val seriesX: RDD[Double] = sc.parallelize(Array(1, 2, 3, 3, 5)) // a series val seriesY: RDD[Double] = sc.parallelize(Array(11, 22, 33, 33, 555)) // must have the same number of partitions and cardinality as seriesX // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a // method is not specified, Pearson's method will be used by default. val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson") - println(correlation) + println(s"Correlation is: $correlation") val data: RDD[Vector] = sc.parallelize( Seq( diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala index c09e99011e579..1b548eedaaf74 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala @@ -41,30 +41,34 @@ object HypothesisTestingExample { // compute the goodness of fit. If a second vector to test against is not supplied // as a parameter, the test runs against a uniform distribution. 
val goodnessOfFitTestResult = Statistics.chiSqTest(vec)
-    println(goodnessOfFitTestResult) // summary of the test including the p-value,
-    // degrees of freedom, test statistic, the method used, and the null hypothesis.
+    // summary of the test including the p-value, degrees of freedom, test statistic, the method
+    // used, and the null hypothesis.
+    println(goodnessOfFitTestResult)
+    println()
 
     // a contingency matrix. Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
     val mat: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
 
     // conduct Pearson's independence test on the input contingency matrix
     val independenceTestResult = Statistics.chiSqTest(mat)
-    println(independenceTestResult) // summary of the test including the p-value, degrees of freedom
+    // summary of the test including the p-value, degrees of freedom
+    println(independenceTestResult)
+    println()
 
     val p1 = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0))
     val p2 = LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0))
     val p3 = LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
-    val obs: RDD[LabeledPoint] = sc.parallelize(Seq(p1, p2, p3)) // (feature, label) pairs.
+    val obs: RDD[LabeledPoint] = sc.parallelize(Seq(p1, p2, p3))  // (feature, label) pairs.
 
     // The contingency table is constructed from the raw (feature, label) pairs and used to conduct
     // the independence test. Returns an array containing the ChiSquaredTestResult for every feature
     // against the label.
     val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs)
-    var i = 1
-    featureTestResults.foreach { result =>
-      println(s"Column $i:\n$result")
-      i += 1
-    } // summary of the test
+    featureTestResults.zipWithIndex.foreach { result =>
+      println(s"Column ${result._2 + 1}:")
+      println(result._1)
+      println()
+    } // summary of the test
     // $example off$
 
     sc.stop()

diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala
index 78660f45a43a0..840874cf3c2fe 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingKolmogorovSmirnovTestExample.scala
@@ -26,19 +26,20 @@ import org.apache.spark.rdd.RDD
 
 object HypothesisTestingKolmogorovSmirnovTestExample {
 
-  def main(args: Array[String]) {
+  def main(args: Array[String]): Unit = {
 
     val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample")
     val sc = new SparkContext(conf)
 
     // $example on$
-    val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25)) // an RDD of sample data
+    val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25))  // an RDD of sample data
 
     // run a KS test for the sample versus a standard normal distribution
     val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
-    println(testResult) // summary of the test including the p-value, test statistic,
-    // and null hypothesis
-    // if our p-value indicates significance, we can reject the null hypothesis
+    // summary of the test including the p-value, test statistic, and null hypothesis.
+    // If our p-value indicates significance, we can reject the null hypothesis.
+ println(testResult) + println() // perform a KS test using a cumulative distribution function of our making val myCDF = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala index 402276ff086d8..cc5d159b36cc9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/KernelDensityEstimationExample.scala @@ -26,7 +26,7 @@ import org.apache.spark.rdd.RDD object KernelDensityEstimationExample { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("KernelDensityEstimationExample") val sc = new SparkContext(conf) @@ -45,7 +45,7 @@ object KernelDensityEstimationExample { val densities = kd.estimate(Array(-1.0, 2.0, 5.0)) // $example off$ - densities.foreach(print) + densities.foreach(println) sc.stop() } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala deleted file mode 100644 index de48ae58ce0e1..0000000000000 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomDataGenerationExample.scala +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package org.apache.spark.examples.mllib - -import org.apache.spark.{SparkConf, SparkContext} -// $example on$ -import org.apache.spark.mllib.random.RandomRDDs._ -// $example off$ - -object RandomDataGenerationExample { - - def main(args: Array[String]) { - - val conf = new SparkConf().setAppName("RandomDataGenerationExample") - val sc = new SparkContext(conf) - - // $example on$ - // Generate a random double RDD that contains 1 million i.i.d. values drawn from the - // standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. - val u = normalRDD(sc, 1000L, 10) - // Apply a transform to get a random double RDD following `N(1, 4)`. 
- val v = u.map(x => 1.0 + 2.0 * x) - // $example off$ - - u.foreach(print) - v.foreach(print) - - sc.stop() - } -} -// scalastyle:on println - diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala index 453e4a2f9d283..f0084dada2240 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala @@ -18,13 +18,11 @@ // scalastyle:off println package org.apache.spark.examples.mllib -// $example on$ import org.apache.spark.{SparkConf, SparkContext} -// $example off$ object StratifiedSamplingExample { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("StratifiedSamplingExample") val sc = new SparkContext(conf) @@ -40,10 +38,10 @@ object StratifiedSamplingExample { // Get an exact sample from each stratum val approxSample = data.sampleByKey(withReplacement = false, fractions) val exactSample = data.sampleByKeyExact(withReplacement = false, fractions) - // $example off$ approxSample.foreach(println) + println() exactSample.foreach(println) sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala index 675c07aeab954..473b6789fd375 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala @@ -26,7 +26,7 @@ import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} object SummaryStatisticsExample { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SummaryStatisticsExample") val sc = new SparkContext(conf) @@ -40,9 +40,9 @@ object SummaryStatisticsExample { // Compute column summary statistics. val summary: MultivariateStatisticalSummary = Statistics.colStats(observations) - println(summary.mean) // a dense vector containing the mean value for each column - println(summary.variance) // column-wise variance - println(summary.numNonzeros) // number of nonzeros in each column + println(summary.mean) // a dense vector containing the mean value for each column + println(summary.variance) // column-wise variance + println(summary.numNonzeros) // number of nonzeros in each column // $example off$ sc.stop() From 33293947bde90fd29014587cd42533df121bd783 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Sun, 6 Mar 2016 14:49:29 -0800 Subject: [PATCH 22/26] [SPARK-13019] sorry, forget to delete python file --- .../mllib/random_data_generation_example.py | 42 ------------------- 1 file changed, 42 deletions(-) delete mode 100644 examples/src/main/python/mllib/random_data_generation_example.py diff --git a/examples/src/main/python/mllib/random_data_generation_example.py b/examples/src/main/python/mllib/random_data_generation_example.py deleted file mode 100644 index c08d631c5db7d..0000000000000 --- a/examples/src/main/python/mllib/random_data_generation_example.py +++ /dev/null @@ -1,42 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function - -from pyspark import SparkContext -# $example on$ -from pyspark.mllib.random import RandomRDDs -# $example off$ - -if __name__ == "__main__": - sc = SparkContext(appName="RandomDataGenerationExample") # SparkContext - - # $example on$ - # Generate a random double RDD that contains 1 million i.i.d. values drawn from the - # standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions. - u = RandomRDDs.normalRDD(sc, 1000L, 10) - # Apply a transform to get a random double RDD following `N(1, 4)`. - v = u.map(lambda x: 1.0 + 2.0 * x) - # $example off$ - - for each in u.collect(): - print(each) - - for each in v.collect(): - print(each) - - sc.stop() From acf7096e750b9150fbe309fc6c90aecb27b1102d Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Mon, 7 Mar 2016 13:30:49 -0800 Subject: [PATCH 23/26] [SPARK-13019] removing '-'s --- docs/mllib-statistics.md | 158 +++++++++--------- .../mllib/JavaStratifiedSamplingExample.java | 4 +- .../examples/mllib/CorrelationsExample.scala | 4 +- 3 files changed, 83 insertions(+), 83 deletions(-) diff --git a/docs/mllib-statistics.md b/docs/mllib-statistics.md index 62faa1bfa45ff..3f6e25ed7c04d 100644 --- a/docs/mllib-statistics.md +++ b/docs/mllib-statistics.md @@ -10,24 +10,24 @@ displayTitle: Basic Statistics - spark.mllib `\[ \newcommand{\R}{\mathbb{R}} -\newcommand{\E}{\mathbb{E}} +\newcommand{\E}{\mathbb{E}} \newcommand{\x}{\mathbf{x}} \newcommand{\y}{\mathbf{y}} \newcommand{\wv}{\mathbf{w}} \newcommand{\av}{\mathbf{\alpha}} \newcommand{\bv}{\mathbf{b}} \newcommand{\N}{\mathbb{N}} -\newcommand{\id}{\mathbf{I}} -\newcommand{\ind}{\mathbf{1}} -\newcommand{\0}{\mathbf{0}} -\newcommand{\unit}{\mathbf{e}} -\newcommand{\one}{\mathbf{1}} +\newcommand{\id}{\mathbf{I}} +\newcommand{\ind}{\mathbf{1}} +\newcommand{\0}{\mathbf{0}} +\newcommand{\unit}{\mathbf{e}} +\newcommand{\one}{\mathbf{1}} \newcommand{\zero}{\mathbf{0}} \]` -## Summary statistics +## Summary statistics -We provide column summary statistics for `RDD[Vector]` through the function `colStats` +We provide column summary statistics for `RDD[Vector]` through the function `colStats` available in `Statistics`.
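As a reference point for the doc hunks that follow, a minimal sketch of the `colStats` call this section documents; illustrative only, assuming a live `SparkContext` named `sc`:

{% highlight scala %}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.rdd.RDD

val observations: RDD[Vector] = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 20.0, 200.0),
  Vectors.dense(3.0, 30.0, 300.0)))

// Compute column summary statistics.
val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
println(summary.count)        // number of rows (3 here)
println(summary.mean)         // a dense vector containing the mean value for each column
println(summary.variance)     // column-wise variance
println(summary.numNonzeros)  // number of nonzeros in each column
println(summary.max)          // column-wise maximum
{% endhighlight %}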
@@ -71,13 +71,13 @@ Refer to the [`MultivariateStatisticalSummary` Python docs](api/python/pyspark.m ## Correlations Calculating the correlation between two series of data is a common operation in Statistics. In `spark.mllib` -we provide the flexibility to calculate pairwise correlations among many series. The supported +we provide the flexibility to calculate pairwise correlations among many series. The supported correlation methods are currently Pearson's and Spearman's correlation. - +
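The two call shapes this section describes, condensed into a minimal Scala sketch; illustrative only, assuming a live `SparkContext` named `sc`:

{% highlight scala %}
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

val seriesX: RDD[Double] = sc.parallelize(Array(1.0, 2.0, 3.0, 3.0, 5.0))
// must have the same number of partitions and cardinality as seriesX
val seriesY: RDD[Double] = sc.parallelize(Array(11.0, 22.0, 33.0, 33.0, 555.0))

// two RDD[Double]s in, a single Double out
val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")

// one RDD[Vector] in, a pairwise correlation Matrix out; each Vector is a row
val data: RDD[Vector] = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 20.0, 200.0),
  Vectors.dense(5.0, 33.0, 366.0)))
val correlMatrix: Matrix = Statistics.corr(data, "pearson")
{% endhighlight %}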
-[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to -calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or +[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to +calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` respectively. Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mllib.stat.Statistics) for details on the API. @@ -86,8 +86,8 @@ Refer to the [`Statistics` Scala docs](api/scala/index.html#org.apache.spark.mll
-[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to -calculate correlations between series. Depending on the type of input, two `JavaDoubleRDD`s or +[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to +calculate correlations between series. Depending on the type of input, two `JavaDoubleRDD`s or a `JavaRDD`, the output will be a `Double` or the correlation `Matrix` respectively. Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Statistics.html) for details on the API. @@ -96,8 +96,8 @@ Refer to the [`Statistics` Java docs](api/java/org/apache/spark/mllib/stat/Stati
-[`Statistics`](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) provides methods to -calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or +[`Statistics`](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) provides methods to +calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` respectively. Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.stat.Statistics) for more details on the API. @@ -111,21 +111,21 @@ Refer to the [`Statistics` Python docs](api/python/pyspark.mllib.html#pyspark.ml Unlike the other statistics functions, which reside in `spark.mllib`, stratified sampling methods, `sampleByKey` and `sampleByKeyExact`, can be performed on RDD's of key-value pairs. For stratified -sampling, the keys can be thought of as a label and the value as a specific attribute. For example -the key can be man or woman, or document ids, and the respective values can be the list of ages -of the people in the population or the list of words in the documents. The `sampleByKey` method -will flip a coin to decide whether an observation will be sampled or not, therefore requires one -pass over the data, and provides an *expected* sample size. `sampleByKeyExact` requires significant +sampling, the keys can be thought of as a label and the value as a specific attribute. For example +the key can be man or woman, or document ids, and the respective values can be the list of ages +of the people in the population or the list of words in the documents. The `sampleByKey` method +will flip a coin to decide whether an observation will be sampled or not, therefore requires one +pass over the data, and provides an *expected* sample size. `sampleByKeyExact` requires significant more resources than the per-stratum simple random sampling used in `sampleByKey`, but will provide -the exact sampling size with 99.99% confidence. `sampleByKeyExact` is currently not supported in +the exact sampling size with 99.99% confidence. `sampleByKeyExact` is currently not supported in python.
[`sampleByKeyExact()`](api/scala/index.html#org.apache.spark.rdd.PairRDDFunctions) allows users to -sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired +sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K$ is the set of -keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample +keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample size, whereas sampling with replacement requires two additional passes. {% include_example scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala %} @@ -133,17 +133,17 @@ size, whereas sampling with replacement requires two additional passes.
[`sampleByKeyExact()`](api/java/org/apache/spark/api/java/JavaPairRDD.html) allows users to -sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired +sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K$ is the set of -keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample +keys. Sampling without replacement requires one additional pass over the RDD to guarantee sample size, whereas sampling with replacement requires two additional passes. {% include_example java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java %}
[`sampleByKey()`](api/python/pyspark.html#pyspark.RDD.sampleByKey) allows users to
-sample approximately $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the
-desired fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K$ is the
+sample approximately $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the
+desired fraction for key $k$, $n_k$ is the number of key-value pairs for key $k$, and $K$ is the
set of keys.

*Note:* `sampleByKeyExact()` is currently not supported in Python.

@@ -155,27 +155,27 @@ set of keys.

## Hypothesis testing

-Hypothesis testing is a powerful tool in statistics to determine whether a result is statistically
-significant, whether this result occurred by chance or not. `spark.mllib` currently supports Pearson's
+Hypothesis testing is a powerful tool in statistics to determine whether a result is statistically
+significant, i.e., whether this result occurred by chance or not. `spark.mllib` currently supports Pearson's
chi-squared ($\chi^2$) tests for goodness of fit and independence. The input data types determine
-whether the goodness of fit or the independence test is conducted. The goodness of fit test requires
+whether the goodness of fit or the independence test is conducted. The goodness of fit test requires
an input type of `Vector`, whereas the independence test requires a `Matrix` as input.

-`spark.mllib` also supports the input type `RDD[LabeledPoint]` to enable feature selection via chi-squared
+`spark.mllib` also supports the input type `RDD[LabeledPoint]` to enable feature selection via chi-squared
independence tests.
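Since the dispatch between the two chi-squared tests is purely type-driven, a minimal sketch may help while reading the example diffs below. The input values mirror those used in the example files in this series:

{% highlight scala %}
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.stat.Statistics

// A Vector input selects the goodness-of-fit test; with no expected-frequency
// vector supplied, the observed frequencies are tested against a uniform distribution.
val vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)
println(Statistics.chiSqTest(vec))

// A Matrix input selects the independence test on the contingency matrix
// ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)), stored column-major.
val mat = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
println(Statistics.chiSqTest(mat))
{% endhighlight %}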
-[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to -run Pearson's chi-squared tests. The following example demonstrates how to run and interpret +[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to +run Pearson's chi-squared tests. The following example demonstrates how to run and interpret hypothesis tests. {% include_example scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala %}
-[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to
-run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
+[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to
+run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
hypothesis tests.

Refer to the [`ChiSqTestResult` Java docs](api/java/org/apache/spark/mllib/stat/test/ChiSqTestResult.html) for details on the API.

@@ -197,11 +197,11 @@ Refer to the [`Statistics` Python docs](api/python/pyspark.ml

Additionally, `spark.mllib` provides a 1-sample, 2-sided implementation of the Kolmogorov-Smirnov
(KS) test for equality of probability distributions. By providing the name of a theoretical distribution
-(currently solely supported for the normal distribution) and its parameters, or a function to
+(currently only the normal distribution is supported) and its parameters, or a function to
calculate the cumulative distribution according to a given theoretical distribution, the user can
test the null hypothesis that their sample is drawn from that distribution. If the
user tests against the normal distribution (`distName="norm"`) but does not provide distribution
-parameters, the test initializes to the standard normal distribution and logs an appropriate
+parameters, the test initializes to the standard normal distribution and logs an appropriate
message.
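A minimal sketch of the KS call itself, using the same sample values as the Python example file in this series and assuming an active `SparkContext` named `sc`:

{% highlight scala %}
import org.apache.spark.mllib.stat.Statistics

val data = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25)) // an RDD of sample data

// Test the null hypothesis that the sample is drawn from N(0, 1); the result
// carries the p-value, the test statistic, and the null hypothesis.
val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
println(testResult)
{% endhighlight %}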
@@ -277,18 +277,18 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`.

Refer to the [`RandomRDDs` Scala docs](api/scala/index.html#org.apache.spark.mllib.random.RandomRDDs) for details on the API.

-{% highlight scala %}
-import org.apache.spark.SparkContext
-import org.apache.spark.mllib.random.RandomRDDs._
-
-val sc: SparkContext = ...
-
-// Generate a random double RDD that contains 1 million i.i.d. values drawn from the
-// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
-val u = normalRDD(sc, 1000000L, 10)
-// Apply a transform to get a random double RDD following `N(1, 4)`.
-val v = u.map(x => 1.0 + 2.0 * x)
-{% endhighlight %}
+{% highlight scala %}
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.random.RandomRDDs._
+
+val sc: SparkContext = ...
+
+// Generate a random double RDD that contains 1 million i.i.d. values drawn from the
+// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
+val u = normalRDD(sc, 1000000L, 10)
+// Apply a transform to get a random double RDD following `N(1, 4)`.
+val v = u.map(x => 1.0 + 2.0 * x)
+{% endhighlight %}
@@ -299,24 +299,24 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`.

Refer to the [`RandomRDDs` Java docs](api/java/org/apache/spark/mllib/random/RandomRDDs) for details on the API.

-{% highlight java %}
-import org.apache.spark.SparkContext;
-import org.apache.spark.api.JavaDoubleRDD;
-import static org.apache.spark.mllib.random.RandomRDDs.*;
-
-JavaSparkContext jsc = ...
-
-// Generate a random double RDD that contains 1 million i.i.d. values drawn from the
-// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
-JavaDoubleRDD u = normalJavaRDD(jsc, 1000000L, 10);
-// Apply a transform to get a random double RDD following `N(1, 4)`.
-JavaDoubleRDD v = u.map(
-  new Function<Double, Double>() {
-    public Double call(Double x) {
-      return 1.0 + 2.0 * x;
-    }
-  });
-{% endhighlight %}
+{% highlight java %}
+import org.apache.spark.api.java.JavaDoubleRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.Function;
+import static org.apache.spark.mllib.random.RandomRDDs.*;
+
+JavaSparkContext jsc = ...
+
+// Generate a random double RDD that contains 1 million i.i.d. values drawn from the
+// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
+JavaDoubleRDD u = normalJavaRDD(jsc, 1000000L, 10);
+// Apply a transform to get a random double RDD following `N(1, 4)`.
+JavaRDD<Double> v = u.map(
+  new Function<Double, Double>() {
+    public Double call(Double x) {
+      return 1.0 + 2.0 * x;
+    }
+  });
+{% endhighlight %}
@@ -327,17 +327,17 @@ distribution `N(0, 1)`, and then map it to `N(1, 4)`.

Refer to the [`RandomRDDs` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.random.RandomRDDs) for more details on the API.

-{% highlight python %}
-from pyspark.mllib.random import RandomRDDs
-
-sc = ...  # SparkContext
-
-# Generate a random double RDD that contains 1 million i.i.d. values drawn from the
-# standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
-u = RandomRDDs.normalRDD(sc, 1000000L, 10)
-# Apply a transform to get a random double RDD following `N(1, 4)`.
-v = u.map(lambda x: 1.0 + 2.0 * x)
-{% endhighlight %}
+{% highlight python %}
+from pyspark.mllib.random import RandomRDDs
+
+sc = ...  # SparkContext
+
+# Generate a random double RDD that contains 1 million i.i.d. values drawn from the
+# standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
+u = RandomRDDs.normalRDD(sc, 1000000L, 10)
+# Apply a transform to get a random double RDD following `N(1, 4)`.
+v = u.map(lambda x: 1.0 + 2.0 * x)
+{% endhighlight %}
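The same `RandomRDDs` factory pattern extends beyond the normal distribution. A short Scala sketch of two sibling generators from the same API, for illustration only (not part of this patch; `sc` is an active `SparkContext`):

{% highlight scala %}
import org.apache.spark.mllib.random.RandomRDDs._

// 1 million i.i.d. draws from U(0, 1) in 10 partitions.
val w = uniformRDD(sc, 1000000L, 10)
// 1 million i.i.d. draws from a Poisson distribution with mean 2.0.
val p = poissonRDD(sc, 2.0, 1000000L, 10)
{% endhighlight %}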
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java
index 0cec6e2e51214..0fa051783a014 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java
@@ -59,9 +59,9 @@ public static void main(String[] args) {

// Get an exact sample from each stratum
JavaPairRDD<Integer, Character> approxSample =
-  data.sampleByKey(false, fractions); // JavaPairRDD<Integer, Character>
+  data.sampleByKey(false, fractions); // JavaPairRDD<Integer, Character>
JavaPairRDD<Integer, Character> exactSample =
-  data.sampleByKeyExact(false, fractions); // JavaPairRDD<Integer, Character>
+  data.sampleByKeyExact(false, fractions); // JavaPairRDD<Integer, Character>
// $example off$

approxSample.foreach(new VoidFunction<Tuple2<Integer, Character>>() {
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala
index 69b6cef551f45..1202caf534e95 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/CorrelationsExample.scala
@@ -34,8 +34,8 @@ object CorrelationsExample {

// $example on$
val seriesX: RDD[Double] = sc.parallelize(Array(1, 2, 3, 3, 5)) // a series
-val seriesY: RDD[Double] = sc.parallelize(Array(11, 22, 33, 33, 555)) // must have the same number of partitions and cardinality as seriesX
+// must have the same number of partitions and cardinality as seriesX
+val seriesY: RDD[Double] = sc.parallelize(Array(11, 22, 33, 33, 555))

// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a
// method is not specified, Pearson's method will be used by default.
@@ -47,7 +47,7 @@ object CorrelationsExample {
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 20.0, 200.0),
  Vectors.dense(5.0, 33.0, 366.0))
-) // note that each Vector is a row and not a column
+) // note that each Vector is a row and not a column

// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method
// If a method is not specified, Pearson's method will be used by default.
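The hunks above touch both `Statistics.corr` overloads; as a compact reminder of their input and output types, a sketch with values taken from the example itself (`sc` assumed in scope):

{% highlight scala %}
import org.apache.spark.mllib.linalg.{Matrix, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

val seriesX: RDD[Double] = sc.parallelize(Array(1.0, 2.0, 3.0, 3.0, 5.0))
val seriesY: RDD[Double] = sc.parallelize(Array(11.0, 22.0, 33.0, 33.0, 555.0))
// Two RDD[Double] inputs produce a single Double.
val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")

// An RDD[Vector] input (each Vector a row) produces the full correlation Matrix.
val data = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 20.0, 200.0),
  Vectors.dense(5.0, 33.0, 366.0)))
val correlMatrix: Matrix = Statistics.corr(data, "pearson")
{% endhighlight %}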
From a4eb28d07a99e559132160f8ae9ac993d47d8fa3 Mon Sep 17 00:00:00 2001
From: Xin Ren
Date: Thu, 17 Mar 2016 12:21:18 -0700
Subject: [PATCH 24/26] [SPARK-13019] use asList() for concise code

---
 .../mllib/JavaCorrelationsExample.java        | 12 ++--
 .../mllib/JavaHypothesisTestingExample.java   | 20 +++----
 .../JavaKernelDensityEstimationExample.java   |  2 +-
 .../mllib/JavaStratifiedSamplingExample.java  | 58 +++++++++----------
 .../mllib/JavaSummaryStatisticsExample.java   | 12 ++--
 .../main/python/mllib/correlations_example.py |  7 +--
 .../mllib/hypothesis_testing_example.py       | 26 +++++----
 ...testing_kolmogorov_smirnov_test_example.py |  2 +-
 .../mllib/summary_statistics_example.py       |  7 +--
 .../mllib/HypothesisTestingExample.scala      | 26 +++++----
 .../mllib/StratifiedSamplingExample.scala     | 11 ++--
 .../mllib/SummaryStatisticsExample.scala      | 12 ++--

diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java
index c27c1d01bab58..fd19b43504ac1 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java
@@ -49,12 +49,14 @@ public static void main(String[] args) {
Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
System.out.println("Correlation is: " + correlation);

-Vector v1 = Vectors.dense(1.0, 10.0, 100.0);
-Vector v2 = Vectors.dense(2.0, 20.0, 200.0);
-Vector v3 = Vectors.dense(5.0, 33.0, 366.0);
// note that each Vector is a row and not a column
-JavaRDD<Vector> data = jsc.parallelize(Arrays.asList(v1, v2, v3));
+JavaRDD<Vector> data = jsc.parallelize(
+  Arrays.asList(
+    Vectors.dense(1.0, 10.0, 100.0),
+    Vectors.dense(2.0, 20.0, 200.0),
+    Vectors.dense(5.0, 33.0, 366.0)
+  )
+);

// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
// If a method is not specified, Pearson's method will be used by default.
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java
index 0960b07a98557..b48b95ff1d2a3 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaHypothesisTestingExample.java
@@ -48,8 +48,7 @@ public static void main(String[] args) {
ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec);
// summary of the test including the p-value, degrees of freedom, test statistic,
// the method used, and the null hypothesis.
-System.out.println(goodnessOfFitTestResult);
-System.out.println();
+System.out.println(goodnessOfFitTestResult + "\n");

// Create a contingency matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
Matrix mat = Matrices.dense(3, 2, new double[]{1.0, 3.0, 5.0, 2.0, 4.0, 6.0});

// conduct Pearson's independence test on the input contingency matrix
ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat);
// summary of the test including the p-value, degrees of freedom...
-System.out.println(independenceTestResult);
-System.out.println();
+System.out.println(independenceTestResult + "\n");

-LabeledPoint p1 = new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0));
-LabeledPoint p2 = new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0));
-LabeledPoint p3 = new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5));
// an RDD of labeled points
-JavaRDD<LabeledPoint> obs = jsc.parallelize(Arrays.asList(p1, p2, p3));
+JavaRDD<LabeledPoint> obs = jsc.parallelize(
+  Arrays.asList(
+    new LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
+    new LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
+    new LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
+  )
+);

// The contingency table is constructed from the raw (feature, label) pairs and used to conduct
// the independence test. Returns an array containing the ChiSquaredTestResult for every feature
@@ -73,8 +74,7 @@ public static void main(String[] args) {
int i = 1;
for (ChiSqTestResult result : featureTestResults) {
  System.out.println("Column " + i + ":");
-  System.out.println(result); // summary of the test
-  System.out.println();
+  System.out.println(result + "\n"); // summary of the test
  i++;
}
// $example off$
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java
index 2f25b7534164a..41de0d90eccd7 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaKernelDensityEstimationExample.java
@@ -43,9 +43,9 @@ public static void main(String[] args) {
// Find density estimates for the given values
double[] densities = kd.estimate(new double[]{-1.0, 2.0, 5.0});
-// $example off$

System.out.println(Arrays.toString(densities));
+// $example off$

jsc.stop();
}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java
index 0fa051783a014..f5a451019bd21 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaStratifiedSamplingExample.java
@@ -17,14 +17,12 @@

package org.apache.spark.examples.mllib;

+import com.google.common.collect.ImmutableMap;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
// $example on$
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;

import scala.Tuple2;

@@ -39,42 +37,38 @@ public static void main(String[] args) {
JavaSparkContext jsc = new JavaSparkContext(conf);

// $example on$
-List<Tuple2<Integer, Character>> list = new ArrayList<>();
-list.add(new Tuple2<Integer, Character>(1, 'a'));
-list.add(new Tuple2<Integer, Character>(1, 'b'));
-list.add(new Tuple2<Integer, Character>(2, 'c'));
-list.add(new Tuple2<Integer, Character>(2, 'd'));
-list.add(new Tuple2<Integer, Character>(2, 'e'));
-list.add(new Tuple2<Integer, Character>(3, 'f'));
+List<Tuple2<Integer, Character>> list = new ArrayList<Tuple2<Integer, Character>>(
+  Arrays.<Tuple2<Integer, Character>>asList(
+    new Tuple2<Integer, Character>(1, 'a'),
+    new Tuple2<Integer, Character>(1, 'b'),
+    new Tuple2<Integer, Character>(2, 'c'),
+    new Tuple2<Integer, Character>(2, 'd'),
+    new Tuple2<Integer, Character>(2, 'e'),
+    new Tuple2<Integer, Character>(3, 'f')
+  )
+);

-// an RDD of any key value pairs JavaPairRDD<Integer, Character>
JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(list);

// specify the exact fraction desired from each key Map<K, Object>
-Map<Integer, Object> fractions = new HashMap<>();
-
-fractions.put(1, 0.1);
-fractions.put(2, 0.6);
-fractions.put(3, 0.3);
+ImmutableMap<Integer, Object> fractions =
+  ImmutableMap.of(1, (Object) 0.1, 2, (Object) 0.6, 3, (Object) 0.3);

+// Get an approximate sample from each stratum
+JavaPairRDD<Integer, Character> approxSample = data.sampleByKey(false, fractions);
// Get an exact sample from each stratum
-JavaPairRDD<Integer, Character> approxSample =
-  data.sampleByKey(false, fractions); // JavaPairRDD<Integer, Character>
-JavaPairRDD<Integer, Character> exactSample =
-  data.sampleByKeyExact(false, fractions); // JavaPairRDD<Integer, Character>
+JavaPairRDD<Integer, Character> exactSample = data.sampleByKeyExact(false, fractions);
// $example off$

-approxSample.foreach(new VoidFunction<Tuple2<Integer, Character>>() {
-  public void call(Tuple2<Integer, Character> t) throws Exception {
-    System.out.println(t._1() + " " + t._2());
-  }
-});
-System.out.println();
-exactSample.foreach(new VoidFunction<Tuple2<Integer, Character>>() {
-  public void call(Tuple2<Integer, Character> t) throws Exception {
-    System.out.println(t._1() + " " + t._2());
-  }
-});
+System.out.println("approxSample size is " + approxSample.collect().size());
+for (Tuple2<Integer, Character> t : approxSample.collect()) {
+  System.out.println(t._1() + " " + t._2());
+}
+
+System.out.println("exactSample size is " + exactSample.collect().size());
+for (Tuple2<Integer, Character> t : exactSample.collect()) {
+  System.out.println(t._1() + " " + t._2());
+}

jsc.stop();
}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java
index aae06679d3e2d..278706bc8f6ed 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSummaryStatisticsExample.java
@@ -36,11 +36,13 @@ public static void main(String[] args) {
JavaSparkContext jsc = new JavaSparkContext(conf);

// $example on$
-Vector v1 = Vectors.dense(1.0, 10.0, 100.0);
-Vector v2 = Vectors.dense(2.0, 20.0, 200.0);
-Vector v3 = Vectors.dense(3.0, 30.0, 300.0);
-
-JavaRDD<Vector> mat = jsc.parallelize(Arrays.asList(v1, v2, v3)); // an RDD of Vectors
+JavaRDD<Vector> mat = jsc.parallelize(
+  Arrays.asList(
+    Vectors.dense(1.0, 10.0, 100.0),
+    Vectors.dense(2.0, 20.0, 200.0),
+    Vectors.dense(3.0, 30.0, 300.0)
+  )
+); // an RDD of Vectors

// Compute column summary statistics.
MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
diff --git a/examples/src/main/python/mllib/correlations_example.py b/examples/src/main/python/mllib/correlations_example.py
index 2163d08934d5c..66d18f6e5df17 100644
--- a/examples/src/main/python/mllib/correlations_example.py
+++ b/examples/src/main/python/mllib/correlations_example.py
@@ -36,10 +36,9 @@
# If a method is not specified, Pearson's method will be used by default.
print("Correlation is: " + str(Statistics.corr(seriesX, seriesY, method="pearson")))

-v1 = np.array([1.0, 10.0, 100.0])
-v2 = np.array([2.0, 20.0, 200.0])
-v3 = np.array([5.0, 33.0, 366.0])
-data = sc.parallelize([v1, v2, v3])  # an RDD of Vectors
+data = sc.parallelize(
+    [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([5.0, 33.0, 366.0])]
+)  # an RDD of Vectors

# calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
# If a method is not specified, Pearson's method will be used by default.
diff --git a/examples/src/main/python/mllib/hypothesis_testing_example.py b/examples/src/main/python/mllib/hypothesis_testing_example.py
index 5e2521ae93e28..e566ead0d318d 100644
--- a/examples/src/main/python/mllib/hypothesis_testing_example.py
+++ b/examples/src/main/python/mllib/hypothesis_testing_example.py
@@ -25,7 +25,7 @@
# $example off$

if __name__ == "__main__":
-    sc = SparkContext(appName="HypothesisTestingExample")  # SparkContext
+    sc = SparkContext(appName="HypothesisTestingExample")

    # $example on$
    vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)  # a vector composed of the frequencies of events

@@ -33,21 +33,25 @@
    # compute the goodness of fit. If a second vector to test against
    # is not supplied as a parameter, the test runs against a uniform distribution.
    goodnessOfFitTestResult = Statistics.chiSqTest(vec)
-    print(goodnessOfFitTestResult)  # summary of the test including the p-value, degrees of freedom,
+
+    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
-    print()
+    print("%s\n" % goodnessOfFitTestResult)

    mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])  # a contingency matrix

    # conduct Pearson's independence test on the input contingency matrix
    independenceTestResult = Statistics.chiSqTest(mat)
-    print(independenceTestResult)  # summary of the test including the p-value, degrees of freedom
-    print()

-    p1 = LabeledPoint(1.0, [1.0, 0.0, 3.0])
-    p2 = LabeledPoint(1.0, [1.0, 2.0, 0.0])
-    p3 = LabeledPoint(1.0, [-1.0, 0.0, -0.5])
-    obs = sc.parallelize([p1, p2, p3])  # LabeledPoint(feature, label)
+    # summary of the test including the p-value, degrees of freedom,
+    # test statistic, the method used, and the null hypothesis.
+    print("%s\n" % independenceTestResult)
+
+    obs = sc.parallelize(
+        [LabeledPoint(1.0, [1.0, 0.0, 3.0]),
+         LabeledPoint(1.0, [1.0, 2.0, 0.0]),
+         LabeledPoint(1.0, [-1.0, 0.0, -0.5])]
+    )  # LabeledPoint(label, feature)

    # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
    # the independence test. Returns an array containing the ChiSquaredTestResult for every feature
@@ -55,9 +59,7 @@
    featureTestResults = Statistics.chiSqTest(obs)

    for i, result in enumerate(featureTestResults):
-        print("Column: " + str(i + 1))
-        print(result)
-        print()
+        print("Column %d:\n%s" % (i + 1, result))
    # $example off$

    sc.stop()
diff --git a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
index 15d63ef86b2e7..ef380dee79d3d 100644
--- a/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
+++ b/examples/src/main/python/mllib/hypothesis_testing_kolmogorov_smirnov_test_example.py
@@ -23,7 +23,7 @@
# $example off$

if __name__ == "__main__":
-    sc = SparkContext(appName="HypothesisTestingKolmogorovSmirnovTestExample")  # SparkContext
+    sc = SparkContext(appName="HypothesisTestingKolmogorovSmirnovTestExample")

    # $example on$
    parallelData = sc.parallelize([0.1, 0.15, 0.2, 0.3, 0.25])
diff --git a/examples/src/main/python/mllib/summary_statistics_example.py b/examples/src/main/python/mllib/summary_statistics_example.py
index 0b9d9ddbc9122..d55d1a2c2d0e1 100644
--- a/examples/src/main/python/mllib/summary_statistics_example.py
+++ b/examples/src/main/python/mllib/summary_statistics_example.py
@@ -28,10 +28,9 @@
    sc = SparkContext(appName="SummaryStatisticsExample")  # SparkContext

    # $example on$
-    v1 = np.array([1.0, 10.0, 100.0])
-    v2 = np.array([2.0, 20.0, 200.0])
-    v3 = np.array([3.0, 30.0, 300.0])
-    mat = sc.parallelize([v1, v2, v3])  # an RDD of Vectors
+    mat = sc.parallelize(
+        [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([3.0, 30.0, 300.0])]
+    )  # an RDD of Vectors

    # Compute column summary statistics.
    summary = Statistics.colStats(mat)
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala
index 1b548eedaaf74..0d391a3637c07 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala
@@ -43,8 +43,7 @@ object HypothesisTestingExample {
val goodnessOfFitTestResult = Statistics.chiSqTest(vec)
// summary of the test including the p-value, degrees of freedom, test statistic, the method
// used, and the null hypothesis.
-println(goodnessOfFitTestResult)
-println()
+println(s"$goodnessOfFitTestResult\n")

// a contingency matrix. Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
val mat: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))

@@ -52,22 +51,25 @@ object HypothesisTestingExample {
// conduct Pearson's independence test on the input contingency matrix
val independenceTestResult = Statistics.chiSqTest(mat)
// summary of the test including the p-value, degrees of freedom
-println(independenceTestResult)
-println()
+println(s"$independenceTestResult\n")

-val p1 = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0))
-val p2 = LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0))
-val p3 = LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
-val obs: RDD[LabeledPoint] = sc.parallelize(Seq(p1, p2, p3))  // (feature, label) pairs.
+val obs: RDD[LabeledPoint] =
+  sc.parallelize(
+    Seq(
+      LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
+      LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 0.0)),
+      LabeledPoint(-1.0, Vectors.dense(-1.0, 0.0, -0.5))
+    )
+  ) // (feature, label) pairs.

// The contingency table is constructed from the raw (feature, label) pairs and used to conduct
// the independence test. Returns an array containing the ChiSquaredTestResult for every feature
// against the label.
val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs)

-featureTestResults.zipWithIndex.foreach { result =>
-  println(s"Column " + (result._2 + 1).toString + ":")
-  println(result._1)
-  println()
+featureTestResults.zipWithIndex.foreach { case (result, i) =>
+  println("Column " + (i + 1).toString + ":")
+  println(result)
} // summary of the test
// $example off$
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala
index f0084dada2240..169467926ce46 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala
@@ -35,14 +35,17 @@ object StratifiedSamplingExample {
// specify the exact fraction desired from each key
val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3)

-// Get an exact sample from each stratum
+// Get an approximate sample from each stratum
val approxSample = data.sampleByKey(withReplacement = false, fractions)
+// Get an exact sample from each stratum
val exactSample = data.sampleByKeyExact(withReplacement = false, fractions)
// $example off$

-approxSample.foreach(println)
-println()
-exactSample.foreach(println)
+println("approxSample size is " + approxSample.collect().size.toString)
+approxSample.collect().foreach(println)
+
+println("exactSample size is " + exactSample.collect().size.toString)
+exactSample.collect().foreach(println)

sc.stop()
}
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala
index 473b6789fd375..948b443c0a754 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SummaryStatisticsExample.scala
@@ -32,11 +32,13 @@ object SummaryStatisticsExample {
val sc = new SparkContext(conf)

// $example on$
-val v1 = Vectors.dense(1.0, 10.0, 100.0)
-val v2 = Vectors.dense(2.0, 20.0, 200.0)
-val v3 = Vectors.dense(3.0, 30.0, 300.0)
-
-val observations = sc.parallelize(Seq(v1, v2, v3))
+val observations = sc.parallelize(
+  Seq(
+    Vectors.dense(1.0, 10.0, 100.0),
+    Vectors.dense(2.0, 20.0, 200.0),
+    Vectors.dense(3.0, 30.0, 300.0)
+  )
+)

// Compute column summary statistics.
val summary: MultivariateStatisticalSummary = Statistics.colStats(observations) From 892fe600e48b49b26a29120c99d171db02c659ab Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Mon, 21 Mar 2016 18:19:29 -0700 Subject: [PATCH 25/26] [SPARK-13019] fix arguments passing for 2.10 --- .../spark/examples/mllib/StratifiedSamplingExample.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala index 169467926ce46..e100cf09c6391 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala @@ -19,6 +19,7 @@ package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.util.Utils object StratifiedSamplingExample { @@ -34,11 +35,12 @@ object StratifiedSamplingExample { // specify the exact fraction desired from each key val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3) + val seed = Utils.random.nextLong // Get an approximate sample from each stratum - val approxSample = data.sampleByKey(withReplacement = false, fractions) + val approxSample = data.sampleByKey(withReplacement = false, fractions = fractions, seed) // Get an exact sample from each stratum - val exactSample = data.sampleByKeyExact(withReplacement = false, fractions) + val exactSample = data.sampleByKeyExact(withReplacement = false, fractions = fractions, seed) // $example off$ println("approxSample size is " + approxSample.collect().size.toString) From ceebd3600efc7d6b509c4dae9c08e13890574fec Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Tue, 22 Mar 2016 14:49:40 -0700 Subject: [PATCH 26/26] [SPARK-13019] remove variable 'seed' --- .../spark/examples/mllib/StratifiedSamplingExample.scala | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala index e100cf09c6391..16b074ef60699 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala @@ -19,7 +19,6 @@ package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.util.Utils object StratifiedSamplingExample { @@ -35,12 +34,11 @@ object StratifiedSamplingExample { // specify the exact fraction desired from each key val fractions = Map(1 -> 0.1, 2 -> 0.6, 3 -> 0.3) - val seed = Utils.random.nextLong // Get an approximate sample from each stratum - val approxSample = data.sampleByKey(withReplacement = false, fractions = fractions, seed) + val approxSample = data.sampleByKey(withReplacement = false, fractions = fractions) // Get an exact sample from each stratum - val exactSample = data.sampleByKeyExact(withReplacement = false, fractions = fractions, seed) + val exactSample = data.sampleByKeyExact(withReplacement = false, fractions = fractions) // $example off$ println("approxSample size is " + approxSample.collect().size.toString)