From fec4d9d8b9569000125e5e3778d8a5521f4f0b72 Mon Sep 17 00:00:00 2001 From: Burak Date: Mon, 25 Aug 2014 18:44:43 -0700 Subject: [PATCH 1/2] [SPARK-2830][MLlib] Stats Toolkit documentation updated --- docs/mllib-stats.md | 311 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 289 insertions(+), 22 deletions(-) diff --git a/docs/mllib-stats.md b/docs/mllib-stats.md index f25dca746ba3a..e4b5bcbfa110e 100644 --- a/docs/mllib-stats.md +++ b/docs/mllib-stats.md @@ -99,69 +99,336 @@ v = u.map(lambda x: 1.0 + 2.0 * x) -## Stratified Sampling +## Correlation Calculation + +Calculating the correlation between two series of data is a common operation in Statistics. In MLlib +we provide the flexibility to calculate correlation between many series. The supported correlation +methods are currently Pearson's and Spearman's correlation. + +
+
+[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to +calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or +an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` respectively. + +{% highlight scala %} +import org.apache.spark.SparkContext +import org.apache.spark.mllib.linalg._ +import org.apache.spark.mllib.stat.Statistics + +val sc: SparkContext = ... + +val seriesX: RDD[Double] = ... // a series +val seriesY: RDD[Double] = ... // must have the same number of partitions and cardinality as seriesX + +// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a +// method is not specified, Pearson's method will be used by default. +val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson") + +val data: RDD[Vector] = ... // note that each Vector is a row and not a column + +// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. +// If a method is not specified, Pearson's method will be used by default. +val correlMatrix: Matrix = Statistics.corr(data, "pearson") + +{% endhighlight %} +
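
The snippet above elides the `SparkContext` setup and the input RDDs. A minimal self-contained sketch, assuming a local `SparkContext` and made-up input values (the object name and data are illustrative only, not part of the patch), might look like this:

{% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

object CorrelationExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "CorrelationExample")

    // Two series with the same number of partitions and cardinality.
    val seriesX: RDD[Double] = sc.parallelize(Array(1.0, 2.0, 3.0, 4.0, 5.0))
    val seriesY: RDD[Double] = sc.parallelize(Array(11.0, 22.0, 33.0, 44.0, 55.0))

    // Pearson's method is the default; pass "spearman" for Spearman's rank correlation.
    val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")
    println(correlation) // essentially 1.0, since seriesY is a linear function of seriesX

    // Each Vector is a row (one observation), not a column.
    val data: RDD[Vector] = sc.parallelize(Seq(
      Vectors.dense(1.0, 10.0, 100.0),
      Vectors.dense(2.0, 20.0, 200.0),
      Vectors.dense(5.0, 33.0, 366.0)))
    println(Statistics.corr(data, "pearson")) // a 3x3 correlation Matrix

    sc.stop()
  }
}
{% endhighlight %}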
+ +
+[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to +calculate correlations between series. Depending on the type of input, two `JavaDoubleRDD`s or +a `JavaRDD`, the output will be a `Double` or the correlation `Matrix` respectively. + +{% highlight java %} +import org.apache.spark.api.java.JavaDoubleRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.linalg.*; +import org.apache.spark.mllib.stat.Statistics; + +JavaSparkContext jsc = ... + +JavaDoubleRDD seriesX = ... // a series +JavaDoubleRDD seriesY = ... // must have the same number of partitions and cardinality as seriesX + +// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a +// method is not specified, Pearson's method will be used by default. +Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson"); + +JavaRDD data = ... // note that each Vector is a row and not a column + +// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. +// If a method is not specified, Pearson's method will be used by default. +Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson"); + +{% endhighlight %} +
+ +
+[`Statistics`](api/python/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to +calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or +an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` respectively. + +Support for `RowMatrix` operations in python currently don't exist, but will be added in future +releases. + +{% highlight python %} +from pyspark.mllib.stat import Statistics + +sc = ... # SparkContext + +seriesX = ... # a series +seriesY = ... # must have the same number of partitions and cardinality as seriesX + +# Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a +# method is not specified, Pearson's method will be used by default. +print Statistics.corr(seriesX, seriesY, method="pearson") + +data = ... # an RDD of Vectors +# calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. +# If a method is not specified, Pearson's method will be used by default. +print Statistics.corr(data, method="pearson") + +{% endhighlight %} +
+ +
+
+## Stratified Sampling
+
+Unlike the other statistics functions, which reside in MLlib, stratified sampling methods,
+`sampleByKey` and `sampleByKeyExact`, can be accessed in `PairRDDFunctions` in core, as stratified
+sampling is tightly coupled with the PairRDD data type, and the function signature conforms to the
+other *ByKey* methods in PairRDDFunctions. A separate method for exact sample size support exists
+as it requires significantly more resources than the per-stratum simple random sampling used in
+`sampleByKey`.
+
+
+[`sampleByKeyExact()`](api/scala/index.html#org.apache.spark.rdd.PairRDDFunctions) allows users to +sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired +fraction for key $k$, and $n_k$ is the number of key-value pairs for key $k$. +Sampling without replacement requires one additional pass over the RDD to guarantee sample +size, whereas sampling with replacement requires two additional passes. + +{% highlight scala %} +import org.apache.spark.SparkContext +import org.apache.spark.rdd.PairRDDFunctions + +val sc: SparkContext = ... + +val data = ... // an RDD[(K,V)] of any key value pairs +val fractions: Map[K, Double] = ... // specify the exact fraction desired from each key + +// Get an exact sample from each stratum +val sample = data.sampleByKeyExact(withReplacement=false, fractions) + +{% endhighlight %} +
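
As a concrete illustration of the exact-size guarantee, a self-contained sketch could look like the following; the keys, values, and per-key fractions here are invented for the example:

{% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._ // brings PairRDDFunctions into scope

object StratifiedSamplingExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "StratifiedSamplingExample")

    // Strata are keyed by String; the values are arbitrary payloads.
    val data = sc.parallelize(Seq(
      ("a", 1), ("a", 2), ("a", 3), ("a", 4),
      ("b", 5), ("b", 6), ("b", 7), ("b", 8), ("b", 9), ("b", 10)))

    // Sample half of stratum "a" and 30% of stratum "b".
    val fractions = Map("a" -> 0.5, "b" -> 0.3)

    // Exact sample sizes: ceil(0.5 * 4) = 2 items for "a", ceil(0.3 * 6) = 2 items for "b".
    val exactSample = data.sampleByKeyExact(withReplacement = false, fractions)
    println(exactSample.countByKey()) // Map(a -> 2, b -> 2)

    // sampleByKey only approximates the requested per-key fractions,
    // but avoids the extra pass(es) over the data.
    val approxSample = data.sampleByKey(withReplacement = false, fractions)
    println(approxSample.countByKey())

    sc.stop()
  }
}
{% endhighlight %}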
+ +
+[`sampleByKeyExact()`](api/java/org/apache/spark/api/java/JavaPairRDD.html) allows users to +sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired +fraction for key $k$, and $n_k$ is the number of key-value pairs for key $k$. +Sampling without replacement requires one additional pass over the RDD to guarantee sample +size, whereas sampling with replacement requires two additional passes. + +{% highlight java %} +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaSparkContext; + +JavaSparkContext jsc = ... + +JavaPairRDD data = ... // an RDD of any key value pairs +java.util.Map fractions = ... // specify the exact fraction desired from each key + +// Get an exact sample from each stratum +JavaPairRDD sample = data.sampleByKeyExact(false, fractions); + +{% endhighlight %} +
+ +
## Summary Statistics ### Multivariate summary statistics -We provide column summary statistics for `RowMatrix` (note: this functionality is not currently supported in `IndexedRowMatrix` or `CoordinateMatrix`). -If the number of columns is not large, e.g., on the order of thousands, then the -covariance matrix can also be computed as a local matrix, which requires $\mathcal{O}(n^2)$ storage where $n$ is the -number of columns. The total CPU time is $\mathcal{O}(m n^2)$, where $m$ is the number of rows, -and is faster if the rows are sparse. +We provide column summary statistics for `RDD[Vector]` through the static function `colStats` +available in `Statistics`.
-[`computeColumnSummaryStatistics()`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.RowMatrix) returns an instance of +[`colStats()`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) returns an instance of [`MultivariateStatisticalSummary`](api/scala/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary), which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the total count. {% highlight scala %} -import org.apache.spark.mllib.linalg.Matrix -import org.apache.spark.mllib.linalg.distributed.RowMatrix -import org.apache.spark.mllib.stat.MultivariateStatisticalSummary +import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} -val mat: RowMatrix = ... // a RowMatrix +val mat: RDD[Vector] = ... // an RDD of Vectors // Compute column summary statistics. -val summary: MultivariateStatisticalSummary = mat.computeColumnSummaryStatistics() +val summary: MultivariateStatisticalSummary = Statistics.colStats(mat) println(summary.mean) // a dense vector containing the mean value for each column println(summary.variance) // column-wise variance println(summary.numNonzeros) // number of nonzeros in each column -// Compute the covariance matrix. -val cov: Matrix = mat.computeCovariance() {% endhighlight %}
-[`RowMatrix#computeColumnSummaryStatistics`](api/java/org/apache/spark/mllib/linalg/distributed/RowMatrix.html#computeColumnSummaryStatistics()) returns an instance of +[`colStats()`](api/java/org/apache/spark/mllib/stat/Statistics.html) returns an instance of [`MultivariateStatisticalSummary`](api/java/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.html), which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the total count. {% highlight java %} -import org.apache.spark.mllib.linalg.Matrix; -import org.apache.spark.mllib.linalg.distributed.RowMatrix; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.stat.MultivariateStatisticalSummary; +import org.apache.spark.mllib.stat.Statistics; + +JavaSparkContext jsc = ... -RowMatrix mat = ... // a RowMatrix +JavaRDD mat = ... // an RDD of Vectors // Compute column summary statistics. -MultivariateStatisticalSummary summary = mat.computeColumnSummaryStatistics(); +MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd()); System.out.println(summary.mean()); // a dense vector containing the mean value for each column System.out.println(summary.variance()); // column-wise variance System.out.println(summary.numNonzeros()); // number of nonzeros in each column -// Compute the covariance matrix. -Matrix cov = mat.computeCovariance(); {% endhighlight %}
+ +
+[`colStats()`](api/python/index.html#org.apache.spark.mllib.stat.Statistics$) returns an instance of +[`MultivariateStatisticalSummary`](api/python/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary$), +which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the +total count. + +{% highlight python %} +from pyspark.mllib.stat import Statistics + +sc = ... # SparkContext + +mat = ... # an RDD of Vectors + +# Compute column summary statistics. +summary = Statistics.colStats(mat) +print summary.mean() +print summary.variance() +print summary.numNonzeros() + +{% endhighlight %}
+
+ + +## Hypothesis Testing + +Hypothesis testing is a power tool in statistics to determine whether a result is statistically +significant, whether this result occurred by chance or not. MLlib currently supports Pearson's +chi-squared ( $\chi^2$) tests for goodness of fit and independence. The input data types determine +whether the goodness of fit or the independence test is conducted. The goodness of fit test requires +an input type of `Vector`, whereas the independence test requires a `Matrix` as input. + +MLlib also supports the input type `RDD[LabeledPoint]` to enable feature selection via chi-squared +independence tests. + +
+
+[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to
+run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
+hypothesis tests.
+
+{% highlight scala %}
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.linalg._
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.stat.Statistics
+import org.apache.spark.mllib.stat.test.ChiSqTestResult
+import org.apache.spark.rdd.RDD
+
+val sc: SparkContext = ...
+
+val vec: Vector = ... // a vector composed of the frequencies of events
+
+// compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
+// the test runs against a uniform distribution.
+val goodnessOfFitTestResult = Statistics.chiSqTest(vec)
+println(goodnessOfFitTestResult) // summary of the test including the p-value, degrees of freedom,
+                                 // test statistic, the method used, and the null hypothesis.
+
+val mat: Matrix = ... // a contingency matrix
+
+// conduct Pearson's independence test on the input contingency matrix
+val independenceTestResult = Statistics.chiSqTest(mat)
+println(independenceTestResult) // summary of the test including the p-value, degrees of freedom...
+
+val obs: RDD[LabeledPoint] = ... // (feature, label) pairs.
+
+// The contingency table is constructed from the raw (feature, label) pairs and used to conduct
+// the independence test. Returns an array containing the ChiSquaredTestResult for every feature
+// against the label.
+val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs)
+var i: Integer = 1
+featureTestResults.foreach{ result =>
+  println(s"Column $i: \n$result")
+  i += 1
+} // summary of the test
+
+{% endhighlight %}
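
To make the three input types concrete, a minimal self-contained sketch along these lines should work; the observed counts, the contingency matrix, and the labeled points are made-up values, and only the `pValue` field of the result is read directly here (printing the full result shows the p-value, degrees of freedom, statistic, method, and null hypothesis):

{% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics

object ChiSqExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "ChiSqExample")

    // Goodness of fit: observed frequencies vs. a uniform distribution (the default).
    val observed = Vectors.dense(4.0, 6.0, 5.0, 5.0)
    val goodnessOfFitTestResult = Statistics.chiSqTest(observed)
    println(goodnessOfFitTestResult.pValue) // large p-value: no evidence against uniformity

    // Independence: a 3x2 contingency matrix, given in column-major order.
    val contingency = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
    println(Statistics.chiSqTest(contingency))

    // Feature selection: one test per feature column against the label.
    val obs = sc.parallelize(Seq(
      LabeledPoint(0.0, Vectors.dense(0.0, 1.0)),
      LabeledPoint(0.0, Vectors.dense(0.0, 2.0)),
      LabeledPoint(1.0, Vectors.dense(1.0, 1.0)),
      LabeledPoint(1.0, Vectors.dense(1.0, 2.0))))
    Statistics.chiSqTest(obs).zipWithIndex.foreach { case (result, i) =>
      println(s"Column ${i + 1}:\n$result")
    }

    sc.stop()
  }
}
{% endhighlight %}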
+ +
+[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to +run Pearson's chi-squared tests. The following example demonstrates how to run and interpret +hypothesis tests. + +{% highlight java %} +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.linalg.*; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.stat.Statistics; +import org.apache.spark.mllib.stat.test.ChiSqTestResult; + +JavaSparkContext jsc = ... + +Vector vec = ... // a vector composed of the frequencies of events + +// compute the goodness of fit. If a second vector to test against is not supplied as a parameter, +// the test runs against a uniform distribution. +ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec); +// summary of the test including the p-value, degrees of freedom, test statistic, the method used, +// and the null hypothesis. +System.out.println(goodnessOfFitTestResult); + +Matrix mat = ... // a contingency matrix + +// conduct Pearson's independence test on the input contingency matrix +ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat); +// summary of the test including the p-value, degrees of freedom... +System.out.println(independenceTestResult); + +JavaRDD obs = ... // (feature, label) pairs. + +// The contingency table is constructed from the raw (feature, label) pairs and used to conduct +// the independence test. Returns an array containing the ChiSquaredTestResult for every feature +// against the label. +ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd()); +int i = 1; +for (ChiSqTestResult individualSummary:featureTestResults){ + System.out.println("Column " + i++ + ":"); + System.out.println(individualSummary); // summary of the test +} + +{% endhighlight %} +
+ +
From 213fe3f31f708ff0ee56d56e36644b51c0bba56e Mon Sep 17 00:00:00 2001 From: Burak Date: Mon, 25 Aug 2014 23:28:13 -0700 Subject: [PATCH 2/2] [SPARK-2839][MLlib] Modifications made according to review --- docs/mllib-stats.md | 218 ++++++++++++++++++++++++-------------------- 1 file changed, 120 insertions(+), 98 deletions(-) diff --git a/docs/mllib-stats.md b/docs/mllib-stats.md index e4b5bcbfa110e..a1926657adb96 100644 --- a/docs/mllib-stats.md +++ b/docs/mllib-stats.md @@ -25,6 +25,87 @@ displayTitle: MLlib - Statistics Functionality \newcommand{\zero}{\mathbf{0}} \]` +## Summary Statistics + +### Multivariate summary statistics + +We provide column summary statistics for `RDD[Vector]` through the function `colStats` +available in `Statistics`. + +
+
+ +[`colStats()`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) returns an instance of +[`MultivariateStatisticalSummary`](api/scala/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary), +which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the +total count. + +{% highlight scala %} +import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} + +val observations: RDD[Vector] = ... // an RDD of Vectors + +// Compute column summary statistics. +val summary: MultivariateStatisticalSummary = Statistics.colStats(observations) +println(summary.mean) // a dense vector containing the mean value for each column +println(summary.variance) // column-wise variance +println(summary.numNonzeros) // number of nonzeros in each column + +{% endhighlight %} +
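
For readers who want to run the snippet end to end, a minimal sketch with made-up observations (the local context, object name, and values are illustrative assumptions) could look like:

{% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.rdd.RDD

object SummaryStatisticsExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "SummaryStatisticsExample")

    val observations: RDD[Vector] = sc.parallelize(Seq(
      Vectors.dense(1.0, 10.0, 100.0),
      Vectors.dense(2.0, 20.0, 200.0),
      Vectors.dense(3.0, 30.0, 300.0)))

    // Compute column summary statistics.
    val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
    println(summary.mean)        // [2.0, 20.0, 200.0]
    println(summary.variance)    // [1.0, 100.0, 10000.0] (sample variance per column)
    println(summary.max)         // [3.0, 30.0, 300.0]
    println(summary.min)         // [1.0, 10.0, 100.0]
    println(summary.numNonzeros) // [3.0, 3.0, 3.0]
    println(summary.count)       // 3

    sc.stop()
  }
}
{% endhighlight %}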
+ +
+ +[`colStats()`](api/java/org/apache/spark/mllib/stat/Statistics.html) returns an instance of +[`MultivariateStatisticalSummary`](api/java/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.html), +which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the +total count. + +{% highlight java %} +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.stat.MultivariateStatisticalSummary; +import org.apache.spark.mllib.stat.Statistics; + +JavaSparkContext jsc = ... + +JavaRDD mat = ... // an RDD of Vectors + +// Compute column summary statistics. +MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd()); +System.out.println(summary.mean()); // a dense vector containing the mean value for each column +System.out.println(summary.variance()); // column-wise variance +System.out.println(summary.numNonzeros()); // number of nonzeros in each column + +{% endhighlight %} +
+ +
+[`colStats()`](api/python/pyspark.mllib.stat.Statistics-class.html) returns an instance of +[`MultivariateStatisticalSummary`](api/python/pyspark.mllib.stat.MultivariateStatisticalSummary-class.html), +which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the +total count. + +{% highlight python %} +from pyspark.mllib.stat import Statistics + +sc = ... # SparkContext + +mat = ... # an RDD of Vectors + +# Compute column summary statistics. +summary = Statistics.colStats(mat) +print summary.mean() +print summary.variance() +print summary.numNonzeros() + +{% endhighlight %} +
+ +
+ ## Random data generation Random data generation is useful for randomized algorithms, prototyping, and performance testing. @@ -99,11 +180,11 @@ v = u.map(lambda x: 1.0 + 2.0 * x) -## Correlation Calculation +## Correlations calculation Calculating the correlation between two series of data is a common operation in Statistics. In MLlib -we provide the flexibility to calculate correlation between many series. The supported correlation -methods are currently Pearson's and Spearman's correlation. +we provide the flexibility to calculate pairwise correlations among many series. The supported +correlation methods are currently Pearson's and Spearman's correlation.
@@ -164,13 +245,10 @@ Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
-[`Statistics`](api/python/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to +[`Statistics`](api/python/pyspark.mllib.stat.Statistics-class.html) provides methods to calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` respectively. -Support for `RowMatrix` operations in python currently don't exist, but will be added in future -releases. - {% highlight python %} from pyspark.mllib.stat import Statistics @@ -193,14 +271,15 @@ print Statistics.corr(data, method="pearson")
-## Stratified Sampling
+## Stratified sampling

 Unlike the other statistics functions, which reside in MLlib, stratified sampling methods,
-`sampleByKey` and `sampleByKeyExact`, can be accessed in `PairRDDFunctions` in core, as stratified
-sampling is tightly coupled with the PairRDD data type, and the function signature conforms to the
-other *ByKey* methods in PairRDDFunctions. A separate method for exact sample size support exists
-as it requires significantly more resources than the per-stratum simple random sampling used in
-`sampleByKey`.
+`sampleByKey` and `sampleByKeyExact`, can be performed on RDDs of key-value pairs. For stratified
+sampling, the keys can be thought of as a label and the value as a specific attribute. For example,
+the key can be man or woman, or document ids, and the respective values can be the list of ages
+of the people in the population or the list of words in the documents. A separate method for exact
+sample size support exists as it requires significantly more resources than the per-stratum simple
+random sampling used in `sampleByKey`. `sampleByKeyExact` is currently not supported in Python.

@@ -212,15 +291,16 @@ size, whereas sampling with replacement requires two additional passes. {% highlight scala %} import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ import org.apache.spark.rdd.PairRDDFunctions val sc: SparkContext = ... -val data = ... // an RDD[(K,V)] of any key value pairs +val data = ... // an RDD[(K, V)] of any key value pairs val fractions: Map[K, Double] = ... // specify the exact fraction desired from each key // Get an exact sample from each stratum -val sample = data.sampleByKeyExact(withReplacement=false, fractions) +val sample = data.sampleByKeyExact(withReplacement = false, fractions) {% endhighlight %}
@@ -233,107 +313,48 @@ Sampling without replacement requires one additional pass over the RDD to guaran size, whereas sampling with replacement requires two additional passes. {% highlight java %} +import java.util.Map; + import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaSparkContext; JavaSparkContext jsc = ... -JavaPairRDD data = ... // an RDD of any key value pairs -java.util.Map fractions = ... // specify the exact fraction desired from each key +JavaPairRDD data = ... // an RDD of any key value pairs +Map fractions = ... // specify the exact fraction desired from each key // Get an exact sample from each stratum -JavaPairRDD sample = data.sampleByKeyExact(false, fractions); +JavaPairRDD sample = data.sampleByKeyExact(false, fractions); {% endhighlight %}
+
+[`sampleByKey()`](api/python/pyspark.rdd.RDD-class.html#sampleByKey) allows users to +sample approximately $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the +desired fraction for key $k$, and $n_k$ is the number of key-value pairs for key $k$. +Sampling without replacement requires one additional pass over the RDD to guarantee sample +size, whereas sampling with replacement requires two additional passes. -
- -## Summary Statistics - -### Multivariate summary statistics - -We provide column summary statistics for `RDD[Vector]` through the static function `colStats` -available in `Statistics`. - -
-
- -[`colStats()`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) returns an instance of -[`MultivariateStatisticalSummary`](api/scala/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary), -which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the -total count. - -{% highlight scala %} -import org.apache.spark.mllib.linalg.Vector -import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} - -val mat: RDD[Vector] = ... // an RDD of Vectors - -// Compute column summary statistics. -val summary: MultivariateStatisticalSummary = Statistics.colStats(mat) -println(summary.mean) // a dense vector containing the mean value for each column -println(summary.variance) // column-wise variance -println(summary.numNonzeros) // number of nonzeros in each column - -{% endhighlight %} -
- -
- -[`colStats()`](api/java/org/apache/spark/mllib/stat/Statistics.html) returns an instance of -[`MultivariateStatisticalSummary`](api/java/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.html), -which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the -total count. - -{% highlight java %} -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.stat.MultivariateStatisticalSummary; -import org.apache.spark.mllib.stat.Statistics; - -JavaSparkContext jsc = ... - -JavaRDD mat = ... // an RDD of Vectors - -// Compute column summary statistics. -MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd()); -System.out.println(summary.mean()); // a dense vector containing the mean value for each column -System.out.println(summary.variance()); // column-wise variance -System.out.println(summary.numNonzeros()); // number of nonzeros in each column - -{% endhighlight %} -
+*Note:* `sampleByKeyExact()` is currently not supported in Python. -
-[`colStats()`](api/python/index.html#org.apache.spark.mllib.stat.Statistics$) returns an instance of -[`MultivariateStatisticalSummary`](api/python/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary$), -which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the -total count. {% highlight python %} -from pyspark.mllib.stat import Statistics sc = ... # SparkContext -mat = ... # an RDD of Vectors +data = ... # an RDD of any key value pairs +fractions = ... # specify the exact fraction desired from each key as a dictionary -# Compute column summary statistics. -summary = Statistics.colStats(mat) -print summary.mean() -print summary.variance() -print summary.numNonzeros() +sample = data.sampleByKeyExact(False, fractions); {% endhighlight %}
+## Hypothesis testing -## Hypothesis Testing - -Hypothesis testing is a power tool in statistics to determine whether a result is statistically +Hypothesis testing is a powerful tool in statistics to determine whether a result is statistically significant, whether this result occurred by chance or not. MLlib currently supports Pearson's chi-squared ( $\chi^2$) tests for goodness of fit and independence. The input data types determine whether the goodness of fit or the independence test is conducted. The goodness of fit test requires @@ -376,9 +397,9 @@ val obs: RDD[LabeledPoint] = ... // (feature, label) pairs. // the independence test. Returns an array containing the ChiSquaredTestResult for every feature // against the label. val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs) -var i: Integer = 1 -featureTestResults.foreach{ result => - println(s"Column $i: \n$result") +var i = 1 +featureTestResults.foreach { result => + println(s"Column $i:\n$result") i += 1 } // summary of the test @@ -416,16 +437,17 @@ ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat); // summary of the test including the p-value, degrees of freedom... System.out.println(independenceTestResult); -JavaRDD obs = ... // (feature, label) pairs. +JavaRDD obs = ... // an RDD of labeled points // The contingency table is constructed from the raw (feature, label) pairs and used to conduct // the independence test. Returns an array containing the ChiSquaredTestResult for every feature // against the label. ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd()); int i = 1; -for (ChiSqTestResult individualSummary:featureTestResults){ - System.out.println("Column " + i++ + ":"); - System.out.println(individualSummary); // summary of the test +for (ChiSqTestResult result : featureTestResults) { + System.out.println("Column " + i + ":"); + System.out.println(result); // summary of the test + i++; } {% endhighlight %}