From fec4d9d8b9569000125e5e3778d8a5521f4f0b72 Mon Sep 17 00:00:00 2001 From: Burak Date: Mon, 25 Aug 2014 18:44:43 -0700 Subject: [PATCH 1/2] [SPARK-2830][MLlib] Stats Toolkit documentation updated --- docs/mllib-stats.md | 311 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 289 insertions(+), 22 deletions(-) diff --git a/docs/mllib-stats.md b/docs/mllib-stats.md index f25dca746ba3a..e4b5bcbfa110e 100644 --- a/docs/mllib-stats.md +++ b/docs/mllib-stats.md @@ -99,69 +99,336 @@ v = u.map(lambda x: 1.0 + 2.0 * x) -## Stratified Sampling +## Correlation Calculation + +Calculating the correlation between two series of data is a common operation in Statistics. In MLlib +we provide the flexibility to calculate correlation between many series. The supported correlation +methods are currently Pearson's and Spearman's correlation. + +
+
+[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to +calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or +an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` respectively. + +{% highlight scala %} +import org.apache.spark.SparkContext +import org.apache.spark.mllib.linalg._ +import org.apache.spark.mllib.stat.Statistics + +val sc: SparkContext = ... + +val seriesX: RDD[Double] = ... // a series +val seriesY: RDD[Double] = ... // must have the same number of partitions and cardinality as seriesX + +// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a +// method is not specified, Pearson's method will be used by default. +val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson") + +val data: RDD[Vector] = ... // note that each Vector is a row and not a column + +// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. +// If a method is not specified, Pearson's method will be used by default. +val correlMatrix: Matrix = Statistics.corr(data, "pearson") + +{% endhighlight %} +
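
The snippet above elides the `SparkContext` setup and the input RDDs. A minimal self-contained sketch, assuming a local `SparkContext` and made-up input values (the object name and data are illustrative only, not part of the patch), might look like this:

{% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

object CorrelationExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "CorrelationExample")

    // Two series with the same number of partitions and cardinality.
    val seriesX: RDD[Double] = sc.parallelize(Array(1.0, 2.0, 3.0, 4.0, 5.0))
    val seriesY: RDD[Double] = sc.parallelize(Array(11.0, 22.0, 33.0, 44.0, 55.0))

    // Pearson's method is the default; pass "spearman" for Spearman's rank correlation.
    val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")
    println(correlation) // essentially 1.0, since seriesY is a linear function of seriesX

    // Each Vector is a row (one observation), not a column.
    val data: RDD[Vector] = sc.parallelize(Seq(
      Vectors.dense(1.0, 10.0, 100.0),
      Vectors.dense(2.0, 20.0, 200.0),
      Vectors.dense(5.0, 33.0, 366.0)))
    println(Statistics.corr(data, "pearson")) // a 3x3 correlation Matrix

    sc.stop()
  }
}
{% endhighlight %}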
+ +
+[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to +calculate correlations between series. Depending on the type of input, two `JavaDoubleRDD`s or +a `JavaRDD`, the output will be a `Double` or the correlation `Matrix` respectively. + +{% highlight java %} +import org.apache.spark.api.java.JavaDoubleRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.linalg.*; +import org.apache.spark.mllib.stat.Statistics; + +JavaSparkContext jsc = ... + +JavaDoubleRDD seriesX = ... // a series +JavaDoubleRDD seriesY = ... // must have the same number of partitions and cardinality as seriesX + +// compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a +// method is not specified, Pearson's method will be used by default. +Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson"); + +JavaRDD data = ... // note that each Vector is a row and not a column + +// calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. +// If a method is not specified, Pearson's method will be used by default. +Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson"); + +{% endhighlight %} +
+ +
+[`Statistics`](api/python/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to +calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or +an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` respectively. + +Support for `RowMatrix` operations in python currently don't exist, but will be added in future +releases. + +{% highlight python %} +from pyspark.mllib.stat import Statistics + +sc = ... # SparkContext + +seriesX = ... # a series +seriesY = ... # must have the same number of partitions and cardinality as seriesX + +# Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method. If a +# method is not specified, Pearson's method will be used by default. +print Statistics.corr(seriesX, seriesY, method="pearson") + +data = ... # an RDD of Vectors +# calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method. +# If a method is not specified, Pearson's method will be used by default. +print Statistics.corr(data, method="pearson") + +{% endhighlight %} +
+ +
+
+## Stratified Sampling
+
+Unlike the other statistics functions, which reside in MLlib, stratified sampling methods,
+`sampleByKey` and `sampleByKeyExact`, can be accessed in `PairRDDFunctions` in core, as stratified
+sampling is tightly coupled with the PairRDD data type, and the function signature conforms to the
+other *ByKey* methods in PairRDDFunctions. A separate method for exact sample size support exists
+as it requires significantly more resources than the per-stratum simple random sampling used in
+`sampleByKey`.
+
+
+[`sampleByKeyExact()`](api/scala/index.html#org.apache.spark.rdd.PairRDDFunctions) allows users to +sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired +fraction for key $k$, and $n_k$ is the number of key-value pairs for key $k$. +Sampling without replacement requires one additional pass over the RDD to guarantee sample +size, whereas sampling with replacement requires two additional passes. + +{% highlight scala %} +import org.apache.spark.SparkContext +import org.apache.spark.rdd.PairRDDFunctions + +val sc: SparkContext = ... + +val data = ... // an RDD[(K,V)] of any key value pairs +val fractions: Map[K, Double] = ... // specify the exact fraction desired from each key + +// Get an exact sample from each stratum +val sample = data.sampleByKeyExact(withReplacement=false, fractions) + +{% endhighlight %} +
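
As a concrete illustration of the exact-size guarantee, a self-contained sketch could look like the following; the keys, values, and per-key fractions here are invented for the example:

{% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._ // brings PairRDDFunctions into scope

object StratifiedSamplingExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "StratifiedSamplingExample")

    // Strata are keyed by String; the values are arbitrary payloads.
    val data = sc.parallelize(Seq(
      ("a", 1), ("a", 2), ("a", 3), ("a", 4),
      ("b", 5), ("b", 6), ("b", 7), ("b", 8), ("b", 9), ("b", 10)))

    // Sample half of stratum "a" and 30% of stratum "b".
    val fractions = Map("a" -> 0.5, "b" -> 0.3)

    // Exact sample sizes: ceil(0.5 * 4) = 2 items for "a", ceil(0.3 * 6) = 2 items for "b".
    val exactSample = data.sampleByKeyExact(withReplacement = false, fractions)
    println(exactSample.countByKey()) // Map(a -> 2, b -> 2)

    // sampleByKey only approximates the requested per-key fractions,
    // but avoids the extra pass(es) over the data.
    val approxSample = data.sampleByKey(withReplacement = false, fractions)
    println(approxSample.countByKey())

    sc.stop()
  }
}
{% endhighlight %}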
+ +
+[`sampleByKeyExact()`](api/java/org/apache/spark/api/java/JavaPairRDD.html) allows users to +sample exactly $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the desired +fraction for key $k$, and $n_k$ is the number of key-value pairs for key $k$. +Sampling without replacement requires one additional pass over the RDD to guarantee sample +size, whereas sampling with replacement requires two additional passes. + +{% highlight java %} +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaSparkContext; + +JavaSparkContext jsc = ... + +JavaPairRDD data = ... // an RDD of any key value pairs +java.util.Map fractions = ... // specify the exact fraction desired from each key + +// Get an exact sample from each stratum +JavaPairRDD sample = data.sampleByKeyExact(false, fractions); + +{% endhighlight %} +
+ +
## Summary Statistics ### Multivariate summary statistics -We provide column summary statistics for `RowMatrix` (note: this functionality is not currently supported in `IndexedRowMatrix` or `CoordinateMatrix`). -If the number of columns is not large, e.g., on the order of thousands, then the -covariance matrix can also be computed as a local matrix, which requires $\mathcal{O}(n^2)$ storage where $n$ is the -number of columns. The total CPU time is $\mathcal{O}(m n^2)$, where $m$ is the number of rows, -and is faster if the rows are sparse. +We provide column summary statistics for `RDD[Vector]` through the static function `colStats` +available in `Statistics`.
-[`computeColumnSummaryStatistics()`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.RowMatrix) returns an instance of +[`colStats()`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) returns an instance of [`MultivariateStatisticalSummary`](api/scala/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary), which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the total count. {% highlight scala %} -import org.apache.spark.mllib.linalg.Matrix -import org.apache.spark.mllib.linalg.distributed.RowMatrix -import org.apache.spark.mllib.stat.MultivariateStatisticalSummary +import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} -val mat: RowMatrix = ... // a RowMatrix +val mat: RDD[Vector] = ... // an RDD of Vectors // Compute column summary statistics. -val summary: MultivariateStatisticalSummary = mat.computeColumnSummaryStatistics() +val summary: MultivariateStatisticalSummary = Statistics.colStats(mat) println(summary.mean) // a dense vector containing the mean value for each column println(summary.variance) // column-wise variance println(summary.numNonzeros) // number of nonzeros in each column -// Compute the covariance matrix. -val cov: Matrix = mat.computeCovariance() {% endhighlight %}
-[`RowMatrix#computeColumnSummaryStatistics`](api/java/org/apache/spark/mllib/linalg/distributed/RowMatrix.html#computeColumnSummaryStatistics()) returns an instance of +[`colStats()`](api/java/org/apache/spark/mllib/stat/Statistics.html) returns an instance of [`MultivariateStatisticalSummary`](api/java/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.html), which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the total count. {% highlight java %} -import org.apache.spark.mllib.linalg.Matrix; -import org.apache.spark.mllib.linalg.distributed.RowMatrix; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.stat.MultivariateStatisticalSummary; +import org.apache.spark.mllib.stat.Statistics; + +JavaSparkContext jsc = ... -RowMatrix mat = ... // a RowMatrix +JavaRDD mat = ... // an RDD of Vectors // Compute column summary statistics. -MultivariateStatisticalSummary summary = mat.computeColumnSummaryStatistics(); +MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd()); System.out.println(summary.mean()); // a dense vector containing the mean value for each column System.out.println(summary.variance()); // column-wise variance System.out.println(summary.numNonzeros()); // number of nonzeros in each column -// Compute the covariance matrix. -Matrix cov = mat.computeCovariance(); {% endhighlight %}
+ +
+[`colStats()`](api/python/index.html#org.apache.spark.mllib.stat.Statistics$) returns an instance of +[`MultivariateStatisticalSummary`](api/python/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary$), +which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the +total count. + +{% highlight python %} +from pyspark.mllib.stat import Statistics + +sc = ... # SparkContext + +mat = ... # an RDD of Vectors + +# Compute column summary statistics. +summary = Statistics.colStats(mat) +print summary.mean() +print summary.variance() +print summary.numNonzeros() + +{% endhighlight %}
+
+ + +## Hypothesis Testing + +Hypothesis testing is a power tool in statistics to determine whether a result is statistically +significant, whether this result occurred by chance or not. MLlib currently supports Pearson's +chi-squared ( $\chi^2$) tests for goodness of fit and independence. The input data types determine +whether the goodness of fit or the independence test is conducted. The goodness of fit test requires +an input type of `Vector`, whereas the independence test requires a `Matrix` as input. + +MLlib also supports the input type `RDD[LabeledPoint]` to enable feature selection via chi-squared +independence tests. + +
+
+[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to
+run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
+hypothesis tests.
+
+{% highlight scala %}
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.linalg._
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.stat.Statistics
+import org.apache.spark.mllib.stat.test.ChiSqTestResult
+import org.apache.spark.rdd.RDD
+
+val sc: SparkContext = ...
+
+val vec: Vector = ... // a vector composed of the frequencies of events
+
+// compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
+// the test runs against a uniform distribution.
+val goodnessOfFitTestResult = Statistics.chiSqTest(vec)
+println(goodnessOfFitTestResult) // summary of the test including the p-value, degrees of freedom,
+                                 // test statistic, the method used, and the null hypothesis.
+
+val mat: Matrix = ... // a contingency matrix
+
+// conduct Pearson's independence test on the input contingency matrix
+val independenceTestResult = Statistics.chiSqTest(mat)
+println(independenceTestResult) // summary of the test including the p-value, degrees of freedom...
+
+val obs: RDD[LabeledPoint] = ... // (feature, label) pairs.
+
+// The contingency table is constructed from the raw (feature, label) pairs and used to conduct
+// the independence test. Returns an array containing the ChiSquaredTestResult for every feature
+// against the label.
+val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs)
+var i: Integer = 1
+featureTestResults.foreach{ result =>
+  println(s"Column $i: \n$result")
+  i += 1
+} // summary of the test
+
+{% endhighlight %}
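
To make the three input types concrete, a minimal self-contained sketch along these lines should work; the observed counts, the contingency matrix, and the labeled points are made-up values, and only the `pValue` field of the result is read directly here (printing the full result shows the p-value, degrees of freedom, statistic, method, and null hypothesis):

{% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics

object ChiSqExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "ChiSqExample")

    // Goodness of fit: observed frequencies vs. a uniform distribution (the default).
    val observed = Vectors.dense(4.0, 6.0, 5.0, 5.0)
    val goodnessOfFitTestResult = Statistics.chiSqTest(observed)
    println(goodnessOfFitTestResult.pValue) // large p-value: no evidence against uniformity

    // Independence: a 3x2 contingency matrix, given in column-major order.
    val contingency = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
    println(Statistics.chiSqTest(contingency))

    // Feature selection: one test per feature column against the label.
    val obs = sc.parallelize(Seq(
      LabeledPoint(0.0, Vectors.dense(0.0, 1.0)),
      LabeledPoint(0.0, Vectors.dense(0.0, 2.0)),
      LabeledPoint(1.0, Vectors.dense(1.0, 1.0)),
      LabeledPoint(1.0, Vectors.dense(1.0, 2.0))))
    Statistics.chiSqTest(obs).zipWithIndex.foreach { case (result, i) =>
      println(s"Column ${i + 1}:\n$result")
    }

    sc.stop()
  }
}
{% endhighlight %}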
+ +
+[`Statistics`](api/java/org/apache/spark/mllib/stat/Statistics.html) provides methods to +run Pearson's chi-squared tests. The following example demonstrates how to run and interpret +hypothesis tests. + +{% highlight java %} +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.linalg.*; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.stat.Statistics; +import org.apache.spark.mllib.stat.test.ChiSqTestResult; + +JavaSparkContext jsc = ... + +Vector vec = ... // a vector composed of the frequencies of events + +// compute the goodness of fit. If a second vector to test against is not supplied as a parameter, +// the test runs against a uniform distribution. +ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(vec); +// summary of the test including the p-value, degrees of freedom, test statistic, the method used, +// and the null hypothesis. +System.out.println(goodnessOfFitTestResult); + +Matrix mat = ... // a contingency matrix + +// conduct Pearson's independence test on the input contingency matrix +ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat); +// summary of the test including the p-value, degrees of freedom... +System.out.println(independenceTestResult); + +JavaRDD obs = ... // (feature, label) pairs. + +// The contingency table is constructed from the raw (feature, label) pairs and used to conduct +// the independence test. Returns an array containing the ChiSquaredTestResult for every feature +// against the label. +ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd()); +int i = 1; +for (ChiSqTestResult individualSummary:featureTestResults){ + System.out.println("Column " + i++ + ":"); + System.out.println(individualSummary); // summary of the test +} + +{% endhighlight %} +
+ +
From 213fe3f31f708ff0ee56d56e36644b51c0bba56e Mon Sep 17 00:00:00 2001 From: Burak Date: Mon, 25 Aug 2014 23:28:13 -0700 Subject: [PATCH 2/2] [SPARK-2839][MLlib] Modifications made according to review --- docs/mllib-stats.md | 218 ++++++++++++++++++++++++-------------------- 1 file changed, 120 insertions(+), 98 deletions(-) diff --git a/docs/mllib-stats.md b/docs/mllib-stats.md index e4b5bcbfa110e..a1926657adb96 100644 --- a/docs/mllib-stats.md +++ b/docs/mllib-stats.md @@ -25,6 +25,87 @@ displayTitle: MLlib - Statistics Functionality \newcommand{\zero}{\mathbf{0}} \]` +## Summary Statistics + +### Multivariate summary statistics + +We provide column summary statistics for `RDD[Vector]` through the function `colStats` +available in `Statistics`. + +
+
+ +[`colStats()`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) returns an instance of +[`MultivariateStatisticalSummary`](api/scala/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary), +which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the +total count. + +{% highlight scala %} +import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} + +val observations: RDD[Vector] = ... // an RDD of Vectors + +// Compute column summary statistics. +val summary: MultivariateStatisticalSummary = Statistics.colStats(observations) +println(summary.mean) // a dense vector containing the mean value for each column +println(summary.variance) // column-wise variance +println(summary.numNonzeros) // number of nonzeros in each column + +{% endhighlight %} +
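
For readers who want to run the snippet end to end, a minimal sketch with made-up observations (the local context, object name, and values are illustrative assumptions) could look like:

{% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.rdd.RDD

object SummaryStatisticsExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "SummaryStatisticsExample")

    val observations: RDD[Vector] = sc.parallelize(Seq(
      Vectors.dense(1.0, 10.0, 100.0),
      Vectors.dense(2.0, 20.0, 200.0),
      Vectors.dense(3.0, 30.0, 300.0)))

    // Compute column summary statistics.
    val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
    println(summary.mean)        // [2.0, 20.0, 200.0]
    println(summary.variance)    // [1.0, 100.0, 10000.0] (sample variance per column)
    println(summary.max)         // [3.0, 30.0, 300.0]
    println(summary.min)         // [1.0, 10.0, 100.0]
    println(summary.numNonzeros) // [3.0, 3.0, 3.0]
    println(summary.count)       // 3

    sc.stop()
  }
}
{% endhighlight %}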
+ +
+ +[`colStats()`](api/java/org/apache/spark/mllib/stat/Statistics.html) returns an instance of +[`MultivariateStatisticalSummary`](api/java/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.html), +which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the +total count. + +{% highlight java %} +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.stat.MultivariateStatisticalSummary; +import org.apache.spark.mllib.stat.Statistics; + +JavaSparkContext jsc = ... + +JavaRDD mat = ... // an RDD of Vectors + +// Compute column summary statistics. +MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd()); +System.out.println(summary.mean()); // a dense vector containing the mean value for each column +System.out.println(summary.variance()); // column-wise variance +System.out.println(summary.numNonzeros()); // number of nonzeros in each column + +{% endhighlight %} +
+ +
+[`colStats()`](api/python/pyspark.mllib.stat.Statistics-class.html) returns an instance of +[`MultivariateStatisticalSummary`](api/python/pyspark.mllib.stat.MultivariateStatisticalSummary-class.html), +which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the +total count. + +{% highlight python %} +from pyspark.mllib.stat import Statistics + +sc = ... # SparkContext + +mat = ... # an RDD of Vectors + +# Compute column summary statistics. +summary = Statistics.colStats(mat) +print summary.mean() +print summary.variance() +print summary.numNonzeros() + +{% endhighlight %} +
+ +
+ ## Random data generation Random data generation is useful for randomized algorithms, prototyping, and performance testing. @@ -99,11 +180,11 @@ v = u.map(lambda x: 1.0 + 2.0 * x) -## Correlation Calculation +## Correlations calculation Calculating the correlation between two series of data is a common operation in Statistics. In MLlib -we provide the flexibility to calculate correlation between many series. The supported correlation -methods are currently Pearson's and Spearman's correlation. +we provide the flexibility to calculate pairwise correlations among many series. The supported +correlation methods are currently Pearson's and Spearman's correlation.
@@ -164,13 +245,10 @@ Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
-[`Statistics`](api/python/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to +[`Statistics`](api/python/pyspark.mllib.stat.Statistics-class.html) provides methods to calculate correlations between series. Depending on the type of input, two `RDD[Double]`s or an `RDD[Vector]`, the output will be a `Double` or the correlation `Matrix` respectively. -Support for `RowMatrix` operations in python currently don't exist, but will be added in future -releases. - {% highlight python %} from pyspark.mllib.stat import Statistics @@ -193,14 +271,15 @@ print Statistics.corr(data, method="pearson")
-## Stratified Sampling
+## Stratified sampling

 Unlike the other statistics functions, which reside in MLlib, stratified sampling methods,
-`sampleByKey` and `sampleByKeyExact`, can be accessed in `PairRDDFunctions` in core, as stratified
-sampling is tightly coupled with the PairRDD data type, and the function signature conforms to the
-other *ByKey* methods in PairRDDFunctions. A separate method for exact sample size support exists
-as it requires significantly more resources than the per-stratum simple random sampling used in
-`sampleByKey`.
+`sampleByKey` and `sampleByKeyExact`, can be performed on RDDs of key-value pairs. For stratified
+sampling, the keys can be thought of as a label and the value as a specific attribute. For example,
+the key can be man or woman, or document ids, and the respective values can be the list of ages
+of the people in the population or the list of words in the documents. A separate method for exact
+sample size support exists as it requires significantly more resources than the per-stratum simple
+random sampling used in `sampleByKey`. `sampleByKeyExact` is currently not supported in Python.

@@ -212,15 +291,16 @@ size, whereas sampling with replacement requires two additional passes. {% highlight scala %} import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ import org.apache.spark.rdd.PairRDDFunctions val sc: SparkContext = ... -val data = ... // an RDD[(K,V)] of any key value pairs +val data = ... // an RDD[(K, V)] of any key value pairs val fractions: Map[K, Double] = ... // specify the exact fraction desired from each key // Get an exact sample from each stratum -val sample = data.sampleByKeyExact(withReplacement=false, fractions) +val sample = data.sampleByKeyExact(withReplacement = false, fractions) {% endhighlight %}
@@ -233,107 +313,48 @@ Sampling without replacement requires one additional pass over the RDD to guaran size, whereas sampling with replacement requires two additional passes. {% highlight java %} +import java.util.Map; + import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaSparkContext; JavaSparkContext jsc = ... -JavaPairRDD data = ... // an RDD of any key value pairs -java.util.Map fractions = ... // specify the exact fraction desired from each key +JavaPairRDD data = ... // an RDD of any key value pairs +Map fractions = ... // specify the exact fraction desired from each key // Get an exact sample from each stratum -JavaPairRDD sample = data.sampleByKeyExact(false, fractions); +JavaPairRDD sample = data.sampleByKeyExact(false, fractions); {% endhighlight %}
+
+[`sampleByKey()`](api/python/pyspark.rdd.RDD-class.html#sampleByKey) allows users to +sample approximately $\lceil f_k \cdot n_k \rceil \, \forall k \in K$ items, where $f_k$ is the +desired fraction for key $k$, and $n_k$ is the number of key-value pairs for key $k$. +Sampling without replacement requires one additional pass over the RDD to guarantee sample +size, whereas sampling with replacement requires two additional passes. -
- -## Summary Statistics - -### Multivariate summary statistics - -We provide column summary statistics for `RDD[Vector]` through the static function `colStats` -available in `Statistics`. - -
-
- -[`colStats()`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) returns an instance of -[`MultivariateStatisticalSummary`](api/scala/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary), -which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the -total count. - -{% highlight scala %} -import org.apache.spark.mllib.linalg.Vector -import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} - -val mat: RDD[Vector] = ... // an RDD of Vectors - -// Compute column summary statistics. -val summary: MultivariateStatisticalSummary = Statistics.colStats(mat) -println(summary.mean) // a dense vector containing the mean value for each column -println(summary.variance) // column-wise variance -println(summary.numNonzeros) // number of nonzeros in each column - -{% endhighlight %} -
- -
- -[`colStats()`](api/java/org/apache/spark/mllib/stat/Statistics.html) returns an instance of -[`MultivariateStatisticalSummary`](api/java/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.html), -which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the -total count. - -{% highlight java %} -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.stat.MultivariateStatisticalSummary; -import org.apache.spark.mllib.stat.Statistics; - -JavaSparkContext jsc = ... - -JavaRDD mat = ... // an RDD of Vectors - -// Compute column summary statistics. -MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd()); -System.out.println(summary.mean()); // a dense vector containing the mean value for each column -System.out.println(summary.variance()); // column-wise variance -System.out.println(summary.numNonzeros()); // number of nonzeros in each column - -{% endhighlight %} -
+*Note:* `sampleByKeyExact()` is currently not supported in Python. -
-[`colStats()`](api/python/index.html#org.apache.spark.mllib.stat.Statistics$) returns an instance of -[`MultivariateStatisticalSummary`](api/python/index.html#org.apache.spark.mllib.stat.MultivariateStatisticalSummary$), -which contains the column-wise max, min, mean, variance, and number of nonzeros, as well as the -total count. {% highlight python %} -from pyspark.mllib.stat import Statistics sc = ... # SparkContext -mat = ... # an RDD of Vectors +data = ... # an RDD of any key value pairs +fractions = ... # specify the exact fraction desired from each key as a dictionary -# Compute column summary statistics. -summary = Statistics.colStats(mat) -print summary.mean() -print summary.variance() -print summary.numNonzeros() +sample = data.sampleByKeyExact(False, fractions); {% endhighlight %}
+## Hypothesis testing -## Hypothesis Testing - -Hypothesis testing is a power tool in statistics to determine whether a result is statistically +Hypothesis testing is a powerful tool in statistics to determine whether a result is statistically significant, whether this result occurred by chance or not. MLlib currently supports Pearson's chi-squared ( $\chi^2$) tests for goodness of fit and independence. The input data types determine whether the goodness of fit or the independence test is conducted. The goodness of fit test requires @@ -376,9 +397,9 @@ val obs: RDD[LabeledPoint] = ... // (feature, label) pairs. // the independence test. Returns an array containing the ChiSquaredTestResult for every feature // against the label. val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs) -var i: Integer = 1 -featureTestResults.foreach{ result => - println(s"Column $i: \n$result") +var i = 1 +featureTestResults.foreach { result => + println(s"Column $i:\n$result") i += 1 } // summary of the test @@ -416,16 +437,17 @@ ChiSqTestResult independenceTestResult = Statistics.chiSqTest(mat); // summary of the test including the p-value, degrees of freedom... System.out.println(independenceTestResult); -JavaRDD obs = ... // (feature, label) pairs. +JavaRDD obs = ... // an RDD of labeled points // The contingency table is constructed from the raw (feature, label) pairs and used to conduct // the independence test. Returns an array containing the ChiSquaredTestResult for every feature // against the label. ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(obs.rdd()); int i = 1; -for (ChiSqTestResult individualSummary:featureTestResults){ - System.out.println("Column " + i++ + ":"); - System.out.println(individualSummary); // summary of the test +for (ChiSqTestResult result : featureTestResults) { + System.out.println("Column " + i + ":"); + System.out.println(result); // summary of the test + i++; } {% endhighlight %}