From 377d7a9fff84929bc086471656a8ba47561e8b17 Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Mon, 9 Nov 2015 12:09:17 +0530 Subject: [PATCH 01/13] Initial commit: added all files --- docs/mllib-evaluation-metrics.md | 940 +----------------- .../mllib/JavaBinaryClassification.java | 113 +++ .../examples/mllib/JavaLinearRegression.java | 90 ++ .../mllib/JavaMultiLabelClassification.java | 77 ++ .../mllib/JavaMulticlassClassification.java | 92 ++ .../spark/examples/mllib/JavaRanking.java | 175 ++++ .../mllib/binary_classification_metrics.py | 63 ++ .../main/python/mllib/multi_class_metrics.py | 69 ++ .../main/python/mllib/multi_label_metrics.py | 63 ++ .../src/main/python/mllib/ranking_metrics.py | 54 + .../main/python/mllib/regression_metrics.py | 55 + .../mllib/BinaryClassificationMetrics.scala | 109 ++ .../spark/examples/mllib/MultiLabelMetrics.scala | 69 ++ .../examples/mllib/MulticlassMetrics.scala | 103 ++ .../spark/examples/mllib/RankingMetrics.scala | 102 ++ .../examples/mllib/RegressionMetrics.scala | 66 ++ 16 files changed, 1315 insertions(+), 925 deletions(-) create mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java create mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java create mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java create mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java create mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java create mode 100644 examples/src/main/python/mllib/binary_classification_metrics.py create mode 100644 examples/src/main/python/mllib/multi_class_metrics.py create mode 100644 examples/src/main/python/mllib/multi_label_metrics.py create mode 100644 examples/src/main/python/mllib/ranking_metrics.py create mode 100644 examples/src/main/python/mllib/regression_metrics.py create mode 100644 
examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala diff --git a/docs/mllib-evaluation-metrics.md b/docs/mllib-evaluation-metrics.md index f73eff637dc36..2991249161046 100644 --- a/docs/mllib-evaluation-metrics.md +++ b/docs/mllib-evaluation-metrics.md @@ -104,214 +104,21 @@ data, and evaluate the performance of the algorithm by several binary evaluation
Refer to the [`LogisticRegressionWithLBFGS` Scala docs](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS) and [`BinaryClassificationMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.BinaryClassificationMetrics) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS -import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.util.MLUtils - -// Load training data in LIBSVM format -val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt") - -// Split data into training (60%) and test (40%) -val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L) -training.cache() - -// Run training algorithm to build the model -val model = new LogisticRegressionWithLBFGS() - .setNumClasses(2) - .run(training) - -// Clear the prediction threshold so the model will return probabilities -model.clearThreshold - -// Compute raw scores on the test set -val predictionAndLabels = test.map { case LabeledPoint(label, features) => - val prediction = model.predict(features) - (prediction, label) -} - -// Instantiate metrics object -val metrics = new BinaryClassificationMetrics(predictionAndLabels) - -// Precision by threshold -val precision = metrics.precisionByThreshold -precision.foreach { case (t, p) => - println(s"Threshold: $t, Precision: $p") -} - -// Recall by threshold -val recall = metrics.recallByThreshold -recall.foreach { case (t, r) => - println(s"Threshold: $t, Recall: $r") -} - -// Precision-Recall Curve -val PRC = metrics.pr - -// F-measure -val f1Score = metrics.fMeasureByThreshold -f1Score.foreach { case (t, f) => - println(s"Threshold: $t, F-score: $f, Beta = 1") -} - -val beta = 0.5 -val fScore = metrics.fMeasureByThreshold(beta) -f1Score.foreach { case (t, f) => - println(s"Threshold: $t, F-score: $f, 
Beta = 0.5") -} - -// AUPRC -val auPRC = metrics.areaUnderPR -println("Area under precision-recall curve = " + auPRC) - -// Compute thresholds used in ROC and PR curves -val thresholds = precision.map(_._1) - -// ROC Curve -val roc = metrics.roc - -// AUROC -val auROC = metrics.areaUnderROC -println("Area under ROC = " + auROC) - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala %}
Refer to the [`LogisticRegressionModel` Java docs](api/java/org/apache/spark/mllib/classification/LogisticRegressionModel.html) and [`LogisticRegressionWithLBFGS` Java docs](api/java/org/apache/spark/mllib/classification/LogisticRegressionWithLBFGS.html) for details on the API. -{% highlight java %} -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.rdd.RDD; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.mllib.classification.LogisticRegressionModel; -import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS; -import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; - -public class BinaryClassification { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Binary Classification Metrics"); - SparkContext sc = new SparkContext(conf); - String path = "data/mllib/sample_binary_classification_data.txt"; - JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); - - // Split initial RDD into two... [60% training data, 40% testing data]. - JavaRDD[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L); - JavaRDD training = splits[0].cache(); - JavaRDD test = splits[1]; - - // Run training algorithm to build the model. - final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() - .setNumClasses(2) - .run(training.rdd()); - - // Clear the prediction threshold so the model will return probabilities - model.clearThreshold(); - - // Compute raw scores on the test set. - JavaRDD> predictionAndLabels = test.map( - new Function>() { - public Tuple2 call(LabeledPoint p) { - Double prediction = model.predict(p.features()); - return new Tuple2(prediction, p.label()); - } - } - ); - - // Get evaluation metrics. 
- BinaryClassificationMetrics metrics = new BinaryClassificationMetrics(predictionAndLabels.rdd()); - - // Precision by threshold - JavaRDD> precision = metrics.precisionByThreshold().toJavaRDD(); - System.out.println("Precision by threshold: " + precision.toArray()); - - // Recall by threshold - JavaRDD> recall = metrics.recallByThreshold().toJavaRDD(); - System.out.println("Recall by threshold: " + recall.toArray()); - - // F Score by threshold - JavaRDD> f1Score = metrics.fMeasureByThreshold().toJavaRDD(); - System.out.println("F1 Score by threshold: " + f1Score.toArray()); - - JavaRDD> f2Score = metrics.fMeasureByThreshold(2.0).toJavaRDD(); - System.out.println("F2 Score by threshold: " + f2Score.toArray()); - - // Precision-recall curve - JavaRDD> prc = metrics.pr().toJavaRDD(); - System.out.println("Precision-recall curve: " + prc.toArray()); - - // Thresholds - JavaRDD thresholds = precision.map( - new Function, Double>() { - public Double call (Tuple2 t) { - return new Double(t._1().toString()); - } - } - ); - - // ROC Curve - JavaRDD> roc = metrics.roc().toJavaRDD(); - System.out.println("ROC curve: " + roc.toArray()); - - // AUPRC - System.out.println("Area under precision-recall curve = " + metrics.areaUnderPR()); - - // AUROC - System.out.println("Area under ROC = " + metrics.areaUnderROC()); - - // Save and load model - model.save(sc, "myModelPath"); - LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); - } -} - -{% endhighlight %} +{% include_example java/org/apache/spark/examples/mllib/JavaBinaryClassification.java %}
Refer to the [`BinaryClassificationMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.BinaryClassificationMetrics) and [`LogisticRegressionWithLBFGS` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.classification.LogisticRegressionWithLBFGS) for more details on the API. -{% highlight python %} -from pyspark.mllib.classification import LogisticRegressionWithLBFGS -from pyspark.mllib.evaluation import BinaryClassificationMetrics -from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.util import MLUtils - -# Several of the methods available in scala are currently missing from pyspark - -# Load training data in LIBSVM format -data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt") - -# Split data into training (60%) and test (40%) -training, test = data.randomSplit([0.6, 0.4], seed = 11L) -training.cache() - -# Run training algorithm to build the model -model = LogisticRegressionWithLBFGS.train(training) - -# Compute raw scores on the test set -predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label)) - -# Instantiate metrics object -metrics = BinaryClassificationMetrics(predictionAndLabels) - -# Area under precision-recall curve -print("Area under PR = %s" % metrics.areaUnderPR) - -# Area under ROC curve -print("Area under ROC = %s" % metrics.areaUnderROC) - -{% endhighlight %} - +{% include_example python/mllib/binary_classification_metrics.py %}
@@ -433,204 +240,21 @@ the data, and evaluate the performance of the algorithm by several multiclass cl
Refer to the [`MulticlassMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.MulticlassMetrics) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS -import org.apache.spark.mllib.evaluation.MulticlassMetrics -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.util.MLUtils - -// Load training data in LIBSVM format -val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt") - -// Split data into training (60%) and test (40%) -val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L) -training.cache() - -// Run training algorithm to build the model -val model = new LogisticRegressionWithLBFGS() - .setNumClasses(3) - .run(training) - -// Compute raw scores on the test set -val predictionAndLabels = test.map { case LabeledPoint(label, features) => - val prediction = model.predict(features) - (prediction, label) -} - -// Instantiate metrics object -val metrics = new MulticlassMetrics(predictionAndLabels) - -// Confusion matrix -println("Confusion matrix:") -println(metrics.confusionMatrix) - -// Overall Statistics -val precision = metrics.precision -val recall = metrics.recall // same as true positive rate -val f1Score = metrics.fMeasure -println("Summary Statistics") -println(s"Precision = $precision") -println(s"Recall = $recall") -println(s"F1 Score = $f1Score") - -// Precision by label -val labels = metrics.labels -labels.foreach { l => - println(s"Precision($l) = " + metrics.precision(l)) -} - -// Recall by label -labels.foreach { l => - println(s"Recall($l) = " + metrics.recall(l)) -} - -// False positive rate by label -labels.foreach { l => - println(s"FPR($l) = " + metrics.falsePositiveRate(l)) -} - -// F-measure by label -labels.foreach { l => - println(s"F1-Score($l) = " + metrics.fMeasure(l)) -} - -// Weighted stats -println(s"Weighted precision: ${metrics.weightedPrecision}") 
-println(s"Weighted recall: ${metrics.weightedRecall}") -println(s"Weighted F1 score: ${metrics.weightedFMeasure}") -println(s"Weighted false positive rate: ${metrics.weightedFalsePositiveRate}") - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala %}
Refer to the [`MulticlassMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/MulticlassMetrics.html) for details on the API. -{% highlight java %} -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.rdd.RDD; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.mllib.classification.LogisticRegressionModel; -import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS; -import org.apache.spark.mllib.evaluation.MulticlassMetrics; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.mllib.linalg.Matrix; -import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; - -public class MulticlassClassification { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Multiclass Classification Metrics"); - SparkContext sc = new SparkContext(conf); - String path = "data/mllib/sample_multiclass_classification_data.txt"; - JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); - - // Split initial RDD into two... [60% training data, 40% testing data]. - JavaRDD[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L); - JavaRDD training = splits[0].cache(); - JavaRDD test = splits[1]; - - // Run training algorithm to build the model. - final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() - .setNumClasses(3) - .run(training.rdd()); - - // Compute raw scores on the test set. - JavaRDD> predictionAndLabels = test.map( - new Function>() { - public Tuple2 call(LabeledPoint p) { - Double prediction = model.predict(p.features()); - return new Tuple2(prediction, p.label()); - } - } - ); - - // Get evaluation metrics. 
- MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd()); - - // Confusion matrix - Matrix confusion = metrics.confusionMatrix(); - System.out.println("Confusion matrix: \n" + confusion); - - // Overall statistics - System.out.println("Precision = " + metrics.precision()); - System.out.println("Recall = " + metrics.recall()); - System.out.println("F1 Score = " + metrics.fMeasure()); - - // Stats by labels - for (int i = 0; i < metrics.labels().length; i++) { - System.out.format("Class %f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])); - System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])); - System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(metrics.labels()[i])); - } - - //Weighted stats - System.out.format("Weighted precision = %f\n", metrics.weightedPrecision()); - System.out.format("Weighted recall = %f\n", metrics.weightedRecall()); - System.out.format("Weighted F1 score = %f\n", metrics.weightedFMeasure()); - System.out.format("Weighted false positive rate = %f\n", metrics.weightedFalsePositiveRate()); - - // Save and load model - model.save(sc, "myModelPath"); - LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); - } -} - -{% endhighlight %} +{% include_example java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java %}
Refer to the [`MulticlassMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.MulticlassMetrics) for more details on the API. -{% highlight python %} -from pyspark.mllib.classification import LogisticRegressionWithLBFGS -from pyspark.mllib.util import MLUtils -from pyspark.mllib.evaluation import MulticlassMetrics - -# Load training data in LIBSVM format -data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt") - -# Split data into training (60%) and test (40%) -training, test = data.randomSplit([0.6, 0.4], seed = 11L) -training.cache() - -# Run training algorithm to build the model -model = LogisticRegressionWithLBFGS.train(training, numClasses=3) - -# Compute raw scores on the test set -predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label)) - -# Instantiate metrics object -metrics = MulticlassMetrics(predictionAndLabels) - -# Overall statistics -precision = metrics.precision() -recall = metrics.recall() -f1Score = metrics.fMeasure() -print("Summary Stats") -print("Precision = %s" % precision) -print("Recall = %s" % recall) -print("F1 Score = %s" % f1Score) - -# Statistics by class -labels = data.map(lambda lp: lp.label).distinct().collect() -for label in sorted(labels): - print("Class %s precision = %s" % (label, metrics.precision(label))) - print("Class %s recall = %s" % (label, metrics.recall(label))) - print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0))) - -# Weighted stats -print("Weighted recall = %s" % metrics.weightedRecall) -print("Weighted precision = %s" % metrics.weightedPrecision) -print("Weighted F(1) Score = %s" % metrics.weightedFMeasure()) -print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5)) -print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate) -{% endhighlight %} +{% include_example python/mllib/multi_class_metrics.py %}
@@ -766,154 +390,21 @@ True classes:
Refer to the [`MultilabelMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.MultilabelMetrics) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.evaluation.MultilabelMetrics -import org.apache.spark.rdd.RDD; - -val scoreAndLabels: RDD[(Array[Double], Array[Double])] = sc.parallelize( - Seq((Array(0.0, 1.0), Array(0.0, 2.0)), - (Array(0.0, 2.0), Array(0.0, 1.0)), - (Array(), Array(0.0)), - (Array(2.0), Array(2.0)), - (Array(2.0, 0.0), Array(2.0, 0.0)), - (Array(0.0, 1.0, 2.0), Array(0.0, 1.0)), - (Array(1.0), Array(1.0, 2.0))), 2) - -// Instantiate metrics object -val metrics = new MultilabelMetrics(scoreAndLabels) - -// Summary stats -println(s"Recall = ${metrics.recall}") -println(s"Precision = ${metrics.precision}") -println(s"F1 measure = ${metrics.f1Measure}") -println(s"Accuracy = ${metrics.accuracy}") - -// Individual label stats -metrics.labels.foreach(label => println(s"Class $label precision = ${metrics.precision(label)}")) -metrics.labels.foreach(label => println(s"Class $label recall = ${metrics.recall(label)}")) -metrics.labels.foreach(label => println(s"Class $label F1-score = ${metrics.f1Measure(label)}")) - -// Micro stats -println(s"Micro recall = ${metrics.microRecall}") -println(s"Micro precision = ${metrics.microPrecision}") -println(s"Micro F1 measure = ${metrics.microF1Measure}") - -// Hamming loss -println(s"Hamming loss = ${metrics.hammingLoss}") - -// Subset accuracy -println(s"Subset accuracy = ${metrics.subsetAccuracy}") - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala %}
Refer to the [`MultilabelMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/MultilabelMetrics.html) for details on the API. -{% highlight java %} -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.rdd.RDD; -import org.apache.spark.mllib.evaluation.MultilabelMetrics; -import org.apache.spark.SparkConf; -import java.util.Arrays; -import java.util.List; - -public class MultilabelClassification { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Multilabel Classification Metrics"); - JavaSparkContext sc = new JavaSparkContext(conf); - - List> data = Arrays.asList( - new Tuple2(new double[]{0.0, 1.0}, new double[]{0.0, 2.0}), - new Tuple2(new double[]{0.0, 2.0}, new double[]{0.0, 1.0}), - new Tuple2(new double[]{}, new double[]{0.0}), - new Tuple2(new double[]{2.0}, new double[]{2.0}), - new Tuple2(new double[]{2.0, 0.0}, new double[]{2.0, 0.0}), - new Tuple2(new double[]{0.0, 1.0, 2.0}, new double[]{0.0, 1.0}), - new Tuple2(new double[]{1.0}, new double[]{1.0, 2.0}) - ); - JavaRDD> scoreAndLabels = sc.parallelize(data); - - // Instantiate metrics object - MultilabelMetrics metrics = new MultilabelMetrics(scoreAndLabels.rdd()); - - // Summary stats - System.out.format("Recall = %f\n", metrics.recall()); - System.out.format("Precision = %f\n", metrics.precision()); - System.out.format("F1 measure = %f\n", metrics.f1Measure()); - System.out.format("Accuracy = %f\n", metrics.accuracy()); - - // Stats by labels - for (int i = 0; i < metrics.labels().length - 1; i++) { - System.out.format("Class %1.1f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])); - System.out.format("Class %1.1f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])); - System.out.format("Class %1.1f F1 score = %f\n", metrics.labels()[i], metrics.f1Measure(metrics.labels()[i])); - } - - // Micro stats - System.out.format("Micro recall = %f\n", metrics.microRecall()); - 
System.out.format("Micro precision = %f\n", metrics.microPrecision()); - System.out.format("Micro F1 measure = %f\n", metrics.microF1Measure()); - - // Hamming loss - System.out.format("Hamming loss = %f\n", metrics.hammingLoss()); - - // Subset accuracy - System.out.format("Subset accuracy = %f\n", metrics.subsetAccuracy()); - - } -} - -{% endhighlight %} +{% include_example java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java %}
Refer to the [`MultilabelMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.MultilabelMetrics) for more details on the API. -{% highlight python %} -from pyspark.mllib.evaluation import MultilabelMetrics - -scoreAndLabels = sc.parallelize([ - ([0.0, 1.0], [0.0, 2.0]), - ([0.0, 2.0], [0.0, 1.0]), - ([], [0.0]), - ([2.0], [2.0]), - ([2.0, 0.0], [2.0, 0.0]), - ([0.0, 1.0, 2.0], [0.0, 1.0]), - ([1.0], [1.0, 2.0])]) - -# Instantiate metrics object -metrics = MultilabelMetrics(scoreAndLabels) - -# Summary stats -print("Recall = %s" % metrics.recall()) -print("Precision = %s" % metrics.precision()) -print("F1 measure = %s" % metrics.f1Measure()) -print("Accuracy = %s" % metrics.accuracy) - -# Individual label stats -labels = scoreAndLabels.flatMap(lambda x: x[1]).distinct().collect() -for label in labels: - print("Class %s precision = %s" % (label, metrics.precision(label))) - print("Class %s recall = %s" % (label, metrics.recall(label))) - print("Class %s F1 Measure = %s" % (label, metrics.f1Measure(label))) - -# Micro stats -print("Micro precision = %s" % metrics.microPrecision) -print("Micro recall = %s" % metrics.microRecall) -print("Micro F1 measure = %s" % metrics.microF1Measure) - -# Hamming loss -print("Hamming loss = %s" % metrics.hammingLoss) - -# Subset accuracy -print("Subset accuracy = %s" % metrics.subsetAccuracy) - -{% endhighlight %} +{% include_example python/mllib/multi_label_metrics.py %}
@@ -1027,280 +518,21 @@ expanded world of non-positive weights are "the same as never having interacted
Refer to the [`RegressionMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.RegressionMetrics) and [`RankingMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.RankingMetrics) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.evaluation.{RegressionMetrics, RankingMetrics} -import org.apache.spark.mllib.recommendation.{ALS, Rating} - -// Read in the ratings data -val ratings = sc.textFile("data/mllib/sample_movielens_data.txt").map { line => - val fields = line.split("::") - Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble - 2.5) -}.cache() - -// Map ratings to 1 or 0, 1 indicating a movie that should be recommended -val binarizedRatings = ratings.map(r => Rating(r.user, r.product, if (r.rating > 0) 1.0 else 0.0)).cache() - -// Summarize ratings -val numRatings = ratings.count() -val numUsers = ratings.map(_.user).distinct().count() -val numMovies = ratings.map(_.product).distinct().count() -println(s"Got $numRatings ratings from $numUsers users on $numMovies movies.") - -// Build the model -val numIterations = 10 -val rank = 10 -val lambda = 0.01 -val model = ALS.train(ratings, rank, numIterations, lambda) - -// Define a function to scale ratings from 0 to 1 -def scaledRating(r: Rating): Rating = { - val scaledRating = math.max(math.min(r.rating, 1.0), 0.0) - Rating(r.user, r.product, scaledRating) -} - -// Get sorted top ten predictions for each user and then scale from [0, 1] -val userRecommended = model.recommendProductsForUsers(10).map{ case (user, recs) => - (user, recs.map(scaledRating)) -} - -// Assume that any movie a user rated 3 or higher (which maps to a 1) is a relevant document -// Compare with top ten most relevant documents -val userMovies = binarizedRatings.groupBy(_.user) -val relevantDocuments = userMovies.join(userRecommended).map{ case (user, (actual, predictions)) => - (predictions.map(_.product), actual.filter(_.rating > 0.0).map(_.product).toArray) -} - 
-// Instantiate metrics object -val metrics = new RankingMetrics(relevantDocuments) - -// Precision at K -Array(1, 3, 5).foreach{ k => - println(s"Precision at $k = ${metrics.precisionAt(k)}") -} - -// Mean average precision -println(s"Mean average precision = ${metrics.meanAveragePrecision}") - -// Normalized discounted cumulative gain -Array(1, 3, 5).foreach{ k => - println(s"NDCG at $k = ${metrics.ndcgAt(k)}") -} - -// Get predictions for each data point -val allPredictions = model.predict(ratings.map(r => (r.user, r.product))).map(r => ((r.user, r.product), r.rating)) -val allRatings = ratings.map(r => ((r.user, r.product), r.rating)) -val predictionsAndLabels = allPredictions.join(allRatings).map{ case ((user, product), (predicted, actual)) => - (predicted, actual) -} - -// Get the RMSE using regression metrics -val regressionMetrics = new RegressionMetrics(predictionsAndLabels) -println(s"RMSE = ${regressionMetrics.rootMeanSquaredError}") - -// R-squared -println(s"R-squared = ${regressionMetrics.r2}") - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/RankingMetrics.scala %}
Refer to the [`RegressionMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/RegressionMetrics.html) and [`RankingMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/RankingMetrics.html) for details on the API. -{% highlight java %} -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.rdd.RDD; -import org.apache.spark.mllib.recommendation.MatrixFactorizationModel; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.Function; -import java.util.*; -import org.apache.spark.mllib.evaluation.RegressionMetrics; -import org.apache.spark.mllib.evaluation.RankingMetrics; -import org.apache.spark.mllib.recommendation.ALS; -import org.apache.spark.mllib.recommendation.Rating; - -// Read in the ratings data -public class Ranking { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Ranking Metrics"); - JavaSparkContext sc = new JavaSparkContext(conf); - String path = "data/mllib/sample_movielens_data.txt"; - JavaRDD data = sc.textFile(path); - JavaRDD ratings = data.map( - new Function() { - public Rating call(String line) { - String[] parts = line.split("::"); - return new Rating(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]), Double.parseDouble(parts[2]) - 2.5); - } - } - ); - ratings.cache(); - - // Train an ALS model - final MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), 10, 10, 0.01); - - // Get top 10 recommendations for every user and scale ratings from 0 to 1 - JavaRDD> userRecs = model.recommendProductsForUsers(10).toJavaRDD(); - JavaRDD> userRecsScaled = userRecs.map( - new Function, Tuple2>() { - public Tuple2 call(Tuple2 t) { - Rating[] scaledRatings = new Rating[t._2().length]; - for (int i = 0; i < scaledRatings.length; i++) { - double newRating = Math.max(Math.min(t._2()[i].rating(), 1.0), 0.0); - scaledRatings[i] = new Rating(t._2()[i].user(), t._2()[i].product(), newRating); - } - return new Tuple2(t._1(), 
scaledRatings); - } - } - ); - JavaPairRDD userRecommended = JavaPairRDD.fromJavaRDD(userRecsScaled); - - // Map ratings to 1 or 0, 1 indicating a movie that should be recommended - JavaRDD binarizedRatings = ratings.map( - new Function() { - public Rating call(Rating r) { - double binaryRating; - if (r.rating() > 0.0) { - binaryRating = 1.0; - } - else { - binaryRating = 0.0; - } - return new Rating(r.user(), r.product(), binaryRating); - } - } - ); - - // Group ratings by common user - JavaPairRDD> userMovies = binarizedRatings.groupBy( - new Function() { - public Object call(Rating r) { - return r.user(); - } - } - ); - - // Get true relevant documents from all user ratings - JavaPairRDD> userMoviesList = userMovies.mapValues( - new Function, List>() { - public List call(Iterable docs) { - List products = new ArrayList(); - for (Rating r : docs) { - if (r.rating() > 0.0) { - products.add(r.product()); - } - } - return products; - } - } - ); - - // Extract the product id from each recommendation - JavaPairRDD> userRecommendedList = userRecommended.mapValues( - new Function>() { - public List call(Rating[] docs) { - List products = new ArrayList(); - for (Rating r : docs) { - products.add(r.product()); - } - return products; - } - } - ); - JavaRDD, List>> relevantDocs = userMoviesList.join(userRecommendedList).values(); - - // Instantiate the metrics object - RankingMetrics metrics = RankingMetrics.of(relevantDocs); - - // Precision and NDCG at k - Integer[] kVector = {1, 3, 5}; - for (Integer k : kVector) { - System.out.format("Precision at %d = %f\n", k, metrics.precisionAt(k)); - System.out.format("NDCG at %d = %f\n", k, metrics.ndcgAt(k)); - } - - // Mean average precision - System.out.format("Mean average precision = %f\n", metrics.meanAveragePrecision()); - - // Evaluate the model using numerical ratings and regression metrics - JavaRDD> userProducts = ratings.map( - new Function>() { - public Tuple2 call(Rating r) { - return new Tuple2(r.user(), 
r.product()); - } - } - ); - JavaPairRDD, Object> predictions = JavaPairRDD.fromJavaRDD( - model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map( - new Function, Object>>() { - public Tuple2, Object> call(Rating r){ - return new Tuple2, Object>( - new Tuple2(r.user(), r.product()), r.rating()); - } - } - )); - JavaRDD> ratesAndPreds = - JavaPairRDD.fromJavaRDD(ratings.map( - new Function, Object>>() { - public Tuple2, Object> call(Rating r){ - return new Tuple2, Object>( - new Tuple2(r.user(), r.product()), r.rating()); - } - } - )).join(predictions).values(); - - // Create regression metrics object - RegressionMetrics regressionMetrics = new RegressionMetrics(ratesAndPreds.rdd()); - - // Root mean squared error - System.out.format("RMSE = %f\n", regressionMetrics.rootMeanSquaredError()); - - // R-squared - System.out.format("R-squared = %f\n", regressionMetrics.r2()); - } -} - -{% endhighlight %} +{% include_example java/org/apache/spark/examples/mllib/JavaRanking.java %}
Refer to the [`RegressionMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RegressionMetrics) and [`RankingMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics) for more details on the API. -{% highlight python %} -from pyspark.mllib.recommendation import ALS, Rating -from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics - -# Read in the ratings data -lines = sc.textFile("data/mllib/sample_movielens_data.txt") - -def parseLine(line): - fields = line.split("::") - return Rating(int(fields[0]), int(fields[1]), float(fields[2]) - 2.5) -ratings = lines.map(lambda r: parseLine(r)) - -# Train a model on to predict user-product ratings -model = ALS.train(ratings, 10, 10, 0.01) - -# Get predicted ratings on all existing user-product pairs -testData = ratings.map(lambda p: (p.user, p.product)) -predictions = model.predictAll(testData).map(lambda r: ((r.user, r.product), r.rating)) - -ratingsTuple = ratings.map(lambda r: ((r.user, r.product), r.rating)) -scoreAndLabels = predictions.join(ratingsTuple).map(lambda tup: tup[1]) - -# Instantiate regression metrics to compare predicted and actual ratings -metrics = RegressionMetrics(scoreAndLabels) - -# Root mean sqaured error -print("RMSE = %s" % metrics.rootMeanSquaredError) - -# R-squared -print("R-squared = %s" % metrics.r2) - -{% endhighlight %} +{% include_example python/mllib/ranking_metrics.py %}
@@ -1350,163 +582,21 @@ and evaluate the performance of the algorithm by several regression metrics.
Refer to the [`RegressionMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.RegressionMetrics) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.regression.LinearRegressionModel -import org.apache.spark.mllib.regression.LinearRegressionWithSGD -import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.evaluation.RegressionMetrics -import org.apache.spark.mllib.util.MLUtils - -// Load the data -val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_linear_regression_data.txt").cache() - -// Build the model -val numIterations = 100 -val model = LinearRegressionWithSGD.train(data, numIterations) - -// Get predictions -val valuesAndPreds = data.map{ point => - val prediction = model.predict(point.features) - (prediction, point.label) -} - -// Instantiate metrics object -val metrics = new RegressionMetrics(valuesAndPreds) - -// Squared error -println(s"MSE = ${metrics.meanSquaredError}") -println(s"RMSE = ${metrics.rootMeanSquaredError}") - -// R-squared -println(s"R-squared = ${metrics.r2}") - -// Mean absolute error -println(s"MAE = ${metrics.meanAbsoluteError}") - -// Explained variance -println(s"Explained variance = ${metrics.explainedVariance}") - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/RegressionMetrics.scala %}
Refer to the [`RegressionMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/RegressionMetrics.html) for details on the API. -{% highlight java %} -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.regression.LinearRegressionModel; -import org.apache.spark.mllib.regression.LinearRegressionWithSGD; -import org.apache.spark.mllib.evaluation.RegressionMetrics; -import org.apache.spark.SparkConf; - -public class LinearRegression { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Linear Regression Example"); - JavaSparkContext sc = new JavaSparkContext(conf); - - // Load and parse the data - String path = "data/mllib/sample_linear_regression_data.txt"; - JavaRDD data = sc.textFile(path); - JavaRDD parsedData = data.map( - new Function() { - public LabeledPoint call(String line) { - String[] parts = line.split(" "); - double[] v = new double[parts.length - 1]; - for (int i = 1; i < parts.length - 1; i++) - v[i - 1] = Double.parseDouble(parts[i].split(":")[1]); - return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v)); - } - } - ); - parsedData.cache(); - - // Building the model - int numIterations = 100; - final LinearRegressionModel model = - LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations); - - // Evaluate model on training examples and compute training error - JavaRDD> valuesAndPreds = parsedData.map( - new Function>() { - public Tuple2 call(LabeledPoint point) { - double prediction = model.predict(point.features()); - return new Tuple2(prediction, point.label()); - } - } - ); - - // Instantiate metrics object - RegressionMetrics metrics = new RegressionMetrics(valuesAndPreds.rdd()); - - // Squared error - System.out.format("MSE = %f\n", metrics.meanSquaredError()); - 
System.out.format("RMSE = %f\n", metrics.rootMeanSquaredError()); - - // R-squared - System.out.format("R Squared = %f\n", metrics.r2()); - - // Mean absolute error - System.out.format("MAE = %f\n", metrics.meanAbsoluteError()); - - // Explained variance - System.out.format("Explained Variance = %f\n", metrics.explainedVariance()); - - // Save and load model - model.save(sc.sc(), "myModelPath"); - LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), "myModelPath"); - } -} - -{% endhighlight %} +{% include_example java/org/apache/spark/examples/mllib/JavaLinearRegression.java %}
Refer to the [`RegressionMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RegressionMetrics) for more details on the API. -{% highlight python %} -from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD -from pyspark.mllib.evaluation import RegressionMetrics -from pyspark.mllib.linalg import DenseVector - -# Load and parse the data -def parsePoint(line): - values = line.split() - return LabeledPoint(float(values[0]), DenseVector([float(x.split(':')[1]) for x in values[1:]])) - -data = sc.textFile("data/mllib/sample_linear_regression_data.txt") -parsedData = data.map(parsePoint) - -# Build the model -model = LinearRegressionWithSGD.train(parsedData) - -# Get predictions -valuesAndPreds = parsedData.map(lambda p: (float(model.predict(p.features)), p.label)) - -# Instantiate metrics object -metrics = RegressionMetrics(valuesAndPreds) - -# Squared Error -print("MSE = %s" % metrics.meanSquaredError) -print("RMSE = %s" % metrics.rootMeanSquaredError) - -# R-squared -print("R-squared = %s" % metrics.r2) - -# Mean absolute error -print("MAE = %s" % metrics.meanAbsoluteError) - -# Explained variance -print("Explained variance = %s" % metrics.explainedVariance) - -{% endhighlight %} +{% include_example python/mllib/regression_metrics.py %}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java new file mode 100644 index 0000000000000..45da1fec120ab --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package org.apache.spark.examples.mllib; + +// $example on$ + +import scala.Tuple2; + +import org.apache.spark.api.java.*; +import org.apache.spark.rdd.RDD; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.mllib.classification.LogisticRegressionModel; +import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS; +import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.SparkContext; +// $example off$ + +public class JavaBinaryClassification { + public static void main(String[] args) { + + SparkConf conf = new SparkConf().setAppName("Binary Classification Metrics"); + SparkContext sc = new SparkContext(conf); + String path = "data/mllib/sample_binary_classification_data.txt"; + JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); + + // Split initial RDD into two... [60% training data, 40% testing data]. + JavaRDD[] splits = data.randomSplit(new double[]{0.6, 0.4}, 11L); + JavaRDD training = splits[0].cache(); + JavaRDD test = splits[1]; + + // Run training algorithm to build the model. + final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() + .setNumClasses(2) + .run(training.rdd()); + + // Clear the prediction threshold so the model will return probabilities + model.clearThreshold(); + + // Compute raw scores on the test set. + JavaRDD> predictionAndLabels = test.map( + new Function>() { + public Tuple2 call(LabeledPoint p) { + Double prediction = model.predict(p.features()); + return new Tuple2(prediction, p.label()); + } + } + ); + + // Get evaluation metrics. 
+ BinaryClassificationMetrics metrics = new BinaryClassificationMetrics(predictionAndLabels.rdd()); + + // Precision by threshold + JavaRDD> precision = metrics.precisionByThreshold().toJavaRDD(); + System.out.println("Precision by threshold: " + precision.toArray()); + + // Recall by threshold + JavaRDD> recall = metrics.recallByThreshold().toJavaRDD(); + System.out.println("Recall by threshold: " + recall.toArray()); + + // F Score by threshold + JavaRDD> f1Score = metrics.fMeasureByThreshold().toJavaRDD(); + System.out.println("F1 Score by threshold: " + f1Score.toArray()); + + JavaRDD> f2Score = metrics.fMeasureByThreshold(2.0).toJavaRDD(); + System.out.println("F2 Score by threshold: " + f2Score.toArray()); + + // Precision-recall curve + JavaRDD> prc = metrics.pr().toJavaRDD(); + System.out.println("Precision-recall curve: " + prc.toArray()); + + // Thresholds + JavaRDD thresholds = precision.map( + new Function, Double>() { + public Double call(Tuple2 t) { + return new Double(t._1().toString()); + } + } + ); + + // ROC Curve + JavaRDD> roc = metrics.roc().toJavaRDD(); + System.out.println("ROC curve: " + roc.toArray()); + + // AUPRC + System.out.println("Area under precision-recall curve = " + metrics.areaUnderPR()); + + // AUROC + System.out.println("Area under ROC = " + metrics.areaUnderROC()); + + // Save and load model + model.save(sc, "myModelPath"); + LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); + } +} diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java new file mode 100644 index 0000000000000..309efced045b6 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib; + +// $example on$ + +import org.apache.spark.api.java.*; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.regression.LinearRegressionModel; +import org.apache.spark.mllib.regression.LinearRegressionWithSGD; +import org.apache.spark.mllib.evaluation.RegressionMetrics; +import org.apache.spark.SparkConf; +// $example off$ + +// Read in the ratings data +public class JavaLinearRegression { + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("Linear Regression Example"); + JavaSparkContext sc = new JavaSparkContext(conf); + + // Load and parse the data + String path = "data/mllib/sample_linear_regression_data.txt"; + JavaRDD data = sc.textFile(path); + JavaRDD parsedData = data.map( + new Function() { + public LabeledPoint call(String line) { + String[] parts = line.split(" "); + double[] v = new double[parts.length - 1]; + for (int i = 1; i < parts.length - 1; i++) + v[i - 1] = Double.parseDouble(parts[i].split(":")[1]); + return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v)); + } + } + ); + parsedData.cache(); + + // 
Building the model + int numIterations = 100; + final LinearRegressionModel model = + LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations); + + // Evaluate model on training examples and compute training error + JavaRDD> valuesAndPreds = parsedData.map( + new Function>() { + public Tuple2 call(LabeledPoint point) { + double prediction = model.predict(point.features()); + return new Tuple2(prediction, point.label()); + } + } + ); + + // Instantiate metrics object + RegressionMetrics metrics = new RegressionMetrics(valuesAndPreds.rdd()); + + // Squared error + System.out.format("MSE = %f\n", metrics.meanSquaredError()); + System.out.format("RMSE = %f\n", metrics.rootMeanSquaredError()); + + // R-squared + System.out.format("R Squared = %f\n", metrics.r2()); + + // Mean absolute error + System.out.format("MAE = %f\n", metrics.meanAbsoluteError()); + + // Explained variance + System.out.format("Explained Variance = %f\n", metrics.explainedVariance()); + + // Save and load model + model.save(sc.sc(), "myModelPath"); + LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), "myModelPath"); + } +} diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java new file mode 100644 index 0000000000000..5b3a03f0830b1 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib; + +// $example on$ +import scala.Tuple2; + +import org.apache.spark.api.java.*; +import org.apache.spark.rdd.RDD; +import org.apache.spark.mllib.evaluation.MultilabelMetrics; +import org.apache.spark.SparkConf; +import org.apache.spark.SparkContext; +import java.util.Arrays; +import java.util.List; +// $example off$ + +public class JavaMultiLabelClassification { + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("Multilabel Classification Metrics"); + JavaSparkContext sc = new JavaSparkContext(conf); + + List> data = Arrays.asList( + new Tuple2(new double[]{0.0, 1.0}, new double[]{0.0, 2.0}), + new Tuple2(new double[]{0.0, 2.0}, new double[]{0.0, 1.0}), + new Tuple2(new double[]{}, new double[]{0.0}), + new Tuple2(new double[]{2.0}, new double[]{2.0}), + new Tuple2(new double[]{2.0, 0.0}, new double[]{2.0, 0.0}), + new Tuple2(new double[]{0.0, 1.0, 2.0}, new double[]{0.0, 1.0}), + new Tuple2(new double[]{1.0}, new double[]{1.0, 2.0}) + ); + JavaRDD> scoreAndLabels = sc.parallelize(data); + + // Instantiate metrics object + MultilabelMetrics metrics = new MultilabelMetrics(scoreAndLabels.rdd()); + + // Summary stats + System.out.format("Recall = %f\n", metrics.recall()); + System.out.format("Precision = %f\n", metrics.precision()); + System.out.format("F1 measure = %f\n", metrics.f1Measure()); + System.out.format("Accuracy = %f\n", metrics.accuracy()); + + // Stats by labels + for (int i = 0; i < metrics.labels().length - 1; i++) { + 
System.out.format("Class %1.1f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])); + System.out.format("Class %1.1f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])); + System.out.format("Class %1.1f F1 score = %f\n", metrics.labels()[i], metrics.f1Measure(metrics.labels()[i])); + } + + // Micro stats + System.out.format("Micro recall = %f\n", metrics.microRecall()); + System.out.format("Micro precision = %f\n", metrics.microPrecision()); + System.out.format("Micro F1 measure = %f\n", metrics.microF1Measure()); + + // Hamming loss + System.out.format("Hamming loss = %f\n", metrics.hammingLoss()); + + // Subset accuracy + System.out.format("Subset accuracy = %f\n", metrics.subsetAccuracy()); + + } +} diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java new file mode 100644 index 0000000000000..a5e92df358d14 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package org.apache.spark.examples.mllib; + +import scala.Tuple2; + +import org.apache.spark.api.java.*; +import org.apache.spark.rdd.RDD; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.mllib.classification.LogisticRegressionModel; +import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS; +import org.apache.spark.mllib.evaluation.MulticlassMetrics; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.mllib.linalg.Matrix; +import org.apache.spark.SparkConf; +import org.apache.spark.SparkContext; + + +public class JavaMulticlassClassification { + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("Multi class Classification Metrics"); + SparkContext sc = new SparkContext(conf); + String path = "data/mllib/sample_multiclass_classification_data.txt"; + JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); + + // Split initial RDD into two... [60% training data, 40% testing data]. + JavaRDD[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L); + JavaRDD training = splits[0].cache(); + JavaRDD test = splits[1]; + + // Run training algorithm to build the model. + final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() + .setNumClasses(3) + .run(training.rdd()); + + // Compute raw scores on the test set. + JavaRDD> predictionAndLabels = test.map( + new Function>() { + public Tuple2 call(LabeledPoint p) { + Double prediction = model.predict(p.features()); + return new Tuple2(prediction, p.label()); + } + } + ); + + // Get evaluation metrics. 
+ MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd()); + + // Confusion matrix + Matrix confusion = metrics.confusionMatrix(); + System.out.println("Confusion matrix: \n" + confusion); + + // Overall statistics + System.out.println("Precision = " + metrics.precision()); + System.out.println("Recall = " + metrics.recall()); + System.out.println("F1 Score = " + metrics.fMeasure()); + + // Stats by labels + for (int i = 0; i < metrics.labels().length; i++) { + System.out.format("Class %f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])); + System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])); + System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(metrics.labels()[i])); + } + + //Weighted stats + System.out.format("Weighted precision = %f\n", metrics.weightedPrecision()); + System.out.format("Weighted recall = %f\n", metrics.weightedRecall()); + System.out.format("Weighted F1 score = %f\n", metrics.weightedFMeasure()); + System.out.format("Weighted false positive rate = %f\n", metrics.weightedFalsePositiveRate()); + + // Save and load model + model.save(sc, "myModelPath"); + LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); + } +} \ No newline at end of file diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java new file mode 100644 index 0000000000000..2162cc658f193 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java @@ -0,0 +1,175 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib; + +// $example on$ +import scala.Tuple2; + +import org.apache.spark.api.java.*; +import org.apache.spark.rdd.RDD; +import org.apache.spark.mllib.recommendation.MatrixFactorizationModel; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.Function; +import java.util.*; +import org.apache.spark.mllib.evaluation.RegressionMetrics; +import org.apache.spark.mllib.evaluation.RankingMetrics; +import org.apache.spark.mllib.recommendation.ALS; +import org.apache.spark.mllib.recommendation.Rating; +// $example off$ + +// Read in the ratings data +public class JavaRanking { + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("Ranking Metrics"); + JavaSparkContext sc = new JavaSparkContext(conf); + String path = "data/mllib/sample_movielens_data.txt"; + JavaRDD data = sc.textFile(path); + JavaRDD ratings = data.map( + new Function() { + public Rating call(String line) { + String[] parts = line.split("::"); + return new Rating(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]), Double.parseDouble(parts[2]) - 2.5); + } + } + ); + ratings.cache(); + + // Train an ALS model + final MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), 10, 10, 0.01); + + // Get top 10 recommendations for every user and scale ratings from 0 to 1 + JavaRDD> userRecs = 
model.recommendProductsForUsers(10).toJavaRDD(); + JavaRDD> userRecsScaled = userRecs.map( + new Function, Tuple2>() { + public Tuple2 call(Tuple2 t) { + Rating[] scaledRatings = new Rating[t._2().length]; + for (int i = 0; i < scaledRatings.length; i++) { + double newRating = Math.max(Math.min(t._2()[i].rating(), 1.0), 0.0); + scaledRatings[i] = new Rating(t._2()[i].user(), t._2()[i].product(), newRating); + } + return new Tuple2(t._1(), scaledRatings); + } + } + ); + JavaPairRDD userRecommended = JavaPairRDD.fromJavaRDD(userRecsScaled); + + // Map ratings to 1 or 0, 1 indicating a movie that should be recommended + JavaRDD binarizedRatings = ratings.map( + new Function() { + public Rating call(Rating r) { + double binaryRating; + if (r.rating() > 0.0) { + binaryRating = 1.0; + } + else { + binaryRating = 0.0; + } + return new Rating(r.user(), r.product(), binaryRating); + } + } + ); + + // Group ratings by common user + JavaPairRDD> userMovies = binarizedRatings.groupBy( + new Function() { + public Object call(Rating r) { + return r.user(); + } + } + ); + + // Get true relevant documents from all user ratings + JavaPairRDD> userMoviesList = userMovies.mapValues( + new Function, List>() { + public List call(Iterable docs) { + List products = new ArrayList(); + for (Rating r : docs) { + if (r.rating() > 0.0) { + products.add(r.product()); + } + } + return products; + } + } + ); + + // Extract the product id from each recommendation + JavaPairRDD> userRecommendedList = userRecommended.mapValues( + new Function>() { + public List call(Rating[] docs) { + List products = new ArrayList(); + for (Rating r : docs) { + products.add(r.product()); + } + return products; + } + } + ); + JavaRDD, List>> relevantDocs = userMoviesList.join(userRecommendedList).values(); + + // Instantiate the metrics object + RankingMetrics metrics = RankingMetrics.of(relevantDocs); + + // Precision and NDCG at k + Integer[] kVector = {1, 3, 5}; + for (Integer k : kVector) { + 
System.out.format("Precision at %d = %f\n", k, metrics.precisionAt(k)); + System.out.format("NDCG at %d = %f\n", k, metrics.ndcgAt(k)); + } + + // Mean average precision + System.out.format("Mean average precision = %f\n", metrics.meanAveragePrecision()); + + // Evaluate the model using numerical ratings and regression metrics + JavaRDD> userProducts = ratings.map( + new Function>() { + public Tuple2 call(Rating r) { + return new Tuple2(r.user(), r.product()); + } + } + ); + JavaPairRDD, Object> predictions = JavaPairRDD.fromJavaRDD( + model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map( + new Function, Object>>() { + public Tuple2, Object> call(Rating r){ + return new Tuple2, Object>( + new Tuple2(r.user(), r.product()), r.rating()); + } + } + )); + JavaRDD> ratesAndPreds = + JavaPairRDD.fromJavaRDD(ratings.map( + new Function, Object>>() { + public Tuple2, Object> call(Rating r){ + return new Tuple2, Object>( + new Tuple2(r.user(), r.product()), r.rating()); + } + } + )).join(predictions).values(); + + // Create regression metrics object + RegressionMetrics regressionMetrics = new RegressionMetrics(ratesAndPreds.rdd()); + + // Root mean squared error + System.out.format("RMSE = %f\n", regressionMetrics.rootMeanSquaredError()); + + // R-squared + System.out.format("R-squared = %f\n", regressionMetrics.r2()); + } +} diff --git a/examples/src/main/python/mllib/binary_classification_metrics.py b/examples/src/main/python/mllib/binary_classification_metrics.py new file mode 100644 index 0000000000000..9155b02083b0c --- /dev/null +++ b/examples/src/main/python/mllib/binary_classification_metrics.py @@ -0,0 +1,63 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +""" +Binary Classification Metrics Example. +""" +from __future__ import print_function + +import sys + +# $example on$ +from pyspark import SparkContext,SQLContext +from pyspark.mllib.classification import LogisticRegressionWithLBFGS +from pyspark.mllib.evaluation import BinaryClassificationMetrics +from pyspark.mllib.regression import LabeledPoint +from pyspark.mllib.util import MLUtils +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="BinaryClassificationMetrics") + sqlContext = SQLContext(sc) + +# Several of the methods available in scala are currently missing from pyspark + +# $example on$ +# Load training data in LIBSVM format +data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt") + +# Split data into training (60%) and test (40%) +training, test = data.randomSplit([0.6, 0.4], seed = 11L) +training.cache() + +# Run training algorithm to build the model +model = LogisticRegressionWithLBFGS.train(training) + +# Compute raw scores on the test set +predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label)) + +# Instantiate metrics object +metrics = BinaryClassificationMetrics(predictionAndLabels) + +# Area under precision-recall curve +print("Area under PR = %s" % metrics.areaUnderPR) + +# Area under ROC curve +print("Area under ROC = %s" % metrics.areaUnderROC) +# $example off$ + diff --git 
a/examples/src/main/python/mllib/multi_class_metrics.py b/examples/src/main/python/mllib/multi_class_metrics.py new file mode 100644 index 0000000000000..07c0f462b188f --- /dev/null +++ b/examples/src/main/python/mllib/multi_class_metrics.py @@ -0,0 +1,69 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# $example on$ + +from pyspark.mllib.classification import LogisticRegressionWithLBFGS +from pyspark.mllib.util import MLUtils +from pyspark.mllib.evaluation import MulticlassMetrics + +# $example off$ +from pyspark import SparkContext + +if __name__ == "__main__": + sc = SparkContext(appName="MultiClassMetrics") + +# Several of the methods available in scala are currently missing from pyspark + +# Load training data in LIBSVM format +data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt") + +# Split data into training (60%) and test (40%) +training, test = data.randomSplit([0.6, 0.4], seed = 11L) +training.cache() + +# Run training algorithm to build the model +model = LogisticRegressionWithLBFGS.train(training, numClasses=3) + +# Compute raw scores on the test set +predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label)) + +# Instantiate metrics object +metrics = MulticlassMetrics(predictionAndLabels) + +# Overall statistics +precision = metrics.precision() +recall = metrics.recall() +f1Score = metrics.fMeasure() +print("Summary Stats") +print("Precision = %s" % precision) +print("Recall = %s" % recall) +print("F1 Score = %s" % f1Score) + +# Statistics by class +labels = data.map(lambda lp: lp.label).distinct().collect() +for label in sorted(labels): + print("Class %s precision = %s" % (label, metrics.precision(label))) + print("Class %s recall = %s" % (label, metrics.recall(label))) + print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0))) + +# Weighted stats +print("Weighted recall = %s" % metrics.weightedRecall) +print("Weighted precision = %s" % metrics.weightedPrecision) +print("Weighted F(1) Score = %s" % metrics.weightedFMeasure()) +print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5)) +print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate) \ No newline at end of file diff --git 
a/examples/src/main/python/mllib/multi_label_metrics.py b/examples/src/main/python/mllib/multi_label_metrics.py new file mode 100644 index 0000000000000..93b7e1a0cce2e --- /dev/null +++ b/examples/src/main/python/mllib/multi_label_metrics.py @@ -0,0 +1,63 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# $example on$ + +from pyspark.mllib.evaluation import MultilabelMetrics +from pyspark.mllib.util import MLUtils +from pyspark import SparkContext +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="MultiClassMetrics") +# $example on$ +scoreAndLabels = sc.parallelize([ + ([0.0, 1.0], [0.0, 2.0]), + ([0.0, 2.0], [0.0, 1.0]), + ([], [0.0]), + ([2.0], [2.0]), + ([2.0, 0.0], [2.0, 0.0]), + ([0.0, 1.0, 2.0], [0.0, 1.0]), + ([1.0], [1.0, 2.0])]) + +# Instantiate metrics object +metrics = MultilabelMetrics(scoreAndLabels) + +# Summary stats +print("Recall = %s" % metrics.recall()) +print("Precision = %s" % metrics.precision()) +print("F1 measure = %s" % metrics.f1Measure()) +print("Accuracy = %s" % metrics.accuracy) + +# Individual label stats +labels = scoreAndLabels.flatMap(lambda x: x[1]).distinct().collect() +for label in labels: + print("Class %s precision = %s" % (label, metrics.precision(label))) + print("Class %s recall = %s" % (label, metrics.recall(label))) + print("Class %s F1 Measure = %s" % (label, metrics.f1Measure(label))) + +# Micro stats +print("Micro precision = %s" % metrics.microPrecision) +print("Micro recall = %s" % metrics.microRecall) +print("Micro F1 measure = %s" % metrics.microF1Measure) + +# Hamming loss +print("Hamming loss = %s" % metrics.hammingLoss) + +# Subset accuracy +print("Subset accuracy = %s" % metrics.subsetAccuracy) +# $example off$ \ No newline at end of file diff --git a/examples/src/main/python/mllib/ranking_metrics.py b/examples/src/main/python/mllib/ranking_metrics.py new file mode 100644 index 0000000000000..7f8032ce17028 --- /dev/null +++ b/examples/src/main/python/mllib/ranking_metrics.py @@ -0,0 +1,54 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# $example on$ +from pyspark import SparkContext +from pyspark.mllib.recommendation import ALS, Rating +from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="Ranking Metrics") + +# Several of the methods available in scala are currently missing from pyspark + +# Read in the ratings data +lines = sc.textFile("data/mllib/sample_movielens_data.txt") + +def parseLine(line): + fields = line.split("::") + return Rating(int(fields[0]), int(fields[1]), float(fields[2]) - 2.5) +ratings = lines.map(lambda r: parseLine(r)) + +# Train a model on to predict user-product ratings +model = ALS.train(ratings, 10, 10, 0.01) + +# Get predicted ratings on all existing user-product pairs +testData = ratings.map(lambda p: (p.user, p.product)) +predictions = model.predictAll(testData).map(lambda r: ((r.user, r.product), r.rating)) + +ratingsTuple = ratings.map(lambda r: ((r.user, r.product), r.rating)) +scoreAndLabels = predictions.join(ratingsTuple).map(lambda tup: tup[1]) + +# Instantiate regression metrics to compare predicted and actual ratings +metrics = RegressionMetrics(scoreAndLabels) + +# Root mean sqaured error +print("RMSE = %s" % metrics.rootMeanSquaredError) + +# R-squared +print("R-squared = %s" % metrics.r2) \ No newline at end of file diff --git a/examples/src/main/python/mllib/regression_metrics.py 
b/examples/src/main/python/mllib/regression_metrics.py new file mode 100644 index 0000000000000..601268da546db --- /dev/null +++ b/examples/src/main/python/mllib/regression_metrics.py @@ -0,0 +1,55 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# $example on$ +from pyspark import SparkContext +from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD +from pyspark.mllib.evaluation import RegressionMetrics +from pyspark.mllib.linalg import DenseVector +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="Regression Metrics") + +def parsePoint(line): + values = line.split() + return LabeledPoint(float(values[0]), DenseVector([float(x.split(':')[1]) for x in values[1:]])) + +data = sc.textFile("data/mllib/sample_linear_regression_data.txt") +parsedData = data.map(parsePoint) + +# Build the model +model = LinearRegressionWithSGD.train(parsedData) + +# Get predictions +valuesAndPreds = parsedData.map(lambda p: (float(model.predict(p.features)), p.label)) + +# Instantiate metrics object +metrics = RegressionMetrics(valuesAndPreds) + +# Squared Error +print("MSE = %s" % metrics.meanSquaredError) +print("RMSE = %s" % metrics.rootMeanSquaredError) + +# R-squared +print("R-squared = %s" % 
metrics.r2) + +# Mean absolute error +print("MAE = %s" % metrics.meanAbsoluteError) + +# Explained variance +print("Explained variance = %s" % metrics.explainedVariance) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala new file mode 100644 index 0000000000000..db640ccc4a08e --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkContext, SparkConf} + +// $example on$ +import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS +import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.util.MLUtils +// $example off$ + +object BinaryClassificationMetrics { + + def main(args: Array[String]) { + + val conf = new SparkConf().setAppName("BinaryClassificationMetrics") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + import sqlContext.implicits._ + // $example on$ + // Load training data in LIBSVM format + val data = MLUtils.loadLibSVMFile(sc, "data/mllib/data/mllib/sample_binary_classification_data.txt") + + // Split data into training (60%) and test (40%) + val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L) + training.cache() + + // Run training algorithm to build the model + val model = new LogisticRegressionWithLBFGS() + .setNumClasses(2) + .run(training) + + // Clear the prediction threshold so the model will return probabilities + model.clearThreshold + + // Compute raw scores on the test set + val predictionAndLabels = test.map { case LabeledPoint(label, features) => + val prediction = model.predict(features) + (prediction, label) + } + + // Instantiate metrics object + val metrics = new BinaryClassificationMetrics(predictionAndLabels) + + // Precision by threshold + val precision = metrics.precisionByThreshold + precision.foreach { case (t, p) => + println(s"Threshold: $t, Precision: $p") + } + + // Recall by threshold + val recall = metrics.recallByThreshold + recall.foreach { case (t, r) => + println(s"Threshold: $t, Recall: $r") + } + + // Precision-Recall Curve + val PRC = metrics.pr + + // F-measure + val f1Score = metrics.fMeasureByThreshold + f1Score.foreach { case (t, f) => + 
println(s"Threshold: $t, F-score: $f, Beta = 1") + } + + val beta = 0.5 + val fScore = metrics.fMeasureByThreshold(beta) + f1Score.foreach { case (t, f) => + println(s"Threshold: $t, F-score: $f, Beta = 0.5") + } + + // AUPRC + val auPRC = metrics.areaUnderPR + println("Area under precision-recall curve = " + auPRC) + + // Compute thresholds used in ROC and PR curves + val thresholds = precision.map(_._1) + + // ROC Curve + val roc = metrics.roc + + // AUROC + val auROC = metrics.areaUnderROC + println("Area under ROC = " + auROC) + + // $example off$ + + } +} +// scalastyle:on println \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics new file mode 100644 index 0000000000000..a5fcb145b650e --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkContext, SparkConf} + +import org.apache.spark.mllib.evaluation.MultilabelMetrics +import org.apache.spark.rdd.RDD; +object MultiLabelMetrics { + + def main(args: Array[String]) { + + val conf = new SparkConf().setAppName("MultiLabelMetrics") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + import sqlContext.implicits._ + // $example on$ + val scoreAndLabels: RDD[(Array[Double], Array[Double])] = sc.parallelize( + Seq((Array(0.0, 1.0), Array(0.0, 2.0)), + (Array(0.0, 2.0), Array(0.0, 1.0)), + (Array(), Array(0.0)), + (Array(2.0), Array(2.0)), + (Array(2.0, 0.0), Array(2.0, 0.0)), + (Array(0.0, 1.0, 2.0), Array(0.0, 1.0)), + (Array(1.0), Array(1.0, 2.0))), 2) + + // Instantiate metrics object + val metrics = new MultilabelMetrics(scoreAndLabels) + + // Summary stats + println(s"Recall = ${metrics.recall}") + println(s"Precision = ${metrics.precision}") + println(s"F1 measure = ${metrics.f1Measure}") + println(s"Accuracy = ${metrics.accuracy}") + + // Individual label stats + metrics.labels.foreach(label => println(s"Class $label precision = ${metrics.precision(label)}")) + metrics.labels.foreach(label => println(s"Class $label recall = ${metrics.recall(label)}")) + metrics.labels.foreach(label => println(s"Class $label F1-score = ${metrics.f1Measure(label)}")) + + // Micro stats + println(s"Micro recall = ${metrics.microRecall}") + println(s"Micro precision = ${metrics.microPrecision}") + println(s"Micro F1 measure = ${metrics.microF1Measure}") + + // Hamming loss + println(s"Hamming loss = ${metrics.hammingLoss}") + + // Subset accuracy + println(s"Subset accuracy = ${metrics.subsetAccuracy}") + } +} \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala 
b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala new file mode 100644 index 0000000000000..0ed3c633f19d8 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +// $example on$ +import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS +import org.apache.spark.mllib.evaluation.MulticlassMetrics +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.util.MLUtils +// $example off$ + +import org.apache.spark.{SparkContext, SparkConf} + +object MulticlassMetrics { + + def main(args: Array[String]) { + + val conf = new SparkConf().setAppName("MulticlassMetrics") + val sc = new SparkContext(conf) + + // $example on$ + // Load training data in LIBSVM format + val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt") + + // Split data into training (60%) and test (40%) + val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L) + training.cache() + + // Run training algorithm to build the model + val model = new LogisticRegressionWithLBFGS() + .setNumClasses(3) + .run(training) + + // Compute raw scores on the test set + val predictionAndLabels = test.map { case LabeledPoint(label, features) => + val prediction = model.predict(features) + (prediction, label) + } + + // Instantiate metrics object + val metrics = new MulticlassMetrics(predictionAndLabels) + + // Confusion matrix + println("Confusion matrix:") + println(metrics.confusionMatrix) + + // Overall Statistics + val precision = metrics.precision + val recall = metrics.recall // same as true positive rate + val f1Score = metrics.fMeasure + println("Summary Statistics") + println(s"Precision = $precision") + println(s"Recall = $recall") + println(s"F1 Score = $f1Score") + + // Precision by label + val labels = metrics.labels + labels.foreach { l => + println(s"Precision($l) = " + metrics.precision(l)) + } + + // Recall by label + labels.foreach { l => + println(s"Recall($l) = " + metrics.recall(l)) + } + + // False positive rate by label + labels.foreach { l => + println(s"FPR($l) = " + 
metrics.falsePositiveRate(l)) + } + + // F-measure by label + labels.foreach { l => + println(s"F1-Score($l) = " + metrics.fMeasure(l)) + } + + // Weighted stats + println(s"Weighted precision: ${metrics.weightedPrecision}") + println(s"Weighted recall: ${metrics.weightedRecall}") + println(s"Weighted F1 score: ${metrics.weightedFMeasure}") + println(s"Weighted false positive rate: ${metrics.weightedFalsePositiveRate}") + + // $example off$ + + } +} +// scalastyle:on println \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala new file mode 100644 index 0000000000000..2d6b6455bb5e3 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkContext, SparkConf} +import org.apache.spark.mllib.evaluation.{RegressionMetrics, RankingMetrics} +import org.apache.spark.mllib.recommendation.{ALS, Rating} +object RankingMetrics { + + def main(args: Array[String]) { + + val conf = new SparkConf().setAppName("RankingMetrics") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + import sqlContext.implicits._ + // Read in the ratings data + val ratings = sc.textFile("data/mllib/sample_movielens_data.txt").map { line => + val fields = line.split("::") + Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble - 2.5) + }.cache() + + // Map ratings to 1 or 0, 1 indicating a movie that should be recommended + val binarizedRatings = ratings.map(r => Rating(r.user, r.product, if (r.rating > 0) 1.0 else 0.0)).cache() + + // Summarize ratings + val numRatings = ratings.count() + val numUsers = ratings.map(_.user).distinct().count() + val numMovies = ratings.map(_.product).distinct().count() + println(s"Got $numRatings ratings from $numUsers users on $numMovies movies.") + + // Build the model + val numIterations = 10 + val rank = 10 + val lambda = 0.01 + val model = ALS.train(ratings, rank, numIterations, lambda) + + // Define a function to scale ratings from 0 to 1 + def scaledRating(r: Rating): Rating = { + val scaledRating = math.max(math.min(r.rating, 1.0), 0.0) + Rating(r.user, r.product, scaledRating) + } + + // Get sorted top ten predictions for each user and then scale from [0, 1] + val userRecommended = model.recommendProductsForUsers(10).map { case (user, recs) => + (user, recs.map(scaledRating)) + } + + // Assume that any movie a user rated 3 or higher (which maps to a 1) is a relevant document + // Compare with top ten most relevant documents + val userMovies = binarizedRatings.groupBy(_.user) + val relevantDocuments = 
userMovies.join(userRecommended).map { case (user, (actual, predictions)) => + (predictions.map(_.product), actual.filter(_.rating > 0.0).map(_.product).toArray) + } + + // Instantiate metrics object + val metrics = new RankingMetrics(relevantDocuments) + + // Precision at K + Array(1, 3, 5).foreach { k => + println(s"Precision at $k = ${metrics.precisionAt(k)}") + } + + // Mean average precision + println(s"Mean average precision = ${metrics.meanAveragePrecision}") + + // Normalized discounted cumulative gain + Array(1, 3, 5).foreach { k => + println(s"NDCG at $k = ${metrics.ndcgAt(k)}") + } + + // Get predictions for each data point + val allPredictions = model.predict(ratings.map(r => (r.user, r.product))).map(r => ((r.user, r.product), r.rating)) + val allRatings = ratings.map(r => ((r.user, r.product), r.rating)) + val predictionsAndLabels = allPredictions.join(allRatings).map { case ((user, product), (predicted, actual)) => + (predicted, actual) + } + + // Get the RMSE using regression metrics + val regressionMetrics = new RegressionMetrics(predictionsAndLabels) + println(s"RMSE = ${regressionMetrics.rootMeanSquaredError}") + + // R-squared + println(s"R-squared = ${regressionMetrics.r2}") + } +} \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala new file mode 100644 index 0000000000000..71b1c2d2cda77 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.regression.LinearRegressionModel +import org.apache.spark.mllib.regression.LinearRegressionWithSGD +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.evaluation.RegressionMetrics +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkConf, SparkContext} + +object RegressionMetrics { + + def main(args: Array[String]) { + + val conf = new SparkConf().setAppName("RegressionMetrics") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + // Load the data + val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_linear_regression_data.txt").cache() + + // Build the model + val numIterations = 100 + val model = LinearRegressionWithSGD.train(data, numIterations) + + // Get predictions + val valuesAndPreds = data.map{ point => + val prediction = model.predict(point.features) + (prediction, point.label) + } + + // Instantiate metrics object + val metrics = new RegressionMetrics(valuesAndPreds) + + // Squared error + println(s"MSE = ${metrics.meanSquaredError}") + println(s"RMSE = ${metrics.rootMeanSquaredError}") + + // R-squared + println(s"R-squared = ${metrics.r2}") + + // Mean absolute error + println(s"MAE = ${metrics.meanAbsoluteError}") + + // Explained variance + println(s"Explained variance = ${metrics.explainedVariance}") + } +} \ No newline at end of file From 
cb9c846ffdb9d8d2f7deb42ecb2e4254caaf2462 Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Fri, 13 Nov 2015 13:35:22 +0530 Subject: [PATCH 02/13] Fixed scala issues --- docs/mllib-evaluation-metrics.md | 2 +- .../mllib/JavaBinaryClassification.java | 4 +- .../examples/mllib/JavaLinearRegression.java | 4 +- .../mllib/JavaMultiLabelClassification.java | 7 +- .../mllib/JavaMulticlassClassification.java | 4 +- .../spark/examples/mllib/JavaRanking.java | 3 +- .../mllib/binary_classification_metrics.py | 40 +++++----- .../main/python/mllib/multi_class_metrics.py | 70 ++++++++--------- .../main/python/mllib/multi_label_metrics.py | 78 +++++++++---------- .../src/main/python/mllib/ranking_metrics.py | 45 +++++------ .../main/python/mllib/regression_metrics.py | 47 +++++------ .../spark/examples/mllib/MultiLabelMetrics | 4 +- .../spark/examples/mllib/RankingMetrics.scala | 4 + .../examples/mllib/RegressionMetrics.scala | 5 +- 14 files changed, 168 insertions(+), 149 deletions(-) diff --git a/docs/mllib-evaluation-metrics.md b/docs/mllib-evaluation-metrics.md index 2991249161046..7a9792c4a1455 100644 --- a/docs/mllib-evaluation-metrics.md +++ b/docs/mllib-evaluation-metrics.md @@ -247,7 +247,7 @@ Refer to the [`MulticlassMetrics` Scala docs](api/scala/index.html#org.apache.sp
Refer to the [`MulticlassMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/MulticlassMetrics.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaMulticlassMetrics.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java %}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java index 45da1fec120ab..b17dc79abff16 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java @@ -19,7 +19,6 @@ package org.apache.spark.examples.mllib; // $example on$ - import scala.Tuple2; import org.apache.spark.api.java.*; @@ -32,7 +31,7 @@ import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; -// $example off$ + public class JavaBinaryClassification { public static void main(String[] args) { @@ -111,3 +110,4 @@ public Double call(Tuple2 t) { LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); } } +// $example off$ \ No newline at end of file diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java index 309efced045b6..cc60409b42859 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java @@ -19,6 +19,7 @@ package org.apache.spark.examples.mllib; // $example on$ +import scala.Tuple2; import org.apache.spark.api.java.*; import org.apache.spark.api.java.function.Function; @@ -28,7 +29,7 @@ import org.apache.spark.mllib.regression.LinearRegressionWithSGD; import org.apache.spark.mllib.evaluation.RegressionMetrics; import org.apache.spark.SparkConf; -// $example off$ + // Read in the ratings data public class JavaLinearRegression { @@ -88,3 +89,4 @@ public Tuple2 call(LabeledPoint point) { LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), "myModelPath"); } } +// $example off$ \ No newline at end of 
file diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java index 5b3a03f0830b1..53204523bc865 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java @@ -25,12 +25,12 @@ import org.apache.spark.rdd.RDD; import org.apache.spark.mllib.evaluation.MultilabelMetrics; import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; import java.util.Arrays; import java.util.List; // $example off$ - -public class MultilabelClassification { +import org.apache.spark.SparkContext; +// $example on$ +public class JavaMultiLabelClassification { public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("Multilabel Classification Metrics"); JavaSparkContext sc = new JavaSparkContext(conf); @@ -75,3 +75,4 @@ public static void main(String[] args) { } } +// $example off$ \ No newline at end of file diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java index a5e92df358d14..0e74da7a883d1 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java @@ -18,6 +18,7 @@ // scalastyle:off println package org.apache.spark.examples.mllib +// $example on$ import scala.Tuple2; import org.apache.spark.api.java.*; @@ -89,4 +90,5 @@ public Tuple2 call(LabeledPoint p) { model.save(sc, "myModelPath"); LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); } -} \ No newline at end of file +} +// $example off$ \ No newline at end of file diff --git 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java index 2162cc658f193..b389a09c2715f 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java @@ -31,7 +31,7 @@ import org.apache.spark.mllib.evaluation.RankingMetrics; import org.apache.spark.mllib.recommendation.ALS; import org.apache.spark.mllib.recommendation.Rating; -// $example off$ + // Read in the ratings data public class JavaRanking { @@ -173,3 +173,4 @@ public Tuple2, Object> call(Rating r){ System.out.format("R-squared = %f\n", regressionMetrics.r2()); } } +// $example off$ \ No newline at end of file diff --git a/examples/src/main/python/mllib/binary_classification_metrics.py b/examples/src/main/python/mllib/binary_classification_metrics.py index 9155b02083b0c..85583c7e6cfa7 100644 --- a/examples/src/main/python/mllib/binary_classification_metrics.py +++ b/examples/src/main/python/mllib/binary_classification_metrics.py @@ -23,8 +23,9 @@ import sys + +from pyspark import SparkContext, SQLContext # $example on$ -from pyspark import SparkContext,SQLContext from pyspark.mllib.classification import LogisticRegressionWithLBFGS from pyspark.mllib.evaluation import BinaryClassificationMetrics from pyspark.mllib.regression import LabeledPoint @@ -35,29 +36,28 @@ sc = SparkContext(appName="BinaryClassificationMetrics") sqlContext = SQLContext(sc) -# Several of the methods available in scala are currently missing from pyspark - -# $example on$ -# Load training data in LIBSVM format -data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt") + # Several of the methods available in scala are currently missing from pyspark -# Split data into training (60%) and test (40%) -training, test = data.randomSplit([0.6, 0.4], seed = 11L) -training.cache() + # $example on$ + # Load training data in 
LIBSVM format + data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt") -# Run training algorithm to build the model -model = LogisticRegressionWithLBFGS.train(training) + # Split data into training (60%) and test (40%) + training, test = data.randomSplit([0.6, 0.4], seed=11L) + training.cache() -# Compute raw scores on the test set -predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label)) + # Run training algorithm to build the model + model = LogisticRegressionWithLBFGS.train(training) -# Instantiate metrics object -metrics = BinaryClassificationMetrics(predictionAndLabels) + # Compute raw scores on the test set + predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label)) -# Area under precision-recall curve -print("Area under PR = %s" % metrics.areaUnderPR) + # Instantiate metrics object + metrics = BinaryClassificationMetrics(predictionAndLabels) -# Area under ROC curve -print("Area under ROC = %s" % metrics.areaUnderROC) -# $example off$ + # Area under precision-recall curve + print("Area under PR = %s" % metrics.areaUnderPR) + # Area under ROC curve + print("Area under ROC = %s" % metrics.areaUnderROC) + # $example off$ diff --git a/examples/src/main/python/mllib/multi_class_metrics.py b/examples/src/main/python/mllib/multi_class_metrics.py index 07c0f462b188f..7959b7230a563 100644 --- a/examples/src/main/python/mllib/multi_class_metrics.py +++ b/examples/src/main/python/mllib/multi_class_metrics.py @@ -16,54 +16,54 @@ # # $example on$ - from pyspark.mllib.classification import LogisticRegressionWithLBFGS from pyspark.mllib.util import MLUtils from pyspark.mllib.evaluation import MulticlassMetrics - # $example off$ + from pyspark import SparkContext if __name__ == "__main__": sc = SparkContext(appName="MultiClassMetrics") -# Several of the methods available in scala are currently missing from pyspark - -# Load training data in LIBSVM format -data = 
MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt") + # Several of the methods available in scala are currently missing from pyspark + # $example on$ + # Load training data in LIBSVM format + data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt") -# Split data into training (60%) and test (40%) -training, test = data.randomSplit([0.6, 0.4], seed = 11L) -training.cache() + # Split data into training (60%) and test (40%) + training, test = data.randomSplit([0.6, 0.4], seed=11L) + training.cache() -# Run training algorithm to build the model -model = LogisticRegressionWithLBFGS.train(training, numClasses=3) + # Run training algorithm to build the model + model = LogisticRegressionWithLBFGS.train(training, numClasses=3) -# Compute raw scores on the test set -predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label)) + # Compute raw scores on the test set + predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label)) -# Instantiate metrics object -metrics = MulticlassMetrics(predictionAndLabels) + # Instantiate metrics object + metrics = MulticlassMetrics(predictionAndLabels) -# Overall statistics -precision = metrics.precision() -recall = metrics.recall() -f1Score = metrics.fMeasure() -print("Summary Stats") -print("Precision = %s" % precision) -print("Recall = %s" % recall) -print("F1 Score = %s" % f1Score) + # Overall statistics + precision = metrics.precision() + recall = metrics.recall() + f1Score = metrics.fMeasure() + print("Summary Stats") + print("Precision = %s" % precision) + print("Recall = %s" % recall) + print("F1 Score = %s" % f1Score) -# Statistics by class -labels = data.map(lambda lp: lp.label).distinct().collect() -for label in sorted(labels): - print("Class %s precision = %s" % (label, metrics.precision(label))) - print("Class %s recall = %s" % (label, metrics.recall(label))) - print("Class %s F1 Measure = %s" % (label, 
metrics.fMeasure(label, beta=1.0))) + # Statistics by class + labels = data.map(lambda lp: lp.label).distinct().collect() + for label in sorted(labels): + print("Class %s precision = %s" % (label, metrics.precision(label))) + print("Class %s recall = %s" % (label, metrics.recall(label))) + print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0))) -# Weighted stats -print("Weighted recall = %s" % metrics.weightedRecall) -print("Weighted precision = %s" % metrics.weightedPrecision) -print("Weighted F(1) Score = %s" % metrics.weightedFMeasure()) -print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5)) -print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate) \ No newline at end of file + # Weighted stats + print("Weighted recall = %s" % metrics.weightedRecall) + print("Weighted precision = %s" % metrics.weightedPrecision) + print("Weighted F(1) Score = %s" % metrics.weightedFMeasure()) + print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5)) + print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate) + # $example off$ diff --git a/examples/src/main/python/mllib/multi_label_metrics.py b/examples/src/main/python/mllib/multi_label_metrics.py index 93b7e1a0cce2e..d02d8d862d1e1 100644 --- a/examples/src/main/python/mllib/multi_label_metrics.py +++ b/examples/src/main/python/mllib/multi_label_metrics.py @@ -16,48 +16,48 @@ # # $example on$ - from pyspark.mllib.evaluation import MultilabelMetrics +# $example off$ from pyspark.mllib.util import MLUtils from pyspark import SparkContext -# $example off$ + if __name__ == "__main__": sc = SparkContext(appName="MultiClassMetrics") -# $example on$ -scoreAndLabels = sc.parallelize([ - ([0.0, 1.0], [0.0, 2.0]), - ([0.0, 2.0], [0.0, 1.0]), - ([], [0.0]), - ([2.0], [2.0]), - ([2.0, 0.0], [2.0, 0.0]), - ([0.0, 1.0, 2.0], [0.0, 1.0]), - ([1.0], [1.0, 2.0])]) - -# Instantiate metrics object -metrics = MultilabelMetrics(scoreAndLabels) - -# 
Summary stats -print("Recall = %s" % metrics.recall()) -print("Precision = %s" % metrics.precision()) -print("F1 measure = %s" % metrics.f1Measure()) -print("Accuracy = %s" % metrics.accuracy) - -# Individual label stats -labels = scoreAndLabels.flatMap(lambda x: x[1]).distinct().collect() -for label in labels: - print("Class %s precision = %s" % (label, metrics.precision(label))) - print("Class %s recall = %s" % (label, metrics.recall(label))) - print("Class %s F1 Measure = %s" % (label, metrics.f1Measure(label))) - -# Micro stats -print("Micro precision = %s" % metrics.microPrecision) -print("Micro recall = %s" % metrics.microRecall) -print("Micro F1 measure = %s" % metrics.microF1Measure) - -# Hamming loss -print("Hamming loss = %s" % metrics.hammingLoss) - -# Subset accuracy -print("Subset accuracy = %s" % metrics.subsetAccuracy) -# $example off$ \ No newline at end of file + # $example on$ + scoreAndLabels = sc.parallelize([ + ([0.0, 1.0], [0.0, 2.0]), + ([0.0, 2.0], [0.0, 1.0]), + ([], [0.0]), + ([2.0], [2.0]), + ([2.0, 0.0], [2.0, 0.0]), + ([0.0, 1.0, 2.0], [0.0, 1.0]), + ([1.0], [1.0, 2.0])]) + + # Instantiate metrics object + metrics = MultilabelMetrics(scoreAndLabels) + + # Summary stats + print("Recall = %s" % metrics.recall()) + print("Precision = %s" % metrics.precision()) + print("F1 measure = %s" % metrics.f1Measure()) + print("Accuracy = %s" % metrics.accuracy) + + # Individual label stats + labels = scoreAndLabels.flatMap(lambda x: x[1]).distinct().collect() + for label in labels: + print("Class %s precision = %s" % (label, metrics.precision(label))) + print("Class %s recall = %s" % (label, metrics.recall(label))) + print("Class %s F1 Measure = %s" % (label, metrics.f1Measure(label))) + + # Micro stats + print("Micro precision = %s" % metrics.microPrecision) + print("Micro recall = %s" % metrics.microRecall) + print("Micro F1 measure = %s" % metrics.microF1Measure) + + # Hamming loss + print("Hamming loss = %s" % metrics.hammingLoss) + + # Subset 
accuracy + print("Subset accuracy = %s" % metrics.subsetAccuracy) + # $example off$ diff --git a/examples/src/main/python/mllib/ranking_metrics.py b/examples/src/main/python/mllib/ranking_metrics.py index 7f8032ce17028..6fcdf3032d3dc 100644 --- a/examples/src/main/python/mllib/ranking_metrics.py +++ b/examples/src/main/python/mllib/ranking_metrics.py @@ -16,39 +16,40 @@ # # $example on$ -from pyspark import SparkContext from pyspark.mllib.recommendation import ALS, Rating from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics # $example off$ +from pyspark import SparkContext if __name__ == "__main__": sc = SparkContext(appName="Ranking Metrics") -# Several of the methods available in scala are currently missing from pyspark - -# Read in the ratings data -lines = sc.textFile("data/mllib/sample_movielens_data.txt") + # Several of the methods available in scala are currently missing from pyspark + # $example on$ + # Read in the ratings data + lines = sc.textFile("data/mllib/sample_movielens_data.txt") -def parseLine(line): - fields = line.split("::") - return Rating(int(fields[0]), int(fields[1]), float(fields[2]) - 2.5) -ratings = lines.map(lambda r: parseLine(r)) + def parseLine(line): + fields = line.split("::") + return Rating(int(fields[0]), int(fields[1]), float(fields[2]) - 2.5) + ratings = lines.map(lambda r: parseLine(r)) -# Train a model on to predict user-product ratings -model = ALS.train(ratings, 10, 10, 0.01) + # Train a model on to predict user-product ratings + model = ALS.train(ratings, 10, 10, 0.01) -# Get predicted ratings on all existing user-product pairs -testData = ratings.map(lambda p: (p.user, p.product)) -predictions = model.predictAll(testData).map(lambda r: ((r.user, r.product), r.rating)) + # Get predicted ratings on all existing user-product pairs + testData = ratings.map(lambda p: (p.user, p.product)) + predictions = model.predictAll(testData).map(lambda r: ((r.user, r.product), r.rating)) -ratingsTuple = 
ratings.map(lambda r: ((r.user, r.product), r.rating)) -scoreAndLabels = predictions.join(ratingsTuple).map(lambda tup: tup[1]) + ratingsTuple = ratings.map(lambda r: ((r.user, r.product), r.rating)) + scoreAndLabels = predictions.join(ratingsTuple).map(lambda tup: tup[1]) -# Instantiate regression metrics to compare predicted and actual ratings -metrics = RegressionMetrics(scoreAndLabels) + # Instantiate regression metrics to compare predicted and actual ratings + metrics = RegressionMetrics(scoreAndLabels) -# Root mean sqaured error -print("RMSE = %s" % metrics.rootMeanSquaredError) + # Root mean sqaured error + print("RMSE = %s" % metrics.rootMeanSquaredError) -# R-squared -print("R-squared = %s" % metrics.r2) \ No newline at end of file + # R-squared + print("R-squared = %s" % metrics.r2) + # $example off$ diff --git a/examples/src/main/python/mllib/regression_metrics.py b/examples/src/main/python/mllib/regression_metrics.py index 601268da546db..2b90f2457267c 100644 --- a/examples/src/main/python/mllib/regression_metrics.py +++ b/examples/src/main/python/mllib/regression_metrics.py @@ -16,40 +16,43 @@ # # $example on$ -from pyspark import SparkContext from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD from pyspark.mllib.evaluation import RegressionMetrics from pyspark.mllib.linalg import DenseVector # $example off$ +from pyspark import SparkContext + if __name__ == "__main__": sc = SparkContext(appName="Regression Metrics") + # $example on$ + # Load and parse the data + def parsePoint(line): + values = line.split() + return LabeledPoint(float(values[0]),DenseVector([float(x.split(':')[1]) for x in values[1:]])) -def parsePoint(line): - values = line.split() - return LabeledPoint(float(values[0]), DenseVector([float(x.split(':')[1]) for x in values[1:]])) - -data = sc.textFile("data/mllib/sample_linear_regression_data.txt") -parsedData = data.map(parsePoint) + data = sc.textFile("data/mllib/sample_linear_regression_data.txt") + parsedData 
= data.map(parsePoint) -# Build the model -model = LinearRegressionWithSGD.train(parsedData) + # Build the model + model = LinearRegressionWithSGD.train(parsedData) -# Get predictions -valuesAndPreds = parsedData.map(lambda p: (float(model.predict(p.features)), p.label)) + # Get predictions + valuesAndPreds = parsedData.map(lambda p: (float(model.predict(p.features)), p.label)) -# Instantiate metrics object -metrics = RegressionMetrics(valuesAndPreds) + # Instantiate metrics object + metrics = RegressionMetrics(valuesAndPreds) -# Squared Error -print("MSE = %s" % metrics.meanSquaredError) -print("RMSE = %s" % metrics.rootMeanSquaredError) + # Squared Error + print("MSE = %s" % metrics.meanSquaredError) + print("RMSE = %s" % metrics.rootMeanSquaredError) -# R-squared -print("R-squared = %s" % metrics.r2) + # R-squared + print("R-squared = %s" % metrics.r2) -# Mean absolute error -print("MAE = %s" % metrics.meanAbsoluteError) + # Mean absolute error + print("MAE = %s" % metrics.meanAbsoluteError) -# Explained variance -print("Explained variance = %s" % metrics.explainedVariance) + # Explained variance + print("Explained variance = %s" % metrics.explainedVariance) + # $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics index a5fcb145b650e..020b86d2b332c 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics @@ -20,9 +20,10 @@ package org.apache.spark.examples.mllib import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkContext, SparkConf} - +// $example on$ import org.apache.spark.mllib.evaluation.MultilabelMetrics import org.apache.spark.rdd.RDD; +// $example off$ object MultiLabelMetrics { def main(args: Array[String]) { @@ -65,5 +66,6 @@ object MultiLabelMetrics { // Subset accuracy println(s"Subset accuracy = 
${metrics.subsetAccuracy}") + // $example off$ } } \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala index 2d6b6455bb5e3..9a7a25357f596 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala @@ -20,8 +20,10 @@ package org.apache.spark.examples.mllib import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkContext, SparkConf} +// $example on$ import org.apache.spark.mllib.evaluation.{RegressionMetrics, RankingMetrics} import org.apache.spark.mllib.recommendation.{ALS, Rating} +// $example off$ object RankingMetrics { def main(args: Array[String]) { @@ -30,6 +32,7 @@ object RankingMetrics { val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ + // $example on$ // Read in the ratings data val ratings = sc.textFile("data/mllib/sample_movielens_data.txt").map { line => val fields = line.split("::") @@ -98,5 +101,6 @@ object RankingMetrics { // R-squared println(s"R-squared = ${regressionMetrics.r2}") + // $example off$ } } \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala index 71b1c2d2cda77..7dc77caeafa7a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala @@ -17,13 +17,14 @@ // scalastyle:off println package org.apache.spark.examples.mllib - +// $example on$ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.LinearRegressionModel import org.apache.spark.mllib.regression.LinearRegressionWithSGD import 
org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.mllib.util.MLUtils +// $example off$ import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} @@ -34,6 +35,7 @@ object RegressionMetrics { val conf = new SparkConf().setAppName("RegressionMetrics") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) + // $example on$ // Load the data val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_linear_regression_data.txt").cache() @@ -62,5 +64,6 @@ object RegressionMetrics { // Explained variance println(s"Explained variance = ${metrics.explainedVariance}") + // $example off$ } } \ No newline at end of file From ed33687f85833f94f845f27ba361cf8d6dbb0169 Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Fri, 13 Nov 2015 14:31:38 +0530 Subject: [PATCH 03/13] fixed styling issues --- dev/_site/README.md | 5 + dev/_site/audit-release/README.md | 11 + dev/_site/audit-release/audit_release.py | 237 ++++++++ .../audit-release/blank_maven_build/pom.xml | 43 ++ .../audit-release/blank_sbt_build/build.sbt | 30 + .../audit-release/maven_app_core/input.txt | 8 + .../audit-release/maven_app_core/pom.xml | 52 ++ .../src/main/java/SimpleApp.java | 42 ++ .../audit-release/sbt_app_core/build.sbt | 28 + .../audit-release/sbt_app_core/input.txt | 8 + .../src/main/scala/SparkApp.scala | 63 ++ .../audit-release/sbt_app_ganglia/build.sbt | 30 + .../src/main/scala/SparkApp.scala | 41 ++ .../audit-release/sbt_app_graphx/build.sbt | 28 + .../src/main/scala/GraphxApp.scala | 55 ++ .../audit-release/sbt_app_hive/build.sbt | 29 + dev/_site/audit-release/sbt_app_hive/data.txt | 9 + .../sbt_app_hive/src/main/scala/HiveApp.scala | 59 ++ .../audit-release/sbt_app_kinesis/build.sbt | 28 + .../src/main/scala/SparkApp.scala | 35 ++ dev/_site/audit-release/sbt_app_sql/build.sbt | 28 + .../sbt_app_sql/src/main/scala/SqlApp.scala | 61 ++ .../audit-release/sbt_app_streaming/build.sbt | 28 
+ .../src/main/scala/StreamingApp.scala | 65 ++ dev/_site/change-scala-version.sh | 70 +++ dev/_site/change-version-to-2.10.sh | 23 + dev/_site/change-version-to-2.11.sh | 23 + dev/_site/check-license | 85 +++ .../create-release/generate-changelist.py | 148 +++++ .../create-release/generate-contributors.py | 248 ++++++++ dev/_site/create-release/known_translations | 167 ++++++ dev/_site/create-release/release-build.sh | 326 ++++++++++ dev/_site/create-release/release-tag.sh | 79 +++ dev/_site/create-release/releaseutils.py | 260 ++++++++ .../create-release/translate-contributors.py | 253 ++++++++ dev/_site/github_jira_sync.py | 147 +++++ dev/_site/lint-python | 114 ++++ dev/_site/lint-r | 41 ++ dev/_site/lint-r.R | 37 ++ dev/_site/lint-scala | 23 + dev/_site/merge_spark_pr.py | 453 ++++++++++++++ dev/_site/mima | 54 ++ dev/_site/run-tests | 23 + dev/_site/run-tests-jenkins | 28 + dev/_site/run-tests-jenkins.py | 228 +++++++ dev/_site/run-tests.py | 561 ++++++++++++++++++ dev/_site/scalastyle | 34 ++ dev/_site/sparktestsupport/modules.py | 437 ++++++++++++++ dev/_site/sparktestsupport/shellutils.py | 115 ++++ dev/_site/tests/pr_merge_ability.sh | 39 ++ dev/_site/tests/pr_new_dependencies.sh | 117 ++++ dev/_site/tests/pr_public_classes.sh | 65 ++ ...tiLabelMetrics => MultiLabelMetrics.scala} | 0 53 files changed, 5221 insertions(+) create mode 100644 dev/_site/README.md create mode 100644 dev/_site/audit-release/README.md create mode 100755 dev/_site/audit-release/audit_release.py create mode 100644 dev/_site/audit-release/blank_maven_build/pom.xml create mode 100644 dev/_site/audit-release/blank_sbt_build/build.sbt create mode 100644 dev/_site/audit-release/maven_app_core/input.txt create mode 100644 dev/_site/audit-release/maven_app_core/pom.xml create mode 100644 dev/_site/audit-release/maven_app_core/src/main/java/SimpleApp.java create mode 100644 dev/_site/audit-release/sbt_app_core/build.sbt create mode 100644 dev/_site/audit-release/sbt_app_core/input.txt 
create mode 100644 dev/_site/audit-release/sbt_app_core/src/main/scala/SparkApp.scala create mode 100644 dev/_site/audit-release/sbt_app_ganglia/build.sbt create mode 100644 dev/_site/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala create mode 100644 dev/_site/audit-release/sbt_app_graphx/build.sbt create mode 100644 dev/_site/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala create mode 100644 dev/_site/audit-release/sbt_app_hive/build.sbt create mode 100644 dev/_site/audit-release/sbt_app_hive/data.txt create mode 100644 dev/_site/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala create mode 100644 dev/_site/audit-release/sbt_app_kinesis/build.sbt create mode 100644 dev/_site/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala create mode 100644 dev/_site/audit-release/sbt_app_sql/build.sbt create mode 100644 dev/_site/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala create mode 100644 dev/_site/audit-release/sbt_app_streaming/build.sbt create mode 100644 dev/_site/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala create mode 100755 dev/_site/change-scala-version.sh create mode 100755 dev/_site/change-version-to-2.10.sh create mode 100755 dev/_site/change-version-to-2.11.sh create mode 100755 dev/_site/check-license create mode 100755 dev/_site/create-release/generate-changelist.py create mode 100755 dev/_site/create-release/generate-contributors.py create mode 100644 dev/_site/create-release/known_translations create mode 100755 dev/_site/create-release/release-build.sh create mode 100755 dev/_site/create-release/release-tag.sh create mode 100755 dev/_site/create-release/releaseutils.py create mode 100755 dev/_site/create-release/translate-contributors.py create mode 100755 dev/_site/github_jira_sync.py create mode 100755 dev/_site/lint-python create mode 100755 dev/_site/lint-r create mode 100644 dev/_site/lint-r.R create mode 100755 dev/_site/lint-scala create mode 100755 dev/_site/merge_spark_pr.py 
create mode 100755 dev/_site/mima create mode 100755 dev/_site/run-tests create mode 100755 dev/_site/run-tests-jenkins create mode 100755 dev/_site/run-tests-jenkins.py create mode 100755 dev/_site/run-tests.py create mode 100755 dev/_site/scalastyle create mode 100644 dev/_site/sparktestsupport/modules.py create mode 100644 dev/_site/sparktestsupport/shellutils.py create mode 100755 dev/_site/tests/pr_merge_ability.sh create mode 100755 dev/_site/tests/pr_new_dependencies.sh create mode 100755 dev/_site/tests/pr_public_classes.sh rename examples/src/main/scala/org/apache/spark/examples/mllib/{MultiLabelMetrics => MultiLabelMetrics.scala} (100%) diff --git a/dev/_site/README.md b/dev/_site/README.md new file mode 100644 index 0000000000000..2b0f3d8ee8924 --- /dev/null +++ b/dev/_site/README.md @@ -0,0 +1,5 @@ +# Spark Developer Scripts +This directory contains scripts useful to developers when packaging, +testing, or committing to Spark. + +Many of these scripts require Apache credentials to work correctly. diff --git a/dev/_site/audit-release/README.md b/dev/_site/audit-release/README.md new file mode 100644 index 0000000000000..f72f8c653a265 --- /dev/null +++ b/dev/_site/audit-release/README.md @@ -0,0 +1,11 @@ +# Test Application Builds +This directory includes test applications which are built when auditing releases. You can +run them locally by setting appropriate environment variables. + +``` +$ cd sbt_app_core +$ SCALA_VERSION=2.10.5 \ + SPARK_VERSION=1.0.0-SNAPSHOT \ + SPARK_RELEASE_REPOSITORY=file:///home/patrick/.ivy2/local \ + sbt run +``` diff --git a/dev/_site/audit-release/audit_release.py b/dev/_site/audit-release/audit_release.py new file mode 100755 index 0000000000000..27d1dd784ce2e --- /dev/null +++ b/dev/_site/audit-release/audit_release.py @@ -0,0 +1,237 @@ +#!/usr/bin/python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Audits binary and maven artifacts for a Spark release. +# Requires GPG and Maven. +# usage: +# python audit_release.py + +import os +import re +import shutil +import subprocess +import sys +import time +import urllib2 + +# Note: The following variables must be set before use! +RELEASE_URL = "http://people.apache.org/~andrewor14/spark-1.1.1-rc1/" +RELEASE_KEY = "XXXXXXXX" # Your 8-digit hex +RELEASE_REPOSITORY = "https://repository.apache.org/content/repositories/orgapachespark-1033" +RELEASE_VERSION = "1.1.1" +SCALA_VERSION = "2.10.5" +SCALA_BINARY_VERSION = "2.10" + +# Do not set these +LOG_FILE_NAME = "spark_audit_%s" % time.strftime("%h_%m_%Y_%I_%M_%S") +LOG_FILE = open(LOG_FILE_NAME, 'w') +WORK_DIR = "/tmp/audit_%s" % int(time.time()) +MAVEN_CMD = "mvn" +GPG_CMD = "gpg" +SBT_CMD = "sbt -Dsbt.log.noformat=true" + +# Track failures to print them at the end +failures = [] + +# Log a message. Use sparingly because this flushes every write. +def log(msg): + LOG_FILE.write(msg + "\n") + LOG_FILE.flush() + +def log_and_print(msg): + print msg + log(msg) + +# Prompt the user to delete the scratch directory used +def clean_work_files(): + response = raw_input("OK to delete scratch directory '%s'? 
(y/N) " % WORK_DIR) + if response == "y": + shutil.rmtree(WORK_DIR) + +# Run the given command and log its output to the log file +def run_cmd(cmd, exit_on_failure=True): + log("Running command: %s" % cmd) + ret = subprocess.call(cmd, shell=True, stdout=LOG_FILE, stderr=LOG_FILE) + if ret != 0 and exit_on_failure: + log_and_print("Command failed: %s" % cmd) + clean_work_files() + sys.exit(-1) + return ret + +def run_cmd_with_output(cmd): + log_and_print("Running command: %s" % cmd) + return subprocess.check_output(cmd, shell=True, stderr=LOG_FILE) + +# Test if the given condition is successful +# If so, print the pass message; otherwise print the failure message +def test(cond, msg): + return passed(msg) if cond else failed(msg) + +def passed(msg): + log_and_print("[PASSED] %s" % msg) + +def failed(msg): + failures.append(msg) + log_and_print("[**FAILED**] %s" % msg) + +def get_url(url): + return urllib2.urlopen(url).read() + +# If the path exists, prompt the user to delete it +# If the resource is not deleted, abort +def ensure_path_not_present(path): + full_path = os.path.expanduser(path) + if os.path.exists(full_path): + print "Found %s locally." % full_path + response = raw_input("This can interfere with testing published artifacts. OK to delete? (y/N) ") + if response == "y": + shutil.rmtree(full_path) + else: + print "Abort." + sys.exit(-1) + +log_and_print("|-------- Starting Spark audit tests for release %s --------|" % RELEASE_VERSION) +log_and_print("Log output can be found in %s" % LOG_FILE_NAME) + +original_dir = os.getcwd() + +# For each of these modules, we'll test an 'empty' application in sbt and +# maven that links against them. This will catch issues with messed up +# dependencies within those projects. 
+modules = [ + "spark-core", "spark-bagel", "spark-mllib", "spark-streaming", "spark-repl", + "spark-graphx", "spark-streaming-flume", "spark-streaming-kafka", + "spark-streaming-mqtt", "spark-streaming-twitter", "spark-streaming-zeromq", + "spark-catalyst", "spark-sql", "spark-hive", "spark-streaming-kinesis-asl" +] +modules = map(lambda m: "%s_%s" % (m, SCALA_BINARY_VERSION), modules) + +# Check for directories that might interfere with tests +local_ivy_spark = "~/.ivy2/local/org.apache.spark" +cache_ivy_spark = "~/.ivy2/cache/org.apache.spark" +local_maven_kafka = "~/.m2/repository/org/apache/kafka" +local_maven_kafka = "~/.m2/repository/org/apache/spark" +map(ensure_path_not_present, [local_ivy_spark, cache_ivy_spark, local_maven_kafka]) + +# SBT build tests +log_and_print("==== Building SBT modules ====") +os.chdir("blank_sbt_build") +os.environ["SPARK_VERSION"] = RELEASE_VERSION +os.environ["SCALA_VERSION"] = SCALA_VERSION +os.environ["SPARK_RELEASE_REPOSITORY"] = RELEASE_REPOSITORY +os.environ["SPARK_AUDIT_MASTER"] = "local" +for module in modules: + log("==== Building module %s in SBT ====" % module) + os.environ["SPARK_MODULE"] = module + ret = run_cmd("%s clean update" % SBT_CMD, exit_on_failure=False) + test(ret == 0, "SBT build against '%s' module" % module) +os.chdir(original_dir) + +# SBT application tests +log_and_print("==== Building SBT applications ====") +for app in ["sbt_app_core", "sbt_app_graphx", "sbt_app_streaming", "sbt_app_sql", "sbt_app_hive", "sbt_app_kinesis"]: + log("==== Building application %s in SBT ====" % app) + os.chdir(app) + ret = run_cmd("%s clean run" % SBT_CMD, exit_on_failure=False) + test(ret == 0, "SBT application (%s)" % app) + os.chdir(original_dir) + +# Maven build tests +os.chdir("blank_maven_build") +log_and_print("==== Building Maven modules ====") +for module in modules: + log("==== Building module %s in maven ====" % module) + cmd = ('%s --update-snapshots -Dspark.release.repository="%s" -Dspark.version="%s" ' + 
'-Dspark.module="%s" clean compile' % + (MAVEN_CMD, RELEASE_REPOSITORY, RELEASE_VERSION, module)) + ret = run_cmd(cmd, exit_on_failure=False) + test(ret == 0, "maven build against '%s' module" % module) +os.chdir(original_dir) + +# Maven application tests +log_and_print("==== Building Maven applications ====") +os.chdir("maven_app_core") +mvn_exec_cmd = ('%s --update-snapshots -Dspark.release.repository="%s" -Dspark.version="%s" ' + '-Dscala.binary.version="%s" clean compile ' + 'exec:java -Dexec.mainClass="SimpleApp"' % + (MAVEN_CMD, RELEASE_REPOSITORY, RELEASE_VERSION, SCALA_BINARY_VERSION)) +ret = run_cmd(mvn_exec_cmd, exit_on_failure=False) +test(ret == 0, "maven application (core)") +os.chdir(original_dir) + +# Binary artifact tests +if os.path.exists(WORK_DIR): + print "Working directory '%s' already exists" % WORK_DIR + sys.exit(-1) +os.mkdir(WORK_DIR) +os.chdir(WORK_DIR) + +index_page = get_url(RELEASE_URL) +artifact_regex = r = re.compile("") +artifacts = r.findall(index_page) + +# Verify artifact integrity +for artifact in artifacts: + log_and_print("==== Verifying download integrity for artifact: %s ====" % artifact) + + artifact_url = "%s/%s" % (RELEASE_URL, artifact) + key_file = "%s.asc" % artifact + run_cmd("wget %s" % artifact_url) + run_cmd("wget %s/%s" % (RELEASE_URL, key_file)) + run_cmd("wget %s%s" % (artifact_url, ".sha")) + + # Verify signature + run_cmd("%s --keyserver pgp.mit.edu --recv-key %s" % (GPG_CMD, RELEASE_KEY)) + run_cmd("%s %s" % (GPG_CMD, key_file)) + passed("Artifact signature verified.") + + # Verify md5 + my_md5 = run_cmd_with_output("%s --print-md MD5 %s" % (GPG_CMD, artifact)).strip() + release_md5 = get_url("%s.md5" % artifact_url).strip() + test(my_md5 == release_md5, "Artifact MD5 verified.") + + # Verify sha + my_sha = run_cmd_with_output("%s --print-md SHA512 %s" % (GPG_CMD, artifact)).strip() + release_sha = get_url("%s.sha" % artifact_url).strip() + test(my_sha == release_sha, "Artifact SHA verified.") + + # Verify 
Apache required files + dir_name = artifact.replace(".tgz", "") + run_cmd("tar xvzf %s" % artifact) + base_files = os.listdir(dir_name) + test("CHANGES.txt" in base_files, "Tarball contains CHANGES.txt file") + test("NOTICE" in base_files, "Tarball contains NOTICE file") + test("LICENSE" in base_files, "Tarball contains LICENSE file") + + os.chdir(WORK_DIR) + +# Report result +log_and_print("\n") +if len(failures) == 0: + log_and_print("*** ALL TESTS PASSED ***") +else: + log_and_print("XXXXX SOME TESTS DID NOT PASS XXXXX") + for f in failures: + log_and_print(" %s" % f) +os.chdir(original_dir) + +# Clean up +clean_work_files() + +log_and_print("|-------- Spark release audit complete --------|") diff --git a/dev/_site/audit-release/blank_maven_build/pom.xml b/dev/_site/audit-release/blank_maven_build/pom.xml new file mode 100644 index 0000000000000..02dd9046c9a49 --- /dev/null +++ b/dev/_site/audit-release/blank_maven_build/pom.xml @@ -0,0 +1,43 @@ + + + + + spark.audit + spark-audit + 4.0.0 + Spark Release Auditor + jar + 1.0 + + + Spray.cc repository + http://repo.spray.cc + + + Spark Staging Repo + ${spark.release.repository} + + + + + org.apache.spark + ${spark.module} + ${spark.version} + + + diff --git a/dev/_site/audit-release/blank_sbt_build/build.sbt b/dev/_site/audit-release/blank_sbt_build/build.sbt new file mode 100644 index 0000000000000..62815542e5bd9 --- /dev/null +++ b/dev/_site/audit-release/blank_sbt_build/build.sbt @@ -0,0 +1,30 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +name := "Spark Release Auditor" + +version := "1.0" + +scalaVersion := System.getenv.get("SCALA_VERSION") + +libraryDependencies += "org.apache.spark" % System.getenv.get("SPARK_MODULE") % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Eclipse Paho Repository" at "https://repo.eclipse.org/content/repositories/paho-releases/", + "Maven Repository" at "http://repo1.maven.org/maven2/", + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/maven_app_core/input.txt b/dev/_site/audit-release/maven_app_core/input.txt new file mode 100644 index 0000000000000..837b6f85ae97f --- /dev/null +++ b/dev/_site/audit-release/maven_app_core/input.txt @@ -0,0 +1,8 @@ +a +b +c +d +a +b +c +d diff --git a/dev/_site/audit-release/maven_app_core/pom.xml b/dev/_site/audit-release/maven_app_core/pom.xml new file mode 100644 index 0000000000000..b516396825573 --- /dev/null +++ b/dev/_site/audit-release/maven_app_core/pom.xml @@ -0,0 +1,52 @@ + + + + + spark.audit + spark-audit + 4.0.0 + Simple Project + jar + 1.0 + + + Spray.cc repository + http://repo.spray.cc + + + Spark Staging Repo + ${spark.release.repository} + + + + + org.apache.spark + spark-core_${scala.binary.version} + ${spark.version} + + + + + + + maven-compiler-plugin + 3.1 + + + + diff --git a/dev/_site/audit-release/maven_app_core/src/main/java/SimpleApp.java b/dev/_site/audit-release/maven_app_core/src/main/java/SimpleApp.java new file mode 100644 index 0000000000000..5217689e7c092 
--- /dev/null +++ b/dev/_site/audit-release/maven_app_core/src/main/java/SimpleApp.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.spark.api.java.*; +import org.apache.spark.api.java.function.Function; + +public class SimpleApp { + public static void main(String[] args) { + String logFile = "input.txt"; + JavaSparkContext sc = new JavaSparkContext("local", "Simple App"); + JavaRDD logData = sc.textFile(logFile).cache(); + + long numAs = logData.filter(new Function() { + public Boolean call(String s) { return s.contains("a"); } + }).count(); + + long numBs = logData.filter(new Function() { + public Boolean call(String s) { return s.contains("b"); } + }).count(); + + if (numAs != 2 || numBs != 2) { + System.out.println("Failed to parse log files with Spark"); + System.exit(-1); + } + System.out.println("Test succeeded"); + sc.stop(); + } +} diff --git a/dev/_site/audit-release/sbt_app_core/build.sbt b/dev/_site/audit-release/sbt_app_core/build.sbt new file mode 100644 index 0000000000000..291b1d6440bac --- /dev/null +++ b/dev/_site/audit-release/sbt_app_core/build.sbt @@ -0,0 +1,28 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor 
license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +name := "Simple Project" + +version := "1.0" + +scalaVersion := System.getenv.get("SCALA_VERSION") + +libraryDependencies += "org.apache.spark" %% "spark-core" % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_core/input.txt b/dev/_site/audit-release/sbt_app_core/input.txt new file mode 100644 index 0000000000000..837b6f85ae97f --- /dev/null +++ b/dev/_site/audit-release/sbt_app_core/input.txt @@ -0,0 +1,8 @@ +a +b +c +d +a +b +c +d diff --git a/dev/_site/audit-release/sbt_app_core/src/main/scala/SparkApp.scala b/dev/_site/audit-release/sbt_app_core/src/main/scala/SparkApp.scala new file mode 100644 index 0000000000000..61d91c70e9709 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_core/src/main/scala/SparkApp.scala @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package main.scala + +import scala.util.Try + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ + +object SimpleApp { + def main(args: Array[String]) { + val conf = sys.env.get("SPARK_AUDIT_MASTER") match { + case Some(master) => new SparkConf().setAppName("Simple Spark App").setMaster(master) + case None => new SparkConf().setAppName("Simple Spark App") + } + val logFile = "input.txt" + val sc = new SparkContext(conf) + val logData = sc.textFile(logFile, 2).cache() + val numAs = logData.filter(line => line.contains("a")).count() + val numBs = logData.filter(line => line.contains("b")).count() + if (numAs != 2 || numBs != 2) { + println("Failed to parse log files with Spark") + System.exit(-1) + } + + // Regression test for SPARK-1167: Remove metrics-ganglia from default build due to LGPL issue + val foundConsole = Try(Class.forName("org.apache.spark.metrics.sink.ConsoleSink")).isSuccess + val foundGanglia = Try(Class.forName("org.apache.spark.metrics.sink.GangliaSink")).isSuccess + if (!foundConsole) { + println("Console sink not loaded via spark-core") + System.exit(-1) + } + if (foundGanglia) { + println("Ganglia sink was loaded via spark-core") + System.exit(-1) + } + + // Remove kinesis from default build due to ASL license issue + val foundKinesis = 
Try(Class.forName("org.apache.spark.streaming.kinesis.KinesisUtils")).isSuccess + if (foundKinesis) { + println("Kinesis was loaded via spark-core") + System.exit(-1) + } + } +} +// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_ganglia/build.sbt b/dev/_site/audit-release/sbt_app_ganglia/build.sbt new file mode 100644 index 0000000000000..6d9474acf5bbc --- /dev/null +++ b/dev/_site/audit-release/sbt_app_ganglia/build.sbt @@ -0,0 +1,30 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +name := "Ganglia Test" + +version := "1.0" + +scalaVersion := System.getenv.get("SCALA_VERSION") + +libraryDependencies += "org.apache.spark" %% "spark-core" % System.getenv.get("SPARK_VERSION") + +libraryDependencies += "org.apache.spark" %% "spark-ganglia-lgpl" % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala b/dev/_site/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala new file mode 100644 index 0000000000000..9f7ae75d0b477 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package main.scala + +import scala.util.Try + +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ + +object SimpleApp { + def main(args: Array[String]) { + // Regression test for SPARK-1167: Remove metrics-ganglia from default build due to LGPL issue + val foundConsole = Try(Class.forName("org.apache.spark.metrics.sink.ConsoleSink")).isSuccess + val foundGanglia = Try(Class.forName("org.apache.spark.metrics.sink.GangliaSink")).isSuccess + if (!foundConsole) { + println("Console sink not loaded via spark-core") + System.exit(-1) + } + if (!foundGanglia) { + println("Ganglia sink not loaded via spark-ganglia-lgpl") + System.exit(-1) + } + } +} +// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_graphx/build.sbt b/dev/_site/audit-release/sbt_app_graphx/build.sbt new file mode 100644 index 0000000000000..dd11245e67d44 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_graphx/build.sbt @@ -0,0 +1,28 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +name := "Simple Project" + +version := "1.0" + +scalaVersion := System.getenv.get("SCALA_VERSION") + +libraryDependencies += "org.apache.spark" %% "spark-graphx" % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala b/dev/_site/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala new file mode 100644 index 0000000000000..2f0b6ef9a5672 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package main.scala + +import org.apache.spark.{SparkContext, SparkConf} +import org.apache.spark.SparkContext._ +import org.apache.spark.graphx._ +import org.apache.spark.rdd.RDD + +object GraphXApp { + def main(args: Array[String]) { + val conf = sys.env.get("SPARK_AUDIT_MASTER") match { + case Some(master) => new SparkConf().setAppName("Simple GraphX App").setMaster(master) + case None => new SparkConf().setAppName("Simple Graphx App") + } + val sc = new SparkContext(conf) + SparkContext.jarOfClass(this.getClass).foreach(sc.addJar) + + val users: RDD[(VertexId, (String, String))] = + sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")), + (5L, ("franklin", "prof")), (2L, ("istoica", "prof")), + (4L, ("peter", "student")))) + val relationships: RDD[Edge[String]] = + sc.parallelize(Array(Edge(3L, 7L, "collab"), Edge(5L, 3L, "advisor"), + Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi"), + Edge(4L, 0L, "student"), Edge(5L, 0L, "colleague"))) + val defaultUser = ("John Doe", "Missing") + val graph = Graph(users, relationships, defaultUser) + // Notice that there is a user 0 (for which we have no information) connected to users + // 4 (peter) and 5 (franklin). + val triplets = graph.triplets.map(e => (e.srcAttr._1, e.dstAttr._1)).collect + if (!triplets.exists(_ == ("peter", "John Doe"))) { + println("Failed to run GraphX") + System.exit(-1) + } + println("Test succeeded") + } +} +// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_hive/build.sbt b/dev/_site/audit-release/sbt_app_hive/build.sbt new file mode 100644 index 0000000000000..c8824f2b15e55 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_hive/build.sbt @@ -0,0 +1,29 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. 
+// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +name := "Simple Project" + +version := "1.0" + +scalaVersion := System.getenv.get("SCALA_VERSION") + +libraryDependencies += "org.apache.spark" %% "spark-hive" % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Maven Repository" at "http://repo1.maven.org/maven2/", + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_hive/data.txt b/dev/_site/audit-release/sbt_app_hive/data.txt new file mode 100644 index 0000000000000..0229e67f51e01 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_hive/data.txt @@ -0,0 +1,9 @@ +0val_0 +1val_1 +2val_2 +3val_3 +4val_4 +5val_5 +6val_6 +7val_7 +9val_9 diff --git a/dev/_site/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala b/dev/_site/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala new file mode 100644 index 0000000000000..4a980ec071ae4 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package main.scala + +import scala.collection.mutable.{ListBuffer, Queue} + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.hive.HiveContext + +case class Person(name: String, age: Int) + +object SparkSqlExample { + + def main(args: Array[String]) { + val conf = sys.env.get("SPARK_AUDIT_MASTER") match { + case Some(master) => new SparkConf().setAppName("Simple Sql App").setMaster(master) + case None => new SparkConf().setAppName("Simple Sql App") + } + val sc = new SparkContext(conf) + val hiveContext = new HiveContext(sc) + + import hiveContext._ + sql("DROP TABLE IF EXISTS src") + sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") + sql("LOAD DATA LOCAL INPATH 'data.txt' INTO TABLE src") + val results = sql("FROM src SELECT key, value WHERE key >= 0 AND KEY < 5").collect() + results.foreach(println) + + def test(f: => Boolean, failureMsg: String) = { + if (!f) { + println(failureMsg) + System.exit(-1) + } + } + + test(results.size == 5, "Unexpected number of selected elements: " + results) + println("Test succeeded") + sc.stop() + } +} +// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_kinesis/build.sbt b/dev/_site/audit-release/sbt_app_kinesis/build.sbt new file mode 100644 index 0000000000000..981bc7957b5ed --- /dev/null +++ 
b/dev/_site/audit-release/sbt_app_kinesis/build.sbt @@ -0,0 +1,28 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +name := "Kinesis Test" + +version := "1.0" + +scalaVersion := System.getenv.get("SCALA_VERSION") + +libraryDependencies += "org.apache.spark" %% "spark-streaming-kinesis-asl" % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala b/dev/_site/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala new file mode 100644 index 0000000000000..adc25b57d6aa5 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package main.scala + +import scala.util.Try + +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ + +object SimpleApp { + def main(args: Array[String]) { + val foundKinesis = Try(Class.forName("org.apache.spark.streaming.kinesis.KinesisUtils")).isSuccess + if (!foundKinesis) { + println("Kinesis not loaded via kinesis-asl") + System.exit(-1) + } + } +} +// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_sql/build.sbt b/dev/_site/audit-release/sbt_app_sql/build.sbt new file mode 100644 index 0000000000000..9116180f71a44 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_sql/build.sbt @@ -0,0 +1,28 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +name := "Simple Project" + +version := "1.0" + +scalaVersion := System.getenv.get("SCALA_VERSION") + +libraryDependencies += "org.apache.spark" %% "spark-sql" % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala b/dev/_site/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala new file mode 100644 index 0000000000000..69c1154dc0955 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package main.scala + +import scala.collection.mutable.{ListBuffer, Queue} + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SQLContext + +case class Person(name: String, age: Int) + +object SparkSqlExample { + + def main(args: Array[String]) { + val conf = sys.env.get("SPARK_AUDIT_MASTER") match { + case Some(master) => new SparkConf().setAppName("Simple Sql App").setMaster(master) + case None => new SparkConf().setAppName("Simple Sql App") + } + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + + import sqlContext.implicits._ + import sqlContext._ + + val people = sc.makeRDD(1 to 100, 10).map(x => Person(s"Name$x", x)).toDF() + people.registerTempTable("people") + val teenagers = sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") + val teenagerNames = teenagers.map(t => "Name: " + t(0)).collect() + teenagerNames.foreach(println) + + def test(f: => Boolean, failureMsg: String) = { + if (!f) { + println(failureMsg) + System.exit(-1) + } + } + + test(teenagerNames.size == 7, "Unexpected number of selected elements: " + teenagerNames) + println("Test succeeded") + sc.stop() + } +} +// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_streaming/build.sbt b/dev/_site/audit-release/sbt_app_streaming/build.sbt new file mode 100644 index 0000000000000..cb369d516dd16 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_streaming/build.sbt @@ -0,0 +1,28 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +name := "Simple Project" + +version := "1.0" + +scalaVersion := System.getenv.get("SCALA_VERSION") + +libraryDependencies += "org.apache.spark" %% "spark-streaming" % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala b/dev/_site/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala new file mode 100644 index 0000000000000..d6a074687f4a1 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package main.scala + +import scala.collection.mutable.{ListBuffer, Queue} + +import org.apache.spark.SparkConf +import org.apache.spark.rdd.RDD +import org.apache.spark.streaming.StreamingContext +import org.apache.spark.streaming._ + +object SparkStreamingExample { + + def main(args: Array[String]) { + val conf = sys.env.get("SPARK_AUDIT_MASTER") match { + case Some(master) => new SparkConf().setAppName("Simple Streaming App").setMaster(master) + case None => new SparkConf().setAppName("Simple Streaming App") + } + val ssc = new StreamingContext(conf, Seconds(1)) + val seen = ListBuffer[RDD[Int]]() + + val rdd1 = ssc.sparkContext.makeRDD(1 to 100, 10) + val rdd2 = ssc.sparkContext.makeRDD(1 to 1000, 10) + val rdd3 = ssc.sparkContext.makeRDD(1 to 10000, 10) + + val queue = Queue(rdd1, rdd2, rdd3) + val stream = ssc.queueStream(queue) + + stream.foreachRDD(rdd => seen += rdd) + ssc.start() + Thread.sleep(5000) + + def test(f: => Boolean, failureMsg: String) = { + if (!f) { + println(failureMsg) + System.exit(-1) + } + } + + val rddCounts = seen.map(rdd => rdd.count()).filter(_ > 0) + test(rddCounts.length == 3, "Did not collect three RDD's from stream") + test(rddCounts.toSet == Set(100, 1000, 10000), "Did not find expected streams") + + println("Test succeeded") + + ssc.stop() + } +} +// scalastyle:on println diff --git a/dev/_site/change-scala-version.sh b/dev/_site/change-scala-version.sh new file mode 100755 index 0000000000000..d7975dfb6475c --- /dev/null +++ b/dev/_site/change-scala-version.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -e + +VALID_VERSIONS=( 2.10 2.11 ) + +usage() { + echo "Usage: $(basename $0) [-h|--help] +where : + -h| --help Display this help text + valid version values : ${VALID_VERSIONS[*]} +" 1>&2 + exit 1 +} + +if [[ ($# -ne 1) || ( $1 == "--help") || $1 == "-h" ]]; then + usage +fi + +TO_VERSION=$1 + +check_scala_version() { + for i in ${VALID_VERSIONS[*]}; do [ $i = "$1" ] && return 0; done + echo "Invalid Scala version: $1. Valid versions: ${VALID_VERSIONS[*]}" 1>&2 + exit 1 +} + +check_scala_version "$TO_VERSION" + +if [ $TO_VERSION = "2.11" ]; then + FROM_VERSION="2.10" +else + FROM_VERSION="2.11" +fi + +sed_i() { + sed -e "$1" "$2" > "$2.tmp" && mv "$2.tmp" "$2" +} + +export -f sed_i + +BASEDIR=$(dirname $0)/.. +find "$BASEDIR" -name 'pom.xml' -not -path '*target*' -print \ + -exec bash -c "sed_i 's/\(artifactId.*\)_'$FROM_VERSION'/\1_'$TO_VERSION'/g' {}" \; + +# Also update in parent POM +# Match any scala binary version to ensure idempotency +sed_i '1,/[0-9]*\.[0-9]*[0-9]*\.[0-9]*'$TO_VERSION' "$JAR_DL" && mv "$JAR_DL" "$JAR" + elif [ $(command -v wget) ]; then + wget --quiet ${URL} -O "$JAR_DL" && mv "$JAR_DL" "$JAR" + else + printf "You do not have curl or wget installed, please install rat manually.\n" + exit -1 + fi + fi + + unzip -tq "$JAR" &> /dev/null + if [ $? -ne 0 ]; then + # We failed to download + rm "$JAR" + printf "Our attempt to download rat locally to ${JAR} failed. 
Please install rat manually.\n" + exit -1 + fi +} + +# Go to the Spark project root directory +FWDIR="$(cd "`dirname "$0"`"/..; pwd)" +cd "$FWDIR" + +if test -x "$JAVA_HOME/bin/java"; then + declare java_cmd="$JAVA_HOME/bin/java" +else + declare java_cmd=java +fi + +export RAT_VERSION=0.10 +export rat_jar="$FWDIR"/lib/apache-rat-${RAT_VERSION}.jar +mkdir -p "$FWDIR"/lib + +[[ -f "$rat_jar" ]] || acquire_rat_jar || { + echo "Download failed. Obtain the rat jar manually and place it at $rat_jar" + exit 1 +} + +$java_cmd -jar "$rat_jar" -E "$FWDIR"/.rat-excludes -d "$FWDIR" > rat-results.txt + +if [ $? -ne 0 ]; then + echo "RAT exited abnormally" + exit 1 +fi + +ERRORS="$(cat rat-results.txt | grep -e "??")" + +if test ! -z "$ERRORS"; then + echo "Could not find Apache license headers in the following files:" + echo "$ERRORS" + exit 1 +else + echo -e "RAT checks passed." +fi diff --git a/dev/_site/create-release/generate-changelist.py b/dev/_site/create-release/generate-changelist.py new file mode 100755 index 0000000000000..2e1a35a629342 --- /dev/null +++ b/dev/_site/create-release/generate-changelist.py @@ -0,0 +1,148 @@ +#!/usr/bin/python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Creates CHANGES.txt from git history. 
+# +# Usage: +# First set the new release version and old CHANGES.txt version in this file. +# Make sure you have SPARK_HOME set. +# $ python generate-changelist.py + + +import os +import sys +import subprocess +import time +import traceback + +SPARK_HOME = os.environ["SPARK_HOME"] +NEW_RELEASE_VERSION = "1.0.0" +PREV_RELEASE_GIT_TAG = "v0.9.1" + +CHANGELIST = "CHANGES.txt" +OLD_CHANGELIST = "%s.old" % (CHANGELIST) +NEW_CHANGELIST = "%s.new" % (CHANGELIST) +TMP_CHANGELIST = "%s.tmp" % (CHANGELIST) + +# date before first PR in TLP Spark repo +SPARK_REPO_CHANGE_DATE1 = time.strptime("2014-02-26", "%Y-%m-%d") +# date after last PR in incubator Spark repo +SPARK_REPO_CHANGE_DATE2 = time.strptime("2014-03-01", "%Y-%m-%d") +# Threshold PR number that differentiates PRs to TLP +# and incubator repos +SPARK_REPO_PR_NUM_THRESH = 200 + +LOG_FILE_NAME = "changes_%s" % time.strftime("%h_%m_%Y_%I_%M_%S") +LOG_FILE = open(LOG_FILE_NAME, 'w') + + +def run_cmd(cmd): + try: + print >> LOG_FILE, "Running command: %s" % cmd + output = subprocess.check_output(cmd, shell=True, stderr=LOG_FILE) + print >> LOG_FILE, "Output: %s" % output + return output + except: + traceback.print_exc() + cleanup() + sys.exit(1) + + +def append_to_changelist(string): + with open(TMP_CHANGELIST, "a") as f: + print >> f, string + + +def cleanup(ask=True): + if ask is True: + print "OK to delete temporary and log files? 
(y/N): " + response = raw_input() + if ask is False or (ask is True and response == "y"): + if os.path.isfile(TMP_CHANGELIST): + os.remove(TMP_CHANGELIST) + if os.path.isfile(OLD_CHANGELIST): + os.remove(OLD_CHANGELIST) + LOG_FILE.close() + os.remove(LOG_FILE_NAME) + + +print "Generating new %s for Spark release %s" % (CHANGELIST, NEW_RELEASE_VERSION) +os.chdir(SPARK_HOME) +if os.path.isfile(TMP_CHANGELIST): + os.remove(TMP_CHANGELIST) +if os.path.isfile(OLD_CHANGELIST): + os.remove(OLD_CHANGELIST) + +append_to_changelist("Spark Change Log") +append_to_changelist("----------------") +append_to_changelist("") +append_to_changelist("Release %s" % NEW_RELEASE_VERSION) +append_to_changelist("") + +print "Getting commits between tag %s and HEAD" % PREV_RELEASE_GIT_TAG +hashes = run_cmd("git log %s..HEAD --pretty='%%h'" % PREV_RELEASE_GIT_TAG).split() + +print "Getting details of %s commits" % len(hashes) +for h in hashes: + date = run_cmd("git log %s -1 --pretty='%%ad' --date=iso | head -1" % h).strip() + subject = run_cmd("git log %s -1 --pretty='%%s' | head -1" % h).strip() + body = run_cmd("git log %s -1 --pretty='%%b'" % h) + committer = run_cmd("git log %s -1 --pretty='%%cn <%%ce>' | head -1" % h).strip() + body_lines = body.split("\n") + + if "Merge pull" in subject: + # Parse old format commit message + append_to_changelist(" %s %s" % (h, date)) + append_to_changelist(" %s" % subject) + append_to_changelist(" [%s]" % body_lines[0]) + append_to_changelist("") + + elif "maven-release" not in subject: + # Parse new format commit message + # Get authors from commit message, committer otherwise + authors = [committer] + if "Author:" in body: + authors = [line.split(":")[1].strip() for line in body_lines if "Author:" in line] + + # Generate GitHub PR URL for easy access if possible + github_url = "" + if "Closes #" in body: + pr_num = [line.split()[1].lstrip("#") for line in body_lines if "Closes #" in line][0] + github_url = "github.com/apache/spark/pull/%s" % pr_num 
+ day = time.strptime(date.split()[0], "%Y-%m-%d") + if (day < SPARK_REPO_CHANGE_DATE1 or + (day < SPARK_REPO_CHANGE_DATE2 and pr_num < SPARK_REPO_PR_NUM_THRESH)): + github_url = "github.com/apache/incubator-spark/pull/%s" % pr_num + + append_to_changelist(" %s" % subject) + append_to_changelist(" %s" % ', '.join(authors)) + # for author in authors: + # append_to_changelist(" %s" % author) + append_to_changelist(" %s" % date) + if len(github_url) > 0: + append_to_changelist(" Commit: %s, %s" % (h, github_url)) + else: + append_to_changelist(" Commit: %s" % h) + append_to_changelist("") + +# Append old change list +print "Appending changelist from tag %s" % PREV_RELEASE_GIT_TAG +run_cmd("git show %s:%s | tail -n +3 >> %s" % (PREV_RELEASE_GIT_TAG, CHANGELIST, TMP_CHANGELIST)) +run_cmd("cp %s %s" % (TMP_CHANGELIST, NEW_CHANGELIST)) +print "New change list generated as %s" % NEW_CHANGELIST +cleanup(False) diff --git a/dev/_site/create-release/generate-contributors.py b/dev/_site/create-release/generate-contributors.py new file mode 100755 index 0000000000000..db9c680a4bad3 --- /dev/null +++ b/dev/_site/create-release/generate-contributors.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# This script automates the process of creating release notes. + +import os +import re +import sys + +from releaseutils import * + +# You must set the following before use! +JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") +RELEASE_TAG = os.environ.get("RELEASE_TAG", "v1.2.0-rc2") +PREVIOUS_RELEASE_TAG = os.environ.get("PREVIOUS_RELEASE_TAG", "v1.1.0") + +# If the release tags are not provided, prompt the user to provide them +while not tag_exists(RELEASE_TAG): + RELEASE_TAG = raw_input("Please provide a valid release tag: ") +while not tag_exists(PREVIOUS_RELEASE_TAG): + print "Please specify the previous release tag." + PREVIOUS_RELEASE_TAG = raw_input(\ + "For instance, if you are releasing v1.2.0, you should specify v1.1.0: ") + +# Gather commits found in the new tag but not in the old tag. +# This filters commits based on both the git hash and the PR number. +# If either is present in the old tag, then we ignore the commit. +print "Gathering new commits between tags %s and %s" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG) +release_commits = get_commits(RELEASE_TAG) +previous_release_commits = get_commits(PREVIOUS_RELEASE_TAG) +previous_release_hashes = set() +previous_release_prs = set() +for old_commit in previous_release_commits: + previous_release_hashes.add(old_commit.get_hash()) + if old_commit.get_pr_number(): + previous_release_prs.add(old_commit.get_pr_number()) +new_commits = [] +for this_commit in release_commits: + this_hash = this_commit.get_hash() + this_pr_number = this_commit.get_pr_number() + if this_hash in previous_release_hashes: + continue + if this_pr_number and this_pr_number in previous_release_prs: + continue + new_commits.append(this_commit) +if not new_commits: + sys.exit("There are no new commits between %s and %s!" 
% (PREVIOUS_RELEASE_TAG, RELEASE_TAG)) + +# Prompt the user for confirmation that the commit range is correct +print "\n==================================================================================" +print "JIRA server: %s" % JIRA_API_BASE +print "Release tag: %s" % RELEASE_TAG +print "Previous release tag: %s" % PREVIOUS_RELEASE_TAG +print "Number of commits in this range: %s" % len(new_commits) +print +def print_indented(_list): + for x in _list: print " %s" % x +if yesOrNoPrompt("Show all commits?"): + print_indented(new_commits) +print "==================================================================================\n" +if not yesOrNoPrompt("Does this look correct?"): + sys.exit("Ok, exiting") + +# Filter out special commits +releases = [] +maintenance = [] +reverts = [] +nojiras = [] +filtered_commits = [] +def is_release(commit_title): + return re.findall("\[release\]", commit_title.lower()) or\ + "preparing spark release" in commit_title.lower() or\ + "preparing development version" in commit_title.lower() or\ + "CHANGES.txt" in commit_title +def is_maintenance(commit_title): + return "maintenance" in commit_title.lower() or\ + "manually close" in commit_title.lower() +def has_no_jira(commit_title): + return not re.findall("SPARK-[0-9]+", commit_title.upper()) +def is_revert(commit_title): + return "revert" in commit_title.lower() +def is_docs(commit_title): + return re.findall("docs*", commit_title.lower()) or\ + "programming guide" in commit_title.lower() +for c in new_commits: + t = c.get_title() + if not t: continue + elif is_release(t): releases.append(c) + elif is_maintenance(t): maintenance.append(c) + elif is_revert(t): reverts.append(c) + elif is_docs(t): filtered_commits.append(c) # docs may not have JIRA numbers + elif has_no_jira(t): nojiras.append(c) + else: filtered_commits.append(c) + +# Warn against ignored commits +if releases or maintenance or reverts or nojiras: + print 
"\n==================================================================================" + if releases: print "Found %d release commits" % len(releases) + if maintenance: print "Found %d maintenance commits" % len(maintenance) + if reverts: print "Found %d revert commits" % len(reverts) + if nojiras: print "Found %d commits with no JIRA" % len(nojiras) + print "* Warning: these commits will be ignored.\n" + if yesOrNoPrompt("Show ignored commits?"): + if releases: print "Release (%d)" % len(releases); print_indented(releases) + if maintenance: print "Maintenance (%d)" % len(maintenance); print_indented(maintenance) + if reverts: print "Revert (%d)" % len(reverts); print_indented(reverts) + if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras) + print "==================== Warning: the above commits will be ignored ==================\n" +prompt_msg = "%d commits left to process after filtering. Ok to proceed?" % len(filtered_commits) +if not yesOrNoPrompt(prompt_msg): + sys.exit("Ok, exiting.") + +# Keep track of warnings to tell the user at the end +warnings = [] + +# Mapping from the invalid author name to its associated JIRA issues +# E.g. 
andrewor14 -> set("SPARK-2413", "SPARK-3551", "SPARK-3471") +invalid_authors = {} + +# Populate a map that groups issues and components by author +# It takes the form: Author name -> { Contribution type -> Spark components } +# For instance, +# { +# 'Andrew Or': { +# 'bug fixes': ['windows', 'core', 'web ui'], +# 'improvements': ['core'] +# }, +# 'Tathagata Das' : { +# 'bug fixes': ['streaming'] +# 'new feature': ['streaming'] +# } +# } +# +author_info = {} +jira_options = { "server": JIRA_API_BASE } +jira_client = JIRA(options = jira_options) +print "\n=========================== Compiling contributor list ===========================" +for commit in filtered_commits: + _hash = commit.get_hash() + title = commit.get_title() + issues = re.findall("SPARK-[0-9]+", title.upper()) + author = commit.get_author() + date = get_date(_hash) + # If the author name is invalid, keep track of it along + # with all associated issues so we can translate it later + if is_valid_author(author): + author = capitalize_author(author) + else: + if author not in invalid_authors: + invalid_authors[author] = set() + for issue in issues: + invalid_authors[author].add(issue) + # Parse components from the commit title, if any + commit_components = find_components(title, _hash) + # Populate or merge an issue into author_info[author] + def populate(issue_type, components): + components = components or [CORE_COMPONENT] # assume core if no components provided + if author not in author_info: + author_info[author] = {} + if issue_type not in author_info[author]: + author_info[author][issue_type] = set() + for component in components: + author_info[author][issue_type].add(component) + # Find issues and components associated with this commit + for issue in issues: + try: + jira_issue = jira_client.issue(issue) + jira_type = jira_issue.fields.issuetype.name + jira_type = translate_issue_type(jira_type, issue, warnings) + jira_components = [translate_component(c.name, _hash, warnings)\ + for c in 
jira_issue.fields.components] + all_components = set(jira_components + commit_components) + populate(jira_type, all_components) + except Exception as e: + print "Unexpected error:", e + # For docs without an associated JIRA, manually add it ourselves + if is_docs(title) and not issues: + populate("documentation", commit_components) + print " Processed commit %s authored by %s on %s" % (_hash, author, date) +print "==================================================================================\n" + +# Write to contributors file ordered by author names +# Each line takes the format " * Author name -- semi-colon delimited contributions" +# e.g. * Andrew Or -- Bug fixes in Windows, Core, and Web UI; improvements in Core +# e.g. * Tathagata Das -- Bug fixes and new features in Streaming +contributors_file = open(contributors_file_name, "w") +authors = author_info.keys() +authors.sort() +for author in authors: + contribution = "" + components = set() + issue_types = set() + for issue_type, comps in author_info[author].items(): + components.update(comps) + issue_types.add(issue_type) + # If there is only one component, mention it only once + # e.g. Bug fixes, improvements in MLlib + if len(components) == 1: + contribution = "%s in %s" % (nice_join(issue_types), next(iter(components))) + # Otherwise, group contributions by issue types instead of modules + # e.g. Bug fixes in MLlib, Core, and Streaming; documentation in YARN + else: + contributions = ["%s in %s" % (issue_type, nice_join(comps)) \ + for issue_type, comps in author_info[author].items()] + contribution = "; ".join(contributions) + # Do not use python's capitalize() on the whole string to preserve case + assert contribution + contribution = contribution[0].capitalize() + contribution[1:] + # If the author name is invalid, use an intermediate format that + # can be translated through translate-contributors.py later + # E.g. 
andrewor14/SPARK-3425/SPARK-1157/SPARK-6672 + if author in invalid_authors and invalid_authors[author]: + author = author + "/" + "/".join(invalid_authors[author]) + #line = " * %s -- %s" % (author, contribution) + line = author + contributors_file.write(line + "\n") +contributors_file.close() +print "Contributors list is successfully written to %s!" % contributors_file_name + +# Prompt the user to translate author names if necessary +if invalid_authors: + warnings.append("Found the following invalid authors:") + for a in invalid_authors: + warnings.append("\t%s" % a) + warnings.append("Please run './translate-contributors.py' to translate them.") + +# Log any warnings encountered in the process +if warnings: + print "\n============ Warnings encountered while creating the contributor list ============" + for w in warnings: print w + print "Please correct these in the final contributors list at %s." % contributors_file_name + print "==================================================================================\n" + diff --git a/dev/_site/create-release/known_translations b/dev/_site/create-release/known_translations new file mode 100644 index 0000000000000..3563fe3cc3c03 --- /dev/null +++ b/dev/_site/create-release/known_translations @@ -0,0 +1,167 @@ +# This is a mapping of names to be translated through translate-contributors.py +# The format expected on each line should be: - +CodingCat - Nan Zhu +CrazyJvm - Chao Chen +EugenCepoi - Eugen Cepoi +GraceH - Jie Huang +JerryLead - Lijie Xu +Leolh - Liu Hao +Lewuathe - Kai Sasaki +RongGu - Rong Gu +Shiti - Shiti Saxena +Victsm - Min Shen +WangTaoTheTonic - Wang Tao +XuTingjun - Tingjun Xu +YanTangZhai - Yantang Zhai +alexdebrie - Alex DeBrie +alokito - Alok Saldanha +anantasty - Anant Asthana +andrewor14 - Andrew Or +aniketbhatnagar - Aniket Bhatnagar +arahuja - Arun Ahuja +brkyvz - Burak Yavuz +chesterxgchen - Chester Chen +chiragaggarwal - Chirag Aggarwal +chouqin - Qiping Li +cocoatomo - Tomohiko K. 
+coderfi - Fairiz Azizi +coderxiang - Shuo Xiang +davies - Davies Liu +epahomov - Egor Pahomov +falaki - Hossein Falaki +freeman-lab - Jeremy Freeman +industrial-sloth - Jascha Swisher +jackylk - Jacky Li +jayunit100 - Jay Vyas +jerryshao - Saisai Shao +jkbradley - Joseph Bradley +lianhuiwang - Lianhui Wang +lirui-intel - Rui Li +luluorta - Lu Lu +luogankun - Gankun Luo +maji2014 - Derek Ma +mccheah - Matthew Cheah +mengxr - Xiangrui Meng +nartz - Nathan Artz +odedz - Oded Zimerman +ravipesala - Ravindra Pesala +roxchkplusony - Victor Tso +scwf - Wang Fei +shimingfei - Shiming Fei +surq - Surong Quan +suyanNone - Su Yan +tedyu - Ted Yu +tigerquoll - Dale Richardson +wangxiaojing - Xiaojing Wang +watermen - Yadong Qi +witgo - Guoqiang Li +xinyunh - Xinyun Huang +zsxwing - Shixiong Zhu +Bilna - Bilna P +DoingDone9 - Doing Done +Earne - Ernest +FlytxtRnD - Meethu Mathew +GenTang - Gen TANG +JoshRosen - Josh Rosen +MechCoder - Manoj Kumar +OopsOutOfMemory - Sheng Li +Peishen-Jia - Peishen Jia +SaintBacchus - Huang Zhaowei +azagrebin - Andrey Zagrebin +bzz - Alexander Bezzubov +fjiang6 - Fan Jiang +gasparms - Gaspar Munoz +guowei2 - Guo Wei +hhbyyh - Yuhao Yang +hseagle - Peng Xu +javadba - Stephen Boesch +jbencook - Ben Cook +kul - Kuldeep +ligangty - Gang Li +marsishandsome - Liangliang Gu +medale - Markus Dale +nemccarthy - Nathan McCarthy +nxwhite-str - Nate Crosswhite +seayi - Xiaohua Yi +tianyi - Yi Tian +uncleGen - Uncle Gen +viper-kun - Xu Kun +x1- - Yuri Saito +zapletal-martin - Martin Zapletal +zuxqoj - Shekhar Bansal +mingyukim - Mingyu Kim +sigmoidanalytics - Mayur Rustagi +AiHe - Ai He +BenFradet - Ben Fradet +FavioVazquez - Favio Vazquez +JaysonSunshine - Jayson Sunshine +Liuchang0812 - Liu Chang +Sephiroth-Lin - Sephiroth Lin +dobashim - Masaru Dobashi +ehnalis - Zoltan Zvara +emres - Emre Sevinc +gchen - Guancheng Chen +haiyangsea - Haiyang Sea +hlin09 - Hao Lin +hqzizania - Qian Huang +jeanlyn - Jean Lyn +jerluc - Jeremy A. 
Lucas +jrabary - Jaonary Rabarisoa +judynash - Judy Nash +kaka1992 - Chen Song +ksonj - Kalle Jepsen +kuromatsu-nobuyuki - Nobuyuki Kuromatsu +lazyman500 - Dong Xu +leahmcguire - Leah McGuire +mbittmann - Mark Bittmann +mbonaci - Marko Bonaci +meawoppl - Matthew Goodman +nyaapa - Arsenii Krasikov +phatak-dev - Madhukara Phatak +prabeesh - Prabeesh K +rakeshchalasani - Rakesh Chalasani +rekhajoshm - Rekha Joshi +sisihj - June He +szheng79 - Shuai Zheng +texasmichelle - Michelle Casbon +vinodkc - Vinod KC +yongtang - Yong Tang +ypcat - Pei-Lun Lee +zhichao-li - Zhichao Li +zzcclp - Zhichao Zhang +979969786 - Yuming Wang +Rosstin - Rosstin Murphy +ameyc - Amey Chaugule +animeshbaranawal - Animesh Baranawal +cafreeman - Chris Freeman +lee19 - Lee +lockwobr - Brian Lockwood +navis - Navis Ryu +pparkkin - Paavo Parkkinen +HyukjinKwon - Hyukjin Kwon +JDrit - Joseph Batchik +JuhongPark - Juhong Park +KaiXinXiaoLei - KaiXinXIaoLei +NamelessAnalyst - NamelessAnalyst +alyaxey - Alex Slusarenko +baishuo - Shuo Bai +fe2s - Oleksiy Dyagilev +felixcheung - Felix Cheung +feynmanliang - Feynman Liang +josepablocam - Jose Cambronero +kai-zeng - Kai Zeng +mosessky - mosessky +msannell - Michael Sannella +nishkamravi2 - Nishkam Ravi +noel-smith - Noel Smith +petz2000 - Patrick Baier +qiansl127 - Shilei Qian +rahulpalamuttam - Rahul Palamuttam +rowan000 - Rowan Chattaway +sarutak - Kousuke Saruta +sethah - Seth Hendrickson +small-wang - Wang Wei +stanzhai - Stan Zhai +tien-dungle - Tien-Dung Le +xuchenCN - Xu Chen +zhangjiajin - Zhang JiaJin diff --git a/dev/_site/create-release/release-build.sh b/dev/_site/create-release/release-build.sh new file mode 100755 index 0000000000000..cb79e9eba06e2 --- /dev/null +++ b/dev/_site/create-release/release-build.sh @@ -0,0 +1,326 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +function exit_with_usage { + cat << EOF +usage: release-build.sh +Creates build deliverables from a Spark commit. + +Top level targets are + package: Create binary packages and copy them to people.apache + docs: Build docs and copy them to people.apache + publish-snapshot: Publish snapshot release to Apache snapshots + publish-release: Publish a release to Apache release repo + +All other inputs are environment variables + +GIT_REF - Release tag or commit to build from +SPARK_VERSION - Release identifier used when publishing +SPARK_PACKAGE_VERSION - Release identifier in top level package directory +REMOTE_PARENT_DIR - Parent in which to create doc or release builds. +REMOTE_PARENT_MAX_LENGTH - If set, parent directory will be cleaned to only + have this number of subdirectories (by deleting old ones). WARNING: This deletes data. 
+ +ASF_USERNAME - Username of ASF committer account +ASF_PASSWORD - Password of ASF committer account +ASF_RSA_KEY - RSA private key file for ASF committer account + +GPG_KEY - GPG key used to sign release artifacts +GPG_PASSPHRASE - Passphrase for GPG key +EOF + exit 1 +} + +set -e + +if [ $# -eq 0 ]; then + exit_with_usage +fi + +if [[ $@ == *"help"* ]]; then + exit_with_usage +fi + +for env in ASF_USERNAME ASF_RSA_KEY GPG_PASSPHRASE GPG_KEY; do + if [ -z "${!env}" ]; then + echo "ERROR: $env must be set to run this script" + exit_with_usage + fi +done + +# Commit ref to checkout when building +GIT_REF=${GIT_REF:-master} + +# Destination directory parent on remote server +REMOTE_PARENT_DIR=${REMOTE_PARENT_DIR:-/home/$ASF_USERNAME/public_html} + +SSH="ssh -o ConnectTimeout=300 -o StrictHostKeyChecking=no -i $ASF_RSA_KEY" +GPG="gpg --no-tty --batch" +NEXUS_ROOT=https://repository.apache.org/service/local/staging +NEXUS_PROFILE=d63f592e7eac0 # Profile for Spark staging uploads +BASE_DIR=$(pwd) + +MVN="build/mvn --force" +PUBLISH_PROFILES="-Pyarn -Phive -Phadoop-2.2" +PUBLISH_PROFILES="$PUBLISH_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl" + +rm -rf spark +git clone https://git-wip-us.apache.org/repos/asf/spark.git +cd spark +git checkout $GIT_REF +git_hash=`git rev-parse --short HEAD` +echo "Checked out Spark git hash $git_hash" + +if [ -z "$SPARK_VERSION" ]; then + SPARK_VERSION=$($MVN help:evaluate -Dexpression=project.version \ + | grep -v INFO | grep -v WARNING | grep -v Download) +fi + +if [ -z "$SPARK_PACKAGE_VERSION" ]; then + SPARK_PACKAGE_VERSION="${SPARK_VERSION}-$(date +%Y_%m_%d_%H_%M)-${git_hash}" +fi + +DEST_DIR_NAME="spark-$SPARK_PACKAGE_VERSION" +USER_HOST="$ASF_USERNAME@people.apache.org" + +git clean -d -f -x +rm .gitignore +rm -rf .git +cd .. 
+ +if [ -n "$REMOTE_PARENT_MAX_LENGTH" ]; then + old_dirs=$($SSH $USER_HOST ls -t $REMOTE_PARENT_DIR | tail -n +$REMOTE_PARENT_MAX_LENGTH) + for old_dir in $old_dirs; do + echo "Removing directory: $old_dir" + $SSH $USER_HOST rm -r $REMOTE_PARENT_DIR/$old_dir + done +fi + +if [[ "$1" == "package" ]]; then + # Source and binary tarballs + echo "Packaging release tarballs" + cp -r spark spark-$SPARK_VERSION + tar cvzf spark-$SPARK_VERSION.tgz spark-$SPARK_VERSION + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour --output spark-$SPARK_VERSION.tgz.asc \ + --detach-sig spark-$SPARK_VERSION.tgz + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md MD5 spark-$SPARK_VERSION.tgz > \ + spark-$SPARK_VERSION.tgz.md5 + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ + SHA512 spark-$SPARK_VERSION.tgz > spark-$SPARK_VERSION.tgz.sha + rm -rf spark-$SPARK_VERSION + + # Updated for each binary build + make_binary_release() { + NAME=$1 + FLAGS=$2 + ZINC_PORT=$3 + cp -r spark spark-$SPARK_VERSION-bin-$NAME + + cd spark-$SPARK_VERSION-bin-$NAME + + # TODO There should probably be a flag to make-distribution to allow 2.11 support + if [[ $FLAGS == *scala-2.11* ]]; then + ./dev/change-scala-version.sh 2.11 + fi + + export ZINC_PORT=$ZINC_PORT + echo "Creating distribution: $NAME ($FLAGS)" + + # Get maven home set by MVN + MVN_HOME=`$MVN -version 2>&1 | grep 'Maven home' | awk '{print $NF}'` + + ./make-distribution.sh --name $NAME --mvn $MVN_HOME/bin/mvn --tgz $FLAGS \ + -DzincPort=$ZINC_PORT 2>&1 > ../binary-release-$NAME.log + cd .. + cp spark-$SPARK_VERSION-bin-$NAME/spark-$SPARK_VERSION-bin-$NAME.tgz . 
+ + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour \ + --output spark-$SPARK_VERSION-bin-$NAME.tgz.asc \ + --detach-sig spark-$SPARK_VERSION-bin-$NAME.tgz + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ + MD5 spark-$SPARK_VERSION-bin-$NAME.tgz > \ + spark-$SPARK_VERSION-bin-$NAME.tgz.md5 + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ + SHA512 spark-$SPARK_VERSION-bin-$NAME.tgz > \ + spark-$SPARK_VERSION-bin-$NAME.tgz.sha + } + + # TODO: Check exit codes of children here: + # http://stackoverflow.com/questions/1570262/shell-get-exit-code-of-background-process + + # We increment the Zinc port each time to avoid OOM's and other craziness if multiple builds + # share the same Zinc server. + make_binary_release "hadoop1" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver" "3030" & + make_binary_release "hadoop1-scala2.11" "-Psparkr -Phadoop-1 -Phive -Dscala-2.11" "3031" & + make_binary_release "cdh4" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver -Dhadoop.version=2.0.0-mr1-cdh4.2.0" "3032" & + make_binary_release "hadoop2.3" "-Psparkr -Phadoop-2.3 -Phive -Phive-thriftserver -Pyarn" "3033" & + make_binary_release "hadoop2.4" "-Psparkr -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn" "3034" & + make_binary_release "hadoop2.6" "-Psparkr -Phadoop-2.6 -Phive -Phive-thriftserver -Pyarn" "3034" & + make_binary_release "hadoop2.4-without-hive" "-Psparkr -Phadoop-2.4 -Pyarn" "3037" & + make_binary_release "without-hadoop" "-Psparkr -Phadoop-provided -Pyarn" "3038" & + wait + rm -rf spark-$SPARK_VERSION-bin-*/ + + # Copy data + dest_dir="$REMOTE_PARENT_DIR/${DEST_DIR_NAME}-bin" + echo "Copying release tarballs to $dest_dir" + $SSH $USER_HOST mkdir $dest_dir + rsync -e "$SSH" spark-* $USER_HOST:$dest_dir + echo "Linking /latest to $dest_dir" + $SSH $USER_HOST rm -f "$REMOTE_PARENT_DIR/latest" + $SSH $USER_HOST ln -s $dest_dir "$REMOTE_PARENT_DIR/latest" + exit 0 +fi + +if [[ "$1" == "docs" ]]; then + # Documentation + cd spark + echo "Building Spark 
docs" + dest_dir="$REMOTE_PARENT_DIR/${DEST_DIR_NAME}-docs" + cd docs + # Compile docs with Java 7 to use nicer format + # TODO: Make configurable to add this: PRODUCTION=1 + PRODUCTION=1 RELEASE_VERSION="$SPARK_VERSION" jekyll build + echo "Copying release documentation to $dest_dir" + $SSH $USER_HOST mkdir $dest_dir + echo "Linking /latest to $dest_dir" + $SSH $USER_HOST rm -f "$REMOTE_PARENT_DIR/latest" + $SSH $USER_HOST ln -s $dest_dir "$REMOTE_PARENT_DIR/latest" + rsync -e "$SSH" -r _site/* $USER_HOST:$dest_dir + cd .. + exit 0 +fi + +if [[ "$1" == "publish-snapshot" ]]; then + cd spark + # Publish Spark to Maven release repo + echo "Deploying Spark SNAPSHOT at '$GIT_REF' ($git_hash)" + echo "Publish version is $SPARK_VERSION" + if [[ ! $SPARK_VERSION == *"SNAPSHOT"* ]]; then + echo "ERROR: Snapshots must have a version containing SNAPSHOT" + echo "ERROR: You gave version '$SPARK_VERSION'" + exit 1 + fi + # Coerce the requested version + $MVN versions:set -DnewVersion=$SPARK_VERSION + tmp_settings="tmp-settings.xml" + echo "" > $tmp_settings + echo "apache.snapshots.https$ASF_USERNAME" >> $tmp_settings + echo "$ASF_PASSWORD" >> $tmp_settings + echo "" >> $tmp_settings + + # Generate random point for Zinc + export ZINC_PORT=$(python -S -c "import random; print random.randrange(3030,4030)") + + $MVN -DzincPort=$ZINC_PORT --settings $tmp_settings -DskipTests $PUBLISH_PROFILES \ + -Phive-thriftserver deploy + ./dev/change-scala-version.sh 2.11 + $MVN -DzincPort=$ZINC_PORT -Dscala-2.11 --settings $tmp_settings \ + -DskipTests $PUBLISH_PROFILES clean deploy + + # Clean-up Zinc nailgun process + /usr/sbin/lsof -P |grep $ZINC_PORT | grep LISTEN | awk '{ print $2; }' | xargs kill + + rm $tmp_settings + cd .. 
+ exit 0 +fi + +if [[ "$1" == "publish-release" ]]; then + cd spark + # Publish Spark to Maven release repo + echo "Publishing Spark checkout at '$GIT_REF' ($git_hash)" + echo "Publish version is $SPARK_VERSION" + # Coerce the requested version + $MVN versions:set -DnewVersion=$SPARK_VERSION + + # Using Nexus API documented here: + # https://support.sonatype.com/entries/39720203-Uploading-to-a-Staging-Repository-via-REST-API + echo "Creating Nexus staging repository" + repo_request="Apache Spark $SPARK_VERSION (commit $git_hash)" + out=$(curl -X POST -d "$repo_request" -u $ASF_USERNAME:$ASF_PASSWORD \ + -H "Content-Type:application/xml" -v \ + $NEXUS_ROOT/profiles/$NEXUS_PROFILE/start) + staged_repo_id=$(echo $out | sed -e "s/.*\(orgapachespark-[0-9]\{4\}\).*/\1/") + echo "Created Nexus staging repository: $staged_repo_id" + + tmp_repo=$(mktemp -d spark-repo-XXXXX) + + # Generate random point for Zinc + export ZINC_PORT=$(python -S -c "import random; print random.randrange(3030,4030)") + + $MVN -DzincPort=$ZINC_PORT -Dmaven.repo.local=$tmp_repo -DskipTests $PUBLISH_PROFILES \ + -Phive-thriftserver clean install + + ./dev/change-scala-version.sh 2.11 + + $MVN -DzincPort=$ZINC_PORT -Dmaven.repo.local=$tmp_repo -Dscala-2.11 \ + -DskipTests $PUBLISH_PROFILES clean install + + # Clean-up Zinc nailgun process + /usr/sbin/lsof -P |grep $ZINC_PORT | grep LISTEN | awk '{ print $2; }' | xargs kill + + ./dev/change-version-to-2.10.sh + + pushd $tmp_repo/org/apache/spark + + # Remove any extra files generated during install + find . -type f |grep -v \.jar |grep -v \.pom | xargs rm + + echo "Creating hash and signature files" + for file in $(find . 
-type f) + do + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --output $file.asc \ + --detach-sig --armour $file; + if [ $(command -v md5) ]; then + # Available on OS X; -q to keep only hash + md5 -q $file > $file.md5 + else + # Available on Linux; cut to keep only hash + md5sum $file | cut -f1 -d' ' > $file.md5 + fi + sha1sum $file | cut -f1 -d' ' > $file.sha1 + done + + nexus_upload=$NEXUS_ROOT/deployByRepositoryId/$staged_repo_id + echo "Uploading files to $nexus_upload" + for file in $(find . -type f) + do + # strip leading ./ + file_short=$(echo $file | sed -e "s/\.\///") + dest_url="$nexus_upload/org/apache/spark/$file_short" + echo " Uploading $file_short" + curl -u $ASF_USERNAME:$ASF_PASSWORD --upload-file $file_short $dest_url + done + + echo "Closing nexus staging repository" + repo_request="$staged_repo_idApache Spark $SPARK_VERSION (commit $git_hash)" + out=$(curl -X POST -d "$repo_request" -u $ASF_USERNAME:$ASF_PASSWORD \ + -H "Content-Type:application/xml" -v \ + $NEXUS_ROOT/profiles/$NEXUS_PROFILE/finish) + echo "Closed Nexus staging repository: $staged_repo_id" + popd + rm -rf $tmp_repo + cd .. + exit 0 +fi + +cd .. +rm -rf spark +echo "ERROR: expects to be called with 'package', 'docs', 'publish-release' or 'publish-snapshot'" diff --git a/dev/_site/create-release/release-tag.sh b/dev/_site/create-release/release-tag.sh new file mode 100755 index 0000000000000..b0a3374becc6a --- /dev/null +++ b/dev/_site/create-release/release-tag.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +function exit_with_usage { + cat << EOF +usage: tag-release.sh +Tags a Spark release on a particular branch. + +Inputs are specified with the following environment variables: +ASF_USERNAME - Apache Username +ASF_PASSWORD - Apache Password +GIT_NAME - Name to use with git +GIT_EMAIL - E-mail address to use with git +GIT_BRANCH - Git branch on which to make release +RELEASE_VERSION - Version used in pom files for release +RELEASE_TAG - Name of release tag +NEXT_VERSION - Development version after release +EOF + exit 1 +} + +set -e + +if [[ $@ == *"help"* ]]; then + exit_with_usage +fi + +for env in ASF_USERNAME ASF_PASSWORD RELEASE_VERSION RELEASE_TAG NEXT_VERSION GIT_EMAIL GIT_NAME GIT_BRANCH; do + if [ -z "${!env}" ]; then + echo "$env must be set to run this script" + exit 1 + fi +done + +ASF_SPARK_REPO="git-wip-us.apache.org/repos/asf/spark.git" +MVN="build/mvn --force" + +rm -rf spark +git clone https://$ASF_USERNAME:$ASF_PASSWORD@$ASF_SPARK_REPO -b $GIT_BRANCH +cd spark + +git config user.name "$GIT_NAME" +git config user.email $GIT_EMAIL + +# Create release version +$MVN versions:set -DnewVersion=$RELEASE_VERSION | grep -v "no value" # silence logs +git commit -a -m "Preparing Spark release $RELEASE_TAG" +echo "Creating tag $RELEASE_TAG at the head of $GIT_BRANCH" +git tag $RELEASE_TAG + +# TODO: It would be nice to do some verifications here +# i.e. 
check whether ec2 scripts have the new version + +# Create next version +$MVN versions:set -DnewVersion=$NEXT_VERSION | grep -v "no value" # silence logs +git commit -a -m "Preparing development version $NEXT_VERSION" + +# Push changes +git push origin $RELEASE_TAG +git push origin HEAD:$GIT_BRANCH + +cd .. +rm -rf spark diff --git a/dev/_site/create-release/releaseutils.py b/dev/_site/create-release/releaseutils.py new file mode 100755 index 0000000000000..7f152b7f53559 --- /dev/null +++ b/dev/_site/create-release/releaseutils.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This file contains helper methods used in creating a release. + +import re +import sys +from subprocess import Popen, PIPE + +try: + from jira.client import JIRA + # Old versions have JIRAError in exceptions package, new (0.5+) in utils. 
+ try: + from jira.exceptions import JIRAError + except ImportError: + from jira.utils import JIRAError +except ImportError: + print "This tool requires the jira-python library" + print "Install using 'sudo pip install jira'" + sys.exit(-1) + +try: + from github import Github + from github import GithubException +except ImportError: + print "This tool requires the PyGithub library" + print "Install using 'sudo pip install PyGithub'" + sys.exit(-1) + +try: + import unidecode +except ImportError: + print "This tool requires the unidecode library to decode obscure github usernames" + print "Install using 'sudo pip install unidecode'" + sys.exit(-1) + +# Contributors list file name +contributors_file_name = "contributors.txt" + +# Prompt the user to answer yes or no until they do so +def yesOrNoPrompt(msg): + response = raw_input("%s [y/n]: " % msg) + while response != "y" and response != "n": + return yesOrNoPrompt(msg) + return response == "y" + +# Utility functions run git commands (written with Git 1.8.5) +def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0] +def run_cmd_error(cmd): return Popen(cmd, stdout=PIPE, stderr=PIPE).communicate()[1] +def get_date(commit_hash): + return run_cmd(["git", "show", "--quiet", "--pretty=format:%cd", commit_hash]) +def tag_exists(tag): + stderr = run_cmd_error(["git", "show", tag]) + return "error" not in stderr + +# A type-safe representation of a commit +class Commit: + def __init__(self, _hash, author, title, pr_number = None): + self._hash = _hash + self.author = author + self.title = title + self.pr_number = pr_number + def get_hash(self): return self._hash + def get_author(self): return self.author + def get_title(self): return self.title + def get_pr_number(self): return self.pr_number + def __str__(self): + closes_pr = "(Closes #%s)" % self.pr_number if self.pr_number else "" + return "%s %s %s %s" % (self._hash, self.author, self.title, closes_pr) + +# Return all commits that belong to the specified tag. 
+# +# Under the hood, this runs a `git log` on that tag and parses the fields +# from the command output to construct a list of Commit objects. Note that +# because certain fields reside in the commit description and cannot be parsed +# through the Github API itself, we need to do some intelligent regex parsing +# to extract those fields. +# +# This is written using Git 1.8.5. +def get_commits(tag): + commit_start_marker = "|=== COMMIT START MARKER ===|" + commit_end_marker = "|=== COMMIT END MARKER ===|" + field_end_marker = "|=== COMMIT FIELD END MARKER ===|" + log_format =\ + commit_start_marker + "%h" +\ + field_end_marker + "%an" +\ + field_end_marker + "%s" +\ + commit_end_marker + "%b" + output = run_cmd(["git", "log", "--quiet", "--pretty=format:" + log_format, tag]) + commits = [] + raw_commits = [c for c in output.split(commit_start_marker) if c] + for commit in raw_commits: + if commit.count(commit_end_marker) != 1: + print "Commit end marker not found in commit: " + for line in commit.split("\n"): print line + sys.exit(1) + # Separate commit digest from the body + # From the digest we extract the hash, author and the title + # From the body, we extract the PR number and the github username + [commit_digest, commit_body] = commit.split(commit_end_marker) + if commit_digest.count(field_end_marker) != 2: + sys.exit("Unexpected format in commit: %s" % commit_digest) + [_hash, author, title] = commit_digest.split(field_end_marker) + # The PR number and github username is in the commit message + # itself and cannot be accessed through any Github API + pr_number = None + match = re.search("Closes #([0-9]+) from ([^/\\s]+)/", commit_body) + if match: + [pr_number, github_username] = match.groups() + # If the author name is not valid, use the github + # username so we can translate it properly later + if not is_valid_author(author): + author = github_username + # Guard against special characters + author = unidecode.unidecode(unicode(author, "UTF-8")).strip() + 
commit = Commit(_hash, author, title, pr_number) + commits.append(commit) + return commits + +# Maintain a mapping for translating issue types to contributions in the release notes +# This serves an additional function of warning the user against unknown issue types +# Note: This list is partially derived from this link: +# https://issues.apache.org/jira/plugins/servlet/project-config/SPARK/issuetypes +# Keep these in lower case +known_issue_types = { + "bug": "bug fixes", + "build": "build fixes", + "dependency upgrade": "build fixes", + "improvement": "improvements", + "new feature": "new features", + "documentation": "documentation", + "test": "test", + "task": "improvement", + "sub-task": "improvement" +} + +# Maintain a mapping for translating component names when creating the release notes +# This serves an additional function of warning the user against unknown components +# Note: This list is largely derived from this link: +# https://issues.apache.org/jira/plugins/servlet/project-config/SPARK/components +CORE_COMPONENT = "Core" +known_components = { + "block manager": CORE_COMPONENT, + "build": CORE_COMPONENT, + "deploy": CORE_COMPONENT, + "documentation": CORE_COMPONENT, + "ec2": "EC2", + "examples": CORE_COMPONENT, + "graphx": "GraphX", + "input/output": CORE_COMPONENT, + "java api": "Java API", + "mesos": "Mesos", + "ml": "MLlib", + "mllib": "MLlib", + "project infra": "Project Infra", + "pyspark": "PySpark", + "shuffle": "Shuffle", + "spark core": CORE_COMPONENT, + "spark shell": CORE_COMPONENT, + "sql": "SQL", + "streaming": "Streaming", + "web ui": "Web UI", + "windows": "Windows", + "yarn": "YARN" +} + +# Translate issue types using a format appropriate for writing contributions +# If an unknown issue type is encountered, warn the user +def translate_issue_type(issue_type, issue_id, warnings): + issue_type = issue_type.lower() + if issue_type in known_issue_types: + return known_issue_types[issue_type] + else: + warnings.append("Unknown issue type 
\"%s\" (see %s)" % (issue_type, issue_id)) + return issue_type + +# Translate component names using a format appropriate for writing contributions +# If an unknown component is encountered, warn the user +def translate_component(component, commit_hash, warnings): + component = component.lower() + if component in known_components: + return known_components[component] + else: + warnings.append("Unknown component \"%s\" (see %s)" % (component, commit_hash)) + return component + +# Parse components in the commit message +# The returned components are already filtered and translated +def find_components(commit, commit_hash): + components = re.findall("\[\w*\]", commit.lower()) + components = [translate_component(c, commit_hash)\ + for c in components if c in known_components] + return components + +# Join a list of strings in a human-readable manner +# e.g. ["Juice"] -> "Juice" +# e.g. ["Juice", "baby"] -> "Juice and baby" +# e.g. ["Juice", "baby", "moon"] -> "Juice, baby, and moon" +def nice_join(str_list): + str_list = list(str_list) # sometimes it's a set + if not str_list: + return "" + elif len(str_list) == 1: + return next(iter(str_list)) + elif len(str_list) == 2: + return " and ".join(str_list) + else: + return ", ".join(str_list[:-1]) + ", and " + str_list[-1] + +# Return the full name of the specified user on Github +# If the user doesn't exist, return None +def get_github_name(author, github_client): + if github_client: + try: + return github_client.get_user(author).name + except GithubException as e: + # If this is not a "not found" exception + if e.status != 404: + raise e + return None + +# Return the full name of the specified user on JIRA +# If the user doesn't exist, return None +def get_jira_name(author, jira_client): + if jira_client: + try: + return jira_client.user(author).displayName + except JIRAError as e: + # If this is not a "not found" exception + if e.status_code != 404: + raise e + return None + +# Return whether the given name is in the 
form +def is_valid_author(author): + if not author: return False + return " " in author and not re.findall("[0-9]", author) + +# Capitalize the first letter of each word in the given author name +def capitalize_author(author): + if not author: return None + words = author.split(" ") + words = [w[0].capitalize() + w[1:] for w in words if w] + return " ".join(words) + diff --git a/dev/_site/create-release/translate-contributors.py b/dev/_site/create-release/translate-contributors.py new file mode 100755 index 0000000000000..86fa02d87b9a0 --- /dev/null +++ b/dev/_site/create-release/translate-contributors.py @@ -0,0 +1,253 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script translates invalid authors in the contributors list generated +# by generate-contributors.py. When the script encounters an author name that +# is considered invalid, it searches Github and JIRA in an attempt to search +# for replacements. This tool runs in two modes: +# +# (1) Interactive mode: For each invalid author name, this script presents +# all candidate replacements to the user and awaits user response. In this +# mode, the user may also input a custom name. This is the default. 
+# +# (2) Non-interactive mode: For each invalid author name, this script replaces +# the name with the first valid candidate it can find. If there is none, it +# uses the original name. This can be enabled through the --non-interactive flag. + +import os +import sys + +from releaseutils import * + +# You must set the following before use! +JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") +JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None) +JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None) +GITHUB_API_TOKEN = os.environ.get("GITHUB_API_TOKEN", None) +if not JIRA_USERNAME or not JIRA_PASSWORD: + sys.exit("Both JIRA_USERNAME and JIRA_PASSWORD must be set") +if not GITHUB_API_TOKEN: + sys.exit("GITHUB_API_TOKEN must be set") + +# Write new contributors list to .final +if not os.path.isfile(contributors_file_name): + print "Contributors file %s does not exist!" % contributors_file_name + print "Have you run ./generate-contributors.py yet?" + sys.exit(1) +contributors_file = open(contributors_file_name, "r") +warnings = [] + +# In non-interactive mode, this script will choose the first replacement that is valid +INTERACTIVE_MODE = True +if len(sys.argv) > 1: + options = set(sys.argv[1:]) + if "--non-interactive" in options: + INTERACTIVE_MODE = False +if INTERACTIVE_MODE: + print "Running in interactive mode. To disable this, provide the --non-interactive flag." 
+ +# Setup Github and JIRA clients +jira_options = { "server": JIRA_API_BASE } +jira_client = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD)) +github_client = Github(GITHUB_API_TOKEN) + +# Load known author translations that are cached locally +known_translations = {} +known_translations_file_name = "known_translations" +known_translations_file = open(known_translations_file_name, "r") +for line in known_translations_file: + if line.startswith("#"): continue + [old_name, new_name] = line.strip("\n").split(" - ") + known_translations[old_name] = new_name +known_translations_file.close() + +# Open again in case the user adds new mappings +known_translations_file = open(known_translations_file_name, "a") + +# Generate candidates for the given author. This should only be called if the given author +# name does not represent a full name as this operation is somewhat expensive. Under the +# hood, it makes several calls to the Github and JIRA API servers to find the candidates. +# +# This returns a list of (candidate name, source) 2-tuples. E.g. 
+# [ +# (NOT_FOUND, "No full name found for Github user andrewor14"), +# ("Andrew Or", "Full name of JIRA user andrewor14"), +# ("Andrew Orso", "Full name of SPARK-1444 assignee andrewor14"), +# ("Andrew Ordall", "Full name of SPARK-1663 assignee andrewor14"), +# (NOT_FOUND, "No assignee found for SPARK-1763") +# ] +NOT_FOUND = "Not found" +def generate_candidates(author, issues): + candidates = [] + # First check for full name of Github user + github_name = get_github_name(author, github_client) + if github_name: + candidates.append((github_name, "Full name of Github user %s" % author)) + else: + candidates.append((NOT_FOUND, "No full name found for Github user %s" % author)) + # Then do the same for JIRA user + jira_name = get_jira_name(author, jira_client) + if jira_name: + candidates.append((jira_name, "Full name of JIRA user %s" % author)) + else: + candidates.append((NOT_FOUND, "No full name found for JIRA user %s" % author)) + # Then do the same for the assignee of each of the associated JIRAs + # Note that a given issue may not have an assignee, or the assignee may not have a full name + for issue in issues: + try: + jira_issue = jira_client.issue(issue) + except JIRAError as e: + # Do not exit just because an issue is not found! + if e.status_code == 404: + warnings.append("Issue %s not found!" 
% issue) + continue + raise e + jira_assignee = jira_issue.fields.assignee + if jira_assignee: + user_name = jira_assignee.name + display_name = jira_assignee.displayName + if display_name: + candidates.append((display_name, "Full name of %s assignee %s" % (issue, user_name))) + else: + candidates.append((NOT_FOUND, "No full name found for %s assignee %s" % (issue, user_name))) + else: + candidates.append((NOT_FOUND, "No assignee found for %s" % issue)) + # Guard against special characters in candidate names + # Note that the candidate name may already be in unicode (JIRA returns this) + for i, (candidate, source) in enumerate(candidates): + try: + candidate = unicode(candidate, "UTF-8") + except TypeError: + # already in unicode + pass + candidate = unidecode.unidecode(candidate).strip() + candidates[i] = (candidate, source) + return candidates + +# Translate each invalid author by searching for possible candidates from Github and JIRA +# In interactive mode, this script presents the user with a list of choices and has the user +# select from this list. Additionally, the user may also choose to enter a custom name. +# In non-interactive mode, this script picks the first valid author name from the candidates +# If no such name exists, the original name is used (without the JIRA numbers).
+print "\n========================== Translating contributor list ==========================" +lines = contributors_file.readlines() +contributions = [] +for i, line in enumerate(lines): + temp_author = line.strip(" * ").split(" -- ")[0] + print "Processing author %s (%d/%d)" % (temp_author, i + 1, len(lines)) + if not temp_author: + error_msg = " ERROR: Expected the following format \" * -- \"\n" + error_msg += " ERROR: Actual = %s" % line + print error_msg + warnings.append(error_msg) + contributions.append(line) + continue + author = temp_author.split("/")[0] + # Use the local copy of known translations where possible + if author in known_translations: + line = line.replace(temp_author, known_translations[author]) + elif not is_valid_author(author): + new_author = author + issues = temp_author.split("/")[1:] + candidates = generate_candidates(author, issues) + # Print out potential replacement candidates along with the sources, e.g. + # [X] No full name found for Github user andrewor14 + # [X] No assignee found for SPARK-1763 + # [0] Andrew Or - Full name of JIRA user andrewor14 + # [1] Andrew Orso - Full name of SPARK-1444 assignee andrewor14 + # [2] Andrew Ordall - Full name of SPARK-1663 assignee andrewor14 + # [3] andrewor14 - Raw Github username + # [4] Custom + candidate_names = [] + bad_prompts = [] # Prompts that can't actually be selected; print these first. 
+ good_prompts = [] # Prompts that contain valid choices + for candidate, source in candidates: + if candidate == NOT_FOUND: + bad_prompts.append(" [X] %s" % source) + else: + index = len(candidate_names) + candidate_names.append(candidate) + good_prompts.append(" [%d] %s - %s" % (index, candidate, source)) + raw_index = len(candidate_names) + custom_index = len(candidate_names) + 1 + for p in bad_prompts: print p + if bad_prompts: print " ---" + for p in good_prompts: print p + # In interactive mode, additionally provide "custom" option and await user response + if INTERACTIVE_MODE: + print " [%d] %s - Raw Github username" % (raw_index, author) + print " [%d] Custom" % custom_index + response = raw_input(" Your choice: ") + last_index = custom_index + while not response.isdigit() or int(response) > last_index: + response = raw_input(" Please enter an integer between 0 and %d: " % last_index) + response = int(response) + if response == custom_index: + new_author = raw_input(" Please type a custom name for this author: ") + elif response != raw_index: + new_author = candidate_names[response] + # In non-interactive mode, just pick the first candidate + else: + valid_candidate_names = [name for name, _ in candidates\ + if is_valid_author(name) and name != NOT_FOUND] + if valid_candidate_names: + new_author = valid_candidate_names[0] + # Finally, capitalize the author and replace the original one with it + # If the final replacement is still invalid, log a warning + if is_valid_author(new_author): + new_author = capitalize_author(new_author) + else: + warnings.append("Unable to find a valid name %s for author %s" % (author, temp_author)) + print " * Replacing %s with %s" % (author, new_author) + # If we are in interactive mode, prompt the user whether we want to remember this new mapping + if INTERACTIVE_MODE and\ + author not in known_translations and\ + yesOrNoPrompt(" Add mapping %s -> %s to known translations file?" 
% (author, new_author)): + known_translations_file.write("%s - %s\n" % (author, new_author)) + known_translations_file.flush() + line = line.replace(temp_author, new_author) + contributions.append(line) +print "==================================================================================\n" +contributors_file.close() +known_translations_file.close() + +# Sort the contributions before writing them to the new file. +# Additionally, check if there are any duplicate author rows. +# This could happen if the same user has both a valid full +# name (e.g. Andrew Or) and an invalid one (andrewor14). +# If so, warn the user about this at the end. +contributions.sort() +all_authors = set() +new_contributors_file_name = contributors_file_name + ".final" +new_contributors_file = open(new_contributors_file_name, "w") +for line in contributions: + author = line.strip(" * ").split(" -- ")[0] + if author in all_authors: + warnings.append("Detected duplicate author name %s. Please merge these manually." % author) + all_authors.add(author) + new_contributors_file.write(line) +new_contributors_file.close() + +print "Translated contributors list successfully written to %s!" % new_contributors_file_name + +# Log any warnings encountered in the process +if warnings: + print "\n========== Warnings encountered while translating the contributor list ===========" + for w in warnings: print w + print "Please manually correct these in the final contributors list at %s." % new_contributors_file_name + print "==================================================================================\n" + diff --git a/dev/_site/github_jira_sync.py b/dev/_site/github_jira_sync.py new file mode 100755 index 0000000000000..287f0ca24a7df --- /dev/null +++ b/dev/_site/github_jira_sync.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements.
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Utility for updating JIRA's with information about Github pull requests + +import json +import os +import re +import sys +import urllib2 + +try: + import jira.client +except ImportError: + print "This tool requires the jira-python library" + print "Install using 'sudo pip install jira'" + sys.exit(-1) + +# User facing configs +GITHUB_API_BASE = os.environ.get("GITHUB_API_BASE", "https://api.github.com/repos/apache/spark") +JIRA_PROJECT_NAME = os.environ.get("JIRA_PROJECT_NAME", "SPARK") +JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") +JIRA_USERNAME = os.environ.get("JIRA_USERNAME", "apachespark") +JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", "XXX") +# Maximum number of updates to perform in one run +MAX_UPDATES = int(os.environ.get("MAX_UPDATES", "100000")) +# Cut-off for oldest PR on which to comment. Useful for avoiding +# "notification overload" when running for the first time. +MIN_COMMENT_PR = int(os.environ.get("MIN_COMMENT_PR", "1496")) + +# File used as an optimization to store maximum previously seen PR +# Used mostly because accessing ASF JIRA is slow, so we want to avoid checking +# the state of JIRA's that are tied to PR's we've already looked at.
+MAX_FILE = ".github-jira-max" + +def get_url(url): + try: + return urllib2.urlopen(url) + except urllib2.HTTPError as e: + print "Unable to fetch URL, exiting: %s" % url + sys.exit(-1) + +def get_json(urllib_response): + return json.load(urllib_response) + +# Return a list of (JIRA id, JSON dict) tuples: +# e.g. [('SPARK-1234', {.. json ..}), ('SPARK-5687', {.. json ..})] +def get_jira_prs(): + result = [] + has_next_page = True + page_num = 0 + while has_next_page: + page = get_url(GITHUB_API_BASE + "/pulls?page=%s&per_page=100" % page_num) + page_json = get_json(page) + + for pull in page_json: + jiras = re.findall(JIRA_PROJECT_NAME + "-[0-9]{4,5}", pull['title']) + for jira in jiras: + result = result + [(jira, pull)] + + # Check if there is another page + link_header = filter(lambda k: k.startswith("Link"), page.info().headers)[0] + if not "next" in link_header: + has_next_page = False + else: + page_num = page_num + 1 + return result + +def set_max_pr(max_val): + f = open(MAX_FILE, 'w') + f.write("%s" % max_val) + f.close() + print "Writing largest PR number seen: %s" % max_val + +def get_max_pr(): + if os.path.exists(MAX_FILE): + result = int(open(MAX_FILE, 'r').read()) + print "Read largest PR number previously seen: %s" % result + return result + else: + return 0 + +jira_client = jira.client.JIRA({'server': JIRA_API_BASE}, + basic_auth=(JIRA_USERNAME, JIRA_PASSWORD)) + +jira_prs = get_jira_prs() + +previous_max = get_max_pr() +print "Retrieved %s JIRA PR's from Github" % len(jira_prs) +jira_prs = [(k, v) for k, v in jira_prs if int(v['number']) > previous_max] +print "%s PR's remain after excluding visited ones" % len(jira_prs) + +num_updates = 0 +considered = [] +for issue, pr in sorted(jira_prs, key=lambda (k, v): int(v['number'])): + if num_updates >= MAX_UPDATES: + break + pr_num = int(pr['number']) + + print "Checking issue %s" % issue + considered = considered + [pr_num] + + url = pr['html_url'] + title = "[Github] Pull Request #%s (%s)" %
(pr['number'], pr['user']['login']) + try: + existing_links = map(lambda l: l.raw['object']['url'], jira_client.remote_links(issue)) + except: + print "Failure reading JIRA %s (does it exist?)" % issue + print sys.exc_info()[0] + continue + + if url in existing_links: + continue + + icon = {"title": "Pull request #%s" % pr['number'], + "url16x16": "https://assets-cdn.github.com/favicon.ico"} + destination = {"title": title, "url": url, "icon": icon} + # For all possible fields see: + # https://developer.atlassian.com/display/JIRADEV/Fields+in+Remote+Issue+Links + # application = {"name": "Github pull requests", "type": "org.apache.spark.jira.github"} + jira_client.add_remote_link(issue, destination) + + comment = "User '%s' has created a pull request for this issue:" % pr['user']['login'] + comment = comment + ("\n%s" % pr['html_url']) + if pr_num >= MIN_COMMENT_PR: + jira_client.add_comment(issue, comment) + + print "Added link %s <-> PR #%s" % (issue, pr['number']) + num_updates = num_updates + 1 + +if len(considered) > 0: + set_max_pr(max(considered)) diff --git a/dev/_site/lint-python b/dev/_site/lint-python new file mode 100755 index 0000000000000..0b97213ae3dff --- /dev/null +++ b/dev/_site/lint-python @@ -0,0 +1,114 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" +SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")" +PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/ ./dev/sparktestsupport" +PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/run-tests.py ./dev/run-tests-jenkins.py" +PEP8_REPORT_PATH="$SPARK_ROOT_DIR/dev/pep8-report.txt" +PYLINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/pylint-report.txt" +PYLINT_INSTALL_INFO="$SPARK_ROOT_DIR/dev/pylint-info.txt" + +cd "$SPARK_ROOT_DIR" + +# compileall: https://docs.python.org/2/library/compileall.html +python -B -m compileall -q -l $PATHS_TO_CHECK > "$PEP8_REPORT_PATH" +compile_status="${PIPESTATUS[0]}" + +# Get pep8 at runtime so that we don't rely on it being installed on the build server. +#+ See: https://github.com/apache/spark/pull/1744#issuecomment-50982162 +#+ TODOs: +#+ - Download pep8 from PyPI. It's more "official". +PEP8_VERSION="1.6.2" +PEP8_SCRIPT_PATH="$SPARK_ROOT_DIR/dev/pep8-$PEP8_VERSION.py" +PEP8_SCRIPT_REMOTE_PATH="https://raw.githubusercontent.com/jcrocholl/pep8/$PEP8_VERSION/pep8.py" + +if [ ! -e "$PEP8_SCRIPT_PATH" ]; then + curl --silent -o "$PEP8_SCRIPT_PATH" "$PEP8_SCRIPT_REMOTE_PATH" + curl_status="$?" + + if [ "$curl_status" -ne 0 ]; then + echo "Failed to download pep8.py from \"$PEP8_SCRIPT_REMOTE_PATH\"." + exit "$curl_status" + fi +fi + +# Easy install pylint in /dev/pylint. To easy_install into a directory, the PYTHONPATH should +# be set to the directory. +# dev/pylint should be appended to the PATH variable as well. +# Jenkins by default installs the pylint3 version, so for now this just checks the code quality +# of python3. +export "PYTHONPATH=$SPARK_ROOT_DIR/dev/pylint" +export "PYLINT_HOME=$PYTHONPATH" +export "PATH=$PYTHONPATH:$PATH" + +# if [ ! -d "$PYLINT_HOME" ]; then +# mkdir "$PYLINT_HOME" +# # Redirect the annoying pylint installation output. 
+# easy_install -d "$PYLINT_HOME" pylint==1.4.4 &>> "$PYLINT_INSTALL_INFO" +# easy_install_status="$?" +# +# if [ "$easy_install_status" -ne 0 ]; then +# echo "Unable to install pylint locally in \"$PYTHONPATH\"." +# cat "$PYLINT_INSTALL_INFO" +# exit "$easy_install_status" +# fi +# +# rm "$PYLINT_INSTALL_INFO" +# +# fi + +# There is no need to write this output to a file +#+ first, but we do so so that the check status can +#+ be output before the report, like with the +#+ scalastyle and RAT checks. +python "$PEP8_SCRIPT_PATH" --ignore=E402,E731,E241,W503,E226 $PATHS_TO_CHECK >> "$PEP8_REPORT_PATH" +pep8_status="${PIPESTATUS[0]}" + +if [ "$compile_status" -eq 0 -a "$pep8_status" -eq 0 ]; then + lint_status=0 +else + lint_status=1 +fi + +if [ "$lint_status" -ne 0 ]; then + echo "PEP8 checks failed." + cat "$PEP8_REPORT_PATH" +else + echo "PEP8 checks passed." +fi + +rm "$PEP8_REPORT_PATH" + +# for to_be_checked in "$PATHS_TO_CHECK" +# do +# pylint --rcfile="$SPARK_ROOT_DIR/pylintrc" $to_be_checked >> "$PYLINT_REPORT_PATH" +# done + +# if [ "${PIPESTATUS[0]}" -ne 0 ]; then +# lint_status=1 +# echo "Pylint checks failed." +# cat "$PYLINT_REPORT_PATH" +# else +# echo "Pylint checks passed." +# fi + +# rm "$PYLINT_REPORT_PATH" + +exit "$lint_status" diff --git a/dev/_site/lint-r b/dev/_site/lint-r new file mode 100755 index 0000000000000..bfda0bca15eb7 --- /dev/null +++ b/dev/_site/lint-r @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" +SPARK_ROOT_DIR="$(dirname $SCRIPT_DIR)" +LINT_R_REPORT_FILE_NAME="$SPARK_ROOT_DIR/dev/lint-r-report.log" + + +if ! type "Rscript" > /dev/null; then + echo "ERROR: You should install R" + exit +fi + +`which Rscript` --vanilla "$SPARK_ROOT_DIR/dev/lint-r.R" "$SPARK_ROOT_DIR" | tee "$LINT_R_REPORT_FILE_NAME" + +NUM_LINES=`wc -l < "$LINT_R_REPORT_FILE_NAME" | awk '{print $1}'` +if [ "$NUM_LINES" = "0" ] ; then + lint_status=0 + echo "lintr checks passed." +else + lint_status=1 + echo "lintr checks failed." +fi + +exit "$lint_status" diff --git a/dev/_site/lint-r.R b/dev/_site/lint-r.R new file mode 100644 index 0000000000000..999eef571b824 --- /dev/null +++ b/dev/_site/lint-r.R @@ -0,0 +1,37 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +argv <- commandArgs(TRUE) +SPARK_ROOT_DIR <- as.character(argv[1]) +LOCAL_LIB_LOC <- file.path(SPARK_ROOT_DIR, "R", "lib") + +# Checks if SparkR is installed in a local directory. +if (! library(SparkR, lib.loc = LOCAL_LIB_LOC, logical.return = TRUE)) { + stop("You should install SparkR in a local directory with `R/install-dev.sh`.") +} + +# Installs lintr from Github in a local directory. +# NOTE: The CRAN's version is too old to adapt to our rules. +if ("lintr" %in% row.names(installed.packages()) == FALSE) { + devtools::install_github("jimhester/lintr") +} + +library(lintr) +library(methods) +library(testthat) +path.to.package <- file.path(SPARK_ROOT_DIR, "R", "pkg") +lint_package(path.to.package, cache = FALSE) diff --git a/dev/_site/lint-scala b/dev/_site/lint-scala new file mode 100755 index 0000000000000..c676dfdf4f44e --- /dev/null +++ b/dev/_site/lint-scala @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" +SPARK_ROOT_DIR="$(dirname $SCRIPT_DIR)" + +"$SCRIPT_DIR/scalastyle" diff --git a/dev/_site/merge_spark_pr.py b/dev/_site/merge_spark_pr.py new file mode 100755 index 0000000000000..bf1a000f46791 --- /dev/null +++ b/dev/_site/merge_spark_pr.py @@ -0,0 +1,453 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Utility for creating well-formed pull request merges and pushing them to Apache. +# usage: ./apache-pr-merge.py (see config env vars below) +# +# This utility assumes you already have local a Spark git folder and that you +# have added remotes corresponding to both (i) the github apache Spark +# mirror and (ii) the apache git repo. 
+ +import json +import os +import re +import subprocess +import sys +import urllib2 + +try: + import jira.client + JIRA_IMPORTED = True +except ImportError: + JIRA_IMPORTED = False + +# Location of your Spark git development area +SPARK_HOME = os.environ.get("SPARK_HOME", os.getcwd()) +# Remote name which points to the Gihub site +PR_REMOTE_NAME = os.environ.get("PR_REMOTE_NAME", "apache-github") +# Remote name which points to Apache git +PUSH_REMOTE_NAME = os.environ.get("PUSH_REMOTE_NAME", "apache") +# ASF JIRA username +JIRA_USERNAME = os.environ.get("JIRA_USERNAME", "") +# ASF JIRA password +JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", "") +# OAuth key used for issuing requests against the GitHub API. If this is not defined, then requests +# will be unauthenticated. You should only need to configure this if you find yourself regularly +# exceeding your IP's unauthenticated request rate limit. You can create an OAuth key at +# https://github.com/settings/tokens. This script only requires the "public_repo" scope. +GITHUB_OAUTH_KEY = os.environ.get("GITHUB_OAUTH_KEY") + + +GITHUB_BASE = "https://github.com/apache/spark/pull" +GITHUB_API_BASE = "https://api.github.com/repos/apache/spark" +JIRA_BASE = "https://issues.apache.org/jira/browse" +JIRA_API_BASE = "https://issues.apache.org/jira" +# Prefix added to temporary branches +BRANCH_PREFIX = "PR_TOOL" + + +def get_json(url): + try: + request = urllib2.Request(url) + if GITHUB_OAUTH_KEY: + request.add_header('Authorization', 'token %s' % GITHUB_OAUTH_KEY) + return json.load(urllib2.urlopen(request)) + except urllib2.HTTPError as e: + if "X-RateLimit-Remaining" in e.headers and e.headers["X-RateLimit-Remaining"] == '0': + print "Exceeded the GitHub API rate limit; see the instructions in " + \ + "dev/merge_spark_pr.py to configure an OAuth token for making authenticated " + \ + "GitHub requests." 
+ else: + print "Unable to fetch URL, exiting: %s" % url + sys.exit(-1) + + +def fail(msg): + print msg + clean_up() + sys.exit(-1) + + +def run_cmd(cmd): + print cmd + if isinstance(cmd, list): + return subprocess.check_output(cmd) + else: + return subprocess.check_output(cmd.split(" ")) + + +def continue_maybe(prompt): + result = raw_input("\n%s (y/n): " % prompt) + if result.lower() != "y": + fail("Okay, exiting") + +def clean_up(): + print "Restoring head pointer to %s" % original_head + run_cmd("git checkout %s" % original_head) + + branches = run_cmd("git branch").replace(" ", "").split("\n") + + for branch in filter(lambda x: x.startswith(BRANCH_PREFIX), branches): + print "Deleting local branch %s" % branch + run_cmd("git branch -D %s" % branch) + + +# merge the requested PR and return the merge hash +def merge_pr(pr_num, target_ref, title, body, pr_repo_desc): + pr_branch_name = "%s_MERGE_PR_%s" % (BRANCH_PREFIX, pr_num) + target_branch_name = "%s_MERGE_PR_%s_%s" % (BRANCH_PREFIX, pr_num, target_ref.upper()) + run_cmd("git fetch %s pull/%s/head:%s" % (PR_REMOTE_NAME, pr_num, pr_branch_name)) + run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, target_ref, target_branch_name)) + run_cmd("git checkout %s" % target_branch_name) + + had_conflicts = False + try: + run_cmd(['git', 'merge', pr_branch_name, '--squash']) + except Exception as e: + msg = "Error merging: %s\nWould you like to manually fix-up this merge?" % e + continue_maybe(msg) + msg = "Okay, please fix any conflicts and 'git add' conflicting files... Finished?" 
+ continue_maybe(msg) + had_conflicts = True + + commit_authors = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, + '--pretty=format:%an <%ae>']).split("\n") + distinct_authors = sorted(set(commit_authors), + key=lambda x: commit_authors.count(x), reverse=True) + primary_author = raw_input( + "Enter primary author in the format of \"name \" [%s]: " % + distinct_authors[0]) + if primary_author == "": + primary_author = distinct_authors[0] + + commits = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, + '--pretty=format:%h [%an] %s']).split("\n\n") + + merge_message_flags = [] + + merge_message_flags += ["-m", title] + if body is not None: + # We remove @ symbols from the body to avoid triggering e-mails + # to people every time someone creates a public fork of Spark. + merge_message_flags += ["-m", body.replace("@", "")] + + authors = "\n".join(["Author: %s" % a for a in distinct_authors]) + + merge_message_flags += ["-m", authors] + + if had_conflicts: + committer_name = run_cmd("git config --get user.name").strip() + committer_email = run_cmd("git config --get user.email").strip() + message = "This patch had conflicts when merged, resolved by\nCommitter: %s <%s>" % ( + committer_name, committer_email) + merge_message_flags += ["-m", message] + + # The string "Closes #%s" string is required for GitHub to correctly close the PR + merge_message_flags += ["-m", "Closes #%s from %s." % (pr_num, pr_repo_desc)] + + run_cmd(['git', 'commit', '--author="%s"' % primary_author] + merge_message_flags) + + continue_maybe("Merge complete (local ref %s). Push to %s?" % ( + target_branch_name, PUSH_REMOTE_NAME)) + + try: + run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, target_branch_name, target_ref)) + except Exception as e: + clean_up() + fail("Exception while pushing: %s" % e) + + merge_hash = run_cmd("git rev-parse %s" % target_branch_name)[:8] + clean_up() + print("Pull request #%s merged!" 
% pr_num) + print("Merge hash: %s" % merge_hash) + return merge_hash + + +def cherry_pick(pr_num, merge_hash, default_branch): + pick_ref = raw_input("Enter a branch name [%s]: " % default_branch) + if pick_ref == "": + pick_ref = default_branch + + pick_branch_name = "%s_PICK_PR_%s_%s" % (BRANCH_PREFIX, pr_num, pick_ref.upper()) + + run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, pick_ref, pick_branch_name)) + run_cmd("git checkout %s" % pick_branch_name) + + try: + run_cmd("git cherry-pick -sx %s" % merge_hash) + except Exception as e: + msg = "Error cherry-picking: %s\nWould you like to manually fix-up this merge?" % e + continue_maybe(msg) + msg = "Okay, please fix any conflicts and finish the cherry-pick. Finished?" + continue_maybe(msg) + + continue_maybe("Pick complete (local ref %s). Push to %s?" % ( + pick_branch_name, PUSH_REMOTE_NAME)) + + try: + run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, pick_branch_name, pick_ref)) + except Exception as e: + clean_up() + fail("Exception while pushing: %s" % e) + + pick_hash = run_cmd("git rev-parse %s" % pick_branch_name)[:8] + clean_up() + + print("Pull request #%s picked into %s!" 
% (pr_num, pick_ref)) + print("Pick hash: %s" % pick_hash) + return pick_ref + + +def fix_version_from_branch(branch, versions): + # Note: Assumes this is a sorted (newest->oldest) list of un-released versions + if branch == "master": + return versions[0] + else: + branch_ver = branch.replace("branch-", "") + return filter(lambda x: x.name.startswith(branch_ver), versions)[-1] + + +def resolve_jira_issue(merge_branches, comment, default_jira_id=""): + asf_jira = jira.client.JIRA({'server': JIRA_API_BASE}, + basic_auth=(JIRA_USERNAME, JIRA_PASSWORD)) + + jira_id = raw_input("Enter a JIRA id [%s]: " % default_jira_id) + if jira_id == "": + jira_id = default_jira_id + + try: + issue = asf_jira.issue(jira_id) + except Exception as e: + fail("ASF JIRA could not find %s\n%s" % (jira_id, e)) + + cur_status = issue.fields.status.name + cur_summary = issue.fields.summary + cur_assignee = issue.fields.assignee + if cur_assignee is None: + cur_assignee = "NOT ASSIGNED!!!" + else: + cur_assignee = cur_assignee.displayName + + if cur_status == "Resolved" or cur_status == "Closed": + fail("JIRA issue %s already has status '%s'" % (jira_id, cur_status)) + print ("=== JIRA %s ===" % jira_id) + print ("summary\t\t%s\nassignee\t%s\nstatus\t\t%s\nurl\t\t%s/%s\n" % ( + cur_summary, cur_assignee, cur_status, JIRA_BASE, jira_id)) + + versions = asf_jira.project_versions("SPARK") + versions = sorted(versions, key=lambda x: x.name, reverse=True) + versions = filter(lambda x: x.raw['released'] is False, versions) + # Consider only x.y.z versions + versions = filter(lambda x: re.match('\d+\.\d+\.\d+', x.name), versions) + + default_fix_versions = map(lambda x: fix_version_from_branch(x, versions).name, merge_branches) + for v in default_fix_versions: + # Handles the case where we have forked a release branch but not yet made the release. + # In this case, if the PR is committed to the master branch and the release branch, we + # only consider the release branch to be the fix version. E.g. 
it is not valid to have + # both 1.1.0 and 1.0.0 as fix versions. + (major, minor, patch) = v.split(".") + if patch == "0": + previous = "%s.%s.%s" % (major, int(minor) - 1, 0) + if previous in default_fix_versions: + default_fix_versions = filter(lambda x: x != v, default_fix_versions) + default_fix_versions = ",".join(default_fix_versions) + + fix_versions = raw_input("Enter comma-separated fix version(s) [%s]: " % default_fix_versions) + if fix_versions == "": + fix_versions = default_fix_versions + fix_versions = fix_versions.replace(" ", "").split(",") + + def get_version_json(version_str): + return filter(lambda v: v.name == version_str, versions)[0].raw + + jira_fix_versions = map(lambda v: get_version_json(v), fix_versions) + + resolve = filter(lambda a: a['name'] == "Resolve Issue", asf_jira.transitions(jira_id))[0] + resolution = filter(lambda r: r.raw['name'] == "Fixed", asf_jira.resolutions())[0] + asf_jira.transition_issue( + jira_id, resolve["id"], fixVersions = jira_fix_versions, + comment = comment, resolution = {'id': resolution.raw['id']}) + + print "Successfully resolved %s with fixVersions=%s!" % (jira_id, fix_versions) + + +def resolve_jira_issues(title, merge_branches, comment): + jira_ids = re.findall("SPARK-[0-9]{4,5}", title) + + if len(jira_ids) == 0: + resolve_jira_issue(merge_branches, comment) + for jira_id in jira_ids: + resolve_jira_issue(merge_branches, comment, jira_id) + + +def standardize_jira_ref(text): + """ + Standardize the [SPARK-XXXXX] [MODULE] prefix + Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. 
Issue" or "SPARK XXX [MLLIB]: Issue" to "[SPARK-XXX][MLLIB] Issue" + + >>> standardize_jira_ref("[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete is successful") + '[SPARK-5821][SQL] ParquetRelation2 CTAS should check if delete is successful' + >>> standardize_jira_ref("[SPARK-4123][Project Infra][WIP]: Show new dependencies added in pull requests") + '[SPARK-4123][PROJECT INFRA][WIP] Show new dependencies added in pull requests' + >>> standardize_jira_ref("[MLlib] Spark 5954: Top by key") + '[SPARK-5954][MLLIB] Top by key' + >>> standardize_jira_ref("[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl") + '[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl' + >>> standardize_jira_ref("SPARK-1094 Support MiMa for reporting binary compatibility accross versions.") + '[SPARK-1094] Support MiMa for reporting binary compatibility accross versions.' + >>> standardize_jira_ref("[WIP] [SPARK-1146] Vagrant support for Spark") + '[SPARK-1146][WIP] Vagrant support for Spark' + >>> standardize_jira_ref("SPARK-1032. If Yarn app fails before registering, app master stays aroun...") + '[SPARK-1032] If Yarn app fails before registering, app master stays aroun...' + >>> standardize_jira_ref("[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.") + '[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.' 
+ >>> standardize_jira_ref("Additional information for users building from source code") + 'Additional information for users building from source code' + """ + jira_refs = [] + components = [] + + # If the string is compliant, no need to process any further + if (re.search(r'^\[SPARK-[0-9]{3,6}\](\[[A-Z0-9_\s,]+\] )+\S+', text)): + return text + + # Extract JIRA ref(s): + pattern = re.compile(r'(SPARK[-\s]*[0-9]{3,6})+', re.IGNORECASE) + for ref in pattern.findall(text): + # Add brackets, replace spaces with a dash, & convert to uppercase + jira_refs.append('[' + re.sub(r'\s+', '-', ref.upper()) + ']') + text = text.replace(ref, '') + + # Extract spark component(s): + # Look for alphanumeric chars, spaces, dashes, periods, and/or commas + pattern = re.compile(r'(\[[\w\s,-\.]+\])', re.IGNORECASE) + for component in pattern.findall(text): + components.append(component.upper()) + text = text.replace(component, '') + + # Cleanup any remaining symbols: + pattern = re.compile(r'^\W+(.*)', re.IGNORECASE) + if (pattern.search(text) is not None): + text = pattern.search(text).groups()[0] + + # Assemble full text (JIRA ref(s), module(s), remaining text) + clean_text = ''.join(jira_refs).strip() + ''.join(components).strip() + " " + text.strip() + + # Replace multiple spaces with a single space, e.g. if no jira refs and/or components were included + clean_text = re.sub(r'\s+', ' ', clean_text.strip()) + + return clean_text + +def main(): + global original_head + + os.chdir(SPARK_HOME) + original_head = run_cmd("git rev-parse HEAD")[:8] + + branches = get_json("%s/branches" % GITHUB_API_BASE) + branch_names = filter(lambda x: x.startswith("branch-"), [x['name'] for x in branches]) + # Assumes branch names can be sorted lexicographically + latest_branch = sorted(branch_names, reverse=True)[0] + + pr_num = raw_input("Which pull request would you like to merge? (e.g. 
34): ") + pr = get_json("%s/pulls/%s" % (GITHUB_API_BASE, pr_num)) + pr_events = get_json("%s/issues/%s/events" % (GITHUB_API_BASE, pr_num)) + + url = pr["url"] + + # Decide whether to use the modified title or not + modified_title = standardize_jira_ref(pr["title"]) + if modified_title != pr["title"]: + print "I've re-written the title as follows to match the standard format:" + print "Original: %s" % pr["title"] + print "Modified: %s" % modified_title + result = raw_input("Would you like to use the modified title? (y/n): ") + if result.lower() == "y": + title = modified_title + print "Using modified title:" + else: + title = pr["title"] + print "Using original title:" + print title + else: + title = pr["title"] + + body = pr["body"] + target_ref = pr["base"]["ref"] + user_login = pr["user"]["login"] + base_ref = pr["head"]["ref"] + pr_repo_desc = "%s/%s" % (user_login, base_ref) + + # Merged pull requests don't appear as merged in the GitHub API; + # Instead, they're closed by asfgit. + merge_commits = \ + [e for e in pr_events if e["actor"]["login"] == "asfgit" and e["event"] == "closed"] + + if merge_commits: + merge_hash = merge_commits[0]["commit_id"] + message = get_json("%s/commits/%s" % (GITHUB_API_BASE, merge_hash))["commit"]["message"] + + print "Pull request %s has already been merged, assuming you want to backport" % pr_num + commit_is_downloaded = run_cmd(['git', 'rev-parse', '--quiet', '--verify', + "%s^{commit}" % merge_hash]).strip() != "" + if not commit_is_downloaded: + fail("Couldn't find any merge commit for #%s, you may need to update HEAD." % pr_num) + + print "Found commit %s:\n%s" % (merge_hash, message) + cherry_pick(pr_num, merge_hash, latest_branch) + sys.exit(0) + + if not bool(pr["mergeable"]): + msg = "Pull request %s is not mergeable in its current form.\n" % pr_num + \ + "Continue? 
(experts only!)" + continue_maybe(msg) + + print ("\n=== Pull Request #%s ===" % pr_num) + print ("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" % ( + title, pr_repo_desc, target_ref, url)) + continue_maybe("Proceed with merging pull request #%s?" % pr_num) + + merged_refs = [target_ref] + + merge_hash = merge_pr(pr_num, target_ref, title, body, pr_repo_desc) + + pick_prompt = "Would you like to pick %s into another branch?" % merge_hash + while raw_input("\n%s (y/n): " % pick_prompt).lower() == "y": + merged_refs = merged_refs + [cherry_pick(pr_num, merge_hash, latest_branch)] + + if JIRA_IMPORTED: + if JIRA_USERNAME and JIRA_PASSWORD: + continue_maybe("Would you like to update an associated JIRA?") + jira_comment = "Issue resolved by pull request %s\n[%s/%s]" % (pr_num, GITHUB_BASE, pr_num) + resolve_jira_issues(title, merged_refs, jira_comment) + else: + print "JIRA_USERNAME and JIRA_PASSWORD not set" + print "Exiting without trying to close the associated JIRA." + else: + print "Could not find jira-python library. Run 'sudo pip install jira' to install." + print "Exiting without trying to close the associated JIRA." + +if __name__ == "__main__": + import doctest + (failure_count, test_count) = doctest.testmod() + if failure_count: + exit(-1) + + main() diff --git a/dev/_site/mima b/dev/_site/mima new file mode 100755 index 0000000000000..2952fa65d42ff --- /dev/null +++ b/dev/_site/mima @@ -0,0 +1,54 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -o pipefail +set -e + +# Go to the Spark project root directory +FWDIR="$(cd "`dirname "$0"`"/..; pwd)" +cd "$FWDIR" + +echo -e "q\n" | build/sbt oldDeps/update +rm -f .generated-mima* + +generate_mima_ignore() { + SPARK_JAVA_OPTS="-XX:MaxPermSize=1g -Xmx2g" \ + ./bin/spark-class org.apache.spark.tools.GenerateMIMAIgnore +} + +# Generate Mima Ignore is called twice, first with latest built jars +# on the classpath and then again with previous version jars on the classpath. +# Because of a bug in GenerateMIMAIgnore that when old jars are ahead on classpath +# it did not process the new classes (which are in assembly jar). +generate_mima_ignore + +export SPARK_CLASSPATH="`find lib_managed \( -name '*spark*jar' -a -type f \) | tr "\\n" ":"`" +echo "SPARK_CLASSPATH=$SPARK_CLASSPATH" + +generate_mima_ignore + +echo -e "q\n" | build/sbt mima-report-binary-issues | grep -v -e "info.*Resolving" +ret_val=$? + +if [ $ret_val != 0 ]; then + echo "NOTE: Exceptions to binary compatibility can be added in project/MimaExcludes.scala" +fi + +rm -f .generated-mima* +exit $ret_val diff --git a/dev/_site/run-tests b/dev/_site/run-tests new file mode 100755 index 0000000000000..257d1e8d50bb4 --- /dev/null +++ b/dev/_site/run-tests @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +FWDIR="$(cd "`dirname $0`"/..; pwd)" +cd "$FWDIR" + +exec python -u ./dev/run-tests.py "$@" diff --git a/dev/_site/run-tests-jenkins b/dev/_site/run-tests-jenkins new file mode 100755 index 0000000000000..e79accf9e987a --- /dev/null +++ b/dev/_site/run-tests-jenkins @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Wrapper script that runs the Spark tests then reports QA results +# to github via its API. 
+# Environment variables are populated by the code here: +#+ https://github.com/jenkinsci/ghprb-plugin/blob/master/src/main/java/org/jenkinsci/plugins/ghprb/GhprbTrigger.java#L139 + +FWDIR="$(cd "`dirname $0`"/..; pwd)" +cd "$FWDIR" + +exec python -u ./dev/run-tests-jenkins.py "$@" diff --git a/dev/_site/run-tests-jenkins.py b/dev/_site/run-tests-jenkins.py new file mode 100755 index 0000000000000..623004310e189 --- /dev/null +++ b/dev/_site/run-tests-jenkins.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python2 + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from __future__ import print_function +import os +import sys +import json +import urllib2 +import functools +import subprocess + +from sparktestsupport import SPARK_HOME, ERROR_CODES +from sparktestsupport.shellutils import run_cmd + + +def print_err(msg): + """ + Given a set of arguments, will print them to the STDERR stream + """ + print(msg, file=sys.stderr) + + +def post_message_to_github(msg, ghprb_pull_id): + print("Attempting to post to Github...") + + url = "https://api.github.com/repos/apache/spark/issues/" + ghprb_pull_id + "/comments" + github_oauth_key = os.environ["GITHUB_OAUTH_KEY"] + + posted_message = json.dumps({"body": msg}) + request = urllib2.Request(url, + headers={ + "Authorization": "token %s" % github_oauth_key, + "Content-Type": "application/json" + }, + data=posted_message) + try: + response = urllib2.urlopen(request) + + if response.getcode() == 201: + print(" > Post successful.") + except urllib2.HTTPError as http_e: + print_err("Failed to post message to Github.") + print_err(" > http_code: %s" % http_e.code) + print_err(" > api_response: %s" % http_e.read()) + print_err(" > data: %s" % posted_message) + except urllib2.URLError as url_e: + print_err("Failed to post message to Github.") + print_err(" > urllib2_status: %s" % url_e.reason[1]) + print_err(" > data: %s" % posted_message) + + +def pr_message(build_display_name, + build_url, + ghprb_pull_id, + short_commit_hash, + commit_url, + msg, + post_msg=''): + # align the arguments properly for string formatting + str_args = (build_display_name, + msg, + build_url, + ghprb_pull_id, + short_commit_hash, + commit_url, + str(' ' + post_msg + '.') if post_msg else '.') + return '**[Test build %s %s](%sconsoleFull)** for PR %s at commit [`%s`](%s)%s' % str_args + + +def run_pr_checks(pr_tests, ghprb_actual_commit, sha1): + """ + Executes a set of pull request checks to ease development and report issues with various + components such as style, linting, dependencies, compatibilities, 
etc. + @return a list of messages to post back to Github + """ + # Ensure we save off the current HEAD to revert to + current_pr_head = run_cmd(['git', 'rev-parse', 'HEAD'], return_output=True).strip() + pr_results = list() + + for pr_test in pr_tests: + test_name = pr_test + '.sh' + pr_results.append(run_cmd(['bash', os.path.join(SPARK_HOME, 'dev', 'tests', test_name), + ghprb_actual_commit, sha1], + return_output=True).rstrip()) + # Ensure, after each test, that we're back on the current PR + run_cmd(['git', 'checkout', '-f', current_pr_head]) + return pr_results + + +def run_tests(tests_timeout): + """ + Runs the `dev/run-tests` script and responds with the correct error message + under the various failure scenarios. + @return a tuple containing the test result code and the result note to post to Github + """ + + test_result_code = subprocess.Popen(['timeout', + tests_timeout, + os.path.join(SPARK_HOME, 'dev', 'run-tests')]).wait() + + failure_note_by_errcode = { + 1: 'executing the `dev/run-tests` script', # error to denote run-tests script failures + ERROR_CODES["BLOCK_GENERAL"]: 'some tests', + ERROR_CODES["BLOCK_RAT"]: 'RAT tests', + ERROR_CODES["BLOCK_SCALA_STYLE"]: 'Scala style tests', + ERROR_CODES["BLOCK_PYTHON_STYLE"]: 'Python style tests', + ERROR_CODES["BLOCK_R_STYLE"]: 'R style tests', + ERROR_CODES["BLOCK_DOCUMENTATION"]: 'to generate documentation', + ERROR_CODES["BLOCK_BUILD"]: 'to build', + ERROR_CODES["BLOCK_MIMA"]: 'MiMa tests', + ERROR_CODES["BLOCK_SPARK_UNIT_TESTS"]: 'Spark unit tests', + ERROR_CODES["BLOCK_PYSPARK_UNIT_TESTS"]: 'PySpark unit tests', + ERROR_CODES["BLOCK_SPARKR_UNIT_TESTS"]: 'SparkR unit tests', + ERROR_CODES["BLOCK_TIMEOUT"]: 'from timeout after a configured wait of \`%s\`' % ( + tests_timeout) + } + + if test_result_code == 0: + test_result_note = ' * This patch passes all tests.' + else: + test_result_note = ' * This patch **fails %s**.' 
% failure_note_by_errcode[test_result_code] + + return [test_result_code, test_result_note] + + +def main(): + # Important Environment Variables + # --- + # $ghprbActualCommit + # This is the hash of the most recent commit in the PR. + # The merge-base of this and master is the commit from which the PR was branched. + # $sha1 + # If the patch merges cleanly, this is a reference to the merge commit hash + # (e.g. "origin/pr/2606/merge"). + # If the patch does not merge cleanly, it is equal to $ghprbActualCommit. + # The merge-base of this and master in the case of a clean merge is the most recent commit + # against master. + ghprb_pull_id = os.environ["ghprbPullId"] + ghprb_actual_commit = os.environ["ghprbActualCommit"] + ghprb_pull_title = os.environ["ghprbPullTitle"] + sha1 = os.environ["sha1"] + + # Marks this build as a pull request build. + os.environ["AMP_JENKINS_PRB"] = "true" + # Switch to a Maven-based build if the PR title contains "test-maven": + if "test-maven" in ghprb_pull_title: + os.environ["AMPLAB_JENKINS_BUILD_TOOL"] = "maven" + # Switch the Hadoop profile based on the PR title: + if "test-hadoop1.0" in ghprb_pull_title: + os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop1.0" + if "test-hadoop2.2" in ghprb_pull_title: + os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.0" + if "test-hadoop2.2" in ghprb_pull_title: + os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.2" + if "test-hadoop2.3" in ghprb_pull_title: + os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.3" + + build_display_name = os.environ["BUILD_DISPLAY_NAME"] + build_url = os.environ["BUILD_URL"] + + commit_url = "https://github.com/apache/spark/commit/" + ghprb_actual_commit + + # GitHub doesn't auto-link short hashes when submitted via the API, unfortunately. 
:( + short_commit_hash = ghprb_actual_commit[0:7] + + # format: http://linux.die.net/man/1/timeout + # must be less than the timeout configured on Jenkins (currently 300m) + tests_timeout = "250m" + + # Array to capture all test names to run on the pull request. These tests are represented + # by their file equivalents in the dev/tests/ directory. + # + # To write a PR test: + # * the file must reside within the dev/tests directory + # * be an executable bash script + # * accept three arguments on the command line, the first being the Github PR long commit + # hash, the second the Github SHA1 hash, and the final the current PR hash + # * and, lastly, return string output to be included in the pr message output that will + # be posted to Github + pr_tests = [ + "pr_merge_ability", + "pr_public_classes" + # DISABLED (pwendell) "pr_new_dependencies" + ] + + # `bind_message_base` returns a function to generate messages for Github posting + github_message = functools.partial(pr_message, + build_display_name, + build_url, + ghprb_pull_id, + short_commit_hash, + commit_url) + + # post start message + post_message_to_github(github_message('has started'), ghprb_pull_id) + + pr_check_results = run_pr_checks(pr_tests, ghprb_actual_commit, sha1) + + test_result_code, test_result_note = run_tests(tests_timeout) + + # post end message + result_message = github_message('has finished') + result_message += '\n' + test_result_note + '\n' + result_message += '\n'.join(pr_check_results) + + post_message_to_github(result_message, ghprb_pull_id) + + sys.exit(test_result_code) + + +if __name__ == "__main__": + main() diff --git a/dev/_site/run-tests.py b/dev/_site/run-tests.py new file mode 100755 index 0000000000000..9e1abb0697192 --- /dev/null +++ b/dev/_site/run-tests.py @@ -0,0 +1,561 @@ +#!/usr/bin/env python2 + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function +import itertools +from optparse import OptionParser +import os +import random +import re +import sys +import subprocess +from collections import namedtuple + +from sparktestsupport import SPARK_HOME, USER_HOME, ERROR_CODES +from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which +import sparktestsupport.modules as modules + + +# ------------------------------------------------------------------------------------------------- +# Functions for traversing module dependency graph +# ------------------------------------------------------------------------------------------------- + + +def determine_modules_for_files(filenames): + """ + Given a list of filenames, return the set of modules that contain those files. + If a file is not associated with a more specific submodule, then this method will consider that + file to belong to the 'root' module. 
+ + >>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/test/foo"])) + ['pyspark-core', 'sql'] + >>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])] + ['root'] + """ + changed_modules = set() + for filename in filenames: + matched_at_least_one_module = False + for module in modules.all_modules: + if module.contains_file(filename): + changed_modules.add(module) + matched_at_least_one_module = True + if not matched_at_least_one_module: + changed_modules.add(modules.root) + return changed_modules + + +def identify_changed_files_from_git_commits(patch_sha, target_branch=None, target_ref=None): + """ + Given a git commit and target ref, use the set of files changed in the diff in order to + determine which modules' tests should be run. + + >>> [x.name for x in determine_modules_for_files( \ + identify_changed_files_from_git_commits("fc0a1475ef", target_ref="5da21f07"))] + ['graphx'] + >>> 'root' in [x.name for x in determine_modules_for_files( \ + identify_changed_files_from_git_commits("50a0496a43", target_ref="6765ef9"))] + True + """ + if target_branch is None and target_ref is None: + raise AttributeError("must specify either target_branch or target_ref") + elif target_branch is not None and target_ref is not None: + raise AttributeError("must specify either target_branch or target_ref, not both") + if target_branch is not None: + diff_target = target_branch + run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)]) + else: + diff_target = target_ref + raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target], + universal_newlines=True) + # Remove any empty strings + return [f for f in raw_output.split('\n') if f] + + +def setup_test_environ(environ): + print("[info] Setup the following environment variables for tests: ") + for (k, v) in environ.items(): + print("%s=%s" % (k, v)) + os.environ[k] = v + + +def 
determine_modules_to_test(changed_modules): + """ + Given a set of modules that have changed, compute the transitive closure of those modules' + dependent modules in order to determine the set of modules that should be tested. + + >>> sorted(x.name for x in determine_modules_to_test([modules.root])) + ['root'] + >>> sorted(x.name for x in determine_modules_to_test([modules.graphx])) + ['examples', 'graphx'] + >>> x = sorted(x.name for x in determine_modules_to_test([modules.sql])) + >>> x # doctest: +NORMALIZE_WHITESPACE + ['examples', 'hive-thriftserver', 'mllib', 'pyspark-ml', \ + 'pyspark-mllib', 'pyspark-sql', 'sparkr', 'sql'] + """ + # If we're going to have to run all of the tests, then we can just short-circuit + # and return 'root'. No module depends on root, so if it appears then it will be + # in changed_modules. + if modules.root in changed_modules: + return [modules.root] + modules_to_test = set() + for module in changed_modules: + modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules)) + return modules_to_test.union(set(changed_modules)) + + +def determine_tags_to_exclude(changed_modules): + tags = [] + for m in modules.all_modules: + if m not in changed_modules: + tags += m.test_tags + return tags + + +# ------------------------------------------------------------------------------------------------- +# Functions for working with subprocesses and shell tools +# ------------------------------------------------------------------------------------------------- + + +def determine_java_executable(): + """Will return the path of the java executable that will be used by Spark's + tests or `None`""" + + # Any changes in the way that Spark's build detects java must be reflected + # here. 
Currently the build looks for $JAVA_HOME/bin/java then falls back to + # the `java` executable on the path + + java_home = os.environ.get("JAVA_HOME") + + # check if there is an executable at $JAVA_HOME/bin/java + java_exe = which(os.path.join(java_home, "bin", "java")) if java_home else None + # if the java_exe wasn't set, check for a `java` version on the $PATH + return java_exe if java_exe else which("java") + + +JavaVersion = namedtuple('JavaVersion', ['major', 'minor', 'patch', 'update']) + + +def determine_java_version(java_exe): + """Given a valid java executable will return its version in named tuple format + with accessors '.major', '.minor', '.patch', '.update'""" + + raw_output = subprocess.check_output([java_exe, "-version"], + stderr=subprocess.STDOUT, + universal_newlines=True) + + raw_output_lines = raw_output.split('\n') + + # find raw version string, eg 'java version "1.8.0_25"' + raw_version_str = next(x for x in raw_output_lines if " version " in x) + + match = re.search('(\d+)\.(\d+)\.(\d+)_(\d+)', raw_version_str) + + major = int(match.group(1)) + minor = int(match.group(2)) + patch = int(match.group(3)) + update = int(match.group(4)) + + return JavaVersion(major, minor, patch, update) + +# ------------------------------------------------------------------------------------------------- +# Functions for running the other build and test scripts +# ------------------------------------------------------------------------------------------------- + + +def set_title_and_block(title, err_block): + os.environ["CURRENT_BLOCK"] = str(ERROR_CODES[err_block]) + line_str = '=' * 72 + + print('') + print(line_str) + print(title) + print(line_str) + + +def run_apache_rat_checks(): + set_title_and_block("Running Apache RAT checks", "BLOCK_RAT") + run_cmd([os.path.join(SPARK_HOME, "dev", "check-license")]) + + +def run_scala_style_checks(): + set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE") + run_cmd([os.path.join(SPARK_HOME, "dev", 
"lint-scala")]) + + +def run_python_style_checks(): + set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE") + run_cmd([os.path.join(SPARK_HOME, "dev", "lint-python")]) + + +def run_sparkr_style_checks(): + set_title_and_block("Running R style checks", "BLOCK_R_STYLE") + + if which("R"): + # R style check should be executed after `install-dev.sh`. + # Since warnings about `no visible global function definition` appear + # without the installation. SEE ALSO: SPARK-9121. + run_cmd([os.path.join(SPARK_HOME, "dev", "lint-r")]) + else: + print("Ignoring SparkR style check as R was not found in PATH") + + +def build_spark_documentation(): + set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION") + os.environ["PRODUCTION"] = "1 jekyll build" + + os.chdir(os.path.join(SPARK_HOME, "docs")) + + jekyll_bin = which("jekyll") + + if not jekyll_bin: + print("[error] Cannot find a version of `jekyll` on the system; please", + " install one and retry to build documentation.") + sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) + else: + run_cmd([jekyll_bin, "build"]) + + os.chdir(SPARK_HOME) + + +def get_zinc_port(): + """ + Get a randomized port on which to start Zinc + """ + return random.randrange(3030, 4030) + + +def kill_zinc_on_port(zinc_port): + """ + Kill the Zinc process running on the given port, if one exists. 
+ """ + cmd = ("/usr/sbin/lsof -P |grep %s | grep LISTEN " + "| awk '{ print $2; }' | xargs kill") % zinc_port + subprocess.check_call(cmd, shell=True) + + +def exec_maven(mvn_args=()): + """Will call Maven in the current directory with the list of mvn_args passed + in and returns the subprocess for any further processing""" + + zinc_port = get_zinc_port() + os.environ["ZINC_PORT"] = "%s" % zinc_port + zinc_flag = "-DzincPort=%s" % zinc_port + flags = [os.path.join(SPARK_HOME, "build", "mvn"), "--force", zinc_flag] + run_cmd(flags + mvn_args) + kill_zinc_on_port(zinc_port) + + +def exec_sbt(sbt_args=()): + """Will call SBT in the current directory with the list of mvn_args passed + in and returns the subprocess for any further processing""" + + sbt_cmd = [os.path.join(SPARK_HOME, "build", "sbt")] + sbt_args + + sbt_output_filter = re.compile("^.*[info].*Resolving" + "|" + + "^.*[warn].*Merging" + "|" + + "^.*[info].*Including") + + # NOTE: echo "q" is needed because sbt on encountering a build file + # with failure (either resolution or compilation) prompts the user for + # input either q, r, etc to quit or retry. This echo is there to make it + # not block. + echo_proc = subprocess.Popen(["echo", "\"q\n\""], stdout=subprocess.PIPE) + sbt_proc = subprocess.Popen(sbt_cmd, + stdin=echo_proc.stdout, + stdout=subprocess.PIPE) + echo_proc.wait() + for line in iter(sbt_proc.stdout.readline, ''): + if not sbt_output_filter.match(line): + print(line, end='') + retcode = sbt_proc.wait() + + if retcode > 0: + exit_from_command_with_retcode(sbt_cmd, retcode) + + +def get_hadoop_profiles(hadoop_version): + """ + For the given Hadoop version tag, return a list of SBT profile flags for + building and testing against that Hadoop version. 
+ """ + + sbt_maven_hadoop_profiles = { + "hadoop1.0": ["-Phadoop-1", "-Dhadoop.version=1.2.1"], + "hadoop2.0": ["-Phadoop-1", "-Dhadoop.version=2.0.0-mr1-cdh4.1.1"], + "hadoop2.2": ["-Pyarn", "-Phadoop-2.2"], + "hadoop2.3": ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"], + "hadoop2.6": ["-Pyarn", "-Phadoop-2.6"], + } + + if hadoop_version in sbt_maven_hadoop_profiles: + return sbt_maven_hadoop_profiles[hadoop_version] + else: + print("[error] Could not find", hadoop_version, "in the list. Valid options", + " are", sbt_maven_hadoop_profiles.keys()) + sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) + + +def build_spark_maven(hadoop_version): + # Enable all of the profiles for the build: + build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags + mvn_goals = ["clean", "package", "-DskipTests"] + profiles_and_goals = build_profiles + mvn_goals + + print("[info] Building Spark (w/Hive 1.2.1) using Maven with these arguments: ", + " ".join(profiles_and_goals)) + + exec_maven(profiles_and_goals) + + +def build_spark_sbt(hadoop_version): + # Enable all of the profiles for the build: + build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags + sbt_goals = ["package", + "assembly/assembly", + "streaming-kafka-assembly/assembly", + "streaming-flume-assembly/assembly", + "streaming-mqtt-assembly/assembly", + "streaming-mqtt/test:assembly", + "streaming-kinesis-asl-assembly/assembly"] + profiles_and_goals = build_profiles + sbt_goals + + print("[info] Building Spark (w/Hive 1.2.1) using SBT with these arguments: ", + " ".join(profiles_and_goals)) + + exec_sbt(profiles_and_goals) + + +def build_apache_spark(build_tool, hadoop_version): + """Will build Spark against Hive v1.2.1 given the passed in build tool (either `sbt` or + `maven`). 
Defaults to using `sbt`.""" + + set_title_and_block("Building Spark", "BLOCK_BUILD") + + rm_r("lib_managed") + + if build_tool == "maven": + build_spark_maven(hadoop_version) + else: + build_spark_sbt(hadoop_version) + + +def detect_binary_inop_with_mima(): + set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA") + run_cmd([os.path.join(SPARK_HOME, "dev", "mima")]) + + +def run_scala_tests_maven(test_profiles): + mvn_test_goals = ["test", "--fail-at-end"] + + profiles_and_goals = test_profiles + mvn_test_goals + + print("[info] Running Spark tests using Maven with these arguments: ", + " ".join(profiles_and_goals)) + + exec_maven(profiles_and_goals) + + +def run_scala_tests_sbt(test_modules, test_profiles): + + sbt_test_goals = set(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules)) + + if not sbt_test_goals: + return + + profiles_and_goals = test_profiles + list(sbt_test_goals) + + print("[info] Running Spark tests using SBT with these arguments: ", + " ".join(profiles_and_goals)) + + exec_sbt(profiles_and_goals) + + +def run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags): + """Function to properly execute all tests passed in as a set from the + `determine_test_suites` function""" + set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") + + test_modules = set(test_modules) + + test_profiles = get_hadoop_profiles(hadoop_version) + \ + list(set(itertools.chain.from_iterable(m.build_profile_flags for m in test_modules))) + + if excluded_tags: + test_profiles += ['-Dtest.exclude.tags=' + ",".join(excluded_tags)] + + if build_tool == "maven": + run_scala_tests_maven(test_profiles) + else: + run_scala_tests_sbt(test_modules, test_profiles) + + +def run_python_tests(test_modules, parallelism): + set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") + + command = [os.path.join(SPARK_HOME, "python", "run-tests")] + if test_modules != [modules.root]: + 
command.append("--modules=%s" % ','.join(m.name for m in test_modules)) + command.append("--parallelism=%i" % parallelism) + run_cmd(command) + + +def run_sparkr_tests(): + set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS") + + if which("R"): + run_cmd([os.path.join(SPARK_HOME, "R", "run-tests.sh")]) + else: + print("Ignoring SparkR tests as R was not found in PATH") + + +def parse_opts(): + parser = OptionParser( + prog="run-tests" + ) + parser.add_option( + "-p", "--parallelism", type="int", default=4, + help="The number of suites to test in parallel (default %default)" + ) + + (opts, args) = parser.parse_args() + if args: + parser.error("Unsupported arguments: %s" % ' '.join(args)) + if opts.parallelism < 1: + parser.error("Parallelism cannot be less than 1") + return opts + + +def main(): + opts = parse_opts() + # Ensure the user home directory (HOME) is valid and is an absolute directory + if not USER_HOME or not os.path.isabs(USER_HOME): + print("[error] Cannot determine your home directory as an absolute path;", + " ensure the $HOME environment variable is set properly.") + sys.exit(1) + + os.chdir(SPARK_HOME) + + rm_r(os.path.join(SPARK_HOME, "work")) + rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark")) + rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark")) + + os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"]) + + java_exe = determine_java_executable() + + if not java_exe: + print("[error] Cannot find a version of `java` on the system; please", + " install one and retry.") + sys.exit(2) + + java_version = determine_java_version(java_exe) + + if java_version.minor < 8: + print("[warn] Java 8 tests will not run because JDK version is < 1.8.") + + # install SparkR + if which("R"): + run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")]) + else: + print("Can't install SparkR as R is was not found in PATH") + + if os.environ.get("AMPLAB_JENKINS"): + # if we're on the Amplab Jenkins build 
servers setup variables + # to reflect the environment settings + build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt") + hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3") + test_env = "amplab_jenkins" + # add path for Python3 in Jenkins if we're calling from a Jenkins machine + os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get("PATH") + else: + # else we're running locally and can use local settings + build_tool = "sbt" + hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.3") + test_env = "local" + + print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version, + "under environment", test_env) + + changed_modules = None + changed_files = None + if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"): + target_branch = os.environ["ghprbTargetBranch"] + changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch) + changed_modules = determine_modules_for_files(changed_files) + excluded_tags = determine_tags_to_exclude(changed_modules) + if not changed_modules: + changed_modules = [modules.root] + excluded_tags = [] + print("[info] Found the following changed modules:", + ", ".join(x.name for x in changed_modules)) + + # setup environment variables + # note - the 'root' module doesn't collect environment variables for all modules. Because the + # environment variables should not be set if a module is not changed, even if running the 'root' + # module. So here we should use changed_modules rather than test_modules. 
+ test_environ = {} + for m in changed_modules: + test_environ.update(m.environ) + setup_test_environ(test_environ) + + test_modules = determine_modules_to_test(changed_modules) + + # license checks + run_apache_rat_checks() + + # style checks + if not changed_files or any(f.endswith(".scala") for f in changed_files): + run_scala_style_checks() + if not changed_files or any(f.endswith(".py") for f in changed_files): + run_python_style_checks() + if not changed_files or any(f.endswith(".R") for f in changed_files): + run_sparkr_style_checks() + + # determine if docs were changed and if we're inside the amplab environment + # note - the below commented out until *all* Jenkins workers can get `jekyll` installed + # if "DOCS" in changed_modules and test_env == "amplab_jenkins": + # build_spark_documentation() + + # spark build + build_apache_spark(build_tool, hadoop_version) + + # backwards compatibility checks + if build_tool == "sbt": + # Note: compatiblity tests only supported in sbt for now + detect_binary_inop_with_mima() + + # run the test suites + run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags) + + modules_with_python_tests = [m for m in test_modules if m.python_test_goals] + if modules_with_python_tests: + run_python_tests(modules_with_python_tests, opts.parallelism) + if any(m.should_run_r_tests for m in test_modules): + run_sparkr_tests() + + +def _test(): + import doctest + failure_count = doctest.testmod()[0] + if failure_count: + exit(-1) + +if __name__ == "__main__": + _test() + main() diff --git a/dev/_site/scalastyle b/dev/_site/scalastyle new file mode 100755 index 0000000000000..ad93f7e85b27c --- /dev/null +++ b/dev/_site/scalastyle @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +echo -e "q\n" | build/sbt -Pkinesis-asl -Phive -Phive-thriftserver scalastyle > scalastyle.txt +echo -e "q\n" | build/sbt -Pkinesis-asl -Phive -Phive-thriftserver test:scalastyle >> scalastyle.txt +# Check style with YARN built too +echo -e "q\n" | build/sbt -Pkinesis-asl -Pyarn -Phadoop-2.2 scalastyle >> scalastyle.txt +echo -e "q\n" | build/sbt -Pkinesis-asl -Pyarn -Phadoop-2.2 test:scalastyle >> scalastyle.txt + +ERRORS=$(cat scalastyle.txt | awk '{if($1~/error/)print}') +rm scalastyle.txt + +if test ! -z "$ERRORS"; then + echo -e "Scalastyle checks failed at following occurrences:\n$ERRORS" + exit 1 +else + echo -e "Scalastyle checks passed." +fi diff --git a/dev/_site/sparktestsupport/modules.py b/dev/_site/sparktestsupport/modules.py new file mode 100644 index 0000000000000..d65547e04db4b --- /dev/null +++ b/dev/_site/sparktestsupport/modules.py @@ -0,0 +1,437 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import itertools +import re + +all_modules = [] + + +class Module(object): + """ + A module is the basic abstraction in our test runner script. Each module consists of a set of + source files, a set of test commands, and a set of dependencies on other modules. We use modules + to define a dependency graph that lets determine which tests to run based on which files have + changed. + """ + + def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), environ={}, + sbt_test_goals=(), python_test_goals=(), blacklisted_python_implementations=(), + test_tags=(), should_run_r_tests=False): + """ + Define a new module. + + :param name: A short module name, for display in logging and error messages. + :param dependencies: A set of dependencies for this module. This should only include direct + dependencies; transitive dependencies are resolved automatically. + :param source_file_regexes: a set of regexes that match source files belonging to this + module. These regexes are applied by attempting to match at the beginning of the + filename strings. + :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in + order to build and test this module (e.g. '-PprofileName'). + :param environ: A dict of environment variables that should be set when files in this + module are changed. + :param sbt_test_goals: A set of SBT test goals for testing this module. + :param python_test_goals: A set of Python test goals for testing this module. 
+ :param blacklisted_python_implementations: A set of Python implementations that are not + supported by this module's Python components. The values in this set should match + strings returned by Python's `platform.python_implementation()`. + :param test_tags A set of tags that will be excluded when running unit tests if the module + is not explicitly changed. + :param should_run_r_tests: If true, changes in this module will trigger all R tests. + """ + self.name = name + self.dependencies = dependencies + self.source_file_prefixes = source_file_regexes + self.sbt_test_goals = sbt_test_goals + self.build_profile_flags = build_profile_flags + self.environ = environ + self.python_test_goals = python_test_goals + self.blacklisted_python_implementations = blacklisted_python_implementations + self.test_tags = test_tags + self.should_run_r_tests = should_run_r_tests + + self.dependent_modules = set() + for dep in dependencies: + dep.dependent_modules.add(self) + all_modules.append(self) + + def contains_file(self, filename): + return any(re.match(p, filename) for p in self.source_file_prefixes) + + +sql = Module( + name="sql", + dependencies=[], + source_file_regexes=[ + "sql/(?!hive-thriftserver)", + "bin/spark-sql", + ], + build_profile_flags=[ + "-Phive", + ], + sbt_test_goals=[ + "catalyst/test", + "sql/test", + "hive/test", + ], + test_tags=[ + "org.apache.spark.tags.ExtendedHiveTest" + ] +) + + +hive_thriftserver = Module( + name="hive-thriftserver", + dependencies=[sql], + source_file_regexes=[ + "sql/hive-thriftserver", + "sbin/start-thriftserver.sh", + ], + build_profile_flags=[ + "-Phive-thriftserver", + ], + sbt_test_goals=[ + "hive-thriftserver/test", + ] +) + + +graphx = Module( + name="graphx", + dependencies=[], + source_file_regexes=[ + "graphx/", + ], + sbt_test_goals=[ + "graphx/test" + ] +) + + +streaming = Module( + name="streaming", + dependencies=[], + source_file_regexes=[ + "streaming", + ], + sbt_test_goals=[ + "streaming/test", + ] +) + + +# 
Don't set the dependencies because changes in other modules should not trigger Kinesis tests. +# Kinesis tests depends on external Amazon kinesis service. We should run these tests only when +# files in streaming_kinesis_asl are changed, so that if Kinesis experiences an outage, we don't +# fail other PRs. +streaming_kinesis_asl = Module( + name="streaming-kinesis-asl", + dependencies=[], + source_file_regexes=[ + "extras/kinesis-asl/", + "extras/kinesis-asl-assembly/", + ], + build_profile_flags=[ + "-Pkinesis-asl", + ], + environ={ + "ENABLE_KINESIS_TESTS": "1" + }, + sbt_test_goals=[ + "streaming-kinesis-asl/test", + ] +) + + +streaming_zeromq = Module( + name="streaming-zeromq", + dependencies=[streaming], + source_file_regexes=[ + "external/zeromq", + ], + sbt_test_goals=[ + "streaming-zeromq/test", + ] +) + + +streaming_twitter = Module( + name="streaming-twitter", + dependencies=[streaming], + source_file_regexes=[ + "external/twitter", + ], + sbt_test_goals=[ + "streaming-twitter/test", + ] +) + + +streaming_mqtt = Module( + name="streaming-mqtt", + dependencies=[streaming], + source_file_regexes=[ + "external/mqtt", + "external/mqtt-assembly", + ], + sbt_test_goals=[ + "streaming-mqtt/test", + ] +) + + +streaming_kafka = Module( + name="streaming-kafka", + dependencies=[streaming], + source_file_regexes=[ + "external/kafka", + "external/kafka-assembly", + ], + sbt_test_goals=[ + "streaming-kafka/test", + ] +) + + +streaming_flume_sink = Module( + name="streaming-flume-sink", + dependencies=[streaming], + source_file_regexes=[ + "external/flume-sink", + ], + sbt_test_goals=[ + "streaming-flume-sink/test", + ] +) + + +streaming_flume = Module( + name="streaming-flume", + dependencies=[streaming], + source_file_regexes=[ + "external/flume", + ], + sbt_test_goals=[ + "streaming-flume/test", + ] +) + + +streaming_flume_assembly = Module( + name="streaming-flume-assembly", + dependencies=[streaming_flume, streaming_flume_sink], + source_file_regexes=[ + 
"external/flume-assembly", + ] +) + + +mllib = Module( + name="mllib", + dependencies=[streaming, sql], + source_file_regexes=[ + "data/mllib/", + "mllib/", + ], + sbt_test_goals=[ + "mllib/test", + ] +) + + +examples = Module( + name="examples", + dependencies=[graphx, mllib, streaming, sql], + source_file_regexes=[ + "examples/", + ], + sbt_test_goals=[ + "examples/test", + ] +) + + +pyspark_core = Module( + name="pyspark-core", + dependencies=[], + source_file_regexes=[ + "python/(?!pyspark/(ml|mllib|sql|streaming))" + ], + python_test_goals=[ + "pyspark.rdd", + "pyspark.context", + "pyspark.conf", + "pyspark.broadcast", + "pyspark.accumulators", + "pyspark.serializers", + "pyspark.profiler", + "pyspark.shuffle", + "pyspark.tests", + ] +) + + +pyspark_sql = Module( + name="pyspark-sql", + dependencies=[pyspark_core, sql], + source_file_regexes=[ + "python/pyspark/sql" + ], + python_test_goals=[ + "pyspark.sql.types", + "pyspark.sql.context", + "pyspark.sql.column", + "pyspark.sql.dataframe", + "pyspark.sql.group", + "pyspark.sql.functions", + "pyspark.sql.readwriter", + "pyspark.sql.window", + "pyspark.sql.tests", + ] +) + + +pyspark_streaming = Module( + name="pyspark-streaming", + dependencies=[ + pyspark_core, + streaming, + streaming_kafka, + streaming_flume_assembly, + streaming_mqtt, + streaming_kinesis_asl + ], + source_file_regexes=[ + "python/pyspark/streaming" + ], + python_test_goals=[ + "pyspark.streaming.util", + "pyspark.streaming.tests", + ] +) + + +pyspark_mllib = Module( + name="pyspark-mllib", + dependencies=[pyspark_core, pyspark_streaming, pyspark_sql, mllib], + source_file_regexes=[ + "python/pyspark/mllib" + ], + python_test_goals=[ + "pyspark.mllib.classification", + "pyspark.mllib.clustering", + "pyspark.mllib.evaluation", + "pyspark.mllib.feature", + "pyspark.mllib.fpm", + "pyspark.mllib.linalg.__init__", + "pyspark.mllib.linalg.distributed", + "pyspark.mllib.random", + "pyspark.mllib.recommendation", + "pyspark.mllib.regression", + 
"pyspark.mllib.stat._statistics", + "pyspark.mllib.stat.KernelDensity", + "pyspark.mllib.tree", + "pyspark.mllib.util", + "pyspark.mllib.tests", + ], + blacklisted_python_implementations=[ + "PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there + ] +) + + +pyspark_ml = Module( + name="pyspark-ml", + dependencies=[pyspark_core, pyspark_mllib], + source_file_regexes=[ + "python/pyspark/ml/" + ], + python_test_goals=[ + "pyspark.ml.feature", + "pyspark.ml.classification", + "pyspark.ml.clustering", + "pyspark.ml.recommendation", + "pyspark.ml.regression", + "pyspark.ml.tuning", + "pyspark.ml.tests", + "pyspark.ml.evaluation", + ], + blacklisted_python_implementations=[ + "PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there + ] +) + +sparkr = Module( + name="sparkr", + dependencies=[sql, mllib], + source_file_regexes=[ + "R/", + ], + should_run_r_tests=True +) + + +docs = Module( + name="docs", + dependencies=[], + source_file_regexes=[ + "docs/", + ] +) + + +ec2 = Module( + name="ec2", + dependencies=[], + source_file_regexes=[ + "ec2/", + ] +) + + +yarn = Module( + name="yarn", + dependencies=[], + source_file_regexes=[ + "yarn/", + "network/yarn/", + ], + sbt_test_goals=[ + "yarn/test", + "network-yarn/test", + ], + test_tags=[ + "org.apache.spark.tags.ExtendedYarnTest" + ] +) + +# The root module is a dummy module which is used to run all of the tests. +# No other modules should directly depend on this module. 
+root = Module( + name="root", + dependencies=[], + source_file_regexes=[], + # In order to run all of the tests, enable every test profile: + build_profile_flags=list(set( + itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))), + sbt_test_goals=[ + "test", + ], + python_test_goals=list(itertools.chain.from_iterable(m.python_test_goals for m in all_modules)), + should_run_r_tests=True +) diff --git a/dev/_site/sparktestsupport/shellutils.py b/dev/_site/sparktestsupport/shellutils.py new file mode 100644 index 0000000000000..d280e797077d1 --- /dev/null +++ b/dev/_site/sparktestsupport/shellutils.py @@ -0,0 +1,115 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from __future__ import print_function +import os +import shutil +import subprocess +import sys + + +if sys.version_info >= (2, 7): + subprocess_check_output = subprocess.check_output + subprocess_check_call = subprocess.check_call +else: + # SPARK-8763 + # backported from subprocess module in Python 2.7 + def subprocess_check_output(*popenargs, **kwargs): + if 'stdout' in kwargs: + raise ValueError('stdout argument not allowed, it will be overridden.') + process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs) + output, unused_err = process.communicate() + retcode = process.poll() + if retcode: + cmd = kwargs.get("args") + if cmd is None: + cmd = popenargs[0] + raise subprocess.CalledProcessError(retcode, cmd, output=output) + return output + + # backported from subprocess module in Python 2.7 + def subprocess_check_call(*popenargs, **kwargs): + retcode = call(*popenargs, **kwargs) + if retcode: + cmd = kwargs.get("args") + if cmd is None: + cmd = popenargs[0] + raise CalledProcessError(retcode, cmd) + return 0 + + +def exit_from_command_with_retcode(cmd, retcode): + print("[error] running", ' '.join(cmd), "; received return code", retcode) + sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) + + +def rm_r(path): + """ + Given an arbitrary path, properly remove it with the correct Python construct if it exists. + From: http://stackoverflow.com/a/9559881 + """ + + if os.path.isdir(path): + shutil.rmtree(path) + elif os.path.exists(path): + os.remove(path) + + +def run_cmd(cmd, return_output=False): + """ + Given a command as a list of arguments will attempt to execute the command + and, on failure, print an error message and exit. 
+ """ + + if not isinstance(cmd, list): + cmd = cmd.split() + try: + if return_output: + return subprocess_check_output(cmd) + else: + return subprocess_check_call(cmd) + except subprocess.CalledProcessError as e: + exit_from_command_with_retcode(e.cmd, e.returncode) + + +def is_exe(path): + """ + Check if a given path is an executable file. + From: http://stackoverflow.com/a/377028 + """ + + return os.path.isfile(path) and os.access(path, os.X_OK) + + +def which(program): + """ + Find and return the given program by its absolute path or 'None' if the program cannot be found. + From: http://stackoverflow.com/a/377028 + """ + + fpath = os.path.split(program)[0] + + if fpath: + if is_exe(program): + return program + else: + for path in os.environ.get("PATH").split(os.pathsep): + path = path.strip('"') + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + return None diff --git a/dev/_site/tests/pr_merge_ability.sh b/dev/_site/tests/pr_merge_ability.sh new file mode 100755 index 0000000000000..d9a347fe24a8c --- /dev/null +++ b/dev/_site/tests/pr_merge_ability.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# +# This script follows the base format for testing pull requests against +# another branch and returning results to be published. More details can be +# found at dev/run-tests-jenkins. +# +# Arg1: The Github Pull Request Actual Commit +#+ known as `ghprbActualCommit` in `run-tests-jenkins` +# Arg2: The SHA1 hash +#+ known as `sha1` in `run-tests-jenkins` +# + +ghprbActualCommit="$1" +sha1="$2" + +# check PR merge-ability +if [ "${sha1}" == "${ghprbActualCommit}" ]; then + echo " * This patch **does not merge cleanly**." +else + echo " * This patch merges cleanly." +fi diff --git a/dev/_site/tests/pr_new_dependencies.sh b/dev/_site/tests/pr_new_dependencies.sh new file mode 100755 index 0000000000000..fdfb3c62aff58 --- /dev/null +++ b/dev/_site/tests/pr_new_dependencies.sh @@ -0,0 +1,117 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# This script follows the base format for testing pull requests against +# another branch and returning results to be published. More details can be +# found at dev/run-tests-jenkins. 
+# +# Arg1: The Github Pull Request Actual Commit +#+ known as `ghprbActualCommit` in `run-tests-jenkins` +# Arg2: The SHA1 hash +#+ known as `sha1` in `run-tests-jenkins` +# Arg3: Current PR Commit Hash +#+ the PR hash for the current commit +# + +ghprbActualCommit="$1" +sha1="$2" +current_pr_head="$3" + +MVN_BIN="build/mvn" +CURR_CP_FILE="my-classpath.txt" +MASTER_CP_FILE="master-classpath.txt" + +# First switch over to the master branch +git checkout -f master +# Find and copy all pom.xml files into a *.gate file that we can check +# against through various `git` changes +find -name "pom.xml" -exec cp {} {}.gate \; +# Switch back to the current PR +git checkout -f "${current_pr_head}" + +# Check if any *.pom files from the current branch are different from the master +difference_q="" +for p in $(find -name "pom.xml"); do + [[ -f "${p}" && -f "${p}.gate" ]] && \ + difference_q="${difference_q}$(diff $p.gate $p)" +done + +# If no pom files were changed we can easily say no new dependencies were added +if [ -z "${difference_q}" ]; then + echo " * This patch does not change any dependencies." 
+else + # Else we need to manually build spark to determine what, if any, dependencies + # were added into the Spark assembly jar + ${MVN_BIN} clean package dependency:build-classpath -DskipTests 2>/dev/null | \ + sed -n -e '/Building Spark Project Assembly/,$p' | \ + grep --context=1 -m 2 "Dependencies classpath:" | \ + head -n 3 | \ + tail -n 1 | \ + tr ":" "\n" | \ + rev | \ + cut -d "/" -f 1 | \ + rev | \ + sort > ${CURR_CP_FILE} + + # Checkout the master branch to compare against + git checkout -f master + + ${MVN_BIN} clean package dependency:build-classpath -DskipTests 2>/dev/null | \ + sed -n -e '/Building Spark Project Assembly/,$p' | \ + grep --context=1 -m 2 "Dependencies classpath:" | \ + head -n 3 | \ + tail -n 1 | \ + tr ":" "\n" | \ + rev | \ + cut -d "/" -f 1 | \ + rev | \ + sort > ${MASTER_CP_FILE} + + DIFF_RESULTS="`diff ${CURR_CP_FILE} ${MASTER_CP_FILE}`" + + if [ -z "${DIFF_RESULTS}" ]; then + echo " * This patch does not change any dependencies." + else + # Pretty print the new dependencies + added_deps=$(echo "${DIFF_RESULTS}" | grep "<" | cut -d' ' -f2 | awk '{printf " * \`"$1"\`\\n"}') + removed_deps=$(echo "${DIFF_RESULTS}" | grep ">" | cut -d' ' -f2 | awk '{printf " * \`"$1"\`\\n"}') + added_deps_text=" * This patch **adds the following new dependencies:**\n${added_deps}" + removed_deps_text=" * This patch **removes the following dependencies:**\n${removed_deps}" + + # Construct the final returned message with proper + return_mssg="" + [ -n "${added_deps}" ] && return_mssg="${added_deps_text}" + if [ -n "${removed_deps}" ]; then + if [ -n "${return_mssg}" ]; then + return_mssg="${return_mssg}\n${removed_deps_text}" + else + return_mssg="${removed_deps_text}" + fi + fi + echo "${return_mssg}" + fi + + # Remove the files we've left over + [ -f "${CURR_CP_FILE}" ] && rm -f "${CURR_CP_FILE}" + [ -f "${MASTER_CP_FILE}" ] && rm -f "${MASTER_CP_FILE}" + + # Clean up our mess from the Maven builds just in case + ${MVN_BIN} clean &>/dev/null +fi 
diff --git a/dev/_site/tests/pr_public_classes.sh b/dev/_site/tests/pr_public_classes.sh new file mode 100755 index 0000000000000..927295b88c963 --- /dev/null +++ b/dev/_site/tests/pr_public_classes.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# This script follows the base format for testing pull requests against +# another branch and returning results to be published. More details can be +# found at dev/run-tests-jenkins. +# +# Arg1: The Github Pull Request Actual Commit +#+ known as `ghprbActualCommit` in `run-tests-jenkins` +# Arg2: The SHA1 hash +#+ known as `sha1` in `run-tests-jenkins` +# + +# We diff master...$ghprbActualCommit because that gets us changes introduced in the PR +#+ and not anything else added to master since the PR was branched. 
+ +ghprbActualCommit="$1" +sha1="$2" + +source_files=$( + git diff master...$ghprbActualCommit --name-only `# diff patch against master from branch point` \ + | grep -v -e "\/test" `# ignore files in test directories` \ + | grep -e "\.py$" -e "\.java$" -e "\.scala$" `# include only code files` \ + | tr "\n" " " +) +new_public_classes=$( + git diff master...$ghprbActualCommit ${source_files} `# diff patch against master from branch point` \ + | grep "^\+" `# filter in only added lines` \ + | sed -r -e "s/^\+//g" `# remove the leading +` \ + | grep -e "trait " -e "class " `# filter in lines with these key words` \ + | grep -e "{" -e "(" `# filter in lines with these key words, too` \ + | grep -v -e "\@\@" -e "private" `# exclude lines with these words` \ + | grep -v -e "^// " -e "^/\*" -e "^ \* " `# exclude comment lines` \ + | sed -r -e "s/\{.*//g" `# remove from the { onwards` \ + | sed -r -e "s/\}//g" `# just in case, remove }; they mess the JSON` \ + | sed -r -e "s/\"/\\\\\"/g" `# escape double quotes; they mess the JSON` \ + | sed -r -e "s/^(.*)$/\`\1\`/g" `# surround with backticks for style` \ + | sed -r -e "s/^/ \* /g" `# prepend ' *' to start of line` \ + | sed -r -e "s/$/\\\n/g" `# append newline to end of line` \ + | tr -d "\n" `# remove actual LF characters` +) + +if [ -z "$new_public_classes" ]; then + echo " * This patch adds no public classes." 
+else + public_classes_note=" * This patch adds the following public classes _(experimental)_:" + echo "${public_classes_note}\n${new_public_classes}" +fi diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala similarity index 100% rename from examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics rename to examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala From 3af5fa334d57c6825bd4db7cf6b6ee564b97a2ff Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Fri, 13 Nov 2015 14:33:40 +0530 Subject: [PATCH 04/13] removed build files --- dev/_site/README.md | 5 - dev/_site/audit-release/README.md | 11 - dev/_site/audit-release/audit_release.py | 237 -------- .../audit-release/blank_maven_build/pom.xml | 43 -- .../audit-release/blank_sbt_build/build.sbt | 30 - .../audit-release/maven_app_core/input.txt | 8 - .../audit-release/maven_app_core/pom.xml | 52 -- .../src/main/java/SimpleApp.java | 42 -- .../audit-release/sbt_app_core/build.sbt | 28 - .../audit-release/sbt_app_core/input.txt | 8 - .../src/main/scala/SparkApp.scala | 63 -- .../audit-release/sbt_app_ganglia/build.sbt | 30 - .../src/main/scala/SparkApp.scala | 41 -- .../audit-release/sbt_app_graphx/build.sbt | 28 - .../src/main/scala/GraphxApp.scala | 55 -- .../audit-release/sbt_app_hive/build.sbt | 29 - dev/_site/audit-release/sbt_app_hive/data.txt | 9 - .../sbt_app_hive/src/main/scala/HiveApp.scala | 59 -- .../audit-release/sbt_app_kinesis/build.sbt | 28 - .../src/main/scala/SparkApp.scala | 35 -- dev/_site/audit-release/sbt_app_sql/build.sbt | 28 - .../sbt_app_sql/src/main/scala/SqlApp.scala | 61 -- .../audit-release/sbt_app_streaming/build.sbt | 28 - .../src/main/scala/StreamingApp.scala | 65 -- dev/_site/change-scala-version.sh | 70 --- dev/_site/change-version-to-2.10.sh | 23 - dev/_site/change-version-to-2.11.sh | 23 - dev/_site/check-license | 85 
--- .../create-release/generate-changelist.py | 148 ----- .../create-release/generate-contributors.py | 248 -------- dev/_site/create-release/known_translations | 167 ------ dev/_site/create-release/release-build.sh | 326 ---------- dev/_site/create-release/release-tag.sh | 79 --- dev/_site/create-release/releaseutils.py | 260 -------- .../create-release/translate-contributors.py | 253 -------- dev/_site/github_jira_sync.py | 147 ----- dev/_site/lint-python | 114 ---- dev/_site/lint-r | 41 -- dev/_site/lint-r.R | 37 -- dev/_site/lint-scala | 23 - dev/_site/merge_spark_pr.py | 453 -------------- dev/_site/mima | 54 -- dev/_site/run-tests | 23 - dev/_site/run-tests-jenkins | 28 - dev/_site/run-tests-jenkins.py | 228 ------- dev/_site/run-tests.py | 561 ------------------ dev/_site/scalastyle | 34 -- dev/_site/sparktestsupport/modules.py | 437 -------------- dev/_site/sparktestsupport/shellutils.py | 115 ---- dev/_site/tests/pr_merge_ability.sh | 39 -- dev/_site/tests/pr_new_dependencies.sh | 117 ---- dev/_site/tests/pr_public_classes.sh | 65 -- 52 files changed, 5221 deletions(-) delete mode 100644 dev/_site/README.md delete mode 100644 dev/_site/audit-release/README.md delete mode 100755 dev/_site/audit-release/audit_release.py delete mode 100644 dev/_site/audit-release/blank_maven_build/pom.xml delete mode 100644 dev/_site/audit-release/blank_sbt_build/build.sbt delete mode 100644 dev/_site/audit-release/maven_app_core/input.txt delete mode 100644 dev/_site/audit-release/maven_app_core/pom.xml delete mode 100644 dev/_site/audit-release/maven_app_core/src/main/java/SimpleApp.java delete mode 100644 dev/_site/audit-release/sbt_app_core/build.sbt delete mode 100644 dev/_site/audit-release/sbt_app_core/input.txt delete mode 100644 dev/_site/audit-release/sbt_app_core/src/main/scala/SparkApp.scala delete mode 100644 dev/_site/audit-release/sbt_app_ganglia/build.sbt delete mode 100644 dev/_site/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala delete mode 
100644 dev/_site/audit-release/sbt_app_graphx/build.sbt delete mode 100644 dev/_site/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala delete mode 100644 dev/_site/audit-release/sbt_app_hive/build.sbt delete mode 100644 dev/_site/audit-release/sbt_app_hive/data.txt delete mode 100644 dev/_site/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala delete mode 100644 dev/_site/audit-release/sbt_app_kinesis/build.sbt delete mode 100644 dev/_site/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala delete mode 100644 dev/_site/audit-release/sbt_app_sql/build.sbt delete mode 100644 dev/_site/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala delete mode 100644 dev/_site/audit-release/sbt_app_streaming/build.sbt delete mode 100644 dev/_site/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala delete mode 100755 dev/_site/change-scala-version.sh delete mode 100755 dev/_site/change-version-to-2.10.sh delete mode 100755 dev/_site/change-version-to-2.11.sh delete mode 100755 dev/_site/check-license delete mode 100755 dev/_site/create-release/generate-changelist.py delete mode 100755 dev/_site/create-release/generate-contributors.py delete mode 100644 dev/_site/create-release/known_translations delete mode 100755 dev/_site/create-release/release-build.sh delete mode 100755 dev/_site/create-release/release-tag.sh delete mode 100755 dev/_site/create-release/releaseutils.py delete mode 100755 dev/_site/create-release/translate-contributors.py delete mode 100755 dev/_site/github_jira_sync.py delete mode 100755 dev/_site/lint-python delete mode 100755 dev/_site/lint-r delete mode 100644 dev/_site/lint-r.R delete mode 100755 dev/_site/lint-scala delete mode 100755 dev/_site/merge_spark_pr.py delete mode 100755 dev/_site/mima delete mode 100755 dev/_site/run-tests delete mode 100755 dev/_site/run-tests-jenkins delete mode 100755 dev/_site/run-tests-jenkins.py delete mode 100755 dev/_site/run-tests.py delete mode 100755 dev/_site/scalastyle delete 
mode 100644 dev/_site/sparktestsupport/modules.py delete mode 100644 dev/_site/sparktestsupport/shellutils.py delete mode 100755 dev/_site/tests/pr_merge_ability.sh delete mode 100755 dev/_site/tests/pr_new_dependencies.sh delete mode 100755 dev/_site/tests/pr_public_classes.sh diff --git a/dev/_site/README.md b/dev/_site/README.md deleted file mode 100644 index 2b0f3d8ee8924..0000000000000 --- a/dev/_site/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Spark Developer Scripts -This directory contains scripts useful to developers when packaging, -testing, or committing to Spark. - -Many of these scripts require Apache credentials to work correctly. diff --git a/dev/_site/audit-release/README.md b/dev/_site/audit-release/README.md deleted file mode 100644 index f72f8c653a265..0000000000000 --- a/dev/_site/audit-release/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# Test Application Builds -This directory includes test applications which are built when auditing releases. You can -run them locally by setting appropriate environment variables. - -``` -$ cd sbt_app_core -$ SCALA_VERSION=2.10.5 \ - SPARK_VERSION=1.0.0-SNAPSHOT \ - SPARK_RELEASE_REPOSITORY=file:///home/patrick/.ivy2/local \ - sbt run -``` diff --git a/dev/_site/audit-release/audit_release.py b/dev/_site/audit-release/audit_release.py deleted file mode 100755 index 27d1dd784ce2e..0000000000000 --- a/dev/_site/audit-release/audit_release.py +++ /dev/null @@ -1,237 +0,0 @@ -#!/usr/bin/python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Audits binary and maven artifacts for a Spark release. -# Requires GPG and Maven. -# usage: -# python audit_release.py - -import os -import re -import shutil -import subprocess -import sys -import time -import urllib2 - -# Note: The following variables must be set before use! -RELEASE_URL = "http://people.apache.org/~andrewor14/spark-1.1.1-rc1/" -RELEASE_KEY = "XXXXXXXX" # Your 8-digit hex -RELEASE_REPOSITORY = "https://repository.apache.org/content/repositories/orgapachespark-1033" -RELEASE_VERSION = "1.1.1" -SCALA_VERSION = "2.10.5" -SCALA_BINARY_VERSION = "2.10" - -# Do not set these -LOG_FILE_NAME = "spark_audit_%s" % time.strftime("%h_%m_%Y_%I_%M_%S") -LOG_FILE = open(LOG_FILE_NAME, 'w') -WORK_DIR = "/tmp/audit_%s" % int(time.time()) -MAVEN_CMD = "mvn" -GPG_CMD = "gpg" -SBT_CMD = "sbt -Dsbt.log.noformat=true" - -# Track failures to print them at the end -failures = [] - -# Log a message. Use sparingly because this flushes every write. -def log(msg): - LOG_FILE.write(msg + "\n") - LOG_FILE.flush() - -def log_and_print(msg): - print msg - log(msg) - -# Prompt the user to delete the scratch directory used -def clean_work_files(): - response = raw_input("OK to delete scratch directory '%s'? 
(y/N) " % WORK_DIR) - if response == "y": - shutil.rmtree(WORK_DIR) - -# Run the given command and log its output to the log file -def run_cmd(cmd, exit_on_failure=True): - log("Running command: %s" % cmd) - ret = subprocess.call(cmd, shell=True, stdout=LOG_FILE, stderr=LOG_FILE) - if ret != 0 and exit_on_failure: - log_and_print("Command failed: %s" % cmd) - clean_work_files() - sys.exit(-1) - return ret - -def run_cmd_with_output(cmd): - log_and_print("Running command: %s" % cmd) - return subprocess.check_output(cmd, shell=True, stderr=LOG_FILE) - -# Test if the given condition is successful -# If so, print the pass message; otherwise print the failure message -def test(cond, msg): - return passed(msg) if cond else failed(msg) - -def passed(msg): - log_and_print("[PASSED] %s" % msg) - -def failed(msg): - failures.append(msg) - log_and_print("[**FAILED**] %s" % msg) - -def get_url(url): - return urllib2.urlopen(url).read() - -# If the path exists, prompt the user to delete it -# If the resource is not deleted, abort -def ensure_path_not_present(path): - full_path = os.path.expanduser(path) - if os.path.exists(full_path): - print "Found %s locally." % full_path - response = raw_input("This can interfere with testing published artifacts. OK to delete? (y/N) ") - if response == "y": - shutil.rmtree(full_path) - else: - print "Abort." - sys.exit(-1) - -log_and_print("|-------- Starting Spark audit tests for release %s --------|" % RELEASE_VERSION) -log_and_print("Log output can be found in %s" % LOG_FILE_NAME) - -original_dir = os.getcwd() - -# For each of these modules, we'll test an 'empty' application in sbt and -# maven that links against them. This will catch issues with messed up -# dependencies within those projects. 
-modules = [ - "spark-core", "spark-bagel", "spark-mllib", "spark-streaming", "spark-repl", - "spark-graphx", "spark-streaming-flume", "spark-streaming-kafka", - "spark-streaming-mqtt", "spark-streaming-twitter", "spark-streaming-zeromq", - "spark-catalyst", "spark-sql", "spark-hive", "spark-streaming-kinesis-asl" -] -modules = map(lambda m: "%s_%s" % (m, SCALA_BINARY_VERSION), modules) - -# Check for directories that might interfere with tests -local_ivy_spark = "~/.ivy2/local/org.apache.spark" -cache_ivy_spark = "~/.ivy2/cache/org.apache.spark" -local_maven_kafka = "~/.m2/repository/org/apache/kafka" -local_maven_kafka = "~/.m2/repository/org/apache/spark" -map(ensure_path_not_present, [local_ivy_spark, cache_ivy_spark, local_maven_kafka]) - -# SBT build tests -log_and_print("==== Building SBT modules ====") -os.chdir("blank_sbt_build") -os.environ["SPARK_VERSION"] = RELEASE_VERSION -os.environ["SCALA_VERSION"] = SCALA_VERSION -os.environ["SPARK_RELEASE_REPOSITORY"] = RELEASE_REPOSITORY -os.environ["SPARK_AUDIT_MASTER"] = "local" -for module in modules: - log("==== Building module %s in SBT ====" % module) - os.environ["SPARK_MODULE"] = module - ret = run_cmd("%s clean update" % SBT_CMD, exit_on_failure=False) - test(ret == 0, "SBT build against '%s' module" % module) -os.chdir(original_dir) - -# SBT application tests -log_and_print("==== Building SBT applications ====") -for app in ["sbt_app_core", "sbt_app_graphx", "sbt_app_streaming", "sbt_app_sql", "sbt_app_hive", "sbt_app_kinesis"]: - log("==== Building application %s in SBT ====" % app) - os.chdir(app) - ret = run_cmd("%s clean run" % SBT_CMD, exit_on_failure=False) - test(ret == 0, "SBT application (%s)" % app) - os.chdir(original_dir) - -# Maven build tests -os.chdir("blank_maven_build") -log_and_print("==== Building Maven modules ====") -for module in modules: - log("==== Building module %s in maven ====" % module) - cmd = ('%s --update-snapshots -Dspark.release.repository="%s" -Dspark.version="%s" ' - 
'-Dspark.module="%s" clean compile' % - (MAVEN_CMD, RELEASE_REPOSITORY, RELEASE_VERSION, module)) - ret = run_cmd(cmd, exit_on_failure=False) - test(ret == 0, "maven build against '%s' module" % module) -os.chdir(original_dir) - -# Maven application tests -log_and_print("==== Building Maven applications ====") -os.chdir("maven_app_core") -mvn_exec_cmd = ('%s --update-snapshots -Dspark.release.repository="%s" -Dspark.version="%s" ' - '-Dscala.binary.version="%s" clean compile ' - 'exec:java -Dexec.mainClass="SimpleApp"' % - (MAVEN_CMD, RELEASE_REPOSITORY, RELEASE_VERSION, SCALA_BINARY_VERSION)) -ret = run_cmd(mvn_exec_cmd, exit_on_failure=False) -test(ret == 0, "maven application (core)") -os.chdir(original_dir) - -# Binary artifact tests -if os.path.exists(WORK_DIR): - print "Working directory '%s' already exists" % WORK_DIR - sys.exit(-1) -os.mkdir(WORK_DIR) -os.chdir(WORK_DIR) - -index_page = get_url(RELEASE_URL) -artifact_regex = r = re.compile("") -artifacts = r.findall(index_page) - -# Verify artifact integrity -for artifact in artifacts: - log_and_print("==== Verifying download integrity for artifact: %s ====" % artifact) - - artifact_url = "%s/%s" % (RELEASE_URL, artifact) - key_file = "%s.asc" % artifact - run_cmd("wget %s" % artifact_url) - run_cmd("wget %s/%s" % (RELEASE_URL, key_file)) - run_cmd("wget %s%s" % (artifact_url, ".sha")) - - # Verify signature - run_cmd("%s --keyserver pgp.mit.edu --recv-key %s" % (GPG_CMD, RELEASE_KEY)) - run_cmd("%s %s" % (GPG_CMD, key_file)) - passed("Artifact signature verified.") - - # Verify md5 - my_md5 = run_cmd_with_output("%s --print-md MD5 %s" % (GPG_CMD, artifact)).strip() - release_md5 = get_url("%s.md5" % artifact_url).strip() - test(my_md5 == release_md5, "Artifact MD5 verified.") - - # Verify sha - my_sha = run_cmd_with_output("%s --print-md SHA512 %s" % (GPG_CMD, artifact)).strip() - release_sha = get_url("%s.sha" % artifact_url).strip() - test(my_sha == release_sha, "Artifact SHA verified.") - - # Verify 
Apache required files - dir_name = artifact.replace(".tgz", "") - run_cmd("tar xvzf %s" % artifact) - base_files = os.listdir(dir_name) - test("CHANGES.txt" in base_files, "Tarball contains CHANGES.txt file") - test("NOTICE" in base_files, "Tarball contains NOTICE file") - test("LICENSE" in base_files, "Tarball contains LICENSE file") - - os.chdir(WORK_DIR) - -# Report result -log_and_print("\n") -if len(failures) == 0: - log_and_print("*** ALL TESTS PASSED ***") -else: - log_and_print("XXXXX SOME TESTS DID NOT PASS XXXXX") - for f in failures: - log_and_print(" %s" % f) -os.chdir(original_dir) - -# Clean up -clean_work_files() - -log_and_print("|-------- Spark release audit complete --------|") diff --git a/dev/_site/audit-release/blank_maven_build/pom.xml b/dev/_site/audit-release/blank_maven_build/pom.xml deleted file mode 100644 index 02dd9046c9a49..0000000000000 --- a/dev/_site/audit-release/blank_maven_build/pom.xml +++ /dev/null @@ -1,43 +0,0 @@ - - - - - spark.audit - spark-audit - 4.0.0 - Spark Release Auditor - jar - 1.0 - - - Spray.cc repository - http://repo.spray.cc - - - Spark Staging Repo - ${spark.release.repository} - - - - - org.apache.spark - ${spark.module} - ${spark.version} - - - diff --git a/dev/_site/audit-release/blank_sbt_build/build.sbt b/dev/_site/audit-release/blank_sbt_build/build.sbt deleted file mode 100644 index 62815542e5bd9..0000000000000 --- a/dev/_site/audit-release/blank_sbt_build/build.sbt +++ /dev/null @@ -1,30 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -name := "Spark Release Auditor" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" % System.getenv.get("SPARK_MODULE") % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Eclipse Paho Repository" at "https://repo.eclipse.org/content/repositories/paho-releases/", - "Maven Repository" at "http://repo1.maven.org/maven2/", - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/maven_app_core/input.txt b/dev/_site/audit-release/maven_app_core/input.txt deleted file mode 100644 index 837b6f85ae97f..0000000000000 --- a/dev/_site/audit-release/maven_app_core/input.txt +++ /dev/null @@ -1,8 +0,0 @@ -a -b -c -d -a -b -c -d diff --git a/dev/_site/audit-release/maven_app_core/pom.xml b/dev/_site/audit-release/maven_app_core/pom.xml deleted file mode 100644 index b516396825573..0000000000000 --- a/dev/_site/audit-release/maven_app_core/pom.xml +++ /dev/null @@ -1,52 +0,0 @@ - - - - - spark.audit - spark-audit - 4.0.0 - Simple Project - jar - 1.0 - - - Spray.cc repository - http://repo.spray.cc - - - Spark Staging Repo - ${spark.release.repository} - - - - - org.apache.spark - spark-core_${scala.binary.version} - ${spark.version} - - - - - - - maven-compiler-plugin - 3.1 - - - - diff --git a/dev/_site/audit-release/maven_app_core/src/main/java/SimpleApp.java b/dev/_site/audit-release/maven_app_core/src/main/java/SimpleApp.java deleted file mode 100644 index 
5217689e7c092..0000000000000 --- a/dev/_site/audit-release/maven_app_core/src/main/java/SimpleApp.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.spark.api.java.*; -import org.apache.spark.api.java.function.Function; - -public class SimpleApp { - public static void main(String[] args) { - String logFile = "input.txt"; - JavaSparkContext sc = new JavaSparkContext("local", "Simple App"); - JavaRDD logData = sc.textFile(logFile).cache(); - - long numAs = logData.filter(new Function() { - public Boolean call(String s) { return s.contains("a"); } - }).count(); - - long numBs = logData.filter(new Function() { - public Boolean call(String s) { return s.contains("b"); } - }).count(); - - if (numAs != 2 || numBs != 2) { - System.out.println("Failed to parse log files with Spark"); - System.exit(-1); - } - System.out.println("Test succeeded"); - sc.stop(); - } -} diff --git a/dev/_site/audit-release/sbt_app_core/build.sbt b/dev/_site/audit-release/sbt_app_core/build.sbt deleted file mode 100644 index 291b1d6440bac..0000000000000 --- a/dev/_site/audit-release/sbt_app_core/build.sbt +++ /dev/null @@ -1,28 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) 
under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -name := "Simple Project" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-core" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_core/input.txt b/dev/_site/audit-release/sbt_app_core/input.txt deleted file mode 100644 index 837b6f85ae97f..0000000000000 --- a/dev/_site/audit-release/sbt_app_core/input.txt +++ /dev/null @@ -1,8 +0,0 @@ -a -b -c -d -a -b -c -d diff --git a/dev/_site/audit-release/sbt_app_core/src/main/scala/SparkApp.scala b/dev/_site/audit-release/sbt_app_core/src/main/scala/SparkApp.scala deleted file mode 100644 index 61d91c70e9709..0000000000000 --- a/dev/_site/audit-release/sbt_app_core/src/main/scala/SparkApp.scala +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package main.scala - -import scala.util.Try - -import org.apache.spark.SparkConf -import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ - -object SimpleApp { - def main(args: Array[String]) { - val conf = sys.env.get("SPARK_AUDIT_MASTER") match { - case Some(master) => new SparkConf().setAppName("Simple Spark App").setMaster(master) - case None => new SparkConf().setAppName("Simple Spark App") - } - val logFile = "input.txt" - val sc = new SparkContext(conf) - val logData = sc.textFile(logFile, 2).cache() - val numAs = logData.filter(line => line.contains("a")).count() - val numBs = logData.filter(line => line.contains("b")).count() - if (numAs != 2 || numBs != 2) { - println("Failed to parse log files with Spark") - System.exit(-1) - } - - // Regression test for SPARK-1167: Remove metrics-ganglia from default build due to LGPL issue - val foundConsole = Try(Class.forName("org.apache.spark.metrics.sink.ConsoleSink")).isSuccess - val foundGanglia = Try(Class.forName("org.apache.spark.metrics.sink.GangliaSink")).isSuccess - if (!foundConsole) { - println("Console sink not loaded via spark-core") - System.exit(-1) - } - if (foundGanglia) { - println("Ganglia sink was loaded via spark-core") - System.exit(-1) - } - - // Remove kinesis from default build due to ASL license issue - val foundKinesis = 
Try(Class.forName("org.apache.spark.streaming.kinesis.KinesisUtils")).isSuccess - if (foundKinesis) { - println("Kinesis was loaded via spark-core") - System.exit(-1) - } - } -} -// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_ganglia/build.sbt b/dev/_site/audit-release/sbt_app_ganglia/build.sbt deleted file mode 100644 index 6d9474acf5bbc..0000000000000 --- a/dev/_site/audit-release/sbt_app_ganglia/build.sbt +++ /dev/null @@ -1,30 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// - -name := "Ganglia Test" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-core" % System.getenv.get("SPARK_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-ganglia-lgpl" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala b/dev/_site/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala deleted file mode 100644 index 9f7ae75d0b477..0000000000000 --- a/dev/_site/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// scalastyle:off println -package main.scala - -import scala.util.Try - -import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ - -object SimpleApp { - def main(args: Array[String]) { - // Regression test for SPARK-1167: Remove metrics-ganglia from default build due to LGPL issue - val foundConsole = Try(Class.forName("org.apache.spark.metrics.sink.ConsoleSink")).isSuccess - val foundGanglia = Try(Class.forName("org.apache.spark.metrics.sink.GangliaSink")).isSuccess - if (!foundConsole) { - println("Console sink not loaded via spark-core") - System.exit(-1) - } - if (!foundGanglia) { - println("Ganglia sink not loaded via spark-ganglia-lgpl") - System.exit(-1) - } - } -} -// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_graphx/build.sbt b/dev/_site/audit-release/sbt_app_graphx/build.sbt deleted file mode 100644 index dd11245e67d44..0000000000000 --- a/dev/_site/audit-release/sbt_app_graphx/build.sbt +++ /dev/null @@ -1,28 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// - -name := "Simple Project" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-graphx" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala b/dev/_site/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala deleted file mode 100644 index 2f0b6ef9a5672..0000000000000 --- a/dev/_site/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// scalastyle:off println -package main.scala - -import org.apache.spark.{SparkContext, SparkConf} -import org.apache.spark.SparkContext._ -import org.apache.spark.graphx._ -import org.apache.spark.rdd.RDD - -object GraphXApp { - def main(args: Array[String]) { - val conf = sys.env.get("SPARK_AUDIT_MASTER") match { - case Some(master) => new SparkConf().setAppName("Simple GraphX App").setMaster(master) - case None => new SparkConf().setAppName("Simple Graphx App") - } - val sc = new SparkContext(conf) - SparkContext.jarOfClass(this.getClass).foreach(sc.addJar) - - val users: RDD[(VertexId, (String, String))] = - sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")), - (5L, ("franklin", "prof")), (2L, ("istoica", "prof")), - (4L, ("peter", "student")))) - val relationships: RDD[Edge[String]] = - sc.parallelize(Array(Edge(3L, 7L, "collab"), Edge(5L, 3L, "advisor"), - Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi"), - Edge(4L, 0L, "student"), Edge(5L, 0L, "colleague"))) - val defaultUser = ("John Doe", "Missing") - val graph = Graph(users, relationships, defaultUser) - // Notice that there is a user 0 (for which we have no information) connected to users - // 4 (peter) and 5 (franklin). - val triplets = graph.triplets.map(e => (e.srcAttr._1, e.dstAttr._1)).collect - if (!triplets.exists(_ == ("peter", "John Doe"))) { - println("Failed to run GraphX") - System.exit(-1) - } - println("Test succeeded") - } -} -// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_hive/build.sbt b/dev/_site/audit-release/sbt_app_hive/build.sbt deleted file mode 100644 index c8824f2b15e55..0000000000000 --- a/dev/_site/audit-release/sbt_app_hive/build.sbt +++ /dev/null @@ -1,29 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. 
-// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -name := "Simple Project" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-hive" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Maven Repository" at "http://repo1.maven.org/maven2/", - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_hive/data.txt b/dev/_site/audit-release/sbt_app_hive/data.txt deleted file mode 100644 index 0229e67f51e01..0000000000000 --- a/dev/_site/audit-release/sbt_app_hive/data.txt +++ /dev/null @@ -1,9 +0,0 @@ -0val_0 -1val_1 -2val_2 -3val_3 -4val_4 -5val_5 -6val_6 -7val_7 -9val_9 diff --git a/dev/_site/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala b/dev/_site/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala deleted file mode 100644 index 4a980ec071ae4..0000000000000 --- a/dev/_site/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package main.scala - -import scala.collection.mutable.{ListBuffer, Queue} - -import org.apache.spark.SparkConf -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.hive.HiveContext - -case class Person(name: String, age: Int) - -object SparkSqlExample { - - def main(args: Array[String]) { - val conf = sys.env.get("SPARK_AUDIT_MASTER") match { - case Some(master) => new SparkConf().setAppName("Simple Sql App").setMaster(master) - case None => new SparkConf().setAppName("Simple Sql App") - } - val sc = new SparkContext(conf) - val hiveContext = new HiveContext(sc) - - import hiveContext._ - sql("DROP TABLE IF EXISTS src") - sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") - sql("LOAD DATA LOCAL INPATH 'data.txt' INTO TABLE src") - val results = sql("FROM src SELECT key, value WHERE key >= 0 AND KEY < 5").collect() - results.foreach(println) - - def test(f: => Boolean, failureMsg: String) = { - if (!f) { - println(failureMsg) - System.exit(-1) - } - } - - test(results.size == 5, "Unexpected number of selected elements: " + results) - println("Test succeeded") - sc.stop() - } -} -// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_kinesis/build.sbt b/dev/_site/audit-release/sbt_app_kinesis/build.sbt deleted file mode 100644 index 981bc7957b5ed..0000000000000 --- 
a/dev/_site/audit-release/sbt_app_kinesis/build.sbt +++ /dev/null @@ -1,28 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -name := "Kinesis Test" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-streaming-kinesis-asl" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala b/dev/_site/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala deleted file mode 100644 index adc25b57d6aa5..0000000000000 --- a/dev/_site/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package main.scala - -import scala.util.Try - -import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ - -object SimpleApp { - def main(args: Array[String]) { - val foundKinesis = Try(Class.forName("org.apache.spark.streaming.kinesis.KinesisUtils")).isSuccess - if (!foundKinesis) { - println("Kinesis not loaded via kinesis-asl") - System.exit(-1) - } - } -} -// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_sql/build.sbt b/dev/_site/audit-release/sbt_app_sql/build.sbt deleted file mode 100644 index 9116180f71a44..0000000000000 --- a/dev/_site/audit-release/sbt_app_sql/build.sbt +++ /dev/null @@ -1,28 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// - -name := "Simple Project" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-sql" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala b/dev/_site/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala deleted file mode 100644 index 69c1154dc0955..0000000000000 --- a/dev/_site/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// scalastyle:off println -package main.scala - -import scala.collection.mutable.{ListBuffer, Queue} - -import org.apache.spark.SparkConf -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SQLContext - -case class Person(name: String, age: Int) - -object SparkSqlExample { - - def main(args: Array[String]) { - val conf = sys.env.get("SPARK_AUDIT_MASTER") match { - case Some(master) => new SparkConf().setAppName("Simple Sql App").setMaster(master) - case None => new SparkConf().setAppName("Simple Sql App") - } - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) - - import sqlContext.implicits._ - import sqlContext._ - - val people = sc.makeRDD(1 to 100, 10).map(x => Person(s"Name$x", x)).toDF() - people.registerTempTable("people") - val teenagers = sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") - val teenagerNames = teenagers.map(t => "Name: " + t(0)).collect() - teenagerNames.foreach(println) - - def test(f: => Boolean, failureMsg: String) = { - if (!f) { - println(failureMsg) - System.exit(-1) - } - } - - test(teenagerNames.size == 7, "Unexpected number of selected elements: " + teenagerNames) - println("Test succeeded") - sc.stop() - } -} -// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_streaming/build.sbt b/dev/_site/audit-release/sbt_app_streaming/build.sbt deleted file mode 100644 index cb369d516dd16..0000000000000 --- a/dev/_site/audit-release/sbt_app_streaming/build.sbt +++ /dev/null @@ -1,28 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -name := "Simple Project" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-streaming" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala b/dev/_site/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala deleted file mode 100644 index d6a074687f4a1..0000000000000 --- a/dev/_site/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// scalastyle:off println -package main.scala - -import scala.collection.mutable.{ListBuffer, Queue} - -import org.apache.spark.SparkConf -import org.apache.spark.rdd.RDD -import org.apache.spark.streaming.StreamingContext -import org.apache.spark.streaming._ - -object SparkStreamingExample { - - def main(args: Array[String]) { - val conf = sys.env.get("SPARK_AUDIT_MASTER") match { - case Some(master) => new SparkConf().setAppName("Simple Streaming App").setMaster(master) - case None => new SparkConf().setAppName("Simple Streaming App") - } - val ssc = new StreamingContext(conf, Seconds(1)) - val seen = ListBuffer[RDD[Int]]() - - val rdd1 = ssc.sparkContext.makeRDD(1 to 100, 10) - val rdd2 = ssc.sparkContext.makeRDD(1 to 1000, 10) - val rdd3 = ssc.sparkContext.makeRDD(1 to 10000, 10) - - val queue = Queue(rdd1, rdd2, rdd3) - val stream = ssc.queueStream(queue) - - stream.foreachRDD(rdd => seen += rdd) - ssc.start() - Thread.sleep(5000) - - def test(f: => Boolean, failureMsg: String) = { - if (!f) { - println(failureMsg) - System.exit(-1) - } - } - - val rddCounts = seen.map(rdd => rdd.count()).filter(_ > 0) - test(rddCounts.length == 3, "Did not collect three RDD's from stream") - test(rddCounts.toSet == Set(100, 1000, 10000), "Did not find expected streams") - - println("Test succeeded") - - ssc.stop() - } -} -// scalastyle:on println diff --git a/dev/_site/change-scala-version.sh b/dev/_site/change-scala-version.sh deleted file mode 100755 index d7975dfb6475c..0000000000000 --- a/dev/_site/change-scala-version.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -set -e - -VALID_VERSIONS=( 2.10 2.11 ) - -usage() { - echo "Usage: $(basename $0) [-h|--help] -where : - -h| --help Display this help text - valid version values : ${VALID_VERSIONS[*]} -" 1>&2 - exit 1 -} - -if [[ ($# -ne 1) || ( $1 == "--help") || $1 == "-h" ]]; then - usage -fi - -TO_VERSION=$1 - -check_scala_version() { - for i in ${VALID_VERSIONS[*]}; do [ $i = "$1" ] && return 0; done - echo "Invalid Scala version: $1. Valid versions: ${VALID_VERSIONS[*]}" 1>&2 - exit 1 -} - -check_scala_version "$TO_VERSION" - -if [ $TO_VERSION = "2.11" ]; then - FROM_VERSION="2.10" -else - FROM_VERSION="2.11" -fi - -sed_i() { - sed -e "$1" "$2" > "$2.tmp" && mv "$2.tmp" "$2" -} - -export -f sed_i - -BASEDIR=$(dirname $0)/.. -find "$BASEDIR" -name 'pom.xml' -not -path '*target*' -print \ - -exec bash -c "sed_i 's/\(artifactId.*\)_'$FROM_VERSION'/\1_'$TO_VERSION'/g' {}" \; - -# Also update in parent POM -# Match any scala binary version to ensure idempotency -sed_i '1,/[0-9]*\.[0-9]*[0-9]*\.[0-9]*'$TO_VERSION' "$JAR_DL" && mv "$JAR_DL" "$JAR" - elif [ $(command -v wget) ]; then - wget --quiet ${URL} -O "$JAR_DL" && mv "$JAR_DL" "$JAR" - else - printf "You do not have curl or wget installed, please install rat manually.\n" - exit -1 - fi - fi - - unzip -tq "$JAR" &> /dev/null - if [ $? -ne 0 ]; then - # We failed to download - rm "$JAR" - printf "Our attempt to download rat locally to ${JAR} failed. 
Please install rat manually.\n" - exit -1 - fi -} - -# Go to the Spark project root directory -FWDIR="$(cd "`dirname "$0"`"/..; pwd)" -cd "$FWDIR" - -if test -x "$JAVA_HOME/bin/java"; then - declare java_cmd="$JAVA_HOME/bin/java" -else - declare java_cmd=java -fi - -export RAT_VERSION=0.10 -export rat_jar="$FWDIR"/lib/apache-rat-${RAT_VERSION}.jar -mkdir -p "$FWDIR"/lib - -[[ -f "$rat_jar" ]] || acquire_rat_jar || { - echo "Download failed. Obtain the rat jar manually and place it at $rat_jar" - exit 1 -} - -$java_cmd -jar "$rat_jar" -E "$FWDIR"/.rat-excludes -d "$FWDIR" > rat-results.txt - -if [ $? -ne 0 ]; then - echo "RAT exited abnormally" - exit 1 -fi - -ERRORS="$(cat rat-results.txt | grep -e "??")" - -if test ! -z "$ERRORS"; then - echo "Could not find Apache license headers in the following files:" - echo "$ERRORS" - exit 1 -else - echo -e "RAT checks passed." -fi diff --git a/dev/_site/create-release/generate-changelist.py b/dev/_site/create-release/generate-changelist.py deleted file mode 100755 index 2e1a35a629342..0000000000000 --- a/dev/_site/create-release/generate-changelist.py +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Creates CHANGES.txt from git history. 
-# -# Usage: -# First set the new release version and old CHANGES.txt version in this file. -# Make sure you have SPARK_HOME set. -# $ python generate-changelist.py - - -import os -import sys -import subprocess -import time -import traceback - -SPARK_HOME = os.environ["SPARK_HOME"] -NEW_RELEASE_VERSION = "1.0.0" -PREV_RELEASE_GIT_TAG = "v0.9.1" - -CHANGELIST = "CHANGES.txt" -OLD_CHANGELIST = "%s.old" % (CHANGELIST) -NEW_CHANGELIST = "%s.new" % (CHANGELIST) -TMP_CHANGELIST = "%s.tmp" % (CHANGELIST) - -# date before first PR in TLP Spark repo -SPARK_REPO_CHANGE_DATE1 = time.strptime("2014-02-26", "%Y-%m-%d") -# date after last PR in incubator Spark repo -SPARK_REPO_CHANGE_DATE2 = time.strptime("2014-03-01", "%Y-%m-%d") -# Threshold PR number that differentiates PRs to TLP -# and incubator repos -SPARK_REPO_PR_NUM_THRESH = 200 - -LOG_FILE_NAME = "changes_%s" % time.strftime("%h_%m_%Y_%I_%M_%S") -LOG_FILE = open(LOG_FILE_NAME, 'w') - - -def run_cmd(cmd): - try: - print >> LOG_FILE, "Running command: %s" % cmd - output = subprocess.check_output(cmd, shell=True, stderr=LOG_FILE) - print >> LOG_FILE, "Output: %s" % output - return output - except: - traceback.print_exc() - cleanup() - sys.exit(1) - - -def append_to_changelist(string): - with open(TMP_CHANGELIST, "a") as f: - print >> f, string - - -def cleanup(ask=True): - if ask is True: - print "OK to delete temporary and log files? 
(y/N): " - response = raw_input() - if ask is False or (ask is True and response == "y"): - if os.path.isfile(TMP_CHANGELIST): - os.remove(TMP_CHANGELIST) - if os.path.isfile(OLD_CHANGELIST): - os.remove(OLD_CHANGELIST) - LOG_FILE.close() - os.remove(LOG_FILE_NAME) - - -print "Generating new %s for Spark release %s" % (CHANGELIST, NEW_RELEASE_VERSION) -os.chdir(SPARK_HOME) -if os.path.isfile(TMP_CHANGELIST): - os.remove(TMP_CHANGELIST) -if os.path.isfile(OLD_CHANGELIST): - os.remove(OLD_CHANGELIST) - -append_to_changelist("Spark Change Log") -append_to_changelist("----------------") -append_to_changelist("") -append_to_changelist("Release %s" % NEW_RELEASE_VERSION) -append_to_changelist("") - -print "Getting commits between tag %s and HEAD" % PREV_RELEASE_GIT_TAG -hashes = run_cmd("git log %s..HEAD --pretty='%%h'" % PREV_RELEASE_GIT_TAG).split() - -print "Getting details of %s commits" % len(hashes) -for h in hashes: - date = run_cmd("git log %s -1 --pretty='%%ad' --date=iso | head -1" % h).strip() - subject = run_cmd("git log %s -1 --pretty='%%s' | head -1" % h).strip() - body = run_cmd("git log %s -1 --pretty='%%b'" % h) - committer = run_cmd("git log %s -1 --pretty='%%cn <%%ce>' | head -1" % h).strip() - body_lines = body.split("\n") - - if "Merge pull" in subject: - # Parse old format commit message - append_to_changelist(" %s %s" % (h, date)) - append_to_changelist(" %s" % subject) - append_to_changelist(" [%s]" % body_lines[0]) - append_to_changelist("") - - elif "maven-release" not in subject: - # Parse new format commit message - # Get authors from commit message, committer otherwise - authors = [committer] - if "Author:" in body: - authors = [line.split(":")[1].strip() for line in body_lines if "Author:" in line] - - # Generate GitHub PR URL for easy access if possible - github_url = "" - if "Closes #" in body: - pr_num = [line.split()[1].lstrip("#") for line in body_lines if "Closes #" in line][0] - github_url = "github.com/apache/spark/pull/%s" % pr_num 
- day = time.strptime(date.split()[0], "%Y-%m-%d") - if (day < SPARK_REPO_CHANGE_DATE1 or - (day < SPARK_REPO_CHANGE_DATE2 and pr_num < SPARK_REPO_PR_NUM_THRESH)): - github_url = "github.com/apache/incubator-spark/pull/%s" % pr_num - - append_to_changelist(" %s" % subject) - append_to_changelist(" %s" % ', '.join(authors)) - # for author in authors: - # append_to_changelist(" %s" % author) - append_to_changelist(" %s" % date) - if len(github_url) > 0: - append_to_changelist(" Commit: %s, %s" % (h, github_url)) - else: - append_to_changelist(" Commit: %s" % h) - append_to_changelist("") - -# Append old change list -print "Appending changelist from tag %s" % PREV_RELEASE_GIT_TAG -run_cmd("git show %s:%s | tail -n +3 >> %s" % (PREV_RELEASE_GIT_TAG, CHANGELIST, TMP_CHANGELIST)) -run_cmd("cp %s %s" % (TMP_CHANGELIST, NEW_CHANGELIST)) -print "New change list generated as %s" % NEW_CHANGELIST -cleanup(False) diff --git a/dev/_site/create-release/generate-contributors.py b/dev/_site/create-release/generate-contributors.py deleted file mode 100755 index db9c680a4bad3..0000000000000 --- a/dev/_site/create-release/generate-contributors.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# This script automates the process of creating release notes. - -import os -import re -import sys - -from releaseutils import * - -# You must set the following before use! -JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") -RELEASE_TAG = os.environ.get("RELEASE_TAG", "v1.2.0-rc2") -PREVIOUS_RELEASE_TAG = os.environ.get("PREVIOUS_RELEASE_TAG", "v1.1.0") - -# If the release tags are not provided, prompt the user to provide them -while not tag_exists(RELEASE_TAG): - RELEASE_TAG = raw_input("Please provide a valid release tag: ") -while not tag_exists(PREVIOUS_RELEASE_TAG): - print "Please specify the previous release tag." - PREVIOUS_RELEASE_TAG = raw_input(\ - "For instance, if you are releasing v1.2.0, you should specify v1.1.0: ") - -# Gather commits found in the new tag but not in the old tag. -# This filters commits based on both the git hash and the PR number. -# If either is present in the old tag, then we ignore the commit. -print "Gathering new commits between tags %s and %s" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG) -release_commits = get_commits(RELEASE_TAG) -previous_release_commits = get_commits(PREVIOUS_RELEASE_TAG) -previous_release_hashes = set() -previous_release_prs = set() -for old_commit in previous_release_commits: - previous_release_hashes.add(old_commit.get_hash()) - if old_commit.get_pr_number(): - previous_release_prs.add(old_commit.get_pr_number()) -new_commits = [] -for this_commit in release_commits: - this_hash = this_commit.get_hash() - this_pr_number = this_commit.get_pr_number() - if this_hash in previous_release_hashes: - continue - if this_pr_number and this_pr_number in previous_release_prs: - continue - new_commits.append(this_commit) -if not new_commits: - sys.exit("There are no new commits between %s and %s!" 
% (PREVIOUS_RELEASE_TAG, RELEASE_TAG)) - -# Prompt the user for confirmation that the commit range is correct -print "\n==================================================================================" -print "JIRA server: %s" % JIRA_API_BASE -print "Release tag: %s" % RELEASE_TAG -print "Previous release tag: %s" % PREVIOUS_RELEASE_TAG -print "Number of commits in this range: %s" % len(new_commits) -print -def print_indented(_list): - for x in _list: print " %s" % x -if yesOrNoPrompt("Show all commits?"): - print_indented(new_commits) -print "==================================================================================\n" -if not yesOrNoPrompt("Does this look correct?"): - sys.exit("Ok, exiting") - -# Filter out special commits -releases = [] -maintenance = [] -reverts = [] -nojiras = [] -filtered_commits = [] -def is_release(commit_title): - return re.findall("\[release\]", commit_title.lower()) or\ - "preparing spark release" in commit_title.lower() or\ - "preparing development version" in commit_title.lower() or\ - "CHANGES.txt" in commit_title -def is_maintenance(commit_title): - return "maintenance" in commit_title.lower() or\ - "manually close" in commit_title.lower() -def has_no_jira(commit_title): - return not re.findall("SPARK-[0-9]+", commit_title.upper()) -def is_revert(commit_title): - return "revert" in commit_title.lower() -def is_docs(commit_title): - return re.findall("docs*", commit_title.lower()) or\ - "programming guide" in commit_title.lower() -for c in new_commits: - t = c.get_title() - if not t: continue - elif is_release(t): releases.append(c) - elif is_maintenance(t): maintenance.append(c) - elif is_revert(t): reverts.append(c) - elif is_docs(t): filtered_commits.append(c) # docs may not have JIRA numbers - elif has_no_jira(t): nojiras.append(c) - else: filtered_commits.append(c) - -# Warn against ignored commits -if releases or maintenance or reverts or nojiras: - print 
"\n==================================================================================" - if releases: print "Found %d release commits" % len(releases) - if maintenance: print "Found %d maintenance commits" % len(maintenance) - if reverts: print "Found %d revert commits" % len(reverts) - if nojiras: print "Found %d commits with no JIRA" % len(nojiras) - print "* Warning: these commits will be ignored.\n" - if yesOrNoPrompt("Show ignored commits?"): - if releases: print "Release (%d)" % len(releases); print_indented(releases) - if maintenance: print "Maintenance (%d)" % len(maintenance); print_indented(maintenance) - if reverts: print "Revert (%d)" % len(reverts); print_indented(reverts) - if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras) - print "==================== Warning: the above commits will be ignored ==================\n" -prompt_msg = "%d commits left to process after filtering. Ok to proceed?" % len(filtered_commits) -if not yesOrNoPrompt(prompt_msg): - sys.exit("Ok, exiting.") - -# Keep track of warnings to tell the user at the end -warnings = [] - -# Mapping from the invalid author name to its associated JIRA issues -# E.g. 
andrewor14 -> set("SPARK-2413", "SPARK-3551", "SPARK-3471") -invalid_authors = {} - -# Populate a map that groups issues and components by author -# It takes the form: Author name -> { Contribution type -> Spark components } -# For instance, -# { -# 'Andrew Or': { -# 'bug fixes': ['windows', 'core', 'web ui'], -# 'improvements': ['core'] -# }, -# 'Tathagata Das' : { -# 'bug fixes': ['streaming'] -# 'new feature': ['streaming'] -# } -# } -# -author_info = {} -jira_options = { "server": JIRA_API_BASE } -jira_client = JIRA(options = jira_options) -print "\n=========================== Compiling contributor list ===========================" -for commit in filtered_commits: - _hash = commit.get_hash() - title = commit.get_title() - issues = re.findall("SPARK-[0-9]+", title.upper()) - author = commit.get_author() - date = get_date(_hash) - # If the author name is invalid, keep track of it along - # with all associated issues so we can translate it later - if is_valid_author(author): - author = capitalize_author(author) - else: - if author not in invalid_authors: - invalid_authors[author] = set() - for issue in issues: - invalid_authors[author].add(issue) - # Parse components from the commit title, if any - commit_components = find_components(title, _hash) - # Populate or merge an issue into author_info[author] - def populate(issue_type, components): - components = components or [CORE_COMPONENT] # assume core if no components provided - if author not in author_info: - author_info[author] = {} - if issue_type not in author_info[author]: - author_info[author][issue_type] = set() - for component in components: - author_info[author][issue_type].add(component) - # Find issues and components associated with this commit - for issue in issues: - try: - jira_issue = jira_client.issue(issue) - jira_type = jira_issue.fields.issuetype.name - jira_type = translate_issue_type(jira_type, issue, warnings) - jira_components = [translate_component(c.name, _hash, warnings)\ - for c in 
jira_issue.fields.components] - all_components = set(jira_components + commit_components) - populate(jira_type, all_components) - except Exception as e: - print "Unexpected error:", e - # For docs without an associated JIRA, manually add it ourselves - if is_docs(title) and not issues: - populate("documentation", commit_components) - print " Processed commit %s authored by %s on %s" % (_hash, author, date) -print "==================================================================================\n" - -# Write to contributors file ordered by author names -# Each line takes the format " * Author name -- semi-colon delimited contributions" -# e.g. * Andrew Or -- Bug fixes in Windows, Core, and Web UI; improvements in Core -# e.g. * Tathagata Das -- Bug fixes and new features in Streaming -contributors_file = open(contributors_file_name, "w") -authors = author_info.keys() -authors.sort() -for author in authors: - contribution = "" - components = set() - issue_types = set() - for issue_type, comps in author_info[author].items(): - components.update(comps) - issue_types.add(issue_type) - # If there is only one component, mention it only once - # e.g. Bug fixes, improvements in MLlib - if len(components) == 1: - contribution = "%s in %s" % (nice_join(issue_types), next(iter(components))) - # Otherwise, group contributions by issue types instead of modules - # e.g. Bug fixes in MLlib, Core, and Streaming; documentation in YARN - else: - contributions = ["%s in %s" % (issue_type, nice_join(comps)) \ - for issue_type, comps in author_info[author].items()] - contribution = "; ".join(contributions) - # Do not use python's capitalize() on the whole string to preserve case - assert contribution - contribution = contribution[0].capitalize() + contribution[1:] - # If the author name is invalid, use an intermediate format that - # can be translated through translate-contributors.py later - # E.g. 
andrewor14/SPARK-3425/SPARK-1157/SPARK-6672 - if author in invalid_authors and invalid_authors[author]: - author = author + "/" + "/".join(invalid_authors[author]) - #line = " * %s -- %s" % (author, contribution) - line = author - contributors_file.write(line + "\n") -contributors_file.close() -print "Contributors list is successfully written to %s!" % contributors_file_name - -# Prompt the user to translate author names if necessary -if invalid_authors: - warnings.append("Found the following invalid authors:") - for a in invalid_authors: - warnings.append("\t%s" % a) - warnings.append("Please run './translate-contributors.py' to translate them.") - -# Log any warnings encountered in the process -if warnings: - print "\n============ Warnings encountered while creating the contributor list ============" - for w in warnings: print w - print "Please correct these in the final contributors list at %s." % contributors_file_name - print "==================================================================================\n" - diff --git a/dev/_site/create-release/known_translations b/dev/_site/create-release/known_translations deleted file mode 100644 index 3563fe3cc3c03..0000000000000 --- a/dev/_site/create-release/known_translations +++ /dev/null @@ -1,167 +0,0 @@ -# This is a mapping of names to be translated through translate-contributors.py -# The format expected on each line should be: - -CodingCat - Nan Zhu -CrazyJvm - Chao Chen -EugenCepoi - Eugen Cepoi -GraceH - Jie Huang -JerryLead - Lijie Xu -Leolh - Liu Hao -Lewuathe - Kai Sasaki -RongGu - Rong Gu -Shiti - Shiti Saxena -Victsm - Min Shen -WangTaoTheTonic - Wang Tao -XuTingjun - Tingjun Xu -YanTangZhai - Yantang Zhai -alexdebrie - Alex DeBrie -alokito - Alok Saldanha -anantasty - Anant Asthana -andrewor14 - Andrew Or -aniketbhatnagar - Aniket Bhatnagar -arahuja - Arun Ahuja -brkyvz - Burak Yavuz -chesterxgchen - Chester Chen -chiragaggarwal - Chirag Aggarwal -chouqin - Qiping Li -cocoatomo - Tomohiko K. 
-coderfi - Fairiz Azizi -coderxiang - Shuo Xiang -davies - Davies Liu -epahomov - Egor Pahomov -falaki - Hossein Falaki -freeman-lab - Jeremy Freeman -industrial-sloth - Jascha Swisher -jackylk - Jacky Li -jayunit100 - Jay Vyas -jerryshao - Saisai Shao -jkbradley - Joseph Bradley -lianhuiwang - Lianhui Wang -lirui-intel - Rui Li -luluorta - Lu Lu -luogankun - Gankun Luo -maji2014 - Derek Ma -mccheah - Matthew Cheah -mengxr - Xiangrui Meng -nartz - Nathan Artz -odedz - Oded Zimerman -ravipesala - Ravindra Pesala -roxchkplusony - Victor Tso -scwf - Wang Fei -shimingfei - Shiming Fei -surq - Surong Quan -suyanNone - Su Yan -tedyu - Ted Yu -tigerquoll - Dale Richardson -wangxiaojing - Xiaojing Wang -watermen - Yadong Qi -witgo - Guoqiang Li -xinyunh - Xinyun Huang -zsxwing - Shixiong Zhu -Bilna - Bilna P -DoingDone9 - Doing Done -Earne - Ernest -FlytxtRnD - Meethu Mathew -GenTang - Gen TANG -JoshRosen - Josh Rosen -MechCoder - Manoj Kumar -OopsOutOfMemory - Sheng Li -Peishen-Jia - Peishen Jia -SaintBacchus - Huang Zhaowei -azagrebin - Andrey Zagrebin -bzz - Alexander Bezzubov -fjiang6 - Fan Jiang -gasparms - Gaspar Munoz -guowei2 - Guo Wei -hhbyyh - Yuhao Yang -hseagle - Peng Xu -javadba - Stephen Boesch -jbencook - Ben Cook -kul - Kuldeep -ligangty - Gang Li -marsishandsome - Liangliang Gu -medale - Markus Dale -nemccarthy - Nathan McCarthy -nxwhite-str - Nate Crosswhite -seayi - Xiaohua Yi -tianyi - Yi Tian -uncleGen - Uncle Gen -viper-kun - Xu Kun -x1- - Yuri Saito -zapletal-martin - Martin Zapletal -zuxqoj - Shekhar Bansal -mingyukim - Mingyu Kim -sigmoidanalytics - Mayur Rustagi -AiHe - Ai He -BenFradet - Ben Fradet -FavioVazquez - Favio Vazquez -JaysonSunshine - Jayson Sunshine -Liuchang0812 - Liu Chang -Sephiroth-Lin - Sephiroth Lin -dobashim - Masaru Dobashi -ehnalis - Zoltan Zvara -emres - Emre Sevinc -gchen - Guancheng Chen -haiyangsea - Haiyang Sea -hlin09 - Hao Lin -hqzizania - Qian Huang -jeanlyn - Jean Lyn -jerluc - Jeremy A. 
Lucas -jrabary - Jaonary Rabarisoa -judynash - Judy Nash -kaka1992 - Chen Song -ksonj - Kalle Jepsen -kuromatsu-nobuyuki - Nobuyuki Kuromatsu -lazyman500 - Dong Xu -leahmcguire - Leah McGuire -mbittmann - Mark Bittmann -mbonaci - Marko Bonaci -meawoppl - Matthew Goodman -nyaapa - Arsenii Krasikov -phatak-dev - Madhukara Phatak -prabeesh - Prabeesh K -rakeshchalasani - Rakesh Chalasani -rekhajoshm - Rekha Joshi -sisihj - June He -szheng79 - Shuai Zheng -texasmichelle - Michelle Casbon -vinodkc - Vinod KC -yongtang - Yong Tang -ypcat - Pei-Lun Lee -zhichao-li - Zhichao Li -zzcclp - Zhichao Zhang -979969786 - Yuming Wang -Rosstin - Rosstin Murphy -ameyc - Amey Chaugule -animeshbaranawal - Animesh Baranawal -cafreeman - Chris Freeman -lee19 - Lee -lockwobr - Brian Lockwood -navis - Navis Ryu -pparkkin - Paavo Parkkinen -HyukjinKwon - Hyukjin Kwon -JDrit - Joseph Batchik -JuhongPark - Juhong Park -KaiXinXiaoLei - KaiXinXIaoLei -NamelessAnalyst - NamelessAnalyst -alyaxey - Alex Slusarenko -baishuo - Shuo Bai -fe2s - Oleksiy Dyagilev -felixcheung - Felix Cheung -feynmanliang - Feynman Liang -josepablocam - Jose Cambronero -kai-zeng - Kai Zeng -mosessky - mosessky -msannell - Michael Sannella -nishkamravi2 - Nishkam Ravi -noel-smith - Noel Smith -petz2000 - Patrick Baier -qiansl127 - Shilei Qian -rahulpalamuttam - Rahul Palamuttam -rowan000 - Rowan Chattaway -sarutak - Kousuke Saruta -sethah - Seth Hendrickson -small-wang - Wang Wei -stanzhai - Stan Zhai -tien-dungle - Tien-Dung Le -xuchenCN - Xu Chen -zhangjiajin - Zhang JiaJin diff --git a/dev/_site/create-release/release-build.sh b/dev/_site/create-release/release-build.sh deleted file mode 100755 index cb79e9eba06e2..0000000000000 --- a/dev/_site/create-release/release-build.sh +++ /dev/null @@ -1,326 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -function exit_with_usage { - cat << EOF -usage: release-build.sh -Creates build deliverables from a Spark commit. - -Top level targets are - package: Create binary packages and copy them to people.apache - docs: Build docs and copy them to people.apache - publish-snapshot: Publish snapshot release to Apache snapshots - publish-release: Publish a release to Apache release repo - -All other inputs are environment variables - -GIT_REF - Release tag or commit to build from -SPARK_VERSION - Release identifier used when publishing -SPARK_PACKAGE_VERSION - Release identifier in top level package directory -REMOTE_PARENT_DIR - Parent in which to create doc or release builds. -REMOTE_PARENT_MAX_LENGTH - If set, parent directory will be cleaned to only - have this number of subdirectories (by deleting old ones). WARNING: This deletes data. 
- -ASF_USERNAME - Username of ASF committer account -ASF_PASSWORD - Password of ASF committer account -ASF_RSA_KEY - RSA private key file for ASF committer account - -GPG_KEY - GPG key used to sign release artifacts -GPG_PASSPHRASE - Passphrase for GPG key -EOF - exit 1 -} - -set -e - -if [ $# -eq 0 ]; then - exit_with_usage -fi - -if [[ $@ == *"help"* ]]; then - exit_with_usage -fi - -for env in ASF_USERNAME ASF_RSA_KEY GPG_PASSPHRASE GPG_KEY; do - if [ -z "${!env}" ]; then - echo "ERROR: $env must be set to run this script" - exit_with_usage - fi -done - -# Commit ref to checkout when building -GIT_REF=${GIT_REF:-master} - -# Destination directory parent on remote server -REMOTE_PARENT_DIR=${REMOTE_PARENT_DIR:-/home/$ASF_USERNAME/public_html} - -SSH="ssh -o ConnectTimeout=300 -o StrictHostKeyChecking=no -i $ASF_RSA_KEY" -GPG="gpg --no-tty --batch" -NEXUS_ROOT=https://repository.apache.org/service/local/staging -NEXUS_PROFILE=d63f592e7eac0 # Profile for Spark staging uploads -BASE_DIR=$(pwd) - -MVN="build/mvn --force" -PUBLISH_PROFILES="-Pyarn -Phive -Phadoop-2.2" -PUBLISH_PROFILES="$PUBLISH_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl" - -rm -rf spark -git clone https://git-wip-us.apache.org/repos/asf/spark.git -cd spark -git checkout $GIT_REF -git_hash=`git rev-parse --short HEAD` -echo "Checked out Spark git hash $git_hash" - -if [ -z "$SPARK_VERSION" ]; then - SPARK_VERSION=$($MVN help:evaluate -Dexpression=project.version \ - | grep -v INFO | grep -v WARNING | grep -v Download) -fi - -if [ -z "$SPARK_PACKAGE_VERSION" ]; then - SPARK_PACKAGE_VERSION="${SPARK_VERSION}-$(date +%Y_%m_%d_%H_%M)-${git_hash}" -fi - -DEST_DIR_NAME="spark-$SPARK_PACKAGE_VERSION" -USER_HOST="$ASF_USERNAME@people.apache.org" - -git clean -d -f -x -rm .gitignore -rm -rf .git -cd .. 
- -if [ -n "$REMOTE_PARENT_MAX_LENGTH" ]; then - old_dirs=$($SSH $USER_HOST ls -t $REMOTE_PARENT_DIR | tail -n +$REMOTE_PARENT_MAX_LENGTH) - for old_dir in $old_dirs; do - echo "Removing directory: $old_dir" - $SSH $USER_HOST rm -r $REMOTE_PARENT_DIR/$old_dir - done -fi - -if [[ "$1" == "package" ]]; then - # Source and binary tarballs - echo "Packaging release tarballs" - cp -r spark spark-$SPARK_VERSION - tar cvzf spark-$SPARK_VERSION.tgz spark-$SPARK_VERSION - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour --output spark-$SPARK_VERSION.tgz.asc \ - --detach-sig spark-$SPARK_VERSION.tgz - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md MD5 spark-$SPARK_VERSION.tgz > \ - spark-$SPARK_VERSION.tgz.md5 - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ - SHA512 spark-$SPARK_VERSION.tgz > spark-$SPARK_VERSION.tgz.sha - rm -rf spark-$SPARK_VERSION - - # Updated for each binary build - make_binary_release() { - NAME=$1 - FLAGS=$2 - ZINC_PORT=$3 - cp -r spark spark-$SPARK_VERSION-bin-$NAME - - cd spark-$SPARK_VERSION-bin-$NAME - - # TODO There should probably be a flag to make-distribution to allow 2.11 support - if [[ $FLAGS == *scala-2.11* ]]; then - ./dev/change-scala-version.sh 2.11 - fi - - export ZINC_PORT=$ZINC_PORT - echo "Creating distribution: $NAME ($FLAGS)" - - # Get maven home set by MVN - MVN_HOME=`$MVN -version 2>&1 | grep 'Maven home' | awk '{print $NF}'` - - ./make-distribution.sh --name $NAME --mvn $MVN_HOME/bin/mvn --tgz $FLAGS \ - -DzincPort=$ZINC_PORT 2>&1 > ../binary-release-$NAME.log - cd .. - cp spark-$SPARK_VERSION-bin-$NAME/spark-$SPARK_VERSION-bin-$NAME.tgz . 
- - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour \ - --output spark-$SPARK_VERSION-bin-$NAME.tgz.asc \ - --detach-sig spark-$SPARK_VERSION-bin-$NAME.tgz - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ - MD5 spark-$SPARK_VERSION-bin-$NAME.tgz > \ - spark-$SPARK_VERSION-bin-$NAME.tgz.md5 - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ - SHA512 spark-$SPARK_VERSION-bin-$NAME.tgz > \ - spark-$SPARK_VERSION-bin-$NAME.tgz.sha - } - - # TODO: Check exit codes of children here: - # http://stackoverflow.com/questions/1570262/shell-get-exit-code-of-background-process - - # We increment the Zinc port each time to avoid OOM's and other craziness if multiple builds - # share the same Zinc server. - make_binary_release "hadoop1" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver" "3030" & - make_binary_release "hadoop1-scala2.11" "-Psparkr -Phadoop-1 -Phive -Dscala-2.11" "3031" & - make_binary_release "cdh4" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver -Dhadoop.version=2.0.0-mr1-cdh4.2.0" "3032" & - make_binary_release "hadoop2.3" "-Psparkr -Phadoop-2.3 -Phive -Phive-thriftserver -Pyarn" "3033" & - make_binary_release "hadoop2.4" "-Psparkr -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn" "3034" & - make_binary_release "hadoop2.6" "-Psparkr -Phadoop-2.6 -Phive -Phive-thriftserver -Pyarn" "3034" & - make_binary_release "hadoop2.4-without-hive" "-Psparkr -Phadoop-2.4 -Pyarn" "3037" & - make_binary_release "without-hadoop" "-Psparkr -Phadoop-provided -Pyarn" "3038" & - wait - rm -rf spark-$SPARK_VERSION-bin-*/ - - # Copy data - dest_dir="$REMOTE_PARENT_DIR/${DEST_DIR_NAME}-bin" - echo "Copying release tarballs to $dest_dir" - $SSH $USER_HOST mkdir $dest_dir - rsync -e "$SSH" spark-* $USER_HOST:$dest_dir - echo "Linking /latest to $dest_dir" - $SSH $USER_HOST rm -f "$REMOTE_PARENT_DIR/latest" - $SSH $USER_HOST ln -s $dest_dir "$REMOTE_PARENT_DIR/latest" - exit 0 -fi - -if [[ "$1" == "docs" ]]; then - # Documentation - cd spark - echo "Building Spark 
docs" - dest_dir="$REMOTE_PARENT_DIR/${DEST_DIR_NAME}-docs" - cd docs - # Compile docs with Java 7 to use nicer format - # TODO: Make configurable to add this: PRODUCTION=1 - PRODUCTION=1 RELEASE_VERSION="$SPARK_VERSION" jekyll build - echo "Copying release documentation to $dest_dir" - $SSH $USER_HOST mkdir $dest_dir - echo "Linking /latest to $dest_dir" - $SSH $USER_HOST rm -f "$REMOTE_PARENT_DIR/latest" - $SSH $USER_HOST ln -s $dest_dir "$REMOTE_PARENT_DIR/latest" - rsync -e "$SSH" -r _site/* $USER_HOST:$dest_dir - cd .. - exit 0 -fi - -if [[ "$1" == "publish-snapshot" ]]; then - cd spark - # Publish Spark to Maven release repo - echo "Deploying Spark SNAPSHOT at '$GIT_REF' ($git_hash)" - echo "Publish version is $SPARK_VERSION" - if [[ ! $SPARK_VERSION == *"SNAPSHOT"* ]]; then - echo "ERROR: Snapshots must have a version containing SNAPSHOT" - echo "ERROR: You gave version '$SPARK_VERSION'" - exit 1 - fi - # Coerce the requested version - $MVN versions:set -DnewVersion=$SPARK_VERSION - tmp_settings="tmp-settings.xml" - echo "" > $tmp_settings - echo "apache.snapshots.https$ASF_USERNAME" >> $tmp_settings - echo "$ASF_PASSWORD" >> $tmp_settings - echo "" >> $tmp_settings - - # Generate random point for Zinc - export ZINC_PORT=$(python -S -c "import random; print random.randrange(3030,4030)") - - $MVN -DzincPort=$ZINC_PORT --settings $tmp_settings -DskipTests $PUBLISH_PROFILES \ - -Phive-thriftserver deploy - ./dev/change-scala-version.sh 2.11 - $MVN -DzincPort=$ZINC_PORT -Dscala-2.11 --settings $tmp_settings \ - -DskipTests $PUBLISH_PROFILES clean deploy - - # Clean-up Zinc nailgun process - /usr/sbin/lsof -P |grep $ZINC_PORT | grep LISTEN | awk '{ print $2; }' | xargs kill - - rm $tmp_settings - cd .. 
- exit 0 -fi - -if [[ "$1" == "publish-release" ]]; then - cd spark - # Publish Spark to Maven release repo - echo "Publishing Spark checkout at '$GIT_REF' ($git_hash)" - echo "Publish version is $SPARK_VERSION" - # Coerce the requested version - $MVN versions:set -DnewVersion=$SPARK_VERSION - - # Using Nexus API documented here: - # https://support.sonatype.com/entries/39720203-Uploading-to-a-Staging-Repository-via-REST-API - echo "Creating Nexus staging repository" - repo_request="Apache Spark $SPARK_VERSION (commit $git_hash)" - out=$(curl -X POST -d "$repo_request" -u $ASF_USERNAME:$ASF_PASSWORD \ - -H "Content-Type:application/xml" -v \ - $NEXUS_ROOT/profiles/$NEXUS_PROFILE/start) - staged_repo_id=$(echo $out | sed -e "s/.*\(orgapachespark-[0-9]\{4\}\).*/\1/") - echo "Created Nexus staging repository: $staged_repo_id" - - tmp_repo=$(mktemp -d spark-repo-XXXXX) - - # Generate random point for Zinc - export ZINC_PORT=$(python -S -c "import random; print random.randrange(3030,4030)") - - $MVN -DzincPort=$ZINC_PORT -Dmaven.repo.local=$tmp_repo -DskipTests $PUBLISH_PROFILES \ - -Phive-thriftserver clean install - - ./dev/change-scala-version.sh 2.11 - - $MVN -DzincPort=$ZINC_PORT -Dmaven.repo.local=$tmp_repo -Dscala-2.11 \ - -DskipTests $PUBLISH_PROFILES clean install - - # Clean-up Zinc nailgun process - /usr/sbin/lsof -P |grep $ZINC_PORT | grep LISTEN | awk '{ print $2; }' | xargs kill - - ./dev/change-version-to-2.10.sh - - pushd $tmp_repo/org/apache/spark - - # Remove any extra files generated during install - find . -type f |grep -v \.jar |grep -v \.pom | xargs rm - - echo "Creating hash and signature files" - for file in $(find . 
-type f) - do - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --output $file.asc \ - --detach-sig --armour $file; - if [ $(command -v md5) ]; then - # Available on OS X; -q to keep only hash - md5 -q $file > $file.md5 - else - # Available on Linux; cut to keep only hash - md5sum $file | cut -f1 -d' ' > $file.md5 - fi - sha1sum $file | cut -f1 -d' ' > $file.sha1 - done - - nexus_upload=$NEXUS_ROOT/deployByRepositoryId/$staged_repo_id - echo "Uplading files to $nexus_upload" - for file in $(find . -type f) - do - # strip leading ./ - file_short=$(echo $file | sed -e "s/\.\///") - dest_url="$nexus_upload/org/apache/spark/$file_short" - echo " Uploading $file_short" - curl -u $ASF_USERNAME:$ASF_PASSWORD --upload-file $file_short $dest_url - done - - echo "Closing nexus staging repository" - repo_request="$staged_repo_idApache Spark $SPARK_VERSION (commit $git_hash)" - out=$(curl -X POST -d "$repo_request" -u $ASF_USERNAME:$ASF_PASSWORD \ - -H "Content-Type:application/xml" -v \ - $NEXUS_ROOT/profiles/$NEXUS_PROFILE/finish) - echo "Closed Nexus staging repository: $staged_repo_id" - popd - rm -rf $tmp_repo - cd .. - exit 0 -fi - -cd .. -rm -rf spark -echo "ERROR: expects to be called with 'package', 'docs', 'publish-release' or 'publish-snapshot'" diff --git a/dev/_site/create-release/release-tag.sh b/dev/_site/create-release/release-tag.sh deleted file mode 100755 index b0a3374becc6a..0000000000000 --- a/dev/_site/create-release/release-tag.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -function exit_with_usage { - cat << EOF -usage: tag-release.sh -Tags a Spark release on a particular branch. - -Inputs are specified with the following environment variables: -ASF_USERNAME - Apache Username -ASF_PASSWORD - Apache Password -GIT_NAME - Name to use with git -GIT_EMAIL - E-mail address to use with git -GIT_BRANCH - Git branch on which to make release -RELEASE_VERSION - Version used in pom files for release -RELEASE_TAG - Name of release tag -NEXT_VERSION - Development version after release -EOF - exit 1 -} - -set -e - -if [[ $@ == *"help"* ]]; then - exit_with_usage -fi - -for env in ASF_USERNAME ASF_PASSWORD RELEASE_VERSION RELEASE_TAG NEXT_VERSION GIT_EMAIL GIT_NAME GIT_BRANCH; do - if [ -z "${!env}" ]; then - echo "$env must be set to run this script" - exit 1 - fi -done - -ASF_SPARK_REPO="git-wip-us.apache.org/repos/asf/spark.git" -MVN="build/mvn --force" - -rm -rf spark -git clone https://$ASF_USERNAME:$ASF_PASSWORD@$ASF_SPARK_REPO -b $GIT_BRANCH -cd spark - -git config user.name "$GIT_NAME" -git config user.email $GIT_EMAIL - -# Create release version -$MVN versions:set -DnewVersion=$RELEASE_VERSION | grep -v "no value" # silence logs -git commit -a -m "Preparing Spark release $RELEASE_TAG" -echo "Creating tag $RELEASE_TAG at the head of $GIT_BRANCH" -git tag $RELEASE_TAG - -# TODO: It would be nice to do some verifications here -# i.e. 
check whether ec2 scripts have the new version - -# Create next version -$MVN versions:set -DnewVersion=$NEXT_VERSION | grep -v "no value" # silence logs -git commit -a -m "Preparing development version $NEXT_VERSION" - -# Push changes -git push origin $RELEASE_TAG -git push origin HEAD:$GIT_BRANCH - -cd .. -rm -rf spark diff --git a/dev/_site/create-release/releaseutils.py b/dev/_site/create-release/releaseutils.py deleted file mode 100755 index 7f152b7f53559..0000000000000 --- a/dev/_site/create-release/releaseutils.py +++ /dev/null @@ -1,260 +0,0 @@ -#!/usr/bin/env python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# This file contains helper methods used in creating a release. - -import re -import sys -from subprocess import Popen, PIPE - -try: - from jira.client import JIRA - # Old versions have JIRAError in exceptions package, new (0.5+) in utils. 
- try: - from jira.exceptions import JIRAError - except ImportError: - from jira.utils import JIRAError -except ImportError: - print "This tool requires the jira-python library" - print "Install using 'sudo pip install jira'" - sys.exit(-1) - -try: - from github import Github - from github import GithubException -except ImportError: - print "This tool requires the PyGithub library" - print "Install using 'sudo pip install PyGithub'" - sys.exit(-1) - -try: - import unidecode -except ImportError: - print "This tool requires the unidecode library to decode obscure github usernames" - print "Install using 'sudo pip install unidecode'" - sys.exit(-1) - -# Contributors list file name -contributors_file_name = "contributors.txt" - -# Prompt the user to answer yes or no until they do so -def yesOrNoPrompt(msg): - response = raw_input("%s [y/n]: " % msg) - while response != "y" and response != "n": - return yesOrNoPrompt(msg) - return response == "y" - -# Utility functions run git commands (written with Git 1.8.5) -def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0] -def run_cmd_error(cmd): return Popen(cmd, stdout=PIPE, stderr=PIPE).communicate()[1] -def get_date(commit_hash): - return run_cmd(["git", "show", "--quiet", "--pretty=format:%cd", commit_hash]) -def tag_exists(tag): - stderr = run_cmd_error(["git", "show", tag]) - return "error" not in stderr - -# A type-safe representation of a commit -class Commit: - def __init__(self, _hash, author, title, pr_number = None): - self._hash = _hash - self.author = author - self.title = title - self.pr_number = pr_number - def get_hash(self): return self._hash - def get_author(self): return self.author - def get_title(self): return self.title - def get_pr_number(self): return self.pr_number - def __str__(self): - closes_pr = "(Closes #%s)" % self.pr_number if self.pr_number else "" - return "%s %s %s %s" % (self._hash, self.author, self.title, closes_pr) - -# Return all commits that belong to the specified tag. 
-# -# Under the hood, this runs a `git log` on that tag and parses the fields -# from the command output to construct a list of Commit objects. Note that -# because certain fields reside in the commit description and cannot be parsed -# through the Github API itself, we need to do some intelligent regex parsing -# to extract those fields. -# -# This is written using Git 1.8.5. -def get_commits(tag): - commit_start_marker = "|=== COMMIT START MARKER ===|" - commit_end_marker = "|=== COMMIT END MARKER ===|" - field_end_marker = "|=== COMMIT FIELD END MARKER ===|" - log_format =\ - commit_start_marker + "%h" +\ - field_end_marker + "%an" +\ - field_end_marker + "%s" +\ - commit_end_marker + "%b" - output = run_cmd(["git", "log", "--quiet", "--pretty=format:" + log_format, tag]) - commits = [] - raw_commits = [c for c in output.split(commit_start_marker) if c] - for commit in raw_commits: - if commit.count(commit_end_marker) != 1: - print "Commit end marker not found in commit: " - for line in commit.split("\n"): print line - sys.exit(1) - # Separate commit digest from the body - # From the digest we extract the hash, author and the title - # From the body, we extract the PR number and the github username - [commit_digest, commit_body] = commit.split(commit_end_marker) - if commit_digest.count(field_end_marker) != 2: - sys.exit("Unexpected format in commit: %s" % commit_digest) - [_hash, author, title] = commit_digest.split(field_end_marker) - # The PR number and github username is in the commit message - # itself and cannot be accessed through any Github API - pr_number = None - match = re.search("Closes #([0-9]+) from ([^/\\s]+)/", commit_body) - if match: - [pr_number, github_username] = match.groups() - # If the author name is not valid, use the github - # username so we can translate it properly later - if not is_valid_author(author): - author = github_username - # Guard against special characters - author = unidecode.unidecode(unicode(author, "UTF-8")).strip() - 
commit = Commit(_hash, author, title, pr_number) - commits.append(commit) - return commits - -# Maintain a mapping for translating issue types to contributions in the release notes -# This serves an additional function of warning the user against unknown issue types -# Note: This list is partially derived from this link: -# https://issues.apache.org/jira/plugins/servlet/project-config/SPARK/issuetypes -# Keep these in lower case -known_issue_types = { - "bug": "bug fixes", - "build": "build fixes", - "dependency upgrade": "build fixes", - "improvement": "improvements", - "new feature": "new features", - "documentation": "documentation", - "test": "test", - "task": "improvement", - "sub-task": "improvement" -} - -# Maintain a mapping for translating component names when creating the release notes -# This serves an additional function of warning the user against unknown components -# Note: This list is largely derived from this link: -# https://issues.apache.org/jira/plugins/servlet/project-config/SPARK/components -CORE_COMPONENT = "Core" -known_components = { - "block manager": CORE_COMPONENT, - "build": CORE_COMPONENT, - "deploy": CORE_COMPONENT, - "documentation": CORE_COMPONENT, - "ec2": "EC2", - "examples": CORE_COMPONENT, - "graphx": "GraphX", - "input/output": CORE_COMPONENT, - "java api": "Java API", - "mesos": "Mesos", - "ml": "MLlib", - "mllib": "MLlib", - "project infra": "Project Infra", - "pyspark": "PySpark", - "shuffle": "Shuffle", - "spark core": CORE_COMPONENT, - "spark shell": CORE_COMPONENT, - "sql": "SQL", - "streaming": "Streaming", - "web ui": "Web UI", - "windows": "Windows", - "yarn": "YARN" -} - -# Translate issue types using a format appropriate for writing contributions -# If an unknown issue type is encountered, warn the user -def translate_issue_type(issue_type, issue_id, warnings): - issue_type = issue_type.lower() - if issue_type in known_issue_types: - return known_issue_types[issue_type] - else: - warnings.append("Unknown issue type 
\"%s\" (see %s)" % (issue_type, issue_id)) - return issue_type - -# Translate component names using a format appropriate for writing contributions -# If an unknown component is encountered, warn the user -def translate_component(component, commit_hash, warnings): - component = component.lower() - if component in known_components: - return known_components[component] - else: - warnings.append("Unknown component \"%s\" (see %s)" % (component, commit_hash)) - return component - -# Parse components in the commit message -# The returned components are already filtered and translated -def find_components(commit, commit_hash): - components = re.findall("\[\w*\]", commit.lower()) - components = [translate_component(c, commit_hash)\ - for c in components if c in known_components] - return components - -# Join a list of strings in a human-readable manner -# e.g. ["Juice"] -> "Juice" -# e.g. ["Juice", "baby"] -> "Juice and baby" -# e.g. ["Juice", "baby", "moon"] -> "Juice, baby, and moon" -def nice_join(str_list): - str_list = list(str_list) # sometimes it's a set - if not str_list: - return "" - elif len(str_list) == 1: - return next(iter(str_list)) - elif len(str_list) == 2: - return " and ".join(str_list) - else: - return ", ".join(str_list[:-1]) + ", and " + str_list[-1] - -# Return the full name of the specified user on Github -# If the user doesn't exist, return None -def get_github_name(author, github_client): - if github_client: - try: - return github_client.get_user(author).name - except GithubException as e: - # If this is not a "not found" exception - if e.status != 404: - raise e - return None - -# Return the full name of the specified user on JIRA -# If the user doesn't exist, return None -def get_jira_name(author, jira_client): - if jira_client: - try: - return jira_client.user(author).displayName - except JIRAError as e: - # If this is not a "not found" exception - if e.status_code != 404: - raise e - return None - -# Return whether the given name is in the 
form -def is_valid_author(author): - if not author: return False - return " " in author and not re.findall("[0-9]", author) - -# Capitalize the first letter of each word in the given author name -def capitalize_author(author): - if not author: return None - words = author.split(" ") - words = [w[0].capitalize() + w[1:] for w in words if w] - return " ".join(words) - diff --git a/dev/_site/create-release/translate-contributors.py b/dev/_site/create-release/translate-contributors.py deleted file mode 100755 index 86fa02d87b9a0..0000000000000 --- a/dev/_site/create-release/translate-contributors.py +++ /dev/null @@ -1,253 +0,0 @@ -#!/usr/bin/env python -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This script translates invalid authors in the contributors list generated -# by generate-contributors.py. When the script encounters an author name that -# is considered invalid, it searches Github and JIRA in an attempt to search -# for replacements. This tool runs in two modes: -# -# (1) Interactive mode: For each invalid author name, this script presents -# all candidate replacements to the user and awaits user response. In this -# mode, the user may also input a custom name. This is the default. 
-# -# (2) Non-interactive mode: For each invalid author name, this script replaces -# the name with the first valid candidate it can find. If there is none, it -# uses the original name. This can be enabled through the --non-interactive flag. - -import os -import sys - -from releaseutils import * - -# You must set the following before use! -JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") -JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None) -JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None) -GITHUB_API_TOKEN = os.environ.get("GITHUB_API_TOKEN", None) -if not JIRA_USERNAME or not JIRA_PASSWORD: - sys.exit("Both JIRA_USERNAME and JIRA_PASSWORD must be set") -if not GITHUB_API_TOKEN: - sys.exit("GITHUB_API_TOKEN must be set") - -# Write new contributors list to .final -if not os.path.isfile(contributors_file_name): - print "Contributors file %s does not exist!" % contributors_file_name - print "Have you run ./generate-contributors.py yet?" - sys.exit(1) -contributors_file = open(contributors_file_name, "r") -warnings = [] - -# In non-interactive mode, this script will choose the first replacement that is valid -INTERACTIVE_MODE = True -if len(sys.argv) > 1: - options = set(sys.argv[1:]) - if "--non-interactive" in options: - INTERACTIVE_MODE = False -if INTERACTIVE_MODE: - print "Running in interactive mode. To disable this, provide the --non-interactive flag." 
- -# Setup Github and JIRA clients -jira_options = { "server": JIRA_API_BASE } -jira_client = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD)) -github_client = Github(GITHUB_API_TOKEN) - -# Load known author translations that are cached locally -known_translations = {} -known_translations_file_name = "known_translations" -known_translations_file = open(known_translations_file_name, "r") -for line in known_translations_file: - if line.startswith("#"): continue - [old_name, new_name] = line.strip("\n").split(" - ") - known_translations[old_name] = new_name -known_translations_file.close() - -# Open again in case the user adds new mappings -known_translations_file = open(known_translations_file_name, "a") - -# Generate candidates for the given author. This should only be called if the given author -# name does not represent a full name as this operation is somewhat expensive. Under the -# hood, it makes several calls to the Github and JIRA API servers to find the candidates. -# -# This returns a list of (candidate name, source) 2-tuples. E.g. 
-# [ -# (NOT_FOUND, "No full name found for Github user andrewor14"), -# ("Andrew Or", "Full name of JIRA user andrewor14"), -# ("Andrew Orso", "Full name of SPARK-1444 assignee andrewor14"), -# ("Andrew Ordall", "Full name of SPARK-1663 assignee andrewor14"), -# (NOT_FOUND, "No assignee found for SPARK-1763") -# ] -NOT_FOUND = "Not found" -def generate_candidates(author, issues): - candidates = [] - # First check for full name of Github user - github_name = get_github_name(author, github_client) - if github_name: - candidates.append((github_name, "Full name of Github user %s" % author)) - else: - candidates.append((NOT_FOUND, "No full name found for Github user %s" % author)) - # Then do the same for JIRA user - jira_name = get_jira_name(author, jira_client) - if jira_name: - candidates.append((jira_name, "Full name of JIRA user %s" % author)) - else: - candidates.append((NOT_FOUND, "No full name found for JIRA user %s" % author)) - # Then do the same for the assignee of each of the associated JIRAs - # Note that a given issue may not have an assignee, or the assignee may not have a full name - for issue in issues: - try: - jira_issue = jira_client.issue(issue) - except JIRAError as e: - # Do not exit just because an issue is not found! - if e.status_code == 404: - warnings.append("Issue %s not found!" 
% issue) - continue - raise e - jira_assignee = jira_issue.fields.assignee - if jira_assignee: - user_name = jira_assignee.name - display_name = jira_assignee.displayName - if display_name: - candidates.append((display_name, "Full name of %s assignee %s" % (issue, user_name))) - else: - candidates.append((NOT_FOUND, "No full name found for %s assignee %" % (issue, user_name))) - else: - candidates.append((NOT_FOUND, "No assignee found for %s" % issue)) - # Guard against special characters in candidate names - # Note that the candidate name may already be in unicode (JIRA returns this) - for i, (candidate, source) in enumerate(candidates): - try: - candidate = unicode(candidate, "UTF-8") - except TypeError: - # already in unicode - pass - candidate = unidecode.unidecode(candidate).strip() - candidates[i] = (candidate, source) - return candidates - -# Translate each invalid author by searching for possible candidates from Github and JIRA -# In interactive mode, this script presents the user with a list of choices and have the user -# select from this list. Additionally, the user may also choose to enter a custom name. -# In non-interactive mode, this script picks the first valid author name from the candidates -# If no such name exists, the original name is used (without the JIRA numbers). 
-print "\n========================== Translating contributor list ==========================" -lines = contributors_file.readlines() -contributions = [] -for i, line in enumerate(lines): - temp_author = line.strip(" * ").split(" -- ")[0] - print "Processing author %s (%d/%d)" % (temp_author, i + 1, len(lines)) - if not temp_author: - error_msg = " ERROR: Expected the following format \" * -- \"\n" - error_msg += " ERROR: Actual = %s" % line - print error_msg - warnings.append(error_msg) - contributions.append(line) - continue - author = temp_author.split("/")[0] - # Use the local copy of known translations where possible - if author in known_translations: - line = line.replace(temp_author, known_translations[author]) - elif not is_valid_author(author): - new_author = author - issues = temp_author.split("/")[1:] - candidates = generate_candidates(author, issues) - # Print out potential replacement candidates along with the sources, e.g. - # [X] No full name found for Github user andrewor14 - # [X] No assignee found for SPARK-1763 - # [0] Andrew Or - Full name of JIRA user andrewor14 - # [1] Andrew Orso - Full name of SPARK-1444 assignee andrewor14 - # [2] Andrew Ordall - Full name of SPARK-1663 assignee andrewor14 - # [3] andrewor14 - Raw Github username - # [4] Custom - candidate_names = [] - bad_prompts = [] # Prompts that can't actually be selected; print these first. 
- good_prompts = [] # Prompts that contain valid choices - for candidate, source in candidates: - if candidate == NOT_FOUND: - bad_prompts.append(" [X] %s" % source) - else: - index = len(candidate_names) - candidate_names.append(candidate) - good_prompts.append(" [%d] %s - %s" % (index, candidate, source)) - raw_index = len(candidate_names) - custom_index = len(candidate_names) + 1 - for p in bad_prompts: print p - if bad_prompts: print " ---" - for p in good_prompts: print p - # In interactive mode, additionally provide "custom" option and await user response - if INTERACTIVE_MODE: - print " [%d] %s - Raw Github username" % (raw_index, author) - print " [%d] Custom" % custom_index - response = raw_input(" Your choice: ") - last_index = custom_index - while not response.isdigit() or int(response) > last_index: - response = raw_input(" Please enter an integer between 0 and %d: " % last_index) - response = int(response) - if response == custom_index: - new_author = raw_input(" Please type a custom name for this author: ") - elif response != raw_index: - new_author = candidate_names[response] - # In non-interactive mode, just pick the first candidate - else: - valid_candidate_names = [name for name, _ in candidates\ - if is_valid_author(name) and name != NOT_FOUND] - if valid_candidate_names: - new_author = valid_candidate_names[0] - # Finally, capitalize the author and replace the original one with it - # If the final replacement is still invalid, log a warning - if is_valid_author(new_author): - new_author = capitalize_author(new_author) - else: - warnings.append("Unable to find a valid name %s for author %s" % (author, temp_author)) - print " * Replacing %s with %s" % (author, new_author) - # If we are in interactive mode, prompt the user whether we want to remember this new mapping - if INTERACTIVE_MODE and\ - author not in known_translations and\ - yesOrNoPrompt(" Add mapping %s -> %s to known translations file?" 
% (author, new_author)): - known_translations_file.write("%s - %s\n" % (author, new_author)) - known_translations_file.flush() - line = line.replace(temp_author, author) - contributions.append(line) -print "==================================================================================\n" -contributors_file.close() -known_translations_file.close() - -# Sort the contributions before writing them to the new file. -# Additionally, check if there are any duplicate author rows. -# This could happen if the same user has both a valid full -# name (e.g. Andrew Or) and an invalid one (andrewor14). -# If so, warn the user about this at the end. -contributions.sort() -all_authors = set() -new_contributors_file_name = contributors_file_name + ".final" -new_contributors_file = open(new_contributors_file_name, "w") -for line in contributions: - author = line.strip(" * ").split(" -- ")[0] - if author in all_authors: - warnings.append("Detected duplicate author name %s. Please merge these manually." % author) - all_authors.add(author) - new_contributors_file.write(line) -new_contributors_file.close() - -print "Translated contributors list successfully written to %s!" % new_contributors_file_name - -# Log any warnings encountered in the process -if warnings: - print "\n========== Warnings encountered while translating the contributor list ===========" - for w in warnings: print w - print "Please manually correct these in the final contributors list at %s." % new_contributors_file_name - print "==================================================================================\n" - diff --git a/dev/_site/github_jira_sync.py b/dev/_site/github_jira_sync.py deleted file mode 100755 index 287f0ca24a7df..0000000000000 --- a/dev/_site/github_jira_sync.py +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/env python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Utility for updating JIRA's with information about Github pull requests - -import json -import os -import re -import sys -import urllib2 - -try: - import jira.client -except ImportError: - print "This tool requires the jira-python library" - print "Install using 'sudo pip install jira'" - sys.exit(-1) - -# User facing configs -GITHUB_API_BASE = os.environ.get("GITHUB_API_BASE", "https://api.github.com/repos/apache/spark") -JIRA_PROJECT_NAME = os.environ.get("JIRA_PROJECT_NAME", "SPARK") -JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") -JIRA_USERNAME = os.environ.get("JIRA_USERNAME", "apachespark") -JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", "XXX") -# Maximum number of updates to perform in one run -MAX_UPDATES = int(os.environ.get("MAX_UPDATES", "100000")) -# Cut-off for oldest PR on which to comment. Useful for avoiding -# "notification overload" when running for the first time. -MIN_COMMENT_PR = int(os.environ.get("MIN_COMMENT_PR", "1496")) - -# File used as an opitimization to store maximum previously seen PR -# Used mostly because accessing ASF JIRA is slow, so we want to avoid checking -# the state of JIRA's that are tied to PR's we've already looked at. 
-MAX_FILE = ".github-jira-max" - -def get_url(url): - try: - return urllib2.urlopen(url) - except urllib2.HTTPError as e: - print "Unable to fetch URL, exiting: %s" % url - sys.exit(-1) - -def get_json(urllib_response): - return json.load(urllib_response) - -# Return a list of (JIRA id, JSON dict) tuples: -# e.g. [('SPARK-1234', {.. json ..}), ('SPARK-5687', {.. json ..})} -def get_jira_prs(): - result = [] - has_next_page = True - page_num = 0 - while has_next_page: - page = get_url(GITHUB_API_BASE + "/pulls?page=%s&per_page=100" % page_num) - page_json = get_json(page) - - for pull in page_json: - jiras = re.findall(JIRA_PROJECT_NAME + "-[0-9]{4,5}", pull['title']) - for jira in jiras: - result = result + [(jira, pull)] - - # Check if there is another page - link_header = filter(lambda k: k.startswith("Link"), page.info().headers)[0] - if not "next"in link_header: - has_next_page = False - else: - page_num = page_num + 1 - return result - -def set_max_pr(max_val): - f = open(MAX_FILE, 'w') - f.write("%s" % max_val) - f.close() - print "Writing largest PR number seen: %s" % max_val - -def get_max_pr(): - if os.path.exists(MAX_FILE): - result = int(open(MAX_FILE, 'r').read()) - print "Read largest PR number previously seen: %s" % result - return result - else: - return 0 - -jira_client = jira.client.JIRA({'server': JIRA_API_BASE}, - basic_auth=(JIRA_USERNAME, JIRA_PASSWORD)) - -jira_prs = get_jira_prs() - -previous_max = get_max_pr() -print "Retrieved %s JIRA PR's from Github" % len(jira_prs) -jira_prs = [(k, v) for k, v in jira_prs if int(v['number']) > previous_max] -print "%s PR's remain after excluding visted ones" % len(jira_prs) - -num_updates = 0 -considered = [] -for issue, pr in sorted(jira_prs, key=lambda (k, v): int(v['number'])): - if num_updates >= MAX_UPDATES: - break - pr_num = int(pr['number']) - - print "Checking issue %s" % issue - considered = considered + [pr_num] - - url = pr['html_url'] - title = "[Github] Pull Request #%s (%s)" % 
(pr['number'], pr['user']['login']) - try: - existing_links = map(lambda l: l.raw['object']['url'], jira_client.remote_links(issue)) - except: - print "Failure reading JIRA %s (does it exist?)" % issue - print sys.exc_info()[0] - continue - - if url in existing_links: - continue - - icon = {"title": "Pull request #%s" % pr['number'], - "url16x16": "https://assets-cdn.github.com/favicon.ico"} - destination = {"title": title, "url": url, "icon": icon} - # For all possible fields see: - # https://developer.atlassian.com/display/JIRADEV/Fields+in+Remote+Issue+Links - # application = {"name": "Github pull requests", "type": "org.apache.spark.jira.github"} - jira_client.add_remote_link(issue, destination) - - comment = "User '%s' has created a pull request for this issue:" % pr['user']['login'] - comment = comment + ("\n%s" % pr['html_url']) - if pr_num >= MIN_COMMENT_PR: - jira_client.add_comment(issue, comment) - - print "Added link %s <-> PR #%s" % (issue, pr['number']) - num_updates = num_updates + 1 - -if len(considered) > 0: - set_max_pr(max(considered)) diff --git a/dev/_site/lint-python b/dev/_site/lint-python deleted file mode 100755 index 0b97213ae3dff..0000000000000 --- a/dev/_site/lint-python +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" -SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")" -PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/ ./dev/sparktestsupport" -PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/run-tests.py ./dev/run-tests-jenkins.py" -PEP8_REPORT_PATH="$SPARK_ROOT_DIR/dev/pep8-report.txt" -PYLINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/pylint-report.txt" -PYLINT_INSTALL_INFO="$SPARK_ROOT_DIR/dev/pylint-info.txt" - -cd "$SPARK_ROOT_DIR" - -# compileall: https://docs.python.org/2/library/compileall.html -python -B -m compileall -q -l $PATHS_TO_CHECK > "$PEP8_REPORT_PATH" -compile_status="${PIPESTATUS[0]}" - -# Get pep8 at runtime so that we don't rely on it being installed on the build server. -#+ See: https://github.com/apache/spark/pull/1744#issuecomment-50982162 -#+ TODOs: -#+ - Download pep8 from PyPI. It's more "official". -PEP8_VERSION="1.6.2" -PEP8_SCRIPT_PATH="$SPARK_ROOT_DIR/dev/pep8-$PEP8_VERSION.py" -PEP8_SCRIPT_REMOTE_PATH="https://raw.githubusercontent.com/jcrocholl/pep8/$PEP8_VERSION/pep8.py" - -if [ ! -e "$PEP8_SCRIPT_PATH" ]; then - curl --silent -o "$PEP8_SCRIPT_PATH" "$PEP8_SCRIPT_REMOTE_PATH" - curl_status="$?" - - if [ "$curl_status" -ne 0 ]; then - echo "Failed to download pep8.py from \"$PEP8_SCRIPT_REMOTE_PATH\"." - exit "$curl_status" - fi -fi - -# Easy install pylint in /dev/pylint. To easy_install into a directory, the PYTHONPATH should -# be set to the directory. -# dev/pylint should be appended to the PATH variable as well. -# Jenkins by default installs the pylint3 version, so for now this just checks the code quality -# of python3. -export "PYTHONPATH=$SPARK_ROOT_DIR/dev/pylint" -export "PYLINT_HOME=$PYTHONPATH" -export "PATH=$PYTHONPATH:$PATH" - -# if [ ! -d "$PYLINT_HOME" ]; then -# mkdir "$PYLINT_HOME" -# # Redirect the annoying pylint installation output. 
-# easy_install -d "$PYLINT_HOME" pylint==1.4.4 &>> "$PYLINT_INSTALL_INFO" -# easy_install_status="$?" -# -# if [ "$easy_install_status" -ne 0 ]; then -# echo "Unable to install pylint locally in \"$PYTHONPATH\"." -# cat "$PYLINT_INSTALL_INFO" -# exit "$easy_install_status" -# fi -# -# rm "$PYLINT_INSTALL_INFO" -# -# fi - -# There is no need to write this output to a file -#+ first, but we do so so that the check status can -#+ be output before the report, like with the -#+ scalastyle and RAT checks. -python "$PEP8_SCRIPT_PATH" --ignore=E402,E731,E241,W503,E226 $PATHS_TO_CHECK >> "$PEP8_REPORT_PATH" -pep8_status="${PIPESTATUS[0]}" - -if [ "$compile_status" -eq 0 -a "$pep8_status" -eq 0 ]; then - lint_status=0 -else - lint_status=1 -fi - -if [ "$lint_status" -ne 0 ]; then - echo "PEP8 checks failed." - cat "$PEP8_REPORT_PATH" -else - echo "PEP8 checks passed." -fi - -rm "$PEP8_REPORT_PATH" - -# for to_be_checked in "$PATHS_TO_CHECK" -# do -# pylint --rcfile="$SPARK_ROOT_DIR/pylintrc" $to_be_checked >> "$PYLINT_REPORT_PATH" -# done - -# if [ "${PIPESTATUS[0]}" -ne 0 ]; then -# lint_status=1 -# echo "Pylint checks failed." -# cat "$PYLINT_REPORT_PATH" -# else -# echo "Pylint checks passed." -# fi - -# rm "$PYLINT_REPORT_PATH" - -exit "$lint_status" diff --git a/dev/_site/lint-r b/dev/_site/lint-r deleted file mode 100755 index bfda0bca15eb7..0000000000000 --- a/dev/_site/lint-r +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" -SPARK_ROOT_DIR="$(dirname $SCRIPT_DIR)" -LINT_R_REPORT_FILE_NAME="$SPARK_ROOT_DIR/dev/lint-r-report.log" - - -if ! type "Rscript" > /dev/null; then - echo "ERROR: You should install R" - exit -fi - -`which Rscript` --vanilla "$SPARK_ROOT_DIR/dev/lint-r.R" "$SPARK_ROOT_DIR" | tee "$LINT_R_REPORT_FILE_NAME" - -NUM_LINES=`wc -l < "$LINT_R_REPORT_FILE_NAME" | awk '{print $1}'` -if [ "$NUM_LINES" = "0" ] ; then - lint_status=0 - echo "lintr checks passed." -else - lint_status=1 - echo "lintr checks failed." -fi - -exit "$lint_status" diff --git a/dev/_site/lint-r.R b/dev/_site/lint-r.R deleted file mode 100644 index 999eef571b824..0000000000000 --- a/dev/_site/lint-r.R +++ /dev/null @@ -1,37 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -argv <- commandArgs(TRUE) -SPARK_ROOT_DIR <- as.character(argv[1]) -LOCAL_LIB_LOC <- file.path(SPARK_ROOT_DIR, "R", "lib") - -# Checks if SparkR is installed in a local directory. -if (! library(SparkR, lib.loc = LOCAL_LIB_LOC, logical.return = TRUE)) { - stop("You should install SparkR in a local directory with `R/install-dev.sh`.") -} - -# Installs lintr from Github in a local directory. -# NOTE: The CRAN's version is too old to adapt to our rules. -if ("lintr" %in% row.names(installed.packages()) == FALSE) { - devtools::install_github("jimhester/lintr") -} - -library(lintr) -library(methods) -library(testthat) -path.to.package <- file.path(SPARK_ROOT_DIR, "R", "pkg") -lint_package(path.to.package, cache = FALSE) diff --git a/dev/_site/lint-scala b/dev/_site/lint-scala deleted file mode 100755 index c676dfdf4f44e..0000000000000 --- a/dev/_site/lint-scala +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" -SPARK_ROOT_DIR="$(dirname $SCRIPT_DIR)" - -"$SCRIPT_DIR/scalastyle" diff --git a/dev/_site/merge_spark_pr.py b/dev/_site/merge_spark_pr.py deleted file mode 100755 index bf1a000f46791..0000000000000 --- a/dev/_site/merge_spark_pr.py +++ /dev/null @@ -1,453 +0,0 @@ -#!/usr/bin/env python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Utility for creating well-formed pull request merges and pushing them to Apache. -# usage: ./apache-pr-merge.py (see config env vars below) -# -# This utility assumes you already have local a Spark git folder and that you -# have added remotes corresponding to both (i) the github apache Spark -# mirror and (ii) the apache git repo. 
- -import json -import os -import re -import subprocess -import sys -import urllib2 - -try: - import jira.client - JIRA_IMPORTED = True -except ImportError: - JIRA_IMPORTED = False - -# Location of your Spark git development area -SPARK_HOME = os.environ.get("SPARK_HOME", os.getcwd()) -# Remote name which points to the Gihub site -PR_REMOTE_NAME = os.environ.get("PR_REMOTE_NAME", "apache-github") -# Remote name which points to Apache git -PUSH_REMOTE_NAME = os.environ.get("PUSH_REMOTE_NAME", "apache") -# ASF JIRA username -JIRA_USERNAME = os.environ.get("JIRA_USERNAME", "") -# ASF JIRA password -JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", "") -# OAuth key used for issuing requests against the GitHub API. If this is not defined, then requests -# will be unauthenticated. You should only need to configure this if you find yourself regularly -# exceeding your IP's unauthenticated request rate limit. You can create an OAuth key at -# https://github.com/settings/tokens. This script only requires the "public_repo" scope. -GITHUB_OAUTH_KEY = os.environ.get("GITHUB_OAUTH_KEY") - - -GITHUB_BASE = "https://github.com/apache/spark/pull" -GITHUB_API_BASE = "https://api.github.com/repos/apache/spark" -JIRA_BASE = "https://issues.apache.org/jira/browse" -JIRA_API_BASE = "https://issues.apache.org/jira" -# Prefix added to temporary branches -BRANCH_PREFIX = "PR_TOOL" - - -def get_json(url): - try: - request = urllib2.Request(url) - if GITHUB_OAUTH_KEY: - request.add_header('Authorization', 'token %s' % GITHUB_OAUTH_KEY) - return json.load(urllib2.urlopen(request)) - except urllib2.HTTPError as e: - if "X-RateLimit-Remaining" in e.headers and e.headers["X-RateLimit-Remaining"] == '0': - print "Exceeded the GitHub API rate limit; see the instructions in " + \ - "dev/merge_spark_pr.py to configure an OAuth token for making authenticated " + \ - "GitHub requests." 
- else: - print "Unable to fetch URL, exiting: %s" % url - sys.exit(-1) - - -def fail(msg): - print msg - clean_up() - sys.exit(-1) - - -def run_cmd(cmd): - print cmd - if isinstance(cmd, list): - return subprocess.check_output(cmd) - else: - return subprocess.check_output(cmd.split(" ")) - - -def continue_maybe(prompt): - result = raw_input("\n%s (y/n): " % prompt) - if result.lower() != "y": - fail("Okay, exiting") - -def clean_up(): - print "Restoring head pointer to %s" % original_head - run_cmd("git checkout %s" % original_head) - - branches = run_cmd("git branch").replace(" ", "").split("\n") - - for branch in filter(lambda x: x.startswith(BRANCH_PREFIX), branches): - print "Deleting local branch %s" % branch - run_cmd("git branch -D %s" % branch) - - -# merge the requested PR and return the merge hash -def merge_pr(pr_num, target_ref, title, body, pr_repo_desc): - pr_branch_name = "%s_MERGE_PR_%s" % (BRANCH_PREFIX, pr_num) - target_branch_name = "%s_MERGE_PR_%s_%s" % (BRANCH_PREFIX, pr_num, target_ref.upper()) - run_cmd("git fetch %s pull/%s/head:%s" % (PR_REMOTE_NAME, pr_num, pr_branch_name)) - run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, target_ref, target_branch_name)) - run_cmd("git checkout %s" % target_branch_name) - - had_conflicts = False - try: - run_cmd(['git', 'merge', pr_branch_name, '--squash']) - except Exception as e: - msg = "Error merging: %s\nWould you like to manually fix-up this merge?" % e - continue_maybe(msg) - msg = "Okay, please fix any conflicts and 'git add' conflicting files... Finished?" 
- continue_maybe(msg) - had_conflicts = True - - commit_authors = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, - '--pretty=format:%an <%ae>']).split("\n") - distinct_authors = sorted(set(commit_authors), - key=lambda x: commit_authors.count(x), reverse=True) - primary_author = raw_input( - "Enter primary author in the format of \"name \" [%s]: " % - distinct_authors[0]) - if primary_author == "": - primary_author = distinct_authors[0] - - commits = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, - '--pretty=format:%h [%an] %s']).split("\n\n") - - merge_message_flags = [] - - merge_message_flags += ["-m", title] - if body is not None: - # We remove @ symbols from the body to avoid triggering e-mails - # to people every time someone creates a public fork of Spark. - merge_message_flags += ["-m", body.replace("@", "")] - - authors = "\n".join(["Author: %s" % a for a in distinct_authors]) - - merge_message_flags += ["-m", authors] - - if had_conflicts: - committer_name = run_cmd("git config --get user.name").strip() - committer_email = run_cmd("git config --get user.email").strip() - message = "This patch had conflicts when merged, resolved by\nCommitter: %s <%s>" % ( - committer_name, committer_email) - merge_message_flags += ["-m", message] - - # The string "Closes #%s" string is required for GitHub to correctly close the PR - merge_message_flags += ["-m", "Closes #%s from %s." % (pr_num, pr_repo_desc)] - - run_cmd(['git', 'commit', '--author="%s"' % primary_author] + merge_message_flags) - - continue_maybe("Merge complete (local ref %s). Push to %s?" % ( - target_branch_name, PUSH_REMOTE_NAME)) - - try: - run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, target_branch_name, target_ref)) - except Exception as e: - clean_up() - fail("Exception while pushing: %s" % e) - - merge_hash = run_cmd("git rev-parse %s" % target_branch_name)[:8] - clean_up() - print("Pull request #%s merged!" 
% pr_num) - print("Merge hash: %s" % merge_hash) - return merge_hash - - -def cherry_pick(pr_num, merge_hash, default_branch): - pick_ref = raw_input("Enter a branch name [%s]: " % default_branch) - if pick_ref == "": - pick_ref = default_branch - - pick_branch_name = "%s_PICK_PR_%s_%s" % (BRANCH_PREFIX, pr_num, pick_ref.upper()) - - run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, pick_ref, pick_branch_name)) - run_cmd("git checkout %s" % pick_branch_name) - - try: - run_cmd("git cherry-pick -sx %s" % merge_hash) - except Exception as e: - msg = "Error cherry-picking: %s\nWould you like to manually fix-up this merge?" % e - continue_maybe(msg) - msg = "Okay, please fix any conflicts and finish the cherry-pick. Finished?" - continue_maybe(msg) - - continue_maybe("Pick complete (local ref %s). Push to %s?" % ( - pick_branch_name, PUSH_REMOTE_NAME)) - - try: - run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, pick_branch_name, pick_ref)) - except Exception as e: - clean_up() - fail("Exception while pushing: %s" % e) - - pick_hash = run_cmd("git rev-parse %s" % pick_branch_name)[:8] - clean_up() - - print("Pull request #%s picked into %s!" 
% (pr_num, pick_ref)) - print("Pick hash: %s" % pick_hash) - return pick_ref - - -def fix_version_from_branch(branch, versions): - # Note: Assumes this is a sorted (newest->oldest) list of un-released versions - if branch == "master": - return versions[0] - else: - branch_ver = branch.replace("branch-", "") - return filter(lambda x: x.name.startswith(branch_ver), versions)[-1] - - -def resolve_jira_issue(merge_branches, comment, default_jira_id=""): - asf_jira = jira.client.JIRA({'server': JIRA_API_BASE}, - basic_auth=(JIRA_USERNAME, JIRA_PASSWORD)) - - jira_id = raw_input("Enter a JIRA id [%s]: " % default_jira_id) - if jira_id == "": - jira_id = default_jira_id - - try: - issue = asf_jira.issue(jira_id) - except Exception as e: - fail("ASF JIRA could not find %s\n%s" % (jira_id, e)) - - cur_status = issue.fields.status.name - cur_summary = issue.fields.summary - cur_assignee = issue.fields.assignee - if cur_assignee is None: - cur_assignee = "NOT ASSIGNED!!!" - else: - cur_assignee = cur_assignee.displayName - - if cur_status == "Resolved" or cur_status == "Closed": - fail("JIRA issue %s already has status '%s'" % (jira_id, cur_status)) - print ("=== JIRA %s ===" % jira_id) - print ("summary\t\t%s\nassignee\t%s\nstatus\t\t%s\nurl\t\t%s/%s\n" % ( - cur_summary, cur_assignee, cur_status, JIRA_BASE, jira_id)) - - versions = asf_jira.project_versions("SPARK") - versions = sorted(versions, key=lambda x: x.name, reverse=True) - versions = filter(lambda x: x.raw['released'] is False, versions) - # Consider only x.y.z versions - versions = filter(lambda x: re.match('\d+\.\d+\.\d+', x.name), versions) - - default_fix_versions = map(lambda x: fix_version_from_branch(x, versions).name, merge_branches) - for v in default_fix_versions: - # Handles the case where we have forked a release branch but not yet made the release. - # In this case, if the PR is committed to the master branch and the release branch, we - # only consider the release branch to be the fix version. E.g. 
it is not valid to have - # both 1.1.0 and 1.0.0 as fix versions. - (major, minor, patch) = v.split(".") - if patch == "0": - previous = "%s.%s.%s" % (major, int(minor) - 1, 0) - if previous in default_fix_versions: - default_fix_versions = filter(lambda x: x != v, default_fix_versions) - default_fix_versions = ",".join(default_fix_versions) - - fix_versions = raw_input("Enter comma-separated fix version(s) [%s]: " % default_fix_versions) - if fix_versions == "": - fix_versions = default_fix_versions - fix_versions = fix_versions.replace(" ", "").split(",") - - def get_version_json(version_str): - return filter(lambda v: v.name == version_str, versions)[0].raw - - jira_fix_versions = map(lambda v: get_version_json(v), fix_versions) - - resolve = filter(lambda a: a['name'] == "Resolve Issue", asf_jira.transitions(jira_id))[0] - resolution = filter(lambda r: r.raw['name'] == "Fixed", asf_jira.resolutions())[0] - asf_jira.transition_issue( - jira_id, resolve["id"], fixVersions = jira_fix_versions, - comment = comment, resolution = {'id': resolution.raw['id']}) - - print "Successfully resolved %s with fixVersions=%s!" % (jira_id, fix_versions) - - -def resolve_jira_issues(title, merge_branches, comment): - jira_ids = re.findall("SPARK-[0-9]{4,5}", title) - - if len(jira_ids) == 0: - resolve_jira_issue(merge_branches, comment) - for jira_id in jira_ids: - resolve_jira_issue(merge_branches, comment, jira_id) - - -def standardize_jira_ref(text): - """ - Standardize the [SPARK-XXXXX] [MODULE] prefix - Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. 
Issue" or "SPARK XXX [MLLIB]: Issue" to "[SPARK-XXX][MLLIB] Issue" - - >>> standardize_jira_ref("[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete is successful") - '[SPARK-5821][SQL] ParquetRelation2 CTAS should check if delete is successful' - >>> standardize_jira_ref("[SPARK-4123][Project Infra][WIP]: Show new dependencies added in pull requests") - '[SPARK-4123][PROJECT INFRA][WIP] Show new dependencies added in pull requests' - >>> standardize_jira_ref("[MLlib] Spark 5954: Top by key") - '[SPARK-5954][MLLIB] Top by key' - >>> standardize_jira_ref("[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl") - '[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl' - >>> standardize_jira_ref("SPARK-1094 Support MiMa for reporting binary compatibility accross versions.") - '[SPARK-1094] Support MiMa for reporting binary compatibility accross versions.' - >>> standardize_jira_ref("[WIP] [SPARK-1146] Vagrant support for Spark") - '[SPARK-1146][WIP] Vagrant support for Spark' - >>> standardize_jira_ref("SPARK-1032. If Yarn app fails before registering, app master stays aroun...") - '[SPARK-1032] If Yarn app fails before registering, app master stays aroun...' - >>> standardize_jira_ref("[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.") - '[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.' 
- >>> standardize_jira_ref("Additional information for users building from source code") - 'Additional information for users building from source code' - """ - jira_refs = [] - components = [] - - # If the string is compliant, no need to process any further - if (re.search(r'^\[SPARK-[0-9]{3,6}\](\[[A-Z0-9_\s,]+\] )+\S+', text)): - return text - - # Extract JIRA ref(s): - pattern = re.compile(r'(SPARK[-\s]*[0-9]{3,6})+', re.IGNORECASE) - for ref in pattern.findall(text): - # Add brackets, replace spaces with a dash, & convert to uppercase - jira_refs.append('[' + re.sub(r'\s+', '-', ref.upper()) + ']') - text = text.replace(ref, '') - - # Extract spark component(s): - # Look for alphanumeric chars, spaces, dashes, periods, and/or commas - pattern = re.compile(r'(\[[\w\s,-\.]+\])', re.IGNORECASE) - for component in pattern.findall(text): - components.append(component.upper()) - text = text.replace(component, '') - - # Cleanup any remaining symbols: - pattern = re.compile(r'^\W+(.*)', re.IGNORECASE) - if (pattern.search(text) is not None): - text = pattern.search(text).groups()[0] - - # Assemble full text (JIRA ref(s), module(s), remaining text) - clean_text = ''.join(jira_refs).strip() + ''.join(components).strip() + " " + text.strip() - - # Replace multiple spaces with a single space, e.g. if no jira refs and/or components were included - clean_text = re.sub(r'\s+', ' ', clean_text.strip()) - - return clean_text - -def main(): - global original_head - - os.chdir(SPARK_HOME) - original_head = run_cmd("git rev-parse HEAD")[:8] - - branches = get_json("%s/branches" % GITHUB_API_BASE) - branch_names = filter(lambda x: x.startswith("branch-"), [x['name'] for x in branches]) - # Assumes branch names can be sorted lexicographically - latest_branch = sorted(branch_names, reverse=True)[0] - - pr_num = raw_input("Which pull request would you like to merge? (e.g. 
34): ") - pr = get_json("%s/pulls/%s" % (GITHUB_API_BASE, pr_num)) - pr_events = get_json("%s/issues/%s/events" % (GITHUB_API_BASE, pr_num)) - - url = pr["url"] - - # Decide whether to use the modified title or not - modified_title = standardize_jira_ref(pr["title"]) - if modified_title != pr["title"]: - print "I've re-written the title as follows to match the standard format:" - print "Original: %s" % pr["title"] - print "Modified: %s" % modified_title - result = raw_input("Would you like to use the modified title? (y/n): ") - if result.lower() == "y": - title = modified_title - print "Using modified title:" - else: - title = pr["title"] - print "Using original title:" - print title - else: - title = pr["title"] - - body = pr["body"] - target_ref = pr["base"]["ref"] - user_login = pr["user"]["login"] - base_ref = pr["head"]["ref"] - pr_repo_desc = "%s/%s" % (user_login, base_ref) - - # Merged pull requests don't appear as merged in the GitHub API; - # Instead, they're closed by asfgit. - merge_commits = \ - [e for e in pr_events if e["actor"]["login"] == "asfgit" and e["event"] == "closed"] - - if merge_commits: - merge_hash = merge_commits[0]["commit_id"] - message = get_json("%s/commits/%s" % (GITHUB_API_BASE, merge_hash))["commit"]["message"] - - print "Pull request %s has already been merged, assuming you want to backport" % pr_num - commit_is_downloaded = run_cmd(['git', 'rev-parse', '--quiet', '--verify', - "%s^{commit}" % merge_hash]).strip() != "" - if not commit_is_downloaded: - fail("Couldn't find any merge commit for #%s, you may need to update HEAD." % pr_num) - - print "Found commit %s:\n%s" % (merge_hash, message) - cherry_pick(pr_num, merge_hash, latest_branch) - sys.exit(0) - - if not bool(pr["mergeable"]): - msg = "Pull request %s is not mergeable in its current form.\n" % pr_num + \ - "Continue? 
(experts only!)" - continue_maybe(msg) - - print ("\n=== Pull Request #%s ===" % pr_num) - print ("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" % ( - title, pr_repo_desc, target_ref, url)) - continue_maybe("Proceed with merging pull request #%s?" % pr_num) - - merged_refs = [target_ref] - - merge_hash = merge_pr(pr_num, target_ref, title, body, pr_repo_desc) - - pick_prompt = "Would you like to pick %s into another branch?" % merge_hash - while raw_input("\n%s (y/n): " % pick_prompt).lower() == "y": - merged_refs = merged_refs + [cherry_pick(pr_num, merge_hash, latest_branch)] - - if JIRA_IMPORTED: - if JIRA_USERNAME and JIRA_PASSWORD: - continue_maybe("Would you like to update an associated JIRA?") - jira_comment = "Issue resolved by pull request %s\n[%s/%s]" % (pr_num, GITHUB_BASE, pr_num) - resolve_jira_issues(title, merged_refs, jira_comment) - else: - print "JIRA_USERNAME and JIRA_PASSWORD not set" - print "Exiting without trying to close the associated JIRA." - else: - print "Could not find jira-python library. Run 'sudo pip install jira' to install." - print "Exiting without trying to close the associated JIRA." - -if __name__ == "__main__": - import doctest - (failure_count, test_count) = doctest.testmod() - if failure_count: - exit(-1) - - main() diff --git a/dev/_site/mima b/dev/_site/mima deleted file mode 100755 index 2952fa65d42ff..0000000000000 --- a/dev/_site/mima +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -set -o pipefail -set -e - -# Go to the Spark project root directory -FWDIR="$(cd "`dirname "$0"`"/..; pwd)" -cd "$FWDIR" - -echo -e "q\n" | build/sbt oldDeps/update -rm -f .generated-mima* - -generate_mima_ignore() { - SPARK_JAVA_OPTS="-XX:MaxPermSize=1g -Xmx2g" \ - ./bin/spark-class org.apache.spark.tools.GenerateMIMAIgnore -} - -# Generate Mima Ignore is called twice, first with latest built jars -# on the classpath and then again with previous version jars on the classpath. -# Because of a bug in GenerateMIMAIgnore that when old jars are ahead on classpath -# it did not process the new classes (which are in assembly jar). -generate_mima_ignore - -export SPARK_CLASSPATH="`find lib_managed \( -name '*spark*jar' -a -type f \) | tr "\\n" ":"`" -echo "SPARK_CLASSPATH=$SPARK_CLASSPATH" - -generate_mima_ignore - -echo -e "q\n" | build/sbt mima-report-binary-issues | grep -v -e "info.*Resolving" -ret_val=$? - -if [ $ret_val != 0 ]; then - echo "NOTE: Exceptions to binary compatibility can be added in project/MimaExcludes.scala" -fi - -rm -f .generated-mima* -exit $ret_val diff --git a/dev/_site/run-tests b/dev/_site/run-tests deleted file mode 100755 index 257d1e8d50bb4..0000000000000 --- a/dev/_site/run-tests +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -FWDIR="$(cd "`dirname $0`"/..; pwd)" -cd "$FWDIR" - -exec python -u ./dev/run-tests.py "$@" diff --git a/dev/_site/run-tests-jenkins b/dev/_site/run-tests-jenkins deleted file mode 100755 index e79accf9e987a..0000000000000 --- a/dev/_site/run-tests-jenkins +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Wrapper script that runs the Spark tests then reports QA results -# to github via its API. 
-# Environment variables are populated by the code here: -#+ https://github.com/jenkinsci/ghprb-plugin/blob/master/src/main/java/org/jenkinsci/plugins/ghprb/GhprbTrigger.java#L139 - -FWDIR="$(cd "`dirname $0`"/..; pwd)" -cd "$FWDIR" - -exec python -u ./dev/run-tests-jenkins.py "$@" diff --git a/dev/_site/run-tests-jenkins.py b/dev/_site/run-tests-jenkins.py deleted file mode 100755 index 623004310e189..0000000000000 --- a/dev/_site/run-tests-jenkins.py +++ /dev/null @@ -1,228 +0,0 @@ -#!/usr/bin/env python2 - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from __future__ import print_function -import os -import sys -import json -import urllib2 -import functools -import subprocess - -from sparktestsupport import SPARK_HOME, ERROR_CODES -from sparktestsupport.shellutils import run_cmd - - -def print_err(msg): - """ - Given a set of arguments, will print them to the STDERR stream - """ - print(msg, file=sys.stderr) - - -def post_message_to_github(msg, ghprb_pull_id): - print("Attempting to post to Github...") - - url = "https://api.github.com/repos/apache/spark/issues/" + ghprb_pull_id + "/comments" - github_oauth_key = os.environ["GITHUB_OAUTH_KEY"] - - posted_message = json.dumps({"body": msg}) - request = urllib2.Request(url, - headers={ - "Authorization": "token %s" % github_oauth_key, - "Content-Type": "application/json" - }, - data=posted_message) - try: - response = urllib2.urlopen(request) - - if response.getcode() == 201: - print(" > Post successful.") - except urllib2.HTTPError as http_e: - print_err("Failed to post message to Github.") - print_err(" > http_code: %s" % http_e.code) - print_err(" > api_response: %s" % http_e.read()) - print_err(" > data: %s" % posted_message) - except urllib2.URLError as url_e: - print_err("Failed to post message to Github.") - print_err(" > urllib2_status: %s" % url_e.reason[1]) - print_err(" > data: %s" % posted_message) - - -def pr_message(build_display_name, - build_url, - ghprb_pull_id, - short_commit_hash, - commit_url, - msg, - post_msg=''): - # align the arguments properly for string formatting - str_args = (build_display_name, - msg, - build_url, - ghprb_pull_id, - short_commit_hash, - commit_url, - str(' ' + post_msg + '.') if post_msg else '.') - return '**[Test build %s %s](%sconsoleFull)** for PR %s at commit [`%s`](%s)%s' % str_args - - -def run_pr_checks(pr_tests, ghprb_actual_commit, sha1): - """ - Executes a set of pull request checks to ease development and report issues with various - components such as style, linting, dependencies, compatibilities, 
etc. - @return a list of messages to post back to Github - """ - # Ensure we save off the current HEAD to revert to - current_pr_head = run_cmd(['git', 'rev-parse', 'HEAD'], return_output=True).strip() - pr_results = list() - - for pr_test in pr_tests: - test_name = pr_test + '.sh' - pr_results.append(run_cmd(['bash', os.path.join(SPARK_HOME, 'dev', 'tests', test_name), - ghprb_actual_commit, sha1], - return_output=True).rstrip()) - # Ensure, after each test, that we're back on the current PR - run_cmd(['git', 'checkout', '-f', current_pr_head]) - return pr_results - - -def run_tests(tests_timeout): - """ - Runs the `dev/run-tests` script and responds with the correct error message - under the various failure scenarios. - @return a tuple containing the test result code and the result note to post to Github - """ - - test_result_code = subprocess.Popen(['timeout', - tests_timeout, - os.path.join(SPARK_HOME, 'dev', 'run-tests')]).wait() - - failure_note_by_errcode = { - 1: 'executing the `dev/run-tests` script', # error to denote run-tests script failures - ERROR_CODES["BLOCK_GENERAL"]: 'some tests', - ERROR_CODES["BLOCK_RAT"]: 'RAT tests', - ERROR_CODES["BLOCK_SCALA_STYLE"]: 'Scala style tests', - ERROR_CODES["BLOCK_PYTHON_STYLE"]: 'Python style tests', - ERROR_CODES["BLOCK_R_STYLE"]: 'R style tests', - ERROR_CODES["BLOCK_DOCUMENTATION"]: 'to generate documentation', - ERROR_CODES["BLOCK_BUILD"]: 'to build', - ERROR_CODES["BLOCK_MIMA"]: 'MiMa tests', - ERROR_CODES["BLOCK_SPARK_UNIT_TESTS"]: 'Spark unit tests', - ERROR_CODES["BLOCK_PYSPARK_UNIT_TESTS"]: 'PySpark unit tests', - ERROR_CODES["BLOCK_SPARKR_UNIT_TESTS"]: 'SparkR unit tests', - ERROR_CODES["BLOCK_TIMEOUT"]: 'from timeout after a configured wait of \`%s\`' % ( - tests_timeout) - } - - if test_result_code == 0: - test_result_note = ' * This patch passes all tests.' - else: - test_result_note = ' * This patch **fails %s**.' 
% failure_note_by_errcode[test_result_code] - - return [test_result_code, test_result_note] - - -def main(): - # Important Environment Variables - # --- - # $ghprbActualCommit - # This is the hash of the most recent commit in the PR. - # The merge-base of this and master is the commit from which the PR was branched. - # $sha1 - # If the patch merges cleanly, this is a reference to the merge commit hash - # (e.g. "origin/pr/2606/merge"). - # If the patch does not merge cleanly, it is equal to $ghprbActualCommit. - # The merge-base of this and master in the case of a clean merge is the most recent commit - # against master. - ghprb_pull_id = os.environ["ghprbPullId"] - ghprb_actual_commit = os.environ["ghprbActualCommit"] - ghprb_pull_title = os.environ["ghprbPullTitle"] - sha1 = os.environ["sha1"] - - # Marks this build as a pull request build. - os.environ["AMP_JENKINS_PRB"] = "true" - # Switch to a Maven-based build if the PR title contains "test-maven": - if "test-maven" in ghprb_pull_title: - os.environ["AMPLAB_JENKINS_BUILD_TOOL"] = "maven" - # Switch the Hadoop profile based on the PR title: - if "test-hadoop1.0" in ghprb_pull_title: - os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop1.0" - if "test-hadoop2.2" in ghprb_pull_title: - os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.0" - if "test-hadoop2.2" in ghprb_pull_title: - os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.2" - if "test-hadoop2.3" in ghprb_pull_title: - os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.3" - - build_display_name = os.environ["BUILD_DISPLAY_NAME"] - build_url = os.environ["BUILD_URL"] - - commit_url = "https://github.com/apache/spark/commit/" + ghprb_actual_commit - - # GitHub doesn't auto-link short hashes when submitted via the API, unfortunately. 
:( - short_commit_hash = ghprb_actual_commit[0:7] - - # format: http://linux.die.net/man/1/timeout - # must be less than the timeout configured on Jenkins (currently 300m) - tests_timeout = "250m" - - # Array to capture all test names to run on the pull request. These tests are represented - # by their file equivalents in the dev/tests/ directory. - # - # To write a PR test: - # * the file must reside within the dev/tests directory - # * be an executable bash script - # * accept three arguments on the command line, the first being the Github PR long commit - # hash, the second the Github SHA1 hash, and the final the current PR hash - # * and, lastly, return string output to be included in the pr message output that will - # be posted to Github - pr_tests = [ - "pr_merge_ability", - "pr_public_classes" - # DISABLED (pwendell) "pr_new_dependencies" - ] - - # `bind_message_base` returns a function to generate messages for Github posting - github_message = functools.partial(pr_message, - build_display_name, - build_url, - ghprb_pull_id, - short_commit_hash, - commit_url) - - # post start message - post_message_to_github(github_message('has started'), ghprb_pull_id) - - pr_check_results = run_pr_checks(pr_tests, ghprb_actual_commit, sha1) - - test_result_code, test_result_note = run_tests(tests_timeout) - - # post end message - result_message = github_message('has finished') - result_message += '\n' + test_result_note + '\n' - result_message += '\n'.join(pr_check_results) - - post_message_to_github(result_message, ghprb_pull_id) - - sys.exit(test_result_code) - - -if __name__ == "__main__": - main() diff --git a/dev/_site/run-tests.py b/dev/_site/run-tests.py deleted file mode 100755 index 9e1abb0697192..0000000000000 --- a/dev/_site/run-tests.py +++ /dev/null @@ -1,561 +0,0 @@ -#!/usr/bin/env python2 - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function -import itertools -from optparse import OptionParser -import os -import random -import re -import sys -import subprocess -from collections import namedtuple - -from sparktestsupport import SPARK_HOME, USER_HOME, ERROR_CODES -from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which -import sparktestsupport.modules as modules - - -# ------------------------------------------------------------------------------------------------- -# Functions for traversing module dependency graph -# ------------------------------------------------------------------------------------------------- - - -def determine_modules_for_files(filenames): - """ - Given a list of filenames, return the set of modules that contain those files. - If a file is not associated with a more specific submodule, then this method will consider that - file to belong to the 'root' module. 
- - >>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/test/foo"])) - ['pyspark-core', 'sql'] - >>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])] - ['root'] - """ - changed_modules = set() - for filename in filenames: - matched_at_least_one_module = False - for module in modules.all_modules: - if module.contains_file(filename): - changed_modules.add(module) - matched_at_least_one_module = True - if not matched_at_least_one_module: - changed_modules.add(modules.root) - return changed_modules - - -def identify_changed_files_from_git_commits(patch_sha, target_branch=None, target_ref=None): - """ - Given a git commit and target ref, use the set of files changed in the diff in order to - determine which modules' tests should be run. - - >>> [x.name for x in determine_modules_for_files( \ - identify_changed_files_from_git_commits("fc0a1475ef", target_ref="5da21f07"))] - ['graphx'] - >>> 'root' in [x.name for x in determine_modules_for_files( \ - identify_changed_files_from_git_commits("50a0496a43", target_ref="6765ef9"))] - True - """ - if target_branch is None and target_ref is None: - raise AttributeError("must specify either target_branch or target_ref") - elif target_branch is not None and target_ref is not None: - raise AttributeError("must specify either target_branch or target_ref, not both") - if target_branch is not None: - diff_target = target_branch - run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)]) - else: - diff_target = target_ref - raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target], - universal_newlines=True) - # Remove any empty strings - return [f for f in raw_output.split('\n') if f] - - -def setup_test_environ(environ): - print("[info] Setup the following environment variables for tests: ") - for (k, v) in environ.items(): - print("%s=%s" % (k, v)) - os.environ[k] = v - - -def 
determine_modules_to_test(changed_modules): - """ - Given a set of modules that have changed, compute the transitive closure of those modules' - dependent modules in order to determine the set of modules that should be tested. - - >>> sorted(x.name for x in determine_modules_to_test([modules.root])) - ['root'] - >>> sorted(x.name for x in determine_modules_to_test([modules.graphx])) - ['examples', 'graphx'] - >>> x = sorted(x.name for x in determine_modules_to_test([modules.sql])) - >>> x # doctest: +NORMALIZE_WHITESPACE - ['examples', 'hive-thriftserver', 'mllib', 'pyspark-ml', \ - 'pyspark-mllib', 'pyspark-sql', 'sparkr', 'sql'] - """ - # If we're going to have to run all of the tests, then we can just short-circuit - # and return 'root'. No module depends on root, so if it appears then it will be - # in changed_modules. - if modules.root in changed_modules: - return [modules.root] - modules_to_test = set() - for module in changed_modules: - modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules)) - return modules_to_test.union(set(changed_modules)) - - -def determine_tags_to_exclude(changed_modules): - tags = [] - for m in modules.all_modules: - if m not in changed_modules: - tags += m.test_tags - return tags - - -# ------------------------------------------------------------------------------------------------- -# Functions for working with subprocesses and shell tools -# ------------------------------------------------------------------------------------------------- - - -def determine_java_executable(): - """Will return the path of the java executable that will be used by Spark's - tests or `None`""" - - # Any changes in the way that Spark's build detects java must be reflected - # here. 
Currently the build looks for $JAVA_HOME/bin/java then falls back to - # the `java` executable on the path - - java_home = os.environ.get("JAVA_HOME") - - # check if there is an executable at $JAVA_HOME/bin/java - java_exe = which(os.path.join(java_home, "bin", "java")) if java_home else None - # if the java_exe wasn't set, check for a `java` version on the $PATH - return java_exe if java_exe else which("java") - - -JavaVersion = namedtuple('JavaVersion', ['major', 'minor', 'patch', 'update']) - - -def determine_java_version(java_exe): - """Given a valid java executable will return its version in named tuple format - with accessors '.major', '.minor', '.patch', '.update'""" - - raw_output = subprocess.check_output([java_exe, "-version"], - stderr=subprocess.STDOUT, - universal_newlines=True) - - raw_output_lines = raw_output.split('\n') - - # find raw version string, eg 'java version "1.8.0_25"' - raw_version_str = next(x for x in raw_output_lines if " version " in x) - - match = re.search('(\d+)\.(\d+)\.(\d+)_(\d+)', raw_version_str) - - major = int(match.group(1)) - minor = int(match.group(2)) - patch = int(match.group(3)) - update = int(match.group(4)) - - return JavaVersion(major, minor, patch, update) - -# ------------------------------------------------------------------------------------------------- -# Functions for running the other build and test scripts -# ------------------------------------------------------------------------------------------------- - - -def set_title_and_block(title, err_block): - os.environ["CURRENT_BLOCK"] = str(ERROR_CODES[err_block]) - line_str = '=' * 72 - - print('') - print(line_str) - print(title) - print(line_str) - - -def run_apache_rat_checks(): - set_title_and_block("Running Apache RAT checks", "BLOCK_RAT") - run_cmd([os.path.join(SPARK_HOME, "dev", "check-license")]) - - -def run_scala_style_checks(): - set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE") - run_cmd([os.path.join(SPARK_HOME, "dev", 
"lint-scala")]) - - -def run_python_style_checks(): - set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE") - run_cmd([os.path.join(SPARK_HOME, "dev", "lint-python")]) - - -def run_sparkr_style_checks(): - set_title_and_block("Running R style checks", "BLOCK_R_STYLE") - - if which("R"): - # R style check should be executed after `install-dev.sh`. - # Since warnings about `no visible global function definition` appear - # without the installation. SEE ALSO: SPARK-9121. - run_cmd([os.path.join(SPARK_HOME, "dev", "lint-r")]) - else: - print("Ignoring SparkR style check as R was not found in PATH") - - -def build_spark_documentation(): - set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION") - os.environ["PRODUCTION"] = "1 jekyll build" - - os.chdir(os.path.join(SPARK_HOME, "docs")) - - jekyll_bin = which("jekyll") - - if not jekyll_bin: - print("[error] Cannot find a version of `jekyll` on the system; please", - " install one and retry to build documentation.") - sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) - else: - run_cmd([jekyll_bin, "build"]) - - os.chdir(SPARK_HOME) - - -def get_zinc_port(): - """ - Get a randomized port on which to start Zinc - """ - return random.randrange(3030, 4030) - - -def kill_zinc_on_port(zinc_port): - """ - Kill the Zinc process running on the given port, if one exists. 
- """ - cmd = ("/usr/sbin/lsof -P |grep %s | grep LISTEN " - "| awk '{ print $2; }' | xargs kill") % zinc_port - subprocess.check_call(cmd, shell=True) - - -def exec_maven(mvn_args=()): - """Will call Maven in the current directory with the list of mvn_args passed - in and returns the subprocess for any further processing""" - - zinc_port = get_zinc_port() - os.environ["ZINC_PORT"] = "%s" % zinc_port - zinc_flag = "-DzincPort=%s" % zinc_port - flags = [os.path.join(SPARK_HOME, "build", "mvn"), "--force", zinc_flag] - run_cmd(flags + mvn_args) - kill_zinc_on_port(zinc_port) - - -def exec_sbt(sbt_args=()): - """Will call SBT in the current directory with the list of mvn_args passed - in and returns the subprocess for any further processing""" - - sbt_cmd = [os.path.join(SPARK_HOME, "build", "sbt")] + sbt_args - - sbt_output_filter = re.compile("^.*[info].*Resolving" + "|" + - "^.*[warn].*Merging" + "|" + - "^.*[info].*Including") - - # NOTE: echo "q" is needed because sbt on encountering a build file - # with failure (either resolution or compilation) prompts the user for - # input either q, r, etc to quit or retry. This echo is there to make it - # not block. - echo_proc = subprocess.Popen(["echo", "\"q\n\""], stdout=subprocess.PIPE) - sbt_proc = subprocess.Popen(sbt_cmd, - stdin=echo_proc.stdout, - stdout=subprocess.PIPE) - echo_proc.wait() - for line in iter(sbt_proc.stdout.readline, ''): - if not sbt_output_filter.match(line): - print(line, end='') - retcode = sbt_proc.wait() - - if retcode > 0: - exit_from_command_with_retcode(sbt_cmd, retcode) - - -def get_hadoop_profiles(hadoop_version): - """ - For the given Hadoop version tag, return a list of SBT profile flags for - building and testing against that Hadoop version. 
- """ - - sbt_maven_hadoop_profiles = { - "hadoop1.0": ["-Phadoop-1", "-Dhadoop.version=1.2.1"], - "hadoop2.0": ["-Phadoop-1", "-Dhadoop.version=2.0.0-mr1-cdh4.1.1"], - "hadoop2.2": ["-Pyarn", "-Phadoop-2.2"], - "hadoop2.3": ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"], - "hadoop2.6": ["-Pyarn", "-Phadoop-2.6"], - } - - if hadoop_version in sbt_maven_hadoop_profiles: - return sbt_maven_hadoop_profiles[hadoop_version] - else: - print("[error] Could not find", hadoop_version, "in the list. Valid options", - " are", sbt_maven_hadoop_profiles.keys()) - sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) - - -def build_spark_maven(hadoop_version): - # Enable all of the profiles for the build: - build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags - mvn_goals = ["clean", "package", "-DskipTests"] - profiles_and_goals = build_profiles + mvn_goals - - print("[info] Building Spark (w/Hive 1.2.1) using Maven with these arguments: ", - " ".join(profiles_and_goals)) - - exec_maven(profiles_and_goals) - - -def build_spark_sbt(hadoop_version): - # Enable all of the profiles for the build: - build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags - sbt_goals = ["package", - "assembly/assembly", - "streaming-kafka-assembly/assembly", - "streaming-flume-assembly/assembly", - "streaming-mqtt-assembly/assembly", - "streaming-mqtt/test:assembly", - "streaming-kinesis-asl-assembly/assembly"] - profiles_and_goals = build_profiles + sbt_goals - - print("[info] Building Spark (w/Hive 1.2.1) using SBT with these arguments: ", - " ".join(profiles_and_goals)) - - exec_sbt(profiles_and_goals) - - -def build_apache_spark(build_tool, hadoop_version): - """Will build Spark against Hive v1.2.1 given the passed in build tool (either `sbt` or - `maven`). 
Defaults to using `sbt`.""" - - set_title_and_block("Building Spark", "BLOCK_BUILD") - - rm_r("lib_managed") - - if build_tool == "maven": - build_spark_maven(hadoop_version) - else: - build_spark_sbt(hadoop_version) - - -def detect_binary_inop_with_mima(): - set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA") - run_cmd([os.path.join(SPARK_HOME, "dev", "mima")]) - - -def run_scala_tests_maven(test_profiles): - mvn_test_goals = ["test", "--fail-at-end"] - - profiles_and_goals = test_profiles + mvn_test_goals - - print("[info] Running Spark tests using Maven with these arguments: ", - " ".join(profiles_and_goals)) - - exec_maven(profiles_and_goals) - - -def run_scala_tests_sbt(test_modules, test_profiles): - - sbt_test_goals = set(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules)) - - if not sbt_test_goals: - return - - profiles_and_goals = test_profiles + list(sbt_test_goals) - - print("[info] Running Spark tests using SBT with these arguments: ", - " ".join(profiles_and_goals)) - - exec_sbt(profiles_and_goals) - - -def run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags): - """Function to properly execute all tests passed in as a set from the - `determine_test_suites` function""" - set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") - - test_modules = set(test_modules) - - test_profiles = get_hadoop_profiles(hadoop_version) + \ - list(set(itertools.chain.from_iterable(m.build_profile_flags for m in test_modules))) - - if excluded_tags: - test_profiles += ['-Dtest.exclude.tags=' + ",".join(excluded_tags)] - - if build_tool == "maven": - run_scala_tests_maven(test_profiles) - else: - run_scala_tests_sbt(test_modules, test_profiles) - - -def run_python_tests(test_modules, parallelism): - set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") - - command = [os.path.join(SPARK_HOME, "python", "run-tests")] - if test_modules != [modules.root]: - 
command.append("--modules=%s" % ','.join(m.name for m in test_modules)) - command.append("--parallelism=%i" % parallelism) - run_cmd(command) - - -def run_sparkr_tests(): - set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS") - - if which("R"): - run_cmd([os.path.join(SPARK_HOME, "R", "run-tests.sh")]) - else: - print("Ignoring SparkR tests as R was not found in PATH") - - -def parse_opts(): - parser = OptionParser( - prog="run-tests" - ) - parser.add_option( - "-p", "--parallelism", type="int", default=4, - help="The number of suites to test in parallel (default %default)" - ) - - (opts, args) = parser.parse_args() - if args: - parser.error("Unsupported arguments: %s" % ' '.join(args)) - if opts.parallelism < 1: - parser.error("Parallelism cannot be less than 1") - return opts - - -def main(): - opts = parse_opts() - # Ensure the user home directory (HOME) is valid and is an absolute directory - if not USER_HOME or not os.path.isabs(USER_HOME): - print("[error] Cannot determine your home directory as an absolute path;", - " ensure the $HOME environment variable is set properly.") - sys.exit(1) - - os.chdir(SPARK_HOME) - - rm_r(os.path.join(SPARK_HOME, "work")) - rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark")) - rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark")) - - os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"]) - - java_exe = determine_java_executable() - - if not java_exe: - print("[error] Cannot find a version of `java` on the system; please", - " install one and retry.") - sys.exit(2) - - java_version = determine_java_version(java_exe) - - if java_version.minor < 8: - print("[warn] Java 8 tests will not run because JDK version is < 1.8.") - - # install SparkR - if which("R"): - run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")]) - else: - print("Can't install SparkR as R is was not found in PATH") - - if os.environ.get("AMPLAB_JENKINS"): - # if we're on the Amplab Jenkins build 
servers setup variables - # to reflect the environment settings - build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt") - hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3") - test_env = "amplab_jenkins" - # add path for Python3 in Jenkins if we're calling from a Jenkins machine - os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get("PATH") - else: - # else we're running locally and can use local settings - build_tool = "sbt" - hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.3") - test_env = "local" - - print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version, - "under environment", test_env) - - changed_modules = None - changed_files = None - if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"): - target_branch = os.environ["ghprbTargetBranch"] - changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch) - changed_modules = determine_modules_for_files(changed_files) - excluded_tags = determine_tags_to_exclude(changed_modules) - if not changed_modules: - changed_modules = [modules.root] - excluded_tags = [] - print("[info] Found the following changed modules:", - ", ".join(x.name for x in changed_modules)) - - # setup environment variables - # note - the 'root' module doesn't collect environment variables for all modules. Because the - # environment variables should not be set if a module is not changed, even if running the 'root' - # module. So here we should use changed_modules rather than test_modules. 
- test_environ = {} - for m in changed_modules: - test_environ.update(m.environ) - setup_test_environ(test_environ) - - test_modules = determine_modules_to_test(changed_modules) - - # license checks - run_apache_rat_checks() - - # style checks - if not changed_files or any(f.endswith(".scala") for f in changed_files): - run_scala_style_checks() - if not changed_files or any(f.endswith(".py") for f in changed_files): - run_python_style_checks() - if not changed_files or any(f.endswith(".R") for f in changed_files): - run_sparkr_style_checks() - - # determine if docs were changed and if we're inside the amplab environment - # note - the below commented out until *all* Jenkins workers can get `jekyll` installed - # if "DOCS" in changed_modules and test_env == "amplab_jenkins": - # build_spark_documentation() - - # spark build - build_apache_spark(build_tool, hadoop_version) - - # backwards compatibility checks - if build_tool == "sbt": - # Note: compatiblity tests only supported in sbt for now - detect_binary_inop_with_mima() - - # run the test suites - run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags) - - modules_with_python_tests = [m for m in test_modules if m.python_test_goals] - if modules_with_python_tests: - run_python_tests(modules_with_python_tests, opts.parallelism) - if any(m.should_run_r_tests for m in test_modules): - run_sparkr_tests() - - -def _test(): - import doctest - failure_count = doctest.testmod()[0] - if failure_count: - exit(-1) - -if __name__ == "__main__": - _test() - main() diff --git a/dev/_site/scalastyle b/dev/_site/scalastyle deleted file mode 100755 index ad93f7e85b27c..0000000000000 --- a/dev/_site/scalastyle +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -echo -e "q\n" | build/sbt -Pkinesis-asl -Phive -Phive-thriftserver scalastyle > scalastyle.txt -echo -e "q\n" | build/sbt -Pkinesis-asl -Phive -Phive-thriftserver test:scalastyle >> scalastyle.txt -# Check style with YARN built too -echo -e "q\n" | build/sbt -Pkinesis-asl -Pyarn -Phadoop-2.2 scalastyle >> scalastyle.txt -echo -e "q\n" | build/sbt -Pkinesis-asl -Pyarn -Phadoop-2.2 test:scalastyle >> scalastyle.txt - -ERRORS=$(cat scalastyle.txt | awk '{if($1~/error/)print}') -rm scalastyle.txt - -if test ! -z "$ERRORS"; then - echo -e "Scalastyle checks failed at following occurrences:\n$ERRORS" - exit 1 -else - echo -e "Scalastyle checks passed." -fi diff --git a/dev/_site/sparktestsupport/modules.py b/dev/_site/sparktestsupport/modules.py deleted file mode 100644 index d65547e04db4b..0000000000000 --- a/dev/_site/sparktestsupport/modules.py +++ /dev/null @@ -1,437 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import itertools -import re - -all_modules = [] - - -class Module(object): - """ - A module is the basic abstraction in our test runner script. Each module consists of a set of - source files, a set of test commands, and a set of dependencies on other modules. We use modules - to define a dependency graph that lets determine which tests to run based on which files have - changed. - """ - - def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), environ={}, - sbt_test_goals=(), python_test_goals=(), blacklisted_python_implementations=(), - test_tags=(), should_run_r_tests=False): - """ - Define a new module. - - :param name: A short module name, for display in logging and error messages. - :param dependencies: A set of dependencies for this module. This should only include direct - dependencies; transitive dependencies are resolved automatically. - :param source_file_regexes: a set of regexes that match source files belonging to this - module. These regexes are applied by attempting to match at the beginning of the - filename strings. - :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in - order to build and test this module (e.g. '-PprofileName'). - :param environ: A dict of environment variables that should be set when files in this - module are changed. - :param sbt_test_goals: A set of SBT test goals for testing this module. - :param python_test_goals: A set of Python test goals for testing this module. 
- :param blacklisted_python_implementations: A set of Python implementations that are not - supported by this module's Python components. The values in this set should match - strings returned by Python's `platform.python_implementation()`. - :param test_tags A set of tags that will be excluded when running unit tests if the module - is not explicitly changed. - :param should_run_r_tests: If true, changes in this module will trigger all R tests. - """ - self.name = name - self.dependencies = dependencies - self.source_file_prefixes = source_file_regexes - self.sbt_test_goals = sbt_test_goals - self.build_profile_flags = build_profile_flags - self.environ = environ - self.python_test_goals = python_test_goals - self.blacklisted_python_implementations = blacklisted_python_implementations - self.test_tags = test_tags - self.should_run_r_tests = should_run_r_tests - - self.dependent_modules = set() - for dep in dependencies: - dep.dependent_modules.add(self) - all_modules.append(self) - - def contains_file(self, filename): - return any(re.match(p, filename) for p in self.source_file_prefixes) - - -sql = Module( - name="sql", - dependencies=[], - source_file_regexes=[ - "sql/(?!hive-thriftserver)", - "bin/spark-sql", - ], - build_profile_flags=[ - "-Phive", - ], - sbt_test_goals=[ - "catalyst/test", - "sql/test", - "hive/test", - ], - test_tags=[ - "org.apache.spark.tags.ExtendedHiveTest" - ] -) - - -hive_thriftserver = Module( - name="hive-thriftserver", - dependencies=[sql], - source_file_regexes=[ - "sql/hive-thriftserver", - "sbin/start-thriftserver.sh", - ], - build_profile_flags=[ - "-Phive-thriftserver", - ], - sbt_test_goals=[ - "hive-thriftserver/test", - ] -) - - -graphx = Module( - name="graphx", - dependencies=[], - source_file_regexes=[ - "graphx/", - ], - sbt_test_goals=[ - "graphx/test" - ] -) - - -streaming = Module( - name="streaming", - dependencies=[], - source_file_regexes=[ - "streaming", - ], - sbt_test_goals=[ - "streaming/test", - ] -) - - -# 
Don't set the dependencies because changes in other modules should not trigger Kinesis tests. -# Kinesis tests depends on external Amazon kinesis service. We should run these tests only when -# files in streaming_kinesis_asl are changed, so that if Kinesis experiences an outage, we don't -# fail other PRs. -streaming_kinesis_asl = Module( - name="streaming-kinesis-asl", - dependencies=[], - source_file_regexes=[ - "extras/kinesis-asl/", - "extras/kinesis-asl-assembly/", - ], - build_profile_flags=[ - "-Pkinesis-asl", - ], - environ={ - "ENABLE_KINESIS_TESTS": "1" - }, - sbt_test_goals=[ - "streaming-kinesis-asl/test", - ] -) - - -streaming_zeromq = Module( - name="streaming-zeromq", - dependencies=[streaming], - source_file_regexes=[ - "external/zeromq", - ], - sbt_test_goals=[ - "streaming-zeromq/test", - ] -) - - -streaming_twitter = Module( - name="streaming-twitter", - dependencies=[streaming], - source_file_regexes=[ - "external/twitter", - ], - sbt_test_goals=[ - "streaming-twitter/test", - ] -) - - -streaming_mqtt = Module( - name="streaming-mqtt", - dependencies=[streaming], - source_file_regexes=[ - "external/mqtt", - "external/mqtt-assembly", - ], - sbt_test_goals=[ - "streaming-mqtt/test", - ] -) - - -streaming_kafka = Module( - name="streaming-kafka", - dependencies=[streaming], - source_file_regexes=[ - "external/kafka", - "external/kafka-assembly", - ], - sbt_test_goals=[ - "streaming-kafka/test", - ] -) - - -streaming_flume_sink = Module( - name="streaming-flume-sink", - dependencies=[streaming], - source_file_regexes=[ - "external/flume-sink", - ], - sbt_test_goals=[ - "streaming-flume-sink/test", - ] -) - - -streaming_flume = Module( - name="streaming-flume", - dependencies=[streaming], - source_file_regexes=[ - "external/flume", - ], - sbt_test_goals=[ - "streaming-flume/test", - ] -) - - -streaming_flume_assembly = Module( - name="streaming-flume-assembly", - dependencies=[streaming_flume, streaming_flume_sink], - source_file_regexes=[ - 
"external/flume-assembly", - ] -) - - -mllib = Module( - name="mllib", - dependencies=[streaming, sql], - source_file_regexes=[ - "data/mllib/", - "mllib/", - ], - sbt_test_goals=[ - "mllib/test", - ] -) - - -examples = Module( - name="examples", - dependencies=[graphx, mllib, streaming, sql], - source_file_regexes=[ - "examples/", - ], - sbt_test_goals=[ - "examples/test", - ] -) - - -pyspark_core = Module( - name="pyspark-core", - dependencies=[], - source_file_regexes=[ - "python/(?!pyspark/(ml|mllib|sql|streaming))" - ], - python_test_goals=[ - "pyspark.rdd", - "pyspark.context", - "pyspark.conf", - "pyspark.broadcast", - "pyspark.accumulators", - "pyspark.serializers", - "pyspark.profiler", - "pyspark.shuffle", - "pyspark.tests", - ] -) - - -pyspark_sql = Module( - name="pyspark-sql", - dependencies=[pyspark_core, sql], - source_file_regexes=[ - "python/pyspark/sql" - ], - python_test_goals=[ - "pyspark.sql.types", - "pyspark.sql.context", - "pyspark.sql.column", - "pyspark.sql.dataframe", - "pyspark.sql.group", - "pyspark.sql.functions", - "pyspark.sql.readwriter", - "pyspark.sql.window", - "pyspark.sql.tests", - ] -) - - -pyspark_streaming = Module( - name="pyspark-streaming", - dependencies=[ - pyspark_core, - streaming, - streaming_kafka, - streaming_flume_assembly, - streaming_mqtt, - streaming_kinesis_asl - ], - source_file_regexes=[ - "python/pyspark/streaming" - ], - python_test_goals=[ - "pyspark.streaming.util", - "pyspark.streaming.tests", - ] -) - - -pyspark_mllib = Module( - name="pyspark-mllib", - dependencies=[pyspark_core, pyspark_streaming, pyspark_sql, mllib], - source_file_regexes=[ - "python/pyspark/mllib" - ], - python_test_goals=[ - "pyspark.mllib.classification", - "pyspark.mllib.clustering", - "pyspark.mllib.evaluation", - "pyspark.mllib.feature", - "pyspark.mllib.fpm", - "pyspark.mllib.linalg.__init__", - "pyspark.mllib.linalg.distributed", - "pyspark.mllib.random", - "pyspark.mllib.recommendation", - "pyspark.mllib.regression", - 
"pyspark.mllib.stat._statistics", - "pyspark.mllib.stat.KernelDensity", - "pyspark.mllib.tree", - "pyspark.mllib.util", - "pyspark.mllib.tests", - ], - blacklisted_python_implementations=[ - "PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there - ] -) - - -pyspark_ml = Module( - name="pyspark-ml", - dependencies=[pyspark_core, pyspark_mllib], - source_file_regexes=[ - "python/pyspark/ml/" - ], - python_test_goals=[ - "pyspark.ml.feature", - "pyspark.ml.classification", - "pyspark.ml.clustering", - "pyspark.ml.recommendation", - "pyspark.ml.regression", - "pyspark.ml.tuning", - "pyspark.ml.tests", - "pyspark.ml.evaluation", - ], - blacklisted_python_implementations=[ - "PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there - ] -) - -sparkr = Module( - name="sparkr", - dependencies=[sql, mllib], - source_file_regexes=[ - "R/", - ], - should_run_r_tests=True -) - - -docs = Module( - name="docs", - dependencies=[], - source_file_regexes=[ - "docs/", - ] -) - - -ec2 = Module( - name="ec2", - dependencies=[], - source_file_regexes=[ - "ec2/", - ] -) - - -yarn = Module( - name="yarn", - dependencies=[], - source_file_regexes=[ - "yarn/", - "network/yarn/", - ], - sbt_test_goals=[ - "yarn/test", - "network-yarn/test", - ], - test_tags=[ - "org.apache.spark.tags.ExtendedYarnTest" - ] -) - -# The root module is a dummy module which is used to run all of the tests. -# No other modules should directly depend on this module. 
-root = Module( - name="root", - dependencies=[], - source_file_regexes=[], - # In order to run all of the tests, enable every test profile: - build_profile_flags=list(set( - itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))), - sbt_test_goals=[ - "test", - ], - python_test_goals=list(itertools.chain.from_iterable(m.python_test_goals for m in all_modules)), - should_run_r_tests=True -) diff --git a/dev/_site/sparktestsupport/shellutils.py b/dev/_site/sparktestsupport/shellutils.py deleted file mode 100644 index d280e797077d1..0000000000000 --- a/dev/_site/sparktestsupport/shellutils.py +++ /dev/null @@ -1,115 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from __future__ import print_function -import os -import shutil -import subprocess -import sys - - -if sys.version_info >= (2, 7): - subprocess_check_output = subprocess.check_output - subprocess_check_call = subprocess.check_call -else: - # SPARK-8763 - # backported from subprocess module in Python 2.7 - def subprocess_check_output(*popenargs, **kwargs): - if 'stdout' in kwargs: - raise ValueError('stdout argument not allowed, it will be overridden.') - process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs) - output, unused_err = process.communicate() - retcode = process.poll() - if retcode: - cmd = kwargs.get("args") - if cmd is None: - cmd = popenargs[0] - raise subprocess.CalledProcessError(retcode, cmd, output=output) - return output - - # backported from subprocess module in Python 2.7 - def subprocess_check_call(*popenargs, **kwargs): - retcode = call(*popenargs, **kwargs) - if retcode: - cmd = kwargs.get("args") - if cmd is None: - cmd = popenargs[0] - raise CalledProcessError(retcode, cmd) - return 0 - - -def exit_from_command_with_retcode(cmd, retcode): - print("[error] running", ' '.join(cmd), "; received return code", retcode) - sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) - - -def rm_r(path): - """ - Given an arbitrary path, properly remove it with the correct Python construct if it exists. - From: http://stackoverflow.com/a/9559881 - """ - - if os.path.isdir(path): - shutil.rmtree(path) - elif os.path.exists(path): - os.remove(path) - - -def run_cmd(cmd, return_output=False): - """ - Given a command as a list of arguments will attempt to execute the command - and, on failure, print an error message and exit. 
- """ - - if not isinstance(cmd, list): - cmd = cmd.split() - try: - if return_output: - return subprocess_check_output(cmd) - else: - return subprocess_check_call(cmd) - except subprocess.CalledProcessError as e: - exit_from_command_with_retcode(e.cmd, e.returncode) - - -def is_exe(path): - """ - Check if a given path is an executable file. - From: http://stackoverflow.com/a/377028 - """ - - return os.path.isfile(path) and os.access(path, os.X_OK) - - -def which(program): - """ - Find and return the given program by its absolute path or 'None' if the program cannot be found. - From: http://stackoverflow.com/a/377028 - """ - - fpath = os.path.split(program)[0] - - if fpath: - if is_exe(program): - return program - else: - for path in os.environ.get("PATH").split(os.pathsep): - path = path.strip('"') - exe_file = os.path.join(path, program) - if is_exe(exe_file): - return exe_file - return None diff --git a/dev/_site/tests/pr_merge_ability.sh b/dev/_site/tests/pr_merge_ability.sh deleted file mode 100755 index d9a347fe24a8c..0000000000000 --- a/dev/_site/tests/pr_merge_ability.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# -# This script follows the base format for testing pull requests against -# another branch and returning results to be published. More details can be -# found at dev/run-tests-jenkins. -# -# Arg1: The Github Pull Request Actual Commit -#+ known as `ghprbActualCommit` in `run-tests-jenkins` -# Arg2: The SHA1 hash -#+ known as `sha1` in `run-tests-jenkins` -# - -ghprbActualCommit="$1" -sha1="$2" - -# check PR merge-ability -if [ "${sha1}" == "${ghprbActualCommit}" ]; then - echo " * This patch **does not merge cleanly**." -else - echo " * This patch merges cleanly." -fi diff --git a/dev/_site/tests/pr_new_dependencies.sh b/dev/_site/tests/pr_new_dependencies.sh deleted file mode 100755 index fdfb3c62aff58..0000000000000 --- a/dev/_site/tests/pr_new_dependencies.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# -# This script follows the base format for testing pull requests against -# another branch and returning results to be published. More details can be -# found at dev/run-tests-jenkins. 
-# -# Arg1: The Github Pull Request Actual Commit -#+ known as `ghprbActualCommit` in `run-tests-jenkins` -# Arg2: The SHA1 hash -#+ known as `sha1` in `run-tests-jenkins` -# Arg3: Current PR Commit Hash -#+ the PR hash for the current commit -# - -ghprbActualCommit="$1" -sha1="$2" -current_pr_head="$3" - -MVN_BIN="build/mvn" -CURR_CP_FILE="my-classpath.txt" -MASTER_CP_FILE="master-classpath.txt" - -# First switch over to the master branch -git checkout -f master -# Find and copy all pom.xml files into a *.gate file that we can check -# against through various `git` changes -find -name "pom.xml" -exec cp {} {}.gate \; -# Switch back to the current PR -git checkout -f "${current_pr_head}" - -# Check if any *.pom files from the current branch are different from the master -difference_q="" -for p in $(find -name "pom.xml"); do - [[ -f "${p}" && -f "${p}.gate" ]] && \ - difference_q="${difference_q}$(diff $p.gate $p)" -done - -# If no pom files were changed we can easily say no new dependencies were added -if [ -z "${difference_q}" ]; then - echo " * This patch does not change any dependencies." 
-else - # Else we need to manually build spark to determine what, if any, dependencies - # were added into the Spark assembly jar - ${MVN_BIN} clean package dependency:build-classpath -DskipTests 2>/dev/null | \ - sed -n -e '/Building Spark Project Assembly/,$p' | \ - grep --context=1 -m 2 "Dependencies classpath:" | \ - head -n 3 | \ - tail -n 1 | \ - tr ":" "\n" | \ - rev | \ - cut -d "/" -f 1 | \ - rev | \ - sort > ${CURR_CP_FILE} - - # Checkout the master branch to compare against - git checkout -f master - - ${MVN_BIN} clean package dependency:build-classpath -DskipTests 2>/dev/null | \ - sed -n -e '/Building Spark Project Assembly/,$p' | \ - grep --context=1 -m 2 "Dependencies classpath:" | \ - head -n 3 | \ - tail -n 1 | \ - tr ":" "\n" | \ - rev | \ - cut -d "/" -f 1 | \ - rev | \ - sort > ${MASTER_CP_FILE} - - DIFF_RESULTS="`diff ${CURR_CP_FILE} ${MASTER_CP_FILE}`" - - if [ -z "${DIFF_RESULTS}" ]; then - echo " * This patch does not change any dependencies." - else - # Pretty print the new dependencies - added_deps=$(echo "${DIFF_RESULTS}" | grep "<" | cut -d' ' -f2 | awk '{printf " * \`"$1"\`\\n"}') - removed_deps=$(echo "${DIFF_RESULTS}" | grep ">" | cut -d' ' -f2 | awk '{printf " * \`"$1"\`\\n"}') - added_deps_text=" * This patch **adds the following new dependencies:**\n${added_deps}" - removed_deps_text=" * This patch **removes the following dependencies:**\n${removed_deps}" - - # Construct the final returned message with proper - return_mssg="" - [ -n "${added_deps}" ] && return_mssg="${added_deps_text}" - if [ -n "${removed_deps}" ]; then - if [ -n "${return_mssg}" ]; then - return_mssg="${return_mssg}\n${removed_deps_text}" - else - return_mssg="${removed_deps_text}" - fi - fi - echo "${return_mssg}" - fi - - # Remove the files we've left over - [ -f "${CURR_CP_FILE}" ] && rm -f "${CURR_CP_FILE}" - [ -f "${MASTER_CP_FILE}" ] && rm -f "${MASTER_CP_FILE}" - - # Clean up our mess from the Maven builds just in case - ${MVN_BIN} clean &>/dev/null -fi 
diff --git a/dev/_site/tests/pr_public_classes.sh b/dev/_site/tests/pr_public_classes.sh deleted file mode 100755 index 927295b88c963..0000000000000 --- a/dev/_site/tests/pr_public_classes.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# -# This script follows the base format for testing pull requests against -# another branch and returning results to be published. More details can be -# found at dev/run-tests-jenkins. -# -# Arg1: The Github Pull Request Actual Commit -#+ known as `ghprbActualCommit` in `run-tests-jenkins` -# Arg2: The SHA1 hash -#+ known as `sha1` in `run-tests-jenkins` -# - -# We diff master...$ghprbActualCommit because that gets us changes introduced in the PR -#+ and not anything else added to master since the PR was branched. 
- -ghprbActualCommit="$1" -sha1="$2" - -source_files=$( - git diff master...$ghprbActualCommit --name-only `# diff patch against master from branch point` \ - | grep -v -e "\/test" `# ignore files in test directories` \ - | grep -e "\.py$" -e "\.java$" -e "\.scala$" `# include only code files` \ - | tr "\n" " " -) -new_public_classes=$( - git diff master...$ghprbActualCommit ${source_files} `# diff patch against master from branch point` \ - | grep "^\+" `# filter in only added lines` \ - | sed -r -e "s/^\+//g" `# remove the leading +` \ - | grep -e "trait " -e "class " `# filter in lines with these key words` \ - | grep -e "{" -e "(" `# filter in lines with these key words, too` \ - | grep -v -e "\@\@" -e "private" `# exclude lines with these words` \ - | grep -v -e "^// " -e "^/\*" -e "^ \* " `# exclude comment lines` \ - | sed -r -e "s/\{.*//g" `# remove from the { onwards` \ - | sed -r -e "s/\}//g" `# just in case, remove }; they mess the JSON` \ - | sed -r -e "s/\"/\\\\\"/g" `# escape double quotes; they mess the JSON` \ - | sed -r -e "s/^(.*)$/\`\1\`/g" `# surround with backticks for style` \ - | sed -r -e "s/^/ \* /g" `# prepend ' *' to start of line` \ - | sed -r -e "s/$/\\\n/g" `# append newline to end of line` \ - | tr -d "\n" `# remove actual LF characters` -) - -if [ -z "$new_public_classes" ]; then - echo " * This patch adds no public classes." 
-else - public_classes_note=" * This patch adds the following public classes _(experimental)_:" - echo "${public_classes_note}\n${new_public_classes}" -fi From 1106cae4f74cf4727b870c9eb8e695dfd0c423a9 Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Fri, 13 Nov 2015 14:48:02 +0530 Subject: [PATCH 05/13] java style issues --- .../mllib/JavaBinaryClassification.java | 151 +++++----- .../examples/mllib/JavaLinearRegression.java | 94 +++---- .../mllib/JavaMultiLabelClassification.java | 73 ++--- .../mllib/JavaMulticlassClassification.java | 94 +++---- .../spark/examples/mllib/JavaRanking.java | 265 +++++++++--------- .../mllib/binary_classification_metrics.py | 2 + .../main/python/mllib/regression_metrics.py | 4 +- .../mllib/BinaryClassificationMetrics.scala | 3 +- .../examples/mllib/MultiLabelMetrics.scala | 1 + 9 files changed, 348 insertions(+), 339 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java index b17dc79abff16..58b255eb598ec 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java @@ -19,6 +19,7 @@ package org.apache.spark.examples.mllib; // $example on$ + import scala.Tuple2; import org.apache.spark.api.java.*; @@ -32,82 +33,80 @@ import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; - public class JavaBinaryClassification { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("Binary Classification Metrics"); - SparkContext sc = new SparkContext(conf); - String path = "data/mllib/sample_binary_classification_data.txt"; - JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); - - // Split initial RDD into two... [60% training data, 40% testing data]. 
- JavaRDD[] splits = data.randomSplit(new double[]{0.6, 0.4}, 11L); - JavaRDD training = splits[0].cache(); - JavaRDD test = splits[1]; - - // Run training algorithm to build the model. - final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() - .setNumClasses(2) - .run(training.rdd()); - - // Clear the prediction threshold so the model will return probabilities - model.clearThreshold(); - - // Compute raw scores on the test set. - JavaRDD> predictionAndLabels = test.map( - new Function>() { - public Tuple2 call(LabeledPoint p) { - Double prediction = model.predict(p.features()); - return new Tuple2(prediction, p.label()); - } - } - ); - - // Get evaluation metrics. - BinaryClassificationMetrics metrics = new BinaryClassificationMetrics(predictionAndLabels.rdd()); - - // Precision by threshold - JavaRDD> precision = metrics.precisionByThreshold().toJavaRDD(); - System.out.println("Precision by threshold: " + precision.toArray()); - - // Recall by threshold - JavaRDD> recall = metrics.recallByThreshold().toJavaRDD(); - System.out.println("Recall by threshold: " + recall.toArray()); - - // F Score by threshold - JavaRDD> f1Score = metrics.fMeasureByThreshold().toJavaRDD(); - System.out.println("F1 Score by threshold: " + f1Score.toArray()); - - JavaRDD> f2Score = metrics.fMeasureByThreshold(2.0).toJavaRDD(); - System.out.println("F2 Score by threshold: " + f2Score.toArray()); - - // Precision-recall curve - JavaRDD> prc = metrics.pr().toJavaRDD(); - System.out.println("Precision-recall curve: " + prc.toArray()); - - // Thresholds - JavaRDD thresholds = precision.map( - new Function, Double>() { - public Double call(Tuple2 t) { - return new Double(t._1().toString()); - } - } - ); - - // ROC Curve - JavaRDD> roc = metrics.roc().toJavaRDD(); - System.out.println("ROC curve: " + roc.toArray()); - - // AUPRC - System.out.println("Area under precision-recall curve = " + metrics.areaUnderPR()); - - // AUROC - System.out.println("Area under ROC = " + 
metrics.areaUnderROC()); - - // Save and load model - model.save(sc, "myModelPath"); - LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); - } + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("Binary Classification Metrics"); + SparkContext sc = new SparkContext(conf); + String path = "data/mllib/sample_binary_classification_data.txt"; + JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); + + // Split initial RDD into two... [60% training data, 40% testing data]. + JavaRDD[] splits = data.randomSplit(new double[]{0.6, 0.4}, 11L); + JavaRDD training = splits[0].cache(); + JavaRDD test = splits[1]; + + // Run training algorithm to build the model. + final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() + .setNumClasses(2) + .run(training.rdd()); + + // Clear the prediction threshold so the model will return probabilities + model.clearThreshold(); + + // Compute raw scores on the test set. + JavaRDD> predictionAndLabels = test.map( + new Function>() { + public Tuple2 call(LabeledPoint p) { + Double prediction = model.predict(p.features()); + return new Tuple2(prediction, p.label()); + } + } + ); + + // Get evaluation metrics. 
+ BinaryClassificationMetrics metrics = new BinaryClassificationMetrics(predictionAndLabels.rdd()); + + // Precision by threshold + JavaRDD> precision = metrics.precisionByThreshold().toJavaRDD(); + System.out.println("Precision by threshold: " + precision.toArray()); + + // Recall by threshold + JavaRDD> recall = metrics.recallByThreshold().toJavaRDD(); + System.out.println("Recall by threshold: " + recall.toArray()); + + // F Score by threshold + JavaRDD> f1Score = metrics.fMeasureByThreshold().toJavaRDD(); + System.out.println("F1 Score by threshold: " + f1Score.toArray()); + + JavaRDD> f2Score = metrics.fMeasureByThreshold(2.0).toJavaRDD(); + System.out.println("F2 Score by threshold: " + f2Score.toArray()); + + // Precision-recall curve + JavaRDD> prc = metrics.pr().toJavaRDD(); + System.out.println("Precision-recall curve: " + prc.toArray()); + + // Thresholds + JavaRDD thresholds = precision.map( + new Function, Double>() { + public Double call(Tuple2 t) { + return new Double(t._1().toString()); + } + } + ); + + // ROC Curve + JavaRDD> roc = metrics.roc().toJavaRDD(); + System.out.println("ROC curve: " + roc.toArray()); + + // AUPRC + System.out.println("Area under precision-recall curve = " + metrics.areaUnderPR()); + + // AUROC + System.out.println("Area under ROC = " + metrics.areaUnderROC()); + + // Save and load model + model.save(sc, "myModelPath"); + LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); + } } // $example off$ \ No newline at end of file diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java index cc60409b42859..6781ec619b89e 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java @@ -19,6 +19,7 @@ package org.apache.spark.examples.mllib; // $example on$ + import 
scala.Tuple2; import org.apache.spark.api.java.*; @@ -30,63 +31,62 @@ import org.apache.spark.mllib.evaluation.RegressionMetrics; import org.apache.spark.SparkConf; - // Read in the ratings data public class JavaLinearRegression { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Linear Regression Example"); - JavaSparkContext sc = new JavaSparkContext(conf); + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("Linear Regression Example"); + JavaSparkContext sc = new JavaSparkContext(conf); - // Load and parse the data - String path = "data/mllib/sample_linear_regression_data.txt"; - JavaRDD data = sc.textFile(path); - JavaRDD parsedData = data.map( - new Function() { - public LabeledPoint call(String line) { - String[] parts = line.split(" "); - double[] v = new double[parts.length - 1]; - for (int i = 1; i < parts.length - 1; i++) - v[i - 1] = Double.parseDouble(parts[i].split(":")[1]); - return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v)); - } - } - ); - parsedData.cache(); + // Load and parse the data + String path = "data/mllib/sample_linear_regression_data.txt"; + JavaRDD data = sc.textFile(path); + JavaRDD parsedData = data.map( + new Function() { + public LabeledPoint call(String line) { + String[] parts = line.split(" "); + double[] v = new double[parts.length - 1]; + for (int i = 1; i < parts.length - 1; i++) + v[i - 1] = Double.parseDouble(parts[i].split(":")[1]); + return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v)); + } + } + ); + parsedData.cache(); - // Building the model - int numIterations = 100; - final LinearRegressionModel model = - LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations); + // Building the model + int numIterations = 100; + final LinearRegressionModel model = + LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations); - // Evaluate model on training examples and compute training 
error - JavaRDD> valuesAndPreds = parsedData.map( - new Function>() { - public Tuple2 call(LabeledPoint point) { - double prediction = model.predict(point.features()); - return new Tuple2(prediction, point.label()); - } - } - ); + // Evaluate model on training examples and compute training error + JavaRDD> valuesAndPreds = parsedData.map( + new Function>() { + public Tuple2 call(LabeledPoint point) { + double prediction = model.predict(point.features()); + return new Tuple2(prediction, point.label()); + } + } + ); - // Instantiate metrics object - RegressionMetrics metrics = new RegressionMetrics(valuesAndPreds.rdd()); + // Instantiate metrics object + RegressionMetrics metrics = new RegressionMetrics(valuesAndPreds.rdd()); - // Squared error - System.out.format("MSE = %f\n", metrics.meanSquaredError()); - System.out.format("RMSE = %f\n", metrics.rootMeanSquaredError()); + // Squared error + System.out.format("MSE = %f\n", metrics.meanSquaredError()); + System.out.format("RMSE = %f\n", metrics.rootMeanSquaredError()); - // R-squared - System.out.format("R Squared = %f\n", metrics.r2()); + // R-squared + System.out.format("R Squared = %f\n", metrics.r2()); - // Mean absolute error - System.out.format("MAE = %f\n", metrics.meanAbsoluteError()); + // Mean absolute error + System.out.format("MAE = %f\n", metrics.meanAbsoluteError()); - // Explained variance - System.out.format("Explained Variance = %f\n", metrics.explainedVariance()); + // Explained variance + System.out.format("Explained Variance = %f\n", metrics.explainedVariance()); - // Save and load model - model.save(sc.sc(), "myModelPath"); - LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), "myModelPath"); - } + // Save and load model + model.save(sc.sc(), "myModelPath"); + LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), "myModelPath"); + } } // $example off$ \ No newline at end of file diff --git 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java index 53204523bc865..c4d44dd8ea872 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java @@ -19,60 +19,63 @@ package org.apache.spark.examples.mllib; // $example on$ + import scala.Tuple2; import org.apache.spark.api.java.*; import org.apache.spark.rdd.RDD; import org.apache.spark.mllib.evaluation.MultilabelMetrics; import org.apache.spark.SparkConf; + import java.util.Arrays; import java.util.List; // $example off$ import org.apache.spark.SparkContext; + // $example on$ public class JavaMultiLabelClassification { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Multilabel Classification Metrics"); - JavaSparkContext sc = new JavaSparkContext(conf); + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("Multilabel Classification Metrics"); + JavaSparkContext sc = new JavaSparkContext(conf); - List> data = Arrays.asList( - new Tuple2(new double[]{0.0, 1.0}, new double[]{0.0, 2.0}), - new Tuple2(new double[]{0.0, 2.0}, new double[]{0.0, 1.0}), - new Tuple2(new double[]{}, new double[]{0.0}), - new Tuple2(new double[]{2.0}, new double[]{2.0}), - new Tuple2(new double[]{2.0, 0.0}, new double[]{2.0, 0.0}), - new Tuple2(new double[]{0.0, 1.0, 2.0}, new double[]{0.0, 1.0}), - new Tuple2(new double[]{1.0}, new double[]{1.0, 2.0}) - ); - JavaRDD> scoreAndLabels = sc.parallelize(data); + List> data = Arrays.asList( + new Tuple2(new double[]{0.0, 1.0}, new double[]{0.0, 2.0}), + new Tuple2(new double[]{0.0, 2.0}, new double[]{0.0, 1.0}), + new Tuple2(new double[]{}, new double[]{0.0}), + new Tuple2(new double[]{2.0}, new double[]{2.0}), + new Tuple2(new double[]{2.0, 0.0}, new 
double[]{2.0, 0.0}), + new Tuple2(new double[]{0.0, 1.0, 2.0}, new double[]{0.0, 1.0}), + new Tuple2(new double[]{1.0}, new double[]{1.0, 2.0}) + ); + JavaRDD> scoreAndLabels = sc.parallelize(data); - // Instantiate metrics object - MultilabelMetrics metrics = new MultilabelMetrics(scoreAndLabels.rdd()); + // Instantiate metrics object + MultilabelMetrics metrics = new MultilabelMetrics(scoreAndLabels.rdd()); - // Summary stats - System.out.format("Recall = %f\n", metrics.recall()); - System.out.format("Precision = %f\n", metrics.precision()); - System.out.format("F1 measure = %f\n", metrics.f1Measure()); - System.out.format("Accuracy = %f\n", metrics.accuracy()); + // Summary stats + System.out.format("Recall = %f\n", metrics.recall()); + System.out.format("Precision = %f\n", metrics.precision()); + System.out.format("F1 measure = %f\n", metrics.f1Measure()); + System.out.format("Accuracy = %f\n", metrics.accuracy()); - // Stats by labels - for (int i = 0; i < metrics.labels().length - 1; i++) { - System.out.format("Class %1.1f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])); - System.out.format("Class %1.1f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])); - System.out.format("Class %1.1f F1 score = %f\n", metrics.labels()[i], metrics.f1Measure(metrics.labels()[i])); - } + // Stats by labels + for (int i = 0; i < metrics.labels().length - 1; i++) { + System.out.format("Class %1.1f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])); + System.out.format("Class %1.1f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])); + System.out.format("Class %1.1f F1 score = %f\n", metrics.labels()[i], metrics.f1Measure(metrics.labels()[i])); + } - // Micro stats - System.out.format("Micro recall = %f\n", metrics.microRecall()); - System.out.format("Micro precision = %f\n", metrics.microPrecision()); - System.out.format("Micro F1 measure = %f\n", metrics.microF1Measure()); 
+ // Micro stats + System.out.format("Micro recall = %f\n", metrics.microRecall()); + System.out.format("Micro precision = %f\n", metrics.microPrecision()); + System.out.format("Micro F1 measure = %f\n", metrics.microF1Measure()); - // Hamming loss - System.out.format("Hamming loss = %f\n", metrics.hammingLoss()); + // Hamming loss + System.out.format("Hamming loss = %f\n", metrics.hammingLoss()); - // Subset accuracy - System.out.format("Subset accuracy = %f\n", metrics.subsetAccuracy()); + // Subset accuracy + System.out.format("Subset accuracy = %f\n", metrics.subsetAccuracy()); - } + } } // $example off$ \ No newline at end of file diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java index 0e74da7a883d1..cc5af0b178fa5 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java @@ -19,6 +19,7 @@ package org.apache.spark.examples.mllib // $example on$ + import scala.Tuple2; import org.apache.spark.api.java.*; @@ -33,62 +34,61 @@ import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; - public class JavaMulticlassClassification { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Multi class Classification Metrics"); - SparkContext sc = new SparkContext(conf); - String path = "data/mllib/sample_multiclass_classification_data.txt"; - JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("Multi class Classification Metrics"); + SparkContext sc = new SparkContext(conf); + String path = "data/mllib/sample_multiclass_classification_data.txt"; + JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); - // Split initial RDD into two... 
[60% training data, 40% testing data]. - JavaRDD[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L); - JavaRDD training = splits[0].cache(); - JavaRDD test = splits[1]; + // Split initial RDD into two... [60% training data, 40% testing data]. + JavaRDD[] splits = data.randomSplit(new double[]{0.6, 0.4}, 11L); + JavaRDD training = splits[0].cache(); + JavaRDD test = splits[1]; - // Run training algorithm to build the model. - final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() - .setNumClasses(3) - .run(training.rdd()); + // Run training algorithm to build the model. + final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() + .setNumClasses(3) + .run(training.rdd()); - // Compute raw scores on the test set. - JavaRDD> predictionAndLabels = test.map( - new Function>() { - public Tuple2 call(LabeledPoint p) { - Double prediction = model.predict(p.features()); - return new Tuple2(prediction, p.label()); - } - } - ); + // Compute raw scores on the test set. + JavaRDD> predictionAndLabels = test.map( + new Function>() { + public Tuple2 call(LabeledPoint p) { + Double prediction = model.predict(p.features()); + return new Tuple2(prediction, p.label()); + } + } + ); - // Get evaluation metrics. - MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd()); + // Get evaluation metrics. 
+ MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd()); - // Confusion matrix - Matrix confusion = metrics.confusionMatrix(); - System.out.println("Confusion matrix: \n" + confusion); + // Confusion matrix + Matrix confusion = metrics.confusionMatrix(); + System.out.println("Confusion matrix: \n" + confusion); - // Overall statistics - System.out.println("Precision = " + metrics.precision()); - System.out.println("Recall = " + metrics.recall()); - System.out.println("F1 Score = " + metrics.fMeasure()); + // Overall statistics + System.out.println("Precision = " + metrics.precision()); + System.out.println("Recall = " + metrics.recall()); + System.out.println("F1 Score = " + metrics.fMeasure()); - // Stats by labels - for (int i = 0; i < metrics.labels().length; i++) { - System.out.format("Class %f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])); - System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])); - System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(metrics.labels()[i])); - } + // Stats by labels + for (int i = 0; i < metrics.labels().length; i++) { + System.out.format("Class %f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])); + System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])); + System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(metrics.labels()[i])); + } - //Weighted stats - System.out.format("Weighted precision = %f\n", metrics.weightedPrecision()); - System.out.format("Weighted recall = %f\n", metrics.weightedRecall()); - System.out.format("Weighted F1 score = %f\n", metrics.weightedFMeasure()); - System.out.format("Weighted false positive rate = %f\n", metrics.weightedFalsePositiveRate()); + //Weighted stats + System.out.format("Weighted precision = %f\n", metrics.weightedPrecision()); + 
System.out.format("Weighted recall = %f\n", metrics.weightedRecall()); + System.out.format("Weighted F1 score = %f\n", metrics.weightedFMeasure()); + System.out.format("Weighted false positive rate = %f\n", metrics.weightedFalsePositiveRate()); - // Save and load model - model.save(sc, "myModelPath"); - LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); - } + // Save and load model + model.save(sc, "myModelPath"); + LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); + } } // $example off$ \ No newline at end of file diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java index b389a09c2715f..2fca06b25ebe5 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java @@ -19,6 +19,7 @@ package org.apache.spark.examples.mllib; // $example on$ + import scala.Tuple2; import org.apache.spark.api.java.*; @@ -26,151 +27,151 @@ import org.apache.spark.mllib.recommendation.MatrixFactorizationModel; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.Function; + import java.util.*; + import org.apache.spark.mllib.evaluation.RegressionMetrics; import org.apache.spark.mllib.evaluation.RankingMetrics; import org.apache.spark.mllib.recommendation.ALS; import org.apache.spark.mllib.recommendation.Rating; - // Read in the ratings data public class JavaRanking { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Ranking Metrics"); - JavaSparkContext sc = new JavaSparkContext(conf); - String path = "data/mllib/sample_movielens_data.txt"; - JavaRDD data = sc.textFile(path); - JavaRDD ratings = data.map( - new Function() { - public Rating call(String line) { - String[] parts = line.split("::"); - return new Rating(Integer.parseInt(parts[0]), 
Integer.parseInt(parts[1]), Double.parseDouble(parts[2]) - 2.5); - } + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("Ranking Metrics"); + JavaSparkContext sc = new JavaSparkContext(conf); + String path = "data/mllib/sample_movielens_data.txt"; + JavaRDD data = sc.textFile(path); + JavaRDD ratings = data.map( + new Function() { + public Rating call(String line) { + String[] parts = line.split("::"); + return new Rating(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]), Double.parseDouble(parts[2]) - 2.5); + } + } + ); + ratings.cache(); + + // Train an ALS model + final MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), 10, 10, 0.01); + + // Get top 10 recommendations for every user and scale ratings from 0 to 1 + JavaRDD> userRecs = model.recommendProductsForUsers(10).toJavaRDD(); + JavaRDD> userRecsScaled = userRecs.map( + new Function, Tuple2>() { + public Tuple2 call(Tuple2 t) { + Rating[] scaledRatings = new Rating[t._2().length]; + for (int i = 0; i < scaledRatings.length; i++) { + double newRating = Math.max(Math.min(t._2()[i].rating(), 1.0), 0.0); + scaledRatings[i] = new Rating(t._2()[i].user(), t._2()[i].product(), newRating); } - ); - ratings.cache(); - - // Train an ALS model - final MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), 10, 10, 0.01); - - // Get top 10 recommendations for every user and scale ratings from 0 to 1 - JavaRDD> userRecs = model.recommendProductsForUsers(10).toJavaRDD(); - JavaRDD> userRecsScaled = userRecs.map( - new Function, Tuple2>() { - public Tuple2 call(Tuple2 t) { - Rating[] scaledRatings = new Rating[t._2().length]; - for (int i = 0; i < scaledRatings.length; i++) { - double newRating = Math.max(Math.min(t._2()[i].rating(), 1.0), 0.0); - scaledRatings[i] = new Rating(t._2()[i].user(), t._2()[i].product(), newRating); - } - return new Tuple2(t._1(), scaledRatings); - } + return new Tuple2(t._1(), scaledRatings); + } + } + ); + JavaPairRDD 
userRecommended = JavaPairRDD.fromJavaRDD(userRecsScaled); + + // Map ratings to 1 or 0, 1 indicating a movie that should be recommended + JavaRDD binarizedRatings = ratings.map( + new Function() { + public Rating call(Rating r) { + double binaryRating; + if (r.rating() > 0.0) { + binaryRating = 1.0; + } else { + binaryRating = 0.0; } - ); - JavaPairRDD userRecommended = JavaPairRDD.fromJavaRDD(userRecsScaled); - - // Map ratings to 1 or 0, 1 indicating a movie that should be recommended - JavaRDD binarizedRatings = ratings.map( - new Function() { - public Rating call(Rating r) { - double binaryRating; - if (r.rating() > 0.0) { - binaryRating = 1.0; - } - else { - binaryRating = 0.0; - } - return new Rating(r.user(), r.product(), binaryRating); - } + return new Rating(r.user(), r.product(), binaryRating); + } + } + ); + + // Group ratings by common user + JavaPairRDD> userMovies = binarizedRatings.groupBy( + new Function() { + public Object call(Rating r) { + return r.user(); + } + } + ); + + // Get true relevant documents from all user ratings + JavaPairRDD> userMoviesList = userMovies.mapValues( + new Function, List>() { + public List call(Iterable docs) { + List products = new ArrayList(); + for (Rating r : docs) { + if (r.rating() > 0.0) { + products.add(r.product()); + } } - ); - - // Group ratings by common user - JavaPairRDD> userMovies = binarizedRatings.groupBy( - new Function() { - public Object call(Rating r) { - return r.user(); - } + return products; + } + } + ); + + // Extract the product id from each recommendation + JavaPairRDD> userRecommendedList = userRecommended.mapValues( + new Function>() { + public List call(Rating[] docs) { + List products = new ArrayList(); + for (Rating r : docs) { + products.add(r.product()); } - ); - - // Get true relevant documents from all user ratings - JavaPairRDD> userMoviesList = userMovies.mapValues( - new Function, List>() { - public List call(Iterable docs) { - List products = new ArrayList(); - for (Rating r : 
docs) { - if (r.rating() > 0.0) { - products.add(r.product()); - } - } - return products; - } - } - ); - - // Extract the product id from each recommendation - JavaPairRDD> userRecommendedList = userRecommended.mapValues( - new Function>() { - public List call(Rating[] docs) { - List products = new ArrayList(); - for (Rating r : docs) { - products.add(r.product()); - } - return products; + return products; + } + } + ); + JavaRDD, List>> relevantDocs = userMoviesList.join(userRecommendedList).values(); + + // Instantiate the metrics object + RankingMetrics metrics = RankingMetrics.of(relevantDocs); + + // Precision and NDCG at k + Integer[] kVector = {1, 3, 5}; + for (Integer k : kVector) { + System.out.format("Precision at %d = %f\n", k, metrics.precisionAt(k)); + System.out.format("NDCG at %d = %f\n", k, metrics.ndcgAt(k)); + } + + // Mean average precision + System.out.format("Mean average precision = %f\n", metrics.meanAveragePrecision()); + + // Evaluate the model using numerical ratings and regression metrics + JavaRDD> userProducts = ratings.map( + new Function>() { + public Tuple2 call(Rating r) { + return new Tuple2(r.user(), r.product()); + } + } + ); + JavaPairRDD, Object> predictions = JavaPairRDD.fromJavaRDD( + model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map( + new Function, Object>>() { + public Tuple2, Object> call(Rating r) { + return new Tuple2, Object>( + new Tuple2(r.user(), r.product()), r.rating()); + } } - } - ); - JavaRDD, List>> relevantDocs = userMoviesList.join(userRecommendedList).values(); - - // Instantiate the metrics object - RankingMetrics metrics = RankingMetrics.of(relevantDocs); - - // Precision and NDCG at k - Integer[] kVector = {1, 3, 5}; - for (Integer k : kVector) { - System.out.format("Precision at %d = %f\n", k, metrics.precisionAt(k)); - System.out.format("NDCG at %d = %f\n", k, metrics.ndcgAt(k)); - } - - // Mean average precision - System.out.format("Mean average precision = %f\n", 
metrics.meanAveragePrecision()); - - // Evaluate the model using numerical ratings and regression metrics - JavaRDD> userProducts = ratings.map( - new Function>() { - public Tuple2 call(Rating r) { - return new Tuple2(r.user(), r.product()); + )); + JavaRDD> ratesAndPreds = + JavaPairRDD.fromJavaRDD(ratings.map( + new Function, Object>>() { + public Tuple2, Object> call(Rating r) { + return new Tuple2, Object>( + new Tuple2(r.user(), r.product()), r.rating()); + } } - } - ); - JavaPairRDD, Object> predictions = JavaPairRDD.fromJavaRDD( - model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map( - new Function, Object>>() { - public Tuple2, Object> call(Rating r){ - return new Tuple2, Object>( - new Tuple2(r.user(), r.product()), r.rating()); - } - } - )); - JavaRDD> ratesAndPreds = - JavaPairRDD.fromJavaRDD(ratings.map( - new Function, Object>>() { - public Tuple2, Object> call(Rating r){ - return new Tuple2, Object>( - new Tuple2(r.user(), r.product()), r.rating()); - } - } - )).join(predictions).values(); - - // Create regression metrics object - RegressionMetrics regressionMetrics = new RegressionMetrics(ratesAndPreds.rdd()); - - // Root mean squared error - System.out.format("RMSE = %f\n", regressionMetrics.rootMeanSquaredError()); - - // R-squared - System.out.format("R-squared = %f\n", regressionMetrics.r2()); - } + )).join(predictions).values(); + + // Create regression metrics object + RegressionMetrics regressionMetrics = new RegressionMetrics(ratesAndPreds.rdd()); + + // Root mean squared error + System.out.format("RMSE = %f\n", regressionMetrics.rootMeanSquaredError()); + + // R-squared + System.out.format("R-squared = %f\n", regressionMetrics.r2()); + } } // $example off$ \ No newline at end of file diff --git a/examples/src/main/python/mllib/binary_classification_metrics.py b/examples/src/main/python/mllib/binary_classification_metrics.py index 85583c7e6cfa7..f8c32bbe6154e 100644 --- a/examples/src/main/python/mllib/binary_classification_metrics.py 
+++ b/examples/src/main/python/mllib/binary_classification_metrics.py @@ -36,7 +36,9 @@ sc = SparkContext(appName="BinaryClassificationMetrics") sqlContext = SQLContext(sc) + # $example on$ # Several of the methods available in scala are currently missing from pyspark + # $example off$ # $example on$ # Load training data in LIBSVM format diff --git a/examples/src/main/python/mllib/regression_metrics.py b/examples/src/main/python/mllib/regression_metrics.py index 2b90f2457267c..aca33aa7f8611 100644 --- a/examples/src/main/python/mllib/regression_metrics.py +++ b/examples/src/main/python/mllib/regression_metrics.py @@ -27,9 +27,11 @@ sc = SparkContext(appName="Regression Metrics") # $example on$ # Load and parse the data + def parsePoint(line): values = line.split() - return LabeledPoint(float(values[0]),DenseVector([float(x.split(':')[1]) for x in values[1:]])) + return LabeledPoint(float(values[0]), + DenseVector([float(x.split(':')[1]) for x in values[1:]])) data = sc.textFile("data/mllib/sample_linear_regression_data.txt") parsedData = data.map(parsePoint) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala index db640ccc4a08e..72728ccc5ae43 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala @@ -38,7 +38,7 @@ object BinaryClassificationMetrics { import sqlContext.implicits._ // $example on$ // Load training data in LIBSVM format - val data = MLUtils.loadLibSVMFile(sc, "data/mllib/data/mllib/sample_binary_classification_data.txt") + val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt") // Split data into training (60%) and test (40%) val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L) @@ -106,4 +106,5 @@ object 
BinaryClassificationMetrics { } } + // scalastyle:on println \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala index 020b86d2b332c..4ae6c48364208 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala @@ -20,6 +20,7 @@ package org.apache.spark.examples.mllib import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkContext, SparkConf} + // $example on$ import org.apache.spark.mllib.evaluation.MultilabelMetrics import org.apache.spark.rdd.RDD; From ad3c01ef932d4f4d5a47c7f1fbf6789b2f70caad Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Fri, 13 Nov 2015 15:16:56 +0530 Subject: [PATCH 06/13] fixed scala style issues --- .../mllib/JavaBinaryClassification.java | 2 +- .../mllib/JavaMultiLabelClassification.java | 2 +- .../mllib/JavaMulticlassClassification.java | 2 +- .../mllib/BinaryClassificationMetrics.scala | 7 ++++--- .../examples/mllib/MultiLabelMetrics.scala | 15 ++++++++------- .../examples/mllib/MulticlassMetrics.scala | 2 +- .../spark/examples/mllib/RankingMetrics.scala | 19 ++++++++++++++----- .../examples/mllib/RegressionMetrics.scala | 4 +++- 8 files changed, 33 insertions(+), 20 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java index 58b255eb598ec..86fbc1aa58c21 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java @@ -23,13 +23,13 @@ import scala.Tuple2; import org.apache.spark.api.java.*; -import org.apache.spark.rdd.RDD; import org.apache.spark.api.java.function.Function; import 
org.apache.spark.mllib.classification.LogisticRegressionModel; import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS; import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.rdd.RDD; import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java index c4d44dd8ea872..88313964c05df 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java @@ -23,8 +23,8 @@ import scala.Tuple2; import org.apache.spark.api.java.*; -import org.apache.spark.rdd.RDD; import org.apache.spark.mllib.evaluation.MultilabelMetrics; +import org.apache.spark.rdd.RDD; import org.apache.spark.SparkConf; import java.util.Arrays; diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java index cc5af0b178fa5..4bec6ffee0ed4 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java @@ -23,7 +23,6 @@ import scala.Tuple2; import org.apache.spark.api.java.*; -import org.apache.spark.rdd.RDD; import org.apache.spark.api.java.function.Function; import org.apache.spark.mllib.classification.LogisticRegressionModel; import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS; @@ -31,6 +30,7 @@ import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.mllib.linalg.Matrix; 
+import org.apache.spark.rdd.RDD; import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala index 72728ccc5ae43..3a6ac425fca23 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala @@ -18,8 +18,6 @@ // scalastyle:off println package org.apache.spark.examples.mllib -import org.apache.spark.sql.SQLContext -import org.apache.spark.{SparkContext, SparkConf} // $example on$ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS @@ -28,6 +26,9 @@ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils // $example off$ +import org.apache.spark.{SparkContext, SparkConf} +import org.apache.spark.sql.SQLContext + object BinaryClassificationMetrics { def main(args: Array[String]) { @@ -107,4 +108,4 @@ object BinaryClassificationMetrics { } } -// scalastyle:on println \ No newline at end of file +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala index 4ae6c48364208..ef19aee6df331 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala @@ -18,17 +18,16 @@ // scalastyle:off println package org.apache.spark.examples.mllib -import org.apache.spark.sql.SQLContext -import org.apache.spark.{SparkContext, SparkConf} - // $example on$ import org.apache.spark.mllib.evaluation.MultilabelMetrics import org.apache.spark.rdd.RDD; // $example off$ -object MultiLabelMetrics { - def main(args: Array[String]) { +import 
org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkContext, SparkConf} +object MultiLabelMetrics { + def main(args: Array[String]) { val conf = new SparkConf().setAppName("MultiLabelMetrics") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) @@ -53,7 +52,8 @@ object MultiLabelMetrics { println(s"Accuracy = ${metrics.accuracy}") // Individual label stats - metrics.labels.foreach(label => println(s"Class $label precision = ${metrics.precision(label)}")) + metrics.labels.foreach(label => + println(s"Class $label precision = ${metrics.precision(label)}")) metrics.labels.foreach(label => println(s"Class $label recall = ${metrics.recall(label)}")) metrics.labels.foreach(label => println(s"Class $label F1-score = ${metrics.f1Measure(label)}")) @@ -69,4 +69,5 @@ object MultiLabelMetrics { println(s"Subset accuracy = ${metrics.subsetAccuracy}") // $example off$ } -} \ No newline at end of file +} +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala index 0ed3c633f19d8..6091fcb8be0c2 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala @@ -100,4 +100,4 @@ object MulticlassMetrics { } } -// scalastyle:on println \ No newline at end of file +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala index 9a7a25357f596..047df02719074 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala @@ -20,9 +20,12 @@ package org.apache.spark.examples.mllib import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkContext, 
SparkConf} + // $example on$ + import org.apache.spark.mllib.evaluation.{RegressionMetrics, RankingMetrics} import org.apache.spark.mllib.recommendation.{ALS, Rating} + // $example off$ object RankingMetrics { @@ -40,7 +43,8 @@ object RankingMetrics { }.cache() // Map ratings to 1 or 0, 1 indicating a movie that should be recommended - val binarizedRatings = ratings.map(r => Rating(r.user, r.product, if (r.rating > 0) 1.0 else 0.0)).cache() + val binarizedRatings = ratings.map(r => Rating(r.user, r.product, + if (r.rating > 0) 1.0 else 0.0)).cache() // Summarize ratings val numRatings = ratings.count() @@ -68,7 +72,8 @@ object RankingMetrics { // Assume that any movie a user rated 3 or higher (which maps to a 1) is a relevant document // Compare with top ten most relevant documents val userMovies = binarizedRatings.groupBy(_.user) - val relevantDocuments = userMovies.join(userRecommended).map { case (user, (actual, predictions)) => + val relevantDocuments = userMovies.join(userRecommended).map { case (user, (actual, + predictions)) => (predictions.map(_.product), actual.filter(_.rating > 0.0).map(_.product).toArray) } @@ -89,9 +94,11 @@ object RankingMetrics { } // Get predictions for each data point - val allPredictions = model.predict(ratings.map(r => (r.user, r.product))).map(r => ((r.user, r.product), r.rating)) + val allPredictions = model.predict(ratings.map(r => (r.user, r.product))).map(r => ((r.user, + r.product), r.rating)) val allRatings = ratings.map(r => ((r.user, r.product), r.rating)) - val predictionsAndLabels = allPredictions.join(allRatings).map { case ((user, product), (predicted, actual)) => + val predictionsAndLabels = allPredictions.join(allRatings).map { case ((user, product), + (predicted, actual)) => (predicted, actual) } @@ -103,4 +110,6 @@ object RankingMetrics { println(s"R-squared = ${regressionMetrics.r2}") // $example off$ } -} \ No newline at end of file +} +// scalastyle:on println + diff --git 
a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala index 7dc77caeafa7a..5bbcf59a0d774 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala @@ -66,4 +66,6 @@ object RegressionMetrics { println(s"Explained variance = ${metrics.explainedVariance}") // $example off$ } -} \ No newline at end of file +} +// scalastyle:on println + From 4d18447c80a5db25499d731e150750b2c39db7f6 Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Fri, 13 Nov 2015 15:19:48 +0530 Subject: [PATCH 07/13] fixed java style issues --- .../examples/mllib/JavaMultiLabelClassification.java | 7 +++---- .../org/apache/spark/examples/mllib/JavaRanking.java | 12 ++++-------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java index 88313964c05df..b7283000db047 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java @@ -18,17 +18,16 @@ // scalastyle:off println package org.apache.spark.examples.mllib; +// $example off$ +import java.util.Arrays; +import java.util.List; // $example on$ - import scala.Tuple2; import org.apache.spark.api.java.*; import org.apache.spark.mllib.evaluation.MultilabelMetrics; import org.apache.spark.rdd.RDD; import org.apache.spark.SparkConf; - -import java.util.Arrays; -import java.util.List; // $example off$ import org.apache.spark.SparkContext; diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java index 
2fca06b25ebe5..18723b8beb38c 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java @@ -19,22 +19,18 @@ package org.apache.spark.examples.mllib; // $example on$ - +import java.util.*; import scala.Tuple2; import org.apache.spark.api.java.*; -import org.apache.spark.rdd.RDD; -import org.apache.spark.mllib.recommendation.MatrixFactorizationModel; -import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.Function; - -import java.util.*; - import org.apache.spark.mllib.evaluation.RegressionMetrics; import org.apache.spark.mllib.evaluation.RankingMetrics; import org.apache.spark.mllib.recommendation.ALS; +import org.apache.spark.mllib.recommendation.MatrixFactorizationModel; import org.apache.spark.mllib.recommendation.Rating; - +import org.apache.spark.rdd.RDD; +import org.apache.spark.SparkConf; // Read in the ratings data public class JavaRanking { public static void main(String[] args) { From 3c40a35d665c821f1f2f7cbdc3af2c0f9e4ff45d Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Wed, 18 Nov 2015 12:07:55 +0530 Subject: [PATCH 08/13] fixed styling issues --- docs/mllib-evaluation-metrics.md | 9 +- ...vaBinaryClassificationMetricsExample.java} | 40 ++--- ....java => JavaLinearRegressionExample.java} | 47 +++--- ...ltiLabelClassificationMetricsExample.java} | 35 +++-- ...lticlassClassificationMetricsExample.java} | 39 +++-- ...ng.java => JavaRankingMetricsExample.java} | 145 +++++++++--------- ... binary_classification_metrics_example.py} | 11 +- ...rics.py => multi_class_metrics_example.py} | 2 +- ...rics.py => multi_label_metrics_example.py} | 3 +- ..._metrics.py => ranking_metrics_example.py} | 2 +- ...trics.py => regression_metrics_example.py} | 3 +- ... 
BinaryClassificationMetricsExample.scala} | 11 +- ...s.scala => MultiLabelMetricsExample.scala} | 9 +- ...s.scala => MulticlassMetricsExample.scala} | 10 +- ...rics.scala => RankingMetricsExample.scala} | 10 +- ...s.scala => RegressionMetricsExample.scala} | 9 +- 16 files changed, 182 insertions(+), 203 deletions(-) rename examples/src/main/java/org/apache/spark/examples/mllib/{JavaBinaryClassification.java => JavaBinaryClassificationMetricsExample.java} (83%) rename examples/src/main/java/org/apache/spark/examples/mllib/{JavaLinearRegression.java => JavaLinearRegressionExample.java} (71%) rename examples/src/main/java/org/apache/spark/examples/mllib/{JavaMultiLabelClassification.java => JavaMultiLabelClassificationMetricsExample.java} (74%) rename examples/src/main/java/org/apache/spark/examples/mllib/{JavaMulticlassClassification.java => JavaMulticlassClassificationMetricsExample.java} (81%) rename examples/src/main/java/org/apache/spark/examples/mllib/{JavaRanking.java => JavaRankingMetricsExample.java} (56%) rename examples/src/main/python/mllib/{binary_classification_metrics.py => binary_classification_metrics_example.py} (95%) rename examples/src/main/python/mllib/{multi_class_metrics.py => multi_class_metrics_example.py} (97%) rename examples/src/main/python/mllib/{multi_label_metrics.py => multi_label_metrics_example.py} (97%) rename examples/src/main/python/mllib/{ranking_metrics.py => ranking_metrics_example.py} (97%) rename examples/src/main/python/mllib/{regression_metrics.py => regression_metrics_example.py} (97%) rename examples/src/main/scala/org/apache/spark/examples/mllib/{BinaryClassificationMetrics.scala => BinaryClassificationMetricsExample.scala} (97%) rename examples/src/main/scala/org/apache/spark/examples/mllib/{MultiLabelMetrics.scala => MultiLabelMetricsExample.scala} (94%) rename examples/src/main/scala/org/apache/spark/examples/mllib/{MulticlassMetrics.scala => MulticlassMetricsExample.scala} (97%) rename 
examples/src/main/scala/org/apache/spark/examples/mllib/{RankingMetrics.scala => RankingMetricsExample.scala} (97%) rename examples/src/main/scala/org/apache/spark/examples/mllib/{RegressionMetrics.scala => RegressionMetricsExample.scala} (94%) diff --git a/docs/mllib-evaluation-metrics.md b/docs/mllib-evaluation-metrics.md index 7a9792c4a1455..138a1b297ad33 100644 --- a/docs/mllib-evaluation-metrics.md +++ b/docs/mllib-evaluation-metrics.md @@ -111,7 +111,7 @@ Refer to the [`LogisticRegressionWithLBFGS` Scala docs](api/scala/index.html#org
Refer to the [`LogisticRegressionModel` Java docs](api/java/org/apache/spark/mllib/classification/LogisticRegressionModel.html) and [`LogisticRegressionWithLBFGS` Java docs](api/java/org/apache/spark/mllib/classification/LogisticRegressionWithLBFGS.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaBinaryClassification.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java %}
@@ -247,7 +247,7 @@ Refer to the [`MulticlassMetrics` Scala docs](api/scala/index.html#org.apache.sp
Refer to the [`MulticlassMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/MulticlassMetrics.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java %}
@@ -397,7 +398,7 @@ Refer to the [`MultilabelMetrics` Scala docs](api/scala/index.html#org.apache.sp
Refer to the [`MultilabelMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/MultilabelMetrics.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java %}
@@ -525,7 +526,7 @@ Refer to the [`RegressionMetrics` Scala docs](api/scala/index.html#org.apache.sp
Refer to the [`RegressionMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/RegressionMetrics.html) and [`RankingMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/RankingMetrics.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaRanking.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java %}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java similarity index 83% rename from examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java index 86fbc1aa58c21..c77c6ba52c26c 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java @@ -19,7 +19,6 @@ package org.apache.spark.examples.mllib; // $example on$ - import scala.Tuple2; import org.apache.spark.api.java.*; @@ -30,13 +29,15 @@ import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.rdd.RDD; +// $example off$ import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; -public class JavaBinaryClassification { +public class JavaBinaryClassificationMetricsExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Binary Classification Metrics"); + SparkConf conf = new SparkConf().setAppName("Binary Classification Metrics Example"); SparkContext sc = new SparkContext(conf); + // $example on$ String path = "data/mllib/sample_binary_classification_data.txt"; JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); @@ -47,20 +48,20 @@ public static void main(String[] args) { // Run training algorithm to build the model. final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() - .setNumClasses(2) - .run(training.rdd()); + .setNumClasses(2) + .run(training.rdd()); // Clear the prediction threshold so the model will return probabilities model.clearThreshold(); // Compute raw scores on the test set. 
JavaRDD> predictionAndLabels = test.map( - new Function>() { - public Tuple2 call(LabeledPoint p) { - Double prediction = model.predict(p.features()); - return new Tuple2(prediction, p.label()); - } - } + new Function>() { + public Tuple2 call(LabeledPoint p) { + Double prediction = model.predict(p.features()); + return new Tuple2(prediction, p.label()); + } + } ); // Get evaluation metrics. @@ -87,11 +88,11 @@ public Tuple2 call(LabeledPoint p) { // Thresholds JavaRDD thresholds = precision.map( - new Function, Double>() { - public Double call(Tuple2 t) { - return new Double(t._1().toString()); - } - } + new Function, Double>() { + public Double call(Tuple2 t) { + return new Double(t._1().toString()); + } + } ); // ROC Curve @@ -105,8 +106,9 @@ public Double call(Tuple2 t) { System.out.println("Area under ROC = " + metrics.areaUnderROC()); // Save and load model - model.save(sc, "myModelPath"); - LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); + model.save(sc, "target/tmp/LogisticRegressionModel"); + LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, + "target/tmp/LogisticRegressionModel"); + // $example off$ } } -// $example off$ \ No newline at end of file diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java similarity index 71% rename from examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java index 6781ec619b89e..76f99d39c2763 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java @@ -19,7 +19,6 @@ package org.apache.spark.examples.mllib; // $example on$ - import scala.Tuple2; import org.apache.spark.api.java.*; @@ -30,42 
+29,42 @@ import org.apache.spark.mllib.regression.LinearRegressionWithSGD; import org.apache.spark.mllib.evaluation.RegressionMetrics; import org.apache.spark.SparkConf; - +// $example off$ // Read in the ratings data -public class JavaLinearRegression { +public class JavaLinearRegressionExample { public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("Linear Regression Example"); JavaSparkContext sc = new JavaSparkContext(conf); - + // $example on$ // Load and parse the data String path = "data/mllib/sample_linear_regression_data.txt"; JavaRDD data = sc.textFile(path); JavaRDD parsedData = data.map( - new Function() { - public LabeledPoint call(String line) { - String[] parts = line.split(" "); - double[] v = new double[parts.length - 1]; - for (int i = 1; i < parts.length - 1; i++) - v[i - 1] = Double.parseDouble(parts[i].split(":")[1]); - return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v)); - } - } + new Function() { + public LabeledPoint call(String line) { + String[] parts = line.split(" "); + double[] v = new double[parts.length - 1]; + for (int i = 1; i < parts.length - 1; i++) + v[i - 1] = Double.parseDouble(parts[i].split(":")[1]); + return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v)); + } + } ); parsedData.cache(); // Building the model int numIterations = 100; - final LinearRegressionModel model = - LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations); + final LinearRegressionModel model = LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), + numIterations); // Evaluate model on training examples and compute training error JavaRDD> valuesAndPreds = parsedData.map( - new Function>() { - public Tuple2 call(LabeledPoint point) { - double prediction = model.predict(point.features()); - return new Tuple2(prediction, point.label()); - } - } + new Function>() { + public Tuple2 call(LabeledPoint point) { + double prediction = model.predict(point.features()); + return 
new Tuple2(prediction, point.label()); + } + } ); // Instantiate metrics object @@ -85,8 +84,8 @@ public Tuple2 call(LabeledPoint point) { System.out.format("Explained Variance = %f\n", metrics.explainedVariance()); // Save and load model - model.save(sc.sc(), "myModelPath"); - LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), "myModelPath"); + model.save(sc.sc(), "target/tmp/LogisticRegressionModel"); + LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), "target/tmp/LogisticRegressionModel"); + // $example on$ } } -// $example off$ \ No newline at end of file diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java similarity index 74% rename from examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java index b7283000db047..c69f315298f2a 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java @@ -18,7 +18,7 @@ // scalastyle:off println package org.apache.spark.examples.mllib; -// $example off$ + import java.util.Arrays; import java.util.List; // $example on$ @@ -31,20 +31,19 @@ // $example off$ import org.apache.spark.SparkContext; -// $example on$ -public class JavaMultiLabelClassification { +public class JavaMultiLabelClassificationMetricsExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Multilabel Classification Metrics"); + SparkConf conf = new SparkConf().setAppName("Multilabel Classification Metrics Example"); JavaSparkContext sc = new JavaSparkContext(conf); - + // $example on$ List> data = Arrays.asList( - new Tuple2(new 
double[]{0.0, 1.0}, new double[]{0.0, 2.0}), - new Tuple2(new double[]{0.0, 2.0}, new double[]{0.0, 1.0}), - new Tuple2(new double[]{}, new double[]{0.0}), - new Tuple2(new double[]{2.0}, new double[]{2.0}), - new Tuple2(new double[]{2.0, 0.0}, new double[]{2.0, 0.0}), - new Tuple2(new double[]{0.0, 1.0, 2.0}, new double[]{0.0, 1.0}), - new Tuple2(new double[]{1.0}, new double[]{1.0, 2.0}) + new Tuple2(new double[]{0.0, 1.0}, new double[]{0.0, 2.0}), + new Tuple2(new double[]{0.0, 2.0}, new double[]{0.0, 1.0}), + new Tuple2(new double[]{}, new double[]{0.0}), + new Tuple2(new double[]{2.0}, new double[]{2.0}), + new Tuple2(new double[]{2.0, 0.0}, new double[]{2.0, 0.0}), + new Tuple2(new double[]{0.0, 1.0, 2.0}, new double[]{0.0, 1.0}), + new Tuple2(new double[]{1.0}, new double[]{1.0, 2.0}) ); JavaRDD> scoreAndLabels = sc.parallelize(data); @@ -59,9 +58,12 @@ public static void main(String[] args) { // Stats by labels for (int i = 0; i < metrics.labels().length - 1; i++) { - System.out.format("Class %1.1f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])); - System.out.format("Class %1.1f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])); - System.out.format("Class %1.1f F1 score = %f\n", metrics.labels()[i], metrics.f1Measure(metrics.labels()[i])); + System.out.format("Class %1.1f precision = %f\n", metrics.labels()[i], metrics.precision + (metrics.labels()[i])); + System.out.format("Class %1.1f recall = %f\n", metrics.labels()[i], metrics.recall(metrics + .labels()[i])); + System.out.format("Class %1.1f F1 score = %f\n", metrics.labels()[i], metrics.f1Measure + (metrics.labels()[i])); } // Micro stats @@ -74,7 +76,6 @@ public static void main(String[] args) { // Subset accuracy System.out.format("Subset accuracy = %f\n", metrics.subsetAccuracy()); - + // $example off$ } } -// $example off$ \ No newline at end of file diff --git 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java similarity index 81% rename from examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java index 4bec6ffee0ed4..e05494a1da4de 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java @@ -19,7 +19,6 @@ package org.apache.spark.examples.mllib // $example on$ - import scala.Tuple2; import org.apache.spark.api.java.*; @@ -30,14 +29,16 @@ import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.mllib.linalg.Matrix; +// $example off$ import org.apache.spark.rdd.RDD; import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; -public class JavaMulticlassClassification { +public class JavaMulticlassClassificationMetricsExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Multi class Classification Metrics"); + SparkConf conf = new SparkConf().setAppName("Multi class Classification Metrics Example"); SparkContext sc = new SparkContext(conf); + // $example on$ String path = "data/mllib/sample_multiclass_classification_data.txt"; JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); @@ -48,17 +49,17 @@ public static void main(String[] args) { // Run training algorithm to build the model. final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() - .setNumClasses(3) - .run(training.rdd()); + .setNumClasses(3) + .run(training.rdd()); // Compute raw scores on the test set. 
JavaRDD> predictionAndLabels = test.map( - new Function>() { - public Tuple2 call(LabeledPoint p) { - Double prediction = model.predict(p.features()); - return new Tuple2(prediction, p.label()); - } - } + new Function>() { + public Tuple2 call(LabeledPoint p) { + Double prediction = model.predict(p.features()); + return new Tuple2(prediction, p.label()); + } + } ); // Get evaluation metrics. @@ -75,9 +76,12 @@ public Tuple2 call(LabeledPoint p) { // Stats by labels for (int i = 0; i < metrics.labels().length; i++) { - System.out.format("Class %f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])); - System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])); - System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(metrics.labels()[i])); + System.out.format("Class %f precision = %f\n", metrics.labels()[i],metrics.precision + (metrics.labels()[i])); + System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics + .labels()[i])); + System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure + (metrics.labels()[i])); } //Weighted stats @@ -87,8 +91,9 @@ public Tuple2 call(LabeledPoint p) { System.out.format("Weighted false positive rate = %f\n", metrics.weightedFalsePositiveRate()); // Save and load model - model.save(sc, "myModelPath"); - LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); + model.save(sc, "target/tmp/LogisticRegressionModel"); + LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, + "target/tmp/LogisticRegressionModel"); + // $example off$ } } -// $example off$ \ No newline at end of file diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java similarity index 56% rename from 
examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java index 18723b8beb38c..2f64a58dceeb2 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java @@ -20,6 +20,7 @@ // $example on$ import java.util.*; + import scala.Tuple2; import org.apache.spark.api.java.*; @@ -29,22 +30,25 @@ import org.apache.spark.mllib.recommendation.ALS; import org.apache.spark.mllib.recommendation.MatrixFactorizationModel; import org.apache.spark.mllib.recommendation.Rating; +// $example off$ import org.apache.spark.rdd.RDD; import org.apache.spark.SparkConf; -// Read in the ratings data -public class JavaRanking { + +public class JavaRankingMetricsExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Ranking Metrics"); + SparkConf conf = new SparkConf().setAppName("Ranking Metrics Example"); JavaSparkContext sc = new JavaSparkContext(conf); + // $example on$ String path = "data/mllib/sample_movielens_data.txt"; JavaRDD data = sc.textFile(path); JavaRDD ratings = data.map( - new Function() { - public Rating call(String line) { - String[] parts = line.split("::"); - return new Rating(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]), Double.parseDouble(parts[2]) - 2.5); - } - } + new Function() { + public Rating call(String line) { + String[] parts = line.split("::"); + return new Rating(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]), Double + .parseDouble(parts[2]) - 2.5); + } + } ); ratings.cache(); @@ -54,32 +58,32 @@ public Rating call(String line) { // Get top 10 recommendations for every user and scale ratings from 0 to 1 JavaRDD> userRecs = model.recommendProductsForUsers(10).toJavaRDD(); JavaRDD> userRecsScaled = userRecs.map( - new Function, Tuple2>() { - public Tuple2 call(Tuple2 t) { - Rating[] 
scaledRatings = new Rating[t._2().length]; - for (int i = 0; i < scaledRatings.length; i++) { - double newRating = Math.max(Math.min(t._2()[i].rating(), 1.0), 0.0); - scaledRatings[i] = new Rating(t._2()[i].user(), t._2()[i].product(), newRating); - } - return new Tuple2(t._1(), scaledRatings); - } - } + new Function, Tuple2>() { + public Tuple2 call(Tuple2 t) { + Rating[] scaledRatings = new Rating[t._2().length]; + for (int i = 0; i < scaledRatings.length; i++) { + double newRating = Math.max(Math.min(t._2()[i].rating(), 1.0), 0.0); + scaledRatings[i] = new Rating(t._2()[i].user(), t._2()[i].product(), newRating); + } + return new Tuple2(t._1(), scaledRatings); + } + } ); JavaPairRDD userRecommended = JavaPairRDD.fromJavaRDD(userRecsScaled); // Map ratings to 1 or 0, 1 indicating a movie that should be recommended JavaRDD binarizedRatings = ratings.map( - new Function() { - public Rating call(Rating r) { - double binaryRating; - if (r.rating() > 0.0) { - binaryRating = 1.0; - } else { - binaryRating = 0.0; - } - return new Rating(r.user(), r.product(), binaryRating); - } - } + new Function() { + public Rating call(Rating r) { + double binaryRating; + if (r.rating() > 0.0) { + binaryRating = 1.0; + } else { + binaryRating = 0.0; + } + return new Rating(r.user(), r.product(), binaryRating); + } + } ); // Group ratings by common user @@ -93,32 +97,33 @@ public Object call(Rating r) { // Get true relevant documents from all user ratings JavaPairRDD> userMoviesList = userMovies.mapValues( - new Function, List>() { - public List call(Iterable docs) { - List products = new ArrayList(); - for (Rating r : docs) { - if (r.rating() > 0.0) { - products.add(r.product()); - } - } - return products; - } + new Function, List>() { + public List call(Iterable docs) { + List products = new ArrayList(); + for (Rating r : docs) { + if (r.rating() > 0.0) { + products.add(r.product()); } + } + return products; + } + } ); // Extract the product id from each recommendation JavaPairRDD> 
userRecommendedList = userRecommended.mapValues( - new Function>() { - public List call(Rating[] docs) { - List products = new ArrayList(); - for (Rating r : docs) { - products.add(r.product()); - } - return products; - } - } + new Function>() { + public List call(Rating[] docs) { + List products = new ArrayList(); + for (Rating r : docs) { + products.add(r.product()); + } + return products; + } + } ); - JavaRDD, List>> relevantDocs = userMoviesList.join(userRecommendedList).values(); + JavaRDD, List>> relevantDocs = userMoviesList.join + (userRecommendedList).values(); // Instantiate the metrics object RankingMetrics metrics = RankingMetrics.of(relevantDocs); @@ -135,29 +140,29 @@ public List call(Rating[] docs) { // Evaluate the model using numerical ratings and regression metrics JavaRDD> userProducts = ratings.map( - new Function>() { - public Tuple2 call(Rating r) { - return new Tuple2(r.user(), r.product()); - } - } + new Function>() { + public Tuple2 call(Rating r) { + return new Tuple2(r.user(), r.product()); + } + } ); JavaPairRDD, Object> predictions = JavaPairRDD.fromJavaRDD( - model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map( - new Function, Object>>() { - public Tuple2, Object> call(Rating r) { - return new Tuple2, Object>( - new Tuple2(r.user(), r.product()), r.rating()); - } - } - )); + model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map( + new Function, Object>>() { + public Tuple2, Object> call(Rating r) { + return new Tuple2, Object>( + new Tuple2(r.user(), r.product()), r.rating()); + } + } + )); JavaRDD> ratesAndPreds = JavaPairRDD.fromJavaRDD(ratings.map( - new Function, Object>>() { - public Tuple2, Object> call(Rating r) { - return new Tuple2, Object>( - new Tuple2(r.user(), r.product()), r.rating()); - } - } + new Function, Object>>() { + public Tuple2, Object> call(Rating r) { + return new Tuple2, Object>( + new Tuple2(r.user(), r.product()), r.rating()); + } + } )).join(predictions).values(); // Create regression metrics 
object @@ -168,6 +173,6 @@ public Tuple2, Object> call(Rating r) { // R-squared System.out.format("R-squared = %f\n", regressionMetrics.r2()); + // $example off$ } } -// $example off$ \ No newline at end of file diff --git a/examples/src/main/python/mllib/binary_classification_metrics.py b/examples/src/main/python/mllib/binary_classification_metrics_example.py similarity index 95% rename from examples/src/main/python/mllib/binary_classification_metrics.py rename to examples/src/main/python/mllib/binary_classification_metrics_example.py index f8c32bbe6154e..38b557108c2ea 100644 --- a/examples/src/main/python/mllib/binary_classification_metrics.py +++ b/examples/src/main/python/mllib/binary_classification_metrics_example.py @@ -14,16 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. # - - """ Binary Classification Metrics Example. """ from __future__ import print_function - import sys - - from pyspark import SparkContext, SQLContext # $example on$ from pyspark.mllib.classification import LogisticRegressionWithLBFGS @@ -33,14 +28,10 @@ # $example off$ if __name__ == "__main__": - sc = SparkContext(appName="BinaryClassificationMetrics") + sc = SparkContext(appName="BinaryClassificationMetricsExample") sqlContext = SQLContext(sc) - # $example on$ # Several of the methods available in scala are currently missing from pyspark - # $example off$ - - # $example on$ # Load training data in LIBSVM format data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt") diff --git a/examples/src/main/python/mllib/multi_class_metrics.py b/examples/src/main/python/mllib/multi_class_metrics_example.py similarity index 97% rename from examples/src/main/python/mllib/multi_class_metrics.py rename to examples/src/main/python/mllib/multi_class_metrics_example.py index 7959b7230a563..cd56b3c97c778 100644 --- a/examples/src/main/python/mllib/multi_class_metrics.py +++ 
b/examples/src/main/python/mllib/multi_class_metrics_example.py @@ -24,7 +24,7 @@ from pyspark import SparkContext if __name__ == "__main__": - sc = SparkContext(appName="MultiClassMetrics") + sc = SparkContext(appName="MultiClassMetricsExample") # Several of the methods available in scala are currently missing from pyspark # $example on$ diff --git a/examples/src/main/python/mllib/multi_label_metrics.py b/examples/src/main/python/mllib/multi_label_metrics_example.py similarity index 97% rename from examples/src/main/python/mllib/multi_label_metrics.py rename to examples/src/main/python/mllib/multi_label_metrics_example.py index d02d8d862d1e1..f293ce8e309e9 100644 --- a/examples/src/main/python/mllib/multi_label_metrics.py +++ b/examples/src/main/python/mllib/multi_label_metrics_example.py @@ -21,9 +21,8 @@ from pyspark.mllib.util import MLUtils from pyspark import SparkContext - if __name__ == "__main__": - sc = SparkContext(appName="MultiClassMetrics") + sc = SparkContext(appName="MultiLabelMetricsExample") # $example on$ scoreAndLabels = sc.parallelize([ ([0.0, 1.0], [0.0, 2.0]), diff --git a/examples/src/main/python/mllib/ranking_metrics.py b/examples/src/main/python/mllib/ranking_metrics_example.py similarity index 97% rename from examples/src/main/python/mllib/ranking_metrics.py rename to examples/src/main/python/mllib/ranking_metrics_example.py index 6fcdf3032d3dc..deefbd23c5b94 100644 --- a/examples/src/main/python/mllib/ranking_metrics.py +++ b/examples/src/main/python/mllib/ranking_metrics_example.py @@ -22,7 +22,7 @@ from pyspark import SparkContext if __name__ == "__main__": - sc = SparkContext(appName="Ranking Metrics") + sc = SparkContext(appName="Ranking Metrics Example") # Several of the methods available in scala are currently missing from pyspark # $example on$ diff --git a/examples/src/main/python/mllib/regression_metrics.py b/examples/src/main/python/mllib/regression_metrics_example.py similarity index 97% rename from 
examples/src/main/python/mllib/regression_metrics.py rename to examples/src/main/python/mllib/regression_metrics_example.py index aca33aa7f8611..a3e9c12dbc4ea 100644 --- a/examples/src/main/python/mllib/regression_metrics.py +++ b/examples/src/main/python/mllib/regression_metrics_example.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # - # $example on$ from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD from pyspark.mllib.evaluation import RegressionMetrics @@ -24,7 +23,7 @@ from pyspark import SparkContext if __name__ == "__main__": - sc = SparkContext(appName="Regression Metrics") + sc = SparkContext(appName="Regression Metrics Example") # $example on$ # Load and parse the data diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala similarity index 97% rename from examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala index 3a6ac425fca23..466c84ed2dec7 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala @@ -17,8 +17,6 @@ // scalastyle:off println package org.apache.spark.examples.mllib - - // $example on$ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics @@ -29,11 +27,11 @@ import org.apache.spark.mllib.util.MLUtils import org.apache.spark.{SparkContext, SparkConf} import org.apache.spark.sql.SQLContext -object BinaryClassificationMetrics { +object BinaryClassificationMetricsExample { - def main(args: Array[String]) { + def main(args: 
Array[String]): Unit = { - val conf = new SparkConf().setAppName("BinaryClassificationMetrics") + val conf = new SparkConf().setAppName("BinaryClassificationMetricsExample") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ @@ -102,10 +100,7 @@ object BinaryClassificationMetrics { // AUROC val auROC = metrics.areaUnderROC println("Area under ROC = " + auROC) - // $example off$ - } } - // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala similarity index 94% rename from examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala index ef19aee6df331..035f74490b3fe 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala @@ -14,21 +14,18 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - // scalastyle:off println package org.apache.spark.examples.mllib - // $example on$ import org.apache.spark.mllib.evaluation.MultilabelMetrics import org.apache.spark.rdd.RDD; // $example off$ - import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkContext, SparkConf} -object MultiLabelMetrics { - def main(args: Array[String]) { - val conf = new SparkConf().setAppName("MultiLabelMetrics") +object MultiLabelMetricsExample { + def main(args: Array[String]): Unit = { + val conf = new SparkConf().setAppName("MultiLabelMetricsExample") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala similarity index 97% rename from examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala index 6091fcb8be0c2..a3d29a5a091b6 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala @@ -14,23 +14,19 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - // scalastyle:off println package org.apache.spark.examples.mllib - // $example on$ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils // $example off$ - import org.apache.spark.{SparkContext, SparkConf} -object MulticlassMetrics { - - def main(args: Array[String]) { +object MulticlassMetricsExample { + def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("MulticlassMetrics") val sc = new SparkContext(conf) @@ -95,9 +91,7 @@ object MulticlassMetrics { println(s"Weighted recall: ${metrics.weightedRecall}") println(s"Weighted F1 score: ${metrics.weightedFMeasure}") println(s"Weighted false positive rate: ${metrics.weightedFalsePositiveRate}") - // $example off$ - } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala similarity index 97% rename from examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala index 047df02719074..9fe933bf53931 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala @@ -14,24 +14,19 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - // scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkContext, SparkConf} - // $example on$ - import org.apache.spark.mllib.evaluation.{RegressionMetrics, RankingMetrics} import org.apache.spark.mllib.recommendation.{ALS, Rating} - // $example off$ -object RankingMetrics { +object RankingMetricsExample { def main(args: Array[String]) { - - val conf = new SparkConf().setAppName("RankingMetrics") + val conf = new SparkConf().setAppName("RankingMetricsExample") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ @@ -112,4 +107,3 @@ object RankingMetrics { } } // scalastyle:on println - diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala similarity index 94% rename from examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala index 5bbcf59a0d774..fcab66f0df2e5 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala @@ -14,7 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - // scalastyle:off println package org.apache.spark.examples.mllib // $example on$ @@ -28,11 +27,9 @@ import org.apache.spark.mllib.util.MLUtils import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} -object RegressionMetrics { - - def main(args: Array[String]) { - - val conf = new SparkConf().setAppName("RegressionMetrics") +object RegressionMetricsExample { + def main(args: Array[String]) : Unit = { + val conf = new SparkConf().setAppName("RegressionMetricsExample") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) // $example on$ From 892591b232dea89b943c138ad5a722d527a6801a Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Wed, 18 Nov 2015 12:14:14 +0530 Subject: [PATCH 09/13] fixed mllib-evaluation-metrics.md file --- docs/mllib-evaluation-metrics.md | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/docs/mllib-evaluation-metrics.md b/docs/mllib-evaluation-metrics.md index 138a1b297ad33..3b07eee369de5 100644 --- a/docs/mllib-evaluation-metrics.md +++ b/docs/mllib-evaluation-metrics.md @@ -104,7 +104,7 @@ data, and evaluate the performance of the algorithm by several binary evaluation
Refer to the [`LogisticRegressionWithLBFGS` Scala docs](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS) and [`BinaryClassificationMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.BinaryClassificationMetrics) for details on the API. -{% include_example scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala %} +{% include_example scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala %}
@@ -118,7 +118,7 @@ Refer to the [`LogisticRegressionModel` Java docs](api/java/org/apache/spark/mll
Refer to the [`BinaryClassificationMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.BinaryClassificationMetrics) and [`LogisticRegressionWithLBFGS` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.classification.LogisticRegressionWithLBFGS) for more details on the API. -{% include_example python/mllib/binary_classification_metrics.py %} +{% include_example python/mllib/binary_classification_metrics_example.py %}
@@ -240,7 +240,7 @@ the data, and evaluate the performance of the algorithm by several multiclass cl
Refer to the [`MulticlassMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.MulticlassMetrics) for details on the API. -{% include_example scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala %} +{% include_example scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala %}
@@ -255,7 +255,7 @@ Refer to the [`MulticlassMetrics` Java docs](api/java/org/apache/spark/mllib/eva
Refer to the [`MulticlassMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.MulticlassMetrics) for more details on the API. -{% include_example python/mllib/multi_class_metrics.py %} +{% include_example python/mllib/multi_class_metrics_example.py %}
@@ -391,21 +391,22 @@ True classes:
Refer to the [`MultilabelMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.MultilabelMetrics) for details on the API. -{% include_example scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala %} +{% include_example scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala %}
Refer to the [`MultilabelMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/MultilabelMetrics.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetrics.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample +.java %}
Refer to the [`MultilabelMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.MultilabelMetrics) for more details on the API. -{% include_example python/mllib/multi_label_metrics.py %} +{% include_example python/mllib/multi_label_metrics_example.py %}
@@ -519,7 +520,7 @@ expanded world of non-positive weights are "the same as never having interacted
Refer to the [`RegressionMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.RegressionMetrics) and [`RankingMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.RankingMetrics) for details on the API. -{% include_example scala/org/apache/spark/examples/mllib/RankingMetrics.scala %} +{% include_example scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala %}
@@ -533,7 +534,7 @@ Refer to the [`RegressionMetrics` Java docs](api/java/org/apache/spark/mllib/eva
Refer to the [`RegressionMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RegressionMetrics) and [`RankingMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics) for more details on the API. -{% include_example python/mllib/ranking_metrics.py %} +{% include_example python/mllib/ranking_metrics_example.py %}
@@ -583,21 +584,21 @@ and evaluate the performance of the algorithm by several regression metrics.
Refer to the [`RegressionMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.RegressionMetrics) for details on the API. -{% include_example scala/org/apache/spark/examples/mllib/RegressionMetrics.scala %} +{% include_example scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala %}
Refer to the [`RegressionMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/RegressionMetrics.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaLinearRegression.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java %}
Refer to the [`RegressionMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RegressionMetrics) for more details on the API. -{% include_example python/mllib/regression_metrics.py %} +{% include_example python/mllib/regression_metrics_example.py %}
From 8d2d508a89bf329f57d423345e81ca20f27fd541 Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Wed, 18 Nov 2015 12:42:37 +0530 Subject: [PATCH 10/13] fixed java issues --- docs/mllib-evaluation-metrics.md | 6 ++---- .../JavaBinaryClassificationMetricsExample.java | 3 --- .../examples/mllib/JavaLinearRegressionExample.java | 13 ++++++------- .../JavaMultiLabelClassificationMetricsExample.java | 2 -- .../JavaMulticlassClassificationMetricsExample.java | 5 +---- 5 files changed, 9 insertions(+), 20 deletions(-) diff --git a/docs/mllib-evaluation-metrics.md b/docs/mllib-evaluation-metrics.md index 3b07eee369de5..a1afde0dc6635 100644 --- a/docs/mllib-evaluation-metrics.md +++ b/docs/mllib-evaluation-metrics.md @@ -247,8 +247,7 @@ Refer to the [`MulticlassMetrics` Scala docs](api/scala/index.html#org.apache.sp
Refer to the [`MulticlassMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/MulticlassMetrics.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample -.java %} + {% include_example java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java %}
@@ -398,8 +397,7 @@ Refer to the [`MultilabelMetrics` Scala docs](api/scala/index.html#org.apache.sp
Refer to the [`MultilabelMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/MultilabelMetrics.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample -.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java %}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java index c77c6ba52c26c..d905aa82d5e49 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java @@ -14,10 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -// scalastyle:off println package org.apache.spark.examples.mllib; - // $example on$ import scala.Tuple2; diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java index 76f99d39c2763..5f4e2d74f04ef 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java @@ -14,10 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - -// scalastyle:off println package org.apache.spark.examples.mllib; - // $example on$ import scala.Tuple2; @@ -30,6 +27,7 @@ import org.apache.spark.mllib.evaluation.RegressionMetrics; import org.apache.spark.SparkConf; // $example off$ + // Read in the ratings data public class JavaLinearRegressionExample { public static void main(String[] args) { @@ -46,7 +44,7 @@ public LabeledPoint call(String line) { double[] v = new double[parts.length - 1]; for (int i = 1; i < parts.length - 1; i++) v[i - 1] = Double.parseDouble(parts[i].split(":")[1]); - return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v)); + return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v)); } } ); @@ -54,7 +52,7 @@ public LabeledPoint call(String line) { // Building the model int numIterations = 100; - final LinearRegressionModel model = LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), + final LinearRegressionModel model = LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations); // Evaluate model on training examples and compute training error @@ -85,7 +83,8 @@ public Tuple2 call(LabeledPoint point) { // Save and load model model.save(sc.sc(), "target/tmp/LogisticRegressionModel"); - LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), "target/tmp/LogisticRegressionModel"); - // $example on$ + LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), + "target/tmp/LogisticRegressionModel"); + // $example off$ } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java index c69f315298f2a..d6b9178b2c4a0 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java @@ -14,8 +14,6 @@ * See 
the License for the specific language governing permissions and * limitations under the License. */ - -// scalastyle:off println package org.apache.spark.examples.mllib; diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java index e05494a1da4de..52d0b3354bf46 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java @@ -14,10 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -// scalastyle:off println package org.apache.spark.examples.mllib - // $example on$ import scala.Tuple2; @@ -50,7 +47,7 @@ public static void main(String[] args) { // Run training algorithm to build the model. final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() .setNumClasses(3) - .run(training.rdd()); + .run(training.rdd()); // Compute raw scores on the test set. 
JavaRDD> predictionAndLabels = test.map( From 54008ced6c3a380cac71bb85b2a7da2a83fb8c20 Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Thu, 19 Nov 2015 12:32:48 +0530 Subject: [PATCH 11/13] fixed import issue --- .../mllib/JavaMulticlassClassificationMetricsExample.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java index 52d0b3354bf46..9f3426a0ba6de 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.examples.mllib +package org.apache.spark.examples.mllib; // $example on$ import scala.Tuple2; From 1c5cc8f1753942e4c1efb7f868a32d6b30420159 Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Thu, 19 Nov 2015 15:46:09 +0530 Subject: [PATCH 12/13] fixed spacing and removed few imports and re-ordered imports --- docs/mllib-evaluation-metrics.md | 2 +- ...avaBinaryClassificationMetricsExample.java | 10 +++-- ...ultiLabelClassificationMetricsExample.java | 11 +++--- ...ulticlassClassificationMetricsExample.java | 13 ++++--- .../mllib/JavaRankingMetricsExample.java | 38 +++++++++---------- ...java => JavaRegressionMetricsExample.java} | 11 +++--- .../mllib/multi_label_metrics_example.py | 1 - .../python/mllib/ranking_metrics_example.py | 2 +- .../mllib/regression_metrics_example.py | 1 - .../BinaryClassificationMetricsExample.scala | 5 +-- .../mllib/MultiLabelMetricsExample.scala | 9 ++--- .../mllib/MulticlassMetricsExample.scala | 4 +- .../mllib/RankingMetricsExample.scala | 5 ++- .../mllib/RegressionMetricsExample.scala | 
6 +-- 14 files changed, 59 insertions(+), 59 deletions(-) rename examples/src/main/java/org/apache/spark/examples/mllib/{JavaLinearRegressionExample.java => JavaRegressionMetricsExample.java} (93%) diff --git a/docs/mllib-evaluation-metrics.md b/docs/mllib-evaluation-metrics.md index a1afde0dc6635..6924037b941f3 100644 --- a/docs/mllib-evaluation-metrics.md +++ b/docs/mllib-evaluation-metrics.md @@ -589,7 +589,7 @@ Refer to the [`RegressionMetrics` Scala docs](api/scala/index.html#org.apache.sp
Refer to the [`RegressionMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/RegressionMetrics.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java %}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java index d905aa82d5e49..980a9108af53f 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java @@ -14,7 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.spark.examples.mllib; + // $example on$ import scala.Tuple2; @@ -25,21 +27,21 @@ import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.rdd.RDD; // $example off$ import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; public class JavaBinaryClassificationMetricsExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Binary Classification Metrics Example"); + SparkConf conf = new SparkConf().setAppName("Java Binary Classification Metrics Example"); SparkContext sc = new SparkContext(conf); // $example on$ String path = "data/mllib/sample_binary_classification_data.txt"; JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); // Split initial RDD into two... [60% training data, 40% testing data]. 
- JavaRDD[] splits = data.randomSplit(new double[]{0.6, 0.4}, 11L); + JavaRDD[] splits = + data.randomSplit(new double[]{0.6, 0.4}, 11L); JavaRDD training = splits[0].cache(); JavaRDD test = splits[1]; @@ -105,7 +107,7 @@ public Double call(Tuple2 t) { // Save and load model model.save(sc, "target/tmp/LogisticRegressionModel"); LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, - "target/tmp/LogisticRegressionModel"); + "target/tmp/LogisticRegressionModel"); // $example off$ } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java index d6b9178b2c4a0..b54e1ea3f2bcf 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java @@ -14,12 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.spark.examples.mllib; +package org.apache.spark.examples.mllib; +// $example on$ import java.util.Arrays; import java.util.List; -// $example on$ + import scala.Tuple2; import org.apache.spark.api.java.*; @@ -57,11 +58,11 @@ public static void main(String[] args) { // Stats by labels for (int i = 0; i < metrics.labels().length - 1; i++) { System.out.format("Class %1.1f precision = %f\n", metrics.labels()[i], metrics.precision - (metrics.labels()[i])); + (metrics.labels()[i])); System.out.format("Class %1.1f recall = %f\n", metrics.labels()[i], metrics.recall(metrics - .labels()[i])); + .labels()[i])); System.out.format("Class %1.1f F1 score = %f\n", metrics.labels()[i], metrics.f1Measure - (metrics.labels()[i])); + (metrics.labels()[i])); } // Micro stats diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java index 9f3426a0ba6de..21f628fb51b6e 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java @@ -14,7 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.spark.examples.mllib; + // $example on$ import scala.Tuple2; @@ -27,7 +29,6 @@ import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.mllib.linalg.Matrix; // $example off$ -import org.apache.spark.rdd.RDD; import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; @@ -47,7 +48,7 @@ public static void main(String[] args) { // Run training algorithm to build the model. final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() .setNumClasses(3) - .run(training.rdd()); + .run(training.rdd()); // Compute raw scores on the test set. 
JavaRDD> predictionAndLabels = test.map( @@ -74,11 +75,11 @@ public Tuple2 call(LabeledPoint p) { // Stats by labels for (int i = 0; i < metrics.labels().length; i++) { System.out.format("Class %f precision = %f\n", metrics.labels()[i],metrics.precision - (metrics.labels()[i])); + (metrics.labels()[i])); System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics - .labels()[i])); + .labels()[i])); System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure - (metrics.labels()[i])); + (metrics.labels()[i])); } //Weighted stats @@ -90,7 +91,7 @@ public Tuple2 call(LabeledPoint p) { // Save and load model model.save(sc, "target/tmp/LogisticRegressionModel"); LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, - "target/tmp/LogisticRegressionModel"); + "target/tmp/LogisticRegressionModel"); // $example off$ } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java index 2f64a58dceeb2..7c4c97e74681f 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java @@ -15,7 +15,6 @@ * limitations under the License. 
*/ -// scalastyle:off println package org.apache.spark.examples.mllib; // $example on$ @@ -31,12 +30,11 @@ import org.apache.spark.mllib.recommendation.MatrixFactorizationModel; import org.apache.spark.mllib.recommendation.Rating; // $example off$ -import org.apache.spark.rdd.RDD; import org.apache.spark.SparkConf; public class JavaRankingMetricsExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Ranking Metrics Example"); + SparkConf conf = new SparkConf().setAppName("Java Ranking Metrics Example"); JavaSparkContext sc = new JavaSparkContext(conf); // $example on$ String path = "data/mllib/sample_movielens_data.txt"; @@ -46,7 +44,7 @@ public static void main(String[] args) { public Rating call(String line) { String[] parts = line.split("::"); return new Rating(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]), Double - .parseDouble(parts[2]) - 2.5); + .parseDouble(parts[2]) - 2.5); } } ); @@ -77,9 +75,9 @@ public Tuple2 call(Tuple2 t) { public Rating call(Rating r) { double binaryRating; if (r.rating() > 0.0) { - binaryRating = 1.0; + binaryRating = 1.0; } else { - binaryRating = 0.0; + binaryRating = 0.0; } return new Rating(r.user(), r.product(), binaryRating); } @@ -88,11 +86,11 @@ public Rating call(Rating r) { // Group ratings by common user JavaPairRDD> userMovies = binarizedRatings.groupBy( - new Function() { - public Object call(Rating r) { - return r.user(); - } - } + new Function() { + public Object call(Rating r) { + return r.user(); + } + } ); // Get true relevant documents from all user ratings @@ -123,7 +121,7 @@ public List call(Rating[] docs) { } ); JavaRDD, List>> relevantDocs = userMoviesList.join - (userRecommendedList).values(); + (userRecommendedList).values(); // Instantiate the metrics object RankingMetrics metrics = RankingMetrics.of(relevantDocs); @@ -156,14 +154,14 @@ public Tuple2, Object> call(Rating r) { } )); JavaRDD> ratesAndPreds = - JavaPairRDD.fromJavaRDD(ratings.map( - new 
Function, Object>>() { - public Tuple2, Object> call(Rating r) { - return new Tuple2, Object>( - new Tuple2(r.user(), r.product()), r.rating()); - } - } - )).join(predictions).values(); + JavaPairRDD.fromJavaRDD(ratings.map( + new Function, Object>>() { + public Tuple2, Object> call(Rating r) { + return new Tuple2, Object>( + new Tuple2(r.user(), r.product()), r.rating()); + } + } + )).join(predictions).values(); // Create regression metrics object RegressionMetrics regressionMetrics = new RegressionMetrics(ratesAndPreds.rdd()); diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java similarity index 93% rename from examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java index 5f4e2d74f04ef..d2efc6bf97776 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java @@ -14,7 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.spark.examples.mllib; + // $example on$ import scala.Tuple2; @@ -28,10 +30,9 @@ import org.apache.spark.SparkConf; // $example off$ -// Read in the ratings data -public class JavaLinearRegressionExample { +public class JavaRegressionMetricsExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Linear Regression Example"); + SparkConf conf = new SparkConf().setAppName("Java Regression Metrics Example"); JavaSparkContext sc = new JavaSparkContext(conf); // $example on$ // Load and parse the data @@ -53,7 +54,7 @@ public LabeledPoint call(String line) { // Building the model int numIterations = 100; final LinearRegressionModel model = LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), - numIterations); + numIterations); // Evaluate model on training examples and compute training error JavaRDD> valuesAndPreds = parsedData.map( @@ -84,7 +85,7 @@ public Tuple2 call(LabeledPoint point) { // Save and load model model.save(sc.sc(), "target/tmp/LogisticRegressionModel"); LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), - "target/tmp/LogisticRegressionModel"); + "target/tmp/LogisticRegressionModel"); // $example off$ } } diff --git a/examples/src/main/python/mllib/multi_label_metrics_example.py b/examples/src/main/python/mllib/multi_label_metrics_example.py index f293ce8e309e9..960ade6597379 100644 --- a/examples/src/main/python/mllib/multi_label_metrics_example.py +++ b/examples/src/main/python/mllib/multi_label_metrics_example.py @@ -18,7 +18,6 @@ # $example on$ from pyspark.mllib.evaluation import MultilabelMetrics # $example off$ -from pyspark.mllib.util import MLUtils from pyspark import SparkContext if __name__ == "__main__": diff --git a/examples/src/main/python/mllib/ranking_metrics_example.py b/examples/src/main/python/mllib/ranking_metrics_example.py index deefbd23c5b94..327791966c901 100644 --- a/examples/src/main/python/mllib/ranking_metrics_example.py +++ 
b/examples/src/main/python/mllib/ranking_metrics_example.py @@ -26,7 +26,7 @@ # Several of the methods available in scala are currently missing from pyspark # $example on$ - # Read in the ratings data + # Read in the ratings data lines = sc.textFile("data/mllib/sample_movielens_data.txt") def parseLine(line): diff --git a/examples/src/main/python/mllib/regression_metrics_example.py b/examples/src/main/python/mllib/regression_metrics_example.py index a3e9c12dbc4ea..89f44f5dc097d 100644 --- a/examples/src/main/python/mllib/regression_metrics_example.py +++ b/examples/src/main/python/mllib/regression_metrics_example.py @@ -26,7 +26,6 @@ sc = SparkContext(appName="Regression Metrics Example") # $example on$ # Load and parse the data - def parsePoint(line): values = line.split() return LabeledPoint(float(values[0]), diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala index 466c84ed2dec7..13a37827ab935 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala @@ -17,15 +17,14 @@ // scalastyle:off println package org.apache.spark.examples.mllib + // $example on$ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils // $example off$ - import org.apache.spark.{SparkContext, SparkConf} -import org.apache.spark.sql.SQLContext object BinaryClassificationMetricsExample { @@ -33,8 +32,6 @@ object BinaryClassificationMetricsExample { val conf = new SparkConf().setAppName("BinaryClassificationMetricsExample") val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) - import 
sqlContext.implicits._ // $example on$ // Load training data in LIBSVM format val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala index 035f74490b3fe..4503c15360adc 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala @@ -14,26 +14,25 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + // scalastyle:off println package org.apache.spark.examples.mllib + // $example on$ import org.apache.spark.mllib.evaluation.MultilabelMetrics -import org.apache.spark.rdd.RDD; +import org.apache.spark.rdd.RDD // $example off$ -import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkContext, SparkConf} object MultiLabelMetricsExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("MultiLabelMetricsExample") val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) - import sqlContext.implicits._ // $example on$ val scoreAndLabels: RDD[(Array[Double], Array[Double])] = sc.parallelize( Seq((Array(0.0, 1.0), Array(0.0, 2.0)), (Array(0.0, 2.0), Array(0.0, 1.0)), - (Array(), Array(0.0)), + (Array.empty[Double], Array(0.0)), (Array(2.0), Array(2.0)), (Array(2.0, 0.0), Array(2.0, 0.0)), (Array(0.0, 1.0, 2.0), Array(0.0, 1.0)), diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala index a3d29a5a091b6..0904449245989 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala +++ 
b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala @@ -14,8 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + // scalastyle:off println package org.apache.spark.examples.mllib + // $example on$ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.evaluation.MulticlassMetrics @@ -27,7 +29,7 @@ import org.apache.spark.{SparkContext, SparkConf} object MulticlassMetricsExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("MulticlassMetrics") + val conf = new SparkConf().setAppName("MulticlassMetricsExample") val sc = new SparkContext(conf) // $example on$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala index 9fe933bf53931..cffa03d5cc9f4 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala @@ -14,15 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + // scalastyle:off println package org.apache.spark.examples.mllib -import org.apache.spark.sql.SQLContext -import org.apache.spark.{SparkContext, SparkConf} // $example on$ import org.apache.spark.mllib.evaluation.{RegressionMetrics, RankingMetrics} import org.apache.spark.mllib.recommendation.{ALS, Rating} // $example off$ +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkContext, SparkConf} object RankingMetricsExample { def main(args: Array[String]) { diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala index fcab66f0df2e5..d29a3c86cfc27 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala @@ -15,12 +15,12 @@ * limitations under the License. */ // scalastyle:off println + package org.apache.spark.examples.mllib + +// $example on$ // $example on$ -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.regression.LinearRegressionModel import org.apache.spark.mllib.regression.LinearRegressionWithSGD -import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.mllib.util.MLUtils // $example off$ From 88512e7ff1f1d55f31a5c12b57668216d39b22b9 Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Thu, 19 Nov 2015 16:10:34 +0530 Subject: [PATCH 13/13] removed extra lines & fixed style issues --- .../main/python/mllib/binary_classification_metrics_example.py | 1 - examples/src/main/python/mllib/regression_metrics_example.py | 1 + .../apache/spark/examples/mllib/RegressionMetricsExample.scala | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/src/main/python/mllib/binary_classification_metrics_example.py 
b/examples/src/main/python/mllib/binary_classification_metrics_example.py index 38b557108c2ea..437acb998acc3 100644 --- a/examples/src/main/python/mllib/binary_classification_metrics_example.py +++ b/examples/src/main/python/mllib/binary_classification_metrics_example.py @@ -23,7 +23,6 @@ # $example on$ from pyspark.mllib.classification import LogisticRegressionWithLBFGS from pyspark.mllib.evaluation import BinaryClassificationMetrics -from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.util import MLUtils # $example off$ diff --git a/examples/src/main/python/mllib/regression_metrics_example.py b/examples/src/main/python/mllib/regression_metrics_example.py index 89f44f5dc097d..a3a83aafd7a1f 100644 --- a/examples/src/main/python/mllib/regression_metrics_example.py +++ b/examples/src/main/python/mllib/regression_metrics_example.py @@ -24,6 +24,7 @@ if __name__ == "__main__": sc = SparkContext(appName="Regression Metrics Example") + # $example on$ # Load and parse the data def parsePoint(line): diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala index d29a3c86cfc27..47d44532521ca 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala @@ -18,7 +18,6 @@ package org.apache.spark.examples.mllib -// $example on$ // $example on$ import org.apache.spark.mllib.regression.LinearRegressionWithSGD import org.apache.spark.mllib.evaluation.RegressionMetrics