From 377d7a9fff84929bc086471656a8ba47561e8b17 Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Mon, 9 Nov 2015 12:09:17 +0530 Subject: [PATCH 01/13] Initial commit: added all files --- docs/mllib-evaluation-metrics.md | 940 +----------------- .../mllib/JavaBinaryClassification.java | 113 +++ .../examples/mllib/JavaLinearRegression.java | 90 ++ .../mllib/JavaMultiLabelClassification.java | 77 ++ .../mllib/JavaMulticlassClassification.java | 92 ++ .../spark/examples/mllib/JavaRanking.java | 175 ++++ .../mllib/binary_classification_metrics.py | 63 ++ .../main/python/mllib/multi_class_metrics.py | 69 ++ .../main/python/mllib/multi_label_metrics.py | 63 ++ .../src/main/python/mllib/ranking_metrics.py | 54 + .../main/python/mllib/regression_metrics.py | 55 + .../mllib/BinaryClassificationMetrics.scala | 109 ++ .../spark/examples/mllib/MultiLabelMetrics.scala | 69 ++ .../examples/mllib/MulticlassMetrics.scala | 103 ++ .../spark/examples/mllib/RankingMetrics.scala | 102 ++ .../examples/mllib/RegressionMetrics.scala | 66 ++ 16 files changed, 1315 insertions(+), 925 deletions(-) create mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java create mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java create mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java create mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java create mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java create mode 100644 examples/src/main/python/mllib/binary_classification_metrics.py create mode 100644 examples/src/main/python/mllib/multi_class_metrics.py create mode 100644 examples/src/main/python/mllib/multi_label_metrics.py create mode 100644 examples/src/main/python/mllib/ranking_metrics.py create mode 100644 examples/src/main/python/mllib/regression_metrics.py create mode 100644 
examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala create mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala diff --git a/docs/mllib-evaluation-metrics.md b/docs/mllib-evaluation-metrics.md index f73eff637dc36..2991249161046 100644 --- a/docs/mllib-evaluation-metrics.md +++ b/docs/mllib-evaluation-metrics.md @@ -104,214 +104,21 @@ data, and evaluate the performance of the algorithm by several binary evaluation
Refer to the [`LogisticRegressionWithLBFGS` Scala docs](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS) and [`BinaryClassificationMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.BinaryClassificationMetrics) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS -import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.util.MLUtils - -// Load training data in LIBSVM format -val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt") - -// Split data into training (60%) and test (40%) -val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L) -training.cache() - -// Run training algorithm to build the model -val model = new LogisticRegressionWithLBFGS() - .setNumClasses(2) - .run(training) - -// Clear the prediction threshold so the model will return probabilities -model.clearThreshold - -// Compute raw scores on the test set -val predictionAndLabels = test.map { case LabeledPoint(label, features) => - val prediction = model.predict(features) - (prediction, label) -} - -// Instantiate metrics object -val metrics = new BinaryClassificationMetrics(predictionAndLabels) - -// Precision by threshold -val precision = metrics.precisionByThreshold -precision.foreach { case (t, p) => - println(s"Threshold: $t, Precision: $p") -} - -// Recall by threshold -val recall = metrics.recallByThreshold -recall.foreach { case (t, r) => - println(s"Threshold: $t, Recall: $r") -} - -// Precision-Recall Curve -val PRC = metrics.pr - -// F-measure -val f1Score = metrics.fMeasureByThreshold -f1Score.foreach { case (t, f) => - println(s"Threshold: $t, F-score: $f, Beta = 1") -} - -val beta = 0.5 -val fScore = metrics.fMeasureByThreshold(beta) -f1Score.foreach { case (t, f) => - println(s"Threshold: $t, F-score: $f, 
Beta = 0.5") -} - -// AUPRC -val auPRC = metrics.areaUnderPR -println("Area under precision-recall curve = " + auPRC) - -// Compute thresholds used in ROC and PR curves -val thresholds = precision.map(_._1) - -// ROC Curve -val roc = metrics.roc - -// AUROC -val auROC = metrics.areaUnderROC -println("Area under ROC = " + auROC) - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala %}
Refer to the [`LogisticRegressionModel` Java docs](api/java/org/apache/spark/mllib/classification/LogisticRegressionModel.html) and [`LogisticRegressionWithLBFGS` Java docs](api/java/org/apache/spark/mllib/classification/LogisticRegressionWithLBFGS.html) for details on the API. -{% highlight java %} -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.rdd.RDD; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.mllib.classification.LogisticRegressionModel; -import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS; -import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; - -public class BinaryClassification { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Binary Classification Metrics"); - SparkContext sc = new SparkContext(conf); - String path = "data/mllib/sample_binary_classification_data.txt"; - JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); - - // Split initial RDD into two... [60% training data, 40% testing data]. - JavaRDD[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L); - JavaRDD training = splits[0].cache(); - JavaRDD test = splits[1]; - - // Run training algorithm to build the model. - final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() - .setNumClasses(2) - .run(training.rdd()); - - // Clear the prediction threshold so the model will return probabilities - model.clearThreshold(); - - // Compute raw scores on the test set. - JavaRDD> predictionAndLabels = test.map( - new Function>() { - public Tuple2 call(LabeledPoint p) { - Double prediction = model.predict(p.features()); - return new Tuple2(prediction, p.label()); - } - } - ); - - // Get evaluation metrics. 
- BinaryClassificationMetrics metrics = new BinaryClassificationMetrics(predictionAndLabels.rdd()); - - // Precision by threshold - JavaRDD> precision = metrics.precisionByThreshold().toJavaRDD(); - System.out.println("Precision by threshold: " + precision.toArray()); - - // Recall by threshold - JavaRDD> recall = metrics.recallByThreshold().toJavaRDD(); - System.out.println("Recall by threshold: " + recall.toArray()); - - // F Score by threshold - JavaRDD> f1Score = metrics.fMeasureByThreshold().toJavaRDD(); - System.out.println("F1 Score by threshold: " + f1Score.toArray()); - - JavaRDD> f2Score = metrics.fMeasureByThreshold(2.0).toJavaRDD(); - System.out.println("F2 Score by threshold: " + f2Score.toArray()); - - // Precision-recall curve - JavaRDD> prc = metrics.pr().toJavaRDD(); - System.out.println("Precision-recall curve: " + prc.toArray()); - - // Thresholds - JavaRDD thresholds = precision.map( - new Function, Double>() { - public Double call (Tuple2 t) { - return new Double(t._1().toString()); - } - } - ); - - // ROC Curve - JavaRDD> roc = metrics.roc().toJavaRDD(); - System.out.println("ROC curve: " + roc.toArray()); - - // AUPRC - System.out.println("Area under precision-recall curve = " + metrics.areaUnderPR()); - - // AUROC - System.out.println("Area under ROC = " + metrics.areaUnderROC()); - - // Save and load model - model.save(sc, "myModelPath"); - LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); - } -} - -{% endhighlight %} +{% include_example java/org/apache/spark/examples/mllib/JavaBinaryClassification.java %}
Refer to the [`BinaryClassificationMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.BinaryClassificationMetrics) and [`LogisticRegressionWithLBFGS` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.classification.LogisticRegressionWithLBFGS) for more details on the API. -{% highlight python %} -from pyspark.mllib.classification import LogisticRegressionWithLBFGS -from pyspark.mllib.evaluation import BinaryClassificationMetrics -from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.util import MLUtils - -# Several of the methods available in scala are currently missing from pyspark - -# Load training data in LIBSVM format -data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt") - -# Split data into training (60%) and test (40%) -training, test = data.randomSplit([0.6, 0.4], seed = 11L) -training.cache() - -# Run training algorithm to build the model -model = LogisticRegressionWithLBFGS.train(training) - -# Compute raw scores on the test set -predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label)) - -# Instantiate metrics object -metrics = BinaryClassificationMetrics(predictionAndLabels) - -# Area under precision-recall curve -print("Area under PR = %s" % metrics.areaUnderPR) - -# Area under ROC curve -print("Area under ROC = %s" % metrics.areaUnderROC) - -{% endhighlight %} - +{% include_example python/mllib/binary_classification_metrics.py %}
@@ -433,204 +240,21 @@ the data, and evaluate the performance of the algorithm by several multiclass cl
Refer to the [`MulticlassMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.MulticlassMetrics) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS -import org.apache.spark.mllib.evaluation.MulticlassMetrics -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.util.MLUtils - -// Load training data in LIBSVM format -val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt") - -// Split data into training (60%) and test (40%) -val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L) -training.cache() - -// Run training algorithm to build the model -val model = new LogisticRegressionWithLBFGS() - .setNumClasses(3) - .run(training) - -// Compute raw scores on the test set -val predictionAndLabels = test.map { case LabeledPoint(label, features) => - val prediction = model.predict(features) - (prediction, label) -} - -// Instantiate metrics object -val metrics = new MulticlassMetrics(predictionAndLabels) - -// Confusion matrix -println("Confusion matrix:") -println(metrics.confusionMatrix) - -// Overall Statistics -val precision = metrics.precision -val recall = metrics.recall // same as true positive rate -val f1Score = metrics.fMeasure -println("Summary Statistics") -println(s"Precision = $precision") -println(s"Recall = $recall") -println(s"F1 Score = $f1Score") - -// Precision by label -val labels = metrics.labels -labels.foreach { l => - println(s"Precision($l) = " + metrics.precision(l)) -} - -// Recall by label -labels.foreach { l => - println(s"Recall($l) = " + metrics.recall(l)) -} - -// False positive rate by label -labels.foreach { l => - println(s"FPR($l) = " + metrics.falsePositiveRate(l)) -} - -// F-measure by label -labels.foreach { l => - println(s"F1-Score($l) = " + metrics.fMeasure(l)) -} - -// Weighted stats -println(s"Weighted precision: ${metrics.weightedPrecision}") 
-println(s"Weighted recall: ${metrics.weightedRecall}") -println(s"Weighted F1 score: ${metrics.weightedFMeasure}") -println(s"Weighted false positive rate: ${metrics.weightedFalsePositiveRate}") - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala %}
Refer to the [`MulticlassMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/MulticlassMetrics.html) for details on the API. -{% highlight java %} -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.rdd.RDD; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.mllib.classification.LogisticRegressionModel; -import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS; -import org.apache.spark.mllib.evaluation.MulticlassMetrics; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.mllib.linalg.Matrix; -import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; - -public class MulticlassClassification { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Multiclass Classification Metrics"); - SparkContext sc = new SparkContext(conf); - String path = "data/mllib/sample_multiclass_classification_data.txt"; - JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); - - // Split initial RDD into two... [60% training data, 40% testing data]. - JavaRDD[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L); - JavaRDD training = splits[0].cache(); - JavaRDD test = splits[1]; - - // Run training algorithm to build the model. - final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() - .setNumClasses(3) - .run(training.rdd()); - - // Compute raw scores on the test set. - JavaRDD> predictionAndLabels = test.map( - new Function>() { - public Tuple2 call(LabeledPoint p) { - Double prediction = model.predict(p.features()); - return new Tuple2(prediction, p.label()); - } - } - ); - - // Get evaluation metrics. 
- MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd()); - - // Confusion matrix - Matrix confusion = metrics.confusionMatrix(); - System.out.println("Confusion matrix: \n" + confusion); - - // Overall statistics - System.out.println("Precision = " + metrics.precision()); - System.out.println("Recall = " + metrics.recall()); - System.out.println("F1 Score = " + metrics.fMeasure()); - - // Stats by labels - for (int i = 0; i < metrics.labels().length; i++) { - System.out.format("Class %f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])); - System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])); - System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(metrics.labels()[i])); - } - - //Weighted stats - System.out.format("Weighted precision = %f\n", metrics.weightedPrecision()); - System.out.format("Weighted recall = %f\n", metrics.weightedRecall()); - System.out.format("Weighted F1 score = %f\n", metrics.weightedFMeasure()); - System.out.format("Weighted false positive rate = %f\n", metrics.weightedFalsePositiveRate()); - - // Save and load model - model.save(sc, "myModelPath"); - LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); - } -} - -{% endhighlight %} +{% include_example java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java %}
Refer to the [`MulticlassMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.MulticlassMetrics) for more details on the API. -{% highlight python %} -from pyspark.mllib.classification import LogisticRegressionWithLBFGS -from pyspark.mllib.util import MLUtils -from pyspark.mllib.evaluation import MulticlassMetrics - -# Load training data in LIBSVM format -data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt") - -# Split data into training (60%) and test (40%) -training, test = data.randomSplit([0.6, 0.4], seed = 11L) -training.cache() - -# Run training algorithm to build the model -model = LogisticRegressionWithLBFGS.train(training, numClasses=3) - -# Compute raw scores on the test set -predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label)) - -# Instantiate metrics object -metrics = MulticlassMetrics(predictionAndLabels) - -# Overall statistics -precision = metrics.precision() -recall = metrics.recall() -f1Score = metrics.fMeasure() -print("Summary Stats") -print("Precision = %s" % precision) -print("Recall = %s" % recall) -print("F1 Score = %s" % f1Score) - -# Statistics by class -labels = data.map(lambda lp: lp.label).distinct().collect() -for label in sorted(labels): - print("Class %s precision = %s" % (label, metrics.precision(label))) - print("Class %s recall = %s" % (label, metrics.recall(label))) - print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0))) - -# Weighted stats -print("Weighted recall = %s" % metrics.weightedRecall) -print("Weighted precision = %s" % metrics.weightedPrecision) -print("Weighted F(1) Score = %s" % metrics.weightedFMeasure()) -print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5)) -print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate) -{% endhighlight %} +{% include_example python/mllib/multi_class_metrics.py %}
@@ -766,154 +390,21 @@ True classes:
Refer to the [`MultilabelMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.MultilabelMetrics) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.evaluation.MultilabelMetrics -import org.apache.spark.rdd.RDD; - -val scoreAndLabels: RDD[(Array[Double], Array[Double])] = sc.parallelize( - Seq((Array(0.0, 1.0), Array(0.0, 2.0)), - (Array(0.0, 2.0), Array(0.0, 1.0)), - (Array(), Array(0.0)), - (Array(2.0), Array(2.0)), - (Array(2.0, 0.0), Array(2.0, 0.0)), - (Array(0.0, 1.0, 2.0), Array(0.0, 1.0)), - (Array(1.0), Array(1.0, 2.0))), 2) - -// Instantiate metrics object -val metrics = new MultilabelMetrics(scoreAndLabels) - -// Summary stats -println(s"Recall = ${metrics.recall}") -println(s"Precision = ${metrics.precision}") -println(s"F1 measure = ${metrics.f1Measure}") -println(s"Accuracy = ${metrics.accuracy}") - -// Individual label stats -metrics.labels.foreach(label => println(s"Class $label precision = ${metrics.precision(label)}")) -metrics.labels.foreach(label => println(s"Class $label recall = ${metrics.recall(label)}")) -metrics.labels.foreach(label => println(s"Class $label F1-score = ${metrics.f1Measure(label)}")) - -// Micro stats -println(s"Micro recall = ${metrics.microRecall}") -println(s"Micro precision = ${metrics.microPrecision}") -println(s"Micro F1 measure = ${metrics.microF1Measure}") - -// Hamming loss -println(s"Hamming loss = ${metrics.hammingLoss}") - -// Subset accuracy -println(s"Subset accuracy = ${metrics.subsetAccuracy}") - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala %}
Refer to the [`MultilabelMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/MultilabelMetrics.html) for details on the API. -{% highlight java %} -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.rdd.RDD; -import org.apache.spark.mllib.evaluation.MultilabelMetrics; -import org.apache.spark.SparkConf; -import java.util.Arrays; -import java.util.List; - -public class MultilabelClassification { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Multilabel Classification Metrics"); - JavaSparkContext sc = new JavaSparkContext(conf); - - List> data = Arrays.asList( - new Tuple2(new double[]{0.0, 1.0}, new double[]{0.0, 2.0}), - new Tuple2(new double[]{0.0, 2.0}, new double[]{0.0, 1.0}), - new Tuple2(new double[]{}, new double[]{0.0}), - new Tuple2(new double[]{2.0}, new double[]{2.0}), - new Tuple2(new double[]{2.0, 0.0}, new double[]{2.0, 0.0}), - new Tuple2(new double[]{0.0, 1.0, 2.0}, new double[]{0.0, 1.0}), - new Tuple2(new double[]{1.0}, new double[]{1.0, 2.0}) - ); - JavaRDD> scoreAndLabels = sc.parallelize(data); - - // Instantiate metrics object - MultilabelMetrics metrics = new MultilabelMetrics(scoreAndLabels.rdd()); - - // Summary stats - System.out.format("Recall = %f\n", metrics.recall()); - System.out.format("Precision = %f\n", metrics.precision()); - System.out.format("F1 measure = %f\n", metrics.f1Measure()); - System.out.format("Accuracy = %f\n", metrics.accuracy()); - - // Stats by labels - for (int i = 0; i < metrics.labels().length - 1; i++) { - System.out.format("Class %1.1f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])); - System.out.format("Class %1.1f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])); - System.out.format("Class %1.1f F1 score = %f\n", metrics.labels()[i], metrics.f1Measure(metrics.labels()[i])); - } - - // Micro stats - System.out.format("Micro recall = %f\n", metrics.microRecall()); - 
System.out.format("Micro precision = %f\n", metrics.microPrecision()); - System.out.format("Micro F1 measure = %f\n", metrics.microF1Measure()); - - // Hamming loss - System.out.format("Hamming loss = %f\n", metrics.hammingLoss()); - - // Subset accuracy - System.out.format("Subset accuracy = %f\n", metrics.subsetAccuracy()); - - } -} - -{% endhighlight %} +{% include_example java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java %}
Refer to the [`MultilabelMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.MultilabelMetrics) for more details on the API. -{% highlight python %} -from pyspark.mllib.evaluation import MultilabelMetrics - -scoreAndLabels = sc.parallelize([ - ([0.0, 1.0], [0.0, 2.0]), - ([0.0, 2.0], [0.0, 1.0]), - ([], [0.0]), - ([2.0], [2.0]), - ([2.0, 0.0], [2.0, 0.0]), - ([0.0, 1.0, 2.0], [0.0, 1.0]), - ([1.0], [1.0, 2.0])]) - -# Instantiate metrics object -metrics = MultilabelMetrics(scoreAndLabels) - -# Summary stats -print("Recall = %s" % metrics.recall()) -print("Precision = %s" % metrics.precision()) -print("F1 measure = %s" % metrics.f1Measure()) -print("Accuracy = %s" % metrics.accuracy) - -# Individual label stats -labels = scoreAndLabels.flatMap(lambda x: x[1]).distinct().collect() -for label in labels: - print("Class %s precision = %s" % (label, metrics.precision(label))) - print("Class %s recall = %s" % (label, metrics.recall(label))) - print("Class %s F1 Measure = %s" % (label, metrics.f1Measure(label))) - -# Micro stats -print("Micro precision = %s" % metrics.microPrecision) -print("Micro recall = %s" % metrics.microRecall) -print("Micro F1 measure = %s" % metrics.microF1Measure) - -# Hamming loss -print("Hamming loss = %s" % metrics.hammingLoss) - -# Subset accuracy -print("Subset accuracy = %s" % metrics.subsetAccuracy) - -{% endhighlight %} +{% include_example python/mllib/multi_label_metrics.py %}
@@ -1027,280 +518,21 @@ expanded world of non-positive weights are "the same as never having interacted
Refer to the [`RegressionMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.RegressionMetrics) and [`RankingMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.RankingMetrics) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.evaluation.{RegressionMetrics, RankingMetrics} -import org.apache.spark.mllib.recommendation.{ALS, Rating} - -// Read in the ratings data -val ratings = sc.textFile("data/mllib/sample_movielens_data.txt").map { line => - val fields = line.split("::") - Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble - 2.5) -}.cache() - -// Map ratings to 1 or 0, 1 indicating a movie that should be recommended -val binarizedRatings = ratings.map(r => Rating(r.user, r.product, if (r.rating > 0) 1.0 else 0.0)).cache() - -// Summarize ratings -val numRatings = ratings.count() -val numUsers = ratings.map(_.user).distinct().count() -val numMovies = ratings.map(_.product).distinct().count() -println(s"Got $numRatings ratings from $numUsers users on $numMovies movies.") - -// Build the model -val numIterations = 10 -val rank = 10 -val lambda = 0.01 -val model = ALS.train(ratings, rank, numIterations, lambda) - -// Define a function to scale ratings from 0 to 1 -def scaledRating(r: Rating): Rating = { - val scaledRating = math.max(math.min(r.rating, 1.0), 0.0) - Rating(r.user, r.product, scaledRating) -} - -// Get sorted top ten predictions for each user and then scale from [0, 1] -val userRecommended = model.recommendProductsForUsers(10).map{ case (user, recs) => - (user, recs.map(scaledRating)) -} - -// Assume that any movie a user rated 3 or higher (which maps to a 1) is a relevant document -// Compare with top ten most relevant documents -val userMovies = binarizedRatings.groupBy(_.user) -val relevantDocuments = userMovies.join(userRecommended).map{ case (user, (actual, predictions)) => - (predictions.map(_.product), actual.filter(_.rating > 0.0).map(_.product).toArray) -} - 
-// Instantiate metrics object -val metrics = new RankingMetrics(relevantDocuments) - -// Precision at K -Array(1, 3, 5).foreach{ k => - println(s"Precision at $k = ${metrics.precisionAt(k)}") -} - -// Mean average precision -println(s"Mean average precision = ${metrics.meanAveragePrecision}") - -// Normalized discounted cumulative gain -Array(1, 3, 5).foreach{ k => - println(s"NDCG at $k = ${metrics.ndcgAt(k)}") -} - -// Get predictions for each data point -val allPredictions = model.predict(ratings.map(r => (r.user, r.product))).map(r => ((r.user, r.product), r.rating)) -val allRatings = ratings.map(r => ((r.user, r.product), r.rating)) -val predictionsAndLabels = allPredictions.join(allRatings).map{ case ((user, product), (predicted, actual)) => - (predicted, actual) -} - -// Get the RMSE using regression metrics -val regressionMetrics = new RegressionMetrics(predictionsAndLabels) -println(s"RMSE = ${regressionMetrics.rootMeanSquaredError}") - -// R-squared -println(s"R-squared = ${regressionMetrics.r2}") - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/RankingMetrics.scala %}
Refer to the [`RegressionMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/RegressionMetrics.html) and [`RankingMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/RankingMetrics.html) for details on the API. -{% highlight java %} -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.rdd.RDD; -import org.apache.spark.mllib.recommendation.MatrixFactorizationModel; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.Function; -import java.util.*; -import org.apache.spark.mllib.evaluation.RegressionMetrics; -import org.apache.spark.mllib.evaluation.RankingMetrics; -import org.apache.spark.mllib.recommendation.ALS; -import org.apache.spark.mllib.recommendation.Rating; - -// Read in the ratings data -public class Ranking { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Ranking Metrics"); - JavaSparkContext sc = new JavaSparkContext(conf); - String path = "data/mllib/sample_movielens_data.txt"; - JavaRDD data = sc.textFile(path); - JavaRDD ratings = data.map( - new Function() { - public Rating call(String line) { - String[] parts = line.split("::"); - return new Rating(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]), Double.parseDouble(parts[2]) - 2.5); - } - } - ); - ratings.cache(); - - // Train an ALS model - final MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), 10, 10, 0.01); - - // Get top 10 recommendations for every user and scale ratings from 0 to 1 - JavaRDD> userRecs = model.recommendProductsForUsers(10).toJavaRDD(); - JavaRDD> userRecsScaled = userRecs.map( - new Function, Tuple2>() { - public Tuple2 call(Tuple2 t) { - Rating[] scaledRatings = new Rating[t._2().length]; - for (int i = 0; i < scaledRatings.length; i++) { - double newRating = Math.max(Math.min(t._2()[i].rating(), 1.0), 0.0); - scaledRatings[i] = new Rating(t._2()[i].user(), t._2()[i].product(), newRating); - } - return new Tuple2(t._1(), 
scaledRatings); - } - } - ); - JavaPairRDD userRecommended = JavaPairRDD.fromJavaRDD(userRecsScaled); - - // Map ratings to 1 or 0, 1 indicating a movie that should be recommended - JavaRDD binarizedRatings = ratings.map( - new Function() { - public Rating call(Rating r) { - double binaryRating; - if (r.rating() > 0.0) { - binaryRating = 1.0; - } - else { - binaryRating = 0.0; - } - return new Rating(r.user(), r.product(), binaryRating); - } - } - ); - - // Group ratings by common user - JavaPairRDD> userMovies = binarizedRatings.groupBy( - new Function() { - public Object call(Rating r) { - return r.user(); - } - } - ); - - // Get true relevant documents from all user ratings - JavaPairRDD> userMoviesList = userMovies.mapValues( - new Function, List>() { - public List call(Iterable docs) { - List products = new ArrayList(); - for (Rating r : docs) { - if (r.rating() > 0.0) { - products.add(r.product()); - } - } - return products; - } - } - ); - - // Extract the product id from each recommendation - JavaPairRDD> userRecommendedList = userRecommended.mapValues( - new Function>() { - public List call(Rating[] docs) { - List products = new ArrayList(); - for (Rating r : docs) { - products.add(r.product()); - } - return products; - } - } - ); - JavaRDD, List>> relevantDocs = userMoviesList.join(userRecommendedList).values(); - - // Instantiate the metrics object - RankingMetrics metrics = RankingMetrics.of(relevantDocs); - - // Precision and NDCG at k - Integer[] kVector = {1, 3, 5}; - for (Integer k : kVector) { - System.out.format("Precision at %d = %f\n", k, metrics.precisionAt(k)); - System.out.format("NDCG at %d = %f\n", k, metrics.ndcgAt(k)); - } - - // Mean average precision - System.out.format("Mean average precision = %f\n", metrics.meanAveragePrecision()); - - // Evaluate the model using numerical ratings and regression metrics - JavaRDD> userProducts = ratings.map( - new Function>() { - public Tuple2 call(Rating r) { - return new Tuple2(r.user(), 
r.product()); - } - } - ); - JavaPairRDD, Object> predictions = JavaPairRDD.fromJavaRDD( - model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map( - new Function, Object>>() { - public Tuple2, Object> call(Rating r){ - return new Tuple2, Object>( - new Tuple2(r.user(), r.product()), r.rating()); - } - } - )); - JavaRDD> ratesAndPreds = - JavaPairRDD.fromJavaRDD(ratings.map( - new Function, Object>>() { - public Tuple2, Object> call(Rating r){ - return new Tuple2, Object>( - new Tuple2(r.user(), r.product()), r.rating()); - } - } - )).join(predictions).values(); - - // Create regression metrics object - RegressionMetrics regressionMetrics = new RegressionMetrics(ratesAndPreds.rdd()); - - // Root mean squared error - System.out.format("RMSE = %f\n", regressionMetrics.rootMeanSquaredError()); - - // R-squared - System.out.format("R-squared = %f\n", regressionMetrics.r2()); - } -} - -{% endhighlight %} +{% include_example java/org/apache/spark/examples/mllib/JavaRanking.java %}
Refer to the [`RegressionMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RegressionMetrics) and [`RankingMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics) for more details on the API. -{% highlight python %} -from pyspark.mllib.recommendation import ALS, Rating -from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics - -# Read in the ratings data -lines = sc.textFile("data/mllib/sample_movielens_data.txt") - -def parseLine(line): - fields = line.split("::") - return Rating(int(fields[0]), int(fields[1]), float(fields[2]) - 2.5) -ratings = lines.map(lambda r: parseLine(r)) - -# Train a model on to predict user-product ratings -model = ALS.train(ratings, 10, 10, 0.01) - -# Get predicted ratings on all existing user-product pairs -testData = ratings.map(lambda p: (p.user, p.product)) -predictions = model.predictAll(testData).map(lambda r: ((r.user, r.product), r.rating)) - -ratingsTuple = ratings.map(lambda r: ((r.user, r.product), r.rating)) -scoreAndLabels = predictions.join(ratingsTuple).map(lambda tup: tup[1]) - -# Instantiate regression metrics to compare predicted and actual ratings -metrics = RegressionMetrics(scoreAndLabels) - -# Root mean sqaured error -print("RMSE = %s" % metrics.rootMeanSquaredError) - -# R-squared -print("R-squared = %s" % metrics.r2) - -{% endhighlight %} +{% include_example python/mllib/ranking_metrics.py %}
@@ -1350,163 +582,21 @@ and evaluate the performance of the algorithm by several regression metrics.
Refer to the [`RegressionMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.RegressionMetrics) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.regression.LinearRegressionModel -import org.apache.spark.mllib.regression.LinearRegressionWithSGD -import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.evaluation.RegressionMetrics -import org.apache.spark.mllib.util.MLUtils - -// Load the data -val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_linear_regression_data.txt").cache() - -// Build the model -val numIterations = 100 -val model = LinearRegressionWithSGD.train(data, numIterations) - -// Get predictions -val valuesAndPreds = data.map{ point => - val prediction = model.predict(point.features) - (prediction, point.label) -} - -// Instantiate metrics object -val metrics = new RegressionMetrics(valuesAndPreds) - -// Squared error -println(s"MSE = ${metrics.meanSquaredError}") -println(s"RMSE = ${metrics.rootMeanSquaredError}") - -// R-squared -println(s"R-squared = ${metrics.r2}") - -// Mean absolute error -println(s"MAE = ${metrics.meanAbsoluteError}") - -// Explained variance -println(s"Explained variance = ${metrics.explainedVariance}") - -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/RegressionMetrics.scala %}
Refer to the [`RegressionMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/RegressionMetrics.html) for details on the API. -{% highlight java %} -import scala.Tuple2; - -import org.apache.spark.api.java.*; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.mllib.linalg.Vectors; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.regression.LinearRegressionModel; -import org.apache.spark.mllib.regression.LinearRegressionWithSGD; -import org.apache.spark.mllib.evaluation.RegressionMetrics; -import org.apache.spark.SparkConf; - -public class LinearRegression { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Linear Regression Example"); - JavaSparkContext sc = new JavaSparkContext(conf); - - // Load and parse the data - String path = "data/mllib/sample_linear_regression_data.txt"; - JavaRDD data = sc.textFile(path); - JavaRDD parsedData = data.map( - new Function() { - public LabeledPoint call(String line) { - String[] parts = line.split(" "); - double[] v = new double[parts.length - 1]; - for (int i = 1; i < parts.length - 1; i++) - v[i - 1] = Double.parseDouble(parts[i].split(":")[1]); - return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v)); - } - } - ); - parsedData.cache(); - - // Building the model - int numIterations = 100; - final LinearRegressionModel model = - LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations); - - // Evaluate model on training examples and compute training error - JavaRDD> valuesAndPreds = parsedData.map( - new Function>() { - public Tuple2 call(LabeledPoint point) { - double prediction = model.predict(point.features()); - return new Tuple2(prediction, point.label()); - } - } - ); - - // Instantiate metrics object - RegressionMetrics metrics = new RegressionMetrics(valuesAndPreds.rdd()); - - // Squared error - System.out.format("MSE = %f\n", metrics.meanSquaredError()); - 
System.out.format("RMSE = %f\n", metrics.rootMeanSquaredError()); - - // R-squared - System.out.format("R Squared = %f\n", metrics.r2()); - - // Mean absolute error - System.out.format("MAE = %f\n", metrics.meanAbsoluteError()); - - // Explained variance - System.out.format("Explained Variance = %f\n", metrics.explainedVariance()); - - // Save and load model - model.save(sc.sc(), "myModelPath"); - LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), "myModelPath"); - } -} - -{% endhighlight %} +{% include_example java/org/apache/spark/examples/mllib/JavaLinearRegression.java %}
Refer to the [`RegressionMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RegressionMetrics) for more details on the API. -{% highlight python %} -from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD -from pyspark.mllib.evaluation import RegressionMetrics -from pyspark.mllib.linalg import DenseVector - -# Load and parse the data -def parsePoint(line): - values = line.split() - return LabeledPoint(float(values[0]), DenseVector([float(x.split(':')[1]) for x in values[1:]])) - -data = sc.textFile("data/mllib/sample_linear_regression_data.txt") -parsedData = data.map(parsePoint) - -# Build the model -model = LinearRegressionWithSGD.train(parsedData) - -# Get predictions -valuesAndPreds = parsedData.map(lambda p: (float(model.predict(p.features)), p.label)) - -# Instantiate metrics object -metrics = RegressionMetrics(valuesAndPreds) - -# Squared Error -print("MSE = %s" % metrics.meanSquaredError) -print("RMSE = %s" % metrics.rootMeanSquaredError) - -# R-squared -print("R-squared = %s" % metrics.r2) - -# Mean absolute error -print("MAE = %s" % metrics.meanAbsoluteError) - -# Explained variance -print("Explained variance = %s" % metrics.explainedVariance) - -{% endhighlight %} +{% include_example python/mllib/regression_metrics.py %}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java new file mode 100644 index 0000000000000..45da1fec120ab --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package org.apache.spark.examples.mllib; + +// $example on$ + +import scala.Tuple2; + +import org.apache.spark.api.java.*; +import org.apache.spark.rdd.RDD; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.mllib.classification.LogisticRegressionModel; +import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS; +import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.SparkContext; +// $example off$ + +public class JavaBinaryClassification { + public static void main(String[] args) { + + SparkConf conf = new SparkConf().setAppName("Binary Classification Metrics"); + SparkContext sc = new SparkContext(conf); + String path = "data/mllib/sample_binary_classification_data.txt"; + JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); + + // Split initial RDD into two... [60% training data, 40% testing data]. + JavaRDD[] splits = data.randomSplit(new double[]{0.6, 0.4}, 11L); + JavaRDD training = splits[0].cache(); + JavaRDD test = splits[1]; + + // Run training algorithm to build the model. + final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() + .setNumClasses(2) + .run(training.rdd()); + + // Clear the prediction threshold so the model will return probabilities + model.clearThreshold(); + + // Compute raw scores on the test set. + JavaRDD> predictionAndLabels = test.map( + new Function>() { + public Tuple2 call(LabeledPoint p) { + Double prediction = model.predict(p.features()); + return new Tuple2(prediction, p.label()); + } + } + ); + + // Get evaluation metrics. 
+ BinaryClassificationMetrics metrics = new BinaryClassificationMetrics(predictionAndLabels.rdd()); + + // Precision by threshold + JavaRDD> precision = metrics.precisionByThreshold().toJavaRDD(); + System.out.println("Precision by threshold: " + precision.toArray()); + + // Recall by threshold + JavaRDD> recall = metrics.recallByThreshold().toJavaRDD(); + System.out.println("Recall by threshold: " + recall.toArray()); + + // F Score by threshold + JavaRDD> f1Score = metrics.fMeasureByThreshold().toJavaRDD(); + System.out.println("F1 Score by threshold: " + f1Score.toArray()); + + JavaRDD> f2Score = metrics.fMeasureByThreshold(2.0).toJavaRDD(); + System.out.println("F2 Score by threshold: " + f2Score.toArray()); + + // Precision-recall curve + JavaRDD> prc = metrics.pr().toJavaRDD(); + System.out.println("Precision-recall curve: " + prc.toArray()); + + // Thresholds + JavaRDD thresholds = precision.map( + new Function, Double>() { + public Double call(Tuple2 t) { + return new Double(t._1().toString()); + } + } + ); + + // ROC Curve + JavaRDD> roc = metrics.roc().toJavaRDD(); + System.out.println("ROC curve: " + roc.toArray()); + + // AUPRC + System.out.println("Area under precision-recall curve = " + metrics.areaUnderPR()); + + // AUROC + System.out.println("Area under ROC = " + metrics.areaUnderROC()); + + // Save and load model + model.save(sc, "myModelPath"); + LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); + } +} diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java new file mode 100644 index 0000000000000..309efced045b6 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib; + +// $example on$ + +import org.apache.spark.api.java.*; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.regression.LinearRegressionModel; +import org.apache.spark.mllib.regression.LinearRegressionWithSGD; +import org.apache.spark.mllib.evaluation.RegressionMetrics; +import org.apache.spark.SparkConf; +// $example off$ + +// Read in the ratings data +public class JavaLinearRegression { + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("Linear Regression Example"); + JavaSparkContext sc = new JavaSparkContext(conf); + + // Load and parse the data + String path = "data/mllib/sample_linear_regression_data.txt"; + JavaRDD data = sc.textFile(path); + JavaRDD parsedData = data.map( + new Function() { + public LabeledPoint call(String line) { + String[] parts = line.split(" "); + double[] v = new double[parts.length - 1]; + for (int i = 1; i < parts.length - 1; i++) + v[i - 1] = Double.parseDouble(parts[i].split(":")[1]); + return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v)); + } + } + ); + parsedData.cache(); + + // 
Building the model + int numIterations = 100; + final LinearRegressionModel model = + LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations); + + // Evaluate model on training examples and compute training error + JavaRDD> valuesAndPreds = parsedData.map( + new Function>() { + public Tuple2 call(LabeledPoint point) { + double prediction = model.predict(point.features()); + return new Tuple2(prediction, point.label()); + } + } + ); + + // Instantiate metrics object + RegressionMetrics metrics = new RegressionMetrics(valuesAndPreds.rdd()); + + // Squared error + System.out.format("MSE = %f\n", metrics.meanSquaredError()); + System.out.format("RMSE = %f\n", metrics.rootMeanSquaredError()); + + // R-squared + System.out.format("R Squared = %f\n", metrics.r2()); + + // Mean absolute error + System.out.format("MAE = %f\n", metrics.meanAbsoluteError()); + + // Explained variance + System.out.format("Explained Variance = %f\n", metrics.explainedVariance()); + + // Save and load model + model.save(sc.sc(), "myModelPath"); + LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), "myModelPath"); + } +} diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java new file mode 100644 index 0000000000000..5b3a03f0830b1 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib; + +// $example on$ +import scala.Tuple2; + +import org.apache.spark.api.java.*; +import org.apache.spark.rdd.RDD; +import org.apache.spark.mllib.evaluation.MultilabelMetrics; +import org.apache.spark.SparkConf; +import org.apache.spark.SparkContext; +import java.util.Arrays; +import java.util.List; +// $example off$ + +public class JavaMultiLabelClassification { + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("Multilabel Classification Metrics"); + JavaSparkContext sc = new JavaSparkContext(conf); + + List> data = Arrays.asList( + new Tuple2(new double[]{0.0, 1.0}, new double[]{0.0, 2.0}), + new Tuple2(new double[]{0.0, 2.0}, new double[]{0.0, 1.0}), + new Tuple2(new double[]{}, new double[]{0.0}), + new Tuple2(new double[]{2.0}, new double[]{2.0}), + new Tuple2(new double[]{2.0, 0.0}, new double[]{2.0, 0.0}), + new Tuple2(new double[]{0.0, 1.0, 2.0}, new double[]{0.0, 1.0}), + new Tuple2(new double[]{1.0}, new double[]{1.0, 2.0}) + ); + JavaRDD> scoreAndLabels = sc.parallelize(data); + + // Instantiate metrics object + MultilabelMetrics metrics = new MultilabelMetrics(scoreAndLabels.rdd()); + + // Summary stats + System.out.format("Recall = %f\n", metrics.recall()); + System.out.format("Precision = %f\n", metrics.precision()); + System.out.format("F1 measure = %f\n", metrics.f1Measure()); + System.out.format("Accuracy = %f\n", metrics.accuracy()); + + // Stats by labels + for (int i = 0; i < metrics.labels().length - 1; i++) { + 
System.out.format("Class %1.1f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])); + System.out.format("Class %1.1f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])); + System.out.format("Class %1.1f F1 score = %f\n", metrics.labels()[i], metrics.f1Measure(metrics.labels()[i])); + } + + // Micro stats + System.out.format("Micro recall = %f\n", metrics.microRecall()); + System.out.format("Micro precision = %f\n", metrics.microPrecision()); + System.out.format("Micro F1 measure = %f\n", metrics.microF1Measure()); + + // Hamming loss + System.out.format("Hamming loss = %f\n", metrics.hammingLoss()); + + // Subset accuracy + System.out.format("Subset accuracy = %f\n", metrics.subsetAccuracy()); + + } +} diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java new file mode 100644 index 0000000000000..a5e92df358d14 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package org.apache.spark.examples.mllib; + +import scala.Tuple2; + +import org.apache.spark.api.java.*; +import org.apache.spark.rdd.RDD; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.mllib.classification.LogisticRegressionModel; +import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS; +import org.apache.spark.mllib.evaluation.MulticlassMetrics; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.mllib.linalg.Matrix; +import org.apache.spark.SparkConf; +import org.apache.spark.SparkContext; + + +public class JavaMulticlassClassification { + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("Multi class Classification Metrics"); + SparkContext sc = new SparkContext(conf); + String path = "data/mllib/sample_multiclass_classification_data.txt"; + JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); + + // Split initial RDD into two... [60% training data, 40% testing data]. + JavaRDD[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L); + JavaRDD training = splits[0].cache(); + JavaRDD test = splits[1]; + + // Run training algorithm to build the model. + final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() + .setNumClasses(3) + .run(training.rdd()); + + // Compute raw scores on the test set. + JavaRDD> predictionAndLabels = test.map( + new Function>() { + public Tuple2 call(LabeledPoint p) { + Double prediction = model.predict(p.features()); + return new Tuple2(prediction, p.label()); + } + } + ); + + // Get evaluation metrics. 
+ MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd()); + + // Confusion matrix + Matrix confusion = metrics.confusionMatrix(); + System.out.println("Confusion matrix: \n" + confusion); + + // Overall statistics + System.out.println("Precision = " + metrics.precision()); + System.out.println("Recall = " + metrics.recall()); + System.out.println("F1 Score = " + metrics.fMeasure()); + + // Stats by labels + for (int i = 0; i < metrics.labels().length; i++) { + System.out.format("Class %f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])); + System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])); + System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(metrics.labels()[i])); + } + + //Weighted stats + System.out.format("Weighted precision = %f\n", metrics.weightedPrecision()); + System.out.format("Weighted recall = %f\n", metrics.weightedRecall()); + System.out.format("Weighted F1 score = %f\n", metrics.weightedFMeasure()); + System.out.format("Weighted false positive rate = %f\n", metrics.weightedFalsePositiveRate()); + + // Save and load model + model.save(sc, "myModelPath"); + LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); + } +} \ No newline at end of file diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java new file mode 100644 index 0000000000000..2162cc658f193 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java @@ -0,0 +1,175 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib; + +// $example on$ +import scala.Tuple2; + +import org.apache.spark.api.java.*; +import org.apache.spark.rdd.RDD; +import org.apache.spark.mllib.recommendation.MatrixFactorizationModel; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.Function; +import java.util.*; +import org.apache.spark.mllib.evaluation.RegressionMetrics; +import org.apache.spark.mllib.evaluation.RankingMetrics; +import org.apache.spark.mllib.recommendation.ALS; +import org.apache.spark.mllib.recommendation.Rating; +// $example off$ + +// Read in the ratings data +public class JavaRanking { + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("Ranking Metrics"); + JavaSparkContext sc = new JavaSparkContext(conf); + String path = "data/mllib/sample_movielens_data.txt"; + JavaRDD data = sc.textFile(path); + JavaRDD ratings = data.map( + new Function() { + public Rating call(String line) { + String[] parts = line.split("::"); + return new Rating(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]), Double.parseDouble(parts[2]) - 2.5); + } + } + ); + ratings.cache(); + + // Train an ALS model + final MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), 10, 10, 0.01); + + // Get top 10 recommendations for every user and scale ratings from 0 to 1 + JavaRDD> userRecs = 
model.recommendProductsForUsers(10).toJavaRDD(); + JavaRDD> userRecsScaled = userRecs.map( + new Function, Tuple2>() { + public Tuple2 call(Tuple2 t) { + Rating[] scaledRatings = new Rating[t._2().length]; + for (int i = 0; i < scaledRatings.length; i++) { + double newRating = Math.max(Math.min(t._2()[i].rating(), 1.0), 0.0); + scaledRatings[i] = new Rating(t._2()[i].user(), t._2()[i].product(), newRating); + } + return new Tuple2(t._1(), scaledRatings); + } + } + ); + JavaPairRDD userRecommended = JavaPairRDD.fromJavaRDD(userRecsScaled); + + // Map ratings to 1 or 0, 1 indicating a movie that should be recommended + JavaRDD binarizedRatings = ratings.map( + new Function() { + public Rating call(Rating r) { + double binaryRating; + if (r.rating() > 0.0) { + binaryRating = 1.0; + } + else { + binaryRating = 0.0; + } + return new Rating(r.user(), r.product(), binaryRating); + } + } + ); + + // Group ratings by common user + JavaPairRDD> userMovies = binarizedRatings.groupBy( + new Function() { + public Object call(Rating r) { + return r.user(); + } + } + ); + + // Get true relevant documents from all user ratings + JavaPairRDD> userMoviesList = userMovies.mapValues( + new Function, List>() { + public List call(Iterable docs) { + List products = new ArrayList(); + for (Rating r : docs) { + if (r.rating() > 0.0) { + products.add(r.product()); + } + } + return products; + } + } + ); + + // Extract the product id from each recommendation + JavaPairRDD> userRecommendedList = userRecommended.mapValues( + new Function>() { + public List call(Rating[] docs) { + List products = new ArrayList(); + for (Rating r : docs) { + products.add(r.product()); + } + return products; + } + } + ); + JavaRDD, List>> relevantDocs = userMoviesList.join(userRecommendedList).values(); + + // Instantiate the metrics object + RankingMetrics metrics = RankingMetrics.of(relevantDocs); + + // Precision and NDCG at k + Integer[] kVector = {1, 3, 5}; + for (Integer k : kVector) { + 
System.out.format("Precision at %d = %f\n", k, metrics.precisionAt(k)); + System.out.format("NDCG at %d = %f\n", k, metrics.ndcgAt(k)); + } + + // Mean average precision + System.out.format("Mean average precision = %f\n", metrics.meanAveragePrecision()); + + // Evaluate the model using numerical ratings and regression metrics + JavaRDD> userProducts = ratings.map( + new Function>() { + public Tuple2 call(Rating r) { + return new Tuple2(r.user(), r.product()); + } + } + ); + JavaPairRDD, Object> predictions = JavaPairRDD.fromJavaRDD( + model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map( + new Function, Object>>() { + public Tuple2, Object> call(Rating r){ + return new Tuple2, Object>( + new Tuple2(r.user(), r.product()), r.rating()); + } + } + )); + JavaRDD> ratesAndPreds = + JavaPairRDD.fromJavaRDD(ratings.map( + new Function, Object>>() { + public Tuple2, Object> call(Rating r){ + return new Tuple2, Object>( + new Tuple2(r.user(), r.product()), r.rating()); + } + } + )).join(predictions).values(); + + // Create regression metrics object + RegressionMetrics regressionMetrics = new RegressionMetrics(ratesAndPreds.rdd()); + + // Root mean squared error + System.out.format("RMSE = %f\n", regressionMetrics.rootMeanSquaredError()); + + // R-squared + System.out.format("R-squared = %f\n", regressionMetrics.r2()); + } +} diff --git a/examples/src/main/python/mllib/binary_classification_metrics.py b/examples/src/main/python/mllib/binary_classification_metrics.py new file mode 100644 index 0000000000000..9155b02083b0c --- /dev/null +++ b/examples/src/main/python/mllib/binary_classification_metrics.py @@ -0,0 +1,63 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +""" +Binary Classification Metrics Example. +""" +from __future__ import print_function + +import sys + +# $example on$ +from pyspark import SparkContext,SQLContext +from pyspark.mllib.classification import LogisticRegressionWithLBFGS +from pyspark.mllib.evaluation import BinaryClassificationMetrics +from pyspark.mllib.regression import LabeledPoint +from pyspark.mllib.util import MLUtils +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="BinaryClassificationMetrics") + sqlContext = SQLContext(sc) + +# Several of the methods available in scala are currently missing from pyspark + +# $example on$ +# Load training data in LIBSVM format +data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt") + +# Split data into training (60%) and test (40%) +training, test = data.randomSplit([0.6, 0.4], seed = 11L) +training.cache() + +# Run training algorithm to build the model +model = LogisticRegressionWithLBFGS.train(training) + +# Compute raw scores on the test set +predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label)) + +# Instantiate metrics object +metrics = BinaryClassificationMetrics(predictionAndLabels) + +# Area under precision-recall curve +print("Area under PR = %s" % metrics.areaUnderPR) + +# Area under ROC curve +print("Area under ROC = %s" % metrics.areaUnderROC) +# $example off$ + diff --git 
a/examples/src/main/python/mllib/multi_class_metrics.py b/examples/src/main/python/mllib/multi_class_metrics.py new file mode 100644 index 0000000000000..07c0f462b188f --- /dev/null +++ b/examples/src/main/python/mllib/multi_class_metrics.py @@ -0,0 +1,69 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# $example on$ + +from pyspark.mllib.classification import LogisticRegressionWithLBFGS +from pyspark.mllib.util import MLUtils +from pyspark.mllib.evaluation import MulticlassMetrics + +# $example off$ +from pyspark import SparkContext + +if __name__ == "__main__": + sc = SparkContext(appName="MultiClassMetrics") + +# Several of the methods available in scala are currently missing from pyspark + +# Load training data in LIBSVM format +data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt") + +# Split data into training (60%) and test (40%) +training, test = data.randomSplit([0.6, 0.4], seed = 11L) +training.cache() + +# Run training algorithm to build the model +model = LogisticRegressionWithLBFGS.train(training, numClasses=3) + +# Compute raw scores on the test set +predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label)) + +# Instantiate metrics object +metrics = MulticlassMetrics(predictionAndLabels) + +# Overall statistics +precision = metrics.precision() +recall = metrics.recall() +f1Score = metrics.fMeasure() +print("Summary Stats") +print("Precision = %s" % precision) +print("Recall = %s" % recall) +print("F1 Score = %s" % f1Score) + +# Statistics by class +labels = data.map(lambda lp: lp.label).distinct().collect() +for label in sorted(labels): + print("Class %s precision = %s" % (label, metrics.precision(label))) + print("Class %s recall = %s" % (label, metrics.recall(label))) + print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0))) + +# Weighted stats +print("Weighted recall = %s" % metrics.weightedRecall) +print("Weighted precision = %s" % metrics.weightedPrecision) +print("Weighted F(1) Score = %s" % metrics.weightedFMeasure()) +print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5)) +print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate) \ No newline at end of file diff --git 
a/examples/src/main/python/mllib/multi_label_metrics.py b/examples/src/main/python/mllib/multi_label_metrics.py new file mode 100644 index 0000000000000..93b7e1a0cce2e --- /dev/null +++ b/examples/src/main/python/mllib/multi_label_metrics.py @@ -0,0 +1,63 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# $example on$ + +from pyspark.mllib.evaluation import MultilabelMetrics +from pyspark.mllib.util import MLUtils +from pyspark import SparkContext +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="MultiClassMetrics") +# $example on$ +scoreAndLabels = sc.parallelize([ + ([0.0, 1.0], [0.0, 2.0]), + ([0.0, 2.0], [0.0, 1.0]), + ([], [0.0]), + ([2.0], [2.0]), + ([2.0, 0.0], [2.0, 0.0]), + ([0.0, 1.0, 2.0], [0.0, 1.0]), + ([1.0], [1.0, 2.0])]) + +# Instantiate metrics object +metrics = MultilabelMetrics(scoreAndLabels) + +# Summary stats +print("Recall = %s" % metrics.recall()) +print("Precision = %s" % metrics.precision()) +print("F1 measure = %s" % metrics.f1Measure()) +print("Accuracy = %s" % metrics.accuracy) + +# Individual label stats +labels = scoreAndLabels.flatMap(lambda x: x[1]).distinct().collect() +for label in labels: + print("Class %s precision = %s" % (label, metrics.precision(label))) + print("Class %s recall = %s" % (label, metrics.recall(label))) + print("Class %s F1 Measure = %s" % (label, metrics.f1Measure(label))) + +# Micro stats +print("Micro precision = %s" % metrics.microPrecision) +print("Micro recall = %s" % metrics.microRecall) +print("Micro F1 measure = %s" % metrics.microF1Measure) + +# Hamming loss +print("Hamming loss = %s" % metrics.hammingLoss) + +# Subset accuracy +print("Subset accuracy = %s" % metrics.subsetAccuracy) +# $example off$ \ No newline at end of file diff --git a/examples/src/main/python/mllib/ranking_metrics.py b/examples/src/main/python/mllib/ranking_metrics.py new file mode 100644 index 0000000000000..7f8032ce17028 --- /dev/null +++ b/examples/src/main/python/mllib/ranking_metrics.py @@ -0,0 +1,54 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# $example on$ +from pyspark import SparkContext +from pyspark.mllib.recommendation import ALS, Rating +from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="Ranking Metrics") + +# Several of the methods available in scala are currently missing from pyspark + +# Read in the ratings data +lines = sc.textFile("data/mllib/sample_movielens_data.txt") + +def parseLine(line): + fields = line.split("::") + return Rating(int(fields[0]), int(fields[1]), float(fields[2]) - 2.5) +ratings = lines.map(lambda r: parseLine(r)) + +# Train a model on to predict user-product ratings +model = ALS.train(ratings, 10, 10, 0.01) + +# Get predicted ratings on all existing user-product pairs +testData = ratings.map(lambda p: (p.user, p.product)) +predictions = model.predictAll(testData).map(lambda r: ((r.user, r.product), r.rating)) + +ratingsTuple = ratings.map(lambda r: ((r.user, r.product), r.rating)) +scoreAndLabels = predictions.join(ratingsTuple).map(lambda tup: tup[1]) + +# Instantiate regression metrics to compare predicted and actual ratings +metrics = RegressionMetrics(scoreAndLabels) + +# Root mean sqaured error +print("RMSE = %s" % metrics.rootMeanSquaredError) + +# R-squared +print("R-squared = %s" % metrics.r2) \ No newline at end of file diff --git a/examples/src/main/python/mllib/regression_metrics.py 
b/examples/src/main/python/mllib/regression_metrics.py new file mode 100644 index 0000000000000..601268da546db --- /dev/null +++ b/examples/src/main/python/mllib/regression_metrics.py @@ -0,0 +1,55 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# $example on$ +from pyspark import SparkContext +from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD +from pyspark.mllib.evaluation import RegressionMetrics +from pyspark.mllib.linalg import DenseVector +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="Regression Metrics") + +def parsePoint(line): + values = line.split() + return LabeledPoint(float(values[0]), DenseVector([float(x.split(':')[1]) for x in values[1:]])) + +data = sc.textFile("data/mllib/sample_linear_regression_data.txt") +parsedData = data.map(parsePoint) + +# Build the model +model = LinearRegressionWithSGD.train(parsedData) + +# Get predictions +valuesAndPreds = parsedData.map(lambda p: (float(model.predict(p.features)), p.label)) + +# Instantiate metrics object +metrics = RegressionMetrics(valuesAndPreds) + +# Squared Error +print("MSE = %s" % metrics.meanSquaredError) +print("RMSE = %s" % metrics.rootMeanSquaredError) + +# R-squared +print("R-squared = %s" % 
metrics.r2) + +# Mean absolute error +print("MAE = %s" % metrics.meanAbsoluteError) + +# Explained variance +print("Explained variance = %s" % metrics.explainedVariance) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala new file mode 100644 index 0000000000000..db640ccc4a08e --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkContext, SparkConf} + +// $example on$ +import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS +import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.util.MLUtils +// $example off$ + +object BinaryClassificationMetrics { + + def main(args: Array[String]) { + + val conf = new SparkConf().setAppName("BinaryClassificationMetrics") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + import sqlContext.implicits._ + // $example on$ + // Load training data in LIBSVM format + val data = MLUtils.loadLibSVMFile(sc, "data/mllib/data/mllib/sample_binary_classification_data.txt") + + // Split data into training (60%) and test (40%) + val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L) + training.cache() + + // Run training algorithm to build the model + val model = new LogisticRegressionWithLBFGS() + .setNumClasses(2) + .run(training) + + // Clear the prediction threshold so the model will return probabilities + model.clearThreshold + + // Compute raw scores on the test set + val predictionAndLabels = test.map { case LabeledPoint(label, features) => + val prediction = model.predict(features) + (prediction, label) + } + + // Instantiate metrics object + val metrics = new BinaryClassificationMetrics(predictionAndLabels) + + // Precision by threshold + val precision = metrics.precisionByThreshold + precision.foreach { case (t, p) => + println(s"Threshold: $t, Precision: $p") + } + + // Recall by threshold + val recall = metrics.recallByThreshold + recall.foreach { case (t, r) => + println(s"Threshold: $t, Recall: $r") + } + + // Precision-Recall Curve + val PRC = metrics.pr + + // F-measure + val f1Score = metrics.fMeasureByThreshold + f1Score.foreach { case (t, f) => + 
println(s"Threshold: $t, F-score: $f, Beta = 1") + } + + val beta = 0.5 + val fScore = metrics.fMeasureByThreshold(beta) + f1Score.foreach { case (t, f) => + println(s"Threshold: $t, F-score: $f, Beta = 0.5") + } + + // AUPRC + val auPRC = metrics.areaUnderPR + println("Area under precision-recall curve = " + auPRC) + + // Compute thresholds used in ROC and PR curves + val thresholds = precision.map(_._1) + + // ROC Curve + val roc = metrics.roc + + // AUROC + val auROC = metrics.areaUnderROC + println("Area under ROC = " + auROC) + + // $example off$ + + } +} +// scalastyle:on println \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics new file mode 100644 index 0000000000000..a5fcb145b650e --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkContext, SparkConf} + +import org.apache.spark.mllib.evaluation.MultilabelMetrics +import org.apache.spark.rdd.RDD; +object MultiLabelMetrics { + + def main(args: Array[String]) { + + val conf = new SparkConf().setAppName("MultiLabelMetrics") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + import sqlContext.implicits._ + // $example on$ + val scoreAndLabels: RDD[(Array[Double], Array[Double])] = sc.parallelize( + Seq((Array(0.0, 1.0), Array(0.0, 2.0)), + (Array(0.0, 2.0), Array(0.0, 1.0)), + (Array(), Array(0.0)), + (Array(2.0), Array(2.0)), + (Array(2.0, 0.0), Array(2.0, 0.0)), + (Array(0.0, 1.0, 2.0), Array(0.0, 1.0)), + (Array(1.0), Array(1.0, 2.0))), 2) + + // Instantiate metrics object + val metrics = new MultilabelMetrics(scoreAndLabels) + + // Summary stats + println(s"Recall = ${metrics.recall}") + println(s"Precision = ${metrics.precision}") + println(s"F1 measure = ${metrics.f1Measure}") + println(s"Accuracy = ${metrics.accuracy}") + + // Individual label stats + metrics.labels.foreach(label => println(s"Class $label precision = ${metrics.precision(label)}")) + metrics.labels.foreach(label => println(s"Class $label recall = ${metrics.recall(label)}")) + metrics.labels.foreach(label => println(s"Class $label F1-score = ${metrics.f1Measure(label)}")) + + // Micro stats + println(s"Micro recall = ${metrics.microRecall}") + println(s"Micro precision = ${metrics.microPrecision}") + println(s"Micro F1 measure = ${metrics.microF1Measure}") + + // Hamming loss + println(s"Hamming loss = ${metrics.hammingLoss}") + + // Subset accuracy + println(s"Subset accuracy = ${metrics.subsetAccuracy}") + } +} \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala 
b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala new file mode 100644 index 0000000000000..0ed3c633f19d8 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +// $example on$ +import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS +import org.apache.spark.mllib.evaluation.MulticlassMetrics +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.util.MLUtils +// $example off$ + +import org.apache.spark.{SparkContext, SparkConf} + +object MulticlassMetrics { + + def main(args: Array[String]) { + + val conf = new SparkConf().setAppName("MulticlassMetrics") + val sc = new SparkContext(conf) + + // $example on$ + // Load training data in LIBSVM format + val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt") + + // Split data into training (60%) and test (40%) + val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L) + training.cache() + + // Run training algorithm to build the model + val model = new LogisticRegressionWithLBFGS() + .setNumClasses(3) + .run(training) + + // Compute raw scores on the test set + val predictionAndLabels = test.map { case LabeledPoint(label, features) => + val prediction = model.predict(features) + (prediction, label) + } + + // Instantiate metrics object + val metrics = new MulticlassMetrics(predictionAndLabels) + + // Confusion matrix + println("Confusion matrix:") + println(metrics.confusionMatrix) + + // Overall Statistics + val precision = metrics.precision + val recall = metrics.recall // same as true positive rate + val f1Score = metrics.fMeasure + println("Summary Statistics") + println(s"Precision = $precision") + println(s"Recall = $recall") + println(s"F1 Score = $f1Score") + + // Precision by label + val labels = metrics.labels + labels.foreach { l => + println(s"Precision($l) = " + metrics.precision(l)) + } + + // Recall by label + labels.foreach { l => + println(s"Recall($l) = " + metrics.recall(l)) + } + + // False positive rate by label + labels.foreach { l => + println(s"FPR($l) = " + 
metrics.falsePositiveRate(l)) + } + + // F-measure by label + labels.foreach { l => + println(s"F1-Score($l) = " + metrics.fMeasure(l)) + } + + // Weighted stats + println(s"Weighted precision: ${metrics.weightedPrecision}") + println(s"Weighted recall: ${metrics.weightedRecall}") + println(s"Weighted F1 score: ${metrics.weightedFMeasure}") + println(s"Weighted false positive rate: ${metrics.weightedFalsePositiveRate}") + + // $example off$ + + } +} +// scalastyle:on println \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala new file mode 100644 index 0000000000000..2d6b6455bb5e3 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkContext, SparkConf} +import org.apache.spark.mllib.evaluation.{RegressionMetrics, RankingMetrics} +import org.apache.spark.mllib.recommendation.{ALS, Rating} +object RankingMetrics { + + def main(args: Array[String]) { + + val conf = new SparkConf().setAppName("RankingMetrics") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + import sqlContext.implicits._ + // Read in the ratings data + val ratings = sc.textFile("data/mllib/sample_movielens_data.txt").map { line => + val fields = line.split("::") + Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble - 2.5) + }.cache() + + // Map ratings to 1 or 0, 1 indicating a movie that should be recommended + val binarizedRatings = ratings.map(r => Rating(r.user, r.product, if (r.rating > 0) 1.0 else 0.0)).cache() + + // Summarize ratings + val numRatings = ratings.count() + val numUsers = ratings.map(_.user).distinct().count() + val numMovies = ratings.map(_.product).distinct().count() + println(s"Got $numRatings ratings from $numUsers users on $numMovies movies.") + + // Build the model + val numIterations = 10 + val rank = 10 + val lambda = 0.01 + val model = ALS.train(ratings, rank, numIterations, lambda) + + // Define a function to scale ratings from 0 to 1 + def scaledRating(r: Rating): Rating = { + val scaledRating = math.max(math.min(r.rating, 1.0), 0.0) + Rating(r.user, r.product, scaledRating) + } + + // Get sorted top ten predictions for each user and then scale from [0, 1] + val userRecommended = model.recommendProductsForUsers(10).map { case (user, recs) => + (user, recs.map(scaledRating)) + } + + // Assume that any movie a user rated 3 or higher (which maps to a 1) is a relevant document + // Compare with top ten most relevant documents + val userMovies = binarizedRatings.groupBy(_.user) + val relevantDocuments = 
userMovies.join(userRecommended).map { case (user, (actual, predictions)) => + (predictions.map(_.product), actual.filter(_.rating > 0.0).map(_.product).toArray) + } + + // Instantiate metrics object + val metrics = new RankingMetrics(relevantDocuments) + + // Precision at K + Array(1, 3, 5).foreach { k => + println(s"Precision at $k = ${metrics.precisionAt(k)}") + } + + // Mean average precision + println(s"Mean average precision = ${metrics.meanAveragePrecision}") + + // Normalized discounted cumulative gain + Array(1, 3, 5).foreach { k => + println(s"NDCG at $k = ${metrics.ndcgAt(k)}") + } + + // Get predictions for each data point + val allPredictions = model.predict(ratings.map(r => (r.user, r.product))).map(r => ((r.user, r.product), r.rating)) + val allRatings = ratings.map(r => ((r.user, r.product), r.rating)) + val predictionsAndLabels = allPredictions.join(allRatings).map { case ((user, product), (predicted, actual)) => + (predicted, actual) + } + + // Get the RMSE using regression metrics + val regressionMetrics = new RegressionMetrics(predictionsAndLabels) + println(s"RMSE = ${regressionMetrics.rootMeanSquaredError}") + + // R-squared + println(s"R-squared = ${regressionMetrics.r2}") + } +} \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala new file mode 100644 index 0000000000000..71b1c2d2cda77 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.mllib + +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.regression.LinearRegressionModel +import org.apache.spark.mllib.regression.LinearRegressionWithSGD +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.evaluation.RegressionMetrics +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkConf, SparkContext} + +object RegressionMetrics { + + def main(args: Array[String]) { + + val conf = new SparkConf().setAppName("RegressionMetrics") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + // Load the data + val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_linear_regression_data.txt").cache() + + // Build the model + val numIterations = 100 + val model = LinearRegressionWithSGD.train(data, numIterations) + + // Get predictions + val valuesAndPreds = data.map{ point => + val prediction = model.predict(point.features) + (prediction, point.label) + } + + // Instantiate metrics object + val metrics = new RegressionMetrics(valuesAndPreds) + + // Squared error + println(s"MSE = ${metrics.meanSquaredError}") + println(s"RMSE = ${metrics.rootMeanSquaredError}") + + // R-squared + println(s"R-squared = ${metrics.r2}") + + // Mean absolute error + println(s"MAE = ${metrics.meanAbsoluteError}") + + // Explained variance + println(s"Explained variance = ${metrics.explainedVariance}") + } +} \ No newline at end of file From 
cb9c846ffdb9d8d2f7deb42ecb2e4254caaf2462 Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Fri, 13 Nov 2015 13:35:22 +0530 Subject: [PATCH 02/13] Fixed scala issues --- docs/mllib-evaluation-metrics.md | 2 +- .../mllib/JavaBinaryClassification.java | 4 +- .../examples/mllib/JavaLinearRegression.java | 4 +- .../mllib/JavaMultiLabelClassification.java | 7 +- .../mllib/JavaMulticlassClassification.java | 4 +- .../spark/examples/mllib/JavaRanking.java | 3 +- .../mllib/binary_classification_metrics.py | 40 +++++----- .../main/python/mllib/multi_class_metrics.py | 70 ++++++++--------- .../main/python/mllib/multi_label_metrics.py | 78 +++++++++---------- .../src/main/python/mllib/ranking_metrics.py | 45 +++++------ .../main/python/mllib/regression_metrics.py | 47 +++++------ .../spark/examples/mllib/MultiLabelMetrics | 4 +- .../spark/examples/mllib/RankingMetrics.scala | 4 + .../examples/mllib/RegressionMetrics.scala | 5 +- 14 files changed, 168 insertions(+), 149 deletions(-) diff --git a/docs/mllib-evaluation-metrics.md b/docs/mllib-evaluation-metrics.md index 2991249161046..7a9792c4a1455 100644 --- a/docs/mllib-evaluation-metrics.md +++ b/docs/mllib-evaluation-metrics.md @@ -247,7 +247,7 @@ Refer to the [`MulticlassMetrics` Scala docs](api/scala/index.html#org.apache.sp
Refer to the [`MulticlassMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/MulticlassMetrics.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaMulticlassMetrics.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java %}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java index 45da1fec120ab..b17dc79abff16 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java @@ -19,7 +19,6 @@ package org.apache.spark.examples.mllib; // $example on$ - import scala.Tuple2; import org.apache.spark.api.java.*; @@ -32,7 +31,7 @@ import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; -// $example off$ + public class JavaBinaryClassification { public static void main(String[] args) { @@ -111,3 +110,4 @@ public Double call(Tuple2 t) { LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); } } +// $example off$ \ No newline at end of file diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java index 309efced045b6..cc60409b42859 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java @@ -19,6 +19,7 @@ package org.apache.spark.examples.mllib; // $example on$ +import scala.Tuple2; import org.apache.spark.api.java.*; import org.apache.spark.api.java.function.Function; @@ -28,7 +29,7 @@ import org.apache.spark.mllib.regression.LinearRegressionWithSGD; import org.apache.spark.mllib.evaluation.RegressionMetrics; import org.apache.spark.SparkConf; -// $example off$ + // Read in the ratings data public class JavaLinearRegression { @@ -88,3 +89,4 @@ public Tuple2 call(LabeledPoint point) { LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), "myModelPath"); } } +// $example off$ \ No newline at end of 
file diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java index 5b3a03f0830b1..53204523bc865 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java @@ -25,12 +25,12 @@ import org.apache.spark.rdd.RDD; import org.apache.spark.mllib.evaluation.MultilabelMetrics; import org.apache.spark.SparkConf; -import org.apache.spark.SparkContext; import java.util.Arrays; import java.util.List; // $example off$ - -public class MultilabelClassification { +import org.apache.spark.SparkContext; +// $example on$ +public class JavaMultiLabelClassification { public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("Multilabel Classification Metrics"); JavaSparkContext sc = new JavaSparkContext(conf); @@ -75,3 +75,4 @@ public static void main(String[] args) { } } +// $example off$ \ No newline at end of file diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java index a5e92df358d14..0e74da7a883d1 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java @@ -18,6 +18,7 @@ // scalastyle:off println package org.apache.spark.examples.mllib +// $example on$ import scala.Tuple2; import org.apache.spark.api.java.*; @@ -89,4 +90,5 @@ public Tuple2 call(LabeledPoint p) { model.save(sc, "myModelPath"); LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); } -} \ No newline at end of file +} +// $example off$ \ No newline at end of file diff --git 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java index 2162cc658f193..b389a09c2715f 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java @@ -31,7 +31,7 @@ import org.apache.spark.mllib.evaluation.RankingMetrics; import org.apache.spark.mllib.recommendation.ALS; import org.apache.spark.mllib.recommendation.Rating; -// $example off$ + // Read in the ratings data public class JavaRanking { @@ -173,3 +173,4 @@ public Tuple2, Object> call(Rating r){ System.out.format("R-squared = %f\n", regressionMetrics.r2()); } } +// $example off$ \ No newline at end of file diff --git a/examples/src/main/python/mllib/binary_classification_metrics.py b/examples/src/main/python/mllib/binary_classification_metrics.py index 9155b02083b0c..85583c7e6cfa7 100644 --- a/examples/src/main/python/mllib/binary_classification_metrics.py +++ b/examples/src/main/python/mllib/binary_classification_metrics.py @@ -23,8 +23,9 @@ import sys + +from pyspark import SparkContext, SQLContext # $example on$ -from pyspark import SparkContext,SQLContext from pyspark.mllib.classification import LogisticRegressionWithLBFGS from pyspark.mllib.evaluation import BinaryClassificationMetrics from pyspark.mllib.regression import LabeledPoint @@ -35,29 +36,28 @@ sc = SparkContext(appName="BinaryClassificationMetrics") sqlContext = SQLContext(sc) -# Several of the methods available in scala are currently missing from pyspark - -# $example on$ -# Load training data in LIBSVM format -data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt") + # Several of the methods available in scala are currently missing from pyspark -# Split data into training (60%) and test (40%) -training, test = data.randomSplit([0.6, 0.4], seed = 11L) -training.cache() + # $example on$ + # Load training data in 
LIBSVM format + data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt") -# Run training algorithm to build the model -model = LogisticRegressionWithLBFGS.train(training) + # Split data into training (60%) and test (40%) + training, test = data.randomSplit([0.6, 0.4], seed=11L) + training.cache() -# Compute raw scores on the test set -predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label)) + # Run training algorithm to build the model + model = LogisticRegressionWithLBFGS.train(training) -# Instantiate metrics object -metrics = BinaryClassificationMetrics(predictionAndLabels) + # Compute raw scores on the test set + predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label)) -# Area under precision-recall curve -print("Area under PR = %s" % metrics.areaUnderPR) + # Instantiate metrics object + metrics = BinaryClassificationMetrics(predictionAndLabels) -# Area under ROC curve -print("Area under ROC = %s" % metrics.areaUnderROC) -# $example off$ + # Area under precision-recall curve + print("Area under PR = %s" % metrics.areaUnderPR) + # Area under ROC curve + print("Area under ROC = %s" % metrics.areaUnderROC) + # $example off$ diff --git a/examples/src/main/python/mllib/multi_class_metrics.py b/examples/src/main/python/mllib/multi_class_metrics.py index 07c0f462b188f..7959b7230a563 100644 --- a/examples/src/main/python/mllib/multi_class_metrics.py +++ b/examples/src/main/python/mllib/multi_class_metrics.py @@ -16,54 +16,54 @@ # # $example on$ - from pyspark.mllib.classification import LogisticRegressionWithLBFGS from pyspark.mllib.util import MLUtils from pyspark.mllib.evaluation import MulticlassMetrics - # $example off$ + from pyspark import SparkContext if __name__ == "__main__": sc = SparkContext(appName="MultiClassMetrics") -# Several of the methods available in scala are currently missing from pyspark - -# Load training data in LIBSVM format -data = 
MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt") + # Several of the methods available in scala are currently missing from pyspark + # $example on$ + # Load training data in LIBSVM format + data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt") -# Split data into training (60%) and test (40%) -training, test = data.randomSplit([0.6, 0.4], seed = 11L) -training.cache() + # Split data into training (60%) and test (40%) + training, test = data.randomSplit([0.6, 0.4], seed=11L) + training.cache() -# Run training algorithm to build the model -model = LogisticRegressionWithLBFGS.train(training, numClasses=3) + # Run training algorithm to build the model + model = LogisticRegressionWithLBFGS.train(training, numClasses=3) -# Compute raw scores on the test set -predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label)) + # Compute raw scores on the test set + predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label)) -# Instantiate metrics object -metrics = MulticlassMetrics(predictionAndLabels) + # Instantiate metrics object + metrics = MulticlassMetrics(predictionAndLabels) -# Overall statistics -precision = metrics.precision() -recall = metrics.recall() -f1Score = metrics.fMeasure() -print("Summary Stats") -print("Precision = %s" % precision) -print("Recall = %s" % recall) -print("F1 Score = %s" % f1Score) + # Overall statistics + precision = metrics.precision() + recall = metrics.recall() + f1Score = metrics.fMeasure() + print("Summary Stats") + print("Precision = %s" % precision) + print("Recall = %s" % recall) + print("F1 Score = %s" % f1Score) -# Statistics by class -labels = data.map(lambda lp: lp.label).distinct().collect() -for label in sorted(labels): - print("Class %s precision = %s" % (label, metrics.precision(label))) - print("Class %s recall = %s" % (label, metrics.recall(label))) - print("Class %s F1 Measure = %s" % (label, 
metrics.fMeasure(label, beta=1.0))) + # Statistics by class + labels = data.map(lambda lp: lp.label).distinct().collect() + for label in sorted(labels): + print("Class %s precision = %s" % (label, metrics.precision(label))) + print("Class %s recall = %s" % (label, metrics.recall(label))) + print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0))) -# Weighted stats -print("Weighted recall = %s" % metrics.weightedRecall) -print("Weighted precision = %s" % metrics.weightedPrecision) -print("Weighted F(1) Score = %s" % metrics.weightedFMeasure()) -print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5)) -print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate) \ No newline at end of file + # Weighted stats + print("Weighted recall = %s" % metrics.weightedRecall) + print("Weighted precision = %s" % metrics.weightedPrecision) + print("Weighted F(1) Score = %s" % metrics.weightedFMeasure()) + print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5)) + print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate) + # $example off$ diff --git a/examples/src/main/python/mllib/multi_label_metrics.py b/examples/src/main/python/mllib/multi_label_metrics.py index 93b7e1a0cce2e..d02d8d862d1e1 100644 --- a/examples/src/main/python/mllib/multi_label_metrics.py +++ b/examples/src/main/python/mllib/multi_label_metrics.py @@ -16,48 +16,48 @@ # # $example on$ - from pyspark.mllib.evaluation import MultilabelMetrics +# $example off$ from pyspark.mllib.util import MLUtils from pyspark import SparkContext -# $example off$ + if __name__ == "__main__": sc = SparkContext(appName="MultiClassMetrics") -# $example on$ -scoreAndLabels = sc.parallelize([ - ([0.0, 1.0], [0.0, 2.0]), - ([0.0, 2.0], [0.0, 1.0]), - ([], [0.0]), - ([2.0], [2.0]), - ([2.0, 0.0], [2.0, 0.0]), - ([0.0, 1.0, 2.0], [0.0, 1.0]), - ([1.0], [1.0, 2.0])]) - -# Instantiate metrics object -metrics = MultilabelMetrics(scoreAndLabels) - -# 
Summary stats -print("Recall = %s" % metrics.recall()) -print("Precision = %s" % metrics.precision()) -print("F1 measure = %s" % metrics.f1Measure()) -print("Accuracy = %s" % metrics.accuracy) - -# Individual label stats -labels = scoreAndLabels.flatMap(lambda x: x[1]).distinct().collect() -for label in labels: - print("Class %s precision = %s" % (label, metrics.precision(label))) - print("Class %s recall = %s" % (label, metrics.recall(label))) - print("Class %s F1 Measure = %s" % (label, metrics.f1Measure(label))) - -# Micro stats -print("Micro precision = %s" % metrics.microPrecision) -print("Micro recall = %s" % metrics.microRecall) -print("Micro F1 measure = %s" % metrics.microF1Measure) - -# Hamming loss -print("Hamming loss = %s" % metrics.hammingLoss) - -# Subset accuracy -print("Subset accuracy = %s" % metrics.subsetAccuracy) -# $example off$ \ No newline at end of file + # $example on$ + scoreAndLabels = sc.parallelize([ + ([0.0, 1.0], [0.0, 2.0]), + ([0.0, 2.0], [0.0, 1.0]), + ([], [0.0]), + ([2.0], [2.0]), + ([2.0, 0.0], [2.0, 0.0]), + ([0.0, 1.0, 2.0], [0.0, 1.0]), + ([1.0], [1.0, 2.0])]) + + # Instantiate metrics object + metrics = MultilabelMetrics(scoreAndLabels) + + # Summary stats + print("Recall = %s" % metrics.recall()) + print("Precision = %s" % metrics.precision()) + print("F1 measure = %s" % metrics.f1Measure()) + print("Accuracy = %s" % metrics.accuracy) + + # Individual label stats + labels = scoreAndLabels.flatMap(lambda x: x[1]).distinct().collect() + for label in labels: + print("Class %s precision = %s" % (label, metrics.precision(label))) + print("Class %s recall = %s" % (label, metrics.recall(label))) + print("Class %s F1 Measure = %s" % (label, metrics.f1Measure(label))) + + # Micro stats + print("Micro precision = %s" % metrics.microPrecision) + print("Micro recall = %s" % metrics.microRecall) + print("Micro F1 measure = %s" % metrics.microF1Measure) + + # Hamming loss + print("Hamming loss = %s" % metrics.hammingLoss) + + # Subset 
accuracy + print("Subset accuracy = %s" % metrics.subsetAccuracy) + # $example off$ diff --git a/examples/src/main/python/mllib/ranking_metrics.py b/examples/src/main/python/mllib/ranking_metrics.py index 7f8032ce17028..6fcdf3032d3dc 100644 --- a/examples/src/main/python/mllib/ranking_metrics.py +++ b/examples/src/main/python/mllib/ranking_metrics.py @@ -16,39 +16,40 @@ # # $example on$ -from pyspark import SparkContext from pyspark.mllib.recommendation import ALS, Rating from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics # $example off$ +from pyspark import SparkContext if __name__ == "__main__": sc = SparkContext(appName="Ranking Metrics") -# Several of the methods available in scala are currently missing from pyspark - -# Read in the ratings data -lines = sc.textFile("data/mllib/sample_movielens_data.txt") + # Several of the methods available in scala are currently missing from pyspark + # $example on$ + # Read in the ratings data + lines = sc.textFile("data/mllib/sample_movielens_data.txt") -def parseLine(line): - fields = line.split("::") - return Rating(int(fields[0]), int(fields[1]), float(fields[2]) - 2.5) -ratings = lines.map(lambda r: parseLine(r)) + def parseLine(line): + fields = line.split("::") + return Rating(int(fields[0]), int(fields[1]), float(fields[2]) - 2.5) + ratings = lines.map(lambda r: parseLine(r)) -# Train a model on to predict user-product ratings -model = ALS.train(ratings, 10, 10, 0.01) + # Train a model on to predict user-product ratings + model = ALS.train(ratings, 10, 10, 0.01) -# Get predicted ratings on all existing user-product pairs -testData = ratings.map(lambda p: (p.user, p.product)) -predictions = model.predictAll(testData).map(lambda r: ((r.user, r.product), r.rating)) + # Get predicted ratings on all existing user-product pairs + testData = ratings.map(lambda p: (p.user, p.product)) + predictions = model.predictAll(testData).map(lambda r: ((r.user, r.product), r.rating)) -ratingsTuple = 
ratings.map(lambda r: ((r.user, r.product), r.rating)) -scoreAndLabels = predictions.join(ratingsTuple).map(lambda tup: tup[1]) + ratingsTuple = ratings.map(lambda r: ((r.user, r.product), r.rating)) + scoreAndLabels = predictions.join(ratingsTuple).map(lambda tup: tup[1]) -# Instantiate regression metrics to compare predicted and actual ratings -metrics = RegressionMetrics(scoreAndLabels) + # Instantiate regression metrics to compare predicted and actual ratings + metrics = RegressionMetrics(scoreAndLabels) -# Root mean sqaured error -print("RMSE = %s" % metrics.rootMeanSquaredError) + # Root mean sqaured error + print("RMSE = %s" % metrics.rootMeanSquaredError) -# R-squared -print("R-squared = %s" % metrics.r2) \ No newline at end of file + # R-squared + print("R-squared = %s" % metrics.r2) + # $example off$ diff --git a/examples/src/main/python/mllib/regression_metrics.py b/examples/src/main/python/mllib/regression_metrics.py index 601268da546db..2b90f2457267c 100644 --- a/examples/src/main/python/mllib/regression_metrics.py +++ b/examples/src/main/python/mllib/regression_metrics.py @@ -16,40 +16,43 @@ # # $example on$ -from pyspark import SparkContext from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD from pyspark.mllib.evaluation import RegressionMetrics from pyspark.mllib.linalg import DenseVector # $example off$ +from pyspark import SparkContext + if __name__ == "__main__": sc = SparkContext(appName="Regression Metrics") + # $example on$ + # Load and parse the data + def parsePoint(line): + values = line.split() + return LabeledPoint(float(values[0]),DenseVector([float(x.split(':')[1]) for x in values[1:]])) -def parsePoint(line): - values = line.split() - return LabeledPoint(float(values[0]), DenseVector([float(x.split(':')[1]) for x in values[1:]])) - -data = sc.textFile("data/mllib/sample_linear_regression_data.txt") -parsedData = data.map(parsePoint) + data = sc.textFile("data/mllib/sample_linear_regression_data.txt") + parsedData 
= data.map(parsePoint) -# Build the model -model = LinearRegressionWithSGD.train(parsedData) + # Build the model + model = LinearRegressionWithSGD.train(parsedData) -# Get predictions -valuesAndPreds = parsedData.map(lambda p: (float(model.predict(p.features)), p.label)) + # Get predictions + valuesAndPreds = parsedData.map(lambda p: (float(model.predict(p.features)), p.label)) -# Instantiate metrics object -metrics = RegressionMetrics(valuesAndPreds) + # Instantiate metrics object + metrics = RegressionMetrics(valuesAndPreds) -# Squared Error -print("MSE = %s" % metrics.meanSquaredError) -print("RMSE = %s" % metrics.rootMeanSquaredError) + # Squared Error + print("MSE = %s" % metrics.meanSquaredError) + print("RMSE = %s" % metrics.rootMeanSquaredError) -# R-squared -print("R-squared = %s" % metrics.r2) + # R-squared + print("R-squared = %s" % metrics.r2) -# Mean absolute error -print("MAE = %s" % metrics.meanAbsoluteError) + # Mean absolute error + print("MAE = %s" % metrics.meanAbsoluteError) -# Explained variance -print("Explained variance = %s" % metrics.explainedVariance) + # Explained variance + print("Explained variance = %s" % metrics.explainedVariance) + # $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics index a5fcb145b650e..020b86d2b332c 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics @@ -20,9 +20,10 @@ package org.apache.spark.examples.mllib import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkContext, SparkConf} - +// $example on$ import org.apache.spark.mllib.evaluation.MultilabelMetrics import org.apache.spark.rdd.RDD; +// $example off$ object MultiLabelMetrics { def main(args: Array[String]) { @@ -65,5 +66,6 @@ object MultiLabelMetrics { // Subset accuracy println(s"Subset accuracy = 
${metrics.subsetAccuracy}") + // $example off$ } } \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala index 2d6b6455bb5e3..9a7a25357f596 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala @@ -20,8 +20,10 @@ package org.apache.spark.examples.mllib import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkContext, SparkConf} +// $example on$ import org.apache.spark.mllib.evaluation.{RegressionMetrics, RankingMetrics} import org.apache.spark.mllib.recommendation.{ALS, Rating} +// $example off$ object RankingMetrics { def main(args: Array[String]) { @@ -30,6 +32,7 @@ object RankingMetrics { val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ + // $example on$ // Read in the ratings data val ratings = sc.textFile("data/mllib/sample_movielens_data.txt").map { line => val fields = line.split("::") @@ -98,5 +101,6 @@ object RankingMetrics { // R-squared println(s"R-squared = ${regressionMetrics.r2}") + // $example off$ } } \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala index 71b1c2d2cda77..7dc77caeafa7a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala @@ -17,13 +17,14 @@ // scalastyle:off println package org.apache.spark.examples.mllib - +// $example on$ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.LinearRegressionModel import org.apache.spark.mllib.regression.LinearRegressionWithSGD import 
org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.mllib.util.MLUtils +// $example off$ import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} @@ -34,6 +35,7 @@ object RegressionMetrics { val conf = new SparkConf().setAppName("RegressionMetrics") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) + // $example on$ // Load the data val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_linear_regression_data.txt").cache() @@ -62,5 +64,6 @@ object RegressionMetrics { // Explained variance println(s"Explained variance = ${metrics.explainedVariance}") + // $example off$ } } \ No newline at end of file From ed33687f85833f94f845f27ba361cf8d6dbb0169 Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Fri, 13 Nov 2015 14:31:38 +0530 Subject: [PATCH 03/13] fixed styling issues --- dev/_site/README.md | 5 + dev/_site/audit-release/README.md | 11 + dev/_site/audit-release/audit_release.py | 237 ++++++++ .../audit-release/blank_maven_build/pom.xml | 43 ++ .../audit-release/blank_sbt_build/build.sbt | 30 + .../audit-release/maven_app_core/input.txt | 8 + .../audit-release/maven_app_core/pom.xml | 52 ++ .../src/main/java/SimpleApp.java | 42 ++ .../audit-release/sbt_app_core/build.sbt | 28 + .../audit-release/sbt_app_core/input.txt | 8 + .../src/main/scala/SparkApp.scala | 63 ++ .../audit-release/sbt_app_ganglia/build.sbt | 30 + .../src/main/scala/SparkApp.scala | 41 ++ .../audit-release/sbt_app_graphx/build.sbt | 28 + .../src/main/scala/GraphxApp.scala | 55 ++ .../audit-release/sbt_app_hive/build.sbt | 29 + dev/_site/audit-release/sbt_app_hive/data.txt | 9 + .../sbt_app_hive/src/main/scala/HiveApp.scala | 59 ++ .../audit-release/sbt_app_kinesis/build.sbt | 28 + .../src/main/scala/SparkApp.scala | 35 ++ dev/_site/audit-release/sbt_app_sql/build.sbt | 28 + .../sbt_app_sql/src/main/scala/SqlApp.scala | 61 ++ .../audit-release/sbt_app_streaming/build.sbt | 28 
+ .../src/main/scala/StreamingApp.scala | 65 ++ dev/_site/change-scala-version.sh | 70 +++ dev/_site/change-version-to-2.10.sh | 23 + dev/_site/change-version-to-2.11.sh | 23 + dev/_site/check-license | 85 +++ .../create-release/generate-changelist.py | 148 +++++ .../create-release/generate-contributors.py | 248 ++++++++ dev/_site/create-release/known_translations | 167 ++++++ dev/_site/create-release/release-build.sh | 326 ++++++++++ dev/_site/create-release/release-tag.sh | 79 +++ dev/_site/create-release/releaseutils.py | 260 ++++++++ .../create-release/translate-contributors.py | 253 ++++++++ dev/_site/github_jira_sync.py | 147 +++++ dev/_site/lint-python | 114 ++++ dev/_site/lint-r | 41 ++ dev/_site/lint-r.R | 37 ++ dev/_site/lint-scala | 23 + dev/_site/merge_spark_pr.py | 453 ++++++++++++++ dev/_site/mima | 54 ++ dev/_site/run-tests | 23 + dev/_site/run-tests-jenkins | 28 + dev/_site/run-tests-jenkins.py | 228 +++++++ dev/_site/run-tests.py | 561 ++++++++++++++++++ dev/_site/scalastyle | 34 ++ dev/_site/sparktestsupport/modules.py | 437 ++++++++++++++ dev/_site/sparktestsupport/shellutils.py | 115 ++++ dev/_site/tests/pr_merge_ability.sh | 39 ++ dev/_site/tests/pr_new_dependencies.sh | 117 ++++ dev/_site/tests/pr_public_classes.sh | 65 ++ ...tiLabelMetrics => MultiLabelMetrics.scala} | 0 53 files changed, 5221 insertions(+) create mode 100644 dev/_site/README.md create mode 100644 dev/_site/audit-release/README.md create mode 100755 dev/_site/audit-release/audit_release.py create mode 100644 dev/_site/audit-release/blank_maven_build/pom.xml create mode 100644 dev/_site/audit-release/blank_sbt_build/build.sbt create mode 100644 dev/_site/audit-release/maven_app_core/input.txt create mode 100644 dev/_site/audit-release/maven_app_core/pom.xml create mode 100644 dev/_site/audit-release/maven_app_core/src/main/java/SimpleApp.java create mode 100644 dev/_site/audit-release/sbt_app_core/build.sbt create mode 100644 dev/_site/audit-release/sbt_app_core/input.txt 
create mode 100644 dev/_site/audit-release/sbt_app_core/src/main/scala/SparkApp.scala create mode 100644 dev/_site/audit-release/sbt_app_ganglia/build.sbt create mode 100644 dev/_site/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala create mode 100644 dev/_site/audit-release/sbt_app_graphx/build.sbt create mode 100644 dev/_site/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala create mode 100644 dev/_site/audit-release/sbt_app_hive/build.sbt create mode 100644 dev/_site/audit-release/sbt_app_hive/data.txt create mode 100644 dev/_site/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala create mode 100644 dev/_site/audit-release/sbt_app_kinesis/build.sbt create mode 100644 dev/_site/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala create mode 100644 dev/_site/audit-release/sbt_app_sql/build.sbt create mode 100644 dev/_site/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala create mode 100644 dev/_site/audit-release/sbt_app_streaming/build.sbt create mode 100644 dev/_site/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala create mode 100755 dev/_site/change-scala-version.sh create mode 100755 dev/_site/change-version-to-2.10.sh create mode 100755 dev/_site/change-version-to-2.11.sh create mode 100755 dev/_site/check-license create mode 100755 dev/_site/create-release/generate-changelist.py create mode 100755 dev/_site/create-release/generate-contributors.py create mode 100644 dev/_site/create-release/known_translations create mode 100755 dev/_site/create-release/release-build.sh create mode 100755 dev/_site/create-release/release-tag.sh create mode 100755 dev/_site/create-release/releaseutils.py create mode 100755 dev/_site/create-release/translate-contributors.py create mode 100755 dev/_site/github_jira_sync.py create mode 100755 dev/_site/lint-python create mode 100755 dev/_site/lint-r create mode 100644 dev/_site/lint-r.R create mode 100755 dev/_site/lint-scala create mode 100755 dev/_site/merge_spark_pr.py 
create mode 100755 dev/_site/mima create mode 100755 dev/_site/run-tests create mode 100755 dev/_site/run-tests-jenkins create mode 100755 dev/_site/run-tests-jenkins.py create mode 100755 dev/_site/run-tests.py create mode 100755 dev/_site/scalastyle create mode 100644 dev/_site/sparktestsupport/modules.py create mode 100644 dev/_site/sparktestsupport/shellutils.py create mode 100755 dev/_site/tests/pr_merge_ability.sh create mode 100755 dev/_site/tests/pr_new_dependencies.sh create mode 100755 dev/_site/tests/pr_public_classes.sh rename examples/src/main/scala/org/apache/spark/examples/mllib/{MultiLabelMetrics => MultiLabelMetrics.scala} (100%) diff --git a/dev/_site/README.md b/dev/_site/README.md new file mode 100644 index 0000000000000..2b0f3d8ee8924 --- /dev/null +++ b/dev/_site/README.md @@ -0,0 +1,5 @@ +# Spark Developer Scripts +This directory contains scripts useful to developers when packaging, +testing, or committing to Spark. + +Many of these scripts require Apache credentials to work correctly. diff --git a/dev/_site/audit-release/README.md b/dev/_site/audit-release/README.md new file mode 100644 index 0000000000000..f72f8c653a265 --- /dev/null +++ b/dev/_site/audit-release/README.md @@ -0,0 +1,11 @@ +# Test Application Builds +This directory includes test applications which are built when auditing releases. You can +run them locally by setting appropriate environment variables. + +``` +$ cd sbt_app_core +$ SCALA_VERSION=2.10.5 \ + SPARK_VERSION=1.0.0-SNAPSHOT \ + SPARK_RELEASE_REPOSITORY=file:///home/patrick/.ivy2/local \ + sbt run +``` diff --git a/dev/_site/audit-release/audit_release.py b/dev/_site/audit-release/audit_release.py new file mode 100755 index 0000000000000..27d1dd784ce2e --- /dev/null +++ b/dev/_site/audit-release/audit_release.py @@ -0,0 +1,237 @@ +#!/usr/bin/python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Audits binary and maven artifacts for a Spark release. +# Requires GPG and Maven. +# usage: +# python audit_release.py + +import os +import re +import shutil +import subprocess +import sys +import time +import urllib2 + +# Note: The following variables must be set before use! +RELEASE_URL = "http://people.apache.org/~andrewor14/spark-1.1.1-rc1/" +RELEASE_KEY = "XXXXXXXX" # Your 8-digit hex +RELEASE_REPOSITORY = "https://repository.apache.org/content/repositories/orgapachespark-1033" +RELEASE_VERSION = "1.1.1" +SCALA_VERSION = "2.10.5" +SCALA_BINARY_VERSION = "2.10" + +# Do not set these +LOG_FILE_NAME = "spark_audit_%s" % time.strftime("%h_%m_%Y_%I_%M_%S") +LOG_FILE = open(LOG_FILE_NAME, 'w') +WORK_DIR = "/tmp/audit_%s" % int(time.time()) +MAVEN_CMD = "mvn" +GPG_CMD = "gpg" +SBT_CMD = "sbt -Dsbt.log.noformat=true" + +# Track failures to print them at the end +failures = [] + +# Log a message. Use sparingly because this flushes every write. +def log(msg): + LOG_FILE.write(msg + "\n") + LOG_FILE.flush() + +def log_and_print(msg): + print msg + log(msg) + +# Prompt the user to delete the scratch directory used +def clean_work_files(): + response = raw_input("OK to delete scratch directory '%s'? 
(y/N) " % WORK_DIR) + if response == "y": + shutil.rmtree(WORK_DIR) + +# Run the given command and log its output to the log file +def run_cmd(cmd, exit_on_failure=True): + log("Running command: %s" % cmd) + ret = subprocess.call(cmd, shell=True, stdout=LOG_FILE, stderr=LOG_FILE) + if ret != 0 and exit_on_failure: + log_and_print("Command failed: %s" % cmd) + clean_work_files() + sys.exit(-1) + return ret + +def run_cmd_with_output(cmd): + log_and_print("Running command: %s" % cmd) + return subprocess.check_output(cmd, shell=True, stderr=LOG_FILE) + +# Test if the given condition is successful +# If so, print the pass message; otherwise print the failure message +def test(cond, msg): + return passed(msg) if cond else failed(msg) + +def passed(msg): + log_and_print("[PASSED] %s" % msg) + +def failed(msg): + failures.append(msg) + log_and_print("[**FAILED**] %s" % msg) + +def get_url(url): + return urllib2.urlopen(url).read() + +# If the path exists, prompt the user to delete it +# If the resource is not deleted, abort +def ensure_path_not_present(path): + full_path = os.path.expanduser(path) + if os.path.exists(full_path): + print "Found %s locally." % full_path + response = raw_input("This can interfere with testing published artifacts. OK to delete? (y/N) ") + if response == "y": + shutil.rmtree(full_path) + else: + print "Abort." + sys.exit(-1) + +log_and_print("|-------- Starting Spark audit tests for release %s --------|" % RELEASE_VERSION) +log_and_print("Log output can be found in %s" % LOG_FILE_NAME) + +original_dir = os.getcwd() + +# For each of these modules, we'll test an 'empty' application in sbt and +# maven that links against them. This will catch issues with messed up +# dependencies within those projects. 
+modules = [ + "spark-core", "spark-bagel", "spark-mllib", "spark-streaming", "spark-repl", + "spark-graphx", "spark-streaming-flume", "spark-streaming-kafka", + "spark-streaming-mqtt", "spark-streaming-twitter", "spark-streaming-zeromq", + "spark-catalyst", "spark-sql", "spark-hive", "spark-streaming-kinesis-asl" +] +modules = map(lambda m: "%s_%s" % (m, SCALA_BINARY_VERSION), modules) + +# Check for directories that might interfere with tests +local_ivy_spark = "~/.ivy2/local/org.apache.spark" +cache_ivy_spark = "~/.ivy2/cache/org.apache.spark" +local_maven_kafka = "~/.m2/repository/org/apache/kafka" +local_maven_kafka = "~/.m2/repository/org/apache/spark" +map(ensure_path_not_present, [local_ivy_spark, cache_ivy_spark, local_maven_kafka]) + +# SBT build tests +log_and_print("==== Building SBT modules ====") +os.chdir("blank_sbt_build") +os.environ["SPARK_VERSION"] = RELEASE_VERSION +os.environ["SCALA_VERSION"] = SCALA_VERSION +os.environ["SPARK_RELEASE_REPOSITORY"] = RELEASE_REPOSITORY +os.environ["SPARK_AUDIT_MASTER"] = "local" +for module in modules: + log("==== Building module %s in SBT ====" % module) + os.environ["SPARK_MODULE"] = module + ret = run_cmd("%s clean update" % SBT_CMD, exit_on_failure=False) + test(ret == 0, "SBT build against '%s' module" % module) +os.chdir(original_dir) + +# SBT application tests +log_and_print("==== Building SBT applications ====") +for app in ["sbt_app_core", "sbt_app_graphx", "sbt_app_streaming", "sbt_app_sql", "sbt_app_hive", "sbt_app_kinesis"]: + log("==== Building application %s in SBT ====" % app) + os.chdir(app) + ret = run_cmd("%s clean run" % SBT_CMD, exit_on_failure=False) + test(ret == 0, "SBT application (%s)" % app) + os.chdir(original_dir) + +# Maven build tests +os.chdir("blank_maven_build") +log_and_print("==== Building Maven modules ====") +for module in modules: + log("==== Building module %s in maven ====" % module) + cmd = ('%s --update-snapshots -Dspark.release.repository="%s" -Dspark.version="%s" ' + 
'-Dspark.module="%s" clean compile' % + (MAVEN_CMD, RELEASE_REPOSITORY, RELEASE_VERSION, module)) + ret = run_cmd(cmd, exit_on_failure=False) + test(ret == 0, "maven build against '%s' module" % module) +os.chdir(original_dir) + +# Maven application tests +log_and_print("==== Building Maven applications ====") +os.chdir("maven_app_core") +mvn_exec_cmd = ('%s --update-snapshots -Dspark.release.repository="%s" -Dspark.version="%s" ' + '-Dscala.binary.version="%s" clean compile ' + 'exec:java -Dexec.mainClass="SimpleApp"' % + (MAVEN_CMD, RELEASE_REPOSITORY, RELEASE_VERSION, SCALA_BINARY_VERSION)) +ret = run_cmd(mvn_exec_cmd, exit_on_failure=False) +test(ret == 0, "maven application (core)") +os.chdir(original_dir) + +# Binary artifact tests +if os.path.exists(WORK_DIR): + print "Working directory '%s' already exists" % WORK_DIR + sys.exit(-1) +os.mkdir(WORK_DIR) +os.chdir(WORK_DIR) + +index_page = get_url(RELEASE_URL) +artifact_regex = r = re.compile("") +artifacts = r.findall(index_page) + +# Verify artifact integrity +for artifact in artifacts: + log_and_print("==== Verifying download integrity for artifact: %s ====" % artifact) + + artifact_url = "%s/%s" % (RELEASE_URL, artifact) + key_file = "%s.asc" % artifact + run_cmd("wget %s" % artifact_url) + run_cmd("wget %s/%s" % (RELEASE_URL, key_file)) + run_cmd("wget %s%s" % (artifact_url, ".sha")) + + # Verify signature + run_cmd("%s --keyserver pgp.mit.edu --recv-key %s" % (GPG_CMD, RELEASE_KEY)) + run_cmd("%s %s" % (GPG_CMD, key_file)) + passed("Artifact signature verified.") + + # Verify md5 + my_md5 = run_cmd_with_output("%s --print-md MD5 %s" % (GPG_CMD, artifact)).strip() + release_md5 = get_url("%s.md5" % artifact_url).strip() + test(my_md5 == release_md5, "Artifact MD5 verified.") + + # Verify sha + my_sha = run_cmd_with_output("%s --print-md SHA512 %s" % (GPG_CMD, artifact)).strip() + release_sha = get_url("%s.sha" % artifact_url).strip() + test(my_sha == release_sha, "Artifact SHA verified.") + + # Verify 
Apache required files + dir_name = artifact.replace(".tgz", "") + run_cmd("tar xvzf %s" % artifact) + base_files = os.listdir(dir_name) + test("CHANGES.txt" in base_files, "Tarball contains CHANGES.txt file") + test("NOTICE" in base_files, "Tarball contains NOTICE file") + test("LICENSE" in base_files, "Tarball contains LICENSE file") + + os.chdir(WORK_DIR) + +# Report result +log_and_print("\n") +if len(failures) == 0: + log_and_print("*** ALL TESTS PASSED ***") +else: + log_and_print("XXXXX SOME TESTS DID NOT PASS XXXXX") + for f in failures: + log_and_print(" %s" % f) +os.chdir(original_dir) + +# Clean up +clean_work_files() + +log_and_print("|-------- Spark release audit complete --------|") diff --git a/dev/_site/audit-release/blank_maven_build/pom.xml b/dev/_site/audit-release/blank_maven_build/pom.xml new file mode 100644 index 0000000000000..02dd9046c9a49 --- /dev/null +++ b/dev/_site/audit-release/blank_maven_build/pom.xml @@ -0,0 +1,43 @@ + + + + + spark.audit + spark-audit + 4.0.0 + Spark Release Auditor + jar + 1.0 + + + Spray.cc repository + http://repo.spray.cc + + + Spark Staging Repo + ${spark.release.repository} + + + + + org.apache.spark + ${spark.module} + ${spark.version} + + + diff --git a/dev/_site/audit-release/blank_sbt_build/build.sbt b/dev/_site/audit-release/blank_sbt_build/build.sbt new file mode 100644 index 0000000000000..62815542e5bd9 --- /dev/null +++ b/dev/_site/audit-release/blank_sbt_build/build.sbt @@ -0,0 +1,30 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +name := "Spark Release Auditor" + +version := "1.0" + +scalaVersion := System.getenv.get("SCALA_VERSION") + +libraryDependencies += "org.apache.spark" % System.getenv.get("SPARK_MODULE") % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Eclipse Paho Repository" at "https://repo.eclipse.org/content/repositories/paho-releases/", + "Maven Repository" at "http://repo1.maven.org/maven2/", + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/maven_app_core/input.txt b/dev/_site/audit-release/maven_app_core/input.txt new file mode 100644 index 0000000000000..837b6f85ae97f --- /dev/null +++ b/dev/_site/audit-release/maven_app_core/input.txt @@ -0,0 +1,8 @@ +a +b +c +d +a +b +c +d diff --git a/dev/_site/audit-release/maven_app_core/pom.xml b/dev/_site/audit-release/maven_app_core/pom.xml new file mode 100644 index 0000000000000..b516396825573 --- /dev/null +++ b/dev/_site/audit-release/maven_app_core/pom.xml @@ -0,0 +1,52 @@ + + + + + spark.audit + spark-audit + 4.0.0 + Simple Project + jar + 1.0 + + + Spray.cc repository + http://repo.spray.cc + + + Spark Staging Repo + ${spark.release.repository} + + + + + org.apache.spark + spark-core_${scala.binary.version} + ${spark.version} + + + + + + + maven-compiler-plugin + 3.1 + + + + diff --git a/dev/_site/audit-release/maven_app_core/src/main/java/SimpleApp.java b/dev/_site/audit-release/maven_app_core/src/main/java/SimpleApp.java new file mode 100644 index 0000000000000..5217689e7c092 
--- /dev/null +++ b/dev/_site/audit-release/maven_app_core/src/main/java/SimpleApp.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.spark.api.java.*; +import org.apache.spark.api.java.function.Function; + +public class SimpleApp { + public static void main(String[] args) { + String logFile = "input.txt"; + JavaSparkContext sc = new JavaSparkContext("local", "Simple App"); + JavaRDD logData = sc.textFile(logFile).cache(); + + long numAs = logData.filter(new Function() { + public Boolean call(String s) { return s.contains("a"); } + }).count(); + + long numBs = logData.filter(new Function() { + public Boolean call(String s) { return s.contains("b"); } + }).count(); + + if (numAs != 2 || numBs != 2) { + System.out.println("Failed to parse log files with Spark"); + System.exit(-1); + } + System.out.println("Test succeeded"); + sc.stop(); + } +} diff --git a/dev/_site/audit-release/sbt_app_core/build.sbt b/dev/_site/audit-release/sbt_app_core/build.sbt new file mode 100644 index 0000000000000..291b1d6440bac --- /dev/null +++ b/dev/_site/audit-release/sbt_app_core/build.sbt @@ -0,0 +1,28 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor 
license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +name := "Simple Project" + +version := "1.0" + +scalaVersion := System.getenv.get("SCALA_VERSION") + +libraryDependencies += "org.apache.spark" %% "spark-core" % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_core/input.txt b/dev/_site/audit-release/sbt_app_core/input.txt new file mode 100644 index 0000000000000..837b6f85ae97f --- /dev/null +++ b/dev/_site/audit-release/sbt_app_core/input.txt @@ -0,0 +1,8 @@ +a +b +c +d +a +b +c +d diff --git a/dev/_site/audit-release/sbt_app_core/src/main/scala/SparkApp.scala b/dev/_site/audit-release/sbt_app_core/src/main/scala/SparkApp.scala new file mode 100644 index 0000000000000..61d91c70e9709 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_core/src/main/scala/SparkApp.scala @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package main.scala + +import scala.util.Try + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ + +object SimpleApp { + def main(args: Array[String]) { + val conf = sys.env.get("SPARK_AUDIT_MASTER") match { + case Some(master) => new SparkConf().setAppName("Simple Spark App").setMaster(master) + case None => new SparkConf().setAppName("Simple Spark App") + } + val logFile = "input.txt" + val sc = new SparkContext(conf) + val logData = sc.textFile(logFile, 2).cache() + val numAs = logData.filter(line => line.contains("a")).count() + val numBs = logData.filter(line => line.contains("b")).count() + if (numAs != 2 || numBs != 2) { + println("Failed to parse log files with Spark") + System.exit(-1) + } + + // Regression test for SPARK-1167: Remove metrics-ganglia from default build due to LGPL issue + val foundConsole = Try(Class.forName("org.apache.spark.metrics.sink.ConsoleSink")).isSuccess + val foundGanglia = Try(Class.forName("org.apache.spark.metrics.sink.GangliaSink")).isSuccess + if (!foundConsole) { + println("Console sink not loaded via spark-core") + System.exit(-1) + } + if (foundGanglia) { + println("Ganglia sink was loaded via spark-core") + System.exit(-1) + } + + // Remove kinesis from default build due to ASL license issue + val foundKinesis = 
Try(Class.forName("org.apache.spark.streaming.kinesis.KinesisUtils")).isSuccess + if (foundKinesis) { + println("Kinesis was loaded via spark-core") + System.exit(-1) + } + } +} +// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_ganglia/build.sbt b/dev/_site/audit-release/sbt_app_ganglia/build.sbt new file mode 100644 index 0000000000000..6d9474acf5bbc --- /dev/null +++ b/dev/_site/audit-release/sbt_app_ganglia/build.sbt @@ -0,0 +1,30 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +name := "Ganglia Test" + +version := "1.0" + +scalaVersion := System.getenv.get("SCALA_VERSION") + +libraryDependencies += "org.apache.spark" %% "spark-core" % System.getenv.get("SPARK_VERSION") + +libraryDependencies += "org.apache.spark" %% "spark-ganglia-lgpl" % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala b/dev/_site/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala new file mode 100644 index 0000000000000..9f7ae75d0b477 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package main.scala + +import scala.util.Try + +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ + +object SimpleApp { + def main(args: Array[String]) { + // Regression test for SPARK-1167: Remove metrics-ganglia from default build due to LGPL issue + val foundConsole = Try(Class.forName("org.apache.spark.metrics.sink.ConsoleSink")).isSuccess + val foundGanglia = Try(Class.forName("org.apache.spark.metrics.sink.GangliaSink")).isSuccess + if (!foundConsole) { + println("Console sink not loaded via spark-core") + System.exit(-1) + } + if (!foundGanglia) { + println("Ganglia sink not loaded via spark-ganglia-lgpl") + System.exit(-1) + } + } +} +// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_graphx/build.sbt b/dev/_site/audit-release/sbt_app_graphx/build.sbt new file mode 100644 index 0000000000000..dd11245e67d44 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_graphx/build.sbt @@ -0,0 +1,28 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +name := "Simple Project" + +version := "1.0" + +scalaVersion := System.getenv.get("SCALA_VERSION") + +libraryDependencies += "org.apache.spark" %% "spark-graphx" % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala b/dev/_site/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala new file mode 100644 index 0000000000000..2f0b6ef9a5672 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package main.scala + +import org.apache.spark.{SparkContext, SparkConf} +import org.apache.spark.SparkContext._ +import org.apache.spark.graphx._ +import org.apache.spark.rdd.RDD + +object GraphXApp { + def main(args: Array[String]) { + val conf = sys.env.get("SPARK_AUDIT_MASTER") match { + case Some(master) => new SparkConf().setAppName("Simple GraphX App").setMaster(master) + case None => new SparkConf().setAppName("Simple Graphx App") + } + val sc = new SparkContext(conf) + SparkContext.jarOfClass(this.getClass).foreach(sc.addJar) + + val users: RDD[(VertexId, (String, String))] = + sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")), + (5L, ("franklin", "prof")), (2L, ("istoica", "prof")), + (4L, ("peter", "student")))) + val relationships: RDD[Edge[String]] = + sc.parallelize(Array(Edge(3L, 7L, "collab"), Edge(5L, 3L, "advisor"), + Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi"), + Edge(4L, 0L, "student"), Edge(5L, 0L, "colleague"))) + val defaultUser = ("John Doe", "Missing") + val graph = Graph(users, relationships, defaultUser) + // Notice that there is a user 0 (for which we have no information) connected to users + // 4 (peter) and 5 (franklin). + val triplets = graph.triplets.map(e => (e.srcAttr._1, e.dstAttr._1)).collect + if (!triplets.exists(_ == ("peter", "John Doe"))) { + println("Failed to run GraphX") + System.exit(-1) + } + println("Test succeeded") + } +} +// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_hive/build.sbt b/dev/_site/audit-release/sbt_app_hive/build.sbt new file mode 100644 index 0000000000000..c8824f2b15e55 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_hive/build.sbt @@ -0,0 +1,29 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. 
+// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +name := "Simple Project" + +version := "1.0" + +scalaVersion := System.getenv.get("SCALA_VERSION") + +libraryDependencies += "org.apache.spark" %% "spark-hive" % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Maven Repository" at "http://repo1.maven.org/maven2/", + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_hive/data.txt b/dev/_site/audit-release/sbt_app_hive/data.txt new file mode 100644 index 0000000000000..0229e67f51e01 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_hive/data.txt @@ -0,0 +1,9 @@ +0val_0 +1val_1 +2val_2 +3val_3 +4val_4 +5val_5 +6val_6 +7val_7 +9val_9 diff --git a/dev/_site/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala b/dev/_site/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala new file mode 100644 index 0000000000000..4a980ec071ae4 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package main.scala + +import scala.collection.mutable.{ListBuffer, Queue} + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.hive.HiveContext + +case class Person(name: String, age: Int) + +object SparkSqlExample { + + def main(args: Array[String]) { + val conf = sys.env.get("SPARK_AUDIT_MASTER") match { + case Some(master) => new SparkConf().setAppName("Simple Sql App").setMaster(master) + case None => new SparkConf().setAppName("Simple Sql App") + } + val sc = new SparkContext(conf) + val hiveContext = new HiveContext(sc) + + import hiveContext._ + sql("DROP TABLE IF EXISTS src") + sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") + sql("LOAD DATA LOCAL INPATH 'data.txt' INTO TABLE src") + val results = sql("FROM src SELECT key, value WHERE key >= 0 AND KEY < 5").collect() + results.foreach(println) + + def test(f: => Boolean, failureMsg: String) = { + if (!f) { + println(failureMsg) + System.exit(-1) + } + } + + test(results.size == 5, "Unexpected number of selected elements: " + results) + println("Test succeeded") + sc.stop() + } +} +// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_kinesis/build.sbt b/dev/_site/audit-release/sbt_app_kinesis/build.sbt new file mode 100644 index 0000000000000..981bc7957b5ed --- /dev/null +++ 
b/dev/_site/audit-release/sbt_app_kinesis/build.sbt @@ -0,0 +1,28 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +name := "Kinesis Test" + +version := "1.0" + +scalaVersion := System.getenv.get("SCALA_VERSION") + +libraryDependencies += "org.apache.spark" %% "spark-streaming-kinesis-asl" % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala b/dev/_site/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala new file mode 100644 index 0000000000000..adc25b57d6aa5 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package main.scala + +import scala.util.Try + +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ + +object SimpleApp { + def main(args: Array[String]) { + val foundKinesis = Try(Class.forName("org.apache.spark.streaming.kinesis.KinesisUtils")).isSuccess + if (!foundKinesis) { + println("Kinesis not loaded via kinesis-asl") + System.exit(-1) + } + } +} +// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_sql/build.sbt b/dev/_site/audit-release/sbt_app_sql/build.sbt new file mode 100644 index 0000000000000..9116180f71a44 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_sql/build.sbt @@ -0,0 +1,28 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +name := "Simple Project" + +version := "1.0" + +scalaVersion := System.getenv.get("SCALA_VERSION") + +libraryDependencies += "org.apache.spark" %% "spark-sql" % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala b/dev/_site/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala new file mode 100644 index 0000000000000..69c1154dc0955 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package main.scala + +import scala.collection.mutable.{ListBuffer, Queue} + +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SQLContext + +case class Person(name: String, age: Int) + +object SparkSqlExample { + + def main(args: Array[String]) { + val conf = sys.env.get("SPARK_AUDIT_MASTER") match { + case Some(master) => new SparkConf().setAppName("Simple Sql App").setMaster(master) + case None => new SparkConf().setAppName("Simple Sql App") + } + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + + import sqlContext.implicits._ + import sqlContext._ + + val people = sc.makeRDD(1 to 100, 10).map(x => Person(s"Name$x", x)).toDF() + people.registerTempTable("people") + val teenagers = sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") + val teenagerNames = teenagers.map(t => "Name: " + t(0)).collect() + teenagerNames.foreach(println) + + def test(f: => Boolean, failureMsg: String) = { + if (!f) { + println(failureMsg) + System.exit(-1) + } + } + + test(teenagerNames.size == 7, "Unexpected number of selected elements: " + teenagerNames) + println("Test succeeded") + sc.stop() + } +} +// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_streaming/build.sbt b/dev/_site/audit-release/sbt_app_streaming/build.sbt new file mode 100644 index 0000000000000..cb369d516dd16 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_streaming/build.sbt @@ -0,0 +1,28 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +name := "Simple Project" + +version := "1.0" + +scalaVersion := System.getenv.get("SCALA_VERSION") + +libraryDependencies += "org.apache.spark" %% "spark-streaming" % System.getenv.get("SPARK_VERSION") + +resolvers ++= Seq( + "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), + "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala b/dev/_site/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala new file mode 100644 index 0000000000000..d6a074687f4a1 --- /dev/null +++ b/dev/_site/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// scalastyle:off println +package main.scala + +import scala.collection.mutable.{ListBuffer, Queue} + +import org.apache.spark.SparkConf +import org.apache.spark.rdd.RDD +import org.apache.spark.streaming.StreamingContext +import org.apache.spark.streaming._ + +object SparkStreamingExample { + + def main(args: Array[String]) { + val conf = sys.env.get("SPARK_AUDIT_MASTER") match { + case Some(master) => new SparkConf().setAppName("Simple Streaming App").setMaster(master) + case None => new SparkConf().setAppName("Simple Streaming App") + } + val ssc = new StreamingContext(conf, Seconds(1)) + val seen = ListBuffer[RDD[Int]]() + + val rdd1 = ssc.sparkContext.makeRDD(1 to 100, 10) + val rdd2 = ssc.sparkContext.makeRDD(1 to 1000, 10) + val rdd3 = ssc.sparkContext.makeRDD(1 to 10000, 10) + + val queue = Queue(rdd1, rdd2, rdd3) + val stream = ssc.queueStream(queue) + + stream.foreachRDD(rdd => seen += rdd) + ssc.start() + Thread.sleep(5000) + + def test(f: => Boolean, failureMsg: String) = { + if (!f) { + println(failureMsg) + System.exit(-1) + } + } + + val rddCounts = seen.map(rdd => rdd.count()).filter(_ > 0) + test(rddCounts.length == 3, "Did not collect three RDD's from stream") + test(rddCounts.toSet == Set(100, 1000, 10000), "Did not find expected streams") + + println("Test succeeded") + + ssc.stop() + } +} +// scalastyle:on println diff --git a/dev/_site/change-scala-version.sh b/dev/_site/change-scala-version.sh new file mode 100755 index 0000000000000..d7975dfb6475c --- /dev/null +++ b/dev/_site/change-scala-version.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -e + +VALID_VERSIONS=( 2.10 2.11 ) + +usage() { + echo "Usage: $(basename $0) [-h|--help] +where : + -h| --help Display this help text + valid version values : ${VALID_VERSIONS[*]} +" 1>&2 + exit 1 +} + +if [[ ($# -ne 1) || ( $1 == "--help") || $1 == "-h" ]]; then + usage +fi + +TO_VERSION=$1 + +check_scala_version() { + for i in ${VALID_VERSIONS[*]}; do [ $i = "$1" ] && return 0; done + echo "Invalid Scala version: $1. Valid versions: ${VALID_VERSIONS[*]}" 1>&2 + exit 1 +} + +check_scala_version "$TO_VERSION" + +if [ $TO_VERSION = "2.11" ]; then + FROM_VERSION="2.10" +else + FROM_VERSION="2.11" +fi + +sed_i() { + sed -e "$1" "$2" > "$2.tmp" && mv "$2.tmp" "$2" +} + +export -f sed_i + +BASEDIR=$(dirname $0)/.. +find "$BASEDIR" -name 'pom.xml' -not -path '*target*' -print \ + -exec bash -c "sed_i 's/\(artifactId.*\)_'$FROM_VERSION'/\1_'$TO_VERSION'/g' {}" \; + +# Also update in parent POM +# Match any scala binary version to ensure idempotency +sed_i '1,/[0-9]*\.[0-9]*[0-9]*\.[0-9]*'$TO_VERSION' "$JAR_DL" && mv "$JAR_DL" "$JAR" + elif [ $(command -v wget) ]; then + wget --quiet ${URL} -O "$JAR_DL" && mv "$JAR_DL" "$JAR" + else + printf "You do not have curl or wget installed, please install rat manually.\n" + exit -1 + fi + fi + + unzip -tq "$JAR" &> /dev/null + if [ $? -ne 0 ]; then + # We failed to download + rm "$JAR" + printf "Our attempt to download rat locally to ${JAR} failed. 
Please install rat manually.\n" + exit -1 + fi +} + +# Go to the Spark project root directory +FWDIR="$(cd "`dirname "$0"`"/..; pwd)" +cd "$FWDIR" + +if test -x "$JAVA_HOME/bin/java"; then + declare java_cmd="$JAVA_HOME/bin/java" +else + declare java_cmd=java +fi + +export RAT_VERSION=0.10 +export rat_jar="$FWDIR"/lib/apache-rat-${RAT_VERSION}.jar +mkdir -p "$FWDIR"/lib + +[[ -f "$rat_jar" ]] || acquire_rat_jar || { + echo "Download failed. Obtain the rat jar manually and place it at $rat_jar" + exit 1 +} + +$java_cmd -jar "$rat_jar" -E "$FWDIR"/.rat-excludes -d "$FWDIR" > rat-results.txt + +if [ $? -ne 0 ]; then + echo "RAT exited abnormally" + exit 1 +fi + +ERRORS="$(cat rat-results.txt | grep -e "??")" + +if test ! -z "$ERRORS"; then + echo "Could not find Apache license headers in the following files:" + echo "$ERRORS" + exit 1 +else + echo -e "RAT checks passed." +fi diff --git a/dev/_site/create-release/generate-changelist.py b/dev/_site/create-release/generate-changelist.py new file mode 100755 index 0000000000000..2e1a35a629342 --- /dev/null +++ b/dev/_site/create-release/generate-changelist.py @@ -0,0 +1,148 @@ +#!/usr/bin/python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Creates CHANGES.txt from git history. 
+# +# Usage: +# First set the new release version and old CHANGES.txt version in this file. +# Make sure you have SPARK_HOME set. +# $ python generate-changelist.py + + +import os +import sys +import subprocess +import time +import traceback + +SPARK_HOME = os.environ["SPARK_HOME"] +NEW_RELEASE_VERSION = "1.0.0" +PREV_RELEASE_GIT_TAG = "v0.9.1" + +CHANGELIST = "CHANGES.txt" +OLD_CHANGELIST = "%s.old" % (CHANGELIST) +NEW_CHANGELIST = "%s.new" % (CHANGELIST) +TMP_CHANGELIST = "%s.tmp" % (CHANGELIST) + +# date before first PR in TLP Spark repo +SPARK_REPO_CHANGE_DATE1 = time.strptime("2014-02-26", "%Y-%m-%d") +# date after last PR in incubator Spark repo +SPARK_REPO_CHANGE_DATE2 = time.strptime("2014-03-01", "%Y-%m-%d") +# Threshold PR number that differentiates PRs to TLP +# and incubator repos +SPARK_REPO_PR_NUM_THRESH = 200 + +LOG_FILE_NAME = "changes_%s" % time.strftime("%h_%m_%Y_%I_%M_%S") +LOG_FILE = open(LOG_FILE_NAME, 'w') + + +def run_cmd(cmd): + try: + print >> LOG_FILE, "Running command: %s" % cmd + output = subprocess.check_output(cmd, shell=True, stderr=LOG_FILE) + print >> LOG_FILE, "Output: %s" % output + return output + except: + traceback.print_exc() + cleanup() + sys.exit(1) + + +def append_to_changelist(string): + with open(TMP_CHANGELIST, "a") as f: + print >> f, string + + +def cleanup(ask=True): + if ask is True: + print "OK to delete temporary and log files? 
(y/N): " + response = raw_input() + if ask is False or (ask is True and response == "y"): + if os.path.isfile(TMP_CHANGELIST): + os.remove(TMP_CHANGELIST) + if os.path.isfile(OLD_CHANGELIST): + os.remove(OLD_CHANGELIST) + LOG_FILE.close() + os.remove(LOG_FILE_NAME) + + +print "Generating new %s for Spark release %s" % (CHANGELIST, NEW_RELEASE_VERSION) +os.chdir(SPARK_HOME) +if os.path.isfile(TMP_CHANGELIST): + os.remove(TMP_CHANGELIST) +if os.path.isfile(OLD_CHANGELIST): + os.remove(OLD_CHANGELIST) + +append_to_changelist("Spark Change Log") +append_to_changelist("----------------") +append_to_changelist("") +append_to_changelist("Release %s" % NEW_RELEASE_VERSION) +append_to_changelist("") + +print "Getting commits between tag %s and HEAD" % PREV_RELEASE_GIT_TAG +hashes = run_cmd("git log %s..HEAD --pretty='%%h'" % PREV_RELEASE_GIT_TAG).split() + +print "Getting details of %s commits" % len(hashes) +for h in hashes: + date = run_cmd("git log %s -1 --pretty='%%ad' --date=iso | head -1" % h).strip() + subject = run_cmd("git log %s -1 --pretty='%%s' | head -1" % h).strip() + body = run_cmd("git log %s -1 --pretty='%%b'" % h) + committer = run_cmd("git log %s -1 --pretty='%%cn <%%ce>' | head -1" % h).strip() + body_lines = body.split("\n") + + if "Merge pull" in subject: + # Parse old format commit message + append_to_changelist(" %s %s" % (h, date)) + append_to_changelist(" %s" % subject) + append_to_changelist(" [%s]" % body_lines[0]) + append_to_changelist("") + + elif "maven-release" not in subject: + # Parse new format commit message + # Get authors from commit message, committer otherwise + authors = [committer] + if "Author:" in body: + authors = [line.split(":")[1].strip() for line in body_lines if "Author:" in line] + + # Generate GitHub PR URL for easy access if possible + github_url = "" + if "Closes #" in body: + pr_num = [line.split()[1].lstrip("#") for line in body_lines if "Closes #" in line][0] + github_url = "github.com/apache/spark/pull/%s" % pr_num 
+ day = time.strptime(date.split()[0], "%Y-%m-%d") + if (day < SPARK_REPO_CHANGE_DATE1 or + (day < SPARK_REPO_CHANGE_DATE2 and pr_num < SPARK_REPO_PR_NUM_THRESH)): + github_url = "github.com/apache/incubator-spark/pull/%s" % pr_num + + append_to_changelist(" %s" % subject) + append_to_changelist(" %s" % ', '.join(authors)) + # for author in authors: + # append_to_changelist(" %s" % author) + append_to_changelist(" %s" % date) + if len(github_url) > 0: + append_to_changelist(" Commit: %s, %s" % (h, github_url)) + else: + append_to_changelist(" Commit: %s" % h) + append_to_changelist("") + +# Append old change list +print "Appending changelist from tag %s" % PREV_RELEASE_GIT_TAG +run_cmd("git show %s:%s | tail -n +3 >> %s" % (PREV_RELEASE_GIT_TAG, CHANGELIST, TMP_CHANGELIST)) +run_cmd("cp %s %s" % (TMP_CHANGELIST, NEW_CHANGELIST)) +print "New change list generated as %s" % NEW_CHANGELIST +cleanup(False) diff --git a/dev/_site/create-release/generate-contributors.py b/dev/_site/create-release/generate-contributors.py new file mode 100755 index 0000000000000..db9c680a4bad3 --- /dev/null +++ b/dev/_site/create-release/generate-contributors.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# This script automates the process of creating release notes. + +import os +import re +import sys + +from releaseutils import * + +# You must set the following before use! +JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") +RELEASE_TAG = os.environ.get("RELEASE_TAG", "v1.2.0-rc2") +PREVIOUS_RELEASE_TAG = os.environ.get("PREVIOUS_RELEASE_TAG", "v1.1.0") + +# If the release tags are not provided, prompt the user to provide them +while not tag_exists(RELEASE_TAG): + RELEASE_TAG = raw_input("Please provide a valid release tag: ") +while not tag_exists(PREVIOUS_RELEASE_TAG): + print "Please specify the previous release tag." + PREVIOUS_RELEASE_TAG = raw_input(\ + "For instance, if you are releasing v1.2.0, you should specify v1.1.0: ") + +# Gather commits found in the new tag but not in the old tag. +# This filters commits based on both the git hash and the PR number. +# If either is present in the old tag, then we ignore the commit. +print "Gathering new commits between tags %s and %s" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG) +release_commits = get_commits(RELEASE_TAG) +previous_release_commits = get_commits(PREVIOUS_RELEASE_TAG) +previous_release_hashes = set() +previous_release_prs = set() +for old_commit in previous_release_commits: + previous_release_hashes.add(old_commit.get_hash()) + if old_commit.get_pr_number(): + previous_release_prs.add(old_commit.get_pr_number()) +new_commits = [] +for this_commit in release_commits: + this_hash = this_commit.get_hash() + this_pr_number = this_commit.get_pr_number() + if this_hash in previous_release_hashes: + continue + if this_pr_number and this_pr_number in previous_release_prs: + continue + new_commits.append(this_commit) +if not new_commits: + sys.exit("There are no new commits between %s and %s!" 
% (PREVIOUS_RELEASE_TAG, RELEASE_TAG)) + +# Prompt the user for confirmation that the commit range is correct +print "\n==================================================================================" +print "JIRA server: %s" % JIRA_API_BASE +print "Release tag: %s" % RELEASE_TAG +print "Previous release tag: %s" % PREVIOUS_RELEASE_TAG +print "Number of commits in this range: %s" % len(new_commits) +print +def print_indented(_list): + for x in _list: print " %s" % x +if yesOrNoPrompt("Show all commits?"): + print_indented(new_commits) +print "==================================================================================\n" +if not yesOrNoPrompt("Does this look correct?"): + sys.exit("Ok, exiting") + +# Filter out special commits +releases = [] +maintenance = [] +reverts = [] +nojiras = [] +filtered_commits = [] +def is_release(commit_title): + return re.findall("\[release\]", commit_title.lower()) or\ + "preparing spark release" in commit_title.lower() or\ + "preparing development version" in commit_title.lower() or\ + "CHANGES.txt" in commit_title +def is_maintenance(commit_title): + return "maintenance" in commit_title.lower() or\ + "manually close" in commit_title.lower() +def has_no_jira(commit_title): + return not re.findall("SPARK-[0-9]+", commit_title.upper()) +def is_revert(commit_title): + return "revert" in commit_title.lower() +def is_docs(commit_title): + return re.findall("docs*", commit_title.lower()) or\ + "programming guide" in commit_title.lower() +for c in new_commits: + t = c.get_title() + if not t: continue + elif is_release(t): releases.append(c) + elif is_maintenance(t): maintenance.append(c) + elif is_revert(t): reverts.append(c) + elif is_docs(t): filtered_commits.append(c) # docs may not have JIRA numbers + elif has_no_jira(t): nojiras.append(c) + else: filtered_commits.append(c) + +# Warn against ignored commits +if releases or maintenance or reverts or nojiras: + print 
"\n==================================================================================" + if releases: print "Found %d release commits" % len(releases) + if maintenance: print "Found %d maintenance commits" % len(maintenance) + if reverts: print "Found %d revert commits" % len(reverts) + if nojiras: print "Found %d commits with no JIRA" % len(nojiras) + print "* Warning: these commits will be ignored.\n" + if yesOrNoPrompt("Show ignored commits?"): + if releases: print "Release (%d)" % len(releases); print_indented(releases) + if maintenance: print "Maintenance (%d)" % len(maintenance); print_indented(maintenance) + if reverts: print "Revert (%d)" % len(reverts); print_indented(reverts) + if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras) + print "==================== Warning: the above commits will be ignored ==================\n" +prompt_msg = "%d commits left to process after filtering. Ok to proceed?" % len(filtered_commits) +if not yesOrNoPrompt(prompt_msg): + sys.exit("Ok, exiting.") + +# Keep track of warnings to tell the user at the end +warnings = [] + +# Mapping from the invalid author name to its associated JIRA issues +# E.g. 
andrewor14 -> set("SPARK-2413", "SPARK-3551", "SPARK-3471") +invalid_authors = {} + +# Populate a map that groups issues and components by author +# It takes the form: Author name -> { Contribution type -> Spark components } +# For instance, +# { +# 'Andrew Or': { +# 'bug fixes': ['windows', 'core', 'web ui'], +# 'improvements': ['core'] +# }, +# 'Tathagata Das' : { +# 'bug fixes': ['streaming'] +# 'new feature': ['streaming'] +# } +# } +# +author_info = {} +jira_options = { "server": JIRA_API_BASE } +jira_client = JIRA(options = jira_options) +print "\n=========================== Compiling contributor list ===========================" +for commit in filtered_commits: + _hash = commit.get_hash() + title = commit.get_title() + issues = re.findall("SPARK-[0-9]+", title.upper()) + author = commit.get_author() + date = get_date(_hash) + # If the author name is invalid, keep track of it along + # with all associated issues so we can translate it later + if is_valid_author(author): + author = capitalize_author(author) + else: + if author not in invalid_authors: + invalid_authors[author] = set() + for issue in issues: + invalid_authors[author].add(issue) + # Parse components from the commit title, if any + commit_components = find_components(title, _hash) + # Populate or merge an issue into author_info[author] + def populate(issue_type, components): + components = components or [CORE_COMPONENT] # assume core if no components provided + if author not in author_info: + author_info[author] = {} + if issue_type not in author_info[author]: + author_info[author][issue_type] = set() + for component in components: + author_info[author][issue_type].add(component) + # Find issues and components associated with this commit + for issue in issues: + try: + jira_issue = jira_client.issue(issue) + jira_type = jira_issue.fields.issuetype.name + jira_type = translate_issue_type(jira_type, issue, warnings) + jira_components = [translate_component(c.name, _hash, warnings)\ + for c in 
jira_issue.fields.components] + all_components = set(jira_components + commit_components) + populate(jira_type, all_components) + except Exception as e: + print "Unexpected error:", e + # For docs without an associated JIRA, manually add it ourselves + if is_docs(title) and not issues: + populate("documentation", commit_components) + print " Processed commit %s authored by %s on %s" % (_hash, author, date) +print "==================================================================================\n" + +# Write to contributors file ordered by author names +# Each line takes the format " * Author name -- semi-colon delimited contributions" +# e.g. * Andrew Or -- Bug fixes in Windows, Core, and Web UI; improvements in Core +# e.g. * Tathagata Das -- Bug fixes and new features in Streaming +contributors_file = open(contributors_file_name, "w") +authors = author_info.keys() +authors.sort() +for author in authors: + contribution = "" + components = set() + issue_types = set() + for issue_type, comps in author_info[author].items(): + components.update(comps) + issue_types.add(issue_type) + # If there is only one component, mention it only once + # e.g. Bug fixes, improvements in MLlib + if len(components) == 1: + contribution = "%s in %s" % (nice_join(issue_types), next(iter(components))) + # Otherwise, group contributions by issue types instead of modules + # e.g. Bug fixes in MLlib, Core, and Streaming; documentation in YARN + else: + contributions = ["%s in %s" % (issue_type, nice_join(comps)) \ + for issue_type, comps in author_info[author].items()] + contribution = "; ".join(contributions) + # Do not use python's capitalize() on the whole string to preserve case + assert contribution + contribution = contribution[0].capitalize() + contribution[1:] + # If the author name is invalid, use an intermediate format that + # can be translated through translate-contributors.py later + # E.g. 
andrewor14/SPARK-3425/SPARK-1157/SPARK-6672 + if author in invalid_authors and invalid_authors[author]: + author = author + "/" + "/".join(invalid_authors[author]) + #line = " * %s -- %s" % (author, contribution) + line = author + contributors_file.write(line + "\n") +contributors_file.close() +print "Contributors list is successfully written to %s!" % contributors_file_name + +# Prompt the user to translate author names if necessary +if invalid_authors: + warnings.append("Found the following invalid authors:") + for a in invalid_authors: + warnings.append("\t%s" % a) + warnings.append("Please run './translate-contributors.py' to translate them.") + +# Log any warnings encountered in the process +if warnings: + print "\n============ Warnings encountered while creating the contributor list ============" + for w in warnings: print w + print "Please correct these in the final contributors list at %s." % contributors_file_name + print "==================================================================================\n" + diff --git a/dev/_site/create-release/known_translations b/dev/_site/create-release/known_translations new file mode 100644 index 0000000000000..3563fe3cc3c03 --- /dev/null +++ b/dev/_site/create-release/known_translations @@ -0,0 +1,167 @@ +# This is a mapping of names to be translated through translate-contributors.py +# The format expected on each line should be: - +CodingCat - Nan Zhu +CrazyJvm - Chao Chen +EugenCepoi - Eugen Cepoi +GraceH - Jie Huang +JerryLead - Lijie Xu +Leolh - Liu Hao +Lewuathe - Kai Sasaki +RongGu - Rong Gu +Shiti - Shiti Saxena +Victsm - Min Shen +WangTaoTheTonic - Wang Tao +XuTingjun - Tingjun Xu +YanTangZhai - Yantang Zhai +alexdebrie - Alex DeBrie +alokito - Alok Saldanha +anantasty - Anant Asthana +andrewor14 - Andrew Or +aniketbhatnagar - Aniket Bhatnagar +arahuja - Arun Ahuja +brkyvz - Burak Yavuz +chesterxgchen - Chester Chen +chiragaggarwal - Chirag Aggarwal +chouqin - Qiping Li +cocoatomo - Tomohiko K. 
+coderfi - Fairiz Azizi +coderxiang - Shuo Xiang +davies - Davies Liu +epahomov - Egor Pahomov +falaki - Hossein Falaki +freeman-lab - Jeremy Freeman +industrial-sloth - Jascha Swisher +jackylk - Jacky Li +jayunit100 - Jay Vyas +jerryshao - Saisai Shao +jkbradley - Joseph Bradley +lianhuiwang - Lianhui Wang +lirui-intel - Rui Li +luluorta - Lu Lu +luogankun - Gankun Luo +maji2014 - Derek Ma +mccheah - Matthew Cheah +mengxr - Xiangrui Meng +nartz - Nathan Artz +odedz - Oded Zimerman +ravipesala - Ravindra Pesala +roxchkplusony - Victor Tso +scwf - Wang Fei +shimingfei - Shiming Fei +surq - Surong Quan +suyanNone - Su Yan +tedyu - Ted Yu +tigerquoll - Dale Richardson +wangxiaojing - Xiaojing Wang +watermen - Yadong Qi +witgo - Guoqiang Li +xinyunh - Xinyun Huang +zsxwing - Shixiong Zhu +Bilna - Bilna P +DoingDone9 - Doing Done +Earne - Ernest +FlytxtRnD - Meethu Mathew +GenTang - Gen TANG +JoshRosen - Josh Rosen +MechCoder - Manoj Kumar +OopsOutOfMemory - Sheng Li +Peishen-Jia - Peishen Jia +SaintBacchus - Huang Zhaowei +azagrebin - Andrey Zagrebin +bzz - Alexander Bezzubov +fjiang6 - Fan Jiang +gasparms - Gaspar Munoz +guowei2 - Guo Wei +hhbyyh - Yuhao Yang +hseagle - Peng Xu +javadba - Stephen Boesch +jbencook - Ben Cook +kul - Kuldeep +ligangty - Gang Li +marsishandsome - Liangliang Gu +medale - Markus Dale +nemccarthy - Nathan McCarthy +nxwhite-str - Nate Crosswhite +seayi - Xiaohua Yi +tianyi - Yi Tian +uncleGen - Uncle Gen +viper-kun - Xu Kun +x1- - Yuri Saito +zapletal-martin - Martin Zapletal +zuxqoj - Shekhar Bansal +mingyukim - Mingyu Kim +sigmoidanalytics - Mayur Rustagi +AiHe - Ai He +BenFradet - Ben Fradet +FavioVazquez - Favio Vazquez +JaysonSunshine - Jayson Sunshine +Liuchang0812 - Liu Chang +Sephiroth-Lin - Sephiroth Lin +dobashim - Masaru Dobashi +ehnalis - Zoltan Zvara +emres - Emre Sevinc +gchen - Guancheng Chen +haiyangsea - Haiyang Sea +hlin09 - Hao Lin +hqzizania - Qian Huang +jeanlyn - Jean Lyn +jerluc - Jeremy A. 
Lucas +jrabary - Jaonary Rabarisoa +judynash - Judy Nash +kaka1992 - Chen Song +ksonj - Kalle Jepsen +kuromatsu-nobuyuki - Nobuyuki Kuromatsu +lazyman500 - Dong Xu +leahmcguire - Leah McGuire +mbittmann - Mark Bittmann +mbonaci - Marko Bonaci +meawoppl - Matthew Goodman +nyaapa - Arsenii Krasikov +phatak-dev - Madhukara Phatak +prabeesh - Prabeesh K +rakeshchalasani - Rakesh Chalasani +rekhajoshm - Rekha Joshi +sisihj - June He +szheng79 - Shuai Zheng +texasmichelle - Michelle Casbon +vinodkc - Vinod KC +yongtang - Yong Tang +ypcat - Pei-Lun Lee +zhichao-li - Zhichao Li +zzcclp - Zhichao Zhang +979969786 - Yuming Wang +Rosstin - Rosstin Murphy +ameyc - Amey Chaugule +animeshbaranawal - Animesh Baranawal +cafreeman - Chris Freeman +lee19 - Lee +lockwobr - Brian Lockwood +navis - Navis Ryu +pparkkin - Paavo Parkkinen +HyukjinKwon - Hyukjin Kwon +JDrit - Joseph Batchik +JuhongPark - Juhong Park +KaiXinXiaoLei - KaiXinXIaoLei +NamelessAnalyst - NamelessAnalyst +alyaxey - Alex Slusarenko +baishuo - Shuo Bai +fe2s - Oleksiy Dyagilev +felixcheung - Felix Cheung +feynmanliang - Feynman Liang +josepablocam - Jose Cambronero +kai-zeng - Kai Zeng +mosessky - mosessky +msannell - Michael Sannella +nishkamravi2 - Nishkam Ravi +noel-smith - Noel Smith +petz2000 - Patrick Baier +qiansl127 - Shilei Qian +rahulpalamuttam - Rahul Palamuttam +rowan000 - Rowan Chattaway +sarutak - Kousuke Saruta +sethah - Seth Hendrickson +small-wang - Wang Wei +stanzhai - Stan Zhai +tien-dungle - Tien-Dung Le +xuchenCN - Xu Chen +zhangjiajin - Zhang JiaJin diff --git a/dev/_site/create-release/release-build.sh b/dev/_site/create-release/release-build.sh new file mode 100755 index 0000000000000..cb79e9eba06e2 --- /dev/null +++ b/dev/_site/create-release/release-build.sh @@ -0,0 +1,326 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +function exit_with_usage { + cat << EOF +usage: release-build.sh +Creates build deliverables from a Spark commit. + +Top level targets are + package: Create binary packages and copy them to people.apache + docs: Build docs and copy them to people.apache + publish-snapshot: Publish snapshot release to Apache snapshots + publish-release: Publish a release to Apache release repo + +All other inputs are environment variables + +GIT_REF - Release tag or commit to build from +SPARK_VERSION - Release identifier used when publishing +SPARK_PACKAGE_VERSION - Release identifier in top level package directory +REMOTE_PARENT_DIR - Parent in which to create doc or release builds. +REMOTE_PARENT_MAX_LENGTH - If set, parent directory will be cleaned to only + have this number of subdirectories (by deleting old ones). WARNING: This deletes data. 
+ +ASF_USERNAME - Username of ASF committer account +ASF_PASSWORD - Password of ASF committer account +ASF_RSA_KEY - RSA private key file for ASF committer account + +GPG_KEY - GPG key used to sign release artifacts +GPG_PASSPHRASE - Passphrase for GPG key +EOF + exit 1 +} + +set -e + +if [ $# -eq 0 ]; then + exit_with_usage +fi + +if [[ $@ == *"help"* ]]; then + exit_with_usage +fi + +for env in ASF_USERNAME ASF_RSA_KEY GPG_PASSPHRASE GPG_KEY; do + if [ -z "${!env}" ]; then + echo "ERROR: $env must be set to run this script" + exit_with_usage + fi +done + +# Commit ref to checkout when building +GIT_REF=${GIT_REF:-master} + +# Destination directory parent on remote server +REMOTE_PARENT_DIR=${REMOTE_PARENT_DIR:-/home/$ASF_USERNAME/public_html} + +SSH="ssh -o ConnectTimeout=300 -o StrictHostKeyChecking=no -i $ASF_RSA_KEY" +GPG="gpg --no-tty --batch" +NEXUS_ROOT=https://repository.apache.org/service/local/staging +NEXUS_PROFILE=d63f592e7eac0 # Profile for Spark staging uploads +BASE_DIR=$(pwd) + +MVN="build/mvn --force" +PUBLISH_PROFILES="-Pyarn -Phive -Phadoop-2.2" +PUBLISH_PROFILES="$PUBLISH_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl" + +rm -rf spark +git clone https://git-wip-us.apache.org/repos/asf/spark.git +cd spark +git checkout $GIT_REF +git_hash=`git rev-parse --short HEAD` +echo "Checked out Spark git hash $git_hash" + +if [ -z "$SPARK_VERSION" ]; then + SPARK_VERSION=$($MVN help:evaluate -Dexpression=project.version \ + | grep -v INFO | grep -v WARNING | grep -v Download) +fi + +if [ -z "$SPARK_PACKAGE_VERSION" ]; then + SPARK_PACKAGE_VERSION="${SPARK_VERSION}-$(date +%Y_%m_%d_%H_%M)-${git_hash}" +fi + +DEST_DIR_NAME="spark-$SPARK_PACKAGE_VERSION" +USER_HOST="$ASF_USERNAME@people.apache.org" + +git clean -d -f -x +rm .gitignore +rm -rf .git +cd .. 
+ +if [ -n "$REMOTE_PARENT_MAX_LENGTH" ]; then + old_dirs=$($SSH $USER_HOST ls -t $REMOTE_PARENT_DIR | tail -n +$REMOTE_PARENT_MAX_LENGTH) + for old_dir in $old_dirs; do + echo "Removing directory: $old_dir" + $SSH $USER_HOST rm -r $REMOTE_PARENT_DIR/$old_dir + done +fi + +if [[ "$1" == "package" ]]; then + # Source and binary tarballs + echo "Packaging release tarballs" + cp -r spark spark-$SPARK_VERSION + tar cvzf spark-$SPARK_VERSION.tgz spark-$SPARK_VERSION + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour --output spark-$SPARK_VERSION.tgz.asc \ + --detach-sig spark-$SPARK_VERSION.tgz + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md MD5 spark-$SPARK_VERSION.tgz > \ + spark-$SPARK_VERSION.tgz.md5 + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ + SHA512 spark-$SPARK_VERSION.tgz > spark-$SPARK_VERSION.tgz.sha + rm -rf spark-$SPARK_VERSION + + # Updated for each binary build + make_binary_release() { + NAME=$1 + FLAGS=$2 + ZINC_PORT=$3 + cp -r spark spark-$SPARK_VERSION-bin-$NAME + + cd spark-$SPARK_VERSION-bin-$NAME + + # TODO There should probably be a flag to make-distribution to allow 2.11 support + if [[ $FLAGS == *scala-2.11* ]]; then + ./dev/change-scala-version.sh 2.11 + fi + + export ZINC_PORT=$ZINC_PORT + echo "Creating distribution: $NAME ($FLAGS)" + + # Get maven home set by MVN + MVN_HOME=`$MVN -version 2>&1 | grep 'Maven home' | awk '{print $NF}'` + + ./make-distribution.sh --name $NAME --mvn $MVN_HOME/bin/mvn --tgz $FLAGS \ + -DzincPort=$ZINC_PORT 2>&1 > ../binary-release-$NAME.log + cd .. + cp spark-$SPARK_VERSION-bin-$NAME/spark-$SPARK_VERSION-bin-$NAME.tgz . 
+ + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour \ + --output spark-$SPARK_VERSION-bin-$NAME.tgz.asc \ + --detach-sig spark-$SPARK_VERSION-bin-$NAME.tgz + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ + MD5 spark-$SPARK_VERSION-bin-$NAME.tgz > \ + spark-$SPARK_VERSION-bin-$NAME.tgz.md5 + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ + SHA512 spark-$SPARK_VERSION-bin-$NAME.tgz > \ + spark-$SPARK_VERSION-bin-$NAME.tgz.sha + } + + # TODO: Check exit codes of children here: + # http://stackoverflow.com/questions/1570262/shell-get-exit-code-of-background-process + + # We increment the Zinc port each time to avoid OOM's and other craziness if multiple builds + # share the same Zinc server. + make_binary_release "hadoop1" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver" "3030" & + make_binary_release "hadoop1-scala2.11" "-Psparkr -Phadoop-1 -Phive -Dscala-2.11" "3031" & + make_binary_release "cdh4" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver -Dhadoop.version=2.0.0-mr1-cdh4.2.0" "3032" & + make_binary_release "hadoop2.3" "-Psparkr -Phadoop-2.3 -Phive -Phive-thriftserver -Pyarn" "3033" & + make_binary_release "hadoop2.4" "-Psparkr -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn" "3034" & + make_binary_release "hadoop2.6" "-Psparkr -Phadoop-2.6 -Phive -Phive-thriftserver -Pyarn" "3034" & + make_binary_release "hadoop2.4-without-hive" "-Psparkr -Phadoop-2.4 -Pyarn" "3037" & + make_binary_release "without-hadoop" "-Psparkr -Phadoop-provided -Pyarn" "3038" & + wait + rm -rf spark-$SPARK_VERSION-bin-*/ + + # Copy data + dest_dir="$REMOTE_PARENT_DIR/${DEST_DIR_NAME}-bin" + echo "Copying release tarballs to $dest_dir" + $SSH $USER_HOST mkdir $dest_dir + rsync -e "$SSH" spark-* $USER_HOST:$dest_dir + echo "Linking /latest to $dest_dir" + $SSH $USER_HOST rm -f "$REMOTE_PARENT_DIR/latest" + $SSH $USER_HOST ln -s $dest_dir "$REMOTE_PARENT_DIR/latest" + exit 0 +fi + +if [[ "$1" == "docs" ]]; then + # Documentation + cd spark + echo "Building Spark 
docs" + dest_dir="$REMOTE_PARENT_DIR/${DEST_DIR_NAME}-docs" + cd docs + # Compile docs with Java 7 to use nicer format + # TODO: Make configurable to add this: PRODUCTION=1 + PRODUCTION=1 RELEASE_VERSION="$SPARK_VERSION" jekyll build + echo "Copying release documentation to $dest_dir" + $SSH $USER_HOST mkdir $dest_dir + echo "Linking /latest to $dest_dir" + $SSH $USER_HOST rm -f "$REMOTE_PARENT_DIR/latest" + $SSH $USER_HOST ln -s $dest_dir "$REMOTE_PARENT_DIR/latest" + rsync -e "$SSH" -r _site/* $USER_HOST:$dest_dir + cd .. + exit 0 +fi + +if [[ "$1" == "publish-snapshot" ]]; then + cd spark + # Publish Spark to Maven release repo + echo "Deploying Spark SNAPSHOT at '$GIT_REF' ($git_hash)" + echo "Publish version is $SPARK_VERSION" + if [[ ! $SPARK_VERSION == *"SNAPSHOT"* ]]; then + echo "ERROR: Snapshots must have a version containing SNAPSHOT" + echo "ERROR: You gave version '$SPARK_VERSION'" + exit 1 + fi + # Coerce the requested version + $MVN versions:set -DnewVersion=$SPARK_VERSION + tmp_settings="tmp-settings.xml" + echo "" > $tmp_settings + echo "apache.snapshots.https$ASF_USERNAME" >> $tmp_settings + echo "$ASF_PASSWORD" >> $tmp_settings + echo "" >> $tmp_settings + + # Generate random point for Zinc + export ZINC_PORT=$(python -S -c "import random; print random.randrange(3030,4030)") + + $MVN -DzincPort=$ZINC_PORT --settings $tmp_settings -DskipTests $PUBLISH_PROFILES \ + -Phive-thriftserver deploy + ./dev/change-scala-version.sh 2.11 + $MVN -DzincPort=$ZINC_PORT -Dscala-2.11 --settings $tmp_settings \ + -DskipTests $PUBLISH_PROFILES clean deploy + + # Clean-up Zinc nailgun process + /usr/sbin/lsof -P |grep $ZINC_PORT | grep LISTEN | awk '{ print $2; }' | xargs kill + + rm $tmp_settings + cd .. 
+ exit 0 +fi + +if [[ "$1" == "publish-release" ]]; then + cd spark + # Publish Spark to Maven release repo + echo "Publishing Spark checkout at '$GIT_REF' ($git_hash)" + echo "Publish version is $SPARK_VERSION" + # Coerce the requested version + $MVN versions:set -DnewVersion=$SPARK_VERSION + + # Using Nexus API documented here: + # https://support.sonatype.com/entries/39720203-Uploading-to-a-Staging-Repository-via-REST-API + echo "Creating Nexus staging repository" + repo_request="Apache Spark $SPARK_VERSION (commit $git_hash)" + out=$(curl -X POST -d "$repo_request" -u $ASF_USERNAME:$ASF_PASSWORD \ + -H "Content-Type:application/xml" -v \ + $NEXUS_ROOT/profiles/$NEXUS_PROFILE/start) + staged_repo_id=$(echo $out | sed -e "s/.*\(orgapachespark-[0-9]\{4\}\).*/\1/") + echo "Created Nexus staging repository: $staged_repo_id" + + tmp_repo=$(mktemp -d spark-repo-XXXXX) + + # Generate random point for Zinc + export ZINC_PORT=$(python -S -c "import random; print random.randrange(3030,4030)") + + $MVN -DzincPort=$ZINC_PORT -Dmaven.repo.local=$tmp_repo -DskipTests $PUBLISH_PROFILES \ + -Phive-thriftserver clean install + + ./dev/change-scala-version.sh 2.11 + + $MVN -DzincPort=$ZINC_PORT -Dmaven.repo.local=$tmp_repo -Dscala-2.11 \ + -DskipTests $PUBLISH_PROFILES clean install + + # Clean-up Zinc nailgun process + /usr/sbin/lsof -P |grep $ZINC_PORT | grep LISTEN | awk '{ print $2; }' | xargs kill + + ./dev/change-version-to-2.10.sh + + pushd $tmp_repo/org/apache/spark + + # Remove any extra files generated during install + find . -type f |grep -v \.jar |grep -v \.pom | xargs rm + + echo "Creating hash and signature files" + for file in $(find . 
-type f) + do + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --output $file.asc \ + --detach-sig --armour $file; + if [ $(command -v md5) ]; then + # Available on OS X; -q to keep only hash + md5 -q $file > $file.md5 + else + # Available on Linux; cut to keep only hash + md5sum $file | cut -f1 -d' ' > $file.md5 + fi + sha1sum $file | cut -f1 -d' ' > $file.sha1 + done + + nexus_upload=$NEXUS_ROOT/deployByRepositoryId/$staged_repo_id + echo "Uploading files to $nexus_upload" + for file in $(find . -type f) + do + # strip leading ./ + file_short=$(echo $file | sed -e "s/\.\///") + dest_url="$nexus_upload/org/apache/spark/$file_short" + echo " Uploading $file_short" + curl -u $ASF_USERNAME:$ASF_PASSWORD --upload-file $file_short $dest_url + done + + echo "Closing nexus staging repository" + repo_request="$staged_repo_idApache Spark $SPARK_VERSION (commit $git_hash)" + out=$(curl -X POST -d "$repo_request" -u $ASF_USERNAME:$ASF_PASSWORD \ + -H "Content-Type:application/xml" -v \ + $NEXUS_ROOT/profiles/$NEXUS_PROFILE/finish) + echo "Closed Nexus staging repository: $staged_repo_id" + popd + rm -rf $tmp_repo + cd .. + exit 0 +fi + +cd .. +rm -rf spark +echo "ERROR: expects to be called with 'package', 'docs', 'publish-release' or 'publish-snapshot'" diff --git a/dev/_site/create-release/release-tag.sh b/dev/_site/create-release/release-tag.sh new file mode 100755 index 0000000000000..b0a3374becc6a --- /dev/null +++ b/dev/_site/create-release/release-tag.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +function exit_with_usage { + cat << EOF +usage: tag-release.sh +Tags a Spark release on a particular branch. + +Inputs are specified with the following environment variables: +ASF_USERNAME - Apache Username +ASF_PASSWORD - Apache Password +GIT_NAME - Name to use with git +GIT_EMAIL - E-mail address to use with git +GIT_BRANCH - Git branch on which to make release +RELEASE_VERSION - Version used in pom files for release +RELEASE_TAG - Name of release tag +NEXT_VERSION - Development version after release +EOF + exit 1 +} + +set -e + +if [[ $@ == *"help"* ]]; then + exit_with_usage +fi + +for env in ASF_USERNAME ASF_PASSWORD RELEASE_VERSION RELEASE_TAG NEXT_VERSION GIT_EMAIL GIT_NAME GIT_BRANCH; do + if [ -z "${!env}" ]; then + echo "$env must be set to run this script" + exit 1 + fi +done + +ASF_SPARK_REPO="git-wip-us.apache.org/repos/asf/spark.git" +MVN="build/mvn --force" + +rm -rf spark +git clone https://$ASF_USERNAME:$ASF_PASSWORD@$ASF_SPARK_REPO -b $GIT_BRANCH +cd spark + +git config user.name "$GIT_NAME" +git config user.email $GIT_EMAIL + +# Create release version +$MVN versions:set -DnewVersion=$RELEASE_VERSION | grep -v "no value" # silence logs +git commit -a -m "Preparing Spark release $RELEASE_TAG" +echo "Creating tag $RELEASE_TAG at the head of $GIT_BRANCH" +git tag $RELEASE_TAG + +# TODO: It would be nice to do some verifications here +# i.e. 
check whether ec2 scripts have the new version + +# Create next version +$MVN versions:set -DnewVersion=$NEXT_VERSION | grep -v "no value" # silence logs +git commit -a -m "Preparing development version $NEXT_VERSION" + +# Push changes +git push origin $RELEASE_TAG +git push origin HEAD:$GIT_BRANCH + +cd .. +rm -rf spark diff --git a/dev/_site/create-release/releaseutils.py b/dev/_site/create-release/releaseutils.py new file mode 100755 index 0000000000000..7f152b7f53559 --- /dev/null +++ b/dev/_site/create-release/releaseutils.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This file contains helper methods used in creating a release. + +import re +import sys +from subprocess import Popen, PIPE + +try: + from jira.client import JIRA + # Old versions have JIRAError in exceptions package, new (0.5+) in utils. 
+ try: + from jira.exceptions import JIRAError + except ImportError: + from jira.utils import JIRAError +except ImportError: + print "This tool requires the jira-python library" + print "Install using 'sudo pip install jira'" + sys.exit(-1) + +try: + from github import Github + from github import GithubException +except ImportError: + print "This tool requires the PyGithub library" + print "Install using 'sudo pip install PyGithub'" + sys.exit(-1) + +try: + import unidecode +except ImportError: + print "This tool requires the unidecode library to decode obscure github usernames" + print "Install using 'sudo pip install unidecode'" + sys.exit(-1) + +# Contributors list file name +contributors_file_name = "contributors.txt" + +# Prompt the user to answer yes or no until they do so +def yesOrNoPrompt(msg): + response = raw_input("%s [y/n]: " % msg) + while response != "y" and response != "n": + return yesOrNoPrompt(msg) + return response == "y" + +# Utility functions run git commands (written with Git 1.8.5) +def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0] +def run_cmd_error(cmd): return Popen(cmd, stdout=PIPE, stderr=PIPE).communicate()[1] +def get_date(commit_hash): + return run_cmd(["git", "show", "--quiet", "--pretty=format:%cd", commit_hash]) +def tag_exists(tag): + stderr = run_cmd_error(["git", "show", tag]) + return "error" not in stderr + +# A type-safe representation of a commit +class Commit: + def __init__(self, _hash, author, title, pr_number = None): + self._hash = _hash + self.author = author + self.title = title + self.pr_number = pr_number + def get_hash(self): return self._hash + def get_author(self): return self.author + def get_title(self): return self.title + def get_pr_number(self): return self.pr_number + def __str__(self): + closes_pr = "(Closes #%s)" % self.pr_number if self.pr_number else "" + return "%s %s %s %s" % (self._hash, self.author, self.title, closes_pr) + +# Return all commits that belong to the specified tag. 
+# +# Under the hood, this runs a `git log` on that tag and parses the fields +# from the command output to construct a list of Commit objects. Note that +# because certain fields reside in the commit description and cannot be parsed +# through the Github API itself, we need to do some intelligent regex parsing +# to extract those fields. +# +# This is written using Git 1.8.5. +def get_commits(tag): + commit_start_marker = "|=== COMMIT START MARKER ===|" + commit_end_marker = "|=== COMMIT END MARKER ===|" + field_end_marker = "|=== COMMIT FIELD END MARKER ===|" + log_format =\ + commit_start_marker + "%h" +\ + field_end_marker + "%an" +\ + field_end_marker + "%s" +\ + commit_end_marker + "%b" + output = run_cmd(["git", "log", "--quiet", "--pretty=format:" + log_format, tag]) + commits = [] + raw_commits = [c for c in output.split(commit_start_marker) if c] + for commit in raw_commits: + if commit.count(commit_end_marker) != 1: + print "Commit end marker not found in commit: " + for line in commit.split("\n"): print line + sys.exit(1) + # Separate commit digest from the body + # From the digest we extract the hash, author and the title + # From the body, we extract the PR number and the github username + [commit_digest, commit_body] = commit.split(commit_end_marker) + if commit_digest.count(field_end_marker) != 2: + sys.exit("Unexpected format in commit: %s" % commit_digest) + [_hash, author, title] = commit_digest.split(field_end_marker) + # The PR number and github username is in the commit message + # itself and cannot be accessed through any Github API + pr_number = None + match = re.search("Closes #([0-9]+) from ([^/\\s]+)/", commit_body) + if match: + [pr_number, github_username] = match.groups() + # If the author name is not valid, use the github + # username so we can translate it properly later + if not is_valid_author(author): + author = github_username + # Guard against special characters + author = unidecode.unidecode(unicode(author, "UTF-8")).strip() + 
commit = Commit(_hash, author, title, pr_number) + commits.append(commit) + return commits + +# Maintain a mapping for translating issue types to contributions in the release notes +# This serves an additional function of warning the user against unknown issue types +# Note: This list is partially derived from this link: +# https://issues.apache.org/jira/plugins/servlet/project-config/SPARK/issuetypes +# Keep these in lower case +known_issue_types = { + "bug": "bug fixes", + "build": "build fixes", + "dependency upgrade": "build fixes", + "improvement": "improvements", + "new feature": "new features", + "documentation": "documentation", + "test": "test", + "task": "improvement", + "sub-task": "improvement" +} + +# Maintain a mapping for translating component names when creating the release notes +# This serves an additional function of warning the user against unknown components +# Note: This list is largely derived from this link: +# https://issues.apache.org/jira/plugins/servlet/project-config/SPARK/components +CORE_COMPONENT = "Core" +known_components = { + "block manager": CORE_COMPONENT, + "build": CORE_COMPONENT, + "deploy": CORE_COMPONENT, + "documentation": CORE_COMPONENT, + "ec2": "EC2", + "examples": CORE_COMPONENT, + "graphx": "GraphX", + "input/output": CORE_COMPONENT, + "java api": "Java API", + "mesos": "Mesos", + "ml": "MLlib", + "mllib": "MLlib", + "project infra": "Project Infra", + "pyspark": "PySpark", + "shuffle": "Shuffle", + "spark core": CORE_COMPONENT, + "spark shell": CORE_COMPONENT, + "sql": "SQL", + "streaming": "Streaming", + "web ui": "Web UI", + "windows": "Windows", + "yarn": "YARN" +} + +# Translate issue types using a format appropriate for writing contributions +# If an unknown issue type is encountered, warn the user +def translate_issue_type(issue_type, issue_id, warnings): + issue_type = issue_type.lower() + if issue_type in known_issue_types: + return known_issue_types[issue_type] + else: + warnings.append("Unknown issue type 
\"%s\" (see %s)" % (issue_type, issue_id)) + return issue_type + +# Translate component names using a format appropriate for writing contributions +# If an unknown component is encountered, warn the user +def translate_component(component, commit_hash, warnings): + component = component.lower() + if component in known_components: + return known_components[component] + else: + warnings.append("Unknown component \"%s\" (see %s)" % (component, commit_hash)) + return component + +# Parse components in the commit message +# The returned components are already filtered and translated +def find_components(commit, commit_hash): + components = re.findall("\[\w*\]", commit.lower()) + components = [translate_component(c, commit_hash)\ + for c in components if c in known_components] + return components + +# Join a list of strings in a human-readable manner +# e.g. ["Juice"] -> "Juice" +# e.g. ["Juice", "baby"] -> "Juice and baby" +# e.g. ["Juice", "baby", "moon"] -> "Juice, baby, and moon" +def nice_join(str_list): + str_list = list(str_list) # sometimes it's a set + if not str_list: + return "" + elif len(str_list) == 1: + return next(iter(str_list)) + elif len(str_list) == 2: + return " and ".join(str_list) + else: + return ", ".join(str_list[:-1]) + ", and " + str_list[-1] + +# Return the full name of the specified user on Github +# If the user doesn't exist, return None +def get_github_name(author, github_client): + if github_client: + try: + return github_client.get_user(author).name + except GithubException as e: + # If this is not a "not found" exception + if e.status != 404: + raise e + return None + +# Return the full name of the specified user on JIRA +# If the user doesn't exist, return None +def get_jira_name(author, jira_client): + if jira_client: + try: + return jira_client.user(author).displayName + except JIRAError as e: + # If this is not a "not found" exception + if e.status_code != 404: + raise e + return None + +# Return whether the given name is in the 
form +def is_valid_author(author): + if not author: return False + return " " in author and not re.findall("[0-9]", author) + +# Capitalize the first letter of each word in the given author name +def capitalize_author(author): + if not author: return None + words = author.split(" ") + words = [w[0].capitalize() + w[1:] for w in words if w] + return " ".join(words) + diff --git a/dev/_site/create-release/translate-contributors.py b/dev/_site/create-release/translate-contributors.py new file mode 100755 index 0000000000000..86fa02d87b9a0 --- /dev/null +++ b/dev/_site/create-release/translate-contributors.py @@ -0,0 +1,253 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script translates invalid authors in the contributors list generated +# by generate-contributors.py. When the script encounters an author name that +# is considered invalid, it searches Github and JIRA in an attempt to search +# for replacements. This tool runs in two modes: +# +# (1) Interactive mode: For each invalid author name, this script presents +# all candidate replacements to the user and awaits user response. In this +# mode, the user may also input a custom name. This is the default. 
+# +# (2) Non-interactive mode: For each invalid author name, this script replaces +# the name with the first valid candidate it can find. If there is none, it +# uses the original name. This can be enabled through the --non-interactive flag. + +import os +import sys + +from releaseutils import * + +# You must set the following before use! +JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") +JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None) +JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None) +GITHUB_API_TOKEN = os.environ.get("GITHUB_API_TOKEN", None) +if not JIRA_USERNAME or not JIRA_PASSWORD: + sys.exit("Both JIRA_USERNAME and JIRA_PASSWORD must be set") +if not GITHUB_API_TOKEN: + sys.exit("GITHUB_API_TOKEN must be set") + +# Write new contributors list to .final +if not os.path.isfile(contributors_file_name): + print "Contributors file %s does not exist!" % contributors_file_name + print "Have you run ./generate-contributors.py yet?" + sys.exit(1) +contributors_file = open(contributors_file_name, "r") +warnings = [] + +# In non-interactive mode, this script will choose the first replacement that is valid +INTERACTIVE_MODE = True +if len(sys.argv) > 1: + options = set(sys.argv[1:]) + if "--non-interactive" in options: + INTERACTIVE_MODE = False +if INTERACTIVE_MODE: + print "Running in interactive mode. To disable this, provide the --non-interactive flag." 
+ +# Setup Github and JIRA clients +jira_options = { "server": JIRA_API_BASE } +jira_client = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD)) +github_client = Github(GITHUB_API_TOKEN) + +# Load known author translations that are cached locally +known_translations = {} +known_translations_file_name = "known_translations" +known_translations_file = open(known_translations_file_name, "r") +for line in known_translations_file: + if line.startswith("#"): continue + [old_name, new_name] = line.strip("\n").split(" - ") + known_translations[old_name] = new_name +known_translations_file.close() + +# Open again in case the user adds new mappings +known_translations_file = open(known_translations_file_name, "a") + +# Generate candidates for the given author. This should only be called if the given author +# name does not represent a full name as this operation is somewhat expensive. Under the +# hood, it makes several calls to the Github and JIRA API servers to find the candidates. +# +# This returns a list of (candidate name, source) 2-tuples. E.g. 
+# [ +# (NOT_FOUND, "No full name found for Github user andrewor14"), +# ("Andrew Or", "Full name of JIRA user andrewor14"), +# ("Andrew Orso", "Full name of SPARK-1444 assignee andrewor14"), +# ("Andrew Ordall", "Full name of SPARK-1663 assignee andrewor14"), +# (NOT_FOUND, "No assignee found for SPARK-1763") +# ] +NOT_FOUND = "Not found" +def generate_candidates(author, issues): + candidates = [] + # First check for full name of Github user + github_name = get_github_name(author, github_client) + if github_name: + candidates.append((github_name, "Full name of Github user %s" % author)) + else: + candidates.append((NOT_FOUND, "No full name found for Github user %s" % author)) + # Then do the same for JIRA user + jira_name = get_jira_name(author, jira_client) + if jira_name: + candidates.append((jira_name, "Full name of JIRA user %s" % author)) + else: + candidates.append((NOT_FOUND, "No full name found for JIRA user %s" % author)) + # Then do the same for the assignee of each of the associated JIRAs + # Note that a given issue may not have an assignee, or the assignee may not have a full name + for issue in issues: + try: + jira_issue = jira_client.issue(issue) + except JIRAError as e: + # Do not exit just because an issue is not found! + if e.status_code == 404: + warnings.append("Issue %s not found!" 
% issue) + continue + raise e + jira_assignee = jira_issue.fields.assignee + if jira_assignee: + user_name = jira_assignee.name + display_name = jira_assignee.displayName + if display_name: + candidates.append((display_name, "Full name of %s assignee %s" % (issue, user_name))) + else: + candidates.append((NOT_FOUND, "No full name found for %s assignee %s" % (issue, user_name))) + else: + candidates.append((NOT_FOUND, "No assignee found for %s" % issue)) + # Guard against special characters in candidate names + # Note that the candidate name may already be in unicode (JIRA returns this) + for i, (candidate, source) in enumerate(candidates): + try: + candidate = unicode(candidate, "UTF-8") + except TypeError: + # already in unicode + pass + candidate = unidecode.unidecode(candidate).strip() + candidates[i] = (candidate, source) + return candidates + +# Translate each invalid author by searching for possible candidates from Github and JIRA +# In interactive mode, this script presents the user with a list of choices and has the user +# select from this list. Additionally, the user may also choose to enter a custom name. +# In non-interactive mode, this script picks the first valid author name from the candidates +# If no such name exists, the original name is used (without the JIRA numbers).
+print "\n========================== Translating contributor list ==========================" +lines = contributors_file.readlines() +contributions = [] +for i, line in enumerate(lines): + temp_author = line.strip(" * ").split(" -- ")[0] + print "Processing author %s (%d/%d)" % (temp_author, i + 1, len(lines)) + if not temp_author: + error_msg = " ERROR: Expected the following format \" * -- \"\n" + error_msg += " ERROR: Actual = %s" % line + print error_msg + warnings.append(error_msg) + contributions.append(line) + continue + author = temp_author.split("/")[0] + # Use the local copy of known translations where possible + if author in known_translations: + line = line.replace(temp_author, known_translations[author]) + elif not is_valid_author(author): + new_author = author + issues = temp_author.split("/")[1:] + candidates = generate_candidates(author, issues) + # Print out potential replacement candidates along with the sources, e.g. + # [X] No full name found for Github user andrewor14 + # [X] No assignee found for SPARK-1763 + # [0] Andrew Or - Full name of JIRA user andrewor14 + # [1] Andrew Orso - Full name of SPARK-1444 assignee andrewor14 + # [2] Andrew Ordall - Full name of SPARK-1663 assignee andrewor14 + # [3] andrewor14 - Raw Github username + # [4] Custom + candidate_names = [] + bad_prompts = [] # Prompts that can't actually be selected; print these first. 
+ good_prompts = [] # Prompts that contain valid choices + for candidate, source in candidates: + if candidate == NOT_FOUND: + bad_prompts.append(" [X] %s" % source) + else: + index = len(candidate_names) + candidate_names.append(candidate) + good_prompts.append(" [%d] %s - %s" % (index, candidate, source)) + raw_index = len(candidate_names) + custom_index = len(candidate_names) + 1 + for p in bad_prompts: print p + if bad_prompts: print " ---" + for p in good_prompts: print p + # In interactive mode, additionally provide "custom" option and await user response + if INTERACTIVE_MODE: + print " [%d] %s - Raw Github username" % (raw_index, author) + print " [%d] Custom" % custom_index + response = raw_input(" Your choice: ") + last_index = custom_index + while not response.isdigit() or int(response) > last_index: + response = raw_input(" Please enter an integer between 0 and %d: " % last_index) + response = int(response) + if response == custom_index: + new_author = raw_input(" Please type a custom name for this author: ") + elif response != raw_index: + new_author = candidate_names[response] + # In non-interactive mode, just pick the first candidate + else: + valid_candidate_names = [name for name, _ in candidates\ + if is_valid_author(name) and name != NOT_FOUND] + if valid_candidate_names: + new_author = valid_candidate_names[0] + # Finally, capitalize the author and replace the original one with it + # If the final replacement is still invalid, log a warning + if is_valid_author(new_author): + new_author = capitalize_author(new_author) + else: + warnings.append("Unable to find a valid name %s for author %s" % (author, temp_author)) + print " * Replacing %s with %s" % (author, new_author) + # If we are in interactive mode, prompt the user whether we want to remember this new mapping + if INTERACTIVE_MODE and\ + author not in known_translations and\ + yesOrNoPrompt(" Add mapping %s -> %s to known translations file?" 
% (author, new_author)): + known_translations_file.write("%s - %s\n" % (author, new_author)) + known_translations_file.flush() + line = line.replace(temp_author, new_author) + contributions.append(line) +print "==================================================================================\n" +contributors_file.close() +known_translations_file.close() + +# Sort the contributions before writing them to the new file. +# Additionally, check if there are any duplicate author rows. +# This could happen if the same user has both a valid full +# name (e.g. Andrew Or) and an invalid one (andrewor14). +# If so, warn the user about this at the end. +contributions.sort() +all_authors = set() +new_contributors_file_name = contributors_file_name + ".final" +new_contributors_file = open(new_contributors_file_name, "w") +for line in contributions: + author = line.strip(" * ").split(" -- ")[0] + if author in all_authors: + warnings.append("Detected duplicate author name %s. Please merge these manually." % author) + all_authors.add(author) + new_contributors_file.write(line) +new_contributors_file.close() + +print "Translated contributors list successfully written to %s!" % new_contributors_file_name + +# Log any warnings encountered in the process +if warnings: + print "\n========== Warnings encountered while translating the contributor list ===========" + for w in warnings: print w + print "Please manually correct these in the final contributors list at %s." % new_contributors_file_name + print "==================================================================================\n" + diff --git a/dev/_site/github_jira_sync.py b/dev/_site/github_jira_sync.py new file mode 100755 index 0000000000000..287f0ca24a7df --- /dev/null +++ b/dev/_site/github_jira_sync.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements.
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Utility for updating JIRA's with information about Github pull requests + +import json +import os +import re +import sys +import urllib2 + +try: + import jira.client +except ImportError: + print "This tool requires the jira-python library" + print "Install using 'sudo pip install jira'" + sys.exit(-1) + +# User facing configs +GITHUB_API_BASE = os.environ.get("GITHUB_API_BASE", "https://api.github.com/repos/apache/spark") +JIRA_PROJECT_NAME = os.environ.get("JIRA_PROJECT_NAME", "SPARK") +JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") +JIRA_USERNAME = os.environ.get("JIRA_USERNAME", "apachespark") +JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", "XXX") +# Maximum number of updates to perform in one run +MAX_UPDATES = int(os.environ.get("MAX_UPDATES", "100000")) +# Cut-off for oldest PR on which to comment. Useful for avoiding +# "notification overload" when running for the first time. +MIN_COMMENT_PR = int(os.environ.get("MIN_COMMENT_PR", "1496")) + +# File used as an optimization to store maximum previously seen PR +# Used mostly because accessing ASF JIRA is slow, so we want to avoid checking +# the state of JIRA's that are tied to PR's we've already looked at.
+MAX_FILE = ".github-jira-max" + +def get_url(url): + try: + return urllib2.urlopen(url) + except urllib2.HTTPError as e: + print "Unable to fetch URL, exiting: %s" % url + sys.exit(-1) + +def get_json(urllib_response): + return json.load(urllib_response) + +# Return a list of (JIRA id, JSON dict) tuples: +# e.g. [('SPARK-1234', {.. json ..}), ('SPARK-5687', {.. json ..})] +def get_jira_prs(): + result = [] + has_next_page = True + page_num = 0 + while has_next_page: + page = get_url(GITHUB_API_BASE + "/pulls?page=%s&per_page=100" % page_num) + page_json = get_json(page) + + for pull in page_json: + jiras = re.findall(JIRA_PROJECT_NAME + "-[0-9]{4,5}", pull['title']) + for jira in jiras: + result = result + [(jira, pull)] + + # Check if there is another page + link_header = filter(lambda k: k.startswith("Link"), page.info().headers)[0] + if not "next" in link_header: + has_next_page = False + else: + page_num = page_num + 1 + return result + +def set_max_pr(max_val): + f = open(MAX_FILE, 'w') + f.write("%s" % max_val) + f.close() + print "Writing largest PR number seen: %s" % max_val + +def get_max_pr(): + if os.path.exists(MAX_FILE): + result = int(open(MAX_FILE, 'r').read()) + print "Read largest PR number previously seen: %s" % result + return result + else: + return 0 + +jira_client = jira.client.JIRA({'server': JIRA_API_BASE}, + basic_auth=(JIRA_USERNAME, JIRA_PASSWORD)) + +jira_prs = get_jira_prs() + +previous_max = get_max_pr() +print "Retrieved %s JIRA PR's from Github" % len(jira_prs) +jira_prs = [(k, v) for k, v in jira_prs if int(v['number']) > previous_max] +print "%s PR's remain after excluding visited ones" % len(jira_prs) + +num_updates = 0 +considered = [] +for issue, pr in sorted(jira_prs, key=lambda (k, v): int(v['number'])): + if num_updates >= MAX_UPDATES: + break + pr_num = int(pr['number']) + + print "Checking issue %s" % issue + considered = considered + [pr_num] + + url = pr['html_url'] + title = "[Github] Pull Request #%s (%s)" %
(pr['number'], pr['user']['login']) + try: + existing_links = map(lambda l: l.raw['object']['url'], jira_client.remote_links(issue)) + except: + print "Failure reading JIRA %s (does it exist?)" % issue + print sys.exc_info()[0] + continue + + if url in existing_links: + continue + + icon = {"title": "Pull request #%s" % pr['number'], + "url16x16": "https://assets-cdn.github.com/favicon.ico"} + destination = {"title": title, "url": url, "icon": icon} + # For all possible fields see: + # https://developer.atlassian.com/display/JIRADEV/Fields+in+Remote+Issue+Links + # application = {"name": "Github pull requests", "type": "org.apache.spark.jira.github"} + jira_client.add_remote_link(issue, destination) + + comment = "User '%s' has created a pull request for this issue:" % pr['user']['login'] + comment = comment + ("\n%s" % pr['html_url']) + if pr_num >= MIN_COMMENT_PR: + jira_client.add_comment(issue, comment) + + print "Added link %s <-> PR #%s" % (issue, pr['number']) + num_updates = num_updates + 1 + +if len(considered) > 0: + set_max_pr(max(considered)) diff --git a/dev/_site/lint-python b/dev/_site/lint-python new file mode 100755 index 0000000000000..0b97213ae3dff --- /dev/null +++ b/dev/_site/lint-python @@ -0,0 +1,114 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" +SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")" +PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/ ./dev/sparktestsupport" +PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/run-tests.py ./dev/run-tests-jenkins.py" +PEP8_REPORT_PATH="$SPARK_ROOT_DIR/dev/pep8-report.txt" +PYLINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/pylint-report.txt" +PYLINT_INSTALL_INFO="$SPARK_ROOT_DIR/dev/pylint-info.txt" + +cd "$SPARK_ROOT_DIR" + +# compileall: https://docs.python.org/2/library/compileall.html +python -B -m compileall -q -l $PATHS_TO_CHECK > "$PEP8_REPORT_PATH" +compile_status="${PIPESTATUS[0]}" + +# Get pep8 at runtime so that we don't rely on it being installed on the build server. +#+ See: https://github.com/apache/spark/pull/1744#issuecomment-50982162 +#+ TODOs: +#+ - Download pep8 from PyPI. It's more "official". +PEP8_VERSION="1.6.2" +PEP8_SCRIPT_PATH="$SPARK_ROOT_DIR/dev/pep8-$PEP8_VERSION.py" +PEP8_SCRIPT_REMOTE_PATH="https://raw.githubusercontent.com/jcrocholl/pep8/$PEP8_VERSION/pep8.py" + +if [ ! -e "$PEP8_SCRIPT_PATH" ]; then + curl --silent -o "$PEP8_SCRIPT_PATH" "$PEP8_SCRIPT_REMOTE_PATH" + curl_status="$?" + + if [ "$curl_status" -ne 0 ]; then + echo "Failed to download pep8.py from \"$PEP8_SCRIPT_REMOTE_PATH\"." + exit "$curl_status" + fi +fi + +# Easy install pylint in /dev/pylint. To easy_install into a directory, the PYTHONPATH should +# be set to the directory. +# dev/pylint should be appended to the PATH variable as well. +# Jenkins by default installs the pylint3 version, so for now this just checks the code quality +# of python3. +export "PYTHONPATH=$SPARK_ROOT_DIR/dev/pylint" +export "PYLINT_HOME=$PYTHONPATH" +export "PATH=$PYTHONPATH:$PATH" + +# if [ ! -d "$PYLINT_HOME" ]; then +# mkdir "$PYLINT_HOME" +# # Redirect the annoying pylint installation output. 
+# easy_install -d "$PYLINT_HOME" pylint==1.4.4 &>> "$PYLINT_INSTALL_INFO" +# easy_install_status="$?" +# +# if [ "$easy_install_status" -ne 0 ]; then +# echo "Unable to install pylint locally in \"$PYTHONPATH\"." +# cat "$PYLINT_INSTALL_INFO" +# exit "$easy_install_status" +# fi +# +# rm "$PYLINT_INSTALL_INFO" +# +# fi + +# There is no need to write this output to a file +#+ first, but we do so so that the check status can +#+ be output before the report, like with the +#+ scalastyle and RAT checks. +python "$PEP8_SCRIPT_PATH" --ignore=E402,E731,E241,W503,E226 $PATHS_TO_CHECK >> "$PEP8_REPORT_PATH" +pep8_status="${PIPESTATUS[0]}" + +if [ "$compile_status" -eq 0 -a "$pep8_status" -eq 0 ]; then + lint_status=0 +else + lint_status=1 +fi + +if [ "$lint_status" -ne 0 ]; then + echo "PEP8 checks failed." + cat "$PEP8_REPORT_PATH" +else + echo "PEP8 checks passed." +fi + +rm "$PEP8_REPORT_PATH" + +# for to_be_checked in "$PATHS_TO_CHECK" +# do +# pylint --rcfile="$SPARK_ROOT_DIR/pylintrc" $to_be_checked >> "$PYLINT_REPORT_PATH" +# done + +# if [ "${PIPESTATUS[0]}" -ne 0 ]; then +# lint_status=1 +# echo "Pylint checks failed." +# cat "$PYLINT_REPORT_PATH" +# else +# echo "Pylint checks passed." +# fi + +# rm "$PYLINT_REPORT_PATH" + +exit "$lint_status" diff --git a/dev/_site/lint-r b/dev/_site/lint-r new file mode 100755 index 0000000000000..bfda0bca15eb7 --- /dev/null +++ b/dev/_site/lint-r @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" +SPARK_ROOT_DIR="$(dirname $SCRIPT_DIR)" +LINT_R_REPORT_FILE_NAME="$SPARK_ROOT_DIR/dev/lint-r-report.log" + + +if ! type "Rscript" > /dev/null; then + echo "ERROR: You should install R" + exit +fi + +`which Rscript` --vanilla "$SPARK_ROOT_DIR/dev/lint-r.R" "$SPARK_ROOT_DIR" | tee "$LINT_R_REPORT_FILE_NAME" + +NUM_LINES=`wc -l < "$LINT_R_REPORT_FILE_NAME" | awk '{print $1}'` +if [ "$NUM_LINES" = "0" ] ; then + lint_status=0 + echo "lintr checks passed." +else + lint_status=1 + echo "lintr checks failed." +fi + +exit "$lint_status" diff --git a/dev/_site/lint-r.R b/dev/_site/lint-r.R new file mode 100644 index 0000000000000..999eef571b824 --- /dev/null +++ b/dev/_site/lint-r.R @@ -0,0 +1,37 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +argv <- commandArgs(TRUE) +SPARK_ROOT_DIR <- as.character(argv[1]) +LOCAL_LIB_LOC <- file.path(SPARK_ROOT_DIR, "R", "lib") + +# Checks if SparkR is installed in a local directory. +if (! library(SparkR, lib.loc = LOCAL_LIB_LOC, logical.return = TRUE)) { + stop("You should install SparkR in a local directory with `R/install-dev.sh`.") +} + +# Installs lintr from Github in a local directory. +# NOTE: The CRAN's version is too old to adapt to our rules. +if ("lintr" %in% row.names(installed.packages()) == FALSE) { + devtools::install_github("jimhester/lintr") +} + +library(lintr) +library(methods) +library(testthat) +path.to.package <- file.path(SPARK_ROOT_DIR, "R", "pkg") +lint_package(path.to.package, cache = FALSE) diff --git a/dev/_site/lint-scala b/dev/_site/lint-scala new file mode 100755 index 0000000000000..c676dfdf4f44e --- /dev/null +++ b/dev/_site/lint-scala @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" +SPARK_ROOT_DIR="$(dirname $SCRIPT_DIR)" + +"$SCRIPT_DIR/scalastyle" diff --git a/dev/_site/merge_spark_pr.py b/dev/_site/merge_spark_pr.py new file mode 100755 index 0000000000000..bf1a000f46791 --- /dev/null +++ b/dev/_site/merge_spark_pr.py @@ -0,0 +1,453 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Utility for creating well-formed pull request merges and pushing them to Apache. +# usage: ./apache-pr-merge.py (see config env vars below) +# +# This utility assumes you already have local a Spark git folder and that you +# have added remotes corresponding to both (i) the github apache Spark +# mirror and (ii) the apache git repo. 
+ +import json +import os +import re +import subprocess +import sys +import urllib2 + +try: + import jira.client + JIRA_IMPORTED = True +except ImportError: + JIRA_IMPORTED = False + +# Location of your Spark git development area +SPARK_HOME = os.environ.get("SPARK_HOME", os.getcwd()) +# Remote name which points to the Gihub site +PR_REMOTE_NAME = os.environ.get("PR_REMOTE_NAME", "apache-github") +# Remote name which points to Apache git +PUSH_REMOTE_NAME = os.environ.get("PUSH_REMOTE_NAME", "apache") +# ASF JIRA username +JIRA_USERNAME = os.environ.get("JIRA_USERNAME", "") +# ASF JIRA password +JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", "") +# OAuth key used for issuing requests against the GitHub API. If this is not defined, then requests +# will be unauthenticated. You should only need to configure this if you find yourself regularly +# exceeding your IP's unauthenticated request rate limit. You can create an OAuth key at +# https://github.com/settings/tokens. This script only requires the "public_repo" scope. +GITHUB_OAUTH_KEY = os.environ.get("GITHUB_OAUTH_KEY") + + +GITHUB_BASE = "https://github.com/apache/spark/pull" +GITHUB_API_BASE = "https://api.github.com/repos/apache/spark" +JIRA_BASE = "https://issues.apache.org/jira/browse" +JIRA_API_BASE = "https://issues.apache.org/jira" +# Prefix added to temporary branches +BRANCH_PREFIX = "PR_TOOL" + + +def get_json(url): + try: + request = urllib2.Request(url) + if GITHUB_OAUTH_KEY: + request.add_header('Authorization', 'token %s' % GITHUB_OAUTH_KEY) + return json.load(urllib2.urlopen(request)) + except urllib2.HTTPError as e: + if "X-RateLimit-Remaining" in e.headers and e.headers["X-RateLimit-Remaining"] == '0': + print "Exceeded the GitHub API rate limit; see the instructions in " + \ + "dev/merge_spark_pr.py to configure an OAuth token for making authenticated " + \ + "GitHub requests." 
+ else: + print "Unable to fetch URL, exiting: %s" % url + sys.exit(-1) + + +def fail(msg): + print msg + clean_up() + sys.exit(-1) + + +def run_cmd(cmd): + print cmd + if isinstance(cmd, list): + return subprocess.check_output(cmd) + else: + return subprocess.check_output(cmd.split(" ")) + + +def continue_maybe(prompt): + result = raw_input("\n%s (y/n): " % prompt) + if result.lower() != "y": + fail("Okay, exiting") + +def clean_up(): + print "Restoring head pointer to %s" % original_head + run_cmd("git checkout %s" % original_head) + + branches = run_cmd("git branch").replace(" ", "").split("\n") + + for branch in filter(lambda x: x.startswith(BRANCH_PREFIX), branches): + print "Deleting local branch %s" % branch + run_cmd("git branch -D %s" % branch) + + +# merge the requested PR and return the merge hash +def merge_pr(pr_num, target_ref, title, body, pr_repo_desc): + pr_branch_name = "%s_MERGE_PR_%s" % (BRANCH_PREFIX, pr_num) + target_branch_name = "%s_MERGE_PR_%s_%s" % (BRANCH_PREFIX, pr_num, target_ref.upper()) + run_cmd("git fetch %s pull/%s/head:%s" % (PR_REMOTE_NAME, pr_num, pr_branch_name)) + run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, target_ref, target_branch_name)) + run_cmd("git checkout %s" % target_branch_name) + + had_conflicts = False + try: + run_cmd(['git', 'merge', pr_branch_name, '--squash']) + except Exception as e: + msg = "Error merging: %s\nWould you like to manually fix-up this merge?" % e + continue_maybe(msg) + msg = "Okay, please fix any conflicts and 'git add' conflicting files... Finished?" 
+ continue_maybe(msg) + had_conflicts = True + + commit_authors = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, + '--pretty=format:%an <%ae>']).split("\n") + distinct_authors = sorted(set(commit_authors), + key=lambda x: commit_authors.count(x), reverse=True) + primary_author = raw_input( + "Enter primary author in the format of \"name \" [%s]: " % + distinct_authors[0]) + if primary_author == "": + primary_author = distinct_authors[0] + + commits = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, + '--pretty=format:%h [%an] %s']).split("\n\n") + + merge_message_flags = [] + + merge_message_flags += ["-m", title] + if body is not None: + # We remove @ symbols from the body to avoid triggering e-mails + # to people every time someone creates a public fork of Spark. + merge_message_flags += ["-m", body.replace("@", "")] + + authors = "\n".join(["Author: %s" % a for a in distinct_authors]) + + merge_message_flags += ["-m", authors] + + if had_conflicts: + committer_name = run_cmd("git config --get user.name").strip() + committer_email = run_cmd("git config --get user.email").strip() + message = "This patch had conflicts when merged, resolved by\nCommitter: %s <%s>" % ( + committer_name, committer_email) + merge_message_flags += ["-m", message] + + # The string "Closes #%s" string is required for GitHub to correctly close the PR + merge_message_flags += ["-m", "Closes #%s from %s." % (pr_num, pr_repo_desc)] + + run_cmd(['git', 'commit', '--author="%s"' % primary_author] + merge_message_flags) + + continue_maybe("Merge complete (local ref %s). Push to %s?" % ( + target_branch_name, PUSH_REMOTE_NAME)) + + try: + run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, target_branch_name, target_ref)) + except Exception as e: + clean_up() + fail("Exception while pushing: %s" % e) + + merge_hash = run_cmd("git rev-parse %s" % target_branch_name)[:8] + clean_up() + print("Pull request #%s merged!" 
% pr_num) + print("Merge hash: %s" % merge_hash) + return merge_hash + + +def cherry_pick(pr_num, merge_hash, default_branch): + pick_ref = raw_input("Enter a branch name [%s]: " % default_branch) + if pick_ref == "": + pick_ref = default_branch + + pick_branch_name = "%s_PICK_PR_%s_%s" % (BRANCH_PREFIX, pr_num, pick_ref.upper()) + + run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, pick_ref, pick_branch_name)) + run_cmd("git checkout %s" % pick_branch_name) + + try: + run_cmd("git cherry-pick -sx %s" % merge_hash) + except Exception as e: + msg = "Error cherry-picking: %s\nWould you like to manually fix-up this merge?" % e + continue_maybe(msg) + msg = "Okay, please fix any conflicts and finish the cherry-pick. Finished?" + continue_maybe(msg) + + continue_maybe("Pick complete (local ref %s). Push to %s?" % ( + pick_branch_name, PUSH_REMOTE_NAME)) + + try: + run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, pick_branch_name, pick_ref)) + except Exception as e: + clean_up() + fail("Exception while pushing: %s" % e) + + pick_hash = run_cmd("git rev-parse %s" % pick_branch_name)[:8] + clean_up() + + print("Pull request #%s picked into %s!" 
% (pr_num, pick_ref)) + print("Pick hash: %s" % pick_hash) + return pick_ref + + +def fix_version_from_branch(branch, versions): + # Note: Assumes this is a sorted (newest->oldest) list of un-released versions + if branch == "master": + return versions[0] + else: + branch_ver = branch.replace("branch-", "") + return filter(lambda x: x.name.startswith(branch_ver), versions)[-1] + + +def resolve_jira_issue(merge_branches, comment, default_jira_id=""): + asf_jira = jira.client.JIRA({'server': JIRA_API_BASE}, + basic_auth=(JIRA_USERNAME, JIRA_PASSWORD)) + + jira_id = raw_input("Enter a JIRA id [%s]: " % default_jira_id) + if jira_id == "": + jira_id = default_jira_id + + try: + issue = asf_jira.issue(jira_id) + except Exception as e: + fail("ASF JIRA could not find %s\n%s" % (jira_id, e)) + + cur_status = issue.fields.status.name + cur_summary = issue.fields.summary + cur_assignee = issue.fields.assignee + if cur_assignee is None: + cur_assignee = "NOT ASSIGNED!!!" + else: + cur_assignee = cur_assignee.displayName + + if cur_status == "Resolved" or cur_status == "Closed": + fail("JIRA issue %s already has status '%s'" % (jira_id, cur_status)) + print ("=== JIRA %s ===" % jira_id) + print ("summary\t\t%s\nassignee\t%s\nstatus\t\t%s\nurl\t\t%s/%s\n" % ( + cur_summary, cur_assignee, cur_status, JIRA_BASE, jira_id)) + + versions = asf_jira.project_versions("SPARK") + versions = sorted(versions, key=lambda x: x.name, reverse=True) + versions = filter(lambda x: x.raw['released'] is False, versions) + # Consider only x.y.z versions + versions = filter(lambda x: re.match('\d+\.\d+\.\d+', x.name), versions) + + default_fix_versions = map(lambda x: fix_version_from_branch(x, versions).name, merge_branches) + for v in default_fix_versions: + # Handles the case where we have forked a release branch but not yet made the release. + # In this case, if the PR is committed to the master branch and the release branch, we + # only consider the release branch to be the fix version. E.g. 
it is not valid to have + # both 1.1.0 and 1.0.0 as fix versions. + (major, minor, patch) = v.split(".") + if patch == "0": + previous = "%s.%s.%s" % (major, int(minor) - 1, 0) + if previous in default_fix_versions: + default_fix_versions = filter(lambda x: x != v, default_fix_versions) + default_fix_versions = ",".join(default_fix_versions) + + fix_versions = raw_input("Enter comma-separated fix version(s) [%s]: " % default_fix_versions) + if fix_versions == "": + fix_versions = default_fix_versions + fix_versions = fix_versions.replace(" ", "").split(",") + + def get_version_json(version_str): + return filter(lambda v: v.name == version_str, versions)[0].raw + + jira_fix_versions = map(lambda v: get_version_json(v), fix_versions) + + resolve = filter(lambda a: a['name'] == "Resolve Issue", asf_jira.transitions(jira_id))[0] + resolution = filter(lambda r: r.raw['name'] == "Fixed", asf_jira.resolutions())[0] + asf_jira.transition_issue( + jira_id, resolve["id"], fixVersions = jira_fix_versions, + comment = comment, resolution = {'id': resolution.raw['id']}) + + print "Successfully resolved %s with fixVersions=%s!" % (jira_id, fix_versions) + + +def resolve_jira_issues(title, merge_branches, comment): + jira_ids = re.findall("SPARK-[0-9]{4,5}", title) + + if len(jira_ids) == 0: + resolve_jira_issue(merge_branches, comment) + for jira_id in jira_ids: + resolve_jira_issue(merge_branches, comment, jira_id) + + +def standardize_jira_ref(text): + """ + Standardize the [SPARK-XXXXX] [MODULE] prefix + Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. 
Issue" or "SPARK XXX [MLLIB]: Issue" to "[SPARK-XXX][MLLIB] Issue" + + >>> standardize_jira_ref("[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete is successful") + '[SPARK-5821][SQL] ParquetRelation2 CTAS should check if delete is successful' + >>> standardize_jira_ref("[SPARK-4123][Project Infra][WIP]: Show new dependencies added in pull requests") + '[SPARK-4123][PROJECT INFRA][WIP] Show new dependencies added in pull requests' + >>> standardize_jira_ref("[MLlib] Spark 5954: Top by key") + '[SPARK-5954][MLLIB] Top by key' + >>> standardize_jira_ref("[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl") + '[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl' + >>> standardize_jira_ref("SPARK-1094 Support MiMa for reporting binary compatibility accross versions.") + '[SPARK-1094] Support MiMa for reporting binary compatibility accross versions.' + >>> standardize_jira_ref("[WIP] [SPARK-1146] Vagrant support for Spark") + '[SPARK-1146][WIP] Vagrant support for Spark' + >>> standardize_jira_ref("SPARK-1032. If Yarn app fails before registering, app master stays aroun...") + '[SPARK-1032] If Yarn app fails before registering, app master stays aroun...' + >>> standardize_jira_ref("[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.") + '[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.' 
+ >>> standardize_jira_ref("Additional information for users building from source code") + 'Additional information for users building from source code' + """ + jira_refs = [] + components = [] + + # If the string is compliant, no need to process any further + if (re.search(r'^\[SPARK-[0-9]{3,6}\](\[[A-Z0-9_\s,]+\] )+\S+', text)): + return text + + # Extract JIRA ref(s): + pattern = re.compile(r'(SPARK[-\s]*[0-9]{3,6})+', re.IGNORECASE) + for ref in pattern.findall(text): + # Add brackets, replace spaces with a dash, & convert to uppercase + jira_refs.append('[' + re.sub(r'\s+', '-', ref.upper()) + ']') + text = text.replace(ref, '') + + # Extract spark component(s): + # Look for alphanumeric chars, spaces, dashes, periods, and/or commas + pattern = re.compile(r'(\[[\w\s,-\.]+\])', re.IGNORECASE) + for component in pattern.findall(text): + components.append(component.upper()) + text = text.replace(component, '') + + # Cleanup any remaining symbols: + pattern = re.compile(r'^\W+(.*)', re.IGNORECASE) + if (pattern.search(text) is not None): + text = pattern.search(text).groups()[0] + + # Assemble full text (JIRA ref(s), module(s), remaining text) + clean_text = ''.join(jira_refs).strip() + ''.join(components).strip() + " " + text.strip() + + # Replace multiple spaces with a single space, e.g. if no jira refs and/or components were included + clean_text = re.sub(r'\s+', ' ', clean_text.strip()) + + return clean_text + +def main(): + global original_head + + os.chdir(SPARK_HOME) + original_head = run_cmd("git rev-parse HEAD")[:8] + + branches = get_json("%s/branches" % GITHUB_API_BASE) + branch_names = filter(lambda x: x.startswith("branch-"), [x['name'] for x in branches]) + # Assumes branch names can be sorted lexicographically + latest_branch = sorted(branch_names, reverse=True)[0] + + pr_num = raw_input("Which pull request would you like to merge? (e.g. 
34): ") + pr = get_json("%s/pulls/%s" % (GITHUB_API_BASE, pr_num)) + pr_events = get_json("%s/issues/%s/events" % (GITHUB_API_BASE, pr_num)) + + url = pr["url"] + + # Decide whether to use the modified title or not + modified_title = standardize_jira_ref(pr["title"]) + if modified_title != pr["title"]: + print "I've re-written the title as follows to match the standard format:" + print "Original: %s" % pr["title"] + print "Modified: %s" % modified_title + result = raw_input("Would you like to use the modified title? (y/n): ") + if result.lower() == "y": + title = modified_title + print "Using modified title:" + else: + title = pr["title"] + print "Using original title:" + print title + else: + title = pr["title"] + + body = pr["body"] + target_ref = pr["base"]["ref"] + user_login = pr["user"]["login"] + base_ref = pr["head"]["ref"] + pr_repo_desc = "%s/%s" % (user_login, base_ref) + + # Merged pull requests don't appear as merged in the GitHub API; + # Instead, they're closed by asfgit. + merge_commits = \ + [e for e in pr_events if e["actor"]["login"] == "asfgit" and e["event"] == "closed"] + + if merge_commits: + merge_hash = merge_commits[0]["commit_id"] + message = get_json("%s/commits/%s" % (GITHUB_API_BASE, merge_hash))["commit"]["message"] + + print "Pull request %s has already been merged, assuming you want to backport" % pr_num + commit_is_downloaded = run_cmd(['git', 'rev-parse', '--quiet', '--verify', + "%s^{commit}" % merge_hash]).strip() != "" + if not commit_is_downloaded: + fail("Couldn't find any merge commit for #%s, you may need to update HEAD." % pr_num) + + print "Found commit %s:\n%s" % (merge_hash, message) + cherry_pick(pr_num, merge_hash, latest_branch) + sys.exit(0) + + if not bool(pr["mergeable"]): + msg = "Pull request %s is not mergeable in its current form.\n" % pr_num + \ + "Continue? 
(experts only!)" + continue_maybe(msg) + + print ("\n=== Pull Request #%s ===" % pr_num) + print ("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" % ( + title, pr_repo_desc, target_ref, url)) + continue_maybe("Proceed with merging pull request #%s?" % pr_num) + + merged_refs = [target_ref] + + merge_hash = merge_pr(pr_num, target_ref, title, body, pr_repo_desc) + + pick_prompt = "Would you like to pick %s into another branch?" % merge_hash + while raw_input("\n%s (y/n): " % pick_prompt).lower() == "y": + merged_refs = merged_refs + [cherry_pick(pr_num, merge_hash, latest_branch)] + + if JIRA_IMPORTED: + if JIRA_USERNAME and JIRA_PASSWORD: + continue_maybe("Would you like to update an associated JIRA?") + jira_comment = "Issue resolved by pull request %s\n[%s/%s]" % (pr_num, GITHUB_BASE, pr_num) + resolve_jira_issues(title, merged_refs, jira_comment) + else: + print "JIRA_USERNAME and JIRA_PASSWORD not set" + print "Exiting without trying to close the associated JIRA." + else: + print "Could not find jira-python library. Run 'sudo pip install jira' to install." + print "Exiting without trying to close the associated JIRA." + +if __name__ == "__main__": + import doctest + (failure_count, test_count) = doctest.testmod() + if failure_count: + exit(-1) + + main() diff --git a/dev/_site/mima b/dev/_site/mima new file mode 100755 index 0000000000000..2952fa65d42ff --- /dev/null +++ b/dev/_site/mima @@ -0,0 +1,54 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -o pipefail +set -e + +# Go to the Spark project root directory +FWDIR="$(cd "`dirname "$0"`"/..; pwd)" +cd "$FWDIR" + +echo -e "q\n" | build/sbt oldDeps/update +rm -f .generated-mima* + +generate_mima_ignore() { + SPARK_JAVA_OPTS="-XX:MaxPermSize=1g -Xmx2g" \ + ./bin/spark-class org.apache.spark.tools.GenerateMIMAIgnore +} + +# Generate Mima Ignore is called twice, first with latest built jars +# on the classpath and then again with previous version jars on the classpath. +# Because of a bug in GenerateMIMAIgnore that when old jars are ahead on classpath +# it did not process the new classes (which are in assembly jar). +generate_mima_ignore + +export SPARK_CLASSPATH="`find lib_managed \( -name '*spark*jar' -a -type f \) | tr "\\n" ":"`" +echo "SPARK_CLASSPATH=$SPARK_CLASSPATH" + +generate_mima_ignore + +echo -e "q\n" | build/sbt mima-report-binary-issues | grep -v -e "info.*Resolving" +ret_val=$? + +if [ $ret_val != 0 ]; then + echo "NOTE: Exceptions to binary compatibility can be added in project/MimaExcludes.scala" +fi + +rm -f .generated-mima* +exit $ret_val diff --git a/dev/_site/run-tests b/dev/_site/run-tests new file mode 100755 index 0000000000000..257d1e8d50bb4 --- /dev/null +++ b/dev/_site/run-tests @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +FWDIR="$(cd "`dirname $0`"/..; pwd)" +cd "$FWDIR" + +exec python -u ./dev/run-tests.py "$@" diff --git a/dev/_site/run-tests-jenkins b/dev/_site/run-tests-jenkins new file mode 100755 index 0000000000000..e79accf9e987a --- /dev/null +++ b/dev/_site/run-tests-jenkins @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Wrapper script that runs the Spark tests then reports QA results +# to github via its API. 
+# Environment variables are populated by the code here: +#+ https://github.com/jenkinsci/ghprb-plugin/blob/master/src/main/java/org/jenkinsci/plugins/ghprb/GhprbTrigger.java#L139 + +FWDIR="$(cd "`dirname $0`"/..; pwd)" +cd "$FWDIR" + +exec python -u ./dev/run-tests-jenkins.py "$@" diff --git a/dev/_site/run-tests-jenkins.py b/dev/_site/run-tests-jenkins.py new file mode 100755 index 0000000000000..623004310e189 --- /dev/null +++ b/dev/_site/run-tests-jenkins.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python2 + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from __future__ import print_function +import os +import sys +import json +import urllib2 +import functools +import subprocess + +from sparktestsupport import SPARK_HOME, ERROR_CODES +from sparktestsupport.shellutils import run_cmd + + +def print_err(msg): + """ + Given a set of arguments, will print them to the STDERR stream + """ + print(msg, file=sys.stderr) + + +def post_message_to_github(msg, ghprb_pull_id): + print("Attempting to post to Github...") + + url = "https://api.github.com/repos/apache/spark/issues/" + ghprb_pull_id + "/comments" + github_oauth_key = os.environ["GITHUB_OAUTH_KEY"] + + posted_message = json.dumps({"body": msg}) + request = urllib2.Request(url, + headers={ + "Authorization": "token %s" % github_oauth_key, + "Content-Type": "application/json" + }, + data=posted_message) + try: + response = urllib2.urlopen(request) + + if response.getcode() == 201: + print(" > Post successful.") + except urllib2.HTTPError as http_e: + print_err("Failed to post message to Github.") + print_err(" > http_code: %s" % http_e.code) + print_err(" > api_response: %s" % http_e.read()) + print_err(" > data: %s" % posted_message) + except urllib2.URLError as url_e: + print_err("Failed to post message to Github.") + print_err(" > urllib2_status: %s" % url_e.reason[1]) + print_err(" > data: %s" % posted_message) + + +def pr_message(build_display_name, + build_url, + ghprb_pull_id, + short_commit_hash, + commit_url, + msg, + post_msg=''): + # align the arguments properly for string formatting + str_args = (build_display_name, + msg, + build_url, + ghprb_pull_id, + short_commit_hash, + commit_url, + str(' ' + post_msg + '.') if post_msg else '.') + return '**[Test build %s %s](%sconsoleFull)** for PR %s at commit [`%s`](%s)%s' % str_args + + +def run_pr_checks(pr_tests, ghprb_actual_commit, sha1): + """ + Executes a set of pull request checks to ease development and report issues with various + components such as style, linting, dependencies, compatibilities, 
etc. + @return a list of messages to post back to Github + """ + # Ensure we save off the current HEAD to revert to + current_pr_head = run_cmd(['git', 'rev-parse', 'HEAD'], return_output=True).strip() + pr_results = list() + + for pr_test in pr_tests: + test_name = pr_test + '.sh' + pr_results.append(run_cmd(['bash', os.path.join(SPARK_HOME, 'dev', 'tests', test_name), + ghprb_actual_commit, sha1], + return_output=True).rstrip()) + # Ensure, after each test, that we're back on the current PR + run_cmd(['git', 'checkout', '-f', current_pr_head]) + return pr_results + + +def run_tests(tests_timeout): + """ + Runs the `dev/run-tests` script and responds with the correct error message + under the various failure scenarios. + @return a tuple containing the test result code and the result note to post to Github + """ + + test_result_code = subprocess.Popen(['timeout', + tests_timeout, + os.path.join(SPARK_HOME, 'dev', 'run-tests')]).wait() + + failure_note_by_errcode = { + 1: 'executing the `dev/run-tests` script', # error to denote run-tests script failures + ERROR_CODES["BLOCK_GENERAL"]: 'some tests', + ERROR_CODES["BLOCK_RAT"]: 'RAT tests', + ERROR_CODES["BLOCK_SCALA_STYLE"]: 'Scala style tests', + ERROR_CODES["BLOCK_PYTHON_STYLE"]: 'Python style tests', + ERROR_CODES["BLOCK_R_STYLE"]: 'R style tests', + ERROR_CODES["BLOCK_DOCUMENTATION"]: 'to generate documentation', + ERROR_CODES["BLOCK_BUILD"]: 'to build', + ERROR_CODES["BLOCK_MIMA"]: 'MiMa tests', + ERROR_CODES["BLOCK_SPARK_UNIT_TESTS"]: 'Spark unit tests', + ERROR_CODES["BLOCK_PYSPARK_UNIT_TESTS"]: 'PySpark unit tests', + ERROR_CODES["BLOCK_SPARKR_UNIT_TESTS"]: 'SparkR unit tests', + ERROR_CODES["BLOCK_TIMEOUT"]: 'from timeout after a configured wait of \`%s\`' % ( + tests_timeout) + } + + if test_result_code == 0: + test_result_note = ' * This patch passes all tests.' + else: + test_result_note = ' * This patch **fails %s**.' 
% failure_note_by_errcode[test_result_code] + + return [test_result_code, test_result_note] + + +def main(): + # Important Environment Variables + # --- + # $ghprbActualCommit + # This is the hash of the most recent commit in the PR. + # The merge-base of this and master is the commit from which the PR was branched. + # $sha1 + # If the patch merges cleanly, this is a reference to the merge commit hash + # (e.g. "origin/pr/2606/merge"). + # If the patch does not merge cleanly, it is equal to $ghprbActualCommit. + # The merge-base of this and master in the case of a clean merge is the most recent commit + # against master. + ghprb_pull_id = os.environ["ghprbPullId"] + ghprb_actual_commit = os.environ["ghprbActualCommit"] + ghprb_pull_title = os.environ["ghprbPullTitle"] + sha1 = os.environ["sha1"] + + # Marks this build as a pull request build. + os.environ["AMP_JENKINS_PRB"] = "true" + # Switch to a Maven-based build if the PR title contains "test-maven": + if "test-maven" in ghprb_pull_title: + os.environ["AMPLAB_JENKINS_BUILD_TOOL"] = "maven" + # Switch the Hadoop profile based on the PR title: + if "test-hadoop1.0" in ghprb_pull_title: + os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop1.0" + if "test-hadoop2.2" in ghprb_pull_title: + os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.0" + if "test-hadoop2.2" in ghprb_pull_title: + os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.2" + if "test-hadoop2.3" in ghprb_pull_title: + os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.3" + + build_display_name = os.environ["BUILD_DISPLAY_NAME"] + build_url = os.environ["BUILD_URL"] + + commit_url = "https://github.com/apache/spark/commit/" + ghprb_actual_commit + + # GitHub doesn't auto-link short hashes when submitted via the API, unfortunately. 
:( + short_commit_hash = ghprb_actual_commit[0:7] + + # format: http://linux.die.net/man/1/timeout + # must be less than the timeout configured on Jenkins (currently 300m) + tests_timeout = "250m" + + # Array to capture all test names to run on the pull request. These tests are represented + # by their file equivalents in the dev/tests/ directory. + # + # To write a PR test: + # * the file must reside within the dev/tests directory + # * be an executable bash script + # * accept three arguments on the command line, the first being the Github PR long commit + # hash, the second the Github SHA1 hash, and the final the current PR hash + # * and, lastly, return string output to be included in the pr message output that will + # be posted to Github + pr_tests = [ + "pr_merge_ability", + "pr_public_classes" + # DISABLED (pwendell) "pr_new_dependencies" + ] + + # `bind_message_base` returns a function to generate messages for Github posting + github_message = functools.partial(pr_message, + build_display_name, + build_url, + ghprb_pull_id, + short_commit_hash, + commit_url) + + # post start message + post_message_to_github(github_message('has started'), ghprb_pull_id) + + pr_check_results = run_pr_checks(pr_tests, ghprb_actual_commit, sha1) + + test_result_code, test_result_note = run_tests(tests_timeout) + + # post end message + result_message = github_message('has finished') + result_message += '\n' + test_result_note + '\n' + result_message += '\n'.join(pr_check_results) + + post_message_to_github(result_message, ghprb_pull_id) + + sys.exit(test_result_code) + + +if __name__ == "__main__": + main() diff --git a/dev/_site/run-tests.py b/dev/_site/run-tests.py new file mode 100755 index 0000000000000..9e1abb0697192 --- /dev/null +++ b/dev/_site/run-tests.py @@ -0,0 +1,561 @@ +#!/usr/bin/env python2 + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function +import itertools +from optparse import OptionParser +import os +import random +import re +import sys +import subprocess +from collections import namedtuple + +from sparktestsupport import SPARK_HOME, USER_HOME, ERROR_CODES +from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which +import sparktestsupport.modules as modules + + +# ------------------------------------------------------------------------------------------------- +# Functions for traversing module dependency graph +# ------------------------------------------------------------------------------------------------- + + +def determine_modules_for_files(filenames): + """ + Given a list of filenames, return the set of modules that contain those files. + If a file is not associated with a more specific submodule, then this method will consider that + file to belong to the 'root' module. 
+ + >>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/test/foo"])) + ['pyspark-core', 'sql'] + >>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])] + ['root'] + """ + changed_modules = set() + for filename in filenames: + matched_at_least_one_module = False + for module in modules.all_modules: + if module.contains_file(filename): + changed_modules.add(module) + matched_at_least_one_module = True + if not matched_at_least_one_module: + changed_modules.add(modules.root) + return changed_modules + + +def identify_changed_files_from_git_commits(patch_sha, target_branch=None, target_ref=None): + """ + Given a git commit and target ref, use the set of files changed in the diff in order to + determine which modules' tests should be run. + + >>> [x.name for x in determine_modules_for_files( \ + identify_changed_files_from_git_commits("fc0a1475ef", target_ref="5da21f07"))] + ['graphx'] + >>> 'root' in [x.name for x in determine_modules_for_files( \ + identify_changed_files_from_git_commits("50a0496a43", target_ref="6765ef9"))] + True + """ + if target_branch is None and target_ref is None: + raise AttributeError("must specify either target_branch or target_ref") + elif target_branch is not None and target_ref is not None: + raise AttributeError("must specify either target_branch or target_ref, not both") + if target_branch is not None: + diff_target = target_branch + run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)]) + else: + diff_target = target_ref + raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target], + universal_newlines=True) + # Remove any empty strings + return [f for f in raw_output.split('\n') if f] + + +def setup_test_environ(environ): + print("[info] Setup the following environment variables for tests: ") + for (k, v) in environ.items(): + print("%s=%s" % (k, v)) + os.environ[k] = v + + +def 
determine_modules_to_test(changed_modules): + """ + Given a set of modules that have changed, compute the transitive closure of those modules' + dependent modules in order to determine the set of modules that should be tested. + + >>> sorted(x.name for x in determine_modules_to_test([modules.root])) + ['root'] + >>> sorted(x.name for x in determine_modules_to_test([modules.graphx])) + ['examples', 'graphx'] + >>> x = sorted(x.name for x in determine_modules_to_test([modules.sql])) + >>> x # doctest: +NORMALIZE_WHITESPACE + ['examples', 'hive-thriftserver', 'mllib', 'pyspark-ml', \ + 'pyspark-mllib', 'pyspark-sql', 'sparkr', 'sql'] + """ + # If we're going to have to run all of the tests, then we can just short-circuit + # and return 'root'. No module depends on root, so if it appears then it will be + # in changed_modules. + if modules.root in changed_modules: + return [modules.root] + modules_to_test = set() + for module in changed_modules: + modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules)) + return modules_to_test.union(set(changed_modules)) + + +def determine_tags_to_exclude(changed_modules): + tags = [] + for m in modules.all_modules: + if m not in changed_modules: + tags += m.test_tags + return tags + + +# ------------------------------------------------------------------------------------------------- +# Functions for working with subprocesses and shell tools +# ------------------------------------------------------------------------------------------------- + + +def determine_java_executable(): + """Will return the path of the java executable that will be used by Spark's + tests or `None`""" + + # Any changes in the way that Spark's build detects java must be reflected + # here. 
Currently the build looks for $JAVA_HOME/bin/java then falls back to + # the `java` executable on the path + + java_home = os.environ.get("JAVA_HOME") + + # check if there is an executable at $JAVA_HOME/bin/java + java_exe = which(os.path.join(java_home, "bin", "java")) if java_home else None + # if the java_exe wasn't set, check for a `java` version on the $PATH + return java_exe if java_exe else which("java") + + +JavaVersion = namedtuple('JavaVersion', ['major', 'minor', 'patch', 'update']) + + +def determine_java_version(java_exe): + """Given a valid java executable will return its version in named tuple format + with accessors '.major', '.minor', '.patch', '.update'""" + + raw_output = subprocess.check_output([java_exe, "-version"], + stderr=subprocess.STDOUT, + universal_newlines=True) + + raw_output_lines = raw_output.split('\n') + + # find raw version string, eg 'java version "1.8.0_25"' + raw_version_str = next(x for x in raw_output_lines if " version " in x) + + match = re.search('(\d+)\.(\d+)\.(\d+)_(\d+)', raw_version_str) + + major = int(match.group(1)) + minor = int(match.group(2)) + patch = int(match.group(3)) + update = int(match.group(4)) + + return JavaVersion(major, minor, patch, update) + +# ------------------------------------------------------------------------------------------------- +# Functions for running the other build and test scripts +# ------------------------------------------------------------------------------------------------- + + +def set_title_and_block(title, err_block): + os.environ["CURRENT_BLOCK"] = str(ERROR_CODES[err_block]) + line_str = '=' * 72 + + print('') + print(line_str) + print(title) + print(line_str) + + +def run_apache_rat_checks(): + set_title_and_block("Running Apache RAT checks", "BLOCK_RAT") + run_cmd([os.path.join(SPARK_HOME, "dev", "check-license")]) + + +def run_scala_style_checks(): + set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE") + run_cmd([os.path.join(SPARK_HOME, "dev", 
"lint-scala")]) + + +def run_python_style_checks(): + set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE") + run_cmd([os.path.join(SPARK_HOME, "dev", "lint-python")]) + + +def run_sparkr_style_checks(): + set_title_and_block("Running R style checks", "BLOCK_R_STYLE") + + if which("R"): + # R style check should be executed after `install-dev.sh`. + # Since warnings about `no visible global function definition` appear + # without the installation. SEE ALSO: SPARK-9121. + run_cmd([os.path.join(SPARK_HOME, "dev", "lint-r")]) + else: + print("Ignoring SparkR style check as R was not found in PATH") + + +def build_spark_documentation(): + set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION") + os.environ["PRODUCTION"] = "1 jekyll build" + + os.chdir(os.path.join(SPARK_HOME, "docs")) + + jekyll_bin = which("jekyll") + + if not jekyll_bin: + print("[error] Cannot find a version of `jekyll` on the system; please", + " install one and retry to build documentation.") + sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) + else: + run_cmd([jekyll_bin, "build"]) + + os.chdir(SPARK_HOME) + + +def get_zinc_port(): + """ + Get a randomized port on which to start Zinc + """ + return random.randrange(3030, 4030) + + +def kill_zinc_on_port(zinc_port): + """ + Kill the Zinc process running on the given port, if one exists. 
+ """ + cmd = ("/usr/sbin/lsof -P |grep %s | grep LISTEN " + "| awk '{ print $2; }' | xargs kill") % zinc_port + subprocess.check_call(cmd, shell=True) + + +def exec_maven(mvn_args=()): + """Will call Maven in the current directory with the list of mvn_args passed + in and returns the subprocess for any further processing""" + + zinc_port = get_zinc_port() + os.environ["ZINC_PORT"] = "%s" % zinc_port + zinc_flag = "-DzincPort=%s" % zinc_port + flags = [os.path.join(SPARK_HOME, "build", "mvn"), "--force", zinc_flag] + run_cmd(flags + mvn_args) + kill_zinc_on_port(zinc_port) + + +def exec_sbt(sbt_args=()): + """Will call SBT in the current directory with the list of mvn_args passed + in and returns the subprocess for any further processing""" + + sbt_cmd = [os.path.join(SPARK_HOME, "build", "sbt")] + sbt_args + + sbt_output_filter = re.compile("^.*[info].*Resolving" + "|" + + "^.*[warn].*Merging" + "|" + + "^.*[info].*Including") + + # NOTE: echo "q" is needed because sbt on encountering a build file + # with failure (either resolution or compilation) prompts the user for + # input either q, r, etc to quit or retry. This echo is there to make it + # not block. + echo_proc = subprocess.Popen(["echo", "\"q\n\""], stdout=subprocess.PIPE) + sbt_proc = subprocess.Popen(sbt_cmd, + stdin=echo_proc.stdout, + stdout=subprocess.PIPE) + echo_proc.wait() + for line in iter(sbt_proc.stdout.readline, ''): + if not sbt_output_filter.match(line): + print(line, end='') + retcode = sbt_proc.wait() + + if retcode > 0: + exit_from_command_with_retcode(sbt_cmd, retcode) + + +def get_hadoop_profiles(hadoop_version): + """ + For the given Hadoop version tag, return a list of SBT profile flags for + building and testing against that Hadoop version. 
+ """ + + sbt_maven_hadoop_profiles = { + "hadoop1.0": ["-Phadoop-1", "-Dhadoop.version=1.2.1"], + "hadoop2.0": ["-Phadoop-1", "-Dhadoop.version=2.0.0-mr1-cdh4.1.1"], + "hadoop2.2": ["-Pyarn", "-Phadoop-2.2"], + "hadoop2.3": ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"], + "hadoop2.6": ["-Pyarn", "-Phadoop-2.6"], + } + + if hadoop_version in sbt_maven_hadoop_profiles: + return sbt_maven_hadoop_profiles[hadoop_version] + else: + print("[error] Could not find", hadoop_version, "in the list. Valid options", + " are", sbt_maven_hadoop_profiles.keys()) + sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) + + +def build_spark_maven(hadoop_version): + # Enable all of the profiles for the build: + build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags + mvn_goals = ["clean", "package", "-DskipTests"] + profiles_and_goals = build_profiles + mvn_goals + + print("[info] Building Spark (w/Hive 1.2.1) using Maven with these arguments: ", + " ".join(profiles_and_goals)) + + exec_maven(profiles_and_goals) + + +def build_spark_sbt(hadoop_version): + # Enable all of the profiles for the build: + build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags + sbt_goals = ["package", + "assembly/assembly", + "streaming-kafka-assembly/assembly", + "streaming-flume-assembly/assembly", + "streaming-mqtt-assembly/assembly", + "streaming-mqtt/test:assembly", + "streaming-kinesis-asl-assembly/assembly"] + profiles_and_goals = build_profiles + sbt_goals + + print("[info] Building Spark (w/Hive 1.2.1) using SBT with these arguments: ", + " ".join(profiles_and_goals)) + + exec_sbt(profiles_and_goals) + + +def build_apache_spark(build_tool, hadoop_version): + """Will build Spark against Hive v1.2.1 given the passed in build tool (either `sbt` or + `maven`). 
Defaults to using `sbt`.""" + + set_title_and_block("Building Spark", "BLOCK_BUILD") + + rm_r("lib_managed") + + if build_tool == "maven": + build_spark_maven(hadoop_version) + else: + build_spark_sbt(hadoop_version) + + +def detect_binary_inop_with_mima(): + set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA") + run_cmd([os.path.join(SPARK_HOME, "dev", "mima")]) + + +def run_scala_tests_maven(test_profiles): + mvn_test_goals = ["test", "--fail-at-end"] + + profiles_and_goals = test_profiles + mvn_test_goals + + print("[info] Running Spark tests using Maven with these arguments: ", + " ".join(profiles_and_goals)) + + exec_maven(profiles_and_goals) + + +def run_scala_tests_sbt(test_modules, test_profiles): + + sbt_test_goals = set(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules)) + + if not sbt_test_goals: + return + + profiles_and_goals = test_profiles + list(sbt_test_goals) + + print("[info] Running Spark tests using SBT with these arguments: ", + " ".join(profiles_and_goals)) + + exec_sbt(profiles_and_goals) + + +def run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags): + """Function to properly execute all tests passed in as a set from the + `determine_test_suites` function""" + set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") + + test_modules = set(test_modules) + + test_profiles = get_hadoop_profiles(hadoop_version) + \ + list(set(itertools.chain.from_iterable(m.build_profile_flags for m in test_modules))) + + if excluded_tags: + test_profiles += ['-Dtest.exclude.tags=' + ",".join(excluded_tags)] + + if build_tool == "maven": + run_scala_tests_maven(test_profiles) + else: + run_scala_tests_sbt(test_modules, test_profiles) + + +def run_python_tests(test_modules, parallelism): + set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") + + command = [os.path.join(SPARK_HOME, "python", "run-tests")] + if test_modules != [modules.root]: + 
command.append("--modules=%s" % ','.join(m.name for m in test_modules)) + command.append("--parallelism=%i" % parallelism) + run_cmd(command) + + +def run_sparkr_tests(): + set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS") + + if which("R"): + run_cmd([os.path.join(SPARK_HOME, "R", "run-tests.sh")]) + else: + print("Ignoring SparkR tests as R was not found in PATH") + + +def parse_opts(): + parser = OptionParser( + prog="run-tests" + ) + parser.add_option( + "-p", "--parallelism", type="int", default=4, + help="The number of suites to test in parallel (default %default)" + ) + + (opts, args) = parser.parse_args() + if args: + parser.error("Unsupported arguments: %s" % ' '.join(args)) + if opts.parallelism < 1: + parser.error("Parallelism cannot be less than 1") + return opts + + +def main(): + opts = parse_opts() + # Ensure the user home directory (HOME) is valid and is an absolute directory + if not USER_HOME or not os.path.isabs(USER_HOME): + print("[error] Cannot determine your home directory as an absolute path;", + " ensure the $HOME environment variable is set properly.") + sys.exit(1) + + os.chdir(SPARK_HOME) + + rm_r(os.path.join(SPARK_HOME, "work")) + rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark")) + rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark")) + + os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"]) + + java_exe = determine_java_executable() + + if not java_exe: + print("[error] Cannot find a version of `java` on the system; please", + " install one and retry.") + sys.exit(2) + + java_version = determine_java_version(java_exe) + + if java_version.minor < 8: + print("[warn] Java 8 tests will not run because JDK version is < 1.8.") + + # install SparkR + if which("R"): + run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")]) + else: + print("Can't install SparkR as R is was not found in PATH") + + if os.environ.get("AMPLAB_JENKINS"): + # if we're on the Amplab Jenkins build 
servers setup variables + # to reflect the environment settings + build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt") + hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3") + test_env = "amplab_jenkins" + # add path for Python3 in Jenkins if we're calling from a Jenkins machine + os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get("PATH") + else: + # else we're running locally and can use local settings + build_tool = "sbt" + hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.3") + test_env = "local" + + print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version, + "under environment", test_env) + + changed_modules = None + changed_files = None + if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"): + target_branch = os.environ["ghprbTargetBranch"] + changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch) + changed_modules = determine_modules_for_files(changed_files) + excluded_tags = determine_tags_to_exclude(changed_modules) + if not changed_modules: + changed_modules = [modules.root] + excluded_tags = [] + print("[info] Found the following changed modules:", + ", ".join(x.name for x in changed_modules)) + + # setup environment variables + # note - the 'root' module doesn't collect environment variables for all modules. Because the + # environment variables should not be set if a module is not changed, even if running the 'root' + # module. So here we should use changed_modules rather than test_modules. 
+ test_environ = {} + for m in changed_modules: + test_environ.update(m.environ) + setup_test_environ(test_environ) + + test_modules = determine_modules_to_test(changed_modules) + + # license checks + run_apache_rat_checks() + + # style checks + if not changed_files or any(f.endswith(".scala") for f in changed_files): + run_scala_style_checks() + if not changed_files or any(f.endswith(".py") for f in changed_files): + run_python_style_checks() + if not changed_files or any(f.endswith(".R") for f in changed_files): + run_sparkr_style_checks() + + # determine if docs were changed and if we're inside the amplab environment + # note - the below commented out until *all* Jenkins workers can get `jekyll` installed + # if "DOCS" in changed_modules and test_env == "amplab_jenkins": + # build_spark_documentation() + + # spark build + build_apache_spark(build_tool, hadoop_version) + + # backwards compatibility checks + if build_tool == "sbt": + # Note: compatiblity tests only supported in sbt for now + detect_binary_inop_with_mima() + + # run the test suites + run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags) + + modules_with_python_tests = [m for m in test_modules if m.python_test_goals] + if modules_with_python_tests: + run_python_tests(modules_with_python_tests, opts.parallelism) + if any(m.should_run_r_tests for m in test_modules): + run_sparkr_tests() + + +def _test(): + import doctest + failure_count = doctest.testmod()[0] + if failure_count: + exit(-1) + +if __name__ == "__main__": + _test() + main() diff --git a/dev/_site/scalastyle b/dev/_site/scalastyle new file mode 100755 index 0000000000000..ad93f7e85b27c --- /dev/null +++ b/dev/_site/scalastyle @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +echo -e "q\n" | build/sbt -Pkinesis-asl -Phive -Phive-thriftserver scalastyle > scalastyle.txt +echo -e "q\n" | build/sbt -Pkinesis-asl -Phive -Phive-thriftserver test:scalastyle >> scalastyle.txt +# Check style with YARN built too +echo -e "q\n" | build/sbt -Pkinesis-asl -Pyarn -Phadoop-2.2 scalastyle >> scalastyle.txt +echo -e "q\n" | build/sbt -Pkinesis-asl -Pyarn -Phadoop-2.2 test:scalastyle >> scalastyle.txt + +ERRORS=$(cat scalastyle.txt | awk '{if($1~/error/)print}') +rm scalastyle.txt + +if test ! -z "$ERRORS"; then + echo -e "Scalastyle checks failed at following occurrences:\n$ERRORS" + exit 1 +else + echo -e "Scalastyle checks passed." +fi diff --git a/dev/_site/sparktestsupport/modules.py b/dev/_site/sparktestsupport/modules.py new file mode 100644 index 0000000000000..d65547e04db4b --- /dev/null +++ b/dev/_site/sparktestsupport/modules.py @@ -0,0 +1,437 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import itertools +import re + +all_modules = [] + + +class Module(object): + """ + A module is the basic abstraction in our test runner script. Each module consists of a set of + source files, a set of test commands, and a set of dependencies on other modules. We use modules + to define a dependency graph that lets determine which tests to run based on which files have + changed. + """ + + def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), environ={}, + sbt_test_goals=(), python_test_goals=(), blacklisted_python_implementations=(), + test_tags=(), should_run_r_tests=False): + """ + Define a new module. + + :param name: A short module name, for display in logging and error messages. + :param dependencies: A set of dependencies for this module. This should only include direct + dependencies; transitive dependencies are resolved automatically. + :param source_file_regexes: a set of regexes that match source files belonging to this + module. These regexes are applied by attempting to match at the beginning of the + filename strings. + :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in + order to build and test this module (e.g. '-PprofileName'). + :param environ: A dict of environment variables that should be set when files in this + module are changed. + :param sbt_test_goals: A set of SBT test goals for testing this module. + :param python_test_goals: A set of Python test goals for testing this module. 
+ :param blacklisted_python_implementations: A set of Python implementations that are not + supported by this module's Python components. The values in this set should match + strings returned by Python's `platform.python_implementation()`. + :param test_tags A set of tags that will be excluded when running unit tests if the module + is not explicitly changed. + :param should_run_r_tests: If true, changes in this module will trigger all R tests. + """ + self.name = name + self.dependencies = dependencies + self.source_file_prefixes = source_file_regexes + self.sbt_test_goals = sbt_test_goals + self.build_profile_flags = build_profile_flags + self.environ = environ + self.python_test_goals = python_test_goals + self.blacklisted_python_implementations = blacklisted_python_implementations + self.test_tags = test_tags + self.should_run_r_tests = should_run_r_tests + + self.dependent_modules = set() + for dep in dependencies: + dep.dependent_modules.add(self) + all_modules.append(self) + + def contains_file(self, filename): + return any(re.match(p, filename) for p in self.source_file_prefixes) + + +sql = Module( + name="sql", + dependencies=[], + source_file_regexes=[ + "sql/(?!hive-thriftserver)", + "bin/spark-sql", + ], + build_profile_flags=[ + "-Phive", + ], + sbt_test_goals=[ + "catalyst/test", + "sql/test", + "hive/test", + ], + test_tags=[ + "org.apache.spark.tags.ExtendedHiveTest" + ] +) + + +hive_thriftserver = Module( + name="hive-thriftserver", + dependencies=[sql], + source_file_regexes=[ + "sql/hive-thriftserver", + "sbin/start-thriftserver.sh", + ], + build_profile_flags=[ + "-Phive-thriftserver", + ], + sbt_test_goals=[ + "hive-thriftserver/test", + ] +) + + +graphx = Module( + name="graphx", + dependencies=[], + source_file_regexes=[ + "graphx/", + ], + sbt_test_goals=[ + "graphx/test" + ] +) + + +streaming = Module( + name="streaming", + dependencies=[], + source_file_regexes=[ + "streaming", + ], + sbt_test_goals=[ + "streaming/test", + ] +) + + +# 
Don't set the dependencies because changes in other modules should not trigger Kinesis tests. +# Kinesis tests depends on external Amazon kinesis service. We should run these tests only when +# files in streaming_kinesis_asl are changed, so that if Kinesis experiences an outage, we don't +# fail other PRs. +streaming_kinesis_asl = Module( + name="streaming-kinesis-asl", + dependencies=[], + source_file_regexes=[ + "extras/kinesis-asl/", + "extras/kinesis-asl-assembly/", + ], + build_profile_flags=[ + "-Pkinesis-asl", + ], + environ={ + "ENABLE_KINESIS_TESTS": "1" + }, + sbt_test_goals=[ + "streaming-kinesis-asl/test", + ] +) + + +streaming_zeromq = Module( + name="streaming-zeromq", + dependencies=[streaming], + source_file_regexes=[ + "external/zeromq", + ], + sbt_test_goals=[ + "streaming-zeromq/test", + ] +) + + +streaming_twitter = Module( + name="streaming-twitter", + dependencies=[streaming], + source_file_regexes=[ + "external/twitter", + ], + sbt_test_goals=[ + "streaming-twitter/test", + ] +) + + +streaming_mqtt = Module( + name="streaming-mqtt", + dependencies=[streaming], + source_file_regexes=[ + "external/mqtt", + "external/mqtt-assembly", + ], + sbt_test_goals=[ + "streaming-mqtt/test", + ] +) + + +streaming_kafka = Module( + name="streaming-kafka", + dependencies=[streaming], + source_file_regexes=[ + "external/kafka", + "external/kafka-assembly", + ], + sbt_test_goals=[ + "streaming-kafka/test", + ] +) + + +streaming_flume_sink = Module( + name="streaming-flume-sink", + dependencies=[streaming], + source_file_regexes=[ + "external/flume-sink", + ], + sbt_test_goals=[ + "streaming-flume-sink/test", + ] +) + + +streaming_flume = Module( + name="streaming-flume", + dependencies=[streaming], + source_file_regexes=[ + "external/flume", + ], + sbt_test_goals=[ + "streaming-flume/test", + ] +) + + +streaming_flume_assembly = Module( + name="streaming-flume-assembly", + dependencies=[streaming_flume, streaming_flume_sink], + source_file_regexes=[ + 
"external/flume-assembly", + ] +) + + +mllib = Module( + name="mllib", + dependencies=[streaming, sql], + source_file_regexes=[ + "data/mllib/", + "mllib/", + ], + sbt_test_goals=[ + "mllib/test", + ] +) + + +examples = Module( + name="examples", + dependencies=[graphx, mllib, streaming, sql], + source_file_regexes=[ + "examples/", + ], + sbt_test_goals=[ + "examples/test", + ] +) + + +pyspark_core = Module( + name="pyspark-core", + dependencies=[], + source_file_regexes=[ + "python/(?!pyspark/(ml|mllib|sql|streaming))" + ], + python_test_goals=[ + "pyspark.rdd", + "pyspark.context", + "pyspark.conf", + "pyspark.broadcast", + "pyspark.accumulators", + "pyspark.serializers", + "pyspark.profiler", + "pyspark.shuffle", + "pyspark.tests", + ] +) + + +pyspark_sql = Module( + name="pyspark-sql", + dependencies=[pyspark_core, sql], + source_file_regexes=[ + "python/pyspark/sql" + ], + python_test_goals=[ + "pyspark.sql.types", + "pyspark.sql.context", + "pyspark.sql.column", + "pyspark.sql.dataframe", + "pyspark.sql.group", + "pyspark.sql.functions", + "pyspark.sql.readwriter", + "pyspark.sql.window", + "pyspark.sql.tests", + ] +) + + +pyspark_streaming = Module( + name="pyspark-streaming", + dependencies=[ + pyspark_core, + streaming, + streaming_kafka, + streaming_flume_assembly, + streaming_mqtt, + streaming_kinesis_asl + ], + source_file_regexes=[ + "python/pyspark/streaming" + ], + python_test_goals=[ + "pyspark.streaming.util", + "pyspark.streaming.tests", + ] +) + + +pyspark_mllib = Module( + name="pyspark-mllib", + dependencies=[pyspark_core, pyspark_streaming, pyspark_sql, mllib], + source_file_regexes=[ + "python/pyspark/mllib" + ], + python_test_goals=[ + "pyspark.mllib.classification", + "pyspark.mllib.clustering", + "pyspark.mllib.evaluation", + "pyspark.mllib.feature", + "pyspark.mllib.fpm", + "pyspark.mllib.linalg.__init__", + "pyspark.mllib.linalg.distributed", + "pyspark.mllib.random", + "pyspark.mllib.recommendation", + "pyspark.mllib.regression", + 
"pyspark.mllib.stat._statistics", + "pyspark.mllib.stat.KernelDensity", + "pyspark.mllib.tree", + "pyspark.mllib.util", + "pyspark.mllib.tests", + ], + blacklisted_python_implementations=[ + "PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there + ] +) + + +pyspark_ml = Module( + name="pyspark-ml", + dependencies=[pyspark_core, pyspark_mllib], + source_file_regexes=[ + "python/pyspark/ml/" + ], + python_test_goals=[ + "pyspark.ml.feature", + "pyspark.ml.classification", + "pyspark.ml.clustering", + "pyspark.ml.recommendation", + "pyspark.ml.regression", + "pyspark.ml.tuning", + "pyspark.ml.tests", + "pyspark.ml.evaluation", + ], + blacklisted_python_implementations=[ + "PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there + ] +) + +sparkr = Module( + name="sparkr", + dependencies=[sql, mllib], + source_file_regexes=[ + "R/", + ], + should_run_r_tests=True +) + + +docs = Module( + name="docs", + dependencies=[], + source_file_regexes=[ + "docs/", + ] +) + + +ec2 = Module( + name="ec2", + dependencies=[], + source_file_regexes=[ + "ec2/", + ] +) + + +yarn = Module( + name="yarn", + dependencies=[], + source_file_regexes=[ + "yarn/", + "network/yarn/", + ], + sbt_test_goals=[ + "yarn/test", + "network-yarn/test", + ], + test_tags=[ + "org.apache.spark.tags.ExtendedYarnTest" + ] +) + +# The root module is a dummy module which is used to run all of the tests. +# No other modules should directly depend on this module. 
+root = Module( + name="root", + dependencies=[], + source_file_regexes=[], + # In order to run all of the tests, enable every test profile: + build_profile_flags=list(set( + itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))), + sbt_test_goals=[ + "test", + ], + python_test_goals=list(itertools.chain.from_iterable(m.python_test_goals for m in all_modules)), + should_run_r_tests=True +) diff --git a/dev/_site/sparktestsupport/shellutils.py b/dev/_site/sparktestsupport/shellutils.py new file mode 100644 index 0000000000000..d280e797077d1 --- /dev/null +++ b/dev/_site/sparktestsupport/shellutils.py @@ -0,0 +1,115 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from __future__ import print_function +import os +import shutil +import subprocess +import sys + + +if sys.version_info >= (2, 7): + subprocess_check_output = subprocess.check_output + subprocess_check_call = subprocess.check_call +else: + # SPARK-8763 + # backported from subprocess module in Python 2.7 + def subprocess_check_output(*popenargs, **kwargs): + if 'stdout' in kwargs: + raise ValueError('stdout argument not allowed, it will be overridden.') + process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs) + output, unused_err = process.communicate() + retcode = process.poll() + if retcode: + cmd = kwargs.get("args") + if cmd is None: + cmd = popenargs[0] + raise subprocess.CalledProcessError(retcode, cmd, output=output) + return output + + # backported from subprocess module in Python 2.7 + def subprocess_check_call(*popenargs, **kwargs): + retcode = call(*popenargs, **kwargs) + if retcode: + cmd = kwargs.get("args") + if cmd is None: + cmd = popenargs[0] + raise CalledProcessError(retcode, cmd) + return 0 + + +def exit_from_command_with_retcode(cmd, retcode): + print("[error] running", ' '.join(cmd), "; received return code", retcode) + sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) + + +def rm_r(path): + """ + Given an arbitrary path, properly remove it with the correct Python construct if it exists. + From: http://stackoverflow.com/a/9559881 + """ + + if os.path.isdir(path): + shutil.rmtree(path) + elif os.path.exists(path): + os.remove(path) + + +def run_cmd(cmd, return_output=False): + """ + Given a command as a list of arguments will attempt to execute the command + and, on failure, print an error message and exit. 
+ """ + + if not isinstance(cmd, list): + cmd = cmd.split() + try: + if return_output: + return subprocess_check_output(cmd) + else: + return subprocess_check_call(cmd) + except subprocess.CalledProcessError as e: + exit_from_command_with_retcode(e.cmd, e.returncode) + + +def is_exe(path): + """ + Check if a given path is an executable file. + From: http://stackoverflow.com/a/377028 + """ + + return os.path.isfile(path) and os.access(path, os.X_OK) + + +def which(program): + """ + Find and return the given program by its absolute path or 'None' if the program cannot be found. + From: http://stackoverflow.com/a/377028 + """ + + fpath = os.path.split(program)[0] + + if fpath: + if is_exe(program): + return program + else: + for path in os.environ.get("PATH").split(os.pathsep): + path = path.strip('"') + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + return None diff --git a/dev/_site/tests/pr_merge_ability.sh b/dev/_site/tests/pr_merge_ability.sh new file mode 100755 index 0000000000000..d9a347fe24a8c --- /dev/null +++ b/dev/_site/tests/pr_merge_ability.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# +# This script follows the base format for testing pull requests against +# another branch and returning results to be published. More details can be +# found at dev/run-tests-jenkins. +# +# Arg1: The Github Pull Request Actual Commit +#+ known as `ghprbActualCommit` in `run-tests-jenkins` +# Arg2: The SHA1 hash +#+ known as `sha1` in `run-tests-jenkins` +# + +ghprbActualCommit="$1" +sha1="$2" + +# check PR merge-ability +if [ "${sha1}" == "${ghprbActualCommit}" ]; then + echo " * This patch **does not merge cleanly**." +else + echo " * This patch merges cleanly." +fi diff --git a/dev/_site/tests/pr_new_dependencies.sh b/dev/_site/tests/pr_new_dependencies.sh new file mode 100755 index 0000000000000..fdfb3c62aff58 --- /dev/null +++ b/dev/_site/tests/pr_new_dependencies.sh @@ -0,0 +1,117 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# This script follows the base format for testing pull requests against +# another branch and returning results to be published. More details can be +# found at dev/run-tests-jenkins. 
+# +# Arg1: The Github Pull Request Actual Commit +#+ known as `ghprbActualCommit` in `run-tests-jenkins` +# Arg2: The SHA1 hash +#+ known as `sha1` in `run-tests-jenkins` +# Arg3: Current PR Commit Hash +#+ the PR hash for the current commit +# + +ghprbActualCommit="$1" +sha1="$2" +current_pr_head="$3" + +MVN_BIN="build/mvn" +CURR_CP_FILE="my-classpath.txt" +MASTER_CP_FILE="master-classpath.txt" + +# First switch over to the master branch +git checkout -f master +# Find and copy all pom.xml files into a *.gate file that we can check +# against through various `git` changes +find -name "pom.xml" -exec cp {} {}.gate \; +# Switch back to the current PR +git checkout -f "${current_pr_head}" + +# Check if any *.pom files from the current branch are different from the master +difference_q="" +for p in $(find -name "pom.xml"); do + [[ -f "${p}" && -f "${p}.gate" ]] && \ + difference_q="${difference_q}$(diff $p.gate $p)" +done + +# If no pom files were changed we can easily say no new dependencies were added +if [ -z "${difference_q}" ]; then + echo " * This patch does not change any dependencies." 
+else + # Else we need to manually build spark to determine what, if any, dependencies + # were added into the Spark assembly jar + ${MVN_BIN} clean package dependency:build-classpath -DskipTests 2>/dev/null | \ + sed -n -e '/Building Spark Project Assembly/,$p' | \ + grep --context=1 -m 2 "Dependencies classpath:" | \ + head -n 3 | \ + tail -n 1 | \ + tr ":" "\n" | \ + rev | \ + cut -d "/" -f 1 | \ + rev | \ + sort > ${CURR_CP_FILE} + + # Checkout the master branch to compare against + git checkout -f master + + ${MVN_BIN} clean package dependency:build-classpath -DskipTests 2>/dev/null | \ + sed -n -e '/Building Spark Project Assembly/,$p' | \ + grep --context=1 -m 2 "Dependencies classpath:" | \ + head -n 3 | \ + tail -n 1 | \ + tr ":" "\n" | \ + rev | \ + cut -d "/" -f 1 | \ + rev | \ + sort > ${MASTER_CP_FILE} + + DIFF_RESULTS="`diff ${CURR_CP_FILE} ${MASTER_CP_FILE}`" + + if [ -z "${DIFF_RESULTS}" ]; then + echo " * This patch does not change any dependencies." + else + # Pretty print the new dependencies + added_deps=$(echo "${DIFF_RESULTS}" | grep "<" | cut -d' ' -f2 | awk '{printf " * \`"$1"\`\\n"}') + removed_deps=$(echo "${DIFF_RESULTS}" | grep ">" | cut -d' ' -f2 | awk '{printf " * \`"$1"\`\\n"}') + added_deps_text=" * This patch **adds the following new dependencies:**\n${added_deps}" + removed_deps_text=" * This patch **removes the following dependencies:**\n${removed_deps}" + + # Construct the final returned message with proper + return_mssg="" + [ -n "${added_deps}" ] && return_mssg="${added_deps_text}" + if [ -n "${removed_deps}" ]; then + if [ -n "${return_mssg}" ]; then + return_mssg="${return_mssg}\n${removed_deps_text}" + else + return_mssg="${removed_deps_text}" + fi + fi + echo "${return_mssg}" + fi + + # Remove the files we've left over + [ -f "${CURR_CP_FILE}" ] && rm -f "${CURR_CP_FILE}" + [ -f "${MASTER_CP_FILE}" ] && rm -f "${MASTER_CP_FILE}" + + # Clean up our mess from the Maven builds just in case + ${MVN_BIN} clean &>/dev/null +fi 
diff --git a/dev/_site/tests/pr_public_classes.sh b/dev/_site/tests/pr_public_classes.sh new file mode 100755 index 0000000000000..927295b88c963 --- /dev/null +++ b/dev/_site/tests/pr_public_classes.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# This script follows the base format for testing pull requests against +# another branch and returning results to be published. More details can be +# found at dev/run-tests-jenkins. +# +# Arg1: The Github Pull Request Actual Commit +#+ known as `ghprbActualCommit` in `run-tests-jenkins` +# Arg2: The SHA1 hash +#+ known as `sha1` in `run-tests-jenkins` +# + +# We diff master...$ghprbActualCommit because that gets us changes introduced in the PR +#+ and not anything else added to master since the PR was branched. 
+ +ghprbActualCommit="$1" +sha1="$2" + +source_files=$( + git diff master...$ghprbActualCommit --name-only `# diff patch against master from branch point` \ + | grep -v -e "\/test" `# ignore files in test directories` \ + | grep -e "\.py$" -e "\.java$" -e "\.scala$" `# include only code files` \ + | tr "\n" " " +) +new_public_classes=$( + git diff master...$ghprbActualCommit ${source_files} `# diff patch against master from branch point` \ + | grep "^\+" `# filter in only added lines` \ + | sed -r -e "s/^\+//g" `# remove the leading +` \ + | grep -e "trait " -e "class " `# filter in lines with these key words` \ + | grep -e "{" -e "(" `# filter in lines with these key words, too` \ + | grep -v -e "\@\@" -e "private" `# exclude lines with these words` \ + | grep -v -e "^// " -e "^/\*" -e "^ \* " `# exclude comment lines` \ + | sed -r -e "s/\{.*//g" `# remove from the { onwards` \ + | sed -r -e "s/\}//g" `# just in case, remove }; they mess the JSON` \ + | sed -r -e "s/\"/\\\\\"/g" `# escape double quotes; they mess the JSON` \ + | sed -r -e "s/^(.*)$/\`\1\`/g" `# surround with backticks for style` \ + | sed -r -e "s/^/ \* /g" `# prepend ' *' to start of line` \ + | sed -r -e "s/$/\\\n/g" `# append newline to end of line` \ + | tr -d "\n" `# remove actual LF characters` +) + +if [ -z "$new_public_classes" ]; then + echo " * This patch adds no public classes." 
+else + public_classes_note=" * This patch adds the following public classes _(experimental)_:" + echo "${public_classes_note}\n${new_public_classes}" +fi diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala similarity index 100% rename from examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics rename to examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala From 3af5fa334d57c6825bd4db7cf6b6ee564b97a2ff Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Fri, 13 Nov 2015 14:33:40 +0530 Subject: [PATCH 04/13] removed build files --- dev/_site/README.md | 5 - dev/_site/audit-release/README.md | 11 - dev/_site/audit-release/audit_release.py | 237 -------- .../audit-release/blank_maven_build/pom.xml | 43 -- .../audit-release/blank_sbt_build/build.sbt | 30 - .../audit-release/maven_app_core/input.txt | 8 - .../audit-release/maven_app_core/pom.xml | 52 -- .../src/main/java/SimpleApp.java | 42 -- .../audit-release/sbt_app_core/build.sbt | 28 - .../audit-release/sbt_app_core/input.txt | 8 - .../src/main/scala/SparkApp.scala | 63 -- .../audit-release/sbt_app_ganglia/build.sbt | 30 - .../src/main/scala/SparkApp.scala | 41 -- .../audit-release/sbt_app_graphx/build.sbt | 28 - .../src/main/scala/GraphxApp.scala | 55 -- .../audit-release/sbt_app_hive/build.sbt | 29 - dev/_site/audit-release/sbt_app_hive/data.txt | 9 - .../sbt_app_hive/src/main/scala/HiveApp.scala | 59 -- .../audit-release/sbt_app_kinesis/build.sbt | 28 - .../src/main/scala/SparkApp.scala | 35 -- dev/_site/audit-release/sbt_app_sql/build.sbt | 28 - .../sbt_app_sql/src/main/scala/SqlApp.scala | 61 -- .../audit-release/sbt_app_streaming/build.sbt | 28 - .../src/main/scala/StreamingApp.scala | 65 -- dev/_site/change-scala-version.sh | 70 --- dev/_site/change-version-to-2.10.sh | 23 - dev/_site/change-version-to-2.11.sh | 23 - dev/_site/check-license | 85 
--- .../create-release/generate-changelist.py | 148 ----- .../create-release/generate-contributors.py | 248 -------- dev/_site/create-release/known_translations | 167 ------ dev/_site/create-release/release-build.sh | 326 ---------- dev/_site/create-release/release-tag.sh | 79 --- dev/_site/create-release/releaseutils.py | 260 -------- .../create-release/translate-contributors.py | 253 -------- dev/_site/github_jira_sync.py | 147 ----- dev/_site/lint-python | 114 ---- dev/_site/lint-r | 41 -- dev/_site/lint-r.R | 37 -- dev/_site/lint-scala | 23 - dev/_site/merge_spark_pr.py | 453 -------------- dev/_site/mima | 54 -- dev/_site/run-tests | 23 - dev/_site/run-tests-jenkins | 28 - dev/_site/run-tests-jenkins.py | 228 ------- dev/_site/run-tests.py | 561 ------------------ dev/_site/scalastyle | 34 -- dev/_site/sparktestsupport/modules.py | 437 -------------- dev/_site/sparktestsupport/shellutils.py | 115 ---- dev/_site/tests/pr_merge_ability.sh | 39 -- dev/_site/tests/pr_new_dependencies.sh | 117 ---- dev/_site/tests/pr_public_classes.sh | 65 -- 52 files changed, 5221 deletions(-) delete mode 100644 dev/_site/README.md delete mode 100644 dev/_site/audit-release/README.md delete mode 100755 dev/_site/audit-release/audit_release.py delete mode 100644 dev/_site/audit-release/blank_maven_build/pom.xml delete mode 100644 dev/_site/audit-release/blank_sbt_build/build.sbt delete mode 100644 dev/_site/audit-release/maven_app_core/input.txt delete mode 100644 dev/_site/audit-release/maven_app_core/pom.xml delete mode 100644 dev/_site/audit-release/maven_app_core/src/main/java/SimpleApp.java delete mode 100644 dev/_site/audit-release/sbt_app_core/build.sbt delete mode 100644 dev/_site/audit-release/sbt_app_core/input.txt delete mode 100644 dev/_site/audit-release/sbt_app_core/src/main/scala/SparkApp.scala delete mode 100644 dev/_site/audit-release/sbt_app_ganglia/build.sbt delete mode 100644 dev/_site/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala delete mode 
100644 dev/_site/audit-release/sbt_app_graphx/build.sbt delete mode 100644 dev/_site/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala delete mode 100644 dev/_site/audit-release/sbt_app_hive/build.sbt delete mode 100644 dev/_site/audit-release/sbt_app_hive/data.txt delete mode 100644 dev/_site/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala delete mode 100644 dev/_site/audit-release/sbt_app_kinesis/build.sbt delete mode 100644 dev/_site/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala delete mode 100644 dev/_site/audit-release/sbt_app_sql/build.sbt delete mode 100644 dev/_site/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala delete mode 100644 dev/_site/audit-release/sbt_app_streaming/build.sbt delete mode 100644 dev/_site/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala delete mode 100755 dev/_site/change-scala-version.sh delete mode 100755 dev/_site/change-version-to-2.10.sh delete mode 100755 dev/_site/change-version-to-2.11.sh delete mode 100755 dev/_site/check-license delete mode 100755 dev/_site/create-release/generate-changelist.py delete mode 100755 dev/_site/create-release/generate-contributors.py delete mode 100644 dev/_site/create-release/known_translations delete mode 100755 dev/_site/create-release/release-build.sh delete mode 100755 dev/_site/create-release/release-tag.sh delete mode 100755 dev/_site/create-release/releaseutils.py delete mode 100755 dev/_site/create-release/translate-contributors.py delete mode 100755 dev/_site/github_jira_sync.py delete mode 100755 dev/_site/lint-python delete mode 100755 dev/_site/lint-r delete mode 100644 dev/_site/lint-r.R delete mode 100755 dev/_site/lint-scala delete mode 100755 dev/_site/merge_spark_pr.py delete mode 100755 dev/_site/mima delete mode 100755 dev/_site/run-tests delete mode 100755 dev/_site/run-tests-jenkins delete mode 100755 dev/_site/run-tests-jenkins.py delete mode 100755 dev/_site/run-tests.py delete mode 100755 dev/_site/scalastyle delete 
mode 100644 dev/_site/sparktestsupport/modules.py delete mode 100644 dev/_site/sparktestsupport/shellutils.py delete mode 100755 dev/_site/tests/pr_merge_ability.sh delete mode 100755 dev/_site/tests/pr_new_dependencies.sh delete mode 100755 dev/_site/tests/pr_public_classes.sh diff --git a/dev/_site/README.md b/dev/_site/README.md deleted file mode 100644 index 2b0f3d8ee8924..0000000000000 --- a/dev/_site/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Spark Developer Scripts -This directory contains scripts useful to developers when packaging, -testing, or committing to Spark. - -Many of these scripts require Apache credentials to work correctly. diff --git a/dev/_site/audit-release/README.md b/dev/_site/audit-release/README.md deleted file mode 100644 index f72f8c653a265..0000000000000 --- a/dev/_site/audit-release/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# Test Application Builds -This directory includes test applications which are built when auditing releases. You can -run them locally by setting appropriate environment variables. - -``` -$ cd sbt_app_core -$ SCALA_VERSION=2.10.5 \ - SPARK_VERSION=1.0.0-SNAPSHOT \ - SPARK_RELEASE_REPOSITORY=file:///home/patrick/.ivy2/local \ - sbt run -``` diff --git a/dev/_site/audit-release/audit_release.py b/dev/_site/audit-release/audit_release.py deleted file mode 100755 index 27d1dd784ce2e..0000000000000 --- a/dev/_site/audit-release/audit_release.py +++ /dev/null @@ -1,237 +0,0 @@ -#!/usr/bin/python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Audits binary and maven artifacts for a Spark release. -# Requires GPG and Maven. -# usage: -# python audit_release.py - -import os -import re -import shutil -import subprocess -import sys -import time -import urllib2 - -# Note: The following variables must be set before use! -RELEASE_URL = "http://people.apache.org/~andrewor14/spark-1.1.1-rc1/" -RELEASE_KEY = "XXXXXXXX" # Your 8-digit hex -RELEASE_REPOSITORY = "https://repository.apache.org/content/repositories/orgapachespark-1033" -RELEASE_VERSION = "1.1.1" -SCALA_VERSION = "2.10.5" -SCALA_BINARY_VERSION = "2.10" - -# Do not set these -LOG_FILE_NAME = "spark_audit_%s" % time.strftime("%h_%m_%Y_%I_%M_%S") -LOG_FILE = open(LOG_FILE_NAME, 'w') -WORK_DIR = "/tmp/audit_%s" % int(time.time()) -MAVEN_CMD = "mvn" -GPG_CMD = "gpg" -SBT_CMD = "sbt -Dsbt.log.noformat=true" - -# Track failures to print them at the end -failures = [] - -# Log a message. Use sparingly because this flushes every write. -def log(msg): - LOG_FILE.write(msg + "\n") - LOG_FILE.flush() - -def log_and_print(msg): - print msg - log(msg) - -# Prompt the user to delete the scratch directory used -def clean_work_files(): - response = raw_input("OK to delete scratch directory '%s'? 
(y/N) " % WORK_DIR) - if response == "y": - shutil.rmtree(WORK_DIR) - -# Run the given command and log its output to the log file -def run_cmd(cmd, exit_on_failure=True): - log("Running command: %s" % cmd) - ret = subprocess.call(cmd, shell=True, stdout=LOG_FILE, stderr=LOG_FILE) - if ret != 0 and exit_on_failure: - log_and_print("Command failed: %s" % cmd) - clean_work_files() - sys.exit(-1) - return ret - -def run_cmd_with_output(cmd): - log_and_print("Running command: %s" % cmd) - return subprocess.check_output(cmd, shell=True, stderr=LOG_FILE) - -# Test if the given condition is successful -# If so, print the pass message; otherwise print the failure message -def test(cond, msg): - return passed(msg) if cond else failed(msg) - -def passed(msg): - log_and_print("[PASSED] %s" % msg) - -def failed(msg): - failures.append(msg) - log_and_print("[**FAILED**] %s" % msg) - -def get_url(url): - return urllib2.urlopen(url).read() - -# If the path exists, prompt the user to delete it -# If the resource is not deleted, abort -def ensure_path_not_present(path): - full_path = os.path.expanduser(path) - if os.path.exists(full_path): - print "Found %s locally." % full_path - response = raw_input("This can interfere with testing published artifacts. OK to delete? (y/N) ") - if response == "y": - shutil.rmtree(full_path) - else: - print "Abort." - sys.exit(-1) - -log_and_print("|-------- Starting Spark audit tests for release %s --------|" % RELEASE_VERSION) -log_and_print("Log output can be found in %s" % LOG_FILE_NAME) - -original_dir = os.getcwd() - -# For each of these modules, we'll test an 'empty' application in sbt and -# maven that links against them. This will catch issues with messed up -# dependencies within those projects. 
-modules = [ - "spark-core", "spark-bagel", "spark-mllib", "spark-streaming", "spark-repl", - "spark-graphx", "spark-streaming-flume", "spark-streaming-kafka", - "spark-streaming-mqtt", "spark-streaming-twitter", "spark-streaming-zeromq", - "spark-catalyst", "spark-sql", "spark-hive", "spark-streaming-kinesis-asl" -] -modules = map(lambda m: "%s_%s" % (m, SCALA_BINARY_VERSION), modules) - -# Check for directories that might interfere with tests -local_ivy_spark = "~/.ivy2/local/org.apache.spark" -cache_ivy_spark = "~/.ivy2/cache/org.apache.spark" -local_maven_kafka = "~/.m2/repository/org/apache/kafka" -local_maven_kafka = "~/.m2/repository/org/apache/spark" -map(ensure_path_not_present, [local_ivy_spark, cache_ivy_spark, local_maven_kafka]) - -# SBT build tests -log_and_print("==== Building SBT modules ====") -os.chdir("blank_sbt_build") -os.environ["SPARK_VERSION"] = RELEASE_VERSION -os.environ["SCALA_VERSION"] = SCALA_VERSION -os.environ["SPARK_RELEASE_REPOSITORY"] = RELEASE_REPOSITORY -os.environ["SPARK_AUDIT_MASTER"] = "local" -for module in modules: - log("==== Building module %s in SBT ====" % module) - os.environ["SPARK_MODULE"] = module - ret = run_cmd("%s clean update" % SBT_CMD, exit_on_failure=False) - test(ret == 0, "SBT build against '%s' module" % module) -os.chdir(original_dir) - -# SBT application tests -log_and_print("==== Building SBT applications ====") -for app in ["sbt_app_core", "sbt_app_graphx", "sbt_app_streaming", "sbt_app_sql", "sbt_app_hive", "sbt_app_kinesis"]: - log("==== Building application %s in SBT ====" % app) - os.chdir(app) - ret = run_cmd("%s clean run" % SBT_CMD, exit_on_failure=False) - test(ret == 0, "SBT application (%s)" % app) - os.chdir(original_dir) - -# Maven build tests -os.chdir("blank_maven_build") -log_and_print("==== Building Maven modules ====") -for module in modules: - log("==== Building module %s in maven ====" % module) - cmd = ('%s --update-snapshots -Dspark.release.repository="%s" -Dspark.version="%s" ' - 
'-Dspark.module="%s" clean compile' % - (MAVEN_CMD, RELEASE_REPOSITORY, RELEASE_VERSION, module)) - ret = run_cmd(cmd, exit_on_failure=False) - test(ret == 0, "maven build against '%s' module" % module) -os.chdir(original_dir) - -# Maven application tests -log_and_print("==== Building Maven applications ====") -os.chdir("maven_app_core") -mvn_exec_cmd = ('%s --update-snapshots -Dspark.release.repository="%s" -Dspark.version="%s" ' - '-Dscala.binary.version="%s" clean compile ' - 'exec:java -Dexec.mainClass="SimpleApp"' % - (MAVEN_CMD, RELEASE_REPOSITORY, RELEASE_VERSION, SCALA_BINARY_VERSION)) -ret = run_cmd(mvn_exec_cmd, exit_on_failure=False) -test(ret == 0, "maven application (core)") -os.chdir(original_dir) - -# Binary artifact tests -if os.path.exists(WORK_DIR): - print "Working directory '%s' already exists" % WORK_DIR - sys.exit(-1) -os.mkdir(WORK_DIR) -os.chdir(WORK_DIR) - -index_page = get_url(RELEASE_URL) -artifact_regex = r = re.compile("") -artifacts = r.findall(index_page) - -# Verify artifact integrity -for artifact in artifacts: - log_and_print("==== Verifying download integrity for artifact: %s ====" % artifact) - - artifact_url = "%s/%s" % (RELEASE_URL, artifact) - key_file = "%s.asc" % artifact - run_cmd("wget %s" % artifact_url) - run_cmd("wget %s/%s" % (RELEASE_URL, key_file)) - run_cmd("wget %s%s" % (artifact_url, ".sha")) - - # Verify signature - run_cmd("%s --keyserver pgp.mit.edu --recv-key %s" % (GPG_CMD, RELEASE_KEY)) - run_cmd("%s %s" % (GPG_CMD, key_file)) - passed("Artifact signature verified.") - - # Verify md5 - my_md5 = run_cmd_with_output("%s --print-md MD5 %s" % (GPG_CMD, artifact)).strip() - release_md5 = get_url("%s.md5" % artifact_url).strip() - test(my_md5 == release_md5, "Artifact MD5 verified.") - - # Verify sha - my_sha = run_cmd_with_output("%s --print-md SHA512 %s" % (GPG_CMD, artifact)).strip() - release_sha = get_url("%s.sha" % artifact_url).strip() - test(my_sha == release_sha, "Artifact SHA verified.") - - # Verify 
Apache required files - dir_name = artifact.replace(".tgz", "") - run_cmd("tar xvzf %s" % artifact) - base_files = os.listdir(dir_name) - test("CHANGES.txt" in base_files, "Tarball contains CHANGES.txt file") - test("NOTICE" in base_files, "Tarball contains NOTICE file") - test("LICENSE" in base_files, "Tarball contains LICENSE file") - - os.chdir(WORK_DIR) - -# Report result -log_and_print("\n") -if len(failures) == 0: - log_and_print("*** ALL TESTS PASSED ***") -else: - log_and_print("XXXXX SOME TESTS DID NOT PASS XXXXX") - for f in failures: - log_and_print(" %s" % f) -os.chdir(original_dir) - -# Clean up -clean_work_files() - -log_and_print("|-------- Spark release audit complete --------|") diff --git a/dev/_site/audit-release/blank_maven_build/pom.xml b/dev/_site/audit-release/blank_maven_build/pom.xml deleted file mode 100644 index 02dd9046c9a49..0000000000000 --- a/dev/_site/audit-release/blank_maven_build/pom.xml +++ /dev/null @@ -1,43 +0,0 @@ - - - - - spark.audit - spark-audit - 4.0.0 - Spark Release Auditor - jar - 1.0 - - - Spray.cc repository - http://repo.spray.cc - - - Spark Staging Repo - ${spark.release.repository} - - - - - org.apache.spark - ${spark.module} - ${spark.version} - - - diff --git a/dev/_site/audit-release/blank_sbt_build/build.sbt b/dev/_site/audit-release/blank_sbt_build/build.sbt deleted file mode 100644 index 62815542e5bd9..0000000000000 --- a/dev/_site/audit-release/blank_sbt_build/build.sbt +++ /dev/null @@ -1,30 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -name := "Spark Release Auditor" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" % System.getenv.get("SPARK_MODULE") % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Eclipse Paho Repository" at "https://repo.eclipse.org/content/repositories/paho-releases/", - "Maven Repository" at "http://repo1.maven.org/maven2/", - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/maven_app_core/input.txt b/dev/_site/audit-release/maven_app_core/input.txt deleted file mode 100644 index 837b6f85ae97f..0000000000000 --- a/dev/_site/audit-release/maven_app_core/input.txt +++ /dev/null @@ -1,8 +0,0 @@ -a -b -c -d -a -b -c -d diff --git a/dev/_site/audit-release/maven_app_core/pom.xml b/dev/_site/audit-release/maven_app_core/pom.xml deleted file mode 100644 index b516396825573..0000000000000 --- a/dev/_site/audit-release/maven_app_core/pom.xml +++ /dev/null @@ -1,52 +0,0 @@ - - - - - spark.audit - spark-audit - 4.0.0 - Simple Project - jar - 1.0 - - - Spray.cc repository - http://repo.spray.cc - - - Spark Staging Repo - ${spark.release.repository} - - - - - org.apache.spark - spark-core_${scala.binary.version} - ${spark.version} - - - - - - - maven-compiler-plugin - 3.1 - - - - diff --git a/dev/_site/audit-release/maven_app_core/src/main/java/SimpleApp.java b/dev/_site/audit-release/maven_app_core/src/main/java/SimpleApp.java deleted file mode 100644 index 
5217689e7c092..0000000000000 --- a/dev/_site/audit-release/maven_app_core/src/main/java/SimpleApp.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.spark.api.java.*; -import org.apache.spark.api.java.function.Function; - -public class SimpleApp { - public static void main(String[] args) { - String logFile = "input.txt"; - JavaSparkContext sc = new JavaSparkContext("local", "Simple App"); - JavaRDD logData = sc.textFile(logFile).cache(); - - long numAs = logData.filter(new Function() { - public Boolean call(String s) { return s.contains("a"); } - }).count(); - - long numBs = logData.filter(new Function() { - public Boolean call(String s) { return s.contains("b"); } - }).count(); - - if (numAs != 2 || numBs != 2) { - System.out.println("Failed to parse log files with Spark"); - System.exit(-1); - } - System.out.println("Test succeeded"); - sc.stop(); - } -} diff --git a/dev/_site/audit-release/sbt_app_core/build.sbt b/dev/_site/audit-release/sbt_app_core/build.sbt deleted file mode 100644 index 291b1d6440bac..0000000000000 --- a/dev/_site/audit-release/sbt_app_core/build.sbt +++ /dev/null @@ -1,28 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) 
under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -name := "Simple Project" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-core" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_core/input.txt b/dev/_site/audit-release/sbt_app_core/input.txt deleted file mode 100644 index 837b6f85ae97f..0000000000000 --- a/dev/_site/audit-release/sbt_app_core/input.txt +++ /dev/null @@ -1,8 +0,0 @@ -a -b -c -d -a -b -c -d diff --git a/dev/_site/audit-release/sbt_app_core/src/main/scala/SparkApp.scala b/dev/_site/audit-release/sbt_app_core/src/main/scala/SparkApp.scala deleted file mode 100644 index 61d91c70e9709..0000000000000 --- a/dev/_site/audit-release/sbt_app_core/src/main/scala/SparkApp.scala +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package main.scala - -import scala.util.Try - -import org.apache.spark.SparkConf -import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ - -object SimpleApp { - def main(args: Array[String]) { - val conf = sys.env.get("SPARK_AUDIT_MASTER") match { - case Some(master) => new SparkConf().setAppName("Simple Spark App").setMaster(master) - case None => new SparkConf().setAppName("Simple Spark App") - } - val logFile = "input.txt" - val sc = new SparkContext(conf) - val logData = sc.textFile(logFile, 2).cache() - val numAs = logData.filter(line => line.contains("a")).count() - val numBs = logData.filter(line => line.contains("b")).count() - if (numAs != 2 || numBs != 2) { - println("Failed to parse log files with Spark") - System.exit(-1) - } - - // Regression test for SPARK-1167: Remove metrics-ganglia from default build due to LGPL issue - val foundConsole = Try(Class.forName("org.apache.spark.metrics.sink.ConsoleSink")).isSuccess - val foundGanglia = Try(Class.forName("org.apache.spark.metrics.sink.GangliaSink")).isSuccess - if (!foundConsole) { - println("Console sink not loaded via spark-core") - System.exit(-1) - } - if (foundGanglia) { - println("Ganglia sink was loaded via spark-core") - System.exit(-1) - } - - // Remove kinesis from default build due to ASL license issue - val foundKinesis = 
Try(Class.forName("org.apache.spark.streaming.kinesis.KinesisUtils")).isSuccess - if (foundKinesis) { - println("Kinesis was loaded via spark-core") - System.exit(-1) - } - } -} -// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_ganglia/build.sbt b/dev/_site/audit-release/sbt_app_ganglia/build.sbt deleted file mode 100644 index 6d9474acf5bbc..0000000000000 --- a/dev/_site/audit-release/sbt_app_ganglia/build.sbt +++ /dev/null @@ -1,30 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// - -name := "Ganglia Test" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-core" % System.getenv.get("SPARK_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-ganglia-lgpl" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala b/dev/_site/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala deleted file mode 100644 index 9f7ae75d0b477..0000000000000 --- a/dev/_site/audit-release/sbt_app_ganglia/src/main/scala/SparkApp.scala +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// scalastyle:off println -package main.scala - -import scala.util.Try - -import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ - -object SimpleApp { - def main(args: Array[String]) { - // Regression test for SPARK-1167: Remove metrics-ganglia from default build due to LGPL issue - val foundConsole = Try(Class.forName("org.apache.spark.metrics.sink.ConsoleSink")).isSuccess - val foundGanglia = Try(Class.forName("org.apache.spark.metrics.sink.GangliaSink")).isSuccess - if (!foundConsole) { - println("Console sink not loaded via spark-core") - System.exit(-1) - } - if (!foundGanglia) { - println("Ganglia sink not loaded via spark-ganglia-lgpl") - System.exit(-1) - } - } -} -// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_graphx/build.sbt b/dev/_site/audit-release/sbt_app_graphx/build.sbt deleted file mode 100644 index dd11245e67d44..0000000000000 --- a/dev/_site/audit-release/sbt_app_graphx/build.sbt +++ /dev/null @@ -1,28 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// - -name := "Simple Project" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-graphx" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala b/dev/_site/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala deleted file mode 100644 index 2f0b6ef9a5672..0000000000000 --- a/dev/_site/audit-release/sbt_app_graphx/src/main/scala/GraphxApp.scala +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// scalastyle:off println -package main.scala - -import org.apache.spark.{SparkContext, SparkConf} -import org.apache.spark.SparkContext._ -import org.apache.spark.graphx._ -import org.apache.spark.rdd.RDD - -object GraphXApp { - def main(args: Array[String]) { - val conf = sys.env.get("SPARK_AUDIT_MASTER") match { - case Some(master) => new SparkConf().setAppName("Simple GraphX App").setMaster(master) - case None => new SparkConf().setAppName("Simple Graphx App") - } - val sc = new SparkContext(conf) - SparkContext.jarOfClass(this.getClass).foreach(sc.addJar) - - val users: RDD[(VertexId, (String, String))] = - sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")), - (5L, ("franklin", "prof")), (2L, ("istoica", "prof")), - (4L, ("peter", "student")))) - val relationships: RDD[Edge[String]] = - sc.parallelize(Array(Edge(3L, 7L, "collab"), Edge(5L, 3L, "advisor"), - Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi"), - Edge(4L, 0L, "student"), Edge(5L, 0L, "colleague"))) - val defaultUser = ("John Doe", "Missing") - val graph = Graph(users, relationships, defaultUser) - // Notice that there is a user 0 (for which we have no information) connected to users - // 4 (peter) and 5 (franklin). - val triplets = graph.triplets.map(e => (e.srcAttr._1, e.dstAttr._1)).collect - if (!triplets.exists(_ == ("peter", "John Doe"))) { - println("Failed to run GraphX") - System.exit(-1) - } - println("Test succeeded") - } -} -// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_hive/build.sbt b/dev/_site/audit-release/sbt_app_hive/build.sbt deleted file mode 100644 index c8824f2b15e55..0000000000000 --- a/dev/_site/audit-release/sbt_app_hive/build.sbt +++ /dev/null @@ -1,29 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. 
-// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -name := "Simple Project" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-hive" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Maven Repository" at "http://repo1.maven.org/maven2/", - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_hive/data.txt b/dev/_site/audit-release/sbt_app_hive/data.txt deleted file mode 100644 index 0229e67f51e01..0000000000000 --- a/dev/_site/audit-release/sbt_app_hive/data.txt +++ /dev/null @@ -1,9 +0,0 @@ -0val_0 -1val_1 -2val_2 -3val_3 -4val_4 -5val_5 -6val_6 -7val_7 -9val_9 diff --git a/dev/_site/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala b/dev/_site/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala deleted file mode 100644 index 4a980ec071ae4..0000000000000 --- a/dev/_site/audit-release/sbt_app_hive/src/main/scala/HiveApp.scala +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package main.scala - -import scala.collection.mutable.{ListBuffer, Queue} - -import org.apache.spark.SparkConf -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.hive.HiveContext - -case class Person(name: String, age: Int) - -object SparkSqlExample { - - def main(args: Array[String]) { - val conf = sys.env.get("SPARK_AUDIT_MASTER") match { - case Some(master) => new SparkConf().setAppName("Simple Sql App").setMaster(master) - case None => new SparkConf().setAppName("Simple Sql App") - } - val sc = new SparkContext(conf) - val hiveContext = new HiveContext(sc) - - import hiveContext._ - sql("DROP TABLE IF EXISTS src") - sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") - sql("LOAD DATA LOCAL INPATH 'data.txt' INTO TABLE src") - val results = sql("FROM src SELECT key, value WHERE key >= 0 AND KEY < 5").collect() - results.foreach(println) - - def test(f: => Boolean, failureMsg: String) = { - if (!f) { - println(failureMsg) - System.exit(-1) - } - } - - test(results.size == 5, "Unexpected number of selected elements: " + results) - println("Test succeeded") - sc.stop() - } -} -// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_kinesis/build.sbt b/dev/_site/audit-release/sbt_app_kinesis/build.sbt deleted file mode 100644 index 981bc7957b5ed..0000000000000 --- 
a/dev/_site/audit-release/sbt_app_kinesis/build.sbt +++ /dev/null @@ -1,28 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -name := "Kinesis Test" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-streaming-kinesis-asl" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala b/dev/_site/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala deleted file mode 100644 index adc25b57d6aa5..0000000000000 --- a/dev/_site/audit-release/sbt_app_kinesis/src/main/scala/SparkApp.scala +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package main.scala - -import scala.util.Try - -import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ - -object SimpleApp { - def main(args: Array[String]) { - val foundKinesis = Try(Class.forName("org.apache.spark.streaming.kinesis.KinesisUtils")).isSuccess - if (!foundKinesis) { - println("Kinesis not loaded via kinesis-asl") - System.exit(-1) - } - } -} -// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_sql/build.sbt b/dev/_site/audit-release/sbt_app_sql/build.sbt deleted file mode 100644 index 9116180f71a44..0000000000000 --- a/dev/_site/audit-release/sbt_app_sql/build.sbt +++ /dev/null @@ -1,28 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// - -name := "Simple Project" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-sql" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala b/dev/_site/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala deleted file mode 100644 index 69c1154dc0955..0000000000000 --- a/dev/_site/audit-release/sbt_app_sql/src/main/scala/SqlApp.scala +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// scalastyle:off println -package main.scala - -import scala.collection.mutable.{ListBuffer, Queue} - -import org.apache.spark.SparkConf -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SQLContext - -case class Person(name: String, age: Int) - -object SparkSqlExample { - - def main(args: Array[String]) { - val conf = sys.env.get("SPARK_AUDIT_MASTER") match { - case Some(master) => new SparkConf().setAppName("Simple Sql App").setMaster(master) - case None => new SparkConf().setAppName("Simple Sql App") - } - val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) - - import sqlContext.implicits._ - import sqlContext._ - - val people = sc.makeRDD(1 to 100, 10).map(x => Person(s"Name$x", x)).toDF() - people.registerTempTable("people") - val teenagers = sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") - val teenagerNames = teenagers.map(t => "Name: " + t(0)).collect() - teenagerNames.foreach(println) - - def test(f: => Boolean, failureMsg: String) = { - if (!f) { - println(failureMsg) - System.exit(-1) - } - } - - test(teenagerNames.size == 7, "Unexpected number of selected elements: " + teenagerNames) - println("Test succeeded") - sc.stop() - } -} -// scalastyle:on println diff --git a/dev/_site/audit-release/sbt_app_streaming/build.sbt b/dev/_site/audit-release/sbt_app_streaming/build.sbt deleted file mode 100644 index cb369d516dd16..0000000000000 --- a/dev/_site/audit-release/sbt_app_streaming/build.sbt +++ /dev/null @@ -1,28 +0,0 @@ -// -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -name := "Simple Project" - -version := "1.0" - -scalaVersion := System.getenv.get("SCALA_VERSION") - -libraryDependencies += "org.apache.spark" %% "spark-streaming" % System.getenv.get("SPARK_VERSION") - -resolvers ++= Seq( - "Spark Release Repository" at System.getenv.get("SPARK_RELEASE_REPOSITORY"), - "Spray Repository" at "http://repo.spray.cc/") diff --git a/dev/_site/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala b/dev/_site/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala deleted file mode 100644 index d6a074687f4a1..0000000000000 --- a/dev/_site/audit-release/sbt_app_streaming/src/main/scala/StreamingApp.scala +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// scalastyle:off println -package main.scala - -import scala.collection.mutable.{ListBuffer, Queue} - -import org.apache.spark.SparkConf -import org.apache.spark.rdd.RDD -import org.apache.spark.streaming.StreamingContext -import org.apache.spark.streaming._ - -object SparkStreamingExample { - - def main(args: Array[String]) { - val conf = sys.env.get("SPARK_AUDIT_MASTER") match { - case Some(master) => new SparkConf().setAppName("Simple Streaming App").setMaster(master) - case None => new SparkConf().setAppName("Simple Streaming App") - } - val ssc = new StreamingContext(conf, Seconds(1)) - val seen = ListBuffer[RDD[Int]]() - - val rdd1 = ssc.sparkContext.makeRDD(1 to 100, 10) - val rdd2 = ssc.sparkContext.makeRDD(1 to 1000, 10) - val rdd3 = ssc.sparkContext.makeRDD(1 to 10000, 10) - - val queue = Queue(rdd1, rdd2, rdd3) - val stream = ssc.queueStream(queue) - - stream.foreachRDD(rdd => seen += rdd) - ssc.start() - Thread.sleep(5000) - - def test(f: => Boolean, failureMsg: String) = { - if (!f) { - println(failureMsg) - System.exit(-1) - } - } - - val rddCounts = seen.map(rdd => rdd.count()).filter(_ > 0) - test(rddCounts.length == 3, "Did not collect three RDD's from stream") - test(rddCounts.toSet == Set(100, 1000, 10000), "Did not find expected streams") - - println("Test succeeded") - - ssc.stop() - } -} -// scalastyle:on println diff --git a/dev/_site/change-scala-version.sh b/dev/_site/change-scala-version.sh deleted file mode 100755 index d7975dfb6475c..0000000000000 --- a/dev/_site/change-scala-version.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -set -e - -VALID_VERSIONS=( 2.10 2.11 ) - -usage() { - echo "Usage: $(basename $0) [-h|--help] -where : - -h| --help Display this help text - valid version values : ${VALID_VERSIONS[*]} -" 1>&2 - exit 1 -} - -if [[ ($# -ne 1) || ( $1 == "--help") || $1 == "-h" ]]; then - usage -fi - -TO_VERSION=$1 - -check_scala_version() { - for i in ${VALID_VERSIONS[*]}; do [ $i = "$1" ] && return 0; done - echo "Invalid Scala version: $1. Valid versions: ${VALID_VERSIONS[*]}" 1>&2 - exit 1 -} - -check_scala_version "$TO_VERSION" - -if [ $TO_VERSION = "2.11" ]; then - FROM_VERSION="2.10" -else - FROM_VERSION="2.11" -fi - -sed_i() { - sed -e "$1" "$2" > "$2.tmp" && mv "$2.tmp" "$2" -} - -export -f sed_i - -BASEDIR=$(dirname $0)/.. -find "$BASEDIR" -name 'pom.xml' -not -path '*target*' -print \ - -exec bash -c "sed_i 's/\(artifactId.*\)_'$FROM_VERSION'/\1_'$TO_VERSION'/g' {}" \; - -# Also update in parent POM -# Match any scala binary version to ensure idempotency -sed_i '1,/[0-9]*\.[0-9]*[0-9]*\.[0-9]*'$TO_VERSION' "$JAR_DL" && mv "$JAR_DL" "$JAR" - elif [ $(command -v wget) ]; then - wget --quiet ${URL} -O "$JAR_DL" && mv "$JAR_DL" "$JAR" - else - printf "You do not have curl or wget installed, please install rat manually.\n" - exit -1 - fi - fi - - unzip -tq "$JAR" &> /dev/null - if [ $? -ne 0 ]; then - # We failed to download - rm "$JAR" - printf "Our attempt to download rat locally to ${JAR} failed. 
Please install rat manually.\n" - exit -1 - fi -} - -# Go to the Spark project root directory -FWDIR="$(cd "`dirname "$0"`"/..; pwd)" -cd "$FWDIR" - -if test -x "$JAVA_HOME/bin/java"; then - declare java_cmd="$JAVA_HOME/bin/java" -else - declare java_cmd=java -fi - -export RAT_VERSION=0.10 -export rat_jar="$FWDIR"/lib/apache-rat-${RAT_VERSION}.jar -mkdir -p "$FWDIR"/lib - -[[ -f "$rat_jar" ]] || acquire_rat_jar || { - echo "Download failed. Obtain the rat jar manually and place it at $rat_jar" - exit 1 -} - -$java_cmd -jar "$rat_jar" -E "$FWDIR"/.rat-excludes -d "$FWDIR" > rat-results.txt - -if [ $? -ne 0 ]; then - echo "RAT exited abnormally" - exit 1 -fi - -ERRORS="$(cat rat-results.txt | grep -e "??")" - -if test ! -z "$ERRORS"; then - echo "Could not find Apache license headers in the following files:" - echo "$ERRORS" - exit 1 -else - echo -e "RAT checks passed." -fi diff --git a/dev/_site/create-release/generate-changelist.py b/dev/_site/create-release/generate-changelist.py deleted file mode 100755 index 2e1a35a629342..0000000000000 --- a/dev/_site/create-release/generate-changelist.py +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Creates CHANGES.txt from git history. 
-# -# Usage: -# First set the new release version and old CHANGES.txt version in this file. -# Make sure you have SPARK_HOME set. -# $ python generate-changelist.py - - -import os -import sys -import subprocess -import time -import traceback - -SPARK_HOME = os.environ["SPARK_HOME"] -NEW_RELEASE_VERSION = "1.0.0" -PREV_RELEASE_GIT_TAG = "v0.9.1" - -CHANGELIST = "CHANGES.txt" -OLD_CHANGELIST = "%s.old" % (CHANGELIST) -NEW_CHANGELIST = "%s.new" % (CHANGELIST) -TMP_CHANGELIST = "%s.tmp" % (CHANGELIST) - -# date before first PR in TLP Spark repo -SPARK_REPO_CHANGE_DATE1 = time.strptime("2014-02-26", "%Y-%m-%d") -# date after last PR in incubator Spark repo -SPARK_REPO_CHANGE_DATE2 = time.strptime("2014-03-01", "%Y-%m-%d") -# Threshold PR number that differentiates PRs to TLP -# and incubator repos -SPARK_REPO_PR_NUM_THRESH = 200 - -LOG_FILE_NAME = "changes_%s" % time.strftime("%h_%m_%Y_%I_%M_%S") -LOG_FILE = open(LOG_FILE_NAME, 'w') - - -def run_cmd(cmd): - try: - print >> LOG_FILE, "Running command: %s" % cmd - output = subprocess.check_output(cmd, shell=True, stderr=LOG_FILE) - print >> LOG_FILE, "Output: %s" % output - return output - except: - traceback.print_exc() - cleanup() - sys.exit(1) - - -def append_to_changelist(string): - with open(TMP_CHANGELIST, "a") as f: - print >> f, string - - -def cleanup(ask=True): - if ask is True: - print "OK to delete temporary and log files? 
(y/N): " - response = raw_input() - if ask is False or (ask is True and response == "y"): - if os.path.isfile(TMP_CHANGELIST): - os.remove(TMP_CHANGELIST) - if os.path.isfile(OLD_CHANGELIST): - os.remove(OLD_CHANGELIST) - LOG_FILE.close() - os.remove(LOG_FILE_NAME) - - -print "Generating new %s for Spark release %s" % (CHANGELIST, NEW_RELEASE_VERSION) -os.chdir(SPARK_HOME) -if os.path.isfile(TMP_CHANGELIST): - os.remove(TMP_CHANGELIST) -if os.path.isfile(OLD_CHANGELIST): - os.remove(OLD_CHANGELIST) - -append_to_changelist("Spark Change Log") -append_to_changelist("----------------") -append_to_changelist("") -append_to_changelist("Release %s" % NEW_RELEASE_VERSION) -append_to_changelist("") - -print "Getting commits between tag %s and HEAD" % PREV_RELEASE_GIT_TAG -hashes = run_cmd("git log %s..HEAD --pretty='%%h'" % PREV_RELEASE_GIT_TAG).split() - -print "Getting details of %s commits" % len(hashes) -for h in hashes: - date = run_cmd("git log %s -1 --pretty='%%ad' --date=iso | head -1" % h).strip() - subject = run_cmd("git log %s -1 --pretty='%%s' | head -1" % h).strip() - body = run_cmd("git log %s -1 --pretty='%%b'" % h) - committer = run_cmd("git log %s -1 --pretty='%%cn <%%ce>' | head -1" % h).strip() - body_lines = body.split("\n") - - if "Merge pull" in subject: - # Parse old format commit message - append_to_changelist(" %s %s" % (h, date)) - append_to_changelist(" %s" % subject) - append_to_changelist(" [%s]" % body_lines[0]) - append_to_changelist("") - - elif "maven-release" not in subject: - # Parse new format commit message - # Get authors from commit message, committer otherwise - authors = [committer] - if "Author:" in body: - authors = [line.split(":")[1].strip() for line in body_lines if "Author:" in line] - - # Generate GitHub PR URL for easy access if possible - github_url = "" - if "Closes #" in body: - pr_num = [line.split()[1].lstrip("#") for line in body_lines if "Closes #" in line][0] - github_url = "github.com/apache/spark/pull/%s" % pr_num 
- day = time.strptime(date.split()[0], "%Y-%m-%d") - if (day < SPARK_REPO_CHANGE_DATE1 or - (day < SPARK_REPO_CHANGE_DATE2 and pr_num < SPARK_REPO_PR_NUM_THRESH)): - github_url = "github.com/apache/incubator-spark/pull/%s" % pr_num - - append_to_changelist(" %s" % subject) - append_to_changelist(" %s" % ', '.join(authors)) - # for author in authors: - # append_to_changelist(" %s" % author) - append_to_changelist(" %s" % date) - if len(github_url) > 0: - append_to_changelist(" Commit: %s, %s" % (h, github_url)) - else: - append_to_changelist(" Commit: %s" % h) - append_to_changelist("") - -# Append old change list -print "Appending changelist from tag %s" % PREV_RELEASE_GIT_TAG -run_cmd("git show %s:%s | tail -n +3 >> %s" % (PREV_RELEASE_GIT_TAG, CHANGELIST, TMP_CHANGELIST)) -run_cmd("cp %s %s" % (TMP_CHANGELIST, NEW_CHANGELIST)) -print "New change list generated as %s" % NEW_CHANGELIST -cleanup(False) diff --git a/dev/_site/create-release/generate-contributors.py b/dev/_site/create-release/generate-contributors.py deleted file mode 100755 index db9c680a4bad3..0000000000000 --- a/dev/_site/create-release/generate-contributors.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# This script automates the process of creating release notes. - -import os -import re -import sys - -from releaseutils import * - -# You must set the following before use! -JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") -RELEASE_TAG = os.environ.get("RELEASE_TAG", "v1.2.0-rc2") -PREVIOUS_RELEASE_TAG = os.environ.get("PREVIOUS_RELEASE_TAG", "v1.1.0") - -# If the release tags are not provided, prompt the user to provide them -while not tag_exists(RELEASE_TAG): - RELEASE_TAG = raw_input("Please provide a valid release tag: ") -while not tag_exists(PREVIOUS_RELEASE_TAG): - print "Please specify the previous release tag." - PREVIOUS_RELEASE_TAG = raw_input(\ - "For instance, if you are releasing v1.2.0, you should specify v1.1.0: ") - -# Gather commits found in the new tag but not in the old tag. -# This filters commits based on both the git hash and the PR number. -# If either is present in the old tag, then we ignore the commit. -print "Gathering new commits between tags %s and %s" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG) -release_commits = get_commits(RELEASE_TAG) -previous_release_commits = get_commits(PREVIOUS_RELEASE_TAG) -previous_release_hashes = set() -previous_release_prs = set() -for old_commit in previous_release_commits: - previous_release_hashes.add(old_commit.get_hash()) - if old_commit.get_pr_number(): - previous_release_prs.add(old_commit.get_pr_number()) -new_commits = [] -for this_commit in release_commits: - this_hash = this_commit.get_hash() - this_pr_number = this_commit.get_pr_number() - if this_hash in previous_release_hashes: - continue - if this_pr_number and this_pr_number in previous_release_prs: - continue - new_commits.append(this_commit) -if not new_commits: - sys.exit("There are no new commits between %s and %s!" 
% (PREVIOUS_RELEASE_TAG, RELEASE_TAG)) - -# Prompt the user for confirmation that the commit range is correct -print "\n==================================================================================" -print "JIRA server: %s" % JIRA_API_BASE -print "Release tag: %s" % RELEASE_TAG -print "Previous release tag: %s" % PREVIOUS_RELEASE_TAG -print "Number of commits in this range: %s" % len(new_commits) -print -def print_indented(_list): - for x in _list: print " %s" % x -if yesOrNoPrompt("Show all commits?"): - print_indented(new_commits) -print "==================================================================================\n" -if not yesOrNoPrompt("Does this look correct?"): - sys.exit("Ok, exiting") - -# Filter out special commits -releases = [] -maintenance = [] -reverts = [] -nojiras = [] -filtered_commits = [] -def is_release(commit_title): - return re.findall("\[release\]", commit_title.lower()) or\ - "preparing spark release" in commit_title.lower() or\ - "preparing development version" in commit_title.lower() or\ - "CHANGES.txt" in commit_title -def is_maintenance(commit_title): - return "maintenance" in commit_title.lower() or\ - "manually close" in commit_title.lower() -def has_no_jira(commit_title): - return not re.findall("SPARK-[0-9]+", commit_title.upper()) -def is_revert(commit_title): - return "revert" in commit_title.lower() -def is_docs(commit_title): - return re.findall("docs*", commit_title.lower()) or\ - "programming guide" in commit_title.lower() -for c in new_commits: - t = c.get_title() - if not t: continue - elif is_release(t): releases.append(c) - elif is_maintenance(t): maintenance.append(c) - elif is_revert(t): reverts.append(c) - elif is_docs(t): filtered_commits.append(c) # docs may not have JIRA numbers - elif has_no_jira(t): nojiras.append(c) - else: filtered_commits.append(c) - -# Warn against ignored commits -if releases or maintenance or reverts or nojiras: - print 
"\n==================================================================================" - if releases: print "Found %d release commits" % len(releases) - if maintenance: print "Found %d maintenance commits" % len(maintenance) - if reverts: print "Found %d revert commits" % len(reverts) - if nojiras: print "Found %d commits with no JIRA" % len(nojiras) - print "* Warning: these commits will be ignored.\n" - if yesOrNoPrompt("Show ignored commits?"): - if releases: print "Release (%d)" % len(releases); print_indented(releases) - if maintenance: print "Maintenance (%d)" % len(maintenance); print_indented(maintenance) - if reverts: print "Revert (%d)" % len(reverts); print_indented(reverts) - if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras) - print "==================== Warning: the above commits will be ignored ==================\n" -prompt_msg = "%d commits left to process after filtering. Ok to proceed?" % len(filtered_commits) -if not yesOrNoPrompt(prompt_msg): - sys.exit("Ok, exiting.") - -# Keep track of warnings to tell the user at the end -warnings = [] - -# Mapping from the invalid author name to its associated JIRA issues -# E.g. 
andrewor14 -> set("SPARK-2413", "SPARK-3551", "SPARK-3471") -invalid_authors = {} - -# Populate a map that groups issues and components by author -# It takes the form: Author name -> { Contribution type -> Spark components } -# For instance, -# { -# 'Andrew Or': { -# 'bug fixes': ['windows', 'core', 'web ui'], -# 'improvements': ['core'] -# }, -# 'Tathagata Das' : { -# 'bug fixes': ['streaming'] -# 'new feature': ['streaming'] -# } -# } -# -author_info = {} -jira_options = { "server": JIRA_API_BASE } -jira_client = JIRA(options = jira_options) -print "\n=========================== Compiling contributor list ===========================" -for commit in filtered_commits: - _hash = commit.get_hash() - title = commit.get_title() - issues = re.findall("SPARK-[0-9]+", title.upper()) - author = commit.get_author() - date = get_date(_hash) - # If the author name is invalid, keep track of it along - # with all associated issues so we can translate it later - if is_valid_author(author): - author = capitalize_author(author) - else: - if author not in invalid_authors: - invalid_authors[author] = set() - for issue in issues: - invalid_authors[author].add(issue) - # Parse components from the commit title, if any - commit_components = find_components(title, _hash) - # Populate or merge an issue into author_info[author] - def populate(issue_type, components): - components = components or [CORE_COMPONENT] # assume core if no components provided - if author not in author_info: - author_info[author] = {} - if issue_type not in author_info[author]: - author_info[author][issue_type] = set() - for component in components: - author_info[author][issue_type].add(component) - # Find issues and components associated with this commit - for issue in issues: - try: - jira_issue = jira_client.issue(issue) - jira_type = jira_issue.fields.issuetype.name - jira_type = translate_issue_type(jira_type, issue, warnings) - jira_components = [translate_component(c.name, _hash, warnings)\ - for c in 
jira_issue.fields.components] - all_components = set(jira_components + commit_components) - populate(jira_type, all_components) - except Exception as e: - print "Unexpected error:", e - # For docs without an associated JIRA, manually add it ourselves - if is_docs(title) and not issues: - populate("documentation", commit_components) - print " Processed commit %s authored by %s on %s" % (_hash, author, date) -print "==================================================================================\n" - -# Write to contributors file ordered by author names -# Each line takes the format " * Author name -- semi-colon delimited contributions" -# e.g. * Andrew Or -- Bug fixes in Windows, Core, and Web UI; improvements in Core -# e.g. * Tathagata Das -- Bug fixes and new features in Streaming -contributors_file = open(contributors_file_name, "w") -authors = author_info.keys() -authors.sort() -for author in authors: - contribution = "" - components = set() - issue_types = set() - for issue_type, comps in author_info[author].items(): - components.update(comps) - issue_types.add(issue_type) - # If there is only one component, mention it only once - # e.g. Bug fixes, improvements in MLlib - if len(components) == 1: - contribution = "%s in %s" % (nice_join(issue_types), next(iter(components))) - # Otherwise, group contributions by issue types instead of modules - # e.g. Bug fixes in MLlib, Core, and Streaming; documentation in YARN - else: - contributions = ["%s in %s" % (issue_type, nice_join(comps)) \ - for issue_type, comps in author_info[author].items()] - contribution = "; ".join(contributions) - # Do not use python's capitalize() on the whole string to preserve case - assert contribution - contribution = contribution[0].capitalize() + contribution[1:] - # If the author name is invalid, use an intermediate format that - # can be translated through translate-contributors.py later - # E.g. 
andrewor14/SPARK-3425/SPARK-1157/SPARK-6672 - if author in invalid_authors and invalid_authors[author]: - author = author + "/" + "/".join(invalid_authors[author]) - #line = " * %s -- %s" % (author, contribution) - line = author - contributors_file.write(line + "\n") -contributors_file.close() -print "Contributors list is successfully written to %s!" % contributors_file_name - -# Prompt the user to translate author names if necessary -if invalid_authors: - warnings.append("Found the following invalid authors:") - for a in invalid_authors: - warnings.append("\t%s" % a) - warnings.append("Please run './translate-contributors.py' to translate them.") - -# Log any warnings encountered in the process -if warnings: - print "\n============ Warnings encountered while creating the contributor list ============" - for w in warnings: print w - print "Please correct these in the final contributors list at %s." % contributors_file_name - print "==================================================================================\n" - diff --git a/dev/_site/create-release/known_translations b/dev/_site/create-release/known_translations deleted file mode 100644 index 3563fe3cc3c03..0000000000000 --- a/dev/_site/create-release/known_translations +++ /dev/null @@ -1,167 +0,0 @@ -# This is a mapping of names to be translated through translate-contributors.py -# The format expected on each line should be: - -CodingCat - Nan Zhu -CrazyJvm - Chao Chen -EugenCepoi - Eugen Cepoi -GraceH - Jie Huang -JerryLead - Lijie Xu -Leolh - Liu Hao -Lewuathe - Kai Sasaki -RongGu - Rong Gu -Shiti - Shiti Saxena -Victsm - Min Shen -WangTaoTheTonic - Wang Tao -XuTingjun - Tingjun Xu -YanTangZhai - Yantang Zhai -alexdebrie - Alex DeBrie -alokito - Alok Saldanha -anantasty - Anant Asthana -andrewor14 - Andrew Or -aniketbhatnagar - Aniket Bhatnagar -arahuja - Arun Ahuja -brkyvz - Burak Yavuz -chesterxgchen - Chester Chen -chiragaggarwal - Chirag Aggarwal -chouqin - Qiping Li -cocoatomo - Tomohiko K. 
-coderfi - Fairiz Azizi -coderxiang - Shuo Xiang -davies - Davies Liu -epahomov - Egor Pahomov -falaki - Hossein Falaki -freeman-lab - Jeremy Freeman -industrial-sloth - Jascha Swisher -jackylk - Jacky Li -jayunit100 - Jay Vyas -jerryshao - Saisai Shao -jkbradley - Joseph Bradley -lianhuiwang - Lianhui Wang -lirui-intel - Rui Li -luluorta - Lu Lu -luogankun - Gankun Luo -maji2014 - Derek Ma -mccheah - Matthew Cheah -mengxr - Xiangrui Meng -nartz - Nathan Artz -odedz - Oded Zimerman -ravipesala - Ravindra Pesala -roxchkplusony - Victor Tso -scwf - Wang Fei -shimingfei - Shiming Fei -surq - Surong Quan -suyanNone - Su Yan -tedyu - Ted Yu -tigerquoll - Dale Richardson -wangxiaojing - Xiaojing Wang -watermen - Yadong Qi -witgo - Guoqiang Li -xinyunh - Xinyun Huang -zsxwing - Shixiong Zhu -Bilna - Bilna P -DoingDone9 - Doing Done -Earne - Ernest -FlytxtRnD - Meethu Mathew -GenTang - Gen TANG -JoshRosen - Josh Rosen -MechCoder - Manoj Kumar -OopsOutOfMemory - Sheng Li -Peishen-Jia - Peishen Jia -SaintBacchus - Huang Zhaowei -azagrebin - Andrey Zagrebin -bzz - Alexander Bezzubov -fjiang6 - Fan Jiang -gasparms - Gaspar Munoz -guowei2 - Guo Wei -hhbyyh - Yuhao Yang -hseagle - Peng Xu -javadba - Stephen Boesch -jbencook - Ben Cook -kul - Kuldeep -ligangty - Gang Li -marsishandsome - Liangliang Gu -medale - Markus Dale -nemccarthy - Nathan McCarthy -nxwhite-str - Nate Crosswhite -seayi - Xiaohua Yi -tianyi - Yi Tian -uncleGen - Uncle Gen -viper-kun - Xu Kun -x1- - Yuri Saito -zapletal-martin - Martin Zapletal -zuxqoj - Shekhar Bansal -mingyukim - Mingyu Kim -sigmoidanalytics - Mayur Rustagi -AiHe - Ai He -BenFradet - Ben Fradet -FavioVazquez - Favio Vazquez -JaysonSunshine - Jayson Sunshine -Liuchang0812 - Liu Chang -Sephiroth-Lin - Sephiroth Lin -dobashim - Masaru Dobashi -ehnalis - Zoltan Zvara -emres - Emre Sevinc -gchen - Guancheng Chen -haiyangsea - Haiyang Sea -hlin09 - Hao Lin -hqzizania - Qian Huang -jeanlyn - Jean Lyn -jerluc - Jeremy A. 
Lucas -jrabary - Jaonary Rabarisoa -judynash - Judy Nash -kaka1992 - Chen Song -ksonj - Kalle Jepsen -kuromatsu-nobuyuki - Nobuyuki Kuromatsu -lazyman500 - Dong Xu -leahmcguire - Leah McGuire -mbittmann - Mark Bittmann -mbonaci - Marko Bonaci -meawoppl - Matthew Goodman -nyaapa - Arsenii Krasikov -phatak-dev - Madhukara Phatak -prabeesh - Prabeesh K -rakeshchalasani - Rakesh Chalasani -rekhajoshm - Rekha Joshi -sisihj - June He -szheng79 - Shuai Zheng -texasmichelle - Michelle Casbon -vinodkc - Vinod KC -yongtang - Yong Tang -ypcat - Pei-Lun Lee -zhichao-li - Zhichao Li -zzcclp - Zhichao Zhang -979969786 - Yuming Wang -Rosstin - Rosstin Murphy -ameyc - Amey Chaugule -animeshbaranawal - Animesh Baranawal -cafreeman - Chris Freeman -lee19 - Lee -lockwobr - Brian Lockwood -navis - Navis Ryu -pparkkin - Paavo Parkkinen -HyukjinKwon - Hyukjin Kwon -JDrit - Joseph Batchik -JuhongPark - Juhong Park -KaiXinXiaoLei - KaiXinXIaoLei -NamelessAnalyst - NamelessAnalyst -alyaxey - Alex Slusarenko -baishuo - Shuo Bai -fe2s - Oleksiy Dyagilev -felixcheung - Felix Cheung -feynmanliang - Feynman Liang -josepablocam - Jose Cambronero -kai-zeng - Kai Zeng -mosessky - mosessky -msannell - Michael Sannella -nishkamravi2 - Nishkam Ravi -noel-smith - Noel Smith -petz2000 - Patrick Baier -qiansl127 - Shilei Qian -rahulpalamuttam - Rahul Palamuttam -rowan000 - Rowan Chattaway -sarutak - Kousuke Saruta -sethah - Seth Hendrickson -small-wang - Wang Wei -stanzhai - Stan Zhai -tien-dungle - Tien-Dung Le -xuchenCN - Xu Chen -zhangjiajin - Zhang JiaJin diff --git a/dev/_site/create-release/release-build.sh b/dev/_site/create-release/release-build.sh deleted file mode 100755 index cb79e9eba06e2..0000000000000 --- a/dev/_site/create-release/release-build.sh +++ /dev/null @@ -1,326 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -function exit_with_usage { - cat << EOF -usage: release-build.sh -Creates build deliverables from a Spark commit. - -Top level targets are - package: Create binary packages and copy them to people.apache - docs: Build docs and copy them to people.apache - publish-snapshot: Publish snapshot release to Apache snapshots - publish-release: Publish a release to Apache release repo - -All other inputs are environment variables - -GIT_REF - Release tag or commit to build from -SPARK_VERSION - Release identifier used when publishing -SPARK_PACKAGE_VERSION - Release identifier in top level package directory -REMOTE_PARENT_DIR - Parent in which to create doc or release builds. -REMOTE_PARENT_MAX_LENGTH - If set, parent directory will be cleaned to only - have this number of subdirectories (by deleting old ones). WARNING: This deletes data. 
- -ASF_USERNAME - Username of ASF committer account -ASF_PASSWORD - Password of ASF committer account -ASF_RSA_KEY - RSA private key file for ASF committer account - -GPG_KEY - GPG key used to sign release artifacts -GPG_PASSPHRASE - Passphrase for GPG key -EOF - exit 1 -} - -set -e - -if [ $# -eq 0 ]; then - exit_with_usage -fi - -if [[ $@ == *"help"* ]]; then - exit_with_usage -fi - -for env in ASF_USERNAME ASF_RSA_KEY GPG_PASSPHRASE GPG_KEY; do - if [ -z "${!env}" ]; then - echo "ERROR: $env must be set to run this script" - exit_with_usage - fi -done - -# Commit ref to checkout when building -GIT_REF=${GIT_REF:-master} - -# Destination directory parent on remote server -REMOTE_PARENT_DIR=${REMOTE_PARENT_DIR:-/home/$ASF_USERNAME/public_html} - -SSH="ssh -o ConnectTimeout=300 -o StrictHostKeyChecking=no -i $ASF_RSA_KEY" -GPG="gpg --no-tty --batch" -NEXUS_ROOT=https://repository.apache.org/service/local/staging -NEXUS_PROFILE=d63f592e7eac0 # Profile for Spark staging uploads -BASE_DIR=$(pwd) - -MVN="build/mvn --force" -PUBLISH_PROFILES="-Pyarn -Phive -Phadoop-2.2" -PUBLISH_PROFILES="$PUBLISH_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl" - -rm -rf spark -git clone https://git-wip-us.apache.org/repos/asf/spark.git -cd spark -git checkout $GIT_REF -git_hash=`git rev-parse --short HEAD` -echo "Checked out Spark git hash $git_hash" - -if [ -z "$SPARK_VERSION" ]; then - SPARK_VERSION=$($MVN help:evaluate -Dexpression=project.version \ - | grep -v INFO | grep -v WARNING | grep -v Download) -fi - -if [ -z "$SPARK_PACKAGE_VERSION" ]; then - SPARK_PACKAGE_VERSION="${SPARK_VERSION}-$(date +%Y_%m_%d_%H_%M)-${git_hash}" -fi - -DEST_DIR_NAME="spark-$SPARK_PACKAGE_VERSION" -USER_HOST="$ASF_USERNAME@people.apache.org" - -git clean -d -f -x -rm .gitignore -rm -rf .git -cd .. 
- -if [ -n "$REMOTE_PARENT_MAX_LENGTH" ]; then - old_dirs=$($SSH $USER_HOST ls -t $REMOTE_PARENT_DIR | tail -n +$REMOTE_PARENT_MAX_LENGTH) - for old_dir in $old_dirs; do - echo "Removing directory: $old_dir" - $SSH $USER_HOST rm -r $REMOTE_PARENT_DIR/$old_dir - done -fi - -if [[ "$1" == "package" ]]; then - # Source and binary tarballs - echo "Packaging release tarballs" - cp -r spark spark-$SPARK_VERSION - tar cvzf spark-$SPARK_VERSION.tgz spark-$SPARK_VERSION - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour --output spark-$SPARK_VERSION.tgz.asc \ - --detach-sig spark-$SPARK_VERSION.tgz - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md MD5 spark-$SPARK_VERSION.tgz > \ - spark-$SPARK_VERSION.tgz.md5 - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ - SHA512 spark-$SPARK_VERSION.tgz > spark-$SPARK_VERSION.tgz.sha - rm -rf spark-$SPARK_VERSION - - # Updated for each binary build - make_binary_release() { - NAME=$1 - FLAGS=$2 - ZINC_PORT=$3 - cp -r spark spark-$SPARK_VERSION-bin-$NAME - - cd spark-$SPARK_VERSION-bin-$NAME - - # TODO There should probably be a flag to make-distribution to allow 2.11 support - if [[ $FLAGS == *scala-2.11* ]]; then - ./dev/change-scala-version.sh 2.11 - fi - - export ZINC_PORT=$ZINC_PORT - echo "Creating distribution: $NAME ($FLAGS)" - - # Get maven home set by MVN - MVN_HOME=`$MVN -version 2>&1 | grep 'Maven home' | awk '{print $NF}'` - - ./make-distribution.sh --name $NAME --mvn $MVN_HOME/bin/mvn --tgz $FLAGS \ - -DzincPort=$ZINC_PORT 2>&1 > ../binary-release-$NAME.log - cd .. - cp spark-$SPARK_VERSION-bin-$NAME/spark-$SPARK_VERSION-bin-$NAME.tgz . 
- - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour \ - --output spark-$SPARK_VERSION-bin-$NAME.tgz.asc \ - --detach-sig spark-$SPARK_VERSION-bin-$NAME.tgz - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ - MD5 spark-$SPARK_VERSION-bin-$NAME.tgz > \ - spark-$SPARK_VERSION-bin-$NAME.tgz.md5 - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ - SHA512 spark-$SPARK_VERSION-bin-$NAME.tgz > \ - spark-$SPARK_VERSION-bin-$NAME.tgz.sha - } - - # TODO: Check exit codes of children here: - # http://stackoverflow.com/questions/1570262/shell-get-exit-code-of-background-process - - # We increment the Zinc port each time to avoid OOM's and other craziness if multiple builds - # share the same Zinc server. - make_binary_release "hadoop1" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver" "3030" & - make_binary_release "hadoop1-scala2.11" "-Psparkr -Phadoop-1 -Phive -Dscala-2.11" "3031" & - make_binary_release "cdh4" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver -Dhadoop.version=2.0.0-mr1-cdh4.2.0" "3032" & - make_binary_release "hadoop2.3" "-Psparkr -Phadoop-2.3 -Phive -Phive-thriftserver -Pyarn" "3033" & - make_binary_release "hadoop2.4" "-Psparkr -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn" "3034" & - make_binary_release "hadoop2.6" "-Psparkr -Phadoop-2.6 -Phive -Phive-thriftserver -Pyarn" "3034" & - make_binary_release "hadoop2.4-without-hive" "-Psparkr -Phadoop-2.4 -Pyarn" "3037" & - make_binary_release "without-hadoop" "-Psparkr -Phadoop-provided -Pyarn" "3038" & - wait - rm -rf spark-$SPARK_VERSION-bin-*/ - - # Copy data - dest_dir="$REMOTE_PARENT_DIR/${DEST_DIR_NAME}-bin" - echo "Copying release tarballs to $dest_dir" - $SSH $USER_HOST mkdir $dest_dir - rsync -e "$SSH" spark-* $USER_HOST:$dest_dir - echo "Linking /latest to $dest_dir" - $SSH $USER_HOST rm -f "$REMOTE_PARENT_DIR/latest" - $SSH $USER_HOST ln -s $dest_dir "$REMOTE_PARENT_DIR/latest" - exit 0 -fi - -if [[ "$1" == "docs" ]]; then - # Documentation - cd spark - echo "Building Spark 
docs" - dest_dir="$REMOTE_PARENT_DIR/${DEST_DIR_NAME}-docs" - cd docs - # Compile docs with Java 7 to use nicer format - # TODO: Make configurable to add this: PRODUCTION=1 - PRODUCTION=1 RELEASE_VERSION="$SPARK_VERSION" jekyll build - echo "Copying release documentation to $dest_dir" - $SSH $USER_HOST mkdir $dest_dir - echo "Linking /latest to $dest_dir" - $SSH $USER_HOST rm -f "$REMOTE_PARENT_DIR/latest" - $SSH $USER_HOST ln -s $dest_dir "$REMOTE_PARENT_DIR/latest" - rsync -e "$SSH" -r _site/* $USER_HOST:$dest_dir - cd .. - exit 0 -fi - -if [[ "$1" == "publish-snapshot" ]]; then - cd spark - # Publish Spark to Maven release repo - echo "Deploying Spark SNAPSHOT at '$GIT_REF' ($git_hash)" - echo "Publish version is $SPARK_VERSION" - if [[ ! $SPARK_VERSION == *"SNAPSHOT"* ]]; then - echo "ERROR: Snapshots must have a version containing SNAPSHOT" - echo "ERROR: You gave version '$SPARK_VERSION'" - exit 1 - fi - # Coerce the requested version - $MVN versions:set -DnewVersion=$SPARK_VERSION - tmp_settings="tmp-settings.xml" - echo "" > $tmp_settings - echo "apache.snapshots.https$ASF_USERNAME" >> $tmp_settings - echo "$ASF_PASSWORD" >> $tmp_settings - echo "" >> $tmp_settings - - # Generate random point for Zinc - export ZINC_PORT=$(python -S -c "import random; print random.randrange(3030,4030)") - - $MVN -DzincPort=$ZINC_PORT --settings $tmp_settings -DskipTests $PUBLISH_PROFILES \ - -Phive-thriftserver deploy - ./dev/change-scala-version.sh 2.11 - $MVN -DzincPort=$ZINC_PORT -Dscala-2.11 --settings $tmp_settings \ - -DskipTests $PUBLISH_PROFILES clean deploy - - # Clean-up Zinc nailgun process - /usr/sbin/lsof -P |grep $ZINC_PORT | grep LISTEN | awk '{ print $2; }' | xargs kill - - rm $tmp_settings - cd .. 
- exit 0 -fi - -if [[ "$1" == "publish-release" ]]; then - cd spark - # Publish Spark to Maven release repo - echo "Publishing Spark checkout at '$GIT_REF' ($git_hash)" - echo "Publish version is $SPARK_VERSION" - # Coerce the requested version - $MVN versions:set -DnewVersion=$SPARK_VERSION - - # Using Nexus API documented here: - # https://support.sonatype.com/entries/39720203-Uploading-to-a-Staging-Repository-via-REST-API - echo "Creating Nexus staging repository" - repo_request="Apache Spark $SPARK_VERSION (commit $git_hash)" - out=$(curl -X POST -d "$repo_request" -u $ASF_USERNAME:$ASF_PASSWORD \ - -H "Content-Type:application/xml" -v \ - $NEXUS_ROOT/profiles/$NEXUS_PROFILE/start) - staged_repo_id=$(echo $out | sed -e "s/.*\(orgapachespark-[0-9]\{4\}\).*/\1/") - echo "Created Nexus staging repository: $staged_repo_id" - - tmp_repo=$(mktemp -d spark-repo-XXXXX) - - # Generate random point for Zinc - export ZINC_PORT=$(python -S -c "import random; print random.randrange(3030,4030)") - - $MVN -DzincPort=$ZINC_PORT -Dmaven.repo.local=$tmp_repo -DskipTests $PUBLISH_PROFILES \ - -Phive-thriftserver clean install - - ./dev/change-scala-version.sh 2.11 - - $MVN -DzincPort=$ZINC_PORT -Dmaven.repo.local=$tmp_repo -Dscala-2.11 \ - -DskipTests $PUBLISH_PROFILES clean install - - # Clean-up Zinc nailgun process - /usr/sbin/lsof -P |grep $ZINC_PORT | grep LISTEN | awk '{ print $2; }' | xargs kill - - ./dev/change-version-to-2.10.sh - - pushd $tmp_repo/org/apache/spark - - # Remove any extra files generated during install - find . -type f |grep -v \.jar |grep -v \.pom | xargs rm - - echo "Creating hash and signature files" - for file in $(find . 
-type f) - do - echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --output $file.asc \ - --detach-sig --armour $file; - if [ $(command -v md5) ]; then - # Available on OS X; -q to keep only hash - md5 -q $file > $file.md5 - else - # Available on Linux; cut to keep only hash - md5sum $file | cut -f1 -d' ' > $file.md5 - fi - sha1sum $file | cut -f1 -d' ' > $file.sha1 - done - - nexus_upload=$NEXUS_ROOT/deployByRepositoryId/$staged_repo_id - echo "Uplading files to $nexus_upload" - for file in $(find . -type f) - do - # strip leading ./ - file_short=$(echo $file | sed -e "s/\.\///") - dest_url="$nexus_upload/org/apache/spark/$file_short" - echo " Uploading $file_short" - curl -u $ASF_USERNAME:$ASF_PASSWORD --upload-file $file_short $dest_url - done - - echo "Closing nexus staging repository" - repo_request="$staged_repo_idApache Spark $SPARK_VERSION (commit $git_hash)" - out=$(curl -X POST -d "$repo_request" -u $ASF_USERNAME:$ASF_PASSWORD \ - -H "Content-Type:application/xml" -v \ - $NEXUS_ROOT/profiles/$NEXUS_PROFILE/finish) - echo "Closed Nexus staging repository: $staged_repo_id" - popd - rm -rf $tmp_repo - cd .. - exit 0 -fi - -cd .. -rm -rf spark -echo "ERROR: expects to be called with 'package', 'docs', 'publish-release' or 'publish-snapshot'" diff --git a/dev/_site/create-release/release-tag.sh b/dev/_site/create-release/release-tag.sh deleted file mode 100755 index b0a3374becc6a..0000000000000 --- a/dev/_site/create-release/release-tag.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -function exit_with_usage { - cat << EOF -usage: tag-release.sh -Tags a Spark release on a particular branch. - -Inputs are specified with the following environment variables: -ASF_USERNAME - Apache Username -ASF_PASSWORD - Apache Password -GIT_NAME - Name to use with git -GIT_EMAIL - E-mail address to use with git -GIT_BRANCH - Git branch on which to make release -RELEASE_VERSION - Version used in pom files for release -RELEASE_TAG - Name of release tag -NEXT_VERSION - Development version after release -EOF - exit 1 -} - -set -e - -if [[ $@ == *"help"* ]]; then - exit_with_usage -fi - -for env in ASF_USERNAME ASF_PASSWORD RELEASE_VERSION RELEASE_TAG NEXT_VERSION GIT_EMAIL GIT_NAME GIT_BRANCH; do - if [ -z "${!env}" ]; then - echo "$env must be set to run this script" - exit 1 - fi -done - -ASF_SPARK_REPO="git-wip-us.apache.org/repos/asf/spark.git" -MVN="build/mvn --force" - -rm -rf spark -git clone https://$ASF_USERNAME:$ASF_PASSWORD@$ASF_SPARK_REPO -b $GIT_BRANCH -cd spark - -git config user.name "$GIT_NAME" -git config user.email $GIT_EMAIL - -# Create release version -$MVN versions:set -DnewVersion=$RELEASE_VERSION | grep -v "no value" # silence logs -git commit -a -m "Preparing Spark release $RELEASE_TAG" -echo "Creating tag $RELEASE_TAG at the head of $GIT_BRANCH" -git tag $RELEASE_TAG - -# TODO: It would be nice to do some verifications here -# i.e. 
check whether ec2 scripts have the new version - -# Create next version -$MVN versions:set -DnewVersion=$NEXT_VERSION | grep -v "no value" # silence logs -git commit -a -m "Preparing development version $NEXT_VERSION" - -# Push changes -git push origin $RELEASE_TAG -git push origin HEAD:$GIT_BRANCH - -cd .. -rm -rf spark diff --git a/dev/_site/create-release/releaseutils.py b/dev/_site/create-release/releaseutils.py deleted file mode 100755 index 7f152b7f53559..0000000000000 --- a/dev/_site/create-release/releaseutils.py +++ /dev/null @@ -1,260 +0,0 @@ -#!/usr/bin/env python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# This file contains helper methods used in creating a release. - -import re -import sys -from subprocess import Popen, PIPE - -try: - from jira.client import JIRA - # Old versions have JIRAError in exceptions package, new (0.5+) in utils. 
- try: - from jira.exceptions import JIRAError - except ImportError: - from jira.utils import JIRAError -except ImportError: - print "This tool requires the jira-python library" - print "Install using 'sudo pip install jira'" - sys.exit(-1) - -try: - from github import Github - from github import GithubException -except ImportError: - print "This tool requires the PyGithub library" - print "Install using 'sudo pip install PyGithub'" - sys.exit(-1) - -try: - import unidecode -except ImportError: - print "This tool requires the unidecode library to decode obscure github usernames" - print "Install using 'sudo pip install unidecode'" - sys.exit(-1) - -# Contributors list file name -contributors_file_name = "contributors.txt" - -# Prompt the user to answer yes or no until they do so -def yesOrNoPrompt(msg): - response = raw_input("%s [y/n]: " % msg) - while response != "y" and response != "n": - return yesOrNoPrompt(msg) - return response == "y" - -# Utility functions run git commands (written with Git 1.8.5) -def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0] -def run_cmd_error(cmd): return Popen(cmd, stdout=PIPE, stderr=PIPE).communicate()[1] -def get_date(commit_hash): - return run_cmd(["git", "show", "--quiet", "--pretty=format:%cd", commit_hash]) -def tag_exists(tag): - stderr = run_cmd_error(["git", "show", tag]) - return "error" not in stderr - -# A type-safe representation of a commit -class Commit: - def __init__(self, _hash, author, title, pr_number = None): - self._hash = _hash - self.author = author - self.title = title - self.pr_number = pr_number - def get_hash(self): return self._hash - def get_author(self): return self.author - def get_title(self): return self.title - def get_pr_number(self): return self.pr_number - def __str__(self): - closes_pr = "(Closes #%s)" % self.pr_number if self.pr_number else "" - return "%s %s %s %s" % (self._hash, self.author, self.title, closes_pr) - -# Return all commits that belong to the specified tag. 
-# -# Under the hood, this runs a `git log` on that tag and parses the fields -# from the command output to construct a list of Commit objects. Note that -# because certain fields reside in the commit description and cannot be parsed -# through the Github API itself, we need to do some intelligent regex parsing -# to extract those fields. -# -# This is written using Git 1.8.5. -def get_commits(tag): - commit_start_marker = "|=== COMMIT START MARKER ===|" - commit_end_marker = "|=== COMMIT END MARKER ===|" - field_end_marker = "|=== COMMIT FIELD END MARKER ===|" - log_format =\ - commit_start_marker + "%h" +\ - field_end_marker + "%an" +\ - field_end_marker + "%s" +\ - commit_end_marker + "%b" - output = run_cmd(["git", "log", "--quiet", "--pretty=format:" + log_format, tag]) - commits = [] - raw_commits = [c for c in output.split(commit_start_marker) if c] - for commit in raw_commits: - if commit.count(commit_end_marker) != 1: - print "Commit end marker not found in commit: " - for line in commit.split("\n"): print line - sys.exit(1) - # Separate commit digest from the body - # From the digest we extract the hash, author and the title - # From the body, we extract the PR number and the github username - [commit_digest, commit_body] = commit.split(commit_end_marker) - if commit_digest.count(field_end_marker) != 2: - sys.exit("Unexpected format in commit: %s" % commit_digest) - [_hash, author, title] = commit_digest.split(field_end_marker) - # The PR number and github username is in the commit message - # itself and cannot be accessed through any Github API - pr_number = None - match = re.search("Closes #([0-9]+) from ([^/\\s]+)/", commit_body) - if match: - [pr_number, github_username] = match.groups() - # If the author name is not valid, use the github - # username so we can translate it properly later - if not is_valid_author(author): - author = github_username - # Guard against special characters - author = unidecode.unidecode(unicode(author, "UTF-8")).strip() - 
commit = Commit(_hash, author, title, pr_number) - commits.append(commit) - return commits - -# Maintain a mapping for translating issue types to contributions in the release notes -# This serves an additional function of warning the user against unknown issue types -# Note: This list is partially derived from this link: -# https://issues.apache.org/jira/plugins/servlet/project-config/SPARK/issuetypes -# Keep these in lower case -known_issue_types = { - "bug": "bug fixes", - "build": "build fixes", - "dependency upgrade": "build fixes", - "improvement": "improvements", - "new feature": "new features", - "documentation": "documentation", - "test": "test", - "task": "improvement", - "sub-task": "improvement" -} - -# Maintain a mapping for translating component names when creating the release notes -# This serves an additional function of warning the user against unknown components -# Note: This list is largely derived from this link: -# https://issues.apache.org/jira/plugins/servlet/project-config/SPARK/components -CORE_COMPONENT = "Core" -known_components = { - "block manager": CORE_COMPONENT, - "build": CORE_COMPONENT, - "deploy": CORE_COMPONENT, - "documentation": CORE_COMPONENT, - "ec2": "EC2", - "examples": CORE_COMPONENT, - "graphx": "GraphX", - "input/output": CORE_COMPONENT, - "java api": "Java API", - "mesos": "Mesos", - "ml": "MLlib", - "mllib": "MLlib", - "project infra": "Project Infra", - "pyspark": "PySpark", - "shuffle": "Shuffle", - "spark core": CORE_COMPONENT, - "spark shell": CORE_COMPONENT, - "sql": "SQL", - "streaming": "Streaming", - "web ui": "Web UI", - "windows": "Windows", - "yarn": "YARN" -} - -# Translate issue types using a format appropriate for writing contributions -# If an unknown issue type is encountered, warn the user -def translate_issue_type(issue_type, issue_id, warnings): - issue_type = issue_type.lower() - if issue_type in known_issue_types: - return known_issue_types[issue_type] - else: - warnings.append("Unknown issue type 
\"%s\" (see %s)" % (issue_type, issue_id)) - return issue_type - -# Translate component names using a format appropriate for writing contributions -# If an unknown component is encountered, warn the user -def translate_component(component, commit_hash, warnings): - component = component.lower() - if component in known_components: - return known_components[component] - else: - warnings.append("Unknown component \"%s\" (see %s)" % (component, commit_hash)) - return component - -# Parse components in the commit message -# The returned components are already filtered and translated -def find_components(commit, commit_hash): - components = re.findall("\[\w*\]", commit.lower()) - components = [translate_component(c, commit_hash)\ - for c in components if c in known_components] - return components - -# Join a list of strings in a human-readable manner -# e.g. ["Juice"] -> "Juice" -# e.g. ["Juice", "baby"] -> "Juice and baby" -# e.g. ["Juice", "baby", "moon"] -> "Juice, baby, and moon" -def nice_join(str_list): - str_list = list(str_list) # sometimes it's a set - if not str_list: - return "" - elif len(str_list) == 1: - return next(iter(str_list)) - elif len(str_list) == 2: - return " and ".join(str_list) - else: - return ", ".join(str_list[:-1]) + ", and " + str_list[-1] - -# Return the full name of the specified user on Github -# If the user doesn't exist, return None -def get_github_name(author, github_client): - if github_client: - try: - return github_client.get_user(author).name - except GithubException as e: - # If this is not a "not found" exception - if e.status != 404: - raise e - return None - -# Return the full name of the specified user on JIRA -# If the user doesn't exist, return None -def get_jira_name(author, jira_client): - if jira_client: - try: - return jira_client.user(author).displayName - except JIRAError as e: - # If this is not a "not found" exception - if e.status_code != 404: - raise e - return None - -# Return whether the given name is in the 
form -def is_valid_author(author): - if not author: return False - return " " in author and not re.findall("[0-9]", author) - -# Capitalize the first letter of each word in the given author name -def capitalize_author(author): - if not author: return None - words = author.split(" ") - words = [w[0].capitalize() + w[1:] for w in words if w] - return " ".join(words) - diff --git a/dev/_site/create-release/translate-contributors.py b/dev/_site/create-release/translate-contributors.py deleted file mode 100755 index 86fa02d87b9a0..0000000000000 --- a/dev/_site/create-release/translate-contributors.py +++ /dev/null @@ -1,253 +0,0 @@ -#!/usr/bin/env python -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This script translates invalid authors in the contributors list generated -# by generate-contributors.py. When the script encounters an author name that -# is considered invalid, it searches Github and JIRA in an attempt to search -# for replacements. This tool runs in two modes: -# -# (1) Interactive mode: For each invalid author name, this script presents -# all candidate replacements to the user and awaits user response. In this -# mode, the user may also input a custom name. This is the default. 
-# -# (2) Non-interactive mode: For each invalid author name, this script replaces -# the name with the first valid candidate it can find. If there is none, it -# uses the original name. This can be enabled through the --non-interactive flag. - -import os -import sys - -from releaseutils import * - -# You must set the following before use! -JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") -JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None) -JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None) -GITHUB_API_TOKEN = os.environ.get("GITHUB_API_TOKEN", None) -if not JIRA_USERNAME or not JIRA_PASSWORD: - sys.exit("Both JIRA_USERNAME and JIRA_PASSWORD must be set") -if not GITHUB_API_TOKEN: - sys.exit("GITHUB_API_TOKEN must be set") - -# Write new contributors list to .final -if not os.path.isfile(contributors_file_name): - print "Contributors file %s does not exist!" % contributors_file_name - print "Have you run ./generate-contributors.py yet?" - sys.exit(1) -contributors_file = open(contributors_file_name, "r") -warnings = [] - -# In non-interactive mode, this script will choose the first replacement that is valid -INTERACTIVE_MODE = True -if len(sys.argv) > 1: - options = set(sys.argv[1:]) - if "--non-interactive" in options: - INTERACTIVE_MODE = False -if INTERACTIVE_MODE: - print "Running in interactive mode. To disable this, provide the --non-interactive flag." 
- -# Setup Github and JIRA clients -jira_options = { "server": JIRA_API_BASE } -jira_client = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD)) -github_client = Github(GITHUB_API_TOKEN) - -# Load known author translations that are cached locally -known_translations = {} -known_translations_file_name = "known_translations" -known_translations_file = open(known_translations_file_name, "r") -for line in known_translations_file: - if line.startswith("#"): continue - [old_name, new_name] = line.strip("\n").split(" - ") - known_translations[old_name] = new_name -known_translations_file.close() - -# Open again in case the user adds new mappings -known_translations_file = open(known_translations_file_name, "a") - -# Generate candidates for the given author. This should only be called if the given author -# name does not represent a full name as this operation is somewhat expensive. Under the -# hood, it makes several calls to the Github and JIRA API servers to find the candidates. -# -# This returns a list of (candidate name, source) 2-tuples. E.g. 
-# [ -# (NOT_FOUND, "No full name found for Github user andrewor14"), -# ("Andrew Or", "Full name of JIRA user andrewor14"), -# ("Andrew Orso", "Full name of SPARK-1444 assignee andrewor14"), -# ("Andrew Ordall", "Full name of SPARK-1663 assignee andrewor14"), -# (NOT_FOUND, "No assignee found for SPARK-1763") -# ] -NOT_FOUND = "Not found" -def generate_candidates(author, issues): - candidates = [] - # First check for full name of Github user - github_name = get_github_name(author, github_client) - if github_name: - candidates.append((github_name, "Full name of Github user %s" % author)) - else: - candidates.append((NOT_FOUND, "No full name found for Github user %s" % author)) - # Then do the same for JIRA user - jira_name = get_jira_name(author, jira_client) - if jira_name: - candidates.append((jira_name, "Full name of JIRA user %s" % author)) - else: - candidates.append((NOT_FOUND, "No full name found for JIRA user %s" % author)) - # Then do the same for the assignee of each of the associated JIRAs - # Note that a given issue may not have an assignee, or the assignee may not have a full name - for issue in issues: - try: - jira_issue = jira_client.issue(issue) - except JIRAError as e: - # Do not exit just because an issue is not found! - if e.status_code == 404: - warnings.append("Issue %s not found!" 
% issue) - continue - raise e - jira_assignee = jira_issue.fields.assignee - if jira_assignee: - user_name = jira_assignee.name - display_name = jira_assignee.displayName - if display_name: - candidates.append((display_name, "Full name of %s assignee %s" % (issue, user_name))) - else: - candidates.append((NOT_FOUND, "No full name found for %s assignee %" % (issue, user_name))) - else: - candidates.append((NOT_FOUND, "No assignee found for %s" % issue)) - # Guard against special characters in candidate names - # Note that the candidate name may already be in unicode (JIRA returns this) - for i, (candidate, source) in enumerate(candidates): - try: - candidate = unicode(candidate, "UTF-8") - except TypeError: - # already in unicode - pass - candidate = unidecode.unidecode(candidate).strip() - candidates[i] = (candidate, source) - return candidates - -# Translate each invalid author by searching for possible candidates from Github and JIRA -# In interactive mode, this script presents the user with a list of choices and have the user -# select from this list. Additionally, the user may also choose to enter a custom name. -# In non-interactive mode, this script picks the first valid author name from the candidates -# If no such name exists, the original name is used (without the JIRA numbers). 
-print "\n========================== Translating contributor list ==========================" -lines = contributors_file.readlines() -contributions = [] -for i, line in enumerate(lines): - temp_author = line.strip(" * ").split(" -- ")[0] - print "Processing author %s (%d/%d)" % (temp_author, i + 1, len(lines)) - if not temp_author: - error_msg = " ERROR: Expected the following format \" * -- \"\n" - error_msg += " ERROR: Actual = %s" % line - print error_msg - warnings.append(error_msg) - contributions.append(line) - continue - author = temp_author.split("/")[0] - # Use the local copy of known translations where possible - if author in known_translations: - line = line.replace(temp_author, known_translations[author]) - elif not is_valid_author(author): - new_author = author - issues = temp_author.split("/")[1:] - candidates = generate_candidates(author, issues) - # Print out potential replacement candidates along with the sources, e.g. - # [X] No full name found for Github user andrewor14 - # [X] No assignee found for SPARK-1763 - # [0] Andrew Or - Full name of JIRA user andrewor14 - # [1] Andrew Orso - Full name of SPARK-1444 assignee andrewor14 - # [2] Andrew Ordall - Full name of SPARK-1663 assignee andrewor14 - # [3] andrewor14 - Raw Github username - # [4] Custom - candidate_names = [] - bad_prompts = [] # Prompts that can't actually be selected; print these first. 
- good_prompts = [] # Prompts that contain valid choices - for candidate, source in candidates: - if candidate == NOT_FOUND: - bad_prompts.append(" [X] %s" % source) - else: - index = len(candidate_names) - candidate_names.append(candidate) - good_prompts.append(" [%d] %s - %s" % (index, candidate, source)) - raw_index = len(candidate_names) - custom_index = len(candidate_names) + 1 - for p in bad_prompts: print p - if bad_prompts: print " ---" - for p in good_prompts: print p - # In interactive mode, additionally provide "custom" option and await user response - if INTERACTIVE_MODE: - print " [%d] %s - Raw Github username" % (raw_index, author) - print " [%d] Custom" % custom_index - response = raw_input(" Your choice: ") - last_index = custom_index - while not response.isdigit() or int(response) > last_index: - response = raw_input(" Please enter an integer between 0 and %d: " % last_index) - response = int(response) - if response == custom_index: - new_author = raw_input(" Please type a custom name for this author: ") - elif response != raw_index: - new_author = candidate_names[response] - # In non-interactive mode, just pick the first candidate - else: - valid_candidate_names = [name for name, _ in candidates\ - if is_valid_author(name) and name != NOT_FOUND] - if valid_candidate_names: - new_author = valid_candidate_names[0] - # Finally, capitalize the author and replace the original one with it - # If the final replacement is still invalid, log a warning - if is_valid_author(new_author): - new_author = capitalize_author(new_author) - else: - warnings.append("Unable to find a valid name %s for author %s" % (author, temp_author)) - print " * Replacing %s with %s" % (author, new_author) - # If we are in interactive mode, prompt the user whether we want to remember this new mapping - if INTERACTIVE_MODE and\ - author not in known_translations and\ - yesOrNoPrompt(" Add mapping %s -> %s to known translations file?" 
% (author, new_author)): - known_translations_file.write("%s - %s\n" % (author, new_author)) - known_translations_file.flush() - line = line.replace(temp_author, author) - contributions.append(line) -print "==================================================================================\n" -contributors_file.close() -known_translations_file.close() - -# Sort the contributions before writing them to the new file. -# Additionally, check if there are any duplicate author rows. -# This could happen if the same user has both a valid full -# name (e.g. Andrew Or) and an invalid one (andrewor14). -# If so, warn the user about this at the end. -contributions.sort() -all_authors = set() -new_contributors_file_name = contributors_file_name + ".final" -new_contributors_file = open(new_contributors_file_name, "w") -for line in contributions: - author = line.strip(" * ").split(" -- ")[0] - if author in all_authors: - warnings.append("Detected duplicate author name %s. Please merge these manually." % author) - all_authors.add(author) - new_contributors_file.write(line) -new_contributors_file.close() - -print "Translated contributors list successfully written to %s!" % new_contributors_file_name - -# Log any warnings encountered in the process -if warnings: - print "\n========== Warnings encountered while translating the contributor list ===========" - for w in warnings: print w - print "Please manually correct these in the final contributors list at %s." % new_contributors_file_name - print "==================================================================================\n" - diff --git a/dev/_site/github_jira_sync.py b/dev/_site/github_jira_sync.py deleted file mode 100755 index 287f0ca24a7df..0000000000000 --- a/dev/_site/github_jira_sync.py +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/env python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Utility for updating JIRA's with information about Github pull requests - -import json -import os -import re -import sys -import urllib2 - -try: - import jira.client -except ImportError: - print "This tool requires the jira-python library" - print "Install using 'sudo pip install jira'" - sys.exit(-1) - -# User facing configs -GITHUB_API_BASE = os.environ.get("GITHUB_API_BASE", "https://api.github.com/repos/apache/spark") -JIRA_PROJECT_NAME = os.environ.get("JIRA_PROJECT_NAME", "SPARK") -JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") -JIRA_USERNAME = os.environ.get("JIRA_USERNAME", "apachespark") -JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", "XXX") -# Maximum number of updates to perform in one run -MAX_UPDATES = int(os.environ.get("MAX_UPDATES", "100000")) -# Cut-off for oldest PR on which to comment. Useful for avoiding -# "notification overload" when running for the first time. -MIN_COMMENT_PR = int(os.environ.get("MIN_COMMENT_PR", "1496")) - -# File used as an opitimization to store maximum previously seen PR -# Used mostly because accessing ASF JIRA is slow, so we want to avoid checking -# the state of JIRA's that are tied to PR's we've already looked at. 
-MAX_FILE = ".github-jira-max" - -def get_url(url): - try: - return urllib2.urlopen(url) - except urllib2.HTTPError as e: - print "Unable to fetch URL, exiting: %s" % url - sys.exit(-1) - -def get_json(urllib_response): - return json.load(urllib_response) - -# Return a list of (JIRA id, JSON dict) tuples: -# e.g. [('SPARK-1234', {.. json ..}), ('SPARK-5687', {.. json ..})} -def get_jira_prs(): - result = [] - has_next_page = True - page_num = 0 - while has_next_page: - page = get_url(GITHUB_API_BASE + "/pulls?page=%s&per_page=100" % page_num) - page_json = get_json(page) - - for pull in page_json: - jiras = re.findall(JIRA_PROJECT_NAME + "-[0-9]{4,5}", pull['title']) - for jira in jiras: - result = result + [(jira, pull)] - - # Check if there is another page - link_header = filter(lambda k: k.startswith("Link"), page.info().headers)[0] - if not "next"in link_header: - has_next_page = False - else: - page_num = page_num + 1 - return result - -def set_max_pr(max_val): - f = open(MAX_FILE, 'w') - f.write("%s" % max_val) - f.close() - print "Writing largest PR number seen: %s" % max_val - -def get_max_pr(): - if os.path.exists(MAX_FILE): - result = int(open(MAX_FILE, 'r').read()) - print "Read largest PR number previously seen: %s" % result - return result - else: - return 0 - -jira_client = jira.client.JIRA({'server': JIRA_API_BASE}, - basic_auth=(JIRA_USERNAME, JIRA_PASSWORD)) - -jira_prs = get_jira_prs() - -previous_max = get_max_pr() -print "Retrieved %s JIRA PR's from Github" % len(jira_prs) -jira_prs = [(k, v) for k, v in jira_prs if int(v['number']) > previous_max] -print "%s PR's remain after excluding visted ones" % len(jira_prs) - -num_updates = 0 -considered = [] -for issue, pr in sorted(jira_prs, key=lambda (k, v): int(v['number'])): - if num_updates >= MAX_UPDATES: - break - pr_num = int(pr['number']) - - print "Checking issue %s" % issue - considered = considered + [pr_num] - - url = pr['html_url'] - title = "[Github] Pull Request #%s (%s)" % 
(pr['number'], pr['user']['login']) - try: - existing_links = map(lambda l: l.raw['object']['url'], jira_client.remote_links(issue)) - except: - print "Failure reading JIRA %s (does it exist?)" % issue - print sys.exc_info()[0] - continue - - if url in existing_links: - continue - - icon = {"title": "Pull request #%s" % pr['number'], - "url16x16": "https://assets-cdn.github.com/favicon.ico"} - destination = {"title": title, "url": url, "icon": icon} - # For all possible fields see: - # https://developer.atlassian.com/display/JIRADEV/Fields+in+Remote+Issue+Links - # application = {"name": "Github pull requests", "type": "org.apache.spark.jira.github"} - jira_client.add_remote_link(issue, destination) - - comment = "User '%s' has created a pull request for this issue:" % pr['user']['login'] - comment = comment + ("\n%s" % pr['html_url']) - if pr_num >= MIN_COMMENT_PR: - jira_client.add_comment(issue, comment) - - print "Added link %s <-> PR #%s" % (issue, pr['number']) - num_updates = num_updates + 1 - -if len(considered) > 0: - set_max_pr(max(considered)) diff --git a/dev/_site/lint-python b/dev/_site/lint-python deleted file mode 100755 index 0b97213ae3dff..0000000000000 --- a/dev/_site/lint-python +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" -SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")" -PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/ ./dev/sparktestsupport" -PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/run-tests.py ./dev/run-tests-jenkins.py" -PEP8_REPORT_PATH="$SPARK_ROOT_DIR/dev/pep8-report.txt" -PYLINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/pylint-report.txt" -PYLINT_INSTALL_INFO="$SPARK_ROOT_DIR/dev/pylint-info.txt" - -cd "$SPARK_ROOT_DIR" - -# compileall: https://docs.python.org/2/library/compileall.html -python -B -m compileall -q -l $PATHS_TO_CHECK > "$PEP8_REPORT_PATH" -compile_status="${PIPESTATUS[0]}" - -# Get pep8 at runtime so that we don't rely on it being installed on the build server. -#+ See: https://github.com/apache/spark/pull/1744#issuecomment-50982162 -#+ TODOs: -#+ - Download pep8 from PyPI. It's more "official". -PEP8_VERSION="1.6.2" -PEP8_SCRIPT_PATH="$SPARK_ROOT_DIR/dev/pep8-$PEP8_VERSION.py" -PEP8_SCRIPT_REMOTE_PATH="https://raw.githubusercontent.com/jcrocholl/pep8/$PEP8_VERSION/pep8.py" - -if [ ! -e "$PEP8_SCRIPT_PATH" ]; then - curl --silent -o "$PEP8_SCRIPT_PATH" "$PEP8_SCRIPT_REMOTE_PATH" - curl_status="$?" - - if [ "$curl_status" -ne 0 ]; then - echo "Failed to download pep8.py from \"$PEP8_SCRIPT_REMOTE_PATH\"." - exit "$curl_status" - fi -fi - -# Easy install pylint in /dev/pylint. To easy_install into a directory, the PYTHONPATH should -# be set to the directory. -# dev/pylint should be appended to the PATH variable as well. -# Jenkins by default installs the pylint3 version, so for now this just checks the code quality -# of python3. -export "PYTHONPATH=$SPARK_ROOT_DIR/dev/pylint" -export "PYLINT_HOME=$PYTHONPATH" -export "PATH=$PYTHONPATH:$PATH" - -# if [ ! -d "$PYLINT_HOME" ]; then -# mkdir "$PYLINT_HOME" -# # Redirect the annoying pylint installation output. 
-# easy_install -d "$PYLINT_HOME" pylint==1.4.4 &>> "$PYLINT_INSTALL_INFO" -# easy_install_status="$?" -# -# if [ "$easy_install_status" -ne 0 ]; then -# echo "Unable to install pylint locally in \"$PYTHONPATH\"." -# cat "$PYLINT_INSTALL_INFO" -# exit "$easy_install_status" -# fi -# -# rm "$PYLINT_INSTALL_INFO" -# -# fi - -# There is no need to write this output to a file -#+ first, but we do so so that the check status can -#+ be output before the report, like with the -#+ scalastyle and RAT checks. -python "$PEP8_SCRIPT_PATH" --ignore=E402,E731,E241,W503,E226 $PATHS_TO_CHECK >> "$PEP8_REPORT_PATH" -pep8_status="${PIPESTATUS[0]}" - -if [ "$compile_status" -eq 0 -a "$pep8_status" -eq 0 ]; then - lint_status=0 -else - lint_status=1 -fi - -if [ "$lint_status" -ne 0 ]; then - echo "PEP8 checks failed." - cat "$PEP8_REPORT_PATH" -else - echo "PEP8 checks passed." -fi - -rm "$PEP8_REPORT_PATH" - -# for to_be_checked in "$PATHS_TO_CHECK" -# do -# pylint --rcfile="$SPARK_ROOT_DIR/pylintrc" $to_be_checked >> "$PYLINT_REPORT_PATH" -# done - -# if [ "${PIPESTATUS[0]}" -ne 0 ]; then -# lint_status=1 -# echo "Pylint checks failed." -# cat "$PYLINT_REPORT_PATH" -# else -# echo "Pylint checks passed." -# fi - -# rm "$PYLINT_REPORT_PATH" - -exit "$lint_status" diff --git a/dev/_site/lint-r b/dev/_site/lint-r deleted file mode 100755 index bfda0bca15eb7..0000000000000 --- a/dev/_site/lint-r +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" -SPARK_ROOT_DIR="$(dirname $SCRIPT_DIR)" -LINT_R_REPORT_FILE_NAME="$SPARK_ROOT_DIR/dev/lint-r-report.log" - - -if ! type "Rscript" > /dev/null; then - echo "ERROR: You should install R" - exit -fi - -`which Rscript` --vanilla "$SPARK_ROOT_DIR/dev/lint-r.R" "$SPARK_ROOT_DIR" | tee "$LINT_R_REPORT_FILE_NAME" - -NUM_LINES=`wc -l < "$LINT_R_REPORT_FILE_NAME" | awk '{print $1}'` -if [ "$NUM_LINES" = "0" ] ; then - lint_status=0 - echo "lintr checks passed." -else - lint_status=1 - echo "lintr checks failed." -fi - -exit "$lint_status" diff --git a/dev/_site/lint-r.R b/dev/_site/lint-r.R deleted file mode 100644 index 999eef571b824..0000000000000 --- a/dev/_site/lint-r.R +++ /dev/null @@ -1,37 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -argv <- commandArgs(TRUE) -SPARK_ROOT_DIR <- as.character(argv[1]) -LOCAL_LIB_LOC <- file.path(SPARK_ROOT_DIR, "R", "lib") - -# Checks if SparkR is installed in a local directory. -if (! library(SparkR, lib.loc = LOCAL_LIB_LOC, logical.return = TRUE)) { - stop("You should install SparkR in a local directory with `R/install-dev.sh`.") -} - -# Installs lintr from Github in a local directory. -# NOTE: The CRAN's version is too old to adapt to our rules. -if ("lintr" %in% row.names(installed.packages()) == FALSE) { - devtools::install_github("jimhester/lintr") -} - -library(lintr) -library(methods) -library(testthat) -path.to.package <- file.path(SPARK_ROOT_DIR, "R", "pkg") -lint_package(path.to.package, cache = FALSE) diff --git a/dev/_site/lint-scala b/dev/_site/lint-scala deleted file mode 100755 index c676dfdf4f44e..0000000000000 --- a/dev/_site/lint-scala +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" -SPARK_ROOT_DIR="$(dirname $SCRIPT_DIR)" - -"$SCRIPT_DIR/scalastyle" diff --git a/dev/_site/merge_spark_pr.py b/dev/_site/merge_spark_pr.py deleted file mode 100755 index bf1a000f46791..0000000000000 --- a/dev/_site/merge_spark_pr.py +++ /dev/null @@ -1,453 +0,0 @@ -#!/usr/bin/env python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Utility for creating well-formed pull request merges and pushing them to Apache. -# usage: ./apache-pr-merge.py (see config env vars below) -# -# This utility assumes you already have local a Spark git folder and that you -# have added remotes corresponding to both (i) the github apache Spark -# mirror and (ii) the apache git repo. 
- -import json -import os -import re -import subprocess -import sys -import urllib2 - -try: - import jira.client - JIRA_IMPORTED = True -except ImportError: - JIRA_IMPORTED = False - -# Location of your Spark git development area -SPARK_HOME = os.environ.get("SPARK_HOME", os.getcwd()) -# Remote name which points to the Gihub site -PR_REMOTE_NAME = os.environ.get("PR_REMOTE_NAME", "apache-github") -# Remote name which points to Apache git -PUSH_REMOTE_NAME = os.environ.get("PUSH_REMOTE_NAME", "apache") -# ASF JIRA username -JIRA_USERNAME = os.environ.get("JIRA_USERNAME", "") -# ASF JIRA password -JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", "") -# OAuth key used for issuing requests against the GitHub API. If this is not defined, then requests -# will be unauthenticated. You should only need to configure this if you find yourself regularly -# exceeding your IP's unauthenticated request rate limit. You can create an OAuth key at -# https://github.com/settings/tokens. This script only requires the "public_repo" scope. -GITHUB_OAUTH_KEY = os.environ.get("GITHUB_OAUTH_KEY") - - -GITHUB_BASE = "https://github.com/apache/spark/pull" -GITHUB_API_BASE = "https://api.github.com/repos/apache/spark" -JIRA_BASE = "https://issues.apache.org/jira/browse" -JIRA_API_BASE = "https://issues.apache.org/jira" -# Prefix added to temporary branches -BRANCH_PREFIX = "PR_TOOL" - - -def get_json(url): - try: - request = urllib2.Request(url) - if GITHUB_OAUTH_KEY: - request.add_header('Authorization', 'token %s' % GITHUB_OAUTH_KEY) - return json.load(urllib2.urlopen(request)) - except urllib2.HTTPError as e: - if "X-RateLimit-Remaining" in e.headers and e.headers["X-RateLimit-Remaining"] == '0': - print "Exceeded the GitHub API rate limit; see the instructions in " + \ - "dev/merge_spark_pr.py to configure an OAuth token for making authenticated " + \ - "GitHub requests." 
- else: - print "Unable to fetch URL, exiting: %s" % url - sys.exit(-1) - - -def fail(msg): - print msg - clean_up() - sys.exit(-1) - - -def run_cmd(cmd): - print cmd - if isinstance(cmd, list): - return subprocess.check_output(cmd) - else: - return subprocess.check_output(cmd.split(" ")) - - -def continue_maybe(prompt): - result = raw_input("\n%s (y/n): " % prompt) - if result.lower() != "y": - fail("Okay, exiting") - -def clean_up(): - print "Restoring head pointer to %s" % original_head - run_cmd("git checkout %s" % original_head) - - branches = run_cmd("git branch").replace(" ", "").split("\n") - - for branch in filter(lambda x: x.startswith(BRANCH_PREFIX), branches): - print "Deleting local branch %s" % branch - run_cmd("git branch -D %s" % branch) - - -# merge the requested PR and return the merge hash -def merge_pr(pr_num, target_ref, title, body, pr_repo_desc): - pr_branch_name = "%s_MERGE_PR_%s" % (BRANCH_PREFIX, pr_num) - target_branch_name = "%s_MERGE_PR_%s_%s" % (BRANCH_PREFIX, pr_num, target_ref.upper()) - run_cmd("git fetch %s pull/%s/head:%s" % (PR_REMOTE_NAME, pr_num, pr_branch_name)) - run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, target_ref, target_branch_name)) - run_cmd("git checkout %s" % target_branch_name) - - had_conflicts = False - try: - run_cmd(['git', 'merge', pr_branch_name, '--squash']) - except Exception as e: - msg = "Error merging: %s\nWould you like to manually fix-up this merge?" % e - continue_maybe(msg) - msg = "Okay, please fix any conflicts and 'git add' conflicting files... Finished?" 
- continue_maybe(msg) - had_conflicts = True - - commit_authors = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, - '--pretty=format:%an <%ae>']).split("\n") - distinct_authors = sorted(set(commit_authors), - key=lambda x: commit_authors.count(x), reverse=True) - primary_author = raw_input( - "Enter primary author in the format of \"name \" [%s]: " % - distinct_authors[0]) - if primary_author == "": - primary_author = distinct_authors[0] - - commits = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, - '--pretty=format:%h [%an] %s']).split("\n\n") - - merge_message_flags = [] - - merge_message_flags += ["-m", title] - if body is not None: - # We remove @ symbols from the body to avoid triggering e-mails - # to people every time someone creates a public fork of Spark. - merge_message_flags += ["-m", body.replace("@", "")] - - authors = "\n".join(["Author: %s" % a for a in distinct_authors]) - - merge_message_flags += ["-m", authors] - - if had_conflicts: - committer_name = run_cmd("git config --get user.name").strip() - committer_email = run_cmd("git config --get user.email").strip() - message = "This patch had conflicts when merged, resolved by\nCommitter: %s <%s>" % ( - committer_name, committer_email) - merge_message_flags += ["-m", message] - - # The string "Closes #%s" string is required for GitHub to correctly close the PR - merge_message_flags += ["-m", "Closes #%s from %s." % (pr_num, pr_repo_desc)] - - run_cmd(['git', 'commit', '--author="%s"' % primary_author] + merge_message_flags) - - continue_maybe("Merge complete (local ref %s). Push to %s?" % ( - target_branch_name, PUSH_REMOTE_NAME)) - - try: - run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, target_branch_name, target_ref)) - except Exception as e: - clean_up() - fail("Exception while pushing: %s" % e) - - merge_hash = run_cmd("git rev-parse %s" % target_branch_name)[:8] - clean_up() - print("Pull request #%s merged!" 
% pr_num) - print("Merge hash: %s" % merge_hash) - return merge_hash - - -def cherry_pick(pr_num, merge_hash, default_branch): - pick_ref = raw_input("Enter a branch name [%s]: " % default_branch) - if pick_ref == "": - pick_ref = default_branch - - pick_branch_name = "%s_PICK_PR_%s_%s" % (BRANCH_PREFIX, pr_num, pick_ref.upper()) - - run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, pick_ref, pick_branch_name)) - run_cmd("git checkout %s" % pick_branch_name) - - try: - run_cmd("git cherry-pick -sx %s" % merge_hash) - except Exception as e: - msg = "Error cherry-picking: %s\nWould you like to manually fix-up this merge?" % e - continue_maybe(msg) - msg = "Okay, please fix any conflicts and finish the cherry-pick. Finished?" - continue_maybe(msg) - - continue_maybe("Pick complete (local ref %s). Push to %s?" % ( - pick_branch_name, PUSH_REMOTE_NAME)) - - try: - run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, pick_branch_name, pick_ref)) - except Exception as e: - clean_up() - fail("Exception while pushing: %s" % e) - - pick_hash = run_cmd("git rev-parse %s" % pick_branch_name)[:8] - clean_up() - - print("Pull request #%s picked into %s!" 
% (pr_num, pick_ref)) - print("Pick hash: %s" % pick_hash) - return pick_ref - - -def fix_version_from_branch(branch, versions): - # Note: Assumes this is a sorted (newest->oldest) list of un-released versions - if branch == "master": - return versions[0] - else: - branch_ver = branch.replace("branch-", "") - return filter(lambda x: x.name.startswith(branch_ver), versions)[-1] - - -def resolve_jira_issue(merge_branches, comment, default_jira_id=""): - asf_jira = jira.client.JIRA({'server': JIRA_API_BASE}, - basic_auth=(JIRA_USERNAME, JIRA_PASSWORD)) - - jira_id = raw_input("Enter a JIRA id [%s]: " % default_jira_id) - if jira_id == "": - jira_id = default_jira_id - - try: - issue = asf_jira.issue(jira_id) - except Exception as e: - fail("ASF JIRA could not find %s\n%s" % (jira_id, e)) - - cur_status = issue.fields.status.name - cur_summary = issue.fields.summary - cur_assignee = issue.fields.assignee - if cur_assignee is None: - cur_assignee = "NOT ASSIGNED!!!" - else: - cur_assignee = cur_assignee.displayName - - if cur_status == "Resolved" or cur_status == "Closed": - fail("JIRA issue %s already has status '%s'" % (jira_id, cur_status)) - print ("=== JIRA %s ===" % jira_id) - print ("summary\t\t%s\nassignee\t%s\nstatus\t\t%s\nurl\t\t%s/%s\n" % ( - cur_summary, cur_assignee, cur_status, JIRA_BASE, jira_id)) - - versions = asf_jira.project_versions("SPARK") - versions = sorted(versions, key=lambda x: x.name, reverse=True) - versions = filter(lambda x: x.raw['released'] is False, versions) - # Consider only x.y.z versions - versions = filter(lambda x: re.match('\d+\.\d+\.\d+', x.name), versions) - - default_fix_versions = map(lambda x: fix_version_from_branch(x, versions).name, merge_branches) - for v in default_fix_versions: - # Handles the case where we have forked a release branch but not yet made the release. - # In this case, if the PR is committed to the master branch and the release branch, we - # only consider the release branch to be the fix version. E.g. 
it is not valid to have - # both 1.1.0 and 1.0.0 as fix versions. - (major, minor, patch) = v.split(".") - if patch == "0": - previous = "%s.%s.%s" % (major, int(minor) - 1, 0) - if previous in default_fix_versions: - default_fix_versions = filter(lambda x: x != v, default_fix_versions) - default_fix_versions = ",".join(default_fix_versions) - - fix_versions = raw_input("Enter comma-separated fix version(s) [%s]: " % default_fix_versions) - if fix_versions == "": - fix_versions = default_fix_versions - fix_versions = fix_versions.replace(" ", "").split(",") - - def get_version_json(version_str): - return filter(lambda v: v.name == version_str, versions)[0].raw - - jira_fix_versions = map(lambda v: get_version_json(v), fix_versions) - - resolve = filter(lambda a: a['name'] == "Resolve Issue", asf_jira.transitions(jira_id))[0] - resolution = filter(lambda r: r.raw['name'] == "Fixed", asf_jira.resolutions())[0] - asf_jira.transition_issue( - jira_id, resolve["id"], fixVersions = jira_fix_versions, - comment = comment, resolution = {'id': resolution.raw['id']}) - - print "Successfully resolved %s with fixVersions=%s!" % (jira_id, fix_versions) - - -def resolve_jira_issues(title, merge_branches, comment): - jira_ids = re.findall("SPARK-[0-9]{4,5}", title) - - if len(jira_ids) == 0: - resolve_jira_issue(merge_branches, comment) - for jira_id in jira_ids: - resolve_jira_issue(merge_branches, comment, jira_id) - - -def standardize_jira_ref(text): - """ - Standardize the [SPARK-XXXXX] [MODULE] prefix - Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. 
Issue" or "SPARK XXX [MLLIB]: Issue" to "[SPARK-XXX][MLLIB] Issue" - - >>> standardize_jira_ref("[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete is successful") - '[SPARK-5821][SQL] ParquetRelation2 CTAS should check if delete is successful' - >>> standardize_jira_ref("[SPARK-4123][Project Infra][WIP]: Show new dependencies added in pull requests") - '[SPARK-4123][PROJECT INFRA][WIP] Show new dependencies added in pull requests' - >>> standardize_jira_ref("[MLlib] Spark 5954: Top by key") - '[SPARK-5954][MLLIB] Top by key' - >>> standardize_jira_ref("[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl") - '[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl' - >>> standardize_jira_ref("SPARK-1094 Support MiMa for reporting binary compatibility accross versions.") - '[SPARK-1094] Support MiMa for reporting binary compatibility accross versions.' - >>> standardize_jira_ref("[WIP] [SPARK-1146] Vagrant support for Spark") - '[SPARK-1146][WIP] Vagrant support for Spark' - >>> standardize_jira_ref("SPARK-1032. If Yarn app fails before registering, app master stays aroun...") - '[SPARK-1032] If Yarn app fails before registering, app master stays aroun...' - >>> standardize_jira_ref("[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.") - '[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.' 
- >>> standardize_jira_ref("Additional information for users building from source code") - 'Additional information for users building from source code' - """ - jira_refs = [] - components = [] - - # If the string is compliant, no need to process any further - if (re.search(r'^\[SPARK-[0-9]{3,6}\](\[[A-Z0-9_\s,]+\] )+\S+', text)): - return text - - # Extract JIRA ref(s): - pattern = re.compile(r'(SPARK[-\s]*[0-9]{3,6})+', re.IGNORECASE) - for ref in pattern.findall(text): - # Add brackets, replace spaces with a dash, & convert to uppercase - jira_refs.append('[' + re.sub(r'\s+', '-', ref.upper()) + ']') - text = text.replace(ref, '') - - # Extract spark component(s): - # Look for alphanumeric chars, spaces, dashes, periods, and/or commas - pattern = re.compile(r'(\[[\w\s,-\.]+\])', re.IGNORECASE) - for component in pattern.findall(text): - components.append(component.upper()) - text = text.replace(component, '') - - # Cleanup any remaining symbols: - pattern = re.compile(r'^\W+(.*)', re.IGNORECASE) - if (pattern.search(text) is not None): - text = pattern.search(text).groups()[0] - - # Assemble full text (JIRA ref(s), module(s), remaining text) - clean_text = ''.join(jira_refs).strip() + ''.join(components).strip() + " " + text.strip() - - # Replace multiple spaces with a single space, e.g. if no jira refs and/or components were included - clean_text = re.sub(r'\s+', ' ', clean_text.strip()) - - return clean_text - -def main(): - global original_head - - os.chdir(SPARK_HOME) - original_head = run_cmd("git rev-parse HEAD")[:8] - - branches = get_json("%s/branches" % GITHUB_API_BASE) - branch_names = filter(lambda x: x.startswith("branch-"), [x['name'] for x in branches]) - # Assumes branch names can be sorted lexicographically - latest_branch = sorted(branch_names, reverse=True)[0] - - pr_num = raw_input("Which pull request would you like to merge? (e.g. 
34): ") - pr = get_json("%s/pulls/%s" % (GITHUB_API_BASE, pr_num)) - pr_events = get_json("%s/issues/%s/events" % (GITHUB_API_BASE, pr_num)) - - url = pr["url"] - - # Decide whether to use the modified title or not - modified_title = standardize_jira_ref(pr["title"]) - if modified_title != pr["title"]: - print "I've re-written the title as follows to match the standard format:" - print "Original: %s" % pr["title"] - print "Modified: %s" % modified_title - result = raw_input("Would you like to use the modified title? (y/n): ") - if result.lower() == "y": - title = modified_title - print "Using modified title:" - else: - title = pr["title"] - print "Using original title:" - print title - else: - title = pr["title"] - - body = pr["body"] - target_ref = pr["base"]["ref"] - user_login = pr["user"]["login"] - base_ref = pr["head"]["ref"] - pr_repo_desc = "%s/%s" % (user_login, base_ref) - - # Merged pull requests don't appear as merged in the GitHub API; - # Instead, they're closed by asfgit. - merge_commits = \ - [e for e in pr_events if e["actor"]["login"] == "asfgit" and e["event"] == "closed"] - - if merge_commits: - merge_hash = merge_commits[0]["commit_id"] - message = get_json("%s/commits/%s" % (GITHUB_API_BASE, merge_hash))["commit"]["message"] - - print "Pull request %s has already been merged, assuming you want to backport" % pr_num - commit_is_downloaded = run_cmd(['git', 'rev-parse', '--quiet', '--verify', - "%s^{commit}" % merge_hash]).strip() != "" - if not commit_is_downloaded: - fail("Couldn't find any merge commit for #%s, you may need to update HEAD." % pr_num) - - print "Found commit %s:\n%s" % (merge_hash, message) - cherry_pick(pr_num, merge_hash, latest_branch) - sys.exit(0) - - if not bool(pr["mergeable"]): - msg = "Pull request %s is not mergeable in its current form.\n" % pr_num + \ - "Continue? 
(experts only!)" - continue_maybe(msg) - - print ("\n=== Pull Request #%s ===" % pr_num) - print ("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" % ( - title, pr_repo_desc, target_ref, url)) - continue_maybe("Proceed with merging pull request #%s?" % pr_num) - - merged_refs = [target_ref] - - merge_hash = merge_pr(pr_num, target_ref, title, body, pr_repo_desc) - - pick_prompt = "Would you like to pick %s into another branch?" % merge_hash - while raw_input("\n%s (y/n): " % pick_prompt).lower() == "y": - merged_refs = merged_refs + [cherry_pick(pr_num, merge_hash, latest_branch)] - - if JIRA_IMPORTED: - if JIRA_USERNAME and JIRA_PASSWORD: - continue_maybe("Would you like to update an associated JIRA?") - jira_comment = "Issue resolved by pull request %s\n[%s/%s]" % (pr_num, GITHUB_BASE, pr_num) - resolve_jira_issues(title, merged_refs, jira_comment) - else: - print "JIRA_USERNAME and JIRA_PASSWORD not set" - print "Exiting without trying to close the associated JIRA." - else: - print "Could not find jira-python library. Run 'sudo pip install jira' to install." - print "Exiting without trying to close the associated JIRA." - -if __name__ == "__main__": - import doctest - (failure_count, test_count) = doctest.testmod() - if failure_count: - exit(-1) - - main() diff --git a/dev/_site/mima b/dev/_site/mima deleted file mode 100755 index 2952fa65d42ff..0000000000000 --- a/dev/_site/mima +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -set -o pipefail -set -e - -# Go to the Spark project root directory -FWDIR="$(cd "`dirname "$0"`"/..; pwd)" -cd "$FWDIR" - -echo -e "q\n" | build/sbt oldDeps/update -rm -f .generated-mima* - -generate_mima_ignore() { - SPARK_JAVA_OPTS="-XX:MaxPermSize=1g -Xmx2g" \ - ./bin/spark-class org.apache.spark.tools.GenerateMIMAIgnore -} - -# Generate Mima Ignore is called twice, first with latest built jars -# on the classpath and then again with previous version jars on the classpath. -# Because of a bug in GenerateMIMAIgnore that when old jars are ahead on classpath -# it did not process the new classes (which are in assembly jar). -generate_mima_ignore - -export SPARK_CLASSPATH="`find lib_managed \( -name '*spark*jar' -a -type f \) | tr "\\n" ":"`" -echo "SPARK_CLASSPATH=$SPARK_CLASSPATH" - -generate_mima_ignore - -echo -e "q\n" | build/sbt mima-report-binary-issues | grep -v -e "info.*Resolving" -ret_val=$? - -if [ $ret_val != 0 ]; then - echo "NOTE: Exceptions to binary compatibility can be added in project/MimaExcludes.scala" -fi - -rm -f .generated-mima* -exit $ret_val diff --git a/dev/_site/run-tests b/dev/_site/run-tests deleted file mode 100755 index 257d1e8d50bb4..0000000000000 --- a/dev/_site/run-tests +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -FWDIR="$(cd "`dirname $0`"/..; pwd)" -cd "$FWDIR" - -exec python -u ./dev/run-tests.py "$@" diff --git a/dev/_site/run-tests-jenkins b/dev/_site/run-tests-jenkins deleted file mode 100755 index e79accf9e987a..0000000000000 --- a/dev/_site/run-tests-jenkins +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Wrapper script that runs the Spark tests then reports QA results -# to github via its API. 
-# Environment variables are populated by the code here: -#+ https://github.com/jenkinsci/ghprb-plugin/blob/master/src/main/java/org/jenkinsci/plugins/ghprb/GhprbTrigger.java#L139 - -FWDIR="$(cd "`dirname $0`"/..; pwd)" -cd "$FWDIR" - -exec python -u ./dev/run-tests-jenkins.py "$@" diff --git a/dev/_site/run-tests-jenkins.py b/dev/_site/run-tests-jenkins.py deleted file mode 100755 index 623004310e189..0000000000000 --- a/dev/_site/run-tests-jenkins.py +++ /dev/null @@ -1,228 +0,0 @@ -#!/usr/bin/env python2 - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from __future__ import print_function -import os -import sys -import json -import urllib2 -import functools -import subprocess - -from sparktestsupport import SPARK_HOME, ERROR_CODES -from sparktestsupport.shellutils import run_cmd - - -def print_err(msg): - """ - Given a set of arguments, will print them to the STDERR stream - """ - print(msg, file=sys.stderr) - - -def post_message_to_github(msg, ghprb_pull_id): - print("Attempting to post to Github...") - - url = "https://api.github.com/repos/apache/spark/issues/" + ghprb_pull_id + "/comments" - github_oauth_key = os.environ["GITHUB_OAUTH_KEY"] - - posted_message = json.dumps({"body": msg}) - request = urllib2.Request(url, - headers={ - "Authorization": "token %s" % github_oauth_key, - "Content-Type": "application/json" - }, - data=posted_message) - try: - response = urllib2.urlopen(request) - - if response.getcode() == 201: - print(" > Post successful.") - except urllib2.HTTPError as http_e: - print_err("Failed to post message to Github.") - print_err(" > http_code: %s" % http_e.code) - print_err(" > api_response: %s" % http_e.read()) - print_err(" > data: %s" % posted_message) - except urllib2.URLError as url_e: - print_err("Failed to post message to Github.") - print_err(" > urllib2_status: %s" % url_e.reason[1]) - print_err(" > data: %s" % posted_message) - - -def pr_message(build_display_name, - build_url, - ghprb_pull_id, - short_commit_hash, - commit_url, - msg, - post_msg=''): - # align the arguments properly for string formatting - str_args = (build_display_name, - msg, - build_url, - ghprb_pull_id, - short_commit_hash, - commit_url, - str(' ' + post_msg + '.') if post_msg else '.') - return '**[Test build %s %s](%sconsoleFull)** for PR %s at commit [`%s`](%s)%s' % str_args - - -def run_pr_checks(pr_tests, ghprb_actual_commit, sha1): - """ - Executes a set of pull request checks to ease development and report issues with various - components such as style, linting, dependencies, compatibilities, 
etc. - @return a list of messages to post back to Github - """ - # Ensure we save off the current HEAD to revert to - current_pr_head = run_cmd(['git', 'rev-parse', 'HEAD'], return_output=True).strip() - pr_results = list() - - for pr_test in pr_tests: - test_name = pr_test + '.sh' - pr_results.append(run_cmd(['bash', os.path.join(SPARK_HOME, 'dev', 'tests', test_name), - ghprb_actual_commit, sha1], - return_output=True).rstrip()) - # Ensure, after each test, that we're back on the current PR - run_cmd(['git', 'checkout', '-f', current_pr_head]) - return pr_results - - -def run_tests(tests_timeout): - """ - Runs the `dev/run-tests` script and responds with the correct error message - under the various failure scenarios. - @return a tuple containing the test result code and the result note to post to Github - """ - - test_result_code = subprocess.Popen(['timeout', - tests_timeout, - os.path.join(SPARK_HOME, 'dev', 'run-tests')]).wait() - - failure_note_by_errcode = { - 1: 'executing the `dev/run-tests` script', # error to denote run-tests script failures - ERROR_CODES["BLOCK_GENERAL"]: 'some tests', - ERROR_CODES["BLOCK_RAT"]: 'RAT tests', - ERROR_CODES["BLOCK_SCALA_STYLE"]: 'Scala style tests', - ERROR_CODES["BLOCK_PYTHON_STYLE"]: 'Python style tests', - ERROR_CODES["BLOCK_R_STYLE"]: 'R style tests', - ERROR_CODES["BLOCK_DOCUMENTATION"]: 'to generate documentation', - ERROR_CODES["BLOCK_BUILD"]: 'to build', - ERROR_CODES["BLOCK_MIMA"]: 'MiMa tests', - ERROR_CODES["BLOCK_SPARK_UNIT_TESTS"]: 'Spark unit tests', - ERROR_CODES["BLOCK_PYSPARK_UNIT_TESTS"]: 'PySpark unit tests', - ERROR_CODES["BLOCK_SPARKR_UNIT_TESTS"]: 'SparkR unit tests', - ERROR_CODES["BLOCK_TIMEOUT"]: 'from timeout after a configured wait of \`%s\`' % ( - tests_timeout) - } - - if test_result_code == 0: - test_result_note = ' * This patch passes all tests.' - else: - test_result_note = ' * This patch **fails %s**.' 
% failure_note_by_errcode[test_result_code] - - return [test_result_code, test_result_note] - - -def main(): - # Important Environment Variables - # --- - # $ghprbActualCommit - # This is the hash of the most recent commit in the PR. - # The merge-base of this and master is the commit from which the PR was branched. - # $sha1 - # If the patch merges cleanly, this is a reference to the merge commit hash - # (e.g. "origin/pr/2606/merge"). - # If the patch does not merge cleanly, it is equal to $ghprbActualCommit. - # The merge-base of this and master in the case of a clean merge is the most recent commit - # against master. - ghprb_pull_id = os.environ["ghprbPullId"] - ghprb_actual_commit = os.environ["ghprbActualCommit"] - ghprb_pull_title = os.environ["ghprbPullTitle"] - sha1 = os.environ["sha1"] - - # Marks this build as a pull request build. - os.environ["AMP_JENKINS_PRB"] = "true" - # Switch to a Maven-based build if the PR title contains "test-maven": - if "test-maven" in ghprb_pull_title: - os.environ["AMPLAB_JENKINS_BUILD_TOOL"] = "maven" - # Switch the Hadoop profile based on the PR title: - if "test-hadoop1.0" in ghprb_pull_title: - os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop1.0" - if "test-hadoop2.2" in ghprb_pull_title: - os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.0" - if "test-hadoop2.2" in ghprb_pull_title: - os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.2" - if "test-hadoop2.3" in ghprb_pull_title: - os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.3" - - build_display_name = os.environ["BUILD_DISPLAY_NAME"] - build_url = os.environ["BUILD_URL"] - - commit_url = "https://github.com/apache/spark/commit/" + ghprb_actual_commit - - # GitHub doesn't auto-link short hashes when submitted via the API, unfortunately. 
:( - short_commit_hash = ghprb_actual_commit[0:7] - - # format: http://linux.die.net/man/1/timeout - # must be less than the timeout configured on Jenkins (currently 300m) - tests_timeout = "250m" - - # Array to capture all test names to run on the pull request. These tests are represented - # by their file equivalents in the dev/tests/ directory. - # - # To write a PR test: - # * the file must reside within the dev/tests directory - # * be an executable bash script - # * accept three arguments on the command line, the first being the Github PR long commit - # hash, the second the Github SHA1 hash, and the final the current PR hash - # * and, lastly, return string output to be included in the pr message output that will - # be posted to Github - pr_tests = [ - "pr_merge_ability", - "pr_public_classes" - # DISABLED (pwendell) "pr_new_dependencies" - ] - - # `bind_message_base` returns a function to generate messages for Github posting - github_message = functools.partial(pr_message, - build_display_name, - build_url, - ghprb_pull_id, - short_commit_hash, - commit_url) - - # post start message - post_message_to_github(github_message('has started'), ghprb_pull_id) - - pr_check_results = run_pr_checks(pr_tests, ghprb_actual_commit, sha1) - - test_result_code, test_result_note = run_tests(tests_timeout) - - # post end message - result_message = github_message('has finished') - result_message += '\n' + test_result_note + '\n' - result_message += '\n'.join(pr_check_results) - - post_message_to_github(result_message, ghprb_pull_id) - - sys.exit(test_result_code) - - -if __name__ == "__main__": - main() diff --git a/dev/_site/run-tests.py b/dev/_site/run-tests.py deleted file mode 100755 index 9e1abb0697192..0000000000000 --- a/dev/_site/run-tests.py +++ /dev/null @@ -1,561 +0,0 @@ -#!/usr/bin/env python2 - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function -import itertools -from optparse import OptionParser -import os -import random -import re -import sys -import subprocess -from collections import namedtuple - -from sparktestsupport import SPARK_HOME, USER_HOME, ERROR_CODES -from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which -import sparktestsupport.modules as modules - - -# ------------------------------------------------------------------------------------------------- -# Functions for traversing module dependency graph -# ------------------------------------------------------------------------------------------------- - - -def determine_modules_for_files(filenames): - """ - Given a list of filenames, return the set of modules that contain those files. - If a file is not associated with a more specific submodule, then this method will consider that - file to belong to the 'root' module. 
- - >>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/test/foo"])) - ['pyspark-core', 'sql'] - >>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])] - ['root'] - """ - changed_modules = set() - for filename in filenames: - matched_at_least_one_module = False - for module in modules.all_modules: - if module.contains_file(filename): - changed_modules.add(module) - matched_at_least_one_module = True - if not matched_at_least_one_module: - changed_modules.add(modules.root) - return changed_modules - - -def identify_changed_files_from_git_commits(patch_sha, target_branch=None, target_ref=None): - """ - Given a git commit and target ref, use the set of files changed in the diff in order to - determine which modules' tests should be run. - - >>> [x.name for x in determine_modules_for_files( \ - identify_changed_files_from_git_commits("fc0a1475ef", target_ref="5da21f07"))] - ['graphx'] - >>> 'root' in [x.name for x in determine_modules_for_files( \ - identify_changed_files_from_git_commits("50a0496a43", target_ref="6765ef9"))] - True - """ - if target_branch is None and target_ref is None: - raise AttributeError("must specify either target_branch or target_ref") - elif target_branch is not None and target_ref is not None: - raise AttributeError("must specify either target_branch or target_ref, not both") - if target_branch is not None: - diff_target = target_branch - run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)]) - else: - diff_target = target_ref - raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target], - universal_newlines=True) - # Remove any empty strings - return [f for f in raw_output.split('\n') if f] - - -def setup_test_environ(environ): - print("[info] Setup the following environment variables for tests: ") - for (k, v) in environ.items(): - print("%s=%s" % (k, v)) - os.environ[k] = v - - -def 
determine_modules_to_test(changed_modules): - """ - Given a set of modules that have changed, compute the transitive closure of those modules' - dependent modules in order to determine the set of modules that should be tested. - - >>> sorted(x.name for x in determine_modules_to_test([modules.root])) - ['root'] - >>> sorted(x.name for x in determine_modules_to_test([modules.graphx])) - ['examples', 'graphx'] - >>> x = sorted(x.name for x in determine_modules_to_test([modules.sql])) - >>> x # doctest: +NORMALIZE_WHITESPACE - ['examples', 'hive-thriftserver', 'mllib', 'pyspark-ml', \ - 'pyspark-mllib', 'pyspark-sql', 'sparkr', 'sql'] - """ - # If we're going to have to run all of the tests, then we can just short-circuit - # and return 'root'. No module depends on root, so if it appears then it will be - # in changed_modules. - if modules.root in changed_modules: - return [modules.root] - modules_to_test = set() - for module in changed_modules: - modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules)) - return modules_to_test.union(set(changed_modules)) - - -def determine_tags_to_exclude(changed_modules): - tags = [] - for m in modules.all_modules: - if m not in changed_modules: - tags += m.test_tags - return tags - - -# ------------------------------------------------------------------------------------------------- -# Functions for working with subprocesses and shell tools -# ------------------------------------------------------------------------------------------------- - - -def determine_java_executable(): - """Will return the path of the java executable that will be used by Spark's - tests or `None`""" - - # Any changes in the way that Spark's build detects java must be reflected - # here. 
Currently the build looks for $JAVA_HOME/bin/java then falls back to - # the `java` executable on the path - - java_home = os.environ.get("JAVA_HOME") - - # check if there is an executable at $JAVA_HOME/bin/java - java_exe = which(os.path.join(java_home, "bin", "java")) if java_home else None - # if the java_exe wasn't set, check for a `java` version on the $PATH - return java_exe if java_exe else which("java") - - -JavaVersion = namedtuple('JavaVersion', ['major', 'minor', 'patch', 'update']) - - -def determine_java_version(java_exe): - """Given a valid java executable will return its version in named tuple format - with accessors '.major', '.minor', '.patch', '.update'""" - - raw_output = subprocess.check_output([java_exe, "-version"], - stderr=subprocess.STDOUT, - universal_newlines=True) - - raw_output_lines = raw_output.split('\n') - - # find raw version string, eg 'java version "1.8.0_25"' - raw_version_str = next(x for x in raw_output_lines if " version " in x) - - match = re.search('(\d+)\.(\d+)\.(\d+)_(\d+)', raw_version_str) - - major = int(match.group(1)) - minor = int(match.group(2)) - patch = int(match.group(3)) - update = int(match.group(4)) - - return JavaVersion(major, minor, patch, update) - -# ------------------------------------------------------------------------------------------------- -# Functions for running the other build and test scripts -# ------------------------------------------------------------------------------------------------- - - -def set_title_and_block(title, err_block): - os.environ["CURRENT_BLOCK"] = str(ERROR_CODES[err_block]) - line_str = '=' * 72 - - print('') - print(line_str) - print(title) - print(line_str) - - -def run_apache_rat_checks(): - set_title_and_block("Running Apache RAT checks", "BLOCK_RAT") - run_cmd([os.path.join(SPARK_HOME, "dev", "check-license")]) - - -def run_scala_style_checks(): - set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE") - run_cmd([os.path.join(SPARK_HOME, "dev", 
"lint-scala")]) - - -def run_python_style_checks(): - set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE") - run_cmd([os.path.join(SPARK_HOME, "dev", "lint-python")]) - - -def run_sparkr_style_checks(): - set_title_and_block("Running R style checks", "BLOCK_R_STYLE") - - if which("R"): - # R style check should be executed after `install-dev.sh`. - # Since warnings about `no visible global function definition` appear - # without the installation. SEE ALSO: SPARK-9121. - run_cmd([os.path.join(SPARK_HOME, "dev", "lint-r")]) - else: - print("Ignoring SparkR style check as R was not found in PATH") - - -def build_spark_documentation(): - set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION") - os.environ["PRODUCTION"] = "1 jekyll build" - - os.chdir(os.path.join(SPARK_HOME, "docs")) - - jekyll_bin = which("jekyll") - - if not jekyll_bin: - print("[error] Cannot find a version of `jekyll` on the system; please", - " install one and retry to build documentation.") - sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) - else: - run_cmd([jekyll_bin, "build"]) - - os.chdir(SPARK_HOME) - - -def get_zinc_port(): - """ - Get a randomized port on which to start Zinc - """ - return random.randrange(3030, 4030) - - -def kill_zinc_on_port(zinc_port): - """ - Kill the Zinc process running on the given port, if one exists. 
- """ - cmd = ("/usr/sbin/lsof -P |grep %s | grep LISTEN " - "| awk '{ print $2; }' | xargs kill") % zinc_port - subprocess.check_call(cmd, shell=True) - - -def exec_maven(mvn_args=()): - """Will call Maven in the current directory with the list of mvn_args passed - in and returns the subprocess for any further processing""" - - zinc_port = get_zinc_port() - os.environ["ZINC_PORT"] = "%s" % zinc_port - zinc_flag = "-DzincPort=%s" % zinc_port - flags = [os.path.join(SPARK_HOME, "build", "mvn"), "--force", zinc_flag] - run_cmd(flags + mvn_args) - kill_zinc_on_port(zinc_port) - - -def exec_sbt(sbt_args=()): - """Will call SBT in the current directory with the list of mvn_args passed - in and returns the subprocess for any further processing""" - - sbt_cmd = [os.path.join(SPARK_HOME, "build", "sbt")] + sbt_args - - sbt_output_filter = re.compile("^.*[info].*Resolving" + "|" + - "^.*[warn].*Merging" + "|" + - "^.*[info].*Including") - - # NOTE: echo "q" is needed because sbt on encountering a build file - # with failure (either resolution or compilation) prompts the user for - # input either q, r, etc to quit or retry. This echo is there to make it - # not block. - echo_proc = subprocess.Popen(["echo", "\"q\n\""], stdout=subprocess.PIPE) - sbt_proc = subprocess.Popen(sbt_cmd, - stdin=echo_proc.stdout, - stdout=subprocess.PIPE) - echo_proc.wait() - for line in iter(sbt_proc.stdout.readline, ''): - if not sbt_output_filter.match(line): - print(line, end='') - retcode = sbt_proc.wait() - - if retcode > 0: - exit_from_command_with_retcode(sbt_cmd, retcode) - - -def get_hadoop_profiles(hadoop_version): - """ - For the given Hadoop version tag, return a list of SBT profile flags for - building and testing against that Hadoop version. 
- """ - - sbt_maven_hadoop_profiles = { - "hadoop1.0": ["-Phadoop-1", "-Dhadoop.version=1.2.1"], - "hadoop2.0": ["-Phadoop-1", "-Dhadoop.version=2.0.0-mr1-cdh4.1.1"], - "hadoop2.2": ["-Pyarn", "-Phadoop-2.2"], - "hadoop2.3": ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"], - "hadoop2.6": ["-Pyarn", "-Phadoop-2.6"], - } - - if hadoop_version in sbt_maven_hadoop_profiles: - return sbt_maven_hadoop_profiles[hadoop_version] - else: - print("[error] Could not find", hadoop_version, "in the list. Valid options", - " are", sbt_maven_hadoop_profiles.keys()) - sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) - - -def build_spark_maven(hadoop_version): - # Enable all of the profiles for the build: - build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags - mvn_goals = ["clean", "package", "-DskipTests"] - profiles_and_goals = build_profiles + mvn_goals - - print("[info] Building Spark (w/Hive 1.2.1) using Maven with these arguments: ", - " ".join(profiles_and_goals)) - - exec_maven(profiles_and_goals) - - -def build_spark_sbt(hadoop_version): - # Enable all of the profiles for the build: - build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags - sbt_goals = ["package", - "assembly/assembly", - "streaming-kafka-assembly/assembly", - "streaming-flume-assembly/assembly", - "streaming-mqtt-assembly/assembly", - "streaming-mqtt/test:assembly", - "streaming-kinesis-asl-assembly/assembly"] - profiles_and_goals = build_profiles + sbt_goals - - print("[info] Building Spark (w/Hive 1.2.1) using SBT with these arguments: ", - " ".join(profiles_and_goals)) - - exec_sbt(profiles_and_goals) - - -def build_apache_spark(build_tool, hadoop_version): - """Will build Spark against Hive v1.2.1 given the passed in build tool (either `sbt` or - `maven`). 
Defaults to using `sbt`.""" - - set_title_and_block("Building Spark", "BLOCK_BUILD") - - rm_r("lib_managed") - - if build_tool == "maven": - build_spark_maven(hadoop_version) - else: - build_spark_sbt(hadoop_version) - - -def detect_binary_inop_with_mima(): - set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA") - run_cmd([os.path.join(SPARK_HOME, "dev", "mima")]) - - -def run_scala_tests_maven(test_profiles): - mvn_test_goals = ["test", "--fail-at-end"] - - profiles_and_goals = test_profiles + mvn_test_goals - - print("[info] Running Spark tests using Maven with these arguments: ", - " ".join(profiles_and_goals)) - - exec_maven(profiles_and_goals) - - -def run_scala_tests_sbt(test_modules, test_profiles): - - sbt_test_goals = set(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules)) - - if not sbt_test_goals: - return - - profiles_and_goals = test_profiles + list(sbt_test_goals) - - print("[info] Running Spark tests using SBT with these arguments: ", - " ".join(profiles_and_goals)) - - exec_sbt(profiles_and_goals) - - -def run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags): - """Function to properly execute all tests passed in as a set from the - `determine_test_suites` function""" - set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") - - test_modules = set(test_modules) - - test_profiles = get_hadoop_profiles(hadoop_version) + \ - list(set(itertools.chain.from_iterable(m.build_profile_flags for m in test_modules))) - - if excluded_tags: - test_profiles += ['-Dtest.exclude.tags=' + ",".join(excluded_tags)] - - if build_tool == "maven": - run_scala_tests_maven(test_profiles) - else: - run_scala_tests_sbt(test_modules, test_profiles) - - -def run_python_tests(test_modules, parallelism): - set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") - - command = [os.path.join(SPARK_HOME, "python", "run-tests")] - if test_modules != [modules.root]: - 
command.append("--modules=%s" % ','.join(m.name for m in test_modules)) - command.append("--parallelism=%i" % parallelism) - run_cmd(command) - - -def run_sparkr_tests(): - set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS") - - if which("R"): - run_cmd([os.path.join(SPARK_HOME, "R", "run-tests.sh")]) - else: - print("Ignoring SparkR tests as R was not found in PATH") - - -def parse_opts(): - parser = OptionParser( - prog="run-tests" - ) - parser.add_option( - "-p", "--parallelism", type="int", default=4, - help="The number of suites to test in parallel (default %default)" - ) - - (opts, args) = parser.parse_args() - if args: - parser.error("Unsupported arguments: %s" % ' '.join(args)) - if opts.parallelism < 1: - parser.error("Parallelism cannot be less than 1") - return opts - - -def main(): - opts = parse_opts() - # Ensure the user home directory (HOME) is valid and is an absolute directory - if not USER_HOME or not os.path.isabs(USER_HOME): - print("[error] Cannot determine your home directory as an absolute path;", - " ensure the $HOME environment variable is set properly.") - sys.exit(1) - - os.chdir(SPARK_HOME) - - rm_r(os.path.join(SPARK_HOME, "work")) - rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark")) - rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark")) - - os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"]) - - java_exe = determine_java_executable() - - if not java_exe: - print("[error] Cannot find a version of `java` on the system; please", - " install one and retry.") - sys.exit(2) - - java_version = determine_java_version(java_exe) - - if java_version.minor < 8: - print("[warn] Java 8 tests will not run because JDK version is < 1.8.") - - # install SparkR - if which("R"): - run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")]) - else: - print("Can't install SparkR as R is was not found in PATH") - - if os.environ.get("AMPLAB_JENKINS"): - # if we're on the Amplab Jenkins build 
servers setup variables - # to reflect the environment settings - build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt") - hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3") - test_env = "amplab_jenkins" - # add path for Python3 in Jenkins if we're calling from a Jenkins machine - os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get("PATH") - else: - # else we're running locally and can use local settings - build_tool = "sbt" - hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.3") - test_env = "local" - - print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version, - "under environment", test_env) - - changed_modules = None - changed_files = None - if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"): - target_branch = os.environ["ghprbTargetBranch"] - changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch) - changed_modules = determine_modules_for_files(changed_files) - excluded_tags = determine_tags_to_exclude(changed_modules) - if not changed_modules: - changed_modules = [modules.root] - excluded_tags = [] - print("[info] Found the following changed modules:", - ", ".join(x.name for x in changed_modules)) - - # setup environment variables - # note - the 'root' module doesn't collect environment variables for all modules. Because the - # environment variables should not be set if a module is not changed, even if running the 'root' - # module. So here we should use changed_modules rather than test_modules. 
- test_environ = {} - for m in changed_modules: - test_environ.update(m.environ) - setup_test_environ(test_environ) - - test_modules = determine_modules_to_test(changed_modules) - - # license checks - run_apache_rat_checks() - - # style checks - if not changed_files or any(f.endswith(".scala") for f in changed_files): - run_scala_style_checks() - if not changed_files or any(f.endswith(".py") for f in changed_files): - run_python_style_checks() - if not changed_files or any(f.endswith(".R") for f in changed_files): - run_sparkr_style_checks() - - # determine if docs were changed and if we're inside the amplab environment - # note - the below commented out until *all* Jenkins workers can get `jekyll` installed - # if "DOCS" in changed_modules and test_env == "amplab_jenkins": - # build_spark_documentation() - - # spark build - build_apache_spark(build_tool, hadoop_version) - - # backwards compatibility checks - if build_tool == "sbt": - # Note: compatiblity tests only supported in sbt for now - detect_binary_inop_with_mima() - - # run the test suites - run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags) - - modules_with_python_tests = [m for m in test_modules if m.python_test_goals] - if modules_with_python_tests: - run_python_tests(modules_with_python_tests, opts.parallelism) - if any(m.should_run_r_tests for m in test_modules): - run_sparkr_tests() - - -def _test(): - import doctest - failure_count = doctest.testmod()[0] - if failure_count: - exit(-1) - -if __name__ == "__main__": - _test() - main() diff --git a/dev/_site/scalastyle b/dev/_site/scalastyle deleted file mode 100755 index ad93f7e85b27c..0000000000000 --- a/dev/_site/scalastyle +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -echo -e "q\n" | build/sbt -Pkinesis-asl -Phive -Phive-thriftserver scalastyle > scalastyle.txt -echo -e "q\n" | build/sbt -Pkinesis-asl -Phive -Phive-thriftserver test:scalastyle >> scalastyle.txt -# Check style with YARN built too -echo -e "q\n" | build/sbt -Pkinesis-asl -Pyarn -Phadoop-2.2 scalastyle >> scalastyle.txt -echo -e "q\n" | build/sbt -Pkinesis-asl -Pyarn -Phadoop-2.2 test:scalastyle >> scalastyle.txt - -ERRORS=$(cat scalastyle.txt | awk '{if($1~/error/)print}') -rm scalastyle.txt - -if test ! -z "$ERRORS"; then - echo -e "Scalastyle checks failed at following occurrences:\n$ERRORS" - exit 1 -else - echo -e "Scalastyle checks passed." -fi diff --git a/dev/_site/sparktestsupport/modules.py b/dev/_site/sparktestsupport/modules.py deleted file mode 100644 index d65547e04db4b..0000000000000 --- a/dev/_site/sparktestsupport/modules.py +++ /dev/null @@ -1,437 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import itertools -import re - -all_modules = [] - - -class Module(object): - """ - A module is the basic abstraction in our test runner script. Each module consists of a set of - source files, a set of test commands, and a set of dependencies on other modules. We use modules - to define a dependency graph that lets determine which tests to run based on which files have - changed. - """ - - def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), environ={}, - sbt_test_goals=(), python_test_goals=(), blacklisted_python_implementations=(), - test_tags=(), should_run_r_tests=False): - """ - Define a new module. - - :param name: A short module name, for display in logging and error messages. - :param dependencies: A set of dependencies for this module. This should only include direct - dependencies; transitive dependencies are resolved automatically. - :param source_file_regexes: a set of regexes that match source files belonging to this - module. These regexes are applied by attempting to match at the beginning of the - filename strings. - :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in - order to build and test this module (e.g. '-PprofileName'). - :param environ: A dict of environment variables that should be set when files in this - module are changed. - :param sbt_test_goals: A set of SBT test goals for testing this module. - :param python_test_goals: A set of Python test goals for testing this module. 
- :param blacklisted_python_implementations: A set of Python implementations that are not - supported by this module's Python components. The values in this set should match - strings returned by Python's `platform.python_implementation()`. - :param test_tags A set of tags that will be excluded when running unit tests if the module - is not explicitly changed. - :param should_run_r_tests: If true, changes in this module will trigger all R tests. - """ - self.name = name - self.dependencies = dependencies - self.source_file_prefixes = source_file_regexes - self.sbt_test_goals = sbt_test_goals - self.build_profile_flags = build_profile_flags - self.environ = environ - self.python_test_goals = python_test_goals - self.blacklisted_python_implementations = blacklisted_python_implementations - self.test_tags = test_tags - self.should_run_r_tests = should_run_r_tests - - self.dependent_modules = set() - for dep in dependencies: - dep.dependent_modules.add(self) - all_modules.append(self) - - def contains_file(self, filename): - return any(re.match(p, filename) for p in self.source_file_prefixes) - - -sql = Module( - name="sql", - dependencies=[], - source_file_regexes=[ - "sql/(?!hive-thriftserver)", - "bin/spark-sql", - ], - build_profile_flags=[ - "-Phive", - ], - sbt_test_goals=[ - "catalyst/test", - "sql/test", - "hive/test", - ], - test_tags=[ - "org.apache.spark.tags.ExtendedHiveTest" - ] -) - - -hive_thriftserver = Module( - name="hive-thriftserver", - dependencies=[sql], - source_file_regexes=[ - "sql/hive-thriftserver", - "sbin/start-thriftserver.sh", - ], - build_profile_flags=[ - "-Phive-thriftserver", - ], - sbt_test_goals=[ - "hive-thriftserver/test", - ] -) - - -graphx = Module( - name="graphx", - dependencies=[], - source_file_regexes=[ - "graphx/", - ], - sbt_test_goals=[ - "graphx/test" - ] -) - - -streaming = Module( - name="streaming", - dependencies=[], - source_file_regexes=[ - "streaming", - ], - sbt_test_goals=[ - "streaming/test", - ] -) - - -# 
Don't set the dependencies because changes in other modules should not trigger Kinesis tests. -# Kinesis tests depends on external Amazon kinesis service. We should run these tests only when -# files in streaming_kinesis_asl are changed, so that if Kinesis experiences an outage, we don't -# fail other PRs. -streaming_kinesis_asl = Module( - name="streaming-kinesis-asl", - dependencies=[], - source_file_regexes=[ - "extras/kinesis-asl/", - "extras/kinesis-asl-assembly/", - ], - build_profile_flags=[ - "-Pkinesis-asl", - ], - environ={ - "ENABLE_KINESIS_TESTS": "1" - }, - sbt_test_goals=[ - "streaming-kinesis-asl/test", - ] -) - - -streaming_zeromq = Module( - name="streaming-zeromq", - dependencies=[streaming], - source_file_regexes=[ - "external/zeromq", - ], - sbt_test_goals=[ - "streaming-zeromq/test", - ] -) - - -streaming_twitter = Module( - name="streaming-twitter", - dependencies=[streaming], - source_file_regexes=[ - "external/twitter", - ], - sbt_test_goals=[ - "streaming-twitter/test", - ] -) - - -streaming_mqtt = Module( - name="streaming-mqtt", - dependencies=[streaming], - source_file_regexes=[ - "external/mqtt", - "external/mqtt-assembly", - ], - sbt_test_goals=[ - "streaming-mqtt/test", - ] -) - - -streaming_kafka = Module( - name="streaming-kafka", - dependencies=[streaming], - source_file_regexes=[ - "external/kafka", - "external/kafka-assembly", - ], - sbt_test_goals=[ - "streaming-kafka/test", - ] -) - - -streaming_flume_sink = Module( - name="streaming-flume-sink", - dependencies=[streaming], - source_file_regexes=[ - "external/flume-sink", - ], - sbt_test_goals=[ - "streaming-flume-sink/test", - ] -) - - -streaming_flume = Module( - name="streaming-flume", - dependencies=[streaming], - source_file_regexes=[ - "external/flume", - ], - sbt_test_goals=[ - "streaming-flume/test", - ] -) - - -streaming_flume_assembly = Module( - name="streaming-flume-assembly", - dependencies=[streaming_flume, streaming_flume_sink], - source_file_regexes=[ - 
"external/flume-assembly", - ] -) - - -mllib = Module( - name="mllib", - dependencies=[streaming, sql], - source_file_regexes=[ - "data/mllib/", - "mllib/", - ], - sbt_test_goals=[ - "mllib/test", - ] -) - - -examples = Module( - name="examples", - dependencies=[graphx, mllib, streaming, sql], - source_file_regexes=[ - "examples/", - ], - sbt_test_goals=[ - "examples/test", - ] -) - - -pyspark_core = Module( - name="pyspark-core", - dependencies=[], - source_file_regexes=[ - "python/(?!pyspark/(ml|mllib|sql|streaming))" - ], - python_test_goals=[ - "pyspark.rdd", - "pyspark.context", - "pyspark.conf", - "pyspark.broadcast", - "pyspark.accumulators", - "pyspark.serializers", - "pyspark.profiler", - "pyspark.shuffle", - "pyspark.tests", - ] -) - - -pyspark_sql = Module( - name="pyspark-sql", - dependencies=[pyspark_core, sql], - source_file_regexes=[ - "python/pyspark/sql" - ], - python_test_goals=[ - "pyspark.sql.types", - "pyspark.sql.context", - "pyspark.sql.column", - "pyspark.sql.dataframe", - "pyspark.sql.group", - "pyspark.sql.functions", - "pyspark.sql.readwriter", - "pyspark.sql.window", - "pyspark.sql.tests", - ] -) - - -pyspark_streaming = Module( - name="pyspark-streaming", - dependencies=[ - pyspark_core, - streaming, - streaming_kafka, - streaming_flume_assembly, - streaming_mqtt, - streaming_kinesis_asl - ], - source_file_regexes=[ - "python/pyspark/streaming" - ], - python_test_goals=[ - "pyspark.streaming.util", - "pyspark.streaming.tests", - ] -) - - -pyspark_mllib = Module( - name="pyspark-mllib", - dependencies=[pyspark_core, pyspark_streaming, pyspark_sql, mllib], - source_file_regexes=[ - "python/pyspark/mllib" - ], - python_test_goals=[ - "pyspark.mllib.classification", - "pyspark.mllib.clustering", - "pyspark.mllib.evaluation", - "pyspark.mllib.feature", - "pyspark.mllib.fpm", - "pyspark.mllib.linalg.__init__", - "pyspark.mllib.linalg.distributed", - "pyspark.mllib.random", - "pyspark.mllib.recommendation", - "pyspark.mllib.regression", - 
"pyspark.mllib.stat._statistics", - "pyspark.mllib.stat.KernelDensity", - "pyspark.mllib.tree", - "pyspark.mllib.util", - "pyspark.mllib.tests", - ], - blacklisted_python_implementations=[ - "PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there - ] -) - - -pyspark_ml = Module( - name="pyspark-ml", - dependencies=[pyspark_core, pyspark_mllib], - source_file_regexes=[ - "python/pyspark/ml/" - ], - python_test_goals=[ - "pyspark.ml.feature", - "pyspark.ml.classification", - "pyspark.ml.clustering", - "pyspark.ml.recommendation", - "pyspark.ml.regression", - "pyspark.ml.tuning", - "pyspark.ml.tests", - "pyspark.ml.evaluation", - ], - blacklisted_python_implementations=[ - "PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there - ] -) - -sparkr = Module( - name="sparkr", - dependencies=[sql, mllib], - source_file_regexes=[ - "R/", - ], - should_run_r_tests=True -) - - -docs = Module( - name="docs", - dependencies=[], - source_file_regexes=[ - "docs/", - ] -) - - -ec2 = Module( - name="ec2", - dependencies=[], - source_file_regexes=[ - "ec2/", - ] -) - - -yarn = Module( - name="yarn", - dependencies=[], - source_file_regexes=[ - "yarn/", - "network/yarn/", - ], - sbt_test_goals=[ - "yarn/test", - "network-yarn/test", - ], - test_tags=[ - "org.apache.spark.tags.ExtendedYarnTest" - ] -) - -# The root module is a dummy module which is used to run all of the tests. -# No other modules should directly depend on this module. 
-root = Module( - name="root", - dependencies=[], - source_file_regexes=[], - # In order to run all of the tests, enable every test profile: - build_profile_flags=list(set( - itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))), - sbt_test_goals=[ - "test", - ], - python_test_goals=list(itertools.chain.from_iterable(m.python_test_goals for m in all_modules)), - should_run_r_tests=True -) diff --git a/dev/_site/sparktestsupport/shellutils.py b/dev/_site/sparktestsupport/shellutils.py deleted file mode 100644 index d280e797077d1..0000000000000 --- a/dev/_site/sparktestsupport/shellutils.py +++ /dev/null @@ -1,115 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from __future__ import print_function -import os -import shutil -import subprocess -import sys - - -if sys.version_info >= (2, 7): - subprocess_check_output = subprocess.check_output - subprocess_check_call = subprocess.check_call -else: - # SPARK-8763 - # backported from subprocess module in Python 2.7 - def subprocess_check_output(*popenargs, **kwargs): - if 'stdout' in kwargs: - raise ValueError('stdout argument not allowed, it will be overridden.') - process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs) - output, unused_err = process.communicate() - retcode = process.poll() - if retcode: - cmd = kwargs.get("args") - if cmd is None: - cmd = popenargs[0] - raise subprocess.CalledProcessError(retcode, cmd, output=output) - return output - - # backported from subprocess module in Python 2.7 - def subprocess_check_call(*popenargs, **kwargs): - retcode = call(*popenargs, **kwargs) - if retcode: - cmd = kwargs.get("args") - if cmd is None: - cmd = popenargs[0] - raise CalledProcessError(retcode, cmd) - return 0 - - -def exit_from_command_with_retcode(cmd, retcode): - print("[error] running", ' '.join(cmd), "; received return code", retcode) - sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) - - -def rm_r(path): - """ - Given an arbitrary path, properly remove it with the correct Python construct if it exists. - From: http://stackoverflow.com/a/9559881 - """ - - if os.path.isdir(path): - shutil.rmtree(path) - elif os.path.exists(path): - os.remove(path) - - -def run_cmd(cmd, return_output=False): - """ - Given a command as a list of arguments will attempt to execute the command - and, on failure, print an error message and exit. 
- """ - - if not isinstance(cmd, list): - cmd = cmd.split() - try: - if return_output: - return subprocess_check_output(cmd) - else: - return subprocess_check_call(cmd) - except subprocess.CalledProcessError as e: - exit_from_command_with_retcode(e.cmd, e.returncode) - - -def is_exe(path): - """ - Check if a given path is an executable file. - From: http://stackoverflow.com/a/377028 - """ - - return os.path.isfile(path) and os.access(path, os.X_OK) - - -def which(program): - """ - Find and return the given program by its absolute path or 'None' if the program cannot be found. - From: http://stackoverflow.com/a/377028 - """ - - fpath = os.path.split(program)[0] - - if fpath: - if is_exe(program): - return program - else: - for path in os.environ.get("PATH").split(os.pathsep): - path = path.strip('"') - exe_file = os.path.join(path, program) - if is_exe(exe_file): - return exe_file - return None diff --git a/dev/_site/tests/pr_merge_ability.sh b/dev/_site/tests/pr_merge_ability.sh deleted file mode 100755 index d9a347fe24a8c..0000000000000 --- a/dev/_site/tests/pr_merge_ability.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# -# This script follows the base format for testing pull requests against -# another branch and returning results to be published. More details can be -# found at dev/run-tests-jenkins. -# -# Arg1: The Github Pull Request Actual Commit -#+ known as `ghprbActualCommit` in `run-tests-jenkins` -# Arg2: The SHA1 hash -#+ known as `sha1` in `run-tests-jenkins` -# - -ghprbActualCommit="$1" -sha1="$2" - -# check PR merge-ability -if [ "${sha1}" == "${ghprbActualCommit}" ]; then - echo " * This patch **does not merge cleanly**." -else - echo " * This patch merges cleanly." -fi diff --git a/dev/_site/tests/pr_new_dependencies.sh b/dev/_site/tests/pr_new_dependencies.sh deleted file mode 100755 index fdfb3c62aff58..0000000000000 --- a/dev/_site/tests/pr_new_dependencies.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# -# This script follows the base format for testing pull requests against -# another branch and returning results to be published. More details can be -# found at dev/run-tests-jenkins. 
-# -# Arg1: The Github Pull Request Actual Commit -#+ known as `ghprbActualCommit` in `run-tests-jenkins` -# Arg2: The SHA1 hash -#+ known as `sha1` in `run-tests-jenkins` -# Arg3: Current PR Commit Hash -#+ the PR hash for the current commit -# - -ghprbActualCommit="$1" -sha1="$2" -current_pr_head="$3" - -MVN_BIN="build/mvn" -CURR_CP_FILE="my-classpath.txt" -MASTER_CP_FILE="master-classpath.txt" - -# First switch over to the master branch -git checkout -f master -# Find and copy all pom.xml files into a *.gate file that we can check -# against through various `git` changes -find -name "pom.xml" -exec cp {} {}.gate \; -# Switch back to the current PR -git checkout -f "${current_pr_head}" - -# Check if any *.pom files from the current branch are different from the master -difference_q="" -for p in $(find -name "pom.xml"); do - [[ -f "${p}" && -f "${p}.gate" ]] && \ - difference_q="${difference_q}$(diff $p.gate $p)" -done - -# If no pom files were changed we can easily say no new dependencies were added -if [ -z "${difference_q}" ]; then - echo " * This patch does not change any dependencies." 
-else - # Else we need to manually build spark to determine what, if any, dependencies - # were added into the Spark assembly jar - ${MVN_BIN} clean package dependency:build-classpath -DskipTests 2>/dev/null | \ - sed -n -e '/Building Spark Project Assembly/,$p' | \ - grep --context=1 -m 2 "Dependencies classpath:" | \ - head -n 3 | \ - tail -n 1 | \ - tr ":" "\n" | \ - rev | \ - cut -d "/" -f 1 | \ - rev | \ - sort > ${CURR_CP_FILE} - - # Checkout the master branch to compare against - git checkout -f master - - ${MVN_BIN} clean package dependency:build-classpath -DskipTests 2>/dev/null | \ - sed -n -e '/Building Spark Project Assembly/,$p' | \ - grep --context=1 -m 2 "Dependencies classpath:" | \ - head -n 3 | \ - tail -n 1 | \ - tr ":" "\n" | \ - rev | \ - cut -d "/" -f 1 | \ - rev | \ - sort > ${MASTER_CP_FILE} - - DIFF_RESULTS="`diff ${CURR_CP_FILE} ${MASTER_CP_FILE}`" - - if [ -z "${DIFF_RESULTS}" ]; then - echo " * This patch does not change any dependencies." - else - # Pretty print the new dependencies - added_deps=$(echo "${DIFF_RESULTS}" | grep "<" | cut -d' ' -f2 | awk '{printf " * \`"$1"\`\\n"}') - removed_deps=$(echo "${DIFF_RESULTS}" | grep ">" | cut -d' ' -f2 | awk '{printf " * \`"$1"\`\\n"}') - added_deps_text=" * This patch **adds the following new dependencies:**\n${added_deps}" - removed_deps_text=" * This patch **removes the following dependencies:**\n${removed_deps}" - - # Construct the final returned message with proper - return_mssg="" - [ -n "${added_deps}" ] && return_mssg="${added_deps_text}" - if [ -n "${removed_deps}" ]; then - if [ -n "${return_mssg}" ]; then - return_mssg="${return_mssg}\n${removed_deps_text}" - else - return_mssg="${removed_deps_text}" - fi - fi - echo "${return_mssg}" - fi - - # Remove the files we've left over - [ -f "${CURR_CP_FILE}" ] && rm -f "${CURR_CP_FILE}" - [ -f "${MASTER_CP_FILE}" ] && rm -f "${MASTER_CP_FILE}" - - # Clean up our mess from the Maven builds just in case - ${MVN_BIN} clean &>/dev/null -fi 
diff --git a/dev/_site/tests/pr_public_classes.sh b/dev/_site/tests/pr_public_classes.sh deleted file mode 100755 index 927295b88c963..0000000000000 --- a/dev/_site/tests/pr_public_classes.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# -# This script follows the base format for testing pull requests against -# another branch and returning results to be published. More details can be -# found at dev/run-tests-jenkins. -# -# Arg1: The Github Pull Request Actual Commit -#+ known as `ghprbActualCommit` in `run-tests-jenkins` -# Arg2: The SHA1 hash -#+ known as `sha1` in `run-tests-jenkins` -# - -# We diff master...$ghprbActualCommit because that gets us changes introduced in the PR -#+ and not anything else added to master since the PR was branched. 
- -ghprbActualCommit="$1" -sha1="$2" - -source_files=$( - git diff master...$ghprbActualCommit --name-only `# diff patch against master from branch point` \ - | grep -v -e "\/test" `# ignore files in test directories` \ - | grep -e "\.py$" -e "\.java$" -e "\.scala$" `# include only code files` \ - | tr "\n" " " -) -new_public_classes=$( - git diff master...$ghprbActualCommit ${source_files} `# diff patch against master from branch point` \ - | grep "^\+" `# filter in only added lines` \ - | sed -r -e "s/^\+//g" `# remove the leading +` \ - | grep -e "trait " -e "class " `# filter in lines with these key words` \ - | grep -e "{" -e "(" `# filter in lines with these key words, too` \ - | grep -v -e "\@\@" -e "private" `# exclude lines with these words` \ - | grep -v -e "^// " -e "^/\*" -e "^ \* " `# exclude comment lines` \ - | sed -r -e "s/\{.*//g" `# remove from the { onwards` \ - | sed -r -e "s/\}//g" `# just in case, remove }; they mess the JSON` \ - | sed -r -e "s/\"/\\\\\"/g" `# escape double quotes; they mess the JSON` \ - | sed -r -e "s/^(.*)$/\`\1\`/g" `# surround with backticks for style` \ - | sed -r -e "s/^/ \* /g" `# prepend ' *' to start of line` \ - | sed -r -e "s/$/\\\n/g" `# append newline to end of line` \ - | tr -d "\n" `# remove actual LF characters` -) - -if [ -z "$new_public_classes" ]; then - echo " * This patch adds no public classes." 
-else - public_classes_note=" * This patch adds the following public classes _(experimental)_:" - echo "${public_classes_note}\n${new_public_classes}" -fi From 1106cae4f74cf4727b870c9eb8e695dfd0c423a9 Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Fri, 13 Nov 2015 14:48:02 +0530 Subject: [PATCH 05/13] java style issues --- .../mllib/JavaBinaryClassification.java | 151 +++++----- .../examples/mllib/JavaLinearRegression.java | 94 +++---- .../mllib/JavaMultiLabelClassification.java | 73 ++--- .../mllib/JavaMulticlassClassification.java | 94 +++---- .../spark/examples/mllib/JavaRanking.java | 265 +++++++++--------- .../mllib/binary_classification_metrics.py | 2 + .../main/python/mllib/regression_metrics.py | 4 +- .../mllib/BinaryClassificationMetrics.scala | 3 +- .../examples/mllib/MultiLabelMetrics.scala | 1 + 9 files changed, 348 insertions(+), 339 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java index b17dc79abff16..58b255eb598ec 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java @@ -19,6 +19,7 @@ package org.apache.spark.examples.mllib; // $example on$ + import scala.Tuple2; import org.apache.spark.api.java.*; @@ -32,82 +33,80 @@ import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; - public class JavaBinaryClassification { - public static void main(String[] args) { - - SparkConf conf = new SparkConf().setAppName("Binary Classification Metrics"); - SparkContext sc = new SparkContext(conf); - String path = "data/mllib/sample_binary_classification_data.txt"; - JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); - - // Split initial RDD into two... [60% training data, 40% testing data]. 
- JavaRDD[] splits = data.randomSplit(new double[]{0.6, 0.4}, 11L); - JavaRDD training = splits[0].cache(); - JavaRDD test = splits[1]; - - // Run training algorithm to build the model. - final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() - .setNumClasses(2) - .run(training.rdd()); - - // Clear the prediction threshold so the model will return probabilities - model.clearThreshold(); - - // Compute raw scores on the test set. - JavaRDD> predictionAndLabels = test.map( - new Function>() { - public Tuple2 call(LabeledPoint p) { - Double prediction = model.predict(p.features()); - return new Tuple2(prediction, p.label()); - } - } - ); - - // Get evaluation metrics. - BinaryClassificationMetrics metrics = new BinaryClassificationMetrics(predictionAndLabels.rdd()); - - // Precision by threshold - JavaRDD> precision = metrics.precisionByThreshold().toJavaRDD(); - System.out.println("Precision by threshold: " + precision.toArray()); - - // Recall by threshold - JavaRDD> recall = metrics.recallByThreshold().toJavaRDD(); - System.out.println("Recall by threshold: " + recall.toArray()); - - // F Score by threshold - JavaRDD> f1Score = metrics.fMeasureByThreshold().toJavaRDD(); - System.out.println("F1 Score by threshold: " + f1Score.toArray()); - - JavaRDD> f2Score = metrics.fMeasureByThreshold(2.0).toJavaRDD(); - System.out.println("F2 Score by threshold: " + f2Score.toArray()); - - // Precision-recall curve - JavaRDD> prc = metrics.pr().toJavaRDD(); - System.out.println("Precision-recall curve: " + prc.toArray()); - - // Thresholds - JavaRDD thresholds = precision.map( - new Function, Double>() { - public Double call(Tuple2 t) { - return new Double(t._1().toString()); - } - } - ); - - // ROC Curve - JavaRDD> roc = metrics.roc().toJavaRDD(); - System.out.println("ROC curve: " + roc.toArray()); - - // AUPRC - System.out.println("Area under precision-recall curve = " + metrics.areaUnderPR()); - - // AUROC - System.out.println("Area under ROC = " + 
metrics.areaUnderROC()); - - // Save and load model - model.save(sc, "myModelPath"); - LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); - } + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("Binary Classification Metrics"); + SparkContext sc = new SparkContext(conf); + String path = "data/mllib/sample_binary_classification_data.txt"; + JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); + + // Split initial RDD into two... [60% training data, 40% testing data]. + JavaRDD[] splits = data.randomSplit(new double[]{0.6, 0.4}, 11L); + JavaRDD training = splits[0].cache(); + JavaRDD test = splits[1]; + + // Run training algorithm to build the model. + final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() + .setNumClasses(2) + .run(training.rdd()); + + // Clear the prediction threshold so the model will return probabilities + model.clearThreshold(); + + // Compute raw scores on the test set. + JavaRDD> predictionAndLabels = test.map( + new Function>() { + public Tuple2 call(LabeledPoint p) { + Double prediction = model.predict(p.features()); + return new Tuple2(prediction, p.label()); + } + } + ); + + // Get evaluation metrics. 
+ BinaryClassificationMetrics metrics = new BinaryClassificationMetrics(predictionAndLabels.rdd()); + + // Precision by threshold + JavaRDD> precision = metrics.precisionByThreshold().toJavaRDD(); + System.out.println("Precision by threshold: " + precision.toArray()); + + // Recall by threshold + JavaRDD> recall = metrics.recallByThreshold().toJavaRDD(); + System.out.println("Recall by threshold: " + recall.toArray()); + + // F Score by threshold + JavaRDD> f1Score = metrics.fMeasureByThreshold().toJavaRDD(); + System.out.println("F1 Score by threshold: " + f1Score.toArray()); + + JavaRDD> f2Score = metrics.fMeasureByThreshold(2.0).toJavaRDD(); + System.out.println("F2 Score by threshold: " + f2Score.toArray()); + + // Precision-recall curve + JavaRDD> prc = metrics.pr().toJavaRDD(); + System.out.println("Precision-recall curve: " + prc.toArray()); + + // Thresholds + JavaRDD thresholds = precision.map( + new Function, Double>() { + public Double call(Tuple2 t) { + return new Double(t._1().toString()); + } + } + ); + + // ROC Curve + JavaRDD> roc = metrics.roc().toJavaRDD(); + System.out.println("ROC curve: " + roc.toArray()); + + // AUPRC + System.out.println("Area under precision-recall curve = " + metrics.areaUnderPR()); + + // AUROC + System.out.println("Area under ROC = " + metrics.areaUnderROC()); + + // Save and load model + model.save(sc, "myModelPath"); + LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); + } } // $example off$ \ No newline at end of file diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java index cc60409b42859..6781ec619b89e 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java @@ -19,6 +19,7 @@ package org.apache.spark.examples.mllib; // $example on$ + import 
scala.Tuple2; import org.apache.spark.api.java.*; @@ -30,63 +31,62 @@ import org.apache.spark.mllib.evaluation.RegressionMetrics; import org.apache.spark.SparkConf; - // Read in the ratings data public class JavaLinearRegression { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Linear Regression Example"); - JavaSparkContext sc = new JavaSparkContext(conf); + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("Linear Regression Example"); + JavaSparkContext sc = new JavaSparkContext(conf); - // Load and parse the data - String path = "data/mllib/sample_linear_regression_data.txt"; - JavaRDD data = sc.textFile(path); - JavaRDD parsedData = data.map( - new Function() { - public LabeledPoint call(String line) { - String[] parts = line.split(" "); - double[] v = new double[parts.length - 1]; - for (int i = 1; i < parts.length - 1; i++) - v[i - 1] = Double.parseDouble(parts[i].split(":")[1]); - return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v)); - } - } - ); - parsedData.cache(); + // Load and parse the data + String path = "data/mllib/sample_linear_regression_data.txt"; + JavaRDD data = sc.textFile(path); + JavaRDD parsedData = data.map( + new Function() { + public LabeledPoint call(String line) { + String[] parts = line.split(" "); + double[] v = new double[parts.length - 1]; + for (int i = 1; i < parts.length - 1; i++) + v[i - 1] = Double.parseDouble(parts[i].split(":")[1]); + return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v)); + } + } + ); + parsedData.cache(); - // Building the model - int numIterations = 100; - final LinearRegressionModel model = - LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations); + // Building the model + int numIterations = 100; + final LinearRegressionModel model = + LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations); - // Evaluate model on training examples and compute training 
error - JavaRDD> valuesAndPreds = parsedData.map( - new Function>() { - public Tuple2 call(LabeledPoint point) { - double prediction = model.predict(point.features()); - return new Tuple2(prediction, point.label()); - } - } - ); + // Evaluate model on training examples and compute training error + JavaRDD> valuesAndPreds = parsedData.map( + new Function>() { + public Tuple2 call(LabeledPoint point) { + double prediction = model.predict(point.features()); + return new Tuple2(prediction, point.label()); + } + } + ); - // Instantiate metrics object - RegressionMetrics metrics = new RegressionMetrics(valuesAndPreds.rdd()); + // Instantiate metrics object + RegressionMetrics metrics = new RegressionMetrics(valuesAndPreds.rdd()); - // Squared error - System.out.format("MSE = %f\n", metrics.meanSquaredError()); - System.out.format("RMSE = %f\n", metrics.rootMeanSquaredError()); + // Squared error + System.out.format("MSE = %f\n", metrics.meanSquaredError()); + System.out.format("RMSE = %f\n", metrics.rootMeanSquaredError()); - // R-squared - System.out.format("R Squared = %f\n", metrics.r2()); + // R-squared + System.out.format("R Squared = %f\n", metrics.r2()); - // Mean absolute error - System.out.format("MAE = %f\n", metrics.meanAbsoluteError()); + // Mean absolute error + System.out.format("MAE = %f\n", metrics.meanAbsoluteError()); - // Explained variance - System.out.format("Explained Variance = %f\n", metrics.explainedVariance()); + // Explained variance + System.out.format("Explained Variance = %f\n", metrics.explainedVariance()); - // Save and load model - model.save(sc.sc(), "myModelPath"); - LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), "myModelPath"); - } + // Save and load model + model.save(sc.sc(), "myModelPath"); + LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), "myModelPath"); + } } // $example off$ \ No newline at end of file diff --git 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java index 53204523bc865..c4d44dd8ea872 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java @@ -19,60 +19,63 @@ package org.apache.spark.examples.mllib; // $example on$ + import scala.Tuple2; import org.apache.spark.api.java.*; import org.apache.spark.rdd.RDD; import org.apache.spark.mllib.evaluation.MultilabelMetrics; import org.apache.spark.SparkConf; + import java.util.Arrays; import java.util.List; // $example off$ import org.apache.spark.SparkContext; + // $example on$ public class JavaMultiLabelClassification { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Multilabel Classification Metrics"); - JavaSparkContext sc = new JavaSparkContext(conf); + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("Multilabel Classification Metrics"); + JavaSparkContext sc = new JavaSparkContext(conf); - List> data = Arrays.asList( - new Tuple2(new double[]{0.0, 1.0}, new double[]{0.0, 2.0}), - new Tuple2(new double[]{0.0, 2.0}, new double[]{0.0, 1.0}), - new Tuple2(new double[]{}, new double[]{0.0}), - new Tuple2(new double[]{2.0}, new double[]{2.0}), - new Tuple2(new double[]{2.0, 0.0}, new double[]{2.0, 0.0}), - new Tuple2(new double[]{0.0, 1.0, 2.0}, new double[]{0.0, 1.0}), - new Tuple2(new double[]{1.0}, new double[]{1.0, 2.0}) - ); - JavaRDD> scoreAndLabels = sc.parallelize(data); + List> data = Arrays.asList( + new Tuple2(new double[]{0.0, 1.0}, new double[]{0.0, 2.0}), + new Tuple2(new double[]{0.0, 2.0}, new double[]{0.0, 1.0}), + new Tuple2(new double[]{}, new double[]{0.0}), + new Tuple2(new double[]{2.0}, new double[]{2.0}), + new Tuple2(new double[]{2.0, 0.0}, new 
double[]{2.0, 0.0}), + new Tuple2(new double[]{0.0, 1.0, 2.0}, new double[]{0.0, 1.0}), + new Tuple2(new double[]{1.0}, new double[]{1.0, 2.0}) + ); + JavaRDD> scoreAndLabels = sc.parallelize(data); - // Instantiate metrics object - MultilabelMetrics metrics = new MultilabelMetrics(scoreAndLabels.rdd()); + // Instantiate metrics object + MultilabelMetrics metrics = new MultilabelMetrics(scoreAndLabels.rdd()); - // Summary stats - System.out.format("Recall = %f\n", metrics.recall()); - System.out.format("Precision = %f\n", metrics.precision()); - System.out.format("F1 measure = %f\n", metrics.f1Measure()); - System.out.format("Accuracy = %f\n", metrics.accuracy()); + // Summary stats + System.out.format("Recall = %f\n", metrics.recall()); + System.out.format("Precision = %f\n", metrics.precision()); + System.out.format("F1 measure = %f\n", metrics.f1Measure()); + System.out.format("Accuracy = %f\n", metrics.accuracy()); - // Stats by labels - for (int i = 0; i < metrics.labels().length - 1; i++) { - System.out.format("Class %1.1f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])); - System.out.format("Class %1.1f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])); - System.out.format("Class %1.1f F1 score = %f\n", metrics.labels()[i], metrics.f1Measure(metrics.labels()[i])); - } + // Stats by labels + for (int i = 0; i < metrics.labels().length - 1; i++) { + System.out.format("Class %1.1f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])); + System.out.format("Class %1.1f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])); + System.out.format("Class %1.1f F1 score = %f\n", metrics.labels()[i], metrics.f1Measure(metrics.labels()[i])); + } - // Micro stats - System.out.format("Micro recall = %f\n", metrics.microRecall()); - System.out.format("Micro precision = %f\n", metrics.microPrecision()); - System.out.format("Micro F1 measure = %f\n", metrics.microF1Measure()); 
+ // Micro stats + System.out.format("Micro recall = %f\n", metrics.microRecall()); + System.out.format("Micro precision = %f\n", metrics.microPrecision()); + System.out.format("Micro F1 measure = %f\n", metrics.microF1Measure()); - // Hamming loss - System.out.format("Hamming loss = %f\n", metrics.hammingLoss()); + // Hamming loss + System.out.format("Hamming loss = %f\n", metrics.hammingLoss()); - // Subset accuracy - System.out.format("Subset accuracy = %f\n", metrics.subsetAccuracy()); + // Subset accuracy + System.out.format("Subset accuracy = %f\n", metrics.subsetAccuracy()); - } + } } // $example off$ \ No newline at end of file diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java index 0e74da7a883d1..cc5af0b178fa5 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java @@ -19,6 +19,7 @@ package org.apache.spark.examples.mllib // $example on$ + import scala.Tuple2; import org.apache.spark.api.java.*; @@ -33,62 +34,61 @@ import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; - public class JavaMulticlassClassification { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Multi class Classification Metrics"); - SparkContext sc = new SparkContext(conf); - String path = "data/mllib/sample_multiclass_classification_data.txt"; - JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("Multi class Classification Metrics"); + SparkContext sc = new SparkContext(conf); + String path = "data/mllib/sample_multiclass_classification_data.txt"; + JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); - // Split initial RDD into two... 
[60% training data, 40% testing data]. - JavaRDD[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L); - JavaRDD training = splits[0].cache(); - JavaRDD test = splits[1]; + // Split initial RDD into two... [60% training data, 40% testing data]. + JavaRDD[] splits = data.randomSplit(new double[]{0.6, 0.4}, 11L); + JavaRDD training = splits[0].cache(); + JavaRDD test = splits[1]; - // Run training algorithm to build the model. - final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() - .setNumClasses(3) - .run(training.rdd()); + // Run training algorithm to build the model. + final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() + .setNumClasses(3) + .run(training.rdd()); - // Compute raw scores on the test set. - JavaRDD> predictionAndLabels = test.map( - new Function>() { - public Tuple2 call(LabeledPoint p) { - Double prediction = model.predict(p.features()); - return new Tuple2(prediction, p.label()); - } - } - ); + // Compute raw scores on the test set. + JavaRDD> predictionAndLabels = test.map( + new Function>() { + public Tuple2 call(LabeledPoint p) { + Double prediction = model.predict(p.features()); + return new Tuple2(prediction, p.label()); + } + } + ); - // Get evaluation metrics. - MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd()); + // Get evaluation metrics. 
+ MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd()); - // Confusion matrix - Matrix confusion = metrics.confusionMatrix(); - System.out.println("Confusion matrix: \n" + confusion); + // Confusion matrix + Matrix confusion = metrics.confusionMatrix(); + System.out.println("Confusion matrix: \n" + confusion); - // Overall statistics - System.out.println("Precision = " + metrics.precision()); - System.out.println("Recall = " + metrics.recall()); - System.out.println("F1 Score = " + metrics.fMeasure()); + // Overall statistics + System.out.println("Precision = " + metrics.precision()); + System.out.println("Recall = " + metrics.recall()); + System.out.println("F1 Score = " + metrics.fMeasure()); - // Stats by labels - for (int i = 0; i < metrics.labels().length; i++) { - System.out.format("Class %f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])); - System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])); - System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(metrics.labels()[i])); - } + // Stats by labels + for (int i = 0; i < metrics.labels().length; i++) { + System.out.format("Class %f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])); + System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])); + System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(metrics.labels()[i])); + } - //Weighted stats - System.out.format("Weighted precision = %f\n", metrics.weightedPrecision()); - System.out.format("Weighted recall = %f\n", metrics.weightedRecall()); - System.out.format("Weighted F1 score = %f\n", metrics.weightedFMeasure()); - System.out.format("Weighted false positive rate = %f\n", metrics.weightedFalsePositiveRate()); + //Weighted stats + System.out.format("Weighted precision = %f\n", metrics.weightedPrecision()); + 
System.out.format("Weighted recall = %f\n", metrics.weightedRecall()); + System.out.format("Weighted F1 score = %f\n", metrics.weightedFMeasure()); + System.out.format("Weighted false positive rate = %f\n", metrics.weightedFalsePositiveRate()); - // Save and load model - model.save(sc, "myModelPath"); - LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); - } + // Save and load model + model.save(sc, "myModelPath"); + LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); + } } // $example off$ \ No newline at end of file diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java index b389a09c2715f..2fca06b25ebe5 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java @@ -19,6 +19,7 @@ package org.apache.spark.examples.mllib; // $example on$ + import scala.Tuple2; import org.apache.spark.api.java.*; @@ -26,151 +27,151 @@ import org.apache.spark.mllib.recommendation.MatrixFactorizationModel; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.Function; + import java.util.*; + import org.apache.spark.mllib.evaluation.RegressionMetrics; import org.apache.spark.mllib.evaluation.RankingMetrics; import org.apache.spark.mllib.recommendation.ALS; import org.apache.spark.mllib.recommendation.Rating; - // Read in the ratings data public class JavaRanking { - public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Ranking Metrics"); - JavaSparkContext sc = new JavaSparkContext(conf); - String path = "data/mllib/sample_movielens_data.txt"; - JavaRDD data = sc.textFile(path); - JavaRDD ratings = data.map( - new Function() { - public Rating call(String line) { - String[] parts = line.split("::"); - return new Rating(Integer.parseInt(parts[0]), 
Integer.parseInt(parts[1]), Double.parseDouble(parts[2]) - 2.5); - } + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("Ranking Metrics"); + JavaSparkContext sc = new JavaSparkContext(conf); + String path = "data/mllib/sample_movielens_data.txt"; + JavaRDD data = sc.textFile(path); + JavaRDD ratings = data.map( + new Function() { + public Rating call(String line) { + String[] parts = line.split("::"); + return new Rating(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]), Double.parseDouble(parts[2]) - 2.5); + } + } + ); + ratings.cache(); + + // Train an ALS model + final MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), 10, 10, 0.01); + + // Get top 10 recommendations for every user and scale ratings from 0 to 1 + JavaRDD> userRecs = model.recommendProductsForUsers(10).toJavaRDD(); + JavaRDD> userRecsScaled = userRecs.map( + new Function, Tuple2>() { + public Tuple2 call(Tuple2 t) { + Rating[] scaledRatings = new Rating[t._2().length]; + for (int i = 0; i < scaledRatings.length; i++) { + double newRating = Math.max(Math.min(t._2()[i].rating(), 1.0), 0.0); + scaledRatings[i] = new Rating(t._2()[i].user(), t._2()[i].product(), newRating); } - ); - ratings.cache(); - - // Train an ALS model - final MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), 10, 10, 0.01); - - // Get top 10 recommendations for every user and scale ratings from 0 to 1 - JavaRDD> userRecs = model.recommendProductsForUsers(10).toJavaRDD(); - JavaRDD> userRecsScaled = userRecs.map( - new Function, Tuple2>() { - public Tuple2 call(Tuple2 t) { - Rating[] scaledRatings = new Rating[t._2().length]; - for (int i = 0; i < scaledRatings.length; i++) { - double newRating = Math.max(Math.min(t._2()[i].rating(), 1.0), 0.0); - scaledRatings[i] = new Rating(t._2()[i].user(), t._2()[i].product(), newRating); - } - return new Tuple2(t._1(), scaledRatings); - } + return new Tuple2(t._1(), scaledRatings); + } + } + ); + JavaPairRDD 
userRecommended = JavaPairRDD.fromJavaRDD(userRecsScaled); + + // Map ratings to 1 or 0, 1 indicating a movie that should be recommended + JavaRDD binarizedRatings = ratings.map( + new Function() { + public Rating call(Rating r) { + double binaryRating; + if (r.rating() > 0.0) { + binaryRating = 1.0; + } else { + binaryRating = 0.0; } - ); - JavaPairRDD userRecommended = JavaPairRDD.fromJavaRDD(userRecsScaled); - - // Map ratings to 1 or 0, 1 indicating a movie that should be recommended - JavaRDD binarizedRatings = ratings.map( - new Function() { - public Rating call(Rating r) { - double binaryRating; - if (r.rating() > 0.0) { - binaryRating = 1.0; - } - else { - binaryRating = 0.0; - } - return new Rating(r.user(), r.product(), binaryRating); - } + return new Rating(r.user(), r.product(), binaryRating); + } + } + ); + + // Group ratings by common user + JavaPairRDD> userMovies = binarizedRatings.groupBy( + new Function() { + public Object call(Rating r) { + return r.user(); + } + } + ); + + // Get true relevant documents from all user ratings + JavaPairRDD> userMoviesList = userMovies.mapValues( + new Function, List>() { + public List call(Iterable docs) { + List products = new ArrayList(); + for (Rating r : docs) { + if (r.rating() > 0.0) { + products.add(r.product()); + } } - ); - - // Group ratings by common user - JavaPairRDD> userMovies = binarizedRatings.groupBy( - new Function() { - public Object call(Rating r) { - return r.user(); - } + return products; + } + } + ); + + // Extract the product id from each recommendation + JavaPairRDD> userRecommendedList = userRecommended.mapValues( + new Function>() { + public List call(Rating[] docs) { + List products = new ArrayList(); + for (Rating r : docs) { + products.add(r.product()); } - ); - - // Get true relevant documents from all user ratings - JavaPairRDD> userMoviesList = userMovies.mapValues( - new Function, List>() { - public List call(Iterable docs) { - List products = new ArrayList(); - for (Rating r : 
docs) { - if (r.rating() > 0.0) { - products.add(r.product()); - } - } - return products; - } - } - ); - - // Extract the product id from each recommendation - JavaPairRDD> userRecommendedList = userRecommended.mapValues( - new Function>() { - public List call(Rating[] docs) { - List products = new ArrayList(); - for (Rating r : docs) { - products.add(r.product()); - } - return products; + return products; + } + } + ); + JavaRDD, List>> relevantDocs = userMoviesList.join(userRecommendedList).values(); + + // Instantiate the metrics object + RankingMetrics metrics = RankingMetrics.of(relevantDocs); + + // Precision and NDCG at k + Integer[] kVector = {1, 3, 5}; + for (Integer k : kVector) { + System.out.format("Precision at %d = %f\n", k, metrics.precisionAt(k)); + System.out.format("NDCG at %d = %f\n", k, metrics.ndcgAt(k)); + } + + // Mean average precision + System.out.format("Mean average precision = %f\n", metrics.meanAveragePrecision()); + + // Evaluate the model using numerical ratings and regression metrics + JavaRDD> userProducts = ratings.map( + new Function>() { + public Tuple2 call(Rating r) { + return new Tuple2(r.user(), r.product()); + } + } + ); + JavaPairRDD, Object> predictions = JavaPairRDD.fromJavaRDD( + model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map( + new Function, Object>>() { + public Tuple2, Object> call(Rating r) { + return new Tuple2, Object>( + new Tuple2(r.user(), r.product()), r.rating()); + } } - } - ); - JavaRDD, List>> relevantDocs = userMoviesList.join(userRecommendedList).values(); - - // Instantiate the metrics object - RankingMetrics metrics = RankingMetrics.of(relevantDocs); - - // Precision and NDCG at k - Integer[] kVector = {1, 3, 5}; - for (Integer k : kVector) { - System.out.format("Precision at %d = %f\n", k, metrics.precisionAt(k)); - System.out.format("NDCG at %d = %f\n", k, metrics.ndcgAt(k)); - } - - // Mean average precision - System.out.format("Mean average precision = %f\n", 
metrics.meanAveragePrecision()); - - // Evaluate the model using numerical ratings and regression metrics - JavaRDD> userProducts = ratings.map( - new Function>() { - public Tuple2 call(Rating r) { - return new Tuple2(r.user(), r.product()); + )); + JavaRDD> ratesAndPreds = + JavaPairRDD.fromJavaRDD(ratings.map( + new Function, Object>>() { + public Tuple2, Object> call(Rating r) { + return new Tuple2, Object>( + new Tuple2(r.user(), r.product()), r.rating()); + } } - } - ); - JavaPairRDD, Object> predictions = JavaPairRDD.fromJavaRDD( - model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map( - new Function, Object>>() { - public Tuple2, Object> call(Rating r){ - return new Tuple2, Object>( - new Tuple2(r.user(), r.product()), r.rating()); - } - } - )); - JavaRDD> ratesAndPreds = - JavaPairRDD.fromJavaRDD(ratings.map( - new Function, Object>>() { - public Tuple2, Object> call(Rating r){ - return new Tuple2, Object>( - new Tuple2(r.user(), r.product()), r.rating()); - } - } - )).join(predictions).values(); - - // Create regression metrics object - RegressionMetrics regressionMetrics = new RegressionMetrics(ratesAndPreds.rdd()); - - // Root mean squared error - System.out.format("RMSE = %f\n", regressionMetrics.rootMeanSquaredError()); - - // R-squared - System.out.format("R-squared = %f\n", regressionMetrics.r2()); - } + )).join(predictions).values(); + + // Create regression metrics object + RegressionMetrics regressionMetrics = new RegressionMetrics(ratesAndPreds.rdd()); + + // Root mean squared error + System.out.format("RMSE = %f\n", regressionMetrics.rootMeanSquaredError()); + + // R-squared + System.out.format("R-squared = %f\n", regressionMetrics.r2()); + } } // $example off$ \ No newline at end of file diff --git a/examples/src/main/python/mllib/binary_classification_metrics.py b/examples/src/main/python/mllib/binary_classification_metrics.py index 85583c7e6cfa7..f8c32bbe6154e 100644 --- a/examples/src/main/python/mllib/binary_classification_metrics.py 
+++ b/examples/src/main/python/mllib/binary_classification_metrics.py @@ -36,7 +36,9 @@ sc = SparkContext(appName="BinaryClassificationMetrics") sqlContext = SQLContext(sc) + # $example on$ # Several of the methods available in scala are currently missing from pyspark + # $example off$ # $example on$ # Load training data in LIBSVM format diff --git a/examples/src/main/python/mllib/regression_metrics.py b/examples/src/main/python/mllib/regression_metrics.py index 2b90f2457267c..aca33aa7f8611 100644 --- a/examples/src/main/python/mllib/regression_metrics.py +++ b/examples/src/main/python/mllib/regression_metrics.py @@ -27,9 +27,11 @@ sc = SparkContext(appName="Regression Metrics") # $example on$ # Load and parse the data + def parsePoint(line): values = line.split() - return LabeledPoint(float(values[0]),DenseVector([float(x.split(':')[1]) for x in values[1:]])) + return LabeledPoint(float(values[0]), + DenseVector([float(x.split(':')[1]) for x in values[1:]])) data = sc.textFile("data/mllib/sample_linear_regression_data.txt") parsedData = data.map(parsePoint) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala index db640ccc4a08e..72728ccc5ae43 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala @@ -38,7 +38,7 @@ object BinaryClassificationMetrics { import sqlContext.implicits._ // $example on$ // Load training data in LIBSVM format - val data = MLUtils.loadLibSVMFile(sc, "data/mllib/data/mllib/sample_binary_classification_data.txt") + val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt") // Split data into training (60%) and test (40%) val Array(training, test) = data.randomSplit(Array(0.6, 0.4), seed = 11L) @@ -106,4 +106,5 @@ object 
BinaryClassificationMetrics { } } + // scalastyle:on println \ No newline at end of file diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala index 020b86d2b332c..4ae6c48364208 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala @@ -20,6 +20,7 @@ package org.apache.spark.examples.mllib import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkContext, SparkConf} + // $example on$ import org.apache.spark.mllib.evaluation.MultilabelMetrics import org.apache.spark.rdd.RDD; From ad3c01ef932d4f4d5a47c7f1fbf6789b2f70caad Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Fri, 13 Nov 2015 15:16:56 +0530 Subject: [PATCH 06/13] fixed scala style issues --- .../mllib/JavaBinaryClassification.java | 2 +- .../mllib/JavaMultiLabelClassification.java | 2 +- .../mllib/JavaMulticlassClassification.java | 2 +- .../mllib/BinaryClassificationMetrics.scala | 7 ++++--- .../examples/mllib/MultiLabelMetrics.scala | 15 ++++++++------- .../examples/mllib/MulticlassMetrics.scala | 2 +- .../spark/examples/mllib/RankingMetrics.scala | 19 ++++++++++++++----- .../examples/mllib/RegressionMetrics.scala | 4 +++- 8 files changed, 33 insertions(+), 20 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java index 58b255eb598ec..86fbc1aa58c21 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java @@ -23,13 +23,13 @@ import scala.Tuple2; import org.apache.spark.api.java.*; -import org.apache.spark.rdd.RDD; import org.apache.spark.api.java.function.Function; import 
org.apache.spark.mllib.classification.LogisticRegressionModel; import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS; import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.rdd.RDD; import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java index c4d44dd8ea872..88313964c05df 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java @@ -23,8 +23,8 @@ import scala.Tuple2; import org.apache.spark.api.java.*; -import org.apache.spark.rdd.RDD; import org.apache.spark.mllib.evaluation.MultilabelMetrics; +import org.apache.spark.rdd.RDD; import org.apache.spark.SparkConf; import java.util.Arrays; diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java index cc5af0b178fa5..4bec6ffee0ed4 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java @@ -23,7 +23,6 @@ import scala.Tuple2; import org.apache.spark.api.java.*; -import org.apache.spark.rdd.RDD; import org.apache.spark.api.java.function.Function; import org.apache.spark.mllib.classification.LogisticRegressionModel; import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS; @@ -31,6 +30,7 @@ import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.mllib.linalg.Matrix; 
+import org.apache.spark.rdd.RDD; import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala index 72728ccc5ae43..3a6ac425fca23 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala @@ -18,8 +18,6 @@ // scalastyle:off println package org.apache.spark.examples.mllib -import org.apache.spark.sql.SQLContext -import org.apache.spark.{SparkContext, SparkConf} // $example on$ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS @@ -28,6 +26,9 @@ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils // $example off$ +import org.apache.spark.{SparkContext, SparkConf} +import org.apache.spark.sql.SQLContext + object BinaryClassificationMetrics { def main(args: Array[String]) { @@ -107,4 +108,4 @@ object BinaryClassificationMetrics { } } -// scalastyle:on println \ No newline at end of file +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala index 4ae6c48364208..ef19aee6df331 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala @@ -18,17 +18,16 @@ // scalastyle:off println package org.apache.spark.examples.mllib -import org.apache.spark.sql.SQLContext -import org.apache.spark.{SparkContext, SparkConf} - // $example on$ import org.apache.spark.mllib.evaluation.MultilabelMetrics import org.apache.spark.rdd.RDD; // $example off$ -object MultiLabelMetrics { - def main(args: Array[String]) { +import 
org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkContext, SparkConf} +object MultiLabelMetrics { + def main(args: Array[String]) { val conf = new SparkConf().setAppName("MultiLabelMetrics") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) @@ -53,7 +52,8 @@ object MultiLabelMetrics { println(s"Accuracy = ${metrics.accuracy}") // Individual label stats - metrics.labels.foreach(label => println(s"Class $label precision = ${metrics.precision(label)}")) + metrics.labels.foreach(label => + println(s"Class $label precision = ${metrics.precision(label)}")) metrics.labels.foreach(label => println(s"Class $label recall = ${metrics.recall(label)}")) metrics.labels.foreach(label => println(s"Class $label F1-score = ${metrics.f1Measure(label)}")) @@ -69,4 +69,5 @@ object MultiLabelMetrics { println(s"Subset accuracy = ${metrics.subsetAccuracy}") // $example off$ } -} \ No newline at end of file +} +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala index 0ed3c633f19d8..6091fcb8be0c2 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala @@ -100,4 +100,4 @@ object MulticlassMetrics { } } -// scalastyle:on println \ No newline at end of file +// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala index 9a7a25357f596..047df02719074 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala @@ -20,9 +20,12 @@ package org.apache.spark.examples.mllib import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkContext, 
SparkConf} + // $example on$ + import org.apache.spark.mllib.evaluation.{RegressionMetrics, RankingMetrics} import org.apache.spark.mllib.recommendation.{ALS, Rating} + // $example off$ object RankingMetrics { @@ -40,7 +43,8 @@ object RankingMetrics { }.cache() // Map ratings to 1 or 0, 1 indicating a movie that should be recommended - val binarizedRatings = ratings.map(r => Rating(r.user, r.product, if (r.rating > 0) 1.0 else 0.0)).cache() + val binarizedRatings = ratings.map(r => Rating(r.user, r.product, + if (r.rating > 0) 1.0 else 0.0)).cache() // Summarize ratings val numRatings = ratings.count() @@ -68,7 +72,8 @@ object RankingMetrics { // Assume that any movie a user rated 3 or higher (which maps to a 1) is a relevant document // Compare with top ten most relevant documents val userMovies = binarizedRatings.groupBy(_.user) - val relevantDocuments = userMovies.join(userRecommended).map { case (user, (actual, predictions)) => + val relevantDocuments = userMovies.join(userRecommended).map { case (user, (actual, + predictions)) => (predictions.map(_.product), actual.filter(_.rating > 0.0).map(_.product).toArray) } @@ -89,9 +94,11 @@ object RankingMetrics { } // Get predictions for each data point - val allPredictions = model.predict(ratings.map(r => (r.user, r.product))).map(r => ((r.user, r.product), r.rating)) + val allPredictions = model.predict(ratings.map(r => (r.user, r.product))).map(r => ((r.user, + r.product), r.rating)) val allRatings = ratings.map(r => ((r.user, r.product), r.rating)) - val predictionsAndLabels = allPredictions.join(allRatings).map { case ((user, product), (predicted, actual)) => + val predictionsAndLabels = allPredictions.join(allRatings).map { case ((user, product), + (predicted, actual)) => (predicted, actual) } @@ -103,4 +110,6 @@ object RankingMetrics { println(s"R-squared = ${regressionMetrics.r2}") // $example off$ } -} \ No newline at end of file +} +// scalastyle:on println + diff --git 
a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala index 7dc77caeafa7a..5bbcf59a0d774 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala @@ -66,4 +66,6 @@ object RegressionMetrics { println(s"Explained variance = ${metrics.explainedVariance}") // $example off$ } -} \ No newline at end of file +} +// scalastyle:on println + From 4d18447c80a5db25499d731e150750b2c39db7f6 Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Fri, 13 Nov 2015 15:19:48 +0530 Subject: [PATCH 07/13] fixed java style issues --- .../examples/mllib/JavaMultiLabelClassification.java | 7 +++---- .../org/apache/spark/examples/mllib/JavaRanking.java | 12 ++++-------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java index 88313964c05df..b7283000db047 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java @@ -18,17 +18,16 @@ // scalastyle:off println package org.apache.spark.examples.mllib; +// $example off$ +import java.util.Arrays; +import java.util.List; // $example on$ - import scala.Tuple2; import org.apache.spark.api.java.*; import org.apache.spark.mllib.evaluation.MultilabelMetrics; import org.apache.spark.rdd.RDD; import org.apache.spark.SparkConf; - -import java.util.Arrays; -import java.util.List; // $example off$ import org.apache.spark.SparkContext; diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java index 
2fca06b25ebe5..18723b8beb38c 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java @@ -19,22 +19,18 @@ package org.apache.spark.examples.mllib; // $example on$ - +import java.util.*; import scala.Tuple2; import org.apache.spark.api.java.*; -import org.apache.spark.rdd.RDD; -import org.apache.spark.mllib.recommendation.MatrixFactorizationModel; -import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.Function; - -import java.util.*; - import org.apache.spark.mllib.evaluation.RegressionMetrics; import org.apache.spark.mllib.evaluation.RankingMetrics; import org.apache.spark.mllib.recommendation.ALS; +import org.apache.spark.mllib.recommendation.MatrixFactorizationModel; import org.apache.spark.mllib.recommendation.Rating; - +import org.apache.spark.rdd.RDD; +import org.apache.spark.SparkConf; // Read in the ratings data public class JavaRanking { public static void main(String[] args) { From 3c40a35d665c821f1f2f7cbdc3af2c0f9e4ff45d Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Wed, 18 Nov 2015 12:07:55 +0530 Subject: [PATCH 08/13] fixed styling issues --- docs/mllib-evaluation-metrics.md | 9 +- ...vaBinaryClassificationMetricsExample.java} | 40 ++--- ....java => JavaLinearRegressionExample.java} | 47 +++--- ...ltiLabelClassificationMetricsExample.java} | 35 +++-- ...lticlassClassificationMetricsExample.java} | 39 +++-- ...ng.java => JavaRankingMetricsExample.java} | 145 +++++++++--------- ... binary_classification_metrics_example.py} | 11 +- ...rics.py => multi_class_metrics_example.py} | 2 +- ...rics.py => multi_label_metrics_example.py} | 3 +- ..._metrics.py => ranking_metrics_example.py} | 2 +- ...trics.py => regression_metrics_example.py} | 3 +- ... 
BinaryClassificationMetricsExample.scala} | 11 +- ...s.scala => MultiLabelMetricsExample.scala} | 9 +- ...s.scala => MulticlassMetricsExample.scala} | 10 +- ...rics.scala => RankingMetricsExample.scala} | 10 +- ...s.scala => RegressionMetricsExample.scala} | 9 +- 16 files changed, 182 insertions(+), 203 deletions(-) rename examples/src/main/java/org/apache/spark/examples/mllib/{JavaBinaryClassification.java => JavaBinaryClassificationMetricsExample.java} (83%) rename examples/src/main/java/org/apache/spark/examples/mllib/{JavaLinearRegression.java => JavaLinearRegressionExample.java} (71%) rename examples/src/main/java/org/apache/spark/examples/mllib/{JavaMultiLabelClassification.java => JavaMultiLabelClassificationMetricsExample.java} (74%) rename examples/src/main/java/org/apache/spark/examples/mllib/{JavaMulticlassClassification.java => JavaMulticlassClassificationMetricsExample.java} (81%) rename examples/src/main/java/org/apache/spark/examples/mllib/{JavaRanking.java => JavaRankingMetricsExample.java} (56%) rename examples/src/main/python/mllib/{binary_classification_metrics.py => binary_classification_metrics_example.py} (95%) rename examples/src/main/python/mllib/{multi_class_metrics.py => multi_class_metrics_example.py} (97%) rename examples/src/main/python/mllib/{multi_label_metrics.py => multi_label_metrics_example.py} (97%) rename examples/src/main/python/mllib/{ranking_metrics.py => ranking_metrics_example.py} (97%) rename examples/src/main/python/mllib/{regression_metrics.py => regression_metrics_example.py} (97%) rename examples/src/main/scala/org/apache/spark/examples/mllib/{BinaryClassificationMetrics.scala => BinaryClassificationMetricsExample.scala} (97%) rename examples/src/main/scala/org/apache/spark/examples/mllib/{MultiLabelMetrics.scala => MultiLabelMetricsExample.scala} (94%) rename examples/src/main/scala/org/apache/spark/examples/mllib/{MulticlassMetrics.scala => MulticlassMetricsExample.scala} (97%) rename 
examples/src/main/scala/org/apache/spark/examples/mllib/{RankingMetrics.scala => RankingMetricsExample.scala} (97%) rename examples/src/main/scala/org/apache/spark/examples/mllib/{RegressionMetrics.scala => RegressionMetricsExample.scala} (94%) diff --git a/docs/mllib-evaluation-metrics.md b/docs/mllib-evaluation-metrics.md index 7a9792c4a1455..138a1b297ad33 100644 --- a/docs/mllib-evaluation-metrics.md +++ b/docs/mllib-evaluation-metrics.md @@ -111,7 +111,7 @@ Refer to the [`LogisticRegressionWithLBFGS` Scala docs](api/scala/index.html#org
Refer to the [`LogisticRegressionModel` Java docs](api/java/org/apache/spark/mllib/classification/LogisticRegressionModel.html) and [`LogisticRegressionWithLBFGS` Java docs](api/java/org/apache/spark/mllib/classification/LogisticRegressionWithLBFGS.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaBinaryClassification.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java %}
@@ -247,7 +247,7 @@ Refer to the [`MulticlassMetrics` Scala docs](api/scala/index.html#org.apache.sp
Refer to the [`MulticlassMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/MulticlassMetrics.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java %}
@@ -397,7 +398,7 @@ Refer to the [`MultilabelMetrics` Scala docs](api/scala/index.html#org.apache.sp
Refer to the [`MultilabelMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/MultilabelMetrics.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java %}
@@ -525,7 +526,7 @@ Refer to the [`RegressionMetrics` Scala docs](api/scala/index.html#org.apache.sp
Refer to the [`RegressionMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/RegressionMetrics.html) and [`RankingMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/RankingMetrics.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaRanking.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java %}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java similarity index 83% rename from examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java index 86fbc1aa58c21..c77c6ba52c26c 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java @@ -19,7 +19,6 @@ package org.apache.spark.examples.mllib; // $example on$ - import scala.Tuple2; import org.apache.spark.api.java.*; @@ -30,13 +29,15 @@ import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.rdd.RDD; +// $example off$ import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; -public class JavaBinaryClassification { +public class JavaBinaryClassificationMetricsExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Binary Classification Metrics"); + SparkConf conf = new SparkConf().setAppName("Binary Classification Metrics Example"); SparkContext sc = new SparkContext(conf); + // $example on$ String path = "data/mllib/sample_binary_classification_data.txt"; JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); @@ -47,20 +48,20 @@ public static void main(String[] args) { // Run training algorithm to build the model. final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() - .setNumClasses(2) - .run(training.rdd()); + .setNumClasses(2) + .run(training.rdd()); // Clear the prediction threshold so the model will return probabilities model.clearThreshold(); // Compute raw scores on the test set. 
JavaRDD> predictionAndLabels = test.map( - new Function>() { - public Tuple2 call(LabeledPoint p) { - Double prediction = model.predict(p.features()); - return new Tuple2(prediction, p.label()); - } - } + new Function>() { + public Tuple2 call(LabeledPoint p) { + Double prediction = model.predict(p.features()); + return new Tuple2(prediction, p.label()); + } + } ); // Get evaluation metrics. @@ -87,11 +88,11 @@ public Tuple2 call(LabeledPoint p) { // Thresholds JavaRDD thresholds = precision.map( - new Function, Double>() { - public Double call(Tuple2 t) { - return new Double(t._1().toString()); - } - } + new Function, Double>() { + public Double call(Tuple2 t) { + return new Double(t._1().toString()); + } + } ); // ROC Curve @@ -105,8 +106,9 @@ public Double call(Tuple2 t) { System.out.println("Area under ROC = " + metrics.areaUnderROC()); // Save and load model - model.save(sc, "myModelPath"); - LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); + model.save(sc, "target/tmp/LogisticRegressionModel"); + LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, + "target/tmp/LogisticRegressionModel"); + // $example off$ } } -// $example off$ \ No newline at end of file diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java similarity index 71% rename from examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java index 6781ec619b89e..76f99d39c2763 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegression.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java @@ -19,7 +19,6 @@ package org.apache.spark.examples.mllib; // $example on$ - import scala.Tuple2; import org.apache.spark.api.java.*; @@ -30,42 
+29,42 @@ import org.apache.spark.mllib.regression.LinearRegressionWithSGD; import org.apache.spark.mllib.evaluation.RegressionMetrics; import org.apache.spark.SparkConf; - +// $example off$ // Read in the ratings data -public class JavaLinearRegression { +public class JavaLinearRegressionExample { public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("Linear Regression Example"); JavaSparkContext sc = new JavaSparkContext(conf); - + // $example on$ // Load and parse the data String path = "data/mllib/sample_linear_regression_data.txt"; JavaRDD data = sc.textFile(path); JavaRDD parsedData = data.map( - new Function() { - public LabeledPoint call(String line) { - String[] parts = line.split(" "); - double[] v = new double[parts.length - 1]; - for (int i = 1; i < parts.length - 1; i++) - v[i - 1] = Double.parseDouble(parts[i].split(":")[1]); - return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v)); - } - } + new Function() { + public LabeledPoint call(String line) { + String[] parts = line.split(" "); + double[] v = new double[parts.length - 1]; + for (int i = 1; i < parts.length - 1; i++) + v[i - 1] = Double.parseDouble(parts[i].split(":")[1]); + return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v)); + } + } ); parsedData.cache(); // Building the model int numIterations = 100; - final LinearRegressionModel model = - LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations); + final LinearRegressionModel model = LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), + numIterations); // Evaluate model on training examples and compute training error JavaRDD> valuesAndPreds = parsedData.map( - new Function>() { - public Tuple2 call(LabeledPoint point) { - double prediction = model.predict(point.features()); - return new Tuple2(prediction, point.label()); - } - } + new Function>() { + public Tuple2 call(LabeledPoint point) { + double prediction = model.predict(point.features()); + return 
new Tuple2(prediction, point.label()); + } + } ); // Instantiate metrics object @@ -85,8 +84,8 @@ public Tuple2 call(LabeledPoint point) { System.out.format("Explained Variance = %f\n", metrics.explainedVariance()); // Save and load model - model.save(sc.sc(), "myModelPath"); - LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), "myModelPath"); + model.save(sc.sc(), "target/tmp/LogisticRegressionModel"); + LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), "target/tmp/LogisticRegressionModel"); + // $example on$ } } -// $example off$ \ No newline at end of file diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java similarity index 74% rename from examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java index b7283000db047..c69f315298f2a 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java @@ -18,7 +18,7 @@ // scalastyle:off println package org.apache.spark.examples.mllib; -// $example off$ + import java.util.Arrays; import java.util.List; // $example on$ @@ -31,20 +31,19 @@ // $example off$ import org.apache.spark.SparkContext; -// $example on$ -public class JavaMultiLabelClassification { +public class JavaMultiLabelClassificationMetricsExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Multilabel Classification Metrics"); + SparkConf conf = new SparkConf().setAppName("Multilabel Classification Metrics Example"); JavaSparkContext sc = new JavaSparkContext(conf); - + // $example on$ List> data = Arrays.asList( - new Tuple2(new 
double[]{0.0, 1.0}, new double[]{0.0, 2.0}), - new Tuple2(new double[]{0.0, 2.0}, new double[]{0.0, 1.0}), - new Tuple2(new double[]{}, new double[]{0.0}), - new Tuple2(new double[]{2.0}, new double[]{2.0}), - new Tuple2(new double[]{2.0, 0.0}, new double[]{2.0, 0.0}), - new Tuple2(new double[]{0.0, 1.0, 2.0}, new double[]{0.0, 1.0}), - new Tuple2(new double[]{1.0}, new double[]{1.0, 2.0}) + new Tuple2(new double[]{0.0, 1.0}, new double[]{0.0, 2.0}), + new Tuple2(new double[]{0.0, 2.0}, new double[]{0.0, 1.0}), + new Tuple2(new double[]{}, new double[]{0.0}), + new Tuple2(new double[]{2.0}, new double[]{2.0}), + new Tuple2(new double[]{2.0, 0.0}, new double[]{2.0, 0.0}), + new Tuple2(new double[]{0.0, 1.0, 2.0}, new double[]{0.0, 1.0}), + new Tuple2(new double[]{1.0}, new double[]{1.0, 2.0}) ); JavaRDD> scoreAndLabels = sc.parallelize(data); @@ -59,9 +58,12 @@ public static void main(String[] args) { // Stats by labels for (int i = 0; i < metrics.labels().length - 1; i++) { - System.out.format("Class %1.1f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])); - System.out.format("Class %1.1f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])); - System.out.format("Class %1.1f F1 score = %f\n", metrics.labels()[i], metrics.f1Measure(metrics.labels()[i])); + System.out.format("Class %1.1f precision = %f\n", metrics.labels()[i], metrics.precision + (metrics.labels()[i])); + System.out.format("Class %1.1f recall = %f\n", metrics.labels()[i], metrics.recall(metrics + .labels()[i])); + System.out.format("Class %1.1f F1 score = %f\n", metrics.labels()[i], metrics.f1Measure + (metrics.labels()[i])); } // Micro stats @@ -74,7 +76,6 @@ public static void main(String[] args) { // Subset accuracy System.out.format("Subset accuracy = %f\n", metrics.subsetAccuracy()); - + // $example off$ } } -// $example off$ \ No newline at end of file diff --git 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java similarity index 81% rename from examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java index 4bec6ffee0ed4..e05494a1da4de 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassification.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java @@ -19,7 +19,6 @@ package org.apache.spark.examples.mllib // $example on$ - import scala.Tuple2; import org.apache.spark.api.java.*; @@ -30,14 +29,16 @@ import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.mllib.linalg.Matrix; +// $example off$ import org.apache.spark.rdd.RDD; import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; -public class JavaMulticlassClassification { +public class JavaMulticlassClassificationMetricsExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Multi class Classification Metrics"); + SparkConf conf = new SparkConf().setAppName("Multi class Classification Metrics Example"); SparkContext sc = new SparkContext(conf); + // $example on$ String path = "data/mllib/sample_multiclass_classification_data.txt"; JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); @@ -48,17 +49,17 @@ public static void main(String[] args) { // Run training algorithm to build the model. final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() - .setNumClasses(3) - .run(training.rdd()); + .setNumClasses(3) + .run(training.rdd()); // Compute raw scores on the test set. 
JavaRDD> predictionAndLabels = test.map( - new Function>() { - public Tuple2 call(LabeledPoint p) { - Double prediction = model.predict(p.features()); - return new Tuple2(prediction, p.label()); - } - } + new Function>() { + public Tuple2 call(LabeledPoint p) { + Double prediction = model.predict(p.features()); + return new Tuple2(prediction, p.label()); + } + } ); // Get evaluation metrics. @@ -75,9 +76,12 @@ public Tuple2 call(LabeledPoint p) { // Stats by labels for (int i = 0; i < metrics.labels().length; i++) { - System.out.format("Class %f precision = %f\n", metrics.labels()[i], metrics.precision(metrics.labels()[i])); - System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics.labels()[i])); - System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure(metrics.labels()[i])); + System.out.format("Class %f precision = %f\n", metrics.labels()[i],metrics.precision + (metrics.labels()[i])); + System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics + .labels()[i])); + System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure + (metrics.labels()[i])); } //Weighted stats @@ -87,8 +91,9 @@ public Tuple2 call(LabeledPoint p) { System.out.format("Weighted false positive rate = %f\n", metrics.weightedFalsePositiveRate()); // Save and load model - model.save(sc, "myModelPath"); - LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "myModelPath"); + model.save(sc, "target/tmp/LogisticRegressionModel"); + LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, + "target/tmp/LogisticRegressionModel"); + // $example off$ } } -// $example off$ \ No newline at end of file diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java similarity index 56% rename from 
examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java index 18723b8beb38c..2f64a58dceeb2 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRanking.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java @@ -20,6 +20,7 @@ // $example on$ import java.util.*; + import scala.Tuple2; import org.apache.spark.api.java.*; @@ -29,22 +30,25 @@ import org.apache.spark.mllib.recommendation.ALS; import org.apache.spark.mllib.recommendation.MatrixFactorizationModel; import org.apache.spark.mllib.recommendation.Rating; +// $example off$ import org.apache.spark.rdd.RDD; import org.apache.spark.SparkConf; -// Read in the ratings data -public class JavaRanking { + +public class JavaRankingMetricsExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Ranking Metrics"); + SparkConf conf = new SparkConf().setAppName("Ranking Metrics Example"); JavaSparkContext sc = new JavaSparkContext(conf); + // $example on$ String path = "data/mllib/sample_movielens_data.txt"; JavaRDD data = sc.textFile(path); JavaRDD ratings = data.map( - new Function() { - public Rating call(String line) { - String[] parts = line.split("::"); - return new Rating(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]), Double.parseDouble(parts[2]) - 2.5); - } - } + new Function() { + public Rating call(String line) { + String[] parts = line.split("::"); + return new Rating(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]), Double + .parseDouble(parts[2]) - 2.5); + } + } ); ratings.cache(); @@ -54,32 +58,32 @@ public Rating call(String line) { // Get top 10 recommendations for every user and scale ratings from 0 to 1 JavaRDD> userRecs = model.recommendProductsForUsers(10).toJavaRDD(); JavaRDD> userRecsScaled = userRecs.map( - new Function, Tuple2>() { - public Tuple2 call(Tuple2 t) { - Rating[] 
scaledRatings = new Rating[t._2().length]; - for (int i = 0; i < scaledRatings.length; i++) { - double newRating = Math.max(Math.min(t._2()[i].rating(), 1.0), 0.0); - scaledRatings[i] = new Rating(t._2()[i].user(), t._2()[i].product(), newRating); - } - return new Tuple2(t._1(), scaledRatings); - } - } + new Function, Tuple2>() { + public Tuple2 call(Tuple2 t) { + Rating[] scaledRatings = new Rating[t._2().length]; + for (int i = 0; i < scaledRatings.length; i++) { + double newRating = Math.max(Math.min(t._2()[i].rating(), 1.0), 0.0); + scaledRatings[i] = new Rating(t._2()[i].user(), t._2()[i].product(), newRating); + } + return new Tuple2(t._1(), scaledRatings); + } + } ); JavaPairRDD userRecommended = JavaPairRDD.fromJavaRDD(userRecsScaled); // Map ratings to 1 or 0, 1 indicating a movie that should be recommended JavaRDD binarizedRatings = ratings.map( - new Function() { - public Rating call(Rating r) { - double binaryRating; - if (r.rating() > 0.0) { - binaryRating = 1.0; - } else { - binaryRating = 0.0; - } - return new Rating(r.user(), r.product(), binaryRating); - } - } + new Function() { + public Rating call(Rating r) { + double binaryRating; + if (r.rating() > 0.0) { + binaryRating = 1.0; + } else { + binaryRating = 0.0; + } + return new Rating(r.user(), r.product(), binaryRating); + } + } ); // Group ratings by common user @@ -93,32 +97,33 @@ public Object call(Rating r) { // Get true relevant documents from all user ratings JavaPairRDD> userMoviesList = userMovies.mapValues( - new Function, List>() { - public List call(Iterable docs) { - List products = new ArrayList(); - for (Rating r : docs) { - if (r.rating() > 0.0) { - products.add(r.product()); - } - } - return products; - } + new Function, List>() { + public List call(Iterable docs) { + List products = new ArrayList(); + for (Rating r : docs) { + if (r.rating() > 0.0) { + products.add(r.product()); } + } + return products; + } + } ); // Extract the product id from each recommendation JavaPairRDD> 
userRecommendedList = userRecommended.mapValues( - new Function>() { - public List call(Rating[] docs) { - List products = new ArrayList(); - for (Rating r : docs) { - products.add(r.product()); - } - return products; - } - } + new Function>() { + public List call(Rating[] docs) { + List products = new ArrayList(); + for (Rating r : docs) { + products.add(r.product()); + } + return products; + } + } ); - JavaRDD, List>> relevantDocs = userMoviesList.join(userRecommendedList).values(); + JavaRDD, List>> relevantDocs = userMoviesList.join + (userRecommendedList).values(); // Instantiate the metrics object RankingMetrics metrics = RankingMetrics.of(relevantDocs); @@ -135,29 +140,29 @@ public List call(Rating[] docs) { // Evaluate the model using numerical ratings and regression metrics JavaRDD> userProducts = ratings.map( - new Function>() { - public Tuple2 call(Rating r) { - return new Tuple2(r.user(), r.product()); - } - } + new Function>() { + public Tuple2 call(Rating r) { + return new Tuple2(r.user(), r.product()); + } + } ); JavaPairRDD, Object> predictions = JavaPairRDD.fromJavaRDD( - model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map( - new Function, Object>>() { - public Tuple2, Object> call(Rating r) { - return new Tuple2, Object>( - new Tuple2(r.user(), r.product()), r.rating()); - } - } - )); + model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map( + new Function, Object>>() { + public Tuple2, Object> call(Rating r) { + return new Tuple2, Object>( + new Tuple2(r.user(), r.product()), r.rating()); + } + } + )); JavaRDD> ratesAndPreds = JavaPairRDD.fromJavaRDD(ratings.map( - new Function, Object>>() { - public Tuple2, Object> call(Rating r) { - return new Tuple2, Object>( - new Tuple2(r.user(), r.product()), r.rating()); - } - } + new Function, Object>>() { + public Tuple2, Object> call(Rating r) { + return new Tuple2, Object>( + new Tuple2(r.user(), r.product()), r.rating()); + } + } )).join(predictions).values(); // Create regression metrics 
object @@ -168,6 +173,6 @@ public Tuple2, Object> call(Rating r) { // R-squared System.out.format("R-squared = %f\n", regressionMetrics.r2()); + // $example off$ } } -// $example off$ \ No newline at end of file diff --git a/examples/src/main/python/mllib/binary_classification_metrics.py b/examples/src/main/python/mllib/binary_classification_metrics_example.py similarity index 95% rename from examples/src/main/python/mllib/binary_classification_metrics.py rename to examples/src/main/python/mllib/binary_classification_metrics_example.py index f8c32bbe6154e..38b557108c2ea 100644 --- a/examples/src/main/python/mllib/binary_classification_metrics.py +++ b/examples/src/main/python/mllib/binary_classification_metrics_example.py @@ -14,16 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. # - - """ Binary Classification Metrics Example. """ from __future__ import print_function - import sys - - from pyspark import SparkContext, SQLContext # $example on$ from pyspark.mllib.classification import LogisticRegressionWithLBFGS @@ -33,14 +28,10 @@ # $example off$ if __name__ == "__main__": - sc = SparkContext(appName="BinaryClassificationMetrics") + sc = SparkContext(appName="BinaryClassificationMetricsExample") sqlContext = SQLContext(sc) - # $example on$ # Several of the methods available in scala are currently missing from pyspark - # $example off$ - - # $example on$ # Load training data in LIBSVM format data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt") diff --git a/examples/src/main/python/mllib/multi_class_metrics.py b/examples/src/main/python/mllib/multi_class_metrics_example.py similarity index 97% rename from examples/src/main/python/mllib/multi_class_metrics.py rename to examples/src/main/python/mllib/multi_class_metrics_example.py index 7959b7230a563..cd56b3c97c778 100644 --- a/examples/src/main/python/mllib/multi_class_metrics.py +++ 
b/examples/src/main/python/mllib/multi_class_metrics_example.py @@ -24,7 +24,7 @@ from pyspark import SparkContext if __name__ == "__main__": - sc = SparkContext(appName="MultiClassMetrics") + sc = SparkContext(appName="MultiClassMetricsExample") # Several of the methods available in scala are currently missing from pyspark # $example on$ diff --git a/examples/src/main/python/mllib/multi_label_metrics.py b/examples/src/main/python/mllib/multi_label_metrics_example.py similarity index 97% rename from examples/src/main/python/mllib/multi_label_metrics.py rename to examples/src/main/python/mllib/multi_label_metrics_example.py index d02d8d862d1e1..f293ce8e309e9 100644 --- a/examples/src/main/python/mllib/multi_label_metrics.py +++ b/examples/src/main/python/mllib/multi_label_metrics_example.py @@ -21,9 +21,8 @@ from pyspark.mllib.util import MLUtils from pyspark import SparkContext - if __name__ == "__main__": - sc = SparkContext(appName="MultiClassMetrics") + sc = SparkContext(appName="MultiLabelMetricsExample") # $example on$ scoreAndLabels = sc.parallelize([ ([0.0, 1.0], [0.0, 2.0]), diff --git a/examples/src/main/python/mllib/ranking_metrics.py b/examples/src/main/python/mllib/ranking_metrics_example.py similarity index 97% rename from examples/src/main/python/mllib/ranking_metrics.py rename to examples/src/main/python/mllib/ranking_metrics_example.py index 6fcdf3032d3dc..deefbd23c5b94 100644 --- a/examples/src/main/python/mllib/ranking_metrics.py +++ b/examples/src/main/python/mllib/ranking_metrics_example.py @@ -22,7 +22,7 @@ from pyspark import SparkContext if __name__ == "__main__": - sc = SparkContext(appName="Ranking Metrics") + sc = SparkContext(appName="Ranking Metrics Example") # Several of the methods available in scala are currently missing from pyspark # $example on$ diff --git a/examples/src/main/python/mllib/regression_metrics.py b/examples/src/main/python/mllib/regression_metrics_example.py similarity index 97% rename from 
examples/src/main/python/mllib/regression_metrics.py rename to examples/src/main/python/mllib/regression_metrics_example.py index aca33aa7f8611..a3e9c12dbc4ea 100644 --- a/examples/src/main/python/mllib/regression_metrics.py +++ b/examples/src/main/python/mllib/regression_metrics_example.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # - # $example on$ from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD from pyspark.mllib.evaluation import RegressionMetrics @@ -24,7 +23,7 @@ from pyspark import SparkContext if __name__ == "__main__": - sc = SparkContext(appName="Regression Metrics") + sc = SparkContext(appName="Regression Metrics Example") # $example on$ # Load and parse the data diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala similarity index 97% rename from examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala index 3a6ac425fca23..466c84ed2dec7 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala @@ -17,8 +17,6 @@ // scalastyle:off println package org.apache.spark.examples.mllib - - // $example on$ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics @@ -29,11 +27,11 @@ import org.apache.spark.mllib.util.MLUtils import org.apache.spark.{SparkContext, SparkConf} import org.apache.spark.sql.SQLContext -object BinaryClassificationMetrics { +object BinaryClassificationMetricsExample { - def main(args: Array[String]) { + def main(args: 
Array[String]): Unit = { - val conf = new SparkConf().setAppName("BinaryClassificationMetrics") + val conf = new SparkConf().setAppName("BinaryClassificationMetricsExample") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ @@ -102,10 +100,7 @@ object BinaryClassificationMetrics { // AUROC val auROC = metrics.areaUnderROC println("Area under ROC = " + auROC) - // $example off$ - } } - // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala similarity index 94% rename from examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala index ef19aee6df331..035f74490b3fe 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala @@ -14,21 +14,18 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - // scalastyle:off println package org.apache.spark.examples.mllib - // $example on$ import org.apache.spark.mllib.evaluation.MultilabelMetrics import org.apache.spark.rdd.RDD; // $example off$ - import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkContext, SparkConf} -object MultiLabelMetrics { - def main(args: Array[String]) { - val conf = new SparkConf().setAppName("MultiLabelMetrics") +object MultiLabelMetricsExample { + def main(args: Array[String]): Unit = { + val conf = new SparkConf().setAppName("MultiLabelMetricsExample") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala similarity index 97% rename from examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala index 6091fcb8be0c2..a3d29a5a091b6 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala @@ -14,23 +14,19 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - // scalastyle:off println package org.apache.spark.examples.mllib - // $example on$ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils // $example off$ - import org.apache.spark.{SparkContext, SparkConf} -object MulticlassMetrics { - - def main(args: Array[String]) { +object MulticlassMetricsExample { + def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("MulticlassMetrics") val sc = new SparkContext(conf) @@ -95,9 +91,7 @@ object MulticlassMetrics { println(s"Weighted recall: ${metrics.weightedRecall}") println(s"Weighted F1 score: ${metrics.weightedFMeasure}") println(s"Weighted false positive rate: ${metrics.weightedFalsePositiveRate}") - // $example off$ - } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala similarity index 97% rename from examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala index 047df02719074..9fe933bf53931 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala @@ -14,24 +14,19 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - // scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkContext, SparkConf} - // $example on$ - import org.apache.spark.mllib.evaluation.{RegressionMetrics, RankingMetrics} import org.apache.spark.mllib.recommendation.{ALS, Rating} - // $example off$ -object RankingMetrics { +object RankingMetricsExample { def main(args: Array[String]) { - - val conf = new SparkConf().setAppName("RankingMetrics") + val conf = new SparkConf().setAppName("RankingMetricsExample") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ @@ -112,4 +107,3 @@ object RankingMetrics { } } // scalastyle:on println - diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala similarity index 94% rename from examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala rename to examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala index 5bbcf59a0d774..fcab66f0df2e5 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetrics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala @@ -14,7 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - // scalastyle:off println package org.apache.spark.examples.mllib // $example on$ @@ -28,11 +27,9 @@ import org.apache.spark.mllib.util.MLUtils import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} -object RegressionMetrics { - - def main(args: Array[String]) { - - val conf = new SparkConf().setAppName("RegressionMetrics") +object RegressionMetricsExample { + def main(args: Array[String]) : Unit = { + val conf = new SparkConf().setAppName("RegressionMetricsExample") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) // $example on$ From 892591b232dea89b943c138ad5a722d527a6801a Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Wed, 18 Nov 2015 12:14:14 +0530 Subject: [PATCH 09/13] fixed mllib-evaluation-metrics.md file --- docs/mllib-evaluation-metrics.md | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/docs/mllib-evaluation-metrics.md b/docs/mllib-evaluation-metrics.md index 138a1b297ad33..3b07eee369de5 100644 --- a/docs/mllib-evaluation-metrics.md +++ b/docs/mllib-evaluation-metrics.md @@ -104,7 +104,7 @@ data, and evaluate the performance of the algorithm by several binary evaluation
Refer to the [`LogisticRegressionWithLBFGS` Scala docs](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS) and [`BinaryClassificationMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.BinaryClassificationMetrics) for details on the API. -{% include_example scala/org/apache/spark/examples/mllib/BinaryClassificationMetrics.scala %} +{% include_example scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala %}
@@ -118,7 +118,7 @@ Refer to the [`LogisticRegressionModel` Java docs](api/java/org/apache/spark/mll
Refer to the [`BinaryClassificationMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.BinaryClassificationMetrics) and [`LogisticRegressionWithLBFGS` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.classification.LogisticRegressionWithLBFGS) for more details on the API. -{% include_example python/mllib/binary_classification_metrics.py %} +{% include_example python/mllib/binary_classification_metrics_example.py %}
@@ -240,7 +240,7 @@ the data, and evaluate the performance of the algorithm by several multiclass cl
Refer to the [`MulticlassMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.MulticlassMetrics) for details on the API. -{% include_example scala/org/apache/spark/examples/mllib/MulticlassMetrics.scala %} +{% include_example scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala %}
@@ -255,7 +255,7 @@ Refer to the [`MulticlassMetrics` Java docs](api/java/org/apache/spark/mllib/eva
Refer to the [`MulticlassMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.MulticlassMetrics) for more details on the API. -{% include_example python/mllib/multi_class_metrics.py %} +{% include_example python/mllib/multi_class_metrics_example.py %}
@@ -391,21 +391,22 @@ True classes:
Refer to the [`MultilabelMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.MultilabelMetrics) for details on the API. -{% include_example scala/org/apache/spark/examples/mllib/MultiLabelMetrics.scala %} +{% include_example scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala %}
Refer to the [`MultilabelMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/MultilabelMetrics.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetrics.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample +.java %}
Refer to the [`MultilabelMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.MultilabelMetrics) for more details on the API. -{% include_example python/mllib/multi_label_metrics.py %} +{% include_example python/mllib/multi_label_metrics_example.py %}
@@ -519,7 +520,7 @@ expanded world of non-positive weights are "the same as never having interacted
Refer to the [`RegressionMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.RegressionMetrics) and [`RankingMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.RankingMetrics) for details on the API. -{% include_example scala/org/apache/spark/examples/mllib/RankingMetrics.scala %} +{% include_example scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala %}
@@ -533,7 +534,7 @@ Refer to the [`RegressionMetrics` Java docs](api/java/org/apache/spark/mllib/eva
Refer to the [`RegressionMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RegressionMetrics) and [`RankingMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics) for more details on the API. -{% include_example python/mllib/ranking_metrics.py %} +{% include_example python/mllib/ranking_metrics_example.py %}
@@ -583,21 +584,21 @@ and evaluate the performance of the algorithm by several regression metrics.
Refer to the [`RegressionMetrics` Scala docs](api/scala/index.html#org.apache.spark.mllib.evaluation.RegressionMetrics) for details on the API. -{% include_example scala/org/apache/spark/examples/mllib/RegressionMetrics.scala %} +{% include_example scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala %}
Refer to the [`RegressionMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/RegressionMetrics.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaLinearRegression.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java %}
Refer to the [`RegressionMetrics` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RegressionMetrics) for more details on the API. -{% include_example python/mllib/regression_metrics.py %} +{% include_example python/mllib/regression_metrics_example.py %}
From 8d2d508a89bf329f57d423345e81ca20f27fd541 Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Wed, 18 Nov 2015 12:42:37 +0530 Subject: [PATCH 10/13] fixed java issues --- docs/mllib-evaluation-metrics.md | 6 ++---- .../JavaBinaryClassificationMetricsExample.java | 3 --- .../examples/mllib/JavaLinearRegressionExample.java | 13 ++++++------- .../JavaMultiLabelClassificationMetricsExample.java | 2 -- .../JavaMulticlassClassificationMetricsExample.java | 5 +---- 5 files changed, 9 insertions(+), 20 deletions(-) diff --git a/docs/mllib-evaluation-metrics.md b/docs/mllib-evaluation-metrics.md index 3b07eee369de5..a1afde0dc6635 100644 --- a/docs/mllib-evaluation-metrics.md +++ b/docs/mllib-evaluation-metrics.md @@ -247,8 +247,7 @@ Refer to the [`MulticlassMetrics` Scala docs](api/scala/index.html#org.apache.sp
Refer to the [`MulticlassMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/MulticlassMetrics.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample -.java %} + {% include_example java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java %}
@@ -398,8 +397,7 @@ Refer to the [`MultilabelMetrics` Scala docs](api/scala/index.html#org.apache.sp
Refer to the [`MultilabelMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/MultilabelMetrics.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample -.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java %}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java index c77c6ba52c26c..d905aa82d5e49 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java @@ -14,10 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -// scalastyle:off println package org.apache.spark.examples.mllib; - // $example on$ import scala.Tuple2; diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java index 76f99d39c2763..5f4e2d74f04ef 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java @@ -14,10 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - -// scalastyle:off println package org.apache.spark.examples.mllib; - // $example on$ import scala.Tuple2; @@ -30,6 +27,7 @@ import org.apache.spark.mllib.evaluation.RegressionMetrics; import org.apache.spark.SparkConf; // $example off$ + // Read in the ratings data public class JavaLinearRegressionExample { public static void main(String[] args) { @@ -46,7 +44,7 @@ public LabeledPoint call(String line) { double[] v = new double[parts.length - 1]; for (int i = 1; i < parts.length - 1; i++) v[i - 1] = Double.parseDouble(parts[i].split(":")[1]); - return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v)); + return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v)); } } ); @@ -54,7 +52,7 @@ public LabeledPoint call(String line) { // Building the model int numIterations = 100; - final LinearRegressionModel model = LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), + final LinearRegressionModel model = LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations); // Evaluate model on training examples and compute training error @@ -85,7 +83,8 @@ public Tuple2 call(LabeledPoint point) { // Save and load model model.save(sc.sc(), "target/tmp/LogisticRegressionModel"); - LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), "target/tmp/LogisticRegressionModel"); - // $example on$ + LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), + "target/tmp/LogisticRegressionModel"); + // $example off$ } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java index c69f315298f2a..d6b9178b2c4a0 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java @@ -14,8 +14,6 @@ * See 
the License for the specific language governing permissions and * limitations under the License. */ - -// scalastyle:off println package org.apache.spark.examples.mllib; diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java index e05494a1da4de..52d0b3354bf46 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java @@ -14,10 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -// scalastyle:off println package org.apache.spark.examples.mllib - // $example on$ import scala.Tuple2; @@ -50,7 +47,7 @@ public static void main(String[] args) { // Run training algorithm to build the model. final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() .setNumClasses(3) - .run(training.rdd()); + .run(training.rdd()); // Compute raw scores on the test set. 
JavaRDD> predictionAndLabels = test.map( From 54008ced6c3a380cac71bb85b2a7da2a83fb8c20 Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Thu, 19 Nov 2015 12:32:48 +0530 Subject: [PATCH 11/13] fixed import issue --- .../mllib/JavaMulticlassClassificationMetricsExample.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java index 52d0b3354bf46..9f3426a0ba6de 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.examples.mllib +package org.apache.spark.examples.mllib; // $example on$ import scala.Tuple2; From 1c5cc8f1753942e4c1efb7f868a32d6b30420159 Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Thu, 19 Nov 2015 15:46:09 +0530 Subject: [PATCH 12/13] fixed spacing and removed few imports and re-ordered imports --- docs/mllib-evaluation-metrics.md | 2 +- ...avaBinaryClassificationMetricsExample.java | 10 +++-- ...ultiLabelClassificationMetricsExample.java | 11 +++--- ...ulticlassClassificationMetricsExample.java | 13 ++++--- .../mllib/JavaRankingMetricsExample.java | 38 +++++++++---------- ...java => JavaRegressionMetricsExample.java} | 11 +++--- .../mllib/multi_label_metrics_example.py | 1 - .../python/mllib/ranking_metrics_example.py | 2 +- .../mllib/regression_metrics_example.py | 1 - .../BinaryClassificationMetricsExample.scala | 5 +-- .../mllib/MultiLabelMetricsExample.scala | 9 ++--- .../mllib/MulticlassMetricsExample.scala | 4 +- .../mllib/RankingMetricsExample.scala | 5 ++- .../mllib/RegressionMetricsExample.scala | 
6 +-- 14 files changed, 59 insertions(+), 59 deletions(-) rename examples/src/main/java/org/apache/spark/examples/mllib/{JavaLinearRegressionExample.java => JavaRegressionMetricsExample.java} (93%) diff --git a/docs/mllib-evaluation-metrics.md b/docs/mllib-evaluation-metrics.md index a1afde0dc6635..6924037b941f3 100644 --- a/docs/mllib-evaluation-metrics.md +++ b/docs/mllib-evaluation-metrics.md @@ -589,7 +589,7 @@ Refer to the [`RegressionMetrics` Scala docs](api/scala/index.html#org.apache.sp
Refer to the [`RegressionMetrics` Java docs](api/java/org/apache/spark/mllib/evaluation/RegressionMetrics.html) for details on the API. -{% include_example java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java %} +{% include_example java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java %}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java index d905aa82d5e49..980a9108af53f 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java @@ -14,7 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.spark.examples.mllib; + // $example on$ import scala.Tuple2; @@ -25,21 +27,21 @@ import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.rdd.RDD; // $example off$ import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; public class JavaBinaryClassificationMetricsExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Binary Classification Metrics Example"); + SparkConf conf = new SparkConf().setAppName("Java Binary Classification Metrics Example"); SparkContext sc = new SparkContext(conf); // $example on$ String path = "data/mllib/sample_binary_classification_data.txt"; JavaRDD data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); // Split initial RDD into two... [60% training data, 40% testing data]. 
- JavaRDD[] splits = data.randomSplit(new double[]{0.6, 0.4}, 11L); + JavaRDD[] splits = + data.randomSplit(new double[]{0.6, 0.4}, 11L); JavaRDD training = splits[0].cache(); JavaRDD test = splits[1]; @@ -105,7 +107,7 @@ public Double call(Tuple2 t) { // Save and load model model.save(sc, "target/tmp/LogisticRegressionModel"); LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, - "target/tmp/LogisticRegressionModel"); + "target/tmp/LogisticRegressionModel"); // $example off$ } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java index d6b9178b2c4a0..b54e1ea3f2bcf 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMultiLabelClassificationMetricsExample.java @@ -14,12 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.spark.examples.mllib; +package org.apache.spark.examples.mllib; +// $example on$ import java.util.Arrays; import java.util.List; -// $example on$ + import scala.Tuple2; import org.apache.spark.api.java.*; @@ -57,11 +58,11 @@ public static void main(String[] args) { // Stats by labels for (int i = 0; i < metrics.labels().length - 1; i++) { System.out.format("Class %1.1f precision = %f\n", metrics.labels()[i], metrics.precision - (metrics.labels()[i])); + (metrics.labels()[i])); System.out.format("Class %1.1f recall = %f\n", metrics.labels()[i], metrics.recall(metrics - .labels()[i])); + .labels()[i])); System.out.format("Class %1.1f F1 score = %f\n", metrics.labels()[i], metrics.f1Measure - (metrics.labels()[i])); + (metrics.labels()[i])); } // Micro stats diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java index 9f3426a0ba6de..21f628fb51b6e 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java @@ -14,7 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.spark.examples.mllib; + // $example on$ import scala.Tuple2; @@ -27,7 +29,6 @@ import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.mllib.linalg.Matrix; // $example off$ -import org.apache.spark.rdd.RDD; import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; @@ -47,7 +48,7 @@ public static void main(String[] args) { // Run training algorithm to build the model. final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() .setNumClasses(3) - .run(training.rdd()); + .run(training.rdd()); // Compute raw scores on the test set. 
JavaRDD> predictionAndLabels = test.map( @@ -74,11 +75,11 @@ public Tuple2 call(LabeledPoint p) { // Stats by labels for (int i = 0; i < metrics.labels().length; i++) { System.out.format("Class %f precision = %f\n", metrics.labels()[i],metrics.precision - (metrics.labels()[i])); + (metrics.labels()[i])); System.out.format("Class %f recall = %f\n", metrics.labels()[i], metrics.recall(metrics - .labels()[i])); + .labels()[i])); System.out.format("Class %f F1 score = %f\n", metrics.labels()[i], metrics.fMeasure - (metrics.labels()[i])); + (metrics.labels()[i])); } //Weighted stats @@ -90,7 +91,7 @@ public Tuple2 call(LabeledPoint p) { // Save and load model model.save(sc, "target/tmp/LogisticRegressionModel"); LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, - "target/tmp/LogisticRegressionModel"); + "target/tmp/LogisticRegressionModel"); // $example off$ } } diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java index 2f64a58dceeb2..7c4c97e74681f 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java @@ -15,7 +15,6 @@ * limitations under the License. 
*/ -// scalastyle:off println package org.apache.spark.examples.mllib; // $example on$ @@ -31,12 +30,11 @@ import org.apache.spark.mllib.recommendation.MatrixFactorizationModel; import org.apache.spark.mllib.recommendation.Rating; // $example off$ -import org.apache.spark.rdd.RDD; import org.apache.spark.SparkConf; public class JavaRankingMetricsExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Ranking Metrics Example"); + SparkConf conf = new SparkConf().setAppName("Java Ranking Metrics Example"); JavaSparkContext sc = new JavaSparkContext(conf); // $example on$ String path = "data/mllib/sample_movielens_data.txt"; @@ -46,7 +44,7 @@ public static void main(String[] args) { public Rating call(String line) { String[] parts = line.split("::"); return new Rating(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]), Double - .parseDouble(parts[2]) - 2.5); + .parseDouble(parts[2]) - 2.5); } } ); @@ -77,9 +75,9 @@ public Tuple2 call(Tuple2 t) { public Rating call(Rating r) { double binaryRating; if (r.rating() > 0.0) { - binaryRating = 1.0; + binaryRating = 1.0; } else { - binaryRating = 0.0; + binaryRating = 0.0; } return new Rating(r.user(), r.product(), binaryRating); } @@ -88,11 +86,11 @@ public Rating call(Rating r) { // Group ratings by common user JavaPairRDD> userMovies = binarizedRatings.groupBy( - new Function() { - public Object call(Rating r) { - return r.user(); - } - } + new Function() { + public Object call(Rating r) { + return r.user(); + } + } ); // Get true relevant documents from all user ratings @@ -123,7 +121,7 @@ public List call(Rating[] docs) { } ); JavaRDD, List>> relevantDocs = userMoviesList.join - (userRecommendedList).values(); + (userRecommendedList).values(); // Instantiate the metrics object RankingMetrics metrics = RankingMetrics.of(relevantDocs); @@ -156,14 +154,14 @@ public Tuple2, Object> call(Rating r) { } )); JavaRDD> ratesAndPreds = - JavaPairRDD.fromJavaRDD(ratings.map( - new 
Function, Object>>() { - public Tuple2, Object> call(Rating r) { - return new Tuple2, Object>( - new Tuple2(r.user(), r.product()), r.rating()); - } - } - )).join(predictions).values(); + JavaPairRDD.fromJavaRDD(ratings.map( + new Function, Object>>() { + public Tuple2, Object> call(Rating r) { + return new Tuple2, Object>( + new Tuple2(r.user(), r.product()), r.rating()); + } + } + )).join(predictions).values(); // Create regression metrics object RegressionMetrics regressionMetrics = new RegressionMetrics(ratesAndPreds.rdd()); diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java similarity index 93% rename from examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java rename to examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java index 5f4e2d74f04ef..d2efc6bf97776 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRegressionMetricsExample.java @@ -14,7 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.spark.examples.mllib; + // $example on$ import scala.Tuple2; @@ -28,10 +30,9 @@ import org.apache.spark.SparkConf; // $example off$ -// Read in the ratings data -public class JavaLinearRegressionExample { +public class JavaRegressionMetricsExample { public static void main(String[] args) { - SparkConf conf = new SparkConf().setAppName("Linear Regression Example"); + SparkConf conf = new SparkConf().setAppName("Java Regression Metrics Example"); JavaSparkContext sc = new JavaSparkContext(conf); // $example on$ // Load and parse the data @@ -53,7 +54,7 @@ public LabeledPoint call(String line) { // Building the model int numIterations = 100; final LinearRegressionModel model = LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), - numIterations); + numIterations); // Evaluate model on training examples and compute training error JavaRDD> valuesAndPreds = parsedData.map( @@ -84,7 +85,7 @@ public Tuple2 call(LabeledPoint point) { // Save and load model model.save(sc.sc(), "target/tmp/LogisticRegressionModel"); LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), - "target/tmp/LogisticRegressionModel"); + "target/tmp/LogisticRegressionModel"); // $example off$ } } diff --git a/examples/src/main/python/mllib/multi_label_metrics_example.py b/examples/src/main/python/mllib/multi_label_metrics_example.py index f293ce8e309e9..960ade6597379 100644 --- a/examples/src/main/python/mllib/multi_label_metrics_example.py +++ b/examples/src/main/python/mllib/multi_label_metrics_example.py @@ -18,7 +18,6 @@ # $example on$ from pyspark.mllib.evaluation import MultilabelMetrics # $example off$ -from pyspark.mllib.util import MLUtils from pyspark import SparkContext if __name__ == "__main__": diff --git a/examples/src/main/python/mllib/ranking_metrics_example.py b/examples/src/main/python/mllib/ranking_metrics_example.py index deefbd23c5b94..327791966c901 100644 --- a/examples/src/main/python/mllib/ranking_metrics_example.py +++ 
b/examples/src/main/python/mllib/ranking_metrics_example.py @@ -26,7 +26,7 @@ # Several of the methods available in scala are currently missing from pyspark # $example on$ - # Read in the ratings data + # Read in the ratings data lines = sc.textFile("data/mllib/sample_movielens_data.txt") def parseLine(line): diff --git a/examples/src/main/python/mllib/regression_metrics_example.py b/examples/src/main/python/mllib/regression_metrics_example.py index a3e9c12dbc4ea..89f44f5dc097d 100644 --- a/examples/src/main/python/mllib/regression_metrics_example.py +++ b/examples/src/main/python/mllib/regression_metrics_example.py @@ -26,7 +26,6 @@ sc = SparkContext(appName="Regression Metrics Example") # $example on$ # Load and parse the data - def parsePoint(line): values = line.split() return LabeledPoint(float(values[0]), diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala index 466c84ed2dec7..13a37827ab935 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala @@ -17,15 +17,14 @@ // scalastyle:off println package org.apache.spark.examples.mllib + // $example on$ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils // $example off$ - import org.apache.spark.{SparkContext, SparkConf} -import org.apache.spark.sql.SQLContext object BinaryClassificationMetricsExample { @@ -33,8 +32,6 @@ object BinaryClassificationMetricsExample { val conf = new SparkConf().setAppName("BinaryClassificationMetricsExample") val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) - import 
sqlContext.implicits._ // $example on$ // Load training data in LIBSVM format val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala index 035f74490b3fe..4503c15360adc 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MultiLabelMetricsExample.scala @@ -14,26 +14,25 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + // scalastyle:off println package org.apache.spark.examples.mllib + // $example on$ import org.apache.spark.mllib.evaluation.MultilabelMetrics -import org.apache.spark.rdd.RDD; +import org.apache.spark.rdd.RDD // $example off$ -import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkContext, SparkConf} object MultiLabelMetricsExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("MultiLabelMetricsExample") val sc = new SparkContext(conf) - val sqlContext = new SQLContext(sc) - import sqlContext.implicits._ // $example on$ val scoreAndLabels: RDD[(Array[Double], Array[Double])] = sc.parallelize( Seq((Array(0.0, 1.0), Array(0.0, 2.0)), (Array(0.0, 2.0), Array(0.0, 1.0)), - (Array(), Array(0.0)), + (Array.empty[Double], Array(0.0)), (Array(2.0), Array(2.0)), (Array(2.0, 0.0), Array(2.0, 0.0)), (Array(0.0, 1.0, 2.0), Array(0.0, 1.0)), diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala index a3d29a5a091b6..0904449245989 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala +++ 
b/examples/src/main/scala/org/apache/spark/examples/mllib/MulticlassMetricsExample.scala @@ -14,8 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + // scalastyle:off println package org.apache.spark.examples.mllib + // $example on$ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.evaluation.MulticlassMetrics @@ -27,7 +29,7 @@ import org.apache.spark.{SparkContext, SparkConf} object MulticlassMetricsExample { def main(args: Array[String]): Unit = { - val conf = new SparkConf().setAppName("MulticlassMetrics") + val conf = new SparkConf().setAppName("MulticlassMetricsExample") val sc = new SparkContext(conf) // $example on$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala index 9fe933bf53931..cffa03d5cc9f4 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala @@ -14,15 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + // scalastyle:off println package org.apache.spark.examples.mllib -import org.apache.spark.sql.SQLContext -import org.apache.spark.{SparkContext, SparkConf} // $example on$ import org.apache.spark.mllib.evaluation.{RegressionMetrics, RankingMetrics} import org.apache.spark.mllib.recommendation.{ALS, Rating} // $example off$ +import org.apache.spark.sql.SQLContext +import org.apache.spark.{SparkContext, SparkConf} object RankingMetricsExample { def main(args: Array[String]) { diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala index fcab66f0df2e5..d29a3c86cfc27 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala @@ -15,12 +15,12 @@ * limitations under the License. */ // scalastyle:off println + package org.apache.spark.examples.mllib + +// $example on$ // $example on$ -import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.regression.LinearRegressionModel import org.apache.spark.mllib.regression.LinearRegressionWithSGD -import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.mllib.util.MLUtils // $example off$ From 88512e7ff1f1d55f31a5c12b57668216d39b22b9 Mon Sep 17 00:00:00 2001 From: Vikas Nelamangala Date: Thu, 19 Nov 2015 16:10:34 +0530 Subject: [PATCH 13/13] removed extra lines & fixed style issues --- .../main/python/mllib/binary_classification_metrics_example.py | 1 - examples/src/main/python/mllib/regression_metrics_example.py | 1 + .../apache/spark/examples/mllib/RegressionMetricsExample.scala | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/src/main/python/mllib/binary_classification_metrics_example.py 
b/examples/src/main/python/mllib/binary_classification_metrics_example.py index 38b557108c2ea..437acb998acc3 100644 --- a/examples/src/main/python/mllib/binary_classification_metrics_example.py +++ b/examples/src/main/python/mllib/binary_classification_metrics_example.py @@ -23,7 +23,6 @@ # $example on$ from pyspark.mllib.classification import LogisticRegressionWithLBFGS from pyspark.mllib.evaluation import BinaryClassificationMetrics -from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.util import MLUtils # $example off$ diff --git a/examples/src/main/python/mllib/regression_metrics_example.py b/examples/src/main/python/mllib/regression_metrics_example.py index 89f44f5dc097d..a3a83aafd7a1f 100644 --- a/examples/src/main/python/mllib/regression_metrics_example.py +++ b/examples/src/main/python/mllib/regression_metrics_example.py @@ -24,6 +24,7 @@ if __name__ == "__main__": sc = SparkContext(appName="Regression Metrics Example") + # $example on$ # Load and parse the data def parsePoint(line): diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala index d29a3c86cfc27..47d44532521ca 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RegressionMetricsExample.scala @@ -18,7 +18,6 @@ package org.apache.spark.examples.mllib -// $example on$ // $example on$ import org.apache.spark.mllib.regression.LinearRegressionWithSGD import org.apache.spark.mllib.evaluation.RegressionMetrics