From 504b5c3724b797ede6f473638edd8523a13da712 Mon Sep 17 00:00:00 2001 From: martinzapletal Date: Wed, 11 Feb 2015 20:16:57 +0000 Subject: [PATCH 1/4] SPARK-5502 Added documentation for Isotonic regression including examples for Scala and Java --- .../mllib/sample_isotonic_regression_data.csv | 101 +++++++++++ docs/mllib-regression.md | 159 ++++++++++++++++++ 2 files changed, 260 insertions(+) create mode 100644 data/mllib/sample_isotonic_regression_data.csv create mode 100644 docs/mllib-regression.md diff --git a/data/mllib/sample_isotonic_regression_data.csv b/data/mllib/sample_isotonic_regression_data.csv new file mode 100644 index 0000000000000..c0f3f72f36a5f --- /dev/null +++ b/data/mllib/sample_isotonic_regression_data.csv @@ -0,0 +1,101 @@ +4710.28,500.00,1.00 +4711.05,501.00,1.00 +4711.58,502.00,1.00 +4712.50,503.00,1.00 +4712.47,504.00,1.00 +4708.73,505.00,1.00 +4708.89,506.00,1.00 +4706.28,507.00,1.00 +4706.40,508.00,1.00 +4706.23,509.00,1.00 +4708.00,510.00,1.00 +4705.46,511.00,1.00 +4705.73,512.00,1.00 +4708.03,513.00,1.00 +4707.20,514.00,1.00 +4707.93,515.00,1.00 +4708.45,516.00,1.00 +4710.55,517.00,1.00 +4709.39,518.00,1.00 +4709.30,519.00,1.00 +4708.55,520.00,1.00 +4708.15,521.00,1.00 +4709.19,522.00,1.00 +4709.30,523.00,1.00 +4709.40,524.00,1.00 +4708.77,525.00,1.00 +4709.09,526.00,1.00 +4709.39,527.00,1.00 +4711.26,528.00,1.00 +4713.97,529.00,1.00 +4715.93,530.00,1.00 +4715.68,531.00,1.00 +4714.98,532.00,1.00 +4715.11,533.00,1.00 +4716.08,534.00,1.00 +4717.18,535.00,1.00 +4716.47,536.00,1.00 +4716.95,537.00,1.00 +4715.85,538.00,1.00 +4715.30,539.00,1.00 +4715.78,540.00,1.00 +4716.89,541.00,1.00 +4718.14,542.00,1.00 +4718.81,543.00,1.00 +4720.03,544.00,1.00 +4718.18,545.00,1.00 +4718.65,546.00,1.00 +4718.36,547.00,1.00 +4719.15,548.00,1.00 +4717.82,549.00,1.00 +4717.54,550.00,1.00 +4717.86,551.00,1.00 +4718.27,552.00,1.00 +4718.27,553.00,1.00 +4718.67,554.00,1.00 +4718.60,555.00,1.00 +4715.57,556.00,1.00 +4715.44,557.00,1.00 +4711.99,558.00,1.00 
+4712.56,559.00,1.00 +4713.91,560.00,1.00 +4714.48,561.00,1.00 +4716.51,562.00,1.00 +4716.23,563.00,1.00 +4714.95,564.00,1.00 +4714.97,565.00,1.00 +4714.67,566.00,1.00 +4714.32,567.00,1.00 +4714.45,568.00,1.00 +4714.70,569.00,1.00 +4715.57,570.00,1.00 +4716.84,571.00,1.00 +4716.34,572.00,1.00 +4716.50,573.00,1.00 +4716.56,574.00,1.00 +4716.82,575.00,1.00 +4718.16,576.00,1.00 +4718.45,577.00,1.00 +4719.05,578.00,1.00 +4718.23,579.00,1.00 +4718.86,580.00,1.00 +4718.52,581.00,1.00 +4719.95,582.00,1.00 +4719.97,583.00,1.00 +4721.98,584.00,1.00 +4723.37,585.00,1.00 +4722.72,586.00,1.00 +4723.25,587.00,1.00 +4723.72,588.00,1.00 +4723.72,589.00,1.00 +4723.49,590.00,1.00 +4723.68,591.00,1.00 +4724.22,592.00,1.00 +4724.09,593.00,1.00 +4724.47,594.00,1.00 +4725.07,595.00,1.00 +4723.07,596.00,1.00 +4723.17,597.00,1.00 +4723.18,598.00,1.00 +4721.83,599.00,1.00 +4722.61,600.00,1.00 \ No newline at end of file diff --git a/docs/mllib-regression.md b/docs/mllib-regression.md new file mode 100644 index 0000000000000..cff6063a4d322 --- /dev/null +++ b/docs/mllib-regression.md @@ -0,0 +1,159 @@ +--- +layout: global +title: Naive Bayes - MLlib +displayTitle: MLlib - Regression +--- + +[Regression](http://en.wikipedia.org/wiki/Regression_analysis) is a statistical process +for estimating the relationships among variables. It includes many techniques for modeling +and analyzing several variables, when the focus is on the relationship between +a dependent variable and one or more independent variables. + +[Isotonic regression](http://en.wikipedia.org/wiki/Isotonic_regression) +belongs to the family of regression algorithms. 
Formally isotonic regression is a problem where +given a finite set of real numbers `$Y = {y_1, y_2, ..., y_n}$` representing observed responses +and `$X = {x_1, x_2, ..., x_n}$` the unknown response values to be fitted +finding a function that minimises + +`\begin{equation} + f(x) = \sum_{i=1}^n w_i (y_i - x_i)^2 +\end{equation}` + +with respect to complete order subject to +`$x_1\le x_2\le ...\le x_n$` where `$w_i$` are positive weights. +The resulting function is called isotonic regression and it is unique. +It can be viewed as least squares problem under order restriction. +Essentially isotonic regression is a +[monotonic function](http://en.wikipedia.org/wiki/Monotonic_function) +best fitting the original data points. + +MLlib supports a +[pool adjacent violators algorithm](http://www.stat.cmu.edu/~ryantibs/papers/neariso.pdf) +which uses an approach to +[parallelizing isotonic regression](http://softlib.rice.edu/pub/CRPC-TRs/reports/CRPC-TR96640.pdf). +The training input is a RDD of +[tuples](http://www.scala-lang.org/api/2.10.3/index.html#scala.Tuple3) +of three double values that represent label, feature and weight in this order. +Additionally IsotonicRegression algorithm has one optional parameter +called $isotonic$ defaulting to true. +This argument specifies if the isotonic regression is +isotonic (monotonically increasing) or antitonic (monotonically decreasing). + +Training returns an IsotonicRegressionModel that can be used to predict +labels for both known and unknown features. The result of isotonic regression +is treated as piecewise linear function. The rules the prediction uses therefore are: + +* If testData exactly matches a boundary then associated prediction is returned. + In case there are multiple predictions with the same boundary then one of them + is returned. Which one is undefined (same as java.util.Arrays.binarySearch). +* If testData is lower or higher than all boundaries then first or last prediction + is returned respectively. 
In case there are multiple predictions with the same + boundary then the lowest or highest is returned respectively. +* If testData falls between two values in boundary array then prediction is treated + as piecewise linear function and interpolated value is returned. In case there are + multiple values with the same boundary then the same rules as in previous point are used. + +## Examples + +
+
+Data are read from a csv file where each line has a format label,feature,weight +i.e. 4710.28,500.00,1.00. The data are split to training and testing set. +Model is created using the training set and a mean squared error is calculated from the predicted +labels and real labels in the test set. + +{% highlight scala %} +import org.apache.spark.mllib.regression.IsotonicRegression + +val data = sc.textFile("data/mllib/sample_isotonic_regression_data.csv") + +// Create label, feature, weight tuples from input data. +val parsedData = data.map { line => + val parts = line.split(',').map(_.toDouble) + (parts(0), parts(1), parts(2)) +} + +// Split data into training (60%) and test (40%) sets. +val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L) +val training = splits(0) +val test = splits(1) + +// Create isotonic regression model from training data. +// Isotonic parameter defaults to true so it is only shown for demonstration +val model = new IsotonicRegression().setIsotonic(true).run(training) + +// Create tuples of predicted and real labels. +val predictionAndLabel = test.map { point => + val predictedLabel = model.predict(point._2) + (predictedLabel, point._1) +} + +// Calculate mean squared error between predicted and real labels. +val meanSquaredError = predictionAndLabel.map{case(p, l) => math.pow((p - l), 2)}.mean() +println("Mean Squared Error = " + meanSquaredError) +{% endhighlight %} +
+ +
+Data are read from a csv file where each line has a format label,feature,weight +i.e. 4710.28,500.00,1.00. The data are split to training and testing set. +Model is created using the training set and a mean squared error is calculated from the predicted +labels and real labels in the test set. + +{% highlight java %} +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaDoubleRDD; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.mllib.regression.IsotonicRegressionModel; +import scala.Tuple2; +import scala.Tuple3; + +JavaRDD data = sc.textFile("data/mllib/sample_isotonic_regression_data.csv"); + +// Create label, feature, weight tuples from input data. +JavaRDD> parsedData = data.map( + new Function>() { + public Tuple3 call(String line) { + String[] parts = line.split(","); + + return new Tuple3<>(new Double(parts[0]), new Double(parts[1]), new Double(parts[2])); + } + } +); + +// Split data into training (60%) and test (40%) sets. +JavaRDD>[] splits = parsedData.randomSplit(new double[] {0.6, 0.4}, 11L); +JavaRDD> training = splits[0]; +JavaRDD> test = splits[1]; + +// Create isotonic regression model from training data. +// Isotonic parameter defaults to true so it is only shown for demonstration +IsotonicRegressionModel model = new IsotonicRegression().setIsotonic(true).run(training); + +// Create tuples of predicted and real labels. +JavaPairRDD predictionAndLabel = test.mapToPair( + new PairFunction, Double, Double>() { + @Override public Tuple2 call(Tuple3 point) { + Double predictedLabel = model.predict(point._2()); + return new Tuple2(predictedLabel, point._1()); + } + } +); + +// Calculate mean squared error between predicted and real labels. 
+Double meanSquaredError = new JavaDoubleRDD(predictionAndLabel.map( + new Function, Object>() { + @Override public Object call(Tuple2 pl) { + return Math.pow(pl._1() - pl._2(), 2); + } + } +).rdd()).mean(); + +System.out.println("Mean Squared Error = " + meanSquaredError); +{% endhighlight %} +
+
\ No newline at end of file From 7d8136eb745fbea92c69c2fb4ab8bce5568a3642 Mon Sep 17 00:00:00 2001 From: martinzapletal Date: Wed, 11 Feb 2015 20:26:37 +0000 Subject: [PATCH 2/4] SPARK-5502 Added documentation for Isotonic regression including examples for Scala and Java --- docs/mllib-regression.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/mllib-regression.md b/docs/mllib-regression.md index cff6063a4d322..80f67d642e54a 100644 --- a/docs/mllib-regression.md +++ b/docs/mllib-regression.md @@ -4,11 +4,13 @@ title: Naive Bayes - MLlib displayTitle: MLlib - Regression --- +## Regression [Regression](http://en.wikipedia.org/wiki/Regression_analysis) is a statistical process for estimating the relationships among variables. It includes many techniques for modeling and analyzing several variables, when the focus is on the relationship between a dependent variable and one or more independent variables. +## Isotonic regression [Isotonic regression](http://en.wikipedia.org/wiki/Isotonic_regression) belongs to the family of regression algorithms. Formally isotonic regression is a problem where given a finite set of real numbers `$Y = {y_1, y_2, ..., y_n}$` representing observed responses @@ -53,7 +55,7 @@ is treated as piecewise linear function. The rules the prediction uses therefore as piecewise linear function and interpolated value is returned. In case there are multiple values with the same boundary then the same rules as in previous point are used. -## Examples +### Examples
From 80bd4c3fc304b4e78873f1da65a95ce1798ee9f9 Mon Sep 17 00:00:00 2001 From: martinzapletal Date: Thu, 12 Feb 2015 12:00:42 +0000 Subject: [PATCH 3/4] SPARK-5502 created docs page for isotonic regression, added links to the page, updated data and examples --- .../mllib/sample_isotonic_regression_data.csv | 101 ------------------ .../mllib/sample_isotonic_regression_data.txt | 100 +++++++++++++++++ docs/mllib-classification-regression.md | 3 +- docs/mllib-guide.md | 1 + ...ession.md => mllib-isotonic-regression.md} | 41 +++---- 5 files changed, 119 insertions(+), 127 deletions(-) delete mode 100644 data/mllib/sample_isotonic_regression_data.csv create mode 100644 data/mllib/sample_isotonic_regression_data.txt rename docs/{mllib-regression.md => mllib-isotonic-regression.md} (79%) diff --git a/data/mllib/sample_isotonic_regression_data.csv b/data/mllib/sample_isotonic_regression_data.csv deleted file mode 100644 index c0f3f72f36a5f..0000000000000 --- a/data/mllib/sample_isotonic_regression_data.csv +++ /dev/null @@ -1,101 +0,0 @@ -4710.28,500.00,1.00 -4711.05,501.00,1.00 -4711.58,502.00,1.00 -4712.50,503.00,1.00 -4712.47,504.00,1.00 -4708.73,505.00,1.00 -4708.89,506.00,1.00 -4706.28,507.00,1.00 -4706.40,508.00,1.00 -4706.23,509.00,1.00 -4708.00,510.00,1.00 -4705.46,511.00,1.00 -4705.73,512.00,1.00 -4708.03,513.00,1.00 -4707.20,514.00,1.00 -4707.93,515.00,1.00 -4708.45,516.00,1.00 -4710.55,517.00,1.00 -4709.39,518.00,1.00 -4709.30,519.00,1.00 -4708.55,520.00,1.00 -4708.15,521.00,1.00 -4709.19,522.00,1.00 -4709.30,523.00,1.00 -4709.40,524.00,1.00 -4708.77,525.00,1.00 -4709.09,526.00,1.00 -4709.39,527.00,1.00 -4711.26,528.00,1.00 -4713.97,529.00,1.00 -4715.93,530.00,1.00 -4715.68,531.00,1.00 -4714.98,532.00,1.00 -4715.11,533.00,1.00 -4716.08,534.00,1.00 -4717.18,535.00,1.00 -4716.47,536.00,1.00 -4716.95,537.00,1.00 -4715.85,538.00,1.00 -4715.30,539.00,1.00 -4715.78,540.00,1.00 -4716.89,541.00,1.00 -4718.14,542.00,1.00 -4718.81,543.00,1.00 -4720.03,544.00,1.00 
-4718.18,545.00,1.00 -4718.65,546.00,1.00 -4718.36,547.00,1.00 -4719.15,548.00,1.00 -4717.82,549.00,1.00 -4717.54,550.00,1.00 -4717.86,551.00,1.00 -4718.27,552.00,1.00 -4718.27,553.00,1.00 -4718.67,554.00,1.00 -4718.60,555.00,1.00 -4715.57,556.00,1.00 -4715.44,557.00,1.00 -4711.99,558.00,1.00 -4712.56,559.00,1.00 -4713.91,560.00,1.00 -4714.48,561.00,1.00 -4716.51,562.00,1.00 -4716.23,563.00,1.00 -4714.95,564.00,1.00 -4714.97,565.00,1.00 -4714.67,566.00,1.00 -4714.32,567.00,1.00 -4714.45,568.00,1.00 -4714.70,569.00,1.00 -4715.57,570.00,1.00 -4716.84,571.00,1.00 -4716.34,572.00,1.00 -4716.50,573.00,1.00 -4716.56,574.00,1.00 -4716.82,575.00,1.00 -4718.16,576.00,1.00 -4718.45,577.00,1.00 -4719.05,578.00,1.00 -4718.23,579.00,1.00 -4718.86,580.00,1.00 -4718.52,581.00,1.00 -4719.95,582.00,1.00 -4719.97,583.00,1.00 -4721.98,584.00,1.00 -4723.37,585.00,1.00 -4722.72,586.00,1.00 -4723.25,587.00,1.00 -4723.72,588.00,1.00 -4723.72,589.00,1.00 -4723.49,590.00,1.00 -4723.68,591.00,1.00 -4724.22,592.00,1.00 -4724.09,593.00,1.00 -4724.47,594.00,1.00 -4725.07,595.00,1.00 -4723.07,596.00,1.00 -4723.17,597.00,1.00 -4723.18,598.00,1.00 -4721.83,599.00,1.00 -4722.61,600.00,1.00 \ No newline at end of file diff --git a/data/mllib/sample_isotonic_regression_data.txt b/data/mllib/sample_isotonic_regression_data.txt new file mode 100644 index 0000000000000..d257b509d4d37 --- /dev/null +++ b/data/mllib/sample_isotonic_regression_data.txt @@ -0,0 +1,100 @@ +0.24579296,0.01 +0.28505864,0.02 +0.31208567,0.03 +0.35900051,0.04 +0.35747068,0.05 +0.16675166,0.06 +0.17491076,0.07 +0.04181540,0.08 +0.04793473,0.09 +0.03926568,0.10 +0.12952575,0.11 +0.00000000,0.12 +0.01376849,0.13 +0.13105558,0.14 +0.08873024,0.15 +0.12595614,0.16 +0.15247323,0.17 +0.25956145,0.18 +0.20040796,0.19 +0.19581846,0.20 +0.15757267,0.21 +0.13717491,0.22 +0.19020908,0.23 +0.19581846,0.24 +0.20091790,0.25 +0.16879143,0.26 +0.18510964,0.27 +0.20040796,0.28 +0.29576747,0.29 +0.43396226,0.30 +0.53391127,0.31 +0.52116267,0.32 
+0.48546660,0.33 +0.49209587,0.34 +0.54156043,0.35 +0.59765426,0.36 +0.56144824,0.37 +0.58592555,0.38 +0.52983172,0.39 +0.50178480,0.40 +0.52626211,0.41 +0.58286588,0.42 +0.64660887,0.43 +0.68077511,0.44 +0.74298827,0.45 +0.64864865,0.46 +0.67261601,0.47 +0.65782764,0.48 +0.69811321,0.49 +0.63029067,0.50 +0.61601224,0.51 +0.63233044,0.52 +0.65323814,0.53 +0.65323814,0.54 +0.67363590,0.55 +0.67006629,0.56 +0.51555329,0.57 +0.50892402,0.58 +0.33299337,0.59 +0.36206017,0.60 +0.43090260,0.61 +0.45996940,0.62 +0.56348802,0.63 +0.54920959,0.64 +0.48393677,0.65 +0.48495665,0.66 +0.46965834,0.67 +0.45181030,0.68 +0.45843957,0.69 +0.47118817,0.70 +0.51555329,0.71 +0.58031617,0.72 +0.55481897,0.73 +0.56297807,0.74 +0.56603774,0.75 +0.57929628,0.76 +0.64762876,0.77 +0.66241713,0.78 +0.69301377,0.79 +0.65119837,0.80 +0.68332483,0.81 +0.66598674,0.82 +0.73890872,0.83 +0.73992861,0.84 +0.84242733,0.85 +0.91330954,0.86 +0.88016318,0.87 +0.90719021,0.88 +0.93115757,0.89 +0.93115757,0.90 +0.91942886,0.91 +0.92911780,0.92 +0.95665477,0.93 +0.95002550,0.94 +0.96940337,0.95 +1.00000000,0.96 +0.89801122,0.97 +0.90311066,0.98 +0.90362060,0.99 +0.83477817,1.0 \ No newline at end of file diff --git a/docs/mllib-classification-regression.md b/docs/mllib-classification-regression.md index 719cc95767b00..5b9b4dd83b774 100644 --- a/docs/mllib-classification-regression.md +++ b/docs/mllib-classification-regression.md @@ -23,7 +23,7 @@ the supported algorithms for each type of problem. 
Multiclass Classificationdecision trees, naive Bayes - Regressionlinear least squares, Lasso, ridge regression, decision trees + Regressionlinear least squares, Lasso, ridge regression, decision trees, isotonic regression @@ -35,3 +35,4 @@ More details for these methods can be found here: * [linear regression (least squares, Lasso, ridge)](mllib-linear-methods.html#linear-least-squares-lasso-and-ridge-regression) * [Decision trees](mllib-decision-tree.html) * [Naive Bayes](mllib-naive-bayes.html) +* [Isotonic regression](mllib-isotonic-regression.html) diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 3d32d03e35c62..7026615cc93d9 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -21,6 +21,7 @@ filtering, dimensionality reduction, as well as underlying optimization primitiv * [naive Bayes](mllib-naive-bayes.html) * [decision trees](mllib-decision-tree.html) * [ensembles of trees](mllib-ensembles.html) (Random Forests and Gradient-Boosted Trees) + * [isotonic regression](mllib-isotonic-regression.html) * [Collaborative filtering](mllib-collaborative-filtering.html) * alternating least squares (ALS) * [Clustering](mllib-clustering.html) diff --git a/docs/mllib-regression.md b/docs/mllib-isotonic-regression.md similarity index 79% rename from docs/mllib-regression.md rename to docs/mllib-isotonic-regression.md index 80f67d642e54a..b827ad8a4d9d3 100644 --- a/docs/mllib-regression.md +++ b/docs/mllib-isotonic-regression.md @@ -4,12 +4,6 @@ title: Naive Bayes - MLlib displayTitle: MLlib - Regression --- -## Regression -[Regression](http://en.wikipedia.org/wiki/Regression_analysis) is a statistical process -for estimating the relationships among variables. It includes many techniques for modeling -and analyzing several variables, when the focus is on the relationship between -a dependent variable and one or more independent variables. 
- ## Isotonic regression [Isotonic regression](http://en.wikipedia.org/wiki/Isotonic_regression) belongs to the family of regression algorithms. Formally isotonic regression is a problem where @@ -30,20 +24,18 @@ Essentially isotonic regression is a best fitting the original data points. MLlib supports a -[pool adjacent violators algorithm](http://www.stat.cmu.edu/~ryantibs/papers/neariso.pdf) +[pool adjacent violators algorithm](http://doi.org/10.1198/TECH.2010.10111) which uses an approach to -[parallelizing isotonic regression](http://softlib.rice.edu/pub/CRPC-TRs/reports/CRPC-TR96640.pdf). -The training input is a RDD of -[tuples](http://www.scala-lang.org/api/2.10.3/index.html#scala.Tuple3) -of three double values that represent label, feature and weight in this order. -Additionally IsotonicRegression algorithm has one optional parameter -called $isotonic$ defaulting to true. +[parallelizing isotonic regression](http://doi.org/10.1007/978-3-642-99789-1_10). +The training input is an RDD of tuples of three double values that represent +label, feature and weight in this order. Additionally, the IsotonicRegression algorithm has one +optional parameter called $isotonic$ defaulting to true. This argument specifies if the isotonic regression is isotonic (monotonically increasing) or antitonic (monotonically decreasing). Training returns an IsotonicRegressionModel that can be used to predict labels for both known and unknown features. The result of isotonic regression -is treated as piecewise linear function. The rules the prediction uses therefore are: +is treated as piecewise linear function. The rules for prediction therefore are: * If testData exactly matches a boundary then associated prediction is returned. In case there are multiple predictions with the same boundary then one of them @@ -59,20 +51,20 @@ is treated as piecewise linear function. The rules the prediction uses therefore
-Data are read from a csv file where each line has a format label,feature,weight -i.e. 4710.28,500.00,1.00. The data are split to training and testing set. +Data are read from a file where each line has the format label,feature, +e.g. 0.24579296,0.01. The data are split into training and test sets. Model is created using the training set and a mean squared error is calculated from the predicted labels and real labels in the test set. {% highlight scala %} import org.apache.spark.mllib.regression.IsotonicRegression -val data = sc.textFile("data/mllib/sample_isotonic_regression_data.csv") +val data = sc.textFile("data/mllib/sample_isotonic_regression_data.txt") -// Create label, feature, weight tuples from input data. +// Create label, feature, weight tuples from input data with weight set to default value 1.0. val parsedData = data.map { line => val parts = line.split(',').map(_.toDouble) - (parts(0), parts(1), parts(2)) + (parts(0), parts(1), 1.0) } // Split data into training (60%) and test (40%) sets. @@ -97,8 +89,8 @@ println("Mean Squared Error = " + meanSquaredError)
-Data are read from a csv file where each line has a format label,feature,weight -i.e. 4710.28,500.00,1.00. The data are split to training and testing set. +Data are read from a file where each line has the format label,feature, +e.g. 0.24579296,0.01. The data are split into training and test sets. Model is created using the training set and a mean squared error is calculated from the predicted labels and real labels in the test set. @@ -114,15 +106,14 @@ import org.apache.spark.mllib.regression.IsotonicRegressionModel; import scala.Tuple2; import scala.Tuple3; -JavaRDD data = sc.textFile("data/mllib/sample_isotonic_regression_data.csv"); +JavaRDD data = sc.textFile("data/mllib/sample_isotonic_regression_data.txt"); -// Create label, feature, weight tuples from input data. +// Create label, feature, weight tuples from input data with weight set to default value 1.0. JavaRDD> parsedData = data.map( new Function>() { public Tuple3 call(String line) { String[] parts = line.split(","); - - return new Tuple3<>(new Double(parts[0]), new Double(parts[1]), new Double(parts[2])); + return new Tuple3<>(new Double(parts[0]), new Double(parts[1]), 1.0); } } ); From 67fe773f4046abc12d6f116afe285e68df5ce6ae Mon Sep 17 00:00:00 2001 From: martinzapletal Date: Thu, 12 Feb 2015 12:16:50 +0000 Subject: [PATCH 4/4] SPARK-5502 reworded model prediction rules to use more general language rather than the code/implementation specific terms --- docs/mllib-isotonic-regression.md | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/docs/mllib-isotonic-regression.md b/docs/mllib-isotonic-regression.md index b827ad8a4d9d3..12fb29d426741 100644 --- a/docs/mllib-isotonic-regression.md +++ b/docs/mllib-isotonic-regression.md @@ -37,15 +37,18 @@ Training returns an IsotonicRegressionModel that can be used to predict labels for both known and unknown features. The result of isotonic regression is treated as piecewise linear function.
The rules for prediction therefore are: -* If testData exactly matches a boundary then associated prediction is returned. - In case there are multiple predictions with the same boundary then one of them - is returned. Which one is undefined (same as java.util.Arrays.binarySearch). -* If testData is lower or higher than all boundaries then first or last prediction - is returned respectively. In case there are multiple predictions with the same - boundary then the lowest or highest is returned respectively. -* If testData falls between two values in boundary array then prediction is treated - as piecewise linear function and interpolated value is returned. In case there are - multiple values with the same boundary then the same rules as in previous point are used. +* If the prediction input exactly matches a training feature + then the associated prediction is returned. In case there are multiple predictions with the same + feature then one of them is returned. Which one is undefined + (same as java.util.Arrays.binarySearch). +* If the prediction input is lower or higher than all training features + then the prediction with the lowest or highest feature is returned respectively. + In case there are multiple predictions with the same feature + then the lowest or highest is returned respectively. +* If the prediction input falls between two training features then the prediction is treated + as a piecewise linear function and the interpolated value is calculated from the + predictions of the two closest features. In case there are multiple values + with the same feature then the same rules as in the previous point are used. ### Examples