From 18047484cf869ae5c6fce32c6b64b9069d709eae Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Tue, 1 Mar 2016 17:33:20 -0800 Subject: [PATCH 01/17] [SPARK-13430] Added summary classes for logistic and linear regression --- python/pyspark/ml/classification.py | 209 +++++++++++++++++++++++++++- python/pyspark/ml/regression.py | 207 ++++++++++++++++++++++++++- python/pyspark/mllib/common.py | 33 +++-- 3 files changed, 436 insertions(+), 13 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 3179fb30ab4d7..51dabff089eb3 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -23,10 +23,11 @@ from pyspark.ml.param.shared import * from pyspark.ml.regression import ( RandomForestParams, TreeEnsembleParams, DecisionTreeModel, TreeEnsembleModels) -from pyspark.mllib.common import inherit_doc +from pyspark.mllib.common import inherit_doc, JavaCallable -__all__ = ['LogisticRegression', 'LogisticRegressionModel', 'DecisionTreeClassifier', +__all__ = ['LogisticRegression', 'LogisticRegressionModel', 'BinaryLogisticRegressionSummary', + 'BinaryLogisticRegressionTrainingSummary', 'DecisionTreeClassifier', 'DecisionTreeClassificationModel', 'GBTClassifier', 'GBTClassificationModel', 'RandomForestClassifier', 'RandomForestClassificationModel', 'NaiveBayes', 'NaiveBayesModel', 'MultilayerPerceptronClassifier', @@ -218,6 +219,210 @@ def intercept(self): """ return self._call_java("intercept") + @property + @since("2.0.0") + def summary(self): + """ + Gets summary (e.g. residuals, mse, r-squared ) of model on + training set. An exception is thrown if + `trainingSummary == None`. + """ + java_blrt_summary = self._call_java("summary") + return BinaryLogisticRegressionTrainingSummary._fromActiveSparkContext(java_blrt_summary) + + @property + @since("2.0.0") + def hasSummary(self): + """ + Indicates whether a training summary exists for this model + instance. + """ + return self._call_java("hasSummary") + + """ + TODO: enable once Scala API is made public + def evaluate(self, df): + "" + Evaluates the model on a testset. + @param dataset Test dataset to evaluate model on. + "" + java_blr_summary = self._call_java("evaluate", df) + return BinaryLogisticRegressionSummary._fromActiveSparkContext(java_blr_summary) + """ + + +class LogisticRegressionSummary(JavaCallable): + """ + Abstraction for Logistic Regression Results for a given model. + + .. versionadded:: 2.0.0 + """ + + @property + @since("2.0.0") + def predictions(self): + """ + Dataframe outputted by the model's `transform` method. + """ + return self._call("predictions") + + @property + @since("2.0.0") + def probabilityCol(self): + """ + Field in "predictions" which gives the calibrated probability + of each instance as a vector. + """ + return self._call("probabilityCol") + + @property + @since("2.0.0") + def labelCol(self): + """ + Field in "predictions" which gives the true label of each + instance. + """ + return self._call("labelCol") + + @property + @since("2.0.0") + def featuresCol(self): + """ + Field in "predictions" which gives the features of each instance + as a vector. + """ + return self._call("featuresCol") + + +class LogisticRegressionTrainingSummary(LogisticRegressionSummary): + """ + Abstraction for multinomial Logistic Regression Training results. + Currently, the training summary ignores the training weights except + for the objective trace. + + .. 
versionadded:: 2.0.0
+    """
+
+    @property
+    @since("2.0.0")
+    def objectiveHistory(self):
+        """
+        Objective function (scaled loss + regularization) at each iteration.
+        """
+        return self._call("objectiveHistory")
+
+    @property
+    @since("2.0.0")
+    def totalIterations(self):
+        """
+        Number of training iterations until termination.
+        """
+        return self._call("totalIterations")
+
+
+class BinaryLogisticRegressionSummary(LogisticRegressionSummary):
+    """
+    .. note:: Experimental
+
+    Binary Logistic regression results for a given model.
+
+    .. versionadded:: 2.0.0
+    """
+
+    @property
+    @since("2.0.0")
+    def roc(self):
+        """
+        Returns the receiver operating characteristic (ROC) curve,
+        which is a DataFrame having two fields (FPR, TPR) with
+        (0.0, 0.0) prepended and (1.0, 1.0) appended to it.
+        Reference: http://en.wikipedia.org/wiki/Receiver_operating_characteristic
+
+        Note: This ignores instance weights (setting all to 1.0) from
+        `LogisticRegression.weightCol`. This will change in later Spark
+        versions.
+        """
+        return self._call("roc")
+
+    @property
+    @since("2.0.0")
+    def areaUnderROC(self):
+        """
+        Computes the area under the receiver operating characteristic
+        (ROC) curve.
+
+        Note: This ignores instance weights (setting all to 1.0) from
+        `LogisticRegression.weightCol`. This will change in later Spark
+        versions.
+        """
+        return self._call("areaUnderROC")
+
+    @property
+    @since("2.0.0")
+    def pr(self):
+        """
+        Returns the precision-recall curve, which is an Dataframe
+        containing two fields recall, precision with (0.0, 1.0) prepended to it.
+
+        Note: This ignores instance weights (setting all to 1.0) from
+        `LogisticRegression.weightCol`. This will change in later Spark
+        versions.
+        """
+        return self._call("pr")
+
+    @property
+    @since("2.0.0")
+    def fMeasureByThreshold(self):
+        """
+        Returns a DataFrame with two fields (threshold, F-Measure),
+        the F-Measure curve computed with beta = 1.0.
+
+        Note: This ignores instance weights (setting all to 1.0) from
+        `LogisticRegression.weightCol`. This will change in later Spark
+        versions.
+        """
+        return self._call("fMeasureByThreshold")
+
+    @property
+    @since("2.0.0")
+    def precisionByThreshold(self):
+        """
+        Returns a DataFrame with two fields (threshold, precision).
+        Every possible probability obtained in transforming the dataset
+        is used as a threshold when calculating the precision.
+
+        Note: This ignores instance weights (setting all to 1.0) from
+        `LogisticRegression.weightCol`. This will change in later Spark
+        versions.
+        """
+        return self._call("precisionByThreshold")
+
+    @property
+    @since("2.0.0")
+    def recallByThreshold(self):
+        """
+        Returns a DataFrame with two fields (threshold, recall).
+        Every possible probability obtained in transforming the dataset
+        is used as a threshold when calculating the recall.
+
+        Note: This ignores instance weights (setting all to 1.0) from
+        `LogisticRegression.weightCol`. This will change in later Spark
+        versions.
+        """
+        return self._call("recallByThreshold")
+
+
+class BinaryLogisticRegressionTrainingSummary(BinaryLogisticRegressionSummary,
+                                              LogisticRegressionTrainingSummary):
+    """
+    .. note:: Experimental
+
+    Binary Logistic regression training results for a given model.
+
+    .. 
versionadded:: 2.0.0 + """ + pass + class TreeClassifierParams(object): """ diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 6b994fe9f93b4..7dd6eb2a9bc8c 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -21,7 +21,7 @@ from pyspark.ml.param.shared import * from pyspark.ml.util import * from pyspark.ml.wrapper import JavaEstimator, JavaModel -from pyspark.mllib.common import inherit_doc +from pyspark.mllib.common import inherit_doc, JavaCallable __all__ = ['AFTSurvivalRegression', 'AFTSurvivalRegressionModel', @@ -29,6 +29,7 @@ 'GBTRegressor', 'GBTRegressionModel', 'IsotonicRegression', 'IsotonicRegressionModel', 'LinearRegression', 'LinearRegressionModel', + 'LinearRegressionSummary', 'LinearRegressionTrainingSummary', 'RandomForestRegressor', 'RandomForestRegressionModel'] @@ -131,7 +132,6 @@ def weights(self): """ Model weights. """ - warnings.warn("weights is deprecated. Use coefficients instead.") return self._call_java("weights") @@ -151,6 +151,209 @@ def intercept(self): """ return self._call_java("intercept") + @property + @since("2.0.0") + def summary(self): + """ + Gets summary (e.g. residuals, mse, r-squared ) of model on + training set. An exception is thrown if + `trainingSummary == None`. + """ + java_lrt_summary = self._call_java("summary") + return LinearRegressionTrainingSummary._fromActiveSparkContext(java_lrt_summary) + + @property + @since("2.0.0") + def hasSummary(self): + """ + Indicates whether a training summary exists for this model + instance. + """ + return self._call_java("hasSummary") + + """ + TODO: enable once Scala API is made public + def evaluate(self, df): + "" + Evaluates the model on a testset. + @param dataset Test dataset to evaluate model on. + "" + java_lr_summary = self._call_java("evaluate", df) + return LinearRegressionSummary._fromActiveSparkContext(java_lr_summary) + """ + + +class LinearRegressionSummary(JavaCallable): + """ + .. note:: Experimental + + Linear regression results evaluated on a dataset. + + .. versionadded:: 2.0.0 + """ + + @property + @since("2.0.0") + def predictions(self): + """ + Dataframe outputted by the model's `transform` method. + """ + return self._call("predictions") + + @property + @since("2.0.0") + def predictionCol(self): + """ + Field in "predictions" which gives the predicted value of + the label at each instance. + """ + return self._call("predictionCol") + + @property + @since("2.0.0") + def labelCol(self): + """ + Field in "predictions" which gives the true label of each + instance. + """ + return self._call("labelCol") + + @property + @since("2.0.0") + def explainedVariance(self): + """ + Returns the explained variance regression score. + explainedVariance = 1 - variance(y - \hat{y}) / variance(y) + Reference: http://en.wikipedia.org/wiki/Explained_variation + + Note: This ignores instance weights (setting all to 1.0) from + `LinearRegression.weightCol`. This will change in later Spark + versions. + """ + return self._call("explainedVariance") + + @property + @since("2.0.0") + def meanAbsoluteError(self): + """ + Returns the mean absolute error, which is a risk function + corresponding to the expected value of the absolute error + loss or l1-norm loss. + + Note: This ignores instance weights (setting all to 1.0) from + `LinearRegression.weightCol`. This will change in later Spark + versions. 
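+
+        In the notation used for explainedVariance above, this is
+        mean(abs(y - \hat{y})).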
+ """ + return self._call("meanAbsoluteError") + + @property + @since("2.0.0") + def meanSquaredError(self): + """ + Returns the mean squared error, which is a risk function + corresponding to the expected value of the squared error + loss or quadratic loss. + + Note: This ignores instance weights (setting all to 1.0) from + `LinearRegression.weightCol`. This will change in later Spark + versions. + """ + return self._call("meanSquaredError") + + @property + @since("2.0.0") + def rootMeanSquaredError(self): + """ + Returns the root mean squared error, which is defined as the + square root of the mean squared error. + + Note: This ignores instance weights (setting all to 1.0) from + `LinearRegression.weightCol`. This will change in later Spark + versions. + """ + return self._call("rootMeanSquaredError") + + @property + @since("2.0.0") + def r2(self): + """ + Returns R^2^, the coefficient of determination. + Reference: http://en.wikipedia.org/wiki/Coefficient_of_determination + + Note: This ignores instance weights (setting all to 1.0) from + `LinearRegression.weightCol`. This will change in later Spark + versions. + """ + return self._call("r2") + + @property + @since("2.0.0") + def residuals(self): + """ + Residuals (label - predicted value) + """ + return self._call("residuals") + + @property + @since("2.0.0") + def numInstances(self): + """ + Number of instances in DataFrame predictions + """ + return self._call("numInstances") + + @property + @since("2.0.0") + def devianceResiduals(self): + """ + The weighted residuals, the usual residuals rescaled by the + square root of the instance weights. + """ + return self._call("devianceResiduals") + + @property + @since("2.0.0") + def coefficientStandardErrors(self): + """ + Standard error of estimated coefficients and intercept. + """ + return self._call("coefficientStandardErrors") + + @property + @since("2.0.0") + def tValues(self): + """ + T-statistic of estimated coefficients and intercept. + """ + return self._call("tValues") + + @property + @since("2.0.0") + def pValues(self): + """ + Two-sided p-value of estimated coefficients and intercept. + """ + return self._call("pValues") + + +class LinearRegressionTrainingSummary(LinearRegressionSummary): + """ + .. note:: Experimental + + Linear regression training results. Currently, the training summary ignores the + training coefficients except for the objective trace. + + .. versionadded:: 2.0.0 + """ + + @property + @since("2.0.0") + def totalIterations(self): + """ + Number of training iterations until termination. 
+ """ + return self._call("totalIterations") + @inherit_doc class IsotonicRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, diff --git a/python/pyspark/mllib/common.py b/python/pyspark/mllib/common.py index 9fda1b1682f57..888d475c3e421 100644 --- a/python/pyspark/mllib/common.py +++ b/python/pyspark/mllib/common.py @@ -130,20 +130,35 @@ def callMLlibFunc(name, *args): return callJavaFunc(sc, api, *args) -class JavaModelWrapper(object): +class JavaCallable(object): """ - Wrapper for the model in JVM + Wrapper for an object in JVM to make Java calls """ - def __init__(self, java_model): - self._sc = SparkContext.getOrCreate() - self._java_model = java_model + def __init__(self, sc, java_obj): + self._sc = sc + self._java_obj = java_obj def __del__(self): - self._sc._gateway.detach(self._java_model) + self._sc._gateway.detach(self._java_obj) + + @classmethod + def _fromActiveSparkContext(cls, java_obj): + """Create from a currently active context""" + sc = SparkContext._active_spark_context + return cls(sc, java_obj) + + def _call(self, name, *a): + """Call method of java_obj""" + return callJavaFunc(self._sc, getattr(self._java_obj, name), *a) + - def call(self, name, *a): - """Call method of java_model""" - return callJavaFunc(self._sc, getattr(self._java_model, name), *a) +class JavaModelWrapper(JavaCallable): + """ + Wrapper for the model in JVM + """ + def __init__(self, java_model): + sc = SparkContext.getOrCreate() + super(JavaModelWrapper, self).__init__(sc, java_model) def inherit_doc(cls): From 57f15cd675cd50a82ef479286d1c027b0c7f700b Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 2 Mar 2016 14:23:54 -0800 Subject: [PATCH 02/17] adding test for ml linear regression training summary --- python/pyspark/ml/tests.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 5fcfa9e61f6da..aa3d725d2cee3 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -443,6 +443,33 @@ def test_linear_regression(self): pass +class TrainingSummaryTest(PySparkTestCase): + + def test_linear_regression_summary(self): + from pyspark.mllib.linalg import Vectors + df = self.sc.parallelize([ + Row(label=1.0, weight=2.0, features=Vectors.dense(1.0)), + Row(label=0.0, weight=2.0, features=Vectors.dense(0.0))]).toDF() + lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight") + model = lr.fit(df) + self.assertTrue(model.hasSummary) + s = model.summary + # test that api is callable and returns expected types + self.assertGreater(s.totalIterations, 0) + self.assertTrue(isinstance(s.predictions, DataFrame)) + self.assertEqual(s.predictionCol, "prediction") + self.assertEqual(s.labelCol, "label") + self.assertAlmostEqual(s.explainedVariance, 0.25, 2) + self.assertAlmostEqual(s.meanAbsoluteError, 0.0) + self.assertAlmostEqual(s.meanSquaredError, 0.0) + self.assertAlmostEqual(s.rootMeanSquaredError, 0.0) + self.assertAlmostEqual(s.r2, 1.0, 2) + #residuals = s.residuals.rdd.map(lambda r: r.residuals).collect() + #self.assertTrue(isinstance(residuals, list) and isinstance(residuals[0], float)) + + #self.assertTrue(False) + + if __name__ == "__main__": from pyspark.ml.tests import * if xmlrunner: From 4d4bf1a8766834bb49b7014057bac5c0a7f8a03a Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 2 Mar 2016 17:32:09 -0800 Subject: [PATCH 03/17] completed test for ml linear regression training summary --- python/pyspark/ml/tests.py | 25 +++++++++++++++++-------- 
1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index aa3d725d2cee3..64585277b09d4 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -447,10 +447,12 @@ class TrainingSummaryTest(PySparkTestCase): def test_linear_regression_summary(self): from pyspark.mllib.linalg import Vectors - df = self.sc.parallelize([ - Row(label=1.0, weight=2.0, features=Vectors.dense(1.0)), - Row(label=0.0, weight=2.0, features=Vectors.dense(0.0))]).toDF() - lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight") + sqlContext = SQLContext(self.sc) + df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)), + (0.0, 2.0, Vectors.sparse(1, [], []))], + ["label", "weight", "features"]) + lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight", + fitIntercept=False) model = lr.fit(df) self.assertTrue(model.hasSummary) s = model.summary @@ -464,10 +466,17 @@ def test_linear_regression_summary(self): self.assertAlmostEqual(s.meanSquaredError, 0.0) self.assertAlmostEqual(s.rootMeanSquaredError, 0.0) self.assertAlmostEqual(s.r2, 1.0, 2) - #residuals = s.residuals.rdd.map(lambda r: r.residuals).collect() - #self.assertTrue(isinstance(residuals, list) and isinstance(residuals[0], float)) - - #self.assertTrue(False) + residuals = s.residuals.rdd.map(lambda r: r.residuals).collect() + self.assertTrue(isinstance(residuals, list) and isinstance(residuals[0], float)) + self.assertEqual(s.numInstances, 2) + devResiduals = s.devianceResiduals + self.assertTrue(isinstance(devResiduals, list) and isinstance(devResiduals[0], float)) + coefStdErr = s.coefficientStandardErrors + self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float)) + tValues = s.tValues + self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float)) + pValues = s.pValues + self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float)) if __name__ == "__main__": From f9da8e6df323f5c6447d6f9cae771b910023b3ef Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 3 Mar 2016 13:53:49 -0800 Subject: [PATCH 04/17] adding test for ml logistic regression training summary --- python/pyspark/ml/tests.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 64585277b09d4..6eb2a8b962ec6 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -478,6 +478,31 @@ def test_linear_regression_summary(self): pValues = s.pValues self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float)) + def test_logistic_regression_summary(self): + from pyspark.mllib.linalg import Vectors + sqlContext = SQLContext(self.sc) + df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)), + (0.0, 2.0, Vectors.sparse(1, [], []))], + ["label", "weight", "features"]) + lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False) + model = lr.fit(df) + self.assertTrue(model.hasSummary) + s = model.summary + # test that api is callable and returns expected types + self.assertTrue(isinstance(s.predictions, DataFrame)) + self.assertEqual(s.probabilityCol, "probability") + self.assertEqual(s.labelCol, "label") + self.assertEqual(s.featuresCol, "features") + objHist = s.objectiveHistory + self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float)) + self.assertGreater(s.totalIterations, 0) + self.assertTrue(isinstance(s.roc, DataFrame)) + 
self.assertAlmostEqual(s.areaUnderROC, 1.0, 2) + self.assertTrue(isinstance(s.pr, DataFrame)) + self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame)) + self.assertTrue(isinstance(s.precisionByThreshold, DataFrame)) + self.assertTrue(isinstance(s.recallByThreshold, DataFrame)) + if __name__ == "__main__": from pyspark.ml.tests import * From ce69f9d5d5748f95c63883c5920e59bbae4e3b79 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 3 Mar 2016 14:53:02 -0800 Subject: [PATCH 05/17] changed residual to only check that DataFrame is returned --- python/pyspark/ml/tests.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 6eb2a8b962ec6..1765f8ef66c46 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -466,8 +466,7 @@ def test_linear_regression_summary(self): self.assertAlmostEqual(s.meanSquaredError, 0.0) self.assertAlmostEqual(s.rootMeanSquaredError, 0.0) self.assertAlmostEqual(s.r2, 1.0, 2) - residuals = s.residuals.rdd.map(lambda r: r.residuals).collect() - self.assertTrue(isinstance(residuals, list) and isinstance(residuals[0], float)) + self.assertTrue(isinstance(s.residuals, DataFrame)) self.assertEqual(s.numInstances, 2) devResiduals = s.devianceResiduals self.assertTrue(isinstance(devResiduals, list) and isinstance(devResiduals[0], float)) From e3ac04cfcc9e90a649bd7e46346cee110562b2f7 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 9 Mar 2016 17:19:04 -0800 Subject: [PATCH 06/17] Could not make JavaModel.call private because used in mllib, added _java_model property to fix mllib errors --- python/pyspark/ml/classification.py | 26 +++++++++++------------ python/pyspark/ml/regression.py | 32 ++++++++++++++--------------- python/pyspark/mllib/common.py | 8 ++++++-- 3 files changed, 35 insertions(+), 31 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 142a5fcaf6616..f407313d87323 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -229,7 +229,7 @@ def summary(self): `trainingSummary == None`. """ java_blrt_summary = self._call_java("summary") - return BinaryLogisticRegressionTrainingSummary._fromActiveSparkContext(java_blrt_summary) + return BinaryLogisticRegressionTrainingSummary.fromActiveSparkContext(java_blrt_summary) @property @since("2.0.0") @@ -265,7 +265,7 @@ def predictions(self): """ Dataframe outputted by the model's `transform` method. """ - return self._call("predictions") + return self.call("predictions") @property @since("2.0.0") @@ -274,7 +274,7 @@ def probabilityCol(self): Field in "predictions" which gives the calibrated probability of each instance as a vector. """ - return self._call("probabilityCol") + return self.call("probabilityCol") @property @since("2.0.0") @@ -283,7 +283,7 @@ def labelCol(self): Field in "predictions" which gives the true label of each instance. """ - return self._call("labelCol") + return self.call("labelCol") @property @since("2.0.0") @@ -292,7 +292,7 @@ def featuresCol(self): Field in "predictions" which gives the features of each instance as a vector. """ - return self._call("featuresCol") + return self.call("featuresCol") class LogisticRegressionTrainingSummary(LogisticRegressionSummary): @@ -310,7 +310,7 @@ def objectiveHistory(self): """ Objective function (scaled loss + regularization) at each iteration. 
""" - return self._call("objectiveHistory") + return self.call("objectiveHistory") @property @since("2.0.0") @@ -318,7 +318,7 @@ def totalIterations(self): """ Number of training iterations until termination. """ - return self._call("totalIterations") + return self.call("totalIterations") class BinaryLogisticRegressionSummary(LogisticRegressionSummary): @@ -343,7 +343,7 @@ def roc(self): `LogisticRegression.weightCol`. This will change in later Spark versions. """ - return self._call("roc") + return self.call("roc") @property @since("2.0.0") @@ -356,7 +356,7 @@ def areaUnderROC(self): `LogisticRegression.weightCol`. This will change in later Spark versions. """ - return self._call("areaUnderROC") + return self.call("areaUnderROC") @property @since("2.0.0") @@ -369,7 +369,7 @@ def pr(self): `LogisticRegression.weightCol`. This will change in later Spark versions. """ - return self._call("pr") + return self.call("pr") @property @since("2.0.0") @@ -382,7 +382,7 @@ def fMeasureByThreshold(self): `LogisticRegression.weightCol`. This will change in later Spark versions. """ - return self._call("fMeasureByThreshold") + return self.call("fMeasureByThreshold") @property @since("2.0.0") @@ -396,7 +396,7 @@ def precisionByThreshold(self): `LogisticRegression.weightCol`. This will change in later Spark versions. """ - return self._call("precisionByThreshold") + return self.call("precisionByThreshold") @property @since("2.0.0") @@ -410,7 +410,7 @@ def recallByThreshold(self): `LogisticRegression.weightCol`. This will change in later Spark versions. """ - return self._call("recallByThreshold") + return self.call("recallByThreshold") class BinaryLogisticRegressionTrainingSummary(BinaryLogisticRegressionSummary, diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 7dd6eb2a9bc8c..13b0fef5d23b0 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -160,7 +160,7 @@ def summary(self): `trainingSummary == None`. """ java_lrt_summary = self._call_java("summary") - return LinearRegressionTrainingSummary._fromActiveSparkContext(java_lrt_summary) + return LinearRegressionTrainingSummary.fromActiveSparkContext(java_lrt_summary) @property @since("2.0.0") @@ -198,7 +198,7 @@ def predictions(self): """ Dataframe outputted by the model's `transform` method. """ - return self._call("predictions") + return self.call("predictions") @property @since("2.0.0") @@ -207,7 +207,7 @@ def predictionCol(self): Field in "predictions" which gives the predicted value of the label at each instance. """ - return self._call("predictionCol") + return self.call("predictionCol") @property @since("2.0.0") @@ -216,7 +216,7 @@ def labelCol(self): Field in "predictions" which gives the true label of each instance. """ - return self._call("labelCol") + return self.call("labelCol") @property @since("2.0.0") @@ -230,7 +230,7 @@ def explainedVariance(self): `LinearRegression.weightCol`. This will change in later Spark versions. """ - return self._call("explainedVariance") + return self.call("explainedVariance") @property @since("2.0.0") @@ -244,7 +244,7 @@ def meanAbsoluteError(self): `LinearRegression.weightCol`. This will change in later Spark versions. """ - return self._call("meanAbsoluteError") + return self.call("meanAbsoluteError") @property @since("2.0.0") @@ -258,7 +258,7 @@ def meanSquaredError(self): `LinearRegression.weightCol`. This will change in later Spark versions. 
""" - return self._call("meanSquaredError") + return self.call("meanSquaredError") @property @since("2.0.0") @@ -271,7 +271,7 @@ def rootMeanSquaredError(self): `LinearRegression.weightCol`. This will change in later Spark versions. """ - return self._call("rootMeanSquaredError") + return self.call("rootMeanSquaredError") @property @since("2.0.0") @@ -284,7 +284,7 @@ def r2(self): `LinearRegression.weightCol`. This will change in later Spark versions. """ - return self._call("r2") + return self.call("r2") @property @since("2.0.0") @@ -292,7 +292,7 @@ def residuals(self): """ Residuals (label - predicted value) """ - return self._call("residuals") + return self.call("residuals") @property @since("2.0.0") @@ -300,7 +300,7 @@ def numInstances(self): """ Number of instances in DataFrame predictions """ - return self._call("numInstances") + return self.call("numInstances") @property @since("2.0.0") @@ -309,7 +309,7 @@ def devianceResiduals(self): The weighted residuals, the usual residuals rescaled by the square root of the instance weights. """ - return self._call("devianceResiduals") + return self.call("devianceResiduals") @property @since("2.0.0") @@ -317,7 +317,7 @@ def coefficientStandardErrors(self): """ Standard error of estimated coefficients and intercept. """ - return self._call("coefficientStandardErrors") + return self.call("coefficientStandardErrors") @property @since("2.0.0") @@ -325,7 +325,7 @@ def tValues(self): """ T-statistic of estimated coefficients and intercept. """ - return self._call("tValues") + return self.call("tValues") @property @since("2.0.0") @@ -333,7 +333,7 @@ def pValues(self): """ Two-sided p-value of estimated coefficients and intercept. """ - return self._call("pValues") + return self.call("pValues") class LinearRegressionTrainingSummary(LinearRegressionSummary): @@ -352,7 +352,7 @@ def totalIterations(self): """ Number of training iterations until termination. 
""" - return self._call("totalIterations") + return self.call("totalIterations") @inherit_doc diff --git a/python/pyspark/mllib/common.py b/python/pyspark/mllib/common.py index 888d475c3e421..86997b2888b91 100644 --- a/python/pyspark/mllib/common.py +++ b/python/pyspark/mllib/common.py @@ -142,12 +142,12 @@ def __del__(self): self._sc._gateway.detach(self._java_obj) @classmethod - def _fromActiveSparkContext(cls, java_obj): + def fromActiveSparkContext(cls, java_obj): """Create from a currently active context""" sc = SparkContext._active_spark_context return cls(sc, java_obj) - def _call(self, name, *a): + def call(self, name, *a): """Call method of java_obj""" return callJavaFunc(self._sc, getattr(self._java_obj, name), *a) @@ -160,6 +160,10 @@ def __init__(self, java_model): sc = SparkContext.getOrCreate() super(JavaModelWrapper, self).__init__(sc, java_model) + @property + def _java_model(self): + return self._java_obj + def inherit_doc(cls): """ From 8d0f01a269adc8dca3383ecb9e4bf4f780806984 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 10 Mar 2016 16:42:00 -0800 Subject: [PATCH 07/17] moved JavaCallable to ML and changed to _call_java, it duplicates some code, but is cleaner and more consistent --- python/pyspark/ml/classification.py | 32 ++++++++++++------------ python/pyspark/ml/regression.py | 38 ++++++++++++++--------------- python/pyspark/ml/wrapper.py | 27 +++++++++++++++++--- python/pyspark/mllib/common.py | 35 ++++++-------------------- 4 files changed, 66 insertions(+), 66 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index f407313d87323..479c41e93e2ee 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -19,11 +19,11 @@ from pyspark import since from pyspark.ml.util import keyword_only -from pyspark.ml.wrapper import JavaEstimator, JavaModel +from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaCallable from pyspark.ml.param.shared import * from pyspark.ml.regression import ( RandomForestParams, TreeEnsembleParams, DecisionTreeModel, TreeEnsembleModels) -from pyspark.mllib.common import inherit_doc, JavaCallable +from pyspark.mllib.common import inherit_doc __all__ = ['LogisticRegression', 'LogisticRegressionModel', @@ -229,7 +229,7 @@ def summary(self): `trainingSummary == None`. """ java_blrt_summary = self._call_java("summary") - return BinaryLogisticRegressionTrainingSummary.fromActiveSparkContext(java_blrt_summary) + return BinaryLogisticRegressionTrainingSummary(java_blrt_summary) @property @since("2.0.0") @@ -248,7 +248,7 @@ def evaluate(self, df): @param dataset Test dataset to evaluate model on. "" java_blr_summary = self._call_java("evaluate", df) - return BinaryLogisticRegressionSummary._fromActiveSparkContext(java_blr_summary) + return BinaryLogisticRegressionSummary(java_blr_summary) """ @@ -265,7 +265,7 @@ def predictions(self): """ Dataframe outputted by the model's `transform` method. """ - return self.call("predictions") + return self._call_java("predictions") @property @since("2.0.0") @@ -274,7 +274,7 @@ def probabilityCol(self): Field in "predictions" which gives the calibrated probability of each instance as a vector. """ - return self.call("probabilityCol") + return self._call_java("probabilityCol") @property @since("2.0.0") @@ -283,7 +283,7 @@ def labelCol(self): Field in "predictions" which gives the true label of each instance. 
""" - return self.call("labelCol") + return self._call_java("labelCol") @property @since("2.0.0") @@ -292,7 +292,7 @@ def featuresCol(self): Field in "predictions" which gives the features of each instance as a vector. """ - return self.call("featuresCol") + return self._call_java("featuresCol") class LogisticRegressionTrainingSummary(LogisticRegressionSummary): @@ -310,7 +310,7 @@ def objectiveHistory(self): """ Objective function (scaled loss + regularization) at each iteration. """ - return self.call("objectiveHistory") + return self._call_java("objectiveHistory") @property @since("2.0.0") @@ -318,7 +318,7 @@ def totalIterations(self): """ Number of training iterations until termination. """ - return self.call("totalIterations") + return self._call_java("totalIterations") class BinaryLogisticRegressionSummary(LogisticRegressionSummary): @@ -343,7 +343,7 @@ def roc(self): `LogisticRegression.weightCol`. This will change in later Spark versions. """ - return self.call("roc") + return self._call_java("roc") @property @since("2.0.0") @@ -356,7 +356,7 @@ def areaUnderROC(self): `LogisticRegression.weightCol`. This will change in later Spark versions. """ - return self.call("areaUnderROC") + return self._call_java("areaUnderROC") @property @since("2.0.0") @@ -369,7 +369,7 @@ def pr(self): `LogisticRegression.weightCol`. This will change in later Spark versions. """ - return self.call("pr") + return self._call_java("pr") @property @since("2.0.0") @@ -382,7 +382,7 @@ def fMeasureByThreshold(self): `LogisticRegression.weightCol`. This will change in later Spark versions. """ - return self.call("fMeasureByThreshold") + return self._call_java("fMeasureByThreshold") @property @since("2.0.0") @@ -396,7 +396,7 @@ def precisionByThreshold(self): `LogisticRegression.weightCol`. This will change in later Spark versions. """ - return self.call("precisionByThreshold") + return self._call_java("precisionByThreshold") @property @since("2.0.0") @@ -410,7 +410,7 @@ def recallByThreshold(self): `LogisticRegression.weightCol`. This will change in later Spark versions. """ - return self.call("recallByThreshold") + return self._call_java("recallByThreshold") class BinaryLogisticRegressionTrainingSummary(BinaryLogisticRegressionSummary, diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 13b0fef5d23b0..8d064d8fec17c 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -20,8 +20,8 @@ from pyspark import since from pyspark.ml.param.shared import * from pyspark.ml.util import * -from pyspark.ml.wrapper import JavaEstimator, JavaModel -from pyspark.mllib.common import inherit_doc, JavaCallable +from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaCallable +from pyspark.mllib.common import inherit_doc __all__ = ['AFTSurvivalRegression', 'AFTSurvivalRegressionModel', @@ -160,7 +160,7 @@ def summary(self): `trainingSummary == None`. """ java_lrt_summary = self._call_java("summary") - return LinearRegressionTrainingSummary.fromActiveSparkContext(java_lrt_summary) + return LinearRegressionTrainingSummary(java_lrt_summary) @property @since("2.0.0") @@ -179,7 +179,7 @@ def evaluate(self, df): @param dataset Test dataset to evaluate model on. "" java_lr_summary = self._call_java("evaluate", df) - return LinearRegressionSummary._fromActiveSparkContext(java_lr_summary) + return LinearRegressionSummary(java_lr_summary) """ @@ -198,7 +198,7 @@ def predictions(self): """ Dataframe outputted by the model's `transform` method. 
""" - return self.call("predictions") + return self._call_java("predictions") @property @since("2.0.0") @@ -207,7 +207,7 @@ def predictionCol(self): Field in "predictions" which gives the predicted value of the label at each instance. """ - return self.call("predictionCol") + return self._call_java("predictionCol") @property @since("2.0.0") @@ -216,7 +216,7 @@ def labelCol(self): Field in "predictions" which gives the true label of each instance. """ - return self.call("labelCol") + return self._call_java("labelCol") @property @since("2.0.0") @@ -230,7 +230,7 @@ def explainedVariance(self): `LinearRegression.weightCol`. This will change in later Spark versions. """ - return self.call("explainedVariance") + return self._call_java("explainedVariance") @property @since("2.0.0") @@ -244,7 +244,7 @@ def meanAbsoluteError(self): `LinearRegression.weightCol`. This will change in later Spark versions. """ - return self.call("meanAbsoluteError") + return self._call_java("meanAbsoluteError") @property @since("2.0.0") @@ -258,7 +258,7 @@ def meanSquaredError(self): `LinearRegression.weightCol`. This will change in later Spark versions. """ - return self.call("meanSquaredError") + return self._call_java("meanSquaredError") @property @since("2.0.0") @@ -271,7 +271,7 @@ def rootMeanSquaredError(self): `LinearRegression.weightCol`. This will change in later Spark versions. """ - return self.call("rootMeanSquaredError") + return self._call_java("rootMeanSquaredError") @property @since("2.0.0") @@ -284,7 +284,7 @@ def r2(self): `LinearRegression.weightCol`. This will change in later Spark versions. """ - return self.call("r2") + return self._call_java("r2") @property @since("2.0.0") @@ -292,7 +292,7 @@ def residuals(self): """ Residuals (label - predicted value) """ - return self.call("residuals") + return self._call_java("residuals") @property @since("2.0.0") @@ -300,7 +300,7 @@ def numInstances(self): """ Number of instances in DataFrame predictions """ - return self.call("numInstances") + return self._call_java("numInstances") @property @since("2.0.0") @@ -309,7 +309,7 @@ def devianceResiduals(self): The weighted residuals, the usual residuals rescaled by the square root of the instance weights. """ - return self.call("devianceResiduals") + return self._call_java("devianceResiduals") @property @since("2.0.0") @@ -317,7 +317,7 @@ def coefficientStandardErrors(self): """ Standard error of estimated coefficients and intercept. """ - return self.call("coefficientStandardErrors") + return self._call_java("coefficientStandardErrors") @property @since("2.0.0") @@ -325,7 +325,7 @@ def tValues(self): """ T-statistic of estimated coefficients and intercept. """ - return self.call("tValues") + return self._call_java("tValues") @property @since("2.0.0") @@ -333,7 +333,7 @@ def pValues(self): """ Two-sided p-value of estimated coefficients and intercept. """ - return self.call("pValues") + return self._call_java("pValues") class LinearRegressionTrainingSummary(LinearRegressionSummary): @@ -352,7 +352,7 @@ def totalIterations(self): """ Number of training iterations until termination. """ - return self.call("totalIterations") + return self._call_java("totalIterations") @inherit_doc diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index f8feaa1dfa2be..cd1d064009a87 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -34,10 +34,12 @@ class JavaWrapper(Params): __metaclass__ = ABCMeta - #: The wrapped Java companion object. 
Subclasses should initialize - #: it properly. The param values in the Java object should be - #: synced with the Python wrapper in fit/transform/evaluate/copy. - _java_obj = None + def __init__(self): + super(JavaWrapper, self).__init__() + #: The wrapped Java companion object. Subclasses should initialize + #: it properly. The param values in the Java object should be + #: synced with the Python wrapper in fit/transform/evaluate/copy. + self._java_obj = None @staticmethod def _new_java_obj(java_class, *args): @@ -191,3 +193,20 @@ def _call_java(self, name, *args): sc = SparkContext._active_spark_context java_args = [_py2java(sc, arg) for arg in args] return _java2py(sc, m(*java_args)) + + +class JavaCallable(object): + """ + Wrapper for a plain object in JVM to make Java calls + """ + def __init__(self, java_obj, sc=None): + self._sc = sc if sc is not None else SparkContext._active_spark_context + self._java_obj = java_obj + + def __del__(self): + self._sc._gateway.detach(self._java_obj) + + def _call_java(self, name, *args): + m = getattr(self._java_obj, name) + java_args = [_py2java(self._sc, arg) for arg in args] + return _java2py(self._sc, m(*java_args)) diff --git a/python/pyspark/mllib/common.py b/python/pyspark/mllib/common.py index 86997b2888b91..9fda1b1682f57 100644 --- a/python/pyspark/mllib/common.py +++ b/python/pyspark/mllib/common.py @@ -130,39 +130,20 @@ def callMLlibFunc(name, *args): return callJavaFunc(sc, api, *args) -class JavaCallable(object): +class JavaModelWrapper(object): """ - Wrapper for an object in JVM to make Java calls + Wrapper for the model in JVM """ - def __init__(self, sc, java_obj): - self._sc = sc - self._java_obj = java_obj + def __init__(self, java_model): + self._sc = SparkContext.getOrCreate() + self._java_model = java_model def __del__(self): - self._sc._gateway.detach(self._java_obj) - - @classmethod - def fromActiveSparkContext(cls, java_obj): - """Create from a currently active context""" - sc = SparkContext._active_spark_context - return cls(sc, java_obj) + self._sc._gateway.detach(self._java_model) def call(self, name, *a): - """Call method of java_obj""" - return callJavaFunc(self._sc, getattr(self._java_obj, name), *a) - - -class JavaModelWrapper(JavaCallable): - """ - Wrapper for the model in JVM - """ - def __init__(self, java_model): - sc = SparkContext.getOrCreate() - super(JavaModelWrapper, self).__init__(sc, java_model) - - @property - def _java_model(self): - return self._java_obj + """Call method of java_model""" + return callJavaFunc(self._sc, getattr(self._java_model, name), *a) def inherit_doc(cls): From 460881cffcb9b6bce35b822e4a9999325352074d Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 16 Mar 2016 11:45:16 -0700 Subject: [PATCH 08/17] reverted change to JavaWrapper static _java_obj, to be done in another PR --- python/pyspark/ml/wrapper.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index cd1d064009a87..78225c043e9ff 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -34,12 +34,10 @@ class JavaWrapper(Params): __metaclass__ = ABCMeta - def __init__(self): - super(JavaWrapper, self).__init__() - #: The wrapped Java companion object. Subclasses should initialize - #: it properly. The param values in the Java object should be - #: synced with the Python wrapper in fit/transform/evaluate/copy. - self._java_obj = None + #: The wrapped Java companion object. 
Subclasses should initialize + #: it properly. The param values in the Java object should be + #: synced with the Python wrapper in fit/transform/evaluate/copy. + _java_obj = None @staticmethod def _new_java_obj(java_class, *args): From 49a1f79f6a0756ad135e7c3a83cfc87d592869ed Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 24 Mar 2016 11:25:06 -0700 Subject: [PATCH 09/17] Made JavaCallable class able to be a mixin for JavaModel to reuse _call_java --- python/pyspark/ml/wrapper.py | 47 ++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index da487b6a438c6..2410d56c795b8 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -171,8 +171,30 @@ def _transform(self, dataset): return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx) +class JavaCallable(object): + """ + Wrapper for a plain object in JVM to make Java calls, can be used + as a mixin to another class that defines a _java_obj wrapper + """ + def __init__(self, java_obj=None, sc=None): + super(JavaCallable, self).__init__() + self._sc = sc if sc is not None else SparkContext._active_spark_context + # if this class is a mixin and _java_obj is already defined then don't initialize + if java_obj is not None or not hasattr(self, "_java_obj"): + self._java_obj = java_obj + + def __del__(self): + if self._java_obj is not None: + self._sc._gateway.detach(self._java_obj) + + def _call_java(self, name, *args): + m = getattr(self._java_obj, name) + java_args = [_py2java(self._sc, arg) for arg in args] + return _java2py(self._sc, m(*java_args)) + + @inherit_doc -class JavaModel(Model, JavaTransformer): +class JavaModel(Model, JavaCallable, JavaTransformer): """ Base class for :py:class:`Model`s that wrap Java/Scala implementations. 
Subclasses should inherit this class before @@ -217,26 +239,3 @@ def copy(self, extra=None): that._java_obj = self._java_obj.copy(self._empty_java_param_map()) that._transfer_params_to_java() return that - - def _call_java(self, name, *args): - m = getattr(self._java_obj, name) - sc = SparkContext._active_spark_context - java_args = [_py2java(sc, arg) for arg in args] - return _java2py(sc, m(*java_args)) - - -class JavaCallable(object): - """ - Wrapper for a plain object in JVM to make Java calls - """ - def __init__(self, java_obj, sc=None): - self._sc = sc if sc is not None else SparkContext._active_spark_context - self._java_obj = java_obj - - def __del__(self): - self._sc._gateway.detach(self._java_obj) - - def _call_java(self, name, *args): - m = getattr(self._java_obj, name) - java_args = [_py2java(self._sc, arg) for arg in args] - return _java2py(self._sc, m(*java_args)) From 3571838b60fb1f0cf869e89e127b6ad7b95bd3b3 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 24 Mar 2016 15:38:00 -0700 Subject: [PATCH 10/17] Enabled evaluate() for Linear and Logistic regression, now that it will be public --- python/pyspark/ml/classification.py | 18 +++++++++--------- python/pyspark/ml/regression.py | 18 +++++++++--------- python/pyspark/ml/tests.py | 8 ++++++++ 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index bf01dee9e618b..063622d0269c9 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -254,16 +254,16 @@ def hasSummary(self): """ return self._call_java("hasSummary") - """ - TODO: enable once Scala API is made public - def evaluate(self, df): - "" - Evaluates the model on a testset. - @param dataset Test dataset to evaluate model on. - "" - java_blr_summary = self._call_java("evaluate", df) + @since("2.0.0") + def evaluate(self, dataset): + """ + Evaluates the model on a test dataset. + + :param dataset: + Test dataset to evaluate model on. + """ + java_blr_summary = self._call_java("evaluate", dataset) return BinaryLogisticRegressionSummary(java_blr_summary) - """ class LogisticRegressionSummary(JavaCallable): diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 74b7b287b4e8f..e45bffc5c09d1 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -171,16 +171,16 @@ def hasSummary(self): """ return self._call_java("hasSummary") - """ - TODO: enable once Scala API is made public - def evaluate(self, df): - "" - Evaluates the model on a testset. - @param dataset Test dataset to evaluate model on. - "" - java_lr_summary = self._call_java("evaluate", df) + @since("2.0.0") + def evaluate(self, dataset): + """ + Evaluates the model on a test dataset. + + :param dataset: + Test dataset to evaluate model on. 
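+        :return:
+            A :py:class:`LinearRegressionSummary` summarizing the model
+            over the given dataset.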
+ """ + java_lr_summary = self._call_java("evaluate", dataset) return LinearRegressionSummary(java_lr_summary) - """ class LinearRegressionSummary(JavaCallable): diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 350ea26a595ee..21bcd10482add 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -700,6 +700,10 @@ def test_linear_regression_summary(self): self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float)) pValues = s.pValues self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float)) + # test evaluation (with training dataset) produces a summary with same values + # one check is enough to verify a summary is returned, Scala version runs full test + sameSummary = model.evaluate(df) + self.assertAlmostEqual(sameSummary.explainedVariance, s.explainedVariance) def test_logistic_regression_summary(self): from pyspark.mllib.linalg import Vectors @@ -725,6 +729,10 @@ def test_logistic_regression_summary(self): self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame)) self.assertTrue(isinstance(s.precisionByThreshold, DataFrame)) self.assertTrue(isinstance(s.recallByThreshold, DataFrame)) + # test evaluation (with training dataset) produces a summary with same values + # one check is enough to verify a summary is returned, Scala version runs full test + sameSummary = model.evaluate(df) + self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC) if __name__ == "__main__": From 4ba3f731c58918c5e2eac13338dc834e411b933f Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Mon, 4 Apr 2016 15:18:41 -0700 Subject: [PATCH 11/17] added featuresCol and objectiveHistory to LinearRegressionTrainingSummary, added check in evaluation(dataset) to make sure input is a DataFrame, fixed issues in docstrings --- .../classification/LogisticRegression.scala | 8 +++--- .../ml/regression/LinearRegression.scala | 6 ++++- python/pyspark/ml/classification.py | 23 ++++++++++++----- python/pyspark/ml/regression.py | 25 ++++++++++++++++--- python/pyspark/ml/tests.py | 3 +++ 5 files changed, 51 insertions(+), 14 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index aeb94a6600e51..ee836a6bb30af 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -777,10 +777,10 @@ sealed trait LogisticRegressionSummary extends Serializable { /** Dataframe outputted by the model's `transform` method. */ def predictions: DataFrame - /** Field in "predictions" which gives the calibrated probability of each instance as a vector. */ + /** Field in "predictions" which gives the calibrated probability of each class as a vector. */ def probabilityCol: String - /** Field in "predictions" which gives the true label of each instance. */ + /** Field in "predictions" which gives the true label of each instance (if available). */ def labelCol: String /** Field in "predictions" which gives the features of each instance as a vector. */ @@ -794,7 +794,7 @@ sealed trait LogisticRegressionSummary extends Serializable { * * @param predictions dataframe outputted by the model's `transform` method. * @param probabilityCol field in "predictions" which gives the calibrated probability of - * each instance as a vector. + * each class as a vector. 
* @param labelCol field in "predictions" which gives the true label of each instance. * @param featuresCol field in "predictions" which gives the features of each instance as a vector. * @param objectiveHistory objective function (scaled loss + regularization) at each iteration. @@ -818,7 +818,7 @@ class BinaryLogisticRegressionTrainingSummary private[classification] ( * * @param predictions dataframe outputted by the model's `transform` method. * @param probabilityCol field in "predictions" which gives the calibrated probability of - * each instance. + * each class. * @param labelCol field in "predictions" which gives the true label of each instance. * @param featuresCol field in "predictions" which gives the features of each instance as a vector. */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 2633c06f40561..952f20bc62bac 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -511,7 +511,7 @@ object LinearRegressionModel extends MLReadable[LinearRegressionModel] { /** * :: Experimental :: * Linear regression training results. Currently, the training summary ignores the - * training coefficients except for the objective trace. + * training weights except for the objective trace. * * @param predictions predictions outputted by the model's `transform` method. * @param objectiveHistory objective function (scaled loss + regularization) at each iteration. @@ -539,6 +539,10 @@ class LinearRegressionTrainingSummary private[regression] ( * Linear regression results evaluated on a dataset. * * @param predictions predictions outputted by the model's `transform` method. + * @param predictionCol Field in "predictions" which gives the predicted value of the label at + * each instance. + * @param labelCol Field in "predictions" which gives the true label of each instance + * (if available). */ @Since("1.5.0") @Experimental diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index cc718a98a3226..314865c36d339 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -25,9 +25,11 @@ from pyspark.ml.regression import ( RandomForestParams, TreeEnsembleParams, DecisionTreeModel, TreeEnsembleModels) from pyspark.mllib.common import inherit_doc +from pyspark.sql import DataFrame __all__ = ['LogisticRegression', 'LogisticRegressionModel', + 'LogisticRegressionSummary', 'LogisticRegressionTrainingSummary' 'BinaryLogisticRegressionSummary', 'BinaryLogisticRegressionTrainingSummary', 'DecisionTreeClassifier', 'DecisionTreeClassificationModel', 'GBTClassifier', 'GBTClassificationModel', @@ -240,9 +242,10 @@ def summary(self): """ Gets summary (e.g. residuals, mse, r-squared ) of model on training set. An exception is thrown if - `trainingSummary == None`. + `trainingSummary is None`. """ java_blrt_summary = self._call_java("summary") + # Note: Once multiclass is added, update this to return correct summary return BinaryLogisticRegressionTrainingSummary(java_blrt_summary) @property @@ -260,8 +263,11 @@ def evaluate(self, dataset): Evaluates the model on a test dataset. :param dataset: - Test dataset to evaluate model on. 
+ Test dataset to evaluate model on, where dataset is an + instance of :py:class:`pyspark.sql.DataFrame` """ + if not isinstance(dataset, DataFrame): + raise ValueError("dataset must be a DataFrame but got %s." % type(dataset)) java_blr_summary = self._call_java("evaluate", dataset) return BinaryLogisticRegressionSummary(java_blr_summary) @@ -286,7 +292,7 @@ def predictions(self): def probabilityCol(self): """ Field in "predictions" which gives the calibrated probability - of each instance as a vector. + of each class as a vector. """ return self._call_java("probabilityCol") @@ -295,7 +301,7 @@ def probabilityCol(self): def labelCol(self): """ Field in "predictions" which gives the true label of each - instance. + instance (if available). """ return self._call_java("labelCol") @@ -309,6 +315,7 @@ def featuresCol(self): return self._call_java("featuresCol") +@inherit_doc class LogisticRegressionTrainingSummary(LogisticRegressionSummary): """ Abstraction for multinomial Logistic Regression Training results. @@ -322,7 +329,8 @@ class LogisticRegressionTrainingSummary(LogisticRegressionSummary): @since("2.0.0") def objectiveHistory(self): """ - Objective function (scaled loss + regularization) at each iteration. + Objective function (scaled loss + regularization) at each + iteration. """ return self._call_java("objectiveHistory") @@ -335,6 +343,7 @@ def totalIterations(self): return self._call_java("totalIterations") +@inherit_doc class BinaryLogisticRegressionSummary(LogisticRegressionSummary): """ .. note:: Experimental @@ -377,7 +386,8 @@ def areaUnderROC(self): def pr(self): """ Returns the precision-recall curve, which is an Dataframe - containing two fields recall, precision with (0.0, 1.0) prepended to it. + containing two fields recall, precision with (0.0, 1.0) prepended + to it. Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. This will change in later Spark @@ -427,6 +437,7 @@ def recallByThreshold(self): return self._call_java("recallByThreshold") +@inherit_doc class BinaryLogisticRegressionTrainingSummary(BinaryLogisticRegressionSummary, LogisticRegressionTrainingSummary): """ diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 378865f3d94b2..9951a1c7c7eef 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -157,7 +157,7 @@ def summary(self): """ Gets summary (e.g. residuals, mse, r-squared ) of model on training set. An exception is thrown if - `trainingSummary == None`. + `trainingSummary is None`. """ java_lrt_summary = self._call_java("summary") return LinearRegressionTrainingSummary(java_lrt_summary) @@ -214,7 +214,7 @@ def predictionCol(self): def labelCol(self): """ Field in "predictions" which gives the true label of each - instance. + instance (if available). """ return self._call_java("labelCol") @@ -336,16 +336,35 @@ def pValues(self): return self._call_java("pValues") +@inherit_doc class LinearRegressionTrainingSummary(LinearRegressionSummary): """ .. note:: Experimental Linear regression training results. Currently, the training summary ignores the - training coefficients except for the objective trace. + training weights except for the objective trace. .. versionadded:: 2.0.0 """ + @property + @since("2.0.0") + def featuresCol(self): + """ + Field in "predictions" which gives the features of each instance + as a vector. 
+ """ + return self._call_java("featuresCol") + + @property + @since("2.0.0") + def objectiveHistory(self): + """ + Objective function (scaled loss + regularization) at each + iteration. + """ + return self._call_java("objectiveHistory") + @property @since("2.0.0") def totalIterations(self): diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 5216f2009ae5f..a2c4cd8b6cc3d 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -726,6 +726,9 @@ def test_linear_regression_summary(self): self.assertTrue(isinstance(s.predictions, DataFrame)) self.assertEqual(s.predictionCol, "prediction") self.assertEqual(s.labelCol, "label") + self.assertEqual(s.featuresCol, "features") + objHist = s.objectiveHistory + self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float)) self.assertAlmostEqual(s.explainedVariance, 0.25, 2) self.assertAlmostEqual(s.meanAbsoluteError, 0.0) self.assertAlmostEqual(s.meanSquaredError, 0.0) From d23f546d5c0742c79840f2bbbd80eb22adcf32c0 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Tue, 5 Apr 2016 10:07:49 -0700 Subject: [PATCH 12/17] moved featuresCol from LinearRegressionTrainingSummary to LinearRegressionSummary --- .../ml/regression/LinearRegression.scala | 19 +++++++++++++------ python/pyspark/ml/regression.py | 18 +++++++++--------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 952f20bc62bac..2fc05b480465c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -190,9 +190,9 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String summaryModel.transform(dataset), predictionColName, $(labelCol), + $(featuresCol), summaryModel, model.diagInvAtWA.toArray, - $(featuresCol), Array(0D)) return lrModel.setSummary(trainingSummary) @@ -249,9 +249,9 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String summaryModel.transform(dataset), predictionColName, $(labelCol), + $(featuresCol), model, Array(0D), - $(featuresCol), Array(0D)) return copyValues(model.setSummary(trainingSummary)) } else { @@ -356,9 +356,9 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String summaryModel.transform(dataset), predictionColName, $(labelCol), + $(featuresCol), model, Array(0D), - $(featuresCol), objectiveHistory) model.setSummary(trainingSummary) } @@ -421,7 +421,7 @@ class LinearRegressionModel private[ml] ( // Handle possible missing or invalid prediction columns val (summaryModel, predictionColName) = findSummaryModelAndPredictionCol() new LinearRegressionSummary(summaryModel.transform(dataset), predictionColName, - $(labelCol), summaryModel, Array(0D)) + $(labelCol), $(featuresCol), summaryModel, Array(0D)) } /** @@ -522,11 +522,17 @@ class LinearRegressionTrainingSummary private[regression] ( predictions: DataFrame, predictionCol: String, labelCol: String, + featuresCol: String, model: LinearRegressionModel, diagInvAtWA: Array[Double], - val featuresCol: String, val objectiveHistory: Array[Double]) - extends LinearRegressionSummary(predictions, predictionCol, labelCol, model, diagInvAtWA) { + extends LinearRegressionSummary( + predictions, + predictionCol, + labelCol, + featuresCol, + model, + diagInvAtWA) { /** Number of training iterations until termination 
*/ @Since("1.5.0") @@ -550,6 +556,7 @@ class LinearRegressionSummary private[regression] ( @transient val predictions: DataFrame, val predictionCol: String, val labelCol: String, + val featuresCol: String, val model: LinearRegressionModel, private val diagInvAtWA: Array[Double]) extends Serializable { diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 9951a1c7c7eef..459944d0505d6 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -218,6 +218,15 @@ def labelCol(self): """ return self._call_java("labelCol") + @property + @since("2.0.0") + def featuresCol(self): + """ + Field in "predictions" which gives the features of each instance + as a vector. + """ + return self._call_java("featuresCol") + @property @since("2.0.0") def explainedVariance(self): @@ -347,15 +356,6 @@ class LinearRegressionTrainingSummary(LinearRegressionSummary): .. versionadded:: 2.0.0 """ - @property - @since("2.0.0") - def featuresCol(self): - """ - Field in "predictions" which gives the features of each instance - as a vector. - """ - return self._call_java("featuresCol") - @property @since("2.0.0") def objectiveHistory(self): From b44c2338ec30028c4c701628527d5ee5e5abacaa Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Tue, 5 Apr 2016 13:33:17 -0700 Subject: [PATCH 13/17] was missing comma in classification __all_ list --- python/pyspark/ml/classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 314865c36d339..4d3a10ffba3db 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -29,7 +29,7 @@ __all__ = ['LogisticRegression', 'LogisticRegressionModel', - 'LogisticRegressionSummary', 'LogisticRegressionTrainingSummary' + 'LogisticRegressionSummary', 'LogisticRegressionTrainingSummary', 'BinaryLogisticRegressionSummary', 'BinaryLogisticRegressionTrainingSummary', 'DecisionTreeClassifier', 'DecisionTreeClassificationModel', 'GBTClassifier', 'GBTClassificationModel', From e0ea89ed194298241156fc7edeec56a60bda3a61 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Tue, 5 Apr 2016 13:53:16 -0700 Subject: [PATCH 14/17] added seealso for solver dependent summary metrics --- python/pyspark/ml/regression.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 459944d0505d6..a2099b41ba29f 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -325,6 +325,9 @@ def devianceResiduals(self): def coefficientStandardErrors(self): """ Standard error of estimated coefficients and intercept. + This value is only available when using the "normal" solver. + + .. seealso:: :py:attr:`LinearRegression.solver` """ return self._call_java("coefficientStandardErrors") @@ -333,6 +336,9 @@ def coefficientStandardErrors(self): def tValues(self): """ T-statistic of estimated coefficients and intercept. + This value is only available when using the "normal" solver. + + .. seealso:: :py:attr:`LinearRegression.solver` """ return self._call_java("tValues") @@ -341,6 +347,9 @@ def tValues(self): def pValues(self): """ Two-sided p-value of estimated coefficients and intercept. + This value is only available when using the "normal" solver. + + .. 
@@ -341,6 +347,9 @@ def pValues(self):
         """
         Two-sided p-value of estimated coefficients and intercept.
+        This value is only available when using the "normal" solver.
+
+        .. seealso:: :py:attr:`LinearRegression.solver`
         """
         return self._call_java("pValues")
 
@@ -362,6 +371,9 @@ def objectiveHistory(self):
         """
         Objective function (scaled loss + regularization) at each
         iteration.
+        This value is only available when using the "l-bfgs" solver.
+
+        .. seealso:: :py:attr:`LinearRegression.solver`
         """
         return self._call_java("objectiveHistory")
 
@@ -370,6 +382,9 @@ def totalIterations(self):
         """
         Number of training iterations until termination.
+        This value is only available when using the "l-bfgs" solver.
+
+        .. seealso:: :py:attr:`LinearRegression.solver`
         """
         return self._call_java("totalIterations")

From 7f2bed6aea450fae050e2c72967ec43d63f40ca0 Mon Sep 17 00:00:00 2001
From: Bryan Cutler
Date: Tue, 5 Apr 2016 14:26:02 -0700
Subject: [PATCH 15/17] cleaned up docs

---
 .../classification/LogisticRegression.scala |  2 +-
 .../ml/regression/LinearRegression.scala    | 20 ++++++++++++++++---
 python/pyspark/ml/classification.py         |  2 +-
 python/pyspark/ml/regression.py             |  2 +-
 4 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index ee836a6bb30af..37182928cccc8 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -818,7 +818,7 @@ class BinaryLogisticRegressionTrainingSummary private[classification] (
 *
 * @param predictions dataframe outputted by the model's `transform` method.
 * @param probabilityCol field in "predictions" which gives the calibrated probability of
- *                       each class.
+ *                       each class as a vector.
 * @param labelCol field in "predictions" which gives the true label of each instance.
 * @param featuresCol field in "predictions" which gives the features of each instance as a vector.
 */
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 2fc05b480465c..9619e72a4594a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -534,7 +534,12 @@ class LinearRegressionTrainingSummary private[regression] (
     model,
     diagInvAtWA) {
 
-  /** Number of training iterations until termination */
+  /**
+   * Number of training iterations until termination
+   *
+   * This value is only available when using the "l-bfgs" solver.
+   * @see [[LinearRegression.solver]]
+   */
   @Since("1.5.0")
   val totalIterations = objectiveHistory.length
 
@@ -547,8 +552,8 @@ class LinearRegressionTrainingSummary private[regression] (
 * @param predictions predictions outputted by the model's `transform` method.
 * @param predictionCol Field in "predictions" which gives the predicted value of the label at
 *                      each instance.
- * @param labelCol Field in "predictions" which gives the true label of each instance
- *                 (if available).
+ * @param labelCol Field in "predictions" which gives the true label of each instance.
+ * @param featuresCol Field in "predictions" which gives the features of each instance as a vector.
 */
 @Since("1.5.0")
 @Experimental
@@ -650,6 +655,9 @@ class LinearRegressionSummary private[regression] (
 
   /**
    * Standard error of estimated coefficients and intercept.
+   *
+   * This value is only available when using the "normal" solver.
+   * @see [[LinearRegression.solver]]
    */
   lazy val coefficientStandardErrors: Array[Double] = {
     if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) {
@@ -671,6 +679,9 @@ class LinearRegressionSummary private[regression] (
 
   /**
    * T-statistic of estimated coefficients and intercept.
+   *
+   * This value is only available when using the "normal" solver.
+   * @see [[LinearRegression.solver]]
   */
   lazy val tValues: Array[Double] = {
     if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) {
@@ -688,6 +699,9 @@ class LinearRegressionSummary private[regression] (
 
   /**
    * Two-sided p-value of estimated coefficients and intercept.
+   *
+   * This value is only available when using the "normal" solver.
+   * @see [[LinearRegression.solver]]
   */
   lazy val pValues: Array[Double] = {
     if (diagInvAtWA.length == 1 && diagInvAtWA(0) == 0) {
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 4d3a10ffba3db..be7f9ea9efc11 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -301,7 +301,7 @@ def probabilityCol(self):
     def labelCol(self):
         """
         Field in "predictions" which gives the true label of each
-        instance (if available).
+        instance.
         """
         return self._call_java("labelCol")
 
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index a2099b41ba29f..071ba9b7644ca 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -214,7 +214,7 @@ def predictionCol(self):
     def labelCol(self):
         """
         Field in "predictions" which gives the true label of each
-        instance (if available).
+        instance.
         """
         return self._call_java("labelCol")
 

From 13a10ecfb24bed6a7708fa1a683855b1416accdd Mon Sep 17 00:00:00 2001
From: Bryan Cutler
Date: Tue, 5 Apr 2016 14:32:29 -0700
Subject: [PATCH 16/17] needed to check that evaluate() input for
 LinearRegressionModel is a DataFrame

---
 python/pyspark/ml/regression.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 071ba9b7644ca..6cd1b4bf3a149 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -22,6 +22,7 @@
 from pyspark.ml.util import *
 from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaCallable
 from pyspark.mllib.common import inherit_doc
+from pyspark.sql import DataFrame
 
 __all__ = ['AFTSurvivalRegression', 'AFTSurvivalRegressionModel',
@@ -177,8 +178,11 @@ def evaluate(self, dataset):
         Evaluates the model on a test dataset.
 
         :param dataset:
-          Test dataset to evaluate model on.
+          Test dataset to evaluate model on, where dataset is an
+          instance of :py:class:`pyspark.sql.DataFrame`
         """
+        if not isinstance(dataset, DataFrame):
+            raise ValueError("dataset must be a DataFrame but got %s." % type(dataset))
         java_lr_summary = self._call_java("evaluate", dataset)
         return LinearRegressionSummary(java_lr_summary)
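The solver notes above split the summary metrics into two groups; a sketch of both paths (not from the patch; assumes a hypothetical training DataFrame `df` with "label" and "features" columns):

    # "normal" solver: closed-form statistics are defined
    m1 = LinearRegression(solver="normal").fit(df)
    print(m1.summary.coefficientStandardErrors)
    print(m1.summary.tValues, m1.summary.pValues)

    # "l-bfgs" solver: the iterative trace is defined instead
    m2 = LinearRegression(solver="l-bfgs").fit(df)
    print(m2.summary.objectiveHistory, m2.summary.totalIterations)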
From 1f030e91369404535d107a58cfc786f7c9299ab9 Mon Sep 17 00:00:00 2001
From: Bryan Cutler
Date: Tue, 5 Apr 2016 16:50:09 -0700
Subject: [PATCH 17/17] added MiMa exclude for change in LinearRegressionSummary constructor

---
 project/MimaExcludes.scala | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 2be490b94264a..9762ef96f9498 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -601,6 +601,9 @@
       // [SPARK-13674][SQL] Add wholestage codegen support to Sample
       ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.util.random.PoissonSampler.this"),
       ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.util.random.PoissonSampler.this")
+    ) ++ Seq(
+      // [SPARK-13430][ML] moved featuresCol from LinearRegressionTrainingSummary to LinearRegressionSummary
+      ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.ml.regression.LinearRegressionSummary.this")
     )
   case v if v.startsWith("1.6") =>
     Seq(