From 50cfafe9e0cda8702fa43fd27ea75bfbd042268e Mon Sep 17 00:00:00 2001 From: Ming Jiang Date: Sun, 3 Sep 2017 21:22:33 -0700 Subject: [PATCH 1/7] added probabilityCol to LogisticRegressionSummary --- python/pyspark/ml/classification.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index f0f42a34942d7..3fc5c5ead5b5e 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -585,6 +585,14 @@ def probabilityCol(self): """ return self._call_java("probabilityCol") + @property + @since("2.3.0") + def predictionCol(self): + """ + Field in "predictions" which gives the prediction of each class. + """ + return self._call_java("predictionCol") + @property @since("2.0.0") def labelCol(self): From 60579d5f36ef26f6e3ec675a795ccc86d6a71c8d Mon Sep 17 00:00:00 2001 From: Ming Jiang Date: Mon, 4 Sep 2017 22:08:15 -0700 Subject: [PATCH 2/7] modified LogisticRegressionSummary and LogisticRegressionModel in classification.py --- python/pyspark/ml/classification.py | 115 +++++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 3fc5c5ead5b5e..c4e7f408247dd 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -529,8 +529,13 @@ def summary(self): """ if self.hasSummary: java_blrt_summary = self._call_java("summary") - # Note: Once multiclass is added, update this to return correct summary - return BinaryLogisticRegressionTrainingSummary(java_blrt_summary) + java_blrt_interceptVector = self._call_java("interceptVector") + java_blrt_numClasses = self._call_java("numClasses") + java_blrt_binarysummary = self._call_java("binarySummary") + if (len(java_blrt_interceptVector) == 1): + return BinaryLogisticRegressionTrainingSummary(java_blrt_binarysummary) + else: + return LogisticRegressionTrainingSummary(java_blrt_summary) else: raise RuntimeError("No training summary available for this %s" % self.__class__.__name__) @@ -611,6 +616,112 @@ def featuresCol(self): """ return self._call_java("featuresCol") + @property + @since("2.3.0") + def labels(self): + """ + Returns the sequence of labels in ascending order. This order matches the order used + in metrics which are specified as arrays over labels, e.g., truePositiveRateByLabel. + + Note: In most cases, it will be values {0.0, 1.0, ..., numClasses-1}, However, if the + training set is missing a label, then all of the arrays over labels + (e.g., from truePositiveRateByLabel) will be of length numClasses-1 instead of the + expected numClasses. + """ + return self._call_java("labels") + + @property + @since("2.3.0") + def truePositiveRateByLabel(self): + """ + Returns true positive rate for each label (category). + """ + return self._call_java("truePositiveRateByLabel") + + @property + @since("2.3.0") + def falsePositiveRateByLabel(self): + """ + Returns false positive rate for each label (category). + """ + return self._call_java("falsePositiveRateByLabel") + + @property + @since("2.3.0") + def precisionByLabel(self): + """ + Returns precision for each label (category). + """ + return self._call_java("precisionByLabel") + + @property + @since("2.3.0") + def recallByLabel(self): + """ + Returns recall for each label (category). + """ + return self._call_java("recallByLabel") + + @property + @since("2.3.0") + def fMeasureByLabel(self, beta=1.0): + """ + Returns f-measure for each label (category). + """ + return self._call_java("fMeasureByLabel", beta) + + @property + @since("2.3.0") + def accuracy(self): + """ + Returns accuracy. + (equals to the total number of correctly classified instances + out of the total number of instances.) + """ + return self._call_java("accuracy") + + @property + @since("2.3.0") + def weightedTruePositiveRate(self): + """ + Returns weighted true positive rate. + (equals to precision, recall and f-measure) + """ + return self._call_java("weightedTruePositiveRate") + + @property + @since("2.3.0") + def weightedFalsePositiveRate(self): + """ + Returns weighted false positive rate. + """ + return self._call_java("weightedFalsePositiveRate") + + @property + @since("2.3.0") + def weightedRecall(self): + """ + Returns weighted averaged recall. + (equals to precision, recall and f-measure) + """ + return self._call_java("weightedRecall") + + @property + @since("2.3.0") + def weightedPrecision(self): + """ + Returns weighted averaged precision. + """ + return self._call_java("weightedPrecision") + + @property + @since("2.3.0") + def weightedFMeasure(self, beta=1.0): + """ + Returns weighted averaged f-measure. + """ + return self._call_java("weightedFMeasure", beta) + @inherit_doc class LogisticRegressionTrainingSummary(LogisticRegressionSummary): From 1a73e6c6d20d6374379ec2fb237e7f596e77bc62 Mon Sep 17 00:00:00 2001 From: Ming Jiang Date: Thu, 7 Sep 2017 15:39:08 -0700 Subject: [PATCH 3/7] test on multiclass summary --- python/pyspark/ml/classification.py | 5 ++--- python/pyspark/ml/tests.py | 34 +++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index c4e7f408247dd..41e6dc673ac03 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -529,10 +529,9 @@ def summary(self): """ if self.hasSummary: java_blrt_summary = self._call_java("summary") - java_blrt_interceptVector = self._call_java("interceptVector") java_blrt_numClasses = self._call_java("numClasses") - java_blrt_binarysummary = self._call_java("binarySummary") - if (len(java_blrt_interceptVector) == 1): + if (java_blrt_numClasses == 2): + java_blrt_binarysummary = self._call_java("binarySummary") return BinaryLogisticRegressionTrainingSummary(java_blrt_binarysummary) else: return LogisticRegressionTrainingSummary(java_blrt_summary) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 509698f6014eb..0126127f57200 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -1478,6 +1478,40 @@ def test_logistic_regression_summary(self): sameSummary = model.evaluate(df) self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC) + def test_multiclass_logistic_regression_summary(self): + df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)), + (0.0, 2.0, Vectors.sparse(1, [], [])), + (2.0, 2.0, Vectors.dense(2.0)), + (2.0, 2.0, Vectors.dense(1.9))], + ["label", "weight", "features"]) + lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False) + model = lr.fit(df) + self.assertTrue(model.hasSummary) + s = model.summary + # test that api is callable and returns expected types + self.assertTrue(isinstance(s.predictions, DataFrame)) + self.assertEqual(s.probabilityCol, "probability") + self.assertEqual(s.labelCol, "label") + self.assertEqual(s.featuresCol, "features") + self.assertEqual(s.predictionCol, "prediction") + objHist = s.objectiveHistory + self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float)) + self.assertGreater(s.totalIterations, 0) + self.assertTrue(isinstance(s.labels, list)) + self.assertTrue(isinstance(s.truePositiveRateByLabel, list)) + self.assertTrue(isinstance(s.falsePositiveRateByLabel, list)) + self.assertTrue(isinstance(s.precisionByLabel, list)) + self.assertTrue(isinstance(s.recallByLabel, list)) + self.assertTrue(isinstance(s.fMeasureByLabel, list)) + self.assertAlmostEqual(s.accuracy, 0.75, 2) + self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2) + self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2) + self.assertAlmostEqual(s.weightedRecall, 0.75, 2) + self.assertAlmostEqual(s.weightedPrecision, 0.583, 2) + self.assertAlmostEqual(s.weightedFMeasure, 0.65, 2) + # test evaluation (with training dataset) produces a summary with same values + # one check is enough to verify a summary is returned, Scala version runs full test + def test_gaussian_mixture_summary(self): data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),), (Vectors.sparse(1, [], []),)] From 53ac68e24731e405d44d50be70b1c25f2ab9df3d Mon Sep 17 00:00:00 2001 From: Ming Jiang Date: Thu, 7 Sep 2017 22:53:47 -0700 Subject: [PATCH 4/7] fixed numClasses --- python/pyspark/ml/classification.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 41e6dc673ac03..6080ae60add30 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -529,8 +529,7 @@ def summary(self): """ if self.hasSummary: java_blrt_summary = self._call_java("summary") - java_blrt_numClasses = self._call_java("numClasses") - if (java_blrt_numClasses == 2): + if (self.numClasses == 2): java_blrt_binarysummary = self._call_java("binarySummary") return BinaryLogisticRegressionTrainingSummary(java_blrt_binarysummary) else: From a4755d721013c1a535a4775fb277ae4d7a583587 Mon Sep 17 00:00:00 2001 From: jmwdpk Date: Mon, 11 Sep 2017 08:34:27 -0700 Subject: [PATCH 5/7] 0911 added more test and simplified summary logic --- python/pyspark/ml/classification.py | 11 ++++------- python/pyspark/ml/tests.py | 20 +++++++++++++++++--- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 6080ae60add30..4eedb1be9b99c 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -528,12 +528,11 @@ def summary(self): trained on the training set. An exception is thrown if `trainingSummary is None`. """ if self.hasSummary: - java_blrt_summary = self._call_java("summary") - if (self.numClasses == 2): - java_blrt_binarysummary = self._call_java("binarySummary") - return BinaryLogisticRegressionTrainingSummary(java_blrt_binarysummary) + java_lrt_summary = self._call_java("summary") + if (self.numClasses <= 2): + return BinaryLogisticRegressionTrainingSummary(java_lrt_summary) else: - return LogisticRegressionTrainingSummary(java_blrt_summary) + return LogisticRegressionTrainingSummary(java_lrt_summary) else: raise RuntimeError("No training summary available for this %s" % self.__class__.__name__) @@ -660,7 +659,6 @@ def recallByLabel(self): """ return self._call_java("recallByLabel") - @property @since("2.3.0") def fMeasureByLabel(self, beta=1.0): """ @@ -712,7 +710,6 @@ def weightedPrecision(self): """ return self._call_java("weightedPrecision") - @property @since("2.3.0") def weightedFMeasure(self, beta=1.0): """ diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 0126127f57200..a0ab2c0e27d1f 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -1451,7 +1451,7 @@ def test_glr_summary(self): sameSummary = model.evaluate(df) self.assertAlmostEqual(sameSummary.deviance, s.deviance) - def test_logistic_regression_summary(self): + def test_binary_logistic_regression_summary(self): df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)), (0.0, 2.0, Vectors.sparse(1, [], []))], ["label", "weight", "features"]) @@ -1473,6 +1473,13 @@ def test_logistic_regression_summary(self): self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame)) self.assertTrue(isinstance(s.precisionByThreshold, DataFrame)) self.assertTrue(isinstance(s.recallByThreshold, DataFrame)) + + self.assertAlmostEqual(s.accuracy, 1.0, 2) + self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2) + self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2) + self.assertAlmostEqual(s.weightedRecall, 1.0, 2) + self.assertAlmostEqual(s.weightedPrecision, 1.0, 2) + self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2) # test evaluation (with training dataset) produces a summary with same values # one check is enough to verify a summary is returned, Scala version runs full test sameSummary = model.evaluate(df) @@ -1502,15 +1509,22 @@ def test_multiclass_logistic_regression_summary(self): self.assertTrue(isinstance(s.falsePositiveRateByLabel, list)) self.assertTrue(isinstance(s.precisionByLabel, list)) self.assertTrue(isinstance(s.recallByLabel, list)) - self.assertTrue(isinstance(s.fMeasureByLabel, list)) + self.assertTrue(isinstance(s.fMeasureByLabel(), list)) self.assertAlmostEqual(s.accuracy, 0.75, 2) self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2) self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2) self.assertAlmostEqual(s.weightedRecall, 0.75, 2) self.assertAlmostEqual(s.weightedPrecision, 0.583, 2) - self.assertAlmostEqual(s.weightedFMeasure, 0.65, 2) + self.assertAlmostEqual(s.weightedFMeasure(), 0.65, 2) # test evaluation (with training dataset) produces a summary with same values # one check is enough to verify a summary is returned, Scala version runs full test + sameSummary = model.evaluate(df) + self.assertAlmostEqual(sameSummary.accuracy, s.accuracy) + self.assertAlmostEqual(sameSummary.weightedTruePositiveRate, s.weightedTruePositiveRate) + self.assertAlmostEqual(sameSummary.weightedFalsePositiveRate, s.weightedFalsePositiveRate) + self.assertAlmostEqual(sameSummary.weightedRecall, s.weightedRecall) + self.assertAlmostEqual(sameSummary.weightedPrecision, s.weightedPrecision) + self.assertAlmostEqual(sameSummary.weightedFMeasure(), s.weightedFMeasure()) def test_gaussian_mixture_summary(self): data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),), From eb8f6b431982d6f1f0118965391560f94812ab53 Mon Sep 17 00:00:00 2001 From: Ming Jiang Date: Mon, 11 Sep 2017 23:07:56 -0700 Subject: [PATCH 6/7] added more scala unit tests for binary summary, and additional tests for other binary logistic regression summary --- .../ml/classification/LogisticRegressionSuite.scala | 12 ++++++++++++ python/pyspark/ml/classification.py | 2 +- python/pyspark/ml/tests.py | 13 ++++++++++++- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index d43c7cdbde62c..14f550890d238 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -2416,6 +2416,18 @@ class LogisticRegressionSuite blorSummary.recallByThreshold.collect() === sameBlorSummary.recallByThreshold.collect()) assert( blorSummary.precisionByThreshold.collect() === sameBlorSummary.precisionByThreshold.collect()) + assert(blorSummary.labels === sameBlorSummary.labels) + assert(blorSummary.truePositiveRateByLabel === sameBlorSummary.truePositiveRateByLabel) + assert(blorSummary.falsePositiveRateByLabel === sameBlorSummary.falsePositiveRateByLabel) + assert(blorSummary.precisionByLabel === sameBlorSummary.precisionByLabel) + assert(blorSummary.recallByLabel === sameBlorSummary.recallByLabel) + assert(blorSummary.fMeasureByLabel === sameBlorSummary.fMeasureByLabel) + assert(blorSummary.accuracy === sameBlorSummary.accuracy) + assert(blorSummary.weightedTruePositiveRate === sameBlorSummary.weightedTruePositiveRate) + assert(blorSummary.weightedFalsePositiveRate === sameBlorSummary.weightedFalsePositiveRate) + assert(blorSummary.weightedRecall === sameBlorSummary.weightedRecall) + assert(blorSummary.weightedPrecision === sameBlorSummary.weightedPrecision) + assert(blorSummary.weightedFMeasure === sameBlorSummary.weightedFMeasure) lr.setFamily("multinomial") val mlorModel = lr.fit(smallMultinomialDataset) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 4eedb1be9b99c..8036ef7fbf741 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -529,7 +529,7 @@ def summary(self): """ if self.hasSummary: java_lrt_summary = self._call_java("summary") - if (self.numClasses <= 2): + if self.numClasses <= 2: return BinaryLogisticRegressionTrainingSummary(java_lrt_summary) else: return LogisticRegressionTrainingSummary(java_lrt_summary) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index a0ab2c0e27d1f..831fd0aa15345 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -1464,22 +1464,30 @@ def test_binary_logistic_regression_summary(self): self.assertEqual(s.probabilityCol, "probability") self.assertEqual(s.labelCol, "label") self.assertEqual(s.featuresCol, "features") + self.assertEqual(s.predictionCol, "prediction") objHist = s.objectiveHistory self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float)) self.assertGreater(s.totalIterations, 0) + self.assertTrue(isinstance(s.labels, list)) + self.assertTrue(isinstance(s.truePositiveRateByLabel, list)) + self.assertTrue(isinstance(s.falsePositiveRateByLabel, list)) + self.assertTrue(isinstance(s.precisionByLabel, list)) + self.assertTrue(isinstance(s.recallByLabel, list)) + self.assertTrue(isinstance(s.fMeasureByLabel(), list)) + self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list)) self.assertTrue(isinstance(s.roc, DataFrame)) self.assertAlmostEqual(s.areaUnderROC, 1.0, 2) self.assertTrue(isinstance(s.pr, DataFrame)) self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame)) self.assertTrue(isinstance(s.precisionByThreshold, DataFrame)) self.assertTrue(isinstance(s.recallByThreshold, DataFrame)) - self.assertAlmostEqual(s.accuracy, 1.0, 2) self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2) self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2) self.assertAlmostEqual(s.weightedRecall, 1.0, 2) self.assertAlmostEqual(s.weightedPrecision, 1.0, 2) self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2) + self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2) # test evaluation (with training dataset) produces a summary with same values # one check is enough to verify a summary is returned, Scala version runs full test sameSummary = model.evaluate(df) @@ -1510,12 +1518,14 @@ def test_multiclass_logistic_regression_summary(self): self.assertTrue(isinstance(s.precisionByLabel, list)) self.assertTrue(isinstance(s.recallByLabel, list)) self.assertTrue(isinstance(s.fMeasureByLabel(), list)) + self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list)) self.assertAlmostEqual(s.accuracy, 0.75, 2) self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2) self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2) self.assertAlmostEqual(s.weightedRecall, 0.75, 2) self.assertAlmostEqual(s.weightedPrecision, 0.583, 2) self.assertAlmostEqual(s.weightedFMeasure(), 0.65, 2) + self.assertAlmostEqual(s.weightedFMeasure(1.0), 0.65, 2) # test evaluation (with training dataset) produces a summary with same values # one check is enough to verify a summary is returned, Scala version runs full test sameSummary = model.evaluate(df) @@ -1525,6 +1535,7 @@ def test_multiclass_logistic_regression_summary(self): self.assertAlmostEqual(sameSummary.weightedRecall, s.weightedRecall) self.assertAlmostEqual(sameSummary.weightedPrecision, s.weightedPrecision) self.assertAlmostEqual(sameSummary.weightedFMeasure(), s.weightedFMeasure()) + self.assertAlmostEqual(sameSummary.weightedFMeasure(1.0), s.weightedFMeasure(1.0)) def test_gaussian_mixture_summary(self): data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),), From 6529fa6ecb7d607d3b38e68c8007bc22d9e27907 Mon Sep 17 00:00:00 2001 From: Ming Jiang Date: Wed, 13 Sep 2017 21:27:51 -0700 Subject: [PATCH 7/7] removed extra summary test --- python/pyspark/ml/tests.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 831fd0aa15345..b16f42059f1ee 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -1530,12 +1530,6 @@ def test_multiclass_logistic_regression_summary(self): # one check is enough to verify a summary is returned, Scala version runs full test sameSummary = model.evaluate(df) self.assertAlmostEqual(sameSummary.accuracy, s.accuracy) - self.assertAlmostEqual(sameSummary.weightedTruePositiveRate, s.weightedTruePositiveRate) - self.assertAlmostEqual(sameSummary.weightedFalsePositiveRate, s.weightedFalsePositiveRate) - self.assertAlmostEqual(sameSummary.weightedRecall, s.weightedRecall) - self.assertAlmostEqual(sameSummary.weightedPrecision, s.weightedPrecision) - self.assertAlmostEqual(sameSummary.weightedFMeasure(), s.weightedFMeasure()) - self.assertAlmostEqual(sameSummary.weightedFMeasure(1.0), s.weightedFMeasure(1.0)) def test_gaussian_mixture_summary(self): data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),