From eb5e7bf4fb6812caf2c6bdddeee32cb7f81f80ab Mon Sep 17 00:00:00 2001 From: mbaddar1 Date: Sun, 13 Mar 2016 23:19:14 +0200 Subject: [PATCH 1/8] Add toString method to BinaryLogistic Regression with the following changes : +Added auxilary dataFrameToString method +Added roc , precision-recall , fmeasure conversion to string --- .../classification/LogisticRegression.scala | 48 +++++++++++++++++++ .../LogisticRegressionSuite.scala | 13 ++++- 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 0d329d2c08d50..2571a11dcd075 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -17,6 +17,7 @@ package org.apache.spark.ml.classification +import scala.StringBuilder import scala.collection.mutable import breeze.linalg.{DenseVector => BDV} @@ -912,6 +913,53 @@ class BinaryLogisticRegressionSummary private[classification] ( @transient lazy val recallByThreshold: DataFrame = { binaryMetrics.recallByThreshold().toDF("threshold", "recall") } + /** + * Returns a string representation for the fields : [[roc]] , [[pr]] + * and [[fMeasureByThreshold]] + */ + @Since("1.6.0") + override def toString : String = { + val newLine = sys.props("line.separator") + + def dataFrameToString(dataFrameName : String,dataFrame:DataFrame,sep:String) : String ={ + val dataFrameStringBuilder = new StringBuilder() + val rowStringBuilder = new StringBuilder + + //Append data frame name + dataFrameStringBuilder.append(dataFrameName+":") + dataFrameStringBuilder.append(newLine) + //create header of string representation + dataFrameStringBuilder.append(dataFrame.columns.mkString(sep)) + dataFrameStringBuilder.append(newLine) + + //create data string representation + dataFrame.collect().map(row=> { + rowStringBuilder.clear + row.toSeq.map(s => { + if(s.isInstanceOf[Double]) + rowStringBuilder.append(f"${s.asInstanceOf[Double]}%1.2f") + else + rowStringBuilder.append(s.toString) + rowStringBuilder.append(sep) + rowStringBuilder.toString() + }) + dataFrameStringBuilder.append(rowStringBuilder.toString.trim) + dataFrameStringBuilder.append(newLine) + }) + dataFrameStringBuilder.toString + } + val summaryStringBuilder = new StringBuilder() + + val colSep = "\t" + //Building roc string + summaryStringBuilder.append(dataFrameToString("ROC",roc,colSep)) + summaryStringBuilder.append(newLine) + summaryStringBuilder.append(dataFrameToString("Precesion",pr,colSep)) + summaryStringBuilder.append(newLine) + summaryStringBuilder.append(dataFrameToString("F-Measure by threshold",fMeasureByThreshold,colSep)) + summaryStringBuilder.append(newLine) + summaryStringBuilder.toString + } } /** diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index cfb9bbfd41ee7..55c8b65b24350 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -798,7 +798,6 @@ class LogisticRegressionSuite .setThreshold(0.6) val model = lr.fit(dataset) val summary = model.summary.asInstanceOf[BinaryLogisticRegressionSummary] - val sameSummary = model.evaluate(dataset).asInstanceOf[BinaryLogisticRegressionSummary] assert(summary.areaUnderROC === sameSummary.areaUnderROC) assert(summary.roc.collect() === sameSummary.roc.collect()) @@ -937,6 +936,18 @@ class LogisticRegressionSuite testEstimatorAndModelReadWrite(lr, dataset, LogisticRegressionSuite.allParamSettings, checkModelData) } + + test("BinaryLogisticRegressionSummary toString") { + val lr = new LogisticRegression() + .setMaxIter(10) + .setRegParam(1.0) + .setThreshold(0.6) + val model = lr.fit(dataset) + val summary = model.summary.asInstanceOf[BinaryLogisticRegressionSummary] + + val strSummary = summary.toString + print(strSummary) + } } object LogisticRegressionSuite { From 822625ec1651ec0257e31afb05f66e9f7ebcfc96 Mon Sep 17 00:00:00 2001 From: mbaddar1 Date: Mon, 14 Mar 2016 14:40:44 +0200 Subject: [PATCH 2/8] Added unit testing for org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.toString The unit testing method test the following +Return type to be string +Retrun string should contains the words "ROC" , "Precision" , "F-Measure" --- .../apache/spark/ml/classification/LogisticRegression.scala | 2 +- .../spark/ml/classification/LogisticRegressionSuite.scala | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 2571a11dcd075..f4b4f0f184392 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -954,7 +954,7 @@ class BinaryLogisticRegressionSummary private[classification] ( //Building roc string summaryStringBuilder.append(dataFrameToString("ROC",roc,colSep)) summaryStringBuilder.append(newLine) - summaryStringBuilder.append(dataFrameToString("Precesion",pr,colSep)) + summaryStringBuilder.append(dataFrameToString("Precision",pr,colSep)) summaryStringBuilder.append(newLine) summaryStringBuilder.append(dataFrameToString("F-Measure by threshold",fMeasureByThreshold,colSep)) summaryStringBuilder.append(newLine) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 55c8b65b24350..b699937967fc4 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -946,7 +946,10 @@ class LogisticRegressionSuite val summary = model.summary.asInstanceOf[BinaryLogisticRegressionSummary] val strSummary = summary.toString - print(strSummary) + assert(strSummary.isInstanceOf[String]) + assert(strSummary.contains("ROC")) + assert(strSummary.contains("Precision")) + assert(strSummary.contains("F-Measure")) } } From 60107f5b74df71840244e65b6e5d225e747de252 Mon Sep 17 00:00:00 2001 From: mbaddar1 Date: Mon, 14 Mar 2016 17:12:19 +0200 Subject: [PATCH 3/8] Added Style changes to follow the guide --- .../classification/LogisticRegression.scala | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index f4b4f0f184392..5d87d81889e42 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -17,7 +17,6 @@ package org.apache.spark.ml.classification -import scala.StringBuilder import scala.collection.mutable import breeze.linalg.{DenseVector => BDV} @@ -918,28 +917,31 @@ class BinaryLogisticRegressionSummary private[classification] ( * and [[fMeasureByThreshold]] */ @Since("1.6.0") - override def toString : String = { + override def toString: String = { val newLine = sys.props("line.separator") - def dataFrameToString(dataFrameName : String,dataFrame:DataFrame,sep:String) : String ={ + def dataFrameToString(dataFrameName: String, dataFrame: DataFrame, sep: String): String = { val dataFrameStringBuilder = new StringBuilder() val rowStringBuilder = new StringBuilder - //Append data frame name - dataFrameStringBuilder.append(dataFrameName+":") + // Append data frame name + dataFrameStringBuilder.append(dataFrameName + ":") dataFrameStringBuilder.append(newLine) - //create header of string representation + // create header of string representation dataFrameStringBuilder.append(dataFrame.columns.mkString(sep)) dataFrameStringBuilder.append(newLine) - //create data string representation - dataFrame.collect().map(row=> { + // create data string representation + dataFrame.collect().map(row => { rowStringBuilder.clear row.toSeq.map(s => { - if(s.isInstanceOf[Double]) + if (s.isInstanceOf[Double]) { rowStringBuilder.append(f"${s.asInstanceOf[Double]}%1.2f") - else + } + else { rowStringBuilder.append(s.toString) + } + rowStringBuilder.append(sep) rowStringBuilder.toString() }) @@ -951,12 +953,12 @@ class BinaryLogisticRegressionSummary private[classification] ( val summaryStringBuilder = new StringBuilder() val colSep = "\t" - //Building roc string - summaryStringBuilder.append(dataFrameToString("ROC",roc,colSep)) + summaryStringBuilder.append(dataFrameToString("ROC", roc, colSep)) summaryStringBuilder.append(newLine) - summaryStringBuilder.append(dataFrameToString("Precision",pr,colSep)) + summaryStringBuilder.append(dataFrameToString("Precision", pr, colSep)) summaryStringBuilder.append(newLine) - summaryStringBuilder.append(dataFrameToString("F-Measure by threshold",fMeasureByThreshold,colSep)) + summaryStringBuilder.append(dataFrameToString("F-Measure by threshold", fMeasureByThreshold, + colSep)) summaryStringBuilder.append(newLine) summaryStringBuilder.toString } From d11ee3d2125a9105d66151e7390b7c5aaaf60c05 Mon Sep 17 00:00:00 2001 From: mbaddar1 Date: Tue, 15 Mar 2016 14:01:21 +0200 Subject: [PATCH 4/8] Add TODOs --- .../spark/ml/classification/LogisticRegression.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 5d87d81889e42..cf32d49e8cf8f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -915,6 +915,7 @@ class BinaryLogisticRegressionSummary private[classification] ( /** * Returns a string representation for the fields : [[roc]] , [[pr]] * and [[fMeasureByThreshold]] + * //TODO add more fields in toString method */ @Since("1.6.0") override def toString: String = { @@ -943,15 +944,15 @@ class BinaryLogisticRegressionSummary private[classification] ( } rowStringBuilder.append(sep) - rowStringBuilder.toString() + rowStringBuilder.toString }) dataFrameStringBuilder.append(rowStringBuilder.toString.trim) dataFrameStringBuilder.append(newLine) }) dataFrameStringBuilder.toString } - val summaryStringBuilder = new StringBuilder() - + val summaryStringBuilder = new StringBuilder + //TODO Dynamic adjustment for separator based on col width (if needed) val colSep = "\t" summaryStringBuilder.append(dataFrameToString("ROC", roc, colSep)) summaryStringBuilder.append(newLine) From 82c8620c5193cf96efcf4355a25435550f62c93a Mon Sep 17 00:00:00 2001 From: mbaddar1 Date: Tue, 15 Mar 2016 14:02:21 +0200 Subject: [PATCH 5/8] Add assert in unit testing for non empty string --- .../apache/spark/ml/classification/LogisticRegressionSuite.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index b699937967fc4..2ed16c6788fc1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -947,6 +947,7 @@ class LogisticRegressionSuite val strSummary = summary.toString assert(strSummary.isInstanceOf[String]) + assert(strSummary.length >0) assert(strSummary.contains("ROC")) assert(strSummary.contains("Precision")) assert(strSummary.contains("F-Measure")) From 7cd6d664d1dfce8128ca150e3d513b3ca89b19b1 Mon Sep 17 00:00:00 2001 From: mbaddar1 Date: Tue, 15 Mar 2016 14:04:06 +0200 Subject: [PATCH 6/8] Add comments in unit testing --- .../spark/ml/classification/LogisticRegressionSuite.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 2ed16c6788fc1..0731d0502ef7c 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -946,8 +946,9 @@ class LogisticRegressionSuite val summary = model.summary.asInstanceOf[BinaryLogisticRegressionSummary] val strSummary = summary.toString - assert(strSummary.isInstanceOf[String]) - assert(strSummary.length >0) + assert(strSummary.isInstanceOf[String]) // Check type of return + assert(strSummary.length >0) // Check non-empty return string + // Check whether return string return expected column headers assert(strSummary.contains("ROC")) assert(strSummary.contains("Precision")) assert(strSummary.contains("F-Measure")) From 2784e5b348d43dc57b0446408a87d14da612a2b2 Mon Sep 17 00:00:00 2001 From: mbaddar1 Date: Tue, 15 Mar 2016 14:04:43 +0200 Subject: [PATCH 7/8] Add TODO in unit testing --- .../apache/spark/ml/classification/LogisticRegressionSuite.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 0731d0502ef7c..3f8811a540420 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -952,6 +952,7 @@ class LogisticRegressionSuite assert(strSummary.contains("ROC")) assert(strSummary.contains("Precision")) assert(strSummary.contains("F-Measure")) + // TODO add checking with regex } } From 5e926bae64cf4711017fdb13fedf99e3bf619657 Mon Sep 17 00:00:00 2001 From: mbaddar1 Date: Tue, 15 Mar 2016 14:12:23 +0200 Subject: [PATCH 8/8] Added stylistic changes --- .../org/apache/spark/ml/classification/LogisticRegression.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index cf32d49e8cf8f..605021216672d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -952,7 +952,7 @@ class BinaryLogisticRegressionSummary private[classification] ( dataFrameStringBuilder.toString } val summaryStringBuilder = new StringBuilder - //TODO Dynamic adjustment for separator based on col width (if needed) + // TODO Dynamic adjustment for separator based on col width (if needed) val colSep = "\t" summaryStringBuilder.append(dataFrameToString("ROC", roc, colSep)) summaryStringBuilder.append(newLine)