awslabs · sscdotopen · Aug 9, 2019 · Apr 23, 2019 · Apr 26, 2019 · Apr 26, 2019
diff --git a/src/main/scala/com/amazon/deequ/analyzers/Analyzer.scala b/src/main/scala/com/amazon/deequ/analyzers/Analyzer.scala
@@ -332,6 +332,20 @@ object Preconditions {
         s"(${numericDataTypes.mkString(",")}), but found $columnDataType instead!")
     }
   }
+
+  /**
+    * Precondition asserting that the specified column has string type.
+    *
+    * @param column name of the column to look up in the schema
+    * @return check that throws a WrongColumnTypeException for any non-StringType column
+    */
+  def isString(column: String): StructType => Unit = { schema =>
+    val columnDataType = schema(column).dataType
+    if (columnDataType != StringType) {
+      throw new WrongColumnTypeException(s"Expected type of column $column to be " +
+        s"StringType, but found $columnDataType instead!")
+    }
+  }
 }
 
 private[deequ] object Analyzers {

diff --git a/src/main/scala/com/amazon/deequ/analyzers/MaxLength.scala b/src/main/scala/com/amazon/deequ/analyzers/MaxLength.scala
@@ -0,0 +1,41 @@
+/**
+ * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"). You may not
+ * use this file except in compliance with the License. A copy of the License
+ * is located at
+ *
+ *     http://aws.amazon.com/apache2.0/
+ *
+ * or in the "license" file accompanying this file. This file is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ */
+
+package com.amazon.deequ.analyzers
+
+import com.amazon.deequ.analyzers.Analyzers._
+import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isString}
+import org.apache.spark.sql.functions.{length, max}
+import org.apache.spark.sql.types.{DoubleType, StructType}
+import org.apache.spark.sql.{Column, Row}
+
+/**
+  * Analyzer computing the maximum string length of the values in a column.
+  * @param column name of the string column to measure
+  * @param where optional filter passed to conditionalSelection along with the column
+  */
+case class MaxLength(column: String, where: Option[String] = None)
+  extends StandardScanShareableAnalyzer[MaxState]("MaxLength", column) {
+
+  override def aggregationFunctions(): Seq[Column] =
+    Seq(max(length(conditionalSelection(column, where))).cast(DoubleType))
+
+  override def fromAggregationResult(result: Row, offset: Int): Option[MaxState] =
+    ifNoNullsIn(result, offset) { _ => MaxState(result.getDouble(offset)) }
+
+  override protected def additionalPreconditions(): Seq[StructType => Unit] =
+    Seq(hasColumn(column), isString(column))
+}
diff --git a/src/main/scala/com/amazon/deequ/analyzers/MinLength.scala b/src/main/scala/com/amazon/deequ/analyzers/MinLength.scala
@@ -0,0 +1,41 @@
+/**
+ * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"). You may not
+ * use this file except in compliance with the License. A copy of the License
+ * is located at
+ *
+ *     http://aws.amazon.com/apache2.0/
+ *
+ * or in the "license" file accompanying this file. This file is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ */
+
+package com.amazon.deequ.analyzers
+
+import com.amazon.deequ.analyzers.Analyzers._
+import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isString}
+import org.apache.spark.sql.functions.{length, min}
+import org.apache.spark.sql.types.{DoubleType, StructType}
+import org.apache.spark.sql.{Column, Row}
+
+/**
+  * Analyzer computing the minimum string length of the values in a column.
+  * @param column name of the string column to measure
+  * @param where optional filter passed to conditionalSelection along with the column
+  */
+case class MinLength(column: String, where: Option[String] = None)
+  extends StandardScanShareableAnalyzer[MinState]("MinLength", column) {
+
+  override def aggregationFunctions(): Seq[Column] =
+    Seq(min(length(conditionalSelection(column, where))).cast(DoubleType))
+
+  override def fromAggregationResult(result: Row, offset: Int): Option[MinState] =
+    ifNoNullsIn(result, offset) { _ => MinState(result.getDouble(offset)) }
+
+  override protected def additionalPreconditions(): Seq[StructType => Unit] =
+    Seq(hasColumn(column), isString(column))
+}
diff --git a/src/main/scala/com/amazon/deequ/analyzers/StateProvider.scala b/src/main/scala/com/amazon/deequ/analyzers/StateProvider.scala
@@ -105,6 +105,12 @@ case class HdfsStateProvider(
       case _: Maximum =>
         persistDoubleState(state.asInstanceOf[MaxState].maxValue, identifier)
 
+      case _: MaxLength =>
+        persistDoubleState(state.asInstanceOf[MaxState].maxValue, identifier)
+
+      case _: MinLength =>
+        persistDoubleState(state.asInstanceOf[MinState].minValue, identifier)
+
       case _ : FrequencyBasedAnalyzer | _ : Histogram =>
         persistDataframeLongState(state.asInstanceOf[FrequenciesAndNumRows], identifier)
 
@@ -151,6 +157,10 @@ case class HdfsStateProvider(
 
       case _ : Maximum => MaxState(loadDoubleState(identifier))
 
+      case _ : MaxLength => MaxState(loadDoubleState(identifier))
+
+      case _ : MinLength => MinState(loadDoubleState(identifier))
+
       case _ : FrequencyBasedAnalyzer | _ : Histogram => loadDataframeLongState(identifier)
 
       case _ : DataType => DataTypeHistogram.fromBytes(loadBytes(identifier))

diff --git a/src/main/scala/com/amazon/deequ/checks/Check.scala b/src/main/scala/com/amazon/deequ/checks/Check.scala
@@ -397,6 +397,39 @@ case class Check(
     addConstraint(approxQuantileConstraint(column, quantile, assertion, hint))
   }
 
+  /**
+    * Creates a constraint that asserts on the minimum string length of the column
+    *
+    * @param column Column to run the assertion on
+    * @param assertion Predicate applied to the minimum length, which is passed as a double
+    * @param hint A hint to provide additional context why a constraint could have failed
+    * @return The CheckWithLastConstraintFilterable produced by addFilterableConstraint
+    */
+  def hasMinLength(
+      column: String,
+      assertion: Double => Boolean,
+      hint: Option[String] = None)
+    : CheckWithLastConstraintFilterable = {
+
+    addFilterableConstraint(where => minLengthConstraint(column, assertion, where, hint))
+  }
+
+  /**
+    * Creates a constraint that asserts on the maximum string length of the column
+    *
+    * @param column Column to run the assertion on
+    * @param assertion Predicate applied to the maximum length, which is passed as a double
+    * @param hint A hint to provide additional context why a constraint could have failed
+    * @return The CheckWithLastConstraintFilterable produced by addFilterableConstraint
+    */
+  def hasMaxLength(
+      column: String,
+      assertion: Double => Boolean,
+      hint: Option[String] = None)
+    : CheckWithLastConstraintFilterable = {
+
+    addFilterableConstraint(where => maxLengthConstraint(column, assertion, where, hint))
+  }
 
   /**
     * Creates a constraint that asserts on the minimum of the column

diff --git a/src/main/scala/com/amazon/deequ/constraints/Constraint.scala b/src/main/scala/com/amazon/deequ/constraints/Constraint.scala
@@ -379,6 +379,50 @@ object Constraint {
     new NamedConstraint(constraint, s"ApproxQuantileConstraint($approxQuantile)")
   }
 
+  /**
+    * Builds a named constraint that evaluates the assertion on the MaxLength metric of a column
+    *
+    * @param column Column to run the assertion on
+    * @param assertion Function that receives a double input parameter and returns a boolean
+    * @param where Optional filter forwarded to the MaxLength analyzer
+    * @param hint A hint to provide additional context why a constraint could have failed
+    */
+  def maxLengthConstraint(
+      column: String,
+      assertion: Double => Boolean,
+      where: Option[String] = None,
+      hint: Option[String] = None)
+    : Constraint = {
+
+    val maxLength = MaxLength(column, where)
+
+    new NamedConstraint(
+      AnalysisBasedConstraint[MaxState, Double, Double](maxLength, assertion, hint = hint),
+      s"MaxLengthConstraint($maxLength)")
+  }
+
+  /**
+    * Builds a named constraint that evaluates the assertion on the MinLength metric of a column
+    *
+    * @param column Column to run the assertion on
+    * @param assertion Function that receives a double input parameter and returns a boolean
+    * @param where Optional filter forwarded to the MinLength analyzer
+    * @param hint A hint to provide additional context why a constraint could have failed
+    */
+  def minLengthConstraint(
+      column: String,
+      assertion: Double => Boolean,
+      where: Option[String] = None,
+      hint: Option[String] = None)
+    : Constraint = {
+
+    val minLength = MinLength(column, where)
+
+    new NamedConstraint(
+      AnalysisBasedConstraint[MinState, Double, Double](minLength, assertion, hint = hint),
+      s"MinLengthConstraint($minLength)")
+  }
+
   /**
     * Runs minimum analysis on the given column and executes the assertion
     *

diff --git a/src/test/scala/com/amazon/deequ/analyzers/AnalysisTest.scala b/src/test/scala/com/amazon/deequ/analyzers/AnalysisTest.scala
@@ -96,6 +96,23 @@ class AnalysisTest extends WordSpec with Matchers with SparkContextSpec with Fix
         Success(3.0)))
     }
 
+    "return string length statistics" in withSparkSession { session =>
+      val data = getDfWithVariableStringLengthValues(session)
+      val lengthAnalysis = Analysis()
+        .addAnalyzer(MaxLength("att1"))
+        .addAnalyzer(MinLength("att1"))
+
+      val metrics = lengthAnalysis.run(data).allMetrics
+
+      // Exactly one metric per registered analyzer
+      assert(metrics.size == lengthAnalysis.analyzers.size)
+
+      val expectedMax = DoubleMetric(Entity.Column, "MaxLength", "att1", Success(4.0))
+      val expectedMin = DoubleMetric(Entity.Column, "MinLength", "att1", Success(0.0))
+      metrics should contain(expectedMax)
+      metrics should contain(expectedMin)
+    }
+
     "return the proper exception for non existing columns" in withSparkSession { sparkSession =>
       val df = getDfWithNumericValues(sparkSession)
 
@@ -147,7 +164,7 @@ class AnalysisTest extends WordSpec with Matchers with SparkContextSpec with Fix
           Failure(new NumberOfSpecifiedColumnsException(""))))
       }
 
-    "return the proper exception when the number of max histogramm bins is too big" in
+    "return the proper exception when the number of max histogram bins is too big" in
       withSparkSession { sparkSession =>
         val df = getDfWithNumericValues(sparkSession)
 

diff --git a/src/test/scala/com/amazon/deequ/analyzers/AnalyzerTests.scala b/src/test/scala/com/amazon/deequ/analyzers/AnalyzerTests.scala
@@ -17,13 +17,12 @@
 package com.amazon.deequ
 package analyzers
 
-import com.amazon.deequ.SparkContextSpec
 import com.amazon.deequ.analyzers.runners.NoSuchColumnException
 import com.amazon.deequ.metrics.{Distribution, DistributionValue, DoubleMetric, Entity}
 import com.amazon.deequ.utils.AssertionUtils.TryUtils
 import com.amazon.deequ.utils.FixtureSupport
 import org.apache.spark.sql.Row
-import org.apache.spark.sql.functions.{col, expr, udf}
+import org.apache.spark.sql.functions.{col, udf}
 import org.apache.spark.sql.types._
 import org.scalatest.{Matchers, WordSpec}
 
@@ -503,6 +502,42 @@ class AnalyzerTests extends WordSpec with Matchers with SparkContextSpec with Fi
       assert(result.value.isSuccess)
       assert(result.value.get == 99.0)
     }
+
+    // No filter: the shortest value of the fixture is expected to have length 0
+    "compute min length correctly for string data" in withSparkSession { session =>
+      val data = getDfWithVariableStringLengthValues(session)
+      MinLength("att1").calculate(data).value shouldBe Success(0.0)
+    }
+
+    "compute min length correctly for string data with filtering" in
+      withSparkSession { session =>
+        val nonEmptyOnly = MinLength("att1", where = Some("att1 != ''"))
+        val data = getDfWithVariableStringLengthValues(session)
+        nonEmptyOnly.calculate(data).value shouldBe Success(1.0)
+      }
+
+    "fail to compute min length for non string type" in withSparkSession { session =>
+      val metric = MinLength("att1").calculate(getDfWithNumericValues(session))
+      assert(metric.value.isFailure)
+    }
+
+    // No filter: the longest value of the fixture is expected to have length 4
+    "compute max length correctly for string data" in withSparkSession { session =>
+      val data = getDfWithVariableStringLengthValues(session)
+      MaxLength("att1").calculate(data).value shouldBe Success(4.0)
+    }
+
+    "compute max length correctly for string data with filtering" in
+      withSparkSession { session =>
+        val withoutLongest = MaxLength("att1", where = Some("att1 != 'dddd'"))
+        val data = getDfWithVariableStringLengthValues(session)
+        withoutLongest.calculate(data).value shouldBe Success(3.0)
+      }
+
+    "fail to compute max length for non string type" in withSparkSession { session =>
+      val metric = MaxLength("att1").calculate(getDfWithNumericValues(session))
+      assert(metric.value.isFailure)
+    }
   }
 
   "Count distinct analyzers" should {

diff --git a/src/test/scala/com/amazon/deequ/analyzers/NullHandlingTests.scala b/src/test/scala/com/amazon/deequ/analyzers/NullHandlingTests.scala
@@ -66,6 +66,9 @@ class NullHandlingTests extends WordSpec with Matchers with SparkContextSpec wit
       Minimum("numericCol").computeStateFrom(data) shouldBe None
       Maximum("numericCol").computeStateFrom(data) shouldBe None
 
+      MinLength("stringCol").computeStateFrom(data) shouldBe None
+      MaxLength("stringCol").computeStateFrom(data) shouldBe None
+
       DataType("stringCol").computeStateFrom(data) shouldBe
         Some(DataTypeHistogram(8L, 0L, 0L, 0L, 0L))
 
@@ -103,6 +106,9 @@ class NullHandlingTests extends WordSpec with Matchers with SparkContextSpec wit
       assertFailedWithEmptyState(Minimum("numericCol").calculate(data))
       assertFailedWithEmptyState(Maximum("numericCol").calculate(data))
 
+      assertFailedWithEmptyState(MinLength("stringCol").calculate(data))
+      assertFailedWithEmptyState(MaxLength("stringCol").calculate(data))
+
       val dataTypeDistribution = DataType("stringCol").calculate(data).value.get
       dataTypeDistribution.values("Unknown").ratio shouldBe 1.0
 

diff --git a/src/test/scala/com/amazon/deequ/analyzers/StateProviderTest.scala b/src/test/scala/com/amazon/deequ/analyzers/StateProviderTest.scala
@@ -18,7 +18,6 @@ package com.amazon.deequ.analyzers
 
 import com.amazon.deequ.SparkContextSpec
 import com.amazon.deequ.utils.{FixtureSupport, TempFileUtils}
-import org.apache.spark.sql.functions.expr
 import org.apache.spark.sql.{DataFrame, SparkSession}
 import org.scalatest.{Matchers, WordSpec}
 
@@ -47,6 +46,9 @@ class StateProviderTest extends WordSpec with Matchers with SparkContextSpec wit
       assertCorrectlyRestoresState[StandardDeviationState](provider, provider,
         StandardDeviation("price"), data)
 
+      assertCorrectlyRestoresState[MaxState](provider, provider, MaxLength("att1"), data)
+      assertCorrectlyRestoresState[MinState](provider, provider, MinLength("att1"), data)
+
       assertCorrectlyRestoresState[DataTypeHistogram](provider, provider, DataType("item"), data)
       assertCorrectlyRestoresStateForHLL(provider, provider, ApproxCountDistinct("att1"), data)
       assertCorrectlyRestoresState[CorrelationState](provider, provider,
@@ -84,6 +86,9 @@ class StateProviderTest extends WordSpec with Matchers with SparkContextSpec wit
       assertCorrectlyRestoresState[StandardDeviationState](provider, provider,
         StandardDeviation("price"), data)
 
+      assertCorrectlyRestoresState[MaxState](provider, provider, MaxLength("att1"), data)
+      assertCorrectlyRestoresState[MinState](provider, provider, MinLength("att1"), data)
+
       assertCorrectlyRestoresState[DataTypeHistogram](provider, provider, DataType("item"), data)
       assertCorrectlyRestoresStateForHLL(provider, provider, ApproxCountDistinct("att1"), data)
       assertCorrectlyRestoresState[CorrelationState](provider, provider,

diff --git a/src/test/scala/com/amazon/deequ/checks/CheckTest.scala b/src/test/scala/com/amazon/deequ/checks/CheckTest.scala
@@ -18,7 +18,7 @@ package com.amazon.deequ
 package checks
 
 import com.amazon.deequ.analyzers._
-import com.amazon.deequ.analyzers.runners.AnalyzerContext
+import com.amazon.deequ.analyzers.runners.{AnalysisRunner, AnalyzerContext}
 import com.amazon.deequ.anomalydetection.{Anomaly, AnomalyDetectionStrategy}
 import com.amazon.deequ.constraints.{ConstrainableDataTypes, ConstraintStatus}
 import com.amazon.deequ.metrics.{DoubleMetric, Entity}
@@ -440,6 +440,17 @@ class CheckTest extends WordSpec with Matchers with SparkContextSpec with Fixtur
         contextUninformative)
     }
 
+    "yield correct results for minimum and maximum length stats" in
+      withSparkSession { session =>
+        val context = AnalysisRunner.onData(getDfWithVariableStringLengthValues(session))
+          .addAnalyzers(Seq(MinLength("att1"), MaxLength("att1")))
+          .run()
+
+        val check = Check(CheckLevel.Error, description = "a description")
+        assertSuccess(check.hasMinLength("att1", _ == 0.0), context)
+        assertSuccess(check.hasMaxLength("att1", _ == 4.0), context)
+      }
+
     "work on regular expression patterns for E-Mails" in withSparkSession { sparkSession =>
       val col = "some"
       val df = dataFrameWithColumn(col, StringType, sparkSession, Row("someone@somewhere.org"),