Unit Test for Column Profiling based on the Titanic Dataset (#191)

awslabs · Feb 12, 2020 · 394ecc1 · 394ecc1
1 parent a647bcc
commit 394ecc1
Show file tree

Hide file tree

Showing 3 changed files with 953 additions and 0 deletions.
diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala
@@ -400,4 +400,62 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec
     }
   }
 
+  // Profiles the Titanic CSV end to end and compares a subset of the resulting
+  // column profiles against hand-written expectations.
+  "return correct profile for the Titanic dataset" in withSparkSession { session =>
+    val titanic = session.read
+      .format("csv")
+      .option("header", "true")
+      .option("inferSchema", "true")
+      .load("test-data/titanic.csv")
+
+    val result = ColumnProfiler.profile(titanic)
+
+    // An expected distinct count of 0 makes assertSameColumnProfiles skip the
+    // distinct-count comparison for that column (used for Name, Fare and Cabin).
+    // The expected completeness is treated as a lower bound by that helper.
+    val expected = List(
+      StandardColumnProfile(
+        "PassengerId", 1.0, 891, DataTypeInstances.Integral, false, Map.empty, None),
+      StandardColumnProfile(
+        "Survived", 1.0, 2, DataTypeInstances.Integral, false, Map.empty, None),
+      StandardColumnProfile(
+        "Pclass", 1.0, 3, DataTypeInstances.Integral, false, Map.empty, None),
+      StandardColumnProfile(
+        "Name", 1.0, 0, DataTypeInstances.String, true, Map.empty, None),
+      StandardColumnProfile(
+        "Sex", 1.0, 2, DataTypeInstances.String, true, Map.empty, None),
+      StandardColumnProfile(
+        "Ticket", 1.0, 681, DataTypeInstances.String, true, Map.empty, None),
+      StandardColumnProfile(
+        "Fare", 1.0, 0, DataTypeInstances.Fractional, false, Map.empty, None),
+      StandardColumnProfile(
+        "Cabin", 0.22, 0, DataTypeInstances.String, true, Map.empty, None)
+    )
+
+    // Only the columns listed above are checked; other profiled columns are ignored.
+    assertSameColumnProfiles(result.profiles, expected)
+  }
+
+  /**
+    * Asserts that every expected profile is matched by the actual profile of the same column.
+    *
+    * For each expected profile the following is checked:
+    *  - the data type and the `isDataTypeInferred` flag are equal,
+    *  - the actual completeness is at least the expected completeness (lower bound),
+    *  - the actual approximate distinct count lies within 10% of the expected one;
+    *    this check is skipped when the expected distinct count is 0 (or negative).
+    *
+    * Columns present in `actualProfiles` but absent from `expectedProfiles` are not checked.
+    *
+    * @param actualProfiles   profiles produced by the profiler, keyed by column name
+    * @param expectedProfiles profiles to compare against; each must have a matching key
+    *                         in `actualProfiles`, otherwise the test fails
+    */
+  private[this] def assertSameColumnProfiles(
+      actualProfiles: Map[String, ColumnProfile],
+      expectedProfiles: List[ColumnProfile])
+    : Unit = {
+
+    expectedProfiles.foreach { expected =>
+      // Clue attached to every assertion so a failure names the offending column.
+      val msg = s"""(Column "${expected.column}")"""
+
+      // Fail with a readable message instead of a bare "key not found" exception
+      // when the profiler did not produce a profile for an expected column.
+      val actual = actualProfiles.getOrElse(
+        expected.column,
+        fail(s"No profile was computed for an expected column $msg"))
+
+      assert(actual.dataType == expected.dataType, msg)
+      assert(actual.completeness >= expected.completeness, msg)
+      assert(actual.isDataTypeInferred == expected.isDataTypeInferred, msg)
+
+      val expectedDistinct = expected.approximateNumDistinctValues
+      if (expectedDistinct > 0) {
+        // The distinct count is approximate, so accept a 10% deviation in either direction.
+        val lowerBound = 0.9 * expectedDistinct
+        val upperBound = 1.1 * expectedDistinct
+        assert(
+          actual.approximateNumDistinctValues >= lowerBound &&
+          actual.approximateNumDistinctValues <= upperBound,
+          msg)
+      }
+    }
+  }
 }
diff --git a/test-data/README.md b/test-data/README.md
@@ -0,0 +1,3 @@
+# Dataset used for testing
+
+  * [titanic.csv](https://www.kaggle.com/c/titanic/data)