Skip to content

Commit

Permalink
Unit Test for Column Profiling based on the Titanic Dataset (#191)
Browse files Browse the repository at this point in the history
  • Loading branch information
klangner committed Feb 12, 2020
1 parent a647bcc commit 394ecc1
Show file tree
Hide file tree
Showing 3 changed files with 953 additions and 0 deletions.
58 changes: 58 additions & 0 deletions src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -400,4 +400,62 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec
}
}

// Profiles the Titanic CSV (schema inferred from the file) and compares a subset of its
// columns against hand-written expectations via assertSameColumnProfiles.
"return correct profile for the Titanic dataset" in withSparkSession { session =>
  // (column, completeness, approx. distinct values, data type, data type inferred?)
  // A distinct-value count of 0 means the cardinality of that column is not checked.
  val expectedProfiles = List(
    ("PassengerId", 1.0, 891, DataTypeInstances.Integral, false),
    ("Survived", 1.0, 2, DataTypeInstances.Integral, false),
    ("Pclass", 1.0, 3, DataTypeInstances.Integral, false),
    ("Name", 1.0, 0, DataTypeInstances.String, true),
    ("Sex", 1.0, 2, DataTypeInstances.String, true),
    ("Ticket", 1.0, 681, DataTypeInstances.String, true),
    ("Fare", 1.0, 0, DataTypeInstances.Fractional, false),
    ("Cabin", 0.22, 0, DataTypeInstances.String, true)
  ).map { case (column, completeness, distinct, dataType, inferred) =>
    StandardColumnProfile(column, completeness, distinct, dataType, inferred, Map.empty, None)
  }

  val titanic = session.read
    .format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("test-data/titanic.csv")

  val profiled = ColumnProfiler.profile(titanic)

  assertSameColumnProfiles(profiled.profiles, expectedProfiles)
}

/**
 * Asserts that every expected profile is matched by the actual profile of the same column.
 *
 * For each expected profile:
 *  - a profile for the column must be present in `actualProfiles`,
 *  - `dataType` and `isDataTypeInferred` must be equal,
 *  - the actual `completeness` must be at least the expected one (lower bound, not equality),
 *  - the actual `approximateNumDistinctValues` must lie within +/-10% of the expected value;
 *    this check is skipped when the expected value is 0.
 *
 * Columns that appear only in `actualProfiles` are not checked.
 *
 * @param actualProfiles   profiles to verify, looked up by column name
 * @param expectedProfiles expectations to verify against
 */
private[this] def assertSameColumnProfiles(
    actualProfiles: Map[String, ColumnProfile],
    expectedProfiles: List[ColumnProfile])
  : Unit = {

  expectedProfiles.foreach { expected =>
    val msg = s"""(Column "${expected.column}")"""

    // Fail with the column name as a clue instead of a bare NoSuchElementException
    // when no profile exists for an expected column.
    assert(actualProfiles.contains(expected.column), msg)
    val actual = actualProfiles(expected.column)

    assert(actual.dataType == expected.dataType, msg)
    assert(actual.completeness >= expected.completeness, msg)
    assert(actual.isDataTypeInferred == expected.isDataTypeInferred, msg)

    // The distinct count is approximate, so compare with a 10% tolerance.
    // An expected value of 0 disables the check for that column.
    if (expected.approximateNumDistinctValues > 0) {
      val upperBound = 1.1 * expected.approximateNumDistinctValues
      val lowerBound = 0.9 * expected.approximateNumDistinctValues
      assert(
        actual.approximateNumDistinctValues <= upperBound &&
          actual.approximateNumDistinctValues >= lowerBound,
        msg)
    }
  }
}
}
3 changes: 3 additions & 0 deletions test-data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Dataset used for testing

* [titanic.csv](https://www.kaggle.com/c/titanic/data)
Loading

0 comments on commit 394ecc1

Please sign in to comment.