Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Data Quality Framework
# Agile Lab Data Quality

DQ is a framework for building parallel and distributed quality checks in big data environments.
It can be used to calculate metrics and perform checks to ensure the quality of structured or unstructured data.
Expand Down
59 changes: 38 additions & 21 deletions build.sbt
Original file line number Diff line number Diff line change
@@ -1,25 +1,25 @@
import com.typesafe.sbt.packager.MappingsHelper.directory
import sbt.GlobFilter
import sbt.Keys.{logLevel, scalaVersion, test, updateOptions}
import sbtassembly.AssemblyPlugin.autoImport.assemblyOption
import src.main.scala.BuildEnvPlugin.autoImport.{BuildEnv, buildEnv}
import src.main.scala.BuildIntegrationPlugin.autoImport.{IntegrationEnv, integrationEnv}
import sbtassembly.AssemblyPlugin.autoImport.{assemblyExcludedJars, assemblyOption}
import NativePackagerHelper._

name := "DataQuality-framework"

lazy val commonSettings = Seq(version := "0.2.1")
lazy val commonSettings = Seq(
version := "1.1.0"
)

scalacOptions ++= Seq(
"-target:jvm-1.8",
"-deprecation",
"-feature",
"-language:implicitConversions",
"-language:postfixOps",
"-language:reflectiveCalls",
"-Xmax-classfile-name", "225"
// "-Ypartial-unification"
"-language:reflectiveCalls"
)

scalacOptions ++= Seq("-Xmax-classfile-name", "225")

resolvers ++= Seq(
Resolver.bintrayRepo("webjars","maven"),
Resolver.sonatypeRepo("public"),
Expand All @@ -42,15 +42,13 @@ lazy val common = (project in file("dq-common"))
lazy val core = (project in file("dq-core"))
.enablePlugins(UniversalPlugin, UniversalDeployPlugin)
.settings(
// inThisBuild(
// commonSettings ++ List(scalaVersion := "2.10.6")
// ),
scalaVersion := "2.10.6",
commonSettings,
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % "1.6.0",
"org.apache.spark" %% "spark-sql" % "1.6.0",
"org.apache.spark" %% "spark-hive" % "1.6.0",
"org.apache.spark" %% "spark-core" % "1.6.0", //place % "provided" before deployment
"org.apache.spark" %% "spark-sql" % "1.6.0", //place % "provided" before deployment
"org.apache.spark" %% "spark-hive" % "1.6.0", //place % "provided" before deployment

"com.databricks" %% "spark-avro" % "2.0.1",
"com.databricks" %% "spark-csv" % "1.5.0",
"org.apache.commons" % "commons-lang3" % "3.0",
Expand All @@ -77,17 +75,36 @@ lazy val core = (project in file("dq-core"))
assemblyExcludedJars in assembly := (fullClasspath in assembly).value.filter(_.data.getName startsWith "spark-assembly"),
assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = true),
test in assembly := {},
assemblyMergeStrategy in assembly := {
case PathList("javax", "servlet", xs @ _*) => MergeStrategy.last
case PathList("javax", "activation", xs @ _*) => MergeStrategy.last
case PathList("org", "apache", xs @ _*) => MergeStrategy.last
case PathList("com", "google", xs @ _*) => MergeStrategy.last
case PathList("com", "esotericsoftware", xs @ _*) => MergeStrategy.last
case PathList("com", "codahale", xs @ _*) => MergeStrategy.last
case PathList("com", "yammer", xs @ _*) => MergeStrategy.last
case "about.html" => MergeStrategy.rename
case "META-INF/ECLIPSEF.RSA" => MergeStrategy.last
case "META-INF/mailcap" => MergeStrategy.last
case "META-INF/mimetypes.default" => MergeStrategy.last
case "plugin.properties" => MergeStrategy.last
case "log4j.properties" => MergeStrategy.last
case x =>
val oldStrategy = (assemblyMergeStrategy in assembly).value
oldStrategy(x)
},
mappings in Universal += {
val confFile = buildEnv.value match {
case BuildEnv.Dev => "path to application.conf"
case BuildEnv.Test => "path to application.conf"
case BuildEnv.Production => "path to application.conf"
case BuildEnv.Stage => "conf/qa.conf"
case BuildEnv.Test => "conf/test.conf"
case BuildEnv.Production => "conf/prod.conf"
case BuildEnv.Dev => "conf/dev.conf"
}
((resourceDirectory in Compile).value / confFile) -> "conf/application.conf"
},
mappings in Universal ++= {
val integrationFolder = integrationEnv.value match {
case IntegrationEnv.local => "path to integration directory"
case _ => "integration/dev"
}
directory((resourceDirectory in Compile).value / integrationFolder / "bin") ++
directory((resourceDirectory in Compile).value / integrationFolder / "conf")
Expand Down Expand Up @@ -167,9 +184,9 @@ lazy val ui = (project in file("dq-ui"))

// use the combined tslint and eslint rules plus ng2 lint rules
(rulesDirectories in tslint) := Some(List(
tslintEslintRulesDir.value,
// codelyzer uses 'cssauron' which can't resolve 'through' see https://github.com/chrisdickinson/cssauron/pull/10
ng2LintRulesDir.value
tslintEslintRulesDir.value,
// codelyzer uses 'cssauron' which can't resolve 'through' see https://github.com/chrisdickinson/cssauron/pull/10
ng2LintRulesDir.value
)),

// the naming conventions of our test files
Expand Down
121 changes: 98 additions & 23 deletions docs/examples/conf/full-prostprocess-example.conf
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,28 @@ Sources: [
{
id = "GOT_B"
type = "HDFS"
path = "./Agile.DataQuality/side-code/example-data/battles.csv"
path = "./docs/examples/data/battles.csv"
delimiter = ","
header = true
fileType = "csv"
keyFields = ["name","year","attacker_king","defender_king"]
keyFields = ["name","year","defender_king"]
},
{
id = "GOT_D"
type = "HDFS"
path = "./Agile.DataQuality/side-code/example-data/character-deaths.csv"
path = "./docs/examples/data/character-deaths.csv"
delimiter = ","
header = true
fileType = "csv"
}
},
{
id = "customer"
type = "HDFS"
path = "./docs/examples/data/customer.csv"
delimiter = "|"
header = false
fileType = "csv"
},
]

VirtualSources: [
Expand Down Expand Up @@ -43,6 +51,39 @@ VirtualSources: [
},
]

LoadChecks: [
{
id = "customer_encoding_check"
type = "ENCODING"
source = "customer"
option = "UTF-8"
},
{
id = "customer_exact_column"
type = "EXACT_COLUMN_NUM"
source = "customer"
option = 1
},
{
id = "customer_min_column"
type = "MIN_COLUMN_NUM"
source = "customer"
option = 2
},
{
id = "customer_file_type"
type = "FILE_TYPE"
source = "customer"
option = "avro"
},
{
id = "customer_file_existence"
type = "EXIST"
source = "customer"
option = true
}
]

Metrics: [
{
id: "row_count"
Expand All @@ -53,6 +94,36 @@ Metrics: [
file: "GOT_B"
}
},
{
id: "customer_row_count"
name: "ROW_COUNT"
type: "FILE"
description: "rowcount"
config: {
file: "customer"
}
},
{
id: "null_values"
name: "NULL_VALUES"
type: "COLUMN"
description: "null values in column attacker_size"
config: {
file: "customer",
columns: ["attacker_size"],
positions: [1]
}
},
{
id: "null_values_col"
name: "NULL_VALUES"
type: "COLUMN"
description: "null values in column attacker_size"
config: {
file: "customer",
columns: ["C0"]
}
},
{
id: "average"
name: "AVG_NUMBER"
Expand Down Expand Up @@ -232,41 +303,45 @@ Checks: [

Targets: [
{
type: "CHECKS"
type: "FILE_METRICS"
config: {
fileFormat: "csv"
path: "./Agile.DataQuality/side-code/dump"
path: "./tmp/results"
delimiter: ","
savemode: "append"
}
},
{
type: "COLUMNAR-METRICS"
type: "COLUMN_METRICS"
config: {
fileFormat: "csv"
path: "./Agile.DataQuality/side-code/dump"
path: "./tmp/results"
delimiter: ","
savemode: "append"
}
},
{
type: "FILE-METRICS"
type: "COMPOSED_METRICS"
config: {
fileFormat: "csv"
path: "./Agile.DataQuality/side-code/dump"
path: "./tmp/results"
delimiter: ","
savemode: "append"
}
},
{
type: "COMPOSED-METRICS"
type: "CHECKS"
config: {
fileFormat: "csv"
path: "./Agile.DataQuality/side-code/dump"
path: "./tmp/results"
delimiter: ","
savemode: "append"
}
},
{
type: "LOAD_CHECKS"
config: {
fileFormat: "csv"
path: "./tmp/results"
delimiter: ","
}
}
]

Postprocessing: [
Expand All @@ -286,7 +361,7 @@ Postprocessing: [
saveTo: {
fileName: "tera_enriched"
fileFormat: "csv"
path: "./Agile.DataQuality/side-code/dump/postproc"
path: "./tmp/postproc"
delimiter: ","
}
}
Expand All @@ -299,7 +374,7 @@ Postprocessing: [
saveTo: {
fileName: "tera_transposed"
fileFormat: "csv"
path: "./Agile.DataQuality/side-code/dump/postproc"
path: "./tmp/postproc"
delimiter: ","
quoted: true
}
Expand All @@ -313,7 +388,7 @@ Postprocessing: [
saveTo: {
fileName: "tera_headless"
fileFormat: "csv"
path: "./Agile.DataQuality/side-code/dump/postproc"
path: "./tmp/postproc"
delimiter: ","
}
}
Expand All @@ -332,7 +407,7 @@ Postprocessing: [
saveTo: {
fileName: "tera_empty"
fileFormat: "csv"
path: "./Agile.DataQuality/side-code/dump/postproc"
path: "./tmp/postproc"
delimiter: ","
}
}
Expand All @@ -345,7 +420,7 @@ Postprocessing: [
saveTo: {
fileName: "empty_headless"
fileFormat: "csv"
path: "./Agile.DataQuality/side-code/dump/postproc"
path: "./tmp/postproc"
delimiter: ","
}
}
Expand All @@ -359,7 +434,7 @@ Postprocessing: [
saveTo: {
fileName: "empty_headless_keyed"
fileFormat: "csv"
path: "./Agile.DataQuality/side-code/dump/postproc"
path: "./tmp/postproc"
delimiter: ","
}
}
Expand All @@ -372,7 +447,7 @@ Postprocessing: [
saveTo: {
fileName: "tera_arranged"
fileFormat: "csv"
path: "./Agile.DataQuality/side-code/dump/postproc"
path: "./tmp/postproc"
delimiter: ","
}
}
Expand Down
6 changes: 4 additions & 2 deletions docs/examples/data/customer.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@ id|name
|
null|null
NULL|NULL
nil|nil
nil|nil|toast
1|pew
0|2
30|Paolo
2|Rocco
2|Rocco
test
1312
Loading