diff --git a/README.md b/README.md index 8973859..fb9b4ef 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Data Quality Framework +# Agile Lab Data Quality DQ is a framework to build parallel and distributed quality checks on big data environments. It can be used to calculate metrics and perform checks to assure quality on structured or unstructured data. diff --git a/build.sbt b/build.sbt index d338d7b..98d0c52 100644 --- a/build.sbt +++ b/build.sbt @@ -1,13 +1,13 @@ -import com.typesafe.sbt.packager.MappingsHelper.directory import sbt.GlobFilter import sbt.Keys.{logLevel, scalaVersion, test, updateOptions} -import sbtassembly.AssemblyPlugin.autoImport.assemblyOption -import src.main.scala.BuildEnvPlugin.autoImport.{BuildEnv, buildEnv} -import src.main.scala.BuildIntegrationPlugin.autoImport.{IntegrationEnv, integrationEnv} +import sbtassembly.AssemblyPlugin.autoImport.{assemblyExcludedJars, assemblyOption} +import NativePackagerHelper._ name := "DataQuality-framework" -lazy val commonSettings = Seq(version := "0.2.1") +lazy val commonSettings = Seq( + version := "1.1.0" +) scalacOptions ++= Seq( "-target:jvm-1.8", @@ -15,11 +15,11 @@ scalacOptions ++= Seq( "-feature", "-language:implicitConversions", "-language:postfixOps", - "-language:reflectiveCalls", - "-Xmax-classfile-name", "225" -// "-Ypartial-unification" + "-language:reflectiveCalls" ) +scalacOptions ++= Seq("-Xmax-classfile-name", "225") + resolvers ++= Seq( Resolver.bintrayRepo("webjars","maven"), Resolver.sonatypeRepo("public"), @@ -42,15 +42,13 @@ lazy val common = (project in file("dq-common")) lazy val core = (project in file("dq-core")) .enablePlugins(UniversalPlugin, UniversalDeployPlugin) .settings( -// inThisBuild( -// commonSettings ++ List(scalaVersion := "2.10.6") -// ), scalaVersion := "2.10.6", commonSettings, libraryDependencies ++= Seq( - "org.apache.spark" %% "spark-core" % "1.6.0", - "org.apache.spark" %% "spark-sql" % "1.6.0", - "org.apache.spark" %% "spark-hive" % "1.6.0", + "org.apache.spark" %% "spark-core" % "1.6.0", //place % "provided" before deployment + "org.apache.spark" %% "spark-sql" % "1.6.0", //place % "provided" before deployment + "org.apache.spark" %% "spark-hive" % "1.6.0", //place % "provided" before deployment + "com.databricks" %% "spark-avro" % "2.0.1", "com.databricks" %% "spark-csv" % "1.5.0", "org.apache.commons" % "commons-lang3" % "3.0", @@ -77,17 +75,36 @@ lazy val core = (project in file("dq-core")) assemblyExcludedJars in assembly := (fullClasspath in assembly).value.filter(_.data.getName startsWith "spark-assembly"), assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = true), test in assembly := {}, + assemblyMergeStrategy in assembly := { + case PathList("javax", "servlet", xs @ _*) => MergeStrategy.last + case PathList("javax", "activation", xs @ _*) => MergeStrategy.last + case PathList("org", "apache", xs @ _*) => MergeStrategy.last + case PathList("com", "google", xs @ _*) => MergeStrategy.last + case PathList("com", "esotericsoftware", xs @ _*) => MergeStrategy.last + case PathList("com", "codahale", xs @ _*) => MergeStrategy.last + case PathList("com", "yammer", xs @ _*) => MergeStrategy.last + case "about.html" => MergeStrategy.rename + case "META-INF/ECLIPSEF.RSA" => MergeStrategy.last + case "META-INF/mailcap" => MergeStrategy.last + case "META-INF/mimetypes.default" => MergeStrategy.last + case "plugin.properties" => MergeStrategy.last + case "log4j.properties" => MergeStrategy.last + case x => + val oldStrategy = 
(assemblyMergeStrategy in assembly).value + oldStrategy(x) + }, mappings in Universal += { val confFile = buildEnv.value match { - case BuildEnv.Dev => "path to application.conf" - case BuildEnv.Test => "path to application.conf" - case BuildEnv.Production => "path to application.conf" + case BuildEnv.Stage => "conf/qa.conf" + case BuildEnv.Test => "conf/test.conf" + case BuildEnv.Production => "conf/prod.conf" + case BuildEnv.Dev => "conf/dev.conf" } ((resourceDirectory in Compile).value / confFile) -> "conf/application.conf" }, mappings in Universal ++= { val integrationFolder = integrationEnv.value match { - case IntegrationEnv.local => "path to integration directory" + case _ => "integration/dev" } directory((resourceDirectory in Compile).value / integrationFolder / "bin") ++ directory((resourceDirectory in Compile).value / integrationFolder / "conf") @@ -167,9 +184,9 @@ lazy val ui = (project in file("dq-ui")) // use the combined tslint and eslint rules plus ng2 lint rules (rulesDirectories in tslint) := Some(List( - tslintEslintRulesDir.value, - // codelyzer uses 'cssauron' which can't resolve 'through' see https://github.com/chrisdickinson/cssauron/pull/10 - ng2LintRulesDir.value + tslintEslintRulesDir.value, + // codelyzer uses 'cssauron' which can't resolve 'through' see https://github.com/chrisdickinson/cssauron/pull/10 + ng2LintRulesDir.value )), // the naming conventions of our test files diff --git a/docs/examples/conf/full-prostprocess-example.conf b/docs/examples/conf/full-prostprocess-example.conf index 1a64b83..cc8ad2f 100644 --- a/docs/examples/conf/full-prostprocess-example.conf +++ b/docs/examples/conf/full-prostprocess-example.conf @@ -2,20 +2,28 @@ Sources: [ { id = "GOT_B" type = "HDFS" - path = "./Agile.DataQuality/side-code/example-data/battles.csv" + path = "./docs/examples/data/battles.csv" delimiter = "," header = true fileType = "csv" - keyFields = ["name","year","attacker_king","defender_king"] + keyFields = ["name","year","defender_king"] }, { id = "GOT_D" type = "HDFS" - path = "./Agile.DataQuality/side-code/example-data/character-deaths.csv" + path = "./docs/examples/data/character-deaths.csv" delimiter = "," header = true fileType = "csv" - } + }, + { + id = "customer" + type = "HDFS" + path = "./docs/examples/data/customer.csv" + delimiter = "|" + header = false + fileType = "csv" + }, ] VirtualSources: [ @@ -43,6 +51,39 @@ VirtualSources: [ }, ] +LoadChecks: [ + { + id = "customer_encoding_check" + type = "ENCODING" + source = "customer" + option = "UTF-8" + }, + { + id = "customer_exact_column" + type = "EXACT_COLUMN_NUM" + source = "customer" + option = 1 + }, + { + id = "customer_min_column" + type = "MIN_COLUMN_NUM" + source = "customer" + option = 2 + }, + { + id = "customer_file_type" + type = "FILE_TYPE" + source = "customer" + option = "avro" + }, + { + id = "customer_file_existence" + type = "EXIST" + source = "customer" + option = true + } +] + Metrics: [ { id: "row_count" @@ -53,6 +94,36 @@ Metrics: [ file: "GOT_B" } }, + { + id: "customer_row_count" + name: "ROW_COUNT" + type: "FILE" + description: "rowcount" + config: { + file: "customer" + } + }, + { + id: "null_values" + name: "NULL_VALUES" + type: "COLUMN" + description: "null values in column attacker_size" + config: { + file: "customer", + columns: ["attacker_size"], + positions: [1] + } + }, + { + id: "null_values_col" + name: "NULL_VALUES" + type: "COLUMN" + description: "null values in column attacker_size" + config: { + file: "customer", + columns: ["C0"] + } + }, { id: "average" 
name: "AVG_NUMBER" @@ -232,41 +303,45 @@ Checks: [ Targets: [ { - type: "CHECKS" + type: "FILE_METRICS" config: { fileFormat: "csv" - path: "./Agile.DataQuality/side-code/dump" + path: "./tmp/results" delimiter: "," - savemode: "append" } }, { - type: "COLUMNAR-METRICS" + type: "COLUMN_METRICS" config: { fileFormat: "csv" - path: "./Agile.DataQuality/side-code/dump" + path: "./tmp/results" delimiter: "," - savemode: "append" } }, { - type: "FILE-METRICS" + type: "COMPOSED_METRICS" config: { fileFormat: "csv" - path: "./Agile.DataQuality/side-code/dump" + path: "./tmp/results" delimiter: "," - savemode: "append" } }, { - type: "COMPOSED-METRICS" + type: "CHECKS" config: { fileFormat: "csv" - path: "./Agile.DataQuality/side-code/dump" + path: "./tmp/results" delimiter: "," - savemode: "append" } }, + { + type: "LOAD_CHECKS" + config: { + fileFormat: "csv" + path: "./tmp/results" + delimiter: "," + } + } ] Postprocessing: [ @@ -286,7 +361,7 @@ Postprocessing: [ saveTo: { fileName: "tera_enriched" fileFormat: "csv" - path: "./Agile.DataQuality/side-code/dump/postproc" + path: "./tmp/postproc" delimiter: "," } } @@ -299,7 +374,7 @@ Postprocessing: [ saveTo: { fileName: "tera_transposed" fileFormat: "csv" - path: "./Agile.DataQuality/side-code/dump/postproc" + path: "./tmp/postproc" delimiter: "," quoted: true } @@ -313,7 +388,7 @@ Postprocessing: [ saveTo: { fileName: "tera_headless" fileFormat: "csv" - path: "./Agile.DataQuality/side-code/dump/postproc" + path: "./tmp/postproc" delimiter: "," } } @@ -332,7 +407,7 @@ Postprocessing: [ saveTo: { fileName: "tera_empty" fileFormat: "csv" - path: "./Agile.DataQuality/side-code/dump/postproc" + path: "./tmp/postproc" delimiter: "," } } @@ -345,7 +420,7 @@ Postprocessing: [ saveTo: { fileName: "empty_headless" fileFormat: "csv" - path: "./Agile.DataQuality/side-code/dump/postproc" + path: "./tmp/postproc" delimiter: "," } } @@ -359,7 +434,7 @@ Postprocessing: [ saveTo: { fileName: "empty_headless_keyed" fileFormat: "csv" - path: "./Agile.DataQuality/side-code/dump/postproc" + path: "./tmp/postproc" delimiter: "," } } @@ -372,7 +447,7 @@ Postprocessing: [ saveTo: { fileName: "tera_arranged" fileFormat: "csv" - path: "./Agile.DataQuality/side-code/dump/postproc" + path: "./tmp/postproc" delimiter: "," } } diff --git a/docs/examples/data/customer.csv b/docs/examples/data/customer.csv index 8ffb205..60d2b13 100644 --- a/docs/examples/data/customer.csv +++ b/docs/examples/data/customer.csv @@ -2,8 +2,10 @@ id|name | null|null NULL|NULL -nil|nil +nil|nil|toast 1|pew 0|2 30|Paolo -2|Rocco \ No newline at end of file +2|Rocco +test +1312 diff --git a/docs/examples/data/gpp_sample.csv b/docs/examples/data/gpp_sample.csv new file mode 100644 index 0000000..0e1c84d --- /dev/null +++ b/docs/examples/data/gpp_sample.csv @@ -0,0 +1,10 @@ +"RO1","ROA","18A31131749E0600","0","2018-10-31 13:17:49.0","13:17:49","2018-10-31 13:20:05",,"BACXROBUXXX","FEEDER","SCTS2",,"008",,"103","SP8","S","02a1510971994118be7459f22db1e819","8304104854","COMPLETE",,,,"400","3213","3213","3213","3213",,,,"EUR","EUR","EUR","EUR",,"GWS",,,"2018-10-31 00:00:00.0","2018-11-01 00:00:00.0","2018-10-31 00:00:00.0",,,,,"2018-10-31 13:18:03.0",,,,,"False","A",,"False","O",,"0",,,,,,,,"False",,"02","A",,,"15197.49","0",,"055B8B2CE50118145986DD45A1B81644","2018-10-31 
13:20:05.253",,,"False",,,,"SLEV","21083","73033","E0P",,"NOW",,"459058406",,,,"0","False","3213","3213","False","False","False","02a1510971994118be74",,,"False","False",,,,,,,"False","N","N","PAY",,,"False","0000000022028912","False",,,,,,,,"N",,,"4284312","4284312","0001","NT",,,,,"SA",,"False","DEFAULT-RO","False","True",,,,"Y",,"2018-11-01 15:00:00.0","B",,"N",,"False","TREA_EURD0",,"8304104854","False","False",,"O","True","True",,,,,,"False","26056","00001001","00134552",,"745","745",,"False",,,"ORAD","HO","ORAD","ORAD",,,"E0B",,"B","2018-10-31 15:45:00.0","ROCOTIBI","2018-10-31 00:00:00.0","False",,,,,,"RON",,,,,"18A3113200400104",,"400",,,"RO_SEPA_BULK","(BULK)","18A31131752F0200","18A31131749E0600","BACXROBUXXX",,"BKAUATWWXXX",,"103",,,,"1810310000000000",,"AC","RO59BACX0000000134552320",,,"TRANSMEC RO SRL","COMUNA BORS NR.278","JUDETUL BIHOR","SAT BORS ROMANIA 417075",,,"BACXROBUXXX",,,,,,,,,"BKAUATWWXXX",,,,,,,,"RZBRROBU",,,,,,"AC","RO20RZBR0000060008528952",,,"EUROHAUL SERVICES SRL","DAMBOVITA LOC.BARBULESTI NR.71",,,"8304104854","31081015e504422d8e5dcb78e1eb77c3","C/V Doc. 4051-13/09/2018",,,,,,,,,,,,,,,,,,,,,,,,,,,,"0020181031258921","0","ACC","00134552320",,,,,,,,,"00001001320",,"0.00",,"EUR",,,"@A","RO1","RO1",,,"SHA",,,,,,,,,,,"00134552320","00001001320","00134552320",,,,,,"BC949C3244248932C038C5B668218EF5",,"424D404C434040424D404C434040424D404C404040424D404C404040404D404C404040404D404C404040404D404C4040402D2D464043504240C1604040E140404044C04044404043C0404040","True",,,,,,"False",,,,,,,,,"1.00000000","1.00000000",,,"RO",,,,,,,,,,"False",,,,,"EUR",,,"EUR","651","033","R",,"0",,,"V","SA","SA",,,,,"RO1",,,,,,,,,,,,,,,,,,,,,,,,,,,,,"RO","RO","NOS","False",,,"V",,,,,,,,"K" +"RO1","ROA","18A31123452E0600","0","2018-10-31 12:34:52.0","12:34:52","2018-10-31 12:34:59",,"BACXROBUXXX","FEEDER","BOOK",,"103",,"103",,"S",,"8304103995","COMPLETE",,,,"400","1000","4666.8","1000","4666.8",,,,"EUR","RON","EUR","RON",,"GWX",,,"2018-10-31 00:00:00.0","2018-10-31 00:00:00.0","2018-10-31 00:00:00.0",,,,,"2018-10-31 12:34:59.0",,,,,"False","A",,"False","O",,"0",,,,,,,,"False",,"02","A",,,"4730","0",,"10F8903E03C023B620208FF61DB6DDEB","2018-10-31 12:34:59.372",,,"False",,,,"CRED","21083","73033","E0M",,"WAIVE",,"14727962",,,,"0","False","1000","4666.8","False","False","False",,,,"False","False",,,,,,,"False","N","N","PAY",,,"False","0000000022028156","False",,,,,,,,"N",,,"4284313","4284313",,"NT",,,,,"SA",,"False","DEFAULT-RO","False","False",,,,"Y",,,"N",,"N",,"False",,,"8304103995","False","False",,"O","True","True",,,"A",,,"False","26056","01663669","01663669",,"751","751",,"False",,,"ORIZ","ORIZ","ORIZ",,"4.64500000",,"E0M","2018-10-31 00:00:00.0","B","2018-10-31 15:45:00.0","ROCOTGWX","2018-10-31 00:00:00.0","False",,,,,,"RON",,,,,,,"400",,,,"(2RTR)",,"18A31123452E0600","BACXROBUXXX",,"BACXROBUXXX",,"103",,,,"1810310000000000",,"AC","RO51BACX0000001663669000",,,"GHITULESCU FLORICA","STR.Drumul Taberei NR.2","BL.I SC.B ET.- AP.15 SECT.6","BUCURESTI ROMANIA 061416",,,,,,,,,,,,,,,,,,,,,,,,,,"AC","01663669002",,,"GHITULESCU 
FLORICA",,,,,,"SCHIMB",,,,,,,,,,,,,,,,,,,,,,,,,,,,"0020181031257553","0","ACC","01663669000",,,,,,,,,"01663669002",,,,,,,"@Z","RO1","RO1",,,"SHA",,,,,,,,,,,"01663669000","01663669002",,,,,,,"2276256683F5C4F95AFB09A8EE6A36EE",,"424D404C434040424D404C434040404D404C434040404D404C404040404D404C404040404D404C404040404D404C4040402D2D46404350424041604040E04040404440404440404340404040","True",,,,,,"False",,,,,,,,,"1.00000000","4.66680000",,,"RO",,,,,,,,,,"False",,,,,,,,"RON","955","001","RR",,"0",,,"N","SA","SA",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"RO","RO","ACC","True",,,"V",,,,,,,,"K" +"RO1","ROA","18A31092323E0201","0","2018-10-31 09:23:23.0","09:23:23","2018-10-31 09:40:59",,"CRFIIT3FXXX","SCTS2","BOOK",,"008","SP8","103","SP8","S","FBAWI18101651776","8304201896","COMPLETE",,,,"400","783","783","783","783",,,,"EUR","EUR","EUR","EUR",,"ST2",,,"2018-10-31 00:00:00.0","2018-10-31 00:00:00.0","2018-10-31 00:00:00.0",,,,,"2018-10-31 09:40:59.0",,,,,"False","A",,"False","I",,"0",,,,,,,,"False",,"02","A",,,"3703.59","0",,"F653C1F77F2F0B4BB3F4C45AC8A4C0A2","2018-10-31 09:40:59.775",,,"False",,,,"SEPA","21083","73033","E0B","WAIVE",,,,"14727958",,,"0","False","783","783","False","False","False",,,,"False","False",,,,,,,"False","N","N","PAY",,,"False",,"False",,,,,,,,"N",,,"4284312","4284312","0001","NT",,,,,"SA",,"False","DEFAULT-RO","True","False",,,,"Y",,,"N",,"N",,"False",,,"8304201896","False","False",,"O","True","True",,,,,,"False","26055","01035953","00001001",,"758","758",,"False",,,"HO","UNIR","HO",,,,"E0M",,"B","2018-10-31 16:15:00.0","ROCOTST2","2018-10-31 00:00:00.0","False",,,,,,"RON",,,,"pacs.008.001.0220181031090000500001",,,"400",,,,"(2RTR)","18A31093112F0500","18A31092323E0201","BACXROBUXXX",,"BACXROBUXXX",,"103","SP8",,,,,"AC","IT32C0616014100100000004138",,,"IORGULESCU LOREDANA ARISTITA","VIA ANTONIO NARDI 5 52100AREZZO",,"AR",,,"CRFIIT3FXXX",,,,,,,,,,,,,,,,,,,,,,,"AC","RO40BACX0000001035953001",,,"MINTICI DAN",,,,"BK20023830348671186040EUV","NOTPROVIDED","plata reparatii",,,,,,,,,,,,,,,,,,,,,,,,,,,,"0020181031252408","0","NOS","00001001320",,,,,,,,,"01035953001",,,,,,,"@Z","RO1","RO1",,,"SHA",,,,,,,,,,,"00001001320","01035953001",,,,,,,"9C6472AD493E994288D05BD2A60E3F99",,"424D4040434040424D4040434040404D4040434040404D4040404040404D4040404040404D4040404040404D40404040402D2D46404350424041604040E040404044C0404440404340404040","True",,,,,,"False",,,,,,,,,"1.00000000","1.00000000",,,"RO",,,,,,,,,,"False",,,,,,,,"EUR","955","001","R",,"0",,,"V","SA","SA",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"IT","RO","ACC","False",,,"V",,,,,,,, +"RO1","ROA","18A31092248E0301","0","2018-10-31 09:22:48.0","09:22:48","2018-10-31 09:39:57",,"BCITITMMXXX","SCTS2","BOOK",,"008","SP8","103","SP8","S","FBAWI18101648333","8304201211","COMPLETE",,,,"400","1064","1064","1064","1064",,,,"EUR","EUR","EUR","EUR",,"ST2",,,"2018-10-31 00:00:00.0","2018-10-31 00:00:00.0","2018-10-31 00:00:00.0",,,,,"2018-10-31 09:39:57.0",,,,,"False","A",,"False","I",,"0",,,,,,,,"False",,"02","A",,,"5032.72","0",,"E949D4760A5563DEB74AF061404FEB56","2018-10-31 09:39:57.465",,,"False",,,,"SEPA","21083","73033","E0B","WAIVE",,,,"14727958",,,"0","False","1064","1064","False","False","False",,,,"False","False",,,,,,,"False","N","N","PAY",,,"False",,"False",,,,,,,,"N",,,"4284312","4284312","0001","NT",,,,,"SA",,"False","DEFAULT-RO","True","False",,,,"Y",,,"N",,"N",,"False",,,"8304201211","False","False",,"O","True","True",,,,,,"False","26055","04533888","00001001",,"758","758",,"False",,,"HO","BRA4","HO",,,,"E0M",,"B","2018-10-31 
16:15:00.0","ROCOTST2","2018-10-31 00:00:00.0","False",,,,,,"RON",,,,"pacs.008.001.0220181031090000500001",,,"400",,,,"(2RTR)","18A31093029F0502","18A31092248E0301","BACXROBUXXX",,"BACXROBUXXX",,"103","SP8",,,,,"AC","IT39D0306902233100000009023",,,"ITAQUA","VIA KARL LUDWIG VON BRUCK 32 34144T",,"RIESTE TS",,,"BCITITMMXXX",,,,,,,,,,,,,,,,,,,,,,,"AC","RO10BACX0000004533888001",,,"URZICA NICUSOR",,,,"011810300KQLSC","TQT2IGPQBZ7WM15408958074450.9540127","S.DO 09/2018",,,,,,,,,,,,,,,,,,,,,,,,,,,,"0020181031252233","0","NOS","00001001320",,,,,,,,,"04533888001",,,,,,,"@Z","RO1","RO1",,,"SHA",,,,,,,,,,,"00001001320","04533888001",,,,,,,"67E16169C50FAA034EBA06ECB94E7AF3",,"424D4040434040424D4040434040404D4040434040404D4040404040404D4040404040404D4040404040404D40404040402D2D46404350424041604040E040404044C0404440404340404040","True",,,,,,"False",,,,,,,,,"1.00000000","1.00000000",,,"RO",,,,,,,,,,"False",,,,,,,,"EUR","955","001","R",,"0",,,"V","SA","SA",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"IT","RO","ACC","False",,,"V",,,,,,"SALA",, +"RO1","ROA","18A31141055E0200","0","2018-10-31 14:10:55.0","14:10:55","2018-10-31 14:11:01",,"BACXROBUXXX","FEEDER","BOOK",,"103",,"103",,"S",,"8304105553","COMPLETE",,,,"400","850","3918.5","850","3918.5",,,,"EUR","RON","EUR","RON",,"MOB",,,"2018-10-31 00:00:00.0","2018-10-31 00:00:00.0","2018-10-31 00:00:00.0",,,,,"2018-10-31 14:11:01.0",,,,,"False","A",,"False","O",,"0",,,,,,,,"False",,"02","A",,,"4020.5","0",,"7516F5C9573D0FBFECC3911482AB0191","2018-10-31 14:11:01.693",,,"False",,,,"CRED","21083","73033","E0M",,,,,,,,"0","False","850","3918.5","False","False","False",,,,"False","False",,,,,,,"False","N","N","PAY",,,"False","0000000022029586","False",,,,,,,,"N",,,"4284312","4284312",,"NT",,,,,"SA",,"False","DEFAULT-RO","False","False",,,,"Y",,,"N",,"N",,"False",,,"8304105553","False","False",,"O","True","True",,,"N",,,"False","26056","00878358","00878358",,"564","564",,"False",,,"TGJI","TGJI","TGJI",,"4.64500000",,"E0M","2018-10-31 00:00:00.0","B","2018-10-31 15:45:00.0","ROCOTGWX","2018-10-31 00:00:00.0","False",,,,,,"RON",,,,,,,"400",,,,"(2RTR)",,"18A31141055E0200","BACXROBUXXX",,"BACXROBUXXX",,"103",,,,"1810310000000000",,"AC","RO87BACX0000000878358001",,,"HAINARU SORIN","STR.PIRIU BOIA NR.139A","BL.- SC.- ET.- AP.- GORJ","Jupanesti ROMANIA 217270",,,,,,,,,,,,,,,,,,,,,,,,,,"AC","00878358000",,,"HAINARU SORIN",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"0020181031259741","0","ACC","00878358001",,,,,,,,,"00878358000",,,,,,,"@Z","RO1","RO1",,,"SHA",,,,,,,,,,,"00878358001","00878358000",,,,,,,"0086FF7CA13855EC1C2D6E0B16C6956D",,"424D404C434040424D404C434040404D404C434040404D404C404040404D404C404040404D404C404040404D404C4040402D2D46404350424041604040E04040404440404440404340404040","True",,,,,,"False",,,,,,,,,"1.00000000","4.61000000",,,"RO",,,,,,,,,,"False",,,,,,,,"RON","955","001","RR",,"0",,,"N","SA","SA",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"RO","RO","ACC","True",,,"V",,,,,,,,"K" +"RO1","ROA","18A31080213E0603","0","2018-10-31 08:02:13.0","08:02:13","2018-10-31 08:02:39",,"MGRMUS44XXX","SWIFT","DM101C",,"101","103","103",,"S","52672643","8304200223","COMPLETE",,,,"400","31606.91","31606.91","31606.91","31606.91",,,,"RON","RON","RON","RON",,"SWF",,,"2018-10-31 00:00:00.0","2018-10-31 00:00:00.0","2018-10-31 00:00:00.0",,,,,"2018-10-31 08:02:32.0",,,,,"False","A",,"False","I",,"1",,,,,,,,"False",,"02","A",,,"31606.91","0",,"015916C7663E02EBC490870A66B67930","2018-10-31 
08:02:39.949",,,"False",,,,,"21083","73033","E0L",,"WAIVE",,"14728001",,,,"0","False","31606.91","31606.91","False","False","False",,,,"False","False",,,,,,,"False","W","N","PAY",,,"False",,"False",,,,,,,,"N",,,"4284312","4284312","0001","NT",,,,,"SA",,"False","DEFAULT-RO","False","False",,,,"Y",,,"N",,"N",,"False",,,"8304200223","False","False",,"O","True","True",,,,"ROCTO",,"False","26055","00000100","01030152",,"100","100",,"False",,,"GRIM",,"GRIM",,,,"E0B",,"B","2018-10-31 15:30:00.0","ROCOTMT101","2018-10-31 00:00:00.0","False",,,,,,"RON",,,,,,,"400",,,,"(2RTR)","18A31080215F0100","18A31080213E0603","BACXROBUXXX",,"RZBRROBUXXX",,"103",,,,"1810312749201681","1810319073850836","AC","RO03BACX0000001030152000",,,"Moneygram Payment Systems Inc","1550 Utica Avenue S 55416","Minneapoli Minnesota USA",,,,"BACXROBUXXX",,,,,,,,,,,,,,,"RO","RZBR","RZBRROBU",,,,,,"AC","RO44RZBR0000060013417223",,,"LIVERTI SRL - BRAILA","STEFAN CEL MARE 455 810102 BRAILA","BRAILA RO",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"0","ACC","01030152000",,,,,,,,,"GPPINTERNAL",,,,,,,"@A","RO1","RO1",,,"SHA",,,,,,,,,,,"01030152000","GPPINTERNAL",,,,,,,"D9ACE3EF19669C7F4BACF15D736C1240",,"424040404340404240404043404040404040434040404040404040404040404040404040404040404040404040404040402D2D44404350424041604040E1404040C4C04044404043C0404040","True",,,,,,"False","AC","RO03BACX0000001030152000",,,"Moneygram Payment Systems Inc","1550 Utica Avenue S 55416","Minneapoli Minnesota USA",,"1.00000000","1.00000000",,,"RO",,,,,,"52672643",,,,"False",,,,,,,,"RON","651","033","R",,"0",,,"V","SA","SA",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"CLR","False",,,"V",,,,,,,, +"RO1","ROA","18A31133737E0300","0","2018-10-31 13:37:37.0","13:37:37","2018-10-31 13:41:55",,"BACXROBUXXX","FEEDER","SWIFT",,"103",,"103",,"S",,"8304105106","COMPLETE",,,,"400","5969.38","5969.38","5969.38","5969.38","0","0",,"EUR","EUR","EUR","EUR",,"GWS",,,"2018-10-31 00:00:00.0","2018-11-02 00:00:00.0","2018-10-31 00:00:00.0",,"WAITOFAC",,,,,,,,"False","A",,"False","O",,"0",,,,,,,,"False",,"02","A",,,"28235.17","0",,"A64028C11E4C8B42259F7FF216A48A90","2018-10-31 13:41:55.896",,"N","False",,,,"CRED","21083","73033","E0Z",,"NOW",,"459048078",,,,"0","False","5969.38","5969.38","False","False","False",,,,"False","False",,,,,,,"False","N","N","OPI",,,"False","0000000022029154","False",,,"18A31133737E0300",,,,,"N",,,"4284312","4284312","0001","NT",,,,,"SA",,"False","DEFAULT-RO","False","True",,,,"Y",,"2018-11-02 15:00:00.0","B",,"N",,"False","TREA_EURD0",,"8304105106","False","False",,"O","True","True",,,,"ROSFO",,"False","26056","00001001","01430573",,"722","722",,"False",,,"VICT","HO","VICT","VICT",,"0",,,"B","2018-10-31 15:45:00.0","ROCOTIBI","2018-10-31 00:00:00.0","False",,"0",,,,"RON",,,,,,,"400",,,,"(2RTR)","18A31133740F0400","18A31133737E0300","BACXROBUXXX",,"CHASDEFXXXX",,"103",,"0",,"1810310000000000",,"AC","RO97BACX0000001430573001",,,"ROMWOOD GROUP SRL","STR.DRUMUL POTCOAVEI NR 59/BL LOT1/","ORAS VOLUNTARI /ILFOV","VOLUNTARI ROMANIA 77190",,,,,,,,,,,,,,,,,,,,"TICSRUMMXXX",,,,,,"AC","40802978700000000274",,,"IP YAKOV MIKHAILOVICH TRUSHKOV",,,,,,"INVOICES NO 018/31.10.2018, TRANSIT ACC. 40802978920000000274, J.P.MORGAN AG, ACC. 
6231608701, SWIFT CHASDEFXXXX",,,,,,,,,,,,,,,,,,,,,,,,,,,,,"0","ACC","01430573001",,,,,,,,,"00001001320",,"23.65",,"RON",,,"@Z","RO1","RO1",,,"OUR",,,,,,,,,,,"01430573001","00001001320","01430573000",,,,,"09708005320","52C0C531D604876DB665008A987493BB",,"4040404C4340404040404C4340404040404C4340404040404C4040404040404C4040404040404C4040404040404C4040402D2D424043524040C1604040E1484040C4C0404440404340404040","True","0",,"0",,,"False",,,,,,,,,"1.00000000","1.00000000",,,"RU",,,,,,,,,,"False",,,,,"RON",,,"EUR",,,"RN",,"0",,,"N","SA","SA",,,,,"RO1",,,,"0",,,"BKAUATWWXXX",,,,,,,,,,,,,,,,,,,,,,"RO","RU","NOS","False",,,"V",,,,,,,,"K" +"RO1","ROA","18A31111833E0300","0","2018-10-31 11:18:33.0","11:18:33","2018-10-31 11:28:04",,"BACXROBUXXX","FEEDER","BOOK",,"103",,"103",,"S",,"8304102490","COMPLETE",,,"R580593","400","30","139.35","30","139.35",,,,"EUR","RON","EUR","RON",,"MOB",,,"2018-10-31 00:00:00.0","2018-10-31 00:00:00.0","2018-10-31 00:00:00.0",,,,,"2018-10-31 11:27:56.0",,,,,"False","A",,"False","O",,"0",,,,,,,,"False",,"02","A",,,"141.9","0",,"CDFB76639655184646DE79E76BC505B8","2018-10-31 11:28:04.436",,,"False",,,,"CRED","21083","73033","E0D",,"WAIVE",,"14728001",,,,"2","False","30","139.35","False","False","False",,,,"False","False",,,,,,,"False","N","N","PAY",,,"False","0000000022026747","False",,,,,,,,"N",,,"4284312","4284312",,"NT",,,,,"SA",,"False","DEFAULT-RO","False","False",,,,"Y",,,"N",,"N",,"False",,,"8304102490","False","False",,"O","True","True",,,"Y",,,"True","26056","00011329","00011329",,"564","564",,"False",,,"STAF","STAF","STAF",,"4.64500000",,"E0D",,"B","2018-10-31 15:45:00.0","ROCOTGWX","2018-10-31 00:00:00.0","False",,,,,,"RON",,,,,,,"400",,,,"(2RTR)",,"18A31111833E0300","BACXROBUXXX",,"BACXROBUXXX",,"103",,,,"1810310000000000",,"AC","RO34BACX0000000011329014",,,"BALSANU FLORIN","BLD.IULIU MANIU,NR.14,BL.13,ET.3,AP","SECTOR 6","BUCURESTI ROMANIA 230128",,,,,,,,,,,,,,,,,,,,,,,,,,"AC","00011329000",,,"BALSANU FLORIN",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"0020181031255810","0","ACC","00011329014",,,,,,,,,"00011329000",,,,,,,"@Z","RO1","RO1",,,"SHA",,,,,,,,,,,"00011329014","00011329000",,,,,,,"1094F188ECF05478F6C7655F75CB2C0A",,"424D404C434040424D404C434040404D404C434040404D404C404040404D404C404040404D404C404040404D404C4040402D2D46564350464041604040E04040404449404440404340404040","True",,,,,,"False",,,,,,,,,"1.00000000","4.64500000",,,"RO",,,,,,,,,,"False",,,,,,,,"RON","955","963","RR",,"0",,,"N","SA","SA",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"RO","RO","ACC","True",,,"V",,,,,,,,"K" +"RO1","ROA","18A31100939E0701","0","2018-10-31 10:09:39.0","10:09:39",,,"BKAUATWWXXX","SWIFT",,,"900",,"900",,"S","FBAWI18101689505","FBAWI18101689505","COMPLETE",,,,"400","35500","35500",,,,,,"EUR","EUR",,,,"SWF",,,"2018-10-31 00:00:00.0","2018-10-31 00:00:00.0","2018-10-31 00:00:00.0",,,,,,,,,,"False","A",,"False","I",,"0",,,,,,,,"False",,"02","A",,,,"0",,"7E7D61E6C1052ADE41162C1CFA2E99AF","2018-10-31 
10:09:39.285",,,"False",,,,,,,,,,,,,,,"0","False",,,"False","False","False",,,,"False","False",,,,,,,"False","N","N","NAC",,,"False",,"False",,,,,,,,"N",,,,,,"NT",,,,,"SA",,"False",,"False","False",,,,,,,,,,,"False",,,,"False","False",,"O","True","True",,,,,,"False","26061",,,,,,,"False",,,,,,,,,,,,,,,"False",,,,,,,,,,,,,"400",,,,,,"18A31100939E0701","BACXROBUXXX",,"BACXROBUXXX",,"900",,,,"1810311402097437","1810319073852674",,,,,,,,,,,"BACXROBUXXX",,,,,,,,,,,,,,,,,,,,,,,,,"BACXROBUXXX",,,,,,"8304100815","8304100815",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"0",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"7267794A30FB31399F4362922201D271",,"404040404040404040404040404040404040404040404040404040404040404040404040404040404040404040404040402D2D40404040404040404040404040404040404040404240404040","False",,,,,,"False",,,,,,,,,,,,,,,,,,,,,,,"False",,,,,,,,,,,,,"0",,,"N","SA","SA",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"False",,,"N",,,,,,,, +"RO1","ROA","18A31092237E0201","0","2018-10-31 09:22:37.0","09:22:37","2018-10-31 11:53:10",,"RZSTAT2G151","SCTS2","BOOK",,"008","SP8","103","SP8","S","FBAWI18101647654","8304200991","COMPLETE",,,"R664428","400","3944.5","3944.5","3944.5","3944.5",,,,"EUR","EUR","EUR","EUR",,"ST2",,,"2018-10-31 00:00:00.0","2018-10-31 00:00:00.0","2018-10-31 00:00:00.0",,,,,"2018-10-31 11:52:50.0",,,,,"False","A",,"False","I",,"0",,,,,,,,"False",,"02","A",,,"18657.49","0",,"87D8CD47035CDAC48E28924F928B34A5","2018-10-31 11:53:10.642",,,"False",,,,"SEPA","21083","73033","E0B","WAIVE",,,,"14727958",,,"0","False","3944.5","3944.5","False","False","False",,,,"False","False",,,,,,,"False","N","N","PAY",,,"False",,"False",,,,,,,,"N",,,"4284312","4284312","0001","NT",,,,,"SA",,"False","DEFAULT-RO","True","False",,,,"Y",,,"N",,"N",,"False",,,"8304200991","False","False",,"O","True","True",,,,,,"True","26055","00030815","00001001",,"758","758",,"False",,,"HO","GRIM","HO",,,,"E0L",,"B","2018-10-31 16:15:00.0","ROCOTST2","2018-10-31 00:00:00.0","False",,,,,,"RON",,,,"pacs.008.001.0220181031090000500001",,,"400",,,,"(2RTR)","18A31115251F0100","18A31092237E0201","BACXROBUXXX",,"BACXROBUXXX",,"103","SP8",,,,,"AC","AT363815100004000485",,,"Hormann GesmbH","Ottendorf 90 8312 Ottendorf an der",,"Rittschein",,,"RZSTAT2G151",,"Raiffeisenbank Ilz-Grosssteinbach-R","Ilz 39",,"Ilz",,,,,,,,,"Ilz",,,,,,,,,"AC","RO03BACX0000000030815320",,,"HOLZINDUSTRIE SCHWEIGHOFER S.R.L.","STRADA INDUSTRIILOR 1",,"RO-515800 SEBES, UD.ALBA","38151181030D10265710000010","NOTPROVIDED","RE 18100770 LS 181-4572-3 Sebes Pellets",,,,,,,,,,,,,,,,,,,,,,,,,,,,"0020181031256387","0","NOS","00001001320",,,,,,,,,"00030815320",,,,,,,"@Z","RO1","RO1",,,"SHA",,,,,,,,,,,"00001001320","00030815320",,,,,,,"7312143EF9DEC8AC2D29D6A9EB7738E3",,"424D4040434040424D4040434040404D4040434040404D4040404040404D4040404040404D4040404040404D40404040402D2D46564350424041604040E040404044C8404440404340404040","True",,,,,,"False",,,,,,,,,"1.00000000","1.00000000",,,"RO",,,,,,,,,,"False",,,,,,,,"EUR","201","033","NR",,"0",,,"V","SA","SA",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"AT","RO","ACC","False",,,"V",,,,,,,, \ No newline at end of file diff --git a/docs/installation/core-setup.md b/docs/installation/core-setup.md index 7d72d31..e30c083 100644 --- a/docs/installation/core-setup.md +++ b/docs/installation/core-setup.md @@ -1,30 +1,53 @@ -Using DQ ------------- +## Data Quality core module setup and usage -DQ is written in Scala, and the build is managed with SBT. +DQ main application is written in Scala, and the build is managed with SBT. 
-Before starting: -- Install JDK -- Install SBT -- Install Git +> **Before starting:** Install JDK, Scala, sbt and Git. -The steps to getting DQ up and running for development are pretty simple: +First of all, clone this repository: +``` +git clone https://github.com/agile-lab-dev/DataQuality.git +``` -- Clone this repository: +Then you have two options: +- Run DQ locally +- Create an archive with the setup to run in your distributed environment - `git clone https://github.com/agile-lab-dev/DataQuality.git` +#### Local run -- Start DQ. You can either run DQ in local or cluster mode: +Simply run the `DQMasterBatch` class using your IDE or Java tools with the following arguments: - - local: default setting - - cluster: set isLocal = false calling makeSparkContext() in `DQ/utils/DQMainClass` +- __-a__: Path to application configuration file. +> **Example:** ./Agile.DataQuality/dq-core/src/main/resources/conf/dev.conf -- Run DQ. You can either run DQ via scheduled or provided mode (shell): +- __-c__: Path to run configuration file. +> **Example:** ./Agile.DataQuality/docs/examples/conf/full-prostprocess-example.conf - - `run.sh`, takes parameters from command line: - **-n**, Spark job name - **-c**, Path to configuration file - **-r**, Indicates the date at which the DataQuality checks will be performed - **-d**, Specifies whether the application is operating under debugging conditions - **-h**, Path to hadoop configuration ---- \ No newline at end of file +- __-d__: Run date. +> **Example:** 2019-01-01 + +- __-l__: _Optional._ Flag to run in local mode. + +- __-r__: _Optional._ Flag to repartition sources after reading. + +#### Distributed environment + +##### Deployment +First, you'll need to deploy your application to the cluster. You can assemble the jar on your own using sbt + or you can use one of our predefined utilities. + +To use our `deploy.sh` script, follow these steps: +- Set REMOTE_HOST and REMOTE_USERNAME in `deploy.sh`. +- Create an `application.conf` for your environment. +- Create a directory with the internal directories `bin` and `conf`. In the corresponding directories, put your + run scripts and configuration files. + > **Tip:** You can use `run-default.sh` as a base for your run script. +- Link the `application.conf` file and the directory with run scripts and confs to the corresponding parameter values +in `build.sbt`. +- Run `deploy.sh` with your parameters. + +##### Submitting +In a distributed environment the Data Quality application is treated as a standard Spark job, submitted + by the `submit.sh` script. + +You can submit your job manually or leverage a run script; this is completely up to you. diff --git a/docs/load_checks.md b/docs/load_checks.md new file mode 100644 index 0000000..c015924 --- /dev/null +++ b/docs/load_checks.md @@ -0,0 +1,49 @@ +# Load Checks + +In addition to regular checks, which rely on metric results, the Data Quality Framework has Load Checks. + Similarly to metric checks, they work on top of Source metadata and a certain boolean expression. + +You should use Load Checks for scenarios in which you don't need to iterate through the rows of the Source's DataFrame. + For example, when you need to check the number of columns. + +Load checks can be Pre or Post load. They are performed at the beginning of the pipeline, before + any other metric or check.
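+Like other results, load check outcomes can be written out through a dedicated target. The snippet below is taken from `docs/examples/conf/full-prostprocess-example.conf` and dumps them as CSV; when database storage is enabled, the same results are also saved to the `results_check_load` table defined in `docs/sql/core.sql`:
+```hocon
+{
+  type: "LOAD_CHECKS"
+  config: {
+    fileFormat: "csv"
+    path: "./tmp/results"
+    delimiter: ","
+  }
+}
+```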
+ +At the moment the following checks are present: +##### Pre +* __EXIST__: Checks if the source is present in the defined path +* __ENCODING__: Checks if the source is loadable with the given encoding +* __FILE_TYPE__: Checks if the source is loadable in the desired format +##### Post +* __EXACT_COLUMN_NUM__: Checks if the number of columns of the source is exactly the desired number +* __MIN_COLUMN_NUM__: Checks if the number of columns of the source is greater than or equal to the desired number + +### Example +```hocon +LoadChecks: [ + { + id = "encoding_check" + type = "ENCODING" + source = "sample_A" + option = "UTF-8" // String: Encoding name (please use encoding names defined in Spark) + }, + { + id = "min_column" + type = "MIN_COLUMN_NUM" + source = "sample_A" + option = 10 // Integer: Number of columns + }, + { + id = "file_type" + type = "FILE_TYPE" + source = "sample_A" + option = "avro" // String: File format (csv, avro) + }, + { + id = "file_existence" + type = "EXIST" + source = "sample_A" + option = true // Boolean: Expected result + } +] +``` diff --git a/docs/sources.md b/docs/sources.md index 90d88c6..49903c1 100644 --- a/docs/sources.md +++ b/docs/sources.md @@ -79,18 +79,20 @@ Will load Hive table as source with selected query. You need to setup hive conne ### HDFS files ##### CSV file -Classic comma separated values (separator is a variable). You also can provide a schema or parse it from the header +Classic comma-separated values. You can also provide a schema or parse it from the header ```hocon { id = "CONT" type = "HDFS" path = "/path/resources/sample-data/contract.csv", fileType = "csv", - separator = "|", + delimiter = "|", // optional + quote = "'", // optional + escape = "\\", // optional header = false, // Optional fields schema = [{name: "a", type: "string"}, {name: "b", type: "string"},{name: "c", type: "string"}] - date = "2017-05-19" + date = "2017-05-19" } ``` ##### Fixed format file diff --git a/dq-core/schema.sql b/docs/sql/core.sql similarity index 75% rename from dq-core/schema.sql rename to docs/sql/core.sql index 7c7b3e1..f01040a 100644 --- a/dq-core/schema.sql +++ b/docs/sql/core.sql @@ -57,6 +57,19 @@ CREATE TABLE "results_check" ( +DROP TABLE IF EXISTS "results_check_load"; +CREATE TABLE "results_check_load" ( + "id" TEXT NOT NULL, + "src" TEXT NOT NULL, + "tipo" TEXT NOT NULL, + "expected" TEXT NOT NULL, + "date" TEXT NOT NULL, + "status" TEXT NOT NULL, + "message" TEXT, + UNIQUE(id, date) +); + + CREATE OR REPLACE FUNCTION upsert_colmet() RETURNS trigger AS $upsert_colmet$ @@ -189,4 +202,37 @@ create trigger checks_insert before insert on results_check for each row - execute procedure upsert_check(); \ No newline at end of file + execute procedure upsert_check(); + + +CREATE OR REPLACE FUNCTION upsert_check_load() + RETURNS trigger AS +$upsert_check_load$ +declare + existing record; +begin + if (select EXISTS (SELECT 1 FROM results_check_load + WHERE id = NEW.id AND date = NEW.date)) then + + UPDATE results_check_load SET + src = NEW.src, + tipo = NEW.tipo, + expected = NEW.expected, + status = NEW.status, + message = NEW.message + WHERE id = NEW.id AND + date = NEW.date; + + return null; + end if; + + return new; +end +$upsert_check_load$ + LANGUAGE plpgsql; + +create trigger checks_load_insert + before insert + on results_check_load + for each row +execute procedure upsert_check_load(); diff --git a/dq-core/deploy.sh b/dq-core/deploy.sh old mode 100644 new mode 100755 index f79d05c..5a3f347 --- a/dq-core/deploy.sh +++ b/dq-core/deploy.sh @@ -1,16 +1,20 @@ #!/usr/bin/env bash # 
$1 - build environment -# $2 - integration project: ... -# todo: add your credentials +# $2 - integration project: xsell, npe, gcif, ... +echo "BUILD ENV"$1 case $1 in - dev) - REMOTE_HOST= - REMOTE_USERNAME= + stage) + REMOTE_HOST=hdpqemu01.internal.unicreditgroup.eu + REMOTE_USERNAME=tubd2899 ;; test) - REMOTE_HOST= - REMOTE_USERNAME= + REMOTE_HOST=hdptemu02.internal.unicreditgroup.eu + REMOTE_USERNAME=tud2q799 + ;; + dev) + REMOTE_HOST=server07.cluster01.atscom.it + REMOTE_USERNAME=alessandro.marino ;; *) echo "Unknown environment! Please, select from: stage, test!" @@ -19,8 +23,8 @@ case $1 in esac ### TEST PARAMS -REMOTE_ROOT_DIR=/hdp_spool/$REMOTE_USERNAME/dataquality-${2} - +#REMOTE_ROOT_DIR=/hdp_spool/$REMOTE_USERNAME/dq-${2}-test +REMOTE_ROOT_DIR=/home/$REMOTE_USERNAME echo 'DELETING PRESENT FILE' rm -f ./dq-core/target/scala-2.10/*.jar 2> /dev/null rm -f ./dq-core/target/universal/* 2> /dev/null @@ -30,6 +34,16 @@ echo 'BUILDING ASSEMBLY...' sbt -Denv=$1 -Dintegration=$2 "project core" universal:packageBin echo 'DONE!' +#echo 'GENERATING GIT VERSION...' +#rm -f ./dq-core/target/scala-2.10/git_version.info 2> /dev/null +#printf "commit: " > ./dq-core/target/scala-2.10/git_version.info +#git rev-parse HEAD >> ./dq-core/target/scala-2.10/git_version.info +#printf "descr: " >> ./dq-core/target/scala-2.10/git_version.info +#git describe --long >> ./dq-core/target/scala-2.10/git_version.info +#printf "status: " >> ./dq-core/target/scala-2.10/git_version.info +#git status >> ./dq-core/target/scala-2.10/git_version.info +#echo 'DONE!' + echo 'UPLOADING FILES...' scp dq-core/target/universal/*.zip $REMOTE_USERNAME@$REMOTE_HOST:$REMOTE_ROOT_DIR diff --git a/dq-core/src/main/resources/application.conf b/dq-core/src/main/resources/application.conf deleted file mode 100644 index fb51b48..0000000 --- a/dq-core/src/main/resources/application.conf +++ /dev/null @@ -1,44 +0,0 @@ -dataquality { - appDirectory:"" - - appName: "AgileLab Data Quality" - hadoopConfDir: "" - hiveDir: "" //${HIVE_PATH} - hbaseHost: "" //${HBASE_HOST} - - errorDumpSize: 1000 - errorFolderPath: "./Agile.DataQuality/side-code/dump" - - // Configuration - vsDumpConfig: { - fileFormat: "csv" - path: "./Agile.DataQuality/side-code/dump/virtual" - delimiter: "," - } - - // Result storage configuration - // Supported types: "DB" - // "DB" subtypes: "SQLITE","ORACLE - storage:{ - type: "DB" - config: { - host: "localhost:5432/dataquality?user=postgres" - subtype: "POSTGRES" - } - } - - // Check failure alert mailer configuration - mailing { - // "external" - external SMTP server - // "internal" - internal SMTP thru bash script (check universal/bin/sendMail.sh for extra configuration) - mode: "internal" -// config: { -// address: "test.testovic@gmail.com" -// hostname: "smtp.gmail.com" -// username: "test.testovic" -// password: "password123" -// smtpPort: 465 -// sslOnConnect: true -// } - } -} \ No newline at end of file diff --git a/dq-core/src/main/resources/conf/dev.conf b/dq-core/src/main/resources/conf/dev.conf new file mode 100644 index 0000000..b21ec93 --- /dev/null +++ b/dq-core/src/main/resources/conf/dev.conf @@ -0,0 +1,64 @@ +data_quality { + + application_name: "local" + + hive_warehouse_path: "" + hbase_host: "" + + tmp_files_management: { + local_fs_path: "/home/emakhov/IdeaProjects/Agile.DataQuality/tmp/fs" + hdfs_path: "/home/emakhov/IdeaProjects/Agile.DataQuality/tmp/hdfs" + } + + metric_error_management: { + dump_directory_path: "/home/emakhov/IdeaProjects/Agile.DataQuality/tmp/errors" + dump_size: 1000 // max 
number of collected errors for 1 metric for 1 partition + file_config: { + format: "csv" + delimiter: "," + quote: "\"" + escape: "\\" + quote_mode: "ALL" + } + } + + virtual_sources_management: { + dump_directory_path: "/home/emakhov/IdeaProjects/Agile.DataQuality/tmp/virtual" + file_format: "csv" + delimiter: "," + } + + // Result storage configuration + // Supported types: "DB", "NONE" + // Use "" to turn off storage feature + // "DB" subtypes: "SQLITE", "POSTGRES", "ORACLE + storage:{ + type: "NONE" + config: { + subtype: "POSTGRES" + host: "localhost:5433/dataquality" + user: "postgres" + password: "postgres" + schema: "dev" + } + } + + // Check failure alert mailer configuration + mailing { + // "external" - to use external SMTP server + // "internal" - to use internal SMTP thru bash script (check universal/bin/sendMail.sh for extra configuration) + // "" - to turn off mailing + mode: "internal" + mail_script_path: "" + // config: { + // address: "test.testovic@gmail.com" + // hostname: "smtp.gmail.com" + // username: "test.testovic" + // password: "password123" + // smtpPort: 465 + // sslOnConnect: true + // } + + notifications: false + } +} diff --git a/dq-core/src/universal/bin/run-default.sh b/dq-core/src/main/resources/integration/dev/bin/run.sh similarity index 69% rename from dq-core/src/universal/bin/run-default.sh rename to dq-core/src/main/resources/integration/dev/bin/run.sh index 33c407d..a0256a1 100755 --- a/dq-core/src/universal/bin/run-default.sh +++ b/dq-core/src/main/resources/integration/dev/bin/run.sh @@ -1,44 +1,45 @@ #!/bin/bash + +# Utility functions displayUsageAndExit() { echo -e "\nUsage:\n\t$0 -r YYYY-MM-DD -c configpath [-d]\n" exit 1 } +# ----- + +export PROJECT_NAME="DEV" +REMOTE_USERNAME=$(whoami) +kinit -kt ~/${REMOTE_USERNAME}.keytab ${REMOTE_USERNAME} HOME_DIR="$( cd ../.. && pwd )" export SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/" && pwd )" export APP_DIR=${SCRIPT_DIR} - export LOG_DIRECTORY="${HOME_DIR}/logs/" -export SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/" && pwd )" source ${SCRIPT_DIR}/global-parameters.sh source ${SCRIPT_DIR}/functions.sh +FAILED_LIST_FILE=$SCRIPT_DIR"/failed.txt" + ###### CONFIG GENERATION PARAMETERS q REMOTE_USERNAME=$(whoami) -# todo: Add input dir path -DATA_DIR= +#export INPUT_DIR=${DATA_DIR} -export INPUT_DIR=${DATA_DIR} - -# todo: Add output dir path -export OUTPUT_BASE= +export OUTPUT_BASE=/user/$REMOTE_USERNAME/${PROJECT_NAME}-DQ/OUTPUT ######DATA QUALITY PARAMETERS -export HIVE_PATH=user/hive/warehouse/ +#export SQLITE_PATH=${SCRIPT_DIR}/../local-db/"dataquality.db" +export ERROR_DUMP_SIZE=200000 + +export HIVE_PATH="user/hive/warehouse/" START_TIME=$(date +"%d-%m-%Y %T") start_time_seconds=$(date +%s) export REFDATE=$(date +"%Y-%m-%d") -export PROJECT_NAME="Agile Lab DQ" -export REFERENCE_MONTH=$(date +"%m%Y") - -# todo: By default script will run every configuration stored in "current" directory inside "conf" -FOLDER_NAME="current" -arr=$(ls ${SCRIPT_DIR}/../conf/$FOLDER_NAME/*.conf) +arr=$(find ${SCRIPT_DIR}/../conf/testrun -name *.conf) FAILED_LIST=() echo "Loading confs from $FOLDER_NAME folder..." @@ -58,7 +59,7 @@ for i in $arr ; do echo "Current timestamp is "$TIMESTAMP_CREATION echo "Submitting DATA QUALITY Spark job with config: ${CONFIG_FILE_NAME} - ${REFDATE}" - bash ${SCRIPT_DIR}/submit.sh -c ${CONFIG_FILE} -d ${REFDATE} 2>&1 >> ${LOG_FILE} + bash ${SCRIPT_DIR}/submit.sh -a ${APP_CONFIG} -c ${CONFIG_FILE} -d ${REFDATE} 2>&1 >> ${LOG_FILE} if [ $? -ne 0 ]; then echo "Job finished. 
Status: FAILED" FAILED_LIST+=($CONFIG_FILE_NAME) @@ -79,9 +80,20 @@ echo "Total amount of configs: $counter. Succeeded: $succeeded, Failed: ${#FAILE printf '%s\n' "${FAILED_LIST[@]}" -if [ ${#FAILED_LIST[@]} > 0 ]; then +if [ $succeeded -eq 0 ] + then echo "exit code: 1" exit 1 +elif [ ${#FAILED_LIST[@]} -ne 0 ] + then + echo "exit code: 2" + + rm $FAILED_LIST_FILE + touch $FAILED_LIST_FILE + + printf "%s\n" "${FAILED_LIST[@]}" > $FAILED_LIST_FILE + + exit 2 else echo "exit code: 0" exit 0 diff --git a/dq-core/src/main/resources/integration/dev/dev.sql b/dq-core/src/main/resources/integration/dev/dev.sql new file mode 100644 index 0000000..9eb3417 --- /dev/null +++ b/dq-core/src/main/resources/integration/dev/dev.sql @@ -0,0 +1,189 @@ +DROP SCHEMA IF EXISTS dev; +CREATE SCHEMA dev; + +DROP TABLE IF EXISTS dev.results_metric_columnar; +CREATE TABLE dev.results_metric_columnar ( + "metric_id" TEXT NOT NULL, + "source_date" TEXT NOT NULL, + "name" TEXT NOT NULL, + "source_id" TEXT NOT NULL, + "column_names" TEXT[] NOT NULL, + "params" TEXT, + "result" TEXT NOT NULL, + "additional_result" TEXT, + UNIQUE(metric_id, source_date) +); + +DROP TABLE IF EXISTS dev.results_metric_file; +CREATE TABLE dev.results_metric_file ( + "metric_id" TEXT NOT NULL, + "source_date" TEXT NOT NULL, + "name" TEXT NOT NULL, + "source_id" TEXT NOT NULL, + "result" TEXT NOT NULL, + "additional_result" TEXT, + UNIQUE(metric_id, source_date) +); + +DROP TABLE IF EXISTS dev."results_metric_composed"; +CREATE TABLE dev."results_metric_composed" ( + "metric_id" TEXT NOT NULL, + "source_date" TEXT NOT NULL, + "name" TEXT NOT NULL, + "source_id" TEXT NOT NULL, + "formula" TEXT NOT NULL, + "result" TEXT NOT NULL, + "additional_result" TEXT, + UNIQUE(metric_id, source_date) +); + +DROP TABLE IF EXISTS dev."results_check"; +CREATE TABLE dev."results_check" ( + "check_id" TEXT NOT NULL, + "check_name" TEXT NOT NULL, + "description" TEXT, + "checked_file" TEXT NOT NULL, + "base_metric" TEXT NOT NULL, + "compared_metric" TEXT, + "compared_threshold" TEXT, + "status" TEXT NOT NULL, + "message" TEXT, + "exec_date" TEXT NOT NULL, + UNIQUE(check_id, exec_date, check_name) +); + + + +CREATE OR REPLACE FUNCTION dev_upsert_colmet() + RETURNS trigger AS +$dev_upsert_colmet$ +declare +existing record; +begin + if (select EXISTS (SELECT 1 FROM dev.results_metric_columnar WHERE + metric_id = NEW.metric_id AND + source_date = NEW.source_date + )) then + + UPDATE dev.results_metric_columnar SET + name = NEW.name, + column_names = NEW.column_names, + params = NEW.params, + result = NEW.result, + source_id = NEW.source_id, + additional_result = NEW.additional_result + WHERE metric_id = NEW.metric_id AND source_date = NEW.source_date; + + return null; + end if; + + return new; +end +$dev_upsert_colmet$ +LANGUAGE plpgsql; + +create trigger dev_column_metrics_insert +before insert + on dev.results_metric_columnar +for each row + execute procedure dev_upsert_colmet(); + + + +CREATE OR REPLACE FUNCTION dev_upsert_filemet() +RETURNS trigger AS +$dev_upsert_filemet$ +declare +existing record; +begin +if (select EXISTS (SELECT 1 FROM dev.results_metric_file WHERE metric_id = NEW.metric_id AND source_date = NEW.source_date)) then + +UPDATE dev.results_metric_file SET + name = NEW.name, + source_id = NEW.source_id, + result = NEW.result, + additional_result = NEW.additional_result +WHERE metric_id = NEW.metric_id AND source_date = NEW.source_date; + +return null; +end if; + +return new; +end +$dev_upsert_filemet$ +LANGUAGE plpgsql; + +create trigger 
dev_file_metrics_insert +before insert + on dev.results_metric_file +for each row + execute procedure dev_upsert_filemet(); + + + +CREATE OR REPLACE FUNCTION dev_upsert_compmet() +RETURNS trigger AS +$dev_upsert_compmet$ +declare +existing record; +begin +if (select EXISTS (SELECT 1 FROM results_metric_composed WHERE metric_id = NEW.metric_id AND source_date = NEW.source_date)) then + +UPDATE results_metric_composed SET + name = NEW.name, + source_id = NEW.source_id, + formula = NEW.formula, + result = NEW.result, + additional_result = NEW.additional_result +WHERE metric_id = NEW.metric_id AND source_date = NEW.source_date; + +return null; +end if; + +return new; +end +$dev_upsert_compmet$ +LANGUAGE plpgsql; + +create trigger dev_composed_metrics_insert +before insert + on dev.results_metric_composed +for each row + execute procedure dev_upsert_compmet(); + + + +CREATE OR REPLACE FUNCTION dev_upsert_check() +RETURNS trigger AS +$dev_upsert_check$ +declare +existing record; +begin +if (select EXISTS (SELECT 1 FROM dev.results_check +WHERE check_id = NEW.check_id AND exec_date = NEW.exec_date AND check_name = NEW.check_name)) then + +UPDATE dev.results_check SET + description = NEW.description, + checked_file = NEW.checked_file, + base_metric = NEW.base_metric, + compared_metric = NEW.compared_metric, + compared_threshold = NEW.compared_threshold, + status = NEW.status, + message = NEW.message +WHERE check_id = NEW.check_id AND + exec_date = NEW.exec_date AND + check_name = NEW.check_name; + +return null; +end if; + +return new; +end +$dev_upsert_check$ +LANGUAGE plpgsql; + +create trigger dev_checks_insert +before insert + on dev.results_check +for each row + execute procedure dev_upsert_check(); \ No newline at end of file diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/apps/DQMasterBatch.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/apps/DQMasterBatch.scala index 60d1a03..7d78fe4 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/apps/DQMasterBatch.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/apps/DQMasterBatch.scala @@ -2,55 +2,59 @@ package it.agilelab.bigdata.DataQuality.apps import java.sql.Connection -import it.agilelab.bigdata.DataQuality.checks.CheckResult +import it.agilelab.bigdata.DataQuality.checks.{CheckResult, CheckStatusEnum, LoadCheckResult} +import it.agilelab.bigdata.DataQuality.checks.LoadChecks.{ExeEnum, LoadCheck} import it.agilelab.bigdata.DataQuality.configs.ConfigReader import it.agilelab.bigdata.DataQuality.exceptions.IllegalParameterException import it.agilelab.bigdata.DataQuality.metrics.{ComposedMetricCalculator, _} +import it.agilelab.bigdata.DataQuality.sources.SourceTypes.SourceType import it.agilelab.bigdata.DataQuality.sources.VirtualSourceProcessor.getActualSources import it.agilelab.bigdata.DataQuality.sources._ import it.agilelab.bigdata.DataQuality.targets.HdfsTargetConfig import it.agilelab.bigdata.DataQuality.utils._ import it.agilelab.bigdata.DataQuality.utils.io.db.readers.HBaseLoader -import it.agilelab.bigdata.DataQuality.utils.io.{HdfsReader, HdfsWriter, HiveReader, LocalDBManager} +import it.agilelab.bigdata.DataQuality.utils.io.{HdfsReader, HdfsWriter, HistoryDBManager, HiveReader} +import it.agilelab.bigdata.DataQuality.utils.mailing.{NotificationManager, Summary} import org.apache.hadoop.fs.FileSystem import org.apache.spark.SparkContext import org.apache.spark.sql.hive.HiveContext -import org.apache.spark.sql.{DataFrame, Row, SQLContext} +import 
org.apache.spark.sql.{DataFrame, SQLContext} import scala.util.{Failure, Success, Try} -/** - * Created by Gianvito Siciliano on 30/01/2017. - */ + object DQMasterBatch extends DQMainClass with DQSparkContext with Logging { override protected def body()(implicit fs: FileSystem, sparkContext: SparkContext, sqlContext: SQLContext, - sqlWriter: LocalDBManager, + sqlWriter: HistoryDBManager, settings: DQSettings): Boolean = { /** - * PARSE CONFIGURATION FILE - * + * Configuration file parsing */ + log.info(s"[STAGE 1/...] Parsing configuration file...") + log.info("Path: " + settings.configFilePath) val configuration = new ConfigReader(settings.configFilePath) - log.info("\n EXTERNAL DATABASES:") - log.info(configuration.dbConfigMap.mkString(" \n ")) - log.info("\n SOURCES:") - log.info(configuration.sourcesConfigMap.mkString(" \n ")) - log.info("\n METRICS:") - log.info(configuration.metricsBySourceList.mkString(" \n ")) - log.info("\n CHECKS:") - log.info(configuration.metricsByChecksList.mkString(" \n ")) - log.info("\n TARGETS:") - log.info(configuration.targetsConfigMap.mkString(" \n ")) + log.info(s"External database list (size ${configuration.dbConfigMap.size}):") + configuration.dbConfigMap.par.foreach(x => log.debug(s" ${x._1} -> ${x._2}")) + log.info(s"Source list (size ${configuration.sourcesConfigMap.size}):") + configuration.sourcesConfigMap.par.foreach(x => log.debug(s" ${x._1} -> ${x._2}")) + log.info(s"Virtual source list (size ${configuration.virtualSourcesConfigMap.size}):") + configuration.virtualSourcesConfigMap.par.foreach(x => log.debug(s" ${x._1} -> ${x._2}")) + log.info(s"Metrics list (size ${configuration.metricsBySourceList.size}):") + configuration.metricsBySourceList.par.foreach(x => log.debug(s" ${x._1} -> ${x._2}")) + log.info(s"Checks list (size ${configuration.metricsByChecksList.size}):") + configuration.metricsByChecksList.par.foreach(x => log.debug(s" ${x._2} -> ${x._1}")) + log.info(s"Targets list (size ${configuration.targetsConfigMap.size}):") + configuration.targetsConfigMap.par.foreach(x => log.debug(s" ${x._1} -> ${x._2}")) /** - * LOAD SOURCES + * Database connection management */ - log.info(s"\n# Connecting to external databases...") + log.info(s"[STAGE 2/...] Connecting to external databases...") val dbConnections: Map[String, Connection] = configuration.dbConfigMap.flatMap({ db => log.info("Trying to connect to " + db._1) @@ -62,43 +66,47 @@ object DQMasterBatch extends DQMainClass with DQSparkContext with Logging { res }) - log.info(s"\n# Loading data...") - val sources: Seq[Source] = configuration.sourcesConfigMap + /** + * Source loading + */ + log.info(s"[STAGE 3/...] 
Loading data...") + val (sources: Seq[Source], lcResults: Seq[LoadCheckResult]) = configuration.sourcesConfigMap .map { case (source, conf) => - // val keyFieldOpt=if(conf.keyfields) Some(conf.keyfields) else None conf match { case hdfsFile: HdfsFile => - HdfsReader + val loadChecks: Seq[LoadCheck] = configuration.loadChecksMap.getOrElse(source, Seq.empty[LoadCheck]) + + val preLoadRes: Seq[LoadCheckResult] = + loadChecks.filter(_.exeType == ExeEnum.pre).map(x => x.run(Some(hdfsFile))(fs, sqlContext, settings)) + + val src: Seq[Source] = HdfsReader .load(hdfsFile, settings.ref_date) .map(df => Source(source, hdfsFile.date, df, conf.keyfields)) + + val postLoadRes: Seq[LoadCheckResult] = loadChecks + .filter(_.exeType == ExeEnum.post) + .map(x => x.run(None, Try(src.head.df).toOption)(fs, sqlContext, settings)) + + (src, preLoadRes ++ postLoadRes) case hiveTableConfig: HiveTableConfig => sqlContext match { case hc: HiveContext => - HiveReader.loadHiveTable(hiveTableConfig)(hc).map(df => - Source(source, settings.refDateString, df, conf.keyfields)) + val src: Seq[Source] = HiveReader + .loadHiveTable(hiveTableConfig)(hc) + .map(df => Source(source, settings.refDateString, df, conf.keyfields)) + (src, Seq.empty[LoadCheckResult]) case _ => throw IllegalParameterException("Hive context wasn't set properly. Check your application.conf") } case hbConf: HBaseSrcConfig => - Seq( - Source(source, - settings.refDateString, - HBaseLoader.loadToDF(hbConf), - conf.keyfields)) - case outputFile: OutputFile => - val output = HdfsReader - .loadOutput(outputFile) - .map(t => Source(t._1, outputFile.date, t._2, conf.keyfields)) - output.foreach(log warn _) - output - .find(_.id == "DWH_SWIFT_ROWS") - .foreach(_.df.collect().foreach(log warn _)) - output + ( + Seq(Source(source, settings.refDateString, HBaseLoader.loadToDF(hbConf), conf.keyfields)), + Seq.empty[LoadCheckResult] + ) case tableConf: TableConfig => val databaseConfig = configuration.dbConfigMap(tableConf.dbId) - log.info( - s"Loading table ${tableConf.table} from ${tableConf.dbId}") + log.info(s"Loading table ${tableConf.table} from ${tableConf.dbId}") val df: DataFrame = (tableConf.password, tableConf.password) match { case (Some(u), Some(p)) => @@ -106,61 +114,64 @@ object DQMasterBatch extends DQMainClass with DQSparkContext with Logging { case _ => databaseConfig.loadData(tableConf.table) } - Seq(Source(source, settings.refDateString, df, conf.keyfields)) + (Seq(Source(source, settings.refDateString, df, conf.keyfields)), Seq.empty[LoadCheckResult]) case x => throw IllegalParameterException(x.getType.toString) } } - .toSeq - .flatten + .foldLeft((Seq.empty[Source], Seq.empty[LoadCheckResult]))((x, y) => (x._1 ++ y._1, x._2 ++ y._2)) + + sqlWriter.saveResultsToDB(lcResults.map(_.simplify()), "results_check_load") + if (sources.length != configuration.sourcesConfigMap.size) { + val failSrc: Seq[String] = configuration.sourcesConfigMap + .filterNot(x => sources.map(_.id).toSet.contains(x._1)).map(x =>s"- ${x._1}: ${x._2.getType}").toSeq + val additional = failSrc.mkString(s"Failed sources: ${failSrc.size}\n","\n","") + + log.error(additional) + + val summary = new Summary(configuration, None, Some(lcResults)) + log.debug(summary.toMailString() + "\n" + additional) + + NotificationManager.sendSummary(summary, Some(additional)) + NotificationManager.saveResultsLocally(summary, None, Some(lcResults)) + + return false + } val sourceMap: Map[String, Source] = sources.map(x => (x.id, x)).toMap - val vsToSave: Set[String] = - 
configuration.virtualSourcesConfigMap.filter(p => p._2.isSave).keys.toSet - val virtualSources: Seq[Source] = getActualSources( - configuration.virtualSourcesConfigMap, - sourceMap).values.toSeq + val vsToSave: Set[String] = configuration.virtualSourcesConfigMap.filter(p => p._2.isSave).keys.toSet + val virtualSources: Seq[Source] = getActualSources(configuration.virtualSourcesConfigMap, sourceMap).values.toSeq + + log.info("Saving required sources...") + virtualSources.foreach(source => { + (source, settings.vsDumpConfig) match { + case (src, Some(config)) if vsToSave.contains(src.id) && src.keyfields.nonEmpty => + val dataframe = src.df.select(src.keyfields.head, src.keyfields.tail: _*) + HdfsWriter.saveVirtualSource(dataframe, config.copy(fileName = src.id), settings.refDateString) + log.info(s"Source ${src.id} was saved with key fields.") + case (src, Some(config)) if vsToSave.contains(src.id) => + HdfsWriter.saveVirtualSource(source.df, config.copy(fileName = src.id), settings.refDateString) + log.info(s"Virtual source ${src.id} was saved.") + case (src, _) if vsToSave.contains(src.id) => + log.info(s"Virtual source ${src.id} will not be saved.") + case _ => + } + }) /** - * CALCULATE METRICS + * Metrics calculation */ - log.info(s"\n# Starting metrics processing...") - val allMetrics - : Seq[(String, - Map[Seq[String], Map[ColumnMetric, (Double, Option[String])]], - Map[FileMetric, (Double, Option[String])])] = + log.info(s"[STAGE 4/...] Calculating metrics...") + val allMetrics: Seq[(String, + Map[Seq[String], Map[ColumnMetric, (Double, Option[String])]], + Map[FileMetric, (Double, Option[String])])] = virtualSources.map(source => { - - (source, settings.vsDumpConfig) match { - case (src, Some(config)) - if vsToSave.contains(src.id) && src.keyfields.nonEmpty => - val dataframe = - src.df.select(src.keyfields.head, src.keyfields.tail: _*) - HdfsWriter.saveVirtualSource(dataframe, - config.copy(fileName = src.id), - settings.refDateString) - log.info(s"Source ${src.id} was saved with key fields.") - case (src, Some(config)) if vsToSave.contains(src.id) => - HdfsWriter.saveVirtualSource(source.df, - config.copy(fileName = src.id), - settings.refDateString) - log.info(s"Source ${src.id} was saved.") - case (src, _) if vsToSave.contains(src.id) => - log.info(s"Source ${src.id} will not be saved.") - case _ => - } - log.info(s"Calculating metrics for ${source.id}") //select all file metrics to do on this source val fileMetrics: Seq[FileMetric] = configuration.metricsBySourceMap.getOrElse(source.id, Nil).collect { case metric: FileMetric => - FileMetric(metric.id, - metric.name, - metric.description, - metric.source, - source.date, - metric.paramMap) + FileMetric(metric.id, metric.name, metric.description, metric.source, source.date, metric.paramMap) } log.info(s"Found file metrics: ${fileMetrics.size}") @@ -174,7 +185,8 @@ object DQMasterBatch extends DQMainClass with DQSparkContext with Logging { metric.source, source.date, metric.columns, - metric.paramMap) + metric.paramMap, + metric.positions) } log.info(s"Found column metrics: ${colMetrics.size}") @@ -185,13 +197,9 @@ object DQMasterBatch extends DQMainClass with DQSparkContext with Logging { Map.empty[FileMetric, (Double, Option[String])] empty) } else { //compute all metrics - val results - : (Map[Seq[String], Map[ColumnMetric, (Double, Option[String])]], - Map[FileMetric, (Double, Option[String])]) = - MetricProcessor.processAllMetrics(source.df, - colMetrics, - fileMetrics, - source.keyfields) + val results: (Map[Seq[String], 
Map[ColumnMetric, (Double, Option[String])]], + Map[FileMetric, (Double, Option[String])]) = + MetricProcessor.processAllMetrics(source.df, colMetrics, fileMetrics, source.keyfields) source.df.unpersist() @@ -243,27 +251,25 @@ object DQMasterBatch extends DQMainClass with DQSparkContext with Logging { /** * CALCULATING COMPOSED METRICS */ - log.info(s"\n# Calculating composed metrics...") - + log.info(s"[STAGE 5/...] Calculating composed metrics...") // todo: It's possible to calculate composed using $primitiveMetricResults as zero value to avoid extra merges val composedMetricResults: Seq[ComposedMetricResult] = configuration.composedMetrics - .foldLeft[Seq[ComposedMetricResult]](Seq.empty[ComposedMetricResult])( - (accum, curr) => { - log.info(s"Calculating ${curr.id} with formula ${curr.formula}") - val composedMetricCalculator = - new ComposedMetricCalculator(primitiveMetricResults ++ accum) - val currRes: ComposedMetricResult = - composedMetricCalculator.run(curr) - accum ++ Seq(currRes) - }) - - val allMetricResults - : Seq[MetricResult] = primitiveMetricResults ++ composedMetricResults + .foldLeft[Seq[ComposedMetricResult]](Seq.empty[ComposedMetricResult])((accum, curr) => { + log.info(s"Calculating ${curr.id} with formula ${curr.formula}") + val composedMetricCalculator = + new ComposedMetricCalculator(primitiveMetricResults ++ accum) + val currRes: ComposedMetricResult = + composedMetricCalculator.run(curr) + accum ++ Seq(currRes) + }) + + val allMetricResults: Seq[MetricResult] = primitiveMetricResults ++ composedMetricResults /** * DEFINE and PERFORM CHECKS */ + log.info(s"[STAGE 6/...] Performing checks...") val buildChecks = configuration.metricsByCheckMap.map { case (check, metricList) => val resList = metricList.flatMap { mId => @@ -273,9 +279,8 @@ object DQMasterBatch extends DQMainClass with DQSparkContext with Logging { check.addMetricList(resList) }.toSeq - val createdChecks = buildChecks.map(cmr => - s"${cmr.id}, ${cmr.getMetrics} - ${cmr.getDescription}") - log.info(s"\n# Checks created... ${createdChecks.size}") + val createdChecks = buildChecks.map(cmr => s"${cmr.id}, ${cmr.getMetrics} - ${cmr.getDescription}") + log.info(s" * Checks created: ${createdChecks.size}") createdChecks.foreach(str => log.info(str)) val checkResults: Seq[CheckResult] = buildChecks @@ -284,21 +289,17 @@ object DQMasterBatch extends DQMainClass with DQSparkContext with Logging { e.run } match { case Success(res) => Some(res) - case Failure(e) => { - log.error(e.getMessage) - None - } + case Failure(e) => None } } - log.info(s"\n# Check Results...") - checkResults.foreach(cr => log.info(cr.message)) + log.info(s"Check Results:") + checkResults.foreach(cr => log.info(" " + cr.message)) /** * PERFORM SQL CHECKS */ - log.info(s"\n# SQL checks processing...") - + log.info(s"[STAGE 7/...] Performing SQL checks...") val sqlCheckResults: List[CheckResult] = configuration.sqlChecksList.map(check => { log.info("Calculating " + check.id + " " + check.description) @@ -308,51 +309,39 @@ object DQMasterBatch extends DQMainClass with DQSparkContext with Logging { // Closing db connections dbConnections.values.foreach(_.close()) - /** - * SAVE RESULTS - */ val finalCheckResults: Seq[CheckResult] = checkResults ++ sqlCheckResults - log.info(s"\n# Saving results to the database...") + log.info(s"[STAGE 8/...] 
Processing results...") + log.info(s"Saving results to the database...") log.info(s"With reference date: ${settings.refDateString}") sqlWriter.saveResultsToDB(colMetricResultsList, "results_metric_columnar") sqlWriter.saveResultsToDB(fileMetricResultsList, "results_metric_file") sqlWriter.saveResultsToDB(composedMetricResults, "results_metric_composed") sqlWriter.saveResultsToDB(finalCheckResults, "results_check") - val targetResultMap - : Map[String, Seq[Product with Serializable with TypedResult]] = Map( - "COLUMNAR-METRICS" -> colMetricResultsList, - "FILE-METRICS" -> fileMetricResultsList, - "CHECKS" -> finalCheckResults, - "COMPOSED-METRICS" -> composedMetricResults + val targetResultMap: Map[String, Seq[Product with Serializable with TypedResult]] = Map( + "FILE_METRICS" -> fileMetricResultsList, + "COLUMN_METRICS" -> colMetricResultsList, + "COMPOSED_METRICS" -> composedMetricResults, + "CHECKS" -> finalCheckResults, + "LOAD_CHECKS" -> lcResults ) - log.info(s"\n# Targets processing...") + log. info("Saving targets...") configuration.targetsConfigMap.foreach(tar => tar._1 match { case "SYSTEM" => - tar._2.foreach(conf => - HdfsWriter.processSystemTarget(conf, finalCheckResults)) + tar._2.foreach(conf => HdfsWriter.processSystemTarget(conf, finalCheckResults)) case _ => - tar._2.foreach( - conf => - HdfsWriter.save(conf.asInstanceOf[HdfsTargetConfig], - targetResultMap(tar._1))) + tar._2.foreach(conf => HdfsWriter.save(conf.asInstanceOf[HdfsTargetConfig], targetResultMap(tar._1))) }) - log.info(s"\n# Starting postprocessing...") - + log.info(s"[STAGE 9/...] Postprocessing...") val vsHdfs: Set[HdfsFile] = settings.vsDumpConfig match { case Some(conf) => vsToSave.map(vs => { val fileName = conf.path + "/" + vs + "." + conf.fileFormat //-${targetConfig.subType} - HdfsFile(vs, - fileName, - conf.fileFormat, - conf.delimiter, - true, - settings.refDateString) + HdfsFile.apply(vs, fileName, conf.fileFormat, true, settings.refDateString) }) case _ => Set.empty[HdfsFile] } @@ -360,6 +349,12 @@ object DQMasterBatch extends DQMainClass with DQSparkContext with Logging { configuration.getPostprocessors.foldLeft(vsHdfs)((files, pp) => files.+(pp.process(files, allMetricResults, finalCheckResults))) + log.info(s"[STAGE 10/...] Saving summary files and mailing reports...") + val summary = new Summary(configuration, Some(checkResults), Some(lcResults)) + log.debug(summary.toMailString()) + NotificationManager.saveResultsLocally(summary, Some(checkResults), Some(lcResults)) + NotificationManager.sendSummary(summary) + true } } diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/CheckResult.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/CheckResult.scala index dcbffdd..c7eb349 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/CheckResult.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/CheckResult.scala @@ -1,7 +1,9 @@ package it.agilelab.bigdata.DataQuality.checks +import it.agilelab.bigdata.DataQuality.checks.CheckStatusEnum.CheckResultStatus import it.agilelab.bigdata.DataQuality.metrics.DQResultTypes.DQResultType import it.agilelab.bigdata.DataQuality.metrics.{DQResultTypes, TypedResult} +import it.agilelab.bigdata.DataQuality.utils.DQSettings /** * Created by Gianvito Siciliano on 29/12/16. 
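The toCsvString helpers added in the hunk below all follow the same pattern: pick the fields to export and join them with the configured temporary-file delimiter, falling back to a comma. A minimal, standalone sketch of that pattern (SimpleResult and its fields are illustrative stand-ins, not the framework's own types):

object CsvLineSketch {
  // Stand-in for a result row; the real CheckResult/LoadCheckResult carry more fields.
  final case class SimpleResult(id: String, status: String, execDate: String)

  // Join the exported fields with the configured delimiter, defaulting to ",".
  def toCsvLine(r: SimpleResult, tmpFileDelimiter: Option[String]): String =
    Seq(r.id, r.status, r.execDate).mkString(tmpFileDelimiter.getOrElse(","))

  def main(args: Array[String]): Unit = {
    val row = SimpleResult("row_count_check", "Success", "2019-01-01")
    println(toCsvLine(row, None))      // row_count_check,Success,2019-01-01
    println(toCsvLine(row, Some("|"))) // row_count_check|Success|2019-01-01
  }
}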
@@ -20,5 +22,54 @@ case class CheckResult( message: String, execDate: String ) extends TypedResult { + + def toCsvString()(implicit settings: DQSettings): String = { + Seq( + this.checkId, + this.status.toString, + this.execDate, + this.checkName, + this.baseMetric + ).mkString(settings.tmpFileDelimiter.getOrElse(",")) + } + override def getType: DQResultType = DQResultTypes.check -} \ No newline at end of file +} + +case class LoadCheckResult( + id: String, + src: String, + tipo: String, + expected: String, + date: String, + status: CheckResultStatus, + message: String = "" +) extends TypedResult { + override def getType: DQResultType = DQResultTypes.load + + def simplify(): LoadCheckResultSimple = LoadCheckResultSimple( + this.id, + this.src, + this.tipo, + this.expected, + this.date, + this.status.toString, + this.message + ) + + def toCsvString()(implicit settings: DQSettings): String = { + Seq(this.id, this.status.toString, this.date, this.tipo, this.src) + .mkString(settings.tmpFileDelimiter.getOrElse(",")) + } +} + +// TODO: Find a smarter way to solve issue with saving +case class LoadCheckResultSimple( + id: String, + src: String, + tipo: String, + expected: String, + date: String, + status: String, + message: String +) diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/CheckStatus.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/CheckStatus.scala index 662e600..c5129f4 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/CheckStatus.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/CheckStatus.scala @@ -5,6 +5,12 @@ package it.agilelab.bigdata.DataQuality.checks * * Representation of check statuses */ + +object CheckStatusEnum extends Enumeration { + type CheckResultStatus = Value + val Success, Failure, Error = Value +} + sealed trait CheckStatus { val stringValue: String } diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/LoadChecks/LoadCheck.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/LoadChecks/LoadCheck.scala new file mode 100644 index 0000000..d5ea500 --- /dev/null +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/LoadChecks/LoadCheck.scala @@ -0,0 +1,220 @@ +package it.agilelab.bigdata.DataQuality.checks.LoadChecks +import it.agilelab.bigdata.DataQuality.checks.{CheckStatusEnum, LoadCheckResult} +import org.apache.hadoop.fs.FileSystem +import it.agilelab.bigdata.DataQuality.checks.LoadChecks.ExeEnum.LCType +import it.agilelab.bigdata.DataQuality.sources.{HdfsFile, SourceConfig, SourceTypes} +import it.agilelab.bigdata.DataQuality.utils.{DQSettings, Logging} +import org.apache.spark.sql.{DataFrame, SQLContext} + +import scala.util.Try + +/** + * Created by Egor Makhov on 07/05/19. 
+ */
+object ExeEnum extends Enumeration {
+  type LCType = Value
+  val pre, post = Value
+}
+
+object LoadCheckEnum extends Enumeration {
+  // pre
+  val existLC = LCDefinition("EXIST", classOf[ExistLoadCheck])
+  val encodingLC = LCDefinition("ENCODING", classOf[EncodingLoadCheck])
+  val fileTypeLC = LCDefinition("FILE_TYPE", classOf[FileTypeLoadCheck])
+  // post
+  val columnNumLC = LCDefinition("EXACT_COLUMN_NUM", classOf[ColumnLoadCheck])
+  val minColumnNumLC = LCDefinition("MIN_COLUMN_NUM", classOf[MinColumnLoadCheck])
+
+  def getCheckClass(name: String): Class[_ <: LoadCheck with Product with Serializable] =
+    convert(super.withName(name)).check
+  def names: Set[String] = values.map(_.toString)
+  def contains(s: String): Boolean = names.contains(s)
+
+  protected case class LCDefinition(name: String, check: Class[_ <: LoadCheck with Product with Serializable])
+    extends super.Val() {
+    override def toString(): String = this.name
+  }
+
+  implicit def convert(value: Value): LCDefinition = value.asInstanceOf[LCDefinition]
+}
+
+abstract class LoadCheck() {
+  def id: String
+  def tipo: String
+  def source: String
+  def option: Any
+
+  val exeType: LCType
+  def run(confOpt: Option[SourceConfig] = None, df: Option[DataFrame] = None)(implicit fs: FileSystem,
+                                                                              sqlc: SQLContext,
+                                                                              settings: DQSettings): LoadCheckResult
+}
+
+/**
+ * Checks if the source is present at the defined path
+ * @param id Check id
+ * @param tipo Check type
+ * @param source Source ID
+ * @param option Expected result
+ */
+case class ExistLoadCheck(id: String, tipo: String, source: String, option: Any) extends LoadCheck with Logging {
+  override val exeType: LCType = ExeEnum.pre
+  override def run(confOpt: Option[SourceConfig] = None,
+                   df: Option[DataFrame] = None)(implicit fs: FileSystem, sqlc: SQLContext, settings: DQSettings): LoadCheckResult = {
+
+    val conf = confOpt match {
+      case Some(x) if x.getType == SourceTypes.hdfs => x.asInstanceOf[HdfsFile]
+      case _ => throw new IllegalArgumentException("Existence load check can be used only on HDFS files.")
+    }
+
+    val expected: Boolean = option.toString.toBoolean
+
+    val (status, msg) = (fs.exists(new org.apache.hadoop.fs.Path(conf.path)), expected) match {
+      case (true, true)   => (CheckStatusEnum.Success, "Source is present in the file system")
+      case (false, false) => (CheckStatusEnum.Success, "Source is not present in the file system")
+      case (false, true)  => (CheckStatusEnum.Failure, s"No files have been found at path: ${conf.path}")
+      case (true, false)  => (CheckStatusEnum.Failure, s"Some files have been found at path: ${conf.path}")
+    }
+
+    LoadCheckResult(id, source, tipo, option.toString, settings.refDateString, status, msg)
+  }
+}
+
+/**
+ * Checks if the source is loadable with the given encoding
+ * @param id Check id
+ * @param tipo Check type
+ * @param source Source ID
+ * @param option Encoding name (please, use encoding names defined in Spark)
+ */
+case class EncodingLoadCheck(id: String, tipo: String, source: String, option: Any) extends LoadCheck with Logging {
+  override val exeType: LCType = ExeEnum.pre
+
+  override def run(confOpt: Option[SourceConfig] = None,
+                   df: Option[DataFrame] = None)(implicit fs: FileSystem, sqlc: SQLContext, settings: DQSettings): LoadCheckResult = {
+
+    val conf: HdfsFile = confOpt match {
+      case Some(x) if x.getType == SourceTypes.hdfs => x.asInstanceOf[HdfsFile]
+      case _ => throw new IllegalArgumentException("Encoding load check can be used only on HDFS files.")
+    }
+
+    val dfOpt: Option[DataFrame] = conf.fileType.toLowerCase match {
+      case "avro" =>
+        Try(
+          sqlc.read
+            .format("com.databricks.spark.avro")
+            .option("encoding", option.toString)
+            .load(conf.path)).toOption
+      case "csv" =>
+        Try(
+          sqlc.read
+            .format("com.databricks.spark.csv")
+            .option("header", conf.header.toString)
+            .option("delimiter", conf.delimiter.getOrElse(","))
+            .option("quote", conf.quote.getOrElse("\""))
+            .option("escape", conf.escape.getOrElse("\\"))
+            .option("encoding", option.toString)
+            .load(conf.path)
+        ).toOption
+      case _ => throw new IllegalArgumentException(s"Unsupported file type: ${conf.fileType}.")
+    }
+
+    val (status, msg) = dfOpt match {
+      case Some(_) => (CheckStatusEnum.Success, "")
+      case None    => (CheckStatusEnum.Failure, s"Source can't be loaded with encoding: ${option.toString}")
+    }
+
+    LoadCheckResult(id, source, tipo, option.toString, settings.refDateString, status, msg)
+  }
+}
+
+/**
+ * Checks if the source is loadable in the desired format
+ * @param id Check id
+ * @param tipo Check type
+ * @param source Source ID
+ * @param option File format (csv, avro)
+ */
+case class FileTypeLoadCheck(id: String, tipo: String, source: String, option: Any) extends LoadCheck with Logging {
+  override val exeType: LCType = ExeEnum.pre
+  override def run(confOpt: Option[SourceConfig] = None,
+                   df: Option[DataFrame] = None)(implicit fs: FileSystem, sqlc: SQLContext, settings: DQSettings): LoadCheckResult = {
+
+    val conf: HdfsFile = confOpt match {
+      case Some(x) if x.getType == SourceTypes.hdfs => x.asInstanceOf[HdfsFile]
+      case _ => throw new IllegalArgumentException("File type load check can be used only on HDFS files.")
+    }
+
+    val dfOpt: Option[DataFrame] = option.toString.toLowerCase match {
+      case "avro" =>
+        Try(
+          sqlc.read
+            .format("com.databricks.spark.avro")
+            .load(conf.path)).toOption
+      case "csv" =>
+        Try(
+          sqlc.read
+            .format("com.databricks.spark.csv")
+            .option("header", conf.header.toString)
+            .option("delimiter", conf.delimiter.getOrElse(","))
+            .option("quote", conf.quote.getOrElse("\""))
+            .option("escape", conf.escape.getOrElse("\\"))
+            .load(conf.path)
+        ).toOption
+      case _ => throw new IllegalArgumentException(s"Unsupported file type: $option.")
+    }
+
+    val (status, msg) = dfOpt match {
+      case Some(_) => (CheckStatusEnum.Success, "")
+      case None    => (CheckStatusEnum.Failure, s"Source can't be loaded as ${option.toString.toLowerCase}")
+    }
+
+    LoadCheckResult(id, source, tipo, option.toString, settings.refDateString, status, msg)
+  }
+}
+
+/**
+ * Checks if #columns of the source equals the desired number
+ * @param id Check id
+ * @param tipo Check type
+ * @param source Source ID
+ * @param option Num of columns
+ */
+case class ColumnLoadCheck(id: String, tipo: String, source: String, option: Any) extends LoadCheck with Logging {
+  override val exeType: LCType = ExeEnum.post
+  override def run(confOpt: Option[SourceConfig] = None,
+                   df: Option[DataFrame] = None)(implicit fs: FileSystem, sqlc: SQLContext, settings: DQSettings): LoadCheckResult = {
+
+    val (status, msg): (CheckStatusEnum.Value, String) = df match {
+      case Some(dataframe) =>
+        if (dataframe.columns.length == option.toString.toInt) (CheckStatusEnum.Success, "")
+        else (CheckStatusEnum.Failure, s"Source #columns {${dataframe.columns.length}} is not equal to $option")
+      case None => (CheckStatusEnum.Error, "DataFrame hasn't been found")
+    }
+    LoadCheckResult(id, source, tipo, option.toString, settings.refDateString, status, msg)
+  }
+}
+
+
+/**
+ * Checks if #columns of the source is greater than or equal to the desired
number + * @param id Check id + * @param tipo Check type + * @param source Source ID + * @param option Min num of columns + */ +case class MinColumnLoadCheck(id: String, tipo: String, source: String, option: Any) extends LoadCheck with Logging { + override val exeType: LCType = ExeEnum.post + override def run(confOpt: Option[SourceConfig] = None, + df: Option[DataFrame] = None)(implicit fs: FileSystem, sqlc: SQLContext, settings: DQSettings): LoadCheckResult = { + + val (status, msg): (CheckStatusEnum.Value, String) = df match { + case Some(dataframe) => + if (dataframe.columns.length >= option.toString.toInt) (CheckStatusEnum.Success, s"${dataframe.columns.length} >= ${option.toString.toInt}") + else (CheckStatusEnum.Failure, s"Source #columns {${dataframe.columns.length}} is less than $option") + case None => (CheckStatusEnum.Error, "DataFrame hasn't been found") + } + LoadCheckResult(id, source, tipo, option.toString, settings.refDateString, status, msg) + } +} + diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/TrendChecks/AverageBoundCheck.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/TrendChecks/AverageBoundCheck.scala index 50b1a71..8be2689 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/TrendChecks/AverageBoundCheck.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/TrendChecks/AverageBoundCheck.scala @@ -4,7 +4,7 @@ import it.agilelab.bigdata.DataQuality.checks._ import it.agilelab.bigdata.DataQuality.exceptions.IllegalConstraintResultException import it.agilelab.bigdata.DataQuality.metrics.MetricResult import it.agilelab.bigdata.DataQuality.utils.DQSettings -import it.agilelab.bigdata.DataQuality.utils.io.LocalDBManager +import it.agilelab.bigdata.DataQuality.utils.io.HistoryDBManager /** * Created by Egor Makhov on 19/05/2017. 
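The exeType flag on each load check is what lets the driver run them in two passes: pre checks only need the source configuration and run before anything is read, while post checks need the loaded DataFrame (see the STAGE 3 block of DQMasterBatch earlier in this diff). A minimal, standalone sketch of that split, using simplified stand-in types (Phase, Check, Loaded) rather than the framework's classes:

object LoadCheckPhaseSketch {
  sealed trait Phase
  case object Pre  extends Phase
  case object Post extends Phase

  // A check is just an id, the phase it runs in, and a predicate; "post" checks
  // receive the column count of the loaded data, "pre" checks receive None.
  final case class Check(id: String, phase: Phase, run: Option[Int] => Boolean)
  final case class Loaded(columnCount: Int)

  def runAll(checks: Seq[Check], load: () => Option[Loaded]): Map[String, Boolean] = {
    val (pre, post) = checks.partition(_.phase == Pre)
    val preResults  = pre.map(c => c.id -> c.run(None))
    val data        = load() // the source is loaded only after the pre checks
    val postResults = post.map(c => c.id -> c.run(data.map(_.columnCount)))
    (preResults ++ postResults).toMap
  }

  def main(args: Array[String]): Unit = {
    val checks = Seq(
      Check("exists",       Pre,  _ => true),
      Check("min_column_2", Post, _.exists(_ >= 2))
    )
    println(runAll(checks, () => Some(Loaded(columnCount = 8))))
  }
}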
@@ -35,8 +35,8 @@ case class AverageBoundFullCheck(id: String, threshold: Double, timewindow: Int, startDate: Option[String])( - implicit sqlWriter: LocalDBManager, - settings: DQSettings) + implicit sqlWriter: HistoryDBManager, + settings: DQSettings) extends TrendCheckCore(id, description, metrics, @@ -103,8 +103,8 @@ case class AverageBoundLowerCheck(id: String, threshold: Double, timewindow: Int, startDate: Option[String])( - implicit sqlWriter: LocalDBManager, - settings: DQSettings) + implicit sqlWriter: HistoryDBManager, + settings: DQSettings) extends TrendCheckCore(id, description, metrics, @@ -169,8 +169,8 @@ case class AverageBoundUpperCheck(id: String, threshold: Double, timewindow: Int, startDate: Option[String])( - implicit sqlWriter: LocalDBManager, - settings: DQSettings) + implicit sqlWriter: HistoryDBManager, + settings: DQSettings) extends TrendCheckCore(id, description, metrics, diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/TrendChecks/AverageBoundRangeCheck.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/TrendChecks/AverageBoundRangeCheck.scala new file mode 100644 index 0000000..71260f0 --- /dev/null +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/TrendChecks/AverageBoundRangeCheck.scala @@ -0,0 +1,137 @@ +package it.agilelab.bigdata.DataQuality.checks.TrendChecks + +import it.agilelab.bigdata.DataQuality.checks._ +import it.agilelab.bigdata.DataQuality.exceptions.{IllegalConstraintResultException, IllegalParameterException} +import it.agilelab.bigdata.DataQuality.metrics.{ColumnMetricResult, ComposedMetricResult, FileMetricResult, MetricResult} +import it.agilelab.bigdata.DataQuality.utils.io.HistoryDBManager +import it.agilelab.bigdata.DataQuality.utils.{DQSettings, mapResToColumnMet, mapResToComposedMet, mapResToFileMet} +import scala.util.Try + + +case class AverageBoundRangeCheck(id: String, + description: String, + metrics: Seq[MetricResult], + rule: String, + thresholdUpper: Double, + thresholdLower: Double, + timewindow: Int, + startDate: Option[String]) + (implicit sqlWriter: HistoryDBManager, settings: DQSettings) + extends Check with AverageCheckDistanceCalculator +{ + + + override def metricsList: Seq[MetricResult] = metrics + + def calculateCheck(metric: Double, avg: Double, thresholdUp: Double, thresholdDown: Double): Boolean = { + val upperBound = avg * (1 + thresholdUp) + val lowerBound = avg * (1 + thresholdDown) + + lowerBound <= metric && metric <= upperBound + } + + def getStatusString(status: CheckStatus, metric: Double, avg: Double, thresholdUp: Double, thresholdDown: Double + ): String = { + val upperBound = avg * (1 + thresholdUp) + val lowerBound = avg * (1 + thresholdDown) + + status match { + case CheckSuccess => + s"$lowerBound <= $metric <= $upperBound (with avg=$avg)" + case CheckFailure => + s"$metric not in [$lowerBound,$upperBound] (with avg=$avg)(failed: Should be avg * (1 + lowerBound) <= metricResult <= avg * (1 + upperBound))" + case CheckError(throwable) => + s"Checking $metric error: $throwable" + case default => throw IllegalConstraintResultException(id) + } + } + + override def addMetricList(metrics: Seq[MetricResult]) = + AverageBoundRangeCheck(id, + description, + metrics, + rule, + thresholdUpper, + thresholdLower, + timewindow, + startDate) + + override def run(): CheckResult = { + + val baseMetricResult: MetricResult = metrics.head + val targetMetricResult: MetricResult = metrics.last + + val metricIds: List[String] = List(baseMetricResult.metricId) + + 
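    // A worked illustration of the bound formula in calculateCheck above
    // (numbers are illustrative only, not taken from any configuration):
    //   avg = 100.0, thresholdUpper = 0.3, thresholdLower = -0.2
    //   upperBound = avg * (1 + thresholdUpper) = 130.0
    //   lowerBound = avg * (1 + thresholdLower) =  80.0
    //   => a metric of 95.0 passes, a metric of 140.0 fails.
    // Note that, as the formula is written, thresholdLower must be negative
    // for the lower bound to sit below the average.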
// it will automatically select the correct table to load from, based on the main metric class + val dbMetResults: Seq[Double] = baseMetricResult match { + case _: ComposedMetricResult => + sqlWriter + .loadResults( + metricIds, + rule, + timewindow, + startDate.getOrElse(settings.refDateString))(mapResToComposedMet) + .map(x => x.result) + case _: ColumnMetricResult => + sqlWriter + .loadResults( + metricIds, + rule, + timewindow, + startDate.getOrElse(settings.refDateString))(mapResToColumnMet) + .map(x => x.result) + case _: FileMetricResult => + sqlWriter + .loadResults( + metricIds, + rule, + timewindow, + startDate.getOrElse(settings.refDateString))(mapResToFileMet) + .map(x => x.result) + case x => throw IllegalParameterException(x.toString) + } + + /* + * in the current state we're assuming that time distance between record is always the same + * so the prediction in the next record after provided ones + */ + val avg = calculatePrediction(dbMetResults) + + val checkStatus = CheckUtil.tryToStatus[Double]( + Try(targetMetricResult.result), + d => calculateCheck(d, avg, thresholdUpper, thresholdLower)) + + val statusString = + getStatusString(checkStatus, targetMetricResult.result, avg, thresholdUpper, thresholdLower) + + val checkMessage = CheckMessageGenerator(targetMetricResult, + thresholdUpper, + checkStatus, + statusString, + id, + subType, + Some(rule), + Some(timewindow) + ) + + val cr = CheckResult( + this.id, + subType, + this.description, + baseMetricResult.sourceId, + baseMetricResult.metricId, + Some(targetMetricResult.metricId), + thresholdUpper, + checkStatus.stringValue, + checkMessage.message, + settings.refDateString + ) + + cr + } + + + val subType = "AVERAGE_BOUND_RANGE_CHECK" + +} diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/TrendChecks/TopNRankCheck.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/TrendChecks/TopNRankCheck.scala index d926c9b..819ca6a 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/TrendChecks/TopNRankCheck.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/TrendChecks/TopNRankCheck.scala @@ -3,7 +3,7 @@ package it.agilelab.bigdata.DataQuality.checks.TrendChecks import it.agilelab.bigdata.DataQuality.checks._ import it.agilelab.bigdata.DataQuality.exceptions.IllegalConstraintResultException import it.agilelab.bigdata.DataQuality.metrics.{ColumnMetricResult, MetricResult} -import it.agilelab.bigdata.DataQuality.utils.io.LocalDBManager +import it.agilelab.bigdata.DataQuality.utils.io.HistoryDBManager import it.agilelab.bigdata.DataQuality.utils.{DQSettings, mapResToColumnMet} import scala.util.Try @@ -18,8 +18,8 @@ case class TopNRankCheck(id: String, threshold: Double, timewindow: Int, startDate: Option[String])( - implicit sqlWriter: LocalDBManager, - settings: DQSettings) + implicit sqlWriter: HistoryDBManager, + settings: DQSettings) extends Check { def calculateJaccardDistance(set1: Set[String], set2: Set[String]): Double = { diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/TrendChecks/TrendCheckCore.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/TrendChecks/TrendCheckCore.scala index dde028c..111e540 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/TrendChecks/TrendCheckCore.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/checks/TrendChecks/TrendCheckCore.scala @@ -3,7 +3,7 @@ package it.agilelab.bigdata.DataQuality.checks.TrendChecks import 
it.agilelab.bigdata.DataQuality.checks._ import it.agilelab.bigdata.DataQuality.exceptions.IllegalParameterException import it.agilelab.bigdata.DataQuality.metrics.{ColumnMetricResult, ComposedMetricResult, FileMetricResult, MetricResult} -import it.agilelab.bigdata.DataQuality.utils.io.LocalDBManager +import it.agilelab.bigdata.DataQuality.utils.io.HistoryDBManager import it.agilelab.bigdata.DataQuality.utils._ import scala.util.Try @@ -31,8 +31,8 @@ abstract class TrendCheckCore(id: String, threshold: Double, timewindow: Int, startDate: Option[String])( - implicit sqlWriter: LocalDBManager, - settings: DQSettings) + implicit sqlWriter: HistoryDBManager, + settings: DQSettings) extends Check with Logging { // Things to be implemented in the child classes diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/configs/ConfigReader.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/configs/ConfigReader.scala index b3868f8..edaa144 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/configs/ConfigReader.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/configs/ConfigReader.scala @@ -1,20 +1,24 @@ package it.agilelab.bigdata.DataQuality.configs import java.io.File +import java.util import java.util.Map.Entry import com.typesafe.config.{Config, ConfigFactory, ConfigObject, ConfigValue} import it.agilelab.bigdata.DataQuality.checks.Check +import it.agilelab.bigdata.DataQuality.checks.LoadChecks.{LoadCheck, LoadCheckEnum} import it.agilelab.bigdata.DataQuality.checks.SQLChecks.SQLCheck import it.agilelab.bigdata.DataQuality.checks.SnapshotChecks._ import it.agilelab.bigdata.DataQuality.checks.TrendChecks._ import it.agilelab.bigdata.DataQuality.exceptions.{IllegalParameterException, MissingParameterInException} -import it.agilelab.bigdata.DataQuality.metrics.{OutputMetric, _} +import it.agilelab.bigdata.DataQuality.metrics.MetricProcessor.ParamMap +import it.agilelab.bigdata.DataQuality.metrics._ import it.agilelab.bigdata.DataQuality.postprocessors.{BasicPostprocessor, PostprocessorType} import it.agilelab.bigdata.DataQuality.sources._ import it.agilelab.bigdata.DataQuality.targets.{HdfsTargetConfig, SystemTargetConfig, TargetConfig} -import it.agilelab.bigdata.DataQuality.utils.io.LocalDBManager +import it.agilelab.bigdata.DataQuality.utils.io.HistoryDBManager import it.agilelab.bigdata.DataQuality.utils.{DQSettings, Logging, generateMetricSubId} +import org.apache.spark.storage.StorageLevel import org.joda.time.format.{DateTimeFormat, DateTimeFormatter} import scala.collection.JavaConversions._ @@ -22,26 +26,27 @@ import scala.collection.immutable.Seq import scala.collection.mutable import scala.util.Try -class ConfigReader(configNameFile: String)(implicit sqlWriter: LocalDBManager, settings:DQSettings) extends Logging { +class ConfigReader(configNameFile: String)(implicit sqlWriter: HistoryDBManager, settings: DQSettings) extends Logging { /** * Parsed config OBJECTS * */ - //load conf file val configObj: Config = ConfigFactory.parseFile(new File(configNameFile)).resolve() + val dbConfigMap: Map[String, DatabaseConfig] = getDatabasesById + //parse sources, metrics, checks, it.agilelab.bigdata.targets - val sourcesConfigMap: Map[String, SourceConfig] = getSourcesById + val sourcesConfigMap: Map[String, SourceConfig] = getSourcesById val virtualSourcesConfigMap: Map[String, VirtualFile] = getVirtualSourcesById - val dbConfigMap: Map[String, DatabaseConfig] = getDatabasesById + lazy val loadChecksMap: Map[String, Seq[LoadCheck]] = 
getLoadChecks - val metricsBySourceList: List[(String, Metric)] = getMetricsBySource + val metricsBySourceList: List[(String, Metric)] = getMetricsBySource lazy val metricsBySourceMap: Map[String, List[Metric]] = metricsBySourceList.groupBy(_._1).mapValues(_.map(_._2)) - val metricsByChecksList: List[(Check, String)] = getMetricByCheck + val metricsByChecksList: List[(Check, String)] = getMetricByCheck lazy val metricsByCheckMap: Map[Check, List[String]] = metricsByChecksList.groupBy(_._1).mapValues(_.map(_._2)) lazy val composedMetrics: List[ComposedMetric] = getComposedMetrics @@ -62,116 +67,133 @@ class ConfigReader(configNameFile: String)(implicit sqlWriter: LocalDBManager, s * - DQ Targets * */ - /** * Parses sources from configuration file * @return Map of (source_id, source_config) */ private def getSourcesById: Map[String, SourceConfig] = { - val sourcesList: List[ConfigObject] = configObj.getObjectList("Sources").toList + val sourcesList: List[ConfigObject] = + configObj.getObjectList("Sources").toList def parseDateFromPath(path: String): String = { val length = path.length - val sub = path.substring(length - 8, length) // YYYYMMDD + val sub = path.substring(length - 8, length) // YYYYMMDD val formatter: DateTimeFormatter = DateTimeFormat.forPattern("yyyyMMdd") - val outputFormat = DateTimeFormat.forPattern("yyyy-MM-dd") + val outputFormat = DateTimeFormat.forPattern("yyyy-MM-dd") formatter.parseDateTime(sub).toString(outputFormat) } - sourcesList.map { - src => - val generalConfig = src.toConfig - val keyFieldList: scala.Seq[String] =if(generalConfig.hasPath("keyFields")) - generalConfig.getStringList("keyFields") else Seq.empty - generalConfig.getString("type") match { - case "HDFS" => - val id = generalConfig.getString("id") - val path = generalConfig.getString("path") - val fileType = generalConfig.getString("fileType") - val separator = Try(generalConfig.getString("separator")).toOption - val header = Try(generalConfig.getBoolean("header")).getOrElse(false) - val date = - Try(parseDateFromPath(path)) - .getOrElse(Try(generalConfig.getString("date")) - .getOrElse(settings.refDateString)) - val deps = Try(generalConfig.getStringList("deps").toList).getOrElse(List.empty[String]) - val schema = generalConfig.getString("fileType") match { - case "fixed" => - if (Try(generalConfig.getAnyRef("schema")).isSuccess) getFixedStructSchema(generalConfig) - else if (Try(generalConfig.getStringList("fieldLengths")).isSuccess) getFixedSchema(generalConfig) - else { - val allKeys = generalConfig.entrySet().map(_.getKey) - throw IllegalParameterException("\n CONFIG: " + allKeys.mkString(" - ")) - } - case "csv" => getStructSchema(generalConfig) - case "avro" => getStructSchema(generalConfig) - case "parquet" => getStructSchema(generalConfig) - case x => throw IllegalParameterException(x) - } - id -> HdfsFile(id, path, fileType, separator, header, date, deps, schema,keyFieldList) - case "OUTPUT" => - val path = generalConfig.getString("path") - "OUTPUT" -> OutputFile("OUTPUT", path, "csv", Some("|"), true, "*") - case "TABLE" => - val id = generalConfig.getString("id") - val databaseId = generalConfig.getString("database") - val table = generalConfig.getString("table") - val username = Try{generalConfig.getString("username")}.toOption - val password = Try{generalConfig.getString("password")}.toOption - - id -> TableConfig(id, databaseId, table, username, password,keyFieldList) - case "HIVE" => - val id = generalConfig.getString("id") - val date = generalConfig.getString("date") - val query = 
generalConfig.getString("query") - - id -> HiveTableConfig(id, date, query,keyFieldList) - case "HBASE" => - val id = generalConfig.getString("id") - val table = generalConfig.getString("table") - val hbColumns = generalConfig.getStringList("columns") - id -> HBaseSrcConfig(id, table, hbColumns) - case x => throw IllegalParameterException(x) - } + sourcesList.map { src => + val generalConfig = src.toConfig + val keyFieldList: scala.Seq[String] = + if (generalConfig.hasPath("keyFields")) + generalConfig.getStringList("keyFields") + else Seq.empty + generalConfig.getString("type") match { + case "HDFS" => + val id = generalConfig.getString("id") + val path = generalConfig.getString("path") + val fileType = generalConfig.getString("fileType") + + val header = Try(generalConfig.getBoolean("header")).getOrElse(false) + + val delimiter = Try(generalConfig.getString("delimiter")).toOption + val quote = Try(generalConfig.getString("quote")).toOption + val escape = Try(generalConfig.getString("escape")).toOption + + val date = Try(parseDateFromPath(path)) + .getOrElse(Try(generalConfig.getString("date")).getOrElse(settings.refDateString)) + + val schema = generalConfig.getString("fileType") match { + case "fixed" => + if (Try(generalConfig.getAnyRef("schema")).isSuccess) + getFixedStructSchema(generalConfig) + else if (Try(generalConfig.getStringList("fieldLengths")).isSuccess) + getFixedSchema(generalConfig) + else { + val allKeys = generalConfig.entrySet().map(_.getKey) + throw IllegalParameterException("\n CONFIG: " + allKeys.mkString(" - ")) + } + case "csv" => getStructSchema(generalConfig) + case "avro" => getStructSchema(generalConfig) + case "parquet" => getStructSchema(generalConfig) + case x => throw IllegalParameterException(x) + } + + id -> HdfsFile(id, path, fileType, header, date, delimiter, quote, escape, schema, keyFieldList) + case "OUTPUT" => + val path = generalConfig.getString("path") + "OUTPUT" -> OutputFile("OUTPUT", path, "csv", Some("|"), true, "*") + case "TABLE" => + val id = generalConfig.getString("id") + val databaseId = generalConfig.getString("database") + val table = generalConfig.getString("table") + val username = Try { generalConfig.getString("username") }.toOption + val password = Try { generalConfig.getString("password") }.toOption + + id -> TableConfig(id, databaseId, table, username, password, keyFieldList) + case "HIVE" => + val id = generalConfig.getString("id") + val date = generalConfig.getString("date") + val query = generalConfig.getString("query") + + id -> HiveTableConfig(id, date, query, keyFieldList) + case "HBASE" => + val id = generalConfig.getString("id") + val table = generalConfig.getString("table") + val hbColumns = generalConfig.getStringList("columns") + id -> HBaseSrcConfig(id, table, hbColumns) + case x => throw IllegalParameterException(x) + } }.toMap } - private def getVirtualSourcesById: Map[String, VirtualFile] = { - if(configObj.hasPath("VirtualSources")){ - val sourcesList: List[ConfigObject] = configObj.getObjectList("VirtualSources").toList - - sourcesList.map { - src => - val generalConfig = src.toConfig - val keyFieldList: scala.Seq[String] = if(generalConfig.hasPath("keyFields")) - generalConfig.getStringList("keyFields") else Seq() - - val parentSourcesIds: scala.Seq[String] = - generalConfig.getStringList("parentSources") - - val isSave: Boolean = Try{generalConfig.getBoolean("save")}.toOption.getOrElse(false) - val id = generalConfig.getString("id") - generalConfig.getString("type") match { - case "FILTER-SQL" => - val sql = 
generalConfig.getString("sql") - id -> VirtualFileSelect(id,parentSourcesIds,sql, keyFieldList, isSave) - case "JOIN-SQL" => - val sql = generalConfig.getString("sql") - id -> VirtualFileJoinSql(id,parentSourcesIds,sql, keyFieldList, isSave) - case "JOIN" => - val joiningColumns=generalConfig.getStringList("joiningColumns") - val joinType=generalConfig.getString("joinType") - id -> VirtualFileJoin(id,parentSourcesIds,joiningColumns,joinType, keyFieldList, isSave) - - case x => throw IllegalParameterException(x) - } + if (configObj.hasPath("VirtualSources")) { + val sourcesList: List[ConfigObject] = + configObj.getObjectList("VirtualSources").toList + + sourcesList.map { src => + val generalConfig = src.toConfig + val keyFieldList: scala.Seq[String] = + if (generalConfig.hasPath("keyFields")) + generalConfig.getStringList("keyFields") + else Seq() + + val parentSourcesIds: scala.Seq[String] = + generalConfig.getStringList("parentSources") + + val isSave: Boolean = Try { generalConfig.getBoolean("save") }.toOption + .getOrElse(false) + val id = generalConfig.getString("id") + generalConfig.getString("type") match { + case "FILTER-SQL" => + val sql = generalConfig.getString("sql") + val persist: Option[StorageLevel] = + if (generalConfig.hasPath("persist")) + Some(StorageLevel.fromString(generalConfig.getString("persist"))) + else None + id -> VirtualFileSelect(id, parentSourcesIds, sql, keyFieldList, isSave, persist) + case "JOIN-SQL" => + val sql = generalConfig.getString("sql") + val persist: Option[StorageLevel] = + if (generalConfig.hasPath("persist")) + Some(StorageLevel.fromString(generalConfig.getString("persist"))) + else None + id -> VirtualFileJoinSql(id, parentSourcesIds, sql, keyFieldList, isSave, persist) + case "JOIN" => + val joiningColumns = generalConfig.getStringList("joiningColumns") + val joinType = generalConfig.getString("joinType") + id -> VirtualFileJoin(id, parentSourcesIds, joiningColumns, joinType, keyFieldList, isSave) + + case x => throw IllegalParameterException(x) + } }.toMap - }else { + } else { Map.empty } @@ -181,26 +203,25 @@ class ConfigReader(configNameFile: String)(implicit sqlWriter: LocalDBManager, s * Parses databases from configuration file * @return Map of (db_id, db_config) */ - private def getDatabasesById: Map[String, DatabaseConfig]= { - val dbList: List[ConfigObject] = Try{ + private def getDatabasesById: Map[String, DatabaseConfig] = { + val dbList: List[ConfigObject] = Try { configObj.getObjectList("Databases").toList }.getOrElse(List.empty) - dbList.map{ - db => - val generalConfig = db.toConfig - val outerConfig = generalConfig.getConfig("config") - val id = generalConfig.getString("id") - val subtype = generalConfig.getString("subtype") - val host = outerConfig.getString("host") + dbList.map { db => + val generalConfig = db.toConfig + val outerConfig = generalConfig.getConfig("config") + val id = generalConfig.getString("id") + val subtype = generalConfig.getString("subtype") + val host = outerConfig.getString("host") - val port = Try(outerConfig.getString("port")).toOption - val service = Try(outerConfig.getString("service")).toOption - val user = Try(outerConfig.getString("user")).toOption - val password = Try(outerConfig.getString("password")).toOption - val schema = Try(outerConfig.getString("schema")).toOption + val port = Try(outerConfig.getString("port")).toOption + val service = Try(outerConfig.getString("service")).toOption + val user = Try(outerConfig.getString("user")).toOption + val password = 
Try(outerConfig.getString("password")).toOption + val schema = Try(outerConfig.getString("schema")).toOption - id -> DatabaseConfig(id, subtype, host, port, service, user, password, schema) + id -> DatabaseConfig(id, subtype, host, port, service, user, password, schema) }.toMap } @@ -209,31 +230,30 @@ class ConfigReader(configNameFile: String)(implicit sqlWriter: LocalDBManager, s * @return Map of (file_id, metric) */ private def getMetricsBySource: List[(String, Metric)] = { - val metricsList: List[ConfigObject] = configObj.getObjectList("Metrics").toList - - val metricFileList: List[(String, Metric)] = metricsList.map { - mts => - val outerConf = mts.toConfig - val metricType = outerConf.getString("type") - val id = outerConf.getString("id") - val name = outerConf.getString("name") - val descr = outerConf.getString("description") - val intConfig = outerConf.getObject("config").toConfig - val params = getParams(intConfig) - val applyFile = intConfig.getString("file") - - metricType match { - case "COLUMN" => - val applyColumns = intConfig.getStringList("columns") - log.warn("COLUMNS "+applyColumns.mkString(",")) - applyFile -> ColumnMetric(id, name, descr, applyFile, "", applyColumns, params) - case "FILE" => - applyFile -> FileMetric(id, name, descr, applyFile, "", params) - case "OUTPUT" => - val applyMetric = intConfig.getString("outputMetric") - applyMetric -> OutputMetric(id, name, descr, applyMetric, params) - case x => throw IllegalParameterException(x) - } + val metricsList: List[ConfigObject] = + configObj.getObjectList("Metrics").toList + + val metricFileList: List[(String, Metric)] = metricsList.map { mts => + val outerConf = mts.toConfig + val metricType = outerConf.getString("type") + val id = outerConf.getString("id") + val name = outerConf.getString("name") + val descr = outerConf.getString("description") + val intConfig = outerConf.getObject("config").toConfig + val params = getParams(intConfig) + val applyFile = intConfig.getString("file") + + metricType match { + case "COLUMN" => + val applyColumns = intConfig.getStringList("columns") + val columnPos: scala.Seq[Int] = + Try(intConfig.getIntList("positions").toSeq.map(_.toInt)).toOption + .getOrElse(Seq.empty) + applyFile -> ColumnMetric(id, name, descr, applyFile, "", applyColumns, params, columnPos) + case "FILE" => + applyFile -> FileMetric(id, name, descr, applyFile, "", params) + case x => throw IllegalParameterException(x) + } } metricFileList @@ -245,7 +265,7 @@ class ConfigReader(configNameFile: String)(implicit sqlWriter: LocalDBManager, s */ private def getSqlChecks: List[SQLCheck] = { val checkList: List[ConfigObject] = configObj.getObjectList("Checks").toList - val sqlChecks = checkList.flatMap{ check => + val sqlChecks = checkList.flatMap { check => val outerConf = check.toConfig val checkType = outerConf.getString("type") checkType match { @@ -254,20 +274,20 @@ class ConfigReader(configNameFile: String)(implicit sqlWriter: LocalDBManager, s Try { outerConf.getString("description") }.toOption - val subtype = outerConf.getString("subtype") + val subtype = outerConf.getString("subtype") val innerConf = outerConf.getConfig("config") - val source = innerConf.getString("source") - val query = innerConf.getString("query") + val source = innerConf.getString("source") + val query = innerConf.getString("query") val id = Try { outerConf.getString("id") - }.toOption.getOrElse(subtype+":"+checkType+":"+source+":"+query.hashCode) + }.toOption.getOrElse(subtype + ":" + checkType + ":" + source + ":" + query.hashCode) val 
date = Try { outerConf.getString("date") }.toOption.getOrElse(settings.refDateString) - val sourceConf: DatabaseConfig = this.dbConfigMap(source)// "ORACLE" - List(SQLCheck(id,description.getOrElse(""),subtype,source,sourceConf,query,date)) - case "snapshot"|"trend" => List.empty - case x => throw IllegalParameterException(x) + val sourceConf: DatabaseConfig = this.dbConfigMap(source) // "ORACLE" + List(SQLCheck(id, description.getOrElse(""), subtype, source, sourceConf, query, date)) + case "snapshot" | "trend" => List.empty + case x => throw IllegalParameterException(x) } } sqlChecks @@ -280,23 +300,24 @@ class ConfigReader(configNameFile: String)(implicit sqlWriter: LocalDBManager, s private def getMetricByCheck: List[(Check, String)] = { val checksList: List[ConfigObject] = configObj.getObjectList("Checks").toList - val metricListByCheck = checksList.flatMap { - chks => - val outerConf = chks.toConfig - val checkType = outerConf.getString("type") - val descr = Try { - outerConf.getString("description") - }.toOption - val subtype = outerConf.getString("subtype") - val intConfig = outerConf.getObject("config").toConfig - - val params = getParams(intConfig) - val metricListByCheck: List[(Check, String)] = checkType.toUpperCase match { + val metricListByCheck = checksList.flatMap { chks => + val outerConf = chks.toConfig + val checkType = outerConf.getString("type") + val descr = Try { + outerConf.getString("description") + }.toOption + val subtype = outerConf.getString("subtype") + val intConfig = outerConf.getObject("config").toConfig + + val params = getParams(intConfig) + val metricListByCheck: List[(Check, String)] = + checkType.toUpperCase match { case "SNAPSHOT" => val metrics = intConfig.getStringList("metrics") val id = Try { outerConf.getString("id") - }.toOption.getOrElse(subtype+":"+checkType+":"+metrics.mkString("+")+":"+params.values.mkString(",")) + }.toOption.getOrElse(subtype + ":" + checkType + ":" + metrics + .mkString("+") + ":" + params.values.mkString(",")) subtype match { // There also a way to use check name, but with additional comparsment rule case "BASIC_NUMERIC" => @@ -304,44 +325,110 @@ class ConfigReader(configNameFile: String)(implicit sqlWriter: LocalDBManager, s compRule.toUpperCase match { case "GT" => if (params.contains("threshold")) - metrics.map { m => GreaterThanThresholdCheck(id, descr.getOrElse(""), Seq.empty[MetricResult], params("threshold").toString.toDouble) -> m }.toList + metrics.map { m => + GreaterThanThresholdCheck(id, + descr.getOrElse(""), + Seq.empty[MetricResult], + params("threshold").toString.toDouble) -> m + }.toList else if (params.contains("compareMetric")) - metrics.map { m => GreaterThanMetricCheck(id, descr.getOrElse(""), Seq.empty[MetricResult], params("compareMetric").toString) -> m }.toList + metrics.map { m => + GreaterThanMetricCheck(id, + descr.getOrElse(""), + Seq.empty[MetricResult], + params("compareMetric").toString) -> m + }.toList else throw MissingParameterInException(subtype) case "LT" => if (params.contains("threshold")) - metrics.map { m => LessThanThresholdCheck(id, descr.getOrElse(""), Seq.empty[MetricResult], params("threshold").toString.toDouble) -> m }.toList + metrics.map { m => + LessThanThresholdCheck(id, + descr.getOrElse(""), + Seq.empty[MetricResult], + params("threshold").toString.toDouble) -> m + }.toList else if (params.contains("compareMetric")) - metrics.map { m => LessThanMetricCheck(id, descr.getOrElse(""), Seq.empty[MetricResult], params("compareMetric").toString) -> m }.toList + metrics.map { 
m => + LessThanMetricCheck(id, + descr.getOrElse(""), + Seq.empty[MetricResult], + params("compareMetric").toString) -> m + }.toList else throw MissingParameterInException(subtype) case "EQ" => if (params.contains("threshold")) - metrics.map { m => EqualToThresholdCheck(id, descr.getOrElse(""), Seq.empty[MetricResult], params("threshold").toString.toDouble) -> m }.toList + metrics.map { m => + EqualToThresholdCheck(id, + descr.getOrElse(""), + Seq.empty[MetricResult], + params("threshold").toString.toDouble) -> m + }.toList else if (params.contains("compareMetric")) - metrics.map { m => EqualToMetricCheck(id, descr.getOrElse(""), Seq.empty[MetricResult], params("compareMetric").toString) -> m }.toList + metrics.map { m => + EqualToMetricCheck(id, + descr.getOrElse(""), + Seq.empty[MetricResult], + params("compareMetric").toString) -> m + }.toList else throw MissingParameterInException(subtype) } case "GREATER_THAN" => if (params.contains("threshold")) - metrics.map { m => GreaterThanThresholdCheck(id, descr.getOrElse(""), Seq.empty[MetricResult], params("threshold").toString.toDouble) -> m }.toList + metrics.map { m => + GreaterThanThresholdCheck(id, + descr.getOrElse(""), + Seq.empty[MetricResult], + params("threshold").toString.toDouble) -> m + }.toList else if (params.contains("compareMetric")) - metrics.map { m => GreaterThanMetricCheck(id, descr.getOrElse(""), Seq.empty[MetricResult], params("compareMetric").toString) -> m }.toList + metrics.map { m => + GreaterThanMetricCheck(id, + descr.getOrElse(""), + Seq.empty[MetricResult], + params("compareMetric").toString) -> m + }.toList else throw MissingParameterInException(subtype) case "LESS_THAN" => if (params.contains("threshold")) - metrics.map { m => LessThanThresholdCheck(id, descr.getOrElse(""), Seq.empty[MetricResult], params("threshold").toString.toDouble) -> m }.toList + metrics.map { m => + LessThanThresholdCheck(id, + descr.getOrElse(""), + Seq.empty[MetricResult], + params("threshold").toString.toDouble) -> m + }.toList else if (params.contains("compareMetric")) - metrics.map { m => LessThanMetricCheck(id, descr.getOrElse(""), Seq.empty[MetricResult], params("compareMetric").toString) -> m }.toList + metrics.map { m => + LessThanMetricCheck(id, + descr.getOrElse(""), + Seq.empty[MetricResult], + params("compareMetric").toString) -> m + }.toList else throw MissingParameterInException(subtype) case "EQUAL_TO" => if (params.contains("threshold")) - metrics.map { m => EqualToThresholdCheck(id, descr.getOrElse(""), Seq.empty[MetricResult], params("threshold").toString.toDouble) -> m }.toList + metrics.map { m => + EqualToThresholdCheck(id, + descr.getOrElse(""), + Seq.empty[MetricResult], + params("threshold").toString.toDouble) -> m + }.toList else if (params.contains("compareMetric")) - metrics.map { m => EqualToMetricCheck(id, descr.getOrElse(""), Seq.empty[MetricResult], params("compareMetric").toString) -> m }.toList + metrics.map { m => + EqualToMetricCheck(id, + descr.getOrElse(""), + Seq.empty[MetricResult], + params("compareMetric").toString) -> m + }.toList else throw MissingParameterInException(subtype) case "DIFFER_BY_LT" => if (params.contains("threshold") && params.contains("compareMetric")) - metrics.map { m => DifferByLTMetricCheck(id, descr.getOrElse(""), Seq.empty[MetricResult], params("compareMetric").toString, params("threshold").toString.toDouble) -> m }.toList + metrics.map { m => + DifferByLTMetricCheck(id, + descr.getOrElse(""), + Seq.empty[MetricResult], + params("compareMetric").toString, + 
params("threshold").toString.toDouble) -> m + }.toList else throw MissingParameterInException(subtype) case x => throw IllegalParameterException(x) } @@ -349,62 +436,94 @@ class ConfigReader(configNameFile: String)(implicit sqlWriter: LocalDBManager, s val metrics = intConfig.getStringList("metrics") val id = Try { outerConf.getString("id") - }.toOption.getOrElse(subtype+":"+checkType+":"+metrics.mkString("+")+":"+params.values.mkString(",")) - val rule = intConfig.getString("rule") - val startDate = Try {params("startDate").toString}.toOption + }.toOption.getOrElse(subtype + ":" + checkType + ":" + metrics + .mkString("+") + ":" + params.values.mkString(",")) + val rule = intConfig.getString("rule") + val startDate = Try { params("startDate").toString }.toOption subtype match { case "TOP_N_RANK_CHECK" => - if (params.contains("threshold") && params.contains("timewindow") ) - metrics.flatMap { m => { - val basecheck = TopNRankCheck(id, descr.getOrElse(""), - Seq.empty[MetricResult], rule, params("threshold").toString.toDouble, - params("timewindow").toString.toInt, startDate) - generateMetricSubId(m, params("targetNumber").toString.toInt).map(x => basecheck -> x) - }}.toList + if (params.contains("threshold") && params.contains("timewindow")) + metrics.flatMap { m => + { + val basecheck = + TopNRankCheck(id, + descr.getOrElse(""), + Seq.empty[MetricResult], + rule, + params("threshold").toString.toDouble, + params("timewindow").toString.toInt, + startDate) + generateMetricSubId(m, params("targetNumber").toString.toInt) + .map(x => basecheck -> x) + } + }.toList else throw MissingParameterInException(subtype) case "AVERAGE_BOUND_FULL_CHECK" => if (params.contains("threshold") && params.contains("timewindow")) - metrics.map { m => AverageBoundFullCheck( - id, - descr.getOrElse(""), - Seq.empty[MetricResult], - rule, - params("threshold").toString.toDouble, - params("timewindow").toString.toInt, - startDate - ) -> m }.toList + metrics.map { m => + AverageBoundFullCheck( + id, + descr.getOrElse(""), + Seq.empty[MetricResult], + rule, + params("threshold").toString.toDouble, + params("timewindow").toString.toInt, + startDate + ) -> m + }.toList else throw MissingParameterInException(subtype) + case "AVERAGE_BOUND_RANGE_CHECK" => + if (params.contains("thresholdUpper") && params.contains("thresholdLower") && params.contains( + "timewindow")) + metrics.map { m => + AverageBoundRangeCheck( + id, + descr.getOrElse(""), + Seq.empty[MetricResult], + rule, + params("thresholdUpper").toString.toDouble, + params("thresholdLower").toString.toDouble, + params("timewindow").toString.toInt, + startDate + ) -> m + }.toList + else throw MissingParameterInException(subtype) + case "AVERAGE_BOUND_UPPER_CHECK" => if (params.contains("threshold") && params.contains("timewindow")) - metrics.map { m => AverageBoundUpperCheck( - id, - descr.getOrElse(""), - Seq.empty[MetricResult], - rule, - params("threshold").toString.toDouble, - params("timewindow").toString.toInt, - startDate - ) -> m }.toList + metrics.map { m => + AverageBoundUpperCheck( + id, + descr.getOrElse(""), + Seq.empty[MetricResult], + rule, + params("threshold").toString.toDouble, + params("timewindow").toString.toInt, + startDate + ) -> m + }.toList else throw MissingParameterInException(subtype) case "AVERAGE_BOUND_LOWER_CHECK" => if (params.contains("threshold") && params.contains("timewindow")) - metrics.map { m => AverageBoundLowerCheck( - id, - descr.getOrElse(""), - Seq.empty[MetricResult], - rule, - params("threshold").toString.toDouble, - 
params("timewindow").toString.toInt, - startDate - ) -> m }.toList + metrics.map { m => + AverageBoundLowerCheck( + id, + descr.getOrElse(""), + Seq.empty[MetricResult], + rule, + params("threshold").toString.toDouble, + params("timewindow").toString.toInt, + startDate + ) -> m + }.toList else throw MissingParameterInException(subtype) case x => throw IllegalParameterException(x) } case "SQL" => List.empty - case x => throw IllegalParameterException(x) + case x => throw IllegalParameterException(x) } - metricListByCheck + metricListByCheck } metricListByCheck @@ -416,49 +535,77 @@ class ConfigReader(configNameFile: String)(implicit sqlWriter: LocalDBManager, s */ private def getTargetsConfigMap: Map[String, List[TargetConfig]] = { - val optionalTargetList = Try{configObj.getObjectList("Targets").toList}.toOption + val optionalTargetList = Try { configObj.getObjectList("Targets").toList }.toOption optionalTargetList match { case Some(targetList) => - val parsedList = targetList.map { - trg => - val outerConf = trg.toConfig - val tipo = outerConf.getString("type") - val name = Try { - outerConf.getString("id") - }.getOrElse(tipo) + val parsedList = targetList.map { trg => + val outerConf = trg.toConfig + val inConfig = outerConf.getObject("config").toConfig - val inConfig = outerConf.getObject("config").toConfig - val fileFormat = inConfig.getString("fileFormat") - val path = inConfig.getString("path") - val delimiter = Try { - inConfig.getString("delimiter") - }.toOption - val savemode = Try { - inConfig.getString("savemode") - }.toOption - val date = Try { - inConfig.getString("date") - }.toOption + val tipo = outerConf.getString("type") + val name = Try(outerConf.getString("id")).getOrElse(tipo) - val hdfsTargetConfig = HdfsTargetConfig(name, fileFormat, path, delimiter, date, savemode) - tipo match { - case "SYSTEM" => - val checkList: Seq[String] = outerConf.getStringList("checkList").toList - val mailList: Seq[String] = outerConf.getStringList("mailingList").toList - tipo -> SystemTargetConfig(name, checkList, mailList, hdfsTargetConfig) - case _ => - tipo -> hdfsTargetConfig - } + val fileFormat = inConfig.getString("fileFormat") + val path = inConfig.getString("path") + + val delimiter = Try(inConfig.getString("delimiter")).toOption + val quote = Try(inConfig.getString("quote")).toOption + val escape = Try(inConfig.getString("escape")).toOption + + val quoteMode = Try(inConfig.getString("quoteMode")).toOption + + val date = Try(inConfig.getString("date")).toOption + + val hdfsTargetConfig = HdfsTargetConfig(name, fileFormat, path, delimiter, quote, escape, date, quoteMode) + + tipo.toUpperCase match { + case "SYSTEM" => + val checkList: Seq[String] = outerConf.getStringList("checkList").toList + val mailList: Seq[String] = outerConf.getStringList("mailingList").toList + tipo -> SystemTargetConfig(name, checkList, mailList, hdfsTargetConfig) + case _ => + tipo -> hdfsTargetConfig + } } - parsedList.groupBy(_._1).map { case (k,v) => (k,v.map(_._2))} + parsedList.groupBy(_._1).map { case (k, v) => (k, v.map(_._2)) } case None => Map.empty } } + private def getLoadChecks: Map[String, Seq[LoadCheck]] = { + val checkListConfOpt = Try { configObj.getObjectList("LoadChecks").toList }.toOption + checkListConfOpt match { + case Some(checkList) => + checkList + .map(x => { + val conf = x.toConfig + + val id: String = conf.getString("id") + val tipo: String = conf.getString("type") + val source: String = conf.getString("source") + val result: AnyRef = conf.getAnyRef("option") + + if 
(LoadCheckEnum.contains(tipo)) { + val check: LoadCheck = LoadCheckEnum + .getCheckClass(tipo) + .getConstructor(classOf[String], classOf[String], classOf[String], classOf[AnyRef]) + .newInstance(id, tipo, source, result) + .asInstanceOf[LoadCheck] + + (source, check) + } else throw new IllegalArgumentException(s"Unknown Load Check type: $tipo") + + }) + .groupBy(_._1) + .map { case (k, v) => (k, v.map(_._2)) } + case None => Map.empty[String, Seq[LoadCheck]] + } + + } + /** * Utilities */ - /** * Processes parameter sub-configuration * Made to prevent unexpected parameters and their values @@ -472,20 +619,23 @@ class ConfigReader(configNameFile: String)(implicit sqlWriter: LocalDBManager, s entry: Entry[String, ConfigValue] <- p.entrySet() key = entry.getKey value = key match { - case "threshold" => p.getDouble(key) - case "timewindow" => p.getInt(key) - case "compareMetric" => p.getString(key) - case "compareValue" => p.getString(key) - case "targetValue" => p.getString(key) - case "maxCapacity" => p.getInt(key) - case "accuracyError" => p.getDouble(key) - case "targetNumber" => p.getInt(key) - case "targetSideNumber" => p.getDouble(key) // move to irrelevant params - case "domain" => p.getStringList(key).toSet - case "startDate" => p.getString(key) - case "compRule" => p.getString(key) + case "threshold" => p.getDouble(key) + case "thresholdUpper" => p.getDouble(key) + case "thresholdLower" => p.getDouble(key) + case "timewindow" => p.getInt(key) + case "compareMetric" => p.getString(key) + case "compareValue" => p.getString(key) + case "targetValue" => p.getString(key) + case "maxCapacity" => p.getInt(key) + case "accuracyError" => p.getDouble(key) + case "targetNumber" => p.getInt(key) + case "targetSideNumber" => + p.getDouble(key) // move to irrelevant params + case "domain" => p.getStringList(key).toSet + case "startDate" => p.getString(key) + case "compRule" => p.getString(key) case "dateFormat" => p.getString(key) - case "regex" => p.getString(key) + case "regex" => p.getString(key) case x => log.error(s"${key.toUpperCase} is an unexpected parameters from config!") throw IllegalParameterException(x) @@ -500,23 +650,24 @@ class ConfigReader(configNameFile: String)(implicit sqlWriter: LocalDBManager, s * @return List of composed metrics */ private def getComposedMetrics: List[ComposedMetric] = { - val metricsList: List[ConfigObject] = Try(configObj.getObjectList("ComposedMetrics").toList).getOrElse(List.empty[ConfigObject]) - - val metricFileList: List[ComposedMetric] = metricsList.map { - mts => - val outerConf = mts.toConfig - val id = outerConf.getString("id") - val name = outerConf.getString("name") - val descr = outerConf.getString("description") - val formula = outerConf.getString("formula") - - ComposedMetric( - id, - name, - descr, - formula, - Map.empty - ) + val metricsList: List[ConfigObject] = + Try(configObj.getObjectList("ComposedMetrics").toList) + .getOrElse(List.empty[ConfigObject]) + + val metricFileList: List[ComposedMetric] = metricsList.map { mts => + val outerConf = mts.toConfig + val id = outerConf.getString("id") + val name = outerConf.getString("name") + val descr = outerConf.getString("description") + val formula = outerConf.getString("formula") + + ComposedMetric( + id, + name, + descr, + formula, + Map.empty + ) } metricFileList @@ -535,12 +686,13 @@ class ConfigReader(configNameFile: String)(implicit sqlWriter: LocalDBManager, s conf.getObjectList("schema") }.toOption match { case Some(p) => p.asScala - case _ => Try { - conf.getString("schema") - 
}.toOption match { - case Some(s) => return Some(s) - case _ => mutable.Buffer.empty[ConfigObject] - } + case _ => + Try { + conf.getString("schema") + }.toOption match { + case Some(s) => return Some(s) + case _ => mutable.Buffer.empty[ConfigObject] + } } // exact schema parsing @@ -549,10 +701,11 @@ class ConfigReader(configNameFile: String)(implicit sqlWriter: LocalDBManager, s StructColumn( cf.getString("name"), cf.getString("type"), - if (cf.getString("type") == "date") Option(cf.getString("format")) else None + if (cf.getString("type") == "date") Option(cf.getString("format")) + else None ) } - if (ll.isEmpty) None else Some(ll) + if (ll.isEmpty) None else Some(ll) } @@ -567,12 +720,13 @@ class ConfigReader(configNameFile: String)(implicit sqlWriter: LocalDBManager, s conf.getObjectList("schema") }.toOption match { case Some(p) => p.toList - case _ => Try { - conf.getString("schema") - }.toOption match { - case Some(s) => return Some(s) - case _ => mutable.Buffer.empty[ConfigObject] - } + case _ => + Try { + conf.getString("schema") + }.toOption match { + case Some(s) => return Some(s) + case _ => mutable.Buffer.empty[ConfigObject] + } } val ll: Seq[StructFixedColumn] = list.toList.map { x => @@ -581,7 +735,8 @@ class ConfigReader(configNameFile: String)(implicit sqlWriter: LocalDBManager, s cf.getString("name"), cf.getString("type"), cf.getInt("length"), - if (cf.getString("type") == "date") Option(cf.getString("format")) else None + if (cf.getString("type") == "date") Option(cf.getString("format")) + else None ) } @@ -612,7 +767,9 @@ class ConfigReader(configNameFile: String)(implicit sqlWriter: LocalDBManager, s log.info(s"Found configuration with mode : $mode") val inner: Config = conf.getConfig("config") - meta.service.getConstructors.head.newInstance(inner).asInstanceOf[BasicPostprocessor] + meta.service.getConstructors.head + .newInstance(inner) + .asInstanceOf[BasicPostprocessor] case None => log.warn("Wrong mode name!") throw IllegalParameterException(mode) @@ -621,5 +778,3 @@ class ConfigReader(configNameFile: String)(implicit sqlWriter: LocalDBManager, s } } - - diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/configs/GenStructType.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/configs/GenStructType.scala index 1ab4f89..ba92b3d 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/configs/GenStructType.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/configs/GenStructType.scala @@ -1,8 +1,6 @@ package it.agilelab.bigdata.DataQuality.configs /** - * Created by Gianvito Siciliano on 12/01/17. 
- * * Representation of columns for schema parsing */ sealed abstract class GenStructColumn { @@ -11,17 +9,11 @@ sealed abstract class GenStructColumn { def tipo: String } -case class StructColumn(name: String, - tipo: String, - format: Option[String] = None) - extends GenStructColumn { +case class StructColumn(name: String, tipo: String, format: Option[String] = None) extends GenStructColumn { def getType = "StructColumn" } -case class StructFixedColumn(name: String, - tipo: String, - length: Int, - format: Option[String] = None) +case class StructFixedColumn(name: String, tipo: String, length: Int, format: Option[String] = None) extends GenStructColumn { def getType = "StructFixedColumn" } diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/ColumnMetrics/BasicNumericMetrics.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/ColumnMetrics/BasicNumericMetrics.scala index b8304f0..d8fb4f2 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/ColumnMetrics/BasicNumericMetrics.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/ColumnMetrics/BasicNumericMetrics.scala @@ -318,7 +318,7 @@ object BasicNumericMetrics { } override def increment(values: Seq[Any]): MetricCalculator = { - if (Try { values.head.asInstanceOf[Double] }.isSuccess) + if (Try { values.head.toString.toDouble }.isSuccess) NumberCastValuesMetricCalculator(cnt + 1) else this } diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/ColumnMetrics/BasicStringMetrics.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/ColumnMetrics/BasicStringMetrics.scala index 55dabfb..3b350d3 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/ColumnMetrics/BasicStringMetrics.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/ColumnMetrics/BasicStringMetrics.scala @@ -1,9 +1,12 @@ package it.agilelab.bigdata.DataQuality.metrics.ColumnMetrics +import java.text.SimpleDateFormat + import it.agilelab.bigdata.DataQuality.metrics.CalculatorStatus.CalculatorStatus import it.agilelab.bigdata.DataQuality.metrics.MetricProcessor.ParamMap import it.agilelab.bigdata.DataQuality.metrics.{CalculatorStatus, MetricCalculator, StatusableCalculator} import it.agilelab.bigdata.DataQuality.utils.{getParametrizedMetricTail, _} +import org.joda.time.LocalDateTime import org.joda.time.format.DateTimeFormat import scala.util.Try @@ -281,9 +284,11 @@ object BasicStringMetrics { paramMap) private def checkDate(value: Any, dateFormat: String) = { - val fmt = DateTimeFormat forPattern formatDate tryToString(value) match { - case Some(v) => Try(fmt parseDateTime v).isSuccess + case Some(v) => + val joda = Try(LocalDateTime.parse(v, DateTimeFormat.forPattern(dateFormat))).isSuccess + val sdf = Try(new SimpleDateFormat(dateFormat).parse(v)).isSuccess + joda || sdf case _ => false } diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/Metric.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/Metric.scala index a6fdcf2..eceaaa1 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/Metric.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/Metric.scala @@ -18,8 +18,11 @@ case class ColumnMetric( source: String, sourceDate: String, columns: Seq[String], - paramMap: Map[String, Any] -) extends Metric + paramMap: Map[String, Any], + positions: Seq[Int] = Seq.empty +) extends Metric { + if (positions.nonEmpty && 
positions.size != columns.size) throw new IllegalArgumentException("positions.size != columns.size") +} case class FileMetric( id: String, @@ -38,10 +41,9 @@ case class ComposedMetric( paramMap: Map[String, Any] ) extends Metric -case class OutputMetric( - id: String, - name: String, - description: String, - outputMetric: String, - paramMap: Map[String, Any] -) extends Metric +case class ConfigMetric( + id: String, + name: String, + description: String, + paramMap: Map[String, Any] + ) extends Metric diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/MetricCalculator.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/MetricCalculator.scala index 5b0daa2..00f0ce8 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/MetricCalculator.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/MetricCalculator.scala @@ -19,8 +19,7 @@ object CalculatorStatus extends Enumeration { trait StatusableCalculator extends MetricCalculator { protected val status: CalculatorStatus protected val failCount: Int - protected def copyWithState( - failed: CalculatorStatus): MetricCalculator with StatusableCalculator + protected def copyWithState(failed: CalculatorStatus): MetricCalculator with StatusableCalculator def getFailCounter: Int = failCount @@ -31,6 +30,7 @@ trait StatusableCalculator extends MetricCalculator { * Basic metric calculator */ trait MetricCalculator { + /** * Merges two metric calculators together * @@ -54,7 +54,7 @@ trait MetricCalculator { */ def result(): Map[String, (Double, Option[String])] - def getStatus = CalculatorStatus.OK + def getStatus: CalculatorStatus = CalculatorStatus.OK } @@ -62,8 +62,7 @@ trait MetricCalculator { * Takes all metric results and then calculating new ones with formulas * @param primitiveMetrics metric results to operate with */ -class ComposedMetricCalculator(primitiveMetrics: Iterable[MetricResult]) - extends ExprParsers2 { +class ComposedMetricCalculator(primitiveMetrics: Iterable[MetricResult]) extends ExprParsers2 { private lazy val metricsResultMap: Map[String, String] = getMetricResultMap @@ -75,17 +74,11 @@ class ComposedMetricCalculator(primitiveMetrics: Iterable[MetricResult]) */ def run(ex: ComposedMetric)(implicit settings: DQSettings) = { val formulaWithParameters = ex.formula - val formulaWithValues = replaceMetricsInFormula(formulaWithParameters) + val formulaWithValues = replaceMetricsInFormula(formulaWithParameters) val result = calculateFormula(formulaWithValues) - ComposedMetricResult(ex.id, - settings.refDateString, - ex.name, - "", - formulaWithParameters, - result, - "") + ComposedMetricResult(ex.id, settings.refDateString, ex.name, "", formulaWithParameters, result, "") } /** @@ -135,7 +128,7 @@ sealed trait ExprParsers2 extends JavaTokenParsers { case class Mul(t1: Tree, t2: Tree) extends Tree case class Div(t1: Tree, t2: Tree) extends Tree case class Pow(t1: Tree, t2: Tree) extends Tree - case class Num(t: Double) extends Tree + case class Num(t: Double) extends Tree def eval(t: Tree): Double = t match { case Add(t1, t2) => eval(t1) + eval(t2) diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/MetricProcessor.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/MetricProcessor.scala index 73005b5..36e0e01 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/MetricProcessor.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/MetricProcessor.scala @@ -4,10 +4,12 @@ 
import it.agilelab.bigdata.DataQuality.exceptions.IllegalParameterException import it.agilelab.bigdata.DataQuality.metrics.FileMetrics.FileMetrics.RowCountMetricCalculator import it.agilelab.bigdata.DataQuality.utils import it.agilelab.bigdata.DataQuality.utils.{DQSettings, Logging} -import org.apache.spark.sql.{DataFrame, Row} +import org.apache.hadoop.fs.FileSystem +import org.apache.spark.sql.{DataFrame, Row, SQLContext} import org.apache.spark.{Accumulable, SparkContext} import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer object MetricProcessor extends Logging { @@ -26,11 +28,11 @@ object MetricProcessor extends Logging { def processAllMetrics(df: DataFrame, colMetrics: Seq[ColumnMetric], fileMetrics: Seq[FileMetric], - sourceKeyFields: Seq[String])( - implicit settings: DQSettings, - sparkContext: SparkContext) - : (Map[Seq[String], Map[ColumnMetric, (Double, Option[String])]], - Map[FileMetric, (Double, Option[String])]) = { + sourceKeyFields: Seq[String])(implicit settings: DQSettings, + sc: SparkContext, + SQLContext: SQLContext, + fs: FileSystem) + : (Map[Seq[String], Map[ColumnMetric, (Double, Option[String])]], Map[FileMetric, (Double, Option[String])]) = { /** * Calls a unified metric calculator's constructor @@ -39,8 +41,7 @@ object MetricProcessor extends Logging { * @param paramMap parameter map * @return instance of metric calculator */ - def initGroupCalculator(tag: Class[_], - paramMap: ParamMap): MetricCalculator = { + def initGroupCalculator(tag: Class[_], paramMap: ParamMap): MetricCalculator = { tag .getConstructor(classOf[Map[String, Any]]) .newInstance(paramMap) @@ -61,13 +62,13 @@ object MetricProcessor extends Logging { // init file metric calculators val fileMetCalculators: Map[FileMetric, MetricCalculator] = fileMetrics.map { mm => - { - val calc = mm.name match { - case "ROW_COUNT" => RowCountMetricCalculator(0) //return rows count - case x => throw IllegalParameterException(x) + { + val calc = mm.name match { + case "ROW_COUNT" => RowCountMetricCalculator(0) //return rows count + case x => throw IllegalParameterException(x) + } + mm -> calc } - mm -> calc - } }.toMap /** @@ -86,26 +87,22 @@ object MetricProcessor extends Logging { * * So in the end we are initializing only unique calculators. */ - val metricsByColumn: Map[Seq[String], Seq[ColumnMetric]] = - colMetrics.groupBy(_.columns) + val metricsByColumn: Map[Seq[String], Seq[ColumnMetric]] = colMetrics.groupBy(_.columns) + + val columnsIndexes: Map[String, Int] = df.schema.fieldNames.map(s => s -> df.schema.fieldIndex(s)).toMap + val sourceKeyIds: Seq[Int] = sourceKeyFields.flatMap(i => columnsIndexes.get(i)) - val columnsIndexes: Map[String, Int] = - df.schema.fieldNames.map(s => s -> df.schema.fieldIndex(s)).toMap - val sourceKeyIds: Seq[Int] = sourceKeyFields.flatMap(i => columnsIndexes.get(i)) log.info(s"KEY FIELDS: [${sourceKeyFields.mkString(",")}]") - if (sourceKeyIds.size != sourceKeyFields.size) - log.warn("Some of key fields were not found! Please, check them.") + + if (sourceKeyIds.size != sourceKeyFields.size) log.error("Some of key fields were not found! 
Please, check them.") val dumpSize = settings.errorDumpSize - val groupedCalculators - : Map[Seq[String], Seq[(MetricCalculator, Seq[ColumnMetric])]] = + val groupedCalculators: Map[Seq[String], Seq[(MetricCalculator, Seq[ColumnMetric])]] = metricsByColumn.map { case (colId, metList) => colId -> metList - .map(mm => - (mm, - initGroupCalculator(MetricMapper.getMetricClass(mm.name), mm.paramMap))) + .map(mm => (mm, initGroupCalculator(MetricMapper.getMetricClass(mm.name), mm.paramMap))) .groupBy(_._2) .mapValues(_.map(_._1)) .toSeq @@ -113,42 +110,29 @@ object MetricProcessor extends Logging { /** * To calculate metrics we are using three-step processing: - * 1. Iterating over dataframe and passing values to the calculators + * 1. Iterating over RDD and passing values to the calculators * 2. Updating partition calculators before merging (operations like trimming, shifting, etc) * 3. Reducing (merging partition calculator) * * File and column metrics are storing separately */ - val failedRowsForMetric - : Accumulable[mutable.ArrayBuffer[(String, String)], (String, String)] = - sparkContext.accumulableCollection( - mutable.ArrayBuffer.empty[(String, String)]) - - val (columnMetricCalculators, fileMetricCalculators): (Map[ - Seq[String], - Seq[ - (MetricCalculator, - Seq[ - ColumnMetric])]], - Map[ - FileMetric, - MetricCalculator]) = + val failedRowsForMetric: Accumulable[mutable.ArrayBuffer[(String, String)], (String, String)] = + sc.accumulableCollection(mutable.ArrayBuffer.empty[(String, String)]) + + val (columnMetricCalculators, fileMetricCalculators): (Map[Seq[String], Seq[(MetricCalculator, Seq[ColumnMetric])]], + Map[FileMetric, MetricCalculator]) = df.rdd.treeAggregate((groupedCalculators, fileMetCalculators))( seqOp = { - case (( - colMetCalcs: Map[Seq[String], - Seq[(MetricCalculator, Seq[ColumnMetric])]], - fileMetCalcs: Map[FileMetric, MetricCalculator] - ), - row: Row) => - val updatedColRes - : Map[Seq[String], Seq[(MetricCalculator, Seq[ColumnMetric])]] = + case ((colMetCalcs: Map[Seq[String], Seq[(MetricCalculator, Seq[ColumnMetric])]], + fileMetCalcs: Map[FileMetric, MetricCalculator]), + row: Row) => + val updatedColRes: Map[Seq[String], Seq[(MetricCalculator, Seq[ColumnMetric])]] = colMetCalcs.map(m => { - val ids: Seq[Int] = m._1.map(x => columnsIndexes(x)) + val pos: Seq[Int] = m._2.flatMap(x => x._2.flatMap(_.positions)) + val ids: Seq[Int] = if (pos.isEmpty) m._1.map(x => columnsIndexes(x)) else pos val columnValues: Seq[Any] = ids.map(id => row.get(id)) - val incrementedCalculators - : Seq[(MetricCalculator, Seq[ColumnMetric])] = + val incrementedCalculators: Seq[(MetricCalculator, Seq[ColumnMetric])] = colMetCalcs(m._1).map { case (calc: MetricCalculator, met: Seq[ColumnMetric]) => (calc.increment(columnValues), met) @@ -164,14 +148,13 @@ object MetricProcessor extends Logging { val failedMetricIds: Iterable[String] = updatedColRes.values.flatten.collect { case (ic: StatusableCalculator, met: Seq[ColumnMetric]) - if ic.getStatus == CalculatorStatus.FAILED && ic.getFailCounter < dumpSize => + if ic.getStatus == CalculatorStatus.FAILED && ic.getFailCounter <= dumpSize => met.map(_.id) }.flatten if (failedMetricIds.nonEmpty && sourceKeyIds.nonEmpty) { - val columnValue = - sourceKeyIds.map(id => row.get(id)).mkString(",") - val metIds = failedMetricIds.mkString(",") + val columnValue = sourceKeyIds.map(id => if (row.isNullAt(id)) "" else row.get(id)).mkString("<;>") + val metIds = failedMetricIds.mkString("<;>") failedRowsForMetric.add((metIds, columnValue)) } @@ 
-179,11 +162,9 @@ object MetricProcessor extends Logging { (updatedColRes, updatedFileRes) }, combOp = (r, l) => { - val colMerged - : Map[Seq[String], Seq[(MetricCalculator, Seq[ColumnMetric])]] = + val colMerged: Map[Seq[String], Seq[(MetricCalculator, Seq[ColumnMetric])]] = l._1.map(c => { - val zipedCalcs: Seq[((MetricCalculator, Seq[ColumnMetric]), - (MetricCalculator, Seq[ColumnMetric]))] = r + val zipedCalcs: Seq[((MetricCalculator, Seq[ColumnMetric]), (MetricCalculator, Seq[ColumnMetric]))] = r ._1(c._1) zip l._1(c._1) val merged: Seq[(MetricCalculator, Seq[ColumnMetric])] = zipedCalcs.map(zc => (zc._1._1.merge(zc._2._1), zc._1._2)) @@ -197,26 +178,22 @@ object MetricProcessor extends Logging { columnMetricCalculators.values.flatten.foreach { case (calc: StatusableCalculator, metrics) => - log.info( - s"For metrics:[${metrics.map(_.id).mkString(",")}] were found ${calc.getFailCounter} errors.") + log.info(s"For metrics:[${metrics.map(_.id).mkString(",")}] were found ${calc.getFailCounter} errors.") case (_, _) => } settings.errorFolderPath match { case Some(_) => - log.info(s"Maximum error dump size: $dumpSize") val accumulator: mutable.Seq[(Array[String], String)] = failedRowsForMetric.value.map { - case (metIds, errorRow) => (metIds.split(","), errorRow) - } - val trimmedAccumulator: Map[String, mutable.Seq[String]] = accumulator - .flatMap { - case (met, row) => met.map(m => m -> row) + case (metIds, errorRow) => (metIds.split("<;>", -1), errorRow) } + val trimmedAccumulator: Map[String, mutable.Seq[Seq[String]]] = accumulator + .flatMap { case (met, row) => met.map(m => m -> row) } .groupBy(_._1) - .mapValues(_.map(_._2).take(dumpSize)) - trimmedAccumulator.foreach(metErrors => - utils.saveErrors(sourceKeyFields, metErrors)) + .mapValues(_.map(_._2.split("<;>", -1).toSeq)) + + trimmedAccumulator.foreach(metErrors => utils.saveErrors(sourceKeyFields, metErrors)) case None => log.info("No error dump path found") } @@ -234,7 +211,7 @@ object MetricProcessor extends Logging { */ // combine file metrics and results val fileMetResults: Map[FileMetric, (Double, Option[String])] = - fileMetricCalculators.map(x => x._1 -> x._2.result()(x._1.name)) + fileMetricCalculators.map(x => x._1 -> x._2.result()(x._1.name)) // init list of all metrics per column val resultsMap: Map[Seq[String], Map[String, (Double, Option[String])]] = @@ -250,53 +227,45 @@ object MetricProcessor extends Logging { */ // process metrics (SPLIT) val processedMetrics: Map[Seq[String], Seq[ColumnMetric]] = - metricsByColumn.map(col => { - - def splitMetric(baseMetric: ColumnMetric, - splitNum: Int): Seq[ColumnMetric] = { - def generateMetric(bm: ColumnMetric, - sn: Int, - aggr: Seq[ColumnMetric]): Seq[ColumnMetric] = { - if (sn > 0) { - val newMetric = ColumnMetric( - baseMetric.id + "_" + sn.toString, - baseMetric.name + "_" + sn.toString, - baseMetric.description, - baseMetric.source, - baseMetric.sourceDate, - baseMetric.columns, - baseMetric.paramMap - ) - return generateMetric(bm, sn - 1, aggr ++ Seq(newMetric)) + metricsByColumn.map(col => { + + def splitMetric(baseMetric: ColumnMetric, splitNum: Int): Seq[ColumnMetric] = { + def generateMetric(bm: ColumnMetric, sn: Int, aggr: Seq[ColumnMetric]): Seq[ColumnMetric] = { + if (sn > 0) { + val newMetric = ColumnMetric( + baseMetric.id + "_" + sn.toString, + baseMetric.name + "_" + sn.toString, + baseMetric.description, + baseMetric.source, + baseMetric.sourceDate, + baseMetric.columns, + baseMetric.paramMap + ) + return generateMetric(bm, sn - 1, aggr ++ 
Seq(newMetric)) + } + aggr } - aggr - } - generateMetric(baseMetric, splitNum, Seq.empty) - } + generateMetric(baseMetric, splitNum, Seq.empty) + } - val processed: Seq[ColumnMetric] = col._2.flatMap(metric => - metric.name match { - case "TOP_N" => - splitMetric( - metric, - metric.paramMap.getOrElse("targetNumber", 10).toString.toInt) - case _ => Seq(metric) + val processed: Seq[ColumnMetric] = col._2.flatMap(metric => + metric.name match { + case "TOP_N" => + splitMetric(metric, metric.paramMap.getOrElse("targetNumber", 10).toString.toInt) + case _ => Seq(metric) }) - (col._1, processed) - }) + (col._1, processed) + }) // combine column metrics and results - val unitedMetricResult - : Map[Seq[String], Map[ColumnMetric, (Double, Option[String])]] = + val unitedMetricResult: Map[Seq[String], Map[ColumnMetric, (Double, Option[String])]] = processedMetrics.map(colmet => { val resMap: Map[String, (Double, Option[String])] = resultsMap(colmet._1) val metResMap: Map[ColumnMetric, (Double, Option[String])] = colmet._2 .map(met => { - (met, - resMap.getOrElse(getParametrizedMetricName(met), - (0.0, Some("not_present")))) + (met, resMap.getOrElse(getParametrizedMetricName(met), (0.0, Some("not_present")))) }) .toMap (colmet._1, metResMap) diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/MetricResult.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/MetricResult.scala index 042e56d..3b60b89 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/MetricResult.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/metrics/MetricResult.scala @@ -13,6 +13,7 @@ object DQResultTypes extends Enumeration { val composed: DQResultType = Value("Composed") val file: DQResultType = Value("File") val check: DQResultType = Value("Check") + val load: DQResultType = Value("Load") } trait TypedResult { diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/sources/Source.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/sources/Source.scala index 2e6e276..4578fd7 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/sources/Source.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/sources/Source.scala @@ -5,10 +5,11 @@ import it.agilelab.bigdata.DataQuality.metrics.CalculatorStatus.Value import it.agilelab.bigdata.DataQuality.sources.SourceTypes.SourceType import it.agilelab.bigdata.DataQuality.targets.HdfsTargetConfig import it.agilelab.bigdata.DataQuality.utils.DQSettings + import org.apache.spark.sql.DataFrame -import scala.reflect._ +import org.apache.spark.storage.StorageLevel -import scala.reflect.ClassTag +import scala.reflect._ /** * Created by Gianvito Siciliano on 03/01/17. 
@@ -55,14 +56,16 @@ case class VirtualFileSelect(id: String, parentSourceIds: Seq[String], sqlQuery: String, keyfields: Seq[String], - save: Boolean = false) + save: Boolean = false, + persist: Option[StorageLevel]) extends VirtualFile(id, keyfields, save) case class VirtualFileJoinSql(id: String, parentSourceIds: Seq[String], sqlJoin: String, keyfields: Seq[String], - save: Boolean = false) + save: Boolean = false, + persist: Option[StorageLevel]) extends VirtualFile(id, keyfields, save) case class VirtualFileJoin(id: String, @@ -77,10 +80,11 @@ case class HdfsFile( id: String, path: String, fileType: String, - separator: Option[String], header: Boolean, date: String, - dependencies: List[String] = List.empty[String], + delimiter: Option[String] = None, + quote: Option[String] = None, + escape: Option[String] = None, schema: Option[Any] = None, keyfields: Seq[String] = Seq.empty ) extends SourceConfig { @@ -90,12 +94,13 @@ case class HdfsFile( tar.fileName, tar.path + "/" + tar.fileName + s".${tar.fileFormat}", tar.fileFormat, - tar.delimiter, true, - tar.date.getOrElse(settings.refDateString) - ) + tar.date.getOrElse(settings.refDateString), + tar.delimiter, + tar.quote, + tar.escape + ) } - override def getType: SourceType = SourceTypes.hdfs } diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/sources/VirtualSourceProcessor.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/sources/VirtualSourceProcessor.scala index c4b21b0..19a2e64 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/sources/VirtualSourceProcessor.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/sources/VirtualSourceProcessor.scala @@ -1,109 +1,95 @@ package it.agilelab.bigdata.DataQuality.sources import it.agilelab.bigdata.DataQuality.utils._ + +import org.apache.spark.storage.StorageLevel +import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.{DataFrame, SQLContext} +import org.apache.spark.storage.StorageLevel import scala.collection.JavaConversions.asJavaCollection /** * Created by Rocco Caruso on 12/10/17. 
*/ - object VirtualSourceProcessor { - def getActualSources(initialVirtualSourcesMap: Map[String, VirtualFile], - initialSourceMap: Map[String, Source])( - implicit sqlContext: SQLContext, - settings: DQSettings): Map[String, Source] = { + def getActualSources(initialVirtualSourcesMap: Map[String, VirtualFile], initialSourceMap: Map[String, Source])( + implicit sqlContext: SQLContext, + settings: DQSettings): Map[String, Source] = { @scala.annotation.tailrec - def loop(virtualSourcesMap: Map[String, VirtualFile], - actualSourcesMapAccumulator: Map[String, Source])( + def loop(virtualSourcesMap: Map[String, VirtualFile], actualSourcesMapAccumulator: Map[String, Source])( implicit sqlContext: SQLContext): Map[String, Source] = { - log.info( - "VIRTUAL SOURCES MAP SIZE " + virtualSourcesMap.size + " keys " + virtualSourcesMap.keySet - .mkString("-")) - log.info( - "ACTUAL SOURCES MAP SIZE " + actualSourcesMapAccumulator.size + " keys " + actualSourcesMapAccumulator.keySet - .mkString("-")) + log.info(s"Virtual sources to load: ${virtualSourcesMap.size}") + if (virtualSourcesMap.isEmpty) { + log.info(s"[SUCCESS] Virtual sources loading is complete.") actualSourcesMapAccumulator } else { val firstLevelVirtualSources: Map[String, VirtualFile] = virtualSourcesMap.filter { case (sourceId, conf: VirtualFile) => val parentIds = conf.parentSourceIds - log.info(s"* virtual source $sourceId | parentIDS ${parentIds.mkString( - "-")} sources ${actualSourcesMapAccumulator.keySet.mkString("-")}") + log.info(s" * Virtual source $sourceId | parents: ${parentIds.mkString(", ")}") actualSourcesMapAccumulator.keySet.containsAll(parentIds) } val otherSources: Map[String, Source] = firstLevelVirtualSources .map { - case (vid, virutalFile) => - virutalFile match { - case VirtualFileSelect(id, - parentSourceIds, - sqlCode, - keyfields, - _) => - log.info("VIRTUAL SOURCE SELECT " + vid) + case (vid, virtualFile) => + virtualFile match { + case VirtualFileSelect(id, parentSourceIds, sqlCode, keyfields, save, persist) => val firstParent = parentSourceIds.head - log.info("FIRST PARENT " + firstParent) - val dfSource = - actualSourcesMapAccumulator.get(firstParent).head + log.info(s"Processing '$id', type: 'FILTER-SQL', parent: '$firstParent'") + log.info(s"SQL: $sqlCode") + val dfSource = actualSourcesMapAccumulator.get(firstParent).head dfSource.df.registerTempTable(firstParent) val virtualSourceDF = sqlContext.sql(sqlCode) - Source(vid, - settings.refDateString, - virtualSourceDF, - keyfields) - case VirtualFileJoinSql(id, - parentSourceIds, - sqlCode, - keyfields, - _) => - log.info("VIRTUAL JOIN " + sqlCode) - val leftParent = parentSourceIds.head + //persist feature + if (persist.isDefined) { + virtualSourceDF.persist(persist.getOrElse(throw new RuntimeException("Something is wrong!"))) + log.info(s"Persisting VS $id (${persist.get.description})...") + } + + Source(vid, settings.refDateString, virtualSourceDF, keyfields) + + case VirtualFileJoinSql(id, parentSourceIds, sqlCode, keyfields, save, persist) => + val leftParent = parentSourceIds.head val rightParent = parentSourceIds(1) - log.info("LEFT PARENT " + leftParent) - log.info("RIGHT PARENT " + rightParent) - val dfSourceLeft: DataFrame = - actualSourcesMapAccumulator(leftParent).df - val dfSourceRight: DataFrame = - actualSourcesMapAccumulator(rightParent).df - val colLeft = dfSourceLeft.columns.toSeq.mkString(",") + log.info(s"Processing '$id', type: 'JOIN-SQL', parent: L:'$leftParent', R:'$rightParent'") + log.info(s"SQL: $sqlCode") + + val dfSourceLeft: 
DataFrame = actualSourcesMapAccumulator(leftParent).df + val dfSourceRight: DataFrame = actualSourcesMapAccumulator(rightParent).df + val colLeft = dfSourceLeft.columns.toSeq.mkString(",") val colRight = dfSourceRight.columns.toSeq.mkString(",") + dfSourceLeft.registerTempTable(leftParent) dfSourceRight.registerTempTable(rightParent) - log.info(s"column left $colLeft") - log.info(s"column right $colRight") + + log.debug(s"column left $colLeft") + log.debug(s"column right $colRight") val virtualSourceDF = sqlContext.sql(sqlCode) - log.info("VIRTUAL JOIN" + virtualSourceDF.explain()) - - Source(vid, - settings.refDateString, - virtualSourceDF, - keyfields) - - case VirtualFileJoin(id, - parentSourceIds, - joiningColumns, - joinType, - keyfields, - _) => - log.info("VIRTUAL JOIN " + joiningColumns.mkString("-")) - - val leftParent = parentSourceIds.head + + //persist feature + if (persist.isDefined) { + virtualSourceDF.persist(persist.getOrElse(throw new RuntimeException("Something is wrong!"))) + log.info(s"Persisting VS $id (${persist.get.description})...") + } + + Source(vid, settings.refDateString, virtualSourceDF, keyfields) + + case VirtualFileJoin(id, parentSourceIds, joiningColumns, joinType, keyfields, _) => + val leftParent = parentSourceIds.head val rightParent = parentSourceIds(1) - log.info("LEFT PARENT " + leftParent) - log.info("RIGHT PARENT " + rightParent) + log.info(s"Processing '$id', type: 'JOIN', parent: L:'$leftParent', R:'$rightParent'") + val dfSourceLeft = actualSourcesMapAccumulator(leftParent).df - val dfSourceRight = - actualSourcesMapAccumulator(rightParent).df + val dfSourceRight = actualSourcesMapAccumulator(rightParent).df val colLeftRenamedLeft: Array[(String, String)] = dfSourceLeft.columns @@ -114,30 +100,24 @@ object VirtualSourceProcessor { .filter(c => !joiningColumns.contains(c)) .map(colName => (colName, s"r_$colName")) - val dfLeftRenamed = colLeftRenamedLeft.foldLeft(dfSourceLeft)( - (dfAcc, cols) => dfAcc.withColumnRenamed(cols._1, cols._2)) - val dfRightRenamed = - colLeftRenamedRight.foldLeft(dfSourceRight)((dfAcc, cols) => - dfAcc.withColumnRenamed(cols._1, cols._2)) + val dfLeftRenamed = colLeftRenamedLeft + .foldLeft(dfSourceLeft)((dfAcc, cols) => dfAcc.withColumnRenamed(cols._1, cols._2)) + val dfRightRenamed = colLeftRenamedRight + .foldLeft(dfSourceRight)((dfAcc, cols) => dfAcc.withColumnRenamed(cols._1, cols._2)) - val colLeft = dfLeftRenamed.columns.toSeq.mkString(",") + val colLeft = dfLeftRenamed.columns.toSeq.mkString(",") val colRight = dfRightRenamed.columns.toSeq.mkString(",") dfLeftRenamed.registerTempTable(leftParent) dfRightRenamed.registerTempTable(rightParent) - log.info(s"column left $colLeft") - log.info(s"column right $colRight") + log.debug(s"column left $colLeft") + log.debug(s"column right $colRight") val virtualSourceDF = dfLeftRenamed.join(dfRightRenamed, joiningColumns, joinType) - log.info("VIRTUAL JOIN" + virtualSourceDF.explain()) - - Source(vid, - settings.refDateString, - virtualSourceDF, - keyfields) + Source(vid, settings.refDateString, virtualSourceDF, keyfields) } } diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/targets/Target.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/targets/Target.scala index 530e72f..3cbfaee 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/targets/Target.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/targets/Target.scala @@ -1,32 +1,51 @@ package it.agilelab.bigdata.DataQuality.targets +import 
it.agilelab.bigdata.DataQuality.utils.enums +import it.agilelab.bigdata.DataQuality.utils.enums.Targets +import it.agilelab.bigdata.DataQuality.utils.enums.Targets.TargetType /** - * Created by Gianvito Siciliano on 02/01/17. + * Base target trait */ trait TargetConfig { - def getType: String + def getType: TargetType } +/** + * System target configuration. Send an email and save a file if some of the checks are failing + * @param id Target id + * @param checkList List of check to watch + * @param mailList List of notification recipients + * @param outputConfig Output file configuration + */ case class SystemTargetConfig( id: String, checkList: Seq[String], mailList: Seq[String], outputConfig: TargetConfig ) extends TargetConfig { - override def getType: String = "SYSTEM" + override def getType: enums.Targets.Value = Targets.system } /** - * Representation of file to save + * HDFS file target configuration + * @param fileName Name of the output file + * @param fileFormat File type (csv, avro) + * @param path desired path + * @param delimiter delimiter + * @param quote quote char + * @param escape escape char + * @param date output date + * @param quoteMode quote mode (refer to spark-csv) */ case class HdfsTargetConfig( fileName: String, fileFormat: String, path: String, delimiter: Option[String] = None, + quote: Option[String] = None, + escape: Option[String] = None, date: Option[String] = None, - savemode: Option[String] = None, - quoted: Boolean = false + quoteMode: Option[String] = None ) extends TargetConfig { - override def getType: String = "HDFS" + override def getType: enums.Targets.Value = Targets.hdfs } diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/DQCommandLineOptions.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/DQCommandLineOptions.scala index 168f21b..279e588 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/DQCommandLineOptions.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/DQCommandLineOptions.scala @@ -16,21 +16,25 @@ object DQCommandLineOptions { def parser(): OptionParser[DQCommandLineOptions] = new OptionParser[DQCommandLineOptions]("dataquality") { + opt[String]('a', "application-conf") required () action { (x, c) => c.copy(applicationConf = x) } text "Path to application configuration file" + opt[String]('c', "configFilePath") required () action { (x, c) => c.copy(configFilePath = x) } text "Path to run configuration file" + opt[Calendar]('d', "reference-date") required () action { (x, c) => c.copy(refDate = x.getTime) - } text "Indicates the date at which the DataQuality checks will be performed" + } text "Indicates the date at which the DataQuality checks will be performed (format YYYY-MM-DD)" + opt[Unit]('r', "repartition") optional () action { (_, c) => c.copy(repartition = true) } text "Specifies whether the application is repartitioning the input data" + opt[Unit]('l', "local") optional () action { (_, c) => c.copy(local = true) } text "Specifies whether the application is operating in local mode" } - } diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/DQMainClass.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/DQMainClass.scala index 6e28935..6cb4548 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/DQMainClass.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/DQMainClass.scala @@ -2,8 +2,8 @@ package it.agilelab.bigdata.DataQuality.utils import java.util.Locale -import 
it.agilelab.bigdata.DataQuality.utils.io.LocalDBManager -import org.apache.hadoop.fs.FileSystem +import it.agilelab.bigdata.DataQuality.utils.io.HistoryDBManager +import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext @@ -23,31 +23,36 @@ trait DQMainClass { this: DQSparkContext with Logging => Logger.getLogger("org.apache.hadoop.hdfs.KeyProviderCache").setLevel(Level.OFF) } - private def makeFileSystem(sc: SparkContext) = { + private def makeFileSystem(settings: DQSettings, sc: SparkContext): FileSystem = { if (sc.isLocal) FileSystem.getLocal(sc.hadoopConfiguration) - else FileSystem.get(sc.hadoopConfiguration) + else{ + + if (!settings.s3Bucket.isEmpty) { + sc.hadoopConfiguration.set("fs.defaultFS", settings.s3Bucket) + sc.hadoopConfiguration.set("fs.AbstractFileSystem.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") + } + + FileSystem.get( sc.hadoopConfiguration) + + } } protected def body()(implicit fs: FileSystem, sparkContext: SparkContext, sqlContext: SQLContext, - sqlWriter: LocalDBManager, + sqlWriter: HistoryDBManager, settings: DQSettings): Boolean def preMessage(task: String): Unit = { - log.warn( - "************************************************************************") - log.warn(s" STARTING EXECUTION OF TASK $task") - log.warn( - "************************************************************************") + log.warn("************************************************************************") + log.warn(s" Starting execution of task $task") + log.warn("************************************************************************") } def postMessage(task: String): Unit = { - log.warn( - "************************************************************************") - log.warn(s" FINISHED EXECUTION OF TASK $task") - log.warn( - "************************************************************************") + log.warn("************************************************************************") + log.warn(s" Finishing execution of task $task") + log.warn("************************************************************************") } def main(args: Array[String]): Unit = { @@ -59,44 +64,36 @@ trait DQMainClass { this: DQSparkContext with Logging => case Some(commandLineOptions) => // Load our own config values from the default location, application.conf val settings = new DQSettings(commandLineOptions) + val sparkContext = makeSparkContext(settings) + val fs = makeFileSystem(settings, sparkContext) - log.info("Mailing mode: " + settings.mailingMode) - settings.mailingConfig match { - case Some(mconf) => log.info("With configuration: " + mconf.toString) - case None => - } + settings.logThis()(log) - log.info(s"Creating SparkContext, SqlContext and FileSystem...") - val sparkContext = makeSparkContext(settings) - val sqlContext: SQLContext = if (settings.hiveDir.nonEmpty) { - log.info(s"Hive context created with hive dir ${settings.hiveDir}") + val sqlContext: SQLContext = if (settings.hiveDir.isDefined) { val hc = new HiveContext(sparkContext) - hc.setConf("hive.metastore.warehouse.dir", settings.hiveDir) + hc.setConf("hive.metastore.warehouse.dir", settings.hiveDir.get) hc - } else { - makeSqlContext(sparkContext) - } + } else makeSqlContext(sparkContext) - val fs = makeFileSystem(sparkContext) - val localSqlWriter = new LocalDBManager(settings) + val historyDatabase = new HistoryDBManager(settings) - preMessage(s"{${settings.appName}}") + // Starting application body + 
preMessage(s"{Data Quality ${settings.appName}}") val startTime = System.currentTimeMillis() - body()(fs, sparkContext, sqlContext, localSqlWriter, settings) - postMessage(s"{${settings.appName}}") + body()(fs, sparkContext, sqlContext, historyDatabase, settings) + postMessage(s"{Data Quality ${settings.appName}}") - log.info( - s"Execution finished in [${(System.currentTimeMillis() - startTime) / 60000}] min(s)") - log.info("Closing application") + log.info(s"Execution finished in [${(System.currentTimeMillis() - startTime) / 60000}] min(s)") + log.info("Closing application...") - localSqlWriter.closeConnection() + historyDatabase.closeConnection() sparkContext.stop() - log.info("Spark context terminated. Exiting...") + log.info("Spark context were terminated. Exiting...") case None => - log.error("WRONG PARAMS") - throw new Exception("WRONG PARAMS") + log.error("Wrong parameters provided") + throw new Exception("Wrong parameters provided") } diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/DQSettings.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/DQSettings.scala index a22a42f..f9d91ab 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/DQSettings.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/DQSettings.scala @@ -7,15 +7,29 @@ import it.agilelab.bigdata.DataQuality.exceptions.IllegalParameterException import it.agilelab.bigdata.DataQuality.sources.DatabaseConfig import it.agilelab.bigdata.DataQuality.targets.HdfsTargetConfig import it.agilelab.bigdata.DataQuality.utils +import it.agilelab.bigdata.DataQuality.utils.mailing.MailerConfiguration +import org.apache.log4j.Logger import org.joda.time import org.joda.time.DateTime import org.joda.time.format.{DateTimeFormat, DateTimeFormatter} import scala.util.Try +import scala.reflect.runtime.universe._ + + +object DQSettings { + def getConfigOption[T: TypeTag](path: String, conf: Config): Option[T] = { + val values = typeOf[T] match { + case x if x =:= typeOf[String] => Try(conf.getString(path)).toOption.filter(_.nonEmpty) + case x if x =:= typeOf[Int] => Try(conf.getInt(path)).toOption + case x if x =:= typeOf[Boolean] => Try(conf.getBoolean(path)).toOption + case _ => None + } + + values.map(_.asInstanceOf[T]) + } +} -/** - * Created by Paolo on 20/01/2017. 
- */ class DQSettings(conf: Config, val configFilePath: String, val repartition: Boolean, @@ -24,7 +38,7 @@ class DQSettings(conf: Config, def this(commandLineOpts: DQCommandLineOptions) { this( - ConfigFactory.parseFile(new File(commandLineOpts.applicationConf)).getConfig("dataquality").resolve(), + ConfigFactory.parseFile(new File(commandLineOpts.applicationConf)).getConfig("data_quality").resolve(), commandLineOpts.configFilePath, commandLineOpts.repartition, commandLineOpts.local, @@ -32,58 +46,78 @@ class DQSettings(conf: Config, ) } - private val inputArgDateFormat: DateTimeFormatter = - DateTimeFormat.forPattern("yyyy-MM-dd") + private val inputArgDateFormat: DateTimeFormatter = DateTimeFormat.forPattern(utils.applicationDateFormat) lazy val refDateString: String = ref_date.toString(inputArgDateFormat) /* application.conf parameters */ + val appName: String = Try(conf.getString("application_name")).toOption.getOrElse("Data_Quality") + + val s3Bucket: String = Try(conf.getString("s3_bucket")).toOption.getOrElse("") + + val hiveDir: Option[String] = DQSettings.getConfigOption[String]("hive_warehouse_path", conf) + val hbaseHost: Option[String] = DQSettings.getConfigOption[String]("hbase_host", conf) + val localTmpPath: Option[String] = DQSettings.getConfigOption[String]("tmp_files_management.local_fs_path", conf) + val hdfsTmpPath: Option[String] = DQSettings.getConfigOption[String]("tmp_files_management.hdfs_path", conf) + val tmpFileDelimiter: Option[String] = DQSettings.getConfigOption[String]("tmp_files_management.delimiter", conf) + + // Error managements parameters + val errorFolderPath: Option[String] = DQSettings.getConfigOption[String]("metric_error_management.dump_directory_path", conf) + val errorDumpSize: Int = DQSettings.getConfigOption[Int]("metric_error_management.dump_size", conf).getOrElse(1000) + + val errorFileFormat: String = DQSettings.getConfigOption[String]("metric_error_management.file_config.format", conf).getOrElse("csv") + val errorFileDelimiter: Option[String] = DQSettings.getConfigOption[String]("metric_error_management.file_config.delimiter", conf) + val errorFileQuote: Option[String] = DQSettings.getConfigOption[String]("metric_error_management.file_config.quote", conf) + val errorFileEscape: Option[String] = DQSettings.getConfigOption[String]("metric_error_management.file_config.escape", conf) + val errorFileQuoteMode: Option[String] = DQSettings.getConfigOption[String]("metric_error_management.file_config.quote_mode", conf) + + // Virtual sources parameters val vsDumpConfig: Option[HdfsTargetConfig] = Try { - val obj: Config = conf.getConfig("vsDumpConfig") - utils.parseTargetConfig(obj).get + val obj: Config = conf.getConfig("virtual_sources_management") + val path = DQSettings.getConfigOption[String]("dump_directory_path", obj).get + val fileFormat = DQSettings.getConfigOption[String]("file_format", obj).get + val delimiter = DQSettings.getConfigOption[String]("delimiter", obj) + HdfsTargetConfig.apply("vsd", fileFormat, path, delimiter) }.toOption - val appName: String = Try(conf.getString("appName")).toOption.getOrElse("") - val appDir: String = Try(conf.getString("appDirectory")).toOption.getOrElse("") - val errorDumpSize: Int = Try(conf.getInt("errorDumpSize")).toOption.getOrElse(1000) - val errorFolderPath: Option[String] = Try(conf.getString("errorFolderPath")).toOption - val hiveDir: String = Try(conf.getString("hiveDir")).toOption.getOrElse("") - val hbaseHost: String = Try(conf.getString("hbaseDir")).toOption.getOrElse("") - val 
hadoopConfDir: String = Try(conf.getString("hadoopConfDir")).toOption.getOrElse("") - val mailingMode: String = Try(conf.getString("mailing.mode").toLowerCase).getOrElse("internal") - val mailingConfig: Option[Mailer] = { + val mailingMode: Option[String] = DQSettings.getConfigOption[String]("mailing.mode", conf) + val scriptPath: Option[String] = DQSettings.getConfigOption[String]("mailing.mail_script_path", conf) + val mailingConfig: Option[MailerConfiguration] = { val monfig = Try(conf.getConfig("mailing.conf")).toOption monfig match { - case Some(c) => Try(new Mailer(c)).toOption + case Some(c) => Try(new MailerConfiguration(c)).toOption case None => None } } - private val storageType: String = conf.getString("storage.type") - private val storageConfig: Config = conf.getConfig("storage.config") - // todo add new storage types - val resStorage: Product = storageType match { - case "DB" => new DatabaseConfig(storageConfig) + val notifications: Boolean = DQSettings.getConfigOption[Boolean]("mailing.notifications", conf).getOrElse(false) + + val resStorage: Option[DatabaseConfig] = conf.getString("storage.type") match { + case "DB" => Some(new DatabaseConfig(conf.getConfig("storage.config"))) + case "NONE" => None case x => throw IllegalParameterException(x) } -} -case class Mailer( - address: String, - hostName: String, - username: String, - password: String, - smtpPortSSL: Int, - sslOnConnect: Boolean - ) { - def this(config: Config) = { - this( - config.getString("address"), - config.getString("hostname"), - Try(config.getString("username")).getOrElse(""), - Try(config.getString("password")).getOrElse(""), - Try(config.getInt("smtpPort")).getOrElse(465), - Try(config.getBoolean("sslOnConnect")).getOrElse(true) - ) + def logThis()(implicit log: Logger): Unit = { + log.info(s"[CONF] General application configuration:") + log.info(s"[CONF] - HBase host: ${this.hbaseHost}") + log.info(s"[CONF] - Hive warehouse path: ${this.hiveDir}") + log.info(s"[CONF] - Metric error management configuration:") + log.info(s"[CONF] - Dump directory path: ${this.errorFolderPath}") + log.info(s"[CONF] - Dump size: ${this.errorDumpSize}") + log.info(s"[CONF] - Temporary files management configuration:") + log.info(s"[CONF] - Local FS path: ${this.localTmpPath}") + log.info(s"[CONF] - HDFS path: ${this.hdfsTmpPath}") + log.info(s"[CONF] - Virtual sources management configuration:") + log.info(s"[CONF] - Dump path: ${this.vsDumpConfig.map(_.path)}") + log.info(s"[CONF] - File format: ${this.vsDumpConfig.map(_.fileFormat)}") + log.info(s"[CONF] - Delimiter: ${this.vsDumpConfig.map(_.delimiter)}") + log.info(s"[CONF] - Storage configuration:") + log.info(s"[CONF] - Mode: ${conf.getString("storage.type")}") + log.info(s"[CONF] - Mailing configuration:") + log.info(s"[CONF] - Mode: ${this.mailingMode}") + log.info(s"[CONF] - Script path: ${this.scriptPath}") + log.info(s"[CONF] - Notifications: ${this.notifications}") } + } diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/DQSparkContext.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/DQSparkContext.scala index 9639f2f..97a9c56 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/DQSparkContext.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/DQSparkContext.scala @@ -14,12 +14,14 @@ trait DQSparkContext { protected def withSparkConf(settings: DQSettings)( f: SparkConf => SparkContext): SparkContext = { val conf = new SparkConf() - .setAppName(settings.appName) + 
.setAppName(s"${settings.appName} Data Quality") .set("spark.serializer", serializerClassName) .set("spark.kryoserializer.buffer.max", "128") .set("spark.sql.parquet.compression.codec", "snappy") + + if (!settings.s3Bucket.isEmpty) conf.set("spark.sql.warehouse.dir", settings.s3Bucket + "/data_quality_output/spark/warehouse") if (settings.local) conf.setMaster("local[*]") - if (settings.hbaseHost.nonEmpty) conf.set("spark.hbase.host", settings.hbaseHost) + if (settings.hbaseHost.isDefined) conf.set("spark.hbase.host", settings.hbaseHost.get) f(conf) } @@ -28,7 +30,7 @@ trait DQSparkContext { new SparkContext(conf) } - protected def makeSqlContext(sc: SparkContext) = { + protected def makeSqlContext(sc: SparkContext): SQLContext = { new SQLContext(sc) } diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/Logging.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/Logging.scala index 2a35ab2..54e0c93 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/Logging.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/Logging.scala @@ -3,5 +3,5 @@ package it.agilelab.bigdata.DataQuality.utils import org.apache.log4j.Logger trait Logging { - @transient lazy val log = Logger.getLogger(getClass.getName) + @transient lazy val log: Logger = Logger.getLogger(getClass.getName) } diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/enums.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/enums.scala new file mode 100644 index 0000000..e9d55ae --- /dev/null +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/enums.scala @@ -0,0 +1,23 @@ +package it.agilelab.bigdata.DataQuality.utils + +object enums { + + trait ConfigEnum extends Enumeration { + def names: Set[String] = values.map(_.toString) + def withNameOpt(s: String): Option[Value] = values.find(_.toString == s) + def contains(s: String): Boolean = names.contains(s) + } + + object Entities extends ConfigEnum { + val databases: Value = Value("Databases") + val sources: Value = Value("Sources") + val virtual: Value = Value("VirtualSources") + } + + object Targets extends ConfigEnum { + val system: Value = Value("system") + val hdfs: Value = Value("hdfs") + + type TargetType = Value + } +} diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/io/HdfsReader.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/io/HdfsReader.scala index 9f7eb9a..f5632f8 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/io/HdfsReader.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/io/HdfsReader.scala @@ -11,7 +11,7 @@ import org.apache.avro.Schema import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.rdd.RDD import org.apache.spark.sql.types._ -import org.apache.spark.sql.{DataFrame, Row, SQLContext} +import org.apache.spark.sql.{DataFrame, DataFrameReader, Row, SQLContext} import org.joda.time.DateTime import scala.collection.JavaConversions._ @@ -33,15 +33,13 @@ object HdfsReader extends Logging { * @param settings dataquality configuration * @return sequency of dataframes */ - def load(inputConf: HdfsFile, refDate: DateTime)( - implicit fs: FileSystem, - sqlContext: SQLContext, - settings: DQSettings): Seq[DataFrame] = { + def load(inputConf: HdfsFile, + refDate: DateTime)(implicit fs: FileSystem, sqlContext: SQLContext, settings: DQSettings): Seq[DataFrame] = { - log.warn(refDate) // replaces {{yyyyMMdd}} in the source path val finalPath = 
PathUtils.replaceDateInPath(inputConf.path, refDate) + log.info(s"Loading ${inputConf.fileType.toUpperCase} file -> ${inputConf.id}:$finalPath") inputConf.fileType.toUpperCase match { case "CSV" => loadCsv(inputConf, finalPath) case "PARQUET" => loadParquet(inputConf, finalPath) @@ -59,20 +57,17 @@ object HdfsReader extends Logging { * @param sqlContext sql context * @return sequence of dataframes */ - def loadOutput(inputConf: OutputFile)( - implicit fs: FileSystem, - sqlContext: SQLContext): Seq[(String, DataFrame)] = { + @deprecated + def loadOutput(inputConf: OutputFile)(implicit fs: FileSystem, sqlContext: SQLContext): Seq[(String, DataFrame)] = { import sqlContext.implicits._ - log.info(s"Starting load ${inputConf.fileType.toUpperCase}") - if (!fs.exists(new Path(inputConf.path))) { log.warn("fixed input file: " + inputConf.path + " not found!") Nil } else { log.warn("loading fixed input file: " + inputConf.id) - val fileMetrics = sqlContext.sparkContext.textFile(inputConf.path + "/") + val fileMetrics = sqlContext.sparkContext.textFile(inputConf.path + "/") val columnMetrics = sqlContext.sparkContext.textFile(inputConf.path + "/") val columnData = @@ -108,11 +103,9 @@ object HdfsReader extends Logging { * @param sqlContext sql context * @return sequence of dataframes */ - private def loadWithSchema(inputConf: HdfsFile, filePath: String)( - implicit fs: FileSystem, - sqlContext: SQLContext): Seq[DataFrame] = { - log.info( - s"Starting load ${inputConf.fileType.toUpperCase} file -> ${filePath}") + private def loadWithSchema(inputConf: HdfsFile, filePath: String)(implicit fs: FileSystem, + sqlContext: SQLContext): Seq[DataFrame] = { + log.info(s"Starting load ${inputConf.fileType.toUpperCase} file -> ${filePath}") val result: Option[DataFrame] = if (!fs.exists(new Path(filePath))) { log.warn("fixed input file: " + filePath + " not found!") @@ -122,8 +115,7 @@ object HdfsReader extends Logging { val fieldSeq: List[GenStructColumn] = inputConf.schema.get match { case xs: List[_] => - xs.filter( - _ match { case _: GenStructColumn => true; case _ => false }) + xs.filter(_ match { case _: GenStructColumn => true; case _ => false }) .asInstanceOf[List[GenStructColumn]] case s: String => tryToLoadSchema(s) case e => throw IllegalParameterException(e.toString) @@ -132,8 +124,7 @@ object HdfsReader extends Logging { val ff: RDD[Row] = sqlContext.sparkContext.textFile(filePath).map { x => getRow(x, fieldSeq) } - val schema = StructType( - fieldSeq.map(x => StructField(x.name, StringType, nullable = true))) + val schema = StructType(fieldSeq.map(x => StructField(x.name, StringType, nullable = true))) Option(sqlContext.createDataFrame(ff, schema)) } @@ -149,14 +140,13 @@ object HdfsReader extends Logging { */ private def getRow(x: String, fields: List[GenStructColumn]) = { val columnArray = new Array[String](fields.size) - var pos = 0 + var pos = 0 fields.head.getType match { case "StructFixedColumn" => val ll: List[StructFixedColumn] = fields.map(_.asInstanceOf[StructFixedColumn]) ll.zipWithIndex.foreach { field => - columnArray(field._2) = - Try(x.substring(pos, pos + field._1.length).trim).getOrElse(null) + columnArray(field._2) = Try(x.substring(pos, pos + field._1.length).trim).getOrElse(null) pos += field._1.length } case _ => IllegalParameterException(fields.head.toString) @@ -173,19 +163,12 @@ object HdfsReader extends Logging { * @param settings dataquality configuration * @return sequence of dataframes */ - private def loadAvro(inputConf: HdfsFile, filePath: String)( - implicit fs: 
FileSystem, - sqlContext: SQLContext, - settings: DQSettings): Seq[DataFrame] = { - log.info( - s"Starting load ${inputConf.fileType.toUpperCase} file -> ${filePath}") + private def loadAvro(inputConf: HdfsFile, filePath: String)(implicit fs: FileSystem, + sqlContext: SQLContext, + settings: DQSettings): Seq[DataFrame] = { val result: Option[DataFrame] = - if (!fs.exists(new Path(filePath))) { - log.warn("avro input file: " + inputConf.id + " not found!") - None - } else { - log.warn("loading avro input file: " + inputConf.id) + if (!fs.exists(new Path(filePath))) { None } else { // It's possible to provide a scheme, so the following code splits the workflow val schema = Try { @@ -200,24 +183,24 @@ object HdfsReader extends Logging { } }.toOption - val res = schema match { - case Some(sc) => - sqlContext.read - .format("com.databricks.spark.avro") - .option("avroSchema", sc.toString) - .load(filePath) - case None => - if (inputConf.schema.isDefined) - log.warn("Failed to load the schema from file") - sqlContext.read - .format("com.databricks.spark.avro") - .load(filePath) - } + Try { + val res: DataFrame = schema match { + case Some(sc) => + sqlContext.read + .format("com.databricks.spark.avro") + .option("avroSchema", sc.toString) + .load(filePath) + case None => + if (inputConf.schema.isDefined) + log.warn("Failed to load the schema from file") + sqlContext.read + .format("com.databricks.spark.avro") + .load(filePath) + } - if (settings.repartition) - Some(res.repartition(sqlContext.sparkContext.defaultParallelism)) - else - Some(res) + if (settings.repartition) res.repartition(sqlContext.sparkContext.defaultParallelism) + else res + }.toOption } result.map(Seq(_)).getOrElse(Nil) } @@ -230,11 +213,8 @@ object HdfsReader extends Logging { * @param sqlContext sql context * @return sequence of dataframes */ - private def loadParquet(inputConf: HdfsFile, filePath: String)( - implicit fs: FileSystem, - sqlContext: SQLContext): Seq[DataFrame] = { - log.info( - s"Starting load ${inputConf.fileType.toUpperCase} file -> ${filePath}") + private def loadParquet(inputConf: HdfsFile, filePath: String)(implicit fs: FileSystem, + sqlContext: SQLContext): Seq[DataFrame] = { val result: Option[DataFrame] = if (!fs.exists(new Path(filePath))) { @@ -259,12 +239,9 @@ object HdfsReader extends Logging { * @param settings dataquality configuration * @return sequence of dataframes */ - private def loadCsv(inputConf: HdfsFile, filePath: String)( - implicit fs: FileSystem, - sqlContext: SQLContext, - settings: DQSettings): Seq[DataFrame] = { - log.info( - s"Starting load ${inputConf.fileType.toUpperCase} file -> ${filePath}") + private def loadCsv(inputConf: HdfsFile, filePath: String)(implicit fs: FileSystem, + sqlContext: SQLContext, + settings: DQSettings): Seq[DataFrame] = { val schema: Option[List[StructField]] = Try { inputConf.schema.get match { @@ -285,23 +262,30 @@ object HdfsReader extends Logging { } }.toOption - log.info("schema " + schema) + log.info(s"File header: ${inputConf.header}") + log.info("Entered schema: " + schema) + if (inputConf.header && schema.isDefined) + throw new IllegalArgumentException( + "Source can't have schema and header at the same time. 
Please, check the configuration file...") - val resultReader = sqlContext.read + val resultReader: DataFrameReader = sqlContext.read .format("com.databricks.spark.csv") .option("header", inputConf.header.toString) - .option("delimiter", inputConf.separator.getOrElse(",")) - - val result = schema match { - case Some(sc) => resultReader.schema(StructType(sc)).load(filePath) - case None => resultReader.load(filePath) - } + .option("delimiter", inputConf.delimiter.getOrElse(",")) + .option("quote", inputConf.quote.getOrElse("\"")) + .option("escape", inputConf.escape.getOrElse("\\")) + + val result = Try { + val res: DataFrame = schema match { + case Some(sc) => resultReader.schema(StructType(sc)).load(filePath) + case None => resultReader.load(filePath) + } -// log.info("result count "+result.count()) - if (settings.repartition) - return Seq(result.repartition(sqlContext.sparkContext.defaultParallelism)) + if (settings.repartition) res.repartition(sqlContext.sparkContext.defaultParallelism) + else res + }.toOption - Seq(result) + result.map(Seq(_)).getOrElse(Nil) } /** @@ -315,8 +299,7 @@ object HdfsReader extends Logging { var result = value - while (result != null && result.startsWith(quote)) result = - result.substring(1) + while (result != null && result.startsWith(quote)) result = result.substring(1) while (result != null && result.endsWith(quote)) if (result == quote) result = "" @@ -350,8 +333,7 @@ object HdfsReader extends Logging { * @param fs file system * @return list of column object */ - private def tryToLoadSchema(filePath: String)( - implicit fs: FileSystem): List[GenStructColumn] = { + private def tryToLoadSchema(filePath: String)(implicit fs: FileSystem): List[GenStructColumn] = { if (!fs.exists(new Path(filePath))) { log.warn("Schema does not exists") throw IllegalParameterException(filePath) @@ -360,7 +342,7 @@ object HdfsReader extends Logging { ConfigFactory.parseFile(new File(filePath)).resolve() }.getOrElse(Try { val inputStream = fs.open(new Path(filePath)) - val reader = new InputStreamReader(inputStream) + val reader = new InputStreamReader(inputStream) ConfigFactory.parseReader(reader) }.getOrElse(throw IllegalParameterException(filePath))) @@ -368,8 +350,8 @@ object HdfsReader extends Logging { val structColumns: List[ConfigObject] = configObj.getObjectList("Schema").toList structColumns.map(col => { - val conf = col.toConfig - val name = conf.getString("name") + val conf = col.toConfig + val name = conf.getString("name") val typeConf = conf.getString("type") Try { conf.getInt("length") diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/io/HdfsWriter.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/io/HdfsWriter.scala index 2687acb..0b345a5 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/io/HdfsWriter.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/io/HdfsWriter.scala @@ -2,10 +2,11 @@ package it.agilelab.bigdata.DataQuality.utils.io import java.io.IOException -import it.agilelab.bigdata.DataQuality.checks.{CheckFailure, CheckResult} +import it.agilelab.bigdata.DataQuality.checks.{CheckFailure, CheckResult, LoadCheckResult} import it.agilelab.bigdata.DataQuality.exceptions.IllegalParameterException import it.agilelab.bigdata.DataQuality.metrics._ import it.agilelab.bigdata.DataQuality.targets.{HdfsTargetConfig, SystemTargetConfig, TargetConfig} +import it.agilelab.bigdata.DataQuality.utils.enums.Targets import it.agilelab.bigdata.DataQuality.utils.{Logging, _} import 
org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, FileUtil, Path} @@ -20,21 +21,18 @@ import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} */ object HdfsWriter extends Logging { - def processSystemTarget(conf: TargetConfig, - finalCheckResults: Seq[CheckResult])( - implicit sqlContext: SQLContext, - fs: FileSystem, - settings: DQSettings): Unit = { + def processSystemTarget(conf: TargetConfig, finalCheckResults: Seq[CheckResult])(implicit sqlContext: SQLContext, + fs: FileSystem, + settings: DQSettings): Unit = { val systemConfig: SystemTargetConfig = conf.asInstanceOf[SystemTargetConfig] val requestedChecks: Seq[CheckResult] = finalCheckResults.filter(x => systemConfig.checkList.contains(x.checkId)) - val numOfFailedChecks: Int = requestedChecks.count(checkRes => - checkRes.status == CheckFailure.stringValue) + val numOfFailedChecks: Int = requestedChecks.count(checkRes => checkRes.status == CheckFailure.stringValue) - systemConfig.outputConfig.getType.toUpperCase match { - case "HDFS" => + systemConfig.outputConfig.getType match { + case Targets.hdfs => val failedCheckIds: String = requestedChecks .filter(checkRes => checkRes.status == CheckFailure.stringValue) .map(x => x.checkId) @@ -45,30 +43,26 @@ object HdfsWriter extends Logging { this.save(hdfsFileConfig, requestedChecks) if (numOfFailedChecks > 0) { - log.warn( - s"$numOfFailedChecks of requested check failed. Sending alert email...") + log.warn(s"$numOfFailedChecks of requested check failed. Sending alert email...") val fullpath = hdfsFileConfig.path + "/" + hdfsFileConfig.fileName + s"_${settings.refDateString}" + "." + hdfsFileConfig .fileFormat (settings.mailingMode, settings.mailingConfig) match { - case ("internal", _) => - sendBashMail(numOfFailedChecks, - failedCheckIds, - fullpath, - systemConfig) - case ("external", Some(mconf)) => + case (Some("internal"), _) => + sendBashMail(numOfFailedChecks, failedCheckIds, fullpath, systemConfig) + case (Some("external"), Some(mconf)) => sendMail(systemConfig.mailList, None, fullpath)(mconf) case (_, _) => log.error("Mailing configuration is incorrect!") } } + case x => throw new IllegalArgumentException(s"Unknown target type: $x") } } - def saveVirtualSource(source: DataFrame, - targetConfig: HdfsTargetConfig, - execDate: String)(implicit fs: FileSystem, - sparkContext: SparkContext): Unit = { - saveCsv(source, targetConfig, execDate) + def saveVirtualSource(source: DataFrame, targetConfig: HdfsTargetConfig, execDate: String)( + implicit fs: FileSystem, + sparkContext: SparkContext): Unit = { + saveCsv(source, targetConfig) } /** @@ -79,11 +73,10 @@ object HdfsWriter extends Logging { * @param fs file system * @param settings DataQuality configuration */ - def save(target: HdfsTargetConfig, sq: Seq[Product with TypedResult])( - implicit sqlContext: SQLContext, - fs: FileSystem, - settings: DQSettings): Unit = { - log.info(s"starting 'write ${target.fileName.toUpperCase} results' ") + def save(target: HdfsTargetConfig, sq: Seq[Product with TypedResult])(implicit sqlContext: SQLContext, + fs: FileSystem, + settings: DQSettings): Unit = { + log.info(s"Saving Results: ${target.fileName.toUpperCase}...") if (sq.nonEmpty) { // since we want to allow you to save on the custom date @@ -102,69 +95,73 @@ object HdfsWriter extends Logging { sqlContext.createDataFrame(sq.asInstanceOf[Seq[ComposedMetricResult]]) case DQResultTypes.check => sqlContext.createDataFrame(sq.asInstanceOf[Seq[CheckResult]]) + case DQResultTypes.load => + 
sqlContext.createDataFrame(sq.asInstanceOf[Seq[LoadCheckResult]].map(_.simplify())) case x => throw IllegalParameterException(x.toString) } target.fileFormat.toUpperCase match { case "CSV" | "TXT" => - saveCsv(df, target, target.date.getOrElse(execDate))( - fs, - sqlContext.sparkContext) + saveCsv(df, target)(fs, sqlContext.sparkContext) case "PARQUET" => saveParquet(df, target, target.date.getOrElse(execDate)) case _ => throw IllegalParameterException(target.fileFormat.toUpperCase) } - } else log.warn("Failed to write an empty file") + } else log.warn("ERROR: Failed to save an empty file") + } + + def saveDF(target: HdfsTargetConfig, + df: DataFrame)(implicit sqlContext: SQLContext, fs: FileSystem, settings: DQSettings): Unit = { + log.info(s"Saving DF: ${target.fileName}...") + + // since we want to allow you to save on the custom date + val execDate: String = settings.refDateString + + target.fileFormat.toUpperCase match { + case "CSV" | "TXT" => + saveCsv(df, target)(fs, sqlContext.sparkContext) + case "PARQUET" => + saveParquet(df, target, target.date.getOrElse(execDate)) + case _ => throw IllegalParameterException(target.fileFormat.toUpperCase) + } } /** * Saves CSV file with results * @param df data frame to save * @param targetConfig target configuration - * @param execDate save date * @param fs file system */ - private def saveCsv(df: DataFrame, - targetConfig: HdfsTargetConfig, - execDate: String)(implicit fs: FileSystem, - sparkContext: SparkContext): Unit = { + private def saveCsv(df: DataFrame, targetConfig: HdfsTargetConfig)( + implicit fs: FileSystem, + sparkContext: SparkContext): Unit = { log.debug("path: " + targetConfig.path) val tempFileName = targetConfig.path + "/" + targetConfig.fileName + ".tmp" //-${targetConfig.subType} - val fileName = targetConfig.path + "/" + targetConfig.fileName + "." + targetConfig.fileFormat //-${targetConfig.subType} - - log.info("writing temp csv file: " + tempFileName) + val fileName = targetConfig.path + "/" + targetConfig.fileName + "." 
+ targetConfig.fileFormat //-${targetConfig.subType} + + log.debug("writing temp csv file: " + tempFileName) + df.write + .format("com.databricks.spark.csv") + .option("header", "false") + .option("quoteMode", targetConfig.quoteMode.getOrElse("MINIMAL")) + .option("delimiter", targetConfig.delimiter.getOrElse(",")) + .option("quote", targetConfig.quote.getOrElse("\"")) + .option("escape", targetConfig.escape.getOrElse("\\")) + .option("nullValue", "") + .mode(SaveMode.Overwrite) + .save(tempFileName) val header: String = - if (targetConfig.quoted) { - df.write - .format("com.databricks.spark.csv") - .option("header", "false") - .option("quoteMode", "ALL") - .option("delimiter", targetConfig.delimiter.getOrElse("|")) - .option("nullValue", "") - .mode(SaveMode.Overwrite) - .save(tempFileName) - - df.schema.fieldNames.mkString( - "\"", - "\"" + s"${targetConfig.delimiter.getOrElse("|").toString}" + "\"", - "\"") + if (targetConfig.quoteMode == Some("ALL")) { + df.schema.fieldNames.mkString("\"", "\"" + s"${targetConfig.delimiter.getOrElse(",").toString}" + "\"", "\"") } else { - df.write - .format("com.databricks.spark.csv") - .option("header", "false") - // .option("quoteMode", "ALL") - .option("delimiter", targetConfig.delimiter.getOrElse("|")) - .option("nullValue", "") - .mode(SaveMode.Overwrite) - .save(tempFileName) - - df.schema.fieldNames.mkString(targetConfig.delimiter.getOrElse("|")) + df.schema.fieldNames.mkString(targetConfig.delimiter.getOrElse(",")) } - log.info("temp csv file: " + tempFileName + " written") + log.debug("temp csv file: " + tempFileName + " has been written") + try { val path = new Path(fileName) if (fs.exists(path)) fs.delete(path, false) @@ -172,19 +169,13 @@ object HdfsWriter extends Logging { fs.create(new Path(tempFileName + "/header")) headerOutputStream.writeBytes(header + "\n") headerOutputStream.close() - FileUtil.copyMerge(fs, - new Path(tempFileName), - fs, - path, - true, - new Configuration(), - null) + FileUtil.copyMerge(fs, new Path(tempFileName), fs, path, true, new Configuration(), null) } catch { case ioe: IOException => log.warn(ioe) } - log.info("final csv file: " + fileName + " merged") - log.info("'write output' step finished") + log.debug("final csv file: " + fileName + " merged") + log.debug("'write output' step finished") } /** @@ -194,14 +185,13 @@ object HdfsWriter extends Logging { * @param execDate save date * @param fs file system */ - private def saveParquet(df: DataFrame, - targetConfig: HdfsTargetConfig, - execDate: String)(implicit fs: FileSystem): Unit = { + private def saveParquet(df: DataFrame, targetConfig: HdfsTargetConfig, execDate: String)( + implicit fs: FileSystem): Unit = { log.info(s"starting 'write ${targetConfig.fileName.toUpperCase} results' ") log.debug("path: " + targetConfig.path) val tempFileName = targetConfig.path + "/" + targetConfig.fileName + s"_$execDate" + ".tmp" //-${targetConfig.subType} - val fileName = targetConfig.path + "/" + targetConfig.fileName + s"_$execDate" + "." + targetConfig.fileFormat //-${targetConfig.subType} + val fileName = targetConfig.path + "/" + targetConfig.fileName + s"_$execDate" + "." 
+ targetConfig.fileFormat //-${targetConfig.subType}
     log.info("writing temp parquet file: " + tempFileName)
     df.coalesce(1)
@@ -211,13 +201,7 @@
     log.info("temp parquet file: " + tempFileName + " written")
-    FileUtil.copyMerge(fs,
-                       new Path(tempFileName),
-                       fs,
-                       new Path(fileName),
-                       true,
-                       new Configuration(),
-                       null)
+    FileUtil.copyMerge(fs, new Path(tempFileName), fs, new Path(fileName), true, new Configuration(), null)
     log.info("final parquet file: " + fileName + " merged")
     log.info("'write output' step finished")
diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/io/HistoryDBManager.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/io/HistoryDBManager.scala
new file mode 100644
index 0000000..1e64e7b
--- /dev/null
+++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/io/HistoryDBManager.scala
@@ -0,0 +1,157 @@
+package it.agilelab.bigdata.DataQuality.utils.io
+
+import java.sql.{Array, Connection, ResultSet}
+
+import it.agilelab.bigdata.DataQuality.metrics.ComposedMetricResult
+import it.agilelab.bigdata.DataQuality.sources.DatabaseConfig
+import it.agilelab.bigdata.DataQuality.utils
+import it.agilelab.bigdata.DataQuality.utils.DQSettings
+import org.joda.time.format.{DateTimeFormat, DateTimeFormatter}
+
+import scala.reflect.runtime.universe._
+import it.agilelab.bigdata.DataQuality.exceptions.IllegalParameterException
+import it.agilelab.bigdata.DataQuality.metrics.{ColumnMetricResult, FileMetricResult}
+import it.agilelab.bigdata.DataQuality.utils.{Logging, camelToUnderscores}
+
+/**
+  * Created by Egor Makhov on 04/05/2017.
+  *
+  * Manager to do operations with the history database.
+  */
+class HistoryDBManager(settings: DQSettings) extends Logging {
+
+  private val dbConfig: Option[DatabaseConfig] = settings.resStorage
+  private val connection: Option[Connection] = if (dbConfig.isDefined) Some(dbConfig.get.getConnection) else None
+
+  /**
+    * Saves metric results to a specific table.
+    * The main idea is that the method is class independent: it automatically gets the class fields and creates
+    * a JDBC statement to fill and execute.
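+    *
+    * A minimal usage sketch (illustrative only: `computedFileMetrics` is assumed to be a
+    * `Seq[FileMetricResult]` produced elsewhere; the table name mirrors the one used for
+    * file metrics in `loadResults` below):
+    * {{{
+    *   val history = new HistoryDBManager(settings)
+    *   history.saveResultsToDB(computedFileMetrics, "results_metric_file")
+    *   history.closeConnection()
+    * }}}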
+    * @param metrics Sequence of metric results
+    * @param tb Target table
+    */
+  def saveResultsToDB(metrics: Seq[AnyRef], tb: String): Unit = {
+    if (dbConfig.isDefined && connection.isDefined) {
+      val table = utils.makeTableName(dbConfig.get.schema, tb)
+      log.info(s"Saving '$table'")
+      try {
+        val fieldNames = metrics.head.getClass.getDeclaredFields
+        val fieldStructString: String = fieldNames
+          .map(field => camelToUnderscores(field.getName))
+          .mkString(" (", ", ", ") ")
+        val paramString: String =
+          List.fill(fieldNames.length)("?").mkString(" (", ", ", ")")
+
+        val insertSql = "INSERT INTO " + table + fieldStructString + "VALUES" + paramString
+
+        val statement = connection.get.prepareStatement(insertSql)
+
+        metrics.foreach(res => {
+          fieldNames
+            .zip(Stream from 1)
+            .foreach(f => {
+              f._1.setAccessible(true)
+              val value: Any = f._1.get(res)
+              value match {
+                // todo: add more formats
+                // case x: Int => statement.setInt(f._2, x)
+                case x: Seq[_] =>
+                  val xs: Seq[String] = x
+                    .filter(_ match {
+                      case _: String => true
+                      case _ => false
+                    })
+                    .asInstanceOf[Seq[String]]
+                  val array: Array = connection.get.createArrayOf("text", xs.toArray)
+                  statement.setArray(f._2, array)
+                case x => statement.setString(f._2, x.toString)
+
+              }
+            })
+          statement.addBatch()
+        })
+
+        statement.executeBatch()
+        log.info("Success!")
+      } catch {
+        case _: NoSuchElementException =>
+          log.warn("Nothing to save!")
+        case e: Exception =>
+          log.error("Failed with error:")
+          log.error(e.toString)
+      }
+    } else log.warn("History database is not connected. Skipping saving the results...")
+
+  }
+
+  /**
+    * Loads metric results of previous runs from the history database.
+    * Used in trend check processing. On call you should provide only the type of result you want to get;
+    * the method will automatically select the proper table.
+    * @param metricSet Set of requested metrics (set of their ids)
+    * @param rule Rule for result selection. Should be "date" or "record"
+    * @param tw Requested time window
+    *           For the "date" selection rule it will select from a window [startDate - tw * days, startDate]
+    *           For the "record" selection rule it will select tw records from before the starting date
+    * @param startDate Start date of the selection. Keep in mind that all the time windows are retrospective,
+    *                  so the start date is actually the latest one
+    * @param f Mapping function (maps a ResultSet to Seq[MetResults]). Such functions are provided in the utils package.
+    *          This parameter allows casting the result in a specific way when needed.
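+    *
+    * A hedged example, assuming `history` is a `HistoryDBManager` instance and
+    * `mapResToColumnMetRes` is one of the ResultSet mappers from the utils package
+    * (both names are illustrative, not part of this patch):
+    * {{{
+    *   val previous: Seq[ColumnMetricResult] =
+    *     history.loadResults[ColumnMetricResult](List("some_metric_id"), rule = "record", tw = 7)(mapResToColumnMetRes)
+    * }}}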
+ * @tparam T Requested result type + * @return lazy sequence of Metric results + */ + def loadResults[T: TypeTag](metricSet: List[String], + rule: String, + tw: Int, + startDate: String = settings.refDateString)( + f: ResultSet => Seq[T]): Seq[T] = { + if (dbConfig.isDefined && connection.isDefined) { + val tb: String = typeOf[T] match { + case t if t =:= typeOf[ColumnMetricResult] => "results_metric_columnar" + case t if t =:= typeOf[FileMetricResult] => "results_metric_file" + case t if t =:= typeOf[ComposedMetricResult] => "results_metric_composed" + case x => throw IllegalParameterException(x.toString) + } + val table = utils.makeTableName(dbConfig.get.schema, tb) + + log.info(s"Loading results from $table...") + + val metricIdString = + List.fill(metricSet.length)("?").mkString("(", ", ", ")") + + val selectSQL = rule match { + case "record" => + // looks a bit bulky and, probably, there is a way to do the same completely with JDBC statements + s"SELECT * FROM $table WHERE metric_id IN $metricIdString AND source_date <= '$startDate' ORDER BY source_date DESC LIMIT ${tw * metricSet.length}" + case "date" => + // formatter based on joda time + val formatter: DateTimeFormatter = + DateTimeFormat.forPattern(utils.applicationDateFormat) + val lastDate = + formatter.parseDateTime(startDate).minusDays(tw).toString(formatter) + s"SELECT * FROM $table WHERE metric_id IN $metricIdString AND source_date >= '$lastDate' AND source_date <= '$startDate'" + case x => throw IllegalParameterException(x) + } + + val statement = connection.get.prepareStatement(selectSQL) + + metricSet.zip(Stream from 1).foreach(x => statement.setString(x._2, x._1)) + val results: ResultSet = statement.executeQuery() + + val resSet = f(results) + log.info(s"Results found: ${resSet.length}") + resSet + } else { + log.warn("History database is not connected. Providing empty historical results...") + Seq.empty[T] + } + } + + /** + * Closes connection with the local database (obviously) + * Defined since the connection is private + */ + def closeConnection(): Unit = { + if (connection.isDefined) connection.get.close() + } +} diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/io/LocalDBManager.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/io/LocalDBManager.scala deleted file mode 100644 index 8a42b20..0000000 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/io/LocalDBManager.scala +++ /dev/null @@ -1,157 +0,0 @@ -package it.agilelab.bigdata.DataQuality.utils.io - -import java.sql.{Array, ResultSet} - -import it.agilelab.bigdata.DataQuality.metrics.ComposedMetricResult -import it.agilelab.bigdata.DataQuality.sources.DatabaseConfig -import it.agilelab.bigdata.DataQuality.utils -import it.agilelab.bigdata.DataQuality.utils.DQSettings -import org.joda.time.format.{DateTimeFormat, DateTimeFormatter} - -import scala.reflect.runtime.universe._ -//import com.sun.tools.javac.code.TypeTag -import it.agilelab.bigdata.DataQuality.exceptions.IllegalParameterException -import it.agilelab.bigdata.DataQuality.metrics.{ - ColumnMetricResult, - FileMetricResult -} -import it.agilelab.bigdata.DataQuality.utils.{Logging, camelToUnderscores} - -/** - * Created by Egor Makhov on 04/05/2017. - * - * Manager to do operations with local SQLite database. 
- * Probably, we'll move to other storing platform in future - */ -class LocalDBManager(settings: DQSettings) extends Logging { - - lazy private val dbFilePath: DatabaseConfig = - settings.resStorage.asInstanceOf[DatabaseConfig] - lazy private val connection = dbFilePath.getConnection - - /** - * Saves metric results to a specific table - * Main idea is that method is class independent. It's automatically get the class fields and creates - * JDBC statement to fill and execute. - * @param metrics Sequence of metric results - * @param tb Target table - */ - def saveResultsToDB(metrics: Seq[AnyRef], tb: String): Unit = { - val table = utils.makeTableName(dbFilePath.schema, tb) - log.info(s"Saving '$table'") - try { - val fieldNames = metrics.head.getClass.getDeclaredFields - val fieldStructString: String = fieldNames - .map(field => camelToUnderscores(field.getName)) - .mkString(" (", ", ", ") ") - val paramString: String = - List.fill(fieldNames.length)("?").mkString(" (", ", ", ")") - - val insertSql = "INSERT INTO " + table + fieldStructString + "VALUES" + paramString - - val statement = connection.prepareStatement(insertSql) - - metrics.foreach(res => { - fieldNames - .zip(Stream from 1) - .foreach(f => { - f._1.setAccessible(true) - val value: Any = f._1.get(res) - value match { - // todo: add more formats - // case x: Int => statement.setInt(f._2, x) - case x: Seq[_] => - val xs: Seq[String] = x - .filter(_ match { - case _: String => true - case _ => false - }) - .asInstanceOf[Seq[String]] - val array: Array = connection.createArrayOf("text", xs.toArray) - statement.setArray(f._2, array) - case x => statement.setString(f._2, x.toString) - - } - }) - statement.addBatch() - }) - - statement.executeBatch() - log.info("Success!") - } catch { - case _: NoSuchElementException => - log.warn("Nothing to save!") - case e: Exception => - log.error("Failed with error:") - log.error(e.toString) - } - - } - - /** - * Loads metric results of previous runs from the local SQLite database - * Used in trend check processing. On call you should provide only the type of result you wanna get, - * method will automatically select the proper table - * @param metricSet Set of requested metrics (set of their ids) - * @param rule Rule for result selection. Should be "date" or "record" - * @param tw Requested time window - * For "date" selection rule will select from a window [startDate - tw * days, startDate] - * For "record" selection rule will select tw records from before starting date - * @param startDate Start date to selection. Keep in mind that all the time windows are retrospective, - * so the start date is actually a latest one - * @param f Mapping function (maps resultSet to Seq[MetResults]). Those function provided in the utils package - * That param assumes that in some cases you will want to cast result in some specific way. 
- * @tparam T Requested result type - * @return lazy sequence of Metric results - */ - def loadResults[T: TypeTag](metricSet: List[String], - rule: String, - tw: Int, - startDate: String = settings.refDateString)( - f: ResultSet => Seq[T]): Seq[T] = { - - val tb: String = typeOf[T] match { - case t if t =:= typeOf[ColumnMetricResult] => "results_metric_columnar" - case t if t =:= typeOf[FileMetricResult] => "results_metric_file" - case t if t =:= typeOf[ComposedMetricResult] => "results_metric_composed" - case x => throw IllegalParameterException(x.toString) - } - val table = utils.makeTableName(dbFilePath.schema, tb) - - log.info(s"Loading results from $table...") - - val metricIdString = - List.fill(metricSet.length)("?").mkString("(", ", ", ")") - - val selectSQL = rule match { - case "record" => - // looks a bit bulky and, probably, there is a way to do the same completely with JDBC statements - s"SELECT * FROM $table WHERE metric_id IN $metricIdString AND source_date <= '$startDate' ORDER BY source_date DESC LIMIT ${tw * metricSet.length}" - case "date" => - // formatter based on joda time - val formatter: DateTimeFormatter = - DateTimeFormat.forPattern(utils.applicationDateFormat) - val lastDate = - formatter.parseDateTime(startDate).minusDays(tw).toString(formatter) - s"SELECT * FROM $table WHERE metric_id IN $metricIdString AND source_date >= '$lastDate' AND source_date <= '$startDate'" - case x => throw IllegalParameterException(x) - } - - val statement = connection.prepareStatement(selectSQL) - - metricSet.zip(Stream from 1).foreach(x => statement.setString(x._2, x._1)) - val results: ResultSet = statement.executeQuery() - - val resSet = f(results) - log.info(s"Results found: ${resSet.length}") - resSet - } - - /** - * Closes connection with the local database (obviously) - * Defined since the connection is private - */ - def closeConnection(): Unit = { - connection.close() - } -} diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/Mail.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/mailing/Mail.scala similarity index 94% rename from dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/Mail.scala rename to dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/mailing/Mail.scala index 46ab9a2..43ddae8 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/Mail.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/mailing/Mail.scala @@ -1,4 +1,4 @@ -package it.agilelab.bigdata.DataQuality.utils +package it.agilelab.bigdata.DataQuality.utils.mailing /** * Created by Egor Makhov on 13/10/2017. 
@@ -12,7 +12,7 @@ object Mail { case object Rich extends MailType case object MultiPart extends MailType - def a(mail: Mail)(implicit mailer: Mailer) { + def a(mail: Mail)(implicit mailer: MailerConfiguration) { import org.apache.commons.mail._ val format = diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/mailing/MailerConfiguration.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/mailing/MailerConfiguration.scala new file mode 100644 index 0000000..6bd1cb3 --- /dev/null +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/mailing/MailerConfiguration.scala @@ -0,0 +1,25 @@ +package it.agilelab.bigdata.DataQuality.utils.mailing + +import com.typesafe.config.Config + +import scala.util.Try + +case class MailerConfiguration( + address: String, + hostName: String, + username: String, + password: String, + smtpPortSSL: Int, + sslOnConnect: Boolean + ) { + def this(config: Config) = { + this( + config.getString("address"), + config.getString("hostname"), + Try(config.getString("username")).getOrElse(""), + Try(config.getString("password")).getOrElse(""), + Try(config.getInt("smtpPort")).getOrElse(465), + Try(config.getBoolean("sslOnConnect")).getOrElse(true) + ) + } +} diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/mailing/NotificationManager.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/mailing/NotificationManager.scala new file mode 100644 index 0000000..ee331b3 --- /dev/null +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/mailing/NotificationManager.scala @@ -0,0 +1,73 @@ +package it.agilelab.bigdata.DataQuality.utils.mailing +import it.agilelab.bigdata.DataQuality.apps.DQMasterBatch.log +import it.agilelab.bigdata.DataQuality.checks.{CheckResult, CheckStatusEnum, LoadCheckResult} +import it.agilelab.bigdata.DataQuality.utils.DQSettings + +object NotificationManager { + + def sendSummary(summary: Summary, additional: Option[String] = None)(implicit settings: DQSettings): Unit = { + if (settings.notifications) { + val text = summary.toMailString() + "\n" + additional.getOrElse("") + + settings.mailingMode match { + case Some("internal") => + import sys.process.stringSeqToProcess + Seq( + "/bin/bash", + settings.scriptPath.get, + text, + summary.status + ) !! 
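+          // sys.process's `!!` runs the mail script synchronously and throws if it exits with a non-zero code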
+
+          log.info("Report has been sent.")
+        case x => throw new IllegalArgumentException(s"Illegal mailing mode: $x")
+      }
+    } else log.warn("Notifications are disabled.")
+  }
+
+  def saveResultsLocally(summary: Summary,
+                         checks: Option[Seq[CheckResult]] = None,
+                         lc: Option[Seq[LoadCheckResult]] = None)(implicit settings: DQSettings): Unit = {
+
+    if (settings.localTmpPath.isDefined) {
+      val runName: String = settings.appName
+      val dirPath = settings.localTmpPath.get + "/" + settings.refDateString + "/" + runName
+
+      import java.io._
+
+      val dir = new File(dirPath)
+      dir.mkdirs()
+
+      // summary.csv
+      log.info(s"Saving summary file to $dirPath/summary.csv")
+      val summaryFile = new File(dirPath + "/" + "summary.csv")
+      summaryFile.createNewFile()
+      val s_bw = new BufferedWriter(new FileWriter(summaryFile))
+      s_bw.write(summary.toCsvString())
+      s_bw.close()
+
+      // failed_load_checks.csv
+      if(lc.isDefined) {
+        log.info(s"Saving failed load checks to $dirPath/failed_load_checks.csv")
+        val lcFile = new File(dirPath + "/" + "failed_load_checks.csv")
+        lcFile.createNewFile()
+        val lc_bw = new BufferedWriter(new FileWriter(lcFile))
+        lc_bw.write(lc.get.filter(_.status != CheckStatusEnum.Success).map(_.toCsvString()).mkString("\n"))
+        lc_bw.close()
+      }
+
+      // failed_metric_checks.csv
+      if(checks.isDefined) {
+        log.info(s"Saving failed metric checks to $dirPath/failed_metric_checks.csv")
+        val chkFile = new File(dirPath + "/" + "failed_metric_checks.csv")
+        chkFile.createNewFile()
+        val chk_bw = new BufferedWriter(new FileWriter(chkFile))
+        chk_bw.write(checks.get.filter(_.status != "Success").map(_.toCsvString()).mkString("\n"))
+        chk_bw.close()
+      }
+
+      log.info("Local results have been saved.")
+    } else log.warn("Local temp path is not defined")
+  }
+
+}
diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/mailing/Summary.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/mailing/Summary.scala
new file mode 100644
index 0000000..a16c95b
--- /dev/null
+++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/mailing/Summary.scala
@@ -0,0 +1,66 @@
+package it.agilelab.bigdata.DataQuality.utils.mailing
+import it.agilelab.bigdata.DataQuality.checks.{CheckResult, CheckStatusEnum, LoadCheckResult}
+import it.agilelab.bigdata.DataQuality.configs.ConfigReader
+import it.agilelab.bigdata.DataQuality.utils.DQSettings
+
+case class Summary(
+    sources: Int,
+    metrics: Int,
+    composed_metrics: Int,
+    load_checks: Int,
+    checks: Int,
+    failed_load_checks: Option[Int],
+    failed_checks: Option[Int]
+) {
+
+  def this(conf: ConfigReader, checks: Option[Seq[CheckResult]] = None, lc: Option[Seq[LoadCheckResult]] = None) {
+    this(
+      sources = conf.sourcesConfigMap.size,
+      metrics = conf.metricsBySourceList.size,
+      composed_metrics = conf.composedMetrics.length,
+      load_checks = conf.loadChecksMap.values.foldLeft(0)(_ + _.size),
+      checks = conf.metricsByChecksList.size,
+      failed_load_checks = lc.map(x => x.count(_.status != CheckStatusEnum.Success)),
+      failed_checks = checks.map(x => x.count(_.status != "Success"))
+    )
+  }
+
+  val status: String = (failed_checks, failed_load_checks) match {
+    case (Some(x), Some(y)) => if (x + y == 0) "OK" else "KO"
+    case _ => "ERROR"
+  }
+
+  // Status is appended in the send_mail script with the log file path
+  def toMailString()(implicit settings: DQSettings): String = s"""Reference date: ${settings.refDateString}
+      |Run configuration path: ${settings.configFilePath}
+      |Output location (HDFS): ${settings.hdfsTmpPath.getOrElse("")}
+      |
+
|Number of sources: $sources + |Number of metrics: $metrics + |Number of composed metrics: $composed_metrics + |Number of load checks: $load_checks + |Number of metric checks: $checks + | + |Failed load checks: ${failed_load_checks.getOrElse("null")} + |Failed checks: ${failed_checks.getOrElse("null")} + """.stripMargin + + def toCsvString()(implicit settings: DQSettings): String = { + val runName: String = settings.appName + Seq( + runName, + status, + settings.refDateString, + sources, + load_checks, + metrics, + composed_metrics, + checks, + failed_checks.getOrElse(""), + failed_load_checks.getOrElse(""), + settings.configFilePath, + settings.hdfsTmpPath.getOrElse("") + ).mkString(settings.tmpFileDelimiter.getOrElse(",")) + } + +} diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/package.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/package.scala index cbeeea1..08ea3ef 100644 --- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/package.scala +++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/package.scala @@ -8,84 +8,89 @@ import com.typesafe.config.Config import it.agilelab.bigdata.DataQuality.metrics.MetricProcessor.ParamMap import it.agilelab.bigdata.DataQuality.metrics.{ColumnMetricResult, ComposedMetricResult, FileMetricResult} import it.agilelab.bigdata.DataQuality.targets.{HdfsTargetConfig, SystemTargetConfig} +import it.agilelab.bigdata.DataQuality.utils.io.{HdfsReader, HdfsWriter} +import it.agilelab.bigdata.DataQuality.utils.mailing.{Mail, MailerConfiguration} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{Row, SQLContext} +import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.joda.time.format.{DateTimeFormat, DateTimeFormatter} import scala.collection.immutable.TreeMap -import scala.collection.mutable +import scala.collection.{immutable, mutable} +import scala.reflect.internal.util.TableDef.Column +import org.apache.spark.sql.functions.lit + import scala.util.Try import scala.util.parsing.json.JSONObject -/** - * Created by Egor Makhov on 04/05/2017. 
- */ package object utils extends Logging { // Application parameters - val applicationDateFormat: String = "yyyy-MM-dd" - val doubleFractionFormat: Int = 13 - val shortDateFormatter: DateTimeFormatter = - DateTimeFormat.forPattern("yyyyMMdd") + val applicationDateFormat: String = "yyyy-MM-dd" + val doubleFractionFormat: Int = 13 + val shortDateFormatter: DateTimeFormatter = DateTimeFormat.forPattern("yyyyMMdd") def parseTargetConfig(config: Config): Option[HdfsTargetConfig] = { Try { val name: Option[String] = Try(config.getString("fileName")).toOption - val format = config.getString("fileFormat") - val path = config.getString("path") + val format = config.getString("fileFormat") + val path = config.getString("path") + val delimiter = Try(config.getString("delimiter")).toOption - val quoted: Boolean = Try(config.getBoolean("quoted")).getOrElse(false) + val quote = Try(config.getString("quote")).toOption + val escape = Try(config.getString("escape")).toOption + + val quoteMode = Try(config.getString("quoteMode")).toOption + HdfsTargetConfig(name.getOrElse(""), format, path, - delimiter, - quoted = quoted) + delimiter = delimiter, + quote = quote, + escape = escape, + quoteMode = quoteMode) }.toOption } - def saveErrors(header: Seq[String], content: (String, mutable.Seq[String]))( - implicit sparkContext: SparkContext, - settings: DQSettings): Unit = { - - val hc = sparkContext.hadoopConfiguration - val fs = FileSystem.get(hc) - val dateString = settings.ref_date.toString(shortDateFormatter) - val basePath = settings.errorFolderPath.getOrElse("") - val finalPath = s"$basePath/$dateString/${content._1}.csv" - val separator: Char = ',' - - val headerString = "METRIC_ID" + header.zipWithIndex.foldLeft("") { - (base, n) => - base + s"${separator}COLUMN_${n._2 + 1}${separator}VALUE_${n._2 + 1}" - } - - val errorFile = fs.create(new Path(finalPath)) - try { - errorFile write (headerString + "\n").getBytes("UTF-8") - val (metric, errors) = content - errors.foreach { erStr => - val er = erStr.split(",") - val csvString: String = metric + er - .zip(header) - .foldLeft("")( - (base, n) => base + s"$separator${n._2}$separator${n._1}" - ) - errorFile write (csvString + "\n").getBytes("UTF-8") - } - } catch { - case e: Exception => - log.warn(s"Some error occurred while writing $finalPath") - log.warn(e.toString) - } finally { - errorFile.close() + def saveErrors(header: Seq[String], content: (String, mutable.Seq[Seq[String]]))(implicit fs: FileSystem, + sc: SparkContext, + sqlC: SQLContext, + settings: DQSettings): Unit = { + settings.errorFolderPath match { + case Some(path) => + val baseHeader: StructType = StructType((1 to header.size).map(x => StructField(s"VALUE_$x", StringType))) + val baseRDD: RDD[Row] = sqlC.sparkContext.parallelize(content._2.map(Row.fromSeq)) + + val ordSeq = Seq("METRIC_ID") ++ (1 to header.size).foldLeft(Seq.empty[String])((s, x) => + s ++ Seq(s"COLUMN_$x", s"VALUE_$x")) + + val baseDF = sqlC.createDataFrame(baseRDD, baseHeader) + val finalDF = header.zipWithIndex + .foldLeft(baseDF.withColumn("METRIC_ID", lit(content._1)))((df, i) => + df.withColumn(s"COLUMN_${i._2 + 1}", lit(i._1))) + .select(ordSeq.head, ordSeq.tail: _*) + + val tarConf = HdfsTargetConfig( + fileName = content._1, + fileFormat = settings.errorFileFormat, + path = path, + delimiter = settings.errorFileDelimiter, + escape = settings.errorFileEscape, + quote = settings.errorFileQuote, + quoteMode = settings.errorFileQuoteMode) + + HdfsWriter.saveDF(tarConf, finalDF) + + case _ => log.warn("Error dump path 
is not defined") } } def sendMail(recievers: Seq[String], text: Option[String], filepath: String)( - implicit mailer: Mailer): Unit = { + implicit mailer: MailerConfiguration): Unit = { - val defaultText = - "Some of requested checks failed. Please, check attached csv." + val defaultText = "Some of requested checks failed. Please, check attached csv." Mail a Mail( from = (mailer.address, "AgileLAB DataQuality"), @@ -97,25 +102,24 @@ package object utils extends Logging { } - def sendBashMail( - numOfFailedChecks: Int, - failedCheckIds: String, - fullPath: String, - systemConfig: SystemTargetConfig)(implicit settings: DQSettings): Unit = { + def sendBashMail(numOfFailedChecks: Int, failedCheckIds: String, fullPath: String, systemConfig: SystemTargetConfig)( + implicit settings: DQSettings): Unit = { import sys.process.stringSeqToProcess val mailList: Seq[String] = systemConfig.mailList - val mailListString = mailList.mkString(" ") - val targetName = systemConfig.id - - Seq( - "/bin/bash", - settings.appDir + "/sendMail.sh", - targetName, - mailListString, - numOfFailedChecks.toString, - failedCheckIds, - fullPath - ) !! + val mailListString = mailList.mkString(" ") + val targetName = systemConfig.id + + if (settings.scriptPath.isDefined) { + Seq( + "/bin/bash", + settings.scriptPath.get, + targetName, + mailListString, + numOfFailedChecks.toString, + failedCheckIds, + fullPath + ) !! + } else throw new IllegalArgumentException("Mail script path is not defined") } @@ -128,9 +132,7 @@ package object utils extends Logging { * @param aggr Generated id aggregator * * @return List of generated ids*/ - def generateMetricSubId(id: String, - n: Int, - aggr: List[String] = List.empty): List[String] = { + def generateMetricSubId(id: String, n: Int, aggr: List[String] = List.empty): List[String] = { if (n >= 1) { val newId: List[String] = List(id + "_" + n.toString) return generateMetricSubId(id, n - 1, aggr ++ newId) @@ -150,7 +152,7 @@ package object utils extends Logging { if (paramMap.nonEmpty) { // sorted by key to return the same result without affect of the map key order val sorted = TreeMap(paramMap.toArray: _*) - val tail = sorted.values.toList.mkString(":", ":", "") + val tail = sorted.values.toList.mkString(":", ":", "") return tail } "" @@ -311,7 +313,7 @@ package object utils extends Logging { val sep: String = "." 
schema match { case Some(x) => x + sep + table - case None => table + case None => table } } diff --git a/dq-core/src/test/scala/SparkTestSpec.scala b/dq-core/src/test/scala/SparkTestSpec.scala index 0ae652e..dc32da2 100644 --- a/dq-core/src/test/scala/SparkTestSpec.scala +++ b/dq-core/src/test/scala/SparkTestSpec.scala @@ -3,7 +3,7 @@ import it.agilelab.bigdata.DataQuality.configs.ConfigReader import it.agilelab.bigdata.DataQuality.metrics.{ColumnMetric, FileMetric, MetricProcessor} import it.agilelab.bigdata.DataQuality.sources.{HdfsFile, SourceConfig} import it.agilelab.bigdata.DataQuality.utils.DQSettings -import it.agilelab.bigdata.DataQuality.utils.io.LocalDBManager +import it.agilelab.bigdata.DataQuality.utils.io.HistoryDBManager import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.{SparkConf, SparkContext} @@ -36,7 +36,7 @@ class SparkTestSpec extends FunSuite with BeforeAndAfterAll { .set("spark.sql.parquet.compression.codec", "snappy") .setMaster("local[*]") - val localSqlWriter = new LocalDBManager(settings) + val localSqlWriter = new HistoryDBManager(settings) override def beforeAll() { _sc = new SparkContext(conf) @@ -47,7 +47,7 @@ class SparkTestSpec extends FunSuite with BeforeAndAfterAll { test("parse basic conf") { val configuration = new ConfigReader(settings.configFilePath)(localSqlWriter, settings) - val testSource: HdfsFile = HdfsFile("T1","./t1.csv","csv",None,true,"2018-03-26") + val testSource: HdfsFile = HdfsFile("T1", "./t1.csv", "csv", true, "2018-03-26", None) val sources: Map[String, SourceConfig] = configuration.sourcesConfigMap assert(sources.keySet.size == 3, "Should be equal 3") assert(sources.keySet == Set("T1","T2","T3")) @@ -68,7 +68,7 @@ class SparkTestSpec extends FunSuite with BeforeAndAfterAll { val input: DataFrame = sqlContext.createDataFrame(List.fill(SAMPLE_SIZE)(TestRow.apply())) val metric = FileMetric("123","ROW_COUNT","","input","2018-12-12",Map.empty) val res: (Map[Seq[String], Map[ColumnMetric, (Double, Option[String])]], Map[FileMetric, (Double, Option[String])]) = - MetricProcessor.processAllMetrics(input, Seq.empty, Seq(metric), Seq.empty)(settings,sc) + MetricProcessor.processAllMetrics(input, Seq.empty, Seq(metric), Seq.empty)(settings,sc, sqlContext, fs) assert(res._2(metric)._1 == SAMPLE_SIZE) } diff --git a/dq-core/src/universal/bin/aggregate_summary.sh b/dq-core/src/universal/bin/aggregate_summary.sh new file mode 100644 index 0000000..2e5d28c --- /dev/null +++ b/dq-core/src/universal/bin/aggregate_summary.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +export SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/" && pwd )" +export TMP_DIR=${SCRIPT_DIR}"../tmp" + +nowmonth=$(date +%Y-%m) +START_TIME="$( date -d "$nowmonth-15 last month" '+%Y-%m')" + +echo $START_TIME +export APP_NAME=${START_TIME} + +export SUMMARY_DIR=${SCRIPT_DIR}"/../tmp/${START_TIME}-summary" +rm -r $SUMMARY_DIR +mkdir $SUMMARY_DIR +SUMMARY_REGEX=${SCRIPT_DIR}"/../tmp/"${START_TIME}"-*/*/summary.csv" + +cat $SUMMARY_REGEX > $SUMMARY_DIR"/summary.csv" + +export MAIL_TO="EMAKHOV.external3@unicredit.eu PTomeo.external@unicredit.eu" +bash ${SCRIPT_DIR}"/send_mail.sh" "Content:\n$( cat $SUMMARY_DIR"/summary.csv" )" "SUMMARY" + +echo "SUCCESS" diff --git a/dq-core/src/universal/bin/global-parameters.sh b/dq-core/src/universal/bin/global_parameters.sh old mode 100755 new mode 100644 similarity index 93% rename from dq-core/src/universal/bin/global-parameters.sh rename to dq-core/src/universal/bin/global_parameters.sh 
index 4342c7b..2076bd9
--- a/dq-core/src/universal/bin/global-parameters.sh
+++ b/dq-core/src/universal/bin/global_parameters.sh
@@ -12,7 +12,7 @@ SPARK_PARALLELISM=200
 HADOOP_DIR=/etc/hadoop/conf
 
 ###### LOGGING PARAMETERS
-LOG_DIR="${SCRIPT_DIR}/../../logs"
+LOG_DIR="${SCRIPT_DIR}/../../../logs"
 JAR_NAME="${SCRIPT_DIR}/../lib/*.jar"
 LOG_CONFIG="${SCRIPT_DIR}/../conf/log4j.properties"
diff --git a/dq-core/src/universal/bin/sendMail.sh b/dq-core/src/universal/bin/sendMail.sh
deleted file mode 100755
index 467f8f5..0000000
--- a/dq-core/src/universal/bin/sendMail.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-if [ -z $1 ]
-then
- echo "No arguments found!"
- exit 1
-elif [ -n $1 ]
-then
-TARGET_NAME=$1
-MAIL_TO=$2
-NUMBER_FAILED_CHEKCS=$3
-FAILED_CHECKS=$4
-FILE_PATH=$5
-fi
-
-TEXT="for ${TARGET_NAME} ${NUMBER_FAILED_CHEKCS} checks have failed $2"
-echo $TEXT
- ATTACHED_FILE="/tmp/DQ-${TARGET_NAME}.csv"
- rm -f $ATTACHED_FILE
- hdfs dfs -get $FILE_PATH ${ATTACHED_FILE}
- gzip -f $ATTACHED_FILE
- TEXT="for ${TARGET_NAME} ${NUMBER_FAILED_CHEKCS} checks have failed : [${FAILED_CHECKS}] filepath: ${FILE_PATH} "
-
- echo "$TEXT"
-# todo: Define proper credentials
- echo "$TEXT" | mail -s "DATAQUALITY ${RESULT} REPORT CHECKS ${TARGET_NAME}" -a "${ATTACHED_FILE}.gz" -r $(whoami).$(hostname -s)@host.com ${MAIL_TO}
- rm -f $ATTACHED_FILE
- rm -f "${ATTACHED_FILE}.gz"
-
-exit 0
diff --git a/dq-core/src/universal/bin/send_mail.sh b/dq-core/src/universal/bin/send_mail.sh
new file mode 100644
index 0000000..629756d
--- /dev/null
+++ b/dq-core/src/universal/bin/send_mail.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+if [[ -z $2 ]]; then
+    echo "No arguments found! Please specify text and status (OK/KO/ERROR/SUMMARY)."
+    exit 1
+elif [[ -n $2 ]]; then
+    TEXT=$1
+    STATUS=$2
+fi
+
+echo "--------------------------------------"
+echo -e "$STATUS"
+echo -e "$MAIL_TO"
+echo -e "$APP_NAME"
+echo "--------------------------------------"
+
+MAX_SIZE=9500000
+TEXT="Status: $STATUS\nLog file path: $LOG_FILE\n$TEXT"
+
+if [[ ! -z $SUMMARY_DIR ]]; then
+    if [[ ! -z $LOG_FILE ]]; then
+        filesize=$(stat -c%s "$LOG_FILE")
+        echo "Size of $LOG_FILE = $filesize bytes."
+        if (( filesize < MAX_SIZE )); then
+            cp $LOG_FILE $SUMMARY_DIR
+        fi
+    fi
+
+    cd $SUMMARY_DIR;
+    zip -r out.zip . ;
+    cd -;
+
+    filesize=$(stat -c%s "$SUMMARY_DIR/out.zip")
+
+    if (( filesize < MAX_SIZE )); then
+        printf "$TEXT" | mail -s "[$STATUS] $APP_NAME Data Quality report" -a $SUMMARY_DIR/out.zip -r $(whoami).$(hostname -s)@unicredit.eu ${MAIL_TO}
+        rm $SUMMARY_DIR/out.zip
+        exit 0
+    fi
+fi
+
+printf "$TEXT" | mail -s "[$STATUS] $APP_NAME Data Quality report" -r $(whoami).$(hostname -s)@unicredit.eu ${MAIL_TO}
+exit 0
diff --git a/dq-core/src/universal/bin/submit.sh b/dq-core/src/universal/bin/submit.sh
old mode 100755
new mode 100644
index 0faca48..8ad1973
--- a/dq-core/src/universal/bin/submit.sh
+++ b/dq-core/src/universal/bin/submit.sh
@@ -5,7 +5,7 @@
 . /etc/hbase/conf/hbase-env.sh
 ######################## END REMOVE THESE LINES ##################################
 
-source $SCRIPT_DIR/global-parameters.sh
+source $SCRIPT_DIR/global_parameters.sh
 
 cd "${SCRIPT_DIR}/../.."
 echo "-------------------------------------------------------"
@@ -52,4 +52,4 @@ RESULT=$?
echo "-------------------------------------------------------" echo "${LEGAL_ENTITY} ${YEAR}${MONTH} job completed on $(date '+%Y-%m-%d %H:%M:%S')" echo "-------------------------------------------------------" -exit $RESULT \ No newline at end of file +exit $RESULT diff --git a/project/build.properties b/project/build.properties index 6be4958..cd7364c 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1 +1 @@ -sbt.version = 0.13.15 \ No newline at end of file +sbt.version = 0.13.17 \ No newline at end of file diff --git a/project/plugins.sbt b/project/plugins.sbt index a90f66f..de48374 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -2,18 +2,10 @@ import sbt.addSbtPlugin logLevel := Level.Warn -resolvers += "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/" - addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.3") addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "4.0.0") addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.2.0") -//// Cats support for Scala 2.10 -addSbtPlugin("org.lyranthe.sbt" % "partial-unification" % "1.1.0") - -// The Play plugin -addSbtPlugin("com.typesafe.play" % "sbt-plugin" % "2.5.14") - // provides server side compilation of typescript to ecmascript 5 or 3 addSbtPlugin("name.de-vries" % "sbt-typescript" % "2.5.2") // checks your typescript code for error prone constructions @@ -22,4 +14,9 @@ addSbtPlugin("name.de-vries" % "sbt-tslint" % "5.1.0") addSbtPlugin("name.de-vries" % "sbt-jasmine" % "0.0.3") addSbtPlugin("com.typesafe.sbt" % "sbt-digest" % "1.1.0") addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.1.10") -addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.2") \ No newline at end of file +addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.2") + +resolvers += "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/" +// The Play plugin +addSbtPlugin("com.typesafe.play" % "sbt-plugin" % "2.5.14") +addSbtPlugin("org.irundaia.sbt" % "sbt-sassify" % "1.4.4") \ No newline at end of file diff --git a/project/src/main/scala/BuildEnvPlugin.scala b/project/src/main/scala/BuildEnvPlugin.scala index 947bf0f..5186173 100644 --- a/project/src/main/scala/BuildEnvPlugin.scala +++ b/project/src/main/scala/BuildEnvPlugin.scala @@ -4,7 +4,8 @@ import sbt._ import sbt.Keys._ import sbt.plugins.JvmPlugin -/** Simple plugin to control build environment */ + +/** sets the build environment */ object BuildEnvPlugin extends AutoPlugin { // make sure it triggers automatically @@ -13,7 +14,7 @@ object BuildEnvPlugin extends AutoPlugin { object autoImport { object BuildEnv extends Enumeration { - val Production, Test, Dev = Value + val Production, Stage, Test, Dev = Value } val buildEnv = settingKey[BuildEnv.Value]("the current build environment") @@ -22,19 +23,15 @@ object BuildEnvPlugin extends AutoPlugin { override def projectSettings: Seq[Setting[_]] = Seq( buildEnv := { - sys.props - .get("env") + sys.props.get("env") .orElse(sys.env.get("BUILD_ENV")) .flatMap { case "dev" => Some(BuildEnv.Dev) - case "test" => Some(BuildEnv.Test) - case "prod" => Some(BuildEnv.Production) - //todo: Add more if needed case _ => None } .getOrElse(BuildEnv.Dev) }, - // give feedback + // give feed back onLoadMessage := { // depend on the old message as well val defaultMessage = onLoadMessage.value diff --git a/project/src/main/scala/BuildIntegrationPlugin.scala b/project/src/main/scala/BuildIntegrationPlugin.scala index 36290b5..c2a78d7 100644 --- 
a/project/src/main/scala/BuildIntegrationPlugin.scala
+++ b/project/src/main/scala/BuildIntegrationPlugin.scala
@@ -4,7 +4,7 @@ import sbt.Keys.onLoadMessage
 import sbt.plugins.JvmPlugin
 import sbt.{AllRequirements, AutoPlugin, Setting, settingKey}
 
-/** Simple plugin to control project integration configurations */
+/** Sets the build integration environment */
 object BuildIntegrationPlugin extends AutoPlugin {
 
   // make sure it triggers automatically
@@ -13,27 +13,26 @@ object BuildIntegrationPlugin extends AutoPlugin {
   object autoImport {
     object IntegrationEnv extends Enumeration {
-      val local = Value
+      val dev = Value
     }
 
-    val integrationEnv = settingKey[IntegrationEnv.Value](
-      "the current build integration environment")
+    val integrationEnv = settingKey[IntegrationEnv.Value]("the current build integration environment")
   }
 
   import autoImport._
 
   override def projectSettings: Seq[Setting[_]] = Seq(
     integrationEnv := {
-      sys.props
-        .get("integration")
-        .orElse(sys.env.get("INTEGRATION"))
+      sys.props.get("integration")
+        .orElse(sys.env.get("BUILD_ENV"))
        .flatMap {
-          case "local" => Some(IntegrationEnv.local)
-          //todo: Add more if needed
+          case "dev" => Some(IntegrationEnv.dev)
           case _ => None
         }
-        .getOrElse(IntegrationEnv.local)
+        .getOrElse(IntegrationEnv.dev)
     },
+    // give feedback
     onLoadMessage := {
+      // depend on the old message as well
       val defaultMessage = onLoadMessage.value
       val env = integrationEnv.value
      s"""|$defaultMessage