diff --git a/.gitignore b/.gitignore
index 3313ba6..df62300 100644
--- a/.gitignore
+++ b/.gitignore
@@ -67,7 +67,7 @@ lib_managed/
 src_managed/
 project/boot/
 project/plugins/project/
-project/target
+project/target/*
 project/project/target/
 
 # Scala-IDE specific
@@ -86,3 +86,5 @@ project/project/target/
 # End of https://www.gitignore.io/api/intellij,scala,sbt
 
 .idea/
+
+tmp/*
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000..de0c2ea
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,35 @@
+image: hseeberger/scala-sbt:8u242_1.3.8_2.12.10
+
+variables:
+  SBT_OPTS: >
+    -Dsbt.global.base=$CI_PROJECT_DIR/sbt-cache/sbtboot
+    -Dsbt.boot.directory=$CI_PROJECT_DIR/sbt-cache/boot
+    -Dsbt.ivy.home=$CI_PROJECT_DIR/sbt-cache/ivy
+cache:
+  key: "$CI_COMMIT_REF_SLUG"
+  paths:
+    - "$CI_PROJECT_DIR/sbt-cache"
+
+stages:
+  - test
+  - assembly
+
+test-1.6:
+  stage: test
+  script:
+    - sbt -Dfile.encoding=UTF-8 "project core" 'set sparkVersion:="1.6.0"' 'testOnly * -- -n it.agilelab.bigdata.DataQuality.Spark1xTest'
+
+test-2.4:
+  stage: test
+  script:
+    - sbt -Dfile.encoding=UTF-8 "project core" 'set sparkVersion:="2.4.0"' 'testOnly * -- -n it.agilelab.bigdata.DataQuality.Spark2xTest'
+
+assembly-1.6:
+  stage: assembly
+  script:
+    - sbt "project core" 'set sparkVersion:="1.6.0"' assembly
+
+assembly-2.4:
+  stage: assembly
+  script:
+    - sbt "project core" 'set sparkVersion:="2.4.0"' assembly
diff --git a/.scalafmt.conf b/.scalafmt.conf
new file mode 100644
index 0000000..59dc217
--- /dev/null
+++ b/.scalafmt.conf
@@ -0,0 +1,2 @@
+align = more    // For pretty alignment.
+maxColumn = 120 // For my wide 30" display.
diff --git a/build.sbt b/build.sbt
index 2e12b0a..468fe40 100644
--- a/build.sbt
+++ b/build.sbt
@@ -1,14 +1,9 @@
 import sbt._
-import Multiversion.sparkVersion
 import com.typesafe.sbt.SbtNativePackager.autoImport.NativePackagerHelper._
-import sbt.Keys.{scalaVersion, test}
-import sbtassembly.AssemblyPlugin.autoImport.{assemblyExcludedJars, assemblyOption}
+import sbt.Keys.scalaVersion
 
-lazy val commonSettings = Seq(
-  version := "1.2.1"
-)
-
-sparkVersion := "2.4.0" // default spark version
+ThisBuild / organization := "it.agilelab"
+ThisBuild / version      := "1.3.0-SNAPSHOT"
 
 scalacOptions ++= Seq(
   "-target:jvm-1.8",
@@ -30,21 +25,6 @@ resolvers ++= Seq(
   "Cloudera" at "https://repository.cloudera.com/artifactory/cloudera-repos/"
 )
 
-def calcVersionScala(sparkVersion: String): String = {
-  sparkVersion.head match {
-    case '1' => "2.10.6"
-    case '2' => "2.11.11"
-    case _   => throw new Exception("This Spark version is not supported")
-  }
-}
-
-/*
-  MODULE: "DQ_ROOT"
- */
-lazy val root = (project in file(".")).settings(
-  name := "DataQuality-framework"
-).aggregate(core, common)
-
 /*
   MODULE: "DQ_COMMON"
  */
 lazy val common =
 
 /*
   MODULE: "DQ_CORE"
  */
+
 lazy val core = (project in file("dq-core"))
   .enablePlugins(UniversalPlugin, UniversalDeployPlugin)
   .settings(
-    sparkVersion := sparkVersion.all(ScopeFilter(inProjects(ProjectRef(file("."), "root")))).value.head,
-    scalaVersion := calcVersionScala(sparkVersion.value),
-    commonSettings,
-    libraryDependencies ++= {
-      //val sv = sparkVersion.all(ScopeFilter(inProjects(ProjectRef(file("."), "root")))).value.head
-      Dependencies.dq_core ++ Dependencies.sparkDependenciesCalculation(sparkVersion.value)
+    libraryDependencies ++= {
+      Dependencies.dq_core ++ Dependencies.getSparkDependencies(sparkVersion.value)
     },
-    unmanagedResourceDirectories in Compile += baseDirectory(_ / "src/main/resources").value,
-    excludeFilter in Compile in unmanagedResources := "*",
-    unmanagedJars in Compile += file("dq-core/lib/ojdbc7.jar"),
-    assemblyExcludedJars in assembly := (fullClasspath in assembly).value.filter(_.data.getName startsWith "spark-assembly"),
-    assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = true),
-    test in assembly := {},
-    assemblyMergeStrategy in assembly := {
+    Compile / unmanagedResourceDirectories += baseDirectory(_ / "src/main/resources").value,
+    Compile / unmanagedJars += file("dq-core/lib/ojdbc7.jar"),
+
+    run in Compile := Defaults.runTask(fullClasspath in Compile, mainClass in (Compile, run), runner in (Compile, run)).evaluated,
+    runMain in Compile := Defaults.runMainTask(fullClasspath in Compile, runner in (Compile, run)).evaluated,
+    parallelExecution in Test := false,
+    assembly / assemblyJarName := s"dq-${name.value}_${sparkVersion.value}_${scalaVersion.value}-${version.value}.jar",
+    assembly / test := {},
+    assembly / assemblyMergeStrategy := {
+      case PathList("org", "aopalliance", xs @ _*)  => MergeStrategy.last
+      case PathList("javax", "inject", xs @ _*)     => MergeStrategy.last
       case PathList("javax", "servlet", xs @ _*)    => MergeStrategy.last
       case PathList("javax", "activation", xs @ _*) => MergeStrategy.last
       case PathList("org", "apache", xs @ _*)       => MergeStrategy.last
@@ -89,8 +70,8 @@ lazy val core = (project in file("dq-core"))
       val oldStrategy = (assemblyMergeStrategy in assembly).value
       oldStrategy(x)
     },
-    mappings in Universal += {
-      // TODO: Add paths application configuration files
+
+    Universal / mappings += {
       val confFile = buildEnv.value match {
         case BuildEnv.Stage => "conf/qa.conf"
         case BuildEnv.Test  => "conf/test.conf"
@@ -99,18 +80,15 @@ lazy val core = (project in file("dq-core"))
       }
       ((resourceDirectory in Compile).value / confFile) -> "conf/application.conf"
     },
-    mappings in Universal ++= {
-      // TODO: Add paths application integration files
+
+    Universal / mappings ++= {
       val integrationFolder = integrationEnv.value match {
         case _ => "integration/dev"
       }
       directory((resourceDirectory in Compile).value / integrationFolder / "bin") ++
         directory((resourceDirectory in Compile).value / integrationFolder / "conf")
     },
-    mappings in Universal <+= (assembly in Compile) map { jar =>
-      jar -> ("lib/" + jar.getName)
-    },
-    assemblyJarName in assembly := s"dq-core_${sparkVersion.value}_${scalaVersion.value}.jar"
+    Universal / mappings += (assembly in Compile).map(jar => jar -> ("lib/" + jar.getName)).value
   )
 
 /*
@@ -119,66 +97,18 @@ lazy val core = (project in file("dq-core"))
  */
 lazy val ui = (project in file("dq-ui"))
   .enablePlugins(PlayScala)
   .settings(
-    inThisBuild(
-      commonSettings ++ List(scalaVersion := "2.11.12")
-    ),
-    incOptions := incOptions.value.withNameHashing(true),
+    scalaVersion := "2.11.12",
     updateOptions := updateOptions.value.withCachedResolution(cachedResoluton = true),
     //we use nodejs to make our typescript build as fast as possible
     JsEngineKeys.engineType := JsEngineKeys.EngineType.Node,
     libraryDependencies ++= {
-      val ngVersion="4.4.4"
-      Seq(
-        jdbc, cache, ws, specs2%Test, evolutions, guice,
-        "com.typesafe.play" %% "play-json" % "2.5.14",
-        "org.scalatestplus.play" %% "scalatestplus-play" % "2.0.0" % "test",
-        "joda-time" % "joda-time" % "2.9.9",
-        "org.joda" % "joda-convert" % "1.9.2",
-        "org.squeryl" %% "squeryl" % "0.9.9",
-        "com.gilt" % "jerkson_2.11" % "0.6.9",
-        "org.webjars" %% "webjars-play" % "2.7.3",
-        "org.postgresql" % "postgresql" % "42.1.1",
-        "org.typelevel" %% "cats-core" % "1.1.0",
-
-        //angular2 dependencies
-        "org.webjars.npm" % "angular__common" % ngVersion,
-        "org.webjars.npm" % "angular__compiler" % ngVersion,
-        "org.webjars.npm" % "angular__core" % ngVersion,
-        "org.webjars.npm" % "angular__http" % ngVersion,
-        "org.webjars.npm" % "angular__forms" % ngVersion,
-        "org.webjars.npm" % "angular__router" % ngVersion,
-        "org.webjars.npm" % "angular__platform-browser-dynamic" % ngVersion,
-        "org.webjars.npm" % "angular__platform-browser" % ngVersion,
-        "org.webjars.npm" % "angular__cdk" % "2.0.0-beta.10",
-        "org.webjars.npm" % "angular__material" % "2.0.0-beta.10",
-        "org.webjars.npm" % "angular__animations" % ngVersion,
-        "org.webjars.npm" % "systemjs" % "0.20.14",
-        "org.webjars.npm" % "rxjs" % "5.4.2",
-        "org.webjars.npm" % "reflect-metadata" % "0.1.8",
-        "org.webjars.npm" % "zone.js" % "0.8.4",
-        "org.webjars.npm" % "core-js" % "2.4.1",
-        "org.webjars.npm" % "symbol-observable" % "1.0.1",
-
-        "org.webjars.npm" % "angular__flex-layout" % "2.0.0-beta.9",
-
-        "org.webjars.npm" % "typescript" % "2.4.1",
-        "org.webjars.npm" % "codemirror" % "5.30.0",
-        "org.webjars.npm" % "ng2-codemirror" % "1.1.3",
-
-        //tslint dependency
-        "org.webjars.npm" % "types__jasmine" % "2.5.53" % "test",
-        //test
-        "org.webjars.npm" % "jasmine-core" % "2.6.4",
-        "org.webjars.npm" % "ng2-file-upload" % "1.2.0",
-        "org.webjars.npm" % "file-saver" % "1.3.8",
-        "org.webjars.npm" % "types__file-saver" % "1.3.0"
-      )
+      Seq(jdbc, cache, ws, specs2 % Test, evolutions, guice) ++ Dependencies.dq_ui ++ Dependencies.getJSDependencies("4.4.4")
     },
     dependencyOverrides += "org.webjars.npm" % "minimatch" % "3.0.0",
     // use the webjars npm directory (target/web/node_modules ) for resolution of module imports of angular2/core etc
     resolveFromWebjarsNodeModulesDir := true,
     // compile our tests as commonjs instead of systemjs modules
-    (projectTestFile in typescript) := Some("tsconfig.test.json")
+    typescript / projectTestFile := Some("tsconfig.test.json")
   ).dependsOn(common)
 
 /*
@@ -186,23 +116,10 @@ lazy val ui = (project in file("dq-ui"))
  */
 lazy val api = (project in file("dq-api"))
   .settings(
-    inThisBuild(
-      commonSettings ++ List(scalaVersion := "2.11.12")
-    ),
-    incOptions := incOptions.value.withNameHashing(true),
+    scalaVersion := "2.11.12",
     updateOptions := updateOptions.value.withCachedResolution(cachedResoluton = true),
     libraryDependencies ++= {
-      val ngVersion="4.4.4"
-      Seq(
-        jdbc, cache, ws, specs2%Test, evolutions, guice,
-        "com.typesafe.play" %% "play-json" % "2.5.14",
-        "org.squeryl" %% "squeryl" % "0.9.9",
-        "org.postgresql" % "postgresql" % "42.1.1",
-        "com.gilt" % "jerkson_2.11" % "0.6.9",
-        "org.webjars" % "swagger-ui" % "3.1.5",
-        "org.scalatest" %% "scalatest" % "3.0.4" % Test,
-        "org.scalatestplus.play" %% "scalatestplus-play" % "3.1.2" % Test
-      )
+      Seq(jdbc, cache, ws, specs2 % Test, evolutions, guice) ++ Dependencies.dq_api
     }
   )
 
 /*
@@ -212,22 +129,9 @@ lazy val api = (project in file("dq-api"))
  */
 lazy val be = (project in file("dq-be"))
   .enablePlugins(PlayScala)
   .settings(
-    inThisBuild(
-      commonSettings ++ List(scalaVersion := "2.11.12")
-    ),
-    incOptions := incOptions.value.withNameHashing(true),
+    scalaVersion := "2.11.12",
     updateOptions := updateOptions.value.withCachedResolution(cachedResoluton = true),
     libraryDependencies ++= {
-      val ngVersion="4.4.4"
-      Seq(
-        jdbc, cache, ws, specs2%Test, evolutions, guice,
-        "com.typesafe.play" %% "play-json" % "2.5.14",
-        "org.squeryl" %% "squeryl" % "0.9.9",
-        "org.postgresql" % "postgresql" % "42.1.1",
-        "com.gilt" % "jerkson_2.11" % "0.6.9",
-        "org.webjars" % "swagger-ui" % "3.1.5",
-        "org.scalatest" %% "scalatest" % "3.0.4" % Test,
-        "org.scalatestplus.play" %% "scalatestplus-play" % "3.1.2" % Test
-      )
+      Seq(jdbc, cache, ws, specs2 % Test, evolutions, guice) ++ Dependencies.dq_be
    }
  ).dependsOn(api,common)
\ No newline at end of file
fileFormat: "csv" + path: "/home/emakhov/IdeaProjects/Agile.DataQuality/tmp/fs/results" + delimiter: "," + } + }, + { + type: "LOAD-CHECKS" + config: { + fileFormat: "csv" + path: "/home/emakhov/IdeaProjects/Agile.DataQuality/tmp/fs/results" + delimiter: "," + } + } +] + +Postprocessing: [ + { + mode: "enrich" + config: { + source: "USGS_US_SOURCE" + metrics: ["usgs_row_count"] + checks: ["depth_avg_check"] + extra: { + person: "Pippo" + } + saveTo: { + fileName: "usgs_enriched" + fileFormat: "csv" + path: "./tmp/postproc" + delimiter: "," + } + } + }, + { + mode: "transpose_by_key" + config: { + keyColumns: ["Date", "Time"] + source: "usgs_enriched" + saveTo: { + fileName: "usgs_transposed" + fileFormat: "csv" + path: "./tmp/postproc" + delimiter: "," + quoted: true + } + } + } +] + + + diff --git a/docs/examples/conf/full-example.conf b/docs/examples/conf/full-example.conf deleted file mode 100644 index 9aeef6c..0000000 --- a/docs/examples/conf/full-example.conf +++ /dev/null @@ -1,285 +0,0 @@ -Databases: [ - { - id = "LOCAL_SQLITE" - subtype = "SQLITE" - config: { - host = "/url/me/up" - port = 4545 - service = "srvc" - user = "rocco" - password = "pizza" - } - } -] - -Sources: [ - { - id = "SCHEMA_TEST" - type = "HDFS" - path = "./Agile.DataQuality/dq-core/src/main/resources/sample-data/battles.csv", - fileType = "csv", - separator = ",", - header = true, - schema = [{name: "a", type: "string"}, {name: "b", type: "string"},{name: "c", type: "string"}] - date = "2017-05-19" - }, - { - id = "PATH_TEST" - type = "HDFS" - path = "./Agile.DataQuality/dq-core/src/main/resources/sample-data/battles.csv", - fileType = "csv", - separator = ",", - header = true, - schema = "some/path/here" - date = "2017-05-19" - }, - { - id = "TABLE_TEST" - type = "TABLE" - database = "LOCAL_SQLITE" - table = "pinsa" - username = "rocks" - password = "pocks" - }, - { - id = "HIVE_TEST" - type = "HIVE" - date = "2017-05-19" - query = "select * from sources;" - } -] - -VirtualSources: [] - -Metrics: [ - { - id: "101" - name: "ROW_COUNT" - type: "FILE" - description: "determine rows number of this table" - config: { - file: "SCHEMA_TEST", - } - }, - { - id: "103" - name: "NULL_VALUES" - type: "COLUMN" - description: "num of null vals" - config: { - file: "SCHEMA_TEST", - column: "attacker_size" - } - }, - { - id: "1031" - name: "EMPTY_VALUES" - type: "COLUMN" - description: "num of null vals" - config: { - file: "SCHEMA_TEST", - column: "attacker_size" - } - }, - { - id: "104" - name: "SUM_NUMBER" - type: "COLUMN" - description: "num of null vals" - config: { - file: "SCHEMA_TEST", - column: "attacker_size" - } - }, - { - id: "maxnum_1" - name: "NUMBER_VALUES" - type: "COLUMN" - description: "num of null vals" - config: { - file: "SCHEMA_TEST", - column: "attacker_size" - params: {compareValue:15000} - } - }, - { - id: "maxnum_2" - name: "NUMBER_VALUES" - type: "COLUMN" - description: "num of null vals" - config: { - file: "SCHEMA_TEST", - column: "defender_size" - params: {compareValue:10000} - } - } -] - -ComposedMetrics: [ - { - id: "400" - name: "Percent Null Values" - description: "" - formula: "$102 + $1031 + 1" - }, - { - id: "0.25_tdigest_check" - name: "Q1 error" - description: "" - formula: "$td025 - ($108 - 0.675*$109)" - }, - { - id: "0.5_tdigest_check" - name: "Q2 error" - description: "" - formula: "$td05 - $108" - }, - { - id: "0.75_tdigest_check" - name: "Q3 error" - description: "" - formula: "$td075 - ($108 + 0.675*$109)" - }, - { - id: "usgs_0.25" - name: "Q1 error" - description: "" - formula: 
"$usgsMean - 0.675*$usgsStd" - }, - { - id: "usgs_0.5" - name: "Q2 error" - description: "" - formula: "$usgsMean" - }, - { - id: "usgs_0.75" - name: "Q3 error" - description: "" - formula: "$usgsMean + 0.675*$usgsStd" - } -] - -Checks: [ - { - id: "sql_check" - type: "sql" - subtype: "COUNT_EQ_ZERO" - config: { - source: "LOCAL_SQLITE" - query: "select count(*) from column_metrics where name = 'Rocco'" - } - }, - { - id: "test_check" - type: "snapshot" - subtype: "GREATER_THAN" - description: "check for number rows limit with threshold on table A" - config: { - metrics: ["maxnum_2"] - params: {threshold: "10"} - } - }, - { - id: "test_check_fail" - type: "snapshot" - subtype: "GREATER_THAN" - description: "check for number rows limit with threshold on table A" - config: { - metrics: ["maxnum_2"] - params: {threshold: "100000000"} - } - }, - { - id: "test_0" - type: "snapshot" - subtype: "EQUAL_TO" - description: "check for number rows limit with threshold on table A" - config: { - metrics: ["maxnum_2"] - params: {threshold: "10"} - } - }, - { - id: "test_1" - type: "snapshot" - subtype: "LESS_THAN" - description: "check for number rows limit with threshold on table A" - config: { - metrics: ["maxnum_2"] - params: {threshold: "10"} - } - }, - { - id: "check3" - type: "snapshot" - subtype: "LESS_THAN" - description: "min less than max" - config: { - metrics: ["maxnum_2", "maxnum_1"] - params: {compareMetric: "maxnum_1"} - } - }, - { - id: "trend_check" - type: "trend" - subtype: "TOP_N_RANK_CHECK" - description: "some basic trend" - config: { - metrics: ["usgs_0.75"] - rule: "record" - params: {threshold: "0.5", timewindow: "2", targetNumber:4} - } - }, -] - -Targets: [ - { - type: "SYSTEM" - id: "xsell", - checkList: ["trend_check", "check3"] - mailingList: ["hello@egor.com", "bom@tat.eu"], - config: { - fileFormat: "csv" - path: "./Agile.DataQuality/dq-core/output" - delimiter: "," - savemode: "append" - } - }, - { - type: "FILE-METRICS" - config: { - fileFormat: "csv" - path: "./Agile.DataQuality/dq-core/output" - delimiter: "," - savemode: "append" - } - }, - { - type: "COLUMN-METRICS" - config: { - fileFormat: "csv" - path: "./Agile.DataQuality/dq-core/output" - delimiter: "," - savemode: "append" - } - }, - { - type: "COMPOSED-METRICS" - config: { - fileFormat: "csv" - path: "./Agile.DataQuality/dq-core/output" - delimiter: "," - savemode: "append" - } - }, - { - type: "CHECKS" - config: { - fileFormat: "csv" - path: "./Agile.DataQuality/dq-core/output" - delimiter: "," - savemode: "append" - } - } -] \ No newline at end of file diff --git a/docs/examples/conf/usgs-depth.conf b/docs/examples/conf/usgs-depth.conf index 2e97a7a..d554ee6 100644 --- a/docs/examples/conf/usgs-depth.conf +++ b/docs/examples/conf/usgs-depth.conf @@ -36,3 +36,14 @@ Checks: [ } ] +Targets: [ + { + type: "CHECKS" + config: { + fileFormat: "csv" + path: "./tmp/checks" + delimiter: "," + savemode: "append" + } + } +] \ No newline at end of file diff --git a/docs/installation/database-setup.md b/docs/installation/database-setup.md index b1e2768..c8be0ea 100644 --- a/docs/installation/database-setup.md +++ b/docs/installation/database-setup.md @@ -56,6 +56,13 @@ Configure core application setup in [application.conf](/dq-core/src/main/resourc } } ``` +- to setup core database using AWS Secrets Manager: +```hocon + storage:{ + type: "DB" + secrets_manager: "AWS-secrets-manager-name" + } +``` - setup a hive connection: ```hocon hiveDir: "" //${HIVE_PATH} diff --git a/dq-core/src/main/resources/conf/dev.conf 
diff --git a/dq-core/src/main/resources/conf/dev.conf b/dq-core/src/main/resources/conf/dev.conf
index 14998e3..4b60b85 100644
--- a/dq-core/src/main/resources/conf/dev.conf
+++ b/dq-core/src/main/resources/conf/dev.conf
@@ -1,5 +1,4 @@
 data_quality {
-
   application_name: "local"
   run_configuration_version: 0.9
 
@@ -35,13 +34,13 @@ data_quality {
   // Use "" to turn off storage feature
   // "DB" subtypes: "SQLITE", "POSTGRES", "ORACLE
   storage:{
-    type: "NONE"
+    type: "DB"
     config: {
       subtype: "POSTGRES"
-      host: "localhost:5433/dataquality"
+      host: "localhost:5432/postgres"
       user: "postgres"
       password: "postgres"
-      schema: "dev"
+      schema: "public"
     }
   }
 
@@ -50,7 +49,7 @@ data_quality {
     // "external" - to use external SMTP server
     // "internal" - to use internal SMTP thru bash script (check universal/bin/sendMail.sh for extra configuration)
     // "" - to turn off mailing
-    mode: "internal"
+    mode: ""
     mail_script_path: ""
 //    config: {
 //      address: "test.testovic@gmail.com"
diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/configs/ConfigReader.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/configs/ConfigReader.scala
index 6c15483..23cc1d6 100644
--- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/configs/ConfigReader.scala
+++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/configs/ConfigReader.scala
@@ -100,7 +100,7 @@ class ConfigReader(configPath: String)(implicit sqlWriter: HistoryDBManager, set
 
     val header = Try(generalConfig.getBoolean("header")).getOrElse(false)
 
-    val delimiter = settings.backComp.delimiterExtractor(generalConfig)
+    val delimiter: Option[String] = settings.backComp.delimiterExtractor(generalConfig)
     val quote = Try(generalConfig.getString("quote")).toOption
     val escape = Try(generalConfig.getString("escape")).toOption
 
diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/postprocessors/ArrangePostprocessor.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/postprocessors/ArrangePostprocessor.scala
index 72cf27a..e036086 100644
--- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/postprocessors/ArrangePostprocessor.scala
+++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/postprocessors/ArrangePostprocessor.scala
@@ -17,7 +17,7 @@ import scala.collection.JavaConversions._
 final class ArrangePostprocessor(config: Config, settings: DQSettings)
     extends BasicPostprocessor(config, settings) {
 
-  private case class ColumnSelector(name: String, tipo: Option[String] = None, format: Option[String] = None, precision: Option[Integer] = None) {
+  case class ColumnSelector(name: String, tipo: Option[String] = None, format: Option[String] = None, precision: Option[Integer] = None) {
 
     def toColumn()(implicit df: DataFrame): Column = {
       val dataType: Option[NumericType with Product with Serializable] =
@@ -48,7 +48,7 @@ final class ArrangePostprocessor(config: Config, settings: DQSettings)
     utils.parseTargetConfig(conf)(settings).get
   }
 
-  private val columns: Seq[ColumnSelector] =
+  val columns: Seq[ColumnSelector] =
     config.getAnyRefList("columnOrder").map {
       case x: String => ColumnSelector(x)
       case x: java.util.HashMap[_, String] => {
diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/DQSettings.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/DQSettings.scala
index 0193ae9..303a8c5 100644
--- a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/DQSettings.scala
+++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/DQSettings.scala
@@ -96,10 +96,13 @@ class DQSettings(conf: Config,
   }
   val notifications: Boolean = DQSettings.getConfigOption[Boolean]("mailing.notifications", conf).getOrElse(false)
 
-  val resStorage: Option[DatabaseConfig] = conf.getString("storage.type") match {
-    case "DB"   => Some(new DatabaseConfig(conf.getConfig("storage.config")))
-    case "NONE" => None
-    case x      => throw IllegalParameterException(x)
+  val resStorage: Option[DatabaseConfig] = DQSettings.getConfigOption[String]("storage.secrets_manager", conf) match { // a named secret takes precedence over storage.type
+    case Some(x) => Some(RdsCredentials.credentialsToDatabaseConnection(RdsCredentials.getCredentials(x)))
+    case None => conf.getString("storage.type") match {
+      case "DB"   => Some(new DatabaseConfig(conf.getConfig("storage.config")))
+      case "NONE" => None
+      case x      => throw IllegalParameterException(x)
+    }
   }
 
   // Internal values
diff --git a/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/RdsCredentials.scala b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/RdsCredentials.scala
new file mode 100644
index 0000000..59a18fc
--- /dev/null
+++ b/dq-core/src/main/scala/it/agilelab/bigdata/DataQuality/utils/RdsCredentials.scala
@@ -0,0 +1,66 @@
+package it.agilelab.bigdata.DataQuality.utils
+
+import com.amazonaws.services.secretsmanager.AWSSecretsManagerClientBuilder
+import com.amazonaws.services.secretsmanager.model.GetSecretValueRequest
+import it.agilelab.bigdata.DataQuality.sources.DatabaseConfig
+import org.json4s.DefaultFormats
+import org.json4s.jackson.JsonMethods.parse
+
+case class RdsCredentials(
+    engine: String,
+    host: String,
+    port: String,
+    dbName: String,
+    username: String,
+    password: String,
+    connectionUrl: String) {
+}
+
+
+object RdsCredentials {
+
+  def getCredentials(secretName: String): RdsCredentials = {
+    val client = AWSSecretsManagerClientBuilder
+      .standard()
+      .withRegion("eu-central-1") // NB: the region is hardcoded
+      .build()
+    val request = new GetSecretValueRequest().withSecretId(secretName)
+
+    val response = client.getSecretValue(request)
+
+    implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats
+    val secret = parse(response.getSecretString)
+      .extract[Map[String, String]]
+    client.shutdown()
+
+
+    val engine   = secret.getOrElse("engine", "") // missing keys fall back to empty strings
+    val host     = secret.getOrElse("host", "")
+    val port     = secret.getOrElse("port", "")
+    val dbName   = secret.getOrElse("dbname", "")
+    val username = secret.getOrElse("username", "")
+    val password = secret.getOrElse("password", "")
+    val connectionUrl: String = s"$host:$port/$dbName"
+    RdsCredentials(
+      engine,
+      host,
+      port,
+      dbName,
+      username,
+      password,
+      connectionUrl)
+  }
+
+  def credentialsToDatabaseConnection(rdsCredentials: RdsCredentials): DatabaseConfig = {
+    DatabaseConfig(
+      "",
+      rdsCredentials.engine.toUpperCase,
+      rdsCredentials.connectionUrl,
+      Some(rdsCredentials.port),
+      None,
+      Some(rdsCredentials.username),
+      Some(rdsCredentials.password),
+      Some("public")
+    )
+  }
+}
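A minimal sketch of how this utility gets wired in, which is exactly what the `DQSettings` change above does (the secret name here is illustrative):

```scala
import it.agilelab.bigdata.DataQuality.sources.DatabaseConfig
import it.agilelab.bigdata.DataQuality.utils.RdsCredentials

// Resolve the results-storage connection from a named secret instead of the
// inline storage.config block. The caller needs secretsmanager:GetSecretValue
// permission, and the secret is assumed to live in eu-central-1 (see above).
val db: DatabaseConfig =
  RdsCredentials.credentialsToDatabaseConnection(
    RdsCredentials.getCredentials("dq-results-db")) // hypothetical secret name
```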
diff --git a/dq-core/src/test/resources/application.conf b/dq-core/src/test/resources/application.conf
index abae30b..5374bec 100644
--- a/dq-core/src/test/resources/application.conf
+++ b/dq-core/src/test/resources/application.conf
@@ -1,43 +1,49 @@
-dataquality {
-  appDirectory:""
+data_quality {
 
-  appName: "DQ Test"
-  hadoopConfDir: ""
-  hiveDir: "" //${HIVE_PATH}
+  application_name: "data-quality-test"
+  run_configuration_version: 0.9
 
-  errorDumpSize: 1000
-  errorFolderPath: "./Agile.DataQuality/side-code/dump"
+  s3_bucket: ""
+  hive_warehouse_path: ""
+  hbase_host: ""
 
-  // Configuration
-  vsDumpConfig: {
-    fileFormat: "csv"
-    path: "./Agile.DataQuality/side-code/dump/virtual"
+  tmp_files_management: {
+    local_fs_path: "/tmp/dq-test/fs"
+    hdfs_path: "/tmp/dq-test/hdfs"
+  }
+
+  metric_error_management: {
+    dump_directory_path: "/tmp/dq-test/errors"
+    dump_size: 1000
+    file_config: {
+      format: "csv"
+      delimiter: ","
+      quote: "\""
+      escape: "\\"
+      quote_mode: "ALL"
+    }
+  }
+
+  virtual_sources_management: {
+    dump_directory_path: "/tmp/dq-test/virtual"
+    file_format: "csv"
     delimiter: ","
   }
 
-  // Result storage configuration
-  // Supported types: "DB"
-  // "DB" subtypes: "SQLITE","ORACLE
   storage:{
-    type: "DB"
+    type: "NONE"
     config: {
-      host: "localhost:5432/dataquality?user=postgres"
-      subtype: "POSTGRES"
+      subtype: ""
+      host: ""
+      user: ""
+      password: ""
+      schema: ""
     }
   }
 
-  // Check failure alert mailer configuration
   mailing {
-    // "external" - external SMTP server
-    // "internal" - internal SMTP thru bash script (check universal/bin/sendMail.sh for extra configuration)
     mode: "internal"
-//    config: {
-//      address: "test.testovic@gmail.com"
-//      hostname: "smtp.gmail.com"
-//      username: "test.testovic"
-//      password: "password123"
-//      smtpPort: 465
-//      sslOnConnect: true
-//    }
+    mail_script_path: ""
+    notifications: false
   }
 }
\ No newline at end of file
diff --git a/dq-core/src/test/resources/ci.conf b/dq-core/src/test/resources/ci.conf
new file mode 100644
index 0000000..a7c793b
--- /dev/null
+++ b/dq-core/src/test/resources/ci.conf
@@ -0,0 +1,65 @@
+data_quality {
+
+  application_name: "local"
+  run_configuration_version: 1.0
+  hive_warehouse_path: ""
+  hbase_host: ""
+
+  tmp_files_management: {
+    local_fs_path: "/tmp/fs"
+    hdfs_path: "/tmp/hdfs"
+  }
+
+  metric_error_management: {
+    dump_directory_path: "/tmp/dump"
+    dump_size: 1000 // max number of collected errors for 1 metric for 1 partition
+    empty_file: true
+    file_config: {
+      format: "csv"
+      delimiter: ","
+      quote: "\""
+      escape: "\\"
+      quote_mode: "ALL"
+    }
+  }
+
+  virtual_sources_management: {
+    dump_directory_path: "/tmp/virtual"
+    file_format: "csv"
+    delimiter: ","
+  }
+
+  // Result storage configuration
+  // Supported types: "DB", "NONE"
+  // Use "" to turn off storage feature
+  // "DB" subtypes: "SQLITE", "POSTGRES", "ORACLE"
+  storage:{
+    type: "NONE"
+    config: {
+      subtype: "POSTGRES"
+      host: "localhost:5433/dataquality"
+      user: "postgres"
+      password: "postgres"
+      schema: "dev"
+    }
+  }
+
+  // Check failure alert mailer configuration
+  mailing {
+    // "external" - to use external SMTP server
+    // "internal" - to use internal SMTP thru bash script (check universal/bin/sendMail.sh for extra configuration)
+    // "" - to turn off mailing
+    mode: "internal"
+    mail_script_path: ""
+    // config: {
+    //   address: "test.testovic@gmail.com"
+    //   hostname: "smtp.gmail.com"
+    //   username: "test.testovic"
+    //   password: "password123"
+    //   smtpPort: 465
+    //   sslOnConnect: true
+    // }
+
+    notifications: false
+  }
+}
diff --git a/dq-core/src/test/resources/conf/number_precision.conf b/dq-core/src/test/resources/conf/number_precision.conf
new file mode 100644
index 0000000..48a09ea
--- /dev/null
+++ b/dq-core/src/test/resources/conf/number_precision.conf
@@ -0,0 +1,25 @@
+Sources: []
+
+VirtualSources: []
+
+Metrics: []
+
+Checks: []
+
+Targets: []
+
+Postprocessing: [
+  {
+    mode: "arrange"
+    config: {
+      source: ""
+      columnOrder: ["name", {"surname": {"STRING": "Hello %s"}}, {"year": "INT"}, {"weigth": "DOUBLE"}, {"age": {"DOUBLE": 2}}, {"amount": {"DOUBLE": "%.2f"}}]
+      saveTo: {
+        fileName: "NUMBER_PRECISION_ARRANGE"
+        fileFormat: "csv"
+        path: "./tmp/postproc"
+        delimiter: ","
+      }
+    }
+  }
+]
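The `columnOrder` list above exercises every form the `ArrangePostprocessor` selector accepts: a bare column name, a cast (`{"year": "INT"}`), a cast with a precision (`{"age": {"DOUBLE": 2}}`), and a cast with a format string. Read together with `ArrangePostprocessorTest` further down, the selection is roughly equivalent to this Spark expression; a sketch of the expected output, not the implementation itself:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, format_string}
import org.apache.spark.sql.types.{DoubleType, IntegerType}

val spark = SparkSession.builder.master("local[*]").getOrCreate()
import spark.implicits._

// Same single-row frame the test below builds.
val df = Seq(("john", "black", "1992", "87.2", "50", "1.2842922E2"))
  .toDF("name", "surname", "year", "weigth", "age", "amount")

df.select(
  col("name"),                                          // "name": passed through untouched
  format_string("Hello %s", col("surname")),            // {"STRING": "Hello %s"}
  col("year").cast(IntegerType),                        // {"year": "INT"}
  col("weigth").cast(DoubleType),                       // {"weigth": "DOUBLE"}
  format_string("%.2f", col("age").cast(DoubleType)),   // {"DOUBLE": 2}: precision 2 -> "50.00"
  format_string("%.2f", col("amount").cast(DoubleType)) // {"DOUBLE": "%.2f"} -> "128.43"
).show()
```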
diff --git a/dq-core/src/test/resources/output/CHECKS-1.6.csv b/dq-core/src/test/resources/output/CHECKS-1.6.csv
new file mode 100644
index 0000000..76d2790
--- /dev/null
+++ b/dq-core/src/test/resources/output/CHECKS-1.6.csv
@@ -0,0 +1,2 @@
+checkId,checkName,description,checkedFile,baseMetric,comparedMetric,comparedThreshold,status,message,execDate
+depth_avg_check,GREATER_THAN,Checks is average of depth is greather than 10,USGS_2000,depth_avg,,50.0,Success,Check depth_avg_check for metric AVG_NUMBER on column USGS_2000[Buffer(Depth)] check if (MetricResult) 85.05389330922242 is GREATER_THAN 50.0 (compareMetric/threshold). Result: Success. CheckStatus: 85.05389330922242 > 50.0.,2019-01-01
diff --git a/dq-core/src/test/resources/output/CHECKS-2.4.csv b/dq-core/src/test/resources/output/CHECKS-2.4.csv
new file mode 100644
index 0000000..629fb80
--- /dev/null
+++ b/dq-core/src/test/resources/output/CHECKS-2.4.csv
@@ -0,0 +1,2 @@
+checkId,checkName,description,checkedFile,baseMetric,comparedMetric,comparedThreshold,status,message,execDate
+depth_avg_check,GREATER_THAN,Checks is average of depth is greather than 10,USGS_2000,depth_avg,"",50.0,Success,Check depth_avg_check for metric AVG_NUMBER on column USGS_2000[Buffer(Depth)] check if (MetricResult) 85.05389330922242 is GREATER_THAN 50.0 (compareMetric/threshold). Result: Success. CheckStatus: 85.05389330922242 > 50.0.,2019-01-01
diff --git a/dq-core/src/test/scala/SparkTestSpec.scala b/dq-core/src/test/scala/SparkTestSpec.scala
index dc32da2..af20741 100644
--- a/dq-core/src/test/scala/SparkTestSpec.scala
+++ b/dq-core/src/test/scala/SparkTestSpec.scala
@@ -23,7 +23,7 @@ class SparkTestSpec extends FunSuite with BeforeAndAfterAll {
   r.setSeed(123)
 
   val settings: DQSettings = new DQSettings(
-    conf = ConfigFactory.parseURL(getClass.getResource("/application.conf")).getConfig("dataquality"),
+    conf = ConfigFactory.parseURL(getClass.getResource("/application.conf")).getConfig("data_quality"),
     configFilePath = getClass.getResource("/conf/test.conf").getPath,
     repartition = false,
     local = true,
@@ -47,7 +47,7 @@ class SparkTestSpec extends FunSuite with BeforeAndAfterAll {
   test("parse basic conf") {
     val configuration = new ConfigReader(settings.configFilePath)(localSqlWriter, settings)
 
-    val testSource: HdfsFile = HdfsFile("T1", "./t1.csv", "csv", true, "2018-03-26", None)
+    val testSource: HdfsFile = HdfsFile("T1", "./t1.csv", "csv", true, "2018-03-26", Some(","))
     val sources: Map[String, SourceConfig] = configuration.sourcesConfigMap
     assert(sources.keySet.size == 3, "Should be equal 3")
     assert(sources.keySet == Set("T1","T2","T3"))
diff --git a/dq-core/src/test/scala/it/agilelab/bigdata/DataQuality/apps/DQMasterBranchTest.scala b/dq-core/src/test/scala/it/agilelab/bigdata/DataQuality/apps/DQMasterBranchTest.scala
new file mode 100644
index 0000000..d7326a5
--- /dev/null
+++ b/dq-core/src/test/scala/it/agilelab/bigdata/DataQuality/apps/DQMasterBranchTest.scala
@@ -0,0 +1,46 @@
+package it.agilelab.bigdata.DataQuality.apps
+
+import org.apache.log4j.Logger
+import org.scalatest.{BeforeAndAfterAll, FunSuite, Tag}
+
+import scala.io.Source
+
+class DQMasterBranchTest extends FunSuite with BeforeAndAfterAll {
+
+  @transient lazy val log: Logger = Logger.getLogger(getClass.getName)
+
+  object Spark1xTest extends Tag("it.agilelab.bigdata.DataQuality.Spark1xTest")
+  object Spark2xTest extends Tag("it.agilelab.bigdata.DataQuality.Spark2xTest")
+
+  val dir: String = Option(System.getenv("CI_PROJECT_DIR")).getOrElse("./")
+
+  val outputFileName: String = s"$dir/tmp/checks/CHECKS.csv"
+
+  test("must be run with spark 1.6.x", Spark1xTest) {
+
+    DQMasterBatch.main(Array("-a", s"$dir/dq-core/src/test/resources/ci.conf", "-c", s"$dir/docs/examples/conf/usgs-depth.conf", "-d", "2019-01-01", "-l"))
+
+    val expectedFileName: String = s"$dir/dq-core/src/test/resources/output/CHECKS-1.6.csv"
+
+    log.info(s"Comparison between $outputFileName and $expectedFileName")
+    val resultFile: Array[String]   = Source.fromFile(outputFileName).getLines.toArray
+    val expectedFile: Array[String] = Source.fromFile(expectedFileName).getLines.toArray
+
+    assert(resultFile.nonEmpty)
+    assert(resultFile.sameElements(expectedFile))
+  }
+
+  test("must be run with spark 2.4.x", Spark2xTest) {
+
+    DQMasterBatch.main(Array("-a", s"$dir/dq-core/src/test/resources/ci.conf", "-c", s"$dir/docs/examples/conf/usgs-depth.conf", "-d", "2019-01-01", "-l"))
+
+    val expectedFileName: String = s"$dir/dq-core/src/test/resources/output/CHECKS-2.4.csv"
+
+    log.info(s"Comparison between $outputFileName and $expectedFileName")
+    val resultFile: Array[String]   = Source.fromFile(outputFileName).getLines.toArray
+    val expectedFile: Array[String] = Source.fromFile(expectedFileName).getLines.toArray
+
+    assert(resultFile.nonEmpty)
+    assert(resultFile.sameElements(expectedFile))
+  }
+}
\ No newline at end of file
diff --git a/dq-core/src/test/scala/it/agilelab/bigdata/DataQuality/postprocessors/ArrangePostprocessorTest.scala b/dq-core/src/test/scala/it/agilelab/bigdata/DataQuality/postprocessors/ArrangePostprocessorTest.scala
new file mode 100644
index 0000000..54c903f
--- /dev/null
+++ b/dq-core/src/test/scala/it/agilelab/bigdata/DataQuality/postprocessors/ArrangePostprocessorTest.scala
@@ -0,0 +1,74 @@
+package it.agilelab.bigdata.DataQuality.postprocessors
+
+import com.typesafe.config._
+import it.agilelab.bigdata.DataQuality.configs.ConfigReader
+import it.agilelab.bigdata.DataQuality.utils.DQSettings
+import it.agilelab.bigdata.DataQuality.utils.io.HistoryDBManager
+import org.apache.spark.sql._
+import org.apache.spark.SparkConf
+import org.joda.time
+import org.scalatest.{BeforeAndAfterAll, FunSuite}
+
+class ArrangePostprocessorTest extends FunSuite with BeforeAndAfterAll {
+  implicit val settings: DQSettings = new DQSettings(
+    conf = ConfigFactory.parseURL(getClass.getResource("/application.conf")).getConfig("data_quality"),
+    configFilePath = getClass.getResource("/conf/number_precision.conf").getPath,
+    repartition = false,
+    local = true,
+    ref_date = time.DateTime.now()
+  )
+
+  implicit val sqlWriter = new HistoryDBManager(settings)
+
+  val config: ConfigReader = new ConfigReader(settings.configFilePath)
+
+  val conf = new SparkConf().setAppName(settings.appName).setMaster("local[*]")
+
+  override def beforeAll() {
+    super.beforeAll()
+  }
+
+  val spark = SparkSession.builder.config(conf).getOrCreate()
+  import spark.implicits._
+
+  val processor: ArrangePostprocessor = config.getPostprocessors.head.asInstanceOf[ArrangePostprocessor]
+
+  implicit val df: DataFrame = Seq(("john", "black", "1992", "87.2", "50", "1.2842922E2")).toDF("name", "surname", "year", "weigth", "age", "amount")
+
+  val result: Row = df.select(processor.columns.map(_.toColumn): _*).collect().head
+
+  test("arrange, column String, return the same String") {
+    assert(result.get(0).isInstanceOf[String])
+    assert(result.get(0).equals("john"))
+  }
+
+  test("arrange, column String, return a formatted String (prefix Hello)") {
+    assert(result.get(1).isInstanceOf[String])
+    assert(result.get(1).equals("Hello black"))
+  }
+
+  test("arrange, column String, return an Integer column") {
+    assert(result.get(2).isInstanceOf[Integer])
+    assert(result.get(2).equals(1992))
+  }
+
+  test("arrange, column String, return a Double column") {
+    assert(result.get(3).isInstanceOf[Double])
+    assert(result.get(3).equals(87.2))
+  }
+
+  test("arrange, column String, return a String column with a cast to Double with precision equals to 2") {
+    assert(result.get(4).isInstanceOf[String])
+    assert(result.get(4).equals("50.00"))
+  }
+
+  test("arrange, column String, return a String column with a formatted String (numeric format)") {
+    assert(result.get(5).isInstanceOf[String])
+    assert(result.get(5).equals("128.43"))
+  }
+
+  override def afterAll() {
+    spark.stop()
+    super.afterAll()
+  }
+}
diff --git a/project/Dependencies.scala b/project/Dependencies.scala
index 759cbdf..c23f42f 100644
--- a/project/Dependencies.scala
+++ b/project/Dependencies.scala
@@ -2,47 +2,86 @@ import sbt._
 
 object Dependencies {
 
-  val algebirdCore = "com.twitter" %% "algebird-core" % Version.algebird
-
-  val catsCore = "org.typelevel" %% "cats-core" % Version.catsCore
+  val sparkAvro      = "com.databricks"   %% "spark-avro" % Version.sparkAvro
+  val sparkCsv       = "com.databricks"   %% "spark-csv"  % Version.sparkCsv
+  val scopt          = "com.github.scopt" %% "scopt"      % Version.scopt
+  val typesafeConfig = "com.typesafe"      % "config"     % Version.typesafeConfig
 
   val commonLang = "org.apache.commons" % "commons-lang3" % Version.commonLang
   val commonMail = "org.apache.commons" % "commons-email" % Version.commonMail
-
-  val hbaseConnector = "it.nerdammer.bigdata" % "spark-hbase-connector_2.10" % Version.hbaseConnector
-
-  val isanr = "org.isarnproject" %% "isarn-sketches" % Version.isarn
-
   val jodaTime    = "joda-time" % "joda-time"    % Version.jodaTime
   val jodaConvert = "org.joda"  % "joda-convert" % Version.jodaConvert
-
   val log4j = "log4j" % "log4j" % Version.log4j
+  val secretsManager = "com.amazonaws" % "aws-java-sdk-secretsmanager" % Version.secretsManager
 
-  val postgres = "org.postgresql" % "postgresql" % Version.postgres
-
-  val scalaTest = "org.scalatest" %% "scalatest" % Version.scalaTest % "test"
+  val algebirdCore = "com.twitter"      %% "algebird-core"  % Version.algebird
+  val isarn        = "org.isarnproject" %% "isarn-sketches" % Version.isarn
+  val catsCore     = "org.typelevel"    %% "cats-core"      % Version.catsCore
 
-  val sparkAvro = "com.databricks" %% "spark-avro" % Version.sparkAvro
-  val sparkCsv = "com.databricks" %% "spark-csv" % Version.sparkCsv
+  val hbaseConnector = "it.nerdammer.bigdata" % "spark-hbase-connector_2.10" % Version.hbaseConnector
+  val postgres       = "org.postgresql"       % "postgresql"                 % Version.postgres
+  val sqlite         = "org.xerial"           % "sqlite-jdbc"                % Version.sqlite
 
-  val scopt = "com.github.scopt" %% "scopt" % Version.scopt
+  val playJson  = "com.typesafe.play" %% "play-json"    % Version.playJson
+  val squeryl   = "org.squeryl"       %% "squeryl"      % Version.squeryl
+  val jerkson   = "com.gilt"           % "jerkson_2.11" % Version.jerkson
+  val webjars   = "org.webjars"       %% "webjars-play" % Version.webjars
+  val swaggerUi = "org.webjars"        % "swagger-ui"   % Version.swaggerUi
 
-  val sqlite = "org.xerial" % "sqlite-jdbc" % Version.sqlite
+  val scalaTest     = "org.scalatest"          %% "scalatest"          % Version.scalaTest     % Test
+  val scalaTestPlay = "org.scalatestplus.play" %% "scalatestplus-play" % Version.scalaTestPlay % Test
 
-  val typesafeConfig = "com.typesafe" % "config" % Version.typesafeConfig
+  val dq_common = Seq(typesafeConfig, catsCore)
+  val dq_core = Seq(algebirdCore, commonLang, commonMail, hbaseConnector, isarn, jodaTime, jodaConvert, log4j,
+    postgres, scalaTest, sparkAvro, sparkCsv, scopt, sqlite, typesafeConfig, secretsManager)
 
-  val dq_core = Seq(algebirdCore, commonLang, commonMail, hbaseConnector, isanr, jodaTime, jodaConvert, log4j,
-    postgres, scalaTest, sparkAvro, sparkCsv, scopt, sqlite, typesafeConfig)
+  val dq_ui = Seq(scalaTest, scalaTestPlay, playJson, jodaTime, jodaConvert, squeryl, jerkson, webjars, postgres, catsCore)
 
-  val dq_common = Seq(typesafeConfig, catsCore)
+  val dq_be  = Seq(scalaTest, scalaTestPlay, playJson, squeryl, postgres, jerkson, swaggerUi)
+  val dq_api = Seq(scalaTest, scalaTestPlay, playJson, squeryl, postgres, jerkson, swaggerUi)
 
-  def sparkDependenciesCalculation(sparkVersion:String): Seq[ModuleID] =
+  def getSparkDependencies(sparkVersion: String): Seq[ModuleID] =
     Seq(
       "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
      "org.apache.spark" %% "spark-sql"  % sparkVersion % "provided",
      "org.apache.spark" %% "spark-hive" % sparkVersion % "provided"
    )
 
+  def getJSDependencies(ngVersion: String): Seq[ModuleID] = {
+    Seq(
+      "org.webjars.npm" % "angular__common" % ngVersion,
+      "org.webjars.npm" % "angular__compiler" % ngVersion,
+      "org.webjars.npm" % "angular__core" % ngVersion,
+      "org.webjars.npm" % "angular__http" % ngVersion,
+      "org.webjars.npm" % "angular__forms" % ngVersion,
+      "org.webjars.npm" % "angular__router" % ngVersion,
+      "org.webjars.npm" % "angular__platform-browser-dynamic" % ngVersion,
+      "org.webjars.npm" % "angular__platform-browser" % ngVersion,
+      "org.webjars.npm" % "angular__cdk" % "2.0.0-beta.10",
+      "org.webjars.npm" % "angular__material" % "2.0.0-beta.10",
+      "org.webjars.npm" % "angular__animations" % ngVersion,
+      "org.webjars.npm" % "systemjs" % "0.20.14",
+      "org.webjars.npm" % "rxjs" % "5.4.2",
+      "org.webjars.npm" % "reflect-metadata" % "0.1.8",
+      "org.webjars.npm" % "zone.js" % "0.8.4",
+      "org.webjars.npm" % "core-js" % "2.4.1",
+      "org.webjars.npm" % "symbol-observable" % "1.0.1",
+
+      "org.webjars.npm" % "angular__flex-layout" % "2.0.0-beta.9",
+
+      "org.webjars.npm" % "typescript" % "2.4.1",
+      "org.webjars.npm" % "codemirror" % "5.30.0",
+      "org.webjars.npm" % "ng2-codemirror" % "1.1.3",
+
+      //tslint dependency
+      "org.webjars.npm" % "types__jasmine" % "2.5.53" % "test",
+      //test
+      "org.webjars.npm" % "jasmine-core" % "2.6.4",
+      "org.webjars.npm" % "ng2-file-upload" % "1.2.0",
+      "org.webjars.npm" % "file-saver" % "1.3.8",
+      "org.webjars.npm" % "types__file-saver" % "1.3.0"
+    )
+  }
 }
diff --git a/project/Multiversion.scala b/project/Multiversion.scala
deleted file mode 100644
index 625e295..0000000
--- a/project/Multiversion.scala
+++ /dev/null
@@ -1,6 +0,0 @@
-import sbt.SettingKey
-
-object Multiversion {
-  val sparkVersion = SettingKey[String]("sparkVersion","spark version")
-}
-
diff --git a/project/Version.scala b/project/Version.scala
index 5adbfcd..04d79b0 100644
--- a/project/Version.scala
+++ b/project/Version.scala
@@ -1,4 +1,5 @@
 object Version {
+  val secretsManager: String = "1.11.728"
   val algebird = "0.13.0"
   val catsCore = "1.1.0"
   val commonLang = "3.0"
@@ -8,11 +9,21 @@ object Version {
   val jodaTime = "2.9.9"
   val jodaConvert = "1.9.2"
   val log4j = "1.2.17"
-  val postgres = "42.1.1"
-  val scalaTest = "2.2.1"
+
   val sparkAvro = "2.0.1"
   val sparkCsv = "1.5.0"
   val scopt = "3.2.0"
-  val sqlite = "3.8.11.2"
+
   val typesafeConfig = "1.3.1"
+  val scalaTest = "3.0.4"
+  val scalaTestPlay = "3.1.2"
+
+  val sqlite = "3.8.11.2"
+  val postgres = "42.1.1"
+
+  val playJson = "2.5.14"
+  val squeryl = "0.9.9"
+  val jerkson = "0.6.9"
+  val webjars = "2.7.3"
+  val swaggerUi = "3.1.5"
 }
diff --git a/project/build.properties b/project/build.properties
index 07d9935..7609b47 100644
--- a/project/build.properties
+++ b/project/build.properties
@@ -1 +1 @@
-sbt.version = 0.13.18
\ No newline at end of file
+sbt.version = 1.2.8
\ No newline at end of file
diff --git a/project/plugins.sbt b/project/plugins.sbt
index 10d5e42..cedddee 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -1,22 +1,18 @@
 import sbt.addSbtPlugin
 
-logLevel := Level.Warn
+// General plugins
+addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "5.2.4") // Eclipse compatibility
+addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.2.1")
+addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.10.0-RC1")
+addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.5.0")
 
-addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.3")
-addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "4.0.0")
-addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.2.0")
+addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.10")          // Creates fat Jars
+addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.6.1") // Creates universal packages
 
-// provides server side compilation of typescript to ecmascript 5 or 3
-addSbtPlugin("name.de-vries" % "sbt-typescript" % "2.5.2")
-// checks your typescript code for error prone constructions
-//addSbtPlugin("name.de-vries" % "sbt-tslint" % "5.1.0")
-// runs jasmine tests
-addSbtPlugin("name.de-vries" % "sbt-jasmine" % "0.0.3")
-addSbtPlugin("com.typesafe.sbt" % "sbt-digest" % "1.1.0")
-addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.1.10")
-addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.8.2")
-
-resolvers += "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/"
-// The Play plugin
+// Web application plugins
 addSbtPlugin("com.typesafe.play" % "sbt-plugin" % "2.7.3")
-//addSbtPlugin("org.irundaia.sbt" % "sbt-sassify" % "1.4.13")
\ No newline at end of file
+addSbtPlugin("com.typesafe.sbt" % "sbt-digest" % "1.1.4")
+
+addSbtPlugin("name.de-vries" % "sbt-typescript" % "2.6.2")
+addSbtPlugin("name.de-vries" % "sbt-tslint" % "5.7.0")
+addSbtPlugin("name.de-vries" % "sbt-jasmine" % "0.0.4")
\ No newline at end of file
diff --git a/project/src/main/scala/BuildSparkPlugin.scala b/project/src/main/scala/BuildSparkPlugin.scala
new file mode 100644
index 0000000..7bd3df1
--- /dev/null
+++ b/project/src/main/scala/BuildSparkPlugin.scala
@@ -0,0 +1,36 @@
+package src.main.scala
+
+import sbt._
+import sbt.Keys._
+import sbt.plugins.JvmPlugin
+
+/** Sets the Spark and Scala versions. */
+object BuildSparkPlugin extends AutoPlugin {
+  object autoImport {
+    val sparkVersion = settingKey[String]("The version of Apache Spark used for building")
+  }
+
+  import autoImport._
+
+  // make sure it triggers automatically
+  override def trigger = AllRequirements
+  override def requires = JvmPlugin
+
+  override def projectSettings: Seq[Setting[_]] = Seq(
+    sparkVersion := "2.4.0", // default; override with: set sparkVersion := "1.6.0"
+    scalaVersion := {
+      sparkVersion.value match { // NB: plain string comparison on version strings
+        case v if v >= "2.4.5" => "2.12.10"
+        case v if v >= "2.0.0" => "2.11.12"
+        case v if v >= "1.6.0" => "2.10.7"
+        case _                 => "2.11.12" // default
+      }
+    },
+    onLoadMessage := {
+      s"""|${onLoadMessage.value}
+          |Current Spark version: ${sparkVersion.value}
+          |Current Scala version: ${scalaVersion.value}""".stripMargin
+    }
+  )
+
+}