DBT project for Airbyte normalization (#802)
Add a step that generates the config files and SQL files needed to run DBT, which takes care of converting JSON blob data (from the catalog.json JSON Schema) into normalized tables.

Co-authored-by: cgardens <giardina.charles@gmail.com>
Co-authored-by: Sherif Nada <snadalive@gmail.com>
3 people committed Nov 6, 2020
1 parent e823bcb commit d042ef5
Showing 23 changed files with 635 additions and 25 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -15,3 +15,5 @@ __pycache__
.venv
.mypy_cache

# dbt
profiles.yml
@@ -3,3 +3,4 @@
!entrypoint.sh
!setup.py
!normalization
!dbt-project-template
3 changes: 3 additions & 0 deletions airbyte-integrations/bases/base-normalization/Dockerfile
@@ -10,6 +10,9 @@ WORKDIR /airbyte/normalization_code
COPY normalization ./normalization
COPY setup.py .
RUN pip install .
COPY dbt-project-template/macros ./dbt-template/macros
COPY dbt-project-template/dbt_project.yml ./dbt-template/dbt_project.yml
COPY dbt-project-template/packages.yml ./dbt-template/packages.yml

WORKDIR /airbyte

@@ -0,0 +1,4 @@
build/
logs/
models/generated/
dbt_modules/
@@ -0,0 +1,19 @@
## Installing DBT

1. Activate your venv and run `pip3 install dbt`
1. Copy `airbyte-normalization/sample_files/profiles.yml` over to `~/.dbt/profiles.yml`
1. Edit it to configure your profiles accordingly

## Running DBT

1. `cd airbyte-normalization`
1. You can now run DBT commands. To check that the setup is fine: `dbt debug`
1. To build the DBT tables in your warehouse: `dbt run`

## Running DBT from Airbyte generated config

1. You can also change directory (for example, `cd /tmp/dev_root/workspace/1/0/normalize`) to one of the workspaces generated by Airbyte, inside one of the `normalize` folders.
1. You should find `profiles.yml` and a bunch of other DBT files/folders created there.
1. To check that everything is set up properly: `dbt debug --profiles-dir=$(pwd) --project-dir=$(pwd)`
1. You can also modify the `.sql` files and run `dbt run --profiles-dir=$(pwd) --project-dir=$(pwd)`
1. You can inspect the compiled DBT `.sql` files before they are run against the destination engine in the `normalize/build/compiled` or `normalize/build/run` folders
@@ -0,0 +1,40 @@
# Name your package! Package names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
name: 'airbyte'
version: '1.0'
config-version: 2

# This setting configures which "profile" dbt uses for this project. Profiles contain
# database connection information, and should be configured in the ~/.dbt/profiles.yml file
profile: 'normalize'

# These configurations specify where dbt should look for different types of files.
# The `source-paths` config, for example, states that source models can be found
# in the "models/" directory. You probably won't need to change these!
source-paths: ["models"]
docs-paths: ["docs"]
analysis-paths: ["analysis"]
test-paths: ["tests"]
data-paths: ["data"]
macro-paths: ["macros"]

target-path: "build" # directory which will store compiled SQL files
clean-targets: # directories to be removed by `dbt clean`
  - "build"
  - "dbt_modules"

# https://docs.getdbt.com/reference/project-configs/quoting/
quoting:
  database: true
  schema: true
  identifier: true

# You can define configurations for models in the `source-paths` directory here.
# Using these configurations, you can enable or disable models, change how they
# are materialized, and more!
models:
  airbyte:
    # Schema (dataset) defined in profiles.yml is concatenated with the schema below for dbt's final output
    +schema: NORMALIZED
    +materialized: view
@@ -0,0 +1,33 @@
{#
Adapter Macros for the following functions:
- Bigquery: unnest() -> https://cloud.google.com/bigquery/docs/reference/standard-sql/arrays#flattening-arrays-and-repeated-fields
- Snowflake: flatten() -> https://docs.snowflake.com/en/sql-reference/functions/flatten.html
- Redshift: -> https://blog.getdbt.com/how-to-unnest-arrays-in-redshift/
- postgres: unnest() -> https://www.postgresqltutorial.com/postgresql-array/
#}
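The comment above maps each warehouse to its array-flattening construct. As an illustrative sketch (not part of this commit), the SQL emitted by the adapter-dispatched `unnest` macros below can be mirrored in Python:

```python
def unnest_sql(adapter: str, array_col: str) -> str:
    """Illustrative mirror of the adapter-dispatched unnest macros."""
    if adapter == "snowflake":
        # Snowflake flattens arrays via table(flatten(...))
        return f"table(flatten({array_col}))"
    if adapter == "redshift":
        # Left unimplemented in this commit (see the dbt blog post linked above)
        raise NotImplementedError("redshift unnest is a FIXME in this commit")
    # default, bigquery and postgres all use plain unnest()
    return f"unnest({array_col})"

assert unnest_sql("bigquery", "data") == "unnest(data)"
assert unnest_sql("snowflake", "data") == "table(flatten(data))"
```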

{# flatten ------------------------------------------------- #}

{% macro unnest(array_col) -%}
{{ adapter.dispatch('unnest')(array_col) }}
{%- endmacro %}

{% macro default__unnest(array_col) -%}
unnest({{ adapter.quote_as_configured(array_col, 'identifier')|trim }})
{%- endmacro %}

{% macro bigquery__unnest(array_col) -%}
unnest({{ adapter.quote_as_configured(array_col, 'identifier')|trim }})
{%- endmacro %}

{% macro postgres__unnest(array_col) -%}
unnest({{ adapter.quote_as_configured(array_col, 'identifier')|trim }})
{%- endmacro %}

{% macro redshift__unnest(array_col) -%}
-- FIXME to implement as described here? https://blog.getdbt.com/how-to-unnest-arrays-in-redshift/
{%- endmacro %}

{% macro snowflake__unnest(array_col) -%}
table(flatten({{ adapter.quote_as_configured(array_col, 'identifier')|trim }}))
{%- endmacro %}
@@ -0,0 +1,110 @@
{#
Adapter Macros for the following functions:
- Bigquery: JSON_EXTRACT(json_string_expr, json_path_format) -> https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions
- Snowflake: JSON_EXTRACT_PATH_TEXT( <column_identifier> , '<path_name>' ) -> https://docs.snowflake.com/en/sql-reference/functions/json_extract_path_text.html
- Redshift: json_extract_path_text('json_string', 'path_elem' [,'path_elem'[, …] ] [, null_if_invalid ] ) -> https://docs.aws.amazon.com/redshift/latest/dg/JSON_EXTRACT_PATH_TEXT.html
- Postgres: json_extract_path_text(<from_json>, 'path' [, 'path' [, ...]]) -> https://www.postgresql.org/docs/12/functions-json.html
#}
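Each warehouse addresses JSON paths differently, as listed above. The join logic of the `format_json_path` macros below can be mirrored in Python (an illustrative sketch, not part of this commit; the Snowflake variant is omitted):

```python
def format_json_path(adapter: str, path: list) -> str:
    """Illustrative mirror of the per-adapter format_json_path macros."""
    if adapter == "bigquery":
        # JSONPath string with double-quoted elements, e.g. "$.a"."b"
        return '"$.' + '"."'.join(path) + '"'
    if adapter in ("postgres", "redshift"):
        # Each path element becomes its own single-quoted argument: 'a','b'
        return "'" + "','".join(path) + "'"
    # default: a dotted path prefixed with '.'
    return "." + ".".join(path)

assert format_json_path("bigquery", ["a", "b"]) == '"$.a"."b"'
assert format_json_path("postgres", ["a", "b"]) == "'a','b'"
assert format_json_path("other", ["a", "b"]) == ".a.b"
```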

{# format_json_path -------------------------------------------------- #}
{% macro format_json_path(json_path_list) -%}
{{ adapter.dispatch('format_json_path')(json_path_list) }}
{%- endmacro %}

{% macro default__format_json_path(json_path_list) -%}
{{ '.' ~ json_path_list|join('.') }}
{%- endmacro %}

{% macro bigquery__format_json_path(json_path_list) -%}
{{ '"$.' ~ json_path_list|join('"."') ~ '"' }}
{%- endmacro %}

{% macro postgres__format_json_path(json_path_list) -%}
{{ "'" ~ json_path_list|join("','") ~ "'" }}
{%- endmacro %}

{% macro redshift__format_json_path(json_path_list) -%}
{{ "'" ~ json_path_list|join("','") ~ "'" }}
{%- endmacro %}

{% macro snowflake__format_json_path(json_path_list) -%}
{{ "'\"" ~ json_path_list|join('"."') ~ "\"'" }}
{%- endmacro %}

{# json_extract ------------------------------------------------- #}

{% macro json_extract(json_column, json_path_list) -%}
{{ adapter.dispatch('json_extract')(json_column, json_path_list) }}
{%- endmacro %}

{% macro default__json_extract(json_column, json_path_list) -%}
json_extract({{ adapter.quote_as_configured(json_column, 'identifier')|trim }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}

{% macro bigquery__json_extract(json_column, json_path_list) -%}
json_extract({{ adapter.quote_as_configured(json_column, 'identifier')|trim }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}

{% macro postgres__json_extract(json_column, json_path_list) -%}
jsonb_extract_path_text({{ adapter.quote_as_configured(json_column, 'identifier')|trim }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}

{% macro redshift__json_extract(json_column, json_path_list) -%}
json_extract_path_text({{ adapter.quote_as_configured(json_column, 'identifier')|trim }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}

{% macro snowflake__json_extract(json_column, json_path_list) -%}
json_extract_path_text({{ adapter.quote_as_configured(json_column, 'identifier')|trim }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}

{# json_extract_scalar ------------------------------------------------- #}

{% macro json_extract_scalar(json_column, json_path_list) -%}
{{ adapter.dispatch('json_extract_scalar')(json_column, json_path_list) }}
{%- endmacro %}

{% macro default__json_extract_scalar(json_column, json_path_list) -%}
json_extract_scalar({{ adapter.quote_as_configured(json_column, 'identifier')|trim }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}

{% macro bigquery__json_extract_scalar(json_column, json_path_list) -%}
json_extract_scalar({{ adapter.quote_as_configured(json_column, 'identifier')|trim }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}

{% macro postgres__json_extract_scalar(json_column, json_path_list) -%}
jsonb_extract_path_text({{ adapter.quote_as_configured(json_column, 'identifier')|trim }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}

{% macro redshift__json_extract_scalar(json_column, json_path_list) -%}
json_extract_path_text({{ adapter.quote_as_configured(json_column, 'identifier')|trim }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}

{% macro snowflake__json_extract_scalar(json_column, json_path_list) -%}
json_extract_path_text({{ adapter.quote_as_configured(json_column, 'identifier')|trim }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}

{# json_extract_array ------------------------------------------------- #}

{% macro json_extract_array(json_column, json_path_list) -%}
{{ adapter.dispatch('json_extract_array')(json_column, json_path_list) }}
{%- endmacro %}

{% macro default__json_extract_array(json_column, json_path_list) -%}
json_extract_array({{ adapter.quote_as_configured(json_column, 'identifier')|trim }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}

{% macro bigquery__json_extract_array(json_column, json_path_list) -%}
json_extract_array({{ adapter.quote_as_configured(json_column, 'identifier')|trim }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}

{% macro postgres__json_extract_array(json_column, json_path_list) -%}
jsonb_array_elements(jsonb_extract_path({{ adapter.quote_as_configured(json_column, 'identifier')|trim }}, {{ format_json_path(json_path_list) }}))
{%- endmacro %}

{% macro redshift__json_extract_array(json_column, json_path_list) -%}
json_extract_path_text({{ adapter.quote_as_configured(json_column, 'identifier')|trim }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}

{% macro snowflake__json_extract_array(json_column, json_path_list) -%}
json_extract_path_text({{ adapter.quote_as_configured(json_column, 'identifier')|trim }}, {{ format_json_path(json_path_list) }})
{%- endmacro %}
@@ -0,0 +1,5 @@
# add dependencies. these will get pulled during the `dbt deps` process.

packages:
  - git: "https://github.com/fishtown-analytics/dbt-utils.git"
    revision: 0.6.2
24 changes: 13 additions & 11 deletions airbyte-integrations/bases/base-normalization/entrypoint.sh
@@ -2,10 +2,6 @@

set -e

# dbt looks specifically for files named profiles.yml and dbt_project.yml
DBT_PROFILE=profiles.yml
DBT_MODEL=dbt_project.yml

function echo2() {
echo >&2 "$@"
}
@@ -15,6 +11,8 @@ function error() {
exit 1
}

PROJECT_DIR=$(pwd)

## todo: make it easy to select source or destination and validate based on selection by adding an integration type env variable.
function main() {
CMD="$1"
@@ -43,15 +41,19 @@ function main() {

case "$CMD" in
run)
transform-config --config "$CONFIG_FILE" --integration-type "$INTEGRATION_TYPE" --out "$DBT_PROFILE"
# todo (cgardens) - @ChristopheDuong adjust these args if necessary.
transform-catalog --catalog "$CATALOG_FILE" --integration-type "$INTEGRATION_TYPE" --out "$DBT_MODEL"

# todo (cgardens) - @ChristopheDuong this is my best guess at how we are supposed to invoke dbt. adjust when they are inevitably not quite right.
dbt run --profiles-dir $(pwd) --project-dir $(pwd) --full-refresh --fail-fast
cp -r /airbyte/normalization_code/dbt-template/* $PROJECT_DIR
transform-config --config "$CONFIG_FILE" --integration-type "$INTEGRATION_TYPE" --out $PROJECT_DIR
transform-catalog --profile-config-dir $PROJECT_DIR --catalog "$CATALOG_FILE" --out $PROJECT_DIR/models/generated/ --json-column data
dbt deps --profiles-dir $PROJECT_DIR --project-dir $PROJECT_DIR
dbt run --profiles-dir $PROJECT_DIR --project-dir $PROJECT_DIR
;;
dry-run)
error "Not Implemented"
cp -r /airbyte/normalization_code/dbt-template/* $PROJECT_DIR
transform-config --config "$CONFIG_FILE" --integration-type "$INTEGRATION_TYPE" --out $PROJECT_DIR
dbt debug --profiles-dir $PROJECT_DIR --project-dir $PROJECT_DIR
transform-catalog --profile-config-dir $PROJECT_DIR --catalog "$CATALOG_FILE" --out $PROJECT_DIR/models/generated/ --json-column data
dbt deps --profiles-dir $PROJECT_DIR --project-dir $PROJECT_DIR
dbt compile --profiles-dir $PROJECT_DIR --project-dir $PROJECT_DIR
;;
*)
error "Unknown command: $CMD"
@@ -0,0 +1,31 @@
"""
MIT License
Copyright (c) 2020 Airbyte
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

from enum import Enum


class DestinationType(Enum):
bigquery = "bigquery"
postgres = "postgres"
snowflake = "snowflake"
