Merge branch 'main' into redshift-benchmark-results
utkarsharma2 committed Sep 14, 2022
2 parents 52ff732 + 80deceb commit 9b6c74f
Showing 106 changed files with 1,693 additions and 365 deletions.
16 changes: 10 additions & 6 deletions .github/workflows/ci-python-sdk.yaml
@@ -4,10 +4,14 @@ on:
branches: [ 'main', 'release-**' ]
paths:
- 'python-sdk/**'
- '.github/workflows/ci-python-sdk.yaml'
- '*'
pull_request:
branches: [ 'main', 'release-**' ]
paths:
- 'python-sdk/**'
- '.github/workflows/ci-python-sdk.yaml'
- '*'
# Run on PRs from forks
pull_request_target:
branches: [ 'main' ]
@@ -43,7 +47,7 @@ jobs:
path: |
~/.cache/pip
.nox
key: ${{ runner.os }}-${{ hashFiles('pyproject.toml') }}
key: ${{ runner.os }}-${{ hashFiles('python-sdk/pyproject.toml') }}
- run: pip3 install nox
- run: nox -s type_check

@@ -60,7 +64,7 @@ jobs:
path: |
~/.cache/pip
.nox
key: ${{ runner.os }}-${{ hashFiles('pyproject.toml') }}
key: ${{ runner.os }}-${{ hashFiles('python-sdk/pyproject.toml') }}
- run: pip3 install nox
- run: nox -s build_docs

@@ -111,7 +115,7 @@ jobs:
path: |
~/.cache/pip
.nox
key: ${{ runner.os }}-${{ hashFiles('pyproject.toml') }}
key: ${{ runner.os }}-${{ hashFiles('python-sdk/pyproject.toml') }}
- run: cat ../.github/ci-test-connections.yaml > test-connections.yaml
- run: python -c 'import os; print(os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON", "").strip())' > ${{ env.GOOGLE_APPLICATION_CREDENTIALS }}
- run: sqlite3 /tmp/sqlite_default.db "VACUUM;"
@@ -201,7 +205,7 @@ jobs:
path: |
~/.cache/pip
.nox
key: ${{ runner.os }}-2.3-${{ hashFiles('pyproject.toml') }}
key: ${{ runner.os }}-2.3-${{ hashFiles('python-sdk/pyproject.toml') }}
- run: cat ../.github/ci-test-connections.yaml > test-connections.yaml
- run: python -c 'import os; print(os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON", "").strip())' > ${{ env.GOOGLE_APPLICATION_CREDENTIALS }}
- run: sqlite3 /tmp/sqlite_default.db "VACUUM;"
@@ -292,7 +296,7 @@ jobs:
path: |
~/.cache/pip
.nox
key: ${{ runner.os }}-2.2.5-${{ hashFiles('pyproject.toml') }}
key: ${{ runner.os }}-2.2.5-${{ hashFiles('python-sdk/pyproject.toml') }}
- run: cat ../.github/ci-test-connections.yaml > test-connections.yaml
- run: python -c 'import os; print(os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON", "").strip())' > ${{ env.GOOGLE_APPLICATION_CREDENTIALS }}
- run: sqlite3 /tmp/sqlite_default.db "VACUUM;"
@@ -373,7 +377,7 @@ jobs:
- uses: actions/cache@v2
with:
path: ~/.cache/pip
key: ${{ hashFiles('pyproject.toml') }}
key: ${{ hashFiles('python-sdk/pyproject.toml') }}
- run: pip3 install nox
- run: nox -s build
- run: nox -s release -- dist/*
1 change: 1 addition & 0 deletions .gitignore
@@ -12,6 +12,7 @@ airflow_settings.yaml
.idea%
/build/
/dist/
**/dist/

# Benchmark
airflow.db
24 changes: 16 additions & 8 deletions .pre-commit-config.yaml
@@ -1,6 +1,8 @@
---
ci:
skip: [mypy]
skip:
- mypy # requires additional dependencies in ci
- identity # output is too verbose for ci; pre-commit.ci truncates almost all output after that
default_stages: [commit, push]
default_language_version:
# force all unspecified python hooks to run python3
@@ -70,15 +72,13 @@ repos:
rev: 5.10.1
hooks:
- id: isort
name: Run isort to sort imports in Python files
name: Run isort for python-sdk
args: [ "--profile", "black" ]
files: .py$
entry: bash -c 'cd python-sdk && isort --profile black .' --
files: ^python-sdk/
- id: isort
name: Run isort to sort imports in Python files
name: Run isort for sql-cli
args: [ "--profile", "black" ]
files: .py$
entry: bash -c 'cd sql-cli && isort --profile black .' --
files: ^sql-cli/

- repo: https://github.com/codespell-project/codespell
rev: v2.2.1
@@ -92,7 +92,7 @@ repos:
rev: 'v0.971'
hooks:
- id: mypy
additional_dependencies: [types-PyYAML, types-attrs]
additional_dependencies: [types-PyYAML, types-attrs, attrs]
pass_filenames: false
files: "^python-sdk/"
entry: bash -c 'cd python-sdk && mypy "$@"' --
@@ -107,3 +107,11 @@ repos:
hooks:
- id: pyupgrade
args: [--py37-plus]

- repo: local
hooks:
- id: check-context-typing-compat
name: Ensure modules use local typing compat for airflow.utils.context.Context
entry: python-sdk/dev/scripts/pre_commit_context_typing_compat.py
language: python
pass_filenames: false
18 changes: 9 additions & 9 deletions README.md
@@ -55,7 +55,7 @@ pip install astro-sdk-python[amazon,google,snowflake,postgres]

```shell
# The sqlite_default connection has different host for MAC vs. Linux
export SQL_TABLE_NAME=`airflow connections get sqlite_default -o yaml | grep host | awk '{print $2}'`
export SQL_TABLE_NAME=`airflow connections get sqlite_default -o yaml | grep host | awk '{print $2}'`
sqlite3 "$SQL_TABLE_NAME" "VACUUM;"
```

@@ -117,16 +117,16 @@ pip install astro-sdk-python[amazon,google,snowflake,postgres]

The following are some key functions available in the SDK:

- `load_file`: Load a given file into a SQL table
- `transform`: Applies a SQL select statement to a source table and saves the result to a destination table
- `drop_table`: Drops a SQL table
- `run_raw_sql`: Run any SQL statement without handling its output
- `append`: Insert rows from the source SQL table into the destination SQL table, if there are no conflicts
- `merge`: Insert rows from the source SQL table into the destination SQL table, depending on conflicts:
- [`load_file`](https://astro-sdk-python.readthedocs.io/en/stable/astro/sql/operators/load_file.html): Load a given file into a SQL table
- [`transform`](https://astro-sdk-python.readthedocs.io/en/stable/astro/sql/operators/transform.html): Applies a SQL select statement to a source table and saves the result to a destination table
- [`drop_table`](https://astro-sdk-python.readthedocs.io/en/stable/astro/sql/operators/drop_table.html): Drops a SQL table
- [`run_raw_sql`](https://astro-sdk-python.readthedocs.io/en/stable/astro/sql/operators/raw_sql.html): Run any SQL statement without handling its output
- [`append`](https://astro-sdk-python.readthedocs.io/en/stable/astro/sql/operators/append.html): Insert rows from the source SQL table into the destination SQL table, if there are no conflicts
- [`merge`](https://astro-sdk-python.readthedocs.io/en/stable/astro/sql/operators/merge.html): Insert rows from the source SQL table into the destination SQL table, depending on conflicts:
- `ignore`: Do not add rows that already exist
- `update`: Replace existing rows with new ones
- `export_file`: Export SQL table rows into a destination file
- `dataframe`: Export given SQL table into in-memory Pandas data-frame
- [`export_file`](https://astro-sdk-python.readthedocs.io/en/stable/astro/sql/operators/export.html): Export SQL table rows into a destination file
- [`dataframe`](https://astro-sdk-python.readthedocs.io/en/stable/astro/sql/operators/dataframe.html): Export a given SQL table into an in-memory Pandas DataFrame

For a full list of available operators, see the [SDK reference documentation](https://astro-sdk-python.readthedocs.io/en/stable/operators.html).
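
As an illustrative sketch (the connection id, file URL, and table names below are placeholders; the pattern mirrors the Datasets example in the changelog later in this diff), `load_file` and `transform` are typically chained inside a DAG:

```python
from datetime import datetime

from airflow import DAG

from astro import sql as aql
from astro.files import File
from astro.sql.table import Table


@aql.transform()
def top_rated(input_table: Table):
    # The decorated function returns a SELECT statement; its result is written to output_table.
    return "SELECT title, rating FROM {{input_table}} ORDER BY rating DESC LIMIT 5;"


with DAG(
    dag_id="example_readme_sketch",
    start_date=datetime(2022, 9, 1),
    schedule_interval=None,
    catchup=False,
):
    movies = aql.load_file(
        input_file=File(
            path="https://raw.githubusercontent.com/astronomer/astro-sdk/main/tests/data/imdb_v2.csv"
        ),
        output_table=Table(name="imdb_movies", conn_id="sqlite_default"),
    )
    top_rated(
        input_table=movies,
        output_table=Table(name="top_rated_movies", conn_id="sqlite_default"),
    )
```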

1 change: 1 addition & 0 deletions python-sdk/LICENSE
1 change: 0 additions & 1 deletion python-sdk/conftest.py
@@ -11,7 +11,6 @@
from airflow.utils import timezone
from airflow.utils.db import create_default_connections
from airflow.utils.session import create_session, provide_session

from astro.constants import Database, FileLocation, FileType
from astro.databases import create_database
from astro.sql.table import MAX_TABLE_NAME_LENGTH, Table
77 changes: 77 additions & 0 deletions python-sdk/dev/scripts/pre_commit_context_typing_compat.py
@@ -0,0 +1,77 @@
#!/usr/bin/env python
"""
Pre-commit hook to verify ``airflow.utils.context.Context`` is not imported in provider modules.
# TODO: This pre-commit hook can be removed once the repo has a minimum Apache Airflow requirement of 2.3.3+.
"""
from __future__ import annotations

import os
from ast import ImportFrom, NodeVisitor, parse
from pathlib import Path, PosixPath

SOURCES_ROOT = Path(__file__).parents[2]
ASTRO_ROOT = SOURCES_ROOT / "src" / "astro"
TYPING_COMPAT_PATH = "python-sdk/src/astro/utils/typing_compat.py"


class ImportCrawler(NodeVisitor):
"""AST crawler to determine if a module has an incompatible `airflow.utils.context.Context` import."""

def __init__(self) -> None:
self.has_incompatible_context_imports = False

def visit_ImportFrom(self, node: ImportFrom) -> None:
"""Visit an ImportFrom node to determine if `airflow.utils.context.Context` is imported directly."""
if self.has_incompatible_context_imports:
return

for alias in node.names:
if f"{node.module}.{alias.name}" == "airflow.utils.context.Context":
if not self.has_incompatible_context_imports:
self.has_incompatible_context_imports = True


def get_all_provider_files() -> list[PosixPath]:
"""Retrieve all eligible provider module files."""
provider_files = []
for (root, _, file_names) in os.walk(ASTRO_ROOT):
for file_name in file_names:
file_path = Path(root, file_name)
if (
file_path.is_file()
and file_path.name.endswith(".py")
and TYPING_COMPAT_PATH not in str(file_path)
):
provider_files.append(file_path)

return provider_files


def find_incompatible_context_imports(file_paths: list[PosixPath]) -> list[str]:
"""Retrieve any provider files that import `airflow.utils.context.Context` directly."""
incompatible_context_imports = []
for file_path in file_paths:
file_ast = parse(file_path.read_text(), filename=file_path.name)
crawler = ImportCrawler()
crawler.visit(file_ast)
if crawler.has_incompatible_context_imports:
incompatible_context_imports.append(str(file_path))

return incompatible_context_imports


if __name__ == "__main__":
provider_files = get_all_provider_files()
files_needing_typing_compat = find_incompatible_context_imports(provider_files)

if len(files_needing_typing_compat) > 0:
error_message = (
"The following files are importing `airflow.utils.context.Context`. "
"This is not compatible with the minimum `apache-airflow` requirement of this repository. "
"Please use `astro.utils.typing_compat.Context` instead.\n\n\t{}".format(
"\n".join(files_needing_typing_compat)
)
)

raise SystemExit(error_message)
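
For illustration only (the helper function below is hypothetical and not part of this commit), a module satisfies this hook by importing ``Context`` through the compatibility shim named in the error message rather than from ``airflow.utils.context`` directly:

```python
# Flagged by the hook (direct import; only importable at runtime on apache-airflow >= 2.3.3):
# from airflow.utils.context import Context

# Accepted by the hook: import the re-exported Context from the local typing-compat module.
from astro.utils.typing_compat import Context


def summarize(context: Context) -> str:
    """Hypothetical helper showing the annotation; real operators use Context the same way."""
    return str(context.get("ds", ""))
```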
70 changes: 70 additions & 0 deletions python-sdk/docs/CHANGELOG.md
@@ -1,4 +1,74 @@
# Changelog
## 1.1.0b1

### Features
* Add support for Redshift ([#639](https://github.com/astronomer/astro-sdk/pull/639),
[#753](https://github.com/astronomer/astro-sdk/pull/753),
[#700](https://github.com/astronomer/astro-sdk/pull/700))
* Support for Datasets introduced in Airflow 2.4 ([#786](https://github.com/astronomer/astro-sdk/pull/786),[#808](https://github.com/astronomer/astro-sdk/pull/808))

- `inlets` and `outlets` will be automatically set for all the operators.
- Users can now schedule DAGs on `File` and `Table` objects. Example:

```python
# Imports assumed for this example (not shown in the changelog snippet):
from datetime import datetime

from airflow import DAG

from astro import sql as aql
from astro.files import File
from astro.sql.table import Table

input_file = File(
path="https://raw.githubusercontent.com/astronomer/astro-sdk/main/tests/data/imdb_v2.csv"
)
imdb_movies_table = Table(name="imdb_movies", conn_id="sqlite_default")
top_animations_table = Table(name="top_animation", conn_id="sqlite_default")
START_DATE = datetime(2022, 9, 1)


@aql.transform()
def get_top_five_animations(input_table: Table):
return """
SELECT title, rating
FROM {{input_table}}
WHERE genre1='Animation'
ORDER BY rating desc
LIMIT 5;
"""


with DAG(
dag_id="example_dataset_producer",
schedule=None,
start_date=START_DATE,
catchup=False,
) as load_dag:
imdb_movies = aql.load_file(
input_file=input_file,
task_id="load_csv",
output_table=imdb_movies_table,
)

with DAG(
dag_id="example_dataset_consumer",
schedule=[imdb_movies_table],
start_date=START_DATE,
catchup=False,
) as transform_dag:
top_five_animations = get_top_five_animations(
input_table=imdb_movies_table,
output_table=top_animations_table,
)
```
* Dynamic Task Templates: Tasks that can be used with Dynamic Task Mapping (Airflow 2.3+); a usage sketch follows this list
* Get list of files from a Bucket - `get_file_list` ([#596](https://github.com/astronomer/astro-sdk/pull/596))
* Get list of values from a DB - `get_value_list` ([#673](https://github.com/astronomer/astro-sdk/pull/673))

* Create upstream_tasks parameter for dependencies independent of data transfers ([#585](https://github.com/astronomer/astro-sdk/pull/585))
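
A hedged sketch of the dynamic task template usage (the import paths, bucket path, and connection ids below are assumptions for illustration, not taken from this diff):

```python
from datetime import datetime

from airflow import DAG

from astro.files import get_file_list
from astro.sql.operators.load_file import LoadFileOperator
from astro.sql.table import Table

with DAG(
    dag_id="example_dynamic_load",
    schedule_interval=None,
    start_date=datetime(2022, 9, 1),
    catchup=False,
):
    # Dynamic Task Mapping (Airflow 2.3+): one mapped task instance per file returned by get_file_list.
    LoadFileOperator.partial(
        task_id="load_files",
        output_table=Table(name="imported_data", conn_id="sqlite_default"),
    ).expand(
        input_file=get_file_list(path="s3://my-bucket/data/", conn_id="aws_default")
    )
```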

### Bug fixes
* Add response_size to run_raw_sql and warn about db thrashing ([#815](https://github.com/astronomer/astro-sdk/pull/815))

### Docs
* Add section explaining table metadata ([#774](https://github.com/astronomer/astro-sdk/pull/774))
* Fix docstring for run_raw_sql ([#817](https://github.com/astronomer/astro-sdk/pull/817))
* Add missing docs for Table class ([#788](https://github.com/astronomer/astro-sdk/pull/788))
* Add the readme.md example dag to example dags folder ([#681](https://github.com/astronomer/astro-sdk/pull/681))
* Add reason for enabling XCOM pickling ([#747](https://github.com/astronomer/astro-sdk/pull/747))

## 1.0.2

### Bug fixes
6 changes: 4 additions & 2 deletions python-sdk/docs/astro/sql/operators/load_file.rst
@@ -80,11 +80,13 @@ Parameters to use when loading a file to a database table
Inferring a Table Schema
~~~~~~~~~~~~~~~~~~~~~~~~

There are two ways to infer the schema of the table to be created, listed by priority:
There are three ways to infer the schema of the table to be created, listed by priority:

#. **User specified schema** - You can specify the schema of the table to be created in the Table object, like the ``output_table`` section in :ref:`custom_schema`

#. **Auto schema detection** - if you don't specify the schema in the table object, then ``load_file`` will infer the schema using the top 1000 rows. The default value of rows to look at is 1000, but this can be changed by creating an environment variable.
#. **Native auto schema detection** - If the database provides its own schema inference mechanism, it is used in preference to the pandas auto schema detection below.

#. **Pandas auto schema detection** - If you don't specify the schema in the Table object, ``load_file`` infers the schema from the top rows of the file. By default the top 1000 rows are inspected; this can be changed by setting an environment variable.

.. code-block:: shell
