Add OpenLineage support to BigQueryToGCSOperator (#35660)
kacpermuda committed Nov 17, 2023
1 parent 02165c5 commit ce16963
Showing 5 changed files with 571 additions and 1 deletion.
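In short: execute() now records the final BigQuery job id in self._job_id, and a new get_openlineage_facets_on_complete() method uses it to build OpenLineage input/output datasets and run facets. For orientation, a minimal sketch of the kind of task this lineage applies to, assuming the OpenLineage provider is configured; the project, dataset, table, and bucket names are illustrative:

import pendulum
from airflow import DAG
from airflow.providers.google.cloud.transfers.bigquery_to_gcs import BigQueryToGCSOperator

with DAG(
    dag_id="bq_to_gcs_lineage_example",
    start_date=pendulum.datetime(2023, 11, 1, tz="UTC"),
    schedule=None,
    catchup=False,
):
    # With this change, the OpenLineage listener calls
    # get_openlineage_facets_on_complete() once the task finishes, emitting the
    # BigQuery table as the input dataset and one GCS dataset per destination URI.
    BigQueryToGCSOperator(
        task_id="export_table",
        source_project_dataset_table="my-project.my_dataset.my_table",
        destination_cloud_storage_uris=["gs://my-bucket/exports/part-*.csv"],
        export_format="CSV",
    )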
75 changes: 74 additions & 1 deletion airflow/providers/google/cloud/transfers/bigquery_to_gcs.py
@@ -29,6 +29,7 @@
from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook, BigQueryJob
from airflow.providers.google.cloud.links.bigquery import BigQueryTableLink
from airflow.providers.google.cloud.triggers.bigquery import BigQueryInsertJobTrigger
from airflow.utils.helpers import merge_dicts

if TYPE_CHECKING:
from google.api_core.retry import Retry
@@ -139,6 +140,8 @@ def __init__(
self.hook: BigQueryHook | None = None
self.deferrable = deferrable

self._job_id: str = ""

@staticmethod
def _handle_job_error(job: BigQueryJob | UnknownJob) -> None:
if job.error_result:
@@ -240,6 +243,7 @@ def execute(self, context: Context):
f"Or, if you want to reattach in this scenario add {job.state} to `reattach_states`"
)

self._job_id = job.job_id
conf = job.to_api_repr()["configuration"]["extract"]["sourceTable"]
dataset_id, project_id, table_id = conf["datasetId"], conf["projectId"], conf["tableId"]
BigQueryTableLink.persist(
@@ -255,7 +259,7 @@
timeout=self.execution_timeout,
trigger=BigQueryInsertJobTrigger(
conn_id=self.gcp_conn_id,
job_id=job_id,
job_id=self._job_id,
project_id=self.project_id or self.hook.project_id,
),
method_name="execute_complete",
@@ -276,3 +280,72 @@ def execute_complete(self, context: Context, event: dict[str, Any]):
self.task_id,
event["message"],
)

def get_openlineage_facets_on_complete(self, task_instance):
"""Implementing on_complete as we will include final BQ job id."""
from pathlib import Path

from openlineage.client.facet import (
ExternalQueryRunFacet,
SymlinksDatasetFacet,
SymlinksDatasetFacetIdentifiers,
)
from openlineage.client.run import Dataset

from airflow.providers.google.cloud.hooks.gcs import _parse_gcs_url
from airflow.providers.google.cloud.utils.openlineage import (
get_facets_from_bq_table,
get_identity_column_lineage_facet,
)
from airflow.providers.openlineage.extractors import OperatorLineage

table_object = self.hook.get_client(self.hook.project_id).get_table(self.source_project_dataset_table)

input_dataset = Dataset(
namespace="bigquery",
name=str(table_object.reference),
facets=get_facets_from_bq_table(table_object),
)

output_dataset_facets = {
"schema": input_dataset.facets["schema"],
"columnLineage": get_identity_column_lineage_facet(
field_names=[field.name for field in table_object.schema], input_datasets=[input_dataset]
),
}
output_datasets = []
for uri in sorted(self.destination_cloud_storage_uris):
bucket, blob = _parse_gcs_url(uri)
additional_facets = {}

if "*" in blob:
            # If a wildcard ("*") is used in the GCS path, we want the dataset name to be the
            # directory name, but we create a symlink to the full object path with the wildcard.
additional_facets = {
"symlink": SymlinksDatasetFacet(
identifiers=[
SymlinksDatasetFacetIdentifiers(
namespace=f"gs://{bucket}", name=blob, type="file"
)
]
),
}
blob = Path(blob).parent.as_posix()
if blob == ".":
# blob path does not have leading slash, but we need root dataset name to be "/"
blob = "/"

dataset = Dataset(
namespace=f"gs://{bucket}",
name=blob,
facets=merge_dicts(output_dataset_facets, additional_facets),
)
output_datasets.append(dataset)

run_facets = {}
if self._job_id:
run_facets = {
"externalQuery": ExternalQueryRunFacet(externalQueryId=self._job_id, source="bigquery"),
}

return OperatorLineage(inputs=[input_dataset], outputs=output_datasets, run_facets=run_facets)
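The wildcard handling above is the subtle part: when a destination URI contains "*", the output dataset is named after the blob's parent directory, while a symlink facet keeps the full wildcard path. A standalone sketch of just that naming rule (plain Python; dataset_name_for_gcs_uri is a hypothetical helper written for illustration, not part of this commit):

from pathlib import Path
from urllib.parse import urlsplit

def dataset_name_for_gcs_uri(uri: str) -> tuple[str, str]:
    """Mirror the rule above: a wildcard blob collapses to its parent directory."""
    parts = urlsplit(uri)
    bucket, blob = parts.netloc, parts.path.lstrip("/")
    if "*" in blob:
        blob = Path(blob).parent.as_posix()
        if blob == ".":
            # The blob has no directory component, so the dataset is the bucket root.
            blob = "/"
    return f"gs://{bucket}", blob

assert dataset_name_for_gcs_uri("gs://bkt/exports/part-*.csv") == ("gs://bkt", "exports")
assert dataset_name_for_gcs_uri("gs://bkt/part-*.csv") == ("gs://bkt", "/")
assert dataset_name_for_gcs_uri("gs://bkt/exports/data.csv") == ("gs://bkt", "exports/data.csv")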
80 changes: 80 additions & 0 deletions airflow/providers/google/cloud/utils/openlineage.py
@@ -0,0 +1,80 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""This module contains code related to OpenLineage and lineage extraction."""
from __future__ import annotations

from typing import TYPE_CHECKING, Any

from openlineage.client.facet import (
ColumnLineageDatasetFacet,
ColumnLineageDatasetFacetFieldsAdditional,
ColumnLineageDatasetFacetFieldsAdditionalInputFields,
DocumentationDatasetFacet,
SchemaDatasetFacet,
SchemaField,
)

if TYPE_CHECKING:
from google.cloud.bigquery.table import Table
from openlineage.client.run import Dataset


def get_facets_from_bq_table(table: Table) -> dict[Any, Any]:
"""Get facets from BigQuery table object."""
facets = {
"schema": SchemaDatasetFacet(
fields=[
SchemaField(name=field.name, type=field.field_type, description=field.description)
for field in table.schema
]
),
"documentation": DocumentationDatasetFacet(description=table.description or ""),
}

return facets


def get_identity_column_lineage_facet(
field_names: list[str],
input_datasets: list[Dataset],
) -> ColumnLineageDatasetFacet:
"""
Get column lineage facet.
Simple lineage will be created, where each source column corresponds to single destination column
in each input dataset and there are no transformations made.
"""
if field_names and not input_datasets:
        raise ValueError("When providing `field_names`, you must provide at least one `input_dataset`.")

column_lineage_facet = ColumnLineageDatasetFacet(
fields={
field: ColumnLineageDatasetFacetFieldsAdditional(
inputFields=[
ColumnLineageDatasetFacetFieldsAdditionalInputFields(
namespace=dataset.namespace, name=dataset.name, field=field
)
for dataset in input_datasets
],
transformationType="IDENTITY",
transformationDescription="identical",
)
for field in field_names
}
)
return column_lineage_facet
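A quick sketch of how the new helper composes with OpenLineage datasets, assuming the openlineage-python client is installed; the table and column names are made up:

from openlineage.client.run import Dataset

from airflow.providers.google.cloud.utils.openlineage import get_identity_column_lineage_facet

source = Dataset(namespace="bigquery", name="my-project.my_dataset.my_table")
facet = get_identity_column_lineage_facet(field_names=["id", "name"], input_datasets=[source])

# Each destination column maps 1:1 to the identically named source column.
assert facet.fields["id"].inputFields[0].field == "id"
assert facet.fields["name"].inputFields[0].namespace == "bigquery"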
1 change: 1 addition & 0 deletions docs/spelling_wordlist.txt
@@ -272,6 +272,7 @@ codepoints
Colour
colour
colours
ColumnLineageDatasetFacet
CommandType
comparator
compat