
Commit

support static_index.html docs (#999)
- Support `static_index.html` for dbt docs.
- Refactor remote filesystem access functions in anticipation of moving
them out of `cosmos/plugin/__init__.py`. The refactoring is designed to
make them behave a little more predictably and to make them look a
little more like Airflow 2.8+'s `ObjectStoragePath` class (see the
comparison sketch below). Of course, this is far, far from complete.
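
For comparison, here is a minimal sketch of the `ObjectStoragePath` shape the refactor is nudging toward. This is illustrative only, not code from this commit, and it assumes Airflow 2.8+ with the connection passed via the `conn_id` keyword:

    # Rough comparison only; not part of this change.
    from typing import Optional

    from airflow.io.path import ObjectStoragePath  # available in Airflow 2.8+


    def read_remote_file(path: str, conn_id: Optional[str] = None) -> str:
        """Read a remote object as text, raising FileNotFoundError if it is missing."""
        obj = ObjectStoragePath(path, conn_id=conn_id)
        if not obj.exists():
            raise FileNotFoundError(f"{path} does not exist")
        return obj.read_text()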

# Related Issue(s)

- Main: #986
- Related: #927
dwreeves committed Jun 10, 2024
1 parent fbe6ff1 commit 74c28e2
Showing 5 changed files with 181 additions and 59 deletions.
86 changes: 61 additions & 25 deletions cosmos/plugin/__init__.py
@@ -9,7 +9,7 @@
 from flask import abort, url_for
 from flask_appbuilder import AppBuilder, expose

-from cosmos.settings import dbt_docs_conn_id, dbt_docs_dir
+from cosmos.settings import dbt_docs_conn_id, dbt_docs_dir, dbt_docs_index_file_name


 def bucket_and_key(path: str) -> Tuple[str, str]:
@@ -19,65 +19,89 @@ def bucket_and_key(path: str) -> Tuple[str, str]:
     return bucket, key


-def open_s3_file(conn_id: Optional[str], path: str) -> str:
+def open_s3_file(path: str, conn_id: Optional[str]) -> str:
     from airflow.providers.amazon.aws.hooks.s3 import S3Hook
+    from botocore.exceptions import ClientError

     if conn_id is None:
         conn_id = S3Hook.default_conn_name

     hook = S3Hook(aws_conn_id=conn_id)
     bucket, key = bucket_and_key(path)
-    content = hook.read_key(key=key, bucket_name=bucket)
+    try:
+        content = hook.read_key(key=key, bucket_name=bucket)
+    except ClientError as e:
+        if e.response.get("Error", {}).get("Code", "") == "NoSuchKey":
+            raise FileNotFoundError(f"{path} does not exist")
+        raise e
     return content  # type: ignore[no-any-return]


-def open_gcs_file(conn_id: Optional[str], path: str) -> str:
+def open_gcs_file(path: str, conn_id: Optional[str]) -> str:
     from airflow.providers.google.cloud.hooks.gcs import GCSHook
+    from google.cloud.exceptions import NotFound

     if conn_id is None:
         conn_id = GCSHook.default_conn_name

     hook = GCSHook(gcp_conn_id=conn_id)
     bucket, blob = bucket_and_key(path)
-    content = hook.download(bucket_name=bucket, object_name=blob)
+    try:
+        content = hook.download(bucket_name=bucket, object_name=blob)
+    except NotFound:
+        raise FileNotFoundError(f"{path} does not exist")
     return content.decode("utf-8")  # type: ignore[no-any-return]


-def open_azure_file(conn_id: Optional[str], path: str) -> str:
+def open_azure_file(path: str, conn_id: Optional[str]) -> str:
     from airflow.providers.microsoft.azure.hooks.wasb import WasbHook
+    from azure.core.exceptions import ResourceNotFoundError

     if conn_id is None:
         conn_id = WasbHook.default_conn_name

     hook = WasbHook(wasb_conn_id=conn_id)

     container, blob = bucket_and_key(path)
-    content = hook.read_file(container_name=container, blob_name=blob)
+    try:
+        content = hook.read_file(container_name=container, blob_name=blob)
+    except ResourceNotFoundError:
+        raise FileNotFoundError(f"{path} does not exist")
     return content  # type: ignore[no-any-return]


-def open_http_file(conn_id: Optional[str], path: str) -> str:
+def open_http_file(path: str, conn_id: Optional[str]) -> str:
     from airflow.providers.http.hooks.http import HttpHook
+    from requests.exceptions import HTTPError

     if conn_id is None:
         conn_id = ""

     hook = HttpHook(method="GET", http_conn_id=conn_id)
-    res = hook.run(endpoint=path)
-    hook.check_response(res)
+    try:
+        res = hook.run(endpoint=path)
+        hook.check_response(res)
+    except HTTPError as e:
+        if str(e).startswith("404"):
+            raise FileNotFoundError(f"{path} does not exist")
+        raise e
     return res.text  # type: ignore[no-any-return]


-def open_file(path: str) -> str:
-    """Retrieve a file from http, https, gs, s3, or wasb."""
+def open_file(path: str, conn_id: Optional[str] = None) -> str:
+    """
+    Retrieve a file from http, https, gs, s3, or wasb.
+
+    Raise a (base Python) FileNotFoundError if the file is not found.
+    """
     if path.strip().startswith("s3://"):
-        return open_s3_file(conn_id=dbt_docs_conn_id, path=path)
+        return open_s3_file(path, conn_id=conn_id)
     elif path.strip().startswith("gs://"):
-        return open_gcs_file(conn_id=dbt_docs_conn_id, path=path)
+        return open_gcs_file(path, conn_id=conn_id)
     elif path.strip().startswith("wasb://"):
-        return open_azure_file(conn_id=dbt_docs_conn_id, path=path)
+        return open_azure_file(path, conn_id=conn_id)
     elif path.strip().startswith("http://") or path.strip().startswith("https://"):
-        return open_http_file(conn_id=dbt_docs_conn_id, path=path)
+        return open_http_file(path, conn_id=conn_id)
     else:
         with open(path) as f:
             content = f.read()
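
For reference, a hypothetical call site for the refactored dispatcher; the bucket, key, and connection name below are invented for illustration:

    from cosmos.plugin import open_file

    try:
        html = open_file("s3://my-bucket/dbt-docs/static_index.html", conn_id="aws_default")
    except FileNotFoundError:
        html = "<p>dbt docs have not been generated yet.</p>"
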
@@ -167,27 +191,39 @@ def dbt_docs(self) -> str:
     def dbt_docs_index(self) -> str:
         if dbt_docs_dir is None:
             abort(404)
-        html = open_file(op.join(dbt_docs_dir, "index.html"))
-        # Hack the dbt docs to render properly in an iframe
-        iframe_resizer_url = url_for(".static", filename="iframeResizer.contentWindow.min.js")
-        html = html.replace("</head>", f'{iframe_script}<script src="{iframe_resizer_url}"></script></head>', 1)
-        return html
+        try:
+            html = open_file(op.join(dbt_docs_dir, dbt_docs_index_file_name), conn_id=dbt_docs_conn_id)
+        except FileNotFoundError:
+            abort(404)
+        else:
+            # Hack the dbt docs to render properly in an iframe
+            iframe_resizer_url = url_for(".static", filename="iframeResizer.contentWindow.min.js")
+            html = html.replace("</head>", f'{iframe_script}<script src="{iframe_resizer_url}"></script></head>', 1)
+            return html

     @expose("/catalog.json")  # type: ignore[misc]
     @has_access([(permissions.ACTION_CAN_READ, permissions.RESOURCE_WEBSITE)])
     def catalog(self) -> Tuple[str, int, Dict[str, Any]]:
         if dbt_docs_dir is None:
             abort(404)
-        data = open_file(op.join(dbt_docs_dir, "catalog.json"))
-        return data, 200, {"Content-Type": "application/json"}
+        try:
+            data = open_file(op.join(dbt_docs_dir, "catalog.json"), conn_id=dbt_docs_conn_id)
+        except FileNotFoundError:
+            abort(404)
+        else:
+            return data, 200, {"Content-Type": "application/json"}

     @expose("/manifest.json")  # type: ignore[misc]
     @has_access([(permissions.ACTION_CAN_READ, permissions.RESOURCE_WEBSITE)])
     def manifest(self) -> Tuple[str, int, Dict[str, Any]]:
         if dbt_docs_dir is None:
             abort(404)
-        data = open_file(op.join(dbt_docs_dir, "manifest.json"))
-        return data, 200, {"Content-Type": "application/json"}
+        try:
+            data = open_file(op.join(dbt_docs_dir, "manifest.json"), conn_id=dbt_docs_conn_id)
+        except FileNotFoundError:
+            abort(404)
+        else:
+            return data, 200, {"Content-Type": "application/json"}


 dbt_docs_view = DbtDocsView()
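
The index view keeps the existing trick of splicing the iframe-resizer script in just before `</head>`. A tiny self-contained sketch of that string surgery, using placeholder values rather than the plugin's real `iframe_script` and docs HTML:

    # Stand-in values; the real script tag and HTML come from the plugin and dbt.
    iframe_script = "<script>/* iframe-resizer bootstrap */</script>"
    html = "<html><head><title>dbt Docs</title></head><body></body></html>"

    patched = html.replace("</head>", f"{iframe_script}</head>", 1)
    # The script is injected exactly once, immediately before </head>.
    assert patched.count("<script>") == 1
    assert patched.index("<script>") < patched.index("</head>")
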
1 change: 1 addition & 0 deletions cosmos/settings.py
@@ -14,6 +14,7 @@
 propagate_logs = conf.getboolean("cosmos", "propagate_logs", fallback=True)
 dbt_docs_dir = conf.get("cosmos", "dbt_docs_dir", fallback=None)
 dbt_docs_conn_id = conf.get("cosmos", "dbt_docs_conn_id", fallback=None)
+dbt_docs_index_file_name = conf.get("cosmos", "dbt_docs_index_file_name", fallback="index.html")

 try:
     LINEAGE_NAMESPACE = conf.get("openlineage", "namespace")
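
Because the new setting is read through Airflow's `conf` object, it should also be controllable with Airflow's usual environment-variable override. A sketch, with an example value:

    import os

    # Equivalent, per Airflow's section/key override convention, to setting
    # dbt_docs_index_file_name = static_index.html under [cosmos] in airflow.cfg.
    os.environ["AIRFLOW__COSMOS__DBT_DOCS_INDEX_FILE_NAME"] = "static_index.html"
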
8 changes: 8 additions & 0 deletions docs/configuration/hosting-docs.rst
@@ -34,6 +34,14 @@ or as an environment variable:
 The path can be either a folder in the local file system the webserver is running on, or a URI to a cloud storage platform (S3, GCS, Azure).

+If your docs were generated using the ``--static`` flag, you can set the index filename using ``dbt_docs_index_file_name``:
+
+.. code-block:: cfg
+
+    [cosmos]
+    dbt_docs_index_file_name = static_index.html
+
 Host from Cloud Storage
 ~~~~~~~~~~~~~~~~~~~~~~~

2 changes: 2 additions & 0 deletions pyproject.toml
@@ -34,6 +34,7 @@ dependencies = [
     "importlib-metadata; python_version < '3.8'",
     "Jinja2>=3.0.0",
     "msgpack",
+    "packaging",
     "pydantic>=1.10.0",
     "typing-extensions; python_version < '3.8'",
     "virtualenv",
@@ -127,6 +128,7 @@ dependencies = [
     "apache-airflow-providers-cncf-kubernetes>=5.1.1",
     "apache-airflow-providers-amazon>=3.0.0,<8.20.0",  # https://github.com/apache/airflow/issues/39103
     "apache-airflow-providers-docker>=3.5.0",
+    "apache-airflow-providers-google",
     "apache-airflow-providers-microsoft-azure",
     "apache-airflow-providers-postgres",
     "types-PyYAML",
