apache · villebro · Oct 31, 2022 · Oct 25, 2022 · Oct 25, 2022 · Oct 25, 2022
diff --git a/docs/docs/installation/alerts-reports.mdx b/docs/docs/installation/alerts-reports.mdx
@@ -371,18 +371,44 @@ to specify on behalf of which username to render the dashboards. In general dash
 are not accessible to unauthorized requests, that is why the worker needs to take over credentials
 of an existing user to take a snapshot.
 
+By default, Alerts and Reports are executed as the user that the `THUMBNAIL_SELENIUM_USER` config
+parameter is set to. To change this user, just change the config as follows:
+
 ```python
 THUMBNAIL_SELENIUM_USER = 'username_with_permission_to_access_dashboards'
 ```
 
+In addition, it's also possible to execute the reports as the report owners/creators. This is typically
+needed if there isn't a central service account that has access to all objects or databases (e.g.
+when using user impersonation on database connections). For this there's the config flag
+`ALERTS_REPORTS_EXECUTE_AS` which makes it possible to customize how alerts and reports are executed.
+To first try to execute as the creator in the owners list (if present), then fall
+back to the creator, then the last modifier in the owners list (if present), then the
+last modifier, then an owner (giving priority to the last modifier and then the
+creator if either is contained within the list of owners, otherwise the first owner
+will be used) and finally `THUMBNAIL_SELENIUM_USER`, set as follows:
+
+```python
+from superset.reports.types import ReportScheduleExecutor
+
+ALERT_REPORTS_EXECUTE_AS = [
+    ReportScheduleExecutor.CREATOR_OWNER,
+    ReportScheduleExecutor.CREATOR,
+    ReportScheduleExecutor.MODIFIER_OWNER,
+    ReportScheduleExecutor.MODIFIER,
+    ReportScheduleExecutor.OWNER,
+    ReportScheduleExecutor.SELENIUM,
+]
+```
+
 **Important notes**
 
 - Be mindful of the concurrency setting for celery (using `-c 4`). Selenium/webdriver instances can
   consume a lot of CPU / memory on your servers.
 - In some cases, if you notice a lot of leaked geckodriver processes, try running your celery
   processes with `celery worker --pool=prefork --max-tasks-per-child=128 ...`
 - It is recommended to run separate workers for the `sql_lab` and `email_reports` tasks. This can be
-  done using the `queue` field in `CELERY_ANNOTATIONS`.
+  done using the `queue` field in `task_annotations`.
 - Adjust `WEBDRIVER_BASEURL` in your configuration file if celery workers can’t access Superset via
   its default value of `http://0.0.0.0:8080/`.
 

diff --git a/superset/config.py b/superset/config.py
@@ -57,6 +57,7 @@
 from superset.advanced_data_type.types import AdvancedDataType
 from superset.constants import CHANGE_ME_SECRET_KEY
 from superset.jinja_context import BaseTemplateProcessor
+from superset.reports.types import ReportScheduleExecutor
 from superset.stats_logger import DummyStatsLogger
 from superset.superset_typing import CacheConfig
 from superset.utils.core import is_test, NO_TIME_RANGE, parse_boolean_string
@@ -1143,6 +1144,24 @@ def EMAIL_HEADER_MUTATOR(  # pylint: disable=invalid-name,unused-argument
 # sliding cron window size, should be synced with the celery beat config minus 1 second
 ALERT_REPORTS_CRON_WINDOW_SIZE = 59
 ALERT_REPORTS_WORKING_TIME_OUT_KILL = True
+# Which user to attempt to execute Alerts/Reports as. By default,
+# use the user defined in the `THUMBNAIL_SELENIUM_USER` config parameter.
+# To first try to execute as the creator in the owners list (if present), then fall
+# back to the creator, then the last modifier in the owners list (if present), then the
+# last modifier, then an owner (giving priority to the last modifier and then the
+# creator if either is contained within the list of owners, otherwise the first owner
+# will be used) and finally `THUMBNAIL_SELENIUM_USER`, set as follows:
+# ALERT_REPORTS_EXECUTE_AS = [
+#     ReportScheduleExecutor.CREATOR_OWNER,
+#     ReportScheduleExecutor.CREATOR,
+#     ReportScheduleExecutor.MODIFIER_OWNER,
+#     ReportScheduleExecutor.MODIFIER,
+#     ReportScheduleExecutor.OWNER,
+#     ReportScheduleExecutor.SELENIUM,
+# ]
+ALERT_REPORTS_EXECUTE_AS: List[ReportScheduleExecutor] = [
+    ReportScheduleExecutor.SELENIUM
+]
 # if ALERT_REPORTS_WORKING_TIME_OUT_KILL is True, set a celery hard timeout
 # Equal to working timeout + ALERT_REPORTS_WORKING_TIME_OUT_LAG
 ALERT_REPORTS_WORKING_TIME_OUT_LAG = int(timedelta(seconds=10).total_seconds())

diff --git a/superset/reports/commands/exceptions.py b/superset/reports/commands/exceptions.py
@@ -250,8 +250,8 @@ class ReportScheduleNotificationError(CommandException):
     message = _("Alert on grace period")
 
 
-class ReportScheduleSelleniumUserNotFoundError(CommandException):
-    message = _("Report Schedule sellenium user not found")
+class ReportScheduleUserNotFoundError(CommandException):
+    message = _("Report Schedule user not found")
 
 
 class ReportScheduleStateNotFoundError(CommandException):

diff --git a/superset/reports/commands/execute.py b/superset/reports/commands/execute.py
@@ -22,10 +22,9 @@
 
 import pandas as pd
 from celery.exceptions import SoftTimeLimitExceeded
-from flask_appbuilder.security.sqla.models import User
 from sqlalchemy.orm import Session
 
-from superset import app, security_manager
+from superset import app
 from superset.commands.base import BaseCommand
 from superset.commands.exceptions import CommandException
 from superset.common.chart_data import ChartDataResultFormat, ChartDataResultType
@@ -45,7 +44,6 @@
     ReportSchedulePreviousWorkingError,
     ReportScheduleScreenshotFailedError,
     ReportScheduleScreenshotTimeout,
-    ReportScheduleSelleniumUserNotFoundError,
     ReportScheduleStateNotFoundError,
     ReportScheduleUnexpectedError,
     ReportScheduleWorkingTimeoutError,
@@ -67,6 +65,7 @@
 from superset.reports.notifications import create_notification
 from superset.reports.notifications.base import NotificationContent
 from superset.reports.notifications.exceptions import NotificationError
+from superset.reports.utils import get_executor
 from superset.utils.celery import session_scope
 from superset.utils.core import HeaderDataType, override_user
 from superset.utils.csv import get_chart_csv_data, get_chart_dataframe
@@ -77,13 +76,6 @@
 logger = logging.getLogger(__name__)
 
 
-def _get_user() -> User:
-    user = security_manager.find_user(username=app.config["THUMBNAIL_SELENIUM_USER"])
-    if not user:
-        raise ReportScheduleSelleniumUserNotFoundError()
-    return user
-
-
 class BaseReportState:
     current_states: List[ReportState] = []
     initial: bool = False
@@ -182,11 +174,11 @@ def _get_url(
                 **kwargs,
             )
 
-        # If we need to render dashboard in a specific sate, use stateful permalink
+        # If we need to render dashboard in a specific state, use stateful permalink
         dashboard_state = self._report_schedule.extra.get("dashboard")
         if dashboard_state:
             permalink_key = CreateDashboardPermalinkCommand(
-                dashboard_id=self._report_schedule.dashboard_id,
+                dashboard_id=str(self._report_schedule.dashboard_id),
                 state=dashboard_state,
             ).run()
             return get_url_path("Superset.dashboard_permalink", key=permalink_key)
@@ -206,7 +198,7 @@ def _get_screenshots(self) -> List[bytes]:
         :raises: ReportScheduleScreenshotFailedError
         """
         url = self._get_url()
-        user = _get_user()
+        user = get_executor(self._report_schedule)
         if self._report_schedule.chart:
             screenshot: Union[ChartScreenshot, DashboardScreenshot] = ChartScreenshot(
                 url,
@@ -236,16 +228,15 @@ def _get_screenshots(self) -> List[bytes]:
 
     def _get_csv_data(self) -> bytes:
         url = self._get_url(result_format=ChartDataResultFormat.CSV)
-        auth_cookies = machine_auth_provider_factory.instance.get_auth_cookies(
-            _get_user()
-        )
+        user = get_executor(self._report_schedule)
+        auth_cookies = machine_auth_provider_factory.instance.get_auth_cookies(user)
 
         if self._report_schedule.chart.query_context is None:
             logger.warning("No query context found, taking a screenshot to generate it")
             self._update_query_context()
 
         try:
-            logger.info("Getting chart from %s", url)
+            logger.info("Getting chart from %s as user %s", url, user.username)
             csv_data = get_chart_csv_data(url, auth_cookies)
         except SoftTimeLimitExceeded as ex:
             raise ReportScheduleCsvTimeout() from ex
@@ -262,16 +253,15 @@ def _get_embedded_data(self) -> pd.DataFrame:
         Return data as a Pandas dataframe, to embed in notifications as a table.
         """
         url = self._get_url(result_format=ChartDataResultFormat.JSON)
-        auth_cookies = machine_auth_provider_factory.instance.get_auth_cookies(
-            _get_user()
-        )
+        user = get_executor(self._report_schedule)
+        auth_cookies = machine_auth_provider_factory.instance.get_auth_cookies(user)
 
         if self._report_schedule.chart.query_context is None:
             logger.warning("No query context found, taking a screenshot to generate it")
             self._update_query_context()
 
         try:
-            logger.info("Getting chart from %s", url)
+            logger.info("Getting chart from %s as user %s", url, user.username)
             dataframe = get_chart_dataframe(url, auth_cookies)
         except SoftTimeLimitExceeded as ex:
             raise ReportScheduleDataFrameTimeout() from ex
@@ -674,10 +664,16 @@ def __init__(self, task_id: str, model_id: int, scheduled_dttm: datetime):
     def run(self) -> None:
         with session_scope(nullpool=True) as session:
             try:
-                with override_user(_get_user()):
-                    self.validate(session=session)
-                    if not self._model:
-                        raise ReportScheduleExecuteUnexpectedError()
+                self.validate(session=session)
+                if not self._model:
+                    raise ReportScheduleExecuteUnexpectedError()
+                user = get_executor(self._model)
+                with override_user(user):
+                    logger.info(
+                        "Running report schedule %s as user %s",
+                        self._execution_id,
+                        user.username,
+                    )
                     ReportScheduleStateMachine(
                         session, self._execution_id, self._model, self._scheduled_dttm
                     ).run()
@@ -695,6 +691,8 @@ def validate(  # pylint: disable=arguments-differ
             self._model_id,
             self._execution_id,
         )
-        self._model = ReportScheduleDAO.find_by_id(self._model_id, session=session)
+        self._model = (
+            session.query(ReportSchedule).filter_by(id=self._model_id).one_or_none()
+        )
         if not self._model:
             raise ReportScheduleNotFoundError()
diff --git a/superset/reports/types.py b/superset/reports/types.py
@@ -14,10 +14,20 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+from enum import Enum
 from typing import TypedDict
 
 from superset.dashboards.permalink.types import DashboardPermalinkState
 
 
 class ReportScheduleExtra(TypedDict):
     dashboard: DashboardPermalinkState
+
+
+class ReportScheduleExecutor(str, Enum):
+    SELENIUM = "selenium"
+    CREATOR = "creator"
+    CREATOR_OWNER = "creator_owner"
+    MODIFIER = "modifier"
+    MODIFIER_OWNER = "modifier_owner"
+    OWNER = "owner"
diff --git a/superset/reports/utils.py b/superset/reports/utils.py
@@ -0,0 +1,71 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from flask_appbuilder.security.sqla.models import User
+
+from superset import app, security_manager
+from superset.reports.commands.exceptions import ReportScheduleUserNotFoundError
+from superset.reports.models import ReportSchedule
+from superset.reports.types import ReportScheduleExecutor
+
+
+# pylint: disable=too-many-branches
+def get_executor(report_schedule: ReportSchedule) -> User:
+    """
+    Extract the user that should be used to execute a report schedule as.
+
+    :param report_schedule: The report to execute
+    :return: User to execute the report as
+    """
+    user_types = app.config["ALERT_REPORTS_EXECUTE_AS"]
+    owners = report_schedule.owners
+    owner_dict = {owner.id: owner for owner in owners}
+    for user_type in user_types:
+        if user_type == ReportScheduleExecutor.SELENIUM:
+            username = app.config["THUMBNAIL_SELENIUM_USER"]
+            if username and (user := security_manager.find_user(username=username)):
+                return user
+        if user_type == ReportScheduleExecutor.CREATOR_OWNER:
+            if (user := report_schedule.created_by) and (
+                owner := owner_dict.get(user.id)
+            ):
+                return owner
+        if user_type == ReportScheduleExecutor.CREATOR:
+            if user := report_schedule.created_by:
+                return user
+        if user_type == ReportScheduleExecutor.MODIFIER_OWNER:
+            if (user := report_schedule.changed_by) and (
+                owner := owner_dict.get(user.id)
+            ):
+                return owner
+        if user_type == ReportScheduleExecutor.MODIFIER:
+            if user := report_schedule.changed_by:
+                return user
+        if user_type == ReportScheduleExecutor.OWNER:
+            owners = report_schedule.owners
+            if len(owners) == 1:
+                return owners[0]
+            if len(owners) > 1:
+                if modifier := report_schedule.changed_by:
+                    if modifier and (user := owner_dict.get(modifier.id)):
+                        return user
+                if creator := report_schedule.created_by:
+                    if creator and (user := owner_dict.get(creator.id)):
+                        return user
+                return owners[0]
+
+    raise ReportScheduleUserNotFoundError()
diff --git a/superset/utils/machine_auth.py b/superset/utils/machine_auth.py
@@ -15,6 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from __future__ import annotations
+
 import importlib
 import logging
 from typing import Callable, Dict, TYPE_CHECKING
@@ -34,7 +36,7 @@
 
 class MachineAuthProvider:
     def __init__(
-        self, auth_webdriver_func_override: Callable[[WebDriver, "User"], WebDriver]
+        self, auth_webdriver_func_override: Callable[[WebDriver, User], WebDriver]
     ):
         # This is here in order to allow for the authenticate_webdriver func to be
         # overridden via config, as opposed to the entire provider implementation
@@ -43,7 +45,7 @@ def __init__(
     def authenticate_webdriver(
         self,
         driver: WebDriver,
-        user: "User",
+        user: User,
     ) -> WebDriver:
         """
         Default AuthDriverFuncType type that sets a session cookie flask-login style
@@ -69,7 +71,7 @@ def authenticate_webdriver(
         return driver
 
     @staticmethod
-    def get_auth_cookies(user: "User") -> Dict[str, str]:
+    def get_auth_cookies(user: User) -> Dict[str, str]:
         # Login with the user specified to get the reports
         with current_app.test_request_context("/login"):
             login_user(user)