Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions superset/commands/database/uploaders/columnar_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

import pandas as pd
import pyarrow.parquet as pq
from flask import current_app
from flask_babel import lazy_gettext as _
from pyarrow.lib import ArrowException
from werkzeug.datastructures import FileStorage
Expand All @@ -33,10 +34,47 @@
FileMetadata,
ReaderOptions,
)
from superset.exceptions import SupersetException
from superset.utils.core import check_is_safe_zip

logger = logging.getLogger(__name__)


def _check_file_size(file: FileStorage) -> None:
"""
Reject an uploaded file whose raw (on-the-wire) size exceeds the configured
limit before its contents are buffered into memory.

This is complementary to the ZIP decompression-ratio guard: it bounds the
raw bytes accepted regardless of whether the payload is compressed.

:param file: The uploaded file to check.
:throws DatabaseUploadFailed: if the file exceeds the configured limit.
"""
max_size = current_app.config.get("UPLOAD_MAX_FILE_SIZE_BYTES")
if not max_size:
return
stream = file.stream
try:
current_position = stream.tell()
stream.seek(0, 2) # seek to end
size = stream.tell()
stream.seek(current_position)
except (AttributeError, OSError):
# If the stream is not seekable we cannot determine the size cheaply;
# skip the check and rely on downstream guards.
return
if size > max_size:
raise DatabaseUploadFailed(
_(
"File size %(size)s bytes exceeds the maximum allowed "
"upload size of %(max_size)s bytes",
size=size,
max_size=max_size,
)
)


class ColumnarReaderOptions(ReaderOptions, total=False):
columns_read: list[str]

Expand Down Expand Up @@ -80,6 +118,7 @@ def _yield_files(file: FileStorage) -> Generator[IO[bytes], None, None]:
:param file: The file to yield files from.
:return: A generator that yields files.
"""
_check_file_size(file)
file_suffix = Path(file.filename).suffix
if not file_suffix:
raise DatabaseUploadFailed(_("Unexpected no file extension found"))
Expand All @@ -89,6 +128,12 @@ def _yield_files(file: FileStorage) -> Generator[IO[bytes], None, None]:
raise DatabaseUploadFailed(_("Not a valid ZIP file"))
try:
with ZipFile(file) as zip_file:
# guard against decompression bombs before reading entries,
# mirroring the importer path
try:
check_is_safe_zip(zip_file)
except SupersetException as ex:
raise DatabaseUploadFailed(str(ex)) from ex
# check if all file types are of the same extension
Comment on lines +132 to 137
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: check_is_safe_zip() raises SupersetException, but this reader otherwise raises DatabaseUploadFailed for user upload errors. Letting SupersetException bubble here changes upload failures into generic 500 responses instead of the expected 4xx/422-style upload error handling. Catch and re-raise zip-safety failures as DatabaseUploadFailed to preserve the existing API error contract. [api mismatch]

Severity Level: Major ⚠️
- ❌ Columnar ZIP uploads hitting safety guard return HTTP 500.
- ⚠️ Upload UI misclassifies user ZIP issues as server errors.
Steps of Reproduction ✅
1. Create a high-compression-ratio ZIP using `_make_high_ratio_zip()` from
`tests/unit_tests/commands/databases/columnar_reader_test.py:234-245`, which writes ~1MB
of zeros into a ZIP entry.

2. Start Superset with this PR code and send `POST /api/v1/database/<pk>/upload/` (upload
endpoint implemented in `superset/databases/api.py:63-80,1789-1796`) with `type=columnar`
and the ZIP from step 1 as `file`.

3. The view constructs a `ColumnarReader` (`superset/databases/api.py:68-73`) and calls
`UploadCommand(...).run()`, which in turn calls `BaseDataReader.read()` and
`ColumnarReader.file_to_dataframe()`
(`superset/commands/database/uploaders/columnar_reader.py:51-59`), leading to
`_yield_files()` (`line 76` hunk) when the filename ends with `.zip`.

4. Inside `_yield_files`, when `file_suffix == "zip"`, `check_is_safe_zip(zip_file)` is
called (`superset/commands/database/uploaders/columnar_reader.py:33-37`), which raises a
plain `SupersetException` if the compression ratio is too high
(`superset/utils/core.py:11-29`); this exception is not caught or wrapped in
`DatabaseUploadFailed` anywhere in `ColumnarReader` or `UploadCommand`, so it bubbles up
to the global Flask error handler `show_unexpected_exception`
(`superset/views/error_handling.py:39-57`), which returns an HTTP 500 "generic backend
error", whereas other upload validation errors raise `DatabaseUploadFailed`
(`superset/commands/database/exceptions.py:17-19`, a `CommandException`) and are rendered
as structured 4xx/422 errors via the `CommandException` handler
(`superset/views/error_handling.py:45-37`), creating inconsistent API error semantics for
this specific ZIP-safety failure.

Fix in Cursor | Fix in VSCode Claude

(Use Cmd/Ctrl + Click for best experience)

Prompt for AI Agent 🤖
This is a comment left during a code review.

**Path:** superset/commands/database/uploaders/columnar_reader.py
**Line:** 94:96
**Comment:**
	*Api Mismatch: `check_is_safe_zip()` raises `SupersetException`, but this reader otherwise raises `DatabaseUploadFailed` for user upload errors. Letting `SupersetException` bubble here changes upload failures into generic 500 responses instead of the expected 4xx/422-style upload error handling. Catch and re-raise zip-safety failures as `DatabaseUploadFailed` to preserve the existing API error contract.

Validate the correctness of the flagged issue. If correct, How can I resolve this? If you propose a fix, implement it and please make it concise.
Once fix is implemented, also check other comments on the same PR, and ask user if the user wants to fix the rest of the comments as well. if said yes, then fetch all the comments validate the correctness and implement a minimal fix
👍 | 👎

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed — wrapped the check_is_safe_zip() call in a try/except SupersetException block and re-raises as DatabaseUploadFailed. This keeps the unsafe-ZIP rejection aligned with other upload validation errors (422 instead of 500). Tests updated to assert DatabaseUploadFailed.

Comment on lines 130 to 137
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed — see the update to comment above. SupersetException from check_is_safe_zip is now caught and re-raised as DatabaseUploadFailed.

file_suffixes = {Path(name).suffix for name in zip_file.namelist()}
if len(file_suffixes) > 1:
Expand Down
11 changes: 11 additions & 0 deletions superset/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,11 @@ def _try_json_readsha(filepath: str, length: int) -> str | None:
# max rows retrieved by filter select auto complete
FILTER_SELECT_ROW_LIMIT = 10000

# Upper bound on the page size accepted by the generic DAO list/pagination layer.
# Caps how many rows a single paginated query can request, regardless of the
# requested page size, to keep query result sets bounded.
SQLALCHEMY_DAO_MAX_PAGE_SIZE = 1000

# SupersetClient HTTP retry configuration
# Controls retry behavior for all HTTP requests made through SupersetClient
# This helps handle transient server errors (like 502 Bad Gateway) automatically
Expand Down Expand Up @@ -1112,6 +1117,12 @@ class D3TimeFormat(TypedDict, total=False):
UPLOAD_FOLDER = BASE_DIR + "/static/uploads/"
UPLOAD_CHUNK_SIZE = 4096

# Upper bound, in bytes, on the size of a single uploaded data file (e.g. CSV,
# Excel, columnar). Files larger than this are rejected before their contents
# are buffered into memory, keeping the resources consumed by a single upload
# bounded. Set to ``None`` to disable the check. Defaults to 100 MB.
UPLOAD_MAX_FILE_SIZE_BYTES: int | None = 100 * 1024 * 1024

# ---------------------------------------------------
# Cache configuration
# ---------------------------------------------------
Expand Down
6 changes: 5 additions & 1 deletion superset/daos/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
)

import sqlalchemy as sa
from flask import current_app
from flask_appbuilder.models.filters import BaseFilter
from flask_appbuilder.models.sqla.interface import SQLAInterface
from pydantic import BaseModel, Field
Expand Down Expand Up @@ -749,7 +750,10 @@ def list( # noqa: C901
else:
query = query.order_by(asc(column))
page = page
page_size = max(page_size, 1)
# Clamp the page size to a sane range: at least 1, and no larger than
# the configured upper bound, to keep result sets bounded.
max_page_size = current_app.config.get("SQLALCHEMY_DAO_MAX_PAGE_SIZE", 1000)
page_size = min(max(page_size, 1), max_page_size)
Comment on lines +755 to +756
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: The new clamp trusts SQLALCHEMY_DAO_MAX_PAGE_SIZE without validating it is at least 1. If that config is set to 0 or a negative value, page_size becomes non-positive (0 or negative), which breaks pagination semantics and can even produce effectively unbounded queries on some databases. Normalize the configured max to a positive integer before applying min(...). [incorrect condition logic]

Severity Level: Major ⚠️
- ❌ DAO-backed list endpoints can emit zero-row result pages.
- ⚠️ Negative max page size may cause unbounded DAO queries.
Steps of Reproduction ✅
1. Override `SQLALCHEMY_DAO_MAX_PAGE_SIZE` in deployment config (e.g.
`superset_config.py`) to `0` or a negative value, which is read via
`current_app.config.get("SQLALCHEMY_DAO_MAX_PAGE_SIZE", 1000)` in `BaseDAO.list`
(`superset/daos/base.py:755`); the default in core config is `1000`
(`superset/config.py:174`).

2. Call any DAO `list()` method that delegates to `BaseDAO.list` (e.g. `UserDAO.list`
exercised in tests at `tests/integration_tests/dao/base_dao_test.py:626-639`) with a
normal positive `page_size` argument such as `50`.

3. In `BaseDAO.list`, after building and ordering the query
(`superset/daos/base.py:720-732`), the page size is clamped using `page_size =
min(max(page_size, 1), max_page_size)` (`superset/daos/base.py:756`); with `max_page_size
== 0`, this computes `page_size = min(max(50, 1), 0) == 0`, or with `max_page_size == -5`
it computes `page_size = -5`.

4. The DAO then applies `query = query.offset(page * page_size).limit(page_size)`
(`superset/daos/base.py:757`), so a misconfigured non-positive
`SQLALCHEMY_DAO_MAX_PAGE_SIZE` causes `LIMIT 0` (empty results) or `LIMIT -5`
(driver-specific behavior, often treated as an effectively unbounded query or an error),
breaking the intended "bounded page size" guarantee for all DAO-backed list endpoints that
use `BaseDAO.list`.

Fix in Cursor | Fix in VSCode Claude

(Use Cmd/Ctrl + Click for best experience)

Prompt for AI Agent 🤖
This is a comment left during a code review.

**Path:** superset/daos/base.py
**Line:** 755:756
**Comment:**
	*Incorrect Condition Logic: The new clamp trusts `SQLALCHEMY_DAO_MAX_PAGE_SIZE` without validating it is at least 1. If that config is set to `0` or a negative value, `page_size` becomes non-positive (`0` or negative), which breaks pagination semantics and can even produce effectively unbounded queries on some databases. Normalize the configured max to a positive integer before applying `min(...)`.

Validate the correctness of the flagged issue. If correct, How can I resolve this? If you propose a fix, implement it and please make it concise.
Once fix is implemented, also check other comments on the same PR, and ask user if the user wants to fix the rest of the comments as well. if said yes, then fetch all the comments validate the correctness and implement a minimal fix
👍 | 👎

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Valid concern, but the config key is documented as an int and operator misconfiguration (string or ≤0) is an admin error, not a user error. Adding a startup-time validation guard for this config would be a separate follow-up to avoid scope creep here.

Comment on lines +753 to +756
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above — this is a pre-existing config validation gap, not introduced by this PR. Follow-up candidate.

query = query.offset(page * page_size).limit(page_size)
items = query.all()
# If columns are specified, SQLAlchemy returns Row objects (not tuples or
Expand Down
85 changes: 84 additions & 1 deletion tests/unit_tests/commands/databases/columnar_reader_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,12 @@
import io
import tempfile
from typing import Any
from zipfile import ZipFile
from unittest.mock import patch
from zipfile import ZIP_DEFLATED, ZipFile

import numpy as np
import pytest
from flask import current_app
from werkzeug.datastructures import FileStorage

from superset.commands.database.exceptions import DatabaseUploadFailed
Expand Down Expand Up @@ -230,6 +232,87 @@ def test_columnar_reader_bad_zip():
assert str(ex.value) == "Not a valid ZIP file"


def _make_high_ratio_zip() -> io.BytesIO:
"""
Build a ZIP whose single entry has a very high decompression ratio,
well above the default ``ZIP_FILE_MAX_COMPRESS_RATIO`` threshold.
"""
buffer = io.BytesIO()
with ZipFile(buffer, "w", ZIP_DEFLATED) as zip_file:
# A megabyte of zeros compresses to roughly a kilobyte, far exceeding
# the default 200:1 ratio guard.
zip_file.writestr("test.parquet", b"\x00" * (1024 * 1024))
buffer.seek(0)
return buffer


def test_columnar_reader_unsafe_zip_rejected():
reader = ColumnarReader(
options=ColumnarReaderOptions(),
)
unsafe_zip = _make_high_ratio_zip()
with pytest.raises(DatabaseUploadFailed) as ex:
reader.file_to_dataframe(FileStorage(unsafe_zip, "test.zip"))
assert "compress ratio above allowed threshold" in str(ex.value)


def test_columnar_reader_unsafe_zip_rejected_in_metadata():
reader = ColumnarReader(
options=ColumnarReaderOptions(),
)
unsafe_zip = _make_high_ratio_zip()
with pytest.raises(DatabaseUploadFailed) as ex:
reader.file_metadata(FileStorage(unsafe_zip, "test.zip"))
assert "compress ratio above allowed threshold" in str(ex.value)
Comment on lines +249 to +266
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated — both test assertions now expect DatabaseUploadFailed with the same error message check.



def test_columnar_reader_oversize_file_rejected():
reader = ColumnarReader(
options=ColumnarReaderOptions(),
)
file = create_columnar_file(COLUMNAR_DATA)
file.stream.seek(0, 2)
file_size = file.stream.tell()
file.stream.seek(0)
with patch.dict(
current_app.config,
{"UPLOAD_MAX_FILE_SIZE_BYTES": file_size - 1},
):
with pytest.raises(DatabaseUploadFailed) as ex:
reader.file_to_dataframe(file)
assert "exceeds the maximum allowed upload size" in str(ex.value)


def test_columnar_reader_oversize_file_rejected_in_metadata():
reader = ColumnarReader(
options=ColumnarReaderOptions(),
)
file = create_columnar_file(COLUMNAR_DATA)
file.stream.seek(0, 2)
file_size = file.stream.tell()
file.stream.seek(0)
with patch.dict(
current_app.config,
{"UPLOAD_MAX_FILE_SIZE_BYTES": file_size - 1},
):
with pytest.raises(DatabaseUploadFailed) as ex:
reader.file_metadata(file)
assert "exceeds the maximum allowed upload size" in str(ex.value)


def test_columnar_reader_under_limit_accepted():
reader = ColumnarReader(
options=ColumnarReaderOptions(),
)
file = create_columnar_file(COLUMNAR_DATA)
with patch.dict(
current_app.config,
{"UPLOAD_MAX_FILE_SIZE_BYTES": 100 * 1024 * 1024},
):
df = reader.file_to_dataframe(file)
assert len(df) == 3


def test_columnar_reader_metadata():
reader = ColumnarReader(
options=ColumnarReaderOptions(),
Expand Down
51 changes: 51 additions & 0 deletions tests/unit_tests/dao/base_dao_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,3 +258,54 @@ def test_find_by_ids_none_id_column():
results = TestDAO.find_by_ids([1, 2, 3])

assert results == []


def _list_with_page_size(page_size: int) -> Mock:
"""
Run ``BaseDAO.list`` with a mocked query chain and return the mock query so
the ``.limit()`` call (the effective page size) can be inspected.
"""
mock_query = Mock()
# Every chainable call returns the same mock so the chain is easy to inspect
mock_query.options.return_value = mock_query
mock_query.filter.return_value = mock_query
mock_query.order_by.return_value = mock_query
mock_query.offset.return_value = mock_query
mock_query.limit.return_value = mock_query
mock_query.count.return_value = 0
mock_query.all.return_value = []

mock_data_model = Mock()
mock_data_model.session.query.return_value = mock_query

with (
patch("superset.daos.base.SQLAInterface", return_value=mock_data_model),
patch.object(TestDAO, "_apply_base_filter", side_effect=lambda q, **_: q),
):
TestDAO.list(page=0, page_size=page_size)

return mock_query


def test_list_page_size_oversized_is_clamped():
"""An oversized page_size is clamped to the configured maximum."""
from flask import current_app

max_page_size = current_app.config.get("SQLALCHEMY_DAO_MAX_PAGE_SIZE", 1000)
mock_query = _list_with_page_size(max_page_size + 5000)

mock_query.limit.assert_called_once_with(max_page_size)


def test_list_page_size_normal_unaffected():
"""A page_size within the allowed range is passed through unchanged."""
mock_query = _list_with_page_size(50)

mock_query.limit.assert_called_once_with(50)


def test_list_page_size_below_one_is_floored():
"""A non-positive page_size is floored to 1 (existing semantics)."""
mock_query = _list_with_page_size(0)

mock_query.limit.assert_called_once_with(1)
Loading