-
Notifications
You must be signed in to change notification settings - Fork 17.5k
fix(uploads,dao): add zip-safety check to columnar reader and cap DAO page size #40637
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,6 +23,7 @@ | |
|
|
||
| import pandas as pd | ||
| import pyarrow.parquet as pq | ||
| from flask import current_app | ||
| from flask_babel import lazy_gettext as _ | ||
| from pyarrow.lib import ArrowException | ||
| from werkzeug.datastructures import FileStorage | ||
|
|
@@ -33,10 +34,47 @@ | |
| FileMetadata, | ||
| ReaderOptions, | ||
| ) | ||
| from superset.exceptions import SupersetException | ||
| from superset.utils.core import check_is_safe_zip | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
def _check_file_size(file: FileStorage) -> None:
    """
    Reject an uploaded file whose raw (on-the-wire) size exceeds the configured
    limit before its contents are buffered into memory.

    This is complementary to the ZIP decompression-ratio guard: it bounds the
    raw bytes accepted regardless of whether the payload is compressed.

    The size is measured by seeking to the end of ``file.stream``. The stream
    is always rewound to the position it had on entry, even when measuring
    fails part-way, so later readers never start at EOF. The check is skipped
    when ``UPLOAD_MAX_FILE_SIZE_BYTES`` is unset or falsy, or when the stream
    does not support ``tell``/``seek``.

    :param file: The uploaded file to check.
    :throws DatabaseUploadFailed: if the file exceeds the configured limit.
    """
    max_size = current_app.config.get("UPLOAD_MAX_FILE_SIZE_BYTES")
    if not max_size:
        # No limit configured (missing, None or 0): nothing to enforce.
        return

    stream = file.stream
    try:
        current_position = stream.tell()
        try:
            stream.seek(0, 2)  # whence=2: relative to the end of the stream
            size = stream.tell()
        finally:
            # Rewind unconditionally; without this a failure between the
            # seek-to-end and the rewind would leave the stream at EOF.
            stream.seek(current_position)
    except (AttributeError, OSError):
        # If the stream is not seekable we cannot determine the size cheaply;
        # skip the check and rely on downstream guards.
        return

    if size > max_size:
        raise DatabaseUploadFailed(
            _(
                "File size %(size)s bytes exceeds the maximum allowed "
                "upload size of %(max_size)s bytes",
                size=size,
                max_size=max_size,
            )
        )
|
|
||
|
|
||
| class ColumnarReaderOptions(ReaderOptions, total=False): | ||
| columns_read: list[str] | ||
|
|
||
|
|
@@ -80,6 +118,7 @@ def _yield_files(file: FileStorage) -> Generator[IO[bytes], None, None]: | |
| :param file: The file to yield files from. | ||
| :return: A generator that yields files. | ||
| """ | ||
| _check_file_size(file) | ||
| file_suffix = Path(file.filename).suffix | ||
| if not file_suffix: | ||
| raise DatabaseUploadFailed(_("Unexpected no file extension found")) | ||
|
|
@@ -89,6 +128,12 @@ def _yield_files(file: FileStorage) -> Generator[IO[bytes], None, None]: | |
| raise DatabaseUploadFailed(_("Not a valid ZIP file")) | ||
| try: | ||
| with ZipFile(file) as zip_file: | ||
| # guard against decompression bombs before reading entries, | ||
| # mirroring the importer path | ||
| try: | ||
| check_is_safe_zip(zip_file) | ||
| except SupersetException as ex: | ||
| raise DatabaseUploadFailed(str(ex)) from ex | ||
| # check if all file types are of the same extension | ||
|
Comment on lines
130
to
137
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed — see the update to comment above. SupersetException from check_is_safe_zip is now caught and re-raised as DatabaseUploadFailed. |
||
| file_suffixes = {Path(name).suffix for name in zip_file.namelist()} | ||
| if len(file_suffixes) > 1: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -33,6 +33,7 @@ | |
| ) | ||
|
|
||
| import sqlalchemy as sa | ||
| from flask import current_app | ||
| from flask_appbuilder.models.filters import BaseFilter | ||
| from flask_appbuilder.models.sqla.interface import SQLAInterface | ||
| from pydantic import BaseModel, Field | ||
|
|
@@ -749,7 +750,10 @@ def list( # noqa: C901 | |
| else: | ||
| query = query.order_by(asc(column)) | ||
| page = page | ||
| page_size = max(page_size, 1) | ||
| # Clamp the page size to a sane range: at least 1, and no larger than | ||
| # the configured upper bound, to keep result sets bounded. | ||
| max_page_size = current_app.config.get("SQLALCHEMY_DAO_MAX_PAGE_SIZE", 1000) | ||
| page_size = min(max(page_size, 1), max_page_size) | ||
|
Comment on lines
+755
to
+756
Contributor
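There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Suggestion: The new clamp trusts the configured `SQLALCHEMY_DAO_MAX_PAGE_SIZE` value … (remainder of the suggestion is truncated in this capture). Severity Level: Major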
|
||
| query = query.offset(page * page_size).limit(page_size) | ||
| items = query.all() | ||
| # If columns are specified, SQLAlchemy returns Row objects (not tuples or | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,10 +17,12 @@ | |
| import io | ||
| import tempfile | ||
| from typing import Any | ||
| from zipfile import ZipFile | ||
| from unittest.mock import patch | ||
| from zipfile import ZIP_DEFLATED, ZipFile | ||
|
|
||
| import numpy as np | ||
| import pytest | ||
| from flask import current_app | ||
| from werkzeug.datastructures import FileStorage | ||
|
|
||
| from superset.commands.database.exceptions import DatabaseUploadFailed | ||
|
|
@@ -230,6 +232,87 @@ def test_columnar_reader_bad_zip(): | |
| assert str(ex.value) == "Not a valid ZIP file" | ||
|
|
||
|
|
||
def _make_high_ratio_zip(
    uncompressed_size: int = 1024 * 1024,
    entry_name: str = "test.parquet",
) -> io.BytesIO:
    """
    Build an in-memory ZIP whose single entry has a very high decompression
    ratio, well above the default ``ZIP_FILE_MAX_COMPRESS_RATIO`` threshold.

    :param uncompressed_size: Number of zero bytes stored in the entry.
    :param entry_name: Archive member name; its suffix is what the reader sees.
    :return: A buffer rewound to position 0, ready to wrap in ``FileStorage``.
    """
    buffer = io.BytesIO()
    with ZipFile(buffer, "w", ZIP_DEFLATED) as zip_file:
        # A megabyte of zeros compresses to roughly a kilobyte, far exceeding
        # the default 200:1 ratio guard.
        zip_file.writestr(entry_name, b"\x00" * uncompressed_size)
    # Rewind so consumers read the archive from its first byte.
    buffer.seek(0)
    return buffer
|
|
||
|
|
||
def test_columnar_reader_unsafe_zip_rejected():
    """Reading a high-ratio ZIP into a dataframe fails as an upload error."""
    reader = ColumnarReader(options=ColumnarReaderOptions())
    archive = _make_high_ratio_zip()

    with pytest.raises(DatabaseUploadFailed) as exc_info:
        reader.file_to_dataframe(FileStorage(archive, "test.zip"))

    message = str(exc_info.value)
    assert "compress ratio above allowed threshold" in message
|
|
||
|
|
||
def test_columnar_reader_unsafe_zip_rejected_in_metadata():
    """Extracting metadata from a high-ratio ZIP fails as an upload error."""
    reader = ColumnarReader(options=ColumnarReaderOptions())
    archive = _make_high_ratio_zip()

    with pytest.raises(DatabaseUploadFailed) as exc_info:
        reader.file_metadata(FileStorage(archive, "test.zip"))

    message = str(exc_info.value)
    assert "compress ratio above allowed threshold" in message
|
Comment on lines
+249
to
+266
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated — both test assertions now expect DatabaseUploadFailed with the same error message check. |
||
|
|
||
|
|
||
def test_columnar_reader_oversize_file_rejected():
    """A file one byte over the configured cap is rejected when read."""
    reader = ColumnarReader(options=ColumnarReaderOptions())
    upload = create_columnar_file(COLUMNAR_DATA)

    # Measure the payload, then rewind so the reader starts from the beginning.
    upload.stream.seek(0, 2)
    total_bytes = upload.stream.tell()
    upload.stream.seek(0)

    # Cap uploads at one byte less than this file.
    tight_limit = patch.dict(
        current_app.config,
        {"UPLOAD_MAX_FILE_SIZE_BYTES": total_bytes - 1},
    )
    with tight_limit, pytest.raises(DatabaseUploadFailed) as exc_info:
        reader.file_to_dataframe(upload)
    assert "exceeds the maximum allowed upload size" in str(exc_info.value)
|
|
||
|
|
||
def test_columnar_reader_oversize_file_rejected_in_metadata():
    """A file one byte over the configured cap is rejected for metadata too."""
    reader = ColumnarReader(options=ColumnarReaderOptions())
    upload = create_columnar_file(COLUMNAR_DATA)

    # Measure the payload, then rewind so the reader starts from the beginning.
    upload.stream.seek(0, 2)
    total_bytes = upload.stream.tell()
    upload.stream.seek(0)

    # Cap uploads at one byte less than this file.
    tight_limit = patch.dict(
        current_app.config,
        {"UPLOAD_MAX_FILE_SIZE_BYTES": total_bytes - 1},
    )
    with tight_limit, pytest.raises(DatabaseUploadFailed) as exc_info:
        reader.file_metadata(upload)
    assert "exceeds the maximum allowed upload size" in str(exc_info.value)
|
|
||
|
|
||
def test_columnar_reader_under_limit_accepted():
    """A file below the configured cap is read normally."""
    reader = ColumnarReader(options=ColumnarReaderOptions())
    upload = create_columnar_file(COLUMNAR_DATA)

    generous_limit = {"UPLOAD_MAX_FILE_SIZE_BYTES": 100 * 1024 * 1024}
    with patch.dict(current_app.config, generous_limit):
        frame = reader.file_to_dataframe(upload)

    assert len(frame) == 3
|
|
||
|
|
||
| def test_columnar_reader_metadata(): | ||
| reader = ColumnarReader( | ||
| options=ColumnarReaderOptions(), | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Suggestion:
`check_is_safe_zip()` raises `SupersetException`, but this reader otherwise raises `DatabaseUploadFailed` for user upload errors. Letting `SupersetException` bubble here changes upload failures into generic 500 responses instead of the expected 4xx/422-style upload error handling. Catch and re-raise zip-safety failures as `DatabaseUploadFailed` to preserve the existing API error contract. [api mismatch] Severity Level: Major ⚠️
Steps of Reproduction ✅
Fix in Cursor | Fix in VSCode Claude
(Use Cmd/Ctrl + Click for best experience)
Prompt for AI Agent 🤖
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed — wrapped the `check_is_safe_zip()` call in a `try/except SupersetException` block that re-raises as `DatabaseUploadFailed`. This keeps the unsafe-ZIP rejection aligned with other upload validation errors (422 instead of 500). Tests updated to assert `DatabaseUploadFailed`.