
Commit 85c438d

sadpandajoe and claude committed
test(examples): add tests for schema=None fallback and data_file override
- Add test_find_dataset_falls_back_to_schema_none for legacy dataset lookup
- Add test_get_multi_dataset_config_data_file_override for explicit data_file
- Add test_get_multi_dataset_config_data_file_missing for missing file warning

These tests cover the edge cases noted in PR review feedback.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 42ae0fd commit 85c438d

2 files changed: +108 −0 lines changed


tests/unit_tests/examples/data_loading_test.py

Lines changed: 65 additions & 0 deletions
@@ -228,3 +228,68 @@ def test_get_multi_dataset_config_missing_table_name(tmp_path: Path) -> None:
     # Falls back to dataset_name when table_name not in YAML
     assert result["table_name"] == "my_dataset"
     assert result["uuid"] == "test-uuid-5678"
+
+
+def test_get_multi_dataset_config_data_file_override(tmp_path: Path) -> None:
+    """Test that explicit data_file in YAML overrides the default data file."""
+    from superset.examples.data_loading import _get_multi_dataset_config
+
+    # Create datasets directory and YAML file with explicit data_file
+    datasets_dir = tmp_path / "datasets"
+    datasets_dir.mkdir()
+
+    # Create data directory and the explicit data file
+    data_dir = tmp_path / "data"
+    data_dir.mkdir()
+    explicit_file = data_dir / "custom_data.parquet"
+    explicit_file.write_bytes(b"fake parquet")
+
+    yaml_content = """
+table_name: my_dataset
+schema: public
+uuid: 14f48794-ebfa-4f60-a26a-582c49132f1b
+data_file: custom_data.parquet
+"""
+    dataset_yaml = datasets_dir / "my_dataset.yaml"
+    dataset_yaml.write_text(yaml_content)
+
+    # Default data file (would be used if no override)
+    default_data_file = data_dir / "my_dataset.parquet"
+
+    result = _get_multi_dataset_config(tmp_path, "my_dataset", default_data_file)
+
+    # Should use the explicit data_file from YAML
+    assert result["data_file"] == explicit_file
+    assert result["table_name"] == "my_dataset"
+    assert result["uuid"] == "14f48794-ebfa-4f60-a26a-582c49132f1b"
+
+
+def test_get_multi_dataset_config_data_file_missing(tmp_path: Path) -> None:
+    """Test that missing explicit data_file keeps the default data file."""
+    from superset.examples.data_loading import _get_multi_dataset_config
+
+    # Create datasets directory and YAML file with non-existent data_file
+    datasets_dir = tmp_path / "datasets"
+    datasets_dir.mkdir()
+
+    # Create data directory but NOT the explicit file
+    data_dir = tmp_path / "data"
+    data_dir.mkdir()
+
+    yaml_content = """
+table_name: my_dataset
+schema: public
+uuid: 14f48794-ebfa-4f60-a26a-582c49132f1b
+data_file: nonexistent.parquet
+"""
+    dataset_yaml = datasets_dir / "my_dataset.yaml"
+    dataset_yaml.write_text(yaml_content)
+
+    # Default data file passed to the function
+    default_data_file = data_dir / "my_dataset.parquet"
+
+    result = _get_multi_dataset_config(tmp_path, "my_dataset", default_data_file)
+
+    # Should keep the default data_file since explicit one doesn't exist
+    assert result["data_file"] == default_data_file
+    assert result["uuid"] == "14f48794-ebfa-4f60-a26a-582c49132f1b"
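
Note: for readers without the source file, here is a minimal sketch of the data_file handling these two tests assume. Only the helper name, signature shape, and config keys come from the tests above; the YAML loading, the fallback via setdefault, and the existence check are assumptions, not the actual Superset implementation.

from pathlib import Path
from typing import Any

import yaml  # PyYAML


def _get_multi_dataset_config(
    examples_dir: Path, dataset_name: str, default_data_file: Path
) -> dict[str, Any]:
    """Sketch: load the per-dataset YAML and resolve which data file to use."""
    dataset_yaml = examples_dir / "datasets" / f"{dataset_name}.yaml"
    config: dict[str, Any] = yaml.safe_load(dataset_yaml.read_text()) or {}

    # Fall back to the dataset name when table_name is not in the YAML.
    config.setdefault("table_name", dataset_name)

    # Prefer an explicit data_file from the YAML, but only when that file
    # actually exists next to the default; otherwise keep the default path
    # (the "missing file" case above, which the real helper presumably logs
    # as a warning).
    resolved = default_data_file
    explicit_name = config.get("data_file")
    if explicit_name:
        candidate = default_data_file.parent / explicit_name
        if candidate.exists():
            resolved = candidate
    config["data_file"] = resolved
    return config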

tests/unit_tests/examples/generic_loader_test.py

Lines changed: 43 additions & 0 deletions
@@ -533,6 +533,49 @@ def test_find_dataset_no_uuid_no_schema(mock_db: MagicMock) -> None:
     assert found_by_uuid is False
 
 
+@patch("superset.examples.generic_loader.db")
+def test_find_dataset_falls_back_to_schema_none(mock_db: MagicMock) -> None:
+    """Test _find_dataset falls back to schema=None when exact schema match not found.
+
+    Scenario: A legacy dataset exists with schema=None (from older load-examples).
+    When looking up with schema="public", the exact match fails, but we should
+    find the legacy row with schema=None so we can backfill the schema.
+    """
+    from superset.examples.generic_loader import _find_dataset
+
+    # Legacy row with schema=None (needs backfill)
+    legacy_row = MagicMock()
+    legacy_row.uuid = None
+    legacy_row.table_name = "users"
+    legacy_row.schema = None
+
+    # Exact schema match returns None, schema=None fallback returns legacy_row
+    call_count = 0
+
+    def filter_by_side_effect(**kwargs):
+        nonlocal call_count
+        call_count += 1
+        mock_result = MagicMock()
+        if kwargs.get("schema") == "public":
+            # Exact schema match fails
+            mock_result.first.return_value = None
+        elif kwargs.get("schema") is None:
+            # Fallback to schema=None finds the legacy row
+            mock_result.first.return_value = legacy_row
+        else:
+            mock_result.first.return_value = None
+        return mock_result
+
+    mock_db.session.query.return_value.filter_by.side_effect = filter_by_side_effect
+
+    result, found_by_uuid = _find_dataset("users", 1, None, "public")
+
+    assert result is legacy_row
+    assert found_by_uuid is False
+    # Verify both lookups happened (exact match + fallback)
+    assert call_count == 2
+
+
 @patch("superset.examples.generic_loader.db")
 @patch("superset.examples.generic_loader.get_example_database")
 def test_load_parquet_table_no_backfill_when_uuid_already_set(
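
Note: similarly, a sketch of the schema fallback that test_find_dataset_falls_back_to_schema_none exercises. The lookup order (uuid, then exact schema, then schema=None) is inferred from the docstring and the mock's call count; the parameter names beyond the positional values in the test call, and the SqlaTable import, are assumptions rather than the actual generic_loader code.

from typing import Any, Optional, Tuple

from superset import db
from superset.connectors.sqla.models import SqlaTable  # assumed model


def _find_dataset(
    table_name: str,
    database_id: int,  # assumed meaning of the second positional argument
    uuid: Optional[str],
    schema: Optional[str],
) -> Tuple[Optional[Any], bool]:
    """Sketch: find an existing example dataset, preferring UUID, then an
    exact schema match, then a legacy schema=None row to backfill."""
    query = db.session.query(SqlaTable)

    if uuid:
        row = query.filter_by(uuid=uuid).first()
        if row:
            return row, True  # found_by_uuid

    # Exact match on table name, database, and schema.
    row = query.filter_by(
        table_name=table_name, database_id=database_id, schema=schema
    ).first()
    if row:
        return row, False

    # Legacy rows from older load-examples runs have schema=None; fall back
    # to them so the caller can backfill the schema (this is the second
    # filter_by call the test counts).
    if schema is not None:
        row = query.filter_by(
            table_name=table_name, database_id=database_id, schema=None
        ).first()
        if row:
            return row, False

    return None, False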

0 commit comments
