Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion superset/commands/database/uploaders/csv_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,17 @@ def _read_csv( # noqa: C901
break

if chunks:
result = pd.concat(chunks, ignore_index=False)
try:
result = pd.concat(chunks, ignore_index=False)
except Exception as ex:
logger.warning(
"Error concatenating CSV chunks: %s. "
"This may be due to inconsistent date parsing "
"across chunks.",
str(ex),
)
raise

# When using chunking, we need to reset and rebuild the index
if kwargs.get("index_col") is not None:
# The index was already set by pandas during read_csv
Expand Down
65 changes: 65 additions & 0 deletions tests/unit_tests/commands/databases/csv_reader_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1352,3 +1352,68 @@ def track_read(size):

# Test that the method handles the sample sizes properly
assert all(size > 0 for size in read_sizes), "All sample sizes should be positive"


def test_csv_reader_chunk_concatenation_error_logging():
    """Test that pd.concat errors during chunking are logged and re-raised.

    Mocks ``pd.concat`` inside the csv_reader module so the chunk
    concatenation step fails, then verifies that the failure surfaces to the
    caller wrapped as ``DatabaseUploadFailed`` with the original message
    intact, and that the chunking path was actually exercised.
    """
    from unittest.mock import patch

    # Build a CSV large enough to trigger the chunked read path (>100k rows).
    large_data = [["col1", "col2"]] + [
        [f"val{i}", str(i)] for i in range(100001)
    ]

    csv_reader = CSVReader(options=CSVReaderOptions())

    # Force pd.concat to fail so the error-handling branch is exercised.
    with patch(
        "superset.commands.database.uploaders.csv_reader.pd.concat"
    ) as mock_concat:
        mock_concat.side_effect = ValueError(
            "Cannot concatenate chunks with different dtypes"
        )

        with pytest.raises(DatabaseUploadFailed) as exc_info:
            csv_reader.file_to_dataframe(create_csv_file(large_data))

    # The original error message must survive the DatabaseUploadFailed wrap.
    assert "Cannot concatenate chunks with different dtypes" in str(exc_info.value)

    # concat being called proves the chunking code path was taken.
    assert mock_concat.called


def test_csv_reader_chunk_concatenation_error_warning(caplog):
    """Test that pd.concat errors during chunking log a warning message.

    Mocks ``pd.concat`` inside the csv_reader module so chunk concatenation
    fails, then verifies that a WARNING record was emitted mentioning both
    the failure and the suspected cause (inconsistent date parsing).
    """
    import logging
    from unittest.mock import patch

    # Build a CSV large enough to trigger the chunked read path (>100k rows).
    large_data = [["col1", "col2"]] + [
        [f"val{i}", str(i)] for i in range(100001)
    ]

    csv_reader = CSVReader(options=CSVReaderOptions())

    # Force pd.concat to fail so the warning-logging branch is exercised.
    with patch(
        "superset.commands.database.uploaders.csv_reader.pd.concat"
    ) as mock_concat:
        mock_concat.side_effect = ValueError(
            "Cannot concatenate chunks with different dtypes"
        )

        with caplog.at_level(logging.WARNING):
            with pytest.raises(DatabaseUploadFailed):
                csv_reader.file_to_dataframe(create_csv_file(large_data))

    # getMessage() is always safe; record.message is only set once a handler
    # has formatted the record.
    messages = [record.getMessage() for record in caplog.records]

    # The warning must mention both the failure and the likely cause.
    assert any("Error concatenating CSV chunks" in msg for msg in messages)
    assert any(
        "inconsistent date parsing across chunks" in msg for msg in messages
    )
Loading