diff --git a/superset/utils/csv.py b/superset/utils/csv.py index bf5045728ddf..b04a39739402 100644 --- a/superset/utils/csv.py +++ b/superset/utils/csv.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. import logging -import re import urllib.request from typing import Any, Optional, Union from urllib.error import URLError @@ -28,15 +27,32 @@ logger = logging.getLogger(__name__) -negative_number_re = re.compile(r"^-[0-9.]+$") +PROBLEMATIC_CSV_PREFIXES = "-@+|=%" -# This regex will match if the string starts with: -# -# 1. one of -, @, +, |, =, % -# 2. two double quotes immediately followed by one of -, @, +, |, =, % -# 3. one or more spaces immediately followed by one of -, @, +, |, =, % -# -problematic_chars_re = re.compile(r'^(?:"{2}|\s{1,})(?=[\-@+|=%])|^[\-@+|=%]') + +def _starts_with_formula_prefix(value: str) -> bool: + first = value[0] + if first in PROBLEMATIC_CSV_PREFIXES: + return True + if first == '"' and len(value) > 2: + return value[1] == '"' and value[2] in PROBLEMATIC_CSV_PREFIXES + return False + + +def _starts_like_spreadsheet_formula(value: str) -> bool: + first = value[0] + if first.isspace(): + stripped = value.lstrip() + return bool(stripped) and _starts_with_formula_prefix(stripped) + return _starts_with_formula_prefix(value) + + +def _is_negative_number(value: str) -> bool: + return ( + len(value) > 1 + and value[0] == "-" + and all("0" <= character <= "9" or character == "." for character in value[1:]) + ) def escape_value(value: str) -> str: @@ -45,10 +61,10 @@ def escape_value(value: str) -> str: http://georgemauer.net/2017/10/07/csv-injection.html """ - needs_escaping = problematic_chars_re.match(value) is not None - is_negative_number = negative_number_re.match(value) is not None + if not value: + return value - if needs_escaping and not is_negative_number: + if _starts_like_spreadsheet_formula(value) and not _is_negative_number(value): # Escape pipe to be extra safe as this # can lead to remote code execution value = value.replace("|", "\\|") diff --git a/tests/unit_tests/utils/csv_tests.py b/tests/unit_tests/utils/csv_tests.py index 747e8b32870c..f7dc903d331b 100644 --- a/tests/unit_tests/utils/csv_tests.py +++ b/tests/unit_tests/utils/csv_tests.py @@ -63,6 +63,9 @@ def test_escape_value(): result = csv.escape_value(" =10+2") assert result == "' =10+2" + result = csv.escape_value(' ""=10+2') + assert result == '\' ""=10+2' + def fake_get_chart_csv_data_none(chart_url, auth_cookies=None): return None