diff --git a/awswrangler/catalog.py b/awswrangler/catalog.py index a694f1877..f7a6ead16 100644 --- a/awswrangler/catalog.py +++ b/awswrangler/catalog.py @@ -730,19 +730,9 @@ def table( def _sanitize_name(name: str) -> str: - name = "".join(c for c in unicodedata.normalize("NFD", name) if unicodedata.category(c) != "Mn") - name = name.replace("{", "_") - name = name.replace("}", "_") - name = name.replace("]", "_") - name = name.replace("[", "_") - name = name.replace(")", "_") - name = name.replace("(", "_") - name = name.replace(" ", "_") - name = name.replace("-", "_") - name = name.replace(".", "_") - name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) - name = re.sub("([a-z0-9])([A-Z])", r"\1_\2", name) - return name.lower() + name = "".join(c for c in unicodedata.normalize("NFD", name) if unicodedata.category(c) != "Mn") # strip accents + name = re.sub("[^A-Za-z0-9_]+", "_", name) # Replacing each run of non alphanumeric characters with an underscore + return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower() # Converting CamelCase to snake_case def sanitize_column_name(column: str) -> str: @@ -750,6 +740,11 @@ def sanitize_column_name(column: str) -> str: https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html + Possible transformations: + - Strip accents + - Replace each run of non alphanumeric characters with an underscore + - Convert CamelCase to snake_case + Parameters ---------- column : str @@ -775,6 +770,11 @@ def sanitize_dataframe_columns_names(df: pd.DataFrame) -> pd.DataFrame: https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html + Possible transformations: + - Strip accents + - Replace each run of non alphanumeric characters with an underscore + - Convert CamelCase to snake_case + Parameters ---------- df : pandas.DataFrame @@ -800,6 +800,11 @@ def sanitize_table_name(table: str) -> str: https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html + Possible transformations: + - Strip accents + - Replace each run of non alphanumeric characters with an underscore + - Convert CamelCase to snake_case + 
Parameters ---------- table : str diff --git a/awswrangler/s3.py b/awswrangler/s3.py index 35432a1a8..e3447f4c0 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -530,6 +530,11 @@ def to_parquet( # pylint: disable=too-many-arguments The concept of Dataset goes beyond the simple idea of files and enable more complex features like partitioning, casting and catalog integration (Amazon Athena/AWS Glue Catalog). + Note + ---- + The table name and all column names will be automatically sanitized using + `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`. + Note ---- In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count(). diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index 7cb94b229..8f62db1da 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -552,12 +552,25 @@ def test_athena_read_list(database): wr.athena.read_sql_query(sql=f"SELECT ARRAY[1, 2, 3]", database=database, ctas_approach=False) -def test_normalize_column_name(): - assert wr.catalog.sanitize_column_name("foo()__Boo))))____BAR") == "foo_____boo________bar" - assert ( - wr.catalog.sanitize_column_name("foo()__Boo))))_{}{}{{}{}{}{___BAR[][][][]") - == "foo_____boo____________________bar________" - ) +def test_sanitize_names(): + assert wr.catalog.sanitize_column_name("CamelCase") == "camel_case" + assert wr.catalog.sanitize_column_name("CamelCase2") == "camel_case2" + assert wr.catalog.sanitize_column_name("Camel_Case3") == "camel_case3" + assert wr.catalog.sanitize_column_name("Cámël_Casë4仮") == "camel_case4_" + assert wr.catalog.sanitize_column_name("Camel__Case5") == "camel__case5" + assert wr.catalog.sanitize_column_name("Camel{}Case6") == "camel_case6" + assert wr.catalog.sanitize_column_name("Camel.Case7") == "camel_case7" + assert wr.catalog.sanitize_column_name("xyz_cd") == "xyz_cd" + assert 
wr.catalog.sanitize_column_name("xyz_Cd") == "xyz_cd" + assert wr.catalog.sanitize_table_name("CamelCase") == "camel_case" + assert wr.catalog.sanitize_table_name("CamelCase2") == "camel_case2" + assert wr.catalog.sanitize_table_name("Camel_Case3") == "camel_case3" + assert wr.catalog.sanitize_table_name("Cámël_Casë4仮") == "camel_case4_" + assert wr.catalog.sanitize_table_name("Camel__Case5") == "camel__case5" + assert wr.catalog.sanitize_table_name("Camel{}Case6") == "camel_case6" + assert wr.catalog.sanitize_table_name("Camel.Case7") == "camel_case7" + assert wr.catalog.sanitize_table_name("xyz_cd") == "xyz_cd" + assert wr.catalog.sanitize_table_name("xyz_Cd") == "xyz_cd" def test_athena_ctas_empty(database):