From 5d52da45c1fcaffe29edc0e54ed6419f52010a3d Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sat, 11 Apr 2020 10:59:30 -0300 Subject: [PATCH 1/3] improving the column name sanitization #161 --- awswrangler/catalog.py | 31 +++++++++++++--------- awswrangler/s3.py | 5 ++++ testing/test_awswrangler/test_data_lake.py | 14 ++++++---- 3 files changed, 32 insertions(+), 18 deletions(-) diff --git a/awswrangler/catalog.py b/awswrangler/catalog.py index a694f1877..f7a6ead16 100644 --- a/awswrangler/catalog.py +++ b/awswrangler/catalog.py @@ -730,19 +730,9 @@ def table( def _sanitize_name(name: str) -> str: - name = "".join(c for c in unicodedata.normalize("NFD", name) if unicodedata.category(c) != "Mn") - name = name.replace("{", "_") - name = name.replace("}", "_") - name = name.replace("]", "_") - name = name.replace("[", "_") - name = name.replace(")", "_") - name = name.replace("(", "_") - name = name.replace(" ", "_") - name = name.replace("-", "_") - name = name.replace(".", "_") - name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) - name = re.sub("([a-z0-9])([A-Z])", r"\1_\2", name) - return name.lower() + name = "".join(c for c in unicodedata.normalize("NFD", name) if unicodedata.category(c) != "Mn") # strip accents + name = re.sub("[^A-Za-z0-9_]+", "_", name) # Removing non alphanumeric characters + return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower() # Converting CamelCase to snake_case def sanitize_column_name(column: str) -> str: @@ -750,6 +740,11 @@ def sanitize_column_name(column: str) -> str: https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html + Possible transformations: + - Strip accents + - Remove non alphanumeric characters + - Convert CamelCase to snake_case + Parameters ---------- column : str @@ -775,6 +770,11 @@ def sanitize_dataframe_columns_names(df: pd.DataFrame) -> pd.DataFrame: https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html + Possible transformations: + - Strip accents + - 
Remove non alphanumeric characters + - Convert CamelCase to snake_case + Parameters ---------- df : pandas.DataFrame @@ -800,6 +800,11 @@ def sanitize_table_name(table: str) -> str: https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html + Possible transformations: + - Strip accents + - Remove non alphanumeric characters + - Convert CamelCase to snake_case + Parameters ---------- table : str diff --git a/awswrangler/s3.py b/awswrangler/s3.py index 35432a1a8..e3447f4c0 100644 --- a/awswrangler/s3.py +++ b/awswrangler/s3.py @@ -530,6 +530,11 @@ def to_parquet( # pylint: disable=too-many-arguments The concept of Dataset goes beyond the simple idea of files and enable more complex features like partitioning, casting and catalog integration (Amazon Athena/AWS Glue Catalog). + Note + ---- + The table name and all column names will be automatically sanitized using + `wr.catalog.sanitize_table_name` and `wr.catalog.sanitize_column_name`. + Note ---- In case of `use_threads=True` the number of process that will be spawned will be get from os.cpu_count(). 
diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index 7cb94b229..4759331f1 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -553,11 +553,15 @@ def test_athena_read_list(database): def test_normalize_column_name(): - assert wr.catalog.sanitize_column_name("foo()__Boo))))____BAR") == "foo_____boo________bar" - assert ( - wr.catalog.sanitize_column_name("foo()__Boo))))_{}{}{{}{}{}{___BAR[][][][]") - == "foo_____boo____________________bar________" - ) + assert wr.catalog.sanitize_column_name("CamelCase") == "camel_case" + assert wr.catalog.sanitize_column_name("CamelCase2") == "camel_case2" + assert wr.catalog.sanitize_column_name("Camel_Case3") == "camel_case3" + assert wr.catalog.sanitize_column_name("Cámël_Casë4仮") == "camel_case4_" + assert wr.catalog.sanitize_column_name("Camel__Case5") == "camel__case5" + assert wr.catalog.sanitize_column_name("Camel{}Case6") == "camel_case6" + assert wr.catalog.sanitize_column_name("Camel.Case7") == "camel_case7" + assert wr.catalog.sanitize_column_name("xyz_cd") == "xyz_cd" + assert wr.catalog.sanitize_column_name("xyz_Cd") == "xyz_cd" def test_athena_ctas_empty(database): From bf158e8cbdce2206da7c481285b41a81594eb02c Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sat, 11 Apr 2020 15:13:42 -0300 Subject: [PATCH 2/3] Add sanitize_table_name() tests --- testing/test_awswrangler/test_data_lake.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index 4759331f1..8f62db1da 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -552,7 +552,7 @@ def test_athena_read_list(database): wr.athena.read_sql_query(sql=f"SELECT ARRAY[1, 2, 3]", database=database, ctas_approach=False) -def test_normalize_column_name(): +def test_sanitize_names(): assert 
wr.catalog.sanitize_column_name("CamelCase") == "camel_case" assert wr.catalog.sanitize_column_name("CamelCase2") == "camel_case2" assert wr.catalog.sanitize_column_name("Camel_Case3") == "camel_case3" @@ -562,6 +562,15 @@ def test_normalize_column_name(): assert wr.catalog.sanitize_column_name("Camel.Case7") == "camel_case7" assert wr.catalog.sanitize_column_name("xyz_cd") == "xyz_cd" assert wr.catalog.sanitize_column_name("xyz_Cd") == "xyz_cd" + assert wr.catalog.sanitize_table_name("CamelCase") == "camel_case" + assert wr.catalog.sanitize_table_name("CamelCase2") == "camel_case2" + assert wr.catalog.sanitize_table_name("Camel_Case3") == "camel_case3" + assert wr.catalog.sanitize_table_name("Cámël_Casë4仮") == "camel_case4_" + assert wr.catalog.sanitize_table_name("Camel__Case5") == "camel__case5" + assert wr.catalog.sanitize_table_name("Camel{}Case6") == "camel_case6" + assert wr.catalog.sanitize_table_name("Camel.Case7") == "camel_case7" + assert wr.catalog.sanitize_table_name("xyz_cd") == "xyz_cd" + assert wr.catalog.sanitize_table_name("xyz_Cd") == "xyz_cd" def test_athena_ctas_empty(database): From 92a9c7a234fe68f56abf8f2347711986b6f92849 Mon Sep 17 00:00:00 2001 From: igorborgest Date: Sat, 11 Apr 2020 15:13:42 -0300 Subject: [PATCH 3/3] Add sanitize_table_name() tests #161 --- testing/test_awswrangler/test_data_lake.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/testing/test_awswrangler/test_data_lake.py b/testing/test_awswrangler/test_data_lake.py index 4759331f1..8f62db1da 100644 --- a/testing/test_awswrangler/test_data_lake.py +++ b/testing/test_awswrangler/test_data_lake.py @@ -552,7 +552,7 @@ def test_athena_read_list(database): wr.athena.read_sql_query(sql=f"SELECT ARRAY[1, 2, 3]", database=database, ctas_approach=False) -def test_normalize_column_name(): +def test_sanitize_names(): assert wr.catalog.sanitize_column_name("CamelCase") == "camel_case" assert wr.catalog.sanitize_column_name("CamelCase2") == "camel_case2" 
assert wr.catalog.sanitize_column_name("Camel_Case3") == "camel_case3" @@ -562,6 +562,15 @@ def test_normalize_column_name(): assert wr.catalog.sanitize_column_name("Camel.Case7") == "camel_case7" assert wr.catalog.sanitize_column_name("xyz_cd") == "xyz_cd" assert wr.catalog.sanitize_column_name("xyz_Cd") == "xyz_cd" + assert wr.catalog.sanitize_table_name("CamelCase") == "camel_case" + assert wr.catalog.sanitize_table_name("CamelCase2") == "camel_case2" + assert wr.catalog.sanitize_table_name("Camel_Case3") == "camel_case3" + assert wr.catalog.sanitize_table_name("Cámël_Casë4仮") == "camel_case4_" + assert wr.catalog.sanitize_table_name("Camel__Case5") == "camel__case5" + assert wr.catalog.sanitize_table_name("Camel{}Case6") == "camel_case6" + assert wr.catalog.sanitize_table_name("Camel.Case7") == "camel_case7" + assert wr.catalog.sanitize_table_name("xyz_cd") == "xyz_cd" + assert wr.catalog.sanitize_table_name("xyz_Cd") == "xyz_cd" def test_athena_ctas_empty(database):