diff --git a/README.md b/README.md
index 5528ba212..1062cd078 100644
--- a/README.md
+++ b/README.md
@@ -220,6 +220,34 @@ session.athena.repair_table(database="db_name", table="tbl_name")
 
 ## Diving Deep
 
+
+### Pandas with null object columns (UndetectedType exception)
+
+Pandas has a very generic "data type" called `object`. An `object` column can hold strings, dates, and many other things.
+Pyarrow can usually handle these columns by inferring the real type from the values themselves. The problem starts when a column is completely null, because then there is nothing to infer from.
+
+To write a DataFrame with completely null object columns, explicitly set the expected Athena data type for the target table using the `cast_columns` argument:
+
+```py3
+import awswrangler
+import pandas as pd
+
+dataframe = pd.DataFrame({
+    "col": [1, 2],
+    "col_string_null": [None, None],
+    "col_date_null": [None, None],
+})
+session = awswrangler.Session()
+session.pandas.to_parquet(
+    dataframe=dataframe,
+    database="DATABASE",
+    path=f"s3://...",
+    cast_columns={
+        "col_string_null": "string",
+        "col_date_null": "date"
+    })
+```
+
 ### Pandas to Redshift Flow
 
 ![Pandas to Redshift Flow](docs/source/_static/pandas-to-redshift-flow.jpg?raw=true "Pandas to Redshift Flow")
diff --git a/awswrangler/data_types.py b/awswrangler/data_types.py
index de733b37c..148d01124 100644
--- a/awswrangler/data_types.py
+++ b/awswrangler/data_types.py
@@ -3,7 +3,7 @@
 
 import pyarrow
 
-from awswrangler.exceptions import UnsupportedType
+from awswrangler.exceptions import UnsupportedType, UndetectedType
 
 logger = logging.getLogger(__name__)
 
@@ -160,6 +160,9 @@ def pyarrow2athena(dtype):
         return "date"
     elif dtype_str.startswith("list"):
         return f"array<{pyarrow2athena(dtype.value_type)}>"
+    elif dtype_str == "null":
+        raise UndetectedType(
+            "We can't infer the data type from an entirely null object column")
     else:
         raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")
 
diff --git a/awswrangler/exceptions.py b/awswrangler/exceptions.py
index e82e3c11a..fe6715a94 100644
--- a/awswrangler/exceptions.py
+++ b/awswrangler/exceptions.py
@@ -2,6 +2,10 @@ class UnsupportedType(Exception):
     pass
 
 
+class UndetectedType(Exception):
+    pass
+
+
 class UnsupportedFileFormat(Exception):
     pass
 
diff --git a/awswrangler/glue.py b/awswrangler/glue.py
index d0c27c2f7..16c654344 100644
--- a/awswrangler/glue.py
+++ b/awswrangler/glue.py
@@ -4,7 +4,7 @@
 
 from awswrangler import data_types
 from awswrangler.athena import Athena
-from awswrangler.exceptions import UnsupportedFileFormat, InvalidSerDe, ApiError, UnsupportedType
+from awswrangler.exceptions import UnsupportedFileFormat, InvalidSerDe, ApiError, UnsupportedType, UndetectedType
 
 logger = logging.getLogger(__name__)
 
@@ -194,6 +194,11 @@ def _build_schema(dataframe,
         else:
             try:
                 athena_type = data_types.pyarrow2athena(dtype)
+            except UndetectedType:
+                raise UndetectedType(
+                    f"We can't infer the data type from an entirely null object column ({name}). "
+                    "Please consider passing the type of this column explicitly "
+                    "using the cast_columns argument")
             except UnsupportedType:
                 raise UnsupportedType(
                     f"Unsupported Pyarrow type for column {name}: {dtype}")
diff --git a/docs/source/divingdeep.rst b/docs/source/divingdeep.rst
index 1e785477c..3d3196669 100644
--- a/docs/source/divingdeep.rst
+++ b/docs/source/divingdeep.rst
@@ -3,6 +3,35 @@ Diving Deep
 ===========
 
 
+Pandas with null object columns (UndetectedType exception)
+----------------------------------------------------------
+
+Pandas has a very generic "data type" called ``object``. An ``object`` column can hold strings, dates, and many other things.
+Pyarrow can usually handle these columns by inferring the real type from the values themselves. The problem starts when a column is completely null, because then there is nothing to infer from.
+
+To write a DataFrame with completely null object columns, explicitly set the expected Athena data type for the target table using the ``cast_columns`` argument:
+
+.. code-block:: python
+
+    import awswrangler
+    import pandas as pd
+
+    dataframe = pd.DataFrame({
+        "col": [1, 2],
+        "col_string_null": [None, None],
+        "col_date_null": [None, None],
+    })
+    session = awswrangler.Session()
+    session.pandas.to_parquet(
+        dataframe=dataframe,
+        database="DATABASE",
+        path=f"s3://...",
+        cast_columns={
+            "col_string_null": "string",
+            "col_date_null": "date"
+        })
+
+
 Pandas to Redshift Flow
 -----------------------
 
diff --git a/testing/test_awswrangler/test_pandas.py b/testing/test_awswrangler/test_pandas.py
index 009bb446e..c2892fd8b 100644
--- a/testing/test_awswrangler/test_pandas.py
+++ b/testing/test_awswrangler/test_pandas.py
@@ -9,7 +9,7 @@
 import numpy as np
 
 from awswrangler import Session, Pandas
-from awswrangler.exceptions import LineTerminatorNotFound, EmptyDataframe, InvalidSerDe, UnsupportedType
+from awswrangler.exceptions import LineTerminatorNotFound, EmptyDataframe, InvalidSerDe, UnsupportedType, UndetectedType
 
 logging.basicConfig(
     level=logging.INFO,
@@ -962,3 +962,20 @@ def test_to_parquet_casting_to_string(
     assert len(dataframe.index) == len(dataframe2.index)
     assert (len(list(dataframe.columns)) + 1) == len(list(dataframe2.columns))
     print(dataframe2)
+
+
+def test_to_parquet_casting_with_null_object(
+        session,
+        bucket,
+        database,
+):
+    dataframe = pd.DataFrame({
+        "a": [1, 2, 3],
+        "b": [4, 5, 6],
+        "col_null": [None, None, None],
+    })
+    with pytest.raises(UndetectedType):
+        session.pandas.to_parquet(dataframe=dataframe,
+                                  database=database,
+                                  path=f"s3://{bucket}/test/",
+                                  mode="overwrite")