28 changes: 28 additions & 0 deletions README.md
@@ -220,6 +220,34 @@ session.athena.repair_table(database="db_name", table="tbl_name")

## Diving Deep


### Pandas with null object columns (UndetectedType exception)

Pandas has a very generic "data type" called object. An object column can hold strings, dates, and many other kinds of values.
We can usually handle object columns just fine by inferring the type from the values they hold, and Pyarrow does that like a charm. The real problem starts when a column is entirely null, because then there is nothing to infer from.
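
The snippet below is a minimal sketch (not awswrangler code; it only assumes pandas and pyarrow are installed) showing how Pyarrow ends up with the "null" type for an all-null column:

```py3
import pandas as pd
import pyarrow as pa

dataframe = pd.DataFrame({"col_null": [None, None]})

# With no values to inspect, Pyarrow falls back to the "null" type,
# which is what triggers the UndetectedType exception described here.
print(pa.Array.from_pandas(dataframe["col_null"]).type)  # prints: null
```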

To work with null object columns, you can explicitly set the expected Athena data type for the target table:

```py3
import awswrangler
import pandas as pd

dataframe = pd.DataFrame({
"col": [1, 2],
"col_string_null": [None, None],
"col_date_null": [None, None],
})
session = awswrangler.Session()
session.pandas.to_parquet(
dataframe=dataframe,
database="DATABASE",
path=f"s3://...",
cast_columns={
"col_string_null": "string",
"col_date_null": "date"
})
```

### Pandas to Redshift Flow

![Pandas to Redshift Flow](docs/source/_static/pandas-to-redshift-flow.jpg?raw=true "Pandas to Redshift Flow")
5 changes: 4 additions & 1 deletion awswrangler/data_types.py
@@ -3,7 +3,7 @@

import pyarrow

from awswrangler.exceptions import UnsupportedType
from awswrangler.exceptions import UnsupportedType, UndetectedType

logger = logging.getLogger(__name__)

@@ -160,6 +160,9 @@ def pyarrow2athena(dtype):
return "date"
elif dtype_str.startswith("list"):
return f"array<{pyarrow2athena(dtype.value_type)}>"
elif dtype_str == "null":
raise UndetectedType(
"We can't infer the data type from an entire null object column")
else:
raise UnsupportedType(f"Unsupported Pyarrow type: {dtype}")

4 changes: 4 additions & 0 deletions awswrangler/exceptions.py
@@ -2,6 +2,10 @@ class UnsupportedType(Exception):
pass


class UndetectedType(Exception):
pass


class UnsupportedFileFormat(Exception):
pass

7 changes: 6 additions & 1 deletion awswrangler/glue.py
@@ -4,7 +4,7 @@

from awswrangler import data_types
from awswrangler.athena import Athena
from awswrangler.exceptions import UnsupportedFileFormat, InvalidSerDe, ApiError, UnsupportedType
from awswrangler.exceptions import UnsupportedFileFormat, InvalidSerDe, ApiError, UnsupportedType, UndetectedType

logger = logging.getLogger(__name__)

@@ -194,6 +194,11 @@ def _build_schema(dataframe,
else:
try:
athena_type = data_types.pyarrow2athena(dtype)
except UndetectedType:
raise UndetectedType(
f"We can't infer the data type from an entire null object column ({name}). "
f"Please consider pass the type of this column explicitly using the cast "
f"columns argument")
except UnsupportedType:
raise UnsupportedType(
f"Unsupported Pyarrow type for column {name}: {dtype}")
29 changes: 29 additions & 0 deletions docs/source/divingdeep.rst
@@ -3,6 +3,35 @@
Diving Deep
===========

Pandas with null object columns (UndetectedType exception)
----------------------------------------------------------

Pandas has a very generic "data type" called object. An object column can hold strings, dates, and many other kinds of values.
We can usually handle object columns just fine by inferring the type from the values they hold, and Pyarrow does that like a charm. The real problem starts when a column is entirely null, because then there is nothing to infer from.
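
The snippet below is a minimal sketch (not awswrangler code; it only assumes pandas and pyarrow are installed) showing how Pyarrow ends up with the "null" type for an all-null column:

.. code-block:: python

    import pandas as pd
    import pyarrow as pa

    dataframe = pd.DataFrame({"col_null": [None, None]})

    # With no values to inspect, Pyarrow falls back to the "null" type,
    # which is what triggers the UndetectedType exception described here.
    print(pa.Array.from_pandas(dataframe["col_null"]).type)  # prints: null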

To work with null object columns, you can explicitly set the expected Athena data type for the target table:

.. code-block:: python

import awswrangler
import pandas as pd

dataframe = pd.DataFrame({
"col": [1, 2],
"col_string_null": [None, None],
"col_date_null": [None, None],
})
session = awswrangler.Session()
session.pandas.to_parquet(
dataframe=dataframe,
database="DATABASE",
path=f"s3://...",
cast_columns={
"col_string_null": "string",
"col_date_null": "date"
})


Pandas to Redshift Flow
-----------------------

19 changes: 18 additions & 1 deletion testing/test_awswrangler/test_pandas.py
@@ -9,7 +9,7 @@
import numpy as np

from awswrangler import Session, Pandas
from awswrangler.exceptions import LineTerminatorNotFound, EmptyDataframe, InvalidSerDe, UnsupportedType
from awswrangler.exceptions import LineTerminatorNotFound, EmptyDataframe, InvalidSerDe, UnsupportedType, UndetectedType

logging.basicConfig(
level=logging.INFO,
@@ -962,3 +962,20 @@ def test_to_parquet_casting_to_string(
assert len(dataframe.index) == len(dataframe2.index)
assert (len(list(dataframe.columns)) + 1) == len(list(dataframe2.columns))
print(dataframe2)


def test_to_parquet_casting_with_null_object(
session,
bucket,
database,
):
dataframe = pd.DataFrame({
"a": [1, 2, 3],
"b": [4, 5, 6],
"col_null": [None, None, None],
})
with pytest.raises(UndetectedType):
assert session.pandas.to_parquet(dataframe=dataframe,
database=database,
path=f"s3://{bucket}/test/",
mode="overwrite")