19 changes: 18 additions & 1 deletion awswrangler/s3/_read_parquet.py
@@ -238,6 +238,7 @@ def _arrowtable2df(
table: pa.Table,
categories: Optional[List[str]],
safe: bool,
map_types: bool,
use_threads: bool,
dataset: bool,
path: str,
@@ -257,7 +258,7 @@
strings_to_categorical=False,
safe=safe,
categories=categories,
-types_mapper=_data_types.pyarrow2pandas_extension,
+types_mapper=_data_types.pyarrow2pandas_extension if map_types else None,
),
dataset=dataset,
path=path,
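The conditional above is the core of the change: when `map_types` is True, `_data_types.pyarrow2pandas_extension` is passed to pyarrow's `Table.to_pandas` as `types_mapper`, so pyarrow types are converted to pandas ExtensionDtypes; when False, `types_mapper=None` restores pyarrow's default NumPy-based conversion. A minimal standalone sketch of the `types_mapper` mechanism (plain pyarrow, not awswrangler internals; the mapper function here is illustrative):

```python
import pyarrow as pa
import pandas as pd

def example_types_mapper(pa_type: pa.DataType):
    # Map pyarrow int8 to the nullable pandas Int8 extension dtype.
    # Returning None falls back to pyarrow's default (NumPy) conversion.
    if pa.types.is_int8(pa_type):
        return pd.Int8Dtype()
    return None

table = pa.table({"c0": pa.array([0, 1, 1, 2], type=pa.int8())})

print(table.to_pandas(types_mapper=example_types_mapper).c0.dtype)  # Int8
print(table.to_pandas().c0.dtype)                                   # int8
```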
@@ -279,6 +280,7 @@ def _read_parquet_chunked(
columns: Optional[List[str]],
categories: Optional[List[str]],
safe: bool,
map_types: bool,
boto3_session: boto3.Session,
dataset: bool,
path_root: Optional[str],
@@ -325,6 +327,7 @@ def _read_parquet_chunked(
),
categories=categories,
safe=safe,
map_types=map_types,
use_threads=use_threads,
dataset=dataset,
path=path,
@@ -404,6 +407,7 @@ def _read_parquet(
columns: Optional[List[str]],
categories: Optional[List[str]],
safe: bool,
map_types: bool,
boto3_session: boto3.Session,
dataset: bool,
path_root: Optional[str],
@@ -421,6 +425,7 @@ def _read_parquet(
),
categories=categories,
safe=safe,
map_types=map_types,
use_threads=use_threads,
dataset=dataset,
path=path,
@@ -441,6 +446,7 @@ def read_parquet(
dataset: bool = False,
categories: Optional[List[str]] = None,
safe: bool = True,
map_types: bool = True,
use_threads: bool = True,
last_modified_begin: Optional[datetime.datetime] = None,
last_modified_end: Optional[datetime.datetime] = None,
@@ -524,6 +530,10 @@ def read_parquet(
data in a pandas DataFrame or Series (e.g. timestamps are always
stored as nanoseconds in pandas). This option controls whether it
is a safe cast or not.
map_types : bool, default True
True to convert pyarrow DataTypes to pandas ExtensionDtypes. This is
used to override the default pandas type for the conversion of built-in
pyarrow types, or when pandas_metadata is absent from the Table schema.
use_threads : bool
True to enable concurrent requests, False to disable multiple threads.
If enabled os.cpu_count() will be used as the max number of threads.
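In practice the flag toggles between nullable pandas extension dtypes and plain NumPy dtypes. A hedged usage sketch (the S3 path is hypothetical), mirroring the test added further down:

```python
import awswrangler as wr

path = "s3://my-bucket/my-file.parquet"  # hypothetical path

df_ext = wr.s3.read_parquet(path)                  # e.g. c0 dtype: Int8
df_np = wr.s3.read_parquet(path, map_types=False)  # e.g. c0 dtype: int8
```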
@@ -597,6 +607,7 @@ def read_parquet(
"columns": columns,
"categories": categories,
"safe": safe,
"map_types": map_types,
"boto3_session": session,
"dataset": dataset,
"path_root": path_root,
@@ -633,6 +644,7 @@ def read_parquet_table(
validate_schema: bool = True,
categories: Optional[List[str]] = None,
safe: bool = True,
map_types: bool = True,
chunked: Union[bool, int] = False,
use_threads: bool = True,
boto3_session: Optional[boto3.Session] = None,
@@ -699,6 +711,10 @@ def read_parquet_table(
data in a pandas DataFrame or Series (e.g. timestamps are always
stored as nanoseconds in pandas). This option controls whether it
is a safe cast or not.
map_types : bool, default True
True to convert pyarrow DataTypes to pandas ExtensionDtypes. This is
used to override the default pandas type for the conversion of built-in
pyarrow types, or when pandas_metadata is absent from the Table schema.
chunked : bool
If True will break the data in smaller DataFrames (Non deterministic number of lines).
Otherwise return a single DataFrame with the whole data.
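The same flag is threaded through `read_parquet_table`; a short sketch with hypothetical Glue catalog names:

```python
import awswrangler as wr

df = wr.s3.read_parquet_table(
    database="my_database",  # hypothetical Glue database
    table="my_table",        # hypothetical Glue table
    map_types=False,         # keep NumPy dtypes instead of ExtensionDtypes
)
```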
@@ -767,6 +783,7 @@ def read_parquet_table(
validate_schema=validate_schema,
categories=categories,
safe=safe,
map_types=map_types,
chunked=chunked,
dataset=True,
use_threads=use_threads,
10 changes: 10 additions & 0 deletions tests/test_s3_parquet.py
@@ -192,6 +192,16 @@ def test_to_parquet_file_dtype(path, use_threads):
assert str(df2.c1.dtype) == "string"


def test_read_parquet_map_types(path):
df = pd.DataFrame({"c0": [0, 1, 1, 2]}, dtype=np.int8)
file_path = f"{path}0.parquet"
wr.s3.to_parquet(df, file_path)
df2 = wr.s3.read_parquet(file_path)
assert str(df2.c0.dtype) == "Int8"
df3 = wr.s3.read_parquet(file_path, map_types=False)
assert str(df3.c0.dtype) == "int8"


@pytest.mark.parametrize("use_threads", [True, False])
@pytest.mark.parametrize("max_rows_by_file", [None, 0, 40, 250, 1000])
def test_parquet_with_size(path, use_threads, max_rows_by_file):