awswrangler/_utils.py (2 changes: 1 addition & 1 deletion)

@@ -856,7 +856,7 @@ def split_pandas_frame(df: pd.DataFrame, splits: int) -> list[pd.DataFrame]:
total = len(df)
each_section, extras = divmod(total, splits)
section_sizes = [0] + extras * [each_section + 1] + (splits - extras) * [each_section]
- div_points = _nx.array(section_sizes, dtype=_nx.intp).cumsum()  # type: ignore[attr-defined]
+ div_points = _nx.array(section_sizes, dtype=_nx.intp).cumsum()

sub_dfs = []
for i in range(splits):
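
For context (an illustrative sketch, not part of this PR): the lines above split a DataFrame into near-equal sections using divmod and a cumulative sum. The final iloc slicing below is an assumption about the loop body that is cut off in the hunk.

import numpy as np
import pandas as pd

# 10 rows split into 3 sections: divmod(10, 3) == (3, 1), so one section gets the extra row.
df = pd.DataFrame({"x": range(10)})
splits = 3
each_section, extras = divmod(len(df), splits)
section_sizes = [0] + extras * [each_section + 1] + (splits - extras) * [each_section]  # [0, 4, 3, 3]
div_points = np.array(section_sizes, dtype=np.intp).cumsum()  # [0, 4, 7, 10]
sub_dfs = [df.iloc[div_points[i] : div_points[i + 1]] for i in range(splits)]
print([len(s) for s in sub_dfs])  # [4, 3, 3]
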
awswrangler/s3/_write_dataset.py (2 changes: 1 addition & 1 deletion)

@@ -28,7 +28,7 @@ def _get_bucketing_series(df: pd.DataFrame, bucketing_info: typing.BucketingInfo
axis="columns",
)
)
- return bucket_number_series.astype(pd.CategoricalDtype(range(bucketing_info[1])))
+ return bucket_number_series.astype(np.array([pd.CategoricalDtype(range(bucketing_info[1]))]))


def _simulate_overflow(value: int, bits: int = 31, signed: bool = False) -> int:
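
For context, an illustrative pandas sketch (not awswrangler's code path) of why a bucket-number series is cast to a CategoricalDtype spanning range(n_buckets): one reason to do this is that grouping by a categorical with observed=False reports every bucket, including empty ones.

import pandas as pd

n_buckets = 4
bucket_numbers = pd.Series([0, 2, 2, 0])
as_categorical = bucket_numbers.astype(pd.CategoricalDtype(range(n_buckets)))

# Grouping by the categorical series with observed=False surfaces all buckets, even empty ones.
counts = as_categorical.groupby(as_categorical, observed=False).size()
print(counts.to_dict())  # {0: 2, 1: 0, 2: 2, 3: 0}
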
poetry.lock (215 changes: 116 additions & 99 deletions)

Large diffs are not rendered by default.

pyproject.toml (8 changes: 4 additions & 4 deletions)

@@ -36,8 +36,8 @@ pandas = [
{ version = ">=1.2.0,<3.0.0", markers = "python_version >= \"3.9\"" },
]
numpy = [
{version = "^1.18", markers = "python_version < \"3.12\""},
{version = "^1.26", markers = "python_version >= \"3.12\""}
{ version = ">=1.18,<2.0", markers = "python_version < \"3.9\"" },
{ version = ">=1.26,<3.0", markers = "python_version >= \"3.9\"" }
]
pyarrow = ">=8.0.0"
typing-extensions = "^4.4.0"
@@ -77,7 +77,7 @@ geopandas = [
# Distributed
modin = [
{ version = "0.23.1post0", markers = "python_version < \"3.9\"", optional = true },
{ version = "^0.26.0", markers = "python_version >= \"3.9\"", optional = true }
{ version = "^0.31.0", markers = "python_version >= \"3.9\"", optional = true }
]
ray = { version = "^2.30.0", extras = ["default", "data"], optional = true }

@@ -158,7 +158,7 @@ line-length = 120
target-version = "py38"

[tool.ruff.lint]
select = ["D", "E", "F", "I", "PL", "RUF100", "W", "FA", "UP", "PYI036"]
select = ["D", "E", "F", "I", "PL", "RUF100", "W", "FA", "UP", "PYI036", "NPY"]
ignore = ["E501", "PLR2004", "UP037"]
fixable = ["ALL"]

tests/load/test_s3.py (12 changes: 7 additions & 5 deletions)

@@ -238,14 +238,16 @@ def test_wait_object_not_exists(path: str, benchmark_time: int, request: pytest.

@pytest.mark.parametrize("size", [(5000, 5000), (1, 5000), (5000, 1), (1, 1)])
def test_wide_df(size: tuple[int, int], path: str) -> None:
- df = pd.DataFrame(np.random.randint(0, 100, size=size))
+ rand_gen = np.random.default_rng()
+
+ df = pd.DataFrame(rand_gen.integers(0, 100, size=size))
df.columns = df.columns.map(str)

num_cols = size[0]
df["int"] = np.random.choice(["1", "2", None], num_cols)
df["decimal"] = np.random.choice(["1.0", "2.0", None], num_cols)
df["date"] = np.random.choice(["2020-01-01", "2020-01-02", None], num_cols)
df["par0"] = np.random.choice(["A", "B"], num_cols)
df["int"] = rand_gen.choice(["1", "2", None], num_cols)
df["decimal"] = rand_gen.choice(["1.0", "2.0", None], num_cols)
df["date"] = rand_gen.choice(["2020-01-01", "2020-01-02", None], num_cols)
df["par0"] = rand_gen.choice(["A", "B"], num_cols)

partitions_shape = np.array(unwrap_partitions(df)).shape
assert partitions_shape[1] == min(math.ceil(len(df.columns) / cfg.MinPartitionSize.get()), cfg.NPartitions.get())
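
For reference, a minimal sketch of the Generator-based NumPy random API the test switches to (names, seeds, and shapes here are illustrative, not taken from the test suite).

import numpy as np

# default_rng() returns a np.random.Generator; it replaces module-level calls
# such as np.random.randint and np.random.choice.
rng = np.random.default_rng(seed=42)

integers = rng.integers(0, 100, size=(3, 4))      # was: np.random.randint(0, 100, size=(3, 4))
labels = rng.choice(["A", "B"], size=10)          # was: np.random.choice(["A", "B"], 10)
nullable = rng.choice(["1", "2", None], size=10)  # object dtype, None is a valid choice
print(integers.shape, labels[:3], nullable.dtype)
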
tests/unit/test_athena.py (4 changes: 2 additions & 2 deletions)

@@ -969,8 +969,8 @@ def test_athena_nan_inf(glue_database, ctas_approach, data_source):
assert df.shape == (1, 4)
assert df.dtypes.to_list() == ["float64", "float64", "float64", "float64"]
assert np.isnan(df.nan.iloc[0])
- assert df.inf.iloc[0] == np.PINF
- assert df.inf_n.iloc[0] == np.NINF
+ assert df.inf.iloc[0] == np.inf
+ assert df.inf_n.iloc[0] == -np.inf
assert df.regular.iloc[0] == 1.2


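
For context: np.PINF and np.NINF are among the aliases removed in NumPy 2.0, while np.inf works on both 1.x and 2.x, so the assertions above stay valid across versions. A small self-check (illustrative, not from the test suite):

import numpy as np

# np.inf / -np.inf exist in NumPy 1.x and 2.x; np.PINF / np.NINF were removed in 2.0.
assert np.isinf(np.inf) and np.inf > 0
assert np.isinf(-np.inf) and -np.inf < 0

# Plain Python floats compare equal to the NumPy constants, as in the assertions above.
assert float("inf") == np.inf
assert float("-inf") == -np.inf
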
tests/unit/test_postgresql.py (3 changes: 2 additions & 1 deletion)

@@ -12,7 +12,7 @@
import awswrangler as wr
import awswrangler.pandas as pd

- from .._utils import ensure_data_types, get_df, pandas_equals
+ from .._utils import ensure_data_types, get_df, is_ray_modin, pandas_equals

logging.getLogger("awswrangler").setLevel(logging.DEBUG)

@@ -96,6 +96,7 @@ def test_unknown_overwrite_method_error(postgresql_table, postgresql_con):
)


+ @pytest.mark.xfail(is_ray_modin, raises=ProgrammingError, reason="Broken export of values in Modin")
def test_sql_types(postgresql_table, postgresql_con):
table = postgresql_table
df = get_df()
tests/unit/test_s3_parquet.py (8 changes: 7 additions & 1 deletion)

@@ -730,17 +730,23 @@ def test_parquet_compression(path, compression) -> None:
"schema", [None, pa.schema([pa.field("c0", pa.int64()), pa.field("c1", pa.int64()), pa.field("par", pa.string())])]
)
def test_empty_file(path, use_threads, schema):
+ from awswrangler import _utils
+
df = pd.DataFrame({"c0": [1, 2, 3], "c1": [None, None, None], "par": ["a", "b", "c"]})
df.index = df.index.astype("Int64")
df["c0"] = df["c0"].astype("Int64")
df["par"] = df["par"].astype("string")
wr.s3.to_parquet(df, path, index=True, dataset=True, partition_cols=["par"])
- bucket, key = wr._utils.parse_path(f"{path}test.csv")
+
+ bucket, key = _utils.parse_path(f"{path}test.csv")
boto3.client("s3").put_object(Body=b"", Bucket=bucket, Key=key)
with pytest.raises(wr.exceptions.InvalidFile):
wr.s3.read_parquet(path, use_threads=use_threads, ignore_empty=False, schema=schema)

df2 = wr.s3.read_parquet(path, dataset=True, use_threads=use_threads)
df2 = df2.sort_values(by=["c0"])
df2["par"] = df2["par"].astype("string")

assert_pandas_equals(df, df2)

