Commit
FIX-modin-project#6855: Make sure read_parquet works with integer columns for pyarrow engine

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
anmyachev committed Jan 22, 2024
1 parent 097ea52 commit 90c9d78
Showing 2 changed files with 19 additions and 1 deletion.
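For context, below is a minimal sketch of the scenario this commit addresses (the file name and local path are illustrative, not part of the change): a DataFrame whose column labels are integers is written with the pyarrow engine and read back through Modin.

import numpy as np
import pandas
import modin.pandas as pd

# Column labels are integers (a RangeIndex), not strings.
pandas_df = pandas.DataFrame(np.random.rand(100, 10))
pandas_df.to_parquet("issue6855.parquet", engine="pyarrow")

# Parquet stores column names as strings, so without the conversion added
# in parquet_dispatcher.py the labels presumably come back as "0", "1", ...
# and the result no longer matches pandas (the behavior behind modin-project#6855).
modin_df = pd.read_parquet("issue6855.parquet", engine="pyarrow")
print(modin_df.columns)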
modin/core/io/column_stores/parquet_dispatcher.py (9 changes: 8 additions & 1 deletion)
@@ -686,6 +686,14 @@ def build_query_compiler(cls, dataset, columns, index_columns, **kwargs):
             row_lengths = [part.length() for part in remote_parts.T[0]]
         else:
             row_lengths = None
+
+        if (
+            dataset.pandas_metadata
+            and "column_indexes" in dataset.pandas_metadata
+            and dataset.pandas_metadata["column_indexes"][0]["numpy_type"] == "int64"
+        ):
+            columns = pandas.Index(columns).astype("int64").to_list()
+
         frame = cls.frame_cls(
             remote_parts,
             index,
@@ -801,7 +809,6 @@ def _read(cls, path, engine, columns, use_nullable_dtypes, dtype_backend, **kwargs):
             for c in column_names
             if c not in index_columns and not cls.index_regex.match(c)
         ]
-
         return cls.build_query_compiler(
             dataset, columns, index_columns, dtype_backend=dtype_backend, **kwargs
         )
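The new check in build_query_compiler keys off the "pandas" metadata block that pyarrow writes into the Parquet footer: field names in the file are strings, while the column_indexes entry records the dtype of the original column labels. A small inspection sketch (pure pandas/pyarrow, not Modin code; the file name is illustrative):

import pandas
import pyarrow.parquet as pq

df = pandas.DataFrame([[1, 2], [3, 4]], columns=[0, 1])
df.to_parquet("ints.parquet", engine="pyarrow")

schema = pq.read_schema("ints.parquet")
# The Arrow/Parquet field names are strings ("0", "1", ...).
print(schema.names)

meta = schema.pandas_metadata
# column_indexes keeps the dtype of the original labels; for integer
# labels it is expected to report "int64", which is what the new
# condition in build_query_compiler checks before casting columns back.
print(meta["column_indexes"][0]["numpy_type"])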
modin/pandas/test/test_io.py (11 changes: 11 additions & 0 deletions)
@@ -2034,6 +2034,17 @@ def test_read_parquet_5767(self, tmp_path, engine):
         # both Modin and pandas read column "b" as a category
         df_equals(test_df, read_df.astype("int64"))

+    def test_read_parquet_6855(self, tmp_path, engine):
+        if engine == "fastparquet":
+            pytest.skip("integer columns aren't supported")
+        test_df = pandas.DataFrame(np.random.rand(10**2, 10))
+        path = tmp_path / "data"
+        path.mkdir()
+        file_name = "issue6855.parquet"
+        test_df.to_parquet(path / file_name, engine=engine)
+        read_df = pd.read_parquet(path / file_name, engine=engine)
+        df_equals(test_df, read_df)
+
     def test_read_parquet_s3_with_column_partitioning(
         self, s3_resource, engine, s3_storage_options
     ):
