Commit
FIX-modin-project#6855: Make sure read_parquet works with integer columns for pyarrow engine

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
anmyachev committed Jan 22, 2024
1 parent 097ea52 commit 90c9d78
Showing 2 changed files with 19 additions and 1 deletion.
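For context, below is a minimal sketch of the scenario this commit addresses (the file name and local path are illustrative, not part of the change): a DataFrame whose column labels are integers is written with the pyarrow engine and read back through Modin.

import numpy as np
import pandas
import modin.pandas as pd

# Column labels are integers (a RangeIndex), not strings.
pandas_df = pandas.DataFrame(np.random.rand(100, 10))
pandas_df.to_parquet("issue6855.parquet", engine="pyarrow")

# Parquet stores column names as strings, so without the conversion added
# in parquet_dispatcher.py the labels presumably come back as "0", "1", ...
# and the result no longer matches pandas (the behavior behind modin-project#6855).
modin_df = pd.read_parquet("issue6855.parquet", engine="pyarrow")
print(modin_df.columns)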
modin/core/io/column_stores/parquet_dispatcher.py (9 changes: 8 additions & 1 deletion)
@@ -686,6 +686,14 @@ def build_query_compiler(cls, dataset, columns, index_columns, **kwargs):
             row_lengths = [part.length() for part in remote_parts.T[0]]
         else:
             row_lengths = None
+
+        if (
+            dataset.pandas_metadata
+            and "column_indexes" in dataset.pandas_metadata
+            and dataset.pandas_metadata["column_indexes"][0]["numpy_type"] == "int64"
+        ):
+            columns = pandas.Index(columns).astype("int64").to_list()
+
         frame = cls.frame_cls(
             remote_parts,
             index,
@@ -801,7 +809,6 @@ def _read(cls, path, engine, columns, use_nullable_dtypes, dtype_backend, **kwargs):
             for c in column_names
             if c not in index_columns and not cls.index_regex.match(c)
         ]
-
         return cls.build_query_compiler(
             dataset, columns, index_columns, dtype_backend=dtype_backend, **kwargs
         )
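The new check in build_query_compiler keys off the "pandas" metadata block that pyarrow writes into the Parquet footer: field names in the file are strings, while the column_indexes entry records the dtype of the original column labels. A small inspection sketch (pure pandas/pyarrow, not Modin code; the file name is illustrative):

import pandas
import pyarrow.parquet as pq

df = pandas.DataFrame([[1, 2], [3, 4]], columns=[0, 1])
df.to_parquet("ints.parquet", engine="pyarrow")

schema = pq.read_schema("ints.parquet")
# The Arrow/Parquet field names are strings ("0", "1", ...).
print(schema.names)

meta = schema.pandas_metadata
# column_indexes keeps the dtype of the original labels; for integer
# labels it is expected to report "int64", which is what the new
# condition in build_query_compiler checks before casting columns back.
print(meta["column_indexes"][0]["numpy_type"])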
modin/pandas/test/test_io.py (11 changes: 11 additions & 0 deletions)
@@ -2034,6 +2034,17 @@ def test_read_parquet_5767(self, tmp_path, engine):
         # both Modin and pandas read column "b" as a category
         df_equals(test_df, read_df.astype("int64"))

+    def test_read_parquet_6855(self, tmp_path, engine):
+        if engine == "fastparquet":
+            pytest.skip("integer columns aren't supported")
+        test_df = pandas.DataFrame(np.random.rand(10**2, 10))
+        path = tmp_path / "data"
+        path.mkdir()
+        file_name = "issue6855.parquet"
+        test_df.to_parquet(path / file_name, engine=engine)
+        read_df = pd.read_parquet(path / file_name, engine=engine)
+        df_equals(test_df, read_df)
+
     def test_read_parquet_s3_with_column_partitioning(
         self, s3_resource, engine, s3_storage_options
     ):
