Skip to content

Commit

Permalink
GH-34404: [Python] Failing tests because pandas.Index can now store a…
Browse files Browse the repository at this point in the history
…ll numeric dtypes (not only 64bit versions) (#34498)

### Rationale for this change
Several failing tests in the nightly build (https://github.com/ursacomputing/crossbow/actions/runs/4277727973/jobs/7446784501) 

### What changes are included in this PR?
Due to a change in the supported dtypes for Index in pandas, the tests expecting `int64` and not `int32` are failing with the dev version of pandas. The failing tests are updated to match the new pandas behaviour.
* Closes: #34404

Authored-by: Alenka Frim <frim.alenka@gmail.com>
Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
  • Loading branch information
AlenkaF committed Mar 10, 2023
1 parent 9baefea commit 71f3c56
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 17 deletions.
11 changes: 9 additions & 2 deletions python/pyarrow/tests/parquet/test_dataset.py
Expand Up @@ -735,8 +735,15 @@ def _partition_test_for_filesystem(fs, base_path, use_legacy_dataset=True):
.reset_index(drop=True)
.reindex(columns=result_df.columns))

expected_df['foo'] = pd.Categorical(df['foo'], categories=foo_keys)
expected_df['bar'] = pd.Categorical(df['bar'], categories=bar_keys)
if use_legacy_dataset or Version(pd.__version__) < Version("2.0.0"):
expected_df['foo'] = pd.Categorical(df['foo'], categories=foo_keys)
expected_df['bar'] = pd.Categorical(df['bar'], categories=bar_keys)
else:
        # With pandas 2.0.0, Index can store all numeric dtypes (not just
        # int64/uint64/float64). Using astype() to create a categorical
        # column preserves the original dtype (int32)
expected_df['foo'] = expected_df['foo'].astype("category")
expected_df['bar'] = expected_df['bar'].astype("category")

assert (result_df.columns == ['index', 'values', 'foo', 'bar']).all()

Expand Down
40 changes: 27 additions & 13 deletions python/pyarrow/tests/test_compute.py
Expand Up @@ -1952,22 +1952,36 @@ def _check_datetime_components(timestamps, timezone=None):
[iso_year, iso_week, iso_day],
fields=iso_calendar_fields)

assert pc.year(tsa).equals(pa.array(ts.dt.year))
    # Casting is required because in pandas 2.0.0 various numeric
    # date/time attributes have dtype int32 (previously int64)
year = ts.dt.year.astype("int64")
month = ts.dt.month.astype("int64")
day = ts.dt.day.astype("int64")
dayofweek = ts.dt.dayofweek.astype("int64")
dayofyear = ts.dt.dayofyear.astype("int64")
quarter = ts.dt.quarter.astype("int64")
hour = ts.dt.hour.astype("int64")
minute = ts.dt.minute.astype("int64")
second = ts.dt.second.values.astype("int64")
microsecond = ts.dt.microsecond.astype("int64")
nanosecond = ts.dt.nanosecond.astype("int64")

assert pc.year(tsa).equals(pa.array(year))
assert pc.is_leap_year(tsa).equals(pa.array(ts.dt.is_leap_year))
assert pc.month(tsa).equals(pa.array(ts.dt.month))
assert pc.day(tsa).equals(pa.array(ts.dt.day))
assert pc.day_of_week(tsa).equals(pa.array(ts.dt.dayofweek))
assert pc.day_of_year(tsa).equals(pa.array(ts.dt.dayofyear))
assert pc.month(tsa).equals(pa.array(month))
assert pc.day(tsa).equals(pa.array(day))
assert pc.day_of_week(tsa).equals(pa.array(dayofweek))
assert pc.day_of_year(tsa).equals(pa.array(dayofyear))
assert pc.iso_year(tsa).equals(pa.array(iso_year))
assert pc.iso_week(tsa).equals(pa.array(iso_week))
assert pc.iso_calendar(tsa).equals(iso_calendar)
assert pc.quarter(tsa).equals(pa.array(ts.dt.quarter))
assert pc.hour(tsa).equals(pa.array(ts.dt.hour))
assert pc.minute(tsa).equals(pa.array(ts.dt.minute))
assert pc.second(tsa).equals(pa.array(ts.dt.second.values))
assert pc.millisecond(tsa).equals(pa.array(ts.dt.microsecond // 10 ** 3))
assert pc.microsecond(tsa).equals(pa.array(ts.dt.microsecond % 10 ** 3))
assert pc.nanosecond(tsa).equals(pa.array(ts.dt.nanosecond))
assert pc.quarter(tsa).equals(pa.array(quarter))
assert pc.hour(tsa).equals(pa.array(hour))
assert pc.minute(tsa).equals(pa.array(minute))
assert pc.second(tsa).equals(pa.array(second))
assert pc.millisecond(tsa).equals(pa.array(microsecond // 10 ** 3))
assert pc.microsecond(tsa).equals(pa.array(microsecond % 10 ** 3))
assert pc.nanosecond(tsa).equals(pa.array(nanosecond))
assert pc.subsecond(tsa).equals(pa.array(subseconds))
assert pc.local_timestamp(tsa).equals(pa.array(ts.dt.tz_localize(None)))

Expand All @@ -1982,7 +1996,7 @@ def _check_datetime_components(timestamps, timezone=None):
day_of_week_options = pc.DayOfWeekOptions(
count_from_zero=False, week_start=1)
assert pc.day_of_week(tsa, options=day_of_week_options).equals(
pa.array(ts.dt.dayofweek + 1))
pa.array(dayofweek + 1))

week_options = pc.WeekOptions(
week_starts_monday=True, count_from_zero=False,
Expand Down
4 changes: 2 additions & 2 deletions python/pyarrow/tests/test_pandas.py
Expand Up @@ -3250,7 +3250,7 @@ def test_table_from_pandas_schema_index_columns():
schema = pa.schema([
('a', pa.int64()),
('b', pa.float64()),
('index', pa.int32()),
('index', pa.int64()),
])

# schema includes index with name not in dataframe
Expand Down Expand Up @@ -3283,7 +3283,7 @@ def test_table_from_pandas_schema_index_columns():

# schema has different order (index column not at the end)
schema = pa.schema([
('index', pa.int32()),
('index', pa.int64()),
('a', pa.int64()),
('b', pa.float64()),
])
Expand Down

0 comments on commit 71f3c56

Please sign in to comment.