Skip to content

Commit

Permalink
GH-34404: [Python] Failing tests because pandas.Index can now store a…
Browse files Browse the repository at this point in the history
…ll numeric dtypes (not only 64bit versions) (#34498)

### Rationale for this change
Several failing tests in the nightly build (https://github.com/ursacomputing/crossbow/actions/runs/4277727973/jobs/7446784501) 

### What changes are included in this PR?
Due to a change in the supported dtypes for Index in pandas, the tests expecting `int64` and not `int32` are failing with the dev version of pandas. The failing tests are updated to match the new pandas behaviour.
* Closes: #34404

Authored-by: Alenka Frim <frim.alenka@gmail.com>
Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
  • Loading branch information
AlenkaF committed Mar 10, 2023
1 parent 9baefea commit 71f3c56
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 17 deletions.
11 changes: 9 additions & 2 deletions python/pyarrow/tests/parquet/test_dataset.py
Expand Up @@ -735,8 +735,15 @@ def _partition_test_for_filesystem(fs, base_path, use_legacy_dataset=True):
.reset_index(drop=True)
.reindex(columns=result_df.columns))

expected_df['foo'] = pd.Categorical(df['foo'], categories=foo_keys)
expected_df['bar'] = pd.Categorical(df['bar'], categories=bar_keys)
if use_legacy_dataset or Version(pd.__version__) < Version("2.0.0"):
expected_df['foo'] = pd.Categorical(df['foo'], categories=foo_keys)
expected_df['bar'] = pd.Categorical(df['bar'], categories=bar_keys)
else:
        # With pandas 2.0.0, Index can store all numeric dtypes (not just
        # int64/uint64/float64). Using astype() to create a categorical
        # column preserves the original dtype (int32)
expected_df['foo'] = expected_df['foo'].astype("category")
expected_df['bar'] = expected_df['bar'].astype("category")

assert (result_df.columns == ['index', 'values', 'foo', 'bar']).all()

Expand Down
40 changes: 27 additions & 13 deletions python/pyarrow/tests/test_compute.py
Expand Up @@ -1952,22 +1952,36 @@ def _check_datetime_components(timestamps, timezone=None):
[iso_year, iso_week, iso_day],
fields=iso_calendar_fields)

assert pc.year(tsa).equals(pa.array(ts.dt.year))
    # Casting is required because in pandas 2.0.0 various numeric
    # date/time attributes have dtype int32 (previously int64)
year = ts.dt.year.astype("int64")
month = ts.dt.month.astype("int64")
day = ts.dt.day.astype("int64")
dayofweek = ts.dt.dayofweek.astype("int64")
dayofyear = ts.dt.dayofyear.astype("int64")
quarter = ts.dt.quarter.astype("int64")
hour = ts.dt.hour.astype("int64")
minute = ts.dt.minute.astype("int64")
second = ts.dt.second.values.astype("int64")
microsecond = ts.dt.microsecond.astype("int64")
nanosecond = ts.dt.nanosecond.astype("int64")

assert pc.year(tsa).equals(pa.array(year))
assert pc.is_leap_year(tsa).equals(pa.array(ts.dt.is_leap_year))
assert pc.month(tsa).equals(pa.array(ts.dt.month))
assert pc.day(tsa).equals(pa.array(ts.dt.day))
assert pc.day_of_week(tsa).equals(pa.array(ts.dt.dayofweek))
assert pc.day_of_year(tsa).equals(pa.array(ts.dt.dayofyear))
assert pc.month(tsa).equals(pa.array(month))
assert pc.day(tsa).equals(pa.array(day))
assert pc.day_of_week(tsa).equals(pa.array(dayofweek))
assert pc.day_of_year(tsa).equals(pa.array(dayofyear))
assert pc.iso_year(tsa).equals(pa.array(iso_year))
assert pc.iso_week(tsa).equals(pa.array(iso_week))
assert pc.iso_calendar(tsa).equals(iso_calendar)
assert pc.quarter(tsa).equals(pa.array(ts.dt.quarter))
assert pc.hour(tsa).equals(pa.array(ts.dt.hour))
assert pc.minute(tsa).equals(pa.array(ts.dt.minute))
assert pc.second(tsa).equals(pa.array(ts.dt.second.values))
assert pc.millisecond(tsa).equals(pa.array(ts.dt.microsecond // 10 ** 3))
assert pc.microsecond(tsa).equals(pa.array(ts.dt.microsecond % 10 ** 3))
assert pc.nanosecond(tsa).equals(pa.array(ts.dt.nanosecond))
assert pc.quarter(tsa).equals(pa.array(quarter))
assert pc.hour(tsa).equals(pa.array(hour))
assert pc.minute(tsa).equals(pa.array(minute))
assert pc.second(tsa).equals(pa.array(second))
assert pc.millisecond(tsa).equals(pa.array(microsecond // 10 ** 3))
assert pc.microsecond(tsa).equals(pa.array(microsecond % 10 ** 3))
assert pc.nanosecond(tsa).equals(pa.array(nanosecond))
assert pc.subsecond(tsa).equals(pa.array(subseconds))
assert pc.local_timestamp(tsa).equals(pa.array(ts.dt.tz_localize(None)))

Expand All @@ -1982,7 +1996,7 @@ def _check_datetime_components(timestamps, timezone=None):
day_of_week_options = pc.DayOfWeekOptions(
count_from_zero=False, week_start=1)
assert pc.day_of_week(tsa, options=day_of_week_options).equals(
pa.array(ts.dt.dayofweek + 1))
pa.array(dayofweek + 1))

week_options = pc.WeekOptions(
week_starts_monday=True, count_from_zero=False,
Expand Down
4 changes: 2 additions & 2 deletions python/pyarrow/tests/test_pandas.py
Expand Up @@ -3250,7 +3250,7 @@ def test_table_from_pandas_schema_index_columns():
schema = pa.schema([
('a', pa.int64()),
('b', pa.float64()),
('index', pa.int32()),
('index', pa.int64()),
])

# schema includes index with name not in dataframe
Expand Down Expand Up @@ -3283,7 +3283,7 @@ def test_table_from_pandas_schema_index_columns():

# schema has different order (index column not at the end)
schema = pa.schema([
('index', pa.int32()),
('index', pa.int64()),
('a', pa.int64()),
('b', pa.float64()),
])
Expand Down

0 comments on commit 71f3c56

Please sign in to comment.