ARROW-18173: [Python] Drop older versions of Pandas (<1.0) (#14631)
This PR drops support for pandas versions older than 1.0.0, making pandas >= 1.0.0 the minimum version for pyarrow's pandas integration. A simplified sketch of the resulting version gate follows the checklist below.

The changes cover:
- [x] the official documentation (pandas version support)
- [x] the CI jobs supporting older pandas versions
- [x] https://github.com/apache/arrow/blob/master/python/pyarrow/pandas-shim.pxi
- [x] tests that are specifically testing features on older versions of pandas
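
For orientation, the updated check in `pandas-shim.pxi` behaves roughly as follows. This is a plain-Python sketch of the Cython shim; the helper name `_gate_pandas_version` is illustrative, not from the codebase:

```python
import warnings

from pyarrow.vendored.version import Version  # comparison helper vendored by pyarrow


def _gate_pandas_version(installed_version, raise_=True):
    # Hypothetical helper mirroring the shim: anything below 1.0.0 is rejected.
    if Version(installed_version) < Version("1.0.0"):
        msg = ("pyarrow requires pandas 1.0.0 or above, pandas {} is "
               "installed".format(installed_version))
        if raise_:
            raise ImportError(msg)
        warnings.warn(msg + ". Therefore, pandas-specific integration is "
                      "not used.", stacklevel=2)
        return False
    return True
```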

Lead-authored-by: Alenka Frim <frim.alenka@gmail.com>
Co-authored-by: Alenka Frim <AlenkaF@users.noreply.github.com>
Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
AlenkaF and jorisvandenbossche committed Nov 22, 2022
1 parent 3cc982e commit f769f6b
Showing 10 changed files with 92 additions and 177 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/python.yml
@@ -54,7 +54,7 @@ jobs:
name:
- conda-python-docs
- conda-python-3.8-nopandas
- conda-python-3.7-pandas-0.23
- conda-python-3.7-pandas-1.0
- conda-python-3.9-pandas-latest
include:
- name: conda-python-docs
@@ -67,12 +67,12 @@
image: conda-python
title: AMD64 Conda Python 3.8 Without Pandas
python: 3.8
- name: conda-python-3.7-pandas-0.23
- name: conda-python-3.7-pandas-1.0
cache: conda-python-3.7
image: conda-python-pandas
title: AMD64 Conda Python 3.7 Pandas 0.23
title: AMD64 Conda Python 3.7 Pandas 1.0
python: 3.7
pandas: 0.23
pandas: 1.0
numpy: 1.16
- name: conda-python-3.9-pandas-latest
cache: conda-python-3.9
15 changes: 15 additions & 0 deletions docs/source/python/install.rst
@@ -61,3 +61,18 @@ Installing from source
----------------------

See :ref:`python-development`.

Dependencies
------------

Required dependency

* **NumPy 1.16.6** or higher.

Optional dependencies

* **pandas 1.0** or higher,
* **cffi**.

Additional packages PyArrow is compatible with are :ref:`fsspec <filesystem-fsspec>`,
and the **pytz**, **dateutil** or **tzdata** packages for timezones.
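
As a quick, illustrative way to check an environment against these minimums (not part of the committed documentation; it reuses the version helper pyarrow vendors):

```python
import numpy as np
import pandas as pd

from pyarrow.vendored.version import Version

# Documented minimums after this change: NumPy 1.16.6 is required;
# pandas 1.0 is needed only for the optional pandas integration.
assert Version(np.__version__) >= Version("1.16.6"), "NumPy too old for pyarrow"
assert Version(pd.__version__) >= Version("1.0"), "pandas too old for pyarrow integration"
```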
9 changes: 0 additions & 9 deletions python/pyarrow/feather.py
@@ -24,12 +24,6 @@
import pyarrow.lib as ext
from pyarrow import _feather
from pyarrow._feather import FeatherError # noqa: F401
from pyarrow.vendored.version import Version


def _check_pandas_version():
if _pandas_api.loose_version < Version('0.17.0'):
raise ImportError("feather requires pandas >= 0.17.0")


class FeatherDataset:
@@ -96,7 +90,6 @@ def read_pandas(self, columns=None, use_threads=True):
pandas.DataFrame
Content of the file as a pandas DataFrame (of columns)
"""
_check_pandas_version()
return self.read_table(columns=columns).to_pandas(
use_threads=use_threads)

@@ -145,7 +138,6 @@ def write_feather(df, dest, compression=None, compression_level=None,
limited legacy format
"""
if _pandas_api.have_pandas:
_check_pandas_version()
if (_pandas_api.has_sparse and
isinstance(df, _pandas_api.pd.SparseDataFrame)):
df = df.to_dense()
@@ -230,7 +222,6 @@ def read_feather(source, columns=None, use_threads=True,
-------
df : pandas.DataFrame
"""
_check_pandas_version()
return (read_table(
source, columns=columns, memory_map=memory_map,
use_threads=use_threads).to_pandas(use_threads=use_threads, **kwargs))
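
For context, the guard is dropped from these public feather entry points because the pandas shim now enforces the minimum version globally. A minimal round trip through that API (the file name is illustrative):

```python
import pandas as pd
import pyarrow.feather as feather

df = pd.DataFrame({"a": [1, 2, 3]})
feather.write_feather(df, "example.feather")    # no per-call pandas version check anymore
print(feather.read_feather("example.feather"))  # reads back as a pandas.DataFrame
```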
27 changes: 7 additions & 20 deletions python/pyarrow/pandas-shim.pxi
@@ -59,16 +59,16 @@ cdef class _PandasAPIShim(object):
self._version = pd.__version__
self._loose_version = Version(pd.__version__)

if self._loose_version < Version('0.23.0'):
if self._loose_version < Version('1.0.0'):
self._have_pandas = False
if raise_:
raise ImportError(
"pyarrow requires pandas 0.23.0 or above, pandas {} is "
"pyarrow requires pandas 1.0.0 or above, pandas {} is "
"installed".format(self._version)
)
else:
warnings.warn(
"pyarrow requires pandas 0.23.0 or above, pandas {} is "
"pyarrow requires pandas 1.0.0 or above, pandas {} is "
"installed. Therefore, pandas-specific integration is not "
"used.".format(self._version), stacklevel=2)
return
@@ -83,22 +83,12 @@ cdef class _PandasAPIShim(object):
self._series, self._index, self._categorical_type,
self._extension_array)
self._extension_dtype = pd.api.extensions.ExtensionDtype
if self._loose_version >= Version('0.24.0'):
self._is_extension_array_dtype = \
pd.api.types.is_extension_array_dtype
else:
self._is_extension_array_dtype = None

self._is_extension_array_dtype = (
pd.api.types.is_extension_array_dtype)
self._types_api = pd.api.types
self._datetimetz_type = pd.api.types.DatetimeTZDtype
self._have_pandas = True

if self._loose_version > Version('0.25'):
self.has_sparse = False
else:
self.has_sparse = True

self._pd024 = self._loose_version >= Version('0.24')
self.has_sparse = False

cdef inline _check_import(self, bint raise_=True):
if self._tried_importing_pandas:
@@ -232,10 +222,7 @@ cdef class _PandasAPIShim(object):
self._check_import()
if isinstance(obj.dtype, (self.pd.api.types.IntervalDtype,
self.pd.api.types.PeriodDtype)):
if self._pd024:
# only since pandas 0.24, interval and period are stored as
# such in Series
return obj.array
return obj.array
return obj.values

def assert_frame_equal(self, *args, **kwargs):
3 changes: 1 addition & 2 deletions python/pyarrow/pandas_compat.py
@@ -1089,9 +1089,8 @@ def _pandas_type_to_numpy_type(pandas_type):


def _get_multiindex_codes(mi):
# compat for pandas < 0.24 (MI labels renamed to codes).
if isinstance(mi, _pandas_api.pd.MultiIndex):
return mi.codes if hasattr(mi, 'codes') else mi.labels
return mi.codes
else:
return None
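
Background: pandas 0.24 renamed `MultiIndex.labels` to `MultiIndex.codes`, so with pandas >= 1.0 as the floor the `hasattr` fallback is dead code. A small illustration of the surviving attribute:

```python
import pandas as pd

mi = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
print(mi.codes)  # integer codes per level, e.g. [[0, 0, 1], [0, 1, 0]]
```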

10 changes: 4 additions & 6 deletions python/pyarrow/tests/parquet/test_dataset.py
@@ -250,13 +250,11 @@ def test_filters_equivalency(tempdir, use_legacy_dataset):
result_df = table.to_pandas().reset_index(drop=True)

# Check that all rows in the DF fulfill the filter
# Pandas 0.23.x has problems with indexing constant memoryviews in
# categoricals. Thus we need to make an explicit copy here with np.array.
df_filter_1 = (np.array(result_df['integer']) == 1) \
& (np.array(result_df['string']) != 'b') \
& (np.array(result_df['boolean']) == 'True')
df_filter_1 = (result_df['integer'] == 1) \
& (result_df['string'] != 'b') \
& (result_df['boolean'] == 'True')
df_filter_2 = (np.array(result_df['integer']) == 0) \
& (np.array(result_df['boolean']) == 'False')
& (result_df['boolean'] == 'False')
assert df_filter_1.sum() > 0
assert df_filter_2.sum() > 0
assert result_df.shape[0] == (df_filter_1.sum() + df_filter_2.sum())
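
The simplified assertions rely on ordinary pandas boolean masks, whose sums count matching rows; a self-contained illustration with made-up data:

```python
import pandas as pd

df = pd.DataFrame({"integer": [0, 1, 1], "boolean": ["True", "False", "True"]})
mask = (df["integer"] == 1) & (df["boolean"] == "True")
print(mask.sum())  # 1 -- exactly one row satisfies both conditions
```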
5 changes: 0 additions & 5 deletions python/pyarrow/tests/parquet/test_pandas.py
@@ -26,7 +26,6 @@
from pyarrow.tests.parquet.common import (
parametrize_legacy_dataset, parametrize_legacy_dataset_not_supported)
from pyarrow.util import guid
from pyarrow.vendored.version import Version

try:
import pyarrow.parquet as pq
@@ -561,10 +560,6 @@ def test_pandas_categorical_roundtrip(use_legacy_dataset):
def test_write_to_dataset_pandas_preserve_extensiondtypes(
tempdir, use_legacy_dataset
):
# ARROW-8251 - preserve pandas extension dtypes in roundtrip
if Version(pd.__version__) < Version("1.0.0"):
pytest.skip("__arrow_array__ added to pandas in 1.0.0")

df = pd.DataFrame({'part': 'a', "col": [1, 2, 3]})
df['col'] = df['col'].astype("Int64")
table = pa.table(df)
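
The version skip could be removed because pandas 1.0 is now the floor, and from 1.0 on pandas' nullable extension arrays implement the `__arrow_array__` hook that `pa.table()` uses for the conversion. A minimal sketch with illustrative data:

```python
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"col": pd.array([1, 2, None], dtype="Int64")})
table = pa.table(df)             # conversion goes through the array's __arrow_array__
print(table.column("col").type)  # int64, with the missing value kept as a null
```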
82 changes: 31 additions & 51 deletions python/pyarrow/tests/test_compute.py
@@ -1812,14 +1812,6 @@ def test_strftime():
@pytest.mark.skipif(sys.platform == 'win32',
reason="Timezone database is not available on Windows yet")
def test_strftime():
from pyarrow.vendored.version import Version

def _fix_timestamp(s):
if Version(pd.__version__) < Version("1.0.0"):
return s.to_series().replace("NaT", pd.NaT)
else:
return s

times = ["2018-03-10 09:00", "2038-01-31 12:23", None]
timezones = ["CET", "UTC", "Europe/Ljubljana"]

@@ -1834,50 +1826,51 @@ def _fix_timestamp(s):
for fmt in formats:
options = pc.StrftimeOptions(fmt)
result = pc.strftime(tsa, options=options)
expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
expected = pa.array(ts.strftime(fmt))
assert result.equals(expected)

fmt = "%Y-%m-%dT%H:%M:%S"

# Default format
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
result = pc.strftime(tsa, options=pc.StrftimeOptions())
expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
expected = pa.array(ts.strftime(fmt))
assert result.equals(expected)

# Default format plus timezone
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z"))
expected = pa.array(_fix_timestamp(ts.strftime(fmt + "%Z")))
expected = pa.array(ts.strftime(fmt + "%Z"))
assert result.equals(expected)

# Pandas %S is equivalent to %S in arrow for unit="s"
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
options = pc.StrftimeOptions("%S")
result = pc.strftime(tsa, options=options)
expected = pa.array(_fix_timestamp(ts.strftime("%S")))
expected = pa.array(ts.strftime("%S"))
assert result.equals(expected)

# Pandas %S.%f is equivalent to %S in arrow for unit="us"
tsa = pa.array(ts, type=pa.timestamp("us", timezone))
options = pc.StrftimeOptions("%S")
result = pc.strftime(tsa, options=options)
expected = pa.array(_fix_timestamp(ts.strftime("%S.%f")))
expected = pa.array(ts.strftime("%S.%f"))
assert result.equals(expected)

# Test setting locale
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
options = pc.StrftimeOptions(fmt, locale="C")
result = pc.strftime(tsa, options=options)
expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
expected = pa.array(ts.strftime(fmt))
assert result.equals(expected)

# Test timestamps without timezone
fmt = "%Y-%m-%dT%H:%M:%S"
ts = pd.to_datetime(times)
tsa = pa.array(ts, type=pa.timestamp("s"))
result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt))
expected = pa.array(_fix_timestamp(ts.strftime(fmt)))
expected = pa.array(ts.strftime(fmt))

# Positional format
assert pc.strftime(tsa, fmt) == result
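
The `_fix_timestamp` shim above could be deleted because, as of pandas 1.0, `DatetimeIndex.strftime` keeps missing entries as `NaT` instead of the string `"NaT"` (my reading of why the workaround was only needed for < 1.0). A quick check:

```python
import pandas as pd

ts = pd.to_datetime(["2018-03-10 09:00", None])
print(ts.strftime("%Y-%m-%dT%H:%M:%S"))
# On pandas >= 1.0: Index(['2018-03-10T09:00:00', NaT], dtype='object')
```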

@@ -1956,8 +1949,6 @@ def _check_datetime_components(timestamps, timezone=None):

@pytest.mark.pandas
def test_extract_datetime_components():
from pyarrow.vendored.version import Version

timestamps = ["1970-01-01T00:00:59.123456789",
"2000-02-29T23:23:23.999999999",
"2033-05-18T03:33:20.000000000",
@@ -1983,8 +1974,6 @@ def test_extract_datetime_components():
if sys.platform == 'win32':
# TODO: We should test on windows once ARROW-13168 is resolved.
pytest.skip('Timezone database is not available on Windows yet')
elif Version(pd.__version__) < Version('1.0.0'):
pytest.skip('Pandas < 1.0 extracts time components incorrectly.')
else:
for timezone in timezones:
_check_datetime_components(timestamps, timezone)
@@ -1995,8 +1984,6 @@ def test_assume_timezone():
@pytest.mark.skipif(sys.platform == 'win32',
reason="Timezone database is not available on Windows yet")
def test_assume_timezone():
from pyarrow.vendored.version import Version

ts_type = pa.timestamp("ns")
timestamps = pd.to_datetime(["1970-01-01T00:00:59.123456789",
"2000-02-29T23:23:23.999999999",
@@ -2040,31 +2027,29 @@ def test_assume_timezone():

timezone = "Europe/Brussels"

# nonexistent parameter was introduced in Pandas 0.24.0
if Version(pd.__version__) >= Version("0.24.0"):
options_nonexistent_raise = pc.AssumeTimezoneOptions(timezone)
options_nonexistent_earliest = pc.AssumeTimezoneOptions(
timezone, ambiguous="raise", nonexistent="earliest")
options_nonexistent_latest = pc.AssumeTimezoneOptions(
timezone, ambiguous="raise", nonexistent="latest")

with pytest.raises(ValueError,
match="Timestamp doesn't exist in "
f"timezone '{timezone}'"):
pc.assume_timezone(nonexistent_array,
options=options_nonexistent_raise)

expected = pa.array(nonexistent.tz_localize(
timezone, nonexistent="shift_forward"))
result = pc.assume_timezone(
nonexistent_array, options=options_nonexistent_latest)
expected.equals(result)

expected = pa.array(nonexistent.tz_localize(
timezone, nonexistent="shift_backward"))
result = pc.assume_timezone(
nonexistent_array, options=options_nonexistent_earliest)
expected.equals(result)
options_nonexistent_raise = pc.AssumeTimezoneOptions(timezone)
options_nonexistent_earliest = pc.AssumeTimezoneOptions(
timezone, ambiguous="raise", nonexistent="earliest")
options_nonexistent_latest = pc.AssumeTimezoneOptions(
timezone, ambiguous="raise", nonexistent="latest")

with pytest.raises(ValueError,
match="Timestamp doesn't exist in "
f"timezone '{timezone}'"):
pc.assume_timezone(nonexistent_array,
options=options_nonexistent_raise)

expected = pa.array(nonexistent.tz_localize(
timezone, nonexistent="shift_forward"))
result = pc.assume_timezone(
nonexistent_array, options=options_nonexistent_latest)
expected.equals(result)

expected = pa.array(nonexistent.tz_localize(
timezone, nonexistent="shift_backward"))
result = pc.assume_timezone(
nonexistent_array, options=options_nonexistent_earliest)
expected.equals(result)

options_ambiguous_raise = pc.AssumeTimezoneOptions(timezone)
options_ambiguous_latest = pc.AssumeTimezoneOptions(
@@ -2199,11 +2184,6 @@ def _check_temporal_rounding(ts, values, unit):
"second", "minute", "hour", "day"))
@pytest.mark.pandas
def test_round_temporal(unit):
from pyarrow.vendored.version import Version

if Version(pd.__version__) < Version('1.0.0'):
pytest.skip('Pandas < 1.0 rounds differently.')

values = (1, 2, 3, 4, 5, 6, 7, 10, 15, 24, 60, 250, 500, 750)
timestamps = [
"1923-07-07 08:52:35.203790336",
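
Regarding the now-unconditional `nonexistent=` blocks in `test_assume_timezone` above: `tz_localize` gained that keyword in pandas 0.24, so with pandas >= 1.0 it is always available. A small illustration around a DST gap (the date and zone are illustrative):

```python
import pandas as pd

# 02:30 on 2021-03-28 does not exist in Europe/Brussels (clocks jump 02:00 -> 03:00).
idx = pd.to_datetime(["2021-03-28 02:30:00"])
print(idx.tz_localize("Europe/Brussels", nonexistent="shift_forward"))
# DatetimeIndex(['2021-03-28 03:00:00+02:00'], dtype='datetime64[ns, Europe/Brussels]')
```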