Skip to content

Commit

Permalink
BUG: Pandas can't restore index from parquet with offset-specified ti…
Browse files Browse the repository at this point in the history
  • Loading branch information
alippai committed Oct 7, 2020
1 parent f1adcd1 commit 1772840
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 3 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,7 @@ I/O
- Bug in :meth:`read_csv` with ``engine='python'`` truncating data if multiple items present in first row and first element started with BOM (:issue:`36343`)
- Removed ``private_key`` and ``verbose`` from :func:`read_gbq` as they are no longer supported in ``pandas-gbq`` (:issue:`34654`, :issue:`30200`)
- Bumped minimum pytables version to 3.5.1 to avoid a ``ValueError`` in :meth:`read_hdf` (:issue:`24839`)
- Bug in :meth:`read_parquet` with fixed offset timezones. String representation of timezones was not recognized (:issue:`35997`, :issue:`36004`)

Plotting
^^^^^^^^
Expand Down
10 changes: 9 additions & 1 deletion pandas/_libs/tslibs/timezones.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from datetime import timezone
from datetime import timedelta, timezone

from cpython.datetime cimport datetime, timedelta, tzinfo

Expand Down Expand Up @@ -102,6 +102,14 @@ cpdef inline tzinfo maybe_get_tz(object tz):
# On Python 3 on Windows, the filename is not always set correctly.
if isinstance(tz, _dateutil_tzfile) and '.tar.gz' in tz._filename:
tz._filename = zone
elif tz[0] in {'-', '+'}:
hours = int(tz[0:3])
minutes = int(tz[0] + tz[4:6])
tz = timezone(timedelta(hours=hours, minutes=minutes))
elif tz[0:4] in {'UTC-', 'UTC+'}:
hours = int(tz[3:6])
minutes = int(tz[3] + tz[7:9])
tz = timezone(timedelta(hours=hours, minutes=minutes))
else:
tz = pytz.timezone(tz)
elif is_integer_object(tz):
Expand Down
4 changes: 4 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -857,6 +857,10 @@ def iris(datapath):
"Asia/Tokyo",
"dateutil/US/Pacific",
"dateutil/Asia/Singapore",
"+01:15",
"-02:15",
"UTC+01:15",
"UTC-02:15",
tzutc(),
tzlocal(),
FixedOffset(300),
Expand Down
46 changes: 45 additions & 1 deletion pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,21 @@ def df_full():
)


@pytest.fixture(
params=[
datetime.datetime.now(datetime.timezone.utc),
datetime.datetime.now(datetime.timezone.min),
datetime.datetime.now(datetime.timezone.max),
datetime.datetime.strptime("2019-01-04T16:41:24+0200", "%Y-%m-%dT%H:%M:%S%z"),
datetime.datetime.strptime("2019-01-04T16:41:24+0215", "%Y-%m-%dT%H:%M:%S%z"),
datetime.datetime.strptime("2019-01-04T16:41:24-0200", "%Y-%m-%dT%H:%M:%S%z"),
datetime.datetime.strptime("2019-01-04T16:41:24-0215", "%Y-%m-%dT%H:%M:%S%z"),
]
)
def timezone_aware_date_list(request):
return request.param


def check_round_trip(
df,
engine=None,
Expand All @@ -134,6 +149,7 @@ def check_round_trip(
expected=None,
check_names=True,
check_like=False,
check_dtype=True,
repeat=2,
):
"""Verify parquet serializer and deserializer produce the same results.
Expand Down Expand Up @@ -175,7 +191,11 @@ def compare(repeat):
actual = read_parquet(path, **read_kwargs)

tm.assert_frame_equal(
expected, actual, check_names=check_names, check_like=check_like
expected,
actual,
check_names=check_names,
check_like=check_like,
check_dtype=check_dtype,
)

if path is None:
Expand Down Expand Up @@ -739,6 +759,21 @@ def test_timestamp_nanoseconds(self, pa):
df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1n", periods=10)})
check_round_trip(df, pa, write_kwargs={"version": "2.0"})

def test_timezone_aware_index(self, pa, timezone_aware_date_list):
idx = 5 * [timezone_aware_date_list]
df = pd.DataFrame(index=idx, data={"index_as_col": idx})

# see gh-36004
# compare time(zone) values only, skip their class:
# pyarrow always creates fixed offset timezones using pytz.FixedOffset()
# even if it was datetime.timezone() originally
#
# technically they are the same:
# they both implement datetime.tzinfo
# they both wrap datetime.timedelta()
# this use-case sets the resolution to 1 minute
check_round_trip(df, pa, check_dtype=False)

@td.skip_if_no("pyarrow", min_version="0.17")
def test_filter_row_groups(self, pa):
# https://github.com/pandas-dev/pandas/issues/26551
Expand Down Expand Up @@ -877,3 +912,12 @@ def test_empty_dataframe(self, fp):
expected = df.copy()
expected.index.name = "index"
check_round_trip(df, fp, expected=expected)

def test_timezone_aware_index(self, fp, timezone_aware_date_list):
idx = 5 * [timezone_aware_date_list]

df = pd.DataFrame(index=idx, data={"index_as_col": idx})

expected = df.copy()
expected.index.name = "index"
check_round_trip(df, fp, expected=expected)
24 changes: 23 additions & 1 deletion pandas/tests/tslibs/test_timezones.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from datetime import datetime
from datetime import datetime, timedelta, timezone

import dateutil.tz
import pytest
Expand Down Expand Up @@ -118,3 +118,25 @@ def test_maybe_get_tz_invalid_types():
msg = "<class 'pandas._libs.tslibs.timestamps.Timestamp'>"
with pytest.raises(TypeError, match=msg):
timezones.maybe_get_tz(Timestamp.now("UTC"))


def test_maybe_get_tz_offset_only():
# see gh-36004

# timezone.utc
tz = timezones.maybe_get_tz(timezone.utc)
assert tz == timezone(timedelta(hours=0, minutes=0))

# without UTC+- prefix
tz = timezones.maybe_get_tz("+01:15")
assert tz == timezone(timedelta(hours=1, minutes=15))

tz = timezones.maybe_get_tz("-01:15")
assert tz == timezone(-timedelta(hours=1, minutes=15))

# with UTC+- prefix
tz = timezones.maybe_get_tz("UTC+02:45")
assert tz == timezone(timedelta(hours=2, minutes=45))

tz = timezones.maybe_get_tz("UTC-02:45")
assert tz == timezone(-timedelta(hours=2, minutes=45))

0 comments on commit 1772840

Please sign in to comment.