Skip to content

Commit

Permalink
Get local timezone from environment vi pytz, or dateutil.
Browse files Browse the repository at this point in the history
  • Loading branch information
ueshin committed Feb 9, 2018
1 parent 7f5f5fb commit e87bd76
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 10 deletions.
8 changes: 5 additions & 3 deletions python/pyspark/sql/tests.py
Expand Up @@ -3415,7 +3415,9 @@ def setUpClass(cls):
(u"b", 2, 20, 0.4, 4.0, Decimal("4.0"),
date(2012, 2, 2), datetime(2012, 2, 2, 2, 2, 2)),
(u"c", 3, 30, 0.8, 6.0, Decimal("6.0"),
date(2100, 3, 3), datetime(2100, 3, 3, 3, 3, 3))]
date(2100, 3, 3), datetime(2100, 3, 3, 3, 3, 3)),
(u"d", 4, 30, 1.0, 8.0, Decimal("8.0"),
date(2100, 4, 4), datetime(2100, 4, 4, 4, 4, 4))]

@classmethod
def tearDownClass(cls):
Expand Down Expand Up @@ -4124,7 +4126,7 @@ def test_vectorized_udf_timestamps(self):
data = [(0, datetime(1969, 1, 1, 1, 1, 1)),
(1, datetime(2012, 2, 2, 2, 2, 2)),
(2, None),
(3, datetime(2100, 3, 3, 3, 3, 3))]
(3, datetime(2100, 4, 4, 4, 4, 4))]

df = self.spark.createDataFrame(data, schema=schema)

Expand Down Expand Up @@ -4206,7 +4208,7 @@ def test_vectorized_udf_timestamps_respect_session_timezone(self):
data = [(1, datetime(1969, 1, 1, 1, 1, 1)),
(2, datetime(2012, 2, 2, 2, 2, 2)),
(3, None),
(4, datetime(2100, 3, 3, 3, 3, 3))]
(4, datetime(2100, 4, 4, 4, 4, 4))]
df = self.spark.createDataFrame(data, schema=schema)

f_timestamp_copy = pandas_udf(lambda ts: ts, TimestampType())
Expand Down
21 changes: 14 additions & 7 deletions python/pyspark/sql/types.py
Expand Up @@ -1709,6 +1709,15 @@ def _check_dataframe_convert_date(pdf, schema):
return pdf


def _get_local_timezone():
""" Get local timezone from environment vi pytz, or dateutil. """
from pyspark.sql.utils import require_minimum_pandas_version
require_minimum_pandas_version()

import os
return os.environ.get('TZ', 'dateutil/:')


def _check_dataframe_localize_timestamps(pdf, timezone):
"""
Convert timezone aware timestamps to timezone-naive in the specified timezone or local timezone
Expand All @@ -1721,7 +1730,7 @@ def _check_dataframe_localize_timestamps(pdf, timezone):
require_minimum_pandas_version()

from pandas.api.types import is_datetime64tz_dtype
tz = timezone or 'tzlocal()'
tz = timezone or _get_local_timezone()
for column, series in pdf.iteritems():
# TODO: handle nested timestamps, such as ArrayType(TimestampType())?
if is_datetime64tz_dtype(series.dtype):
Expand All @@ -1744,7 +1753,7 @@ def _check_series_convert_timestamps_internal(s, timezone):
from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
# TODO: handle nested timestamps, such as ArrayType(TimestampType())?
if is_datetime64_dtype(s.dtype):
tz = timezone or 'tzlocal()'
tz = timezone or _get_local_timezone()
return s.dt.tz_localize(tz).dt.tz_convert('UTC')
elif is_datetime64tz_dtype(s.dtype):
return s.dt.tz_convert('UTC')
Expand All @@ -1766,15 +1775,13 @@ def _check_series_convert_timestamps_localize(s, from_timezone, to_timezone):

import pandas as pd
from pandas.api.types import is_datetime64tz_dtype, is_datetime64_dtype
from_tz = from_timezone or 'tzlocal()'
to_tz = to_timezone or 'tzlocal()'
from_tz = from_timezone or _get_local_timezone()
to_tz = to_timezone or _get_local_timezone()
# TODO: handle nested timestamps, such as ArrayType(TimestampType())?
if is_datetime64tz_dtype(s.dtype):
return s.dt.tz_convert(to_tz).dt.tz_localize(None)
elif is_datetime64_dtype(s.dtype) and from_tz != to_tz:
# `s.dt.tz_localize('tzlocal()')` doesn't work properly when including NaT.
return s.apply(lambda ts: ts.tz_localize(from_tz).tz_convert(to_tz).tz_localize(None)
if ts is not pd.NaT else pd.NaT)
return s.dt.tz_localize(from_tz).dt.tz_convert(to_tz).dt.tz_localize(None)
else:
return s

Expand Down

0 comments on commit e87bd76

Please sign in to comment.