Skip to content

Commit

Permalink
use email.utils to parse date and time
Browse files Browse the repository at this point in the history
  • Loading branch information
zcqian committed Jul 13, 2021
1 parent 7be893a commit 181a36f
Showing 1 changed file with 10 additions and 28 deletions.
38 changes: 10 additions & 28 deletions biothings/hub/dataload/dumper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import asyncio
import cgi
import email.utils
import inspect
import os
import pprint
Expand Down Expand Up @@ -772,34 +773,15 @@ def remote_is_better(self, remotefile, localfile):
return True
# In accordance with RFC 7231
# The reason we are not using strptime is that it's locale sensitive
# and changing locale and then changing it bace is not thread safe.
# Although at the moment we are using Tornado which uses coroutines,
# we still have threads (for reasons) and down the line we may move
# to a different framework. The Dumper should be agnostic of what
# it is running on. Hence parsing the dates manually
last_modified_str = res.headers[self.LAST_MODIFIED]
http_date_regex = re.compile(
r'(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun), '
r'(?P<day>\d{2}) '
r'(?P<month>Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) '
r'(?P<year>\d{4}) '
r'(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2}) '
r'GMT'
)
match = http_date_regex.match(last_modified_str)
if not match:
self.logger.warning("Last-Modified is in an obsolete format")
return True
month_map = {'Jan': 1, 'Feb': 2, 'Mar':3, 'Apr': 4, 'May': 5, 'Jun':6,
'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
dt_kwargs = {'tzinfo': timezone.utc}
for k in ('year', 'day', 'hour', 'minute', 'second'):
dt_kwargs[k] = int(match.group(k))
dt_kwargs['month'] = month_map[match.group('month')]
# RFC 7231 defines leap seconds
if dt_kwargs['second'] == 60:
dt_kwargs['second'] = 59
dt = datetime(**dt_kwargs)
# and changing locale and then changing it back is not thread safe.
dt_tuple = email.utils.parsedate(res.headers[self.LAST_MODIFIED])
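# email.utils.parsedate is used because it tolerates more malformed date strings than a strict pattern match.
# NOTE(review): parsedate returns None when the header cannot be parsed; confirm that case is handled before dt_tuple is indexed.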
if dt_tuple[5] == 60:
_ = list(dt_tuple)
_[5] = 59
dt_tuple = tuple(_)
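# The clamp above deals with a potential leap second as defined in the RFC (second == 60):
# datetime only accepts seconds 0-59, so 60 is rounded down to 59, which is a good enough solution here.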
dt = datetime(*dt_tuple[:6], tzinfo=timezone.utc) # HTTP-date is always in UTC
remote_lastmodified = dt.timestamp()
try:
res = os.stat(localfile)
Expand Down

0 comments on commit 181a36f

Please sign in to comment.