Skip to content

Commit

Permalink
Merge branch 'feature/fix_date_timezones' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
pypt committed Jul 17, 2019
2 parents 83255c5 + 617676c commit afdb888
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 18 deletions.
26 changes: 19 additions & 7 deletions tests/test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,35 +3,47 @@
import pytest

from usp.exceptions import StripURLToHomepageException
from usp.helpers import html_unescape_strip, parse_sitemap_publication_date, is_http_url, strip_url_to_homepage
from usp.helpers import html_unescape_strip, parse_iso8601_date, is_http_url, strip_url_to_homepage, parse_rfc2822_date


def test_html_unescape_strip():
    """Check html_unescape_strip() on an escaped, padded string and on None."""
    # The input carries an HTML entity ("&amp;") plus leading / trailing
    # whitespace so that the expected value can only be produced if the helper
    # both unescapes and strips; a plain "&" would exercise stripping alone.
    assert html_unescape_strip("  tests &amp; tests  ") == "tests & tests"

    # None is expected to be returned unchanged rather than raising.
    assert html_unescape_strip(None) is None


def test_parse_sitemap_publication_date():
assert parse_sitemap_publication_date("1997-07-16") == datetime.datetime(year=1997, month=7, day=16)
assert parse_sitemap_publication_date("1997-07-16T19:20+01:00") == datetime.datetime(
def test_parse_iso8601_date():
assert parse_iso8601_date("1997-07-16") == datetime.datetime(year=1997, month=7, day=16)
assert parse_iso8601_date("1997-07-16T19:20+01:00") == datetime.datetime(
year=1997, month=7, day=16, hour=19, minute=20,
tzinfo=datetime.timezone(datetime.timedelta(seconds=3600)),
)
assert parse_sitemap_publication_date("1997-07-16T19:20:30+01:00") == datetime.datetime(
assert parse_iso8601_date("1997-07-16T19:20:30+01:00") == datetime.datetime(
year=1997, month=7, day=16, hour=19, minute=20, second=30,
tzinfo=datetime.timezone(datetime.timedelta(seconds=3600)),
)
assert parse_sitemap_publication_date("1997-07-16T19:20:30.45+01:00") == datetime.datetime(
assert parse_iso8601_date("1997-07-16T19:20:30.45+01:00") == datetime.datetime(
year=1997, month=7, day=16, hour=19, minute=20, second=30, microsecond=450000,
tzinfo=datetime.timezone(datetime.timedelta(seconds=3600)),
)

# "Z" timezone instead of "+\d\d:\d\d"
assert parse_sitemap_publication_date("2018-01-12T21:57:27Z") == datetime.datetime(
assert parse_iso8601_date("2018-01-12T21:57:27Z") == datetime.datetime(
year=2018, month=1, day=12, hour=21, minute=57, second=27, tzinfo=datetime.timezone.utc,
)


def test_parse_rfc2822_date():
    """Check parse_rfc2822_date() on RFC 2822 strings with a zero and a +02:00 offset."""
    # Date string with a "-0000" offset; expected result carries a zero UTC offset.
    parsed_zero_offset = parse_rfc2822_date("Tue, 10 Aug 2010 20:43:53 -0000")
    expected_zero_offset = datetime.datetime(
        year=2010, month=8, day=10, hour=20, minute=43, second=53, microsecond=0,
        tzinfo=datetime.timezone(datetime.timedelta(seconds=0)),
    )
    assert parsed_zero_offset == expected_zero_offset

    # Date string with a "+0200" offset; expected result carries a 7200 second offset.
    parsed_plus_two = parse_rfc2822_date("Thu, 17 Dec 2009 12:04:56 +0200")
    expected_plus_two = datetime.datetime(
        year=2009, month=12, day=17, hour=12, minute=4, second=56, microsecond=0,
        tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
    )
    assert parsed_plus_two == expected_plus_two


# noinspection SpellCheckingInspection
def test_is_http_url():
# noinspection PyTypeChecker
Expand Down
4 changes: 2 additions & 2 deletions tests/test_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import difflib
import textwrap
from decimal import Decimal
from email.utils import formatdate
from email.utils import format_datetime
from http import HTTPStatus
from unittest import TestCase

Expand Down Expand Up @@ -46,7 +46,7 @@ class TestSitemapTree(TestCase):
TEST_DATE_STR_ISO8601 = TEST_DATE_DATETIME.isoformat()
"""Test string date formatted as ISO 8601 (for XML and Atom 0.3 / 1.0 sitemaps)."""

TEST_DATE_STR_RFC2822 = formatdate(float(TEST_DATE_DATETIME.strftime('%s')), localtime=True)
TEST_DATE_STR_RFC2822 = format_datetime(TEST_DATE_DATETIME)
"""Test string date formatted as RFC 2822 (for RSS 2.0 sitemaps)."""

TEST_PUBLICATION_NAME = 'Test publication'
Expand Down
12 changes: 6 additions & 6 deletions usp/fetch_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@
from .exceptions import SitemapException, SitemapXMLParsingException
from .helpers import (
html_unescape_strip,
parse_sitemap_publication_date,
parse_iso8601_date,
get_url_retry_on_client_errors,
ungzipped_response_content,
is_http_url,
parse_rss_atom_publication_date,
parse_rfc2822_date,
)
from .log import create_logger
from .objects import (
Expand Down Expand Up @@ -461,7 +461,7 @@ def page(self) -> Optional[SitemapPage]:

last_modified = html_unescape_strip(self.last_modified)
if last_modified:
last_modified = parse_sitemap_publication_date(last_modified)
last_modified = parse_iso8601_date(last_modified)

change_frequency = html_unescape_strip(self.change_frequency)
if change_frequency:
Expand Down Expand Up @@ -493,7 +493,7 @@ def page(self) -> Optional[SitemapPage]:

news_publish_date = html_unescape_strip(self.news_publish_date)
if news_publish_date:
news_publish_date = parse_sitemap_publication_date(date_string=news_publish_date)
news_publish_date = parse_iso8601_date(date_string=news_publish_date)

news_publication_name = html_unescape_strip(self.news_publication_name)
news_publication_language = html_unescape_strip(self.news_publication_language)
Expand Down Expand Up @@ -670,7 +670,7 @@ def page(self) -> Optional[SitemapPage]:

publication_date = html_unescape_strip(self.publication_date)
if publication_date:
publication_date = parse_rss_atom_publication_date(publication_date)
publication_date = parse_rfc2822_date(publication_date)

return SitemapPage(
url=link,
Expand Down Expand Up @@ -789,7 +789,7 @@ def page(self) -> Optional[SitemapPage]:

publication_date = html_unescape_strip(self.publication_date)
if publication_date:
publication_date = parse_rss_atom_publication_date(publication_date)
publication_date = parse_rfc2822_date(publication_date)

return SitemapPage(
url=link,
Expand Down
6 changes: 3 additions & 3 deletions usp/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def html_unescape_strip(string: Optional[str]) -> Optional[str]:
return string


def parse_sitemap_publication_date(date_string: str) -> datetime.datetime:
def parse_iso8601_date(date_string: str) -> datetime.datetime:
"""Parse sitemap's <publication_date> into datetime.datetime object."""
# FIXME parse known date formats faster

Expand All @@ -79,10 +79,10 @@ def parse_sitemap_publication_date(date_string: str) -> datetime.datetime:
return date


def parse_rss_atom_publication_date(date_string: str) -> datetime.datetime:
def parse_rfc2822_date(date_string: str) -> datetime.datetime:
"""Parse RSS / Atom feed's <pubDate> into datetime.datetime object."""
# FIXME parse known date formats faster
return parse_sitemap_publication_date(date_string)
return parse_iso8601_date(date_string)


def get_url_retry_on_client_errors(url: str,
Expand Down

0 comments on commit afdb888

Please sign in to comment.