Merge branch 'feature/reduce_memory_consumption' into develop
Fixes #2.
pypt committed Jul 18, 2019
Commit 9b18eb4 (2 parents: afdb888 + 5375ad5)
Showing 5 changed files with 569 additions and 115 deletions.
README.rst: 6 changes (5 additions, 1 deletion)

@@ -33,6 +33,7 @@ Features
 - Error-tolerant with more common sitemap bugs
 - Tries to find sitemaps not listed in ``robots.txt``
 - Uses fast and memory efficient Expat XML parsing
+- Don't consume much memory even with massive sitemap hierarchies
 - Provides a generated sitemap tree as easy to use object tree
 - Supports using a custom web client
 - Uses a small number of actively maintained third-party modules
@@ -55,7 +56,10 @@ Usage
     from usp.tree import sitemap_tree_for_homepage

     tree = sitemap_tree_for_homepage('https://www.nytimes.com/')

-    print(tree.all_pages())
+    # all_pages() returns an Iterator
+    for page in tree.all_pages():
+        print(page)

 Check out the `API reference in the documentation <https://ultimate-sitemap-parser.readthedocs.io/en/latest/>`_ for more details.
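
The README change above captures the heart of this commit: all_pages() now returns an iterator rather than a materialized list, so even a sitemap hierarchy with millions of URLs can be walked in roughly constant memory. A minimal sketch of the pattern, using illustrative names rather than usp's internal API:

    from typing import Iterator, List, Optional


    class Sitemap:
        """Illustrative stand-in for one node of a parsed sitemap tree."""

        def __init__(self, pages: List[str], sub_sitemaps: Optional[List['Sitemap']] = None):
            self.pages = pages
            self.sub_sitemaps = sub_sitemaps or []

        def all_pages(self) -> Iterator[str]:
            # Yield this sitemap's own pages one at a time...
            for page in self.pages:
                yield page
            # ...then recurse lazily into sub-sitemaps; the complete page list
            # for the whole hierarchy is never held in memory at once.
            for sub_sitemap in self.sub_sitemaps:
                yield from sub_sitemap.all_pages()


    tree = Sitemap([], sub_sitemaps=[Sitemap(['/a.html', '/b.html']), Sitemap(['/c.html'])])
    for page in tree.all_pages():  # one page in memory at a time
        print(page)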

setup.py: 3 changes (0 additions, 3 deletions)

@@ -35,9 +35,6 @@ def __readme():
     python_requires='>=3.5',
     install_requires=[

-        # No dunder methods
-        'attrs>=18.2.0',
-
         # Parsing arbitrary dates (sitemap date format is standardized but some implementations take liberties)
         'python-dateutil>=2.1,<3.0.0',

tests/test_tree.py: 25 changes (16 additions, 9 deletions)

@@ -383,7 +383,7 @@ def test_sitemap_tree_for_homepage(self):

         assert expected_sitemap_tree == actual_sitemap_tree, diff_str

-        assert len(actual_sitemap_tree.all_pages()) == 5
+        assert len(list(actual_sitemap_tree.all_pages())) == 6

     def test_sitemap_tree_for_homepage_gzip(self):
         """Test sitemap_tree_for_homepage() with gzipped sitemaps."""

@@ -470,12 +470,15 @@ def test_sitemap_tree_for_homepage_gzip(self):
         assert len(actual_sitemap_tree.sub_sitemaps) == 1

         assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap)
+        # noinspection PyUnresolvedReferences
         assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 2

+        # noinspection PyUnresolvedReferences
         sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0]
         assert isinstance(sitemap_1, PagesXMLSitemap)
         assert len(sitemap_1.pages) == 1

+        # noinspection PyUnresolvedReferences
         sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1]
         assert isinstance(sitemap_2, PagesXMLSitemap)
         assert len(sitemap_2.pages) == 1
@@ -533,19 +536,21 @@ def test_sitemap_tree_for_homepage_plain_text(self):
         assert len(actual_sitemap_tree.sub_sitemaps) == 1

         assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap)
+        # noinspection PyUnresolvedReferences
         assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 2

+        # noinspection PyUnresolvedReferences
         sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0]
         assert isinstance(sitemap_1, PagesTextSitemap)
         assert len(sitemap_1.pages) == 2

+        # noinspection PyUnresolvedReferences
         sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1]
         assert isinstance(sitemap_2, PagesTextSitemap)
         assert len(sitemap_2.pages) == 2

-        pages = actual_sitemap_tree.all_pages()
-        assert len(pages) == 3
-        print(pages)
+        pages = list(actual_sitemap_tree.all_pages())
+        assert len(pages) == 4
         assert SitemapPage(url='{}/news/foo.html'.format(self.TEST_BASE_URL)) in pages
         assert SitemapPage(url='{}/news/bar.html'.format(self.TEST_BASE_URL)) in pages
         assert SitemapPage(url='{}/news/baz.html'.format(self.TEST_BASE_URL)) in pages
@@ -770,7 +775,7 @@ def test_sitemap_tree_for_homepage_rss_atom(self):

         assert expected_sitemap_tree == actual_sitemap_tree, diff_str

-        assert len(actual_sitemap_tree.all_pages()) == 6
+        assert len(list(actual_sitemap_tree.all_pages())) == 6

     def test_sitemap_tree_for_homepage_rss_atom_empty(self):
         """Test sitemap_tree_for_homepage() with empty RSS 2.0 / Atom 0.3 / Atom 1.0 feeds."""

@@ -871,7 +876,7 @@ def test_sitemap_tree_for_homepage_rss_atom_empty(self):

         assert expected_sitemap_tree == actual_sitemap_tree

-        assert len(actual_sitemap_tree.all_pages()) == 0
+        assert len(list(actual_sitemap_tree.all_pages())) == 0

     def test_sitemap_tree_for_homepage_prematurely_ending_xml(self):
         """Test sitemap_tree_for_homepage() with clipped XML.

@@ -952,8 +957,10 @@ def test_sitemap_tree_for_homepage_prematurely_ending_xml(self):
         assert len(actual_sitemap_tree.sub_sitemaps) == 1

         assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap)
+        # noinspection PyUnresolvedReferences
         assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 1

+        # noinspection PyUnresolvedReferences
         sitemap = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0]
         assert isinstance(sitemap, PagesXMLSitemap)
         assert len(sitemap.pages) == 2
@@ -1220,7 +1227,7 @@ def test_sitemap_tree_for_homepage_huge_sitemap(self):

         actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)

-        assert len(actual_sitemap_tree.all_pages()) == page_count
+        assert len(list(actual_sitemap_tree.all_pages())) == page_count

     def test_sitemap_tree_for_homepage_robots_txt_weird_spacing(self):
         """Test sitemap_tree_for_homepage() with weird (but valid) spacing."""

@@ -1271,7 +1278,7 @@ def test_sitemap_tree_for_homepage_robots_txt_weird_spacing(self):
         )

         actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)
-        assert len(actual_sitemap_tree.all_pages()) == 1
+        assert len(list(actual_sitemap_tree.all_pages())) == 1

     def test_sitemap_tree_for_homepage_utf8_bom(self):
         """Test sitemap_tree_for_homepage() with UTF-8 BOM in both robots.txt and sitemap."""

@@ -1329,4 +1336,4 @@ def test_sitemap_tree_for_homepage_utf8_bom(self):
         )

         actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)
-        assert len(actual_sitemap_tree.all_pages()) == 1
+        assert len(list(actual_sitemap_tree.all_pages())) == 1
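
Every count assertion in these tests gains a list() call because a generator has no length; counting its items means draining it into a list first. A tiny illustration, with a hypothetical pages() generator:

    def pages():
        yield 'https://example.com/news/foo.html'
        yield 'https://example.com/news/bar.html'

    # len(pages()) would raise TypeError: object of type 'generator' has no len()
    assert len(list(pages())) == 2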
usp/fetch_parse.py: 98 changes (73 additions, 25 deletions)

@@ -7,8 +7,6 @@
 from decimal import Decimal
 from typing import Optional, Dict

-import attr
-
 from .exceptions import SitemapException, SitemapXMLParsingException
 from .helpers import (
     html_unescape_strip,

@@ -434,21 +432,43 @@ def sitemap(self) -> AbstractSitemap:
 class PagesXMLSitemapParser(AbstractXMLSitemapParser):
     """Pages XML sitemap parser."""

-    @attr.s(slots=True)
     class Page(object):
         """Simple data class for holding various properties for a single <url> entry while parsing."""
-        url = attr.ib(type=str, default=None, hash=True)
-        last_modified = attr.ib(type=Optional[str], default=None, hash=False)
-        change_frequency = attr.ib(type=Optional[str], default=None, hash=False)
-        priority = attr.ib(type=Optional[str], default=None, hash=False)
-        news_title = attr.ib(type=Optional[str], default=None, hash=False)
-        news_publish_date = attr.ib(type=Optional[str], default=None, hash=False)
-        news_publication_name = attr.ib(type=Optional[str], default=None, hash=False)
-        news_publication_language = attr.ib(type=Optional[str], default=None, hash=False)
-        news_access = attr.ib(type=Optional[str], default=None, hash=False)
-        news_genres = attr.ib(type=Optional[str], default=None, hash=False)
-        news_keywords = attr.ib(type=Optional[str], default=None, hash=False)
-        news_stock_tickers = attr.ib(type=Optional[str], default=None, hash=False)
+
+        __slots__ = [
+            'url',
+            'last_modified',
+            'change_frequency',
+            'priority',
+            'news_title',
+            'news_publish_date',
+            'news_publication_name',
+            'news_publication_language',
+            'news_access',
+            'news_genres',
+            'news_keywords',
+            'news_stock_tickers',
+        ]
+
+        def __init__(self):
+            self.url = None
+            self.last_modified = None
+            self.change_frequency = None
+            self.priority = None
+            self.news_title = None
+            self.news_publish_date = None
+            self.news_publication_name = None
+            self.news_publication_language = None
+            self.news_access = None
+            self.news_genres = None
+            self.news_keywords = None
+            self.news_stock_tickers = None
+
+        def __hash__(self):
+            return hash((
+                # Hash only the URL to be able to find unique ones
+                self.url,
+            ))

         def page(self) -> Optional[SitemapPage]:
             """Return constructed sitemap page if one has been completed, otherwise None."""
@@ -645,13 +665,27 @@ class PagesRSSSitemapParser(AbstractXMLSitemapParser):
     https://validator.w3.org/feed/docs/rss2.html
     """

-    @attr.s(slots=True)
     class Page(object):
         """Simple data class for holding various properties for a single <item> entry while parsing."""
-        link = attr.ib(type=str, default=None, hash=True)
-        title = attr.ib(type=Optional[str], default=None, hash=False)
-        description = attr.ib(type=Optional[str], default=None, hash=False)
-        publication_date = attr.ib(type=Optional[str], default=None, hash=False)
+
+        __slots__ = [
+            'link',
+            'title',
+            'description',
+            'publication_date',
+        ]
+
+        def __init__(self):
+            self.link = None
+            self.title = None
+            self.description = None
+            self.publication_date = None
+
+        def __hash__(self):
+            return hash((
+                # Hash only the URL
+                self.link,
+            ))

         def page(self) -> Optional[SitemapPage]:
             """Return constructed sitemap page if one has been completed, otherwise None."""
@@ -764,13 +798,27 @@ class PagesAtomSitemapParser(AbstractXMLSitemapParser):

     # FIXME merge with RSS parser class as there are too many similarities

-    @attr.s(slots=True)
     class Page(object):
         """Simple data class for holding various properties for a single <entry> entry while parsing."""
-        link = attr.ib(type=str, default=None, hash=True)
-        title = attr.ib(type=Optional[str], default=None, hash=False)
-        description = attr.ib(type=Optional[str], default=None, hash=False)
-        publication_date = attr.ib(type=Optional[str], default=None, hash=False)
+
+        __slots__ = [
+            'link',
+            'title',
+            'description',
+            'publication_date',
+        ]
+
+        def __init__(self):
+            self.link = None
+            self.title = None
+            self.description = None
+            self.publication_date = None
+
+        def __hash__(self):
+            return hash((
+                # Hash only the URL
+                self.link,
+            ))

         def page(self) -> Optional[SitemapPage]:
             """Return constructed sitemap page if one has been completed, otherwise None."""
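
The rewritten Page classes above swap attrs-generated data classes for hand-written ones, which is also why setup.py drops its attrs dependency. Two ingredients do the work: __slots__ suppresses the per-instance __dict__, a real saving when parsing creates millions of short-lived Page objects, and a __hash__ keyed on the URL alone makes entries that differ only in metadata hash identically, matching the upstream comment about finding unique ones. A miniature version of the pattern, abbreviated to two fields rather than the library's full attribute list:

    class Page(object):
        """Abbreviated two-field sketch of the parsers' Page data classes."""

        __slots__ = [
            'url',
            'last_modified',
        ]

        def __init__(self):
            self.url = None
            self.last_modified = None

        def __hash__(self):
            # Hash only the URL so metadata differences don't change identity
            return hash((self.url,))


    a = Page()
    a.url = 'https://example.com/news/foo.html'
    b = Page()
    b.url = 'https://example.com/news/foo.html'
    b.last_modified = '2019-07-18'
    assert hash(a) == hash(b)  # same URL, same hash
    # a.extra = True  # would raise AttributeError: __slots__ allows no ad-hoc attributes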
