Merge branch 'feature/reduce_memory_consumption' into develop
Fixes #2.
pypt committed Jul 18, 2019
Commit 9b18eb4 (2 parents: afdb888 + 5375ad5)
Showing 5 changed files with 569 additions and 115 deletions.
README.rst: 6 changes (5 additions, 1 deletion)

@@ -33,6 +33,7 @@ Features
 - Error-tolerant with more common sitemap bugs
 - Tries to find sitemaps not listed in ``robots.txt``
 - Uses fast and memory efficient Expat XML parsing
+- Don't consume much memory even with massive sitemap hierarchies
 - Provides a generated sitemap tree as easy to use object tree
 - Supports using a custom web client
 - Uses a small number of actively maintained third-party modules
@@ -55,7 +56,10 @@ Usage
     from usp.tree import sitemap_tree_for_homepage

     tree = sitemap_tree_for_homepage('https://www.nytimes.com/')

-    print(tree.all_pages())
+    # all_pages() returns an Iterator
+    for page in tree.all_pages():
+        print(page)

 Check out the `API reference in the documentation <https://ultimate-sitemap-parser.readthedocs.io/en/latest/>`_ for more details.
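
The README change above captures the heart of this commit: all_pages() now returns an iterator rather than a materialized list, so even a sitemap hierarchy with millions of URLs can be walked in roughly constant memory. A minimal sketch of the pattern, using illustrative names rather than usp's internal API:

    from typing import Iterator, List, Optional


    class Sitemap:
        """Illustrative stand-in for one node of a parsed sitemap tree."""

        def __init__(self, pages: List[str], sub_sitemaps: Optional[List['Sitemap']] = None):
            self.pages = pages
            self.sub_sitemaps = sub_sitemaps or []

        def all_pages(self) -> Iterator[str]:
            # Yield this sitemap's own pages one at a time...
            for page in self.pages:
                yield page
            # ...then recurse lazily into sub-sitemaps; the complete page list
            # for the whole hierarchy is never held in memory at once.
            for sub_sitemap in self.sub_sitemaps:
                yield from sub_sitemap.all_pages()


    tree = Sitemap([], sub_sitemaps=[Sitemap(['/a.html', '/b.html']), Sitemap(['/c.html'])])
    for page in tree.all_pages():  # one page in memory at a time
        print(page)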

setup.py: 3 changes (0 additions, 3 deletions)

@@ -35,9 +35,6 @@ def __readme():
     python_requires='>=3.5',
     install_requires=[

-        # No dunder methods
-        'attrs>=18.2.0',
-
         # Parsing arbitrary dates (sitemap date format is standardized but some implementations take liberties)
         'python-dateutil>=2.1,<3.0.0',

tests/test_tree.py: 25 changes (16 additions, 9 deletions)

@@ -383,7 +383,7 @@ def test_sitemap_tree_for_homepage(self):

         assert expected_sitemap_tree == actual_sitemap_tree, diff_str

-        assert len(actual_sitemap_tree.all_pages()) == 5
+        assert len(list(actual_sitemap_tree.all_pages())) == 6

     def test_sitemap_tree_for_homepage_gzip(self):
         """Test sitemap_tree_for_homepage() with gzipped sitemaps."""

@@ -470,12 +470,15 @@ def test_sitemap_tree_for_homepage_gzip(self):
         assert len(actual_sitemap_tree.sub_sitemaps) == 1

         assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap)
+        # noinspection PyUnresolvedReferences
         assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 2

+        # noinspection PyUnresolvedReferences
         sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0]
         assert isinstance(sitemap_1, PagesXMLSitemap)
         assert len(sitemap_1.pages) == 1

+        # noinspection PyUnresolvedReferences
         sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1]
         assert isinstance(sitemap_2, PagesXMLSitemap)
         assert len(sitemap_2.pages) == 1
@@ -533,19 +536,21 @@ def test_sitemap_tree_for_homepage_plain_text(self):
         assert len(actual_sitemap_tree.sub_sitemaps) == 1

         assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap)
+        # noinspection PyUnresolvedReferences
         assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 2

+        # noinspection PyUnresolvedReferences
         sitemap_1 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0]
         assert isinstance(sitemap_1, PagesTextSitemap)
         assert len(sitemap_1.pages) == 2

+        # noinspection PyUnresolvedReferences
         sitemap_2 = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[1]
         assert isinstance(sitemap_2, PagesTextSitemap)
         assert len(sitemap_2.pages) == 2

-        pages = actual_sitemap_tree.all_pages()
-        assert len(pages) == 3
-        print(pages)
+        pages = list(actual_sitemap_tree.all_pages())
+        assert len(pages) == 4
         assert SitemapPage(url='{}/news/foo.html'.format(self.TEST_BASE_URL)) in pages
         assert SitemapPage(url='{}/news/bar.html'.format(self.TEST_BASE_URL)) in pages
         assert SitemapPage(url='{}/news/baz.html'.format(self.TEST_BASE_URL)) in pages
@@ -770,7 +775,7 @@ def test_sitemap_tree_for_homepage_rss_atom(self):

         assert expected_sitemap_tree == actual_sitemap_tree, diff_str

-        assert len(actual_sitemap_tree.all_pages()) == 6
+        assert len(list(actual_sitemap_tree.all_pages())) == 6

     def test_sitemap_tree_for_homepage_rss_atom_empty(self):
         """Test sitemap_tree_for_homepage() with empty RSS 2.0 / Atom 0.3 / Atom 1.0 feeds."""

@@ -871,7 +876,7 @@ def test_sitemap_tree_for_homepage_rss_atom_empty(self):

         assert expected_sitemap_tree == actual_sitemap_tree

-        assert len(actual_sitemap_tree.all_pages()) == 0
+        assert len(list(actual_sitemap_tree.all_pages())) == 0

     def test_sitemap_tree_for_homepage_prematurely_ending_xml(self):
         """Test sitemap_tree_for_homepage() with clipped XML.

@@ -952,8 +957,10 @@ def test_sitemap_tree_for_homepage_prematurely_ending_xml(self):
         assert len(actual_sitemap_tree.sub_sitemaps) == 1

         assert isinstance(actual_sitemap_tree.sub_sitemaps[0], IndexRobotsTxtSitemap)
+        # noinspection PyUnresolvedReferences
         assert len(actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps) == 1

+        # noinspection PyUnresolvedReferences
         sitemap = actual_sitemap_tree.sub_sitemaps[0].sub_sitemaps[0]
         assert isinstance(sitemap, PagesXMLSitemap)
         assert len(sitemap.pages) == 2
@@ -1220,7 +1227,7 @@ def test_sitemap_tree_for_homepage_huge_sitemap(self):

         actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)

-        assert len(actual_sitemap_tree.all_pages()) == page_count
+        assert len(list(actual_sitemap_tree.all_pages())) == page_count

     def test_sitemap_tree_for_homepage_robots_txt_weird_spacing(self):
         """Test sitemap_tree_for_homepage() with weird (but valid) spacing."""

@@ -1271,7 +1278,7 @@ def test_sitemap_tree_for_homepage_robots_txt_weird_spacing(self):
         )

         actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)
-        assert len(actual_sitemap_tree.all_pages()) == 1
+        assert len(list(actual_sitemap_tree.all_pages())) == 1

     def test_sitemap_tree_for_homepage_utf8_bom(self):
         """Test sitemap_tree_for_homepage() with UTF-8 BOM in both robots.txt and sitemap."""

@@ -1329,4 +1336,4 @@ def test_sitemap_tree_for_homepage_utf8_bom(self):
         )

         actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.TEST_BASE_URL)
-        assert len(actual_sitemap_tree.all_pages()) == 1
+        assert len(list(actual_sitemap_tree.all_pages())) == 1
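
Every count assertion in these tests gains a list() call because a generator has no length; counting its items means draining it into a list first. A tiny illustration, with a hypothetical pages() generator:

    def pages():
        yield 'https://example.com/news/foo.html'
        yield 'https://example.com/news/bar.html'

    # len(pages()) would raise TypeError: object of type 'generator' has no len()
    assert len(list(pages())) == 2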
usp/fetch_parse.py: 98 changes (73 additions, 25 deletions)

@@ -7,8 +7,6 @@
 from decimal import Decimal
 from typing import Optional, Dict

-import attr
-
 from .exceptions import SitemapException, SitemapXMLParsingException
 from .helpers import (
     html_unescape_strip,

@@ -434,21 +432,43 @@ def sitemap(self) -> AbstractSitemap:
 class PagesXMLSitemapParser(AbstractXMLSitemapParser):
     """Pages XML sitemap parser."""

-    @attr.s(slots=True)
     class Page(object):
         """Simple data class for holding various properties for a single <url> entry while parsing."""
-        url = attr.ib(type=str, default=None, hash=True)
-        last_modified = attr.ib(type=Optional[str], default=None, hash=False)
-        change_frequency = attr.ib(type=Optional[str], default=None, hash=False)
-        priority = attr.ib(type=Optional[str], default=None, hash=False)
-        news_title = attr.ib(type=Optional[str], default=None, hash=False)
-        news_publish_date = attr.ib(type=Optional[str], default=None, hash=False)
-        news_publication_name = attr.ib(type=Optional[str], default=None, hash=False)
-        news_publication_language = attr.ib(type=Optional[str], default=None, hash=False)
-        news_access = attr.ib(type=Optional[str], default=None, hash=False)
-        news_genres = attr.ib(type=Optional[str], default=None, hash=False)
-        news_keywords = attr.ib(type=Optional[str], default=None, hash=False)
-        news_stock_tickers = attr.ib(type=Optional[str], default=None, hash=False)
+
+        __slots__ = [
+            'url',
+            'last_modified',
+            'change_frequency',
+            'priority',
+            'news_title',
+            'news_publish_date',
+            'news_publication_name',
+            'news_publication_language',
+            'news_access',
+            'news_genres',
+            'news_keywords',
+            'news_stock_tickers',
+        ]
+
+        def __init__(self):
+            self.url = None
+            self.last_modified = None
+            self.change_frequency = None
+            self.priority = None
+            self.news_title = None
+            self.news_publish_date = None
+            self.news_publication_name = None
+            self.news_publication_language = None
+            self.news_access = None
+            self.news_genres = None
+            self.news_keywords = None
+            self.news_stock_tickers = None
+
+        def __hash__(self):
+            return hash((
+                # Hash only the URL to be able to find unique ones
+                self.url,
+            ))

         def page(self) -> Optional[SitemapPage]:
             """Return constructed sitemap page if one has been completed, otherwise None."""
@@ -645,13 +665,27 @@ class PagesRSSSitemapParser(AbstractXMLSitemapParser):
     https://validator.w3.org/feed/docs/rss2.html
     """

-    @attr.s(slots=True)
     class Page(object):
         """Simple data class for holding various properties for a single <item> entry while parsing."""
-        link = attr.ib(type=str, default=None, hash=True)
-        title = attr.ib(type=Optional[str], default=None, hash=False)
-        description = attr.ib(type=Optional[str], default=None, hash=False)
-        publication_date = attr.ib(type=Optional[str], default=None, hash=False)
+
+        __slots__ = [
+            'link',
+            'title',
+            'description',
+            'publication_date',
+        ]
+
+        def __init__(self):
+            self.link = None
+            self.title = None
+            self.description = None
+            self.publication_date = None
+
+        def __hash__(self):
+            return hash((
+                # Hash only the URL
+                self.link,
+            ))

         def page(self) -> Optional[SitemapPage]:
             """Return constructed sitemap page if one has been completed, otherwise None."""
@@ -764,13 +798,27 @@ class PagesAtomSitemapParser(AbstractXMLSitemapParser):

     # FIXME merge with RSS parser class as there are too many similarities

-    @attr.s(slots=True)
     class Page(object):
         """Simple data class for holding various properties for a single <entry> entry while parsing."""
-        link = attr.ib(type=str, default=None, hash=True)
-        title = attr.ib(type=Optional[str], default=None, hash=False)
-        description = attr.ib(type=Optional[str], default=None, hash=False)
-        publication_date = attr.ib(type=Optional[str], default=None, hash=False)
+
+        __slots__ = [
+            'link',
+            'title',
+            'description',
+            'publication_date',
+        ]
+
+        def __init__(self):
+            self.link = None
+            self.title = None
+            self.description = None
+            self.publication_date = None
+
+        def __hash__(self):
+            return hash((
+                # Hash only the URL
+                self.link,
+            ))

         def page(self) -> Optional[SitemapPage]:
             """Return constructed sitemap page if one has been completed, otherwise None."""
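
The rewritten Page classes above swap attrs-generated data classes for hand-written ones, which is also why setup.py drops its attrs dependency. Two ingredients do the work: __slots__ suppresses the per-instance __dict__, a real saving when parsing creates millions of short-lived Page objects, and a __hash__ keyed on the URL alone makes entries that differ only in metadata hash identically, matching the upstream comment about finding unique ones. A miniature version of the pattern, abbreviated to two fields rather than the library's full attribute list:

    class Page(object):
        """Abbreviated two-field sketch of the parsers' Page data classes."""

        __slots__ = [
            'url',
            'last_modified',
        ]

        def __init__(self):
            self.url = None
            self.last_modified = None

        def __hash__(self):
            # Hash only the URL so metadata differences don't change identity
            return hash((self.url,))


    a = Page()
    a.url = 'https://example.com/news/foo.html'
    b = Page()
    b.url = 'https://example.com/news/foo.html'
    b.last_modified = '2019-07-18'
    assert hash(a) == hash(b)  # same URL, same hash
    # a.extra = True  # would raise AttributeError: __slots__ allows no ad-hoc attributes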
