Skip to content

Commit

Permalink
Minor improvements to the docs
Browse files Browse the repository at this point in the history
  • Loading branch information
pypt committed Jul 18, 2019
1 parent 5075ec6 commit 3dde959
Show file tree
Hide file tree
Showing 8 changed files with 242 additions and 58 deletions.
16 changes: 12 additions & 4 deletions usp/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,28 @@


class SitemapException(Exception):
    """
    Problem due to which we can't run further, e.g. wrong input parameters.
    """
    # NOTE: the diff render left the pre-commit one-line docstring in place above
    # the new one, producing a stale duplicate string statement; only the intended
    # post-commit docstring is kept here.
    pass


class SitemapXMLParsingException(Exception):
    """
    XML parsing exception to be handled gracefully.
    """
    # NOTE: removed the stale pre-commit one-line docstring that the diff render
    # had left duplicated above the new multi-line docstring.
    pass


class GunzipException(Exception):
    """
    gunzip() exception.
    """
    # NOTE: removed the stale pre-commit one-line docstring that the diff render
    # had left duplicated above the new multi-line docstring.
    pass


class StripURLToHomepageException(Exception):
    """
    strip_url_to_homepage() exception.
    """
    # NOTE: removed the stale pre-commit one-line docstring that the diff render
    # had left duplicated above the new multi-line docstring.
    pass
25 changes: 19 additions & 6 deletions usp/fetch_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,15 +246,20 @@ def sitemap(self) -> AbstractSitemap:

@classmethod
def __normalize_xml_element_name(cls, name: str):
"""Replace the namespace URL in the argument element name with internal namespace.
"""
Replace the namespace URL in the argument element name with internal namespace.
* Elements from http://www.sitemaps.org/schemas/sitemap/0.9 namespace will be prefixed with "sitemap:",
e.g. "<loc>" will become "<sitemap:loc>"
* Elements from http://www.google.com/schemas/sitemap-news/0.9 namespace will be prefixed with "news:",
e.g. "<publication>" will become "<news:publication>"
For non-sitemap namespaces, return the element name with the namespace stripped."""
For non-sitemap namespaces, return the element name with the namespace stripped.
:param name: Namespace URL plus XML element name, e.g. "http://www.sitemaps.org/schemas/sitemap/0.9 loc"
:return: Internal namespace name plus element name, e.g. "sitemap loc"
"""

name_parts = name.split(cls.__XML_NAMESPACE_SEPARATOR)

Expand Down Expand Up @@ -332,7 +337,9 @@ def _xml_char_data(self, data: str) -> None:


class AbstractXMLSitemapParser(object, metaclass=abc.ABCMeta):
"""Abstract XML sitemap parser."""
"""
Abstract XML sitemap parser.
"""

__slots__ = [
# URL of the sitemap that is being parsed
Expand Down Expand Up @@ -374,7 +381,9 @@ def sitemap(self) -> AbstractSitemap:


class IndexXMLSitemapParser(AbstractXMLSitemapParser):
"""Index XML sitemap parser."""
"""
Index XML sitemap parser.
"""

__slots__ = [
'_web_client',
Expand Down Expand Up @@ -430,7 +439,9 @@ def sitemap(self) -> AbstractSitemap:


class PagesXMLSitemapParser(AbstractXMLSitemapParser):
"""Pages XML sitemap parser."""
"""
Pages XML sitemap parser.
"""

class Page(object):
"""Simple data class for holding various properties for a single <url> entry while parsing."""
Expand Down Expand Up @@ -666,7 +677,9 @@ class PagesRSSSitemapParser(AbstractXMLSitemapParser):
"""

class Page(object):
"""Simple data class for holding various properties for a single <item> entry while parsing."""
"""
Simple data class for holding various properties for a single <item> entry while parsing.
"""

__slots__ = [
'link',
Expand Down
68 changes: 59 additions & 9 deletions usp/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,12 @@


def is_http_url(url: str) -> bool:
"""Returns true if URL is in the "http" ("https") scheme."""
"""
Returns true if URL is of the "http" ("https") scheme.
:param url: URL to test.
:return: True if argument URL is of the "http" ("https") scheme.
"""
if url is None:
log.debug("URL is None")
return False
Expand Down Expand Up @@ -58,7 +63,12 @@ def is_http_url(url: str) -> bool:


def html_unescape_strip(string: Optional[str]) -> Optional[str]:
"""Decode HTML entities, strip string, set to None if it's empty; ignore None as input."""
"""
Decode HTML entities, strip string, set to None if it's empty; ignore None as input.
:param string: String to decode HTML entities in.
:return: Stripped string with HTML entities decoded; None if parameter string was empty or None.
"""
if string:
string = html.unescape(string)
string = string.strip()
Expand All @@ -68,7 +78,12 @@ def html_unescape_strip(string: Optional[str]) -> Optional[str]:


def parse_iso8601_date(date_string: str) -> datetime.datetime:
"""Parse sitemap's <publication_date> into datetime.datetime object."""
"""
Parse ISO 8601 date (e.g. from sitemap's <publication_date>) into datetime.datetime object.
:param date_string: ISO 8601 date, e.g. "2018-01-12T21:57:27Z" or "1997-07-16T19:20:30+01:00".
:return: datetime.datetime object of a parsed date.
"""
# FIXME parse known date formats faster

if not date_string:
Expand All @@ -80,7 +95,12 @@ def parse_iso8601_date(date_string: str) -> datetime.datetime:


def parse_rfc2822_date(date_string: str) -> datetime.datetime:
    """
    Parse RFC 2822 date (e.g. from RSS's <pubDate> or Atom's <issued>) into
    datetime.datetime object.

    :param date_string: RFC 2822 date, e.g. "Tue, 10 Aug 2010 20:43:53 -0000".
    :return: datetime.datetime object of a parsed date.
    """
    # FIXME parse known date formats faster
    # Delegates to the ISO 8601 parser, which is apparently lenient enough to
    # accept RFC 2822 input too -- presumably backed by a permissive date
    # parser; verify against parse_iso8601_date's implementation.
    return parse_iso8601_date(date_string)

Expand All @@ -89,7 +109,15 @@ def get_url_retry_on_client_errors(url: str,
web_client: AbstractWebClient,
retry_count: int = 5,
sleep_between_retries: int = 1) -> AbstractWebClientResponse:
"""Fetch URL, retry on client errors (which, as per implementation, might be request timeouts too)."""
"""
Fetch URL, retry on retryable errors.
:param url: URL to fetch.
:param web_client: Web client object to use for fetching.
:param retry_count: How many times to retry fetching the same URL.
:param sleep_between_retries: How long to sleep between retries, in seconds.
:return: Web client response object.
"""
assert retry_count > 0, "Retry count must be positive."

response = None
Expand All @@ -114,7 +142,13 @@ def get_url_retry_on_client_errors(url: str,


def __response_is_gzipped_data(url: str, response: AbstractWebClientResponse) -> bool:
"""Return True if Response looks like it's gzipped."""
"""
Return True if Response looks like it's gzipped.
:param url: URL the response was fetched from.
:param response: Response object.
:return: True if response looks like it might contain gzipped data.
"""
uri = urlparse(url)
url_path = unquote_plus(uri.path)
content_type = response.header('content-type') or ''
Expand All @@ -127,7 +161,12 @@ def __response_is_gzipped_data(url: str, response: AbstractWebClientResponse) ->


def __gunzip(data: bytes) -> bytes:
"""Gunzip data."""
"""
Gunzip data.
:param data: Gzipped data.
:return: Gunzipped data.
"""

if data is None:
raise GunzipException("Data is None.")
Expand All @@ -153,7 +192,13 @@ def __gunzip(data: bytes) -> bytes:


def ungzipped_response_content(url: str, response: AbstractWebClientResponse) -> str:
"""Return HTTP response's decoded content, gunzip it if necessary."""
"""
Return HTTP response's decoded content, gunzip it if necessary.
:param url: URL the response was fetched from.
:param response: Response object.
:return: Decoded and (if necessary) gunzipped response string.
"""

data = response.raw_data()

Expand All @@ -172,7 +217,12 @@ def ungzipped_response_content(url: str, response: AbstractWebClientResponse) ->


def strip_url_to_homepage(url: str) -> str:
"""Strip URL (e.g. http://www.example.com/page.html) to its homepage (e.g. http://www.example.com/)."""
"""
Strip URL to its homepage.
:param url: URL to strip, e.g. "http://www.example.com/page.html".
:return: Stripped homepage URL, e.g. "http://www.example.com/"
"""
if not url:
raise StripURLToHomepageException("URL is empty.")

Expand Down
41 changes: 34 additions & 7 deletions usp/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@


class Logger(object):
"""Logging helper class."""
"""
Logging helper class.
"""

__LEVELS = {
'CRITICAL': logging.CRITICAL,
Expand All @@ -24,7 +26,11 @@ class Logger(object):
]

def __init__(self, name: str):
"""Initialize logger object for a given name."""
"""
Initialize logger object for a given name.
:param name: Module name that the logger should be initialized for.
"""

self.__l = logging.getLogger(name)
if not self.__l.handlers:
Expand All @@ -43,22 +49,43 @@ def __init__(self, name: str):
self.__l.propagate = False

def error(self, message: str) -> None:
    """
    Log error message.

    :param message: Message to log.
    """
    # Delegate to the wrapped stdlib logger (removed the stale pre-commit
    # one-line docstring that the diff render had left duplicated above).
    self.__l.error(message)

def warning(self, message: str) -> None:
    """
    Log warning message.

    :param message: Message to log.
    """
    # Delegate to the wrapped stdlib logger (removed the stale pre-commit
    # one-line docstring that the diff render had left duplicated above).
    self.__l.warning(message)

def info(self, message: str) -> None:
    """
    Log informational message.

    :param message: Message to log.
    """
    # Delegate to the wrapped stdlib logger (removed the stale pre-commit
    # one-line docstring that the diff render had left duplicated above).
    self.__l.info(message)

def debug(self, message: str) -> None:
    """
    Log debugging message.

    :param message: Message to log.
    """
    # Delegate to the wrapped stdlib logger (removed the stale pre-commit
    # one-line docstring that the diff render had left duplicated above).
    self.__l.debug(message)


def create_logger(name: str) -> Logger:
    """
    Create and return Logger object.

    :param name: Module name that the logger should be initialized for.
    :return: Logger object.
    """
    # Thin factory over Logger; removed the stale pre-commit one-line docstring
    # that the diff render had left duplicated above the new one.
    return Logger(name=name)

0 comments on commit 3dde959

Please sign in to comment.