Skip to content

Commit

Permalink
Minor improvements to the docs
Browse files Browse the repository at this point in the history
  • Loading branch information
pypt committed Jul 18, 2019
1 parent 5075ec6 commit 3dde959
Show file tree
Hide file tree
Showing 8 changed files with 242 additions and 58 deletions.
16 changes: 12 additions & 4 deletions usp/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,28 @@


class SitemapException(Exception):
    """
    Problem due to which we can't run further, e.g. wrong input parameters.
    """
    # NOTE: the diff render left the pre-commit one-line docstring in place above
    # the new one, producing a stale duplicate string statement; only the intended
    # post-commit docstring is kept here.
    pass


class SitemapXMLParsingException(Exception):
    """
    XML parsing exception to be handled gracefully.
    """
    # NOTE: removed the stale pre-commit one-line docstring that the diff render
    # had left duplicated above the new multi-line docstring.
    pass


class GunzipException(Exception):
    """
    gunzip() exception.
    """
    # NOTE: removed the stale pre-commit one-line docstring that the diff render
    # had left duplicated above the new multi-line docstring.
    pass


class StripURLToHomepageException(Exception):
    """
    strip_url_to_homepage() exception.
    """
    # NOTE: removed the stale pre-commit one-line docstring that the diff render
    # had left duplicated above the new multi-line docstring.
    pass
25 changes: 19 additions & 6 deletions usp/fetch_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,15 +246,20 @@ def sitemap(self) -> AbstractSitemap:

@classmethod
def __normalize_xml_element_name(cls, name: str):
"""Replace the namespace URL in the argument element name with internal namespace.
"""
Replace the namespace URL in the argument element name with internal namespace.
* Elements from http://www.sitemaps.org/schemas/sitemap/0.9 namespace will be prefixed with "sitemap:",
e.g. "<loc>" will become "<sitemap:loc>"
* Elements from http://www.google.com/schemas/sitemap-news/0.9 namespace will be prefixed with "news:",
e.g. "<publication>" will become "<news:publication>"
For non-sitemap namespaces, return the element name with the namespace stripped."""
For non-sitemap namespaces, return the element name with the namespace stripped.
:param name: Namespace URL plus XML element name, e.g. "http://www.sitemaps.org/schemas/sitemap/0.9 loc"
:return: Internal namespace name plus element name, e.g. "sitemap loc"
"""

name_parts = name.split(cls.__XML_NAMESPACE_SEPARATOR)

Expand Down Expand Up @@ -332,7 +337,9 @@ def _xml_char_data(self, data: str) -> None:


class AbstractXMLSitemapParser(object, metaclass=abc.ABCMeta):
"""Abstract XML sitemap parser."""
"""
Abstract XML sitemap parser.
"""

__slots__ = [
# URL of the sitemap that is being parsed
Expand Down Expand Up @@ -374,7 +381,9 @@ def sitemap(self) -> AbstractSitemap:


class IndexXMLSitemapParser(AbstractXMLSitemapParser):
"""Index XML sitemap parser."""
"""
Index XML sitemap parser.
"""

__slots__ = [
'_web_client',
Expand Down Expand Up @@ -430,7 +439,9 @@ def sitemap(self) -> AbstractSitemap:


class PagesXMLSitemapParser(AbstractXMLSitemapParser):
"""Pages XML sitemap parser."""
"""
Pages XML sitemap parser.
"""

class Page(object):
"""Simple data class for holding various properties for a single <url> entry while parsing."""
Expand Down Expand Up @@ -666,7 +677,9 @@ class PagesRSSSitemapParser(AbstractXMLSitemapParser):
"""

class Page(object):
"""Simple data class for holding various properties for a single <item> entry while parsing."""
"""
Simple data class for holding various properties for a single <item> entry while parsing.
"""

__slots__ = [
'link',
Expand Down
68 changes: 59 additions & 9 deletions usp/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,12 @@


def is_http_url(url: str) -> bool:
"""Returns true if URL is in the "http" ("https") scheme."""
"""
Returns true if URL is of the "http" ("https") scheme.
:param url: URL to test.
:return: True if argument URL is of the "http" ("https") scheme.
"""
if url is None:
log.debug("URL is None")
return False
Expand Down Expand Up @@ -58,7 +63,12 @@ def is_http_url(url: str) -> bool:


def html_unescape_strip(string: Optional[str]) -> Optional[str]:
"""Decode HTML entities, strip string, set to None if it's empty; ignore None as input."""
"""
Decode HTML entities, strip string, set to None if it's empty; ignore None as input.
:param string: String to decode HTML entities in.
:return: Stripped string with HTML entities decoded; None if parameter string was empty or None.
"""
if string:
string = html.unescape(string)
string = string.strip()
Expand All @@ -68,7 +78,12 @@ def html_unescape_strip(string: Optional[str]) -> Optional[str]:


def parse_iso8601_date(date_string: str) -> datetime.datetime:
"""Parse sitemap's <publication_date> into datetime.datetime object."""
"""
Parse ISO 8601 date (e.g. from sitemap's <publication_date>) into datetime.datetime object.
:param date_string: ISO 8601 date, e.g. "2018-01-12T21:57:27Z" or "1997-07-16T19:20:30+01:00".
:return: datetime.datetime object of a parsed date.
"""
# FIXME parse known date formats faster

if not date_string:
Expand All @@ -80,7 +95,12 @@ def parse_iso8601_date(date_string: str) -> datetime.datetime:


def parse_rfc2822_date(date_string: str) -> datetime.datetime:
    """
    Parse RFC 2822 date (e.g. from RSS's <pubDate> or Atom's <issued>) into
    datetime.datetime object.

    :param date_string: RFC 2822 date, e.g. "Tue, 10 Aug 2010 20:43:53 -0000".
    :return: datetime.datetime object of a parsed date.
    """
    # FIXME parse known date formats faster
    # Delegates to the ISO 8601 parser, which is apparently lenient enough to
    # accept RFC 2822 input too -- presumably backed by a permissive date
    # parser; verify against parse_iso8601_date's implementation.
    return parse_iso8601_date(date_string)

Expand All @@ -89,7 +109,15 @@ def get_url_retry_on_client_errors(url: str,
web_client: AbstractWebClient,
retry_count: int = 5,
sleep_between_retries: int = 1) -> AbstractWebClientResponse:
"""Fetch URL, retry on client errors (which, as per implementation, might be request timeouts too)."""
"""
Fetch URL, retry on retryable errors.
:param url: URL to fetch.
:param web_client: Web client object to use for fetching.
:param retry_count: How many times to retry fetching the same URL.
:param sleep_between_retries: How long to sleep between retries, in seconds.
:return: Web client response object.
"""
assert retry_count > 0, "Retry count must be positive."

response = None
Expand All @@ -114,7 +142,13 @@ def get_url_retry_on_client_errors(url: str,


def __response_is_gzipped_data(url: str, response: AbstractWebClientResponse) -> bool:
"""Return True if Response looks like it's gzipped."""
"""
Return True if Response looks like it's gzipped.
:param url: URL the response was fetched from.
:param response: Response object.
:return: True if response looks like it might contain gzipped data.
"""
uri = urlparse(url)
url_path = unquote_plus(uri.path)
content_type = response.header('content-type') or ''
Expand All @@ -127,7 +161,12 @@ def __response_is_gzipped_data(url: str, response: AbstractWebClientResponse) ->


def __gunzip(data: bytes) -> bytes:
"""Gunzip data."""
"""
Gunzip data.
:param data: Gzipped data.
:return: Gunzipped data.
"""

if data is None:
raise GunzipException("Data is None.")
Expand All @@ -153,7 +192,13 @@ def __gunzip(data: bytes) -> bytes:


def ungzipped_response_content(url: str, response: AbstractWebClientResponse) -> str:
"""Return HTTP response's decoded content, gunzip it if necessary."""
"""
Return HTTP response's decoded content, gunzip it if necessary.
:param url: URL the response was fetched from.
:param response: Response object.
:return: Decoded and (if necessary) gunzipped response string.
"""

data = response.raw_data()

Expand All @@ -172,7 +217,12 @@ def ungzipped_response_content(url: str, response: AbstractWebClientResponse) ->


def strip_url_to_homepage(url: str) -> str:
"""Strip URL (e.g. http://www.example.com/page.html) to its homepage (e.g. http://www.example.com/)."""
"""
Strip URL to its homepage.
:param url: URL to strip, e.g. "http://www.example.com/page.html".
:return: Stripped homepage URL, e.g. "http://www.example.com/"
"""
if not url:
raise StripURLToHomepageException("URL is empty.")

Expand Down
41 changes: 34 additions & 7 deletions usp/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@


class Logger(object):
"""Logging helper class."""
"""
Logging helper class.
"""

__LEVELS = {
'CRITICAL': logging.CRITICAL,
Expand All @@ -24,7 +26,11 @@ class Logger(object):
]

def __init__(self, name: str):
"""Initialize logger object for a given name."""
"""
Initialize logger object for a given name.
:param name: Module name that the logger should be initialized for.
"""

self.__l = logging.getLogger(name)
if not self.__l.handlers:
Expand All @@ -43,22 +49,43 @@ def __init__(self, name: str):
self.__l.propagate = False

def error(self, message: str) -> None:
    """
    Log error message.

    :param message: Message to log.
    """
    # Delegate to the wrapped stdlib logger (removed the stale pre-commit
    # one-line docstring that the diff render had left duplicated above).
    self.__l.error(message)

def warning(self, message: str) -> None:
    """
    Log warning message.

    :param message: Message to log.
    """
    # Delegate to the wrapped stdlib logger (removed the stale pre-commit
    # one-line docstring that the diff render had left duplicated above).
    self.__l.warning(message)

def info(self, message: str) -> None:
    """
    Log informational message.

    :param message: Message to log.
    """
    # Delegate to the wrapped stdlib logger (removed the stale pre-commit
    # one-line docstring that the diff render had left duplicated above).
    self.__l.info(message)

def debug(self, message: str) -> None:
    """
    Log debugging message.

    :param message: Message to log.
    """
    # Delegate to the wrapped stdlib logger (removed the stale pre-commit
    # one-line docstring that the diff render had left duplicated above).
    self.__l.debug(message)


def create_logger(name: str) -> Logger:
    """
    Create and return Logger object.

    :param name: Module name that the logger should be initialized for.
    :return: Logger object.
    """
    # Thin factory over Logger; removed the stale pre-commit one-line docstring
    # that the diff render had left duplicated above the new one.
    return Logger(name=name)

0 comments on commit 3dde959

Please sign in to comment.