Skip to content

Commit

Permalink
Added documentation, more tests and refactored code
Browse files Browse the repository at this point in the history
  • Loading branch information
agronholm committed Apr 15, 2017
1 parent b331c3e commit d34c3c4
Show file tree
Hide file tree
Showing 28 changed files with 560 additions and 337 deletions.
8 changes: 4 additions & 4 deletions README.rst
Expand Up @@ -5,12 +5,12 @@
:target: https://coveralls.io/github/asphalt-framework/asphalt-feedreader?branch=master
:alt: Code Coverage

This Asphalt framework component provides the ability to parse syndication feeds like RSS_ and
Atom_.
This Asphalt framework component provides the ability to monitor syndication feeds like RSS_ and
Atom_. Arbitrary HTML pages can also be scraped as feeds by means of a custom feed reader class.

Each feed is polled periodically and subscribers are notified of any newly published
entries. There is also support for persisting the state of each feed in various kinds of databases.
Arbitrary HTML pages can also be scraped as feeds by means of a custom feed reader class.
entries. There is also support for persisting the state of each feed, so as not to report old
items again when the application is restarted.

.. _RSS: http://cyber.harvard.edu/rss/rss.html
.. _Atom: https://tools.ietf.org/html/rfc4287
Expand Down
5 changes: 5 additions & 0 deletions asphalt/feedreader/__init__.py
@@ -0,0 +1,5 @@
from .api import FeedReader, FeedStateStore # noqa
from .component import create_feed, FeedReaderComponent # noqa
from .events import EntryEvent, MetadataEvent # noqa
from .metadata import FeedEntry, FeedMetadata # noqa
from .readers.base import BaseFeedReader # noqa
35 changes: 26 additions & 9 deletions asphalt/feedreader/api.py
@@ -1,17 +1,29 @@
from abc import ABCMeta, abstractmethod
from typing import Awaitable, Dict, Any, Optional

from asphalt.core import Context, Signal
from typing import Awaitable, Dict, Any

from asphalt.feedreader.events import EntryEvent, MetadataEvent
from asphalt.feedreader.metadata import FeedMetadata


class FeedReader(metaclass=ABCMeta):
"""
Interface for feed readers.
:var entry_discovered: a signal dispatched when a new entry has been discovered in the feed
:vartype entry_discovered: Signal[EntryEvent]
:var metadata_changed: a signal dispatched when the feed metadata has been changed
:vartype metadata_changed: Signal[MetadataEvent]
:ivar str url: the feed URL
"""

entry_discovered = Signal(EntryEvent)
metadata_changed = Signal(MetadataEvent)
url = None # type: str

@abstractmethod
def start(self, ctx: Context) -> Awaitable:
def start(self, ctx: Context) -> Awaitable[None]:
"""
Initialize the feed.
Expand Down Expand Up @@ -40,33 +52,38 @@ def metadata(self) -> FeedMetadata:
"""Return the feed's metadata."""

@abstractmethod
def update(self) -> Awaitable:
def update(self) -> Awaitable[None]:
"""Read the feed from the source and dispatch any events necessary."""

@classmethod
def can_parse(cls, document: str, content_type: str) -> bool:
def can_parse(cls, document: str, content_type: str) -> Optional[str]:
"""
Return ``True`` if this parser can parse this document, ``False`` otherwise.
Determine if this reader class is suitable for parsing the given document as a feed.
This method is only used for autodetection of feed type (ie. when the feed parser has not
This method is only used for autodetection of feed type by
:func:`~asphalt.feedreader.component.create_feed` (i.e. when the feed parser has not
been specified). Autodetection is skipped when the feed parser has been explicitly given.
:param document: document loaded from the feed URL
:param content_type: MIME type of the loaded document
:return: the reason why this class cannot parse the given document, or ``None`` if it can
parse it
"""
return False
return 'Autodetection not implemented for this class'


class FeedStateStore(metaclass=ABCMeta):
"""Interface for feed state stores."""

@abstractmethod
def start(self, ctx: Context) -> Awaitable:
def start(self, ctx: Context) -> Awaitable[None]:
"""Initialize the store."""

@abstractmethod
def load_state(self, state_id: str) -> Awaitable[Dict[str, Any]]:
"""Load the named state from the store."""

@abstractmethod
def store_state(self, state_id: str, state: Dict[str, Any]) -> Awaitable:
def store_state(self, state_id: str, state: Dict[str, Any]) -> Awaitable[None]:
"""Add or update the indicated state in the store."""
74 changes: 44 additions & 30 deletions asphalt/feedreader/component.py
@@ -1,65 +1,80 @@
from typing import Dict, Type, Union

import aiohttp
import logging
from typing import Dict, Type, Union, Any

from asphalt.core import Component, Context, PluginContainer, merge_config
import aiohttp
from asphalt.core import Component, Context, PluginContainer, merge_config, qualified_name
from typeguard import check_argument_types

from asphalt.core.utils import qualified_name
from asphalt.feedreader.api import FeedReader

feed_readers = PluginContainer('asphalt.feedreader.readers')
feed_stores = PluginContainer('asphalt.feedreader.stores')
logger = logging.getLogger(__name__)


async def create_feed(url: str, kind: Union[str, Type[FeedReader]] = None,
**config) -> FeedReader:
async def create_feed(ctx: Context, reader: Union[str, Type[FeedReader]] = None,
**reader_args) -> FeedReader:
"""
Create a syndication feed.
Create and start a syndication feed.
The returned feed needs to be started (using :meth:`~asphalt.feedreader.api.FeedReader.start`).
.. note:: This function does **NOT** add the feed to the context as a resource.
:param url: the address of the feed
:param kind: either a feed reader class or the entry point name of one, or ``None`` to
attempt automatic detection of the feed type
:param config: keyword arguments passed to the feed reader class
:param ctx: a context object (passed to the :meth:`~asphalt.feedreader.api.FeedReader.start`
method)
:param reader: specifies the feed reader class by one of the following means:
* a subclass of :class:`~asphalt.feedreader.api.FeedReader`
* the entry point name of one
* a ``module:varname`` reference to one
* ``None`` to attempt automatic detection of the feed type
:param reader_args: keyword arguments passed to the feed reader class
:return: a feed reader
"""
assert check_argument_types()
if isinstance(kind, type):
feed_class = kind
elif kind:
feed_class = feed_readers.resolve(kind)
if isinstance(reader, type):
feed_class = reader
elif reader:
feed_class = feed_readers.resolve(reader)
else:
try:
url = reader_args['url']
except KeyError:
raise LookupError('no "url" option was specified – it is required for feed reader '
'autodetection') from None

feed_class = None
async with aiohttp.request('GET', url) as response:
response.raise_for_status()
text = await response.text()
for cls in feed_readers.all():
if cls.can_parse(text, response.content_type):
logger.info('Attempting autodetection of feed reader class for %s', url)
reason = cls.can_parse(text, response.content_type)
if reason:
logger.info('%s: %s', qualified_name(cls), reason)
else:
logger.info('Selected reader class %s for %s', qualified_name(cls), url)
feed_class = cls
break
else:
raise RuntimeError('unable to detect the feed type for url: ' + url)

if feed_class:
return feed_class(url=url, **config)
else:
raise RuntimeError('unable to detect the feed type for url: ' + url)
feed = feed_class(**reader_args)
await feed.start(ctx)
return feed


class FeedReaderComponent(Component):
"""
Creates :class:`~asphalt.feedreader.api.FeedReader` resources.
:param feeds: a dictionary of resource name ⭢ feed configuration
:param feeds: a dictionary of resource name ⭢ keyword arguments to :func:`~.create_feed`
:param stores: a dictionary of resource name ⭢ feed state store configuration
:param feed_defaults: defaults for keyword arguments passed to the constructors of the chosen
feed class(es)
:param feed_defaults: defaults for keyword arguments passed to :func:`~.create_feed`
"""

def __init__(self, feeds: Dict[str, dict] = None, stores: Dict[str, dict] = None,
**feed_defaults):
def __init__(self, feeds: Dict[str, Dict[str, Any]] = None,
stores: Dict[str, Dict[str, Any]] = None, **feed_defaults):
assert check_argument_types()
if not feeds:
feed_defaults.setdefault('context_attr', 'feed')
Expand All @@ -85,8 +100,7 @@ async def start(self, ctx: Context):
qualified_name(store))

for resource_name, context_attr, config in self.feeds:
feed = await create_feed(**config)
await feed.start(ctx)
ctx.add_resource(feed, resource_name, context_attr)
feed = await create_feed(ctx, **config)
ctx.add_resource(feed, resource_name, context_attr, types=[type(feed), FeedReader])
logger.info('Configured feed (%s / ctx.%s; url=%s)', resource_name, context_attr,
feed.url)
48 changes: 11 additions & 37 deletions asphalt/feedreader/events.py
@@ -1,51 +1,24 @@
from datetime import datetime
from typing import Iterable, Dict, Any
from typing import Dict, Any

from asphalt.core import Event
from typeguard import check_argument_types

from asphalt.feedreader.metadata import FeedEntry


class EntryEvent(Event):
"""
Base class for feed entry events.
:ivar str id: globally unique identifier of the entry
:ivar title: short title of the entry
:vartype title: Optional[str]
:ivar summary: a short description of the entry
:vartype summary: Optional[str]
:ivar categories: a tuple of category names for the entry
:vartype categories: Tuple[str, ...]
:ivar link: a URL that links to the relevant web page
:vartype link: Optional[str]
:ivar published: a timezone aware date/time when the entry was published
:vartype published: Optional[datetime]
:ivar enclosure_url: URL to a related media object
:vartype enclosure_url: Optional[str]
:ivar enclosure_length: size (in bytes) of the related media object
:vartype enclosure_length: Optional[int]
:ivar enclosure_type: MIME type of the related media object
:vartype enclosure_type: Optional[str]
Signals that a new news entry has been discovered in a syndication feed.
:ivar FeedEntry entry: the entry that was discovered
"""

__slots__ = ('id', 'title', 'summary', 'categories', 'link', 'published', 'enclosure_url',
'enclosure_length', 'enclosure_type')
__slots__ = 'entry'

def __init__(self, source, topic: str, id: str, *, title: str = None,
summary: str = None, categories: Iterable[str] = (), link: str = None,
published: datetime = None, enclosure_url: str = None,
enclosure_length: int = None, enclosure_type: str = None):
def __init__(self, source, topic: str, entry: FeedEntry):
assert check_argument_types()
super().__init__(source, topic)
self.id = id
self.title = title
self.summary = summary
self.categories = tuple(categories)
self.link = link
self.published = published
self.enclosure_url = enclosure_url
self.enclosure_length = enclosure_length
self.enclosure_type = enclosure_type
self.entry = entry


class MetadataEvent(Event):
Expand All @@ -56,8 +29,9 @@ class MetadataEvent(Event):
:vartype changes: Dict[str, Any]
"""

__slots__ = ('changes',)
__slots__ = 'changes'

def __init__(self, source, topic: str, changes: Dict[str, Any]):
assert check_argument_types()
super().__init__(source, topic)
self.changes = changes
48 changes: 43 additions & 5 deletions asphalt/feedreader/metadata.py
@@ -1,12 +1,49 @@
from datetime import datetime # noqa
from typing import Dict, Any, Tuple # noqa
from datetime import datetime
from typing import Dict, Any, Tuple, Iterable # noqa

from dateutil.parser import parse
from typeguard import check_argument_types


class FeedEntry:
    """
    Represents a news item in a syndication feed.

    The following parameters are also available as instance variables:

    :param id: globally unique identifier of the entry
    :param title: short title of the entry
    :param summary: a short description of the entry
    :param categories: category names for the entry; any iterable of strings is accepted and
        stored as a tuple, and a single bare string is treated as one category name
    :param link: a URL that links to the relevant web page
    :param published: a timezone aware date/time when the entry was published
    :param enclosure_url: URL to a related media object
    :param enclosure_length: size (in bytes) of the related media object
    :param enclosure_type: MIME type of the related media object
    """

    __slots__ = ('id', 'title', 'summary', 'categories', 'link', 'published', 'enclosure_url',
                 'enclosure_length', 'enclosure_type')

    def __init__(self, id: str, *, title: str = None, summary: str = None,
                 categories: Iterable[str] = (), link: str = None, published: datetime = None,
                 enclosure_url: str = None, enclosure_length: int = None,
                 enclosure_type: str = None):
        # Runtime argument type check; skipped entirely when assertions are disabled (-O)
        assert check_argument_types()
        self.id = id
        self.title = title
        self.summary = summary
        self.categories = self._normalize_categories(categories)
        self.link = link
        self.published = published
        self.enclosure_url = enclosure_url
        self.enclosure_length = enclosure_length
        self.enclosure_type = enclosure_type

    @staticmethod
    def _normalize_categories(categories):
        """
        Return the given category names as a tuple.

        A bare string is itself an iterable of strings, so ``tuple('news')`` would silently
        yield one category per character. It is therefore wrapped as a single category.

        :param categories: an iterable of category names, or a single category name
        :return: a tuple of category names
        """
        if isinstance(categories, str):
            return (categories,)

        return tuple(categories)

    def __repr__(self) -> str:
        # Only show the fields that carry a value to keep the representation readable
        fields = ', '.join(
            '{}={!r}'.format(name, getattr(self, name))
            for name in self.__slots__
            if getattr(self, name, None) not in (None, ())
        )
        return '{}({})'.format(type(self).__name__, fields)


class FeedMetadata:
"""
Contains feed metadata.
Contains metadata for a syndication feed.
:ivar icon: URL pointing to the image representing this feed
:vartype icon: Optional[str]
Expand All @@ -32,9 +69,9 @@ class FeedMetadata:

def __getstate__(self) -> Dict[str, Any]:
state = {
key: getattr(self, value) for key, value in
key: getattr(self, key) for key in
('categories', 'icon', 'title', 'link', 'generator', 'copyright')
if getattr(self, value) is not None
if getattr(self, key) is not None
}
state['version'] = 1
if self.updated:
Expand All @@ -43,6 +80,7 @@ def __getstate__(self) -> Dict[str, Any]:
return state

def __setstate__(self, state: Dict[str, Any]):
assert check_argument_types()
version = state.get('version')
if version != 1:
raise ValueError('cannot handle {} state version {}'.
Expand Down

0 comments on commit d34c3c4

Please sign in to comment.