In [4]:
from scrapy.http import TextResponse
from lxml import etree

def is_feed(response: TextResponse) -> bool:
    """
    Check if the response body is of a feed type (Atom, RSS2, RSS1, RSS0).

    Parameters:
    - response (TextResponse): A Scrapy response object.

    Returns:
    - bool: True if the response is a feed, otherwise False.
    """

    if not isinstance(response, TextResponse):
        return False

    try:
        root = etree.fromstring(response.body)

        # Check for Atom feed
        if root.tag == '{http://www.w3.org/2005/Atom}feed':
            return True

        # Check for various RSS feed types
        if root.tag == 'rss':
            # Check for version attribute to further narrow down
            version = root.get('version')
            if version in ['2.0', '1.0', '0.91', '0.92', '0.93', '0.94']:
                return True

        # Check for RDF (for RSS 1.0)
        if root.tag == '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF':
            return True

    except (etree.XMLSyntaxError, etree.XMLSchemaError):
        # Not a well-formed XML/Feed
        pass

    return False

# Example usage:
# response = some_fetched_response_using_scrapy
# if is_feed(response):
#     print("It's a feed!")


In [5]:
urls = ['https://siliconflorist.com/feed',
'https://silveiraneto.net/feed',
'https://silverscenesblog.blogspot.com/feeds/posts/default',
'https://silverscreenings.org/feed',
'https://silvestar.codes/rss.xml',
'https://silviamaggidesign.com/feed.xml',
'https://simberon.blogspot.com/feeds/posts/default',
'https://simblob.blogspot.com/feeds/posts/default',
'https://simbly.me/feed.xml',
'https://simeongriggs.dev/feed.xml',
'https://simeyeveritt.wixsite.com/brapa/blog-feed.xml',
'https://simme.dev/index.xml',
'https://simon-frey.com/blog/feed',
'https://simon-frey.eu/feed',
'https://simon.shimmerproject.org/feed',
'https://simonewebdesign.it/atom.xml',
'https://simonfredsted.com/feed',
'https://simonhearne.com/feed/feed.xml',
'https://simonknott.de/feed.xml',
'https://simonpai.github.io/index.xml',
'https://simonparrismaninchair.com/feed/',
'https://simonsafar.com/index.xml',
'https://simonschreibt.de/feed',
'https://simonwillison.net/atom/everything/',
'https://simplelivingsomerset.wordpress.com/feed',
'https://simpleprogrammer.com/feed',
'https://simply.jenett.org/feed.atom',
'https://simply.joejenett.com/feed.atom',
'https://simplyexplained.com/atom.xml',
'https://sin-ack.github.io/index.xml',
'https://sinclairtrails.com/feed/',
'https://sindre.is/feed/feed.xml',
'https://sindresorhus.com/rss.xml',
'https://singhkays.com/index.xml',
'https://trafilatura.readthedocs.io/en/latest/index.html,'
'https://sintraworks.github.io/feed.xml',]

In [6]:
urls = ['https://adrien.barbaresi.eu/blog/link-filtering-courlan-python.html']

In [7]:
for url in urls:
    downloaded = trafilatura.fetch_url(url)
    print(is_feed(downloaded))

NameError: name 'trafilatura' is not defined