Skip to content

Commit

Permalink
new: [rssfind.py] a simple script to discover RSS/Atom feeds from an URL
Browse files Browse the repository at this point in the history
  • Loading branch information
adulau committed Mar 3, 2024
1 parent 243ac1d commit 848b96a
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 0 deletions.
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,21 @@ As 2024 marks the resurgence of RSS and Atom[^1], I decided to update my rudimen

## Tools

### rssfind

[rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script for discovering RSS or Atom feeds from a URL. It returns an array in JSON format of all the potential feeds discovered.

~~~shell
Usage: Find RSS or Atom feeds from an URL
usage: rssfind.py [options]

Options:
-h, --help show this help message and exit
-l LINK, --link=LINK http link where to find one or more feed source(s)
-d, --disable-strict Include empty feeds in the list, default strict is
enabled
~~~

### rsscluster

[rsscluster.py](https://github.com/adulau/rss-tools/blob/master/bin/rsscluster.py) is a simple script that clusters items from an RSS feed based on a specified time interval, expressed in days.
Expand Down
2 changes: 2 additions & 0 deletions REQUIREMENTS
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
bs4
feedparser
orjson
requests
89 changes: 89 additions & 0 deletions bin/rssfind.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#!/usr/bin/python3

import sys
import urllib.parse
from optparse import OptionParser

import feedparser
import orjson as json
import requests
from bs4 import BeautifulSoup as bs4


def findfeeds(url=None, disable_strict=False):
    """Discover RSS/Atom feed URLs advertised by the page at *url*.

    Fetches the page, collects feed candidates from standard
    ``<link rel="alternate">`` autodiscovery tags and from any ``<a href>``
    that looks feed-like, then (unless *disable_strict*) keeps only
    candidates that actually parse as a non-empty feed.

    :param url: page to inspect; ``None`` returns ``None`` immediately.
    :param disable_strict: when True, return every discovered candidate
        without validating it with feedparser.
    :return: sorted list of feed URLs (possibly empty), or ``None`` when
        *url* is ``None``.
    """
    if url is None:
        return None

    # Timeout so a stalled server cannot hang the script indefinitely.
    raw = requests.get(url, timeout=30).text
    html = bs4(raw, features="lxml")
    discovered_feeds = []

    # Standard feed autodiscovery: <link rel="alternate" type="application/rss+xml" ...>.
    for link in html.findAll("link", rel="alternate"):
        mime = link.get("type")
        if mime and ("feed" in mime or "rss" in mime or "xml" in mime):
            href = link.get("href")
            if href:
                # urljoin correctly handles absolute, root-relative and
                # relative hrefs (the raw href alone may be relative).
                discovered_feeds.append(urllib.parse.urljoin(url, href))

    # Fallback heuristic: any anchor whose href looks feed-like.
    for anchor in html.findAll("a"):
        href = anchor.get("href")
        if href and ("feed" in href or "rss" in href or "xml" in href):
            # urljoin leaves absolute hrefs untouched and preserves any
            # port, unlike naive scheme://hostname + href concatenation.
            discovered_feeds.append(urllib.parse.urljoin(url, href))

    # De-duplicate; sort for deterministic output across runs.
    candidates = sorted(set(discovered_feeds))

    if disable_strict:
        return candidates

    # Strict mode: keep only URLs that parse as a feed with at least one entry.
    results = []
    for candidate in candidates:
        parsed = feedparser.parse(candidate)
        if parsed.entries:
            results.append(candidate)
    return results


version = "0.2"

# Identify ourselves politely to the servers we poll.
feedparser.USER_AGENT = (
    "rssfind.py " + version + " +https://github.com/adulau/rss-tools"
)

usage = "Find RSS or Atom feeds from an URL\nusage: %prog [options]"

parser = OptionParser(usage)

parser.add_option(
    "-l",
    "--link",
    dest="link",
    help="http link where to find one or more feed source(s)",
)

# BUG FIX: the original used action="store_false" with default=False, so
# passing -d could never change the value and the flag was a no-op.
# store_true makes -d actually enable non-strict mode.
parser.add_option(
    "-d",
    "--disable-strict",
    action="store_true",
    dest="disable_strict",
    default=False,
    help="Include empty feeds in the list, default strict is enabled",
)

(options, args) = parser.parse_args()

if not options.link:
    print("URL missing")
    parser.print_help()
    # A missing required argument is an error: exit non-zero (was 0).
    sys.exit(1)

# Emit the discovered feeds as a JSON array on stdout.
print(
    json.dumps(findfeeds(options.link, disable_strict=options.disable_strict)).decode(
        "utf-8"
    )
)

0 comments on commit 848b96a

Please sign in to comment.