# 00 — Config & Targets
Single source of truth for feeds, timeframe, symbols/phrases, and row caps.
This writes `data/raw/config.json` that downstream notebooks will read.


In [1]:
from pathlib import Path
import json, datetime as dt

# project paths
PROJ = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
DATA = PROJ / "data"
RAW  = DATA / "raw"
PROC = DATA / "processed"
OUT  = DATA / "outputs"
for p in (DATA, RAW, PROC, OUT): p.mkdir(parents=True, exist_ok=True)

CONFIG_PATH = RAW / "config.json"


In [6]:
# ---- EDIT THESE ----

# Pick ~10 publications. Swap these for the ones you actually want.
# All are RSS URLs; archive URLs also work but RSS is faster/cleaner.
SUBSTACKS = {
    "Read Max": "https://maxread.substack.com/feed",
    "Noahpinion": "https://noahpinion.substack.com/feed",
    "The Intrinsic Perspective": "https://erikhoel.substack.com/feed",
    "Cliodynamica by Peter Turchin": "https://peterturchin.substack.com/feed",
    "The Culturist": "https://culturist.substack.com/feed",
    "Story Club by George Saunders": "https://georgesaunders.substack.com/feed",
    "Poetic Outlaws": "https://poeticoutlaws.substack.com/feed",
    "Hardware FYI": "https://hardwarefyi.substack.com/feed",
    "Letters from an American by Heather Cox Richardson": "https://heathercoxrichardson.substack.com/feed",
    "Anton Howes": "https://antonhowes.substack.com/feed",
}

# TIP: replace with the exact 10 you want. RSS is usually discoverable by appending /feed.

# Timeframe — keep it generous but finite
YEARS_BACK = 2
SINCE = (dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=365 * YEARS_BACK)).isoformat()

# Cap per publication to keep total rows reasonable for a 1–2h project
TARGET_POSTS_PER_PUB = 50

# Symbols to count later (02 notebook). You can tweak here so it's all centralized.
SYMBOLS = {
    "emdash": "—",
    "rocket": "🚀",
    "green_check": "✅",
    "sparkles": "✨",
    "fire": "🔥",
    "robot": "🤖",
    "chart_up": "📈",
    "ellipsis": "…",
}

# Common AI-ish phrases to flag
PHRASES = [
    r"\bwe'?re\s+excited\s+to\s+announce\b",
]

# Polite scrape throttle (seconds) used downstream
REQUEST_SLEEP_S = 0.2
REQUEST_TIMEOUT_S = 10


In [7]:
cfg = {
    "subscribes": SUBSTACKS,                     # {name: rss_url}
    "since_iso": SINCE,                          # ISO8601 string
    "years_back": YEARS_BACK,
    "target_posts_per_pub": TARGET_POSTS_PER_PUB,
    "symbols": SYMBOLS,                          # {label: literal}
    "phrases": PHRASES,                          # list[regex]
    "request_sleep_s": REQUEST_SLEEP_S,
    "request_timeout_s": REQUEST_TIMEOUT_S,
}

with open(CONFIG_PATH, "w", encoding="utf-8") as f:
    json.dump(cfg, f, ensure_ascii=False, indent=2)

print(f"Wrote {CONFIG_PATH.relative_to(PROJ)}")


Wrote data\raw\config.json


I use Comet to find all of the RSS URLs, so I want to validate the URLs.

In [4]:
import requests, feedparser, re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
from pathlib import Path

PROJ = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
DATA = PROJ / "data"; RAW = DATA / "raw"; OUT = DATA / "outputs"
for p in (DATA, RAW, OUT): p.mkdir(parents=True, exist_ok=True)

HEADERS = {"User-Agent": "Mozilla/5.0 (Lincoln/Workslop-FeedCheck)"}
TIMEOUT = 8

RSS_CT_HINTS = (
    "application/rss+xml",
    "application/atom+xml",
    "application/xml",
    "text/xml",
)

def is_xmlish_content_type(ct: str | None) -> bool:
    if not ct: return False
    ct = ct.lower()
    return any(h in ct for h in RSS_CT_HINTS)

def discover_rss_links(page_url: str) -> list[str]:
    """If page_url is an HTML page, look for <link rel="alternate" type=...> feed links."""
    try:
        r = requests.get(page_url, headers=HEADERS, timeout=TIMEOUT)
        r.raise_for_status()
    except Exception:
        return []
    soup = BeautifulSoup(r.text, "lxml")
    links = []
    for tag in soup.find_all("link", attrs={"rel": re.compile(r"\balternate\b", re.I)}):
        t = (tag.get("type") or "").lower()
        if any(h in t for h in RSS_CT_HINTS):
            href = tag.get("href")
            if href:
                links.append(urljoin(page_url, href))
    return list(dict.fromkeys(links))  # dedupe preserve order

def validate_feed_url(url: str) -> dict:
    """Return dict with validation details for this URL (RSS/Atom check)."""
    # 1) HEAD check
    head_ct = None
    try:
        h = requests.head(url, headers=HEADERS, allow_redirects=True, timeout=TIMEOUT)
        head_ct = h.headers.get("Content-Type", "")
    except Exception:
        pass

    # 2) If HEAD looks non-XML, we’ll still try GET+parse (some servers misreport)
    get_ct = None
    text_sample = None
    try:
        g = requests.get(url, headers=HEADERS, allow_redirects=True, timeout=TIMEOUT)
        g.raise_for_status()
        get_ct = g.headers.get("Content-Type", "")
        text_sample = g.text[:2000]  # for debugging
    except Exception as e:
        return {
            "url": url,
            "ok": False,
            "reason": f"GET failed: {e.__class__.__name__}",
            "head_content_type": head_ct,
            "get_content_type": get_ct,
            "feed_type": None,
            "entries": 0,
            "bozo": None,
        }

    # 3) Parse with feedparser
    fp = feedparser.parse(text_sample if text_sample else url)
    feed_type = None
    if fp.version:
        # feedparser sets version to e.g. 'rss20', 'atom10'
        feed_type = fp.version

    entries = len(fp.entries or [])
    bozo = getattr(fp, "bozo", 0)
    ok = bool(feed_type) and entries > 0

    # If not ok and content-type is HTML, try discovery
    reason = ""
    if not ok and get_ct and "html" in get_ct.lower():
        alt_links = discover_rss_links(url)
        if alt_links:
            # try first discovered link
            fp2 = feedparser.parse(alt_links[0])
            entries2 = len(fp2.entries or [])
            feed_type2 = fp2.version
            bozo2 = getattr(fp2, "bozo", 0)
            if feed_type2 and entries2 > 0 and bozo2 == 0:
                return {
                    "url": url,
                    "ok": True,
                    "reason": f"Discovered RSS via <link>: {alt_links[0]}",
                    "head_content_type": head_ct,
                    "get_content_type": get_ct,
                    "feed_type": feed_type2,
                    "entries": entries2,
                    "bozo": bozo2,
                    "resolved_feed": alt_links[0],
                }
            reason = f"HTML page; discovered feed(s) but parse failed or empty: {alt_links}"
        else:
            reason = "HTML page; no alternate RSS/Atom links found"
    elif not ok:
        reason = f"Parsed but invalid/empty (type={feed_type}, entries={entries}, bozo={bozo})"

    return {
        "url": url,
        "ok": ok,
        "reason": reason,
        "head_content_type": head_ct,
        "get_content_type": get_ct,
        "feed_type": feed_type,
        "entries": entries,
        "bozo": bozo,
    }


In [8]:
import json, pandas as pd
CONFIG_PATH = RAW / "config.json"
cfg = json.load(open(CONFIG_PATH, "r", encoding="utf-8"))
SUBS = cfg["subscribes"]

rows = []
for name, url in SUBS.items():
    res = validate_feed_url(url)
    res["pub"] = name
    rows.append(res)

df_check = pd.DataFrame(rows)[[
    "pub", "url", "ok", "feed_type", "entries",
    "head_content_type", "get_content_type", "bozo", "reason"
]]
df_check = df_check.sort_values(["ok","pub"], ascending=[False,True]).reset_index(drop=True)
display(df_check)

out_csv = OUT / "rss_validation_report.csv"
df_check.to_csv(out_csv, index=False)
print(f"Wrote {out_csv.relative_to(PROJ)}")


Unnamed: 0,pub,url,ok,feed_type,entries,head_content_type,get_content_type,bozo,reason
0,Anton Howes,https://antonhowes.substack.com/feed,True,rss20,1,application/xml; charset=utf-8,application/xml; charset=utf-8,1,
1,Cliodynamica by Peter Turchin,https://peterturchin.substack.com/feed,True,rss20,1,application/xml; charset=utf-8,application/xml; charset=utf-8,1,
2,Hardware FYI,https://hardwarefyi.substack.com/feed,True,rss20,1,application/xml; charset=utf-8,application/xml; charset=utf-8,1,
3,Letters from an American by Heather Cox Richar...,https://heathercoxrichardson.substack.com/feed,True,rss20,1,application/xml; charset=utf-8,application/xml; charset=utf-8,1,
4,Noahpinion,https://noahpinion.substack.com/feed,True,rss20,1,application/xml; charset=utf-8,application/xml; charset=utf-8,1,
5,Poetic Outlaws,https://poeticoutlaws.substack.com/feed,True,rss20,1,application/xml; charset=utf-8,application/xml; charset=utf-8,1,
6,Read Max,https://maxread.substack.com/feed,True,rss20,1,application/xml; charset=utf-8,application/xml; charset=utf-8,1,
7,Story Club by George Saunders,https://georgesaunders.substack.com/feed,True,rss20,1,application/xml; charset=utf-8,application/xml; charset=utf-8,1,
8,The Culturist,https://culturist.substack.com/feed,True,rss20,1,application/xml; charset=utf-8,application/xml; charset=utf-8,1,
9,The Intrinsic Perspective,https://erikhoel.substack.com/feed,True,rss20,1,application/xml; charset=utf-8,application/xml; charset=utf-8,1,


Wrote data\outputs\rss_validation_report.csv
