In [None]:
import os
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Configuration: the SyncFeed endpoint, the headers sent with every request,
# and the local folder that receives the downloaded documents.
feed_url = "https://gegevensmagazijn.tweedekamer.nl/SyncFeed/2.0/Feed"
headers = {"User-Agent": "Mozilla/5.0 (compatible; SyncFeedDownloader/1.0)"}
download_dir = "tweede_kamer_documents"
os.makedirs(download_dir, exist_ok=True)

# Fetch the feed. The timeout (seconds) keeps the cell from hanging
# indefinitely when the server accepts the connection but never answers.
response = requests.get(feed_url, headers=headers, timeout=30)
if response.status_code != 200:
    # Raise instead of calling exit(): `exit` is an interactive helper, and in
    # a notebook it targets the kernel rather than simply stopping this cell.
    # An exception halts execution right here, so the parsing code below never
    # runs against a failed response.
    raise RuntimeError(f"Error fetching feed: {response.status_code}")


# Parse the raw feed bytes as XML and collect every <entry> element.
soup = BeautifulSoup(response.content, features="xml")
entries = soup.find_all(name="entry")
print(f"Found {len(entries)} entries in the feed.")

# Dump the first entry (if there is one) so the tag layout can be inspected
# before the entries are processed.
if len(entries) > 0:
    print("Example entry structure:", entries[0].prettify(), sep="\n")

# Download the enclosure of every feed entry into `download_dir` and collect
# one record per successfully saved document in `registry`.
registry = []

# Seconds to wait on connect/read for a single resource, so that one stalled
# response cannot hang the whole run.
download_timeout = 60

for entry in entries:
    id_tag = entry.find("id")
    title_tag = entry.find("title")
    updated_tag = entry.find("updated")

    # Without an <id> there is nothing to derive the GUID from.
    if id_tag is None:
        continue

    doc_id_raw = id_tag.text.strip()
    title = title_tag.text.strip() if title_tag is not None else "No Title"
    updated = updated_tag.text.strip() if updated_tag is not None else ""

    # The GUID is the first 36-character run of hex digits/hyphens in the id.
    match = re.search(r'([0-9a-fA-F\-]{36})', doc_id_raw)
    guid = match.group(1) if match else None
    if not guid:
        continue

    # The downloadable resource is advertised as the rel="enclosure" link.
    enclosure_link = entry.find("link", {"rel": "enclosure"})
    if enclosure_link is not None and enclosure_link.get("href"):
        resource_url = enclosure_link["href"]
    else:
        print(f"No enclosure link found for GUID {guid}")
        continue

    print(f"Downloading: {title} (GUID: {guid})")
    filepath = None
    try:
        # The context manager closes the streamed response on every path
        # (non-200 status, error mid-transfer, success), so the underlying
        # connection is always released.
        with requests.get(
            resource_url, headers=headers, stream=True, timeout=download_timeout
        ) as file_resp:
            if file_resp.status_code != 200:
                print(f"Failed to download {guid}. Status code: {file_resp.status_code}")
                continue

            # Only PDFs get a meaningful extension; everything else is ".bin".
            content_type = file_resp.headers.get("Content-Type", "").lower()
            ext = ".pdf" if "pdf" in content_type else ".bin"

            # Replace characters that are unsafe in file names with "_".
            safe_title = "".join(c if c.isalnum() or c in " _-" else "_" for c in title)
            filename = f"{safe_title}_{guid}{ext}"
            filepath = os.path.join(download_dir, filename)

            # Stream the body to disk in chunks instead of loading it in memory.
            with open(filepath, "wb") as f:
                for chunk in file_resp.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
    except (requests.RequestException, OSError) as exc:
        # A single failed transfer must not abort the run and discard the
        # registry built so far: report it, drop any partial file, move on.
        print(f"Failed to download {guid}: {exc}")
        if filepath is not None and os.path.exists(filepath):
            os.remove(filepath)
        continue

    print(f"Saved to {filepath}")

    registry.append({
        "guid": guid,
        "title": title,
        "updated": updated,
        "resource_url": resource_url,
        "local_path": filepath
    })

# Write the registry of downloaded documents as a CSV file inside the
# download folder, one row per record and without the DataFrame index.
registry_csv = os.path.join(download_dir, "document_registry.csv")
registry_df = pd.DataFrame(registry)
registry_df.to_csv(path_or_buf=registry_csv, index=False)
print(f"\nRegistry saved to {registry_csv}")


Found 250 entries in the feed.
Example entry structure:
<entry>
 <title>
  dae54112-03da-4529-a3cf-5c95ef30012c
 </title>
 <id>
  https://gegevensmagazijn.tweedekamer.nl/SyncFeed/2.0/Entiteiten/dae54112-03da-4529-a3cf-5c95ef30012c
 </id>
 <author>
  <name>
   Tweede Kamer der Staten-Generaal
  </name>
 </author>
 <updated>
  2019-04-01T09:13:31.0870000Z
 </updated>
 <category term="verslag"/>
 <link href="https://gegevensmagazijn.tweedekamer.nl/SyncFeed/2.0/Resources/dae54112-03da-4529-a3cf-5c95ef30012c" rel="enclosure"/>
 <link href="https://gegevensmagazijn.tweedekamer.nl/SyncFeed/2.0/Feed?skiptoken=32139" rel="next"/>
 <content type="application/xml">
  <ns1:verslag id="dae54112-03da-4529-a3cf-5c95ef30012c" ns1:bijgewerkt="2019-03-28T23:06:33.972872+01:00" ns1:contentLength="1450742" ns1:contentType="text/xml" ns1:verwijderd="false" xmlns:ns1="http://www.tweedekamer.nl/xsd/tkData/v1-0">
   <ns1:vergadering ref="23a95a4b-549d-4f91-a9e3-336d55a4a3ef" xmlns:xsi="http://www.w3.org/2001/