In [6]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import time

In [None]:

# Collect the links to the voting-record ZIP archives published on the
# Congreso open-data page, querying the page once per calendar day.
base_url = "https://www.congreso.es/es/opendata/votaciones"
legislatura = "XV"
start_date = datetime(2025, 1, 1)
end_date = datetime(2025, 4, 21)

headers = {
    "User-Agent": "Mozilla/5.0"  # To avoid basic bot blocks
}

# Seconds to wait on a single request before giving up. Without a timeout,
# requests waits indefinitely and one stalled connection hangs the whole loop.
request_timeout = 30
# Seconds to pause between consecutive requests, to avoid overwhelming the server.
request_delay = 5

found_links = []

current = start_date
while current <= end_date:
    formatted_date = current.strftime("%d/%m/%Y")
    params = {
        "p_p_id": "votaciones",
        "p_p_lifecycle": "0",
        "p_p_state": "normal",
        "p_p_mode": "view",
        "targetLegislatura": legislatura,
        "targetDate": formatted_date
    }

    try:
        response = requests.get(
            base_url, params=params, headers=headers, timeout=request_timeout
        )
    except requests.RequestException as exc:
        # A transient network failure on one day must not discard the links
        # already collected for the other days.
        print(f"Request failed for {formatted_date}: {exc}")
        response = None

    if response is not None and response.status_code != 200:
        print(f"Error {response.status_code}")
    elif response is not None:
        soup = BeautifulSoup(response.text, "html.parser")
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if href.endswith(".zip"):
                # hrefs may be relative; resolve them against the page URL.
                full_link = requests.compat.urljoin(base_url, href)
                # Skip links already recorded so each archive is listed once.
                if full_link not in found_links:
                    print(f"✅ Found: {full_link}")
                    found_links.append(full_link)

    current += timedelta(days=1)
    # Sleep on every iteration, including failed ones, so that an erroring
    # server is not hit again immediately.
    time.sleep(request_delay)

✅ Found: https://www.congreso.es/webpublica/opendata/votaciones/Leg15/Sesion089/20250122/VOT_20250122154424.zip
✅ Found: https://www.congreso.es/webpublica/opendata/votaciones/Leg15/Sesion090/20250211/VOT_20250211202646.zip
✅ Found: https://www.congreso.es/webpublica/opendata/votaciones/Leg15/Sesion091/20250212/VOT_20250212154035.zip
✅ Found: https://www.congreso.es/webpublica/opendata/votaciones/Leg15/Sesion092/20250218/VOT_20250218194303.zip
✅ Found: https://www.congreso.es/webpublica/opendata/votaciones/Leg15/Sesion093/20250219/VOT_20250219135528.zip
✅ Found: https://www.congreso.es/webpublica/opendata/votaciones/Leg15/Sesion094/20250225/VOT_20250225201643.zip
✅ Found: https://www.congreso.es/webpublica/opendata/votaciones/Leg15/Sesion095/20250226/VOT_20250226151614.zip
✅ Found: https://www.congreso.es/webpublica/opendata/votaciones/Leg15/Sesion096/20250311/VOT_20250311203731.zip
✅ Found: https://www.congreso.es/webpublica/opendata/votaciones/Leg15/Sesion098/20250313/VOT_20250313133

In [16]:
# Download every archive listed in found_links into data/raw_zip,
# skipping the ones that are already present on disk.
import os
os.makedirs("data/raw_zip", exist_ok=True)

failed_downloads = []
for link in found_links:
    filename = os.path.join("data/raw_zip", os.path.basename(link))
    if os.path.exists(filename):
        print(f"File already exists: {filename}")
        continue

    print(f"Downloading {filename}...")
    # Stream into a temporary ".part" file and rename it only once the
    # transfer is complete. Otherwise an interrupted download would leave a
    # truncated .zip that the existence check above skips on every later run.
    partial_path = filename + ".part"
    try:
        with requests.get(link, headers=headers, stream=True, timeout=30) as response:
            # Fail on HTTP errors instead of saving an error page as a .zip.
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, filename)
    except (requests.RequestException, OSError) as exc:
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"Download failed for {link}: {exc}")
        failed_downloads.append(link)
        continue
    print(f"✅ Downloaded: {filename}")

# Notify completion, reporting success only if nothing failed.
if failed_downloads:
    print(f"{len(failed_downloads)} file(s) could not be downloaded.")
else:
    print("All files downloaded successfully!")

Downloading data/raw_zip\VOT_20250122154424.zip...
✅ Downloaded: data/raw_zip\VOT_20250122154424.zip
Downloading data/raw_zip\VOT_20250211202646.zip...
✅ Downloaded: data/raw_zip\VOT_20250211202646.zip
Downloading data/raw_zip\VOT_20250212154035.zip...
✅ Downloaded: data/raw_zip\VOT_20250212154035.zip
Downloading data/raw_zip\VOT_20250218194303.zip...
✅ Downloaded: data/raw_zip\VOT_20250218194303.zip
Downloading data/raw_zip\VOT_20250219135528.zip...
✅ Downloaded: data/raw_zip\VOT_20250219135528.zip
Downloading data/raw_zip\VOT_20250225201643.zip...
✅ Downloaded: data/raw_zip\VOT_20250225201643.zip
Downloading data/raw_zip\VOT_20250226151614.zip...
✅ Downloaded: data/raw_zip\VOT_20250226151614.zip
File already exists: data/raw_zip\VOT_20250311203731.zip
File already exists: data/raw_zip\VOT_20250313133849.zip
File already exists: data/raw_zip\VOT_20250318201319.zip
File already exists: data/raw_zip\VOT_20250320125659.zip
File already exists: data/raw_zip\VOT_20250325212229.zip
File alr

In [17]:
# Unzip the downloaded files into data/raw_json folder
import glob
import os
import shutil
import zipfile


def extract_archive(zip_path, dest_root):
    """Extract one ZIP archive into its own sub-folder of ``dest_root``.

    The sub-folder is named after the archive without its extension
    (``VOT_x.zip`` -> ``dest_root/VOT_x``).

    Args:
        zip_path: Path of the ``.zip`` file to extract.
        dest_root: Existing directory that receives the per-archive folder.

    Returns:
        True if the archive was extracted, False if its target folder
        already existed and the archive was therefore skipped.

    Raises:
        zipfile.BadZipFile: If ``zip_path`` is not a valid ZIP archive.
        OSError: If reading the archive or writing the files fails.
    """
    folder_name = os.path.splitext(os.path.basename(zip_path))[0]
    target_dir = os.path.join(dest_root, folder_name)
    if os.path.exists(target_dir):
        return False

    # Extract into a staging folder and rename it at the end, so that a failed
    # extraction never leaves a half-filled folder that would later be
    # mistaken for a finished one.
    staging_dir = target_dir + ".partial"
    shutil.rmtree(staging_dir, ignore_errors=True)  # leftovers from an aborted run
    try:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(staging_dir)
    except Exception:
        shutil.rmtree(staging_dir, ignore_errors=True)
        raise
    os.replace(staging_dir, target_dir)
    return True


os.makedirs("data/raw_json", exist_ok=True)
for zip_file in sorted(glob.glob("data/raw_zip/*.zip")):
    try:
        extracted = extract_archive(zip_file, "data/raw_json")
    except (zipfile.BadZipFile, OSError) as exc:
        # One corrupt archive must not stop the remaining ones from being unzipped.
        print(f"Could not unzip {zip_file}: {exc}")
        continue
    if extracted:
        print(f"Unzipped: {zip_file}")
    else:
        print(f"File already unzipped: {zip_file}")

Unzipped: data/raw_zip\VOT_20250122154424.zip
Unzipped: data/raw_zip\VOT_20250211202646.zip
Unzipped: data/raw_zip\VOT_20250212154035.zip
Unzipped: data/raw_zip\VOT_20250218194303.zip
Unzipped: data/raw_zip\VOT_20250219135528.zip
Unzipped: data/raw_zip\VOT_20250225201643.zip
Unzipped: data/raw_zip\VOT_20250226151614.zip
File already unzipped: data/raw_zip\VOT_20250311203731.zip
File already unzipped: data/raw_zip\VOT_20250313133849.zip
File already unzipped: data/raw_zip\VOT_20250318201319.zip
File already unzipped: data/raw_zip\VOT_20250320125659.zip
File already unzipped: data/raw_zip\VOT_20250325212229.zip
File already unzipped: data/raw_zip\VOT_20250326194433.zip
Unzipped: data/raw_zip\VOT_20250408213856.zip
Unzipped: data/raw_zip\VOT_20250410155135.zip
