In [1]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import time

In [3]:

# Crawl the Congreso open-data "votaciones" page one calendar day at a time
# and collect the URL of every .zip archive linked from it.
base_url = "https://www.congreso.es/es/opendata/votaciones"
legislatura = "XV"
start_date = datetime(2023, 8, 17)
end_date = datetime(2025, 4, 21)

headers = {
    "User-Agent": "Mozilla/5.0"  # To avoid basic bot blocks
}

request_timeout = 30  # seconds; without it a stalled connection blocks the loop forever
request_delay = 1     # seconds to wait between requests, to avoid overwhelming the server

found_links = []      # archive URLs in discovery order (consumed by the download cell)
seen_links = set()    # guards against appending the same archive more than once

current = start_date
while current <= end_date:
    formatted_date = current.strftime("%d/%m/%Y")
    params = {
        "p_p_id": "votaciones",
        "p_p_lifecycle": "0",
        "p_p_state": "normal",
        "p_p_mode": "view",
        "targetLegislatura": legislatura,
        "targetDate": formatted_date
    }

    try:
        response = requests.get(
            base_url, params=params, headers=headers, timeout=request_timeout
        )
    except requests.RequestException as exc:
        # A transient network failure should cost one day, not the whole crawl.
        print(f"Request failed for {formatted_date}: {exc}")
        response = None

    if response is not None and response.status_code != 200:
        print(f"Error {response.status_code}")
    elif response is not None:
        soup = BeautifulSoup(response.text, "html.parser")
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if href.endswith(".zip"):
                # hrefs may be relative; resolve them against the page URL.
                full_link = requests.compat.urljoin(base_url, href)
                if full_link not in seen_links:
                    seen_links.add(full_link)
                    print(f"✅ Found: {full_link}")
                    found_links.append(full_link)

    # Advance and throttle on every path, including the error paths, so a
    # struggling server is not hit with back-to-back retries.
    current += timedelta(days=1)
    time.sleep(request_delay)

✅ Found: https://www.congreso.es/webpublica/opendata/votaciones/Leg15/Sesion002/20230919/VOT_20230919152000.zip
✅ Found: https://www.congreso.es/webpublica/opendata/votaciones/Leg15/Sesion003/20230921/VOT_20230921120042.zip
✅ Found: https://www.congreso.es/webpublica/opendata/votaciones/Leg15/Sesion005/20230927/VOT_20230927125500.zip
✅ Found: https://www.congreso.es/webpublica/opendata/votaciones/Leg15/Sesion006/20230929/VOT_20230929125600.zip
✅ Found: https://www.congreso.es/webpublica/opendata/votaciones/Leg15/Sesion008/20231116/VOT_20231116125700.zip
✅ Found: https://www.congreso.es/webpublica/opendata/votaciones/Leg15/Sesion009/20231128/VOT_20231128183352.zip
✅ Found: https://www.congreso.es/webpublica/opendata/votaciones/Leg15/Sesion011/20231212/VOT_20231212230015.zip
✅ Found: https://www.congreso.es/webpublica/opendata/votaciones/Leg15/Sesion013/20231219/VOT_20231219233057.zip
✅ Found: https://www.congreso.es/webpublica/opendata/votaciones/Leg15/Sesion015/20240110/VOT_20240111114

In [4]:
# Download every archive in found_links into data/raw_zip, skipping files
# that are already present.
import os

# Creates data/raw_zip folder if it doesn't exist
os.makedirs("data/raw_zip", exist_ok=True)

failed_downloads = []  # links that could not be fetched in this run

# Iterate over the links and download the files if they don't exist
for link in found_links:
    filename = os.path.join("data/raw_zip", os.path.basename(link))
    if os.path.exists(filename):
        print(f"File already exists: {filename}")
        continue

    print(f"Downloading {filename}...")
    # Stream into a temporary name and rename only on success; otherwise an
    # interrupted download would leave a truncated .zip that the existence
    # check above would treat as complete on every later run.
    partial_name = filename + ".part"
    try:
        with requests.get(link, headers=headers, stream=True, timeout=30) as response:
            # Fail on 4xx/5xx instead of saving the error page as a .zip.
            response.raise_for_status()
            with open(partial_name, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_name, filename)
    except (requests.RequestException, OSError) as exc:
        if os.path.exists(partial_name):
            os.remove(partial_name)
        print(f"Failed to download {link}: {exc}")
        failed_downloads.append(link)
        continue

    print(f"✅ Downloaded: {filename}")

# Notify completion
if failed_downloads:
    print(f"{len(failed_downloads)} download(s) failed; re-run this cell to retry.")
else:
    print("All files downloaded successfully!")

Downloading data/raw_zip\VOT_20230919152000.zip...
✅ Downloaded: data/raw_zip\VOT_20230919152000.zip
Downloading data/raw_zip\VOT_20230921120042.zip...
✅ Downloaded: data/raw_zip\VOT_20230921120042.zip
Downloading data/raw_zip\VOT_20230927125500.zip...
✅ Downloaded: data/raw_zip\VOT_20230927125500.zip
Downloading data/raw_zip\VOT_20230929125600.zip...
✅ Downloaded: data/raw_zip\VOT_20230929125600.zip
Downloading data/raw_zip\VOT_20231116125700.zip...
✅ Downloaded: data/raw_zip\VOT_20231116125700.zip
Downloading data/raw_zip\VOT_20231128183352.zip...
✅ Downloaded: data/raw_zip\VOT_20231128183352.zip
Downloading data/raw_zip\VOT_20231212230015.zip...
✅ Downloaded: data/raw_zip\VOT_20231212230015.zip
Downloading data/raw_zip\VOT_20231219233057.zip...
✅ Downloaded: data/raw_zip\VOT_20231219233057.zip
Downloading data/raw_zip\VOT_20240111114807.zip...
✅ Downloaded: data/raw_zip\VOT_20240111114807.zip
Downloading data/raw_zip\VOT_20240116165740.zip...
✅ Downloaded: data/raw_zip\VOT_202401161

In [5]:
# Unzip the downloaded files into data/raw_json, one sub-folder per archive
# (named after the archive without its ".zip" suffix).
import glob
import os  # imported here as well so the cell does not rely on an earlier cell
import shutil
import zipfile

os.makedirs("data/raw_json", exist_ok=True)
for zip_file in sorted(glob.glob("data/raw_zip/*.zip")):
    target_dir = "data/raw_json/" + os.path.basename(zip_file)[:-4]
    if os.path.exists(target_dir):
        print(f"File already unzipped: {zip_file}")
        continue
    try:
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(target_dir)
    except (zipfile.BadZipFile, OSError) as exc:
        # Remove any partial extraction; otherwise the existence check above
        # would report this archive as already unzipped on the next run.
        shutil.rmtree(target_dir, ignore_errors=True)
        print(f"Could not unzip {zip_file}: {exc}")
        continue
    print(f"Unzipped: {zip_file}")

Unzipped: data/raw_zip\VOT_20230919152000.zip
Unzipped: data/raw_zip\VOT_20230921120042.zip
Unzipped: data/raw_zip\VOT_20230927125500.zip
Unzipped: data/raw_zip\VOT_20230929125600.zip
Unzipped: data/raw_zip\VOT_20231116125700.zip
Unzipped: data/raw_zip\VOT_20231128183352.zip
Unzipped: data/raw_zip\VOT_20231212230015.zip
Unzipped: data/raw_zip\VOT_20231219233057.zip
Unzipped: data/raw_zip\VOT_20240111114807.zip
Unzipped: data/raw_zip\VOT_20240116165740.zip
Unzipped: data/raw_zip\VOT_20240130183848.zip
Unzipped: data/raw_zip\VOT_20240206213117.zip
Unzipped: data/raw_zip\VOT_20240220203626.zip
Unzipped: data/raw_zip\VOT_20240222110726.zip
Unzipped: data/raw_zip\VOT_20240227205806.zip
Unzipped: data/raw_zip\VOT_20240229115032.zip
Unzipped: data/raw_zip\VOT_20240312204245.zip
Unzipped: data/raw_zip\VOT_20240314123708.zip
Unzipped: data/raw_zip\VOT_20240314135322.zip
Unzipped: data/raw_zip\VOT_20240319201817.zip
Unzipped: data/raw_zip\VOT_20240321105707.zip
Unzipped: data/raw_zip\VOT_2024040