In [1]:
# Local directory into which the downloaded DWD files are written.
output_folder = "data/DWD_RR_zip/"

In [2]:
import os
# Create the download directory (including missing parents); exist_ok=True
# means re-running this cell does not fail when the directory already exists.
os.makedirs(output_folder, exist_ok=True)

In [3]:
# #fancy way of having an extra file again for absolutely no reason.
# #Directory trees are created. Ignore errors if they already exist.
# file_name = "data/downloads/produkt_rr_stunde_20240524_20251124_01078.txt"
# file_CSV_name = "data/output/produkt_rr_stunde.csv"
# directory_name = "data"
# import os
# os.makedirs(file_name,exist_ok = True) # it does not complain if the dir already exists.

In [4]:

# URL of the DWD open-data directory, split into its configurable parts.
url_base = "https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/"
url_temporal_resolution = "hourly/"
url_parameter = "precipitation/"
url_subdir = "recent/"
# A URL is not a filesystem path: it always uses "/", whereas os.path.join uses
# the OS separator and throws away earlier parts when a later one starts with
# "/".  The parts are therefore joined explicitly.  Stripping the slashes first
# makes the result independent of how each part is written, and the trailing
# "/" is kept so that file names can be appended to the directory URL directly.
url_parts = (url_base, url_temporal_resolution, url_parameter, url_subdir)
url_full = "/".join(part.strip("/") for part in url_parts) + "/"


In [5]:
# Show the assembled URL as a quick visual check.
url_full

'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/precipitation/recent/'

In [6]:
def grab_file(file_url, download_dir, timeout=30):
    """Download ``file_url`` into ``download_dir`` and return the local path.

    The local file name is the last "/"-separated segment of the URL.

    Parameters
    ----------
    file_url : str
        Full URL of the file to download.
    download_dir : str
        Directory in which the file is saved; it must already exist.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    str
        Path of the file that was written.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx/5xx).
    requests.RequestException
        For connection problems or timeouts.
    """
    # get only the file name from the full url
    file_name = file_url.split("/")[-1]
    file_path = os.path.join(download_dir, file_name)
    # Fetch first and fail loudly on an HTTP error, so that neither an empty
    # file nor an HTML error page is ever saved under the data file's name.
    response = requests.get(file_url, timeout=timeout)
    response.raise_for_status()
    with open(file_path, "wb") as file:
        file.write(response.content)
    return file_path
        
    

In [7]:
import requests
from bs4 import BeautifulSoup
import re # to use regex expressions 


In [8]:

# Send an HTTP request to the URL (with a timeout so the cell cannot hang forever)
response = requests.get(url_full, timeout=30)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, "html.parser")
    # Look for the metadata file
    links = soup.find_all(href=re.compile("Beschreibung"))
    if not links:
        # Fail with a clear message instead of an IndexError on links[0]
        raise RuntimeError(f"No station description file found at {url_full}")
    # Take the url of the file
    file_name = links[0].get("href")
    # Download the file.  URLs always use "/", so the file name is appended
    # with an explicit "/" instead of os.path.join (which is meant for
    # filesystem paths).
    grab_file(url_full.rstrip("/") + "/" + file_name, output_folder)
    # The local path, in contrast, is a filesystem path and is built as one
    print(f"Downloaded: {os.path.join(output_folder, file_name)}")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

Downloaded: data/DWD_RR_zip/RR_Stundenwerte_Beschreibung_Stationen.txt


In [9]:
# get station path
file_path = os.path.join(output_folder, file_name)
# Read the header (first line) of the file and split it into column names.
# The with-block closes the file handle again instead of leaving it open.
with open(file_path, encoding="latin") as stations_file:
    header = stations_file.readline().split()
header

['Stations_id',
 'von_datum',
 'bis_datum',
 'Stationshoehe',
 'geoBreite',
 'geoLaenge',
 'Stationsname',
 'Bundesland',
 'Abgabe']

In [10]:
# Mapping from the German column names of the station file to the English
# names used in the rest of the notebook.
translate = {
    "Stations_id": "station_id",
    "von_datum": "date_from",
    "bis_datum": "date_to",
    "Stationshoehe": "altitude",
    "geoBreite": "latitude",
    "geoLaenge": "longitude",
    "Stationsname": "name",
    "Bundesland": "state",
}

In [11]:
import pandas as pd


In [12]:
# Load the fixed-width station list into a DataFrame.
read_options = dict(
    skiprows=2,  # skip the header line and the line below it
    names=header,  # column names come from the header line read earlier
    encoding="latin",
    parse_dates=["von_datum", "bis_datum"],
    dtype={"Stations_id": str},  # read the ids as text rather than numbers
    # index_col="Stations_id"
)
df_stations = pd.read_fwf(file_path, **read_options)
df_stations

Unnamed: 0,Stations_id,von_datum,bis_datum,Stationshoehe,geoBreite,geoLaenge,Stationsname,Bundesland,Abgabe
0,00003,1995-09-01,2011-04-01,202,50.7827,6.0941,Aachen,Nordrhein-Westfalen,Frei
1,00020,2004-08-14,2025-12-04,432,48.9219,9.9129,Abtsgmünd-Untergröningen,Baden-Württemberg,Frei
2,00029,2006-01-10,2025-12-04,260,49.7175,10.9101,Adelsdorf (Kläranlage),Bayern,Frei
3,00044,2007-04-01,2025-12-04,44,52.9336,8.2370,Großenkneten,Niedersachsen,Frei
4,00046,2006-01-01,2025-12-04,325,48.9450,12.4639,Aholfing,Bayern,Frei
...,...,...,...,...,...,...,...,...,...
1828,20052,2025-10-01,2025-12-04,96,51.3869,7.4119,Hagen-Werdringen,Nordrhein-Westfalen,Frei
1829,20098,2025-04-01,2025-12-04,1019,48.5651,8.2284,Seebach (Nationalpark Schwarzwald),Baden-Württemberg,Frei
1830,20107,2025-03-27,2025-12-04,311,49.9687,6.8272,Wittlich-Bergweiler,Rheinland-Pfalz,Frei
1831,20111,2025-04-11,2025-12-04,363,49.8495,6.5241,Eisenach/Eifel,Rheinland-Pfalz,


In [13]:
# Replace the German column names with their English equivalents.
# rename() returns a new DataFrame and leaves the original untouched, so the
# result is assigned back to the name; this is the recommended alternative to
# inplace=True and keeps the step usable inside a method chain.
df_stations = df_stations.rename(columns=translate)

In [14]:
# Keep only the stations located in Nordrhein-Westfalen (NRW).
# NOTE(review): the restriction to NRW is assumed to be a requirement - confirm.
nrw_condition = "state == 'Nordrhein-Westfalen' "
df_stations_NRW = df_stations.query(nrw_condition)

In [15]:
#and date_to >= 2025  # optional extra condition for the query above: add it to keep only the stations that are still running (e.g. up to August)

In [16]:
# Display the NRW subset as a visual check of the filter.
df_stations_NRW

Unnamed: 0,station_id,date_from,date_to,altitude,latitude,longitude,name,state,Abgabe
0,00003,1995-09-01,2011-04-01,202,50.7827,6.0941,Aachen,Nordrhein-Westfalen,Frei
38,00216,2004-10-01,2025-12-04,298,51.1143,7.8807,Attendorn-Neulisternohl,Nordrhein-Westfalen,Frei
63,00326,2004-07-01,2013-07-08,120,51.7204,8.0577,Beckum-Unterberg,Nordrhein-Westfalen,Frei
75,00389,2009-11-01,2025-12-04,436,51.0148,8.4318,"Berleburg, Bad-Arfeld",Nordrhein-Westfalen,Frei
76,00390,2004-07-01,2025-12-04,611,50.9837,8.3683,"Berleburg, Bad-Stünzel",Nordrhein-Westfalen,Frei
...,...,...,...,...,...,...,...,...,...
1824,20047,2025-10-01,2025-12-04,127,51.3444,7.4118,Hagen-Haspe/Ennepe,Nordrhein-Westfalen,Frei
1825,20048,2025-10-01,2025-12-04,94,51.3951,7.4510,Hagen-Hengstey,Nordrhein-Westfalen,Frei
1826,20049,2025-10-01,2025-12-04,140,51.3403,7.5732,Hagen-Nahmer,Nordrhein-Westfalen,Frei
1827,20051,2025-10-01,2025-12-04,176,51.2819,7.5309,Hagen-Rummenohl/Volmetal,Nordrhein-Westfalen,Frei


In [17]:
# Collect the download links of the selected stations: one pattern
# "RR_<station_id>" per NRW station, and every anchor of the directory listing
# whose href matches any of these patterns is kept.
station_patterns = [
    re.compile("RR_" + station_id)
    for station_id in df_stations_NRW["station_id"]
]
links = soup.find_all(href=station_patterns)
links



[<a href="stundenwerte_RR_00216_akt.zip">stundenwerte_RR_00216_akt.zip</a>,
 <a href="stundenwerte_RR_00389_akt.zip">stundenwerte_RR_00389_akt.zip</a>,
 <a href="stundenwerte_RR_00390_akt.zip">stundenwerte_RR_00390_akt.zip</a>,
 <a href="stundenwerte_RR_00554_akt.zip">stundenwerte_RR_00554_akt.zip</a>,
 <a href="stundenwerte_RR_00555_akt.zip">stundenwerte_RR_00555_akt.zip</a>,
 <a href="stundenwerte_RR_00603_akt.zip">stundenwerte_RR_00603_akt.zip</a>,
 <a href="stundenwerte_RR_00613_akt.zip">stundenwerte_RR_00613_akt.zip</a>,
 <a href="stundenwerte_RR_00617_akt.zip">stundenwerte_RR_00617_akt.zip</a>,
 <a href="stundenwerte_RR_00644_akt.zip">stundenwerte_RR_00644_akt.zip</a>,
 <a href="stundenwerte_RR_00796_akt.zip">stundenwerte_RR_00796_akt.zip</a>,
 <a href="stundenwerte_RR_00871_akt.zip">stundenwerte_RR_00871_akt.zip</a>,
 <a href="stundenwerte_RR_00902_akt.zip">stundenwerte_RR_00902_akt.zip</a>,
 <a href="stundenwerte_RR_00934_akt.zip">stundenwerte_RR_00934_akt.zip</a>,
 <a href="st

In [18]:
import tqdm


In [19]:
# Download the archive of every selected station.  Each file is handled on its
# own, so that one failed download neither stops the remaining ones nor goes
# unnoticed.
failed_downloads = []
# iterate through the list
for link in tqdm.tqdm(links):
    # Take the url of the file
    file_name = link.get("href")
    try:
        # Download the file; URLs always use "/", so the name is appended with
        # an explicit "/" instead of os.path.join
        grab_file(url_full.rstrip("/") + "/" + file_name, output_folder)
    except (requests.RequestException, OSError) as error:
        # Network problems and local write errors are reported per file;
        # anything else (e.g. KeyboardInterrupt) propagates as usual instead of
        # being swallowed by a bare except.
        failed_downloads.append(file_name)
        print(f"Failed to download {file_name}: {error}")

# Only claim success when every file was actually downloaded.
if failed_downloads:
    print(f"Download finished with {len(failed_downloads)} failed file(s)")
else:
    print("Download complete")

100%|████████████████████████████████████████████████████████████████████████████████| 103/103 [00:12<00:00,  8.49it/s]

Download complete



