## 1. About the DWD Open Data Portal 

The data of the Climate Data Center (CDC) of the DWD (Deutscher Wetterdienst, German Weather Service) is provided on an **FTP server**. <br> **FTP** stands for _File Transfer Protocol_.

Open the FTP link ftp://opendata.dwd.de/climate_environment/CDC/ in your browser (copy-paste) and find our how it is structured hierarchically.

You can also open the link with **HTTPS** (Hypertext Transfer Protocol Secure): https://opendata.dwd.de/climate_environment/CDC/

We are interested in downloading the metadata of daily precipitation to get information related to their stations

In [1]:
import requests
from bs4 import BeautifulSoup
import os
import re # to use regex expressions 
import tqdm
import pandas as pd
import geopandas as gpd

# URL of the DWD website
url_base = "https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/"
url_temporal_resolution = "daily/"
url_parameter = "more_precip/"
url_subdir = "historical/"
url_full = os.path.join(url_base, url_temporal_resolution, url_parameter, url_subdir)

# Directory to save the downloaded files
download_dir = "../data/original/dwd/" +  url_temporal_resolution + url_parameter + url_subdir

# Create the directory if it doesn't exist
if not os.path.exists(download_dir):
    os.makedirs(download_dir)

print("download dir: ", download_dir)

# Directory and name to save the CSV file to create the layer for sub-task 1.2
outfname = r"../data/Selected_Stations_Nordrhein_Westfalen.csv"

download dir:  ../data/original/dwd/daily/more_precip/historical/


In [2]:
url_full

'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/daily/more_precip/historical/'

In [3]:
def grab_file(file_url, download_dir):
        # get only the file name from the full url
        file_name = file_url.split("/")[-1]
        # Download the file
        file_path =os.path.join(download_dir, file_name)
        with open(file_path, "wb") as file:
            file.write(requests.get(file_url).content)
        
    

In [4]:
# Send an HTTP request to the URL
response = requests.get(url_full)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, "html.parser")
    # Look for the metadata file
    links = soup.find_all(href=re.compile("Beschreibung"))
    # Take the url of the file
    file_name = links[0].get("href")
    # Download the file
    grab_file(os.path.join(url_full, file_name), download_dir)
    print(f"Downloaded: {download_dir+file_name}")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

Downloaded: ../data/original/dwd/daily/more_precip/historical/RR_Tageswerte_Beschreibung_Stationen.txt


In [5]:
# get station path
file_path = os.path.join(download_dir,file_name)
# read the header of the file
header = open(file_path, encoding="latin").readline().split()
header

['Stations_id',
 'von_datum',
 'bis_datum',
 'Stationshoehe',
 'geoBreite',
 'geoLaenge',
 'Stationsname',
 'Bundesland',
 'Abgabe']

In [6]:
# translation dictionary
translate = \
{'Stations_id':'station_id',
 'von_datum':'date_from',
 'bis_datum':'date_to',
 'Stationshoehe':'altitude',
 'geoBreite': 'latitude',
 'geoLaenge': 'longitude',
 'Stationsname':'name',
 'Bundesland':'state'}

In [7]:
#pd.read_csv?

In [8]:
df_stations_2 = pd.read_fwf(file_path,
                          skiprows=[0,1],
                          names=translate,
                          encoding="latin", 
                          parse_dates=["von_datum","bis_datum"],
                          dtype={"Stations_id":str}
                          #index_col="Stations_id"
                         )
df_stations_2

  df_stations_2 = pd.read_fwf(file_path,


Unnamed: 0,Stations_id,von_datum,bis_datum,Stationshoehe,geoBreite,geoLaenge,Stationsname,Bundesland
1,19120101,1986-06-30,478,47.8413,8.8493,Aach,Baden-Württemberg,Frei
2,19510101,2006-12-31,138,50.8066,6.0996,Aachen (Kläranlage),Nordrhein-Westfalen,Frei
3,18910101,2011-03-31,202,50.7827,6.0941,Aachen,Nordrhein-Westfalen,Frei
4,19510101,1979-10-31,243,50.7683,6.1207,Aachen-Brand,Nordrhein-Westfalen,Frei
6,19821101,2024-09-22,455,48.8361,10.0598,Aalen-Unterrombach,Baden-Württemberg,Frei
...,...,...,...,...,...,...,...,...
19898,20240101,2024-09-22,39,52.4970,13.2820,Berlin-Halensee,Berlin,Frei
19911,19631101,1986-09-30,215,50.9625,10.0663,Gerstungen/Thüringen,Thüringen,Frei
19917,20240325,2024-09-22,153,49.9321,8.0767,Schwabenheim a.d. Selz,Rheinland-Pfalz,
19993,18970507,1935-12-31,4,53.6870,9.6757,Uetersen,Schleswig-Holstein,Frei


In [9]:
# read the stations dataframe
df_stations = pd.read_fwf(file_path,
                          skiprows=2,
                          names=header,
                          encoding="latin", 
                          parse_dates=["von_datum","bis_datum"],
                          dtype={"Stations_id":str} 
                          #index_col="Stations_id"
                         )
df_stations

Unnamed: 0,Stations_id,von_datum,bis_datum,Stationshoehe,geoBreite,geoLaenge,Stationsname,Bundesland,Abgabe
0,00001,1912-01-01,1986-06-30,478,47.8413,8.8493,Aach,Baden-Württemberg,Frei
1,00002,1951-01-01,2006-12-31,138,50.8066,6.0996,Aachen (Kläranlage),Nordrhein-Westfalen,Frei
2,00003,1891-01-01,2011-03-31,202,50.7827,6.0941,Aachen,Nordrhein-Westfalen,Frei
3,00004,1951-01-01,1979-10-31,243,50.7683,6.1207,Aachen-Brand,Nordrhein-Westfalen,Frei
4,00006,1982-11-01,2024-09-22,455,48.8361,10.0598,Aalen-Unterrombach,Baden-Württemberg,Frei
...,...,...,...,...,...,...,...,...,...
6487,19898,2024-01-01,2024-09-22,39,52.4970,13.2820,Berlin-Halensee,Berlin,Frei
6488,19911,1963-11-01,1986-09-30,215,50.9625,10.0663,Gerstungen/Thüringen,Thüringen,Frei
6489,19917,2024-03-25,2024-09-22,153,49.9321,8.0767,Schwabenheim a.d. Selz,Rheinland-Pfalz,
6490,19993,1897-05-07,1935-12-31,4,53.6870,9.6757,Uetersen,Schleswig-Holstein,Frei


Check all the different values in the "state" column. You can use the function <code>.unique()</code> for this.

In [10]:
df_stations.rename(columns=translate,inplace=True)

In [11]:
df_stations.loc[:,"state"].unique()

array(['Baden-Württemberg', 'Nordrhein-Westfalen', 'Hessen', 'Bayern',
       'Niedersachsen', 'Sachsen-Anhalt', 'Rheinland-Pfalz', 'Sachsen',
       'Mecklenburg-Vorpommern', 'Schleswig-Holstein', 'Brandenburg',
       'Thüringen', 'Saarland', 'Berlin', 'Bremen', 'Hamburg', 'Tirol'],
      dtype=object)

Select only stations in NRW which are still active (date_to is later than 2023) and which starting recording information at least in 1950.

In [12]:
#df_stations.query?

In [13]:
# filter stations only in Nordrhein-Westfalen which are active and older than 1950
df_stations_short = df_stations.query("state == 'Nordrhein-Westfalen' and date_to >= 2023 and date_from < 1950")

In [14]:
df_stations_short

Unnamed: 0,station_id,date_from,date_to,altitude,latitude,longitude,name,state,Abgabe
75,00079,1931-01-01,2024-09-20,160,50.6718,7.0155,Alfter-Volmershoven,Nordrhein-Westfalen,Frei
104,00110,1931-01-01,2024-09-22,65,52.0487,7.4877,Altenberge,Nordrhein-Westfalen,Frei
174,00187,1941-01-01,2024-09-22,265,51.4188,7.9126,Arnsberg-Holzen,Nordrhein-Westfalen,Frei
292,00325,1941-01-01,2024-09-22,127,51.7948,8.0718,Beckum-Vellern,Nordrhein-Westfalen,Frei
349,00389,1931-01-01,2024-09-22,436,51.0148,8.4318,"Berleburg, Bad-Arfeld",Nordrhein-Westfalen,Frei
...,...,...,...,...,...,...,...,...,...
5104,05579,1941-01-01,2024-09-22,273,51.5892,9.0296,Willebadessen-Borlinghausen,Nordrhein-Westfalen,Frei
5143,05619,1931-01-01,2024-09-22,360,51.1637,7.4234,Wipperfürth-Gardeweg,Nordrhein-Westfalen,Frei
5218,05699,1941-01-01,2024-09-22,312,51.5427,8.7784,Wünnenberg-Eilern,Nordrhein-Westfalen,Frei
5236,05717,1937-01-01,2024-09-22,134,51.2256,7.1052,Wuppertal-Buchenhofen,Nordrhein-Westfalen,Frei


In [15]:
# get the links. 
links = soup.find_all(href=[re.compile("RR_"+x) for x in df_stations_short.loc[:,"station_id"]])
links



[<a href="tageswerte_RR_00079_19310101_20231231_hist.zip">tageswerte_RR_00079_19310101_20231231_hist.zip</a>,
 <a href="tageswerte_RR_00110_19310101_20231231_hist.zip">tageswerte_RR_00110_19310101_20231231_hist.zip</a>,
 <a href="tageswerte_RR_00187_19410101_20231231_hist.zip">tageswerte_RR_00187_19410101_20231231_hist.zip</a>,
 <a href="tageswerte_RR_00325_19410101_20231231_hist.zip">tageswerte_RR_00325_19410101_20231231_hist.zip</a>,
 <a href="tageswerte_RR_00389_19310101_20231231_hist.zip">tageswerte_RR_00389_19310101_20231231_hist.zip</a>,
 <a href="tageswerte_RR_00488_19410101_20231231_hist.zip">tageswerte_RR_00488_19410101_20231231_hist.zip</a>,
 <a href="tageswerte_RR_00554_19460101_20231231_hist.zip">tageswerte_RR_00554_19460101_20231231_hist.zip</a>,
 <a href="tageswerte_RR_00555_19310101_20231231_hist.zip">tageswerte_RR_00555_19310101_20231231_hist.zip</a>,
 <a href="tageswerte_RR_00613_19410101_20231231_hist.zip">tageswerte_RR_00613_19410101_20231231_hist.zip</a>,
 <a href="

In [16]:
#soup

In [17]:
try:
    # iterate through the list
    for link in tqdm.tqdm(links):
        # Take the url of the file
        file_name = link.get("href")
        # Download the file
        grab_file(os.path.join(url_full, file_name), download_dir)
    
except:
    print("Failed to download")

print("Download complete")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 83/83 [01:05<00:00,  1.27it/s]

Download complete





extract one of the zip files to look at the content. Identify which file contains the data you are interested in.

In [18]:
import glob
zip_list = glob.glob(download_dir+"*.zip")
zip_list

['../data/original/dwd/daily/more_precip/historical\\tageswerte_RR_00079_19310101_20231231_hist.zip',
 '../data/original/dwd/daily/more_precip/historical\\tageswerte_RR_00110_19310101_20231231_hist.zip',
 '../data/original/dwd/daily/more_precip/historical\\tageswerte_RR_00187_19410101_20231231_hist.zip',
 '../data/original/dwd/daily/more_precip/historical\\tageswerte_RR_00325_19410101_20231231_hist.zip',
 '../data/original/dwd/daily/more_precip/historical\\tageswerte_RR_00389_19310101_20231231_hist.zip',
 '../data/original/dwd/daily/more_precip/historical\\tageswerte_RR_00488_19410101_20231231_hist.zip',
 '../data/original/dwd/daily/more_precip/historical\\tageswerte_RR_00554_19460101_20231231_hist.zip',
 '../data/original/dwd/daily/more_precip/historical\\tageswerte_RR_00555_19310101_20231231_hist.zip',
 '../data/original/dwd/daily/more_precip/historical\\tageswerte_RR_00613_19410101_20231231_hist.zip',
 '../data/original/dwd/daily/more_precip/historical\\tageswerte_RR_00617_19410101_

In [19]:
from zipfile import ZipFile
# example of the files inside the first zip file
with ZipFile(zip_list[0]) as myzip:
    print(myzip.namelist())

['Metadaten_Stationsname_Betreibername_00079.html', 'Metadaten_Stationsname_Betreibername_00079.txt', 'Metadaten_Parameter_nieder_tag_00079.html', 'Metadaten_Parameter_nieder_tag_00079.txt', 'Metadaten_Geraete_Neuschneehoehe_00079.html', 'Metadaten_Geraete_Neuschneehoehe_00079.txt', 'Metadaten_Geraete_Niederschlagshoehe_00079.html', 'Metadaten_Geraete_Niederschlagshoehe_00079.txt', 'Metadaten_Geraete_Schneehoehe_00079.html', 'Metadaten_Geraete_Schneehoehe_00079.txt', 'Metadaten_Geographie_00079.txt', 'Metadaten_Fehldaten_00079_19310101_20231231.html', 'Metadaten_Fehldaten_00079_19310101_20231231.txt', 'Metadaten_Fehlwerte_00079_19310101_20231231.txt', 'produkt_nieder_tag_19310101_20231231_00079.txt']


In [20]:
# use the name pattern to get the file name
with ZipFile(zip_list[0]) as myzip:
    prod_filename = [name for name in myzip.namelist() if name.split("_")[0]=="produkt"][0] 
    print(prod_filename)

produkt_nieder_tag_19310101_20231231_00079.txt


In [21]:
# Read one of the files as example

with ZipFile(zip_list[0]) as myzip:
    prod_filename = [name for name in myzip.namelist() if name.split("_")[0]=="produkt"][0] 
    
    #open just the product file within archive
    with myzip.open(prod_filename) as myfile:
    # read the time series data in a temporal dataframe
        df_prec = pd.read_csv(myfile, 
                      sep=";", 
                      parse_dates = ["MESS_DATUM"], 
                      index_col = "MESS_DATUM", 
                      na_values = [-999.0],
                    dtype={'STATIONS_ID':int} #easier to be integer because of the different format
                         )
df_prec.head()

Unnamed: 0_level_0,STATIONS_ID,QN_6,RS,RSF,SH_TAG,NSH_TAG,eor
MESS_DATUM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1931-01-01,79,5,5.8,8.0,,,eor
1931-01-02,79,5,3.3,1.0,,,eor
1931-01-03,79,5,5.4,8.0,,,eor
1931-01-04,79,5,0.2,1.0,,,eor
1931-01-05,79,5,2.0,8.0,,,eor


Now repeat the example with all the files in the ziplist. And join them in a dataframe

In [22]:
df_prec.columns

Index(['STATIONS_ID', 'QN_6', '  RS', ' RSF', 'SH_TAG', 'NSH_TAG', 'eor'], dtype='object')

In [23]:
df_prec =  df_prec[['  RS','STATIONS_ID' ]]
df_prec

Unnamed: 0_level_0,RS,STATIONS_ID
MESS_DATUM,Unnamed: 1_level_1,Unnamed: 2_level_1
1931-01-01,5.8,79
1931-01-02,3.3,79
1931-01-03,5.4,79
1931-01-04,0.2,79
1931-01-05,2.0,79
...,...,...
2023-12-27,0.0,79
2023-12-28,0.0,79
2023-12-29,0.4,79
2023-12-30,0.0,79


In [34]:
df_prec[(df_prec.index.year >= 1950)]

Unnamed: 0_level_0,RS,STATIONS_ID
MESS_DATUM,Unnamed: 1_level_1,Unnamed: 2_level_1
1950-01-01,0.0,79
1950-01-02,13.0,79
1950-01-03,1.0,79
1950-01-04,2.8,79
1950-01-05,0.8,79
...,...,...
2023-12-27,0.0,79
2023-12-28,0.0,79
2023-12-29,0.4,79
2023-12-30,0.0,79


In [41]:
# create an empty dataFrame to merge the temperature data to
df_prec_long = pd.DataFrame()
# iterate through the zipfiles
for zip_file in zip_list:
    with ZipFile(zip_file) as myzip:
        #we are only interested in the file starting with 'produkt_'
        prod_filename = [name for name in myzip.namelist() if name.split("_")[0]=="produkt"][0] 
        
        #open just the product file within archive
        with myzip.open(prod_filename) as myfile:
            # read the time series data in a temporal dataframe
            df_dummy = pd.read_csv(myfile, 
                                  sep=";", 
                                  parse_dates = ["MESS_DATUM"], 
                                  index_col = "MESS_DATUM", 
                                  na_values = [-999.0],
                                  dtype={"STATIONS_ID":int}
                                 )
            
            
            df_dummy =  df_dummy[['  RS','STATIONS_ID' ]]
            # Only interested in the daily precipitation height parameter

            df_dummy = df_dummy[(df_dummy.index.year >= 1950)] #readings only after 1950
            
            if df_prec_long.empty:
                df_prec_long = df_dummy
            else:
                df_prec_long = pd.concat([df_prec_long,df_dummy])
            

In [42]:
df_prec_long.head()

Unnamed: 0_level_0,RS,STATIONS_ID
MESS_DATUM,Unnamed: 1_level_1,Unnamed: 2_level_1
1950-01-01,0.0,79
1950-01-02,13.0,79
1950-01-03,1.0,79
1950-01-04,2.8,79
1950-01-05,0.8,79


In [43]:
translate2 = \
{'STATIONS_ID':'station_id',
 '  RS':'prec'}

In [44]:
df_prec_long.rename(columns=translate2,inplace=True)

In [45]:
df_prec_long.index.rename("date_day", inplace = True)

In [46]:
df_prec_long

Unnamed: 0_level_0,prec,station_id
date_day,Unnamed: 1_level_1,Unnamed: 2_level_1
1950-01-01,0.0,79
1950-01-02,13.0,79
1950-01-03,1.0,79
1950-01-04,2.8,79
1950-01-05,0.8,79
...,...,...
2023-12-27,0.0,5791
2023-12-28,0.0,5791
2023-12-29,1.0,5791
2023-12-30,0.3,5791


In [47]:
%%time 
df_prec_long.to_csv("../data/time_series_prec_NRW.csv", sep= ";")

CPU times: total: 8.22 s
Wall time: 11 s


In [65]:
df_prec_long.max(axis=0, skipna=True, numeric_only=False)

prec           161.7
station_id    5791.0
dtype: float64

In [66]:
df_prec_long.min(axis=0, skipna=True, numeric_only=False)

prec           0.0
station_id    79.0
dtype: float64

In [69]:
df_prec_long.mean(axis=0, skipna=True, numeric_only=False)

prec             2.439929
station_id    2786.615242
dtype: float64

In [68]:
df_prec_long.median(axis=0, skipna=True, numeric_only=False)

prec             0.2
station_id    2744.0
dtype: float64