## 1. About the DWD Open Data Portal 

The data of the Climate Data Center (CDC) of the DWD (Deutscher Wetterdienst, German Weather Service) is provided on an **FTP server**. <br> **FTP** stands for _File Transfer Protocol_.

Open the FTP link ftp://opendata.dwd.de/climate_environment/CDC/ in your browser (copy-paste) and find our how it is structured hierarchically.

You can also open the link with **HTTPS** (Hypertext Transfer Protocol Secure): https://opendata.dwd.de/climate_environment/CDC/

We are interested in downloading the metadata of annual temperature to get information related to their stations

In [1]:
import requests
from bs4 import BeautifulSoup
import os
import re # to use regex expressions 
import tqdm
import pandas as pd
import geopandas as gpd

# URL of the DWD website
url_base = "https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/"
url_temporal_resolution = "annual/"
url_parameter = "kl/"
url_subdir = "historical/"
url_full = os.path.join(url_base, url_temporal_resolution, url_parameter, url_subdir)

# Directory to save the downloaded files
download_dir = "../data/original/dwd/" +  url_temporal_resolution + url_parameter + url_subdir

# Create the directory if it doesn't exist
if not os.path.exists(download_dir):
    os.makedirs(download_dir)

print("download dir: ", download_dir)

# Directory and name to save the CSV file to create the layer for sub-task 1.2
outfname = r"../data/Selected_Stations_Baden_Württemberg.csv"

download dir:  ../data/original/dwd/annual/kl/historical/


In [2]:
url_full

'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/annual/kl/historical/'

In [3]:
def grab_file(file_url, download_dir):
        # get only the file name from the full url
        file_name = file_url.split("/")[-1]
        # Download the file
        file_path =os.path.join(download_dir, file_name)
        with open(file_path, "wb") as file:
            file.write(requests.get(file_url).content)
        
    

In [4]:
# Send an HTTP request to the URL
response = requests.get(url_full)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, "html.parser")
    # Look for the metadata file
    links = soup.find_all(href=re.compile("Beschreibung"))
    # Take the url of the file
    file_name = links[0].get("href")
    # Download the file
    grab_file(os.path.join(url_full, file_name), download_dir)
    print(f"Downloaded: {download_dir+file_name}")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

Downloaded: ../data/original/dwd/annual/kl/historical/KL_Jahreswerte_Beschreibung_Stationen.txt


In [5]:
# get station path
file_path = os.path.join(download_dir,file_name)
# read the header of the file
header = open(file_path, encoding="latin").readline().split()
header

['Stations_id',
 'von_datum',
 'bis_datum',
 'Stationshoehe',
 'geoBreite',
 'geoLaenge',
 'Stationsname',
 'Bundesland',
 'Abgabe']

In [6]:
# translation dictionary
translate = \
{'Stations_id':'station_id',
 'von_datum':'date_from',
 'bis_datum':'date_to',
 'Stationshoehe':'altitude',
 'geoBreite': 'latitude',
 'geoLaenge': 'longitude',
 'Stationsname':'name',
 'Bundesland':'state'}

In [7]:
#pd.read_csv?

In [8]:
df_stations_2 = pd.read_fwf(file_path,
                          skiprows=[0,1],
                          names=translate,
                          encoding="latin", 
                          parse_dates=["von_datum","bis_datum"],
                          dtype={"Stations_id":str}
                          #index_col="Stations_id"
                         )
df_stations_2

  df_stations_2 = pd.read_fwf(file_path,


Unnamed: 0,Stations_id,von_datum,bis_datum,Stationshoehe,geoBreite,geoLaenge,Stationsname,Bundesland
1,19310101,1986-06-30,478,47.8413,8.8493,Aach,Baden-Württemberg,Frei
3,18510101,2011-03-31,202,50.7827,6.0941,Aachen,Nordrhein-Westfalen,Frei
44,19710301,2023-12-31,44,52.9336,8.2370,Großenkneten,Niedersachsen,Frei
52,19730101,2001-12-31,46,53.6623,10.1990,Ahrensburg-Wulfsdorf,Schleswig-Holstein,Frei
61,19750701,1978-08-31,339,48.8443,12.6171,Aiterhofen,Bayern,Frei
...,...,...,...,...,...,...,...,...
19774,19710819,1994-02-28,174,51.4814,10.8057,Nordhausen (Umspannwerk),Thüringen,Frei
19781,18810101,1953-12-31,367,48.7429,11.4233,Ingolstadt,Bayern,Frei
19794,18810101,1954-12-31,370,49.4415,11.8529,Amberg (Mariahilfberg),Bayern,Frei
19856,20240801,2024-08-31,625,47.6134,12.9819,Schönau am Königssee,Bayern,Frei


In [9]:
# read the stations dataframe
df_stations = pd.read_fwf(file_path,
                          skiprows=2,
                          names=header,
                          encoding="latin", 
                          parse_dates=["von_datum","bis_datum"],
                          dtype={"Stations_id":str}
                          #index_col="Stations_id"
                         )
df_stations

Unnamed: 0,Stations_id,von_datum,bis_datum,Stationshoehe,geoBreite,geoLaenge,Stationsname,Bundesland,Abgabe
0,00001,1931-01-01,1986-06-30,478,47.8413,8.8493,Aach,Baden-Württemberg,Frei
1,00003,1851-01-01,2011-03-31,202,50.7827,6.0941,Aachen,Nordrhein-Westfalen,Frei
2,00044,1971-03-01,2023-12-31,44,52.9336,8.2370,Großenkneten,Niedersachsen,Frei
3,00052,1973-01-01,2001-12-31,46,53.6623,10.1990,Ahrensburg-Wulfsdorf,Schleswig-Holstein,Frei
4,00061,1975-07-01,1978-08-31,339,48.8443,12.6171,Aiterhofen,Bayern,Frei
...,...,...,...,...,...,...,...,...,...
1202,19774,1971-08-19,1994-02-28,174,51.4814,10.8057,Nordhausen (Umspannwerk),Thüringen,Frei
1203,19781,1881-01-01,1953-12-31,367,48.7429,11.4233,Ingolstadt,Bayern,Frei
1204,19794,1881-01-01,1954-12-31,370,49.4415,11.8529,Amberg (Mariahilfberg),Bayern,Frei
1205,19856,2024-08-01,2024-08-31,625,47.6134,12.9819,Schönau am Königssee,Bayern,Frei


Check all the different values in the "state" column. You can use the function <code>.unique()</code> for this.

In [10]:
df_stations.rename(columns=translate,inplace=True)

In [11]:
df_stations.loc[:,"state"].unique()

array(['Baden-Württemberg', 'Nordrhein-Westfalen', 'Niedersachsen',
       'Schleswig-Holstein', 'Bayern', 'Hessen', 'Brandenburg',
       'Thüringen', 'Mecklenburg-Vorpommern', 'Sachsen',
       'Rheinland-Pfalz', 'Sachsen-Anhalt', 'Berlin', 'Saarland',
       'Bremen', 'Hamburg'], dtype=object)

In [12]:
#df_stations.query?

In [13]:
# filter stations only in Baden-Württemberg which are active and older than 1950
df_stations_short = df_stations.query("state == 'Baden-Württemberg' and date_to >= 2023 and date_from < 1950")

In [14]:
df_stations_short

Unnamed: 0,station_id,date_from,date_to,altitude,latitude,longitude,name,state,Abgabe
50,257,1881-01-01,2023-12-31,240,48.727,8.2457,Baden-Baden-Geroldsau,Baden-Württemberg,Frei
51,259,1881-01-01,2023-12-31,275,47.8064,7.6387,Müllheim,Baden-Württemberg,Frei
158,755,1881-01-01,2023-12-31,340,49.5182,9.3213,"Buchen, Kr. Neckar-Odenwald",Baden-Württemberg,Frei
239,1197,1931-01-01,2023-12-31,463,48.9895,10.1312,Ellwangen-Rindelbach,Baden-Württemberg,Frei
275,1346,1921-01-01,2023-12-31,486,47.8748,8.0038,Feldberg/Schwarzwald,Baden-Württemberg,Frei
296,1443,1869-01-01,2024-08-31,237,48.0232,7.8343,Freiburg,Baden-Württemberg,Frei
300,1468,1881-01-01,2023-12-31,797,48.4538,8.409,Freudenstadt,Baden-Württemberg,Frei
323,1602,1888-01-01,2023-12-31,177,48.433,7.993,Ohlsbach,Baden-Württemberg,Frei
388,2074,1881-01-01,2023-12-31,518,48.3752,8.98,Hechingen,Baden-Württemberg,Frei
495,2638,1928-01-01,2023-12-31,974,48.1054,8.7548,Klippeneck,Baden-Württemberg,Frei


In [15]:
df_stations_short.to_csv(outfname,sep =";")
print("CSV file saved in " + outfname )

CSV file saved in ../data/Selected_Stations_Baden_Württemberg.csv


In [16]:
gdf = gpd.GeoDataFrame(
    df_stations_short, geometry=gpd.points_from_xy(df_stations_short.longitude, df_stations_short.latitude),
    crs="EPSG:4326"
) 
# used EPSG:4326 because we only have a long./latit. which are angles rather than projected distances from a selected origin

In [17]:
gdf.to_file(r"../data/Baden-Württemberg_Selected_Stations_from_Jupyter.gpkg",driver = "GPKG", layer="selected_stations_points")
print("Geopackage file saved in " + r"../data/Baden-Württemberg_Selected_Stations_from_Jupyter.gpkg" )

Geopackage file saved in ../data/Baden-Württemberg_Selected_Stations_from_Jupyter.gpkg
