In [41]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np 
from datetime import datetime
from pathlib import Path
from multiprocessing import cpu_count 
from multiprocessing.pool import ThreadPool


# ----------------------------------------------
#   Input data
# ----------------------------------------------

# Variable to download. Supported names:
#   'air_temperature', 'wind', 'solar',
#   'extreme_temperature', 'precipitation', 'extreme_wind'
var = 'air_temperature'

# Federal state(s) whose stations are selected; given as a list of names
state = ['Bayern']

# Local folder for the raw DWD files
target_folder = '../../0.raw/dwd'

# First and last year of the period of interest
year_start, year_end = 2010, 2022

# ----------------------------------------------
#   Normalise inputs and prepare the target folder
# ----------------------------------------------
# A single state passed as a plain string is wrapped into a one-element list
state = [state] if isinstance(state, str) else state

# Create the target folder together with any missing parents;
# an already existing folder is left untouched.
Path(target_folder).mkdir(parents=True, exist_ok=True)



def dwd_meta_reader(var):
    """Read the station description table of a 10-minute DWD variable.

    Parameters
    ----------
    var : str
        Variable name; must be one of 'wind', 'air_temperature', 'solar',
        'precipitation', 'extreme_wind' or 'extreme_temperature'.

    Returns
    -------
    pandas.DataFrame
        One row per line of the description file. Column names are taken from
        the file's header line; 'Stations_id' is cast to int and 'von_datum' /
        'bis_datum' are parsed as dates in YYYYMMDD format.

    Raises
    ------
    KeyError
        If ``var`` is not one of the supported variable names.
    """
    # Short code that the server uses for each variable in the file name
    short_codes = dict(
        wind='ff',
        air_temperature='tu',
        solar='sd',
        precipitation='rr',
        extreme_wind='fx',
        extreme_temperature='tx',
    )
    code = short_codes[var]

    # Address of the station description file
    base = 'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/'
    url_meta = f'{base}{var}/historical/zehn_min_{code}_Beschreibung_Stationen.txt'

    # Column names come from the first (space-delimited) line of the file
    column_names = pd.read_csv(url_meta, nrows=1, delimiter=" ", encoding='latin1').columns

    # The table itself is fixed-width and starts after the two leading lines
    body = pd.read_fwf(
        url_meta,
        widths=[6, 9, 8, 15, 12, 10, 42, 98],
        header=None,
        skiprows=2,
        encoding='latin1',
    )

    # Positional columns -> names from the header line
    station_meta = body.rename(columns=dict(zip(body.columns, column_names)))

    # Fix the column types; the dates go through str so they can be parsed below
    station_meta = station_meta.astype(
        {'Stations_id': 'int', 'von_datum': 'str', 'bis_datum': 'str'}
    )

    # YYYYMMDD strings -> datetime values
    for date_col in ('von_datum', 'bis_datum'):
        station_meta[date_col] = pd.to_datetime(station_meta[date_col], format='%Y%m%d')

    return station_meta


def listFD(url, ext=''):
    """List the link targets on the page at ``url`` that end with ``ext``.

    Parameters
    ----------
    url : str
        Address of the HTML page to scan.
    ext : str, optional
        Suffix a link target must end with. The default ``''`` keeps every
        link that has an ``href``.

    Returns
    -------
    list of str
        The ``href`` values as they appear in the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx/5xx).
    """
    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(url, timeout=60)
    # Fail loudly on an error page instead of parsing it into an empty list.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    hrefs = (node.get('href') for node in soup.find_all('a'))
    # Anchors without an href yield None; skip them, since None has no .endswith.
    return [href for href in hrefs if href is not None and href.endswith(ext)]


def download_url(args):
    """Download a single file and store it locally.

    Parameters
    ----------
    args : sequence
        ``args[0]`` is the URL to fetch and ``args[1]`` the local file name to
        write. A single argument is used so the function can be mapped over a
        list of pairs.

    Returns
    -------
    None
        Any exception (network error, HTTP error status, file error) is caught
        and reported with ``print``; it is not re-raised.
    """
    url, fn = args[0], args[1]

    try:
        # A timeout keeps a stalled connection from blocking a worker forever.
        r = requests.get(url, timeout=60)
        # Without this check the body of a 404/500 response would be saved
        # under the target file name as if it were the requested file.
        r.raise_for_status()
        with open(fn, 'wb') as f:
            f.write(r.content)
    except Exception as e:
        print('Exception in download_url():', e)


def download_parallel(args):
    """Download several files concurrently with a pool of worker threads.

    Parameters
    ----------
    args : iterable
        Items that are passed one by one to ``download_url`` (each item is a
        ``(url, local file name)`` pair).

    Returns
    -------
    None
        The function returns once every item has been processed.
    """
    # Leave one core free, but always use at least one worker: ThreadPool
    # rejects a size of 0, which `cpu_count() - 1` yields on a single-core machine.
    workers = max(1, cpu_count() - 1)

    # The context manager shuts the pool down when the block is left.
    with ThreadPool(workers) as pool:
        # imap_unordered returns a lazy iterator; consuming it blocks until
        # every download has finished, so the pool is not shut down early.
        for _ in pool.imap_unordered(download_url, args):
            pass

def dwd_file_list(var,ids,y_start=2010,y_end=2022,target_folder='./'):
    """List the downloadable historical zip files of variable ``var``.

    Parameters
    ----------
    var : str
        Variable name as used in the server path (e.g. 'air_temperature').
    ids : list-like of int
        Station ids to keep.
    y_start, y_end : int, optional
        First and last year of interest. A file is kept when its date range
        overlaps 1 January ``y_start`` through 31 December ``y_end``.
    target_folder : str, optional
        Local folder used to build the ``local`` path of every file. A missing
        trailing slash is added.

    Returns
    -------
    pandas.DataFrame
        One row per matching file with the columns ``url`` (file name as listed
        on the server), ``id``, ``von_datum``, ``bis_datum``, ``full_url``
        (download address) and ``local`` (local target path).
    """
    url = 'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/' + var + '/historical/'

    # all .zip files linked from the directory listing
    fns = listFD(url, 'zip')

    df = pd.DataFrame({'url': fns})

    # Pull "<id>_<YYYYMMDD>_<YYYYMMDD>" out of the file name by pattern rather
    # than by fixed split position. This keeps working when the listing is
    # empty and when the name prefix itself contains underscores (presumably
    # the case for some variables -- confirm against the server listing).
    parts = df['url'].str.extract(r'_(\d+)_(\d{8})_(\d{8})_')
    parts.columns = ['id', 'von_datum', 'bis_datum']

    # Names that do not follow the pattern cannot be filtered; drop them.
    df = df.join(parts).dropna(subset=['id', 'von_datum', 'bis_datum'])

    df = df.astype({'id': int})

    # Converted column by column so the dtype is datetime even for an empty table.
    for date_col in ('von_datum', 'bis_datum'):
        df[date_col] = pd.to_datetime(df[date_col], format='%Y%m%d')

    df = df[df['id'].isin(ids)]

    # Keep files overlapping the requested period. The upper bound is the END
    # of y_end so that files beginning later in that year are not dropped.
    period_start = datetime(y_start, 1, 1)
    period_end = datetime(y_end, 12, 31)
    df = df[(df['bis_datum'] >= period_start) & (df['von_datum'] <= period_end)]

    # Make sure folder and file name are separated by exactly one slash.
    folder = str(target_folder)
    if folder and not folder.endswith(('/', '\\')):
        folder += '/'

    # full_url is the download address, local the path the file is saved to
    df = df.assign(full_url=url + df['url'],
                   local=folder + df['url'])

    return df


In [48]:
# Station metadata of the selected variable
fm = dwd_meta_reader(var)

# IDs of all stations in the selected federal state(s).
# `@state` makes pandas read the list from the Python variable instead of
# pasting its text representation into the query string.
station_ids = fm.query("Bundesland in @state")['Stations_id'].to_list()

# All available zip files and their metadata. The local paths point into the
# configured `target_folder`; a trailing slash is ensured so that folder and
# file name are joined correctly.
zip_files = dwd_file_list(var, station_ids, year_start, year_end,
                          target_folder=target_folder.rstrip('/') + '/')

# (download address, local path) pairs. Stored as a list rather than a one-shot
# zip iterator so the pairs can be inspected and used more than once.
urls = list(zip(zip_files.full_url, zip_files.local))

# To download ALL files, run:  download_parallel(urls)


In [44]:
# Testing code: download only the first 3 files

dat = zip_files.iloc[0:3,:]

# A separate name is used so that `urls` (the pairs for the full download)
# is not overwritten by this 3-file sample.
test_urls = list(zip(dat.full_url, dat.local))

download_parallel(test_urls)

In [43]:
# Show the table of matching zip files. The original cell displayed `fs`,
# a name that no visible cell defines; the displayed columns are those of
# `zip_files` -- NOTE(review): confirm `fs` was an earlier name for it.
zip_files

Unnamed: 0,url,id,von_datum,bis_datum,full_url,local
8,10minutenwerte_TU_00073_20100101_20191231_hist...,73,2010-01-01,2019-12-31,https://opendata.dwd.de/climate_environment/CD...,../../0.raw/dwd/10minutenwerte_TU_00073_201001...
9,10minutenwerte_TU_00073_20200101_20221231_hist...,73,2020-01-01,2022-12-31,https://opendata.dwd.de/climate_environment/CD...,../../0.raw/dwd/10minutenwerte_TU_00073_202001...
22,10minutenwerte_TU_00142_20100101_20191231_hist...,142,2010-01-01,2019-12-31,https://opendata.dwd.de/climate_environment/CD...,../../0.raw/dwd/10minutenwerte_TU_00142_201001...
23,10minutenwerte_TU_00142_20200101_20221231_hist...,142,2020-01-01,2022-12-31,https://opendata.dwd.de/climate_environment/CD...,../../0.raw/dwd/10minutenwerte_TU_00142_202001...
28,10minutenwerte_TU_00151_20100101_20191231_hist...,151,2010-01-01,2019-12-31,https://opendata.dwd.de/climate_environment/CD...,../../0.raw/dwd/10minutenwerte_TU_00151_201001...
...,...,...,...,...,...,...
1567,10minutenwerte_TU_07431_20200101_20221231_hist...,7431,2020-01-01,2022-12-31,https://opendata.dwd.de/climate_environment/CD...,../../0.raw/dwd/10minutenwerte_TU_07431_202001...
1587,10minutenwerte_TU_13710_20100101_20191231_hist...,13710,2010-01-01,2019-12-31,https://opendata.dwd.de/climate_environment/CD...,../../0.raw/dwd/10minutenwerte_TU_13710_201001...
1588,10minutenwerte_TU_13710_20200101_20221231_hist...,13710,2020-01-01,2022-12-31,https://opendata.dwd.de/climate_environment/CD...,../../0.raw/dwd/10minutenwerte_TU_13710_202001...
1609,10minutenwerte_TU_15555_20160501_20191231_hist...,15555,2016-05-01,2019-12-31,https://opendata.dwd.de/climate_environment/CD...,../../0.raw/dwd/10minutenwerte_TU_15555_201605...
