In [59]:
%reset -sf
import pandas as pd
import numpy as np
from datetime import datetime
from IPython.display import Markdown as md

In [60]:
url = "https://opendata.dwd.de/"
path = 'climate_environment/CDC/observations_germany/climate/daily/kl/'
recent_path = path + 'recent/'
historical_path = path + 'historical/'
filename = 'KL_Tageswerte_Beschreibung_Stationen.txt' 
ws = pd.read_csv(url + recent_path + filename, sep="\t", header=0, skiprows = 0, encoding = "ISO-8859-1").dropna()
ws.drop(0, inplace=True)

# save original column names
colnames = ws.columns[0].split(' ')

# rename column for instance to 'dummy'
ws.columns = ['dummy']

# split string up to 6th column
ws = ws['dummy'].str.split('\s+', n=6, expand=True)

# convert 1:6 to numeric
for col in  ws.iloc[:,1:6]:
    ws[col] = pd.to_numeric(ws[col], errors='coerce') 
    
# concat columns back to a type consistent dataframe
wst = pd.concat([ws.iloc[:,0:6], ws[6].str.slice(0,41), ws[6].str.slice(41,)], axis=1)  
wst.columns = colnames
wst.head()

Unnamed: 0,Stations_id,von_datum,bis_datum,Stationshoehe,geoBreite,geoLaenge,Stationsname,Bundesland
1,1,19370101,19860630,478,47.8413,8.8493,Aach,Baden-...
2,3,18910101,20110331,202,50.7827,6.0941,Aachen,Nordrh...
3,11,19800901,20231130,680,47.9736,8.5205,Donaueschingen (Landeplatz),Baden-...
4,44,19690101,20231130,44,52.9336,8.237,Großenkneten,Nieder...
5,52,19690101,20011231,46,53.6623,10.199,Ahrensburg-Wulfsdorf,Schles...


In [61]:
# Convert von_datum and bis_datum to a date type
wst["von"] = pd.to_datetime(wst["von_datum"], format="%Y%m%d", errors="coerce")
wst["bis"] = pd.to_datetime(wst["bis_datum"], format="%Y%m%d", errors="coerce")
# print(wst.loc[:,['Stations_id','von','bis']])

# Function to extract stations Climatic Standard Normal periods, s. WMO-No. 1203
def dwd_extratCSN(df, yyyy, wmo1203=30):
    first = str(yyyy-wmo1203+1) + '-1-1'
    last = str(yyyy)+ '-12-31'
    fv = pd.to_datetime(first, format='%Y-%m-%d', errors="coerce").date()
    lv = pd.to_datetime(last, format='%Y-%m-%d', errors="coerce").date()
    return df[(df['von'] <= first) & (df['bis'] >= last)], (lv - fv).days

# Select stations having data for the Standard Reference Period and some Climatic Standard Normal periods
wmo_yyyy = 1990
CSN, CSN_days = dwd_extratCSN(wst.drop(['von_datum','bis_datum'], axis=1), wmo_yyyy)  # Standard Ref. Period
# print(CSN.loc[:,['Stations_id','von','bis']])
# print(CSN.describe())
# print(len(CSN.index))
# print(CSN_days)

### DWD Compare Each Month vs. Climatic Norm

In [62]:
md(f"{len(wst.index)} available weather stations {len(CSN.index)} of which contribute to the Climatic Standard Normal ending in {wmo_yyyy}. This period contains {CSN_days} days.")


1355 available weather stations 428 of which contribute to the Climatic Standard Normal ending in 1990. This period contains 10956 days.

In [63]:
# CSN.drop(['Stationshoehe','geoBreite','geoLaenge'], axis=1)
# print(CSN.loc[:,['Stations_id','Stationsname']].head())
# print(CSN.loc[:,['Stations_id','Stationsname']].tail(11).to_markdown())
CSN

Unnamed: 0,Stations_id,Stationshoehe,geoBreite,geoLaenge,Stationsname,Bundesland,von,bis
2,00003,202,50.7827,6.0941,Aachen,Nordrh...,1891-01-01,2011-03-31
10,00073,374,48.6183,13.0620,Aldersbach-Kramersepp,Bayern...,1959-03-01,2023-11-30
11,00078,64,52.4853,7.9125,Alfhausen,Nieder...,1961-01-01,2023-11-30
25,00142,511,48.4060,11.3117,Altomünster-Maisbrunn,Bayern...,1955-01-01,2023-11-30
26,00150,215,49.7273,8.1164,Alzey,Rheinl...,1951-01-01,2023-11-30
...,...,...,...,...,...,...,...,...
1327,15963,40,51.3258,6.5089,St. Tönis,Nordrh...,1954-01-01,2004-11-30
1334,19087,645,48.8049,13.5528,Freyung vorm Wald,Bayern...,1957-05-01,1995-11-30
1348,19582,754,48.1419,8.4255,Königsfeld,Baden-...,1951-01-01,2001-08-31
1350,19617,310,49.7391,10.6039,Burghaslach,Bayern...,1941-01-01,2006-12-31


Display number of weather stations by Bundesland:

In [64]:
print(CSN.groupby(['Bundesland'])['Bundesland'].count().to_markdown())

| Bundesland             |   Bundesland |
|:-----------------------|-------------:|
| Baden-Württemberg      |           78 |
| Bayern                 |           79 |
| Berlin                 |            3 |
| Brandenburg            |           13 |
| Bremen                 |            2 |
| Hamburg                |            4 |
| Hessen                 |           38 |
| Mecklenburg-Vorpommern |           16 |
| Niedersachsen          |           40 |
| Nordrhein-Westfalen    |           36 |
| Rheinland-Pfalz        |           27 |
| Saarland               |            5 |
| Sachsen                |           23 |
| Sachsen-Anhalt         |           22 |
| Schleswig-Holstein     |           14 |
| Thüringen              |           28 |


Download zip file from URL
[howto](https://pythonguides.com/download-zip-file-from-url-using-python/)

In [65]:
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile
from re import compile

def collectRecords(zfile): 
    with BytesIO(zfile.read()) as b, ZipFile(b) as datafile: 
        r = compile("^produkt_klima_tag_.*\.txt$")
        dfound = list(filter(r.match, datafile.namelist()))
        number = len(dfound)
        assert number == 1, f"WARN: exactly one element expected, got {number} instead"
        # print(dfound[0])
        # print(type(datafile))
        rf = datafile.open(dfound[0])
        lines = rf.readlines()
        rf.close()
        header = True
        for bline in lines:
            line = bline.decode('unicode-escape').rstrip('\r\n').split(';')
            del line[-1] # remove last column containing only string 'eor'
            #print(line)
            if header: # initialize list of lists
                header = not(header)
                record = [line]
            else:
                record.append(line)
    return record

csn_lst = CSN['Stations_id'].to_list()
# csn_lst = csn_lst[:11]

Loop zip files in zip_url and extract observations

In [171]:
#
# WMO Guidelines on the Calculation of Climate Normals: WMO-No. 1203
#
# zip_url = url + recent_path 
# tw_regex = compile(r'tageswerte_KL_[0-9]{5}_akt.zip')

zip_url = url + historical_path 
tw_regex = compile(r'tageswerte_KL_[0-9]{5}_[0-9]{8}_[0-9]{8}_hist.zip')

print(zip_url)
once = True
with urlopen(zip_url) as f:
    for bline in f.readlines():
        zfound = tw_regex.search(bline.decode('utf-8'))
        # print(zfound)
        if zfound:
            zfilename = zfound.string[zfound.start():zfound.end()]
            sid = zfilename[14:19] # extracted weather station id
            if sid in csn_lst[111]:
            # if sid in csn_lst:
                # print (sid)
                with urlopen(zip_url + zfilename) as z:
                    record = collectRecords(z) # <-- function call   
                    df = pd.DataFrame(record[1:], columns=record[0])
                    df = df.apply(pd.to_numeric)
                    df["DATUM"] = pd.to_datetime(df["MESS_DATUM"], format="%Y%m%d", errors="coerce").dt.date
                    df.drop(['MESS_DATUM'], axis='columns',inplace=True)
                    df.replace(-999, np.NaN,inplace=True)  
                    # df['STATIONS_ID'] = df['STATIONS_ID'].astype('str')
                    df['STATIONS_ID'] = df['STATIONS_ID'].astype('str').apply(lambda x: '{0:0>5}'.format(x))
                    # print(df.describe())
                    # print(df.head())
                    
                    # if (once):
                    #     once = not once
                    #     df_merged = df
                    # else:
                    #     df_merged = pd.concat([df_merged, df], ignore_index=True, sort=False)                

# df_merged.reset_index(drop=True, inplace=True)

https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/daily/kl/historical/


In [192]:
# Function to extract data for Climatic Standard Period
def dwd_extratCSNData(df, yyyy, wmo1203=30):
    first = str(yyyy-wmo1203+1) + '-1-1'
    last = str(yyyy)+ '-12-31'
    fv = pd.to_datetime(first, format='%Y-%m-%d', errors="coerce").date()
    lv = pd.to_datetime(last, format='%Y-%m-%d', errors="coerce").date()
    return df[(df['DATUM'] >= fv) & (df['DATUM'] <= lv)], (lv - fv).days

CSN_df, CSN_days = dwd_extratCSNData(df, 1990)
# print(CSN_df, CSN_days)
print(CSN_df.describe())

CSN_df, CSN_days = dwd_extratCSNData(df, 2000)
# print(CSN_df, CSN_days)
print(CSN_df.describe())

CSN_df, CSN_days = dwd_extratCSNData(df, 2010)
# print(CSN_df, CSN_days)
print(CSN_df.describe())

CSN_df, CSN_days = dwd_extratCSNData(df, 2020)
# print(CSN_df, CSN_days)
print(CSN_df.describe())

              QN_3           FX           FM          QN_4           RSK  \
count  8035.000000  8035.000000  8035.000000  10957.000000  10957.000000   
mean      7.272558     6.372806     0.833703      6.666515      3.733339   
std       2.489787     2.889053     0.482620      2.357076      7.221944   
min       5.000000     0.100000     0.000000      5.000000      0.000000   
25%       5.000000     4.100000     0.500000      5.000000      0.000000   
50%       5.000000     6.400000     0.800000      5.000000      0.200000   
75%      10.000000     8.300000     1.100000     10.000000      4.400000   
max      10.000000    23.400000     4.000000     10.000000     89.600000   

               RSKF           SDK       SHK_TAG            NM           VPM  \
count  10957.000000  10957.000000  10957.000000  10957.000000  10957.000000   
mean       2.690517      4.567436      7.797664      5.245788      8.619431   
std        3.142495      3.957143     16.730212      2.456808      3.958767   

In [199]:
df2022 = dwd_extratCSNData(df, 2022, wmo1203=1) # year 2022 only
print(df2022[0].describe())



             QN_3          FX          FM   QN_4         RSK        RSKF  \
count  365.000000  360.000000  360.000000  365.0  365.000000  365.000000   
mean     7.660274    7.830556    1.461389    3.0    3.395342    4.035616   
std      3.306618    2.923044    0.460934    0.0    6.220951    3.160773   
min      3.000000    1.700000    0.500000    3.0    0.000000    0.000000   
25%      3.000000    5.500000    1.100000    3.0    0.000000    0.000000   
50%     10.000000    8.300000    1.400000    3.0    0.100000    6.000000   
75%     10.000000    9.800000    1.800000    3.0    4.000000    6.000000   
max     10.000000   18.500000    3.000000    3.0   32.600000    8.000000   

              SDK     SHK_TAG          NM         VPM          PM         TMK  \
count  365.000000  365.000000  361.000000  363.000000  363.000000  365.000000   
mean     5.223433    0.884932    5.356787    9.394490  935.129890    9.053699   
std      4.262761    2.357573    2.449278    4.247986    6.805392    7.5

In [202]:
# Since start of measurement
print(df.describe())


               QN_3            FX            FM          QN_4           RSK  \
count  19663.000000  19595.000000  19591.000000  31441.000000  31429.000000   
mean       8.021106      6.851273      1.107886      6.616297      3.720172   
std        2.776251      2.854749      0.548576      2.490791      7.246544   
min        3.000000      0.100000      0.000000      3.000000      0.000000   
25%        5.000000      4.700000      0.700000      5.000000      0.000000   
50%       10.000000      6.900000      1.100000      5.000000      0.200000   
75%       10.000000      8.800000      1.500000      9.000000      4.400000   
max       10.000000     24.600000      5.000000     10.000000    138.200000   

               RSKF           SDK       SHK_TAG            NM           VPM  \
count  31429.000000  31429.000000  29579.000000  28628.000000  31418.000000   
mean       2.669382      4.574446      6.626154      5.388920      8.830416   
std        2.935768      3.973967     14.321823    