# Deutscher Wetterdienst

In [159]:
%reset -sf
import pandas as pd

# Location of the DWD open-data server and of the daily climate ("kl") observations on it.
url = "https://opendata.dwd.de/"
path = 'climate_environment/CDC/observations_germany/climate/daily/kl/'
recent_path = path + 'recent/'
historical_path = path + 'historical/'
filename = 'KL_Tageswerte_Beschreibung_Stationen.txt'

# With a tab separator every line of the station description ends up in one single
# string column; the name of that column is the file's header line.
ws = pd.read_csv(url + recent_path + filename, sep="\t", header=0, skiprows=0, encoding="ISO-8859-1").dropna()

# Drop the row labelled 0 (presumably the separator line below the header - TODO confirm)
# without mutating the frame in place.
ws = ws.drop(0)

# save original column names; split() without an argument tolerates repeated blanks
colnames = ws.columns[0].split()

# rename the single column to 'dummy'
ws.columns = ['dummy']

# split off the first six whitespace-separated fields; column 6 keeps the remainder of the line
# (raw string: '\s' in a normal string literal is an invalid escape sequence)
ws = ws['dummy'].str.split(r'\s+', n=6, expand=True)

# convert columns 1..5 to numeric, unparsable values become NaN
numeric_cols = ws.columns[1:6]
ws[numeric_cols] = ws[numeric_cols].apply(pd.to_numeric, errors='coerce')

# concat columns back to a type consistent dataframe; the remainder is cut at the fixed
# width of 41 characters and the padding around both parts is stripped
wst = pd.concat(
    [
        ws.iloc[:, 0:6],
        ws[6].str.slice(0, 41).str.strip(),
        ws[6].str.slice(41).str.strip(),
    ],
    axis=1,
)
wst.columns = colnames

## Read the selected subset of weather stations

In [160]:
# Semicolon-delimited list of the selected stations (relative path); the first line
# of the file provides the column names.
selection = '83Messstationen.csv'
selws = pd.read_csv(selection, delimiter=";", header=0, skiprows=0)
selws

Unnamed: 0,STATIONS-KENNZIFFER,STATIONS_ID,ICAO-KENNUNG,STATIONSNAME,STATIONSHÖHE IN METERN,GEOGR. BREITE,GEOGR. LÄNGE,AUTOMAT SEIT:,BEGINN KLIMAREIHE
0,10501,3,,Aachen,202,50° 47',06° 05',01.07.1993,1891
1,10505,15000,,Aachen-Orsbach,231,50° 47',06° 01',01.04.2011,2011
2,10291,164,,Angermünde,54,53° 01',13° 59',01.11.1991,1947
3,10091,183,,Arkona,42,54° 40',13° 26',01.11.1991,1947
4,10852,232,EDMA,Augsburg,462,48° 25',10° 56',10.11.1996,1947
...,...,...,...,...,...,...,...,...,...
78,10544,5371,,Wasserkuppe,921,50° 29',09° 56',01.05.1995,1936
79,10688,5397,,Weiden,440,49° 40',12° 11',04.07.1991,1947
80,10980,5467,,Wendelstein,1832,47° 42',12° 00',01.01.1991,1951
81,10655,5705,,Würzburg,268,49° 46',09° 57',22.10.1992,1901


## Keep weather stations whose climate series begins in 1961 or earlier

In [161]:
# Reference year: a station is kept when its 'BEGINN KLIMAREIHE' value is not later than this.
refy = 1961
starts_by_refy = selws['BEGINN KLIMAREIHE'] <= refy
# Row filter and column selection in a single .loc call.
refydf = selws.loc[starts_by_refy, ['STATIONS_ID', 'STATIONSNAME']]
refydf

Unnamed: 0,STATIONS_ID,STATIONSNAME
0,3,Aachen
2,164,Angermünde
3,183,Arkona
4,232,Augsburg
5,282,Bamberg
...,...,...
78,5371,Wasserkuppe
79,5397,Weiden
80,5467,Wendelstein
81,5705,Würzburg


## Download zip file from URL
[howto](https://pythonguides.com/download-zip-file-from-url-using-python/)

In [162]:
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile
from re import compile

def collectRecords(zfile):
    """Read the daily-values product file out of a zip archive.

    Args:
        zfile: Object with a ``read()`` method returning the bytes of a zip
            archive (for example the response returned by ``urlopen``).

    Returns:
        list[list[str]]: One list of fields per line of the product file,
        header line first. Fields are the ';'-separated values of the line
        with the last field (the 'eor' marker) removed; they are not stripped.
        An empty product file yields an empty list.

    Raises:
        AssertionError: If the archive does not contain exactly one member
            named ``produkt_klima_tag_*.txt``.
    """
    # Raw string: '\.' inside a normal string literal is an invalid escape sequence.
    pattern = compile(r"^produkt_klima_tag_.*\.txt$")
    with BytesIO(zfile.read()) as buffer, ZipFile(buffer) as datafile:
        dfound = [name for name in datafile.namelist() if pattern.match(name)]
        number = len(dfound)
        assert number == 1, f"WARN: exactly one element expected, got {number} instead"
        # The context manager closes the member even if reading fails.
        with datafile.open(dfound[0]) as rf:
            lines = rf.readlines()
    # ISO-8859-1 maps every byte to the same character 'unicode-escape' produced, but
    # leaves backslashes in the data untouched instead of interpreting them as escapes.
    # The slice drops the last column, which contains only the string 'eor'.
    return [bline.decode('ISO-8859-1').rstrip('\r\n').split(';')[:-1] for bline in lines]

## Extract observations for the weather stations belonging to the above subset

In [163]:
# Directory to scan and the archive-name pattern; the group captures the 5-digit
# station id, and the dot before 'zip' is escaped so it only matches a literal dot.
zip_url = url + recent_path
twre = compile(r'tageswerte_KL_([0-9]{5})_akt\.zip')

#zip_url = url + historical_path
#twre = compile(r'tageswerte_KL_([0-9]{5})_[0-9]{8}_[0-9]{8}_hist\.zip')

print(zip_url)

# Ids of the reference stations, collected once for fast membership tests.
wanted_ids = set(refydf['STATIONS_ID'])

# One DataFrame per downloaded archive; they are concatenated once after the loop
# instead of growing DF step by step inside it.
frames = []
with urlopen(zip_url) as f:
    # Scan the page returned for the directory line by line for archive names.
    for bline in f.readlines():
        zfound = twre.search(bline.decode('utf-8'))
        if not zfound:
            continue
        if int(zfound.group(1)) not in wanted_ids:
            continue
        zfilename = zfound.group(0)
        with urlopen(zip_url + zfilename) as z:
            record = collectRecords(z) # <-- function call
        # First record is the header line, the remaining ones are the observations.
        frames.append(pd.DataFrame(record[1:], columns=record[0]))

if not frames:
    # Fail here with a clear message rather than with a NameError on DF further down.
    raise ValueError(f"no archive for the selected stations found at {zip_url}")

# Row labels are kept as they are (each station's rows start again at 0).
DF = pd.concat(frames)

https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/daily/kl/recent/


## Transform data from string to the appropriate type

In [164]:
# Work on a copy so the frame built by the download step is not modified in place.
DF = DF.copy()

# Every column after the first two (STATIONS_ID, MESS_DATUM) holds a measurement.
cols = DF.columns[2:]

# Convert column by column (one to_numeric call per column instead of one per row);
# unparsable strings become NaN and all measurement columns end up as float.
measurements = DF[cols].apply(pd.to_numeric, errors='coerce').astype(float)

# -999 marks a missing value in the data: replace it by NaN.
DF[cols] = measurements.mask(measurements == -999)

DF['STATIONS_ID'] = DF['STATIONS_ID'].astype(int)
DF['MESS_DATUM'] = pd.to_datetime(DF['MESS_DATUM'], format='%Y%m%d')
#print(DF.dtypes)
DF

Unnamed: 0,STATIONS_ID,MESS_DATUM,QN_3,FX,FM,QN_4,RSK,RSKF,SDK,SHK_TAG,NM,VPM,PM,TMK,UPM,TXK,TNK,TGK
0,164,2021-09-06,10.0,9.5,2.4,3.0,0.0,0.0,9.017,0.0,4.6,13.4,1017.35,16.2,73.67,22.8,9.4,7.0
1,164,2021-09-07,10.0,4.9,2.0,3.0,0.0,0.0,8.800,0.0,5.5,15.4,1016.93,16.8,81.79,22.3,11.4,9.8
2,164,2021-09-08,10.0,6.2,1.3,3.0,0.0,0.0,10.767,0.0,1.3,14.8,1014.23,17.0,79.50,24.4,9.3,7.8
3,164,2021-09-09,10.0,9.1,2.2,3.0,0.0,0.0,11.917,0.0,0.2,13.9,1007.60,18.6,69.33,26.5,10.4,7.5
4,164,2021-09-10,10.0,7.7,2.3,3.0,0.0,6.0,8.417,0.0,4.5,18.2,1006.63,20.0,77.75,28.0,14.1,11.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,5906,2023-03-05,1.0,6.1,1.8,1.0,0.0,6.0,0.000,0.0,7.9,5.9,1004.62,4.1,72.17,5.7,2.2,1.1
546,5906,2023-03-06,1.0,9.5,3.3,1.0,0.0,6.0,0.033,0.0,7.8,5.8,995.57,3.8,72.50,6.2,2.3,1.6
547,5906,2023-03-07,1.0,12.4,5.1,1.0,0.0,0.0,0.667,0.0,7.9,5.7,990.36,5.5,63.67,7.8,2.6,1.8
548,5906,2023-03-08,1.0,11.1,4.0,1.0,21.2,6.0,0.000,0.0,7.5,8.0,984.85,6.4,81.17,11.0,1.7,1.8
