In [141]:
import pandas as pd
from glob import glob
import seaborn as sns
from datetime import datetime, timedelta
from matplotlib import pyplot as plt
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile

# Methods

In [142]:
def download_and_unzip(url, extract_to='dwd_data/'):
    """Download a zip archive and extract all of its members.

    Parameters
    ----------
    url : str
        Location of the zip archive; anything ``urllib.request.urlopen``
        accepts.
    extract_to : str, default 'dwd_data/'
        Directory the archive members are extracted into.

    Raises
    ------
    urllib.error.URLError
        If the URL cannot be opened.
    zipfile.BadZipFile
        If the downloaded payload is not a valid zip archive.
    """
    # Read the whole payload first so the connection is released before
    # extraction starts; both handles are closed even if an error occurs.
    with urlopen(url) as http_response:
        payload = http_response.read()
    with ZipFile(BytesIO(payload)) as archive:
        archive.extractall(path=extract_to)

In [143]:
def get_realizations(data_dir='dwd_data/'):
    """Load the extracted DWD observation files for temperature and wind.

    Looks for ``produkt_tu*.txt`` (temperature) and ``produkt_ff*.txt``
    (wind) inside ``data_dir``, parses ``MESS_DATUM`` with the format
    ``%Y%m%d%H`` and keeps only the measurement column of each file.

    Parameters
    ----------
    data_dir : str, default 'dwd_data/'
        Directory containing the extracted product files. The default
        matches the default ``extract_to`` of ``download_and_unzip``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(realizations_temp, realizations_wind)``, both indexed by
        ``MESS_DATUM``; the first has the single column ``TT_TU``, the
        second the single column ``F``.

    Raises
    ------
    FileNotFoundError
        If no matching product file exists in ``data_dir``.

    Notes
    -----
    NOTE(review): DWD products reportedly encode missing values as -999;
    no sentinel handling is done here - confirm whether ``na_values`` is
    needed before using the values downstream.
    """
    import os  # local import: the notebook's import cell does not provide it

    def _read_product(prefix, value_column):
        # Load one product file and reduce it to a MESS_DATUM-indexed column.
        pattern = os.path.join(data_dir, "produkt_{}*.txt".format(prefix))
        matches = glob(pattern)
        if not matches:
            raise FileNotFoundError(
                "No file matching '{}' - run download_and_unzip first.".format(pattern)
            )
        # Repeated downloads can leave several product files behind; use the
        # most recently written one instead of an arbitrary glob hit.
        filename = max(matches, key=os.path.getmtime)

        frame = pd.read_csv(filename, sep=";")
        # Header names may be padded with blanks (e.g. "   F").
        frame.columns = frame.columns.str.replace(" ", "")
        frame["MESS_DATUM"] = pd.to_datetime(frame["MESS_DATUM"], format="%Y%m%d%H")

        # Drop unnecessary columns and index by measurement time.
        return frame[["MESS_DATUM", value_column]].set_index("MESS_DATUM")

    realizations_temp = _read_product("tu", "TT_TU")
    realizations_wind = _read_product("ff", "F")
    return (realizations_temp, realizations_wind)

In [144]:
def get_observed_time(x):
    """Return the valid time of a forecast: ``init_tm`` plus ``fcst_hour`` hours.

    Parameters
    ----------
    x : mapping, pandas.Series or pandas.DataFrame
        Must provide ``"init_tm"`` (anything ``pd.to_datetime`` accepts) and
        ``"fcst_hour"`` (number of hours). A single row yields a single
        timestamp; a DataFrame yields a Series of timestamps.

    Returns
    -------
    pandas.Timestamp or pandas.Series
        The initialisation time shifted by the forecast horizon.
    """
    # pd.to_timedelta accepts Python and numpy numbers as well as whole
    # columns, whereas datetime.timedelta rejects numpy integers.
    return pd.to_datetime(x["init_tm"]) + pd.to_timedelta(x["fcst_hour"], unit="h")

In [146]:
def get_base_data():
    """Read the historic forecast tables for temperature and wind.

    Both CSV files are loaded from ``historic_data/`` and their ``obs_tm``
    column is converted to datetimes.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(temperature, wind)``.
    """
    sources = (
        "historic_data/icon_eps_t_2m.csv",
        "historic_data/icon_eps_wind_10m.csv",
    )
    # Read both files before converting, then parse obs_tm on each table.
    raw_tables = [pd.read_csv(csv_path) for csv_path in sources]
    temperature, wind = (
        table.assign(obs_tm=pd.to_datetime(table["obs_tm"])) for table in raw_tables
    )
    return (temperature, wind)

In [149]:
def add_observations(data, realizations, name, dropna):
    """Fill missing ``obs`` values of ``data`` from measured realizations.

    Rows whose ``obs`` is NaN are looked up by ``obs_tm`` in the index of
    ``realizations``; timestamps without a realization stay NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Forecast table with at least the columns ``obs_tm`` and ``obs``.
        It is modified in place and also returned.
    realizations : pandas.DataFrame
        Observations indexed by measurement time, containing ``TT_TU``
        (for ``name="temperature"``) or ``F`` (for ``name="wind"``).
    name : {"temperature", "wind"}
        Selects the realization column and its scaling factor.
    dropna : bool
        If true, rows containing any NaN are dropped after filling.

    Returns
    -------
    pandas.DataFrame
        The same object as ``data``.

    Raises
    ------
    ValueError
        If ``name`` is neither ``"temperature"`` nor ``"wind"``.
    """
    # (realization column, scaling factor). The wind factor mirrors the one
    # add_new_data applies when it writes F into the same "obs" column
    # (presumably m/s -> km/h - TODO confirm); without it the filled values
    # would be on a different scale than the appended ones.
    settings = {"temperature": ("TT_TU", 1), "wind": ("F", 3.6)}
    if name not in settings:
        raise ValueError(
            "Unknown name {!r}; expected 'temperature' or 'wind'.".format(name)
        )
    column, factor = settings[name]

    # One value per timestamp: duplicated measurement times would otherwise
    # make the lookup ambiguous. The last entry wins.
    lookup = realizations[column]
    lookup = lookup[~lookup.index.duplicated(keep="last")]

    # Join data
    missing = data["obs"].isna()
    filled = data.loc[missing, "obs_tm"].map(lookup) * factor
    # Assign positionally so a non-unique index on data cannot misalign.
    data.loc[missing, "obs"] = filled.to_numpy()

    # Dropna
    if dropna:
        data.dropna(inplace=True)

    return data

In [155]:
def add_new_data(data, realizations, name, save = False):
    """Append newly available ensemble forecasts, with observations, to ``data``.

    Starting one day after the ``init_tm`` of the last row of ``data``, every
    day present in ``realizations`` is loaded from
    ``icon_data/icon-eu-eps_<YYYYMMDD>00_<variable>_Karlsruhe.txt``. Each
    forecast row is matched with the realization at its valid time; rows
    without a realization are dropped (inner join).

    Parameters
    ----------
    data : pandas.DataFrame
        Existing forecast table. Columns ``[6:-2]`` are taken as the ensemble
        member names and the last two columns are expected to be
        ``ens_mean`` and ``ens_var``. It is not modified.
    realizations : pandas.DataFrame
        Observations indexed by measurement time, containing ``TT_TU``
        (temperature) or ``F`` (wind).
    name : {"temperature", "wind"}
        Selects realization column, ``met_var`` label, file name part and
        the factor applied to the realizations (1 and 3.6 respectively).
    save : bool, default False
        If true, pickle the result to ``complete_data_<met_var>`` and return
        ``None``; otherwise return the combined table.

    Returns
    -------
    pandas.DataFrame or None
        The combined table with a fresh integer index, or ``None`` when
        ``save`` is true.

    Raises
    ------
    ValueError
        If ``name`` is neither ``"temperature"`` nor ``"wind"``.
    FileNotFoundError
        If the forecast file for one of the new days is missing.
    """
    # name -> (realization column, met_var label, file name part, obs factor)
    settings = {
        "temperature": ("TT_TU", "t_2m", "t_2m", 1),
        "wind": ("F", "wind_10m", "wind_mean_10m", 3.6),
    }
    if name not in settings:
        # Returning None here would be indistinguishable from a successful
        # save=True call, so fail loudly instead.
        raise ValueError(
            "Unknown name {!r}; expected 'temperature' or 'wind'.".format(name)
        )
    column, method, file_tag, factor = settings[name]

    # Get day to start adding new data.
    # NOTE(review): relies on the last row of data holding the latest
    # init_tm - confirm that data is always ordered by init_tm.
    start_init_date = pd.to_datetime(data["init_tm"]).iloc[-1] + timedelta(1)

    # Days for which realizations exist from the start date onwards.
    candidates = realizations.index[realizations.index >= start_init_date]
    days = candidates.strftime("%Y-%m-%d").drop_duplicates()

    # Ensemble member names as used in the existing table.
    member_columns = list(data.columns[6:-2])
    path = "icon_data/icon-eu-eps_{}00_{}_Karlsruhe.txt"

    # Collect per-day frames and concatenate once at the end;
    # DataFrame.append no longer exists in pandas >= 2.0.
    frames = []
    for current_date in days:
        new_data = pd.read_csv(
            path.format(current_date.replace("-", ""), file_tag),
            skiprows = 3,
            sep = "|",
        ).dropna(axis = 1)
        new_data.columns = ["fcst_hour"] + member_columns

        # Add missing columns.
        members = new_data.iloc[:, 1:]
        new_data["ens_mean"] = members.mean(axis = 1)
        # NOTE(review): the column is called ens_var but holds the standard
        # deviation, exactly as before - confirm which one the historic
        # table uses.
        new_data["ens_var"] = members.std(axis = 1)
        new_data.insert(0, "init_tm", current_date)
        new_data.insert(1, "met_var", method)
        new_data.insert(2, "location", "Karlsruhe")
        obs_tm = new_data[["init_tm", "fcst_hour"]].apply(get_observed_time, axis = 1)
        new_data.insert(3, "obs_tm", obs_tm)

        # Attach the (scaled) realization observed at each valid time.
        new_obs = realizations.loc[realizations.index.isin(new_data["obs_tm"]), column] * factor
        new_data = new_data.merge(new_obs.rename("obs"), left_on = "obs_tm", right_index = True)

        frames.append(new_data)

    # Aggregate data. With no new days this is just a re-indexed copy, where
    # the previous implementation failed with a KeyError.
    data = pd.concat([data] + frames, ignore_index = True)
    data["fcst_hour"] = data["fcst_hour"].astype("int")

    if save:
        data.to_pickle("complete_data_{}".format(method))
        return None
    return data

# Combine data

In [145]:
# Zip archives with the recent hourly DWD observations: air temperature (TU)
# and wind (FF). Both file names carry the same station number, 04177.
url_temp = "https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/air_temperature/recent/stundenwerte_TU_04177_akt.zip"
url_wind = "https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/wind/recent/stundenwerte_FF_04177_akt.zip"

# Fetch and extract both archives into the default target directory.
download_and_unzip(url=url_temp)
download_and_unzip(url=url_wind)

In [147]:
# Load the historic forecast tables from historic_data/ (temperature first, wind second).
temperature, wind = get_base_data()

In [148]:
# Parse the extracted DWD product files into observation tables indexed by MESS_DATUM.
realizations_temp, realizations_wind = get_realizations()

In [150]:
# Fill missing observations from the DWD realizations. Temperature keeps
# rows that still contain NaN (dropna=False); wind drops them (dropna=True).
temperature = add_observations(
    data=temperature,
    realizations=realizations_temp,
    name="temperature",
    dropna=False,
)
wind = add_observations(
    data=wind,
    realizations=realizations_wind,
    name="wind",
    dropna=True,
)

In [156]:
# Extend the temperature table with the new forecasts and pickle the result.
add_new_data(
    data=temperature,
    realizations=realizations_temp,
    name="temperature",
    save=True,
)

In [157]:
# Extend the wind table with the new forecasts and pickle the result.
add_new_data(
    data=wind,
    realizations=realizations_wind,
    name="wind",
    save=True,
)