In [578]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date, datetime, timedelta
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import *
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
import random
from sklearn.model_selection import train_test_split
import tensorflow.keras.backend as K
from tensorflow.math import erf
from scipy.stats import norm
from sklearn.preprocessing import Normalizer,StandardScaler, LabelEncoder
from tensorflow_addons.losses import pinball_loss

In [579]:
tf.test.is_gpu_available()

True

# Prepare data

In [580]:
def get_split_data():
    """
    Load data, normalize and get data splits
    """
    data = pd.read_pickle("data/complete_data_t_2m")
    data = data[data["fcst_hour"].isin(horizons)]
    data_np = data.iloc[:,3:-2].drop("obs_tm", axis = 1).to_numpy()
    #Create label encoding for embedding
    label_enc = LabelEncoder()
    encoding = label_enc.fit_transform(data_np[:,0])
    data_np[:,0] = encoding
    
    Y = data_np[:,1]
    X = np.delete(data_np, 1, axis = 1)
    train_val_data_X, test_data_X, train_val_data_Y, test_data_Y = train_test_split(X,Y, test_size = 0.1)
    train_data_X, val_data_X, train_data_Y,val_data_Y = train_test_split(train_val_data_X,train_val_data_Y, test_size = 0.2)

    #Normalize features data based on train set
    feature_normalizer = Normalizer()
    emb = train_data_X[:,0]
    train_data_X = feature_normalizer.fit_transform(train_data_X)
    train_data_X[:,0] = emb
    
    emb = val_data_X[:,0]
    val_data_X = feature_normalizer.transform(val_data_X)
    val_data_X[:,0] = emb
    
    emb = test_data_X[:,0]
    test_data_X = feature_normalizer.transform(test_data_X)
    test_data_X[:,0] = emb
    

    #Normalize target and save retransform
    target_scaler = StandardScaler()
    train_data_Y = target_scaler.fit_transform(train_data_Y.reshape(-1,1))
    val_data_Y = target_scaler.transform(val_data_Y.reshape(-1,1))
    
    return train_data_X, train_data_Y, val_data_X, val_data_Y, test_data_X, test_data_Y, feature_normalizer, target_scaler, label_enc

In [581]:
train_X, train_Y, val_X, val_Y, test_X, test_Y, feature_scaler, target_scaler, label_encoder = get_split_data()
no_features = 40

In [582]:
quantiles = [0.025, 0.25, 0.5, 0.75, 0.975]

# Build model

In [583]:
def crps_cost_function(y_true, y_pred):
    """Compute the CRPS cost function for a normal distribution defined by
    the mean and standard deviation.
    Code inspired by Kai Polsterer (HITS).
    Args:
        y_true: True values
        y_pred: Tensor containing predictions: [mean, std]
    Returns:
        mean_crps: Scalar with mean CRPS over batch
    """

    # Split input
    mu = y_pred[:, 0]
    sigma = y_pred[:, 1]
    y_true = y_true[:, 0]   # Need to also get rid of axis 1 to match!

    # To stop sigma from becoming negative we first have to 
    # convert it the the variance and then take the square
    # root again. 
    var = K.square(sigma)
    # The following three variables are just for convenience
    loc = (y_true - mu) / K.sqrt(var)
    phi = 1.0 / np.sqrt(2.0 * np.pi) * K.exp(-K.square(loc) / 2.0)
    Phi = 0.5 * (1.0 + erf(loc / np.sqrt(2.0)))
    # First we will compute the crps for each input/target pair
    crps =  K.sqrt(var) * (loc * (2. * Phi - 1.) + 2 * phi - 1. / np.sqrt(np.pi))
    # Then we take the mean. The cost is now a scalar
    return K.mean(crps)

In [584]:
def basic_model(train_X, train_Y, no_features, n_embeddings = 65, no_outputs = 2):
    """
    trainX -- input values; shape: [number of samples, no_features]
    trainY -- output values; shape: [number of samples, 2
    """    
    inp = Input(shape = no_features+1)
    #Extract embedding features
    horizon = inp[:,0]
    features = inp[:,1:]
    
    #Embedding layer
    horizon_emb = Embedding(input_dim = n_embeddings, output_dim = 6)(horizon)
    
    #Concatenate
    conc = Concatenate(axis = 1)([features,horizon_emb])
    
    #Hidden layer
    #hidden = Dense(30, activation = "relu")(conc)
    
    #Linear layer
    outputs = Dense(no_outputs, activation = "linear")(conc)
    model = Model(inputs = inp, outputs = outputs)
    return model

In [585]:
model = basic_model(train_X, train_Y, no_features)
model.summary()

Model: "model_30"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_34 (InputLayer)           [(None, 41)]         0                                            
__________________________________________________________________________________________________
tf.__operators__.getitem_66 (Sl (None,)              0           input_34[0][0]                   
__________________________________________________________________________________________________
tf.__operators__.getitem_67 (Sl (None, 40)           0           input_34[0][0]                   
__________________________________________________________________________________________________
embedding_33 (Embedding)        (None, 6)            390         tf.__operators__.getitem_66[0][0]
___________________________________________________________________________________________

In [586]:
def train_model(model, train_X, train_Y, val_X, val_Y, no_features, batch_size, epochs, learning_rate):
    #Define optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)
    #Early stopping
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, min_delta = 1e-5)
    #Compile model
    model.compile(optimizer = optimizer, loss = crps_cost_function)
    model.fit(train_X, train_Y, validation_data = (val_X, val_Y), epochs = EPOCHS, shuffle = True, callbacks = [callback], verbose = True)

In [587]:
BATCH_SIZE = 32
EPOCHS = 20
learning_rate = 0.01

In [588]:
train_model(model, train_X, train_Y, val_X, val_Y, no_features, BATCH_SIZE, EPOCHS, learning_rate)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


# Predict test data

In [589]:
#Get prediction
pred = model.predict(test_X)
#Retransform
pred = target_scaler.inverse_transform(pred)
#Square and root results
pred[:,1] = np.sqrt(pred[:,1]**2)
#Convert prediction to quantiles
quantile_pred = np.zeros(shape = (pred.shape[0],5))
for cnt,x in enumerate(pred):
    quantile_pred[cnt] = norm.ppf(quantiles, loc = x[0], scale = x[1])

## Evaluate data on realizations with pinball loss

In [590]:
for cnt,quantile in enumerate(quantiles):
    loss = pinball_loss(quantile_pred[:,cnt], np.squeeze(test_Y), tau = quantile).numpy()
    print("Pinball loss for quantile {} : \t {}".format(quantile,loss))

Pinball loss for quantile 0.025 : 	 9.553150710019251
Pinball loss for quantile 0.25 : 	 4.082893291924755
Pinball loss for quantile 0.5 : 	 2.965075153655866
Pinball loss for quantile 0.75 : 	 4.053272834190417
Pinball loss for quantile 0.975 : 	 9.42747440713632


## Evaluate naive forecast on test data

In [591]:
naive_pred = np.quantile(test_X[:,1:], quantiles, axis = 1)
for cnt,quantile in enumerate(quantiles):
    loss = pinball_loss(naive_pred[cnt], np.squeeze(test_Y), tau = quantile).numpy()
    print("Pinball loss for quantile {} : \t {}".format(quantile,loss))

Pinball loss for quantile 0.025 : 	 12.85571214256123
Pinball loss for quantile 0.25 : 	 9.88601912661762
Pinball loss for quantile 0.5 : 	 6.606105256502969
Pinball loss for quantile 0.75 : 	 3.334102414584358
Pinball loss for quantile 0.975 : 	 0.39872598266587644


# Predict new data

In [592]:
horizons = [36, 48 ,60, 72, 84]

In [593]:
def get_pred_data(name):
    if name == "temperature":
        method = "t_2m"
    elif name == "wind":
        method = "wind_mean_10m"
    else:
        print("Error")
        return None
    #Set current date
    current_date = date.today().strftime("%Y%m%d")
    path = "data/icon_data/icon-eu-eps_{}00_{}_Karlsruhe.txt".format(current_date, method)
    new_data = pd.read_csv(path.format(current_date.replace("-","")), skiprows = 3, sep = "|").dropna(axis = 1)
    new_data.columns = new_data.columns.str.replace(" ", "")
    return new_data

In [594]:
def get_final_forecast(name, horizons, feature_scaler, label_encoder, model, save = False):
    if name == "temperature":
        method = "t_2m"
    elif name == "wind":
        method = "wind_mean_10m"
    else:
        print("Error")
        return None
    #Get data
    data = get_pred_data(name)
    data = data[data["fcst_hour"].isin(horizons)].to_numpy()
    #Label encoding
    encoding = label_encoder.transform(data[:,0])
    data[:,0] = encoding
    #Normalize
    data_pred = feature_scaler.transform(data)
    #Predict
    pred = model.predict(data_pred)
    pred = target_scaler.inverse_transform(pred)
    
    #Create final prediction dataframe
    final_prediction = pd.DataFrame(columns = ["forecast_date","target","horizon","q0.025","q0.25","q0.5","q0.75","q0.975"], index = np.arange(0,5))
    final_prediction["forecast_date"] = datetime.today().strftime("%Y-%m-%d")
    final_prediction["horizon"] = ["{} hour".format(x) for x in horizons]
    final_prediction["target"] = method
    
    #Save prediction to dataframe
    for cnt,x in enumerate(pred):
        final_prediction.loc[final_prediction["horizon"] == "{} hour".format(horizons[cnt]), final_prediction.columns[3:]] = (norm.ppf(quantiles, loc = x[0], scale = x[1]))
        
    #Save prediction
    if save == True:
        final_prediction.to_pickle("../evaluation/predictions/single/{}_{}".format(name, date.today().strftime("%Y-%m-%d")))
    
    return final_prediction

In [595]:
get_final_forecast("temperature",horizons, feature_scaler, label_encoder, model, save = False)

Unnamed: 0,forecast_date,target,horizon,q0.025,q0.25,q0.5,q0.75,q0.975
0,2021-10-25,t_2m,36 hour,9.37708,13.533568,15.714482,17.895396,22.051885
1,2021-10-25,t_2m,48 hour,9.165731,13.284482,15.445595,17.606708,21.725459
2,2021-10-25,t_2m,60 hour,9.386095,13.553684,15.740422,17.927161,22.09475
3,2021-10-25,t_2m,72 hour,9.122155,13.357952,15.58048,17.803007,22.038804
4,2021-10-25,t_2m,84 hour,9.340326,13.535341,15.736469,17.937598,22.132612
