In [46]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date, datetime, timedelta
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import *
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
import random
from sklearn.model_selection import train_test_split
import tensorflow.keras.backend as K
from tensorflow.math import erf
from scipy.stats import norm
from sklearn.preprocessing import Normalizer,StandardScaler
from tensorflow_addons.losses import pinball_loss

In [2]:
print("TensorFlow version: {}".format(tf.__version__))
print("Eager execution: {}".format(tf.executing_eagerly()))

TensorFlow version: 2.4.1
Eager execution: True


# Prepare data

In [321]:
def get_split_data():
    """
    Load data, normalize and get data splits
    """
    data = pd.read_pickle("data/complete_data_t_2m")
    data_np = data.iloc[:,3:-2].drop("obs_tm", axis = 1).to_numpy()
    Y = data_np[:,1]
    X = np.delete(data_np, 1, axis = 1)
    train_val_data_X, test_data_X, train_val_data_Y, test_data_Y = train_test_split(X,Y, test_size = 0.1)
    train_data_X, val_data_X, train_data_Y,val_data_Y = train_test_split(train_val_data_X,train_val_data_Y, test_size = 0.2)

    #Normalize features data based on train set
    feature_normalizer = Normalizer()
    train_data_X = feature_normalizer.fit_transform(train_data_X)
    val_data_X = feature_normalizer.transform(val_data_X)
    test_data_X = feature_normalizer.transform(test_data_X)

    #Normalize target and save retransform
    target_scaler = StandardScaler()
    train_data_Y = target_scaler.fit_transform(train_data_Y.reshape(-1,1))
    val_data_Y = target_scaler.transform(val_data_Y.reshape(-1,1))
    
    return train_data_X, train_data_Y, val_data_X, val_data_Y, test_data_X, test_data_Y, feature_normalizer, target_scaler

In [301]:
train_X, train_Y, val_X, val_Y, test_X, test_Y, feature_scaler, target_scaler = get_split_data()
no_features = 40

In [302]:
quantiles = [0.025, 0.25, 0.5, 0.75, 0.975]

# Build model

In [303]:
def crps_cost_function(y_true, y_pred):
    """Compute the CRPS cost function for a normal distribution defined by
    the mean and standard deviation.
    Code inspired by Kai Polsterer (HITS).
    Args:
        y_true: True values
        y_pred: Tensor containing predictions: [mean, std]
    Returns:
        mean_crps: Scalar with mean CRPS over batch
    """

    # Split input
    mu = y_pred[:, 0]
    sigma = y_pred[:, 1]
    y_true = y_true[:, 0]   # Need to also get rid of axis 1 to match!

    # To stop sigma from becoming negative we first have to 
    # convert it the the variance and then take the square
    # root again. 
    var = K.square(sigma)
    # The following three variables are just for convenience
    loc = (y_true - mu) / K.sqrt(var)
    phi = 1.0 / np.sqrt(2.0 * np.pi) * K.exp(-K.square(loc) / 2.0)
    Phi = 0.5 * (1.0 + erf(loc / np.sqrt(2.0)))
    # First we will compute the crps for each input/target pair
    crps =  K.sqrt(var) * (loc * (2. * Phi - 1.) + 2 * phi - 1. / np.sqrt(np.pi))
    # Then we take the mean. The cost is now a scalar
    return K.mean(crps)

In [304]:
def basic_model(train_X, train_Y, no_features, n_embeddings = 65, no_outputs = 2):
    """
    trainX -- input values; shape: [number of samples, no_features]
    trainY -- output values; shape: [number of samples, 2
    """    
    inp = Input(shape = no_features+1)
    #Extract embedding features
    horizon = inp[:,0]
    features = inp[:,1:]
    
    #Embedding layer
    horizon_emb = Embedding(input_dim = n_embeddings, output_dim = 4)(horizon)
    
    #Concatenate
    conc = Concatenate(axis = 1)([features,horizon_emb])
    
    #Linear layer
    outputs = Dense(no_outputs, activation = "linear")(conc)
    model = Model(inputs = inp, outputs = outputs)
    return model

In [305]:
model = basic_model(train_X, train_Y, no_features)
model.summary()

Model: "model_18"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_22 (InputLayer)           [(None, 41)]         0                                            
__________________________________________________________________________________________________
tf.__operators__.getitem_42 (Sl (None,)              0           input_22[0][0]                   
__________________________________________________________________________________________________
tf.__operators__.getitem_43 (Sl (None, 40)           0           input_22[0][0]                   
__________________________________________________________________________________________________
embedding_21 (Embedding)        (None, 4)            260         tf.__operators__.getitem_42[0][0]
___________________________________________________________________________________________

In [306]:
BATCH_SIZE = 32
EPOCHS = 10
learning_rate = 0.01
optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)

#Early stopping
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, min_delta = 1e-5)

In [307]:
#Compile model
model.compile(optimizer = optimizer, loss = crps_cost_function)

In [313]:
model.fit(train_X, train_Y, validation_data = (val_X, val_Y), epochs = EPOCHS, shuffle = True, callbacks = [callback], verbose = True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


<tensorflow.python.keras.callbacks.History at 0x160168d11c0>

# Predict test data

In [314]:
#Get prediction
pred = model.predict(test_X)
#Retransform
pred = target_scaler.inverse_transform(pred)
#Square and root results
pred[:,1] = np.sqrt(pred[:,1]**2)
#Convert prediction to quantiles
quantile_pred = np.zeros(shape = (pred.shape[0],5))
for cnt,x in enumerate(pred):
    quantile_pred[cnt] = norm.ppf(quantiles, loc = x[0], scale = x[1])

## Evaluate data on realizations with pinball loss

In [315]:
for cnt,quantile in enumerate(quantiles):
    loss = pinball_loss(quantile_pred[:,cnt], np.squeeze(test_Y), tau = quantile).numpy()
    print("Pinball loss for quantile {} : \t {}".format(quantile,loss))

Pinball loss for quantile 0.025 : 	 34.85618806044823
Pinball loss for quantile 0.25 : 	 9.323556179734924
Pinball loss for quantile 0.5 : 	 1.765189041542072
Pinball loss for quantile 0.75 : 	 9.060524149481232
Pinball loss for quantile 0.975 : 	 34.47369501229695


## Evaluate naive forecast on test data

In [316]:
naive_pred = np.quantile(test_X[:,1:], quantiles, axis = 1)
for cnt,quantile in enumerate(quantiles):
    loss = pinball_loss(naive_pred[cnt], np.squeeze(test_Y), tau = quantile).numpy()
    print("Pinball loss for quantile {} : \t {}".format(quantile,loss))

Pinball loss for quantile 0.025 : 	 11.728580052158545
Pinball loss for quantile 0.25 : 	 9.041139497832699
Pinball loss for quantile 0.5 : 	 6.0656028602783225
Pinball loss for quantile 0.75 : 	 3.0941338855338274
Pinball loss for quantile 0.975 : 	 0.42383174183491606


# Predict new data

In [317]:
horizons = [36, 48 ,60, 72, 84]

In [318]:
def get_pred_data(name):
    if name == "temperature":
        method = "t_2m"
    elif name == "wind":
        method = "wind_mean_10m"
    else:
        print("Error")
        return None
    #Set current date
    current_date = date.today().strftime("%Y%m%d")
    path = "data/icon_data/icon-eu-eps_{}00_{}_Karlsruhe.txt".format(current_date, method)
    new_data = pd.read_csv(path.format(current_date.replace("-","")), skiprows = 3, sep = "|").dropna(axis = 1)
    new_data.columns = new_data.columns.str.replace(" ", "")
    return new_data

In [319]:
def get_final_forecast(name, horizons, feature_scaler, model):
    if name == "temperature":
        method = "t_2m"
    elif name == "wind":
        method = "wind_mean_10m"
    else:
        print("Error")
        return None
    #Get data
    data = get_pred_data(name)
    data = data[data["fcst_hour"].isin(horizons)].to_numpy()
    #Normalize
    data_pred = feature_scaler.transform(data)
    #Predict
    pred = model.predict(data_pred)
    pred = target_scaler.inverse_transform(pred)
    
    #Create final prediction dataframe
    final_prediction = pd.DataFrame(columns = ["forecast_date","target","horizon","q0.025","q0.25","q0.5","q0.75","q0.975"], index = np.arange(0,5))
    final_prediction["forecast_date"] = datetime.today().strftime("%Y-%m-%d")
    final_prediction["horizon"] = ["{} hour".format(x) for x in horizons]
    final_prediction["target"] = method
    
    #Save prediction to dataframe
    for cnt,x in enumerate(pred):
        final_prediction.loc[final_prediction["horizon"] == "{} hour".format(horizons[cnt]), final_prediction.columns[3:]] = (norm.ppf(quantiles, loc = x[0], scale = x[1]))
    
    return final_prediction

In [320]:
get_final_forecast("temperature",horizons, feature_scaler, model)

Unnamed: 0,forecast_date,target,horizon,q0.025,q0.25,q0.5,q0.75,q0.975
0,2021-10-25,t_2m,36 hour,-19.305566,5.182126,18.030846,30.879565,55.367257
1,2021-10-25,t_2m,48 hour,-24.228226,-1.15943,10.944793,23.049015,46.117811
2,2021-10-25,t_2m,60 hour,-21.260019,2.867767,15.527642,28.187518,52.315304
3,2021-10-25,t_2m,72 hour,-27.270948,-4.69987,7.1432,18.98627,41.557348
4,2021-10-25,t_2m,84 hour,-23.162582,0.283384,12.585508,24.887633,48.333599
