In [2]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date, datetime, timedelta
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
import random
from sklearn.model_selection import train_test_split
import tensorflow.keras.backend as K
from scipy.stats import norm
from sklearn.preprocessing import Normalizer,StandardScaler, LabelEncoder
from tensorflow_addons.losses import pinball_loss
from sklearn.metrics import mean_pinball_loss

# Read and preprocess data

In [3]:
data = pd.read_feather("data/berlin_data/historic_data/icon_eps_wind_10m.feather")
data.head()

Unnamed: 0,init_tm,met_var,location,fcst_hour,obs_tm,obs,ens_1,ens_2,ens_3,ens_4,...,ens_33,ens_34,ens_35,ens_36,ens_37,ens_38,ens_39,ens_40,ens_mean,ens_var
0,2018-12-18 00:00:00+00:00,wind_10m,Berlin,0.0,2018-12-18 00:00:00+00:00,6.48,3.8,6.56,4.54,5.05,...,5.59,5.45,5.3,4.47,5.99,3.48,4.92,5.09,4.58675,0.565448
1,2018-12-18 00:00:00+00:00,wind_10m,Berlin,1.0,2018-12-18 01:00:00+00:00,6.12,3.68,7.03,5.06,5.33,...,4.92,5.18,4.98,4.88,6.39,3.74,5.18,4.85,4.6975,0.663747
2,2018-12-18 00:00:00+00:00,wind_10m,Berlin,2.0,2018-12-18 02:00:00+00:00,4.32,3.28,7.1,5.39,5.44,...,4.91,4.88,5.2,4.8,6.66,4.14,5.05,4.8,4.8165,0.8301
3,2018-12-18 00:00:00+00:00,wind_10m,Berlin,3.0,2018-12-18 03:00:00+00:00,5.04,3.47,7.45,6.11,5.66,...,5.37,4.58,5.3,4.86,6.96,4.41,5.26,4.74,5.01625,1.222111
4,2018-12-18 00:00:00+00:00,wind_10m,Berlin,4.0,2018-12-18 04:00:00+00:00,6.48,3.56,8.02,6.29,5.81,...,5.3,4.05,5.04,5.26,6.67,4.86,4.95,4.56,5.016,1.35505


In [4]:
quantiles = [0.025, 0.25, 0.5, 0.75, 0.975]
horizons = [36, 48 ,60, 72, 84]

## Change data format

In [5]:
def prepare_data(data):
    data_prep = data.dropna().copy()
    data_prep.drop(["init_tm", "met_var", "location", "ens_mean", "ens_var", "obs_tm"], axis = 1, inplace = True)
    return data_prep.to_numpy()

In [6]:
data_np = prepare_data(data)

## Train, val, test split

In [7]:
train_val, test = train_test_split(data_np, test_size = 0.2)
train, val = train_test_split(train_val, test_size = 0.2)

### Normalize

In [8]:
def normalize(data, feature_scaler = None, target_scaler = None, learn = False):
    if learn == True:
        feature_scaler = StandardScaler()
        target_scaler = StandardScaler()
        #Learn target scaling
        target_scaled = target_scaler.fit_transform(data[:,1].reshape(-1,1))
        #Learn feature scaling
        feature_scaled = feature_scaler.fit_transform(data[:,2:])
        #Append
        data[:,1] = target_scaled.reshape(-1)
        data[:,2:] = feature_scaled
        
        return data, feature_scaler, target_scaler
    
    else:
        #Scale target
        target_scaled = target_scaler.transform(data[:,1].reshape(-1,1))
        #Scale features
        feature_scaled = feature_scaler.transform(data[:,2:])
        #Append
        data[:,1] = target_scaled.reshape(-1)
        data[:,2:] = feature_scaled
        
        return data

In [9]:
train, feature_scaler, target_scaler = normalize(train, learn = True)
train_val = normalize(train_val, feature_scaler, target_scaler)
test = normalize(test, feature_scaler, target_scaler)
val = normalize(val, feature_scaler, target_scaler)

In [10]:
def convert_format(input_data, predict = False):
    #Extract forecast embedding
    horizon_emb = input_data[:,0]
    
    if predict == False:        
        #Extract features
        features = input_data[:,2:]
        # Extract target
        target = np.expand_dims(input_data[:,1],1)
        return [features, horizon_emb], target
    else:
        #Extract features
        features = input_data[:,1:]
        return [features, horizon_emb]

In [11]:
train_data, train_target = convert_format(train)
val_data, val_target = convert_format(val)
test_data, test_target = convert_format(test)

# Create Model

In [12]:
BATCH_SIZE = 512
EPOCHS = 100
learning_rate = 0.01

In [13]:
class base_model(tf.keras.Model):    
    def __init__(self, n_embeddings = 121):
        super(base_model, self).__init__()
        #Embedding layers
        self.embedding = Embedding(input_dim = n_embeddings, output_dim = 4)
        #Create Dense layers
        self.hidden = Dense(25, activation = "relu")
        self.out = Dense(1, activation = "linear")

    def call(self, input_data):
        #Extract data
        features, horizon_emb = input_data
        #Calculate embedding
        emb = self.embedding(horizon_emb)
        emb = tf.squeeze(emb, axis = 1)
        conc = Concatenate(axis = 1)([features, emb])
        #Calculate output
        output = self.hidden(conc)
        output = self.out(output)

        return output

In [14]:
def train_model(model, quantile, train_data, train_target, validation_data, batch_size, epochs, learning_rate):
    #Define optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)
    #Early stopping
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience = 5, min_delta = 1e-5)
    #Compile model
    model.compile(optimizer = optimizer, loss = lambda true,pred: pinball_loss(true, pred, tau = quantile))
    model.fit(x = train_data, y = train_target, validation_data = validation_data, epochs = epochs, batch_size = batch_size, callbacks = [callback], shuffle = True, verbose = False)

In [15]:
def create_all_models(train_data, train_target, validation_data, batch_size = BATCH_SIZE, epochs = EPOCHS, learning_rate = learning_rate, quantiles = quantiles):
    models = []
    for quantile in quantiles:
        model = base_model()
        train_model(model, quantile, train_data, train_target, validation_data, batch_size, epochs, learning_rate)
        print("Training finished for quantile: {}".format(quantile))
        models.append(model)
    return models

In [16]:
trained_models = create_all_models(train_data, train_target, (val_data, val_target))

KeyboardInterrupt: 

# Predict test data

## Evaluate pinball loss

In [None]:
predictions = []
for cnt,_ in enumerate(quantiles):
    pred = trained_models[cnt].predict(test_data)
    predictions.append(pred)

In [None]:
for cnt,quantile in enumerate(quantiles):
    loss = mean_pinball_loss(test_target.reshape(-1), predictions[cnt].reshape(-1), alpha = quantile)
    print("Pinball loss for quantile {} : \t {}".format(quantile,loss))

## Evaluate pinball loss on naive prediction

In [None]:
naive_pred = np.quantile(test_data[0], quantiles, axis = 1)
for cnt,quantile in enumerate(quantiles):
    loss = mean_pinball_loss(np.squeeze(test_target), naive_pred[cnt], alpha = quantile)
    print("Pinball loss for quantile {} : \t {}".format(quantile,loss))

## Check plausability of model

In [19]:
for cnt,pred in enumerate(predictions):
    q_smaller = (pred > test_target).sum()
    emp_quant = q_smaller / pred.size
    print("Quantile met for quantile = {}: \t {} %".format(quantiles[cnt], np.round(emp_quant,4)*100))

Quantile met for quantile = 0.025: 	 3.6700000000000004 %
Quantile met for quantile = 0.25: 	 26.07 %
Quantile met for quantile = 0.5: 	 49.51 %
Quantile met for quantile = 0.75: 	 76.27000000000001 %
Quantile met for quantile = 0.975: 	 97.49 %


# Predict new data

## Train on complete data without test set

In [17]:
train_data, train_target = convert_format(train_val)
val_data, val_target = convert_format(test)

In [18]:
trained_models = create_all_models(train_data, train_target, (val_data, val_target))

Training finished for quantile: 0.025
Training finished for quantile: 0.25
Training finished for quantile: 0.5
Training finished for quantile: 0.75
Training finished for quantile: 0.975


## Predict new data

In [19]:
#Set current date
current_date = date.today().strftime("%Y%m%d")
path = "data/berlin_data/icon_data/icon-eu-eps_{}00_wind_mean_10m_Berlin.txt".format(current_date)
new_data = pd.read_csv(path.format(current_date.replace("-","")), skiprows = 3, sep = "|").dropna(axis = 1)
new_data.columns = new_data.columns.str.replace(" ", "")
# Normalize and get horizons
new_data = new_data[new_data["fcst_hour"].isin(horizons)].to_numpy()
new_data[:,1:] = feature_scaler.transform(new_data[:,1:])

In [20]:
pred_data = convert_format(new_data, predict = True)

In [21]:
#Prepare dataframe
final_prediction = pd.DataFrame(columns = ["forecast_date","target","horizon","q0.025","q0.25","q0.5","q0.75","q0.975"], index = np.arange(0,5))
final_prediction["forecast_date"] = datetime.today().strftime("%Y-%m-%d")
final_prediction["horizon"] = ["{} hour".format(x) for x in horizons]
final_prediction["target"] = "wind"

In [22]:
for cnt, quantile in enumerate(quantiles):
    #Get prediction
    prediction = trained_models[cnt].predict(pred_data)
    #Retransform
    final_pred = target_scaler.inverse_transform(prediction)
    final_prediction.loc[:,"q{}".format(quantile)] = final_pred



In [23]:
final_prediction

Unnamed: 0,forecast_date,target,horizon,q0.025,q0.25,q0.5,q0.75,q0.975
0,2021-11-10,wind,36 hour,4.560685,7.889566,8.939392,10.100446,15.258412
1,2021-11-10,wind,48 hour,0.949883,3.039131,4.245919,5.383017,12.193696
2,2021-11-10,wind,60 hour,5.524999,9.14992,11.163344,13.3033,19.850378
3,2021-11-10,wind,72 hour,3.933633,8.213113,10.517086,13.155072,19.315844
4,2021-11-10,wind,84 hour,4.841019,9.06434,11.61726,13.631474,22.599207


In [24]:
new_data = pd.read_csv(path.format(current_date.replace("-","")), skiprows = 3, sep = "|").dropna(axis = 1)
new_data.columns = new_data.columns.str.replace(" ", "")
new_data[new_data["fcst_hour"].isin(horizons)].quantile(0.5, axis = 1)

36    7.11
48    4.01
52    8.21
56    8.32
58    9.00
Name: 0.5, dtype: float64

In [27]:
final_prediction.to_pickle("../evaluation/predictions/single/{}_{}".format("wind", date.today().strftime("%Y-%m-%d")))