In [None]:
from pathlib import Path
import pandas as pd
import cufflinks
cufflinks.go_offline()
import matplotlib.pyplot as plt

from datetime import datetime
import os
import yaml

from openstef.pipeline.train_create_forecast_backtest import train_model_and_forecast_back_test
from openstef.metrics.figure import plot_feature_importance
from openstef.data_classes.model_specifications import ModelSpecificationDataClass
from openstef.data_classes.prediction_job import PredictionJobDataClass


# Sum of forecasts or forecast the sum?
Decisions are seldom based on forecasts for single sensors. Therefore, combining either the sensor data or the forecasts of these time series is necessary to obtain forecasts for the quantity of interest. Various strategies can be employed to achieve this, each with its own advantages and disadvantages. Here, we compare two of these strategies: 1) take the sum of the sensors and forecast this sum, and 2) forecast each sensor individually and sum the resulting forecasts.

For two different substations, Middenmeer and Oosterwolde, we make forecasts for each individual sensor and for the sum of the sensors with the openSTEF backtest pipeline. Because of the stochastic nature of the backtest, we repeat it 10 times to acquire some statistics. We compare the resulting forecasts in terms of MAE, rMAE and rMAE for the lowest 5% of the values.


In [None]:
# Load data Middenmeer
filename = Path("./input/Middenmeer-150kV.csv")

# The files are semicolon-separated and use a decimal comma
data_wop = pd.read_csv(filename, delimiter=";", decimal=",")
data_wop["Datetime"] = pd.to_datetime(data_wop["Datum"] + " " + data_wop["Tijd"])
# Interpret the timestamps as CET and convert to UTC; ambiguous or non-existent
# local times (DST transitions) become NaT instead of raising.
data_wop = data_wop.set_index('Datetime').tz_localize('CET', ambiguous='NaT', nonexistent='NaT').tz_convert("UTC")

# Keep only the measurement columns: skip the first two columns and the last one
# (presumably "Datum"/"Tijd" and a trailing non-sensor column -- verify against the CSV).
# .copy() makes this an independent frame, so adding "Sum" below does not write
# into a slice of data_wop (avoids pandas' SettingWithCopyWarning).
data_150_kv = data_wop.iloc[:, 2:-1].copy()

data_150_kv["Sum"] = data_150_kv.sum(axis=1)

# Load data Oosterwolde
filename = Path("./input/Oosterwolde-10kV.csv")
data_wop = pd.read_csv(filename, delimiter=";", decimal=",")
# NOTE(review): the date column is addressed as " Datum" (leading space) here,
# presumably matching the header in this CSV -- verify against the file.
data_wop["Datetime"] = pd.to_datetime(data_wop[" Datum"] + " " + data_wop["Tijd"])
data_wop = data_wop.set_index('Datetime').tz_localize('CET', ambiguous='NaT', nonexistent='NaT').tz_convert("UTC")

# NOTE(review): the variable says 20 kV while the file name says 10 kV -- confirm which is right.
data_20_kv_oosterwolde = data_wop.iloc[:, 2:-1].copy()
data_20_kv_oosterwolde["Sum"] = data_20_kv_oosterwolde.sum(axis=1)

# Flip the sign of every column (including "Sum")
data_20_kv_oosterwolde = data_20_kv_oosterwolde * -1
# Load predictors
predictors = pd.read_csv('./input/predictors.csv', index_col=0, parse_dates=True)


In [None]:
# Correlation matrix of all Middenmeer columns (individual sensors and their sum).
# Title, tick labels and a colorbar label make the figure readable on its own.
corr_150_kv = data_150_kv.corr()
plt.matshow(corr_150_kv)
plt.xticks(range(len(corr_150_kv.columns)), corr_150_kv.columns, rotation=90)
plt.yticks(range(len(corr_150_kv.columns)), corr_150_kv.columns)
cb = plt.colorbar()
cb.set_label("Correlation")
plt.title("Middenmeer: correlation between sensors")
plt.show()

In [None]:
# Interactive time-series plot of every Middenmeer column
data_150_kv.iplot(
    title="Middenmeer",
    xTitle="Datetime",
    yTitle="Power [MW]",
)

In [None]:
# Correlation matrix of all Oosterwolde columns (individual sensors and their sum).
# Title, tick labels and a colorbar label make the figure readable on its own.
corr_oosterwolde = data_20_kv_oosterwolde.corr()
plt.matshow(corr_oosterwolde)
plt.xticks(range(len(corr_oosterwolde.columns)), corr_oosterwolde.columns, rotation=90)
plt.yticks(range(len(corr_oosterwolde.columns)), corr_oosterwolde.columns)
cb = plt.colorbar()
cb.set_label("Correlation")
plt.title("Oosterwolde: correlation between sensors")
plt.show()


In [None]:
# Interactive time-series plot of every Oosterwolde column
data_20_kv_oosterwolde.iplot(
    title="Oosterwolde",
    xTitle="Datetime",
    yTitle="Power [MW]",
)

In [None]:
# Run the backtest several times per substation to obtain enough samples.
N_SAMPLES = 10  # number of repetitions of the backtest
EVALUATION_HORIZON = 24.00  # lead time of the forecasts that are compared (one of the training horizons)

# The keys double as output directory names. They match the paths read when the
# results are compared (f"./{substation}/results_trial_{sample}.csv"), including
# the spelling "oostterwolde". Oosterwolde is processed last, so `results` and
# `res` hold its final trial once the cell has finished.
substation_data = {
    "middenmeer": data_150_kv,
    "oostterwolde": data_20_kv_oosterwolde,
}


def sum_individual_forecasts(results, horizon):
    """Sum the forecasts of all individual sensors at the given horizon.

    Args:
        results: dict mapping column name -> backtest forecast DataFrame with at
            least the columns "horizon" and "forecast". The entry "Sum" (the
            forecast of the summed signal) is skipped.
        horizon: value of the "horizon" column to select.

    Returns:
        DataFrame with a single column "Sum_Forecast", indexed like the forecast
        of the first individual sensor.

    Raises:
        ValueError: if `results` holds no individual sensor forecasts.
    """
    sensor_forecasts = [
        forecast.loc[forecast["horizon"] == horizon, "forecast"].rename(name)
        for name, forecast in results.items()
        if name != "Sum"
    ]
    if not sensor_forecasts:
        raise ValueError("No individual sensor forecasts available to sum.")

    # Missing forecasts count as 0 for *every* sensor. Previously the first
    # sensor kept its NaNs while all others were filled, so the outcome
    # depended on the column order.
    total = sensor_forecasts[0].fillna(0)
    for sensor_forecast in sensor_forecasts[1:]:
        total += sensor_forecast.fillna(0)
    return total.rename("Sum_Forecast").to_frame()


for substation, load_data in substation_data.items():
    # Results are stored per substation so trials of different substations do not overwrite each other
    output_dir = Path(substation)
    output_dir.mkdir(parents=True, exist_ok=True)

    for sample in range(N_SAMPLES):
        results = {}
        for target_column in load_data.columns:
            # Define properties of training/prediction. We call this a 'prediction_job'.
            # A fresh job and model specification are created for every run, as before.
            pj = PredictionJobDataClass(
                id=1,
                name='TestPrediction',
                model='xgb',
                quantiles=[0.10, 0.30, 0.50, 0.70, 0.90],
                horizon_minutes=24*60,
                resolution_minutes=15,

                forecast_type="demand",  # Note, this should become optional
                lat=1,  # should become optional
                lon=1,  # should become optional
            )

            training_horizons = [0.25, 47.0, 24.0]

            # Make backtest using a single model for all lead times
            # Define backtest specs
            backtest_specs = dict(n_folds=3, training_horizons=training_horizons)
            modelspecs = ModelSpecificationDataClass(id=pj['id'])

            # Specify input data: the current target column as 'load', joined with the predictors
            input_data = pd.DataFrame(dict(load=load_data.loc[:, target_column])).merge(predictors, left_index=True, right_index=True)
            # Also resample to fix overlapping indices ('15min' is the non-deprecated spelling of '15T')
            input_data = input_data.resample('15min').mean()

            # Perform the backtest
            forecast_single_model, model_single_model, train_data, validation_data, test_data = train_model_and_forecast_back_test(
                pj,
                modelspecs=modelspecs,
                input_data=input_data,
                **backtest_specs,
            )

            results[target_column] = forecast_single_model

        # Strategy 2: sum of the individual forecasts
        res = sum_individual_forecasts(results, EVALUATION_HORIZON)

        # Strategy 1: forecast of the sum, together with the realised sum
        sum_at_horizon = results["Sum"][results["Sum"]["horizon"] == EVALUATION_HORIZON]
        res["Realised"] = sum_at_horizon["realised"]
        res["Forecast_Sum"] = sum_at_horizon["forecast"]

        res.to_csv(output_dir / f"results_trial_{sample}.csv")

# Results & Conclusion
For Oosterwolde we see a clear improvement in forecast quality, in terms of the overall relative mean absolute error, when forecasting the sum instead of summing the individual forecasts. This is not surprising: the forecast of the sum benefits from averaging, and irregular fluctuations in the individual sensor traces are small compared to the total.

When we look at the results for Middenmeer, this effect is surprisingly not visible. The forecast quality of both strategies is roughly the same, or slightly better for the sum of the individual forecasts. To find the exact cause of this we need to compare data from more substations. Another idea is to repeat this exercise for forecasts of individual customers.


In [None]:
# Compare results: per substation, compute rMAE and rMAE_lowest of both strategies
# for every stored trial and show the spread over the trials as box plots.
import openstef.metrics.metrics as metrics

strategies = ["Sum_Forecast", "Forecast_Sum"]

for substation in ["middenmeer", "oostterwolde"]:
    trial_scores = []
    for sample in range(10):
        res = pd.read_csv(f"./{substation}/results_trial_{sample}.csv", parse_dates=True, index_col=0)

        # One column per strategy, one row per metric
        res_metrics_df = pd.DataFrame(
            {
                strategy: [
                    metrics.r_mae_lowest(res["Realised"], res[strategy]),
                    metrics.r_mae(res["Realised"], res[strategy]),
                ]
                for strategy in strategies
            },
            index=["rMAE_lowest", "rMAE"],
        )
        trial_scores.append(res_metrics_df)

    # Stack all trials; the metric name ends up in the "index" column
    results = pd.concat(trial_scores).reset_index()

    for metric_name in ["rMAE", "rMAE_lowest"]:
        metric_rows = results[results["index"] == metric_name]
        metric_rows[strategies].iplot(kind="box", title=substation, yTitle=metric_name)
    