In [None]:
import pandas as pd
import cufflinks
cufflinks.go_offline()

from pathlib import Path
from datetime import datetime, date
import os
import yaml
import numpy as np
import math
import matplotlib.pyplot as plt 

from openstef.pipeline.train_create_forecast_backtest import train_model_and_forecast_back_test
from openstef.metrics.figure import plot_feature_importance
from openstef.data_classes.model_specifications import ModelSpecificationDataClass
from openstef.data_classes.prediction_job import PredictionJobDataClass

# NOTE(review): os.chdir('.') changes to the *current* working directory, so it
# leaves the working directory unchanged. The relative paths used in this notebook
# are therefore resolved against the directory the kernel was started in.
os.chdir('.')


In [None]:
# Load inputs
filename = Path("../.data/Middenmeer-150kV.csv")

# The CSV uses ';' as separator and ',' as decimal mark; date and time are
# stored in the separate columns "Datum" and "Tijd".
raw_measurements = pd.read_csv(filename, delimiter=";", decimal=",")
local_timestamps = pd.to_datetime(
    raw_measurements["Datum"] + " " + raw_measurements["Tijd"]
)

measurements = (
    raw_measurements
    .assign(Datetime=local_timestamps)
    .set_index("Datetime")
    # Timestamps are interpreted as CET; ambiguous or non-existent ones become NaT
    .tz_localize("CET", ambiguous="NaT", nonexistent="NaT")
    .tz_convert("UTC")
    # Only keep relevant columns: drop the first two and the last column
    .iloc[:, 2:-1]
)

# Sum the load over all remaining columns
measurements["Total"] = measurements.sum(axis=1)

# By default, only a backtest is made for the total
target_column = "Total"

measurements.iplot()

In [None]:
# Load predictors: the first CSV column is used as the index and parsed as dates
predictors = pd.read_csv(
    '../.data/predictors.csv',
    parse_dates=True,
    index_col=0,
)
# Preview the first rows
predictors.head(n=5)

In [None]:
# Weather forecast read from the "d2" parquet file: derive the 10 m wind speed and
# wind direction from the u10/v10 components and upsample to a 15-minute resolution.
dwd_d_2 = (
    pd.read_parquet("../.data/df_dwd_iconeu_mdm150_d2.parquet")
    .assign(
        wind_speed_10=lambda df: np.sqrt(df["u10"] ** 2 + df["v10"] ** 2),
        # arctan2 only depends on the ratio/signs of its arguments, so normalising
        # u10/v10 by the wind speed is unnecessary; dropping that division also
        # avoids 0/0 = NaN directions when the wind speed is exactly zero.
        wind_direction_10=lambda df: (
            np.degrees(np.arctan2(df["u10"], df["v10"])) + 180
        ),
    )
    # "15min" is the non-deprecated spelling of the former "15T" alias
    .resample("15min")
    # NOTE(review): linear interpolation does not respect the 0/360 degree wrap of
    # wind_direction_10; that column is not used in the cells shown in this notebook.
    .interpolate(method="linear")
)
# Not the last expression of the cell, so this preview is not rendered by Jupyter
dwd_d_2.head()

# ERA5 wind speeds: index is localized to UTC, upsampled to 15 minutes, and every
# column gets an "_era5" suffix so it can be told apart from the other sources.
df_era5 = (
    pd.read_parquet("../.data/windspeed_100m_era5_MDM.parquet")
    .tz_localize("UTC")
    .resample("15min")
    .interpolate(method="linear")
    .add_suffix("_era5")
)

In [None]:
# %% Properties of the training/prediction task. We call this a 'prediction_job'
prediction_job_settings = {
    "id": 1,
    "name": "TestPrediction",
    "model": "xgb",
    "quantiles": [0.10, 0.30, 0.50, 0.70, 0.90],
    "horizon_minutes": 24 * 60,
    "resolution_minutes": 15,
    "forecast_type": "demand",  # Note, this should become optional
    "lat": 1,  # should become optional
    "lon": 1,  # should become optional
    # Not passed for now (all of these should become optional):
    #   train_components=False
    #   model_type_group=None
    #   hyper_params={}
    #   feature_names=None
}
pj = PredictionJobDataClass(**prediction_job_settings)

# Backtest settings, forwarded as keyword arguments to the backtest pipeline
backtest_specs = {"n_folds": 3, "training_horizons": [47.0]}

# Model specifications share the id of the prediction job
modelspecs = ModelSpecificationDataClass(id=pj["id"])

# Name of the last column of the measurements; this column is used as the load
loadname = measurements.columns[-1]

# Specify input data: the load is the last column of the measurements (`loadname`),
# joined on the index with the predictors (default inner merge: only timestamps
# present in both frames are kept).
input_data = (
    measurements.loc[:, loadname]
    .rename("load")
    .to_frame()
    .merge(predictors, left_index=True, right_index=True)
    # Also resample to fix overlapping indices.
    # "15min" is the non-deprecated spelling of the former "15T" alias.
    .resample("15min")
    .mean()
    # The wind speed column is dropped here; each experiment adds its own 'windspeed'
    .drop(columns="windspeed_100m")
)


In [None]:
# Two variants of the input data, each extended with a 'windspeed' column from a
# different source. The values are aligned on the index of `input_data`.
input_data_d2 = input_data.assign(windspeed=dwd_d_2['wind_speed_10'])
input_data_era5 = input_data.assign(windspeed=df_era5['ws_10m_era5'])

In [None]:
# %% Perform the backtest
def _backtest(data):
    """Run the backtest pipeline on `data` using the shared `pj`, `modelspecs` and `backtest_specs`."""
    return train_model_and_forecast_back_test(
        pj,
        modelspecs=modelspecs,
        input_data=data,
        **backtest_specs,
    )


# Baseline input data
forecast, model, train_data, validation_data, test_data = _backtest(input_data)

# Input data extended with the 'windspeed' column taken from dwd_d_2
(
    forecast_d2,
    model_d2,
    train_data_d2,
    validation_data_d2,
    test_data_d2,
) = _backtest(input_data_d2)

# Input data extended with the 'windspeed' column taken from df_era5
(
    forecast_era5,
    model_era5,
    train_data_era5,
    validation_data_era5,
    test_data_era5,
) = _backtest(input_data_era5)

# If n_folds>1, model is a list of models. In that case, only use the first model
uses_multiple_folds = backtest_specs["n_folds"] > 1
if uses_multiple_folds:
    model, model_d2, model_era5 = model[0], model_d2[0], model_era5[0]

# Evaluate results

In [None]:
# Kept for backward compatibility with the original cell; nothing is stored in it here
figs=dict(timeseries=dict())

# Columns shown in every per-horizon figure
forecast_plot_columns = ['quantile_P10','quantile_P30',
                         'quantile_P50','quantile_P70','quantile_P90','realised','forecast']


def show_forecast_per_horizon(forecast_df, source_label):
    """Show one timeseries figure per forecast horizon.

    Each figure contains the quantile traces (thin green lines, shaded between
    consecutive traces), the realised values (red) and the forecast (blue).

    Args:
        forecast_df: Backtest result with a 'horizon' column and the columns
            listed in `forecast_plot_columns`.
        source_label: Text prepended to the figure title so that figures of
            different input sources can be told apart.
    """
    # Sorted unique horizons give a deterministic figure order (iterating a set does
    # not); missing horizons are skipped as they would only yield an empty figure.
    for horizon in sorted(forecast_df['horizon'].dropna().unique()):
        fig = forecast_df.loc[
            forecast_df['horizon'] == horizon, forecast_plot_columns
        ].iplot(asFigure=True, title=f"{source_label} - Horizon: {horizon}")
        # All quantiles except P10 are filled towards the previous trace
        fig.update_traces(
             line=dict(color="green", width=1), fill='tonexty', fillcolor='rgba(0, 255, 0, 0.1)',
             selector=lambda x: 'quantile' in x.name and x.name != 'quantile_P10')
        # P10 is the lowest quantile shown, so it is drawn without fill
        fig.update_traces(
             line=dict(color="green", width=1),
             selector=lambda x: 'quantile_P10' == x.name)
        fig.update_traces(
             line=dict(color="red", width=2),
             selector=lambda x: 'realised' in x.name)
        fig.update_traces(
             line=dict(color="blue", width=2),
             selector=lambda x: 'forecast' in x.name)
        fig.show()


show_forecast_per_horizon(forecast_d2, "D-2 forecast")
show_forecast_per_horizon(forecast_era5, "Era5")

In [None]:
# Compare the forecasts obtained with the three different windspeed input sources.
# The error column is added to each forecast frame (it is reused further down).
for forecast_frame in (forecast, forecast_d2, forecast_era5):
    forecast_frame['err'] = forecast_frame['realised'] - forecast_frame['forecast']

# One column of absolute errors per input source
absolute_errors = pd.concat(
    [
        forecast['err'].rename('Shortest_leadtime'),
        forecast_d2['err'].rename('D-2 forecast'),
        forecast_era5['err'].rename('Era5'),
    ],
    axis=1,
).abs()
mae = absolute_errors.mean()

mae_layout = dict(
    title='MAE',
    xaxis=dict(title='model'),
    yaxis=dict(title='MAE [MW]'),
)
mae_fig = mae.iplot(kind='bar', layout=mae_layout, asFigure=True)
mae_fig.show()

In [None]:
# Put the wind speeds of the D-2 test data next to the ERA5 wind speeds and keep
# only the timestamps for which all four columns are available.
d2_test_windspeeds = pd.concat(test_data_d2)[['windspeed_100mExtrapolated','windspeed']]
era5_windspeeds = df_era5[['ws_10m_era5','ws_100m_era5']]
df_check_weather_data = pd.concat([d2_test_windspeeds, era5_windspeeds], axis=1).dropna()

# Day of the year, used to colour the scatter points
df_check_weather_data['day'] = df_check_weather_data.index.dayofyear

# Scatter only the values on the full hour
on_full_hour = df_check_weather_data.index.minute == 0
df_check_weather_data.loc[on_full_hour].plot.scatter(
    x='windspeed', y='ws_10m_era5', c='day', colormap='BrBG'
)

In [None]:
# Difference between the 'windspeed' column of the D-2 test data and the ERA5 10 m wind speed
df_check_weather_data['error_10m_windspeed_forecast'] = df_check_weather_data['windspeed'].sub(
    df_check_weather_data['ws_10m_era5']
)
# Add the error of the D-2 forecast model and keep only complete rows
df_compare_errors = pd.concat(
    [df_check_weather_data, forecast_d2['err']],
    axis=1,
).dropna()

In [None]:
# Some correlation between the wind forecast error and the model error is clear;
# however, the model error correlates just as strongly with wind speed in general.
# NOTE(review): the two dates below are excluded without explanation in the source,
# presumably as outliers; confirm.
excluded_dates = [date(2022,2,18),date(2022,1,31)]
is_kept = ~np.isin(df_compare_errors.index.date, excluded_dates)
df_compare_errors_kept = df_compare_errors.loc[is_kept]

# All days
df_compare_errors.plot.scatter(x = 'error_10m_windspeed_forecast', y = 'err', c = 'day', colormap = 'BrBG')
# Same scatter without the excluded dates
df_compare_errors_kept.plot.scatter(x = 'error_10m_windspeed_forecast', y = 'err', colormap = 'BrBG', c='day')
# Correlation matrix without the excluded dates (displayed as the cell output)
df_compare_errors_kept[['ws_100m_era5','error_10m_windspeed_forecast','err']].corr()

In [None]:
# Show the feature importance of the ERA5 model first, then of the D-2 model
for fitted_model in (model_era5, model_d2):
    feature_importance_fig = plot_feature_importance(
        fitted_model.feature_importance_dataframe
    )
    feature_importance_fig.show()

# Store results
Store the timeseries as (gzip-compressed) CSV, the metadata as YAML and the model as JSON, and write an overview of the notebook to HTML.

In [None]:
# Run name: UTC timestamp (YYYYmmdd_HHMMSS) followed by a fixed description.
# A timezone-aware "now" is used because datetime.utcnow() is deprecated since Python 3.12.
run_name = f'{pd.Timestamp.now(tz="UTC"):%Y%m%d_%H%M%S}_D2_weather_forecast_comparison_MDM_Total'

In [None]:
def write_artifacts(run_name, forecast, model, prediction_job, backtest_specs):
    """Write the results of a run to the directory ``output/<run_name>``.

    Three files are written:

    * ``forecast.csv``: the forecast dataframe, gzip-compressed.
    * ``model.json``: the model, written through ``model.save_model``.
    * ``configs.yaml``: selected prediction job attributes merged with the
      backtest settings.

    Args:
        run_name: Name of the run; used as the name of the output directory.
        forecast: Dataframe to store (needs a pandas-style ``to_csv``).
        model: Object exposing ``save_model(path)``.
        prediction_job: Object supporting ``prediction_job[key]`` for the keys
            'id', 'name', 'model' and 'quantiles'.
        backtest_specs: Dict with the backtest settings; on duplicate keys its
            values take precedence over the prediction job attributes.
    """
    # Create the output dir, including a missing parent 'output' directory;
    # an already existing directory is reused.
    outdir = Path(f'output/{run_name}')
    outdir.mkdir(parents=True, exist_ok=True)

    # Write forecast_df (includes realised, according to the original author).
    # NOTE: the content is gzip-compressed although the file name ends in '.csv',
    # so pass compression='gzip' when reading it back with pandas.
    forecast.to_csv(outdir / 'forecast.csv', compression='gzip')

    # Write model
    model.save_model(outdir / "model.json")

    # Write meta data - prediction job and backtest parameters.
    # Only these prediction_job attributes are considered relevant.
    rel_attrs = ['id','name','model','quantiles']
    rel_pj_dict = {key: prediction_job[key] for key in rel_attrs}
    with open(outdir / "configs.yaml", "w") as file:
        yaml.dump({**rel_pj_dict, **backtest_specs}, file)

# Only the run on the baseline input data (`forecast` / `model`) is stored here;
# the *_d2 and *_era5 results are not written.
write_artifacts(run_name, forecast, model, pj, backtest_specs)

In [None]:
# Export a notebook to HTML with nbconvert; the file name is derived from run_name.
# NOTE(review): the notebook name is hard-coded; confirm it matches this notebook.
nb_fname = '00.Evaluate_performance_using_Backtest_Pipeline'
command = " ".join(
    [
        "jupyter nbconvert",
        f"{nb_fname}.ipynb",
        "--to html",
        f"--output results/{run_name}.html",
    ]
)
# The exit status of the shell command is the value of the cell
os.system(command)