# Compare training a single model for multiple horizons, versus horizon-dedicated models
Normally in OpenSTEF, a single model is trained to forecast the load over a continuous range of increasing lead times.

However, let's analyse how the accuracy of this setup compares to training models for specific lead times.

In [None]:
import pandas as pd
# cufflinks is imported for the `.iplot()` calls made on pandas objects further down
import cufflinks
# NOTE(review): presumably switches plotting to offline mode so figures render inline — confirm
cufflinks.go_offline()

from pathlib import Path
from datetime import datetime
import os
import yaml

from openstef.pipeline.train_create_forecast_backtest import train_model_and_forecast_back_test
from openstef.metrics.figure import plot_feature_importance
from openstef.data_classes.model_specifications import ModelSpecificationDataClass
from openstef.data_classes.prediction_job import PredictionJobDataClass

# Set working dir to location of this file.
# NOTE(review): chdir('.') keeps the current working directory, so the relative data
# paths used below only resolve when the kernel is started in this notebook's folder.
os.chdir('.')

In [None]:
# Read the raw measurements (semicolon-separated, decimal comma)
filename = Path("../.data/Middenmeer-150kV.csv")
measurements = pd.read_csv(filename, delimiter=";", decimal=",")

# Combine the date ("Datum") and time ("Tijd") columns into a single timestamp index
measurements["Datetime"] = pd.to_datetime(measurements["Datum"] + " " + measurements["Tijd"])
measurements = measurements.set_index('Datetime')
# Timestamps are localized as CET and then converted to UTC;
# ambiguous or nonexistent local times become NaT
measurements = measurements.tz_localize('CET', ambiguous='NaT', nonexistent='NaT')
measurements = measurements.tz_convert("UTC")

# Only keep relevant columns: drop the first two and the last one
measurements = measurements.iloc[:, 2:-1]
# Sum the load over the remaining columns
measurements['Total'] = measurements.sum(axis=1)

# By default, only a backtest is made for the total
target_column = 'Total'

measurements.iplot()

In [None]:
# Load predictors: the first CSV column is used as the index and parsed as dates
predictors = pd.read_csv('../.data/predictors.csv', index_col=0, parse_dates=True)
# Show the first rows as a quick sanity check
predictors.head()

In [None]:
# Define the properties of training/prediction. We call this a 'prediction_job'.
pj = PredictionJobDataClass(
    id=1,
    name='TestPrediction',
    model='xgb',
    quantiles=[0.10, 0.30, 0.50, 0.70, 0.90],
    horizon_minutes=24 * 60,
    resolution_minutes=15,
    forecast_type="demand",  # Note, this should become optional
    lat=1,  # should become optional
    lon=1,  # should become optional
)

# Lead times for which the models are trained and evaluated.
# NOTE(review): presumably expressed in hours — confirm against the OpenSTEF docs.
training_horizons = [0.25, 47.0]

# Make backtest using a single model for all lead times
# Define backtest specs
backtest_specs = dict(n_folds=3, training_horizons=training_horizons)
modelspecs = ModelSpecificationDataClass(id=pj['id'])

# Specify input data: the target column of the load dataframe (named 'load'),
# joined with the predictors on the timestamps both have in common
input_data = pd.DataFrame(dict(load=measurements.loc[:, target_column])).merge(
    predictors, left_index=True, right_index=True
)
# Also resample to fix overlapping indices.
# '15min' is used instead of the '15T' alias, which is deprecated in recent pandas releases.
input_data = input_data.resample('15min').mean()

# Perform the backtest; one model is trained on all training horizons at once
forecast_single_model, model, train_data, validation_data, test_data = train_model_and_forecast_back_test(
    pj,
    modelspecs=modelspecs,
    input_data=input_data,
    **backtest_specs,
)

In [None]:
# Repeat backtest, but now with separate models for each horizon.
# The per-horizon forecasts are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0.
dedicated_forecasts = []
for horizon in training_horizons:
    forecast, model, train_data, validation_data, test_data = train_model_and_forecast_back_test(
        pj,
        modelspecs=modelspecs,
        input_data=input_data,
        n_folds=backtest_specs['n_folds'],
        training_horizons=[horizon],
    )
    dedicated_forecasts.append(forecast)

# Keep the old behaviour of an empty dataframe when there are no horizons at all
forecast_dedicated_model = pd.concat(dedicated_forecasts) if dedicated_forecasts else pd.DataFrame()

# NOTE: after the loop, `forecast`, `model`, `train_data`, `validation_data` and `test_data`
# hold the results of the last horizon only.

# Evaluate results

In [None]:
# Combine. df should have the forecast for single/multimodel and for short/long horizon,
# plus the realisation. The short horizon is 0.25 and the long horizon is 47.0.
single_short = forecast_single_model[forecast_single_model.horizon == 0.25]
single_long = forecast_single_model[forecast_single_model.horizon == 47.0]
dedicated_short = forecast_dedicated_model[forecast_dedicated_model.horizon == 0.25]
dedicated_long = forecast_dedicated_model[forecast_dedicated_model.horizon == 47.0]

# Values are combined positionally (via .values); the realisation and the index
# are both taken from the long-horizon forecast of the dedicated models
df = pd.DataFrame(
    dict(
        forecast_multihorizonmodel_short=single_short['forecast'].values,
        forecast_multihorizonmodel_long=single_long['forecast'].values,
        forecast_dedicatedmodels_short=dedicated_short['forecast'].values,
        forecast_dedicatedmodels_long=dedicated_long['forecast'].values,
        realised=dedicated_long['realised'].values,
    ),
    index=dedicated_long.index.values,
)

In [None]:
# Plot the combined dataframe: the four forecast columns together with the realised load
df.iplot()

In [None]:
# Error per column: column value minus the realised value on the same row
# (the 'realised' column itself therefore becomes 0).
# Vectorised subtraction along the index instead of a Python lambda applied to every row.
err_df = df.sub(df['realised'], axis=0)


In [None]:
# Mean absolute error per forecast column; the last column of err_df is left out.
mae_per_forecast = err_df.iloc[:, :-1].abs().mean()
# Reorder by position so that columns 0 and 2 come first, followed by columns 1 and 3.
# .iloc makes the positional selection explicit: integer keys passed to [] on a
# string-labelled Series are deprecated positional access in pandas.
mae_per_forecast.iloc[[0, 2, 1, 3]].iplot(kind='bar', yTitle='MAE')

In [None]:
# Plot the quantiles, the realisation and the forecast, one figure per horizon in `forecast`.
# Figures are also kept in figs['timeseries'], keyed by horizon.
figs = dict(timeseries=dict())
plot_columns = ['quantile_P10', 'quantile_P30', 'quantile_P50', 'quantile_P70', 'quantile_P90',
                'realised', 'forecast']
for horizon in sorted(set(forecast.horizon)):
    # Select the rows of the horizon being plotted. This filter used to be hard-coded
    # to 0.25, so every figure showed the same rows regardless of its title.
    fig = forecast.loc[forecast.horizon == horizon, plot_columns].iplot(
        asFigure=True,
        title=f"Horizon: {horizon}",
    )
    # Quantile traces above P10: thin green line, filled towards the previous trace
    fig.update_traces(
        line=dict(color="green", width=1), fill='tonexty', fillcolor='rgba(0, 255, 0, 0.1)',
        selector=lambda x: 'quantile' in x.name and x.name != 'quantile_P10')
    # P10 is the lowest quantile trace: thin green line without fill
    fig.update_traces(
        line=dict(color="green", width=1),
        selector=lambda x: 'quantile_P10' == x.name)
    # Realisation in red
    fig.update_traces(
        line=dict(color="red", width=2),
        selector=lambda x: 'realised' in x.name)
    # Forecast in blue
    fig.update_traces(
        line=dict(color="blue", width=2),
        selector=lambda x: 'forecast' in x.name)
    figs['timeseries'][horizon] = fig
    fig.show()

In [None]:
# Error of the `forecast` dataframe: realised minus forecast (adds an 'err' column in place)
forecast['err'] = forecast['realised'] - forecast['forecast']

# Mean absolute error per horizon
mae = forecast.pivot_table(
    index='horizon',
    values=['err'],
    aggfunc=lambda x: x.abs().mean(),
)
# Horizons are turned into strings before plotting
mae.index = mae.index.astype(str)

mae_layout = dict(
    title='MAE',
    xaxis=dict(title='horizon'),
    yaxis=dict(title='MAE [MW]'),
)
mae_fig = mae.iplot(kind='bar', layout=mae_layout, asFigure=True)
mae_fig.show()

In [None]:
# Plot the feature importance of `model`.
# NOTE(review): when the cells are run top to bottom, `model` is the one returned by the
# last backtest call (the dedicated model of the last training horizon) — confirm this is intended.
feature_importance_fig = plot_feature_importance(model.feature_importance_dataframe)
feature_importance_fig.show()

# Store results
Store the timeseries as (gzip-compressed) CSV, the metadata as YAML and the model as JSON, and export an overview of the notebook to HTML.

In [None]:
from datetime import timezone

# Unique name for this run: UTC timestamp followed by a fixed label.
# A timezone-aware "now" replaces datetime.utcnow(), which is deprecated since Python 3.12;
# the formatted string is the same.
run_name = f'{datetime.now(timezone.utc):%Y%m%d_%H%M%S}_MDM_Total'

In [None]:
def write_artifacts(run_name, forecast, model, prediction_job, backtest_specs):
    """Write the artifacts of a backtest run to ``output/<run_name>``.

    Three files are written: ``forecast.csv`` (gzip-compressed), ``model.json``
    and ``configs.yaml``.

    Args:
        run_name: Name of the run; used as the name of the output directory.
        forecast: Forecast dataframe (includes realised); written with ``to_csv``.
        model: Trained model; written with its ``save_model`` method.
        prediction_job: Prediction job; its 'id', 'name', 'model' and 'quantiles'
            entries are stored as metadata.
        backtest_specs: Dict with the backtest parameters, stored alongside the
            prediction job attributes.
    """
    # Create output dir. parents=True also creates a missing 'output' folder
    # (os.mkdir would fail on it); an already existing dir is reused.
    outdir = Path(f'output/{run_name}')
    outdir.mkdir(parents=True, exist_ok=True)

    # Write forecast_df (includes realised)
    # NOTE(review): the file is gzip-compressed although it is named '.csv' — readers
    # have to pass compression='gzip' explicitly.
    forecast.to_csv(outdir / 'forecast.csv', compression='gzip')

    # Write model
    model.save_model(outdir / "model.json")

    # Write meta data - prediction job and backtest parameters
    # relevant prediction_job attributes
    rel_attrs = ['id', 'name', 'model', 'quantiles']
    rel_pj_dict = {key: prediction_job[key] for key in rel_attrs}
    with open(outdir / "configs.yaml", "w") as file:
        yaml.dump({**rel_pj_dict, **backtest_specs}, file)

# Store the artifacts of this run, using the current `forecast` and `model` variables
write_artifacts(run_name, forecast, model, pj, backtest_specs)

In [None]:
# Export a notebook to HTML with nbconvert; the output file is named after this run.
# NOTE(review): nb_fname is hard-coded — confirm it matches the file name of this notebook.
nb_fname = '00.Evaluate_performance_using_Backtest_Pipeline'
# NOTE(review): the HTML goes to 'results/', whereas write_artifacts() writes to 'output/' — confirm this is intended.
command=f"jupyter nbconvert {nb_fname}.ipynb --to html --output results/{run_name}.html"
# Runs the command in a shell; the returned exit status is not checked
os.system(command)

In [None]:
# Display the nbconvert command that was run, for reference
command