# Imports and data preparation

## Import packages

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd

from pathlib import Path
from datetime import datetime
import os
import yaml

from openstef.pipeline.train_create_forecast_backtest import train_model_and_forecast_back_test
from openstef.metrics.figure import plot_feature_importance
from openstef.data_classes.model_specifications import ModelSpecificationDataClass
from openstef.data_classes.prediction_job import PredictionJobDataClass

# Set working dir to location of this file
# NOTE(review): os.chdir('.') changes into the current directory, so the working
# directory stays whatever it was when the kernel started. The relative
# "../.data/..." paths used in this notebook are resolved against that directory.
os.chdir('.')

# Set plotly as the default pandas plotting backend
# (DataFrame.plot() calls in this notebook are dispatched to plotly)
pd.options.plotting.backend = 'plotly'

In [None]:
import plotly.io as pio

# Select the Plotly renderers used when figures are displayed.
# This ensures Plotly output works in multiple places:
# plotly_mimetype: VS Code notebook UI
# notebook: "Jupyter: Export to HTML" command in VS Code
# See https://plotly.com/python/renderers/#multiple-renderers
pio.renderers.default = "plotly_mimetype+notebook"

## Load, pre-process, and visualize EMS measurements

In [None]:
# Read the raw EMS export (semicolon-separated, decimal comma)
filename = Path("../.data/Middenmeer-150kV.csv")
measurements = pd.read_csv(filename, delimiter=";", decimal=",")

# Combine the "Datum" and "Tijd" columns into one timestamp and use it as index
timestamp_text = measurements["Datum"] + " " + measurements["Tijd"]
measurements["Datetime"] = pd.to_datetime(timestamp_text)
measurements = measurements.set_index('Datetime')

# Interpret the timestamps as CET and convert them to UTC.
# Ambiguous or non-existent local times get a NaT index value.
measurements = measurements.tz_localize('CET', ambiguous='NaT', nonexistent='NaT')
measurements = measurements.tz_convert("UTC")

# Only keep relevant columns: drop the first two columns and the last one
measurements = measurements.iloc[:, 2:-1]

# Add the row-wise sum of the remaining columns as the total load
measurements['Total'] = measurements.sum(axis=1)

# By default, only a backtest is made for the total
target_column = 'Total'

measurements.plot()

### Check the validity of the measurements

In [None]:
# Display every row whose index value occurs more than once
# (keep=False marks all occurrences, not only the repeats)
duplicated_index_mask = measurements.index.duplicated(keep=False)
measurements[duplicated_index_mask]

In [None]:
# Remove the rows whose index is NaT
has_missing_index = measurements.index.isna()
measurements = measurements[~has_missing_index]

In [None]:
# The measurements index must be free of duplicates from here on
assert measurements.index.is_unique, "Duplicate indices have been found in the measurements dataframe."

## Load, pre-process, and visualize predictors

In [None]:
# Read the predictors; the first column becomes the (date-parsed) index
predictors = pd.read_csv(
    '../.data/weather_apx_sji_sja_Middenmeer.csv',
    parse_dates=True,
    index_col=0,
)
predictors.head()

In [None]:
# Sanity checks on the predictors: no fully identical rows and a unique index
has_duplicate_rows = predictors.duplicated().any()
assert not has_duplicate_rows, "Duplicate values have been found in the predictors dataframe."
assert predictors.index.is_unique, "Duplicate indices have been found in the predictors dataframe."

## Combine EMS measurements and predictors to get input data

In [None]:
# OpenSTEF always expects a column called "load". This is the column it will predict.
load = measurements[target_column].to_frame(name='load')

# Keep only the timestamps that are present in both the load and the predictors
input_data = pd.merge(load, predictors, how='inner', left_index=True, right_index=True)

In [None]:
# The combined input data must have a unique index
assert input_data.index.is_unique, "There are duplicate indices in the input data."

# Backtest configuration and execution

## Configure training, prediction, and backtest specifications

In [None]:
# A 'prediction_job' bundles the properties of training / prediction.
pj = PredictionJobDataClass(
    # id and name do not matter in a backtest context
    id=1,
    name='TestPrediction',
    model='xgb',
    quantiles=[0.10, 0.30, 0.50, 0.70, 0.90],
    # TODO: Find out: Does this influence anything? Does this influence which
    # lagged features are available at prediction time?
    horizon_minutes=24 * 60,
    resolution_minutes=15,
    # The three arguments below should become optional
    forecast_type="demand",
    lat=1,
    lon=1,
    # Not passed here (these should become optional as well):
    # train_components, model_type_group, hyper_params, feature_names
)

# With only an "id" specified, the modelspecs do not do much.
modelspecs = ModelSpecificationDataClass(id=pj['id'])

# Backtest settings. The training horizons also decide for which forecast
# horizons the backtest forecasts are made.
backtest_specs = {
    'n_folds': 3,
    'training_horizons': [0.25, 47.0],
}

## Perform the backtest

In [None]:
# Train the model(s) and create the backtest forecasts
forecast, models, train_data, validation_data, test_data = train_model_and_forecast_back_test(
    pj,
    modelspecs=modelspecs,
    input_data=input_data,
    **backtest_specs,
)

# With more than one fold, `models` is a list of models; only the first one is
# used further on. Otherwise `models` is used as the model directly.
model = models[0] if backtest_specs['n_folds'] > 1 else models

In [None]:
# Show up to 130 columns when displaying dataframes
pd.options.display.max_columns = 130

# First rows of the first element of the training data
train_data[0].head()

# Evaluation of the results

## Visualize forecasts for all horizons
TODO:
- Find out whether the backtest also produces an in-sample fit. (Ask JM.)

In [None]:
# Display the first rows of the backtest forecast
forecast.head()

In [None]:
from utils import quantile_plotting

# The quantile columns are the same for every horizon, so determine them once
quantile_columns = [col for col in forecast.columns if col.startswith("quantile")]

# Plot one figure per forecast horizon. The horizons are sorted so the figures
# always appear in the same order (iterating over a set has no defined order).
for horizon in sorted(forecast["horizon"].unique().tolist()):
    # Select the rows of this horizon once and reuse them for all three inputs
    at_horizon = forecast[forecast["horizon"] == horizon]
    quantile_plotting.plot_quantile_forecasts_and_realized(
        realized=at_horizon["realised"],
        forecast=at_horizon["forecast"],
        quantiles=at_horizon[quantile_columns],
        horizon=horizon,
    )

## Compute and visualize performance measures

In [None]:
# Forecast error per timestamp; the column is added to `forecast` itself
forecast['err'] = forecast['realised'] - forecast['forecast']

# Mean absolute error per horizon
mae = forecast.pivot_table(index='horizon', values=['err'], aggfunc=lambda x: x.abs().mean())

# Use the horizons as text labels on the x-axis
mae.index = mae.index.astype(str)

# Set the figure title and axis titles on the layout. Plotly Express' `labels`
# argument only maps column names to display names, so passing title/xaxis/yaxis
# through it has no effect on the figure.
mae_fig = mae.plot(kind='bar')
mae_fig.update_layout(title_text='MAE', xaxis_title='horizon', yaxis_title='MAE [MW]')
mae_fig.show()

## Visualize feature importance

In [None]:
# Build the feature importance figure from the selected model's
# feature_importance_dataframe and display it
feature_importance_fig = plot_feature_importance(model.feature_importance_dataframe)
feature_importance_fig.show()

# Store results
Store the forecast time series as CSV, the metadata as YAML, and the model as JSON, then export this notebook to HTML.

In [None]:
# Name of this run: the current UTC timestamp followed by a fixed suffix.
# datetime.utcnow() is deprecated; a timezone-aware "now" formats to the same string.
from datetime import timezone

run_name = f'{datetime.now(timezone.utc):%Y%m%d_%H%M%S}_MDM_Total'

In [None]:
def write_artifacts(run_name, forecast, model, prediction_job, backtest_specs):
    """Write the artifacts of a backtest run to ``output/<run_name>``.

    Three files are written into that directory: ``forecast.csv``
    (gzip-compressed), ``model.json`` and ``configs.yaml``.

    Args:
        run_name: Name of the run; used as the name of the output directory.
        forecast: Object with a ``to_csv`` method (the forecast dataframe,
            which includes the realised values).
        model: Object with a ``save_model(path)`` method.
        prediction_job: Object supporting ``prediction_job[key]``; the keys
            'id', 'name', 'model' and 'quantiles' are read.
        backtest_specs: Dict with the backtest parameters; merged with the
            prediction job attributes into the YAML metadata.
    """
    # Create the output dir, including a missing parent 'output' directory;
    # an already existing directory is reused.
    outdir = Path(f'output/{run_name}')
    outdir.mkdir(parents=True, exist_ok=True)

    # Write forecast_df (includes realised).
    # NOTE: the content is gzip-compressed although the file name ends in
    # '.csv'; pass compression='gzip' explicitly when reading it back.
    forecast.to_csv(outdir / 'forecast.csv', compression='gzip')

    # Write model
    model.save_model(outdir / "model.json")

    # Write meta data - the relevant prediction job attributes and the
    # backtest parameters
    rel_attrs = ['id', 'name', 'model', 'quantiles']
    rel_pj_dict = {key: prediction_job[key] for key in rel_attrs}
    with open(outdir / "configs.yaml", "w") as file:
        yaml.dump({**rel_pj_dict, **backtest_specs}, file)

# Store the forecast, the selected model and the run configuration for this run
write_artifacts(run_name, forecast, model, pj, backtest_specs)

In [None]:

import shlex
import subprocess

# Export this notebook to HTML (code cells hidden) with jupyter nbconvert.
# NOTE(review): the export targets 'results/' while write_artifacts writes to
# 'output/' - confirm that this difference is intended.
nb_fname = '00.Evaluate_performace_using_Backtest_Pipeline'

# Pass the command as an argument list so no shell has to parse it
command_args = [
    "jupyter", "nbconvert", f"{nb_fname}.ipynb",
    "--to", "html",
    "--no-input",
    "--output", f"results/{run_name}.html",
]
command = shlex.join(command_args)
print(f"Command to be executed: {command}.")

# Best effort: report a failed export instead of raising
try:
    exit_code = subprocess.run(command_args, check=False).returncode
except FileNotFoundError:
    print("The 'jupyter' executable was not found; the notebook was not exported.")
else:
    if exit_code != 0:
        print(f"nbconvert exited with status {exit_code}; the HTML export may be missing.")