# Persistence model

The persistence model is used as a reference model for both the LSTM and the DTW measure.

In [1]:
import numpy as np
import pandas as pd
%cd ..
from src.data.build_input import controlled_train_test_split
from src.dtw.dtw_measure import dtw_measure
from src.model.metrics import evaluate

/c/Users/u0124144/Documents/DTW_measure


In [2]:
# Evaluation window. ISO 8601 (YYYY-MM-DD) strings are used because they are
# parsed unambiguously; the same dates written day-first ('14-01-2001') only
# resolve correctly through the parser's month/day fallback.
startdate = '2001-01-14'
enddate = '2016-01-01'

# Load the interim dataset and keep only the rows inside the window
# (label-based slice on the index; assumes a datetime index -- TODO confirm).
data = pd.read_hdf('data/interim/data.h5', key='data')
data = data[startdate:enddate]
# Only the test part of the split is used below.
_, test = controlled_train_test_split(data)
output = 'Dst'      # name of the column to forecast
time_forward = 6    # forecast horizon, in steps

In [5]:
def extract_cont_intervals_from_index(index):
    r'''Split an hourly time index into its continuous intervals.

    Two consecutive stamps belong to the same interval when they are at most
    one hour apart; a larger step marks a discontinuity.

    Input:
        index: ascending datetime index (anything accepted by
            ``pd.DatetimeIndex``), expected to lie on an hourly grid
    Output:
        numpy array of shape (k, 2) holding one ``[start, stop]`` pair of
        ``datetime64`` values per continuous interval, in chronological
        order. Only intervals spanning more than ``min_size`` hours are
        kept. An empty index, or one without any long enough interval,
        gives an array of shape (0, 2).
    '''
    min_size = 10  # minimum span of an interval in hours (exclusive bound)

    stamps = pd.DatetimeIndex(index).values
    if len(stamps) == 0:
        return np.empty((0, 2), dtype=stamps.dtype)

    # Positions of the last stamp before each gap, found in a single pass.
    gap_after = np.flatnonzero(np.diff(stamps) > np.timedelta64(1, 'h'))
    first = np.concatenate(([0], gap_after + 1))
    last = np.concatenate((gap_after, [len(stamps) - 1]))

    # The size filter is applied to every interval alike, including the case
    # of an index without any gap.
    long_enough = (stamps[last] - stamps[first]) > np.timedelta64(min_size, 'h')
    starts = stamps[first[long_enough]]
    stops = stamps[last[long_enough]]
    return np.stack((starts, stops), axis=1)

In [6]:
def persistence_predict(data, time):
    '''Persistence forecast: reuse the value observed ``time`` rows earlier.

    Input:
        data: pandas dataframe containing all the features to be forecasted
        time: forecast horizon, given as the number of rows to shift by
    Output:
        pandas dataframe with the index and columns of ``data``; for a
        positive ``time`` the first ``time`` rows are NaN because no earlier
        observation exists for them
    '''
    # The shift is positional (row based), not based on the index values.
    return data.shift(periods=time)

In [7]:
def persistence_dtw_measure(data, time_forward):
    '''Count the DTW warping offsets of the persistence forecast.

    For every horizon 1..time_forward the persistence forecast is compared
    with the observations on each continuous interval of the data, and the
    absolute differences between the two rows of the returned warping path
    are tallied.

    Input:
        data: pandas dataframe with exactly one feature column
        time_forward: number of forecast horizons evaluated
    Output:
        pandas dataframe of counts, indexed by horizon (1..time_forward) with
        one column per offset 0..6
    '''
    # Allow only one feature at the time
    assert(data.shape[1] == 1)

    horizons = range(1, time_forward + 1)

    # One extra column per horizon holding the shifted (persistence) values.
    shifted = data.copy()
    for horizon in horizons:
        shifted['T_{}'.format(horizon)] = persistence_predict(data, horizon)
    shifted = shifted.dropna()  # rows without a forecast for every horizon are dropped

    intervals = extract_cont_intervals_from_index(shifted.index)
    total = intervals.shape[0]

    tally = np.zeros((time_forward, 7))
    for position, (start, stop) in enumerate(intervals, start=1):
        print('{} out of {}'.format(position, total))
        chunk = shifted[start:stop]
        for row, horizon in enumerate(horizons):
            forecast = chunk['T_{}'.format(horizon)].to_numpy()
            truth = chunk.iloc[:, 0].to_numpy()
            # dtw_measure(forecast, truth, 6); the last argument is presumably
            # the warping window -- TODO confirm against src.dtw.dtw_measure
            _, path, _ = dtw_measure(forecast, truth, 6)
            offsets, occurrences = np.unique(abs(path[0, :] - path[1, :]), return_counts=True)
            tally[row, offsets] += occurrences

    return pd.DataFrame(data=tally, index=np.arange(1, time_forward + 1), columns=np.arange(7))

In [8]:
def persistence_eval(features, time_forward, dtw=True):
    r'''Evaluate the persistence model for horizons [1, 2, ..., time_forward].

    The standard metric evaluation is always run. When ``dtw`` is true the
    dtw count is computed as well; that count takes discontinuities into
    account by splitting the data into continuous pieces first.

    Input:
        features: Pandas dataframe with DateTime index and the to be
            forecasted feature
        time_forward: Number of hours evaluated
        dtw: boolean, run dtw measure when true
    Output:
        res: Metric evaluation as returned by ``evaluate``
        bincounts: dtw counts per horizon, or None when ``dtw`` is false
    '''
    bincounts = persistence_dtw_measure(features, time_forward) if dtw else None

    # The same rows are cut from the observations and from every forecast:
    # the first time_forward+1 and the last time_forward rows are discarded.
    trim = slice(time_forward + 1, -time_forward)

    # One copy of the observations per horizon, so both arrays line up.
    data_all = np.repeat(features.to_numpy()[trim], time_forward, axis=1)
    pers_all = np.zeros(data_all.shape)
    for t in range(1, time_forward + 1):
        pers_all[:, t - 1] = persistence_predict(features, t).to_numpy()[trim, 0]

    return evaluate(pers_all, data_all), bincounts

# Score the persistence baseline on the test split for the selected output
# column (metrics plus dtw counts); the bare name below displays the metrics.
pers_res, bincounts = persistence_eval(features=test[[output]], time_forward=6, dtw=True)
pers_res

1 out of 45
2 out of 45
3 out of 45
4 out of 45
5 out of 45
6 out of 45
7 out of 45
8 out of 45
9 out of 45
10 out of 45
11 out of 45
12 out of 45
13 out of 45
14 out of 45
15 out of 45
16 out of 45
17 out of 45
18 out of 45
19 out of 45
20 out of 45
21 out of 45
22 out of 45
23 out of 45
24 out of 45
25 out of 45
26 out of 45
27 out of 45
28 out of 45
29 out of 45
30 out of 45
31 out of 45
32 out of 45
33 out of 45
34 out of 45
35 out of 45
36 out of 45
37 out of 45
38 out of 45
39 out of 45
40 out of 45
41 out of 45
42 out of 45
43 out of 45
44 out of 45
45 out of 45


{'A': array([-0.29857926, -0.78020272, -1.23413816, -1.6390297 , -2.00548404,
        -2.3500695 ]),
 'B': array([0.97539883, 0.93552117, 0.89797194, 0.86455615, 0.83441127,
        0.80617974]),
 'sigmaA': array([0.12193412, 0.12205529, 0.12219919, 0.12236647, 0.12256061,
        0.12277253]),
 'sigmaB': array([0.00550109, 0.00550655, 0.00551305, 0.00552059, 0.00552935,
        0.00553891]),
 'R': array([0.97448187, 0.93371385, 0.89518176, 0.86069157, 0.82936564,
        0.79992165]),
 'RMSE': array([ 4.21476204,  6.79638847,  8.55155521,  9.86547997, 10.92735439,
        11.84314043]),
 'MAE': array([2.67013018, 4.37683874, 5.51173468, 6.32410064, 6.96015948,
        7.49723623]),
 'ME': array([-0.00380584, -0.00761168, -0.01162896, -0.01612952, -0.02138521,
        -0.02769807]),
 'PE': array([0.94891479, 0.86716717, 0.78970001, 0.72011123, 0.65661676,
        0.59664932])}