# Persistence model

The persistence model is used as a reference model for both the LSTM and the DTW measure.

__Remark: Make sure that the previous notebooks have been run at least once, so that the necessary files exist.__

In [1]:
import sys
import numpy as np
import pandas as pd

sys.path.append('../')

from src.data.build_input import controlled_train_test_split
from src.dtw.dtw_measure import dtw_measure
from src.model.metrics import evaluate

Start by reading in the data

In [3]:
# Date window applied to the dataset below.
# NOTE(review): the start date is written day-first (DD-MM-YYYY); confirm
# that pandas parses both bounds as intended.
startdate = '14-01-2001'
enddate = '01-01-2016'

# Load the table stored under key 'data' and keep the rows inside the window
# (label-based slice; presumably the index is a DatetimeIndex -- verify).
data = pd.read_hdf('../data/interim/data.h5', 'data')
data = data[startdate:enddate]
# Only the test part of the split is used in this notebook.
_, test = controlled_train_test_split(data)
# Column that is forecast by the persistence model.
output = 'Dst'
# Largest forecast horizon.
# NOTE(review): the evaluation call further down passes a literal 6 instead
# of this variable.
time_forward = 6

To make the method as accurate as possible, we first divide the data into continuous blocks. 

In [4]:
def extract_cont_intervals_from_index(index):
    r'''Split an hourly datetime index into its gap-free blocks.

    The index is compared against a complete hourly grid. Starting from the
    outside, the leading and trailing gap-free blocks are peeled off and the
    search continues on the timestamps that lie between the first and the
    last gap, until nothing is left.

    Input:
        index: sorted datetime index whose values lie on an hourly grid
    Output:
        Array with one ``[start, stop]`` row (``np.datetime64``, both ends
        inclusive) per continuous block. Rows are ordered from the outside
        inwards (first block, last block, second block, ...), not
        chronologically. An empty index gives an empty array.

    A leading/trailing block is only kept when it spans more than
    ``min_size`` hours.
    NOTE(review): the block that is reached once no gaps remain (the
    innermost block, or the whole index when it has no gaps) is kept
    regardless of its length. This is left unchanged so that existing
    results stay reproducible.
    '''
    min_size = 10
    # A Timedelta is used as frequency instead of the 'H' alias, which is
    # deprecated in recent pandas versions.
    one_hour = pd.Timedelta('1h')
    timeseries = []
    series = index

    while len(series) > 0:
        first, last = series[0], series[-1]
        # Timestamps of the full hourly grid that are absent from the series
        missing = pd.date_range(first, last, freq=one_hour).difference(series)

        if len(missing) == 0:
            # No gaps left: the remainder is one continuous block
            timeseries.append([np.datetime64(first), np.datetime64(last)])
            break

        # Leading block: from the first timestamp up to just before the first gap
        head_stop = missing[0] - one_hour
        if (head_stop - first) / one_hour > min_size:
            timeseries.append([np.datetime64(first), np.datetime64(head_stop)])

        # Trailing block: from just after the last gap up to the last timestamp
        tail_start = missing[-1] + one_hour
        if (last - tail_start) / one_hour > min_size:
            timeseries.append([np.datetime64(tail_start), np.datetime64(last)])

        # Continue with the timestamps present between the first and last gap
        series = pd.date_range(missing[0], missing[-1], freq=one_hour).difference(missing)

    return np.array(timeseries)

Define the persistence model

In [5]:
def persistence_predict(data, time):
    '''Persistence forecast: reuse the value observed `time` rows earlier.

    Input:
        data: pandas dataframe containing all the features to be forecast
        time: forecast horizon, expressed as a number of rows
    Output:
        pandas dataframe of the same shape as `data`; the first `time` rows
        are NaN because no earlier observation is available for them

    Note: the shift is positional (by rows), so across a gap in the index
    the forecast is taken from the last row before the gap.
    '''
    return data.shift(periods=time)

Now we can apply the dtw measure to every continuous block extracted from the previous method.

In [6]:
def persistence_dtw_measure(data, time_forward):
    '''Count the DTW path offsets of the persistence forecast per horizon.

    For every horizon 1..time_forward a shifted copy of the feature is added
    as column 'T_<horizon>'. Rows with NaN are dropped, the remaining index
    is split into continuous blocks, and `dtw_measure` is run on each block
    for each horizon. The absolute difference between the two rows of the
    returned path is tallied.

    Input:
        data: pandas dataframe with exactly one column
        time_forward: largest forecast horizon
    Output:
        pandas dataframe with one row per horizon (index 1..time_forward)
        and columns 0..6 holding the accumulated counts
    '''
    # Allow only one feature at the time
    assert data.shape[1] == 1

    horizons = range(1, time_forward + 1)

    shifted = data.copy()
    for horizon in horizons:
        shifted['T_{}'.format(horizon)] = persistence_predict(data, horizon)
    shifted = shifted.dropna()  # remove NaN-values

    blocks = extract_cont_intervals_from_index(shifted.index)
    n_blocks = blocks.shape[0]

    tally = np.zeros((time_forward, 7))
    for block_no, (start, stop) in enumerate(blocks, start=1):
        print('{} out of {} blocks'.format(block_no, n_blocks))
        segment = shifted[start:stop]
        for row, horizon in enumerate(horizons):
            forecast = segment['T_{}'.format(horizon)].to_numpy()
            observed = segment.iloc[:, 0].to_numpy()
            _, path, _ = dtw_measure(forecast, observed, 6)
            offsets, occurrences = np.unique(abs(path[0, :] - path[1, :]), return_counts=True)
            tally[row, offsets] += occurrences

    return pd.DataFrame(
        data=tally,
        index=np.arange(1, time_forward + 1),
        columns=np.arange(7),
    )

The persistence model can also be evaluated with the metrics defined by Liemohn et al. This method combines both the dtw measure and this evaluation. 

In [8]:
def persistence_eval(features, time_forward, dtw=True):
    r'''Evaluate the persistence model for horizons [1, 2, ..., time_forward].

    Runs the standard metric evaluation and, optionally, the dtw count.
    The dtw count takes discontinuities into account by first splitting the
    data into continuous pieces.

    Input:
        features: Pandas dataframe with DateTime index and the feature to be
            forecast (the first column is used for the forecasts)
        time_forward: Number of hours evaluated
        dtw: boolean, run dtw measure when true
    Output:
        res: Metric evaluation as returned by `evaluate`
        bincounts: result of the dtw measure, or None when `dtw` is false
    '''
    bincounts = persistence_dtw_measure(features, time_forward) if dtw else None

    # Same row window for observations and forecasts: the leading rows, where
    # the shifted forecasts are still NaN, and the last rows are trimmed off.
    window = slice(time_forward+1, -time_forward)

    observed = np.repeat(features.to_numpy()[window], time_forward, axis=1)
    predicted = np.zeros(observed.shape)
    for column, horizon in enumerate(range(1, 1+time_forward)):
        shifted = persistence_predict(features, horizon)
        predicted[:, column] = shifted.to_numpy()[window, 0]

    return evaluate(predicted, observed), bincounts

# Evaluate the persistence forecast of the `output` column on the test set
# for horizons 1..6; the dtw measure runs as well (dtw defaults to True).
pers_res, bincounts = persistence_eval(test[[output]], 6)

1 out of 45
2 out of 45
3 out of 45
4 out of 45
5 out of 45
6 out of 45
7 out of 45
8 out of 45
9 out of 45
10 out of 45
11 out of 45
12 out of 45
13 out of 45
14 out of 45
15 out of 45
16 out of 45
17 out of 45
18 out of 45
19 out of 45
20 out of 45
21 out of 45
22 out of 45
23 out of 45
24 out of 45
25 out of 45
26 out of 45
27 out of 45
28 out of 45
29 out of 45
30 out of 45
31 out of 45
32 out of 45
33 out of 45
34 out of 45
35 out of 45
36 out of 45
37 out of 45
38 out of 45
39 out of 45
40 out of 45
41 out of 45
42 out of 45
43 out of 45
44 out of 45
45 out of 45


Here the results are displayed. The first table shows the metric results; the second shows the results of the dtw measure.

In [17]:
# Map the positional row labels 0..5 to readable horizon labels 't+1'..'t+6'.
new_ind = {pos: 't+{}'.format(pos+1) for pos in range(6)}
pd.DataFrame.from_dict(pers_res).rename(index=new_ind)

Unnamed: 0,A,B,sigmaA,sigmaB,R,RMSE,MAE,ME,PE
t+1,-0.298579,0.975399,0.121934,0.005501,0.974482,4.214762,2.67013,-0.003806,0.948915
t+2,-0.780203,0.935521,0.122055,0.005507,0.933714,6.796388,4.376839,-0.007612,0.867167
t+3,-1.234138,0.897972,0.122199,0.005513,0.895182,8.551555,5.511735,-0.011629,0.7897
t+4,-1.63903,0.864556,0.122366,0.005521,0.860692,9.86548,6.324101,-0.01613,0.720111
t+5,-2.005484,0.834411,0.122561,0.005529,0.829366,10.927354,6.960159,-0.021385,0.656617
t+6,-2.350069,0.80618,0.122773,0.005539,0.799922,11.84314,7.497236,-0.027698,0.596649


In [18]:
# Raw dtw counts: one row per forecast horizon, one column per absolute
# index offset (0..6) along the dtw path.
bincounts

Unnamed: 0,0,1,2,3,4,5,6
1,101.0,33058.0,0.0,0.0,0.0,0.0,0.0
2,112.0,90.0,33002.0,0.0,0.0,0.0,0.0
3,120.0,97.0,90.0,32942.0,0.0,0.0,0.0
4,134.0,94.0,95.0,90.0,32881.0,0.0,0.0
5,141.0,94.0,93.0,93.0,90.0,32828.0,0.0
6,159.0,98.0,98.0,97.0,92.0,90.0,32750.0


In [20]:
def reformat_dtw_res(df, filename=None, path=''):
    '''Normalize the result from the dtw measure

    Every row is divided by its row sum, so that each row holds the fraction
    of counts per column, rounded to 3 decimals.

    Input:
        df: pandas dataframe with the raw counts, one row per forecast time
        filename: optional file name; when given, the result is written as
            csv to '<path>reformated_<filename>'
        path: prefix put directly in front of the file name (include a
            trailing separator when it is a directory). Defaults to the
            current working directory.
    Output:
        res: pandas dataframe indexed by 'Prediction' ('t+1h', 't+2h', ...)
            with the original column labels suffixed by 'h'
    '''
    res = df.div(df.sum(axis=1), axis=0)

    # Rows are labelled by position, independent of the labels of df
    res.index = pd.Index(
        ['t+{}h'.format(i+1) for i in range(res.shape[0])],
        name='Prediction',
    )
    res.columns = ['{}h'.format(col) for col in res.columns]
    res = res.round(3)
    if filename:
        # `path` used to be an undefined name here, which raised a NameError
        res.to_csv('{}reformated_{}'.format(path, filename))
    return res

# Show the row-normalized dtw counts; no filename is given, so nothing is
# written to disk.
reformat_dtw_res(bincounts)

Unnamed: 0_level_0,0h,1h,2h,3h,4h,5h,6h
Prediction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
t+1h,0.003,0.997,0.0,0.0,0.0,0.0,0.0
t+2h,0.003,0.003,0.994,0.0,0.0,0.0,0.0
t+3h,0.004,0.003,0.003,0.991,0.0,0.0,0.0
t+4h,0.004,0.003,0.003,0.003,0.988,0.0,0.0
t+5h,0.004,0.003,0.003,0.003,0.003,0.985,0.0
t+6h,0.005,0.003,0.003,0.003,0.003,0.003,0.981


Notice that the effect of the persistence model is clearly visible. Almost every value is detected as shifted by the respective forecast time.