# Dynamic Time Warping measure

The forecasts produced in notebook `02-LSTM-experiment` are visibly delayed in the visualized storms; however, the metrics used there do not indicate that the model performs badly. We therefore introduce a new measure based on dynamic time warping (DTW) to detect this kind of error.

In [1]:
import sys
import pandas as pd
import numpy as np

sys.path.append('../')

from src.dtw.dtw_measure import dtw_measure

In [3]:
import h5py
# Import the data
def load_testing_sets(fname='../data/processed/datasets.h5'):
    '''Read the stored test-set arrays from an HDF5 file.

    input:
        fname: path of the HDF5 file, opened read-only.
    output:
        Tuple ``(test_in, test_out, prediction, lookup)`` holding the full
        contents of the datasets of the same names in the ``test_sets``
        group; ``lookup`` is cast to ``datetime64[s]``.
    '''
    dataset_paths = (
        'test_sets/test_in',
        'test_sets/test_out',
        'test_sets/prediction',
        'test_sets/lookup',
    )
    with h5py.File(fname, 'r') as h5file:
        # ``[:]`` copies each dataset into memory before the file is closed.
        inputs, targets, forecasts, timestamps = [
            h5file[name][:] for name in dataset_paths
        ]
    return inputs, targets, forecasts, timestamps.astype('datetime64[s]')

# Load the test_in, test_out, prediction and lookup arrays from the default
# HDF5 file; they are kept as notebook-level variables for the cells below.
test_in, test_out, predict, lookup = load_testing_sets()
# Number of forecast steps evaluated below; also handed to dtw_measure as its
# third argument.
time_forward = 6

An important condition for DTW is that each time series is continuous: concatenating independent time series and evaluating the result as a single series gives incorrect results. In notebook `01-data-preparation`, invalid measurements were removed, which breaks the test data into a set of continuous time series. Each of these series must first be identified.

In [4]:
def extract_continuous_intervals(table):
    r'''Find the gap-free hourly stretches in a lookup table of timestamps.

    The first column of ``table`` is read as timestamps. The function
    repeatedly takes the remaining timestamps, determines which hourly stamps
    between the first and the last one are missing, records the gap-free run
    before the first gap and the one after the last gap, and then continues
    with the timestamps lying between those two gaps.

    input:
        table: 2-D array whose first column holds the timestamps
            (assumed unique, ascending and on an hourly grid -- TODO confirm).
    output:
        Array with one ``[first_row, last_row]`` pair per continuous interval.
        Both entries are row positions in ``table`` and ``last_row`` is the
        row of the interval's last sample (inclusive). Pairs are listed in
        the order they are found (outermost runs first), not chronologically.
        An empty ``table`` yields an empty array.
    '''
    one_hour = pd.Timedelta('1h')
    # A run next to a gap is kept only if it spans more than this many hours.
    min_size = 10

    stamps = pd.to_datetime(table[:, 0])
    # Maps every timestamp to its row position in ``table``.
    row_of = pd.Series(np.arange(table.shape[0]), index=stamps)

    timeseries = []
    series = stamps
    while len(series) > 0:
        # The frequency is passed as a Timedelta because the 'H' alias is
        # deprecated since pandas 2.2 and rejected by newer releases.
        expected = pd.date_range(series[0], series[-1], freq=one_hour)
        missing = expected.difference(series)

        if len(missing) == 0:
            # No gap left: the remaining stamps form one continuous run.
            # NOTE(review): min_size is not applied on this path, so a short
            # run enclosed by gaps is still returned -- confirm this is wanted.
            timeseries.append([row_of.loc[series[0]], row_of.loc[series[-1]]])
            break

        # Run from the first stamp up to the hour before the first gap.
        head_end = missing[0] - one_hour
        if (head_end - series[0]) / one_hour > min_size:
            timeseries.append([row_of.loc[series[0]], row_of.loc[head_end]])

        # Run from the hour after the last gap up to the last stamp.
        tail_start = missing[-1] + one_hour
        if (series[-1] - tail_start) / one_hour > min_size:
            timeseries.append([row_of.loc[tail_start], row_of.loc[series[-1]]])

        # Continue with the stamps that are present between the first and the
        # last missing hour.
        between = pd.date_range(missing[0], missing[-1], freq=one_hour)
        series = between.difference(missing)

    return np.array(timeseries)
# One [first_row, last_row] pair per continuous interval of the test set;
# last_row is the row of the interval's final sample.
intervals = extract_continuous_intervals(lookup)

Now that we have the continuous intervals, the DTW measure is applied to each of them. From the resulting warping path, we measure the time shift between each pair of mapped points. The total counts are collected in a pandas DataFrame, which `reformat_dtw_res` then normalizes row-wise so that each row gives the fraction of points falling on each shift.

In [5]:
# bincounts[i, k] accumulates, for forecast step i, how many warping-path
# pairs have an absolute index offset of k (k = 0..6).
bincounts = np.zeros((time_forward,7))
# Number of intervals processed; it is not read again in this cell.
counter = 0
# NOTE(review): `stop` is the row of the interval's last sample, so the slice
# start:stop leaves that sample out -- confirm whether stop + 1 was intended.
for start, stop in intervals:
    counter += 1
    for i in range(time_forward):
        # Only the second return value of dtw_measure (the path) is used.
        # Assumes path is a 2 x K integer array of paired indices -- TODO confirm.
        _, path, _ = dtw_measure(predict[start:stop, 0, i], test_out[start:stop, 0, i], time_forward)
        # Distinct absolute offsets along the path and how often each occurs.
        bins, counts = np.unique(abs(path[0, :] - path[1, :]), return_counts=True)
        # An offset above 6 would raise an IndexError here.
        bincounts[i, bins] += counts
        
# Rows are labelled 1..time_forward (forecast step), columns 0..6 (offset).
lat_res = pd.DataFrame(data=bincounts, index=np.arange(1, time_forward+1), columns=np.arange(7))
print(lat_res)

         0        1        2        3        4        5        6
1  10753.0  22537.0   1396.0    444.0    175.0     96.0     51.0
2   3911.0  12359.0  16762.0   2835.0   1284.0    577.0    319.0
3   2905.0   4163.0  10566.0  14300.0   4014.0   2179.0   1578.0
4   2790.0   2648.0   4309.0   9123.0  12396.0   4899.0   4046.0
5   3273.0   2319.0   2822.0   4228.0   8072.0  10761.0   8302.0
6   3692.0   2230.0   2594.0   3013.0   4277.0   7342.0  15777.0


In [6]:
def reformat_dtw_res(df, filename=None, path=''):
    '''Normalize the DTW shift counts row-wise and label the table.

    Every row of ``df`` is divided by its row sum, the rows are relabelled
    ``t+1h``, ``t+2h``, ... (index name ``Prediction``), each column label
    ``c`` becomes ``'<c>h'`` and the values are rounded to three decimals.

    input:
        df: DataFrame of counts with one row per forecast step and one
            column per shift.
        filename: optional file name; when given, the result is also written
            as CSV to ``<path>reformated_<filename>``.
        path: prefix placed verbatim in front of the file name, so it must
            include the trailing separator (e.g. ``'../reports/'``). The
            default writes to the current working directory.
    output:
        The normalized, relabelled DataFrame; ``df`` itself is not modified.
    '''
    # div() returns a new frame, so the caller's counts stay untouched.
    res = df.div(df.sum(axis=1), axis=0)

    res.index = pd.Index(
        ['t+{}h'.format(step + 1) for step in range(res.shape[0])],
        name='Prediction',
    )
    res.columns = ['{}h'.format(col) for col in res.columns]
    res = res.round(3)

    if filename:
        # ``path`` used to be an undefined name here; it is now a parameter.
        res.to_csv('{}reformated_{}'.format(path, filename))
    return res
        
# Show the row-normalized shift table; no filename is given, so nothing is
# written to disk.
reformat_dtw_res(lat_res)

Prediction,0h,1h,2h,3h,4h,5h,6h
t+1h,0.303,0.636,0.039,0.013,0.005,0.003,0.001
t+2h,0.103,0.325,0.441,0.075,0.034,0.015,0.008
t+3h,0.073,0.105,0.266,0.36,0.101,0.055,0.04
t+4h,0.069,0.066,0.107,0.227,0.308,0.122,0.101
t+5h,0.082,0.058,0.071,0.106,0.203,0.271,0.209
t+6h,0.095,0.057,0.067,0.077,0.11,0.189,0.405
