In [None]:
#|default_exp data.unwindowed

# Unwindowed datasets

>Functionality that will allow you to create a dataset that applies sliding windows to the input data on the fly. This heavily reduces the size of the input data files, as only the original unwindowed data needs to be stored.

I'd like to thank both **Thomas Capelle** (https://github.com/tcapelle) and **Xander Dunn** (https://github.com/xanderdunn) for their contributions, which made this code possible.

In [None]:
#|export
from tsai.imports import *
from tsai.utils import *
from tsai.data.validation import *
from tsai.data.core import *

In [None]:
#|export
class TSUnwindowedDataset():
    """Dataset that builds sliding-window samples from unwindowed data at indexing time.

    Windows are gathered from `X` (and `y`) with integer-array indexing inside `__getitem__`, so only the
    original, unwindowed data is kept on the instance.

    Args:
        X: 1-D or 2-D array exposing `.ndim`, `.shape` and integer-array indexing. `None` builds an empty
            placeholder (this is what `new_empty` does).
        y: optional targets, indexed with the same window indices as `X`.
        y_func: optional callable applied to the windowed targets before they are wrapped.
        window_size: number of consecutive steps in each window.
        stride: distance between the start positions of consecutive windows.
        drop_start: first window start position that is used.
        drop_end: amount subtracted from the (exclusive) upper bound of the window start positions.
        seq_first: True when the sequence axis of `X` is the first axis, False when it is the last one.
        **kwargs: only `split` is read; when given, it maps dataset indices to window indices.
    """
    _types = TSTensor, TSLabelTensor

    def __init__(self, X=None, y=None, y_func=None, window_size=1, stride=1, drop_start=0, drop_end=0, seq_first=True, **kwargs):
        # Keep this as the first statement: the attributes read back later (self.X, self.y, self.y_func,
        # self.seq_first, ...) come from this call, so it has to run before `X` is rebound below.
        store_attr()
        if X is not None:
            if X.ndim == 1:
                if seq_first:
                    # Column form is used for shape validation only: self.X stays 1-D and
                    # __getitem__ inserts the variable axis itself.
                    X = np.expand_dims(X, 1)
                else:
                    # A 1-D series has no variable axis, so store it as a single row. Expanding it to a
                    # column here would make seq_len == 1, and __getitem__ could not index a 1-D array
                    # with `self.X[:, widxs]`.
                    X = np.expand_dims(X, 0)
                    self.X = X
            shape = X.shape
            assert len(shape) == 2, 'X must have 1 or 2 dimensions'
            seq_len = shape[0] if seq_first else shape[-1]
            max_time = seq_len - window_size + 1 - drop_end
            assert max_time > 0, 'you need to modify either window_size or drop_end as they are larger than seq_len'
            # Column vector of window start positions (empty when drop_start >= max_time) ...
            self.all_idxs = np.expand_dims(np.arange(drop_start, max_time, step=stride), 0).T
            # ... plus a row vector of in-window offsets: their broadcast sum is an (n_windows, window_size) index grid.
            self.window_idxs = np.expand_dims(np.arange(window_size), 0)
            self.split = kwargs.get('split')
            self.n_inp = 1
            # Integer targets (scalars, or list-like targets whose first element is an integer) select the
            # cross-entropy loss; no targets or any other target type select MSE.
            if y is not None and ((is_listy(y[0]) and isinstance(y[0][0], Integral)) or isinstance(y[0], Integral)):
                self.loss_func = CrossEntropyLossFlat()
            else:
                self.loss_func = MSELossFlat()

    def __len__(self):
        "Number of selectable windows: 0 for an empty placeholder, else the size of `split` or of all windows"
        # `split` is only set when the dataset was built with data.
        if not hasattr(self, "split"): return 0
        if self.split is not None: return len(self.split)
        return len(self.all_idxs)

    def __getitem__(self, idxs):
        "Return `(xb,)` or `(xb, yb)` for `idxs`, with `xb` laid out as (samples, variables, window steps)"
        if self.split is not None:
            idxs = self.split[idxs]
        # Start positions (column) + in-window offsets (row) -> one row of positions per requested window.
        widxs = self.all_idxs[idxs] + self.window_idxs
        if self.seq_first:
            xb = self.X[widxs]
            if xb.ndim == 3: xb = xb.transpose(0,2,1)  # (samples, steps, vars) -> (samples, vars, steps)
            else: xb = np.expand_dims(xb, 1)            # 1-D X: insert the single-variable axis
        else:
            xb = self.X[:, widxs].transpose(1,0,2)      # (vars, samples, steps) -> (samples, vars, steps)
        if self.y is None:
            return (self._types[0](xb),)
        # NOTE(review): assumes `y` supports 2-D integer-array indexing along its first axis — confirm with callers.
        yb = self.y[widxs]
        if self.y_func is not None:
            yb = self.y_func(yb)
        return (self._types[0](xb), self._types[1](yb))

    def new_empty(self):
        "Create an instance of the same type that holds no data"
        return type(self)(X=None, y=None)

    def _first_input(self):
        "Input part of the first item, unwrapped once if it is itself a tuple (the item is built a single time)"
        first = self[0][0]
        return first[0] if isinstance(first, tuple) else first

    @property
    def vars(self):
        "Size of the second-to-last axis of the first input"
        return self._first_input().shape[-2]

    @property
    def len(self):
        "Size of the last axis of the first input"
        return self._first_input().shape[-1]


class TSUnwindowedDatasets(FilteredBase):
    "Couples a dataset with a collection of splits and rebuilds the dataset restricted to one split on request"
    def __init__(self, dataset, splits):
        store_attr()

    def subset(self, i):
        "Build a new dataset of the same type and configuration as `self.dataset`, limited to `self.splits[i]`"
        ds = self.dataset
        config = dict(y=ds.y, y_func=ds.y_func, window_size=ds.window_size, stride=ds.stride,
                      drop_start=ds.drop_start, drop_end=ds.drop_end, seq_first=ds.seq_first)
        return type(ds)(ds.X, split=self.splits[i], **config)

    def __getitem__(self, i):
        "Indexing is an alias for `subset`"
        return self.subset(i)

    @property
    def train(self):
        "Subset built from the first split"
        return self.subset(0)

    @property
    def valid(self):
        "Subset built from the second split"
        return self.subset(1)

In [None]:
def y_func(y):
    "Cast `y` to float and average it along axis 1"
    y_float = y.astype('float')
    return y_float.mean(1)

This approach works with both univariate and multivariate data.

* Univariate: we'll use a simple array with 20 values, one with the seq_len first (X0), the other with seq_len second (X1).
* Multivariate: we'll use 2 time series arrays, one with the seq_len first (X2), the other with seq_len second (X3). No sliding window has been applied to them yet. 

In [None]:
# Univariate: the same 20-step series with the sequence axis first (X0) and last (X1)
X0 = np.arange(20, dtype=float)
X1 = np.arange(20, dtype=float).reshape(1, -1)
X0.shape, X0, X1.shape, X1

((20,),
 array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
        13., 14., 15., 16., 17., 18., 19.]),
 (1, 20),
 array([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
         13., 14., 15., 16., 17., 18., 19.]]))

In [None]:
# Multivariate: 20 steps of 3 variables scaled by 1, 10 and 100; X2 is (steps, vars), X3 is (vars, steps)
X2 = np.arange(20)[:, None] * np.array([1, 10, 100], dtype=float)[None, :]
X3 = np.arange(20)[None, :] * np.array([1, 10, 100], dtype=float)[:, None]
X2.shape, X3.shape, X2, X3

((20, 3),
 (3, 20),
 array([[0.0e+00, 0.0e+00, 0.0e+00],
        [1.0e+00, 1.0e+01, 1.0e+02],
        [2.0e+00, 2.0e+01, 2.0e+02],
        [3.0e+00, 3.0e+01, 3.0e+02],
        [4.0e+00, 4.0e+01, 4.0e+02],
        [5.0e+00, 5.0e+01, 5.0e+02],
        [6.0e+00, 6.0e+01, 6.0e+02],
        [7.0e+00, 7.0e+01, 7.0e+02],
        [8.0e+00, 8.0e+01, 8.0e+02],
        [9.0e+00, 9.0e+01, 9.0e+02],
        [1.0e+01, 1.0e+02, 1.0e+03],
        [1.1e+01, 1.1e+02, 1.1e+03],
        [1.2e+01, 1.2e+02, 1.2e+03],
        [1.3e+01, 1.3e+02, 1.3e+03],
        [1.4e+01, 1.4e+02, 1.4e+03],
        [1.5e+01, 1.5e+02, 1.5e+03],
        [1.6e+01, 1.6e+02, 1.6e+03],
        [1.7e+01, 1.7e+02, 1.7e+03],
        [1.8e+01, 1.8e+02, 1.8e+03],
        [1.9e+01, 1.9e+02, 1.9e+03]]),
 array([[0.0e+00, 1.0e+00, 2.0e+00, 3.0e+00, 4.0e+00, 5.0e+00, 6.0e+00,
         7.0e+00, 8.0e+00, 9.0e+00, 1.0e+01, 1.1e+01, 1.2e+01, 1.3e+01,
         1.4e+01, 1.5e+01, 1.6e+01, 1.7e+01, 1.8e+01, 1.9e+01],
        [0.0e+00, 1.0e+01, 2.0

Now, instead of applying SlidingWindow to create and save the time series that can be consumed by a time series model, we can use a dataset that creates the data on the fly. In this way we avoid the need to create and save large files. This approach is also useful when you want to test different sliding window sizes, as otherwise you would need to create files for every size you want to test. The dataset will create the samples correctly formatted and ready to be passed on to a time series architecture.

In [None]:
# Window the univariate series on the fly: both layouts must produce the same samples
ds0 = TSUnwindowedDataset(X0, window_size=5, stride=2, seq_first=True)
ds1 = TSUnwindowedDataset(X1, window_size=5, stride=2, seq_first=False)
wds0, wds1 = ds0[:][0], ds1[:][0]
test_eq(wds0, wds1)
wds0, wds0.data, wds1, wds1.data

(TSTensor(samples:8, vars:1, len:5, device=cpu),
 tensor([[[ 0.,  1.,  2.,  3.,  4.]],
 
         [[ 2.,  3.,  4.,  5.,  6.]],
 
         [[ 4.,  5.,  6.,  7.,  8.]],
 
         [[ 6.,  7.,  8.,  9., 10.]],
 
         [[ 8.,  9., 10., 11., 12.]],
 
         [[10., 11., 12., 13., 14.]],
 
         [[12., 13., 14., 15., 16.]],
 
         [[14., 15., 16., 17., 18.]]]),
 TSTensor(samples:8, vars:1, len:5, device=cpu),
 tensor([[[ 0.,  1.,  2.,  3.,  4.]],
 
         [[ 2.,  3.,  4.,  5.,  6.]],
 
         [[ 4.,  5.,  6.,  7.,  8.]],
 
         [[ 6.,  7.,  8.,  9., 10.]],
 
         [[ 8.,  9., 10., 11., 12.]],
 
         [[10., 11., 12., 13., 14.]],
 
         [[12., 13., 14., 15., 16.]],
 
         [[14., 15., 16., 17., 18.]]]))

In [None]:
# Window the multivariate series on the fly: both layouts must produce the same samples
ds2 = TSUnwindowedDataset(X2, window_size=5, stride=2, seq_first=True)
ds3 = TSUnwindowedDataset(X3, window_size=5, stride=2, seq_first=False)
wds2, wds3 = ds2[:][0], ds3[:][0]
test_eq(wds2, wds3)
wds2, wds3, wds2.data, wds3.data

(TSTensor(samples:8, vars:3, len:5, device=cpu),
 TSTensor(samples:8, vars:3, len:5, device=cpu),
 tensor([[[0.0000e+00, 1.0000e+00, 2.0000e+00, 3.0000e+00, 4.0000e+00],
          [0.0000e+00, 1.0000e+01, 2.0000e+01, 3.0000e+01, 4.0000e+01],
          [0.0000e+00, 1.0000e+02, 2.0000e+02, 3.0000e+02, 4.0000e+02]],
 
         [[2.0000e+00, 3.0000e+00, 4.0000e+00, 5.0000e+00, 6.0000e+00],
          [2.0000e+01, 3.0000e+01, 4.0000e+01, 5.0000e+01, 6.0000e+01],
          [2.0000e+02, 3.0000e+02, 4.0000e+02, 5.0000e+02, 6.0000e+02]],
 
         [[4.0000e+00, 5.0000e+00, 6.0000e+00, 7.0000e+00, 8.0000e+00],
          [4.0000e+01, 5.0000e+01, 6.0000e+01, 7.0000e+01, 8.0000e+01],
          [4.0000e+02, 5.0000e+02, 6.0000e+02, 7.0000e+02, 8.0000e+02]],
 
         [[6.0000e+00, 7.0000e+00, 8.0000e+00, 9.0000e+00, 1.0000e+01],
          [6.0000e+01, 7.0000e+01, 8.0000e+01, 9.0000e+01, 1.0000e+02],
          [6.0000e+02, 7.0000e+02, 8.0000e+02, 9.0000e+02, 1.0000e+03]],
 
         [[8.0000e+00, 9.0

In [None]:
#|eval: false
#|hide
from tsai.export import get_nb_name
from tsai.imports import create_scripts

# Resolve this notebook's name, then run the notebook-to-script export for it
nb_name = get_nb_name(locals())
create_scripts(nb_name)

<IPython.core.display.Javascript object>

/Users/nacho/notebooks/tsai/nbs/014_data.unwindowed.ipynb saved at 2022-11-09 12:45:30
Correct notebook to script conversion! 😃
Wednesday 09/11/22 12:45:33 CET
