## Create Unit tests for this project

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from ts_boilerplate.params import ROOT_DIR, DATA, TRAIN, CROSS_VAL
from ts_boilerplate.data import get_X_y
from ts_boilerplate.generate_dummy_data import generate_data_monotonic_increase, generate_data_zeros_and_ones, generate_X_y_zeros_and_ones

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Let's create a dummy time series dataset whose values increment by 1 every day

In [2]:
# Dummy series from the project helper: per the stored outputs it is a (1000, 5)
# float array in which every column counts 0, 1, ..., 999, so each value equals
# its own row index (handy for reasoning about window positions later on).
data = generate_data_monotonic_increase()
data

array([[  0.,   0.,   0.,   0.,   0.],
       [  1.,   1.,   1.,   1.,   1.],
       [  2.,   2.,   2.,   2.,   2.],
       ...,
       [997., 997., 997., 997., 997.],
       [998., 998., 998., 998., 998.],
       [999., 999., 999., 999., 999.]])

In [3]:
data_df = pd.DataFrame(data)
data_df

Unnamed: 0,0,1,2,3,4
0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0
2,2.0,2.0,2.0,2.0,2.0
3,3.0,3.0,3.0,3.0,3.0
4,4.0,4.0,4.0,4.0,4.0
...,...,...,...,...,...
995,995.0,995.0,995.0,995.0,995.0
996,996.0,996.0,996.0,996.0,996.0
997,997.0,997.0,997.0,997.0,997.0
998,998.0,998.0,998.0,998.0,998.0


In [4]:
# Persist the dummy frame under <ROOT_DIR>/data/raw; index=False keeps the row
# index out of the file so only the five data columns are written.
data_df.to_csv(os.path.join(ROOT_DIR, "data", "raw", "data_dummy.csv"), index=False)

In [5]:
pd.read_csv(os.path.join(ROOT_DIR, "data", "raw", "data_dummy.csv"))

Unnamed: 0,0,1,2,3,4
0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0
2,2.0,2.0,2.0,2.0,2.0
3,3.0,3.0,3.0,3.0,3.0
4,4.0,4.0,4.0,4.0,4.0
...,...,...,...,...,...
995,995.0,995.0,995.0,995.0,995.0
996,996.0,996.0,996.0,996.0,996.0
997,997.0,997.0,997.0,997.0,997.0
998,998.0,998.0,998.0,998.0,998.0


## get_X_y

In [6]:
# Cut the full series into (X, y) samples using the TRAIN settings.
# In the stored output X is (975, 10, 5) and y is (975, 7, 2); presumably
# (n_samples, input_length, n_features) and (n_samples, output_length, n_targets)
# -- confirm against get_X_y.
X, y = get_X_y(data, **TRAIN)
print(X.shape)
print(y.shape)

(975, 10, 5)
(975, 7, 2)


In [7]:
# Let's compute the shape arithmetically (for unittests)
# The numerator removes from len(data) the rows that cannot start a sample
# (the remainder of the input window, the horizon offset and the remainder of
# the output window). Dividing by the stride is only exact when stride == 1,
# so the quotient is rounded up; np.ceil returns a float, which keeps the
# displayed value in the same form as before (e.g. 975.0).
np.ceil(
    (len(data)
     - (TRAIN['input_length'] - 1)
     - (TRAIN['output_length'] - 1)
     - TRAIN['horizon'])
    / TRAIN["stride"]
)

975.0

☝️ When `stride > 1` the division may not be exact, so the result must be rounded up (ceiling) to get the number of samples.

## Create train_test_split

In [8]:
train_test_ratio = TRAIN["train_test_ratio"]
input_length = TRAIN["input_length"]
output_length = TRAIN["output_length"]
data.shape

(1000, 5)

In [9]:
# Chronological split: the first `train_test_ratio` share of the rows is the train set.
last_train_idx = round(train_test_ratio * len(data))
data_train = data[0:last_train_idx, :]

# The test set deliberately starts `input_length` rows before the train set ends,
# so it overlaps the tail of data_train (rows 690-699 in the outputs below).
# Presumably those shared rows are meant to serve only as inputs of the first
# test windows; the gap check further down verifies the targets stay apart.
first_test_idx = last_train_idx - input_length
data_test = data[first_test_idx:, :]

In [10]:
data_train

array([[  0.,   0.,   0.,   0.,   0.],
       [  1.,   1.,   1.,   1.,   1.],
       [  2.,   2.,   2.,   2.,   2.],
       ...,
       [697., 697., 697., 697., 697.],
       [698., 698., 698., 698., 698.],
       [699., 699., 699., 699., 699.]])

In [11]:
data_test

array([[690., 690., 690., 690., 690.],
       [691., 691., 691., 691., 691.],
       [692., 692., 692., 692., 692.],
       ...,
       [997., 997., 997., 997., 997.],
       [998., 998., 998., 998., 998.],
       [999., 999., 999., 999., 999.]])

In [12]:
# Window both splits with the same TRAIN settings, then print one (X, y) pair
# from each side of the split for a visual check.
# NOTE(review): in the stored output X_train[-1] covers rows 427-436 although
# data_train ends at row 699, and X_test[0] starts at row 904 although data_test
# starts at row 690. get_X_y therefore does not appear to return samples in
# chronological order, so [-1] / [0] are not the pairs closest to the split
# boundary -- confirm against get_X_y.
X_train, y_train = get_X_y(data_train, **TRAIN)
X_test, y_test = get_X_y(data_test, **TRAIN)
print("####### Last train pair")
print(X_train[-1])
print(y_train[-1])
print("####### First test pair")
print(X_test[0])
print(y_test[0])

####### Last train pair
[[427. 427. 427. 427. 427.]
 [428. 428. 428. 428. 428.]
 [429. 429. 429. 429. 429.]
 [430. 430. 430. 430. 430.]
 [431. 431. 431. 431. 431.]
 [432. 432. 432. 432. 432.]
 [433. 433. 433. 433. 433.]
 [434. 434. 434. 434. 434.]
 [435. 435. 435. 435. 435.]
 [436. 436. 436. 436. 436.]]
[[446. 446.]
 [447. 447.]
 [448. 448.]
 [449. 449.]
 [450. 450.]
 [451. 451.]
 [452. 452.]]
####### First test pair
[[904. 904. 904. 904. 904.]
 [905. 905. 905. 905. 905.]
 [906. 906. 906. 906. 906.]
 [907. 907. 907. 907. 907.]
 [908. 908. 908. 908. 908.]
 [909. 909. 909. 909. 909.]
 [910. 910. 910. 910. 910.]
 [911. 911. 911. 911. 911.]
 [912. 912. 912. 912. 912.]
 [913. 913. 913. 913. 913.]]
[[923. 923.]
 [924. 924.]
 [925. 925.]
 [926. 926.]
 [927. 927.]
 [928. 928.]
 [929. 929.]]


In [13]:
# Smallest target value of the test set minus largest target value of the train set.
# On the monotonic dummy series every value equals its row index, so this is the
# distance (in time steps) between the two target ranges, independent of the
# order in which the samples are stored.
gap = np.min(y_test) - np.max(y_train)
gap

10.0

In [14]:
# The split is accepted only if the train and test target ranges are at least
# TRAIN["horizon"] steps apart; anything closer is reported as a data leak.
assert gap >= TRAIN["horizon"], "❗️❗️ Data leak detected between (X_train, y_train) and (X_test, y_test)❗️❗️ "

## Create folds

In [15]:
def get_folds(data: np.ndarray, fold_length: int, fold_stride: int):
    """Slice `data` into equally long, possibly overlapping folds along axis 0.

    Fold start rows are 0, fold_stride, 2 * fold_stride, ...; a fold is kept
    only when all `fold_length` rows fit inside `data`, so a trailing partial
    fold is never returned.

    Args:
        data: array with at least two dimensions; rows are sliced, all columns kept.
        fold_length: number of rows in each fold.
        fold_stride: number of rows between two consecutive fold starts.

    Returns:
        List of row slices of `data`, ordered by start row.
    """
    n_rows = len(data)
    return [
        data[start:start + fold_length, :]
        for start in range(0, n_rows, fold_stride)
        # Starts only grow, so skipping misfits equals stopping at the first one.
        if start + fold_length <= n_rows
    ]

# CROSS_VAL is unpacked into get_folds' keyword arguments (fold_length, fold_stride).
# In the stored output this yields 2 folds, the last one spanning rows 300-899.
folds = get_folds(data, **CROSS_VAL)
print('n_folds= ', len(folds))
print(folds[-1])

n_folds=  2
[[300. 300. 300. 300. 300.]
 [301. 301. 301. 301. 301.]
 [302. 302. 302. 302. 302.]
 ...
 [897. 897. 897. 897. 897.]
 [898. 898. 898. 898. 898.]
 [899. 899. 899. 899. 899.]]


## Model

In [16]:
import tensorflow as tf
from keras.layers import Dense, SimpleRNN, Reshape

In [17]:
# Minimal smoke-test network: one recurrent unit -> dense -> reshape to y's shape.
model = tf.keras.Sequential()
model.add(SimpleRNN(1, activation='tanh', input_shape=X_train.shape[1:]))
# One linear output per (future time step, target) pair, flattened ...
model.add(Dense(TRAIN['output_length'] * DATA["n_targets"], activation='linear'))
# ... then folded back so predictions have the same trailing shape as y_train.
model.add(Reshape(y_train.shape[1:]))
model.compile(
    loss='mse',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
    # `metrics` is documented as a list of metrics, so pass one explicitly
    # instead of a bare callable.
    metrics=[tf.keras.metrics.MAPE],
)
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the cell output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 1)                 7         
                                                                 
 dense (Dense)               (None, 14)                28        
                                                                 
 reshape (Reshape)           (None, 7, 2)              0         
                                                                 
Total params: 35
Trainable params: 35
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
# Single switch for logging: passed to both the callback and fit() below.
verbose = 0
# Stop once val_loss has failed to improve for 2 epochs in a row, and roll the
# model back to the weights of its best epoch.
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                          patience=2,
                                          verbose=verbose,
                                          mode='min',
                                          restore_best_weights=True)
# validation_split=0.3 holds out 30% of (X_train, y_train) to compute the
# val_loss that early stopping monitors; epochs=50 is only an upper bound.
history = model.fit(X_train,
                        y_train,
                        epochs=50,
                        batch_size=32,
                        validation_split=0.3,
                        callbacks=[es],
                        verbose=verbose)

In [19]:
y_test

array([[[923., 923.],
        [924., 924.],
        [925., 925.],
        ...,
        [927., 927.],
        [928., 928.],
        [929., 929.]],

       [[983., 983.],
        [984., 984.],
        [985., 985.],
        ...,
        [987., 987.],
        [988., 988.],
        [989., 989.]],

       [[977., 977.],
        [978., 978.],
        [979., 979.],
        ...,
        [981., 981.],
        [982., 982.],
        [983., 983.]],

       ...,

       [[981., 981.],
        [982., 982.],
        [983., 983.],
        ...,
        [985., 985.],
        [986., 986.],
        [987., 987.]],

       [[853., 853.],
        [854., 854.],
        [855., 855.],
        ...,
        [857., 857.],
        [858., 858.],
        [859., 859.]],

       [[978., 978.],
        [979., 979.],
        [980., 980.],
        ...,
        [982., 982.],
        [983., 983.],
        [984., 984.]]])

In [20]:
# Predict on the test windows.
# NOTE(review): in the stored output every sample receives the same prediction
# (values around 134-135) while the displayed targets lie roughly between 850
# and 990, i.e. the fitted model's output does not vary with its input here.
y_pred = model.predict(X_test)
y_pred 

array([[[134.77231, 134.36353],
        [134.54387, 134.68999],
        [134.29008, 135.11214],
        ...,
        [135.20494, 134.58734],
        [134.24094, 134.69635],
        [135.10703, 134.24292]],

       [[134.77231, 134.36353],
        [134.54387, 134.68999],
        [134.29008, 135.11214],
        ...,
        [135.20494, 134.58734],
        [134.24094, 134.69635],
        [135.10703, 134.24292]],

       [[134.77231, 134.36353],
        [134.54387, 134.68999],
        [134.29008, 135.11214],
        ...,
        [135.20494, 134.58734],
        [134.24094, 134.69635],
        [135.10703, 134.24292]],

       ...,

       [[134.77231, 134.36353],
        [134.54387, 134.68999],
        [134.29008, 135.11214],
        ...,
        [135.20494, 134.58734],
        [134.24094, 134.69635],
        [135.10703, 134.24292]],

       [[134.77231, 134.36353],
        [134.54387, 134.68999],
        [134.29008, 135.11214],
        ...,
        [135.20494, 134.58734],
        [134.24094

In [23]:
# Score the predictions with the project's own MAPE implementation.
# NOTE(review): the execution counter jumps from In [20] to In [23], so cells
# were run or removed in between, and this import sits away from the import
# cell at the top -- re-check with Restart Kernel -> Run All.
from ts_boilerplate.metrics import mape
mape(y_test, y_pred)

84.0849