# Simple RNN model

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../../db/data/merge/tertiary/logs_.csv', index_col=0)
df.index = pd.to_datetime(df.index)
display(df.head(3))
display(df.tail(3))

Unnamed: 0,AUD,CAD,CHF,EUR,GBP,JPY,NZD,USD
2005-01-03 00:00:00+00:00,-0.064396,-0.014073,-0.105662,-0.127787,0.028474,0.172005,-0.015203,0.126642
2005-01-03 01:00:00+00:00,-0.006224,-0.067656,-0.119911,-0.035191,-0.030165,0.148821,0.088015,0.022311
2005-01-03 02:00:00+00:00,-0.085077,0.127799,-0.158482,-0.086595,-0.074298,0.183128,-0.067183,0.160709


Unnamed: 0,AUD,CAD,CHF,EUR,GBP,JPY,NZD,USD
2022-12-27 21:00:00+00:00,-0.093193,0.015155,-0.084192,0.179205,0.008747,-0.063549,-0.015301,0.053128
2022-12-27 22:00:00+00:00,-0.093193,0.015155,-0.084192,0.179205,0.008747,-0.063549,-0.015301,0.053128
2022-12-27 23:00:00+00:00,-0.093193,0.015155,-0.084192,0.179205,0.008747,-0.063549,-0.015301,0.053128


In [3]:
df.shape

(111096, 8)

## Clean unknown data

In [4]:
df = df.loc[:'2022-08-31 10:00']
df.shape

(109067, 8)

## Split train and test data

In [5]:
from datetime import datetime, timedelta
from pytz import UTC

In [6]:
DAY = datetime(2022, 7, 1, 0, 0, 0, 0, UTC)

In [7]:
df_train = df.loc[:DAY - timedelta(hours=1)]
df_test = df.loc[DAY:]
df_train.shape, df_test.shape

((108024, 8), (1043, 8))

## Use just EUR column

In [8]:
TARGETS = ['EUR']
df_train_eur = df_train[TARGETS]
df_test_eur = df_test[TARGETS]

## Scaling

In [28]:
from sklearn.preprocessing import MinMaxScaler

In [42]:
scaler = MinMaxScaler()
df_train_scaled = pd.DataFrame(scaler.fit_transform(df_train_eur), columns=df_train_eur.columns)
df_test_scaled = pd.DataFrame(scaler.transform(df_test_eur), columns=df_test_eur.columns)

## Create folds train and validation

In [43]:
from typing import List

In [44]:
FOLD_LENGTH = 12 * 21 * 24 # 1 year
FOLD_STRIDE = 1 * 21 * 24 # 1 month
TRAIN_TEST_RATIO = 0.8
INPUT_LENGTH = 1 * 21 * 24 # ?

In [45]:
def create_folds(df: pd.DataFrame,
                 length: int,
                 stride: int) -> List[pd.DataFrame]:
    '''
    This function slides through the Time Series dataframe of shape (n_timesteps, n_features) to create folds
    - of equal `length`
    - using `stride` between each fold
    
    Returns a list of folds, each as a DataFrame
    '''
    folds = []
    
    for i in range(len(df), 0, -stride):
        if i - length < 0:
            break
        fold = df.iloc[i - length:i]
        folds.insert(0, fold)
            
    return folds

In [46]:
folds = create_folds(df_train_scaled, FOLD_LENGTH, FOLD_STRIDE)
print(f'The function generated {len(folds)} folds.')
print(f'Each fold has a shape equal to {folds[0].shape}.')

The function generated 203 folds.
Each fold has a shape equal to (6048, 1).


In [47]:
folds[-1]

Unnamed: 0,EUR
101976,0.441654
101977,0.429481
101978,0.420444
101979,0.428927
101980,0.427824
...,...
108019,0.432167
108020,0.437175
108021,0.412708
108022,0.416648


### Create `train` and `test` splits for each fold

In [74]:
def train_val_split(fold: pd.DataFrame,
                    train_ratio: float,
                    input_length: int):
    '''
    Returns a train dataframe and a val dataframe (fold_train, fold_val)
    from which one can sample (X,y) sequences.
    df_train should contain all the timesteps until round(train_val_ratio * len(fold))   
    '''
    last_train_i = round(train_ratio * len(fold))
    fold_train = fold.iloc[:last_train_i]
    fold_val = fold.iloc[last_train_i - input_length:]
    
    return fold_train, fold_val

In [75]:
fold_train, fold_val = train_val_split(folds[-1], TRAIN_TEST_RATIO, INPUT_LENGTH)
fold_train.shape, fold_val.shape

((4838, 1), (1714, 1))

In [None]:
fold_test

### Create X and y sequences from each fold

#### Get chronological X, y (Option 1)

In [50]:
def get_X_y_strides(fold: pd.DataFrame,
                    input_length: int,
                    output_length: int,
                    stride: int):
    '''
    - slides through a `fold` Time Series (2D array) to create sequences of equal
        * `input_length` for X,
        * `output_length` for y,
    using a temporal gap `sequence_stride` between each sequence
    - returns a list of sequences, each as a 2D-array time series
    '''
    X, y = [], []
    
    for i in range(len(fold), 0, -stride):
        if i - input_length - output_length < 0:
            break
        X_i = fold.iloc[i - input_length: i]
        y_i = fold.iloc[i - input_length - output_length: i - input_length][TARGETS]
        X.insert(0, X_i)
        y.insert(0, y_i)
        
    return np.array(X), np.array(y)

In [51]:
SEQUENCE_STRIDE = 6 # Every 6 hours
OUTPUT_LENGTH = 1 # Number of values to predict

X_train, y_train = get_X_y_strides(fold_train, INPUT_LENGTH, OUTPUT_LENGTH, SEQUENCE_STRIDE)
X_val, y_val = get_X_y_strides(fold_val, INPUT_LENGTH, OUTPUT_LENGTH, SEQUENCE_STRIDE)

X_train.shape, y_train.shape, X_val.shape, y_val.shape

((723, 504, 1), (723, 1, 1), (202, 504, 1), (202, 1, 1))

#### Get random n sequences (Option 2) 

In [52]:
def get_Xi_yi(fold:pd.DataFrame, 
              input_length:int, 
              output_length:int):
    '''
    - given a fold, it returns one sequence (X_i, y_i)
    - with the starting point of the sequence being chosen at random
    '''
    
    first_possible_start = 0
    last_possible_start = len(fold) - input_length - output_length + 1
    start = np.random.randint(first_possible_start, last_possible_start)
    X_i = fold.iloc[start:start+input_length]    
    y_i = fold.iloc[start+input_length:
                  start+input_length+output_length][TARGETS]
    
    return X_i, y_i

In [53]:
def get_X_y(fold:pd.DataFrame,
            number_of_sequences:int,
            input_length:int,
            output_length:int):
        
    X, y = [], []

    for i in range(number_of_sequences):
        Xi, yi = get_Xi_yi(fold, input_length, output_length)
        X.append(Xi)
        y.append(yi)
        
    return np.array(X), np.array(y)

In [54]:
N_TRAIN_SEQ = 500
N_TEST_SEQ = round(500 * (1 - TRAIN_TEST_RATIO))

In [56]:
X_train, y_train = get_X_y(fold_train, N_TRAIN_SEQ, INPUT_LENGTH, OUTPUT_LENGTH)
X_val, y_val = get_X_y(fold_val, N_TEST_SEQ, INPUT_LENGTH, OUTPUT_LENGTH)
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((500, 504, 1), (500, 1, 1), (100, 504, 1), (100, 1, 1))

## Prepare X_test, y_test

In [77]:
X_test, y_test = get_X_y(df_test_scaled, N_TEST_SEQ, INPUT_LENGTH, OUTPUT_LENGTH)
X_test.shape, y_test.shape

((100, 504, 1), (100, 1, 1))

## Modelling

In [78]:
from typing import Union
from keras.api._v2.keras import Model
from keras.api._v2.keras.optimizers import Optimizer, Adam
from keras.api._v2.keras.models import Sequential
from keras.api._v2.keras.layers import Dense, SimpleRNN, Dropout, LSTM, Lambda
from keras.api._v2.keras.callbacks import EarlyStopping
from keras.api._v2.keras.layers.experimental.preprocessing import Normalization

### Create normalizer (works but not inside the model)

In [79]:
def init_normalizer(X_train):
    n = Normalization()
    n.adapt(X_train)
    return n

### Create baseline model

It will predict the last temperature, no need to `fit` it as it has no params.

In [80]:
def init_baseline():

    model = Sequential()
    model.add(Lambda(lambda x: x[:,-1,:, None]))

    return model

### Create simple RNN model

In [81]:
def init_srnn_model(X_train, y_train):
    
    regressor = Sequential()

    regressor.add(SimpleRNN(50, return_sequences=True, input_shape=X_train[0].shape))
    regressor.add(Dropout(0.2))

    regressor.add(SimpleRNN(50, return_sequences=True))
    regressor.add(Dropout(0.2))

    regressor.add(SimpleRNN(50, return_sequences=True))
    regressor.add(Dropout(0.2))

    regressor.add(SimpleRNN(50))

    regressor.add(Dense(y_train.shape[1], activation="linear"))
    
    return regressor

### Create LSTM model

In [82]:
def init_lstm_model(X_train, y_train):
    
    # norm = Normalization()
    # norm.adapt(X_train)
    
    lstm = Sequential()
    
    # lstm.add(norm)
    
    lstm.add(LSTM(32, return_sequences=True, recurrent_dropout=0.2))
    lstm.add(LSTM(32, recurrent_dropout=0.2))
    
    lstm.add(Dense(y_train.shape[1], activation='linear'))
    
    return lstm

### Create compile and fit functions

In [83]:
def compile_model(model: Model,
                  optimizer:Union[str, Optimizer]='adam',
                  loss='mse',
                  metrics:list=['mae']):
    
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    
    return model

In [84]:
def fit_model(model: Model,
              X, y,
              X_val, y_val,
              epochs:int=50,
              batch_size:int=16,
              verbose:int=1):
    
    es = EarlyStopping(patience=3,
                       mode="min",
                       restore_best_weights=True)

    history = model.fit(X, y,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_data=(X_val, y_val),
                        callbacks=[es])
    
    return model, history

### Initialise, compile and train the models

In [85]:
baseline = init_baseline()
baseline = compile_model(baseline)
baseline_score = baseline.evaluate(X_test, y_test)

print(f"- The Baseline MAE on the test set is equal to {round(baseline_score[1],6)} log return")

- The Baseline MAE on the test set is equal to 0.030887 log return


In [86]:
srnn = init_srnn_model(X_train, y_train)
srnn = compile_model(srnn)
srnn, history = fit_model(srnn, X_train, y_train, X_val, y_val)
srnn_score = srnn.evaluate(X_test, y_test)

print(f"- The Simple RNN MAE on the test set is equal to {round(srnn_score[1],6)} log return")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
- The Simple RNN MAE on the test set is equal to 0.025223 log return


In [87]:
lstm = init_lstm_model(X_train, y_train)
lstm = compile_model(lstm)
lstm, history = fit_model(lstm, X_train, y_train, X_val, y_val)
lstm_score = lstm.evaluate(X_test, y_test)

print(f"- The LSTM MAE on the test set is equal to {round(lstm_score[1],6)} log return")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
- The LSTM MAE on the test set is equal to 0.023822 log return
