In [None]:
%%HTML
<link rel="stylesheet" type="text/css" href="../css/custom.css">

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam


%matplotlib inline

In [None]:
# Default (width, height) used for figures that do not set their own size.
plt.rcParams["figure.figsize"] = (15, 6)
# Seed NumPy's global random number generator.
np.random.seed(7)

# RNN forecast of airline passengers


![footer_logo](../images/logo.png)

## Goal 

- Create a many-to-one RNN in `Keras` using the `SimpleRNN` recurrent layer.
- Use callbacks to save models and monitor progress

## Data

We will consider the 'standard' airline passenger problem.
Given a year and a month, the task is to predict the number of international airline passengers in units of 1000.
The data ranges from January 1949 to December 1960 (12 years), giving 144 monthly observations.

Load the passenger data, which is described as:

> "International airline passengers: monthly totals in thousands. Jan 1949 to Dec 1960"

In [None]:
# The first CSV column becomes the index and is parsed as dates.
passengers = pd.read_csv(
    "../data/airline_passengers.csv",
    index_col=0,
    parse_dates=True,
)
passengers.head(15)

Below we'll prepare the dataset for the `RNN`.
`y` contains the number of passengers for a given month.
`X` is of size $n \times p$ where $p$ is the number of months looking back (see `LOOK_BACK`) and $n$ is the total number of observations minus the number of months looking back.



In [None]:
def create_dataset(input_dataset, look_back=1):
    """Build lagged features and targets from the passenger series.

    Parameters
    ----------
    input_dataset : dataframe
        Frame containing a 'passengers' column. It is not modified.

    look_back : int
        Number of previous rows to add as lag features.

    Returns
    -------
    X : dataframe
        All columns except 'passengers', in reversed column order, so the
        oldest lag (`t_min_<look_back>`) comes before `t_min_1`.

    y : dataframe
        Single-column frame holding 'passengers'.

    Both frames omit the first `look_back` rows.
    """
    frame = input_dataset.copy()

    # t_min_k holds the passenger count from k rows earlier.
    for lag in range(1, look_back + 1):
        frame[f't_min_{lag}'] = frame['passengers'].shift(lag)

    # The first `look_back` rows have an incomplete history; drop them.
    frame = frame.iloc[look_back:]

    # Features: everything but the target, with the column order reversed.
    features = frame.drop(columns=['passengers'])
    features = features[list(reversed(features.columns))]

    # Target as a one-column dataframe.
    target = frame['passengers'].to_frame()

    return features, target


In [None]:
# Number of previous months used as input features for each prediction.
LOOK_BACK = 12

# Pass the constant (not a literal) so changing LOOK_BACK actually takes effect.
X, y = create_dataset(passengers, LOOK_BACK)


In [None]:
X.head()

In [None]:
y.head()

Standardize the data:

In [None]:
TEST_SIZE = 18  # months to forecast

# preprocess the data and split in train and test sets:
# the last TEST_SIZE rows are the test set, everything before is training data
train_mask = np.array([True] * (len(X) - TEST_SIZE) + [False] * TEST_SIZE)

# fit scaler to targets in training set
# (fit on the underlying array: the scaler is only ever applied to plain arrays
# below, and fitting on a dataframe makes scikit-learn warn about missing
# feature names on each of those calls)
scaler = StandardScaler().fit(y[train_mask].to_numpy())

# rescale all data: flatten each split to a single column so the one-feature
# scaler applies to every value, then restore the split's original shape
X_train, y_train, X_test, y_test = (
    scaler.transform(dataset.to_numpy().reshape(-1, 1)).reshape(dataset.shape)
    for dataset in [X[train_mask], y[train_mask], X[~train_mask], y[~train_mask]]
)
# dates of the test rows, used later to index the forecasts
test_ix = y[~train_mask].index

Reshape the input to have the shape `[samples, time steps, features]`:

In [None]:
# Add a trailing feature axis: (samples, time steps) -> (samples, time steps, 1).
# reshape is used instead of indexing with np.newaxis so the cell is idempotent:
# re-running it leaves an already 3-D array unchanged instead of adding a 4th axis.
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
print(X_train.shape)
print(X_test.shape)

Note that we only have 1 feature (the historical number of passengers in a month).

## Model: Many to one

> #### Exercise: NNAR forecast of airline passengers in Keras 
>
> The RNN network should have the following design:
>
> - two hidden `SimpleRNN` layers with **200 and 30 ReLU nodes**
>     - what's the difference with the `RNN` layer?  
>     - what does the parameter `return_sequences` do? how should you use it for multiple layers?
>     - how is the parameter `recurrent_dropout` different to the `Dropout` layer? 
> - **dropout** layers with fraction **0.2**
> - an output layer with a **single linear node**
> - **Adam optimizer** with learning rate of **1e-4** and decay of **1e-2** and **`mse` loss**
>     - you'll have to create an [optimizer object](https://keras.io/optimizers/#adam) to specify the learning rate
> - use `(LOOK_BACK, 1)` as the input shape for the first layer 
>
> During fitting of the model you should:
> 
> - use the test data for validation during training
> - use a batch size of 32
> - determine the right amount of epochs to train for
> - try to use the following callbacks:
>  - `TensorBoard` for checking the progress (write the logs to a directory inside `m2o_dir`)
>  - `ModelCheckpoint` saving the best model only to `model_path` (note: make sure you run the cell below, which empties and recreates `m2o_dir`, before fitting)
> 
> Train the many-to-one network a couple of times and consider the train and validation losses, what do you notice?
> What happens if you use Adam with its 'default' learning rate? What about a different optimizer?

In [None]:
import shutil

# Start from an empty output directory for the many-to-one model.
m2o_dir = os.path.join('../output', "m2o")
if os.path.exists(m2o_dir):
    shutil.rmtree(m2o_dir)
# makedirs also creates '../output' when it is missing (os.mkdir would raise).
os.makedirs(m2o_dir)

# File path of the saved many-to-one model; it is loaded from here further down.
model_path = os.path.join(m2o_dir, "many_to_one.h5")

In [None]:
from tensorflow.keras.models import Sequential

def make_m2o_model():
    """Build the many-to-one RNN (exercise stub).

    Returns
    -------
        The `Sequential` model; no layers are added until the exercise
        is filled in.
    """
    model = Sequential()
    # fill in
    return model

np.random.seed(7)


In [None]:
# %load ../answers/m2o.py


Load the best model and get predictions:

In [None]:
best_model = load_model(model_path)

# Undo the standardization before building the frames, then index both by the
# test dates.
truth = pd.DataFrame(
    data=scaler.inverse_transform(y_test),
    index=test_ix,
    columns=["truth"],
)
prediction = pd.DataFrame(
    data=scaler.inverse_transform(best_model.predict(X_test)),
    index=test_ix,
    columns=["m2o"],
)

In [None]:
from sklearn.metrics import mean_squared_error

def plot_forecast(truth, prediction):
    """Plot forecast and show MSE.

    Parameters
    ----------
    truth : dataframe or series
        Actual timeseries

    prediction : dataframe,
        Predicted timeseries, one column per forecast. Its index must be
        present in `truth`.

    Returns
    -------
        plot axis

    """
    _, ax = plt.subplots(1, 1, figsize=(8, 3))
    ax = truth.plot(ax=ax)
    ax = prediction.plot(ax=ax)

    legend = ["truth"]
    scores = []
    # Iterating over a dataframe yields its column names.
    for col in prediction:
        # Compare only on the dates for which a prediction exists.
        mse = mean_squared_error(truth.loc[prediction.index], prediction[col])
        scores.append(f"{col}: {mse:1.0f}")
        legend.append(f"prediction: {col}")

    # Separate the per-column scores so several forecasts stay readable.
    ax.set_title("Forecast - MSE - " + ", ".join(scores), fontsize=16)
    ax.legend(legend)
    return ax

In [None]:
plot_forecast(truth, prediction);

Assuming you got your MSE lower than 1000, you got a pretty good model!
We've got a model that is able to predict one timestep in advance.

## Model: Many to many

Instead of using a many-to-one network, we could use a many-to-many architecture to, for example, predict one and two steps in the future instead of only one step.

Reshape the data to achieve the many-to-many architecture:

In [None]:
print(X_train.shape)
print(y_train.shape)

In [None]:
# The last sample has no following month to use as second target, so drop it.
X_train_many = X_train[:-1]
X_test_many = X_test[:-1]

# Pair every target with its successor -> (samples, 2), then add a trailing
# axis -> (samples, 2, 1). np.stack does this without a Python-level zip loop.
y_train_many = np.stack([y_train[:-1, 0], y_train[1:, 0]], axis=1)[:, :, np.newaxis]
y_test_many = np.stack([y_test[:-1, 0], y_test[1:, 0]], axis=1)[:, :, np.newaxis]

print(X_train_many.shape)
print(y_train_many.shape)

Convince yourself that `X` and `y` are now in the right format.

> #### Exercise: Multi-step NNAR forecast of airline passengers in Keras
> 
> Use a similar architecture to make a many-to-many model. 
> 
> - Let the second RNN layer return sequences and use the `Lambda` layer to select the last two observations.
> - Use `TimeDistributed(Dense(1))` as the last layer. 
>    - What is the advantage of using `TimeDistributed(Dense(1))`? Is it different than `Dense(1)` in general? And in this case?
>    - Why are we using 1 unit? (Hint: look at `model.summary()`.)

In [None]:
from tensorflow.keras.layers import Lambda
# TimeDistributed must be imported as well: it is used right below.
from tensorflow.keras.layers import TimeDistributed

help(TimeDistributed)

In [None]:
import shutil

# Start from an empty output directory for the many-to-many model.
m2m_dir = os.path.join('../output', "m2m")
if os.path.exists(m2m_dir):
    shutil.rmtree(m2m_dir)
# makedirs also creates '../output' when it is missing (os.mkdir would raise).
os.makedirs(m2m_dir)

# Alias kept for backward compatibility: this cell used to store the
# many-to-many directory under the name `m2o_dir`, and later code (e.g. a
# loaded answer cell -- TODO confirm) may still refer to it.
m2o_dir = m2m_dir

# File path of the saved many-to-many model; it is loaded from here further down.
model_path = os.path.join(m2m_dir, "many_to_many.h5")

In [None]:
from tensorflow.keras.models import Sequential


def make_m2m_model():
    """Build the many-to-many RNN (exercise stub).

    Returns
    -------
        The `Sequential` model; no layers are added until the exercise
        is filled in.
    """
    model = Sequential()
    # fill in
    return model


np.random.seed(7)


In [None]:
%load ../answers/m2m.py


In [None]:
best_model = load_model(model_path)

# Convert truth and predictions to dataframe.
# reshape(-1, 2) flattens to two columns (one-step, two-step) without
# hard-coding the number of test samples, so changing TEST_SIZE keeps working.
truth = pd.DataFrame(
    scaler.inverse_transform(y_test_many.reshape(-1, 2)),
    index=test_ix[:-1],
    columns=["truth_one_step", "truth_two_step"],
)
prediction = pd.DataFrame(
    scaler.inverse_transform(best_model.predict(X_test_many).reshape(-1, 2)),
    index=test_ix[:-1],
    columns=["m2m_one_step", "m2m_two_step"],
)

# Plot the forecasts.
plot_forecast(truth["truth_one_step"], prediction[["m2m_one_step"]]);
plot_forecast(truth["truth_two_step"], prediction[["m2m_two_step"]]);

If all went well, both the one-step and the two-step predictions should follow the truth pretty well.
Note that the two-step plot should be moved one step ahead to align with the correct dates.
Using multiple tasks like this is called multitask learning and can improve performance.

## Conclusion

We've used recurrent neural networks to solve a typical time series problem.
Using neural nets allowed us to be flexible in the task that we've solved.

> #### Exercise
>
> - Try solving the two-steps prediction with a standard timeseries forecasting tool you're familiar with.