# Functions

This section defines all functions used for preparing the data and for training and evaluating the algorithms. <br>
The algorithms used are: <br>
1. Persistence Model / Naïve Forecast
2. Linear Regression
3. Support Vector Regressor (SVR)
4. Random Forest Regressor
5. Long Short-Term Memory (LSTM), a recurrent neural network (RNN) architecture

#### Function for preprocessing

In [1]:
def yield_agb_arranger(data, name, start="2001-01-01", end="2019-01-01"):
    """
    Flatten a wide table into a single value column indexed by consecutive days.

    Arguments: 
        1. data: Data set that shall be rearranged. For a DataFrame with a flat
           index, `unstack()` places the columns below each other (first column
           first); missing values are dropped afterwards.
        2. name: Name of the column to be rearranged (label of the output column)
        3. start: First day of the generated daily index (inclusive).
           Defaults to 2001-01-01.
        4. end: End of the generated daily index (exclusive). Defaults to
           2019-01-01, i.e. the last generated day is 2018-12-31.
    Returns: Arranged dataset with one column called `name` and the index 'Date'

    Note: values and dates are matched purely by position. If the number of
    non-missing values differs from the number of generated days, the shorter
    side is padded with NaN / NaT instead of raising an error.
    """
    column = '%s' % (name)
    # Rearrange data as 1 column. The unstacked series is used directly; there
    # is no need to align it against `data` first, and dropping NaNs on a fresh
    # object avoids in-place modification of a column slice.
    values = (data.unstack()
                  .rename(column)
                  .dropna()
                  .reset_index(drop=True)
                  .to_frame())
    # Daily dates from `start` (inclusive) to `end` (exclusive)
    first_day = pd.to_datetime(start)
    n_days = max((pd.to_datetime(end) - first_day).days, 0)
    dates = pd.Series(pd.date_range(start=first_day, periods=n_days, freq="D"),
                      name='Date')
    # Concat dates with data (by position) and set date as index
    arranged = pd.concat([values, dates], axis=1)
    return arranged.set_index('Date')

#### Function for all algorithms

In [1]:
# Import dataset and set date as index
def read_dataframe(file):
    """
    Load a CSV file into a Pandas Dataframe that is indexed by date.

    Arguments: 
        1. file: Path (or file-like object) of a CSV that contains a 'Date' column
    Returns: Pandas Dataframe with the parsed 'Date' column as index, sorted by date
    """
    raw = pd.read_csv(file)
    # Parse the textual dates so that the index becomes a proper datetime index
    parsed = raw.assign(Date=pd.to_datetime(raw.Date))
    # Use the dates as index and bring the rows into chronological order
    return parsed.set_index('Date').sort_index()

In [2]:
# Takes multivariate time series and frames it as a supervised learning dataset.
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a (multivariate) time series as a supervised learning dataset.

    Every output row holds the n_in previous observations followed by the
    current and the next n_out-1 observations of all variables.

    Arguments: 
        1. data: Sequence of observations as a list, a list of rows, a 1D/2D Numpy
           array or a Pandas Dataframe (anything pd.DataFrame accepts).
        2. n_in: Number of lag observations as input(X).
        3. n_out: Number of observations as output(y).
        4. dropnan: Boolean whether or not to drop rows with NaN values. Defaults to True.
    Returns: Pandas Dataframe of series framed for supervised learning. Columns are
        named 'var<j>(t-<i>)' for the lags and 'var<j>(t)', 'var<j>(t+<i>)' for the
        outputs.
    """
    df = pd.DataFrame(data)
    # Take the number of variables from the constructed frame so that plain
    # lists, lists of rows and 1D arrays are all handled consistently.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names += ['var%d(t-%d)' % (j + 1, lag) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for step in range(n_out):
        cols.append(df.shift(-step))
        if step == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, step) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # shifting introduces NaN at both ends; drop those incomplete rows
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [1]:
# Transform series into train and test sets for supervised learning and difference the data
def prepare_data(df, n_test, n_features, n_lag, n_seq):
    """
    Difference, reframe, scale and split a dataframe for supervised learning.

    Arguments: 
        1. df: dataframe for data preparation. NOTE(review): the column
           arithmetic below only lines up if df has n_features + 1 columns with
           the target (price) in the first column - confirm against the caller.
        2. n_test: size of test set as an absolute number
        3. n_features: number of different input features
        4. n_lag: number of lag days that are used for forecasting (=lag_days)
        5. n_seq: number of forecasting days (=forecast_period)
    Returns: 
        1. scaler: the fitted MinMaxScaler (range 0 to 1)
        2. train: train dataset (scaled and differenced)
        3. test: test dataset (scaled and differenced), the last n_test rows
        4. reframed: the reframed Pandas dataframe for debugging (differenced,
           not scaled)
    """
    # transform data to be stationary by differencing
    df_diff = df.diff().dropna().values
    # transform into supervised learning problem X, y
    reframed = series_to_supervised(df_diff, n_lag, n_seq)
    # Dropping the forecasting columns we don't want to predict: of every
    # forecast step only the column in front of the n_features dropped ones stays
    reframed.drop(reframed.columns[-n_features:], axis=1, inplace=True)
    for i in range(1, n_seq):
        reframed.drop(reframed.columns[-n_features-i:-i], axis=1, inplace=True)
    # Dropping past values of price: the first column of every lag step
    reframed.drop(reframed.columns[:1], axis=1, inplace=True)
    # Use the n_lag argument here (not a notebook-level global) so that the
    # function works for whatever lag the caller passes in
    for i in range(1, n_lag):
        reframed.drop(reframed.columns[n_features*i:n_features*i+1], axis=1, inplace=True)
    # Get values for scaling
    reframed_values = reframed.values
    # rescale values to 0, 1
    # NOTE(review): the scaler is fitted on train and test rows together
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_values = scaler.fit_transform(reframed_values)
    # split into train and test sets
    train, test = scaled_values[:-n_test], scaled_values[-n_test:]
    return scaler, train, test, reframed

In [4]:
# Fit an LSTM network to training data
def fit_lstm(train, n_lag, n_seq, n_batch, n_epochs, n_neurons, n_features, learning_rate):
    """
    Build and fit a two-layer LSTM network on the training data.

    Arguments:
        1. train: train dataset (scaled and differenced); the last n_seq columns
           are the targets, all columns before are the inputs
        2. n_lag: number of lag days that are used for forecasting (=lag_days)
        3. n_seq: number of forecasting days (=forecast_period)
        4. n_batch: batch size used for model fitting
        5. n_epochs: number of epochs used in model fitting
        6. n_neurons: number of LSTM units in each of the two LSTM layers
        7. n_features: number of independent variables used in the model
        8. learning_rate: learning rate of the Adam optimizer
    Returns: 
        1. model: the fitted model
        2. model_fit: return value of model.fit (in Keras the training history)
    """
    # reshape training into [samples, timesteps, features]
    X_train, y_train = train[:, :-n_seq], train[:, -n_seq:]
    X_train = X_train.reshape(X_train.shape[0], n_lag, n_features)
    # design network: two stacked LSTM layers and a dense output with one
    # unit per forecast day
    model = Sequential()
    model.add(LSTM(units=n_neurons, input_shape=(X_train.shape[1], X_train.shape[2]),
                   return_sequences=True))
    model.add(LSTM(units=n_neurons, return_sequences=False))
    model.add(Dense(n_seq))
    # NOTE(review): newer Keras releases name this argument `learning_rate`
    # instead of `lr` - adjust when upgrading the library
    adam = optimizers.Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, amsgrad=False)
    model.compile(loss='mean_squared_error', optimizer=adam)
    # shuffle=False keeps the samples in their given (chronological) order
    model_fit = model.fit(X_train, y_train, epochs=n_epochs, batch_size=n_batch, verbose=1,
                          shuffle=False)
    return model, model_fit

In [5]:
# Make forecasts with the trained model
def make_forecasts(model, n_batch, train, test, n_lag, n_seq, n_features):
    """
    Predict every row of the test data, one row at a time.

    Arguments: 
        1. model: fitted model offering predict(X, batch_size=...)
        2. n_batch: batch size handed to model.predict
        3. train: train dataset (scaled and differenced); not used here
        4. test: test dataset (scaled and differenced)
        5. n_lag: number of lag days that are used for forecasting (=lag_days)
        6. n_seq: number of forecasting days (=forecast_period)
        7. n_features: number of independent variables used in the model
    Returns: List of forecasts for whole test data (one list per test row)
    """
    n_inputs = n_features * n_lag
    collected = []
    for row in test:
        # the leading n_inputs values are the model input,
        # shaped as [1, lag_timesteps, features]
        window = row[:n_inputs].reshape(1, n_lag, n_features)
        prediction = model.predict(window, batch_size=n_batch)
        # keep the single predicted sequence as a plain list
        collected.append(list(prediction[0, :]))
    return collected

In [6]:
# Invert differenced forecast
def inverse_difference(last_ob, forecast):
    """
    Turn a series of forecasted differences back into absolute values.

    Arguments: 
        1. last_ob: Last observation (unscaled and undifferenced); only its first
           element is used as the starting level
        2. forecast: differenced forecasts of one series (unscaled)
    Returns: undifferenced list of one forecast series
    """
    # the first value builds on the last known observation
    level = forecast[0] + last_ob[0]
    restored = [level]
    # every further value builds on the previously restored one
    for delta in forecast[1:]:
        level = delta + level
        restored.append(level)
    return restored

In [7]:
# inverse data transform on forecasts
def inverse_transform(df, forecasts, train, test, scaler, n_test, n_seq, 
                      n_lag, n_features):
    """
    Undo the scaling and the differencing of the forecasts.

    Arguments: 
        1. df: Pandas dataframe (unscaled and undifferenced); the first column of
           its rows supplies the last observation for undoing the differencing
        2. forecasts: List of forecasts for whole test data (scaled and differenced)
        3. train: train dataset (scaled and differenced)
        4. test: test dataset (scaled and differenced)
        5. scaler: the scaler used (e.g. MinMaxScaler)
        6. n_test: size of test set as an absolute number
        7. n_seq: number of forecasting days (=forecast_period)
        8. n_lag: number of lag days (not used here)
        9. n_features: number of independent variables (not used here)
    Returns: 
        1. unscaled and undifferenced forecast series (one list per test row)
        2. unscaled but still differenced forecast series
    """
    # X part of the test set; its y part is replaced by the forecasts below
    X_test = test[:, :-n_seq]
    # create array from forecasts and reshape [number of forecasts, forecast_period]
    yhat = np.array(forecasts)
    yhat = yhat.reshape(len(yhat), n_seq)
    # concat X_test with forecasts and train with test (incl. forecasts) so that
    # the array has the column layout the scaler expects
    test_re = np.concatenate((X_test, yhat), axis=1)
    reframed_yhat = np.concatenate((train, test_re))
    # inverse scale reframed dataset with forecasts
    inv_scale_df = scaler.inverse_transform(reframed_yhat)
    # select only forecasts --> now unscaled, but still differenced
    inv_scale_test = inv_scale_df[-n_test:]
    # use the n_seq argument (not a notebook-level global) to cut out the
    # forecast columns
    forecasts = [row[-n_seq:] for row in inv_scale_test]
    # create list for storing undifferenced values
    inverted = list()
    for i in range(len(forecasts)):
        # create array from forecast
        inv_scale = np.array(forecasts[i])
        # Row of df that precedes the first forecasted day of test row i.
        # NOTE(review): assumes the test rows are the last n_test rows of the
        # differenced and reframed version of df - confirm against the caller.
        index = len(df) - (n_test+(n_seq-1)) + i - 1
        last_ob = df.values[index]
        # invert differencing
        inv_diff = inverse_difference(last_ob, inv_scale)
        # store
        inverted.append(inv_diff)
    return inverted, forecasts

In [2]:
def evaluate_forecasts(actual, forecasts, n_lag, n_seq, n_features):
    """
    Score the forecasts separately for every forecasting day.

    Arguments: 
        1. actual: y test values (unscaled and undifferenced)
        2. forecasts: forecasted y values (unscaled and undifferenced)
        3. n_lag: number of lag days that are used for forecasting (=lag_days);
           not used here
        4. n_seq: number of forecasting days (=forecast_period)
        5. n_features: Number of exogenous features used (enters the adjusted R^2)
    Returns: 
        1. List of RMSE scores, one per forecasting day
        2. List of adj. R^2 scores, one per forecasting day
        3. List of MAE scores, one per forecasting day
    """
    n_samples = len(forecasts)
    rmse_pred, R2_pred, mae_pred = [], [], []
    for step in range(n_seq):
        # values of all samples for this forecasting day
        observed = [row[step] for row in actual]
        predicted = [row[step] for row in forecasts]
        rmse_pred.append(sqrt(mean_squared_error(observed, predicted)))
        # adjust R^2 for the number of features
        plain_r2 = r2_score(observed, predicted)
        adjusted_r2 = 1 - (1 - plain_r2) * (n_samples - 1) / (n_samples - n_features - 1)
        R2_pred.append(adjusted_r2)
        mae_pred.append(mean_absolute_error(observed, predicted))
    return rmse_pred, R2_pred, mae_pred

In [2]:
def evaluation_RMSE(df,actual, forecasts, train, test, scaler, n_test, n_lag, n_seq, n_features):
    """
    Invert forecasts and actual values, then score the forecasts.

    Arguments: 
        1. df: Pandas dataframe for debugging (unscaled and undifferenced)
        2. actual: y test values (scaled and differenced)
        3. forecasts: forecasted y values (scaled and differenced)
        4. train: train dataset (scaled and differenced)
        5. test: test dataset (scaled and differenced)
        6. scaler: the scaler used (e.g. MinMaxScaler)
        7. n_test: size of test set as an absolute number
        8. n_lag: number of lag days that are used for forecasting (=lag_days)
        9. n_seq: number of forecasting days (=forecast_period)
        10. n_features: Number of exogenous features used
    Returns: 
        1. RMSE score
        2. Adj. R^2 score
        3. MAE score
        4. forecasts (unscaled and undifferenced)
        5. actual (unscaled and undifferenced)
    """
    # both inversions share everything except the series that is inverted
    shared = dict(df=df, train=train, test=test, scaler=scaler, n_test=n_test,
                  n_lag=n_lag, n_seq=n_seq, n_features=n_features)
    forecasts_inv, _ = inverse_transform(forecasts=forecasts, **shared)
    actual_inv, _ = inverse_transform(forecasts=actual, **shared)
    rmse_pred, R2_pred, mae_pred = evaluate_forecasts(
        actual=actual_inv, forecasts=forecasts_inv,
        n_lag=n_lag, n_seq=n_seq, n_features=n_features)
    return rmse_pred, R2_pred, mae_pred, forecasts_inv, actual_inv