In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nbeats.contrib.nbeatsx.nbeatsx import Nbeats
import time
# Notebook-wide display settings.
plt.style.use('ggplot')  # draw every figure with the ggplot style sheet
pd.set_option('display.max_rows', 999)  # let pandas print up to 999 rows before truncating
np.set_printoptions(threshold=np.inf)  # print numpy arrays in full, never summarised

In [2]:
# Plot
def plot_prediction(y, y_hat, ax, title):
    """Draw a series and its forecast on one axes, right-aligning the forecast.

    Parameters
    ----------
    y: sequence
        Observed values, drawn at x positions 0 .. len(y) - 1.
    y_hat: sequence
        Predicted values, drawn at the last len(y_hat) x positions of ``y``.
    ax: object
        Target axes; only its ``plot`` and ``set_title`` methods are called.
    title: object
        Passed unchanged to ``ax.set_title``.
    """
    total = len(y)
    horizon = len(y_hat)

    # Both curves share one integer x-axis; the forecast occupies its tail.
    observed_x = np.array(range(total))
    forecast_x = np.array(range(total - horizon, total))

    curves = (
        (observed_x, y, 'y'),
        (forecast_x, y_hat, 'y_hat'),
    )
    for xs, values, name in curves:
        ax.plot(xs, values, label=name)
    ax.set_title(title)
    
def plot_grid(x, n_row, n_col, titles=None, title_plot='plot_grid', dir='./'):
    """Plot several (y, y_hat) pairs on an ``n_row`` x ``n_col`` grid and save it as a PNG.

    Parameters
    ----------
    x: sequence
        One entry per panel; ``x[i][0]`` is the observed series and ``x[i][1]``
        the forecast, both forwarded to ``plot_prediction``.
    n_row: int
        Number of subplot rows.
    n_col: int
        Number of subplot columns. Panels are filled row by row.
    titles: sequence, optional
        Title of each panel; when omitted the panel index is used.
    title_plot: object
        Base name of the output file (converted with ``str``).
    dir: str
        Prefix of the output path. It is concatenated as-is with the file
        name, so it must end with a path separator.

    The figure is written to ``dir + str(title_plot) + '.png'`` and is left
    open; nothing is returned.
    """
    n_graph = len(x)

    # squeeze=False keeps `axs` two-dimensional even for a single row or
    # column, so the axs[row, col] lookup below is always valid.
    fig, axs = plt.subplots(n_row, n_col, figsize=(5*n_col, 3*n_row), squeeze=False)

    # Rotate the x tick labels on every panel; plt.xticks() would only touch
    # the current (last created) axes.
    for panel in axs.ravel():
        panel.tick_params(axis='x', labelrotation=45)

    for i in range(n_graph):
        row, col = divmod(i, n_col)
        title = titles[i] if titles is not None else i
        plot_prediction(y=x[i][0], y_hat=x[i][1], ax=axs[row, col], title=title)

    fig_name = dir + str(title_plot) + '.png'
    # Save through the figure handle rather than pyplot's "current figure".
    fig.savefig(fig_name)

In [3]:
def ffill_missing_dates_particular_serie(serie, min_date, max_date, freq):
    """Put one series on a complete date grid and forward-fill its ``y`` values.

    Parameters
    ----------
    serie: DataFrame
        Rows of a single series with at least the columns ``unique_id``,
        ``ds`` and ``y``. Only the first distinct ``unique_id`` is used.
    min_date, max_date:
        Bounds of the date grid, forwarded to ``pd.date_range`` (inclusive).
    freq: str
        Pandas frequency string used to build the grid.

    Returns
    -------
    DataFrame
        One row per grid date with the columns ``ds``, ``key`` (constant 1),
        ``unique_id``, followed by the remaining columns of ``serie``.
        Dates without an observation take the previous known ``y``; dates
        before the first observation keep a missing ``y``.
    """
    date_range = pd.date_range(start=min_date, end=max_date, freq=freq)
    unique_id = serie['unique_id'].unique()
    df_balanced = pd.DataFrame({'ds': date_range, 'key': [1]*len(date_range), 'unique_id': unique_id[0]})

    # Left join keeps every grid date; dates absent from `serie` get NaN.
    df_balanced = df_balanced.merge(serie, how='left', on=['unique_id', 'ds'])

    # Series.ffill() replaces the deprecated fillna(method='ffill').
    df_balanced['y'] = df_balanced['y'].ffill()

    return df_balanced

def ffill_missing_dates_per_serie(df, freq, fixed_max_date=None):
    """Complete the date grid of every series and forward-fill its ``y`` values.

    Each ``unique_id`` is expanded from its own first date to its own last
    date (or to ``fixed_max_date`` when given); dates before the first
    appearance of a series are not created.

    Parameters
    ----------
    df: DataFrame
        Input DataFrame with the columns ``unique_id``, ``ds`` and ``y``.
    freq: str
        Pandas time frequency standard strings, like "W-THU" or "D" or "M"
    fixed_max_date: optional
        When not None, used as the last date of every series instead of the
        series' own maximum ``ds``.

    Returns
    -------
    DataFrame
        Columns ``unique_id``, ``ds``, ``y`` with a fresh integer index; the
        series are stacked in sorted ``unique_id`` order. An empty input
        yields an empty frame with those three columns.
    """
    df_list = []
    # groupby yields each series' rows once (sorted by unique_id), avoiding a
    # full-frame boolean filter per series.
    for _, df_id in df.groupby('unique_id'):
        min_date = df_id['ds'].min()
        max_date = df_id['ds'].max() if fixed_max_date is None else fixed_max_date
        df_list.append(ffill_missing_dates_particular_serie(df_id, min_date, max_date, freq))

    if not df_list:
        # pd.concat raises on an empty list; return an empty, well-shaped frame.
        return pd.DataFrame(columns=['unique_id', 'ds', 'y'])

    # Selecting the three output columns also discards the helper 'key' column.
    df_dates = pd.concat(df_list).reset_index(drop=True)[['unique_id', 'ds', 'y']]

    return df_dates

# Stock

In [4]:
########## TRAIN #############
data = pd.read_csv('data/stock/train.csv')
data['Date'] = pd.to_datetime(data['Date'])
data['unique_id'] = data['Company']
data = data.rename(columns={'Date':'ds', 'Close':'y'})

# Series must be complete in the frequency: fill the daily gaps of every
# company, then keep a single row per (unique_id, ds).
data = ffill_missing_dates_per_serie(data,'D')
data = data.drop_duplicates(['unique_id','ds'])

# assign() builds a new frame, so the constant column 'x' is not written into
# a slice of `data` (which triggers SettingWithCopyWarning).
X_train = data[['unique_id','ds']].assign(x='1')
y_train = data[['unique_id','ds','y']]

########## TEST #############
data_test = pd.read_csv('data/stock/test.csv')
data_test['ds'] = pd.to_datetime(data_test['Date'])
data_test['unique_id'] = data_test['Company']
# rename() returns a new frame instead of relabelling a slice of `data_test`.
X_test = data_test[['unique_id','ds','Close']].rename(columns={'Close': 'y'})

In [40]:
# Hyper-parameters of the model, gathered in one mapping so they can be
# inspected or tweaked before the model object is built.
nbeats_params = {
    'input_size_multiplier': 3,
    'window_sampling_limit_multiplier': 200,
    'shared_weights': True,
    'output_size': 34,
    'stack_types': ['trend', 'seasonality'],
    'n_blocks': [3, 3],
    'n_layers': [4, 4],
    'n_hidden': [256, 2048],
    'n_harmonics': 1,
    'n_polynomials': 2,
    'n_iterations': 30,
    'learning_rate': 0.001,
    'lr_decay': 0.5,
    'n_lr_decay_steps': 3,
    'batch_size': 1024,
    'loss': 'MAPE',
    'seasonality': 7,
    'random_seed': 1,
    'device': 'cpu',
}
nbeats = Nbeats(**nbeats_params)

In [41]:
# Time only the training call. perf_counter() is monotonic, so the measured
# duration cannot be skewed by system clock adjustments as with time.time().
start = time.perf_counter()
nbeats.fit(y_df=y_train, verbose=True, display_steps=10)
end = time.perf_counter()

# The prediction result is indexed by 'y' and 'y_hat' below to compute the
# mean absolute error and to count missing forecasts.
y_hat = nbeats.predict(X_test)
print('Fitting time:', end-start)
print('MAE:', np.abs(y_hat['y_hat']-y_hat['y']).mean())
print('NULLS:', y_hat['y_hat'].isnull().sum())

Infered frequency: D
Processing data ...
Creating dataloader ...
Step: 0, Time: 1.397, MAPE Loss: 0.186555699,
Step: 10, Time: 16.704, MAPE Loss: 0.095771044,
Step: 20, Time: 33.947, MAPE Loss: 0.072645113,
Fitting time: 51.404293060302734
MAE: 1.283045578789399
NULLS: 0
