# Time Series Forecasting with PyTorch Transformer network

This notebook is intended to be run on your own machine.
You will need an NVIDIA GPU or a TPU to run the training part of this project.

This notebook is based on the pipeline scripts (pipeline.sh and pipeline.ps1).

## generate_time_serie.py

In [None]:
import random

import pandas as pd
from tqdm import tqdm
import numpy as np
from uuid import uuid4

periods = [7, 14, 28, 30]


def get_init_df():
    """
    Build the skeleton frame for one synthetic series.

    The frame has one row per day from 2015-01-01 to 2020-01-01 (inclusive)
    and three columns: "timestamp", a running "index" starting at 0, and an
    "article" id (one random uuid4 hex string shared by every row).
    :return: the freshly created dataframe
    """
    days = pd.date_range(start="2015-01-01", end="2020-01-01", freq="D")

    frame = pd.DataFrame({"timestamp": days})
    frame["index"] = range(len(frame))
    # A scalar assignment broadcasts the same id to every row.
    frame["article"] = uuid4().hex

    return frame


def set_amplitude(dataframe):
    """
    Add an "amplitude" column: a randomly parameterised sawtooth ramp.

    The ramp is computed from the "index" column and is reversed in time
    with probability 0.5. The frame is modified in place.
    :param dataframe: frame with an integer "index" column
    :return: the same dataframe, with the "amplitude" column set
    """
    # The order of the random draws is kept fixed so seeded runs reproduce.
    ramp_length = random.randint(90, 365)
    peak = random.uniform(0.1, 1)
    shift = random.uniform(-1, 1)
    phase = random.randint(-1000, 1000)

    def _ramp(step):
        return peak * (step % ramp_length + phase) / ramp_length + shift

    ramp = dataframe["index"].apply(_ramp).to_numpy()

    flip = random.random() < 0.5
    dataframe["amplitude"] = ramp[::-1] if flip else ramp

    return dataframe


def set_offset(dataframe):
    """
    Add an "offset" column: a randomly parameterised cosine baseline.

    The cosine period is drawn between 15 and 45 steps; the curve is
    reversed in time with probability 0.5. The frame is modified in place.
    :param dataframe: frame with an integer "index" column
    :return: the same dataframe, with the "offset" column set
    """
    # The order of the random draws is kept fixed so seeded runs reproduce.
    cycle = random.randint(15, 45)
    swing = random.uniform(-1, 1)
    baseline = random.uniform(-1, 1)
    phase = random.randint(-1000, 1000)

    def _wave(step):
        return swing * np.cos(step * 2 * np.pi / cycle + phase) + baseline

    wave = dataframe["index"].apply(_wave).to_numpy()

    flip = random.random() < 0.5
    dataframe["offset"] = wave[::-1] if flip else wave

    return dataframe


def generate_time_series(dataframe):
    """
    Add the "views" column: a clipped cosine scaled by "amplitude", shifted
    by "offset", plus Gaussian noise.

    The cosine period is picked from the module-level ``periods`` list. The
    noise standard deviation is one tenth of the largest absolute amplitude.
    The frame is modified in place.
    :param dataframe: frame with "index", "amplitude" and "offset" columns
    :return: the same dataframe, with the "views" column set
    """
    # The order of the random draws is kept fixed so seeded runs reproduce.
    limit = random.uniform(0.3, 1)
    cycle = random.choice(periods)
    phase = random.randint(-1000, 1000)

    def _clean_value(row):
        wave = np.cos(row["index"] * 2 * np.pi / cycle + phase)
        return np.clip(wave, -limit, limit) * row["amplitude"] + row["offset"]

    clean = dataframe.apply(_clean_value, axis=1)

    noise_std = dataframe["amplitude"].abs().max() / 10
    noise = np.random.normal(0, noise_std, size=(dataframe.shape[0],))

    dataframe["views"] = clean + noise

    return dataframe


def generate_df():
    """
    Generate one complete synthetic series.

    Chains the builders in order: skeleton frame, amplitude, offset, views.
    :return: dataframe produced by generate_time_series
    """
    return generate_time_series(set_offset(set_amplitude(get_init_df())))

### Load data

In [None]:
# Read the comma-separated file 'Room-climate.csv' into `dataframe`.
dataframe = pd.read_csv('Room-climate.csv', sep=',')

In [None]:
# Preview the first rows of the loaded frame.
dataframe.head()

Run the main routine of generate_time_serie.py.

In [None]:
import matplotlib.pyplot as plt

# Generate 200 independent synthetic series, stack them into one frame and
# store the result as data/data.csv.
import os

dataframes = []

for _ in tqdm(range(200)):
    df = generate_df()

    # Optional visual check of the last 120 steps of a series:
    # fig = plt.figure()
    # plt.plot(df[-120:]["index"], df[-120:]["views"])
    # plt.show()

    dataframes.append(df)

all_data = pd.concat(dataframes, ignore_index=True)

# to_csv does not create missing directories, so make sure "data" exists.
os.makedirs("data", exist_ok=True)

all_data.to_csv("data/data.csv", index=False)

## data_utils.py

In [None]:
import json
from typing import List
import pandas as pd
from pathlib import Path
import numpy as np


def add_date_cols(dataframe: pd.DataFrame, date_col: str = "timestamp"):
    """
    Add scaled calendar features derived from a date column.

    The date column is parsed in place (format "%Y-%m-%d"); each feature is
    divided by a fixed constant (31, 365, 12, 53, and (year - 2015) / 5).
    :param dataframe: frame to extend (modified in place)
    :param date_col: name of the column holding the dates
    :return: (dataframe, list of the added feature column names)
    """
    dataframe[date_col] = pd.to_datetime(dataframe[date_col], format="%Y-%m-%d")
    stamps = dataframe[date_col].dt

    dataframe["day_of_month"] = stamps.day / 31
    dataframe["day_of_year"] = stamps.dayofyear / 365
    dataframe["month"] = stamps.month / 12
    dataframe["week_of_year"] = stamps.isocalendar().week / 53
    dataframe["year"] = (stamps.year - 2015) / 5

    feature_names = ["day_of_month", "day_of_year", "month", "week_of_year", "year"]
    return dataframe, feature_names


def add_basic_lag_features(
    dataframe: pd.DataFrame,
    group_by_cols: List,
    col_names: List,
    horizons: List,
    fill_na=True,
):
    """
    Add per-group lagged copies of the given columns.

    For every horizon h and every column c, a column "<c>_lag_<h>" is added
    holding c shifted by h rows within its group.
    :param dataframe: frame to extend (modified in place)
    :param group_by_cols: columns that define the groups
    :param col_names: columns to lag
    :param horizons: shift sizes, in rows
    :param fill_na: replace the missing values created by the shift with 0
    :return: (dataframe, list of the added lag column names)
    """
    grouped = dataframe.groupby(by=group_by_cols)

    new_cols = []

    for horizon in horizons:
        lag_names = [f"{name}_lag_{horizon}" for name in col_names]
        shifted = grouped[col_names].shift(periods=horizon)
        dataframe[lag_names] = shifted
        new_cols.extend(lag_names)

    if fill_na:
        dataframe[new_cols] = dataframe[new_cols].fillna(0)

    return dataframe, new_cols


def process_df(dataframe: pd.DataFrame, target_col: str = "views"):
    """
    Add the calendar features and the one-step lag of the target.

    Dates are read from the "timestamp" column and lags are computed per
    "article" group.
    :param dataframe: frame to extend
    :param target_col: column to lag by one row
    :return: (dataframe, names of the calendar feature columns); the lag
        column name is not part of the returned list
    """
    dataframe, date_features = add_date_cols(dataframe, date_col="timestamp")

    dataframe, _ = add_basic_lag_features(
        dataframe,
        group_by_cols=["article"],
        col_names=[target_col],
        horizons=[1],
    )

    return dataframe, date_features

Run the main routine of data_utils.py.

In [None]:
# Load the generated series, add the features and persist both the processed
# table and a JSON config describing its columns.
data = pd.read_csv("data/data.csv")
out_path = "data/processed_data.csv"
config_path = "data/config.json"

# `cols` holds the calendar feature names returned by process_df.
data, cols = process_df(data)

data.to_csv(out_path, index=False)

# Column roles stored for later steps; "views_lag_1" is the lag column that
# process_df creates for the default target "views" with horizon 1.
config = {
    "features": cols,
    "target": "views",
    "group_by_key": "article",
    "lag_features": ["views_lag_1"],
}
with open(config_path, "w") as f:
    json.dump(config, f, indent=4)

## time_series_forecasting/training.py

In [None]:
import json
import random

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

from time_series_forecasting.model import TimeSeriesForcasting


def split_df(
    df: pd.DataFrame, split: str, history_size: int = 120, horizon_size: int = 30
):
    """
    Cut one (history, targets) sample out of a single series.

    For "train" the end of the target window is drawn at random so that the
    last horizon_size rows are never used as training targets. For "val" and
    "test" the targets are the last horizon_size rows.

    :param df: rows of one series, in time order
    :param split: "train", "val" or "test"
    :param history_size: maximum number of rows in the history window
    :param horizon_size: number of rows in the target window
    :return: (history, targets); history may be shorter than history_size
        when the series does not have enough earlier rows
    :raises ValueError: if split is not one of the supported names (also
        raised by random.randint for "train" when the series is too short)
    """
    n_rows = df.shape[0]

    if split == "train":
        end_index = random.randint(horizon_size + 1, n_rows - horizon_size)
    elif split in ("val", "test"):
        end_index = n_rows
    else:
        raise ValueError(
            f"Unknown split {split!r}; expected 'train', 'val' or 'test'"
        )

    label_index = end_index - horizon_size
    start_index = max(0, label_index - history_size)

    # .iloc makes the positional slicing explicit, whatever the index labels.
    history = df.iloc[start_index:label_index]
    targets = df.iloc[label_index:end_index]

    return history, targets


def pad_arr(arr: np.ndarray, expected_size: int = 120):
    """
    Pad the top of a 2-D array when there is not enough history.

    The first row is repeated ("edge" padding) until the array has
    expected_size rows. Arrays that already have at least expected_size rows
    are returned with their content unchanged.
    :param arr: 2-D array, one row per time step
    :param expected_size: minimum number of rows of the result
    :return: padded array
    """
    # Clamp at 0: np.pad rejects negative pad widths, which the raw
    # difference would produce for arrays longer than expected_size.
    missing_rows = max(0, expected_size - arr.shape[0])
    arr = np.pad(arr, [(missing_rows, 0), (0, 0)], mode="edge")
    return arr


def df_to_np(df):
    """
    Convert a frame to a numpy array and pad it with pad_arr (default size).
    :param df: dataframe (or array-like) to convert
    :return: padded numpy array
    """
    return pad_arr(np.array(df))


class Dataset(torch.utils.data.Dataset):
    """
    Torch dataset yielding one (src, trg_in, trg_out) sample per group.

    Each item is cut from the group's rows with split_df using the
    configured split mode.
    """

    def __init__(self, groups, grp_by, split, features, target):
        """
        :param groups: group keys; item i uses groups[i]
        :param grp_by: groupby object providing get_group(key)
        :param split: split mode forwarded to split_df
        :param features: list of feature column names
        :param target: name of the target column
        """
        self.groups = groups
        self.grp_by = grp_by
        self.split = split
        self.features = features
        self.target = target

    def __len__(self):
        """Number of groups, i.e. number of samples per epoch."""
        return len(self.groups)

    def __getitem__(self, idx):
        """
        :param idx: position of the group in self.groups
        :return: (src, trg_in, trg_out) float tensors; src holds the padded
            history (features + target), trg_in the horizon rows
            (features + "<target>_lag_1"), trg_out the horizon target values
        """
        frame = self.grp_by.get_group(self.groups[idx])
        history, horizon = split_df(frame, split=self.split)

        encoder_cols = self.features + [self.target]
        decoder_cols = self.features + [f"{self.target}_lag_1"]

        encoder_input = df_to_np(history[encoder_cols])
        decoder_input = np.array(horizon[decoder_cols])
        expected_output = np.array(horizon[self.target])

        return (
            torch.tensor(encoder_input, dtype=torch.float),
            torch.tensor(decoder_input, dtype=torch.float),
            torch.tensor(expected_output, dtype=torch.float),
        )


def train(
    data_csv_path: str,
    feature_target_names_path: str,
    output_json_path: str,
    log_dir: str = "ts_logs",
    model_dir: str = "ts_models",
    batch_size: int = 32,
    epochs: int = 10,
    horizon_size: int = 30,
):
    """
    Train a TimeSeriesForcasting model and report its validation loss.

    :param data_csv_path: CSV file with the processed series
    :param feature_target_names_path: JSON file with the keys "features",
        "target" and "group_by_key"
    :param output_json_path: where to write the result JSON (skipped if None)
    :param log_dir: directory for the TensorBoard logs
    :param model_dir: directory for the model checkpoint
    :param batch_size: batch size of both data loaders
    :param epochs: maximum number of training epochs
    :param horizon_size: forecast horizon used to filter out short groups
    :return: dict with "val_loss" and "best_model_path"
    """
    data = pd.read_csv(data_csv_path)

    with open(feature_target_names_path) as f:
        feature_target_names = json.load(f)

    # Keep only rows whose target value is present.
    data_train = data[~data[feature_target_names["target"]].isna()]

    grp_by_train = data_train.groupby(by=feature_target_names["group_by_key"])

    groups = list(grp_by_train.groups)

    # Only groups with more than two horizons of rows are used.
    # NOTE(review): horizon_size only affects this filter; Dataset.__getitem__
    # calls split_df with its default history/horizon sizes.
    full_groups = [
        grp for grp in groups if grp_by_train.get_group(grp).shape[0] > 2 * horizon_size
    ]

    # Train and validation datasets cover the same groups; they differ only
    # in the split mode passed on to split_df.
    train_data = Dataset(
        groups=full_groups,
        grp_by=grp_by_train,
        split="train",
        features=feature_target_names["features"],
        target=feature_target_names["target"],
    )
    val_data = Dataset(
        groups=full_groups,
        grp_by=grp_by_train,
        split="val",
        features=feature_target_names["features"],
        target=feature_target_names["target"],
    )

    print("len(train_data)", len(train_data))
    print("len(val_data)", len(val_data))

    train_loader = DataLoader(
        train_data,
        batch_size=batch_size,
        num_workers=10,
        shuffle=True,
    )
    val_loader = DataLoader(
        val_data,
        batch_size=batch_size,
        num_workers=10,
        shuffle=False,
    )

    # "+ 1": the encoder input is features + target, the decoder input is
    # features + lagged target (see Dataset.__getitem__).
    model = TimeSeriesForcasting(
        n_encoder_inputs=len(feature_target_names["features"]) + 1,
        n_decoder_inputs=len(feature_target_names["features"]) + 1,
        lr=1e-5,
        dropout=0.1,
    )

    logger = TensorBoardLogger(
        save_dir=log_dir,
    )

    # Checkpoint selection minimises the metric named "valid_loss"
    # (presumably logged by the model during validation - TODO confirm).
    checkpoint_callback = ModelCheckpoint(
        monitor="valid_loss",
        mode="min",
        dirpath=model_dir,
        filename="ts",
    )

    trainer = pl.Trainer(
        max_epochs=epochs,
        gpus=1,
        logger=logger,
        callbacks=[checkpoint_callback],
    )
    trainer.fit(model, train_loader, val_loader)

    # The validation loader is reused for the test run; its "test_loss" is
    # reported as "val_loss" below.
    result_val = trainer.test(test_dataloaders=val_loader)

    output_json = {
        "val_loss": result_val[0]["test_loss"],
        "best_model_path": checkpoint_callback.best_model_path,
    }

    if output_json_path is not None:
        with open(output_json_path, "w") as f:
            json.dump(output_json, f, indent=4)

    return output_json

Run the main routine of training.py.

In [None]:
# Paths for the training run: processed data and config as inputs; logs,
# checkpoint directory and result JSON as outputs.
data_csv_path_p = "data/processed_data.csv"
feature_target_names_path_p = "data/config.json"
log_dir_p = "models/ts_views_logs"
model_dir_p = "models/ts_views_models"
output_json_path_p = "models/trained_config.json"

# batch_size, epochs and horizon_size keep the defaults of train().
train(
    data_csv_path=data_csv_path_p,
    feature_target_names_path=feature_target_names_path_p,
    output_json_path=output_json_path_p,
    log_dir=log_dir_p,
    model_dir=model_dir_p
)

## Evaluation

In [None]:
import json
from typing import Optional
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm

from time_series_forecasting.model import TimeSeriesForcasting
from time_series_forecasting.training import split_df, Dataset


def smape(true, pred):
    """
    Symmetric mean absolute percentage error, in percent.

    A small constant (1e-8) is added to the denominator so that pairs where
    both values are 0 do not divide by zero.
    :param true: ground-truth values (array-like)
    :param pred: predicted values (array-like)
    :return: SMAPE value
    """
    actual = np.array(true)
    forecast = np.array(pred)

    abs_error = np.abs(actual - forecast)
    scale = np.abs(forecast) + np.abs(actual) + 1e-8

    return 100 / forecast.size * np.sum(2 * abs_error / scale)


def evaluate_regression(true, pred):
    """
    Compute the regression metrics used in the evaluation.
    :param true: ground-truth values
    :param pred: predicted values
    :return: dict with the keys "smape" and "mae"
    """
    metrics = {}
    metrics["smape"] = smape(true, pred)
    metrics["mae"] = mean_absolute_error(true, pred)
    return metrics


def evaluate(
    data_csv_path: str,
    feature_target_names_path: str,
    trained_json_path: str,
    eval_json_path: str,
    horizon_size: int = 30,
    data_for_visualization_path: Optional[str] = None,
):
    """
    Evaluates the model on the final forecast window of each series (the
    targets returned by split_df(..., split="val")), for at most the first
    100 groups.
    Compares the model to a simple baseline: predicting the last known value.
    :param data_csv_path: CSV file with the processed series
    :param feature_target_names_path: JSON file with the keys "features",
        "target" and "group_by_key"
    :param trained_json_path: JSON file holding "best_model_path"
    :param eval_json_path: where to write the metrics (skipped if None)
    :param horizon_size: number of autoregressive decoding steps; also the
        minimum group length (exclusive) for a group to be evaluated
    :param data_for_visualization_path: where to write history / ground
        truth / prediction per series (skipped if None)
    :return: dict with Baseline_MAE, Baseline_SMAPE, Model_MAE, Model_SMAPE
    """
    data = pd.read_csv(data_csv_path)

    with open(trained_json_path) as f:
        model_json = json.load(f)

    model_path = model_json["best_model_path"]

    with open(feature_target_names_path) as f:
        feature_target_names = json.load(f)

    target = feature_target_names["target"]
    features = feature_target_names["features"]

    # Keep only rows whose target value is present.
    data_train = data[~data[target].isna()]

    grp_by_train = data_train.groupby(by=feature_target_names["group_by_key"])

    groups = list(grp_by_train.groups)

    full_groups = [
        grp for grp in groups if grp_by_train.get_group(grp).shape[0] > horizon_size
    ]

    val_data = Dataset(
        groups=full_groups,
        grp_by=grp_by_train,
        split="val",
        features=features,
        target=target,
    )

    model = TimeSeriesForcasting(
        n_encoder_inputs=len(features) + 1,
        n_decoder_inputs=len(features) + 1,
        lr=1e-4,
        dropout=0.5,
    )
    # Inference below runs on CPU tensors; map_location lets a checkpoint
    # saved from a GPU run load on a machine without CUDA.
    checkpoint = torch.load(model_path, map_location="cpu")
    model.load_state_dict(checkpoint["state_dict"])

    model.eval()

    gt = []
    baseline_last_known_values = []
    neural_predictions = []

    data_for_visualization = []

    eval_groups = full_groups[:100]

    for i, group in tqdm(enumerate(eval_groups), total=len(eval_groups)):
        df = grp_by_train.get_group(group)
        # NOTE(review): split_df and Dataset use their default horizon (30);
        # horizon_size is not forwarded to them.
        src, trg = split_df(df, split="val")
        # Work on a copy: columns are added below, and writing to a slice of
        # the group frame triggers pandas' SettingWithCopyWarning.
        trg = trg.copy()

        history = src[target].tolist()[-120:]
        ground_truth = trg[target].tolist()

        # Baseline: repeat the last observed value over the whole horizon.
        last_known_value = src[target].values[-1]
        trg["last_known_value"] = last_known_value

        gt += ground_truth
        baseline_last_known_values += trg["last_known_value"].tolist()

        # val_data is indexed by the same full_groups list, so item i is the
        # tensor version of this group.
        src, trg_in, _ = val_data[i]

        src, trg_in = src.unsqueeze(0), trg_in.unsqueeze(0)

        with torch.no_grad():
            # Autoregressive decoding: start from the first decoder step,
            # then feed each new prediction back in as the lagged target
            # (last decoder feature) of the next step.
            prediction = model((src, trg_in[:, :1, :]))
            for j in range(1, horizon_size):
                last_prediction = prediction[0, -1]
                trg_in[:, j, -1] = last_prediction
                prediction = model((src, trg_in[:, : (j + 1), :]))

            trg[target + "_predicted"] = (prediction.squeeze().numpy()).tolist()

        predicted = trg[target + "_predicted"].tolist()
        neural_predictions += predicted

        data_for_visualization.append(
            {
                "history": history,
                "ground_truth": ground_truth,
                "prediction": predicted,
            }
        )

    baseline_eval = evaluate_regression(gt, baseline_last_known_values)
    model_eval = evaluate_regression(gt, neural_predictions)

    eval_dict = {
        "Baseline_MAE": baseline_eval["mae"],
        "Baseline_SMAPE": baseline_eval["smape"],
        "Model_MAE": model_eval["mae"],
        "Model_SMAPE": model_eval["smape"],
    }

    if eval_json_path is not None:
        with open(eval_json_path, "w") as f:
            json.dump(eval_dict, f, indent=4)

    if data_for_visualization_path is not None:
        with open(data_for_visualization_path, "w") as f:
            json.dump(data_for_visualization, f, indent=4)

    for k, v in eval_dict.items():
        print(k, v)

    return eval_dict

Run the main routine of evaluation.py.

In [None]:
# Paths for the evaluation run: processed data, column config and training
# result as inputs; metrics JSON and visualization JSON as outputs.
data_csv_path = "data/processed_data.csv"
feature_target_names_path = "data/config.json"
trained_json_path = "models/trained_config.json"
eval_json_path = "data/eval.json"
data_for_visualization_path = "data/visualization.json"

# horizon_size keeps the default of evaluate().
evaluate(
    data_csv_path=data_csv_path,
    feature_target_names_path=feature_target_names_path,
    trained_json_path=trained_json_path,
    eval_json_path=eval_json_path,
    data_for_visualization_path=data_for_visualization_path,
)

## Plot images

In [None]:
import json
import os

import matplotlib.pyplot as plt

# Plot history, ground truth and prediction for every evaluated series and
# store each figure as data/images/<i>.png.
with open("data/visualization.json", "r") as f:
    data = json.load(f)

os.makedirs("data/images", exist_ok=True)

for i, sample in enumerate(data):
    hist_size = len(sample["history"])
    gt_size = len(sample["ground_truth"])
    # Ground truth and prediction share the x range right after the history.
    future_steps = range(hist_size, hist_size + gt_size)

    plt.figure()
    plt.plot(range(hist_size), sample["history"], label="History")
    plt.plot(future_steps, sample["ground_truth"], label="Ground Truth")
    plt.plot(future_steps, sample["prediction"], label="Prediction")

    plt.xlabel("Time")

    plt.ylabel("Time Series")

    plt.legend()

    # Save before show(): once show() returns the figure may already have
    # been released, and savefig would then write an empty image.
    plt.savefig(f"data/images/{i}.png")
    plt.show()
    plt.close()