In [1]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from darts import TimeSeries
from torch.utils.data import DataLoader

from utils import build_dataset_from_df, get_likelihood, get_optim_cls

In [2]:
# Pickled experiment frame that the evaluation cells below operate on.
# NOTE: read_pickle can execute arbitrary code while unpickling, so only point
# this at files from a trusted source.
input_path = Path(
    "../experiments/top9_twitter_1_1600_avgproc_min_int5m_reduced_6hr_augmented/top9_twitter_1_1600_avgproc_min_int5m_reduced_6hr_augmented.pkl"
)
base_df = pd.read_pickle(input_path)

In [3]:
# Mean squared error, applied to each validation batch in calculate_mses.
loss_fn = nn.MSELoss()


def _find_model_path(model_dir, model_name):
    """Return the first ``model.pt`` found beneath a ``model_name`` directory.

    Args:
        model_dir: ``pathlib.Path`` that is searched recursively.
        model_name: directory name that must appear above ``model.pt``
            (``"nhits"``, ``"rnn"`` or ``"deepar"`` in this notebook).

    Raises:
        FileNotFoundError: if no matching file exists under ``model_dir``.
    """
    model_path = next(model_dir.glob(f"**/{model_name}/**/model.pt"), None)
    if model_path is None:
        raise FileNotFoundError(
            f"No {model_name} model.pt found under {model_dir}")
    return model_path


def _parse_clen_plen(model_path):
    """Extract ``(clen, plen)`` from the ``clen=<int>_plen=<int>`` path component.

    Raises:
        ValueError: if the path contains no such component.
    """
    match = re.search(r"clen=(\d+)_plen=(\d+)", str(model_path))
    if match is None:
        raise ValueError(
            f"Cannot find 'clen=<int>_plen=<int>' in {model_path}")
    return int(match.group(1)), int(match.group(2))


def _load_darts_model(model_path, map_location):
    """Load a saved model object and, when present, its ``.ckpt`` weights.

    The object stored in ``model_path`` is loaded with ``torch.load``. If a
    sibling ``<name>.ckpt`` file exists, the inner ``model.model`` is replaced
    by one restored through ``load_from_checkpoint`` and the trainer is reset.
    Progress bar and logger are always disabled via ``trainer_params``.
    """
    # NOTE: torch.load unpickles arbitrary Python objects; only use it on
    # result directories you trust.
    with open(model_path, "rb") as fin:
        model = torch.load(fin, map_location=map_location)

    ckpt_path = model_path.with_name(model_path.name + ".ckpt")
    if ckpt_path.exists():
        model.model = model.model.__class__.load_from_checkpoint(ckpt_path)
        model.trainer = None
    model.trainer_params = {
        "enable_progress_bar": False,
        "logger": False,
    }
    return model


def calculate_mses(base_df):
    """Compute per-batch validation MSEs for the saved nhits, rnn and deepar models.

    For every distinct ``hash_func`` value (in sorted order, index ``idx``) the
    three models stored under the ``..._i={idx}`` results directory are loaded
    on CPU, the last ``count_len`` steps of every series are used as validation
    data, and each model's prediction is compared with the target after both
    have been passed through the scaler's inverse transform.

    Args:
        base_df: frame with at least the ``hash_func`` and ``counts`` columns;
            an optional ``sample`` column makes the series be rebuilt per sample.

    Returns:
        Tuple ``(losses, rnn_losses, deepar_losses)`` of Python float lists.
        Each entry is the MSE of ONE validation batch, accumulated over all
        ``hash_func`` values. Batches are not dropped when short
        (``drop_last=False``), so a plain mean over a list weights a smaller
        final batch the same as a full one.

    Raises:
        FileNotFoundError: if one of the three ``model.pt`` files is missing.
        ValueError: if ``clen``/``plen`` cannot be parsed from the model path.
    """
    losses = []
    rnn_losses = []
    deepar_losses = []
    map_location = torch.device("cpu")

    for idx, target in enumerate(sorted(base_df.hash_func.unique())):
        df = base_df[base_df.hash_func == target]

        model_dir = Path(f"../results_local/pred/top9_twitter_1_1600_avgproc_min_int5m_reduced_6hr_augmented_i={idx}")
        model_path = _find_model_path(model_dir, "nhits")
        rnn_model_path = _find_model_path(model_dir, "rnn")
        deepar_model_path = _find_model_path(model_dir, "deepar")

        clen, plen = _parse_clen_plen(model_path)
        print(clen, plen)

        print("Loading model from %s" % model_path)
        # The scaler is stored next to the nhits model; load it on CPU like
        # the models so the notebook also runs on machines without a GPU.
        scaler = torch.load(model_path.with_name("scaler.pt"), map_location=map_location)
        model = _load_darts_model(model_path, map_location)

        print("Loading rnn model from %s" % rnn_model_path)
        rnn_model = _load_darts_model(rnn_model_path, map_location)

        print("Loading deepar model from %s" % deepar_model_path)
        deepar_model = _load_darts_model(deepar_model_path, map_location)

        has_sample_col = "sample" in df.columns
        dataset = build_dataset_from_df(df, check_validity=(not has_sample_col))
        # dataset_ts is ordered by 'group'
        dataset_ts = TimeSeries.from_group_dataframe(
            dataset, group_cols="group", time_col="time_idx", value_cols="value")

        # Without a 'sample' column the series built from the whole frame are
        # the evaluation data (previously raw_data was undefined in that case).
        raw_data = list(dataset_ts)
        if has_sample_col:
            # regenerate dataset for each sample by using scaler for each function
            raw_data = []
            for sample_idx in sorted(df["sample"].unique()):
                dataset = build_dataset_from_df(df[df["sample"] == sample_idx])
                dataset_ts = TimeSeries.from_group_dataframe(
                    dataset, group_cols="group", time_col="time_idx", value_cols="value")
                # scaler order will be the same as the dataset_ts order
                raw_data.extend(dataset_ts)

        count_len = int(df.counts.agg(lambda x: x.size).max())
        print(f"max count len: {count_len}", flush=True)

        # Keep only the trailing count_len steps of every series for validation.
        splits = [ts_item.split_after(len(ts_item) - count_len - 1) for ts_item in raw_data]
        _, val = zip(*splits)

        # NOTE(review): every series is transformed on its own and inverted
        # with _fitted_params[0]; confirm that the first fitted scaler is the
        # intended one for all groups.
        scaled = [scaler.transform(v) for v in val]
        inverse_transform = scaler._fitted_params[0].inverse_transform

        val_dataset = model._build_train_dataset(scaled, None, None, None)
        val_loader = DataLoader(val_dataset, batch_size=32, drop_last=False, shuffle=False, collate_fn=model._batch_collate_fn)
        rnn_model.model.set_predict_parameters(n=plen, num_samples=1, roll_size=1, batch_size=32, n_jobs=1)
        deepar_model.model.set_predict_parameters(n=plen, num_samples=30, roll_size=1, batch_size=32, n_jobs=1)

        with torch.no_grad():
            for input_batch in val_loader:
                x, _, _, y = input_batch
                y2 = torch.FloatTensor(inverse_transform(y[:, :, 0]))

                # nhits: direct forward pass on the past-target tensor.
                pred = model.model((x, None))
                pred2 = torch.FloatTensor(inverse_transform(pred[:, :, 0, 0]))
                losses.append(loss_fn(pred2, y2).item())

                rnn_pred = rnn_model.model._get_batch_prediction(plen, input_batch, 1)
                rnn_pred2 = torch.FloatTensor(inverse_transform(rnn_pred[:, :, 0]))
                rnn_losses.append(loss_fn(rnn_pred2, y2).item())

                # deepar: average pred_num_samples separate batch predictions
                # before comparing against the target.
                deepar_samples = [
                    deepar_model.model._get_batch_prediction(plen, input_batch, 1)
                    for _ in range(deepar_model.model.pred_num_samples)
                ]
                deepar_pred = torch.stack(deepar_samples).mean(dim=0)
                deepar_pred2 = torch.FloatTensor(inverse_transform(deepar_pred[:, :, 0]))
                deepar_losses.append(loss_fn(deepar_pred2, y2).item())

    return losses, rnn_losses, deepar_losses

In [4]:
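# Debugging leftovers: `pred`, `rnn_pred` and `deepar_pred` are local to
# calculate_mses, so these lines only work when placed inside its batch loop.
# print(pred.shape)
# print(rnn_pred.shape)
# print(deepar_pred.shape)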

In [5]:
# Score all three forecasters; each list holds one MSE per validation batch.
(
    nhits_losses,
    rnn_losses,
    deepar_losses,
) = calculate_mses(base_df)

15 7
Loading model from ../results_local/pred/top9_twitter_1_1600_avgproc_min_int5m_reduced_6hr_augmented_i=0/seed=42/darts/clen=15_plen=7/nhits/bs=32_lr=0.001/b=1_s=3_l=2_lw=512_do=0.1/model.pt
Loading rnn model from ../results_local/pred/top9_twitter_1_1600_avgproc_min_int5m_reduced_6hr_augmented_i=0/seed=42/darts/clen=15_plen=7/rnn/bs=32_lr=0.001/LSTM/l=2_lw=512_do=0.1/model.pt
Loading deepar model from ../results_local/pred/top9_twitter_1_1600_avgproc_min_int5m_reduced_6hr_augmented_i=0/seed=42/darts/clen=15_plen=7/deepar/bs=32_lr=0.001/LSTM/l=2_lw=512_do=0.1_gaussian/model.pt
max count len: 360
15 7
Loading model from ../results_local/pred/top9_twitter_1_1600_avgproc_min_int5m_reduced_6hr_augmented_i=1/seed=42/darts/clen=15_plen=7/nhits/bs=32_lr=0.001/b=1_s=3_l=2_lw=512_do=0.1/model.pt
Loading rnn model from ../results_local/pred/top9_twitter_1_1600_avgproc_min_int5m_reduced_6hr_augmented_i=1/seed=42/darts/clen=15_plen=7/rnn/bs=32_lr=0.001/LSTM/l=2_lw=512_do=0.1/model.pt
Loading d

In [6]:
# Number of scored batches, then the square root of the mean per-batch MSE.
print(len(nhits_losses))
np.sqrt(np.asarray(nhits_losses).mean())

430


116.24016873484885

In [7]:
# Number of scored batches, then the square root of the mean per-batch MSE.
print(len(rnn_losses))
np.sqrt(np.asarray(rnn_losses).mean())

430


123.93639708611852

In [8]:
# Number of scored batches, then the square root of the mean per-batch MSE.
print(len(deepar_losses))
np.sqrt(np.asarray(deepar_losses).mean())

430


122.37697701215882