In [1]:
# !pip install neuralforecast datasetsforecast

In [2]:
# pip install horovod==0.26.1

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from pathlib import Path
from typing import List, Optional

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from datasetsforecast.long_horizon import LongHorizon
from torch.utils.data import Dataset, DataLoader, Subset
from tqdm.auto import tqdm

from dataset import LongHorizonUnivariateDataModule, LongHorizonUnivariateDataset


  __import__("pkg_resources").declare_namespace(__name__)


In [5]:
%%time

# Convert the EMHIRES PV workbook into a parquet file indexed by an hourly
# timestamp column named 'ds'.
raw_path = Path('data/downloads/emhires_pv/EMHIRES_PVGIS_TSh_CF_n2_19862015_reformatt.xlsx')
parquet_path = Path('data/emhires/pv_n2.parquet')

df = (
    pd.read_excel(raw_path)
    # 'SE33' is discarded; presumably an unusable region column - TODO confirm.
    .drop(columns='SE33')
    # One hourly stamp per row, starting at 1986-01-01 00:00.
    .assign(time_step=lambda frame: pd.date_range(start='1986-01-01', periods=len(frame), freq='h'))
    .rename(columns={'time_step': 'ds'})
    .set_index('ds')
)

# to_parquet does not create missing parent directories, so make sure the
# target folder exists before writing.
parquet_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(parquet_path)

FileNotFoundError: [Errno 2] No such file or directory: 'data/downloads/emhires_pv/EMHIRES_PVGIS_TSh_CF_n2_19862015_reformatt.xlsx'

In [6]:
%%time

# NOTE(review): this cell performs the same workbook-to-parquet conversion as
# the preceding cell; one of the two can be removed.
# Read the workbook, drop 'SE33', index the rows by an hourly 'ds' timestamp
# starting at 1986-01-01, then write the result to parquet.
df = (
    pd.read_excel('data/downloads/emhires_pv/EMHIRES_PVGIS_TSh_CF_n2_19862015_reformatt.xlsx')
    .drop(columns='SE33')
    .assign(time_step=lambda frame: pd.date_range(start='1986-01-01', periods=len(frame), freq='h'))
    .rename(columns={'time_step': 'ds'})
    .set_index('ds')
)
df.to_parquet('data/emhires/pv_n2.parquet')

FileNotFoundError: [Errno 2] No such file or directory: 'data/downloads/emhires_pv/EMHIRES_PVGIS_TSh_CF_n2_19862015_reformatt.xlsx'

In [7]:
# Write the converted frame to 'test_pd.csv' in the working directory.
# NOTE(review): depends on `df` from the conversion cell above, which must have
# run successfully first.
df.to_csv('test_pd.csv')

NameError: name 'df' is not defined

In [None]:
%%time

# Load 'test_pd.parquet' into `a` (timed by the %%time magic of this cell).
# NOTE(review): no visible cell writes 'test_pd.parquet' - the cell above writes
# 'test_pd.csv' and the conversion cells write 'data/emhires/pv_n2.parquet'.
# Confirm which file is meant before relying on this cell.
a = pd.read_parquet('test_pd.parquet')

In [None]:
a

In [None]:
def collate_fn_flat_deal(batch):
    """Merge a list of per-sample dicts into a single dict of flat batches.

    Every sample maps keys to array-likes. Values that share a key are joined
    with ``np.concatenate`` along their first axis (they are concatenated, not
    stacked, so no new batch axis is added).

    Args:
        batch: iterable of dicts, as handed over by ``DataLoader``.

    Returns:
        dict with the keys in first-seen order. Boolean/numeric results are
        converted to ``torch.Tensor``; everything else (strings, timestamps,
        datetime64, other objects) is returned as a numpy array.

    Raises:
        ValueError: propagated from ``np.concatenate`` when the values of a key
            cannot be concatenated (e.g. mismatching trailing shapes).
    """
    grouped = {}
    for sample in batch:
        for key, value in sample.items():
            grouped.setdefault(key, []).append(value)

    collated = {}
    for key, chunks in grouped.items():
        merged = np.concatenate(chunks)
        # Decide by dtype rather than by inspecting merged[0]: that lookup
        # raises IndexError on empty results and misclassifies 2-D string or
        # datetime64 arrays, which torch cannot represent.
        if merged.dtype.kind in 'biufc':
            merged = torch.as_tensor(merged)
        collated[key] = merged
    return collated

class LongHorizonUnivariateDataModule(pl.LightningDataModule):
    """Lightning data module that serves train/val/test splits of a LongHorizon
    dataset through ``LongHorizonUnivariateDataset``.

    The splits are delimited by cumulative fractions of the data: with the
    default ``[0.6, 0.2, 0.2]`` the train split covers 0.0-0.6, validation
    0.6-0.8 and test/predict 0.8-1.0.

    NOTE(review): this definition shadows the class of the same name that the
    imports cell pulls in from the local ``dataset`` module; keep only one.

    Args:
        name: dataset group name, passed to ``LongHorizon.load`` and to
            ``LongHorizonUnivariateDataset``.
        train_batch_size: batch size of the training loader.
        eval_batch_size: batch size of the val/test/predict loaders; defaults
            to ``train_batch_size`` when ``None``.
        num_workers: number of ``DataLoader`` worker processes.
        persistent_workers: keep workers alive between epochs. Ignored when
            ``num_workers`` is 0, because ``DataLoader`` rejects that combination.
        horizon_length: forwarded to the dataset.
        history_length: forwarded to the dataset.
        split_proportions: fractions for train, val and test; must sum to 1.

    Raises:
        AssertionError: if ``split_proportions`` does not sum to 1.
    """

    def __init__(self,
                 name: str = 'ETTm2',
                 train_batch_size: int = 128,
                 eval_batch_size: Optional[int] = None,
                 num_workers: int = 4,
                 persistent_workers: bool = True,
                 horizon_length: int = 720,
                 history_length: int = 720,
                 split_proportions: List[float] = [0.6, 0.2, 0.2]
                 ):
        super().__init__()
        self.name = name
        self.train_batch_size = train_batch_size
        self.eval_batch_size = train_batch_size if eval_batch_size is None else eval_batch_size
        self.num_workers = num_workers
        self.persistent_workers = persistent_workers
        self.history_length = history_length
        self.horizon_length = horizon_length
        # The list default is never mutated: np.array() copies it.
        # Stored as cumulative boundaries, e.g. [0.6, 0.8, 1.0].
        self.split_proportions = np.array(split_proportions, dtype=float).cumsum()

        # Compare with a tolerance: float accumulation makes valid inputs such
        # as [0.7, 0.2, 0.1] sum to 0.9999999999999999.
        assert np.isclose(self.split_proportions[-1], 1.0), "Split proportions must sum up to 1"
        # Pin the final boundary so the last split reaches the end exactly.
        self.split_proportions[-1] = 1.0

    def prepare_data(self):
        """Load the configured dataset group into ``./data``.

        Presumably ``LongHorizon.load`` downloads the files on first use -
        confirm against the datasetsforecast documentation.
        """
        LongHorizon.load(directory='./data', group=self.name)

    def _make_dataset(self, split: str, split_start: float, split_end: float):
        """Build the dataset for one split between two cumulative fractions."""
        return LongHorizonUnivariateDataset(name=self.name, split=split,
                                            split_start=split_start,
                                            split_end=split_end,
                                            horizon_length=self.horizon_length,
                                            history_length=self.history_length)

    def setup(self, stage: Optional[str] = None):
        """Create the datasets needed for ``stage``.

        Args:
            stage: ``"fit"`` builds train and val, ``"validate"`` builds val,
                ``"test"`` builds test, ``"predict"`` builds the predict
                dataset (same range as test). ``None`` builds all of them.
        """
        bounds = self.split_proportions
        if stage in (None, "fit"):
            self.train_dataset = self._make_dataset('train', 0.0, bounds[0])
        # A stand-alone validation run also needs the val dataset.
        if stage in (None, "fit", "validate"):
            self.val_dataset = self._make_dataset('val', bounds[0], bounds[1])
        if stage in (None, "test"):
            self.test_dataset = self._make_dataset('test', bounds[1], bounds[2])
        if stage in (None, "predict"):
            self.predict_dataset = self._make_dataset('test', bounds[1], bounds[2])

    def _make_loader(self, dataset, batch_size: int, shuffle: bool) -> DataLoader:
        """Wrap ``dataset`` in a DataLoader with the module's shared settings."""
        return DataLoader(dataset, batch_size=batch_size,
                          shuffle=shuffle, pin_memory=True,
                          # DataLoader raises if persistent workers are
                          # requested without any worker process.
                          persistent_workers=bool(self.persistent_workers and self.num_workers > 0),
                          num_workers=self.num_workers, collate_fn=collate_fn_flat_deal)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._make_loader(self.train_dataset, self.train_batch_size, shuffle=True)

    def val_dataloader(self):
        """Ordered loader over the validation dataset."""
        return self._make_loader(self.val_dataset, self.eval_batch_size, shuffle=False)

    def test_dataloader(self):
        """Ordered loader over the test dataset."""
        return self._make_loader(self.test_dataset, self.eval_batch_size, shuffle=False)

    def predict_dataloader(self):
        """Ordered loader over the predict dataset."""
        return self._make_loader(self.predict_dataset, self.eval_batch_size, shuffle=False)

In [None]:
# Build the data module with its default dataset name ('ETTm2'), load the data,
# then create the train/val datasets ('fit') and the test dataset ('test').
dm = LongHorizonUnivariateDataModule(train_batch_size=128)
dm.prepare_data()
dm.setup(stage='fit')
dm.setup(stage='test')

In [None]:
dm.split_proportions

In [None]:
# Create one loader per split from the datasets built by dm.setup() above.
train_loader = dm.train_dataloader()
val_loader = dm.val_dataloader()
test_loader = dm.test_dataloader()

In [None]:
train_loader.dataset.time_features.shape

In [None]:
# Walk through every training batch without using it - presumably a smoke or
# throughput check (tqdm shows the progress). `b` keeps the last batch.
for b in tqdm(train_loader):
    pass

In [None]:
b

In [None]:
# Same pass over the validation loader; `b` is overwritten with its last batch.
for b in tqdm(val_loader):
    pass

In [None]:
# Same pass over the test loader; `b` is overwritten with its last batch.
for b in tqdm(test_loader):
    pass

In [None]:
# Instantiate the training dataset directly for the 0.0-0.6 fraction of the
# data. horizon_length / history_length are not passed, so the dataset's own
# defaults apply.
ETTm2 = LongHorizonUnivariateDataset(name='ETTm2', split='train', split_start=0, split_end=0.6)

In [None]:
len(ETTm2.df)

In [None]:
ETTm2.df.index.max()

In [None]:
len(ETTm2.df)

In [None]:
# NOTE(review): magic numbers. 720 matches the default horizon/history length
# of the data module; 57600 and 12240 are presumably row counts of the full
# series and of one split - confirm and replace with named values.
(12240-720) / 57600

In [None]:
# Fetch the item at index 3 * num_windows - 1. Presumably the last window of
# the third series if items are laid out series by series - TODO confirm
# against the dataset's __getitem__.
ETTm2[2*ETTm2.num_windows+ETTm2.num_windows-1]

In [None]:
def collate_fn_flat_deal(batch):
    """Merge a list of per-sample dicts into a single dict of flat batches.

    NOTE(review): this notebook already defines a function with this name in
    the cell that declares ``LongHorizonUnivariateDataModule``; this later
    definition replaces it at run time. Keep a single definition.

    Every sample maps keys to array-likes. Values that share a key are joined
    with ``np.concatenate`` along their first axis (they are concatenated, not
    stacked, so no new batch axis is added).

    Args:
        batch: iterable of dicts, as handed over by ``DataLoader``.

    Returns:
        dict with the keys in first-seen order. Boolean/numeric results are
        converted to ``torch.Tensor``; everything else (strings, timestamps,
        datetime64, other objects) is returned as a numpy array.

    Raises:
        ValueError: propagated from ``np.concatenate`` when the values of a key
            cannot be concatenated (e.g. mismatching trailing shapes).
    """
    grouped = {}
    for sample in batch:
        for key, value in sample.items():
            grouped.setdefault(key, []).append(value)

    collated = {}
    for key, chunks in grouped.items():
        merged = np.concatenate(chunks)
        # Decide by dtype rather than by inspecting merged[0]: that lookup
        # raises IndexError on empty results and misclassifies 2-D string or
        # datetime64 arrays, which torch cannot represent.
        if merged.dtype.kind in 'biufc':
            merged = torch.as_tensor(merged)
        collated[key] = merged
    return collated

# Stand-alone shuffled loader over the ETTm2 dataset built above, using four
# persistent workers and the flat-concatenating collate function.
dl = DataLoader(
    ETTm2,
    batch_size=512,
    shuffle=True,
    pin_memory=True,
    persistent_workers=True,
    num_workers=4,
    collate_fn=collate_fn_flat_deal,
)

In [None]:
# Walk through every batch of `dl` once without using it; `b` keeps the last
# batch for the inspection cell below. `pass` is used as the no-op body so the
# loop does not overwrite the notebook variable `a`.
for b in tqdm(dl):
    pass

In [None]:
b['history'].shape

In [None]:
# NOTE(review): `Y_df` is not defined in any visible cell, so this fails on a
# fresh kernel. Presumably it is one of the frames returned by
# LongHorizon.load(...) - TODO confirm and add the defining cell.
# Parse the 'ds' column into datetimes. Bracket indexing is used because
# attribute assignment silently creates a plain attribute (not a column) when
# the name does not already exist as a column.
Y_df['ds'] = pd.to_datetime(Y_df['ds'])
# Y_df = Y_df.pivot(index='ds', columns='unique_id', values='y')

Y_df

In [None]:
Y_df

In [None]:
# from neuralforecast import TimeSeriesDataset

from neuralforecast.tsdataset import TimeSeriesDataset

In [None]:
# Build a neuralforecast TimeSeriesDataset from Y_df; the four values returned
# by from_df are unpacked here.
# NOTE(review): `Y_df` is not defined in any visible cell - confirm its origin.
ds, uids, last_dates, ds_sort  = TimeSeriesDataset.from_df(Y_df)

In [None]:
ds[0]['temporal']

In [None]:
len(ds)