In [60]:
from dataloading_helpers import utils, base
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
import sklearn

GenericDataFormatter = base.GenericDataFormatter
DataTypes = base.DataTypes
InputTypes = base.InputTypes


class ElectricityFormatter(GenericDataFormatter):
    _column_definition = [
      ('id', DataTypes.REAL_VALUED, InputTypes.ID),
      ('hours_from_start', DataTypes.REAL_VALUED, InputTypes.TIME),
      ('power_usage', DataTypes.REAL_VALUED, InputTypes.TARGET),
      ('hour', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
      ('day_of_week', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
      ('hours_from_start', DataTypes.REAL_VALUED, InputTypes.KNOWN_INPUT),
      ('categorical_id', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
  ]

    def __init__(self):
        """Initialises formatter."""

        self.identifiers = None
        self._real_scalers = None
        self._cat_scalers = None
        self._target_scaler = None
        self._num_classes_per_cat_input = None
        self._time_steps = self.get_fixed_params()['total_time_steps']

    def split_data(self, df, valid_boundary=1315, test_boundary=1339):
        index = df['days_from_start']
        train = df.loc[index < valid_boundary]
        valid = df.loc[(index >= valid_boundary - 7) & (index < test_boundary)]
        test = df.loc[index >= test_boundary - 7]

        self.set_scalers(train)

        return (self.transform_inputs(data) for data in [train, valid, test])

    def set_scalers(self, df):
        column_definitions = self.get_column_definition()
        id_column = utils.get_single_col_by_input_type(InputTypes.ID,
                                                   column_definitions)
        target_column = utils.get_single_col_by_input_type(InputTypes.TARGET,
                                                       column_definitions)

        # Format real scalers
        real_inputs = utils.extract_cols_from_data_type(
            DataTypes.REAL_VALUED, column_definitions,
            {InputTypes.ID, InputTypes.TIME})

        # Initialise scaler caches
        self._real_scalers = {}
        self._target_scaler = {}
        identifiers = []
        for identifier, sliced in df.groupby(id_column):
            if len(sliced) >= self._time_steps:

                data = sliced[real_inputs].values
                targets = sliced[[target_column]].values
                self._real_scalers[identifier] \
              = sklearn.preprocessing.StandardScaler().fit(data)

                self._target_scaler[identifier] \
              = sklearn.preprocessing.StandardScaler().fit(targets)
                identifiers.append(identifier)

        # Format categorical scalers
        categorical_inputs = utils.extract_cols_from_data_type(
            DataTypes.CATEGORICAL, column_definitions,
            {InputTypes.ID, InputTypes.TIME})

        categorical_scalers = {}
        num_classes = []
        for col in categorical_inputs:
            srs = df[col].apply(str)
            categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(
                srs.values)
            num_classes.append(srs.nunique())

        # Set categorical scaler outputs
        self._cat_scalers = categorical_scalers
        self._num_classes_per_cat_input = num_classes

        # Extract identifiers in case required
        self.identifiers = identifiers

    def transform_inputs(self, df):
        if self._real_scalers is None and self._cat_scalers is None:
            raise ValueError('Scalers have not been set!')

        column_definitions = self.get_column_definition()
        id_col = utils.get_single_col_by_input_type(InputTypes.ID,
                                                column_definitions)
        real_inputs = utils.extract_cols_from_data_type(
            DataTypes.REAL_VALUED, column_definitions,
            {InputTypes.ID, InputTypes.TIME})
        categorical_inputs = utils.extract_cols_from_data_type(
            DataTypes.CATEGORICAL, column_definitions,
            {InputTypes.ID, InputTypes.TIME})
    
    # Transform real inputs per entity
        df_list = []
        for identifier, sliced in df.groupby(id_col):

          # Filter out any trajectories that are too short
          if len(sliced) >= self._time_steps:
            sliced_copy = sliced.copy()
            sliced_copy[real_inputs] = self._real_scalers[identifier].transform(
                sliced_copy[real_inputs].values)
            df_list.append(sliced_copy)

        output = pd.concat(df_list, axis=0)

    # Format categorical inputs
        for col in categorical_inputs:
            string_df = df[col].apply(str)
            output[col] = self._cat_scalers[col].transform(string_df)

        return output
    
    
    def get_fixed_params(self):
        """Returns fixed model parameters for experiments."""

        fixed_params = {
            'total_time_steps': 8 * 24,
            'num_encoder_steps': 7 * 24,
            'num_epochs': 100,
            'early_stopping_patience': 5,
            'multiprocessing_workers': 5
        }

        return fixed_params
    

    def format_predictions(self, predictions):
        if self._target_scaler is None:
            raise ValueError('Scalers have not been set!')

        column_names = predictions.columns

        df_list = []
        for identifier, sliced in predictions.groupby('identifier'):
            sliced_copy = sliced.copy()
            target_scaler = self._target_scaler[identifier]

            for col in column_names:
                if col not in {'forecast_time', 'identifier'}:
                    sliced_copy[col] = target_scaler.inverse_transform(sliced_copy[col])
            df_list.append(sliced_copy)

        output = pd.concat(df_list, axis=0)

        return output

In [61]:
electricity = pd.read_csv(csv_file, index_col=0)    
standardizer = ElectricityFormatter()
train, test, validation = standardizer.split_data(df=electricity)

In [66]:
a

Unnamed: 0,power_usage,time_idx,days_from_start,categorical_id,date,id,hour,day,day_of_week,month,hours_from_start,categorical_day_of_week,categorical_hour
17544,-0.127174,26304.0,1096,0,2014-01-01 00:00:00,MT_001,-1.661325,1,-0.499719,1,-1.731721,2,0
17545,-0.050713,26305.0,1096,0,2014-01-01 01:00:00,MT_001,-1.516862,1,-0.499719,1,-1.731062,2,1
17546,-0.050713,26306.0,1096,0,2014-01-01 02:00:00,MT_001,-1.372399,1,-0.499719,1,-1.730403,2,2
17547,-0.050713,26307.0,1096,0,2014-01-01 03:00:00,MT_001,-1.227936,1,-0.499719,1,-1.729744,2,3
17548,-0.127174,26308.0,1096,0,2014-01-01 04:00:00,MT_001,-1.083473,1,-0.499719,1,-1.729085,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10460738,0.305581,31555.0,1314,368,2014-08-07 19:00:00,MT_370,1.083473,7,0.002292,8,1.729085,3,19
10460739,0.101327,31556.0,1314,368,2014-08-07 20:00:00,MT_370,1.227936,7,0.002292,8,1.729744,3,20
10460740,-0.303532,31557.0,1314,368,2014-08-07 21:00:00,MT_370,1.372399,7,0.002292,8,1.730403,3,21
10460741,0.002848,31558.0,1314,368,2014-08-07 22:00:00,MT_370,1.516862,7,0.002292,8,1.731062,3,22


In [16]:
import pandas as pd
import numpy as np
from pathlib import Path
from config import *
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.data.encoders import GroupNormalizer, TorchNormalizer, EncoderNormalizer
from sklearn.preprocessing import StandardScaler, LabelEncoder

# set path in config.py
txt_file = CONFIG_DICT["datasets"]["electricity"] / "LD2011_2014.txt"
csv_file = CONFIG_DICT["datasets"]["electricity"] / "LD2011_2014.csv"
csv_file_normalized = CONFIG_DICT["datasets"]["electricity"] / "LD2011_2014_normalized.csv"



"""
prep_electricity_data function copied from google paper:
https://github.com/google-research/google-research/blob/master/tft/script_download_data.py

args:
  -txt_file: path to .txt document containg raw electricity dataset
      
  -output_path: path to save/load prepared csv to/from

output: electricity_dataset_dict
  -training dataset
  -training dataloader
  -validation dataloader
  -validation dataset

"""

def prep_electricity_data(txt_file):
    df = pd.read_csv(txt_file, index_col=0, sep=';', decimal=',')
    df.index = pd.to_datetime(df.index)
    df.sort_index(inplace=True)

    # Used to determine the start and end dates of a series
    output = df.resample('1h').mean().replace(0., np.nan)

    earliest_time = output.index.min()


    df_list = []
    for label in output:
        srs = output[label]

        start_date = min(srs.fillna(method='ffill').dropna().index)
        end_date = max(srs.fillna(method='bfill').dropna().index)

        active_range = (srs.index >= start_date) & (srs.index <= end_date)
        srs = srs[active_range].fillna(0.)

        tmp = pd.DataFrame({'power_usage': srs})
        date = tmp.index
        tmp['time_idx'] = (date - earliest_time).seconds / 60 / 60 + (
          date - earliest_time).days * 24
        tmp['days_from_start'] = (date - earliest_time).days
        tmp['categorical_id'] = label
        tmp['date'] = date
        tmp['id'] = label
        tmp['hour'] = date.hour
        tmp['day'] = date.day
        tmp['day_of_week'] = date.dayofweek
        tmp['month'] = date.month

        df_list.append(tmp)

    output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True)

    output['categorical_id'] = output['id'].copy()
    output['hours_from_start'] = output['time_idx']
    #output['categorical_day_of_week'] = output['day_of_week'].copy()
    #output['categorical_hour'] = output['hour'].copy()

    # Filter to match range used by other academic papers
    output = output[(output['days_from_start'] >= 1096)
                    & (output['days_from_start'] < 1346)].copy()

    output.to_csv(csv_file)
    
    
    # normalize dataset similar to google implementation
    valid_boundary = 1315
    #test_boundary = 1339
    
    real_scaler = {}
    cat_scaler = {}
    target_scaler = {}
    
    index = df['days_from_start']
    train = df.loc[index < valid_boundary]
    
    
    target = ["power_usage"]
    real = ["hour", "day_of_week", "hours_from_start"]
    categorical = ["categorical_id"]
    
    for i in target:
        data = train[i].values
        target_scaler[i] = StandardScaler().fit(data)
    
    for i in real:
        data = train[i].values
        real_scaler[i] = StandardScaler().fit(data)
      
    for i in categorical:
        data = train[i].values
        categorical_scaler[i] = LabelEncoder().fit(data)
    
    output_normalized.to_csv(csv_file_normalized)
    
    return output


def create_electricity_timeseries_tft():
   
    try:
        electricity_data = pd.read_csv(csv_file, index_col=0)    
    except:
        electricity_data = prep_electricity_data(txt_file)


    electricity_data['time_idx'] = electricity_data['time_idx'].astype('int')
    electricity_data['categorical_id'] = electricity_data['categorical_id'].astype('category')
  
    max_prediction_length = 24
    max_encoder_length = 168
    training_cutoff = electricity_data["time_idx"].max() - max_prediction_length
    

    training = TimeSeriesDataSet(
      electricity_data[lambda x: x.time_idx <= training_cutoff],
      time_idx="time_idx",
      target="power_usage",
      group_ids=["id"],
      min_encoder_length=max_encoder_length,# // 2,  # keep encoder length long (as it is in the validation set)
      max_encoder_length=max_encoder_length,
      min_prediction_length=max_prediction_length,
      max_prediction_length=max_prediction_length,
      static_categoricals=["categorical_id"],
      static_reals=[],
      time_varying_known_categoricals=[],
      #variable_groups={"special_days": special_days},  # group of categorical variables can be treated as one variable
      time_varying_known_reals=["time_idx", "hour", "day_of_week"],
      time_varying_unknown_categoricals=[],
      time_varying_unknown_reals=[],
      target_normalizer=TorchNormalizer(method="standard", center="False"),
      #categorical_encoders={"categorical_id": LabelEncoder()},
      add_relative_time_idx=False,
      add_target_scales=False,
      add_encoder_length=False, #
    )
    

  # create validation set (predict=True) which means to predict the last max_prediction_length points in time
  # for each series
    validation = TimeSeriesDataSet.from_dataset(training, electricity_data, predict=True, stop_randomization=True)

  # create dataloaders for model
    batch_size = 128  # set this between 32 to 128
    train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=45, pin_memory=True)
    val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 20, num_workers=30, pin_memory=True)


# output data as dict for easier modularity
    return {"training_dataset": training, 
          "train_dataloader": train_dataloader,
          "val_dataloader": val_dataloader, 
          "validaton_dataset": validation}


In [17]:
electricity = create_electricity_timeseries_tft()

timeseries_dict =  electricity
config_name_string = "electricity"
parameters = []
electricity



{'training_dataset': TimeSeriesDataSet[length=2118737](
 	time_idx='time_idx',
 	target='power_usage',
 	group_ids=['id'],
 	weight=None,
 	max_encoder_length=168,
 	min_encoder_length=168,
 	min_prediction_idx=26304,
 	min_prediction_length=24,
 	max_prediction_length=24,
 	static_categoricals=['categorical_id'],
 	static_reals=[],
 	time_varying_known_categoricals=[],
 	time_varying_known_reals=['time_idx', 'hour', 'day_of_week'],
 	time_varying_unknown_categoricals=[],
 	time_varying_unknown_reals=[],
 	variable_groups={},
 	constant_fill_strategy={},
 	allow_missing_timesteps=False,
 	lags={},
 	add_relative_time_idx=False,
 	add_target_scales=False,
 	add_encoder_length=False,
 	target_normalizer=TorchNormalizer(method='standard', center='False', transformation=None, method_kwargs={}),
 	categorical_encoders={'__group_id__id': NaNLabelEncoder(add_nan=False, warn=True), 'categorical_id': NaNLabelEncoder(add_nan=False, warn=True)},
 	scalers={'time_idx': StandardScaler(), 'hour': St

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from config import *
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.data.encoders import GroupNormalizer, TorchNormalizer, EncoderNormalizer
from sklearn.preprocessing import StandardScaler, LabelEncoder

# set path in config.py
txt_file = CONFIG_DICT["datasets"]["electricity"] / "LD2011_2014.txt"
csv_file = CONFIG_DICT["datasets"]["electricity"] / "LD2011_2014.csv"
csv_file_normalized = CONFIG_DICT["datasets"]["electricity"] / "LD2011_2014_normalized.csv"



"""
prep_electricity_data function copied from google paper:
https://github.com/google-research/google-research/blob/master/tft/script_download_data.py

args:
  -txt_file: path to .txt document containg raw electricity dataset
      
  -output_path: path to save/load prepared csv to/from

output: electricity_dataset_dict
  -training dataset
  -training dataloader
  -validation dataloader
  -validation dataset

"""

def prep_electricity_data(txt_file):
    df = pd.read_csv(txt_file, index_col=0, sep=';', decimal=',')
    df.index = pd.to_datetime(df.index)
    df.sort_index(inplace=True)

    # Used to determine the start and end dates of a series
    output = df.resample('1h').mean().replace(0., np.nan)

    earliest_time = output.index.min()


    df_list = []
    for label in output:
        srs = output[label]

        start_date = min(srs.fillna(method='ffill').dropna().index)
        end_date = max(srs.fillna(method='bfill').dropna().index)

        active_range = (srs.index >= start_date) & (srs.index <= end_date)
        srs = srs[active_range].fillna(0.)

        tmp = pd.DataFrame({'power_usage': srs})
        date = tmp.index
        tmp['time_idx'] = (date - earliest_time).seconds / 60 / 60 + (
          date - earliest_time).days * 24
        tmp['days_from_start'] = (date - earliest_time).days
        tmp['categorical_id'] = label
        tmp['date'] = date
        tmp['id'] = label
        tmp['hour'] = date.hour
        tmp['day'] = date.day
        tmp['day_of_week'] = date.dayofweek
        tmp['month'] = date.month

        df_list.append(tmp)

    output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True)

    output['categorical_id'] = output['id'].copy()
    output['hours_from_start'] = output['time_idx']
    #output['categorical_day_of_week'] = output['day_of_week'].copy()
    #output['categorical_hour'] = output['hour'].copy()

    # Filter to match range used by other academic papers
    output = output[(output['days_from_start'] >= 1096)
                    & (output['days_from_start'] < 1346)].copy()

    return output    

In [31]:
electricity_data = pd.read_csv(csv_file, index_col=0)    
electricity_data['time_idx'] = electricity_data['time_idx'].astype('int')
electricity_data['categorical_id'] = electricity_data['categorical_id'].astype('category')

In [32]:
# normalize dataset similar to google implementation
valid_boundary = 1315
    
real_scaler = {}
cat_scaler = {}
target_scaler = {}
    
index = electricity_data['days_from_start']
train = electricity_data.loc[index < valid_boundary]
    
    
real = ["hour", "day_of_week", "hours_from_start", "power_usage"]
categorical = ["categorical_id"]

for each customer in train.groupby("categorical_id")
    scaler = StandardScaler()
    X_numeric_std = pd.DataFrame(data=scaler.fit_transform(train[real]), columns=real)


cat_encoder = LabelEncoder()
X_std = pd.merge(X_numeric_std, X_cat_std[categorical], left_index=True, right_index=True)

LabelEncoder().fit_transform(X_std[categorical])

  y = column_or_1d(y, warn=True)


array([  0,   0,   0, ..., 368, 368, 368], dtype=int64)

In [33]:
X_std

Unnamed: 0,hour,day_of_week,hours_from_start,power_usage,categorical_id
0,-1.661328,-0.499545,-1.745523,-0.178208,0
1,-1.516865,-0.499545,-1.744862,-0.178113,0
2,-1.372402,-0.499545,-1.744201,-0.178113,0
3,-1.227939,-0.499545,-1.743540,-0.178113,0
4,-1.083476,-0.499545,-1.742879,-0.178208,0
...,...,...,...,...,...
1923531,1.083467,0.002468,1.725580,5.370831,368
1923532,1.227930,0.002468,1.726241,5.143978,368
1923533,1.372393,0.002468,1.726902,4.694323,368
1923534,1.516856,0.002468,1.727563,5.034602,368
