In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
!pip3 install thop



In [33]:

import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import StandardScaler
import os, sys, gc, time, warnings, pickle, psutil, random
warnings.filterwarnings('ignore')

class Config(object):
  """Flat container of experiment settings built from a dict of overrides.

  Every key that is missing from ``args`` falls back to the default written
  next to it in ``__init__``. Constructing a Config also seeds ``random``,
  ``numpy`` and ``torch`` with ``seed``.

  Args:
    args: dict mapping setting names to values; may be empty.
  """
  def __init__(self, args):
    #basic config
    self.is_training = args.get('is_training', 1)
    self.model_id = args.get('model_id', 'test')
    self.model = args.get('model', 'Autoformer')
    #dataloader
    self.train_data = args.get('train_data', 0.7)
    self.data = args.get('data', 'test')
    self.root_path = args.get('root_path', '/content/drive/MyDrive/M5-FITS/processed-nonparam')
    self.data_path = args.get('data_path', 'm5.csv')
    self.features = args.get('features', 'S')
    self.target = args.get('target', 'sales')
    self.freq = args.get('freq', 'd')
    self.scale = args.get('scale', True)
    self.checkpoints = args.get('checkpoints', '/content/drive/MyDrive/M5-FITS/checkpoints')
    #forecasting
    self.seq_len = args.get('seq_len', 56)
    self.pred_len = args.get('pred_len', 28)
    self.label_len = args.get('label_len', 28)
    self.individual = args.get('individual', False)
    #optimization
    self.num_workers = args.get('num_workers', 10)
    self.itr = args.get('itr', 2)
    self.train_epochs = args.get('train_epochs', 100)
    self.batch_size = args.get('batch_size', 32)
    self.kernel_size = args.get('kernel_size', 25) ## FOR OPTIMIZING TREND COMPONENT SMOOTHNESS
    self.kernel_type = args.get('kernel_type', 'moving_avg')
    self.patience = args.get('patience', 3)
    self.learning_rate = args.get('learning_rate', 0.0001)
    self.des = args.get('des', 'test')
    self.loss = args.get('loss', 'mse')
    self.lradj = args.get('lradj', 'type3')
    self.use_amp = args.get('use_amp', False)
    #GPU
    self.use_gpu = args.get('use_gpu', True)
    self.gpu = args.get('gpu', 0)
    self.use_multi_gpu = args.get('use_multi_gpu', False)
    self.devices = args.get('devices', '0,1,2,3')
    # Integer form of `devices`; the model builder passes it to nn.DataParallel
    # when use_multi_gpu is set, so it must always exist.
    self.device_ids = args.get(
        'device_ids',
        [int(dev) for dev in str(self.devices).replace(' ', '').split(',') if dev])
    self.test_flop = args.get('test_flop', False)
    #Augmentation
    self.aug_method = args.get('aug_method', 'NA')
    self.aug_rate = args.get('aug_rate', 0.5)
    self.in_batch_augmentation = args.get('in_batch_augmentation', False)
    self.in_dataset_augmentation = args.get('in_dataset_augmentation', False)
    # Read by Dataset_Custom.data_augmentation; without a default here that
    # method raises AttributeError as soon as dataset augmentation is enabled.
    self.closer_data_aug_more = args.get('closer_data_aug_more', False)
    self.data_size = args.get('data_size', 1)
    self.aug_data_size = args.get('aug_data_size', 1)
    self.seed = args.get('seed', 2021)
    #continue learning
    self.testset_div = args.get('testset_div', 2)
    self.test_time_train = args.get('test_time_train', False)
    #Formers
    self.embed = args.get('embed', 'timeF')
    self.enc_in = args.get('enc_in', 7)
    self.dec_in = args.get('dec_in', 7)
    self.c_out = args.get('c_out', 7)
    self.d_model = args.get('d_model', 512)
    self.n_heads = args.get('n_heads', 8)
    self.e_layers = args.get('e_layers', 2)
    self.d_layers = args.get('d_layers', 1)
    self.d_ff = args.get('d_ff', 2048)
    self.moving_avg = args.get('moving_avg', 25)
    self.factor = args.get('factor', 1)
    self.distil = args.get('distil', True)
    self.dropout = args.get('dropout', 0.1)
    self.activation = args.get('activation', 'relu')
    self.output_attention = args.get('output_attention', False)
    self.do_predict = args.get('do_predict', False)

    #Flinear
    self.train_mode = args.get('train_mode', 0)
    self.cut_freq = args.get('cut_freq', 0)
    self.base_T = args.get('base_T', 24)
    self.H_order = args.get('H_order', 2)

    # A GPU is only used when it was requested AND CUDA is actually available.
    self.use_gpu = bool(self.use_gpu and torch.cuda.is_available())
    # cut_freq == 0 means "derive it from seq_len, base_T and H_order".
    if self.cut_freq == 0:
      self.cut_freq = int(self.seq_len // self.base_T + 1) * self.H_order + 10

    # Seed every RNG the notebook draws from.
    fix_seed = self.seed
    random.seed(fix_seed)
    torch.manual_seed(fix_seed)
    np.random.seed(fix_seed)

In [34]:
# Google Drive locations used by the rest of the notebook.
# processed_data_dir_base: directory that holds raw_data.pkl (loaded into global_df).
processed_data_dir_base = '/content/drive/MyDrive/M5-thesis/M5-FITS/processed/'
# submissions_dir: train_model writes one "<model_name>.csv" submission file here.
submissions_dir = '/content/drive/MyDrive/M5-thesis/submissions/partial/'
thesis_dir = '/content/drive/MyDrive/M5-thesis/'

# Work from the thesis root.
# NOTE(review): presumably this is what makes the later `utils.*` imports resolve — confirm utils/ lives here.
os.chdir(thesis_dir)

In [35]:
global_df = pd.read_pickle(processed_data_dir_base + 'raw_data.pkl')

In [36]:
from torch.utils.data import DataLoader
from utils.timefeatures import time_features
from utils.augmentations import augmentation

# Calendar anchor: prepare_features maps a day index d to start_date + (d - 1) days.
start_date = pd.Timestamp("2011-01-29")
# Day-index bounds shared by the data helpers below:
# get_data_by_store_id keeps rows with d >= TRAIN_START, and Dataset_Custom sizes
# its train/validation split from TRAIN_END - TRAIN_START.
TRAIN_START = 0
TRAIN_END = 1941
# TEST_END - TRAIN_END equals HORIZON; TEST_END and TRAIN_LEN are not referenced
# elsewhere in the visible notebook.
TEST_END = 1969
TRAIN_LEN = TRAIN_END - TRAIN_START
# Number of trailing rows Dataset_Custom reserves as the test target window.
HORIZON = 28

"""
build per-day calendar features (SNAP count, event count, date)
"""
def prepare_features(calendar):
  """Derive compact per-day features from a calendar frame.

  The input is left untouched; a modified copy is returned in which
    * ``snap_count`` is the row-wise sum of the three integer-cast snap_* flags,
    * ``is_event`` counts how many of event_name_1 / event_name_2 hold a string,
    * ``date`` equals ``start_date`` shifted by ``d - 1`` days,
  and the raw snap_*, event_name_* and intermediate is_event_* columns are dropped.
  """
  snap_cols = ['snap_CA', 'snap_TX', 'snap_WI']
  event_cols = ['event_name_1', 'event_name_2']

  feats = calendar.copy()
  for col in snap_cols:
    feats[col] = feats[col].astype(int)
  feats['snap_count'] = feats[snap_cols].sum(axis=1)

  # A cell counts as an event only when it holds a string (missing values do not).
  flag_cols = []
  for position, col in enumerate(event_cols, start=1):
    flag = f'is_event_{position}'
    feats[flag] = [int(isinstance(value, str)) for value in feats[col]]
    flag_cols.append(flag)
  feats['is_event'] = feats[flag_cols].sum(axis=1)

  feats = feats.drop(columns=event_cols + snap_cols + flag_cols)
  feats['date'] = start_date + pd.to_timedelta(feats['d'] - 1, unit='D')

  return feats

"""
select one store/department slice of the sales data and aggregate it per day
"""
def get_data_by_store_id(store_id, dept_id):
  """Return the rows of ``global_df`` for one store/department pair.

  Only rows whose day index ``d`` is at least ``TRAIN_START`` are kept.
  The result keeps the original row labels of ``global_df``.

  Args:
    store_id: value to match against the ``store_id`` column.
    dept_id: value to match against the ``dept_id`` column.

  Returns:
    A new DataFrame with the matching rows (empty when nothing matches).
  """
  # Boolean-mask selection already materialises a new frame, so the full
  # deep copy of global_df that used to precede the filter is unnecessary.
  keep = (
      (global_df['store_id'] == store_id)
      & (global_df['dept_id'] == dept_id)
      & (global_df['d'] >= TRAIN_START)
  )

  return global_df.loc[keep]

def aggregate_data(df):
  """Collapse item-level rows into one row per day index ``d``.

  For every distinct ``d`` the result holds the summed ``sales`` and the first
  ``date`` seen for that day. Columns of the result: ``d``, ``sales``, ``date``.
  """
  per_day = df[['d', 'sales', 'date']].groupby(['d'])
  daily = per_day.agg(sales=('sales', 'sum'), date=('date', 'first'))

  return daily.reset_index()


class Dataset_Custom(Dataset):
    """Sliding-window dataset over one frame that has a ``date`` and a target column.

    On construction the frame is split into train / val / test ranges, optionally
    standardised with statistics fitted on the train range only, and cut into
    (input window, label+prediction window) pairs which are kept in memory.

    Args:
        config: settings object; ``train_data``, ``test_time_train``, ``data_size``
            and ``in_dataset_augmentation`` are always read, the ``aug_*`` settings
            only when augmentation runs.
        df: DataFrame containing a ``date`` column and the ``target`` column.
        flag: one of ``'train'``, ``'val'``, ``'test'``.
        size: ``[seq_len, label_len, pred_len]``; ``None`` selects 384 / 96 / 96.
        features: ``'S'`` uses only the target column, ``'M'`` / ``'MS'`` use every
            column except ``date``.
        target: name of the target column.
        scale: whether to standardise the values.
        timeenc: 0 builds month/day/weekday/hour columns, 1 calls ``time_features``.
        freq: frequency string forwarded to ``time_features``.
    """
    def __init__(self, config, df, flag='train', size=None,
                 features='S',
                 target='OT', scale=True, timeenc=0, freq='h'):
        self.args = config
        # info
        if size is None:
            self.seq_len = 24 * 4 * 4
            self.label_len = 24 * 4
            self.pred_len = 24 * 4
        else:
            self.seq_len = size[0]
            self.label_len = size[1]
            self.pred_len = size[2]
        # init
        assert flag in ['train', 'test', 'val']
        type_map = {'train': 0, 'val': 1, 'test': 2}
        self.set_type = type_map[flag]

        self.features = features
        self.target = target
        self.scale = scale
        self.timeenc = timeenc
        self.freq = freq

        self.df_raw = df

        self.__read_data__()
        self.collect_all_data()
        if self.args.in_dataset_augmentation and self.set_type==0:
            self.data_augmentation()

    def __read_data__(self):
        """Split, scale and time-encode ``df_raw`` for the split chosen by ``flag``.

        Sets ``data_x`` / ``data_y`` (the same slice of the value array) and
        ``data_stamp`` (the time encoding for that slice).

        Raises:
            ValueError: if ``features`` or ``timeenc`` has an unsupported value.
        """
        self.scaler = StandardScaler()
        #TODO: read bottom-level data. Reproduce similar functionality as in DLinear

        '''
        df_raw.columns: ['date', ...(other features), target feature]
        '''
        cols = list(self.df_raw.columns)
        cols.remove(self.target)
        cols.remove('date')
        self.df_raw = self.df_raw[['date'] + cols + [self.target]]

        n_rows = len(self.df_raw)
        num_test = HORIZON  # Fixed to the last 28 days
        num_train = int((TRAIN_END - TRAIN_START) * self.args.train_data)
        num_vali = TRAIN_END - TRAIN_START - num_train
        # val and test start seq_len rows early so their first window has a full input.
        border1s = [0, num_train - self.seq_len, n_rows - num_test - self.seq_len]
        border2s = [num_train, num_train + num_vali, n_rows]

        if self.args.test_time_train:
            num_train = int(n_rows * 0.9)
            border1s = [0, num_train - self.seq_len, n_rows]
            border2s = [num_train, n_rows, n_rows]

        border1 = border1s[self.set_type]
        border2 = border2s[self.set_type]

        if self.features in ('M', 'MS'):
            df_data = self.df_raw[self.df_raw.columns[1:]]
        elif self.features == 'S':
            df_data = self.df_raw[[self.target]]
        else:
            raise ValueError(
                "features must be 'M', 'MS' or 'S', got {!r}".format(self.features))

        if self.scale:
            # Fit on the train range only so val/test statistics do not leak in.
            train_data = df_data[border1s[0]:border2s[0]]
            self.scaler.fit(train_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values

        # .copy(): columns are added below and must not write into a view of df_raw.
        df_stamp = self.df_raw[['date']][border1:border2].copy()
        df_stamp['date'] = pd.to_datetime(df_stamp['date'])
        if self.timeenc == 0:
            stamp = df_stamp['date'].dt
            df_stamp['month'] = stamp.month
            df_stamp['day'] = stamp.day
            df_stamp['weekday'] = stamp.weekday
            df_stamp['hour'] = stamp.hour
            # Keyword form: the positional axis argument was removed in pandas 2.0.
            data_stamp = df_stamp.drop(columns=['date']).values
        elif self.timeenc == 1:
            data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq)
            data_stamp = data_stamp.transpose(1, 0)
        else:
            raise ValueError("timeenc must be 0 or 1, got {!r}".format(self.timeenc))

        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]
        self.data_stamp = data_stamp
        print(border1, border2)

    def regenerate_augmentation_data(self):
        """Rebuild the plain windows and then append a fresh set of augmented ones."""
        self.collect_all_data()
        self.data_augmentation()

    def reload_data(self, x_data, y_data, x_time, y_time):
        """Replace the stored windows and their time encodings wholesale."""
        self.x_data = x_data
        self.y_data = y_data
        self.x_time = x_time
        self.y_time = y_time

    def collect_all_data(self):
        """Cut ``data_x`` / ``data_y`` / ``data_stamp`` into overlapping windows.

        Window ``i`` uses rows ``[i, i + seq_len)`` as input and rows
        ``[i + seq_len - label_len, i + seq_len + pred_len)`` as target. For the
        train split with ``data_size < 1`` the earliest windows are skipped.
        """
        self.x_data = []
        self.y_data = []
        self.x_time = []
        self.y_time = []
        n_windows = len(self.data_x) - self.seq_len - self.pred_len + 1
        mask_data_len = int((1-self.args.data_size) * n_windows) if self.args.data_size < 1 else 0
        # Only the train split honours the data_size mask.
        first = mask_data_len if self.set_type == 0 else 0
        for s_begin in range(first, n_windows):
            s_end = s_begin + self.seq_len
            r_begin = s_end - self.label_len
            r_end = r_begin + self.label_len + self.pred_len
            self.x_data.append(self.data_x[s_begin:s_end])
            self.y_data.append(self.data_y[r_begin:r_end])
            self.x_time.append(self.data_stamp[s_begin:s_end])
            self.y_time.append(self.data_stamp[r_begin:r_end])

    def data_augmentation(self):
        """Append augmented copies of the current windows.

        Each original window gets ``aug_data_size`` augmented copies, or a count
        that grows with the window position when ``closer_data_aug_more`` is set
        on the config (treated as False when the attribute is absent).

        Raises:
            ValueError: if ``aug_method`` is neither ``'f_mask'`` nor ``'f_mix'``.
        """
        origin_len = len(self.x_data)
        if not getattr(self.args, 'closer_data_aug_more', False):
            aug_size = [self.args.aug_data_size for i in range(origin_len)]
        else:
            aug_size = [int(self.args.aug_data_size * i/origin_len) + 1 for i in range(origin_len)]

        for i in range(origin_len):
            for _ in range(aug_size[i]):
                aug = augmentation('dataset')
                if self.args.aug_method == 'f_mask':
                    x,y = aug.freq_dropout(self.x_data[i],self.y_data[i],dropout_rate=self.args.aug_rate)
                elif self.args.aug_method == 'f_mix':
                    # Partner window drawn uniformly from everything stored so far.
                    rand = float(np.random.random())
                    i2 = int(rand*len(self.x_data))
                    x,y = aug.freq_mix(self.x_data[i],self.y_data[i],self.x_data[i2],self.y_data[i2],dropout_rate=self.args.aug_rate)
                else:
                    raise ValueError(
                        "aug_method must be 'f_mask' or 'f_mix', got {!r}".format(self.args.aug_method))
                self.x_data.append(x)
                self.y_data.append(y)
                # Augmented windows reuse the time encoding of their source window.
                self.x_time.append(self.x_time[i])
                self.y_time.append(self.y_time[i])

    def __getitem__(self, index):
        """Return ``(seq_x, seq_y, x_time, y_time)`` for window ``index``."""
        seq_x = self.x_data[index]
        seq_y = self.y_data[index]
        return seq_x, seq_y, self.x_time[index], self.y_time[index]

    def __len__(self):
        """Number of stored windows, augmented ones included."""
        return len(self.x_data)

    def inverse_transform(self, data):
        """Map scaled values back through the scaler created in ``__read_data__``."""
        return self.scaler.inverse_transform(data)

def data_provider(args, df, flag):
  """Build the dataset and its DataLoader for one split.

  The ``'test'`` split is served in order and keeps its last partial batch;
  every other split is shuffled and drops the last partial batch.

  Returns:
    ``(data_set, data_loader)``.
  """
  # 'timeF' embedding selects time encoding 1; any other embedding selects 0.
  timeenc = 1 if args.embed == 'timeF' else 0

  evaluating = flag == 'test'
  shuffle_flag = not evaluating
  drop_last = not evaluating

  window_sizes = [args.seq_len, args.label_len, args.pred_len]
  data_set = Dataset_Custom(
      config=args,
      flag=flag,
      df=df,
      size=window_sizes,
      features=args.features,
      target=args.target,
      timeenc=timeenc,
      freq=args.freq,
      scale=args.scale
  )

  loader_options = dict(
      batch_size=args.batch_size,
      shuffle=shuffle_flag,
      num_workers=args.num_workers,
      drop_last=drop_last,
  )
  data_loader = DataLoader(data_set, **loader_options)

  return data_set, data_loader

In [37]:
from re import X
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class moving_avg(nn.Module):
    """
    Moving average block to highlight the trend of time series

    Input and output are laid out as [batch, time, channel]. The series is
    padded by repeating its first and last time step so that, with stride 1,
    the output has as many time steps as the input for any kernel size.
    """
    def __init__(self, kernel_size, stride):
        super(moving_avg, self).__init__()
        self.kernel_size = kernel_size
        self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0)

    def forward(self, x):
        # Total padding must be kernel_size - 1 to keep the length. For odd
        # kernels both sides get the same amount; for even kernels the extra
        # step goes to the end (symmetric padding would lose one time step and
        # break the later `x - trend` subtraction).
        pad_front = (self.kernel_size - 1) // 2
        pad_end = self.kernel_size - 1 - pad_front
        front = x[:, 0:1, :].repeat(1, pad_front, 1)
        end = x[:, -1:, :].repeat(1, pad_end, 1)
        x = torch.cat([front, x, end], dim=1)
        # AvgPool1d pools over the last axis, so time is moved there and back.
        x = self.avg(x.permute(0, 2, 1))
        x = x.permute(0, 2, 1)
        return x

class MovingMedian(nn.Module):
    """
    Moving median block to highlight the trend of time series

    Input and output are laid out as [batch, time, channel]. The series is
    padded by repeating its first and last time step so that, with stride 1,
    the output has as many time steps as the input for any kernel size.
    """

    def __init__(self, kernel_size, stride):
        super(MovingMedian, self).__init__()
        self.kernel_size = kernel_size
        self.stride = stride

    def forward(self, x):
        # Total padding must be kernel_size - 1 to keep the length; for even
        # kernels the extra step goes to the end.
        pad_front = (self.kernel_size - 1) // 2
        pad_end = self.kernel_size - 1 - pad_front
        front = x[:, 0:1].repeat(1, pad_front, 1)
        end = x[:, -1:].repeat(1, pad_end, 1)
        x = torch.cat([front, x, end], dim=1)

        # unfold appends the window as a new last axis:
        # (batch, windows, channel, kernel_size)
        unfolded = x.unfold(dimension=1, size=self.kernel_size, step=self.stride)
        # Median over each window -> (batch, windows, channel)
        median_filtered = unfolded.median(dim=-1).values

        return median_filtered

class series_decomp(nn.Module):
    """
    Series decomposition block

    Splits a series into a smoothed trend and the residual around it. The
    smoother is ``moving_avg`` when ``configs.kernel_type == 'moving_avg'`` and
    ``MovingMedian`` for any other value; both use ``configs.kernel_size`` and
    stride 1.
    """
    def __init__(self, configs):
        super(series_decomp, self).__init__()
        use_mean = configs.kernel_type == 'moving_avg'
        smoother_cls = moving_avg if use_mean else MovingMedian
        # The attribute keeps the name `moving_avg` whichever smoother is chosen.
        self.moving_avg = smoother_cls(configs.kernel_size, stride=1)

    def forward(self, x):
        """Return ``(residual, trend)`` where ``residual = x - trend``."""
        trend = self.moving_avg(x)
        return x - trend, trend

class Model(nn.Module):
    """
    DLinear

    The input is decomposed into a residual ("seasonal") part and a trend part;
    each part is projected from ``seq_len`` to ``pred_len`` time steps by its own
    linear layer and the two projections are summed. With ``configs.individual``
    every one of the ``configs.enc_in`` channels gets its own pair of layers,
    otherwise one pair is shared by all channels.

    ``Linear_Decoder`` is created but not used by ``forward``; it is kept so the
    parameter set, checkpoint keys and random-number consumption stay unchanged.
    """
    def __init__(self, configs):
        super(Model, self).__init__()
        self.seq_len = configs.seq_len
        self.pred_len = configs.pred_len

        # Decompsition Kernel Size
        kernel_size = configs.kernel_size
        print(f"kernel size: {kernel_size}")
        self.decompsition = series_decomp(configs)
        self.individual = configs.individual
        self.channels = configs.enc_in

        # Layers are created in the order seasonal, trend, decoder so the default
        # initialisation draws random numbers in the same order as before.
        if self.individual:
            self.Linear_Seasonal = nn.ModuleList()
            self.Linear_Trend = nn.ModuleList()
            self.Linear_Decoder = nn.ModuleList()
            for i in range(self.channels):
                self.Linear_Seasonal.append(self._mean_init_linear())
                self.Linear_Trend.append(self._mean_init_linear())
                self.Linear_Decoder.append(nn.Linear(self.seq_len,self.pred_len))
        else:
            self.Linear_Seasonal = self._mean_init_linear()
            self.Linear_Trend = self._mean_init_linear()
            self.Linear_Decoder = nn.Linear(self.seq_len,self.pred_len)

    def _mean_init_linear(self):
        """Linear(seq_len -> pred_len) whose weight starts as a plain average of the input."""
        layer = nn.Linear(self.seq_len, self.pred_len)
        layer.weight = nn.Parameter((1/self.seq_len)*torch.ones([self.pred_len,self.seq_len]))
        return layer

    def forward(self, x):
        # x: [Batch, Input length, Channel]
        seasonal_init, trend_init = self.decompsition(x)
        # The linear layers act on the last axis, so time is moved there.
        seasonal_init, trend_init = seasonal_init.permute(0,2,1), trend_init.permute(0,2,1)
        if self.individual:
            out_shape = [seasonal_init.size(0), seasonal_init.size(1), self.pred_len]
            seasonal_output = torch.zeros(out_shape, dtype=seasonal_init.dtype, device=seasonal_init.device)
            trend_output = torch.zeros(out_shape, dtype=trend_init.dtype, device=trend_init.device)
            for i in range(self.channels):
                seasonal_output[:,i,:] = self.Linear_Seasonal[i](seasonal_init[:,i,:])
                trend_output[:,i,:] = self.Linear_Trend[i](trend_init[:,i,:])
        else:
            seasonal_output = self.Linear_Seasonal(seasonal_init)
            trend_output = self.Linear_Trend(trend_init)

        x = seasonal_output + trend_output
        return x.permute(0,2,1) # to [Batch, Output length, Channel]

In [38]:
from utils.tools import EarlyStopping, adjust_learning_rate, test_params_flop, visual
from utils.metrics import metric
import numpy as np
import torch
import torch.nn as nn
from torch import optim
from utils.augmentations import augmentation
import os
import time

import warnings
import matplotlib.pyplot as plt
import numpy as np

from thop import profile

class RMSELoss(nn.Module):
    """Root-mean-squared-error loss: the square root of ``nn.MSELoss``."""

    def __init__(self):
        super(RMSELoss, self).__init__()
        self.mse = nn.MSELoss()

    def forward(self,yhat,y):
        """Return sqrt(MSE(yhat, y)) as a scalar tensor."""
        mean_squared_error = self.mse(yhat, y)
        return mean_squared_error.sqrt()

class DLinear(object):
    """Base experiment wrapper: owns the device, the model and the training frame.

    Subclasses must implement ``_build_model`` and are expected to override the
    ``_get_data`` / ``vali`` / ``train`` / ``test`` hooks, which do nothing here.

    Args:
        args: settings object (``use_gpu``, ``gpu``, ``use_multi_gpu`` and
            ``devices`` are read here).
        setting: run name; stored as ``self.setting``.
        df_train: item-level frame with ``id``, ``d``, ``sales`` and ``date`` columns.
    """
    def __init__(self, args, setting, df_train):
        self.args = args
        self.device = self._acquire_device()
        self.model = self._build_model().to(self.device)
        self.df_train = df_train
        self.top_level = self.get_top_level()
        self.setting = setting

    def get_top_level(self):
      """
      Calculate top level time series by aggregating low level time-series
      """
      data = aggregate_data(self.df_train)

      return data

    def get_weights(self):
      """Share of each item in the top-level sales of a recent 28-column window.

      The window is the block of day columns at positions ``[-56:-28]``, i.e.
      the 28 columns that precede the final 28.

      Returns:
        Array of shape ``(n_items, 1)``. Rows are ordered by first appearance of
        each id in ``df_train`` — the order of ``df_train['id'].unique()`` — so
        they can be paired with that id list directly.
      """
      temp = self.df_train.pivot(index='id', columns='d', values='sales').fillna(0)
      # pivot returns its rows sorted by id; put them back in first-appearance
      # order so each weight lines up with the id it was computed for.
      temp = temp.reindex(self.df_train['id'].unique())

      recent = slice(-56, -28)
      item_totals = temp.iloc[:, recent].to_numpy().sum(axis=1)
      top_total = self.top_level['sales'].iloc[recent].sum()
      w = item_totals / top_total
      w = w.reshape(len(w), 1)

      del temp
      gc.collect()

      return w

    def _build_model(self):
        """Create the network; must be provided by subclasses."""
        raise NotImplementedError

    def _acquire_device(self):
        """Pick cuda:<gpu> when ``args.use_gpu`` is set, otherwise the CPU."""
        if self.args.use_gpu:
            os.environ["CUDA_VISIBLE_DEVICES"] = str(
                self.args.gpu) if not self.args.use_multi_gpu else self.args.devices
            device = torch.device('cuda:{}'.format(self.args.gpu))
            print('Use GPU: cuda:{}'.format(self.args.gpu))
        else:
            device = torch.device('cpu')
            print('Use CPU')
        return device

    def _get_data(self):
        pass

    def vali(self):
        pass

    def train(self):
        pass

    def test(self):
        pass

class M5DLinear(DLinear):
    """DLinear experiment on the aggregated (top-level) sales series.

    The network is trained on the per-day total of ``df_train``; item-level
    forecasts are obtained by splitting the top-level forecast with the weights
    returned by ``get_weights``.
    """
    def __init__(self, args, setting, df_train):
        super(M5DLinear, self).__init__(args, setting, df_train)

    def _build_model(self):
        """Create the float32 DLinear network, wrapped in DataParallel for multi-GPU runs."""
        model = Model(self.args).float()

        if self.args.use_multi_gpu and self.args.use_gpu:
            # Fall back to parsing `devices` when the settings object carries no
            # ready-made device_ids list.
            device_ids = getattr(self.args, 'device_ids', None)
            if device_ids is None:
                device_ids = [int(dev) for dev in str(self.args.devices).replace(' ', '').split(',') if dev]
            model = nn.DataParallel(model, device_ids=device_ids)

        return model

    def _get_data(self, flag):
        """Dataset and loader for ``flag`` built from a freshly aggregated top-level series."""
        data_set, data_loader = data_provider(self.args, self.get_top_level(), flag)

        return data_set, data_loader

    def _select_optimizer(self):
        model_optim = optim.Adam(self.model.parameters(), lr=self.args.learning_rate)
        print('!!!!!!!!!!!!!!learning rate!!!!!!!!!!!!!!!')
        print(self.args.learning_rate)
        return model_optim

    def _select_criterion(self):
        criterion = nn.MSELoss()
        return criterion

    def _get_profile(self, model):
        """Print and return the (MACs, params) that thop reports for one random batch."""
        _input=torch.randn(self.args.batch_size, self.args.seq_len, self.args.enc_in).to(self.device)
        macs, params = profile(model, inputs=(_input,))
        print('FLOPs: ', macs)
        print('params: ', params)
        return macs, params

    def vali(self, vali_data, vali_loader, criterion):
        """Average ``criterion`` over ``vali_loader`` without tracking gradients.

        The model is switched to eval mode for the pass and back to train mode
        afterwards. ``vali_data`` is accepted for interface compatibility only.
        """
        total_loss = []
        self.model.eval()
        with torch.no_grad():
            for batch_x, batch_y, _, _ in vali_loader:
                batch_x = batch_x.float().to(self.device)
                # Only the final pred_len steps of the label window are targets.
                batch_y = batch_y.float().to(self.device)[:,-self.args.pred_len:,:]

                outputs = self.model(batch_x)
                f_dim = -1 if self.args.features == 'MS' else 0
                outputs = outputs[:, -self.args.pred_len:, f_dim:]
                batch_y = batch_y[:, -self.args.pred_len:, f_dim:]

                pred = outputs.detach().cpu()
                true = batch_y.detach().cpu()

                # Store plain floats instead of tensors.
                total_loss.append(criterion(pred, true).item())
        total_loss = np.average(total_loss)
        self.model.train()
        return total_loss

    def train(self, ft=False):
        """Train with early stopping and leave the best checkpoint loaded.

        Args:
            ft: unused; kept for interface compatibility.

        Returns:
            ``(model, epoch_times)`` where ``epoch_times`` lists the wall-clock
            duration in seconds of every epoch that ran.
        """
        train_data, train_loader = self._get_data(flag='train')
        vali_data, vali_loader = self._get_data(flag='val')
        test_data, test_loader = self._get_data(flag='test')
        print(self.model)
        self._get_profile(self.model)
        print('Trainable parameters: ', sum(p.numel() for p in self.model.parameters() if p.requires_grad))

        path = os.path.join(self.args.checkpoints, self.setting)
        os.makedirs(path, exist_ok=True)

        time_now = time.time()

        train_steps = len(train_loader)
        early_stopping = EarlyStopping(patience=self.args.patience, verbose=True)

        model_optim = self._select_optimizer()
        criterion = self._select_criterion()

        epoch_times = []
        for epoch in range(self.args.train_epochs):
            iter_count = 0
            train_loss = []

            self.model.train()
            epoch_time = time.time()
            if self.args.in_dataset_augmentation:
                train_loader.dataset.regenerate_augmentation_data()

            for i, (batch_x, batch_y, _, _) in enumerate(train_loader):
                iter_count += 1
                model_optim.zero_grad()

                batch_x = batch_x.float().to(self.device)
                # Only the final pred_len steps of the label window are targets.
                batch_y = batch_y.float().to(self.device)[:,-self.args.pred_len:,:]

                outputs = self.model(batch_x)

                f_dim = -1 if self.args.features == 'MS' else 0
                outputs = outputs[:, -self.args.pred_len:, f_dim:]
                batch_y = batch_y[:, -self.args.pred_len:, f_dim:].to(self.device)
                loss = criterion(outputs, batch_y)
                train_loss.append(loss.item())

                if (i + 1) % 100 == 0:
                    print("\titers: {0}, epoch: {1} | loss: {2:.7f}".format(i + 1, epoch + 1, loss.item()))
                    speed = (time.time() - time_now) / iter_count
                    left_time = speed * ((self.args.train_epochs - epoch) * train_steps - i)
                    print('\tspeed: {:.4f}s/iter; left time: {:.4f}s'.format(speed, left_time))
                    iter_count = 0
                    time_now = time.time()

                loss.backward()
                model_optim.step()

            dur = time.time() - epoch_time
            epoch_times.append(dur)
            print("Epoch: {} cost time: {}".format(epoch + 1, dur))
            train_loss = np.average(train_loss)
            vali_loss = self.vali(vali_data, vali_loader, criterion)
            test_loss = self.vali(test_data, test_loader, criterion)

            print("Epoch: {0}, Steps: {1} | Train Loss: {2:.7f} Vali Loss: {3:.7f} Test Loss: {4:.7f}".format(
                epoch + 1, train_steps, train_loss, vali_loss, test_loss))
            early_stopping(vali_loss, self.model, path)
            if early_stopping.early_stop:
                print("Early stopping")
                break

            adjust_learning_rate(model_optim, epoch + 1, self.args)

        best_model_path = path + '/' + 'checkpoint.pth'
        # Restore the best weights rather than overwriting them: saving the
        # current state here would replace the early-stopping checkpoint with
        # the last-epoch weights and make the reload a no-op.
        # NOTE(review): assumes EarlyStopping writes <path>/checkpoint.pth whenever
        # validation loss improves (its "Saving model ..." log suggests so) — confirm.
        if epoch_times and os.path.exists(best_model_path):
            self.model.load_state_dict(torch.load(best_model_path, weights_only=True))
        else:
            # Nothing was checkpointed during this run; persist the current weights.
            torch.save(self.model.state_dict(), best_model_path)

        return self.model, epoch_times

    def load_saved_model(self):
        """Load ``<checkpoints>/<setting>/checkpoint.pth`` into the current model."""
        print('loading model')
        path = os.path.join(self.args.checkpoints, self.setting)
        path += '/' + 'checkpoint.pth'
        state_dict = torch.load(path, weights_only=True)
        # NOTE(review): thop's profile() appears to register total_ops/total_params
        # buffers on the modules it visits, so a checkpoint saved after profiling
        # can carry keys a freshly built model lacks. Drop those keys only when
        # this model does not have them itself.
        own_keys = self.model.state_dict().keys()
        state_dict = {
            key: value for key, value in state_dict.items()
            if key in own_keys or not key.endswith(('total_ops', 'total_params'))
        }

        self.model.load_state_dict(state_dict)

    def predict_top_level(self, test=0):
        """Forecast the top-level series from the first window of the test split.

        Args:
            test: when truthy, the saved checkpoint is loaded first.

        Returns:
            1-D array with the inverse-scaled forecast.

        Raises:
            ValueError: if the test split yields no window.
        """
        pred_set, pred_loader = self._get_data(flag='test')

        if test:
            self.load_saved_model()

        preds = []

        self.model.eval()
        with torch.no_grad():
            for batch_x, _, _, _ in pred_loader:
                batch_x = batch_x.float().to(self.device)

                outputs = self.model(batch_x)
                f_dim = -1 if self.args.features == 'MS' else 0
                outputs_ = outputs[:, -self.args.pred_len:, f_dim:]
                preds.append(outputs_.detach().cpu().numpy())

        if not preds:
            raise ValueError('the test split produced no input window to predict from')

        # Batches may differ in size, so join along the batch axis rather than
        # stacking; only the first window is used.
        preds = np.concatenate(preds, axis=0)
        preds = pred_set.inverse_transform(preds[0])

        return preds.flatten()

    def predict_bottom_level(self, test=0):
        """Split the top-level forecast across items.

        Returns:
            DataFrame indexed by ``id`` with one ``F<k>`` column per forecast step.
        """
        # NOTE(review): rows of get_weights() are paired positionally with
        # df_train['id'].unique() below — verify both use the same id order.
        w = self.get_weights()
        top_level_preds = self.predict_top_level(test=test).reshape(1, -1)
        preds = np.multiply(top_level_preds, w)

        item_ids = self.df_train['id'].unique()
        # Column count follows the forecast length instead of a fixed 28.
        columns = [f'F{i+1}' for i in range(preds.shape[1])]
        preds = pd.DataFrame(preds, index=pd.Index(item_ids, name='id'), columns=columns)

        return preds

In [39]:
def get_model_name(config, dept_id, store_id):
  """Compose the run name used for the checkpoint directory of one store/department.

  Layout: ``<model_id>_<model>_<data>_ft<features>_sl<seq_len>_ll<label_len>_pl<pred_len>_H<H_order>_<dept_id>_<store_id>_1``.
  """
  parts = [
      config.model_id,
      config.model,
      config.data,
      'ft{}'.format(config.features),
      'sl{}'.format(config.seq_len),
      'll{}'.format(config.label_len),
      'pl{}'.format(config.pred_len),
      'H{}'.format(config.H_order),
      dept_id,
      store_id,
      1,
  ]

  return '_'.join(str(part) for part in parts)

In [27]:
# Overrides for a single interactive run; every key not listed here takes the
# default defined in Config.
# NOTE(review): model_id says "median" but no 'kernel_type' is given, so Config
# falls back to 'moving_avg' and series_decomp uses the moving-average smoother —
# confirm which smoother this run was meant to use.
args = {
    'model_id': 'dlinear-median-1.0',
    'data': 'custom',
    'features': 'S',
    'model': 'DLinear',
    'checkpoints': '/content/drive/MyDrive/M5-thesis/M5-DLinear/checkpoints/v2',
    'seq_len': 56,
    'batch_size': 24,
    'seed': 42,
    'learning_rate': 0.005,
    'train_data': 0.8,
    'kernel_size': 21
}

# Building the Config also seeds random / numpy / torch with args['seed'].
config = Config(args)

In [None]:
store_id = 'WI_2'
dept_id = 'FOODS_2'

In [None]:
setting = get_model_name(config, dept_id, store_id)
data = get_data_by_store_id(store_id, dept_id)

In [None]:
dlinear = M5DLinear(config, setting=setting, df_train=data)
model = dlinear.train()

In [40]:
stores = global_df['store_id'].unique()
depts = global_df['dept_id'].unique()
model_training_time_path = '/content/drive/MyDrive/M5-thesis/M5-DLinear/training_times/'

In [41]:
def train_model(model_name, args):
  """Train one DLinear model per store/department pair and assemble a submission.

  For every combination of ``stores`` and ``depts`` a model is trained on that
  slice, its item-level forecast is written into the sample submission, and the
  training duration is recorded. Two files are written at the end:
  ``<submissions_dir><model_name>.csv`` and
  ``<model_training_time_path><model_name>_training_times.csv``.

  Args:
    model_name: name used in log lines and output file names.
    args: dict of overrides passed to ``Config``.

  Returns:
    ``(submission, full_training_time)`` — the filled submission frame and the
    summed epoch durations (seconds) over all pairs.
  """
  print(f"training {model_name}")
  # Make sure both output directories exist before the long training loop, so a
  # missing folder cannot discard the results at the final to_csv calls.
  os.makedirs(submissions_dir, exist_ok=True)
  os.makedirs(model_training_time_path, exist_ok=True)

  submission = pd.read_csv('/content/drive/MyDrive/data/m5-data/sample_submission.csv')
  submission.set_index('id', inplace=True)
  config = Config(args)

  full_training_time = 0
  cat_training_times = []
  for store_id in stores:
    for dept_id in depts:
      print(f"training {store_id} + {dept_id}")
      # Single source of truth for the run/checkpoint name.
      setting = get_model_name(config, dept_id, store_id)

      df_train = get_data_by_store_id(store_id, dept_id)
      learner = M5DLinear(config, setting=setting, df_train=df_train)
      _, epoch_times = learner.train()
      duration = sum(epoch_times)
      cat_training_times.append({'cat': f"{store_id}_{dept_id}", 'duration': duration})
      full_training_time += duration

      # update() matches rows on the `id` index and overwrites the F* columns.
      submission.update(learner.predict_bottom_level())

      print(f"{model_name} trained for {duration}")

      del df_train
      del learner
      gc.collect()

  submission.reset_index(inplace=True)
  submission.to_csv(f"{submissions_dir}{model_name}.csv", index=False)
  durations_df = pd.DataFrame(cat_training_times, columns=['cat', 'duration'])
  durations_df.to_csv(f"{model_training_time_path}{model_name}_training_times.csv", index=False)

  return submission, full_training_time

In [42]:
models = [
  # {
  #     'model_id': 'dlinear-median-1.0',
  #     'data': 'custom',
  #     'features': 'S',
  #     'model': 'DLinear',
  #     'checkpoints': '/content/drive/MyDrive/M5-thesis/M5-DLinear/checkpoints/',
  #     'seq_len': 56,
  #     'batch_size': 24,
  #     'seed': 42,
  #     'learning_rate': 0.005,
  #     'train_data': 0.8,
  #     'kernel_size': 21,
  #     'kernel_type': 'moving_median'
  # },
#   {
#     'model_id': 'dlinear-moving-avg-1.0',
#     'data': 'custom',
#     'features': 'S',
#     'model': 'DLinear',
#     'checkpoints': '/content/drive/MyDrive/M5-thesis/M5-DLinear/checkpoints/',
#     'seq_len': 56,
#     'batch_size': 24,
#     'seed': 42,
#     'learning_rate': 0.005,
#     'train_data': 0.8,
#     'kernel_size': 21
# },
#     {
#     'model_id': 'dlinear-media-lb-84-1.0',
#     'data': 'custom',
#     'features': 'S',
#     'model': 'DLinear',
#     'checkpoints': '/content/drive/MyDrive/M5-thesis/M5-DLinear/checkpoints/',
#     'seq_len': 84,
#     'batch_size': 24,
#     'seed': 42,
#     'learning_rate': 0.005,
#     'train_data': 0.8,
#     'kernel_size': 21,
#     'kernel_type': 'moving_median',
# },
#       {
#     'model_id': 'dlinear-media-lb-84-0.7-1.0',
#     'data': 'custom',
#     'features': 'S',
#     'model': 'DLinear',
#     'checkpoints': '/content/drive/MyDrive/M5-thesis/M5-DLinear/checkpoints/',
#     'seq_len': 84,
#     'batch_size': 24,
#     'seed': 42,
#     'learning_rate': 0.005,
#     'train_data': 0.7,
#     'kernel_size': 21,
#     'kernel_type': 'moving_median',
# },
#         {
#     'model_id': 'dlinear-media-lb-70-1.0',
#     'data': 'custom',
#     'features': 'S',
#     'model': 'DLinear',
#     'checkpoints': '/content/drive/MyDrive/M5-thesis/M5-DLinear/checkpoints/',
#     'seq_len': 84,
#     'batch_size': 24,
#     'seed': 42,
#     'learning_rate': 0.005,
#     'train_data': 0.8,
#     'kernel_size': 21,
#     'kernel_type': 'moving_median',
# },
#           {
#     'model_id': 'dlinear-media-lb-70-0.7-1.0',
#     'data': 'custom',
#     'features': 'S',
#     'model': 'DLinear',
#     'checkpoints': '/content/drive/MyDrive/M5-thesis/M5-DLinear/checkpoints/',
#     'seq_len': 84,
#     'batch_size': 24,
#     'seed': 42,
#     'learning_rate': 0.005,
#     'train_data': 0.7,
#     'kernel_size': 21,
#     'kernel_type': 'moving_median',
# },
            {
    'model_id': 'dlinear-media-lb-112-0.7-1.0',
    'data': 'custom',
    'features': 'S',
    'model': 'DLinear',
    'checkpoints': '/content/drive/MyDrive/M5-thesis/M5-DLinear/checkpoints/',
    'seq_len': 112,
    'batch_size': 24,
    'seed': 42,
    'learning_rate': 0.005,
    'train_data': 0.7,
    'kernel_size': 21,
    'kernel_type': 'moving_median',
},
              {
    'model_id': 'dlinear-media-lb-112-0.85-1.0',
    'data': 'custom',
    'features': 'S',
    'model': 'DLinear',
    'checkpoints': '/content/drive/MyDrive/M5-thesis/M5-DLinear/checkpoints/',
    'seq_len': 112,
    'batch_size': 24,
    'seed': 42,
    'learning_rate': 0.005,
    'train_data': 0.85,
    'kernel_size': 21,
    'kernel_type': 'moving_median',
}
]

In [43]:
# Train every configuration listed in `models` and collect one
# {'name', 'duration'} record per configuration for the summary table.
model_training_times = []
# NOTE(review): cat_times is never filled or read in the visible notebook.
cat_times = []
# Here `model` is a settings dict from `models`, not a network object.
for model in models:
  sub, durations = train_model(model['model_id'], model)
  model_training_times.append({'name': model['model_id'], 'duration': durations})

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
0 1358
1246 1941
1829 1969
Model(
  (decompsition): series_decomp(
    (moving_avg): MovingMedian()
  )
  (Linear_Seasonal): Linear(in_features=112, out_features=28, bias=True)
  (Linear_Trend): Linear(in_features=112, out_features=28, bias=True)
  (Linear_Decoder): Linear(in_features=112, out_features=28, bias=True)
)
[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.
FLOPs:  1053696.0
params:  6328.0
Trainable parameters:  9492
!!!!!!!!!!!!!!learning rate!!!!!!!!!!!!!!!
0.005
Epoch: 1 cost time: 1.6267499923706055
Epoch: 1, Steps: 50 | Train Loss: 0.9391995 Vali Loss: 3.3450122 Test Loss: 17.2746773
Validation loss decreased (inf --> 3.345012).  Saving model ...
Updating learning rate to 0.005
Epoch: 2 cost time: 1.4875688552856445
Epoch: 2, Steps: 50 | Train Loss: 0.8676937 Vali Loss: 3.2757263 Test Loss: 17.2238350
Validation loss decreased (3.345012 --> 3.275726).  Saving model ...
Updating 

In [44]:
model_training_times_df = pd.DataFrame(model_training_times, columns=['name', 'duration'])
model_training_times_df.to_csv(model_training_time_path + 'model_training_times.csv', index=False)
model_training_times_df

Unnamed: 0,name,duration
0,dlinear-media-lb-112-0.7-1.0,617.031081
1,dlinear-media-lb-112-0.85-1.0,769.694833
