# Step By Step

In [19]:
# Clone the library
!git clone https://github.com/thuml/Time-Series-Library
# Switch working directory to the imported one
%cd '/content/Time-Series-Library'

fatal: destination path 'Time-Series-Library' already exists and is not an empty directory.
/content/Time-Series-Library


In [21]:
# Install all requirements
!pip install -r requirements.txt

Collecting einops==0.4.0 (from -r requirements.txt (line 1))
  Using cached einops-0.4.0-py3-none-any.whl (28 kB)
Collecting matplotlib==3.7.0 (from -r requirements.txt (line 2))
  Using cached matplotlib-3.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.6 MB)
Collecting numpy==1.23.5 (from -r requirements.txt (line 3))
  Using cached numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
Collecting pandas==1.5.3 (from -r requirements.txt (line 4))
  Using cached pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
Collecting patool==1.12 (from -r requirements.txt (line 5))
  Using cached patool-1.12-py2.py3-none-any.whl (77 kB)
Collecting reformer-pytorch==1.4.4 (from -r requirements.txt (line 6))
  Using cached reformer_pytorch-1.4.4-py3-none-any.whl (16 kB)
Collecting scipy==1.10.1 (from -r requirements.txt (line 8))
  Using cached scipy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 

In [20]:
# Print the current working directory. The `layers.*` imports further down are
# expected to resolve relative to the cloned repository root, so this should
# show that directory.
!pwd

/content/Time-Series-Library


In [16]:
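# Kept commented out: running this would switch the kernel back to /content,
# away from the cloned repository.
#%cd '/content'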

/content


In [17]:
# Import necessary packages
import os
import time

import numpy as np
import torch
import torch.fft
import torch.nn as nn
import torch.nn.functional as F

from layers.Conv_Blocks import Inception_Block_V1  # For now, I stay with inception block
from layers.Embed import DataEmbedding

In [None]:
# Creating TimesBlock
class TimesBlock(nn.Module):
    """One TimesNet block: fold the series into 2D maps by period and convolve them.

    For each of the ``top_k`` periods returned by ``FFT_for_Period`` the input
    is zero-padded so its length is a multiple of the period, reshaped into a
    2D map of shape ``[length // period, period]`` per channel, passed through
    a shared stack of two ``Inception_Block_V1`` layers with a GELU in between,
    flattened back to 1D and truncated to ``seq_len + pred_len``. The ``top_k``
    results are summed with softmax-normalised amplitude weights and added to
    the input (residual connection).

    Args:
        configs: configuration object. This block reads ``seq_len``,
            ``pred_len``, ``top_k``, ``d_model``, ``d_ff`` and ``num_kernels``.
    """

    def __init__(self, configs):
        super().__init__()
        self.seq_len = configs.seq_len    # input sequence length
        self.pred_len = configs.pred_len  # prediction length
        self.k = configs.top_k            # how many dominant frequencies are considered

        # Parameter-efficient design: the same convolution stack is reused for
        # every period (d_model -> d_ff -> d_model channels).
        self.conv = nn.Sequential(
            Inception_Block_V1(configs.d_model, configs.d_ff,
                               num_kernels=configs.num_kernels),
            nn.GELU(),
            Inception_Block_V1(configs.d_ff, configs.d_model,
                               num_kernels=configs.num_kernels)
        )

    def forward(self, x):
        """Apply the block to ``x``.

        Args:
            x: tensor of shape ``[B, T, N]`` (batch, time, features). ``T`` has
                to equal ``seq_len + pred_len``; the padding and reshape below
                are computed from that total length.

        Returns:
            Tensor with the same shape as ``x``.
        """
        B, T, N = x.size()
        total_len = self.seq_len + self.pred_len

        # period_list: the top_k periods; period_weight: [B, top_k] amplitudes.
        period_list, period_weight = FFT_for_Period(x, self.k)

        res = []
        for i in range(self.k):
            # Plain Python int, so the shape arithmetic below stays in Python.
            period = int(period_list[i])

            # Padding: to form a 2D map the total length must be divisible by
            # the period, so pad with zeros up to the next multiple.
            if total_len % period != 0:
                length = ((total_len // period) + 1) * period
                # new_zeros allocates directly with x's dtype and device
                # instead of building a CPU tensor and copying it over.
                padding = x.new_zeros((B, length - total_len, N))
                out = torch.cat([x, padding], dim=1)
            else:
                length = total_len
                out = x

            # Reshape to [B, N, length // period, period]: the two dimensions
            # to be convolved must be last for the 2D convolution;
            # contiguous() lays the permuted tensor out contiguously in memory.
            out = out.reshape(B, length // period, period,
                              N).permute(0, 3, 1, 2).contiguous()

            # 2D convolution to capture intra- and inter-period information.
            out = self.conv(out)

            # Back to [B, length, N], then drop the padded tail.
            out = out.permute(0, 2, 3, 1).reshape(B, -1, N)
            res.append(out[:, :total_len, :])
        res = torch.stack(res, dim=-1)  # [B, total_len, N, top_k]

        # Adaptive aggregation: softmax over the amplitudes -> [B, top_k].
        period_weight = F.softmax(period_weight, dim=1)

        # Weighted sum over the top_k results. The weights are viewed as
        # [B, 1, 1, top_k] and broadcast against res, which avoids
        # materialising a full [B, T, N, top_k] copy of them.
        res = torch.sum(res * period_weight[:, None, None, :], -1)

        # Residual connection.
        return res + x

In [None]:
# FFT definition
def FFT_for_Period(x, k=2):
    """Find the ``k`` dominant periods of ``x`` along the time axis.

    Args:
        x: tensor indexed as ``[B, T, C]`` (batch, time, channel); the real FFT
            is taken over dimension 1.
        k: number of dominant frequencies to keep.

    Returns:
        A tuple ``(period, period_weight)``:
        ``period`` is a NumPy integer array of shape ``[k]`` holding
        ``T // frequency_index`` for each selected frequency, ordered by
        decreasing amplitude; ``period_weight`` is a tensor of shape ``[B, k]``
        with the channel-averaged amplitude of each selected frequency for
        every sample.
    """
    # xf has shape [B, T // 2 + 1, C]: one complex coefficient per frequency bin.
    xf = torch.fft.rfft(x, dim=1)
    # Amplitudes are needed twice below, so compute them once.
    amplitude = xf.abs()

    # Find periods by amplitude. The periodic structure is assumed to be
    # shared across the batch and the channels, so both are averaged out,
    # leaving one overall amplitude per frequency bin.
    frequency_list = amplitude.mean(0).mean(-1)
    # Bin 0 is the constant (zero-frequency) component; exclude it from ranking.
    frequency_list[0] = 0

    # Indices of the k largest amplitudes, i.e. the k main frequencies.
    _, top_list = torch.topk(frequency_list, k)

    # Detach from the graph and convert to NumPy; only the indices are needed.
    top_list = top_list.detach().cpu().numpy()

    # period = T // frequency. Index 0 can still be selected when the
    # remaining amplitudes tie at zero (e.g. constant input); clamp it to 1 so
    # that case yields period T instead of an integer division by zero.
    period = x.shape[1] // top_list.clip(min=1)

    return period, amplitude.mean(-1)[:, top_list]

In [None]:
# Creating the model without unnecessary blocks
class Model(nn.Module):
    """TimesNet reduced to the forecasting tasks.

    The input is normalised per series, embedded, stretched along the time
    axis from ``seq_len`` to ``seq_len + pred_len`` by a linear layer, passed
    through ``e_layers`` ``TimesBlock`` modules (each followed by a shared
    layer normalisation), projected to ``c_out`` features and de-normalised.

    Args:
        configs: configuration object. This class reads ``task_name``,
            ``seq_len``, ``label_len``, ``pred_len``, ``e_layers``, ``enc_in``,
            ``d_model``, ``embed``, ``freq``, ``dropout`` and ``c_out``.
    """

    # Task names for which the forecasting head is built.
    _FORECAST_TASKS = ('long_term_forecast', 'short_term_forecast')

    def __init__(self, configs):
        super().__init__()
        # params init
        self.configs = configs
        self.task_name = configs.task_name
        self.seq_len = configs.seq_len
        self.label_len = configs.label_len
        self.pred_len = configs.pred_len

        # Stack TimesBlock e_layers times to form the main part of TimesNet.
        self.model = nn.ModuleList([TimesBlock(configs)
                                    for _ in range(configs.e_layers)])

        # Embedding & normalisation.
        # enc_in is the encoder input size (number of features of a sample);
        # d_model is the embedding dimension.
        self.enc_embedding = DataEmbedding(configs.enc_in, configs.d_model, configs.embed, configs.freq,
                                           configs.dropout)
        self.layer = configs.e_layers  # number of encoder layers
        self.layer_norm = nn.LayerNorm(configs.d_model)

        # Task-specific layers; only the forecasting head is kept here.
        if self.task_name in self._FORECAST_TASKS:
            # Maps the time axis from seq_len to seq_len + pred_len.
            self.predict_linear = nn.Linear(
                self.seq_len, self.pred_len + self.seq_len)
            # Maps the embedding dimension back to the output features.
            self.projection = nn.Linear(
                configs.d_model, configs.c_out, bias=True)

    def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
        """Forecast over the whole ``seq_len + pred_len`` window.

        Args:
            x_enc: input series, indexed as ``[B, seq_len, N]``.
            x_mark_enc: passed to the embedding together with ``x_enc``.
            x_dec: unused.
            x_mark_dec: unused.

        Returns:
            Tensor of shape ``[B, seq_len + pred_len, c_out]``.
        """
        # Normalisation (from Non-stationary Transformer) along the time axis.
        means = x_enc.mean(1, keepdim=True).detach()  # [B, 1, N]
        x_enc = x_enc - means
        stdev = torch.sqrt(
            torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5)  # [B, 1, N]
        # Out-of-place on purpose: an in-place division would overwrite the
        # tensor that torch.var saved for its backward pass.
        x_enc = x_enc / stdev

        # Embedding; the last dimension of the result is d_model (it feeds
        # LayerNorm(d_model) and the projection below).
        enc_out = self.enc_embedding(x_enc, x_mark_enc)
        # Align the temporal dimension: [B, seq_len, C] -> [B, seq_len + pred_len, C].
        enc_out = self.predict_linear(enc_out.permute(0, 2, 1)).permute(
            0, 2, 1)

        # TimesNet: pass through every TimesBlock, each followed by layer norm.
        for i in range(self.layer):
            enc_out = self.layer_norm(self.model[i](enc_out))

        # Project back: [B, T, d_model] -> [B, T, c_out].
        dec_out = self.projection(enc_out)

        # De-normalisation. stdev and means are [B, 1, N] and broadcast along
        # the time axis, so no explicit repeat is needed.
        dec_out = dec_out * stdev + means
        return dec_out

    def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
        """Run the model; makes instances callable as ``model(...)``.

        Args:
            x_enc, x_mark_enc, x_dec, x_mark_dec: forwarded to ``forecast``.

        Returns:
            The last ``pred_len`` time steps of the forecast, shape
            ``[B, pred_len, c_out]``.

        Raises:
            NotImplementedError: if ``task_name`` is not a forecasting task
                (no layers are built for other tasks).
        """
        if self.task_name in self._FORECAST_TASKS:
            dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
            return dec_out[:, -self.pred_len:, :]
        raise NotImplementedError(
            "Model only supports the forecasting tasks, got task_name={!r}".format(self.task_name))

In [None]:
import os

In [None]:
def _forecast_loss(self, criterion, batch_x, batch_y, batch_x_mark, batch_y_mark, dec_inp):
    """Run one forward pass and return the loss over the prediction horizon."""
    outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
    if self.args.output_attention:
        # With attention output enabled the prediction is the first item
        # (not the case for TimesNet).
        outputs = outputs[0]

    # Forecasting task, options: [M, S, MS]; M: multivariate predict multivariate,
    # S: univariate predict univariate, MS: multivariate predict univariate.
    # For MS only the last column is the target, so f_dim = -1 keeps just that
    # column; otherwise f_dim = 0 keeps all columns.
    f_dim = -1 if self.args.features == 'MS' else 0
    outputs = outputs[:, -self.args.pred_len:, f_dim:]
    target = batch_y[:, -self.args.pred_len:, f_dim:].to(self.device)
    return criterion(outputs, target)


def train(self, setting):
    """Train ``self.model`` with early stopping and return it with the best weights.

    Written as a method (note ``self``): the object has to provide ``args``,
    ``device``, ``model``, ``_get_data``, ``_select_optimizer``,
    ``_select_criterion`` and ``vali``. ``EarlyStopping`` and
    ``adjust_learning_rate`` must be in scope when this runs.
    NOTE(review): ``adjust_learning_rate`` is not defined in the visible part
    of this notebook - confirm where it comes from.

    Args:
        setting: name of this run; used as the sub-directory of
            ``self.args.checkpoints`` in which the checkpoint is stored.

    Returns:
        ``self.model`` after loading ``checkpoint.pth`` from that directory.
    """
    # Data loaders for the three splits.
    train_data, train_loader = self._get_data(flag='train')
    vali_data, vali_loader = self._get_data(flag='val')
    test_data, test_loader = self._get_data(flag='test')

    # Checkpoint directory for saving and loading the model. exist_ok avoids
    # the check-then-create race of a separate os.path.exists test.
    path = os.path.join(self.args.checkpoints, setting)
    os.makedirs(path, exist_ok=True)
    time_now = time.time()

    train_steps = len(train_loader)

    # Early stopping watches the validation loss and stops training once it
    # has not improved for `patience` consecutive epochs (guards against
    # overfitting).
    early_stopping = EarlyStopping(patience=self.args.patience, verbose=True)

    # Optimizer and loss function selection.
    model_optim = self._select_optimizer()
    criterion = self._select_criterion()

    # AMP (automatic mixed precision) uses lower-precision data types
    # (e.g. float16) for some computations to speed up training and save memory.
    if self.args.use_amp:
        scaler = torch.cuda.amp.GradScaler()

    for epoch in range(self.args.train_epochs):
        iter_count = 0
        train_loss = []
        self.model.train()
        epoch_time = time.time()

        # Training iterations of this epoch.
        for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(train_loader):
            iter_count += 1
            model_optim.zero_grad()
            batch_x = batch_x.float().to(self.device)  # input features
            batch_y = batch_y.float().to(self.device)  # target features

            # The *_mark tensors hold the time-related features that accompany
            # batch_x / batch_y.
            batch_x_mark = batch_x_mark.float().to(self.device)
            batch_y_mark = batch_y_mark.float().to(self.device)

            # Decoder input (not used by TimesNet): the first label_len steps
            # of the target followed by zeros for the prediction horizon.
            dec_inp = torch.zeros_like(batch_y[:, -self.args.pred_len:, :]).float()
            dec_inp = torch.cat([batch_y[:, :self.args.label_len, :], dec_inp], dim=1).float().to(self.device)

            # Forward pass and loss; identical with and without AMP except for
            # the autocast context (for TimesNet use_amp should be False).
            if self.args.use_amp:
                with torch.cuda.amp.autocast():
                    loss = _forecast_loss(self, criterion, batch_x, batch_y,
                                          batch_x_mark, batch_y_mark, dec_inp)
            else:
                loss = _forecast_loss(self, criterion, batch_x, batch_y,
                                      batch_x_mark, batch_y_mark, dec_inp)
            train_loss.append(loss.item())

            # Every 100 iterations report loss, speed and estimated time left.
            if (i + 1) % 100 == 0:
                print("\titers: {0}, epoch: {1} | loss: {2:.7f}".format(i + 1, epoch + 1, loss.item()))
                speed = (time.time() - time_now) / iter_count
                left_time = speed * ((self.args.train_epochs - epoch) * train_steps - i)
                print('\tspeed: {:.4f}s/iter; left time: {:.4f}s'.format(speed, left_time))
                iter_count = 0
                time_now = time.time()

            # Back-propagation and optimizer step.
            if self.args.use_amp:
                scaler.scale(loss).backward()
                scaler.step(model_optim)
                scaler.update()
            else:
                loss.backward()
                model_optim.step()

        # End of epoch: report timing and losses.
        print("Epoch: {} cost time: {}".format(epoch + 1, time.time() - epoch_time))
        train_loss = np.average(train_loss)

        # Evaluate the current model on the validation and test splits.
        vali_loss = self.vali(vali_data, vali_loader, criterion)
        test_loss = self.vali(test_data, test_loader, criterion)

        print("Epoch: {0}, Steps: {1} | Train Loss: {2:.7f} Vali Loss: {3:.7f} Test Loss: {4:.7f}".format(
            epoch + 1, train_steps, train_loss, vali_loss, test_loss))

        # Early stopping: once the validation loss has failed to improve for
        # `patience` epochs, abandon the remaining epochs altogether.
        early_stopping(vali_loss, self.model, path)
        if early_stopping.early_stop:
            print("Early stopping")
            break

        # Adjust the learning rate for the next epoch.
        adjust_learning_rate(model_optim, epoch + 1, self.args)

    # Restore the best weights written by EarlyStopping. map_location puts the
    # tensors on the training device regardless of where they were saved from.
    best_model_path = os.path.join(path, 'checkpoint.pth')
    self.model.load_state_dict(torch.load(best_model_path, map_location=self.device))
    return self.model

In [None]:
class EarlyStopping:
    """Stop training once the validation loss has stopped improving.

    Call the instance once per epoch with the validation loss. Whenever the
    loss improves by at least ``delta`` the model's ``state_dict`` is written
    to ``<path>/checkpoint.pth``; after ``patience`` consecutive calls without
    such an improvement ``early_stop`` becomes ``True``.

    Args:
        patience: number of consecutive non-improving calls that are tolerated.
        verbose: whether to print a message each time a checkpoint is saved.
        delta: minimum decrease of the loss that counts as an improvement.
    """

    def __init__(self, patience=7, verbose=False, delta=0):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0          # consecutive calls without improvement
        self.best_score = None    # best (negated) validation loss so far
        self.early_stop = False
        # float('inf') instead of np.Inf: that alias was removed in NumPy 2.0
        # and this also drops the dependency on numpy altogether.
        self.val_loss_min = float('inf')
        self.delta = delta

    def __call__(self, val_loss, model, path):
        """Record ``val_loss`` for this epoch and checkpoint ``model`` if it improved."""
        score = -val_loss  # higher is better
        if self.best_score is None:
            # First call: always keep a checkpoint so one exists to load later.
            self.best_score = score
            self.save_checkpoint(val_loss, model, path)

        # The score is not at least 'delta' better than the best one, so
        # further training may not bring a notable improvement. A NaN loss
        # (score != score) compares False with everything and would otherwise
        # fall through to the "improved" branch and overwrite the best
        # checkpoint, so it is counted as no improvement explicitly.
        elif score != score or score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            # Too many non-improving epochs in a row --> stop further training.
            if self.counter >= self.patience:
                self.early_stop = True

        else:
            # Loss is still decreasing: save the new best model and go on.
            self.best_score = score
            self.save_checkpoint(val_loss, model, path)
            self.counter = 0

    def save_checkpoint(self, val_loss, model, path):
        """Write the state dict of ``model`` to ``checkpoint.pth`` inside ``path``."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), os.path.join(path, 'checkpoint.pth'))
        self.val_loss_min = val_loss