In [4]:
from typing import List

import numpy as np
import pandas as pd
from pandas.tseries import offsets
from pandas.tseries.frequencies import to_offset


class TimeFeature:
    """Base class for calendar features derived from a ``pd.DatetimeIndex``.

    Subclasses override ``__call__`` to return one value per timestamp. The
    base implementation computes nothing and returns ``None``.
    """

    def __init__(self):
        # Stateless: nothing to initialise.
        return None

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # Placeholder to be overridden by concrete features.
        return None

    def __repr__(self):
        # Renders as "<ClassName>()", using the concrete subclass name.
        return f"{type(self).__name__}()"


class SecondOfMinute(TimeFeature):
    """Second of minute encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # Scale seconds by 59 (the largest value), then centre around zero.
        fraction = index.second / 59.0
        return fraction - 0.5


class MinuteOfHour(TimeFeature):
    """Minute of hour encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # Scale minutes by 59 (the largest value), then centre around zero.
        fraction = index.minute / 59.0
        return fraction - 0.5


class HourOfDay(TimeFeature):
    """Hour of day encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # Scale hours by 23 (the largest value), then centre around zero.
        fraction = index.hour / 23.0
        return fraction - 0.5


class DayOfWeek(TimeFeature):
    """Day of week encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # dayofweek runs 0..6, so dividing by 6 spans [0, 1] before centring.
        fraction = index.dayofweek / 6.0
        return fraction - 0.5


class DayOfMonth(TimeFeature):
    """Day of month encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # Shift the 1-based day to start at zero, then scale by 30 and centre.
        zero_based_day = index.day - 1
        return zero_based_day / 30.0 - 0.5


class DayOfYear(TimeFeature):
    """Day of year encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # Shift the 1-based day to start at zero; day 366 of a leap year maps to 0.5.
        zero_based_day = index.dayofyear - 1
        return zero_based_day / 365.0 - 0.5


class MonthOfYear(TimeFeature):
    """Month of year encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # Shift the 1-based month to start at zero, then scale by 11 and centre.
        zero_based_month = index.month - 1
        return zero_based_month / 11.0 - 0.5


class WeekOfYear(TimeFeature):
    """Week of year encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # ISO week numbers are 1-based; week 53 maps to the upper bound 0.5.
        iso_week = index.isocalendar().week
        return (iso_week - 1) / 52.0 - 0.5


def time_features_from_frequency_str(freq_str: str) -> List[TimeFeature]:
    """
    Returns a list of time features that will be appropriate for the given frequency string.

    Parameters
    ----------
    freq_str
        Frequency string of the form [multiple][granularity] such as "12H", "5min", "1D" etc.

    Returns
    -------
    List[TimeFeature]
        Freshly constructed feature objects for the first offset type in the
        table below that the parsed frequency is an instance of. Yearly
        frequencies yield an empty list.

    Raises
    ------
    RuntimeError
        If the parsed offset matches none of the supported offset types.
    """

    # Coarser frequencies get fewer features: a feature finer than the sampling
    # step would be constant across the series.
    features_by_offsets = {
        offsets.YearEnd: [],
        offsets.QuarterEnd: [MonthOfYear],
        offsets.MonthEnd: [MonthOfYear],
        offsets.Week: [DayOfMonth, WeekOfYear],
        offsets.Day: [DayOfWeek, DayOfMonth, DayOfYear],
        offsets.BusinessDay: [DayOfWeek, DayOfMonth, DayOfYear],
        offsets.Hour: [HourOfDay, DayOfWeek, DayOfMonth, DayOfYear],
        offsets.Minute: [
            MinuteOfHour,
            HourOfDay,
            DayOfWeek,
            DayOfMonth,
            DayOfYear,
        ],
        offsets.Second: [
            SecondOfMinute,
            MinuteOfHour,
            HourOfDay,
            DayOfWeek,
            DayOfMonth,
            DayOfYear,
        ],
    }

    offset = to_offset(freq_str)

    for offset_type, feature_classes in features_by_offsets.items():
        if isinstance(offset, offset_type):
            return [cls() for cls in feature_classes]

    # The list now includes quarterly, which the table above supports via
    # offsets.QuarterEnd but the original message left out.
    supported_freq_msg = f"""
    Unsupported frequency {freq_str}
    The following frequencies are supported:
        Y   - yearly
            alias: A
        Q   - quarterly
        M   - monthly
        W   - weekly
        D   - daily
        B   - business days
        H   - hourly
        T   - minutely
            alias: min
        S   - secondly
    """
    raise RuntimeError(supported_freq_msg)


def time_features(dates, freq='h'):
    """Evaluate every time feature selected for `freq` on `dates` and stack them.

    Returns the result of ``np.vstack`` over the per-feature outputs, i.e. one
    row per feature in the order given by ``time_features_from_frequency_str``.
    """
    feature_fns = time_features_from_frequency_str(freq)
    rows = [feature_fn(dates) for feature_fn in feature_fns]
    return np.vstack(rows)


In [5]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import StandardScaler
from data_provider.timefeatures import time_features

class Dataset_Custom(Dataset):
    """Sliding-window dataset built from a single CSV of per-country price data.

    The CSV is expected to contain the columns 'Country', 'ISO3 Code',
    'Datetime (Local)', 'Datetime (UTC)' and the `target` column. 'Country' is
    one-hot encoded; the ISO code and local time are dropped.

    Rows are split by position into train / val / test (70% / 10% / 20%).
    NOTE(review): the split is purely positional, so it is only chronological
    if the file is sorted by time across all countries -- confirm.

    Parameters
    ----------
    root_path, data_path
        Joined with ``os.path.join`` to locate the CSV.
    flag
        One of 'train', 'val', 'test'.
    size
        ``[seq_len, label_len, pred_len]``; defaults to ``[384, 96, 96]``.
    features
        'S' uses only the target column; 'M' and 'MS' use every column except
        the timestamp.
    target
        Name of the target column (placed last).
    scale
        If true, standardise with statistics fitted on the training rows only.
    timeenc
        0 for integer month/day/weekday/hour stamps, 1 for ``time_features``.
    freq
        Frequency string forwarded to ``time_features`` when ``timeenc == 1``.

    Raises
    ------
    ValueError
        If `features` or `timeenc` has an unsupported value.
    """

    def __init__(self, root_path, flag='train', size=None,
                 features='S', data_path='all_countries.csv',
                 target='Price (EUR/MWhe)', scale=True, timeenc=0, freq='h'):
        # size [seq_len, label_len, pred_len]
        if size is None:
            self.seq_len = 24 * 4 * 4
            self.label_len = 24 * 4
            self.pred_len = 24 * 4
        else:
            self.seq_len = size[0]
            self.label_len = size[1]
            self.pred_len = size[2]

        assert flag in ['train', 'test', 'val']
        type_map = {'train': 0, 'val': 1, 'test': 2}
        self.set_type = type_map[flag]

        # Fail fast, before any file I/O: previously an unsupported value only
        # surfaced later as an UnboundLocalError inside __read_data__.
        if features not in ('M', 'MS', 'S'):
            raise ValueError(
                "features must be one of 'M', 'MS', 'S', got {!r}".format(features))
        if timeenc not in (0, 1):
            raise ValueError("timeenc must be 0 or 1, got {!r}".format(timeenc))

        self.features = features
        self.target = target
        self.scale = scale
        self.timeenc = timeenc
        self.freq = freq

        self.root_path = root_path
        self.data_path = data_path
        self.__read_data__()
        # Cached so __getitem__ can normalise window positions by dataset length.
        self.len = self.__len__()

    def __read_data__(self):
        """Load the CSV, select and scale the columns, and slice out this split."""
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path, self.data_path))

        # One-hot encode country names. dtype=float keeps the frame purely
        # numeric: newer pandas emits bool dummies, which would make `.values`
        # an object array when mixed with float columns.
        country_encoded = pd.get_dummies(df_raw['Country'], dtype=float)
        df_raw = pd.concat([df_raw, country_encoded], axis=1)
        # Drop columns that are not model inputs.
        df_raw = df_raw.drop(columns=['ISO3 Code', 'Datetime (Local)', 'Country'])

        # Reorder to [timestamp, other columns..., target] so the target is last.
        cols = list(df_raw.columns)
        cols.remove(self.target)
        cols.remove('Datetime (UTC)')
        cols = ['Datetime (UTC)'] + cols + [self.target]
        df_raw = df_raw[cols]

        # Split borders by row count (70% train, 20% test, remainder validation).
        # The val and test windows start seq_len rows early so that their first
        # sample has a full input history.
        num_train = int(len(df_raw) * 0.7)
        num_test = int(len(df_raw) * 0.2)
        num_vali = len(df_raw) - num_train - num_test
        border1s = [0, num_train - self.seq_len, len(df_raw) - num_test - self.seq_len]
        border2s = [num_train, num_train + num_vali, len(df_raw)]
        border1 = border1s[self.set_type]
        border2 = border2s[self.set_type]

        if self.features == 'M' or self.features == 'MS':
            cols_data = df_raw.columns[1:]
            df_data = df_raw[cols_data]
        elif self.features == 'S':
            df_data = df_raw[[self.target]]
        else:
            raise ValueError(
                "features must be one of 'M', 'MS', 'S', got {!r}".format(self.features))

        if self.scale:
            # Fit on the training rows only, whichever split this instance serves.
            train_data = df_data.iloc[border1s[0]:border2s[0]]
            self.scaler.fit(train_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values

        # .copy() makes this an independent frame, so the column assignments
        # below do not trigger pandas' SettingWithCopyWarning.
        df_stamp = df_raw[['Datetime (UTC)']].iloc[border1:border2].copy()
        df_stamp['Datetime (UTC)'] = pd.to_datetime(df_stamp['Datetime (UTC)'])
        if self.timeenc == 0:
            # Vectorised .dt accessors replace the per-row .apply(lambda ...).
            stamp = df_stamp['Datetime (UTC)'].dt
            df_stamp['month'] = stamp.month
            df_stamp['day'] = stamp.day
            df_stamp['weekday'] = stamp.weekday
            df_stamp['hour'] = stamp.hour
            data_stamp = df_stamp.drop(['Datetime (UTC)'], axis=1).values
        elif self.timeenc == 1:
            data_stamp = time_features(pd.to_datetime(df_stamp['Datetime (UTC)'].values), freq=self.freq)
            # time_features stacks one row per feature; transpose to one row per timestamp.
            data_stamp = data_stamp.transpose(1, 0)
        else:
            raise ValueError("timeenc must be 0 or 1, got {!r}".format(self.timeenc))

        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]
        self.data_stamp = data_stamp

    def __getitem__(self, index):
        """Return ``(seq_x, seq_y, seq_x_mark, seq_y_mark, norm_index)`` for one window.

        ``seq_x`` covers ``seq_len`` rows starting at `index`. ``seq_y`` starts
        ``label_len`` rows before the end of ``seq_x`` and spans
        ``label_len + pred_len`` rows. ``norm_index`` holds the row positions of
        the ``seq_len + pred_len`` steps divided by the dataset length.
        """
        s_begin = index
        s_end = s_begin + self.seq_len
        r_begin = s_end - self.label_len
        r_end = r_begin + self.label_len + self.pred_len

        seq_x = self.data_x[s_begin:s_end]
        seq_y = self.data_y[r_begin:r_end]
        seq_x_mark = self.data_stamp[s_begin:s_end]
        seq_y_mark = self.data_stamp[r_begin:r_end]

        index_list = np.arange(index, index + self.seq_len + self.pred_len, 1)
        norm_index = index_list / self.len

        return seq_x, seq_y, seq_x_mark, seq_y_mark, norm_index

    def __len__(self):
        """Number of windows that fit a full input plus prediction horizon."""
        return len(self.data_x) - self.seq_len - self.pred_len + 1

    def inverse_transform(self, data):
        """Undo the standardisation applied by the fitted scaler."""
        return self.scaler.inverse_transform(data)


In [6]:
import os
from torch.utils.data import DataLoader
from data_provider.data_loader import Dataset_Custom

def data_provider(args, flag):
    """Build the `Dataset_Custom` for `flag` and a `DataLoader` over it.

    Batches are shuffled for every split except 'test', and the last incomplete
    batch is always dropped. Prints the split name and dataset length.

    Returns
    -------
    tuple
        ``(data_set, data_loader)``.
    """
    # Only the 'timeF' embedding uses the continuous time-feature encoding.
    timeenc = 1 if args.embed == 'timeF' else 0
    is_test_split = flag == 'test'

    data_set = Dataset_Custom(
        root_path=args.root_path,
        data_path=args.data_path,
        flag=flag,
        size=[args.seq_len, args.label_len, args.pred_len],
        features=args.features,
        target=args.target,
        timeenc=timeenc,
        freq=args.freq,
    )
    print(flag, len(data_set))

    data_loader = DataLoader(
        data_set,
        batch_size=args.batch_size,
        shuffle=not is_test_split,
        num_workers=args.num_workers,
        drop_last=True,
    )
    return data_set, data_loader


In [7]:
import torch.nn as nn
import torch.nn.utils.weight_norm as wn
import torch
import torch.nn.functional as F
import numpy as np

class MLP_bottle(nn.Module):
    """Weight-normalised bottleneck MLP with a linear skip around its first stage.

    Stage one maps ``input_len -> bottleneck -> bottleneck``; a parallel linear
    skip maps ``input_len -> bottleneck``. Their sum goes through a ReLU and
    then stage two, ``bottleneck -> bottleneck -> output_len``.

    `bias` applies to stage one and the skip only; stage two always uses the
    ``nn.Linear`` default.
    """

    def __init__(self,input_len,output_len,bottleneck,bias=True):
        super().__init__()

        def normed_linear(n_in, n_out, **linear_kwargs):
            # Every linear layer in this module is wrapped in weight norm.
            return wn(nn.Linear(n_in, n_out, **linear_kwargs))

        # Submodules are created in the same order as before so that seeded
        # initialisation and state_dict keys are unchanged.
        self.linear1 = nn.Sequential(
            normed_linear(input_len, bottleneck, bias=bias),
            nn.ReLU(),
            normed_linear(bottleneck, bottleneck, bias=bias),
        )
        self.linear2 = nn.Sequential(
            normed_linear(bottleneck, bottleneck),
            nn.ReLU(),
            normed_linear(bottleneck, output_len),
        )
        self.skip = normed_linear(input_len, bottleneck, bias=bias)
        self.act = nn.ReLU()

    def forward(self,x):
        """Apply the MLP along the last dimension of `x`."""
        hidden = self.linear1(x) + self.skip(x)
        return self.linear2(self.act(hidden))

class Coefnet(nn.Module):
    """Stack of `BCAB` blocks followed by a `last_layer` head producing coefficients.

    NOTE(review): `BCAB` and `last_layer` are not defined in the cells shown
    with this class; shapes below are taken from the original inline comments.

    `norm_layer` and `projection` are stored but not used in ``forward``.
    """

    def __init__(self, blocks,d_model,heads,norm_layer=None, projection=None):
        super().__init__()
        self.layers = nn.ModuleList([BCAB(d_model, heads) for _ in range(blocks)])
        self.norm = norm_layer
        self.projection = projection
        self.last_layer = last_layer(d_model, heads)

    def forward(self, basis, series):
        """Refine `basis` and `series` through every block, then score them.

        Inputs are annotated as basis (B,N,d) and series (B,C,d).

        Returns ``(coef, basis_attns, series_attns)``: `coef` is annotated as
        (B,k,C,N), and the two lists hold one attention output per block.
        """
        basis_attns, series_attns = [], []
        for block in self.layers:
            basis, series, block_basis_attn, block_series_attn = block(basis, series)
            basis_attns.append(block_basis_attn)
            series_attns.append(block_series_attn)

        # Note the argument order: the head takes (series, basis).
        coef = self.last_layer(series, basis)
        return coef, basis_attns, series_attns


In [8]:
import torch.nn as nn
import torch.nn.utils.weight_norm as wn
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import time
import math
import numpy as np

class Basisformer(nn.Module):
    """Forecaster that expresses the future window as a weighted sum of learned bases.

    `map_MLP` turns one scalar marker per sample into `basis_nums` basis series
    of length ``seq_len + pred_len``. `Coefnet` compares the history part of the
    bases with the normalised input to produce coefficients, which are then
    applied to the future part of the bases.

    Args:
        seq_len: input window length; `x` passed to ``forward`` must have this many steps.
        pred_len: forecast horizon.
        d_model: width the series and bases are projected to before `Coefnet`.
        heads: number of heads ``k``; also the group count the basis horizon is split into.
        basis_nums: number of bases ``N``.
        block_nums: number of blocks inside `Coefnet`.
        bottle: divisor giving the bottleneck width of the sequence MLPs (``int(len / bottle)``).
        map_bottleneck: bottleneck width of `map_MLP`.
        device: device on which `smooth_arr` and the contrastive labels are placed.
        tau: temperature that divides the contrastive logits.
    """
    def __init__(self,seq_len,pred_len,d_model,heads,basis_nums,block_nums,bottle,map_bottleneck,device,tau):
        super().__init__()
        self.d_model = d_model # model dimension
        self.k = heads # number of attention heads
        self.N = basis_nums # number of basis functions
        self.coefnet = Coefnet(blocks=block_nums,d_model=d_model,heads=heads)
            
        self.pred_len = pred_len # prediction length
        self.seq_len = seq_len # sequence length

        # Bottleneck MLPs acting along the time axis. Only MLP_y and MLP_sy are
        # referenced in forward(); MLP_x and MLP_sx are constructed but not called there.
        self.MLP_x = MLP_bottle(seq_len,heads * int(seq_len/heads),int(seq_len/bottle)) # seq_len -> heads * int(seq_len/heads)
        self.MLP_y = MLP_bottle(pred_len,heads * int(pred_len/heads),int(pred_len/bottle)) # pred_len -> heads * int(pred_len/heads)
        self.MLP_sx = MLP_bottle(heads * int(seq_len/heads),seq_len,int(seq_len/bottle)) # heads * int(seq_len/heads) -> seq_len
        self.MLP_sy = MLP_bottle(heads * int(pred_len/heads),pred_len,int(pred_len/bottle)) # heads * int(pred_len/heads) -> pred_len
        
        # Weight-normalised linear layers projecting the time axis to d_model:
        # project1 / project2 act on the history (series / bases),
        # project3 / project4 act on the horizon (series / bases).
        self.project1 = wn(nn.Linear(seq_len,d_model))
        self.project2 = wn(nn.Linear(seq_len,d_model))
        self.project3 = wn(nn.Linear(pred_len,d_model))
        self.project4 = wn(nn.Linear(pred_len,d_model))
        # Neither criterion is referenced in forward().
        self.criterion1 = nn.MSELoss()
        self.criterion2 = nn.L1Loss(reduction='none')
        
        self.device = device # device used for smooth_arr and the contrastive labels
                        
        # Second-difference operator: row i is (-1, 2, -1) at columns i, i+1, i+2,
        # so applying it along time measures the curvature of each basis.
        arr = torch.zeros((seq_len+pred_len-2,seq_len+pred_len))
        for i in range(seq_len+pred_len-2):
            arr[i,i]=-1
            arr[i,i+1] = 2
            arr[i,i+2] = -1
        # Stored as a plain tensor attribute, not a registered buffer, so
        # model.to(...) will not move it; it stays on `device`.
        self.smooth_arr = arr.to(device)

        # Basis generator: maps one scalar per sample to all N bases over the
        # full history + horizon window.
        self.map_MLP = MLP_bottle(1, # input dim
                                  self.N*(self.seq_len+self.pred_len), #output dim
                                  map_bottleneck, # hidden layer size for the MLP
                                  bias=True) 
        self.tau = tau # temperature parameter 
        self.epsilon = 1E-5 # to avoid division by zero
        
    def forward(self,x,mark,y=None,train=True,y_mark=None):
        """Forecast the next `pred_len` steps and, when training, the auxiliary losses.

        Args:
            x: history of shape (B, L, C); L must equal `seq_len` because
                `project1` is a Linear over that axis.
            mark: per-sample marker; only column 0 is used (``mark[:,0]``).
                Presumably the normalised window index produced by the
                dataset -- confirm against the caller.
            y: target horizon (B, pred_len, C). Despite the ``None`` default it
                is dereferenced in both branches, so it must always be given.
            train: selects which tuple is returned.
            y_mark: accepted but not used.

        Returns:
            If `train`: ``(output, l_entropy, l_smooth, attn_x1, attn_x2, attn_y1, attn_y2)``.
            Otherwise: ``(output, m, attn_x1, attn_x2, attn_y1, attn_y2)``, where `m`
            holds the normalised bases. `output` has shape (B, pred_len, C).
        """
        # per-sample, per-channel normalisation over the time axis
        mean_x = x.mean(dim=1,keepdim=True)
        std_x = x.std(dim=1,keepdim=True)
        feature = (x - mean_x) / (std_x + self.epsilon)
        # move time last so the Linear projection acts on it
        B,L,C = feature.shape
        feature = feature.permute(0,2,1)
        feature = self.project1(feature)   #(B,C,d)
        
        # generate the bases from the scalar marker
        m = self.map_MLP( # (B,1) -> (B, N*(seq_len+pred_len))
            mark[:,0].unsqueeze(1) # first marker column, shaped (B,1)
                         ).reshape(B,self.seq_len + self.pred_len,self.N) # (B, seq_len+pred_len, N)
        
        # scale every basis to (approximately) unit L2 norm over time
        m = m / torch.sqrt(torch.sum(m**2,dim=1,keepdim=True)+self.epsilon)
        
        # split the bases into history and horizon parts, time axis moved last
        raw_m1 = m[:,:self.seq_len].permute(0,2,1)  #(B,N,seq_len) history part of the bases
        raw_m2 = m[:,self.seq_len:].permute(0,2,1)   #(B,N,pred_len) horizon part of the bases
        # permute(0,2,1) puts time last so the Linear / MLP layers act along it

        m1 = self.project2(raw_m1)    #(B,N,d) history bases projected to the model dimension
        
        # coefficients between history bases and the input series
        score,attn_x1,attn_x2 = self.coefnet(m1,feature)    #(B,k,C,N) 
        # score weights each basis for each channel, per head (shape as
        # annotated in the original code; Coefnet's head is defined elsewhere)


        # combine the horizon bases with the coefficients
        base = self.MLP_y(raw_m2).reshape(B,self.N,self.k,-1).permute(0,2,1,3)   #(B,k,N,pred_len/k)
        out = torch.matmul(score,base).permute(0,2,1,3).reshape(B,C,-1)  #(B,C,k * (pred_len/k))
        out = self.MLP_sy(out).reshape(B,C,-1).permute(0,2,1)   #(B,pred_len,C)
        
        # undo the input normalisation
        output = out * (std_x + self.epsilon) + mean_x

        # auxiliary losses
        if train:
            # smoothness: mean absolute second difference of the bases over time
            l_smooth = torch.einsum('xl,bln->xbn',self.smooth_arr,m)
            l_smooth = abs(l_smooth).mean()
            # l_smooth = self.criterion1(l_smooth,torch.zeros_like(l_smooth))
            
            # coefficients computed from the target horizon
            mean_y = y.mean(dim=1,keepdim=True)
            std_y = y.std(dim=1,keepdim=True)
            feature_y_raw = (y - mean_y) / (std_y + self.epsilon)
            
            feature_y = feature_y_raw.permute(0,2,1)
            feature_y = self.project3(feature_y)   #(B,C,d)
            m2 = self.project4(raw_m2)    #(B,N,d)
            
            score_y,attn_y1,attn_y2 = self.coefnet(m2,feature_y)    #(B,k,C,N)
            logit_q = score.permute(0,2,3,1) #(B,C,N,k)
            logit_k = score_y.permute(0,2,3,1) #(B,C,N,k)

            # l_pos = torch.bmm(logit_q.view(-1,1,self.k), logit_k.view(-1,self.k,1)).reshape(-1,1)  #(B*C*N,1,1)
            # similarity of every history-side basis coefficient vector with every
            # horizon-side one: (B*C,N,k) x (B*C,k,N) -> (B*C,N,N), flattened to (B*C*N,N)
            l_neg = torch.bmm(logit_q.reshape(-1,self.N,self.k), logit_k.reshape(-1,self.N,self.k).permute(0,2,1)).reshape(-1,self.N)

            # the matching basis index (the diagonal) is the target class for each row
            labels = torch.arange(0,self.N,1,dtype=torch.long).unsqueeze(0).repeat(B*C,1).reshape(-1)

            labels = labels.to(self.device)

            # temperature-scaled cross entropy (the training script treats this as its InfoNCE term)
            cross_entropy_loss = nn.CrossEntropyLoss()
            l_entropy = cross_entropy_loss(l_neg/self.tau, labels)           
            
            return output,l_entropy,l_smooth,attn_x1,attn_x2,attn_y1,attn_y2
        else:
            # same horizon-side pass as in training, run only for its attention outputs
            mean_y = y.mean(dim=1,keepdim=True)
            std_y = y.std(dim=1,keepdim=True)
            feature_y_raw = (y - mean_y) / (std_y + self.epsilon)
            
            feature_y = feature_y_raw.permute(0,2,1)
            feature_y = self.project3(feature_y)   #(B,C,d)
            m2 = self.project4(raw_m2)    #(B,N,d)
            
            score_y,attn_y1,attn_y2 = self.coefnet(m2,feature_y)    #(B,k,C,N)
            return output,m,attn_x1,attn_x2,attn_y1,attn_y2      
        
        


In [9]:
import numpy as np


def RSE(pred, true):
    """Root relative squared error.

    The L2 norm of the residual divided by the L2 norm of `true` around its
    overall mean.
    """
    residual_norm = np.sqrt(np.sum((true - pred) ** 2))
    baseline_norm = np.sqrt(np.sum((true - true.mean()) ** 2))
    return residual_norm / baseline_norm


def CORR(pred, true):
    """Pearson correlation between `pred` and `true` along axis 0, averaged over the last axis.

    The denominator is ``sqrt(sum(a**2) * sum(b**2))`` for the centred series
    ``a`` and ``b``. The previous version used ``sqrt(sum(a**2 * b**2))``, which
    is not the Pearson normaliser and could return values outside [-1, 1].

    A column with zero variance in either input gives a zero denominator, so
    its entry is nan or inf, as before.
    """
    true_centered = true - true.mean(0)
    pred_centered = pred - pred.mean(0)
    u = (true_centered * pred_centered).sum(0)
    d = np.sqrt((true_centered ** 2).sum(0) * (pred_centered ** 2).sum(0))
    return (u / d).mean(-1)


def MAE(pred, true):
    """Mean absolute error over all elements."""
    abs_error = np.abs(pred - true)
    return np.mean(abs_error)


def MSE(pred, true):
    """Mean squared error over all elements."""
    error = pred - true
    return np.mean(error ** 2)


def RMSE(pred, true):
    """Root mean squared error: the square root of `MSE`."""
    mean_squared = MSE(pred, true)
    return np.sqrt(mean_squared)


def MAPE(pred, true):
    """Mean absolute percentage error, as a fraction (not multiplied by 100).

    Elements where `true` is zero divide by zero and therefore contribute
    inf or nan to the mean.
    """
    relative_error = (pred - true) / true
    return np.mean(np.abs(relative_error))


def MSPE(pred, true):
    """Mean squared percentage error, as a fraction (not multiplied by 100).

    Elements where `true` is zero divide by zero and therefore contribute
    inf or nan to the mean.
    """
    relative_error = (pred - true) / true
    return np.mean(np.square(relative_error))


def metric(pred, true):
    """Compute the standard error metrics for a forecast.

    Returns the tuple ``(mae, mse, rmse, mape, mspe)``.
    """
    scorers = (MAE, MSE, RMSE, MAPE, MSPE)
    return tuple(scorer(pred, true) for scorer in scorers)


In [10]:
import argparse
import torch
import sys
sys.path.append('.')
from data_provider.data_factory import data_provider
from torch import optim
from model import Basisformer
from torch import nn
import time
import numpy as np
from evaluate_tool import metric
import os
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from pyplot import plot_seq_feature
from adabelief_pytorch import AdaBelief
import logging
import random


def vali(vali_data, vali_loader, criterion, epoch, writer, flag='vali'):
    """Evaluate the global `model` on `vali_loader` and return the mean batch loss.

    Relies on the module-level `model`, `device` and `args`. The model is
    switched to eval mode for the pass and back to train mode afterwards. The
    first batch is also plotted to TensorBoard under ``figure_<flag>``.

    Args:
        vali_data: dataset behind the loader (unused; kept for call compatibility).
        vali_loader: loader yielding ``(x, y, x_mark, y_mark, index)`` batches.
        criterion: loss called on CPU ``(pred, true)``; its output is averaged.
        epoch: global step used for the TensorBoard figure.
        writer: TensorBoard ``SummaryWriter``.
        flag: label used in the figure tag and passed to the plotting helper.

    Returns:
        Average of the per-batch losses.
    """
    total_loss = []
    model.eval()
    # 'MS' keeps only the last channel; every other mode keeps all channels.
    f_dim = -1 if args.features == 'MS' else 0
    with torch.no_grad():
        for i, (batch_x, batch_y, batch_x_mark, batch_y_mark,index) in enumerate(vali_loader):
            batch_x = batch_x.float().to(device)
            # Only the forecast horizon is compared against the model output.
            batch_y = batch_y.float()[:, -args.pred_len:, f_dim:].to(device)
            batch_y_mark = batch_y_mark.float().to(device)

            outputs, *_ = model(batch_x,index.float().to(device),batch_y,train=False,y_mark=batch_y_mark)

            pred = outputs.detach().cpu()
            true = batch_y.detach().cpu()

            # Store plain floats, as the training loop does, not 0-d tensors.
            loss = criterion(pred, true).mean()
            total_loss.append(loss.item())

            if i == 0:
                fig = plot_seq_feature(outputs, batch_y, batch_x, flag)
                writer.add_figure("figure_{}".format(flag), fig, global_step=epoch)

    total_loss = np.average(total_loss)

    model.train()
    return total_loss

def train():
    """Train the global `model` with early stopping on the validation loss.

    Relies on the module-level `model`, `args`, `device` and `record_dir`.
    Each epoch optimises ``lam1 * prediction + lam2 * infonce + lam3 * smooth``,
    then evaluates on the validation and test loaders, logs to TensorBoard, and
    writes two checkpoints under ``record_dir/<args.check_point>``:
    ``valid_best_checkpoint.pth`` (best validation loss so far) and
    ``final_checkpoint.pth`` (latest epoch). Training stops once the validation
    loss has failed to improve for ``args.patience`` consecutive epochs.

    Raises:
        ValueError: if the training loader yields no batches.
    """
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    log_and_print('[Info] Number of parameters: {}'.format(num_params))

    # data sets and their corresponding loaders
    train_set, train_loader = data_provider(args, "train")
    vali_data, vali_loader = data_provider(args,flag='val')
    test_data, test_loader = data_provider(args,flag='test')

    # number of batches per epoch
    train_steps = len(train_loader)
    if train_steps == 0:
        # Without a batch there is nothing to optimise, and the end-of-epoch
        # plot would fail on undefined variables.
        raise ValueError('training loader is empty: dataset is smaller than one batch')
    # Log about five times per epoch. The max() guards against a modulo by
    # zero when an epoch has fewer than five batches.
    log_every = max(train_steps // 5, 1)

    # The basis generator (map_MLP) trains with its own fixed learning rate.
    para1 = [param for name,param in model.named_parameters() if 'map_MLP' in name]
    para2 = [param for name,param in model.named_parameters() if 'map_MLP' not in name]
    optimizer = AdaBelief([{'params':para1,'lr':5e-3},{'params':para2,'lr':args.learning_rate}], eps=1e-16, betas=(0.9,0.999), weight_decouple = True, rectify = False)

    criterion = nn.MSELoss()
    criterion_view = nn.MSELoss(reduction='none')

    # loop-invariant settings
    f_dim = -1 if args.features == 'MS' else 0
    lam1 = args.loss_weight_prediction
    lam2 = args.loss_weight_infonce
    lam3 = args.loss_weight_smooth

    ckpt_path = os.path.join(record_dir,args.check_point)
    os.makedirs(ckpt_path, exist_ok=True)

    # early-stopping state; None means "no validation result yet"
    best_loss = None
    count = 0

    # TensorBoard writer; closed in the finally block so events are flushed
    # even if training is interrupted.
    writer = SummaryWriter(os.path.join(record_dir,'event'))
    try:
        for epoch in range(args.train_epochs):
            train_loss = []
            loss_pred = []
            loss_of_ce = []
            l_s = []
            model.train()
            epoch_time = time.time()

            for i, (batch_x, batch_y, batch_x_mark, batch_y_mark,index) in enumerate(train_loader):
                optimizer.zero_grad()

                batch_x = batch_x.float().to(device) # (B,L,C)
                batch_y_mark = batch_y_mark.float().to(device)
                # keep only the forecast horizon (and the target channel for 'MS')
                batch_y = batch_y.float().to(device)[:, -args.pred_len:, f_dim:]

                # forward pass: prediction plus the two auxiliary losses
                outputs,loss_infonce,loss_smooth,attn_x1,attn_x2,attn_y1,attn_y2 = model(batch_x,index.float().to(device),batch_y,y_mark=batch_y_mark)

                loss_p = criterion(outputs, batch_y)
                loss = lam1 * loss_p + lam2 * loss_infonce  + lam3 * loss_smooth
                train_loss.append(loss.item())
                loss_pred.append(loss_p.item())
                loss_of_ce.append(loss_infonce.item())
                l_s.append(loss_smooth.item())

                loss.backward()
                optimizer.step()

                if (i+1) % log_every == 0:
                    log_and_print("\titers: {0}, epoch: {1} | loss: {2:.7f}".format(i + 1, epoch + 1, loss.item()))

            # per-epoch summary
            log_and_print("Epoch: {} cost time: {}".format(epoch + 1, time.time() - epoch_time))
            train_loss = np.average(train_loss)
            loss1 = np.average(loss_pred)
            log_and_print('loss_pred:{0}'.format(loss1))
            loss2 = np.average(loss_of_ce)
            log_and_print('loss entropy:{0}'.format(loss2))
            loss3 = np.average(l_s)
            log_and_print('loss smooth:{0}'.format(loss3))
            vali_loss = vali(vali_data, vali_loader, criterion_view, epoch, writer, 'vali')
            test_loss = vali(test_data, test_loader, criterion_view, epoch, writer, 'test')
            log_and_print("Epoch: {0} | Train Loss: {1:.7f} Vali Loss: {2:.7f} Test Loss: {3:.7f}".format(
                epoch + 1, train_loss, vali_loss, test_loss))

            # TensorBoard: plot of the last training batch plus the scalar curves
            fig = plot_seq_feature(outputs, batch_y, batch_x)
            writer.add_figure("figure_train", fig, global_step=epoch)
            writer.add_scalar('train_loss', train_loss, global_step=epoch)
            writer.add_scalar('vali_loss', vali_loss, global_step=epoch)
            writer.add_scalar('test_loss', test_loss, global_step=epoch)

            # The first epoch always counts as an improvement; afterwards only
            # a strictly lower validation loss does.
            if best_loss is None or vali_loss < best_loss:
                best_loss = vali_loss
                torch.save(model.state_dict(), os.path.join(ckpt_path, 'valid_best_checkpoint.pth'))
                count = 0
            else:
                count = count + 1
            # always keep the latest weights as well
            torch.save(model.state_dict(), os.path.join(ckpt_path, 'final_checkpoint.pth'))
            # stop once the validation loss has stalled for `patience` epochs
            if count >= args.patience:
                break
    finally:
        writer.close()
    return

In [12]:
def log_and_print(text):
    """Send `text` to the root logger at INFO level, then echo it to stdout."""
    logging.info(text)
    print(text)

In [13]:
import argparse

def str2bool(value):
    """Convert a command-line token to a bool.

    ``argparse`` with ``type=bool`` calls ``bool(token)``, which is True for
    every non-empty string, so ``--is_training False`` used to enable training.
    This converter accepts the usual spellings. The empty string still maps to
    False, as it did with ``bool``.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got {!r}'.format(value))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Time series prediction - Basisformer')
    parser.add_argument('--is_training', type=str2bool, default=True, help='train or test')
    parser.add_argument('--device', type=int, default=0, help='gpu dvice')

    # data loader
    parser.add_argument('--num_workers', type=int, default=0, help='data loader num workers')
    parser.add_argument('--data', type=str, default='custom', help='dataset type')
    parser.add_argument('--root_path', type=str, default='data', help='root path of the data file')
    parser.add_argument('--data_path', type=str, default='all_countries.csv', help='data file')
    parser.add_argument('--features', type=str, default='M',
                        help='forecasting task, options:[M, S, MS]; M:multivariate predict multivariate, '
                            'S:univariate predict univariate, MS:multivariate predict univariate')
    parser.add_argument('--target', type=str, default='Price (EUR/MWhe)', help='target feature in S or MS task')
    parser.add_argument('--freq', type=str, default='h',
                    help='freq for time features encoding, options:[s:secondly, t:minutely, h:hourly, d:daily, '
                         'b:business days, w:weekly, m:monthly], you can also use more detailed freq like 15min or 3h')

    # forecasting task
    parser.add_argument('--seq_len', type=int, default=96, help='input sequence length')
    parser.add_argument('--label_len', type=int, default=96, help='start token length')
    parser.add_argument('--pred_len', type=int, default=96, help='prediction sequence length')

    # model define
    parser.add_argument('--embed', type=str, default='timeF',
                        help='time features encoding, options:[timeF, fixed, learned]')
    parser.add_argument('--heads', type=int, default=16, help='head in attention')
    parser.add_argument('--d_model', type=int, default=100, help='dimension of model')
    parser.add_argument('--N', type=int, default=10, help='number of learnable basis')
    parser.add_argument('--block_nums', type=int, default=2, help='number of blocks')
    parser.add_argument('--bottleneck', type=int, default=2, help='reduction of bottleneck')
    parser.add_argument('--map_bottleneck', type=int, default=20, help='reduction of mapping bottleneck')

    # optimization
    parser.add_argument('--train_epochs', type=int, default=1, help='train epochs')
    parser.add_argument('--batch_size', type=int, default=32, help='batch size of train input data')
    parser.add_argument('--patience', type=int, default=1, help='early stopping patience')
    parser.add_argument('--learning_rate', type=float, default=5e-4, help='optimizer learning rate')
    parser.add_argument('--tau', type=float, default=0.07, help='temperature of infonce loss')
    parser.add_argument('--loss_weight_prediction', type=float, default=1.0, help='weight of prediction loss')
    parser.add_argument('--loss_weight_infonce', type=float, default=1.0, help='weight of infonce loss')
    parser.add_argument('--loss_weight_smooth', type=float, default=1.0, help='weight of smooth loss')

    # checkpoint path
    parser.add_argument('--check_point',type=str,default='checkpoint',help='check point path, relative path')

    # parse_known_args ignores the extra arguments the notebook kernel puts in sys.argv
    args, unknown = parser.parse_known_args()

    # One output directory per (data file, feature mode, window lengths).
    record_dir = os.path.join('records',args.data_path.split('.')[0],'features_'+args.features,\
                              'seq_len'+str(args.seq_len)+','+'pred_len'+str(args.pred_len))
    os.makedirs(record_dir, exist_ok=True)

    if args.is_training:
        logger_file = os.path.join(record_dir,'train.log')
    else:
        logger_file = os.path.join(record_dir,'test.log')

    # Start every run from an empty log file; opening in "w" mode truncates it.
    with open(logger_file, "w"):
        pass
    logging.basicConfig(filename=logger_file, level=logging.INFO)

    log_and_print('Args in experiment:')
    log_and_print(args)

    # NOTE(review): init_dl_program is not defined in the cells shown here;
    # it must already exist in the kernel whenever CUDA is available.
    device = init_dl_program(args.device, seed=0,max_threads=8) if torch.cuda.is_available() else "cpu"
    model = Basisformer(args.seq_len,args.pred_len,args.d_model,args.heads,args.N,args.block_nums,args.bottleneck,args.map_bottleneck,device,args.tau)

    log_and_print(model)
    model.to(device)

Args in experiment:
Namespace(N=10, batch_size=32, block_nums=2, bottleneck=2, check_point='checkpoint', d_model=100, data='custom', data_path='all_countries.csv', device=0, embed='timeF', features='M', freq='h', heads=16, is_training=True, label_len=96, learning_rate=0.0005, loss_weight_infonce=1.0, loss_weight_prediction=1.0, loss_weight_smooth=1.0, map_bottleneck=20, num_workers=0, patience=3, pred_len=96, root_path='data', seq_len=96, target='Price (EUR/MWhe)', tau=0.07, train_epochs=1)
Basisformer(
  (coefnet): Coefnet(
    (layers): ModuleList(
      (0-1): 2 x BCAB(
        (cross_attention_basis): channel_AutoCorrelationLayer(
          (query_projection): Linear(in_features=100, out_features=96, bias=True)
          (key_projection): Linear(in_features=100, out_features=96, bias=True)
          (value_projection): Linear(in_features=100, out_features=96, bias=True)
          (out_projection): Linear(in_features=96, out_features=100, bias=True)
          (attend): Softmax(dim=-



In [None]:
## defining new parameter values
# The overrides go straight to parse_args(). Assigning them to sys.argv made
# argparse treat the first entry ('--batch_size=16') as the program name and
# drop it, so the batch size silently stayed at its default.
cli_overrides = ['--batch_size=16',
                 '--seq_len=24',
                 '--label_len=24',
                 '--pred_len=24',
                 '--train_epochs=1',
                 '--heads=2',
                 '--d_model=4',
                 '--block_nums=1',
                 '--N=5',
                 '--bottleneck=1',
                 '--map_bottleneck=10']
## parsing new values
args = parser.parse_args(cli_overrides)

## the output directory encodes seq_len / pred_len, so rebuild it for the new
## values; otherwise this run's events and checkpoints land in the old folder.
## (The log file configured earlier via logging.basicConfig is not moved.)
record_dir = os.path.join('records',args.data_path.split('.')[0],'features_'+args.features,\
                          'seq_len'+str(args.seq_len)+','+'pred_len'+str(args.pred_len))
os.makedirs(record_dir, exist_ok=True)

## initializing device and model
device = init_dl_program(args.device, seed=0, max_threads=8) if torch.cuda.is_available() else "cpu"
model = Basisformer(args.seq_len, args.pred_len, args.d_model, args.heads, args.N, args.block_nums, args.bottleneck, args.map_bottleneck, device, args.tau)
# The freshly built model starts on the CPU; move it to where the batches go.
model.to(device)

## starting
if args.is_training:
    train()
else:
    # NOTE(review): no `test` function is defined in the cells shown here --
    # confirm it exists before running with --is_training False.
    test()