In [1]:
import sys
if not 'Informer2020' in sys.path:
    sys.path += ['Informer2020']

In [2]:
from data.data_loader import Dataset_Custom
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import os
pd.set_option("display.precision", 8)

In [3]:
from utils.tools import dotdict
from exp.exp_informer import Exp_Informer
import torch

In [14]:
my_data = pd.read_csv('../GD030A_S.csv')

In [15]:
# Define the recover_timestamp function
def recover_timestamp(data):
    # Combine 'date' and 'time' to form a datetime column
    data['datetime'] = pd.to_datetime(data['date'] + ' ' + data['time'].astype(str) + ':00', format='%Y-%m-%d %H:%M')

    # Set 'datetime' as index
    data = data.set_index('datetime')

    # Create a complete range of timestamps with hourly frequency
    full_time_range = pd.date_range(start=data.index.min(), end=data.index.max(), freq='H')

    # Reindex the data to include all timestamps, filling missing rows with NaN
    data_full = data.reindex(full_time_range)

    return data_full

Custom data (xxx.csv) has to include at least 2 features: `date`(format: `YYYY-MM-DD hh:mm:ss`) and `target feature`.

In [26]:
traffic_full = recover_timestamp(my_data)
traffic_full.drop(['date', 'time'], axis=1, inplace=True)
traffic_full = traffic_full.reset_index(names='date')
traffic_full['date'] = traffic_full['date'].astype(str)
traffic_full

Unnamed: 0,date,flow
0,2019-10-01 00:00:00,15.0
1,2019-10-01 01:00:00,9.0
2,2019-10-01 02:00:00,9.0
3,2019-10-01 03:00:00,7.0
4,2019-10-01 04:00:00,9.0
...,...,...
35059,2023-09-30 19:00:00,129.0
35060,2023-09-30 20:00:00,119.0
35061,2023-09-30 21:00:00,106.0
35062,2023-09-30 22:00:00,88.0


In [27]:
#traffic_full.to_csv('traffic_full.csv', index=False)

Sample Data

In [51]:
df = pd.read_csv(os.path.join(args.root_path, args.data_path))
sample_df = df[['date', 'HULL']]
sample_df
#sample_df.to_csv('sample_df.csv', index=False)

Unnamed: 0,date,HULL
0,2016-07-01 00:00:00,2.009
1,2016-07-01 01:00:00,2.076
2,2016-07-01 02:00:00,1.741
3,2016-07-01 03:00:00,1.942
4,2016-07-01 04:00:00,1.942
...,...,...
17415,2018-06-26 15:00:00,3.550
17416,2018-06-26 16:00:00,4.287
17417,2018-06-26 17:00:00,3.818
17418,2018-06-26 18:00:00,3.818


## Prepare

In [4]:
args = dotdict()

args.model = 'informer' # model of experiment, options: [informer, informerstack, informerlight(TBD)]

args.data = 'traffic_full' # data
args.root_path = './' # root path of data file
args.data_path = 'traffic_full.csv' # data file
args.features = 'S' # forecasting task, options:[M, S, MS]; M:multivariate predict multivariate, S:univariate predict univariate, MS:multivariate predict univariate
args.target = 'flow' # target feature in S or MS task
args.freq = 'h' # freq for time features encoding, options:[s:secondly, t:minutely, h:hourly, d:daily, b:business days, w:weekly, m:monthly], you can also use more detailed freq like 15min or 3h
args.checkpoints = './informer_checkpoints' # location of model checkpoints

args.seq_len = 24 # input sequence length of Informer encoder
args.label_len = 24 # start token length of Informer decoder
args.pred_len = 6 # prediction sequence length
# Informer decoder input: concat[start token series(label_len), zero padding series(pred_len)]

args.enc_in = 7 # encoder input size
args.dec_in = 7 # decoder input size
args.c_out = 7 # output size
args.factor = 5 # probsparse attn factor
args.d_model = 512 # dimension of model
args.n_heads = 8 # num of heads
args.e_layers = 2 # num of encoder layers
args.d_layers = 1 # num of decoder layers
args.d_ff = 2048 # dimension of fcn in model
args.dropout = 0.05 # dropout
args.attn = 'prob' # attention used in encoder, options:[prob, full]
args.embed = 'timeF' # time features encoding, options:[timeF, fixed, learned]
args.activation = 'gelu' # activation
args.distil = True # whether to use distilling in encoder
args.output_attention = False # whether to output attention in ecoder
args.mix = True
args.padding = 0
args.freq = 'h'

args.batch_size = 32 
args.learning_rate = 0.0001
args.loss = 'mse'
args.lradj = 'type1'
args.use_amp = False # whether to use automatic mixed precision training

args.num_workers = 0
args.itr = 1
args.train_epochs = 6
args.patience = 3
args.des = 'exp'

args.use_gpu = True if torch.cuda.is_available() else False
args.gpu = 0

args.use_multi_gpu = False
args.devices = '0,1,2,3'

Sample Data

In [5]:
args.data = 'sample_df' # data
args.root_path = './' # root path of data file
args.data_path = 'sample_df.csv' # data file
args.features = 'S' # forecasting task, options:[M, S, MS]; M:multivariate predict multivariate, S:univariate predict univariate, MS:multivariate predict univariate
args.target = 'HULL' # target feature in S or MS task
args.freq = 'h' # freq for time features encoding, options:[s:secondly, t:minutely, h:hourly, d:daily, b:business days, w:weekly, m:monthly], you can also use more detailed freq like 15min or 3h

Data = Dataset_Custom
timeenc = 0 if args.embed!='timeF' else 1
flag = 'train'; shuffle_flag = True; drop_last = True; batch_size = args.batch_size

sample_df = pd.read_csv(os.path.join(args.root_path, args.data_path))

In [6]:
# Get training split (70%)
num_train = int(len(sample_df) * 0.7)
train_data = sample_df['HULL'].iloc[:num_train]

print(f"Training data size: {len(train_data)}")
print(f"Mean: {train_data.mean()}")
print(f"Std: {train_data.std()}")
print(f"Min: {train_data.min()}")
print(f"Max: {train_data.max()}")
print(f"Unique values in first 100: {train_data.head(100).nunique()}")

# Check if there are constant segments
print(f"\nFirst 30 values:\n{train_data.head(30).values}")

Training data size: 12194
Mean: 1.9569892560519682
Std: 2.1130795784159178
Min: -4.75600004196167
Max: 10.11400032043457
Unique values in first 100: 46

First 30 values:
[2.00900006 2.07599998 1.74100006 1.94200003 1.94200003 2.14299989
 2.94700003 3.28200006 3.01399994 2.54500008 2.54500008 2.54500008
 2.54500008 2.6789999  2.94700003 3.148      2.41100001 2.3440001
 2.88000011 3.01399994 3.01399994 2.94700003 2.94700003 2.88000011
 3.08100009 3.01399994 3.148      3.08100009 3.08100009 3.28200006]


My Data

In [6]:
args.data = 'traffic_full' # data
args.root_path = './' # root path of data file
args.data_path = 'traffic_full.csv' # data file
args.features = 'S' # forecasting task, options:[M, S, MS]; M:multivariate predict multivariate, S:univariate predict univariate, MS:multivariate predict univariate
args.target = 'flow' # target feature in S or MS task
args.freq = 'h' # freq for time features encoding, options:[s:secondly, t:minutely, h:hourly, d:daily, b:business days, w:weekly, m:monthly], you can also use more detailed freq like 15min or 3h

Data = Dataset_Custom
timeenc = 0 if args.embed!='timeF' else 1
flag = 'train'; shuffle_flag = True; drop_last = True; batch_size = args.batch_size

traffic_full = pd.read_csv(os.path.join(args.root_path, args.data_path))

In [8]:
# Get training split (70%)
num_train = int(len(traffic_full) * 0.7)
train_data = traffic_full['flow'].iloc[:num_train]

print(f"Training data size: {len(train_data)}")
print(f"Mean: {train_data.mean()}")
print(f"Std: {train_data.std()}")
print(f"Min: {train_data.min()}")
print(f"Max: {train_data.max()}")
print(f"Unique values in first 100: {train_data.head(100).nunique()}")

# Check if there are constant segments
print(f"\nFirst 30 values:\n{train_data.head(30).values}")

Training data size: 24544
Mean: 125.97447115793051
Std: 82.75304428090972
Min: 0.0
Max: 510.0
Unique values in first 100: 76

First 30 values:
[ 15.   9.   9.   7.   9.  34. 122. 156. 254. 223. 151. 193. 162. 180.
 171. 295. 194. 362. 195. 134.  99.  84.  54.  22.  22.  10.  11.   7.
  14.  43.]


In [18]:
data_set = Data(
    root_path=args.root_path,
    data_path=args.data_path,
    flag=flag,
    size=[args.seq_len, args.label_len, args.pred_len],
    features=args.features,
    timeenc=timeenc,
    target=args.target, # HULL here
    freq=args.freq, # 'h': hourly, 't':minutely
    scale=True
)
data_loader = DataLoader(
    data_set,
    batch_size=batch_size,
    shuffle=shuffle_flag,
    num_workers=args.num_workers,
    drop_last=drop_last)

# Use the correct attribute names
print("Mean:", data_set.scaler.mean_)
print("Std:", data_set.scaler.scale_)

# Check the data
seq_x, seq_y, seq_x_mark, seq_y_mark = data_set[0]
print("\nFirst sample seq_x:")
print(seq_x[:5])

Mean: [125.97447116]
Std: [82.7512867]

First sample seq_x:
[[-1.34106037]
 [-1.4135668 ]
 [-1.4135668 ]
 [-1.4377356 ]
 [-1.4135668 ]]


In [17]:
data = np.array([[1.], [2.], [np.nan], [4.], [5.]])
scaler = StandardScaler()
scaler.fit(data)

print("Mean:", scaler.mean_)   # [nan] ← contaminated!
print("Std:", scaler.scale_)   # [nan] ← contaminated!

scaled = scaler.transform(data)
print("Scaled:\n", scaled)

Mean: [3.]
Std: [1.58113883]
Scaled:
 [[-1.26491106]
 [-0.63245553]
 [        nan]
 [ 0.63245553]
 [ 1.26491106]]


In [102]:
len(data_set)

24515

In [13]:
batch_x,batch_y,batch_x_mark,batch_y_mark = data_set[0]

In [14]:
batch_x,batch_y,batch_x_mark,batch_y_mark

(array([[-1.34106037],
        [-1.4135668 ],
        [-1.4135668 ],
        [-1.4377356 ],
        [-1.4135668 ],
        [-1.11145669],
        [-0.04802912],
        [ 0.36284063],
        [ 1.54711224],
        [ 1.17249571],
        [ 0.30241861],
        [ 0.80996359],
        [ 0.43534705],
        [ 0.65286633],
        [ 0.54410669],
        [ 2.04257282],
        [ 0.82204799],
        [ 2.8522279 ],
        [ 0.83413239],
        [ 0.09698373],
        [-0.32597041],
        [-0.50723648],
        [-0.8697686 ],
        [-1.25646954]]),
 array([[-1.34106037],
        [-1.4135668 ],
        [-1.4135668 ],
        [-1.4377356 ],
        [-1.4135668 ],
        [-1.11145669],
        [-0.04802912],
        [ 0.36284063],
        [ 1.54711224],
        [ 1.17249571],
        [ 0.30241861],
        [ 0.80996359],
        [ 0.43534705],
        [ 0.65286633],
        [ 0.54410669],
        [ 2.04257282],
        [ 0.82204799],
        [ 2.8522279 ],
        [ 0.83413239],
        [

In [78]:
traffic_full.head(30)

Unnamed: 0,date,flow
0,2019-10-01 00:00:00,15.0
1,2019-10-01 01:00:00,9.0
2,2019-10-01 02:00:00,9.0
3,2019-10-01 03:00:00,7.0
4,2019-10-01 04:00:00,9.0
5,2019-10-01 05:00:00,34.0
6,2019-10-01 06:00:00,122.0
7,2019-10-01 07:00:00,156.0
8,2019-10-01 08:00:00,254.0
9,2019-10-01 09:00:00,223.0


In [67]:
for i, (batch_x,batch_y,batch_x_mark,batch_y_mark) in enumerate(data_loader):
    print(i)
    print(len(batch_x),len(batch_y),len(batch_x_mark),len(batch_y_mark))
    for j in range(len(batch_x)):
        print(len(batch_x[j]), len(batch_y[j]), len(batch_x_mark[j]), len(batch_y_mark[j]))

0
32 32 32 32
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
1
32 32 32 32
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
2
32 32 32 32
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 30
24 30 24 3