In [1]:
import os
import numpy as np
import pandas as pd
from torch.utils.data import Dataset 
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
from timefeatures import time_features
from torch.utils.data import DataLoader

In [2]:
class TimeLLMDataset(Dataset):
    """Sliding-window, channel-independent forecasting dataset backed by a CSV file.

    The CSV must contain a ``Date`` column; every other column is treated as a
    feature channel (``features`` in ``'M'``/``'MS'``) or only ``target`` is
    kept (``features == 'S'``). Rows are split chronologically into
    70% train / 20% test / remainder validation; the validation and test
    splits start ``seq_len`` rows early so their first window has full history.

    Each sample is a window over a *single* channel: ``len(dataset)`` is
    ``windows_per_channel * n_channels`` and sample ``i`` belongs to channel
    ``i // windows_per_channel``.

    Args:
        root_path: Directory containing the CSV.
        flag: One of ``'train'``, ``'val'``, ``'test'``.
        size: ``[seq_len, label_len, pred_len]``; defaults to ``[384, 96, 96]``.
        features: ``'M'``/``'MS'`` (all non-date columns) or ``'S'`` (target only).
        data_path: CSV file name relative to ``root_path``.
        target: Column used when ``features == 'S'``.
        scale: If true, standardise with statistics fitted on the training rows.
        timeenc: ``0`` for integer calendar fields (month, day, weekday, hour),
            ``1`` to delegate to ``time_features``.
        freq: Frequency string forwarded to ``time_features`` when ``timeenc == 1``.
        percent: Percentage of the training range to keep (training split only).
    """

    def __init__(self, root_path, flag='train', size=None,
                 features='M', data_path='your_dataset.csv',
                 target='target_column', scale=True, timeenc=0, freq='h', percent=100):
        if size is None:
            self.seq_len = 384
            self.label_len = 96
            self.pred_len = 96
        else:
            self.seq_len = size[0]
            self.label_len = size[1]
            self.pred_len = size[2]

        assert flag in ['train', 'test', 'val'], "flag must be 'train', 'val' or 'test'"
        type_map = {'train': 0, 'val': 1, 'test': 2}
        self.set_type = type_map[flag]

        self.features = features
        self.target = target
        self.scale = scale
        self.timeenc = timeenc
        self.freq = freq
        self.percent = percent

        self.root_path = root_path
        self.data_path = data_path
        self.__read_data__()

        self.enc_in = self.data_x.shape[-1]
        # Number of complete (input + horizon) windows per channel. Clamped so a
        # split shorter than one window yields an empty dataset instead of a
        # negative length.
        self.tot_len = max(0, len(self.data_x) - self.seq_len - self.pred_len + 1)

    def __read_data__(self):
        """Load the CSV, pick the split, scale the values and build time markers."""
        # Only build a scaler when it will actually be fitted; an unfitted
        # scaler cannot inverse-transform anything.
        self.scaler = StandardScaler() if self.scale else None
        df_raw = pd.read_csv(os.path.join(self.root_path, self.data_path))

        if 'Date' not in df_raw.columns:
            raise ValueError("input CSV must contain a 'Date' column")
        cols = [col for col in df_raw.columns if col != 'Date']
        df_raw = df_raw[['Date'] + cols]

        # Chronological split: 70% train, 20% test, remainder validation.
        num_train = int(len(df_raw) * 0.7)
        num_test = int(len(df_raw) * 0.2)
        num_vali = len(df_raw) - num_train - num_test
        # val/test start seq_len rows early so the first window has a full input.
        # NOTE(review): if num_train < seq_len the 'val' start becomes negative
        # and slices from the end of the frame - confirm inputs are long enough.
        border1s = [0, num_train - self.seq_len, len(df_raw) - num_test - self.seq_len]
        border2s = [num_train, num_train + num_vali, len(df_raw)]
        border1 = border1s[self.set_type]
        border2 = border2s[self.set_type]

        if self.set_type == 0:
            # Keep only `percent`% of the training range (few-shot style subsampling).
            border2 = (border2 - self.seq_len) * self.percent // 100 + self.seq_len

        if self.features == 'M' or self.features == 'MS':
            df_data = df_raw[df_raw.columns[1:]]
        elif self.features == 'S':
            if self.target not in df_raw.columns:
                raise KeyError(
                    f"target column {self.target!r} not found in {self.data_path}"
                )
            df_data = df_raw[[self.target]]
        else:
            raise ValueError(
                f"features must be 'M', 'MS' or 'S', got {self.features!r}"
            )

        if self.scale:
            # Fit on the full training range only, so no val/test statistics leak in.
            train_data = df_data[border1s[0]:border2s[0]]
            self.scaler.fit(train_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values

        stamps = pd.to_datetime(df_raw['Date'].iloc[border1:border2])
        if self.timeenc == 0:
            # Vectorised calendar fields; keyword-free of the positional `axis`
            # argument that pandas 2.x no longer accepts in DataFrame.drop.
            data_stamp = np.stack(
                [
                    stamps.dt.month.to_numpy(),
                    stamps.dt.day.to_numpy(),
                    stamps.dt.weekday.to_numpy(),
                    stamps.dt.hour.to_numpy(),
                ],
                axis=1,
            ).astype(np.int64)
        elif self.timeenc == 1:
            # time_features is a project helper; its result is transposed so
            # rows line up with data rows - presumably it returns
            # (n_features, n_rows); TODO confirm.
            data_stamp = time_features(pd.to_datetime(stamps.values), freq=self.freq)
            data_stamp = data_stamp.transpose(1, 0)
        else:
            raise ValueError(f"timeenc must be 0 or 1, got {self.timeenc!r}")

        # Inputs and targets are drawn from the same scaled slice.
        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]
        self.data_stamp = data_stamp
        print(f"data_x shape: {self.data_x.shape}")
        print(f"data_y shape: {self.data_y.shape}")

    def __getitem__(self, index):
        """Return ``(seq_x, seq_y, seq_x_mark, seq_y_mark)`` for one channel window.

        ``seq_x`` has ``seq_len`` rows; ``seq_y`` has ``label_len + pred_len``
        rows and overlaps the last ``label_len`` rows of ``seq_x``.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        n_samples = len(self)
        if index < 0:
            index += n_samples
        if not 0 <= index < n_samples:
            # Without this, out-of-range indices silently yield empty windows
            # and plain `for sample in dataset` never terminates.
            raise IndexError(f"index {index} out of range for dataset of size {n_samples}")

        feat_id = index // self.tot_len   # which channel
        s_begin = index % self.tot_len    # window start within the channel

        s_end = s_begin + self.seq_len
        r_begin = s_end - self.label_len
        r_end = r_begin + self.label_len + self.pred_len
        seq_x = self.data_x[s_begin:s_end, feat_id:feat_id + 1]
        seq_y = self.data_y[r_begin:r_end, feat_id:feat_id + 1]
        seq_x_mark = self.data_stamp[s_begin:s_end]
        seq_y_mark = self.data_stamp[r_begin:r_end]

        return seq_x, seq_y, seq_x_mark, seq_y_mark

    def __len__(self):
        """Total number of samples: windows per channel times number of channels."""
        return self.tot_len * self.enc_in

    def inverse_transform(self, data):
        """Undo the standardisation; returns ``data`` unchanged when scaling is off."""
        if self.scaler is None:
            return data
        return self.scaler.inverse_transform(data)


In [3]:
def data_provider(args, flag):
    """Build the dataset and its DataLoader for one split.

    Args:
        args: Namespace-like object providing ``embed``, ``batch_size``,
            ``freq``, ``root_path``, ``data_path``, ``seq_len``, ``label_len``,
            ``pred_len``, ``features``, ``target``, ``percent`` and
            ``num_workers``. An optional ``scale`` attribute is forwarded to
            the dataset (defaults to ``True`` when absent).
        flag: ``'train'``, ``'val'`` or ``'test'``.

    Returns:
        ``(data_set, data_loader)``.
    """
    # 'timeF' selects the time_features encoding; anything else uses integer
    # calendar fields.
    timeenc = 1 if args.embed == 'timeF' else 0

    # Only the test split is iterated in order; incomplete final batches are
    # dropped for every split.
    shuffle_flag = flag != 'test'
    drop_last = True

    data_set = TimeLLMDataset(
        root_path=args.root_path,
        data_path=args.data_path,
        flag=flag,
        size=[args.seq_len, args.label_len, args.pred_len],
        features=args.features,
        target=args.target,
        # Previously args.scale was configured but never reached the dataset.
        scale=getattr(args, 'scale', True),
        timeenc=timeenc,
        freq=args.freq,
        percent=args.percent,
    )

    data_loader = DataLoader(
        data_set,
        batch_size=args.batch_size,
        shuffle=shuffle_flag,
        num_workers=args.num_workers,
        drop_last=drop_last,
    )

    return data_set, data_loader


In [4]:
import argparse

# Run configuration for data_provider, expressed as a plain mapping and
# expanded into a Namespace so attributes are read as args.<name>.
args = argparse.Namespace(**{
    # Where the data lives
    'root_path': './',          # CSV is expected in the current directory
    'data_path': 'NVDA.csv',
    # Window geometry
    'seq_len': 384,
    'label_len': 96,
    'pred_len': 96,
    # Column selection: 'M' = all non-date columns, 'S' = single target column
    'features': 'M',
    'target': None,             # must name a real column when features == 'S'
    # Time-marker encoding and scaling
    'embed': 'timeF',
    'scale': True,
    'percent': 100,
    # DataLoader settings
    'num_workers': 0,
    'batch_size': 32,
    'freq': 'd',
})

In [5]:
# Build (dataset, loader) for each split, in train -> val -> test order.
(train_data, train_loader), (val_data, val_loader), (test_data, test_loader) = (
    data_provider(args, flag=split_name) for split_name in ('train', 'val', 'test')
)

data_x shape: (870, 6)
data_y shape: (870, 6)
data_x shape: (509, 6)
data_y shape: (509, 6)
data_x shape: (632, 6)
data_y shape: (632, 6)


In [6]:
# Peek at a single training batch: report tensor shapes, then the first sample.
for batch in train_loader:
    seq_x, seq_y, seq_x_mark, seq_y_mark = batch
    for label, tensor in (
        ("Input Sequence (seq_x):", seq_x),
        ("Target Sequence (seq_y):", seq_y),
        ("Input Time Markers (seq_x_mark):", seq_x_mark),
        ("Target Time Markers (seq_y_mark):", seq_y_mark),
    ):
        print(label, tensor.shape)
    # First item of the batch: its input window and its target window.
    print("First Batch Input Data:\n", seq_x[0])
    print("First Batch Target Data:\n", seq_y[0])
    break  # one batch is enough for a sanity check

Input Sequence (seq_x): torch.Size([32, 384, 1])
Target Sequence (seq_y): torch.Size([32, 192, 1])
Input Time Markers (seq_x_mark): torch.Size([32, 384, 3])
Target Time Markers (seq_y_mark): torch.Size([32, 192, 3])
First Batch Input Data:
 tensor([[-2.7608e-01],
        [-2.7009e-01],
        [-2.2698e-01],
        [-2.1943e-01],
        [-2.3665e-01],
        [-2.2224e-01],
        [-2.3234e-01],
        [-2.2355e-01],
        [-2.3143e-01],
        [-2.7068e-01],
        [-2.7294e-01],
        [-2.8662e-01],
        [-2.8031e-01],
        [-2.5076e-01],
        [-2.6400e-01],
        [-2.5547e-01],
        [-2.1297e-01],
        [-3.2827e-01],
        [-2.2184e-01],
        [-2.3165e-01],
        [-1.8135e-01],
        [-2.0130e-01],
        [-1.9444e-01],
        [-2.4281e-01],
        [-2.9253e-01],
        [-2.6834e-01],
        [-2.1866e-01],
        [-1.4545e-01],
        [-1.6807e-01],
        [-1.7671e-01],
        [-2.0852e-01],
        [-2.8403e-01],
        [-2.6459e-01],


In [7]:
# Load the raw CSV directly into a DataFrame for inspection (separate from the
# dataset class, which reads the same file name via args.data_path).
df = pd.read_csv("NVDA.csv")

In [8]:
# Display the loaded frame as the cell's output.
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,8/21/2019,4.263250,4.336250,4.241500,4.280750,4.256181,427244000
1,8/22/2019,4.290500,4.333250,4.247500,4.287000,4.262396,303488000
2,8/23/2019,4.210000,4.264750,4.041000,4.061000,4.037692,568056000
3,8/26/2019,4.140250,4.164500,4.097750,4.136250,4.112510,318208000
4,8/27/2019,4.174750,4.177500,4.015500,4.045000,4.021784,290968000
...,...,...,...,...,...,...,...
1238,7/24/2024,119.169998,119.949997,113.440002,114.250000,114.250000,327776900
1239,7/25/2024,113.040001,116.629997,106.300003,112.279999,112.279999,460067000
1240,7/26/2024,116.190002,116.199997,111.580002,113.059998,113.059998,293399100
1241,7/29/2024,113.690002,116.279999,111.300003,111.589996,111.589996,248152100
