In [43]:
import numpy as np, pandas as pd
import os, sys
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import joblib

In [44]:
data_dir = './data' 

# Sine Data

In [80]:
selected = 'sine'

In [81]:
def gen_sine_data(no, seq_len, dim):   
    """Sine data generation.
    Args:
    - no: the number of samples
    - seq_len: sequence length of the time-series
    - dim: feature dimensions

    Returns:
    - data: generated data
    """ 
    size = (no, 1, dim)
    freq = np.random.uniform(0, 1, size)    
    phase = np.random.uniform(-1, 1, size)
    
    seq = np.arange(seq_len)
    seq = np.expand_dims(seq, axis=0)
    seq = np.expand_dims(seq, axis=-1)
    
    data = np.sin(freq * seq + phase)    
#     data = (data + 1) * 0.5
    return data

In [82]:
N, T, D = 10000, 24, 5 

In [83]:
TEST_PERC = 0.05
TRAIN_PERC = 1- TEST_PERC

perc_of_train_data = [5, 10, 20, 100]

In [85]:
sine_datasets = []
for p in perc_of_train_data:
    sine_data = gen_sine_data(int(N * p / 100), T, D)
    sine_data = np.array(sine_data)
    
    sine_datasets.append(sine_data)
    #print(sine_data.shape)
    
    fname = f'{selected}_subsampled_train_perc_{p}.npy'
    full_path = os.path.join(data_dir, fname)
    print(full_path)
    
    np.save(full_path, sine_data)

./data\sine_subsampled_train_perc_5.npy
./data\sine_subsampled_train_perc_10.npy
./data\sine_subsampled_train_perc_20.npy
./data\sine_subsampled_train_perc_100.npy


# MinMax Scale and Save

# Custom scaler for 3d data
class TSMinMaxScaler():
    '''Scales history and forecast parts of time-series based on history data'''
    def __init__(self, forecast_len, input_dim, upper_bound = 5.):         
        self.forecast_len = forecast_len
        self.min_vals_per_d = None      
        self.max_vals_per_d = None  
        self.input_dim = input_dim
        self.upper_bound = upper_bound
        

    def fit(self, X, y=None): 
        curr_len = X.shape[1]
        self.scaling_len = curr_len - self.forecast_len
        # print(self.scaling_len); sys.exit()

        if self.scaling_len < 1: 
            msg = f''' Error scaling series. 
            scaling_len needs to be at least 2. Given length is {self.scaling_len}.  '''
            raise Exception(msg)

        self.min_vals_per_d = np.expand_dims( X[ :,  : self.scaling_len , : ].min(axis=1), axis = 1)
        self.max_vals_per_d = np.expand_dims( X[ :,  : self.scaling_len , : ].max(axis=1), axis = 1)
        self.range_per_d = self.max_vals_per_d - self.min_vals_per_d

        self.range_per_d = np.where(self.range_per_d == 0, 1e-5, self.range_per_d)
              
        return self
    
    def transform(self, X, y=None):         
        assert X.shape[0] == self.min_vals_per_d.shape[0], "Error: Dimension of array to scale doesn't match fitted array."
        assert X.shape[2] == self.min_vals_per_d.shape[2], "Error: Dimension of array to scale doesn't match fitted array."
         
        X = X - self.min_vals_per_d
        X = np.divide(X, self.range_per_d )        
        X[:, :, :self.input_dim] = np.where( X[:, :, :self.input_dim] < self.upper_bound, X[:, :, :self.input_dim], self.upper_bound)
        return X
    
    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)
        

    def inverse_transform(self, X):
        X[:, :, : self.input_dim] = X[:, :, : self.input_dim ] * self.range_per_d[:, :, : self.input_dim] 
        X[:, :, : self.input_dim] = X[:, :, : self.input_dim] + self.min_vals_per_d[:, :, : self.input_dim]
        # print(X.shape)
        return X

scaled_datasets = []
for p, d in zip(perc_of_train_data, sine_datasets):
    scaler = TSMinMaxScaler(
        forecast_len = 20,
        input_dim = D,
        upper_bound = 5.0
    )    
    
    scaled_data = scaler.fit_transform(d)
    np.save(os.path.join(data_dir, f'{selected}_scaled_train_perc_{p}.npy'), scaled_data)
    joblib.dump(scaler, os.path.join(data_dir, f'{selected}_scaler_perc_{p}.save'))   

# Energy and Stock Data

In [50]:
data_dict = {
    'energy': {
        'file': 'Energy-energydata_complete.csv',
        'time_col': 'date',
    },
    'stocks': {
        'file': 'stock_data.csv',
        'time_col': None, 
    }
}

### Choose Data

In [60]:
# selected = 'energy'
selected = 'stocks'

In [61]:
file_path = os.path.join(data_dir, data_dict[selected]['file'])

if data_dict[selected]['time_col'] is not None: 
    data = pd.read_csv(file_path, parse_dates=[data_dict[selected]['time_col']])
else:
    data = pd.read_csv(file_path)
print(data.head())

        Open       High        Low      Close  Adj_Close    Volume
0  49.676899  51.693783  47.669952  49.845802  49.845802  44994500
1  50.178635  54.187561  49.925285  53.805050  53.805050  23005800
2  55.017166  56.373344  54.172661  54.346527  54.346527  18393200
3  55.260582  55.439419  51.450363  52.096165  52.096165  15361800
4  52.140873  53.651051  51.604362  52.657513  52.657513   9257400


In [62]:
print("data shape: ", data.shape)

data shape:  (3685, 6)


### Sort data by time

In [63]:
time_col = data_dict[selected]['time_col']
print(time_col)

None


In [64]:
if time_col in data: 
    data.sort_values(by=time_col, inplace=True)
    data.reset_index(drop=True, inplace=True)

### Delete the time col

In [65]:
if time_col in data: del data[time_col]
print("data shape: ", data.shape)

data shape:  (3685, 6)


### Train Test Split

In [66]:
N = data.shape[0]

frac_datasets = []
for p in perc_of_train_data: 
    
    num_test = int(N * TEST_PERC)   # to be held-out
    orig_num_train = N - num_test
    
    test_data = data.tail(num_test)  # to be used for evaluation later
    
    orig_train_data = data.loc[np.arange(orig_num_train)]
    #print(orig_train_data.shape)
    
    num_train_frac = int(orig_num_train * p / 100.)
    #print(f'Num train steps for p = {p} is {num_train_frac}')
    
    train_data_frac = data.tail(num_train_frac) 
    frac_datasets.append(train_data_frac)
    print(f'frac:{p}%, train_data_frac shape: {train_data_frac.shape}' )
    
    #np.save(os.path.join(data_dir, f'{selected}_train_perc_{p}.npy'), train_data_frac)

frac:20%, train_data_frac shape: (700, 6)


### Min Max Scale Data and Save Scaler

scaled_frac_datasets = []
for p, d_tup in zip(perc_of_train_data, frac_datasets):
    
    train_data, test_data = d_tup    
    
    scaler = MinMaxScaler()
    train_data_arr = scaler.fit_transform(train_data)
    test_data_arr = scaler.transform(test_data)   
    
    scaled_frac_datasets.append(train_data_arr)
    
    
    np.save(os.path.join(data_dir, f'{selected}_scaled_test_perc_{p}.npy'), test_data_arr)
    
    joblib.dump(scaler, os.path.join(data_dir, f'{selected}_scaler_perc_{p}.save'))     
    

### Convert to 3D tensors
shape = N, T, D 

In [67]:
for i, d in enumerate(frac_datasets):
    frac_datasets[i] = d.values.reshape(1, *d.shape)
    print("reshaped: ", frac_datasets[i].shape)

reshaped:  (1, 700, 6)


### Create Subsampled series of required seq_len

In [68]:
target_len = 24
for p, d in zip(perc_of_train_data, frac_datasets):
    subsampled_dataset = []
    for idx in range(d.shape[1] - T):
        ser = d[:, idx: idx+T, :]
        subsampled_dataset.append(ser)
    subsampled_dataset = np.concatenate(subsampled_dataset, axis=0)
    print('p:', p, '3d tensor shape:', subsampled_dataset.shape)
    
    np.save(os.path.join(data_dir, f'{selected}_subsampled_train_perc_{p}.npy'), subsampled_dataset)

p: 20 3d tensor shape: (676, 24, 6)
