In [146]:
import numpy as np, pandas as pd
import os, sys
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import joblib
from datetime import datetime, date, time, timedelta

In [188]:
# Directory that holds the input CSV files and receives the generated .npz files.
data_dir = './data' 

# Sine Data

In [253]:
# Dataset tag; it becomes the prefix of the output file names written below.
selected = 'sine'

In [254]:
def gen_sine_data(no, seq_len, dim, rng=None):
    """Generate a batch of random sine-wave sequences.

    Each (sample, feature) pair gets its own frequency drawn uniformly from
    [0, 1) and its own phase drawn uniformly from [-1, 1). The value at integer
    time step t is sin(freq * t + phase), so the output lies in [-1, 1]
    (it is NOT rescaled to [0, 1]).

    Args:
    - no: the number of samples
    - seq_len: sequence length of the time-series
    - dim: feature dimensions
    - rng: optional source of randomness. Pass an int seed or an object with a
      ``uniform(low, high, size)`` method (e.g. ``np.random.default_rng(0)``)
      for reproducible output. When None, the global ``np.random`` state is
      used, which is the original behaviour.

    Returns:
    - data: ndarray of shape (no, seq_len, dim)
    """
    if rng is None:
        rng = np.random
    elif isinstance(rng, (int, np.integer)):
        rng = np.random.default_rng(rng)

    # One frequency / phase per (sample, feature); the middle axis of length 1
    # broadcasts against the time axis below.
    size = (no, 1, dim)
    freq = rng.uniform(0, 1, size)
    phase = rng.uniform(-1, 1, size)

    # Time steps 0..seq_len-1 shaped (1, seq_len, 1) for broadcasting.
    steps = np.arange(seq_len)[np.newaxis, :, np.newaxis]

    return np.sin(freq * steps + phase)

In [255]:
# N: sine samples at 100%, T: sequence length, D: feature dimensions
# (passed to gen_sine_data as no, seq_len, dim).
N, T, D = 10000, 24, 5 

In [256]:
# Fraction of rows held out as test data in the train/test split (0 = nothing held out).
TEST_PERC = 0.00
# Complementary training fraction; not referenced by the other visible cells.
TRAIN_PERC = 1- TEST_PERC

# Percentages of the training data for which a separate dataset file is produced.
perc_of_train_data = [2, 5, 10, 20, 100]

In [257]:
# np.savez_compressed does not create missing directories, so make sure the
# output directory exists before writing.
os.makedirs(data_dir, exist_ok=True)

for p in perc_of_train_data:
    # Number of sine samples for this percentage of the full N.
    N_small = int(N * p / 100)
    sine_data = gen_sine_data(N_small, T, D)

    # Shuffle samples along the first axis (in place). Kept so the global RNG
    # is consumed exactly as before.
    np.random.shuffle(sine_data)
    print(sine_data.shape)

    fname = f'{selected}_subsampled_train_perc_{p}.npz'
    full_path = os.path.join(data_dir, fname)
    print(full_path)

    # Stored under the key 'data'; the loading cells read it back by that key.
    np.savez_compressed(full_path, data=sine_data)

(200, 24, 5)
./data\sine_subsampled_train_perc_2.npz
(500, 24, 5)
./data\sine_subsampled_train_perc_5.npz
(1000, 24, 5)
./data\sine_subsampled_train_perc_10.npz
(2000, 24, 5)
./data\sine_subsampled_train_perc_20.npz
(10000, 24, 5)
./data\sine_subsampled_train_perc_100.npz


# Preprocess Air Quality Data

The original data stores `Date` and `Time` in separate columns; they need to be combined into a single time column.

In [209]:
# Raw air-quality table; Date and Time arrive as separate string columns.
# Built from data_dir so the location is configured in one place.
# NOTE(review): the file carries an 'Unnamed: 0' column that looks like a saved
# row index; it is kept here and ends up in the processed CSV. Confirm intended.
air = pd.read_csv(os.path.join(data_dir, "AirQualityUCI.csv"))
air.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,3/10/2004,18:00:00,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,13.6,48.9,0.7578
1,3/10/2004,19:00:00,2.0,1292,112,9.4,955,103,1174,92,1559,972,13.3,47.7,0.7255
2,3/10/2004,20:00:00,2.2,1402,88,9.0,939,131,1140,114,1555,1074,11.9,54.0,0.7502
3,3/10/2004,21:00:00,2.2,1376,80,9.2,948,172,1092,122,1584,1203,11.0,60.0,0.7867
4,3/10/2004,22:00:00,1.6,1272,51,6.5,836,131,1205,116,1490,1110,11.2,59.6,0.7888


In [210]:
def parse_time_from_string(time_string,  format = '%H:%M:%S'):
    """Parse a clock-time string into a ``datetime.time``.

    Args:
    - time_string: text to parse, e.g. '18:00:00'
    - format: ``strptime`` pattern describing ``time_string``

    Returns:
    - the time-of-day part of the parsed value
    """
    parsed = datetime.strptime(time_string, format)
    return parsed.time()

def parse_date_from_string(date_string, format='%m/%d/%Y'):
    """Parse a calendar-date string into a ``datetime``.

    Args:
    - date_string: text to parse, e.g. '3/10/2004'
    - format: ``strptime`` pattern describing ``date_string``
      (pass '%Y-%m-%d' for ISO-style dates)

    Returns:
    - a full ``datetime`` (not a ``date``); fields absent from ``format``
      take the ``strptime`` defaults
    """
    parsed = datetime.strptime(date_string, format)
    return parsed

In [211]:
# Merge the separate Date ('%m/%d/%Y') and Time ('%H:%M:%S') string columns into
# one DateTime column. Parsing is vectorized instead of calling strptime and
# datetime.combine once per row. The guard makes the cell safe to re-run: once
# Date/Time have been replaced there is nothing left to convert.
if 'DateTime' not in air.columns:
    date_time = pd.to_datetime(
        air['Date'] + ' ' + air['Time'], format='%m/%d/%Y %H:%M:%S'
    )
    # assign appends DateTime as the last column; drop returns a new frame
    # instead of mutating in place.
    air = air.assign(DateTime=date_time).drop(columns=['Date', 'Time'])

air.to_csv(os.path.join(data_dir, "AirQualityUCI_processed.csv"), index=False)

# Last expression of the cell so the preview is actually displayed.
air.head()

# Energy and Stock Data

In [212]:
# Registry of the CSV-backed datasets: for each tag, the file name inside
# data_dir and the name of its timestamp column (None when there is none).
data_dict = dict(
    energy=dict(file='Energy-energydata_complete.csv', time_col='date'),
    stocks=dict(file='stock_data.csv', time_col=None),
    air=dict(file='AirQualityUCI_processed.csv', time_col='DateTime'),
)

### Choose Data

In [243]:
# Choose which data_dict entry the following cells process; keep exactly one
# of these lines uncommented.
selected = 'energy'
# selected = 'stocks'
# selected = 'air'

In [244]:
# Look up the selected dataset's settings once, then read its CSV. The time
# column is parsed as datetimes only when the dataset declares one.
dataset_cfg = data_dict[selected]
file_path = os.path.join(data_dir, dataset_cfg['file'])

read_kwargs = {}
if dataset_cfg['time_col'] is not None:
    read_kwargs['parse_dates'] = [dataset_cfg['time_col']]

data = pd.read_csv(file_path, **read_kwargs)
print(data.head())

                 date  Appliances  lights     T1       RH_1    T2       RH_2  \
0 2016-01-11 17:00:00          60      30  19.89  47.596667  19.2  44.790000   
1 2016-01-11 17:10:00          60      30  19.89  46.693333  19.2  44.722500   
2 2016-01-11 17:20:00          50      30  19.89  46.300000  19.2  44.626667   
3 2016-01-11 17:30:00          50      40  19.89  46.066667  19.2  44.590000   
4 2016-01-11 17:40:00          60      40  19.89  46.333333  19.2  44.530000   

      T3       RH_3         T4  ...         T9   RH_9     T_out  Press_mm_hg  \
0  19.79  44.730000  19.000000  ...  17.033333  45.53  6.600000        733.5   
1  19.79  44.790000  19.000000  ...  17.066667  45.56  6.483333        733.6   
2  19.79  44.933333  18.926667  ...  17.000000  45.50  6.366667        733.7   
3  19.79  45.000000  18.890000  ...  17.000000  45.40  6.250000        733.8   
4  19.79  45.000000  18.890000  ...  17.000000  45.40  6.133333        733.9   

   RH_out  Windspeed  Visibility  Tdew

In [245]:
# Sanity check: (rows, columns) of the frame exactly as loaded from the CSV.
print("data shape: ", data.shape)

data shape:  (19735, 29)


### Sort data by time

In [246]:
# Name of the selected dataset's timestamp column, or None when data_dict lists none.
time_col = data_dict[selected]['time_col']
print(time_col)

date


In [247]:
# Order rows chronologically and renumber the index 0..n-1. Skipped when the
# frame has no such column (time_col is None or absent).
if time_col in data:
    data = data.sort_values(by=time_col).reset_index(drop=True)

### Delete the time col

In [248]:
# The timestamp is not a feature: remove it once the rows are in order.
if time_col in data:
    data = data.drop(columns=[time_col])
print("data shape: ", data.shape)

data shape:  (19735, 28)


### Train Test Split

In [249]:
# Row count of the selected dataset. A dedicated name is used on purpose:
# rebinding N here would silently change how many sine samples the sine cell
# generates if it is re-run afterwards.
num_rows = data.shape[0]

# The hold-out split does not depend on p, so compute it once.
num_test = int(num_rows * TEST_PERC)   # to be held-out
orig_num_train = num_rows - num_test

test_data = data.tail(num_test)  # to be used for evaluation later
orig_train_data = data.iloc[:orig_num_train]  # positional: first rows are train

frac_datasets = []
for p in perc_of_train_data:
    num_train_frac = int(orig_num_train * p / 100.)

    # Take the most recent rows of the TRAINING portion. Slicing the full frame
    # here would pull in held-out test rows whenever TEST_PERC > 0.
    train_data_frac = orig_train_data.tail(num_train_frac)
    frac_datasets.append(train_data_frac)
    print(f'frac:{p}%, train_data_frac shape: {train_data_frac.shape}' )

frac:2%, train_data_frac shape: (394, 28)
frac:5%, train_data_frac shape: (986, 28)
frac:10%, train_data_frac shape: (1973, 28)
frac:20%, train_data_frac shape: (3947, 28)
frac:100%, train_data_frac shape: (19735, 28)


### Convert to 3D tensors
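Target shape: `(N, T, D)` = (samples, time steps, features). At this step each dataset is a single long series, so `N = 1`.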

In [250]:
# Turn every 2D (time steps, features) table into a 3D array with a leading
# axis of length 1. Entries that are already 3D are left alone, so re-running
# this cell is harmless; previously it failed because ndarrays have no .values.
for i, d in enumerate(frac_datasets):
    arr = np.asarray(d)
    if arr.ndim == 2:
        arr = arr[np.newaxis, ...]
    frac_datasets[i] = arr
    print("reshaped: ", frac_datasets[i].shape)

reshaped:  (1, 394, 28)
reshaped:  (1, 986, 28)
reshaped:  (1, 1973, 28)
reshaped:  (1, 3947, 28)
reshaped:  (1, 19735, 28)


### Create Subsampled series of required seq_len

In [251]:
# Length of each sliding window. This cell uses its own constant rather than
# the global T defined in the sine section.
target_len = 24
for p, d in zip(perc_of_train_data, frac_datasets):
    # NOTE(review): the window starting at d.shape[1] - target_len would also
    # fit but is excluded; kept as-is so dataset sizes match earlier runs.
    num_windows = d.shape[1] - target_len
    if num_windows <= 0:
        raise ValueError(
            f'p={p}: series of length {d.shape[1]} is too short for '
            f'windows of length {target_len}'
        )

    # Each slice has shape (1, target_len, D); stacking along axis 0 gives
    # (num_windows, target_len, D).
    subsampled_dataset = np.concatenate(
        [d[:, idx: idx + target_len, :] for idx in range(num_windows)],
        axis=0,
    )
    print('p:', p, '3d tensor shape:', subsampled_dataset.shape)

    out_path = os.path.join(data_dir, f'{selected}_subsampled_train_perc_{p}.npz')
    np.savez_compressed(out_path, data=subsampled_dataset)

p: 2 3d tensor shape: (370, 24, 28)
p: 5 3d tensor shape: (962, 24, 28)
p: 10 3d tensor shape: (1949, 24, 28)
p: 20 3d tensor shape: (3923, 24, 28)
p: 100 3d tensor shape: (19711, 24, 28)


# Loading data

In [260]:
selected = 'energy'
# np.load on an .npz returns a lazy archive object that keeps the file open.
# The with-block closes it once the array has been read into memory, so the
# file is not left locked for a later re-save.
with np.load(os.path.join(data_dir, f'{selected}_subsampled_train_perc_100.npz')) as loaded:
    data = loaded['data']
print(data.shape)
# First time step of the first window, shown as the cell output.
data[0, 0]

(19711, 24, 28)


array([ 60.        ,  30.        ,  19.89      ,  47.59666667,
        19.2       ,  44.79      ,  19.79      ,  44.73      ,
        19.        ,  45.56666667,  17.16666667,  55.2       ,
         7.02666667,  84.25666667,  17.2       ,  41.62666667,
        18.2       ,  48.9       ,  17.03333333,  45.53      ,
         6.6       , 733.5       ,  92.        ,   7.        ,
        63.        ,   5.3       ,  13.27543316,  13.27543316])

In [262]:
selected = 'sine'
# np.load on an .npz returns a lazy archive object that keeps the file open.
# The with-block closes it once the array has been read into memory.
with np.load(os.path.join(data_dir, f'{selected}_subsampled_train_perc_100.npz')) as loaded:
    data = loaded['data']
print(data.shape)

(10000, 24, 5)
