In [3]:
# imports 
import numpy as np
import torch 
from torch import nn
from torch import functional as F
import os

In [6]:
# Display the kernel's current working directory so the relative location of
# the data files can be checked.
os.getcwd()

'/content/TimeGAN'

In [5]:
# Make the TimeGAN checkout the working directory.
# NOTE(review): absolute path, presumably a Colab mount - adjust when running elsewhere.
os.chdir("/content/TimeGAN/")

In [9]:
def Normalize(dta):
  """Min-max scale `dta` along axis 0.

  Every entry is shifted by the minimum taken over axis 0 and divided by the
  corresponding range. A small constant (1e-7) is added to the range so that
  a constant column does not cause a division by zero.

  Args:
    - dta: numeric array-like

  Returns:
    - the scaled array (same shape as `dta`)
  """
  lowest = np.min(dta, 0)
  spread = np.max(dta, 0) - lowest
  return (dta - lowest) / (spread + 1e-7)

In [10]:
def real_data_loading (path, seq_len):
  """Read a CSV time series and cut it into shuffled fixed-length windows.

  Args:
    - path: path to a comma-separated file; its first row is skipped as header
    - seq_len: number of consecutive rows in each window

  Returns:
    - data: list of arrays, each holding `seq_len` consecutive rows of the
      normalized data, in random order.
  """
  assert os.path.isfile(path)
  raw = np.loadtxt(path, delimiter = ",", skiprows = 1)

  # Reverse the row order (the file is assumed to be stored newest-first, so
  # this makes it chronological), then min-max normalize.
  scaled = Normalize(raw[::-1])

  # Overlapping windows; start positions are 0 .. len(scaled) - seq_len - 1.
  windows = [scaled[start:start + seq_len] for start in range(len(scaled) - seq_len)]

  # Shuffle the windows (to make the samples look closer to i.i.d.)
  order = np.random.permutation(len(windows))
  return [windows[pos] for pos in order]

In [11]:
# Window length taken from the tutorial.
seq_len = 24
# NOTE(review): absolute path to the stock CSV - adjust when running elsewhere.
path = "/content/TimeGAN/stock_data.csv"
# List of shuffled windows, each with seq_len rows.
ori_data = real_data_loading(path, seq_len)

In [22]:
# Sanity check of the stacked windows: (number of windows, seq_len, n_variables)
np.array(ori_data).shape

(3661, 24, 6)

In [27]:
def train_test_divide (data_x, data_x_hat, data_t, data_t_hat, train_rate = 0.8):
  """Randomly split original and synthetic data into train and test parts.

  The original and the synthetic data are shuffled independently; within each
  dataset the samples and their time information share the same split.

  Args:
    - data_x: original data
    - data_x_hat: generated data
    - data_t: original time
    - data_t_hat: generated time
    - train_rate: ratio of training data from the original data

  Returns:
    - train_x, train_x_hat, test_x, test_x_hat,
      train_t, train_t_hat, test_t, test_t_hat: lists with the split data
  """
  def _split(samples, times):
    # One fresh permutation per dataset; the leading share goes to training.
    count = len(samples)
    order = np.random.permutation(count)
    cut = int(count*train_rate)
    head, tail = order[:cut], order[cut:]
    return ([samples[k] for k in head], [samples[k] for k in tail],
            [times[k] for k in head], [times[k] for k in tail])

  # Original data first, synthetic data second (keeps the order of RNG draws).
  train_x, test_x, train_t, test_t = _split(data_x, data_t)
  train_x_hat, test_x_hat, train_t_hat, test_t_hat = _split(data_x_hat, data_t_hat)

  return train_x, train_x_hat, test_x, test_x_hat, train_t, train_t_hat, test_t, test_t_hat

In [28]:
def extract_time (data):
  """Collect the length of every sequence and the largest length.

  Args:
    - data: iterable of 2-D arrays (one per sequence, time on the first axis)

  Returns:
    - time: list with the number of time steps of each sequence
    - max_seq_len: largest entry of `time` (0 when `data` is empty)
  """
  lengths = [len(sequence[:,0]) for sequence in data]
  # The leading 0 reproduces the starting value for an empty `data`.
  longest = max([0] + lengths)
  return lengths, longest

In [35]:
# Per-window lengths and the longest window length of the loaded data.
seq_lengths, max_seq_len = extract_time(ori_data)

In [36]:
# One length entry per window.
np.array(seq_lengths).shape

(3661,)

In [39]:
def batch_generator(data, time, batch_size):
  """Draw one random mini-batch without replacement.

  Args:
    - data: time-series data
    - time: time information (same ordering as `data`)
    - batch_size: the number of samples in each batch

  Returns:
    - X_mb: time-series data in each batch
    - T_mb: time information in each batch
  """
  # Shuffle all indices and keep the first `batch_size` of them.
  picked = np.random.permutation(len(data))[:batch_size]

  X_mb = [data[k] for k in picked]
  T_mb = [time[k] for k in picked]
  return X_mb, T_mb

In [49]:
# Per-window lengths and the longest window length.
ori_time, max_seq_len = extract_time(ori_data)
# no = number of windows, seq_len = time steps per window, dim = features per step
no, seq_len, dim = np.asarray(ori_data).shape
batch_size = 128
# The random input vectors get the same feature dimension as the real data.
z_dim = dim

In [37]:
def random_generator (batch_size, z_dim, T_mb, max_seq_len):
  """Random vector generation.

  For every sample, uniform noise in [0, 1) is drawn for its first `T_mb[i]`
  time steps; the remaining steps up to `max_seq_len` are zero-padded so that
  all returned arrays share the shape [max_seq_len, z_dim] and can be stacked
  into one batch.

  Args:
    - batch_size: size of the random vector
    - z_dim: dimension of random vector
    - T_mb: time information for the random vector
    - max_seq_len: maximum sequence length

  Returns:
    - Z_mb: generated random vector (list of [max_seq_len, z_dim] arrays)
  """
  Z_mb = list()
  for i in range(batch_size):
    padded = np.zeros([max_seq_len, z_dim])
    padded[:T_mb[i],:] = np.random.uniform(0., 1, [T_mb[i], z_dim])
    # Append the padded array, not the raw noise: otherwise sequences shorter
    # than max_seq_len come back with inconsistent shapes.
    Z_mb.append(padded)
  return Z_mb

In [42]:
# Draw one random mini-batch of windows together with their lengths.
X_mb, T_mb = batch_generator(ori_data, ori_time, batch_size) 

In [45]:
# Shapes of the sampled batch and of its length vector.
np.array(X_mb).shape, np.array(T_mb).shape
# we feed batches of 128 sequences each into the network

((128, 24, 6), (128,))

In [51]:
# Random noise batch matching the sampled real batch.
Z_mb = random_generator(batch_size, z_dim, T_mb, max_seq_len)
np.array(Z_mb).shape

(128, 24, 6)

In [163]:
# Placeholders of the original TensorFlow implementation, kept for reference:
# X = tf.placeholder(tf.float32, [None, max_seq_len, dim], name = "myinput_x")
# Z = tf.placeholder(tf.float32, [None, max_seq_len, z_dim], name = "myinput_z")
# T = tf.placeholder(tf.int32, [None], name = "myinput_t")
# X is the original data => Flexible batch size, 24, 6 for stock data
# Z is the generated fake data => Flex batch size, 24, 6 for stock data
# T is the time information => Flexible batch size, here 128 for the time data
# hyperparameters used for the embedding network below
num_layers = 2
hidden_dim = 24
seq_len = 24
# input size is the number of features per time step (6 for the stock data)
input_size = 6
device = "cpu"
batch_size = 128

# from the RNN documentation: 
# arguments: inputsize = num_features, hidden_size, num_layers
# rnn = nn.RNN(6, 24, 2, batch_first=True)
# input layout with batch_first=True: batch, seq_len, features => 128, 24, 6
# input = torch.randn(128, 24, 6)
# initial hidden state: num layers, batch, hidden size
# h0 = torch.randn(2, 128, 24)
# output, hn = rnn(input, h0)

class RNN(nn.Module):
  """Multi-layer vanilla RNN followed by a per-time-step linear layer and sigmoid.

  Args:
    - input_size: number of features per time step
    - hidden_dim: size of the RNN hidden state (also the output feature size)
    - num_layers: number of stacked RNN layers
  """
  def __init__(self, input_size, hidden_dim, num_layers):
    super().__init__()
    self.num_layers = num_layers
    self.hidden_dim = hidden_dim
    # batch_first=True => inputs are laid out as (batch, seq_len, input_size)
    self.rnn = nn.RNN(input_size, hidden_dim, num_layers, batch_first = True)
    # applied to every time step of the RNN output
    self.fc = nn.Linear(hidden_dim, hidden_dim)
    self.nonlinearity = nn.Sigmoid()

  def forward(self, x):
    """Map x of shape (batch, seq_len, input_size) to (batch, seq_len, hidden_dim).

    The sigmoid keeps every output value inside [0, 1].
    """
    # No explicit h0: nn.RNN starts from an all-zero hidden state when none is
    # given, so the module no longer depends on a notebook-global `device`
    # and keeps working after being moved to another device.
    out, _ = self.rnn(x)
    # out: batch_size, seq_len, hidden_dim (all time steps are kept)
    out = self.fc(out)
    return self.nonlinearity(out)

In [164]:
# testing the RNN module
net = RNN(6, 24, 5)
# converting everything to float to avoid data type runtime errors
net.float()

# using one minibatch as example data; the cast to float32 happens via
# .float() below (ndarray.astype returns a new array, so calling it without
# using the result had no effect)
dta = torch.from_numpy(np.array(X_mb))

net(dta.float()).shape
# yields (128, 24, 24) => 128 samples in batch, 24 time steps, and every time
# step is embedded into 24 features
# => we embed in a higher dimensional space in this case (6 -> 24 features), as mentioned in the paper

torch.Size([128, 24, 24])

In [175]:
# let's extend the RNN module so it can also use an LSTM or a GRU
class Custom_RNN(nn.Module):
  """Recurrent block (RNN / GRU / LSTM) with a per-time-step linear layer and sigmoid.

  Args:
    - input_size: number of features per time step
    - hidden_dim: hidden size of the recurrent layers
    - num_layers: number of stacked recurrent layers
    - rnn_type: one of "rnn", "gru", "lstm"
    - orig_dim: output feature size used when `embed` is false (recovery)
    - embed: if true the output has `hidden_dim` features (embedding),
      otherwise `orig_dim` features (recovery)

  Raises:
    - ValueError: if `rnn_type` is not one of the supported names
  """
  def __init__(self, input_size, hidden_dim, num_layers, rnn_type, orig_dim = 24, embed = True):
    super().__init__()
    recurrent_layers = {"rnn": nn.RNN, "gru": nn.GRU, "lstm": nn.LSTM}
    # Fail early: previously an unknown type left `self.net` undefined and
    # only surfaced as an unrelated error inside forward().
    if rnn_type not in recurrent_layers:
      raise ValueError("rnn_type must be one of {}, got {!r}".format(sorted(recurrent_layers), rnn_type))
    self.num_layers = num_layers
    self.hidden_dim = hidden_dim
    self.rnn_type = rnn_type
    self.embed = embed
    # all three layer types take (input_size, hidden_size, num_layers);
    # batch_first=True => inputs are (batch, seq_len, input_size)
    self.net = recurrent_layers[rnn_type](input_size, hidden_dim, num_layers, batch_first = True)
    # distinction between embedding (stay in latent space) & recovery
    # (project back to the original feature size)
    self.fc = nn.Linear(hidden_dim, hidden_dim if embed else orig_dim)
    self.nonlinearity = nn.Sigmoid()

  def forward(self, x):
    """Map x of shape (batch, seq_len, input_size) to (batch, seq_len, out_features)."""
    # No explicit initial state: nn.RNN / nn.GRU default h0 to zeros and
    # nn.LSTM defaults (h0, c0) to zeros. This removes the dependency on a
    # notebook-global `device` and works on whatever device the module is on.
    out, _ = self.net(x)
    # out: batch_size, seq_len, hidden_dim (all time steps are kept)
    # apply nonlinearity like in original implementation
    out = self.fc(out)
    return self.nonlinearity(out)

In [176]:
# testing the Custom_RNN module
net = Custom_RNN(6, 24, 5, rnn_type="lstm")
# converting everything to float to avoid data type runtime errors
net.float()

# using one minibatch as example data; the cast to float32 happens via
# .float() below (ndarray.astype returns a new array, so calling it without
# using the result had no effect)
dta = torch.from_numpy(np.array(X_mb))

net(dta.float()).shape
# yields (128, 24, 24) => 128 samples in batch, 24 time steps, and every time
# step is embedded into 24 features
# => we embed in a higher dimensional space in this case (6 -> 24 features), as mentioned in the paper
# everything seems to work until here

torch.Size([128, 24, 24])

In [177]:
def embedder(X, hidden_dim, num_layers, rnn_type): # sequence length is only important for dynamic rnns 
  """Embedding network between original feature space to latent space.

    Note: a new, randomly initialized Custom_RNN is built on every call.
    
    Args:
      - X: input time-series features (batch of input data),
        convertible to an array of shape [batch, seq_len, n_features]
      - hidden_dim: Hidden dimension of RNN
      - num_layers: Number of RNN layers
      - rnn_type: type of rnn [rnn, gru, lstm]

    Returns:
      - H: embeddings
  """
  # Convert once, directly to float32. The previous `astype` call discarded
  # its result, and the local name `input` shadowed the Python builtin.
  features = torch.from_numpy(np.asarray(X, dtype=np.float32))
  input_size = features.shape[2] # number of features, e.g. 6 for [128, 24, 6]

  net = Custom_RNN(input_size, hidden_dim, num_layers, rnn_type)
  net.float()
  H = net(features)
  return H

In [178]:
H = embedder(X_mb, 20, 2, "lstm")
H.shape

torch.Size([128, 24, 20])

In [184]:
def recovery(H, hidden_dim, num_layers, rnn_type, orig_dim):   
  """Recovery network mapping latent representations back to data space.

  A fresh Custom_RNN in recovery mode (embed=False) is created on each call
  and applied to `H`.

  Args:
    - H: latent representation; its third dimension is used as input size
    - hidden_dim: Hidden dimension of RNN
    - num_layers: Number of RNN layers
    - rnn_type: type of rnn [rnn, gru, lstm] 
    - orig_dim: original data dimensionality to recover 
  Returns:
    - X_tilde: recovered data
  """            
  latent_features = H.shape[2]
  decoder = Custom_RNN(latent_features, hidden_dim, num_layers, rnn_type, orig_dim, embed=False)
  decoder.float()
  return decoder(H)

In [185]:
X_tilde = recovery(H, 20, 2, "lstm", orig_dim=6)
X_tilde.shape

torch.Size([128, 24, 6])

torch.Size([128, 24, 20])