In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as md

import seaborn as sns
import random
import os
import sys
import time
import datetime
from gretel_synthetics.timeseries_dgan.dgan import DGAN
from gretel_synthetics.timeseries_dgan.config import DGANConfig,OutputType

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader,Dataset
from sktime.datasets import load_from_ucr_tsv_to_dataframe
from sktime.datasets import load_from_tsfile


In [None]:
# Read smart home sensor data from https://archive.ics.uci.edu/ml/datasets/Appliances+energy+prediction
sensor_df = pd.read_csv("/Users/adirserruya/Documents/GitHub/Datasets/energydata_complete.csv")
sensor_df["datetime"] = pd.to_datetime(sensor_df["date"])
sensor_df

In [6]:
# X,y = load_from_tsfile("/Users/adirserruya/Documents/GitHub/Datasets/ArticularyWordRecognition/ArticularyWordRecognition_TRAIN.ts")
X,y = load_from_tsfile("/Users/adirserruya/Documents/GitHub/Datasets/BasicMotions/BasicMotions_TRAIN.ts")

In [7]:
def preprocess_dgan(df:pd.DataFrame,sequence_length:int):
    df = df.copy(deep=True)
    data = []
    for row in df.iterrows():
        for col in df.columns:
            data.append([row[1][col]])
    data = np.array(data)
    data = data.reshape((df.shape[0], sequence_length, df.shape[1]))
    return data

# Split the data into train and test sets
def split_dataset_by_label(X, y):
    splits = {}
    unique_labels = np.unique(y)
    for label in unique_labels:
        splits[label] = {'X': np.aarray(data[y == label]), 'y': np.array(y[y == label])}
    return splits

def train_generator_per_label(splitted_data):
    models = {}
    for label in splitted_data.keys():
        print(f"Training generator for label {label}")
        X = splitted_data[label]['X']
        model = train_dgan(X,50)
        models[label] = model
    return models

def train_dgan(data:np.ndarray,epochs:int):
    model = DGAN(DGANConfig(
        max_sequence_len=data.shape[1], #  144 
        sample_len=4, #
        batch_size=min(1000, data.shape[0]),
        apply_feature_scaling=False,
        apply_example_scaling=False,
        use_attribute_discriminator=False,
        generator_learning_rate=1e-4,
        discriminator_learning_rate=1e-4,
        epochs=epochs,
    ))

    model.train_numpy(
        data,
        feature_types=[OutputType.CONTINUOUS] * data.shape[2],
    )
    return model

# Generate synthetic data for each label
def generate_data_per_label(models,num_samples):
    generated_data = {}
    for label in models.keys():
        print(f"Generating data for label {label}")
        generated_data[label] = models[label].generate_numpy(num_samples)[1]
    return generated_data

class TimeSeriesDataset(Dataset):    
    def __init__(self, X, y, transform=None, trarget_transform=None):
        self.X = X 
        self.y = y
        self.transform = transform
        self.target_transform = trarget_transform
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self,idx):
        X = self.X[idx]
        y = self.y[idx]
        if self.transform:
            X = self.transform(X)
        if self.target_transform:
            y = self.target_transform(y)
        return torch.tensor(X), torch.tensor(y)
    
class LSTM_Classifier(nn.Module):
    def __init__(self, input_dim=31, hidden_dim=256, num_layers=1, output_dim=5, dropout=0):
        '''
        input_dim = number of features at each time step 
        hidden_dim = number of features produced by each LSTM cell (in each layer)
        num_layers = number of LSTM layers
        output_dim = number of classes (number of activities)
        '''
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, 
                            num_layers=num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)
        
        
    def forward(self, X):
        _, (h_n, c_n) = self.lstm(X)  # (h_0, c_0) default to zeros
        out = self.fc(h_n[-1,:,:])
        out = self.softmax(out)
        return out


In [8]:
data = preprocess_dgan(X,100)

In [None]:
splitted_data = split_dataset_by_label(data,y)
models = train_generator_per_label(splitted_data)
generated_data = generate_data_per_label(models,100)

In [34]:
generated_data['badminton'].shape

(100, 100, 6)

In [24]:
splitted_data['badminton']['X'].shape

(10, 100, 6)

In [186]:
torch.randn(3, 5).softmax(dim=1)

tensor([[0.1807, 0.4789, 0.1167, 0.0938, 0.1299],
        [0.0759, 0.5857, 0.0741, 0.0904, 0.1740],
        [0.1581, 0.6647, 0.0230, 0.1427, 0.0115]])

In [236]:
data[0,:,5]

array([-1.943010e-01,  3.539820e-01, -2.362410e-01, -1.462320e-01,
       -1.355090e-01, -2.662970e-01, -2.410890e-01, -1.944300e-02,
       -2.528690e-01, -1.721960e-01, -2.085700e-01, -4.209230e-01,
       -1.248750e-01, -1.626970e-01, -2.321580e-01, -9.448000e-02,
        7.565090e-01,  1.070128e+00,  6.238780e-01,  4.474530e-01,
        2.921080e-01,  1.317670e-01, -1.582440e-01, -8.501500e-02,
       -7.837000e-03,  1.560100e-02,  9.123900e-02,  2.429100e-02,
       -1.000830e-01,  2.004520e-01,  1.559600e-01,  1.673040e-01,
       -4.605800e-02, -6.540994e+00,  1.111255e+00,  2.686220e-01,
        2.938900e-01, -3.150000e-03,  5.432600e-02, -2.332400e-02,
       -1.169660e-01, -1.335200e-01,  3.189000e-03, -3.115300e-02,
        3.409000e-02,  4.102100e-02, -1.535000e-03,  1.804730e-01,
       -4.395500e-02,  2.654000e-02, -1.728529e+00,  4.794100e-02,
        1.118620e-01,  7.990100e-02, -1.171880e-01, -6.392100e-02,
        2.663000e-03, -1.331700e-02, -2.663000e-03,  5.593100e

In [242]:
# Map the labels to integers
label_to_int = {label: i for i, label in enumerate(np.unique(y))}
int_to_label = {i: label for label, i in label_to_int.items()}
y_int = np.array([label_to_int[label] for label in y])

In [259]:
X_gen_1 = data[:10]
X_gen_2 = data[10:20]
X_gen_3 = data[20:30]
X_gen_4 = data[30:]

model_1 = train_dgan(X_gen_1,epochs=1000)
model_2 = train_dgan(X_gen_2,epochs=1000)
model_3 = train_dgan(X_gen_3,epochs=1000)
model_4 = train_dgan(X_gen_4,epochs=1000)

_,synthetic_data_1 = model_1.generate_numpy(100)
_,synthetic_data_2 = model_2.generate_numpy(100)
_,synthetic_data_3 = model_3.generate_numpy(100)
_,synthetic_data_4 = model_4.generate_numpy(100)

y_gen_1 = np.ones((synthetic_data_1.shape[0],))*2
y_gen_2 = np.ones((synthetic_data_2.shape[0],))
y_gen_3 = np.ones((synthetic_data_3.shape[0],))*3
y_gen_4 = np.zeros((synthetic_data_4.shape[0],))

X_gen = np.concatenate((synthetic_data_1,synthetic_data_2,synthetic_data_3,synthetic_data_4),axis=0)
y_gen = np.concatenate((y_gen_1,y_gen_2,y_gen_3,y_gen_4),axis=0)


2023-03-23 22:19:29,621 : MainThread : INFO : epoch: 0
2023-03-23 22:19:29,800 : MainThread : INFO : epoch: 1
2023-03-23 22:19:29,846 : MainThread : INFO : epoch: 2
2023-03-23 22:19:29,879 : MainThread : INFO : epoch: 3
2023-03-23 22:19:29,910 : MainThread : INFO : epoch: 4
2023-03-23 22:19:29,941 : MainThread : INFO : epoch: 5
2023-03-23 22:19:29,996 : MainThread : INFO : epoch: 6
2023-03-23 22:19:30,030 : MainThread : INFO : epoch: 7
2023-03-23 22:19:30,057 : MainThread : INFO : epoch: 8
2023-03-23 22:19:30,086 : MainThread : INFO : epoch: 9
2023-03-23 22:19:30,114 : MainThread : INFO : epoch: 10
2023-03-23 22:19:30,140 : MainThread : INFO : epoch: 11
2023-03-23 22:19:30,172 : MainThread : INFO : epoch: 12
2023-03-23 22:19:30,216 : MainThread : INFO : epoch: 13
2023-03-23 22:19:30,243 : MainThread : INFO : epoch: 14
2023-03-23 22:19:30,277 : MainThread : INFO : epoch: 15
2023-03-23 22:19:30,312 : MainThread : INFO : epoch: 16
2023-03-23 22:19:30,346 : MainThread : INFO : epoch: 17
20

In [279]:
# transform y to float
y = y_int
data = data
train_dataloader = DataLoader(TimeSeriesDataset(X_gen[:390],y_gen[:390]),batch_size=20,shuffle=True)
validation_dataloader = DataLoader(TimeSeriesDataset(X_gen[390:],y_gen[390:]),batch_size=20,shuffle=True)
for sample in train_dataloader:
    print(sample[0].shape)
    break

for sample in validation_dataloader:
    print(sample[0].shape, sample[1].shape)
    break

torch.Size([20, 100, 6])
torch.Size([10, 100, 6]) torch.Size([10])


In [289]:
#Shuffle data together with y
data_ = np.concatenate((data, y.reshape(-1,1)), axis=1)
data_ = np.random.shuffle(data_)
#Split again to data and y
data_ = data_[:,:-1]
y_ = data_[:,-1]



ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 3 dimension(s) and the array at index 1 has 2 dimension(s)

In [282]:
y = y_int
data = data
train_dataloader = DataLoader(TimeSeriesDataset(data[:30],y[:30]),batch_size=20,shuffle=True)
validation_dataloader = DataLoader(TimeSeriesDataset(data[30:],y[30:]),batch_size=20,shuffle=True)

In [265]:
def train_loop(data_loader, model,device,loss_fn,optimizer,print_every_n=200):
    model.train()
    size = len(data_loader.dataset)
    num_batches = len(data_loader)
    train_loss=0
    tp=0
    for batch,(X,y) in enumerate(data_loader):
        X = X.to(device)
        y = y.type(torch.LongTensor)
        y = y.to(device)
        pred = model(X.float())
        # print(f'Preds : {pred.argmax(1)}')
        # print(f'GT : {y}')
        loss = loss_fn(pred,y)
        train_loss += loss
        tp += (y==pred.argmax(1)).type(torch.float).sum().item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss, current = loss.item(), batch*len(X)
        if batch%print_every_n==0:
            print(f'loss={loss:.3f}, {current} / {size}')

    train_loss /= num_batches
    train_acc = tp/size    
    print(f'train accuracy = {train_acc}, val_loss = {train_loss:2f}')
    return train_loss,train_acc

def validation_loop(data_loader,model,device,loss_fn):
    model.eval()
    size=len(data_loader.dataset)
    num_batches = len(data_loader)
    val_loss=0
    tp=0
    with torch.no_grad():
        for X,y in data_loader:
            X = X.to(device)
            y = y.type(torch.LongTensor)
            y = y.to(device)
            pred = model(X.float())
            val_loss += loss_fn(pred,y).item()
            tp += (y==pred.argmax(1)).type(torch.float).sum().item()
        
    val_loss /= num_batches
    val_acc = tp/size
    print(f'validation accuracy = {val_acc}, val_loss = {val_loss:2f}')
    return val_loss,val_acc

In [285]:
output_dim = len(np.unique(y))
lr = 0.001
best_acc = 0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
best_loss = np.inf
model = LSTM_Classifier(num_layers=2,input_dim = 6,hidden_dim=64,output_dim=output_dim,dropout=0.3)
model = model
model.to(device)
criterion = nn.CrossEntropyLoss()
criterion = criterion
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
save_each_epoch = True

In [286]:

import neptune.new as neptune

results = []
run = neptune.init_run(
    project="astarteam/FinalProject",
    api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI3Y2Y1YTE5OC1hNzhhLTQwNjctYjgyZS03Y2Y1MzU5ZDg1YjYifQ==",
)  # your credentialscredentials

params = {"batch_size": 20,
            "learning_rate": lr, 
            "optimizer": "CrossEntropyLoss"}

run["parameters"] = params
run['experiment_details'] = {'Dataset name': 'Basic Motions',
                             'Dataset ID': 1,
                             'Pretraining': False,
}
# define the number of epochs and early stopping patience
epochs = 10
patience = 5
for epoch in range(epochs):
    start_time = time.time()
    train_loss, train_acc = train_loop(train_dataloader, model, device, criterion, optimizer)
    total_train_time = (time.time() - start_time)/60
    val_loss, val_acc = validation_loop(validation_dataloader, model, device, criterion)
    results.append({'epoch_number':epoch,'train_loss':train_loss.detach().cpu().numpy(),'val_loss':val_loss,'train_acc':train_acc,'val_acc':val_acc, 'train_time':total_train_time})
    
    run["train/accuracy"].log(train_acc)
    run["train/loss"].log(train_loss)
    
    val_loss,val_acc = validation_loop(validation_dataloader, model, device, criterion)
    run["validation/accuracy"].log(val_acc)
    run["validation/loss"].log(val_loss)
    if val_loss < best_loss:
        best_loss = val_loss
        early_stopping_counter = 0
    # otherwise, increment the early stopping counter
    else:
        early_stopping_counter += 1     
    # if the early stopping counter has reached the patience, stop training
    if early_stopping_counter == patience:
        break
run.stop()


https://app.neptune.ai/astarteam/FinalProject/e/FIN-8
loss=1.385, 0 / 30
train accuracy = 0.1, val_loss = 1.385217
validation accuracy = 0.0, val_loss = 1.396533
validation accuracy = 0.0, val_loss = 1.396533
loss=1.383, 0 / 30
train accuracy = 0.43333333333333335, val_loss = 1.378181
validation accuracy = 0.0, val_loss = 1.403581
validation accuracy = 0.0, val_loss = 1.403581
loss=1.375, 0 / 30
train accuracy = 0.4, val_loss = 1.374516
validation accuracy = 0.0, val_loss = 1.410836
validation accuracy = 0.0, val_loss = 1.410836
loss=1.370, 0 / 30
train accuracy = 0.43333333333333335, val_loss = 1.369052
validation accuracy = 0.0, val_loss = 1.418613
validation accuracy = 0.0, val_loss = 1.418614
loss=1.364, 0 / 30
train accuracy = 0.5, val_loss = 1.362408
validation accuracy = 0.0, val_loss = 1.427138
validation accuracy = 0.0, val_loss = 1.427138
loss=1.356, 0 / 30
train accuracy = 0.6666666666666666, val_loss = 1.354739
validation accuracy = 0.0, val_loss = 1.436689
validation accur

In [277]:
run.stop()

In [None]:
# Train DGAN model
model = DGAN(DGANConfig(
    max_sequence_len=data.shape[1], #  144 
    sample_len=4, #
    batch_size=min(1000, features.shape[0]),
    apply_feature_scaling=True,
    apply_example_scaling=False,
    use_attribute_discriminator=False,
    generator_learning_rate=1e-4,
    discriminator_learning_rate=1e-4,
    epochs=1000,
))

model.train_numpy(
    features,
    feature_types=[OutputType.CONTINUOUS] * features.shape[2],
)

# Generate synthetic data
_, synthetic_features = model.generate_numpy(1000)

In [None]:
plot_day(synthetic_features[3,:,:4])
