In [17]:
import math

import torch
from pytorch_lightning import LightningDataModule, LightningModule, Trainer
import pytorch_lightning as pl
from filelock import FileLock
from torch.utils.data import DataLoader, random_split
from torch.nn import functional as F
from torchvision.datasets import MNIST
from torchvision import transforms
import torchvision
import os
import monai
from monai.networks.layers.factories import Act, Norm
from losses import *
from hyperopt import hp
from ray.tune.suggest.hyperopt import HyperOptSearch
import numpy as np

# from source.ray_utils import * # create_search_space, create_test_search_space

# import source.transforms as transforms
# import source.transforms.oral_cavity_transforms as transforms
# import source.losses as losses
# import deepgrow
from monai.metrics.meandice import compute_meandice
# Root folder for downloaded datasets; overridable through the PATH_DATASETS
# environment variable, otherwise the current working directory.
PATH_DATASETS = os.getenv("PATH_DATASETS", ".")
# 1 when at least one CUDA device is visible to torch, else 0.
AVAIL_GPUS = 1 if torch.cuda.device_count() >= 1 else 0

In [2]:
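# One-time environment setup (uncomment in a fresh environment).
# %pip targets the running kernel's environment, unlike a bare `! pip`.
# %pip install "monai==0.5.3"
# %pip install hyperopt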

In [3]:
# Start TensorBoard inside the notebook, reading event files from ./logs.
%load_ext tensorboard
%tensorboard --logdir logs/ --port 7991

In [4]:
class MNISTDataModule(LightningDataModule):
    """LightningDataModule that serves MNIST images resized to 32x32.

    Args:
        batch_size: Batch size used by the train, validation and test loaders.
        data_dir: Directory the torchvision ``MNIST`` dataset is downloaded to
            and read from. Defaults to ``PATH_DATASETS``.
    """

    def __init__(
        self,
        batch_size=256,
        data_dir=PATH_DATASETS
    ):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size

        # Convert to a tensor first, then resize to the 32x32 grid that
        # LightningVAE declares as its input shape.
        self.transform = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Resize((32, 32)),
            ]
        )

    def prepare_data(self):
        """Download both MNIST splits, serialised across processes.

        Several Ray Tune trials may call this at the same time against the
        same ``data_dir``; the file lock keeps concurrent workers from
        downloading/extracting into the same files simultaneously.
        """
        os.makedirs(self.data_dir, exist_ok=True)
        lock_path = os.path.join(self.data_dir, ".mnist_download.lock")
        with FileLock(lock_path):
            MNIST(self.data_dir, train=True, download=True)
            MNIST(self.data_dir, train=False, download=True)

    def setup(self, stage=None):
        """Create the datasets needed for ``stage`` (``"fit"``, ``"test"`` or ``None`` for both)."""
        # Train/validation datasets: the 60k training split is divided 55k/5k.
        if stage == "fit" or stage is None:
            mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
            self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])

        # Test dataset.
        if stage == "test" or stage is None:
            self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)

    def train_dataloader(self):
        """Return the training loader; samples are reshuffled every epoch."""
        return DataLoader(self.mnist_train, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Return the validation loader (fixed order)."""
        return DataLoader(self.mnist_val, batch_size=self.batch_size)

    def test_dataloader(self):
        """Return the test loader (fixed order)."""
        return DataLoader(self.mnist_test, batch_size=self.batch_size)

In [7]:
class LightningVAE(pl.LightningModule):
    """LightningModule wrapping MONAI's ``VarAutoEncoder`` for 1x32x32 inputs.

    The input image is also the reconstruction target. Each step computes the
    loss from ``KLLoss`` and a Dice score between the reconstruction
    thresholded at 0.5 and the input.

    Args:
        config: Mapping with the keys ``lr``, ``batch_size``, ``latent_dim``,
            ``kernel_size``, ``channel``, ``stride``, ``norm``,
            ``dropout_rate``, ``num_resnets``, ``alpha`` and ``beta``.
    """

    def __init__(self, config):
        super(LightningVAE, self).__init__()

        self.lr = config["lr"]
        self.batch_size = config["batch_size"]
        self.latent_dim = config["latent_dim"]

        self.model = monai.networks.nets.VarAutoEncoder(
            dimensions=2,
            kernel_size=config["kernel_size"],
            in_shape=[1, 32, 32],
            out_channels=1,
            channels=config["channel"],
            strides=config["stride"],
            latent_size=config["latent_dim"],
            norm=config["norm"],
            dropout=config["dropout_rate"],
            num_res_units=config["num_resnets"],
        )

        self.vae_loss = KLLoss(alpha=config["alpha"], beta=config["beta"])
        self.dice = Dice()

        # First five images of a recent validation batch, kept for image
        # logging. Stays None until a validation batch with >= 5 samples is seen.
        self.sample_random_batch = None

    def forward(self, x):
        """Run the wrapped autoencoder; element 0 of the result is the reconstruction."""
        return self.model(x)

    def _shared_step(self, batch):
        """Compute loss, Dice and average KL for one ``(image, label)`` batch."""
        x, _ = batch
        output = self.forward(x)
        loss, avg_kl = self.vae_loss(output, x)
        # Dice is evaluated on the reconstruction binarised at 0.5.
        recon_binary = (output[0] > 0.5).float()
        dice = self.dice(recon_binary, x)
        # Only "loss" has to carry gradients; detaching the pure metrics keeps
        # the autograd graph from being held by the step outputs and avoids
        # Lightning's "has a `grad_fn`" warning.
        return {"loss": loss, "dice": dice.detach(), "avg_kl": avg_kl.detach()}

    def _log_epoch_means(self, stage, outputs):
        """Average the step outputs and write them to the logger under ``stage/``.

        Returns the dict of means (``loss``, ``dice``, ``avg_kl``, ``total_kl``).
        """
        means = {
            key: torch.stack([step[key] for step in outputs]).mean()
            for key in ("loss", "dice", "avg_kl")
        }
        means["total_kl"] = means["avg_kl"] * self.latent_dim
        for name, value in means.items():
            self.logger.experiment.add_scalar(
                f"{stage}/{name}", value, self.current_epoch
            )
        return means

    def training_step(self, train_batch, batch_idx):
        """Return ``{"loss", "dice", "avg_kl"}`` for one training batch."""
        return self._shared_step(train_batch)

    def training_epoch_end(self, outputs):
        """Log epoch-mean training metrics under ``Training/``."""
        if not outputs:
            return
        self._log_epoch_means("Training", outputs)

    def validation_step(self, val_batch, batch_idx):
        """Return ``{"loss", "dice", "avg_kl"}`` for one validation batch."""
        x, _ = val_batch
        if x.shape[0] >= 5:
            self.sample_random_batch = x[:5]
        return self._shared_step(val_batch)

    def validation_epoch_end(self, outputs):
        """Log epoch-mean validation metrics and publish ``ptl/dice``."""
        if not outputs:
            return
        means = self._log_epoch_means("Validation", outputs)
        # "ptl/dice" is the key the Tune report callback in this notebook reads.
        self.log("ptl/dice", means["dice"])

    def configure_optimizers(self):
        """Adam over all parameters with the configured learning rate."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer

    def on_epoch_end(self):
        """Every fifth epoch, log a grid of reconstructions above their inputs."""
        if self.current_epoch % 5 != 0:
            return
        if self.sample_random_batch is None:
            # No validation batch with at least five samples has been seen yet.
            return

        # No gradients are needed just to render images.
        with torch.no_grad():
            recon = self.forward(self.sample_random_batch)[0]
        sample_out = recon.detach().cpu().numpy()
        sample_in = self.sample_random_batch.cpu().numpy()

        # Channel 0 of every reconstruction first, then channel 0 of every
        # input; with nrow == number of samples this gives two aligned rows.
        n_samples = sample_out.shape[0]
        data = np.concatenate([sample_out[:, 0], sample_in[:, 0]], axis=0)
        data_tensor = torch.from_numpy(data).unsqueeze(1)
        grid = torchvision.utils.make_grid(
            data_tensor,
            normalize=True,
            scale_each=True,
            nrow=n_samples,
        )
        self.logger.experiment.add_image("generated_images", grid, self.current_epoch)


In [8]:
from pytorch_lightning.loggers import TensorBoardLogger
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining
from ray.tune.integration.pytorch_lightning import (
    TuneReportCallback,
    TuneReportCheckpointCallback,
)

In [14]:
def train_vae_tune(config, num_epochs=10, num_gpus=1):
    """Ray Tune trainable: fit one ``LightningVAE`` on MNIST and report Dice.

    Note: this notebook defines ``train_vae_tune`` in more than one cell;
    whichever cell ran last is the definition in effect.

    Args:
        config: Hyperparameter mapping for ``LightningVAE``; ``batch_size`` is
            also used for the data module.
        num_epochs: Maximum number of epochs to train.
        num_gpus: GPU share given to this trial. May be fractional when
            several trials share one device.
    """
    model = LightningVAE(config)
    data_module = MNISTDataModule(
        batch_size=config["batch_size"]
    )
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        # The Trainer needs a whole device count; round a fractional Ray
        # allocation (e.g. 0.5) up to 1 while keeping 0 as CPU-only.
        gpus=math.ceil(num_gpus),
        logger=TensorBoardLogger(save_dir=tune.get_trial_dir(), name="", version="."),
        # Replaces the deprecated progress_bar_refresh_rate=0.
        enable_progress_bar=False,
        callbacks=[
            # Forward the logged "ptl/dice" value to Tune as "dice" after
            # every validation run.
            TuneReportCallback(
                {"dice": "ptl/dice"},
                on="validation_end",
            )
        ],
    )
    trainer.fit(model, data_module)

In [10]:
# single run
def train_vae_single(config, num_epochs=1, num_gpus=1):
    """Train one ``LightningVAE`` on MNIST outside of Ray Tune.

    Args:
        config: Hyperparameter mapping for ``LightningVAE``; its
            ``batch_size`` entry also sizes the data loaders.
        num_epochs: Maximum number of epochs.
        num_gpus: Value forwarded to the Trainer's ``gpus`` argument.

    TensorBoard event files are written below ``./logs``.
    """
    vae = LightningVAE(config)
    mnist = MNISTDataModule(batch_size=config["batch_size"])

    tb_logger = TensorBoardLogger(save_dir="./logs")
    runner = pl.Trainer(logger=tb_logger, gpus=num_gpus, max_epochs=num_epochs)
    runner.fit(vae, mnist)

In [11]:
# Hyperparameters for one stand-alone training run (no Ray Tune involved).
param = dict(
    # optimisation
    lr=0.00001,
    batch_size=256,
    # loss weighting passed to KLLoss
    alpha=1,
    beta=0.01,
    # network architecture
    latent_dim=256,
    kernel_size=3,
    dropout_rate=0.1,
    norm=Norm.INSTANCE,
    channel=(32, 64, 64),
    stride=(1, 2, 4),
    num_resnets=0,
    # "resnet_units_batch" : hp.choice("res6", res_d6),
    val=3,
)

train_vae_single(param, num_epochs=50, num_gpus=1)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type           | Params
--------------------------------------------
0 | model    | VarAutoEncoder | 899 K 
1 | vae_loss | KLLoss         | 0     
2 | dice     | Dice           | 0     
--------------------------------------------
899 K     Trainable params
0         Non-trainable params
899 K     Total params
3.598     Total estimated model params size (MB)


                                                                      

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Epoch 0:   1%|          | 2/235 [00:00<00:23,  9.85it/s, loss=1.75, v_num=28]

  f"One of the returned values {set(extra.keys())} has a `grad_fn`. We will detach it automatically"


Epoch 0:  91%|█████████▏| 215/235 [00:13<00:01, 16.31it/s, loss=1.22, v_num=28]
Validating: 0it [00:00, ?it/s][A
Epoch 0:  92%|█████████▏| 217/235 [00:13<00:01, 16.37it/s, loss=1.22, v_num=28]
Validating:  10%|█         | 2/20 [00:00<00:00, 18.58it/s][A
Epoch 0:  94%|█████████▍| 221/235 [00:13<00:00, 16.49it/s, loss=1.22, v_num=28]
Epoch 0:  96%|█████████▌| 225/235 [00:13<00:00, 16.62it/s, loss=1.22, v_num=28]
Epoch 0:  97%|█████████▋| 229/235 [00:13<00:00, 16.74it/s, loss=1.22, v_num=28]
Validating:  70%|███████   | 14/20 [00:00<00:00, 27.50it/s][A
Epoch 0: 100%|██████████| 235/235 [00:13<00:00, 16.90it/s, loss=1.22, v_num=28]
Epoch 1:  30%|███       | 71/235 [00:04<00:10, 16.24it/s, loss=1.13, v_num=28] 

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [None]:
def train_vae_tune(config, num_epochs=10, num_gpus=1):
    """Ray Tune trainable: fit one ``LightningVAE`` on MNIST and report Dice.

    Note: this notebook defines ``train_vae_tune`` in more than one cell;
    whichever cell ran last is the definition in effect.

    Args:
        config: Hyperparameter mapping for ``LightningVAE``; ``batch_size`` is
            also used for the data module.
        num_epochs: Maximum number of epochs to train.
        num_gpus: GPU share given to this trial. May be fractional when
            several trials share one device.
    """
    model = LightningVAE(config)
    data_module = MNISTDataModule(
        batch_size=config["batch_size"]
    )
    trainer = pl.Trainer(
        max_epochs=num_epochs,
        # The Trainer needs a whole device count; round a fractional Ray
        # allocation (e.g. 0.5) up to 1 while keeping 0 as CPU-only.
        gpus=math.ceil(num_gpus),
        logger=TensorBoardLogger(save_dir=tune.get_trial_dir(), name="", version="."),
        # Replaces the deprecated progress_bar_refresh_rate=0.
        enable_progress_bar=False,
        callbacks=[
            # Forward the logged "ptl/dice" value to Tune as "dice" after
            # every validation run.
            TuneReportCallback(
                {"dice": "ptl/dice"},
                on="validation_end",
            )
        ],
    )
    trainer.fit(model, data_module)

In [15]:
def tune_vae_asha(exp_name, run_name, num_samples=10, num_epochs=10, gpus_per_trial=0):
    """Grid-search VAE hyperparameters with Ray Tune under an ASHA scheduler.

    Args:
        exp_name: Sub-directory of ``runs/`` that receives the Tune results.
        run_name: Experiment name passed to ``tune.run``.
        num_samples: Forwarded to ``tune.run(num_samples=...)``.
        num_epochs: Maximum epochs per trial; also the scheduler's ``max_t``.
        gpus_per_trial: GPU resources requested per trial and forwarded to
            the trainable as ``num_gpus``.

    Returns:
        The analysis object returned by ``tune.run``. The best configuration
        (highest ``dice``) is also printed.
    """
    checkpoint_dir = os.path.join("runs", exp_name)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(checkpoint_dir, exist_ok=True)

    # metric/mode are supplied to tune.run below, so the scheduler only needs
    # its budget parameters here.
    scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)

    train_fn_with_parameters = tune.with_parameters(
        train_vae_tune,
        num_epochs=num_epochs,
        num_gpus=gpus_per_trial,
    )
    resources_per_trial = {"cpu": 1, "gpu": gpus_per_trial}

    # Plain grid search; a search algorithm such as HyperOptSearch could be
    # passed to tune.run as search_alg instead.
    config = {
        "lr": tune.grid_search([0.0001, 0.001]),
        "latent_dim": tune.grid_search([256, 512]),
        "kernel_size": tune.grid_search([5]),
        "dropout_rate": tune.grid_search([0.4]),
        "alpha": tune.grid_search([1, 5]),
        "beta": tune.grid_search([0.01, 0.05, 0.1, 0.5]),
        "norm": tune.grid_search([Norm.INSTANCE]),
        "channel": tune.grid_search([(32, 64, 128)]),
        "stride": tune.grid_search([(2, 2, 4)]),
        "num_resnets": tune.grid_search([1]),
        "batch_size": tune.grid_search([30]),
    }

    analysis = tune.run(
        train_fn_with_parameters,
        resources_per_trial=resources_per_trial,
        metric="dice",
        mode="max",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        name=run_name,
        local_dir=checkpoint_dir,
        log_to_file=True,
        checkpoint_freq=0,
        keep_checkpoints_num=0,
    )

    print("Best hyperparameters found were: ", analysis.best_config)
    return analysis

In [16]:
# Smoke-test sweep: results land in runs/ray under the experiment name "test";
# two epochs per trial on one GPU each.
tune_vae_asha("ray", "test", num_samples=1, num_epochs=2, gpus_per_trial=1)

Trial name,status,loc,alpha,batch_size,beta,channel,dropout_rate,kernel_size,latent_dim,lr,norm,num_resnets,stride
train_vae_tune_dd20d_00000,RUNNING,10.128.0.35:10428,1,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00001,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00002,PENDING,,1,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00003,PENDING,,5,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00004,PENDING,,1,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00005,PENDING,,5,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00006,PENDING,,1,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00007,PENDING,,5,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00008,PENDING,,1,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00009,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)"


[2m[36m(train_vae_tune pid=10428)[0m   f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
[2m[36m(train_vae_tune pid=10428)[0m GPU available: True, used: True
[2m[36m(train_vae_tune pid=10428)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_vae_tune pid=10428)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_vae_tune pid=10428)[0m   "The `on_keyboard_interrupt` callback hook was deprecated in v1.5 and will be removed in v1.7."
  0%|          | 0/9912422 [00:00<?, ?it/s]


[2m[36m(train_vae_tune pid=10428)[0m Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
[2m[36m(train_vae_tune pid=10428)[0m Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./MNIST/raw/train-images-idx3-ubyte.gz


9913344it [00:00, 93009203.47it/s]                             


[2m[36m(train_vae_tune pid=10428)[0m Extracting ./MNIST/raw/train-images-idx3-ubyte.gz to ./MNIST/raw
[2m[36m(train_vae_tune pid=10428)[0m 
[2m[36m(train_vae_tune pid=10428)[0m Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
[2m[36m(train_vae_tune pid=10428)[0m Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./MNIST/raw/train-labels-idx1-ubyte.gz
[2m[36m(train_vae_tune pid=10428)[0m Extracting ./MNIST/raw/train-labels-idx1-ubyte.gz to ./MNIST/raw
[2m[36m(train_vae_tune pid=10428)[0m 
[2m[36m(train_vae_tune pid=10428)[0m Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
[2m[36m(train_vae_tune pid=10428)[0m Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./MNIST/raw/t10k-images-idx3-ubyte.gz


29696it [00:00, 110420258.50it/s]        
1649664it [00:00, 23938942.74it/s]         


[2m[36m(train_vae_tune pid=10428)[0m Extracting ./MNIST/raw/t10k-images-idx3-ubyte.gz to ./MNIST/raw
[2m[36m(train_vae_tune pid=10428)[0m 
[2m[36m(train_vae_tune pid=10428)[0m Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
[2m[36m(train_vae_tune pid=10428)[0m Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./MNIST/raw/t10k-labels-idx1-ubyte.gz


5120it [00:00, 32537631.03it/s]         
[2m[36m(train_vae_tune pid=10428)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(train_vae_tune pid=10428)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[2m[36m(train_vae_tune pid=10428)[0m Extracting ./MNIST/raw/t10k-labels-idx1-ubyte.gz to ./MNIST/raw
[2m[36m(train_vae_tune pid=10428)[0m 


Trial name,status,loc,alpha,batch_size,beta,channel,dropout_rate,kernel_size,latent_dim,lr,norm,num_resnets,stride
train_vae_tune_dd20d_00000,RUNNING,10.128.0.35:10428,1,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00001,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00002,PENDING,,1,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00003,PENDING,,5,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00004,PENDING,,1,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00005,PENDING,,5,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00006,PENDING,,1,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00007,PENDING,,5,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00008,PENDING,,1,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00009,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)"


[2m[36m(train_vae_tune pid=10428)[0m 
[2m[36m(train_vae_tune pid=10428)[0m   | Name     | Type           | Params
[2m[36m(train_vae_tune pid=10428)[0m --------------------------------------------
[2m[36m(train_vae_tune pid=10428)[0m 0 | model    | VarAutoEncoder | 1.1 M 
[2m[36m(train_vae_tune pid=10428)[0m 1 | vae_loss | KLLoss         | 0     
[2m[36m(train_vae_tune pid=10428)[0m 2 | dice     | Dice           | 0     
[2m[36m(train_vae_tune pid=10428)[0m --------------------------------------------
[2m[36m(train_vae_tune pid=10428)[0m 1.1 M     Trainable params
[2m[36m(train_vae_tune pid=10428)[0m 0         Non-trainable params
[2m[36m(train_vae_tune pid=10428)[0m 1.1 M     Total params
[2m[36m(train_vae_tune pid=10428)[0m 4.516     Total estimated model params size (MB)
[2m[36m(train_vae_tune pid=10428)[0m   f"The dataloader, {name}, does not have many workers which may be a bottleneck."
[2m[36m(train_vae_tune pid=10428)[0m   f"The dataloader, 

Trial name,status,loc,alpha,batch_size,beta,channel,dropout_rate,kernel_size,latent_dim,lr,norm,num_resnets,stride
train_vae_tune_dd20d_00000,RUNNING,10.128.0.35:10428,1,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00001,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00002,PENDING,,1,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00003,PENDING,,5,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00004,PENDING,,1,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00005,PENDING,,5,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00006,PENDING,,1,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00007,PENDING,,5,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00008,PENDING,,1,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00009,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)"


Trial name,status,loc,alpha,batch_size,beta,channel,dropout_rate,kernel_size,latent_dim,lr,norm,num_resnets,stride
train_vae_tune_dd20d_00000,RUNNING,10.128.0.35:10428,1,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00001,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00002,PENDING,,1,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00003,PENDING,,5,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00004,PENDING,,1,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00005,PENDING,,5,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00006,PENDING,,1,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00007,PENDING,,5,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00008,PENDING,,1,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00009,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)"


Trial name,status,loc,alpha,batch_size,beta,channel,dropout_rate,kernel_size,latent_dim,lr,norm,num_resnets,stride
train_vae_tune_dd20d_00000,RUNNING,10.128.0.35:10428,1,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00001,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00002,PENDING,,1,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00003,PENDING,,5,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00004,PENDING,,1,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00005,PENDING,,5,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00006,PENDING,,1,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00007,PENDING,,5,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00008,PENDING,,1,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00009,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)"


Trial name,status,loc,alpha,batch_size,beta,channel,dropout_rate,kernel_size,latent_dim,lr,norm,num_resnets,stride
train_vae_tune_dd20d_00000,RUNNING,10.128.0.35:10428,1,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00001,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00002,PENDING,,1,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00003,PENDING,,5,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00004,PENDING,,1,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00005,PENDING,,5,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00006,PENDING,,1,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00007,PENDING,,5,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00008,PENDING,,1,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00009,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)"


Trial name,status,loc,alpha,batch_size,beta,channel,dropout_rate,kernel_size,latent_dim,lr,norm,num_resnets,stride
train_vae_tune_dd20d_00000,RUNNING,10.128.0.35:10428,1,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00001,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00002,PENDING,,1,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00003,PENDING,,5,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00004,PENDING,,1,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00005,PENDING,,5,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00006,PENDING,,1,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00007,PENDING,,5,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00008,PENDING,,1,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00009,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)"


Trial name,status,loc,alpha,batch_size,beta,channel,dropout_rate,kernel_size,latent_dim,lr,norm,num_resnets,stride
train_vae_tune_dd20d_00000,RUNNING,10.128.0.35:10428,1,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00001,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00002,PENDING,,1,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00003,PENDING,,5,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00004,PENDING,,1,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00005,PENDING,,5,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00006,PENDING,,1,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00007,PENDING,,5,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00008,PENDING,,1,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00009,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)"


Trial name,status,loc,alpha,batch_size,beta,channel,dropout_rate,kernel_size,latent_dim,lr,norm,num_resnets,stride
train_vae_tune_dd20d_00000,RUNNING,10.128.0.35:10428,1,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00001,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00002,PENDING,,1,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00003,PENDING,,5,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00004,PENDING,,1,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00005,PENDING,,5,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00006,PENDING,,1,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00007,PENDING,,5,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00008,PENDING,,1,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)"
train_vae_tune_dd20d_00009,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)"


Result for train_vae_tune_dd20d_00000:
  date: 2022-04-13_11-57-57
  dice: 0.7010906934738159
  done: false
  experiment_id: 603cf124e44a4f5998b0c2161a9ef29f
  hostname: ayon-playground
  iterations_since_restore: 1
  node_ip: 10.128.0.35
  pid: 10428
  time_since_restore: 39.42370557785034
  time_this_iter_s: 39.42370557785034
  time_total_s: 39.42370557785034
  timestamp: 1649851077
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: dd20d_00000
  


Trial name,status,loc,alpha,batch_size,beta,channel,dropout_rate,kernel_size,latent_dim,lr,norm,num_resnets,stride,iter,total time (s),dice
train_vae_tune_dd20d_00000,RUNNING,10.128.0.35:10428,1,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",1.0,39.4237,0.701091
train_vae_tune_dd20d_00001,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00002,PENDING,,1,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00003,PENDING,,5,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00004,PENDING,,1,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00005,PENDING,,5,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00006,PENDING,,1,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00007,PENDING,,5,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00008,PENDING,,1,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00009,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)",,,


Trial name,status,loc,alpha,batch_size,beta,channel,dropout_rate,kernel_size,latent_dim,lr,norm,num_resnets,stride,iter,total time (s),dice
train_vae_tune_dd20d_00000,RUNNING,10.128.0.35:10428,1,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",1.0,39.4237,0.701091
train_vae_tune_dd20d_00001,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00002,PENDING,,1,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00003,PENDING,,5,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00004,PENDING,,1,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00005,PENDING,,5,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00006,PENDING,,1,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00007,PENDING,,5,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00008,PENDING,,1,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00009,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)",,,




Trial name,status,loc,alpha,batch_size,beta,channel,dropout_rate,kernel_size,latent_dim,lr,norm,num_resnets,stride,iter,total time (s),dice
train_vae_tune_dd20d_00000,RUNNING,10.128.0.35:10428,1,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",1.0,39.4237,0.701091
train_vae_tune_dd20d_00001,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00002,PENDING,,1,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00003,PENDING,,5,30,0.05,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00004,PENDING,,1,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00005,PENDING,,5,30,0.1,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00006,PENDING,,1,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00007,PENDING,,5,30,0.5,"(32, 64, 128)",0.4,5,256,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00008,PENDING,,1,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)",,,
train_vae_tune_dd20d_00009,PENDING,,5,30,0.01,"(32, 64, 128)",0.4,5,512,0.0001,INSTANCE,1,"(2, 2, 4)",,,


2022-04-13 11:58:11,901	ERROR tune.py:635 -- Trials did not complete: [train_vae_tune_dd20d_00000, train_vae_tune_dd20d_00001, train_vae_tune_dd20d_00002, train_vae_tune_dd20d_00003, train_vae_tune_dd20d_00004, train_vae_tune_dd20d_00005, train_vae_tune_dd20d_00006, train_vae_tune_dd20d_00007, train_vae_tune_dd20d_00008, train_vae_tune_dd20d_00009, train_vae_tune_dd20d_00010, train_vae_tune_dd20d_00011, train_vae_tune_dd20d_00012, train_vae_tune_dd20d_00013, train_vae_tune_dd20d_00014, train_vae_tune_dd20d_00015, train_vae_tune_dd20d_00016]
2022-04-13 11:58:11,902	INFO tune.py:639 -- Total run time: 55.43 seconds (55.18 seconds for the tuning loop).


Best hyperparameters found were:  {'lr': 0.0001, 'latent_dim': 256, 'kernel_size': 5, 'dropout_rate': 0.4, 'alpha': 1, 'beta': 0.01, 'norm': 'INSTANCE', 'channel': (32, 64, 128), 'stride': (2, 2, 4), 'num_resnets': 1, 'batch_size': 30}
