In [1]:
# Based on the PyTorch hyperparameter tuning tutorial: https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html

In [2]:
!pip install ray[tune]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import os
import torch
import torch.nn as nn
import numpy as np
import torchvision.datasets
import torchvision.utils as vutils
from torchvision import transforms
from torch.utils.data import Subset
from torch.utils.data import DataLoader
from torch.nn.modules.flatten import Flatten
import time, copy
import matplotlib.pyplot as plt
import sklearn.metrics as metrics

from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

from ray.air import session
from ray.air.checkpoint import Checkpoint
from ray.air.config import ScalingConfig

from functools import partial


# Select the compute device once for the notebook: CUDA when PyTorch can see a
# GPU, otherwise the CPU. The trailing bare expression displays the choice.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [4]:
# One-off download of the Flowers102 training split.
# The root is the same absolute directory that load_data() reads from with
# download=False, so the files are found regardless of the current working
# directory of the process that later builds the DataLoader.
image_size = 64  # images are resized and centre-cropped to image_size x image_size
transform = transforms.Compose([
    transforms.Resize(image_size),
    transforms.CenterCrop(image_size),
    transforms.ToTensor(),
    # Per channel: (x - 0.5) / 0.5
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])
flowers_train = torchvision.datasets.Flowers102('/content', split="train", transform=transform, download=True)


In [5]:
# Define a Data loader
def load_data(image_size:int=64, batch_size=102, data_root='/content'):
  """Build a shuffled DataLoader over the Flowers102 training split.

  Args:
    image_size: Side length, in pixels, that every image is resized and
      centre-cropped to. The value passed by the caller is honoured (it used
      to be overwritten with 64 inside the function).
    batch_size: Number of samples per batch.
    data_root: Directory handed to Flowers102 as its root. The dataset must
      already be present there because it is opened with download=False.

  Returns:
    A DataLoader over the transformed training split with shuffle=True.
  """
  transform = transforms.Compose([
      transforms.Resize(image_size),
      transforms.CenterCrop(image_size),
      transforms.ToTensor(),
      # Per channel: (x - 0.5) / 0.5
      transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
  ])
  flowers_train = torchvision.datasets.Flowers102(data_root, split="train", transform=transform, download=False)
  return DataLoader(flowers_train, batch_size=batch_size, shuffle=True)

In [6]:
# Define a configurable neural network
class Discriminator(nn.Module):
    """Convolutional discriminator whose width is controlled by ``ndf``.

    Four stride-2 stages (Conv2d -> BatchNorm2d -> LeakyReLU(0.2)) halve the
    spatial size each time while the channel count goes
    nc -> ndf -> 2*ndf -> 4*ndf -> 8*ndf. A final 4x4 convolution followed by
    a sigmoid collapses a 64x64 input to a single value per sample.

    Args:
        nc: Number of channels of the input images.
        ndf: Base number of feature maps.
    """

    def __init__(self, nc, ndf):
        super().__init__()
        channels = [nc, ndf, ndf * 2, ndf * 4, ndf * 8]

        stages = []
        # (nc) x 64 x 64 -> (ndf) x 32 x 32 -> (ndf*2) x 16 x 16
        #   -> (ndf*4) x 8 x 8 -> (ndf*8) x 4 x 4
        for c_in, c_out in zip(channels[:-1], channels[1:]):
            stages.append(nn.Conv2d(c_in, c_out, kernel_size=4, stride=2, padding=1, bias=False))
            stages.append(nn.BatchNorm2d(c_out))
            stages.append(nn.LeakyReLU(0.2, inplace=True))

        # (ndf*8) x 4 x 4 -> 1 x 1 x 1, squashed into (0, 1)
        stages.append(nn.Conv2d(channels[-1], 1, kernel_size=4, stride=1, padding=0, bias=False))
        stages.append(nn.Sigmoid())

        # Layers are registered in the same order as before, so state_dict
        # keys ("pipeline.<index>....") are unchanged.
        self.pipeline = nn.Sequential(*stages)

    def forward(self, input):
        """Run ``input`` through the layer stack and return its output."""
        return self.pipeline(input)

class Generator(nn.Module):
    """Transposed-convolution generator whose width is controlled by ``ngf``.

    A latent tensor of shape (nz) x 1 x 1 is expanded to (ngf*8) x 4 x 4 and
    then doubled in spatial size three times while the channel count shrinks
    8*ngf -> 4*ngf -> 2*ngf -> ngf. Every one of those stages is
    ConvTranspose2d -> BatchNorm2d -> ReLU. A last transposed convolution
    followed by tanh produces an (nc) x 64 x 64 image.

    Args:
        nc: Number of channels of the generated images.
        nz: Number of channels of the latent input.
        ngf: Base number of feature maps.
    """

    def __init__(self, nc, nz, ngf):
        super().__init__()

        # (in_channels, out_channels, stride, padding) for each normalised stage
        stage_specs = [
            (nz, ngf * 8, 1, 0),       # Z           -> (ngf*8) x 4 x 4
            (ngf * 8, ngf * 4, 2, 1),  #             -> (ngf*4) x 8 x 8
            (ngf * 4, ngf * 2, 2, 1),  #             -> (ngf*2) x 16 x 16
            (ngf * 2, ngf, 2, 1),      #             -> (ngf) x 32 x 32
        ]

        stages = []
        for c_in, c_out, stride, padding in stage_specs:
            stages.append(
                nn.ConvTranspose2d(c_in, c_out, kernel_size=4, stride=stride, padding=padding, bias=False)
            )
            stages.append(nn.BatchNorm2d(c_out))
            stages.append(nn.ReLU())

        # (ngf) x 32 x 32 -> (nc) x 64 x 64, squashed into (-1, 1)
        stages.append(nn.ConvTranspose2d(ngf, nc, kernel_size=4, stride=2, padding=1, bias=False))
        stages.append(nn.Tanh())

        # Layers are registered in the same order as before, so state_dict
        # keys ("pipeline.<index>....") are unchanged.
        self.pipeline = nn.Sequential(*stages)

    def forward(self, input):
        """Run ``input`` through the layer stack and return its output."""
        return self.pipeline(input)

In [7]:
def weights_init(m):
    """Re-initialise one module in place; intended for ``Module.apply``.

    The decision is made on the module's class name:
      * names containing "Conv": weights drawn from N(0.0, 0.02);
      * names containing "BatchNorm": weights drawn from N(1.0, 0.02) and
        bias set to 0;
      * anything else is left untouched.

    Args:
        m: The module to (possibly) re-initialise.
    """
    layer_kind = type(m).__name__

    if 'Conv' in layer_kind:
        nn.init.normal_(m.weight.data, mean=0.0, std=0.02)
        return

    if 'BatchNorm' in layer_kind:
        nn.init.normal_(m.weight.data, mean=1.0, std=0.02)
        nn.init.constant_(m.bias.data, val=0)

In [8]:
def train_dcgan(config, checkpoint_dir=None, data_dir=None):
  """Train a Generator/Discriminator pair for one hyperparameter configuration.

  Written as a Ray Tune function trainable: after every epoch it saves both
  networks (with their optimizers) through ``tune.checkpoint_dir`` and reports
  the losses of the epoch's last batch through ``tune.report``.

  Args:
    config: Mapping that must provide the keys 'nz', 'ngf', 'ndf', 'epochs',
      'lr_G', 'lr_D' and 'beta1'.
    checkpoint_dir: When truthy, a directory containing the files
      "checkpoint_G" and "checkpoint_D", each holding a
      (model_state_dict, optimizer_state_dict) tuple that is loaded before
      training starts. The epoch counter still starts at 0 in that case.
    data_dir: Accepted but not read anywhere in this function; the training
      data always comes from ``load_data()`` called with its defaults.

  Returns:
    None. Progress is printed and reported to Tune as a side effect.

  NOTE(review): the run log captured in this notebook shows Ray pointing to
  ``session.report({...}, checkpoint=...)``; it looks like a deprecation hint
  for the checkpoint/report calls used here — confirm and migrate separately.
  """
  nc = 3 # number of channels in training images
  nz = config['nz'] # size of latent vector (generator input)
  ngf = config['ngf'] # size of feature maps in generator
  ndf = config['ndf'] # size of feature maps in discriminator
  
  # Instantiate models
  netD = Discriminator(nc, ndf)
  netG = Generator(nc, nz, ngf)
  
  # Enable multi GPU support if available with DataParallel
  # (this local `device` string shadows the notebook-level `device` object)
  device = "cpu"
  if torch.cuda.is_available():
      device = "cuda:0"
      if torch.cuda.device_count() > 1:
          netD = nn.DataParallel(netD)
          netG = nn.DataParallel(netG)

  netD.to(device)
  netG.to(device)

  # Initialize weights (overwritten again below if a checkpoint is restored)
  netD.apply(weights_init)
  netG.apply(weights_init)
  
  # Number of training epochs
  num_epochs = config['epochs']

  # Learning rate for optimizers
  lr_G = config['lr_G']
  lr_D = config['lr_D']

  # Beta1 hyperparam for Adam optimizers
  beta1 = config['beta1']

  # Binary cross-entropy loss, applied to the discriminator's sigmoid outputs
  criterion = nn.BCELoss()

  # Setup Adam optimizers for both G and D
  optimizerD = torch.optim.Adam(netD.parameters(), lr=lr_D, betas=(beta1, 0.999))
  optimizerG = torch.optim.Adam(netG.parameters(), lr=lr_G, betas=(beta1, 0.999))

  # Restore model and optimizer state when a checkpoint directory is supplied
  if checkpoint_dir:
    # For generator
    model_state, optimizer_state = torch.load(os.path.join(checkpoint_dir, "checkpoint_G"))
    netG.load_state_dict(model_state)
    optimizerG.load_state_dict(optimizer_state)
    # For discriminator
    model_state, optimizer_state = torch.load(os.path.join(checkpoint_dir, "checkpoint_D"))
    netD.load_state_dict(model_state)
    optimizerD.load_state_dict(optimizer_state)

  # Prepare training data (default image size and batch size; data_dir is not used)
  dataloader = load_data()

  # Keep track of training time
  since = time.time()
    
  # Keep track of how loss evolves during training
  # (filled once per batch below, but neither returned nor reported)
  training_curves = {}
  training_curves['G'] = [] # for generator
  training_curves['D'] = [] # for discriminator

  # Establish convention for real and fake labels during training
  real_label = 1.
  fake_label = 0.

  for epoch in range(num_epochs):

    print(f'\nEpoch {epoch+1}/{num_epochs}')
    print('-' * 10)

    # Only inputs[0] of each batch is used; any other elements are ignored.
    for _, inputs in enumerate(dataloader, 0):

        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        ## Train with all-real batch
        optimizerD.zero_grad()
        # Format batch (despite its name, real_cpu is moved to `device`)
        real_cpu = inputs[0].to(device)
        b_size = real_cpu.size(0)
        label = torch.full((b_size,), real_label, dtype=torch.float, device=device)
        # Forward pass real batch through D
        output = netD(real_cpu).view(-1)
        # Calculate loss on all-real batch
        errD_real = criterion(output, label)
        # Calculate gradients for D in backward pass
        errD_real.backward()
        # Mean D output on the real batch; computed but not used afterwards
        D_x = output.mean().item()

        ## Train with all-fake batch
        # Generate batch of latent vectors
        noise = torch.randn(b_size, nz, 1, 1, device=device)
        # Generate fake image batch with G
        fake = netG(noise)
        label.fill_(fake_label)
        # Classify all fake batch with D; detach() keeps this backward pass out of G
        output = netD(fake.detach()).view(-1)
        # Calculate D's loss on the all-fake batch
        errD_fake = criterion(output, label)
        # Calculate the gradients for this batch, accumulated (summed) with previous gradients
        errD_fake.backward()
        # Mean D output on the fake batch before D's update; computed but not used afterwards
        D_G_z1 = output.mean().item()
        # Compute error of D as sum over the fake and the real batches
        errD = errD_real + errD_fake
        # Update D
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        # Since we just updated D, perform another forward pass of all-fake batch through D
        output = netD(fake).view(-1)
        # Calculate G's loss based on this output
        errG = criterion(output, label)
        # Calculate gradients for G
        errG.backward()
        # Mean D output on the fake batch after D's update; computed but not used afterwards
        D_G_z2 = output.mean().item()
        # Update G
        optimizerG.step()

        training_curves['D'].append(errD.item())
        training_curves['G'].append(errG.item())

        print(f'D Loss: {errD.item():.4f}  G Loss: {errG.item():.4f}')

    # End of epoch: save both networks with their optimizers, keyed by the epoch index
    with tune.checkpoint_dir(epoch) as checkpoint_dir:
        path = os.path.join(checkpoint_dir, "checkpoint_D")
        torch.save((netD.state_dict(), optimizerD.state_dict()), path)

        path = os.path.join(checkpoint_dir, "checkpoint_G")
        torch.save((netG.state_dict(), optimizerG.state_dict()), path)

    # The reported values are the losses of the last batch of this epoch, not an epoch average
    tune.report(D_loss=errD.item(),G_loss=errG.item())

  time_elapsed = time.time() - since
  print(f'\nTraining complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')

In [9]:
# Search space sampled by Ray Tune for every trial.
# The three width/latent sizes are drawn from {16, 32, 64, 128, 256} — the same
# values 2**np.random.randint(4, 9) could produce — but declared as a Tune
# categorical so that Tune itself performs (and can seed) the sampling instead
# of NumPy's global random state.
_POWERS_OF_TWO = [2**k for k in range(4, 9)]
config = {
    "nz":     tune.choice(_POWERS_OF_TWO),  # size of the generator's latent vector
    "ngf":    tune.choice(_POWERS_OF_TWO),  # base feature-map count of the generator
    "ndf":    tune.choice(_POWERS_OF_TWO),  # base feature-map count of the discriminator
    "epochs": tune.choice([10]),            # single option: every trial is configured for 10 epochs
    "lr_G":   tune.loguniform(1e-5, 1e-2),  # Adam learning rate of the generator
    "lr_D":   tune.loguniform(1e-5, 1e-2),  # Adam learning rate of the discriminator
    "beta1":  tune.uniform(0, 1),           # first Adam beta, shared by both optimizers
}

In [10]:
# Bound to train_dcgan's `data_dir` keyword via functools.partial inside main();
# train_dcgan itself never reads that argument.
# NOTE(review): the path ends in "checkpoint", so it looks like it was meant as a
# checkpoint location rather than a dataset directory — confirm the intent. It is
# also an absolute path into one user's mounted Drive.
data_dir = "/content/drive/MyDrive/Doutorado/Deep Learning MIT/M6 - DCGAN/checkpoint"

In [11]:
def main(num_samples=20, max_num_epochs=100, gpus_per_trial=1, config=config):
    """Run the Ray Tune search over ``train_dcgan`` and print the best trial.

    Trials are scheduled with ASHA, which minimises the reported "G_loss"
    (grace period of 5 iterations, reduction factor 2). Each trial requests
    2 CPUs and ``gpus_per_trial`` GPUs. When the search ends, the trial whose
    last reported "G_loss" is lowest is printed with its configuration.

    Args:
        num_samples: Number of configurations sampled from ``config``.
        max_num_epochs: Upper bound on iterations per trial given to ASHA
            (``max_t``).
        gpus_per_trial: GPUs requested for every trial.
        config: Tune search space. The default is whatever the module-level
            ``config`` was when this function was defined.

    Returns:
        None.
    """
    objective, direction = "G_loss", "min"

    # Trainable with the notebook-level data_dir pre-bound as a keyword argument.
    trainable = partial(train_dcgan, data_dir=data_dir)

    asha = ASHAScheduler(
        metric=objective,
        mode=direction,
        max_t=max_num_epochs,
        grace_period=5,
        reduction_factor=2,
    )
    cli_reporter = CLIReporter(metric_columns=["G_loss", "D_loss"])

    analysis = tune.run(
        trainable,
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=asha,
        progress_reporter=cli_reporter,
    )

    # "last": compare trials on the final value each one reported.
    winner = analysis.get_best_trial(objective, direction, "last")
    print("Best trial config: {}".format(winner.config))
    print("Best trial final G_loss: {}".format(winner.last_result["G_loss"]))

In [12]:
# Launch the hyperparameter search with the defaults declared on main().
# Optional switch, left disabled:
# NOTE(review): presumably relaxes Tune's check that every reported result
# contains the scheduler's metric — confirm before enabling.
# os.environ['TUNE_DISABLE_STRICT_METRIC_CHECKING']="1"
main()

2023-04-08 18:23:35,492	INFO worker.py:1553 -- Started a local Ray instance.

from ray.air import session

def train(config):
    # ...
    session.report({"metric": metric}, checkpoint=checkpoint)

For more information please see https://docs.ray.io/en/master/tune/api_docs/trainable.html



== Status ==
Current time: 2023-04-08 18:23:37 (running for 00:00:00.20)
Memory usage on this node: 1.9/25.5 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 80.000: None | Iter 40.000: None | Iter 20.000: None | Iter 10.000: None | Iter 5.000: None
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/15.05 GiB heap, 0.0/7.52 GiB objects
Result logdir: /root/ray_results/train_dcgan_2023-04-08_18-23-36
Number of trials: 16/20 (15 PENDING, 1 RUNNING)
+-------------------------+----------+-------------------+----------+----------+-------------+-------------+-------+-------+------+
| Trial name              | status   | loc               |    beta1 |   epochs |        lr_D |        lr_G |   ndf |   ngf |   nz |
|-------------------------+----------+-------------------+----------+----------+-------------+-------------+-------+-------+------|
| train_dcgan_7a714_00000 | RUNNING  | 172.28.0.12:47879 | 0.496684 |       10 | 5.06506e-05 | 0.000174306 |   128 |   128 |   64 |
| train_dcgan_7a7

Trial name,D_loss,G_loss,date,done,episodes_total,experiment_id,hostname,iterations_since_restore,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
train_dcgan_7a714_00000,2.41119,9.56293,2023-04-08_18-25-36,True,,65dd894d48b446dabdf0a689fa7c38ca,3695f14f9796,10,172.28.0.12,47879,True,115.695,11.1496,115.695,1680978336,0,,10,7a714_00000,0.00383878
train_dcgan_7a714_00001,4.66893,0.400654,2023-04-08_18-29-10,True,,8153c4758e77497f95c79d17010cb2ad,3695f14f9796,10,172.28.0.12,48580,True,210.874,20.5897,210.874,1680978550,0,,10,7a714_00001,0.00370193
train_dcgan_7a714_00002,3.45958,0.38466,2023-04-08_18-31-23,True,,fab8ec22d9c14f8c98dc757f32a0bbe8,3695f14f9796,10,172.28.0.12,49669,True,130.161,12.6633,130.161,1680978683,0,,10,7a714_00002,0.00356078
train_dcgan_7a714_00003,0.0142584,6.50717,2023-04-08_18-32-22,True,,c7111d8dbe974256be8185223626cd69,3695f14f9796,5,172.28.0.12,50434,True,54.4911,10.2143,54.4911,1680978742,0,,5,7a714_00003,0.00383019
train_dcgan_7a714_00004,100.0,0.0,2023-04-08_18-35-39,True,,17de7453b72749ef99885e8e2bb57349,3695f14f9796,10,172.28.0.12,50799,True,193.384,18.9431,193.384,1680978939,0,,10,7a714_00004,0.00366688
train_dcgan_7a714_00005,0.00753225,7.71449,2023-04-08_18-37-28,True,,a8617ef8718e4a14b2aae92668bde9e7,3695f14f9796,5,172.28.0.12,51823,True,105.944,20.3416,105.944,1680979048,0,,5,7a714_00005,0.00368857
train_dcgan_7a714_00006,3.59335e-38,99.6656,2023-04-08_18-39-47,True,,06eaffabbf0640f294bbe3659e05760f,3695f14f9796,5,172.28.0.12,52404,True,135.09,26.4374,135.09,1680979187,0,,5,7a714_00006,0.00368404
train_dcgan_7a714_00007,100.0,0.0,2023-04-08_18-43-06,True,,2910d11c8d8f46ce89ddfb7c9aba99dc,3695f14f9796,10,172.28.0.12,53115,True,195.4,19.159,195.4,1680979386,0,,10,7a714_00007,0.00394654
train_dcgan_7a714_00008,2.93637,0.334687,2023-04-08_18-44-34,True,,dfb1fcc05be7498c8b17d813a375c0b4,3695f14f9796,10,172.28.0.12,54142,True,83.9468,8.0882,83.9468,1680979474,0,,10,7a714_00008,0.00353646
train_dcgan_7a714_00009,100.0,0.0,2023-04-08_18-46-19,True,,2eee650ed9cd44c0b87e5e53f2e77e96,3695f14f9796,10,172.28.0.12,54692,True,100.906,9.69681,100.906,1680979579,0,,10,7a714_00009,0.00374365


[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
[2m[36m(func pid=54692)[0m 
[2m[36m(func pid=54692)[0m Epoch 8/10
[2m[36m(func pid=54692)[0m ----------
[2m[36m(func pid=54692)[0m D Loss: 100.0000  G Loss: 0.0000
[2m[36m(func pid=54692)[0m D Loss: 100.0000  G Loss: 0.0000
[2m[36m(func pid=54692)[0m D Loss: 100.0000  G Loss: 0.0000
[2m[36m(func pid=54692)[0m D Loss: 100.0000  G Loss: 0.0000
[2m[36m(func pid=54692)[0m D Loss: 100.0000  G Loss: 0.0000
== Status ==
Current time: 2023-04-08 18:45:55 (running for 00:22:18.50)
Memory usage on this node: 4.6/25.5 GiB 
Using AsyncHyperBand: num_stopped=3
Bracket: Iter 80.000: None | Iter 40.000: None | Iter 20.000: None | Iter 10.000: -0.3596733510494232 | Iter 5.000: -0.7604804039001465
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/15.05 GiB heap, 0.0/7.52 GiB objects
Result logdir: /root/ray_results/train_dcgan_2023-04-08_18-23-36
Number of trials: 20/20 (10 PENDING, 1 RUNNING, 9 TERMINATED)


2023-04-08 18:58:48,331	INFO tune.py:798 -- Total run time: 2111.44 seconds (2111.30 seconds for the tuning loop).


== Status ==
Current time: 2023-04-08 18:58:48 (running for 00:35:11.32)
Memory usage on this node: 4.5/25.5 GiB 
Using AsyncHyperBand: num_stopped=11
Bracket: Iter 80.000: None | Iter 40.000: None | Iter 20.000: None | Iter 10.000: -0.004764430224895477 | Iter 5.000: -2.4467177987098694
Resources requested: 0/4 CPUs, 0/1 GPUs, 0.0/15.05 GiB heap, 0.0/7.52 GiB objects
Result logdir: /root/ray_results/train_dcgan_2023-04-08_18-23-36
Number of trials: 20/20 (20 TERMINATED)
+-------------------------+------------+-------------------+----------+----------+-------------+-------------+-------+-------+------+-------------+---------------+
| Trial name              | status     | loc               |    beta1 |   epochs |        lr_D |        lr_G |   ndf |   ngf |   nz |      G_loss |        D_loss |
|-------------------------+------------+-------------------+----------+----------+-------------+-------------+-------+-------+------+-------------+---------------|
| train_dcgan_7a714_00000 | TERM