In [1]:
import os
import time
import sys
import math
import gc
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch as tc
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data

import torchvision
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
from torch.autograd import Variable

#get_ipython().magic('matplotlib inline')
from PIL import Image

import torch.nn.functional as F

### System properties and libs currently in use
- We have developed using python 3.5.x, pytorch 0.2.1
- No significant attention was given to backwards compatibility

In [2]:
print('__Python VERSION:', sys.version)
print('__pyTorch VERSION:', torch.__version__)
print('__CUDA VERSION')
#from subprocess import call
#call(["nvcc", "--version"])
print('__CUDNN VERSION:', torch.backends.cudnn.version())
print('__Number CUDA Devices:', torch.cuda.device_count())
print('__Devices')
#call(["nvidia-smi", "--format=csv", "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"])
print('Active CUDA Device: GPU', torch.cuda.current_device())

__Python VERSION: 3.5.4 |Continuum Analytics, Inc.| (default, Aug 14 2017, 13:41:13) [MSC v.1900 64 bit (AMD64)]
__pyTorch VERSION: 0.2.1+a4fc05a
__CUDA VERSION
__CUDNN VERSION: None
__Number CUDA Devices: 1
__Devices
Active CUDA Device: GPU 0


### Utilities
- Saving images and models

In [3]:
def save_images(netG, noise, outputDir,epoch):
   # the first 64 samples from the mini-batch are saved.
   fake,_ = netG(fixed_noise)
   vutils.save_image(fake.data[0:64,:,:,:],'%s/fake_samples_epoch_%03d.png' % (outputDir, epoch), nrow=8)

def save_models(netG, netD, outputDir, epoch):
   torch.save(netG.state_dict(), '%s/netG_epoch_%d.pth' % (outputDir, epoch))
   torch.save(netD.state_dict(), '%s/netD_epoch_%d.pth' % (outputDir, epoch))

In [4]:
cudnn.benchmark = True

use_gpu = torch.cuda.is_available()
#use_gpu = False
if use_gpu:
    print("You are using CUDA. If it is not what you want, manually set this as False!")

You are using CUDA. If it is not what you want, manually set this as False!


### Output Directory
- This is where images will be saved to

In [5]:
outputDir = 'outputdir_train_classifier'

try:
    os.makedirs(outputDir)
except OSError as err:
    print("OS error: {0}".format(err))

OS error: [WinError 183] Não é possível criar um arquivo já existente: 'outputdir_train_classifier'


### Dataset definition and hyperparameter setting
- Changing dataset name alters network architecture parameters
- Currently supporting few datasets
- Hyperparameters defined according to Radford et al. (2015)

Valores típicos são

nc = 3,
ngpu = 1,
nz = 100,
ngf = 64,
ndf = 64,
n_extra_d = 0,
n_extra_g = 1,
imageSize = 64

In [6]:
batch_size=64

# This should, in the future, be set in CLI
chosen_dataset = 'CIFAR10'

datasets = {
    'MNIST': torchvision.datasets.MNIST,
    'CIFAR10': torchvision.datasets.CIFAR10,
    'ANIME': '/home/gabriel/Redes Neurais/Projeto_Final_GANS/Tutorial_2/dataset/min_anime-faces',
}

dataset = datasets[chosen_dataset]

In [7]:
possible_parameters = {
    'MNIST': {
        'ndf': 64,
        'ngf': 64,
        'nz': 50,
        'nc': 1,
        'n_classes' : 10,
    },
    'CIFAR10': {
        'ndf': 64,
        'ngf': 64,
        'nz': 100,
        'nc': 3,
        'n_extra_d' : 0,
        'n_extra_g' : 0, # Aqui a jogada é que o gerador deve ser mais poderoso q o detetive
        'imageSize' : 32,
        'n_classes' : 10,
        'ngpu' : 1,
    },
    'ANIME': {
        'nc' : 3,
        'ngpu' : 1,
        'nz' : 100,
        'ngf' : 64,
        'ndf' : 64,
        'n_extra_d' : 0,
        'n_extra_g' : 0, # Aqui a jogada é que o gerador deve ser mais poderoso q o detetive
        'imageSize' : 64,
        'n_classes' : 1
    }
}

In [8]:
ngf = possible_parameters[chosen_dataset]['ngf']
ndf = possible_parameters[chosen_dataset]['ndf']
nz = possible_parameters[chosen_dataset]['nz']
nc = possible_parameters[chosen_dataset]['nc']
imageSize = possible_parameters[chosen_dataset]['imageSize']
n_classes = possible_parameters[chosen_dataset]['n_classes']
ngpu = possible_parameters[chosen_dataset]['ngpu']
n_extra_d = possible_parameters[chosen_dataset]['n_extra_d']
n_extra_g = possible_parameters[chosen_dataset]['n_extra_g']

## Creating the Dataset!

In [9]:
if dataset == 'ANIME':
    dataset = dset.ImageFolder(
        root='/home/gabriel/Redes Neurais/Projeto_Final_GANS/Tutorial_2/dataset/min_anime-faces',
        transform=transforms.Compose([
                transforms.Scale((imageSize, imageSize)),
                # transforms.CenterCrop(opt.imageSize),
                transforms.ToTensor(),
                #transforms.Normalize((0.5,0.5,0.5), (0.5,0.5,0.5)), # bring images to (-1,1)
            ])
    )
else:
    transform = transforms.Compose([
                    transforms.Scale((imageSize, imageSize)),
                    transforms.ToTensor(),
                    transforms.Normalize((0.5,0.5,0.5), (0.5,0.5,0.5)), # bring images to (-1,1)
                ]) 
    dataset_done = dataset('./datasets', train=True, download=True, transform=transform)
    dataloader = tc.utils.data.DataLoader(dataset_done, batch_size=batch_size, shuffle=True, num_workers=1
)


Files already downloaded and verified


## Definição dos modelos
- Model is a DCGAN
- Images are sized (nc, 32, 32)

In [10]:
class _netD_1(nn.Module):
    def __init__(self, ngpu, nz, nc, ndf,  n_extra_layers_d, n_classes):
        super(_netD_1, self).__init__()
        self.ngpu = ngpu
        self.conv1 = nn.Conv2d(in_channels = nc, out_channels = ndf, kernel_size=4, stride=2, padding=1, bias=False)
        self.conv2 = nn.Conv2d(in_channels = ndf, out_channels = ndf*2, kernel_size=4, stride=2, padding=1, bias=False)
        self.batch2 = nn.BatchNorm2d(ndf * 2)
        self.conv3 = nn.Conv2d(in_channels = ndf*2, out_channels = ndf*4, kernel_size=4, stride=2, padding=1, bias=False)
        self.batch3 = nn.BatchNorm2d(ndf * 4)
        self.conv4 = nn.Conv2d(in_channels = ndf*4, out_channels = ndf*8, kernel_size=4, stride=2, padding=1, bias=False)
        self.batch4 = nn.BatchNorm2d(ndf * 8)
        
        self.final_conv = nn.Conv2d(in_channels=ndf*8, out_channels=n_classes+1,kernel_size=2,stride=1,padding=0,bias=False)
        
    def forward(self, x):
        x = F.leaky_relu(self.conv1(x),0.2,inplace=True)
        x = F.leaky_relu(self.batch2(self.conv2(x)),0.2,inplace=True)
        x = F.leaky_relu(self.batch3(self.conv3(x)),0.2,inplace=True)
        x = F.leaky_relu(self.batch4(self.conv4(x)),0.2,inplace=True)
        x = self.final_conv(x)
        
        x = F.sigmoid(x)

        return(x)

In [11]:
class _netG_1(nn.Module):
    def __init__(self, ngpu, nz, nc , ngf, n_extra_layers_g):
        super(_netG_1, self).__init__()
        self.ngpu = ngpu
        self.z = None
        self.convt1 = nn.ConvTranspose2d(in_channels=nz, out_channels=ngf * 8, kernel_size=4, stride=2, padding=0, bias=False)
        self.batch1 = nn.BatchNorm2d(ngf*8)
        self.convt2 = nn.ConvTranspose2d(in_channels=ngf * 8, out_channels=ngf * 4, kernel_size=4, stride=2, padding=1, bias=False)
        self.batch2 = nn.BatchNorm2d(ngf*4)
        self.convt3 = nn.ConvTranspose2d(in_channels=ngf * 4, out_channels=ngf * 2, kernel_size=4, stride=2, padding=1, bias=False)
        self.batch3 = nn.BatchNorm2d(ngf*2)
        self.convt4 = nn.ConvTranspose2d(in_channels=ngf*2, out_channels=ngf, kernel_size=4, stride=2, padding=1, bias=False)
        self.batch4 = nn.BatchNorm2d(ngf)
        
        self.final_conv = nn.ConvTranspose2d(in_channels=ngf, out_channels=nc, kernel_size=1, stride=1, padding=0, bias=False)
        
    def forward(self, x):
        x = F.relu(self.batch1(self.convt1(x)))
        x = F.relu(self.batch2(self.convt2(x)))
        x = F.relu(self.batch3(self.convt3(x)))
        x = F.relu(self.batch4(self.convt4(x)))
        x = self.final_conv(x)
        x = F.tanh(x)
        
        return (x),self.z

In [12]:
print(ngpu, nz, nc, ngf, n_extra_g)
netG = _netG_1(ngpu, nz, nc, ngf, n_extra_g)
#netG_parallel = torch.nn.DataParallel(_netG_1(ngpu, nz, nc, ngf, n_extra_g))
print(ngpu, nz, nc, ndf, n_extra_d,n_classes)
netD = _netD_1(ngpu, nz, nc, ndf, n_extra_d,n_classes)
#netD_parallel = torch.nn.DataParallel(_netD_1(ngpu, nz, nc, ndf, n_extra_d))

1 100 3 64 0
1 100 3 64 0 10


## Inicializador de pesos

In [13]:
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)

In [14]:
netG.apply(weights_init)
netD.apply(weights_init)
print(netG, '\n', netD)

_netG_1 (
  (convt1): ConvTranspose2d(100, 512, kernel_size=(4, 4), stride=(2, 2), bias=False)
  (batch1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True)
  (convt2): ConvTranspose2d(512, 256, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
  (batch2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True)
  (convt3): ConvTranspose2d(256, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
  (batch3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True)
  (convt4): ConvTranspose2d(128, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
  (batch4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
  (final_conv): ConvTranspose2d(64, 3, kernel_size=(1, 1), stride=(1, 1), bias=False)
) 
 _netD_1 (
  (conv1): Conv2d(3, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
  (conv2): Conv2d(64, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
  (batch2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affi

## Losses
- Binary Cross-Entropy is used to differentiate real and fake images
- Class loss should be Cross-Entropy

In [15]:
criterion = nn.BCELoss()
criterion_MSE = nn.MSELoss()

## Sizes of the tensors

In [16]:
input = torch.FloatTensor(batch_size, 3, imageSize, imageSize)
print(input.size())
noise = torch.FloatTensor(batch_size, nz, 1, 1)
print(noise.size())

torch.Size([64, 3, 32, 32])
torch.Size([64, 100, 1, 1])


In [17]:
binary=False
#Ele testa pergunta se vc quer que o seu Z venha da distribuição bernoulli
if binary:
    bernoulli_prob = torch.FloatTensor(batch_size, nz, 1, 1).fill_(0.5)
    fixed_noise = torch.bernoulli(bernoulli_prob)
else:
    fixed_noise = torch.FloatTensor(batch_size, nz, 1, 1).normal_(0, 1)

In [18]:
label = torch.FloatTensor(batch_size,n_classes)
print('Label size:', label.size())
real_label = 1
fake_label = 0

Label size: torch.Size([64, 10])


In [19]:
one_hot = torch.LongTensor(64, n_classes+1).zero_()


## Broadcast to GPU

In [20]:
if use_gpu:
    netD.cuda()
    netG.cuda()
    criterion = criterion.cuda()
    criterion_MSE = criterion_MSE.cuda()
    input,label = input.cuda(), label.cuda()
    noise, fixed_noise = noise.cuda(), fixed_noise.cuda()

## Create Variables
- Convert frequently used tensors to Variables, avoids broadcasting things to GPU and definition overheads

In [21]:
input = Variable(input)
label = Variable(label)
noise = Variable(noise)
fixed_noise = Variable(fixed_noise)
one_hot = Variable(one_hot)

## Optimizer Parameters
- Following the lead of Radford et al., 2015:

    <b>
    1. beta1 = 0.5
    2. lr = 0.0002
    </b>
    
- For WGAN, parameters follow Arjovsky et al., 2017:

    <b>
    1. clamp_lower = clamp_upper = 0.1
    2. Optimizer: Adam(lr=1e-5, beta1=0.5, beta2=0.999)
    </b>

In [22]:
beta1, beta2 = 0.5, 0.999
lr = 2.0e-4
lr_wgan = 5.0e-5
optimizerD = optim.Adam(netD.parameters(), lr = lr, betas = (beta1, beta2))
#optimizerD = optim.Adam(netD_parallel.parameters(), lr = lr, betas = (beta1, beta2))

optimizerG = optim.Adam(netG.parameters(), lr = lr, betas = (beta1, beta2))
#optimizerG = optim.Adam(netG_parallel.parameters(), lr = lr, betas = (beta1, beta2))

optimizerD_wgan = optim.RMSprop(netD.parameters(), lr = lr_wgan)
optimizerG_wgan = optim.RMSprop(netG.parameters(), lr = lr_wgan)

In [23]:
test_input_G = tc.randn(64, 100, 1, 1)
test_input_G = Variable(test_input_G)
test_output_G,_ = netG(test_input_G.cuda())

test_output_D = netD(test_output_G)

print('Generator output size:', test_output_G.size())
print('Discriminator output size:', test_output_D.size())

Generator output size: torch.Size([64, 3, 32, 32])
Discriminator output size: torch.Size([64, 11, 1, 1])



## Treinamento 

In [24]:
def train_gan(num_epochs, dataloader, netD, netG, d_labelSmooth, outputDir,
              model_option=1, binary=False, epoch_interval=100,
              D_steps=1, G_steps=1):
    # This validation is subjective. WGAN-GP uses 100 D_steps...
    assert D_steps < 5, "Keep it low, D_steps is too high."
    assert G_steps < 3, "Keep it low, G_steps is too high."
    #assert batch_size % D_steps == 0, "Use batch_size multiple of D_steps."
    
    print('Lets train!')
    for epoch in range(num_epochs):
        start_iter = time.time()  
        D_x = 0
        D_G_z1 = 0
        D_G_z2 = 0
        errD_acum = 0
        errG_acum = 0

        for batch, data in enumerate(dataloader, 0):
            for step in range(D_steps):
                #############################################################
                # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
                # 1A - Train the detective network in the Real Dataset
                #############################################################
                
                netD.zero_grad()
                one_hot.data.fill_(0)
                #real_cpu, _ = data
                start = step*(int(data[0].size()[0]/D_steps))
                end = (step+1)*int(data[0].size()[0]/D_steps)
                #real_cpu = data[0][step*(int(data[0].size()[0]/D_steps)):(step+1)*int(data[0].size()[0]/D_steps)]
                inp, target = data

                #Aqui começa um one-hot-encoding
                target_ = tc.unsqueeze(target,1)
                one_hot.data.resize_(target_.size()[0],one_hot.size()[1])
                one_hot.scatter_(1, target_, 1)
                real_cpu = data[0][start:end]
                real_cpu = real_cpu.cuda()
                if (epoch == 0 and step == 0):
                    vutils.save_image(real_cpu[0:64,:,:,:], '%s/real_samples.png' % outputDir, nrow=8)

                batch_size = real_cpu.size(0)
                input.data.resize_(real_cpu.size()).copy_(real_cpu)
                ones_ = Variable(tc.ones(inp.size(0),1)).cuda()
                label.data.resize_(inp.size(0),label.size(1))
                label2 = Variable(tc.cat((label.data,ones_.data),1))

                label2.data.resize_(batch_size,one_hot.size(1)).copy_(one_hot.data) # use smooth label for discriminator

                output = netD(input)
                errD_real = criterion(output.squeeze(),label2)
                errD_real.backward()
                
                D_x += output.data.mean()
                
                #######################################################
                # 1B - Train the detective network in the False Dataset
                #######################################################
                
                noise.data.resize_(batch_size, nz, 1, 1)
                if binary:
                    bernoulli_prob.resize_(noise.data.size())
                    noise.data.copy_(2*(torch.bernoulli(bernoulli_prob)-0.5))
                else:
                    noise.data.normal_(0, 1)
                fake,_ = netG(noise)
                label.data.fill_(fake_label)
                output = netD(fake.detach()) # add ".detach()" to avoid backprop through G
                label3 = Variable(tc.cat((label.data,tc.zeros(inp.size(0),1).cuda()),1))
                errD_fake = criterion(output.squeeze(), label3)
                errD_fake.backward() # gradients for fake/real will be accumulated
                
                D_G_z1 += output.data.mean()

                errD_acum += errD_real.data[0] + errD_fake.data[0]

                optimizerD.step() # .step() can be called once the gradients are computed

            for step in range(G_steps):
                ####################################################################################
                # (2) Update G network: maximize log(D(G(z)))
                # Train the faker with de output from the Detective (but don't train the Detective)
                ####################################################################################
                
                netG.zero_grad()
                label.data.fill_(real_label) # fake labels are real for generator cost
                output = netD(fake)
                errG = criterion(output.squeeze(), label2)
                errG.backward(retain_graph = False) # True if backward through the graph for the second time
                #errG.backward() # True if backward through the graph for the second time

                '''if model_option == 2: # with z predictor
                    errG_z = criterion_MSE(z_prediction, noise)
                    errG_z.backward()'''
                
                D_G_z2 += output.data.mean()
                errG_acum += errG.data[0]
                optimizerG.step()

        print('epoch = ',epoch)

        end_iter = time.time()        

        print('[%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f Elapsed %.2f s'
              % (epoch, num_epochs, errD_acum/D_steps, errG_acum/G_steps, D_x, D_G_z1, D_G_z2, end_iter-start_iter))

        #Save a grid with the pictures from the dataset, up until 64
        save_images(netG = netG, noise = fixed_noise, outputDir = outputDir, epoch = epoch)

        if epoch % epoch_interval == 0:
            # do checkpointing
            save_models(netG = netG, netD = netD, outputDir = outputDir, epoch = epoch)
            

In [None]:
def train_WGAN(num_epochs, dataloader, netD, netG, outputDir, D_iters):
    gen_iterations = 0
    for epoch in range(num_epochs):
        data_iter = iter(dataloader)
        i = 0
        self.D_iters = D_iters
        while i < len(dataloader):
            ############################
            # (1) Update D network
            ###########################
            for p in netD.parameters(): # reset requires_grad
                p.requires_grad = True # they are set to False below in netG update

            # train the discriminator D_iters times
            if gen_iterations < 25 or gen_iterations % 500 == 0:
                D_iters = 100
            else:
                D_iters = self.D_iters
            j = 0
            while j < D_iters and i < len(dataloader):
                j += 1

                # clamp parameters to a cube
                for p in netD.parameters():
                    p.data.clamp_(clamp_lower, clamp_upper)

                data = data_iter.next()
                i += 1

                # train with real
                real_cpu, _ = data
                netD.zero_grad()
                batch_size = real_cpu.size(0)

                if use_cuda:
                    real_cpu = real_cpu.cuda()
                input.resize_as_(real_cpu).copy_(real_cpu)
                inputv = Variable(input)

                errD_real = netD(inputv)
                errD_real.backward(one)

                # train with fake
                noise.resize_(opt.batchSize, nz, 1, 1).normal_(0, 1)
                noisev = Variable(noise, volatile = True) # totally freeze netG
                fake = Variable(netG(noisev).data)
                inputv = fake
                errD_fake = netD(inputv)
                errD_fake.backward(mone)
                errD = errD_real - errD_fake
                optimizerD.step()

            ############################
            # (2) Update G network
            ###########################
            for p in netD.parameters():
                p.requires_grad = False # to avoid computation
            netG.zero_grad()
            # in case our last batch was the tail batch of the dataloader,
            # make sure we feed a full batch of noise
            noise.resize_(batch_size, nz, 1, 1).normal_(0, 1)
            noisev = Variable(noise)
            fake = netG(noisev)
            errG = netD(fake)
            errG.backward(one)
            optimizerG.step()
            gen_iterations += 1

            print('[%d/%d][%d/%d][%d] Loss_D: %f Loss_G: %f Loss_D_real: %f Loss_D_fake %f'
                % (epoch, num_epochs, i, len(dataloader), gen_iterations,
                errD.data[0], errG.data[0], errD_real.data[0], errD_fake.data[0]))
            if gen_iterations % 500 == 0:
                real_cpu = real_cpu.mul(0.5).add(0.5)
                vutils.save_image(real_cpu, '{0}/real_samples.png'.format(outputDir))
                fake = netG(Variable(fixed_noise, volatile=True))
                fake.data = fake.data.mul(0.5).add(0.5)
                vutils.save_image(fake.data, '{0}/fake_samples_{1}.png'.format(outputDir, gen_iterations))


In [None]:
num_epochs = 100
d_labelSmooth = 0.2

train_gan(num_epochs, dataloader, netD,netG,d_labelSmooth, outputDir)

Lets train!
epoch =  0
[0/100] Loss_D: 175.1227 Loss_G: 401.2646 D(x): 62.1774 D(G(z)): 11.2965 / 7.1651 Elapsed 347.27 s
epoch =  1
[1/100] Loss_D: 135.9184 Loss_G: 486.9321 D(x): 62.9956 D(G(z)): 7.8007 / 4.3787 Elapsed 462.62 s
epoch =  2
[2/100] Loss_D: 118.9677 Loss_G: 504.4207 D(x): 61.8220 D(G(z)): 8.9303 / 5.5620 Elapsed 462.47 s
epoch =  3
[3/100] Loss_D: 106.3372 Loss_G: 530.0243 D(x): 61.9986 D(G(z)): 8.7394 / 5.5370 Elapsed 462.43 s
epoch =  4
[4/100] Loss_D: 93.3501 Loss_G: 559.6893 D(x): 63.0682 D(G(z)): 7.6923 / 4.8354 Elapsed 462.46 s
epoch =  5
[5/100] Loss_D: 82.7167 Loss_G: 595.5511 D(x): 63.6585 D(G(z)): 7.1837 / 4.2380 Elapsed 461.83 s
epoch =  6
[6/100] Loss_D: 74.9327 Loss_G: 616.8739 D(x): 63.5913 D(G(z)): 7.2211 / 4.1723 Elapsed 462.33 s
epoch =  7
[7/100] Loss_D: 67.3900 Loss_G: 639.7911 D(x): 63.4693 D(G(z)): 7.3892 / 4.0358 Elapsed 461.92 s
epoch =  8
[8/100] Loss_D: 60.4419 Loss_G: 666.1385 D(x): 63.5175 D(G(z)): 7.3082 / 3.7780 Elapsed 460.35 s
epoch =  9
