In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
# import torchvision.datasets as dset
# import torchvision.transforms as transforms
# import torchvision.utils as vutils
from torch.autograd import Variable
import torch.autograd as autograd
import numpy as np
import pescador
import librosa

In [2]:
class PhaseShuffle(nn.Module):
    """
    Performs phase shuffling, i.e. shifting feature axis of a 3D tensor
    by a random integer in {-n, n} and performing reflection padding where
    necessary
    """
    def __init__(self, n):
        super(PhaseShuffle, self).__init__()
        self.n = n
        
    def forward(self, x):
        # Make sure to use PyTorch to generate number RNG state is all shared
        k = int(torch.Tensor(1).random_(0,self.n + 1)) - 5
        
        # Return if no phase shift
        if k == 0:
            return x
        
        # Slice feature dimension
        if k > 0:
            x_trunc = x[:, :, :-k]
            pad = (0, k)
        else:
            x_trunc = x[:, :, -k:]
            pad = (-k, 0)
        
        # Reflection padding
        x_shuffle = F.pad(x_trunc, pad, mode='reflect')
        assert x_shuffle.shape == x.shape, "{}, {}".format(x_shuffle.shape, x.shape)
        return x_shuffle
        


class WaveGANGenerator(nn.Module):
    def __init__(self, d, ngpus, c=1, latent_dim=100, verbose=False):
        super(WaveGANGenerator, self).__init__()
        self.ngpus = ngpus
        self.d = d
        self.c = c
        self.latent_dim = latent_dim
        self.fc1 = nn.Linear(100, 256*d)
        self.verbose = verbose
        
        self.tconv1 = nn.ConvTranspose1d(16*d, 8*d, 25, stride=4, padding=11, output_padding=1) 
        self.tconv2 = nn.ConvTranspose1d(8*d, 4*d, 25, stride=4, padding=11, output_padding=1) 
        self.tconv3 = nn.ConvTranspose1d(4*d, 2*d, 25, stride=4, padding=11, output_padding=1) 
        self.tconv4 = nn.ConvTranspose1d(2*d, d, 25, stride=4, padding=11, output_padding=1) 
        self.tconv5 = nn.ConvTranspose1d(d, c, 25, stride=4, padding=11, output_padding=1) 

    def forward(self, x):
        
        x = F.relu(self.fc1(x))

        x = x.view(-1, 16*self.d, 16)
        if self.verbose:
            print(x.shape)
            
        x = F.relu(self.tconv1(x))
        if self.verbose:
            print(x.shape)
            
        x = F.relu(self.tconv2(x))
        if self.verbose:
            print(x.shape)
            
        x = F.relu(self.tconv3(x))
        if self.verbose:
            print(x.shape)
            
        x = F.relu(self.tconv4(x))
        if self.verbose:
            print(x.shape)
            
        output = F.tanh(self.tconv5(x))
        if self.verbose:
            print(output.shape)

        return output

    
class WaveGANDiscriminator(nn.Module):
    def __init__(self, d, ngpus, c=1, n=2, verbose=False):
        super(WaveGANDiscriminator, self).__init__()
        self.d = d
        self.ngpus = ngpus
        self.c = c
        self.n = n
        self.verbose = verbose
        # Conv2d(in_channels, out_channels, kernel_size, stride=1, etc.)
        self.conv1 = nn.Conv1d(c, d, 25, stride=4, padding=11)
        self.conv2 = nn.Conv1d(d, 2*d, 25, stride=4, padding=11)
        self.conv3 = nn.Conv1d(2*d, 4*d, 25, stride=4, padding=11)
        self.conv4 = nn.Conv1d(4*d, 8*d, 25, stride=4, padding=11)
        self.conv5 = nn.Conv1d(8*d, 16*d, 25, stride=4, padding=11)
        self.ps1 = PhaseShuffle(n)
        self.ps2 = PhaseShuffle(n)
        self.ps3 = PhaseShuffle(n)
        self.ps4 = PhaseShuffle(n)
        self.fc1 = nn.Linear(256*d, 1)

    def forward(self, x):
        x = F.leaky_relu(self.conv1(x))
        if self.verbose:
            print(x.shape)
        x = self.ps1(x)
        
        x = F.leaky_relu(self.conv2(x))
        if self.verbose:
            print(x.shape)            
        x = self.ps2(x)
        
        x = F.leaky_relu(self.conv3(x))
        if self.verbose:
            print(x.shape)
        x = self.ps3(x)
        
        x = F.leaky_relu(self.conv4(x))
        if self.verbose:
            print(x.shape)            
        x = self.ps4(x)
        
        x = self.conv5(x)
        if self.verbose:
            print(x.shape)
            
        x = x.view(-1, 256*self.d)
        if self.verbose:
            print(x.shape)
            
        return F.sigmoid(self.fc1(x))

In [3]:
# Create generator
latent_dim = 100
gen = WaveGANGenerator(d=64, ngpus=0, c=1, latent_dim=latent_dim).cuda()

In [4]:
# Create discriminator
disc = WaveGANDiscriminator(d=64, ngpus=0).cuda()

In [5]:
# Sample from noise distribution p(z)
z = torch.Tensor(5, latent_dim).uniform_(0, 1)
z = Variable(z)
z = z.cuda()

In [6]:
# Run the sample through the generator to generate a sample
# from the model distribution
out = gen(z)

In [7]:
# Evaluate the given waveforms with the discriminator
disc(out)

Variable containing:
 0.4997
 0.4997
 0.4997
 0.4997
 0.4997
[torch.cuda.FloatTensor of size 5x1 (GPU 0)]

In [8]:
def file_sample_generator(filepath, window_length=16384, fs=16000):
    """
    Audio sample generator
    """
    audio_data = librosa.load(filepath, sr=fs)
    audio_len = len(audio_data)
    
    # Pad audio to at least a single frame
    if audio_len < window_length:
        pad_length = window_length - audio_len
        left_pad = pad_length // 2
        right_pad = pad_length - left_pad
        
        audio_data = np.pad(audio_data, (left_pad, right_pad), mode='constant')
        audio_len = len(audio_data)
        
    while True:
        if audio_len == window_length:
            # If we only have a single frame's worth of audio, just yield the whole audio
            sample = audio_data
        else:
            # Sample a random window from the audio file
            start_idx = np.random.randint(0,audio_len - window_length)
            end_idx = start_idx + window_length
            sample = audio_data[start_idx:end_idx]
            
        yield {'X': sample}
    
def create_batch_generator(audio_filepath_list, batch_size):
    streamers = []
    for audio_filepath in audio_filepath_list:
        s = pescador.Streamer(file_sample_generator, audio_filepath)
        streamers.append(s)
        
    mux = pescador.ShuffledMux(streamers)
    batch_gen = pescador.buffer_stream(mux, batch_size)
    
    return batch_gen

In [9]:
# Adapted from https://github.com/caogang/wgan-gp/blob/master/gan_toy.py
def calc_gradient_penalty(model_dis, real_data, fake_data, batch_size, lmbda):
    # Compute interpolation factors
    alpha = torch.rand(batch_size, 1)
    alpha = alpha.expand(real_data.size())
    alpha = alpha.cuda() if use_cuda else alpha

    # Interpolate between real and fake data
    interpolates = alpha * real_data + ((1 - alpha) * fake_data)
    if use_cuda:
        interpolates = interpolates.cuda()
    interpolates = autograd.Variable(interpolates, requires_grad=True)

    # Evaluate discriminator
    disc_interpolates = model_dis(interpolates)

    # Obtain gradients of the discriminator with respect to the inputs
    gradients = autograd.grad(outputs=disc_interpolates, inputs=interpolates,
                              grad_outputs=torch.ones(disc_interpolates.size()).cuda() if use_cuda else torch.ones(
                                  disc_interpolates.size()),
                              create_graph=True, retain_graph=True, only_inputs=True)[0]

    # Compute MSE between 1.0 and the gradient of the norm penalty to encourage discriminator
    # to be a 1-Lipschitz function
    gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean() * lmbda
    return gradient_penalty

In [18]:
# Adapted from https://github.com/caogang/wgan-gp/blob/master/gan_toy.py
def train_wavegan(train_gen, valid_data, test_data, num_epochs, batches_per_epoch,
                  batch_size, lmbda=0.1, ngpus=1, model_size=64, discriminator_updates=5, epochs_per_sample=10,
                  sample_size=20, loss='wgan-gp', lr=1e-4, beta_1=0.5, beta_2=0.9, latent_dim=100):
    # TODO: Incorporate validation and test data
    
    # Build the models
    model_gen = WaveGANGenerator(model_size, ngpus, c=1)
    model_dis = WaveGANDiscriminator(model_size, ngpus, c=1, n=2)
    
    use_cuda = ngpus >= 1
    
    # Convenient values for 
    one = torch.FloatTensor([1])
    neg_one = one * -1
    if use_cuda:
        one = one.cuda()
        neg_one = neg_one.cuda()
    
    # Initialize optimizers for each model
    optimizer_gen = optim.Adam(model_gen.parameters(), lr=lr, betas=(beta_1, beta_2))
    optimizer_dis = optim.Adam(model_dis.parameters(), lr=lr, betas=(beta_1, beta_2))

    # Sample noise used for seeing the evolution of generated output samples throughout training
    sample_noise = torch.randn(sample_size, 100)
    if use_cuda:
        sample_noise = sample_noise.cuda()
    sample_noisev = autograd.Variable(sample_noise)
    
    samples = {}
    history = []
    
    train_iter = iter(train_gen)
    
    # Loop over the dataset multiple times
    for epoch in range(num_epochs):
        print("epoch", epoch)

        
        
        epoch_history = []

        for batch_idx in range(batches_per_epoch):

            # Set model parameters to require gradients to be computed and stored
            for p in model_dis.parameters():
                p.requires_grad = True

            # Initialize the metrics for this batch
            batch_history = {
                'discriminator': [],
                'generator': {}
            }
            
            # Discriminator Training Phase:
            # -> Train discriminator k times
            for iter_d in range(discriminator_updates):
                # Get new batch of real training data
                real_data = next(train_iter)
                real_data = torch.Tensor(real_data['X']).view(real_data['X'].shape[0], real_data['X'].shape[1], 1)
                if use_cuda:
                    real_data = real_data.cuda()
                real_data_v = autograd.Variable(real_data)

                # Reset gradients
                model_dis.zero_grad()

                # a) Compute loss contribution from real training data and backprop
                # (negative of the empirical mean, w.r.t. the data distribution, of the discr. output)
                D_real = model_dis(real_data_v)
                D_real = D_real.mean()
                # Negate since we want to _maximize_ this quantity
                D_real.backward(neg_one)

                # b) Compute loss contribution from generated data and backprop
                # (empirical mean, w.r.t. the generator distribution, of the discr. output)
                # Generate noise in latent space
                noise = torch.randn(batch_size, 2)
                if use_cuda:
                    noise = noise.cuda()
                noisev = autograd.Variable(noise, volatile=True)  # totally freeze model_gen
                # Generate data by passing noise through the generator
                fake = autograd.Variable(model_gen(noisev, real_data_v).data)
                inputv = fake
                D_fake = model_dis(inputv)
                D_fake = D_fake.mean()
                D_fake.backward(one)

                # c) Compute gradient penalty and backprop
                gradient_penalty = calc_gradient_penalty(model_dis, real_data_v.data, fake.data, batch_size, lmbda)
                gradient_penalty.backward()
                
                # d) Update the discriminator
                optimizer_dis.step()
                
                # Compute metrics and record in batch history
                D_cost = D_fake - D_real + gradient_penalty
                Wasserstein_D = D_real - D_fake
                batch_history['discriminator'].append({
                    'cost': D_cost.data.numpy(),
                    'wasserstein_cost': Wasserstein_D.data.numpy()
                })

            ############################
            # (2) Update G network
            ###########################
            
            # Prevent discriminator from computing gradients, since
            # we are only updating the generator
            for p in model_dis.parameters():
                p.requires_grad = False

            # Reset generator gradients
            model_gen.zero_grad()

            # Sample from the generator
            noise = torch.randn(batch_size, 100)
            if use_cuda:
                noise = noise.cuda()
            noisev = autograd.Variable(noise)
            fake = model_gen(noisev)
            
            # Compute generator loss and backprop
            # (negative of empirical mean (w.r.t generator distribution) of discriminator output)
            G = model_dis(fake)
            G = G.mean()
            G.backward(neg_one)
            G_cost = -G

            # Update generator
            optimizer_gen.step()

            # Record generator loss
            batch_history['generator']['cost'] = G_cost.data.numpy()
            
            # Record batch metrics
            epoch_history.append(batch_history)
            
        # Record epoch metrics
        history.append(epoch_history)
        
        if (epoch + 1) % samples_per_epoch == 0:
            # Generate outputs for fixed latent samples
            samples[epoch] = model_gen(sample_noisev).data.numpy()
            
    return model_gen, model_dis, history, samples

In [19]:
# Try on some training data
# gen1 = create_batch_generator(["/scratch/nl1115/Bach_Lute.mp3"], 128)
gen1 = create_batch_generator(["/beegfs/jtc440/aml/TheDrumClub-Kit004-THEMEGABUNDLE/Doepfer MS-404/Tom.wav"],128)

model_gen, model_dis, history, samples = train_wavegan(train_gen=gen1, 
                                                       valid_data=None, 
                                                       test_data=None, 
                                                       num_epochs=10, 
                                                       batches_per_epoch=4,
                                                       batch_size=8, 
                                                       lmbda=0.1, 
                                                       ngpus=4, 
                                                       model_size=64, 
                                                       discriminator_updates=5, 
                                                       epochs_per_sample=10,
                                                       sample_size=20, 
                                                       loss='wgan-gp')

epoch 0


RuntimeError: Given groups=1, weight[64, 1, 25], so expected input[128, 16384, 1] to have 1 channels, but got 16384 channels instead

In [None]:
torch.cuda.is_available()