In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import os
# import torchvision.datasets as dset
# import torchvision.transforms as transforms
# import torchvision.utils as vutils
from torch.autograd import Variable
import torch.autograd as autograd
import numpy as np
import pescador
import librosa
import pprint
from IPython.display import Audio

In [14]:
class PhaseShuffle(nn.Module):
    """
    Phase shuffle layer: randomly shifts the last axis of a 3D tensor.

    On every forward pass a single integer shift ``k`` is drawn uniformly
    from ``{-n, ..., n}`` and applied to the whole input. Samples pushed past
    one edge are dropped and the gap opened at the opposite edge is filled
    with reflection padding, so the output has the same shape as the input.

    Args:
        n: Maximum absolute shift. ``n = 0`` makes the layer an identity.
    """
    def __init__(self, n):
        super(PhaseShuffle, self).__init__()
        self.n = n

    def forward(self, x):
        """
        Shift ``x`` along its last axis by a random amount in ``[-n, n]``.

        Args:
            x: Tensor indexed as ``x[:, :, t]``. The last axis must be longer
                than ``n`` for reflection padding to be valid.

        Returns:
            ``x`` itself when the drawn shift is 0, otherwise a new tensor of
            the same shape holding the shifted, reflection-padded values.
        """
        # Draw with PyTorch's RNG so that seeding torch also fixes the shifts.
        # random_(0, 2n + 1) yields an integer in [0, 2n]; subtracting n
        # centres the range on zero, giving k in [-n, n].
        k = int(torch.Tensor(1).random_(0, 2 * self.n + 1)) - self.n

        # Return if no phase shift
        if k == 0:
            return x

        if k > 0:
            # Shift right: drop the last k samples, refill k on the left.
            x_trunc = x[:, :, :-k]
            pad = (k, 0)
        else:
            # Shift left: drop the first |k| samples, refill |k| on the right.
            x_trunc = x[:, :, -k:]
            pad = (0, -k)

        # Padding on the side opposite to the truncation is what actually
        # moves the retained samples; reflection avoids a hard zero edge.
        x_shuffle = F.pad(x_trunc, pad, mode='reflect')
        assert x_shuffle.shape == x.shape, "{}, {}".format(x_shuffle.shape, x.shape)
        return x_shuffle
        


class WaveGANGenerator(nn.Module):
    """
    Generator network mapping a latent vector to a waveform.

    A linear layer projects the latent vector to ``256 * d`` features, which
    are reshaped to ``(16 * d, 16)`` and upsampled by five stride-4
    transposed convolutions (16 -> 16384 samples). Hidden layers use ReLU;
    the output goes through tanh.

    Args:
        d: Channel multiplier controlling the width of every layer.
        ngpus: Stored on the module; not otherwise used by this class.
        c: Number of output channels.
        latent_dim: Size of the latent input vector.
        verbose: When true, print the tensor shape after every layer.
    """
    def __init__(self, d, ngpus, c=1, latent_dim=100, verbose=False):
        super(WaveGANGenerator, self).__init__()
        self.d = d
        self.c = c
        self.ngpus = ngpus
        self.latent_dim = latent_dim
        self.verbose = verbose

        # Latent projection; every layer is wrapped in DataParallel.
        self.fc1 = nn.DataParallel(nn.Linear(latent_dim, 256*d))

        # All transposed convolutions share kernel size 25 and these settings,
        # which make each layer multiply the sequence length by exactly 4.
        upsample = dict(stride=4, padding=11, output_padding=1)
        self.tconv1 = nn.DataParallel(nn.ConvTranspose1d(16*d, 8*d, 25, **upsample))
        self.tconv2 = nn.DataParallel(nn.ConvTranspose1d(8*d, 4*d, 25, **upsample))
        self.tconv3 = nn.DataParallel(nn.ConvTranspose1d(4*d, 2*d, 25, **upsample))
        self.tconv4 = nn.DataParallel(nn.ConvTranspose1d(2*d, d, 25, **upsample))
        self.tconv5 = nn.DataParallel(nn.ConvTranspose1d(d, c, 25, **upsample))

        # modules() also walks into the DataParallel wrappers, so the wrapped
        # layers themselves are the ones re-initialised here.
        for layer in self.modules():
            if isinstance(layer, (nn.ConvTranspose1d, nn.Linear)):
                nn.init.kaiming_normal(layer.weight.data)

    def _show_shape(self, tensor):
        """Print ``tensor.shape`` when the module was built with verbose=True."""
        if self.verbose:
            print(tensor.shape)

    def forward(self, x):
        """
        Generate waveforms from latent vectors.

        Args:
            x: Latent input accepted by the first linear layer
                (last dimension ``latent_dim``).

        Returns:
            Tensor of shape ``(batch, c, 16384)`` with values in [-1, 1].
        """
        hidden = F.relu(self.fc1(x).view(-1, 16*self.d, 16))
        self._show_shape(hidden)

        for upsampler in (self.tconv1, self.tconv2, self.tconv3, self.tconv4):
            hidden = F.relu(upsampler(hidden))
            self._show_shape(hidden)

        output = F.tanh(self.tconv5(hidden))
        self._show_shape(output)

        return output

    
class WaveGANDiscriminator(nn.Module):
    """
    Discriminator network mapping a waveform to a single score per example.

    Five stride-4 1D convolutions with leaky-ReLU activations reduce the
    input length by a factor of 4 each; a PhaseShuffle layer follows each of
    the first four. The result is flattened to ``256 * d`` features and
    passed through a linear layer and a sigmoid.

    NOTE(review): the score is squashed by a sigmoid, while the loss helpers
    in this notebook average and subtract the scores like an unbounded
    critic output -- confirm that the sigmoid is intended.

    Args:
        d: Channel multiplier controlling the width of every layer.
        ngpus: Stored on the module; not otherwise used by this class.
        c: Number of input channels.
        n: Maximum shift handed to each PhaseShuffle layer.
        alpha: Negative slope of the leaky ReLU.
        verbose: When true, print the tensor shape after every layer.
    """
    def __init__(self, d, ngpus, c=1, n=2, alpha=0.2, verbose=False):
        super(WaveGANDiscriminator, self).__init__()
        self.d = d
        self.ngpus = ngpus
        self.c = c
        self.n = n
        self.alpha = alpha
        self.verbose = verbose

        # Conv1d(in_channels, out_channels, kernel_size, ...); all layers
        # share kernel size 25, stride 4 and padding 11.
        downsample = dict(stride=4, padding=11)
        self.conv1 = nn.DataParallel(nn.Conv1d(c, d, 25, **downsample))
        self.conv2 = nn.DataParallel(nn.Conv1d(d, 2*d, 25, **downsample))
        self.conv3 = nn.DataParallel(nn.Conv1d(2*d, 4*d, 25, **downsample))
        self.conv4 = nn.DataParallel(nn.Conv1d(4*d, 8*d, 25, **downsample))
        self.conv5 = nn.DataParallel(nn.Conv1d(8*d, 16*d, 25, **downsample))

        # One phase-shuffle layer after each of the first four convolutions.
        self.ps1 = PhaseShuffle(n)
        self.ps2 = PhaseShuffle(n)
        self.ps3 = PhaseShuffle(n)
        self.ps4 = PhaseShuffle(n)

        self.fc1 = nn.DataParallel(nn.Linear(256*d, 1))

        # modules() also walks into the DataParallel wrappers, so the wrapped
        # layers themselves are the ones re-initialised here.
        for layer in self.modules():
            if isinstance(layer, (nn.Conv1d, nn.Linear)):
                nn.init.kaiming_normal(layer.weight.data)

    def _show_shape(self, tensor):
        """Print ``tensor.shape`` when the module was built with verbose=True."""
        if self.verbose:
            print(tensor.shape)

    def forward(self, x):
        """
        Score a batch of waveforms.

        Args:
            x: Input accepted by the first convolution, i.e. ``c`` channels.
                The flatten step assumes 16 time steps remain after the five
                convolutions (true for length-16384 inputs).

        Returns:
            Tensor with one sigmoid-activated score per row of the flattened
            features.
        """
        shuffled_stages = (
            (self.conv1, self.ps1),
            (self.conv2, self.ps2),
            (self.conv3, self.ps3),
            (self.conv4, self.ps4),
        )
        for conv, shuffle in shuffled_stages:
            x = F.leaky_relu(conv(x), negative_slope=self.alpha)
            self._show_shape(x)
            x = shuffle(x)

        # The last convolution is not followed by a phase shuffle.
        x = F.leaky_relu(self.conv5(x), negative_slope=self.alpha)
        self._show_shape(x)

        x = x.view(-1, 256*self.d)
        self._show_shape(x)

        return F.sigmoid(self.fc1(x))

In [15]:
# Create generator
# d=64 is the channel multiplier and c=1 the number of output channels.
# ngpus is only stored on the module by WaveGANGenerator.__init__.
# .cuda() moves the parameters to the GPU, so this cell needs a free CUDA device.
latent_dim = 100
gen = WaveGANGenerator(d=64, ngpus=0, c=1, latent_dim=latent_dim).cuda()

RuntimeError: cuda runtime error (46) : all CUDA-capable devices are busy or unavailable at /opt/conda/conda-bld/pytorch_1518243271935/work/torch/lib/THC/generic/THCStorage.cu:58

In [7]:
gen

NameError: name 'gen' is not defined

In [12]:
# Create discriminator
# d=64 is the channel multiplier; c, n, alpha and verbose keep their defaults.
# .cuda() moves the parameters to the GPU, so this cell needs a free CUDA device.
disc = WaveGANDiscriminator(d=64, ngpus=0).cuda()

RuntimeError: cuda runtime error (46) : all CUDA-capable devices are busy or unavailable at /opt/conda/conda-bld/pytorch_1518243271935/work/torch/lib/THC/generic/THCStorage.cu:58

In [11]:
disc

NameError: name 'disc' is not defined

In [6]:
# Sample from noise distribution p(z): five latent vectors drawn from U(0, 1),
# wrapped in a Variable and moved to the GPU.
z = Variable(torch.Tensor(5, latent_dim).uniform_(0, 1)).cuda()

In [7]:
# Run the sample through the generator to generate a sample
# from the model distribution
out = gen(z)

In [8]:
# Evaluate the given waveforms with the discriminator
disc(out)

Variable containing:
 0.7596
 0.6637
 0.5983
 0.7279
 0.6790
[torch.cuda.FloatTensor of size 5x1 (GPU 0)]

In [9]:
def file_sample_generator(filepath, window_length=16384, fs=16000):
    """
    Endlessly yield random fixed-length windows from one audio file.

    The file is loaded once with ``librosa.load`` at sample rate ``fs``.
    Audio shorter than ``window_length`` is zero-padded (split as evenly as
    possible between both ends) to exactly one window.

    Args:
        filepath: Path handed to ``librosa.load``.
        window_length: Number of samples in every yielded window.
        fs: Sample rate requested from ``librosa.load``.

    Yields:
        ``{'X': sample}`` where ``sample`` is a float32 array of length
        ``window_length``. If the file cannot be loaded the generator ends
        without yielding anything.
    """
    try:
        audio_data, _ = librosa.load(filepath, sr=fs)
    except Exception:
        # Best effort: an unreadable file simply produces an empty stream.
        # A plain return ends the generator; raising StopIteration inside a
        # generator body is converted to RuntimeError (PEP 479).
        return

    audio_len = len(audio_data)

    # Pad audio to at least a single frame
    if audio_len < window_length:
        pad_length = window_length - audio_len
        left_pad = pad_length // 2
        right_pad = pad_length - left_pad

        audio_data = np.pad(audio_data, (left_pad, right_pad), mode='constant')
        audio_len = len(audio_data)

    while True:
        if audio_len == window_length:
            # If we only have a single frame's worth of audio, just yield the whole audio
            sample = audio_data
        else:
            # Sample a random window from the audio file. randint's upper
            # bound is exclusive, so the +1 makes the final window (the one
            # ending on the last sample) reachable.
            start_idx = np.random.randint(0, audio_len - window_length + 1)
            end_idx = start_idx + window_length
            sample = audio_data[start_idx:end_idx]

        sample = sample.astype('float32')
        assert not np.any(np.isnan(sample))

        yield {'X': sample}
    
def create_batch_generator(audio_filepath_list, batch_size):
    """
    Build a batched sample stream over a list of audio files.

    One ``pescador.Streamer`` wrapping ``file_sample_generator`` is created
    per file; the streamers are combined with ``pescador.ShuffledMux`` and
    the result is grouped into batches by ``pescador.buffer_stream``.

    Args:
        audio_filepath_list: Paths of the audio files to draw from.
        batch_size: Buffer size passed to ``pescador.buffer_stream``.

    Returns:
        The object returned by ``pescador.buffer_stream``.
    """
    per_file_streams = [pescador.Streamer(file_sample_generator, path)
                        for path in audio_filepath_list]
    return pescador.buffer_stream(pescador.ShuffledMux(per_file_streams), batch_size)

In [18]:
def get_all_audio_filepaths(audio_dir, extensions=('.mp3',)):
    """
    Recursively collect audio file paths below a directory.

    Args:
        audio_dir: Root directory walked with ``os.walk``.
        extensions: File-name suffix, or iterable of suffixes, to accept.
            Matching is case-insensitive. Defaults to MP3 files only, which
            is the behaviour this function had before the parameter existed.

    Returns:
        List of paths (each joined onto its walk root) in ``os.walk`` order.
        Empty when nothing matches or ``audio_dir`` does not exist.
    """
    # Accept a bare string for convenience; str.endswith needs a tuple to
    # test several suffixes at once.
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    return [os.path.join(root, fname)
            for (root, dir_names, file_names) in os.walk(audio_dir)
            for fname in file_names
            if fname.lower().endswith(suffixes)]

In [11]:
def create_data_split(audio_filepath_list, valid_ratio, test_ratio, train_batch_size, valid_size, test_size):
    """
    Split audio files into train/validation/test sets and build their data.

    The list is partitioned in order: the first ``ceil(N * valid_ratio)``
    files form the validation set, the next ``ceil(N * test_ratio)`` the test
    set, and the remainder the training set. The list is not shuffled here.

    Args:
        audio_filepath_list: Paths of all available audio files.
        valid_ratio: Fraction of files reserved for validation.
        test_ratio: Fraction of files reserved for testing.
        train_batch_size: Batch size of the training batch generator.
        valid_size: Size of the single validation batch.
        test_size: Size of the single test batch.

    Returns:
        ``(train_gen, valid_data, test_data)``: the training batch generator,
        and the first batch drawn from the validation and test generators.

    Raises:
        AssertionError: If any of the three partitions would be empty.
    """
    num_files = len(audio_filepath_list)
    num_valid = int(np.ceil(num_files * valid_ratio))
    num_test = int(np.ceil(num_files * test_ratio))
    num_train = num_files - num_valid - num_test

    assert num_valid > 0
    assert num_test > 0
    assert num_train > 0

    valid_files = audio_filepath_list[:num_valid]
    test_files = audio_filepath_list[num_valid:num_valid+num_test]
    train_files = audio_filepath_list[num_valid+num_test:]

    train_gen = create_batch_generator(train_files, train_batch_size)
    valid_data = next(iter(create_batch_generator(valid_files, valid_size)))
    # Draw the test batch from the held-out test files; sampling it from
    # train_files would leak training audio into the test metrics.
    test_data = next(iter(create_batch_generator(test_files, test_size)))

    return train_gen, valid_data, test_data

In [12]:
# Adapted from https://github.com/caogang/wgan-gp/blob/master/gan_toy.py
def calc_gradient_penalty(model_dis, real_data, fake_data, batch_size, lmbda, use_cuda=True):
    """
    Compute the WGAN-GP gradient penalty for a discriminator.

    Random per-example interpolations between ``real_data`` and ``fake_data``
    are scored by ``model_dis``; the penalty is ``lmbda`` times the mean
    squared deviation of the input-gradient L2 norm from 1.

    Args:
        model_dis: Callable scoring a batch of inputs.
        real_data: Tensor of real examples, indexed ``[example, channel, time]``.
        fake_data: Tensor of generated examples with the same shape.
        batch_size: Kept for backward compatibility. The number of
            interpolation factors is taken from ``real_data.size(0)`` so that
            a mismatching value no longer breaks the ``expand`` below.
        lmbda: Weight of the penalty term.
        use_cuda: Whether to move the intermediate tensors to the GPU.

    Returns:
        Scalar penalty that stays attached to the autograd graph
        (``create_graph=True``) so it can be back-propagated.
    """
    # One interpolation factor per example, broadcast over channel and time.
    num_examples = real_data.size(0)
    alpha = torch.rand(num_examples, 1, 1)
    alpha = alpha.expand(real_data.size())
    alpha = alpha.cuda() if use_cuda else alpha

    # Interpolate between real and fake data
    interpolates = alpha * real_data + ((1 - alpha) * fake_data)
    if use_cuda:
        interpolates = interpolates.cuda()
    interpolates = autograd.Variable(interpolates, requires_grad=True)

    # Evaluate discriminator
    disc_interpolates = model_dis(interpolates)

    # Seed the backward pass with ones so every score contributes its gradient.
    grad_outputs = torch.ones(disc_interpolates.size())
    if use_cuda:
        grad_outputs = grad_outputs.cuda()

    # Obtain gradients of the discriminator with respect to the inputs
    gradients = autograd.grad(outputs=disc_interpolates, inputs=interpolates,
                              grad_outputs=grad_outputs,
                              create_graph=True, retain_graph=True, only_inputs=True)[0]
    gradients = gradients.view(gradients.size(0), -1)

    # Penalise the squared distance between each example's gradient norm and
    # 1.0, which pushes the discriminator towards being 1-Lipschitz.
    gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean() * lmbda
    return gradient_penalty

In [13]:
def compute_discr_loss_terms(model_dis, model_gen, real_data_v, noise_v, batch_size, latent_dim,
                             lmbda, use_cuda, compute_grads=False):
    """
    Evaluate the discriminator loss terms, optionally back-propagating them.

    The discriminator gradients are zeroed first. Three terms are then
    computed in order -- mean score on real data, mean score on generated
    data, and the gradient penalty -- and, when ``compute_grads`` is true,
    each is back-propagated right after it is computed.

    Args:
        model_dis: Discriminator module.
        model_gen: Generator module; its output is detached via ``.data``.
        real_data_v: Batch of real examples.
        noise_v: Latent batch fed to the generator.
        batch_size: Forwarded to ``calc_gradient_penalty``.
        latent_dim: Not used in this function.
        lmbda: Gradient-penalty weight forwarded to ``calc_gradient_penalty``.
        use_cuda: Whether helper tensors live on the GPU.
        compute_grads: Whether to call ``backward`` on the three terms.

    Returns:
        ``(D_cost, Wasserstein_D)`` where ``D_cost`` is
        ``fake - real + penalty`` and ``Wasserstein_D`` is ``real - fake``.
    """
    # Upstream gradients handed to backward(): +1 keeps a term's sign,
    # -1 flips it so that the term is maximised instead of minimised.
    plus_one = torch.FloatTensor([1])
    minus_one = plus_one * -1
    if use_cuda:
        plus_one, minus_one = plus_one.cuda(), minus_one.cuda()

    model_dis.zero_grad()

    # a) Mean discriminator score on real data (to be maximised).
    real_score = model_dis(real_data_v).mean()
    if compute_grads:
        real_score.backward(minus_one)

    # b) Mean discriminator score on generated data (to be minimised).
    #    Re-wrapping .data cuts the graph so the generator gets no gradient.
    generated = autograd.Variable(model_gen(noise_v).data)
    fake_score = model_dis(generated).mean()
    if compute_grads:
        fake_score.backward(plus_one)

    # c) Gradient penalty on interpolations between real and generated data.
    penalty = calc_gradient_penalty(model_dis, real_data_v.data, generated.data,
                                    batch_size, lmbda, use_cuda=use_cuda)
    if compute_grads:
        penalty.backward(plus_one)

    discriminator_cost = fake_score - real_score + penalty
    wasserstein_estimate = real_score - fake_score

    return discriminator_cost, wasserstein_estimate

                

In [14]:
def compute_gener_loss_terms(model_dis, model_gen, batch_size, latent_dim, use_cuda, compute_grads=False):
    """
    Evaluate the generator loss, optionally back-propagating it.

    The generator gradients are zeroed, a fresh standard-normal latent batch
    is pushed through the generator and the discriminator, and the mean
    discriminator score ``G`` is computed. The generator cost is ``-G``.

    Args:
        model_dis: Discriminator (callable on the generator output).
        model_gen: Generator module.
        batch_size: Number of latent vectors to draw.
        latent_dim: Size of each latent vector.
        use_cuda: Whether the latent batch lives on the GPU.
        compute_grads: Whether to back-propagate ``-G``.

    Returns:
        The generator cost ``-G``.
    """
    # Upstream gradient of -1: back-propagating G with it minimises -G.
    minus_one = torch.FloatTensor([1]) * -1
    if use_cuda:
        minus_one = minus_one.cuda()

    model_gen.zero_grad()

    # Draw latent vectors and generate a batch.
    latent = torch.randn(batch_size, latent_dim)
    if use_cuda:
        latent = latent.cuda()
    generated = model_gen(autograd.Variable(latent))

    # Mean discriminator score of the generated batch.
    mean_score = model_dis(generated).mean()
    if compute_grads:
        mean_score.backward(minus_one)

    return -mean_score

In [15]:
def np_to_input_var(data, use_cuda):
    """
    Convert a 2D NumPy batch into a model-ready autograd Variable.

    A singleton axis is inserted at position 1, turning ``(rows, cols)`` into
    ``(rows, 1, cols)``, before the array is copied into a ``torch.Tensor``.

    Args:
        data: 2D NumPy array.
        use_cuda: Whether to move the tensor to the GPU.

    Returns:
        ``autograd.Variable`` wrapping the resulting tensor.
    """
    tensor = torch.Tensor(data[:, np.newaxis, :])
    return autograd.Variable(tensor.cuda() if use_cuda else tensor)

In [16]:
# Adapted from https://github.com/caogang/wgan-gp/blob/master/gan_toy.py
def train_wavegan(train_gen, valid_data, test_data, num_epochs, batches_per_epoch,
                  batch_size, lmbda=0.1, ngpus=1, model_size=64, discriminator_updates=5, epochs_per_sample=10,
                  sample_size=20, loss='wgan-gp', lr=1e-4, beta_1=0.5, beta_2=0.9, latent_dim=100):
    """
    Train a WaveGAN generator/discriminator pair.

    Each batch step runs ``discriminator_updates`` discriminator updates
    (each followed by a no-gradient evaluation on the validation batch) and
    then one generator update. Losses come from ``compute_discr_loss_terms``
    and ``compute_gener_loss_terms``.

    Args:
        train_gen: Iterable of training batches; each batch is indexed with
            ``'X'`` and converted by ``np_to_input_var``.
        valid_data: Validation batch, indexed with ``'X'``.
        test_data: Test batch, indexed with ``'X'``; only used for the final
            metrics after training.
        num_epochs: Number of epochs to run.
        batches_per_epoch: Number of batch steps (generator updates) per epoch.
        batch_size: Number of latent vectors drawn per update; also forwarded
            to the loss helpers.
        lmbda: Gradient-penalty weight forwarded to the discriminator loss.
        ngpus: CUDA is used when ``ngpus >= 1``; the value is also passed to
            the model constructors.
        model_size: Channel multiplier ``d`` of both models.
        discriminator_updates: Discriminator updates per generator update.
        epochs_per_sample: Generator outputs for the fixed sample noise are
            stored every this many epochs.
        sample_size: Number of fixed latent vectors used for those samples.
        loss: Not referenced anywhere in the function body.
        lr: Adam learning rate for both optimizers.
        beta_1: First Adam beta for both optimizers.
        beta_2: Second Adam beta for both optimizers.
        latent_dim: Size of the latent vectors.

    Returns:
        ``(model_gen, model_dis, history, final_discr_metrics, samples)``.
        ``history`` is a list (per epoch) of lists (per batch step) of metric
        dicts; ``final_discr_metrics`` holds the discriminator cost and
        Wasserstein estimate on the validation and test batches; ``samples``
        maps the zero-based epoch index to a NumPy array of generated audio.
    """
    # TODO: Incorporate validation and test data

    use_cuda = ngpus >= 1
    
    # Build the models
    model_gen = WaveGANGenerator(model_size, ngpus, c=1, latent_dim=latent_dim)
    model_dis = WaveGANDiscriminator(model_size, ngpus, c=1, n=2, alpha=0.2)

    if use_cuda:
        model_gen = model_gen.cuda()
        model_dis = model_dis.cuda()
    
    # Initialize optimizers for each model
    optimizer_gen = optim.Adam(model_gen.parameters(), lr=lr, betas=(beta_1, beta_2))
    optimizer_dis = optim.Adam(model_dis.parameters(), lr=lr, betas=(beta_1, beta_2))

    # Sample noise used for seeing the evolution of generated output samples throughout training
    sample_noise = torch.randn(sample_size, latent_dim)
    if use_cuda:
        sample_noise = sample_noise.cuda()
    sample_noise_v = autograd.Variable(sample_noise)
    
    # samples: zero-based epoch index -> generated audio for sample_noise_v
    samples = {}
    # history: one entry per epoch, each a list of per-batch metric dicts
    history = []
    
    train_iter = iter(train_gen)
    # The validation and test batches are converted once and reused throughout
    valid_data_v = np_to_input_var(valid_data['X'], use_cuda)
    test_data_v = np_to_input_var(test_data['X'], use_cuda)

    
    # Loop over the dataset multiple times
    for epoch in range(num_epochs):
        print("Epoch: {}".format(epoch+1))

        epoch_history = []

        for batch_idx in range(batches_per_epoch):

            # Re-enable discriminator gradients (they are switched off below
            # for the generator update of the previous batch step)
            for p in model_dis.parameters():
                p.requires_grad = True

            # Initialize the metrics for this batch
            batch_history = {
                'discriminator': [],
                'generator': {}
            }
            
            # Discriminator Training Phase:
            # -> Train discriminator k times
            for iter_d in range(discriminator_updates): 
                # Get a new batch of real training examples
                real_data_v = np_to_input_var(next(train_iter)['X'], use_cuda)
                
                # Get noise
                noise = torch.randn(batch_size, latent_dim)
                if use_cuda:
                    noise = noise.cuda()
                noise_v = autograd.Variable(noise, volatile=True)  # totally freeze model_gen
                
                # Compute the discriminator loss on the training batch and
                # accumulate its gradients
                D_cost_train, D_wass_train = compute_discr_loss_terms(
                    model_dis, model_gen, real_data_v, noise_v, batch_size, latent_dim,
                    lmbda, use_cuda, compute_grads=True)
                
                # Update the discriminator
                optimizer_dis.step()
                
                # Evaluate the updated discriminator on the validation batch
                # (same noise, no backward pass)
                D_cost_valid, D_wass_valid = compute_discr_loss_terms(
                    model_dis, model_gen, valid_data_v, noise_v, batch_size, latent_dim,
                    lmbda, use_cuda, compute_grads=False)
                
                # Metrics must be on the CPU before conversion to NumPy
                if use_cuda:
                    D_cost_train = D_cost_train.cpu()
                    D_cost_valid = D_cost_valid.cpu()
                    D_wass_train = D_wass_train.cpu()
                    D_wass_valid = D_wass_valid.cpu()

                
                batch_history['discriminator'].append({
                    'cost': D_cost_train.data.numpy()[0],
                    'wasserstein_cost': D_wass_train.data.numpy()[0],
                    'cost_validation': D_cost_valid.data.numpy()[0],
                    'wasserstein_cost_validation': D_wass_valid.data.numpy()[0]
                })

            ############################
            # (2) Update G network
            ###########################
            
            # Prevent discriminator from computing gradients, since
            # we are only updating the generator
            for p in model_dis.parameters():
                p.requires_grad = False

            G_cost = compute_gener_loss_terms(model_dis, model_gen, batch_size, latent_dim,
                                              use_cuda, compute_grads=True)

            # Update generator
            optimizer_gen.step()
            
            if use_cuda:
                G_cost = G_cost.cpu()

            # Record generator loss
            batch_history['generator']['cost'] = G_cost.data.numpy()[0]
            
            # Record batch metrics
            epoch_history.append(batch_history)
            
        # Record epoch metrics
        history.append(epoch_history)
        
        # Only the metrics of the last batch step of the epoch are printed
        pprint.pprint(epoch_history[-1])
        print("\n")
        
        if (epoch + 1) % epochs_per_sample == 0:
            # Generate outputs for fixed latent samples
            samp_output = model_gen(sample_noise_v)
            if use_cuda:
                samp_output = samp_output.cpu()
            # Keyed by the zero-based epoch index
            samples[epoch] = samp_output.data.numpy()
            
    ## Get final discriminator loss
    # Get noise
    noise = torch.randn(batch_size, latent_dim)
    if use_cuda:
        noise = noise.cuda()
    noise_v = autograd.Variable(noise, volatile=True)  # totally freeze model_gen

    # Evaluate the final discriminator on the test batch (no backward pass)
    D_cost_test, D_wass_test = compute_discr_loss_terms(
        model_dis, model_gen, test_data_v, noise_v, batch_size, latent_dim,
        lmbda, use_cuda, compute_grads=False)

    # ...and on the validation batch, using the same noise
    D_cost_valid, D_wass_valid = compute_discr_loss_terms(
        model_dis, model_gen, valid_data_v, noise_v, batch_size, latent_dim,
        lmbda, use_cuda, compute_grads=False)

    if use_cuda:
        D_cost_test = D_cost_test.cpu()
        D_cost_valid = D_cost_valid.cpu()
        D_wass_test = D_wass_test.cpu()
        D_wass_valid = D_wass_valid.cpu()

    final_discr_metrics = {
        'cost_validation': D_cost_valid.data.numpy()[0],
        'wasserstein_cost_validation': D_wass_valid.data.numpy()[0],
        'cost_test': D_cost_test.data.numpy()[0],
        'wasserstein_cost_test': D_wass_test.data.numpy()[0],
    }
    
    return model_gen, model_dis, history, final_discr_metrics, samples

In [19]:
# Try on some training data: collect the audio files, split them 80/10/10 by
# file into train/validation/test, and run a short WGAN-GP training session.
batch_size = 64
audio_dir = "/home/nl1115/gc"
audio_filepaths = get_all_audio_filepaths(audio_dir)

gen1, valid_data, test_data = create_data_split(
    audio_filepaths, 0.1, 0.1, batch_size, 64, 64)

model_gen, model_dis, history, final_discr_metrics, samples = train_wavegan(
    train_gen=gen1,
    valid_data=valid_data,
    test_data=test_data,
    num_epochs=20,
    batches_per_epoch=8,
    batch_size=batch_size,
    lmbda=10,
    ngpus=4,
    model_size=64,
    discriminator_updates=1,
    latent_dim=100,
    epochs_per_sample=1,
    sample_size=20,
    loss='wgan-gp',
)

Epoch: 1
{'discriminator': [{'cost': 7.5978117,
                    'cost_validation': 4.7575846,
                    'wasserstein_cost': 0.5577497,
                    'wasserstein_cost_validation': 0.31056494}],
 'generator': {'cost': -0.6573186}}


Epoch: 2
{'discriminator': [{'cost': 5.668211,
                    'cost_validation': 5.497856,
                    'wasserstein_cost': 0.69285566,
                    'wasserstein_cost_validation': 0.37201858}],
 'generator': {'cost': -0.44290248}}


Epoch: 3
{'discriminator': [{'cost': 5.9453278,
                    'cost_validation': 6.6520057,
                    'wasserstein_cost': -0.0070417523,
                    'wasserstein_cost_validation': 0.772184}],
 'generator': {'cost': -0.01578604}}


Epoch: 4
{'discriminator': [{'cost': 6.858527,
                    'cost_validation': 5.4078135,
                    'wasserstein_cost': 0.7865025,
                    'wasserstein_cost_validation': 0.7326024}],
 'generator': {'cost': -0.193

In [23]:
Audio(filename=audio_filepaths[0])

In [24]:
# samples is keyed by the zero-based epoch index used in train_wavegan, so
# samples[4] holds the generator output stored at the end of the fifth epoch;
# [18, 0] picks fixed latent sample 18, channel 0. The playback rate matches
# the 16 kHz default of file_sample_generator.
Audio(data=samples[4][18,0], rate=16000)

In [24]:
# Load a previously generated sample from disk.
# NOTE(review): no sr is passed, so librosa presumably resamples to its
# default rate rather than keeping the file's native one -- confirm, and pass
# sr=None if the original sample rate should be preserved.
data, fs = librosa.load('/scratch/jtc440/wavegan_output/20180403092653/100/18.wav')

In [21]:
Audio(data=data, rate=fs)

In [25]:
data.shape

(22580,)