# Imports

In [None]:
!pip install tensorflow_text
!pip install tensorflow_addons

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import tensorflow_addons as tfa
import numpy as np
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# GAN

## Utils

### Orthogonal regularizer

In [None]:
class OrthogonalRegularizer(tf.keras.regularizers.Regularizer):
    """Penalize the off-diagonal entries of a kernel's Gram matrix.

    The weight tensor is flattened to ``(-1, c)``, where ``c`` is the size of
    its last axis, and the penalty is ``beta * ||(W^T W) * (1 - I)||``, i.e.
    the norm of the Gram matrix with its diagonal zeroed out.

    Args:
        beta: Scale factor applied to the norm.
        **kwargs: Forwarded to the base class constructor.
    """

    def __init__(self, beta=1e-4, **kwargs):
        super(OrthogonalRegularizer, self).__init__(**kwargs)
        self.beta = beta

    def __call__(self, inputTensor):
        """Return the scalar penalty for ``inputTensor``.

        Keras invokes regularizers through ``__call__``; the base class
        implementation returns zero, so the penalty has to be defined here
        for it to have any effect.
        """
        c = inputTensor.shape[-1]
        x = tf.reshape(inputTensor, (-1, c))
        # Mask out the diagonal so only cross-channel correlations count.
        # The mask uses the kernel's dtype so non-float32 kernels multiply.
        offDiagonalMask = 1 - tf.eye(c, dtype=x.dtype)
        orthoLoss = tf.matmul(x, x, transpose_a=True) * offDiagonalMask
        return self.beta * tf.norm(orthoLoss)

    # Backward-compatible alias for code that calls ``regularizer.call(w)``.
    call = __call__

    def get_config(self):
        """Return the constructor arguments needed to re-create this object."""
        return {'beta': self.beta}

### Normalized convolutional layer 

In [None]:
class SpectralConv1D(tf.keras.layers.Layer):
    """1-D convolution wrapped in spectral normalization.

    A ``tf.keras.layers.Conv1D`` is built from the constructor arguments and
    wrapped in ``tfa.layers.SpectralNormalization``. Calling this layer
    forwards the inputs to that wrapped convolution.

    Args:
        filters: Number of output channels.
        kernelSize: Kernel size of the convolution.
        strides: Stride of the convolution.
        padding: Padding mode handed to ``Conv1D``.
        dilation: Dilation rate handed to ``Conv1D``.
        activation: Optional activation handed to ``Conv1D``.
        kernelInit: Kernel initializer handed to ``Conv1D``.
        kernelReg: Kernel regularizer handed to ``Conv1D``. The default
            instance is created once, when the class is defined, and is
            therefore shared by every layer that relies on the default.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, filters, kernelSize, strides=1,
                 padding='same', dilation=1, activation=None,
                 kernelInit=tf.initializers.Orthogonal,
                 kernelReg=OrthogonalRegularizer(), **kwargs):
        super().__init__(**kwargs)
        self.filters = filters
        self.kernelSize = kernelSize
        self.strides = strides
        self.padding = padding
        self.dilation = dilation
        self.activation = activation
        self.kernelInit = kernelInit
        self.kernelReg = kernelReg

        # Collect the convolution settings first, then wrap the layer.
        convSettings = dict(
            filters=self.filters,
            kernel_size=self.kernelSize,
            strides=self.strides,
            padding=self.padding,
            dilation_rate=self.dilation,
            activation=self.activation,
            kernel_initializer=self.kernelInit,
            kernel_regularizer=self.kernelReg,
        )
        convolution = tf.keras.layers.Conv1D(**convSettings)
        self.spectralConv = tfa.layers.SpectralNormalization(convolution)

    def call(self, inputs):
        """Apply the spectrally normalized convolution to ``inputs``."""
        return self.spectralConv(inputs)

### Normalized transposed-convolution layer

In [None]:
class SpectralConv1DTranspose(tf.keras.layers.Layer):
    """1-D transposed convolution wrapped in spectral normalization.

    A ``tf.keras.layers.Conv1DTranspose`` is built from the constructor
    arguments and wrapped in ``tfa.layers.SpectralNormalization``. Calling
    this layer forwards the inputs to that wrapped layer.

    Args:
        filters: Number of output channels.
        kernelSize: Kernel size of the transposed convolution.
        strides: Stride of the transposed convolution.
        padding: Padding mode handed to ``Conv1DTranspose``.
        kernelInit: Kernel initializer handed to ``Conv1DTranspose``.
        kernelReg: Kernel regularizer handed to ``Conv1DTranspose``. The
            default instance is created once, when the class is defined, and
            is shared by every layer that relies on the default.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, filters, kernelSize, strides, padding='same',
                 kernelInit=tf.initializers.Orthogonal,
                 kernelReg=OrthogonalRegularizer(), **kwargs):
        super().__init__(**kwargs)
        self.filters = filters
        self.kernelSize = kernelSize
        self.strides = strides
        self.padding = padding
        self.kernelInit = kernelInit
        self.kernelReg = kernelReg

        # Collect the layer settings first, then wrap the layer.
        transposeSettings = dict(
            filters=self.filters,
            kernel_size=self.kernelSize,
            strides=self.strides,
            padding=self.padding,
            kernel_initializer=self.kernelInit,
            kernel_regularizer=self.kernelReg,
        )
        transposedConv = tf.keras.layers.Conv1DTranspose(**transposeSettings)
        self.spectralConvTranspose = tfa.layers.SpectralNormalization(transposedConv)

    def call(self, inputs):
        """Apply the spectrally normalized transposed convolution."""
        return self.spectralConvTranspose(inputs)

## BERT 

In [None]:
class BERT(tf.keras.Model):
    """Text encoder built from two TF-Hub layers.

    Args:
        preprocessor: Handle passed to ``hub.KerasLayer`` for the
            preprocessing step.
        encoder: Handle passed to ``hub.KerasLayer`` for the encoding step.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, preprocessor, encoder, **kwargs):
        super().__init__(**kwargs)
        # The raw handles are kept next to the layers loaded from them.
        self.preprocessor = preprocessor
        self.encoder = encoder
        self.preprocess = hub.KerasLayer(preprocessor)
        self.encode = hub.KerasLayer(encoder)

    def call(self, inputs):
        """Return the encoder's ``pooled_output`` with a trailing axis added."""
        encoderInputs = self.preprocess(inputs)
        encoded = self.encode(encoderInputs)
        pooled = encoded["pooled_output"]
        return tf.expand_dims(pooled, axis=-1)

## CBHG module 

In [None]:
class Conv1DBank(tf.keras.Model):
    """A 'same'-padded Conv1D followed by batch normalization.

    Args:
        channels: Number of convolution filters.
        kernelSize: Kernel size of the convolution.
        activation: Activation handed to the convolution (may be ``None``).
        isTraining: Used as the ``trainable`` flag of the batch-norm layer.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, channels, kernelSize, activation, isTraining, **kwargs):
        super().__init__(**kwargs)
        self.channels = channels
        self.kernelSize = kernelSize
        self.activation = activation
        self.isTraining = isTraining
        self.conv1d = tf.keras.layers.Conv1D(
            filters=self.channels,
            kernel_size=self.kernelSize,
            activation=self.activation,
            padding='same',
        )
        self.batchNorm = tf.keras.layers.BatchNormalization(
            trainable=self.isTraining)

    def call(self, inputs):
        """Convolve ``inputs`` and batch-normalize the result."""
        convolved = self.conv1d(inputs)
        return self.batchNorm(convolved)

In [None]:
class CBHG(tf.keras.Model):
    """Feature network producing conditioning for generator and discriminator.

    The input goes through a bank of ``K`` convolutions (kernel sizes
    ``1..K``), max pooling, two projection convolutions with a residual
    connection, a stack of dense layers, a bidirectional GRU, a dense/dropout
    pre-net and a final single-channel projection. That projection is
    reshaped to ``(batch, 1, 256)``; the reshaped tensor is returned for the
    discriminator, and an upsampled, convolved version is returned for the
    generator.

    Args:
        batchSize: Stored on the instance for callers that read it. The
            forward pass no longer depends on it, so batches of any size
            (for example a final partial batch) are accepted.
        K: Number of convolution banks.
        isTraining: Forwarded to every ``Conv1DBank``.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, batchSize, K, isTraining, **kwargs):
        super(CBHG, self).__init__(**kwargs)
        self.batchSize = batchSize
        self.K = K
        self.isTraining = isTraining
        self.ConvBanks = [Conv1DBank(128, i, tf.nn.relu, self.isTraining) for i in range(1, self.K + 1)]
        self.maxPooling = tf.keras.layers.MaxPool1D(pool_size=2, strides=1, padding='same')
        self.firstProjectionConv = Conv1DBank(128, 3, tf.nn.relu, self.isTraining)
        self.secondProjectionConv = Conv1DBank(128, 3, None, self.isTraining)
        self.highwayNet = tf.keras.Sequential([tf.keras.layers.Dense(128, tf.nn.relu) for i in range(4)])
        self.bidirectionalGRU = tf.keras.layers.Bidirectional(
            tf.keras.layers.GRU(64, return_sequences=True),
            backward_layer=tf.keras.layers.GRU(64, return_sequences=True, go_backwards=True))
        self.encoderPreNet = tf.keras.Sequential([
            tf.keras.layers.Dense(256, tf.nn.relu),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(128, tf.nn.relu),
            tf.keras.layers.Dropout(0.5)])
        self.lastProjectionConv = Conv1DBank(1, 3, None, self.isTraining)
        self.upsample = tf.keras.layers.UpSampling1D(size=400)
        self.conv = tf.keras.layers.Conv1D(256, 3, padding='same')

    def call(self, inputs):
        """Return ``(genOutputs, discOutputs)`` for ``inputs``.

        ``discOutputs`` is the ``(batch, 1, 256)`` feature tensor and
        ``genOutputs`` is that tensor upsampled 400x along axis 1 and passed
        through a 256-filter convolution.
        """
        bankOutputs = [convBank(inputs) for convBank in self.ConvBanks]
        outputs = tf.keras.layers.concatenate(bankOutputs)
        outputs = self.maxPooling(outputs)
        outputs = self.firstProjectionConv(outputs)
        outputs = self.secondProjectionConv(outputs)
        # Residual connection around the convolutional front end.
        highwayOutputs = outputs + inputs
        outputs = self.highwayNet(highwayOutputs)
        outputs = self.bidirectionalGRU(outputs)
        outputs = self.encoderPreNet(outputs)
        outputs = self.lastProjectionConv(outputs)
        # The last projection has one channel, so this reshape needs exactly
        # 256 time steps per example. The batch axis is inferred (-1) rather
        # than taken from self.batchSize so that a batch of a different size
        # does not make the reshape fail.
        outputs = tf.reshape(outputs, (-1, 1, 256))
        discOutputs = outputs
        outputs = self.upsample(outputs)
        genOutputs = self.conv(outputs)
        return genOutputs, discOutputs

## Generator



In [None]:
class Generator(tf.keras.Model):
    """Waveform generator: pre-conv, seven ``GeneratorBlock``s, tanh post-conv.

    The blocks use upsample factors 1, 1, 2, 2, 2, 3 and 5, and the final
    output is reshaped to ``(batch, 48000, 1)``.

    Args:
        batchSize: Stored on the instance for callers that read it. The
            forward pass no longer depends on it, so batches of any size are
            accepted.
        isTraining: Forwarded to every ``GeneratorBlock``.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, batchSize, isTraining,  **kwargs):
        super(Generator, self).__init__(**kwargs)
        self.batchSize = batchSize
        self.isTraining = isTraining
        self.preProcess = SpectralConv1D(filters=768, kernelSize=3)
        # (channels, upsample factor) for each block, in execution order.
        blockSpecs = [
            (768, 1),
            (768, 1),
            (384, 2),
            (384, 2),
            (384, 2),
            (192, 3),
            (96, 5),
        ]
        self.generatorBlocks = [
            GeneratorBlock(channels, self.isTraining, upsampleFactor)
            for channels, upsampleFactor in blockSpecs]
        self.postProcess = SpectralConv1D(filters=1, kernelSize=3, activation='tanh')

    def call(self, inputs, noise):
        """Generate audio from conditioning ``inputs`` and ``noise``.

        ``noise`` is handed unchanged to every generator block.
        """
        outputs = self.preProcess(inputs)
        for gblock in self.generatorBlocks:
            outputs = gblock(outputs, noise)
        outputs = self.postProcess(outputs)
        # Pin the output to 48000 single-channel samples per example. The
        # batch axis is inferred (-1) instead of using self.batchSize so a
        # batch of a different size does not make the reshape fail.
        outputs = tf.reshape(outputs, shape=(-1, 48000, 1))
        return outputs

### Generator block 

In [None]:
class GeneratorBlock(tf.keras.Model):
    """Residual upsampling block used by the generator.

    Main path: conditional norm -> transposed conv + conv -> conditional norm
    -> dilated conv (rate 2). A residual path (transposed conv + 1-wide conv)
    computed from the block input is added to it, followed by two more
    conditional-norm / dilated-conv pairs with rates 4 and 8.

    Args:
        channels: Number of filters of every convolution in the block.
        isTraining: Forwarded to each ``ConditionalBatchNorm``.
        upsampleFactor: Stride of both transposed convolutions.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, channels, isTraining, upsampleFactor=1, **kwargs):
        super().__init__(**kwargs)
        self.channels = channels
        self.upsampleFactor = upsampleFactor
        self.isTraining = isTraining

        # Upsampling stage of the main path.
        self.firstCBN = ConditionalBatchNorm(self.isTraining)
        self.firstStack = tf.keras.Sequential([
            SpectralConv1DTranspose(self.channels, 3, strides=self.upsampleFactor),
            SpectralConv1D(self.channels, 3),
        ])
        self.secondCBN = ConditionalBatchNorm(self.isTraining)
        self.firstDilatedConv = SpectralConv1D(self.channels, 3, dilation=2)

        # Shortcut that brings the block input to the upsampled resolution.
        self.residualStack = tf.keras.Sequential([
            SpectralConv1DTranspose(self.channels, 3, strides=self.upsampleFactor),
            SpectralConv1D(self.channels, 1),
        ])

        # Two further dilated stages applied after the residual sum.
        self.thirdCBN = ConditionalBatchNorm(self.isTraining)
        self.secondDilatedConv = SpectralConv1D(self.channels, 3, dilation=4)
        self.fourthCBN = ConditionalBatchNorm(self.isTraining)
        self.finalDilatedConv = SpectralConv1D(self.channels, 3, dilation=8)

    def call(self, inputs, noise):
        """Run the block on ``inputs``, conditioning every norm on ``noise``."""
        hidden = self.firstStack(self.firstCBN(inputs, noise))
        hidden = self.firstDilatedConv(self.secondCBN(hidden, noise))
        hidden = hidden + self.residualStack(inputs)
        hidden = self.secondDilatedConv(self.thirdCBN(hidden, noise))
        return self.finalDilatedConv(self.fourthCBN(hidden, noise))

### Conditional batch normalization + ReLU

In [None]:
class ConditionalBatchNorm(tf.keras.Model):
    """Instance normalization modulated by noise-derived scale and shift, then ReLU.

    The noise is projected by two dense layers (kernel initialized to 1.0 for
    the scale, 0.0 for the shift). One scalar is read from each projection,
    always from the first batch row, at an index drawn once at construction
    time from ``[0, 128)``.

    Args:
        isTraining: Used as the ``trainable`` flag of both dense layers.
        units: Output width of both dense layers.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, isTraining, units=1, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.isTraining = isTraining
        # Fixed for the lifetime of the layer.
        # NOTE(review): presumably the flattened noise projection has at
        # least 128 entries per example - confirm against the caller.
        self.randomIdx = np.random.randint(0, 128)
        self.instanceNorm = tfa.layers.InstanceNormalization()
        self.matrixGamma = tf.keras.layers.Dense(
            self.units,
            trainable=self.isTraining,
            kernel_initializer=tf.keras.initializers.Constant(1.0),
        )
        self.matrixBeta = tf.keras.layers.Dense(
            self.units,
            trainable=self.isTraining,
            kernel_initializer=tf.keras.initializers.Constant(0.0),
        )
        self.flatten = tf.keras.layers.Flatten()
        self.relu = tf.keras.layers.ReLU()

    def call(self, inputs, noise):
        """Normalize ``inputs``, apply the noise-derived affine map, then ReLU."""
        normalized = self.instanceNorm(inputs)
        gammaFlat = self.flatten(self.matrixGamma(noise))
        betaFlat = self.flatten(self.matrixBeta(noise))
        scale = gammaFlat[0][self.randomIdx]
        shift = betaFlat[0][self.randomIdx]
        return self.relu(scale * normalized + shift)

## Discriminator

In [None]:
class Discriminator(tf.keras.Model):
    """Five-window ensemble of unconditional and conditional discriminators.

    Each window has one unconditional and one conditional discriminator.
    Their flattened outputs are added, passed through a per-window
    ``Dense(1)``, and the five results are summed.

    Args:
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # (downsample factor, block factors) per window, in window order.
        unconditionalSpecs = [
            (1, (5, 3)),
            (2, (5, 3)),
            (4, (5, 3)),
            (8, (5, 3)),
            (15, (2, 2)),
        ]
        conditionalSpecs = [
            (1, (1, 5, 3, 2, 2, 2)),
            (2, (1, 5, 3, 2, 2)),
            (4, (1, 5, 3, 2, 2)),
            (8, (1, 5, 3)),
            (15, (1, 2, 2, 2)),
        ]
        self.uDscriminatorStack = [
            UnconditionalDiscriminator(downsample, factors)
            for downsample, factors in unconditionalSpecs]
        self.cDiscriminatorStack = [
            ConditionalDiscriminator(downsample, factors)
            for downsample, factors in conditionalSpecs]
        self.flatten = tf.keras.layers.Flatten()
        self.denseStack = [tf.keras.layers.Dense(1) for _ in range(5)]

    def call(self, w1Inputs, w2Inputs, w3Inputs, w4Inputs, w5Inputs, condition):
        """Score five windows against ``condition`` and return the summed score."""
        windows = (w1Inputs, w2Inputs, w3Inputs, w4Inputs, w5Inputs)
        branches = zip(self.uDscriminatorStack, self.cDiscriminatorStack,
                       windows, self.denseStack)
        return sum(
            dense(self.flatten(uDisc(window))
                  + self.flatten(cDisc(window, condition)))
            for uDisc, cDisc, window, dense in branches)

### Unconditional discriminator

In [None]:
class UnconditionalDiscriminator(tf.keras.Model):
    """Discriminator branch that sees only the audio window.

    The input goes through a 1-wide spectral convolution and a max pool (both
    sized by ``downsampleFactor``), five ``DiscriminatorBlock``s and a final
    average pool.

    Args:
        downsampleFactor: Filter count of the first convolution and pool size
            of the max pool that follows it.
        factors: Indexable whose first two entries are the downsample factors
            of the second and third discriminator blocks.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, downsampleFactor, factors, **kwargs):
        super().__init__(**kwargs)
        self.downsampleFactor = downsampleFactor
        self.factors = factors
        self.reshapeNet = tf.keras.Sequential([
            SpectralConv1D(filters=self.downsampleFactor, kernelSize=1),
            tf.keras.layers.MaxPool1D(self.downsampleFactor, padding='same'),
        ])
        # (filters, downsample factor) of each block, in execution order.
        blockSpecs = (
            (64, 1),
            (128, self.factors[0]),
            (256, self.factors[1]),
            (256, 1),
            (256, 1),
        )
        self.dBlockStack = tf.keras.Sequential(
            [DiscriminatorBlock(width, factor) for width, factor in blockSpecs])
        self.avgPool = tf.keras.layers.AveragePooling1D()

    def call(self, inputs):
        """Return the average-pooled features of ``inputs``."""
        features = self.dBlockStack(self.reshapeNet(inputs))
        return self.avgPool(features)

### Conditional discriminator 

In [None]:
class ConditionalDiscriminator(tf.keras.Model):
    """Discriminator branch that also receives the conditioning features.

    All entries of ``factors`` except the last configure plain
    ``DiscriminatorBlock``s whose width starts at 64 and doubles each time.
    The last entry configures the ``ConditionalDBlock``, which is followed by
    two more plain blocks of the same width and an average pool.

    Args:
        downsampleFactor: Filter count of the first convolution and pool size
            of the max pool that follows it.
        factors: Downsample factors; the last one belongs to the conditional
            block.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, downsampleFactor, factors, **kwargs):
        super().__init__(**kwargs)
        self.downsampleFactor = downsampleFactor
        self.factors = factors
        self.reshape = tf.keras.Sequential([
            SpectralConv1D(filters=self.downsampleFactor, kernelSize=1),
            tf.keras.layers.MaxPool1D(self.downsampleFactor, padding='same'),
        ])
        # Width doubles with depth: 64, 128, 256, ...
        leadingBlocks = [
            DiscriminatorBlock(64 * 2 ** depth, factor)
            for depth, factor in enumerate(self.factors[:-1])]
        # The conditional block continues the doubling sequence.
        condWidth = 64 * 2 ** len(leadingBlocks)
        self.dblockStack = tf.keras.Sequential(leadingBlocks)
        self.condDBlock = ConditionalDBlock(condWidth, self.factors[-1])
        self.finalDBlocks = tf.keras.Sequential([
            DiscriminatorBlock(condWidth, 1),
            DiscriminatorBlock(condWidth, 1),
        ])
        self.avgPool = tf.keras.layers.AveragePooling1D()

    def call(self, inputs, condition):
        """Return the average-pooled features of ``inputs`` given ``condition``."""
        features = self.dblockStack(self.reshape(inputs))
        features = self.condDBlock(features, condition)
        features = self.finalDBlocks(features)
        return self.avgPool(features)

#### Conditional dblock 

In [None]:
class ConditionalDBlock(tf.keras.Model):
    """Discriminator block that mixes conditioning features into the main path.

    Main path: max pool -> ReLU -> conv, plus a 1-wide convolution of the
    condition, then ReLU -> dilated conv (rate 2). A residual path (1-wide
    conv -> max pool) computed from the block input is added at the end.

    Args:
        filters: Number of filters of every convolution in the block.
        downsampleFactor: Pool size of both max-pool layers.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, filters, downsampleFactor, **kwargs):
        super().__init__(**kwargs)
        self.filters = filters
        self.downsampleFactor = downsampleFactor
        self.firstStack = tf.keras.Sequential([
            tf.keras.layers.MaxPool1D(self.downsampleFactor, padding='same'),
            tf.keras.layers.ReLU(),
            SpectralConv1D(filters=self.filters, kernelSize=3),
        ])
        self.featureConv = SpectralConv1D(filters=self.filters, kernelSize=1)
        self.secondStack = tf.keras.Sequential([
            tf.keras.layers.ReLU(),
            SpectralConv1D(filters=self.filters, kernelSize=3, dilation=2),
        ])
        self.residualStack = tf.keras.Sequential([
            SpectralConv1D(filters=self.filters, kernelSize=1),
            tf.keras.layers.MaxPool1D(self.downsampleFactor, padding='same'),
        ])

    def call(self, inputs, condition):
        """Process ``inputs`` with ``condition`` injected after the first stack."""
        mainPath = self.firstStack(inputs)
        mainPath = mainPath + self.featureConv(condition)
        mainPath = self.secondStack(mainPath)
        return mainPath + self.residualStack(inputs)

### Discriminator block

In [None]:
class DiscriminatorBlock(tf.keras.Model):
    """Residual downsampling block used by the discriminators.

    Main path: max pool -> ReLU -> conv (ReLU) -> dilated conv (rate 2, ReLU).
    Residual path: conv -> max pool. The two paths are summed.

    Args:
        filters: Number of filters of every convolution in the block.
        downsampleFactor: Pool size of both max-pool layers.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, filters, downsampleFactor, **kwargs):
        super().__init__(**kwargs)
        self.filters = filters
        self.downsampleFactor = downsampleFactor
        self.stack = tf.keras.Sequential([
            tf.keras.layers.MaxPool1D(self.downsampleFactor, padding='same'),
            tf.keras.layers.ReLU(),
            SpectralConv1D(filters=self.filters, kernelSize=3,
                           activation=tf.nn.relu),
            SpectralConv1D(filters=self.filters, kernelSize=3,
                           activation=tf.nn.relu, dilation=2),
        ])
        self.residualStack = tf.keras.Sequential([
            SpectralConv1D(filters=self.filters, kernelSize=3),
            tf.keras.layers.MaxPool1D(self.downsampleFactor, padding='same'),
        ])

    def call(self, inputs):
        """Return the sum of the residual path and the main path."""
        mainPath = self.stack(inputs)
        shortcut = self.residualStack(inputs)
        return shortcut + mainPath

## DiscriminatorTest 

In [None]:
class DiscriminatorTest(tf.keras.Model):
    """Three-window variant of the discriminator ensemble.

    Uses the downsample factors 4, 8 and 15. For each window the flattened
    unconditional and conditional outputs are added, passed through a
    per-window ``Dense(1)``, and the three results are summed.

    Args:
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # (downsample factor, block factors) per window, in window order.
        unconditionalSpecs = [
            (4, (5, 3)),
            (8, (5, 3)),
            (15, (2, 2)),
        ]
        conditionalSpecs = [
            (4, (1, 5, 3, 2, 2)),
            (8, (1, 5, 3)),
            (15, (1, 2, 2, 2)),
        ]
        self.uDscriminatorStack = [
            UnconditionalDiscriminator(downsample, factors)
            for downsample, factors in unconditionalSpecs]
        self.cDiscriminatorStack = [
            ConditionalDiscriminator(downsample, factors)
            for downsample, factors in conditionalSpecs]
        self.flatten = tf.keras.layers.Flatten()
        self.denseStack = [tf.keras.layers.Dense(1) for _ in range(3)]

    def call(self, w1Inputs, w2Inputs, w3Inputs, condition):
        """Score three windows against ``condition`` and return the summed score."""
        windows = (w1Inputs, w2Inputs, w3Inputs)
        branches = zip(self.uDscriminatorStack, self.cDiscriminatorStack,
                       windows, self.denseStack)
        return sum(
            dense(self.flatten(uDisc(window))
                  + self.flatten(cDisc(window, condition)))
            for uDisc, cDisc, window, dense in branches)

# GAN test

In [None]:
# Hyper-parameters and shared objects for the GAN smoke test below.

# Learning rate of the Adam optimizers created for the discriminator and the
# feature network in initializeModels().
DISC_LEARNING_RATE = 1e-4
# Learning rate of the Adam optimizer created for the generator.
GEN_LEARNING_RATE = 5e-5
# Adam beta_1 / beta_2 shared by all three optimizers.
BETA_1 = 0
BETA_2 = 0.999
# Handed to the tfa.optimizers.MovingAverage wrapper around the generator's
# optimizer.
DECAY_RATE = 0.9999
# TF-Hub handles loaded through hub.KerasLayer inside the BERT model.
PREPROCESSOR = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
ENCODER = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1"
# Built once when this cell runs.
# NOTE(review): presumably this fetches both hub modules on first use - confirm.
BERT_MODEL = BERT(PREPROCESSOR, ENCODER)
# Window lengths (along axis 1 of the audio) for the five-window Discriminator.
WINDOWS = [240, 480, 960, 1920, 3600]
# Window lengths used with the three-window DiscriminatorTest in trainStep().
WINDOWS_TEST = [960, 1920, 3600]
# Batch size given to the CBHG and Generator constructors and used for the
# noise tensor in trainStep().
BATCH_SIZE = 2
# Number of passes over the dataset made by train().
EPOCHS = 1

In [None]:
def getSamples(audioArray, windows):
    """Cut one randomly positioned window of each requested length.

    Every window is sliced along axis 1 of ``audioArray`` at a start index
    drawn independently per window; the same start is used for every batch
    element.

    Args:
        audioArray: Rank-3 array or tensor indexed as ``[:, start:stop, :]``.
        windows: Iterable of window lengths.

    Returns:
        A list with one slice per entry of ``windows``, in the same order.

    Raises:
        ValueError: If a window is longer than axis 1 of ``audioArray``.
    """
    totalSamples = len(audioArray[0])
    subSamples = []
    for window in windows:
        if window > totalSamples:
            raise ValueError(
                "window length {} exceeds the {} available samples".format(
                    window, totalSamples))
        # randint's upper bound is exclusive: the +1 makes the last valid
        # start index reachable and allows window == totalSamples.
        idx = np.random.randint(0, totalSamples - window + 1)
        subSamples.append(audioArray[:, idx:idx + window, :])
    return subSamples

In [None]:
def initializeModels():
    """Create the three networks and their optimizers.

    Returns:
        A tuple ``(featureNet, generator, discriminatorTest, genOptimizer,
        discOptimizer, featureOptimizer)``. The generator optimizer is Adam
        wrapped in ``tfa.optimizers.MovingAverage``; the other two are plain
        Adam.
    """
    featureNet = CBHG(BATCH_SIZE, 16, True)
    generator = Generator(BATCH_SIZE, True)
    # NOTE: swap in Discriminator() here to use the five-window ensemble.
    discriminatorTest = DiscriminatorTest()
    # MovingAverage takes its EMA decay as `average_decay`. A `decay` keyword
    # falls through to the base optimizer's **kwargs instead, which leaves
    # the averaging decay at its default.
    genOptimizer = tfa.optimizers.MovingAverage(
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=GEN_LEARNING_RATE, beta_1=BETA_1, beta_2=BETA_2),
        average_decay=DECAY_RATE)
    # `learning_rate` replaces the deprecated `lr` alias.
    discOptimizer = tf.keras.optimizers.Adam(
        learning_rate=DISC_LEARNING_RATE, beta_1=BETA_1, beta_2=BETA_2)
    featureOptimizer = tf.keras.optimizers.Adam(
        learning_rate=DISC_LEARNING_RATE, beta_1=BETA_1, beta_2=BETA_2)
    return featureNet, generator, discriminatorTest, genOptimizer, discOptimizer, featureOptimizer

In [None]:
def trainStep(audioBatch, textBatch, featureNet, generator, discriminator, genOptimizer, discOptimizer, featureOptimizer):
    """Run one adversarial update of the three networks.

    The feature network turns ``textBatch`` into conditioning for the
    generator and the discriminator. Random windows of the generated and the
    real audio are scored by the discriminator, hinge losses are computed,
    and each network is updated by its own optimizer. The feature network is
    trained on the discriminator loss.

    Args:
        audioBatch: Real audio, indexed as ``[batch, time, channel]``.
        textBatch: Text features fed to ``featureNet``.
        featureNet: Model returning ``(genFeatures, discFeatures)``.
        generator: Model called as ``generator(genFeatures, noise)``.
        discriminator: Model called with one argument per window in
            ``WINDOWS_TEST`` followed by the conditioning features.
        genOptimizer: Optimizer for the generator's variables.
        discOptimizer: Optimizer for the discriminator's variables.
        featureOptimizer: Optimizer for the feature network's variables.
    """
    with tf.device('/device:GPU:0'):
        noise = tf.random.normal((BATCH_SIZE, 128, 1))
        with tf.GradientTape() as genTape, tf.GradientTape() as discTape, tf.GradientTape() as featureTape:
            # training=True is required: without it Keras runs the models in
            # inference mode, so spectral normalization does not update,
            # Dropout is disabled and BatchNormalization uses its moving
            # statistics. The flag propagates to every nested layer.
            genFeatures, discFeatures = featureNet(textBatch, training=True)
            generatedAudio = generator(genFeatures, noise, training=True)
            # NOTE: for the five-window Discriminator, sample with WINDOWS
            # instead of WINDOWS_TEST; the calls below adapt automatically.
            fakeWindows = getSamples(generatedAudio, WINDOWS_TEST)
            fakeAudio = discriminator(*fakeWindows, discFeatures, training=True)
            realWindows = getSamples(audioBatch, WINDOWS_TEST)
            realAudio = discriminator(*realWindows, discFeatures, training=True)
            # Keras' hinge loss accepts 0/1 labels and maps them to -1/+1.
            discFakeLoss = tf.losses.hinge(tf.zeros_like(fakeAudio), fakeAudio)
            discRealLoss = tf.losses.hinge(tf.ones_like(realAudio), realAudio)
            discLoss = discFakeLoss + discRealLoss
            genLoss = tf.losses.hinge(tf.ones_like(fakeAudio), fakeAudio)
        discGradients = discTape.gradient(discLoss, discriminator.trainable_variables)
        discOptimizer.apply_gradients(zip(discGradients, discriminator.trainable_variables))
        genGradients = genTape.gradient(genLoss, generator.trainable_variables)
        genOptimizer.apply_gradients(zip(genGradients, generator.trainable_variables))
        featureGradients = featureTape.gradient(discLoss, featureNet.trainable_variables)
        featureOptimizer.apply_gradients(zip(featureGradients, featureNet.trainable_variables))
        # The losses are per-example vectors; only the first entry is printed.
        print("Generator loss:", genLoss.numpy()[0],"| Discriminator loss:", discLoss.numpy()[0])

In [None]:
def train(dataset, epochs):
    """Build fresh models and optimizers, then train them on ``dataset``.

    Args:
        dataset: Iterable of batches where element 0 is the audio and
            element 1 is the text features.
        epochs: Number of passes over ``dataset``.
    """
    # Tuple order matches the trailing parameters of trainStep().
    modelsAndOptimizers = initializeModels()
    for epochNumber in range(1, epochs + 1):
        print("Epoch", epochNumber)
        for batch in dataset:
            audioBatch, textBatch = batch[0], batch[1]
            trainStep(audioBatch, textBatch, *modelsAndOptimizers)

In [None]:
# Smoke test: train for EPOCHS on one batch of random "audio" paired with two
# example sentences.

# Random stand-in for real audio; its shape matches the (batch, 48000, 1)
# output that Generator reshapes to.
audio = tf.random.normal((2, 48000, 1))
text_input = ['This is such an amazing movie!', 'English is a West Germanic language first spoken in early medieval England']
# NOTE(review): this rebinds the name `text`, which the import cell uses as the
# alias for tensorflow_text. Nothing below uses the module through that name,
# but later cells that do would break - consider a different variable name.
text = BERT_MODEL(text_input)
# Pair each audio clip with its text features and serve them as one batch of 2.
dataset = tf.data.Dataset.from_tensor_slices((audio, text)).batch(2)
train(dataset, EPOCHS)