# In [11]:
from __future__ import absolute_import, division, print_function, unicode_literals

#!pip install tensorflow-gpu==2.0.0-beta1

"""
Source: https://github.com/matiRLC/Keras-GAN/blob/master/gan/gan.py
Adapted by: matias@u.nus.edu
TODOs:
    supervised discriminator
    Wasserstein GAN
    categorical encoding
"""

# Tensorflow
import tensorflow as tf
from tensorflow.keras import layers
from IPython import display 
print(tf.__version__)

# Progress bar
from tqdm import tqdm

# Numpy, pandas, matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
import PIL

# Sklearn
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler


class GAN():
    """Dense GAN for tabular data with noisy one-hot categorical columns.

    The generator maps latent noise to ``data_dim`` features; the
    discriminator maps ``data_dim`` features to a single logit.

    Attributes:
        display (bool): When True, ``generate_data`` plots and saves a histogram.
        seed (int): Seed used when shuffling the training set.
        gamma (float): Upper bound of the uniform noise added to one-hot vectors.
        scaler: ``StandardScaler`` fitted by ``prepare_data`` (None until then).
        columns: Column labels of the last dataframe given to ``prepare_data``.
        lr (float): Learning rate of both Adam optimizers.
        data_dim (int): Number of features the discriminator consumes.
        latent_dim (int): Size of the generator's noise input.
        n_hidden (int): Units of the hidden dense layers.
        n_layers (int): Number of hidden dense layers.
        mac (float): Weight of the previous value in the correlation moving average.
        loss_g, loss_d, acc_pos, acc_neg (list): Per-epoch training history.
    """

    def __init__(self, data_dim=15, n_hidden=200, n_layers=2, lr=0.0002, display=False):
        """Initialize the object, set arguments as attributes and build both networks.

        Args:
            data_dim (int): Number of features of a (prepared) sample.
            n_hidden (int): Units of the hidden dense layers.
            n_layers (int): Number of hidden dense layers.
            lr (float): Learning rate of both Adam optimizers.
            display (bool): Plot generated samples in ``generate_data``.
        """
        self.display = display
        self.seed = 13
        self.gamma = 0.2
        self.scaler = None
        self.columns = []

        self.lr = lr
        self.data_dim = data_dim  # number of features
        self.latent_dim = 30  # before was data_dim
        self.n_hidden = n_hidden
        self.n_layers = n_layers

        # optimizers
        self.generator_optimizer = tf.keras.optimizers.Adam(lr)
        self.discriminator_optimizer = tf.keras.optimizers.Adam(lr)

        # Build the generator
        self.generator = self.build_generator()

        # Build the discriminator
        self.discriminator = self.build_discriminator()

        # Helper function to compute cross entropy loss; the discriminator
        # outputs raw logits (its last layer has no activation).
        self.cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

        # Moving averages of the correlation matrices start at 0. They are
        # square so they match the shape of ``DataFrame.corr()`` directly
        # instead of relying on broadcasting during the first update.
        corr_shape = (self.data_dim, self.data_dim)
        self.prev_gcorr_real = np.zeros(corr_shape, dtype=np.float32)
        self.prev_gcorr_fake = np.zeros(corr_shape, dtype=np.float32)

        # Moving average contribution
        self.mac = 0.99

        # loss and accuracy arrays
        self.loss_g = []
        self.loss_d = []
        self.acc_pos = []
        self.acc_neg = []

    def build_generator(self):
        """Build the generator network and print its summary.

        Every dense layer, including the output layer, is followed by batch
        normalization and LeakyReLU. The last hidden layer has half the units.

        Returns:
            tf.keras.Sequential: Model mapping ``latent_dim`` to ``data_dim``.
        """
        units = self.n_hidden
        last_hidden = self.n_layers - 2

        model = tf.keras.Sequential()
        # first layer
        model.add(layers.Dense(units, use_bias=False, input_dim=self.latent_dim))
        model.add(layers.BatchNormalization())
        model.add(layers.LeakyReLU(alpha=0.2))

        # hidden layers
        for layer in range(self.n_layers - 1):
            if layer == last_hidden:  # last layer with fewer neurons
                # Floor division keeps the unit count an integer even though
                # the module enables true division.
                units = units // 2

            model.add(layers.Dense(units, use_bias=False))
            model.add(layers.BatchNormalization())
            model.add(layers.LeakyReLU(alpha=0.2))

        # last layer
        model.add(layers.Dense(self.data_dim))
        model.add(layers.BatchNormalization())
        model.add(layers.LeakyReLU(alpha=0.2))

        print("Generator Summary:")
        model.summary()

        return model

    def build_discriminator(self):
        """Build the discriminator network and print its summary.

        Every hidden dense layer is followed by LeakyReLU and dropout. The
        output is a single logit (no activation).

        Returns:
            tf.keras.Sequential: Model mapping ``data_dim`` to one logit.
        """
        units = self.n_hidden
        last_hidden = self.n_layers - 2

        model = tf.keras.Sequential()
        # first layer
        model.add(layers.Dense(units, input_dim=self.data_dim))
        model.add(layers.LeakyReLU(alpha=0.2))
        model.add(layers.Dropout(0.3))

        # hidden layers
        for layer in range(self.n_layers - 1):
            if layer == last_hidden:  # last layer with fewer neurons
                # Floor division keeps the unit count an integer.
                units = units // 2

            model.add(layers.Dense(units))
            model.add(layers.LeakyReLU(alpha=0.2))
            model.add(layers.Dropout(0.3))

        # last layer
        model.add(layers.Dense(1))

        print("Discriminator Summary:")
        model.summary()

        return model

    def discriminator_loss(self, real_output, fake_output):
        """Return the mean of the cross entropy on real and on fake logits.

        Args:
            real_output: Discriminator logits for real samples (target 1).
            fake_output: Discriminator logits for generated samples (target 0).
        """
        real_loss = self.cross_entropy(tf.ones_like(real_output), real_output)
        fake_loss = self.cross_entropy(tf.zeros_like(fake_output), fake_output)
        total_loss = 0.5 * real_loss + 0.5 * fake_loss  # TODO: add regularization
        return total_loss

    def discriminator_acc(self, real_output, fake_output):
        """Return the discriminator accuracy on real and on fake samples.

        Args:
            real_output: Discriminator logits for real samples.
            fake_output: Discriminator logits for generated samples.

        Returns:
            tuple: ``(acc_pos, acc_neg)`` - fraction of real samples scored
            above 0.5 and fraction of fake samples scored below 0.5.
        """
        score_real = tf.sigmoid(real_output)
        # The fake score must come from the fake logits; using the real
        # logits here would make acc_neg a function of the real samples only.
        score_fake = tf.sigmoid(fake_output)
        acc_pos = tf.reduce_mean(tf.cast(score_real > 0.5, tf.float32))
        acc_neg = tf.reduce_mean(tf.cast(score_fake < 0.5, tf.float32))
        return acc_pos, acc_neg

    def generator_loss(self, fake_output):
        """Return the cross entropy of the fake logits against target 1."""
        cross_loss = self.cross_entropy(tf.ones_like(fake_output), fake_output)
        return cross_loss

    def _moving_average_corr(self, previous, batch):
        """Blend ``previous`` with the column correlation matrix of ``batch``."""
        batch_corr = pd.DataFrame(batch.numpy()).corr()
        return np.array(self.mac * previous + (1 - self.mac) * batch_corr)

    def correlation_loss(self, original, fake):
        """Return the L1 distance between real and fake correlation averages.

        Both moving averages stored on the instance are updated as a side
        effect.

        NOTE(review): the value is computed with NumPy/pandas from
        ``.numpy()`` copies, so it is a constant with respect to the generator
        weights; adding it to the generator loss does not change gradients.

        Args:
            original: Eager tensor with a batch of real samples.
            fake: Eager tensor with a batch of generated samples.

        Returns:
            tf.Tensor: Scalar float32 loss.
        """
        gcorr_real = self._moving_average_corr(self.prev_gcorr_real, original)
        gcorr_fake = self._moving_average_corr(self.prev_gcorr_fake, fake)
        corr_loss = tf.cast(tf.reduce_sum(tf.abs(gcorr_real - gcorr_fake)), tf.float32)
        # update for next iterations
        self.prev_gcorr_real = gcorr_real
        self.prev_gcorr_fake = gcorr_fake
        return corr_loss

    # This annotation causes the function to be "compiled" and therefore run as graph
    @tf.function
    def train_step(self, data, noise, corr_loss, BATCH_SIZE):
        """Run one optimization step of the generator and the discriminator.

        Args:
            data: Batch of real samples.
            noise: Latent noise fed to the generator.
            corr_loss: Scalar added to the generator loss before differentiation.
            BATCH_SIZE: Unused; kept so existing callers keep working.

        Returns:
            tuple: ``(gen_loss, disc_loss, d_pos_acc, d_neg_acc)``. ``gen_loss``
            is the adversarial loss only, without ``corr_loss``.
        """
        # One tape per network because each loss is differentiated with
        # respect to a different set of variables.
        with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
            generated_data = self.generator(noise, training=True)  # G(z)
            real_output = self.discriminator(data, training=True)  # D(x)
            fake_output = self.discriminator(generated_data, training=True)  # D(G(z))

            gen_loss = self.generator_loss(fake_output)
            # generator loss + correlation loss
            total_gen_loss = gen_loss + corr_loss  # TODO: add regularization
            disc_loss = self.discriminator_loss(real_output, fake_output)

            # accuracy
            d_pos_acc, d_neg_acc = self.discriminator_acc(real_output, fake_output)

        generator_vars = self.generator.trainable_variables
        discriminator_vars = self.discriminator.trainable_variables

        gradients_of_generator = gen_tape.gradient(total_gen_loss, generator_vars)
        gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator_vars)

        self.generator_optimizer.apply_gradients(
            zip(gradients_of_generator, generator_vars))
        self.discriminator_optimizer.apply_gradients(
            zip(gradients_of_discriminator, discriminator_vars))
        return gen_loss, disc_loss, d_pos_acc, d_neg_acc

    def train(self, dataframe, EPOCHS, use_corr_loss=True, BATCH_SIZE=128, SAMPLE_INTERVAL=15):
        """Train both networks and record per-epoch losses and accuracies.

        The values recorded for an epoch are those of its last batch.

        Args:
            dataframe (pandas.DataFrame): Training data, see ``prepare_data``.
            EPOCHS (int): Number of passes over the data.
            use_corr_loss (bool): Add ``correlation_loss`` to the generator loss.
            BATCH_SIZE (int): Size of the data batches and of the noise batches.
            SAMPLE_INTERVAL (int): Generate a sample every this many epochs.
        """
        # transform dataset; forward the batch size so it is not silently
        # replaced by prepare_data's own default
        prepared_dataset = self.prepare_data(dataframe, BATCH_SIZE=BATCH_SIZE)

        # A float tensor (rather than the Python int 0) keeps the argument
        # type of train_step the same in both modes.
        no_corr_loss = tf.constant(0.0)

        # progress bar, closed even if training raises
        with tqdm(total=EPOCHS) as pbar:
            for epoch in range(EPOCHS):
                for data_batch in prepared_dataset:
                    noise = tf.random.normal([BATCH_SIZE, self.latent_dim])

                    if use_corr_loss:
                        # The extra forward pass is only needed for this loss.
                        generated = self.generator(noise, training=False)
                        corr_loss = self.correlation_loss(data_batch, generated)
                    else:
                        corr_loss = no_corr_loss

                    # TODO: dont need batch_size in this function
                    gen_loss, disc_loss, d_pos_acc, d_neg_acc = self.train_step(
                        data_batch, noise, corr_loss, BATCH_SIZE)

                # Generate a sample every SAMPLE_INTERVAL epochs
                if epoch % SAMPLE_INTERVAL == 0:
                    self.generate_data(epoch, BATCH_SIZE)
                    display.clear_output(wait=True)

                # update losses and accuracy
                self.loss_g.append(gen_loss.numpy())
                self.loss_d.append(disc_loss.numpy())
                self.acc_pos.append(d_pos_acc.numpy())
                self.acc_neg.append(d_neg_acc.numpy())

                # progress bar
                pbar.update(1)

            # Generate after the final epoch; with EPOCHS == 0 there is no
            # epoch index to report, so nothing is generated.
            if EPOCHS > 0:
                display.clear_output(wait=True)
                self.generate_data(epoch, BATCH_SIZE * 5)

    def _noisy_one_hot(self, labels):
        """One-hot encode ``labels``, add uniform noise and renormalize rows.

        Args:
            labels (numpy.ndarray): Non-negative integer class indices. The
                encoding width is ``labels.max() + 1``.

        Returns:
            Rows that sum to 1, with noise drawn from ``[0, self.gamma)``.
        """
        one_hot = np.zeros((labels.size, labels.max() + 1))
        one_hot[np.arange(labels.size), labels] = 1
        noise = tf.random.uniform(tf.shape(one_hot), minval=0, maxval=self.gamma)
        noisy = one_hot + noise
        return noisy / tf.reduce_sum(noisy, keepdims=True, axis=1)

    def prepare_data(self, dataframe, BATCH_SIZE=128, cat_cols=(6, -1)):
        """Prepares, shuffles, and arrange data into batches.

        Continuous columns are standardized (the fitted scaler is stored in
        ``self.scaler``); the two categorical columns are appended at the end
        as noisy one-hot vectors, first ``cat_cols[0]`` and then
        ``cat_cols[1]``.

        Args:
            dataframe(pandas.DataFrame): Dataset
            BATCH_SIZE(integer): Size of each batch
            cat_cols(sequence): Positions, in ``dataframe``, of the two
                categorical columns. The second one is the target label and is
                shifted by +2 before encoding.
        Returns:
            tf.data.Dataset: Shuffled dataset of batches of prepared rows.
        """
        BUFFER_SIZE = dataframe.shape[0] * 2
        self.columns = dataframe.columns.values

        # Resolve the positions against the original frame before dropping,
        # so removing one column cannot shift the position of the next one.
        cat_names = [dataframe.columns[position] for position in cat_cols]
        continuous = dataframe.drop(cat_names, axis=1)

        # to numpy array and scale
        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(np.array(continuous))

        # for the other categorical label
        first_labels = np.array(dataframe.iloc[:, cat_cols[0]])
        X_scaled = np.concatenate((X_scaled, self._noisy_one_hot(first_labels)), axis=1)

        # for target label
        # re-map labels from {-2, -1, 0, 1, 2} to {0, 1, 2, 3, 4}
        target_labels = np.array(dataframe.iloc[:, cat_cols[1]]) + 2
        X_scaled = np.concatenate((X_scaled, self._noisy_one_hot(target_labels)), axis=1)

        # Batch and shuffle the data
        X_train = (tf.data.Dataset.from_tensor_slices(X_scaled)
                   .shuffle(BUFFER_SIZE, seed=self.seed)
                   .batch(BATCH_SIZE))

        return X_train

    def generate_data(self, epoch=1, BATCH_SIZE=128):
        """Sample the generator and map the output back to the original columns.

        Requires ``prepare_data`` (or ``train``) to have run, since it uses
        the fitted scaler and the stored column labels. The column layout is
        hard-coded: features 0-7 are continuous, 8-9 encode the first
        categorical column (re-inserted at position 6) and the rest encode the
        target label (appended last).

        Args:
            epoch: Label used in the plot title and file name when
                ``self.display`` is True.
            BATCH_SIZE (int): Number of rows to generate.

        Returns:
            pandas.DataFrame: Generated rows with columns ``self.columns``.
        """
        noise = tf.random.normal([BATCH_SIZE, self.latent_dim])
        generated_x = self.generator(noise, training=False).numpy()

        if self.display:
            fig = plt.figure()
            # generated_x is already a NumPy array at this point.
            plt.hist(generated_x, bins=40, density=True, histtype='bar')
            plt.title("testing:" + str(epoch))
            plt.show()
            fig.savefig("../output/GANtest/{}.png".format(epoch))
            # Release the figure so repeated sampling does not pile them up.
            plt.close(fig)

        # rescale back only the continuous columns
        generated_x_cont = self.scaler.inverse_transform(generated_x[:, :8])

        # argmax over each one-hot group to get the right label
        generated_x_cat_1 = np.argmax(generated_x[:, [8, 9]], axis=1)
        # rescale the target label with the correct offset
        generated_x_cat_2 = np.argmax(generated_x[:, 10:], axis=1) - 2

        # insert columns back
        generated_data = np.insert(generated_x_cont, 6, generated_x_cat_1, axis=1)
        generated_data = np.insert(generated_data, generated_data.shape[1], generated_x_cat_2, axis=1)
        generated_data = pd.DataFrame(generated_data, columns=self.columns)

        return generated_data

    def get_losses(self):
        """Return losses
        Args:
            None
        Returns:
            loss_g: Per-epoch generator losses.
            loss_d: Per-epoch discriminator losses.
        """
        return self.loss_g, self.loss_d

    def get_accuracies(self):
        """Return discriminator accuracies
        Args:
            None
        Returns:
            acc_pos: Per-epoch accuracy on real samples.
            acc_neg: Per-epoch accuracy on generated samples.
        """
        return self.acc_pos, self.acc_neg

# Output: 2.0.0-beta1
