###### Import libraries

In [1]:
import pdb
import os
import shutil

In [2]:
import numpy as np
import pandas as pd

In [3]:
import tensorflow as tf

In [4]:
import sklearn.preprocessing

##### Generate latent data
Use np.random.uniform to generate latent data. And append a __label__ column to the latent data.

>There are two kinds of latent data. 
>1. "jobs/JOBXXXXX/samples/sections/section/latent/"*
>2. "samples/sections/section/latent"

In [49]:
def fn_generateLatentSamples(strPart, strJobsDir):
    for name in os.listdir(strJobsDir):
        strJobDir = os.path.join(strJobsDir, name)
        strSectionSamplesDir = os.path.join(strJobDir, "samples/sections/" + strPart)
        
        strLatentSectionSamplesDir = os.path.join(strSectionSamplesDir, "latent")
        if os.path.exists(strLatentSectionSamplesDir):
            shutil.rmtree(strLatentSectionSamplesDir)
        os.mkdir(strLatentSectionSamplesDir)
        strSectionSamplesFile = os.path.join(strSectionSamplesDir, "samples.csv")
        pdDfSectionSamples = pd.read_csv(strSectionSamplesFile)
        npNArrLatentSamples = np.random.uniform(0., 1., size = pdDfSectionSamples.shape)
        pdDfLatentSectionSamples = pd.DataFrame(data = npNArrLatentSamples, 
                                                columns=pdDfSectionSamples.columns, index=pdDfSectionSamples.index)
        pdDfLatentSectionSamples["label"] = 0.
        strLatentSectionSamplesFile = os.path.join(strLatentSectionSamplesDir, "samples.csv")
        pdDfLatentSectionSamples.to_csv(strLatentSectionSamplesFile, index=False)
        
        strReservedLatentSectionSamplesDir = os.path.join(strLatentSectionSamplesDir, "reserved")
        os.mkdir(strReservedLatentSectionSamplesDir)
        npNArrReservedLatentSamples = np.random.uniform(0., 1., size = pdDfSectionSamples.shape)
        pdDfReservedLatentSectionSamples = pd.DataFrame(data = npNArrReservedLatentSamples, 
                                               columns=pdDfSectionSamples.columns, index=pdDfSectionSamples.index)
        pdDfReservedLatentSectionSamples["label"]  = 0
        strReservedLatentSectionSamplesFile = os.path.join(strReservedLatentSectionSamplesDir, "samples.csv")
        pdDfReservedLatentSectionSamples.to_csv(strReservedLatentSectionSamplesFile, index=False)

###### Preprocess training samples of a section

Preprocess the samples from __jobs/JOBXXXXX/samples/sections/section/__ 
and put them into __jobs/JOBXXXX/samples/sections/section/preprocessed__

In [42]:
def fn_preprocessJobsSectionSamples(strPart, strJobsDir, strJobsSamplesDir):
    strJobsSectionSamplesDir = os.path.join(strJobsSamplesDir, "sections/" + strPart)
    oMinMaxScaler = fn_makeScaler(strJobsSectionSamplesDir)
    
    for name in os.listdir(strJobsDir):
        strJobDir = os.path.join(strJobsDir, name)
        fn_preprocessJobSectionSamples(strPart, oMinMaxScaler, strJobDir)
    return oMinMaxScaler
    
def fn_makeScaler(strJobsSectionSamplesDir):
    strJobsSectionSamplesFile = os.path.join(strJobsSectionSamplesDir, "samples.csv")
    pdDfJobsSectionSamples = pd.read_csv(strJobsSectionSamplesFile)
    oMinMaxScaler = sklearn.preprocessing.MinMaxScaler()
    oMinMaxScaler.fit(pdDfJobsSectionSamples.values)
    return oMinMaxScaler
def fn_preprocessJobSectionSamples(strPart, oMinMaxScaler, strJobDir):
    strSectionSamplesFile = os.path.join(strJobDir, "samples/sections/" + strPart + "/samples.csv")
    pdDfSectionSamples = pd.read_csv(strSectionSamplesFile)
    npNArrPreprocessedSectionSamples = oMinMaxScaler.transform(pdDfSectionSamples.values)
    pdDfPreprocessedSectionSamples = pd.DataFrame(npNArrPreprocessedSectionSamples, columns=pdDfSectionSamples.columns)
    pdDfPreprocessedSectionSamples["label"] = 1.
    strPreprocessedSectionSamplesDir = os.path.join(strJobDir, "samples/sections/" + strPart + "/preprocessed")
    if os.path.exists(strPreprocessedSectionSamplesDir):
        shutil.rmtree(strPreprocessedSectionSamplesDir)
    os.mkdir(strPreprocessedSectionSamplesDir)
    strPreprocessedSectionSamplesFile = os.path.join(strPreprocessedSectionSamplesDir, "samples.csv")
    pdDfPreprocessedSectionSamples.to_csv(strPreprocessedSectionSamplesFile, index = False)

###### Get dataset

In [64]:
g_nFeatures = 0

In [65]:
def fn_getDiscriminatorTrainingDataset(strPart, fn_splitCsvLineIntoXAndY, strJobsDir, 
                          nReaders = 5, nReadThreads = 5, nParseTreads = 5, nShuffleBufferSize = 1000, nBatchSize = 32):
    oDataset = tf.data.Dataset.list_files(strJobsDir + "/*/samples/sections/" + strPart + "/*/samples.csv")
    oDataset = oDataset.interleave(lambda strSampleFile: tf.data.TextLineDataset(strSampleFile).skip(1), cycle_length=nReaders, 
                                  num_parallel_calls=nReadThreads)
    oDataset = oDataset.map(fn_splitCsvLineIntoXAndY, nParseTreads)
    oDataset = oDataset.shuffle(nShuffleBufferSize)
    return oDataset.batch(nBatchSize)

def fn_splitCsvLineIntoXAndY(strLine):
    tensorDefs = [0.] * g_nFeatures + [tf.constant([], dtype=tf.float32)]
    tensorRecord = tf.io.decode_csv(strLine, record_defaults=tensorDefs)
    tensorX = tf.stack(tensorRecord[:-1])
    tensorY = tf.stack(tensorRecord[-1])
    return tensorX, tensorY

In [None]:
g_nFeatures = 3
oDataset = fn_getDiscriminatorTrainingDataset("input", )

###### Input

In [50]:
fn_preprocessJobsSectionSamples("input", "jobs", "samples")
fn_generateLatentSamples("input", "jobs")

In [7]:
oMinMaxScalerInput = fn_normalizeJobsSectionSamples(strPart = "input", strJobsDir = "jobs", strJobsSamplesDir = "samples")

In [17]:
fn_generateLatentSamples("input", "jobs")

In [8]:
g_nInputBatchSize = 32

In [9]:
dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3])
for i in dataset:
    print(i)

tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)


##### Building model

In [26]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LeakyReLU
from tensorflow import math

import numpy as np

def fn_DLoss(tensorTrue, tensorPred):
        tensorOnes, tensorReal = tensorTrue[:g_nInputBatchSize, :], tensorPred[:g_nInputBatchSize, :]
        tensorZeros, tensorFake = tensorTrue[g_nInputBatchSize:, :], tensorPred[g_nInputBatchSize:, :]
        tensorLossReal = tf.math.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits = tensorReal, labels = tensorOnes), 1)
        tensorLossFake = tf.math.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits = tensorFake, labels = tensorZeros), 1)
        tensorLoss = tensorLossReal + tensorLossFake
        return tf.math.reduce_mean(tensorLoss)  
def fn_GLoss(tensorTrue, tensorPred):
    tensorLoss = tf.math.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits = tensorPred, labels = tensorTrue), 1)
    return tf.math.reduce_mean(tensorLoss)

class InputGANOperator(object):
    """
        specify the number of features to be dealt by gan
    """
    def __init__(self, nLatentDim, nHiddenDimG, nHiddenDimD, nBatchSize, nFeatures):
        self.nLatentDim = nLatentDim
        self.nHiddenDimG = nHiddenDimG
        self.nHiddenDimD = nHiddenDimD
        self.nBatchSize = nBatchSize
        self.nFeatures = nFeatures
        
        self.oSeqGenerator = None
        self.oSeqDiscriminator = None
        self.fn_makeGenerator()
        self.fn_makeDiscriminator()
        self.fn_makeDiscriminatorModel()
        self.fn_makeAdversariaModel()
        
    def fn_makeGenerator(self):
        if self.oSeqGenerator:
            return self.oSeqGenerator
        self.oSeqGenerator = Sequential()
        self.oSeqGenerator.add(Dense(64, activation = tf.keras.activations.relu))
        self.oSeqGenerator.add(Dense(3, activation = tf.keras.activations.sigmoid))
    def fn_makeDiscriminator(self):
        if self.oSeqDiscriminator:
            return self.oSeqDiscriminator
        self.oSeqDiscriminator = Sequential()
        self.oSeqDiscriminator.add(Dense(64, activation = tf.keras.activations.relu))
        self.oSeqDiscriminator.add(Dense(1, activation = tf.keras.activations.sigmoid))
    
    def fn_makeDiscriminatorTrainingModel(self):
        self.oSeqDiscriminatorModel = Sequential()
        self.oSeqDiscriminatorModel.add(self.oSeqDiscriminator)
        oOptimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)
        self.oSeqDiscriminatorModel.compile(loss=fn_DLoss, optimizer=oOptimizer, metrics=["accuracy"])
    def fn_makeAdversariaTrainingModel(self):
        self.oSeqAdversarialModel = Sequential()
        self.oSeqAdversarialModel.add(self.oSeqGenerator)
        self.oSeqAdversarialModel.add(self.oSeqDiscriminator)
        self.oSeqDiscriminator.trainable = False
        oOptimer = tf.optimizers.Adam()
        self.oSeqAdversarialModel.compile(loss = fn_GLoss, optimizer=oOptimer, metrics=["accuracy"])
    
    def fn_train(self, npNArrSamples, nEpochs = 2000):
        for epoch in range(nEpochs):
            npNArrTrueSamples = npNArrSamples[np.random.randint(0, npNArrSamples.shape[0], size = self.nBatchSize), :]
            npNoise = np.random.uniform(-1.0, 1.0, size = [self.nBatchSize, self.nLatentDim])
            npNArrFake = self.oSeqGenerator.predict(npNoise)
            npNArrX = np.concatenate((npNArrTrueSamples, npNArrFake))
            npNArrY = np.ones([2 * self.nBatchSize, 1])
            npNArrY[self.nBatchSize:, :] = 0
            fLossD = self.oSeqDiscriminatorModel.train_on_batch(npNArrX, npNArrY)
            
            npNArrY = np.ones([self.nBatchSize, 1])
            npNoise = np.random.uniform(-1.0, 1.0, size = [self.nBatchSize, self.nLatentDim])
            fLossA = self.oSeqAdversarialModel.train_on_batch(npNoise, npNArrY)

            strMsg = "%d: [D loss: %f, acc: %f]" % (epoch, fLossD[0], fLossD[1])
            strMsg = "%s [A loss: %f, acc: %f]" % (strMsg, fLossA[0], fLossA[1])
            print(strMsg)

In [31]:
keras.backend.clear_session()
oGANOperator = GANOperator(nLatentDim=8, nHiddenDimG=16, nHiddenDimD=16, nBatchSize=g_nBatchSize, 
                           nFeatures=5)

In [32]:
oGANOperator.fn_train(npNArrSamples, nEpochs=10)

0: [D loss: 1.452051, acc: 0.400000] [A loss: 0.490367, acc: 0.200000]
1: [D loss: 1.434356, acc: 0.500000] [A loss: 0.474354, acc: 0.400000]
2: [D loss: 1.459817, acc: 0.400000] [A loss: 0.508515, acc: 0.000000]
3: [D loss: 1.454618, acc: 0.500000] [A loss: 0.449057, acc: 0.800000]
4: [D loss: 1.487272, acc: 0.300000] [A loss: 0.479009, acc: 0.400000]
5: [D loss: 1.391144, acc: 0.800000] [A loss: 0.503868, acc: 0.200000]
6: [D loss: 1.450166, acc: 0.500000] [A loss: 0.478879, acc: 0.400000]
7: [D loss: 1.487112, acc: 0.400000] [A loss: 0.463928, acc: 0.800000]
8: [D loss: 1.436899, acc: 0.500000] [A loss: 0.446789, acc: 0.600000]
9: [D loss: 1.470820, acc: 0.300000] [A loss: 0.479095, acc: 0.400000]
