In [None]:
###########################
#                         #
#   Pre-process Dataset   #
#                         #
###########################

import pandas as pd
import numpy as np
import pickle
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Load the raw training table and drop the record identifier column.
data = pd.read_csv("./../data/data_train.csv", low_memory=False)
data.drop("KEY", axis=1, inplace=True)
print("Dataset Loaded!")

# Names of the columns to keep, stored as a pickle.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# files you produced yourself.
with open("./../pickle/compCols.p", "rb") as comp_cols_file:
    compCols = pickle.load(comp_cols_file)

# Columns with more than two distinct values are one-hot encoded, except the
# ones listed in removeCols, which are kept as single (scaled) columns.
removeCols = ["DISCWT", "TOTCHG", "TOTAL_DISC"]
dummifyCols = [
    col for col in compCols
    if len(data[col].unique()) > 2 and col not in removeCols
]

# Move RACE to the end of the list: get_dummies appends the encoded columns
# in the order given, so the RACE indicator columns end up as the last
# columns of the feature matrix (the training cell relies on that position).
dummifyCols.remove("RACE")
dummifyCols.append("RACE")

# One-hot encode once and reuse the result for both the column index and the
# feature matrix.
data = pd.get_dummies(data[compCols], columns=dummifyCols)
# NOTE: despite its name, raceCols holds ALL column labels after encoding.
raceCols = data.columns

# DataFrame.as_matrix() no longer exists in current pandas; .values is the
# equivalent that works on old and new versions alike.
X = preprocessing.minmax_scale(data.values, axis=0)
print("Data: "+str(X.shape))

# 70/30 random split; no random_state is set, so the split differs per run.
X_train, X_test = train_test_split(X, test_size=0.3)
print("Train Data: "+str(X_train.shape))
print("Test Data: "+str(X_test.shape))
print("Split Complete!")

In [None]:
###########################
#                         #
#       Train Model       #
#                         #
###########################


import tensorflow as tf
import numpy as np
from tensorflow.python.framework import ops
# Clear the default graph before building the model below.
ops.reset_default_graph()

# Sub-directory of ./../weights/ used when weights are dumped.
folder = "AE-race"

# Set to 1 to dump weights and biases to file during training.
wrtAE = 0

# Fraction of rows whose target columns are blanked out (corrupted).
data_destroy = 0.1

# Optimisation settings.
training_epochs, batch_size = 100001, 2000

# Keep probability handed to tf.nn.dropout in the encoder and decoder.
keep_prob = 0.9

# Layer widths: the input width comes from the data, each hidden layer is
# half the size of the previous one.
n_input = X_train.shape[1]
n_hidden_1 = 300
n_hidden_2 = n_hidden_1 // 2
n_hidden_3 = n_hidden_2 // 2

# Graph inputs: x is the (corrupted) input, y the clean target, and mask
# selects which entries contribute to the loss and the metrics.
x = tf.placeholder(tf.float32, shape=[None, n_input])
y = tf.placeholder(tf.float32, shape=[None, n_input])
mask = tf.placeholder(tf.bool, shape=[None, n_input], name='MASK')

# (fan_in, fan_out) of every layer, in forward order. The decoder mirrors the
# encoder so the output has the same width as the input.
_encoder_dims = [(n_input, n_hidden_1),
                 (n_hidden_1, n_hidden_2),
                 (n_hidden_2, n_hidden_3)]
_decoder_dims = [(n_hidden_3, n_hidden_2),
                 (n_hidden_2, n_hidden_1),
                 (n_hidden_1, n_input)]
_stacks = (('encoder', _encoder_dims), ('decoder', _decoder_dims))

# Weight matrices, randomly initialised from a normal distribution.
# Keys: encoder_h1..encoder_h3, decoder_h1..decoder_h3.
weights = {}
for _stack, _dims in _stacks:
    for _layer, (_fan_in, _fan_out) in enumerate(_dims, start=1):
        weights['%s_h%d' % (_stack, _layer)] = tf.Variable(
            tf.random_normal([_fan_in, _fan_out]))

# Bias vectors, one per layer, sized to that layer's output width.
# Keys: encoder_b1..encoder_b3, decoder_b1..decoder_b3.
biases = {}
for _stack, _dims in _stacks:
    for _layer, (_fan_in, _fan_out) in enumerate(_dims, start=1):
        biases['%s_b%d' % (_stack, _layer)] = tf.Variable(
            tf.random_normal([_fan_out]))


# Read pre-calculated weights.
# The block below is wrapped in a triple-quoted string and is therefore
# disabled: it is a bare string expression with no effect. Remove the quotes
# to initialise `weights` and `biases` from the pickle files under
# ./../weights/<folder>/ (the same files the training loop writes when
# wrtAE == 1) instead of from random values.
'''
weights = {
    'encoder_h1': tf.Variable(pickle.load(open("./../weights/"+folder+"/eh1.p", "rb"))),
    'encoder_h2': tf.Variable(pickle.load(open("./../weights/"+folder+"/eh2.p", "rb"))),
    'encoder_h3': tf.Variable(pickle.load(open("./../weights/"+folder+"/eh3.p", "rb"))),
    'decoder_h1': tf.Variable(pickle.load(open("./../weights/"+folder+"/dh1.p", "rb"))),
    'decoder_h2': tf.Variable(pickle.load(open("./../weights/"+folder+"/dh2.p", "rb"))),
    'decoder_h3': tf.Variable(pickle.load(open("./../weights/"+folder+"/dh3.p", "rb")))
}
biases = {
    'encoder_b1': tf.Variable(pickle.load(open("./../weights/"+folder+"/eb1.p", "rb"))),
    'encoder_b2': tf.Variable(pickle.load(open("./../weights/"+folder+"/eb2.p", "rb"))),
    'encoder_b3': tf.Variable(pickle.load(open("./../weights/"+folder+"/eb3.p", "rb"))),
    'decoder_b1': tf.Variable(pickle.load(open("./../weights/"+folder+"/db1.p", "rb"))),
    'decoder_b2': tf.Variable(pickle.load(open("./../weights/"+folder+"/db2.p", "rb"))),
    'decoder_b3': tf.Variable(pickle.load(open("./../weights/"+folder+"/db3.p", "rb")))
}
'''



# Encoder half of the autoencoder
def encoder(x):
    """Run `x` through the three encoder layers and return the last activation.

    Every layer computes sigmoid(inputs . W + b) with the module-level
    `weights` / `biases` entries. Dropout with the module-level `keep_prob`
    is applied to the output of the second layer before it feeds the third.
    """
    def dense_sigmoid(inputs, kernel, bias):
        # One fully connected layer followed by a sigmoid non-linearity.
        return tf.nn.sigmoid(tf.add(tf.matmul(inputs, kernel), bias))

    hidden = dense_sigmoid(x, weights['encoder_h1'], biases['encoder_b1'])
    hidden = dense_sigmoid(hidden, weights['encoder_h2'], biases['encoder_b2'])
    hidden = tf.nn.dropout(hidden, keep_prob)
    return dense_sigmoid(hidden, weights['encoder_h3'], biases['encoder_b3'])


# Decoder half of the autoencoder
def decoder(x):
    """Run the encoded `x` through the three decoder layers and return the output.

    Every layer computes sigmoid(inputs . W + b) with the module-level
    `weights` / `biases` entries. Dropout with the module-level `keep_prob`
    is applied to the output of the second layer before it feeds the third.
    """
    def dense_sigmoid(inputs, kernel, bias):
        # One fully connected layer followed by a sigmoid non-linearity.
        return tf.nn.sigmoid(tf.add(tf.matmul(inputs, kernel), bias))

    hidden = dense_sigmoid(x, weights['decoder_h1'], biases['decoder_b1'])
    hidden = dense_sigmoid(hidden, weights['decoder_h2'], biases['decoder_b2'])
    hidden = tf.nn.dropout(hidden, keep_prob)
    return dense_sigmoid(hidden, weights['decoder_h3'], biases['decoder_b3'])

# Wire the autoencoder: input -> encoder -> decoder -> reconstruction.
encoder_op = encoder(x)
decoder_op = decoder(encoder_op)

# Keep only the entries selected by `mask`, once from the reconstruction
# (prediction) and once from the clean input (target).
y_pred = tf.boolean_mask(decoder_op, mask, name='boolean_mask1')
y_true = tf.boolean_mask(y, mask, name='boolean_mask2')

# Regroup the selected entries into rows of 6 values each.
y_pred = tf.reshape(y_pred, [-1, 6])
y_true = tf.reshape(y_true, [-1, 6])

# Loss: mean element-wise binary cross entropy between target and prediction.
# The small epsilon keeps log() away from zero.
_epsilon = 1e-9
_log_likelihood = (y_true * tf.log(y_pred + _epsilon)
                   + (1 - y_true) * tf.log(1 - y_pred + _epsilon))
cost = -tf.reduce_mean(_log_likelihood)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.0001).minimize(cost)

# Index of the largest value in each predicted / target row.
ar1 = tf.argmax(y_pred, 1, name="arg1")
ar2 = tf.argmax(y_true, 1, name="arg2")
correct_prediction = tf.equal(ar1, ar2)

# Accuracy: fraction of rows where the two indices agree.
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Confusion matrix of target index versus predicted index.
confMat = tf.contrib.metrics.confusion_matrix(ar2, ar1, weights=None)

# Op that initialises every variable created above.
init = tf.global_variables_initializer()

# Initial choice of batch rows to corrupt (drawn with replacement, so rows
# can repeat); the training loop redraws it periodically.
indices_train = np.random.choice(np.arange(batch_size),
                                 size=int(batch_size * data_destroy))

# Number of trailing feature columns that are blanked out and scored. This
# must match the row width used by the reshape ops in the graph (6).
# Assumes the RACE indicator columns are the last ones of each row (i.e.
# columns 327..332 when the matrix has 333 features) - TODO confirm against
# the pre-processing cell if the column set changes.
_N_TARGET_COLS = 6


def _target_mask(n_rows, corrupted_rows, n_features):
    """Return a boolean (n_rows, n_features) mask selecting the target cells.

    The mask is True on the last `_N_TARGET_COLS` columns of every row listed
    in `corrupted_rows` and False everywhere else.
    """
    selected = np.zeros((n_rows, n_features), dtype=bool)
    selected[corrupted_rows, n_features - _N_TARGET_COLS:] = True
    return selected


def _dump_variables(session, variables, directory):
    """Pickle the current value of every variable in `variables`.

    A key such as 'encoder_h1' is written to '<directory>/eh1.p': first letter
    of the key followed by the part after the underscore. Each file is closed
    as soon as it has been written.
    """
    for key, variable in variables.items():
        short_name = key[0] + key.split('_', 1)[1]
        with open(directory + "/" + short_name + ".p", "wb") as out_file:
            pickle.dump(session.run(variable), out_file)


with tf.Session() as sess:
    sess.run(init)
    _n_features = X_train.shape[1]
    # Only full batches are used; trailing rows that do not fill a batch are
    # never trained on.
    total_batch = int(len(X_train)/batch_size)
    for epoch in range(training_epochs):

        # After every 1000 epochs train on a different corrupted row set.
        # No `global` statement is needed (or allowed) here: this code runs at
        # module level, and declaring a name global after it has been
        # assigned is a SyntaxError on Python 3.6+.
        if epoch % 1000 == 0:
            indices_train = np.random.choice(np.arange(batch_size), size=int(batch_size*data_destroy))
            # The mask only changes with indices_train, so build it once here
            # instead of once per batch.
            train_mask = _target_mask(batch_size, indices_train, _n_features)

        for i in range(total_batch):
            batch_ys = X_train[i*batch_size:(i+1)*batch_size]
            # Corrupt a copy so X_train itself stays intact.
            batch_xs = batch_ys.copy()
            batch_xs[train_mask] = 0
            sess.run([optimizer],
                     feed_dict={
                         x: batch_xs,
                         y: batch_ys,
                         mask: train_mask
                     })

        # Evaluate on the test split every 2nd epoch.
        # NOTE(review): keep_prob is a fixed Python value baked into the
        # graph, so dropout is also applied during this evaluation.
        if epoch % 2 == 0:
            indices = np.random.choice(np.arange(len(X_test)), size=int(len(X_test)*data_destroy))
            maskArr = _target_mask(len(X_test), indices, _n_features)
            test_batch = X_test.copy()
            test_batch[maskArr] = 0
            curr_accuracy, confM = sess.run([accuracy, confMat],
                                            feed_dict={x: test_batch,
                                                       y: X_test,
                                                       mask: maskArr})
            #Print Accuracy and Confusion Matrix
            #print("Epoch-"+str(epoch)+" "+str(curr_accuracy))
            #print(confM)

            #Write the Accuracy with Epoch to a file to be visualised later on.
            #with open("./../results/epochs-"+folder+".txt", "a") as f:
            #    f.write(str(epoch)+","+str(round(100*curr_accuracy,3))+"\n")

            # If wrtAE is set to 1, only then dump weights to file.
            if wrtAE == 1:
                _dump_variables(sess, weights, "./../weights/"+folder)
                _dump_variables(sess, biases, "./../weights/"+folder)
