In [1]:
import pandas as pd
import time
import tensorflow as tf

ROWS = 10**8
def normalize(dataset):
    seriesIndices = ['hours']
    for i in range(1,31):
        seriesIndices.append(f"hours_l{i}")
    dataset[seriesIndices] = dataset[seriesIndices].apply(lambda x: (x - x.mean()) / (x.std()))
    return dataset

include_fields = ['prov_id','hours','day_of_week','hours_l1','hours_l2','hours_l3','hours_l4',
                  'hours_l5','hours_l6','hours_l7','hours_l8','hours_l14','hours_l15',
                  'hours_l21','hours_l22','hours_l28','hours_l29','week_perc0','week_perc6','employees_l1']

startTime = time.time()
train = pd.read_csv("/export/storage_adgandhi/PBJhours_ML/Data/Intermediate/train_test_validation/training_set.csv",usecols=include_fields,nrows=ROWS).dropna()
val = pd.read_csv("/export/storage_adgandhi/PBJhours_ML/Data/Intermediate/train_test_validation/validation_set.csv",usecols=include_fields,nrows=ROWS/2).dropna()
test = pd.read_csv("/export/storage_adgandhi/PBJhours_ML/Data/Intermediate/train_test_validation/testing_set.csv",usecols=include_fields,nrows=ROWS/2).dropna()
print(f"Loaded Train and Validation Sets. Time taken: {time.time()-startTime}")

Loaded Train and Validation Sets. Time taken: 335.70772409439087


In [2]:
#shuffle all data values so prov id distribution is same for train/val
import numpy as np
temp = pd.concat([train,val])
#Convert provider id's into dense representation
provider_map = {}
index = 0
for element in temp['prov_id'].unique():
    provider_map[element]=index
    index +=1
temp['prov_id'] = temp['prov_id'].map(provider_map)

startTime = time.time()
np.random.shuffle(temp.values)
print(f"Shuffled datasets. Time taken: {time.time()-startTime} seconds")
train = temp[:train.shape[0]]
val = temp[val.shape[0]:]
vocab_size = len(temp['prov_id'].unique())
print(vocab_size)

Shuffled datasets. Time taken: 180.51562762260437 seconds
1600


In [3]:
#train_inputs, train_labels = train.drop(['hours'], axis=1), train.filter(['hours'])
#val_inputs, val_labels = val.drop(['hours'], axis=1), val.filter(['hours'])
test_inputs, test_labels = test.drop(['hours'], axis=1), test.filter(['hours'])

train=None
val=None
test=None
#test_inputs, test_labels = test.drop(['hours'], axis=1), test.filter(['hours'])

In [4]:
def expand_one_hot(labels,dataset):
    outList = []
    for label in labels:  
        col = dataset[label]
        ###Generate a dict for all unique values (Don't waste space encoding non important job id's)
        map = {}
        index = 0
        for element in col.unique():
            map[element] = index
            index += 1
        col = col.map(map)
        tensor = tf.one_hot(col,len(col.unique()))
        outList.append(tensor)
        dataset = dataset.drop(columns=[label])
    
    outList.insert(0,dataset)
    output = tf.concat(outList,1)
    return output

#train_inputs = expand_one_hot(['day_of_week'],train_inputs)
#val_inputs = expand_one_hot(['day_of_week'],val_inputs)
test_inputs = expand_one_hot(['day_of_week'],test_inputs)

#print(train_inputs.shape)
#print(train_inputs[0])

In [5]:
BUFFER_SIZE = 10000
BATCH_SIZE = 256
#trainSet = tf.data.Dataset.from_tensor_slices((train_inputs,train_labels)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
#valSet = tf.data.Dataset.from_tensor_slices((val_inputs,val_labels)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
testSet = tf.data.Dataset.from_tensor_slices((test_inputs,test_labels)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
#print(trainSet)

In [6]:
class DeepNN(tf.keras.Model):

    def __init__(self,vocab_size):
        super(DeepNN, self).__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size,20)
        self.dense1 = tf.keras.layers.Dense(64, activation=tf.nn.relu)
        self.dense2 = tf.keras.layers.Dense(64, activation=tf.nn.relu)
        self.dense3 = tf.keras.layers.Dense(64, activation=tf.nn.relu)
        self.dense4 = tf.keras.layers.Dense(64, activation=tf.nn.relu)
        self.out = tf.keras.layers.Dense(1)
        self.dropout = tf.keras.layers.Dropout(0.5)

    def call(self, inputs, training=False):
        #prov_id is first column
        embedding_vector = self.embedding(inputs[:,0])
        x = tf.concat([embedding_vector,inputs[:,1:]],1)
        x = self.dense1(x)
        x = self.dense2(x)
        if training:
            x = self.dropout(x, training=training)
        x = self.dense3(x)
        if training:
            x = self.dropout(x, training=training)
        x = self.dense4(x)
        if training:
            x = self.dropout(x, training=training)
        return self.out(x)

print(tf.config.list_physical_devices('GPU'))
strategy = tf.distribute.MirroredStrategy(devices=['/device:GPU:0'])

with strategy.scope():
    model = DeepNN(vocab_size)
    model.compile(loss=tf.keras.losses.MeanSquaredError(),
            optimizer=tf.keras.optimizers.Adam(),
            metrics=[tf.keras.metrics.MeanAbsoluteError()])

[]
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


In [7]:
import os
checkpoint_dir = './training_checkpointsEmbeddingDeepNN'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

def decay(epoch):
  if epoch < 3:
    return 1e-3
  elif epoch >= 3 and epoch < 7:
    return 1e-4
  else:
    return 1e-5

callbacks = [
    tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,save_weights_only=True),
    tf.keras.callbacks.LearningRateScheduler(decay)
]


In [9]:
model.fit(trainSet, epochs=10, callbacks=callbacks, validation_data=valSet)

Epoch 1/10
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f12b6165280>

In [8]:
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
eval_loss, eval_acc = model.evaluate(testSet)

print('Eval loss: {}, Eval Accuracy: {}'.format(eval_loss, eval_acc))

 29296/178281 [===>..........................] - ETA: 1:16:57 - loss: 12.6700 - mean_absolute_error: 2.8234

InvalidArgumentError:  indices[43] = 1609 is not in [0, 1600)
	 [[node deep_nn/embedding/embedding_lookup (defined at <ipython-input-6-877d90631ef3>:15) ]] [Op:__inference_test_function_603]

Function call stack:
test_function
