In [None]:
import os
import time

import numpy as np
import pandas as pd
import tensorflow as tf

def normalize(dataset):
    """Standardize the hours series columns of `dataset` in place.

    The columns 'hours' and 'hours_l1' .. 'hours_l30' are each replaced by
    (value - column mean) / column standard deviation. All other columns are
    left untouched.

    Args:
        dataset: DataFrame containing the 31 series columns named above.

    Returns:
        The same `dataset` object, with the series columns overwritten.
    """
    def _zscore(column):
        # Centre on the column mean, scale by the column's std.
        return (column - column.mean()) / (column.std())

    series_columns = ['hours', *(f"hours_l{lag}" for lag in range(1, 31))]
    dataset[series_columns] = dataset[series_columns].apply(_zscore)
    return dataset

# Columns read from each CSV: identifiers/features plus the 30 lagged-hours columns.
include_fields = ['prov_id','hours','day_of_week','week_perc0','week_perc6']
include_fields.extend(f"hours_l{lag}" for lag in range(1, 31))


def _read_complete_rows(csv_path):
    """Read only `include_fields` from `csv_path` and drop rows with any missing value."""
    frame = pd.read_csv(csv_path, usecols=include_fields)
    return frame.dropna()


startTime = time.time()
train = _read_complete_rows("/export/storage_adgandhi/PBJhours_ML/Data/Intermediate/train_test_validation/training_set.csv")
val = _read_complete_rows("/export/storage_adgandhi/PBJhours_ML/Data/Intermediate/train_test_validation/validation_set.csv")
# Report wall-clock seconds spent loading both splits.
print(f"Loaded Train and Validation Sets. Time taken: {time.time()-startTime}")

In [None]:
# Pool train and validation rows, re-encode prov_id densely, shuffle, and
# re-split so the prov_id distribution is the same for train/val.
SHUFFLE_SEED = 42  # fixed so the split is reproducible across kernel restarts

# Remember the original training-set size before `train` is reassigned below.
n_train = train.shape[0]
temp = pd.concat([train, val], ignore_index=True)

# Convert provider ids into a dense 0..vocab_size-1 representation
# (index assigned in order of first appearance in the pooled frame).
provider_map = {
    provider: dense_id
    for dense_id, provider in enumerate(temp['prov_id'].unique())
}
temp['prov_id'] = temp['prov_id'].map(provider_map)
vocab_size = len(provider_map)

# Shuffle the rows of the frame itself. Shuffling `temp.values` in place is
# not reliable: for a frame with mixed dtypes `.values` is a freshly built
# array, so the DataFrame would be left in its original order.
temp = temp.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Split at the training-set size for both halves so the two sets are disjoint
# and together cover every pooled row.
train = temp.iloc[:n_train]
val = temp.iloc[n_train:]
assert len(train) + len(val) == len(temp)

In [None]:
# Separate the regression target ('hours') from the remaining feature columns.
train_inputs = train.drop(columns=['hours'])
train_labels = train.filter(items=['hours'])
val_inputs = val.drop(columns=['hours'])
val_labels = val.filter(items=['hours'])
# Test split is currently disabled:
#test_inputs, test_labels = test.drop(['hours'], axis=1), test.filter(['hours'])

In [None]:
def expand_one_hot(labels,dataset):
    """Replace categorical columns with one-hot blocks and return one tensor.

    Each column named in `labels` is removed from `dataset` and encoded with
    `tf.one_hot`. The remaining columns come first in the result, followed by
    the one-hot blocks in the order given by `labels`.

    Category indices are assigned from the *sorted* distinct values of the
    column. Assigning them in order of first appearance would make the
    encoding depend on row order, so the same category could land in
    different one-hot positions for the training and validation sets.

    Note: the width of each block equals the number of distinct values present
    in `dataset`; a split that is missing a category yields a narrower block.

    Args:
        labels: iterable of column names to one-hot encode.
        dataset: DataFrame holding those columns plus the other features.

    Returns:
        A tensor produced by `tf.concat` along axis 1.
    """
    one_hot_blocks = []
    for label in labels:
        col = dataset[label]
        # Only values actually present get an index (no space wasted on
        # unused categories); sorting makes the assignment deterministic.
        categories = sorted(col.unique())
        category_index = {category: position for position, category in enumerate(categories)}
        one_hot_blocks.append(tf.one_hot(col.map(category_index), len(categories)))
        dataset = dataset.drop(columns=[label])

    # Remaining (non-categorical) columns first, then the one-hot blocks.
    return tf.concat([dataset, *one_hot_blocks], 1)

# One-hot encode the weekday column of each split; the results are tensors.
ONE_HOT_COLUMNS = ['day_of_week']
train_inputs = expand_one_hot(ONE_HOT_COLUMNS, train_inputs)
val_inputs = expand_one_hot(ONE_HOT_COLUMNS, val_inputs)
# Test split is currently disabled:
#test_inputs = expand_one_hot(['day_of_week'],test_inputs)

# Sanity check: overall shape, then the first encoded row.
for preview in (train_inputs.shape, train_inputs[0]):
    print(preview)

In [5]:
BUFFER_SIZE = 10000  # shuffle buffer, in examples
BATCH_SIZE = 256


def _shuffled_batches(inputs, labels):
    """Wrap (inputs, labels) in a tf.data pipeline: slice per row, shuffle, then batch."""
    pairs = tf.data.Dataset.from_tensor_slices((inputs, labels))
    return pairs.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)


trainSet = _shuffled_batches(train_inputs, train_labels)
valSet = _shuffled_batches(val_inputs, val_labels)
# Test split is currently disabled:
#testSet = tf.data.Dataset.from_tensor_slices((test_inputs,test_labels)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
print(trainSet)

<BatchDataset shapes: ((None, 40), (None, 1)), types: (tf.float32, tf.float64)>


In [6]:
class RNN(tf.keras.Model):
    """Regression model combining an embedding, an LSTM and two dense layers.

    Column 0 of the input is passed through an Embedding, columns 1-30 are fed
    to an LSTM as a length-30 sequence with one feature per step, and the
    remaining columns are used as-is. The three parts are concatenated and
    passed through two ReLU dense layers (each followed by dropout while
    training) and a single-unit linear output.
    """

    def __init__(self,vocab_size):
        """Create the layers.

        Args:
            vocab_size: number of distinct ids the embedding must cover.
        """
        super().__init__()
        # Attribute names and creation order are kept stable: weights saved in
        # checkpoints are matched against these attributes.
        self.embedding = tf.keras.layers.Embedding(vocab_size,100)
        self.lstm = tf.keras.layers.LSTM(64)
        self.dense1 = tf.keras.layers.Dense(64, activation=tf.nn.relu)
        self.dense2 = tf.keras.layers.Dense(64, activation=tf.nn.relu)
        self.out = tf.keras.layers.Dense(1)
        # A single Dropout layer (rate 0.5) is reused after both dense layers.
        self.dropout = tf.keras.layers.Dropout(0.5)

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: 2-D batch of feature rows. Assumes column 0 is the dense
                prov_id index and columns 1-30 are the lagged hours; the
                column order is decided by the input pipeline — TODO confirm.
            training: when truthy, dropout is applied after each dense layer.

        Returns:
            The output of the final single-unit dense layer.
        """
        provider_ids = inputs[:,0]
        lagged_series = inputs[:,1:31]
        extra_features = inputs[:,31:]

        provider_vectors = self.embedding(provider_ids)
        # Add a trailing feature axis so the LSTM sees (batch, 30, 1).
        sequence_summary = self.lstm(tf.expand_dims(lagged_series,2))

        hidden = tf.concat([sequence_summary,provider_vectors,extra_features],1)
        for dense_layer in (self.dense1, self.dense2):
            hidden = dense_layer(hidden)
            if training:
                hidden = self.dropout(hidden, training=training)

        return self.out(hidden)

# Build the network sized to the provider vocabulary, then configure training:
# mean-squared-error loss, Adam optimizer, mean absolute error as the metric.
model = RNN(vocab_size)

model.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=[tf.keras.metrics.MeanAbsoluteError()],
)

In [7]:
checkpoint_dir = './training_checkpointsRNN'
# str.format template for checkpoint paths. Only a plain `{epoch}` field is
# valid here: format fields cannot contain arithmetic, so a template such as
# "ckpt_{epoch+6}" raises KeyError when the path is formatted at the end of
# the first epoch.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
# Number of epochs already completed by the run being resumed; added to the
# epoch number in checkpoint names so earlier checkpoints are not overwritten.
# Set to 0 for a fresh run.
EPOCH_OFFSET = 6


def decay(epoch):
    """Return the learning rate for the given 0-based epoch of the current fit.

    Epochs 0-2 use 1e-4, epochs 3-6 use 1e-5, and later epochs use 1e-6.
    """
    if epoch < 3:
        return 1e-4
    if epoch < 7:
        return 1e-5
    return 1e-6


def save_epoch_weights(epoch, logs=None):
    """Save the model weights at the end of an epoch.

    `epoch` is 0-based within the current fit; the file is named with the
    1-based epoch number shifted by EPOCH_OFFSET (first epoch -> ckpt_7 when
    the offset is 6).
    """
    model.save_weights(checkpoint_prefix.format(epoch=epoch + 1 + EPOCH_OFFSET))


os.makedirs(checkpoint_dir, exist_ok=True)

callbacks = [
    tf.keras.callbacks.LambdaCallback(on_epoch_end=save_epoch_weights),
    tf.keras.callbacks.LearningRateScheduler(decay)
]

# Resume from the most recent checkpoint when one exists. latest_checkpoint
# returns None for an empty/missing directory, which load_weights cannot accept.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is not None:
    model.load_weights(latest_checkpoint)
else:
    print(f"No checkpoint found in {checkpoint_dir}; training starts from freshly initialized weights.")

In [None]:
# Train for 10 epochs, evaluating on the validation batches after each epoch;
# the callbacks list supplies checkpointing and the learning-rate schedule.
model.fit(
    trainSet,
    validation_data=valSet,
    epochs=10,
    callbacks=callbacks,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
  1219/487865 [..............................] - ETA: 4:15:20 - loss: 8.4573 - mean_absolute_error: 2.0262