# Notebook Overview
### Baseline model for RNN with embeddings

## Inputs
The input variables for the RNN in this notebook are the following:

    1. Time series data -- hours_l1, hours_l2, hours_l3, ..., hours_l14
    2. Additional predictors -- prov_id, day_of_week, avg_employees, perc_hours_today_before,
       perc_hours_yesterday_before, perc_hours_tomorrow_before

## Model Architecture
The architecture used in this notebook combines an RNN with a feed-forward neural network. The RNN layer receives 14 days of lagged shift data and makes a prediction for the shift on the 15th day. This prediction is then concatenated with the additional predictors and fed into a traditional neural network to generate a better prediction -- the idea being that the RNN (through its long- and short-term memory) learns patterns over time and the FF network adjusts these patterns based on additional information.

In [1]:
import pandas as pd
import time
import tensorflow as tf

# Size used for both TensorFlow thread pools (inter-op and intra-op).
num_threads = 100

tf.config.threading.set_inter_op_parallelism_threads(num_threads)
tf.config.threading.set_intra_op_parallelism_threads(num_threads)

# Columns read from the CSVs: `hours`, the extra predictors, and the
# fourteen lag columns hours_l1 ... hours_l14.
include_fields = [
    'hours',
    'prov_id',
    'day_of_week',
    'avg_employees',
    'perc_hours_today_before',
    'perc_hours_yesterday_before',
    'perc_hours_tomorrow_before',
]
include_fields.extend(f"hours_l{lag}" for lag in range(1, 15))

# Load only the selected columns and drop every row with a missing value.
startTime = time.time()
train = pd.read_csv(
    "/export/storage_adgandhi/PBJhours_ML/Data/Intermediate/train_test_validation/training_set.csv",
    usecols=include_fields,
).dropna()
val = pd.read_csv(
    "/export/storage_adgandhi/PBJhours_ML/Data/Intermediate/train_test_validation/crossvalidation_set.csv",
    usecols=include_fields,
).dropna()
print(f"Loaded Train and Validation Sets. Time taken: {time.time()-startTime} seconds")

Loaded Train and Validation Sets. Time taken: 369.17936515808105 seconds


In [2]:
# Split each frame into predictors and the `hours` label column.
train_inputs, train_labels = train.drop(['hours'], axis=1), train.filter(['hours'])
val_inputs, val_labels = val.drop(['hours'], axis=1), val.filter(['hours'])
print(train_inputs.columns)

# Distinct providers in each split (pandas returns them in order of first
# appearance). The training count is the embedding vocabulary size.
train_providers = train_inputs['prov_id'].unique()
val_providers = val_inputs['prov_id'].unique()
vocab_size = len(train_providers)
print(f"Unique facilities: {vocab_size}")

#Remove providers that appear in val set but not train
# A single vectorised membership test replaces the per-provider Python loop,
# which re-filtered the whole validation frame once per unseen provider.
seen_in_train = val_inputs['prov_id'].isin(train_providers)
if not seen_in_train.all():
    val_inputs = val_inputs[seen_in_train]
    val_labels = val_labels[seen_in_train]

# Remap prov_id's between 0 - # providers
# Indices follow the first-appearance order of providers in the training set.
provider_map = {provider: index for index, provider in enumerate(train_providers)}
train_inputs['prov_id'] = train_inputs['prov_id'].map(provider_map)
val_inputs['prov_id'] = val_inputs['prov_id'].map(provider_map)



Index(['prov_id', 'day_of_week', 'hours_l1', 'hours_l2', 'hours_l3',
       'hours_l4', 'hours_l5', 'hours_l6', 'hours_l7', 'hours_l8', 'hours_l9',
       'hours_l10', 'hours_l11', 'hours_l12', 'hours_l13', 'hours_l14',
       'avg_employees', 'perc_hours_today_before',
       'perc_hours_yesterday_before', 'perc_hours_tomorrow_before'],
      dtype='object')
Unique facilities: 15919


In [3]:
def expand_one_hot(labels,dataset):
    """Replace categorical columns with one-hot blocks and return one tensor.

    For every column name in ``labels`` the column's distinct values are
    mapped to consecutive integers, one-hot encoded with ``tf.one_hot`` and
    the column is dropped from ``dataset``. The remaining columns come first
    in the result, followed by the one-hot blocks in the order of ``labels``.

    The value -> index mapping is built from the *sorted* distinct values
    rather than from their order of first appearance. Two datasets that
    contain the same set of values (e.g. a training and a validation split)
    therefore get the same one-hot position for the same value, independent
    of row order.

    Args:
        labels: iterable of column names in ``dataset`` to one-hot encode.
        dataset: pandas DataFrame holding the columns; it is not modified
            (``drop`` returns a new frame each time).

    Returns:
        The result of ``tf.concat`` along axis 1 of the remaining columns and
        the one-hot blocks.

    Raises:
        TypeError: if a column's distinct values cannot be ordered against
            each other by ``sorted``.
    """
    encoded_blocks = []
    for label in labels:
        col = dataset[label]
        # Only the values that actually occur get a slot, so no width is
        # wasted on unused ids. Sorting makes the slot order deterministic.
        categories = sorted(col.unique())
        index_of = {value: position for position, value in enumerate(categories)}
        encoded_blocks.append(tf.one_hot(col.map(index_of), len(categories)))
        dataset = dataset.drop(columns=[label])

    # Remaining (non-encoded) columns first, then the one-hot blocks.
    return tf.concat([dataset] + encoded_blocks, 1)

# One-hot encode the weekday column of both splits. After this step
# train_inputs / val_inputs hold the value returned by expand_one_hot
# instead of DataFrames.
categorical_columns = ['day_of_week']
train_inputs = expand_one_hot(categorical_columns, train_inputs)
val_inputs = expand_one_hot(categorical_columns, val_inputs)
#test_inputs = expand_one_hot(['day_of_week'],test_inputs)

# Sanity check: shapes of both splits, then the first training row.
for encoded_split in (train_inputs, val_inputs):
    print(encoded_split.shape)
print(train_inputs[0])

(137722552, 26)
(68726819, 26)
tf.Tensor(
[ 0.        0.        0.        0.        7.5       7.5       0.
  0.        0.        0.        7.5       0.        7.5       0.
  0.       56.857143  0.        0.        0.        1.        0.
  0.        0.        0.        0.        0.      ], shape=(26,), dtype=float32)


In [4]:
BUFFER_SIZE = 10000   # number of elements held in the shuffle buffer
BATCH_SIZE = 256      # examples per batch


def _shuffled_batches(features, targets):
    """Slice (features, targets) row-wise, shuffle with BUFFER_SIZE, batch with BATCH_SIZE."""
    row_slices = tf.data.Dataset.from_tensor_slices((features, targets))
    return row_slices.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)


trainSet = _shuffled_batches(train_inputs, train_labels)
valSet = _shuffled_batches(val_inputs, val_labels)
#testSet = tf.data.Dataset.from_tensor_slices((test_inputs,test_labels)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
print(trainSet)

<BatchDataset shapes: ((None, 26), (None, 1)), types: (tf.float32, tf.float64)>


In [5]:
class RNN(tf.keras.Model):
    """LSTM over the lagged series, combined with a provider embedding and
    the remaining predictors in a small dense head that outputs one value.

    Expected input layout (columns of a 2-D tensor) -- this matches how
    ``call`` slices ``inputs``; verify against the upstream preprocessing:
        column 0      provider index used for the embedding lookup
        columns 1-14  the 14-step series fed to the LSTM
        columns 15+   additional predictors passed straight to the dense head
    """

    def __init__(self,vocab_size):
        """Create the layers.

        Args:
            vocab_size: number of distinct provider indices; size of the
                embedding table.
        """
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size,10)
        self.lstm = tf.keras.layers.LSTM(64)
        self.dense1 = tf.keras.layers.Dense(32, activation=tf.nn.relu)
        self.dense2 = tf.keras.layers.Dense(32, activation=tf.nn.relu)
        self.out = tf.keras.layers.Dense(1)
        # One normalisation layer per position. Previously a single instance
        # was applied after both dense layers, which made two different
        # activations share the same scale/offset and moving statistics.
        self.batch_norm = tf.keras.layers.BatchNormalization()
        self.batch_norm2 = tf.keras.layers.BatchNormalization()

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: 2-D tensor laid out as described in the class docstring.
            training: forwarded to the LSTM and batch-normalisation layers so
                they use their training or inference behaviour.

        Returns:
            Tensor with one output value per input row.
        """
        #prov_id is first column
        # It arrives as part of a float feature tensor, so cast it to an
        # integer index before the embedding lookup.
        provider_index = tf.cast(inputs[:, 0], tf.int32)
        embedding_vectors = self.embedding(provider_index)
        # Add a trailing feature axis -> (batch, 14, 1), then reverse the
        # step axis so the series is fed in the opposite column order.
        time_series = tf.reverse(tf.expand_dims(inputs[:,1:15],2),[1])
        additional_inputs = inputs[:,15:]

        x = self.lstm(time_series, training=training)
        x = tf.concat([x,embedding_vectors,additional_inputs],1)
        x = self.dense1(x)
        x = self.batch_norm(x, training=training)
        x = self.dense2(x)
        x = self.batch_norm2(x, training=training)
        return self.out(x)

# Build the network with one embedding row per provider, then configure
# training: Adam optimizer, mean-squared-error loss, and mean absolute error
# reported as an extra metric.
model = RNN(vocab_size)

model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.MeanAbsoluteError()],
)

In [9]:
import os
# Directory (relative to the working directory) that holds training checkpoints.
checkpoint_dir = './training_checkpointsRNN'
# Path template for individual checkpoints. This is a plain string, not an
# f-string, so "{epoch}" stays a literal placeholder here -- presumably it is
# filled in by whatever consumes the template (verify against the callback).
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")



def decay(epoch):
  """Step learning-rate schedule.

  Returns 1e-3 while ``epoch`` is below 3, 1e-4 while it is below 7,
  and 1e-5 for every other value.
  """
  # (exclusive upper bound, learning rate), checked in ascending order.
  steps = ((3, 1e-3), (7, 1e-4))
  for upper_bound, rate in steps:
    if epoch < upper_bound:
      return rate
  return 1e-5

# Callback that saves weights only (not the full model) to the checkpoint path template.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)
# Callback that drives the learning rate from the `decay` schedule.
learning_rate_callback = tf.keras.callbacks.LearningRateScheduler(decay)

callbacks = [checkpoint_callback, learning_rate_callback]
#model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

In [None]:
# Train for 10 epochs, evaluating on the validation set and running the
# configured callbacks.
model.fit(
    trainSet,
    validation_data=valSet,
    epochs=10,
    callbacks=callbacks,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10