In [10]:
#@title Run on TensorFlow 2.x
%tensorflow_version 2.x
from __future__ import absolute_import, division, print_function, unicode_literals

In [67]:
#@title Import relevant modules
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras import layers
from matplotlib import pyplot as plt
from mlp_sparse_model import MLPSparseModel
from mlp_plain_model import MLPPlainModel
import time

# The following lines adjust the granularity of reporting. 
pd.options.display.max_rows = 10

# The following line improves formatting when ouputting NumPy arrays.
np.set_printoptions(linewidth = 200, precision=3, suppress=True)

SAMPLE_SIZE = 9
N_EXP = 20

In [12]:
def seed_generator():
    # Generate the initial seed for each sample size (to match the seed
    # of the results in the paper)
    # This is just the initial seed, for each experiment, the seeds will be
    # equal the initial seed + the number of the experiment

    N_train_all = np.multiply(9, [1, 2, 3, 4, 5])  # This is for Apache
    if SAMPLE_SIZE in N_train_all:
        seed_o = np.where(N_train_all == SAMPLE_SIZE)[0][0]
    else:
        seed_o = np.random.randint(1, 101)

    return seed_o

In [90]:
#@title Get data
fm_dataset = pd.read_csv("Data/Apache_AllNumeric.csv")
column_dict = {name: "float64" for name in list(fm_dataset.columns.values)}
fm_dataset = fm_dataset.astype(column_dict)
fm_dataset = fm_dataset.reindex(np.random.permutation(fm_dataset.index))

fm_features = fm_dataset.copy()
fm_labels = fm_features.pop('PERF')

fm_features = np.array(fm_features)
# fm_from_csv = tf.data.experimental.make_csv_dataset("Data/Apache_AllNumeric.csv", batch_size=5, label_name='PERF', num_epochs=1, ignore_errors=True,)
# for data, labels in nfm_from_csv.as_numpy_iterator()):
#     print(data.shape)  # (64, 200, 200, 3)
#     print(data.dtype)  # float32
#     print(labels.shape)  # (64,)
#     print(labels.dtype)  # int32
# display(fm_features)

In [95]:
# Normalize values
# data_df_mean = data_df.mean()
# data_df_std = data_df.std()
# data_df_norm = (data_df - data_df_mean)/data_df_std
normalize = layers.Normalization()
normalize.adapt(fm_features)
normalized_data = normalize(fm_features)
display(np.var(normalized_data))
display(np.mean(normalized_data))

1.0

-1.2417634e-08

In [15]:
# split data set and set seed
seed_init = seed_generator()
seed = seed_init*N_EXP + 1
np.random.seed(seed_init)
train_df = data_df_norm.sample(frac=0.6)
test_df = data_df_norm.drop(train_df.index).sample(frac=1.0)

In [16]:
# create feature layer
columns = [column for column in column_dict.keys() if column != 'PERF']
feature_columns = []
for column in columns:
    feature_columns.append(tf.feature_column.numeric_column(column))
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [17]:
#@title Define the plotting function.

def plot_the_loss_curve(epochs, mse):
  """Plot a curve of loss vs. epoch."""

  plt.figure()
  plt.xlabel("Epoch")
  plt.ylabel("Mean Squared Error")

  plt.plot(epochs, mse, label="Loss")
  plt.legend()
  plt.ylim([mse.min()*0.95, mse.max() * 1.03])
  plt.show()

In [36]:
#@title Double-click for a possible solution

# The following "solution" uses L2 regularization to bring training loss
# and test loss closer to each other. Many, many other solutions are possible.


def create_model(learning_rate, feature_layer):
  """Create and compile a simple linear regression model."""

  # Discard any pre-existing version of the model.
  model = None

  # # Most simple tf.keras models are sequential.
  # model = tf.keras.models.Sequential()

  # # Add the layer containing the feature columns to the model.
  # model.add(feature_layer)

  # # Describe the topography of the model. 

  # # Implement L1 regularization in the first hidden layer.
  # model.add(tf.keras.layers.Dense(units=20, 
  #                                 activation='relu',
  #                                 kernel_regularizer=tf.keras.regularizers.l1(0.009),
  #                                 name='Hidden1'))
  
  # # Implement L1 regularization in the second hidden layer.
  # model.add(tf.keras.layers.Dense(units=12, 
  #                                 activation='relu', 
  #                                 kernel_regularizer=tf.keras.regularizers.l1(0.009),
  #                                 name='Hidden2'))

  # # Define the output layer.
  # model.add(tf.keras.layers.Dense(units=1,  
  #                                 name='Output'))                              
  
  input_layer = tf.keras.layers.Input(shape=(1))

  layer_1 = tf.keras.layers.Dense(units=20,
                                  activation='relu',
                                  kernel_regularizer=tf.keras.regularizers.l1(0.009),
                                  name='Hidden1')(input_layer)
  layer_2 = tf.keras.layers.Dense(units=12,
                                  activation='relu',
                                  # kernel_regularizer=tf.keras.regularizers.l1(0.009),
                                  name='Hidden2')(layer_1)

  output_layer = tf.keras.layers.Dense(units=1,
                                       name='Output')(layer_2)

  model = tf.keras.Model(inputs=input_layer, outputs=output_layer)

  model.compile(optimizer=tf.keras.optimizers.Adam(lr=learning_rate),
                loss="mean_squared_error",
                metrics=[tf.keras.metrics.MeanSquaredError()])

  return model     


In [38]:
input_layer = tf.keras.layers.Input(shape=(1))
display(input_layer.shape)
layer_1 = tf.keras.layers.Dense(units=20,
                                activation='relu',
                                kernel_regularizer=tf.keras.regularizers.l1(0.009),
                                name='Hidden1')(input_layer)
layer_2 = tf.keras.layers.Dense(units=12,
                                activation='relu',
                                # kernel_regularizer=tf.keras.regularizers.l1(0.009),
                                name='Hidden2')(layer_1)

output_layer = tf.keras.layers.Dense(units=1,
                                    name='Output')(layer_2)

model = tf.keras.Model(inputs=input_layer, outputs=output_layer)

model.summary()

TensorShape([None, 1])

Model: "model_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_20 (InputLayer)       [(None, 1)]               0         
                                                                 
 Hidden1 (Dense)             (None, 20)                40        
                                                                 
 Hidden2 (Dense)             (None, 12)                252       
                                                                 
 Output (Dense)              (None, 1)                 13        
                                                                 
Total params: 305
Trainable params: 305
Non-trainable params: 0
_________________________________________________________________


In [20]:
def train_model(model, dataset, epochs, label_name,
                batch_size=None, validation_split=0.1):
  """Train the model by feeding it data."""

  # Split the dataset into features and label.
  features = {name:np.array(value) for name, value in dataset.items()}
  label = np.array(features.pop(label_name))
  history = model.fit(x=features, y=label, batch_size=batch_size,
                      epochs=epochs, shuffle=True, validation_split=validation_split) 

  # The list of epochs is stored separately from the rest of history.
  epochs = history.epoch
  
  # To track the progression of training, gather a snapshot
  # of the model's mean squared error at each epoch. 
  hist = pd.DataFrame(history.history)
  mse = hist["mean_squared_error"]

  return epochs, mse

In [37]:
# The following variables are the hyperparameters.
learning_rate = 0.004
epochs = 100
batch_size = 10

label_name = "PERF"

# Establish the model's topography.
my_model = create_model(learning_rate, feature_layer)
# Train the model on the normalized training set.
epochs, mse = train_model(my_model, train_df, epochs, 
                          label_name, batch_size, validation_split=0.1)
my_model.summary()
# plot_the_loss_curve(epochs, mse)

# test_features = {name:np.array(value) for name, value in test_df.items()}
# test_label = np.array(test_features.pop(label_name)) # isolate the label
# print("\n Evaluate the new model against the test set:")
# my_model.evaluate(x = test_features, y = test_label, batch_size=batch_size) 


Epoch 1/100


  super(Adam, self).__init__(name, **kwargs)


ValueError: in user code:

    File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 859, in train_step
        y_pred = self(x, training=True)
    File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/usr/local/lib/python3.7/dist-packages/keras/engine/input_spec.py", line 183, in assert_input_compatibility
        raise ValueError(f'Missing data for input "{name}". '

    ValueError: Missing data for input "input_19". You passed a data dictionary with keys ['HostnameLookups', 'KeepAlive', 'EnableSendfile', 'FollowSymLinks', 'AccessLog', 'ExtendedStatus', 'InMemory', 'Handle']. Expected the following keys: ['input_19']
