# Library and Data Initialization

In [45]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from keras import backend as K


import numpy as np
import time
import plotly.graph_objects as go
import plotly.express as px

In [46]:

def load_data(dataset):
  data = dataset
  (train_images, train_labels), (test_images, test_labels) = data.load_data()
  train_images = train_images.astype("float32") / 255
  test_images = test_images.astype("float32") / 255
  #train_images = np.expand_dims(train_images, -1)
  #test_images = np.expand_dims(test_images, -1)
  train_labels = keras.utils.to_categorical(train_labels, 100)
  test_labels = keras.utils.to_categorical(test_labels, 100)
  return train_images, train_labels, test_images, test_labels



In [47]:
# MODIFY LAST ARG FOR EACH NOTEBOOK

train_images, train_labels, test_images, test_labels = load_data(tf.keras.datasets.cifar100)

In [48]:
w,h = train_images.shape[1], train_images.shape[2]
classes = train_labels.shape[1]
print(train_images.shape)
print(train_labels.shape)

(50000, 32, 32, 3)
(50000, 100)


# Experiments

For each optimizer, for each dataset (fashion-mnist, cifar-10), train each model (log-reg, mlp, cnn) on the training set

## Optimizers

In [49]:
def test_hyper(test_method, optimizer_list):
  results = []
  loss = []
  for optimizer in optimizer_list:
    result = test_method(optimizer)
    loss.append(result[2])
    results.append(result)
  data = pd.DataFrame({"Learning Rate":["LR = 1e-2", 'LR = 1e-3', 'LR = 1e-4'], "Training Loss":loss})
  fig = px.bar(data, x='Learning Rate', y='Training Loss')
  fig.show()
  return results, loss

def full_opt_test(test_method, optimizer_list):
  results, loss = test_hyper(test_method, optimizer_list)
  index = np.array(loss).argmin()
  results[index][4].show()
  return results[index][:-1]

- For each optimizer
- for each dataset (fashion-mnist, cifar-10)
- train each model (log-reg, mlp, cnn) on the training set
  - log-reg: L2 regularized multi-class log reg, step size $\alpha$ is adjusted by $\frac{1}{\sqrt{t}}$ decay
    - "Logistic regression has a well-studied convex objective, making it suitable for comparison
of different optimizers without worrying about local minimum issues"
  - MLP: 2 hidden layers with 1k hidden nodes and relu activation, dropout layer on input and output layers
  - CNN: 3 conv stages (3x3 filters, 2x2 maxpooling) with 1000 node FC layer, output dropout, minibatch 128

# CNN

In [50]:
hparams_list = {
    "learning_rates" : [1e-2,1e-3,1e-4]
}

adam = []
sgd = []
rmsprop = []
adadelta = []
adagrad = []
for lr in hparams_list['learning_rates']:
  adam.append(tf.keras.optimizers.Adam(
      learning_rate=lr,
      beta_1=0.9,
      beta_2=0.999,
      epsilon=1e-07
    )
  )

  sgd.append(tf.keras.optimizers.SGD(
      learning_rate=lr, 
      momentum=0.0, 
      nesterov=False
    )
  )

  rmsprop.append(tf.keras.optimizers.RMSprop(
      learning_rate=lr,
      rho=0.9,
      momentum=0.0,
      epsilon=1e-07,
      centered=False
    )
  )

  adadelta.append(tf.keras.optimizers.Adadelta(
      learning_rate=lr, 
      rho=0.95, 
      epsilon=1e-07
    )
  )

  adagrad.append(tf.keras.optimizers.Adagrad(
      learning_rate=lr,
      initial_accumulator_value=0.1,
      epsilon=1e-07
    )
  )

In [51]:
def test_CNN(optimizer, max_epochs=20, batch_size=128, verbose=0):

  # MODEL INITIALIZATION
  model = keras.Sequential(
    [
        keras.Input(shape=(w,h,3)),
        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(128, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dense(1000, activation="relu"),
        layers.Dropout(0.5),
        layers.Dense(100, activation="softmax"),
    ]
  )

  temp_loss = 1
  curr_loss = 0
  i = 0
  total_time = 0
  loss = []
  epochs = []

  # MODEL OPTIMIZER SELECTED
  model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

  # (using training loss for convergence comparison)
  while i < max_epochs:

    # MODEL TRAINING
    tic = time.perf_counter()
    history = model.fit(train_images, train_labels, batch_size=batch_size, epochs=1, validation_split=0.1, verbose=verbose)
    toc = time.perf_counter()

    # MODEL EVALS
    curr_loss = history.history['loss'][0]
    loss.append(curr_loss)

    i += 1
    epochs.append(i)

    total_time += toc - tic

  fig = go.Figure()
  fig.add_trace(go.Scatter(
      x=epochs,
      y=loss,
      mode="lines")
  )

  score = model.evaluate(test_images, test_labels, verbose=0)

  return (i, round(total_time, 3), curr_loss, score[1], fig)

In [52]:
CNN_measurements = []
CNN_measurements.append(full_opt_test(test_CNN, adam))
CNN_measurements.append(full_opt_test(test_CNN, sgd))
CNN_measurements.append(full_opt_test(test_CNN, rmsprop))
CNN_measurements.append(full_opt_test(test_CNN, adadelta))
CNN_measurements.append(full_opt_test(test_CNN, adagrad))
for result in CNN_measurements:
  print(result)

(20, 38.773, 1.4790513515472412, 0.4277999997138977)
(20, 38.001, 3.6238577365875244, 0.17900000512599945)
(20, 42.04, 1.5081017017364502, 0.40400001406669617)
(20, 39.299, 4.231600761413574, 0.07680000364780426)
(20, 38.834, 3.0268189907073975, 0.275299996137619)


# Logistic Regression

In [53]:
hparams_list = {
    "learning_rates" : [1e-2,1e-3,1e-4]
}

adam = []
sgd = []
rmsprop = []
adadelta = []
adagrad = []
for lr in hparams_list['learning_rates']:
  adam.append(tf.keras.optimizers.Adam(
      learning_rate=lr,
      beta_1=0.9,
      beta_2=0.999,
      epsilon=1e-07
    )
  )

  sgd.append(tf.keras.optimizers.SGD(
      learning_rate=lr, 
      momentum=0.0, 
      nesterov=False
    )
  )

  rmsprop.append(tf.keras.optimizers.RMSprop(
      learning_rate=lr,
      rho=0.9,
      momentum=0.0,
      epsilon=1e-07,
      centered=False
    )
  )

  adadelta.append(tf.keras.optimizers.Adadelta(
      learning_rate=lr, 
      rho=0.95, 
      epsilon=1e-07
    )
  )

  adagrad.append(tf.keras.optimizers.Adagrad(
      learning_rate=lr,
      initial_accumulator_value=0.1,
      epsilon=1e-07
    )
  )

In [54]:
def test_LR(optimizer, max_epochs=20, batch_size=128, verbose=0):
  # MODEL INITIALIZATION
  model = Sequential()
  model.add(Dense(classes, input_shape=(w*h*3,), activation='softmax'))
  model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

  # DATA TRANSFORMATION
  X_train = train_images.reshape(train_images.shape[0], w*h*3)
  X_test = test_images.reshape(test_images.shape[0], w*h*3) 

  temp_loss = 1
  curr_loss = 0
  i = 0
  total_time = 0
  loss = []
  epochs = []

  # MODEL OPTIMIZER SELECTED
  LR = float(optimizer.learning_rate)
  model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

  # (using training loss for convergence comparison)
  while i < max_epochs:
    
    temp_loss = curr_loss

    # MODEL TRAINING
    tic = time.perf_counter()
    history = model.fit(X_train, train_labels, batch_size=batch_size, epochs=1, validation_split=0.1, verbose=verbose)
    toc = time.perf_counter()

    # MODEL EVALS
    curr_loss = history.history['loss'][0]
    loss.append(curr_loss)
    epochs.append(i)


    total_time += toc - tic
    K.set_value(model.optimizer.learning_rate, LR / ((i + 1) ** 0.5))
    i += 1
  
  fig = go.Figure()
  fig.add_trace(go.Scatter(
      x=epochs,
      y=loss,
      mode="lines")
  )

  score = model.evaluate(X_test, test_labels, verbose=0)

  return (i, round(total_time, 3), curr_loss, score[1], fig)

In [55]:
LR_measurements = []
LR_measurements.append(full_opt_test(test_LR, adam))
LR_measurements.append(full_opt_test(test_LR, sgd))
LR_measurements.append(full_opt_test(test_LR, rmsprop))
LR_measurements.append(full_opt_test(test_LR, adadelta))
LR_measurements.append(full_opt_test(test_LR, adagrad))
for result in LR_measurements:
  print(result)

(20, 22.441, 3.359934091567993, 0.17339999973773956)
(20, 21.558, 3.8003005981445312, 0.14319999516010284)
(20, 22.671, 3.390036106109619, 0.17080000042915344)
(20, 21.692, 4.138742923736572, 0.09510000050067902)
(20, 22.248, 3.6326868534088135, 0.164900004863739)


# MLP

In [56]:
hparams_list = {
    "learning_rates" : [1e-2,1e-3,1e-4]
}

adam = []
sgd = []
rmsprop = []
adadelta = []
adagrad = []
for lr in hparams_list['learning_rates']:
  adam.append(tf.keras.optimizers.Adam(
      learning_rate=lr,
      beta_1=0.9,
      beta_2=0.999,
      epsilon=1e-07
    )
  )

  sgd.append(tf.keras.optimizers.SGD(
      learning_rate=lr, 
      momentum=0.0, 
      nesterov=False
    )
  )

  rmsprop.append(tf.keras.optimizers.RMSprop(
      learning_rate=lr,
      rho=0.9,
      momentum=0.0,
      epsilon=1e-07,
      centered=False
    )
  )

  adadelta.append(tf.keras.optimizers.Adadelta(
      learning_rate=lr, 
      rho=0.95, 
      epsilon=1e-07
    )
  )

  adagrad.append(tf.keras.optimizers.Adagrad(
      learning_rate=lr,
      initial_accumulator_value=0.1,
      epsilon=1e-07
    )
  )

In [57]:
def test_MLP(optimizer, max_epochs=20, batch_size=128, verbose=0):
  # MODEL INITIALIZATION
  dims = [w*h*3,1000,1000,classes]
  zipped_dims = list(zip(range(len(dims))[:-1], range(len(dims))[1:]))
  model = keras.Sequential()
  for i,j in list(zipped_dims)[:-1]:
    model.add(layers.Dense(dims[j], input_shape=(dims[i],), activation='relu', bias_regularizer=tf.keras.regularizers.L2(0.01), kernel_regularizer=tf.keras.regularizers.L2(0.01)))
    if( i == 0 or j == len(zipped_dims) - 1):
      model.add(layers.Dropout(0.5))
  model.add(layers.Dense(dims[-1], input_shape = (dims[-2],), activation = 'softmax'))
  # DATA TRANSFORMATION
  X_train = train_images.reshape(train_images.shape[0], w*h*3)
  X_test = test_images.reshape(test_images.shape[0], w*h*3) 

  temp_loss = 1
  curr_loss = 0
  i = 0
  total_time = 0
  loss = []
  epochs = []

  # MODEL OPTIMIZER SELECTED
  model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

  # (using training loss for convergence comparison)
  while i < max_epochs:
    
    temp_loss = curr_loss

    # MODEL TRAINING
    tic = time.perf_counter()
    history = model.fit(X_train, train_labels, batch_size=batch_size, epochs=1, validation_split=0.1, verbose=verbose)
    toc = time.perf_counter()

    # MODEL EVALS
    curr_loss = history.history['loss'][0]
    loss.append(curr_loss)
    epochs.append(i)

    i += 1

    total_time += toc - tic

  fig = go.Figure()
  fig.add_trace(go.Scatter(
      x=epochs,
      y=loss,
      mode="lines")
  )

  score = model.evaluate(X_test, test_labels, verbose=0)

  return (i, round(total_time, 3), curr_loss, score[1], fig)

In [58]:
MLP_measurements = []
MLP_measurements.append(full_opt_test(test_MLP, adam))
MLP_measurements.append(full_opt_test(test_MLP, sgd))
MLP_measurements.append(full_opt_test(test_MLP, rmsprop))
MLP_measurements.append(full_opt_test(test_MLP, adadelta))
MLP_measurements.append(full_opt_test(test_MLP, adagrad))
for result in MLP_measurements:
  print(result)

(20, 27.88, 3.9381024837493896, 0.15780000388622284)
(20, 26.67, 5.606688976287842, 0.1386999934911728)
(20, 34.53, 3.920903444290161, 0.1282999962568283)
(20, 29.066, 8.568598747253418, 0.11599999666213989)
(20, 27.545, 3.945629835128784, 0.15199999511241913)
