In [None]:
from support.util import Config, Experiment

from trainable.models.vit import build_focal_LAXNet, build_basic_lunchbox
from trainable.models.cnn import build_basic_convnextv2, build_basic_cnn
from trainable.models.ae import lunchbox_packerv2

from data.datasets.image_classification import deep_weeds, cats_dogs, dot_dataset, citrus_leaves
from data.datasets.image_to_image import cifar10
from optimization.data_augmentation.msda import mixup_dset, blended_dset
from optimization.data_augmentation.ssda import add_gaussian_noise_dset, custom_rand_augment_dset, foff_dset

from tensorflow.keras.callbacks import LearningRateScheduler
from optimization.callbacks import EarlyStoppingDifference

from optimization.training_loops.supervised import keras_supervised
from optimization.schedules import bleed_out
"""
hardware_params must include:

    'n_gpu': uint
    'n_cpu': uint
    'node': str
    'partition': str
    'time': str (we will just write this to the file)
    'memory': uint
    'distributed': bool
"""
# NOTE(review): the checklist above asks for 'node' and 'distributed', while
# this dict supplies 'nodelist' and has no 'distributed' entry -- confirm which
# keys Config actually reads.

# Directory that receives the per-task stdout / stderr text files.
_text_output_dir = '/scratch/jroth/supercomputer/text_outputs'

hardware_params = dict(
    name='hparam',
    n_gpu=4,
    n_cpu=16,
    partition='ai2es',
    nodelist=['c732'],
    time='96:00:00',
    memory=16384,
    # The %01a / %A placeholders are left in the path for SLURM to fill in:
    # %a is the array task index (SLURM_ARRAY_TASK_ID), zero-padded here to a
    # minimum width of 1, and %A is the array's master job ID.
    stdout_path=_text_output_dir + '/exp%01a_stdout_%A.txt',
    stderr_path=_text_output_dir + '/exp%01a_stderr_%A.txt',
    email='jay.c.rothenberger@ou.edu',
    dir='/scratch/jroth/AI2ES-DL/',
    array='[1]',
    results_dir='results',
)
"""
network_params must include:
    
    'network_fn': network building function
    'network_args': arguments to pass to network building function
        network_args must include:
            'lrate': float
    'hyperband': bool
"""
# Shape of one input image; presumably (height, width, channels) -- the
# dataset arguments further down use every entry except the last one.
image_size = (32, 32, 3)

# NOTE(review): list-valued entries (learning_rate, alpha, beta) look like
# hyperparameter-grid axes -- the run log reports a grid of size 2, matching
# the two alpha values. conv_size and dense_layers are strings, presumably to
# keep them from being expanded the same way. Confirm against Config.
# NOTE(review): both 'lrate' and 'learning_rate' are supplied; verify which
# one the network builder actually consumes.
network_params = dict(
    network_fn=lunchbox_packerv2,
    network_args=dict(
        lrate=5e-4,
        n_classes=2,
        iterations=6,
        conv_filters=24,
        conv_size='[3]',
        dense_layers='[16]',
        learning_rate=[5e-4],
        image_size=image_size,
        l1=None,
        l2=None,
        alpha=[1, 2 ** -10],
        beta=[2 ** -7],
        noise_level=0.005,
        depth=3,
    ),
    hyperband=False,
)

"""
experiment_params must include:
    
    'seed': random seed for computation
    'steps_per_epoch': uint
    'validation_steps': uint
    'patience': uint
    'min_delta': float
    'epochs': uint
    'nogo': bool
"""


# Run-level settings. In this file 'patience' and 'min_delta' are only
# referenced by the commented-out EarlyStoppingDifference callback, so they
# have no visible effect on the callbacks that are currently active.
experiment_params = dict(
    seed=42,
    steps_per_epoch=512,
    validation_steps=256,
    patience=3,
    min_delta=0.0,
    epochs=5,
    nogo=False,
)
"""
dataset_params must include:
    'dset_fn': dataset loading function
    'dset_args': arguments for dataset loading function
    'cache': str or bool
    'batch': uint
    'prefetch': uint
    'shuffle': bool
    'augs': iterable of data augmentation functions
"""
# Dataset configuration. The loader is the cifar10 function imported from
# data.datasets.image_to_image at the top of this cell.
dataset_params = dict(
    dset_fn=cifar10,
    dset_args=dict(
        # Every entry of image_size except the last, i.e. (32, 32) here.
        image_size=image_size[:-1],
        path='../data/',
    ),
    cache=False,
    cache_to_lscratch=False,
    batch=256,
    prefetch=4,
    shuffle=True,
    # No augmentation functions are applied in this run.
    augs=[],
)

# Callbacks handed to the training loop.
# NOTE(review): at this point network_params['network_args']['learning_rate']
# is still the list [5e-4], not a float -- confirm bleed_out accepts that.
training_callbacks = [
    # Disabled: early stopping on the difference between two validation metrics.
    # EarlyStoppingDifference(patience=experiment_params['patience'],
    #                        restore_best_weights=True,
    #                        min_delta=experiment_params['min_delta'],
    #                        metric_0='val_clam_categorical_accuracy',
    #                        metric_1='val_clam_1_categorical_accuracy',
    #                        n_classes=2),
    LearningRateScheduler(
        bleed_out(network_params['network_args']['learning_rate'])
    ),
    # Disabled:
    # LossWeightScheduler(loss_weight_schedule)
]

optimization_params = dict(
    callbacks=training_callbacks,
    training_loop=keras_supervised,
)

# Bundle the five parameter dictionaries defined above into one Config object.
config = Config(hardware_params, network_params, dataset_params, experiment_params, optimization_params)


# Only start an experiment when this file is executed directly (a notebook
# cell also runs with __name__ == "__main__"), not when it is imported.
if __name__ == "__main__":

    exp = Experiment(config)

    # Debug aid: uncomment to dump the experiment's parameters.
    # print(exp.params)
    # Presumably runs the experiment for array index 0 in the current process
    # (the captured log below shows training starting) -- confirm in Experiment.
    exp.run_array(0)

    # Alternative entry point, currently disabled; presumably submits the job
    # to the scheduler instead of running it here -- TODO confirm.
    # exp.enqueue()


2023-02-01 20:39:59.874687: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


Size of Hyperparameter Grid: 2
{
	name: hparam
	n_gpu: 4
	n_cpu: 16
	partition: ai2es
	nodelist: ['c732']
	time: 96:00:00
	memory: 16384
	stdout_path: /scratch/jroth/supercomputer/text_outputs/exp%01a_stdout_%A.txt
	stderr_path: /scratch/jroth/supercomputer/text_outputs/exp%01a_stderr_%A.txt
	email: jay.c.rothenberger@ou.edu
	dir: /scratch/jroth/AI2ES-DL/
	array: [1]
	results_dir: results
	}
{
	seed: 42
	steps_per_epoch: 512
	validation_steps: 256
	patience: 3
	min_delta: 0.0
	epochs: 5
	nogo: False
	}
{
	network_fn: <function lunchbox_packerv2 at 0x2ac2e3efcee0>
	network_args: {
		hyperband: False
		noise_level: 0.005
		l2: None
		dense_layers: [16]
		depth: 3
		conv_filters: 24
		lrate: 0.0005
		conv_size: [3]
		image_size: (32, 32, 3)
		iterations: 6
		alpha: 1
		beta: 0.0078125
		learning_rate: 0.0005
		l1: None
		n_classes: 2
		}
	hyperband: False
	}
{
	dset_fn: <function cifar10 at 0x2ac2e4e1bbe0>
	dset_args: {
		image_size: (32, 32)
		path: ../data/
		}
	cache: False
	cache_to_l

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3')


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


512 None
Model: "lunchbox_ae"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 32, 32, 3)]       0         
                                                                 
 reshape (Reshape)           (None, 1024, 3)           0         
                                                                 
 v_lunchbox_mhsa (VLunchboxM  (None, 1024, 24)         59520     
 HSA)                                                            
                                                                 
 q_lunchbox_mhsa (QLunchboxM  (None, 512, 24)          26112     
 HSA)                                                            
                                                                 
 layer_normalization (LayerN  (None, 512, 24)          48        
 ormalization)                                                   
                                              

2023-02-01 20:41:21.266508: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_UINT8
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 50000
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\025TensorSliceDataset:29"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
        dim {
          size: 32
        }
        dim {
          size: 32
        }
        dim {
          size: 3
        }
      }
    }
  }
}
attr {
  key: "replicate_on_split"
  value {
    b: false
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT

Epoch 1/5
INFO:tensorflow:batch_all_reduce: 53 all-reduces with algorithm = nccl, num_packs = 1


INFO:tensorflow:batch_all_reduce: 53 all-reduces with algorithm = nccl, num_packs = 1


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:batch_all_reduce: 53 all-reduces with algorithm = nccl, num_packs = 1


INFO:tensorflow:batch_all_reduce: 53 all-reduces with algorithm = nccl, num_packs = 1


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).




2023-02-01 20:44:29.954298: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_UINT8
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 10000
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\025TensorSliceDataset:36"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
        dim {
          size: 32
        }
        dim {
          size: 32
        }
        dim {
          size: 3
        }
      }
    }
  }
}
attr {
  key: "replicate_on_split"
  value {
    b: false
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


Epoch 2/5
Epoch 3/5
 85/512 [===>..........................] - ETA: 2:08 - loss: 0.1747

In [1]:
from support.util import load_most_recent_results

# Directory that holds the saved experiment results.
result_path = '../results/'

# First entry of what load_most_recent_results returns for (result_path, 1);
# presumably the single most recent result -- confirm against support.util.
results = load_most_recent_results(result_path, 1)[0]

results.summary()

# Point the stored dataset arguments at the data directory used from here.
loaded_dset_params = results.config.dataset_params
loaded_dset_params['dset_args']['path'] = '../data/'

# Build the dataset once and read both entries from the same return value.
# The loader was previously invoked twice with identical arguments.
loaded_dset = loaded_dset_params['dset_fn'](**loaded_dset_params['dset_args'])
class_names = loaded_dset['class_names']

model_data = results.model_data
keras_model = model_data.get_model()

# Batch size 1: each element of test_dset is a single example.
test_dset = loaded_dset['test'].batch(1)

# Peek at the first element only, to check the shapes.
for x, y in test_dset:
    print(x.shape, y.shape)
    break

2023-02-01 17:34:31.696945: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm



------------------------------------------------------------
Experimental Results Summary (Index: 0)
------------------------------------------------------------
Dataset Params: {
	dset_fn: <function cifar10 at 0x2b7489510280>
	dset_args: {
		image_size: (32, 32)
		path: ../data/
		}
	cache: False
	cache_to_lscratch: False
	batch: 256
	prefetch: 4
	shuffle: True
	augs: []
	}

Network Params:  {
	network_fn: <function lunchbox_packer at 0x2b74895100d0>
	network_args: {
		hyperband: False
		image_size: (32, 32, 3)
		l2: None
		depth: 3
		lrate: 0.0005
		conv_size: [3]
		alpha: 1
		conv_filters: 24
		learning_rate: 0.0005
		n_classes: 2
		l1: None
		noise_level: 0.005
		dense_layers: [16]
		iterations: 6
		beta: 0.0078125
		}
	hyperband: False
	}
------------------------------------------------------------
Experiment Parameters: {
	seed: 42
	steps_per_epoch: 512
	validation_steps: 256
	patience: 3
	min_delta: 0.0
	epochs: 5
	nogo: False
	}

Experiment Runtime: 801.9894886016846s

Epochs 

In [6]:
import tensorflow as tf

# How many leading layers of the loaded model are reused below.
N_ENCODER_LAYERS = 17
# Number of units in the new dense head.
N_OUTPUTS = 10


def hard_swish(z):
    """Return ``z * relu6(z + 3) / 6`` element-wise; used as the head's activation."""
    return z * tf.nn.relu6(z + 3) / 6


# (index, layer) pairs for the first N_ENCODER_LAYERS layers of the loaded model.
# NOTE(review): keras_model is rebound further down, so running this cell a
# second time would slice the newly built model instead of the loaded one.
encoder_layers = list(enumerate(keras_model.layers))[:N_ENCODER_LAYERS]

inputs = tf.keras.layers.Input((32, 32, 3))
x = inputs
# Entry 0 is skipped (presumably the loaded model's own input layer); the
# remaining layers are re-applied, in order, to the fresh input tensor.
for _idx, layer in encoder_layers[1:]:
    if 'lunchbox' in layer.name:
        # Project-specific hook on the lunchbox layers; its effect is not
        # visible from this file.
        layer.pack()
    x = layer(x)

x = tf.keras.layers.Flatten()(x)
# NOTE(review): the head emits hard-swish values, which can be negative, while
# the model is compiled with 'categorical_crossentropy' -- a loss that by
# default expects probabilities. Confirm this is intended and not a missing
# softmax.
outputs = tf.keras.layers.Dense(N_OUTPUTS, activation=hard_swish)(x)

keras_model = tf.keras.models.Model(inputs=[inputs], outputs=[outputs])

# NOTE(review): in the legacy Keras optimizers `decay` is a per-step
# inverse-time learning-rate decay, so 0.99 would shrink the rate very quickly;
# newer Keras optimizers reject the argument. Confirm the intended value and
# the TensorFlow version.
opt = tf.keras.optimizers.Nadam(
    learning_rate=1e-4,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=None,
    decay=0.99,
)

# Loss scaling wrapper; no mixed-precision policy is set anywhere in this
# file -- presumably configured elsewhere, TODO confirm.
opt = tf.keras.mixed_precision.LossScaleOptimizer(opt)

keras_model.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
    metrics=['categorical_accuracy'],
)

keras_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 32, 32, 3)]       0         
                                                                 
 layer_normalization (LayerN  (None, 32, 32, 3)        6         
 ormalization)                                                   
                                                                 
 reshape (Reshape)           (None, 1024, 3)           0         
                                                                 
 lunchbox_mhsa (LunchboxMHSA  (None, 256, 24)          9024      
 )                                                               
                                                                 
 reshape_1 (Reshape)         (None, 16, 16, 24)        0         
                                                                 
 layer_normalization_1 (Laye  (None, 16, 16, 24)       48  

In [7]:
from data.datasets.image_classification import cifar10 as cifar

# Classification variant of CIFAR-10, loaded with the loader's default arguments.
dset_obc = cifar()

# Batch the training and validation splits, 64 examples per batch.
train = dset_obc['train'].batch(64)
val = dset_obc['val'].batch(64)

# Print the shapes of the first training batch, then stop iterating.
# NOTE(review): the log below warns about a cached dataset that was not fully
# read; this early exit may be one of the triggers -- harmless if so.
for x, y in train:
    print(x.shape, y.shape)
    break

# Kept as the cell's final expression so the notebook still displays the
# returned History object.
keras_model.fit(train, validation_data=val, epochs=10)

(64, 32, 32, 3) (64, 10)


2023-02-01 17:38:09.978351: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-02-01 17:38:11.044323: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-02-01 17:38:11.058829: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. I



<keras.callbacks.History at 0x2b79fc82c310>