In [2]:
from attention_dynamic_model import AttentionDynamicModel
import tensorflow as tf
import time
from tensorflow.keras.optimizers import Adam
from reinforce_baseline import RolloutBaseline
from time import strftime, gmtime
from utils import create_data_on_disk
from train import train_model
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
%env TF_GPU_ALLOCATOR=cuda_malloc_async

2022-10-25 08:56:55.318256: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Num GPUs Available:  1
env: TF_GPU_ALLOCATOR=cuda_malloc_async


2022-10-25 08:56:57.363621: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-25 08:56:57.395975: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-25 08:56:57.396151: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


Benchmarking AM-D
=================

This experiment will train an AM-D model based on the following hyperparameters:

| Hyperparameter | Values |
| ---: | :--- |
| Learning Rate | 1e-4, 5e-5 1e-5 |
| Embedding Dimensions | 64, 128, 256 |
| Attention Heads | 6, 8, 10 |
| Encoder Layers | 1, 2, 3, 4 |



In [3]:
learn_rates = [1e-4, 5e-5, 1e-5]
enc_dims    = [64, 128, 256]
attn_heads  = [4, 8] # Note: enc_dims must be divided by attn_heads
enc_layers  = [1, 2, 3, 4]
iterations  = [2, 2, 4, 8, 16, 32, 64, 128]

''' Constant Parameters '''
# AM-D Constants
tanh_clipping = 10

# Optimizer Constants (Adam)
beta_1        = 0.9
beta_2        = 0.999
epsilon       = 1e-07
amsgrad       = False
name          = "Adam"

# Environment Constants
graph_size         = 20

# Rollout Baseline Constants
wp_n_epochs        = 5
epoch              = 0
num_samples        = 10_000 #Validation Samples
warmup_exp_beta    = 0.8

# Training Constants
samples            = 10_000 #1_280_000 # 512
batch              = 128
val_batch_size     = 1_000 # out of num_samples
start_epoch        = 0
grad_norm_clipping = 1.0
batch_verbose      = 1_000

global_start_time = time.time()

for lr in learn_rates:
    for enc_dim in enc_dims:
        for attn_h in attn_heads:
            for enc_layer in enc_layers:
                print(f'Learning Rate: {lr}\nEmbedding Dimension: {enc_dim}\nAttention Heads: {attn_h}\nEncoder Layers: {enc_layer}')
                start_time = time.time()
                
                # Build AM-D Model
                model_amd = AttentionDynamicModel(
                    embedding_dim  =enc_dim,
                    n_encode_layers=enc_layer,
                    n_heads        =attn_h,
                    tanh_clipping  =tanh_clipping
                )
                model_amd.set_decode_type('sampling')
                
                # Create Optimizer
                optimizer = Adam(
                    learning_rate=lr,
                    beta_1=beta_1,
                    beta_2=beta_2,
                    epsilon=epsilon,
                    amsgrad=amsgrad,
                    name=name,
                )
                
                # Baseline model
                baseline = RolloutBaseline(
                    model             = model_amd,
                    filename          = None,
                    from_checkpoint   = False,
                    path_to_checkpoint= None,
                    wp_n_epochs       = wp_n_epochs,
                    epoch             = epoch,
                    num_samples       = num_samples,
                    embedding_dim     = enc_dim,
                    graph_size        = graph_size
                    )
                for _iter in iterations:
                    print(f'Iteration: {_iter}')
                    date = 'oct_25'
                    filename = f'checkpoints/AMD-banchmarking-trained_on-{_iter * num_samples}-num_layers-{enc_layer}-attn-{attn_h}-embedding_dim-{enc_dim}-lr-{lr}-date-{date}.ckp'
                    
                    validation_dataset = create_data_on_disk(
                        graph_size     =graph_size,
                        num_samples    =num_samples,
                        is_save        =False,
                        filename       =None,
                        is_return      =True,
                        seed           = 42
                    )
                    train_model(
                        optimizer          = optimizer,
                        model_tf           = model_amd,
                        baseline           = baseline,
                        validation_dataset = validation_dataset,
                        samples            = samples,
                        batch              = batch,
                        val_batch_size     = val_batch_size,
                        start_epoch        = start_epoch,
                        end_epoch          = _iter,
                        from_checkpoint    = False,
                        grad_norm_clipping = grad_norm_clipping,
                        batch_verbose      = batch_verbose,
                        graph_size         = graph_size,
                        filename           = None
                        )
                    model_amd.save_weights(filename)
                    # To Do: Add information about progress
                print(f'Time for Parameter Iteration: {time.time() - start_time}')

print(f'Total time: {time.time() - global_start_time}')

2022-10-25 08:56:57.413515: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Learning Rate: 0.0001
Embedding Dimension: 64
Attention Heads: 4
Encoder Layers: 1


2022-10-25 08:56:57.414051: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-25 08:56:57.414414: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-25 08:56:57.414680: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-25 08:56:57.909216: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-25 08:56:57.909418: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from S

Evaluating baseline model on baseline dataset (epoch = 0)


Rollout greedy execution: 100%|█████████████████| 10/10 [00:11<00:00,  1.19s/it]


Iteration: 2
Current decode type: sampling


batch calculation at epoch 0: 1it [00:01,  1.80s/it]

grad_global_norm = 4.556308269500732, clipped_norm = 0.9999999403953552
Epoch 0 (batch = 0): Loss: -2.090458869934082: Cost: 13.4697847366333


batch calculation at epoch 0: 79it [01:36,  1.22s/it]


Evaluating candidate model on baseline dataset (callback epoch = 0)


Rollout greedy execution: 100%|█████████████████| 10/10 [00:05<00:00,  1.79it/s]


Epoch 0 candidate mean 9.84041976928711, baseline epoch 0 mean 16.721332550048828, difference -6.880912780761719
p-value: 0.0
Update baseline


ValueError: Cannot assign value to variable ' attention_dynamic_model_2/graph_attention_encoder_2/multi_head_attention_layer_4/MHA/dense_17/kernel:0': Shape mismatch.The variable shape (64, 64), and the assigned value shape (65, 64) are incompatible.

In [7]:
model_amd.variables

[<tf.Variable 'attention_dynamic_model/graph_attention_encoder/init_embed_depot/kernel:0' shape=(2, 64) dtype=float32, numpy=
 array([[ 0.13315047, -0.10507061, -0.11463892, -0.1462347 ,  0.0042512 ,
         -0.0430197 , -0.1908948 , -0.07025934,  0.08306053, -0.01930277,
          0.12962146, -0.19577964,  0.2500617 , -0.03038784, -0.2848677 ,
         -0.22235033,  0.20459744, -0.11801361,  0.13305368, -0.25599414,
         -0.24735446, -0.05458432, -0.01473179,  0.2734475 , -0.11864787,
          0.19604796, -0.2913884 , -0.1580056 , -0.18718438, -0.09049027,
          0.02109828, -0.11962614,  0.21479948,  0.24813636,  0.19356214,
          0.20777883, -0.25860614,  0.29386246,  0.2750997 , -0.06497433,
          0.03031395, -0.06930444,  0.20231831,  0.14439535, -0.18665949,
         -0.16434969, -0.23551273,  0.03986199, -0.2667672 ,  0.07536203,
          0.08386199,  0.10153659,  0.22627957,  0.10100763, -0.0506764 ,
         -0.14208376,  0.22061281,  0.05918619,  0.06576119,