# TensorFlow ResNet50 Optimization Tutorial

## 1. Compile the model with various configurations.

### On inf1.6xlarge, run through the following steps to get an optimized ResNet50 model.
### Extract Keras ResNet50 FP32 (resnet50_fp32_keras.pb will be generated):

In [1]:
import re
import argparse
import tensorflow as tf
import numpy as np

from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions

from google.protobuf import text_format
import tensorflow.python.saved_model

# Cell purpose: load the pre-trained Keras ResNet50, checkpoint it, and write a
# frozen (variables converted to constants) GraphDef to `frozen_file`.

# set Keras global configurations
# learning phase 0 is passed before the model is built; data format is NHWC
tf.keras.backend.set_learning_phase(0)
tf.keras.backend.set_image_data_format('channels_last')

# float_type is the Keras floatx setting; float_type2 is only used to build
# file names
float_type = 'float32'
float_type2 = 'fp32'
tf.keras.backend.set_floatx(float_type)

# load pre-trained model using Keras
model_name = 'resnet50_%s_keras'%float_type2
model = ResNet50(weights='imagenet')

# various save files
# opt_file is only named and printed here; nothing in this cell writes it
frozen_file = model_name + '.pb'
opt_file = model_name + '_opt.pb'

# obtain parameters
# tensor names carry a ':0' suffix; strip it to get the graph node names
model_input = model.input.name.replace(':0', '')
model_output = model.output.name.replace(':0', '')
batch, height, width, channels = model.input.shape

print ("model, frozen file, optimized file, input size, input node, output node,")
print ("%s, %s, %s, %dx%dx%d, %s, %s" %(model_name, frozen_file, opt_file, width, height, channels, model_input, model_output) ) 

# obtain the TF session
# (this name is rebound by the `with` statement further down)
sess = tf.compat.v1.keras.backend.get_session()

# save checkpoint files for freeze_graph
ckpt_file = '/tmp/' + model_name + '/' + model_name + '.ckpt'
graph_file = '/tmp/' + model_name + '/' + model_name + '.pb'
tf.compat.v1.train.Saver().save(sess, ckpt_file)
# NOTE(review): graph_file is an absolute /tmp path, so logdir='.' presumably
# has no effect on where the graph is written - confirm
tf.io.write_graph(sess.graph.as_graph_def(), logdir='.', name=graph_file, as_text=False)

print(model_output)
# Re-import the checkpointed graph into a fresh Graph/Session, convert its
# variables to constants, strip training-only nodes (keeping the output node),
# and serialize the result to `frozen_file`.
with tf.compat.v1.Session(graph=tf.Graph()) as sess:
      saver = tf.compat.v1.train.import_meta_graph(ckpt_file + '.meta')
      saver.restore(sess, ckpt_file)
      # only the sub-graph needed to compute model_output is kept
      output_graph_def = tf.compat.v1.graph_util.convert_variables_to_constants(
          sess, tf.compat.v1.get_default_graph().as_graph_def(), [model_output])
      output_graph_def = tf.compat.v1.graph_util.remove_training_nodes(
          output_graph_def, protected_nodes=[model_output])
      with open(frozen_file, 'wb') as f:
          f.write(output_graph_def.SerializeToString())

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
model, frozen file, optimized file, input size, input node, output node,
resnet50_fp32_keras, resnet50_fp32_keras.pb, resnet50_fp32_keras_opt.pb, 224x224x3, input_1, probs/Softmax
probs/Softmax
INFO:tensorflow:Restoring parameters from /tmp/resnet50_fp32_keras/resnet50_fp32_keras.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 320 variables.
INFO:tensorflow:Converted 320 variables to const ops.
Instructions for updating:
Use `tf.compat.v1.graph_util.remove_training_nodes`


### Optimize the extracted Keras ResNet50 FP32 graph for inference before casting (resnet50_fp32_keras_opt.pb will be generated) with the following transformations to the graph:

### * Remove Identity and CheckNumerics nodes
### * Fold FusedBatchNorm constants into previous Conv2D weights
### * Fold other constants
### * Strip unused nodes
### * Sort by execution order

In [2]:
import copy
import string

from google.protobuf import text_format
from tensorflow.core.framework import node_def_pb2
from tensorflow.core.framework import attr_value_pb2
from tensorflow.python.framework import tensor_util
from tensorflow.tools.graph_transforms import TransformGraph

def clear_input(node):
  """Empty ``node.input`` in place, leaving the same container object attached.

  Args:
    node: object whose ``input`` attribute is a mutable sequence.
  """
  del node.input[:]

def replace_name(node, name):
  """Set ``node.name`` to ``name``, mutating ``node`` in place."""
  node.name = name
     
def replace_input(node, input_name, new_name):
  """Rewire ``node`` so every input equal to ``input_name`` becomes ``new_name``.

  All other inputs keep their value and position. ``node.input`` is rebuilt in
  place: it is emptied and then refilled with the rewritten names.

  Args:
    node: object with a mutable ``input`` sequence of names.
    input_name: the input name to look for.
    new_name: the name substituted for each match.
  """
  rewired = [new_name if src == input_name else src for src in node.input]
  clear_input(node)
  node.input.extend(rewired)

def swap_names(node1, node2):
  """Exchange the ``name`` attributes of ``node1`` and ``node2`` in place."""
  node1.name, node2.name = node2.name, node1.name

def get_const_node(const_node_name, const_by_name):
  """Return the entry of ``const_by_name`` for ``const_node_name``.

  A trailing ``/read`` on ``const_node_name`` is stripped before the lookup.

  Args:
    const_node_name: name to look up, optionally ending in ``/read``.
    const_by_name: mapping from name to node.

  Raises:
    KeyError: if the stripped name is not a key of ``const_by_name``.
  """
  return const_by_name[re.sub("/read$", "", const_node_name)]

def get_const_ndarray(const_node_name, const_by_name):
  """Return the ``value`` tensor of a node in ``const_by_name`` as an ndarray.

  A trailing ``/read`` on ``const_node_name`` is stripped before the lookup.

  Args:
    const_node_name: name to look up, optionally ending in ``/read``.
    const_by_name: mapping from name to a node carrying a ``value`` attr.

  Returns:
    The result of ``tf.make_ndarray`` on the node's ``value`` tensor.

  Raises:
    KeyError: if the stripped name is not a key of ``const_by_name``.
  """
  const_node = const_by_name[re.sub("/read$", "", const_node_name)]
  value_attr = const_node.attr.get("value")
  return tf.make_ndarray(value_attr.tensor)

def adjust_bias_values(bias_node, fbn_node, const_by_name):
  """Rescale a bias constant so its BiasAdd can run after the batch norm.

  Batch norm computes ``gamma * (x - mean) / sqrt(variance + epsilon) + beta``.
  A bias ``b`` added before it therefore contributes
  ``b * gamma / sqrt(variance + epsilon)`` to the output; that is the value the
  bias must take once the BiasAdd is moved behind the batch norm. The mean and
  beta terms do not involve the bias and need no adjustment.

  Args:
    bias_node: node whose ``input[1]`` names the bias constant.
    fbn_node: batch-norm node; ``input[1]`` names gamma and ``input[4]`` names
      the variance. Its ``epsilon`` attr is used when present.
    const_by_name: mapping from Const node name to node. The bias constant's
      ``value`` attr is overwritten in place with the rescaled tensor.
  """
  bias_val = get_const_ndarray(bias_node.input[1], const_by_name)
  gamma_val = get_const_ndarray(fbn_node.input[1], const_by_name)
  variance_val = get_const_ndarray(fbn_node.input[4], const_by_name)
  # Membership test first: indexing a missing key would create an empty entry
  # on the node. Without the attr, fall back to 0.0 (plain gamma/sqrt(var)).
  epsilon = fbn_node.attr["epsilon"].f if "epsilon" in fbn_node.attr else 0.0
  new_bias = bias_val * gamma_val / np.sqrt(variance_val + epsilon)
  new_tensor = tensor_util.make_tensor_proto(new_bias, new_bias.dtype, new_bias.shape)
  bias_const_node = get_const_node(bias_node.input[1], const_by_name)
  bias_const_node.attr["value"].CopyFrom(attr_value_pb2.AttrValue(tensor=new_tensor))

def MoveBiasAddAfterFusedBatchNorm(graphdef):
  """Return a copy of ``graphdef`` with each BiasAdd moved behind its batch norm.

  The fold_batch_norms transform of TransformGraph is unable to fold Keras
  ResNet50 because of the BiasAdd between Conv2D and FusedBatchNorm (BiasAdd is
  not needed if FusedBatchNorm is used, but it exists in Keras ResNet50). Here
  the BiasAdd is moved to after the FusedBatchNorm and its bias constant is
  rescaled by ``adjust_bias_values`` to compensate.

  Nodes are processed in the order they appear in ``graphdef``; a batch-norm
  node's inputs must therefore already have been seen.

  Args:
    graphdef: GraphDef to rewrite. Note that FusedBatchNormV3 nodes of this
      input are relabelled in place (op renamed, ``U`` attr dropped).

  Returns:
    A new GraphDef holding the (possibly modified) copies of all nodes.
  """
  output_graph_def = tf.compat.v1.GraphDef()
  node_by_name = {}
  const_by_name = {}
  for node in graphdef.node:
    # Hack: relabel FusedBatchNormV3 as FusedBatchNorm so the batch-norm
    # folding transforms can recognize it
    if node.op == "FusedBatchNormV3":
      node.op = "FusedBatchNorm"
      # Drop the "U" attr along with the V3 op name; guarded so a node
      # without it does not raise KeyError
      if "U" in node.attr:
        del node.attr["U"]
    copied_node = node_def_pb2.NodeDef()
    copied_node.CopyFrom(node)
    node_by_name[node.name] = copied_node
    skip_add_node = False
    # Switch Mul/BiasAdd in Keras RN50 so fold_batch_norm transform would work
    if node.op == "Const":
      const_by_name[node.name] = copied_node
    elif node.op.startswith("FusedBatchNorm"):
      for i in node.input:
        input_node = node_by_name[i]
        if input_node.op == "BiasAdd":
          # The BiasAdd was already emitted; take it out so it can be
          # re-emitted after the batch norm
          output_graph_def.node.remove(input_node)
          input_node_input0 = input_node.input[0]
          # Adjust bias values to account for the batch-norm scaling
          adjust_bias_values(input_node, node, const_by_name)
          # Hack: swap names to avoid changing input of activation
          swap_names(copied_node, input_node)
          # Fix inputs for these two ops: the batch norm now reads what the
          # BiasAdd used to read, and the BiasAdd reads the batch norm
          replace_input(copied_node, i, input_node_input0)
          replace_input(input_node, input_node_input0, copied_node.name)
          # Fix order in node list
          output_graph_def.node.extend([copied_node])
          output_graph_def.node.extend([input_node])
          skip_add_node = True
    # Add maybe-modified nodes if not already done
    if not skip_add_node:
      output_graph_def.node.extend([copied_node])
  return output_graph_def

def FoldFusedBatchNorm(graph_def, input_names=('input_1',),
                       output_names=('probs/Softmax',)):
  """Optimize a training graph for inference with TransformGraph.

  The following transforms are applied, in this order:
    - Add default attributes
    - Remove Identity and CheckNumerics nodes
    - Fold constants (errors ignored)
    - Fold batch-norm constants into the previous Conv2D weights
      (fold_batch_norms, fold_old_batch_norms)
    - Strip unused nodes
    - Sort by execution order

  Args:
    graph_def: GraphDef to transform.
    input_names: iterable of graph input node names. Defaults to the Keras
      ResNet50 input used in this tutorial.
    output_names: iterable of graph output node names. Defaults to the Keras
      ResNet50 output used in this tutorial.

  Returns:
    The GraphDef returned by TransformGraph.
  """
  transforms = [
      'add_default_attributes',
      'remove_nodes(op=Identity, op=CheckNumerics)',
      'fold_constants(ignore_errors=true)',
      'fold_batch_norms',
      'fold_old_batch_norms',
      'strip_unused_nodes',
      'sort_by_execution_order',
  ]
  return TransformGraph(
      graph_def, list(input_names), list(output_names), transforms)

def load_graph(model_file):
  """Read a serialized GraphDef from ``model_file`` and return it parsed.

  Args:
    model_file: path of a binary protobuf file.

  Returns:
    A ``tf.compat.v1.GraphDef`` populated from the file's bytes.
  """
  with open(model_file, "rb") as pb_stream:
    serialized = pb_stream.read()

  parsed = tf.compat.v1.GraphDef()
  parsed.ParseFromString(serialized)
  return parsed


# Driver for this cell: load the frozen FP32 graph, move each BiasAdd behind
# its batch norm, run the folding transforms, and write the optimized graph.
graph_orig = load_graph('resnet50_fp32_keras.pb')
graph_mod = MoveBiasAddAfterFusedBatchNorm(graph_orig)
graph_mod2 = FoldFusedBatchNorm(graph_mod)
with tf.io.gfile.GFile('resnet50_fp32_keras_opt.pb', "wb") as f:
    f.write(graph_mod2.SerializeToString())

### Convert full graph to FP16 (resnet50_fp16_keras_opt.pb will be generated):
### This will take about a minute.

In [3]:
from tensorflow.core.framework import graph_pb2
from tensorflow.python.platform import gfile

def ConvertFP32ToOther(graphdef, cast_type="float16"):
  """Return a copy of ``graphdef`` with FP32 weights and dtypes recast.

  Converts an FP32 network by casting all float32 constants (weights) to
  ``cast_type`` and updating the float32 dtype attributes everywhere:
    - Const nodes whose ``dtype`` is float32 get their ``value`` tensor cast
      and their ``dtype`` attr retargeted.
    - On every other node, the ``T``, ``Tparams``, ``dtype``, ``SrcT`` and
      ``DstT`` attrs are retargeted when they are float32.
  Attrs of any other type are left untouched. The cast is done with NumPy, so
  no TensorFlow session or graph is created.

  Args:
    graphdef: GraphDef to convert; it is not modified.
    cast_type: target floating point type, anything accepted by
      ``tf.as_dtype``. Defaults to "float16".

  Returns:
    A new GraphDef containing the converted copies of all nodes.
  """
  target_dtype = tf.as_dtype(cast_type)
  numpy_target = target_dtype.as_numpy_dtype
  dt_float_type_attr = attr_value_pb2.AttrValue(
      type=tf.float32.as_datatype_enum)
  dt_cast_type_attr = attr_value_pb2.AttrValue(
      type=target_dtype.as_datatype_enum)
  # Type-carrying attrs that are retargeted on non-Const nodes
  retyped_attrs = ("T", "Tparams", "dtype", "SrcT", "DstT")

  output_graph_def = graph_pb2.GraphDef()
  for node in graphdef.node:
    output_node = node_def_pb2.NodeDef()
    output_node.CopyFrom(node)
    # Membership is tested before indexing: indexing a missing key would
    # create an empty entry on the input node
    if node.op == "Const":
      if ("dtype" in node.attr
          and node.attr["dtype"] == dt_float_type_attr):
        weights = tensor_util.MakeNdarray(node.attr["value"].tensor)
        weights = weights.astype(numpy_target)
        output_node.attr["dtype"].CopyFrom(dt_cast_type_attr)
        output_node.attr["value"].CopyFrom(
            attr_value_pb2.AttrValue(
                tensor=tensor_util.make_tensor_proto(
                    weights, dtype=target_dtype, shape=weights.shape)))
    else:
      for attr_name in retyped_attrs:
        if (attr_name in node.attr
            and node.attr[attr_name] == dt_float_type_attr):
          output_node.attr[attr_name].CopyFrom(dt_cast_type_attr)
    output_graph_def.node.extend([output_node])
  return output_graph_def

def load_graph(model_file):
  """Read a serialized GraphDef from ``model_file`` and return it parsed.

  Uses ``tf.compat.v1.GraphDef`` like the rest of this file; the bare
  ``tf.GraphDef`` alias is not available in TensorFlow 2.x.

  Args:
    model_file: path of a binary protobuf file.

  Returns:
    A GraphDef populated from the file's bytes.
  """
  graph_def = tf.compat.v1.GraphDef()

  with open(model_file, "rb") as f:
    graph_def.ParseFromString(f.read())

  return graph_def

# Driver for this cell: load the optimized FP32 graph, convert it with
# ConvertFP32ToOther, and write the result as the FP16 graph file.
graph_f32 = load_graph('resnet50_fp32_keras_opt.pb')
graph_f16 = ConvertFP32ToOther(graph_f32)
output_xformed_graph_name = 'resnet50_fp16_keras_opt.pb'
with gfile.GFile(output_xformed_graph_name, "wb") as f:
    f.write(graph_f16.SerializeToString())


#### Run the compilation script to sweep through various batch sizes up to 5 and several NeuronCore Group sizes up to 16. The script calls the compilation script pb2sm_compile.py which tries to perform compilation. Some error messages are expected due to known issues (see Known Issues section below). If you run all the configurations it will take about 45 minutes.

In [4]:
%%bash
#!/usr/bin/env bash

# Compile sweep: runs pb2sm_compile.py for every (batch size, core count)
# pair, logs all output to full_sweep.log, and writes a summary table to
# full_sweep_results.txt.

# Reset both files (each is overwritten with a single empty line)
echo "" > full_sweep.log
echo "" > full_sweep_results.txt

# results[b] accumulates one ", <value>" entry per core count for batch size b
results=()
for b in $(seq 1 5); do 
    for i in 1 2 4 8 12 16; do 
        python pb2sm_compile.py --batch_size=$b --neuroncore-pipeline-cores=$i | tee -a full_sweep.log;
        # The last line of the log so far is taken as this run's result
        # (presumably the 0/1/2 status printed last by pb2sm_compile.py - confirm)
        results[$b]+=", "`tail -1 full_sweep.log`
    done
done

# Header row: "batch, nc1, nc2, nc4, nc8, nc12, nc16"
head="batch"
for i in 1 2 4 8 12 16; do
    head+=", nc${i}"
done 
echo $head | tee -a full_sweep_results.txt
# One row per batch size: the batch size followed by its collected results
for b in $(seq 1 5); do 
    echo $b${results[$b]} | tee -a full_sweep_results.txt
done


*** Batch size 1, num NeuronCores 1 (input shape: (1, 224, 224, 3), saved model dir: rn50_fp16_compiled_b1_nc1) ***


INFO: Compilation finished in 58 seconds with 99.5% operations placed on Inferentia

1

*** Batch size 1, num NeuronCores 2 (input shape: (1, 224, 224, 3), saved model dir: rn50_fp16_compiled_b1_nc2) ***


INFO: Compilation finished in 57 seconds with 99.5% operations placed on Inferentia

1

*** Batch size 1, num NeuronCores 4 (input shape: (1, 224, 224, 3), saved model dir: rn50_fp16_compiled_b1_nc4) ***


INFO: Compilation finished in 59 seconds with 99.5% operations placed on Inferentia

1

*** Batch size 1, num NeuronCores 8 (input shape: (1, 224, 224, 3), saved model dir: rn50_fp16_compiled_b1_nc8) ***


INFO: Compilation finished in 67 seconds with 99.5% operations placed on Inferentia

2

*** Batch size 1, num NeuronCores 12 (input shape: (1, 224, 224, 3), saved model dir: rn50_fp16_compiled_b1_nc12) ***


INFO: Compilation finished in 74 seconds with 99.5% ope



2021-03-31 19:11:36.531880: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
2021-03-31 19:11:36.552257: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 3000015000 Hz
2021-03-31 19:11:36.552905: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x564a8eb687d0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2021-03-31 19:11:36.552926: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.simple_save.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.
2021-03-31 19:11:47.702042: I tensorf

#### You should see some output like this:
```
INFO: Compilation finished in 95 seconds with 99.5% operations placed on Inferentia

1

*** Batch size 1, num NeuronCores 2 (input shape: (1, 224, 224, 3), saved model dir: rn50_fp16_compiled_b1_nc2) ***

INFO: Compilation finished in 95 seconds with 99.5% operations placed on Inferentia

1

*** Batch size 1, num NeuronCores 4 (input shape: (1, 224, 224, 3), saved model dir: rn50_fp16_compiled_b1_nc4) ***

INFO: Compilation finished in 95 seconds with 99.5% operations placed on Inferentia

1

... (outputs removed)

*** Batch size 5, num NeuronCores 16 (input shape: (5, 224, 224, 3), saved model dir: rn50_fp16_compiled_b5_nc16) ***

ERROR: Compilation finished in 120 seconds with less than 50% operations placed on Inferentia (0.0%)

INFO: Retry compilation without static weights

ERROR: Retry compilation finished in 137 seconds with less than 50% operations placed on Inferentia (0.0%)

0

The file full_sweep_results.txt shows a summary of the sweep results with latest Neuron 1/27/20 release (0 means compilation unsuccessful and 0 ops mapped to Inferentia, 1 means most ops mapped to Inferentia and non-static weights, 2 means most ops mapped to Inferentia and using static weights):

batch, nc1, nc2, nc4, nc8, nc12, nc16
1, 1, 1, 1, 2, 2, 2
2, 1, 1, 0, 1, 2, 2
3, 1, 1, 1, 1, 1, 1
4, 1, 1, 0, 1, 1, 1
5, 1, 1, 0, 0, 0, 0
```


## 2. Run Inference on your Inf1.6xlarge and collect statistics.

#### Run inference over different batch sizes to obtain throughput and latency results for ResNet50 replicated on four NeuronCores. To apply dynamic batching, the user batch size is set to 10x the compiled batch size, in order to keep input queue full and to amortize framework-to-Neuron overhead.

In [5]:
# Runs infer_resnet50_keras_loadtest.py once per compiled batch size (1..5)
# and appends each run's output to batch.log.
!pip install pillow # Necessary for loading images
# NOTE(review): each `!` line presumably runs in its own subshell, so this `cd`
# would not change the directory used by the lines below - confirm; `%cd` may
# be what was intended
!cd ~/aws-neuron-sdk/src/examples/tensorflow/keras_resnet50/
# Reset batch.log (overwritten with a single empty line)
!echo "" > batch.log
!for i in $(seq 1 5); do python infer_resnet50_keras_loadtest.py --batch_size=$i | tee -a batch.log; done

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

2021-03-31 19:57:20.754159: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
2021-03-31 19:57:20.776257: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 3000015000 Hz
2021-03-31 19:57:20.777115: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x560deea36cc0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2021-03-31 19:57:20.777142: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
Instructions for 

Throughput values collected:
[7600, 7760, 7560, 7480, 7560, 7760, 7680, 7760, 7760, 7600, 7640, 7600, 7800]

Compiled batch size 4, user batch size 40, throughput stats (images/sec): max=7800 p99=7795 p50=7640, avg latency 166.5957 msec/user-batch

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

2021-03-31 19:59:09.304729: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
2021-03-31 19:59:09.328254: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 3000015000 Hz
2021-03-31 19:59:09.329135: I tensorflow/compiler/xla/service/service.cc:168] XLA service 

#### The file batch.log now contains the results for each batch size. The output should look something like this:

```
*** Compiled batch size 1, user batch size 10, num NeuronCores 1 (input shape: (10, 224, 224, 3), saved model dir: ./rn50_fp16_compiled_b1_nc1/1) ***

Instance type inf1.6xlarge with 16 NeuronCores
NEURON_MAX_NUM_INFERS (env): 2
NEURONCORE_GROUP_SIZES (env): 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
NUM THREADS:  32
NUM_LOOPS_PER_THREAD:  100
USER_BATCH_SIZE:  10
Throughput values collected:
[3110, 3120, 3100, 3080, 3140, 3120, 3130, 3110]

Compiled batch size 1, user batch size 10, throughput stats (images/sec): max=3140 p99=3139 p50=3115, avg latency 105.3192 sec/user-batch

*** Compiled batch size 2, user batch size 20, num NeuronCores 1 (input shape: (20, 224, 224, 3), saved model dir: ./rn50_fp16_compiled_b2_nc1/1) ***

Instance type inf1.6xlarge with 16 NeuronCores
NEURON_MAX_NUM_INFERS (env): 2
NEURONCORE_GROUP_SIZES (env): 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
NUM THREADS:  32
NUM_LOOPS_PER_THREAD:  100
USER_BATCH_SIZE:  20
Throughput values collected:
[5160, 5200, 5140, 5080, 5120, 5180, 5120, 5120, 5160, 5240]

Compiled batch size 2, user batch size 20, throughput stats (images/sec): max=5240 p99=5236 p50=5150, avg latency 127.9041 sec/user-batch

*** Compiled batch size 3, user batch size 30, num NeuronCores 1 (input shape: (30, 224, 224, 3), saved model dir: ./rn50_fp16_compiled_b3_nc1/1) ***

Instance type inf1.6xlarge with 16 NeuronCores
NEURON_MAX_NUM_INFERS (env): 2
NEURONCORE_GROUP_SIZES (env): 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
NUM THREADS:  32
NUM_LOOPS_PER_THREAD:  100
USER_BATCH_SIZE:  30
Throughput values collected:
[6030, 5670, 5940, 5820, 5850, 6090, 6000, 6120, 5820, 6180, 5790, 5820, 5790, 5760, 5790]

Compiled batch size 3, user batch size 30, throughput stats (images/sec): max=6180 p99=6171 p50=5820, avg latency 164.8427 sec/user-batch

*** Compiled batch size 4, user batch size 40, num NeuronCores 1 (input shape: (40, 224, 224, 3), saved model dir: ./rn50_fp16_compiled_b4_nc1/1) ***

Instance type inf1.6xlarge with 16 NeuronCores
NEURON_MAX_NUM_INFERS (env): 2
NEURONCORE_GROUP_SIZES (env): 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
NUM THREADS:  32
NUM_LOOPS_PER_THREAD:  100
USER_BATCH_SIZE:  40
Throughput values collected:
[6080, 6280, 6320, 6040, 6200, 6360, 6440, 6120, 6280, 6360, 6200, 5880, 6240, 5960, 6160, 6040, 6120, 6240, 6320]

Compiled batch size 4, user batch size 40, throughput stats (images/sec): max=6440 p99=6425 p50=6200, avg latency 209.3087 sec/user-batch

*** Compiled batch size 5, user batch size 50, num NeuronCores 1 (input shape: (50, 224, 224, 3), saved model dir: ./rn50_fp16_compiled_b5_nc1/1) ***

Instance type inf1.6xlarge with 16 NeuronCores
NEURON_MAX_NUM_INFERS (env): 2
NEURONCORE_GROUP_SIZES (env): 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
NUM THREADS:  32
NUM_LOOPS_PER_THREAD:  100
USER_BATCH_SIZE:  50
Throughput values collected:
[6350, 6300, 6400, 6450, 6400, 6350, 6450, 6350, 6450, 6150, 6200, 6550, 6550, 6450, 6550, 6400, 6550, 6400, 6350, 6350, 6500, 6550, 6300]

Compiled batch size 5, user batch size 50, throughput stats (images/sec): max=6550 p99=6550 p50=6400, avg latency 251.6603 sec/user-batch
```