### 0. PREPARING ENVIRONMENT VARIABLES

In [36]:
# Setting up env variables for cleaner command line commands.
import os

# Values consumed by the `tao` commands in later cells. KEY is passed as the `-k`
# argument; USER_EXPERIMENT_DIR and DATA_DOWNLOAD_DIR are paths under /workspace,
# i.e. paths as seen inside the TAO docker (see the mounts cell).
%env KEY=nvidia_tlt
%env NUM_GPUS=1
%env USER_EXPERIMENT_DIR=/workspace/faceclassification
%env DATA_DOWNLOAD_DIR=/workspace/data

# Set this path if you don't run the notebook from the samples directory.
# %env NOTEBOOK_ROOT=~/tao-samples/classification

# Please define this local project directory that needs to be mapped to the TAO docker session.
# The dataset is expected to be present in $LOCAL_PROJECT_DIR/data, while the results for the
# steps in this notebook will be stored at $LOCAL_PROJECT_DIR/faceclassification.
# !PLEASE MAKE SURE TO UPDATE THIS PATH!
os.environ["LOCAL_PROJECT_DIR"] = "/workspace"

# Host-side data and experiment directories, both derived from LOCAL_PROJECT_DIR
# (falling back to the current working directory if it is unset).
os.environ["LOCAL_DATA_DIR"] = os.path.join(
    os.getenv("LOCAL_PROJECT_DIR", os.getcwd()),
    "data"
)
os.environ["LOCAL_EXPERIMENT_DIR"] = os.path.join(
    os.getenv("LOCAL_PROJECT_DIR", os.getcwd()),
    "faceclassification"
)

# The spec files are expected in the "specs" sub-directory of the local experiment directory.
os.environ["LOCAL_SPECS_DIR"] = os.path.join(
    os.getenv("LOCAL_EXPERIMENT_DIR", os.getcwd()),
    "specs"
)
# Container-side path of the specs directory; used as the mount destination for
# LOCAL_SPECS_DIR in the mounts cell and as the spec location in the `tao` commands.
%env SPECS_DIR=/workspace/faceclassification/specs

# Showing list of specification files.
!ls -rlt $LOCAL_SPECS_DIR

env: KEY=nvidia_tlt
env: NUM_GPUS=1
env: USER_EXPERIMENT_DIR=/workspace/faceclassification
env: DATA_DOWNLOAD_DIR=/workspace/data
env: SPECS_DIR=/workspace/faceclassification/specs
total 12
-rw-r--r-- 1 root root 1272 Aug  8 04:51 classification_spec.cfg
-rw-r--r-- 1 root root 1142 Aug  8 04:51 classification_retrain_spec.cfg
-rw-r--r-- 1 root root 1247 Aug  9 04:03 classification_mobilenet.cfg


In [6]:
# Mapping up the local directories to the TAO docker.
import json
import os

# Location of the mounts file written by this cell (in the user's home directory).
mounts_file = os.path.expanduser("~/.tao_mounts.json")

# Local project directory, exposed as /workspace inside the container.
project_mount = {
    "source": os.environ["LOCAL_PROJECT_DIR"],
    "destination": "/workspace",
}

# Local specs directory, exposed at the container-side SPECS_DIR path.
specs_mount = {
    "source": os.environ["LOCAL_SPECS_DIR"],
    "destination": os.environ["SPECS_DIR"],
}

# Docker "user" option built from the current process's uid and gid.
docker_options = {
    "user": ":".join(str(ident) for ident in (os.getuid(), os.getgid())),
}

# Assemble the full mapping in the same key order as the file layout.
drive_map = {
    "Mounts": [project_mount, specs_mount],
    "DockerOptions": docker_options,
}

# Writing the mounts file.
with open(mounts_file, "w") as mfile:
    mfile.write(json.dumps(drive_map, indent=4))

In [7]:
# Print the generated mounts file to verify the source/destination mappings.
!cat ~/.tao_mounts.json

{
    "Mounts": [
        {
            "source": "/workspace",
            "destination": "/workspace"
        },
        {
            "source": "/workspace/faceclassification/specs",
            "destination": "/workspace/faceclassification/specs"
        }
    ],
    "DockerOptions": {
        "user": "0:0"
    }
}

In [8]:
# Show the configuration of the installed TAO launcher (dockers, toolkit version).
!tao info

Configuration of the TAO Toolkit Instance
dockers: ['nvidia/tao/tao-toolkit-tf', 'nvidia/tao/tao-toolkit-pyt', 'nvidia/tao/tao-toolkit-lm']
format_version: 2.0
toolkit_version: 3.22.05
published_date: 05/25/2022


### 1. DATASET PREPARATION

#### 1.1 Downloading Pre-Trained Model

In [9]:
# Installing NGC CLI on the local machine.
## Download and install
%env CLI=ngccli_cat_linux.zip
!mkdir -p $LOCAL_PROJECT_DIR/ngccli

# Remove any previously existing CLI installations
!rm -rf $LOCAL_PROJECT_DIR/ngccli/*
!wget "https://ngc.nvidia.com/downloads/$CLI" -P $LOCAL_PROJECT_DIR/ngccli
!unzip -u "$LOCAL_PROJECT_DIR/ngccli/$CLI" -d $LOCAL_PROJECT_DIR/ngccli/
!rm $LOCAL_PROJECT_DIR/ngccli/*.zip 
os.environ["PATH"]="{}/ngccli:{}".format(os.getenv("LOCAL_PROJECT_DIR", ""), os.getenv("PATH", ""))

env: CLI=ngccli_cat_linux.zip
--2022-08-08 06:21:57--  https://ngc.nvidia.com/downloads/ngccli_cat_linux.zip
Resolving ngc.nvidia.com (ngc.nvidia.com)... 52.84.251.128, 52.84.251.90, 52.84.251.5, ...
Connecting to ngc.nvidia.com (ngc.nvidia.com)|52.84.251.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33992208 (32M) [application/zip]
Saving to: ‘/workspace/ngccli/ngccli_cat_linux.zip’


2022-08-08 06:22:10 (4.34 MB/s) - ‘/workspace/ngccli/ngccli_cat_linux.zip’ saved [33992208/33992208]

Archive:  /workspace/ngccli/ngccli_cat_linux.zip
   creating: /workspace/ngccli/ngc-cli/
   creating: /workspace/ngccli/ngc-cli/multidict/
  inflating: /workspace/ngccli/ngc-cli/multidict/_multidict.cpython-39-x86_64-linux-gnu.so  
  inflating: /workspace/ngccli/ngc-cli/libpython3.9.so.1.0  
  inflating: /workspace/ngccli/ngc-cli/libkrb5support.so.0  
  inflating: /workspace/ngccli/ngc-cli/libgssapi_krb5.so.2  
 extracting: /workspace/ngccli/ngc-cli/base_library.zip  
   c

In [11]:
!../ngccli/ngc-cli/ngc registry model list nvidia/tao/pretrained_classification:*

+-------+-------+-------+-------+-------+-------+-------+-------+-------+
| Versi | Accur | Epoch | Batch | GPU   | Memor | File  | Statu | Creat |
| on    | acy   | s     | Size  | Model | y Foo | Size  | s     | ed    |
|       |       |       |       |       | tprin |       |       | Date  |
|       |       |       |       |       | t     |       |       |       |
+-------+-------+-------+-------+-------+-------+-------+-------+-------+
| vgg19 | 77.56 | 80    | 1     | V100  | 153.7 | 153.7 | UPLOA | Aug   |
|       |       |       |       |       |       | 2 MB  | D_COM | 18,   |
|       |       |       |       |       |       |       | PLETE | 2021  |
| vgg16 | 77.17 | 80    | 1     | V100  | 113.2 | 113.1 | UPLOA | Aug   |
|       |       |       |       |       |       | 6 MB  | D_COM | 18,   |
|       |       |       |       |       |       |       | PLETE | 2021  |
| squee | 65.13 | 80    | 1     | V100  | 6.5   | 6.46  | UPLOA | Aug   |
| zenet |       |       |       |     

In [59]:
# Create the host-side directory that will receive the pretrained ResNet-50 download.
!mkdir -p $LOCAL_EXPERIMENT_DIR/pretrained_resnet50/

In [60]:
# Pull pretrained model from NGC
!../ngccli/ngc-cli/ngc registry model download-version nvidia/tao/pretrained_classification:resnet50 \
    --dest $LOCAL_EXPERIMENT_DIR/pretrained_resnet50

Downloaded 272.54 MB in 1m 54s, Download speed: 2.39 MB/s               
--------------------------------------------------------------------------------
   Transfer id: pretrained_classification_vresnet50
   Download status: Completed
   Downloaded local path: /workspace/faceclassification/pretrained_resnet50/pretrained_classification_vresnet50
   Total files downloaded: 1
   Total downloaded size: 272.54 MB
   Started at: 2022-08-09 07:06:40.436218
   Completed at: 2022-08-09 07:08:34.611480
   Duration taken: 1m 54s
--------------------------------------------------------------------------------


### 2. TRAINING

#### 2.1 Creating config file for training

In [16]:
# Display the spec file used for the MobileNet training run.
# NOTE(review): the captured output shows arch "resnet" with 18 layers and
# /workspace/tao-experiments paths - confirm the file contents match this project.
!cat $LOCAL_SPECS_DIR/classification_mobilenet.cfg

model_config {
  arch: "resnet",
  n_layers: 18
  # Setting these parameters to true to match the template downloaded from NGC.
  use_batch_norm: true
  all_projections: true
  freeze_blocks: 0
  freeze_blocks: 1
  input_image_size: "3,224,224"
}
train_config {
  train_dataset_path: "/workspace/tao-experiments/data/split/train"
  val_dataset_path: "/workspace/tao-experiments/data/split/val"
  pretrained_model_path: "/workspace/tao-experiments/classification/pretrained_resnet18/pretrained_classification_vresnet18/resnet_18.hdf5"
  optimizer {
    sgd {
    lr: 0.01
    decay: 0.0
    momentum: 0.9
    nesterov: False
  }
}
  batch_size_per_gpu: 64
  n_epochs: 80
  n_workers: 16
  preprocess_mode: "caffe"
  enable_random_crop: True
  enable_center_crop: True
  label_smoothing: 0.0
  mixup_alpha: 0.1
  # regularizer
  reg_config {
    type: "L2"
    scope: "Conv2D,Dense"
    weight_decay: 0.00005
  }

  # learning_rate
  lr_config {
    step {
      learning_rate: 0.006
      step_size: 1

#### 2.2 Train the pre-trained model

In [64]:
# Train with classification_spec.cfg; results are written to $USER_EXPERIMENT_DIR/output.
# NOTE(review): the MobileNet training cell uses the same results directory -
# confirm the two runs are not meant to be kept separately.
!tao classification train -e $SPECS_DIR/classification_spec.cfg -r $USER_EXPERIMENT_DIR/output -k $KEY

2022-08-09 07:32:21,612 [INFO] root: Registry: ['nvcr.io']
2022-08-09 07:32:21,765 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit-tf:v3.22.05-tf1.15.5-py3
Using TensorFlow backend.
Using TensorFlow backend.
2022-08-09 07:32:31,005 [INFO] __main__: Loading experiment spec at /workspace/faceclassification/specs/classification_spec.cfg.




2022-08-09 07:32:31,698 [INFO] iva.common.logging.logging: Log file already exists at /workspace/faceclassification/output/status.json




2022-08-09 07:32:31,699 [INFO] __main__: Default image mean [103.939, 116.779, 123.68] will be used.
Found 1465 images belonging to 3 classes.
2022-08-09 07:32:31,810 [INFO] __main__: Processing dataset (train): /workspace/data/train
Found 203 images belonging to 3 classes.
2022-08-09 07:32:31,915 [INFO] __main__: Processing dataset (validation): /workspace/data/val




















_______________________________________________________________

In [40]:
# Train with classification_mobilenet.cfg; results are written to $USER_EXPERIMENT_DIR/output.
# The pruning cell later reads its input weights from this results directory.
!tao classification train -e $SPECS_DIR/classification_mobilenet.cfg -r $USER_EXPERIMENT_DIR/output -k $KEY

2022-08-09 04:31:38,292 [INFO] root: Registry: ['nvcr.io']
2022-08-09 04:31:38,451 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit-tf:v3.22.05-tf1.15.5-py3
Using TensorFlow backend.
Using TensorFlow backend.
2022-08-09 04:31:48,265 [INFO] __main__: Loading experiment spec at /workspace/faceclassification/specs/classification_mobilenet.cfg.




2022-08-09 04:31:48,963 [INFO] iva.common.logging.logging: Log file already exists at /workspace/faceclassification/output/status.json




2022-08-09 04:31:48,964 [INFO] __main__: Default image mean [103.939, 116.779, 123.68] will be used.
Found 1465 images belonging to 3 classes.
2022-08-09 04:31:49,077 [INFO] __main__: Processing dataset (train): /workspace/data/train
Found 203 images belonging to 3 classes.
2022-08-09 04:31:49,182 [INFO] __main__: Processing dataset (validation): /workspace/data/val




















__________________________________________________________

In [45]:
# Run evaluation driven by classification_googlenet.cfg.
# NOTE(review): presumably the model and dataset to evaluate are named inside the spec - confirm.
!tao classification evaluate -e $SPECS_DIR/classification_googlenet.cfg -k $KEY

2022-08-09 05:01:53,360 [INFO] root: Registry: ['nvcr.io']
2022-08-09 05:01:53,523 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit-tf:v3.22.05-tf1.15.5-py3
Using TensorFlow backend.
Using TensorFlow backend.








INFO: Loading experiment spec at /workspace/faceclassification/specs/classification_googlenet.cfg.






























__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 3, 224, 224)  0                                            
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 64, 112, 112) 9408        input_1[0][0]                    
____________________________________________________________________________________________

In [44]:
# Run evaluation driven by classification_mobilenet.cfg.
# NOTE(review): presumably the model and dataset to evaluate are named inside the spec - confirm.
!tao classification evaluate -e $SPECS_DIR/classification_mobilenet.cfg -k $KEY

2022-08-09 04:50:54,454 [INFO] root: Registry: ['nvcr.io']
2022-08-09 04:50:54,620 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit-tf:v3.22.05-tf1.15.5-py3
Using TensorFlow backend.
Using TensorFlow backend.








INFO: Loading experiment spec at /workspace/faceclassification/specs/classification_mobilenet.cfg.




























_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 3, 224, 224)       0         
_________________________________________________________________
conv1_pad (ZeroPadding2D)    (None, 3, 226, 226)       0         
_________________________________________________________________
conv1 (Conv2D)               (None, 32, 112, 112)      864       
_________________________________________________________________
conv1_bn (BatchNormalization (None, 32, 112, 112)      128   

#### 2.3 Pruning the model

In [48]:
# Defining the checkpoint epoch number of the model to be used for pruning.
# This must not exceed the number of epochs training actually ran for, in case
# training was interrupted early.
# By default, the final model is at epoch 080.
%env EPOCH=080
# The output directory is created through the host-side path (LOCAL_EXPERIMENT_DIR);
# the prune command below refers to the same location through the container-side
# path (USER_EXPERIMENT_DIR).
!mkdir -p $LOCAL_EXPERIMENT_DIR/output/mobilenetv1_pruned
# Prune the trained MobileNet checkpoint for the chosen epoch and write the pruned model.
# NOTE(review): -eq / -pth are presumably the equalization criterion and pruning
# threshold - confirm against the TAO prune documentation before tuning.
!tao classification prune -m $USER_EXPERIMENT_DIR/output/weights/mobilenet_v1_$EPOCH.tlt \
           -o $USER_EXPERIMENT_DIR/output/mobilenetv1_pruned/mobilenet_v1_nopool_bn_pruned.tlt \
           -eq union \
           -pth 0.6 \
           -k $KEY \
           --results_dir $USER_EXPERIMENT_DIR/logs

env: EPOCH=080
2022-08-09 05:07:32,657 [INFO] root: Registry: ['nvcr.io']
2022-08-09 05:07:32,820 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit-tf:v3.22.05-tf1.15.5-py3
Using TensorFlow backend.
Using TensorFlow backend.
2022-08-09 05:07:41,314 [INFO] iva.common.logging.logging: Log file already exists at /workspace/faceclassification/logs/status.json
2022-08-09 05:07:44,671 [INFO] modulus.pruning.pruning: Exploring graph for retainable indices
2022-08-09 05:07:45,582 [INFO] modulus.pruning.pruning: Pruning model and appending pruned nodes to new graph
2022-08-09 05:08:13,560 [INFO] iva.common.magnet_prune: Pruning ratio (pruned model / original model): 0.5404517226346166
2022-08-09 05:08:13,824 [INFO] root: Pruning ratio (pruned model / original model): 0.5404517226346166
2022-08-09 05:08:15,430 [INFO] tlt.components.docker_handler.docker_handler: Stopping container.


In [51]:
# Retrain using classification_retrain_mobilenet.cfg; results are written to
# $USER_EXPERIMENT_DIR/output_retrain.
# NOTE(review): presumably the spec points at the pruned model produced above - confirm.
!tao classification train -e $SPECS_DIR/classification_retrain_mobilenet.cfg \
                      -r $USER_EXPERIMENT_DIR/output_retrain \
                      -k $KEY

2022-08-09 05:49:59,480 [INFO] root: Registry: ['nvcr.io']
2022-08-09 05:49:59,622 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit-tf:v3.22.05-tf1.15.5-py3
Using TensorFlow backend.
Using TensorFlow backend.
2022-08-09 05:50:08,762 [INFO] __main__: Loading experiment spec at /workspace/faceclassification/specs/classification_retrain_mobilenet.cfg.




2022-08-09 05:50:09,451 [INFO] iva.common.logging.logging: Log file already exists at /workspace/faceclassification/output_retrain/status.json




2022-08-09 05:50:09,451 [INFO] __main__: Default image mean [103.939, 116.779, 123.68] will be used.
Found 1465 images belonging to 3 classes.
2022-08-09 05:50:09,563 [INFO] __main__: Processing dataset (train): /workspace/data/train
Found 203 images belonging to 3 classes.
2022-08-09 05:50:09,669 [INFO] __main__: Processing dataset (validation): /workspace/data/val




















__________________________________________

In [53]:
# Run evaluation driven by classification_retrain_mobilenet.cfg.
# NOTE(review): presumably the retrained model is named inside the spec - confirm.
!tao classification evaluate -e $SPECS_DIR/classification_retrain_mobilenet.cfg -k $KEY

2022-08-09 06:22:46,927 [INFO] root: Registry: ['nvcr.io']
2022-08-09 06:22:47,064 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit-tf:v3.22.05-tf1.15.5-py3
Using TensorFlow backend.
Using TensorFlow backend.








INFO: Loading experiment spec at /workspace/faceclassification/specs/classification_retrain_mobilenet.cfg.




























_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 3, 224, 224)       0         
_________________________________________________________________
conv1_pad (ZeroPadding2D)    (None, 3, 226, 226)       0         
_________________________________________________________________
conv1 (Conv2D)               (None, 16, 112, 112)      432       
_________________________________________________________________
conv1_bn (BatchNormalization (None, 16, 112, 112)    

In [54]:
# Export the retrained checkpoint for the chosen EPOCH to an .etlt file in the export directory.
!tao classification export \
            -m $USER_EXPERIMENT_DIR/output_retrain/weights/mobilenet_v1_$EPOCH.tlt \
            -o $USER_EXPERIMENT_DIR/export/face_classifier_mobilenet_model.etlt \
            -k $KEY

2022-08-09 06:24:23,409 [INFO] root: Registry: ['nvcr.io']
2022-08-09 06:24:23,543 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit-tf:v3.22.05-tf1.15.5-py3
Using TensorFlow backend.
Using TensorFlow backend.
Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_NVVM=/usr/local/cuda/nvvm/lib64/libnvvm.so.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')
Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_LIBDEVICE=/usr/local/cuda/nvvm/libdevice/.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')
2022-08-09 06:24:35,736 [INFO] iva.common.export.keras_exporter: Using input nodes: ['input_1']
2022-08-09 06:24:35,736 [INFO] iva.common.expor

In [55]:
# Remove the pre-existing exported .etlt file.
!rm -rf $LOCAL_EXPERIMENT_DIR/export/final_model.etlt
# Export the same retrained checkpoint again as final_model.etlt, this time with
# --data_type int8. The calibration tensor file and calibration cache file are
# written next to the exported model, and the class map from the retrain results
# directory is passed in via --classmap_json.
!tao classification export \
            -m $USER_EXPERIMENT_DIR/output_retrain/weights/mobilenet_v1_$EPOCH.tlt \
            -o $USER_EXPERIMENT_DIR/export/final_model.etlt \
            -k $KEY \
            --cal_data_file $USER_EXPERIMENT_DIR/export/calibration.tensor \
            --data_type int8 \
            --batches 10 \
            --cal_cache_file $USER_EXPERIMENT_DIR/export/final_model_int8_cache.bin \
            --classmap_json $USER_EXPERIMENT_DIR/output_retrain/classmap.json \
            --gen_ds_config \
            -v

2022-08-09 06:27:28,926 [INFO] root: Registry: ['nvcr.io']
2022-08-09 06:27:29,075 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit-tf:v3.22.05-tf1.15.5-py3
Using TensorFlow backend.
Using TensorFlow backend.
Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_NVVM=/usr/local/cuda/nvvm/lib64/libnvvm.so.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')
Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_LIBDEVICE=/usr/local/cuda/nvvm/libdevice/.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')
2022-08-09 06:27:40,832 [INFO] iva.common.export.keras_exporter: Using input nodes: ['input_1']
2022-08-09 06:27:40,833 [INFO] iva.common.expor

In [56]:
# Convert the exported final_model.etlt into an engine file (final_model.trt).
# NOTE(review): no calibration cache or INT8 data-type option is passed here, so the
# cache produced by the int8 export is not used by this command - confirm the
# intended engine precision.
!tao converter $USER_EXPERIMENT_DIR/export/final_model.etlt \
               -k $KEY \
               -o predictions/Softmax \
               -d 3,224,224 \
               -i nchw \
               -e $USER_EXPERIMENT_DIR/export/final_model.trt \
               -b 64

2022-08-09 06:58:19,881 [INFO] root: Registry: ['nvcr.io']
2022-08-09 06:58:20,036 [INFO] tlt.components.instance_handler.local_instance: Running command in container: nvcr.io/nvidia/tao/tao-toolkit-tf:v3.22.05-tf1.15.5-py3
[INFO] [MemUsageChange] Init CUDA: CPU +195, GPU +0, now: CPU 206, GPU 375 (MiB)
[INFO] [MemUsageSnapshot] Begin constructing builder kernel library: CPU 206 MiB, GPU 375 MiB
[INFO] [MemUsageSnapshot] End constructing builder kernel library: CPU 270 MiB, GPU 375 MiB
[INFO] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +264, GPU +110, now: CPU 555, GPU 485 (MiB)
[INFO] [MemUsageChange] Init cuDNN: CPU +111, GPU +44, now: CPU 666, GPU 529 (MiB)
[INFO] Local timing cache in use. Profiling results in this builder pass will not be stored.
[INFO] Some tactics do not have sufficient workspace memory to run. Increasing workspace size may increase performance, please check verbose output.
[INFO] Detected 1 inputs and 1 output network tensors.
[INFO] Total Host Persistent Memory