## Tutorial: Getting started with torch-neuron (resnet-50 tutorial - infer steps in code)

**NOTE:** This notebook content represents the inference parts of the [getting started tutorial](./getting_started.md) - it is not intended to be used without reference to the tutorial. This is why we start at step 5 below :).

## Step 5: Run inference

In this step we run inference on Inf1 instances using the model compiled in Step 3 of [getting started compile](getting_started_compile.ipynb), which should have been copied to this machine.


In [None]:
import os
import time
import torch
import torch_neuron
import json
import numpy as np
from urllib import request
from torchvision import models, transforms, datasets
from time import time

## Download a small kitten picture into a fresh image directory
os.makedirs("./torch_neuron_test/images", exist_ok=True)
request.urlretrieve(
    "https://raw.githubusercontent.com/awslabs/mxnet-model-server/master/docs/images/kitten_small.jpg",
    "./torch_neuron_test/images/kitten_small.jpg",
)

## Download the class index file used to name the top classifications
request.urlretrieve(
    "https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json",
    "imagenet_class_index.json",
)

with open("imagenet_class_index.json", "r") as read_file:
    class_idx = json.load(read_file)

## Position k in the list holds the label stored under key str(k)
idx2label = [class_idx[str(k)][1] for k in range(len(class_idx))]

## Preprocessing pipeline: resize, convert to a tensor, then normalize
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
preprocess = transforms.Compose([
    transforms.Resize([224, 224]),
    transforms.ToTensor(),
    normalize,
])
eval_dataset = datasets.ImageFolder(
    os.path.dirname("./torch_neuron_test/"),
    preprocess,
)

## Take the first sample and give it a leading batch dimension of size 1
image, _ = eval_dataset[0]
image = torch.tensor(image.numpy()[None, ...])

## Load model
model_neuron = torch.jit.load('resnet50_neuron.pt')

## Warm-up call: the first inference also loads the model, so it is
## kept out of the timed section
results = model_neuron(image)

## Predict for 100 loops and measure the wall-clock time
loops = 100
start = time()
for _ in range(loops):
    results = model_neuron(image)
elapsed_time = time() - start
images_sec = loops / elapsed_time

# Get the top 5 results: sort() orders ascending, so the last five
# indices belong to the five highest scores
_scores, ranked_idx = results[0].sort()
top5_idx = ranked_idx[-5:]

# Lookup and print the top 5 labels
top5_labels = [idx2label[idx] for idx in top5_idx]

print("Top 5 labels:\n {}".format(top5_labels))
print("Completed {} operations in {} seconds => {} images / second".format(
    loops, round(elapsed_time, 2), round(images_sec, 0)))


## Step 6: Run on parallel neuron cores

To fully leverage the Inferentia hardware we want to use all the cores. On an inf1.xlarge or inf1.2xlarge we need to use 4. Here we use the futures library to create a simple class that runs four parallel inference threads.


In [None]:
from concurrent import futures
import torch
import torch.neuron
import os

class NeuronSimpleDataParallel():
    """Run copies of one TorchScript model in parallel threads, one per core.

    The input batch is split along dimension 0 into ``num_neuron_cores``
    equal slices, each slice is passed to its own copy of the model on a
    thread pool, and the per-core outputs are concatenated back along
    dimension 0 in the same order as the input rows.

    Args:
        model_file: Path handed to ``torch.jit.load`` (once per core).
        num_neuron_cores: Number of model copies / worker threads.
        batch_size: Expected leading dimension of each per-core output.
            An output whose first dimension differs from this value (or
            that is zero dimensional) is unsqueezed before concatenation.
    """

    class _SimpleWrapper():
        """Wrap a model so that calling it always returns a mutable list."""

        def __init__(self, model):
            self.model = model

        def eval(self):
            self.model.eval()

        def train(self):
            self.model.train()

        def __call__(self, *args):
            results = self.model(*args)

            # Tuples and lists are copied into a fresh list so the caller
            # can replace individual outputs; anything else is treated as
            # a single output and wrapped.
            if isinstance(results, (tuple, list)):
                return list(results)

            return [results]

    def __init__(self, model_file, num_neuron_cores, batch_size=1):
        self.num_neuron_cores = num_neuron_cores
        self.batch_size = batch_size

        # Construct a list of models - one independent copy per core
        self.models = [self._SimpleWrapper(torch.jit.load(model_file))
                       for _ in range(num_neuron_cores)]

        ## Important - please read:
        ##     https://github.com/aws/aws-neuron-sdk/blob/master/docs/tensorflow-neuron/tutorial-NeuronCore-Group.md
        ## For four cores we use
        ##     os.environ['NEURONCORE_GROUP_SIZES'] = "1,1,1,1"
        ## when launching four threads
        ## If this logic exists in worker processes, each process should use
        ##     os.environ['NEURONCORE_GROUP_SIZES'] = "1"
        nc_env = ','.join(['1'] * num_neuron_cores)
        os.environ['NEURONCORE_GROUP_SIZES'] = nc_env

        self.executor = futures.ThreadPoolExecutor(
            max_workers=self.num_neuron_cores)

    def eval(self):
        """Call ``eval()`` on every model copy."""
        for m in self.models:
            m.eval()

    def train(self):
        """Call ``train()`` on every model copy."""
        for m in self.models:
            m.train()

    def __call__(self, *args):
        """Split ``args`` across the cores, run them, and merge the outputs.

        Args:
            *args: Tensors whose first dimension is a multiple of
                ``num_neuron_cores``.

        Returns:
            For a single-output model, one tensor with the per-core outputs
            concatenated along dimension 0 in input order. For a model with
            several outputs, a tuple with one such tensor per output.

        Raises:
            AssertionError: If an argument is not a tensor or its first
                dimension is not divisible by ``num_neuron_cores``.
        """
        assert all(isinstance(a, torch.Tensor)
                   for a in args), "Non tensor input - tensors are needed to generate batches"
        assert all(a.shape[0] % self.num_neuron_cores ==
                   0 for a in args), "Batch size must be even multiple of the number of parallel neuron cores"

        args_per_core = [[] for _ in range(self.num_neuron_cores)]

        # Split every argument into one contiguous slice (a view) per core
        for a in args:
            step_size = a.shape[0] // self.num_neuron_cores
            for i in range(self.num_neuron_cores):
                args_per_core[i].append(a[i * step_size:(i + 1) * step_size])

        # Launch all cores first so they run concurrently ...
        running = [self.executor.submit(model, *core_args)
                   for model, core_args in zip(self.models, args_per_core)]

        # ... then collect in submission order. result() blocks until that
        # future is done, so the merged output rows stay aligned with the
        # input rows regardless of which core finishes first.
        results = [list(future.result()) for future in running]

        # Unsqueeze outputs that are zero dimensional or do not look batched
        # (i.e. first dim does not match the per-core batch size)
        for core_outputs in results:
            for ir, out in enumerate(core_outputs):
                if out.dim() == 0 or out.shape[0] != self.batch_size:
                    core_outputs[ir] = torch.unsqueeze(out, 0)

        # Concatenate each output position across the cores
        merged = [torch.cat(per_output, 0) for per_output in zip(*results)]

        return merged[0] if len(merged) == 1 else tuple(merged)

In [None]:
import os
from time import time
import torch
import torch_neuron
import json
import numpy as np
from urllib import request
from torchvision import models, transforms, datasets
from parallel import NeuronSimpleDataParallel

## Assuming you are working on an inf1.xlarge or inf1.2xlarge
num_neuron_cores = 4

## Download a small kitten picture into a fresh image directory
os.makedirs("./torch_neuron_test/images", exist_ok=True)
request.urlretrieve(
    "https://raw.githubusercontent.com/awslabs/mxnet-model-server/master/docs/images/kitten_small.jpg",
    "./torch_neuron_test/images/kitten_small.jpg",
)

## Download the class index file used to name the top classifications
request.urlretrieve(
    "https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json",
    "imagenet_class_index.json",
)

with open("imagenet_class_index.json", "r") as read_file:
    class_idx = json.load(read_file)

## Position k in the list holds the label stored under key str(k)
idx2label = [class_idx[str(k)][1] for k in range(len(class_idx))]

## Preprocessing pipeline: resize, convert to a tensor, then normalize
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
preprocess = transforms.Compose([
    transforms.Resize([224, 224]),
    transforms.ToTensor(),
    normalize,
])
eval_dataset = datasets.ImageFolder(
    os.path.dirname("./torch_neuron_test/"),
    preprocess,
)

## Take the first sample and give it a leading batch dimension of size 1
image, _ = eval_dataset[0]
image = torch.tensor(image.numpy()[None, ...])

## Load model - one copy per core behind a single callable
model_neuron = NeuronSimpleDataParallel('resnet50_neuron.pt', num_neuron_cores)

## Create a "batch" image with enough images to go on each of the four cores
## (the same image repeated once per core)
batch_image = torch.cat([image] * num_neuron_cores, 0)

print(batch_image.shape)

## Warm-up call: the first inference also loads the model to the chip, so it
## is kept out of the timed section
results = model_neuron(batch_image)

## Predict and measure the wall-clock time
loops = 100
start = time()
for _ in range(loops):
    results = model_neuron(batch_image)
elapsed_time = time() - start
total_images = loops * batch_image.size(0)
images_sec = total_images / elapsed_time

# Get the top 5 results for the first image: sort() orders ascending, so
# the last five indices belong to the five highest scores
_scores, ranked_idx = results[0].sort()
top5_idx = ranked_idx[-5:]

# Lookup and print the top 5 labels
top5_labels = [idx2label[idx] for idx in top5_idx]
print("Top 5 labels:\n {}".format(top5_labels))
print("Completed {} operations in {} seconds => {} images / second".format(
    total_images, round(elapsed_time, 2), round(images_sec, 0)))

## Step 7: Experiment with different batch sizes:

Now that we are using all four cores we can experiment with compiling and running larger batch sizes on each of our four cores.


7.2 Modify the inference code

In [None]:
import os
from time import time
import torch
import torch_neuron
import json
import numpy as np
from urllib import request
from torchvision import models, transforms, datasets
from parallel import NeuronSimpleDataParallel

## Assuming you are working on an inf1.xlarge or inf1.2xlarge
num_neuron_cores = 4
batch_size = 5

## Download a small kitten picture into a fresh image directory
os.makedirs("./torch_neuron_test/images", exist_ok=True)
request.urlretrieve(
    "https://raw.githubusercontent.com/awslabs/mxnet-model-server/master/docs/images/kitten_small.jpg",
    "./torch_neuron_test/images/kitten_small.jpg",
)

## Download the class index file used to name the top classifications
request.urlretrieve(
    "https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json",
    "imagenet_class_index.json",
)

with open("imagenet_class_index.json", "r") as read_file:
    class_idx = json.load(read_file)

## Position k in the list holds the label stored under key str(k)
idx2label = [class_idx[str(k)][1] for k in range(len(class_idx))]

## Preprocessing pipeline: resize, convert to a tensor, then normalize
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
preprocess = transforms.Compose([
    transforms.Resize([224, 224]),
    transforms.ToTensor(),
    normalize,
])
eval_dataset = datasets.ImageFolder(
    os.path.dirname("./torch_neuron_test/"),
    preprocess,
)

## Take the first sample and give it a leading batch dimension of size 1
image, _ = eval_dataset[0]
image = torch.tensor(image.numpy()[None, ...])

## Load model - the file name carries the batch size it was compiled for
model_file = 'resnet50_neuron_b{}.pt'.format(batch_size)
model_neuron = NeuronSimpleDataParallel(
    model_file, num_neuron_cores, batch_size=batch_size)

## Create a "batch" image with enough images to go on each of the four cores
## (batch_size copies of the same image per core)
batch_image = torch.cat([image] * (num_neuron_cores * batch_size), 0)

## Warm-up call: the first inference also loads the model to the chip, so it
## is kept out of the timed section
results = model_neuron(batch_image)

## Predict and measure the wall-clock time
loops = 100
start = time()
for _ in range(loops):
    results = model_neuron(batch_image)
elapsed_time = time() - start
total_images = loops * batch_image.size(0)
images_sec = total_images / elapsed_time

# Get the top 5 results for the first image: sort() orders ascending, so
# the last five indices belong to the five highest scores
_scores, ranked_idx = results[0].sort()
top5_idx = ranked_idx[-5:]

# Lookup and print the top 5 labels
top5_labels = [idx2label[idx] for idx in top5_idx]
print("Top 5 labels:\n {}".format(top5_labels))
print("Completed {} operations in {} seconds => {} images / second".format(
    total_images, round(elapsed_time, 2), round(images_sec, 0)))