# [모듈 2.1] Inference NCF on INF2 - Benchmarking

# 1. 환경 셋업

## 1.1. 기본 세팅
사용하는 패키지는 import 시점에 자동으로 다시 로딩합니다.

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('./src')

필요한 torch_neuronx 를 로딩 합니다.

In [2]:
import torch
import torch_neuronx

# 2. 훈련된 모델 로딩

## 훈련 모델 아티팩트 확인

- 파이토치로 이미 훈련된 모델 아티팩트의 경로를 지정합니다.

In [3]:
# Location of the trained model checkpoint.
# NOTE(review): in the visible notebook this variable is only printed; the model
# is later loaded via config.model_path — confirm both point to the same file.
artifact_path = 'models/NeuMF-end.pth'
print("model artifact is assigend from : ", artifact_path)

model artifact is assigend from :  models/NeuMF-end.pth


## 모델 로딩에 필요한 설정 파일 생성

- 모델 로딩 시 필요한 파라미터를 설정합니다 (기존 값을 그대로 사용).

In [4]:
import json
import os
import config
from common_utils import save_json, load_json

class Params:
    """Container for the three model hyperparameters used by this notebook."""

    def __init__(self):
        # Fixed values; all three attributes are set on the instance at once.
        self.factor_num, self.num_layers, self.dropout = 32, 3, 0.0
                        
# Instantiate the hyperparameter holder and echo one value as a sanity check.
args = Params()
print("# of num_layers: ", args.num_layers)


# Number of users and items that were fixed when the model was trained
user_num = 6040  
item_num = 3706
print("user_num: ", user_num, " item_num: ", item_num)

# Numeric settings are converted to strings before being written to the config;
# 'model_type' is taken as-is from the project's config module.
model_config_dict = {
    'user_num': str(user_num),
    'item_num': str(item_num),
    'factor_num' : str(args.factor_num),
    'num_layers' : str(args.num_layers),
    'dropout' : str(args.dropout),
    'model_type': config.model
}

# The config file is written under the src/ directory.
model_config_file = 'model_config.json'
model_config_file_path = os.path.join('src', model_config_file)

save_json(model_config_file_path, model_config_dict)
# To inspect what was written, reload the file:
# model_config_dict = load_json(model_config_file_path)    
# model_config_dict

# of num_layers:  3
user_num:  6040  item_num:  3706
src/model_config.json is saved


'src/model_config.json'

## 모델 로딩
- 모델 로딩 함수 model_fn() 를 통하여 모델 로딩


In [5]:
# model_fn comes from the project's inference module.
from inference import model_fn

# Load the trained model from the path configured in config.model_path.
ncf_model = model_fn(config.model_path)

######## Staring model_fn() ###############
device:  cpu


# 3. 모델 컴파일

## 샘플 입력 생성

In [6]:
import numpy as np
import torch

def create_dummy_input(batch_size):
    """Build a sample ``(user, item)`` input pair for tracing and benchmarking.

    One row of 100 entries is created for each tensor and then repeated
    ``batch_size`` times along dimension 0, so every row in a batch is
    identical.

    Args:
        batch_size: Number of times the single sample row is repeated.

    Returns:
        A tuple ``(user, item)`` of int32 tensors, each of shape
        ``(batch_size, 100)``. ``user`` is all zeros; ``item`` holds random
        integers drawn from ``[1, 1000)``.
    """
    # The tensors are always built on the CPU via NumPy, so no device lookup
    # is needed here.
    user_row = torch.from_numpy(np.zeros((1, 100)).astype(np.int32))
    item_row = torch.from_numpy(
        np.random.randint(low=1, high=1000, size=(1, 100)).astype(np.int32)
    )

    return (
        torch.repeat_interleave(user_row, batch_size, 0),
        torch.repeat_interleave(item_row, batch_size, 0),
    )

# Build a batch-size-1 sample input and confirm it is a 2-element tuple.
dummy_inputs = create_dummy_input(batch_size=1)

print("type: ", type(dummy_inputs))
print("len: ", len(dummy_inputs))


type:  <class 'tuple'>
len:  2


## Torch Script 으로 변환 (컴파일)

In [7]:
def convert_torch_script(model, dummy_inputs):
    """Trace ``model`` for Neuron with ``dummy_inputs`` and return the traced model.

    Args:
        model: The model object handed to ``torch_neuronx.trace``.
        dummy_inputs: Example inputs handed to ``torch_neuronx.trace``.

    Returns:
        Whatever ``torch_neuronx.trace`` returns for the given arguments.
    """
    return torch_neuronx.trace(model, dummy_inputs)

# Compile the loaded NCF model using the current sample input.
model_neuron = convert_torch_script(ncf_model, dummy_inputs)

#### prediction: 
 tensor([[[-0.2574],
         [-2.4915],
         [-2.3725],
         [-2.2739],
         [-3.2381],
         [-1.0449],
         [-3.1015],
         [ 0.0727],
         [ 0.3931],
         [-2.1601],
         [-2.0902],
         [-0.8699],
         [-1.7728],
         [-4.2499],
         [-4.6169],
         [-2.4268],
         [-2.1706],
         [-2.9439],
         [ 0.3931],
         [-3.3304],
         [ 0.2930],
         [-4.1203],
         [-0.5551],
         [-2.7245],
         [-2.5493],
         [ 0.7936],
         [-3.3392],
         [-1.7258],
         [-1.6329],
         [-3.2054],
         [-1.7728],
         [-2.2717],
         [-0.5413],
         [ 1.9565],
         [ 1.1527],
         [ 0.1476],
         [-2.7164],
         [-0.9361],
         [-2.1949],
         [-1.5823],
         [-0.8025],
         [-2.1842],
         [-1.2067],
         [-0.8993],
         [ 0.4321],
         [ 0.1344],
         [-2.1523],
         [-2.0678],
         [-4.6404],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



# 4. 모델 추론

In [8]:
def extract_top_k(prediction, top_k = 10):
    """Return the indices of the ``top_k`` largest values in ``prediction``.

    Every size-1 dimension is squeezed away first; the top-k search then runs
    along the last remaining dimension.

    Args:
        prediction: Tensor of scores.
        top_k: How many indices to return (default 10).

    Returns:
        Tensor of indices, ordered from the largest score downward.
    """
    flattened = prediction.squeeze()
    return flattened.topk(top_k).indices

# Run the compiled model on the sample (user, item) pair.
prediction = model_neuron(dummy_inputs[0],dummy_inputs[1])
print("type:prediction ", type(prediction))
print("type:prediction[0] ", type(prediction[0]))

# Only the first element of the model output is ranked.
recommended_item_index = extract_top_k(prediction[0], top_k = 10)
print("recommended_item_index:  \n", recommended_item_index)

type:prediction  <class 'tuple'>
type:prediction[0]  <class 'torch.Tensor'>
recommended_item_index:  
 tensor([99, 88, 33, 92, 97, 87, 34, 55, 53, 25])


# 5. 모델 저장 및 로딩 후 테스트

In [9]:
# Save the compiled model with torch.jit.save for inference deployment.
filename = 'models/model.pt'
torch.jit.save(model_neuron, filename)

In [10]:
# Load the TorchScript compiled model back from disk
load_model_neuron = torch.jit.load(filename)

# Repeat the earlier inference with the reloaded model to check the round trip.
prediction = load_model_neuron(dummy_inputs[0],dummy_inputs[1])
print("type:prediction ", type(prediction))
print("type:prediction[0] ", type(prediction[0]))

recommended_item_index = extract_top_k(prediction[0], top_k = 10)
print("recommended_item_index:  \n", recommended_item_index)


type:prediction  <class 'tuple'>
type:prediction[0]  <class 'torch.Tensor'>
recommended_item_index:  
 tensor([99, 88, 33, 92, 97, 87, 34, 55, 53, 25])


# 6. 벤치 마킹


## 일부 샘플로 추론 시간 측정

In [11]:
import time

# Time a handful of single inferences with the reloaded model.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() is a wall clock that can be adjusted mid-run.
latencies = []
num_test = 5
for _ in range(num_test):
    start = time.perf_counter()
    prediction = load_model_neuron(dummy_inputs[0],dummy_inputs[1])    
    finish = time.perf_counter()
    # Seconds -> milliseconds, rounded to 5 decimal places.
    elapse_time = round((finish - start) * 1000, 5)
    latencies.append(elapse_time)

# NOTE: in the recorded run the first measurement is noticeably larger than
# the rest, so treat it separately when summarising.
print("latencies: ", latencies)

latencies:  [0.73957, 0.15831, 0.15879, 0.13185, 0.12684]


## 벤치 마킹, 모델 수 및 Thread 수 조절

In [12]:
from inf2_util import benchmark

In [13]:
# Benchmark the saved NCF model on Neuron using benchmark()'s default settings.
# The recorded output for this call reports 2 models, 2 threads and 2000 batches.
benchmark(filename, dummy_inputs)

Filename:    models/model.pt
Batch Size:  1
Batches:     2000
Inferences:  2000
Threads:     2
Models:      2
Duration:    0.113
Throughput:  17756.937
Latency P50: 0.110
Latency P95: 0.121
Latency P99: 0.132


In [14]:
# Same saved model, now with 4 threads over 2 model copies, 1000 batches per thread.
benchmark(filename, dummy_inputs, n_models=2, n_threads=4, batches_per_thread=1000)

Filename:    models/model.pt
Batch Size:  1
Batches:     4000
Inferences:  4000
Threads:     4
Models:      2
Duration:    0.148
Throughput:  27071.815
Latency P50: 0.141
Latency P95: 0.172
Latency P99: 0.205


## 최적의 배치 사이즈 찾기

In [15]:
# Compile the NCF model once per batch size and save each compiled artifact.
# Alternative power-of-two sweep: [1, 2, 4, 8, 16, 32, 64, 128, 256]
# NOTE: dummy_inputs, model_neuron and filename are rebound on every iteration,
# so after the loop they refer to the batch-size-10 run.
for batch_size in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
    # Build a sample input with this batch size and trace the model with it.
    dummy_inputs = create_dummy_input(batch_size= batch_size)
    model_neuron = convert_torch_script(ncf_model, dummy_inputs)    
    # The file name has no directory prefix, so it is written relative to the
    # current working directory rather than under models/.
    filename = f'model_batch_size_{batch_size}.pt'
    torch.jit.save(model_neuron, filename)

#### prediction: 
 tensor([[[-2.4268],
         [-1.5194],
         [-1.0685],
         [-0.4866],
         [-3.4242],
         [-1.3289],
         [-2.5415],
         [ 2.0705],
         [-2.5197],
         [ 0.8876],
         [-2.4056],
         [-0.5283],
         [-0.3611],
         [ 1.0296],
         [-0.1097],
         [-2.4068],
         [ 0.6629],
         [-3.2903],
         [-2.0902],
         [-3.0364],
         [-1.6468],
         [ 0.5075],
         [-1.4877],
         [ 0.2330],
         [-1.9539],
         [-0.6477],
         [-1.6870],
         [-1.8230],
         [-2.9083],
         [-1.4877],
         [ 2.0705],
         [-1.5155],
         [-3.3392],
         [-3.7797],
         [ 0.6196],
         [-1.4657],
         [ 2.5198],
         [-0.5283],
         [-3.4501],
         [-2.5250],
         [-4.6169],
         [-3.0761],
         [-2.1843],
         [-2.4353],
         [-0.7668],
         [-3.4677],
         [ 0.0642],
         [-0.7247],
         [-2.1668],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



#### prediction: 
 tensor([[[-1.2426],
         [-1.5650],
         [-2.7635],
         [-2.1908],
         [-4.3907],
         [-0.8080],
         [-1.2374],
         [-2.7016],
         [-0.3335],
         [-4.8823],
         [ 1.4381],
         [ 1.9565],
         [-1.6822],
         [-3.5272],
         [-0.3596],
         [-2.0921],
         [-1.0685],
         [-3.2343],
         [-2.0488],
         [-1.7031],
         [ 0.5075],
         [-3.5890],
         [-1.7678],
         [-4.6169],
         [-3.1637],
         [-1.4032],
         [-2.7072],
         [-2.6869],
         [-0.0810],
         [-0.5587],
         [-2.7731],
         [-1.7549],
         [ 2.2073],
         [-0.1259],
         [-3.6925],
         [-1.7358],
         [-1.9638],
         [-2.4077],
         [-0.9339],
         [-1.6792],
         [ 0.9322],
         [-1.5823],
         [-2.1862],
         [-1.1212],
         [-2.9083],
         [-4.0969],
         [ 1.3567],
         [-1.5729],
         [-0.5688],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



#### prediction: 
 tensor([[[ 1.8238],
         [-0.7275],
         [-0.1550],
         [-2.8821],
         [-1.2386],
         [-2.8820],
         [-1.2659],
         [-2.4479],
         [-2.1601],
         [-2.2507],
         [ 0.9322],
         [-4.0969],
         [ 0.8242],
         [-0.4631],
         [-1.0700],
         [-1.9528],
         [-1.1287],
         [-0.4542],
         [-2.4765],
         [-0.7247],
         [ 0.1344],
         [-3.6985],
         [ 1.9901],
         [-1.2194],
         [-2.5568],
         [ 2.0705],
         [ 1.9565],
         [-2.0759],
         [-1.9852],
         [-0.5940],
         [-1.6144],
         [-1.3663],
         [-1.5513],
         [-2.1949],
         [-3.6159],
         [-0.1550],
         [-3.3623],
         [-3.0414],
         [-2.6057],
         [-2.0493],
         [ 0.6641],
         [-1.1587],
         [ 0.0642],
         [-0.7489],
         [-2.5136],
         [-2.0933],
         [ 0.6204],
         [-3.7642],
         [-4.4377],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



#### prediction: 
 tensor([[[-1.6684],
         [-2.5579],
         [-0.7696],
         [-3.5581],
         [-0.5602],
         [-1.2724],
         [-2.5082],
         [-2.6161],
         [ 0.1344],
         [ 1.7616],
         [-0.6039],
         [-2.7016],
         [-0.3596],
         [-3.1095],
         [-1.0849],
         [-1.1739],
         [-1.5558],
         [ 0.8017],
         [-0.0950],
         [-0.7956],
         [-3.7945],
         [-1.8048],
         [-0.4864],
         [-0.6929],
         [-0.4972],
         [-1.1131],
         [-4.0481],
         [-2.5493],
         [-1.1956],
         [-1.9852],
         [-3.8873],
         [-1.4074],
         [ 1.0631],
         [ 0.3352],
         [-2.3260],
         [-4.3060],
         [-1.6325],
         [-2.0838],
         [-0.5546],
         [-1.4890],
         [-1.9765],
         [ 2.7438],
         [ 1.3407],
         [-2.5338],
         [ 2.1306],
         [-2.4032],
         [-0.6204],
         [-1.5944],
         [-3.0040],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



#### prediction: 
 tensor([[[ 2.1892],
         [-1.5060],
         [ 1.5623],
         [-2.7237],
         [-0.2283],
         [ 1.4564],
         [-4.6404],
         [-1.0914],
         [-0.6499],
         [-3.2350],
         [-2.5605],
         [-3.4093],
         [-1.8719],
         [-3.0658],
         [-3.0432],
         [-2.1908],
         [-1.2374],
         [-0.8575],
         [-2.0422],
         [ 1.4627],
         [-2.8862],
         [-3.3775],
         [ 2.2138],
         [-2.1833],
         [-0.1550],
         [ 0.0090],
         [ 0.1134],
         [-1.5002],
         [-2.7542],
         [ 0.4360],
         [ 1.9215],
         [-2.2403],
         [-2.2677],
         [-1.0715],
         [ 0.6629],
         [-1.2365],
         [-2.7608],
         [-2.8129],
         [-0.5445],
         [-3.8006],
         [-2.5250],
         [-2.8129],
         [-1.1212],
         [ 0.3270],
         [-0.5078],
         [-2.0088],
         [-1.0914],
         [-0.5333],
         [ 2.1179],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



#### prediction: 
 tensor([[[ 2.6148],
         [-1.3350],
         [-1.5729],
         [-2.2033],
         [-1.9372],
         [-0.5602],
         [-3.2315],
         [ 0.9245],
         [-0.5071],
         [-2.5667],
         [-1.0498],
         [-3.5763],
         [ 0.0727],
         [-3.4050],
         [ 2.0705],
         [-3.3304],
         [-3.0658],
         [-1.3698],
         [ 1.3431],
         [ 0.2665],
         [-1.6329],
         [-2.2493],
         [-2.8735],
         [-1.7031],
         [-0.5921],
         [-4.6059],
         [-3.5890],
         [-1.9372],
         [-0.8025],
         [ 0.1505],
         [-4.3365],
         [-2.0759],
         [-0.6162],
         [-3.4242],
         [-0.6898],
         [-0.1530],
         [ 1.8222],
         [ 1.0379],
         [-1.9638],
         [-1.1990],
         [-0.3611],
         [-0.7996],
         [-0.4972],
         [-0.0230],
         [-1.5113],
         [-0.6941],
         [-3.0364],
         [-0.6099],
         [-0.1721],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



#### prediction: 
 tensor([[[ 0.8927],
         [-2.5136],
         [-2.0838],
         [-3.4501],
         [-2.5416],
         [-2.1842],
         [-3.5440],
         [-1.7464],
         [-0.0657],
         [-0.9255],
         [-1.9661],
         [-1.7248],
         [-4.5188],
         [-3.8873],
         [-2.0088],
         [-1.5514],
         [-2.4952],
         [-1.7023],
         [-0.2246],
         [-3.0927],
         [-1.8283],
         [-2.8309],
         [-0.1325],
         [-1.8859],
         [-2.0215],
         [ 1.1824],
         [-1.4078],
         [-0.5940],
         [-2.1334],
         [-1.3937],
         [-1.3618],
         [-2.5238],
         [-3.4242],
         [-1.6136],
         [-1.6390],
         [-0.4864],
         [-2.2717],
         [-2.6919],
         [-2.1833],
         [-1.3871],
         [-2.6332],
         [-3.1112],
         [-0.7274],
         [-1.0317],
         [-3.2165],
         [-0.6772],
         [-0.4542],
         [ 1.2056],
         [-0.5459],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



#### prediction: 
 tensor([[[-3.2165],
         [ 2.2073],
         [-3.3200],
         [-1.8735],
         [-0.9975],
         [-0.4631],
         [-1.5786],
         [-1.8873],
         [-2.3538],
         [-1.7678],
         [ 0.4706],
         [-1.5944],
         [ 0.8007],
         [-0.0230],
         [-2.9439],
         [-4.5206],
         [-0.8978],
         [-3.1634],
         [ 1.3435],
         [-2.9205],
         [ 0.1312],
         [-3.6179],
         [-2.5250],
         [ 0.1344],
         [-1.9299],
         [ 1.5152],
         [-1.8283],
         [-2.0274],
         [-0.1530],
         [-2.6919],
         [-2.4424],
         [-3.4464],
         [-2.0274],
         [-0.9900],
         [ 1.0631],
         [-2.2739],
         [-0.8417],
         [-1.9638],
         [-1.5928],
         [-2.6823],
         [-0.4579],
         [-3.0666],
         [-0.8561],
         [-5.0949],
         [ 0.4321],
         [-3.2080],
         [-0.8315],
         [-2.4267],
         [-2.1862],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



#### prediction: 
 tensor([[[-2.2319],
         [-4.0309],
         [-2.7237],
         [ 2.4952],
         [ 1.1072],
         [-0.2092],
         [-1.7533],
         [-0.7806],
         [-3.5692],
         [ 0.9175],
         [-1.5928],
         [ 0.5289],
         [ 0.4360],
         [-3.1558],
         [-2.1668],
         [-2.1842],
         [-0.2092],
         [-3.5763],
         [ 1.8238],
         [-0.4625],
         [ 1.4956],
         [ 0.2645],
         [-2.7292],
         [-4.0188],
         [-2.2319],
         [ 1.3431],
         [-0.6929],
         [-0.6898],
         [-3.9345],
         [ 0.8560],
         [-3.6234],
         [-2.1862],
         [-1.0700],
         [-0.3893],
         [-0.7247],
         [-0.8025],
         [-3.2231],
         [-3.9659],
         [-2.8735],
         [-0.3173],
         [-2.0838],
         [-1.1587],
         [-0.3521],
         [-3.2411],
         [-1.6553],
         [ 1.3407],
         [-2.0109],
         [-2.5310],
         [-1.3871],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



#### prediction: 
 tensor([[[-1.3618],
         [ 1.4564],
         [-1.0849],
         [-0.7275],
         [-1.8741],
         [-1.9661],
         [-1.0365],
         [-1.1956],
         [ 1.8238],
         [-2.6644],
         [-0.6941],
         [-1.2973],
         [-4.1203],
         [-1.2807],
         [ 1.1888],
         [ 0.3352],
         [ 2.1306],
         [-2.9262],
         [-1.1052],
         [ 2.1306],
         [-4.6169],
         [-1.3937],
         [-0.5546],
         [-2.0109],
         [-3.4677],
         [-1.0925],
         [ 0.5838],
         [-1.5194],
         [-2.2739],
         [-2.7542],
         [-1.5786],
         [-0.1429],
         [-3.6449],
         [-2.5242],
         [-0.4631],
         [-4.4662],
         [-1.1212],
         [ 0.3384],
         [-3.4829],
         [-2.1842],
         [-2.0962],
         [ 2.0840],
         [-1.6964],
         [-3.1092],
         [-4.8823],
         [-1.1125],
         [-3.1098],
         [-1.9299],
         [-2.6823],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



In [16]:
# Benchmark the compiled NCF model for each batch size
for batch_size in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
# for batch_size in [1, 2, 4, 8, 16, 32, 64, 128, 256]:
    print('-'*50)
    # Rebuild an input with the matching batch size and benchmark the
    # corresponding model_batch_size_<n>.pt file.
    dummy_inputs = create_dummy_input(batch_size= batch_size)
    filename = f'model_batch_size_{batch_size}.pt'
    benchmark(filename, dummy_inputs)
    print()

--------------------------------------------------
Filename:    model_batch_size_1.pt
Batch Size:  1
Batches:     2000
Inferences:  2000
Threads:     2
Models:      2
Duration:    0.651
Throughput:  3071.983
Latency P50: 0.649
Latency P95: 0.693
Latency P99: 0.729

--------------------------------------------------
Filename:    model_batch_size_2.pt
Batch Size:  2
Batches:     2000
Inferences:  4000
Threads:     2
Models:      2
Duration:    1.051
Throughput:  3805.842
Latency P50: 1.079
Latency P95: 1.235
Latency P99: 1.639

--------------------------------------------------
Filename:    model_batch_size_3.pt
Batch Size:  3
Batches:     2000
Inferences:  6000
Threads:     2
Models:      2
Duration:    1.067
Throughput:  5623.939
Latency P50: 1.094
Latency P95: 1.219
Latency P99: 1.650

--------------------------------------------------
Filename:    model_batch_size_4.pt
Batch Size:  4
Batches:     2000
Inferences:  8000
Threads:     2
Models:      2
Duration:    0.958
Throughput:  834