# [모듈 2.1] Inference NCF on INF2 - Benchmarking

# 1. 환경 셋업

## 1.1. 기본 세팅
autoreload 확장을 사용하여, 수정된 패키지가 셀 실행 전에 자동으로 재로딩되도록 합니다.

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('./src')

필요한 torch_neuronx 를 로딩 합니다.

In [2]:
import torch
import torch_neuronx

# 2. 훈련된 모델 로딩

## 훈련 모델 아티팩트 확인

- 파이토치로 이미 훈련된 모델 아티팩트의 경로를 지정합니다.

In [3]:
# Path of the already-trained PyTorch NeuMF checkpoint used by this notebook.
# NOTE(review): the later model_fn() call in this notebook passes
# config.model_path rather than this variable — confirm both point at the same file.
artifact_path = 'models/NeuMF-end.pth'
print("model artifact is assigend from : ", artifact_path)

model artifact is assigend from :  models/NeuMF-end.pth


## 모델 로딩에 필요한 설정 파일 생성

- 모델 로딩 시 필요한 파라미터를 지정합니다 (기존 값을 그대로 사용함).

In [4]:
import json
import os
import config
from common_utils import save_json, load_json

class Params:
    """Hyperparameter container used when writing the model-loading config.

    The defaults reproduce the values that were previously hard-coded here,
    so ``Params()`` behaves exactly as before; other values can now be passed
    explicitly instead of editing the class.

    Args:
        factor_num: Stored as ``self.factor_num`` (default 32).
        num_layers: Stored as ``self.num_layers`` (default 3).
        dropout: Stored as ``self.dropout`` (default 0.0).
    """

    def __init__(self, factor_num=32, num_layers=3, dropout=0.0):
        self.factor_num = factor_num
        self.num_layers = num_layers
        self.dropout = dropout
                        
# Instantiate the hyperparameter container with its default values.
args = Params()
print("# of num_layers: ", args.num_layers)


# Number of users and items, fixed when the model was trained.
user_num = 6040  
item_num = 3706
print("user_num: ", user_num, " item_num: ", item_num)

# Configuration written to disk for model loading. Numeric values are
# converted to strings here; 'model_type' is taken from config.model as-is.
# NOTE(review): presumably the loader converts the strings back to numbers — confirm.
model_config_dict = {
    'user_num': str(user_num),
    'item_num': str(item_num),
    'factor_num' : str(args.factor_num),
    'num_layers' : str(args.num_layers),
    'dropout' : str(args.dropout),
    'model_type': config.model
}

# The config file is placed under the 'src' directory.
model_config_file = 'model_config.json'
model_config_file_path = os.path.join('src', model_config_file)

save_json(model_config_file_path, model_config_dict)
# Optional round-trip check: reload the saved file and display it.
# model_config_dict = load_json(model_config_file_path)    
# model_config_dict

# of num_layers:  3
user_num:  6040  item_num:  3706
src/model_config.json is saved


'src/model_config.json'

## 모델 로딩
- 모델 로딩 함수 model_fn() 를 통하여 모델 로딩


In [5]:
from inference import model_fn

# Load the trained NCF model through the project's model_fn() loader,
# using the model path defined in config.model_path.
ncf_model = model_fn(config.model_path)

######## Staring model_fn() ###############
device:  cpu


# 3. 모델 컴파일

## 샘플 입력 생성

In [6]:
import numpy as np
import torch

def create_dummy_input(batch_size):
    """Build a dummy (user, item) input pair for tracing and benchmarking.

    One row of 100 user ids (all zero) and one row of 100 random item ids
    drawn from [1, 1000) are created; each row is then repeated
    ``batch_size`` times along dim 0, so all rows of a batch are identical.

    Args:
        batch_size: Number of rows in each returned tensor.

    Returns:
        Tuple ``(user, item)`` of int32 tensors, each of shape
        ``(batch_size, 100)``.
    """
    # The tensors are always created on CPU; the previous unused CUDA
    # availability probe was dead code and has been removed.
    user_np = np.zeros((1, 100), dtype=np.int32)
    item_np = np.random.randint(low=1, high=1000, size=(1, 100)).astype(np.int32)

    return (
        torch.repeat_interleave(torch.from_numpy(user_np), batch_size, 0),
        torch.repeat_interleave(torch.from_numpy(item_np), batch_size, 0),
    )

# Sample input with batch size 1; used below to compile and test the model.
dummy_inputs = create_dummy_input(batch_size=1)

# Sanity check: the helper returns a tuple of two tensors (user, item).
print("type: ", type(dummy_inputs))
print("len: ", len(dummy_inputs))


type:  <class 'tuple'>
len:  2


## Torch Script 으로 변환 (컴파일)

In [7]:
def convert_torch_script(model, dummy_inputs):
    """Compile ``model`` for Neuron by tracing it with ``dummy_inputs``.

    Args:
        model: The model to compile.
        dummy_inputs: Example inputs handed to ``torch_neuronx.trace``.

    Returns:
        The object returned by ``torch_neuronx.trace``.
    """
    return torch_neuronx.trace(model, dummy_inputs)

# Compile the loaded NCF model using the batch-size-1 dummy inputs.
model_neuron = convert_torch_script(ncf_model, dummy_inputs)

#### prediction: 
 tensor([[[-1.1848],
         [-2.9062],
         [-2.9215],
         [-0.2536],
         [-2.1842],
         [-0.9262],
         [ 1.8365],
         [-1.9201],
         [-3.1589],
         [-1.3983],
         [-0.9339],
         [-2.1668],
         [ 0.6848],
         [-3.3775],
         [ 0.6939],
         [-2.4077],
         [ 1.1072],
         [-3.1092],
         [-3.4677],
         [-3.0932],
         [-3.1637],
         [ 0.8479],
         [ 3.2808],
         [-2.6363],
         [-1.3871],
         [-4.2089],
         [-0.5071],
         [-0.6448],
         [-2.0258],
         [-1.0498],
         [-4.3857],
         [ 0.5838],
         [-2.1706],
         [-3.4535],
         [ 1.3635],
         [-3.5763],
         [-1.6100],
         [-1.7371],
         [-1.5105],
         [ 1.6561],
         [-1.6870],
         [-3.5930],
         [-3.4934],
         [-3.4258],
         [-1.9256],
         [-2.4687],
         [-4.3822],
         [-2.5416],
         [-1.7941],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



# 4. 모델 추론

In [8]:
def extract_top_k(prediction, top_k = 10):
    """Return the indices of the ``top_k`` largest scores in ``prediction``.

    Every size-1 dimension is squeezed away first; the top-k selection then
    runs along the last remaining dimension.
    """
    scores = prediction.squeeze()
    return scores.topk(top_k).indices

# Run the compiled model on the dummy user and item tensors.
prediction = model_neuron(dummy_inputs[0],dummy_inputs[1])
# The recorded output shows a tuple whose first element is a torch.Tensor.
print("type:prediction ", type(prediction))
print("type:prediction[0] ", type(prediction[0]))

# Pick the positions of the 10 highest scores from the first output tensor.
recommended_item_index = extract_top_k(prediction[0], top_k = 10)
print("recommended_item_index:  \n", recommended_item_index)

type:prediction  <class 'tuple'>
type:prediction[0]  <class 'torch.Tensor'>
recommended_item_index:  
 tensor([22, 87, 70,  6, 39, 34, 16, 92, 21, 78])


# 5. 모델 저장 및 로딩 후 테스트

In [9]:
# Save the compiled model as a TorchScript file for inference deployment.
# `filename` is reused by the reload and benchmark cells that follow.
filename = 'models/model.pt'
torch.jit.save(model_neuron, filename)

In [10]:
# Load the compiled TorchScript model back from disk.
load_model_neuron = torch.jit.load(filename)

# Repeat the inference with the reloaded model on the same dummy inputs,
# so the result can be compared with the in-memory model's output.
prediction = load_model_neuron(dummy_inputs[0],dummy_inputs[1])
print("type:prediction ", type(prediction))
print("type:prediction[0] ", type(prediction[0]))

# Positions of the 10 highest scores from the first output tensor.
recommended_item_index = extract_top_k(prediction[0], top_k = 10)
print("recommended_item_index:  \n", recommended_item_index)


type:prediction  <class 'tuple'>
type:prediction[0]  <class 'torch.Tensor'>
recommended_item_index:  
 tensor([22, 87, 70,  6, 39, 34, 16, 92, 21, 78])


# 6. 벤치 마킹


## 일부 샘플로 추론 시간 측정

In [11]:
import time

# Time a handful of single inferences with the reloaded model.
# perf_counter() is the high-resolution clock meant for measuring short
# intervals; time.time() follows the system clock, which can be adjusted
# mid-measurement and may have coarser resolution.
# NOTE: in the recorded run the first measurement is noticeably slower than
# the rest, so treat it as warm-up when reading the numbers.
latencies = []
num_test = 5
for _ in range(num_test):
    start = time.perf_counter()
    prediction = load_model_neuron(dummy_inputs[0],dummy_inputs[1])
    finish = time.perf_counter()
    # Elapsed time in milliseconds, rounded to 5 decimal places.
    elapse_time = round((finish - start) * 1000, 5)
    latencies.append(elapse_time)

print("latencies: ", latencies)

latencies:  [0.48184, 0.12088, 0.11539, 0.11706, 0.11206]


## 벤치 마킹, 모델 수 및 Thread 수 조절

In [12]:
from inf2_util import benchmark

In [13]:
# Benchmark the compiled NCF model on Neuron, leaving benchmark()'s optional
# arguments at their defaults. Form with explicit arguments, for reference:
# benchmark(filename, example, n_models=2, n_threads=2, batches_per_thread=1000)
benchmark(filename, dummy_inputs)

Filename:    models/model.pt
Batch Size:  1
Batches:     2000
Inferences:  2000
Threads:     2
Models:      2
Duration:    0.115
Throughput:  17406.568
Latency P50: 0.111
Latency P95: 0.125
Latency P99: 0.135


In [14]:
# Repeat the benchmark with n_threads raised to 4 (n_models=2, 1000 batches per thread).
benchmark(filename, dummy_inputs, n_models=2, n_threads=4, batches_per_thread=1000)

Filename:    models/model.pt
Batch Size:  1
Batches:     4000
Inferences:  4000
Threads:     4
Models:      2
Duration:    0.142
Throughput:  28144.308
Latency P50: 0.136
Latency P95: 0.161
Latency P99: 0.206


## 최적의 배치 사이즈 찾기

In [15]:
# Compile the NCF model once per batch size and save each compiled artifact.
for batch_size in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
    # Alternative sweep over powers of two: [1, 2, 4, 8, 16, 32, 64, 128, 256]
    # Build example inputs with this batch size; the model is traced with them.
    dummy_inputs = create_dummy_input(batch_size= batch_size)
    model_neuron = convert_torch_script(ncf_model, dummy_inputs)    
    # One file per batch size, written relative to the current working
    # directory; the benchmarking loop loads the files by the same name.
    filename = f'model_batch_size_{batch_size}.pt'
    torch.jit.save(model_neuron, filename)

#### prediction: 
 tensor([[[-1.9389],
         [-1.9872],
         [-1.2366],
         [ 0.6196],
         [-2.4418],
         [-3.1095],
         [-2.8314],
         [-1.5271],
         [-2.1842],
         [-1.4032],
         [ 0.2330],
         [-0.2536],
         [ 0.6151],
         [-0.7603],
         [-1.0449],
         [-0.7522],
         [-0.8561],
         [-4.0481],
         [-2.9439],
         [-2.3725],
         [ 0.1770],
         [-2.3422],
         [-2.0962],
         [-2.0046],
         [ 1.3567],
         [-3.0413],
         [ 1.1072],
         [-0.4168],
         [-3.5763],
         [ 0.9679],
         [ 1.8365],
         [-1.1425],
         [-2.1949],
         [-3.2080],
         [-2.5564],
         [-0.1525],
         [-3.4258],
         [-2.5250],
         [-0.1535],
         [-3.1450],
         [-3.6925],
         [-2.5250],
         [-1.1053],
         [-1.5729],
         [-1.6448],
         [-1.3120],
         [-1.4801],
         [-1.6100],
         [-3.0187],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



#### prediction: 
 tensor([[[-1.0365],
         [-0.6960],
         [-5.3186],
         [-2.5546],
         [-1.9959],
         [-1.6822],
         [-2.6483],
         [-2.0274],
         [-0.5445],
         [-3.1637],
         [-1.0943],
         [-3.0403],
         [-1.6977],
         [-0.2422],
         [-1.7862],
         [-0.4970],
         [-2.5605],
         [-1.3185],
         [-4.0969],
         [-2.1618],
         [-2.4952],
         [-3.4677],
         [-2.5342],
         [-3.0927],
         [ 0.4353],
         [-2.5310],
         [-3.4284],
         [ 0.3642],
         [-2.0267],
         [-0.7425],
         [-3.5763],
         [-0.5675],
         [-0.9022],
         [-1.5686],
         [-0.6397],
         [-2.1833],
         [ 2.1892],
         [-2.0686],
         [-1.0002],
         [-4.4377],
         [ 2.2862],
         [-1.8735],
         [-0.9339],
         [-2.0422],
         [-1.0282],
         [-0.6058],
         [-2.7894],
         [ 0.6939],
         [-3.1095],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



#### prediction: 
 tensor([[[-2.5242],
         [-0.4542],
         [-2.5238],
         [ 0.1247],
         [-1.4890],
         [-1.0365],
         [-2.7635],
         [-3.4934],
         [ 0.2207],
         [-2.1106],
         [-2.1096],
         [-3.7962],
         [-3.0040],
         [-0.7806],
         [-0.9262],
         [-3.7332],
         [-3.4024],
         [-0.2246],
         [-3.0406],
         [-0.2422],
         [ 2.2073],
         [-0.5709],
         [-3.2271],
         [-0.0759],
         [-0.0950],
         [ 0.5289],
         [-5.0949],
         [ 1.1824],
         [-2.8489],
         [-0.1097],
         [ 1.0631],
         [-1.4877],
         [-1.5194],
         [-1.3754],
         [-1.0986],
         [-2.6928],
         [ 0.8927],
         [-2.2033],
         [-0.4616],
         [-1.3871],
         [-2.2334],
         [-3.9233],
         [-1.7654],
         [ 0.3776],
         [-2.0860],
         [-2.4765],
         [-0.9894],
         [-2.6483],
         [-2.5493],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



#### prediction: 
 tensor([[[-2.4651],
         [-3.9864],
         [-2.7717],
         [-1.0700],
         [-1.8002],
         [-1.3093],
         [ 0.4360],
         [-0.3335],
         [-2.9313],
         [-3.0761],
         [-2.7164],
         [-1.5686],
         [-3.5692],
         [-0.3487],
         [-3.6566],
         [-1.5556],
         [-2.5493],
         [-1.8651],
         [-0.3431],
         [-2.4802],
         [-0.3307],
         [-3.5692],
         [-2.7270],
         [-0.6204],
         [ 1.9901],
         [-1.7840],
         [-2.7092],
         [-0.0950],
         [-0.3431],
         [-5.0949],
         [ 0.9245],
         [-3.5272],
         [-2.1949],
         [-2.6515],
         [ 0.2207],
         [-0.9661],
         [-2.5144],
         [-2.6322],
         [-2.5310],
         [-3.8482],
         [ 0.9245],
         [-1.1125],
         [-1.4872],
         [-3.8313],
         [-5.0949],
         [ 1.8238],
         [-1.9528],
         [-1.2386],
         [-2.0258],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



#### prediction: 
 tensor([[[-2.9262],
         [-2.5564],
         [-0.5231],
         [-1.5513],
         [-0.9255],
         [-2.9161],
         [-0.6204],
         [ 3.2808],
         [-3.4663],
         [-1.2973],
         [-3.3636],
         [-1.9852],
         [-4.3534],
         [-1.4890],
         [-1.8576],
         [-1.0943],
         [-1.2366],
         [ 1.9565],
         [ 0.3384],
         [-2.4267],
         [-1.6304],
         [-1.4078],
         [-1.1806],
         [-3.4934],
         [-4.9023],
         [-2.4418],
         [-1.7464],
         [-2.0274],
         [ 1.6681],
         [-1.2119],
         [-0.7380],
         [ 0.1770],
         [-1.0162],
         [-0.6204],
         [-0.1439],
         [ 0.0108],
         [-1.9465],
         [-3.4284],
         [-2.4765],
         [-1.4032],
         [-3.1637],
         [-4.0834],
         [-1.0849],
         [-0.6370],
         [-2.2999],
         [-1.0925],
         [-0.9116],
         [ 0.6848],
         [-1.5105],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



#### prediction: 
 tensor([[[-2.2837],
         [-3.8251],
         [-0.1429],
         [-0.7893],
         [-2.8314],
         [-3.1015],
         [-1.3289],
         [-3.4464],
         [-0.6162],
         [ 1.1168],
         [ 1.0710],
         [-1.9685],
         [-1.3301],
         [-3.0180],
         [ 0.0058],
         [-4.5206],
         [-1.2386],
         [-1.5513],
         [-2.8821],
         [-1.5194],
         [-3.1637],
         [-0.3611],
         [-0.0109],
         [-1.4093],
         [-2.3167],
         [-1.6684],
         [-3.6449],
         [ 0.1876],
         [ 0.5289],
         [-2.7455],
         [ 2.6148],
         [-0.0950],
         [-1.1848],
         [-1.0449],
         [-1.1328],
         [-2.0258],
         [-2.2999],
         [-1.4074],
         [-2.5814],
         [-4.3822],
         [-4.6175],
         [-1.6870],
         [-1.8048],
         [-0.4814],
         [-0.2935],
         [-0.5546],
         [-2.7455],
         [-1.7464],
         [-1.6136],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



#### prediction: 
 tensor([[[-0.4649],
         [-0.7524],
         [-2.0258],
         [-2.5242],
         [-4.0481],
         [-1.6468],
         [-1.0343],
         [-0.3199],
         [ 0.2117],
         [-2.0258],
         [ 0.2480],
         [ 1.2056],
         [-4.5188],
         [-2.7717],
         [-1.8002],
         [-3.8006],
         [-0.4579],
         [-1.1680],
         [ 0.2480],
         [-2.5238],
         [-1.6329],
         [-3.3623],
         [-1.8048],
         [-1.2444],
         [-1.3983],
         [ 1.4683],
         [-1.7759],
         [-2.0097],
         [-1.7759],
         [-3.2315],
         [-1.1433],
         [-2.5568],
         [-3.8251],
         [-2.7657],
         [ 0.4360],
         [ 0.7063],
         [-2.6161],
         [ 0.8839],
         [-0.9022],
         [-2.2677],
         [-2.5080],
         [-0.8369],
         [ 0.8017],
         [-2.0283],
         [-1.3185],
         [-3.9659],
         [-0.7299],
         [ 1.1294],
         [-2.0481],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



#### prediction: 
 tensor([[[-0.5231],
         [-1.5513],
         [-0.7489],
         [-2.7994],
         [-2.7237],
         [-2.8820],
         [-2.1427],
         [-3.0403],
         [-1.4093],
         [ 0.2046],
         [-1.8449],
         [-1.5513],
         [-0.4168],
         [-2.8309],
         [ 1.4381],
         [-3.1450],
         [ 1.0379],
         [-2.0528],
         [-0.5071],
         [-3.5637],
         [-1.7862],
         [-2.5238],
         [ 2.1306],
         [-2.0689],
         [-0.1535],
         [-0.6929],
         [-0.5413],
         [-1.5514],
         [-3.8593],
         [ 1.7544],
         [-0.9155],
         [-0.5445],
         [-0.2166],
         [ 1.8134],
         [-3.0761],
         [-1.7678],
         [-1.9391],
         [-3.1007],
         [-3.3775],
         [ 1.3435],
         [-0.5231],
         [-1.6136],
         [-1.3871],
         [ 0.9175],
         [-1.1990],
         [-2.8309],
         [-0.8025],
         [-3.6159],
         [ 1.8365],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



#### prediction: 
 tensor([[[-2.1843],
         [ 0.2960],
         [-1.2366],
         [-3.8482],
         [-0.9900],
         [-0.5445],
         [-3.1589],
         [ 1.0348],
         [-3.5930],
         [-0.2443],
         [-1.4032],
         [-0.4616],
         [ 0.5075],
         [-2.7533],
         [-2.0439],
         [-2.0175],
         [-2.4687],
         [-1.4890],
         [-1.7728],
         [-2.7092],
         [-1.1757],
         [-3.5692],
         [-2.4353],
         [-2.4068],
         [-0.3075],
         [-0.3596],
         [-1.7533],
         [ 0.6629],
         [-4.0188],
         [ 2.1306],
         [ 0.2117],
         [-2.1862],
         [-2.3422],
         [-2.6509],
         [-1.1435],
         [-0.8315],
         [-2.6332],
         [-1.7654],
         [ 0.9255],
         [ 0.3642],
         [-1.1435],
         [-2.5310],
         [-1.2659],
         [-1.6448],
         [-2.7542],
         [-4.7241],
         [-1.2374],
         [ 2.4899],
         [-1.6553],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



#### prediction: 
 tensor([[[ 1.4956],
         [-3.1032],
         [-3.7962],
         [-1.7941],
         [-2.6515],
         [-0.7275],
         [-0.5078],
         [-2.7542],
         [ 0.6848],
         [-0.5196],
         [-1.2194],
         [-0.6477],
         [-4.0309],
         [-3.2231],
         [ 0.3195],
         [-2.9161],
         [-1.2807],
         [-3.0826],
         [-1.7549],
         [-3.4934],
         [-4.5206],
         [-2.7994],
         [-2.1908],
         [ 0.4353],
         [-0.5231],
         [-1.3301],
         [-2.0933],
         [-0.2389],
         [-1.8741],
         [ 2.2073],
         [-1.1898],
         [-4.5842],
         [-0.0635],
         [-2.5310],
         [-0.5940],
         [-2.5539],
         [-3.2271],
         [-1.5556],
         [-1.8003],
         [ 1.3003],
         [-1.3663],
         [-1.8576],
         [ 2.2073],
         [ 1.0348],
         [-1.0946],
         [-2.7866],
         [ 1.3635],
         [-2.7542],
         [-3.3392],
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sys.exit(main())



In [16]:
# Benchmark the compiled NCF model for each batch size.
for batch_size in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
    # Alternative sweep over powers of two: [1, 2, 4, 8, 16, 32, 64, 128, 256]
    print('-'*50)
    # Inputs must have the same batch size the model file was compiled with.
    dummy_inputs = create_dummy_input(batch_size= batch_size)
    # File saved by the compile loop for this batch size.
    filename = f'model_batch_size_{batch_size}.pt'
    benchmark(filename, dummy_inputs)
    print()

--------------------------------------------------
Filename:    model_batch_size_1.pt
Batch Size:  1
Batches:     2000
Inferences:  2000
Threads:     2
Models:      2
Duration:    0.731
Throughput:  2735.084
Latency P50: 0.738
Latency P95: 0.891
Latency P99: 1.335

--------------------------------------------------
Filename:    model_batch_size_2.pt
Batch Size:  2
Batches:     2000
Inferences:  4000
Threads:     2
Models:      2
Duration:    0.986
Throughput:  4056.616
Latency P50: 0.936
Latency P95: 1.147
Latency P99: 1.579

--------------------------------------------------
Filename:    model_batch_size_3.pt
Batch Size:  3
Batches:     2000
Inferences:  6000
Threads:     2
Models:      2
Duration:    1.119
Throughput:  5361.024
Latency P50: 1.119
Latency P95: 1.241
Latency P99: 1.965

--------------------------------------------------
Filename:    model_batch_size_4.pt
Batch Size:  4
Batches:     2000
Inferences:  8000
Threads:     2
Models:      2
Duration:    0.947
Throughput:  845