# Multi-model scaling benchmark

In this notebook, we will study the performance of the Triton server as multiple machine learning models are processing inference requests on the server at the same time. Each model has its own queue for inference requests, and the Triton server default is to load each model on each server instance such that it shares the GPU resources of that instance.

In [None]:
# all the packages we will need
import os
from distributed import Client, progress
from lpcjobqueue import LPCCondorCluster
import awkward as ak
import numpy as np
import torch
from utils.mlbench import process_function
import time
import pathlib
from datetime import datetime
from utils.promqueries import get_all_queries
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.colors as colors

This will request a cluster of $n$ CPU workers (one per job) from the Fermilab LPC. If you are not using the Fermilab computing centers, this will need to be changed for the system you are using.

In [None]:
def create_clusters(jobs, **kwargs):
    """
    Start an LPCCondorCluster, scale it to `jobs` jobs and attach a client.

    Every keyword argument is forwarded to LPCCondorCluster. The options
    listed in `fallback_options` below are only filled in when the caller did
    not pass them. The 'job_extra_directives' entry is additionally updated
    with the proxy file chosen by set_default_proxy.

    The call waits (through client.wait_for_workers) first for one worker and
    then for all `jobs` workers before returning the `(cluster, client)` pair.
    """
    base_dir = os.getenv("BASE")
    fallback_options = {
        'cores': 1,
        'memory': '3GB',
        'disk': '2GB',
        # by default transfer all utils and models
        'transfer_input_files': [f'{base_dir}/utils', f'{base_dir}/models'],
        'log_directory': None,
        'death_timeout': 180,
        'job_extra_directives': {},
    }
    for option, value in fallback_options.items():
        kwargs.setdefault(option, value)

    # record the proxy file to use in the job directives
    directives = kwargs['job_extra_directives']
    directives.update(set_default_proxy(directives))

    cluster = LPCCondorCluster(**kwargs)

    # Scaling up the cluster
    print("Generating job requests...", end='')
    cluster.scale(jobs)
    print('initial jobs generated!')

    # Connect and wait for the first worker before reporting the dashboard
    print("Waiting for at least one worker...", end='')
    client = Client(cluster)
    client.wait_for_workers(1)
    print("workers(s) online!")
    print("Dashboard available at", client.dashboard_link)

    # Then wait until every requested worker has joined
    print("Waiting for all (%i) workers..."%jobs, end='')
    client.wait_for_workers(jobs)
    print("Done!")

    return cluster, client

def set_default_proxy(job_extra_directives):
    """
    Choose the grid certificate proxy file to be used by the worker nodes.

    voms-proxy-init typically stores certificates in the `/tmp` directory,
    which is not accessible to the worker nodes, so by default the proxy is
    looked up in the user's home directory instead.

    Parameters:
      job_extra_directives: mapping of extra job directives. If it contains
        an 'x509userproxy' entry, that path is used; otherwise the path
        defaults to `$HOME/x509up_u<uid>` and is printed.

    Returns:
      A new dict `{'x509userproxy': <path>}`; the input mapping is not
      modified here.

    Raises:
      FileNotFoundError: if the chosen path is not an existing file. The
        message contains the command that creates the proxy in the default
        location. Only the existence of the file is checked; its contents
        are not inspected.
    """
    if 'x509userproxy' not in job_extra_directives:
        proxyfile = '{0}/x509up_u{1}'.format(os.environ['HOME'], os.getuid())
        print('Using default proxy file:', proxyfile)
    else:
        proxyfile = job_extra_directives['x509userproxy']

    # Checking if file is a valid file
    if not os.path.isfile(proxyfile):
        # FileNotFoundError is still an Exception, so callers catching the
        # previous bare Exception keep working.
        raise FileNotFoundError(f"""
    The proxy file {proxyfile} doesn't exist! Create the default proxy using the
    following command:
    > voms-proxy-init --voms cms --valid 192:00 --out ${{HOME}}/x509up_u${{UID}}
    """)

    return {'x509userproxy': proxyfile}

In [None]:
# number of jobs requested from the cluster
n_workers = 32
# extra options handed to create_clusters (here: where the job logs go)
cluster_args = dict(log_directory='/uscmst1b_scratch/lpc1/3DayLifetime/csavard/')
cluster, client = create_clusters(jobs=n_workers, **cluster_args)

We will now run trials with 1, then 2, then 3, and so on, different ML models all running at the same time. In this example, we run different copies of the same model labeled "pn_demo_bkg1/2/3/...". You can rerun this test using different configurations of the Triton server to see how this affects things. For example, we suggest testing different GB slices of the Triton instances or manually setting the Triton server to assign a separate instance for each model and see how this compares.

In [None]:
# A couple of configuration values to set.
# Each trial uses this many more workers than the previous one (see the
# `workers=` slice passed to client.map below).
n_workers_per_bkgmodel = 4
# Number of trials; trial `ii` runs `ii + 1` models at once.
n_models = int(np.floor(n_workers/n_workers_per_bkgmodel))

# Worker identifiers as reported by the scheduler, used to pin tasks to workers.
worker_hostnames = list(client.scheduler_info()['workers'].keys())
# NOTE(review): `output` is not used anywhere in this cell.
output = np.zeros((n_workers,3))
# One (start, end, trial index) tuple is appended per trial.
datetimes = []

# trials for different number of models running at once
for ii in range(n_models):
    
    # Arguments for process_function, one list per positional argument:
    # seeds, pseudo-events, batchsize, use triton (True/False),
    # model and version, server address.
    n_files = 50*(ii+1) # 50 files for each model in the trial
    if ii>2:
        n_files = 10*(ii+1) # from the fourth trial on, only 10 files per model
    n_jets = 10000 # jets per file
    temp_modellist = ["pn_demo/1" if x==0 else "pn_demo_bkg%i/1" % x for x in range(ii+1)] # change model name to what you are testing
    server_list = ["triton+grpc://triton.fnal.gov:443/" for x in range(ii+1)]
    # The model and server lists are repeated n_files times, so they are longer
    # than the other arguments; presumably client.map pairs the arguments
    # element-wise and stops at the shortest one - TODO confirm.
    workargstriton = [range(n_files), [n_jets]*n_files, [1024]*n_files, 
                      [True]*n_files, temp_modellist*n_files, server_list*n_files]
    
    # Triton, N bkg models trial
    print('Running %i jets among %i files with %i background models...'%(n_jets,n_files,ii))
    # dt1/dt2 bracket the trial; get_info later queries the metrics of this window.
    dt1 = datetime.now()
    # Restrict the tasks to the first n_workers_per_bkgmodel*(ii+1) workers.
    futurestriton = client.map(process_function, *workargstriton, pure=False, 
                               workers=worker_hostnames[:n_workers_per_bkgmodel*(ii+1)], retries=2)
    progress(futurestriton, notebook=False)
    # Gather the results so that dt2 is taken after the futures have been collected.
    resulttriton = client.gather(futurestriton)
    dt2 = datetime.now()
    print('Done!')
    
    datetimes.append((dt1,dt2,ii))
    
# save the datetimes of each trial to file to look at later
# (one "start, end, trial index" line per trial, read back by load_datetimes)
with open('datetimes_saved.txt', 'w') as fp:
    fp.write('\n'.join('%s, %s, %s' % x for x in datetimes))


In [None]:
# make sure to close all jobs when the trials are done
# Disconnect the client first and only then shut the cluster down, so the
# client is never left attached to a scheduler that has already been closed.
client.close()
cluster.close()

## Results

Here we will provide some code to plot the throughput as a function of the number of models running in the background. We will only look at how background models affect the foreground model we have chosen to study, but the foreground model can be swapped as well.

In [None]:
def load_datetimes(filename='datetimes_saved.txt'):
    """
    Read trial time windows back from a text file.

    Each non-blank line holds comma-separated fields of which the first three
    are used: `start, end, index`. `start` and `end` are timestamps written as
    `YYYY-MM-DD HH:MM:SS` with an optional `.ffffff` fraction (the form
    produced by `str(datetime)`), and `index` is an integer.

    Parameters:
      filename: path of the file to read.

    Returns:
      A list of `(datetime, datetime, int)` tuples in file order. Blank
      lines (for example a trailing newline) are skipped.

    Raises:
      IndexError: if a non-blank line has fewer than three fields.
      ValueError: if a timestamp or the index cannot be parsed.
    """
    datetimes = []
    with open(filename) as f:
        for line in f:
            if not line.strip():
                continue
            fields = [field.strip() for field in line.split(',')]
            datetimes.append((_parse_trial_timestamp(fields[0]),
                              _parse_trial_timestamp(fields[1]),
                              int(fields[2])))
    return datetimes


def _parse_trial_timestamp(text):
    """Parse `YYYY-MM-DD HH:MM:SS[.ffffff]` into a datetime."""
    try:
        return datetime.strptime(text, '%Y-%m-%d %H:%M:%S.%f')
    except ValueError:
        # str(datetime) omits the fraction when microsecond == 0, so a
        # timestamp saved at an exact second has no ".%f" part.
        return datetime.strptime(text, '%Y-%m-%d %H:%M:%S')

def get_info(datetimes):
    """
    Summarise the queried metrics for every trial window.

    `datetimes` is a sequence of `(start, end, index)` tuples. For each one,
    get_all_queries is called for the `(start, end)` window with the '30s'
    argument and the selector string below, and row `index` of the returned
    array is filled with:

      column 0: mean of results['inf_reqs_net']
      column 1: mean of results['inf_que_time_net']
      column 2: maximum of results['num_instances']

    The two means only use the samples where 'num_instances' equals its
    maximum, and of those the first three and the last one are dropped.

    Returns an array of shape `(len(datetimes), 3)`.
    """
    # change model you want to test
    space = '{namespace="triton", prometheus_replica="prometheus-user-workload-0", model="pn_demo"}'
    summary = np.zeros((len(datetimes), 3))

    for start, stop, m_bkg in datetimes:
        # only the results mapping is used; the other returned values are ignored
        results, _queries, _model_versions, _gpu_instances = get_all_queries(
            [(start, stop)], '30s', space=space)

        # change metrics to study here
        metrics = pd.concat(
            [results['inf_reqs_net'], results['inf_que_time_net'], results['num_instances']],
            axis=1,
        )

        instances = metrics.iloc[:, 2]
        n_inst = instances.max()
        at_peak = instances == n_inst

        summary[m_bkg, 0] = metrics.iloc[:, 0][at_peak][3:-1].mean()
        summary[m_bkg, 1] = metrics.iloc[:, 1][at_peak][3:-1].mean()
        summary[m_bkg, 2] = n_inst

    return summary

In [None]:
# load in the different tests that were used, like a 20 GB and 40 GB instance split for example
# Read the saved trial windows of each test first ...
datetime_20GB, datetime_40GB = (
    load_datetimes(filename=saved_file)
    for saved_file in ("datetimes_test20GBslices.txt", "datetimes_test40GBslices.txt")
)
# ... then summarise the metrics of each test, 20 GB before 40 GB.
out_20GB, out_40GB = map(get_info, (datetime_20GB, datetime_40GB))

Two performance plots are provided:
1. Total throughput of all models vs. number of background models - this shows how the full system is affected as more models are run in parallel
2. Throughput of the foreground model vs. number of background models - this shows how other models running in the background affect the foreground model's performance

In [None]:
# Scatter column 0 of each summary array against the row index
# (the row index is the number of background models of the trial).
# NOTE(review): this is the same column that the following cell plots as the
# demo model throughput, and get_info only queries the "pn_demo" model -
# confirm that this really is the summed throughput of all models.
plt.scatter(range(len(out_20GB)),out_20GB[:,0], label='20 GB slice', color='red')
plt.scatter(range(len(out_40GB)),out_40GB[:,0], label='40 GB slice', color='blue')
plt.xlabel('Number of background models', fontsize=14)
# Raw string: "\s" in "\sum" is not a valid Python escape sequence and triggers
# a warning in a normal string literal. The resulting text is unchanged.
plt.ylabel(r'$\sum_{i} model_i$ throughput [$s^{-1}$]', fontsize=14)
plt.legend(loc='best')
#plt.savefig('results/sum_throughput_vs_bkgmodel_1instance.eps',bbox_inches='tight')
plt.show()

In [None]:
# Scatter column 0 of each summary array against the row index, one series per test.
for slice_out, slice_label, slice_color in (
    (out_20GB, '20 GB slice', 'red'),
    (out_40GB, '40 GB slice', 'blue'),
):
    plt.scatter(range(len(slice_out)), slice_out[:, 0], label=slice_label, color=slice_color)

# Reference curves (disabled):
#plt.plot(np.linspace(0,3,20), out_20GB[0,0]/(np.linspace(0,3,20)+1), color='red', linestyle='dashed')
#plt.plot(np.linspace(0,7,40), out_40GB[0,0]/(np.linspace(0,7,40)+1), color='blue', linestyle='dashed')

# Empty line that only adds the dashed "perfect sharing" entry to the legend.
plt.plot([], [], label='Perfect slice sharing', linestyle='dashed', color='black')

plt.xlabel('Number of background models', fontsize=14)
plt.ylabel('Demo model throughput [$s^{-1}$]', fontsize=14)
plt.legend(loc='best')
plt.show()