# Increasing workers benchmark

In this notebook, we will study the performance of the Triton server as more inference requests are made to it in parallel. We will begin by creating a cluster of workers, each of which has a copy of the code that runs inference. We will then run a series of trials, asking first 1, then 2, then 3 workers, and so on, to run that code, and collect a couple of metrics, such as throughput and queue time, to see how the Triton server handles the increase in requests.

In [None]:
# all the packages we will need
import os
from distributed import Client, progress
from lpcjobqueue import LPCCondorCluster
import awkward as ak
import numpy as np
import torch
from utils.mlbench import process_function
import time
import pathlib
from datetime import datetime
from utils.promqueries import get_all_queries
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.colors as colors

This will request a cluster with $n$ workers (or jobs) of CPUs from the Fermilab LPC. If not using the Fermilab computing centers, this will need to be changed for the system you are using.

In [None]:
def create_clusters(jobs, **kwargs):
    """
    Start an LPCCondorCluster, scale it to `jobs` workers and attach a client.

    Parameters
    ----------
    jobs : int
        Number of worker jobs to request. The function blocks until that many
        workers have connected.
    **kwargs
        Forwarded to `LPCCondorCluster`. Defaults are filled in for `cores`,
        `memory`, `disk`, `transfer_input_files`, `log_directory`,
        `death_timeout` and `job_extra_directives` when they are not supplied.

    Returns
    -------
    tuple
        `(cluster, client)`: the `LPCCondorCluster` and the `Client`
        connected to it.

    Raises
    ------
    RuntimeError
        If `transfer_input_files` is not supplied and the `BASE` environment
        variable is unset or empty, so the default file list cannot be built.
    Exception
        Whatever `set_default_proxy` raises when the proxy file is missing.
    """
    kwargs.setdefault('cores', 1)
    kwargs.setdefault('memory', '3GB')
    kwargs.setdefault('disk', '2GB')
    if 'transfer_input_files' not in kwargs:
        # by default transfer all utils and models
        base = os.getenv("BASE")
        if not base:
            # Without this check the paths would silently become 'None/utils'
            # and 'None/models'.
            raise RuntimeError(
                "The BASE environment variable is not set, so the default "
                "transfer_input_files cannot be built. Set BASE or pass "
                "transfer_input_files explicitly."
            )
        kwargs['transfer_input_files'] = [f'{base}/utils', f'{base}/models']
    kwargs.setdefault('log_directory', None)
    kwargs.setdefault('death_timeout', 180)

    # Work on a copy so the dictionary passed in by the caller is not modified.
    directives = dict(kwargs.get('job_extra_directives') or {})
    directives.update(set_default_proxy(directives))
    kwargs['job_extra_directives'] = directives

    cluster = LPCCondorCluster(**kwargs)

    # Scaling up the cluster
    print("Generating job requests...", end='')
    cluster.scale(jobs)
    print('initial jobs generated!')
    print("Waiting for at least one worker...", end='')
    client = Client(cluster)
    client.wait_for_workers(1)
    print("worker(s) online!")
    print("Dashboard available at", client.dashboard_link)
    print("Waiting for all (%i) workers..."%jobs, end='')
    client.wait_for_workers(jobs)
    print("Done!")

    return cluster, client

def set_default_proxy(job_extra_directives):
    """
    Select the grid certificate proxy file to be used by the worker nodes.

    voms-proxy-init typically stores the proxy in the `/tmp` directory, which
    is not accessible to the worker nodes. Unless the caller already names a
    file under the `x509userproxy` key, the file `$HOME/x509up_u<uid>` in the
    user's home directory is selected instead. Only the existence of the file
    is checked; its contents are not validated.

    Parameters
    ----------
    job_extra_directives : dict
        Directives supplied by the caller. It is only read, never modified.

    Returns
    -------
    dict
        `{'x509userproxy': <path>}`, to be merged into the job directives.

    Raises
    ------
    FileNotFoundError
        If the selected proxy file does not exist. The message contains the
        command that creates the proxy in the default location.
    """
    if 'x509userproxy' in job_extra_directives:
        proxyfile = job_extra_directives['x509userproxy']
    else:
        proxyfile = '{0}/x509up_u{1}'.format(os.environ['HOME'], os.getuid())
        print('Using default proxy file:', proxyfile)

    # Checking if file is a valid file
    if not os.path.isfile(proxyfile):
        # FileNotFoundError is still an Exception, so existing handlers keep working.
        raise FileNotFoundError(f"""
    The proxy file {proxyfile} doesn't exist! Create the default proxy using the
    following command:
    > voms-proxy-init --voms cms --valid 192:00 --out ${{HOME}}/x509up_u${{UID}}
    """)

    return {'x509userproxy': proxyfile}

In [None]:
# Number of workers to request; the trials below go from 1 up to this many.
n_workers = 80

# Directory that receives the job log files, for monitoring and debugging.
# The <username> segment of the path is a placeholder.
cluster_args = dict(
    log_directory='/uscmst1b_scratch/lpc1/3DayLifetime/<username>/',
)

cluster, client = create_clusters(n_workers, **cluster_args)

Here, we will run $n$ trials, each with a different number of workers. Each trial splits its files between the workers, processes all of the files, and records its start and end times. The times of every trial are saved so that we can go back and look at what happened after the run is complete.

In [None]:
# Addresses of the connected workers; trial k is restricted to the first k.
# NOTE(review): this assumes scheduler_info() lists every connected worker.
# Some newer distributed releases may return only a subset by default, so
# confirm that len(worker_hostnames) == n_workers.
worker_hostnames = list(client.scheduler_info()['workers'].keys())
datetimes = []

# Settings shared by every trial
files_per_worker = 30  # run 30 files per worker
n_jets = 5000          # run 5000 jets per file
batch_size = 1024
model = "pn_demo/1"
datetimes_path = 'datetimes_saved.txt'

for ii in range(n_workers):
    n_trial_workers = ii + 1
    n_files = files_per_worker * n_trial_workers

    # one entry per file:
    # seeds, #pseudo-events, batchsize, use triton (True/False), model
    workargstriton = [range(n_files), [n_jets]*n_files, [batch_size]*n_files,
                      [True]*n_files, [model]*n_files]

    # Triton, N workers trial
    print('Running %i jets among %i files with %i workers...'%(n_jets,n_files,n_trial_workers))
    dt1 = datetime.now()
    futurestriton = client.map(process_function, *workargstriton, pure=False,
                               workers=worker_hostnames[:n_trial_workers], retries=2)
    progress(futurestriton, notebook=False) #progress bar
    resulttriton = client.gather(futurestriton)
    dt2 = datetime.now()
    print('Done!')

    datetimes.append((dt1, dt2, n_trial_workers))

    # Save the datetimes of each trial as soon as it finishes, so completed
    # trials are not lost if a later one fails. The first trial starts a new
    # file and the following ones append to it.
    # str(datetime) drops the fractional part when microseconds are 0, so the
    # timestamps are formatted explicitly to always carry six digits.
    record = '%s, %s, %s' % (dt1.isoformat(sep=' ', timespec='microseconds'),
                             dt2.isoformat(sep=' ', timespec='microseconds'),
                             n_trial_workers)
    with open(datetimes_path, 'a' if ii else 'w') as fp:
        fp.write(('\n' if ii else '') + record)
    

In [None]:
# make sure to close all jobs when the trials are done
# The client is disconnected first, while the scheduler it talks to is still
# running; closing the cluster afterwards shuts the jobs down.
client.close()
cluster.close()

## Results

Now that we have run the tests, let's take a look at the Triton server performance. We start by loading the datetimes of the test we ran, and then use those times to collect the metrics we would like to study.

In [None]:
def _parse_timestamp(text):
    """Parse 'YYYY-MM-DD HH:MM:SS[.ffffff]' into a datetime.

    str(datetime) leaves out the fractional part when microseconds are 0, so
    both spellings are accepted.
    """
    for fmt in ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S'):
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    raise ValueError('unrecognised timestamp: %r' % text)

#open saved data and put back into proper format
#with open('datetimes_saved.txt') as f:
with open('datetimes_Apr30_80workers_v1.txt') as f:
    # one "start, end, workers" record per line; blank lines are skipped
    mylist = [tuple(map(str.strip, i.split(','))) for i in f if i.strip()]

datetimes = []
for tup in mylist:
    datetimes.append((_parse_timestamp(tup[0]),
                      _parse_timestamp(tup[1]),
                      int(tup[2])))
datetimes

In [None]:
# collect metrics from each trial to plot performance
# One row of `out` per trial, stored at index (workers - 1). The plots below
# label the columns as: 0 = total throughput, 1 = queue time per request,
# 2 = number of Triton instances.
out = np.zeros((len(datetimes),3))
trial_samples = []
for (dt1,dt2,w) in datetimes:
    results, queries, unique_model_versions, unique_gpu_instances = get_all_queries([(dt1,dt2)], '30s')

    # metrics collected can be changed here
    data = pd.concat([results['inf_reqs_net'],results['inf_que_time_net'],results['num_instances']],axis=1)
    n_inst = data.iloc[:,2].max()
    # Average only over samples taken at the maximum instance count, leaving
    # out the first and the last of those samples.
    at_max = data.iloc[:,2]==n_inst
    out[w-1,0] = data.iloc[:,0][at_max][1:-1].mean()
    out[w-1,1] = data.iloc[:,1][at_max][1:-1].mean()
    out[w-1,2] = n_inst

    trial_samples.append(data.to_numpy())

# all metrics aggregated for each trial
# Joined once at the end; np.append inside the loop would copy the whole
# accumulated array on every iteration. The empty (0,3) array keeps the result
# well-defined when there are no trials.
out_cat = np.concatenate([np.empty((0,3), int)] + trial_samples, axis=0)

There are 4 plots provided here:

1. Number of triton instances vs. number of workers - this shows how the triton server scales out to more model instances as more requests are made in parallel
2. Queue time vs. total throughput - this shows how another model instance is spun up once the queue-time threshold is exceeded, in order to maintain a reasonable queue time (provided GPU resources are available to create another instance)
3. Total throughput vs. number of workers - this shows whether the throughput is scaling pretty linearly with the number of workers or not
4. Throughput per model instance vs. number of workers per model instance - this shows how multiple parallel inferences affect the throughput of each model instance

In [None]:
# Number of Triton instances (third column of `out`) against the worker
# count, which runs from 1 to the number of rows in `out`.
worker_counts = np.arange(1, out.shape[0] + 1)

fig, ax = plt.subplots()
ax.plot(worker_counts, out[:, 2])
ax.set_xlabel("Number of workers", fontsize=14)
ax.set_ylabel("Number of Triton instances", fontsize=14)
plt.show()

In [None]:
# Queue time against total throughput, one point per trial, coloured by the
# number of Triton instances with one discrete colour band per integer count.
instances = out[:, 2]
fewest, most = min(instances), max(instances)

cmap = plt.cm.plasma
band_edges = np.arange(fewest - .5, most + 1.5, 1)
norm = colors.BoundaryNorm(band_edges, cmap.N)

fig, ax = plt.subplots()
points = ax.scatter(out[:, 0], out[:, 1], c=instances, cmap=cmap, norm=norm)

cbar = fig.colorbar(points, ax=ax, ticks=np.arange(fewest, most + 1, 1))
cbar.set_label('Triton server instances', fontsize=14)

ax.set_xlabel('Total throughput [$s^{-1}$]', fontsize=14)
ax.set_ylabel('Queue time per request [$ms$]', fontsize=14)

# dashed reference line drawn at y = 400
ax.axhline(y=400, color='r', linestyle='--', label='Queue time threshold')
ax.legend(loc='best')
plt.show()

In [None]:
# Total throughput against the worker count (1 to the number of rows in
# `out`), coloured by the number of Triton instances in discrete bands.
instances = out[:, 2]
fewest, most = min(instances), max(instances)
worker_counts = np.arange(1, out.shape[0] + 1)

cmap = plt.cm.plasma
band_edges = np.arange(fewest - .5, most + 1.5, 1)
norm = colors.BoundaryNorm(band_edges, cmap.N)

fig, ax = plt.subplots()
points = ax.scatter(worker_counts, out[:, 0], c=instances, cmap=cmap, norm=norm)

cbar = fig.colorbar(points, ax=ax, ticks=np.arange(fewest, most + 1, 1))
cbar.set_label('Triton server instances', fontsize=14)

ax.set_xlabel('Number of workers', fontsize=14)
ax.set_ylabel('Total throughput [$s^{-1}$]', fontsize=14)
plt.show()

In [None]:
# Both axes are divided by the number of Triton instances: workers per
# instance on x, throughput per instance on y. Points are coloured by the
# instance count in discrete bands.
instances = out[:, 2]
fewest, most = min(instances), max(instances)
worker_counts = np.arange(1, out.shape[0] + 1)

cmap = plt.cm.plasma
band_edges = np.arange(fewest - .5, most + 1.5, 1)
norm = colors.BoundaryNorm(band_edges, cmap.N)

fig, ax = plt.subplots()
points = ax.scatter(worker_counts / instances, out[:, 0] / instances,
                    c=instances, cmap=cmap, norm=norm)

cbar = fig.colorbar(points, ax=ax, ticks=np.arange(fewest, most + 1, 1))
cbar.set_label('Triton server instances', fontsize=14)

ax.set_xlabel('Number of workers per server', fontsize=14)
ax.set_ylabel('Throughput per server [$s^{-1}$]', fontsize=14)
plt.show()