In [None]:
# from kubernetes_asyncio import client, config
import kubernetes_asyncio as kube
import asyncio
import logging
# Root logger (getLogger is called without a name).
log = logging.getLogger()

In [12]:
async def list_pods_once(namespace='cvlab'):
	"""
	List the pods of one namespace and print their names on a single line.

	Args:
		namespace: Kubernetes namespace to query; defaults to 'cvlab'.

	Returns:
		The response of `CoreV1Api.list_namespaced_pod`; the pods are in its `.items`.
	"""
	# Called without arguments, so the kube config is read from its default location.
	await kube.config.load_kube_config()

	# The context manager closes the client's HTTP session on exit; constructing
	# CoreV1Api() without an explicit client leaves that session open after every call.
	async with kube.client.ApiClient() as api_client:
		api = kube.client.CoreV1Api(api_client)
		pod_list_response = await api.list_namespaced_pod(namespace)

	print(' '.join(f'{pod.metadata.name}' for pod in pod_list_response.items))

	return pod_list_response
	
# Jupyter/IPython allows `await` at the top level of a cell; in a plain script the
# coroutine would have to be driven by an event loop, e.g. asyncio.run(list_pods_once()).
pod_list = await list_pods_once()

# Configs can be set in the Configuration class directly or using a helper
# utility. If no argument is provided (as in list_pods_once), the config is
# loaded from the default location.


bedn bedn2 bedn3 benlalahpod.com guillard-sinkhorn-gridnet katircio-imagenet katircio-imagenetdetector kicirogl-airsim-2 mishchuk-cvpr mishchuk-cvpr2 mv-cvpr3 nakka-advseg14 nakka-advseg20 nakka-advseg22 oner-rt-fulldata-40 oner-rt-fulldata-40-3 rbermude-multiflow remelli-mvp sguo-pytorch-imagenet-iclr tyszkiew-apexpod vidit-maskrcnn-seg-res50-2 wickrama-experiments-1 wickrama-experiments-2 wickrama-experiments-3 wwang-imagenet0 wwang-imagenet1 wwang-imagenet2


In [3]:
# Pick the first pod returned by the listing and announce which one it is.
pod_to_test = pod_list.items[0]
print(f'Testing on pod [{pod_to_test.metadata.name}]')

Testing on pod [bedn]


### GPU status with nvidia-smi

Example queries: <https://nvidia.custhelp.com/app/answers/detail/a_id/3751/~/useful-nvidia-smi-queries>  
List of fields: <https://briot-jerome.developpez.com/fichiers/blog/nvidia-smi/list.txt>

`nvidia-smi --format=csv --loop=2 --query-gpu=index,utilization.gpu,utilization.memory,memory.used,memory.total`

Running it inside a pod with `kubectl exec` (replace `container` with the pod name):  
`kubectl exec -it container -- /usr/bin/nvidia-smi --format=csv --loop=2 --query-gpu=index,utilization.gpu,utilization.memory,memory.used,memory.total`

Limit how long a command runs with `timeout` (duration in seconds):  
`timeout 10 something`  
`timeout 30 nvidia-smi --format=csv --loop=3 --query-gpu=index,utilization.gpu,utilization.memory,memory.used,memory.total`  


In [None]:
# api = kube.client.CoreV1Api()
# api.connect_get_namespaced_pod_exec?

### Executing inside a container

Example: <https://github.com/tomplus/kubernetes_asyncio/blob/master/examples/example3.py>

In [None]:
# Columns requested from nvidia-smi; this order is also the order of the CSV columns.
GPU_QUERY_FIELDS = [
	'index',
	'utilization.gpu', 'utilization.memory',
	'memory.used', 'memory.total',
]

# argv of the sampling command: nvidia-smi prints CSV repeatedly (--loop=1) and
# is wrapped in `timeout 5` so that it stops instead of looping forever.
GPU_QUERY_CMD = ['/usr/bin/timeout', '5', '/usr/bin/nvidia-smi', '--format=csv', '--loop=1']
GPU_QUERY_CMD.append('--query-gpu=' + ','.join(GPU_QUERY_FIELDS))

print(GPU_QUERY_CMD)

async def query_gpu_status(pod_name, namespace='cvlab'):
	"""
	Run the nvidia-smi sampling command (GPU_QUERY_CMD) inside a pod.

	Args:
		pod_name: name of the pod to exec into.
		namespace: namespace the pod lives in; defaults to 'cvlab'.

	Returns:
		The awaited result of `connect_get_namespaced_pod_exec`. Callers in this
		notebook parse it as CSV text, so it is presumably the command's captured
		stdout/stderr as a string (TODO confirm against the client library).
	"""
	# Called without arguments, so the kube config is read from its default location.
	await kube.config.load_kube_config()

	# Exec goes through the websocket-capable client. The context manager closes
	# its session on exit; otherwise one session is left open per pod per query.
	async with kube.stream.WsApiClient() as ws_client:
		api_ws = kube.client.CoreV1Api(api_client=ws_client)

		response = await api_ws.connect_get_namespaced_pod_exec(
			name=pod_name,
			namespace=namespace,
			command=GPU_QUERY_CMD,
			stderr=True,
			stdin=False,
			stdout=True,
			tty=False,
		)

	return response

In [None]:
# import numpy as np
# from io import StringIO
# np.loadtxt(StringIO(r), skiprows=1, delimiter=',', names=GPU_QUERY_FIELDS, converters={k.replace('.', ''): v for k, v in GPU_QUERY_PROCESSORS.items()})

In [None]:
import numpy as np
from io import StringIO

def process_row_percent(val):
	"""
	Convert a percentage cell such as '45 %' into a fraction.

	Only the first whitespace-separated token is read; it is parsed as a float
	and multiplied by 0.01.
	"""
	number_token = val.split()[0]
	return float(number_token) * 0.01

def process_row_mem(val):
	"""
	Convert a memory cell such as '1024 MiB' into a float.

	Only the first whitespace-separated token is read; the unit after it is ignored.
	"""
	tokens = val.split()
	return float(tokens[0])

# Converter applied to the raw text of each nvidia-smi column, keyed by field name:
# the GPU index is an integer, utilizations are percentages turned into fractions,
# and memory figures are plain numbers with their unit dropped.
GPU_QUERY_PROCESSORS = {
	'index': int,
	**dict.fromkeys(('utilization.gpu', 'utilization.memory'), process_row_percent),
	**dict.fromkeys(('memory.used', 'memory.total'), process_row_mem),
}


# def process_nvidiasmi_line(report_line):
# 	try:
# 		return {
# 			field: GPU_QUERY_PROCESSORS[field](value.strip())
# 			for field, value in zip(GPU_QUERY_FIELDS, report_line.split(','))
# 		}
# 	except Exception as e:
# 		print(f'{e} in [{report_line}]')

# def process_nvidiasmi_report(report_str):
	
# 	report_lines = report_str.split('\n')
	
# 	report_data = [
# 		process_nvidiasmi_line(line) for line in report_lines[1:]
# 	]
	
# 	return report_data
	
# process_nvidiasmi_report(r)
	

def process_nvidiasmi_report(report_txt):
	"""
	Parse the CSV text of an nvidia-smi query and average it over all rows.

	Args:
		report_txt: CSV text whose first line is a header (skipped) and whose
			columns follow the order of GPU_QUERY_FIELDS.

	Returns:
		dict with
		- report_numeric: the parsed structured array,
		- mem_relative: mean of memory.used / memory.total over the rows,
		- gpu_util: mean of utilization.gpu over the rows.
	"""
	parse_options = {
		'delimiter': ',',
		'autostrip': True,
		'skip_header': 1,  # drop the CSV header; names are supplied explicitly
		'names': GPU_QUERY_FIELDS,
		'deletechars': '',  # keep the dots in the field names
		'dtype': None,  # prevent casting every column to float
		'converters': GPU_QUERY_PROCESSORS,
	}
	samples = np.genfromtxt(StringIO(report_txt), **parse_options)

	memory_fraction = samples['memory.used'] / samples['memory.total']

	return {
		'report_numeric': samples,
		'mem_relative': np.mean(memory_fraction),
		'gpu_util': np.mean(samples['utilization.gpu']),
	}

# print(reports[0]['report_txt'])
# process_nvidiasmi_report(reports[0]['report_txt'])

In [46]:


async def collect_pod_utilization(pod_name):
	"""
	Query one pod for its GPU status and parse the answer.

	Never raises for a failing pod: any exception is logged and stored under
	the 'error' key instead.

	Returns:
		dict that always has 'name'; 'report_txt' once the query returned; the keys
		of process_nvidiasmi_report once parsing succeeded; 'error' on failure.
	"""
	entry = {'name': pod_name}

	try:
		raw_report = await query_gpu_status(pod_name)
		# Stored before parsing so the raw text survives a parse failure.
		entry['report_txt'] = raw_report
		entry.update(process_nvidiasmi_report(raw_report))
	except Exception as e:
		log.error(f'Error reading pod {pod_name}: {e}')
		entry['error'] = str(e)

	return entry


async def collect_pod_utilization_all():
	"""
	Collect the utilization report of every pod whose phase is 'Running'.

	The pods are queried concurrently.

	Returns:
		list of the dicts produced by collect_pod_utilization, one per running pod.
	"""
	listing = await list_pods_once()

	running_names = []
	for pod in listing.items:
		if pod.status.phase == 'Running':
			running_names.append(pod.metadata.name)

	queries = (collect_pod_utilization(pod_name) for pod_name in running_names)
	return await asyncio.gather(*queries)
	
# Top-level await (supported in Jupyter cells): one report dict per queried pod.
reports = await collect_pod_utilization_all()
	

bedn bedn2 bedn3 benlalahpod.com guillard-sinkhorn-gridnet katircio-imagenet katircio-imagenetdetector kicirogl-airsim-2 mishchuk-cvpr mishchuk-cvpr2 mv-cvpr3 nakka-advseg14 nakka-advseg20 nakka-advseg22 oner-rt-fulldata-40 oner-rt-fulldata-40-3 rbermude-multiflow remelli-mvp sguo-pytorch-imagenet-iclr tyszkiew-apexpod vidit-maskrcnn-seg-res50-2 wickrama-experiments-1 wickrama-experiments-2 wickrama-experiments-3 wwang-imagenet0 wwang-imagenet1 wwang-imagenet2


In [49]:
import pandas

def display_reports(reports):
	"""
	Summarize per-pod report dicts as a table.

	Args:
		reports: list of dicts, one per pod.

	Returns:
		pandas.DataFrame restricted to the columns name, gpu_util and mem_relative;
		other keys are dropped and keys missing from a report show up as NaN.
	"""
	summary_columns = ['name', 'gpu_util', 'mem_relative']
	return pandas.DataFrame(reports, columns=summary_columns)
	
# 	names = [r['name'] for r in reports]
	
# 	util_table = pandas.DataFrame(dict(
# 		name = ,
# 		value = np.arange(3),
# 	))

# Build the summary table; the trailing expression is what the cell renders,
# ordered by relative memory use (ascending, the sort_values default).
tab = display_reports(reports)
tab.sort_values(by='mem_relative')

Unnamed: 0,name,gpu_util,mem_relative
9,mv-cvpr3,0.0,0.0
11,nakka-advseg20,0.0,0.0
17,sguo-pytorch-imagenet-iclr,0.0,0.0
15,rbermude-multiflow,0.0,0.0
19,wickrama-experiments-1,0.986,0.141157
4,guillard-sinkhorn-gridnet,0.992,0.141895
10,nakka-advseg14,0.0,0.161489
18,vidit-maskrcnn-seg-res50-2,0.0,0.174039
5,katircio-imagenet,0.334,0.254383
6,katircio-imagenetdetector,0.23,0.254383
