# Section 2: Node and Rack Hardware Overview

In [1]:
import pyspark.sql.functions as F
from util.read_and_print_df import *
from util.handle_nan import *
from util.extract_json_attributes import *
from util.plotting import *
import builtins

In [3]:
spark = get_spark_session()

Assigning 541 GB of memory per spark driver and executor, and use 126 cores.


In [5]:
df_node_hardware_info = spark.read.parquet(path_node_hardware_info)
df_node_hardware_info = get_gpu_node_col(df_node_hardware_info, 'node')
df_node_hardware_info.show(500, False)

df_prom = spark.read.parquet(path_node_dataset)
# we select only the nodes we also have Prometheus data for
df_prom_nodes = df_prom.select('node').distinct()
print(f"Select {df_prom_nodes.count()}/{df_node_hardware_info.count()} nodes with hardware info and Prometheus data")
df_node_hardware_info = df_node_hardware_info.join(df_prom_nodes, on='node', how='inner')

+------+-----------+-------------------------+--------------------+------------------+--------------+---------------------+---------+--------------------+----------------------+-------------+--------------------------------+--------------------------+-------------------------+----------------+------------+---------+----------------------+-------------+--------+
|node  |cpu_model  |cpu_core_count_per_socket|cpu_threads_per_core|cpu_tdp_per_socket|cpu_t_case_max|cpu_per_core_temp_max|cpu_count|cpu_core_count_total|cpu_memory_bytes_total|cpu_tdp_total|node_filesystem_size_bytes_total|gpu_model                 |gpu_memory_bytes_per_card|gpu_tdp_per_card|gpu_temp_max|gpu_count|gpu_memory_bytes_total|gpu_tdp_total|gpu_node|
+------+-----------+-------------------------+--------------------+------------------+--------------+---------------------+---------+--------------------+----------------------+-------------+--------------------------------+--------------------------+---------------------



Select 338/343 nodes with hardware info and Prometheus data


                                                                                

In [6]:
attributes = ['cpu_count', 'cpu_core_count_total', 'cpu_tdp_total', 'cpu_memory_bytes_total', 'node_filesystem_size_bytes_total', 'gpu_count', 'gpu_tdp_total', 'gpu_memory_bytes_total']

df_pd = df_node_hardware_info.toPandas()

                                                                                

In [7]:
df_pd_2 = df_pd.apply(lambda x: builtins.round(x / 1024**3) if 'bytes' in x.name else x)
df_pd_2 = df_pd_2.rename(columns=lambda x: x.replace('bytes', 'giga_bytes'))

df_pd_2['node_filesystem_size_giga_bytes_total'] = df_pd_2['node_filesystem_size_giga_bytes_total'] / 1024
df_pd_2 = df_pd_2.rename(columns={'node_filesystem_size_giga_bytes_total': 'node_filesystem_size_tera_bytes_total'})

In [8]:
print("CPU-only nodes:")
display(df_pd_2[df_pd_2["gpu_node"] == 0].describe())
print("Mode values:")
df_pd_2[df_pd_2["gpu_node"] == 0].drop('node', axis=1).mode()

CPU-only nodes:


Unnamed: 0,cpu_core_count_per_socket,cpu_threads_per_core,cpu_tdp_per_socket,cpu_t_case_max,cpu_per_core_temp_max,cpu_count,cpu_core_count_total,cpu_memory_giga_bytes_total,cpu_tdp_total,node_filesystem_size_tera_bytes_total,gpu_memory_giga_bytes_per_card,gpu_tdp_per_card,gpu_temp_max,gpu_count,gpu_memory_giga_bytes_total,gpu_tdp_total,gpu_node
count,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0
mean,13.658537,1.0,112.839721,83.783972,101.0,1.348432,16.864111,108.8223,144.233449,1.826975,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,4.125029,0.0,19.334254,4.639676,0.0,0.498799,5.479045,122.056704,37.513697,0.628778,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,8.0,1.0,85.0,77.0,101.0,1.0,16.0,96.0,125.0,1.739258,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8.0,1.0,85.0,77.0,101.0,1.0,16.0,96.0,125.0,1.739258,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,16.0,1.0,125.0,87.0,101.0,1.0,16.0,96.0,125.0,1.739258,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,16.0,1.0,125.0,87.0,101.0,2.0,16.0,96.0,170.0,1.811523,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,26.0,1.0,150.0,87.0,101.0,4.0,52.0,2048.0,500.0,11.882812,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Mode values:


Unnamed: 0,cpu_model,cpu_core_count_per_socket,cpu_threads_per_core,cpu_tdp_per_socket,cpu_t_case_max,cpu_per_core_temp_max,cpu_count,cpu_core_count_total,cpu_memory_giga_bytes_total,cpu_tdp_total,node_filesystem_size_tera_bytes_total,gpu_model,gpu_memory_giga_bytes_per_card,gpu_tdp_per_card,gpu_temp_max,gpu_count,gpu_memory_giga_bytes_total,gpu_tdp_total,gpu_node
0,gold_6130,16,1,125.0,87.0,101.0,1,16,96.0,125.0,1.739258,,0.0,0.0,0.0,0,0.0,0.0,0


In [9]:
print("GPU nodes:")
display(df_pd_2[df_pd_2["gpu_node"] > 0].describe())
print("Mode values:")
df_pd_2[df_pd_2["gpu_node"] > 0].drop('node', axis=1).mode()

GPU nodes:


Unnamed: 0,cpu_core_count_per_socket,cpu_threads_per_core,cpu_tdp_per_socket,cpu_t_case_max,cpu_per_core_temp_max,cpu_count,cpu_core_count_total,cpu_memory_giga_bytes_total,cpu_tdp_total,node_filesystem_size_tera_bytes_total,gpu_memory_giga_bytes_per_card,gpu_tdp_per_card,gpu_temp_max,gpu_count,gpu_memory_giga_bytes_total,gpu_tdp_total,gpu_node
count,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0
mean,10.117647,1.0,98.333333,80.176471,101.0,2.0,20.235294,296.156863,196.666667,2.34802,18.901961,268.235294,89.784314,3.882353,72.784314,1040.0,1.0
std,3.82438,0.0,11.775681,2.251274,0.0,0.0,7.64876,314.567854,23.551362,0.20732,6.410163,14.792685,0.986179,0.475271,25.886146,134.342845,0.0
min,6.0,1.0,85.0,78.0,101.0,2.0,12.0,192.0,170.0,0.912109,11.0,250.0,89.0,2.0,44.0,560.0,1.0
25%,6.0,1.0,85.0,78.0,101.0,2.0,12.0,192.0,170.0,2.366211,11.0,250.0,89.0,4.0,44.0,1000.0,1.0
50%,12.0,1.0,105.0,81.0,101.0,2.0,24.0,192.0,210.0,2.366211,24.0,280.0,89.0,4.0,96.0,1120.0,1.0
75%,12.0,1.0,105.0,81.0,101.0,2.0,24.0,256.0,210.0,2.373047,24.0,280.0,91.0,4.0,96.0,1120.0,1.0
max,20.0,1.0,125.0,87.0,101.0,2.0,40.0,1536.0,250.0,2.496094,24.0,280.0,91.0,4.0,96.0,1120.0,1.0


Mode values:


Unnamed: 0,cpu_model,cpu_core_count_per_socket,cpu_threads_per_core,cpu_tdp_per_socket,cpu_t_case_max,cpu_per_core_temp_max,cpu_count,cpu_core_count_total,cpu_memory_giga_bytes_total,cpu_tdp_total,node_filesystem_size_tera_bytes_total,gpu_model,gpu_memory_giga_bytes_per_card,gpu_tdp_per_card,gpu_temp_max,gpu_count,gpu_memory_giga_bytes_total,gpu_tdp_total,gpu_node
0,gold_5118,12,1,105.0,81.0,101.0,2,24,192.0,210.0,2.366211,NVIDIA_TITAN_RTX,24.0,280.0,89.0,4,96.0,1120.0,1


In [10]:
print("Total values of cluster:")
df_pd_2.drop(['node', 'cpu_model', 'gpu_model'], axis=1).sum()

Total values of cluster:


cpu_core_count_per_socket                 4436.00000
cpu_threads_per_core                       338.00000
cpu_tdp_per_socket                       37400.00000
cpu_t_case_max                           28135.00000
cpu_per_core_temp_max                    34138.00000
cpu_count                                  489.00000
cpu_core_count_total                      5872.00000
cpu_memory_giga_bytes_total              46336.00000
cpu_tdp_total                            51425.00000
node_filesystem_size_tera_bytes_total      644.09082
gpu_memory_giga_bytes_per_card             964.00000
gpu_tdp_per_card                         13680.00000
gpu_temp_max                              4579.00000
gpu_count                                  198.00000
gpu_memory_giga_bytes_total               3712.00000
gpu_tdp_total                            53040.00000
gpu_node                                    51.00000
dtype: float64

Generate tables and boxplots about hardware info on rack level.

In [11]:
df_node_hardware_info_rack = df_node_hardware_info.withColumn('rack', F.substring('node', 1, 3))
df_node_hardware_info_rack = df_node_hardware_info_rack.groupBy('rack').agg(
    F.count('node').alias('node_count_sum'),
    *[F.sum(a).alias(f'{a}_sum') for a in attributes],
    F.expr(f'percentile_approx(gpu_node, 0.5)').alias('gpu_rack')
    )

df_pd_rack = df_node_hardware_info_rack.toPandas()

In [12]:
df_pd_rack_2 = df_pd_rack.apply(lambda x: builtins.round(x / 1024**3) if 'bytes' in x.name else x)
df_pd_rack_2 = df_pd_rack_2.rename(columns=lambda x: x.replace('bytes', 'giga_bytes'))

df_pd_rack_2['node_filesystem_size_giga_bytes_total_sum'] = df_pd_rack_2['node_filesystem_size_giga_bytes_total_sum'] / 1024
df_pd_rack_2 = df_pd_rack_2.rename(columns={'node_filesystem_size_giga_bytes_total_sum': 'node_filesystem_size_tera_bytes_total_sum'})

In [13]:
print("CPU-only racks:")
display(df_pd_rack[df_pd_rack["gpu_rack"] == 0].describe())
print("Mode values:")
df_pd_rack_2[df_pd_rack_2["gpu_rack"] == 0].drop('rack', axis=1).mode()

CPU-only racks:


Unnamed: 0,node_count_sum,cpu_count_sum,cpu_core_count_total_sum,cpu_tdp_total_sum,cpu_memory_bytes_total_sum,node_filesystem_size_bytes_total_sum,gpu_count_sum,gpu_tdp_total_sum,gpu_memory_bytes_total_sum,gpu_rack
count,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
mean,26.090909,35.181818,440.0,3763.181818,3048646000000.0,52413320000000.0,0.0,0.0,0.0,0.0
std,11.255706,18.82986,142.076036,1434.716475,367533900000.0,17676970000000.0,0.0,0.0,0.0,0.0
min,1.0,4.0,48.0,500.0,2199023000000.0,13065630000000.0,0.0,0.0,0.0,0.0
25%,30.0,30.5,480.0,3812.5,3092376000000.0,58323570000000.0,0.0,0.0,0.0,0.0
50%,31.0,32.0,496.0,4000.0,3195456000000.0,59766750000000.0,0.0,0.0,0.0,0.0
75%,32.0,46.0,512.0,4550.0,3298535000000.0,61191950000000.0,0.0,0.0,0.0,0.0
max,32.0,62.0,512.0,5270.0,3298535000000.0,61758970000000.0,0.0,0.0,0.0,0.0


Mode values:


Unnamed: 0,node_count_sum,cpu_count_sum,cpu_core_count_total_sum,cpu_tdp_total_sum,cpu_memory_giga_bytes_total_sum,node_filesystem_size_tera_bytes_total_sum,gpu_count_sum,gpu_tdp_total_sum,gpu_memory_giga_bytes_total_sum,gpu_rack
0,32,32,512,4000.0,3072.0,55.65332,0,0.0,0.0,0


In [14]:
print("GPU racks:")
display(df_pd_rack[df_pd_rack["gpu_rack"] == 1].describe())
print("Mode values:")
df_pd_rack_2[df_pd_rack_2["gpu_rack"] == 1].drop('rack', axis=1).mode()

GPU racks:


Unnamed: 0,node_count_sum,cpu_count_sum,cpu_core_count_total_sum,cpu_tdp_total_sum,cpu_memory_bytes_total_sum,node_filesystem_size_bytes_total_sum,gpu_count_sum,gpu_tdp_total_sum,gpu_memory_bytes_total_sum,gpu_rack
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,5.1,10.2,103.2,1003.0,1621780000000.0,13167640000000.0,19.8,5304.0,398573000000.0,1.0
std,1.286684,2.573368,38.031566,239.678025,1295058000000.0,3636722000000.0,5.996295,1558.711719,166636400000.0,0.0
min,3.0,6.0,48.0,550.0,755914200000.0,6220811000000.0,10.0,2800.0,197568500000.0,1.0
25%,4.25,8.5,75.0,877.5,1030792000000.0,11378990000000.0,17.0,4610.0,264140500000.0,1.0
50%,5.0,10.0,108.0,1035.0,1202591000000.0,13011010000000.0,20.0,5600.0,371514700000.0,1.0
75%,5.75,11.5,120.0,1050.0,1597728000000.0,15002180000000.0,23.0,5900.0,515396100000.0,1.0
max,7.0,14.0,168.0,1470.0,5153961000000.0,18263520000000.0,28.0,7840.0,721554500000.0,1.0


Mode values:


Unnamed: 0,node_count_sum,cpu_count_sum,cpu_core_count_total_sum,cpu_tdp_total_sum,cpu_memory_giga_bytes_total_sum,node_filesystem_size_tera_bytes_total_sum,gpu_count_sum,gpu_tdp_total_sum,gpu_memory_giga_bytes_total_sum,gpu_rack
0,5,10,120,1050.0,960.0,11.833008,20,5600.0,480.0,1


In [16]:
df_slurm = spark.read.parquet(path_job_dataset)
print("Job Dataset Size:", df_slurm.count(), "x", len(df_slurm.columns), "Start Date:", df_slurm.select(F.min("start_date")).first()[0], "End Date:", df_slurm.select(F.max("end_date")).first()[0])
df_slurm.show(5, False)

df_prom = spark.read.parquet(path_node_dataset)
print("Node Dataset Size:", df_prom.count(), "x", len(df_prom.columns), "Start Date:", df_prom.select(F.min("timestamp")).first()[0], "End Date:", df_prom.select(F.max("timestamp")).first()[0])
df_prom.show(5, False)

df_combined = spark.read.parquet(path_job_node_joined_dataset)
print("Job-Node Dataset Size:", df_combined.count(), "x", len(df_combined.columns), "Start Date:", df_combined.select(F.min("timestamp")).first()[0], "End Date:", df_combined.select(F.max("timestamp")).first()[0])
df_combined.show(5, False)

Job Dataset Size: 1596963 x 9 Start Date: 2021-12-26 23:06:31 End Date: 2022-11-01 13:59:18
+-----+-------------------+-------------------+-------------------+------+---------+--------+--------+---------+
|id   |submit_date        |start_date         |end_date           |node  |nodetypes|numnodes|numcores|state    |
+-----+-------------------+-------------------+-------------------+------+---------+--------+--------+---------+
|58607|2022-01-14 17:39:50|2022-01-15 06:39:07|2022-01-15 14:30:47|r15n11|normal   |1       |16      |COMPLETED|
|58786|2022-01-14 17:39:50|2022-01-15 06:39:07|2022-01-15 16:27:53|r11n8 |normal   |1       |16      |COMPLETED|
|58669|2022-01-14 17:39:50|2022-01-15 06:41:37|2022-01-15 15:41:45|r14n23|normal   |1       |16      |COMPLETED|
|58548|2022-01-14 17:39:50|2022-01-15 06:42:37|2022-01-15 13:38:40|r12n3 |normal   |1       |16      |COMPLETED|
|58848|2022-01-14 17:39:50|2022-01-15 06:46:38|2022-01-15 17:11:13|r25n29|normal   |1       |16      |COMPLETED|
+---