In [1]:
import ast
import datetime
import math
import os
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# Scikit-learn - preprocessing
from sklearn.preprocessing import (
    LabelEncoder,
    MinMaxScaler,
    StandardScaler
)


from sklearn.model_selection import KFold, cross_val_score, train_test_split

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score
)

from sklearn.ensemble import (
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    IsolationForest,
    RandomForestClassifier,
    RandomForestRegressor
)

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# XGBoost
!pip install xgboost
from xgboost import XGBClassifier



Defaulting to user installation because normal site-packages is not writeable


Coloring

In [2]:
from matplotlib.colors import LinearSegmentedColormap
def make_single_color_cmap(hex_color, name):
    """Build a two-stop colormap that fades from white to ``hex_color``.

    Parameters
    ----------
    hex_color : str
        Color used for the upper end of the map (e.g. '#CC6677').
    name : str
        Name given to the resulting colormap.

    Returns
    -------
    LinearSegmentedColormap
    """
    gradient_stops = ["#FFFFFF", hex_color]
    return LinearSegmentedColormap.from_list(name, gradient_stops)

# Base hue for each predicted metric; looked up by key when plotting.
colors = dict(
    cmap1='#CC6677',  # memory-utilization plots
    cmap2='#44AA99',  # GPU-utilization plots
    cmap3='#AA4499',  # power plots
)

In [5]:
import os

# The parquet path below is relative to the working directory, so print it first
# to make a FileNotFoundError easy to diagnose.
print(os.getcwd())
vasp_data = pd.read_parquet('../VASP_march_jobs.parquet')
print(f"Parsed {len(vasp_data)} job records")

# Series.min()/.max() skip NaN by default. The builtin min()/max() compare element
# by element and return order-dependent results when a column contains NaN.
print(f"Requested Memory: {vasp_data['req_mem_mb'].min()} - {vasp_data['req_mem_mb'].max()} MB")
print(f"Requested GPU: {vasp_data['req_gpus'].min()} - {vasp_data['req_gpus'].max()} GPUs")
print(f"Requested Number of Node: {vasp_data['req_node'].min()} - {vasp_data['req_node'].max()} nodes")
print(f"Requested CPU: {vasp_data['req_cpu'].min()} - {vasp_data['req_cpu'].max()} CPUs")
print(f"Average Power Utilization per Job: {vasp_data['avg_power'].min()} - {vasp_data['avg_power'].max()} W")



/global/u2/b/boztop/vasp-gpu-resource-data-analysis/notebooks


FileNotFoundError: [Errno 2] No such file or directory: '../VASP_march_jobs.parquet'

Encoding Categorical Columns

In [None]:
# Columns converted to integer codes so the tree model can consume them.
categorical_columns = ['User', 'JobName', 'Account', 'Category', 'Program']

# Work on a copy so the frame loaded from disk stays untouched.
df_ml = vasp_data.copy()

# One fitted encoder per column, kept so the codes can be mapped back to labels.
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    # Values are cast to str first, so missing entries become their own label.
    df_ml[name] = encoder.fit_transform(df_ml[name].astype(str))
    

Binning Resource Utilization

In [None]:
def bin_power(y):
    """Print the observed power range and bucket the values into four classes.

    Classes (right-inclusive edges): 0 = up to 100, 1 = (100, 150],
    2 = (150, 200], 3 = above 200. The result is cast to ``int``, so ``y`` must
    not contain missing values.
    """
    lowest, highest = min(y), max(y)
    print(f"Avg power range: {lowest} - {highest} W")

    edges = [-np.inf, 100, 150, 200, np.inf]
    class_ids = list(range(len(edges) - 1))
    binned = pd.cut(y, bins=edges, labels=class_ids)
    return binned.astype(int)

def bin_gpu(y):
    """Bucket GPU-utilization values into four integer classes.

    Classes (right-inclusive edges): 0 = up to 75, 1 = (75, 85],
    2 = (85, 95], 3 = above 95. The result is cast to ``int``, so ``y`` must
    not contain missing values.
    """
    edges = [-np.inf, 75, 85, 95, np.inf]
    class_ids = list(range(len(edges) - 1))
    binned = pd.cut(y, bins=edges, labels=class_ids)
    return binned.astype(int)

def bin_mem(y):
    """Bucket memory-utilization values into four integer classes.

    Classes (right-inclusive edges): 0 = up to 30, 1 = (30, 40],
    2 = (40, 50], 3 = above 50. The result is cast to ``int``, so ``y`` must
    not contain missing values.
    """
    edges = [-np.inf, 30, 40, 50, np.inf]
    class_ids = list(range(len(edges) - 1))
    binned = pd.cut(y, bins=edges, labels=class_ids)
    return binned.astype(int)

Classification Model for Power and Utilization Prediction

In [None]:
def train_predict_classifier(train_features, target_feature, df, title):
    """Train an XGBoost classifier on a binned target and plot its evaluation.

    Rows with a missing value in any used column are dropped, the continuous
    target is bucketed into four classes, and a stratified 80/20 split is made.
    The function then prints a classification report and shows a row-normalized
    confusion matrix and a bar chart of the ten most important features.

    Parameters
    ----------
    train_features : list of str
        Columns of ``df`` used as model inputs.
    target_feature : str
        One of 'avg_power', 'gpu_utilization_max' or 'mem_util_max'.
    df : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    title : str
        Title of the confusion-matrix figure.

    Returns
    -------
    None
        Results are printed and plotted. An unsupported target or an empty
        cleaned frame prints a hint and returns early.
    """
    # Validate the target before touching the frame, so an unknown name yields
    # the hint below instead of a KeyError from dropna().
    supported_targets = ('avg_power', 'gpu_utilization_max', 'mem_util_max')
    if target_feature not in supported_targets:
        print("Try avg_power, gpu_utilization_max, or mem_util_max.")
        return

    df_cleaned = df.dropna(subset=list(train_features) + [target_feature])
    if df_cleaned.empty:
        print("No rows left after dropping missing values; nothing to train on.")
        return

    X = df_cleaned[train_features]
    y = df_cleaned[target_feature]

    # Per-target binning, tick labels and color; colors come from the shared
    # `colors` mapping so the heatmap and the bar chart always agree.
    if target_feature == 'avg_power':
        y_binned = bin_power(y)
        label_names = [f'{y.min():.2g} - 100W', '100 - 150W', '150 - 200W', '> 200W']
        color_key = 'cmap3'
    elif target_feature == 'gpu_utilization_max':
        y_binned = bin_gpu(y)
        label_names = ['< 75 %', '75 - 85 %', '85 - 95 %', '> 95%']
        color_key = 'cmap2'
    else:  # 'mem_util_max' -- the only remaining supported target
        y_binned = bin_mem(y)
        label_names = ['< 30 %', '30 - 40 %', '40 - 50 %', '> 50%']
        color_key = 'cmap1'

    color = colors[color_key]
    cmap = make_single_color_cmap(color, color_key)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_binned, test_size=0.2, random_state=42, stratify=y_binned
    )

    class_labels = [0, 1, 2, 3]
    clf = XGBClassifier(
        objective='multi:softmax',
        num_class=len(class_labels),
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        eval_metric='mlogloss'
    )

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print("\n=== Classification Report ===")
    # Pass `labels` explicitly: without it, classification_report raises when
    # the data contains fewer classes than there are target_names (e.g. when a
    # bin is empty). zero_division=0 reports such classes as 0.
    report = classification_report(
        y_test,
        y_pred,
        labels=class_labels,
        target_names=label_names,
        digits=4,
        zero_division=0,
    )
    print(report)

    # Confusion matrix, normalized per true class (each row sums to 1).
    cm = confusion_matrix(y_test, y_pred, labels=class_labels, normalize='true')
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)
    fig, ax = plt.subplots(figsize=(8, 6))

    disp.plot(cmap=cmap, values_format=".2f", ax=ax, colorbar=True, text_kw={'fontsize': 16})
    # Style through the explicit Axes rather than pyplot's "current axes" state.
    ax.set_title(title, fontsize=20)
    ax.set_xlabel(ax.get_xlabel(), fontsize=18)
    ax.set_ylabel(ax.get_ylabel(), fontsize=18)
    ax.tick_params(axis='x', labelsize=16, labelrotation=45)
    ax.tick_params(axis='y', labelsize=16)
    fig.tight_layout()
    plt.show()

    # Feature importance: keep the ten largest, then reverse so the most
    # important feature is drawn at the top of the horizontal bar chart.
    top_features = pd.DataFrame({
        'feature': X_train.columns,
        'importance': clf.feature_importances_
    }).sort_values('importance', ascending=False).head(10)

    fig_imp, ax_imp = plt.subplots(figsize=(10, 6))
    ax_imp.barh(
        top_features['feature'][::-1],
        top_features['importance'][::-1],
        color=color
    )
    ax_imp.set_xlabel("Feature Importance")
    ax_imp.set_title("Top 10 Important Features (Classification)")
    fig_imp.tight_layout()
    plt.show()

Experiment #1: Slurm metrics as training features

In [None]:
# Experiment #1 inputs: the first four columns are among those label-encoded
# earlier in the notebook; the last two are used as-is.
train_features = [
    'User',
    'JobName',
    'Account',
    'Category',
    'req_node',
    'req_time',
]

In [None]:
# Experiment #1: classify binned average power.
train_predict_classifier(
    train_features=train_features,
    target_feature='avg_power',
    df=df_ml,
    title='Average Power Prediction',
)

In [None]:
# Experiment #1: classify binned maximum GPU memory utilization.
train_predict_classifier(
    train_features=train_features,
    target_feature='mem_util_max',
    df=df_ml,
    title='Maximum GPU Memory Utilization (%) Prediction',
)

In [None]:
# Experiment #1: classify binned maximum GPU utilization.
train_predict_classifier(
    train_features=train_features,
    target_feature='gpu_utilization_max',
    df=df_ml,
    title='Maximum GPU Utilization (%) Prediction',
)

Experiment #2: DCGM metrics added to the training feature list

In [None]:
# Experiment #2, target avg_power: the target itself is not listed as a feature.
train_features = [
    # same six columns as Experiment #1
    'User', 'JobName', 'Account', 'Category', 'req_node', 'req_time',
    # additional metrics for this experiment
    'gpu_utilization_max', 'mem_util_max',
    'sm_active_max', 'sm_occupancy_max',
    'AI_fp64', 'AI_tensor',
    'gpu_tif_merged', 'mem_tif_merged',
    'gpu_sif_inter_normalized', 'gpu_sif_intra_normalized',
    'mem_sif_inter_normalized', 'mem_sif_intra_normalized',
]
train_predict_classifier(
    train_features=train_features,
    target_feature='avg_power',
    df=df_ml,
    title='Average Power Prediction',
)

    

In [None]:
# Experiment #2, target mem_util_max: the target itself is not listed as a
# feature; avg_power takes its place in the list.
train_features = [
    # same six columns as Experiment #1
    'User', 'JobName', 'Account', 'Category', 'req_node', 'req_time',
    # additional metrics for this experiment
    'gpu_utilization_max', 'avg_power',
    'sm_active_max', 'sm_occupancy_max',
    'AI_fp64', 'AI_tensor',
    'gpu_tif_merged', 'mem_tif_merged',
    'gpu_sif_inter_normalized', 'gpu_sif_intra_normalized',
    'mem_sif_inter_normalized', 'mem_sif_intra_normalized',
]

train_predict_classifier(
    train_features=train_features,
    target_feature='mem_util_max',
    df=df_ml,
    title='Maximum GPU Memory Utilization (%) Prediction',
)


In [None]:
# Experiment #2, target gpu_utilization_max: the target itself is not listed as
# a feature; avg_power takes its place in the list.
train_features = [
    # same six columns as Experiment #1
    'User', 'JobName', 'Account', 'Category', 'req_node', 'req_time',
    # additional metrics for this experiment
    'avg_power', 'mem_util_max',
    'sm_active_max', 'sm_occupancy_max',
    'AI_fp64', 'AI_tensor',
    'gpu_tif_merged', 'mem_tif_merged',
    'gpu_sif_inter_normalized', 'gpu_sif_intra_normalized',
    'mem_sif_inter_normalized', 'mem_sif_intra_normalized',
]

train_predict_classifier(
    train_features=train_features,
    target_feature='gpu_utilization_max',
    df=df_ml,
    title='Maximum GPU Utilization (%) Prediction',
)
