In [1]:
# Standard library
import ast
import datetime
import math
import os
import re
import sys

# Data science and numerical computing
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Scientific computing
from scipy import stats

# Scikit-learn - preprocessing
from sklearn.preprocessing import (
    LabelEncoder,
    OrdinalEncoder,
    MinMaxScaler,
    StandardScaler
)

# Scikit-learn - model selection and evaluation
from sklearn.model_selection import (
    KFold,
    cross_val_score,
    cross_val_predict,
    train_test_split,
    GridSearchCV
)

# Scikit-learn - metrics
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    mean_absolute_error,
    mean_squared_error,
    mean_absolute_percentage_error,
    r2_score
)

# Scikit-learn - ensemble methods
from sklearn.ensemble import (
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    IsolationForest,
    RandomForestClassifier,
    RandomForestRegressor
)

# Scikit-learn - clustering and decomposition
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.svm import SVR

# Gradient boosting libraries
import xgboost as xgb
import lightgbm as lgb

# Deep learning
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping


import json



2026-01-26 15:21:29.234719: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-26 15:21:29.367831: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-26 15:21:29.409031: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-01-26 15:21:29.698266: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# The datasets used in this work are not publicly available.
# The procedure below can be applied to other systems and heterogeneous HPC applications.
# vasp_data = pd.read_parquet('../data/VASP.parquet')
# lammps_data = pd.read_parquet('../data/LAMMPS.parquet')
# espresso_data = pd.read_parquet('../data/ESPRESSO.parquet')
# atlas_data = pd.read_parquet('../data/ATLAS.parquet')
# e3sm_data = pd.read_parquet('../data/E3SM.parquet')
# NOTE: the sweep cells below also reference `chroma_data`, which is not loaded in this cell.


In [3]:
# Bayesian hyperparameter search using Weights & Biases (wandb) sweeps
import wandb
def param_sweep_lgb(input_data, dataset_name, count=30):
    """Run a Bayesian Weights & Biases sweep over LightGBM hyperparameters.

    For each of the targets ``gpu_utilization_max``, ``mem_util_max`` and
    ``avg_power`` an 80/20 train/test split is prepared once: rows with
    missing features or target are dropped, categorical features are
    ordinal-encoded, numeric features are standardised, and everything is
    cast to float32. Every sweep trial then fits one gamma-objective
    ``LGBMRegressor`` per target and logs the test-set metrics.

    Args:
        input_data: DataFrame with the columns ``User``, ``JobName``,
            ``Account``, ``Category``, ``req_node``, ``req_time`` and the
            three target columns. It is not modified.
        dataset_name: Label used to build the W&B project name
            ``"{dataset_name}_lgbm_sweep"``.
        count: Number of trials the sweep agent runs (default 30).

    Returns:
        The sweep id returned by ``wandb.sweep``.
    """
    train_features = ['User', 'JobName', 'Account', 'Category', 'req_node', 'req_time']
    cat_cols = ['User', 'JobName', 'Account', 'Category']
    num_cols = ['req_node', 'req_time']
    target_features = ['gpu_utilization_max', 'mem_util_max', 'avg_power']

    def prepare_split(target_feature):
        """Return float32 (X_train, X_test, y_train, y_test) for one target."""
        rows = input_data.dropna(subset=train_features + [target_feature])
        X_train, X_test, y_train, y_test = train_test_split(
            rows[train_features], rows[target_feature],
            test_size=0.2, random_state=42
        )
        X_train, X_test = X_train.copy(), X_test.copy()

        # Whole-column assignment replaces the columns (and their dtype).
        # `.loc[:, cols] = floats` instead tries to write floats into the
        # existing object/int columns, which recent pandas deprecates/rejects.
        encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        X_train[cat_cols] = encoder.fit_transform(X_train[cat_cols])
        X_test[cat_cols] = encoder.transform(X_test[cat_cols])

        scaler = StandardScaler()
        X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
        X_test[num_cols] = scaler.transform(X_test[num_cols])

        return (
            X_train.astype(np.float32), X_test.astype(np.float32),
            y_train.astype(np.float32), y_test.astype(np.float32),
        )

    # The splits do not depend on the sampled hyperparameters, so build them
    # once here instead of once per trial. Doing it before the sweep is
    # created also surfaces missing columns / empty data immediately.
    splits = {target: prepare_split(target) for target in target_features}

    # Define the sweep configuration
    sweep_config = {
        'method': 'bayes',
        'metric': {
            'name': 'mae',
            'goal': 'minimize'
        },
        'parameters': {
            'n_estimators': {'min': 100, 'max': 1000},
            'learning_rate': {'min': 0.01, 'max': 0.3},
            'max_depth': {'min': 3, 'max': 10},
            'num_leaves': {'min': 20, 'max': 150},
            'min_child_samples': {'min': 5, 'max': 50},
            'subsample': {'min': 0.6, 'max': 1.0},
            'colsample_bytree': {'min': 0.6, 'max': 1.0},
            'reg_alpha': {'min': 0.0, 'max': 1.0},
            'reg_lambda': {'min': 0.0, 'max': 1.0}
        }
    }

    sweep_id = wandb.sweep(sweep_config, project=f"{dataset_name}_lgbm_sweep")

    def train():
        """Fit one model per target with the sampled hyperparameters and log metrics."""
        with wandb.init() as run:
            config = run.config
            per_target = {}

            for target_feature, (X_train, X_test, y_train, y_test) in splits.items():
                # The gamma objective needs strictly positive labels, so
                # shift the training target slightly away from zero.
                shift = 1e-6
                y_train_gamma = y_train + shift

                reg_lgb = lgb.LGBMRegressor(
                    random_state=42,
                    n_estimators=int(config.n_estimators),
                    learning_rate=config.learning_rate,
                    max_depth=int(config.max_depth),
                    num_leaves=int(config.num_leaves),
                    min_child_samples=int(config.min_child_samples),
                    subsample=config.subsample,
                    # LightGBM only applies `subsample` (bagging_fraction)
                    # when the bagging frequency is non-zero; without this
                    # the swept `subsample` value has no effect.
                    subsample_freq=1,
                    colsample_bytree=config.colsample_bytree,
                    reg_alpha=config.reg_alpha,
                    reg_lambda=config.reg_lambda,
                    objective='gamma',
                    verbosity=-1
                )
                reg_lgb.fit(X_train, y_train_gamma)
                y_pred_lgb = reg_lgb.predict(X_test)

                # "accuracy" is the mean of min(pred/true, true/pred); the
                # small epsilon guards against division by zero.
                per_target[target_feature] = {
                    'mae': float(mean_absolute_error(y_test, y_pred_lgb)),
                    'r2': float(r2_score(y_test, y_pred_lgb)),
                    'accuracy': float(np.mean(np.minimum(
                        y_pred_lgb / (y_test + 1e-8), y_test / (y_pred_lgb + 1e-8)
                    ))),
                }

            # Logging `mae` once per target under the same key leaves the
            # sweep optimising a value that reflects a single target only.
            # Log per-target metrics under their own keys and the sweep
            # metric once, as the mean over all targets.
            # NOTE(review): the targets may be on different scales; if so a
            # normalised aggregate may be a better sweep objective.
            metrics = {
                f'{target}/{name}': value
                for target, scores in per_target.items()
                for name, value in scores.items()
            }
            for name in ('mae', 'r2', 'accuracy'):
                metrics[name] = float(np.mean([scores[name] for scores in per_target.values()]))
            run.log(metrics)

    wandb.agent(sweep_id, function=train, count=count)
    return sweep_id

# %%capture
# param_sweep_lgb(vasp_data, 'VASP')
# param_sweep_lgb(lammps_data, 'LAMMPS')
# param_sweep_lgb(chroma_data, 'CHROMA')
# param_sweep_lgb(espresso_data, 'ESPRESSO')
# param_sweep_lgb(atlas_data, 'ATLAS')
# param_sweep_lgb(e3sm_data, 'E3SM')


In [4]:
def param_sweep_gbr(input_data, dataset_name, count=30):
    """Run a Bayesian Weights & Biases sweep over GradientBoostingRegressor hyperparameters.

    For each of the targets ``gpu_utilization_max``, ``mem_util_max`` and
    ``avg_power`` an 80/20 train/test split is prepared once: rows with
    missing features or target are dropped, categorical features are
    ordinal-encoded, numeric features are standardised, and everything is
    cast to float32. Every sweep trial then fits one
    ``GradientBoostingRegressor`` per target and logs the test-set metrics.

    Args:
        input_data: DataFrame with the columns ``User``, ``JobName``,
            ``Account``, ``Category``, ``req_node``, ``req_time`` and the
            three target columns. It is not modified.
        dataset_name: Label used to build the W&B project name
            ``"{dataset_name}_gbr_sweep"``.
        count: Number of trials the sweep agent runs (default 30).

    Returns:
        The sweep id returned by ``wandb.sweep``.
    """
    train_features = ['User', 'JobName', 'Account', 'Category', 'req_node', 'req_time']
    cat_cols = ['User', 'JobName', 'Account', 'Category']
    num_cols = ['req_node', 'req_time']
    target_features = ['gpu_utilization_max', 'mem_util_max', 'avg_power']

    def prepare_split(target_feature):
        """Return float32 (X_train, X_test, y_train, y_test) for one target."""
        rows = input_data.dropna(subset=train_features + [target_feature])
        X_train, X_test, y_train, y_test = train_test_split(
            rows[train_features], rows[target_feature],
            test_size=0.2, random_state=42
        )
        X_train, X_test = X_train.copy(), X_test.copy()

        # Whole-column assignment replaces the columns (and their dtype).
        # `.loc[:, cols] = floats` instead tries to write floats into the
        # existing object/int columns, which recent pandas deprecates/rejects.
        encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        X_train[cat_cols] = encoder.fit_transform(X_train[cat_cols])
        X_test[cat_cols] = encoder.transform(X_test[cat_cols])

        scaler = StandardScaler()
        X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
        X_test[num_cols] = scaler.transform(X_test[num_cols])

        return (
            X_train.astype(np.float32), X_test.astype(np.float32),
            y_train.astype(np.float32), y_test.astype(np.float32),
        )

    # The splits do not depend on the sampled hyperparameters, so build them
    # once here instead of once per trial. Doing it before the sweep is
    # created also surfaces missing columns / empty data immediately.
    splits = {target: prepare_split(target) for target in target_features}

    # Define the sweep configuration
    sweep_config = {
        'method': 'bayes',
        'metric': {
            'name': 'mae',
            'goal': 'minimize'
        },
        'parameters': {
            'n_estimators': {'min': 100, 'max': 1000},
            'learning_rate': {'min': 0.01, 'max': 0.3},
            'max_depth': {'min': 2, 'max': 8},
            'min_samples_split': {'min': 2, 'max': 20},
            'min_samples_leaf': {'min': 1, 'max': 20},
            'subsample': {'min': 0.5, 'max': 1.0},
            'max_features': {
                'values': ['sqrt', 'log2', None]
            }
        }
    }

    sweep_id = wandb.sweep(sweep_config, project=f"{dataset_name}_gbr_sweep")

    def train():
        """Fit one model per target with the sampled hyperparameters and log metrics."""
        with wandb.init() as run:
            config = run.config
            per_target = {}

            for target_feature, (X_train, X_test, y_train, y_test) in splits.items():
                reg_gb = GradientBoostingRegressor(
                    random_state=42,
                    n_estimators=int(config.n_estimators),
                    learning_rate=config.learning_rate,
                    max_depth=int(config.max_depth),
                    min_samples_split=int(config.min_samples_split),
                    min_samples_leaf=int(config.min_samples_leaf),
                    subsample=config.subsample,
                    max_features=config.max_features
                )
                reg_gb.fit(X_train, y_train)
                y_pred_gb = reg_gb.predict(X_test)

                # "accuracy" is the mean of min(pred/true, true/pred); the
                # small epsilon guards against division by zero.
                per_target[target_feature] = {
                    'mae': float(mean_absolute_error(y_test, y_pred_gb)),
                    'r2': float(r2_score(y_test, y_pred_gb)),
                    'accuracy': float(np.mean(np.minimum(
                        y_pred_gb / (y_test + 1e-8), y_test / (y_pred_gb + 1e-8)
                    ))),
                }

            # Logging `mae` once per target under the same key leaves the
            # sweep optimising a value that reflects a single target only.
            # Log per-target metrics under their own keys and the sweep
            # metric once, as the mean over all targets.
            # NOTE(review): the targets may be on different scales; if so a
            # normalised aggregate may be a better sweep objective.
            metrics = {
                f'{target}/{name}': value
                for target, scores in per_target.items()
                for name, value in scores.items()
            }
            for name in ('mae', 'r2', 'accuracy'):
                metrics[name] = float(np.mean([scores[name] for scores in per_target.values()]))
            run.log(metrics)

    wandb.agent(sweep_id, function=train, count=count)
    return sweep_id

# %%capture
# param_sweep_gbr(vasp_data, 'VASP')
# param_sweep_gbr(lammps_data, 'LAMMPS')
# param_sweep_gbr(chroma_data, 'CHROMA')
# param_sweep_gbr(espresso_data, 'ESPRESSO')
# param_sweep_gbr(atlas_data, 'ATLAS')
# param_sweep_gbr(e3sm_data, 'E3SM')


In [5]:
def param_sweep_xgb(input_data, dataset_name, count=30):
    """Run a Bayesian Weights & Biases sweep over XGBoost hyperparameters.

    For each of the targets ``gpu_utilization_max``, ``mem_util_max`` and
    ``avg_power`` an 80/20 train/test split is prepared once: rows with
    missing features or target are dropped, categorical features are
    ordinal-encoded, numeric features are standardised, and everything is
    cast to float32. Every sweep trial then fits one ``reg:gamma``
    ``XGBRegressor`` per target and logs the test-set metrics.

    Args:
        input_data: DataFrame with the columns ``User``, ``JobName``,
            ``Account``, ``Category``, ``req_node``, ``req_time`` and the
            three target columns. It is not modified.
        dataset_name: Label used to build the W&B project name
            ``"{dataset_name}_xgb_sweep"``.
        count: Number of trials the sweep agent runs (default 30).

    Returns:
        The sweep id returned by ``wandb.sweep``.
    """
    train_features = ['User', 'JobName', 'Account', 'Category', 'req_node', 'req_time']
    cat_cols = ['User', 'JobName', 'Account', 'Category']
    num_cols = ['req_node', 'req_time']
    target_features = ['gpu_utilization_max', 'mem_util_max', 'avg_power']

    def prepare_split(target_feature):
        """Return float32 (X_train, X_test, y_train, y_test) for one target."""
        rows = input_data.dropna(subset=train_features + [target_feature])
        X_train, X_test, y_train, y_test = train_test_split(
            rows[train_features], rows[target_feature],
            test_size=0.2, random_state=42
        )
        X_train, X_test = X_train.copy(), X_test.copy()

        # Whole-column assignment replaces the columns (and their dtype).
        # `.loc[:, cols] = floats` instead tries to write floats into the
        # existing object/int columns, which recent pandas deprecates/rejects.
        encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        X_train[cat_cols] = encoder.fit_transform(X_train[cat_cols])
        X_test[cat_cols] = encoder.transform(X_test[cat_cols])

        scaler = StandardScaler()
        X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
        X_test[num_cols] = scaler.transform(X_test[num_cols])

        return (
            X_train.astype(np.float32), X_test.astype(np.float32),
            y_train.astype(np.float32), y_test.astype(np.float32),
        )

    # The splits do not depend on the sampled hyperparameters, so build them
    # once here instead of once per trial. Doing it before the sweep is
    # created also surfaces missing columns / empty data immediately.
    splits = {target: prepare_split(target) for target in target_features}

    # Define the sweep configuration
    sweep_config = {
        'method': 'bayes',
        'metric': {
            'name': 'mae',
            'goal': 'minimize'
        },
        'parameters': {
            'n_estimators': {'min': 100, 'max': 1000},
            'learning_rate': {'min': 0.01, 'max': 0.3},
            'max_depth': {'min': 2, 'max': 10},
            'subsample': {'min': 0.5, 'max': 1.0},
            'colsample_bytree': {'min': 0.5, 'max': 1.0},
            'gamma': {'min': 0.0, 'max': 5.0},
            'reg_alpha': {'min': 0.0, 'max': 1.0},
            'reg_lambda': {'min': 0.0, 'max': 1.0}
        }
    }

    sweep_id = wandb.sweep(sweep_config, project=f"{dataset_name}_xgb_sweep")

    def train():
        """Fit one model per target with the sampled hyperparameters and log metrics."""
        with wandb.init() as run:
            config = run.config
            per_target = {}

            for target_feature, (X_train, X_test, y_train, y_test) in splits.items():
                # The gamma objective needs strictly positive labels, so
                # shift the training target slightly away from zero.
                shift = 1e-6
                y_train_gamma = y_train + shift

                reg_xgb = xgb.XGBRegressor(
                    random_state=42,
                    n_estimators=int(config.n_estimators),
                    learning_rate=config.learning_rate,
                    max_depth=int(config.max_depth),
                    subsample=config.subsample,
                    colsample_bytree=config.colsample_bytree,
                    gamma=config.gamma,
                    reg_alpha=config.reg_alpha,
                    reg_lambda=config.reg_lambda,
                    objective='reg:gamma',
                    tree_method='hist'
                )
                reg_xgb.fit(X_train, y_train_gamma)
                y_pred_xgb = reg_xgb.predict(X_test)

                # "accuracy" is the mean of min(pred/true, true/pred); the
                # small epsilon guards against division by zero.
                per_target[target_feature] = {
                    'mae': float(mean_absolute_error(y_test, y_pred_xgb)),
                    'r2': float(r2_score(y_test, y_pred_xgb)),
                    'accuracy': float(np.mean(np.minimum(
                        y_pred_xgb / (y_test + 1e-8), y_test / (y_pred_xgb + 1e-8)
                    ))),
                }

            # Logging `mae` once per target under the same key leaves the
            # sweep optimising a value that reflects a single target only.
            # Log per-target metrics under their own keys and the sweep
            # metric once, as the mean over all targets.
            # NOTE(review): the targets may be on different scales; if so a
            # normalised aggregate may be a better sweep objective.
            metrics = {
                f'{target}/{name}': value
                for target, scores in per_target.items()
                for name, value in scores.items()
            }
            for name in ('mae', 'r2', 'accuracy'):
                metrics[name] = float(np.mean([scores[name] for scores in per_target.values()]))
            run.log(metrics)

    wandb.agent(sweep_id, function=train, count=count)
    return sweep_id


# %%capture
# param_sweep_xgb(vasp_data, 'VASP')
# param_sweep_xgb(lammps_data, 'LAMMPS')
# param_sweep_xgb(chroma_data, 'CHROMA')
# param_sweep_xgb(espresso_data, 'ESPRESSO')
# param_sweep_xgb(atlas_data, 'ATLAS')
# param_sweep_xgb(e3sm_data, 'E3SM')



In [6]:
def param_sweep_rf(input_data, dataset_name, count=30):
    """Run a Bayesian Weights & Biases sweep over RandomForestRegressor hyperparameters.

    For each of the targets ``gpu_utilization_max``, ``mem_util_max`` and
    ``avg_power`` an 80/20 train/test split is prepared once: rows with
    missing features or target are dropped, categorical features are
    ordinal-encoded, numeric features are standardised, and everything is
    cast to float32. Every sweep trial then fits one
    ``RandomForestRegressor`` per target and logs the test-set metrics.

    Args:
        input_data: DataFrame with the columns ``User``, ``JobName``,
            ``Account``, ``Category``, ``req_node``, ``req_time`` and the
            three target columns. It is not modified.
        dataset_name: Label used to build the W&B project name
            ``"{dataset_name}_rf_sweep"``.
        count: Number of trials the sweep agent runs (default 30).

    Returns:
        The sweep id returned by ``wandb.sweep``.
    """
    train_features = ['User', 'JobName', 'Account', 'Category', 'req_node', 'req_time']
    cat_cols = ['User', 'JobName', 'Account', 'Category']
    num_cols = ['req_node', 'req_time']
    target_features = ['gpu_utilization_max', 'mem_util_max', 'avg_power']

    def prepare_split(target_feature):
        """Return float32 (X_train, X_test, y_train, y_test) for one target."""
        rows = input_data.dropna(subset=train_features + [target_feature])
        X_train, X_test, y_train, y_test = train_test_split(
            rows[train_features], rows[target_feature],
            test_size=0.2, random_state=42
        )
        X_train, X_test = X_train.copy(), X_test.copy()

        # Whole-column assignment replaces the columns (and their dtype).
        # `.loc[:, cols] = floats` instead tries to write floats into the
        # existing object/int columns, which recent pandas deprecates/rejects.
        encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        X_train[cat_cols] = encoder.fit_transform(X_train[cat_cols])
        X_test[cat_cols] = encoder.transform(X_test[cat_cols])

        scaler = StandardScaler()
        X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
        X_test[num_cols] = scaler.transform(X_test[num_cols])

        return (
            X_train.astype(np.float32), X_test.astype(np.float32),
            y_train.astype(np.float32), y_test.astype(np.float32),
        )

    # The splits do not depend on the sampled hyperparameters, so build them
    # once here instead of once per trial. Doing it before the sweep is
    # created also surfaces missing columns / empty data immediately.
    splits = {target: prepare_split(target) for target in target_features}

    # Define the sweep configuration
    sweep_config = {
        'method': 'bayes',
        'metric': {
            'name': 'mae',
            'goal': 'minimize'
        },
        'parameters': {
            'n_estimators': {'min': 100, 'max': 1000},
            # Integer range only: this sweep never samples None (unlimited depth).
            'max_depth': {'min': 2, 'max': 20},
            'min_samples_split': {'min': 2, 'max': 20},
            'min_samples_leaf': {'min': 1, 'max': 20},
            'max_features': {'values': ['sqrt', 'log2', None]}
        }
    }

    sweep_id = wandb.sweep(sweep_config, project=f"{dataset_name}_rf_sweep")

    def train():
        """Fit one model per target with the sampled hyperparameters and log metrics."""
        with wandb.init() as run:
            config = run.config
            per_target = {}

            for target_feature, (X_train, X_test, y_train, y_test) in splits.items():
                reg_rf = RandomForestRegressor(
                    random_state=42,
                    n_estimators=int(config.n_estimators),
                    # The None guard is kept so the config could later be
                    # extended with an unlimited-depth option.
                    max_depth=int(config.max_depth) if config.max_depth is not None else None,
                    min_samples_split=int(config.min_samples_split),
                    min_samples_leaf=int(config.min_samples_leaf),
                    max_features=config.max_features,
                    bootstrap=True,
                    n_jobs=-1
                )
                reg_rf.fit(X_train, y_train)
                y_pred_rf = reg_rf.predict(X_test)

                # "accuracy" is the mean of min(pred/true, true/pred); the
                # small epsilon guards against division by zero.
                per_target[target_feature] = {
                    'mae': float(mean_absolute_error(y_test, y_pred_rf)),
                    'r2': float(r2_score(y_test, y_pred_rf)),
                    'accuracy': float(np.mean(np.minimum(
                        y_pred_rf / (y_test + 1e-8), y_test / (y_pred_rf + 1e-8)
                    ))),
                }

            # Logging `mae` once per target under the same key leaves the
            # sweep optimising a value that reflects a single target only.
            # Log per-target metrics under their own keys and the sweep
            # metric once, as the mean over all targets.
            # NOTE(review): the targets may be on different scales; if so a
            # normalised aggregate may be a better sweep objective.
            metrics = {
                f'{target}/{name}': value
                for target, scores in per_target.items()
                for name, value in scores.items()
            }
            for name in ('mae', 'r2', 'accuracy'):
                metrics[name] = float(np.mean([scores[name] for scores in per_target.values()]))
            run.log(metrics)

    wandb.agent(sweep_id, function=train, count=count)
    return sweep_id


# %%capture
# param_sweep_rf(vasp_data, 'VASP')
# param_sweep_rf(lammps_data, 'LAMMPS')
# param_sweep_rf(chroma_data, 'CHROMA')
# param_sweep_rf(espresso_data, 'ESPRESSO')
# param_sweep_rf(atlas_data, 'ATLAS')
# param_sweep_rf(e3sm_data, 'E3SM')



In [None]:
# After the parameter sweeps have run, the best parameters are read off the W&B results and saved to model_params.json.
# Results of runs: https://wandb.ai/boztop-boston-university/E3SM_rf_sweep/reports/Paper-experiments--VmlldzoxNTc1NDc0Nw?accessToken=5dnb2oh8hi0ch9trnl9hn8fk5borccd66pidf9vmule1go9vsw9kavkfva1g7zel
