In [1]:
import random
import pandas as pd
import numpy as np
from pathlib import Path
import tempfile

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torch

from functools import partial

from ray import tune, init
from ray.tune import Checkpoint, get_checkpoint
from ray.tune.schedulers import ASHAScheduler
import ray.cloudpickle as pickle
from ray.tune import run

import os
import json
import hashlib
import logging

In [2]:
MODEL_DIR = os.path.abspath("../models")
MODEL_PATH = os.path.join(MODEL_DIR, "SmoothL1Loss_fixed_Adamax_fewer_neurons_0.2_testSize_new_StandardScaler_2048_batch_0.05_dropout_pytorch_v1.12_raytunetest.tar")

DATASET_DIR = os.path.abspath("../data")
DATASET_PATH = os.path.join(DATASET_DIR, "blue_waters_posix_with_paths_no_negative_outliers_no_time_no_dups.csv")

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}")

Using cuda


In [4]:
config = {
    "epochs":100,
    "batch_size":tune.choice([512,1024,2048,4096]),
    "learning_rate":tune.loguniform(1e-4,1e-1),
    "l1":tune.choice([2 ** i for i in range(7,12)]),
    "l2":tune.choice([2 ** i for i in range(6,11)]),
    "l3":tune.choice([2 ** i for i in range(5,9)]),
    "weight_decay":tune.choice([ 1 / (10 ** i) for i in range(4,7)]), #1e-5
    #"dropout":tune.choice([ 1 / (5 * (10 ** i))]), #0.05
    "shuffle":True,
    "test_size":0.2,
    "split_seed":42,
    "random_seed":1234,
    "stratified_split":False,
    "smooth_l1_loss_beta":1,
    "model_path":MODEL_PATH,
    "device":device,
}

In [5]:
class Net(nn.Module):
    def __init__(self, l1=2048, l2=512, l3=128):
        super(Net, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(90, l1),
            #nn.Dropout(p=config["dropout"]),
            nn.ReLU(),
            nn.Linear(l1, l2),
            #nn.Dropout(p=config["dropout"]),
            nn.ReLU(),
            nn.Linear(l2, l3),
            #nn.Dropout(p=config["dropout"]),
            nn.ReLU(),
            nn.Linear(l3, 1)
        )

    def forward(self, x):
        return self.linear_relu_stack(x)

In [6]:
torch.manual_seed(config["random_seed"])
torch.cuda.manual_seed_all(config["random_seed"])

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [7]:
def load_data():
    # Fix seeds for reproducibility
    random.seed(config["random_seed"])
    np.random.seed(config["random_seed"])

    # Load the data
    df_blue_waters_posix = pd.read_csv(DATASET_PATH)

    # Drop column with application names
    df_blue_waters_posix = df_blue_waters_posix.drop(['path', 'exe'],axis=1)
    
    # Separate bandwidth from input features
    POSIX_TOTAL_TIME_df = df_blue_waters_posix.pop('POSIX_TOTAL_TIME')

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(df_blue_waters_posix,
                                                        POSIX_TOTAL_TIME_df,
                                                        test_size=config["test_size"],
                                                        random_state=config["split_seed"],
                                                        stratify=df_blue_waters_posix["nprocs"] if config["stratified_split"] else None)

    # Scale the input features
    scaler = StandardScaler().fit(X_train)

    X_train_scaled = scaler.transform(X_train)
    tensor_X_train = torch.Tensor(X_train_scaled)
    tensor_y_train = torch.Tensor(y_train.values).view(-1, 1)
    training_dataset = TensorDataset(tensor_X_train, tensor_y_train)
    
    X_test_scaled = scaler.transform(X_test)
    tensor_X_test = torch.Tensor(X_test_scaled)
    tensor_y_test = torch.Tensor(y_test.values).view(-1, 1)
    test_dataset = TensorDataset(tensor_X_test, tensor_y_test)

    return training_dataset, test_dataset

In [8]:
def bluewaters_train(config):
    model = Net().to(config["device"])
    # By default Pytorch returns avg loss per minibatch elements. But since the last batch
    # (both in training and test) does not have enough instances, sum all the loss across the batches
    # and then divide it by total number of elements in the the test set.
    loss_fn = nn.SmoothL1Loss(beta=config["smooth_l1_loss_beta"], reduction="sum").to(config["device"])
    optimizer = optim.Adamax(model.parameters(), lr=config["learning_rate"], weight_decay=config["weight_decay"])
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')    
    
    model_epoch = 0 
    checkpoint = get_checkpoint()
    if checkpoint:
        with checkpoint.as_directory() as checkpoint_dir:
            data_path = Path(config["model_path"]).with_suffix(".pkl")
            with open(data_path, "rb") as fp:
                checkpoint_state = pickle.load(fp)
            model_epoch = checkpoint_state["epoch"]
            model.load_state_dict(checkpoint_state["net_state_dict"])
            optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"])
            
            print(f"Current epoch: {model_epoch}")
    
    generator = torch.Generator().manual_seed(config["split_seed"])

    training_dataset, test_dataset = load_data()
    training_dataloader = DataLoader(training_dataset, batch_size=config["batch_size"], shuffle=config["shuffle"])
    test_dataloader = DataLoader(test_dataset, batch_size=config["batch_size"])
    
    model.train()

    for epoch in range(model_epoch, config["epochs"]):
        for (X, y) in training_dataloader:
            X, y = X.to(config["device"]), y.to(config["device"])  # Move batch to GPU
            y_pred = model(X)
            
            # Divide the summed loss by the number of elements in the current batch to get the average loss
            loss = loss_fn(y, y_pred) / len(X)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    
        model.train()

        model.eval()
        test_loss = 0
        with torch.no_grad():
            for X, y in test_dataloader:
                X, y = X.to(config["device"]), y.to(config["device"])
                pred = model(X)
                test_loss += loss_fn(pred, y).item() 

        # Divide the summed test loss by the number of elements in the whole test dataset to get the average loss
        test_loss /= len(test_dataloader.dataset)

        #print(f"Avg loss: {test_loss:>8f} \n")

        scheduler.step(test_loss)

        model_epoch = epoch  
        checkpoint_data = {
            "epoch": model_epoch,
            "net_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "scheduler_state_dict": scheduler.state_dict()
        }
        with tempfile.TemporaryDirectory() as checkpoint_dir:
            data_path = Path(config["model_path"]).with_suffix(".pkl")
            with open(data_path, "wb") as fp:
                pickle.dump(checkpoint_data, fp)

            checkpoint = Checkpoint.from_directory(MODEL_DIR)
            tune.report(
                {"loss": test_loss},
                checkpoint=checkpoint,
            )

In [9]:
scheduler = ASHAScheduler(
    metric="loss",
    mode="min",
    max_t=10,
    grace_period=1,
    reduction_factor=2,
)

In [10]:
def short_name_creator(trial):
    config_str = json.dumps(trial.config, sort_keys=True)
    config_hash = hashlib.sha1(config_str.encode()).hexdigest()
    return f"trial_{config_hash[:6]}"

In [12]:
result = tune.run(
    bluewaters_train,
    resources_per_trial={"cpu": 10, "gpu": 1},
    config=config,
    num_samples=10,
    scheduler=scheduler,
    trial_dirname_creator=short_name_creator,
    storage_path="file:///" + os.path.abspath("."),
    raise_on_failed_trial=True,
)

2025-06-12 14:36:14,138	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2025-06-12 14:47:20
Running for:,00:11:06.12
Memory:,18.6/31.7 GiB

Trial name,status,loc,batch_size,l1,l2,l3,learning_rate,weight_decay,iter,total time (s),loss
bluewaters_train_d402a_00000,TERMINATED,127.0.0.1:3212,512,128,512,128,0.000164508,0.0001,10,79.655,15135.1
bluewaters_train_d402a_00001,TERMINATED,127.0.0.1:3064,2048,512,128,128,0.00188418,1e-06,10,77.2103,11135.2
bluewaters_train_d402a_00002,TERMINATED,127.0.0.1:17060,512,1024,64,32,0.00951405,1e-05,10,74.6304,7070.28
bluewaters_train_d402a_00003,TERMINATED,127.0.0.1:2100,4096,256,128,256,0.00556988,0.0001,10,78.6289,7630.19
bluewaters_train_d402a_00004,TERMINATED,127.0.0.1:11964,4096,2048,1024,128,0.000916052,0.0001,1,14.217,15307.1
bluewaters_train_d402a_00005,TERMINATED,127.0.0.1:8968,1024,256,512,64,0.00256064,1e-06,8,57.4662,10867.4
bluewaters_train_d402a_00006,TERMINATED,127.0.0.1:14156,2048,1024,1024,64,0.000156338,0.0001,1,13.3562,17103.6
bluewaters_train_d402a_00007,TERMINATED,127.0.0.1:20272,512,1024,128,128,0.00971734,1e-05,10,77.0125,7321.48
bluewaters_train_d402a_00008,TERMINATED,127.0.0.1:9056,512,2048,64,128,0.010837,1e-06,10,75.7935,7056.64
bluewaters_train_d402a_00009,TERMINATED,127.0.0.1:5108,4096,512,256,32,0.0591053,0.0001,10,79.017,7353.73


Trial name,loss,should_checkpoint
bluewaters_train_d402a_00000,15135.1,True
bluewaters_train_d402a_00001,11135.2,True
bluewaters_train_d402a_00002,7070.28,True
bluewaters_train_d402a_00003,7630.19,True
bluewaters_train_d402a_00004,15307.1,True
bluewaters_train_d402a_00005,10867.4,True
bluewaters_train_d402a_00006,17103.6,True
bluewaters_train_d402a_00007,7321.48,True
bluewaters_train_d402a_00008,7056.64,True
bluewaters_train_d402a_00009,7353.73,True


2025-06-12 14:47:20,286	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'D:/Projects/IOTransferLearning/optimization/bluewaters_train_2025-06-12_14-36-14' in 0.0130s.
2025-06-12 14:47:20,302	INFO tune.py:1041 -- Total run time: 666.16 seconds (666.11 seconds for the tuning loop).


In [13]:
best_trial = result.get_best_trial("loss", "min", "last")
print(f"Best trial config: {best_trial.config}")
print(f"Best trial final validation loss: {best_trial.last_result['loss']}")

Best trial config: {'epochs': 100, 'batch_size': 512, 'learning_rate': 0.01083700425039336, 'l1': 2048, 'l2': 64, 'l3': 128, 'weight_decay': 1e-06, 'shuffle': True, 'test_size': 0.2, 'split_seed': 42, 'random_seed': 1234, 'stratified_split': False, 'smooth_l1_loss_beta': 1, 'model_path': 'D:\\Projects\\IOTransferLearning\\models\\SmoothL1Loss_fixed_Adamax_fewer_neurons_0.2_testSize_new_StandardScaler_2048_batch_0.05_dropout_pytorch_v1.12_raytunetest.tar', 'device': 'cuda'}
Best trial final validation loss: 7056.639858972459
