In [1]:
import csv
import random
import sys
import numpy as np
import pandas as pd
from pathlib import Path
import os
import csv

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torch

from sklearn.preprocessing import StandardScaler
import torch.autograd.profiler as profiler
from sklearn.model_selection import KFold

class ConfigStruct:
    def __init__(self, **entries):
        self.__dict__.update(entries)

In [2]:
KFOLD_SEEDS = [
    728841181, 
    879843057, 
    1155483495, 
    1159944860, 
    1309364699, 
    1379701443, 
    1392436736, 
    1474235857, 
    1801054430, 
    1812549005,
]

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using  {device}")

Using  cuda


In [4]:
config = dict(     
    shuffle=True,
    stratified_split=False,
    random_seed=1234,
    num_folds=5,
    test_size=0.2,
    dropout=0.05,  # must match training
    smooth_l1_loss_beta=1.0,
    learning_rate=0.008,
    weight_decay=1e-5,
    batch_size=4096,
    epochs=300,    
)

In [5]:
config = ConfigStruct(**config)

In [6]:
DATASET_DIR = "../data/"
DATASET_NAME = "blue_waters_posix_with_paths_no_outliers"
DATASET_PATH = Path(DATASET_DIR, DATASET_NAME).with_suffix(".csv")

CSV_LOG_PATH = "Blue_Waters.csv"

In [7]:
if not os.path.exists(CSV_LOG_PATH):
    with open(CSV_LOG_PATH, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["nprocs_filter", "test_loss", "kfold_seed", "fold"])

In [8]:
df = pd.read_csv(DATASET_PATH)
df

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,POSIX_F_MAX_READ_TIME,POSIX_F_MAX_WRITE_TIME,POSIX_F_FASTEST_RANK_TIME,POSIX_F_SLOWEST_RANK_TIME,start_time_sec,end_time_sec,nprocs,exe,bandwidth,path
0,1280,-1281,-1281,1807,0,0,2561,0,0,-1281,...,0.081013,0.000000,0.000000,0.000000,1540486184,1540486762,213,./Hsigma,0.883451,/hpcwork/noco0056/io_transfer_learning/data/bl...
1,168420,-5232,-5232,7505724,5401751,11345341,394840,0,1,-5232,...,0.316154,0.379682,0.000000,1.294190,1540414582,1540499232,80,./SpEC,126.224526,/hpcwork/noco0056/io_transfer_learning/data/bl...
2,2366,-2367,-2367,3342,0,0,4733,0,0,-2367,...,0.064347,0.000000,0.000000,0.000000,1540498498,1540499201,394,./Hsigma,3.233117,/hpcwork/noco0056/io_transfer_learning/data/bl...
3,8709,-288,-288,1521073,6331589,2013343,28430,0,0,-288,...,0.506723,0.303372,0.000000,56.737093,1540442491,1540443028,256,CCTM_v52_Linux2_x86_64intel,364.725531,/hpcwork/noco0056/io_transfer_learning/data/bl...
4,8709,-288,-288,1516465,8334307,2006943,28430,0,0,-288,...,0.354845,1.372775,0.000000,56.462348,1540441741,1540442347,256,CCTM_v52_Linux2_x86_64intel,365.198932,/hpcwork/noco0056/io_transfer_learning/data/bl...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724026,27899,-706,-706,824117,136902,632480,70722,0,1,-706,...,0.152006,0.266960,0.000000,0.914386,1506629880,1506630161,48,./SpEC,164.318747,/hpcwork/noco0056/io_transfer_learning/data/bl...
724027,12571,-1000,-1000,781645,355445,653876,80989,0,1,-1000,...,0.118279,0.305436,0.000000,0.752825,1506647712,1506651237,48,./SpEC,236.018095,/hpcwork/noco0056/io_transfer_learning/data/bl...
724028,244,-77,-77,3960,5,2645,499,0,0,-77,...,0.051202,0.000581,0.000234,0.197681,1506699687,1506699703,1,ApplyObservers -domaininput=GrDomain.input -No...,97.547204,/hpcwork/noco0056/io_transfer_learning/data/bl...
724029,9927,-890,-890,3640566,3018272,5706358,176916,0,1,-890,...,0.124667,0.484447,0.000000,0.715079,1506612023,1506695745,48,./SpEC,141.832395,/hpcwork/noco0056/io_transfer_learning/data/bl...


In [9]:
df = df[df["bandwidth"] != 0]
df

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,POSIX_F_MAX_READ_TIME,POSIX_F_MAX_WRITE_TIME,POSIX_F_FASTEST_RANK_TIME,POSIX_F_SLOWEST_RANK_TIME,start_time_sec,end_time_sec,nprocs,exe,bandwidth,path
0,1280,-1281,-1281,1807,0,0,2561,0,0,-1281,...,0.081013,0.000000,0.000000,0.000000,1540486184,1540486762,213,./Hsigma,0.883451,/hpcwork/noco0056/io_transfer_learning/data/bl...
1,168420,-5232,-5232,7505724,5401751,11345341,394840,0,1,-5232,...,0.316154,0.379682,0.000000,1.294190,1540414582,1540499232,80,./SpEC,126.224526,/hpcwork/noco0056/io_transfer_learning/data/bl...
2,2366,-2367,-2367,3342,0,0,4733,0,0,-2367,...,0.064347,0.000000,0.000000,0.000000,1540498498,1540499201,394,./Hsigma,3.233117,/hpcwork/noco0056/io_transfer_learning/data/bl...
3,8709,-288,-288,1521073,6331589,2013343,28430,0,0,-288,...,0.506723,0.303372,0.000000,56.737093,1540442491,1540443028,256,CCTM_v52_Linux2_x86_64intel,364.725531,/hpcwork/noco0056/io_transfer_learning/data/bl...
4,8709,-288,-288,1516465,8334307,2006943,28430,0,0,-288,...,0.354845,1.372775,0.000000,56.462348,1540441741,1540442347,256,CCTM_v52_Linux2_x86_64intel,365.198932,/hpcwork/noco0056/io_transfer_learning/data/bl...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724026,27899,-706,-706,824117,136902,632480,70722,0,1,-706,...,0.152006,0.266960,0.000000,0.914386,1506629880,1506630161,48,./SpEC,164.318747,/hpcwork/noco0056/io_transfer_learning/data/bl...
724027,12571,-1000,-1000,781645,355445,653876,80989,0,1,-1000,...,0.118279,0.305436,0.000000,0.752825,1506647712,1506651237,48,./SpEC,236.018095,/hpcwork/noco0056/io_transfer_learning/data/bl...
724028,244,-77,-77,3960,5,2645,499,0,0,-77,...,0.051202,0.000581,0.000234,0.197681,1506699687,1506699703,1,ApplyObservers -domaininput=GrDomain.input -No...,97.547204,/hpcwork/noco0056/io_transfer_learning/data/bl...
724029,9927,-890,-890,3640566,3018272,5706358,176916,0,1,-890,...,0.124667,0.484447,0.000000,0.715079,1506612023,1506695745,48,./SpEC,141.832395,/hpcwork/noco0056/io_transfer_learning/data/bl...


In [10]:
non_numeric_columns = df.select_dtypes(include='object').columns
df = df.drop(columns=non_numeric_columns)
df

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,POSIX_F_META_TIME,POSIX_TOTAL_TIME,POSIX_F_MAX_READ_TIME,POSIX_F_MAX_WRITE_TIME,POSIX_F_FASTEST_RANK_TIME,POSIX_F_SLOWEST_RANK_TIME,start_time_sec,end_time_sec,nprocs,bandwidth
0,1280,-1281,-1281,1807,0,0,2561,0,0,-1281,...,0.970416,0.970498,0.081013,0.000000,0.000000,0.000000,1540486184,1540486762,213,0.883451
1,168420,-5232,-5232,7505724,5401751,11345341,394840,0,1,-5232,...,1.555623,147.540409,0.316154,0.379682,0.000000,1.294190,1540414582,1540499232,80,126.224526
2,2366,-2367,-2367,3342,0,0,4733,0,0,-2367,...,0.487019,0.487127,0.064347,0.000000,0.000000,0.000000,1540498498,1540499201,394,3.233117
3,8709,-288,-288,1521073,6331589,2013343,28430,0,0,-288,...,5.596620,14389.735087,0.506723,0.303372,0.000000,56.737093,1540442491,1540443028,256,364.725531
4,8709,-288,-288,1516465,8334307,2006943,28430,0,0,-288,...,4.555102,14340.456718,0.354845,1.372775,0.000000,56.462348,1540441741,1540442347,256,365.198932
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724026,27899,-706,-706,824117,136902,632480,70722,0,1,-706,...,4.170172,36.350149,0.152006,0.266960,0.000000,0.914386,1506629880,1506630161,48,164.318747
724027,12571,-1000,-1000,781645,355445,653876,80989,0,1,-1000,...,2.337891,25.483536,0.118279,0.305436,0.000000,0.752825,1506647712,1506651237,48,236.018095
724028,244,-77,-77,3960,5,2645,499,0,0,-77,...,0.095079,0.289781,0.051202,0.000581,0.000234,0.197681,1506699687,1506699703,1,97.547204
724029,9927,-890,-890,3640566,3018272,5706358,176916,0,1,-890,...,8.410483,74.705777,0.124667,0.484447,0.000000,0.715079,1506612023,1506695745,48,141.832395


In [11]:
df = df.drop(["start_time_sec", "end_time_sec"], axis=1)

In [12]:
df = df.dropna(subset=["bandwidth"])
df

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,POSIX_F_READ_TIME,POSIX_F_WRITE_TIME,POSIX_F_META_TIME,POSIX_TOTAL_TIME,POSIX_F_MAX_READ_TIME,POSIX_F_MAX_WRITE_TIME,POSIX_F_FASTEST_RANK_TIME,POSIX_F_SLOWEST_RANK_TIME,nprocs,bandwidth
0,1280,-1281,-1281,1807,0,0,2561,0,0,-1281,...,0.000082,0.000000,0.970416,0.970498,0.081013,0.000000,0.000000,0.000000,213,0.883451
1,168420,-5232,-5232,7505724,5401751,11345341,394840,0,1,-5232,...,112.250101,33.734686,1.555623,147.540409,0.316154,0.379682,0.000000,1.294190,80,126.224526
2,2366,-2367,-2367,3342,0,0,4733,0,0,-2367,...,0.000108,0.000000,0.487019,0.487127,0.064347,0.000000,0.000000,0.000000,394,3.233117
3,8709,-288,-288,1521073,6331589,2013343,28430,0,0,-288,...,14358.530507,25.607961,5.596620,14389.735087,0.506723,0.303372,0.000000,56.737093,256,364.725531
4,8709,-288,-288,1516465,8334307,2006943,28430,0,0,-288,...,14303.609844,32.291772,4.555102,14340.456718,0.354845,1.372775,0.000000,56.462348,256,365.198932
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724026,27899,-706,-706,824117,136902,632480,70722,0,1,-706,...,32.179977,0.000000,4.170172,36.350149,0.152006,0.266960,0.000000,0.914386,48,164.318747
724027,12571,-1000,-1000,781645,355445,653876,80989,0,1,-1000,...,23.145645,0.000000,2.337891,25.483536,0.118279,0.305436,0.000000,0.752825,48,236.018095
724028,244,-77,-77,3960,5,2645,499,0,0,-77,...,0.193506,0.001196,0.095079,0.289781,0.051202,0.000581,0.000234,0.197681,1,97.547204
724029,9927,-890,-890,3640566,3018272,5706358,176916,0,1,-890,...,9.836940,56.458354,8.410483,74.705777,0.124667,0.484447,0.000000,0.715079,48,141.832395


In [13]:
random.seed(config.random_seed)
np.random.seed(config.random_seed)

torch.manual_seed(config.random_seed)
torch.cuda.manual_seed_all(config.random_seed)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

In [14]:
for seed in KFOLD_SEEDS:
    all_train_tensors = []  
    all_test_tensors = []

    kf = KFold(n_splits=config.num_folds, shuffle=True, random_state=seed)

    for train_idx, test_idx in kf.split(df):
        df_train = df.iloc[train_idx].copy()
        df_test = df.iloc[test_idx].copy()

        y_train = df_train.pop("bandwidth")
        y_test = df_test.pop("bandwidth")

        scaler = StandardScaler().fit(df_train)
        X_train_scaled = scaler.transform(df_train)
        X_test_scaled = scaler.transform(df_test)

        X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).pin_memory().to(device)
        y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1).pin_memory().to(device)

        X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).pin_memory().to(device)
        y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1).pin_memory().to(device)

        all_train_tensors.append((X_train_tensor, y_train_tensor))
        all_test_tensors.append((X_test_tensor, y_test_tensor))

In [15]:
def make_gpu_batches(X, y, batch_size, shuffle=True):
    if shuffle:
        indices = torch.randperm(X.size(0), device=device)
    else:
        indices = torch.arange(X.size(0), device=device)
    for i in range(0, X.size(0), batch_size):
        idx = indices[i:i + batch_size]
        yield X[idx], y[idx]

In [16]:
for seed in KFOLD_SEEDS:
    print(f"Seed {seed}")
    print("-------------------------------")
    
    loss_sum = 0
    for fold in range(config.num_folds):
        X_train, y_train = all_train_tensors[fold]
        X_test, y_test = all_test_tensors[fold]

        model = nn.Sequential(
            nn.Linear(97, 2048),
            nn.Dropout(p=config.dropout),
            nn.ReLU(),
            nn.Linear(2048, 512),
            nn.Dropout(p=config.dropout),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.Dropout(p=config.dropout),
            nn.ReLU(),
            nn.Linear(128, 1),
        ).to(device)
        
        loss_fn = nn.SmoothL1Loss(beta=config.smooth_l1_loss_beta, reduction="sum").to(device)
        optimizer = optim.Adamax(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min")

        test_losses = []
        for epoch in range(config.epochs):
            model.train()
            for batch_X, batch_y in make_gpu_batches(X_train, y_train, config.batch_size, shuffle=config.shuffle):
                pred = model(batch_X)
                loss = loss_fn(pred, batch_y) / len(batch_X)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            model.eval()
            total_loss = 0
            total_samples = 0
            
            with torch.no_grad():
                for batch_X, batch_y in make_gpu_batches(X_test, y_test, config.batch_size, shuffle=False):
                    pred = model(batch_X)
                    total_loss += loss_fn(pred, batch_y).item()
                    total_samples += batch_X.size(0)

            test_loss = total_loss / total_samples
            scheduler.step(test_loss)
            test_losses.append(test_loss)
        
        with open(CSV_LOG_PATH, mode='a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([config.stratified_split, test_loss, seed, fold])
        
        loss_sum += test_loss

    avg_loss = loss_sum / config.num_folds
    print(f"Avg loss: {avg_loss:>8f} \n")

Seed 728841181
-------------------------------
Avg loss: 20.510397 

Seed 879843057
-------------------------------
Avg loss: 20.293076 

Seed 1155483495
-------------------------------
Avg loss: 19.970452 

Seed 1159944860
-------------------------------
Avg loss: 20.281885 

Seed 1309364699
-------------------------------
Avg loss: 20.317348 

Seed 1379701443
-------------------------------
Avg loss: 20.280366 

Seed 1392436736
-------------------------------
Avg loss: 20.149692 

Seed 1474235857
-------------------------------
Avg loss: 19.731308 

Seed 1801054430
-------------------------------
Avg loss: 20.076987 

Seed 1812549005
-------------------------------
Avg loss: 19.870378 

