In [1]:
import random
import numpy as np
import pandas as pd
import os
from pathlib import Path
import csv

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torch

from sklearn.preprocessing import StandardScaler
import torch.autograd.profiler as profiler
from sklearn.model_selection import KFold

class ConfigStruct:
    def __init__(self, **entries):
        self.__dict__.update(entries)

In [2]:
KFOLD_SEEDS = [
    728841181, 
    879843057, 
    1155483495, 
    1159944860, 
    1309364699, 
    1379701443, 
    1392436736, 
    1474235857, 
    1801054430, 
    1812549005,
]

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}")

Using cuda


In [4]:
config = dict(
    epochs=300,
    batch_size=4096,
    learning_rate=0.008,
    weight_decay=1e-5,
    dropout=0.05,
    shuffle=True,
    nprocs_filter=True,
    nprocs="4_16_48_64_144_240",
    num_folds=5,
    test_size=0.2,
    random_seed=1234,
    stratified_split=True,
    smooth_l1_loss_beta=1.0
)

In [5]:
config = ConfigStruct(**config)

In [6]:
DATASET_DIR = "../data/"
DATASET_NAME = "blue_waters_posix_no_outliers_4_16_48_64_144_240_nprocs"
DATASET_PATH = Path(DATASET_DIR, DATASET_NAME).with_suffix(".csv")

CSV_LOG_PATH = "Filtered_by_NProcs.csv"

In [7]:
if not os.path.exists(CSV_LOG_PATH):
    with open(CSV_LOG_PATH, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["nprocs_filter", "test_loss", "kfold_seed"])

In [8]:
df = pd.read_csv(DATASET_PATH)
df

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,POSIX_F_READ_TIME,POSIX_F_WRITE_TIME,POSIX_F_META_TIME,POSIX_TOTAL_TIME,POSIX_F_MAX_READ_TIME,POSIX_F_MAX_WRITE_TIME,POSIX_F_FASTEST_RANK_TIME,POSIX_F_SLOWEST_RANK_TIME,nprocs,bandwidth
0,4870,-792,-792,216275,135216,205111,15951,0,1,-792,...,25.046999,0.000000,1.629810,26.676809,0.094352,0.172894,0.0,0.735744,48,46.554144
1,47864,-912,-912,1632215,145357,1064440,129731,0,1,-912,...,51.351125,0.000000,13.179558,64.530683,0.231165,0.196214,0.0,2.666620,48,352.539247
2,6918,-897,-897,3414728,2916487,5434114,109523,0,1,-897,...,37.727463,22.408341,5.001379,65.137183,0.312966,0.417510,0.0,0.429495,48,115.303713
3,6751,-874,-874,1081394,853013,1493947,61126,0,1,-874,...,19.203549,0.000000,1.630348,20.833897,0.164479,0.295684,0.0,0.598302,48,232.904128
4,77666,-748,-748,2943105,296788,1873612,224671,0,1,-748,...,70.755883,0.000000,11.499421,82.255303,0.197717,0.423445,0.0,1.916454,48,587.173510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349815,25004,-1288,-1288,3908646,3326772,5245647,468926,0,1,-1288,...,13.594173,93.287188,50.226209,157.107570,0.540237,0.333199,0.0,0.974784,48,117.217110
349816,4870,-800,-800,216509,133294,204742,15981,0,1,-800,...,27.897124,0.000000,1.937185,29.834308,0.100729,1.024560,0.0,0.720533,48,37.922704
349817,27899,-706,-706,824117,136902,632480,70722,0,1,-706,...,32.179977,0.000000,4.170172,36.350149,0.152006,0.266960,0.0,0.914386,48,164.318747
349818,12571,-1000,-1000,781645,355445,653876,80989,0,1,-1000,...,23.145645,0.000000,2.337891,25.483536,0.118279,0.305436,0.0,0.752825,48,236.018095


In [9]:
df = df[df["bandwidth"] != 0]
df

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,POSIX_F_READ_TIME,POSIX_F_WRITE_TIME,POSIX_F_META_TIME,POSIX_TOTAL_TIME,POSIX_F_MAX_READ_TIME,POSIX_F_MAX_WRITE_TIME,POSIX_F_FASTEST_RANK_TIME,POSIX_F_SLOWEST_RANK_TIME,nprocs,bandwidth
0,4870,-792,-792,216275,135216,205111,15951,0,1,-792,...,25.046999,0.000000,1.629810,26.676809,0.094352,0.172894,0.0,0.735744,48,46.554144
1,47864,-912,-912,1632215,145357,1064440,129731,0,1,-912,...,51.351125,0.000000,13.179558,64.530683,0.231165,0.196214,0.0,2.666620,48,352.539247
2,6918,-897,-897,3414728,2916487,5434114,109523,0,1,-897,...,37.727463,22.408341,5.001379,65.137183,0.312966,0.417510,0.0,0.429495,48,115.303713
3,6751,-874,-874,1081394,853013,1493947,61126,0,1,-874,...,19.203549,0.000000,1.630348,20.833897,0.164479,0.295684,0.0,0.598302,48,232.904128
4,77666,-748,-748,2943105,296788,1873612,224671,0,1,-748,...,70.755883,0.000000,11.499421,82.255303,0.197717,0.423445,0.0,1.916454,48,587.173510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349815,25004,-1288,-1288,3908646,3326772,5245647,468926,0,1,-1288,...,13.594173,93.287188,50.226209,157.107570,0.540237,0.333199,0.0,0.974784,48,117.217110
349816,4870,-800,-800,216509,133294,204742,15981,0,1,-800,...,27.897124,0.000000,1.937185,29.834308,0.100729,1.024560,0.0,0.720533,48,37.922704
349817,27899,-706,-706,824117,136902,632480,70722,0,1,-706,...,32.179977,0.000000,4.170172,36.350149,0.152006,0.266960,0.0,0.914386,48,164.318747
349818,12571,-1000,-1000,781645,355445,653876,80989,0,1,-1000,...,23.145645,0.000000,2.337891,25.483536,0.118279,0.305436,0.0,0.752825,48,236.018095


In [10]:
non_numeric_columns = df.select_dtypes(include='object').columns
df = df.drop(columns=non_numeric_columns)
df

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,POSIX_F_READ_TIME,POSIX_F_WRITE_TIME,POSIX_F_META_TIME,POSIX_TOTAL_TIME,POSIX_F_MAX_READ_TIME,POSIX_F_MAX_WRITE_TIME,POSIX_F_FASTEST_RANK_TIME,POSIX_F_SLOWEST_RANK_TIME,nprocs,bandwidth
0,4870,-792,-792,216275,135216,205111,15951,0,1,-792,...,25.046999,0.000000,1.629810,26.676809,0.094352,0.172894,0.0,0.735744,48,46.554144
1,47864,-912,-912,1632215,145357,1064440,129731,0,1,-912,...,51.351125,0.000000,13.179558,64.530683,0.231165,0.196214,0.0,2.666620,48,352.539247
2,6918,-897,-897,3414728,2916487,5434114,109523,0,1,-897,...,37.727463,22.408341,5.001379,65.137183,0.312966,0.417510,0.0,0.429495,48,115.303713
3,6751,-874,-874,1081394,853013,1493947,61126,0,1,-874,...,19.203549,0.000000,1.630348,20.833897,0.164479,0.295684,0.0,0.598302,48,232.904128
4,77666,-748,-748,2943105,296788,1873612,224671,0,1,-748,...,70.755883,0.000000,11.499421,82.255303,0.197717,0.423445,0.0,1.916454,48,587.173510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349815,25004,-1288,-1288,3908646,3326772,5245647,468926,0,1,-1288,...,13.594173,93.287188,50.226209,157.107570,0.540237,0.333199,0.0,0.974784,48,117.217110
349816,4870,-800,-800,216509,133294,204742,15981,0,1,-800,...,27.897124,0.000000,1.937185,29.834308,0.100729,1.024560,0.0,0.720533,48,37.922704
349817,27899,-706,-706,824117,136902,632480,70722,0,1,-706,...,32.179977,0.000000,4.170172,36.350149,0.152006,0.266960,0.0,0.914386,48,164.318747
349818,12571,-1000,-1000,781645,355445,653876,80989,0,1,-1000,...,23.145645,0.000000,2.337891,25.483536,0.118279,0.305436,0.0,0.752825,48,236.018095


In [11]:
df = df.dropna(subset=["bandwidth"])
df

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,POSIX_F_READ_TIME,POSIX_F_WRITE_TIME,POSIX_F_META_TIME,POSIX_TOTAL_TIME,POSIX_F_MAX_READ_TIME,POSIX_F_MAX_WRITE_TIME,POSIX_F_FASTEST_RANK_TIME,POSIX_F_SLOWEST_RANK_TIME,nprocs,bandwidth
0,4870,-792,-792,216275,135216,205111,15951,0,1,-792,...,25.046999,0.000000,1.629810,26.676809,0.094352,0.172894,0.0,0.735744,48,46.554144
1,47864,-912,-912,1632215,145357,1064440,129731,0,1,-912,...,51.351125,0.000000,13.179558,64.530683,0.231165,0.196214,0.0,2.666620,48,352.539247
2,6918,-897,-897,3414728,2916487,5434114,109523,0,1,-897,...,37.727463,22.408341,5.001379,65.137183,0.312966,0.417510,0.0,0.429495,48,115.303713
3,6751,-874,-874,1081394,853013,1493947,61126,0,1,-874,...,19.203549,0.000000,1.630348,20.833897,0.164479,0.295684,0.0,0.598302,48,232.904128
4,77666,-748,-748,2943105,296788,1873612,224671,0,1,-748,...,70.755883,0.000000,11.499421,82.255303,0.197717,0.423445,0.0,1.916454,48,587.173510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349815,25004,-1288,-1288,3908646,3326772,5245647,468926,0,1,-1288,...,13.594173,93.287188,50.226209,157.107570,0.540237,0.333199,0.0,0.974784,48,117.217110
349816,4870,-800,-800,216509,133294,204742,15981,0,1,-800,...,27.897124,0.000000,1.937185,29.834308,0.100729,1.024560,0.0,0.720533,48,37.922704
349817,27899,-706,-706,824117,136902,632480,70722,0,1,-706,...,32.179977,0.000000,4.170172,36.350149,0.152006,0.266960,0.0,0.914386,48,164.318747
349818,12571,-1000,-1000,781645,355445,653876,80989,0,1,-1000,...,23.145645,0.000000,2.337891,25.483536,0.118279,0.305436,0.0,0.752825,48,236.018095


In [12]:
random.seed(config.random_seed)
np.random.seed(config.random_seed)

torch.manual_seed(config.random_seed)
torch.cuda.manual_seed_all(config.random_seed)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

In [13]:
for seed in KFOLD_SEEDS:
    all_train_tensors = []  
    all_test_tensors = []

    kf = KFold(n_splits=config.num_folds, shuffle=True, random_state=seed)

    for train_idx, test_idx in kf.split(df):
        df_train = df.iloc[train_idx].copy()
        df_test = df.iloc[test_idx].copy()

        y_train = df_train.pop("bandwidth")
        y_test = df_test.pop("bandwidth")

        scaler = StandardScaler().fit(df_train)
        X_train_scaled = scaler.transform(df_train)
        X_test_scaled = scaler.transform(df_test)

        X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).pin_memory().to(device)
        y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1).pin_memory().to(device)

        X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).pin_memory().to(device)
        y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1).pin_memory().to(device)

        all_train_tensors.append((X_train_tensor, y_train_tensor))
        all_test_tensors.append((X_test_tensor, y_test_tensor))

In [14]:
def make_gpu_batches(X, y, batch_size, shuffle=True):
    if shuffle:
        indices = torch.randperm(X.size(0), device=device)
    else:
        indices = torch.arange(X.size(0), device=device)
    for i in range(0, X.size(0), batch_size):
        idx = indices[i : i + batch_size]
        yield X[idx], y[idx]

In [15]:
for seed in KFOLD_SEEDS:
    print(f"Seed {seed}")
    print("-------------------------------")
    
    loss_sum = 0
    for fold in range(config.num_folds):
        X_train, y_train = all_train_tensors[fold]
        X_test, y_test = all_test_tensors[fold]

        model = nn.Sequential(
            nn.Linear(97, 2048),
            nn.Dropout(p=config.dropout),
            nn.ReLU(),
            nn.Linear(2048, 512),
            nn.Dropout(p=config.dropout),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.Dropout(p=config.dropout),
            nn.ReLU(),
            nn.Linear(128, 1),
        ).to(device)
        
        loss_fn = nn.SmoothL1Loss(beta=config.smooth_l1_loss_beta, reduction="sum").to(device)
        optimizer = optim.Adamax(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min")

        test_losses = []
        for epoch in range(config.epochs):
            model.train()
            for batch_X, batch_y in make_gpu_batches(X_train, y_train, config.batch_size, shuffle=config.shuffle):
                pred = model(batch_X)
                loss = loss_fn(pred, batch_y) / batch_X.size(0)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            model.eval()
            total_loss = 0.0
            total_samples = 0
            
            with torch.no_grad():
                for batch_X, batch_y in make_gpu_batches(X_test, y_test, config.batch_size, shuffle=False):
                    pred = model(batch_X)
                    total_loss += loss_fn(pred, batch_y).item()
                    total_samples += batch_X.size(0)

            test_loss = total_loss / total_samples
            scheduler.step(test_loss)
            test_losses.append(test_loss)
        
        with open(CSV_LOG_PATH, mode='a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([config.stratified_split, test_loss, seed, fold])
        
        loss_sum += test_loss

    avg_loss = loss_sum / config.num_folds
    print(f"Avg loss: {avg_loss:>8f} \n")

Seed 728841181
-------------------------------
Avg loss: 6.117391 

Seed 879843057
-------------------------------
Avg loss: 6.832654 

Seed 1155483495
-------------------------------
Avg loss: 6.668653 

Seed 1159944860
-------------------------------
Avg loss: 6.455843 

Seed 1309364699
-------------------------------
Avg loss: 6.706381 

Seed 1379701443
-------------------------------
Avg loss: 6.482661 

Seed 1392436736
-------------------------------
Avg loss: 6.177549 

Seed 1474235857
-------------------------------
Avg loss: 6.362852 

Seed 1801054430
-------------------------------
Avg loss: 6.569024 

Seed 1812549005
-------------------------------
Avg loss: 6.270889 

