In [1]:
import random
import numpy as np
import pandas as pd
import os
from pathlib import Path
import csv

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torch

from sklearn.preprocessing import StandardScaler
import torch.autograd.profiler as profiler
from sklearn.model_selection import KFold

class ConfigStruct:
    def __init__(self, **entries):
        self.__dict__.update(entries)

In [2]:
KFOLD_SEEDS = [
    728841181, 
    879843057, 
    1155483495, 
    1159944860, 
    1309364699, 
    1379701443, 
    1392436736, 
    1474235857, 
    1801054430, 
    1812549005,
]

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}")

Using cuda


In [4]:
config = dict(
    epochs=300,
    batch_size=4096,
    learning_rate=0.008,
    weight_decay=1e-5,
    dropout=0.05,
    shuffle=True,
    nprocs_filter=True,
    nprocs="4_16_48_64_144_240",
    num_folds=5,
    test_size=0.2,
    random_seed=1234,
    stratified_split=True,
    smooth_l1_loss_beta=1.0
)

In [5]:
config = ConfigStruct(**config)

In [6]:
DATASET_DIR = "../data/"
DATASET_NAME = "blue_waters_posix_no_outliers_4_16_48_64_144_240_nprocs"
DATASET_PATH = Path(DATASET_DIR, DATASET_NAME).with_suffix(".csv")

CSV_LOG_PATH = "Filtered_by_NProcs.csv"

In [7]:
if not os.path.exists(CSV_LOG_PATH):
    with open(CSV_LOG_PATH, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["nprocs_filter", "test_loss", "kfold_seed"])

In [8]:
df = pd.read_csv(DATASET_PATH)
df

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,POSIX_F_FASTEST_RANK_TIME,POSIX_F_SLOWEST_RANK_TIME,start_time_sec,end_time_sec,nprocs,run_time,log_ver,exe,lustre,bandwidth
0,39775,-1324,-1324,3348398,2023225,3533627,279362,0,1,-1324,...,0.0,0.669968,1513862379,1513910888,48,48510.0,3.1,./SpEC,1,583.031946
1,73579,-699,-699,2632037,41984,1798957,188007,0,1,-699,...,0.0,11.812020,1557000581,1557003025,48,2445.0,3.1,./SpEC,1,274.634615
2,41229,-1263,-1263,5029637,3252692,6031496,299271,0,1,-1263,...,0.0,9.999239,1556921024,1557005236,48,84213.0,3.1,./SpEC,1,188.062253
3,31050,-895,-895,931038,132468,715968,77483,0,1,-895,...,0.0,1.480465,1556914485,1556914695,48,211.0,3.1,./SpEC,1,93.393101
4,4800,-758,-758,211710,133292,201307,15407,0,1,-758,...,0.0,0.767330,1556900615,1556901011,48,397.0,3.1,./SpEC,1,41.003334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349815,1875,-782,-782,463075,7675,2568,3426,0,0,-782,...,0.0,2.395168,1531626203,1531626256,64,54.0,3.1,pp.x -npool 4 -in Tp-La2CuO4_pp_up.in,1,132.208232
349816,4830,-750,-750,221766,137222,205260,15034,0,1,-750,...,0.0,0.488285,1531501052,1531501561,48,510.0,3.1,./SpEC,1,77.239585
349817,19501,-1414,-1414,4024075,3041091,5769568,174097,0,1,-1414,...,0.0,0.699427,1531484414,1531568987,48,84574.0,3.1,./SpEC,1,327.327635
349818,1875,-782,-782,463075,7675,2568,3426,0,0,-782,...,0.0,1.969952,1531634171,1531634223,64,53.0,3.1,pp.x -npool 4 -in Tp-La2CuO4_pp_up.in,1,130.064385


In [9]:
df = df[df["bandwidth"] != 0]
df

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,POSIX_F_FASTEST_RANK_TIME,POSIX_F_SLOWEST_RANK_TIME,start_time_sec,end_time_sec,nprocs,run_time,log_ver,exe,lustre,bandwidth
0,39775,-1324,-1324,3348398,2023225,3533627,279362,0,1,-1324,...,0.0,0.669968,1513862379,1513910888,48,48510.0,3.1,./SpEC,1,583.031946
1,73579,-699,-699,2632037,41984,1798957,188007,0,1,-699,...,0.0,11.812020,1557000581,1557003025,48,2445.0,3.1,./SpEC,1,274.634615
2,41229,-1263,-1263,5029637,3252692,6031496,299271,0,1,-1263,...,0.0,9.999239,1556921024,1557005236,48,84213.0,3.1,./SpEC,1,188.062253
3,31050,-895,-895,931038,132468,715968,77483,0,1,-895,...,0.0,1.480465,1556914485,1556914695,48,211.0,3.1,./SpEC,1,93.393101
4,4800,-758,-758,211710,133292,201307,15407,0,1,-758,...,0.0,0.767330,1556900615,1556901011,48,397.0,3.1,./SpEC,1,41.003334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349815,1875,-782,-782,463075,7675,2568,3426,0,0,-782,...,0.0,2.395168,1531626203,1531626256,64,54.0,3.1,pp.x -npool 4 -in Tp-La2CuO4_pp_up.in,1,132.208232
349816,4830,-750,-750,221766,137222,205260,15034,0,1,-750,...,0.0,0.488285,1531501052,1531501561,48,510.0,3.1,./SpEC,1,77.239585
349817,19501,-1414,-1414,4024075,3041091,5769568,174097,0,1,-1414,...,0.0,0.699427,1531484414,1531568987,48,84574.0,3.1,./SpEC,1,327.327635
349818,1875,-782,-782,463075,7675,2568,3426,0,0,-782,...,0.0,1.969952,1531634171,1531634223,64,53.0,3.1,pp.x -npool 4 -in Tp-La2CuO4_pp_up.in,1,130.064385


In [10]:
non_numeric_columns = df.select_dtypes(include='object').columns
df = df.drop(columns=non_numeric_columns)
df

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,POSIX_F_MAX_WRITE_TIME,POSIX_F_FASTEST_RANK_TIME,POSIX_F_SLOWEST_RANK_TIME,start_time_sec,end_time_sec,nprocs,run_time,log_ver,lustre,bandwidth
0,39775,-1324,-1324,3348398,2023225,3533627,279362,0,1,-1324,...,0.524395,0.0,0.669968,1513862379,1513910888,48,48510.0,3.1,1,583.031946
1,73579,-699,-699,2632037,41984,1798957,188007,0,1,-699,...,0.766241,0.0,11.812020,1557000581,1557003025,48,2445.0,3.1,1,274.634615
2,41229,-1263,-1263,5029637,3252692,6031496,299271,0,1,-1263,...,5.100438,0.0,9.999239,1556921024,1557005236,48,84213.0,3.1,1,188.062253
3,31050,-895,-895,931038,132468,715968,77483,0,1,-895,...,0.123330,0.0,1.480465,1556914485,1556914695,48,211.0,3.1,1,93.393101
4,4800,-758,-758,211710,133292,201307,15407,0,1,-758,...,0.140793,0.0,0.767330,1556900615,1556901011,48,397.0,3.1,1,41.003334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349815,1875,-782,-782,463075,7675,2568,3426,0,0,-782,...,0.133901,0.0,2.395168,1531626203,1531626256,64,54.0,3.1,1,132.208232
349816,4830,-750,-750,221766,137222,205260,15034,0,1,-750,...,0.397361,0.0,0.488285,1531501052,1531501561,48,510.0,3.1,1,77.239585
349817,19501,-1414,-1414,4024075,3041091,5769568,174097,0,1,-1414,...,1.424245,0.0,0.699427,1531484414,1531568987,48,84574.0,3.1,1,327.327635
349818,1875,-782,-782,463075,7675,2568,3426,0,0,-782,...,0.076909,0.0,1.969952,1531634171,1531634223,64,53.0,3.1,1,130.064385


In [11]:
df = df.dropna(subset=["bandwidth"])
df

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,POSIX_F_MAX_WRITE_TIME,POSIX_F_FASTEST_RANK_TIME,POSIX_F_SLOWEST_RANK_TIME,start_time_sec,end_time_sec,nprocs,run_time,log_ver,lustre,bandwidth
0,39775,-1324,-1324,3348398,2023225,3533627,279362,0,1,-1324,...,0.524395,0.0,0.669968,1513862379,1513910888,48,48510.0,3.1,1,583.031946
1,73579,-699,-699,2632037,41984,1798957,188007,0,1,-699,...,0.766241,0.0,11.812020,1557000581,1557003025,48,2445.0,3.1,1,274.634615
2,41229,-1263,-1263,5029637,3252692,6031496,299271,0,1,-1263,...,5.100438,0.0,9.999239,1556921024,1557005236,48,84213.0,3.1,1,188.062253
3,31050,-895,-895,931038,132468,715968,77483,0,1,-895,...,0.123330,0.0,1.480465,1556914485,1556914695,48,211.0,3.1,1,93.393101
4,4800,-758,-758,211710,133292,201307,15407,0,1,-758,...,0.140793,0.0,0.767330,1556900615,1556901011,48,397.0,3.1,1,41.003334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349815,1875,-782,-782,463075,7675,2568,3426,0,0,-782,...,0.133901,0.0,2.395168,1531626203,1531626256,64,54.0,3.1,1,132.208232
349816,4830,-750,-750,221766,137222,205260,15034,0,1,-750,...,0.397361,0.0,0.488285,1531501052,1531501561,48,510.0,3.1,1,77.239585
349817,19501,-1414,-1414,4024075,3041091,5769568,174097,0,1,-1414,...,1.424245,0.0,0.699427,1531484414,1531568987,48,84574.0,3.1,1,327.327635
349818,1875,-782,-782,463075,7675,2568,3426,0,0,-782,...,0.076909,0.0,1.969952,1531634171,1531634223,64,53.0,3.1,1,130.064385


In [12]:
random.seed(config.random_seed)
np.random.seed(config.random_seed)

torch.manual_seed(config.random_seed)
torch.cuda.manual_seed_all(config.random_seed)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

In [13]:
for seed in KFOLD_SEEDS:
    all_train_tensors = []  
    all_test_tensors = []

    kf = KFold(n_splits=config.num_folds, shuffle=True, random_state=seed)

    for train_idx, test_idx in kf.split(df):
        df_train = df.iloc[train_idx].copy()
        df_test = df.iloc[test_idx].copy()

        y_train = df_train.pop("bandwidth")
        y_test = df_test.pop("bandwidth")

        scaler = StandardScaler().fit(df_train)
        X_train_scaled = scaler.transform(df_train)
        X_test_scaled = scaler.transform(df_test)

        X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).pin_memory().to(device)
        y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1).pin_memory().to(device)

        X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).pin_memory().to(device)
        y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1).pin_memory().to(device)

        all_train_tensors.append((X_train_tensor, y_train_tensor))
        all_test_tensors.append((X_test_tensor, y_test_tensor))

In [14]:
def make_gpu_batches(X, y, batch_size, shuffle=True):
    if shuffle:
        indices = torch.randperm(X.size(0), device=device)
    else:
        indices = torch.arange(X.size(0), device=device)
    for i in range(0, X.size(0), batch_size):
        idx = indices[i : i + batch_size]
        yield X[idx], y[idx]

In [15]:
for seed in KFOLD_SEEDS:
    print(f"Seed {seed}")
    print("-------------------------------")
    
    loss_sum = 0
    for fold in range(config.num_folds):
        X_train, y_train = all_train_tensors[fold]
        X_test, y_test = all_test_tensors[fold]

        model = nn.Sequential(
            nn.Linear(102, 2048),
            nn.Dropout(p=config.dropout),
            nn.ReLU(),
            nn.Linear(2048, 512),
            nn.Dropout(p=config.dropout),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.Dropout(p=config.dropout),
            nn.ReLU(),
            nn.Linear(128, 1),
        ).to(device)
        
        loss_fn = nn.SmoothL1Loss(beta=config.smooth_l1_loss_beta, reduction="sum").to(device)
        optimizer = optim.Adamax(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min")

        test_losses = []
        for epoch in range(config.epochs):
            model.train()
            for batch_X, batch_y in make_gpu_batches(X_train, y_train, config.batch_size, shuffle=config.shuffle):
                pred = model(batch_X)
                loss = loss_fn(pred, batch_y) / batch_X.size(0)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            model.eval()
            total_loss = 0.0
            total_samples = 0
            
            with torch.no_grad():
                for batch_X, batch_y in make_gpu_batches(X_test, y_test, config.batch_size, shuffle=False):
                    pred = model(batch_X)
                    total_loss += loss_fn(pred, batch_y).item()
                    total_samples += batch_X.size(0)

            test_loss = total_loss / total_samples
            scheduler.step(test_loss)
            test_losses.append(test_loss)
        
        with open(CSV_LOG_PATH, mode='a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([config.stratified_split, test_loss, seed, fold])
        
        loss_sum += test_loss

    avg_loss = loss_sum / config.num_folds
    print(f"Avg loss: {avg_loss:>8f} \n")

Seed 728841181
-------------------------------
Avg loss: 6.852733 

Seed 879843057
-------------------------------
Avg loss: 6.264764 

Seed 1155483495
-------------------------------
Avg loss: 7.065558 

Seed 1159944860
-------------------------------
Avg loss: 6.859259 

Seed 1309364699
-------------------------------
Avg loss: 6.531865 

Seed 1379701443
-------------------------------
Avg loss: 6.590503 

Seed 1392436736
-------------------------------
Avg loss: 6.521800 

Seed 1474235857
-------------------------------
Avg loss: 6.652454 

Seed 1801054430
-------------------------------
Avg loss: 7.098221 

Seed 1812549005
-------------------------------
Avg loss: 6.792005 

