In [1]:
import csv
import random
import sys
import numpy as np
import pandas as pd
from pathlib import Path
import os
import csv

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torch

from sklearn.preprocessing import StandardScaler
import torch.autograd.profiler as profiler
from sklearn.model_selection import KFold

from tqdm.notebook import tqdm

class ConfigStruct:
    """Tiny attribute bag: every keyword argument becomes an instance attribute.

    Example: ``ConfigStruct(epochs=3).epochs == 3``.
    """

    def __init__(self, **entries):
        # Copy the keyword arguments straight into the instance namespace.
        vars(self).update(entries)

In [2]:
# One KFold random_state per cross-validation repetition (ten in total).
KFOLD_SEEDS = [
    728841181, 879843057,
    1155483495, 1159944860,
    1309364699, 1379701443,
    1392436736, 1474235857,
    1801054430, 1812549005,
]

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using  {device}")

Using  cuda


In [4]:
# Experiment settings. Kept as a plain dict here; a later cell wraps it in
# ConfigStruct so the values can be read as attributes.
config = {
    # data splitting / reproducibility
    "shuffle": True,            # shuffle training batches every epoch
    "stratified_split": False,
    "random_seed": 1234,        # seeds random / numpy / torch
    "num_folds": 5,             # K in K-fold cross-validation
    "test_size": 0.2,           # NOTE(review): not read anywhere in the visible notebook
    # model / loss
    "dropout": 0.05,            # must match training
    "smooth_l1_loss_beta": 1.0,
    # optimisation
    "learning_rate": 0.008,
    "weight_decay": 1e-5,
    "batch_size": 4096,
    "epochs": 300,
}

In [5]:
# Wrap the settings dict so values read as attributes (e.g. config.epochs).
# The isinstance guard makes this cell safe to re-run: once `config` is already
# a ConfigStruct, `**config` would raise TypeError because it is no longer a mapping.
if isinstance(config, dict):
    config = ConfigStruct(**config)

In [6]:
# Location of the input dataset. The notebook reads the CSV from ../data/, so
# DATASET_DIR points there too; with the previous empty string DATASET_PATH
# resolved to the working directory and disagreed with the file actually loaded.
DATASET_DIR = "../data"
DATASET_NAME = "blue_waters_posix_all_no_outliers"
# NOTE: with_suffix() replaces any existing suffix, so DATASET_NAME must not
# contain a dot.
DATASET_PATH = Path(DATASET_DIR, DATASET_NAME).with_suffix(".csv")

# Per-fold test losses are appended to this CSV file.
CSV_LOG_PATH = "Full_Dataset.csv"

In [7]:
# Create the results log with its header row on first use only; an existing
# log is left untouched so later runs keep appending to it.
# NOTE(review): the first column is named "nprocs_filter", but the rows
# appended by the training cell put config.stratified_split there - confirm
# which of the two is intended.
log_already_exists = os.path.exists(CSV_LOG_PATH)
if not log_already_exists:
    with open(CSV_LOG_PATH, "w", newline='') as log_file:
        csv.writer(log_file).writerow(
            ["nprocs_filter", "test_loss", "kfold_seed", "fold"]
        )

In [8]:
# Load the raw dataset from disk and display it.
raw_csv_path = "../data/blue_waters_posix_all_no_outliers.csv"
df = pd.read_csv(raw_csv_path)
df

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,POSIX_F_FASTEST_RANK_TIME,POSIX_F_SLOWEST_RANK_TIME,start_time_sec,end_time_sec,nprocs,run_time,log_ver,exe,lustre,bandwidth
0,49152,-24576,-24576,0,0,0,24576,0,0,-24576,...,0.0,0.000000,1513942235,1513942281,24,47.0,3.1,/u/system/bwjenkins/mdtest-jenkins/mdtest/mdte...,0,0.000000
1,1747,-130,-130,86885,35912,85705,3365,0,0,-130,...,0.0,0.135492,1513986144,1513986162,128,19.0,3.1,enzo.exe -d -r DD0601/DD0601,1,255.557572
2,1747,-130,-130,86885,35912,85705,3365,0,0,-130,...,0.0,0.162338,1513991554,1513991572,128,19.0,3.1,enzo.exe -d -r DD0601/DD0601,1,226.638510
3,1747,-130,-130,86885,35912,85705,3365,0,0,-130,...,0.0,0.193178,1513992459,1513992477,128,19.0,3.1,enzo.exe -d -r DD0601/DD0601,1,180.999088
4,1747,-130,-130,86885,35912,85705,3365,0,0,-130,...,0.0,0.205942,1513987749,1513987767,128,19.0,3.1,enzo.exe -d -r DD0601/DD0601,1,187.159073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724026,8709,-288,-288,1398193,6331987,2021535,28430,0,0,-288,...,0.0,80.506971,1531513254,1531513957,256,704.0,3.1,CCTM_v52_Linux2_x86_64intel,1,259.932043
724027,8709,-288,-288,1393841,6331677,2015391,28430,0,0,-288,...,0.0,63.564875,1531508939,1531509581,256,643.0,3.1,CCTM_v52_Linux2_x86_64intel,1,356.765856
724028,8709,-288,-288,1398193,6327571,2013855,28430,0,0,-288,...,0.0,59.515109,1531556071,1531556744,256,674.0,3.1,CCTM_v52_Linux2_x86_64intel,1,358.838439
724029,8709,-288,-288,1396913,6329542,2010527,28430,0,0,-288,...,0.0,112.244880,1531511947,1531512674,256,728.0,3.1,CCTM_v52_Linux2_x86_64intel,1,307.633323


In [9]:
# Keep only rows whose target ("bandwidth") is different from zero.
has_nonzero_bandwidth = df["bandwidth"].ne(0)
df = df[has_nonzero_bandwidth]
df

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,POSIX_F_FASTEST_RANK_TIME,POSIX_F_SLOWEST_RANK_TIME,start_time_sec,end_time_sec,nprocs,run_time,log_ver,exe,lustre,bandwidth
1,1747,-130,-130,86885,35912,85705,3365,0,0,-130,...,0.0,0.135492,1513986144,1513986162,128,19.0,3.1,enzo.exe -d -r DD0601/DD0601,1,255.557572
2,1747,-130,-130,86885,35912,85705,3365,0,0,-130,...,0.0,0.162338,1513991554,1513991572,128,19.0,3.1,enzo.exe -d -r DD0601/DD0601,1,226.638510
3,1747,-130,-130,86885,35912,85705,3365,0,0,-130,...,0.0,0.193178,1513992459,1513992477,128,19.0,3.1,enzo.exe -d -r DD0601/DD0601,1,180.999088
4,1747,-130,-130,86885,35912,85705,3365,0,0,-130,...,0.0,0.205942,1513987749,1513987767,128,19.0,3.1,enzo.exe -d -r DD0601/DD0601,1,187.159073
5,1747,-130,-130,86885,35912,85705,3365,0,0,-130,...,0.0,0.271263,1513988529,1513988547,128,19.0,3.1,enzo.exe -d -r DD0601/DD0601,1,133.881110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724026,8709,-288,-288,1398193,6331987,2021535,28430,0,0,-288,...,0.0,80.506971,1531513254,1531513957,256,704.0,3.1,CCTM_v52_Linux2_x86_64intel,1,259.932043
724027,8709,-288,-288,1393841,6331677,2015391,28430,0,0,-288,...,0.0,63.564875,1531508939,1531509581,256,643.0,3.1,CCTM_v52_Linux2_x86_64intel,1,356.765856
724028,8709,-288,-288,1398193,6327571,2013855,28430,0,0,-288,...,0.0,59.515109,1531556071,1531556744,256,674.0,3.1,CCTM_v52_Linux2_x86_64intel,1,358.838439
724029,8709,-288,-288,1396913,6329542,2010527,28430,0,0,-288,...,0.0,112.244880,1531511947,1531512674,256,728.0,3.1,CCTM_v52_Linux2_x86_64intel,1,307.633323


In [10]:
# Remove every object-dtype (e.g. string) column so that only numeric columns
# remain.
object_columns_frame = df.select_dtypes(include='object')
non_numeric_columns = object_columns_frame.columns
df = df.drop(columns=non_numeric_columns)
df

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,POSIX_F_MAX_WRITE_TIME,POSIX_F_FASTEST_RANK_TIME,POSIX_F_SLOWEST_RANK_TIME,start_time_sec,end_time_sec,nprocs,run_time,log_ver,lustre,bandwidth
1,1747,-130,-130,86885,35912,85705,3365,0,0,-130,...,0.144028,0.0,0.135492,1513986144,1513986162,128,19.0,3.1,1,255.557572
2,1747,-130,-130,86885,35912,85705,3365,0,0,-130,...,0.226497,0.0,0.162338,1513991554,1513991572,128,19.0,3.1,1,226.638510
3,1747,-130,-130,86885,35912,85705,3365,0,0,-130,...,0.215456,0.0,0.193178,1513992459,1513992477,128,19.0,3.1,1,180.999088
4,1747,-130,-130,86885,35912,85705,3365,0,0,-130,...,0.198964,0.0,0.205942,1513987749,1513987767,128,19.0,3.1,1,187.159073
5,1747,-130,-130,86885,35912,85705,3365,0,0,-130,...,0.174043,0.0,0.271263,1513988529,1513988547,128,19.0,3.1,1,133.881110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724026,8709,-288,-288,1398193,6331987,2021535,28430,0,0,-288,...,1.020323,0.0,80.506971,1531513254,1531513957,256,704.0,3.1,1,259.932043
724027,8709,-288,-288,1393841,6331677,2015391,28430,0,0,-288,...,0.750145,0.0,63.564875,1531508939,1531509581,256,643.0,3.1,1,356.765856
724028,8709,-288,-288,1398193,6327571,2013855,28430,0,0,-288,...,0.706239,0.0,59.515109,1531556071,1531556744,256,674.0,3.1,1,358.838439
724029,8709,-288,-288,1396913,6329542,2010527,28430,0,0,-288,...,1.862099,0.0,112.244880,1531511947,1531512674,256,728.0,3.1,1,307.633323


In [11]:
# Discard rows with a missing target value.
bandwidth_present = df["bandwidth"].notna()
df = df[bandwidth_present]
df

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,POSIX_F_MAX_WRITE_TIME,POSIX_F_FASTEST_RANK_TIME,POSIX_F_SLOWEST_RANK_TIME,start_time_sec,end_time_sec,nprocs,run_time,log_ver,lustre,bandwidth
1,1747,-130,-130,86885,35912,85705,3365,0,0,-130,...,0.144028,0.0,0.135492,1513986144,1513986162,128,19.0,3.1,1,255.557572
2,1747,-130,-130,86885,35912,85705,3365,0,0,-130,...,0.226497,0.0,0.162338,1513991554,1513991572,128,19.0,3.1,1,226.638510
3,1747,-130,-130,86885,35912,85705,3365,0,0,-130,...,0.215456,0.0,0.193178,1513992459,1513992477,128,19.0,3.1,1,180.999088
4,1747,-130,-130,86885,35912,85705,3365,0,0,-130,...,0.198964,0.0,0.205942,1513987749,1513987767,128,19.0,3.1,1,187.159073
5,1747,-130,-130,86885,35912,85705,3365,0,0,-130,...,0.174043,0.0,0.271263,1513988529,1513988547,128,19.0,3.1,1,133.881110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724026,8709,-288,-288,1398193,6331987,2021535,28430,0,0,-288,...,1.020323,0.0,80.506971,1531513254,1531513957,256,704.0,3.1,1,259.932043
724027,8709,-288,-288,1393841,6331677,2015391,28430,0,0,-288,...,0.750145,0.0,63.564875,1531508939,1531509581,256,643.0,3.1,1,356.765856
724028,8709,-288,-288,1398193,6327571,2013855,28430,0,0,-288,...,0.706239,0.0,59.515109,1531556071,1531556744,256,674.0,3.1,1,358.838439
724029,8709,-288,-288,1396913,6329542,2010527,28430,0,0,-288,...,1.862099,0.0,112.244880,1531511947,1531512674,256,728.0,3.1,1,307.633323


In [12]:
# Seed every random number generator used by the notebook (Python, NumPy,
# PyTorch CPU and all CUDA devices) with the same configured value.
for seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_rng(config.random_seed)

# cuDNN: request deterministic algorithms and disable the benchmark autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Turn on the TF32 flags for cuDNN and for CUDA matrix multiplications.
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

In [13]:
def _to_device_tensor(values, as_column=False):
    """Convert array-like ``values`` to a float32 tensor on ``device``.

    Args:
        values: array-like accepted by ``torch.tensor``.
        as_column: when True, reshape the tensor to shape (-1, 1).

    Returns:
        The float32 tensor moved to the notebook-level ``device``.
    """
    tensor = torch.tensor(values, dtype=torch.float32)
    if as_column:
        tensor = tensor.view(-1, 1)
    # Page-locked memory is only available (and only useful) with CUDA; calling
    # pin_memory() unconditionally raises on a CPU-only host, which would break
    # the CPU fallback chosen when no GPU is detected.
    if torch.device(device).type == "cuda":
        tensor = tensor.pin_memory()
    return tensor.to(device)


# Materialise scaled train/test tensors for each K-fold split.
for seed in KFOLD_SEEDS:
    # NOTE(review): both lists are re-created on every seed iteration, so once
    # this loop finishes they hold only the folds of the LAST entry in
    # KFOLD_SEEDS; the splits of the earlier seeds are discarded.
    all_train_tensors = []
    all_test_tensors = []

    kf = KFold(n_splits=config.num_folds, shuffle=True, random_state=seed)

    for train_idx, test_idx in kf.split(df):
        df_train = df.iloc[train_idx].copy()
        df_test = df.iloc[test_idx].copy()

        # pop() removes the target column, leaving only the feature columns.
        y_train = df_train.pop("bandwidth")
        y_test = df_test.pop("bandwidth")

        # Fit the scaler on the training part only, then apply it to both parts.
        scaler = StandardScaler().fit(df_train)
        X_train_scaled = scaler.transform(df_train)
        X_test_scaled = scaler.transform(df_test)

        X_train_tensor = _to_device_tensor(X_train_scaled)
        y_train_tensor = _to_device_tensor(y_train.values, as_column=True)

        X_test_tensor = _to_device_tensor(X_test_scaled)
        y_test_tensor = _to_device_tensor(y_test.values, as_column=True)

        all_train_tensors.append((X_train_tensor, y_train_tensor))
        all_test_tensors.append((X_test_tensor, y_test_tensor))

In [14]:
def make_gpu_batches(X, y, batch_size, shuffle=True):
    """Yield ``(X_batch, y_batch)`` mini-batches sliced from in-memory tensors.

    Args:
        X: feature tensor; batches are taken along dimension 0.
        y: target tensor indexed with the same row indices as ``X``.
        batch_size: maximum number of rows per batch (the last batch may be
            smaller). Must be a positive integer.
        shuffle: when True, rows are visited in a fresh random permutation on
            every call; otherwise in their stored order.

    Yields:
        Tuples ``(X[idx], y[idx])`` covering every row of ``X`` exactly once.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    n_rows = X.size(0)
    # Build the index tensor on the same device as the data: indexing a tensor
    # with indices that live on a different device fails, so relying on a
    # global device here would break whenever X is stored elsewhere.
    if shuffle:
        indices = torch.randperm(n_rows, device=X.device)
    else:
        indices = torch.arange(n_rows, device=X.device)
    for start in range(0, n_rows, batch_size):
        idx = indices[start:start + batch_size]
        yield X[idx], y[idx]

In [None]:
# Repeated K-fold cross-validation: for every seed, train a fresh MLP on each
# fold, append the final-epoch test loss to the CSV log, and print the mean
# loss over the folds.
for seed in KFOLD_SEEDS:
    print(f"Seed {seed}")
    print("-------------------------------")

    # Build the split for THIS seed here. The pre-built all_train_tensors /
    # all_test_tensors lists only contain the folds of the last seed in
    # KFOLD_SEEDS, so indexing them would train every repetition on the same
    # partition while logging different seed values.
    kf = KFold(n_splits=config.num_folds, shuffle=True, random_state=seed)

    loss_sum = 0
    for fold, (train_idx, test_idx) in enumerate(kf.split(df)):
        fold_train = df.iloc[train_idx].copy()
        fold_test = df.iloc[test_idx].copy()
        train_target = fold_train.pop("bandwidth")
        test_target = fold_test.pop("bandwidth")

        # Fit the scaler on the training part only, then apply it to both parts.
        scaler = StandardScaler().fit(fold_train)
        X_train = torch.tensor(scaler.transform(fold_train), dtype=torch.float32).to(device)
        y_train = torch.tensor(train_target.values, dtype=torch.float32).view(-1, 1).to(device)
        X_test = torch.tensor(scaler.transform(fold_test), dtype=torch.float32).to(device)
        y_test = torch.tensor(test_target.values, dtype=torch.float32).view(-1, 1).to(device)

        # Take the input width from the data instead of hard-coding it, so the
        # network keeps working if the set of feature columns changes.
        n_features = X_train.shape[1]
        model = nn.Sequential(
            nn.Linear(n_features, 2048),
            nn.Dropout(p=config.dropout),
            nn.ReLU(),
            nn.Linear(2048, 512),
            nn.Dropout(p=config.dropout),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.Dropout(p=config.dropout),
            nn.ReLU(),
            nn.Linear(128, 1),
        ).to(device)

        # reduction="sum" so that per-sample averages can be formed explicitly:
        # per batch during training, over the whole test set during evaluation.
        loss_fn = nn.SmoothL1Loss(beta=config.smooth_l1_loss_beta, reduction="sum").to(device)
        optimizer = optim.Adamax(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min")

        test_losses = []
        for epoch in range(config.epochs):
            model.train()
            for batch_X, batch_y in make_gpu_batches(X_train, y_train, config.batch_size, shuffle=config.shuffle):
                pred = model(batch_X)
                loss = loss_fn(pred, batch_y) / len(batch_X)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            model.eval()
            total_loss = 0
            total_samples = 0

            with torch.no_grad():
                for batch_X, batch_y in make_gpu_batches(X_test, y_test, config.batch_size, shuffle=False):
                    pred = model(batch_X)
                    total_loss += loss_fn(pred, batch_y).item()
                    total_samples += batch_X.size(0)

            test_loss = total_loss / total_samples
            # NOTE(review): the learning-rate schedule is driven by the loss on
            # the held-out fold, i.e. the same data whose loss is reported.
            scheduler.step(test_loss)
            test_losses.append(test_loss)

        # Log the loss of the final epoch for this (seed, fold).
        # NOTE(review): the first value lands in the log's "nprocs_filter"
        # column but is config.stratified_split - confirm the intended column.
        with open(CSV_LOG_PATH, mode='a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([config.stratified_split, test_loss, seed, fold])

        loss_sum += test_loss

    avg_loss = loss_sum / config.num_folds
    print(f"Avg loss: {avg_loss:>8f} \n")

Seed 728841181
-------------------------------
