In [1]:
import random
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.utils import resample
from sklearn import cluster
from sklearn.ensemble import IsolationForest

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torch

from wquantiles import quantile_1D

import pickle

import pdb

class ConfigStruct:
    """Expose keyword arguments as attributes (``cfg.name`` instead of ``cfg["name"]``)."""

    def __init__(self, **entries):
        # Copy every keyword straight into the instance namespace.
        vars(self).update(entries)

In [2]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device}")

Using cuda


In [3]:
# Row labels that are removed from the dataset after it is loaded.
# NOTE(review): the reason these four rows are excluded is not recorded in this notebook — confirm and document.
rows_to_drop = [28856, 15390, 2947, 34567]

In [4]:
# Run configuration as a plain dict (key order is kept as originally written).
config = {
    # Hyper-parameters. In the visible cells only batch_size, dropout and
    # random_seed are read; shuffle/test_size/split_seed appear only in
    # commented-out code.
    "epochs": 100,
    "batch_size": 256,
    "learning_rate": 0.008,
    "weight_decay": 1e-5,
    "dropout": 0.05,
    "shuffle": True,
    "test_size": 0.2,
    "split_seed": 42,
    "random_seed": 1234,
    # Optional dataset filters, all switched off
    "top10_apps_filter": False,
    "only_duplicates": False,
    "meancount75_filter": False,
    "starttime_filter": False,
    "isolation_forest_test": False,
    # Feature agglomeration: on/off switch and the model input width used when it is on
    "feature_agglomeration": False,
    "feature_agglomeration_nclusters": 32,
    "stratified_split": False,
    # beta argument passed to nn.SmoothL1Loss
    "smooth_l1_loss_beta": 1,
}

In [5]:
# Wrap the plain dict so settings read as attributes (e.g. config.batch_size).
# The isinstance guard keeps this cell idempotent: re-running it after the
# conversion used to raise TypeError, because a ConfigStruct instance cannot be
# unpacked with **.
if isinstance(config, dict):
    config = ConfigStruct(**config)

In [6]:
# Trained network checkpoint
MODEL_FILENAME = "Model_D_(finetuned)"
MODEL_DIR = r"../models/"
MODEL_PATH = (Path(MODEL_DIR) / MODEL_FILENAME).with_suffix(".pth")

# Dataset to evaluate on (CSV)
DATASET_DIR = r"../data/"
DATASET_NAME = "theta_posix_with_apps_no_negative_outliers_no_time_witherrors"
DATASET_PATH = (Path(DATASET_DIR) / DATASET_NAME).with_suffix(".csv")

# Pickled preprocessing / filtering objects stored under the models directory
PICKLE_DIR = r"../models/pickle"

FEATUREAGGLO_NAME = r"Model_D_(finetuned)_featureagglomeration"
FEATUREAGGLO_PATH = (Path(PICKLE_DIR) / FEATUREAGGLO_NAME).with_suffix(".pkl")

ROBUSTSCALER_NAME = r"Model_D_(finetuned)_robustscaler"
ROBUSTSCALER_PATH = (Path(PICKLE_DIR) / ROBUSTSCALER_NAME).with_suffix(".pkl")

ISOLATIONFOREST_NAME = r"Model_D_(finetuned)_isolationforest"
ISOLATIONFOREST_PATH = (Path(PICKLE_DIR) / ISOLATIONFOREST_NAME).with_suffix(".pkl")

In [7]:
# Read the dataset CSV into memory and preview it
df_theta_posix = pd.read_csv(filepath_or_buffer=DATASET_PATH)
df_theta_posix

Unnamed: 0,index,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,...,WRITE_10M_100M,WRITE_100M_1G,WRITE_1G_PLUS,rank,POSIX_TOTAL_TIME,nprocs,lustre,exe,mean,error
0,0,7891771,7861736,0,3,424661,60035,90055,0,0,...,0,0,0,0,29.684507,64,1,cp2k.psmp,31.913841,-2.229334
1,1,194,172,0,34,1499,6,54,0,0,...,0,0,0,0,28.155456,16,1,pw.x,11.403251,16.752206
2,2,46037,40869,0,4713059,1719073,1271774,5429,0,0,...,0,0,0,-1,71229.030892,128,1,train.x-2.0.3-ifort_intelmpi,,0.000000
3,3,194,172,0,34,1492,6,54,0,0,...,0,0,0,2,1.707640,16,1,pw.x,6.519022,-4.811382
4,4,7891771,7861736,0,3,424661,60035,90055,0,0,...,0,0,0,0,34.010366,64,1,cp2k.psmp,33.631730,0.378636
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218106,218106,618,0,0,15040,10450,21,1555,0,0,...,0,0,0,-1,6.904750,64,1,vasp_ncl,,0.000000
218107,218107,68,-22,-22,7829,0,0,116,0,0,...,0,0,0,-1,1.194060,16,1,pw.x,1.221651,-0.027592
218108,218108,34713,34116,0,8745,49849,240,504,-335,0,...,0,0,0,-1,220.660687,96,1,qmcpack,,0.000000
218109,218109,68,-22,-22,7829,0,0,116,0,0,...,0,0,0,-1,1.274516,16,1,pw.x,1.221651,0.052865


In [8]:
# Remove the rows listed in rows_to_drop.
# errors="ignore" makes the cell safe to re-run: without it a second execution
# raises KeyError because the labels are already gone from the frame.
df_theta_posix = df_theta_posix.drop(index=rows_to_drop, errors="ignore")

In [9]:
# Optional filter: keep only rows that have a duplicate-set mean (non-missing "mean")
if config.only_duplicates:
    has_dup_set_mean = df_theta_posix["mean"].notna()
    df_theta_posix = df_theta_posix[has_dup_set_mean]

In [10]:
# Optional filter: restrict the data to the ten executables with the most
# (non-null nprocs) rows
if config.top10_apps_filter:
    rows_per_app = df_theta_posix.groupby(by=["exe"])["nprocs"].count()
    apps_count_series = rows_per_app.sort_values(ascending=False)
    top10_apps = apps_count_series.iloc[:10].index
    df_theta_posix = df_theta_posix[df_theta_posix.exe.isin(top10_apps)]

In [11]:
# Optional filter: keep rows whose start time lies below the 25% quantile, then
# remove the start-time column so it is not used as a feature.
# NOTE(review): the dataset file name contains "no_time" — verify that a
# start_time_sec column exists before enabling this flag.
if config.starttime_filter:
    start_times = df_theta_posix.start_time_sec
    first_quartile_cutoff = start_times.quantile(0.25)
    df_theta_posix = df_theta_posix[start_times < first_quartile_cutoff]
    df_theta_posix = df_theta_posix.drop(columns=["start_time_sec"])

In [12]:
# Optional filter: keep only rows whose "mean" group is larger than the 0.75
# quantile of the distinct group sizes
if config.meancount75_filter:
    mean_counts = df_theta_posix.groupby("mean", dropna=False)["mean"].transform("count")
    distinct_group_sizes = pd.Series(mean_counts.unique())
    mean_counts_quantile = distinct_group_sizes.quantile(0.75)
    large_group_labels = mean_counts[mean_counts > mean_counts_quantile].index
    df_theta_posix = df_theta_posix[df_theta_posix.index.isin(large_group_labels)]

In [13]:
# Remove the columns that are not fed to the model: the application name ('exe'),
# the 'index' column stored in the CSV, and 'lustre'
df_theta_posix = df_theta_posix.drop(columns=['exe', 'index', 'lustre'])
df_theta_posix

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,WRITE_1M_4M,WRITE_4M_10M,WRITE_10M_100M,WRITE_100M_1G,WRITE_1G_PLUS,rank,POSIX_TOTAL_TIME,nprocs,mean,error
0,7891771,7861736,0,3,424661,60035,90055,0,0,0,...,0,0,0,0,0,0,29.684507,64,31.913841,-2.229334
1,194,172,0,34,1499,6,54,0,0,0,...,0,0,0,0,0,0,28.155456,16,11.403251,16.752206
2,46037,40869,0,4713059,1719073,1271774,5429,0,0,0,...,500,0,0,0,0,-1,71229.030892,128,,0.000000
3,194,172,0,34,1492,6,54,0,0,0,...,0,0,0,0,0,2,1.707640,16,6.519022,-4.811382
4,7891771,7861736,0,3,424661,60035,90055,0,0,0,...,0,0,0,0,0,0,34.010366,64,33.631730,0.378636
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218106,618,0,0,15040,10450,21,1555,0,0,0,...,0,0,0,0,0,-1,6.904750,64,,0.000000
218107,68,-22,-22,7829,0,0,116,0,0,-22,...,0,0,0,0,0,-1,1.194060,16,1.221651,-0.027592
218108,34713,34116,0,8745,49849,240,504,-335,0,0,...,0,0,0,0,0,-1,220.660687,96,,0.000000
218109,68,-22,-22,7829,0,0,116,0,0,-22,...,0,0,0,0,0,-1,1.274516,16,1.221651,0.052865


In [14]:
# Separate the prediction target (POSIX_TOTAL_TIME) from the input features.
# pop() removes the column from df_theta_posix in place and returns it as a Series
# (despite the "_df" suffix), so running this cell a second time raises KeyError.
POSIX_TOTAL_TIME_df = df_theta_posix.pop('POSIX_TOTAL_TIME')

In [15]:
# Pull the duplicate-set mean out of the feature frame (it is needed later for the
# error metrics) and discard the 'error' column so it is not used as a feature
dup_set_means_series = df_theta_posix.pop('mean')
df_theta_posix = df_theta_posix.drop(columns=["error"])

In [16]:
# Seed every random number generator used here (Python, NumPy, PyTorch CPU and
# all CUDA devices) with the same configured value, in that order
seed_value = config.random_seed
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(seed_value)

# Request deterministic cuDNN kernels and disable the cuDNN benchmark mode
torch.backends.cudnn.deterministic, torch.backends.cudnn.benchmark = True, False

In [17]:
# with open(ISOLATIONFOREST_PATH,'rb') as f:
#     clf = pickle.load(f)
# outlier_labels = pd.Series(clf.fit_predict(df_theta_posix))

In [18]:
# Optional filter: keep only the rows whose isolation-forest label is 1.
# outlier_labels is produced by the isolation-forest cell above, which is currently
# commented out; that cell must be re-enabled before switching this flag on.
if config.isolation_forest_test:
    if "outlier_labels" not in globals():
        raise NameError(
            "config.isolation_forest_test is enabled but outlier_labels is undefined; "
            "run the isolation-forest cell first"
        )
    # Positional boolean mask: one entry per remaining row, in row order.
    inlier_mask = np.asarray(outlier_labels) == 1
    # Filter all three objects with the same mask while keeping their original row
    # labels and types. The previous reset_index()/drop() round-trip renumbered the
    # rows from zero (which breaks the label-based write-back of predictions into the
    # reloaded dataset later on) and turned the two Series into DataFrames.
    df_theta_posix = df_theta_posix.loc[inlier_mask]
    POSIX_TOTAL_TIME_df = POSIX_TOTAL_TIME_df.loc[inlier_mask]
    dup_set_means_series = dup_set_means_series.loc[inlier_mask]

In [19]:
# Load the pickled feature-agglomeration transformer.
# NOTE: pickle.load executes code embedded in the file — only load pickles from a
# trusted source.
with open(FEATUREAGGLO_PATH, 'rb') as f:
    agglo = pickle.load(f)
if config.feature_agglomeration:
    theta_posix = agglo.transform(df_theta_posix)
    # Feed the reduced features to the rest of the pipeline. Previously the result was
    # only stored in theta_posix, which no later cell reads, so the model (built with
    # feature_agglomeration_nclusters inputs when this flag is on) still received the
    # un-agglomerated columns. The original row labels are kept because predictions
    # are written back to the reloaded dataset by label.
    # NOTE(review): assumes the pickled scaler was fitted on the agglomerated
    # features — TODO confirm against the training notebook.
    df_theta_posix = pd.DataFrame(theta_posix, index=df_theta_posix.index)

In [20]:
# No train/validation split is performed: the complete dataset is evaluated.
# A train_test_split call (test_size=config.test_size, random_state=config.split_seed,
# optional stratification on "nprocs" when config.stratified_split is set) used to
# live here and was disabled in favour of the plain aliases below.
X_val, y_val = df_theta_posix, POSIX_TOTAL_TIME_df

In [21]:
# Restore the feature scaler from its pickle file.
# NOTE: unpickling executes code embedded in the file — only use pickles from a
# trusted source.
scaler = pickle.loads(ROBUSTSCALER_PATH.read_bytes())

In [22]:
# Apply the loaded scaler to the evaluation features. The scaler is only used for
# transform() in the visible cells; it is never re-fitted on this dataset.
X_val_scaled = scaler.transform(X_val)

In [23]:
# Convert features and targets to float32 tensors on the compute device;
# the targets are reshaped into a single column to match the model output
tensor_X = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)
tensor_y = torch.tensor(y_val.values, dtype=torch.float32).view(-1, 1).to(device)

In [24]:
# Pair inputs with targets and serve them in fixed-size batches. No shuffle argument
# is passed, so batches follow the dataset order and the concatenated predictions
# line up with the rows.
test_dataset = TensorDataset(tensor_X, tensor_y)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=config.batch_size)

In [25]:
# Fully connected regression network: three hidden blocks of Linear -> Dropout -> ReLU
# (512, 256 and 128 units) followed by a single-output Linear layer.
# The input width is the number of agglomerated clusters when feature agglomeration
# is enabled, otherwise 89.
if config.feature_agglomeration:
    n_input_features = config.feature_agglomeration_nclusters
else:
    n_input_features = 89
layer_widths = [n_input_features, 512, 256, 128]

# Modules are created in the same order as a hand-written Sequential, so their
# positions (and therefore the state_dict keys) are 0, 3, 6 and 9 for the Linear layers.
layers = []
for fan_in, fan_out in zip(layer_widths, layer_widths[1:]):
    layers.append(nn.Linear(fan_in, fan_out))
    layers.append(nn.Dropout(p=config.dropout))
    layers.append(nn.ReLU())
layers.append(nn.Linear(layer_widths[-1], 1))

model = nn.Sequential(*layers).to(device)

In [26]:
# Restore the saved weights onto the active device and switch the model to evaluation
# mode. model.eval() returns the model itself, so the cell displays its layout.
checkpoint = torch.load(MODEL_PATH, map_location=device)
trained_weights = checkpoint['model_state_dict']
model.load_state_dict(trained_weights)
model.eval()

Sequential(
  (0): Linear(in_features=89, out_features=512, bias=True)
  (1): Dropout(p=0.05, inplace=False)
  (2): ReLU()
  (3): Linear(in_features=512, out_features=256, bias=True)
  (4): Dropout(p=0.05, inplace=False)
  (5): ReLU()
  (6): Linear(in_features=256, out_features=128, bias=True)
  (7): Dropout(p=0.05, inplace=False)
  (8): ReLU()
  (9): Linear(in_features=128, out_features=1, bias=True)
)

In [27]:
# Smooth L1 loss summed over each batch (reduction="sum"); beta comes from the run configuration
loss_fn = nn.SmoothL1Loss(reduction="sum", beta=config.smooth_l1_loss_beta)
loss_fn = loss_fn.to(device)

In [28]:
# Run the model over every batch without tracking gradients, collecting the
# predictions in batch order and accumulating the summed loss.
# NOTE(review): the stored output of this cell reports an average loss around 8e10,
# and the predictions shown further down are many orders of magnitude above the
# POSIX_TOTAL_TIME values displayed earlier — check that the checkpoint, scaler and
# target units actually match this dataset.
batch_outputs = []
test_loss = 0
with torch.no_grad():
    for X, y in test_dataloader:
        output = model(X)
        # model(X) is already a tensor on `device`; the former
        # torch.Tensor(output).to(device) wrapper was redundant.
        batch_outputs.append(output)
        test_loss += loss_fn(output, y).item()
# Concatenate once instead of re-copying a growing tensor on every batch.
if batch_outputs:
    test_output_tensor = torch.cat(batch_outputs)
else:
    test_output_tensor = torch.Tensor([]).to(device)
# The accumulated value is a sum over batches; dividing by the number of samples
# turns it into a per-sample average.
test_loss /= len(test_dataloader.dataset)
print(f"Avg loss: {test_loss:>8f} \n")

Avg loss: 80994264341.025131 



In [29]:
# Reload the dataset from disk so the predictions can be stored next to the original
# columns (the in-memory frame has had rows and columns removed since loading).
df_theta_posix_withresults = pd.read_csv(DATASET_PATH)
# Row labels that receive a prediction. Index.difference returns the labels sorted
# and without duplicates.
# NOTE(review): pairing these labels with preds by position assumes that
# df_theta_posix's index is already sorted and unique — confirm when enabling filters.
rows_to_keep = df_theta_posix.index.difference(rows_to_drop)
predictions_on_cpu = test_output_tensor.cpu()
preds = predictions_on_cpu.numpy().flatten()
preds

array([1.7740746e+07, 5.3020115e+06, 8.2247811e+10, ..., 1.3322490e+10,
       1.7442314e+08, 1.7443942e+08], dtype=float32)

In [30]:
# Sanity check: exactly one prediction per retained row before writing results back
n_predictions, n_target_rows = len(preds), len(rows_to_keep)
assert n_predictions == n_target_rows, "Prediction count mismatch!"

In [31]:
# Attach the predictions by row label (rows without a prediction are left as NaN in
# the new column) and write the augmented dataset next to the original CSV
results_csv_path = Path(DATASET_DIR, DATASET_NAME + "_with_Model_D_(finetuned)_results").with_suffix(".csv")
df_theta_posix_withresults.loc[rows_to_keep, "POSIX_TOTAL_TIME_predicted"] = preds
df_theta_posix_withresults.to_csv(path_or_buf=results_csv_path)

In [32]:
# Absolute difference between each prediction and the mean of its duplicate set,
# matched by position. Rows without a duplicate-set mean are compared against 0 here.
predictions_array = test_output_tensor.cpu().numpy()
test_output_df = pd.DataFrame(predictions_array)
dup_set_means_filled = dup_set_means_series.fillna(0).reset_index()["mean"]
abs_errors = (test_output_df[0] - dup_set_means_filled).abs()

In [33]:
# Relative absolute error per row. Where the duplicate-set mean is NaN (the set has no
# duplicates) the quotient is NaN, and median() skips NaN values.
# Note: despite the "MAE" label, the value printed is the median of the relative
# absolute errors.
dup_set_means_positional = dup_set_means_series.reset_index()["mean"]
abs_errors_percent = abs_errors.div(dup_set_means_positional)
mae = abs_errors_percent.median()
print(f"MAE: {mae}")

MAE: 1385910235.6957757


In [34]:
# Weighted median of the relative errors: each row is weighted by 1 / (number of rows
# sharing its duplicate-set mean), so every duplicate set carries the same total weight.
dup_set_means_df = pd.DataFrame(dup_set_means_series)
mean_counts_test = dup_set_means_df.groupby("mean", dropna=False)["mean"].transform("count")
# count() ignores NaN, so rows without a duplicate-set mean get 0; use 1 instead to
# avoid dividing by zero below.
mean_counts_test.loc[mean_counts_test == 0] = 1
weights_test = 1 / mean_counts_test
# (A former `weights_test.loc[mean_counts_test < 1] = weights_test.loc[...]` line was a
# self-assignment on an always-empty selection and has been removed.)
# Only rows with a defined relative error take part in the weighted median.
has_error = abs_errors_percent.notna()
weights_test_nona = weights_test.reset_index(drop=True)[has_error]
weights_test_nona_normalized = weights_test_nona / weights_test_nona.sum()
weighted_mae = quantile_1D(
    abs_errors_percent[has_error].to_numpy(),
    weights_test_nona_normalized.to_numpy(),
    0.5,
)
print(f"Weighted MAE: {weighted_mae}")
# test_outliers = len(outlier_labels[(outlier_labels == -1) & (dup_set_means_series.reset_index()["mean"].notnull())])
# print(f"Outliers in test set that are considered in MAE computation: {test_outliers}")
# print(f"Feature Agglomeration clusters: {agglo.labels_}")

Weighted MAE: 3787307516.5529776
