In [None]:
import random
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.preprocessing import RobustScaler
from sklearn.utils import resample
from sklearn import cluster
from sklearn.ensemble import IsolationForest

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torch

from wquantiles import quantile_1D

import pickle

import pdb

class ConfigStruct:
    """Expose keyword arguments as instance attributes.

    Every keyword passed to the constructor becomes an attribute of the
    instance, so ``ConfigStruct(epochs=3).epochs`` evaluates to ``3``.
    """

    def __init__(self, **entries):
        # Write the keywords straight into the instance namespace.
        vars(self).update(entries)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device}")

In [None]:
# Settings for this notebook, kept as a plain dict.
config = {
    "epochs": 100,
    "batch_size": 256,  # author's inline note: 2048
    "learning_rate": 0.008,  # author's inline note: 0.008
    "weight_decay": 1e-5,
    "dropout": 0.05,
    "shuffle": True,
    "test_size": 0.2,
    "split_seed": 42,
    "random_seed": 1234,
    # Optional row filters applied to the dataset before evaluation.
    "top10_apps_filter": False,
    "only_duplicates": False,
    "meancount75_filter": False,
    "starttime_filter": False,
    "isolation_forest_test": False,
    # Preprocessing and model-input settings.
    "feature_agglomeration": True,
    "feature_agglomeration_nclusters": 64,
    "stratified_split": False,
    "smooth_l1_loss_beta": 1,
}

In [None]:
# Wrap the settings dict so values can be read as attributes (config.epochs).
# The isinstance guard keeps this cell re-runnable: once `config` is a
# ConfigStruct, unpacking it with ** again would raise a TypeError.
if isinstance(config, dict):
    config = ConfigStruct(**config)

In [None]:
# Run identifier embedded in the checkpoint and pickle file names below.
config_string = "SpEC_sampling_no_IQR_robust_scaler_agglo_clustering_64_features_batchsize_256_layers_512-256-128"

# NOTE(review): all locations below are absolute, user-specific paths — consider
# deriving them from a single configurable base directory.
ARTIFACTS_DIR = "/home/av639747/Dokumente/masterarbeit/2024_ma_voss_transfer_learning/artifacts/"

# Model checkpoint (.tar).
MODEL_DIR = "/home/rwth1591/transfer-learning/blue_waters/models-test"
MODEL_FILENAME = f"Small_net_{config_string}"
MODEL_PATH = (Path(MODEL_DIR) / MODEL_FILENAME).with_suffix(".tar")

# Input dataset (.csv).
DATASET_DIR = "/home/rwth1591/transfer-learning/theta/data/"
DATASET_NAME = "theta_posix_withapps_no_negative_outliers_no_mira_no_time_witherrors"
DATASET_PATH = (Path(DATASET_DIR) / DATASET_NAME).with_suffix(".csv")

# Pickled preprocessing objects (.pkl); all three share one directory.
PICKLE_DIR = "/home/rwth1591/transfer-learning/blue_waters/pickle"

FEATUREAGGLO_NAME = f"blue_waters_featureagglomeration_{config_string}"
FEATUREAGGLO_PATH = (Path(PICKLE_DIR) / FEATUREAGGLO_NAME).with_suffix(".pkl")

ROBUSTSCALER_NAME = f"blue_waters_robustscaler_{config_string}"
ROBUSTSCALER_PATH = (Path(PICKLE_DIR) / ROBUSTSCALER_NAME).with_suffix(".pkl")

ISOLATIONFOREST_NAME = f"blue_waters_isolationforest_{config_string}"
ISOLATIONFOREST_PATH = (Path(PICKLE_DIR) / ISOLATIONFOREST_NAME).with_suffix(".pkl")

In [None]:
# Load the dataset CSV into a DataFrame. The statement must start at column 0:
# an indented top-level statement is an IndentationError.
df_theta_posix = pd.read_csv(DATASET_PATH)

In [None]:
if config.only_duplicates:
    # Keep only the rows whose "mean" value is present (not NaN).
    has_mean = df_theta_posix["mean"].notna()
    df_theta_posix = df_theta_posix[has_mean]

In [None]:
if config.top10_apps_filter:
    # Rank applications by their number of non-null "nprocs" entries and keep
    # only the rows that belong to the ten highest-ranked applications.
    apps_count_series = (
        df_theta_posix.groupby(by=["app"])["nprocs"]
        .count()
        .sort_values(ascending=False)
    )
    top_apps = apps_count_series.head(10).index
    df_theta_posix = df_theta_posix[df_theta_posix["app"].isin(top_apps)]

In [None]:
if config.starttime_filter:
    # Keep rows whose start time lies strictly below the 25% quantile, then
    # remove the start-time column.
    start_times = df_theta_posix["start_time_sec"]
    earliest_quarter = start_times < start_times.quantile(0.25)
    df_theta_posix = df_theta_posix[earliest_quarter].drop(columns=["start_time_sec"])

In [None]:
if config.meancount75_filter:
    # For every row: how many rows share its "mean" value. "count" skips NaN,
    # so rows in the NaN group get 0.
    mean_counts = df_theta_posix.groupby("mean", dropna=False)["mean"].transform("count")
    # Threshold: 75% quantile taken over the distinct count values.
    mean_counts_quantile = pd.Series(mean_counts.unique()).quantile(0.75)
    # Keep the rows whose count lies strictly above the threshold.
    rows_above_threshold = mean_counts[mean_counts > mean_counts_quantile].index
    df_theta_posix = df_theta_posix[df_theta_posix.index.isin(rows_above_threshold)]

In [None]:
# Drop the "app", "index" and "lustre" columns from the feature frame.
df_theta_posix = df_theta_posix.drop(columns=["app", "index", "lustre"])

In [None]:
# Separate the POSIX_TOTAL_TIME column from the input features.
# pop() removes the column from df_theta_posix in place and returns it as a
# Series (despite the "_df" suffix), so this cell cannot be re-run without
# reloading the data first.
POSIX_TOTAL_TIME_df = df_theta_posix.pop('POSIX_TOTAL_TIME')

In [None]:
# Take the duplicate-set mean out of the feature frame (pop mutates the frame
# in place) and discard the "error" column.
dup_set_means_series = df_theta_posix.pop("mean")
df_theta_posix = df_theta_posix.drop(columns=["error"])

In [None]:
# Seed every random number generator in use (Python, NumPy, PyTorch CPU and
# all CUDA devices) with the same configured value for reproducibility.
for set_seed in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    set_seed(config.random_seed)

# Ask cuDNN for deterministic kernels and switch off its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
# Load the previously fitted isolation forest.
# NOTE: pickle.load executes arbitrary code — only load files from a trusted source.
with open(ISOLATIONFOREST_PATH, 'rb') as f:
    clf = pickle.load(f)

# Label each row with the *loaded* model: +1 = inlier, -1 = outlier.
# predict() is used instead of fit_predict() because fit_predict() would refit
# the estimator on this dataset and throw away the fit restored from the
# pickle, which defeats the purpose of loading it.
outlier_labels = pd.Series(clf.predict(df_theta_posix))

In [None]:
if config.isolation_forest_test:
    # Rows labelled +1 by the isolation forest are the inliers to keep.
    inlier_mask = outlier_labels == 1

    # reset_index() gives every object a fresh 0..n-1 index so the mask lines
    # up with it; the helper "index" column it creates is dropped afterwards.
    # For the two Series this turns them into single-column DataFrames.
    df_theta_posix = df_theta_posix.reset_index()[inlier_mask].drop(columns=["index"])
    POSIX_TOTAL_TIME_df = POSIX_TOTAL_TIME_df.reset_index()[inlier_mask].drop(columns=["index"])
    dup_set_means_series = dup_set_means_series.reset_index()[inlier_mask].drop(columns=["index"])

In [None]:
# Load the previously fitted feature-agglomeration object.
# NOTE: pickle.load executes arbitrary code — only load files from a trusted source.
with open(FEATUREAGGLO_PATH, 'rb') as f:
    agglo = pickle.load(f)

if config.feature_agglomeration:
    # Project the raw features onto the agglomerated feature clusters.
    theta_posix = agglo.transform(df_theta_posix)
else:
    # Without agglomeration the raw features are passed through unchanged.
    # Previously `theta_posix` stayed undefined in this case and the scaling
    # step failed with a NameError.
    theta_posix = df_theta_posix.to_numpy()

In [None]:
# Restore the scaler from its pickle and apply it to the input features
# (transform only — the scaler is not refitted here).
# NOTE(review): pickle.load runs arbitrary code; the file must be trusted.
with open(ROBUSTSCALER_PATH, "rb") as f:
    scaler = pickle.load(f)

theta_posix_scaled = scaler.transform(theta_posix)

In [None]:
# Inputs: the scaled feature matrix, moved to the selected device.
tensor_X = torch.Tensor(theta_posix_scaled)
tensor_X = tensor_X.to(device)

# Targets: POSIX_TOTAL_TIME values as a column vector (one row per sample).
tensor_y = torch.Tensor(POSIX_TOTAL_TIME_df.values)
tensor_y = tensor_y.view(-1, 1).to(device)

In [None]:
# Pair inputs with targets and serve them in fixed-size batches; no shuffle
# argument is passed, so the DataLoader iterates in dataset order.
test_dataset = TensorDataset(tensor_X, tensor_y)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=config.batch_size)

In [None]:
# Width of the input layer: the number of agglomerated feature clusters, or 89
# when feature agglomeration is disabled.
# NOTE(review): 89 is presumably the raw feature count — confirm against the dataset.
n_input_features = (
    config.feature_agglomeration_nclusters if config.feature_agglomeration else 89
)

# Fully connected regression network: input -> 512 -> 256 -> 128 -> 1.
# The first layer previously received a stray third positional argument (512),
# which nn.Linear interprets as `bias`; it only worked because 512 is truthy.
# The layer still has a bias (the default), so checkpoints load as before.
model = nn.Sequential(
    nn.Linear(n_input_features, 512),
    nn.Dropout(p=config.dropout),
    nn.ReLU(),
    nn.Linear(512, 256),
    nn.Dropout(p=config.dropout),
    nn.ReLU(),
    nn.Linear(256, 128),
    nn.Dropout(p=config.dropout),
    nn.ReLU(),
    nn.Linear(128, 1),
).to(device)

In [None]:
# Read the checkpoint, remapping stored tensors onto the device in use.
checkpoint = torch.load(MODEL_PATH, map_location=torch.device(device))

# Copy the stored weights into the network, then switch it to evaluation mode.
trained_weights = checkpoint['model_state_dict']
model.load_state_dict(trained_weights)
model.eval()

In [None]:
# Smooth L1 loss summed over the batch (reduction="sum").
loss_fn = nn.SmoothL1Loss(reduction="sum", beta=config.smooth_l1_loss_beta)
loss_fn = loss_fn.to(device)

In [None]:
# Run the model over the whole test set, collecting predictions and the
# summed loss. Gradients are not needed for evaluation.
batch_outputs = []
test_loss = 0
with torch.no_grad():
    for X, y in test_dataloader:
        output = model(X)
        # `output` is already a tensor on `device`; keep it as-is and
        # concatenate once after the loop instead of re-concatenating the
        # growing result on every batch (which is quadratic in the batch count).
        batch_outputs.append(output)
        test_loss += loss_fn(output, y).item()

# All predictions in dataset order; an empty tensor if there were no batches.
test_output_tensor = torch.cat(batch_outputs) if batch_outputs else torch.Tensor([]).to(device)

# The loss uses reduction="sum", so dividing by the sample count gives the
# per-sample average.
test_loss /= len(test_dataloader.dataset)
print(f"Avg loss: {test_loss:>8f} \n")

In [None]:
# Save results to CSV for further analysis. The dataset is reloaded because the
# in-memory copy has been filtered, transformed and scaled.
df_theta_posix_withresults = pd.read_csv(DATASET_PATH)
predicted_times = test_output_tensor.cpu().numpy()[:, 0]

# Predictions are attached by row position. If any row filter was active, the
# reloaded (unfiltered) frame has more rows than there are predictions and the
# values would silently land on the wrong rows — refuse to write such a file.
if len(predicted_times) != len(df_theta_posix_withresults):
    raise ValueError(
        f"Got {len(predicted_times)} predictions for "
        f"{len(df_theta_posix_withresults)} dataset rows; predictions cannot be "
        "aligned with the reloaded dataset when row filters are enabled."
    )

df_theta_posix_withresults["POSIX_TOTAL_TIME_predicted"] = pd.Series(predicted_times)
df_theta_posix_withresults.to_csv(Path(ARTIFACTS_DIR, DATASET_NAME + "_withresults").with_suffix(".csv"))

In [None]:
# Predictions as a one-column frame (column label 0).
test_output_df = pd.DataFrame(test_output_tensor.cpu().numpy())

# Duplicate-set means on a fresh 0..n-1 index, with missing means replaced by 0.
reference_means = dup_set_means_series.fillna(0).reset_index()["mean"]

# Absolute difference between each prediction and its duplicate-set mean.
abs_errors = (test_output_df[0] - reference_means).abs()

In [None]:
# Dividing by the duplicate-set mean yields NaN wherever that mean is NaN (the set has no duplicates). The median skips NaN values, so those rows do not affect it.

In [None]:
# Error relative to the duplicate-set mean; NaN wherever the mean is NaN.
set_means = dup_set_means_series.reset_index()["mean"]
abs_errors_percent = abs_errors.div(set_means)

# Median of the relative errors (NaN entries are skipped by median()).
mae = abs_errors_percent.median()
print(f"MAE: {mae}")

In [None]:
# Weighted median of the relative errors: every row is weighted by the inverse
# of the number of rows sharing its duplicate-set mean.
dup_set_means_df = pd.DataFrame(dup_set_means_series)
mean_counts_test = dup_set_means_df.groupby("mean", dropna=False)["mean"].transform("count")
# "count" ignores NaN, so rows with a NaN mean get 0; use 1 to avoid dividing by zero.
mean_counts_test.loc[mean_counts_test == 0] = 1
weights_test = 1 / mean_counts_test
# (A former self-assignment of weights_test onto itself was a no-op and has been removed.)

# Only rows with a defined relative error take part in the weighted median.
has_error = abs_errors_percent.notna()
weights_test_nona = weights_test.reset_index(drop=True)[has_error]
weights_test_nona_normalized = weights_test_nona / weights_test_nona.sum()
weighted_mae = quantile_1D(
    abs_errors_percent[has_error].to_numpy(),
    weights_test_nona_normalized.to_numpy(),
    0.5,
)
print(f"Weighted MAE: {weighted_mae}")

# Number of rows flagged as outliers (-1) that also have a duplicate-set mean.
has_mean = dup_set_means_series.reset_index()["mean"].notnull()
test_outliers = int(((outlier_labels == -1) & has_mean).sum())
print(f"Outliers in test set that are considered in MAE computation: {test_outliers}")
print(f"Feature Agglomeration clusters: {agglo.labels_}")