In [None]:
import csv
import os
import pdb
import pickle
import random
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn import cluster
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import RobustScaler
from sklearn.utils import resample
from torch.utils.data import TensorDataset, DataLoader, random_split, WeightedRandomSampler
from wquantiles import quantile_1D

class ConfigStruct:
    """Minimal attribute bag: each keyword argument becomes an instance attribute.

    Example: ``ConfigStruct(epochs=3).epochs`` evaluates to ``3``.
    """

    def __init__(self, **entries):
        # Write straight into the instance namespace so any key is accepted as-is.
        namespace = vars(self)
        for key, value in entries.items():
            namespace[key] = value

In [None]:
# Use the GPU when PyTorch reports CUDA as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device}")

In [None]:
# Run settings collected in one plain dict (key order kept as in the original).
config = {
    # optimisation settings
    "epochs": 100,
    "batch_size": 256,  # original inline note: 2048
    "learning_rate": 0.008,  # original inline note: 0.008
    "weight_decay": 1e-5,
    "dropout": 0.05,
    "shuffle": True,
    # split and seed values
    "test_size": 0.2,
    "split_seed": 42,
    "random_seed": 1234,
    # boolean switches; every one of them is off in this configuration
    "top10_apps_filter": False,
    "only_duplicates": False,
    "meancount75_filter": False,
    "isolation_forest_train": False,
    "isolation_forest_val": False,
    "isolation_forest_test": False,
    # feature agglomeration is switched on, with 64 clusters
    "feature_agglomeration": True,
    "feature_agglomeration_nclusters": 64,
    "stratified_split": False,
    "smooth_l1_loss_beta": 1,
}

In [None]:
# Wrap the plain settings dict in a ConfigStruct so values are read as attributes.
# The isinstance guard keeps this cell re-runnable: once `config` has been rebound to a
# ConfigStruct, unpacking it again with ** would raise TypeError (it is not a mapping).
if isinstance(config, dict):
    config = ConfigStruct(**config)

In [None]:
def artifact_path(directory, stem, extension):
    """Return ``directory / (stem + extension)`` as a ``Path``.

    The extension is appended to the stem rather than applied with
    ``Path.with_suffix``: ``with_suffix`` replaces everything after the last
    dot of the name, so a stem such as ``"lr_0.008"`` would become
    ``"lr_0.tar"`` instead of ``"lr_0.008.tar"``.

    Args:
        directory: folder holding the artifact (``str`` or ``Path``).
        stem: file name without extension; may itself contain dots.
        extension: extension including its leading dot, e.g. ``".tar"``.

    Returns:
        The combined ``Path``.
    """
    return Path(directory) / f"{stem}{extension}"


# Free-text tag describing this run; it is embedded in every artifact file name below.
config_string = "no_nwchem_no_IQR_robust_scaler_agglo_clustering_64_features_batchsize_256_layers_512-256-128"

# Model checkpoint location.
MODEL_FILENAME = "Small_net_" + config_string
MODEL_DIR = r"../models/"
MODEL_PATH = artifact_path(MODEL_DIR, MODEL_FILENAME, ".tar")

# Input CSV location.
DATASET_DIR = r"../data/"
DATASET_NAME = "theta_posix_withapps_no_negative_outliers_no_mira_no_time_witherrors"
DATASET_PATH = artifact_path(DATASET_DIR, DATASET_NAME, ".csv")

# Pickle file locations, one per object named in the file-name prefix.
# NOTE(review): PICKLE_DIR is an absolute, user-specific path while the other
# directories are relative; adjust it when running on another machine.
PICKLE_DIR = r"/home/rwth1591/transfer-learning/theta/pickle"
FEATUREAGGLO_NAME = r"theta_featureagglomeration_" + config_string
FEATUREAGGLO_PATH = artifact_path(PICKLE_DIR, FEATUREAGGLO_NAME, ".pkl")
ROBUSTSCALER_NAME = r"theta_robustscaler_" + config_string
ROBUSTSCALER_PATH = artifact_path(PICKLE_DIR, ROBUSTSCALER_NAME, ".pkl")
ISOLATIONFOREST_NAME = r"theta_isolationforest_" + config_string
ISOLATIONFOREST_PATH = artifact_path(PICKLE_DIR, ISOLATIONFOREST_NAME, ".pkl")

INTERPRETABILITY_DIR = r"../interpretability/captum"

# CSV file with "epoch" / "test_loss" columns; a bare file name, so it resolves
# against the current working directory.
CSV_LOG_PATH = "Train_Full_Dataset_Voss_Model_D_test_loss.csv"

In [None]:
# Create the loss log with its header row on the first run only; an existing
# file is left untouched so re-running this cell does not wipe logged rows.
# (`csv` comes from the notebook's import cell; the existence check uses the
# already-imported pathlib.Path.)
if not Path(CSV_LOG_PATH).exists():
    # newline='' is the csv module's documented requirement for files it writes to.
    with open(CSV_LOG_PATH, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["epoch", "test_loss"])

In [None]:
# Read the CSV at DATASET_PATH and discard its "path" column in one chained step.
# As with pop, a missing "path" column raises KeyError.
df_blue_waters_posix = pd.read_csv(DATASET_PATH).drop(columns=["path"])
df_blue_waters_posix.head()

In [None]:
# Boolean mask: True for rows whose `exe` value, after stripping surrounding
# whitespace, is "nwchem" or "./nwchem".
# A commented-out alternative also OR-ed in: df_blue_waters_posix.POSIX_TOTAL_TIME >= 1e8
filter_spec = df_blue_waters_posix.exe.str.strip().isin(["nwchem", "./nwchem"])
# Keep only the rows the mask does not flag.
df_blue_waters_posix_nospec = df_blue_waters_posix.loc[~filter_spec]
df_blue_waters_posix_nospec.shape